Image-ExifTool

 view release on metacpan or  search on metacpan

lib/Image/ExifTool/HTML.pm  view on Meta::CPAN

    'frac14' => 188,  'Kappa'  => 922,  'hArr'   => 8660,
    'frac12' => 189,  'Lambda' => 923,  'forall' => 8704,
    'frac34' => 190,  'Mu'     => 924,  'part'   => 8706,
    'iquest' => 191,  'Nu'     => 925,  'exist'  => 8707,
    'Agrave' => 192,  'Xi'     => 926,  'empty'  => 8709,
    'Aacute' => 193,  'Omicron'=> 927,  'nabla'  => 8711,
    'Acirc'  => 194,  'Pi'     => 928,  'isin'   => 8712,
    'Atilde' => 195,  'Rho'    => 929,  'notin'  => 8713,
    'Auml'   => 196,  'Sigma'  => 931,  'ni'     => 8715,
    'Aring'  => 197,  'Tau'    => 932,  'prod'   => 8719,
    'AElig'  => 198,  'Upsilon'=> 933,  'sum'    => 8721,
    'Ccedil' => 199,  'Phi'    => 934,  'minus'  => 8722,
    'Egrave' => 200,  'Chi'    => 935,  'lowast' => 8727,
    'Eacute' => 201,  'Psi'    => 936,  'radic'  => 8730,
    'Ecirc'  => 202,  'Omega'  => 937,  'prop'   => 8733,
    'Euml'   => 203,  'alpha'  => 945,  'infin'  => 8734,
    'Igrave' => 204,  'beta'   => 946,  'ang'    => 8736,
    'Iacute' => 205,  'gamma'  => 947,  'and'    => 8743,
    'Icirc'  => 206,  'delta'  => 948,  'or'     => 8744,
    'Iuml'   => 207,  'epsilon'=> 949,  'cap'    => 8745,
    'ETH'    => 208,  'zeta'   => 950,  'cup'    => 8746,
    'Ntilde' => 209,  'eta'    => 951,  'int'    => 8747,
    'Ograve' => 210,  'theta'  => 952,  'there4' => 8756,
    'Oacute' => 211,  'iota'   => 953,  'sim'    => 8764,
    'Ocirc'  => 212,  'kappa'  => 954,  'cong'   => 8773,
    'Otilde' => 213,  'lambda' => 955,  'asymp'  => 8776,
    'Ouml'   => 214,  'mu'     => 956,  'ne'     => 8800,
    'times'  => 215,  'nu'     => 957,  'equiv'  => 8801,
    'Oslash' => 216,  'xi'     => 958,  'le'     => 8804,
    'Ugrave' => 217,  'omicron'=> 959,  'ge'     => 8805,
    'Uacute' => 218,  'pi'     => 960,  'sub'    => 8834,
    'Ucirc'  => 219,  'rho'    => 961,  'sup'    => 8835,
    'Uuml'   => 220,  'sigmaf' => 962,  'nsub'   => 8836,
    'Yacute' => 221,  'sigma'  => 963,  'sube'   => 8838,
    'THORN'  => 222,  'tau'    => 964,  'supe'   => 8839,
    'szlig'  => 223,  'upsilon'=> 965,  'oplus'  => 8853,
    'agrave' => 224,  'phi'    => 966,  'otimes' => 8855,
    'aacute' => 225,  'chi'    => 967,  'perp'   => 8869,
    'acirc'  => 226,  'psi'    => 968,  'sdot'   => 8901,
    'atilde' => 227,  'omega'  => 969,  'lceil'  => 8968,
    'auml'   => 228,  'thetasym'=>977,  'rceil'  => 8969,
    'aring'  => 229,  'upsih'  => 978,  'lfloor' => 8970,
    'aelig'  => 230,  'piv'    => 982,  'rfloor' => 8971,
    'ccedil' => 231,  'ensp'   => 8194, 'lang'   => 9001,
    'egrave' => 232,  'emsp'   => 8195, 'rang'   => 9002,
    'eacute' => 233,  'thinsp' => 8201, 'loz'    => 9674,
    'ecirc'  => 234,  'zwnj'   => 8204, 'spades' => 9824,
    'euml'   => 235,  'zwj'    => 8205, 'clubs'  => 9827,
    'igrave' => 236,  'lrm'    => 8206, 'hearts' => 9829,
    'iacute' => 237,  'rlm'    => 8207, 'diams'  => 9830,
    'icirc'  => 238,  'ndash'  => 8211,
    'iuml'   => 239,  'mdash'  => 8212,
);
my %entityName; # look up entity names by number (built as necessary)

# HTML info
# (tag ID's are case insensitive and must be all lower case in tables)
%Image::ExifTool::HTML::Main = (
    GROUPS => { 2 => 'Document' },
    NOTES => q{
        Meta information extracted from the header of HTML and XHTML files.  This is
        a mix of information found in the C<META> elements, C<XML> element, and the
        C<TITLE> element.
    },
    dc => {
        Name => 'DC',
        SubDirectory => { TagTable => 'Image::ExifTool::HTML::dc' },
    },
    ncc => {
        Name => 'NCC',
        SubDirectory => { TagTable => 'Image::ExifTool::HTML::ncc' },
    },
    prod => {
        Name => 'Prod',
        SubDirectory => { TagTable => 'Image::ExifTool::HTML::prod' },
    },
    vw96 => {
        Name => 'VW96',
        SubDirectory => { TagTable => 'Image::ExifTool::HTML::vw96' },
    },
   'http-equiv' => {
        Name => 'HTTP-equiv',
        SubDirectory => { TagTable => 'Image::ExifTool::HTML::equiv' },
    },
    o => {
        Name => 'Office',
        SubDirectory => { TagTable => 'Image::ExifTool::HTML::Office' },
    },
    abstract        => { },
    author          => { },
    classification  => { },
   'content-language'=>{ Name => 'ContentLanguage' },
    copyright       => { },
    description     => { },
    distribution    => { },
   'doc-class'      => { Name => 'DocClass' },
   'doc-rights'     => { Name => 'DocRights' },
   'doc-type'       => { Name => 'DocType' },
    formatter       => { },
    generator       => { },
    generatorversion=> { Name => 'GeneratorVersion' },
    googlebot       => { Name => 'GoogleBot' },
    keywords        => { List => 1 },
    mssmarttagspreventparsing => { Name => 'NoMSSmartTags' },
    originator      => { },
    owner           => { },
    progid          => { Name => 'ProgID' },
    rating          => { },
    refresh         => { },
   'resource-type'  => { Name => 'ResourceType' },
   'revisit-after'  => { Name => 'RevisitAfter' },
    robots          => { List => 1 },
    title           => { Notes => "the only extracted tag which isn't from an HTML META element" },
);

# ref 2
%Image::ExifTool::HTML::dc = (
    GROUPS => { 1 => 'HTML-dc', 2 => 'Document' },
    NOTES => 'Dublin Core schema tags (also used in XMP).',
    contributor => { Groups => { 2 => 'Author' }, List => 'Bag' },
    coverage    => { },

lib/Image/ExifTool/HTML.pm  view on Meta::CPAN

        ($val) = unpack($] < 5.010000 ? 'U0U' : 'C0U', $ch);
    }
    return '?' unless defined $val;
    return "&$entityName{$val};" if $entityName{$val};
    return sprintf('&#x%x;',$val);
}

#------------------------------------------------------------------------------
# Escape any special characters for HTML
# Inputs: 0) string to be escaped, 1) optional string encoding (default 'UTF8')
# Returns: escaped string
sub EscapeHTML($;$)
{
    my ($str, $enc) = @_;
    # escape XML characters
    $str = EscapeXML($str);
    # escape other special characters if they exist
    if ($str =~ /[\x80-\xff]/) {
        # generate entity name lookup if necessary
        unless (%entityName) {
            local $_;
            foreach (keys %entityNum) {
                $entityName{$entityNum{$_}} = $_;
            }
            delete $entityName{39};  # 'apos' is not valid HTML
        }
        # suppress warnings
        local $SIG{'__WARN__'} = sub { 1 };
        if ($enc and $enc ne 'UTF8') {
            $str = Image::ExifTool::Decode(undef, $str, $enc, undef, 'UTF8');
            $str =~ s/([\xc2-\xf7][\x80-\xbf]+)/EscapeChar($1)/sge;
            $str = Image::ExifTool::Decode(undef, $str, 'UTF8', undef, $enc);
        } else {
            # escape any non-ascii characters for HTML
            $str =~ s/([\xc2-\xf7][\x80-\xbf]+)/EscapeChar($1)/sge;
        }
    }
    return $str;
}

#------------------------------------------------------------------------------
# Unescape all HTML character references
# Inputs: 0) string to be unescaped, 1) optional string encoding (default 'UTF8')
# Returns: unescaped string
sub UnescapeHTML($;$)
{
    my ($str, $enc) = @_;
    return UnescapeXML($str, \%entityNum, $enc);
}

#------------------------------------------------------------------------------
# Extract information from a HTML file
# Inputs: 0) ExifTool object reference, 1) DirInfo reference
# Returns: 1 on success, 0 if this wasn't a valid HTML file
sub ProcessHTML($$)
{
    my ($et, $dirInfo) = @_;
    my $raf = $$dirInfo{RAF};
    my $buff;

    # validate HTML or XHTML file
    $raf->Read($buff, 256) or return 0;
    $buff =~ /^(\xef\xbb\xbf)?\s*<(!DOCTYPE\s+HTML|HTML|\?xml)/i or return 0;
    $buff =~ /<(!DOCTYPE\s+)?HTML/i or return 0 if $2 eq '?xml';
    $et->SetFileType();

    $raf->Seek(0,0) or $et->Warn('Seek error'), return 1;

    local $/ = Image::ExifTool::PostScript::GetInputRecordSeparator($raf);
    $/ or $et->Warn('Invalid HTML data'), return 1;

    # extract header information
    my $doc;
    while ($raf->ReadLine($buff)) {
        if (not defined $doc) {
            # look for 'head' element
            next unless $buff =~ /<head\b/ig;
            $doc = substr($buff, pos($buff));
            next;
        }
        $doc .= $buff;
        last if $buff =~ m{</head>}i;
    }
    return 1 unless defined $doc;

    # process all elements in header
    my $tagTablePtr = GetTagTable('Image::ExifTool::HTML::Main');
    for (;;) {
        last unless $doc =~ m{<([\w:.-]+)(.*?)>}sg;
        my ($tagName, $attrs) = ($1, $2);
        my $tag = lc($tagName);
        my ($val, $grp);
        if ($attrs =~ m{/$}) {  # self-contained XHTML tags end in '/>'
            $val = '';
        } else {
            # look for element close
            my $pos = pos($doc);
            my $close = "</$tagName>";
            # the following doesn't work on Solaris Perl 5.6.1 due to Perl bug:
            # if ($doc =~ m{(.*?)</$tagName>}sg) {
            #     $val = $1;
            if ($doc =~ m{$close}sg) {
                $val = substr($doc, $pos, pos($doc)-$pos-length($close));
            } else {
                pos($doc) = $pos;
                next unless $tag eq 'meta'; # META tags don't need to be closed
                $val = '';
            }
        }
        my $table = $tagTablePtr;
        if ($tag eq 'meta') {
            # parse HTML META element
            undef $tag;
            # tag name is in NAME or HTTP-EQUIV attribute
            if ($attrs =~ /\bname\s*=\s*['"]?([\w:.-]+)/si) {
                $tagName = $1;
            } elsif ($attrs =~ /\bhttp-equiv\s*=\s*['"]?([\w:.-]+)/si) {
                $tagName = "HTTP-equiv.$1";
            } else {
                next;   # no name
            }
            $tag = lc($tagName) or next;
            # tag value is in CONTENT attribute
            if ($attrs =~ /\bcontent\s*=\s*(['"])(.*?)\1/si or
                $attrs =~ /\bcontent\s*=\s*(['"]?)([\w:.-]+)/si)
            {
                $val = $2;
            } else {
                next unless length $val;
            }
            # isolate group name (separator is '.' in HTML, but ':' in ref 2)
            if ($tag =~ /^([\w-]+)[:.]([\w-]+)/) {
                ($grp, $tag) = ($1, $2);
                my $tagInfo = $et->GetTagInfo($tagTablePtr, $grp);
                if ($tagInfo and $$tagInfo{SubDirectory}) {
                    $table = GetTagTable($tagInfo->{SubDirectory}->{TagTable});
                } else {
                    $tag = "$grp.$tag";
                }
            }
        } elsif ($tag eq 'xml') {
            $et->VPrint(0, "Parsing XML\n");
            # parse XML tags (quick-and-dirty)
            my $xml = $val;
            while ($xml =~ /<([\w-]+):([\w-]+)(\s.*?)?>([^<]*?)<\/\1:\2>/g) {
                ($grp, $tag, $val) = ($1, $2, $4);
                my $tagInfo = $et->GetTagInfo($tagTablePtr, $grp);
                next unless $tagInfo and $$tagInfo{SubDirectory};
                $table = GetTagTable($tagInfo->{SubDirectory}->{TagTable});
                unless ($$table{$tag}) {
                    my $name = ucfirst $tag;
                    $name =~ s/_x([0-9a-f]{4})_/chr(hex($1))/gie; # convert hex codes
                    $name =~ s/\s(.)/\U$1/g;     # capitalize all words in tag name



( run in 1.191 second using v1.01-cache-2.11-cpan-5511b514fd6 )