Image-ExifTool

 view release on metacpan or  search on metacpan

lib/Image/ExifTool/XMP.pm  view on Meta::CPAN


    return $count;  # return the number of elements found at this level
}

#------------------------------------------------------------------------------
# Process XMP data
# Inputs: 0) ExifTool ref, 1) dirInfo ref, 2) tag table ref
# Returns: 1 on success
# Notes: The following flavours of XMP files are currently recognized:
# - standard XMP with xpacket, x:xmpmeta and rdf:RDF elements
# - XMP that is missing the xpacket and/or x:xmpmeta elements
# - mutant Microsoft XMP with xmp:xmpmeta element
# - XML files beginning with "<xml"
# - SVG files that begin with "<svg" or "<!DOCTYPE svg"
# - XMP and XML files beginning with a UTF-8 byte order mark
# - UTF-8, UTF-16 and UTF-32 encoded XMP
# - erroneously double-UTF8 encoded XMP
# - otherwise valid files with leading XML comment
sub ProcessXMP($$;$)
{
    my ($et, $dirInfo, $tagTablePtr) = @_;
    my $dataPt = $$dirInfo{DataPt};
    my ($dirStart, $dirLen, $dataLen, $double);
    my ($buff, $fmt, $hasXMP, $isXML, $isRDF, $isSVG);
    my $rtnVal = 0;
    my $bom = 0;
    my $path = $et->MetadataPath();

    # namespaces and prefixes currently in effect while parsing the file,
    # and lookup to translate brain-dead-Microsoft-Photo-software prefixes
    $$et{curURI} = { };
    $$et{curNS}  = { };
    $$et{xlatNS} = { };
    $$et{definedNS} = { };
    delete $$et{XmpAbout};
    delete $$et{XmpValidate};   # don't validate by default
    delete $$et{XmpValidateLangAlt};

    # ignore non-standard XMP while in strict MWG compatibility mode
    if (($Image::ExifTool::MWG::strict or $$et{OPTIONS}{Validate}) and
        not ($$et{XMP_CAPTURE} or $$et{DOC_NUM}) and
        (($$dirInfo{DirName} || '') eq 'XMP' or $$et{FILE_TYPE} eq 'XMP'))
    {
        $$et{XmpValidate} = { } if $$et{OPTIONS}{Validate};
        my $nonStd = ($stdPath{$$et{FILE_TYPE}} and $path ne $stdPath{$$et{FILE_TYPE}});
        if ($nonStd and $Image::ExifTool::MWG::strict) {
            $et->Warn("Ignored non-standard XMP at $path");
            return 1;
        }
        if ($nonStd) {
            $et->Warn("Non-standard XMP at $path", 1);
        } elsif (not $$dirInfo{IsExtended}) {
            $et->Warn("Duplicate XMP at $path") if $$et{DIR_COUNT}{XMP};
            $$et{DIR_COUNT}{XMP} = ($$et{DIR_COUNT}{XMP} || 0) + 1; # count standard XMP
        }
    }
    if ($dataPt) {
        $dirStart = $$dirInfo{DirStart} || 0;
        $dirLen = $$dirInfo{DirLen} || (length($$dataPt) - $dirStart);
        $dataLen = $$dirInfo{DataLen} || length($$dataPt);
        # check leading BOM (may indicate double-encoded UTF)
        pos($$dataPt) = $dirStart;
        if ($$dataPt =~ /\G((\0\0)?\xfe\xff|\xff\xfe(\0\0)?|\xef\xbb\xbf)\0*<\0*\?\0*x\0*p\0*a\0*c\0*k\0*e\0*t/g) {
            $double = $1;
        } else {
            # handle UTF-16/32 XML
            pos($$dataPt) = $dirStart;
            if ($$dataPt =~ /\G((\0\0)?\xfe\xff|\xff\xfe(\0\0)?|\xef\xbb\xbf)\0*<\0*\?\0*x\0*m\0*l\0* /g) {
                my $tmp = $1;
                $fmt = $tmp =~ /\xfe\xff/ ? 'n' : 'v';
                $fmt = uc($fmt) if $tmp =~ /\0\0/;
                $isXML = 1;
            }
        }
    } else {
        my ($type, $mime, $buf2, $buf3);
        # read information from XMP file
        my $raf = $$dirInfo{RAF} or return 0;
        $raf->Read($buff, 256) or return 0;
        ($buf2 = $buff) =~ tr/\0//d;    # cheap conversion to UTF-8
        # remove leading comments if they exist (eg. ImageIngester)
        while ($buf2 =~ /^\s*<!--/) {
            # remove the comment if it is complete
            if ($buf2 =~ s/^\s*<!--.*?-->\s+//s) {
                # continue with parsing if we have more than 128 bytes remaining
                next if length $buf2 > 128;
            } else {
                # don't read more than 10k when looking for the end of comment
                return 0 if length($buf2) > 10000;
            }
            $raf->Read($buf3, 256) or last; # read more data if available
            $buff .= $buf3;
            $buf3 =~ tr/\0//d;
            $buf2 .= $buf3;
        }
        # check to see if this is XMP format
        # (CS2 writes .XMP files without the "xpacket begin")
        if ($buf2 =~ /^\s*(<\?xpacket begin=|<x(mp)?:x[ma]pmeta)/) {
            $hasXMP = 1;
        } else {
            # also recognize XML files and .XMP files with BOM and without x:xmpmeta
            if ($buf2 =~ /^(\xfe\xff)(<\?xml|<rdf:RDF|<x(mp)?:x[ma]pmeta)/g) {
                $fmt = 'n';     # UTF-16 or 32 MM with BOM
            } elsif ($buf2 =~ /^(\xff\xfe)(<\?xml|<rdf:RDF|<x(mp)?:x[ma]pmeta)/g) {
                $fmt = 'v';     # UTF-16 or 32 II with BOM
            } elsif ($buf2 =~ /^(\xef\xbb\xbf)?(<\?xml|<rdf:RDF|<x(mp)?:x[ma]pmeta|<svg\b)/g) {
                $fmt = 0;       # UTF-8 with BOM or unknown encoding without BOM
            } elsif ($buf2 =~ /^(\xfe\xff|\xff\xfe|\xef\xbb\xbf)(<\?xpacket begin=)/g) {
                $double = $1;   # double-encoded UTF
            } else {
                return 0;       # not recognized XMP or XML
            }
            $bom = 1 if $1;
            if ($2 eq '<?xml') {
                if (defined $fmt and not $fmt and $buf2 =~ /^[^\n\r]*[\n\r]+<\?aid /s) {
                    undef $$et{XmpValidate};    # don't validate INX
                    if ($$et{XMP_CAPTURE}) {
                        $et->Error("ExifTool does not yet support writing of INX files");
                        return 0;
                    }
                    $type = 'INX';
                } elsif ($buf2 =~ /<x(mp)?:x[ma]pmeta/) {
                    $hasXMP = 1;
                } else {
                    undef $$et{XmpValidate};    # don't validate XML
                    # identify SVG images and PLIST files by DOCTYPE if available
                    if ($buf2 =~ /<!DOCTYPE\s+(\w+)/) {
                        if ($1 eq 'svg') {
                            $isSVG = 1;
                        } elsif ($1 eq 'plist') {
                            $type = 'PLIST';
                        } elsif ($1 eq 'REDXIF') {
                            $type = 'RMD';
                            $mime = 'application/xml';
                        } elsif ($1 ne 'fcpxml') { # Final Cut Pro XML
                            return 0;
                        }
                    } elsif ($buf2 =~ /<svg[\s>]/) {
                        $isSVG = 1;
                    } elsif ($buf2 =~ /<rdf:RDF/) {
                        $isRDF = 1;
                    } elsif ($buf2 =~ /<plist[\s>]/) {
                        $type = 'PLIST';
                    }
                }
                $isXML = 1;
            } elsif ($2 eq '<rdf:RDF') {
                $isRDF = 1;     # recognize XMP without x:xmpmeta element
            } elsif ($2 eq '<svg') {
                $isSVG = $isXML = 1;
            }
            if ($isSVG and $$et{XMP_CAPTURE}) {
                $et->Error("ExifTool does not yet support writing of SVG images");
                return 0;
            }
            if ($buff =~ /^\0\0/) {
                $fmt = 'N';     # UTF-32 MM with or without BOM
            } elsif ($buff =~ /^..\0\0/s) {
                $fmt = 'V';     # UTF-32 II with or without BOM
            } elsif (not $fmt) {
                if ($buff =~ /^\0/) {
                    $fmt = 'n'; # UTF-16 MM without BOM
                } elsif ($buff =~ /^.\0/s) {
                    $fmt = 'v'; # UTF-16 II without BOM
                }
            }
        }
        my $size;
        if ($type) {
            if ($type eq 'PLIST') {
                my $ext = $$et{FILE_EXT};
                $type = $ext if $ext and $ext eq 'MODD';
                $tagTablePtr = GetTagTable('Image::ExifTool::PLIST::Main');
                $$dirInfo{XMPParseOpts}{FoundProc} = \&Image::ExifTool::PLIST::FoundTag;
            }
        } else {
            if ($isSVG) {
                $type = 'SVG';
            } elsif ($isXML and not $hasXMP and not $isRDF) {
                $type = 'XML';
                my $ext = $$et{FILE_EXT};
                $type = $ext if $ext and $ext eq 'COS'; # recognize COS by extension
            }
        }
        $et->SetFileType($type, $mime);

        my $fast = $et->Options('FastScan');
        return 1 if $fast and $fast == 3;

        if ($type and $type eq 'INX') {
            # brute force search for first XMP packet in INX file
            # start: '<![CDATA[<?xpacket begin' (24 bytes)
            # end:   '<?xpacket end="r"?>]]>'   (22 bytes)
            $raf->Seek(0, 0) or return 0;
            $raf->Read($buff, 65536) or return 1;
            for (;;) {
                last if $buff =~ /<!\[CDATA\[<\?xpacket begin/g;
                $raf->Read($buf2, 65536) or return 1;
                $buff = substr($buff, -24) . $buf2;
            }
            $buff = substr($buff, pos($buff) - 15); # (discard '<![CDATA[' and before)
            for (;;) {
                last if $buff =~ /<\?xpacket end="[rw]"\?>\]\]>/g;
                my $n = length $buff;
                $raf->Read($buf2, 65536) or $et->Warn('Missing xpacket end'), return 1;
                $buff .= $buf2;
                pos($buff) = $n - 22;   # don't miss end pattern if it was split
            }
            $size = pos($buff) - 3;     # (discard ']]>' and after)
            $buff = substr($buff, 0, $size);
        } else {
            # read the entire file
            $raf->Seek(0, 2) or return 0;
            $size = $raf->Tell() or return 0;
            $raf->Seek(0, 0) or return 0;
            $raf->Read($buff, $size) == $size or return 0;
        }
        $dataPt = \$buff;
        $dirStart = 0;
        $dirLen = $dataLen = $size;
    }

    # decode the first layer of double-encoded UTF text (if necessary)
    if ($double) {
        my ($buf2, $fmt);
        $buff = substr($$dataPt, $dirStart + length $double); # remove leading BOM
        Image::ExifTool::SetWarning(undef); # clear old warning
        local $SIG{'__WARN__'} = \&Image::ExifTool::SetWarning;
        # assume that character data has been re-encoded in UTF, so re-pack
        # as characters and look for warnings indicating a false assumption
        if ($double eq "\xef\xbb\xbf") {
            require Image::ExifTool::Charset;
            my $uni = Image::ExifTool::Charset::Decompose(undef,$buff,'UTF8');
            $buf2 = pack('C*', @$uni);
        } else {
            if (length($double) == 2) {
                $fmt = ($double eq "\xfe\xff") ? 'n' : 'v';
            } else {
                $fmt = ($double eq "\0\0\xfe\xff") ? 'N' : 'V';
            }
            $buf2 = pack('C*', unpack("$fmt*",$buff));
        }
        if (Image::ExifTool::GetWarning()) {
            $et->Warn('Superfluous BOM at start of XMP') unless $$dirInfo{RAF};
            $dataPt = \$buff;   # use XMP with the BOM removed
        } else {
            $et->Warn('XMP is double UTF-encoded');
            $dataPt = \$buf2;   # use the decoded XMP
        }
        $dirStart = 0;
        $dirLen = $dataLen = length $$dataPt;
    }

    # extract XMP/XML as a block if specified
    my $blockName = $$dirInfo{BlockInfo} ? $$dirInfo{BlockInfo}{Name} : 'XMP';
    my $blockExtract = $et->Options('BlockExtract');
    if (($$et{REQ_TAG_LOOKUP}{lc $blockName} or ($$et{TAGS_FROM_FILE} and
        not $$et{EXCL_TAG_LOOKUP}{lc $blockName}) or $blockExtract) and
        (($$et{FileType} eq 'XMP' and $blockName eq 'XMP') or
        ($$dirInfo{DirName} and $$dirInfo{DirName} eq $blockName)))
    {
        $et->FoundTag($$dirInfo{BlockInfo} || 'XMP', substr($$dataPt, $dirStart, $dirLen));
        return 1 if $blockExtract and $blockExtract > 1;
    }

    $tagTablePtr or $tagTablePtr = GetTagTable('Image::ExifTool::XMP::Main');
    if ($et->Options('Verbose') and not $$et{XMP_CAPTURE}) {
        my $dirType = $isSVG ? 'SVG' : $$tagTablePtr{GROUPS}{1};
        $et->VerboseDir($dirType, 0, $dirLen);
    }
#
# convert UTF-16 or UTF-32 encoded XMP to UTF-8 if necessary
#
    my $begin = '<?xpacket begin=';
    my $dirEnd = $dirStart + $dirLen;
    pos($$dataPt) = $dirStart;
    delete $$et{XMP_IS_XML};
    delete $$et{XMP_IS_SVG};
    if ($isXML or $isRDF) {
        $$et{XMP_IS_XML} = $isXML;
        $$et{XMP_IS_SVG} = $isSVG;
        $$et{XMP_NO_XPACKET} = 1 + $bom;
    } elsif ($$dataPt =~ /\G\Q$begin\E/gc) {
        delete $$et{XMP_NO_XPACKET};
    } elsif ($$dataPt =~ /<x(mp)?:x[ma]pmeta/gc and
             pos($$dataPt) > $dirStart and pos($$dataPt) < $dirEnd)
    {
        $$et{XMP_NO_XPACKET} = 1 + $bom;
    } else {
        delete $$et{XMP_NO_XPACKET};
        # check for UTF-16 encoding (insert one \0 between characters)
        $begin = join "\0", split //, $begin;
        # must reset pos because it was killed by previous unsuccessful //g match
        pos($$dataPt) = $dirStart;
        my $badEnc;
        if ($$dataPt =~ /\G(\0)?\Q$begin\E\0./sg) {
            # validate byte ordering by checking for U+FEFF character
            if ($1) {
                # should be big-endian since we had a leading \0
                $fmt = 'n';
                $badEnc = 1 unless $$dataPt =~ /\G\xfe\xff/g;
            } else {
                $fmt = 'v';
                $badEnc = 1 unless $$dataPt =~ /\G\0\xff\xfe/g;
            }



( run in 2.294 seconds using v1.01-cache-2.11-cpan-39bf76dae61 )