Image-ExifTool
view release on metacpan or search on metacpan
lib/Image/ExifTool/XMP.pm view on Meta::CPAN
return $count; # return the number of elements found at this level
}
#------------------------------------------------------------------------------
# Process XMP data
# Inputs: 0) ExifTool ref, 1) dirInfo ref, 2) tag table ref
# Returns: 1 on success
# Notes: The following flavours of XMP files are currently recognized:
# - standard XMP with xpacket, x:xmpmeta and rdf:RDF elements
# - XMP that is missing the xpacket and/or x:xmpmeta elements
# - mutant Microsoft XMP with xmp:xmpmeta element
# - XML files beginning with "<xml"
# - SVG files that begin with "<svg" or "<!DOCTYPE svg"
# - XMP and XML files beginning with a UTF-8 byte order mark
# - UTF-8, UTF-16 and UTF-32 encoded XMP
# - erroneously double-UTF8 encoded XMP
# - otherwise valid files with leading XML comment
sub ProcessXMP($$;$)
{
my ($et, $dirInfo, $tagTablePtr) = @_;
my $dataPt = $$dirInfo{DataPt};
my ($dirStart, $dirLen, $dataLen, $double);
my ($buff, $fmt, $hasXMP, $isXML, $isRDF, $isSVG);
my $rtnVal = 0;
my $bom = 0;
my $path = $et->MetadataPath();
# namespaces and prefixes currently in effect while parsing the file,
# and lookup to translate brain-dead-Microsoft-Photo-software prefixes
$$et{curURI} = { };
$$et{curNS} = { };
$$et{xlatNS} = { };
$$et{definedNS} = { };
delete $$et{XmpAbout};
delete $$et{XmpValidate}; # don't validate by default
delete $$et{XmpValidateLangAlt};
# ignore non-standard XMP while in strict MWG compatibility mode
if (($Image::ExifTool::MWG::strict or $$et{OPTIONS}{Validate}) and
not ($$et{XMP_CAPTURE} or $$et{DOC_NUM}) and
(($$dirInfo{DirName} || '') eq 'XMP' or $$et{FILE_TYPE} eq 'XMP'))
{
$$et{XmpValidate} = { } if $$et{OPTIONS}{Validate};
my $nonStd = ($stdPath{$$et{FILE_TYPE}} and $path ne $stdPath{$$et{FILE_TYPE}});
if ($nonStd and $Image::ExifTool::MWG::strict) {
$et->Warn("Ignored non-standard XMP at $path");
return 1;
}
if ($nonStd) {
$et->Warn("Non-standard XMP at $path", 1);
} elsif (not $$dirInfo{IsExtended}) {
$et->Warn("Duplicate XMP at $path") if $$et{DIR_COUNT}{XMP};
$$et{DIR_COUNT}{XMP} = ($$et{DIR_COUNT}{XMP} || 0) + 1; # count standard XMP
}
}
if ($dataPt) {
$dirStart = $$dirInfo{DirStart} || 0;
$dirLen = $$dirInfo{DirLen} || (length($$dataPt) - $dirStart);
$dataLen = $$dirInfo{DataLen} || length($$dataPt);
# check leading BOM (may indicate double-encoded UTF)
pos($$dataPt) = $dirStart;
if ($$dataPt =~ /\G((\0\0)?\xfe\xff|\xff\xfe(\0\0)?|\xef\xbb\xbf)\0*<\0*\?\0*x\0*p\0*a\0*c\0*k\0*e\0*t/g) {
$double = $1;
} else {
# handle UTF-16/32 XML
pos($$dataPt) = $dirStart;
if ($$dataPt =~ /\G((\0\0)?\xfe\xff|\xff\xfe(\0\0)?|\xef\xbb\xbf)\0*<\0*\?\0*x\0*m\0*l\0* /g) {
my $tmp = $1;
$fmt = $tmp =~ /\xfe\xff/ ? 'n' : 'v';
$fmt = uc($fmt) if $tmp =~ /\0\0/;
$isXML = 1;
}
}
} else {
my ($type, $mime, $buf2, $buf3);
# read information from XMP file
my $raf = $$dirInfo{RAF} or return 0;
$raf->Read($buff, 256) or return 0;
($buf2 = $buff) =~ tr/\0//d; # cheap conversion to UTF-8
# remove leading comments if they exist (eg. ImageIngester)
while ($buf2 =~ /^\s*<!--/) {
# remove the comment if it is complete
if ($buf2 =~ s/^\s*<!--.*?-->\s+//s) {
# continue with parsing if we have more than 128 bytes remaining
next if length $buf2 > 128;
} else {
# don't read more than 10k when looking for the end of comment
return 0 if length($buf2) > 10000;
}
$raf->Read($buf3, 256) or last; # read more data if available
$buff .= $buf3;
$buf3 =~ tr/\0//d;
$buf2 .= $buf3;
}
# check to see if this is XMP format
# (CS2 writes .XMP files without the "xpacket begin")
if ($buf2 =~ /^\s*(<\?xpacket begin=|<x(mp)?:x[ma]pmeta)/) {
$hasXMP = 1;
} else {
# also recognize XML files and .XMP files with BOM and without x:xmpmeta
if ($buf2 =~ /^(\xfe\xff)(<\?xml|<rdf:RDF|<x(mp)?:x[ma]pmeta)/g) {
$fmt = 'n'; # UTF-16 or 32 MM with BOM
} elsif ($buf2 =~ /^(\xff\xfe)(<\?xml|<rdf:RDF|<x(mp)?:x[ma]pmeta)/g) {
$fmt = 'v'; # UTF-16 or 32 II with BOM
} elsif ($buf2 =~ /^(\xef\xbb\xbf)?(<\?xml|<rdf:RDF|<x(mp)?:x[ma]pmeta|<svg\b)/g) {
$fmt = 0; # UTF-8 with BOM or unknown encoding without BOM
} elsif ($buf2 =~ /^(\xfe\xff|\xff\xfe|\xef\xbb\xbf)(<\?xpacket begin=)/g) {
$double = $1; # double-encoded UTF
} else {
return 0; # not recognized XMP or XML
}
$bom = 1 if $1;
if ($2 eq '<?xml') {
if (defined $fmt and not $fmt and $buf2 =~ /^[^\n\r]*[\n\r]+<\?aid /s) {
undef $$et{XmpValidate}; # don't validate INX
if ($$et{XMP_CAPTURE}) {
$et->Error("ExifTool does not yet support writing of INX files");
return 0;
}
$type = 'INX';
} elsif ($buf2 =~ /<x(mp)?:x[ma]pmeta/) {
$hasXMP = 1;
} else {
undef $$et{XmpValidate}; # don't validate XML
# identify SVG images and PLIST files by DOCTYPE if available
if ($buf2 =~ /<!DOCTYPE\s+(\w+)/) {
if ($1 eq 'svg') {
$isSVG = 1;
} elsif ($1 eq 'plist') {
$type = 'PLIST';
} elsif ($1 eq 'REDXIF') {
$type = 'RMD';
$mime = 'application/xml';
} elsif ($1 ne 'fcpxml') { # Final Cut Pro XML
return 0;
}
} elsif ($buf2 =~ /<svg[\s>]/) {
$isSVG = 1;
} elsif ($buf2 =~ /<rdf:RDF/) {
$isRDF = 1;
} elsif ($buf2 =~ /<plist[\s>]/) {
$type = 'PLIST';
}
}
$isXML = 1;
} elsif ($2 eq '<rdf:RDF') {
$isRDF = 1; # recognize XMP without x:xmpmeta element
} elsif ($2 eq '<svg') {
$isSVG = $isXML = 1;
}
if ($isSVG and $$et{XMP_CAPTURE}) {
$et->Error("ExifTool does not yet support writing of SVG images");
return 0;
}
if ($buff =~ /^\0\0/) {
$fmt = 'N'; # UTF-32 MM with or without BOM
} elsif ($buff =~ /^..\0\0/s) {
$fmt = 'V'; # UTF-32 II with or without BOM
} elsif (not $fmt) {
if ($buff =~ /^\0/) {
$fmt = 'n'; # UTF-16 MM without BOM
} elsif ($buff =~ /^.\0/s) {
$fmt = 'v'; # UTF-16 II without BOM
}
}
}
my $size;
if ($type) {
if ($type eq 'PLIST') {
my $ext = $$et{FILE_EXT};
$type = $ext if $ext and $ext eq 'MODD';
$tagTablePtr = GetTagTable('Image::ExifTool::PLIST::Main');
$$dirInfo{XMPParseOpts}{FoundProc} = \&Image::ExifTool::PLIST::FoundTag;
}
} else {
if ($isSVG) {
$type = 'SVG';
} elsif ($isXML and not $hasXMP and not $isRDF) {
$type = 'XML';
my $ext = $$et{FILE_EXT};
$type = $ext if $ext and $ext eq 'COS'; # recognize COS by extension
}
}
$et->SetFileType($type, $mime);
my $fast = $et->Options('FastScan');
return 1 if $fast and $fast == 3;
if ($type and $type eq 'INX') {
# brute force search for first XMP packet in INX file
# start: '<![CDATA[<?xpacket begin' (24 bytes)
# end: '<?xpacket end="r"?>]]>' (22 bytes)
$raf->Seek(0, 0) or return 0;
$raf->Read($buff, 65536) or return 1;
for (;;) {
last if $buff =~ /<!\[CDATA\[<\?xpacket begin/g;
$raf->Read($buf2, 65536) or return 1;
$buff = substr($buff, -24) . $buf2;
}
$buff = substr($buff, pos($buff) - 15); # (discard '<![CDATA[' and before)
for (;;) {
last if $buff =~ /<\?xpacket end="[rw]"\?>\]\]>/g;
my $n = length $buff;
$raf->Read($buf2, 65536) or $et->Warn('Missing xpacket end'), return 1;
$buff .= $buf2;
pos($buff) = $n - 22; # don't miss end pattern if it was split
}
$size = pos($buff) - 3; # (discard ']]>' and after)
$buff = substr($buff, 0, $size);
} else {
# read the entire file
$raf->Seek(0, 2) or return 0;
$size = $raf->Tell() or return 0;
$raf->Seek(0, 0) or return 0;
$raf->Read($buff, $size) == $size or return 0;
}
$dataPt = \$buff;
$dirStart = 0;
$dirLen = $dataLen = $size;
}
# decode the first layer of double-encoded UTF text (if necessary)
if ($double) {
my ($buf2, $fmt);
$buff = substr($$dataPt, $dirStart + length $double); # remove leading BOM
Image::ExifTool::SetWarning(undef); # clear old warning
local $SIG{'__WARN__'} = \&Image::ExifTool::SetWarning;
# assume that character data has been re-encoded in UTF, so re-pack
# as characters and look for warnings indicating a false assumption
if ($double eq "\xef\xbb\xbf") {
require Image::ExifTool::Charset;
my $uni = Image::ExifTool::Charset::Decompose(undef,$buff,'UTF8');
$buf2 = pack('C*', @$uni);
} else {
if (length($double) == 2) {
$fmt = ($double eq "\xfe\xff") ? 'n' : 'v';
} else {
$fmt = ($double eq "\0\0\xfe\xff") ? 'N' : 'V';
}
$buf2 = pack('C*', unpack("$fmt*",$buff));
}
if (Image::ExifTool::GetWarning()) {
$et->Warn('Superfluous BOM at start of XMP') unless $$dirInfo{RAF};
$dataPt = \$buff; # use XMP with the BOM removed
} else {
$et->Warn('XMP is double UTF-encoded');
$dataPt = \$buf2; # use the decoded XMP
}
$dirStart = 0;
$dirLen = $dataLen = length $$dataPt;
}
# extract XMP/XML as a block if specified
my $blockName = $$dirInfo{BlockInfo} ? $$dirInfo{BlockInfo}{Name} : 'XMP';
my $blockExtract = $et->Options('BlockExtract');
if (($$et{REQ_TAG_LOOKUP}{lc $blockName} or ($$et{TAGS_FROM_FILE} and
not $$et{EXCL_TAG_LOOKUP}{lc $blockName}) or $blockExtract) and
(($$et{FileType} eq 'XMP' and $blockName eq 'XMP') or
($$dirInfo{DirName} and $$dirInfo{DirName} eq $blockName)))
{
$et->FoundTag($$dirInfo{BlockInfo} || 'XMP', substr($$dataPt, $dirStart, $dirLen));
return 1 if $blockExtract and $blockExtract > 1;
}
$tagTablePtr or $tagTablePtr = GetTagTable('Image::ExifTool::XMP::Main');
if ($et->Options('Verbose') and not $$et{XMP_CAPTURE}) {
my $dirType = $isSVG ? 'SVG' : $$tagTablePtr{GROUPS}{1};
$et->VerboseDir($dirType, 0, $dirLen);
}
#
# convert UTF-16 or UTF-32 encoded XMP to UTF-8 if necessary
#
my $begin = '<?xpacket begin=';
my $dirEnd = $dirStart + $dirLen;
pos($$dataPt) = $dirStart;
delete $$et{XMP_IS_XML};
delete $$et{XMP_IS_SVG};
if ($isXML or $isRDF) {
$$et{XMP_IS_XML} = $isXML;
$$et{XMP_IS_SVG} = $isSVG;
$$et{XMP_NO_XPACKET} = 1 + $bom;
} elsif ($$dataPt =~ /\G\Q$begin\E/gc) {
delete $$et{XMP_NO_XPACKET};
} elsif ($$dataPt =~ /<x(mp)?:x[ma]pmeta/gc and
pos($$dataPt) > $dirStart and pos($$dataPt) < $dirEnd)
{
$$et{XMP_NO_XPACKET} = 1 + $bom;
} else {
delete $$et{XMP_NO_XPACKET};
# check for UTF-16 encoding (insert one \0 between characters)
$begin = join "\0", split //, $begin;
# must reset pos because it was killed by previous unsuccessful //g match
pos($$dataPt) = $dirStart;
my $badEnc;
if ($$dataPt =~ /\G(\0)?\Q$begin\E\0./sg) {
# validate byte ordering by checking for U+FEFF character
if ($1) {
# should be big-endian since we had a leading \0
$fmt = 'n';
$badEnc = 1 unless $$dataPt =~ /\G\xfe\xff/g;
} else {
$fmt = 'v';
$badEnc = 1 unless $$dataPt =~ /\G\0\xff\xfe/g;
}
( run in 2.294 seconds using v1.01-cache-2.11-cpan-39bf76dae61 )