Image-ExifTool
view release on metacpan or search on metacpan
lib/Image/ExifTool/Charset.pm view on Meta::CPAN
#------------------------------------------------------------------------------
# Decompose string with specified encoding into an array of integer code points
# Inputs: 0) ExifTool object ref (or undef), 1) string, 2) character set name,
# 3) optional byte order ('II','MM','Unknown' or undef to use ExifTool ordering)
# Returns: Reference to array of Unicode values
# Notes: Accepts any type of character set
# - byte order only used for fixed-width 2-byte and 4-byte character sets
# - byte order mark observed and then removed with UCS2 and UCS4
# - no warnings are issued if ExifTool object is not provided
# - sets ExifTool WrongByteOrder flag if byte order is Unknown and current order is wrong
sub Decompose($$$;$)
{
local $_;
my ($et, $val, $charset) = @_; # ($byteOrder assigned later if required)
my $type = $csType{$charset};
my (@uni, $conv);
if ($type & 0x001) {
$conv = LoadCharset($charset);
unless ($conv) {
# (shouldn't happen)
$et->Warn("Invalid character set $charset") if $et;
return \@uni; # error!
}
} elsif ($type == 0x100) {
# convert ASCII and UTF8 (treat ASCII as UTF8)
if ($] < 5.006001) {
# do it ourself
@uni = Image::ExifTool::UnpackUTF8($val);
} else {
# handle warnings from malformed UTF-8
undef $Image::ExifTool::evalWarning;
local $SIG{'__WARN__'} = \&Image::ExifTool::SetWarning;
# (somehow the meaning of "U0" was reversed in Perl 5.10.0!)
@uni = unpack($] < 5.010000 ? 'U0U*' : 'C0U*', $val);
# issue warning if we had errors
if ($Image::ExifTool::evalWarning and $et and not $$et{WarnBadUTF8}) {
$et->Warn('Malformed UTF-8 character(s)');
$$et{WarnBadUTF8} = 1;
}
}
return \@uni; # all done!
}
if ($type & 0x100) { # 1-byte fixed-width characters
@uni = unpack('C*', $val);
foreach (@uni) {
$_ = $$conv{$_} if defined $$conv{$_};
}
} elsif ($type & 0x600) { # 2-byte or 4-byte fixed-width characters
my $unknown;
my $byteOrder = $_[3];
if (not $byteOrder) {
$byteOrder = GetByteOrder();
} elsif ($byteOrder eq 'Unknown') {
$byteOrder = GetByteOrder();
$unknown = 1;
}
my $fmt = $byteOrder eq 'MM' ? 'n*' : 'v*';
if ($type & 0x400) { # 4-byte
$fmt = uc $fmt; # unpack as 'N*' or 'V*'
# honour BOM if it exists
$val =~ s/^(\0\0\xfe\xff|\xff\xfe\0\0)// and $fmt = $1 eq "\0\0\xfe\xff" ? 'N*' : 'V*';
undef $unknown; # (byte order logic applies to 2-byte only)
} elsif ($val =~ s/^(\xfe\xff|\xff\xfe)//) {
$fmt = $1 eq "\xfe\xff" ? 'n*' : 'v*';
undef $unknown;
}
# convert from UCS2 or UCS4
@uni = unpack($fmt, $val);
if (not $conv) {
# no translation necessary
if ($unknown) {
# check the byte order
my (%bh, %bl);
my ($zh, $zl) = (0, 0);
foreach (@uni) {
$bh{$_ >> 8} = 1;
$bl{$_ & 0xff} = 1;
++$zh unless $_ & 0xff00;
++$zl unless $_ & 0x00ff;
}
# count the number of unique values in the hi and lo bytes
my ($bh, $bl) = (scalar(keys %bh), scalar(keys %bl));
# the byte with the greater number of unique values should be
# the low-order byte, otherwise the byte which is zero more
# often is likely the high-order byte
if ($bh > $bl or ($bh == $bl and $zl > $zh)) {
# we guessed wrong, so decode using the other byte order
$fmt =~ tr/nvNV/vnVN/;
@uni = unpack($fmt, $val);
$$et{WrongByteOrder} = 1;
}
}
# handle surrogate pairs of UTF-16
if ($charset eq 'UTF16') {
my $i;
for ($i=0; $i<$#uni; ++$i) {
next unless ($uni[$i] & 0xfc00) == 0xd800 and
($uni[$i+1] & 0xfc00) == 0xdc00;
my $cp = 0x10000 + (($uni[$i] & 0x3ff) << 10) + ($uni[$i+1] & 0x3ff);
splice(@uni, $i, 2, $cp);
}
}
} elsif ($unknown) {
# count encoding errors as we do the translation
my $e1 = 0;
foreach (@uni) {
defined $$conv{$_} and $_ = $$conv{$_}, next;
++$e1;
}
# try the other byte order if we had any errors
if ($e1) {
$fmt = $byteOrder eq 'MM' ? 'v*' : 'n*'; #(reversed)
my @try = unpack($fmt, $val);
my $e2 = 0;
foreach (@try) {
defined $$conv{$_} and $_ = $$conv{$_}, next;
++$e2;
}
# use this byte order if there are fewer errors
( run in 0.934 second using v1.01-cache-2.11-cpan-39bf76dae61 )