Image-ExifTool

 view release on metacpan or  search on metacpan

lib/Image/ExifTool/Charset.pm  view on Meta::CPAN

#------------------------------------------------------------------------------
# Decompose string with specified encoding into an array of integer code points
# Inputs: 0) ExifTool object ref (or undef), 1) string, 2) character set name,
#         3) optional byte order ('II','MM','Unknown' or undef to use ExifTool ordering)
# Returns: Reference to array of Unicode values
# Notes: Accepts any type of character set
# - byte order only used for fixed-width 2-byte and 4-byte character sets
# - byte order mark observed and then removed with UCS2 and UCS4
# - no warnings are issued if ExifTool object is not provided
# - sets ExifTool WrongByteOrder flag if byte order is Unknown and current order is wrong
sub Decompose($$$;$)
{
    local $_;
    my ($et, $val, $charset) = @_; # ($byteOrder assigned later if required)
    my $type = $csType{$charset};
    my (@uni, $conv);

    if ($type & 0x001) {
        $conv = LoadCharset($charset);
        unless ($conv) {
            # (shouldn't happen)
            $et->Warn("Invalid character set $charset") if $et;
            return \@uni;   # error!
        }
    } elsif ($type == 0x100) {
        # convert ASCII and UTF8 (treat ASCII as UTF8)
        if ($] < 5.006001) {
            # do it ourself
            @uni = Image::ExifTool::UnpackUTF8($val);
        } else {
            # handle warnings from malformed UTF-8
            undef $Image::ExifTool::evalWarning;
            local $SIG{'__WARN__'} = \&Image::ExifTool::SetWarning;
            # (somehow the meaning of "U0" was reversed in Perl 5.10.0!)
            @uni = unpack($] < 5.010000 ? 'U0U*' : 'C0U*', $val);
            # issue warning if we had errors
            if ($Image::ExifTool::evalWarning and $et and not $$et{WarnBadUTF8}) {
                $et->Warn('Malformed UTF-8 character(s)');
                $$et{WarnBadUTF8} = 1;
            }
        }
        return \@uni;       # all done!
    }
    if ($type & 0x100) {        # 1-byte fixed-width characters
        @uni = unpack('C*', $val);
        foreach (@uni) {
            $_ = $$conv{$_} if defined $$conv{$_};
        }
    } elsif ($type & 0x600) {   # 2-byte or 4-byte fixed-width characters
        my $unknown;
        my $byteOrder = $_[3];
        if (not $byteOrder) {
            $byteOrder = GetByteOrder();
        } elsif ($byteOrder eq 'Unknown') {
            $byteOrder = GetByteOrder();
            $unknown = 1;
        }
        my $fmt = $byteOrder eq 'MM' ? 'n*' : 'v*';
        if ($type & 0x400) {    # 4-byte
            $fmt = uc $fmt; # unpack as 'N*' or 'V*'
            # honour BOM if it exists
            $val =~ s/^(\0\0\xfe\xff|\xff\xfe\0\0)// and $fmt = $1 eq "\0\0\xfe\xff" ? 'N*' : 'V*';
            undef $unknown; # (byte order logic applies to 2-byte only)
        } elsif ($val =~ s/^(\xfe\xff|\xff\xfe)//) {
            $fmt = $1 eq "\xfe\xff" ? 'n*' : 'v*';
            undef $unknown;
        }
        # convert from UCS2 or UCS4
        @uni = unpack($fmt, $val);

        if (not $conv) {
            # no translation necessary
            if ($unknown) {
                # check the byte order
                my (%bh, %bl);
                my ($zh, $zl) = (0, 0);
                foreach (@uni) {
                    $bh{$_ >> 8} = 1;
                    $bl{$_ & 0xff} = 1;
                    ++$zh unless $_ & 0xff00;
                    ++$zl unless $_ & 0x00ff;
                }
                # count the number of unique values in the hi and lo bytes
                my ($bh, $bl) = (scalar(keys %bh), scalar(keys %bl));
                # the byte with the greater number of unique values should be
                # the low-order byte, otherwise the byte which is zero more
                # often is likely the high-order byte
                if ($bh > $bl or ($bh == $bl and $zl > $zh)) {
                    # we guessed wrong, so decode using the other byte order
                    $fmt =~ tr/nvNV/vnVN/;
                    @uni = unpack($fmt, $val);
                    $$et{WrongByteOrder} = 1;
                }
            }
            # handle surrogate pairs of UTF-16
            if ($charset eq 'UTF16') {
                my $i;
                for ($i=0; $i<$#uni; ++$i) {
                    next unless ($uni[$i]   & 0xfc00) == 0xd800 and
                                ($uni[$i+1] & 0xfc00) == 0xdc00;
                    my $cp = 0x10000 + (($uni[$i] & 0x3ff) << 10) + ($uni[$i+1] & 0x3ff);
                    splice(@uni, $i, 2, $cp);
                }
            }
        } elsif ($unknown) {
            # count encoding errors as we do the translation
            my $e1 = 0;
            foreach (@uni) {
                defined $$conv{$_} and $_ = $$conv{$_}, next;
                ++$e1;
            }
            # try the other byte order if we had any errors
            if ($e1) {
                $fmt = $byteOrder eq 'MM' ? 'v*' : 'n*'; #(reversed)
                my @try = unpack($fmt, $val);
                my $e2 = 0;
                foreach (@try) {
                    defined $$conv{$_} and $_ = $$conv{$_}, next;
                    ++$e2;
                }
                # use this byte order if there are fewer errors



( run in 0.934 second using v1.01-cache-2.11-cpan-39bf76dae61 )