Alvis-Convert
view release on metacpan or search on metacpan
lib/Alvis/Document/Encoding.pm view on Meta::CPAN
#############################################################################
#
# Methods
#
##############################################################################
sub new
{
my $proto=shift;
my $class=ref($proto)||$proto;
my $parent=ref($proto)&&$proto;
my $self={};
bless($self,$class);
$self->_init(@_);
$self->_set_err_state($ERR_OK);
$self->{docTypeWizard}=
Alvis::Document::Type->new(defaultType=>
$self->{defaultDocType},
defaultSubType=>
$self->{defaultDocSubType});
if (!defined($self->{docTypeWizard}))
{
$self->_set_err_state($ERR_DOC_TYPE_WIZARD);
return undef;
}
Encode::Guess->add_suspects(qw/latin1 iso-8859-2 iso-8859-5 iso-8859-10 iso-8859-7 windows-1252/);
return $self;
}
sub _init
{
my $self=shift;
$self->{defaultDocType}='text';
$self->{defaultDocSubType}='html';
$self->{defaultEncoding}='iso-8859-1';
if (defined(@_))
{
my %args=@_;
@$self{ keys %args }=values(%args);
}
}
##########################################################################
#
# Public methods
#
#########################################################################
#
# Returns 1 if the (decimal) character code is legal UTF-8
#
sub code_is_utf8
{
my $self=shift;
my $dec_code=shift;
# check for invalid codes
if ($dec_code<0 || $dec_code>1114111 || $InvalidUtf8Code{$dec_code})
{
return 0;
}
return 1;
}
#
# Returns 1 if all of the characters of the text are legal UTF-8
# Else, returns 0 and sets an error message specifying the location
# (1..) of the first illegal character code
# If you wish to obtain the position and offending code, pass a
# hash ref
#
sub is_utf8
{
my $self=shift;
my $text=shift;
my $err=shift;
# Go over the text char by char and check for invalid char codes
my @chars=split(//,$text);
my $i=1;
for my $char (@chars)
{
# We test for valid code
#
my $code=ord($char);
if (!$self->code_is_utf8($code))
{
$self->_set_err_state($ERR_ILLEGAL_CODE,
sprintf("Position: #%d, character code: %#x",
$i,$code));
if (defined($err) && ref($err) eq 'HASH')
{
$err->{pos}=$i;
$err->{code}=$code;
}
return 0;
}
$i++;
}
return 1;
}
#
# type: The type of the document as one of the recognized types
# defined in Alvis::Document::Type (superset of MIME).
#
sub guess
{
my $self=shift;
my $text=shift;
my $type=shift;
my $sub_type=shift;
$self->_set_err_state($ERR_OK); # clean the slate
if (!defined($text))
{
$self->_set_err_state($ERR_DOC);
return undef;
}
if (!(defined($type) && defined($sub_type)))
{
($type,$sub_type)=$self->{docTypeWizard}->guess($text);
if (!(defined($type) && defined($sub_type)))
{
$self->_set_err_state($ERR_TYPE_GUESS,
$self->{docTypeWizard}->errmsg());
return undef;
}
}
# warn "($type,$sub_type)";
my @guesses;
if ($type eq 'text')
{
if ($sub_type eq 'html')
{
@guesses=$self->_HTML($text);
if (scalar(@guesses)==0)
{
$self->_set_err_state($ERR_GUESS);
if (defined($self->{defaultEncoding}))
{
@guesses=($self->{defaultEncoding});
}
}
}
elsif ($sub_type eq 'xhtml')
{
@guesses=$self->_XHTML($text);
if (scalar(@guesses)==0)
{
$self->_set_err_state($ERR_GUESS);
if (defined($self->{defaultEncoding}))
{
@guesses=($self->{defaultEncoding});
}
}
}
elsif ($sub_type eq 'plain')
{
@guesses=$self->_plain_text($text);
if (scalar(@guesses)==0)
{
$self->_set_err_state($ERR_GUESS);
if (defined($self->{defaultEncoding}))
{
@guesses=($self->{defaultEncoding});
}
}
}
}
return @guesses;
}
sub from_to
{
my $self=shift;
my $text=shift;
my $source_enc=shift;
my $target_enc=shift;
{
eval
{
Encode::from_to($text,
$source_enc,$target_enc,0);
# $source_enc,$target_enc,Encode::FB_QUIET);
};
if ($@)
{
my $err=$@;
$err=~s/ at .*$//isgo;
$self->_set_err_state($ERR_WRONG_GUESS,
"source encoding: $source_enc, " .
"target encoding: $target_enc. Why? $err.");
return undef;
}
if ($target_enc=~/^\s*utf-?8\s*$/isgo)
{
# leaves the bl***y UTF-8 flag on
Encode::_utf8_on($text);
}
}
return $text;
}
#
# Should always leave the UTF-8 flag on, if target is UTF-8
#
sub convert
{
my $self=shift;
my $text=shift;
my $source_enc=shift;
my $target_enc=shift;
my %err;
if ($source_enc=~/^\s*utf-?8\s*$/isgo)
{
if (!$self->is_utf8($text,\%err))
{
$self->_set_err_state($ERR_ILLEGAL_CHAR,
" Position: $err{pos}," .
"Code:$err{code}");
return undef;
}
}
my $try=$self->from_to($text,$source_enc,$target_enc);
if (!defined($try))
{
my @possible_src_typo_fixes=$self->guess_typo_fixes($source_enc);
my @possible_trg_typo_fixes=$self->guess_typo_fixes($target_enc);
for my $src_enc_guess ($self->guess_typo_fixes($source_enc))
{
for my $trg_enc_guess ($self->guess_typo_fixes($target_enc))
{
my $try=$self->from_to($text,$src_enc_guess,$trg_enc_guess);
if (defined($try))
{
return $try;
}
}
}
$self->_set_err_state($ERR_UNABLE_TO_GUESS);
return undef;
}
return $try;
}
sub guess_and_convert
{
my $self=shift;
my $text=shift;
my $type=shift;
my $sub_type=shift;
my $target_enc=shift;
$self->_set_err_state($ERR_OK);
if (!defined($text))
{
$self->_set_err_state($ERR_DOC);
return undef;
}
if (!(defined($type) && defined($sub_type)))
{
($type,$sub_type)=$self->{docTypeWizard}->guess($text);
if (!(defined($type) && defined($sub_type)))
{
$self->_set_err_state($ERR_TYPE_GUESS,
$self->{docTypeWizard}->errmsg());
return undef;
}
}
my @enc_guesses=$self->guess($text,$type,$sub_type);
if (scalar(@enc_guesses)==0)
{
$self->_set_err_state($ERR_GUESS);
return undef;
}
my $result;
for my $enc_guess (@enc_guesses)
{
if ( $target_enc eq "utf8" && ( $enc_guess =~ /utf-?8/i ) ) {
return $text;
} else {
$result=$self->convert($text,$enc_guess,$target_enc);
if (defined($result))
{
return $result;
}
}
}
if (!defined($result))
{
# test if its UTF-8 already
&Encode::_utf8_on($text);
if ( &Encode::is_utf8($text) ) {
return $text;
}
&Encode::_utf8_off($text);
$self->_set_err_state($ERR_GUESS_AND_CONVERT);
# print STDERR join("==", @enc_guesses) . " -> $target_enc : undef\n";
# print STDERR "\n$text\n\n";
return undef;
}
return $result;
}
sub guess_typo_fixes
{
my $self=shift;
my $typo=shift;
my @possibilities=($typo);
if ($typo=~/^\s*(?:utf|uft)-?8\s*$/isgo)
{
push(@possibilities,'utf8');
}
if ($typo=~/^\s*(?:utf|uft)-16\s*$/isgo)
{
push(@possibilities,'UTF-16');
}
if ($typo=~/^\s*iso-?8559-?1\s*$/isgo)
{
push(@possibilities,'iso-8859-1');
}
if ($typo=~/^\s*ecu-?(kr|jp|cn|tw|jisx0213)\s*$/isgo)
{
push(@possibilities,"euc-$1");
}
if ($typo=~/^\s*(?:uft|utf)-?32\s*$/isgo)
{
push(@possibilities,'UTF-32');
}
if ($typo=~/^\s*(?:acsii|asici)\s*$/isgo)
{
push(@possibilities,'ascii');
}
if ($typo=~/^\s*(?:acsii|asici)-?ctrl\s*$/isgo)
{
push(@possibilities,'ascii-ctrl');
}
if ($typo=~/^\s*(?:utf|uft)-?7\s*$/isgo)
{
push(@possibilities,'UTF-7');
}
if ($typo=~/^\s*macintosh\s*$/isgo)
{
for (my $i=1; $i<=11; $i++)
{
push(@possibilities,"iso-8859-$i");
}
push(@possibilities,'viscii');
}
if ($typo=~/^\s*iso-8559-(\d)\s*$/isgo)
{
push(@possibilities,"iso-8859-$1");
}
if ($typo=~/^\s*iso-8895-(\d)\s*$/isgo)
{
push(@possibilities,"iso-8859-$1");
}
if ($typo=~/^\s*(?:utf|uft)-?16be\s*$/isgo)
{
push(@possibilities,'UTF-16BE');
}
if ($typo=~/^\s*(?:utf|uft)-?16le\s*$/isgo)
{
push(@possibilities,'UTF-16LE');
}
return @possibilities;
}
########################################################################3
#
# Private methods
lib/Alvis/Document/Encoding.pm view on Meta::CPAN
@guesses=HTML::Encoding::encoding_from_xml_document($text);
};
if ($@)
{
$self->_set_err_state($ERR_XML,"$@");
return ();
}
if (scalar(@guesses))
{
return @guesses;
}
return $self->_HTML($text);
}
sub _plain_text
{
my $self=shift;
my $text=shift;
if (!defined($text) || length($text)<1)
{
$self->_set_err_state($ERR_DOC);
return ();
}
my $enc=guess_encoding($text);
if (ref($enc))
{
return ($enc->name());
}
else
{
$self->_set_err_state($ERR_ENCODE_GUESS,"$@");
return ();
}
}
1;
__END__
=head1 NAME
Alvis::Encoding - Perl extension for guessing and checking the encoding of
documents.
=head1 SYNOPSIS
use Alvis::Encoding;
# Create a new instance
my $e=Alvis::Encoding->new();
if (!defined($e))
{
die "Instantiating Alvis::Encoding failed.";
}
# Check that a (decimal) character code is legal UTF-8
my $code=55;
if (!$e->code_is_utf8($code))
{
# The message will contain the position and the offending character's code
die $e->errmsg();
}
# Check that a text is legal UTF-8
my $text;
if (!$e->is_utf8($text))
{
# The message will contain the position and the offending character's code
die $e->errmsg();
}
# If you need to obtain the position (1..) and the offending character,
# pass a placeholder in a hash ref argument:
my %err=();
if (!$e->is_utf8($text,\%err))
{
my $position=$err{pos};
my $code=$err{code};
. . .
}
#
# Guess the encoding of a document given a guess for its type
#
my $type_guesser=Alvis::Document::Type->new();
my ($doc_type,$doc_sub_type)=$type_guesser->guess($text);
my $doc_encoding=$e->guess($text,$doc_type,$doc_sub_type);
if (!defined($doc_encoding))
{
die('Cannot guess. ' . $e->errmsg());
}
#
# Try converting a document to UTF-8 with only its type known
#
my $type_guesser=Alvis::Document::Type->new();
my ($doc_type,$doc_sub_type)=$type_guesser->guess($text);
my $doc_in_utf8=$e->try_to_convert_to_utf8($text,$doc_type,$doc_sub_type);
if (!defined($doc_in_utf8))
{
die('Cannot guess. ' . $e->errmsg());
}
# Try to guess what was meant
my @possibilities=$e->guess_typo_fixes('uft-8');
=head1 DESCRIPTION
A collection of methods for guessing, confirming and fixing the encoding
of a document.
=head1 METHODS
=head2 new()
Options:
defaultDocType default type for a document. Default: text.
defaultDocSubType default sub type for a document. Default: html.
defaultEncoding default encoding for a document. Default: iso-8859-1.
=head2 code_is_utf8(decimal_code)
Returns 1 if the (decimal) character code is legal UTF-8.
=head2 is_utf8(text,err_hash_ref)
Returns 1 if all of the characters of $text are legal UTF-8
Else, returns 0 and sets an error message specifying the location
(1..) of the first illegal character code
If you wish to obtain the position and offending code, pass a
hash ref ($err_hash_ref). The info is in $err_hash_ref->{pos} and
$err_hash_ref->{code}.
=head2 guess(text,doc_type,doc_sub_type)
Guess the encoding of a document given a guess for its type (and subtype).
=head2 guess_and_convert(text,doc_type,doc_sub_type,target_encoding)
Tries to first guess the encoding of the document given a guess at its
type and subtype, and then tries to convert it to $target_encoding.
=head2 convert(text,source_encoding,target_encoding)
Tries to convert $text from $source_encoding to $target_encoding.
=head2 guess_typo_fixes($typo)
Returns a set of guesses for the meant encoding in a case of an encoding
name containing typos.
=head2 errmsg()
Returns a stack of error messages, if any. Empty string otherwise.
=head1 SEE ALSO
Alvis::Document::Type
=head1 AUTHOR
Kimmo Valtonen, E<lt>kimmo.valtonen@hiit.fiE<gt>
=head1 COPYRIGHT AND LICENSE
Copyright (C) 2006 by Kimmo Valtonen
This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.4 or,
at your option, any later version of Perl 5 you may have available.
=cut
( run in 2.184 seconds using v1.01-cache-2.11-cpan-140bd7fdf52 )