Alvis-Convert

 view release on metacpan or  search on metacpan

lib/Alvis/Document/Encoding.pm  view on Meta::CPAN

package Alvis::Document::Encoding;

use warnings;
use strict;

$Alvis::Document::Encoding::VERSION = '0.1';

use HTML::Encoding;
use Encode;
use Encode::Guess;

use Alvis::Document::Type;

#############################################################################
#
#  A collection of routines for checking UTF-8 validity, guessing the
#  encoding of a document etc.
#
#############################################################################

#############################################################################
#
#     Global variables & constants
#
##############################################################################

my $DEBUG=0;

#
# Invalid Utf-8 codes (UTF-16 surrogates and non-valid codes) 
#
my %InvalidUtf8Code=(
 55296=>1,  55297=>1,  55298=>1,  55299=>1,  55300=>1,  55301=>1,  55302=>1, 
 55303=>1,  55304=>1,  55305=>1,  55306=>1,  55307=>1,  55308=>1,  55309=>1, 
 55310=>1,  55311=>1,  55312=>1,  55313=>1,  55314=>1,  55315=>1,  55316=>1, 
 55317=>1,  55318=>1,  55319=>1,  55320=>1,  55321=>1,  55322=>1,  55323=>1, 
 55324=>1,  55325=>1,  55326=>1,  55327=>1,  55328=>1,  55329=>1,  55330=>1, 
 55331=>1,  55332=>1,  55333=>1,  55334=>1,  55335=>1,  55336=>1,  55337=>1, 
 55338=>1,  55339=>1,  55340=>1,  55341=>1,  55342=>1,  55343=>1,  55344=>1, 
 55345=>1,  55346=>1,  55347=>1,  55348=>1,  55349=>1,  55350=>1,  55351=>1, 
 55352=>1,  55353=>1,  55354=>1,  55355=>1,  55356=>1,  55357=>1,  55358=>1, 
 55359=>1,  55360=>1,  55361=>1,  55362=>1,  55363=>1,  55364=>1,  55365=>1, 
 55366=>1,  55367=>1,  55368=>1,  55369=>1,  55370=>1,  55371=>1,  55372=>1, 
 55373=>1,  55374=>1,  55375=>1,  55376=>1,  55377=>1,  55378=>1,  55379=>1, 
 55380=>1,  55381=>1,  55382=>1,  55383=>1,  55384=>1,  55385=>1,  55386=>1, 
 55387=>1,  55388=>1,  55389=>1,  55390=>1,  55391=>1,  55392=>1,  55393=>1, 
 55394=>1,  55395=>1,  55396=>1,  55397=>1,  55398=>1,  55399=>1,  55400=>1, 
 55401=>1,  55402=>1,  55403=>1,  55404=>1,  55405=>1,  55406=>1,  55407=>1, 
 55408=>1,  55409=>1,  55410=>1,  55411=>1,  55412=>1,  55413=>1,  55414=>1, 
 55415=>1,  55416=>1,  55417=>1,  55418=>1,  55419=>1,  55420=>1,  55421=>1, 
 55422=>1,  55423=>1,  55424=>1,  55425=>1,  55426=>1,  55427=>1,  55428=>1, 
 55429=>1,  55430=>1,  55431=>1,  55432=>1,  55433=>1,  55434=>1,  55435=>1, 
 55436=>1,  55437=>1,  55438=>1,  55439=>1,  55440=>1,  55441=>1,  55442=>1, 
 55443=>1,  55444=>1,  55445=>1,  55446=>1,  55447=>1,  55448=>1,  55449=>1, 
 55450=>1,  55451=>1,  55452=>1,  55453=>1,  55454=>1,  55455=>1,  55456=>1, 
 55457=>1,  55458=>1,  55459=>1,  55460=>1,  55461=>1,  55462=>1,  55463=>1, 
 55464=>1,  55465=>1,  55466=>1,  55467=>1,  55468=>1,  55469=>1,  55470=>1, 
 55471=>1,  55472=>1,  55473=>1,  55474=>1,  55475=>1,  55476=>1,  55477=>1, 
 55478=>1,  55479=>1,  55480=>1,  55481=>1,  55482=>1,  55483=>1,  55484=>1, 
 55485=>1,  55486=>1,  55487=>1,  55488=>1,  55489=>1,  55490=>1,  55491=>1, 
 55492=>1,  55493=>1,  55494=>1,  55495=>1,  55496=>1,  55497=>1,  55498=>1, 
 55499=>1,  55500=>1,  55501=>1,  55502=>1,  55503=>1,  55504=>1,  55505=>1, 
 55506=>1,  55507=>1,  55508=>1,  55509=>1,  55510=>1,  55511=>1,  55512=>1, 
 55513=>1,  55514=>1,  55515=>1,  55516=>1,  55517=>1,  55518=>1,  55519=>1, 
 55520=>1,  55521=>1,  55522=>1,  55523=>1,  55524=>1,  55525=>1,  55526=>1, 
 55527=>1,  55528=>1,  55529=>1,  55530=>1,  55531=>1,  55532=>1,  55533=>1, 
 55534=>1,  55535=>1,  55536=>1,  55537=>1,  55538=>1,  55539=>1,  55540=>1, 
 55541=>1,  55542=>1,  55543=>1,  55544=>1,  55545=>1,  55546=>1,  55547=>1, 
 55548=>1,  55549=>1,  55550=>1,  55551=>1,  55552=>1,  55553=>1,  55554=>1, 
 55555=>1,  55556=>1,  55557=>1,  55558=>1,  55559=>1,  55560=>1,  55561=>1, 
 55562=>1,  55563=>1,  55564=>1,  55565=>1,  55566=>1,  55567=>1,  55568=>1, 
 55569=>1,  55570=>1,  55571=>1,  55572=>1,  55573=>1,  55574=>1,  55575=>1, 
 55576=>1,  55577=>1,  55578=>1,  55579=>1,  55580=>1,  55581=>1,  55582=>1, 
 55583=>1,  55584=>1,  55585=>1,  55586=>1,  55587=>1,  55588=>1,  55589=>1, 
 55590=>1,  55591=>1,  55592=>1,  55593=>1,  55594=>1,  55595=>1,  55596=>1, 
 55597=>1,  55598=>1,  55599=>1,  55600=>1,  55601=>1,  55602=>1,  55603=>1, 

lib/Alvis/Document/Encoding.pm  view on Meta::CPAN

 57263=>1,  57264=>1,  57265=>1,  57266=>1,  57267=>1,  57268=>1,  57269=>1, 
 57270=>1,  57271=>1,  57272=>1,  57273=>1,  57274=>1,  57275=>1,  57276=>1, 
 57277=>1,  57278=>1,  57279=>1,  57280=>1,  57281=>1,  57282=>1,  57283=>1, 
 57284=>1,  57285=>1,  57286=>1,  57287=>1,  57288=>1,  57289=>1,  57290=>1, 
 57291=>1,  57292=>1,  57293=>1,  57294=>1,  57295=>1,  57296=>1,  57297=>1, 
 57298=>1,  57299=>1,  57300=>1,  57301=>1,  57302=>1,  57303=>1,  57304=>1, 
 57305=>1,  57306=>1,  57307=>1,  57308=>1,  57309=>1,  57310=>1,  57311=>1, 
 57312=>1,  57313=>1,  57314=>1,  57315=>1,  57316=>1,  57317=>1,  57318=>1, 
 57319=>1,  57320=>1,  57321=>1,  57322=>1,  57323=>1,  57324=>1,  57325=>1, 
 57326=>1,  57327=>1,  57328=>1,  57329=>1,  57330=>1,  57331=>1,  57332=>1, 
 57333=>1,  57334=>1,  57335=>1,  57336=>1,  57337=>1,  57338=>1,  57339=>1, 
 57340=>1,  57341=>1,  57342=>1,  57343=>1,  64976=>1,  64977=>1,  64978=>1, 
 64979=>1,  64980=>1,  64981=>1,  64982=>1,  64983=>1,  64984=>1,  64985=>1, 
 64986=>1,  64987=>1,  64988=>1,  64989=>1,  64990=>1,  64991=>1,  64992=>1, 
 64993=>1,  64994=>1,  64995=>1,  64996=>1,  64997=>1,  64998=>1,  64999=>1, 
 65000=>1,  65001=>1,  65002=>1,  65003=>1,  65004=>1,  65005=>1,  65006=>1, 
 65007=>1,  65534=>1,  65535=>1,  131070=>1,  131071=>1,  196606=>1,  196607=>1, 
 262142=>1,  262143=>1,  327678=>1,  327679=>1,  393214=>1,  393215=>1,  458750=>1, 
 458751=>1,  524286=>1,  524287=>1,  589822=>1,  589823=>1,  655358=>1,  655359=>1, 
 720894=>1,  720895=>1,  786430=>1,  786431=>1,  851966=>1,  851967=>1,  917502=>1, 
 917503=>1,  983038=>1,  983039=>1,  1048574=>1,  1048575=>1,  1114110=>1,  1114111=>1, 
);


#############################################################################
#
#     Error message stuff
#
#############################################################################

# Module-level error string holder. Nothing in the visible code reads or
# writes it; instances keep their messages in $self->{errstr}.
my $ErrStr;

# Error codes: consecutive integers starting at 0, in exactly this order.
my ($ERR_OK,$ERR_ILLEGAL_CODE,$ERR_DOC,$ERR_DOC_TYPE,$ERR_DOC_SUB_TYPE,
    $ERR_BOM,$ERR_FIRST_CHARS,$ERR_META,$ERR_XML,$ERR_GUESS,
    $ERR_WRONG_GUESS,$ERR_ILLEGAL_CHAR,$ERR_DOC_TYPE_WIZARD,
    $ERR_TYPE_GUESS,$ERR_ENCODE_GUESS,$ERR_GUESS_AND_CONVERT,
    $ERR_UNABLE_TO_GUESS)=(0..16);

# Standard message text for every error code ($ERR_OK has none).
my %ErrMsgs;
$ErrMsgs{$ERR_OK}="";
$ErrMsgs{$ERR_ILLEGAL_CODE}="Illegal UTF-8 code.";
$ErrMsgs{$ERR_DOC}="No document text.";
$ErrMsgs{$ERR_DOC_TYPE}="No document type.";
$ErrMsgs{$ERR_DOC_SUB_TYPE}="No document subtype.";
$ErrMsgs{$ERR_BOM}="Byte order mark recognition failed miserably.";
$ErrMsgs{$ERR_FIRST_CHARS}=
    "Guessing from the first characters failed miserably.";
$ErrMsgs{$ERR_META}=
    "Guessing from the meta information failed miserably.";
$ErrMsgs{$ERR_XML}="Guessing from XML format failed miserably.";
$ErrMsgs{$ERR_GUESS}="Unable to guess at the encoding.";
$ErrMsgs{$ERR_WRONG_GUESS}="This pair does not convert:";
$ErrMsgs{$ERR_ILLEGAL_CHAR}=
    "Illegal character in supposedly UTF-8 result.";
$ErrMsgs{$ERR_DOC_TYPE_WIZARD}="Instantiating Alvis::Document::Type";
$ErrMsgs{$ERR_TYPE_GUESS}="Guessing the document type failed.";
$ErrMsgs{$ERR_ENCODE_GUESS}="Encode::Guess failed.";
$ErrMsgs{$ERR_GUESS_AND_CONVERT}=
    "Guessing an encoding and then converting failed.";
$ErrMsgs{$ERR_UNABLE_TO_GUESS}=
    "Unable to guess at encoding name corrections.";

#
# Records an error on the object.
#
#   $errcode  one of the $ERR_* codes. $ERR_OK empties the accumulated
#             error text; any other code appends its standard message.
#   $errmsg   optional detail text appended after the standard message.
#
# Dies with a stack trace if $errcode is undefined or not a known code.
#
sub _set_err_state
{
    my $self=shift;
    my $errcode=shift;
    my $errmsg=shift;

    # Carp is not loaded by the use-lines at the top of this file, so a
    # bare confess() would itself die with "Undefined subroutine".
    # Load it here and call it fully qualified.
    require Carp;

    if (!defined($errcode))
    {
        Carp::confess("set_err_state() called with an undefined argument.");
    }

    if (exists($ErrMsgs{$errcode}))
    {
        if ($errcode==$ERR_OK)
        {
            # clean slate
            $self->{errstr}="";
        }
        else
        {
            # messages pile up, separated by a single space
            $self->{errstr}.=" " . $ErrMsgs{$errcode};
            if (defined($errmsg))
            {
                $self->{errstr}.=" " . $errmsg;
            }
        }
    }
    else
    {
        Carp::confess("Internal error: set_err_state() called with an " .
                      "unrecognized argument ($errcode).");
    }
}

#
# Returns the error text accumulated on this object, i.e. whatever is
# currently stored in $self->{errstr}.
#
sub errmsg
{
    my ($self)=@_;

    my $accumulated=$self->{errstr};

    return $accumulated;
}

#############################################################################
#
#      Methods
#
##############################################################################
 
sub new
{
    my $proto=shift;

    my $class=ref($proto)||$proto;
    my $parent=ref($proto)&&$proto;
    my $self={};
    bless($self,$class);

    $self->_init(@_);

lib/Alvis/Document/Encoding.pm  view on Meta::CPAN

#
# Tells whether a (decimal) character code is legal UTF-8.
#
# Returns 0 for a negative code, a code above 1114111 or a code listed
# in %InvalidUtf8Code; returns 1 for everything else.
#
sub code_is_utf8
{
    my ($self,$dec_code)=@_;

    # out of range on either side
    return 0 if ($dec_code<0);
    return 0 if ($dec_code>1114111);

    # explicitly listed as invalid
    return 0 if ($InvalidUtf8Code{$dec_code});

    return 1;
}

#
# Checks every character of $text with code_is_utf8().
#
# Returns 1 if all character codes are legal. Otherwise returns 0 and
# records an error message naming the position (counted from 1) and the
# code of the first illegal character. If $err is a hash ref, the same
# two values are also stored in $err->{pos} and $err->{code}.
#
sub is_utf8
{
    my ($self,$text,$err)=@_;

    my $position=0;
    foreach my $char (split(//,$text))
    {
        $position++;

        my $code=ord($char);
        next if ($self->code_is_utf8($code));

        # First offender found: report it and stop looking
        my $where=sprintf("Position: #%d, character code: %#x",
                          $position,$code);
        $self->_set_err_state($ERR_ILLEGAL_CODE,$where);
        if (defined($err) && ref($err) eq 'HASH')
        {
            $err->{pos}=$position;
            $err->{code}=$code;
        }
        return 0;
    }

    return 1;
}

#
# Guesses the encoding(s) of a document.
#
#   text:     The document text.
#   type:     The type of the document as one of the recognized types
#             defined in Alvis::Document::Type (superset of MIME).
#   sub_type: The subtype of the document.
#
# If type or sub_type is undefined, both are guessed from the text with
# the object's docTypeWizard.
#
# Returns the list of guessed encoding names. Only text/html,
# text/xhtml and text/plain have a guesser; for any other type the list
# is empty. When a guesser finds nothing, the error state is set and the
# list holds just $self->{defaultEncoding}, if that is defined.
# On a missing text or a failed type guess the error state is set and
# the return is an empty list (undef in scalar context).
#
# NOTE: the result is a list; on the non-error paths a scalar-context
# call yields the number of guesses, not an encoding name.
#
sub guess
{
    my $self=shift;
    my $text=shift;
    my $type=shift;
    my $sub_type=shift;

    $self->_set_err_state($ERR_OK);  # clean the slate

    if (!defined($text))
    {
        $self->_set_err_state($ERR_DOC);
        # Bare return: "return undef" would hand list-context callers
        # the one-element list (undef), which looks like one guess.
        return;
    }
    if (!(defined($type) && defined($sub_type)))
    {
        ($type,$sub_type)=$self->{docTypeWizard}->guess($text);
        if (!(defined($type) && defined($sub_type)))
        {
            $self->_set_err_state($ERR_TYPE_GUESS,
                                  $self->{docTypeWizard}->errmsg());
            return;
        }
    }

    # Which private method handles which text/* subtype
    my %guesser_for=(html=>'_HTML',
                     xhtml=>'_XHTML',
                     plain=>'_plain_text');

    my @guesses;
    if ($type eq 'text' && exists($guesser_for{$sub_type}))
    {
        my $guesser=$guesser_for{$sub_type};
        @guesses=$self->$guesser($text);
        if (scalar(@guesses)==0)
        {
            # Nothing found: flag it, then fall back on the default
            $self->_set_err_state($ERR_GUESS);
            if (defined($self->{defaultEncoding}))
            {
                @guesses=($self->{defaultEncoding});
            }
        }
    }

    return @guesses;
}

#
# Converts $text from $source_enc to $target_enc with Encode::from_to().
#
# Returns the converted text. If Encode::from_to() throws, the error
# state is set (naming both encodings and the reason) and undef is
# returned. When the target name is utf8/utf-8, the UTF-8 flag of the
# result is switched on.
#
sub from_to
{
    my ($self,$text,$source_enc,$target_enc)=@_;

    # Encode::from_to() works on $text in place; trap anything it throws
    eval
    {
        Encode::from_to($text,$source_enc,$target_enc,0);
    };
    if ($@)
    {
        # Drop everything from the first " at " on (file/line trailer)
        my $reason=$@;
        $reason=~s/ at .*$//isgo;
        my $detail="source encoding: $source_enc, " .
            "target encoding: $target_enc. Why? $reason.";
        $self->_set_err_state($ERR_WRONG_GUESS,$detail);
        return undef;
    }

    # leaves the bl***y UTF-8 flag on
    Encode::_utf8_on($text) if ($target_enc=~/^\s*utf-?8\s*$/isgo);

    return $text;
}

# 
# Converts $text from $source_enc to $target_enc.
#
# If the source is named utf8/utf-8, the text is first checked with
# is_utf8(); an illegal character aborts the conversion.
# The conversion is tried with the names as given. If that fails, every
# combination of the guess_typo_fixes() candidates for the two names is
# tried, and the first successful result wins.
#
# Returns the converted text, or undef with the error state set.
# Should always leave the UTF-8 flag on, if target is UTF-8
#
sub convert
{
    my $self=shift;
    my $text=shift;
    my $source_enc=shift;
    my $target_enc=shift;

    my %err;
    if ($source_enc=~/^\s*utf-?8\s*$/is)
    {
        if (!$self->is_utf8($text,\%err))
        {
            $self->_set_err_state($ERR_ILLEGAL_CHAR,
                                  " Position: $err{pos}," .
                                  "Code:$err{code}");
            return undef;
        }
    }

    my $converted=$self->from_to($text,$source_enc,$target_enc);
    if (defined($converted))
    {
        return $converted;
    }

    # The names as given did not work: maybe they are misspelled.
    # Both candidate lists are computed once, up front.
    my @src_enc_guesses=$self->guess_typo_fixes($source_enc);
    my @trg_enc_guesses=$self->guess_typo_fixes($target_enc);

    for my $src_enc_guess (@src_enc_guesses)
    {
        for my $trg_enc_guess (@trg_enc_guesses)
        {
            my $retry=$self->from_to($text,$src_enc_guess,$trg_enc_guess);
            if (defined($retry))
            {
                return $retry;
            }
        }
    }

    $self->_set_err_state($ERR_UNABLE_TO_GUESS);
    return undef;
}

#
# Guesses the encoding of $text and converts it to $target_enc.
#
#   $text        the document text
#   $type        document type; guessed with the docTypeWizard if
#   $sub_type    either of type/subtype is undefined
#   $target_enc  name of the encoding to convert to
#
# Each guessed encoding is tried in turn with convert(). If the target
# is exactly "utf8" and a guess looks like utf8/utf-8, the text is
# returned as it is. If no guess converts, the text is returned with
# its UTF-8 flag on, provided its bytes are well-formed UTF-8.
#
# Returns the resulting text, or undef with the error state set.
#
sub guess_and_convert
{
    my $self=shift;
    my $text=shift;
    my $type=shift;
    my $sub_type=shift;
    my $target_enc=shift;

    $self->_set_err_state($ERR_OK);

    if (!defined($text))
    {
        $self->_set_err_state($ERR_DOC);
        return undef;
    }
    if (!(defined($type) && defined($sub_type)))
    {
        ($type,$sub_type)=$self->{docTypeWizard}->guess($text);
        if (!(defined($type) && defined($sub_type)))
        {
            $self->_set_err_state($ERR_TYPE_GUESS,
                                  $self->{docTypeWizard}->errmsg());
            return undef;
        }
    }

    my @enc_guesses=$self->guess($text,$type,$sub_type);
    if (scalar(@enc_guesses)==0)
    {
        $self->_set_err_state($ERR_GUESS);
        return undef;
    }

    for my $enc_guess (@enc_guesses)
    {
        # an undefined guess cannot name an encoding
        next if (!defined($enc_guess));

        if ($target_enc eq "utf8" && ($enc_guess=~/utf-?8/i))
        {
            return $text;
        }

        my $result=$self->convert($text,$enc_guess,$target_enc);
        if (defined($result))
        {
            return $result;
        }
    }

    # No guess converted. Test if it is UTF-8 already.
    # CHECK=1 makes Encode::is_utf8() verify well-formedness as well;
    # without it the call only reports the flag that was just switched
    # on, so the test could never fail.
    # NOTE(review): this fallback runs for any $target_enc, not only
    # for a UTF-8 target -- confirm that this is intended.
    Encode::_utf8_on($text);
    if (Encode::is_utf8($text,1))
    {
        return $text;
    }
    Encode::_utf8_off($text);

    $self->_set_err_state($ERR_GUESS_AND_CONVERT);
    return undef;
}

#
# Suggests corrections for a possibly misspelled encoding name.
#
#   $typo   the encoding name as given
#
# Returns a list that starts with $typo itself, followed by the names
# it may have been meant to be (e.g. 'uft-8' gives 'uft-8','utf8').
# No name is listed twice. An undefined $typo gives an empty list.
#
sub guess_typo_fixes
{
    my $self=shift;
    my $typo=shift;

    if (!defined($typo))
    {
        return ();
    }

    # One rule per known misspelling: a pattern for the whole name and
    # a generator for the replacement name(s). The generator receives
    # whatever the pattern captured.
    # The patterns are applied without /g: a scalar-context /g match
    # leaves pos() set on $typo and makes the next anchored test fail.
    my @rules=(
        [qr/^\s*(?:utf|uft)-?8\s*$/is,
         sub { ('utf8') }],
        [qr/^\s*(?:utf|uft)-?16\s*$/is,
         sub { ('UTF-16') }],
        [qr/^\s*iso-?8559-?1\s*$/is,
         sub { ('iso-8859-1') }],
        [qr/^\s*ecu-?(kr|jp|cn|tw|jisx0213)\s*$/is,
         sub { ("euc-$_[0]") }],
        [qr/^\s*(?:uft|utf)-?32\s*$/is,
         sub { ('UTF-32') }],
        [qr/^\s*(?:acsii|asici)\s*$/is,
         sub { ('ascii') }],
        [qr/^\s*(?:acsii|asici)-?ctrl\s*$/is,
         sub { ('ascii-ctrl') }],
        [qr/^\s*(?:utf|uft)-?7\s*$/is,
         sub { ('UTF-7') }],
        [qr/^\s*macintosh\s*$/is,
         sub { ((map { "iso-8859-$_" } (1..11)),'viscii') }],
        # transposed digits; the part number may have two digits
        [qr/^\s*iso-8559-(\d{1,2})\s*$/is,
         sub { ("iso-8859-$_[0]") }],
        [qr/^\s*iso-8895-(\d{1,2})\s*$/is,
         sub { ("iso-8859-$_[0]") }],
        [qr/^\s*(?:utf|uft)-?16be\s*$/is,
         sub { ('UTF-16BE') }],
        [qr/^\s*(?:utf|uft)-?16le\s*$/is,
         sub { ('UTF-16LE') }],
        );

    my @possibilities=($typo);
    for my $rule (@rules)
    {
        my ($pattern,$fixes_for)=@$rule;

        # List context: the captures, or (1) for a pattern without any
        my @captured=($typo=~$pattern);
        if (scalar(@captured))
        {
            push(@possibilities,$fixes_for->(@captured));
        }
    }

    # Keep the first occurrence of every name, in order
    my %seen;
    my @unique=grep { !$seen{$_}++ } @possibilities;

    return @unique;
}

#########################################################################
#
# Private methods
#
##########################################################################
#
# Collects encoding guesses for an HTML document.
#
# A guess from the byte order mark, if there is one, is returned right
# away. Otherwise the candidates guessed from the first characters (or
# HTML::Encoding's default encodings, if there are none) are each used
# to look for a meta element; what the meta elements give is put in
# front of the first-character guesses.
# Returns the list of guesses; empty, with the error state set, for an
# undefined or empty text. Exceptions thrown by HTML::Encoding are
# recorded in the error state and do not stop the guessing.
#
# (Original note: HTML::Encoding has a nasty bug.)
#
sub _HTML
{
    my ($self,$text)=@_;

    unless (defined($text) && length($text)>=1)
    {
        $self->_set_err_state($ERR_DOC);
        return ();
    }

    my @guesses;

    # Step 1: byte order mark
    eval
    {
        @guesses=HTML::Encoding::encoding_from_byte_order_mark($text,xhtml=>0);
    };
    $self->_set_err_state($ERR_BOM,"$@") if ($@);
    return @guesses if (scalar(@guesses));

    # Step 2: first characters.
    # Sanity check to exclude e.g. UTF-32
    eval
    {
        @guesses=HTML::Encoding::encoding_from_first_chars($text);
    };
    $self->_set_err_state($ERR_FIRST_CHARS,"$@") if ($@);

    # Step 3: meta element, read under each candidate encoding
    my @candidates=scalar(@guesses) ?
        @guesses : @{$HTML::Encoding::DEFAULT_ENCODINGS};
    for my $candidate (@candidates)
    {
        # HTML::Encoding is a bit imperfect
        next if ($candidate=~/^\s*UTF-(?:16|32)((?:B|L)E)?\s*$/isgo);

        my @from_meta;
        eval
        {
            @from_meta=
                HTML::Encoding::encoding_from_meta_element($text,$candidate);
        };
        $self->_set_err_state($ERR_META,"$@") if ($@);

        # meta results go in front of what has been gathered so far
        unshift(@guesses,@from_meta);
    }

    return @guesses;
}

#
# Collects encoding guesses for an XHTML document.
#
# Asks HTML::Encoding::encoding_from_xml_document() first; if that
# gives nothing, the document is treated like HTML by _HTML().
# Returns the list of guesses; an empty list, with the error state set,
# for an undefined or empty text or if the XML guesser throws.
#
sub _XHTML
{
    my ($self,$text)=@_;

    unless (defined($text) && length($text)>=1)
    {
        $self->_set_err_state($ERR_DOC);
        return ();
    }

    my @from_xml;
    eval
    {
        @from_xml=HTML::Encoding::encoding_from_xml_document($text);
    };
    if ($@)
    {
        $self->_set_err_state($ERR_XML,"$@");
        return ();
    }

    return @from_xml if (scalar(@from_xml));

    # The XML route found nothing: fall back on the HTML heuristics
    return $self->_HTML($text);
}

#
# Guesses the encoding of a plain text with Encode::Guess.
#
# Returns a one-element list holding the name of the guessed encoding.
# Returns an empty list, with the error state set, for an undefined or
# empty text or when no single encoding could be determined.
#
sub _plain_text
{
    my $self=shift;
    my $text=shift;

    if (!defined($text) || length($text)<1)
    {
        $self->_set_err_state($ERR_DOC);
        return ();
    }

    # guess_encoding() returns an encoding object on success and a
    # plain string describing the problem on failure.
    my $enc=guess_encoding($text);
    if (ref($enc))
    {
        return ($enc->name());
    }

    # Report the message guess_encoding() gave. $@ is not set by it and
    # may hold something stale from an earlier eval.
    $self->_set_err_state($ERR_ENCODE_GUESS,
                          defined($enc) ? "$enc" : undef);
    return ();
}

1;
__END__

=head1 NAME

Alvis::Document::Encoding - Perl extension for guessing and checking the
encoding of documents.

=head1 SYNOPSIS

 use Alvis::Document::Encoding;

 # Create a new instance
 my $e=Alvis::Document::Encoding->new();
 if (!defined($e))
 {
    die "Instantiating Alvis::Document::Encoding failed.";
 }

 # Check that a (decimal) character code is legal UTF-8
 my $code=55;
 if (!$e->code_is_utf8($code))
 {
    # code_is_utf8() only gives a verdict; it does not set an error message
    die "Illegal UTF-8 character code: $code";
 }

 # Check that a text is legal UTF-8
 my $text;
 if (!$e->is_utf8($text))
 {
    # The message will contain the position and the offending character's code 
    die $e->errmsg();
 }

 # If you need to obtain the position (1..) and the offending character,
 # pass a placeholder in a hash ref argument:
 my %err=();
 if (!$e->is_utf8($text,\%err))
 {
    my $position=$err{pos};
    my $code=$err{code};
    . . . 
 }

 # 
 # Guess the encoding of a document given a guess for its type.
 # guess() returns a list of candidate encoding names, so call it
 # in list context.
 #
 my $type_guesser=Alvis::Document::Type->new();
 my ($doc_type,$doc_sub_type)=$type_guesser->guess($text);
 my @doc_encodings=$e->guess($text,$doc_type,$doc_sub_type);
 if (!@doc_encodings || !defined($doc_encodings[0]))
 {
     die('Cannot guess. ' . $e->errmsg());
 }

 # 
 # Try converting a document to UTF-8 with only its type known
 #
 my $type_guesser=Alvis::Document::Type->new();
 my ($doc_type,$doc_sub_type)=$type_guesser->guess($text);
 my $doc_in_utf8=$e->guess_and_convert($text,$doc_type,$doc_sub_type,'utf8');
 if (!defined($doc_in_utf8))
 {
     die('Cannot guess. ' . $e->errmsg());
 }
 
 # Try to guess what was meant 
 my @possibilities=$e->guess_typo_fixes('uft-8');

=head1 DESCRIPTION

A collection of methods for guessing, confirming and fixing the encoding
of a document.

=head1 METHODS

=head2 new()

Options:

    defaultDocType       default type for a document. Default: text.
    defaultDocSubType    default sub type for a document. Default: html.
    defaultEncoding      default encoding for a document. Default: iso-8859-1.

=head2 code_is_utf8(decimal_code)

Returns 1 if the (decimal) character code is legal UTF-8.

=head2 is_utf8(text,err_hash_ref)

Returns 1 if all of the characters of $text are legal UTF-8.
Otherwise, returns 0 and sets an error message specifying the location
(1..) of the first illegal character code.
If you wish to obtain the position and offending code, pass a 
hash ref ($err_hash_ref). The info is in $err_hash_ref->{pos} and
$err_hash_ref->{code}.

=head2 guess(text,doc_type,doc_sub_type)

Guess the encoding of a document given a guess for its type (and subtype).

=head2 guess_and_convert(text,doc_type,doc_sub_type,target_encoding)

Tries to first guess the encoding of the document given a guess at its
type and subtype, and then tries to convert it to $target_encoding.

=head2 convert(text,source_encoding,target_encoding)

Tries to convert $text from $source_encoding to $target_encoding.

=head2 guess_typo_fixes($typo)

Returns a set of guesses for the meant encoding in a case of an encoding
name containing typos.

=head2 errmsg()

Returns a stack of error messages, if any. Empty string otherwise.

=head1 SEE ALSO

Alvis::Document::Type

=head1 AUTHOR

Kimmo Valtonen, E<lt>kimmo.valtonen@hiit.fiE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2006 by Kimmo Valtonen

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.4 or,
at your option, any later version of Perl 5 you may have available.


=cut



( run in 1.108 second using v1.01-cache-2.11-cpan-39bf76dae61 )