Alvis-Convert

 view release on metacpan or  search on metacpan

lib/Alvis/Document/Encoding.pm  view on Meta::CPAN

 57095=>1,  57096=>1,  57097=>1,  57098=>1,  57099=>1,  57100=>1,  57101=>1, 
 57102=>1,  57103=>1,  57104=>1,  57105=>1,  57106=>1,  57107=>1,  57108=>1, 
 57109=>1,  57110=>1,  57111=>1,  57112=>1,  57113=>1,  57114=>1,  57115=>1, 
 57116=>1,  57117=>1,  57118=>1,  57119=>1,  57120=>1,  57121=>1,  57122=>1, 
 57123=>1,  57124=>1,  57125=>1,  57126=>1,  57127=>1,  57128=>1,  57129=>1, 
 57130=>1,  57131=>1,  57132=>1,  57133=>1,  57134=>1,  57135=>1,  57136=>1, 
 57137=>1,  57138=>1,  57139=>1,  57140=>1,  57141=>1,  57142=>1,  57143=>1, 
 57144=>1,  57145=>1,  57146=>1,  57147=>1,  57148=>1,  57149=>1,  57150=>1, 
 57151=>1,  57152=>1,  57153=>1,  57154=>1,  57155=>1,  57156=>1,  57157=>1, 
 57158=>1,  57159=>1,  57160=>1,  57161=>1,  57162=>1,  57163=>1,  57164=>1, 
 57165=>1,  57166=>1,  57167=>1,  57168=>1,  57169=>1,  57170=>1,  57171=>1, 
 57172=>1,  57173=>1,  57174=>1,  57175=>1,  57176=>1,  57177=>1,  57178=>1, 
 57179=>1,  57180=>1,  57181=>1,  57182=>1,  57183=>1,  57184=>1,  57185=>1, 
 57186=>1,  57187=>1,  57188=>1,  57189=>1,  57190=>1,  57191=>1,  57192=>1, 
 57193=>1,  57194=>1,  57195=>1,  57196=>1,  57197=>1,  57198=>1,  57199=>1, 
 57200=>1,  57201=>1,  57202=>1,  57203=>1,  57204=>1,  57205=>1,  57206=>1, 
 57207=>1,  57208=>1,  57209=>1,  57210=>1,  57211=>1,  57212=>1,  57213=>1, 
 57214=>1,  57215=>1,  57216=>1,  57217=>1,  57218=>1,  57219=>1,  57220=>1, 
 57221=>1,  57222=>1,  57223=>1,  57224=>1,  57225=>1,  57226=>1,  57227=>1, 
 57228=>1,  57229=>1,  57230=>1,  57231=>1,  57232=>1,  57233=>1,  57234=>1, 
 57235=>1,  57236=>1,  57237=>1,  57238=>1,  57239=>1,  57240=>1,  57241=>1, 
 57242=>1,  57243=>1,  57244=>1,  57245=>1,  57246=>1,  57247=>1,  57248=>1, 
 57249=>1,  57250=>1,  57251=>1,  57252=>1,  57253=>1,  57254=>1,  57255=>1, 
 57256=>1,  57257=>1,  57258=>1,  57259=>1,  57260=>1,  57261=>1,  57262=>1, 
 57263=>1,  57264=>1,  57265=>1,  57266=>1,  57267=>1,  57268=>1,  57269=>1, 
 57270=>1,  57271=>1,  57272=>1,  57273=>1,  57274=>1,  57275=>1,  57276=>1, 
 57277=>1,  57278=>1,  57279=>1,  57280=>1,  57281=>1,  57282=>1,  57283=>1, 
 57284=>1,  57285=>1,  57286=>1,  57287=>1,  57288=>1,  57289=>1,  57290=>1, 
 57291=>1,  57292=>1,  57293=>1,  57294=>1,  57295=>1,  57296=>1,  57297=>1, 
 57298=>1,  57299=>1,  57300=>1,  57301=>1,  57302=>1,  57303=>1,  57304=>1, 
 57305=>1,  57306=>1,  57307=>1,  57308=>1,  57309=>1,  57310=>1,  57311=>1, 
 57312=>1,  57313=>1,  57314=>1,  57315=>1,  57316=>1,  57317=>1,  57318=>1, 
 57319=>1,  57320=>1,  57321=>1,  57322=>1,  57323=>1,  57324=>1,  57325=>1, 
 57326=>1,  57327=>1,  57328=>1,  57329=>1,  57330=>1,  57331=>1,  57332=>1, 
 57333=>1,  57334=>1,  57335=>1,  57336=>1,  57337=>1,  57338=>1,  57339=>1, 
 57340=>1,  57341=>1,  57342=>1,  57343=>1,  64976=>1,  64977=>1,  64978=>1, 
 64979=>1,  64980=>1,  64981=>1,  64982=>1,  64983=>1,  64984=>1,  64985=>1, 
 64986=>1,  64987=>1,  64988=>1,  64989=>1,  64990=>1,  64991=>1,  64992=>1, 
 64993=>1,  64994=>1,  64995=>1,  64996=>1,  64997=>1,  64998=>1,  64999=>1, 
 65000=>1,  65001=>1,  65002=>1,  65003=>1,  65004=>1,  65005=>1,  65006=>1, 
 65007=>1,  65534=>1,  65535=>1,  131070=>1,  131071=>1,  196606=>1,  196607=>1, 
 262142=>1,  262143=>1,  327678=>1,  327679=>1,  393214=>1,  393215=>1,  458750=>1, 
 458751=>1,  524286=>1,  524287=>1,  589822=>1,  589823=>1,  655358=>1,  655359=>1, 
 720894=>1,  720895=>1,  786430=>1,  786431=>1,  851966=>1,  851967=>1,  917502=>1, 
 917503=>1,  983038=>1,  983039=>1,  1048574=>1,  1048575=>1,  1114110=>1,  1114111=>1, 
);


#############################################################################
#
#     Error message stuff
#
#############################################################################

# File-scoped scalar; it is not referenced anywhere in the code visible
# here, so its exact role should be confirmed against the rest of the module.
my $ErrStr;

# Numeric error codes.  They are consecutive integers starting at 0 and are
# used as the keys of %ErrMsgs below.
my $ERR_OK                = 0;
my $ERR_ILLEGAL_CODE      = 1;
my $ERR_DOC               = 2;
my $ERR_DOC_TYPE          = 3;
my $ERR_DOC_SUB_TYPE      = 4;
my $ERR_BOM               = 5;
my $ERR_FIRST_CHARS       = 6;
my $ERR_META              = 7;
my $ERR_XML               = 8;
my $ERR_GUESS             = 9;
my $ERR_WRONG_GUESS       = 10;
my $ERR_ILLEGAL_CHAR      = 11;
my $ERR_DOC_TYPE_WIZARD   = 12;
my $ERR_TYPE_GUESS        = 13;
my $ERR_ENCODE_GUESS      = 14;
my $ERR_GUESS_AND_CONVERT = 15;
my $ERR_UNABLE_TO_GUESS   = 16;

# Error code => human-readable message.  $ERR_OK maps to the empty string.
my %ErrMsgs=(
    $ERR_OK
        => "",
    $ERR_ILLEGAL_CODE
        => "Illegal UTF-8 code.",
    $ERR_DOC
        => "No document text.",
    $ERR_DOC_TYPE
        => "No document type.",
    $ERR_DOC_SUB_TYPE
        => "No document subtype.",
    $ERR_BOM
        => "Byte order mark recognition failed miserably.",
    $ERR_FIRST_CHARS
        => "Guessing from the first characters " . "failed miserably.",
    $ERR_META
        => "Guessing from the meta information " . "failed miserably.",
    $ERR_XML
        => "Guessing from XML format failed miserably.",
    $ERR_GUESS
        => "Unable to guess at the encoding.",
    $ERR_WRONG_GUESS
        => "This pair does not convert:",
    $ERR_ILLEGAL_CHAR
        => "Illegal character in supposedly UTF-8 " . "result.",
    $ERR_DOC_TYPE_WIZARD
        => "Instantiating Alvis::Document::Type",
    $ERR_TYPE_GUESS
        => "Guessing the document type failed.",
    $ERR_ENCODE_GUESS
        => "Encode::Guess failed.",
    $ERR_GUESS_AND_CONVERT
        => "Guessing an encoding and then " . "converting failed.",
    $ERR_UNABLE_TO_GUESS
        => "Unable to guess at encoding name " . "corrections.",
);

# Records an error condition on the object.
#
# Arguments:
#   $errcode - one of the $ERR_* codes; must be defined and a key of %ErrMsgs.
#   $errmsg  - optional extra detail appended after the canned message.
#
# Effects:
#   $ERR_OK resets $self->{errstr} to the empty string.  Any other known
#   code appends " <canned message>" and, when $errmsg is defined,
#   " <errmsg>" to $self->{errstr}, so successive errors accumulate.
#
# Dies (via confess) when $errcode is undefined or not a key of %ErrMsgs.
sub _set_err_state
{
    my ($self,$errcode,$errmsg)=@_;

    confess("set_err_state() called with an undefined argument.")
        unless (defined($errcode));

    unless (exists($ErrMsgs{$errcode}))
    {
        confess("Internal error: set_err_state() called with an " .
                "unrecognized argument ($errcode).");
    }

    # The "no error" code wipes whatever has accumulated so far.
    return $self->{errstr}="" if ($errcode==$ERR_OK);

    # Build the new fragment first, then append it in one step.
    my $addition=" " . $ErrMsgs{$errcode};
    $addition.=" " . $errmsg if (defined($errmsg));

    $self->{errstr}.=$addition;
}

# Accessor: returns the object's current error string ($self->{errstr}),
# i.e. whatever _set_err_state() has accumulated since it was last reset.
sub errmsg
{
    my ($self)=@_;

    return $self->{errstr};
}

#############################################################################
#

lib/Alvis/Document/Encoding.pm  view on Meta::CPAN

    }
    if ($typo=~/^\s*(?:utf|uft)-?7\s*$/isgo)
    {
	push(@possibilities,'UTF-7');
    }
    if ($typo=~/^\s*macintosh\s*$/isgo)
    {
	for (my $i=1; $i<=11; $i++)
	{
	    push(@possibilities,"iso-8859-$i");
	}
	push(@possibilities,'viscii');
    }
    if ($typo=~/^\s*iso-8559-(\d)\s*$/isgo)
    {
	push(@possibilities,"iso-8859-$1");
    }
    if ($typo=~/^\s*iso-8895-(\d)\s*$/isgo)
    {
	push(@possibilities,"iso-8859-$1");
    }
    if ($typo=~/^\s*(?:utf|uft)-?16be\s*$/isgo)
    {
	push(@possibilities,'UTF-16BE');
    }
    if ($typo=~/^\s*(?:utf|uft)-?16le\s*$/isgo)
    {
	push(@possibilities,'UTF-16LE');
    }

    return @possibilities;
}

##########################################################################
#
# Private methods
#
##########################################################################
#
# HTML::Encoding has a nasty bug
#
# Collects encoding guesses for an HTML document using HTML::Encoding.
#
# Arguments:
#   $text - the raw document text.
#
# Returns:
#   A (possibly empty) list of guesses.  If the byte order mark check
#   yields anything, that result is returned as is.  Otherwise the list
#   consists of whatever the meta element lookups produced, placed in front
#   of the guesses obtained from the first characters.
#
# Errors:
#   Undefined or empty $text sets $ERR_DOC and returns an empty list.
#   Exceptions thrown by HTML::Encoding are caught and recorded via
#   _set_err_state() ($ERR_BOM, $ERR_FIRST_CHARS, $ERR_META); processing
#   continues afterwards.
sub _HTML
{
    my ($self,$text)=@_;

    unless (defined($text) && length($text)>=1)
    {
        $self->_set_err_state($ERR_DOC);
        return ();
    }

    # Step 1: byte order mark.  Any result here is returned immediately.
    my @bom_guesses;
    eval
    {
        @bom_guesses=
            HTML::Encoding::encoding_from_byte_order_mark($text,xhtml=>0);
    };
    $self->_set_err_state($ERR_BOM,"$@") if ($@);
    return @bom_guesses if (scalar(@bom_guesses));

    # Step 2: guess from the first characters of the document.
    my @guesses;
    eval
    {
        @guesses=HTML::Encoding::encoding_from_first_chars($text);
    };
    $self->_set_err_state($ERR_FIRST_CHARS,"$@") if ($@);

    # Step 3: look for a meta element, trying each first-character guess in
    # turn, or HTML::Encoding's default encodings when there were none.
    my @candidates=scalar(@guesses)
        ? @guesses
        : @{$HTML::Encoding::DEFAULT_ENCODINGS};

    foreach my $candidate (@candidates)
    {
        # Sanity check to exclude e.g. UTF-32: UTF-16/UTF-32 candidates
        # (with or without a BE/LE suffix) are never tried, because
        # HTML::Encoding is a bit imperfect.
        next if ($candidate=~/^\s*UTF-(?:16|32)((?:B|L)E)?\s*$/isgo);

        my @from_meta;
        eval
        {
            @from_meta=
                HTML::Encoding::encoding_from_meta_element($text,$candidate);
        };
        $self->_set_err_state($ERR_META,"$@") if ($@);

        # Meta element results take precedence over everything found so far.
        unshift(@guesses,@from_meta);
    }

    return @guesses;
}

sub _XHTML
{
    my $self=shift;
    my $text=shift;

    if (!defined($text) || length($text)<1)
    {
	$self->_set_err_state($ERR_DOC);



( run in 1.161 second using v1.01-cache-2.11-cpan-13bb782fe5a )