Alvis-Convert

 view release on metacpan or  search on metacpan

lib/Alvis/Document/Encoding.pm  view on Meta::CPAN


#############################################################################
#
#      Methods
#
##############################################################################
 
sub new
{
    my $proto=shift;

    my $class=ref($proto)||$proto;
    my $parent=ref($proto)&&$proto;
    my $self={};
    bless($self,$class);

    $self->_init(@_);

    $self->_set_err_state($ERR_OK);

    $self->{docTypeWizard}=
	Alvis::Document::Type->new(defaultType=>
				   $self->{defaultDocType},
				   defaultSubType=>
				   $self->{defaultDocSubType});
    if (!defined($self->{docTypeWizard}))
    {
	$self->_set_err_state($ERR_DOC_TYPE_WIZARD);
	return undef;
    }

    Encode::Guess->add_suspects(qw/latin1 iso-8859-2 iso-8859-5 iso-8859-10 iso-8859-7 windows-1252/);

    return $self;
}

sub _init
{
    my $self=shift;

    $self->{defaultDocType}='text';
    $self->{defaultDocSubType}='html';
    $self->{defaultEncoding}='iso-8859-1';

    if (defined(@_))
    {
        my %args=@_;
        @$self{ keys %args }=values(%args);
    }
}

##########################################################################
#
#  Public methods
#
#########################################################################

#
# Returns 1 if the (decimal) character code is legal UTF-8
#
sub code_is_utf8
{
    my $self=shift;
    my $dec_code=shift;

    # check for invalid codes 
    if ($dec_code<0 || $dec_code>1114111 || $InvalidUtf8Code{$dec_code})
    {
	return 0;
    }

    return 1;
}

#
# Returns 1 if all of the characters of the text are legal UTF-8
# Else, returns 0 and sets an error message specifying the location
# (1..) of the first illegal character code
# If you wish to obtain the position and offending code, pass a 
# hash ref
#
sub is_utf8
{
    my $self=shift;
    my $text=shift;
    my $err=shift;

    # Go over the text char by char and check for invalid char codes
    my @chars=split(//,$text);
    my $i=1;
    for my $char (@chars)
    {
	# We test for valid code
	#
	my $code=ord($char);
	if (!$self->code_is_utf8($code))
	{
	    $self->_set_err_state($ERR_ILLEGAL_CODE,
				  sprintf("Position: #%d, character code: %#x",
					  $i,$code));
	    if (defined($err) && ref($err) eq 'HASH')
	    {
		$err->{pos}=$i;
		$err->{code}=$code;
	    }
	    return 0;
	}
	$i++;
    }

    return 1;
}

#
# type: The type of the document as one of the recognized types
#       defined in Alvis::Document::Type (superset of MIME). 
#
sub guess
{
    my $self=shift;
    my $text=shift;
    my $type=shift;
    my $sub_type=shift;

    $self->_set_err_state($ERR_OK);  # clean the slate

    if (!defined($text))
    {
	$self->_set_err_state($ERR_DOC);
	return undef;
    }
    if (!(defined($type) && defined($sub_type)))
    {
	($type,$sub_type)=$self->{docTypeWizard}->guess($text);
	if (!(defined($type) && defined($sub_type)))
	{
	    $self->_set_err_state($ERR_TYPE_GUESS,
				  $self->{docTypeWizard}->errmsg());
	    return undef;
	}
    }

#    warn "($type,$sub_type)";

    my @guesses;
    if ($type eq 'text')
    {
	if ($sub_type eq 'html')
	{
	    @guesses=$self->_HTML($text);
	    if (scalar(@guesses)==0)
	    {
		$self->_set_err_state($ERR_GUESS);
		if (defined($self->{defaultEncoding}))
		{
		    @guesses=($self->{defaultEncoding});
		}
	    }
	}
	elsif ($sub_type eq 'xhtml')
	{
	    @guesses=$self->_XHTML($text);
	    if (scalar(@guesses)==0)
	    {
		$self->_set_err_state($ERR_GUESS);
		if (defined($self->{defaultEncoding}))
		{
		    @guesses=($self->{defaultEncoding});
		}
	    }
	}
	elsif ($sub_type eq 'plain')
	{
	    @guesses=$self->_plain_text($text);
	    if (scalar(@guesses)==0)
	    {
		$self->_set_err_state($ERR_GUESS);
		if (defined($self->{defaultEncoding}))
		{
		    @guesses=($self->{defaultEncoding});
		}
	    }
	}
    }

    return @guesses;
}

sub from_to
{
    my $self=shift;
    my $text=shift;
    my $source_enc=shift;
    my $target_enc=shift;

    {
	eval
	{
	    Encode::from_to($text,
			    $source_enc,$target_enc,0);
			    # $source_enc,$target_enc,Encode::FB_QUIET);
	};
	if ($@)
	{
	    my $err=$@;
	    $err=~s/ at .*$//isgo;
	    $self->_set_err_state($ERR_WRONG_GUESS,
				  "source encoding: $source_enc, " .
				  "target encoding: $target_enc. Why? $err.");
	    return undef;
	}
        if ($target_enc=~/^\s*utf-?8\s*$/isgo)
        {
	    # leaves the bl***y UTF-8 flag on
	    Encode::_utf8_on($text); 
        }
    }
    return $text;
}

# 
# Should always leave the UTF-8 flag on, if target is UTF-8
#
sub convert
{
    my $self=shift;
    my $text=shift;
    my $source_enc=shift;
    my $target_enc=shift;

    my %err;
    if ($source_enc=~/^\s*utf-?8\s*$/isgo)
    {
	if (!$self->is_utf8($text,\%err))
	{
	    $self->_set_err_state($ERR_ILLEGAL_CHAR,
				  " Position: $err{pos}," .
				  "Code:$err{code}");
	    return undef;
	}
    }

    my $try=$self->from_to($text,$source_enc,$target_enc);
    if (!defined($try))
    {
	my @possible_src_typo_fixes=$self->guess_typo_fixes($source_enc);
	my @possible_trg_typo_fixes=$self->guess_typo_fixes($target_enc);

	for my $src_enc_guess ($self->guess_typo_fixes($source_enc))
	{
	    for my $trg_enc_guess ($self->guess_typo_fixes($target_enc))
	    {
		my $try=$self->from_to($text,$src_enc_guess,$trg_enc_guess);
		if (defined($try))
		{
		    return $try;
		}
	    }
	}
	
	$self->_set_err_state($ERR_UNABLE_TO_GUESS);
	return undef;
    }

    return $try;
}

sub guess_and_convert
{
    my $self=shift;
    my $text=shift;
    my $type=shift;
    my $sub_type=shift;
    my $target_enc=shift;

    $self->_set_err_state($ERR_OK);

    if (!defined($text))
    {
	$self->_set_err_state($ERR_DOC);
	return undef;
    }
    if (!(defined($type) && defined($sub_type)))
    {
	($type,$sub_type)=$self->{docTypeWizard}->guess($text);
	if (!(defined($type) && defined($sub_type)))
	{
	    $self->_set_err_state($ERR_TYPE_GUESS,
				  $self->{docTypeWizard}->errmsg());
	    return undef;
	}
    }
    
    my @enc_guesses=$self->guess($text,$type,$sub_type);
    if (scalar(@enc_guesses)==0)
    {
	$self->_set_err_state($ERR_GUESS);
	return undef;
    }

    my $result; 
    for my $enc_guess (@enc_guesses)
    {
	if ( $target_enc eq "utf8" && ( $enc_guess =~ /utf-?8/i ) ) {
	   return $text;
        } else {
           $result=$self->convert($text,$enc_guess,$target_enc);
	   if (defined($result))
	   {
	       return $result;
	   }
	}
    }
    if (!defined($result))
    {
        #  test if its UTF-8 already
	&Encode::_utf8_on($text);
	if (  &Encode::is_utf8($text) ) {
		return $text;
        }
         &Encode::_utf8_off($text);
	$self->_set_err_state($ERR_GUESS_AND_CONVERT);
        # print STDERR join("==", @enc_guesses) . " -> $target_enc : undef\n";
	# print STDERR "\n$text\n\n";
	return undef;
    }

    return $result;
}

sub guess_typo_fixes
{
    my $self=shift;
    my $typo=shift;

    my @possibilities=($typo);
    if ($typo=~/^\s*(?:utf|uft)-?8\s*$/isgo)
    {
	push(@possibilities,'utf8');
    }
    if ($typo=~/^\s*(?:utf|uft)-16\s*$/isgo)
    {
	push(@possibilities,'UTF-16');
    }
    if ($typo=~/^\s*iso-?8559-?1\s*$/isgo)
    {
	push(@possibilities,'iso-8859-1');
    }
    if ($typo=~/^\s*ecu-?(kr|jp|cn|tw|jisx0213)\s*$/isgo)
    {
	push(@possibilities,"euc-$1");
    }
    if ($typo=~/^\s*(?:uft|utf)-?32\s*$/isgo)
    {
	push(@possibilities,'UTF-32');
    }
    if ($typo=~/^\s*(?:acsii|asici)\s*$/isgo)
    {
	push(@possibilities,'ascii');
    }
    if ($typo=~/^\s*(?:acsii|asici)-?ctrl\s*$/isgo)
    {
	push(@possibilities,'ascii-ctrl');
    }
    if ($typo=~/^\s*(?:utf|uft)-?7\s*$/isgo)
    {
	push(@possibilities,'UTF-7');
    }
    if ($typo=~/^\s*macintosh\s*$/isgo)
    {
	for (my $i=1; $i<=11; $i++)
	{
	    push(@possibilities,"iso-8859-$i");
	}
	push(@possibilities,'viscii');
    }
    if ($typo=~/^\s*iso-8559-(\d)\s*$/isgo)
    {
	push(@possibilities,"iso-8859-$1");
    }
    if ($typo=~/^\s*iso-8895-(\d)\s*$/isgo)
    {
	push(@possibilities,"iso-8859-$1");
    }
    if ($typo=~/^\s*(?:utf|uft)-?16be\s*$/isgo)
    {
	push(@possibilities,'UTF-16BE');
    }
    if ($typo=~/^\s*(?:utf|uft)-?16le\s*$/isgo)
    {
	push(@possibilities,'UTF-16LE');
    }

    return @possibilities;
}

########################################################################3
#
# Private methods

lib/Alvis/Document/Encoding.pm  view on Meta::CPAN

	@guesses=HTML::Encoding::encoding_from_xml_document($text);
    };
    if ($@)
    {
	$self->_set_err_state($ERR_XML,"$@");
	return ();
    }
    
    if (scalar(@guesses))
    {
	return @guesses;
    }
    
    return $self->_HTML($text);
}

sub _plain_text
{
    my $self=shift;
    my $text=shift;

    if (!defined($text) || length($text)<1)
    {
	$self->_set_err_state($ERR_DOC);
	return ();
    }

    my $enc=guess_encoding($text);
    if (ref($enc))
    {
	return ($enc->name());
    }
    else
    {
	$self->_set_err_state($ERR_ENCODE_GUESS,"$@");
	return ();
    }
}

1;
__END__

=head1 NAME

Alvis::Encoding - Perl extension for guessing and checking the encoding of
documents.

=head1 SYNOPSIS

 use Alvis::Encoding;

 # Create a new instance
 my $e=Alvis::Encoding->new();
 if (!defined($e))
 {
    die "Instantiating Alvis::Encoding failed.";
 }

 # Check that a (decimal) character code is legal UTF-8
 my $code=55;
 if (!$e->code_is_utf8($code))
 {
    # The message will contain the position and the offending character's code 
    die $e->errmsg();
 }

 # Check that a text is legal UTF-8
 my $text;
 if (!$e->is_utf8($text))
 {
    # The message will contain the position and the offending character's code 
    die $e->errmsg();
 }

 # If you need to obtain the position (1..) and the offending character,
 # pass a placeholder in a hash ref argument:
 my %err=();
 if (!$e->is_utf8($text,\%err))
 {
    my $position=$err{pos};
    my $code=$err{code};
    . . . 
 }

 # 
 # Guess the encoding of a document given a guess for its type 
 #
 my $type_guesser=Alvis::Document::Type->new();
 my ($doc_type,$doc_sub_type)=$type_guesser->guess($text);
 my $doc_encoding=$e->guess($text,$doc_type,$doc_sub_type);
 if (!defined($doc_encoding))
 {
     die('Cannot guess. ' . $e->errmsg());
 }

 # 
 # Try converting a document to UTF-8 with only its type known
 #
 my $type_guesser=Alvis::Document::Type->new();
 my ($doc_type,$doc_sub_type)=$type_guesser->guess($text);
 my $doc_in_utf8=$e->try_to_convert_to_utf8($text,$doc_type,$doc_sub_type);
 if (!defined($doc_in_utf8))
 {
     die('Cannot guess. ' . $e->errmsg());
 }
 
 # Try to guess what was meant 
 my @possibilities=$e->guess_typo_fixes('uft-8');

=head1 DESCRIPTION

A collection of methods for guessing, confirming and fixing the encoding
of a document.

=head1 METHODS

=head2 new()

Options:

    defaultDocType       default type for a document. Default: text.
    defaultDocSubType    default sub type for a document. Default: html.
    defaultEncoding      default encoding for a document. Default: iso-8859-1.

=head2 code_is_utf8(decimal_code)

Returns 1 if the (decimal) character code is legal UTF-8.

=head2 is_utf8(text,err_hash_ref)

Returns 1 if all of the characters of $text are legal UTF-8
Else, returns 0 and sets an error message specifying the location
(1..) of the first illegal character code
If you wish to obtain the position and offending code, pass a 
hash ref ($err_hash_ref). The info is in $err_hash_ref->{pos} and
$err_hash_ref->{code}.

=head2 guess(text,doc_type,doc_sub_type)

Guess the encoding of a document given a guess for its type (and subtype).

=head2 guess_and_convert(text,doc_type,doc_sub_type,target_encoding)

Tries to first guess the encoding of the document given a guess at its
type and subtype, and then tries to convert it to $target_encoding.

=head2 convert(text,source_encoding,target_encoding)

Tries to convert $text from $source_encoding to $target_encoding.

=head2 guess_typo_fixes($typo)

Returns a set of guesses for the meant encoding in a case of an encoding
name containing typos.

=head2 errmsg()

Returns a stack of error messages, if any. Empty string otherwise.

=head1 SEE ALSO

Alvis::Document::Type

=head1 AUTHOR

Kimmo Valtonen, E<lt>kimmo.valtonen@hiit.fiE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2006 by Kimmo Valtonen

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.4 or,
at your option, any later version of Perl 5 you may have available.


=cut



( run in 2.184 seconds using v1.01-cache-2.11-cpan-140bd7fdf52 )