Alvis-Convert

 view release on metacpan or  search on metacpan

lib/Alvis/Canonical.pm  view on Meta::CPAN

package Alvis::Canonical;

use warnings;
use strict;

use Alvis::HTML;

$Alvis::Canonical::VERSION = '0.31';

#############################################################################
#
# Converts an original document in some format to an Alvis canonicalDocument
#
#############################################################################

#############################################################################
#
#     Global variables & constants
#
##############################################################################

# Default option values (the POD for new() documents the matching options).
my ($DEF_WARNINGS,            # add warning comments about fixes to the doc?
    $DEF_CONVERT_CHAR_ENTS,   # convert "relevant" char ents
    $DEF_CONVERT_NUM_ENTS,    # convert numerical entities
    $DEF_SRC_ENC              # undef means: guess the source encoding
    ) = (0, 1, 1, undef);

# Debug flag, initialised to 0.
my $DEBUG = 0;

#############################################################################
#
#     Error message stuff
#
#############################################################################

# File-scoped error string (not referenced in the code visible here).
my $ErrStr;

# Numeric error codes, numbered consecutively starting from $ERR_OK == 0.
my (
    $ERR_OK,
    $ERR_NO_HTML_CONV,
    $ERR_HTML_CONV,
    $ERR_CONT2CAN_DOC,
    $ERR_NO_HTML_CLEAN,
    $ERR_MISFORMED_REL_URL,
    $ERR_REL_URL_VS_BASE_MISMATCH
) = (0 .. 6);

# Canned message text for every error code; $ERR_OK maps to the empty string.
my %ErrMsgs = (
    $ERR_OK                => "",
    $ERR_NO_HTML_CONV      => "Unable to instantiate the HTML converter",
    $ERR_HTML_CONV         => "Extracting the contents of HTML failed",
    $ERR_CONT2CAN_DOC      => "Converting the HTML's contents failed",
    $ERR_NO_HTML_CLEAN     => "Unable to instantiate the HTML cleaner",
    $ERR_MISFORMED_REL_URL => "Misformed relative URL",
    $ERR_REL_URL_VS_BASE_MISMATCH =>
        "Cannot match a relative URL and the URL base"
);

# Records the outcome of an operation in the object's error string.
#
#   $errcode  one of the $ERR_* codes; must be defined and a key of %ErrMsgs
#   $errmsg   optional extra detail, appended after the canned message
#
# $ERR_OK resets $self->{errstr} to the empty string. Any other known code
# appends " <canned message>" (plus " <errmsg>" when given) to
# $self->{errstr}, so successive failures accumulate in one string.
# An undefined or unrecognized code aborts via Carp::confess() with a
# stack trace.
sub _set_err_state
{
    my ($self, $errcode, $errmsg) = @_;

    # Carp is not loaded by this module's "use" lines, so a bare confess()
    # call would die with "Undefined subroutine" instead of the intended
    # message. Load it here and call it by its fully qualified name.
    require Carp;

    if (!defined($errcode))
    {
        Carp::confess("set_err_state() called with an undefined argument.");
    }

    if (!exists($ErrMsgs{$errcode}))
    {
        Carp::confess("Internal error: set_err_state() called with an " .
                      "unrecognized argument ($errcode).");
    }

    if ($errcode==$ERR_OK)
    {
        # Success wipes any previously accumulated messages.
        $self->{errstr}="";
        return;
    }

    $self->{errstr}.=" " . $ErrMsgs{$errcode};
    if (defined($errmsg))
    {
        $self->{errstr}.=" " . $errmsg;
    }
    return;
}

lib/Alvis/Canonical.pm  view on Meta::CPAN


# Returns the given text wrapped in angle brackets: "<" . $contents . ">".
# The invocant is accepted but not used.
sub _alvis_tags2chars
{
    my ($self, $contents) = @_;

    return '<' . $contents . '>';
}

1;
__END__

=head1 NAME

Alvis::Canonical - Perl extension for converting documents in various formats into the Alvis canonical format for documents 

=head1 SYNOPSIS

 use Alvis::Canonical;

 # Create a new instance, specify the conversion of both numeric and 
 # symbolic character entities to Unicode characters
 my $C=Alvis::Canonical->new(convertCharEnts=>1,
                             convertNumEnts=>1);
 if (!defined($C))
 {
     die("Unable to instantiate Alvis::Canonical.");
 }

 # Convert an HTML document text in UTF-8 to the canonical format.
 # Specify that you want the title and baseURL as well, if any can be
 # determined.
 my ($txt,$header)=$C->HTML($html,
                            {title=>1,
                             baseURL=>1});
 if (!defined($txt))
 {
    die $C->errmsg();
 }

=head1 DESCRIPTION

Assumes the input is in UTF-8 and does NOT contain '\0's (or rather that 
they carry no meaning and are removable). 

=head1 METHODS

=head2 new()

Available options:

    warnings         Issue warnings about badly broken original HTML where
                     we have to resort to a heuristic solution.
                     Puts a warning to STDERR documenting the error and
                     the solution. Default: no.
    convertCharEnts  Convert HTML symbolic character entities to UTF-8 
                     characters? Default: yes.
    convertNumEnts   Convert HTML numerical character entities to UTF-8 
                     characters? Default: yes.
    sourceEncoding   the encoding of the source documents. Default: undef,
                     which means it is guessed.  
     
  my $C=Alvis::Canonical->new(convertCharEnts=>1,
                              convertNumEnts=>1);
  if (!defined($C))
  {
    die("Unable to instantiate Alvis::Canonical.");
  }

=head2 HTML($html,$options)

Converts dirty HTML to a valid Alvis canonicalDocument. $options is
a mechanism for returning the title and base URL of the document.
If their extraction is desired, set fields 'title' and 'baseURL'
to a defined value. If you know the encoding of the source document,
set option 'sourceEncoding', e.g. 

  my ($txt,$header)=$C->HTML($html,
                             {title=>1,
                              baseURL=>1,
                              sourceEncoding=>'iso-8859-2'});

=head2 errmsg()

Returns a stack of error messages, if any. Empty string otherwise.

=head1 SEE ALSO

Alvis::Convert

=head1 AUTHOR

Kimmo Valtonen, E<lt>kimmo.valtonen@hiit.fiE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2006 by Kimmo Valtonen

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.4 or,
at your option, any later version of Perl 5 you may have available.


=cut



( run in 1.718 second using v1.01-cache-2.11-cpan-e1769b4cff6 )