Alvis-Convert
view release on metacpan or search on metacpan
lib/Alvis/Canonical.pm view on Meta::CPAN
package Alvis::Canonical;
use warnings;
use strict;
use Alvis::HTML;
$Alvis::Canonical::VERSION = '0.31';
#############################################################################
#
# Converts an original document in some format to an Alvis canonicalDocument
#
#############################################################################
#############################################################################
#
# Global variables & constants
#
##############################################################################
# Built-in default settings. The names mirror the constructor options
# described in the POD (warnings, convertCharEnts, convertNumEnts,
# sourceEncoding) -- presumably they are the fallbacks used when an option
# is not given; confirm against new().
my $DEF_WARNINGS          = 0;      # off: no warning comments about fixes in the doc
my $DEF_CONVERT_CHAR_ENTS = 1;      # on: convert the "relevant" character entities
my $DEF_CONVERT_NUM_ENTS  = 1;      # on: convert numerical entities
my $DEF_SRC_ENC           = undef;  # undef: the source encoding is guessed

# Debug switch, off by default.
my $DEBUG = 0;
#############################################################################
#
# Error message stuff
#
#############################################################################
# NOTE(review): _set_err_state() accumulates messages in $self->{errstr},
# not in this variable -- confirm it is still needed.
my $ErrStr;

# Error codes: consecutive integers starting at 0, in the order listed.
my (
    $ERR_OK,
    $ERR_NO_HTML_CONV,
    $ERR_HTML_CONV,
    $ERR_CONT2CAN_DOC,
    $ERR_NO_HTML_CLEAN,
    $ERR_MISFORMED_REL_URL,
    $ERR_REL_URL_VS_BASE_MISMATCH,
) = (0 .. 6);

# Canned message for every error code; $ERR_OK maps to the empty string.
# _set_err_state() treats a code without an entry here as an internal error.
my %ErrMsgs;
$ErrMsgs{$ERR_OK}                = "";
$ErrMsgs{$ERR_NO_HTML_CONV}      = "Unable to instantiate the HTML converter";
$ErrMsgs{$ERR_HTML_CONV}         = "Extracting the contents of HTML failed";
$ErrMsgs{$ERR_CONT2CAN_DOC}      = "Converting the HTML's contents failed";
$ErrMsgs{$ERR_NO_HTML_CLEAN}     = "Unable to instantiate the HTML cleaner";
$ErrMsgs{$ERR_MISFORMED_REL_URL} = "Misformed relative URL";
$ErrMsgs{$ERR_REL_URL_VS_BASE_MISMATCH}
    = "Cannot match a relative URL " . "and the URL base";
# Records an error on the object, or clears the recorded errors.
#
# Arguments:
#   $errcode - one of the $ERR_* codes, i.e. a key of %ErrMsgs.
#              $ERR_OK resets $self->{errstr} to the empty string; any other
#              known code appends " <canned message>" to $self->{errstr},
#              so successive errors accumulate.
#   $errmsg  - optional detail text, appended (after a space) behind the
#              canned message. Ignored for $ERR_OK.
#
# Dies with a stack trace (Carp::confess) when $errcode is undefined or is
# not a key of %ErrMsgs.
sub _set_err_state
{
    my $self=shift;
    my $errcode=shift;
    my $errmsg=shift;

    # Carp is not among this module's visible "use" lines, so a bare
    # confess() would die with "Undefined subroutine" instead of the
    # intended message. Load it here and call it fully qualified.
    require Carp;

    if (!defined($errcode))
    {
        Carp::confess("set_err_state() called with an undefined argument.");
    }

    if (exists($ErrMsgs{$errcode}))
    {
        if ($errcode==$ERR_OK)
        {
            # Success: forget everything recorded so far.
            $self->{errstr}="";
        }
        else
        {
            $self->{errstr}.=" " . $ErrMsgs{$errcode};
            if (defined($errmsg))
            {
                $self->{errstr}.=" " . $errmsg;
            }
        }
    }
    else
    {
        Carp::confess("Internal error: set_err_state() called with an " .
                      "unrecognized argument ($errcode).");
    }
}
lib/Alvis/Canonical.pm view on Meta::CPAN
# Wraps $contents in angle brackets and returns the resulting string,
# e.g. "b" becomes "<b>". The invocant is accepted but not used.
sub _alvis_tags2chars
{
    my ($self, $contents) = @_;

    return '<' . $contents . '>';
}
1;
__END__
=head1 NAME
Alvis::Canonical - Perl extension for converting documents in various formats into the Alvis canonical format for documents
=head1 SYNOPSIS
use Alvis::Canonical;
# Create a new instance, specify the conversion of both numeric and
# symbolic character entities to Unicode characters
my $C=Alvis::Canonical->new(convertCharEnts=>1,
convertNumEnts=>1);
if (!defined($C))
{
die("Unable to instantiate Alvis::Canonical.");
}
# Convert an HTML document text in UTF-8 to the canonical format.
# Specify that you want the title and baseURL as well, if any can be
# determined.
my ($txt,$header)=$C->HTML($html,
{title=>1,
baseURL=>1});
if (!defined($txt))
{
die $C->errmsg();
}
=head1 DESCRIPTION
Assumes the input is in UTF-8 and does NOT contain '\0's (or rather that
they carry no meaning and are removable).
=head1 METHODS
=head2 new()
Available options:
warnings Issue warnings about badly faulty original HTML where
we have to resort to a heuristic solution.
Puts a warning to STDERR documenting the error and
the solution. Default: no.
convertCharEnts Convert HTML symbolic character entities to UTF-8
characters? Default: yes.
convertNumEnts Convert HTML numerical character entities to UTF-8
characters? Default: yes.
sourceEncoding the encoding of the source documents. Default: undef,
which means it is guessed.
my $C=Alvis::Canonical->new(convertCharEnts=>1,
convertNumEnts=>1);
if (!defined($C))
{
die("Unable to instantiate Alvis::Canonical.");
}
=head2 HTML($html,$options)
Converts dirty HTML to a valid Alvis canonicalDocument. $options is
a mechanism for returning the title and base URL of the document.
If their extraction is desired, set fields 'title' and 'baseURL'
to a defined value. If you know the encoding of the source document,
set option 'sourceEncoding', e.g.
my ($txt,$header)=$C->HTML($html,
{title=>1,
baseURL=>1,
sourceEncoding=>'iso-8859-2'});
=head2 errmsg()
Returns a stack of error messages, if any. Empty string otherwise.
=head1 SEE ALSO
Alvis::Convert
=head1 AUTHOR
Kimmo Valtonen, E<lt>kimmo.valtonen@hiit.fiE<gt>
=head1 COPYRIGHT AND LICENSE
Copyright (C) 2006 by Kimmo Valtonen
This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.4 or,
at your option, any later version of Perl 5 you may have available.
=cut
( run in 1.718 second using v1.01-cache-2.11-cpan-e1769b4cff6 )