Alvis-Convert
view release on metacpan or search on metacpan
lib/Alvis/HTML.pm view on Meta::CPAN
$self->{assertSourceAssumptions}=$DEF_SRC_ASS;;
$self->{convertCharEnts}=$DEF_CONVERT_CHAR_ENTS;
$self->{convertNumEnts}=$DEF_CONVERT_NUM_ENTS;
$self->{cleanWhitespace}=$DEF_CLEAN_WS;
$self->{sourceEncoding}=$DEF_SRC_ENCODING;
if (defined(@_))
{
my %args=@_;
@$self{ keys %args }=values(%args);
}
}
#############################################################################
#
# Public methods
#
##############################################################################
#
# Returns (<contents as text>,<header hash ref>)
#
sub clean
{
my $self=shift;
my $html=shift;
my $opts=shift; # if a title/base URL is wished for as well
# returned in a header hash with keys
# title, baseURL
my %header=(title=>undef,
baseURL=>undef);
$self->_set_err_state($ERR_OK); # clean the slate
# Make it utf-8 if not already
my $src_enc;
if ($opts->{sourceEncoding})
{
$src_enc=$opts->{sourceEncoding};
}
elsif (!exists($opts->{sourceEncoding}) && $self->{sourceEncoding})
{
$src_enc=$self->{sourceEncoding};
}
if ($src_enc)
{
if ($src_enc!~/^\s*utf-?8\s*$/)
{
$html=$self->{encodingWiz}->convert($html,
$src_enc,
'utf8');
if (!defined($html))
{
$self->_set_err_state($ERR_UTF8_CONV,
$self->{encodingWiz}->errmsg());
return (undef,\%header); # signals "do not pass on"
}
}
}
else # try guessing the encoding
{
$html=$self->{encodingWiz}->guess_and_convert($html,
'text',
'html',
'utf8');
if (!defined($html))
{
$self->_set_err_state($ERR_GUESS_ENC_UTF8_CONV,
$self->{encodingWiz}->errmsg());
return (undef,\%header); # signals "do not pass on"
}
}
# ex nihilo nihil
#
if (!defined($html) || $html=~/^\s*$/sgo)
{
if ($self->{keepAll})
{
return ("\n",\%header);
}
else
{
$self->_set_err_state($ERR_EMPTY_DOC);
return (undef,\%header); # signals "do not pass on"
}
}
# Check if this really looks like "HTML"
#
if ($self->{assertHTML})
{
#
# If we're lucky...
#
if ($html=~/<!DOCTYPE\s+(\S+)/isgo)
{
my $type=$1;
if ($type!~/(?:html|wml)/igo)
{
if ($self->{keepAll})
{
return ("\n",\%header);
}
else
{
$self->_set_err_state($ERR_UNK_DOCTYPE,"($type)");
return (undef,\%header); # signals "do not pass on"
}
}
}
# Otherwise, use a weaker way of checking... a single
# signature start tag will do.
#
if ($html!~/<(?:(?i)html|body)\W/sgo)
{
if ($self->{keepAll})
{
return ("\n",\%header);
}
else
{
lib/Alvis/HTML.pm view on Meta::CPAN
baseURL=>1});
if (!defined($txt))
{
die "Instantiating Alvis::HTML failed.";
}
#
# Remove all HTML tags from the document. Do not assert that the document
# actually is HTML (assertHTML=>0); pass every document on (keepAll=>1).
# HTML is in 'iso-8859-1' (output is always in UTF-8).
# Assert that the source assumptions (UTF-8, no '\0') hold before
# trying to convert.
#
$C=Alvis::HTML->new(alvisKeep=>1,
alvisRemove=>1,
obsolete=>1,
proprietary=>1,
xhtml=>1,
wml=>1,
keepAll=>1,
assertHTML=>0,
convertCharEnts=>1,
convertNumEnts=>1,
sourceEncoding=>'iso-8859-1',
assertSourceAssumptions=>1
);
=head1 DESCRIPTION
Assumes the input is in UTF-8 and does NOT contain '\0's (or rather that
they carry no meaning and are removable).
=head1 METHODS
=head2 new()
Options available:
assertHTML if 1, try to check if the source really
is in any of the recognized dialects.
keepAll if 1, pass all documents on regardless of
their HTMLness. Non-HTML goes forward as '\n'.
Options specifying the HTML subsets whose tags are removed (set an option to a defined value to enable it):
alvisKeep W3's HTML 4.01 tags Alvis::Convert
is interested in
alvisRemove 4.01 tags Alvis::Convert is NOT interested in
obsolete HTML <4.01
proprietary Net-escape,Exploder,...
xhtml XHTML 1.1
wml WML
Note: alvisKeep + alvisRemove == remove all HTML 4.01 tags
convertCharEnts convert symbolic character entities to UTF-8 characters.
convertNumEnts convert numerical character entities to UTF-8
characters.
sourceEncoding encoding of the source HTML text (default: 'utf-8')
If not 'utf-8', HTML is converted to UTF-8.
If undefined, the encoding is guessed first.
assertSourceAssumptions
make sure that before any operations the source is
in UTF-8 and contains no null bytes.
=head2 clean(html,options)
Remove unwanted tags from $html (text). $options is
a mechanism for returning the title and base URL of the document and
setting call-specific parameters.
If their extraction is desired, set the fields 'title' and 'baseURL'
to a defined value, e.g.
my ($txt,$header)=$C->clean($html,
{title=>1,
baseURL=>1});
In $options you can also set the source and target encodings
(sourceEncoding,targetEncoding).
my ($txt,$header)=$C->clean($html,
{title=>1,
baseURL=>1,
sourceEncoding=>'iso-8859-1'});
This will guess the encoding first:
my ($txt,$header)=$C->clean($html,
{title=>1,
baseURL=>1,
sourceEncoding=>undef});
The earlier call with sourceEncoding=>'iso-8859-1' will convert from 'iso-8859-1' to the default output encoding (UTF-8).
=head2 errmsg()
Returns a stack of error messages, if any. Empty string otherwise.
=head1 SEE ALSO
Alvis::Canonical
=head1 AUTHOR
Kimmo Valtonen, E<lt>kimmo.valtonen@hiit.fiE<gt>
=head1 COPYRIGHT AND LICENSE
Copyright (C) 2006 by Kimmo Valtonen
This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.4 or,
at your option, any later version of Perl 5 you may have available.
=cut
( run in 1.678 second using v1.01-cache-2.11-cpan-39bf76dae61 )