Alvis-Convert
view release on metacpan or search on metacpan
lib/Alvis/HTML.pm view on Meta::CPAN
package Alvis::HTML;
use warnings;
use strict;
use Alvis::Document::Encoding;
$Alvis::HTML::VERSION = '0.31';
#############################################################################
#
# Kimmo Valtonen, based on earlier work assisted by Ville Tuulos and
# Antti Tuominen
#
#############################################################################
#############################################################################
#
# Global variables & constants
#
##############################################################################
# Module-wide default settings (file-scoped lexicals).
# NOTE(review): the code that consumes these defaults is not visible in
# this part of the file; presumably the constructor copies them into the
# object when the caller supplies no explicit value -- confirm.
#
# Default: do we assert that our assumptions about the source hold?
# (1 = yes.) Presumably backs the object's assertSourceAssumptions
# setting, which guards the UTF-8 validity check and the removal of
# '\0' characters in the conversion code -- confirm.
my $DEF_SRC_ASS=1;
# Default: do we check first to see if the document really looks like
# HTML? (1 = yes.) Presumably backs the assertHTML setting, which guards
# the DOCTYPE test and the <html>/<body> signature test -- confirm.
my $DEF_ASSERT_HTML=1;
# Default: do we pass on even non-HTML documents? (0 = no.) Presumably
# backs the keepAll setting; with keepAll set, the conversion code returns
# "\n" instead of undef for empty or non-HTML input -- confirm.
my $DEF_KEEP_ALL=0;
# Default: do we replace symbolic character entities (e.g. &amp;) with
# the actual characters? (1 = yes.)
my $DEF_CONVERT_CHAR_ENTS=1;
# Default: do we replace numerical character entities with the actual
# characters? (0 = no.)
my $DEF_CONVERT_NUM_ENTS=0;
# Default: do we try to clean extra whitespace? (0 = no.)
my $DEF_CLEAN_WS=0;
# Default source encoding name. The conversion code skips re-encoding
# when the effective source encoding matches /^\s*utf-?8\s*$/.
my $DEF_SRC_ENCODING='utf-8';
# Debug flag, off by default. Its uses are not visible in this part of
# the file.
my $DEBUG=0;
###########################################################################
#
# Symbolic character entity to Unicode (decimal) mapping
#
###########################################################################
my %Ent2Unicode=(
quot =>'34', # " HTML 2.0 quotation mark
amp =>'38', # & HTML 2.0 ampersand
lt =>'60', # < HTML 2.0 less-than sign
gt =>'62', # > HTML 2.0 greater-than sign
nbsp =>'160', # HTML 3.2 no-break space
iexcl =>'161', # ¡ HTML 3.2 inverted exclamation mark
cent =>'162', # ¢ HTML 3.2 cent sign
pound =>'163', # £ HTML 3.2 pound sign
curren =>'164', # ¤ HTML 3.2 currency sign
yen =>'165', # ¥ HTML 3.2 yen sign
brvbar =>'166', # ¦ HTML 3.2 broken bar
sect =>'167', # § HTML 3.2 section sign
uml =>'168', # ¨ HTML 3.2 diaeresis
copy =>'169', # © HTML 3.2 copyright sign
ordf =>'170', # ª HTML 3.2 feminine ordinal indicator
laquo =>'171', # « HTML 3.2 left-pointing double angle quotation mark
not =>'172', # ¬ HTML 3.2 not sign
shy =>'173', # HTML 3.2 soft hyphen
reg =>'174', # ® HTML 3.2 registered sign
macr =>'175', # ¯ HTML 3.2 macron
deg =>'176', # ° HTML 3.2 degree sign
plusmn =>'177', # ± HTML 3.2 plus-minus sign
sup2 =>'178', # ² HTML 3.2 superscript two
sup3 =>'179', # ³ HTML 3.2 superscript three
acute =>'180', # ´ HTML 3.2 acute accent
micro =>'181', # µ HTML 3.2 micro sign
para =>'182', # ¶ HTML 3.2 pilcrow sign
middot =>'183', # · HTML 3.2 middle dot
cedil =>'184', # ¸ HTML 3.2 cedilla
sup1 =>'185', # ¹ HTML 3.2 superscript one
ordm =>'186', # º HTML 3.2 masculine ordinal indicator
raquo =>'187', # » HTML 3.2 right-pointing double angle quotation mark
frac14 =>'188', # ¼ HTML 3.2 vulgar fraction one quarter
frac12 =>'189', # ½ HTML 3.2 vulgar fraction one half
frac34 =>'190', # ¾ HTML 3.2 vulgar fraction three quarters
iquest =>'191', # ¿ HTML 3.2 inverted question mark
Agrave =>'192', # À HTML 2.0 latin capital letter a with grave
Aacute =>'193', # Á HTML 2.0 latin capital letter a with acute
Acirc =>'194', # Â HTML 2.0 latin capital letter a with circumflex
lib/Alvis/HTML.pm view on Meta::CPAN
my %header=(title=>undef,
baseURL=>undef);
$self->_set_err_state($ERR_OK); # clean the slate
# Make it utf-8 if not already
my $src_enc;
if ($opts->{sourceEncoding})
{
$src_enc=$opts->{sourceEncoding};
}
elsif (!exists($opts->{sourceEncoding}) && $self->{sourceEncoding})
{
$src_enc=$self->{sourceEncoding};
}
if ($src_enc)
{
if ($src_enc!~/^\s*utf-?8\s*$/)
{
$html=$self->{encodingWiz}->convert($html,
$src_enc,
'utf8');
if (!defined($html))
{
$self->_set_err_state($ERR_UTF8_CONV,
$self->{encodingWiz}->errmsg());
return (undef,\%header); # signals "do not pass on"
}
}
}
else # try guessing the encoding
{
$html=$self->{encodingWiz}->guess_and_convert($html,
'text',
'html',
'utf8');
if (!defined($html))
{
$self->_set_err_state($ERR_GUESS_ENC_UTF8_CONV,
$self->{encodingWiz}->errmsg());
return (undef,\%header); # signals "do not pass on"
}
}
# ex nihilo nihil
#
if (!defined($html) || $html=~/^\s*$/sgo)
{
if ($self->{keepAll})
{
return ("\n",\%header);
}
else
{
$self->_set_err_state($ERR_EMPTY_DOC);
return (undef,\%header); # signals "do not pass on"
}
}
# Check if this really looks like "HTML"
#
if ($self->{assertHTML})
{
#
# If we're lucky...
#
if ($html=~/<!DOCTYPE\s+(\S+)/isgo)
{
my $type=$1;
if ($type!~/(?:html|wml)/igo)
{
if ($self->{keepAll})
{
return ("\n",\%header);
}
else
{
$self->_set_err_state($ERR_UNK_DOCTYPE,"($type)");
return (undef,\%header); # signals "do not pass on"
}
}
}
# Otherwise, use a weaker way of checking... a single
# signature start tag will do.
#
if ($html!~/<(?:(?i)html|body)\W/sgo)
{
if ($self->{keepAll})
{
return ("\n",\%header);
}
else
{
$self->_set_err_state($ERR_NO_SIGNATURE);
return (undef,\%header); # signals "do not pass on"
}
}
}
if ($self->{assertSourceAssumptions})
{
my %err;
if (!$self->{encodingWiz}->is_utf8($html,\%err))
{
$self->_set_err_state($ERR_SRC_NOT_IN_UTF8,
$self->{encodingWiz}->errmsg());
return (undef,\%header); # signals "do not pass on"
}
# Remove '\0's just in case. Replace by a ' ' just in case they
# separated something meaningful in the original.
$html=~s/[\0]+/ /sgo;
}
# Remove comments
#
$html=~s/<\!\-\-.*?\-\->//sgo;
# Remove some MS & declaration crap. This may lose a (very small) amount
# of content, but suffices for Alvis purposes.
#
( run in 0.644 second using v1.01-cache-2.11-cpan-39bf76dae61 )