Alvis-Convert
view release on metacpan or search on metacpan
lib/Alvis/HTML.pm view on Meta::CPAN
return $self;
}
sub _init
{
my $self=shift;
$self->{assertHTML}=$DEF_ASSERT_HTML;
$self->{keepAll}=$DEF_KEEP_ALL;
$self->{assertSourceAssumptions}=$DEF_SRC_ASS;;
$self->{convertCharEnts}=$DEF_CONVERT_CHAR_ENTS;
$self->{convertNumEnts}=$DEF_CONVERT_NUM_ENTS;
$self->{cleanWhitespace}=$DEF_CLEAN_WS;
$self->{sourceEncoding}=$DEF_SRC_ENCODING;
if (defined(@_))
{
my %args=@_;
@$self{ keys %args }=values(%args);
}
}
#############################################################################
#
# Public methods
#
##############################################################################
#
# Returns (<contents as text>,<header hash ref>)
#
sub clean
{
my $self=shift;
my $html=shift;
my $opts=shift; # if a title/base URL is wished for as well
# returned in a header hash with keys
# title, baseURL
my %header=(title=>undef,
baseURL=>undef);
$self->_set_err_state($ERR_OK); # clean the slate
# Make it utf-8 if not already
my $src_enc;
if ($opts->{sourceEncoding})
{
$src_enc=$opts->{sourceEncoding};
}
elsif (!exists($opts->{sourceEncoding}) && $self->{sourceEncoding})
{
$src_enc=$self->{sourceEncoding};
}
if ($src_enc)
{
if ($src_enc!~/^\s*utf-?8\s*$/)
{
$html=$self->{encodingWiz}->convert($html,
$src_enc,
'utf8');
if (!defined($html))
{
$self->_set_err_state($ERR_UTF8_CONV,
$self->{encodingWiz}->errmsg());
return (undef,\%header); # signals "do not pass on"
}
}
}
else # try guessing the encoding
{
$html=$self->{encodingWiz}->guess_and_convert($html,
'text',
'html',
'utf8');
if (!defined($html))
{
$self->_set_err_state($ERR_GUESS_ENC_UTF8_CONV,
$self->{encodingWiz}->errmsg());
return (undef,\%header); # signals "do not pass on"
}
}
# ex nihilo nihil
#
if (!defined($html) || $html=~/^\s*$/sgo)
{
if ($self->{keepAll})
{
return ("\n",\%header);
}
else
{
$self->_set_err_state($ERR_EMPTY_DOC);
return (undef,\%header); # signals "do not pass on"
}
}
# Check if this really looks like "HTML"
#
if ($self->{assertHTML})
{
#
# If we're lucky...
#
if ($html=~/<!DOCTYPE\s+(\S+)/isgo)
{
my $type=$1;
if ($type!~/(?:html|wml)/igo)
{
if ($self->{keepAll})
{
return ("\n",\%header);
}
else
{
$self->_set_err_state($ERR_UNK_DOCTYPE,"($type)");
return (undef,\%header); # signals "do not pass on"
}
}
}
# Otherwise, use a weaker way of checking... a single
# signature start tag will do.
#
if ($html!~/<(?:(?i)html|body)\W/sgo)
{
if ($self->{keepAll})
{
return ("\n",\%header);
}
else
{
$self->_set_err_state($ERR_NO_SIGNATURE);
return (undef,\%header); # signals "do not pass on"
}
}
}
if ($self->{assertSourceAssumptions})
{
my %err;
if (!$self->{encodingWiz}->is_utf8($html,\%err))
{
$self->_set_err_state($ERR_SRC_NOT_IN_UTF8,
$self->{encodingWiz}->errmsg());
return (undef,\%header); # signals "do not pass on"
}
# Remove '\0's just in case. Replace by a ' ' just in case they
# separated something meaningful in the original.
$html=~s/[\0]+/ /sgo;
}
# Remove comments
#
$html=~s/<\!\-\-.*?\-\->//sgo;
# Remove some MS & declaration crap Loses some (very little) maybe,
# but suffices for Alvis purposes.
#
$html=~s/<\!.*?>//sgo;
# Extract the title if any, if desired
#
if (defined($opts->{title}))
{
my $title_cand;
if ($html=~/<title\W(.*?)<\/title>/isgo)
{
#
# Extract it
#
$title_cand=$1;
$title_cand=~s/=\s*([\"\'])(.*?)\1/&_neutralize_trouble($1,$2)/sgoe;
$title_cand=~s/^.*?>//sgo;
#
# Clean it
#
my $c;
$title_cand=~s/(?:&\#(\d+);?)/$1<256 && $1>0 ? chr($1) : ""/ego;
$title_cand=~s/(?:&\#[xX]([0-9a-fA-F]+);?)/$c=hex($1); $c<256 && $c>0 ? chr($c) : ""/ego;
$title_cand=~s/(?:&(\w+);?)/$self->_char_ent2char($1)/ego;
$title_cand=~s/\s+/ /sgo;
$title_cand=~s/^\s+//sgo;
$title_cand=~s/\s+$//sgo;
$title_cand=~s/[^A-Za-zÆÁÂÀÅÃÄÇÐÉÊÈËÍÎÌÏÑÓÔÒØÕÖÞÚÛÙÜÝáâæàåãäçéêèðëíîìïñóôòøõößþúûùüýÿ¦¨´¸¼½¾ ,\.\-:\?]//isgo;
}
$header{title}=$title_cand;
}
# Extract the base URL if any, if desired
#
if (defined($opts->{baseURL}))
{
my $base_cand;
if ($html=~/<base\W(.*?)>/isgo)
{
#
lib/Alvis/HTML.pm view on Meta::CPAN
# Fix attributes of interest
$html=~s/(<(?:frame|iframe|img)\W[^>]*?src\s*=\s*)([\"\'])(\S*?)(\s.*?)?>/$self->_fix_attr($1,$2,$3,$4)/isgoe;
# Sometimes "HTML" contains Alvis tags...double safeguard them
$html=~s/<(\/?(?:(?i)section|list|item|ulink).*?)>/\<$1\>/sgo;
}
if ($DEBUG)
{
warn $html;
}
# If wished for, convert character entities
if ($self->{convertCharEnts})
{
$html=~s/(?:&(\w+);)/$self->_char_ent2char($1)/ego;
}
# If wished for, convert numerical character entities
if ($self->{convertNumEnts})
{
#
# Numerical entities depend on the presumed character set
# of the source HTML. You had better be sure it is UTF-8 or
# should we check here?
#
$html=~s/(?:&\#(\d+);?)/$self->_num_ent2char($1)/ego;
$html=~s/(?:&\#[xX]([0-9a-fA-F]+);?)/$self->_hex_ent2char($1)/ego;
}
if ($self->{cleanWhitespace})
{
# Might look overcomplicated but is 3-4x faster than the
# first, obvious versions and does not have artificial limits on the
# number of consecutive non-\n ws compressed.
$html=~s/\n/\0/go;
$html=~s/\s+/ /go;
$html=~s/[ ](?=\0)//go;
$html=~s/(?<=\0)[ ]//go;
$html=~s/^\0+//sgo;
$html=~s/\0+$//sgo;
$html=~s/\0{3,}/\n\n/go;
$html=~s/\0/\n/go;
}
return ($html,\%header);
}
###########################################################################
#
# Private methods
#
###########################################################################
sub _num_ent2char
{
my $self=shift;
my $num=shift;
# check for invalid codes
if (!$self->{encodingWiz}->code_is_utf8($num))
{
# must be an error, don't try to fix typos atm
return "&#$num;";
}
my $str=pack("U",$num);
return $str;
}
sub _hex_ent2char
{
my $self=shift;
my $num=shift;
$num=hex($num);
return $self->_num_ent2char($num);
}
sub _char_ent2char
{
my $self=shift;
my $name=shift;
if (defined($Ent2Unicode{$name}))
{
return $Ent2Unicode{$name};
}
else
{
return "&${name};";
}
}
#
# Fix a relevant broken attribute value so it ends with the same
# quote char it starts with
#
sub _fix_attr
{
my $self=shift;
my $prefix=shift;
my $quote=shift;
my $attr_value=shift;
my $suffix=shift;
my $txt=$prefix . $quote . $attr_value;
if ($attr_value!~/$quote/sgo)
{
# Add the ending quote
$txt.=$quote;
}
if (defined($suffix))
{
# the attr value breaks at a space, where the closing > should be
$txt.=$suffix;
}
$txt.='>';
( run in 0.384 second using v1.01-cache-2.11-cpan-140bd7fdf52 )