Alvis-Convert
view release on metacpan or search on metacpan
lib/Alvis/HTML.pm view on Meta::CPAN
#
# Extract the base URL
#
my $base_pars=$1;
$base_pars=~s/=\s*([\"\'])(.*?)\1/&_neutralize_trouble($1,$2)/sgoe;
if ($base_pars=~/href\s*=\s*([\"\'])(.*?)\1/isgo)
{
my $href=$2;
$href=~s/\&/\&/go;
$base_cand=$href;
}
}
$header{baseURL}=$base_cand;
}
# Decapitation to avoid including the title at least
# and to make the following faster.
#
$html=~s/<head\W.*?<\/head>//isgo;
# Remove the entire elements STYLE, (NO)SCRIPT
#
$html=~s/<style.*?<\/style>//isgo;
$html=~s/<script.*?<\/script>//isgo;
$html=~s/<noscript.*?<\/noscript>//isgo;
# Tag removal. Optimized for speed.
# Hangs on the assumption that the input cannot contain '\0's.
# Algorithm:
# 1. Mark & replace legal tag starts with '\0'.
# 2. Go from the start of a tag to the beginning of
# the next one, neutralizing any confusing chars
# inside possible attribute values.
# 3. Pick the leftmost '>' before the start of the next
# tag as the end of the tag.
# 4. Remove all tags.
#
if ($self->{alvisKeep})
{
$html=~s/<\/?(?:(?i)a|frame|iframe|h[1-6]|p|div|dl|ul|ol|table|li|dd|dt|th|td|caption)(?=\W)/\0/sgo;
}
if ($self->{alvisRemove})
{
$html=~s/<\/?(?:(?i)tr|blockquote|hr|br|dir|menu|form|fieldset|legend|label|input|select|option|textarea|isindex|noframes|frameset|tfoot|body|tbody|html|head|abbr|acronym|address|applet|area|b|base|basefont|bdo|big|button|center|cite|code|col|colgro...
}
if ($self->{obsolete})
{
$html=~s/<\/?(?:(?i)header|nextid|section|listing|xmp|plaintext)(?=\W)/\0/sgo;
}
if ($self->{proprietary})
{
$html=~s/<\/?(?:(?i)align|blink|embed|ilayer|keygen|layer|multicol|noembed|nolayer|nosave|spacer|inlineinput|sound|audioscope|blackface|animate|bgsound|comment|marquee|xml|o:p|csaction|csactions|csactiondict|csscriptdict|csactionitem|csobj|wbr|nobr|...
}
if ($self->{xhtml})
{
$html=~s/<\/?(?:(?i)ruby|rbc|rtc|rb|rt|rp)(?=\W)/\0/sgo;
}
if ($self->{wml})
{
$html=~s/<\/?(?:(?i)access|card|template|wml|anchor|do|onevent|postfield|go|noop|prev|refresh|fieldset|optgroup|select|setvar|timer)(?=\W)/\0/sgo;
}
# $html=~s/=\s*([\"\'])([^\0]*?)\1/&_neutralize_trouble($1,$2)/sgoe;
$html=~s/(?<=\0).*?>//sgo;
$html=~s/\0/ /go;
# We have removed those tags we wanted to now
# If we have some tags left, do some fixing
if (!$self->{alvisKeep}||!$self->{alvisRemove}||!$self->{obsolete}||
!$self->{proprietary}||!$self->{xhtml}||!$self->{wml})
{
# Often we have <TAG ... </TAG>. Fix that.
$html=~s/(<\/?(?:(?i)a|frame|iframe|h[1-6]|p|div|dl|ul|ol|table|li|dd|dt|th|td|caption|tr|blockquote|hr|br|dir|menu|form|fieldset|legend|label|input|select|option|textarea|isindex|noframes|frameset|tfoot|body|tbody|html|head|abbr|acronym|address|app...
$html=~s/(?<=\0)([^>]*?)(?=\0)/$1>/sgo;
$html=~s/(?<=\0)([^\0>]*?)$/$1>/sgo;
$html=~s/\0/ /go;
}
# Alvis needs some finer tuning
if (!$self->{alvisKeep})
{
# Fix attributes of interest
$html=~s/(<a\W[^>]*?href\s*=\s*)([\"\'])(\S*?)(\s.*?)?>/$self->_fix_attr($1,$2,$3,$4)/isgoe;
# Fix attributes of interest
$html=~s/(<(?:frame|iframe|img)\W[^>]*?src\s*=\s*)([\"\'])(\S*?)(\s.*?)?>/$self->_fix_attr($1,$2,$3,$4)/isgoe;
# Sometimes "HTML" contains Alvis tags...double safeguard them
$html=~s/<(\/?(?:(?i)section|list|item|ulink).*?)>/\<$1\>/sgo;
}
if ($DEBUG)
{
warn $html;
}
# If wished for, convert character entities
if ($self->{convertCharEnts})
{
$html=~s/(?:&(\w+);)/$self->_char_ent2char($1)/ego;
}
# If wished for, convert numerical character entities
if ($self->{convertNumEnts})
{
#
# Numerical entities depend on the presumed character set
# of the source HTML. You had better be sure it is UTF-8 or
# should we check here?
#
$html=~s/(?:&\#(\d+);?)/$self->_num_ent2char($1)/ego;
$html=~s/(?:&\#[xX]([0-9a-fA-F]+);?)/$self->_hex_ent2char($1)/ego;
}
if ($self->{cleanWhitespace})
{
# Might look overcomplicated but is 3-4x faster than the
# first, obvious versions and does not have artificial limits on the
# number of consecutive non-\n ws compressed.
$html=~s/\n/\0/go;
$html=~s/\s+/ /go;
$html=~s/[ ](?=\0)//go;
$html=~s/(?<=\0)[ ]//go;
$html=~s/^\0+//sgo;
$html=~s/\0+$//sgo;
$html=~s/\0{3,}/\n\n/go;
$html=~s/\0/\n/go;
}
return ($html,\%header);
}
###########################################################################
#
( run in 1.121 second using v1.01-cache-2.11-cpan-acebb50784d )