Alvis-Convert

 view release on metacpan or  search on metacpan

lib/Alvis/HTML.pm  view on Meta::CPAN

	    #
	    # Extract the base URL
	    #
	    my $base_pars=$1;
	    $base_pars=~s/=\s*([\"\'])(.*?)\1/&_neutralize_trouble($1,$2)/sgoe;
	    
	    if ($base_pars=~/href\s*=\s*([\"\'])(.*?)\1/isgo)
	    {
		my $href=$2;
		$href=~s/\&/\&/go;
		$base_cand=$href;
	    }
	}
	$header{baseURL}=$base_cand;
    }

    # Decapitation to avoid including the title at least
    # and to make the following faster.
    #
    $html=~s/<head\W.*?<\/head>//isgo;

    # Remove the entire elements STYLE, (NO)SCRIPT  
    #
    $html=~s/<style.*?<\/style>//isgo;
    $html=~s/<script.*?<\/script>//isgo;
    $html=~s/<noscript.*?<\/noscript>//isgo;

    # Tag removal. Optimized for speed.
    # Hangs on the assumption that the input cannot contain '\0's.
    # Algorithm:
    #           1. Mark & replace legal tag starts with '\0'.
    #           2. Go from the start of a tag to the beginning of
    #              the next one, neutralizing any confusing chars
    #              inside possible attribute values.
    #           3. Pick the leftmost '>' before the start of the next
    #              tag as the end of the tag.
    #           4. Remove all tags.               
    #
    if ($self->{alvisKeep})
    { 
	$html=~s/<\/?(?:(?i)a|frame|iframe|h[1-6]|p|div|dl|ul|ol|table|li|dd|dt|th|td|caption)(?=\W)/\0/sgo;
    }
    if ($self->{alvisRemove})
    {
	$html=~s/<\/?(?:(?i)tr|blockquote|hr|br|dir|menu|form|fieldset|legend|label|input|select|option|textarea|isindex|noframes|frameset|tfoot|body|tbody|html|head|abbr|acronym|address|applet|area|b|base|basefont|bdo|big|button|center|cite|code|col|colgro...
    }
    if ($self->{obsolete})
    {
	$html=~s/<\/?(?:(?i)header|nextid|section|listing|xmp|plaintext)(?=\W)/\0/sgo;
    }
    if ($self->{proprietary})
    {
	$html=~s/<\/?(?:(?i)align|blink|embed|ilayer|keygen|layer|multicol|noembed|nolayer|nosave|spacer|inlineinput|sound|audioscope|blackface|animate|bgsound|comment|marquee|xml|o:p|csaction|csactions|csactiondict|csscriptdict|csactionitem|csobj|wbr|nobr|...
    }
    if ($self->{xhtml})
    {
	$html=~s/<\/?(?:(?i)ruby|rbc|rtc|rb|rt|rp)(?=\W)/\0/sgo;
    }
    if ($self->{wml})
    {
	$html=~s/<\/?(?:(?i)access|card|template|wml|anchor|do|onevent|postfield|go|noop|prev|refresh|fieldset|optgroup|select|setvar|timer)(?=\W)/\0/sgo;
    }

#    $html=~s/=\s*([\"\'])([^\0]*?)\1/&_neutralize_trouble($1,$2)/sgoe;
    $html=~s/(?<=\0).*?>//sgo;
    $html=~s/\0/ /go;
    
    # We have removed those tags we wanted to now
    
    # If we have some tags left, do some fixing 
    if (!$self->{alvisKeep}||!$self->{alvisRemove}||!$self->{obsolete}||
	!$self->{proprietary}||!$self->{xhtml}||!$self->{wml})
    {
	# Often we have <TAG ... </TAG>. Fix that.
	$html=~s/(<\/?(?:(?i)a|frame|iframe|h[1-6]|p|div|dl|ul|ol|table|li|dd|dt|th|td|caption|tr|blockquote|hr|br|dir|menu|form|fieldset|legend|label|input|select|option|textarea|isindex|noframes|frameset|tfoot|body|tbody|html|head|abbr|acronym|address|app...
	$html=~s/(?<=\0)([^>]*?)(?=\0)/$1>/sgo;
	$html=~s/(?<=\0)([^\0>]*?)$/$1>/sgo;
	$html=~s/\0/ /go;
    }

    # Alvis needs some finer tuning
    if (!$self->{alvisKeep})
    { 
	# Fix attributes of interest
	$html=~s/(<a\W[^>]*?href\s*=\s*)([\"\'])(\S*?)(\s.*?)?>/$self->_fix_attr($1,$2,$3,$4)/isgoe;
	# Fix attributes of interest
	$html=~s/(<(?:frame|iframe|img)\W[^>]*?src\s*=\s*)([\"\'])(\S*?)(\s.*?)?>/$self->_fix_attr($1,$2,$3,$4)/isgoe;

	# Sometimes "HTML" contains Alvis tags...double safeguard them
	$html=~s/<(\/?(?:(?i)section|list|item|ulink).*?)>/\&lt;$1\&gt;/sgo;
    }
	
    if ($DEBUG)
    {
	warn $html;
    }

    # If wished for, convert character entities 
    if ($self->{convertCharEnts})
    {
	$html=~s/(?:&(\w+);)/$self->_char_ent2char($1)/ego;
    }

    # If wished for, convert numerical character entities 
    if ($self->{convertNumEnts})
    {
	#
	# Numerical entities depend on the presumed character set
        # of the source HTML. You had better be sure it is UTF-8 or
        # should we check here?
	#
	$html=~s/(?:&\#(\d+);?)/$self->_num_ent2char($1)/ego;
	$html=~s/(?:&\#[xX]([0-9a-fA-F]+);?)/$self->_hex_ent2char($1)/ego;
    }

    if ($self->{cleanWhitespace})
    {
	# Might look overcomplicated but is 3-4x faster than the
	# first, obvious versions and does not have artificial limits on the
	# number of consecutive non-\n ws compressed.
	$html=~s/\n/\0/go;
	$html=~s/\s+/ /go;
	$html=~s/[ ](?=\0)//go;
	$html=~s/(?<=\0)[ ]//go;
	$html=~s/^\0+//sgo;
	$html=~s/\0+$//sgo;
	$html=~s/\0{3,}/\n\n/go;
	$html=~s/\0/\n/go;
    }

    return ($html,\%header);
}

###########################################################################
#



( run in 1.121 second using v1.01-cache-2.11-cpan-acebb50784d )