Alvis-Convert

 view release on metacpan or  search on metacpan

lib/Alvis/HTML.pm  view on Meta::CPAN

package Alvis::HTML;

use warnings;
use strict;

use Alvis::Document::Encoding;

$Alvis::HTML::VERSION = '0.31';

#############################################################################
#
#     Kimmo Valtonen, based on earlier work assisted by Ville Tuulos and
#     Antti Tuominen
#
#############################################################################

#############################################################################
#
#     Global variables & constants
#
##############################################################################

# File-scope defaults and flags. The code that consumes them is not part of
# this excerpt; NOTE(review): they presumably seed the per-object options of
# the same meaning (assertSourceAssumptions, assertHTML, keepAll,
# sourceEncoding, ...) -- confirm against the constructor.

# Do we assert that our assumptions about the source hold (1 = yes)?
my $DEF_SRC_ASS=1;
# Do we check first to see if the document really looks like HTML (1 = yes)?
my $DEF_ASSERT_HTML=1;
# Do we pass on even non-HTML documents (0 = no, such documents are dropped)?
my $DEF_KEEP_ALL=0;
# Do we replace symbolic character entities (e.g. &amp;) with the actual
# characters (1 = yes)?
my $DEF_CONVERT_CHAR_ENTS=1;
# Do we replace numerical character entities with the actual characters
# (0 = no)?
my $DEF_CONVERT_NUM_ENTS=0;
# Do we try to clean up extra whitespace (0 = no)?
my $DEF_CLEAN_WS=0;
# Encoding the source document is taken to be in when none is specified.
my $DEF_SRC_ENCODING='utf-8';

# Debug flag (0 = off). Its uses are outside this excerpt.
my $DEBUG=0;

###########################################################################
#
#     Symbolic character entity to Unicode (decimal) mapping
#
###########################################################################

my %Ent2Unicode=(
 quot =>'34',    #   "    HTML 2.0    quotation mark
 amp =>'38',    #   &    HTML 2.0    ampersand
 lt =>'60',    #   <    HTML 2.0    less-than sign
 gt =>'62',    #   >    HTML 2.0    greater-than sign
 nbsp =>'160',    #        HTML 3.2    no-break space
 iexcl =>'161',    #   ¡    HTML 3.2    inverted exclamation mark
 cent =>'162',    #   ¢    HTML 3.2    cent sign
 pound =>'163',    #   £    HTML 3.2    pound sign
 curren =>'164',    #   ¤    HTML 3.2    currency sign
 yen =>'165',    #   ¥    HTML 3.2    yen sign
 brvbar =>'166',    #   ¦    HTML 3.2    broken bar
 sect =>'167',    #   §    HTML 3.2    section sign
 uml =>'168',    #   ¨    HTML 3.2    diaeresis
 copy =>'169',    #   ©    HTML 3.2    copyright sign
 ordf =>'170',    #   ª    HTML 3.2    feminine ordinal indicator
 laquo =>'171',    #   «    HTML 3.2    left-pointing double angle quotation mark
 not =>'172',    #   ¬    HTML 3.2    not sign
 shy =>'173',    #   ­    HTML 3.2    soft hyphen
 reg =>'174',    #   ®    HTML 3.2    registered sign
 macr =>'175',    #   ¯    HTML 3.2    macron
 deg =>'176',    #   °    HTML 3.2    degree sign
 plusmn =>'177',    #   ±    HTML 3.2    plus-minus sign
 sup2 =>'178',    #   ²    HTML 3.2    superscript two
 sup3 =>'179',    #   ³    HTML 3.2    superscript three
 acute =>'180',    #   ´    HTML 3.2    acute accent
 micro =>'181',    #   µ    HTML 3.2    micro sign
 para =>'182',    #   ¶    HTML 3.2    pilcrow sign
 middot =>'183',    #   ·    HTML 3.2    middle dot
 cedil =>'184',    #   ¸    HTML 3.2    cedilla
 sup1 =>'185',    #   ¹    HTML 3.2    superscript one
 ordm =>'186',    #   º    HTML 3.2    masculine ordinal indicator
 raquo =>'187',    #   »    HTML 3.2    right-pointing double angle quotation mark
 frac14 =>'188',    #   ¼    HTML 3.2    vulgar fraction one quarter
 frac12 =>'189',    #   ½    HTML 3.2    vulgar fraction one half
 frac34 =>'190',    #   ¾    HTML 3.2    vulgar fraction three quarters
 iquest =>'191',    #   ¿    HTML 3.2    inverted question mark
 Agrave =>'192',    #   À    HTML 2.0    latin capital letter a with grave
 Aacute =>'193',    #   Á    HTML 2.0    latin capital letter a with acute
 Acirc =>'194',    #   Â    HTML 2.0    latin capital letter a with circumflex

lib/Alvis/HTML.pm  view on Meta::CPAN


    my %header=(title=>undef,
		baseURL=>undef);

    $self->_set_err_state($ERR_OK);  # clean the slate

    # Make it utf-8 if not already
    my $src_enc;
    if ($opts->{sourceEncoding})
    {
	$src_enc=$opts->{sourceEncoding};
    }
    elsif (!exists($opts->{sourceEncoding}) && $self->{sourceEncoding})
    {
	$src_enc=$self->{sourceEncoding};
    }
    if ($src_enc)
    {
	if ($src_enc!~/^\s*utf-?8\s*$/)
	{
	    $html=$self->{encodingWiz}->convert($html,
						$src_enc,
						'utf8');
	    if (!defined($html))
	    {
		$self->_set_err_state($ERR_UTF8_CONV,
				      $self->{encodingWiz}->errmsg());
		return (undef,\%header);  # signals "do not pass on"
	    }
	}
    }
    else # try guessing the encoding
    {
	$html=$self->{encodingWiz}->guess_and_convert($html,
						      'text',
						      'html',
						      'utf8');
	if (!defined($html))
	{
	    $self->_set_err_state($ERR_GUESS_ENC_UTF8_CONV,
				  $self->{encodingWiz}->errmsg());
	    return (undef,\%header);  # signals "do not pass on"
	}
    }

    # ex nihilo nihil 
    #
    if (!defined($html) || $html=~/^\s*$/sgo)
    {
	if ($self->{keepAll})
	{
	    return ("\n",\%header);
	}
	else
	{
	    $self->_set_err_state($ERR_EMPTY_DOC);
	    return (undef,\%header);  # signals "do not pass on"
	}  
    }

    # Check if this really looks like "HTML" 
    #
    if ($self->{assertHTML})
    {
	#
	# If we're lucky...
	#
	if ($html=~/<!DOCTYPE\s+(\S+)/isgo)
	{
	    my $type=$1;
	    if ($type!~/(?:html|wml)/igo)
	    {
		if ($self->{keepAll})
		{
		    return ("\n",\%header);
		}
		else
		{
		    $self->_set_err_state($ERR_UNK_DOCTYPE,"($type)");
		    return (undef,\%header);  # signals "do not pass on"
		}
	    }
	}
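	# In addition, apply a weaker check: at least one signature start
	# tag (<html or <body) must be present. NOTE: this test is not an
	# "else" branch -- it also runs when a DOCTYPE was found above, and
	# because both matches use /g in scalar context, the search then
	# resumes from the position just after that DOCTYPE match.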
	#
	if ($html!~/<(?:(?i)html|body)\W/sgo)
	{
	    if ($self->{keepAll})
	    {
		return ("\n",\%header);
	    }
	    else
	    {
		$self->_set_err_state($ERR_NO_SIGNATURE);
		return (undef,\%header);  # signals "do not pass on"
	    }
	} 
    }

    if ($self->{assertSourceAssumptions})
    {
	my %err;
	if (!$self->{encodingWiz}->is_utf8($html,\%err))
	{
	    $self->_set_err_state($ERR_SRC_NOT_IN_UTF8,
				  $self->{encodingWiz}->errmsg());
	    return (undef,\%header);  # signals "do not pass on"
	}
	# Remove '\0's just in case. Replace by a ' ' just in case they 
	# separated something meaningful in the original. 
	$html=~s/[\0]+/ /sgo;
    }

    # Remove comments
    #
    $html=~s/<\!\-\-.*?\-\->//sgo;

    # Remove some MS-specific and declaration cruft. This may lose a
    # (very small) amount of content, but it suffices for Alvis purposes.
    #



( run in 0.644 second using v1.01-cache-2.11-cpan-39bf76dae61 )