utf8 results from the CPAN

Alvis-Convert

    return $self;
}

sub _init
{
    my $self=shift;

    $self->{assertHTML}=$DEF_ASSERT_HTML;
    $self->{keepAll}=$DEF_KEEP_ALL;
    $self->{assertSourceAssumptions}=$DEF_SRC_ASS;;
    $self->{convertCharEnts}=$DEF_CONVERT_CHAR_ENTS;
    $self->{convertNumEnts}=$DEF_CONVERT_NUM_ENTS;
    $self->{cleanWhitespace}=$DEF_CLEAN_WS;
    $self->{sourceEncoding}=$DEF_SRC_ENCODING;

    if (defined(@_))
    {
        my %args=@_;
        @$self{ keys %args }=values(%args);
    }
}

#############################################################################
#
#      Public methods
#
##############################################################################
 
#
# Returns (<contents as text>,<header hash ref>)
#
sub clean
{
    my $self=shift;
    my $html=shift;
    my $opts=shift;    # if a title/base URL is wished for as well
                       # returned in a header hash with keys
                       # title, baseURL

    my %header=(title=>undef,
		baseURL=>undef);

    $self->_set_err_state($ERR_OK);  # clean the slate

    # Make it utf-8 if not already
    my $src_enc;
    if ($opts->{sourceEncoding})
    {
	$src_enc=$opts->{sourceEncoding};
    }
    elsif (!exists($opts->{sourceEncoding}) && $self->{sourceEncoding})
    {
	$src_enc=$self->{sourceEncoding};
    }
    if ($src_enc)
    {
	if ($src_enc!~/^\s*utf-?8\s*$/)
	{
	    $html=$self->{encodingWiz}->convert($html,
						$src_enc,
						'utf8');
	    if (!defined($html))
	    {
		$self->_set_err_state($ERR_UTF8_CONV,
				      $self->{encodingWiz}->errmsg());
		return (undef,\%header);  # signals "do not pass on"
	    }
	}
    }
    else # try guessing the encoding
    {
	$html=$self->{encodingWiz}->guess_and_convert($html,
						      'text',
						      'html',
						      'utf8');
	if (!defined($html))
	{
	    $self->_set_err_state($ERR_GUESS_ENC_UTF8_CONV,
				  $self->{encodingWiz}->errmsg());
	    return (undef,\%header);  # signals "do not pass on"
	}
    }

    # ex nihilo nihil 
    #
    if (!defined($html) || $html=~/^\s*$/sgo)
    {
	if ($self->{keepAll})
	{
	    return ("\n",\%header);
	}
	else
	{
	    $self->_set_err_state($ERR_EMPTY_DOC);
	    return (undef,\%header);  # signals "do not pass on"
	}  
    }

    # Check if this really looks like "HTML" 
    #
    if ($self->{assertHTML})
    {
	#
	# If we're lucky...
	#
	if ($html=~/<!DOCTYPE\s+(\S+)/isgo)
	{
	    my $type=$1;
	    if ($type!~/(?:html|wml)/igo)
	    {
		if ($self->{keepAll})
		{
		    return ("\n",\%header);
		}
		else
		{
		    $self->_set_err_state($ERR_UNK_DOCTYPE,"($type)");
		    return (undef,\%header);  # signals "do not pass on"
		}
	    }
	}
	# Otherwise, use a weaker way of checking... a single 
	# signature start tag will do. 
	#
	if ($html!~/<(?:(?i)html|body)\W/sgo)
	{
	    if ($self->{keepAll})
	    {
		return ("\n",\%header);
	    }
	    else
	    {
		$self->_set_err_state($ERR_NO_SIGNATURE);
		return (undef,\%header);  # signals "do not pass on"
	    }
	} 
    }

    if ($self->{assertSourceAssumptions})
    {
	my %err;
	if (!$self->{encodingWiz}->is_utf8($html,\%err))
	{
	    $self->_set_err_state($ERR_SRC_NOT_IN_UTF8,
				  $self->{encodingWiz}->errmsg());
	    return (undef,\%header);  # signals "do not pass on"
	}
	# Remove '\0's just in case. Replace by a ' ' just in case they 
	# separated something meaningful in the original. 
	$html=~s/[\0]+/ /sgo;
    }

    # Remove comments
    #
    $html=~s/<\!\-\-.*?\-\->//sgo;

    # Remove some MS & declaration crap  Loses some (very little) maybe, 
    # but suffices for Alvis purposes.
    #
    $html=~s/<\!.*?>//sgo;

    # Extract the title if any, if desired
    #
    if (defined($opts->{title}))
    {
	my $title_cand;
	if ($html=~/<title\W(.*?)<\/title>/isgo)
	{
	    #
	    # Extract it
	    #
	    $title_cand=$1;
	    $title_cand=~s/=\s*([\"\'])(.*?)\1/&_neutralize_trouble($1,$2)/sgoe;
	    $title_cand=~s/^.*?>//sgo;
	     
	    #
	    # Clean it
	    #
	    my $c;
	    $title_cand=~s/(?:&\#(\d+);?)/$1<256 && $1>0 ? chr($1) : ""/ego;
	    $title_cand=~s/(?:&\#[xX]([0-9a-fA-F]+);?)/$c=hex($1); $c<256 && $c>0 ? chr($c) : ""/ego;
	    $title_cand=~s/(?:&(\w+);?)/$self->_char_ent2char($1)/ego;
	    
	    $title_cand=~s/\s+/ /sgo;
	    $title_cand=~s/^\s+//sgo;
	    $title_cand=~s/\s+$//sgo;
	    
	    $title_cand=~s/[^A-Za-zÆÁÂÀÅÃÄÇÐÉÊÈËÍÎÌÏÑÓÔÒØÕÖÞÚÛÙÜÝáâæàåãäçéêèðëíîìïñóôòøõößþúûùüýÿ¦¨´¸¼½¾ ,\.\-:\?]//isgo;
	}	    

	
	$header{title}=$title_cand;
    }

    # Extract the base URL if any, if desired
    #
    if (defined($opts->{baseURL}))
    {
	my $base_cand;
	if ($html=~/<base\W(.*?)>/isgo)
	{
	    #

lib/Alvis/HTML.pm view on Meta::CPAN

	# Fix attributes of interest
	$html=~s/(<(?:frame|iframe|img)\W[^>]*?src\s*=\s*)([\"\'])(\S*?)(\s.*?)?>/$self->_fix_attr($1,$2,$3,$4)/isgoe;

	# Sometimes "HTML" contains Alvis tags...double safeguard them
	$html=~s/<(\/?(?:(?i)section|list|item|ulink).*?)>/\&lt;$1\&gt;/sgo;
    }
	
    if ($DEBUG)
    {
	warn $html;
    }

    # If wished for, convert character entities 
    if ($self->{convertCharEnts})
    {
	$html=~s/(?:&(\w+);)/$self->_char_ent2char($1)/ego;
    }

    # If wished for, convert numerical character entities 
    if ($self->{convertNumEnts})
    {
	#
	# Numerical entities depend on the presumed character set
        # of the source HTML. You had better be sure it is UTF-8 or
        # should we check here?
	#
	$html=~s/(?:&\#(\d+);?)/$self->_num_ent2char($1)/ego;
	$html=~s/(?:&\#[xX]([0-9a-fA-F]+);?)/$self->_hex_ent2char($1)/ego;
    }

    if ($self->{cleanWhitespace})
    {
	# Might look overcomplicated but is 3-4x faster than the
	# first, obvious versions and does not have artificial limits on the
	# number of consecutive non-\n ws compressed.
	$html=~s/\n/\0/go;
	$html=~s/\s+/ /go;
	$html=~s/[ ](?=\0)//go;
	$html=~s/(?<=\0)[ ]//go;
	$html=~s/^\0+//sgo;
	$html=~s/\0+$//sgo;
	$html=~s/\0{3,}/\n\n/go;
	$html=~s/\0/\n/go;
    }

    return ($html,\%header);
}

###########################################################################
#
# Private methods
#
###########################################################################

sub _num_ent2char
{
    my $self=shift;
    my $num=shift;

    # check for invalid codes 
    if (!$self->{encodingWiz}->code_is_utf8($num))
    {
	# must be an error, don't try to fix typos atm
	return "&#$num;";
    }

    my $str=pack("U",$num);

    return $str;
}

sub _hex_ent2char
{
    my $self=shift;
    my $num=shift;

    $num=hex($num);
    return $self->_num_ent2char($num);
}

sub _char_ent2char
{
    my $self=shift;
    my $name=shift;

    if (defined($Ent2Unicode{$name}))
    {
	return $Ent2Unicode{$name};
    }
    else
    {
	return "&${name};";
    }
}

#
# Fix a relevant broken attribute value so it ends with the same
# quote char it starts with
#
sub _fix_attr
{
    my $self=shift;
    my $prefix=shift;
    my $quote=shift;
    my $attr_value=shift;
    my $suffix=shift;
    
    my $txt=$prefix . $quote . $attr_value;
    if ($attr_value!~/$quote/sgo)
    {
	# Add the ending quote
	$txt.=$quote;
    }

    if (defined($suffix))
    {
	# the attr value breaks at a space, where the closing > should be
	$txt.=$suffix;
    }

    $txt.='>';

( run in 1.039 second using v1.01-cache-2.11-cpan-e1769b4cff6 )