Alvis-Convert

 view release on metacpan or  search on metacpan

lib/Alvis/HTML.pm  view on Meta::CPAN

    $self->{assertSourceAssumptions}=$DEF_SRC_ASS;;
    $self->{convertCharEnts}=$DEF_CONVERT_CHAR_ENTS;
    $self->{convertNumEnts}=$DEF_CONVERT_NUM_ENTS;
    $self->{cleanWhitespace}=$DEF_CLEAN_WS;
    $self->{sourceEncoding}=$DEF_SRC_ENCODING;

    if (defined(@_))
    {
        my %args=@_;
        @$self{ keys %args }=values(%args);
    }
}

#############################################################################
#
#      Public methods
#
##############################################################################
 
#
# Returns (<contents as text>,<header hash ref>)
#
sub clean
{
    my $self=shift;
    my $html=shift;
    my $opts=shift;    # if a title/base URL is wished for as well
                       # returned in a header hash with keys
                       # title, baseURL

    my %header=(title=>undef,
		baseURL=>undef);

    $self->_set_err_state($ERR_OK);  # clean the slate

    # Make it utf-8 if not already
    my $src_enc;
    if ($opts->{sourceEncoding})
    {
	$src_enc=$opts->{sourceEncoding};
    }
    elsif (!exists($opts->{sourceEncoding}) && $self->{sourceEncoding})
    {
	$src_enc=$self->{sourceEncoding};
    }
    if ($src_enc)
    {
	if ($src_enc!~/^\s*utf-?8\s*$/)
	{
	    $html=$self->{encodingWiz}->convert($html,
						$src_enc,
						'utf8');
	    if (!defined($html))
	    {
		$self->_set_err_state($ERR_UTF8_CONV,
				      $self->{encodingWiz}->errmsg());
		return (undef,\%header);  # signals "do not pass on"
	    }
	}
    }
    else # try guessing the encoding
    {
	$html=$self->{encodingWiz}->guess_and_convert($html,
						      'text',
						      'html',
						      'utf8');
	if (!defined($html))
	{
	    $self->_set_err_state($ERR_GUESS_ENC_UTF8_CONV,
				  $self->{encodingWiz}->errmsg());
	    return (undef,\%header);  # signals "do not pass on"
	}
    }

    # ex nihilo nihil 
    #
    if (!defined($html) || $html=~/^\s*$/sgo)
    {
	if ($self->{keepAll})
	{
	    return ("\n",\%header);
	}
	else
	{
	    $self->_set_err_state($ERR_EMPTY_DOC);
	    return (undef,\%header);  # signals "do not pass on"
	}  
    }

    # Check if this really looks like "HTML" 
    #
    if ($self->{assertHTML})
    {
	#
	# If we're lucky...
	#
	if ($html=~/<!DOCTYPE\s+(\S+)/isgo)
	{
	    my $type=$1;
	    if ($type!~/(?:html|wml)/igo)
	    {
		if ($self->{keepAll})
		{
		    return ("\n",\%header);
		}
		else
		{
		    $self->_set_err_state($ERR_UNK_DOCTYPE,"($type)");
		    return (undef,\%header);  # signals "do not pass on"
		}
	    }
	}
	# Otherwise, use a weaker way of checking... a single 
	# signature start tag will do. 
	#
	if ($html!~/<(?:(?i)html|body)\W/sgo)
	{
	    if ($self->{keepAll})
	    {
		return ("\n",\%header);
	    }
	    else
	    {

lib/Alvis/HTML.pm  view on Meta::CPAN

			      baseURL=>1});
 if (!defined($txt))
 {
     die "Cleaning the HTML failed: " . $C->errmsg();
 }

 #
 # Remove all HTML tags from the document. Assert that the document actually
 # is HTML. HTML is in 'iso-8859-1', (output is always in UTF-8).
 # Assert that the source assumptions (UTF-8, no '\0') hold before
 # trying to convert.
 #
 $C=Alvis::HTML->new(alvisKeep=>1,
                     alvisRemove=>1,
                     obsolete=>1,
                     proprietary=>1,
		     xhtml=>1,
	             wml=>1,
		     keepAll=>1,
		     assertHTML=>0,
		     convertCharEnts=>1,
		     convertNumEnts=>1,
                     sourceEncoding=>'iso-8859-1',
                     assertSourceAssumptions=>1
		    );


=head1 DESCRIPTION

Assumes the input is in UTF-8 and does NOT contain '\0's (or rather that 
they carry no meaning and are removable). 

=head1 METHODS

=head2 new()

Options available:
    assertHTML         if 1, try to check if the source really
                       is in any of the recognized dialects.
    keepAll            if 1, pass all documents on regardless of
                       their HTMLness. Non-HTML goes forward as '\n'.

 Options specifying the HTML subsets whose tags are removed (set to a defined value to enable):

    alvisKeep          W3's HTML 4.01 tags Alvis::Convert
                       is interested in
    alvisRemove        4.01 tags Alvis::Convert is NOT interested in
    obsolete           HTML <4.01
    proprietary        Net-escape,Exploder,...
    xhtml              XHTML 1.1
    wml                WML

     Note: alvisKeep + alvisRemove == remove all HTML 4.01 tags

    convertCharEnts    convert symbolic character entities to UTF-8 characters.
    convertNumEnts     convert numerical character entities to UTF-8 
                       characters.  

    sourceEncoding     encoding of the source HTML text (default: 'utf-8')
                       If not 'utf-8', HTML is converted to UTF-8.
                       If undefined, the encoding is guessed first.

    assertSourceAssumptions
 
                       make sure that before any operations the source is
                       in UTF-8 and contains no null bytes.

=head2 clean(html,options)

Remove unwanted tags from $html (text). $options is
a mechanism for returning the title and base URL of the document and
setting call-specific parameters.

If extraction of the title and base URL is desired, set the fields 'title'
and 'baseURL' to a defined value, e.g.

  my ($txt,$header)=$C->clean($html,
                              {title=>1,
         		       baseURL=>1});

In $options you can also set the source and target encodings
(sourceEncoding,targetEncoding).

   my ($txt,$header)=$C->clean($html,
                              {title=>1,
         		       baseURL=>1,
                               sourceEncoding=>'iso-8859-1'});

will convert from 'iso-8859-1' to the default output encoding (UTF-8).

This will guess the encoding first:

   my ($txt,$header)=$C->clean($html,
                              {title=>1,
         		       baseURL=>1,
                               sourceEncoding=>undef});

=head2 errmsg()

Returns a stack of error messages, if any. Empty string otherwise.

=head1 SEE ALSO

Alvis::Canonical

=head1 AUTHOR

Kimmo Valtonen, E<lt>kimmo.valtonen@hiit.fiE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2006 by Kimmo Valtonen

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.4 or,
at your option, any later version of Perl 5 you may have available.


=cut




( run in 1.678 second using v1.01-cache-2.11-cpan-39bf76dae61 )