Alvis-Convert

 view release on metacpan or  search on metacpan

lib/Alvis/Canonical.pm  view on Meta::CPAN

    my $html=shift;
    my $opts=shift;    # if a title/base URL is wished for as well, they are
                    # returned in a header hash with the same keys
                    #  
                    #     title
                    #     baseURL
                    #     sourceEncoding
    
    $self->_set_err_state($ERR_OK);  # clean the slate

    my ($title,$baseURL,$src_enc);
    $title=$opts->{title} if exists($opts->{title} );
    $baseURL=$opts->{title} if exists($opts->{baseURL} );
    $src_enc=$self->{sourceEncoding};
    $src_enc=$opts->{sourceEncoding} if exists($opts->{sourceEncoding} );

    my ($contents,$header)=
	$self->{htmlConverter}->clean($html,
				      {title=>$title,
				       baseURL=>$baseURL,
				       sourceEncoding=>$src_enc});
    if (!defined($contents))
    {
	$self->_set_err_state($ERR_HTML_CONV,"In HTML converter: " . 
			      $self->{htmlConverter}->errmsg());
	return (undef,$header)
    }

    if ($DEBUG)
    {
	open(F,">candoc.cleaned");
	print F $contents;
	close(F);
    }


    # To safeguard the element contents with regard to XML
    $contents=$self->_make_txt_XML_safe($contents);

    # Here goes
    my $can_doc=$self->_contents2canDoc($contents,$header,$src_enc);
    if (!defined($can_doc))
    {
	$self->_set_err_state($ERR_CONT2CAN_DOC);
	return (undef,$header);
    }

    return ($can_doc,$header);
}

#########################################################################
#
#      Private methods
#
######################################################################

# Turns the cleaned, XML-safe HTML contents into the final document by
# running the conversion stages in a fixed order: sections, lists, links,
# _to_alvis() and finally _pretty_print().
#
# Arguments:
#     $contents          contains relevant HTML markup
#     $header            will be updated with information like links
#                        (handed on to _handle_links())
#     $source_encoding   handed on to _handle_sections()
#
# Returns the result of the final _pretty_print() call.
#
# When $DEBUG is true, a snapshot of the document after each stage is
# written to a candoc.* file (relative path, so it lands in the current
# working directory). The snapshots are best-effort: a file that cannot
# be written only produces a warning and never aborts the conversion.
sub _contents2canDoc
{
    my $self=shift;
    my $contents=shift; # contains relevant HTML markup
    my $header=shift;   # will be updated with information like links
    my $source_encoding=shift;

    # Writes one debug snapshot. A lexical filehandle and 3-argument open
    # are used so that no package-global handle is clobbered and the file
    # name can never be interpreted as an open mode.
    my $dump=sub
    {
        my ($file,$text)=@_;

        my $fh;
        if (!open($fh,'>',$file))
        {
            warn "Cannot write debug file \"$file\": $!\n";
            return;
        }
        print {$fh} $text;
        close($fh) or warn "Cannot close debug file \"$file\": $!\n";
        return;
    };

    # Snapshot of an intermediate document: it is rendered through
    # _to_alvis() and _pretty_print() first. The calls are kept in scalar
    # context, one per statement, exactly as the stages themselves are
    # called below.
    my $dump_rendered=sub
    {
        my ($file,$doc)=@_;

        my $rendered=$self->_to_alvis($doc);
        $rendered=$self->_pretty_print($rendered);
        $dump->($file,$rendered);
        return;
    };

    $dump->("candoc.cleanNXMLSafe",$contents) if $DEBUG;

    # Convert in order of importance to the structure
    my $can_doc=$self->_handle_sections($contents,$source_encoding);
    $dump_rendered->("candoc.aftersections",$can_doc) if $DEBUG;

    $can_doc=$self->_handle_lists($can_doc);
    $dump_rendered->("candoc.afterlists",$can_doc) if $DEBUG;

    $can_doc=$self->_handle_links($can_doc,$header);
    $dump_rendered->("candoc.afterlinks",$can_doc) if $DEBUG;

    # OK, time to put some make-up on and go out
    $can_doc=$self->_to_alvis($can_doc);
    if ($DEBUG)
    {
        # Already in Alvis form here, so only the pretty-printing is needed
        my $pretty=$self->_pretty_print($can_doc);
        $dump->("candoc.aftertoalvis",$pretty);
    }

    $can_doc=$self->_pretty_print($can_doc);
    $dump->("candoc.afterprettyprint",$can_doc) if $DEBUG;

    return $can_doc;
}



( run in 1.348 second using v1.01-cache-2.11-cpan-140bd7fdf52 )