Daizu

 view release on metacpan or  search on metacpan

lib/Daizu/HTML.pm  view on Meta::CPAN

package Daizu::HTML;
use warnings;
use strict;

use base 'Exporter';
our @EXPORT_OK = qw(
    dom_body_to_html4 dom_node_to_html4 dom_body_to_text
    dom_filtered_for_feeds
    absolutify_links
    html_escape_text html_escape_attr
);

use XML::LibXML;
use HTML::Tagset;
use URI;
use Encode qw( encode );
use Carp qw( croak );
use Carp::Assert qw( assert DEBUG );
use Daizu::Util qw( trim );

=head1 NAME

Daizu::HTML - functions for handling HTML and XHTML content

=head1 FUNCTIONS

The following functions are available for export from this module.
None of them are exported by default.

=over

=item dom_body_to_html4($doc, [$start_node], [$end_node])

Given an L<XML::LibXML::Document> object for an XHTML document fragment,
whose root element should be C<body>, returns a string representation of
the content in S<HTML 4> format.

C<$start_node> and C<$end_node> are both independently optional.
If either is present then only part of the document will be presented
in the HTML output.  Both must be either C<undef> or a node from the
root (C<body>) element of the document.  C<$start_node> should be the first
node to be shown in the output, or C<undef> to start from the beginning.
C<$end_node> should be the node I<after> the last node to be output,
or C<undef> to end at the end of the document.

=cut

sub dom_body_to_html4
{
    my ($doc, $start_node, $end_node) = @_;
    my $html = '';

    my $right_part = !defined $start_node;
    for my $child ($doc->documentElement->childNodes) {
        $right_part = 1
            if defined $start_node && $child->isSameNode($start_node);
        $right_part = 0
            if defined $end_node && $child->isSameNode($end_node);
        $html .= dom_node_to_html4($child)
            if $right_part;
    }

    return $html;
}

=item dom_node_to_html4($node)

Used by the
L<dom_body_to_html4()|/dom_body_to_html4($doc, [$start_node], [$end_node])>
function above
to process individual nodes.  The argument should be an
L<XML::LibXML::Node> object of some kind.  Returns a string containing
S<HTML 4> code, which for example will have text properly escaped.

=cut

sub dom_node_to_html4
{
    my ($node) = @_;
    my $type = $node->nodeType;

    return encode('UTF-8', html_escape_text($node->data), Encode::FB_CROAK)
        if $type == XML::LibXML::XML_TEXT_NODE ||
           $type == XML::LibXML::XML_CDATA_SECTION_NODE;

    if ($type == XML::LibXML::XML_ELEMENT_NODE) {
        my $ns = $node->namespaceURI;
        return '' if defined $ns && $ns eq $Daizu::HTML_EXTENSION_NS;

        my $elem_name = lc $node->localname;

        my $html = "<$elem_name";
        for my $attr ($node->attributes) {
            next unless $attr->nodeType == XML::LibXML::XML_ATTRIBUTE_NODE;
            my $attr_name = lc $attr->localname;
            $html .= " $attr_name";
            my $boolattr = $HTML::Tagset::boolean_attr{$elem_name};
            $html .= '="' .
                     encode('UTF-8', html_escape_attr($attr->value),
                            Encode::FB_CROAK) .
                     '"'
                unless $boolattr &&
                       ((!ref $boolattr && $boolattr eq $attr_name) ||
                        (ref $boolattr && $boolattr->{$attr_name}));
        }
        $html .= '>';

        if (!$HTML::Tagset::emptyElement{$elem_name}) {
            for my $child ($node->childNodes) {
                $html .= dom_node_to_html4($child);
            }
            $html .= "</$elem_name>";
        }
        elsif ($node->hasChildNodes) {
            warn "element '$elem_name' at line " . $node->line_number .
                 " shouldn't have content";
        }

        return $html;
    }

    return '<!--' .
           encode('UTF-8', html_escape_text($node->data), Encode::FB_CROAK) .
           '-->'
        if $type == XML::LibXML::XML_COMMENT_NODE;

    return ''
        if $type == XML::LibXML::XML_XINCLUDE_START ||
           $type == XML::LibXML::XML_XINCLUDE_END;

    die "node type $type in XML::LibXML DOM not expected";

#   These are the node types I don't currently bother with:
#       XML::LibXML::XML_ATTRIBUTE_NODE = 2
#       XML::LibXML::XML_ENTITY_REF_NODE = 5
#       XML::LibXML::XML_ENTITY_NODE = 6
#       XML::LibXML::XML_PI_NODE = 7
#       XML::LibXML::XML_DOCUMENT_NODE = 9
#       XML::LibXML::XML_DOCUMENT_TYPE_NODE = 10
#       XML::LibXML::XML_DOCUMENT_FRAG_NODE = 11
#       XML::LibXML::XML_NOTATION_NODE = 12
#       XML::LibXML::XML_HTML_DOCUMENT_NODE = 13
#       XML::LibXML::XML_DTD_NODE = 14
#       XML::LibXML::XML_ELEMENT_DECL = 15
#       XML::LibXML::XML_ATTRIBUTE_DECL = 16
#       XML::LibXML::XML_ENTITY_DECL = 17
#       XML::LibXML::XML_NAMESPACE_DECL = 18
#       XML::LibXML::XML_DOCB_DOCUMENT_NODE = 21
}

=item dom_body_to_text($doc)

Given an XHTML body (as an L<XML::LibXML::Document> object in the usually
format) return a plain text version of the content, with some markup
translatted into text formatting in a limited way to make it reasonably
readable.

=cut

sub dom_body_to_text
{
    my ($doc) = @_;
    my $text = '';
    my $accum = '';

    # This 'object' is used to track the progress of the formatting and
    # accumulate the output text.
    my $fmt = {
        # State:
        txt => '',
        linelen => 0,
        indent => 0,
        indent_stack => [],
        list_type => 'ul',
        list_pos => 1,
        list_stack => [],
        block_started => 0,
        word_gap => 0,
        text_level => undef,    # undef=normal, otherwise 'sup' or 'sub'

        # Configuration:
        max_linelen => 72,
        min_breakable_line => 10,
        block_indent => '    ',
        ul_indent => ' * ',
        ol_indent => ' %d. ',
    };

    _dom_node_children_to_text($doc->documentElement, $fmt);

    return _fmt_finish($fmt);
}

our %SUPERSCRIPT_CHARS = (
    0x0028 => 0x207D,   # SUPERSCRIPT LEFT PARENTHESIS
    0x0029 => 0x207E,   # SUPERSCRIPT RIGHT PARENTHESIS
    0x002B => 0x207A,   # SUPERSCRIPT PLUS SIGN
    0x002D => 0x207B,   # close enough for superscript HYPHEN-MINUS
    0x0030 => 0x2070,   # SUPERSCRIPT ZERO
    0x0031 => 0x00B9,   # SUPERSCRIPT ONE
    0x0032 => 0x00B2,   # SUPERSCRIPT TWO
    0x0033 => 0x00B3,   # SUPERSCRIPT THREE
    0x0034 => 0x2074,   # SUPERSCRIPT FOUR
    0x0035 => 0x2075,   # SUPERSCRIPT FIVE
    0x0036 => 0x2076,   # SUPERSCRIPT SIX
    0x0037 => 0x2077,   # SUPERSCRIPT SEVEN
    0x0038 => 0x2078,   # SUPERSCRIPT EIGHT
    0x0039 => 0x2079,   # SUPERSCRIPT NINE
    0x003D => 0x207C,   # SUPERSCRIPT EQUALS SIGN
    0x0069 => 0x2071,   # SUPERSCRIPT LATIN SMALL LETTER I
    0x006E => 0x207F,   # SUPERSCRIPT LATIN SMALL LETTER N
    0x2212 => 0x207B,   # SUPERSCRIPT MINUS
);

lib/Daizu/HTML.pm  view on Meta::CPAN

            _dom_node_children_to_text($node, $fmt);
            _fmt_end_block($fmt);
        }
        elsif ($name eq 'ul' || $name eq 'ol') {
            push @{$fmt->{list_type_stack}}, [ $fmt->{list_type}, $fmt->{list_pos} ];
            $fmt->{list_type} = $name;
            $fmt->{list_pos} = 1;
            _dom_node_children_to_text($node, $fmt);
            ($fmt->{list_type}, $fmt->{list_pos}) = @{pop @{$fmt->{list_type_stack}}};
        }
        elsif ($name eq 'pre') {
            _fmt_new_block($fmt, $fmt->{block_indent});
            my $indent = ' ' x $fmt->{indent};
            my $code = trim($node->textContent);
            $code =~ s/(?:\x0D\x0A|\x0A|\x0D)/\n$indent/g;
            $fmt->{txt} .= $code;
            $code =~ s/^.*\n//s;
            if ($code =~ /\S/) {
                $fmt->{linelen} = $fmt->{indent} + length $code;
                $fmt->{block_started} = 1;
            }
            _fmt_end_block($fmt);
        }
        elsif ($name eq 'img') {
            my $alt = trim($node->getAttribute('alt'));
            $alt = '' unless defined $alt;
            _fmt_add_text($fmt, $alt);
        }
        elsif ($name eq 'br') {
            _fmt_new_line($fmt);
        }
        elsif ($name eq 'q') {
            _fmt_add_text($fmt, chr 8220);
            _dom_node_children_to_text($node, $fmt);
            _fmt_add_text($fmt, chr 8221);
        }
        elsif ($name eq 'sup' || $name eq 'sub') {
            my $old_text_level = $fmt->{text_level};
            $fmt->{text_level} = $name;
            _dom_node_children_to_text($node, $fmt);
            $fmt->{text_level} = $old_text_level;
        }
        else {
            # Unknown element.  Ignore the markup and just process the text.
            _dom_node_children_to_text($node, $fmt);
        }
    }
}

=item dom_filtered_for_feeds($doc)

Return a new version of the article content in C<$doc>, with bits of
markup which aren't relevant or might be unwelcome in feed content,
such as C<script> elements and C<style> attributes.  Also remove C<span>
elements because they're not needed when there's no custom styling,
and Bloglines currently turns them into invalid HTML.  Also remove
C<class> attributes in case they cause some unexpected styling to be
applied.

In addition, any elements in the Daizu HTML extension namespace are
removed.  Elements in other non-XHTML namespaces will cause this function
to fail.  They shouldn't be there by the time the content is being output
anyway.

Both C<$doc> and the return value are L<XML::LibXML::Document> objects
of the kind returned by
L<the article_doc() method in Daizu::File|Daizu::File/$file-E<gt>article_doc>.
The original DOM in C<$doc> is not altered.  The return value is a
completely independent copy.

=cut

sub dom_filtered_for_feeds
{
    my ($in_doc) = @_;

    my $out_doc = XML::LibXML::Document->new('1.0', 'UTF-8');
    my @out_child = _node_filtered_for_feeds($in_doc->documentElement);
    assert(@out_child == 1) if DEBUG;
    $out_doc->setDocumentElement(@out_child);

    return $out_doc;
}

sub _node_filtered_for_feeds
{
    my ($node) = @_;
    my $type = $node->nodeType;

    return $node->cloneNode(0)
        if $type == XML::LibXML::XML_TEXT_NODE ||
           $type == XML::LibXML::XML_CDATA_SECTION_NODE;

    if ($type == XML::LibXML::XML_ELEMENT_NODE) {
        my $ns = $node->namespaceURI;
        return if defined $ns && $ns eq $Daizu::HTML_EXTENSION_NS;
        croak "unrecognized namespace '$ns' used in article"
            if defined $ns && $ns ne 'http://www.w3.org/1999/xhtml';

        # Ignore certain elements which would be rude to put in a feed.
        my $elem_name = $node->localname;
        return if $elem_name =~ /^(script|style)$/i;

        if ($elem_name eq 'span' ||
            ($elem_name eq 'a' && !$node->hasAttribute('href')))
        {
            # Strip the element out but retain its content.
            return map { _node_filtered_for_feeds($_) } $node->childNodes;
        }
        else {
            my $out_elem = XML::LibXML::Element->new($elem_name);

            for my $attr ($node->attributes) {
                next unless $attr->nodeType == XML::LibXML::XML_ATTRIBUTE_NODE;
                my $attr_name = $attr->localname;
                next if $attr_name =~ /^(class|style|on.*|id|name)$/i;
                $out_elem->setAttribute($attr_name => $attr->value);
            }

            for my $child ($node->childNodes) {
                my @out = _node_filtered_for_feeds($child);
                $out_elem->appendChild($_)
                    for @out;
            }

            return $out_elem;
        }
    }

    return
        if $type == XML::LibXML::XML_COMMENT_NODE   ||
           $type == XML::LibXML::XML_XINCLUDE_START ||
           $type == XML::LibXML::XML_XINCLUDE_END;

    die "node type $type in XML::LibXML DOM not expected";

#   These are the node types I don't currently bother with:
#       XML::LibXML::XML_ATTRIBUTE_NODE = 2
#       XML::LibXML::XML_ENTITY_REF_NODE = 5
#       XML::LibXML::XML_ENTITY_NODE = 6
#       XML::LibXML::XML_PI_NODE = 7
#       XML::LibXML::XML_DOCUMENT_NODE = 9
#       XML::LibXML::XML_DOCUMENT_TYPE_NODE = 10
#       XML::LibXML::XML_DOCUMENT_FRAG_NODE = 11
#       XML::LibXML::XML_NOTATION_NODE = 12
#       XML::LibXML::XML_HTML_DOCUMENT_NODE = 13
#       XML::LibXML::XML_DTD_NODE = 14
#       XML::LibXML::XML_ELEMENT_DECL = 15
#       XML::LibXML::XML_ATTRIBUTE_DECL = 16
#       XML::LibXML::XML_ENTITY_DECL = 17
#       XML::LibXML::XML_NAMESPACE_DECL = 18
#       XML::LibXML::XML_DOCB_DOCUMENT_NODE = 21
}

=item absolutify_links($doc, $base_url)

Given an XHTML document (as an L<XML::LibXML::Document> object), find
all the attributes in the markup which are relative URLs and turn them
into absolute URLs relative to C<$base_url>.  This can be used to prepare
content from an article to be published in a different place with a different
URL, such as in an RSS feed or on an index page, while ensuring that any
links or embedded files continue to work.

The document's elements must be in the XHTML namespace, or they will be
ignored.

TODO - some of this could be refactored with the link replacing stuff
in Daizu::Preview to be more thorough.  For now though it just works on
'a href' and 'img src', since that will catch almost all cases.

=cut

sub absolutify_links
{
    my ($doc, $base_url) = @_;
    $base_url = URI->new($base_url);

    my %FIND_ATTRS = (
        a => 'href',
        img => 'src',
    );

    while (my ($elem_name, $attr_name) = each %FIND_ATTRS) {
        for ($doc->findnodes("//*[(namespace-uri() = 'http://www.w3.org/1999/xhtml' or namespace-uri() = '') and local-name() = '$elem_name']/@*[local-name() = '$attr_name']")) {
            my $url = URI->new($_->getValue);
            $_->setValue($url->abs($base_url));
        }
    }
}

=item html_escape_text($text)

Escape C<$text> in a way which makes it safe to include in the content
of HTML or XML elements.  The characters C<E<lt>>, C<E<gt>>, and C<&> are
escaped.  Returns the new value.

The output may not be suitable for including as the value of an
HTML or XML attribute.

The return value is always formatted as bytes encoded in UTF-8.

=cut

sub html_escape_text
{
    my ($s) = @_;
    $s =~ s/&/&amp;/g;
    $s =~ s/</&lt;/g;
    $s =~ s/>/&gt;/g;
    return $s;
}

=item html_escape_attr($text)

Escape C<$text> in a way which makes it safe to include in the content of
HTML or XML elements, or the values of HTML or XML attributes in double
quotes.  The characters C<E<lt>>, C<E<gt>>, C<&>, and C<"> are escaped.
Returns the new value.

The return value is always formatted as bytes encoded in UTF-8.

=cut

sub html_escape_attr



( run in 1.194 second using v1.01-cache-2.11-cpan-d06a3f9ecfd )