FunctionalPerl

 view release on metacpan or  search on metacpan

htmlgen/FunctionalPerl/Htmlgen/Htmlparse.pm  view on Meta::CPAN

# or the terms of the Artistic License version 2 or the terms of the
# MIT License (Expat version). See the file COPYING.md that came
# bundled with this file.
#

=head1 NAME

FunctionalPerl::Htmlgen::Htmlparse

=head1 SYNOPSIS

    use FunctionalPerl::Htmlgen::Htmlparse qw(htmlparse);
    my $b = htmlparse '<p>hi</p> <p>there!', 'body';
    is ref($b), 'PXML::_::XHTML';
    is $b->string, '<body><p>hi</p><p>there!</p></body>';

=head1 DESCRIPTION


=head1 NOTE

This is alpha software! Read the status section in the package README
or on the L<website|http://functional-perl.org/>.

=cut

package FunctionalPerl::Htmlgen::Htmlparse;
use strict;
use warnings;
use warnings FATAL => 'uninitialized';
use experimental "signatures";
use Sub::Call::Tail;
use Exporter "import";

our @EXPORT      = qw();
our @EXPORT_OK   = qw(htmlparse);
our %EXPORT_TAGS = (all => [@EXPORT, @EXPORT_OK]);

use FP::Docstring;
use HTML::TreeBuilder;

#use PXML::Element;
use PXML::XHTML;
use Chj::TEST;

sub htmlparse_raw ($htmlstr, $whichtag) {
    my $t = HTML::TreeBuilder->new;
    $t->ignore_unknown(0);    # allow <with_toc> elements
    $t->parse_content($htmlstr);
    my $e = $t->elementify;

    # (^ actually mutates $t into the HTML::Element object already, ugh)
    $e->find_by_tag_name($whichtag)
}

my $attsubname_re = qr/\w[\w-]*/;    # XX OK?
my $attname_re
    = qr/$attsubname_re(?::$attsubname_re)?/;    # with namespace, optionally

sub htmlmap($e) {
    __ '(HTML::Element) -> PXML::_::XHTML '
        . '-- convert output from HTML::TreeBuilder to PXML::XHTML (PXML::Element)';
    my $name = lc($e->tag);
    my $atts = {};
    for ($e->all_external_attr_names) {
        next if $_ eq "/";

        # HACK: accept namespaces in attribute names
        die "invalid attribute name string '$_'" unless /^$attname_re\z/s;
        $$atts{ lc $_ } = $e->attr($_);
    }

    # XX unsafe, if we don't check that a corresponding constructor
    # exists! Could fall back to just PXML::Element (which
    # PXML::_::XHTML is):
    PXML::_::XHTML->new(
        $name, $atts,
        [
            map {
                if (ref $_) {

                    # another HTML::Element
                    no warnings "recursion";  # XX should rather sanitize input?
                    htmlmap($_)
                } else {

                    # a string
                    $_
                }
            } @{ $e->content || [] }
        ]
    );
}

sub htmlparse ($str, $whichtag) {
    __ '($str,$whichtag) -> PXML::Element '
        . '-- parse HTML string to PXML; $whichtag is passed to'
        . ' find_by_tag_name from HTML::TreeBuilder';
    htmlmap(htmlparse_raw($str, $whichtag))
}

# TEST{ htmlparse ('<with_toc><p>abc</p><p>foo</p></with_toc>', "body")
#       ->string }
#   '<body><with_toc><p>abc</p><p>foo</p></with_toc></body>';
# HTML::TreeBuilder VERSION 5.02 drops with_toc here.

TEST {
    htmlparse('x<with_toc><p>abc</p><p>foo</p></with_toc>', "body")->string
}
'<body>x<with_toc><p>abc</p><p>foo</p></with_toc></body>';

# interestingly here it doesn't.

# But perhaps it's best to do like:
TEST {
    htmlparse('<body><with_toc><p>abc</p><p>foo</p></with_toc></body>', "body")
        ->string
}
'<body><with_toc><p>abc</p><p>foo</p></with_toc></body>';

1



( run in 0.375 second using v1.01-cache-2.11-cpan-d0baa829c65 )