Mojo-DOM58
view release on metacpan or search on metacpan
lib/Mojo/DOM58.pm view on Meta::CPAN
# Wrap a raw tree node in a new object created by "_build" (handing over this
# object's "xml" value), or yield undef when there is no node to wrap.
sub _maybe {
  my ($self, $tree) = @_;

  # Explicit undef on purpose: list-context callers still receive one element
  return undef unless $tree;

  return $self->_build($tree, $self->xml);
}
# Return an array reference with copies of the child node references of
# $tree. With a true second argument only "tag" nodes are kept. Without a
# tree an empty list (undef in scalar context) is returned.
sub _nodes {
  my ($tree, $tags_only) = @_;
  return () unless $tree;

  # Children occupy everything from the type-specific start index onwards
  my @children = @{$tree}[_start($tree) .. $#$tree];
  return \@children unless $tags_only;

  return [grep { $_->[0] eq 'tag' } @children];
}
# Find the array index of $child inside the $parent node. If the child is not
# among the parent's children, the index just past the last element is
# returned.
sub _offset {
  my ($parent, $child) = @_;

  my $index = _start($parent);
  for my $node (@{$parent}[$index .. $#$parent]) {

    # References are compared by their stringified form
    last if $node eq $child;
    $index++;
  }

  return $index;
}
# Fetch the parent reference stored in a node: slot 3 for "tag" nodes, slot 2
# for every other node type.
sub _parent {
  my ($node) = @_;
  my $slot = $node->[0] eq 'tag' ? 3 : 2;
  return $node->[$slot];
}
# Turn $input into a tree. A Mojo::DOM58 object contributes a deep copy of its
# tree (passed through "_fragment" unless it already is a root); anything else
# is parsed as markup using this object's "xml" value.
sub _parse {
  my ($self, $input) = @_;

  # DOM object: clone its tree instead of sharing it
  if (blessed($input) && $input->isa('Mojo::DOM58')) {
    my $copy = dclone($input->tree);
    return $copy if $copy->[0] eq 'root';
    return _fragment($copy);
  }

  # Markup
  my $parser = Mojo::DOM58::_HTML->new(xml => $self->xml);
  return $parser->parse($input)->tree;
}
# Swap $child inside $parent for the list of $nodes (after "_link" has been
# applied to them) and return the result of this object's "parent" method.
sub _replace {
  my ($self, $parent, $child, $nodes) = @_;

  # Position first, then linking, same order as the arguments are used
  my $position = _offset($parent, $child);
  my $linked   = _link($parent, $nodes);
  splice @{$parent}, $position, 1, @{$linked};

  return $self->parent;
}
# Narrow a collection down to the elements matching $selector. Without a
# (true) selector the collection is handed back untouched.
sub _select {
  my ($collection, $selector) = @_;
  return $collection unless $selector;
  return $collection->grep(matches => $selector);
}
# Collect the siblings of $tree. With a true $tail the nodes following $tree
# are kept, otherwise the ones preceding it; a true $tags limits the result to
# "tag" nodes. Without $i the array reference of siblings is returned, with
# $i only the sibling at that index (or undef).
sub _siblings {
  my ($tree, $tags, $tail, $i) = @_;

  # A root has no siblings
  if ($tree->[0] eq 'root') {
    return undef if defined $i;
    return [];
  }

  # Locate this node among the children of its parent
  my $nodes = _nodes(_parent($tree));
  my $match = -1;
  for my $node (@$nodes) {
    $match++;
    last if $node eq $tree;
  }

  # Drop this node together with everything on the unwanted side
  if ($tail) {
    splice @$nodes, 0, $match + 1;
  }
  else {
    splice @$nodes, $match, scalar(@$nodes) - $match;
  }

  if ($tags) {
    @$nodes = grep { $_->[0] eq 'tag' } @$nodes;
  }

  return $nodes unless defined $i;

  # Never index an empty list with -1
  return undef if $i == -1 && !@$nodes;

  return $nodes->[$i];
}
# Index of the first child slot in a node array: 1 for a "root" node, 4 for
# anything else.
sub _start {
  my ($node) = @_;
  return 1 if $node->[0] eq 'root';
  return 4;
}
# Concatenate the content of all "text", "cdata" and "raw" nodes in $nodes.
# With a true $all the children of "tag" nodes are visited as well, except
# for "script" and "style" tags unless $xml is true. The $nodes array is
# consumed while it is processed.
sub _text {
  my ($nodes, $xml, $all) = @_;

  my %textual = (text => 1, cdata => 1, raw => 1);

  my $buffer = '';
  while (my $node = shift @$nodes) {
    my $type = $node->[0];

    # Text
    if ($textual{$type}) {
      $buffer .= $node->[1];
      next;
    }

    # Nested tag
    next unless $type eq 'tag' && $all;
    next if !$xml && ($node->[1] eq 'script' || $node->[1] eq 'style');

    # Children go to the front of the queue to keep document order
    unshift @$nodes, @{_nodes($node)};
  }

  return $buffer;
}
# Wrap this node, or its content when $content is true, in the markup (or DOM
# object) given as $new. The wrapped nodes end up inside the innermost first
# element of $new. Always returns $self, also when nothing could be wrapped.
sub _wrap {
  my ($self, $content, $new) = @_;

  # Nothing to do: a root cannot be wrapped itself, and only root and tag
  # nodes have content that can be wrapped
  my $tree    = $self->tree;
  my $is_root = $tree->[0] eq 'root';
  if ($content) { return $self unless $is_root || $tree->[0] eq 'tag' }
  else          { return $self if $is_root }

  # Find innermost tag by following the first element at every level
  $new = $self->_parse($new);
  my $innermost;
  my $cursor = $new;
  while (my $child = _nodes($cursor, 1)->[0]) {
    $innermost = $cursor = $child;
  }
  return $self unless $innermost;

  # Wrap content: move the children into the innermost tag, then put the new
  # nodes in their place
  if ($content) {
    push @$innermost, @{_link($innermost, _nodes($tree))};
    splice @$tree, _start($tree), $#$tree,
      @{_link($tree, _nodes($new))};
    return $self;
  }

  # Wrap element: put the new nodes where this node was, then move this node
  # into the innermost tag
  $self->_replace(_parent($tree), $tree, _nodes($new));
  push @$innermost, @{_link($innermost, [$tree])};

  return $self;
}
1;
=encoding utf8
=head1 NAME
Mojo::DOM58 - Minimalistic HTML/XML DOM parser with CSS selectors
=head1 SYNOPSIS
use Mojo::DOM58;
# Parse
my $dom = Mojo::DOM58->new('<div><p id="a">Test</p><p id="b">123</p></div>');
# Find
say $dom->at('#b')->text;
say $dom->find('p')->map('text')->join("\n");
say $dom->find('[id]')->map(attr => 'id')->join("\n");
# Iterate
$dom->find('p[id]')->reverse->each(sub { say $_->{id} });
# Loop
for my $e ($dom->find('p[id]')->each) {
say $e->{id}, ':', $e->text;
}
# Modify
$dom->find('div p')->last->append('<p id="c">456</p>');
$dom->at('#c')->prepend($dom->new_tag('p', id => 'd', '789'));
$dom->find(':not(p)')->map('strip');
# Render
say "$dom";
=head1 DESCRIPTION
L<Mojo::DOM58> is a minimalistic and relaxed pure-perl HTML/XML DOM parser based
on L<Mojo::DOM>. It supports the L<HTML Living Standard|https://html.spec.whatwg.org/>
and L<Extensible Markup Language (XML) 1.0|https://www.w3.org/TR/xml/>, and
matching based on L<CSS3 selectors|https://www.w3.org/TR/selectors/>. It will
even try to interpret broken HTML and XML, so you should not use it for
validation.
=head1 FORK INFO
L<Mojo::DOM58> is a fork of L<Mojo::DOM> and tracks features and fixes to stay
closely compatible with upstream. It differs only in the standalone format and
compatibility with Perl 5.8. Any bugs or patches not related to these changes
should be reported directly to the L<Mojolicious> issue tracker.
This release of L<Mojo::DOM58> is up to date with version C<9.40> of
L<Mojolicious>.
=head1 NODES AND ELEMENTS
When we parse an HTML/XML fragment, it gets turned into a tree of nodes.
<!DOCTYPE html>
<html>
<head><title>Hello</title></head>
<body>World!</body>
</html>
There are currently eight different kinds of nodes, C<cdata>, C<comment>,
C<doctype>, C<pi>, C<raw>, C<root>, C<tag> and C<text>. Elements are nodes of
the type C<tag>.
root
|- doctype (html)
+- tag (html)
|- tag (head)
| +- tag (title)
| +- raw (Hello)
+- tag (body)
+- text (World!)
While all node types are represented as L<Mojo::DOM58> objects, some methods like
L</"attr"> and L</"namespace"> only apply to elements.
=head1 HTML AND XML
L<Mojo::DOM58> defaults to HTML semantics, which means that all tag and attribute
names are lowercased and selectors need to be lowercase as well.
# HTML semantics
my $dom = Mojo::DOM58->new('<P ID="greeting">Hi!</P>');
say $dom->at('p[id]')->text;
If an XML declaration is found, the parser will automatically switch into XML
mode and everything becomes case-sensitive.
# XML semantics
my $dom = Mojo::DOM58->new('<?xml version="1.0"?><P ID="greeting">Hi!</P>');
say $dom->at('P[ID]')->text;
HTML or XML semantics can also be forced with the L</"xml"> method.
# Force HTML semantics
my $dom = Mojo::DOM58->new->xml(0)->parse('<P ID="greeting">Hi!</P>');
say $dom->at('p[id]')->text;
# Force XML semantics
my $dom = Mojo::DOM58->new->xml(1)->parse('<P ID="greeting">Hi!</P>');
say $dom->at('P[ID]')->text;
=head1 SELECTORS
L<Mojo::DOM58> uses a CSS selector engine based on L<Mojo::DOM::CSS>. All CSS
selectors that make sense for a standalone parser are supported.
=over
=item Z<>*
Any element.
my $all = $dom->find('*');
=item E
An element of type C<E>.
my $title = $dom->at('title');
lib/Mojo/DOM58.pm view on Meta::CPAN
=head2 strip
my $parent = $dom->strip;
Remove this element while preserving its content and return L</"parent">.
# "<div>Test</div>"
$dom->parse('<div><h1>Test</h1></div>')->at('h1')->strip;
=head2 tag
my $tag = $dom->tag;
$dom = $dom->tag('div');
This element's tag name.
# List tag names of child elements
say $dom->children->map('tag')->join("\n");
=head2 tap
$dom = $dom->tap(sub {...});
Equivalent to L<Mojo::Base/"tap">.
=head2 text
my $text = $dom->text;
Extract text content from this element only (not including child elements). To
extract text content from all descendant nodes, see L</"all_text">.
# "bar"
$dom->parse("<div>foo<p>bar</p>baz</div>")->at('p')->text;
# "foo\nbaz\n"
$dom->parse("<div>foo\n<p>bar</p>baz\n</div>")->at('div')->text;
=head2 to_string
my $str = $dom->to_string;
Render this node and its content to HTML/XML.
# "<b>Test</b>"
$dom->parse('<div><b>Test</b></div>')->at('div b')->to_string;
To extract text content from all descendant nodes, see L</"all_text">.
=head2 tree
my $tree = $dom->tree;
$dom = $dom->tree(['root']);
Document Object Model. Note that this structure should only be used very
carefully since it is very dynamic.
=head2 type
my $type = $dom->type;
This node's type, usually C<cdata>, C<comment>, C<doctype>, C<pi>, C<raw>,
C<root>, C<tag> or C<text>.
# "cdata"
$dom->parse('<![CDATA[Test]]>')->child_nodes->first->type;
# "comment"
$dom->parse('<!-- Test -->')->child_nodes->first->type;
# "doctype"
$dom->parse('<!DOCTYPE html>')->child_nodes->first->type;
# "pi"
$dom->parse('<?xml version="1.0"?>')->child_nodes->first->type;
# "raw"
$dom->parse('<title>Test</title>')->at('title')->child_nodes->first->type;
# "root"
$dom->parse('<p>Test</p>')->type;
# "tag"
$dom->parse('<p>Test</p>')->at('p')->type;
# "text"
$dom->parse('<p>Test</p>')->at('p')->child_nodes->first->type;
=head2 val
my $value = $dom->val;
Extract value from form element (such as C<button>, C<input>, C<option>,
C<select> and C<textarea>), or return C<undef> if this element has no value. In
the case of C<select> with C<multiple> attribute, find C<option> elements with
C<selected> attribute and return an array reference with all values, or
C<undef> if none could be found.
# "a"
$dom->parse('<input name=test value=a>')->at('input')->val;
# "b"
$dom->parse('<textarea>b</textarea>')->at('textarea')->val;
# "c"
$dom->parse('<option value="c">Test</option>')->at('option')->val;
# "d"
$dom->parse('<select><option selected>d</option></select>')
->at('select')->val;
# "e"
$dom->parse('<select multiple><option selected>e</option></select>')
->at('select')->val->[0];
# "on"
$dom->parse('<input name=test type=checkbox>')->at('input')->val;
=head2 with_roles
my $new_class = Mojo::DOM58->with_roles('Mojo::DOM58::Role::One');
my $new_class = Mojo::DOM58->with_roles('+One', '+Two');
$dom = $dom->with_roles('+One', '+Two');
Equivalent to L<Mojo::Base/"with_roles">. Note that role support depends on
( run in 0.568 second using v1.01-cache-2.11-cpan-13bb782fe5a )