Mojo-DOM58

 view release on metacpan or  search on metacpan

lib/Mojo/DOM58.pm  view on Meta::CPAN

# Turn an optional tree node into an object: when a node is given it is handed
# to "_build" together with the invocant's current "xml" setting, otherwise the
# result is an explicit undef.
sub _maybe {
  my ($self, $tree) = @_;

  # The explicit undef is deliberate, callers receive a value in any context
  return undef unless $tree;

  return $self->_build($tree, $self->xml);
}

# Collect the children of a tree node, that is everything stored from the
# "_start" offset onwards. With a true second argument only nodes of the type
# "tag" are kept. Returns an array reference, or an empty list if no tree was
# passed.
sub _nodes {
  my ($tree, $tags_only) = @_;

  return () unless $tree;

  my @children = @{$tree}[_start($tree) .. $#{$tree}];
  return \@children unless $tags_only;

  return [grep { $_->[0] eq 'tag' } @children];
}

# Find the index of $child inside the $parent node array. Counting begins at
# the first child slot (see "_start"), and if the child is never encountered
# the index just past the last inspected element is returned.
sub _offset {
  my ($parent, $child) = @_;

  my $first = _start($parent);
  my $index = $first;
  for my $node (@{$parent}[$first .. $#{$parent}]) {

    # References are compared by their string form
    last if $node eq $child;
    $index++;
  }

  return $index;
}

# Fetch the parent reference of a node. Nodes of the type "tag" keep it in
# slot 3, every other node type is read from slot 2.
sub _parent {
  my ($node) = @_;
  my $slot = $node->[0] eq 'tag' ? 3 : 2;
  return $node->[$slot];
}

# Convert $input into a tree. A Mojo::DOM58 object has its tree deep-cloned
# (and passed through "_fragment" unless it already is a "root" node), anything
# else is parsed by Mojo::DOM58::_HTML using the invocant's "xml" setting.
sub _parse {
  my ($self, $input) = @_;

  # Existing object, work on a copy so the source tree stays untouched
  if (blessed($input) && $input->isa('Mojo::DOM58')) {
    my $copy = dclone($input->tree);
    return $copy if $copy->[0] eq 'root';
    return _fragment($copy);
  }

  # Everything else is treated as markup
  return Mojo::DOM58::_HTML->new(xml => $self->xml)->parse($input)->tree;
}

# Swap $child inside $parent for the nodes in $nodes (an array reference). The
# new nodes are passed through "_link" with $parent before being spliced in.
# Returns the result of the invocant's "parent" method.
sub _replace {
  my ($self, $parent, $child, $nodes) = @_;

  # Position is looked up before the replacement nodes get linked
  my $position = _offset($parent, $child);
  my $linked   = _link($parent, $nodes);
  splice @{$parent}, $position, 1, @{$linked};

  return $self->parent;
}

# Optionally narrow down a collection: with a true selector the collection is
# filtered with its "grep" method using "matches", without one it is returned
# unchanged.
sub _select {
  my ($collection, $selector) = @_;
  return $collection unless $selector;
  return $collection->grep(matches => $selector);
}

# Collect the siblings of a node.
#
#   $tree - node whose siblings are wanted
#   $tags - true to keep only nodes of the type "tag"
#   $tail - true for the siblings after the node, false for those before it
#   $i    - optional index, if defined a single sibling is returned instead
#
# Returns an array reference of sibling nodes, or, when $i is defined, the
# sibling at that index (undef if there is none). A "root" node never has
# siblings.
sub _siblings {
  my ($tree, $tags, $tail, $i) = @_;

  # Root
  if ($tree->[0] eq 'root') {
    return undef if defined $i;
    return [];
  }

  # Locate the node among the children of its parent
  my $nodes = _nodes(_parent($tree));
  my $match = -1;
  for my $node (@{$nodes}) {
    $match++;
    last if $node eq $tree;
  }

  # Drop the node itself together with everything on the unwanted side
  if ($tail) { splice @{$nodes}, 0, $match + 1 }
  else       { splice @{$nodes}, $match, scalar(@{$nodes}) - $match }

  # Elements only
  if ($tags) { @{$nodes} = grep { $_->[0] eq 'tag' } @{$nodes} }

  # Whole list
  return $nodes unless defined $i;

  # Single sibling, asking for the last one of an empty list yields undef
  return undef if $i == -1 && !@{$nodes};
  return $nodes->[$i];
}

# Index of the first child slot in a node array: 1 for "root" nodes and 4 for
# everything else.
sub _start {
  my ($node) = @_;
  return 1 if $node->[0] eq 'root';
  return 4;
}

# Concatenate the text found in a list of nodes.
#
#   $nodes - array reference of nodes, it is consumed while being processed
#   $xml   - true to also descend into "script" and "style" tags
#   $all   - true to descend into nested tags at all
#
# Only the content of "text", "cdata" and "raw" nodes ends up in the returned
# string, all other node types are skipped.
sub _text {
  my ($nodes, $xml, $all) = @_;

  my %is_textual = map { $_ => 1 } qw(text cdata raw);

  my @chunks;
  while (my $node = shift @$nodes) {
    my $kind = $node->[0];

    # Text
    if ($is_textual{$kind}) {
      push @chunks, $node->[1];
      next;
    }

    # Nested tag, its children are queued in front to keep document order
    next unless $all && $kind eq 'tag';
    next unless $xml || ($node->[1] ne 'script' && $node->[1] ne 'style');
    unshift @$nodes, @{_nodes($node)};
  }

  return join '', @chunks;
}

# Wrap this node, or with a true $content its children, in the markup $new.
# $new is turned into a tree with "_parse" and the innermost first element of
# that tree receives the wrapped nodes. Always returns the invocant, which is
# left untouched if the node type cannot be wrapped or $new has no element.
sub _wrap {
  my ($self, $content, $new) = @_;

  # Content can only be wrapped for "root" and "tag" nodes, and a "root" node
  # itself cannot be wrapped
  my $tree = $self->tree;
  my $type = $tree->[0];
  if ($content) { return $self unless $type eq 'root' || $type eq 'tag' }
  else          { return $self if $type eq 'root' }

  # Find innermost tag by following the first element child at every level
  $new = $self->_parse($new);
  my $innermost;
  my $cursor = $new;
  while (my $child = _nodes($cursor, 1)->[0]) {
    $innermost = $cursor = $child;
  }
  return $self unless $innermost;

  if ($content) {

    # Wrap content, the current children are appended to the innermost tag
    # before the new nodes take their place
    push @{$innermost}, @{_link($innermost, _nodes($tree))};
    splice @{$tree}, _start($tree), $#{$tree}, @{_link($tree, _nodes($new))};
  }
  else {

    # Wrap element, the new nodes replace this node in its parent and the
    # node itself is appended to the innermost tag
    $self->_replace(_parent($tree), $tree, _nodes($new));
    push @{$innermost}, @{_link($innermost, [$tree])};
  }

  return $self;
}

1;

=encoding utf8

=head1 NAME

Mojo::DOM58 - Minimalistic HTML/XML DOM parser with CSS selectors

=head1 SYNOPSIS

  use Mojo::DOM58;

  # Parse
  my $dom = Mojo::DOM58->new('<div><p id="a">Test</p><p id="b">123</p></div>');

  # Find
  say $dom->at('#b')->text;
  say $dom->find('p')->map('text')->join("\n");
  say $dom->find('[id]')->map(attr => 'id')->join("\n");

  # Iterate
  $dom->find('p[id]')->reverse->each(sub { say $_->{id} });

  # Loop
  for my $e ($dom->find('p[id]')->each) {
    say $e->{id}, ':', $e->text;
  }

  # Modify
  $dom->find('div p')->last->append('<p id="c">456</p>');
  $dom->at('#c')->prepend($dom->new_tag('p', id => 'd', '789'));
  $dom->find(':not(p)')->map('strip');

  # Render
  say "$dom";

=head1 DESCRIPTION

L<Mojo::DOM58> is a minimalistic and relaxed pure-perl HTML/XML DOM parser based
on L<Mojo::DOM>. It supports the L<HTML Living Standard|https://html.spec.whatwg.org/>
and L<Extensible Markup Language (XML) 1.0|https://www.w3.org/TR/xml/>, and
matching based on L<CSS3 selectors|https://www.w3.org/TR/selectors/>. It will
even try to interpret broken HTML and XML, so you should not use it for
validation.

=head1 FORK INFO

L<Mojo::DOM58> is a fork of L<Mojo::DOM> and tracks features and fixes to stay
closely compatible with upstream. It differs only in the standalone format and
compatibility with Perl 5.8. Any bugs or patches not related to these changes
should be reported directly to the L<Mojolicious> issue tracker.

This release of L<Mojo::DOM58> is up to date with version C<9.40> of
L<Mojolicious>.

=head1 NODES AND ELEMENTS

When we parse an HTML/XML fragment, it gets turned into a tree of nodes.

  <!DOCTYPE html>
  <html>
    <head><title>Hello</title></head>
    <body>World!</body>
  </html>

There are currently eight different kinds of nodes, C<cdata>, C<comment>,
C<doctype>, C<pi>, C<raw>, C<root>, C<tag> and C<text>. Elements are nodes of
the type C<tag>.

  root
  |- doctype (html)
  +- tag (html)
     |- tag (head)
     |  +- tag (title)
     |     +- raw (Hello)
     +- tag (body)
        +- text (World!)

While all node types are represented as L<Mojo::DOM58> objects, some methods like
L</"attr"> and L</"namespace"> only apply to elements.

=head1 HTML AND XML

L<Mojo::DOM58> defaults to HTML semantics, which means all tag and attribute
names are lowercased and selectors need to be lowercase as well.

  # HTML semantics
  my $dom = Mojo::DOM58->new('<P ID="greeting">Hi!</P>');
  say $dom->at('p[id]')->text;

If an XML declaration is found, the parser will automatically switch into XML
mode and everything becomes case-sensitive.

  # XML semantics
  my $dom = Mojo::DOM58->new('<?xml version="1.0"?><P ID="greeting">Hi!</P>');
  say $dom->at('P[ID]')->text;

HTML or XML semantics can also be forced with the L</"xml"> method.

  # Force HTML semantics
  my $dom = Mojo::DOM58->new->xml(0)->parse('<P ID="greeting">Hi!</P>');
  say $dom->at('p[id]')->text;

  # Force XML semantics
  my $dom = Mojo::DOM58->new->xml(1)->parse('<P ID="greeting">Hi!</P>');
  say $dom->at('P[ID]')->text;

=head1 SELECTORS

L<Mojo::DOM58> uses a CSS selector engine based on L<Mojo::DOM::CSS>. All CSS
selectors that make sense for a standalone parser are supported.

=over

=item Z<>*

Any element.

  my $all = $dom->find('*');

=item E

An element of type C<E>.

  my $title = $dom->at('title');

lib/Mojo/DOM58.pm  view on Meta::CPAN

=head2 strip

  my $parent = $dom->strip;

Remove this element while preserving its content and return L</"parent">.

  # "<div>Test</div>"
  $dom->parse('<div><h1>Test</h1></div>')->at('h1')->strip;

=head2 tag

  my $tag = $dom->tag;
  $dom    = $dom->tag('div');

This element's tag name.

  # List tag names of child elements
  say $dom->children->map('tag')->join("\n");

=head2 tap

  $dom = $dom->tap(sub {...});

Equivalent to L<Mojo::Base/"tap">.

=head2 text

  my $text = $dom->text;

Extract text content from this element only (not including child elements).

  # "bar"
  $dom->parse("<div>foo<p>bar</p>baz</div>")->at('p')->text;

  # "foo\nbaz\n"
  $dom->parse("<div>foo\n<p>bar</p>baz\n</div>")->at('div')->text;

=head2 to_string

  my $str = $dom->to_string;

Render this node and its content to HTML/XML.

  # "<b>Test</b>"
  $dom->parse('<div><b>Test</b></div>')->at('div b')->to_string;

To extract text content instead of markup, see L</"text"> (this element only)
and L</"all_text"> (all descendant nodes).

=head2 tree

  my $tree = $dom->tree;
  $dom     = $dom->tree(['root']);

Document Object Model. Note that this structure should only be used very
carefully since it is very dynamic.

=head2 type

  my $type = $dom->type;

This node's type, usually C<cdata>, C<comment>, C<doctype>, C<pi>, C<raw>,
C<root>, C<tag> or C<text>.

  # "cdata"
  $dom->parse('<![CDATA[Test]]>')->child_nodes->first->type;

  # "comment"
  $dom->parse('<!-- Test -->')->child_nodes->first->type;

  # "doctype"
  $dom->parse('<!DOCTYPE html>')->child_nodes->first->type;

  # "pi"
  $dom->parse('<?xml version="1.0"?>')->child_nodes->first->type;

  # "raw"
  $dom->parse('<title>Test</title>')->at('title')->child_nodes->first->type;

  # "root"
  $dom->parse('<p>Test</p>')->type;

  # "tag"
  $dom->parse('<p>Test</p>')->at('p')->type;

  # "text"
  $dom->parse('<p>Test</p>')->at('p')->child_nodes->first->type;

=head2 val

  my $value = $dom->val;

Extract value from form element (such as C<button>, C<input>, C<option>,
C<select> and C<textarea>), or return C<undef> if this element has no value. In
the case of C<select> with C<multiple> attribute, find C<option> elements with
C<selected> attribute and return an array reference with all values, or
C<undef> if none could be found.

  # "a"
  $dom->parse('<input name=test value=a>')->at('input')->val;

  # "b"
  $dom->parse('<textarea>b</textarea>')->at('textarea')->val;

  # "c"
  $dom->parse('<option value="c">Test</option>')->at('option')->val;

  # "d"
  $dom->parse('<select><option selected>d</option></select>')
    ->at('select')->val;

  # "e"
  $dom->parse('<select multiple><option selected>e</option></select>')
    ->at('select')->val->[0];

  # "on"
  $dom->parse('<input name=test type=checkbox>')->at('input')->val;

=head2 with_roles

  my $new_class = Mojo::DOM58->with_roles('Mojo::DOM58::Role::One');
  my $new_class = Mojo::DOM58->with_roles('+One', '+Two');
  $dom          = $dom->with_roles('+One', '+Two');

Equivalent to L<Mojo::Base/"with_roles">. Note that role support depends on



( run in 0.568 second using v1.01-cache-2.11-cpan-13bb782fe5a )