HTML-TreeBuilder-XPath
view release on metacpan or search on metacpan
lib/HTML/TreeBuilder/XPath.pm view on Meta::CPAN
sub getAttributes
{ my $elt= shift;
my %atts= $elt->all_external_attr;
my $rank=0;
my @atts= map { bless( { _name => $_, _value => $atts{$_},
_elt => $elt, _rank => $rank++,
},
'HTML::TreeBuilder::XPath::Attribute'
)
} sort keys %atts;
return wantarray ? @atts : \@atts;
}
sub to_number { return XML::XPathEngine::Number->new( $_[0]->as_text); }
sub string_value
{ my $elt= shift;
if( $elt->isCommentNode) { return $elt->{text}; }
return $elt->as_text;
};
# called on a parent, with a child as second argument and its rank as third
# returns the child if it is already an element, or
# a new HTML::TreeBuilder::XPath::Text element if it is a plain string
sub _child_as_object
{ my( $elt, $elt_or_text, $rank)= @_;
return undef unless( defined $elt_or_text);
if( ! ref $elt_or_text)
{ # $elt_or_text is a string, turn it into a TextNode object
$elt_or_text= bless { _content => $elt_or_text, _parent => $elt, },
'HTML::TreeBuilder::XPath::TextNode'
;
Scalar::Util::weaken($elt_or_text->{_parent});
}
if( ref $rank) { warn "rank is a ", ref( $rank), " elt_or_text is a ", ref( $elt_or_text); }
$elt_or_text->{_rank}= $rank; # used for sorting;
return $elt_or_text;
}
sub toString { return shift->as_XML( @_); }
# produces better looking XML
{
no warnings 'redefine';
sub as_XML_compact
{ my( $node, $opt)= @_;
my $name = $node->{'_tag'};
if( $name eq '~literal') { return _xml_escape_text( $node->{text}); }
if( $name eq '~declaration') { return '<!' . _xml_escape_text( $node->{text}) . ">"; }
if( $name eq '~pi') { return '<?' . _xml_escape_text( $node->{text}) . '?>'; }
if( $name eq '~comment') { return '<!--' . _xml_escape_comment( $node->{text}) . '-->'; }
my $lc_name= lc $name;
my $xml= $node->_start_tag;
if( $HTML::Tagset::isCDATA_Parent{$lc_name})
{ my $content= $node->{_content} || '';
if( ref $content eq 'ARRAY' || $content->isa( 'ARRAY'))
{ $xml .= _xml_escape_cdata( join( '', @$content), $opt); }
else
{ $xml .= $content; }
}
else
{ # start tag
foreach my $child ($node->content_list)
{ if( ref $child) { $xml .= $child->as_XML_compact(); }
else { $xml .= _xml_escape_text( $child); }
}
}
$xml.= "</$name>" unless $HTML::Tagset::emptyElement{$lc_name};
return $xml;
}
}
{ my %phrase_name; # all phrase tags, + literals (those are not indented)
my %extra_newline; # tags that get an extra newline before the end tag
my $default_indent; # 2 spaces, change with the 'indent' option
BEGIN
{ %phrase_name= %HTML::Tagset::isPhraseMarkup;
$phrase_name{'~literal'}= 1;
$default_indent= ' ';
%extra_newline= map { $_ => 1 } qw(html head body script div table tbody thead tfoot tr form dl ol ul);
}
sub as_XML_indented
{ my( $node, $opt)= @_;
my $name = $node->{'_tag'};
my $lc_name= lc $name;
if( $name eq '~literal') { return _xml_escape_text( $node->{text}); }
if( $name eq '~declaration') { return '<!' . _xml_escape_text( $node->{text}) . ">\n"; }
if( $name eq '~pi') { return '<?' . _xml_escape_text( $node->{text}) . '?>'; }
if( $name eq '~comment') { return '<!--' . _xml_escape_comment( $node->{text}) . '-->'; }
my $xml;
my $pre_tag_indent='';
if(!$phrase_name{$lc_name}) { $pre_tag_indent= "\n" . ($opt->{indent} || $default_indent) x ($opt->{indent_level}||0); }
if( $opt->{indent_level}) { $xml .= $pre_tag_indent; }
$xml.= $node->_start_tag();
my $content='';
if( $HTML::Tagset::isCDATA_Parent{$lc_name})
{ my $content= $node->{_content} || '';
if( ref $content && (ref $content eq 'ARRAY' || $content->isa( 'ARRAY') ))
{ $content= _xml_escape_cdata( join( '', @$content), $opt); }
}
else
{
my %child_opt= %$opt;
$child_opt{indent_level}++;
foreach my $child ($node->content_list)
{ if( ref $child) { $content .= $child->as_XML_indented( \%child_opt ); }
else { $content .= _xml_escape_text( $child); }
}
}
$xml .= $content;
if( $extra_newline{$lc_name} && $content ne '' ) { $xml.= $pre_tag_indent; }
$xml.= "</$name>" unless $HTML::Tagset::emptyElement{$lc_name};
$xml .="\n" if( !$opt->{indent_level});
return $xml;
}
}
sub _start_tag
{ my( $node)= @_;
my $name = $node->{'_tag'};
my $start_tag.= "<$name";
foreach my $att_name (sort keys %$node)
{ next if( (!length $att_name) || ($att_name=~ m{^_}) || ($att_name eq '/') );
my $well_formed_att_name= well_formed_name( $att_name);
$start_tag .= qq{ $well_formed_att_name="} . _xml_escape_attribute_value( $node->{$att_name}) . qq{"};
}
$start_tag.= $HTML::Tagset::emptyElement{lc $name} ? " />" : ">";
return $start_tag;
}
sub well_formed_name
{ my( $name)= @_;
$name=~ s{[^\w:_-]+}{_}g;
if( $name=~ m{^\d}) { $name= "a$name"; }
return $name;
}
sub _indent_level
{ my( $node)= @_;
my $level= scalar grep { !$HTML::Tagset::isPhraseMarkup{lc $_->{_tag}} } $node->lineage;
return $level;
}
{ my( $indent, %extra_newline, $nl);
BEGIN
{ $indent= ' ';
$nl= "\n";
%extra_newline= map { $_ => 1 } qw(html head body script div table tr form ol ul);
}
sub indents
{ my( $opt, $name)= @_;
my $indents= { pre_start_tag => '', post_start_tag => '', pre_end_tag => '', post_end_tag => ''};
if( $opt->{indented})
{ my $indent_level= $opt->{indent_level};
my $wrapping_nl= $nl;
if( !defined( $indent_level)) { $indent_level = 0; $wrapping_nl= ''; }
if( $HTML::Tagset::isKnown{lc $name} && !$HTML::Tagset::isPhraseMarkup{lc $name} && $indent_level > 0)
{ $indents->{pre_start_tag}= $wrapping_nl . ($indent x $indent_level); }
if( $extra_newline{lc $name})
{ $indents->{post_start_tag}= $nl;
$indents->{pre_end_tag}= $nl . ($indent x $indent_level);
}
if( $indent_level == 0)
{ $indents->{post_end_tag} = $wrapping_nl; }
}
return $indents;
}
}
sub _xml_escape_attribute_value
{ my( $text)= @_;
$text=~ s{([&<>"])}{$CHAR2DEFAULT_ENT{$1}}g; # escape also quote, as it is the attribute separator
return $text;
}
sub _xml_escape_text
{ my( $text)= @_;
$text=~ s{([&<>])}{$CHAR2DEFAULT_ENT{$1}}g;
return $text;
}
sub _xml_escape_comment
{ my( $text)= @_;
$text=~ s{([&<>])}{$CHAR2DEFAULT_ENT{$1}}g;
$text=~ s{--}{--}g; # can't have double --'s in XML comments
return $text;
}
sub _xml_escape_cdata
{ my( $text, $opt)= @_;
if( $opt->{force_escape_cdata} || $text=~ m{[<&]})
{ $text=~ s{^\s*\Q<![CDATA[}{}s;
$text=~ s{\Q]]>\E\s*$}{}s;
$text=~ s{]]>}{]]>}g; # can't have]]> in CDATA
$text= "<![CDATA[$text]]>";
}
return $text;
}
package HTML::TreeBuilder::XPath::TextNode;
use base 'HTML::TreeBuilder::XPath::Node';
sub getParentNode { return shift->{_parent}; }
sub getValue { return shift->{_content}; }
sub isTextNode { return 1; }
sub getAttributes { return wantarray ? () : []; }
# similar to HTML::Element as_XML
sub as_XML
{ my( $node, $entities)= @_;
my $content= $node->{_content};
if( $node->{_parent} && $node->{_parent}->{_tag} eq 'script')
{ $content=~ s{(&\w+;)}{HTML::Entities::decode($1)}eg; }
else
{ $content= HTML::Element::_xml_escape_text($content); }
return $content;
}
*as_XML_compact = *as_XML;
*as_XML_indented = *as_XML;
sub getPreviousSibling
{ my $self= shift;
my $rank= $self->{_rank};
#unless( defined $self->{_rank})
# { warn "no rank for text node $self->{_content}, parent is $self->{_parent}->{_tag}\n"; }
my $parent= $self->{_parent};
return $rank ? $parent->_child_as_object( $parent->{_content}->[$rank-1], $rank-1) : undef;
}
sub getNextSibling
{ my $self= shift;
my $rank= $self->{_rank};
#unless( defined $self->{_rank})
# { warn "no rank for text node $self->{_content}, parent is $self->{_parent}->{_tag}\n"; }
my $parent= $self->{_parent};
my $next_sibling= $parent->{_content}->[$rank+1];
return defined( $next_sibling) ? $parent->_child_as_object( $next_sibling, $rank+1) : undef;
}
sub getRootNode
{ return shift->{_parent}->getRootNode; }
sub string_value { return shift->{_content}; }
# added to provide element-like methods to text nodes, for use by cmp
sub lineage
{ my( $node)= @_;
my $parent= $node->{_parent};
( run in 2.750 seconds using v1.01-cache-2.11-cpan-524268b4103 )