HTML-TreeBuilder-XPath

 view release on metacpan or  search on metacpan

lib/HTML/TreeBuilder/XPath.pm  view on Meta::CPAN


sub getAttributes
  { my $elt= shift;
    my %atts= $elt->all_external_attr;
    my $rank=0;
    my @atts= map { bless( { _name => $_, _value => $atts{$_}, 
                             _elt => $elt, _rank => $rank++, 
                           }, 
                               'HTML::TreeBuilder::XPath::Attribute'
                         )
                  } sort keys %atts;
    return wantarray ? @atts : \@atts;
  }

sub to_number { return XML::XPathEngine::Number->new( $_[0]->as_text); }
sub string_value 
  { my $elt= shift;
    if( $elt->isCommentNode) { return $elt->{text}; }
    return $elt->as_text;
  };

# called on a parent, with a child as second argument and its rank as third
# returns the child if it is already an element, or
# a new HTML::TreeBuilder::XPath::Text element if it is a plain string
sub _child_as_object
  { my( $elt, $elt_or_text, $rank)= @_;
    return undef unless( defined $elt_or_text);
    if( ! ref $elt_or_text)
      { # $elt_or_text is a string, turn it into a TextNode object
        $elt_or_text= bless { _content => $elt_or_text, _parent => $elt, }, 
                            'HTML::TreeBuilder::XPath::TextNode'
                      ;
        Scalar::Util::weaken($elt_or_text->{_parent});
      }
    if( ref $rank) { warn "rank is a ", ref( $rank), " elt_or_text is a ", ref( $elt_or_text); } 
    $elt_or_text->{_rank}= $rank; # used for sorting;
    return $elt_or_text;
  }

sub toString { return shift->as_XML( @_); }

# produces better looking XML
{
  no warnings 'redefine'; 
  sub as_XML_compact
    { my( $node, $opt)= @_;
      my $name = $node->{'_tag'};
      if( $name eq '~literal')     { return _xml_escape_text( $node->{text});                     }

      if( $name eq '~declaration') { return '<!'   . _xml_escape_text( $node->{text})    . ">";   }
      if( $name eq '~pi')          { return '<?'   . _xml_escape_text( $node->{text})    . '?>';  }
      if( $name eq '~comment')     { return '<!--' . _xml_escape_comment( $node->{text}) . '-->'; }

      my $lc_name= lc $name;

      my $xml= $node->_start_tag;

      if( $HTML::Tagset::isCDATA_Parent{$lc_name})
        { my $content= $node->{_content} || '';
          if( ref $content eq 'ARRAY' || $content->isa( 'ARRAY'))
            { $xml .= _xml_escape_cdata( join( '', @$content), $opt); }
          else
            { $xml .= $content; }
        }
      else
        { # start tag
          foreach my $child ($node->content_list) 
            { if( ref $child) { $xml .= $child->as_XML_compact(); }
              else            { $xml .=  _xml_escape_text( $child); }
            }
        }
          $xml.= "</$name>" unless $HTML::Tagset::emptyElement{$lc_name};
      return $xml;
    }
}

    

{ my %phrase_name;    # all phrase tags, + literals (those are not indented)
  my %extra_newline;  # tags that get an extra newline before the end tag
  my $default_indent; # 2 spaces, change with the 'indent' option
  BEGIN 
    { %phrase_name= %HTML::Tagset::isPhraseMarkup;
      $phrase_name{'~literal'}= 1;
      $default_indent= '  ';
      %extra_newline= map { $_ => 1 } qw(html head body script div table tbody thead tfoot tr form dl ol ul);
    }

  sub as_XML_indented
    { my( $node, $opt)= @_;


      my $name = $node->{'_tag'};
      my $lc_name= lc $name;

      if( $name eq '~literal')     { return _xml_escape_text( $node->{text});                     }
      if( $name eq '~declaration') { return '<!'   . _xml_escape_text( $node->{text})    . ">\n"; }


      if( $name eq '~pi')          { return '<?'   . _xml_escape_text( $node->{text})    . '?>';  }
      if( $name eq '~comment')     { return '<!--' . _xml_escape_comment( $node->{text}) . '-->'; }
      
      my $xml;
      my $pre_tag_indent='';
      if(!$phrase_name{$lc_name}) { $pre_tag_indent=  "\n" . ($opt->{indent} || $default_indent) x ($opt->{indent_level}||0); }
      if( $opt->{indent_level}) { $xml .= $pre_tag_indent; }

      $xml.= $node->_start_tag();

      my $content='';

      if( $HTML::Tagset::isCDATA_Parent{$lc_name})
        { my $content= $node->{_content} || '';
          if( ref $content && (ref $content eq 'ARRAY' || $content->isa( 'ARRAY') ))
            { $content= _xml_escape_cdata( join( '', @$content), $opt); }
        }
      else
        { 
          my %child_opt= %$opt;
          $child_opt{indent_level}++;
          foreach my $child ($node->content_list) 
            { if( ref $child) { $content .= $child->as_XML_indented( \%child_opt ); }
              else            { $content .=  _xml_escape_text( $child);             }
            }
        }
      $xml .= $content;

      if( $extra_newline{$lc_name} && $content ne '' ) { $xml.= $pre_tag_indent; }
      $xml.= "</$name>" unless $HTML::Tagset::emptyElement{$lc_name};
      $xml .="\n" if( !$opt->{indent_level});
     
      return $xml;
    }
}

sub _start_tag
  { my( $node)= @_;
    my $name = $node->{'_tag'};
    my $start_tag.= "<$name";
    foreach my $att_name (sort keys %$node) 
      { next if( (!length $att_name) ||  ($att_name=~ m{^_}) || ($att_name eq '/') );
        my $well_formed_att_name= well_formed_name( $att_name);
        $start_tag .= qq{ $well_formed_att_name="} . _xml_escape_attribute_value( $node->{$att_name}) . qq{"};
      }
    $start_tag.= $HTML::Tagset::emptyElement{lc $name} ? " />" : ">";
    return $start_tag;
  }

sub well_formed_name
  { my( $name)= @_;
    $name=~ s{[^\w:_-]+}{_}g;
    if( $name=~ m{^\d}) { $name= "a$name"; }
    return $name;
  }

sub _indent_level
  { my( $node)= @_;
    my $level= scalar grep { !$HTML::Tagset::isPhraseMarkup{lc $_->{_tag}} } $node->lineage;
    return $level;
  }

{ my( $indent, %extra_newline, $nl);
  BEGIN 
    { $indent= '  '; 
      $nl= "\n";
      %extra_newline= map { $_ => 1 } qw(html head body script div table tr form ol ul);
    }
   
  sub indents
    { my( $opt, $name)= @_;
      my $indents= { pre_start_tag => '', post_start_tag => '', pre_end_tag => '', post_end_tag => ''};
      if( $opt->{indented})
        { my $indent_level= $opt->{indent_level};
          my $wrapping_nl= $nl; 
          if( !defined( $indent_level)) { $indent_level = 0; $wrapping_nl= ''; }
          if( $HTML::Tagset::isKnown{lc $name} && !$HTML::Tagset::isPhraseMarkup{lc $name} && $indent_level > 0) 
            { $indents->{pre_start_tag}= $wrapping_nl . ($indent x $indent_level); }
          if( $extra_newline{lc $name})
            { $indents->{post_start_tag}= $nl; 
              $indents->{pre_end_tag}= $nl . ($indent x $indent_level);
            }
          if( $indent_level == 0) 
            { $indents->{post_end_tag} = $wrapping_nl; }
        }
      return $indents;
    }
}

    
sub _xml_escape_attribute_value
  { my( $text)= @_;
    $text=~ s{([&<>"])}{$CHAR2DEFAULT_ENT{$1}}g; # escape also quote, as it is the attribute separator
    return $text;
  }

sub _xml_escape_text
  { my( $text)= @_;
    $text=~ s{([&<>])}{$CHAR2DEFAULT_ENT{$1}}g;
    return $text;
  }

sub _xml_escape_comment
  { my( $text)= @_;
    $text=~ s{([&<>])}{$CHAR2DEFAULT_ENT{$1}}g;
    $text=~ s{--}{-&#45;}g; # can't have double --'s in XML comments
    return $text;
  }

sub _xml_escape_cdata
  { my( $text, $opt)= @_;
    if( $opt->{force_escape_cdata} || $text=~ m{[<&]})
      { $text=~ s{^\s*\Q<![CDATA[}{}s;
        $text=~ s{\Q]]>\E\s*$}{}s;
        $text=~ s{]]>}{]]&#62;}g; # can't have]]> in CDATA
        $text=  "<![CDATA[$text]]>";
      }
    return $text;
  }


package HTML::TreeBuilder::XPath::TextNode;

use base 'HTML::TreeBuilder::XPath::Node';

sub getParentNode { return shift->{_parent};    }
sub getValue      { return shift->{_content};   }
sub isTextNode    { return 1;                   }
sub getAttributes { return wantarray ? () : []; }

# similar to HTML::Element as_XML
sub as_XML
  { my( $node, $entities)= @_;
    my $content= $node->{_content};
    if( $node->{_parent} && $node->{_parent}->{_tag} eq 'script')
      { $content=~ s{(&\w+;)}{HTML::Entities::decode($1)}eg; }
    else
      { $content= HTML::Element::_xml_escape_text($content); }
    return $content;
  }
*as_XML_compact  = *as_XML;
*as_XML_indented = *as_XML;


sub getPreviousSibling
  { my $self= shift;
    my $rank= $self->{_rank}; 
    #unless( defined $self->{_rank})
    #  { warn "no rank for text node $self->{_content}, parent is $self->{_parent}->{_tag}\n"; }
    my $parent= $self->{_parent};
    return $rank ? $parent->_child_as_object( $parent->{_content}->[$rank-1], $rank-1) : undef;
  }

sub getNextSibling
  { my $self= shift;
    my $rank= $self->{_rank};
    #unless( defined $self->{_rank})
    #  { warn "no rank for text node $self->{_content}, parent is $self->{_parent}->{_tag}\n"; }
    my $parent= $self->{_parent};
    my $next_sibling= $parent->{_content}->[$rank+1];
    return defined( $next_sibling) ? $parent->_child_as_object( $next_sibling, $rank+1) : undef;
  }

sub getRootNode
  { return shift->{_parent}->getRootNode; }

sub string_value { return shift->{_content}; }

# added to provide element-like methods to text nodes, for use by cmp
sub lineage 
  { my( $node)= @_;
    my $parent= $node->{_parent};



( run in 2.750 seconds using v1.01-cache-2.11-cpan-524268b4103 )