XML-TreeBuilder

 view release on metacpan or  search on metacpan

lib/XML/TreeBuilder.pm  view on Meta::CPAN

require 5;

package XML::TreeBuilder;

use warnings;
use strict;
use XML::Element ();
use XML::Parser  ();
use Carp;
use IO::File;
use XML::Catalog 1.02;
use File::Basename;
use File::Spec;
use vars qw(@ISA $VERSION);

$VERSION = '5.4';
@ISA     = ('XML::Element');

#==========================================================================
sub new {
    my ( $this, $arg ) = @_;
    my $class = ref($this) || $this;

    if ( $arg && ( ref($arg) ne 'HASH' ) ) {
        croak(
            q|new expects an anonymous hash, $t->new( { NoExpand => 1, ErrorContext => 2 } ), for it's parameters, not a |
                . ref($arg) );
    }

    my $NoExpand     = ( delete $arg->{NoExpand}     || undef );
    my $ErrorContext = ( delete $arg->{ErrorContext} || undef );
    my $catalog
        = (    delete $arg->{catalog}
            || $ENV{XML_CATALOG_FILES}
            || '/etc/xml/catalog' );
    my $debug = ( delete $arg->{debug} || undef );

    if ( %{$arg} ) {
        croak "unknown args: " . join( ", ", keys %{$arg} );
    }

    my $self = XML::Element->new('NIL');
    bless $self, $class;    # and rebless
    $self->{_element_class}      = 'XML::Element';
    $self->{_store_comments}     = 0;
    $self->{_store_pis}          = 0;
    $self->{_store_declarations} = 0;
    $self->{_store_cdata}        = 0;

    # have to let HTML::Element know there are encoded entities
    $XML::Element::encoded_content = $NoExpand if ($NoExpand);

    my @stack;

 # Compare the simplicity of this to the sheer nastiness of HTML::TreeBuilder!

    $self->{_xml_parser} = XML::Parser->new(
        Handlers => {
            Default => sub {

                # Stuff unexpanded entities back on to the stack as is.
                if ( ($NoExpand) && ( $_[1] =~ /&[^\;]+\;/ ) ) {
                    $stack[-1]->push_content( $_[1] );
                }
                return;
            },

            Start => sub {
                my $xp  = shift;
                my $str = $xp->original_string();
                if (@stack) {
                    my @args;
                    my $tag = shift(@_);
                    while (@_) {
                        my ( $attr, $val ) = splice( @_, 0, 2 );
## BUGBUG This dirty hack is because the $val from XML::Parser isn't correct when $NoExpand is set ... can we fix it?
## any entity in an attribute is lost
## given <doc id="this-&FOO;-attr"> $val is "this--attr" not "this-&FOO;-attr"
                        if ( $NoExpand && $str =~ /\s$attr="([^"]*\&[^"]*)"/ )
                        {
                            $val = $1;
                        }
                        push( @args, $attr, $val );
                    }

                    unshift( @args, $tag );
                    push @stack, $self->{_element_class}->new(@args);
                    $stack[-2]->push_content( $stack[-1] );
                }
                else {
                    $self->tag(shift);
                    while (@_) {
                        my ( $attr, $val ) = splice( @_, 0, 2 );
## BUGBUG This dirty hack is because the $val from XML::Parser isn't correct when $NoExpand is set ... can we fix it?
## any entity in an attribute is lost
## given <doc id="this-&FOO;-attr"> $val is "this--attr" not "this-&FOO;-attr"
                        if ( $NoExpand && $str =~ /\s$attr="([^"]*\&[^"]*)"/ )
                        {
                            $val = $1;
                        }
                        $self->attr( $attr, $val );
                    }
                    push @stack, $self;
                }
            },

            End => sub { pop @stack; return },

lib/XML/TreeBuilder.pm  view on Meta::CPAN

                shift;
                ( @stack ? $stack[-1] : $self )->push_content(
                    $self->{_element_class}->new(
                        '~declaration',
                        'text' => join ' ',
                        'ATTLIST', @_
                    )
                );
                return;
            },

            Element => sub {
                return unless $self->{_store_declarations};
                shift;
                ( @stack ? $stack[-1] : $self )->push_content(
                    $self->{_element_class}->new(
                        '~declaration',
                        'text' => join ' ',
                        'ELEMENT', @_
                    )
                );
                return;
            },

            Doctype => sub {
                return unless $self->{_store_declarations};
                shift;
                ## Need this because different types set different array entries.
                no warnings 'uninitialized';
                ( @stack ? $stack[-1] : $self )->push_content(
                    $self->{_element_class}->new(
                        '~declaration',
                        'text' => join( ' ', ( 'DOCTYPE', @_ ) ),
                        type   => 'DOCTYPE',
                        mytag  => $_[0],
                        uri    => $_[1],
                        pid    => $_[2],
                    )
                );
                return;
            },

            Entity => sub {
                return unless $self->{_store_declarations};
                shift;
                ## Need this because different entity types set different array entries.
                no warnings 'uninitialized';
                ( @stack ? $stack[-1] : $self )->push_content(
                    $self->{_element_class}->new(
                        '~declaration',
                        'text' => join( ' ', ( 'ENTITY', @_ ) ),
                        type   => 'ENTITY',
                        name   => $_[0],
                        value  => $_[1],
                    )
                );
                return;
            },

            CdataStart => sub {
                return unless $self->{_store_cdata};
                shift;
                push @stack,
                    $self->{_element_class}->new( '~cdata', 'text' => $_[1] );
                $stack[-2]->push_content( $stack[-1] );
                return;
            },

            CdataEnd => sub {
                return unless $self->{_store_cdata};
                pop @stack;
                return;
            },

            ExternEnt => sub {
                return if ($NoExpand);
                my $xp = shift;
                my ( $base, $sysid, $pubid ) = @_;
                my $file = "$sysid";

                if ( $sysid =~ /^http:/ ) {
## BUGBUG need to catch when there is no local file
                    my $cat = XML::Catalog->new($catalog);
                    $file = $cat->resolve_public($pubid);
                    croak("Can't resolve '$pubid'")
                        if ( !defined($file) || $file eq '' );
                    $file =~ s/^file:\/\///;
                    my ( $filename, $directories, $suffix )
                        = fileparse($file);
                    $base = $directories;
                }
                else {
                    $sysid =~ s/^file:\/\/// if ( $sysid =~ /^file:/ );

                    if ( File::Spec->file_name_is_absolute($sysid) ) {
                        my ( $filename, $directories, $suffix )
                            = fileparse($sysid);
                        $base = $directories;
                    }
                    else {
                        my ( $filename, $directories, $suffix )
                            = fileparse($base);
                        $file = File::Spec->rel2abs( $sysid, $directories );
                    }
                }
                my $fh = new IO::File( $file, "r" );
                croak "$!" unless $fh;
                $xp->{_BaseStack} ||= [];
                $xp->{_FhStack}   ||= [];

                push( @{ $xp->{_BaseStack} }, $base );
                push( @{ $xp->{_FhStack} },   $fh );

                $xp->base($base);
                return ($fh);
            },

            ExternEntFin => sub {
                return if ($NoExpand);
                my ($xp) = shift;

                my $fh = pop( @{ $xp->{_FhStack} } );
                $fh->close if ($fh);

                my $base = pop( @{ $xp->{_BaseStack} } );
                $xp->base($base) if ($base);
                return;
            },

        },
        NoExpand      => $NoExpand,
        ErrorContext  => $ErrorContext,
        ParseParamEnt => !$NoExpand,
        NoLWP         => 0,
    );

    return $self;
}

#==========================================================================
sub _elem    # universal accessor...
{
    my ( $self, $elem, $val ) = @_;
    my $old = $self->{$elem};
    $self->{$elem} = $val if defined $val;
    return $old;
}

sub store_comments     { shift->_elem( '_store_comments',     @_ ); }
sub store_declarations { shift->_elem( '_store_declarations', @_ ); }
sub store_pis          { shift->_elem( '_store_pis',          @_ ); }
sub store_cdata        { shift->_elem( '_store_cdata',        @_ ); }

#==========================================================================

sub parse {
    shift->{_xml_parser}->parse(@_);
}

sub parse_file { shift->parsefile(@_) }    # alias

sub parsefile {
    shift->{_xml_parser}->parsefile(@_);
}

sub eof {
    delete shift->{_xml_parser};           # sure, why not?
}

#==========================================================================
1;

__END__


=head1 NAME

XML::TreeBuilder - Parser that builds a tree of XML::Element objects

=head1 SYNOPSIS

  foreach my $file_name (@ARGV) {
    my $tree = XML::TreeBuilder->new({ 'NoExpand' => 0, 'ErrorContext' => 0 }); # empty tree
    $tree->parse_file($file_name);
    print "Hey, here's a dump of the parse tree of $file_name:\n";
    $tree->dump; # a method we inherit from XML::Element
    print "And here it is, bizarrely rerendered as XML:\n",
      $tree->as_XML, "\n";
    
    # Now that we're done with it, we must destroy it.
    $tree = $tree->delete;
  }

=head1 DESCRIPTION

This module uses XML::Parser to make XML document trees constructed of
XML::Element objects (and XML::Element is a subclass of HTML::Element
adapted for XML).  XML::TreeBuilder is meant particularly for people
who are used to the HTML::TreeBuilder / HTML::Element interface to
document trees, and who don't want to learn some other document
interface like XML::Twig or XML::DOM.

The way to use this class is to:

1. start a new (empty) XML::TreeBuilder object.

2. set any of the "store" options you want.

3. then parse the document from a source by calling
C<$x-E<gt>parsefile(...)>
or
C<$x-E<gt>parse(...)> (See L<XML::Parser> docs for the options

lib/XML/TreeBuilder.pm  view on Meta::CPAN

=item $root = XML::TreeBuilder->new()

Construct a new XML::TreeBuilder object.

Parameters:

=over

=item NoExpand

    Passed to XML::Parser. Do not Expand external entities.
    Default: undef

=item ErrorContext

    Passed to XML::Parser. Number of context lines to generate on errors.
    Default: undef

=item catalog

    Path to an Oasis XML catalog. Passed to XML::Catalog to resolve entities if NoExpand is not set.
    Default: $ENV{XML_CATALOG_FILES} || '/etc/xml/catalog'

=back

=item $root->eof

Deletes parser object.

=item $root->parse(...options...)

Uses XML::Parser's C<parse> method to parse XML from the source(s?)
specified by the options.  See L<XML::Parse>

=item $root->parsefile(...options...)

Uses XML::Parser's C<parsefile> method to parse XML from the source(s?)
specified by the options.  See L<XML::Parse>

=item $root->parse_file(...options...)

Simply an alias for C<parsefile>.

=item $root->store_comments(value)

This determines whether TreeBuilder will normally store comments found
while parsing content into C<$root>.  Currently, this is off by default.

=item $root->store_declarations(value)

This determines whether TreeBuilder will normally store markup
declarations found while parsing content into C<$root>.  Currently,
this is off by default.

=item $root->store_pis(value)

This determines whether TreeBuilder will normally store processing
instructions found while parsing content into C<$root>.
Currently, this is off (false) by default.

=item $root->store_cdata(value)

This determines whether TreeBuilder will normally store CDATA
sectitons found while parsing content into C<$root>. Adds a ~cdata node.

Currently, this is off (false) by default.

=back

=head1 SEE ALSO

L<XML::Parser>, L<XML::Element>, L<HTML::TreeBuilder>, L<HTML::DOMbo>.

And for alternate XML document interfaces, L<XML::DOM> and L<XML::Twig>.


=head1 COPYRIGHT AND DISCLAIMERS

Copyright (c) 2000,2004 Sean M. Burke.  All rights reserved.
Copyright (c) 2010,2011,2013 Jeff Fearn. All rights reserved.

This library is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.

This program is distributed in the hope that it will be useful, but
without any warranty; without even the implied warranty of
merchantability or fitness for a particular purpose.


=head1 AUTHOR

Current Author:
	Jeff Fearn E<lt>jfearn@cpan.orgE<gt>.

Former Authors:
	Sean M. Burke, E<lt>sburke@cpan.orgE<gt>

=cut



( run in 2.081 seconds using v1.01-cache-2.11-cpan-39bf76dae61 )