YAX

 view release on metacpan or  search on metacpan

lib/YAX/Parser.pm  view on Meta::CPAN


# Slurp the whole of $file and return its contents as one string.
#
# Returns nothing (undef in scalar context, the empty list in list context)
# when the file cannot be opened.
sub read_file {
    my ( $self, $file ) = @_;

    # Three-argument open with an explicit read mode and a lexical handle:
    # the file name is never interpreted as a mode or a pipe, and no
    # package-wide bareword handle is clobbered.
    open my $fh, '<', $file or return;

    # Undefine the input record separator for the rest of this sub only, so
    # that a single read returns the entire file.
    local $/ = undef;
    my $xstr = <$fh>;
    close $fh;

    return $xstr;
}

# Read the file named by $file via read_file() and pass the result on to
# parse(), returning whatever parse() returns.
sub parse_file {
    my ( $self, $file ) = @_;

    # Collected as a list so that parse() receives exactly the arguments
    # read_file() produced.
    my @source = $self->read_file( $file );

    return $self->parse( @source );
}

# Read the file named by $file and stream-parse its contents by delegating
# to stream() with the given $state and handler pairs (%subs).
#
# Returns whatever stream() returns, or nothing when read_file() yields no
# content (for example because the file could not be opened).
sub stream_file {
    my ( $self, $file, $state, %subs ) = @_;

    # Scalar assignment on purpose: read_file() returns an empty list on
    # failure, which would otherwise vanish from the argument list below and
    # shift $state into the XML-string position of stream().
    my $xstr = $self->read_file( $file );
    return unless defined $xstr;

    return $self->stream( $xstr, $state, %subs );
}

# Parse $xstr as a fragment: the text is wrapped in a temporary <yax:frag>
# element, parsed with parse(), and the children of the resulting root are
# handed one by one to a new YAX::Fragment, which is returned.
sub parse_as_fragment {
    my ( $self, $xstr ) = @_;

    my $wrapped  = '<yax:frag>' . $xstr . '</yax:frag>';
    my $document = $self->parse( $wrapped );
    my $wrapper  = $document->root;
    my $fragment = YAX::Fragment->new;

    # Always append the first remaining child; the loop only ends once the
    # wrapper is empty, so it relies on append() removing the node from it.
    while ( @$wrapper ) {
        $fragment->append( $wrapper->[0] );
    }

    return $fragment;
}

# Read the file named by $file and parse its contents with
# parse_as_fragment(), returning that method's (scalar) result.
sub parse_file_as_fragment {
    my ( $self, $file ) = @_;

    # Both calls are forced into scalar context, so exactly one value is
    # passed along and exactly one value is returned.
    return scalar $self->parse_as_fragment(
        scalar $self->read_file( $file )
    );
}

# Shallow-parse $xstr by matching it globally against $XML_SPE.
#
# The match is evaluated in the caller's context: in list context a global
# match returns everything it matched, in scalar context it returns only
# whether a match was found.
sub tokenize {
    my ( undef, $xstr ) = @_;    # the invocant is not used
    return $xstr =~ /$XML_SPE/g;
}

# Build a node for a "<!..." token ($decl) and attach it to $parent via
# _mk_node(), returning the new node.
#
# Three forms are recognised by their prefix:
#   <!-- ... -->        COMMENT_NODE        named '#comment'
#   <![CDATA[ ... ]]>   CDATA_SECTION_NODE  named '#cdata'
#   <!DOCTYPE ... >     DOCUMENT_TYPE_NODE  named '#document-type'
# The node data is the token with its opening and closing delimiters cut
# off. Any other token is passed on with an undefined name and type and
# everything after the leading '<' as data.
sub _mk_decl {
    my ( $self, $decl, $parent ) = @_;
    my ( $type, $name );
    my $offset = 1;
    my $length = length( $decl );

    if ( substr( $decl, 0, 4 ) eq '<!--' ) {
        $offset = 4;
        $length = $length - $offset - 3;    # drop the trailing '-->'
        $type   = COMMENT_NODE;
        $name   = '#comment';
    }
    elsif ( substr( $decl, 0, 9 ) eq '<![CDATA[' ) {
        $offset = 9;
        $length = $length - $offset - 3;    # drop the trailing ']]>'
        $type   = CDATA_SECTION_NODE;
        $name   = '#cdata';
    }
    elsif ( substr( $decl, 0, 9 ) eq '<!DOCTYPE' ) {
        # Skip '<!DOCTYPE' plus the one character that follows it
        # (presumably the separating whitespace).
        $offset = 10;

        # A DOCTYPE is closed by a single '>', not by a three-character
        # terminator; subtracting 3 here would cut off the last two
        # characters of the declaration.
        $length = $length - $offset - 1;
        $type   = DOCUMENT_TYPE_NODE;
        $name   = "#document-type";
    }

    return $self->_mk_node(
        $name, $type, substr( $decl, $offset, $length ), $parent
    );
}

# Build a PROCESSING_INSTRUCTION_NODE from a "<?target data?>" token ($text)
# and attach it to $parent via _mk_node(), returning the new node.
#
# The node name is the target; the node data is the text between the target
# and the closing '?>' with surrounding whitespace removed. If $text does
# not match the pattern at all, name and data are both passed on undefined.
sub _mk_proc {
    my ( $self, $text, $parent ) = @_;

    # /s lets the data span several lines, and the data part is optional so
    # that an instruction consisting of a target only ("<?target?>") still
    # yields its name instead of failing to match.
    my ( $name, $data )
        = ( $text =~ /^<\?([a-zA-Z0-9_-]+)(?:\s+(.*?))?\s*\?>/s );

    # A recognised target without data carries empty data, not undef.
    $data = '' if defined $name && !defined $data;

    return $self->_mk_node(
        $name, PROCESSING_INSTRUCTION_NODE, $data, $parent
    );
}

# Create a YAX::Node from ( $name, $type, $data ), push it onto $parent and
# record $parent as the node's parent. Returns the new node.
sub _mk_node {
    my $self = shift;
    my ( $name, $type, $data, $parent ) = @_;

    my $created = YAX::Node->new( $name, $type, $data );

    # Link in both directions: parent -> child, then child -> parent.
    push @{$parent}, $created;
    $created->parent( $parent );

    return $created;
}

# Create a YAX::Text node holding $text, push it onto $parent and record
# $parent as the node's parent. Returns the new node.
sub _mk_text {
    my $self = shift;
    my ( $text, $parent ) = @_;

    my $created = YAX::Text->new( $text );

    # Link in both directions: parent -> child, then child -> parent.
    push @{$parent}, $created;
    $created->parent( $parent );

    return $created;
}

# Build a YAX::Element from a tag token ($elmt), push it onto $parent and
# record $parent as the element's parent. Returns the new element.
#
# The first and last characters of the token are dropped; what remains is
# split at the first run of whitespace into the element name and the
# attribute text, which is parsed with parse_attributes().
sub _mk_elmt {
    my ( $self, $elmt, $parent ) = @_;

    my ( $name, $atts ) = split /\s+/, substr( $elmt, 1, -1 ), 2;

    # A trailing slash can still be glued to the name (e.g. "<br/>").
    $name =~ s/\/$//;

    my %atts;
    if ( $atts ) {
        %atts = $self->parse_attributes( $atts );
    }

    my $element = YAX::Element->new( $name, %atts );
    push @{$parent}, $element;
    $element->parent( $parent );

    return $element;
}

# Turn attribute text ($atts) into a hash of name/value pairs by matching
# it repeatedly against $ElemTagCE2.
#
# For each match the first capture is the key; the value is the second
# capture when it is defined and the third capture otherwise (presumably
# the two captures correspond to the two quoting styles -- confirm against
# the definition of $ElemTagCE2).
sub parse_attributes {
    my ( $self, $atts ) = @_;

    my %parsed;
    while ( $atts =~ /$ElemTagCE2/g ) {
        my ( $key, $first, $second ) = ( $1, $2, $3 );
        if ( defined $first ) {
            $parsed{$key} = $first;
        }
        else {
            $parsed{$key} = $second;
        }
    }

    return %parsed;
}

1;

__END__

=head1 NAME

YAX::Parser - fast pure Perl tree and stream parser

=head1 SYNOPSIS

 use YAX::Parser;

 my $xml_str = <<XML;
   <?xml version="1.0" ?>
   <doc>
     <content id="42"><![CDATA[
        This is a cdata section, so >>anything goes!<<
     ]]>
     </content>
     <!-- comments are nodes too -->
   </doc>
 XML

 # tree parse - the common case
 my $xml_doc = YAX::Parser->parse( $xml_str );
 my $xml_doc = YAX::Parser->parse_file( $path );

 # shallow parse
 my @tokens = YAX::Parser->tokenize( $xml_str );

 # stream parse 
 YAX::Parser->stream( $xml_str, $state, %handlers );
 YAX::Parser->stream_file( '/some/file.xml', $state, %handlers );
 
=head1 DESCRIPTION

This module implements a fast DOM and stream parser based on Robert D. Cameron's
regular expression shallow parsing grammar and technique. It doesn't implement
the full W3C DOM API by design. Instead, it takes a more pragmatic approach. DOM
trees are constructed with everything being an object except for attributes, which
are stored as a hash reference.

We also borrow some ideas from browser implementations, in particular, nodes are
keyed in a table in the document on their C<id> attributes (if present) so you can
say:

 my $found = $xml_doc->get( $node_id );

Parsing is usually done by calling class methods on YAX::Parser, which,
if invoked as a tree parser, returns an instance of L<YAX::Document>:

 my $xml_doc = YAX::Parser->parse( $xml_str );

=head1 METHODS

See the L</SYNOPSIS> for usage examples; here's just the list for now:

=over 4

=item parse( $xml_str )

Parse $xml_str and return a L<YAX::Document> object.

=item parse_file( $path )

Same as above, but reads the file at $path for the input.

=item stream( $xml_str, $state, %handlers )

Although not its main focus, YAX::Parser also provides for stream
parsing. It tries to be a bit more sane than Expat, in that it allows
you to specify a state holder which can be anything and is passed as
the first argument to the handler functions. A typical case is to
use a hash reference with a stack (for tracking nesting):

 my $state = { stack => [ ] };



( run in 2.817 seconds using v1.01-cache-2.11-cpan-437f7b0c052 )