YAX
view release on metacpan or search on metacpan
lib/YAX/Parser.pm view on Meta::CPAN
# Slurp the whole of $file and return its contents as a single string.
#
# Parameters:
#   $self - invocant (not used)
#   $file - path of the file to read
#
# Returns the file contents. When the file cannot be opened a bare
# `return` is issued: undef in scalar context, the empty list in list
# context.
sub read_file {
    my ( $self, $file ) = @_;

    # Three-argument open with a lexical handle: $file is always taken as a
    # literal path, so a name that starts with '>' or ends with '|' can no
    # longer be interpreted as an open mode or a command, and no
    # package-global handle is shared between calls.
    open my $fh, '<', $file or return;

    # With the record separator undefined a single read yields everything.
    local $/ = undef;
    my $xstr = <$fh>;
    close $fh;

    return $xstr;
}
# Read $file through read_file() and hand the result to parse().
#
# read_file() is evaluated in list context, so whatever list it yields
# (including an empty one) becomes the argument list of parse(). The
# return value of parse() is passed back unchanged.
sub parse_file {
    my ( $self, $file ) = @_;
    my @content = $self->read_file( $file );
    return $self->parse( @content );
}
# Read $file through read_file() and stream-parse its contents.
#
# Parameters:
#   $self  - invocant providing read_file() and stream()
#   $file  - path of the file to read
#   $state - caller-supplied state value, forwarded to stream() as is
#   %subs  - handler table, forwarded to stream() as is
#
# Returns whatever stream() returns.
sub stream_file {
    my ( $self, $file, $state, %subs ) = @_;

    # Scalar assignment on purpose: read_file() issues a bare `return` when
    # the file cannot be opened, which is an empty list in list context.
    # Calling it inline in the argument list would then shift $state into
    # the position of the XML string. Here an unreadable file arrives at
    # stream() as undef with every other argument in its proper place.
    my $xstr = $self->read_file( $file );

    return $self->stream( $xstr, $state, %subs );
}
# Parse $xstr as a fragment rather than as a complete document.
#
# The input is wrapped in a synthetic <yax:frag> element, parsed with
# parse(), and every child of the resulting root is handed to a new
# YAX::Fragment, which is returned.
sub parse_as_fragment {
    my ( $self, $xstr ) = @_;

    my $doc      = $self->parse( "<yax:frag>${xstr}</yax:frag>" );
    my $wrapper  = $doc->root;
    my $fragment = YAX::Fragment->new;

    # Always hands over the first remaining child; the loop therefore only
    # ends if append() removes the node from @$wrapper -- presumably it
    # re-parents the node; confirm in YAX::Fragment.
    while ( @$wrapper ) {
        $fragment->append( $wrapper->[0] );
    }

    return $fragment;
}
# Read $file with read_file() and parse its contents with
# parse_as_fragment(). Both calls are made in scalar context and the
# single resulting value is returned.
sub parse_file_as_fragment {
    my ( $self, $file ) = @_;
    return scalar $self->parse_as_fragment( scalar $self->read_file( $file ) );
}
# Shallow-parse $xstr by matching it globally against $XML_SPE (defined
# elsewhere in this file).
#
# The match expression is returned directly, so the result follows the
# caller's context: in list context every match of the pattern is returned,
# in scalar context only the success of a single match.
sub tokenize {
    my ( undef, $xml ) = @_;
    return $xml =~ m{$XML_SPE}g;
}
# Build a node for a "<!...>" markup declaration and attach it to $parent.
#
# Parameters:
#   $self   - invocant providing _mk_node()
#   $decl   - the complete markup text, delimiters included
#   $parent - container the new node is appended to
#
# Recognised forms and the text stored as the node's data:
#   <!-- ... -->        COMMENT_NODE        text between '<!--' and '-->'
#   <![CDATA[ ... ]]>   CDATA_SECTION_NODE  text between '<![CDATA[' and ']]>'
#   <!DOCTYPE ... >     DOCUMENT_TYPE_NODE  text after '<!DOCTYPE' plus one
#                                           separator character, up to '>'
#
# Anything else keeps the defaults: name and type stay undefined and the
# data is everything after the leading '<'.
#
# Returns the node created by _mk_node().
sub _mk_decl {
    my ( $self, $decl, $parent ) = @_;
    my ( $type, $name );

    # Defaults for markup matching none of the prefixes below.
    my $offset = 1;
    my $length = length( $decl );

    if ( substr( $decl, 0, 4 ) eq '<!--' ) {
        $offset = 4;
        $length = $length - $offset - 3;    # drop the trailing '-->'
        $type   = COMMENT_NODE;
        $name   = '#comment';
    }
    elsif ( substr( $decl, 0, 9 ) eq '<![CDATA[' ) {
        $offset = 9;
        $length = $length - $offset - 3;    # drop the trailing ']]>'
        $type   = CDATA_SECTION_NODE;
        $name   = '#cdata';
    }
    elsif ( substr( $decl, 0, 9 ) eq '<!DOCTYPE' ) {
        # Offset 10 skips '<!DOCTYPE' and the character that follows it.
        $offset = 10;

        # A doctype declaration is closed by a single '>', so exactly one
        # trailing character is dropped. Dropping three, as for comments
        # and CDATA sections, would cut off the last two characters of the
        # declaration body ('<!DOCTYPE html>' would yield 'ht').
        $length = $length - $offset - 1;
        $type   = DOCUMENT_TYPE_NODE;
        $name   = "#document-type";
    }

    return $self->_mk_node(
        $name, $type, substr( $decl, $offset, $length ), $parent
    );
}
# Build a processing-instruction node from "<?target data?>" markup and
# attach it to $parent.
#
# Parameters:
#   $self   - invocant providing _mk_node()
#   $text   - the complete processing instruction, delimiters included
#   $parent - container the new node is appended to
#
# The node's name is the instruction target and its data is the text that
# follows the target, with surrounding whitespace removed. If $text does
# not look like a processing instruction at all, name and data are both
# undefined.
#
# Returns the node created by _mk_node().
sub _mk_proc {
    my ( $self, $text, $parent ) = @_;

    # Compared with a pattern that insists on "target, whitespace, data":
    #  - the data part is optional, so '<?target?>' still yields its name;
    #  - /s lets the data span several lines;
    #  - '.' and ':' are accepted in the target, as XML names allow them.
    my ( $name, $data ) = $text =~ m{
        \A <\?
        ( [a-zA-Z0-9_.:-]+ )
        (?: \s+ (.*?) )?
        \s* \?>
    }xs;

    # A well-formed instruction without data carries an empty data string
    # rather than undef.
    $data = '' if defined $name && !defined $data;

    return $self->_mk_node(
        $name, PROCESSING_INSTRUCTION_NODE, $data, $parent
    );
}
# Create a YAX::Node from ( $name, $type, $data ), append it to the
# @$parent list, record $parent on the node through its parent() method,
# and return the node.
sub _mk_node {
    my $self = shift;
    my ( $name, $type, $data, $parent ) = @_;

    my $created = YAX::Node->new( $name, $type, $data );
    push @{ $parent }, $created;
    $created->parent( $parent );

    return $created;
}
# Create a YAX::Text holding $text, append it to the @$parent list, record
# $parent on the node through its parent() method, and return the node.
sub _mk_text {
    my $self = shift;
    my ( $text, $parent ) = @_;

    my $created = YAX::Text->new( $text );
    push @{ $parent }, $created;
    $created->parent( $parent );

    return $created;
}
# Build a YAX::Element from a tag such as '<name att="v">' or '<name/>'
# and attach it to $parent.
#
# The first and last characters of $elmt are discarded, the remainder is
# split at the first run of whitespace into a name and an attribute
# string, and one trailing '/' is removed from the name. A true attribute
# string is decoded with parse_attributes(); otherwise the element gets no
# attributes. The element is appended to @$parent, told about its parent,
# and returned.
sub _mk_elmt {
    my ( $self, $elmt, $parent ) = @_;

    my $inner = substr $elmt, 1, -1;
    my ( $name, $attr_src ) = split /\s+/, $inner, 2;
    $name =~ s{/$}{};

    my %atts;
    %atts = $self->parse_attributes( $attr_src ) if $attr_src;

    my $element = YAX::Element->new( $name, %atts );
    push @{ $parent }, $element;
    $element->parent( $parent );

    return $element;
}
# Decode an attribute string into a flat name => value list.
#
# $atts is scanned repeatedly with $ElemTagCE2 (defined elsewhere in this
# file). For each match the first capture is the key; the value is the
# second capture when that is defined and the third otherwise --
# presumably the two quoting styles, confirm against $ElemTagCE2. A name
# that occurs more than once keeps its last value.
sub parse_attributes {
    my ( $self, $atts ) = @_;

    my %value_of;
    while ( $atts =~ m{$ElemTagCE2}g ) {
        my ( $key, $first, $second ) = ( $1, $2, $3 );
        if ( defined $first ) {
            $value_of{ $key } = $first;
        }
        else {
            $value_of{ $key } = $second;
        }
    }

    return %value_of;
}
1;
__END__
=head1 NAME
YAX::Parser - fast pure Perl tree and stream parser
=head1 SYNOPSIS
use YAX::Parser;
my $xml_str = <<XML;
<?xml version="1.0" ?>
<doc>
<content id="42"><![CDATA[
This is a cdata section, so >>anything goes!<<
]]>
</content>
<!-- comments are nodes too -->
</doc>
XML
# tree parse - the common case
my $xml_doc = YAX::Parser->parse( $xml_str );
my $xml_doc = YAX::Parser->parse_file( $path );
# shallow parse
my @tokens = YAX::Parser->tokenize( $xml_str );
# stream parse
YAX::Parser->stream( $xml_str, $state, %handlers );
YAX::Parser->stream_file( '/some/file.xml', $state, %handlers );
=head1 DESCRIPTION
This module implements a fast DOM and stream parser based on Robert D. Cameron's
regular expression shallow parsing grammar and technique. It doesn't implement
the full W3C DOM API by design. Instead, it takes a more pragmatic approach. DOM
trees are constructed with everything being an object except for attributes, which
are stored as a hash reference.
We also borrow some ideas from browser implementations, in particular, nodes are
keyed in a table in the document on their C<id> attributes (if present) so you can
say:
my $found = $xml_doc->get( $node_id );
Parsing is usually done by calling class methods on YAX::Parser, which,
if invoked as a tree parser, returns an instance of L<YAX::Document>:
my $xml_doc = YAX::Parser->parse( $xml_str );
=head1 METHODS
See the L</SYNOPSIS> for usage examples; here's just the list for now:
=over 4
=item parse( $xml_str )
Parse $xml_str and return a L<YAX::Document> object.
=item parse_file( $path )
Same as above, but reads the file at $path for the input.
=item stream( $xml_str, $state, %handlers )
Although not its main focus, YAX::Parser also provides for stream
parsing. It tries to be a bit more sane than Expat, in that it allows
you to specify a state holder which can be anything and is passed as
the first argument to the handler functions. A typical case is to
use a hash reference with a stack (for tracking nesting):
my $state = { stack => [ ] };
( run in 2.817 seconds using v1.01-cache-2.11-cpan-437f7b0c052 )