XML-Parser-Lite-Tree
view release on metacpan or search on metacpan
lib/XML/Parser/LiteCopy.pm view on Meta::CPAN
my %parameters = @_;
$self->setHandlers(); # clear first
$self->setHandlers(%{$parameters{Handlers} || {}});
$ReturnErrors = $parameters{ReturnErrors} || 0;
return $self;
}
sub setHandlers {
my $self = shift;
# allow symbolic refs, avoid "subroutine redefined" warnings
no strict 'refs'; local $^W;
# clear all handlers if called without parameters
if (not @_) {
for (qw(Start End Char Final Init CData Comment Doctype PI Error)) {
*$_ = sub {}
}
}
# we could use each here, too...
while (@_) {
my($name, $func) = splice(@_, 0, 2);
*$name = defined $func
? $func
: sub {}
}
return $self;
}
sub _regexp {
my $patch = shift || '';
my $package = __PACKAGE__;
# This parser is based on "shallow parser" http://www.cs.sfu.ca/~cameron/REX.html
# Robert D. Cameron "REX: XML Shallow Parsing with Regular Expressions",
# Technical Report TR 1998-17, School of Computing Science, Simon Fraser University, November, 1998.
# Copyright (c) 1998, Robert D. Cameron.
# The following code may be freely used and distributed provided that
# this copyright and citation notice remains intact and that modifications
# or additions are clearly identified.
use re 'eval';
my $TextSE = "[^<]+";
# the following backrefs have been added:
# 1 : TextSE
# 2 : MarkupSPE / DeclCE / CommentCE
# 3 : MarkupSPE / DeclCE / CDATA_CE
# 4 : MarkupSPE / DeclCE / DocTypeCE
# 5 : MarkupSPE / PI_CE
# 6 : MarkupSPE / EndTagCE
# 7+: MarkupSPE / ElemTagCE
my $Until2Hyphens = "(?:[^-]*)-(?:[^-]+-)*-";
my $CommentCE = "($Until2Hyphens)(?{${package}::comment(\$2)})>?";
my $UntilRSBs = "[^\\]]*](?:[^\\]]+])*]+";
my $CDATA_CE = "($UntilRSBs(?:[^\\]>]$UntilRSBs)*)(?{${package}::cdata(\$3)})>";
my $S = "[ \\n\\t\\r]+";
my $NameStrt = "[A-Za-z_:]|[^\\x00-\\x7F]";
my $NameChar = "[A-Za-z0-9_:.-]|[^\\x00-\\x7F]";
my $Name = "(?:$NameStrt)(?:$NameChar)*";
my $QuoteSE = "\"[^\"]*\"|'[^']*'";
my $DT_IdentSE = "$Name(?:$S(?:$Name|$QuoteSE))*";
my $MarkupDeclCE = "(?:[^\\]\"'><]+|$QuoteSE)*>";
my $S1 = "[\\n\\r\\t ]";
my $UntilQMs = "[^?]*\\?+";
my $PI_Tail = "\\?|$S1$UntilQMs(?:[^>?]$UntilQMs)*";
my $DT_ItemSE = "<(?:!(?:--$Until2Hyphens>|[^-]$MarkupDeclCE)|\\?$Name(?:$PI_Tail>))|%$Name;|$S";
my $DocTypeCE = "$S($DT_IdentSE(?:$S)?(?:\\[(?:$DT_ItemSE)*](?:$S)?)?)>(?{${package}::_doctype(\$4)})";
my $DeclCE = "--(?:$CommentCE)?|\\[CDATA\\[(?:$CDATA_CE)?|DOCTYPE(?:$DocTypeCE)?";
my $PI_CE = "($Name(?:$PI_Tail))>(?{${package}::_pi(\$5); undef})";
# these expressions were modified for backtracking and events
my $EndTagCE = "($Name)(?{${package}::_end(\$6); undef})(?:$S)?>";
my $AttValSE = "\"([^<\"]*)\"|'([^<']*)'";
my $ElemTagCE = "($Name)"
. "(?:$S($Name)(?:$S)?=(?:$S)?(?:$AttValSE)"
. "(?{[\@{\$^R||[]},\$8=>defined\$9?\$9:\$10]}))*(?:$S)?(/)?>"
. "(?{${package}::_start(\$7,\@{\$^R||[]}),\$^R=[]})(?{\$11 and ${package}::_end(\$7); undef})";
my $MarkupSPE = "<(?:!(?:$DeclCE)?|\\?(?:$PI_CE)?|/(?:$EndTagCE)?|(?:$ElemTagCE)?)";
# Next expression is under "black magic".
# Ideally it should be '($TextSE)(?{${package}::char(\$1)})|$MarkupSPE',
# but it doesn't work under Perl 5.005 and only magic with
# (?:....)?? solved the problem.
# I would appreciate if someone let me know what is the right thing to do
# and what's the reason for all this magic.
# Seems like a problem related to (?:....)? rather than to ?{} feature.
# Tests are in t/31-xmlparserlite.t if you decide to play with it.
#"(?{[]})(?:($TextSE)(?{${package}::_char(\$1)}))$patch|$MarkupSPE";
"(?:($TextSE)(?{${package}::_char(\$1)}))$patch|$MarkupSPE";
}
setHandlers();
# Try 5.6 and 5.10 regex first
my $REGEXP = _regexp('??');
sub _parse_re {
use re "eval";
undef $^R;
1 while $_[0] =~ m{$REGEXP}go
};
# fixup regex if it does not work...
{
if (not eval { _parse_re('<soap:foo xmlns:soap="foo">bar</soap:foo>'); 1; } ) {
$REGEXP = _regexp();
local $^W;
*_parse_re = sub {
lib/XML/Parser/LiteCopy.pm view on Meta::CPAN
undef $^R;
1 while $_[0] =~ m{$REGEXP}go
};
}
}
sub parse {
_init();
_parse_re($_[1]);
_final();
}
my(@stack, $level);
sub _init {
@stack = ();
$level = 0;
Init(__PACKAGE__, @_);
}
sub _final {
return _error("not properly closed tag '$stack[-1]'") if @stack;
return _error("no element found") unless $level;
Final(__PACKAGE__, @_)
}
sub _start {
return _error("multiple roots, wrong element '$_[0]'") if $level++ && !@stack;
push(@stack, $_[0]);
Start(__PACKAGE__, @_);
}
sub _char {
Char(__PACKAGE__, $_[0]), return if @stack;
# check for junk before or after element
# can't use split or regexp due to limitations in ?{} implementation,
# will iterate with loop, but we'll do it no more than two times, so
# it shouldn't affect performance
for (my $i=0; $i < length $_[0]; $i++) {
return _error("junk '$_[0]' @{[$level ? 'after' : 'before']} XML element")
if index("\n\r\t ", substr($_[0],$i,1)) < 0; # or should '< $[' be there
}
}
sub _end {
return _error("unexpected closing tag '$_[0]'") if !@stack;
pop(@stack) eq $_[0] or return _error("mismatched tag '$_[0]'");
End(__PACKAGE__, $_[0]);
}
sub comment {
Comment(__PACKAGE__, substr $_[0], 0, -2);
}
sub end {
pop(@stack) eq $_[0] or return _error("mismatched tag '$_[0]'");
End(__PACKAGE__, $_[0]);
}
sub cdata {
return _error("CDATA outside of tag stack") unless @stack;
CData(__PACKAGE__, substr $_[0], 0, -2);
}
sub _doctype {
Doctype(__PACKAGE__, $_[0]);
}
sub _pi {
PI(__PACKAGE__, substr $_[0], 0, -1);
}
sub _error {
if ($ReturnErrors){
Error(__PACKAGE__, $_[0]);
return;
}
die "$_[0]\n";
}
# ======================================================================
1;
__END__
=head1 NAME
XML::Parser::LiteCopy - Lightweight regexp-based XML parser
=head1 SYNOPSIS
use XML::Parser::LiteCopy;
$p1 = new XML::Parser::LiteCopy;
$p1->setHandlers(
Start => sub { shift; print "start: @_\n" },
Char => sub { shift; print "char: @_\n" },
End => sub { shift; print "end: @_\n" },
);
$p1->parse('<foo id="me">Hello World!</foo>');
$p2 = new XML::Parser::LiteCopy
Handlers => {
Start => sub { shift; print "start: @_\n" },
Char => sub { shift; print "char: @_\n" },
End => sub { shift; print "end: @_\n" },
}
;
$p2->parse('<foo id="me">Hello <bar>cruel</bar> World!</foo>');
=head1 DESCRIPTION
This Perl implements an XML parser with a interface similar to
XML::Parser. Though not all callbacks are supported, you should be able to
use it in the same way you use XML::Parser. Due to using experimantal regexp
features it'll work only on Perl 5.6 and above and may behave differently on
different platforms.
Note that you cannot use regular expressions or split in callbacks. This is
due to a limitation of perl's regular expression implementation (which is
( run in 0.543 second using v1.01-cache-2.11-cpan-140bd7fdf52 )