XML-Mini
view release on metacpan or search on metacpan
lib/XML/Mini/Document.pm view on Meta::CPAN
@res = Text::Balanced::extract_tagged($XMLString, undef, undef, undef, { 'ignore' => $ignore });
} else {
@res = Text::Balanced::extract_tagged($XMLString);
}
if ($#res == 5)
{
# We've extracted a balanced <tag>..</tag>
my $extracted = $res[0]; # the entire <t>..</t>
my $remainder = $res[1]; # stuff after the <t>..</t>HERE - 3
my $prefix = $res[3]; # the <t ...> itself - 1
my $contents = $res[4]; # the '..' between <t>..</t> - 2
my $suffix = $res[5]; # the </t>
#XML::Mini->Log("Grabbed prefix '$prefix'...");
my $newElement;
if ($prefix =~ m|<\s*([^\s>]+)\s*([^>]*)>|)
{
my $name = $1;
my $attribs = $2;
$newElement = $parentElement->createChild($name);
$self->_extractAttributesFromString($newElement, $attribs) if ($attribs);
$self->fromSubStringBT($newElement, $contents) if ($contents =~ m|\S|);
$self->fromSubStringBT($parentElement, $remainder) if ($remainder =~ m|\S|);
} else {
XML::Mini->Log("XML::Mini::Document::fromSubStringBT extracted balanced text from invalid tag '$prefix' - ignoring");
}
} else {
$XMLString =~ s/>\s*\n/>/gsm;
if ($XMLString =~ m/^\s*<\s*([^\s>]+)([^>]*>).*<\s*\/\1\s*>/osm)
{
# starts with a normal <tag> ... </tag> but has some ?? in it
my $startTag = $2;
return $self->fromSubStringBT($parentElement, $XMLString, 'USEIGNORE')
unless ($startTag =~ m|/\s*>$|);
}
# not a <tag>...</tag>
#it's either a
if ($XMLString =~ m/^\s*(<\s*([^\s>]+)([^>]+)\/\s*>| # <unary \/>
<\?\s*([^\s>]+)\s*([^>]*)\?>| # <? headers ?>
<!--(.+?)-->| # <!-- comments -->
<!\[CDATA\s*\[(.*?)\]\]\s*>\s*| # CDATA
<!DOCTYPE\s*([^\[>]*)(\[.*?\])?\s*>\s*| # DOCTYPE
<!ENTITY\s*([^"'>]+)\s*(["'])([^\11]+)\11\s*>\s*| # ENTITY
([^<]+))(.*)/xogsmi) # plain text
{
my $firstPart = $1;
my $unaryName = $2;
my $unaryAttribs = $3;
my $headerName = $4;
my $headerAttribs= $5;
my $comment = $6;
my $cdata = $7;
my $doctype = $8;
my $doctypeCont = $9;
my $entityName = $10;
my $entityCont = $12;
my $plainText = $13;
my $remainder = $14;
# There is some duplication here that should be merged with that in fromSubString()
if ($unaryName)
{
my $newElement = $parentElement->createChild($unaryName);
$self->_extractAttributesFromString($newElement, $unaryAttribs) if ($unaryAttribs);
} elsif ($headerName)
{
my $newElement = XML::Mini::Element::Header->new($headerName);
$self->_extractAttributesFromString($newElement, $headerAttribs) if ($headerAttribs);
$parentElement->appendChild($newElement);
} elsif (defined $comment) {
$parentElement->comment($comment);
} elsif (defined $cdata) {
my $newElement = XML::Mini::Element::CData->new($cdata);
$parentElement->appendChild($newElement);
} elsif ($doctype || defined $doctypeCont) {
my $newElement = XML::Mini::Element::DocType->new($doctype);
$parentElement->appendChild($newElement);
if ($doctypeCont)
{
$doctypeCont =~ s/^\s*\[//smg;
$doctypeCont =~ s/\]\s*$//smg;
$self->fromSubStringBT($newElement, $doctypeCont);
}
} elsif (defined $entityName) {
my $newElement = XML::Mini::Element::Entity->new($entityName, $entityCont);
$parentElement->appendChild($newElement);
} elsif (defined $plainText && $plainText =~ m|\S|sm)
{
$parentElement->createNode($plainText);
} else {
XML::Mini->Log("NO MATCH???") if ($XML::Mini::Debug);
}
if (defined $remainder && $remainder =~ m|\S|sm)
{
$self->fromSubStringBT($parentElement, $remainder);
}
} else {
# No match here either...
XML::Mini->Log("No match in fromSubStringBT() for '$XMLString'") if ($XML::Mini::Debug);
} # end if it matches one of our other tags or plain text
} # end if Text::Balanced returned a match
} # end fromSubStringBT()
sub fromSubString
{
my $self = shift;
my $parentElement = shift;
my $XMLString = shift;
if ($XML::Mini::Debug)
{
XML::Mini->Log("Called fromSubString() with parent '" . $parentElement->name() . "'\n");
}
# The heart of the parsing is here, in our mega regex
# The sections are for:
# <tag>...</tag>
# <!-- comments -->
# <singletag />
# <![CDATA [ STUFF ]]>
# <!DOCTYPE ... [ ... ]>
# <!ENTITY bla "bla">
# plain text
#=~/<\s*([^\s>]+)([^>]+)?>(.*?)<\s*\/\\1\s*>\s*([^<]+)?(.*)
if ($TextBalancedAvailable)
{
return $self->fromSubStringBT($parentElement, $XMLString);
}
while ($XMLString =~/\s*<\s*([^\s>]+)([^>]+)?>(.*?)<\s*\/\1\s*>\s*([^<]+)?(.*)|
\s*<!--(.+?)-->\s*|
\s*<\s*([^\s>]+)\s*([^>]*)\/\s*>\s*([^<>]+)?|
\s*<!\[CDATA\s*\[(.*?)\]\]\s*>\s*|
\s*<!DOCTYPE\s*([^\[>]*)(\[.*?\])?\s*>\s*|
\s*<!ENTITY\s*([^"'>]+)\s*(["'])([^\14]+)\14\s*>\s*|
\s*<\?\s*([^\s>]+)\s*([^>]*)\?>|
^([^<]+)(.*)/xogsmi)
{
# Check which string matched.'
my $uname = $7;
my $comment = $6;
my $cdata = $10;
my $doctypedef = $11;
if ($12)
{
if ($doctypedef)
{
$doctypedef .= ' ' . $12;
} else {
$doctypedef = $12;
}
}
my $entityname = $13;
my $headername = $16;
my $headerAttribs = $17;
my $plaintext = $18;
if (defined $uname)
{
my $ufinaltxt = $9;
my $newElement = $parentElement->createChild($uname);
$self->_extractAttributesFromString($newElement, $8);
if (defined $ufinaltxt && $ufinaltxt =~ m|\S+|)
{
$parentElement->createNode($ufinaltxt);
}
} elsif (defined $headername)
{
my $newElement = XML::Mini::Element::Header->new($headername);
$self->_extractAttributesFromString($newElement, $headerAttribs) if ($headerAttribs);
$parentElement->appendChild($newElement);
} elsif (defined $comment) {
#my $newElement = XML::Mini::Element::Comment->new('!--');
#$newElement->createNode($comment);
$parentElement->comment($comment);
} elsif (defined $cdata) {
my $newElement = XML::Mini::Element::CData->new($cdata);
$parentElement->appendChild($newElement);
} elsif (defined $doctypedef) {
my $newElement = XML::Mini::Element::DocType->new($11);
$parentElement->appendChild($newElement);
$self->fromSubString($newElement, $doctypedef);
} elsif (defined $entityname) {
my $newElement = XML::Mini::Element::Entity->new($entityname, $15);
$parentElement->appendChild($newElement);
} elsif (defined $plaintext) {
my $afterTxt = $19;
if ($plaintext !~ /^\s+$/)
{
$parentElement->createNode($plaintext);
}
if (defined $afterTxt)
{
$self->fromSubString($parentElement, $afterTxt);
}
} elsif ($1) {
my $nencl = $3;
my $finaltxt = $4;
my $otherTags = $5;
my $newElement = $parentElement->createChild($1);
$self->_extractAttributesFromString($newElement, $2);
if ($nencl =~ /^\s*([^\s<][^<]*)/)
{
my $txt = $1;
$newElement->createNode($txt);
$nencl =~ s/^\s*[^<]+//;
}
$self->fromSubString($newElement, $nencl);
if (defined $finaltxt)
{
$parentElement->createNode($finaltxt);
}
if (defined $otherTags)
{
$self->fromSubString($parentElement, $otherTags);
}
}
} # end while matches
} #* end method fromSubString */
sub toFile
{
my $self = shift;
my $filename = shift || return XML::Mini->Error("XML::Mini::Document::toFile - must pass a filename to save to");
my $safe = shift;
( run in 1.430 second using v1.01-cache-2.11-cpan-437f7b0c052 )