XML-Mini

 view release on metacpan or  search on metacpan

lib/XML/Mini/Document.pm  view on Meta::CPAN

		@res = Text::Balanced::extract_tagged($XMLString, undef, undef, undef, { 'ignore' => $ignore });
	} else {
		@res = Text::Balanced::extract_tagged($XMLString);
	}
	
	if ($#res == 5)
	{
		# We've extracted a balanced <tag>..</tag>
	
		my $extracted = $res[0]; # the entire <t>..</t>
		my $remainder = $res[1]; # stuff after the <t>..</t>HERE  - 3
		my $prefix = $res[3]; # the <t ...> itself - 1
		my $contents = $res[4]; # the '..' between <t>..</t> - 2
		my $suffix = $res[5]; # the </t>
		
		#XML::Mini->Log("Grabbed prefix '$prefix'...");
		my $newElement;
		
		if ($prefix =~ m|<\s*([^\s>]+)\s*([^>]*)>|)
		{
			my $name = $1;
			my $attribs = $2;
			$newElement = $parentElement->createChild($name);
	    		$self->_extractAttributesFromString($newElement, $attribs) if ($attribs);
			
			$self->fromSubStringBT($newElement, $contents) if ($contents =~ m|\S|);
			
			$self->fromSubStringBT($parentElement, $remainder) if ($remainder =~ m|\S|);
		} else {
			
			XML::Mini->Log("XML::Mini::Document::fromSubStringBT extracted balanced text from invalid tag '$prefix' - ignoring");
    		}
	} else {
	
		$XMLString =~ s/>\s*\n/>/gsm;
		if ($XMLString =~ m/^\s*<\s*([^\s>]+)([^>]*>).*<\s*\/\1\s*>/osm)
		{
			# starts with a normal <tag> ... </tag> but has some ?? in it
			
			my $startTag = $2;
			return $self->fromSubStringBT($parentElement, $XMLString, 'USEIGNORE')
				unless ($startTag =~ m|/\s*>$|);
		}
	
		# not a <tag>...</tag>
		#it's either a                             
		if ($XMLString =~ m/^\s*(<\s*([^\s>]+)([^>]+)\/\s*>|	# <unary \/>
					 <\?\s*([^\s>]+)\s*([^>]*)\?>|	# <? headers ?>
					 <!--(.+?)-->|			# <!-- comments -->
					 <!\[CDATA\s*\[(.*?)\]\]\s*>\s*| 	# CDATA 
					 <!DOCTYPE\s*([^\[>]*)(\[.*?\])?\s*>\s*|	# DOCTYPE
					 <!ENTITY\s*([^"'>]+)\s*(["'])([^\11]+)\11\s*>\s*| # ENTITY
					 ([^<]+))(.*)/xogsmi) # plain text
		{
			my $firstPart	 = $1;
			my $unaryName 	 = $2;
			my $unaryAttribs = $3;
			my $headerName 	 = $4;
			my $headerAttribs= $5;
			my $comment 	 = $6;
			my $cdata	 = $7;
			my $doctype	 = $8;
			my $doctypeCont  = $9;
			my $entityName	 = $10;
			my $entityCont	 = $12;
			my $plainText	 = $13;
			my $remainder 	 = $14;
			
			
			
			# There is some duplication here that should be merged with that in fromSubString()
			if ($unaryName)
			{
				my $newElement = $parentElement->createChild($unaryName);
				$self->_extractAttributesFromString($newElement, $unaryAttribs) if ($unaryAttribs);
			} elsif ($headerName)
			{
				my $newElement = XML::Mini::Element::Header->new($headerName);
				$self->_extractAttributesFromString($newElement, $headerAttribs) if ($headerAttribs);
				$parentElement->appendChild($newElement);
			} elsif (defined $comment) {
				$parentElement->comment($comment);
			} elsif (defined $cdata) {
				my $newElement = XML::Mini::Element::CData->new($cdata);
				$parentElement->appendChild($newElement);
			} elsif ($doctype || defined $doctypeCont) {
				my $newElement = XML::Mini::Element::DocType->new($doctype);
				$parentElement->appendChild($newElement);
				if ($doctypeCont)
				{
					$doctypeCont =~ s/^\s*\[//smg;
					$doctypeCont =~ s/\]\s*$//smg;
					
					$self->fromSubStringBT($newElement, $doctypeCont);
				}
			} elsif (defined $entityName) {
				my $newElement = XML::Mini::Element::Entity->new($entityName, $entityCont);
				$parentElement->appendChild($newElement);
			} elsif (defined $plainText && $plainText =~ m|\S|sm)
			{
				$parentElement->createNode($plainText);
			} else {
				XML::Mini->Log("NO MATCH???") if ($XML::Mini::Debug);
			}
			
			
			if (defined $remainder && $remainder =~ m|\S|sm)
			{
				$self->fromSubStringBT($parentElement, $remainder);
			}
			
		} else {
			# No match here either...
			XML::Mini->Log("No match in fromSubStringBT() for '$XMLString'") if ($XML::Mini::Debug);
			
		} # end if it matches one of our other tags or plain text
		
	} # end if Text::Balanced returned a match
	
	
} # end fromSubStringBT()
			
	
    

sub fromSubString
{
    my $self = shift;
    my $parentElement = shift;
    my $XMLString = shift;
    
    if ($XML::Mini::Debug) 
    {
		XML::Mini->Log("Called fromSubString() with parent '" . $parentElement->name() . "'\n");
    }
    
    
    # The heart of the parsing is here, in our mega regex
    # The sections are for:
    # <tag>...</tag>
    # <!-- comments -->
    # <singletag />
    # <![CDATA [ STUFF ]]>
    # <!DOCTYPE ... [ ... ]>
    # <!ENTITY bla "bla">
    # plain text
    #=~/<\s*([^\s>]+)([^>]+)?>(.*?)<\s*\/\\1\s*>\s*([^<]+)?(.*)
    
    
    if ($TextBalancedAvailable)
    {
    	return $self->fromSubStringBT($parentElement, $XMLString);
    }
    
    while ($XMLString =~/\s*<\s*([^\s>]+)([^>]+)?>(.*?)<\s*\/\1\s*>\s*([^<]+)?(.*)|
    \s*<!--(.+?)-->\s*|
    \s*<\s*([^\s>]+)\s*([^>]*)\/\s*>\s*([^<>]+)?|
    \s*<!\[CDATA\s*\[(.*?)\]\]\s*>\s*|
    \s*<!DOCTYPE\s*([^\[>]*)(\[.*?\])?\s*>\s*|
    \s*<!ENTITY\s*([^"'>]+)\s*(["'])([^\14]+)\14\s*>\s*|
    \s*<\?\s*([^\s>]+)\s*([^>]*)\?>|
    ^([^<]+)(.*)/xogsmi)
	   

    {
	# Check which string matched.'
	my $uname = $7;
	my $comment = $6;
	my $cdata = $10;
	my $doctypedef = $11;
	if ($12)
	{
		if ($doctypedef)
		{
			$doctypedef .= ' ' . $12;
		} else {
			$doctypedef = $12;
		}
	}
	
	my $entityname = $13;
	my $headername = $16;
	my $headerAttribs  = $17;
	my $plaintext = $18;
	
	if (defined $uname)
	{
	    my $ufinaltxt = $9;
	    my $newElement = $parentElement->createChild($uname);
	    $self->_extractAttributesFromString($newElement, $8);
	    if (defined $ufinaltxt && $ufinaltxt =~ m|\S+|)
	    {
		$parentElement->createNode($ufinaltxt);
	    }
	} elsif (defined $headername)
	{
		my $newElement = XML::Mini::Element::Header->new($headername);
		$self->_extractAttributesFromString($newElement, $headerAttribs) if ($headerAttribs);
		$parentElement->appendChild($newElement);
	
	} elsif (defined $comment) {
	    #my $newElement = XML::Mini::Element::Comment->new('!--');
	    #$newElement->createNode($comment);
	    $parentElement->comment($comment);
	} elsif (defined $cdata) {
	    my $newElement = XML::Mini::Element::CData->new($cdata);
	    $parentElement->appendChild($newElement);
	} elsif (defined $doctypedef) {
	    
	    my $newElement = XML::Mini::Element::DocType->new($11);
	    $parentElement->appendChild($newElement);
	    $self->fromSubString($newElement, $doctypedef);
	    
	} elsif (defined $entityname) {
	    
	    my $newElement = XML::Mini::Element::Entity->new($entityname, $15);
	    $parentElement->appendChild($newElement);
	    
	} elsif (defined $plaintext) {
	    
	    my $afterTxt = $19;
	    if ($plaintext !~ /^\s+$/)
	    {
		$parentElement->createNode($plaintext);
	    }
	    
	    if (defined $afterTxt)
	    {
		$self->fromSubString($parentElement, $afterTxt);
	    }
	} elsif ($1) {
	    
	    my $nencl = $3;
	    my $finaltxt = $4;
	    my $otherTags = $5;
	    my $newElement = $parentElement->createChild($1);
	    $self->_extractAttributesFromString($newElement, $2);
	    
	    
	    if ($nencl =~ /^\s*([^\s<][^<]*)/)
	    {
		my $txt = $1;
		$newElement->createNode($txt);
		$nencl =~ s/^\s*[^<]+//;
	    }
	    
	    $self->fromSubString($newElement, $nencl);
	    
	    if (defined $finaltxt)
	    {
		$parentElement->createNode($finaltxt);
	    }
	    
	    if (defined $otherTags)
	    {
		$self->fromSubString($parentElement, $otherTags);
	    }
	}
    } # end while matches
} #* end method fromSubString */

sub toFile
{
    my $self = shift;
    my $filename = shift || return XML::Mini->Error("XML::Mini::Document::toFile - must pass a filename to save to");
    my $safe = shift;



( run in 1.430 second using v1.01-cache-2.11-cpan-437f7b0c052 )