simplexmlparse
view release on metacpan or search on metacpan
lib/simpleXMLParse.pm view on Meta::CPAN
package simpleXMLParse;
# Perl Module: simpleXMLParse
# Author: Daniel Edward Graham
# Copyright (c) Daniel Edward Graham 2008-2018
# Date: 01/01/2018
# License: LGPL 3.0
#
require Exporter;
use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
use Data::Dumper;
@ISA = qw(Exporter);
# This allows declaration use simpleXMLParse ':all';
# If you do not need this, moving things directly into @EXPORT or @EXPORT_OK
# will save memory.
%EXPORT_TAGS = ( 'all' => [ qw(
) ] );
@EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );
@EXPORT = qw(
);
$VERSION = '3.1';
use Carp;
use strict;
no warnings;
#use open ':encoding(utf8)';
my @cdata;
my $cdataInd = 0;
my $MAXIND = 10000;
sub new {
my $class = shift;
my %args = (@_ == 1) ? ((ref($_[0]) eq 'HASH') ? %{$_[0]}:(input => $_[0])):@_;
my $altstyle = 0;
my $fn;
$fn = $args{"input"};
$altstyle = 1 if ($args{"style"} eq '2');
my $self = {};
$self->{"xml"} = undef;
$self->{"data"} = undef;
open (INFILE1, "$fn") or croak "Unable to process [$fn] $! \n";
binmode(INFILE1);
my ($c1, $c2, $c3);
read(INFILE1, $c1, 1);
read(INFILE1, $c2, 1);
read(INFILE1, $c3, 1);
close(INFILE1);
if (($c1 eq "\xFE" && $c2 eq "\xFF") || ($c1 eq "\xFF" && $c2 eq "\xFE")) {
# UTF-16
open(INFILE, '<:encoding(UTF-16)', "$fn") or croak "Unable to process [$fn] $!\n";
$self->{"xml"} = join '', <INFILE>;
} else {
if ($c1 eq "\xEF" && $c2 eq "\xBB" && $c3 eq "\xBF") {
# UTF-8 with BOM...
open(INFILE, '<:encoding(UTF-8)', "$fn") or croak "Unable to process [$fn] $!\n";
my $str = join '', <INFILE>;
# $str =~ s/^\xEF\xBB\xBF//g;
$str =~ s/^\x{fffe}//g;
$str =~ s/^\x{feff}//g;
$self->{"xml"} = $str;
} else {
# UTF-8 with NO BOM
open(INFILE, '<:encoding(UTF-8)', "$fn") or croak "Unable to process [$fn] $!\n";
$self->{"xml"} = join '', <INFILE>;
}
}
close(INFILE);
$self->{"data"} = _ParseXML( $self->{"xml"}, $altstyle );
my $ret = bless $self;
if ($altstyle) {
$ret->_convertToStyle();
}
$cdataInd = $cdataInd % $MAXIND;
return $ret;
}
sub parse {
my $self = shift;
return $self->{data};
}
sub _convertToStyle {
my $self = shift;
my @recursearr = ($self->{"data"});
while (@recursearr) {
my $i = pop @recursearr;
if (ref($i) eq "HASH") {
foreach my $j (keys %$i) {
if ($j =~ /^(.*?)\_(.*?)\_([0-9]+)\_attr$/) {
my ($attrnm, $tagnm, $cnt) = ($1, $2, $3);
$attrnm =~ s/0x0/_/gs;
$tagnm =~ s/0x0/_/gs;
my $n = undef;
if (ref($i->{$tagnm}) eq "ARRAY") {
my $hold;
if (ref($i->{$tagnm}->[$cnt]) eq '') {
$hold = $i->{$tagnm}->[$cnt];
$i->{$tagnm}->[$cnt] = { };
if ($hold !~ /^\s*$/ ) {
$i->{$tagnm}->[$cnt]->{content} = $hold;
}
}
while (defined($i->{$tagnm}->[$cnt]->{$attrnm.$n})) {
$n++;
}
$i->{$tagnm}->[$cnt]->{$attrnm.$n} = $i->{$j};
} else {
if (ref($i->{$tagnm}) eq "HASH") {
my $n = undef;
while (defined($i->{$tagnm}->{$attrnm.$n})) {
$n++;
}
$i->{$tagnm}->{$attrnm.$n} = $i->{$j};
} else {
my $hold;
$hold = $i->{$tagnm};
$i->{$tagnm} = { };
if ($hold !~ /^\s*$/) {
$i->{$tagnm}->{content} = $hold;
}
$i->{$tagnm}->{$attrnm} = $i->{$j};
}
}
delete $i->{$j};
} else {
push @recursearr, $i->{$j};
}
}
} else {
if (ref($i) eq "ARRAY") {
foreach my $j (@$i) {
push @recursearr, $j;
}
}
}
}
}
sub _cdatasub {
my $cdata = shift;
my $tmpind = $cdataInd++;
$cdata[$tmpind] = $cdata;
return "0x0CDATA0x0".($tmpind)."0x0";
}
sub _cdatasubout {
my $ind = shift;
my $cdata = $cdata[$ind];
return $cdata;
}
sub _unescp {
my $firsttag = shift;
$firsttag =~ s/\\\\/\\/gs;
$firsttag =~ s/\\\*/\*/gs;
$firsttag =~ s/\\\|/\|/gs;
$firsttag =~ s/\\\$/\$/gs;
$firsttag =~ s/\\\?/\?/gs;
$firsttag =~ s/\\\{/\{/gs;
$firsttag =~ s/\\\}/\}/gs;
$firsttag =~ s/\\\(/\(/gs;
$firsttag =~ s/\\\)/\)/gs;
$firsttag =~ s/\\\+/\+/gs;
$firsttag =~ s/\\\[/\[/gs;
$firsttag =~ s/\\\]/\]/gs;
$firsttag =~ s/\\\./\./gs;
$firsttag =~ s/\\\^/\^/gs;
$firsttag =~ s/\\\-/\-/gs;
return $firsttag;
}
sub hconv {
my $arg = $_[0];
my $p = pack "H*", $arg;
return $p;
}
sub _entity {
my $text = shift;
$text =~ s/\<\;/\</g;
$text =~ s/\>\;/\>/g;
$text =~ s/\&\;/\&/g;
$text =~ s/\&apos\;/\'/g;
$text =~ s/\"\;/\"/g;
$text =~ s/\&\#x([0-9a-fA-F]+)\;/&hconv($1)/ge;
return $text;
}
sub _ParseXML {
my ($xml, $altstyle) = @_;
# $xml =~ s/\n//g;
$xml =~ s/\<\!\[CDATA\[(.*?)\]\]\>/&_cdatasub($1)/egs;
$xml =~ s/\<\!\-\-.*?\-\-\>//gs;
$xml =~ s/\<\?xml.*?\?\>//gs;
$xml =~ s/\<\?[^\>]*?\?\>//gs;
$xml =~ s/\<\!\-\-[^\>]*?\-\-\>//gs;
$xml =~ s/\<\!ELEMENT[^\>]*?\>//gs;
$xml =~ s/\<\!ENTITY[^\>]*?\>//gs;
$xml =~ s/\<\!ATTLIST[^\>]*?\>//gs;
$xml =~ s/\<\!DOCTYPE[^\>]*?\>//gs;
my $rethash = ();
my @retarr;
my $firsttag = $xml;
my ( $attr, $innerxml, $xmlfragment );
$firsttag =~ s/^[\s\n]*\<([^\s\>\n\/]*).*$/$1/gs;
$firsttag =~ s/\\/\\\\/gs;
$firsttag =~ s/\*/\\\*/gs;
$firsttag =~ s/\|/\\\|/gs;
$firsttag =~ s/\$/\\\$/gs;
$firsttag =~ s/\?/\\\?/gs;
$firsttag =~ s/\{/\\\{/gs;
$firsttag =~ s/\}/\\\}/gs;
$firsttag =~ s/\(/\\\(/gs;
$firsttag =~ s/\)/\\\)/gs;
$firsttag =~ s/\+/\\\+/gs;
$firsttag =~ s/\[/\\\[/gs;
$firsttag =~ s/\]/\\\]/gs;
$firsttag =~ s/\./\\\./gs;
$firsttag =~ s/\^/\\\^/gs;
$firsttag =~ s/\-/\\\-/gs;
if ( $xml =~ /^[\s\n]*\<${firsttag}(\>|[\s\n]\>|[\s\n][^\>]*[^\/]\>)(.*?)\<\/${firsttag}[\s\n]*\>(.*)$/s )
{
$attr = $1;
$innerxml = $2;
$xmlfragment = $3;
$attr =~ s/\>$//gs;
}
else {
if ( $xml =~ /^[\s\n]*\<${firsttag}(\/\>|[\s\n][^\>]*\/\>)(.*)$/s ) {
$attr = $1;
$innerxml = "";
$xmlfragment = $2;
$attr =~ s/\/\>$//gs;
} else {
if (!ref($xml)) {
$xml = _entity($xml);
$xml =~ s/0x0CDATA0x0(\d+?)0x0/&_cdatasubout($1)/egs;
}
if ($xml eq '') {
return {};
} else {
return $xml;
}
}
}
my $ixml = $innerxml;
while ($ixml =~ /^.*?\<${firsttag}(\>|[\s\n]\>|[\s\n][^\>]*[^\/]\>)(.*?)$/s) {
$ixml = $2;
$innerxml .= "</${firsttag}>";
if ($xmlfragment =~ /^(.*?)\<\/${firsttag}[\s\n]*\>(.*)$/s) {
my $ix = $1;
$innerxml .= $ix;
$ixml .= $ix;
$xmlfragment = $2;
} else {
die "Invalid XML innerxml: $innerxml\nixml: $ixml\nxmlfragment: $xmlfragment\n";
}
}
my $nextparse = _ParseXML($innerxml, $altstyle);
$rethash->{&_unescp($firsttag)} = $nextparse;
my @attrarr;
while ( $attr =~ s/^[\s\n]*([^\s\=\n]+)\s*\=\s*(\".*?\"|\'.*?\')(.*)$/$3/gs ) {
my ($name, $val) = ($1, $2);
$val =~ s/^\'(.*)\'$/$1/gs;
$val =~ s/^\"(.*)\"$/$1/gs;
push @attrarr, $name;
push @attrarr, _entity($val);
}
my $attrcnt = 0;
while ( my $val = shift(@attrarr) ) {
my ($val1, $firsttag1) = ($val, $firsttag);
if ($altstyle) {
$val1 =~ s/_/0x0/gs;
$firsttag1 =~ s/_/0x0/gs;
}
$rethash->{ "$val1" . "_".&_unescp(${firsttag1})."_" . $attrcnt . "_attr" } = shift(@attrarr);
}
my $retflag = 0;
my ( $xmlfragment1, $xmlfragment2 );
my %attrhash;
$attrcnt++;
while (1) {
if ( $xmlfragment =~
/^(.*?)\<${firsttag}(\>|[\s\n]\>|[\s\n][^\>]*[^\/]\>)(.*?)\<\/${firsttag}[\s\n]*\>(.*)$/s )
{
if ( !$retflag ) {
push @retarr, $nextparse;
}
$retflag = 1;
$xmlfragment1 = $1;
$attr = $2;
$innerxml = $3;
$xmlfragment2 = $4;
} else {
if ( $xmlfragment =~ /^(.*?)\<${firsttag}(\/\>|[\s\n][^\>]*\/\>)(.*)$/s ) {
if ( !$retflag ) {
push @retarr, $nextparse;
( run in 1.756 second using v1.01-cache-2.11-cpan-2398b32b56e )