XML-TreePP
view release on metacpan or search on metacpan
lib/XML/TreePP.pm view on Meta::CPAN
This method receives an XML document from a remote server via HTTP and
returns a hash tree converted.
$tree = $tpp->parsehttp( $method, $url, $body, $head );
C<$method> is a method of HTTP connection: GET/POST/PUT/DELETE
C<$url> is an URI of an XML file.
C<$body> is a request body when you use POST method.
C<$head> is a request headers as a hash ref.
L<LWP::UserAgent> module or L<HTTP::Lite> module is required to fetch a file.
( $tree, $xml, $code ) = $tpp->parsehttp( $method, $url, $body, $head );
In array context, This method returns also raw XML document received
and HTTP response's status code.
=head2 write
This method parses a hash tree and returns an XML document as a string.
$source = $tpp->write( $tree, $encode );
C<$tree> is a reference to a hash tree.
=head2 writefile
This method parses a hash tree and writes an XML document into a file.
$tpp->writefile( $file, $tree, $encode );
C<$file> is a filename to create.
C<$tree> is a reference to a hash tree.
=head1 OPTIONS FOR PARSING XML
This module accepts option parameters following:
=head2 force_array
This option allows you to specify a list of element names which
should always be forced into an array representation.
$tpp->set( force_array => [ 'rdf:li', 'item', '-xmlns' ] );
The default value is null, it means that context of the elements
will determine to make array or to keep it scalar or hash.
Note that the special wildcard name C<'*'> means all elements.
=head2 force_hash
This option allows you to specify a list of element names which
should always be forced into an hash representation.
$tpp->set( force_hash => [ 'item', 'image' ] );
The default value is null, it means that context of the elements
will determine to make hash or to keep it scalar as a text node.
See also L</text_node_key> option below.
Note that the special wildcard name C<'*'> means all elements.
=head2 cdata_scalar_ref
This option allows you to convert a cdata section into a reference
for scalar on parsing an XML document.
$tpp->set( cdata_scalar_ref => 1 );
The default value is false, it means that each cdata section is converted into a scalar.
=head2 user_agent
This option allows you to specify a HTTP_USER_AGENT string which
is used by parsehttp() method.
$tpp->set( user_agent => 'Mozilla/4.0 (compatible; ...)' );
The default string is C<'XML-TreePP/#.##'>, where C<'#.##'> is
substituted with the version number of this library.
=head2 http_lite
This option forces pasrsehttp() method to use a L<HTTP::Lite> instance.
my $http = HTTP::Lite->new();
$tpp->set( http_lite => $http );
=head2 lwp_useragent
This option forces parsehttp() method to use a L<LWP::UserAgent> instance.
my $ua = LWP::UserAgent->new();
$ua->timeout( 60 );
$ua->env_proxy;
$tpp->set( lwp_useragent => $ua );
You may use this with L<LWP::UserAgent::WithCache>.
=head2 base_class
This blesses class name for each element's hashref.
Each class is named straight as a child class of it parent class.
$tpp->set( base_class => 'MyElement' );
my $xml = '<root><parent><child key="val">text</child></parent></root>';
my $tree = $tpp->parse( $xml );
print ref $tree->{root}->{parent}->{child}, "\n";
A hash for <child> element above is blessed to C<MyElement::root::parent::child>
class. You may use this with L<Class::Accessor>.
=head2 elem_class
This blesses class name for each element's hashref.
Each class is named horizontally under the direct child of C<MyElement>.
$tpp->set( base_class => 'MyElement' );
my $xml = '<root><parent><child key="val">text</child></parent></root>';
my $tree = $tpp->parse( $xml );
print ref $tree->{root}->{parent}->{child}, "\n";
A hash for <child> element above is blessed to C<MyElement::child> class.
=head2 xml_deref
This option dereferences the numeric character references, like ë,
漢, etc., in an XML document when this value is true.
$tpp->set( xml_deref => 1 );
lib/XML/TreePP.pm view on Meta::CPAN
$deref = \&xml_deref_octet;
}
}
while ( $$textref =~ m{
([^<]*) <
((
\? ([^<>]*) \?
)|(
\!\[CDATA\[(.*?)\]\]
)|(
\!DOCTYPE\s+([^\[\]<>]*(?:\[.*?\]\s*)?)
)|(
\!--(.*?)--
)|(
([^\!\?\s<>](?:"[^"]*"|'[^']*'|[^"'<>])*)
))
> ([^<]*)
}sxg ) {
my (
$ahead, $match, $typePI, $contPI, $typeCDATA,
$contCDATA, $typeDocT, $contDocT, $typeCmnt, $contCmnt,
$typeElem, $contElem, $follow
)
= ( $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13 );
if ( defined $ahead && $ahead =~ /\S/ ) {
$ahead =~ s/([^\040-\076])/sprintf("\\x%02X",ord($1))/eg;
$self->warn( "Invalid string: [$ahead] before <$match>" );
}
if ($typeElem) { # Element
my $node = {};
if ( $contElem =~ s#^/## ) {
$node->{endTag}++;
}
elsif ( $contElem =~ s#/$## ) {
$node->{emptyTag}++;
}
else {
$node->{startTag}++;
}
$node->{tagName} = $1 if ( $contElem =~ s#^(\S+)\s*## );
unless ( $node->{endTag} ) {
my $attr;
while ( $contElem =~ m{
([^\s\=\"\']+)\s*=\s*(?:(")(.*?)"|'(.*?)')
}sxg ) {
my $key = $1;
my $val = &$deref( $2 ? $3 : $4 );
if ( ! ref $attr ) {
$attr = {};
tie( %$attr, 'Tie::IxHash' ) if $ixhash;
}
$attr->{$prefix.$key} = $val;
}
$node->{attributes} = $attr if ref $attr;
}
push( @$flat, $node );
}
elsif ($typeCDATA) { ## CDATASection
if ( exists $self->{cdata_scalar_ref} && $self->{cdata_scalar_ref} ) {
push( @$flat, \$contCDATA ); # as reference for scalar
}
else {
push( @$flat, $contCDATA ); # as scalar like text node
}
}
elsif ($typeCmnt) { # Comment (ignore)
}
elsif ($typeDocT) { # DocumentType (ignore)
}
elsif ($typePI) { # ProcessingInstruction (ignore)
}
else {
$self->warn( "Invalid Tag: <$match>" );
}
if ( $follow =~ /\S/ ) { # text node
my $val = &$deref($follow);
push( @$flat, $val );
}
}
$flat;
}
sub flat_to_tree {
my $self = shift;
my $source = shift;
my $parent = shift;
my $class = shift;
my $tree = {};
my $text = [];
if ( exists $self->{use_ixhash} && $self->{use_ixhash} ) {
tie( %$tree, 'Tie::IxHash' );
}
while ( scalar @$source ) {
my $node = shift @$source;
if ( !ref $node || UNIVERSAL::isa( $node, "SCALAR" ) ) {
push( @$text, $node ); # cdata or text node
next;
}
my $name = $node->{tagName};
if ( $node->{endTag} ) {
last if ( $parent eq $name );
return $self->die( "Invalid tag sequence: <$parent></$name>" );
}
my $elem = $node->{attributes};
my $forcehash = $self->{__force_hash_all} || $self->{__force_hash}->{$name};
my $subclass;
if ( defined $class ) {
my $escname = $name;
$escname =~ s/\W/_/sg;
$subclass = $class.'::'.$escname;
}
if ( $node->{startTag} ) { # recursive call
my $child = $self->flat_to_tree( $source, $name, $subclass );
next unless defined $child;
my $hasattr = scalar keys %$elem if ref $elem;
if ( UNIVERSAL::isa( $child, "HASH" ) ) {
if ( $hasattr ) {
# some attributes and some child nodes
%$elem = ( %$elem, %$child );
}
else {
# some child nodes without attributes
$elem = $child;
}
}
else {
if ( $hasattr ) {
# some attributes and text node
$elem->{$self->{text_node_key}} = $child;
}
elsif ( $forcehash ) {
# only text node without attributes
$elem = { $self->{text_node_key} => $child };
}
else {
# text node without attributes
$elem = $child;
}
}
}
elsif ( $forcehash && ! ref $elem ) {
$elem = {};
}
# bless to a class by base_class or elem_class
if ( ref $elem && UNIVERSAL::isa( $elem, "HASH" ) ) {
if ( defined $subclass ) {
bless( $elem, $subclass );
} elsif ( exists $self->{elem_class} && $self->{elem_class} ) {
my $escname = $name;
$escname =~ s/\W/_/sg;
my $elmclass = $self->{elem_class}.'::'.$escname;
bless( $elem, $elmclass );
}
}
# next unless defined $elem;
$tree->{$name} ||= [];
push( @{ $tree->{$name} }, $elem );
}
if ( ! $self->{__force_array_all} ) {
foreach my $key ( keys %$tree ) {
next if $self->{__force_array}->{$key};
next if ( 1 < scalar @{ $tree->{$key} } );
$tree->{$key} = shift @{ $tree->{$key} };
}
}
my $haschild = scalar keys %$tree;
if ( scalar @$text ) {
if ( scalar @$text == 1 ) {
# one text node (normal)
$text = shift @$text;
}
elsif ( ! scalar grep {ref $_} @$text ) {
# some text node splitted
$text = join( '', @$text );
}
else {
# some cdata node
my $join = join( '', map {ref $_ ? $$_ : $_} @$text );
$text = \$join;
}
if ( $haschild ) {
# some child nodes and also text node
$tree->{$self->{text_node_key}} = $text;
}
else {
# only text node without child nodes
$tree = $text;
}
}
elsif ( ! $haschild ) {
# no child and no text
$tree = "";
}
$tree;
}
sub hash_to_xml {
my $self = shift;
my $name = shift;
my $hash = shift;
my $out = [];
my $attr = [];
my $allkeys = [ keys %$hash ];
my $fo = $self->{__first_out} if ref $self->{__first_out};
my $lo = $self->{__last_out} if ref $self->{__last_out};
my $firstkeys = [ sort { $fo->{$a} <=> $fo->{$b} } grep { exists $fo->{$_} } @$allkeys ] if ref $fo;
my $lastkeys = [ sort { $lo->{$a} <=> $lo->{$b} } grep { exists $lo->{$_} } @$allkeys ] if ref $lo;
$allkeys = [ grep { ! exists $fo->{$_} } @$allkeys ] if ref $fo;
$allkeys = [ grep { ! exists $lo->{$_} } @$allkeys ] if ref $lo;
unless ( exists $self->{use_ixhash} && $self->{use_ixhash} ) {
$allkeys = [ sort @$allkeys ];
}
my $prelen = $self->{__attr_prefix_len};
my $pregex = $self->{__attr_prefix_rex};
my $textnk = $self->{text_node_key};
my $tagend = $self->{empty_element_tag_end} || $EMPTY_ELEMENT_TAG_END;
foreach my $keys ( $firstkeys, $allkeys, $lastkeys ) {
next unless ref $keys;
my $elemkey = $prelen ? [ grep { substr($_,0,$prelen) ne $pregex } @$keys ] : $keys;
my $attrkey = $prelen ? [ grep { substr($_,0,$prelen) eq $pregex } @$keys ] : [];
foreach my $key ( @$elemkey ) {
my $val = $hash->{$key};
if ( !defined $val ) {
next if ($key eq $textnk);
push( @$out, "<$key$tagend" );
}
elsif ( UNIVERSAL::isa( $val, 'HASH' ) ) {
my $child = $self->hash_to_xml( $key, $val );
push( @$out, $child );
}
elsif ( UNIVERSAL::isa( $val, 'ARRAY' ) ) {
my $child = $self->array_to_xml( $key, $val );
push( @$out, $child );
}
elsif ( UNIVERSAL::isa( $val, 'SCALAR' ) ) {
my $child = $self->scalaref_to_cdata( $key, $val );
push( @$out, $child );
}
else {
my $ref = ref $val;
$self->warn( "Unsupported reference type: $ref in $key" ) if $ref;
my $child = $self->scalar_to_xml( $key, $val );
push( @$out, $child );
}
}
foreach my $key ( @$attrkey ) {
my $name = substr( $key, $prelen );
my $val = &xml_escape( $hash->{$key} );
push( @$attr, ' ' . $name . '="' . $val . '"' );
}
}
my $jattr = join( '', @$attr );
if ( defined $name && scalar @$out && ! grep { ! /^</s } @$out ) {
# Use human-friendly white spacing
if ( defined $self->{__indent} ) {
s/^(\s*<)/$self->{__indent}$1/mg foreach @$out;
}
unshift( @$out, "\n" );
}
my $text = join( '', @$out );
if ( defined $name ) {
if ( scalar @$out ) {
$text = "<$name$jattr>$text</$name>\n";
}
else {
$text = "<$name$jattr$tagend\n";
}
}
$text;
}
sub array_to_xml {
my $self = shift;
my $name = shift;
my $array = shift;
my $out = [];
my $tagend = $self->{empty_element_tag_end} || $EMPTY_ELEMENT_TAG_END;
foreach my $val (@$array) {
if ( !defined $val ) {
push( @$out, "<$name$tagend\n" );
}
elsif ( UNIVERSAL::isa( $val, 'HASH' ) ) {
my $child = $self->hash_to_xml( $name, $val );
push( @$out, $child );
}
elsif ( UNIVERSAL::isa( $val, 'ARRAY' ) ) {
my $child = $self->array_to_xml( $name, $val );
push( @$out, $child );
}
elsif ( UNIVERSAL::isa( $val, 'SCALAR' ) ) {
my $child = $self->scalaref_to_cdata( $name, $val );
push( @$out, $child );
}
else {
my $ref = ref $val;
$self->warn( "Unsupported reference type: $ref in $name" ) if $ref;
my $child = $self->scalar_to_xml( $name, $val );
push( @$out, $child );
}
}
my $text = join( '', @$out );
$text;
}
sub scalaref_to_cdata {
my $self = shift;
my $name = shift;
my $ref = shift;
my $data = defined $$ref ? $$ref : '';
$data =~ s#(]])(>)#$1]]><![CDATA[$2#g;
my $text = '<![CDATA[' . $data . ']]>';
$text = "<$name>$text</$name>\n" if ( $name ne $self->{text_node_key} );
$text;
}
sub scalar_to_xml {
my $self = shift;
my $name = shift;
my $scalar = shift;
my $copy = $scalar;
my $text = &xml_escape($copy);
$text = "<$name>$text</$name>\n" if ( $name ne $self->{text_node_key} );
$text;
}
sub write_raw_xml {
my $self = shift;
my $file = shift;
my $fh = Symbol::gensym();
open( $fh, ">$file" ) or return $self->die( "$! - $file" );
print $fh @_;
close($fh);
}
sub read_raw_xml {
my $self = shift;
my $file = shift;
my $fh = Symbol::gensym();
open( $fh, $file ) or return $self->die( "$! - $file" );
local $/ = undef;
my $text = <$fh>;
close($fh);
$text;
}
sub looks_like_xml {
my $textref = shift;
my $args = ( $$textref =~ /^(?:\s*\xEF\xBB\xBF)?\s*<\?xml(\s+\S.*)\?>/s )[0];
if ( ! $args ) {
return;
}
return $args;
}
sub xml_decl_encoding {
my $textref = shift;
return unless defined $$textref;
my $args = looks_like_xml($textref) or return;
my $getcode = ( $args =~ /\s+encoding=(".*?"|'.*?')/ )[0] or return;
$getcode =~ s/^['"]//;
$getcode =~ s/['"]$//;
$getcode;
}
sub encode_from_to {
( run in 0.529 second using v1.01-cache-2.11-cpan-39bf76dae61 )