XML-DT
view release on metacpan or search on metacpan
lib/XML/DT.pm view on Meta::CPAN
father("x"); # returns value for attribute "x" on father element
father("x", "value"); # sets value for attribute "x" on father
# element
You can also use it directly as a reference to C<@dtattributes>:
father->{"x"}; # gets the attribute
father->{"x"} = "value"; # sets the attribute
$attributes = father; # gets all attributes reference
=item C<root>
You can use it as a function to access to your tree root element.
root("x"); # gets attribute C<x> on root element
root("x", "value"); # sets value for attribute C<x> on root
You can also use it directly as a reference to C<$dtattributes[-1]>:
root->{"x"}; # gets the attribute x
root->{"x"} = "value"; # sets the attribute x
$attributes = root; # gets all attributes reference
=back
=head1 User provided element processing functions
The user must provide an HASH with a function for each element, that
computes element output. Functions can use the element name C<$q>, the
element content C<$c> and the attribute values hash C<%v>.
All those global variables are defined in C<$CALLER::>.
Each time an element is find the associated function is called.
Content is calculated by concatenation of element contents strings and
interior elements return values.
=head2 C<-default> function
When a element has no associated function, the function associated
with C<-default> called. If no C<-default> function is defined the
default function returns a XML like string for the element.
When you use C</-type> definitions, you often need do set C<-default>
function to return just the contents: C<sub{$c}>.
=head2 C<-outputenc> option
C<-outputenc> defines the output encoding (default is Unicode UTF8).
=head2 C<-inputenc> option
C<-inputenc> forces a input encoding type. Whenever that is possible,
define the input encoding in the XML file:
<?xml version='1.0' encoding='ISO-8859-1'?>
=head2 C<-pcdata> function
C<-pcdata> function is used to define transformation over the
contents. Typically this function should look at context (see
C<inctxt> function)
The default C<-pcdata> function is the identity
=head2 C<-cdata> function
You can process C<<CDATA>> in a way different from pcdata. If you
define a C<-cdata> method, it will be used. Otherwise, the C<-pcdata>
method is called.
=head2 C<-begin> function
Function to be executed before processing XML file.
Example of use: initialization of side-effect variables
=head2 C<-end> function
Function to be executed after processing XML file. I can use C<$c>
content value. The value returned by C<-end> will be the C<dt> return
value.
Example of use: post-processing of returned contents
=head2 C<-recover> option
If set, the parser will try to recover in XML errors.
=head2 C<-html> option
If set, the parser will try to recover in errors. Note that this
differs from the previous one in the sense it uses some knowledge of
the HTML structure for the recovery.
=head2 C<-userdata> option
Use this to pass any information you like to your handlers. The data
structure you pass in this option will be available as C<< $u >> in
your code. -- New in 0.62.
=head1 Elements with values other than strings (C<-type>)
By default all elements return strings, and contents (C<$c>) is the
concatenation of the strings returned by the sub-elements.
In some situations the XML text contains values that are better
processed as a structured type.
The following types (functors) are available:
=over 4
=item THE_CHILD
Return the result of processing the only child of the element.
=item LAST_CHILD
Returns the result of processing the last child of the element.
=item STR
concatenates all the sub-elements returned values (DEFAULT) all the
sub-element should return strings to be concatenated;
=item SEQ
lib/XML/DT.pm view on Meta::CPAN
for $z (keys %aux){
my $code = sub {
my $l;
for $l (@{$aux{$z}}) {
my $prefix = "";
$prefix = "^" unless (($l->[0]) or ($l->[1]));
$prefix = "^" if (($l->[0] eq "/") && ($l->[1]));
if ($l->[3]) {
if(inctxt("$prefix$l->[1]") && _testAttr($l->[3]))
{return &{$l->[2]}; }
} else {
if(inctxt("$prefix$l->[1]")) {return &{$l->[2]};}
}
}
return &{ $aux2{$z}} if $aux2{$z} ;
return &{ $h{-default}} if $h{-default};
&toxml();
};
$n{$z} = $code;
}
for $z (keys %aux2){
$n{$z} ||= $aux2{$z} ;
}
return %n;
}
sub _omni {
my ($par, $xml, @l) = @_;
my $defaulttype =
(exists($xml->{-type}) && exists($xml->{-type}{-default}))
?
$xml->{-type}{-default} : "STR";
my $type = $ty{$par} || $defaulttype;
my %typeargs = ();
if (ref($type) eq "mmapon") {
$typeargs{$_} = 1 for (@$type);
$type = "MMAPON";
}
my $r ;
if( $type eq 'STR') { $r = "" }
elsif( $type eq 'THE_CHILD' or $type eq 'LAST_CHILD') { $r = 0 }
elsif( $type eq 'SEQ' or $type eq "ARRAY") { $r = [] }
elsif( $type eq 'SEQH' or $type eq "ARRAYOFHASH") { $r = [] }
elsif( $type eq 'MAP' or $type eq "HASH") { $r = {} }
elsif( $type eq 'MULTIMAP') { $r = {} }
elsif( $type eq 'MMAPON' or $type eq "HASHOFARRAY") { $r = {} }
elsif( $type eq 'NONE') { $r = "" }
elsif( $type eq 'ZERO') { return "" }
my ($name, $val, @val, $atr, $aux);
$u = $xml->{-userdata};
while(@l) {
my $tree = shift @l;
next unless $tree;
$name = ref($tree) eq "XML::LibXML::CDATASection" ? "-pcdata" : $tree->getName();
if (ref($tree) eq "XML::LibXML::CDATASection") {
$val = $tree->getData();
$name = "-cdata";
$aux = (defined($xml->{-outputenc}))?_fromUTF8($val,$xml->{-outputenc}):$val;
if (defined($xml->{-cdata})) {
push(@dtcontext,"-cdata");
$c = $aux;
$aux = &{$xml->{-cdata}};
pop(@dtcontext);
} elsif (defined($xml->{-pcdata})) {
push(@dtcontext,"-pcdata");
$c = $aux;
$aux = &{$xml->{-pcdata}};
pop(@dtcontext);
}
} elsif (ref($tree) eq "XML::LibXML::Comment") {
### At the moment, treat as Text
### We will need to change this, I hope!
$val = "";
$name = "-pcdata";
$aux= (defined($xml->{-outputenc}))?_fromUTF8($val, $xml->{-outputenc}):$val;
if (defined($xml->{-pcdata})) {
push(@dtcontext,"-pcdata");
$c = $aux;
$aux = &{$xml->{-pcdata}};
pop(@dtcontext);
}
}
elsif (ref($tree) eq "XML::LibXML::Text") {
$val = $tree->getData();
$name = "-pcdata";
$aux = (defined($xml->{-outputenc}))?_fromUTF8($val,$xml->{-outputenc}):$val;
if (defined($xml->{-pcdata})) {
push(@dtcontext,"-pcdata");
$c = $aux;
$aux = &{$xml->{-pcdata}};
pop(@dtcontext);
}
} elsif (ref($tree) eq "XML::LibXML::Element") {
my %atr = _nodeAttributes($tree);
$atr = \%atr;
if (exists($xml->{-ignorecase})) {
$name = lc($name);
for (keys %$atr) {
my ($k,$v) = (lc($_),$atr->{$_});
delete($atr->{$_});
$atr->{$k} = $v;
}
}
push(@dtcontext,$name);
$dtcontextcount{$name}++;
unshift(@dtatributes, $atr);
unshift(@dtattributes, $atr);
$aux = _omniele($xml, $name, _omni($name, $xml, ($tree->getChildnodes())), $atr);
shift(@dtatributes);
shift(@dtattributes);
pop(@dtcontext); $dtcontextcount{$name}--;
} elsif (ref($tree) eq "XML::LibXML::Node") {
if ($tree->nodeType == XML_ENTITY_REF_NODE) {
# if we get here, is because we are not expanding entities (I think)
if ($tree->textContent) {
$aux = $tree->textContent;
} else {
$aux = '&'.$tree->nodeName.';';
}
} else {
print STDERR "Not handled, generic node of type: [",$tree->nodeType,"]\n";
}
} else {
print STDERR "Not handled: [",ref($tree),"]\n";
}
if ($type eq "STR"){ if (defined($aux)) {$r .= $aux} ;}
elsif ($type eq "THE_CHILD" or $type eq "LAST_CHILD"){
$r = $aux unless _whitepc($aux, $name); }
elsif ($type eq "SEQ" or $type eq "ARRAY"){
push(@$r, $aux) unless _whitepc($aux, $name);}
elsif ($type eq "SEQH" or $type eq "ARRAYHASH"){
push(@$r,{"-c" => $aux,
"-q" => $name,
_nodeAttributes($tree)
}) unless _whitepc($aux,$name);
}
elsif($type eq "MMAPON"){
if(not _whitepc($aux,$name)){
if(! $typeargs{$name}) {
warn "duplicated tag '$name'\n" if(defined($r->{$name}));
$r->{$name} = $aux }
else { push(@{$r->{$name}},$aux) unless _whitepc($aux,$name)}}
}
elsif($type eq "MAP" or $type eq "HASH"){
if(not _whitepc($aux,$name)){
warn "duplicated tag '$name'\n" if(defined($r->{$name}));
$r->{$name} = $aux }}
elsif($type eq "MULTIMAP"){
push(@{$r->{$name}},$aux) unless _whitepc($aux,$name)}
elsif($type eq "NONE"){ $r = $aux;}
else { $r="undefined type !!!"}
}
$r;
}
sub _omniele {
my $xml = shift;
my $aux;
($q, $c, $aux) = @_;
%v = %$aux;
if (defined($xml->{-outputenc})) {
for (keys %v){
$v{$_} = _fromUTF8($v{$_}, $xml->{-outputenc})
}
}
if (defined $xml->{$q})
{ &{$xml->{$q}} }
elsif (defined $xml->{'-default'})
{ &{$xml->{'-default'}} }
elsif (defined $xml->{'-tohtml'})
{ tohtml() }
else
{ toxml() }
}
sub xmltree { +{'-c' => $c, '-q' => $q, %v} }
sub tohtml {
my ($q,$v,$c);
if (not @_) {
($q,$v,$c) = ($XML::DT::q, \%XML::DT::v, $XML::DT::c);
} elsif (ref($_[0])) {
$c = shift;
} else {
($q,$v,$c) = @_;
}
if (not ref($c)) {
if ($q eq "-pcdata") {
return $c
} elsif ($q eq "link" || $q eq "br" || $q eq "hr" || $q eq "img") {
return _openTag($q,$v)
} else {
return _openTag($q,$v) . "$c</$q>"
}
}
elsif (ref($c) eq "HASH" && $c->{'-q'} && $c->{'-c'}) {
my %a = %$c;
my ($q,$c) = delete @a{"-q","-c"};
tohtml($q,\%a,(ref($c)?tohtml($c):$c));
}
elsif (ref($c) eq "HASH") {
_openTag($q,$v).
join("",map {($_ ne "-pcdata")
? ( (ref($c->{$_}) eq "ARRAY")
? "<$_>".
join("</$_>\n<$_>", @{$c->{$_}}).
"</$_>\n"
: tohtml($_,{},$c->{$_})."\n" )
: () }
keys %{$c} ) .
"$c->{-pcdata}</$q>" } ######## "NOTYetREady"
elsif (ref($c) eq "ARRAY") {
if (defined($q) && exists($ty{$q}) && $ty{$q} eq "SEQH") {
tohtml($q,$v,join("\n",map {tohtml($_)} @$c))
} elsif (defined $q) {
tohtml($q,$v,join("",@{$c}))
} else {
join("\n",map {(ref($_)?tohtml($_):$_)} @$c)
}
}
}
sub toxml {
my ($q,$v,$c);
if (not @_) {
($q, $v, $c) = ($XML::DT::q, \%XML::DT::v, $XML::DT::c);
} elsif (ref($_[0])) {
$c = shift;
} else {
($q, $v, $c) = @_;
}
if (not ref($c)) {
if ($q eq "-pcdata") {
return $c
} elsif ($c eq "") {
return _emptyTag($q,$v)
} else {
return _openTag($q,$v) . "$c</$q>"
}
}
elsif (ref($c) eq "HASH" && $c->{'-q'} && $c->{'-c'}) {
my %a = %$c;
my ($q,$c) = delete @a{"-q","-c"};
### _openTag($q,\%a).toxml($c).).
### toxml($q,\%a,join("\n",map {toxml($_)} @$c))
toxml($q,\%a,(ref($c)?toxml($c):$c));
}
elsif (ref($c) eq "HASH") {
_openTag($q,$v).
join("",map {($_ ne "-pcdata")
? ( (ref($c->{$_}) eq "ARRAY")
? "<$_>".
join("</$_>\n<$_>", @{$c->{$_}}).
"</$_>\n"
: toxml($_,{},$c->{$_})."\n" )
: () }
keys %{$c} ) .
"$c->{-pcdata}</$q>" } ######## "NOTYetREady"
elsif (ref($c) eq "ARRAY") {
if (defined($q) && exists($ty{$q}) && $ty{$q} eq "SEQH") {
toxml($q,$v,join("\n",map {toxml($_)} @$c))
} elsif (defined $q) {
toxml($q,$v,join("",@{$c}))
} else {
join("\n",map {(ref($_)?toxml($_):$_)} @$c)
}
}
}
sub _openTag{
"<$_[0]". join("",map {" $_=\"$_[1]{$_}\""} keys %{$_[1]} ).">"
}
sub _emptyTag{
"<$_[0]". join("",map {" $_=\"$_[1]{$_}\""} keys %{$_[1]} )."/>"
}
sub mkdtskel_fromDTD {
my $filename = shift;
my $file = ParseDTDFile($filename);
print <<'PERL';
#!/usr/bin/perl
use warnings;
use strict;
use XML::DT;
my $filename = shift;
# Variable Reference
#
# $c - contents after child processing
# $q - element name (tag)
# %v - hash of attributes
my %handler=(
# '-outputenc' => 'ISO-8859-1',
# '-default' => sub{"<$q>$c</$q>"},
PERL
for (sort keys %{$file}) {
print " '$_' => sub { },";
print " # attributes: ",
join(", ", keys %{$file->{$_}{attributes}}) if exists($file->{$_}{attributes});
print "\n";
}
print <<'PERL';
);
print dt($filename, %handler);
PERL
}
lib/XML/DT.pm view on Meta::CPAN
print " '$name' => sub{ }, #";
print " $element{$name} occurrences;";
print ' attributes: ',
join(', ', keys %{$att{$name}}) if $att{$name};
# print " \"\$q:\$c\"\n";
print "\n";
}
print <<'END';
);
print dt($filename, %handler);
END
}
);
my $file = shift(@files);
while($file =~ /^-/){
if ($file eq "-html") {
$HTML = " '-html' => 1,\n";
$mkdtskel{'-html'} = 1;}
elsif($file eq "-latin1") { $mkdtskel{'-inputenc'}='ISO-8859-1';}
else { die("usage mktskel [-html] [-latin1] file \n")}
$file=shift(@files)}
dt($file,%mkdtskel)
}
sub _nodeAttributes {
my $node = shift;
my %answer = ();
my @attrs = $node->getAttributes();
for (@attrs) {
if (ref($_) eq "XML::LibXML::Namespace") {
# TODO: This should not be ignored, I think.
# This sould be converted on a standard attribute with
# key 'namespace' and respective contents
} else {
$answer{$_->getName()} = $_->getValue();
}
}
return %answer;
}
sub mkdtdskel {
my @files = @_;
my $name;
my %att;
my %ele;
my %elel;
my $root;
my %handler=(
'-outputenc' => 'ISO-8859-1',
'-default' => sub{
$elel{$q}++;
$root = $q unless ctxt(1);
$ele{ctxt(1)}{$q} ++;
for(keys(%v)){$att{$q}{$_} ++ } ;
},
'-pcdata' => sub{ if ($c =~ /[^ \t\n]/){ $ele{ctxt(1)}{"#PCDATA"}=1 }},
);
while($files[0] =~ /^-/){
if ($files[0] eq "-html") { $handler{'-html'} = 1;}
elsif($files[0] eq "-latin1") { $handler{'-inputenc'}='ISO-8859-1';}
else { die("usage mkdtdskel [-html] [-latin1] file* \n")}
shift(@files)}
for my $filename (@files){
dt($filename,%handler);
}
print "<!-- DTD $root ... -->\n<!-- (C) ... " . localtime(time) ." -->\n";
delete $elel{$root};
for ($root, keys %elel){
_putele($_, \%ele);
for $name (keys(%{$att{$_}})) {
print( "\t<!-- $name : ... -->\n");
print( "\t<!ATTLIST $_ $name CDATA #IMPLIED >\n");
}
}
}
sub _putele {
my ($e,$ele) = @_;
my @f ;
if ($ele->{$e}) {
@f = keys %{$ele->{$e}};
print "<!ELEMENT $e (", join("|", @f ),")",
(@f >= 1 && $f[0] eq "#PCDATA" ? "" : "*"),
" >\n";
print "<!-- ", join(" | ", (map {"$_=$ele->{$e}{$_}"} @f )), " -->\n";
}
else {
print "<!ELEMENT $e EMPTY >\n";
}
}
sub _whitepc {
$_[1] eq '-pcdata' and $_[0] =~ /^[ \t\r\n]*$/
}
sub MMAPON {
bless([@_],"mmapon")
}
sub _fromUTF8 {
my $string = shift;
my $encode = shift;
my $ans = eval { XML::LibXML::decodeFromUTF8($encode, $string) };
if ($@) {
return $string
} else {
return $ans
}
}
1;
( run in 0.935 second using v1.01-cache-2.11-cpan-39bf76dae61 )