Treex-PML
view release on metacpan or search on metacpan
lib/Treex/PML/Instance/Reader.pm view on Meta::CPAN
use strict;
use warnings;
no warnings qw(recursion);
use Scalar::Util qw(blessed);
use UNIVERSAL::DOES;
use Carp;
use Data::Dumper;
BEGIN {
our $VERSION = '2.29'; # version template
}
use List::Util qw(first);
use Scalar::Util qw(weaken);
use Treex::PML::Instance::Common qw(:diagnostics :constants);
use Treex::PML::Schema;
use XML::LibXML::Reader;
use Treex::PML::IO qw(open_uri close_uri rename_uri);
use Encode;
use constant {
XAT_TYPE => 0,
XAT_NAME => 1,
XAT_VALUE => 1,
XAT_NS => 2,
XAT_ATTRS => 3,
XAT_CHILDREN => 5,
XAT_LINE => 4,
};
our $STRICT =1;
our $XTC_FLAGS;
use vars qw( $HAVE_XS );
BEGIN {
if (!$ENV{PML_COMPILE_NO_XS} && eval {
require XML::CompactTree::XS;
import XML::CompactTree::XS;
$HAVE_XS = 1;
1;
}) {
# print STDERR "Using XML::CompactTree::XS\n" if $HAVE_XS;
$XTC_FLAGS = XML::CompactTree::XS::XCT_ATTRIBUTE_ARRAY()|
XML::CompactTree::XS::XCT_LINE_NUMBERS()|
XML::CompactTree::XS::XCT_IGNORE_COMMENTS();
} else {
require XML::CompactTree;
import XML::CompactTree;
$XTC_FLAGS = XML::CompactTree::XCT_ATTRIBUTE_ARRAY()|
XML::CompactTree::XCT_LINE_NUMBERS()|
XML::CompactTree::XCT_IGNORE_COMMENTS();
$HAVE_XS = 0;
}
}
my (%handlers,%src,
%handler_cache,@handler_cache,
%schema_cache,@schema_cache
);
# TODO:
# - create one handler per cdata+format type
# - test inline schemas
our $CACHE_HANDLERS=1;
our $CACHE_SCHEMAS=1;
our $MAX_SCHEMA_CACHE_SIZE=50;
our $VALIDATE_CDATA=0;
our $VALIDATE_SEQUENCES=1;
our $BUILD_TREES = 1;
our $LOAD_REFFILES = 1;
our $KNIT = 1;
our $READER_OPTS = {
no_cdata => 1,
clean_namespaces => 1,
expand_entities => 1,
expand_xinclude => 1,
no_xinclude_nodes => 1,
};
require Treex::PML;
sub _get_handlers_cache_key {
my ($schema)=@_;
my $key="$schema"; $key=~s/.*=//; # strip class
return
[
$key,
join ',',
$key,
$VALIDATE_CDATA || 0,
$VALIDATE_SEQUENCES || 0,
$BUILD_TREES || 0,
$LOAD_REFFILES || 0,
$KNIT || 0,
$Treex::PML::Node::TYPE,
$Treex::PML::Node::lbrother,
$Treex::PML::Node::rbrother,
$Treex::PML::Node::parent,
$Treex::PML::Node::firstson,
];
}
sub _get_schema_cache_key {
my ($schema_file)=@_;
if ((blessed($schema_file) and $schema_file->isa('URI'))) { # assume URI
if (($schema_file->scheme||'') eq 'file') {
$schema_file = $schema_file->file
} else {
return '0 '.$schema_file;
}
}
if (-f $schema_file) {
my $mtime = (stat $schema_file)[9];
return $mtime.' '.$schema_file;
}
}
sub get_cached_schema {
my ($schema_file)=@_;
return unless defined $schema_file;
my $cached = $schema_cache{$schema_file};
if ($cached and $schema_cache[-1] ne $schema_file) {
# move the last retrieved schema to the end of the queue
@schema_cache = ((grep { $_ ne $schema_file } @schema_cache),$schema_file);
}
return $cached;
}
sub cache_schema {
my ($key,$schema)=@_;
push @schema_cache,$key;
$schema_cache{$key} = $schema;
if (@schema_cache > $MAX_SCHEMA_CACHE_SIZE) {
lib/Treex/PML/Instance/Reader.pm view on Meta::CPAN
$reader = XML::LibXML::Reader->new(IO => $fh, URI => $ctxt->{'_filename'});
}
$reader->nextElement();
last;
}
} else {
_debug("failed");
}
}
}
if (($reader->namespaceURI||'') ne PML_NS) {
my $f = $ctxt->{'_filename'} || '';
die("Root element of '$f' isn't in PML namespace: '".($reader->localName()||'')."' ".($reader->namespaceURI()||''))
}
}
$ctxt->{_root} = read_header($ctxt,$reader,$opts);
my $schema = $ctxt->{'_schema'};
unless (ref($schema)) {
die("Instance doesn't provide PML schema!");
}
unless (length($schema->{version}||'')) {
die("PML Schema file ".$ctxt->{'_schema-url'}." does not specify version!");
}
if (index(SUPPORTED_PML_VERSIONS," ".$schema->{version}." ")<0) {
die("Unsupported PML Schema version ".$schema->{version}." in ".$ctxt->{'_schema-url'});
}
{
# preprocess the options selected_references and selected_keys:
# we map the reffile names to reffile id's
my $sel_knit = ($ctxt->{_selected_knits} =
$opts->{selected_knits});
my $sel_refs = ($ctxt->{_selected_references} =
$opts->{selected_references});
croak("Treex::PML::Instance->load: selected_knits must be a Hash ref!")
if defined($sel_knit) && ref($sel_knit) ne 'HASH';
croak("Treex::PML::Instance->load: selected_references must be a Hash ref!")
if defined($sel_refs) && ref($sel_refs) ne 'HASH';
($ctxt->{'_selected_knits_ids'},
$ctxt->{'_selected_references_ids'}) = map {
my $sel = $_;
my $ret = {
(defined($sel) ?
(map {
my $ids = $ctxt->{'_refnames'}->{$_};
my $val = $sel->{$_};
map { $_=>$val }
defined($ids) ? (ref($ids) ? @$ids : ($ids)) : ()
} keys %$sel) : ())
};
$ret
} ($sel_knit,$sel_refs);
}
$ctxt->read_reffiles({use_resources=>$opts->{use_resources}});
$ctxt->{'_no_read_trees'} = $opts->{no_trees};
local $BUILD_TREES = $opts->{no_trees} ? 0 : 1;
local $LOAD_REFFILES = $opts->{no_references} ? 0 : 1;
local $KNIT = $opts->{no_knit} ? 0 : $LOAD_REFFILES;
local $VALIDATE_CDATA =$opts->{validate_cdata} ? 1 : 0;
local $VALIDATE_SEQUENCES =$opts->{ignore_content_patterns} ? 0 : 1;
$ctxt->{'_id-hash'}={};
prepare_handlers($ctxt);
dump_handlers($ctxt) if $opts->{dump_handlers} or $ENV{PML_COMPILE_DUMP};
load_data($ctxt,$reader,$opts);
while ($reader->read) {
if ($reader->nodeType == XML_READER_TYPE_PROCESSING_INSTRUCTION) {
push @{$ctxt->{'_pi'}}, [ $reader->name,$reader->value ];
}
}
$handlers{'#initialize'}->($ctxt);
$ctxt->{_root} = $handlers{'#root'}->($ctxt->{_root});
};
($handlers{'#cleanup'}||sub{})->();
%handlers=();
close_uri($fh_to_close) if defined $fh_to_close;
die $@ if $@;
$ctxt->{'_parser'} = undef;
return $ctxt;
}
######################################################
# $ctxt
sub _reader_address {
my ($ctxt,$reader)=@_;
my $line_number=$reader->lineNumber;
return " at ".$ctxt->{'_filename'}." line ".$line_number."\n";
}
sub read_header {
my ($ctxt,$reader,$opts)=@_;
# manually extract the root node
my $root = [XML_READER_TYPE_ELEMENT,
$reader->localName,
undef,
];
# read root node attributes
$root->[XAT_LINE] = 0;
$root->[XAT_ATTRS] = readAttributes($reader);
my $found_head = 0;
while ($reader->read == 1) {
my $type = $reader->nodeType;
if ($type == XML_READER_TYPE_TEXT) { # no CDATA
die "Unexpected content of a root element preceding <head>"._reader_address($ctxt,$reader);
} elsif ($type == XML_READER_TYPE_ELEMENT) {
if ($reader->localName eq 'head' and $reader->namespaceURI eq PML_NS) {
# we have head!
$found_head = 1;
last;
} else {
die "Unexpected element '".$reader->name."' precedes PML header <head>"._reader_address($ctxt,$reader);
}
}
}
unless ($found_head) {
die "Did not find PML <head> element: the document '".$ctxt->{_filename}."' is not a PML instance!";
lib/Treex/PML/Instance/Reader.pm view on Meta::CPAN
my ($r)=@_;
my @attrs;
my ($prefix,$name);
if ($r->moveToFirstAttribute==1) {
do {{
$prefix = $r->prefix;
$name = $r->localName;
push @attrs, ($name,$r->value) unless ($prefix and $prefix eq 'xmlns') or (!$prefix and $name eq 'xmlns');
}} while ($r->moveToNextAttribute==1);
$r->moveToElement;
}
\@attrs;
}
sub _paste_last_code {
my ($node,$prev,$p)=@_;
return qq`
#$node\->{'$Treex::PML::Node::rbrother'}=undef;
$prev\->{'$Treex::PML::Node::rbrother'}=$node;
weaken( $node\->{'$Treex::PML::Node::lbrother'} = $prev );
weaken( $node\->{'$Treex::PML::Node::parent'} = $p );
`;
}
sub _paste_first_code {
my ($node,$p)=@_;
return qq`
#$node\->{'$Treex::PML::Node::rbrother'}=undef;
#$node\->{'$Treex::PML::Node::lbrother'}=undef;
$p\->{'$Treex::PML::Node::firstson'}=$node;
weaken( $node\->{'$Treex::PML::Node::parent'} = $p );
`;
}
sub hash_id_code {
my ($key,$value)=@_;
return q`
for (`.$key.q`) {
if (defined and length) {
if (exists($ID_HASH->{$ID_PREFIX.$_}) and
$ID_HASH->{$ID_PREFIX.$_} != `.$value.q`) {
warn("Duplicated ID '$_'");
}
weaken( $ID_HASH->{$ID_PREFIX.$_} = `.$value.q` );
}
}`
}
sub _fix_id_member {
my ($decl)=@_;
return unless $decl;
my ($idM) = $decl->find_members_by_role('#ID');
if ($idM) {
# what follows is a hack fixing buggy PDT 2.0 schemas
my $cdecl = $idM->get_content_decl(1); # no_resolve
if ($cdecl and $cdecl->get_decl_type == PML_CDATA_DECL and $cdecl->get_format eq 'ID') {
$cdecl->set_format('PMLREF');
} elsif ($cdecl = $idM->get_content_decl()) {
if ($cdecl and $cdecl->get_decl_type == PML_CDATA_DECL and $cdecl->get_format eq 'ID') {
warn "Trying to knit object of type '".$decl->get_decl_path."' which has an #ID-attribute ".
"'".$idM->get_name."' declared as <cdata format=\"ID\"/>. ".
"Note that the data-type for #ID-attributes in objects knitted as DOM should be ".
"<cdata format=\"PMLREF\"/> (Hint: redeclare with <derive> for imported types).";
}
}
}
return $idM;
}
sub knit_code {
my ($decl,$assign,$fail)=@_;
my $sub = q`
if ($ref) {
$ref =~ s/^(?:(.*?)\#)//;
my $file_id = $1||'';
my $do_knit=$selected_knits->{$file_id};
unless (defined($do_knit) and $do_knit==0) {
my $target;
if (length $file_id) {
my $f = $parsed_reffile->{ $file_id };
if (ref $f) {
if (UNIVERSAL::DOES::does($f,'Treex::PML::Instance')) {
$target = $f->{'_id-hash'}->{$ref};
$target->{'#knit_prefix'}=$file_id;
} else { # DOM`;
if ($decl) {
my $idM = _fix_id_member($decl);
my $idM_name = $idM && $idM->get_name;
my $decl_path = $decl->get_decl_path; $decl_path =~ s/^!//;
$sub .= q`
my $dom_node = $ref_index->{$file_id}{$ref} || $f->getElementsById($ref);
if (defined $dom_node) {
$target = $ID_HASH->{$ID_PREFIX.$file_id.'#'.$ref};
if (!defined $target) {
my $p = $ID_PREFIX;
$ID_PREFIX.=$file_id.'#';
my $r = XML::LibXML::Reader->new(string=>'<f xmlns="`.PML_NS.q`">'.$dom_node->toString.'</f>');
$r->nextElement;
# print $r, $dom_node->toString,"\n";
my %ns;
my $tree = XML::CompactTree`.($HAVE_XS ? '::XS' : '').q`::readSubtreeToPerl($r,`.$XTC_FLAGS.q`,\%ns);
my $index = $pml_ns_index;
$pml_ns_index = $ns{'`.PML_NS.q`'} || -1;
# print "index: $pml_ns_index\n";
# print Dumper($tree->[0][XAT_CHILDREN][0]);
$target = $handlers{'`.$decl_path.q`'}->($tree->[XAT_CHILDREN][0]);`;
if ($idM) {
$sub .= q`
$target->{`.$idM_name.q`}=$file_id.'#'.$target->{`.$idM_name.q`} if $target;`;
}
$sub .= q`
$pml_ns_index = $index;
$weaken=0;
$ID_PREFIX=$p;
}
}`;
} else {
$sub .= q`
warn("DOM knit error: knit content type not declared in the schema!\n");`;
}
$sub.=q`
}
} else {
lib/Treex/PML/Instance/Reader.pm view on Meta::CPAN
unless ($have_trees) {
$have_trees = 1;
$ctxt->{'_pml_trees_type'} = $trees_type;
$ctxt->{'_trees'} = Treex::PML::Factory->createList(\@list,1);
return;
}`;
}
$sub .= q`
return Treex::PML::Factory->createList(\@list,1);
}`;
# print $sub;
$src{$src}=$sub;
$handlers{$path} = eval $sub; die _nl($sub)."\n".$@.' ' if $@;
} elsif ($decl_type == PML_ALT_DECL) {
# print $path."\t@".$decl->get_decl_type_str,"\n";
my $cpath = $decl->get_content_decl->get_decl_path;
$cpath=~s/^!//;
my $src = $schema_name.'__generated_read_alt@'.$path;
$src=~y{/}{@};
my $sub = q`#line 0 ".pml_compile.d/`.$src.q`"
sub {
my ($p)=@_;
my $c=$p->[XAT_CHILDREN];
my $a=$p->[XAT_ATTRS];
return undef unless $c and @$c or $a and @$a;
my $singleton = $a && @$a ? 1 : 0;
unless ($singleton) {
for my $el (@$c) {
if (!ref($el) or $el->[XAT_TYPE] == XML_READER_TYPE_TEXT or $el->[XAT_TYPE] == XML_READER_TYPE_CDATA) {
$singleton = 1;
last;
} elsif ($el->[XAT_TYPE] == XML_READER_TYPE_ELEMENT and $el->[XAT_NS] == $pml_ns_index) {
$singleton = 1 if $el->[XAT_NAME] ne 'AM';
last;
}
}
}
if ($singleton) {
return $handlers{ '`.$cpath.q`' }->($p);
} else {
my @alt;
for my $el (@$c) {
if (ref($el) and $el->[XAT_TYPE] == XML_READER_TYPE_ELEMENT and $el->[XAT_NS] == $pml_ns_index) {
$el->[XAT_NAME] eq 'AM' or _report_error(q(Unexpected non-AM element ').$el->[XAT_NAME].q(' in an alt: '`.$path.q`' at ).$pml_file.' line '.$p->[XAT_LINE]);
push @alt, $handlers{ '`.$cpath.q`' }->($el);
} elsif (!ref($el) or $el->[XAT_TYPE] == XML_READER_TYPE_TEXT or $el->[XAT_TYPE] == XML_READER_TYPE_CDATA) {
_report_error(q(Unexpected text content in an alt: '`.$path.q`' at ).$pml_file.' line '.$p->[XAT_LINE]);
}
}
return @alt == 0 ? undef : @alt == 1 ? $alt[0] :
#return bless \@alt, 'Treex::PML::Alt';
Treex::PML::Factory->createAlt(\@alt,1);
}
}
`;
# print $sub;
$src{$src}=$sub;
$handlers{$path} = eval $sub; die _nl($sub)."\n".$@.' ' if $@;
} elsif ($decl_type == PML_CDATA_DECL) {
my $src = $schema_name.'__generated_read_cdata@'.$path;
$src=~y{/}{@};
my $sub = q`#line 0 ".pml_compile.d/`.$src.q`"
sub {
my ($p)=@_;
my $text;
if (ref($p)) {
my $c = $p->[XAT_CHILDREN];
return undef unless $c and @$c;
my $type;
$text = join '',
map {
if (ref($_)) {
$type = $_->[XAT_TYPE];
if ($type == XML_READER_TYPE_TEXT ||
$type == XML_READER_TYPE_WHITESPACE ||
$type == XML_READER_TYPE_SIGNIFICANT_WHITESPACE ||
$type == XML_READER_TYPE_CDATA) {
$_->[XAT_VALUE]
} elsif ($type == XML_READER_TYPE_ELEMENT) {
_report_error(q(Element found where only character data were expected in element <).$_->[XAT_NAME].q(> of CDATA type '`.$path.q`' at ).$pml_file.' line '.$p->[XAT_LINE]);
}
} else {
$_
}
} @$c;`;
my $format_checker;
if ($VALIDATE_CDATA and $decl->get_format ne 'any') {
$sub .=q`
} else {
$text = $p;
}`;
$format_checker = $decl->_get_format_checker();
if (defined $format_checker) {
if (ref($format_checker) eq 'CODE') {
$sub .= q`
if (defined $text and length $text and !$format_checker->($text)) {`;
} else {
$sub .= q`
if (defined $text and length $text and $text !~ $format_checker) {`;
}
$sub .= q`
warn("CDATA value '$text' does not conform to format '`.$decl->get_format.q`' at ".$pml_file.' line '.$p->[XAT_LINE]);
}`;
}
$sub .= q`
return $text;
}`;
} else {
$sub .=q`
return $text;
} else {
return $p;
}
}`;
}
# print $sub;
$src{$src}=$sub;
$handlers{$path} = eval $sub; die _nl($sub)."\n".$@.' ' if $@;
} elsif ($decl_type == PML_CHOICE_DECL) {
# print $path,"\n";
( run in 0.952 second using v1.01-cache-2.11-cpan-524268b4103 )