Treex-PML

 view release on metacpan or  search on metacpan

lib/Treex/PML/Instance/Reader.pm  view on Meta::CPAN

use strict;
use warnings;
no warnings qw(recursion);
use Scalar::Util qw(blessed);
use UNIVERSAL::DOES;

use Carp;
use Data::Dumper;

BEGIN {
  our $VERSION = '2.29'; # version template
}
use List::Util qw(first);
use Scalar::Util qw(weaken);
use Treex::PML::Instance::Common qw(:diagnostics :constants);
use Treex::PML::Schema;
use XML::LibXML::Reader;
use Treex::PML::IO qw(open_uri close_uri rename_uri);
use Encode;

use constant {
  XAT_TYPE  => 0,
  XAT_NAME  => 1,
  XAT_VALUE => 1,
  XAT_NS  => 2,
  XAT_ATTRS => 3,
  XAT_CHILDREN => 5,
  XAT_LINE => 4,
};

our $STRICT =1;
our $XTC_FLAGS;
use vars qw( $HAVE_XS );
BEGIN {
  if (!$ENV{PML_COMPILE_NO_XS} && eval {
    require XML::CompactTree::XS;
    import XML::CompactTree::XS;
    $HAVE_XS = 1;
    1;
  }) {
    # print STDERR "Using XML::CompactTree::XS\n" if $HAVE_XS;
    $XTC_FLAGS = XML::CompactTree::XS::XCT_ATTRIBUTE_ARRAY()|
                 XML::CompactTree::XS::XCT_LINE_NUMBERS()|
                 XML::CompactTree::XS::XCT_IGNORE_COMMENTS();
  } else {
    require XML::CompactTree;
    import XML::CompactTree;
    $XTC_FLAGS = XML::CompactTree::XCT_ATTRIBUTE_ARRAY()|
                 XML::CompactTree::XCT_LINE_NUMBERS()|
                 XML::CompactTree::XCT_IGNORE_COMMENTS();
    $HAVE_XS = 0;
  }
}

my (%handlers,%src,
    %handler_cache,@handler_cache,
    %schema_cache,@schema_cache
   );

# TODO:
# - create one handler per cdata+format type
# - test inline schemas

our $CACHE_HANDLERS=1;
our $CACHE_SCHEMAS=1;
our $MAX_SCHEMA_CACHE_SIZE=50;

our $VALIDATE_CDATA=0;
our $VALIDATE_SEQUENCES=1;
our $BUILD_TREES = 1;
our $LOAD_REFFILES = 1;
our $KNIT = 1;

our $READER_OPTS = {
  no_cdata => 1,
  clean_namespaces => 1,
  expand_entities => 1,
  expand_xinclude => 1,
  no_xinclude_nodes => 1,
};

require Treex::PML;

sub _get_handlers_cache_key {
  my ($schema)=@_;
  my $key="$schema"; $key=~s/.*=//; # strip class
  return
    [
      $key,
      join ',',
      $key,
      $VALIDATE_CDATA || 0,
      $VALIDATE_SEQUENCES || 0,
      $BUILD_TREES || 0,
      $LOAD_REFFILES || 0,
      $KNIT || 0,
      $Treex::PML::Node::TYPE,
      $Treex::PML::Node::lbrother,
      $Treex::PML::Node::rbrother,
      $Treex::PML::Node::parent,
      $Treex::PML::Node::firstson,
     ];
}

sub _get_schema_cache_key {
  my ($schema_file)=@_;
  if ((blessed($schema_file) and $schema_file->isa('URI'))) { # assume URI
    if (($schema_file->scheme||'') eq 'file') {
      $schema_file = $schema_file->file
    } else {
      return '0 '.$schema_file;
    }
  }
  if (-f $schema_file) {
    my $mtime = (stat $schema_file)[9];
    return $mtime.' '.$schema_file;
  }
}

sub get_cached_schema {
  my ($schema_file)=@_;
  return unless defined $schema_file;
  my $cached = $schema_cache{$schema_file};
  if ($cached and $schema_cache[-1] ne $schema_file) {
    # move the last retrieved schema to the end of the queue
    @schema_cache = ((grep { $_ ne $schema_file } @schema_cache),$schema_file);
  }
  return $cached;
}

sub cache_schema {
  my ($key,$schema)=@_;
  push @schema_cache,$key;
  $schema_cache{$key} = $schema;
  if (@schema_cache > $MAX_SCHEMA_CACHE_SIZE) {

lib/Treex/PML/Instance/Reader.pm  view on Meta::CPAN

              $reader = XML::LibXML::Reader->new(IO => $fh, URI => $ctxt->{'_filename'});
            }
            $reader->nextElement();
            last;
          }
        } else {
          _debug("failed");
        }
      }
    }
    if (($reader->namespaceURI||'') ne PML_NS) {
      my $f = $ctxt->{'_filename'} || '';
      die("Root element of '$f' isn't in PML namespace: '".($reader->localName()||'')."' ".($reader->namespaceURI()||''))
    }
  }

  $ctxt->{_root} = read_header($ctxt,$reader,$opts);
  my $schema = $ctxt->{'_schema'};
  unless (ref($schema)) {
    die("Instance doesn't provide PML schema!");
  }
  unless (length($schema->{version}||'')) {
    die("PML Schema file ".$ctxt->{'_schema-url'}." does not specify version!");
  }
  if (index(SUPPORTED_PML_VERSIONS," ".$schema->{version}." ")<0) {
    die("Unsupported PML Schema version ".$schema->{version}." in ".$ctxt->{'_schema-url'});
  }

  {
    # preprocess the options selected_references and selected_keys:
    # we map the reffile names to reffile id's
    my $sel_knit = ($ctxt->{_selected_knits} =
                      $opts->{selected_knits});
    my $sel_refs = ($ctxt->{_selected_references} =
                      $opts->{selected_references});
    croak("Treex::PML::Instance->load: selected_knits must be a Hash ref!")
      if defined($sel_knit) && ref($sel_knit) ne 'HASH';
    croak("Treex::PML::Instance->load: selected_references must be a Hash ref!")
      if defined($sel_refs) && ref($sel_refs) ne 'HASH';
    ($ctxt->{'_selected_knits_ids'},
     $ctxt->{'_selected_references_ids'}) = map {
       my $sel = $_;
       my $ret = {
         (defined($sel) ?
           (map {
             my $ids = $ctxt->{'_refnames'}->{$_};
             my $val = $sel->{$_};
             map { $_=>$val }
               defined($ids) ? (ref($ids) ? @$ids : ($ids)) : ()
           } keys %$sel) : ())
        };
       $ret
     } ($sel_knit,$sel_refs);
  }

  $ctxt->read_reffiles({use_resources=>$opts->{use_resources}});
  $ctxt->{'_no_read_trees'} = $opts->{no_trees};
  local $BUILD_TREES = $opts->{no_trees} ? 0 : 1;
  local $LOAD_REFFILES = $opts->{no_references} ? 0 : 1;
  local $KNIT = $opts->{no_knit} ? 0 : $LOAD_REFFILES;
  local $VALIDATE_CDATA =$opts->{validate_cdata} ? 1 : 0;
  local $VALIDATE_SEQUENCES =$opts->{ignore_content_patterns} ? 0 : 1;
  $ctxt->{'_id-hash'}={};

  prepare_handlers($ctxt);
  dump_handlers($ctxt) if $opts->{dump_handlers} or $ENV{PML_COMPILE_DUMP};
  load_data($ctxt,$reader,$opts);
  while ($reader->read) {
    if ($reader->nodeType == XML_READER_TYPE_PROCESSING_INSTRUCTION) {
      push @{$ctxt->{'_pi'}}, [ $reader->name,$reader->value ];
    }
  }

  $handlers{'#initialize'}->($ctxt);
  $ctxt->{_root} = $handlers{'#root'}->($ctxt->{_root});
  };
  ($handlers{'#cleanup'}||sub{})->();
  %handlers=();
  close_uri($fh_to_close) if defined $fh_to_close;
  die $@ if $@;
  $ctxt->{'_parser'} = undef;
  return $ctxt;
}

######################################################
# $ctxt

sub _reader_address {
  my ($ctxt,$reader)=@_;
  my $line_number=$reader->lineNumber;
  return " at ".$ctxt->{'_filename'}." line ".$line_number."\n";
}

sub read_header {
  my ($ctxt,$reader,$opts)=@_;

  # manually extract the root node
  my $root = [XML_READER_TYPE_ELEMENT,
              $reader->localName,
              undef,
             ];
  # read root node attributes
  $root->[XAT_LINE] = 0;
  $root->[XAT_ATTRS] = readAttributes($reader);
  my $found_head = 0;
  while ($reader->read == 1) {
    my $type = $reader->nodeType;
    if ($type == XML_READER_TYPE_TEXT) { # no CDATA
      die "Unexpected content of a root element preceding <head>"._reader_address($ctxt,$reader);
    } elsif ($type == XML_READER_TYPE_ELEMENT) {
      if ($reader->localName eq 'head' and $reader->namespaceURI eq PML_NS) {
        # we have head!
        $found_head = 1;
        last;
      } else {
        die "Unexpected element '".$reader->name."' precedes PML header <head>"._reader_address($ctxt,$reader);
      }
    }
  }
  unless ($found_head) {
    die "Did not find PML <head> element: the document '".$ctxt->{_filename}."' is not a PML instance!";

lib/Treex/PML/Instance/Reader.pm  view on Meta::CPAN

  my ($r)=@_;
  my @attrs;
  my ($prefix,$name);
  if ($r->moveToFirstAttribute==1) {
    do {{
      $prefix = $r->prefix;
      $name = $r->localName;
      push @attrs, ($name,$r->value) unless ($prefix and $prefix eq 'xmlns') or (!$prefix and $name eq 'xmlns');
    }} while ($r->moveToNextAttribute==1);
    $r->moveToElement;
  }
  \@attrs;
}


sub _paste_last_code {
  my ($node,$prev,$p)=@_;
  return qq`
            #$node\->{'$Treex::PML::Node::rbrother'}=undef;
            $prev\->{'$Treex::PML::Node::rbrother'}=$node;
            weaken( $node\->{'$Treex::PML::Node::lbrother'} = $prev );
            weaken( $node\->{'$Treex::PML::Node::parent'} = $p );
`;
}
sub _paste_first_code {
  my ($node,$p)=@_;
  return qq`
           #$node\->{'$Treex::PML::Node::rbrother'}=undef;
           #$node\->{'$Treex::PML::Node::lbrother'}=undef;
           $p\->{'$Treex::PML::Node::firstson'}=$node;
           weaken( $node\->{'$Treex::PML::Node::parent'} = $p );
`;
}

sub hash_id_code {
  my ($key,$value)=@_;
  return q`
        for (`.$key.q`) {
          if (defined and length) {
            if (exists($ID_HASH->{$ID_PREFIX.$_}) and
                $ID_HASH->{$ID_PREFIX.$_} != `.$value.q`) {
               warn("Duplicated ID '$_'");
            }
            weaken( $ID_HASH->{$ID_PREFIX.$_} = `.$value.q` );
          }
        }`
}

sub _fix_id_member {
  my ($decl)=@_;
  return unless $decl;
  my ($idM) = $decl->find_members_by_role('#ID');
  if ($idM) {
    # what follows is a hack fixing buggy PDT 2.0 schemas
    my $cdecl = $idM->get_content_decl(1); # no_resolve
    if ($cdecl and $cdecl->get_decl_type == PML_CDATA_DECL and $cdecl->get_format eq 'ID') {
      $cdecl->set_format('PMLREF');
    } elsif ($cdecl = $idM->get_content_decl()) {
      if ($cdecl and $cdecl->get_decl_type == PML_CDATA_DECL and $cdecl->get_format eq 'ID') {
          warn "Trying to knit object of type '".$decl->get_decl_path."' which has an #ID-attribute ".
            "'".$idM->get_name."' declared as <cdata format=\"ID\"/>. ".
              "Note that the data-type for #ID-attributes in objects knitted as DOM should be ".
                "<cdata format=\"PMLREF\"/> (Hint: redeclare with <derive> for imported types).";
      }
    }
  }
  return $idM;
}

sub knit_code {
  my ($decl,$assign,$fail)=@_;
  my $sub = q`
             if ($ref) {
               $ref =~ s/^(?:(.*?)\#)//;
               my $file_id = $1||'';
               my $do_knit=$selected_knits->{$file_id};
               unless (defined($do_knit) and $do_knit==0) {
                 my $target;
                 if (length $file_id) {
                   my $f = $parsed_reffile->{ $file_id };
                   if (ref $f) {
                     if (UNIVERSAL::DOES::does($f,'Treex::PML::Instance')) {
                       $target = $f->{'_id-hash'}->{$ref};
                       $target->{'#knit_prefix'}=$file_id;
                     } else { # DOM`;
  if ($decl) {
    my $idM = _fix_id_member($decl);
    my $idM_name = $idM && $idM->get_name;
    my $decl_path = $decl->get_decl_path; $decl_path =~ s/^!//;
    $sub .= q`
                       my $dom_node = $ref_index->{$file_id}{$ref} || $f->getElementsById($ref);
                       if (defined $dom_node) {
                         $target  = $ID_HASH->{$ID_PREFIX.$file_id.'#'.$ref};
                         if (!defined $target) {
                            my $p = $ID_PREFIX;
                            $ID_PREFIX.=$file_id.'#';
                            my $r = XML::LibXML::Reader->new(string=>'<f xmlns="`.PML_NS.q`">'.$dom_node->toString.'</f>');
                            $r->nextElement;
                            # print $r, $dom_node->toString,"\n";
                            my %ns;
                            my $tree = XML::CompactTree`.($HAVE_XS ? '::XS' : '').q`::readSubtreeToPerl($r,`.$XTC_FLAGS.q`,\%ns);
                            my $index = $pml_ns_index;
                            $pml_ns_index = $ns{'`.PML_NS.q`'} || -1;
                            # print "index: $pml_ns_index\n";
                            # print Dumper($tree->[0][XAT_CHILDREN][0]);
                            $target = $handlers{'`.$decl_path.q`'}->($tree->[XAT_CHILDREN][0]);`;
    if ($idM) {
      $sub .= q`
                            $target->{`.$idM_name.q`}=$file_id.'#'.$target->{`.$idM_name.q`} if $target;`;
    }
    $sub .= q`
                            $pml_ns_index = $index;
                            $weaken=0;
                            $ID_PREFIX=$p;
                         }
                        }`;
  } else {
    $sub .= q`
                        warn("DOM knit error: knit content type not declared in the schema!\n");`;
  }
  $sub.=q`
                     }
                   } else {

lib/Treex/PML/Instance/Reader.pm  view on Meta::CPAN

          unless ($have_trees) {
            $have_trees = 1;
            $ctxt->{'_pml_trees_type'} = $trees_type;
            $ctxt->{'_trees'} = Treex::PML::Factory->createList(\@list,1);
            return;
          }`;
      }
      $sub .= q`
          return Treex::PML::Factory->createList(\@list,1);
        }`;
      # print $sub;
      $src{$src}=$sub;
      $handlers{$path} = eval $sub; die _nl($sub)."\n".$@.' ' if $@;
    } elsif ($decl_type == PML_ALT_DECL) {
      # print $path."\t@".$decl->get_decl_type_str,"\n";
      my $cpath = $decl->get_content_decl->get_decl_path;
      $cpath=~s/^!//;
      my $src = $schema_name.'__generated_read_alt@'.$path;
      $src=~y{/}{@};
      my $sub = q`#line 0 ".pml_compile.d/`.$src.q`"
       sub {
          my ($p)=@_;
          my $c=$p->[XAT_CHILDREN];
          my $a=$p->[XAT_ATTRS];
          return undef unless $c and @$c or $a and @$a;
          my $singleton = $a && @$a ? 1 : 0;
          unless ($singleton) {
            for my $el (@$c) {
              if (!ref($el) or $el->[XAT_TYPE] == XML_READER_TYPE_TEXT or $el->[XAT_TYPE] == XML_READER_TYPE_CDATA) {
                $singleton = 1;
                last;
              } elsif ($el->[XAT_TYPE] == XML_READER_TYPE_ELEMENT and $el->[XAT_NS] == $pml_ns_index) {
                $singleton = 1 if $el->[XAT_NAME] ne 'AM';
                last;
              }
            }
          }
          if ($singleton) {
            return $handlers{ '`.$cpath.q`' }->($p);
          } else {
            my @alt;
            for my $el (@$c) {
              if (ref($el) and $el->[XAT_TYPE] == XML_READER_TYPE_ELEMENT and $el->[XAT_NS] == $pml_ns_index) {
                $el->[XAT_NAME] eq 'AM' or _report_error(q(Unexpected non-AM element ').$el->[XAT_NAME].q(' in an alt: '`.$path.q`' at ).$pml_file.' line '.$p->[XAT_LINE]);
                push @alt, $handlers{ '`.$cpath.q`' }->($el);
              } elsif (!ref($el) or $el->[XAT_TYPE] == XML_READER_TYPE_TEXT or $el->[XAT_TYPE] == XML_READER_TYPE_CDATA) {
                 _report_error(q(Unexpected text content in an alt: '`.$path.q`' at ).$pml_file.' line '.$p->[XAT_LINE]);
              }
            }
            return @alt == 0 ? undef : @alt == 1 ? $alt[0] :
               #return bless \@alt, 'Treex::PML::Alt';
               Treex::PML::Factory->createAlt(\@alt,1);
          }
       }
    `;
      # print $sub;
      $src{$src}=$sub;
      $handlers{$path} = eval $sub; die _nl($sub)."\n".$@.' ' if $@;

    } elsif ($decl_type == PML_CDATA_DECL) {
      my $src = $schema_name.'__generated_read_cdata@'.$path;
      $src=~y{/}{@};
      my $sub = q`#line 0 ".pml_compile.d/`.$src.q`"
         sub {
            my ($p)=@_;
            my $text;
            if (ref($p)) {
              my $c = $p->[XAT_CHILDREN];
              return undef unless $c and @$c;
              my $type;
              $text = join '',
              map {
                if (ref($_)) {
                  $type = $_->[XAT_TYPE];
                  if ($type == XML_READER_TYPE_TEXT ||
                      $type == XML_READER_TYPE_WHITESPACE ||
                      $type == XML_READER_TYPE_SIGNIFICANT_WHITESPACE ||
                      $type == XML_READER_TYPE_CDATA) {
                    $_->[XAT_VALUE]
                  } elsif ($type == XML_READER_TYPE_ELEMENT) {
                     _report_error(q(Element found where only character data were expected in element <).$_->[XAT_NAME].q(> of CDATA type '`.$path.q`' at ).$pml_file.' line '.$p->[XAT_LINE]);
                  }
                } else {
                  $_
                }
              } @$c;`;
      my $format_checker;
      if ($VALIDATE_CDATA and $decl->get_format ne 'any') {
        $sub .=q`
            } else {
              $text = $p;
            }`;
        $format_checker = $decl->_get_format_checker();
        if (defined $format_checker) {
          if (ref($format_checker) eq 'CODE') {
            $sub .= q`
            if (defined $text and length $text and !$format_checker->($text)) {`;
          } else {
            $sub .= q`
            if (defined $text and length $text and $text !~ $format_checker) {`;
          }
          $sub .= q`
              warn("CDATA value '$text' does not conform to format '`.$decl->get_format.q`' at ".$pml_file.' line '.$p->[XAT_LINE]);
            }`;
        }
        $sub .= q`
            return $text;
          }`;
      } else {
        $sub .=q`
              return $text;
            } else {
              return $p;
            }
          }`;
      }
      #    print $sub;
      $src{$src}=$sub;
      $handlers{$path} = eval $sub; die _nl($sub)."\n".$@.' ' if $@;
    } elsif ($decl_type == PML_CHOICE_DECL) {
      #    print $path,"\n";



( run in 0.952 second using v1.01-cache-2.11-cpan-524268b4103 )