PMLTQ-PML2BASE

 view release on metacpan or  search on metacpan

lib/PMLTQ/PML2BASE.pm  view on Meta::CPAN

package PMLTQ::PML2BASE;
our $AUTHORITY = 'cpan:MATY';
$PMLTQ::PML2BASE::VERSION = '3.0.1';
# ABSTRACT: Convert from PML to SQL


use 5.006;
use strict;
use warnings;
$|=1;
use Data::Dumper;

use open qw(:std :utf8);
use Carp;

# $SIG{__WARN__} = sub { Carp::cluck(@_); };

# NOTE(review): presumably an upper bound on the length of generated SQL
# identifiers - confirm against the code that uses it.
use constant MAX_NAME_LENGTH => 16;

# Table-layout option values. They are distinct powers of two, so they look
# like bit flags intended to be OR-ed together - TODO confirm against usage.
use constant HYBRID=>1;        # create both separate node table and node table with attributes
use constant NO_TREE_TABLE=>2; # don't create one common node table for all node types
use constant MAX_MIN_ORD=>4;   # node tables have #max_ord and #min_ord columns
use constant TOP_TREE_FLAG=>8; # The <root>__#files table has a 'top' column indicating that
                               # a given root node belongs to the top-level tree list
                               # of a file (i.e. it is not a nested #NODE within
                               # some non-#NODE and non-#TREES data structure)
#
# Generic SQL DB scheme for PML data:
#
# - every structure/container has a unique idx (Number) and carries attributes as columns
#
# - container has a '#content' column
#
# - cdata/constant/choice attributes are stored in the respective columns
#
# - structure/container members are in a separate table where they have a unique idx,
#   referred to by the member column
#
# - unordered-list/alt members are stored in a separate table, whose columns
#   are a 1:N idx referred to by the member column, and a LM/AM column
#   containing the value (following the rules described here);
#
# - sequence members are in a separate table whose columns correspond
#   to the elements, each containing a reference to a table
#   representing all occurrences of that element; this table has
#   a 1:N idx, '#pos' containing the position of the element in the sequence,
#   '#elem-pos' containing the number of preceding elements of the same name in the sequence+1,
#   and the value, as usual. To retrieve the complete content of a certain sequence as a table,
#   one has to use a UNION on all the element tables, ordering by #pos and possibly outputting
#   a constant '#name' column
#
# - the table names should be derived from PML type names in a canonical way,
#   one per PML schema type decl
#
# Possible modifications:
#
# a) cast all cdata structure members also into separate tables and thus
#    keep varchar data in separate tables
#
# b) cast node attributes to a separate table, separating the tree structure
#    from node data, making the tree-structure table very thin
#    in fact: this is necessary, since nodes can be of different types
#
# c) use cdata format information to determine the table column format
#
#  some hacks:
#
#  - updating refs strips the .rf suffix if the member is a PMLREF, but
#    not if it is a list of PMLREFs
#
#  - prefix of PMLREFs is stripped down; should be kept and
#    used to verify the target based on filename on UPDATE
#
#

use Treex::PML::Schema;
use Treex::PML::Instance;
use PMLTQ::Common;
use PMLTQ::Relation;
use List::Util qw(max first);
use Cwd;
use Carp;

# Invoked once, when this module is compiled/loaded.
# NOTE(review): presumably registers the available relation implementations;
# the exact effect is defined in PMLTQ::Relation - confirm there.
PMLTQ::Relation->load();

# File-scoped state shared by the subs of this module.
#  - init() resets $idx, $node_idx, $last_type_no, $index_id, $tree_no,
#    $filename, $relations, @dump_fns and %pmlref_target_info.
#  - destroy() undefines (at least) $file_table, $root_name, $schema,
#    $references_table, %schema, %fh, %orig_name and %seen_ref_schema.
# Being lexical ('my'), none of these are reachable from outside this file.
my ($file_table,$root_name,$schema,$references_table,%schema,%fh,%orig_name,%seen_ref_schema,
    $node_table,$index_id,$last_type_no,$tree_no,$filename,
    $idx,$node_idx, $last_tree_no, %pmlref_target_info, $relations, @dump_fns
   );

# Conversion options. Declared with 'our', so the hash is reachable from
# outside as %PMLTQ::PML2BASE::opts; init() fills in defaults for the keys
# it reads ('init-idx', 'init-node-idx', 'related-schema', other_schemas,
# hybrid, prefix, 'data-dir').
our %opts;

# Reset the module-level conversion state and fill in default options.
#
# Options read from %opts:
#   'init-idx'       - starting value for $idx (default 0)
#   'init-node-idx'  - starting value for $node_idx (falls back to
#                      'init-idx', then to 0)
#   'related-schema' - array ref of schema file names (default: empty list)
#   other_schemas    - array ref of schema objects; when not supplied it is
#                      built by loading every 'related-schema' file with
#                      Treex::PML::Schema (use_resources => 1)
#   hybrid           - defaults to 1
#   prefix           - defaults to ''
#   'data-dir'       - defaults to ''
#
# All defaults are applied with a boolean test, so false values such as
# 0 or '' count as "not set" and are replaced by the default.
#
# Returns the resulting value of $opts{'data-dir'}.
sub init {
  # Starting indexes; the node index falls back to the generic one.
  my $first_idx = $opts{'init-idx'} || 0;
  $idx      = $first_idx;
  $node_idx = $opts{'init-node-idx'} || $first_idx;

  # Counters and per-run bookkeeping start from scratch.
  ($last_type_no, $index_id, $tree_no) = (0, 0, 0);
  undef $filename;
  $relations          = [];
  @dump_fns           = ();
  %pmlref_target_info = ();

  # Make sure the list of related schema files exists before it is walked.
  $opts{'related-schema'} = [] unless $opts{'related-schema'};

  # Load the related schemas only when the caller did not provide them.
  unless ($opts{other_schemas}) {
    my @loaded;
    for my $schema_file (@{ $opts{'related-schema'} }) {
      push @loaded,
        Treex::PML::Schema->new({filename=>$schema_file,use_resources=>1});
    }
    $opts{other_schemas} = \@loaded;
  }

  # Remaining simple defaults.
  my %default_for = (hybrid => 1, prefix => '', 'data-dir' => '');
  for my $key (qw(hybrid prefix data-dir)) {
    $opts{$key} ||= $default_for{$key};
  }

  return $opts{'data-dir'};
}

sub destroy {
  undef $file_table;
  undef $root_name;
  undef $schema;
  undef $references_table;
  undef %schema;
  undef %fh;
  undef %orig_name;
  undef %seen_ref_schema;



( run in 0.883 second using v1.01-cache-2.11-cpan-df04353d9ac )