Alvis-NLPPlatform

 view release on metacpan or  search on metacpan

lib/Alvis/NLPPlatform/Document.pm  view on Meta::CPAN

package Alvis::NLPPlatform::Document;

use strict;
use warnings;

use Lingua::Identify;

use Data::Dumper;

# The module version is copied from the parent package's $VERSION.
# NOTE(review): nothing visible in this part of the file loads
# Alvis::NLPPlatform, so this is presumably undef unless the caller has
# already loaded that package -- confirm.
our $VERSION=$Alvis::NLPPlatform::VERSION;

# use YAML qw( Dump );

# getnamespace($file)
#
# Scan $file line by line and return the value captured from an
# "xmlns=" declaration found in it.
#
# Parameters:
#   $file - path of the file to scan.
#
# Returns:
#   The captured string, or undef when $file is undefined, cannot be
#   opened for reading, or contains no "xmlns=" declaration.  When several
#   lines carry a declaration, the value from the last matching line is
#   returned (the whole file is always read).
#
# Failures to open the file are reported with warn() and are not fatal.
sub getnamespace
{
    my $file = shift;

    my $xmlns = undef;

    # A three-argument open on an undefined name would silently open an
    # anonymous temporary file, so reject that case explicitly.
    if (!defined($file)) {
        warn "getnamespace: no file name given\n";
        return($xmlns);
    }

    # Three-argument open with an explicit read mode: characters such as
    # '>' or '|' in $file are treated as part of the name instead of being
    # interpreted as an open mode.  A lexical handle avoids clashing with
    # any other user of a global FILE handle.
    my $fh;
    if (!open($fh, '<', $file)) {
        warn "getnamespace: cannot open $file for reading: $!\n";
        return($xmlns);
    }
    binmode($fh);

    while (defined(my $line = <$fh>)) {
        # The surrounding double quotes are optional in the pattern; the
        # capture runs up to the next double quote (or the end of the line
        # when there is none).
        if ($line =~ /xmlns=\"?([^\"]+)\"?/) {
            $xmlns = $1;
        }
    }
    close $fh;

    return($xmlns);
}

sub get_documentRecords
{
    my $xmlalvisfile=shift;

    my $doc;
    my $Parser=XML::LibXML->new();


    my $doc_list = "";

    eval
    {
	$doc=$Parser->parse_string($xmlalvisfile);
    };
    if ($@)
    {
	warn "Parsing the doc failed: $@. Trying to get the IDs..\n";
	eval
	{
	    $xmlalvisfile=~s/<documentRecord\s(xmlns=[^\s]+)*\sid\s*=\s*\"([^\"]*?)\">/&unparseable_id($2)/esgo;
	};
    }
    else
    {
	if ($doc)
	{

	    my $root=$doc->documentElement();
	    for my $rec_node ($root->getChildrenByTagName('documentRecord'))
	    {
		my $id=$rec_node->getAttribute("id");
		if (defined($id))
		{
		    $doc_list .= $rec_node->toString();



( run in 0.750 second using v1.01-cache-2.11-cpan-f56aa216473 )