Alvis-NLPPlatform
view release on metacpan or search on metacpan
lib/Alvis/NLPPlatform/Document.pm view on Meta::CPAN
package Alvis::NLPPlatform::Document;
use strict;
use warnings;
use Lingua::Identify;
use Data::Dumper;
our $VERSION=$Alvis::NLPPlatform::VERSION;
# use YAML qw( Dump );
# getnamespace($file)
#
# Scan the file $file line by line and return the value of the XML
# namespace declaration (xmlns=...) found in it.
#
# Parameters:
#   $file - path of the file to scan.
#
# Returns:
#   The text captured after "xmlns=" (an optional opening double quote is
#   skipped; the capture runs up to the next double quote, or to the end
#   of the line if there is none).  When several lines match, the value
#   taken from the last matching line is returned.  Returns undef when no
#   line matches, when $file is undefined, or when the file cannot be
#   opened (a warning is emitted in the last two cases).
sub getnamespace
{
    my $file = shift;
    my $xmlns = undef;

    # An undefined name can never designate a readable file; bail out
    # before handing it to open().
    if (!defined($file)) {
        warn "getnamespace: no file name given\n";
        return($xmlns);
    }

    # Three-argument open with a lexical handle: the file name is always
    # treated as a plain path (no mode or pipe characters are interpreted)
    # and the handle cannot clash with a global FILE handle used elsewhere.
    my $fh;
    if (!open($fh, '<', $file)) {
        warn "getnamespace: cannot open '$file' for reading: $!\n";
        return($xmlns);
    }
    binmode($fh);

    # Every line is examined; a later match overrides an earlier one.
    while (defined(my $line = <$fh>)) {
        if ($line =~ /xmlns=\"?([^\"]+)\"?/) {
            $xmlns = $1;
        }
    }
    close($fh);

    return($xmlns);
}
sub get_documentRecords
{
my $xmlalvisfile=shift;
my $doc;
my $Parser=XML::LibXML->new();
my $doc_list = "";
eval
{
$doc=$Parser->parse_string($xmlalvisfile);
};
if ($@)
{
warn "Parsing the doc failed: $@. Trying to get the IDs..\n";
eval
{
$xmlalvisfile=~s/<documentRecord\s(xmlns=[^\s]+)*\sid\s*=\s*\"([^\"]*?)\">/&unparseable_id($2)/esgo;
};
}
else
{
if ($doc)
{
my $root=$doc->documentElement();
for my $rec_node ($root->getChildrenByTagName('documentRecord'))
{
my $id=$rec_node->getAttribute("id");
if (defined($id))
{
$doc_list .= $rec_node->toString();
( run in 0.750 second using v1.01-cache-2.11-cpan-f56aa216473 )