Alvis-NLPPlatform
view release on metacpan or search on metacpan
lib/Alvis/NLPPlatform/Document.pm view on Meta::CPAN
{
$xmlalvisfile=~s/<documentRecord\s(xmlns=[^\s]+)*\sid\s*=\s*\"([^\"]*?)\">/&unparseable_id($2)/esgo;
};
}
else
{
if ($doc)
{
my $root=$doc->documentElement();
for my $rec_node ($root->getChildrenByTagName('documentRecord'))
{
my $id=$rec_node->getAttribute("id");
if (defined($id))
{
$doc_list .= $rec_node->toString();
}
else
{
my $rec_str=$rec_node->toString();
$rec_str=~s/\n/ /sgo;
warn "No id for record $rec_str\n";
}
}
}
else
{
my $doc_str=$xmlalvisfile;
$doc_str=~s/\n/ /sgo;
warn "Parsing the doc failed. Doc: $doc_str\n";
}
}
return $doc_list;
}
# get_language_from_file($xmlalvisfile, $outfile, $config)
#
# Parses the XML file named by $xmlalvisfile with XML::LibXML, hands the
# resulting document to get_language(), and writes the string it returns
# (followed by a newline) to $outfile through a :utf8 output layer.
#
# Parameters:
#   $xmlalvisfile - path of the XML file to parse
#   $outfile      - path of the file that receives the result (truncated)
#   $config       - accepted for interface compatibility; not used here
#
# Returns $outfile in every case. Parse and write failures are reported on
# STDERR only, so the return value does not indicate success.
sub get_language_from_file
{
    my ($xmlalvisfile, $outfile, $config) = @_;

    print STDERR "Identifying the language from file ($xmlalvisfile)\n";

    my $Parser = XML::LibXML->new();
    my $doc;

    # The trailing "1" makes the eval's own value the success flag, and $@
    # is copied immediately so nothing executed later can clobber it.
    my $parsed_ok   = eval { $doc = $Parser->parse_file($xmlalvisfile); 1; };
    my $parse_error = $@;

    if (!$parsed_ok)
    {
        warn "Parsing the doc failed.\n";
        print STDERR $parse_error;
        return $outfile;
    }

    if (!$doc)
    {
        warn "Parsing the doc failed.\n";
        return $outfile;
    }

    my $xmlalvisdata = get_language($doc);

    # Three-argument open with a lexical handle: the file name can no longer
    # be interpreted as part of the open mode, and the handle is scoped to
    # this subroutine instead of being a package-wide bareword.
    my $output_fh;
    if (!open($output_fh, '>', $outfile))
    {
        warn "Cannot open $outfile for writing: $!\n";
        return $outfile;
    }
    binmode($output_fh, ":utf8");
    print {$output_fh} "$xmlalvisdata\n";

    # Buffered write errors only surface at close time, so check it.
    if (!close($output_fh))
    {
        warn "Cannot close $outfile: $!\n";
    }

    return $outfile;
}
# get_language_from_data($xmlalvis)
#
# Parses the XML text in $xmlalvis with XML::LibXML and returns whatever
# get_language() produces for the parsed document.
#
# If parsing dies with an error whose text mentions "UTF-8", the data is
# assumed to be latin-1, converted in place to UTF-8 with Encode::from_to,
# and parsed once more. The conversion is attempted at most once: an error
# that still mentions "UTF-8" afterwards cannot be cured by converting
# again, and retrying without a bound would never terminate.
#
# Parameters:
#   $xmlalvis - XML document as a string
#
# Returns the result of get_language() on success. When parsing fails, the
# input data is returned instead (in its converted form if the latin-1
# conversion took place); failures are reported on STDERR only.
sub get_language_from_data
{
    my $xmlalvis = shift;

    for my $attempt (1, 2)
    {
        print STDERR "Identifying the language from data\n";

        my $Parser = XML::LibXML->new();
        my $doc;

        # The trailing "1" makes the eval's own value the success flag, and
        # $@ is copied immediately so nothing executed later can clobber it.
        my $parsed_ok   = eval { $doc = $Parser->parse_string($xmlalvis); 1; };
        my $parse_error = $@;

        if ($parsed_ok)
        {
            if ($doc)
            {
                return get_language($doc);
            }
            warn "Parsing the doc failed. \n";
            return $xmlalvis;
        }

        warn "Parsing the doc failed.\n";

        # Only encoding-related failures are worth a retry.
        if ($parse_error !~ /UTF-8/)
        {
            return $xmlalvis;
        }

        # Second failure of the same kind: the conversion did not help.
        if ($attempt > 1)
        {
            warn "Still failing after the latin-1 conversion, giving up: $parse_error\n";
            return $xmlalvis;
        }

        warn "Not a UTF-8, assume to be a latin-1 document\n";
        print STDERR "Converting in UTF8...\n";
        Encode::from_to($xmlalvis, "iso-8859-1", "UTF-8");
        print STDERR "done\n";
    }

    return $xmlalvis;
}
sub get_language
{
my ($doc) = @_;
( run in 0.888 second using v1.01-cache-2.11-cpan-39bf76dae61 )