Alvis-NLPPlatform

 view release on metacpan or  search on metacpan

lib/Alvis/NLPPlatform/Document.pm  view on Meta::CPAN

	{
	    $xmlalvisfile=~s/<documentRecord\s(xmlns=[^\s]+)*\sid\s*=\s*\"([^\"]*?)\">/&unparseable_id($2)/esgo;
	};
    }
    else
    {
	if ($doc)
	{

	    my $root=$doc->documentElement();
	    for my $rec_node ($root->getChildrenByTagName('documentRecord'))
	    {
		my $id=$rec_node->getAttribute("id");
		if (defined($id))
		{
		    $doc_list .= $rec_node->toString();
		}
		else
		{
		    my $rec_str=$rec_node->toString();
		    $rec_str=~s/\n/ /sgo;
		    warn "No id for record $rec_str\n";
		}
	    }
	}
	else
	{
	    my $doc_str=$xmlalvisfile;
	    $doc_str=~s/\n/ /sgo;
	    warn "Parsing the doc failed. Doc: $doc_str\n";
	}
    }

    return $doc_list;
}

# get_language_from_file($xmlalvisfile, $outfile, $config)
#
# Parses the XML file $xmlalvisfile with XML::LibXML, passes the parsed
# document to get_language(), and writes the value it returns (followed
# by a newline) to $outfile through a :utf8 output layer.
#
# Parameters:
#   $xmlalvisfile - path of the XML file to parse
#   $outfile      - path of the file to write the result to
#   $config       - accepted for interface compatibility; not used here
#
# Returns $outfile in every case. Failures (parse error, empty parse
# result, output file that cannot be opened or closed) are reported on
# STDERR with warn and never raised, so callers keep the original
# best-effort behaviour.
sub get_language_from_file
{
    my $xmlalvisfile=shift;
    my $outfile = shift;
    my $config = shift;    # unused; kept so the argument list stays documented

    print STDERR "Identifying the language from file ($xmlalvisfile)\n";

    my $doc;
    my $Parser=XML::LibXML->new();

    eval
    {
	$doc=$Parser->parse_file($xmlalvisfile);
    };
    # Capture $@ right away: it is a global that any later eval may reset.
    my $parse_error=$@;

    if ($parse_error)
    {
	warn "Parsing the doc failed.\n";
	print STDERR $parse_error;
	return $outfile;
    }

    if (!$doc)
    {
	warn "Parsing the doc failed.\n";
	return $outfile;
    }

    my $xmlalvisdata = get_language($doc);

    # Three-argument open with a lexical handle: the file name can no
    # longer be interpreted as an open mode, and the handle does not leak
    # into the package namespace.
    my $out_fh;
    if (!open($out_fh, '>:utf8', $outfile))
    {
	warn "Cannot open $outfile for writing: $!\n";
	return $outfile;
    }

    print {$out_fh} "$xmlalvisdata\n";

    # Buffered write errors only surface at close time, so check it.
    close($out_fh) or warn "Cannot close $outfile: $!\n";

    return $outfile;
}

# get_language_from_data($xmlalvis)
#
# Parses the XML document held in the string $xmlalvis with XML::LibXML
# and returns the value produced by get_language() for the parsed
# document.
#
# If parsing fails and the error text mentions "UTF-8", the data is
# assumed to be ISO-8859-1: it is converted to UTF-8 in place and parsed
# once more. The conversion is attempted at most once. An error text can
# presumably mention "UTF-8" for reasons unrelated to the byte encoding
# (for example when it quotes document text), and retrying without a
# bound would then never terminate and would re-encode the data on every
# pass.
#
# Returns:
#   - the result of get_language() when parsing succeeds;
#   - otherwise the input data (after the latin-1 -> UTF-8 conversion if
#     that conversion was attempted).
# Failures are reported on STDERR with warn and never raised.
sub get_language_from_data
{
    my $xmlalvis=shift;

    my $Parser=XML::LibXML->new();
    my $converted=0;    # set once the latin-1 fallback has been applied

    while (1)
    {
	print STDERR "Identifying the language from data\n";

	my $doc;
	eval
	{
	    $doc=$Parser->parse_string($xmlalvis);
	};
	# Capture $@ right away: it is a global that any later eval may reset.
	my $parse_error=$@;

	if (!$parse_error)
	{
	    if ($doc)
	    {
		$xmlalvis = get_language($doc);
	    }
	    else
	    {
		warn "Parsing the doc failed. \n";
	    }
	    last;
	}

	warn "Parsing the doc failed.\n";

	# Give up unless this is the first failure and it looks like an
	# encoding problem.
	last if ($converted || $parse_error !~ /UTF-8/);

	warn "Not a UTF-8, assume to be a latin-1 document\n";
	print STDERR "Converting in UTF8...\n";
	Encode::from_to($xmlalvis, "iso-8859-1", "UTF-8");
	print STDERR "done\n";
	$converted=1;
    }

    return $xmlalvis;
}


sub get_language
{
    my ($doc) = @_;



( run in 0.888 second using v1.01-cache-2.11-cpan-39bf76dae61 )