Alvis-NLPPlatform

 view release on metacpan or  search on metacpan

lib/Alvis/NLPPlatform/Convert.pm  view on Meta::CPAN

    } else {
	$ODir = ".";
    }


    warn "Outdir is $ODir\n";

    print STDERR "Initialisation of the Alvis converter ...";

    my $C=Alvis::Convert->new(outputRootDir=>$ODir,
			  outputNPerSubdir=>1000,
			  outputAtSameLocation=>0,
			  metaEncoding=>$MetaEncoding,
			  sourceEncoding=>$HTMLEncoding,
			  includeOriginalDocument=>$IncOrigDoc,
                          sourceEncodingFromMeta=>$HTMLEncodingFromMeta);

    $C->init_output();
    my $i = 0;
    while (-f "$ODir/0/$i.alvis") { $i++;};
    warn "Starting  at $i\n";
    $C->{outputN} = $i;
    print STDERR "done\n";
    return($C);
}

# html2alvis($filename, $Alvis_converter, $config)
#
# Converts a single HTML file into Alvis XML and hands the result over to
# outputting_alvis().
#
# Arguments:
#   $filename        - path of the HTML file to convert
#   $Alvis_converter - converter object; read_HTML(), HTML(), errmsg() and
#                      clearerr() are invoked on it
#   $config          - passed through unchanged to outputting_alvis()
#
# Returns:
#   1 if read_HTML() returned undef,
#   2 if HTML() returned undef,
#   otherwise whatever outputting_alvis() returns.
sub html2alvis
{
    my ($filename, $Alvis_converter, $config) = @_;

    print STDERR "Converting $filename to ALVIS XML format\n";

    # The meta information is built before the HTML itself is read.
    my $meta_txt    = &make_meta($filename);
    my $html_source = $Alvis_converter->read_HTML($filename);

    unless (defined $html_source) {
        # Report the converter's own error text, then reset its error state.
        warn qq{Reading the HTML for basename "$filename" failed. }
            . $Alvis_converter->errmsg();
        $Alvis_converter->clearerr();
        return 1;
    }

    my $alvis_xml = $Alvis_converter->HTML($html_source, $meta_txt);

    unless (defined $alvis_xml) {
        warn "Obtaining the Alvis version of the HTML version of an article failed. "
            . $Alvis_converter->errmsg();
        $Alvis_converter->clearerr();
        return 2;
    }

    # NOTE: disabled experiments used to sit here as commented-out code:
    # guessing the document type/encoding, converting Latin-1 input to UTF-8,
    # and wrapping the record in an XML declaration plus a
    # <documentCollection> head and foot.  None of it is active; the
    # converter output is used as is.
    # TODO: see how to remove the concatenation of foot and head.

    # The helper's return value replaces the XML that is passed on.
    $alvis_xml = Alvis::NLPPlatform::Document::get_language_from_data($alvis_xml);

    return &outputting_alvis($alvis_xml, $Alvis_converter, $config);
}

# outputting_alvis_from_file($alvisfile, $Alvis_converter, $config)
#
# Reads the whole of $alvisfile in raw (binmode) mode, extracts its document
# records with Alvis::NLPPlatform::Document::get_documentRecords() and passes
# them to outputting_alvis().
#
# Arguments:
#   $alvisfile       - path of the Alvis file to read
#   $Alvis_converter - passed through unchanged to outputting_alvis()
#   $config          - passed through unchanged to outputting_alvis()
#
# Returns whatever outputting_alvis() returns.
# Dies if the file cannot be opened; the message keeps the historical
# "No such file: <name>" prefix and now also carries the system error.
sub outputting_alvis_from_file
{
    my $alvisfile = shift;
    my $Alvis_converter = shift;
    my $config = shift;

    # Three-argument open with an explicit read mode: the file name is never
    # interpreted as a mode or a pipe, and the lexical handle cannot clash
    # with a global ALVISFILE handle used elsewhere.
    open(my $alvis_fh, '<', $alvisfile)
        or die "No such file: $alvisfile ($!)\n";

    # Raw bytes, no encoding layer (the ":utf8" layer was deliberately
    # left disabled in the original code).
    binmode $alvis_fh;

    # Slurp mode is confined to this do-block so that $/ is restored before
    # the helpers below run; previously they were called with $/ undefined.
    my $alvisfile_data = do { local $/; <$alvis_fh> };
    close $alvis_fh;

    my $docs = Alvis::NLPPlatform::Document::get_documentRecords($alvisfile_data);

    return &outputting_alvis($docs, $Alvis_converter, $config);
}

sub outputting_alvis
{
    my $alvisXML = shift;
    my $Alvis_converter = shift;



( run in 0.636 second using v1.01-cache-2.11-cpan-e1769b4cff6 )