Alvis-NLPPlatform
view release on metacpan or search on metacpan
lib/Alvis/NLPPlatform/Convert.pm view on Meta::CPAN
} else {
$ODir = ".";
}
warn "Outdir is $ODir\n";
print STDERR "Initialisation of the Alvis converter ...";
my $C=Alvis::Convert->new(outputRootDir=>$ODir,
outputNPerSubdir=>1000,
outputAtSameLocation=>0,
metaEncoding=>$MetaEncoding,
sourceEncoding=>$HTMLEncoding,
includeOriginalDocument=>$IncOrigDoc,
sourceEncodingFromMeta=>$HTMLEncodingFromMeta);
$C->init_output();
my $i = 0;
while (-f "$ODir/0/$i.alvis") { $i++;};
warn "Starting at $i\n";
$C->{outputN} = $i;
print STDERR "done\n";
return($C);
}
# html2alvis($filename, $Alvis_converter, $config)
#
# Convert one HTML file into Alvis XML and pass the result on for output.
#
#   $filename        - path of the HTML file to convert
#   $Alvis_converter - converter object; this sub calls its read_HTML, HTML,
#                      errmsg and clearerr methods
#   $config          - handed through untouched to outputting_alvis
#
# Returns 1 when the HTML cannot be read, 2 when the converter cannot
# produce Alvis XML, and otherwise whatever outputting_alvis returns.
# Both failure paths emit a warning and clear the converter's error state.
sub html2alvis {
    my ($filename, $Alvis_converter, $config) = @_;

    print STDERR "Converting $filename to ALVIS XML format\n";

    # The metadata is computed first, before any attempt to read the HTML.
    my $metadata = &make_meta($filename);

    my $html = $Alvis_converter->read_HTML($filename);
    unless (defined $html) {
        warn "Reading the HTML for basename \"$filename\" failed. " .
            $Alvis_converter->errmsg();
        $Alvis_converter->clearerr();
        return 1;
    }

    my $xml = $Alvis_converter->HTML($html, $metadata);
    unless (defined $xml) {
        warn "Obtaining the Alvis version of the " .
            "HTML version of an article failed. " . $Alvis_converter->errmsg();
        $Alvis_converter->clearerr();
        return 2;
    }

    # Historical note: disabled experiments used to live at this point. They
    # tried (a) guessing the document encoding and converting Latin-1 input
    # to UTF-8, and (b) wrapping XML that lacked an "<?xml" declaration in a
    # <documentCollection> header and footer before further processing.
    # Open question left by the original author: how to remove that
    # header/footer concatenation.

    # The value returned by get_language_from_data replaces the converter's
    # XML (presumably it adds language information -- confirm in
    # Alvis::NLPPlatform::Document).
    $xml = Alvis::NLPPlatform::Document::get_language_from_data($xml);

    return &outputting_alvis($xml, $Alvis_converter, $config);
}
# outputting_alvis_from_file($alvisfile, $Alvis_converter, $config)
#
# Read an existing Alvis file in one go (raw bytes, no I/O layer decoding),
# pass its contents to Alvis::NLPPlatform::Document::get_documentRecords,
# and forward the result to outputting_alvis.
#
#   $alvisfile       - path of the Alvis file to read
#   $Alvis_converter - handed through untouched to outputting_alvis
#   $config          - handed through untouched to outputting_alvis
#
# Returns whatever outputting_alvis returns.
# Dies when $alvisfile cannot be opened for reading.
sub outputting_alvis_from_file
{
    my $alvisfile = shift;
    my $Alvis_converter = shift;
    my $config = shift;

    # Three-argument open with a lexical handle: the file name is never
    # interpreted as a mode or a pipe command, leading/trailing whitespace
    # in the name is preserved, and no global bareword handle is clobbered.
    # The OS error is appended so "permission denied" is distinguishable
    # from a missing file.
    open(my $alvis_fh, '<', $alvisfile)
        or die "No such file: $alvisfile ($!)\n";

    # Read raw bytes; decoding is deliberately not done at the I/O layer.
    binmode $alvis_fh;

    # Slurp mode is confined to this read so that the undefined record
    # separator does not leak into the calls made below.
    my $alvisfile_data = do { local $/ = undef; <$alvis_fh> };
    close $alvis_fh;

    my $docs = Alvis::NLPPlatform::Document::get_documentRecords($alvisfile_data);

    return &outputting_alvis($docs, $Alvis_converter, $config);
}
sub outputting_alvis
{
my $alvisXML = shift;
my $Alvis_converter = shift;
( run in 0.636 second using v1.01-cache-2.11-cpan-e1769b4cff6 )