Alvis-NLPPlatform
view release on metacpan or search on metacpan
lib/Alvis/NLPPlatform/Convert.pm view on Meta::CPAN
print STDERR "done\n";
return &conversion_file_to_alvis_xml($outfile, $AlvisConv, $config, $mm);
} else {
print STDERR "No convertion to HMTL to do: already in HTML, unknown type or no specified converter\n";
return 0;
}
} else {
if ($type =~ /html/i) {
# calling Alvis convert
return &html2alvis($infile, $AlvisConv, $config);
} elsif ($type =~ /xml/i) {
# checking if it is a xhtml or not
my $xmlns = Alvis::NLPPlatform::Document::getnamespace($infile);
if (defined $xmlns) {
if ($xmlns eq "http://www.w3.org/1999/xhtml") {
# if xhtml calling Alvis convert
return &html2alvis($infile, $AlvisConv, $config);
} else {
# if xml,
# check if it is an alvis file
# if not, try to use a stylesheet
if ($xmlns eq "http://alvis.info/enriched/") {
print STDERR "Already in XML Alvis\n";
$outfile_path = Alvis::NLPPlatform::Document::get_language_from_file($infile, "$outfile.lang", $config);
# print STDERR "Outfile is $outfile (i.e. $outfile_path)\n";
return &outputting_alvis_from_file($outfile_path, $AlvisConv, $config);
} else {
# use of a stylesheet conversion
($xmlns, $outdata) = &applying_stylesheet($infile, $xmlns, $config);
if ($xmlns ne "default") {
# CAREFUL not tested yet !
$outdata = Alvis::NLPPlatform::Document::get_language_from_data($outdata);
return &outputting_alvis($outdata, $AlvisConv, $config);
} else {
return &outputting_empty_xmlns_file($outdata, $outfile . ".out", $AlvisConv, $config, $mm);
}
}
}
} else {
# use of a default stylesheet conversion
($xmlns,$outdata) = &applying_stylesheet($infile, "default", $config);
return &outputting_empty_xmlns_file($outdata, $outfile . ".out", $AlvisConv, $config, $mm);
}
}
}
}
# Writes converted data to a file and feeds that file back into the
# conversion chain.
#
# Parameters (positional):
#   $outdata   - data to write; at the call sites in this file it is the
#                output of a stylesheet run
#   $outfile   - path of the file to create (or overwrite)
#   $AlvisConv - converter object, passed through untouched
#   $config    - configuration hash reference, passed through untouched
#   $mm        - object used for file type detection, passed through untouched
#
# Returns whatever conversion_file_to_alvis_xml() returns for $outfile.
#
# Dies when $outfile cannot be opened or written: carrying on would run the
# conversion on a missing or stale file.  The other open() calls of this
# module die on failure as well.
sub outputting_empty_xmlns_file
{
    my ($outdata, $outfile, $AlvisConv, $config, $mm) = @_;

    warn "Openning $outfile\n";

    # Three-argument open with a lexical handle: the file name can no longer
    # be mistaken for a mode, and the handle cannot clash with a global one.
    # The ":utf8" layer replaces the former separate binmode() call.
    open(my $out_fh, '>:utf8', $outfile)
        or die "Cannot open $outfile for writing: $!\n";
    print {$out_fh} $outdata;

    # Buffered write errors (disk full, ...) only show up at close time.
    close($out_fh)
        or die "Cannot write $outfile: $!\n";

    return &conversion_file_to_alvis_xml($outfile, $AlvisConv, $config, $mm);
}
# Runs the external stylesheet command configured for an XML namespace on a
# file and captures what the command prints on its standard output.
#
# Parameters (positional):
#   $file   - path of the XML file to transform
#   $xmlns  - namespace used to select the command in
#             $config->{CONVERTERS}->{STYLESHEET}; when no entry exists for
#             it, the "default" entry is used instead
#   $config - configuration hash reference
#
# Returns the two-element list ($xmlns, $outdata): the namespace key that
# was really used ("default" after a fallback) and the captured output.
# $outdata is the empty string when no command is configured at all.
#
# The transformation is delegated to an external command; earlier
# commented-out code experimented with an in-process engine (XML::XSLT,
# with XML::DOM::Lite::XSLT noted as an alternative).
sub applying_stylesheet
{
    my ($file, $xmlns, $config) = @_;

    my $stylesheets = $config->{"CONVERTERS"}->{"STYLESHEET"} || {};
    if (!exists $stylesheets->{$xmlns}) {
        $xmlns = "default";
    }

    # Without a configured command the shell would be handed " $file" and
    # would try to *execute* the input document, so stop here instead.
    my $stylesheet_cmd = $stylesheets->{$xmlns};
    if (!defined($stylesheet_cmd) || $stylesheet_cmd !~ /\S/) {
        warn "No stylesheet command configured for '$xmlns': $file is not transformed\n";
        return ($xmlns, "");
    }

    # The command line goes through the shell (the configured command may
    # carry its own arguments), so the file name must be quoted: otherwise a
    # name with spaces is split and a name with shell metacharacters is
    # interpreted.
    my $quoted_file = $file;
    if ($^O eq 'MSWin32') {
        $quoted_file = '"' . $quoted_file . '"';
    }
    else {
        $quoted_file =~ s/'/'\\''/g;
        $quoted_file = "'" . $quoted_file . "'";
    }

    my $command = $stylesheet_cmd . " " . $quoted_file;
    print STDERR "Applying the stylesheet : " . $stylesheet_cmd . "\n";

    my $outdata = `$command`;
    if ($? != 0) {
        warn "Stylesheet command failed (wait status $?): $command\n";
    }

    return ($xmlns, $outdata);
}
sub get_type_file
{
my $file = shift;
my $mm = shift;
print STDERR "Determining the type of the file " . $file . ": ";
my $type = $mm->checktype_filename($file);
if ($file =~ /.ppt$/i) {
$type = "application/powerpoint";
warn "Getting the type thanks to the extension\n";
}
if ($file =~ /.xls$/i) {
$type = "application/vnd.ms-excel";
warn "Getting the type thanks to the extension\n";
}
# if msword may be it should be relevant to check the extension, to better determine the type
$type =~ s/;.*//;
if (($type eq "message/rfc822") || ($file =~ /^x-system\/x-unix;/)) {
if ($file =~ /.tex$/i) {
$type = "text/x-tex";
warn "Getting the type thanks to the extension\n";
}
}
print STDERR "Type file: $type\n";
lib/Alvis/NLPPlatform/Convert.pm view on Meta::CPAN
# my $e=Alvis::Document::Encoding->new();
# my $type_guesser=Alvis::Document::Type->new();
# my ($doc_type,$doc_sub_type)=$type_guesser->guess($alvisXML);
# my $doc_encoding=$e->guess_and_convert($alvisXML,$doc_type,$doc_sub_type, "UTF-8");
# if (!defined($doc_encoding))
# {
# die('Cannot guess. ' . $e->errmsg());
# }
# print STDERR "$doc_type,$doc_sub_type,$doc_encoding\n";
# print STDERR $e->guess($alvisXML);
# warn "Checking the encoding\n";
# if (!Encode::is_utf8($alvisXML)) {
# warn "Not a UTF-8, assume to be a latin-1 document\n";
# print STDERR "Converting in UTF8...\n";
# Encode::from_to($alvisXML, "iso-8859-1", "UTF-8");
# print STDERR "done\n";
# }
# print STDERR $alvisXML;
# exit;
# my $decoder = Encode::Guess->guess_encoding($alvisXML, /UTF-8/);
# if (!ref($decoder)) {
# warn "Not a UTF-8, assume to be a latin-1 document\n";
# print STDERR "Converting in UTF8...\n";
# $alvisXML = $decoder->decode($alvisXML);
# # Encode::from_to($alvisXML, "iso-8859-1", "UTF-8");
# print STDERR "done\n";
# } else {
# warn "Document is already in UTF-8 :-)\n";
# }
# TO SEE HOW TO REMOVE CONCAT of foot and head
# if ($alvisXML !~ /^\s*<\?xml /) {
# my $xmlhead = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<documentCollection xmlns=\"http://alvis.info/enriched/\" version=\"1.1\">\n";
# my $xmlfoot = "</documentCollection>\n";
# $alvisXML = Alvis::NLPPlatform::Document::get_language_from_data($xmlhead . $alvisXML . $xmlfoot);
# } else {
# print STDERR $alvisXML;
$alvisXML = Alvis::NLPPlatform::Document::get_language_from_data($alvisXML);
# print STDERR $alvisXML;
# }
# print STDERR $alvisXML;
# return &outputting_alvis($xmlhead . $alvisXML . $xmlfoot, $Alvis_converter, $config);
return &outputting_alvis($alvisXML, $Alvis_converter, $config);
# print STDERR "\t done\n";
# return 0;
}
# Reads a file, extracts its document records and hands them to
# outputting_alvis().
#
# Parameters (positional):
#   $alvisfile       - path of the file to read
#   $Alvis_converter - converter object, passed through to outputting_alvis()
#   $config          - configuration hash reference, passed through as well
#
# Returns whatever outputting_alvis() returns.
# Dies when $alvisfile cannot be opened.
sub outputting_alvis_from_file
{
    my ($alvisfile, $Alvis_converter, $config) = @_;

    # Three-argument open with a lexical handle: a file name starting with
    # ">" or ending with "|" is opened as a plain file instead of being
    # interpreted as a mode or a command.
    open(my $alvis_fh, '<', $alvisfile)
        or die "No such file: $alvisfile ($!)\n";

    # The content is read as raw bytes; an earlier ":utf8" layer was
    # deliberately commented out in favour of a plain binmode.
    binmode($alvis_fh);

    # Slurp mode.
    # NOTE(review): as in the previous version, the localized $/ stays in
    # effect for the two calls below; narrowing it to the read alone looks
    # desirable but should be checked against the callees first.
    local $/ = undef;
    my $alvisfile_data = <$alvis_fh>;
    close($alvis_fh);

    my $docs = Alvis::NLPPlatform::Document::get_documentRecords($alvisfile_data);

    return &outputting_alvis($docs, $Alvis_converter, $config);
}
# Delivers Alvis XML either to the caller or to the converter's storage.
#
# Parameters (positional):
#   $alvisXML        - XML text: a whole document, or bare <documentRecord>
#                      elements
#   $Alvis_converter - object providing output_Alvis(), errmsg() and
#                      clearerr(); only used when files are stored
#   $config          - configuration hash reference; the key
#                      CONVERTERS/StoreInputFiles selects the mode
#
# When StoreInputFiles is defined and numerically 0, nothing is stored and
# the XML is returned, completed with an XML declaration and, for bare
# records, a surrounding <documentCollection> element.
# Otherwise the XML is passed to output_Alvis(); 3 is returned when that
# call reports a failure.  On success the sub ends without an explicit
# return statement, so callers should not rely on the value in that case.
sub outputting_alvis
{
    my ($alvisXML, $Alvis_converter, $config) = @_;

    my $store_setting = $config->{"CONVERTERS"}->{"StoreInputFiles"};

    if (defined($store_setting) && $store_setting == 0) {
        print STDERR "Don't store files\n";

        my $xml_declaration = qq{<?xml version="1.0" encoding="UTF-8"?>\n};

        # Bare records get the full envelope: declaration plus collection.
        if ($alvisXML =~ /^\s*<documentRecord/) {
            $alvisXML = join("",
                $xml_declaration,
                qq{<documentCollection xmlns="http://alvis.info/enriched/" version="1.1">\n},
                $alvisXML,
                qq{</documentCollection>\n});
        }

        # Anything still lacking a declaration only gets that one line.
        unless ($alvisXML =~ /^\s*<\?xml /) {
            $alvisXML = $xml_declaration . $alvisXML;
        }

        return $alvisXML;
    }

    print STDERR "Store files\n";

    # This test is intentionally the last statement of the sub.
    if (!$Alvis_converter->output_Alvis([$alvisXML])) {
        warn "Outputting the Alvis records failed. " . $Alvis_converter->errmsg();
        $Alvis_converter->clearerr();
        return 3;
    }
}
# Builds the tab-separated metadata text describing a file.
#
# Parameter:
#   $filename - path of the file; every trailing ".out" suffix is removed
#               before the path is used in the "url" line
#
# Returns three newline-terminated lines: a fixed "title" entry, a "date"
# entry holding the current local time as "YYYY-MM-DD hh:mm:ss", and a
# "url" entry made of "file://" followed by the stripped file name.
sub make_meta
{
    my ($filename) = @_;

    # Suffixes may have piled up ("x.out.out"); peel them off one by one.
    1 while $filename =~ s/\.out$//;

    # localtime() in list context: (sec, min, hour, mday, mon, year, ...).
    my @now = localtime();
    my $timestamp = sprintf("%04d-%02d-%02d %02d:%02d:%02d",
                            $now[5] + 1900, $now[4] + 1, @now[3, 2, 1, 0]);

    return join("",
                "title\ttitle\n",
                "date\t" . $timestamp . "\n",
                "url\tfile://" . $filename . "\n");
}
# Moves the converted documents found in "$outputRootDir/0" into the Alvis
# spool by sending each of them through a local Alvis pipeline.
#
# Parameters (positional):
#   $config        - configuration hash reference; the keys SPOOLDIR and
#                    HARVESTER_PORT of its "alvis_connection" section are used
#   $outputRootDir - directory whose "0" sub-directory holds the files to send
#
# Every file of the temporary directory is read as UTF-8, written to the
# pipeline and then deleted; the temporary directory itself is removed
# afterwards.
#
# Returns 0.
# Dies when a pipeline end cannot be created, when a file of the temporary
# directory cannot be opened, or when closing the write pipe fails.  An
# unreadable temporary directory only triggers a warning, so the pipes are
# still closed cleanly.
sub making_spool
{
    my ($config, $outputRootDir) = @_;

    my $spool_dir      = $config->{"alvis_connection"}->{"SPOOLDIR"};
    my $harvester_port = $config->{"alvis_connection"}->{"HARVESTER_PORT"};
    my $working_dir    = getcwd();

    print STDERR "Making the spool ...\n";
    print STDERR "Working dir : $working_dir\n";

    # Assuming mkpath() is File::Path's: it returns the directories it
    # created, i.e. nothing when the directory already exists.  Test for the
    # directory itself so the warning only appears when it is really missing.
    mkpath($spool_dir);
    if (!-d $spool_dir) {
        warn "cannot create directory " . $spool_dir . "\n";
    }
    touch($spool_dir . "/seq");

    $| = 1;

    my $pipe = Alvis::Pipeline::Read->new(port     => $harvester_port,
                                          spooldir => $spool_dir,
                                          loglevel => 10)
        or die "can't create read-pipe on port " . $harvester_port . ": $!";

    my $pipe_out = Alvis::Pipeline::Write->new(host     => "localhost",
                                               port     => $harvester_port,
                                               loglevel => 10)
        or die "can't create ALVIS write-pipe for port '" . $harvester_port . "': $!";

    my $tmp_spool_dir = $outputRootDir . "/0";

    if (opendir(my $spool_dh, $tmp_spool_dir)) {
        # Explicit defined(): an entry named "0" must not end the loop.
        while (defined(my $xmlfile = readdir($spool_dh))) {
            next if $xmlfile eq "." || $xmlfile eq "..";

            my $xml_path = "$tmp_spool_dir/$xmlfile";

            # Three-argument open: a directory entry ending with "|" or
            # holding other mode characters is treated as a plain file name.
            open(my $xml_fh, '<:utf8', $xml_path)
                or die "Cannot open such file ($xmlfile): $!\n";
            my $xml_rec_doc = do { local $/; <$xml_fh> };
            $xml_rec_doc = "" unless defined $xml_rec_doc;
            close($xml_fh);

            $pipe_out->write($xml_rec_doc);
            unlink $xml_path;
        }
        closedir($spool_dh);
    }
    else {
        warn "Cannot read directory $tmp_spool_dir: $!\n";
    }

    rmdir($tmp_spool_dir);

    # NOTE(review): presumably gives the reading end time to receive the
    # last record before both ends are closed -- confirm.
    sleep 1;

    $pipe_out->close() or die "Truly unbelievable (1)";
    $pipe->close(); # or die "Truly unbelievable (2)";

    print STDERR "Making the spool ... done\n";

    # Go back to the directory recorded on entry (presumably the pipeline
    # objects change the current directory -- confirm).
    chdir($working_dir);

    return 0;
}
1;
__END__
=head1 NAME
Alvis::NLPPlatform::Convert - Perl extension for converting files in
any format into the ALVIS XML.
=head1 SYNOPSIS
use Alvis::NLPPlatform::Convert;
my %config = &Alvis::NLPPlatform::load_config($rcfile);
my $mm = Alvis::NLPPlatform::Convert::load_MagicNumber(\%config);
my $AlvisConverter = Alvis::NLPPlatform::Convert::html2alvis_init();
Alvis::NLPPlatform::Convert::conversion_file_to_alvis_xml($ARGV[0], $AlvisConverter, \%config, $mm);
=head1 DESCRIPTION
This module provides methods to convert input files into the ALVIS XML
format. It determines the type of each input file according to its
magic number and applies the matching converter. Output files are
stored in a temporary spool.
=head1 METHODS
=head2 load_MagicNumber
( run in 0.764 second using v1.01-cache-2.11-cpan-39bf76dae61 )