Alvis-NLPPlatform
view release on metacpan or search on metacpan
lib/Alvis/NLPPlatform/Convert.pm view on Meta::CPAN
my $AlvisConv = shift ;
my $config = shift;
my $mm = shift;
warn "Openning $outfile\n";
open OUTFILE, ">$outfile";
binmode(OUTFILE, ":utf8");
print OUTFILE $outdata;
close OUTFILE;
return &conversion_file_to_alvis_xml($outfile, $AlvisConv, $config, $mm);
}
# Run the external stylesheet converter registered for a namespace on a file
# and capture what the converter writes on its standard output.
#
# Parameters:
#   $file   - path of the document handed to the converter
#   $xmlns  - key looked up in $config->{CONVERTERS}->{STYLESHEET}; when no
#             entry exists for it, the "default" entry is used instead
#   $config - configuration hash reference
#
# Returns the list ($xmlns, $outdata): the key that was actually used and
# the captured output of the converter command. When no converter command
# is configured at all, a warning is issued and $outdata is the empty string.
sub applying_stylesheet
{
    my $file = shift;
    my $xmlns = shift;
    my $config = shift;

    # Alternative kept from the original author (pure-Perl XSLT engine):
    #   my $xslt_proc = XML::XSLT->new ($stylesheet, warnings => 1);
    #   $xslt_proc->transform ($file);
    #   print $xslt_proc->toString;
    #   $xslt_proc->dispose();
    # maybe change for XML::DOM::Lite::XSLT engine

    if (!exists $config->{"CONVERTERS"}->{"STYLESHEET"}->{$xmlns}) {
        $xmlns = "default";
    }
    my $converter = $config->{"CONVERTERS"}->{"STYLESHEET"}->{$xmlns};

    # Without a converter the command line would consist of the file name
    # alone, i.e. the shell would try to execute the document itself.
    if (!defined $converter || $converter !~ /\S/) {
        warn "No stylesheet converter configured for $xmlns\n";
        return ($xmlns, "");
    }

    # The converter entry is a shell command line (it may carry its own
    # arguments), so the shell is still used; the file name however is
    # single-quoted so that spaces and shell metacharacters in the path are
    # passed through literally. NOTE(review): this quoting is POSIX-shell
    # specific -- confirm if the platform must also run under other shells.
    (my $quoted_file = $file) =~ s/'/'\\''/g;
    my $command = "$converter '$quoted_file'";

    print STDERR "Applying the stylesheet : " . $converter . "\n";
    my $outdata = `$command`;

    return ($xmlns, $outdata);
}
# Determine the MIME type of a file.
#
# Parameters:
#   $file - name (path) of the file to inspect
#   $mm   - object providing a checktype_filename($file) method that returns
#           a type string (presumably a File::MMagic instance -- confirm
#           against the caller)
#
# The type reported by $mm is overridden from the file extension for
# PowerPoint (.ppt) and Excel (.xls) files. Any ";..." suffix is then removed
# from the type. Finally, files reported as "message/rfc822" or
# "x-system/x-unix" whose name ends in .tex are reported as "text/x-tex".
#
# Progress messages are written on STDERR. Returns the resulting type string.
sub get_type_file
{
    my $file = shift;
    my $mm = shift;

    print STDERR "Determining the type of the file " . $file . ": ";
    my $type = $mm->checktype_filename($file);

    # The dot is escaped so that only a real ".ppt" / ".xls" extension
    # matches (an unescaped dot also accepted names such as "report_ppt").
    if ($file =~ /\.ppt$/i) {
        $type = "application/powerpoint";
        warn "Getting the type thanks to the extension\n";
    }
    if ($file =~ /\.xls$/i) {
        $type = "application/vnd.ms-excel";
        warn "Getting the type thanks to the extension\n";
    }
    # if msword may be it should be relevant to check the extension, to better determine the type

    # Keep only the bare type, dropping anything from the first ";" on.
    $type =~ s/;.*//;

    # The x-system/x-unix test has to look at the detected type (not at the
    # file name), and the ";" has already been stripped at this point.
    if (($type eq "message/rfc822") || ($type =~ m{^x-system/x-unix})) {
        if ($file =~ /\.tex$/i) {
            $type = "text/x-tex";
            warn "Getting the type thanks to the extension\n";
        }
    }

    print STDERR "Type file: $type\n";
    return($type);
}
# Create and initialise the Alvis::Convert object used for HTML conversion.
#
# Parameters:
#   $config - configuration hash reference; $config->{ALVISTMP}, when
#             defined, is used as the output root directory, otherwise the
#             current directory "." is used
#
# After construction, init_output() is called on the converter and its
# outputN field is set to the first index $i for which no file
# "<output root>/0/$i.alvis" exists, so numbering resumes after the files
# already present there. Progress messages are written on STDERR.
#
# Returns the Alvis::Convert object.
sub html2alvis_init
{
    my $config = shift;

    my $NPerOutDir = 1000;
    # my $MetaEncoding='iso-8859-1';
    my $MetaEncoding = 'UTF-8';
    # Source encoding is left undefined; 'iso-8859-1' was the value
    # previously noted next to it in the code.
    my $HTMLEncoding = undef;
    my $HTMLEncodingFromMeta = 'utf-8';
    my $IncOrigDoc = 1;

    my $ODir = defined $config->{"ALVISTMP"} ? $config->{"ALVISTMP"} : ".";
    warn "Outdir is $ODir\n";

    print STDERR "Initialisation of the Alvis converter ...";
    my $C = Alvis::Convert->new(outputRootDir           => $ODir,
                                outputNPerSubdir        => $NPerOutDir,
                                outputAtSameLocation    => 0,
                                metaEncoding            => $MetaEncoding,
                                sourceEncoding          => $HTMLEncoding,
                                includeOriginalDocument => $IncOrigDoc,
                                sourceEncodingFromMeta  => $HTMLEncodingFromMeta);
    $C->init_output();

    # Skip over the documents already written in sub-directory 0.
    # NOTE(review): only sub-directory "0" is scanned -- confirm this is
    # sufficient once more than one sub-directory has been filled.
    my $i = 0;
    while (-f "$ODir/0/$i.alvis") {
        $i++;
    }
    warn "Starting at $i\n";
    $C->{outputN} = $i;

    print STDERR "done\n";
    return($C);
}
sub html2alvis
{
my $filename = shift;
my $Alvis_converter = shift;
my $config = shift;
print STDERR "Converting $filename to ALVIS XML format\n";
my $meta_txt = &make_meta($filename);
( run in 0.596 second using v1.01-cache-2.11-cpan-39bf76dae61 )