Alvis-Convert
view release on metacpan or search on metacpan
bin/wikipedia2alvis view on Meta::CPAN
# Next command-line argument: the XML dump file handed to $C->wikipedia().
my $XMLDumpF=shift @ARGV;

# Autoflush the selected output handle; the progress callback below prints
# "\r"-terminated status lines that should appear immediately.
$|=1;

# The converter object used both for the dump conversion and, inside the
# article callback, for HTML conversion and for writing the records.
my $C=Alvis::Convert->new(
    outputRootDir           => $ODir,
    outputNPerSubdir        => $NPerOutDir,
    outputAtSameLocation    => 0,
    includeOriginalDocument => $IncOrigDoc,
);
die("Instantiating Alvis::Convert failed.\n") unless defined($C);

my %Seen;   # not referenced in the visible part of the script
my $N=0;    # number of articles output so far; picks the output subdirectory

$C->init_output();

# Run the conversion. Arguments: the dump file, the per-article callbacks,
# the conversion options and the progress callbacks.
$C->wikipedia(
    $XMLDumpF,
    [ \&_output_wikipedia_article ],
    {
        expandTemplates => $ExpandTemplates,
        templateDumpF   => $TemplateDumpF,
        outputFormat    => $OutputFormat,
        categoryWord    => $CategoryWord,
        date            => $Date,
        namespaces      => [@Namespaces],
        dumpCatGraph    => $DumpCatGraph,
        catGraphDumpF   => $CatGraphDumpF,
    },
    [ \&_wikipedia_progress ],
) or die("Conversion failed. " . $C->errmsg());

# Move past the last "\r"-terminated progress line.
print "\n";
# Article callback passed to $C->wikipedia(): turns one article into an
# Alvis record and writes it under the output root directory.
#
# Arguments (positional):
#   $title          article title; goes into the metadata and, with "/"
#                   replaced by "_", becomes the output file base name
#   $date           date string placed in the metadata of HTML records
#   $output_format  $Alvis::Wikipedia::XMLDump::OUTPUT_HTML or
#                   $Alvis::Wikipedia::XMLDump::OUTPUT_ALVIS
#   $record_txt     the article text in the given format
#   $is_redir       accepted but not used here
#   $namespace      namespace name; when non-empty it prefixes the URL path
#
# Returns 1 in every non-fatal case, including when the HTML conversion or
# the output step fails (those are reported with warn and skipped).
# Dies when $output_format is neither of the two recognized formats.
sub _output_wikipedia_article
{
    my ($title,$date,$output_format,$record_txt,$is_redir,$namespace)=@_;

    # Per-article trace on STDERR; emitted unconditionally.
    warn "TITLE:$title";

    my $alvis_XML;
    if ($output_format eq $Alvis::Wikipedia::XMLDump::OUTPUT_HTML)
    {
        # An undefined namespace is treated like an empty one.
        my $ns_txt=(defined($namespace) && $namespace ne '')
            ? "$namespace/"
            : "";

        # Tab-separated "name<TAB>value" metadata lines for the HTML converter.
        my $meta_txt="title\t$title\n"
            . "date\t$date\n"
            . "url\twikipedia/$ns_txt$title\n";

        $alvis_XML=$C->HTML($record_txt,$meta_txt,{sourceEncoding=>'utf8'});
        if (!defined($alvis_XML))
        {
            warn "Obtaining the Alvis version of the " .
                "HTML version of an article failed. " . $C->errmsg() if
                $Warnings;
            $C->clearerr();
            # A failed article is skipped; the conversion continues.
            return 1;
        }
    }
    elsif ($output_format eq $Alvis::Wikipedia::XMLDump::OUTPUT_ALVIS)
    {
        # Already an Alvis record; output it unchanged.
        $alvis_XML=$record_txt;
    }
    else
    {
        die("Internal inconsistency: output format of a Wikipedia article " .
            "is an unrecognized one: \"$output_format\".");
    }

    # "/" in a title would otherwise be taken as a directory separator.
    $title=~tr{/}{_};

    # Articles are spread over numbered subdirectories (1, 2, ...), with
    # $NPerOutDir articles in each.
    my $dir=$ODir . "/" . (int($N/$NPerOutDir)+1);
    if (!-d $dir)
    {
        # Create the directory (and any missing parents) without going
        # through a shell: portable, and safe for paths with spaces or shell
        # metacharacters. mkpath() croaks on failure; that is reported and
        # the run continues, so a failure surfaces again in the output step.
        require File::Path;
        eval { File::Path::mkpath($dir); 1 }
            or warn "Creating the output directory \"$dir\" failed. $@";
    }

    my $base_name=$dir . "/" . $title;
    if (!$C->output_Alvis([$alvis_XML],$base_name))
    {
        warn "Outputting the Alvis records for base name " .
            "\"$base_name\" failed. " . $C->errmsg();
        $C->clearerr();
    }

    # Counted even when the output failed, as the subdirectory index
    # depends on the number of articles processed.
    $N++;

    return 1;
}
# Progress callback passed to $C->wikipedia(): prints a one-line status
# terminated by "\r" (no newline), so each call overwrites the previous one
# on a terminal.
#
# Arguments (positional):
#   $prog_txt           leading text of the status line
#   $total_nof_records  record count, may be undef
#   $nof_hits           hit count, may be undef
#   $mess               free-form message, used only when either count is
#                       undefined; undef is shown as an empty string
#
# Returns the result of the final print.
sub _wikipedia_progress
{
    my ($prog_txt,$total_nof_records,$nof_hits,$mess)=@_;

    my $have_counts=defined($total_nof_records) && defined($nof_hits);

    # With both counts: show them. Otherwise: show the message, left-aligned
    # and padded to 70 columns.
    my $status=$have_counts
        ? sprintf("%s Total:%d Found:%d",$prog_txt,
                  $total_nof_records,$nof_hits)
        : sprintf("%s %-70s",$prog_txt,defined($mess) ? $mess : "");

    print $status . "\r";
}
( run in 0.780 second using v1.01-cache-2.11-cpan-140bd7fdf52 )