Alvis-Convert
view release on metacpan or search on metacpan
bin/wikipedia2alvis view on Meta::CPAN
#
# Option defaults. The option targets bound below that are not declared
# here ($PrintHelp, $ODir, $NPerOutDir, ...) are presumably declared
# earlier in the file.
#
my $Date=undef;
my $DumpCatGraph=1;
my $DumpTemplates=0;
my $CatGraphDumpF="CategoryGraph.storable";
my $Language='en';
#
# These three are deliberately left undefined: after option parsing, any
# of them the user did not set is filled in from the settings of the
# selected language. (Previously the language settings unconditionally
# overwrote them, which made the corresponding options no-ops.)
#
my $RootCategory=undef;
my $CategoryWord=undef;
my $TemplateWord=undef;
my @Namespaces=('');
my $NamespacesTxt=undef; # Namespaces to include, default 'Articles'
GetOptions('help|?'=>\$PrintHelp,
           'man'=>\$PrintManual,
           'warnings!'=>\$Warnings,
           'out-dir=s'=>\$ODir,
           'namespaces=s'=>\$NamespacesTxt,
           'N-per-out-dir=s'=>\$NPerOutDir,
           'original!'=>\$IncOrigDoc,
           'expand-templates-fully!'=>\$ExpandTemplates,
           'dump-templates!'=>\$DumpTemplates,
           'template-dump-file=s'=>\$TemplateDumpF,
           'convert-via-html!'=>\$ConvertViaHTML,
           'language=s'=>\$Language,
           'category-word=s'=>\$CategoryWord,
           'root-category=s'=>\$RootCategory,
           'template-word=s'=>\$TemplateWord,
           'date=s'=>\$Date,
           'dump-category-graph!'=>\$DumpCatGraph,
           'category-graph-dump-file=s'=>\$CatGraphDumpF
          ) or
    pod2usage(2);
pod2usage(1) if $PrintHelp;
pod2usage(-exitstatus => 0, -verbose => 2) if $PrintManual;
# Exactly one positional argument (the XML dump file) is required
pod2usage(1) if (@ARGV!=1);
#
# The number of records per output directory is used as a divisor when
# the output subdirectory of an article is computed, so reject anything
# but a positive integer here instead of failing in the middle of a run.
#
if (defined($NPerOutDir) && ($NPerOutDir!~/^\d+$/ || $NPerOutDir<1))
{
    die("--N-per-out-dir must be a positive integer, " .
        "got \"$NPerOutDir\".\n");
}
#
# If we don't want to dump the templates, signal it like this
#
if (!$DumpTemplates)
{
    undef $TemplateDumpF;
}
#
# Check that we know this language
#
if (!exists($LangSettings{$Language}))
{
    die("Unrecognized language abbreviation \"$Language\".\n");
}
#
# Language-specific defaults; an explicit command line option wins
#
if (!defined($RootCategory))
{
    $RootCategory=$LangSettings{$Language}{rootCategory};
}
if (!defined($CategoryWord))
{
    $CategoryWord=$LangSettings{$Language}{categoryWord};
}
if (!defined($TemplateWord))
{
    $TemplateWord=$LangSettings{$Language}{templateWord};
}
#
# Speed vs. (possibly) quality: pick the output format according to
# whether --convert-via-html was given.
#
$OutputFormat=$ConvertViaHTML ?
    $Alvis::Wikipedia::XMLDump::OUTPUT_HTML :
    $Alvis::Wikipedia::XMLDump::OUTPUT_ALVIS;
#
# Append the comma-separated namespaces given on the command line, each
# with surrounding whitespace stripped.
# NOTE(review): the entries are added to whatever @Namespaces already
# holds rather than replacing it -- confirm that this is intended.
#
if ($NamespacesTxt)
{
    push(@Namespaces,
         map
         {
             my $name=$_;
             $name=~s/^\s+//isgo;
             $name=~s/\s+$//isgo;
             $name;
         }
         split(/,/,$NamespacesTxt));
}
#
# Main program: instantiate the converter and run it over the dump file
#
# The only positional argument is the Wikipedia XML dump file
my $XMLDumpF=shift(@ARGV);
# Autoflush the selected output handle (presumably so that the progress
# line, which ends in "\r" instead of a newline, shows up immediately)
$|=1;
my $C=Alvis::Convert->new(outputRootDir=>$ODir,
                          outputNPerSubdir=>$NPerOutDir,
                          outputAtSameLocation=>0,
                          includeOriginalDocument=>$IncOrigDoc);
defined($C) or
    die("Instantiating Alvis::Convert failed.\n");
my %Seen;
my $N=0; # Number of articles handled so far, updated by the article callback
$C->init_output();
my $wikipedia_options={expandTemplates=>$ExpandTemplates,
                       templateDumpF=>$TemplateDumpF,
                       outputFormat=>$OutputFormat,
                       categoryWord=>$CategoryWord,
                       date=>$Date,
                       namespaces=>[@Namespaces],
                       dumpCatGraph=>$DumpCatGraph,
                       catGraphDumpF=>$CatGraphDumpF};
# Arguments: dump file, article callback, options, progress callback
$C->wikipedia($XMLDumpF,
              [\&_output_wikipedia_article],
              $wikipedia_options,
              [\&_wikipedia_progress]) or
    die("Conversion failed. " . $C->errmsg());
# Move past the last "\r"-terminated progress line
print "\n";
#
# Article callback passed to $C->wikipedia(): turns one article into an
# Alvis record and writes it under the output directory.
#
# Arguments, in order:
#   title          article title
#   date           article date
#   output_format  one of the $Alvis::Wikipedia::XMLDump::OUTPUT_* values
#   record_txt     the article text in the given output format
#   is_redir       accepted but not used here
#   namespace      namespace of the article ('' or undef for none)
#
# Uses the file-level $C, $ODir, $NPerOutDir, $Warnings and $N; $N is
# incremented for every article that reaches the output step.
#
# Returns 1, also when the conversion or the output of this article
# fails (the failure is reported with warn). Dies on an unrecognized
# output format.
#
sub _output_wikipedia_article
{
    my ($title,$date,$output_format,$record_txt,$is_redir,$namespace)=@_;

    # Trace of every article title on STDERR (not governed by --warnings)
    warn "TITLE:$title";

    my $alvis_XML;
    if ($output_format eq $Alvis::Wikipedia::XMLDump::OUTPUT_HTML)
    {
        # Tab-separated meta information handed to the HTML converter
        my $meta_txt;
        $meta_txt.="title\t$title\n";
        $meta_txt.="date\t$date\n";
        my $ns_txt="";
        # An undefined namespace is treated like the empty one
        if (defined($namespace) && $namespace ne '')
        {
            $ns_txt="$namespace/";
        }
        $meta_txt.="url\twikipedia/$ns_txt$title\n";

        $alvis_XML=$C->HTML($record_txt,$meta_txt,{sourceEncoding=>'utf8'});
        if (!defined($alvis_XML))
        {
            warn "Obtaining the Alvis version of the " .
                "HTML version of an article failed. " . $C->errmsg() if
                $Warnings;
            $C->clearerr();
            # Skip this article but let the conversion go on
            return 1;
        }
    }
    elsif ($output_format eq $Alvis::Wikipedia::XMLDump::OUTPUT_ALVIS)
    {
        $alvis_XML=$record_txt;
    }
    else
    {
        die("Internal inconsistency: output format of a Wikipedia article " .
            "is an unrecognized one: \"$output_format\".");
    }

    # The title becomes a file name, so it must not contain path separators
    $title=~tr{/}{_};

    # Subdirectories are numbered from 1, $NPerOutDir articles in each
    my $dir=$ODir . "/" . (int($N/$NPerOutDir)+1);
    if (!-d $dir)
    {
        # File::Path is a core module. Unlike "mkdir -p" run through a
        # shell, it copes with any characters in the path, does not spawn
        # a process per article and works on non-Unix systems.
        require File::Path;
        # mkpath() croaks on failure, hence the eval
        if (!eval { File::Path::mkpath($dir); 1 })
        {
            warn "Creating the output directory \"$dir\" failed. $@";
        }
    }

    my $base_name=$dir . "/" . $title;
    if (!$C->output_Alvis([$alvis_XML],$base_name))
    {
        warn "Outputting the Alvis records for base name " .
            "\"$base_name\" failed. " . $C->errmsg();
        $C->clearerr();
    }
    $N++;

    return 1;
}
#
# Progress callback passed to $C->wikipedia(): prints a one-line status
# that ends in a carriage return instead of a newline.
#
# Arguments, in order:
#   prog_txt           text printed at the start of the line
#   total_nof_records  printed as "Total:"
#   nof_hits           printed as "Found:"
#   mess               free-form message; used only when one of the two
#                      counts is undefined, and replaced by "" if it is
#                      undefined itself
#
sub _wikipedia_progress
{
    my ($prog_txt,$total_nof_records,$nof_hits,$mess)=@_;

    my $line;
    if (defined($total_nof_records) && defined($nof_hits))
    {
        $line=sprintf("%s Total:%d Found:%d",$prog_txt,
                      $total_nof_records,$nof_hits);
    }
    else
    {
        # The message is padded to 70 columns
        $line=sprintf("%s %-70s",$prog_txt,defined($mess) ? $mess : "");
    }
    print $line . "\r";
}
__END__
=head1 NAME
wikipedia2alvis.pl - Wikipedia XML dump to Alvis XML converter
=head1 SYNOPSIS
wikipedia2alvis.pl [options] <Wikipedia XML dump file>
( run in 1.436 second using v1.01-cache-2.11-cpan-39bf76dae61 )