Alvis-Convert
view release on metacpan or search on metacpan
bin/wikipedia2alvis view on Meta::CPAN
#!/usr/bin/perl -w
use Getopt::Long;
Getopt::Long::Configure ("bundling");
use Pod::Usage;
use Alvis::Convert;
use Alvis::Wikipedia::XMLDump;
use strict;
#
# Language-dependent identifiers. Add to these according to your needs.
# TO BE DONE: add a configuration option so these can be read from a file...
#
# Keyed by language abbreviation. Each entry lists the identifiers known
# for that language; note that the 'sl' entry defines no templateWord.
#
my %LangSettings = (
    'en' => {
        rootCategory => 'fundamental',
        categoryWord => 'Category',
        templateWord => 'Template',
    },
    'fr' => {
        rootCategory => 'principale',
        categoryWord => 'Catégorie',
        templateWord => 'Modèle',
    },
    'sl' => {
        rootCategory => 'temeljna',
        categoryWord => 'Category',
    },
);

#
# Defaults for everything that can be set from the command line.
#
my $PrintManual     = 0;
my $PrintHelp       = 0;
my $Warnings        = 1;
my $ODir            = '.';
my $NPerOutDir      = 1000;
my $IncOrigDoc      = 0;
my $ExpandTemplates = 0;
my $TemplateDumpF   = 'Templates.storable';
my $OutputFormat    = 'HTML';
my $ConvertViaHTML  = 1;
my $Date;                      # undefined unless --date is given
my $DumpCatGraph    = 1;
my $DumpTemplates   = 0;
my $CatGraphDumpF   = 'CategoryGraph.storable';
my $Language        = 'en';
my $RootCategory    = 'fundamental';
my $CategoryWord    = 'Category';
my $TemplateWord    = 'Template';
my @Namespaces      = ('');
my $NamespacesTxt;             # Namespaces to include, default 'Articles'
#
# Parse the command line. A parsing error prints the usage text and
# exits with status 2. --help and --man print documentation and exit.
#
GetOptions('help|?'                     => \$PrintHelp,
           'man'                        => \$PrintManual,
           'warnings!'                  => \$Warnings,
           'out-dir=s'                  => \$ODir,
           'namespaces=s'               => \$NamespacesTxt,
           # Integer-typed: the value is used as a divisor when the output
           # subdirectory of a record is computed.
           'N-per-out-dir=i'            => \$NPerOutDir,
           'original!'                  => \$IncOrigDoc,
           'expand-templates-fully!'    => \$ExpandTemplates,
           'dump-templates!'            => \$DumpTemplates,
           'template-dump-file=s'       => \$TemplateDumpF,
           'convert-via-html!'          => \$ConvertViaHTML,
           'language=s'                 => \$Language,
           'category-word=s'            => \$CategoryWord,
           'root-category=s'            => \$RootCategory,
           'template-word=s'            => \$TemplateWord,
           'date=s'                     => \$Date,
           'dump-category-graph!'       => \$DumpCatGraph,
           'category-graph-dump-file=s' => \$CatGraphDumpF
          ) or
    pod2usage(2);
pod2usage(1) if $PrintHelp;
pod2usage(-exitstatus => 0, -verbose => 2) if $PrintManual;
# Exactly one positional argument, the XML dump file, is required.
pod2usage(1) if (@ARGV!=1);
# Zero or a negative count would make the subdirectory computation divide
# by zero or yield nonsensical directory numbers, so reject it up front.
if ($NPerOutDir<1)
{
    pod2usage(-message => "--N-per-out-dir must be a positive integer.\n",
              -exitval => 2);
}
#
# If we don't want to dump the templates, signal it like this
#
if (!$DumpTemplates)
{
    undef $TemplateDumpF;
}
#
# Check that we know this language
#
if (!exists($LangSettings{$Language}))
{
    die("Unrecognized language abbreviation \"$Language\".\n");
}
else
{
    # The language table overrules the corresponding command-line
    # options, but only for the identifiers it actually defines. An
    # entry lacking one (e.g. 'sl' has no templateWord) keeps the
    # current value instead of resetting it to undef.
    my $settings=$LangSettings{$Language};
    $RootCategory=$settings->{rootCategory}
        if defined($settings->{rootCategory});
    $CategoryWord=$settings->{categoryWord}
        if defined($settings->{categoryWord});
    $TemplateWord=$settings->{templateWord}
        if defined($settings->{templateWord});
}
#
# Speed vs. (possibly) quality
#
if ($ConvertViaHTML)
{
    $OutputFormat=$Alvis::Wikipedia::XMLDump::OUTPUT_HTML;
}
else
{
    $OutputFormat=$Alvis::Wikipedia::XMLDump::OUTPUT_ALVIS;
}
#
# Extra namespaces: a ','-separated list, surrounding whitespace is
# ignored. The test is on definedness/emptiness rather than truth so
# that a value Perl considers false (such as "0") is not silently dropped.
#
if (defined($NamespacesTxt) && $NamespacesTxt ne '')
{
    # Names already in the list (initially the article namespace '') are
    # not added a second time.
    my %listed=map { $_=>1 } @Namespaces;
    for my $ns (split(/,/,$NamespacesTxt))
    {
        $ns=~s/^\s+//;
        $ns=~s/\s+$//;
        next if $listed{$ns}++;
        push(@Namespaces,$ns);
    }
}
#
# Main flow: set up the converter and run it over the dump file.
#
# The only remaining command-line argument is the XML dump file.
my $XMLDumpF = shift @ARGV;

# Autoflush the currently selected output handle so that the "\r"
# terminated progress lines show up immediately.
$| = 1;

my $C = Alvis::Convert->new(
    outputRootDir           => $ODir,
    outputNPerSubdir        => $NPerOutDir,
    outputAtSameLocation    => 0,
    includeOriginalDocument => $IncOrigDoc,
);
defined($C) or die("Instantiating Alvis::Convert failed.\n");

my %Seen;
my $N = 0;    # incremented by _output_wikipedia_article()

$C->init_output();

# Arguments: the dump file, the article callback, the conversion options
# and the progress callback.
my $conversion_ok = $C->wikipedia(
    $XMLDumpF,
    [ \&_output_wikipedia_article ],
    {
        expandTemplates => $ExpandTemplates,
        templateDumpF   => $TemplateDumpF,
        outputFormat    => $OutputFormat,
        categoryWord    => $CategoryWord,
        date            => $Date,
        namespaces      => [ @Namespaces ],
        dumpCatGraph    => $DumpCatGraph,
        catGraphDumpF   => $CatGraphDumpF,
    },
    [ \&_wikipedia_progress ]
);
unless ($conversion_ok)
{
    die("Conversion failed. " . $C->errmsg());
}

# Finish the last "\r"-terminated progress line.
print "\n";
#
# Article callback passed to $C->wikipedia(): converts one record to
# Alvis XML (if it is not in that format already) and writes it under a
# numbered subdirectory of the output directory.
#
# Positional arguments:
#   title          record title
#   date           record date
#   output format  one of the Alvis::Wikipedia::XMLDump OUTPUT_* values
#   record text    the record in the given output format
#   is_redir       redirect flag (not used here)
#   namespace      namespace of the record, '' for articles
#
# Returns 1, also when converting or writing a single record fails; such
# failures are only warned about (subject to --[no-]warnings). Dies on an
# unrecognized output format.
#
sub _output_wikipedia_article
{
    my ($title,$date,$output_format,$record_txt,$is_redir,$namespace)=@_;

    # Per-record trace; obeys --[no-]warnings like the other diagnostics.
    warn "TITLE:$title" if $Warnings;

    my $alvis_XML;
    if ($output_format eq $Alvis::Wikipedia::XMLDump::OUTPUT_HTML)
    {
        # Only non-article namespaces contribute a path component.
        my $ns_txt="";
        if (defined($namespace) && $namespace ne '')
        {
            $ns_txt="$namespace/";
        }
        # Metadata for the HTML converter: one "name<TAB>value" per line.
        my $meta_txt="title\t$title\n" .
                     "date\t$date\n" .
                     "url\twikipedia/$ns_txt$title\n";

        $alvis_XML=$C->HTML($record_txt,$meta_txt,{sourceEncoding=>'utf8'});
        if (!defined($alvis_XML))
        {
            warn "Obtaining the Alvis version of the " .
                 "HTML version of an article failed. " . $C->errmsg() if
                 $Warnings;
            $C->clearerr();
            return 1;
        }
    }
    elsif ($output_format eq $Alvis::Wikipedia::XMLDump::OUTPUT_ALVIS)
    {
        $alvis_XML=$record_txt;
    }
    else
    {
        die("Internal inconsistency: output format of a Wikipedia article " .
            "is an unrecognized one: \"$output_format\".");
    }

    # A '/' in the title would be taken as a directory separator.
    $title=~tr{/}{_};

    # Subdirectories are numbered from 1; each takes $NPerOutDir records.
    my $dir=$ODir . "/" . (int($N/$NPerOutDir)+1);
    if (!-d $dir)
    {
        # Create the directory in-process instead of through a shell:
        # no quoting problems with unusual directory names, no external
        # "mkdir" needed, and no fork per record.
        require File::Path;
        eval { File::Path::mkpath($dir); };
        if ($@)
        {
            warn "Creating the output directory \"$dir\" failed. $@" if
                $Warnings;
        }
    }

    my $base_name=$dir . "/" . $title;
    if (!$C->output_Alvis([$alvis_XML],$base_name))
    {
        warn "Outputting the Alvis records for base name " .
             "\"$base_name\" failed. " . $C->errmsg() if
             $Warnings;
        $C->clearerr();
    }

    # Counted whether or not the write succeeded.
    $N++;

    return 1;
}
#
# Progress callback passed to $C->wikipedia(): prints a single status
# line terminated by "\r" (no newline) to the selected output handle.
#
# Positional arguments:
#   prog_txt           leading text of the status line
#   total_nof_records  record count, may be undefined
#   nof_hits           hit count, may be undefined
#   mess               free-form message, used only when a count is missing
#
# When both counts are defined they are shown; otherwise the message
# (empty if undefined) is shown, left-justified in a 70-character field.
#
sub _wikipedia_progress
{
    my ($prog_txt,$total_nof_records,$nof_hits,$mess)=@_;

    my $have_counts=defined($total_nof_records) && defined($nof_hits);

    my $line=$have_counts
        ? sprintf("%s Total:%d Found:%d",
                  $prog_txt,$total_nof_records,$nof_hits)
        : sprintf("%s %-70s",$prog_txt,defined($mess) ? $mess : "");

    print $line . "\r";
}
__END__
=head1 NAME
wikipedia2alvis.pl - Wikipedia XML dump to Alvis XML converter
=head1 SYNOPSIS
wikipedia2alvis.pl [options] <Wikipedia XML dump file>
Options:
--out-dir output directory
--namespaces list of namespaces to extract
--N-per-out-dir # of records per output directory
--[no-]original include original document?
--[no-]expand-templates-fully do we try to expand templates fully?
--[no-]dump-templates do we dump the templates?
--template-dump-file the file to dump the templates to
--[no-]convert-via-html do we convert via HTML or directly to Alvis?
--date the date of the Wikipedia dump
--[no-]dump-category-graph do we dump the category graph?
--category-graph-dump-file the file to dump the category graph to
--category-word category namespace identifier
--root-category root category identifier
--template-word template namespace identifier
--language the language of the Wikipedia dump
--help brief help message
--man full documentation
--[no-]warnings warnings output flag
=head1 OPTIONS
=over 8
=item B<--out-dir>
Sets the output directory. Default value: '.'.
=item B<--namespaces>
Sets the namespaces whose records to extract. Given as a ','-separated
list. The namespace names have to be the exact identifiers.
Articles are always extracted. Default value: '' (the empty namespace), i.e. articles.
=item B<--N-per-out-dir>
Sets the # of records per output directory. Default value: 1000.
=item B<--[no-]original>
Shall the original document be included in the output? Default
value: no.
=item B<--[no-]expand-templates-fully>
Do we try to expand templates fully or do we simply insert a list of
the template parameter values given in the call? Default value: no.
=item B<--[no-]dump-templates>
Do we dump the templates onto disk in a loadable format?
Default value: no.
=item B<--template-dump-file>
The name of the (possible) template dump file. Default value:
'Templates.storable'.
=item B<--[no-]convert-via-html>
Do we sacrifice speed for (possibly) better quality by converting from
Wikitext to Alvis XML via an intermediate HTML version?
Default value: yes.
=item B<--language>
The language of the Wikipedia dump. Affects category and template
extraction. Possible values: 'en' (English), 'fr' (French), 'sl'
(Slovenian). Default value: 'en'.
=item B<--category-word>
The identifier for the category namespace. Overruled by '--language'.
Default value: 'Category'.
=item B<--root-category>
The identifier for the root category of the category graph.
Overruled by '--language'. Default value: 'fundamental'.
=item B<--template-word>
The identifier for the template namespace. Overruled by '--language'.
Default value: 'Template'.
=item B<--date>
The date of the Wikipedia dump as YYYYMMDD. Default value: undefined
(means: use current date).
=item B<--[no-]dump-category-graph>
Do we dump the category graph onto disk in a loadable format?
Default value: yes.
=item B<--category-graph-dump-file>
The name of the (possible) category graph dump file. Default value:
'CategoryGraph.storable'.
=item B<--help>
Prints a brief help message and exits.
=item B<--man>
( run in 0.789 second using v1.01-cache-2.11-cpan-5b529ec07f3 )