Alvis-Convert

 view release on metacpan or  search on metacpan

bin/wikipedia2alvis  view on Meta::CPAN

#
# Option defaults.
#
# The root category and the category/template words deliberately start
# out undefined: after option parsing this lets us tell "given on the
# command line" apart from "not given", so that an explicit
# --root-category / --category-word / --template-word is honoured and
# the per-language table is only used as a fallback.
#
my $Date=undef;
my $DumpCatGraph=1;
my $DumpTemplates=0;
my $CatGraphDumpF="CategoryGraph.storable";
my $Language='en';
my $RootCategory=undef;           # option value, else language setting
my $CategoryWord=undef;           # option value, else language setting
my $TemplateWord=undef;           # option value, else language setting
my @Namespaces=('');              # Namespaces to include, default 'Articles'
my $NamespacesTxt=undef;          # Raw --namespaces value (comma-separated)

GetOptions('help|?'=>\$PrintHelp, 
	   'man'=>\$PrintManual,
	   'warnings!'=>\$Warnings,
	   'out-dir=s'=>\$ODir,
	   'namespaces=s'=>\$NamespacesTxt,
	   'N-per-out-dir=s'=>\$NPerOutDir,
	   'original!'=>\$IncOrigDoc,
	   'expand-templates-fully!'=>\$ExpandTemplates,
	   'dump-templates!'=>\$DumpTemplates,
	   'template-dump-file=s'=>\$TemplateDumpF,
	   'convert-via-html!'=>\$ConvertViaHTML,
	   'language=s'=>\$Language,
	   'category-word=s'=>\$CategoryWord,
	   'root-category=s'=>\$RootCategory,
	   'template-word=s'=>\$TemplateWord,
	   'date=s'=>\$Date,
	   'dump-category-graph!'=>\$DumpCatGraph,
	   'category-graph-dump-file=s'=>\$CatGraphDumpF
	   ) or 
    pod2usage(2);
pod2usage(1) if $PrintHelp;
pod2usage(-exitstatus => 0, -verbose => 2) if $PrintManual;
# Exactly one positional argument (the XML dump file) is required
pod2usage(1) if (@ARGV!=1);

#
# If we don't want to dump the templates, signal it like this
#
if (!$DumpTemplates)
{
    undef $TemplateDumpF;
}
# 
# Check that we know this language
#
if (!exists($LangSettings{$Language}))
{
    die("Unrecognized language abbreviation \"$Language\".\n");
}
#
# Fill in whatever was not given explicitly on the command line from the
# settings of the selected language.  (Previously the language settings
# were assigned unconditionally, which silently discarded the values of
# --root-category, --category-word and --template-word.)
#
if (!defined($RootCategory))
{
    $RootCategory=$LangSettings{$Language}{rootCategory};
}
if (!defined($CategoryWord))
{
    $CategoryWord=$LangSettings{$Language}{categoryWord};
}
if (!defined($TemplateWord))
{
    $TemplateWord=$LangSettings{$Language}{templateWord};
}
#
# Choose the record format requested from the dump parser: HTML when
# --convert-via-html was given, native Alvis otherwise.
# (Original author's note on this trade-off: speed vs. (possibly) quality.)
#
$OutputFormat=$ConvertViaHTML ?
    $Alvis::Wikipedia::XMLDump::OUTPUT_HTML :
    $Alvis::Wikipedia::XMLDump::OUTPUT_ALVIS;

#
# Append the comma-separated namespaces given with --namespaces, each
# stripped of leading and trailing whitespace, to the namespace list.
#
if ($NamespacesTxt)
{
    push(@Namespaces,
         map {
             my $name=$_;
             $name=~s/^\s+//isgo;
             $name=~s/\s+$//isgo;
             $name;
         } split(/,/,$NamespacesTxt));
}


# The one remaining command-line argument is the XML dump to convert
my $XMLDumpF=shift(@ARGV);

# Autoflush STDOUT; the progress callback rewrites its line with "\r"
$|=1;

my $C=Alvis::Convert->new(outputRootDir=>$ODir,
                          outputNPerSubdir=>$NPerOutDir,
                          outputAtSameLocation=>0,
                          includeOriginalDocument=>$IncOrigDoc);
die("Instantiating Alvis::Convert failed.\n") if (!defined($C));

my %Seen;

# Number of articles handed to output so far; drives the numbered
# output subdirectory chosen in _output_wikipedia_article()
my $N=0;

$C->init_output();

# Settings passed through to the Wikipedia dump conversion
my $WikipediaOptions={expandTemplates=>$ExpandTemplates,
                      templateDumpF=>$TemplateDumpF,
                      outputFormat=>$OutputFormat,
                      categoryWord=>$CategoryWord,
                      date=>$Date,
                      namespaces=>[@Namespaces],
                      dumpCatGraph=>$DumpCatGraph,
                      catGraphDumpF=>$CatGraphDumpF};

# Second argument: article callbacks; fourth argument: progress callbacks
my $ConversionOK=$C->wikipedia($XMLDumpF,
                               [\&_output_wikipedia_article],
                               $WikipediaOptions,
                               [\&_wikipedia_progress]);
if (!$ConversionOK)
{
    die("Conversion failed. " . $C->errmsg());
}
# Finish the "\r"-terminated progress line
print "\n";


#
# Article callback handed to $C->wikipedia(): obtains the Alvis XML for
# one article and writes it below the output root directory.
#
# Positional arguments:
#   title          article title (also used for the output base name)
#   date           article date (only used in the HTML meta information)
#   output format  one of the Alvis::Wikipedia::XMLDump OUTPUT_* values
#   record text    the article in the given output format
#   is redirect    accepted but not used by this routine
#   namespace      article namespace, '' for none
#
# Returns 1 in every non-fatal case (including a failed HTML conversion
# or a failed write, which are only reported).  Dies if the output
# format is not a recognized one.
#
sub _output_wikipedia_article
{
    my $title=shift;
    my $date=shift;
    my $output_format=shift;
    my $record_txt=shift;
    my $is_redir=shift;
    my $namespace=shift;

    # Unconditional per-article trace on STDERR
    warn "TITLE:$title";

    my $alvis_XML;
    if ($output_format eq $Alvis::Wikipedia::XMLDump::OUTPUT_HTML)
    {
        # Meta information for the HTML converter: tab-separated
        # name/value pairs, one per line
        my $meta_txt;
        $meta_txt.="title\t$title\n";
        $meta_txt.="date\t$date\n";
        my $ns_txt="";
        if ($namespace ne '')
        {
            $ns_txt="$namespace/";
        }
        $meta_txt.="url\twikipedia/$ns_txt$title\n";

        $alvis_XML=$C->HTML($record_txt,$meta_txt,{sourceEncoding=>'utf8'});
        if (!defined($alvis_XML))
        {
            warn "Obtaining the Alvis version of the " .
                "HTML version of an article failed. " . $C->errmsg() if
                $Warnings;
            $C->clearerr();
            # Skip this article without counting it, but keep going
            return 1;
        }

    }
    elsif ($output_format eq $Alvis::Wikipedia::XMLDump::OUTPUT_ALVIS)
    {
        $alvis_XML=$record_txt;
    }
    else
    {
        die("Internal inconsistency: output format of a Wikipedia article " .
            "is an unrecognized one: \"$output_format\".");
    }

    # '/' cannot appear in a file name
    $title=~s/\//_/isgo;
    # Articles are spread over numbered subdirectories 1, 2, ...
    my $dir=$ODir . "/" . (int($N/$NPerOutDir)+1);

    # Create the subdirectory without going through a shell: the former
    # system("mkdir -p $dir") broke on output paths containing spaces or
    # shell metacharacters, was not portable, and forked per article.
    if (!(-d $dir))
    {
        require File::Path;
        if (!eval { File::Path::mkpath($dir); 1 })
        {
            # Best effort, as before: report and let the write below fail
            warn "Creating the output directory \"$dir\" failed. $@";
        }
    }

    my $base_name=$dir . "/" . $title;

    if (!$C->output_Alvis([$alvis_XML],$base_name))
    {
        warn "Outputting the Alvis records for base name " .
            "\"$base_name\" failed. " . $C->errmsg();
        $C->clearerr();
    }

    $N++;

    return 1;
}

#
# Progress callback: prints a one-line status to STDOUT, terminated by a
# carriage return so that the next status overwrites it.
#
# Positional arguments: progress text, total number of records, number
# of hits, free-form message.  When both counts are defined the counts
# are shown; otherwise the message (empty if undefined) is shown, padded
# to 70 columns.
#
sub _wikipedia_progress
{
    my ($stage,$records_total,$hits,$note)=@_;

    my $have_counts=(defined($records_total) && defined($hits));

    if (!$have_counts)
    {
        $note="" unless defined($note);
        my $line=sprintf("%s %-70s",$stage,$note);
        print $line . "\r";
    }
    else
    {
        my $line=sprintf("%s Total:%d Found:%d",$stage,
                         $records_total,$hits);
        print $line . "\r";
    }
}


__END__

=head1 NAME
    
    wikipedia2alvis.pl - Wikipedia XML dump to Alvis XML converter
    
=head1 SYNOPSIS
    
    wikipedia2alvis.pl [options] <Wikipedia XML dump file>



( run in 1.436 second using v1.01-cache-2.11-cpan-39bf76dae61 )