Alvis-Convert

 view release on metacpan or  search on metacpan

bin/wikipedia2alvis  view on Meta::CPAN


# The next command-line argument is the Wikipedia XML dump to convert.
my $XMLDumpF=shift(@ARGV);

# Autoflush the currently selected output handle; the progress callback
# prints "\r"-terminated lines that contain no newline.
$|=1;

# Converter object; the output callback below also uses it to build and
# write the Alvis records.
my $C=Alvis::Convert->new(outputRootDir=>$ODir,
                          outputNPerSubdir=>$NPerOutDir,
                          outputAtSameLocation=>0,
                          includeOriginalDocument=>$IncOrigDoc);
defined($C)
    or die("Instantiating Alvis::Convert failed.\n");

my %Seen;

# Number of articles written so far; the output callback derives the
# numbered output subdirectory from it.
my $N=0;
$C->init_output();

{
    # Settings passed through to the Wikipedia dump conversion.
    my %wikipedia_opts=(expandTemplates=>$ExpandTemplates,
                        templateDumpF=>$TemplateDumpF,
                        outputFormat=>$OutputFormat,
                        categoryWord=>$CategoryWord,
                        date=>$Date,
                        namespaces=>[@Namespaces],
                        dumpCatGraph=>$DumpCatGraph,
                        catGraphDumpF=>$CatGraphDumpF);

    # The callback lists presumably handle article output and progress
    # reporting, respectively - confirm against Alvis::Convert::wikipedia().
    $C->wikipedia($XMLDumpF,
                  [\&_output_wikipedia_article],
                  \%wikipedia_opts,
                  [\&_wikipedia_progress])
        or die("Conversion failed. " . $C->errmsg());
}

# Finish the line left open by the "\r"-terminated progress output.
print "\n";


# Output callback handed to $C->wikipedia(): turns one article into an
# Alvis record and writes it below $ODir.
#
# Positional arguments:
#   $title          article title; goes into the metadata and, with "/"
#                   replaced by "_", becomes the output base name
#   $date           date string copied into the metadata
#   $output_format  one of the $Alvis::Wikipedia::XMLDump::OUTPUT_*
#                   constants, telling how to interpret $record_txt
#   $record_txt     the article itself: HTML, or a ready-made Alvis record
#   $is_redir       accepted but not used here
#   $namespace      namespace name; empty or undefined means none
#
# Always returns 1. An article whose HTML conversion fails is skipped
# (with a warning only if $Warnings is set); a failed write is warned
# about. Dies only on an unrecognized output format.
sub _output_wikipedia_article
{
    my $title=shift;
    my $date=shift;
    my $output_format=shift;
    my $record_txt=shift;
    my $is_redir=shift;
    my $namespace=shift;

    # NOTE(review): unconditional per-article trace on STDERR; looks like
    # leftover debugging output - confirm before removing.
    warn "TITLE:$title";

    my $alvis_XML;
    if ($output_format eq $Alvis::Wikipedia::XMLDump::OUTPUT_HTML)
    {
        # An undefined namespace is treated like an empty one.
        my $ns_txt="";
        if (defined($namespace) && $namespace ne '')
        {
            $ns_txt="$namespace/";
        }

        # Tab-separated "name<TAB>value" metadata lines for the converter.
        my $meta_txt="title\t$title\n" .
            "date\t$date\n" .
            "url\twikipedia/$ns_txt$title\n";

        $alvis_XML=$C->HTML($record_txt,$meta_txt,{sourceEncoding=>'utf8'});
        if (!defined($alvis_XML))
        {
            warn "Obtaining the Alvis version of the " .
                "HTML version of an article failed. " . $C->errmsg() if
                $Warnings;
            $C->clearerr();
            # Skip this article but keep the conversion going.
            return 1;
        }
    }
    elsif ($output_format eq $Alvis::Wikipedia::XMLDump::OUTPUT_ALVIS)
    {
        # Already an Alvis record; pass it through untouched.
        $alvis_XML=$record_txt;
    }
    else
    {
        die("Internal inconsistency: output format of a Wikipedia article " .
            "is an unrecognized one: \"$output_format\".");
    }

    # A "/" in the title would otherwise act as a path separator in the
    # output file name.
    $title=~tr{/}{_};

    # Output subdirectories are numbered from 1 and receive $NPerOutDir
    # articles each.
    my $dir=$ODir . "/" . (int($N/$NPerOutDir)+1);

    # Create the directory in-process instead of shelling out to
    # "mkdir -p": no shell interpretation of $ODir, no fork per article,
    # and no dependency on a Unix mkdir. mkpath() croaks on failure, so
    # trap that and carry on; output_Alvis() below reports the fallout.
    if (!-d $dir)
    {
        require File::Path;
        if (!eval { File::Path::mkpath($dir); 1 })
        {
            warn "Creating the output directory \"$dir\" failed. $@";
        }
    }

    my $base_name=$dir . "/" . $title;

    if (!$C->output_Alvis([$alvis_XML],$base_name))
    {
        warn "Outputting the Alvis records for base name " .
            "\"$base_name\" failed. " . $C->errmsg();
        $C->clearerr();
    }

    # Counted even when the write failed, as in the original code.
    $N++;

    return 1;
}

# Progress callback: prints a one-line status to the currently selected
# output handle, terminated by "\r" rather than a newline.
#
# Positional arguments:
#   $prog_txt           leading status text
#   $total_nof_records  record count so far (optional)
#   $nof_hits           hit count so far (optional)
#   $mess               free-form message (optional, defaults to "")
#
# With both counts defined the line is "<text> Total:<n> Found:<m>";
# otherwise it is "<text> <message>" with the message left-justified and
# padded to 70 characters. Returns whatever print() returns.
sub _wikipedia_progress
{
    my ($prog_txt,$total_nof_records,$nof_hits,$mess)=@_;

    my $have_counts=defined($total_nof_records) && defined($nof_hits);

    # Only consulted in the message form, where a missing message means "".
    $mess="" unless defined($mess);

    my $line=$have_counts
        ? sprintf("%s Total:%d Found:%d",$prog_txt,
                  $total_nof_records,$nof_hits)
        : sprintf("%s %-70s",$prog_txt,$mess);

    print "$line\r";
}



( run in 0.780 second using v1.01-cache-2.11-cpan-140bd7fdf52 )