view release on metacpan or search on metacpan
bin/wikipedia2alvis view on Meta::CPAN
{
$RootCategory=$LangSettings{$Language}{rootCategory};
$CategoryWord=$LangSettings{$Language}{categoryWord};
$TemplateWord=$LangSettings{$Language}{templateWord};
}
#
# Speed vs. (possibly) quality: pick the record output format according
# to whether the conversion was requested to go via HTML.
#
$OutputFormat=$ConvertViaHTML
    ? $Alvis::Wikipedia::XMLDump::OUTPUT_HTML
    : $Alvis::Wikipedia::XMLDump::OUTPUT_ALVIS;
# The namespace option is a comma-separated list; every entry is recorded
# with its leading and trailing whitespace removed.
if ($NamespacesTxt)
{
    push(@Namespaces,
         map {
             my $name=$_;
             $name=~s/^\s+//;
             $name=~s/\s+$//;
             $name;
         } split(/,/,$NamespacesTxt));
}
bin/wikipedia2alvis view on Meta::CPAN
my $title=shift;
my $date=shift;
my $output_format=shift;
my $record_txt=shift;
my $is_redir=shift;
my $namespace=shift;
warn "TITLE:$title";
my $alvis_XML;
if ($output_format eq $Alvis::Wikipedia::XMLDump::OUTPUT_HTML)
{
my $meta_txt;
$meta_txt.="title\t$title\n";
$meta_txt.="date\t$date\n";
my $ns_txt="";
if ($namespace ne '')
{
$ns_txt="$namespace/";
}
$meta_txt.="url\twikipedia/$ns_txt$title\n";
bin/wikipedia2alvis view on Meta::CPAN
if (!defined($alvis_XML))
{
warn "Obtaining the Alvis version of the " .
"HTML version of an article failed. " . $C->errmsg() if
$Warnings;
$C->clearerr();
return 1;
}
}
elsif ($output_format eq $Alvis::Wikipedia::XMLDump::OUTPUT_ALVIS)
{
$alvis_XML=$record_txt;
}
else
{
die("Internal inconsistency: output format of a Wikipedia article " .
"is an unrecognized one: \"$output_format\".");
}
$title=~s/\//_/isgo;
lib/Alvis/Convert.pm view on Meta::CPAN
$ERR_XML_PARSE,
$ERR_NO_URL,
$ERR_ENCODING_WIZARD,
$ERR_UTF8_CONV,
$ERR_ENCODING_CONV,
$ERR_TYPE_SUFFIX,
$ERR_READ_HTML,
$ERR_READ_NEWS_XML,
$ERR_ALVIS_CONV,
$ERR_ALVIS_SUFFIX,
$ERR_NO_OUTPUT_ROOT_DIR,
$ERR_WRITING_OUTPUT,
$ERR_DIR_CONV,
$ERR_NO_HTML_F,
$ERR_META_F,
$ERR_HTML_F,
$ERR_NEWS_XML_F,
$ERR_DOC_ALVIS_CONV,
$ERR_NEWS_XML_PARSE,
$ERR_MULTIPLE_SUFFIX_MEANING,
$ERR_OUTPUT_ALVIS,
$ERR_OUTPUT_SET_OF_RECORDS,
$ERR_AINODUMP,
$ERR_OPEN_AINODUMP,
$ERR_AINODUMP_PROCESS,
$ERR_DOC_TYPE_WIZARD,
$ERR_TYPE_GUESS,
$ERR_UNK_FILE_TYPE,
$ERR_WIKIPEDIA,
$ERR_OPEN_WIKIPEDIA,
$ERR_WIKIPEDIA_CONV
)=(0..40);
lib/Alvis/Convert.pm view on Meta::CPAN
$ERR_ENCODING_WIZARD=>"Unable to instantiate " .
"Alvis::Document::Encoding.",
$ERR_UTF8_CONV=>"Trying to convert to UTF-8 failed.",
$ERR_ENCODING_CONV=>"Converting from the supposed source " .
"encoding to UTF-8 failed.",
$ERR_TYPE_SUFFIX=>"No suffix given for a type.",
$ERR_READ_HTML=>"Reading the HTML failed.",
$ERR_READ_NEWS_XML=>"Reading the news XML failed.",
$ERR_ALVIS_CONV=>"Conversion to Alvis format failed.",
$ERR_ALVIS_SUFFIX=>"No Alvis suffix defined.",
$ERR_NO_OUTPUT_ROOT_DIR=>"No output root directory.",
$ERR_WRITING_OUTPUT=>"Writing the output failed.",
$ERR_DIR_CONV=>"Converting a directory failed.",
$ERR_NO_HTML_F=>"No HTML file.",
$ERR_META_F=>"Opening the meta file failed.",
$ERR_HTML_F=>"Opening the HTML file failed.",
$ERR_NEWS_XML_F=>"Opening the news XML file failed.",
$ERR_DOC_ALVIS_CONV=>"Converting a document to Alvis format failed.",
$ERR_NEWS_XML_PARSE=>"Parsing the news XML failed.",
$ERR_MULTIPLE_SUFFIX_MEANING=>
"Multiple meanings for a single suffix.",
$ERR_OUTPUT_ALVIS=>"Outputting the Alvis records failed.",
$ERR_OUTPUT_SET_OF_RECORDS=>"Outputting a set of records to a " .
"file as a documentCollection failed.",
$ERR_AINODUMP=>"Instantiating Alvis::AinoDump failed.",
$ERR_OPEN_AINODUMP=>"Opening an ainodump file failed.",
$ERR_AINODUMP_PROCESS=>"Processing an ainodump file failed.",
$ERR_DOC_TYPE_WIZARD=>"Instantiating Alvis::Document::Type " .
"failed.",
$ERR_TYPE_GUESS=>"Guessing the document's type failed.",
$ERR_UNK_FILE_TYPE=>"Unrecognized file type.",
$ERR_WIKIPEDIA=>"Instantiating Alvis::Wikipedia::XMLDump failed.",
$ERR_OPEN_WIKIPEDIA=>"Opening the Wikipedia XML dump file failed.",
lib/Alvis/Convert.pm view on Meta::CPAN
}
#
# output_cb: [\&_output_wikipedia_article,$arg1,$arg2,...]
# will be called like this:
# _output_wikipedia_article($arg1,$arg2,...,
# $title,$date,$output_format,
# $record_txt,$is_redir,$namespace)
#
# where $output_format is a global defined in Alvis::Wikipedia::XMLDump
# as $OUTPUT_*
#
#
# progress_cb: [\&_wikipedia_progress,$arg1,$arg2,...] OPTIONAL
# will be called like this:
# _wikipedia_progress($arg1,$arg2,...,
# $prog_txt,$N,$n,$mess)
#
# where $N is the total number of records processed and $n the number of hits
#
# opts: a hash of options with these possible fields:
#
# namespaces ref to a list of namespace identifiers whose
# records to extract
# expandTemplates flag for true template expansion
# templateDumpF template dump file
# outputFormat format for result records
# ($Alvis::Wikipedia::XMLDump::OUTPUT_*)
# categoryWord category namespace identifier (changes with
# language)
# templateWord template namespace identifier (changes with
# language)
# rootCategory root category identifier (changes with
# language)
# date the date of the dump
# dumpCatGraph flag for dumping the category graph
# catGraphDumpF category graph dump file
#
lib/Alvis/Convert.pm view on Meta::CPAN
$self->_set_err_state($ERR_ALVIS_SUFFIX);
return 0;
}
if ($self->{outputAtSameLocation})
{
$out_f=$base_name . "." . $self->{articleN} . '.' .
$self->{alvisSuffix};
$self->{articleN}++;
if (!$self->_output_set_of_records($alvis_record,$out_f))
{
$self->_set_err_state($ERR_OUTPUT_SET_OF_RECORDS);
return 0;
}
$self->{outputN}++;
print "$self->{outputN}\r";
}
else
{
if (!defined($self->{outputRootDir}))
{
$self->_set_err_state($ERR_NO_OUTPUT_ROOT_DIR);
return 0;
}
my $dir=$self->{outputRootDir} . '/' .
int($self->{outputN} / $self->{outputNPerSubdir});
if ($self->{outputN} % $self->{outputNPerSubdir}==0)
{
mkdir($dir);
}
$out_f=$dir . '/' . $self->{outputN} . '.' .
$self->{alvisSuffix};
if (!$self->_output_set_of_records($alvis_record,$out_f))
{
$self->_set_err_state($ERR_OUTPUT_SET_OF_RECORDS);
return 0;
}
$self->{outputN}++;
print "$self->{outputN}\r";
}
}
return 1;
}
lib/Alvis/Convert.pm view on Meta::CPAN
my $alvisXML=$self->HTML($text,$meta_txt);
$self->{sourceEncoding}=$srcenc_setting;
if (!defined($alvisXML))
{
$self->_set_err_state($ERR_ALVIS_CONV);
return 0;
}
if (!$self->output_Alvis([$alvisXML],$base_name))
{
$self->_set_err_state($ERR_OUTPUT_ALVIS,
"Base name: \"$base_name\"");
return 0;
}
}
else
{
warn "Ainodump document $header->{id} was not of a convertible " .
"type: $type/$sub_type.\n" if $self->{ainodumpWarnings};
}
lib/Alvis/Convert.pm view on Meta::CPAN
}
#
# Writes a set of records to a file, wrapped in an XML declaration and a
# documentCollection element.
#
# set_of_records_txt: the serialized record(s); printed verbatim between
#                     the opening and closing documentCollection tags
# path:               the output file; opened for writing with the :utf8
#                     layer
#
# Returns 1 on success. If the file cannot be opened, written or closed,
# sets the error state to $ERR_WRITING_OUTPUT (with the path as the
# additional message) and returns 0.
#
sub _output_set_of_records
{
    my $self=shift;
    my $set_of_records_txt=shift;
    my $path=shift;

    # A lexical handle instead of the global bareword OUT, so nothing else
    # using that name can interfere with this write.
    my $out;
    if (!open($out,">:utf8",$path))
    {
        $self->_set_err_state($ERR_WRITING_OUTPUT,"Output file: " .
                              "\"$path\"");
        return 0;
    }

    my $printed=print {$out}
        "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" .
        "<documentCollection xmlns=\"http://alvis.info/enriched/\">\n" .
        $set_of_records_txt .
        "</documentCollection>\n";

    # Always close, even after a failed print. Buffered write errors
    # (e.g. a full disk) may only surface here, so the result matters.
    my $closed=close($out);

    if (!$printed || !$closed)
    {
        $self->_set_err_state($ERR_WRITING_OUTPUT,"Output file: " .
                              "\"$path\"");
        return 0;
    }

    return 1;
lib/Alvis/Wikipedia/XMLDump.pm view on Meta::CPAN
use Alvis::Wikipedia::CatGraph;
use Alvis::Canonical;
########################################################################
#
# Exported constants
#
#######################################################################
# Record output formats
our ($OUTPUT_HTML,
$OUTPUT_ALVIS
)=(0..1);
############################################################################
#
# Error message stuff
#
############################################################################
my ($ERR_OK,
$ERR_VAR,
lib/Alvis/Wikipedia/XMLDump.pm view on Meta::CPAN
$ERR_BUILD_CAT_GRAPH,
$ERR_CATEGORIES,
$ERR_XML_PARSER,
$ERR_CAN_DOC_CONVERSION,
$ERR_ID,
$ERR_TITLE,
$ERR_CAT_PAGE_LINKS_ADD,
$ERR_CAT_GRAPH,
$ERR_LOAD_TEMPLATES,
$ERR_CAT_GRAPH_DUMP,
$ERR_UNK_OUTPUT_FORMAT
)=(0..23);
my %ErrMsgs=($ERR_OK=>"",
$ERR_VAR=>"Unable to instantiate Alvis::Wikipedia::Variables.",
$ERR_PARSER=>
"Unable to instantiate Alvis::Wikipedia::WikitextParser.",
$ERR_FIRST_PASS=>"The first pass over the records failed.",
$ERR_SECOND_PASS=>"The main pass over the records failed.",
$ERR_TEMPL_ADD=>"Adding the definition of a template failed.",
$ERR_EXPAND=>"Variable and template expansion failed.",
$ERR_DUMP=>"Opening the SQL dump file failed.",
lib/Alvis/Wikipedia/XMLDump.pm view on Meta::CPAN
$ERR_XML_PARSER=>"Unable to instantiate Parse::MediaWikiDump",
$ERR_CAN_DOC_CONVERSION=>"Converting the text from HTML to " .
"canonicalDocument format failed",
$ERR_ID=>"Calculating the id failed.",
$ERR_TITLE=>"Malformed title",
$ERR_CAT_PAGE_LINKS_ADD=>"Adding the links of a category page " .
"to the graph failed",
$ERR_CAT_GRAPH=>"Instantiating CatGraph failed",
$ERR_LOAD_TEMPLATES=>"Loading the templates failed.",
$ERR_CAT_GRAPH_DUMP=>"Dumping the category graph failed.",
$ERR_UNK_OUTPUT_FORMAT=>"Unrecognized XML dump record output " .
"format."
);
sub _set_err_state
{
my $self=shift;
my $errcode=shift;
my $errmsg=shift;
lib/Alvis/Wikipedia/XMLDump.pm view on Meta::CPAN
}
return $self;
}
#
# Gives the object's settings their default values and then enters the
# argument-handling branch if the caller passed any further arguments.
#
# Defaults set here: no template expansion, HTML record output, redirects
# not skipped, the English 'Category'/'Template' namespace words, category
# and template data dumping switched on, and the default dump file names.
#
sub _init
{
    my $self=shift;

    $self->{expandTemplates}=0;
    $self->{outputFormat}=$OUTPUT_HTML;
    $self->{skipRedirects}=0;
    $self->{categoryWord}='Category';
    $self->{templateWord}='Template';
    $self->{dumpCategoryData}=1;
    $self->{dumpTemplateData}=1;
    $self->{catGraphDumpF}='CatGraph.storable';
    $self->{templateDumpF}='Templates.storable';

    # defined(@array) is deprecated and a compile-time error as of
    # Perl 5.22. Testing the array itself asks the intended question:
    # were any arguments passed after $self?
    # NOTE(review): assumes the branch below only consumes @_, so skipping
    # it for an empty argument list changes nothing -- confirm.
    if (@_)
    {
lib/Alvis/Wikipedia/XMLDump.pm view on Meta::CPAN
}
}
#
# opts: hash with fields
#
# namespaces ref to a list of namespace identifiers whose
# records to extract
# expandTemplates flag for true template expansion
# templateDumpF template dump file
# outputFormat format for result records ($OUTPUT_HTML,
# $OUTPUT_ALVIS),...
# categoryWord category namespace identifier (changes with
# language)
# templateWord template namespace identifier (changes with
# language)
# rootCategory root category identifier (changes with
# language)
# date the date of the dump
# dumpCatGraph flag for dumping the category graph
# catGraphDumpF category graph dump file
#
lib/Alvis/Wikipedia/XMLDump.pm view on Meta::CPAN
if (!$self->_add_cat_page_links_to_graph($title,$text))
{
$self->_set_err_state($ERR_CAT_PAGE_LINKS_ADD,
"title: \"$title\"");
return 0;
}
}
my @cb;
if ($output_format eq $OUTPUT_HTML)
{
my $html=$self->{parser}->to_HTML($text);
if (!defined($html))
{
$self->_set_err_state($ERR_HTML);
return 0;
}
$html="<HTML>\n<BODY>\n" . $html . "</BODY>\n</HTML>\n";
@cb=@$cb;
&{$cb[0]}(@cb[1..$#cb],$title,$mod_date,$output_format,$html,
$is_redir,$namespace);
}
elsif ($output_format eq $OUTPUT_ALVIS)
{
; # Skip HTML and convert directly to Alvis XML to save time
die("NOT IMPLEMENTED YET!");
my $alvis_XML;
@cb=@$cb;
&{$cb[0]}(@cb[1..$#cb],$title,$mod_date,$output_format,
$alvis_XML,$is_redir,$namespace);
}
else
{
$self->_set_err_state($ERR_UNK_OUTPUT_FORMAT,
"format: \"$output_format\"");
return 0;
}
return 1;
}
sub _add_cat_page_to_graph
{
my $self=shift;