Alvis-Convert
view release on metacpan or search on metacpan
lib/Alvis/Wikipedia/XMLDump.pm view on Meta::CPAN
package Alvis::Wikipedia::XMLDump;
use warnings;
use strict;
$Alvis::Wikipedia::XMLDump::VERSION = '0.1';
use Storable;
use Parse::MediaWikiDump;
use Digest::MD5;
use Encode;
use Data::Dumper;
use Alvis::Wikipedia::WikitextParser;
use Alvis::Wikipedia::Variables;
use Alvis::Wikipedia::CatGraph;
use Alvis::Canonical;
########################################################################
#
# Exported constants
#
#######################################################################
# Record output formats: numeric identifiers accepted as the 'outputFormat'
# option and handed back to the record callback.
our $OUTPUT_HTML=0;
our $OUTPUT_ALVIS=1;
############################################################################
#
# Error message stuff
#
############################################################################
# Internal error codes.  Each variable receives its position in this list
# (0 through 23); the codes are the keys of the %ErrMsgs table.
my (
    $ERR_OK,                 $ERR_VAR,             $ERR_PARSER,
    $ERR_FIRST_PASS,         $ERR_SECOND_PASS,     $ERR_TEMPL_ADD,
    $ERR_EXPAND,             $ERR_DUMP,            $ERR_TABLE_PARSE,
    $ERR_REC_CB,             $ERR_HTML,            $ERR_CAN_DOC_CONV,
    $ERR_ALVIS,              $ERR_BUILD_CAT_GRAPH, $ERR_CATEGORIES,
    $ERR_XML_PARSER,         $ERR_CAN_DOC_CONVERSION,
    $ERR_ID,                 $ERR_TITLE,           $ERR_CAT_PAGE_LINKS_ADD,
    $ERR_CAT_GRAPH,          $ERR_LOAD_TEMPLATES,  $ERR_CAT_GRAPH_DUMP,
    $ERR_UNK_OUTPUT_FORMAT,
) = (0 .. 23);
# Stock message for every error code.  _set_err_state() appends these to the
# object's error string; $ERR_OK maps to the empty string.
my %ErrMsgs=(
    $ERR_OK
        => '',
    $ERR_VAR
        => 'Unable to instantiate Alvis::Wikipedia::Variables.',
    $ERR_PARSER
        => 'Unable to instantiate Alvis::Wikipedia::WikitextParser.',
    $ERR_FIRST_PASS
        => 'The first pass over the records failed.',
    $ERR_SECOND_PASS
        => 'The main pass over the records failed.',
    $ERR_TEMPL_ADD
        => 'Adding the definition of a template failed.',
    $ERR_EXPAND
        => 'Variable and template expansion failed.',
    $ERR_DUMP
        => 'Opening the SQL dump file failed.',
    $ERR_TABLE_PARSE
        => 'Parsing a subtable failed.',
    $ERR_REC_CB
        => 'Record handling callback failed.',
    $ERR_HTML
        => 'Wikitext -> HTML failed.',
    $ERR_CAN_DOC_CONV
        => 'Creating a new instance of Alvis::Canonical failed.',
    $ERR_ALVIS
        => 'Converting to Alvis failed',
    $ERR_BUILD_CAT_GRAPH
        => 'Adding to the category graph failed.',
    $ERR_CATEGORIES
        => 'Determining the categories of an article failed.',
    $ERR_XML_PARSER
        => 'Unable to instantiate Parse::MediaWikiDump',
    $ERR_CAN_DOC_CONVERSION
        => 'Converting the text from HTML to canonicalDocument format failed',
    $ERR_ID
        => 'Calculating the id failed.',
    $ERR_TITLE
        => 'Malformed title',
    $ERR_CAT_PAGE_LINKS_ADD
        => 'Adding the links of a category page to the graph failed',
    $ERR_CAT_GRAPH
        => 'Instantiating CatGraph failed',
    $ERR_LOAD_TEMPLATES
        => 'Loading the templates failed.',
    $ERR_CAT_GRAPH_DUMP
        => 'Dumping the category graph failed.',
    $ERR_UNK_OUTPUT_FORMAT
        => 'Unrecognized XML dump record output format.',
);
# Updates the object's accumulated error string ($self->{errstr}).
#
#   $errcode   one of the $ERR_* codes (required).  $ERR_OK resets the
#              error string to ""; any other code appends " " followed by
#              the stock message from %ErrMsgs.
#   $errmsg    optional detail text, appended after the stock message
#              (again preceded by a single space).
#
# Dies with a stack trace (Carp::confess) if $errcode is undefined or is not
# a key of %ErrMsgs; both indicate a programming error in this module.
sub _set_err_state
{
    my $self=shift;
    my $errcode=shift;
    my $errmsg=shift;

    # This file never imports Carp, so a bare confess() would itself fail
    # with "Undefined subroutine".  Load the module here and call it by its
    # fully qualified name.
    require Carp;

    if (!defined($errcode))
    {
        Carp::confess("set_err_state() called with an undefined argument.");
    }

    if (exists($ErrMsgs{$errcode}))
    {
        if ($errcode==$ERR_OK)
        {
            $self->{errstr}="";
        }
        else
        {
            # Messages accumulate so that nested failures read as a chain.
            $self->{errstr}.=" " . $ErrMsgs{$errcode};
            if (defined($errmsg))
            {
                $self->{errstr}.=" " . $errmsg;
            }
        }
    }
    else
    {
        Carp::confess("Internal error: set_err_state() called with an " .
                      "unrecognized argument ($errcode).");
    }
}
# Resets the object's accumulated error string to the empty string.
sub clearerr
{
    my ($obj)=@_;
    $obj->{errstr}=q{};
}
# Returns the object's accumulated error string ($self->{errstr}).
sub errmsg
{
    my ($obj)=@_;
    return $obj->{errstr};
}
##########################################################################
#
# Public methods
#
##########################################################################
# Constructor.  Accepts an optional key/value list that is forwarded to
# _init() to override the default settings, then creates the helper objects
# stored under 'variables', 'parser', 'canDocConverter' and 'catGraph'.
#
# Returns the new object, or undef as soon as one of the helper constructors
# returns undef (the matching error code is recorded on the discarded
# object first).
sub new
{
    my $invocant=shift;
    my $class=ref($invocant)||$invocant;

    my $self=bless({},$class);
    $self->_set_err_state($ERR_OK);
    $self->_init(@_);

    # [ slot in $self, helper class, error code on failure, ctor arguments ]
    # Helpers are built in exactly this order.
    my @helpers=(
        ['variables','Alvis::Wikipedia::Variables',$ERR_VAR,[]],
        ['parser','Alvis::Wikipedia::WikitextParser',$ERR_PARSER,[]],
        ['canDocConverter','Alvis::Canonical',$ERR_CAN_DOC_CONV,
         [convertCharEnts=>1,
          convertNumEnts=>1,
          sourceEncoding=>'utf8']],
        ['catGraph','Alvis::Wikipedia::CatGraph',$ERR_CAT_GRAPH,[]],
    );

    for my $helper (@helpers)
    {
        my ($slot,$helper_class,$err_code,$ctor_args)=@$helper;

        $self->{$slot}=$helper_class->new(@$ctor_args);
        if (!defined($self->{$slot}))
        {
            $self->_set_err_state($err_code);
            return undef;
        }
    }

    return $self;
}
# Populates the object with default settings and then applies any caller
# supplied overrides.
#
#   @_ (after $self)   optional key/value list; every key given replaces or
#                      adds the field of the same name in $self.
sub _init
{
    my $self=shift;

    # Defaults
    $self->{expandTemplates}=0;
    $self->{outputFormat}=$OUTPUT_HTML;
    $self->{skipRedirects}=0;
    $self->{categoryWord}='Category';
    $self->{templateWord}='Template';
    $self->{dumpCategoryData}=1;
    $self->{dumpTemplateData}=1;
    $self->{catGraphDumpF}='CatGraph.storable';
    $self->{templateDumpF}='Templates.storable';

    # An array in boolean context is true iff it is non-empty.  The former
    # defined(@_) test is a fatal compile-time error on Perl 5.22 and later.
    if (@_)
    {
        my %args=@_;
        @$self{ keys %args }=values(%args);
    }
}
#
# opts: hash with fields
#
# namespaces ref to a list of namespace identifiers whose
# records to extract
# expandTemplates flag for true template expansion
# templateDumpF template dump file
# outputFormat format for result records ($OUTPUT_HTML,
# $OUTPUT_ALVIS),...
# categoryWord category namespace identifier (changes with
# language)
# templateWord template namespace identifier (changes with
# language)
# rootCategory root category identifier (changes with
# language)
# date the date of the dump
# dumpCatGraph flag for dumping the category graph
# catGraphDumpF category graph dump file
#
sub extract_records
{
my $self=shift;
my $fd=shift; # dump fd ref
my $cb=shift; # [\&foo,$arg1,$arg2], callback for each [record title,text]
my $opts=shift;
my $prog_cb=shift; # [\&foo,$arg1,$arg2], optional callback for progress
# ('N records processed')
if (!defined($cb))
{
$self->_set_err_state($ERR_XML_PARSER);
return 0;
}
my $prog_txt="";
my $expand_templates;
if (exists($self->{expandTemplates}))
{
$expand_templates=$self->{expandTemplates};
}
if (exists($opts->{expandTemplates}))
{
$expand_templates=$opts->{expandTemplates};
}
my %namespaces;
if ($expand_templates)
{
if ($opts->{templateDumpF})
{
if (defined($prog_cb))
{
my @prog_cb=@$prog_cb;
&{$prog_cb[0]}(@prog_cb[1..$#prog_cb],"Loading the templates");
}
if (!$self->{variables}->load_templates($opts->{templateDumpF}))
{
$self->_set_err_state($ERR_LOAD_TEMPLATES);
return 0;
}
}
else # Have to do a pass first to collect the templates
{
$self->{XMLParser}=Parse::MediaWikiDump::Pages->new($fd);
if (!defined($self->{XMLParser}))
{
$self->_set_err_state($ERR_XML_PARSER);
lib/Alvis/Wikipedia/XMLDump.pm view on Meta::CPAN
{
$self->_set_err_state($ERR_TEMPL_ADD);
return 0;
}
}
return 1;
}
# Processes one dump record and hands the result to the record callback.
#
# Steps: strip <!-- --> comments from the text, normalize the title, expand
# variables/templates, prefix the title with "<namespace>/" when the
# namespace is non-empty, feed category pages to the category graph (only
# when $self->{dumpCategoryData} is true), and finally convert the text to
# the requested output format.
#
# Arguments (positional, after $self):
#   $cb                [\&handler,@leading_args]; the handler is called as
#                      handler(@leading_args,$title,$mod_date,$output_format,
#                              $converted_text,$is_redir,$namespace)
#   $mod_date          passed through to the callback
#   $category_word     namespace identifier that marks a category page
#   $expand_templates  passed through to the variable/template expander
#   $output_format     $OUTPUT_HTML or $OUTPUT_ALVIS
#   $namespace, $title, $text, $is_redir   the record itself
#
# Returns 1 on success, 0 on failure (with the error state set).  The
# $OUTPUT_ALVIS format is not implemented and dies.
sub _return_alvis_record
{
    my $self=shift;
    my $cb=shift;
    my $mod_date=shift;
    my $category_word=shift;
    my $expand_templates=shift;
    my $output_format=shift;
    my $namespace=shift;
    my $title=shift;
    my $text=shift;
    my $is_redir=shift;

    $text=~s/<!--.*?-->//sg;

    # Keep the title as received: when normalization fails the normalized
    # value is undef, and the error message has to show the offending input.
    my $raw_title=$title;
    $title=$self->{parser}->normalize_title($raw_title);
    if (!defined($title))
    {
        my $shown=defined($raw_title) ? $raw_title : '';
        $self->_set_err_state($ERR_TITLE,"title: \"$shown\"");
        return 0;
    }

    my $expansion=$self->{variables}->expand($namespace,$title,$text,
                                             $expand_templates);
    if (!defined($expansion))
    {
        $self->_set_err_state($ERR_EXPAND);
        return 0;
    }
    $text=$expansion;

    if ($namespace ne '')
    {
        $title="$namespace/$title";
    }

    if ($namespace eq $category_word && $self->{dumpCategoryData})
    {
        if (!$self->_add_cat_page_links_to_graph($title,$text))
        {
            $self->_set_err_state($ERR_CAT_PAGE_LINKS_ADD,
                                  "title: \"$title\"");
            return 0;
        }
    }

    if ($output_format eq $OUTPUT_HTML)
    {
        my $html=$self->{parser}->to_HTML($text);
        if (!defined($html))
        {
            $self->_set_err_state($ERR_HTML);
            return 0;
        }
        $html="<HTML>\n<BODY>\n" . $html . "</BODY>\n</HTML>\n";

        my ($handler,@leading_args)=@$cb;
        $handler->(@leading_args,$title,$mod_date,$output_format,$html,
                   $is_redir,$namespace);
    }
    elsif ($output_format eq $OUTPUT_ALVIS)
    {
        # Intended to skip HTML and convert directly to Alvis XML to save
        # time.  TODO: once implemented, invoke the callback with the Alvis
        # XML in place of the HTML argument.
        die("NOT IMPLEMENTED YET!");
    }
    else
    {
        $self->_set_err_state($ERR_UNK_OUTPUT_FORMAT,
                              "format: \"$output_format\"");
        return 0;
    }

    return 1;
}
# Adds the category links found on one category page to the category graph.
#
# The page text has its <!-- --> comments stripped, the title is normalized
# and variables/templates are expanded before the links are collected.
#
# Arguments (positional, after $self):
#   $namespace   namespace of the page, passed to the expander
#   $title       page title (normalized here)
#   $text        page wikitext
#   $is_redir    accepted for signature compatibility; not used here
#
# Returns 1 on success, 0 on failure (with the error state set).
#
# NOTE(review): unlike _return_alvis_record(), the title handed to
# _add_cat_page_links_to_graph() here carries no "<namespace>/" prefix --
# confirm that this difference is intended.
sub _add_cat_page_to_graph
{
    my $self=shift;
    my $namespace=shift;
    my $title=shift;
    my $text=shift;
    my $is_redir=shift;

    $text=~s/<!--.*?-->//sg;

    # Keep the title as received: when normalization fails the normalized
    # value is undef, and the error message has to show the offending input.
    my $raw_title=$title;
    $title=$self->{parser}->normalize_title($raw_title);
    if (!defined($title))
    {
        my $shown=defined($raw_title) ? $raw_title : '';
        $self->_set_err_state($ERR_TITLE,"title: \"$shown\"");
        return 0;
    }

    my $expansion=$self->{variables}->expand($namespace,$title,$text);
    if (!defined($expansion))
    {
        $self->_set_err_state($ERR_EXPAND);
        return 0;
    }
    $text=$expansion;

    if (!$self->_add_cat_page_links_to_graph($title,$text))
    {
        $self->_set_err_state($ERR_CAT_PAGE_LINKS_ADD,
                              "title: \"$title\"");
        return 0;
    }

    return 1;
}
# Scans the text of a category page for [[<categoryWord>:...]] links and
# reports each one via $self->_add_cat_link($title,$link_target).
#
#   $title   title of the page being scanned (already normalized)
#   $text    page text to scan
#
# The category word ($self->{categoryWord}) is matched case-insensitively
# and is interpolated into the pattern as-is, i.e. as a regular expression.
# Always returns 1.
sub _add_cat_page_links_to_graph
{
    my $self=shift;
    my $title=shift; # already normalized
    my $text=shift;

    my $cat=$title;
    my $category_word=$self->{categoryWord};

    # Build the pattern on every call.  The previous /o flag froze the
    # category word seen by the first call, so an object configured with a
    # different word silently matched nothing.
    my $link_re=qr/\[\[(?i:$category_word):(.*?)\]\]/s;

    while ($text=~/$link_re/g)
    {
        # Copy the capture so the callee gets a plain value rather than an
        # alias to the magical $1.
        my $link_target=$1;
        $self->_add_cat_link($cat,$link_target);
    }

    return 1;
}
sub _add_cat_link
{
( run in 1.926 second using v1.01-cache-2.11-cpan-39bf76dae61 )