view release on metacpan or search on metacpan
bin/wikipedia2alvis view on Meta::CPAN
my @Namespaces=('');
my $NamespacesTxt=undef; # Namespaces to include, default 'Articles'
GetOptions('help|?'=>\$PrintHelp,
'man'=>\$PrintManual,
'warnings!'=>\$Warnings,
'out-dir=s'=>\$ODir,
'namespaces=s'=>\$NamespacesTxt,
'N-per-out-dir=s'=>\$NPerOutDir,
'original!'=>\$IncOrigDoc,
'expand-templates-fully!'=>\$ExpandTemplates,
'dump-templates!'=>\$DumpTemplates,
'template-dump-file=s'=>\$TemplateDumpF,
'convert-via-html!'=>\$ConvertViaHTML,
'language=s'=>\$Language,
'category-word=s'=>\$CategoryWord,
'root-category=s'=>\$RootCategory,
'template-word=s'=>\$TemplateWord,
'date=s'=>\$Date,
'dump-category-graph!'=>\$DumpCatGraph,
'category-graph-dump-file=s'=>\$CatGraphDumpF
bin/wikipedia2alvis view on Meta::CPAN
{
die("Instantiating Alvis::Convert failed.\n");
}
my %Seen;
my $N=0;
$C->init_output();
if (!$C->wikipedia($XMLDumpF,
[\&_output_wikipedia_article],
{expandTemplates=>$ExpandTemplates,
templateDumpF=>$TemplateDumpF,
outputFormat=>$OutputFormat,
categoryWord=>$CategoryWord,
date=>$Date,
namespaces=>[@Namespaces],
dumpCatGraph=>$DumpCatGraph,
catGraphDumpF=>$CatGraphDumpF},
[\&_wikipedia_progress]
))
{
bin/wikipedia2alvis view on Meta::CPAN
=head1 SYNOPSIS
wikipedia2alvis.pl [options] [Wikipedia XML dump file]
Options:
--out-dir output directory
--namespaces list of namespaces to extract
--N-per-out-dir # of records per output directory
--[no-]original include original document?
--[no-]expand-templates-fully do we try to expand templates fully?
--[no-]dump-templates do we dump the templates?
--template-dump-file the file to dump the templates to
--[no-]convert-via-html do we convert via HTML or directly to Alvis?
--date the date of the Wikipedia dump
--[no-]dump-category-graph do we dump the category graph?
--category-graph-dump-file the file to dump the category graph to
--category-word category namespace identifier
--root-category root category identifier
--template-word template namespace identifier
--language the language of the Wikipedia dump
bin/wikipedia2alvis view on Meta::CPAN
=item B<--N-per-out-dir>
Sets the # of records per output directory. Default value: 1000.
=item B<--[no-]original>
Shall the original document be included in the output? Default
value: no.
=item B<--[no-]expand-templates-fully>
Do we try to expand templates fully or do we simply insert a list of
the template parameter values given in the call? Default value: no.
=item B<--[no-]dump-templates>
Do we dump the templates onto disk in a loadable format?
Default value: no.
=item B<--template-dump-file>
The name of the (possible) template dump file. Default value:
lib/Alvis/Convert.pm view on Meta::CPAN
$self->{encodingWizard}=
Alvis::Document::Encoding->new(defaultEncoding=>undef);
if (!defined($self->{encodingWizard}))
{
$self->_set_err_state($ERR_ENCODING_WIZARD);
return undef;
}
$self->{wikipediaConverter}=
Alvis::Wikipedia::XMLDump->new(expandVariables=>1,
skipRedirects=>0,
dumpCategoryData=>1,
dumpTemplateData=>1);
if (!defined($self->{wikipediaConverter}))
{
$self->_set_err_state($ERR_WIKIPEDIA);
return undef;
}
$self->{docTypeWizard}=
lib/Alvis/Convert.pm view on Meta::CPAN
# will be called like this:
# _wikipedia_progress($arg1,$arg2,...,
# $prog_txt,$N,$n,$mess)
#
# where $N is the total number of records processed and $n the number of hits
#
# opts: a hash of options with these possible fields:
#
# namespaces ref to a list of namespace identifiers whose
# records to extract
# expandTemplates flag for true template expansion
# templateDumpF template dump file
# outputFormat format for result records
# ($Alvis::Wikipedia::XMLDump::OUTPUT_*)
# categoryWord category namespace identifier (changes with
# language)
# templateWord template namespace identifier (changes with
# language)
# rootCategory root category identifier (changes with
# language)
# date the date of the dump
lib/Alvis/Wikipedia/Templates.pm view on Meta::CPAN
$ERR_NO_NAMESPACE,
$ERR_STORE,
$ERR_UNDEF_DUMP,
$ERR_RETRIEVE
)=(0..10);
my %ErrMsgs=($ERR_OK=>"",
$ERR_PARSER=>"Unable to instantiate Alvis::Wikipedia::WikitextParser.",
$ERR_NORM=>"Title normalization failed.",
$ERR_UNK_TEMPL=>"Unrecognized template name.",
$ERR_PARAM=>"Application of a parameter pattern failed.",
$ERR_NO_TEXT=>"Undefined text to expand",
$ERR_NO_TITLE=>"Undefined title to expand",
$ERR_NO_NAMESPACE=>"Undefined namespace to expand",
$ERR_STORE=>"Storable::store() failed.",
$ERR_UNDEF_DUMP=>"Trying to dump when there are no definitions.",
$ERR_RETRIEVE=>"Storable::retrieve() failed."
);
sub _set_err_state
{
my $self=shift;
my $errcode=shift;
my $errmsg=shift;
lib/Alvis/Wikipedia/Templates.pm view on Meta::CPAN
$def=~s/<noinclude>.*?<\/noinclude>//sgo;
$def=~s/<\/?includeonly>//sgo;
$self->{defs}{$norm_name}=$def;
return 1;
}
#
# expand_for_real: do we try to expand the templates for real
# (messy and error-prone) or do we simply replace
# with a list of the parameter values?
#
sub expand
{
my $self=shift;
my $namespace=shift;
my $title=shift;
my $text=shift;
my $expand_for_real=shift;
if (!defined($namespace))
{
$self->_set_err_state($ERR_NO_NAMESPACE);
return undef;
}
if (!defined($title))
{
$self->_set_err_state($ERR_NO_TITLE);
return undef;
lib/Alvis/Wikipedia/Templates.pm view on Meta::CPAN
{
$self->_set_err_state($ERR_NO_TEXT);
return undef;
}
$self->{currNamespace}=$namespace;
$self->{currTitle}=$title;
$self->{nofExpansions}=0;
warn "TRANSCLUDING...\n" if $DEBUG;
my $expanded_text=$self->_transclude($text,$expand_for_real);
warn "DONE TRANSCLUDING\n" if $DEBUG;
return $expanded_text;
}
#
# expand_for_real: do we try to expand the templates for real
# (messy and error-prone) or do we simply replace
# with a list of the parameter values?
#
#
# _transclude: repeatedly rewrites $text for as long as it still contains
#              a {{name}} or {{name|params}} call.
#
# text:            the wikitext to process
# expand_for_real: passed through unchanged to _substitute_template()
#
# Returns the rewritten text. If the size or expansion-count limit is
# exceeded, a warning is issued and the text as it stands at that point
# is returned.
#
sub _transclude
{
my $self=shift;
my $text=shift;
my $expand_for_real=shift;
# Reset once per call; accumulates the names recorded in
# thisLevelExpandedNames over all passes of the loop below.
$self->{higherLevelExpandedNames}={};
# Loop condition: any {{...}} call, optionally preceded by one non-'{'
# character. Here the parameter part (\|.*?) may contain anything,
# whereas the template substitution below only accepts parameter parts
# without '{'.
# NOTE(review): this is a /g match on a string that the body rewrites;
# termination appears to rely on the substitutions removing every match
# or on the limits below -- confirm.
while ($text=~/(([^\{])?\{\{([ %!\"\$\&\'\(\)\*,\-\.\/0-9:;=\?\@A-Z\\\^_\`a-z\~\x80-\xFF\n]*)(\|.*?)?\}\})/sgo)
{
# Names recorded during this pass only (filled in by
# _substitute_template()).
$self->{thisLevelExpandedNames}={};
# Safeguard against malevolent templates
if (length($text)>$self->{maxExpandedTextSize} ||
$self->{nofExpansions}>$self->{maxNofExpansions})
{
warn "Excessive expansion stopped for \"$self->{currNamespace}:$self->{currTitle}\"" .
". Length of the text to expand: " .
length($text) . ", # of expansions: " . $self->{nofExpansions};
return $text;
}
warn "BEFORE VARIABLE SUBSITUTION\n" if $DEBUG;
# Variable substitution
# Only parameterless {{name}} calls match here; each is replaced by the
# return value of _substitute_variable(whole match, name).
$text=~s/(\{\{([ %!\"\$\&\'\(\)\*,\-\.\/0-9:;=\?\@A-Z\\\^_\`a-z\~\x80-\xFF\n]*?)\}\})/$self->_substitute_variable($1,$2)/sgeo;
warn "TEXT AFTER VARIABLE SUBSTITUTION:$text\n" if $DEBUG;
# Template substitution
# Arguments: whole match, the optional preceding character, the name
# and the parameter part. The parameter part may not contain '{', so a
# call whose arguments still hold braces is not rewritten in this pass.
$text=~s/(([^\{])?\{\{([ %!\"\$\&\'\(\)\*,\-\.\/0-9:;=\?\@A-Z\\\^_\`a-z\~\x80-\xFF\n]*)(\|[^\{]*?)?\}\})/$self->_substitute_template($1,$2,$3,$4,$expand_for_real)/sgeo;
warn "TEXT AFTER TEMPLATE SUBSTITUTION:$text\n" if $DEBUG;
# Fold the names recorded in this pass into the per-call set.
for my $name (keys %{$self->{thisLevelExpandedNames}})
{
$self->{higherLevelExpandedNames}->{$name}=1;
}
}
return $text;
lib/Alvis/Wikipedia/Templates.pm view on Meta::CPAN
}
}
sub _substitute_template
{
my $self=shift;
my $orig_text=shift;
my $pre_context=shift;
my $name=shift;
my $params=shift;
my $expand_for_real=shift;
my $found=0;
my $expanded_text;
my %arg_assignments=();
$name=$self->{parser}->normalize_title($name);
warn "substitute_template():" if $DEBUG;
warn "PRE:\"$pre_context\"\n" if $DEBUG;
warn "NAME:\"$name\"\n" if $DEBUG;
warn "PARAMS:\"$params\"\n" if $DEBUG;
# Don't parse {{{}}} because that's only for template arguments
if (defined($pre_context) && $pre_context eq '{')
{
warn "{ PRE-CONTEXT\n" if $DEBUG;
return $orig_text;
}
# Ok, now expand if it's a template
# Do we know this template or don't we care anyway?
if (($name && exists($self->{defs}{$name})) || !$expand_for_real)
{
warn "TEMPLATE $name FOUND\n" if $DEBUG;
$found=1;
if (defined($pre_context))
{
$expanded_text=$pre_context;
}
#
# Not recommended atm .. the bloody syntax seems to keep
# on changing with each new server alpha version
#
if ($expand_for_real)
{
$expanded_text.=$self->{defs}{$name};
warn "TEXT AFTER ADDING EXPANSION:$expanded_text\n" if $DEBUG;
if (defined($params))
{
# Collect the parameter assignments
my @actual_args=$self->_get_template_call_args($params);
my $index=1;
for my $arg (@actual_args)
{
my $eq_pos=index($arg,'=');
if ($eq_pos<0)
lib/Alvis/Wikipedia/Templates.pm view on Meta::CPAN
my $value=substr($arg,$eq_pos+1);
$value=~s/^\s+//;
$value=~s/\s+$//;
warn "Adding actual arg \'$name\', value \'$value\'\n" if $DEBUG;
$arg_assignments{$name}=$value;
}
}
}
# Keep track of expanded names
$self->{thisLevelExpandedNames}{$name}=1;
# Substitute actual parameter values
while ($expanded_text=~/(\{\{\{([ %!\"\$\&\'\(\)\*,\-\.\/0-9:;=\?\@A-Z\\\^_\`a-z\~\x80-\xFF\n]*?)(\|[^\{]*?)?\}\}\})/sgo)
{
$expanded_text=~s/(\{\{\{([ %!\"\$\&\'\(\)\*,\-\.\/0-9:;=\?\@A-Z\\\^_\`a-z\~\x80-\xFF\n]*?)(\|[^\{]*?)?\}\}\})/$self->_substitute_param_value($1,$2,$3,\%arg_assignments)/sgeo;
warn "TEXT AFTER PARAMETER VALUE SUBSTITUTION:$expanded_text\n" if $DEBUG;
}
# If the template begins with a table or block-level
# element, it should be treated as beginning a new line.
if (defined($pre_context) && $pre_context!~/\n/ && $expanded_text=~/^(\{\||:|;|\#|\*)/)
{
warn "ADDING NEWLINE PRE-CONTEXT\n" if $DEBUG;
$expanded_text="\n" . $expanded_text;
}
# remove comments
$expanded_text=~s/<!--.*?-->//isgo;
}
else # play it safe -- shouldn't matter much for search engine
# purposes
{
if (defined($params))
{
# Collect the parameter assignments
my @actual_args=$self->_get_template_call_args($params);
my $index=1;
lib/Alvis/Wikipedia/Templates.pm view on Meta::CPAN
$value=~s/\s+$//;
warn "Adding actual arg \'$name\', value \'$value\'\n" if $DEBUG;
$arg_assignments{$name}=$value;
}
}
}
#
# Simply insert the parameter values as a list
#
$expanded_text.="\n";
for my $p (keys %arg_assignments)
{
$expanded_text.="*$arg_assignments{$p}\n";
}
# If the template begins with a table or block-level
# element, it should be treated as beginning a new line.
if (defined($pre_context) && $pre_context!~/\n/ && $expanded_text=~/^(\{\||:|;|\#|\*)/)
{
warn "ADDING NEWLINE PRE-CONTEXT\n" if $DEBUG;
$expanded_text="\n" . $expanded_text;
}
# remove comments
$expanded_text=~s/<!--.*?-->//isgo if defined($expanded_text);
$expanded_text.="\n";
$expanded_text.="----\n"; # to cause a logical section break
return $expanded_text;
}
}
if (!$found)
{
warn "AT END. NOT FOUND\n" if $DEBUG;
#
# Have to safeguard against retrying this
#
return $pre_context . "UNKNOWN_TEMPLATE_$name" if $DEBUG;
}
else
{
$self->{nofExpansions}++;
warn "AT END. FOUND.\n" if $DEBUG;
return $expanded_text;
}
}
#
# Triple brace replacement -- used for template arguments
#
sub _substitute_param_value
{
my $self=shift;
my $orig_text=shift;
lib/Alvis/Wikipedia/Variables.pm view on Meta::CPAN
if (!$self->{templates}->load($f))
{
$self->_set_err_state($ERR_TEMPL_LOAD,$self->{templates}->errmsg());
return 0;
}
return 1;
}
sub expand
{
my $self=shift;
my $namespace=shift;
my $title=shift;
my $text=shift;
my $expand_templates_for_real=shift; # do we expand the templates fully?
#
# Problems: <math>,<nowiki>...safeguard them
#
my $sep_text=$self->{parser}->separate_markup($text);
if (!defined($sep_text))
{
$self->_set_err_state($ERR_SEP,"Text:\"$text\"");
return "";
}
lib/Alvis/Wikipedia/Variables.pm view on Meta::CPAN
my $exp_text="";
for my $s (@$sep_text)
{
my ($type,$t)=@$s;
if ($type eq $Alvis::Wikipedia::WikitextParser::MARKUP)
{
# warn "MARKUP TO EXPAND:$t\n";
my $exp_t=$self->{templates}->expand($namespace,$title,$t,
$expand_templates_for_real);
if (!defined($exp_t))
{
$self->_set_err_state($ERR_EXP,"Text:\"$t\"");
return undef;
}
else
{
$exp_text.=$exp_t;
}
}
lib/Alvis/Wikipedia/XMLDump.pm view on Meta::CPAN
return undef;
}
return $self;
}
#
# _init: sets the default value of every option on the object. The
#        arguments left in @_ after $self are then examined (see the
#        condition at the end) so that they can be applied on top of
#        these defaults.
#
sub _init
{
my $self=shift;
# Defaults
$self->{expandTemplates}=0;
$self->{outputFormat}=$OUTPUT_HTML;
$self->{skipRedirects}=0;
$self->{categoryWord}='Category';
$self->{templateWord}='Template';
$self->{dumpCategoryData}=1;
$self->{dumpTemplateData}=1;
$self->{catGraphDumpF}='CatGraph.storable';
$self->{templateDumpF}='Templates.storable';
# Test the argument list for emptiness directly: defined(@array) was
# deprecated and is a fatal compile-time error from Perl 5.22 on.
if (@_)
lib/Alvis/Wikipedia/XMLDump.pm view on Meta::CPAN
my %args=@_;
@$self{ keys %args }=values(%args);
}
}
#
# opts: hash with fields
#
# namespaces ref to a list of namespace identifiers whose
# records to extract
# expandTemplates flag for true template expansion
# templateDumpF template dump file
# outputFormat format for result records ($OUTPUT_HTML,
# $OUTPUT_ALVIS),...
# categoryWord category namespace identifier (changes with
# language)
# templateWord template namespace identifier (changes with
# language)
# rootCategory root category identifier (changes with
# language)
# date the date of the dump
lib/Alvis/Wikipedia/XMLDump.pm view on Meta::CPAN
# ('N records processed')
if (!defined($cb))
{
$self->_set_err_state($ERR_XML_PARSER);
return 0;
}
my $prog_txt="";
my $expand_templates;
if (exists($self->{expandTemplates}))
{
$expand_templates=$self->{expandTemplates};
}
if (exists($opts->{expandTemplates}))
{
$expand_templates=$opts->{expandTemplates};
}
my %namespaces;
if ($expand_templates)
{
if ($opts->{templateDumpF})
{
if (defined($prog_cb))
{
my @prog_cb=@$prog_cb;
&{$prog_cb[0]}(@prog_cb[1..$#prog_cb],"Loading the templates");
}
if (!$self->{variables}->load_templates($opts->{templateDumpF}))
{
lib/Alvis/Wikipedia/XMLDump.pm view on Meta::CPAN
{
# $opts->{namespaces} is a reference to a list of namespace identifiers,
# so it has to be dereferenced. Looping over the reference itself would
# run once and record a single stringified "ARRAY(0x...)" key instead
# of the requested namespaces.
for my $ns (@{$opts->{namespaces}})
{
$namespaces{$ns}=1;
}
}
my $p_cb=[@$prog_cb,$prog_txt];
if (!$self->_pass_over_records(\%namespaces,
[\&_return_alvis_record,
$self,$cb,$date,$category_word,
$expand_templates,$output_format],
$p_cb))
{
$self->_set_err_state($ERR_SECOND_PASS);
return 0;
}
my $dump_cat_graph;
if (exists($self->{dumpCatGraph}))
{
$dump_cat_graph=$self->{dumpCatGraph};
lib/Alvis/Wikipedia/XMLDump.pm view on Meta::CPAN
return 1;
}
sub _return_alvis_record
{
my $self=shift;
my $cb=shift;
my $mod_date=shift;
my $category_word=shift;
my $expand_templates=shift;
my $output_format=shift;
my $namespace=shift;
my $title=shift;
my $text=shift;
my $is_redir=shift;
my $orig_text=$text;
my $expansion;
$text=~s/<!--.*?-->//sgo;
$title=$self->{parser}->normalize_title($title);
if (!defined($title))
{
$self->_set_err_state($ERR_TITLE,"title: \"$title\"");
return 0;
}
$expansion=$self->{variables}->expand($namespace,$title,$text,
$expand_templates);
if (!defined($expansion))
{
$self->_set_err_state($ERR_EXPAND);
return 0;
}
$text=$expansion;
if ($namespace ne '')
{
$title="$namespace/$title";
lib/Alvis/Wikipedia/XMLDump.pm view on Meta::CPAN
$text=~s/<!--.*?-->//sgo;
$title=$self->{parser}->normalize_title($title);
if (!defined($title))
{
$self->_set_err_state($ERR_TITLE,"title: \"$title\"");
return 0;
}
$expansion=$self->{variables}->expand($namespace,$title,$text);
if (!defined($expansion))
{
$self->_set_err_state($ERR_EXPAND);
return 0;
}
$text=$expansion;
if (!$self->_add_cat_page_links_to_graph($title,$text))
{
$self->_set_err_state($ERR_CAT_PAGE_LINKS_ADD,
t/test-data/to-split/29.xml view on Meta::CPAN
<documentRecord id="FF2C88E89A1DDFE4F8CD4845EEC285E3" xmlns="http://alvis.info/enriched/">
<acquisition>
<acquisitionData>
<modifiedDate>1142938329956</modifiedDate>
<httpServer>Apache</httpServer>
<urls>
<url>http://searchenginewatch.com/searchday/article.php/3592876</url>
</urls>
</acquisitionData>
<canonicalDocument>
<section>At long last, Google has launched its ownGoogle Finance service. For years, those seeking specialty financial information via Google have been sent to competitors such as Yahoo and MSN. Now Google's providing financial information di...
<metaData>
<meta name="title">Google Launches Google Finance</meta>
<meta name="dc:type">text/html</meta>
</metaData>
<links>
<outlinks>
<link type="a">
<anchorText>
wrote</anchorText>
<location>http://searchenginewatch.com/_subscribers/articles/article.php/3353401</location>