Alvis-Convert

 view release on metacpan or  search on metacpan

bin/wikipedia2alvis  view on Meta::CPAN

my @Namespaces=('');   
my $NamespacesTxt=undef;          # Namespaces to include, default 'Articles'

GetOptions('help|?'=>\$PrintHelp, 
	   'man'=>\$PrintManual,
	   'warnings!'=>\$Warnings,
	   'out-dir=s'=>\$ODir,
	   'namespaces=s'=>\$NamespacesTxt,
	   'N-per-out-dir=s'=>\$NPerOutDir,
	   'original!'=>\$IncOrigDoc,
	   'expand-templates-fully!'=>\$ExpandTemplates,
	   'dump-templates!'=>\$DumpTemplates,
	   'template-dump-file=s'=>\$TemplateDumpF,
	   'convert-via-html!'=>\$ConvertViaHTML,
	   'language=s'=>\$Language,
	   'category-word=s'=>\$CategoryWord,
	   'root-category=s'=>\$RootCategory,
	   'template-word=s'=>\$TemplateWord,
	   'date=s'=>\$Date,
	   'dump-category-graph!'=>\$DumpCatGraph,
	   'category-graph-dump-file=s'=>\$CatGraphDumpF

bin/wikipedia2alvis  view on Meta::CPAN

{
    die("Instantiating Alvis::Convert failed.\n");
}

my %Seen;

my $N=0;
$C->init_output();
if (!$C->wikipedia($XMLDumpF,
		   [\&_output_wikipedia_article],
		   {expandTemplates=>$ExpandTemplates,
		    templateDumpF=>$TemplateDumpF,
		    outputFormat=>$OutputFormat,
		    categoryWord=>$CategoryWord,
                    date=>$Date,
		    namespaces=>[@Namespaces],
                    dumpCatGraph=>$DumpCatGraph,
		    catGraphDumpF=>$CatGraphDumpF},
		   [\&_wikipedia_progress]
		   ))
{

bin/wikipedia2alvis  view on Meta::CPAN

=head1 SYNOPSIS
    
    wikipedia2alvis.pl [options] [Wikipedia XML dump file]

  Options:

    --out-dir                      output directory
    --namespaces                   list of namespaces to extract
    --N-per-out-dir                # of records per output directory
    --[no-]original                include original document?
    --[no-]expand-templates-fully  do we try to expand templates fully?
    --[no-]dump-templates          do we dump the templates?
    --template-dump-file           the file to dump the templates to
    --[no-]convert-via-html        do we convert via HTML or directly to Alvis? 
    --date                         the date of the Wikipedia dump
    --[no-]dump-category-graph     do we dump the category graph?
    --category-graph-dump-file     the file to dump the category graph to
    --category-word                category namespace identifier
    --root-category                root category identifier
    --template-word                template namespace identifier
    --language                     the language of the Wikipedia dump

bin/wikipedia2alvis  view on Meta::CPAN


=item B<--N-per-out-dir>

    Sets the # of records per output directory. Default value: 1000.

=item B<--[no-]original>

    Shall the original document be included in the output? Default
    value: no.

=item B<--[no-]expand-templates-fully>

    Do we try to expand templates fully or do we simply insert a list of
    the template parameter values given in the call? Default value: no.

=item B<--[no-]dump-templates>

    Do we dump the templates onto disk in a loadable format? 
    Default value: no.

=item B<--template-dump-file>

    The name of the (possible) template dump file. Default value: 

lib/Alvis/Convert.pm  view on Meta::CPAN


    $self->{encodingWizard}=
	Alvis::Document::Encoding->new(defaultEncoding=>undef);
    if (!defined($self->{encodingWizard}))
    {
	$self->_set_err_state($ERR_ENCODING_WIZARD);
	return undef;
    }

    $self->{wikipediaConverter}=
	Alvis::Wikipedia::XMLDump->new(expandVariables=>1,
				       skipRedirects=>0,
				       dumpCategoryData=>1,
				       dumpTemplateData=>1);
    if (!defined($self->{wikipediaConverter}))
    {
	$self->_set_err_state($ERR_WIKIPEDIA);
	return undef;
    }

    $self->{docTypeWizard}=

lib/Alvis/Convert.pm  view on Meta::CPAN

#               will be called like this:
#          _wikipedia_progress($arg1,$arg2,...,
#                              $prog_txt,$N,$n,$mess)
#
#   where $N is the total number of records processed and $n the number of hits
#
# opts:  a hash of options with these possible fields:
#
#     namespaces              ref to a list of namespace identifiers whose
#                             records to extract
#     expandTemplates         flag for true template expansion
#     templateDumpF           template dump file
#     outputFormat            format for result records 
#                             ($Alvis::Wikipedia::XMLDump::OUTPUT_*)
#     categoryWord            category namespace identifier (changes with
#                             language)
#     templateWord            template namespace identifier (changes with
#                             language)
#     rootCategory            root category identifier (changes with
#                             language)
#     date                    the date of the dump

lib/Alvis/Wikipedia/Templates.pm  view on Meta::CPAN

    $ERR_NO_NAMESPACE,
    $ERR_STORE,
    $ERR_UNDEF_DUMP,
    $ERR_RETRIEVE
    )=(0..10);
# Lookup table: error code => human-readable message.  $ERR_OK maps to
# the empty string.
my %ErrMsgs=(
    $ERR_OK           => "",
    $ERR_PARSER       => "Unable to instantiate Alvis::Wikipedia::WikitextParser.",
    $ERR_NORM         => "Title normalization failed.",
    $ERR_UNK_TEMPL    => "Unrecognized template name.",
    $ERR_PARAM        => "Application of a parameter pattern failed.",
    $ERR_NO_TEXT      => "Undefined text to expand",
    $ERR_NO_TITLE     => "Undefined title to expand",
    $ERR_NO_NAMESPACE => "Undefined namespace to expand",
    $ERR_STORE        => "Storable::store() failed.",
    $ERR_UNDEF_DUMP   => "Trying to dump when there are no definitions.",
    $ERR_RETRIEVE     => "Storable::retrieve() failed.",
);

sub _set_err_state
{
    my $self=shift;
    my $errcode=shift;
    my $errmsg=shift;

lib/Alvis/Wikipedia/Templates.pm  view on Meta::CPAN


    $def=~s/<noinclude>.*?<\/noinclude>//sgo;
    $def=~s/<\/?includeonly>//sgo;
    
    $self->{defs}{$norm_name}=$def;

    return 1;
}

#
#  expand_for_real:  do we try to expand the templates for real
#                    (messy and error-prone) or do we simply replace
#                    with a list of the parameter values?
#
sub expand 
{
    my $self=shift;
    my $namespace=shift;
    my $title=shift;
    my $text=shift;
    my $expand_for_real=shift;

    if (!defined($namespace))
    {
	$self->_set_err_state($ERR_NO_NAMESPACE);
	return undef;
    }
    if (!defined($title))
    {
	$self->_set_err_state($ERR_NO_TITLE);
	return undef;

lib/Alvis/Wikipedia/Templates.pm  view on Meta::CPAN

    {
	$self->_set_err_state($ERR_NO_TEXT);
	return undef;
    }

    $self->{currNamespace}=$namespace;
    $self->{currTitle}=$title;
    $self->{nofExpansions}=0;

    warn "TRANSCLUDING...\n" if $DEBUG;
    my $expanded_text=$self->_transclude($text,$expand_for_real);
    warn "DONE TRANSCLUDING\n" if $DEBUG;

    return $expanded_text;
}

#
#  Repeatedly rewrites the {{...}} constructs found in a piece of wikitext
#  until none is left or a safety limit is hit, and returns the result.
#
#  text:             the wikitext to process
#  expand_for_real:  do we try to expand the templates for real
#                    (messy and error-prone) or do we simply replace
#                    with a list of the parameter values? Passed on
#                    unchanged to _substitute_template().
#
sub _transclude
{
    my $self=shift;
    my $text=shift;
    my $expand_for_real=shift;

    $self->{higherLevelExpandedNames}={};
    # Keep going for as long as the text still contains something of the
    # form {{name}} or {{name|params}}.
    while ($text=~/(([^\{])?\{\{([ %!\"\$\&\'\(\)\*,\-\.\/0-9:;=\?\@A-Z\\\^_\`a-z\~\x80-\xFF\n]*)(\|.*?)?\}\})/sgo)
    {
	$self->{thisLevelExpandedNames}={};
	# Safeguard against malevolent templates: give up and return the
	# text as it stands once it has grown beyond maxExpandedTextSize
	# or more than maxNofExpansions expansions have been counted.
	if (length($text)>$self->{maxExpandedTextSize} || 
	    $self->{nofExpansions}>$self->{maxNofExpansions})
	{
	    warn "Excessive expansion stopped for \"$self->{currNamespace}:$self->{currTitle}\"" .
		". Length of the text to expand: " .
		length($text) . ", # of expansions: " . $self->{nofExpansions};
		return $text;
	}

	warn "BEFORE VARIABLE SUBSITUTION\n" if $DEBUG;
	# Variable substitution: only parameterless {{name}} constructs
	# match here. _substitute_variable() gets the whole match and the
	# name and returns the replacement text.
	$text=~s/(\{\{([ %!\"\$\&\'\(\)\*,\-\.\/0-9:;=\?\@A-Z\\\^_\`a-z\~\x80-\xFF\n]*?)\}\})/$self->_substitute_variable($1,$2)/sgeo;
	warn "TEXT AFTER VARIABLE SUBSTITUTION:$text\n" if $DEBUG;
	
	# Template substitution: _substitute_template() gets the whole
	# match, the single character preceding "{{" (if any), the template
	# name and the "|params" tail (if any). The params part may not
	# contain "{", so only innermost calls are matched on each pass.
	$text=~s/(([^\{])?\{\{([ %!\"\$\&\'\(\)\*,\-\.\/0-9:;=\?\@A-Z\\\^_\`a-z\~\x80-\xFF\n]*)(\|[^\{]*?)?\}\})/$self->_substitute_template($1,$2,$3,$4,$expand_for_real)/sgeo;
	
	warn "TEXT AFTER TEMPLATE SUBSTITUTION:$text\n" if $DEBUG;
    
	# Remember every name recorded during this pass in the
	# higher-level set.
	for my $name (keys %{$self->{thisLevelExpandedNames}})
	{
	    $self->{higherLevelExpandedNames}->{$name}=1;
	}
    }

    return $text;

lib/Alvis/Wikipedia/Templates.pm  view on Meta::CPAN

    }
}

sub _substitute_template
{
    my $self=shift;
    my $orig_text=shift;
    my $pre_context=shift;
    my $name=shift;
    my $params=shift;
    my $expand_for_real=shift;

    my $found=0;

    my $expanded_text;
    my %arg_assignments=(); 

    $name=$self->{parser}->normalize_title($name);

    warn "substitute_template():" if $DEBUG;
    warn "PRE:\"$pre_context\"\n" if $DEBUG;
    warn "NAME:\"$name\"\n" if $DEBUG;
    warn "PARAMS:\"$params\"\n" if $DEBUG;

    # Don't parse {{{}}} because that's only for template arguments
    if (defined($pre_context) && $pre_context eq '{') 
    {
	warn "{ PRE-CONTEXT\n" if $DEBUG;
	return $orig_text;
    }

    # Ok, now expand if it's a template
    
    # Do we know this template or don't we care anyway?
    if (($name && exists($self->{defs}{$name})) || !$expand_for_real) 
    {
	warn "TEMPLATE $name FOUND\n" if $DEBUG;

	$found=1;
	
	if (defined($pre_context))
	{
	    $expanded_text=$pre_context;
	}

	#
        # Not recommended atm .. the bloody syntax seems to keep
        # on changing with each new server alpha version
	#
	if ($expand_for_real)
	{
	    $expanded_text.=$self->{defs}{$name};
	    warn "TEXT AFTER ADDING EXPANSION:$expanded_text\n" if $DEBUG;
	    
	    if (defined($params))
	    {
		# Collect the parameter assignments 
		my @actual_args=$self->_get_template_call_args($params);
		my $index=1;
		for my $arg (@actual_args)
		{
		    my $eq_pos=index($arg,'=');
		    if ($eq_pos<0) 

lib/Alvis/Wikipedia/Templates.pm  view on Meta::CPAN

			my $value=substr($arg,$eq_pos+1);
			$value=~s/^\s+//;
			$value=~s/\s+$//;
			
			warn "Adding actual arg \'$name\', value \'$value\'\n" if $DEBUG;
			$arg_assignments{$name}=$value;
		    }
		}
	    }
	    
	    # Keep track of expanded names
	    $self->{thisLevelExpandedNames}{$name}=1;

	    # Substitute actual parameter values 
	    while ($expanded_text=~/(\{\{\{([ %!\"\$\&\'\(\)\*,\-\.\/0-9:;=\?\@A-Z\\\^_\`a-z\~\x80-\xFF\n]*?)(\|[^\{]*?)?\}\}\})/sgo)
	    {
		$expanded_text=~s/(\{\{\{([ %!\"\$\&\'\(\)\*,\-\.\/0-9:;=\?\@A-Z\\\^_\`a-z\~\x80-\xFF\n]*?)(\|[^\{]*?)?\}\}\})/$self->_substitute_param_value($1,$2,$3,\%arg_assignments)/sgeo;
		warn "TEXT AFTER PARAMETER VALUE SUBSTITUTION:$expanded_text\n" if $DEBUG;
	    }
	    
	    # If the template begins with a table or block-level
	    # element, it should be treated as beginning a new line.
	    if (defined($pre_context) && $pre_context!~/\n/ && $expanded_text=~/^(\{\||:|;|\#|\*)/) 
	    {
		warn "ADDING NEWLINE PRE-CONTEXT\n" if $DEBUG;
		$expanded_text="\n" . $expanded_text;
	    }
	    # remove comments
	    $expanded_text=~s/<!--.*?-->//isgo;   

	}
	else  # play it safe -- shouldn't matter much for search engine
              # purposes
	{
	    if (defined($params))
	    {
		# Collect the parameter assignments 
		my @actual_args=$self->_get_template_call_args($params);
		my $index=1;

lib/Alvis/Wikipedia/Templates.pm  view on Meta::CPAN

			$value=~s/\s+$//;
			
			warn "Adding actual arg \'$name\', value \'$value\'\n" if $DEBUG;
			$arg_assignments{$name}=$value;
		    }
		}
	    }
	    #
	    # Simply insert the parameter values as a list
	    #
	    $expanded_text.="\n";
	    for my $p (keys %arg_assignments)
	    {
		$expanded_text.="*$arg_assignments{$p}\n";
	    }
	    # If the template begins with a table or block-level
	    # element, it should be treated as beginning a new line.
	    if (defined($pre_context) && $pre_context!~/\n/ && $expanded_text=~/^(\{\||:|;|\#|\*)/) 
	    {
		warn "ADDING NEWLINE PRE-CONTEXT\n" if $DEBUG;
		$expanded_text="\n" . $expanded_text;
	    }
	    # remove comments
	    $expanded_text=~s/<!--.*?-->//isgo if defined($expanded_text);   
	    $expanded_text.="\n";
	    $expanded_text.="----\n"; # to cause a logical section break 

	    return $expanded_text;
	}
    }
    
    if (!$found) 
    {
	warn "AT END. NOT FOUND\n" if $DEBUG;
	#
	# Have to safeguard against retrying this: the unknown template
	# call itself is dropped from the text. The character captured in
	# front of the "{{" is not part of the call, though, so it must be
	# handed back. Always return explicitly -- falling off the end of
	# a "return ... if $DEBUG" statement would make the value of $DEBUG
	# the replacement text.
	#
	my $kept_context=defined($pre_context) ? $pre_context : '';
	if ($DEBUG)
	{
	    return $kept_context . "UNKNOWN_TEMPLATE_" . 
		(defined($name) ? $name : '');
	}
	return $kept_context;
    } 
    else 
    {
	# Counted against maxNofExpansions by the caller's safeguard
	$self->{nofExpansions}++;

	warn "AT END. FOUND.\n" if $DEBUG;
	return $expanded_text;
    }
}

#
# Triple brace replacement -- used for template arguments
#
sub _substitute_param_value
{
    my $self=shift;
    my $orig_text=shift;

lib/Alvis/Wikipedia/Variables.pm  view on Meta::CPAN

    
    if (!$self->{templates}->load($f))
    {
	$self->_set_err_state($ERR_TEMPL_LOAD,$self->{templates}->errmsg());
	return 0;
    }

    return 1;
}

sub expand
{
    my $self=shift;
    my $namespace=shift;
    my $title=shift;
    my $text=shift;
    my $expand_templates_for_real=shift; # do we expand the templates fully?

    #
    # Problems: <math>,<nowiki>...safeguard them
    #
    my $sep_text=$self->{parser}->separate_markup($text);
    if (!defined($sep_text))
    {
	$self->_set_err_state($ERR_SEP,"Text:\"$text\"");
	return "";
    }

lib/Alvis/Wikipedia/Variables.pm  view on Meta::CPAN

    my $exp_text="";

    for my $s (@$sep_text)
    {
	my ($type,$t)=@$s;

	if ($type eq $Alvis::Wikipedia::WikitextParser::MARKUP)
	{
#	    warn "MARKUP TO EXPAND:$t\n";

            my $exp_t=$self->{templates}->expand($namespace,$title,$t,
						 $expand_templates_for_real);
            if (!defined($exp_t))
	    {
		$self->_set_err_state($ERR_EXP,"Text:\"$t\"");
		return undef;
	    }
            else
	    {
		$exp_text.=$exp_t;
	    }
	}

lib/Alvis/Wikipedia/XMLDump.pm  view on Meta::CPAN

	return undef;
    }

    return $self;
}

#
# Sets the default configuration of the object. Any key/value pairs
# passed in after $self are then copied onto the object, overriding
# the defaults set here.
#
sub _init
{
    my $self=shift;

    $self->{expandTemplates}=0;
    $self->{outputFormat}=$OUTPUT_HTML;
    $self->{skipRedirects}=0;
    $self->{categoryWord}='Category';
    $self->{templateWord}='Template';
    $self->{dumpCategoryData}=1;
    $self->{dumpTemplateData}=1;
    $self->{catGraphDumpF}='CatGraph.storable';
    $self->{templateDumpF}='Templates.storable';

    # An array in boolean context is true iff it is non-empty.
    # (defined(@array) is a fatal compile-time error since Perl 5.22.)
    if (@_)

lib/Alvis/Wikipedia/XMLDump.pm  view on Meta::CPAN

        my %args=@_;
        @$self{ keys %args }=values(%args);
    }
}

#
# opts: hash with fields
#
#     namespaces              ref to a list of namespace identifiers whose
#                             records to extract
#     expandTemplates         flag for true template expansion
#     templateDumpF           template dump file
#     outputFormat            format for result records ($OUTPUT_HTML,
#                             $OUTPUT_ALVIS),...
#     categoryWord            category namespace identifier (changes with
#                             language)
#     templateWord            template namespace identifier (changes with
#                             language)
#     rootCategory            root category identifier (changes with
#                             language)
#     date                    the date of the dump

lib/Alvis/Wikipedia/XMLDump.pm  view on Meta::CPAN

                        # ('N records processed')

    if (!defined($cb))
    {
	$self->_set_err_state($ERR_XML_PARSER);
	return 0;
    }

    my $prog_txt="";

    my $expand_templates;
    if (exists($self->{expandTemplates}))
    {
	$expand_templates=$self->{expandTemplates};
    }
    if (exists($opts->{expandTemplates}))
    {
	$expand_templates=$opts->{expandTemplates};
    }

    my %namespaces;

    if ($expand_templates)
    {
	if ($opts->{templateDumpF})
	{
	    if (defined($prog_cb))
	    {
		my @prog_cb=@$prog_cb;
		&{$prog_cb[0]}(@prog_cb[1..$#prog_cb],"Loading the templates");
	    }
	    if (!$self->{variables}->load_templates($opts->{templateDumpF}))
	    {

lib/Alvis/Wikipedia/XMLDump.pm  view on Meta::CPAN

    {
	# $opts->{namespaces} is a ref to a list of namespace identifiers:
	# dereference it so that each identifier becomes a key. Looping
	# over the reference itself would store a single, useless
	# "ARRAY(0x...)" key instead.
	for my $ns (@{ $opts->{namespaces} || [] })
	{
	    $namespaces{$ns}=1;
	}
    }
    my $p_cb=[@$prog_cb,$prog_txt];
    if (!$self->_pass_over_records(\%namespaces,
				   [\&_return_alvis_record,
				    $self,$cb,$date,$category_word,
				    $expand_templates,$output_format],
				   $p_cb))
    {
	$self->_set_err_state($ERR_SECOND_PASS);
	return 0;
    }

    my $dump_cat_graph;
    if (exists($self->{dumpCatGraph}))
    {
	$dump_cat_graph=$self->{dumpCatGraph};

lib/Alvis/Wikipedia/XMLDump.pm  view on Meta::CPAN


    return 1;
}

#
# Per-record callback handed to _pass_over_records(). The first six
# arguments are the ones bound when the callback is registered; the
# remaining four describe the record at hand.
#
#   cb                the caller's output callback
#   mod_date          the date of the dump
#   category_word     category namespace identifier
#   expand_templates  flag for true template expansion
#   output_format     format for result records
#   namespace         namespace of the record ('' is handled specially
#                     further down)
#   title             title of the record
#   text              wikitext of the record
#   is_redir          presumably a "record is a redirect" flag -- TODO
#                     confirm against _pass_over_records()
#
# Returns 0 (with the error state set) if title normalization or
# expansion fails.
#
sub _return_alvis_record
{
    my $self=shift;
    my $cb=shift;
    my $mod_date=shift;
    my $category_word=shift;
    my $expand_templates=shift;
    my $output_format=shift;
    my $namespace=shift;
    my $title=shift;
    my $text=shift;
    my $is_redir=shift;

    my $orig_text=$text;
    my $expansion;
    
    # Strip HTML comments before any further processing
    $text=~s/<!--.*?-->//sgo;
    
    # Keep the title as given: after a failed normalization $title is
    # undef and useless for the error message.
    my $given_title=$title;
    $title=$self->{parser}->normalize_title($title);
    if (!defined($title))
    {
	$self->_set_err_state($ERR_TITLE,
			      "title: \"" . 
			      (defined($given_title) ? $given_title : '') .
			      "\"");
	return 0;
    }
    
    $expansion=$self->{variables}->expand($namespace,$title,$text,
					  $expand_templates);
    if (!defined($expansion))
    {
	$self->_set_err_state($ERR_EXPAND);
	return 0;
    }
    $text=$expansion;
    
    if ($namespace ne '')
    { 
	$title="$namespace/$title";

lib/Alvis/Wikipedia/XMLDump.pm  view on Meta::CPAN

    
    $text=~s/<!--.*?-->//sgo;
    
    # Keep the title as given: after a failed normalization $title is
    # undef and useless for the error message.
    my $given_title=$title;
    $title=$self->{parser}->normalize_title($title);
    if (!defined($title))
    {
	$self->_set_err_state($ERR_TITLE,
			      "title: \"" . 
			      (defined($given_title) ? $given_title : '') .
			      "\"");
	return 0;
    }
    
    $expansion=$self->{variables}->expand($namespace,$title,$text);
    if (!defined($expansion))
    {
	$self->_set_err_state($ERR_EXPAND);
	return 0;
    }
    $text=$expansion;

    if (!$self->_add_cat_page_links_to_graph($title,$text))
    {
	$self->_set_err_state($ERR_CAT_PAGE_LINKS_ADD,

t/test-data/to-split/29.xml  view on Meta::CPAN

<documentRecord id="FF2C88E89A1DDFE4F8CD4845EEC285E3" xmlns="http://alvis.info/enriched/">
    <acquisition>
      <acquisitionData>
        <modifiedDate>1142938329956</modifiedDate>
        <httpServer>Apache</httpServer>
        <urls>
          <url>http://searchenginewatch.com/searchday/article.php/3592876</url>
        </urls>
      </acquisitionData>
      <canonicalDocument>        
        <section>At long last, Google has launched its ownGoogle Finance service. For years, those seeking specialty financial information via Google have been sent to competitors such as Yahoo and MSN. Now Google's providing financial information di...
      <metaData>
        <meta name="title">Google Launches Google Finance</meta>
        <meta name="dc:type">text/html</meta>
      </metaData>
      <links>
        <outlinks>
          <link type="a">
            <anchorText>
wrote</anchorText>
            <location>http://searchenginewatch.com/_subscribers/articles/article.php/3353401</location>



( run in 2.121 seconds using v1.01-cache-2.11-cpan-5b529ec07f3 )