Alvis-Convert

 view release on metacpan or  search on metacpan

lib/Alvis/Wikipedia/XMLDump.pm  view on Meta::CPAN

    
    return $self->{errstr};
}

##########################################################################
#
# Public methods
#
##########################################################################

#
# Constructor. Can be invoked on the class or on an existing instance
# (the new object is then blessed into that instance's class).
#
# Arguments: name=>value pairs, handed unchanged to _init().
#
# The helper objects are created only after _init() has run, so the keys
# 'variables', 'parser', 'canDocConverter' and 'catGraph' are always
# overwritten here, whatever the caller supplied for them.
#
# Returns the new object, or undef if one of the helper objects could not
# be created.
#
# NOTE(review): on failure the error state is set on the object that is
# then thrown away; unless _set_err_state() also records it somewhere else
# the caller cannot retrieve it -- confirm.
#
sub new
{
    my $proto=shift;

    my $class=ref($proto)||$proto;
    my $self={};
    bless($self,$class);

    $self->_set_err_state($ERR_OK);

    $self->_init(@_);

    $self->{variables}=Alvis::Wikipedia::Variables->new();
    if (!defined($self->{variables}))
    {
        $self->_set_err_state($ERR_VAR);
        return undef;
    }

    $self->{parser}=Alvis::Wikipedia::WikitextParser->new();
    if (!defined($self->{parser}))
    {
        $self->_set_err_state($ERR_PARSER);
        return undef;
    }

    $self->{canDocConverter}=Alvis::Canonical->new(convertCharEnts=>1,
                                                   convertNumEnts=>1,
                                                   sourceEncoding=>'utf8');
    if (!defined($self->{canDocConverter}))
    {
        $self->_set_err_state($ERR_CAN_DOC_CONV);
        return undef;
    }

    $self->{catGraph}=Alvis::Wikipedia::CatGraph->new();
    if (!defined($self->{catGraph}))
    {
        $self->_set_err_state($ERR_CAT_GRAPH);
        return undef;
    }

    return $self;
}

#
# Sets the default option values on the object and then overlays the
# name=>value pairs given by the caller, so a caller-supplied value always
# wins over the default of the same name.
#
sub _init
{
    my $self=shift;

    $self->{expandTemplates}=0;
    $self->{outputFormat}=$OUTPUT_HTML;
    $self->{skipRedirects}=0;
    $self->{categoryWord}='Category';
    $self->{templateWord}='Template';
    $self->{dumpCategoryData}=1;
    $self->{dumpTemplateData}=1;
    $self->{catGraphDumpF}='CatGraph.storable';
    $self->{templateDumpF}='Templates.storable';

    # A plain emptiness test: defined(@array) is a fatal compile-time
    # error from Perl 5.22 on.
    if (@_)
    {
        my %args=@_;
        @$self{ keys %args }=values(%args);
    }
}

#
# Runs over a MediaWiki XML dump and hands every selected record to a
# callback. If template expansion is on and no template dump file is given
# in opts, a first pass over the dump collects the templates.
#
# Arguments:
#
#     fd        the dump source; passed as-is to
#               Parse::MediaWikiDump::Pages->new()
#     cb        [\&foo,$arg1,$arg2,...], callback for each record. For HTML
#               output it is called with its own arguments followed by
#               title, date, output format, HTML, redirect flag, namespace
#     opts      ref to a hash of options for this call (see below)
#     prog_cb   [\&foo,$arg1,$arg2,...], optional callback for progress
#               messages; called with its own arguments followed by the
#               message
#
# opts: ref to a hash with fields. Unless noted otherwise a field given
# here overrides the object's setting of the same name.
#
#     namespaces              ref to a list of namespace identifiers whose
#                             records to extract in addition to articles
#                             and category pages (opts only)
#     expandTemplates         flag for true template expansion
#     templateDumpF           template dump file to LOAD the templates
#                             from (opts only). When absent, the templates
#                             are collected from the dump and, if the
#                             object's dumpTemplateData is set, written to
#                             the object's templateDumpF
#     outputFormat            format for result records ($OUTPUT_HTML,
#                             $OUTPUT_ALVIS),...
#     categoryWord            category namespace identifier (changes with
#                             language)
#     templateWord            template namespace identifier (changes with
#                             language)
#     rootCategory            root category identifier (changes with
#                             language); not read by this method
#     date                    the date of the dump (opts only); defaults to
#                             the current local date as YYYYMMDD
#     dumpCatGraph            flag for dumping the category graph
#     catGraphDumpF           category graph dump file
#
# Returns 1 on success; 0 after setting the error state on failure.
#
sub extract_records
{
    my $self=shift;
    my $fd=shift;
    my $cb=shift;
    my $opts=shift;
    my $prog_cb=shift;

    if (!defined($cb))
    {
        # NOTE(review): a missing callback is reported with the XML parser
        # error code; no more specific code is visible here.
        $self->_set_err_state($ERR_XML_PARSER);
        return 0;
    }

    $opts={} if (!defined($opts));

    # Effective value of an option: the per-call value if the key is
    # present in opts, else the object's value, else undef.
    my $setting=sub
    {
        my $name=shift;
        return $opts->{$name} if (exists($opts->{$name}));
        return $self->{$name} if (exists($self->{$name}));
        return;
    };

    # The progress callback is optional: flatten it once into
    # (code ref, args...) so that an absent one becomes an empty list
    # instead of an undef dereference.
    my @prog_base=defined($prog_cb) ? @$prog_cb : ();

    my $prog_txt="";
    my $expand_templates=$setting->('expandTemplates');
    my %namespaces;

    if ($expand_templates)
    {
        if ($opts->{templateDumpF})
        {
            if (defined($prog_cb))
            {
                my ($prog_f,@prog_args)=@$prog_cb;
                $prog_f->(@prog_args,"Loading the templates");
            }
            if (!$self->{variables}->load_templates($opts->{templateDumpF}))
            {
                $self->_set_err_state($ERR_LOAD_TEMPLATES);
                return 0;
            }
        }
        else # Have to do a pass first to collect the templates
        {
            $self->{XMLParser}=Parse::MediaWikiDump::Pages->new($fd);
            if (!defined($self->{XMLParser}))
            {
                $self->_set_err_state($ERR_XML_PARSER);
                return 0;
            }

            my $template_word=$setting->('templateWord');

            # _collect_templates() compares the namespace against
            # $self->{templateWord}; make a per-call override visible to
            # it for the duration of this pass only.
            local $self->{templateWord}=$template_word;

            $prog_txt="Collecting templates";
            %namespaces=($template_word=>1);
            # Same flat [code ref, args..., message] layout as the
            # second pass below.
            if (!$self->_pass_over_records(\%namespaces,
                                           [\&_collect_templates,$self],
                                           [@prog_base,$prog_txt]))
            {
                $self->_set_err_state($ERR_FIRST_PASS);
                return 0;
            }

            if ($self->{dumpTemplateData})
            {
                # opts' templateDumpF is known to be false in this
                # branch, so the object's dump file is the target. The
                # result of the dump is not checked.
                $self->{variables}->dump_templates($self->{templateDumpF});
            }
        }
    }

    $prog_txt="Expanding variables and converting";
    #
    # Just in case we did a first pass, destroy old instance
    #
    # NOTE(review): if $fd is a handle that the first pass has read to its
    # end, it presumably has to be rewound before being parsed again --
    # confirm against Parse::MediaWikiDump.
    #
    undef $self->{XMLParser};
    $self->{XMLParser}=Parse::MediaWikiDump::Pages->new($fd);
    if (!defined($self->{XMLParser}))
    {
        $self->_set_err_state($ERR_XML_PARSER);
        return 0;
    }

    my $category_word=$setting->('categoryWord');

    my $date;
    if ($opts->{date})
    {
        $date=$opts->{date};
    }
    else # pick the current date
    {
        my ($mday,$mon,$year)=(localtime())[3,4,5];
        $date=sprintf("%04d%02d%02d",1900+$year,1+$mon,$mday);
    }

    my $output_format=$setting->('outputFormat');

    # Pick articles and category pages
    %namespaces=(''=>1,$category_word=>1);
    #
    # Add any other wanted namespaces
    #
    if ($opts->{namespaces})
    {
        my $wanted=$opts->{namespaces};
        # Documented as a ref to a list; a plain scalar is still accepted
        # as a single namespace.
        for my $ns (ref($wanted) eq 'ARRAY' ? @$wanted : ($wanted))
        {
            $namespaces{$ns}=1;
        }
    }

    if (!$self->_pass_over_records(\%namespaces,
                                   [\&_return_alvis_record,
                                    $self,$cb,$date,$category_word,
                                    $expand_templates,$output_format],
                                   [@prog_base,$prog_txt]))
    {
        $self->_set_err_state($ERR_SECOND_PASS);
        return 0;
    }

    if ($setting->('dumpCatGraph'))
    {
        my $cat_graph_f=$setting->('catGraphDumpF');
        if (!$self->{catGraph}->dump_graph($cat_graph_f))
        {
            $self->_set_err_state($ERR_CAT_GRAPH_DUMP);
            return 0;
        }
    }

    return 1;
}

#########################################################################
#
#    Private methods
#
#########################################################################

#
# Record callback of the template collecting pass: stores the text of a
# page under its title if the page is in the object's template namespace.
#
# Returns 1 if the page was stored or is not a template page; 0 after
# setting the error state if the template could not be added.
#
sub _collect_templates
{
    my ($self,$ns,$page_title,$page_text)=@_;

    # Pages from other namespaces are accepted without being touched
    return 1 if ($ns ne $self->{templateWord});

    return 1 if ($self->{variables}->add_template($page_title,$page_text));

    $self->_set_err_state($ERR_TEMPL_ADD);
    return 0;
}

#
# Record callback of the conversion pass: strips the comments from a
# page, expands it, adds the links of a category page to the category
# graph, converts the page to the requested output format and hands the
# result to the caller's callback.
#
# Arguments:
#
#     cb                  [\&foo,$arg1,$arg2,...]; called with its own
#                         arguments followed by title, date, output
#                         format, converted text, redirect flag, namespace
#     mod_date            date passed through to the callback
#     category_word       category namespace identifier
#     expand_templates    flag passed through to the expansion
#     output_format       $OUTPUT_HTML or $OUTPUT_ALVIS
#     namespace, title, text, is_redir
#                         the record itself
#
# Returns 1 on success; 0 after setting the error state on failure.
# Dies for $OUTPUT_ALVIS, which is not implemented.
#
sub _return_alvis_record
{
    my $self=shift;
    my $cb=shift;
    my $mod_date=shift;
    my $category_word=shift;
    my $expand_templates=shift;
    my $output_format=shift;
    my $namespace=shift;
    my $title=shift;
    my $text=shift;
    my $is_redir=shift;

    $text=~s/<!--.*?-->//sg;

    # Keep the raw title until normalization has succeeded, so that the
    # error message can still show what was rejected.
    my $normalized=$self->{parser}->normalize_title($title);
    if (!defined($normalized))
    {
        my $shown=defined($title) ? $title : '';
        $self->_set_err_state($ERR_TITLE,"title: \"$shown\"");
        return 0;
    }
    $title=$normalized;

    my $expansion=$self->{variables}->expand($namespace,$title,$text,
                                             $expand_templates);
    if (!defined($expansion))
    {
        $self->_set_err_state($ERR_EXPAND);
        return 0;
    }
    $text=$expansion;

    # Records outside the main namespace get their namespace as a prefix
    if ($namespace ne '')
    {
        $title="$namespace/$title";
    }

    if ($namespace eq $category_word && $self->{dumpCategoryData})
    {
        if (!$self->_add_cat_page_links_to_graph($title,$text))
        {
            $self->_set_err_state($ERR_CAT_PAGE_LINKS_ADD,
                                  "title: \"$title\"");
            return 0;
        }
    }

    if ($output_format eq $OUTPUT_HTML)
    {
        my $html=$self->{parser}->to_HTML($text);
        if (!defined($html))
        {
            $self->_set_err_state($ERR_HTML);
            return 0;
        }
        $html="<HTML>\n<BODY>\n" . $html . "</BODY>\n</HTML>\n";

        # The result of the callback is not checked
        my ($record_f,@record_args)=@$cb;
        $record_f->(@record_args,$title,$mod_date,$output_format,$html,
                    $is_redir,$namespace);
    }
    elsif ($output_format eq $OUTPUT_ALVIS)
    {
        # Meant to skip HTML and convert directly to Alvis XML to save time
        die("NOT IMPLEMENTED YET!");
    }
    else
    {
        $self->_set_err_state($ERR_UNK_OUTPUT_FORMAT,
                              "format: \"$output_format\"");
        return 0;
    }

    return 1;
}

#
# Strips the comments from a category page, expands it and adds its
# category links to the category graph.
#
# Arguments: namespace, title, text of the page. A further argument (the
# redirect flag of the record callbacks) is accepted and ignored.
#
# Returns 1 on success; 0 after setting the error state on failure.
#
sub _add_cat_page_to_graph
{
    my ($self,$namespace,$title,$text)=@_;

    $text=~s/<!--.*?-->//sg;

    # Keep the raw title until normalization has succeeded, so that the
    # error message can still show what was rejected.
    my $normalized=$self->{parser}->normalize_title($title);
    if (!defined($normalized))
    {
        my $shown=defined($title) ? $title : '';
        $self->_set_err_state($ERR_TITLE,"title: \"$shown\"");
        return 0;
    }
    $title=$normalized;

    my $expansion=$self->{variables}->expand($namespace,$title,$text);
    if (!defined($expansion))
    {
        $self->_set_err_state($ERR_EXPAND);
        return 0;
    }
    $text=$expansion;

    if (!$self->_add_cat_page_links_to_graph($title,$text))
    {
        $self->_set_err_state($ERR_CAT_PAGE_LINKS_ADD,
                              "title: \"$title\"");
        return 0;
    }

    return 1;
}

#
# Finds every [[<category word>:...]] link in the text of a category page
# and passes the link target, together with the page's title, to
# _add_cat_link(). The category word is the object's categoryWord, matched
# case-insensitively.
#
# Always returns 1: a link that _add_cat_link() rejects does not fail the
# page.
#
sub _add_cat_page_links_to_graph
{
    my $self=shift;
    my $title=shift;  # already normalized
    my $text=shift;

    my $cat=$title;

    # The pattern is built on every call (no /o), so that objects with
    # different category words each match their own. The word is a literal
    # namespace identifier, hence quoted.
    my $word=quotemeta($self->{categoryWord});
    my $link_re=qr/\[\[(?i:$word):(.*?)\]\]/s;

    while ($text=~/$link_re/g)
    {
        # Copy the capture before the callee runs matches of its own
        my $parent_spec=$1;
        $self->_add_cat_link($cat,$parent_spec);
    }

    return 1;
}

#
# Adds a link from a category to one of its parent categories in the
# category graph. The parent is given as the inside of a category link;
# only the part before the first '|' is used, after normalization.
#
# Returns whatever the graph's add_link() returns; 0 after setting the
# error state if the parent title cannot be normalized.
#
sub _add_cat_link
{
    my ($self,$cat,$parent_spec)=@_;

    my @fields=split(/\|/,$parent_spec);
    my $parent_title=$fields[0];

    my $parent=$self->{parser}->normalize_title($parent_title);
    if (defined($parent))
    {
        return $self->{catGraph}->add_link($cat,$parent);
    }

    $self->_set_err_state($ERR_TITLE,
                          "category parent title: \"$parent_title\"");
    return 0;
}

sub _pass_over_records
{
    my $self=shift;
    my $target_namespaces=shift;
    my $cb=shift;
    my $prog_cb=shift;


    $self->{'N'}=0;
    $self->{'n'}=0;



( run in 0.597 second using v1.01-cache-2.11-cpan-5b529ec07f3 )