Alvis-Convert

 view release on metacpan or  search on metacpan

bin/wikipedia2alvis  view on Meta::CPAN

{
    $RootCategory=$LangSettings{$Language}{rootCategory};
    $CategoryWord=$LangSettings{$Language}{categoryWord};
    $TemplateWord=$LangSettings{$Language}{templateWord};
}
#
# Pick the format in which Wikipedia records are handed to the output
# callback: the HTML route when $ConvertViaHTML is set, the direct Alvis
# route otherwise (speed vs. possibly quality).
#
$OutputFormat = $ConvertViaHTML
    ? $Alvis::Wikipedia::XMLDump::OUTPUT_HTML
    : $Alvis::Wikipedia::XMLDump::OUTPUT_ALVIS;
#
# Break the comma-separated list held in $NamespacesTxt into individual
# namespace identifiers and append them, in order, to @Namespaces.
# Whitespace around each identifier is stripped; nothing else is filtered.
#
if ($NamespacesTxt)
{
    for my $ns (split(/,/,$NamespacesTxt))
    {
        # Plain substitutions suffice: the patterns are constant (no /o),
        # contain no letters (no /i) and no '.' (no /s), and being
        # anchored they can match at most once (no /g).
        $ns=~s/^\s+//;
        $ns=~s/\s+$//;
        push(@Namespaces,$ns);
    }
}

bin/wikipedia2alvis  view on Meta::CPAN

    my $title=shift;
    my $date=shift;
    my $output_format=shift;
    my $record_txt=shift;
    my $is_redir=shift;
    my $namespace=shift;

    warn "TITLE:$title";
    
    my $alvis_XML;
    if ($output_format eq $Alvis::Wikipedia::XMLDump::OUTPUT_HTML)
    {
	my $meta_txt;
	$meta_txt.="title\t$title\n";
	$meta_txt.="date\t$date\n";
	my $ns_txt="";
	if ($namespace ne '')
	{
	    $ns_txt="$namespace/";
	}
	$meta_txt.="url\twikipedia/$ns_txt$title\n";

bin/wikipedia2alvis  view on Meta::CPAN

        if (!defined($alvis_XML))
        {
            warn "Obtaining the Alvis version of the " .
                "HTML version of an article failed. " . $C->errmsg() if
                $Warnings;
            $C->clearerr();
            return 1;
        }

    }
    elsif ($output_format eq $Alvis::Wikipedia::XMLDump::OUTPUT_ALVIS)
    {
	$alvis_XML=$record_txt;
    }
    else
    {
	die("Internal inconsistency: output format of a Wikipedia article " .
	    "is an unrecognized one: \"$output_format\".");
    }

    $title=~s/\//_/isgo;

lib/Alvis/Convert.pm  view on Meta::CPAN

    $ERR_XML_PARSE,
    $ERR_NO_URL,
    $ERR_ENCODING_WIZARD,
    $ERR_UTF8_CONV,
    $ERR_ENCODING_CONV,
    $ERR_TYPE_SUFFIX,
    $ERR_READ_HTML,
    $ERR_READ_NEWS_XML,
    $ERR_ALVIS_CONV,
    $ERR_ALVIS_SUFFIX,
    $ERR_NO_OUTPUT_ROOT_DIR,
    $ERR_WRITING_OUTPUT,
    $ERR_DIR_CONV,
    $ERR_NO_HTML_F,
    $ERR_META_F,
    $ERR_HTML_F,
    $ERR_NEWS_XML_F,
    $ERR_DOC_ALVIS_CONV,
    $ERR_NEWS_XML_PARSE,
    $ERR_MULTIPLE_SUFFIX_MEANING,
    $ERR_OUTPUT_ALVIS,
    $ERR_OUTPUT_SET_OF_RECORDS,
    $ERR_AINODUMP,
    $ERR_OPEN_AINODUMP,
    $ERR_AINODUMP_PROCESS,
    $ERR_DOC_TYPE_WIZARD,
    $ERR_TYPE_GUESS,
    $ERR_UNK_FILE_TYPE,
    $ERR_WIKIPEDIA,
    $ERR_OPEN_WIKIPEDIA,
    $ERR_WIKIPEDIA_CONV
    )=(0..40);

lib/Alvis/Convert.pm  view on Meta::CPAN

	     $ERR_ENCODING_WIZARD=>"Unable to instantiate " .
	     "Alvis::Document::Encoding.",
	     $ERR_UTF8_CONV=>"Trying to convert to UTF-8 failed.",
	     $ERR_ENCODING_CONV=>"Converting from the supposed source " .
	     "encoding to UTF-8 failed.",
	     $ERR_TYPE_SUFFIX=>"No suffix given for a type.",
	     $ERR_READ_HTML=>"Reading the HTML failed.",
	     $ERR_READ_NEWS_XML=>"Reading the news XML failed.",
	     $ERR_ALVIS_CONV=>"Conversion to Alvis format failed.",
	     $ERR_ALVIS_SUFFIX=>"No Alvis suffix defined.",
	     $ERR_NO_OUTPUT_ROOT_DIR=>"No output root directory.",
	     $ERR_WRITING_OUTPUT=>"Writing the output failed.",
	     $ERR_DIR_CONV=>"Converting a directory failed.",
	     $ERR_NO_HTML_F=>"No HTML file.",
	     $ERR_META_F=>"Opening the meta file failed.",
	     $ERR_HTML_F=>"Opening the HTML file failed.",
	     $ERR_NEWS_XML_F=>"Opening the news XML file failed.",
	     $ERR_DOC_ALVIS_CONV=>"Converting a document to Alvis format failed.",
	     $ERR_NEWS_XML_PARSE=>"Parsing the news XML failed.",
	     $ERR_MULTIPLE_SUFFIX_MEANING=>
	     "Multiple meanings for a single suffix.",
	     $ERR_OUTPUT_ALVIS=>"Outputting the Alvis records failed.",
	     $ERR_OUTPUT_SET_OF_RECORDS=>"Outputting a set of records to a " .
	     "file as a documentCollection  failed.",
	     $ERR_AINODUMP=>"Instantiating Alvis::AinoDump failed.",
	     $ERR_OPEN_AINODUMP=>"Opening an ainodump file failed.",
	     $ERR_AINODUMP_PROCESS=>"Processing an ainodump file failed.",
	     $ERR_DOC_TYPE_WIZARD=>"Instantiating Alvis::Document::Type " .
	                           "failed.",
	     $ERR_TYPE_GUESS=>"Guessing the document's type failed.",
	     $ERR_UNK_FILE_TYPE=>"Unrecognized file type.",
	     $ERR_WIKIPEDIA=>"Instantiating Alvis::Wikipedia::XMLDump failed.",
	     $ERR_OPEN_WIKIPEDIA=>"Opening the Wikipedia XML dump file failed.",

lib/Alvis/Convert.pm  view on Meta::CPAN

}

#
# output_cb: [\&_output_wikipedia_article,$arg1,$arg2,...]
#               will be called like this:
#          _output_wikipedia_article($arg1,$arg2,...,
#                                    $title,$date,$output_format,
#                                    $record_txt,$is_redir,$namespace)
#
#  where $output_format is a global defined in Alvis::Wikipedia::XMLDump
#  as $OUTPUT_*
#
#
# progress_cb: [\&_wikipedia_progress,$arg1,$arg2,...]     OPTIONAL
#               will be called like this:
#          _wikipedia_progress($arg1,$arg2,...,
#                              $prog_txt,$N,$n,$mess)
#
#   where $N is the total number of records processed and $n the number of hits
#
# opts:  a hash of options with these possible fields:
#
#     namespaces              ref to a list of namespace identifiers whose
#                             records to extract
#     expandTemplates         flag for true template expansion
#     templateDumpF           template dump file
#     outputFormat            format for result records 
#                             ($Alvis::Wikipedia::XMLDump::OUTPUT_*)
#     categoryWord            category namespace identifier (changes with
#                             language)
#     templateWord            template namespace identifier (changes with
#                             language)
#     rootCategory            root category identifier (changes with
#                             language)
#     date                    the date of the dump
#     dumpCatGraph            flag for dumping the category graph
#     catGraphDumpF           category graph dump file
#

lib/Alvis/Convert.pm  view on Meta::CPAN

	    $self->_set_err_state($ERR_ALVIS_SUFFIX);
	    return 0;
	}
	if ($self->{outputAtSameLocation})
	{
	    $out_f=$base_name . "." . $self->{articleN} . '.' .
		$self->{alvisSuffix};
	    $self->{articleN}++;
	    if (!$self->_output_set_of_records($alvis_record,$out_f))
	    {
		$self->_set_err_state($ERR_OUTPUT_SET_OF_RECORDS);
		return 0;
	    }
	    $self->{outputN}++;
	    print "$self->{outputN}\r";
	}
	else
	{
	    if (!defined($self->{outputRootDir}))
	    {
		$self->_set_err_state($ERR_NO_OUTPUT_ROOT_DIR);
		return 0;
	    }
	    my $dir=$self->{outputRootDir} . '/' . 
		int($self->{outputN} / $self->{outputNPerSubdir});
	    if ($self->{outputN} % $self->{outputNPerSubdir}==0)
	    {
		mkdir($dir);
	    }
	    $out_f=$dir . '/' . $self->{outputN} . '.' .
		$self->{alvisSuffix};
	    
	    if (!$self->_output_set_of_records($alvis_record,$out_f))
	    {
		$self->_set_err_state($ERR_OUTPUT_SET_OF_RECORDS);
		return 0;
	    }
	    
	    $self->{outputN}++;
	    print "$self->{outputN}\r";
	}
    }

    return 1;
}

lib/Alvis/Convert.pm  view on Meta::CPAN

	my $alvisXML=$self->HTML($text,$meta_txt);
	$self->{sourceEncoding}=$srcenc_setting;
	if (!defined($alvisXML))
	{
	    $self->_set_err_state($ERR_ALVIS_CONV);
	    return 0;
	}

	if (!$self->output_Alvis([$alvisXML],$base_name))
	{
	    $self->_set_err_state($ERR_OUTPUT_ALVIS,
				  "Base name: \"$base_name\"");
	    return 0;
	}
    }
    else
    {
	warn "Ainodump document $header->{id} was not of a convertible " .
	     "type: $type/$sub_type.\n" if $self->{ainodumpWarnings};
    }

lib/Alvis/Convert.pm  view on Meta::CPAN

}

sub _output_set_of_records
{
    my $self=shift;
    my $set_of_records_txt=shift;
    my $path=shift;

    if (!defined(open(OUT,">:utf8",$path)))
    {
	$self->_set_err_state($ERR_WRITING_OUTPUT,"Output file: " .
			      "\"$path\"");
	return 0;
    }
    print OUT "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
    print OUT "<documentCollection xmlns=\"http://alvis.info/enriched/\">\n";
    print OUT $set_of_records_txt;
    print OUT "</documentCollection>\n";
    close(OUT);
    
    return 1;

lib/Alvis/Wikipedia/XMLDump.pm  view on Meta::CPAN

use Alvis::Wikipedia::CatGraph;
use Alvis::Canonical;

########################################################################
#
#  Exported constants
#
#######################################################################

# Record output formats: codes telling the output callback which kind of
# record text it is being handed.
our $OUTPUT_HTML=0;     # record text is HTML
our $OUTPUT_ALVIS=1;    # record text is Alvis XML

############################################################################
#
#  Error message stuff
#
############################################################################

my ($ERR_OK,
    $ERR_VAR,

lib/Alvis/Wikipedia/XMLDump.pm  view on Meta::CPAN

    $ERR_BUILD_CAT_GRAPH,
    $ERR_CATEGORIES,
    $ERR_XML_PARSER,
    $ERR_CAN_DOC_CONVERSION,
    $ERR_ID,
    $ERR_TITLE,
    $ERR_CAT_PAGE_LINKS_ADD,
    $ERR_CAT_GRAPH,
    $ERR_LOAD_TEMPLATES,
    $ERR_CAT_GRAPH_DUMP,
    $ERR_UNK_OUTPUT_FORMAT
    )=(0..23);
my %ErrMsgs=($ERR_OK=>"",
	     $ERR_VAR=>"Unable to instantiate Alvis::Wikipedia::Variables.",
	     $ERR_PARSER=>
	         "Unable to instantiate Alvis::Wikipedia::WikitextParser.",
	     $ERR_FIRST_PASS=>"The first pass over the records failed.",
	     $ERR_SECOND_PASS=>"The main pass over the records failed.",
	     $ERR_TEMPL_ADD=>"Adding the definition of a template failed.",
	     $ERR_EXPAND=>"Variable and template expansion failed.",
	     $ERR_DUMP=>"Opening the SQL dump file failed.",

lib/Alvis/Wikipedia/XMLDump.pm  view on Meta::CPAN

	     $ERR_XML_PARSER=>"Unable to instantiate Parse::MediaWikiDump",
	     $ERR_CAN_DOC_CONVERSION=>"Converting the text from HTML to " .
	     "canonicalDocument format failed",
	     $ERR_ID=>"Calculating the id failed.",
	     $ERR_TITLE=>"Malformed title",
	     $ERR_CAT_PAGE_LINKS_ADD=>"Adding the links of a category page " .
	     "to the graph failed",
	     $ERR_CAT_GRAPH=>"Instantiating CatGraph failed",
	     $ERR_LOAD_TEMPLATES=>"Loading the templates failed.",
	     $ERR_CAT_GRAPH_DUMP=>"Dumping the category graph failed.",
	     $ERR_UNK_OUTPUT_FORMAT=>"Unrecognized XML dump record output " .
	     "format."
	     );

sub _set_err_state
{
    my $self=shift;
    my $errcode=shift;
    my $errmsg=shift;


lib/Alvis/Wikipedia/XMLDump.pm  view on Meta::CPAN

    }

    return $self;
}

sub _init
{
    my $self=shift;

    $self->{expandTemplates}=0;
    $self->{outputFormat}=$OUTPUT_HTML;
    $self->{skipRedirects}=0;
    $self->{categoryWord}='Category';
    $self->{templateWord}='Template';
    $self->{dumpCategoryData}=1;
    $self->{dumpTemplateData}=1;
    $self->{catGraphDumpF}='CatGraph.storable';
    $self->{templateDumpF}='Templates.storable';

    if (defined(@_))
    {

lib/Alvis/Wikipedia/XMLDump.pm  view on Meta::CPAN

    }
}

#
# opts: hash with fields
#
#     namespaces              ref to a list of namespace identifiers whose
#                             records to extract
#     expandTemplates         flag for true template expansion
#     templateDumpF           template dump file
#     outputFormat            format for result records ($OUTPUT_HTML,
#                             $OUTPUT_ALVIS),...
#     categoryWord            category namespace identifier (changes with
#                             language)
#     templateWord            template namespace identifier (changes with
#                             language)
#     rootCategory            root category identifier (changes with
#                             language)
#     date                    the date of the dump
#     dumpCatGraph            flag for dumping the category graph
#     catGraphDumpF           category graph dump file
#

lib/Alvis/Wikipedia/XMLDump.pm  view on Meta::CPAN

	if (!$self->_add_cat_page_links_to_graph($title,$text))
	{
	    $self->_set_err_state($ERR_CAT_PAGE_LINKS_ADD,
				  "title: \"$title\"");
	    return 0;
	}
    }
    
    my @cb;
    
    if ($output_format eq $OUTPUT_HTML)
    {
	my $html=$self->{parser}->to_HTML($text);
	if (!defined($html))
	{
	    $self->_set_err_state($ERR_HTML);
	    return 0;
	}
	$html="<HTML>\n<BODY>\n" . $html . "</BODY>\n</HTML>\n";	    
	
	@cb=@$cb;
	&{$cb[0]}(@cb[1..$#cb],$title,$mod_date,$output_format,$html,
		  $is_redir,$namespace);
    }
    elsif ($output_format eq $OUTPUT_ALVIS)
    {
	; # Skip HTML and convert directly to Alvis XML to save time
	die("NOT IMPLEMENTED YET!");
	my $alvis_XML;
	
	@cb=@$cb;
	&{$cb[0]}(@cb[1..$#cb],$title,$mod_date,$output_format,
		  $alvis_XML,$is_redir,$namespace);
    }
    else
    {
	$self->_set_err_state($ERR_UNK_OUTPUT_FORMAT,
			      "format: \"$output_format\"");
	return 0;
    }

    return 1;
}

sub _add_cat_page_to_graph
{
    my $self=shift;



( run in 0.349 second using v1.01-cache-2.11-cpan-4e96b696675 )