Alvis-Convert

 view release on metacpan or  search on metacpan

lib/Alvis/Convert.pm  view on Meta::CPAN


use Carp;
use Data::Dumper;
use Encode;
use XML::LibXML;

use Alvis::Canonical;
use Alvis::Document;
use Alvis::Document::Encoding;
use Alvis::Document::Meta;
use Alvis::Document::Links;
use Alvis::Document::Type;
use Alvis::AinoDump;
use Alvis::Wikipedia::XMLDump;


############################################################################
#
#  Global variables
#
############################################################################

# Types of documents handled
our ($UNKNOWN_FILE_TYPE,
     $DIR,
     $META,
     $HTML,
     $NEWS_XML,
     $AINODUMP,
     $WIKIPEDIA_XML_DUMP)=(0..6);

# Human-readable description of each entry type, keyed by its type code.
my %RecognizedEntryTypeDescs=
    ($UNKNOWN_FILE_TYPE  => "Guess the file type",
     $DIR                => "Directory",
     $META               => "Meta information",
     $HTML               => "HTML",
     $NEWS_XML           => "XML information about a news article",
     $AINODUMP           => "ainodump",
     $WIKIPEDIA_XML_DUMP => "Wikipedia XML dump");

############################################################################
#
#  Error message stuff
#
############################################################################

# Error codes.  Each name below is bound, in order, to the matching integer
# of the range at the end of the list, so the position of a name IS its
# value: add new codes at the end and extend the range by one.
my ($ERR_OK,
    $ERR_CANONICAL,
    $ERR_ASSEMBLER,
    $ERR_CANDOC_CONV,
    $ERR_META,
    $ERR_LINKS,
    $ERR_LINK_ADD,
    $ERR_ASSEMBLE,
    $ERR_NO_NEWS_XML_TEXT,
    $ERR_XML_PARSER,
    $ERR_XML_PARSE,
    $ERR_NO_URL,
    $ERR_ENCODING_WIZARD,
    $ERR_UTF8_CONV,
    $ERR_ENCODING_CONV,
    $ERR_TYPE_SUFFIX,
    $ERR_READ_HTML,
    $ERR_READ_NEWS_XML,
    $ERR_ALVIS_CONV,
    $ERR_ALVIS_SUFFIX,
    $ERR_NO_OUTPUT_ROOT_DIR,
    $ERR_WRITING_OUTPUT,
    $ERR_DIR_CONV,
    $ERR_NO_HTML_F,
    $ERR_META_F,
    $ERR_HTML_F,
    $ERR_NEWS_XML_F,
    $ERR_DOC_ALVIS_CONV,
    $ERR_NEWS_XML_PARSE,
    $ERR_MULTIPLE_SUFFIX_MEANING,
    $ERR_OUTPUT_ALVIS,
    $ERR_OUTPUT_SET_OF_RECORDS,
    $ERR_AINODUMP,
    $ERR_OPEN_AINODUMP,
    $ERR_AINODUMP_PROCESS,
    $ERR_DOC_TYPE_WIZARD,
    $ERR_TYPE_GUESS,
    $ERR_UNK_FILE_TYPE,
    $ERR_WIKIPEDIA,
    $ERR_OPEN_WIKIPEDIA,
    $ERR_WIKIPEDIA_CONV
    )=(0..40);

# Canned message for every error code.  _set_err_state() refuses any code
# that has no entry here, so each code declared above needs one.
my %ErrMsgs=($ERR_OK=>"",
	     $ERR_CANONICAL=>"Could not instantiate Alvis::Canonical.",
	     $ERR_ASSEMBLER=>"Could not instantiate Alvis::Document.",
	     $ERR_CANDOC_CONV=>"Conversion to canonicalDocument failed.",
	     $ERR_META=>"Could not instantiate Alvis::Document::Meta.",
	     $ERR_LINKS=>"Could not instantiate Alvis::Document::Links.",
	     $ERR_LINK_ADD=>"Adding a link failed.",
	     $ERR_ASSEMBLE=>"Assembling a document failed.",
	     $ERR_NO_NEWS_XML_TEXT=>"Unable to extract the content from News" .
	     " XML format.",
	     $ERR_XML_PARSER=>"Could not instantiate XML::LibXML.",
	     $ERR_XML_PARSE=>"Parsing the XML failed.",
	     $ERR_NO_URL=>"No URL.",
	     $ERR_ENCODING_WIZARD=>"Unable to instantiate " .
	     "Alvis::Document::Encoding.",
	     $ERR_UTF8_CONV=>"Trying to convert to UTF-8 failed.",
	     $ERR_ENCODING_CONV=>"Converting from the supposed source " .
	     "encoding to UTF-8 failed.",
	     $ERR_TYPE_SUFFIX=>"No suffix given for a type.",
	     $ERR_READ_HTML=>"Reading the HTML failed.",
	     $ERR_READ_NEWS_XML=>"Reading the news XML failed.",
	     $ERR_ALVIS_CONV=>"Conversion to Alvis format failed.",
	     $ERR_ALVIS_SUFFIX=>"No Alvis suffix defined.",
	     $ERR_NO_OUTPUT_ROOT_DIR=>"No output root directory.",
	     $ERR_WRITING_OUTPUT=>"Writing the output failed.",
	     $ERR_DIR_CONV=>"Converting a directory failed.",
	     $ERR_NO_HTML_F=>"No HTML file.",
	     $ERR_META_F=>"Opening the meta file failed.",
	     $ERR_HTML_F=>"Opening the HTML file failed.",
	     $ERR_NEWS_XML_F=>"Opening the news XML file failed.",
	     $ERR_DOC_ALVIS_CONV=>"Converting a document to Alvis format failed.",
	     $ERR_NEWS_XML_PARSE=>"Parsing the news XML failed.",
	     $ERR_MULTIPLE_SUFFIX_MEANING=>
	     "Multiple meanings for a single suffix.",
	     $ERR_OUTPUT_ALVIS=>"Outputting the Alvis records failed.",
	     $ERR_OUTPUT_SET_OF_RECORDS=>"Outputting a set of records to a " .
	     "file as a documentCollection  failed.",
	     $ERR_AINODUMP=>"Instantiating Alvis::AinoDump failed.",
	     $ERR_OPEN_AINODUMP=>"Opening an ainodump file failed.",
	     $ERR_AINODUMP_PROCESS=>"Processing an ainodump file failed.",
	     $ERR_DOC_TYPE_WIZARD=>"Instantiating Alvis::Document::Type " .
	                           "failed.",
	     $ERR_TYPE_GUESS=>"Guessing the document's type failed.",
	     $ERR_UNK_FILE_TYPE=>"Unrecognized file type.",
	     $ERR_WIKIPEDIA=>"Instantiating Alvis::Wikipedia::XMLDump failed.",
	     $ERR_OPEN_WIKIPEDIA=>"Opening the Wikipedia XML dump file failed.",
	     $ERR_WIKIPEDIA_CONV=>"Extracting the articles from the Wikipedia" .
	                          " XML dump failed." 
	     );

# Records an error on the object.
#
#   $errcode   one of the $ERR_* codes; $ERR_OK empties the error string
#   $errmsg    optional detail appended after the canned message
#
# Any code other than $ERR_OK is appended to $self->{errstr}, so messages
# pile up until the error string is emptied again.  confess()es when the
# code is undefined or has no entry in %ErrMsgs.
sub _set_err_state
{
    my ($self,$errcode,$errmsg)=@_;

    confess("set_err_state() called with an undefined argument.")
	if !defined($errcode);

    confess("Internal error: set_err_state() called with an " .
	    "unrecognized argument ($errcode).")
	if !exists($ErrMsgs{$errcode});

    if ($errcode!=$ERR_OK)
    {
	$self->{errstr}.=" " . $ErrMsgs{$errcode};
	$self->{errstr}.=" " . $errmsg if defined($errmsg);
    }
    else
    {
	$self->{errstr}="";
    }
}

# Empties the accumulated error string of the object.
sub clearerr
{
    my ($self)=@_;

    $self->{errstr}="";
}

sub errmsg
{
    my $self=shift;
    

lib/Alvis/Convert.pm  view on Meta::CPAN

	{
	    if (!defined($meta->get('baseURL')))
	    {
		my $base_URL=$meta->get('url');
		$base_URL=~s/\/[^\/]+?$/\//isgo;
		$meta->set('baseURL',$base_URL);
	    }
	}
	
	my $alvisXML=
	    $self->{documentAssembler}->assemble({canDoc=>$can_doc,
						  meta=>$meta,
						  links=>$links,
						  origText=>$orig_txt});
	if (!defined($alvisXML))
	{
	    $self->_set_err_state($ERR_ASSEMBLE,
				  $self->{documentAssembler}->errmsg());
	    return undef;
	}
	push(@alvisXMLs,$alvisXML);
    }

    return \@alvisXMLs;
}

# Converts the records of an ainodump file.
#
#   $f   path of the ainodump file
#
# The file is opened in raw mode and handed to
# $self->{ainodumpConverter}->process_dump() together with the callback
# [\&_process_ainodump_doc,$self].
#
# Returns 1 on success.  On failure the error state is set and 0 is
# returned.
sub ainodump
{
    my $self=shift;
    my $f=shift;

    # No meta needed -- one per record in the dump
    #
    my $dump_fh;
    if (!open($dump_fh,"<:raw",$f))
    {
	$self->_set_err_state($ERR_OPEN_AINODUMP,
			      "File: \"$f\"");
	return 0;
    }

    # The converter is still given a glob, exactly as before.
    my $processed=$self->{ainodumpConverter}
	->process_dump(*{$dump_fh},
		       [\&_process_ainodump_doc,$self]);

    # Close on both outcomes so that a failed conversion does not leave
    # the dump file open.
    close($dump_fh);

    if (!$processed)
    {
	$self->_set_err_state($ERR_AINODUMP_PROCESS,
			      "File: \"$f\"");
	return 0;
    }

    return 1;
}

#
# Extracts records from a Wikipedia XML dump file.
#
# f:  path of the dump file; it is opened with the :utf8 layer and handed
#     to $self->{wikipediaConverter}->extract_records()
#
# output_cb: [\&_output_wikipedia_article,$arg1,$arg2,...]
#               will be called like this:
#          _output_wikipedia_article($arg1,$arg2,...,
#                                    $title,$output_format,
#                                    $record_txt,$is_redir)
#
#  where $output_format is a global defined in Alvis::Wikipedia::XMLDump
#  as $OUTPUT_*
#
#
# progress_cb: [\&_wikipedia_progress,$arg1,$arg2,...]     OPTIONAL
#               will be called like this:
#          _wikipedia_progress($arg1,$arg2,...,
#                              $prog_txt,$N,$n,$mess)
#
#   where $N is the total number of records processed and $n the number of hits
#
# opts:  a hash of options with these possible fields:
#
#     namespaces              ref to a list of namespace identifiers whose
#                             records to extract
#     expandTemplates         flag for true template expansion
#     templateDumpF           template dump file
#     outputFormat            format for result records 
#                             ($Alvis::Wikipedia::XMLDump::OUTPUT_*)
#     categoryWord            category namespace identifier (changes with
#                             language)
#     templateWord            template namespace identifier (changes with
#                             language)
#     rootCategory            root category identifier (changes with
#                             language)
#     date                    the date of the dump
#     dumpCatGraph            flag for dumping the category graph
#     catGraphDumpF           category graph dump file
#
# Returns 1 on success.  On failure the error state is set and 0 is
# returned.
#
sub wikipedia
{
    my $self=shift;
    my $f=shift;
    my $output_cb=shift;
    my $opts=shift;
    my $progress_cb=shift;

    my $wiki_fh;
    if (!open($wiki_fh,"<:utf8",$f))
    {
	$self->_set_err_state($ERR_OPEN_WIKIPEDIA,
			      "File: \"$f\"");
	return 0;
    }

    my $extracted=$self->{wikipediaConverter}->extract_records($wiki_fh,
							       $output_cb,
							       $opts,
							       $progress_cb);

    # Close on both outcomes so that a failed extraction does not leave
    # the dump file open.
    close($wiki_fh);

    if (!$extracted)
    {
	$self->_set_err_state($ERR_WIKIPEDIA_CONV,
			      "File: \"$f\"");
	return 0;
    }

    return 1;
}

# Stores $value under the key $param directly on the object.
sub set
{
    my ($self,$param,$value)=@_;

    $self->{$param}=$value;
}

sub read_HTML
{
    my $self=shift;
    my $f=shift;
    my $meta_txt=shift;

    my $html_txt="";

    # Stupid duplicating of "how the f**k do you read UTF8 in Perl?" fix
    my $meta=Alvis::Document::Meta->new(text=>$meta_txt);
    if (!defined($meta))
    {

lib/Alvis/Convert.pm  view on Meta::CPAN

}

# Reads a whole news XML file through the :utf8 layer.
#
#   $f   path of the file
#
# Returns the file's content as one string ("" for an empty file).  When
# the file cannot be opened the error state is set and undef is returned.
sub read_news_XML
{
    my $self=shift;
    my $f=shift;

    # A lexical handle: a shared bareword handle would be clobbered by any
    # other code in the package opening the same name.
    my $xml_fh;
    if (!open($xml_fh,"<:utf8",$f))
    {
	$self->_set_err_state($ERR_NEWS_XML_F,
			      "File: \"$f\".");
	return undef;
    }

    # Slurp the file in one read instead of appending line by line.
    my $txt=do { local $/; <$xml_fh> };
    close($xml_fh);

    # undef is the failure signal, so never return it for a readable file.
    return defined($txt) ? $txt : "";
}

# Resets the counter of output files written so far.
sub init_output
{
    my ($self)=@_;

    $self->{outputN}=0;
}

# Writes each Alvis record of a list to its own output file.
#
#   $alvis_records   ref to a list of record texts
#   $base_name       base of the output file names; used when
#                    $self->{outputAtSameLocation} is set and in error
#                    messages
#
# With $self->{outputAtSameLocation} set, a record goes to
# "<base_name>.<articleN>.<alvisSuffix>".  Otherwise it goes to
# "<outputRootDir>/<outputN div outputNPerSubdir>/<outputN>.<alvisSuffix>".
# NOTE(review): articleN and outputNPerSubdir are never initialized in this
# sub -- presumably the constructor sets them; confirm.
#
# $self->{recordN} holds the number of records of the current call that
# have been written, so the error for an undefined record names its
# position in the list.
#
# Returns 1 when every record has been written.  On the first failure the
# error state is set and 0 is returned.
sub output_Alvis
{
    my $self=shift;
    my $alvis_records=shift;
    my $base_name=shift;

    $self->{recordN}=0;
    for my $alvis_record (@$alvis_records)
    {
	if (!defined($alvis_record))
	{
	    $self->_set_err_state($ERR_DOC_ALVIS_CONV,
				  "Base name:\"$base_name\"," .
				  "# of record: $self->{recordN}");
	    return 0;
	}
	if (!defined($self->{alvisSuffix}))
	{
	    $self->_set_err_state($ERR_ALVIS_SUFFIX);
	    return 0;
	}

	my $out_f;
	if ($self->{outputAtSameLocation})
	{
	    $out_f=$base_name . "." . $self->{articleN} . '.' .
		$self->{alvisSuffix};
	    $self->{articleN}++;
	}
	else
	{
	    if (!defined($self->{outputRootDir}))
	    {
		$self->_set_err_state($ERR_NO_OUTPUT_ROOT_DIR);
		return 0;
	    }
	    my $dir=$self->{outputRootDir} . '/' .
		int($self->{outputN} / $self->{outputNPerSubdir});
	    # Create the subdirectory whenever it is missing, not only on
	    # the first record of a bucket, so a run whose counter starts
	    # in mid-bucket still has somewhere to write.  A failing mkdir
	    # surfaces as a write error just below.
	    if (!-d $dir)
	    {
		mkdir($dir);
	    }
	    $out_f=$dir . '/' . $self->{outputN} . '.' .
		$self->{alvisSuffix};
	}

	if (!$self->_output_set_of_records($alvis_record,$out_f))
	{
	    $self->_set_err_state($ERR_OUTPUT_SET_OF_RECORDS);
	    return 0;
	}

	$self->{outputN}++;
	$self->{recordN}++;
	# Progress counter, overwritten in place on the terminal.
	print "$self->{outputN}\r";
    }

    return 1;
}

############################################################################
#
#          Private methods
#
############################################################################

# Callback for one document of an ainodump.
#
#   $text     the document's text
#   $header   ref to a hash of header fields; url, time and id are read
#
# Only documents guessed to be text/html are converted and output; other
# types, and HTML documents without an id, are skipped (with a warning
# when $self->{ainodumpWarnings} is set).
#
# Returns 1 when the document was handled or skipped, 0 after setting the
# error state when guessing, converting or outputting failed.
sub _process_ainodump_doc
{
    my ($self,$text,$header)=@_;

    my ($type,$sub_type)=$self->{docTypeWizard}->guess($text);
    if (!defined($type) || !defined($sub_type))
    {
	$self->_set_err_state($ERR_TYPE_GUESS,
			      $self->{docTypeWizard}->errmsg());
	return 0;
    }

    # Anything but HTML is left alone.
    if ($type ne 'text' || $sub_type ne 'html')
    {
	warn "Ainodump document $header->{id} was not of a convertible " .
	     "type: $type/$sub_type.\n" if $self->{ainodumpWarnings};
	return 1;
    }

    # The id doubles as the base name of the output, so it is required.
    if (!defined($header->{id}))
    {
	warn "Ainodump document had no ID. URL,time:" .
	    "($header->{url},$header->{time})\n" if $self->{ainodumpWarnings};
	return 1;
    }
    my $base_name=$header->{id};

    # Meta information is built from the header: one "key<TAB>value" line
    # per available field.
    my $meta_txt;
    $meta_txt.="url\t$header->{url}\n" if defined($header->{url});
    $meta_txt.="date\t$header->{time}\n" if defined($header->{time});

    # The configured source encoding is switched off for the conversion
    # and put back afterwards.
    my $srcenc_setting=$self->{sourceEncoding};
    $self->{sourceEncoding}=undef;
    my $alvisXML=$self->HTML($text,$meta_txt);
    $self->{sourceEncoding}=$srcenc_setting;

    if (!defined($alvisXML))
    {
	$self->_set_err_state($ERR_ALVIS_CONV);
	return 0;
    }

    if (!$self->output_Alvis([$alvisXML],$base_name))
    {
	$self->_set_err_state($ERR_OUTPUT_ALVIS,
			      "Base name: \"$base_name\"");
	return 0;
    }

    return 1;
}

# Writes a set of records to a file, wrapped in an XML declaration and a
# documentCollection element.
#
#   $set_of_records_txt   text placed inside the documentCollection element
#   $path                 output file; opened for writing with the :utf8
#                         layer
#
# Returns 1 when the file has been written and closed without error.
# Otherwise the error state is set and 0 is returned.
sub _output_set_of_records
{
    my $self=shift;
    my $set_of_records_txt=shift;
    my $path=shift;

    my $out_fh;
    if (!open($out_fh,">:utf8",$path))
    {
	$self->_set_err_state($ERR_WRITING_OUTPUT,"Output file: " .
			      "\"$path\"");
	return 0;
    }

    my $collection_txt="<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" .
	"<documentCollection xmlns=\"http://alvis.info/enriched/\">\n" .
	$set_of_records_txt .
	"</documentCollection>\n";

    # Buffered write errors (a full disk, say) may only show up at close
    # time, so both results are checked; the handle is closed either way.
    my $printed=print {$out_fh} $collection_txt;
    my $closed=close($out_fh);
    if (!($printed && $closed))
    {
	$self->_set_err_state($ERR_WRITING_OUTPUT,"Output file: " .
			      "\"$path\"");
	return 0;
    }

    return 1;
}

# Fetches the text of the HTML version of a document.
#
#   $file_versions   ref to a hash of hashes, looked up as
#                    {base name}{suffix}
#   $base_name       the document's file name without suffix
#   $html_suffix     suffix of HTML files (may be undefined)
#
# Returns the text read from "<base_name>.<html_suffix>".  Returns undef,
# with the error state set, when there is no HTML version or reading it
# fails.
sub _get_HTML_txt
{
    my ($self,$file_versions,$base_name,$html_suffix)=@_;

    # no HTML file
    if (!defined($html_suffix) ||
	!exists($file_versions->{$base_name}{$html_suffix}))
    {
	$self->_set_err_state($ERR_NO_HTML_F,"Base name:\"$base_name\"");
	return undef;
    }

    my $html_f=$base_name . "." . $html_suffix;
    my $html_txt=$self->_read_HTML($html_f);
    if (!defined($html_txt))
    {
	$self->_set_err_state($ERR_READ_HTML,"File:\"$html_f\"");
	return undef;
    }

    return $html_txt;
}


sub _read_HTML
{
    my $self=shift;
    my $f=shift;

    if (!defined(open(H,"<$f")))
    {
	$self->_set_err_state($ERR_HTML_F,
			      "File: \"$f\".");
	return undef;
    }
    my $txt="";
    while (my $l=<H>)
    {
	$txt.=$l;
    }
    close(H);

    return $txt;



( run in 1.758 second using v1.01-cache-2.11-cpan-13bb782fe5a )