Alvis-Convert
view release on metacpan or search on metacpan
lib/Alvis/Convert.pm view on Meta::CPAN
$self->{documentAssembler}->assemble({canDoc=>$can_doc,
links=>$links,
meta=>$meta,
origText=>$html});
if (!defined($alvisXML))
{
$self->_set_err_state($ERR_ASSEMBLE,
$self->{documentAssembler}->errmsg());
return undef;
}
return $alvisXML;
}
# Converts every article found in a news XML text to an Alvis XML record.
#
#   $newsXML   the news XML text, split into articles by _parse_newsXML()
#   $meta_txt  meta information text, parsed by Alvis::Document::Meta
#   $orig_txt  handed to the document assembler as 'origText'
#
# Returns a reference to the list of assembled records, or undef (with the
# error state set) on failure. Articles without text are skipped.
sub newsXML
{
    my ($self,$newsXML,$meta_txt,$orig_txt)=@_;

    $self->_set_err_state($ERR_OK);

    my $meta=Alvis::Document::Meta->new(text=>$meta_txt);
    unless (defined($meta))
    {
        $self->_set_err_state($ERR_META,
                              "Meta text:\"$meta_txt\".");
        return undef;
    }

    my @alvisXMLs=();
    my $articles=$self->_parse_newsXML($newsXML);
    unless (defined($articles))
    {
        $self->_set_err_state($ERR_NEWS_XML_PARSE);
        return undef;
    }

    ARTICLE:
    for my $article (@$articles)
    {
        my ($body,$iso_date,$title,$links)=@$article;

        unless (defined($body))
        {
            # A missing text is recorded in the error state, but the
            # article is only skipped; the conversion goes on.
            $self->_set_err_state($ERR_NO_NEWS_XML_TEXT,
                                  "News XML text:\"$newsXML\".");
            next ARTICLE;
        }

        my $html='<HTML><BODY>' . $body . '</BODY></HTML>';

        # The date is taken as is; its format is not validated here.
        $meta->set('dc:date',$iso_date) if defined($iso_date);

        # Called in list context; only the first returned value is used.
        my ($can_doc)=
            $self->{canonicalConverter}->HTML($html,
                                              {sourceEncoding=>'utf8'});
        unless (defined($can_doc))
        {
            $self->_set_err_state($ERR_CANDOC_CONV,
                                  $self->{canonicalConverter}->errmsg());
            return undef;
        }

        $meta->set('title',$title) if defined($title);

        unless (defined($meta->get('url')))
        {
            $self->_set_err_state($ERR_NO_URL);
            return undef;
        }

        unless (defined($meta->get('baseURL')))
        {
            # Derive the base URL by cutting the last path component
            # off the URL.
            my $base_URL=$meta->get('url');
            $base_URL=~s/\/[^\/]+?$/\//isgo;
            $meta->set('baseURL',$base_URL);
        }

        my $alvisXML=
            $self->{documentAssembler}->assemble({canDoc=>$can_doc,
                                                  meta=>$meta,
                                                  links=>$links,
                                                  origText=>$orig_txt});
        unless (defined($alvisXML))
        {
            $self->_set_err_state($ERR_ASSEMBLE,
                                  $self->{documentAssembler}->errmsg());
            return undef;
        }

        push(@alvisXMLs,$alvisXML);
    }

    return \@alvisXMLs;
}
# Runs the ainodump converter over the dump file $f.
#
#   $f  path of the ainodump file; opened with the ':raw' layer
#
# The open handle and the callback spec [\&_process_ainodump_doc,$self]
# are passed to the ainodump converter's process_dump().
#
# Returns 1 on success; 0 (with the error state set) if the file cannot be
# opened or process_dump() returns false.
sub ainodump
{
    my $self=shift;
    my $f=shift;

    # No meta needed -- one per record in the dump
    #
    # A lexical handle cannot clash with another global AINO handle and is
    # released on every exit path, including the failure path below.
    my $fh;
    if (!open($fh,"<:raw",$f))
    {
        $self->_set_err_state($ERR_OPEN_AINODUMP,
                              "File: \"$f\"");
        return 0;
    }

    # process_dump() has always been handed a glob (not a reference),
    # so keep passing one.
    if (!$self->{ainodumpConverter}
              ->process_dump(*{$fh},
                             [\&_process_ainodump_doc,$self]))
    {
        $self->_set_err_state($ERR_AINODUMP_PROCESS,
                              "File: \"$f\"");
        close($fh);
        return 0;
    }

    close($fh);

    return 1;
}
#
# output_cb: [\&_output_wikipedia_article,$arg1,$arg2,...]
# will be called like this:
# _output_wikipedia_article($arg1,$arg2,...,
# $title,$output_format,
# $record_txt,$is_redir)
#
# where $output_format is a global defined in Alvis::Wikipedia::XMLDump
# as $OUTPUT_*
#
#
# progress_cb: [\&_wikipedia_progress,$arg1,$arg2,...] OPTIONAL
# will be called like this:
# _wikipedia_progress($arg1,$arg2,...,
# $prog_txt,$N,$n,$mess)
#
# where $N is the total number of records processed and $n the number of hits
#
# opts: a hash of options with these possible fields:
#
# namespaces ref to a list of namespace identifiers whose
# records to extract
# expandTemplates flag for true template expansion
# templateDumpF template dump file
# outputFormat format for result records
# ($Alvis::Wikipedia::XMLDump::OUTPUT_*)
# categoryWord category namespace identifier (changes with
# language)
# templateWord template namespace identifier (changes with
# language)
# rootCategory root category identifier (changes with
# language)
# date the date of the dump
# dumpCatGraph flag for dumping the category graph
# catGraphDumpF category graph dump file
#
# Extracts records from the Wikipedia XML dump file $f.
#
#   $f            path of the dump file; opened with the ':utf8' layer
#   $output_cb    output callback spec
#   $opts         hash of options
#   $progress_cb  progress callback spec
#
# $output_cb, $opts and $progress_cb are passed unchanged, together with
# the open file handle, to the wikipedia converter's extract_records().
#
# Returns 1 on success; 0 (with the error state set) if the file cannot be
# opened or extract_records() returns false.
sub wikipedia
{
    my $self=shift;
    my $f=shift;
    my $output_cb=shift;
    my $opts=shift;
    my $progress_cb=shift;

    # A lexical handle is a glob reference, just like the \*WIKIPEDIA that
    # used to be passed, but it is private to this call and is released on
    # every exit path.
    my $fh;
    if (!open($fh,"<:utf8",$f))
    {
        $self->_set_err_state($ERR_OPEN_WIKIPEDIA,
                              "File: \"$f\"");
        return 0;
    }

    if (!$self->{wikipediaConverter}->extract_records($fh,
                                                      $output_cb,
                                                      $opts,
                                                      $progress_cb))
    {
        $self->_set_err_state($ERR_WIKIPEDIA_CONV,
                              "File: \"$f\"");
        close($fh);
        return 0;
    }

    close($fh);

    return 1;
}
# Stores $value under the key $param directly in the converter object and
# returns the stored value.
sub set
{
    my ($self,$param,$value)=@_;

    return $self->{$param}=$value;
}
# Reads the HTML file $f and returns its contents as a single string.
#
#   $f         path of the HTML file
#   $meta_txt  meta information text; parsed by Alvis::Document::Meta and
#              consulted for 'detectedCharSet' when the
#              sourceEncodingFromMeta option is set
#
# The effective source encoding is the sourceEncoding option, overridden
# by the detected character set from the meta information when
# sourceEncodingFromMeta is set and a charset was detected. If it names
# UTF-8 the file is read through the ':utf8' layer, otherwise with the
# default layers.
#
# Returns undef (with the error state set) if the meta text cannot be
# parsed or the file cannot be opened.
sub read_HTML
{
    my $self=shift;
    my $f=shift;
    my $meta_txt=shift;

    my $meta=Alvis::Document::Meta->new(text=>$meta_txt);
    if (!defined($meta))
    {
        $self->_set_err_state($ERR_META,
                              "Meta text:\"$meta_txt\".");
        return undef;
    }

    my $src_enc;
    if ($self->{sourceEncoding})
    {
        $src_enc=$self->{sourceEncoding};
    }
    if ($self->{sourceEncodingFromMeta})
    {
        my $detected=$meta->get('detectedCharSet');
        if ($detected)
        {
            $src_enc=$detected;
        }
    }

    my $mode="<";
    if (defined($src_enc) && $src_enc=~/^\s*utf\s*\-?\s*8\s*$/i)
    {
        $mode="<:utf8";
    }

    # Three-argument open: the file name is taken literally, so names with
    # leading/trailing whitespace or mode characters are opened correctly.
    my $fh;
    if (!open($fh,$mode,$f))
    {
        $self->_set_err_state($ERR_HTML_F,
                              "File: \"$f\".");
        return undef;
    }

    # Slurp the whole file in one read.
    my $html_txt=do { local $/; <$fh> };
    close($fh);

    # An empty file still yields "", never undef.
    return defined($html_txt) ? $html_txt : "";
}
# Reads the meta information file $f and returns its text.
#
#   $f  path of the meta file
#
# Depending on the metaEncoding option:
#   * names UTF-8:    the file is read through the ':utf8' layer and
#                     returned as is;
#   * anything else:  the file is read with the default layers and
#                     converted in place with Encode::from_to() to 'utf-8';
#   * not defined:    the file is read with the default layers and handed
#                     to the encoding wizard's try_to_convert_to_utf8().
#
# Returns undef (with the error state set) if the file cannot be opened or
# the conversion fails.
sub read_meta
{
    my $self=shift;
    my $f=shift;

    my $enc=$self->{metaEncoding};
    my $is_utf8=(defined($enc) && $enc=~/^\s*utf\s*\-?\s*8\s*$/i);

    # Three-argument open: the file name is taken literally.
    my $fh;
    if (!open($fh,($is_utf8 ? "<:utf8" : "<"),$f))
    {
        $self->_set_err_state($ERR_META_F,
                              "File: \"$f\".");
        return undef;
    }
    my $meta_txt=do { local $/; <$fh> };
    close($fh);
    $meta_txt="" if !defined($meta_txt);

    if ($is_utf8)
    {
        return $meta_txt;
    }

    if (defined($enc)) # non-UTF8
    {
        eval
        {
            Encode::from_to($meta_txt,
                            $enc,'utf-8',Encode::FB_WARN);
        };
        if ($@)
        {
            $self->_set_err_state($ERR_ENCODING_CONV,
                                  "$@. Supposed source encoding of \"$f\":" .
                                  "\"$self->{metaEncoding}\".");
            return undef;
        }
        return $meta_txt;
    }

    # encoding unknown
    #
    # The converted text is assigned to the very variable that is returned.
    # Previously a second "my $meta_txt" in this branch shadowed it, so the
    # sub returned "" here whenever the conversion succeeded.
    $meta_txt=$self->{encodingWizard}->try_to_convert_to_utf8($meta_txt,
                                                              'text',
                                                              'plain');
    if (!defined($meta_txt))
    {
        $self->_set_err_state($ERR_UTF8_CONV,
                              $self->{encodingWizard}->errmsg());
        return undef;
    }

    return $meta_txt;
}
# Reads the news XML file $f through the ':utf8' layer and returns its
# contents as a single string.
#
#   $f  path of the news XML file
#
# Returns undef (with the error state set) if the file cannot be opened.
sub read_news_XML
{
    my $self=shift;
    my $f=shift;

    # Lexical handle instead of the package-global bareword X, so no other
    # handle of that name can be clobbered.
    my $fh;
    if (!open($fh,"<:utf8",$f))
    {
        $self->_set_err_state($ERR_NEWS_XML_F,
                              "File: \"$f\".");
        return undef;
    }

    # Slurp the whole file in one read.
    my $txt=do { local $/; <$fh> };
    close($fh);

    # An empty file still yields "", never undef.
    return defined($txt) ? $txt : "";
}
# Resets the count of output files written (outputN) to zero.
sub init_output
{
    my ($self)=@_;

    return $self->{outputN}=0;
}
sub output_Alvis
{
my $self=shift;
my $alvis_records=shift;
my $base_name=shift;
$self->{recordN}=0;
for my $alvis_record (@$alvis_records)
{
if (!defined($alvis_record))
{
$self->_set_err_state($ERR_DOC_ALVIS_CONV,
"Base name:\"$base_name\"," .
"# of record: $self->{recordN}");
return 0;
}
my $out_f;
if (!defined($self->{alvisSuffix}))
{
$self->_set_err_state($ERR_ALVIS_SUFFIX);
return 0;
}
if ($self->{outputAtSameLocation})
{
$out_f=$base_name . "." . $self->{articleN} . '.' .
$self->{alvisSuffix};
$self->{articleN}++;
if (!$self->_output_set_of_records($alvis_record,$out_f))
{
$self->_set_err_state($ERR_OUTPUT_SET_OF_RECORDS);
return 0;
}
$self->{outputN}++;
print "$self->{outputN}\r";
}
else
{
lib/Alvis/Convert.pm view on Meta::CPAN
}
# print "TYPE:$type,SUBTYPE:$sub_type\n";
if ($type eq 'text' && $sub_type eq 'html')
{
my $meta_txt;
if (defined($header->{url}))
{
$meta_txt.="url\t$header->{url}\n";
}
if (defined($header->{time}))
{
$meta_txt.="date\t$header->{time}\n";
}
my $base_name;
if (defined($header->{id}))
{
$base_name=$header->{id};
}
else
{
warn "Ainodump document had no ID. URL,time:" .
"($header->{url},$header->{time})\n" if $self->{ainodumpWarnings};
return 1;
}
my $srcenc_setting=$self->{sourceEncoding};
$self->{sourceEncoding}=undef;
my $alvisXML=$self->HTML($text,$meta_txt);
$self->{sourceEncoding}=$srcenc_setting;
if (!defined($alvisXML))
{
$self->_set_err_state($ERR_ALVIS_CONV);
return 0;
}
if (!$self->output_Alvis([$alvisXML],$base_name))
{
$self->_set_err_state($ERR_OUTPUT_ALVIS,
"Base name: \"$base_name\"");
return 0;
}
}
else
{
warn "Ainodump document $header->{id} was not of a convertible " .
"type: $type/$sub_type.\n" if $self->{ainodumpWarnings};
}
return 1;
}
# Writes $set_of_records_txt to the file $path, wrapped in an XML
# declaration and a <documentCollection> element.
#
#   $set_of_records_txt  text placed inside the <documentCollection>
#   $path                output file; written through the ':utf8' layer
#
# Returns 1 on success; 0 (with the error state set) if the file cannot be
# opened or written completely.
sub _output_set_of_records
{
    my $self=shift;
    my $set_of_records_txt=shift;
    my $path=shift;

    my $out;
    if (!open($out,">:utf8",$path))
    {
        $self->_set_err_state($ERR_WRITING_OUTPUT,"Output file: " .
                              "\"$path\"");
        return 0;
    }

    my $written=
        (print {$out} "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n")
        && (print {$out} "<documentCollection xmlns=\"http://alvis.info/enriched/\">\n")
        && (print {$out} $set_of_records_txt)
        && (print {$out} "</documentCollection>\n");

    # Buffered write errors (e.g. a full disk) often surface only at
    # close(), so its result counts as part of the write.
    my $closed=close($out);

    if (!($written && $closed))
    {
        $self->_set_err_state($ERR_WRITING_OUTPUT,"Output file: " .
                              "\"$path\"");
        return 0;
    }

    return 1;
}
# Returns the text of the HTML version of the document $base_name.
#
#   $file_versions  hash ref: base name => { suffix => ... }
#   $base_name      base name of the document
#   $html_suffix    suffix of HTML files
#
# The file "$base_name.$html_suffix" is read with _read_HTML(). Returns
# undef (with the error state set) if there is no HTML version or reading
# it fails.
sub _get_HTML_txt
{
    my ($self,$file_versions,$base_name,$html_suffix)=@_;

    # no HTML file
    unless (defined($html_suffix) &&
            exists($file_versions->{$base_name}{$html_suffix}))
    {
        $self->_set_err_state($ERR_NO_HTML_F,"Base name:\"$base_name\"");
        return undef;
    }

    my $html_f="$base_name.$html_suffix";
    my $html_txt=$self->_read_HTML($html_f);
    unless (defined($html_txt))
    {
        $self->_set_err_state($ERR_READ_HTML,"File:\"$html_f\"");
        return undef;
    }

    return $html_txt;
}
sub _read_HTML
{
my $self=shift;
my $f=shift;
if (!defined(open(H,"<$f")))
{
$self->_set_err_state($ERR_HTML_F,
"File: \"$f\".");
return undef;
}
my $txt="";
while (my $l=<H>)
{
$txt.=$l;
}
close(H);
lib/Alvis/Convert.pm view on Meta::CPAN
includeOriginalDocument=>0,
sourceEncodingFromMeta=>1);
# Restart output counters
$C->init_output();
# Convert e.g. HTML
for my $html_txt (@html)
{
my $alvisXML=$C->HTML($html_txt,$meta_txt);
if (!defined($alvisXML))
{
warn $C->errmsg();
$C->clearerr();
next;
}
if (!$C->output_Alvis([$alvisXML]))
{
warn $C->errmsg();
$C->clearerr();
next;
}
}
=head1 DESCRIPTION
Converts document collections of different formats to Alvis XML
format.
=head1 METHODS
=head2 new()
Options:
fileType the MIME type of the source file to convert.
Default: guess.
sourceEncoding encoding of the source document. Default: guess.
urlFromBasename extract URL from basename. Default: no.
outputAtSameLocation output Alvis XML to the same directories as the
source documents. Default: no.
alvisSuffix suffix of the output Alvis XML records. Default:
'alvis'.
outputRootDir root directory for output files. Default: '.'
outputNPerSubdir number of records output per subdirectory.
Default: 1000
defaultDocType first guess document (MIME) type. Default: 'text'.
defaultDocSubType first guess document subtype. Default: 'html'.
defaultEncoding first guess encoding. Default: 'iso-8859-1'.
includeOriginalDocument include original document in the output?
Default: yes.
ainodumpWarnings issue warnings concerning ainodump conversion?
Default: yes.
sourceEncodingFromMeta read source encoding from Meta information?
Default: no.
=head2 HTML()
my $alvisXML=$C->HTML($html_txt,$meta_txt,
{sourceEncoding=>'utf8',
sourceEncodingFromMeta=>0
});
if (!defined($alvisXML))
{
warn $C->errmsg();
$C->clearerr();
next;
}
=head2 newsXML()
$meta_txt=$C->read_meta($news_xml_entries{$base_name}{metaF});
if (!defined($meta_txt))
{
warn "Reading meta file " .
"\"$news_xml_entries{$base_name}{metaF}\" failed. " .
$C->errmsg();
$C->clearerr();
next;
}
my $alvisXMLs;
$xml_txt=$C->read_news_XML($news_xml_entries{$base_name}{xmlF});
if (!defined($xml_txt))
{
warn "Reading the news XML for basename \"$base_name\" failed. " .
$C->errmsg();
$C->clearerr();
next;
}
$alvisXMLs=$C->newsXML($xml_txt,$meta_txt,$original_document_text);
if (!defined($alvisXMLs))
{
warn "Obtaining the Alvis versions of the documents inside " .
"\"$base_name\"'s XML file failed. " . $C->errmsg();
$C->clearerr();
next;
}
=head2 ainodump()
if (!$C->ainodump($ainodump_file))
{
warn "Obtaining the Alvis version of the " .
"ainodump file \"$dump_entries{$base_name}{ainoF}\" " .
"failed. " . $C->errmsg() if
$Warnings;
$C->clearerr();
}
=head2 set()
$C->set('alvisSuffix','foo');
=head2 read_HTML()
$html_txt=$C->read_HTML($html_file,$meta_txt);
if (!defined($html_txt))
{
warn "Reading the HTML failed. " .
( run in 0.664 second using v1.01-cache-2.11-cpan-140bd7fdf52 )