Alvis-Convert
view release on metacpan or search on metacpan
lib/Alvis/Convert.pm view on Meta::CPAN
############################################################################
#
# Public methods
#
############################################################################
#
# Constructor.
#
# Arguments: a list of key => value pairs. They are handed to _init(),
# which stores them as fields of the object.
#
# Returns the new object, or undef if any of the helper objects
# (canonical converter, document assembler, XML parser, encoding wizard,
# Wikipedia converter, document type wizard) cannot be created.
#
# NOTE(review): on failure the error state is recorded on an object that
# is then discarded, so the caller only ever sees undef.
#
sub new
{
    my $proto=shift;
    # Works both as a class method and as an instance method
    my $class=ref($proto)||$proto;
    my $self={};
    bless($self,$class);

    $self->_set_err_state($ERR_OK);

    $self->_init(@_);

    # Normalise the URL base so that it always ends with a slash
    if (defined($self->{urlBase}))
    {
        if ($self->{urlBase}!~/\/$/)
        {
            $self->{urlBase}.='/';
        }
    }

    # The helpers are created in a fixed order; the first one that
    # cannot be created aborts construction.
    $self->{canonicalConverter}=Alvis::Canonical->new();
    if (!defined($self->{canonicalConverter}))
    {
        $self->_set_err_state($ERR_CANONICAL);
        return undef;
    }

    $self->{documentAssembler}=
        Alvis::Document->new(includeOriginalDocument=>
                             $self->{includeOriginalDocument});
    if (!defined($self->{documentAssembler}))
    {
        $self->_set_err_state($ERR_ASSEMBLER);
        return undef;
    }

    $self->{XMLParser}=XML::LibXML->new();
    if (!defined($self->{XMLParser}))
    {
        $self->_set_err_state($ERR_XML_PARSER);
        return undef;
    }

    $self->{encodingWizard}=
        Alvis::Document::Encoding->new(defaultEncoding=>undef);
    if (!defined($self->{encodingWizard}))
    {
        $self->_set_err_state($ERR_ENCODING_WIZARD);
        return undef;
    }

    $self->{wikipediaConverter}=
        Alvis::Wikipedia::XMLDump->new(expandVariables=>1,
                                       skipRedirects=>0,
                                       dumpCategoryData=>1,
                                       dumpTemplateData=>1);
    if (!defined($self->{wikipediaConverter}))
    {
        $self->_set_err_state($ERR_WIKIPEDIA);
        return undef;
    }

    # The type wizard gets the defaults that _init() / the caller set
    $self->{docTypeWizard}=
        Alvis::Document::Type->new(defaultType=>
                                   $self->{defaultDocType},
                                   defaultSubType=>
                                   $self->{defaultDocSubType});
    if (!defined($self->{docTypeWizard}))
    {
        $self->_set_err_state($ERR_DOC_TYPE_WIZARD);
        return undef;
    }

    return $self;
}
#
# Sets the default values of the object's fields and then overrides
# them with the key => value pairs given as arguments (if any).
#
sub _init
{
    my $self=shift;

    # Defaults
    $self->{fileType}=undef;
    $self->{sourceEncoding}=undef;
    $self->{urlFromBasename}=0;
    $self->{outputAtSameLocation}=0;
    $self->{alvisSuffix}='alvis';
    $self->{outputRootDir}='.';
    $self->{outputNPerSubdir}=1000;
    $self->{defaultDocType}='text';
    $self->{defaultDocSubType}='html';
    $self->{defaultEncoding}='iso-8859-1';
    $self->{includeOriginalDocument}=1;
    $self->{ainodumpWarnings}=1;
    $self->{sourceEncodingFromMeta}=0;

    # Caller-supplied settings win over the defaults. A plain truth
    # test is used because defined(@array) is a fatal error in Perl 5.22+.
    if (@_)
    {
        my %args=@_;
        @$self{ keys %args }=values(%args);
    }
}
#
# in UTF-8
#
sub HTML
{
my $self=shift;
my $html=shift;
my $meta_txt=shift;
my $opts=shift;
$self->_set_err_state($ERR_OK);
lib/Alvis/Convert.pm view on Meta::CPAN
if (!defined($alvisXML))
{
$self->_set_err_state($ERR_ASSEMBLE,
$self->{documentAssembler}->errmsg());
return undef;
}
push(@alvisXMLs,$alvisXML);
}
return \@alvisXMLs;
}
#
# Processes the ainodump file $f: opens it in raw mode and hands the
# handle to the ainodump converter, which is given
# [\&_process_ainodump_doc,$self] as its per-document callback.
#
# Returns 1 on success, 0 on failure (with the error state set).
#
sub ainodump
{
    my $self=shift;
    my $f=shift;

    # No meta needed -- one per record in the dump
    #
    if (!defined(open(AINO,"<:raw",$f)))
    {
        $self->_set_err_state($ERR_OPEN_AINODUMP,
                              "File: \"$f\"");
        return 0;
    }

    # The handle is still passed as a glob, exactly as before, since
    # the converter's expectations are not visible here.
    my $ok=$self->{ainodumpConverter}
        ->process_dump(*AINO,
                       [\&_process_ainodump_doc,$self]);

    # Close before looking at the result so that the handle is not
    # left open when processing fails.
    close(AINO);

    if (!$ok)
    {
        $self->_set_err_state($ERR_AINODUMP_PROCESS,
                              "File: \"$f\"");
        return 0;
    }

    return 1;
}
#
# output_cb: [\&_output_wikipedia_article,$arg1,$arg2,...]
# will be called like this:
# _output_wikipedia_article($arg1,$arg2,...,
# $title,$output_format,
# $record_txt,$is_redir)
#
# where $output_format is one of the $OUTPUT_* globals defined in
# Alvis::Wikipedia::XMLDump
#
#
# progress_cb: [\&_wikipedia_progress,$arg1,$arg2,...] OPTIONAL
# will be called like this:
# _wikipedia_progress($arg1,$arg2,...,
# $prog_txt,$N,$n,$mess)
#
# where $N is the total number of records processed and $n the number of hits
#
# opts: a hash of options with these possible fields:
#
# namespaces ref to a list of namespace identifiers whose
# records to extract
# expandTemplates flag for true template expansion
# templateDumpF template dump file
# outputFormat format for result records
# ($Alvis::Wikipedia::XMLDump::OUTPUT_*)
# categoryWord category namespace identifier (changes with
# language)
# templateWord template namespace identifier (changes with
# language)
# rootCategory root category identifier (changes with
# language)
# date the date of the dump
# dumpCatGraph flag for dumping the category graph
# catGraphDumpF category graph dump file
#
#
# Extracts records from the Wikipedia XML dump file $f. The file is
# opened with the :utf8 layer and handed to the Wikipedia converter
# together with $output_cb, $opts and the optional $progress_cb.
#
# Returns 1 on success, 0 on failure (with the error state set).
#
sub wikipedia
{
    my $self=shift;
    my $f=shift;
    my $output_cb=shift;
    my $opts=shift;
    my $progress_cb=shift;

    # A lexical handle replaces the package-global WIKIPEDIA handle; the
    # converter still receives a GLOB reference.
    my $dump_fh;
    if (!defined(open($dump_fh,"<:utf8",$f)))
    {
        $self->_set_err_state($ERR_OPEN_WIKIPEDIA,
                              "File: \"$f\"");
        return 0;
    }

    my $ok=$self->{wikipediaConverter}->extract_records($dump_fh,
                                                        $output_cb,
                                                        $opts,
                                                        $progress_cb);

    # Close before looking at the result so that the handle is not
    # left open when the extraction fails.
    close($dump_fh);

    if (!$ok)
    {
        $self->_set_err_state($ERR_WIKIPEDIA_CONV,
                              "File: \"$f\"");
        return 0;
    }

    return 1;
}
#
# Generic setter: stores $value in the object's field named $param
# and returns the stored value. No validation is performed.
#
sub set
{
    my ($self,$param,$value)=@_;

    return $self->{$param}=$value;
}
sub read_HTML
{
my $self=shift;
my $f=shift;
my $meta_txt=shift;
my $html_txt="";
# Stupid duplicating of "how the f**k do you read UTF8 in Perl?" fix
( run in 0.716 second using v1.01-cache-2.11-cpan-5623c5533a1 )