Alvis-Convert

 view release on metacpan or  search on metacpan

lib/Alvis/Convert.pm  view on Meta::CPAN


############################################################################
#
#          Public methods
#
############################################################################

#
# Constructor. Usable both as a class method and as an instance method
# (in the latter case the class of the invocant is reused).
#
# Arguments: an optional key/value list that is handed to _init(), which
#            stores it on top of the built-in defaults.
#
# Returns:   the new object, or undef if any of the helper objects could
#            not be constructed.
#
# NOTE(review): on failure the error state is recorded on an object that
# the caller never receives (undef is returned instead), so unless
# _set_err_state() also records the code somewhere else, the caller
# cannot find out which helper failed -- confirm.
#
sub new
{
    my $proto=shift;

    my $class=ref($proto)||$proto;
    my $self=bless({},$class);

    $self->_set_err_state($ERR_OK);

    $self->_init(@_);

    # Make sure a given URL base always ends in a slash
    if (defined($self->{urlBase}) && $self->{urlBase}!~/\/$/)
    {
        $self->{urlBase}.='/';
    }

    # Helper objects, built in this order. Each entry is
    #    [ slot in $self, error code on failure, builder ]
    # The builders are closures so that nothing is constructed before
    # its turn and so that they see the settings stored by _init().
    my @helpers=(
        ['canonicalConverter',$ERR_CANONICAL,
         sub { Alvis::Canonical->new() }],
        ['documentAssembler',$ERR_ASSEMBLER,
         sub { Alvis::Document->new(includeOriginalDocument=>
                                    $self->{includeOriginalDocument}) }],
        ['XMLParser',$ERR_XML_PARSER,
         sub { XML::LibXML->new() }],
        ['encodingWizard',$ERR_ENCODING_WIZARD,
         sub { Alvis::Document::Encoding->new(defaultEncoding=>undef) }],
        ['wikipediaConverter',$ERR_WIKIPEDIA,
         sub { Alvis::Wikipedia::XMLDump->new(expandVariables=>1,
                                              skipRedirects=>0,
                                              dumpCategoryData=>1,
                                              dumpTemplateData=>1) }],
        ['docTypeWizard',$ERR_DOC_TYPE_WIZARD,
         sub { Alvis::Document::Type->new(defaultType=>
                                          $self->{defaultDocType},
                                          defaultSubType=>
                                          $self->{defaultDocSubType}) }],
    );

    for my $helper (@helpers)
    {
        my ($slot,$err_code,$build)=@$helper;

        $self->{$slot}=$build->();
        if (!defined($self->{$slot}))
        {
            $self->_set_err_state($err_code);
            # Kept as "return undef" (not a bare return) so that the
            # result seen by existing callers does not change.
            return undef;
        }
    }

    return $self;
}

#
# Sets the default values of all configuration fields of the object and
# then overrides/extends them with the caller's settings.
#
# Arguments: an optional key/value list; every key is stored in the
#            object as-is, whether or not it has a default below.
#
sub _init
{
    my $self=shift;

    $self->{fileType}=undef;
    $self->{sourceEncoding}=undef;
    $self->{urlFromBasename}=0;
    $self->{outputAtSameLocation}=0;
    $self->{alvisSuffix}='alvis';
    $self->{outputRootDir}='.';
    $self->{outputNPerSubdir}=1000;
    $self->{defaultDocType}='text';
    $self->{defaultDocSubType}='html';
    $self->{defaultEncoding}='iso-8859-1';
    $self->{includeOriginalDocument}=1;
    $self->{ainodumpWarnings}=1;
    $self->{sourceEncodingFromMeta}=0;

    # An array in boolean context is true iff it is non-empty.
    # (defined(@array) is a fatal compile-time error in Perl >= 5.22.)
    if (@_)
    {
        my %args=@_;
        @$self{ keys %args }=values(%args);
    }

}

#
# in UTF-8
#
sub HTML
{
    my $self=shift;
    my $html=shift;   
    my $meta_txt=shift;
    my $opts=shift;

    $self->_set_err_state($ERR_OK);

lib/Alvis/Convert.pm  view on Meta::CPAN

	if (!defined($alvisXML))
	{
	    $self->_set_err_state($ERR_ASSEMBLE,
				  $self->{documentAssembler}->errmsg());
	    return undef;
	}
	push(@alvisXMLs,$alvisXML);
    }

    return \@alvisXMLs;
}

#
# Converts the records of an ainodump file.
#
# Arguments: $f   path of the dump file; it is opened with the :raw layer
#
# Returns:   1 on success, 0 on failure (the error state is set to
#            $ERR_OPEN_AINODUMP or $ERR_AINODUMP_PROCESS accordingly)
#
# Each record is handed by the converter to _process_ainodump_doc via the
# callback spec [\&_process_ainodump_doc,$self].
#
# NOTE(review): the constructor visible in this part of the file does not
# create $self->{ainodumpConverter}; presumably it is created elsewhere
# or supplied by the caller -- confirm.
#
sub ainodump
{
    my $self=shift;
    my $f=shift;

    # No meta needed -- one per record in the dump
    #
    # A lexical handle instead of a package-global bareword one, so
    # that nothing else in the program can clobber it.
    my $fh;
    if (!open($fh,"<:raw",$f))
    {
        $self->_set_err_state($ERR_OPEN_AINODUMP,
                              "File: \"$f\"");
        return 0;
    }

    # The converter is still given a typeglob, as before.
    my $ok=$self->{ainodumpConverter}
        ->process_dump(*{$fh},
                       [\&_process_ainodump_doc,$self]);

    # Close before looking at the result so that the handle is not
    # leaked when the processing fails.
    close($fh);

    if (!$ok)
    {
        $self->_set_err_state($ERR_AINODUMP_PROCESS,
                              "File: \"$f\"");
        return 0;
    }

    return 1;
}

#
# output_cb: [\&_output_wikipedia_article,$arg1,$arg2,...]
#               will be called like this:
#          _output_wikipedia_article($arg1,$arg2,...,
#                                    $title,$output_format,
#                                    $record_txt,$is_redir)
#
#  where $output_format is a global defined in Alvis::Wikipedia::XMLDump
#  as $OUTPUT_*
#
#
# progress_cb: [\&_wikipedia_progress,$arg1,$arg2,...]     OPTIONAL
#               will be called like this:
#          _wikipedia_progress($arg1,$arg2,...,
#                              $prog_txt,$N,$n,$mess)
#
#   where $N is the total number of records processed and $n the number of hits
#
# opts:  a hash of options with these possible fields:
#
#     namespaces              ref to a list of namespace identifiers whose
#                             records to extract
#     expandTemplates         flag for true template expansion
#     templateDumpF           template dump file
#     outputFormat            format for result records 
#                             ($Alvis::Wikipedia::XMLDump::OUTPUT_*)
#     categoryWord            category namespace identifier (changes with
#                             language)
#     templateWord            template namespace identifier (changes with
#                             language)
#     rootCategory            root category identifier (changes with
#                             language)
#     date                    the date of the dump
#     dumpCatGraph            flag for dumping the category graph
#     catGraphDumpF           category graph dump file
#
sub wikipedia
{
    # Extracts records from the Wikipedia XML dump file $f by handing
    # the opened file to the Wikipedia converter together with the
    # output callback, the options and the optional progress callback.
    #
    # Returns 1 on success, 0 on failure (the error state is set to
    # $ERR_OPEN_WIKIPEDIA or $ERR_WIKIPEDIA_CONV accordingly).
    my $self=shift;
    my $f=shift;
    my $output_cb=shift;
    my $opts=shift;
    my $progress_cb=shift;

    # A lexical handle instead of a package-global bareword one; the
    # converter still receives a glob reference, as before.
    my $fh;
    if (!open($fh,"<:utf8",$f))
    {
        $self->_set_err_state($ERR_OPEN_WIKIPEDIA,
                              "File: \"$f\"");
        return 0;
    }

    my $ok=$self->{wikipediaConverter}->extract_records($fh,
                                                        $output_cb,
                                                        $opts,
                                                        $progress_cb);

    # Close before looking at the result so that the handle is not
    # leaked when the extraction fails.
    close($fh);

    if (!$ok)
    {
        $self->_set_err_state($ERR_WIKIPEDIA_CONV,
                              "File: \"$f\"");
        return 0;
    }

    return 1;
}

#
# Generic setter: stores $value in the object under the key $param.
# No check is made on the name of the parameter. The stored value is
# also the return value.
#
sub set
{
    my ($self,$param,$value)=@_;

    return $self->{$param}=$value;
}

sub read_HTML
{
    my $self=shift;
    my $f=shift;
    my $meta_txt=shift;

    my $html_txt="";

    # Stupid duplicating of "how the f**k do you read UTF8 in Perl?" fix



( run in 0.716 second using v1.01-cache-2.11-cpan-5623c5533a1 )