Alvis-Convert
view release on metacpan or search on metacpan
lib/Alvis/Convert.pm view on Meta::CPAN
return 0;
}
if ($self->{outputAtSameLocation})
{
$out_f=$base_name . "." . $self->{articleN} . '.' .
$self->{alvisSuffix};
$self->{articleN}++;
if (!$self->_output_set_of_records($alvis_record,$out_f))
{
$self->_set_err_state($ERR_OUTPUT_SET_OF_RECORDS);
return 0;
}
$self->{outputN}++;
print "$self->{outputN}\r";
}
else
{
if (!defined($self->{outputRootDir}))
{
$self->_set_err_state($ERR_NO_OUTPUT_ROOT_DIR);
return 0;
}
my $dir=$self->{outputRootDir} . '/' .
int($self->{outputN} / $self->{outputNPerSubdir});
if ($self->{outputN} % $self->{outputNPerSubdir}==0)
{
mkdir($dir);
}
$out_f=$dir . '/' . $self->{outputN} . '.' .
$self->{alvisSuffix};
if (!$self->_output_set_of_records($alvis_record,$out_f))
{
$self->_set_err_state($ERR_OUTPUT_SET_OF_RECORDS);
return 0;
}
$self->{outputN}++;
print "$self->{outputN}\r";
}
}
return 1;
}
############################################################################
#
# Private methods
#
############################################################################
# Converts one document extracted from an ainodump file into Alvis XML and
# hands the result to output_Alvis().
#
# Arguments:
#   $text   - the document's content; passed to the type guesser and to
#             HTML()
#   $header - hash ref of header fields for the document; 'id', 'url' and
#             'time' are the fields read here
#
# Returns 1 when the document was converted and output, and also when it
# was skipped (no ID in the header, or a guessed type other than
# text/html). Returns 0, after setting the error state, when type guessing,
# the HTML conversion or the output step fails.
sub _process_ainodump_doc
{
    my $self=shift;
    my $text=shift;
    my $header=shift;

    my ($type,$sub_type)=$self->{docTypeWizard}->guess($text);
    if (!(defined($type) && defined($sub_type)))
    {
        $self->_set_err_state($ERR_TYPE_GUESS,
                              $self->{docTypeWizard}->errmsg());
        return 0;
    }

    if (!($type eq 'text' && $sub_type eq 'html'))
    {
        # Only text/html is converted; any other type is skipped and is
        # not treated as an error.
        if ($self->{ainodumpWarnings})
        {
            # The ID may be missing; substitute '' so the message does not
            # trigger an additional "uninitialized value" warning.
            my $id=defined($header->{id}) ? $header->{id} : '';
            warn "Ainodump document $id was not of a convertible " .
                "type: $type/$sub_type.\n";
        }
        return 1;
    }

    if (!defined($header->{id}))
    {
        # Without an ID there is no base name to output under: skip.
        if ($self->{ainodumpWarnings})
        {
            my $url=defined($header->{url}) ? $header->{url} : '';
            my $time=defined($header->{time}) ? $header->{time} : '';
            warn "Ainodump document had no ID. URL,time:" .
                "($url,$time)\n";
        }
        return 1;
    }
    my $base_name=$header->{id};

    # Meta information for HTML(): tab-separated "name\tvalue" lines.
    # Stays undef when the header has neither a URL nor a time.
    my $meta_txt;
    if (defined($header->{url}))
    {
        $meta_txt.="url\t$header->{url}\n";
    }
    if (defined($header->{time}))
    {
        $meta_txt.="date\t$header->{time}\n";
    }

    my $alvisXML;
    {
        # The configured source encoding is blanked for the duration of
        # the HTML() call (presumably so that the encoding is determined
        # per document -- confirm against HTML()). local() restores the
        # previous value when this block is left, even if HTML() dies.
        local $self->{sourceEncoding}=undef;
        $alvisXML=$self->HTML($text,$meta_txt);
    }
    if (!defined($alvisXML))
    {
        $self->_set_err_state($ERR_ALVIS_CONV);
        return 0;
    }

    if (!$self->output_Alvis([$alvisXML],$base_name))
    {
        $self->_set_err_state($ERR_OUTPUT_ALVIS,
                              "Base name: \"$base_name\"");
        return 0;
    }

    return 1;
}
sub _output_set_of_records
lib/Alvis/Convert.pm view on Meta::CPAN
return \@articles;
}
1;
__END__
=head1 NAME
Alvis::Convert - Perl extension for converting documents from a number of
different source formats to Alvis XML format.
=head1 SYNOPSIS
use Alvis::Convert;
# Create a new instance, outputting under 'out'. Read the source
# encoding from the Meta information (sourceEncodingFromMeta=>1).
#
my $C=Alvis::Convert->new(outputRootDir=>'out',
outputNPerSubdir=>1000,
outputAtSameLocation=>0,
includeOriginalDocument=>0,
sourceEncodingFromMeta=>1);
# Restart output counters
$C->init_output();
# Convert e.g. HTML
for my $html_txt (@html)
{
my $alvisXML=$C->HTML($html_txt,$meta_txt);
if (!defined($alvisXML))
{
warn $C->errmsg();
$C->clearerr();
next;
}
if (!$C->output_Alvis([$alvisXML]))
{
warn $C->errmsg();
$C->clearerr();
next;
}
}
=head1 DESCRIPTION
Converts document collections of different formats to Alvis XML
format.
=head1 METHODS
=head2 new()
Options:
fileType the MIME type of the source file to convert.
Default: guess.
sourceEncoding encoding of the source document. Default: guess.
urlFromBasename extract URL from basename. Default: no.
outputAtSameLocation output Alvis XML to the same directories as the
source documents. Default: no.
alvisSuffix suffix of the output Alvis XML records. Default:
'alvis'.
outputRootDir root directory for output files. Default: '.'
outputNPerSubdir number of records output per subdirectory.
Default: 1000
defaultDocType first guess document (MIME) type. Default: 'text'.
defaultDocSubType first guess document subtype. Default: 'html'.
defaultEncoding first guess encoding. Default: 'iso-8859-1'.
includeOriginalDocument include original document in the output?
Default: yes.
ainodumpWarnings issue warnings concerning ainodump conversion?
Default: yes.
sourceEncodingFromMeta read source encoding from Meta information?
Default: no.
=head2 HTML()
my $alvisXML=$C->HTML($html_txt,$meta_txt,
{sourceEncoding=>'utf8',
sourceEncodingFromMeta=>0
});
if (!defined($alvisXML))
{
warn $C->errmsg();
$C->clearerr();
next;
}
=head2 newsXML()
$meta_txt=$C->read_meta($news_xml_entries{$base_name}{metaF});
if (!defined($meta_txt))
{
warn "Reading meta file " .
"\"$news_xml_entries{$base_name}{metaF}\" failed. " .
$C->errmsg();
$C->clearerr();
next;
}
my $alvisXMLs;
$xml_txt=$C->read_news_XML($news_xml_entries{$base_name}{xmlF});
if (!defined($xml_txt))
{
warn "Reading the news XML for basename \"$base_name\" failed. " .
$C->errmsg();
$C->clearerr();
next;
}
$alvisXMLs=$C->newsXML($xml_txt,$meta_txt,$original_document_text);
if (!defined($alvisXMLs))
{
warn "Obtaining the Alvis versions of the documents inside " .
"\"$base_name\"'s XML file failed. " . $C->errmsg();
$C->clearerr();
next;
}
=head2 ainodump()
if (!$C->ainodump($ainodump_file))
{
warn "Obtaining the Alvis version of the " .
"ainodump file \"$ainodump_file\" " .
"failed. " . $C->errmsg() if
$Warnings;
$C->clearerr();
}
( run in 0.586 second using v1.01-cache-2.11-cpan-39bf76dae61 )