Alvis-Convert

 view release on metacpan or  search on metacpan

lib/Alvis/Convert.pm  view on Meta::CPAN

	    return 0;
	}
	if ($self->{outputAtSameLocation})
	{
	    $out_f=$base_name . "." . $self->{articleN} . '.' .
		$self->{alvisSuffix};
	    $self->{articleN}++;
	    if (!$self->_output_set_of_records($alvis_record,$out_f))
	    {
		$self->_set_err_state($ERR_OUTPUT_SET_OF_RECORDS);
		return 0;
	    }
	    $self->{outputN}++;
	    print "$self->{outputN}\r";
	}
	else
	{
	    if (!defined($self->{outputRootDir}))
	    {
		$self->_set_err_state($ERR_NO_OUTPUT_ROOT_DIR);
		return 0;
	    }
	    my $dir=$self->{outputRootDir} . '/' . 
		int($self->{outputN} / $self->{outputNPerSubdir});
	    if ($self->{outputN} % $self->{outputNPerSubdir}==0)
	    {
		mkdir($dir);
	    }
	    $out_f=$dir . '/' . $self->{outputN} . '.' .
		$self->{alvisSuffix};
	    
	    if (!$self->_output_set_of_records($alvis_record,$out_f))
	    {
		$self->_set_err_state($ERR_OUTPUT_SET_OF_RECORDS);
		return 0;
	    }
	    
	    $self->{outputN}++;
	    print "$self->{outputN}\r";
	}
    }

    return 1;
}

############################################################################
#
#          Private methods
#
############################################################################

sub _process_ainodump_doc
{
    my $self=shift;
    my $text=shift;
    my $header=shift;

#    print Dumper($header);
#    print "\n";

    my ($type,$sub_type)=$self->{docTypeWizard}->guess($text);
    if (!(defined($type) && defined($sub_type)))
    {
	$self->_set_err_state($ERR_TYPE_GUESS,
			      $self->{docTypeWizard}->errmsg());
	return 0;
    }

#    print "TYPE:$type,SUBTYPE:$sub_type\n";
    
    if ($type eq 'text' && $sub_type eq 'html')
    {
	my $meta_txt;
	if (defined($header->{url}))
	{
	    $meta_txt.="url\t$header->{url}\n";
	}
	if (defined($header->{time}))
	{
	    $meta_txt.="date\t$header->{time}\n";
	}
 	
	my $base_name;
	if (defined($header->{id}))
	{
	    $base_name=$header->{id};
	}
	else 
	{
	    warn "Ainodump document had no ID. URL,time:" .
		"($header->{url},$header->{time})\n" if $self->{ainodumpWarnings};
	    return 1;
	}

	my $srcenc_setting=$self->{sourceEncoding};
	$self->{sourceEncoding}=undef;
	my $alvisXML=$self->HTML($text,$meta_txt);
	$self->{sourceEncoding}=$srcenc_setting;
	if (!defined($alvisXML))
	{
	    $self->_set_err_state($ERR_ALVIS_CONV);
	    return 0;
	}

	if (!$self->output_Alvis([$alvisXML],$base_name))
	{
	    $self->_set_err_state($ERR_OUTPUT_ALVIS,
				  "Base name: \"$base_name\"");
	    return 0;
	}
    }
    else
    {
	warn "Ainodump document $header->{id} was not of a convertible " .
	     "type: $type/$sub_type.\n" if $self->{ainodumpWarnings};
    }

    return 1;
}

sub _output_set_of_records

lib/Alvis/Convert.pm  view on Meta::CPAN


    return \@articles;
}


1;

__END__

=head1 NAME

Alvis::Convert - Perl extension for converting documents from a number of 
different source formats to Alvis XML format.

=head1 SYNOPSIS

 use Alvis::Convert;

 # Create a new instance, outputting under 'out'. Get the detected
 # encoding from sourceEncodingFromMeta.
 #
 my $C=Alvis::Convert->new(outputRootDir=>'out',
	   		   outputNPerSubdir=>1000,
			   outputAtSameLocation=>0,
			   includeOriginalDocument=>0,
                           sourceEncodingFromMeta=>1);
 # Restart output counters
 $C->init_output();

 # Convert e.g. HTML
 for my $html_text (@html)
 {
     my $alvisXML=$C->HTML($html_txt,$meta_txt);
     if (!defined($alvisXML))
     {
	warn $C->errmsg();
	$C->clearerr();
	next;
     }
 
     if (!$C->output_Alvis([$alvisXML]))
     {
         warn $C->errmsg();
         $C->clearerr();
         next;
     }
 }

=head1 DESCRIPTION

Converts document collections of different formats to Alvis XML
format.

=head1 METHODS

=head2 new()

Options:

    fileType                 the MIME type of the source file to convert. 
                             Default: guess.
    sourceEncoding           encoding of the source document. Default: guess.  
    urlFromBasename          extract URL from basename. Default: no.
    outputAtSameLocation     output Alvis XML to the same directories as the
                             source documents. Default: no.
    alvisSuffix              suffix of the output Alvis XML records. Default:
                             'alvis'.
    outputRootDir            root directory for output files. Default: '.'
    outputNPerSubdir         number of records output per subdirectory.
                             Default: 1000
    defaultDocType           first guess document (MIME) type. Default: 'text'.
    defaultDocSubType        first guess document subtype. Default: 'html'.
    defaultEncoding          first guess encoding. Default: 'iso-8859-1'.
    includeOriginalDocument  include original document in the output?
                             Default: yes.
    ainodumpWarnings         issue warnings concerning ainodump conversion?
                             Default: yes.
    sourceEncodingFromMeta   read source encoding from Meta information?
                             Default: no.
    

=head2 HTML()

     my $alvisXML=$C->HTML($html_txt,$meta_txt,
                           {sourceEncoding=>'utf8',
                            sourceEncodingFromMeta=>0
                            });
     if (!defined($alvisXML))
     {
	warn $C->errmsg();
	$C->clearerr();
	next;
     }

=head2 newsXML()

     $meta_txt=$C->read_meta($news_xml_entries{$base_name}{metaF});
     if (!defined($meta_txt))
     {
         warn "Reading meta file " .
              "\"$news_xml_entries{$base_name}{metaF}\" failed. " .
              $C->errmsg();
         $C->clearerr();
         next;
     }
     my $alvisXMLs;
     $xml_txt=$C->read_news_XML($news_xml_entries{$base_name}{xmlF});
     if (!defined($xml_txt))
     {
         warn "Reading the news XML for basename \"$base_name\" failed. " .
               $C->errmsg();
         $C->clearerr();
         next;
     }
     $alvisXMLs=$C->newsXML($xml_txt,$meta_txt,$original_document_text);
     if (!defined($alvisXMLs))
     {
         warn "Obtaining the Alvis versions of the documents inside " .
              "\"$base_name\"'s XML file failed. " . $C->errmsg();
         $C->clearerr();
         next;
     }

=head2 ainodump()

    if (!$C->ainodump($ainodump_file))
    {
       warn "Obtaining the Alvis version of the " .
            "ainodump file \"$dump_entries{$base_name}{ainoF}\" " .
            "failed. " . $C->errmsg() if
              $Warnings;
       $C->clearerr();
    }



( run in 0.586 second using v1.01-cache-2.11-cpan-39bf76dae61 )