Alvis-Convert

 view release on metacpan or  search on metacpan

bin/alvisXMLmerge  view on Meta::CPAN


    my $is_merged = 0;

    if (defined($extra_file)) {
        print "extra file: $extra_file\n";
        merge($orig_filename, $out_filename, \%extra_all);
        compress($out_filename)
          if ($bzip2 || $orig_filename =~ /\.bz2$/);
        $is_merged = 1;
    } else {
        my @extra_filenames = guess_filename($extra_filename);
        for my $extra_filename (@extra_filenames) {
            if (-e $extra_filename) {
                print "extra file: $extra_filename\n" if ($VERBOSE);
                my %extra = read_extra_file($extra_filename, keys %config);

                my $start_time = [gettimeofday] if ($DEBUG);
                merge($orig_filename, $out_filename, \%extra);
                print "merge time: ", tv_interval($start_time, [gettimeofday]),
                  "\n"
                  if ($DEBUG);

bin/alvisXMLmerge  view on Meta::CPAN

################################################################################
# Compress $file in place with the external bzip2 program.
#
# Does nothing when the name already ends in '.bz2'. bzip2 is run through
# the list form of system(), so the file name is never parsed by a shell
# (names with spaces or shell metacharacters are safe), and '--' keeps a
# name starting with '-' from being read as an option. A failing bzip2 is
# reported with warn() but is not fatal.
sub compress
{
    my $file = shift;
    return if ($file =~ /\.bz2$/);

    system('bzip2', '--', $file) == 0
      or warn "bzip2 failed for '$file' (status $?)\n";
    return;
}

################################################################################
# Build the set of candidate names under which $filename might be stored:
# the name itself, the name without a trailing .bz2/.gz/.zip suffix, and -
# for names ending in .xml - the name with each of those suffixes appended.
# Returns the distinct candidates in no particular order.
sub guess_filename
{
    my ($filename) = @_;
    my %candidates = ($filename => 1);

    # Strip a compression suffix if the name carries one.
    for my $packed (qr/(.+)\.bz2$/, qr/(.+)\.gz$/, qr/(.+)\.zip$/) {
        if ($filename =~ $packed) {
            $candidates{$1} = 1;
        }
    }

    # A plain XML name may also exist in compressed form.
    if ($filename =~ /(.+)\.xml$/) {
        for my $suffix ('.bz2', '.gz', '.zip') {
            $candidates{$filename . $suffix} = 1;
        }
    }

    return keys %candidates;
}

__END__

=head1 NAME
    
    alvisXMLmerge.pl - script to merge ALVIS XML files from input directory 
with ALVIS XML nodes in extra directory or file

=head1 SYNOPSIS

bin/html2alvis  view on Meta::CPAN


    Sets the # of records per output directory. Default value: 1000.

=item B<--meta-encoding>

    Specifies the encoding of all meta files. Default value 'iso-8859-1'.

=item B<--html-encoding>

    Specifies the encoding of all HTML files.
    Default: undef (meaning 'guess').

=item B<--html-encoding-from-meta>

    Specifies whether the encoding of an HTML file should be read from
    the corresponding meta file. If no information is given there,
    --html-encoding is used, if that is not given, the encoding is guessed.
    Default: no.

=item B<--[no]original>

    Shall the original document be included in the output? Default
    value: yes.

=item B<--help>

    Prints a brief help message and exits.

bin/html2plain  view on Meta::CPAN


    Sets the output directory. Default value: '.'.

=item B<--N-per-out-dir>

    Sets the # of records per output directory. Default value: 1000.

=item B<--source-encoding>

    Specifies the encoding of the HTML files. Default value undef,
    which means that the encoding is guessed for each document.

=item B<--[no]assert-html>

    Specifies whether it is asserted that the document actually looks like
    HTML before trying to convert. Default: yes.

=item B<--[no]symbolic-char-entities-to-chars>

    Specifies whether symbolic character entities are converted to 
    UTF-8 characters. Default: yes.

lib/Alvis/Canonical.pm  view on Meta::CPAN


#############################################################################
#
#     Global variables & constants
#
##############################################################################

# Default settings. NOTE(review): these presumably back the constructor
# options described in the POD (warnings, convertCharEnts, convertNumEnts,
# sourceEncoding) -- confirm against new().
my $DEF_WARNINGS=0;    # add warning comments about fixes to the doc? (0 = no)
my $DEF_CONVERT_CHAR_ENTS=1; # convert "relevant" symbolic char entities (1 = yes)
my $DEF_CONVERT_NUM_ENTS=1;  # convert numerical entities (1 = yes)
my $DEF_SRC_ENC=undef; # undef = guess the source encoding

my $DEBUG=0; # NOTE(review): presumably enables debug output when true -- confirm

#############################################################################
#
#     Error message stuff
#
#############################################################################

my $ErrStr;

lib/Alvis/Canonical.pm  view on Meta::CPAN


    warnings         Issue warnings about badly faulty original HTML where
                     we have to resort to a heuristic solution.
                     Puts a warning to STDERR documenting the error and
                     the solution. Default: no.
    convertCharEnts  Convert HTML symbolic character entities to UTF-8 
                     characters? Default: yes.
    convertNumEnts   Convert HTML numerical character entities to UTF-8 
                     characters? Default: yes.
    sourceEncoding   the encoding of the source documents. Default: undef,
                     which means it is guessed.  
     
  my $C=Alvis::Canonical->new(convertCharEnts=>1,
                              convertNumEnts=>1);
  if (!defined($C))
  {
    die("Unable to instantiate Alvis::Canonical.");
  }

=head2 HTML($html,$options)

lib/Alvis/Convert.pm  view on Meta::CPAN


sub _process_ainodump_doc
{
    my $self=shift;
    my $text=shift;
    my $header=shift;

#    print Dumper($header);
#    print "\n";

    my ($type,$sub_type)=$self->{docTypeWizard}->guess($text);
    if (!(defined($type) && defined($sub_type)))
    {
	$self->_set_err_state($ERR_TYPE_GUESS,
			      $self->{docTypeWizard}->errmsg());
	return 0;
    }

#    print "TYPE:$type,SUBTYPE:$sub_type\n";
    
    if ($type eq 'text' && $sub_type eq 'html')

lib/Alvis/Convert.pm  view on Meta::CPAN

Converts document collections of different formats to Alvis XML
format.

=head1 METHODS

=head2 new()

Options:

    fileType                 the MIME type of the source file to convert. 
                             Default: guess.
    sourceEncoding           encoding of the source document. Default: guess.  
    urlFromBasename          extract URL from basename. Default: no.
    outputAtSameLocation     output Alvis XML to the same directories as the
                             source documents. Default: no.
    alvisSuffix              suffix of the output Alvis XML records. Default:
                             'alvis'.
    outputRootDir            root directory for output files. Default: '.'
    outputNPerSubdir         number of records output per subdirectory.
                             Default: 1000
    defaultDocType           first guess document (MIME) type. Default: 'text'.
    defaultDocSubType        first guess document subtype. Default: 'html'.
    defaultEncoding          first guess encoding. Default: 'iso-8859-1'.
    includeOriginalDocument  include original document in the output?
                             Default: yes.
    ainodumpWarnings         issue warnings concerning ainodump conversion?
                             Default: yes.
    sourceEncodingFromMeta   read source encoding from Meta information?
                             Default: no.
    

=head2 HTML()

lib/Alvis/Document/Encoding.pm  view on Meta::CPAN

$Alvis::Document::Encoding::VERSION = '0.1';

use HTML::Encoding;
use Encode;
use Encode::Guess;

use Alvis::Document::Type;

#############################################################################
#
#  A collection of routines for checking UTF-8 validity, guessing the
#  encoding of a document etc.
#
#############################################################################

#############################################################################
#
#     Global variables & constants
#
##############################################################################

lib/Alvis/Document/Encoding.pm  view on Meta::CPAN

	     $ERR_ILLEGAL_CODE=>"Illegal UTF-8 code.",
	     $ERR_DOC=>"No document text.",
	     $ERR_DOC_TYPE=>"No document type.",
	     $ERR_DOC_SUB_TYPE=>"No document subtype.",
	     $ERR_BOM=>"Byte order mark recognition failed miserably.",
	     $ERR_FIRST_CHARS=>"Guessing from the first characters " .
	     "failed miserably.",
	     $ERR_META=>"Guessing from the meta information " .
	     "failed miserably.",
	     $ERR_XML=>"Guessing from XML format failed miserably.",
	     $ERR_GUESS=>"Unable to guess at the encoding.",
	     $ERR_WRONG_GUESS=>"This pair does not convert:",
	     $ERR_ILLEGAL_CHAR=>"Illegal character in supposedly UTF-8 " .
	     "result.",
	     $ERR_DOC_TYPE_WIZARD=>"Instantiating Alvis::Document::Type",
	     $ERR_TYPE_GUESS=>"Guessing the document type failed.",
	     $ERR_ENCODE_GUESS=>"Encode::Guess failed.",
	     $ERR_GUESS_AND_CONVERT=>"Guessing an encoding and then " .
	     "converting failed.",
	     $ERR_UNABLE_TO_GUESS=>"Unable to guess at encoding name " .
	     "corrections."
   );

sub _set_err_state
{
    my $self=shift;
    my $errcode=shift;
    my $errmsg=shift;

    if (!defined($errcode))

lib/Alvis/Document/Encoding.pm  view on Meta::CPAN

	$i++;
    }

    return 1;
}

#
# Guess the encoding of a document.
#
# text:     The document text.
# type:     The type of the document as one of the recognized types
#           defined in Alvis::Document::Type (superset of MIME).
# sub_type: The subtype of the document. If type or sub_type is undefined,
#           both are guessed from the text by $self->{docTypeWizard}.
#
# Returns the list of guessed encoding names. For type 'text' with subtype
# 'html', 'xhtml' or 'plain' the matching private guesser is called; if it
# comes up empty, $ERR_GUESS is set and $self->{defaultEncoding} (when
# defined) is returned as the only guess. Any other type/subtype yields an
# empty list. In scalar context a successful call returns the number of
# guesses.
#
# On failure (no text, or the type cannot be guessed) the error state is
# set and the sub leaves through a bare return: an empty list in list
# context, undef in scalar context. An explicit "return undef" would hand
# list-context callers a one-element list that looks like a real guess.
#
sub guess
{
    my $self=shift;
    my $text=shift;
    my $type=shift;
    my $sub_type=shift;

    $self->_set_err_state($ERR_OK);  # clean the slate

    if (!defined($text))
    {
        $self->_set_err_state($ERR_DOC);
        return;
    }
    if (!(defined($type) && defined($sub_type)))
    {
        ($type,$sub_type)=$self->{docTypeWizard}->guess($text);
        if (!(defined($type) && defined($sub_type)))
        {
            $self->_set_err_state($ERR_TYPE_GUESS,
                                  $self->{docTypeWizard}->errmsg());
            return;
        }
    }

    # Private guesser method for each supported subtype of 'text'.
    my %guesser_for=(html=>'_HTML',
                     xhtml=>'_XHTML',
                     plain=>'_plain_text');

    my @guesses;
    if ($type eq 'text' && exists($guesser_for{$sub_type}))
    {
        my $guesser=$guesser_for{$sub_type};
        @guesses=$self->$guesser($text);
        if (scalar(@guesses)==0)
        {
            # Nothing found: record that and fall back on the default.
            $self->_set_err_state($ERR_GUESS);
            if (defined($self->{defaultEncoding}))
            {
                @guesses=($self->{defaultEncoding});
            }
        }
    }

    return @guesses;
}

sub from_to
{
    my $self=shift;
    my $text=shift;
    my $source_enc=shift;
    my $target_enc=shift;

    {

lib/Alvis/Document/Encoding.pm  view on Meta::CPAN

	    $self->_set_err_state($ERR_ILLEGAL_CHAR,
				  " Position: $err{pos}," .
				  "Code:$err{code}");
	    return undef;
	}
    }

    my $try=$self->from_to($text,$source_enc,$target_enc);
    if (!defined($try))
    {
	my @possible_src_typo_fixes=$self->guess_typo_fixes($source_enc);
	my @possible_trg_typo_fixes=$self->guess_typo_fixes($target_enc);

	for my $src_enc_guess ($self->guess_typo_fixes($source_enc))
	{
	    for my $trg_enc_guess ($self->guess_typo_fixes($target_enc))
	    {
		my $try=$self->from_to($text,$src_enc_guess,$trg_enc_guess);
		if (defined($try))
		{
		    return $try;
		}
	    }
	}
	
	$self->_set_err_state($ERR_UNABLE_TO_GUESS);
	return undef;
    }

    return $try;
}

#
# Guess the encoding of $text and convert the text to $target_enc.
#
# text:       The document text.
# type:       The document type; guessed by $self->{docTypeWizard} together
#             with sub_type if either of them is undefined.
# sub_type:   The document subtype.
# target_enc: The name of the encoding to convert to.
#
# Each encoding guess is tried in turn and the first successful conversion
# is returned. If the target is 'utf8' and a guess already names UTF-8, the
# text is returned unchanged. If no guess converts, the text is returned
# with its UTF8 flag set provided its bytes are well-formed UTF-8.
# Returns undef, with the error state set, if everything fails.
#
sub guess_and_convert
{
    my $self=shift;
    my $text=shift;
    my $type=shift;
    my $sub_type=shift;
    my $target_enc=shift;

    $self->_set_err_state($ERR_OK);

    if (!defined($text))
    {
        $self->_set_err_state($ERR_DOC);
        return undef;
    }
    if (!(defined($type) && defined($sub_type)))
    {
        ($type,$sub_type)=$self->{docTypeWizard}->guess($text);
        if (!(defined($type) && defined($sub_type)))
        {
            $self->_set_err_state($ERR_TYPE_GUESS,
                                  $self->{docTypeWizard}->errmsg());
            return undef;
        }
    }

    # Drop undefined entries so that a failed guess (a lone undef) is not
    # mistaken for an encoding name further down.
    my @enc_guesses=grep { defined($_) } $self->guess($text,$type,$sub_type);
    if (scalar(@enc_guesses)==0)
    {
        $self->_set_err_state($ERR_GUESS);
        return undef;
    }

    for my $enc_guess (@enc_guesses)
    {
        if ($target_enc eq "utf8" && ($enc_guess =~ /utf-?8/i))
        {
            return $text;
        }

        my $result=$self->convert($text,$enc_guess,$target_enc);
        if (defined($result))
        {
            return $result;
        }
    }

    # No guess converted. Last resort: test whether the text is UTF-8
    # already. is_utf8() needs a true CHECK argument here; without it, it
    # only reports the flag that _utf8_on() has just set and the test
    # could never fail.
    Encode::_utf8_on($text);
    if (Encode::is_utf8($text,1))
    {
        return $text;
    }
    Encode::_utf8_off($text);

    $self->_set_err_state($ERR_GUESS_AND_CONVERT);
    return undef;
}

sub guess_typo_fixes
{
    my $self=shift;
    my $typo=shift;

    my @possibilities=($typo);
    if ($typo=~/^\s*(?:utf|uft)-?8\s*$/isgo)
    {
	push(@possibilities,'utf8');
    }
    if ($typo=~/^\s*(?:utf|uft)-16\s*$/isgo)

lib/Alvis/Document/Encoding.pm  view on Meta::CPAN

{
    my $self=shift;
    my $text=shift;

    if (!defined($text) || length($text)<1)
    {
	$self->_set_err_state($ERR_DOC);
	return ();
    }

    my @guesses;

    eval
    {
	@guesses=HTML::Encoding::encoding_from_byte_order_mark($text,xhtml=>0);
    };
    if ($@)
    {
	$self->_set_err_state($ERR_BOM,"$@");
    }
    if (scalar(@guesses))
    {
	return @guesses;
    }

    # Sanity check to exclude e.g. UTF-32
    #
    eval
    {
	@guesses=
	    HTML::Encoding::encoding_from_first_chars($text);
    };
    if ($@)
    {
	$self->_set_err_state($ERR_FIRST_CHARS,"$@");
    }

    my @tries;
    if (scalar(@guesses))
    {
	@tries=@guesses;
    }
    else
    {
	@tries=@{$HTML::Encoding::DEFAULT_ENCODINGS};
    }
    foreach my $try (@tries)
    {
	if ($try=~/^\s*UTF-(?:16|32)((?:B|L)E)?\s*$/isgo)
	{
	    # HTML::Encoding is a bit imperfect

lib/Alvis/Document/Encoding.pm  view on Meta::CPAN

	eval
	{
	    @try_results=
		HTML::Encoding::encoding_from_meta_element($text,$try);
	};
	if ($@)
	{
	    $self->_set_err_state($ERR_META,"$@");
	}
	
	@guesses=(@try_results,@guesses);
    }

    return @guesses;
}

#
# Guess the encoding(s) of an XHTML document. HTML::Encoding is asked to
# read the text as an XML document first; if that gives no answer, the
# guess is delegated to _HTML(). Missing or empty text, or an exception
# from HTML::Encoding, sets the error state and yields an empty list.
#
sub _XHTML
{
    my ($self,$text)=@_;

    if (!defined($text) || !length($text))
    {
        $self->_set_err_state($ERR_DOC);
        return ();
    }

    my @found=eval { HTML::Encoding::encoding_from_xml_document($text) };
    if ($@)
    {
        $self->_set_err_state($ERR_XML,"$@");
        return ();
    }

    return @found if (scalar(@found));

    # The XML route gave nothing: treat the document as HTML.
    return $self->_HTML($text);
}

#
# Guess the encoding of plain text with Encode::Guess.
#
# Returns a one-element list holding the name of the guessed encoding.
# Returns an empty list, with the error state set, if there is no text or
# no encoding could be guessed.
#
sub _plain_text
{
    my $self=shift;
    my $text=shift;

    if (!defined($text) || length($text)<1)
    {
        $self->_set_err_state($ERR_DOC);
        return ();
    }

    # guess_encoding() returns an encoding object on success and a plain
    # error message string on failure. It does not report through $@, so
    # the returned string is what goes into the error state.
    my $enc=guess_encoding($text);
    if (ref($enc))
    {
        return ($enc->name());
    }

    $self->_set_err_state($ERR_ENCODE_GUESS,$enc);
    return ();
}

1;
__END__

=head1 NAME

Alvis::Document::Encoding - Perl extension for guessing and checking the
encoding of documents.

=head1 SYNOPSIS

 use Alvis::Document::Encoding;

 # Create a new instance
 my $e=Alvis::Document::Encoding->new();
 if (!defined($e))
 {

lib/Alvis/Document/Encoding.pm  view on Meta::CPAN

 # pass a placeholder in a hash ref argument:
 my %err=();
 if (!$e->is_utf8($text,\%err))
 {
    my $position=$err{pos};
    my $code=$err{code};
    . . . 
 }

 # 
 # Guess the encoding of a document given a guess for its type 
 #
 my $type_guesser=Alvis::Document::Type->new();
 my ($doc_type,$doc_sub_type)=$type_guesser->guess($text);
 my @doc_encodings=$e->guess($text,$doc_type,$doc_sub_type);
 if (!defined($doc_encodings[0]))
 {
     die('Cannot guess. ' . $e->errmsg());
 }

 # 
 # Try converting a document to UTF-8 with only its type known
 #
 my $type_guesser=Alvis::Document::Type->new();
 my ($doc_type,$doc_sub_type)=$type_guesser->guess($text);
 my $doc_in_utf8=$e->try_to_convert_to_utf8($text,$doc_type,$doc_sub_type);
 if (!defined($doc_in_utf8))
 {
     die('Cannot guess. ' . $e->errmsg());
 }
 
 # Try to guess what was meant 
 my @possibilities=$e->guess_typo_fixes('uft-8');

=head1 DESCRIPTION

A collection of methods for guessing, confirming and fixing the encoding
of a document.

=head1 METHODS

=head2 new()

Options:

    defaultDocType       default type for a document. Default: text.
    defaultDocSubType    default sub type for a document. Default: html.

lib/Alvis/Document/Encoding.pm  view on Meta::CPAN


=head2 is_utf8(text,err_hash_ref)

Returns 1 if all of the characters of $text are legal UTF-8.
Otherwise, returns 0 and sets an error message specifying the location
(1..) of the first illegal character code.
If you wish to obtain the position and offending code, pass a
hash ref ($err_hash_ref). The info is in $err_hash_ref->{pos} and
$err_hash_ref->{code}.

=head2 guess(text,doc_type,doc_sub_type)

Guess the encoding of a document given a guess for its type (and subtype).

=head2 guess_and_convert(text,doc_type,doc_sub_type,target_encoding)

Tries to first guess the encoding of the document given a guess at its
type and subtype, and then tries to convert it to $target_encoding.

=head2 convert(text,source_encoding,target_encoding)

Tries to convert $text from $source_encoding to $target_encoding.

=head2 guess_typo_fixes($typo)

Returns a set of guesses for the meant encoding in a case of an encoding
name containing typos.

=head2 errmsg()

Returns a stack of error messages, if any. Empty string otherwise.

=head1 SEE ALSO

Alvis::Document::Type

lib/Alvis/Document/Type.pm  view on Meta::CPAN

    if (defined(@_))
    {
        my %args=@_;
        @$self{ keys %args }=values(%args);
    }
}

#
# Returns similarly to MIME ($type,$sub_type), but is broader
#
sub guess
{
    my $self=shift;
    my $text=shift;

    $self->_set_err_state($ERR_OK);  # clean the slate

    if (!defined($text))
    {
	$self->_set_err_state($ERR_DOC);
	return undef;

lib/Alvis/Document/Type.pm  view on Meta::CPAN

1;




1;
__END__

=head1 NAME

Alvis::Document::Type - Perl extension for guessing and checking the type
of a document (an extension of MIME types).

=head1 SYNOPSIS

 use Alvis::Document::Type;

 # Create a new instance
 my $t=Alvis::Document::Type->new(defaultType=>'text',
                                  defaultSubType=>'html');
 if (!defined($t))
 {
    die('Ugh!');
 }

 my ($doc_type,$doc_sub_type)=$t->guess($doc_text);
 if (!(defined($doc_type) && defined($doc_sub_type)))
 {
    die("Guess what? " . $t->errmsg()); 
 }

=head1 DESCRIPTION

Tries to guess the type of a document similarly to MIME types
(type and a subtype).

Adds subtypes 'rss' and 'html' to MIME type 'text'.

=head1 METHODS

=head2 new()

Options:

    defaultType       The default type (text).
    defaultSubType    The default subtype (plain).

=head2 guess($text)

Tries to guess the type of $text.

=head2 errmsg()

Returns a stack of error messages, if any. Empty string otherwise.

=head1 SEE ALSO


=head1 AUTHOR

lib/Alvis/HTML.pm  view on Meta::CPAN

						$src_enc,
						'utf8');
	    if (!defined($html))
	    {
		$self->_set_err_state($ERR_UTF8_CONV,
				      $self->{encodingWiz}->errmsg());
		return (undef,\%header);  # signals "do not pass on"
	    }
	}
    }
    else # try guessing the encoding
    {
	$html=$self->{encodingWiz}->guess_and_convert($html,
						      'text',
						      'html',
						      'utf8');
	if (!defined($html))
	{
	    $self->_set_err_state($ERR_GUESS_ENC_UTF8_CONV,
				  $self->{encodingWiz}->errmsg());
	    return (undef,\%header);  # signals "do not pass on"
	}
    }

lib/Alvis/HTML.pm  view on Meta::CPAN

    wml                WML

     Note: alvisKeep + alvisRemove == remove all HTML 4.01 tags

    convertCharEnts    convert symbolic character entities to UTF-8 characters.
    convertNumEnts     convert numerical character entities to UTF-8 
                       characters.  

    sourceEncoding     encoding of the source HTML text (default: 'utf-8')
                       If not 'utf-8', HTML is converted to UTF-8.
                       If undefined, the encoding is guessed first.

    assertSourceAssumptions
 
                       make sure that before any operations the source is
                       in UTF-8 and contains no null bytes.

=head2 clean(html,options)

Remove unwanted tags from $html (text). $options is
a mechanism for returning the title and base URL of the document and

lib/Alvis/HTML.pm  view on Meta::CPAN

         		       baseURL=>1});

In $options you can also set the source and target encodings
(sourceEncoding,targetEncoding).

   my ($txt,$header)=$C->clean($html,
                              {title=>1,
         		       baseURL=>1,
                               sourceEncoding=>'iso-8859-1'});

This will guess the encoding first:

   my ($txt,$header)=$C->clean($html,
                              {title=>1,
         		       baseURL=>1,
                               sourceEncoding=>undef});

The call with sourceEncoding=>'iso-8859-1' converts from 'iso-8859-1' to
the default output encoding (UTF-8).

=head2 errmsg()

t/test-data/to-split/29.xml  view on Meta::CPAN

<documentRecord id="3DAB2F05CBCFBD7765C7E71C63E6FFE8" xmlns="http://alvis.info/enriched/">
    <acquisition>
      <acquisitionData>
        <modifiedDate>1145563212583</modifiedDate>
        <httpServer>Apache/2.0</httpServer>
        <urls>
          <url>http://google.weblogsinc.com/2006/04/20/google-has-been-testing-google-base-in-search-results/</url>
        </urls>
      </acquisitionData>
      <canonicalDocument>        
        <section>Google is always hard at work fine tuning and trying out new search strategies. Apparantly Google is now hard at work integrating Google Base car searches into the organic results says Jason Dowdell. He recently came across a car sea...
      <metaData>
        <meta name="title">Google has been Testing Google Base in Search Results</meta>
        <meta name="dc:type">text/html</meta>
      </metaData>
      <links>
        <outlinks>
          <link type="a">
            <anchorText>Google Base</anchorText>
            <location>http://base.google.com/</location>
          </link>



( run in 0.432 second using v1.01-cache-2.11-cpan-702932259ff )