view release on metacpan or search on metacpan
bin/alvisXMLmerge view on Meta::CPAN
my $is_merged = 0;
if (defined($extra_file)) {
print "extra file: $extra_file\n";
merge($orig_filename, $out_filename, \%extra_all);
compress($out_filename)
if ($bzip2 || $orig_filename =~ /\.bz2$/);
$is_merged = 1;
} else {
my @extra_filenames = guess_filename($extra_filename);
for my $extra_filename (@extra_filenames) {
if (-e $extra_filename) {
print "extra file: $extra_filename\n" if ($VERBOSE);
my %extra = read_extra_file($extra_filename, keys %config);
my $start_time = [gettimeofday] if ($DEBUG);
merge($orig_filename, $out_filename, \%extra);
print "merge time: ", tv_interval($start_time, [gettimeofday]),
"\n"
if ($DEBUG);
bin/alvisXMLmerge view on Meta::CPAN
################################################################################
sub compress
{
    # Compress $file in place with the external bzip2 program, unless its
    # name already ends in ".bz2".
    #
    # Parameters:
    #   $file - path of the file to compress.
    #
    # Returns nothing. A failing bzip2 run is reported with warn() instead
    # of being silently ignored.
    my $file = shift;

    return if (!defined($file) || $file =~ /\.bz2$/);

    # List-form system() bypasses the shell, so spaces or shell
    # metacharacters in the file name can neither break the command nor be
    # executed. "--" keeps a name starting with "-" from being read as an
    # option.
    if (system('bzip2', '--', $file) != 0) {
        warn "bzip2 failed for '$file' (status $?)\n";
    }
    return;
}
################################################################################
sub guess_filename
{
    # Build the set of candidate file names for the given name: the name
    # itself, the name with a trailing .bz2/.gz/.zip suffix removed, and,
    # for a name ending in .xml, the name with each of those suffixes added.
    #
    # Returns the candidates as a list in no particular order (hash keys).
    my ($name) = @_;
    my @suffixes = qw(bz2 gz zip);
    my %candidates = ($name => 1);

    # Uncompressed variant of an already-compressed name.
    for my $suffix (@suffixes) {
        if ($name =~ /(.+)\.\Q$suffix\E$/) {
            $candidates{$1} = 1;
        }
    }

    # Compressed variants of a plain XML name.
    if ($name =~ /.+\.xml$/) {
        for my $suffix (@suffixes) {
            $candidates{$name . '.' . $suffix} = 1;
        }
    }

    return keys %candidates;
}
__END__
=head1 NAME
alvisXMLmerge.pl - script to merge ALVIS XML files from input directory
with ALVIS XML nodes in extra directory or file
=head1 SYNOPSIS
bin/html2alvis view on Meta::CPAN
Sets the # of records per output directory. Default value: 1000.
=item B<--meta-encoding>
Specifies the encoding of all meta files. Default value 'iso-8859-1'.
=item B<--html-encoding>
Specifies the encoding of all HTML files.
Default: undef (meaning 'guess').
=item B<--html-encoding-from-meta>
Specifies whether the encoding of an HTML file should be read from
the corresponding meta file. If no information is given there,
--html-encoding is used, if that is not given, the encoding is guessed.
Default: no.
=item B<--[no]original>
Shall the original document be included in the output? Default
value: yes.
=item B<--help>
Prints a brief help message and exits.
bin/html2plain view on Meta::CPAN
Sets the output directory. Default value: '.'.
=item B<--N-per-out-dir>
Sets the # of records per output directory. Default value: 1000.
=item B<--source-encoding>
Specifies the encoding of the HTML files. Default value undef,
which means that the encoding is guessed for each document.
=item B<--[no]assert-html>
Specifies whether it is asserted that the document actually looks like
HTML before trying to convert. Default: yes.
=item B<--[no]symbolic-char-entities-to-chars>
Specifies whether symbolic character entities are converted to
UTF-8 characters. Default: yes.
lib/Alvis/Canonical.pm view on Meta::CPAN
#############################################################################
#
# Global variables & constants
#
##############################################################################
# NOTE(review): the $DEF_* values look like the defaults for the constructor
# options described in this module's POD (warnings, convertCharEnts,
# convertNumEnts, sourceEncoding) -- confirm against new()/_init().
my $DEF_WARNINGS=0; # add warning comments about fixes to the doc?
my $DEF_CONVERT_CHAR_ENTS=1; # convert "relevant" char ents
my $DEF_CONVERT_NUM_ENTS=1; # convert numerical entities
my $DEF_SRC_ENC=undef; # guess the source encoding
my $DEBUG=0; # presumably a debug-output switch, off by default; its use is not visible here
#############################################################################
#
# Error message stuff
#
#############################################################################
my $ErrStr;
lib/Alvis/Canonical.pm view on Meta::CPAN
warnings Issue warnings about badly faulty original HTML where
we have to resort to an heuristic solution.
Puts a warning to STDERR documenting the error and
the solution. Default: no.
convertCharEnts Convert HTML symbolic character entities to UTF-8
characters? Default: yes.
convertNumEnts Convert HTML numerical character entities to UTF-8
characters? Default: yes.
sourceEncoding the encoding of the source documents. Default: undef,
which means it is guessed.
my $C=Alvis::Canonical->new(convertCharEnts=>1,
convertNumEnts=>1);
if (!defined($C))
{
die("Unable to instantiate Alvis::Canonical.");
}
=head2 HTML($html,$options)
lib/Alvis/Convert.pm view on Meta::CPAN
sub _process_ainodump_doc
{
my $self=shift;
my $text=shift;
my $header=shift;
# print Dumper($header);
# print "\n";
my ($type,$sub_type)=$self->{docTypeWizard}->guess($text);
if (!(defined($type) && defined($sub_type)))
{
$self->_set_err_state($ERR_TYPE_GUESS,
$self->{docTypeWizard}->errmsg());
return 0;
}
# print "TYPE:$type,SUBTYPE:$sub_type\n";
if ($type eq 'text' && $sub_type eq 'html')
lib/Alvis/Convert.pm view on Meta::CPAN
Converts document collections of different formats to Alvis XML
format.
=head1 METHODS
=head2 new()
Options:
fileType the MIME type of the source file to convert.
Default: guess.
sourceEncoding encoding of the source document. Default: guess.
urlFromBasename extract URL from basename. Default: no.
outputAtSameLocation output Alvis XML to the same directories as the
source documents. Default: no.
alvisSuffix suffix of the output Alvis XML records. Default:
'alvis'.
outputRootDir root directory for output files. Default: '.'
outputNPerSubdir number of records output per subdirectory.
Default: 1000
defaultDocType first guess document (MIME) type. Default: 'text'.
defaultDocSubType first guess document subtype. Default: 'html'.
defaultEncoding first guess encoding. Default: 'iso-8859-1'.
includeOriginalDocument include original document in the output?
Default: yes.
ainodumpWarnings issue warnings concerning ainodump conversion?
Default: yes.
sourceEncodingFromMeta read source encoding from Meta information?
Default: no.
=head2 HTML()
lib/Alvis/Document/Encoding.pm view on Meta::CPAN
$Alvis::Document::Encoding::VERSION = '0.1';
use HTML::Encoding;
use Encode;
use Encode::Guess;
use Alvis::Document::Type;
#############################################################################
#
# A collection of routines for checking UTF-8 validity, guessing the
# encoding of a document etc.
#
#############################################################################
#############################################################################
#
# Global variables & constants
#
##############################################################################
lib/Alvis/Document/Encoding.pm view on Meta::CPAN
$ERR_ILLEGAL_CODE=>"Illegal UTF-8 code.",
$ERR_DOC=>"No document text.",
$ERR_DOC_TYPE=>"No document type.",
$ERR_DOC_SUB_TYPE=>"No document subtype.",
$ERR_BOM=>"Byte order mark recognition failed miserably.",
$ERR_FIRST_CHARS=>"Guessing from the first characters " .
"failed miserably.",
$ERR_META=>"Guessing from the meta information " .
"failed miserably.",
$ERR_XML=>"Guessing from XML format failed miserably.",
$ERR_GUESS=>"Unable to guess at the encoding.",
$ERR_WRONG_GUESS=>"This pair does not convert:",
$ERR_ILLEGAL_CHAR=>"Illegal character in supposedly UTF-8 " .
"result.",
$ERR_DOC_TYPE_WIZARD=>"Instantiating Alvis::Document::Type",
$ERR_TYPE_GUESS=>"Guessing the document type failed.",
$ERR_ENCODE_GUESS=>"Encode::Guess failed.",
$ERR_GUESS_AND_CONVERT=>"Guessing an encoding and then " .
"converting failed.",
$ERR_UNABLE_TO_GUESS=>"Unable to guess at encoding name " .
"corrections."
);
sub _set_err_state
{
my $self=shift;
my $errcode=shift;
my $errmsg=shift;
if (!defined($errcode))
lib/Alvis/Document/Encoding.pm view on Meta::CPAN
$i++;
}
return 1;
}
#
# type: The type of the document as one of the recognized types
# defined in Alvis::Document::Type (superset of MIME).
#
sub guess
{
    # Guess the character encoding(s) of a document.
    #
    # Parameters:
    #   $text     - the document text (required).
    #   $type     - document type; if it or $sub_type is undefined, both are
    #               obtained from $self->{docTypeWizard}->guess($text).
    #   $sub_type - document subtype ('html', 'xhtml' or 'plain' are handled
    #               for type 'text').
    #
    # Returns the list of guessed encoding names. On failure (no text, or
    # the document type cannot be guessed) returns an empty list in list
    # context and undef in scalar context, after setting the error state.
    # If the subtype-specific guesser finds nothing, $ERR_GUESS is recorded
    # and $self->{defaultEncoding}, when defined, is returned as the only
    # guess. For any other type/subtype the result is an empty list.
    my $self=shift;
    my $text=shift;
    my $type=shift;
    my $sub_type=shift;

    $self->_set_err_state($ERR_OK); # clean the slate

    if (!defined($text))
    {
        $self->_set_err_state($ERR_DOC);
        # Bare return: "return undef" would hand list-context callers the
        # one-element list (undef), which defeats their emptiness checks.
        return;
    }

    if (!(defined($type) && defined($sub_type)))
    {
        ($type,$sub_type)=$self->{docTypeWizard}->guess($text);
        if (!(defined($type) && defined($sub_type)))
        {
            $self->_set_err_state($ERR_TYPE_GUESS,
                                  $self->{docTypeWizard}->errmsg());
            return;
        }
    }

    # Subtype of 'text' => name of the method that guesses its encoding.
    my %guesser_for=(html=>'_HTML',
                     xhtml=>'_XHTML',
                     plain=>'_plain_text');

    my @guesses;
    if ($type eq 'text' && exists($guesser_for{$sub_type}))
    {
        my $guesser=$guesser_for{$sub_type};
        @guesses=$self->$guesser($text);
        if (scalar(@guesses)==0)
        {
            $self->_set_err_state($ERR_GUESS);
            if (defined($self->{defaultEncoding}))
            {
                @guesses=($self->{defaultEncoding});
            }
        }
    }

    return @guesses;
}
sub from_to
{
my $self=shift;
my $text=shift;
my $source_enc=shift;
my $target_enc=shift;
{
lib/Alvis/Document/Encoding.pm view on Meta::CPAN
$self->_set_err_state($ERR_ILLEGAL_CHAR,
" Position: $err{pos}," .
"Code:$err{code}");
return undef;
}
}
my $try=$self->from_to($text,$source_enc,$target_enc);
if (!defined($try))
{
my @possible_src_typo_fixes=$self->guess_typo_fixes($source_enc);
my @possible_trg_typo_fixes=$self->guess_typo_fixes($target_enc);
for my $src_enc_guess ($self->guess_typo_fixes($source_enc))
{
for my $trg_enc_guess ($self->guess_typo_fixes($target_enc))
{
my $try=$self->from_to($text,$src_enc_guess,$trg_enc_guess);
if (defined($try))
{
return $try;
}
}
}
$self->_set_err_state($ERR_UNABLE_TO_GUESS);
return undef;
}
return $try;
}
sub guess_and_convert
{
    # Guess the encoding of a document and convert it to a target encoding.
    #
    # Parameters:
    #   $text       - the document text (required).
    #   $type       - document type; guessed via $self->{docTypeWizard} when
    #                 it or $sub_type is undefined.
    #   $sub_type   - document subtype.
    #   $target_enc - name of the encoding to convert to.
    #
    # Returns the converted text, or undef after setting the error state.
    # Each guess from $self->guess() is tried in order. When the target is
    # "utf8" and a guess already names UTF-8, $text is returned unchanged.
    # If no guess converts, $text is returned as-is provided its bytes are
    # well-formed UTF-8.
    my $self=shift;
    my $text=shift;
    my $type=shift;
    my $sub_type=shift;
    my $target_enc=shift;

    $self->_set_err_state($ERR_OK);

    if (!defined($text))
    {
        $self->_set_err_state($ERR_DOC);
        return undef;
    }

    if (!(defined($type) && defined($sub_type)))
    {
        ($type,$sub_type)=$self->{docTypeWizard}->guess($text);
        if (!(defined($type) && defined($sub_type)))
        {
            $self->_set_err_state($ERR_TYPE_GUESS,
                                  $self->{docTypeWizard}->errmsg());
            return undef;
        }
    }

    my @enc_guesses=$self->guess($text,$type,$sub_type);
    if (scalar(@enc_guesses)==0)
    {
        $self->_set_err_state($ERR_GUESS);
        return undef;
    }

    for my $enc_guess (@enc_guesses)
    {
        next if (!defined($enc_guess));

        if (defined($target_enc) && $target_enc eq "utf8" &&
            $enc_guess =~ /utf-?8/i)
        {
            return $text;
        }

        my $result=$self->convert($text,$enc_guess,$target_enc);
        if (defined($result))
        {
            return $result;
        }
    }

    # No guess converted: test whether the text is UTF-8 already.
    # is_utf8() without CHECK only reports the flag that _utf8_on() has just
    # set, so it was always true. CHECK=1 also verifies that the bytes are
    # well-formed UTF-8, which makes the error path below reachable.
    Encode::_utf8_on($text);
    if (Encode::is_utf8($text,1))
    {
        return $text;
    }
    Encode::_utf8_off($text);

    $self->_set_err_state($ERR_GUESS_AND_CONVERT);
    return undef;
}
sub guess_typo_fixes
{
my $self=shift;
my $typo=shift;
my @possibilities=($typo);
if ($typo=~/^\s*(?:utf|uft)-?8\s*$/isgo)
{
push(@possibilities,'utf8');
}
if ($typo=~/^\s*(?:utf|uft)-16\s*$/isgo)
lib/Alvis/Document/Encoding.pm view on Meta::CPAN
{
my $self=shift;
my $text=shift;
if (!defined($text) || length($text)<1)
{
$self->_set_err_state($ERR_DOC);
return ();
}
my @guesses;
eval
{
@guesses=HTML::Encoding::encoding_from_byte_order_mark($text,xhtml=>0);
};
if ($@)
{
$self->_set_err_state($ERR_BOM,"$@");
}
if (scalar(@guesses))
{
return @guesses;
}
# Sanity check to exclude e.g. UTF-32
#
eval
{
@guesses=
HTML::Encoding::encoding_from_first_chars($text);
};
if ($@)
{
$self->_set_err_state($ERR_FIRST_CHARS,"$@");
}
my @tries;
if (scalar(@guesses))
{
@tries=@guesses;
}
else
{
@tries=@{$HTML::Encoding::DEFAULT_ENCODINGS};
}
foreach my $try (@tries)
{
if ($try=~/^\s*UTF-(?:16|32)((?:B|L)E)?\s*$/isgo)
{
# HTML::Encoding is a bit imperfect
lib/Alvis/Document/Encoding.pm view on Meta::CPAN
eval
{
@try_results=
HTML::Encoding::encoding_from_meta_element($text,$try);
};
if ($@)
{
$self->_set_err_state($ERR_META,"$@");
}
@guesses=(@try_results,@guesses);
}
return @guesses;
}
sub _XHTML
{
    # Guess the encoding of an XHTML document: first from its XML form via
    # HTML::Encoding::encoding_from_xml_document(), then, if that yields
    # nothing, with the plain HTML heuristics of _HTML().
    #
    # Returns a list of encoding names; an empty list when there is no text
    # or the XML-based guess dies (the error state is set in both cases).
    my ($self,$text)=@_;

    unless (defined($text) && length($text)>=1)
    {
        $self->_set_err_state($ERR_DOC);
        return ();
    }

    my @from_xml=eval
    {
        HTML::Encoding::encoding_from_xml_document($text);
    };
    if ($@)
    {
        $self->_set_err_state($ERR_XML,"$@");
        return ();
    }

    return @from_xml if (scalar(@from_xml));

    return $self->_HTML($text);
}
sub _plain_text
{
    # Guess the encoding of a plain-text document with
    # Encode::Guess::guess_encoding().
    #
    # Parameters:
    #   $text - the document text.
    #
    # Returns a one-element list holding the guessed encoding's name, or an
    # empty list (with the error state set) when there is no text or the
    # guess fails.
    my $self=shift;
    my $text=shift;

    if (!defined($text) || length($text)<1)
    {
        $self->_set_err_state($ERR_DOC);
        return ();
    }

    my $enc=guess_encoding($text);
    if (ref($enc))
    {
        return ($enc->name());
    }

    # On failure guess_encoding() returns the error message itself rather
    # than dying, so the message is in $enc; $@ would only hold whatever an
    # earlier, unrelated eval left behind.
    $self->_set_err_state($ERR_ENCODE_GUESS,defined($enc) ? "$enc" : '');
    return ();
}
1;
__END__
=head1 NAME
Alvis::Document::Encoding - Perl extension for guessing and checking the
encoding of documents.
=head1 SYNOPSIS
use Alvis::Document::Encoding;
# Create a new instance
my $e=Alvis::Document::Encoding->new();
if (!defined($e))
{
lib/Alvis/Document/Encoding.pm view on Meta::CPAN
# pass a placeholder in a hash ref argument:
my %err=();
if (!$e->is_utf8($text,\%err))
{
my $position=$err{pos};
my $code=$err{code};
. . .
}
#
# Guess the encoding of a document given a guess for its type
#
my $type_guesser=Alvis::Document::Type->new();
my ($doc_type,$doc_sub_type)=$type_guesser->guess($text);
my ($doc_encoding)=$e->guess($text,$doc_type,$doc_sub_type); # guess() returns a list; take the first guess
if (!defined($doc_encoding))
{
die('Cannot guess. ' . $e->errmsg());
}
#
# Try converting a document to UTF-8 with only its type known
#
my $type_guesser=Alvis::Document::Type->new();
my ($doc_type,$doc_sub_type)=$type_guesser->guess($text);
my $doc_in_utf8=$e->try_to_convert_to_utf8($text,$doc_type,$doc_sub_type);
if (!defined($doc_in_utf8))
{
die('Cannot guess. ' . $e->errmsg());
}
# Try to guess what was meant
my @possibilities=$e->guess_typo_fixes('uft-8');
=head1 DESCRIPTION
A collection of methods for guessing, confirming and fixing the encoding
of a document.
=head1 METHODS
=head2 new()
Options:
defaultDocType default type for a document. Default: text.
defaultDocSubType default sub type for a document. Default: html.
lib/Alvis/Document/Encoding.pm view on Meta::CPAN
=head2 is_utf8(text,err_hash_ref)
Returns 1 if all of the characters of $text are legal UTF-8.
Otherwise, returns 0 and sets an error message specifying the location
(1..) of the first illegal character code.
If you wish to obtain the position and offending code, pass a
hash ref ($err_hash_ref). The info is in $err_hash_ref->{pos} and
$err_hash_ref->{code}.
=head2 guess(text,doc_type,doc_sub_type)
Guess the encoding of a document given a guess for its type (and subtype).
=head2 guess_and_convert(text,doc_type,doc_sub_type,target_encoding)
Tries to first guess the encoding of the document given a guess at its
type and subtype, and then tries to convert it to $target_encoding.
=head2 convert(text,source_encoding,target_encoding)
Tries to convert $text from $source_encoding to $target_encoding.
=head2 guess_typo_fixes($typo)
Returns a set of guesses for the meant encoding in a case of an encoding
name containing typos.
=head2 errmsg()
Returns a stack of error messages, if any. Empty string otherwise.
=head1 SEE ALSO
Alvis::Document::Type
lib/Alvis/Document/Type.pm view on Meta::CPAN
if (defined(@_))
{
my %args=@_;
@$self{ keys %args }=values(%args);
}
}
#
# Returns similarly to MIME ($type,$sub_type), but is broader
#
sub guess
{
my $self=shift;
my $text=shift;
$self->_set_err_state($ERR_OK); # clean the slate
if (!defined($text))
{
$self->_set_err_state($ERR_DOC);
return undef;
lib/Alvis/Document/Type.pm view on Meta::CPAN
1;
1;
__END__
=head1 NAME
Alvis::Document::Type - Perl extension for guessing and checking the type
of a document (an extension of MIME types).
=head1 SYNOPSIS
use Alvis::Document::Type;
# Create a new instance
my $t=Alvis::Document::Type->new(defaultType=>'text',
defaultSubType=>'html');
if (!defined($t))
{
die('Ugh!');
}
my ($doc_type,$doc_sub_type)=$t->guess($doc_text);
if (!(defined($doc_type) && defined($doc_sub_type)))
{
die("Guess what? " . $t->errmsg());
}
=head1 DESCRIPTION
Tries to guess the type of a document similarly to MIME types
(type and a subtype).
Adds subtypes 'rss' and 'html' to MIME type 'text'.
=head1 METHODS
=head2 new()
Options:
defaultType The default type (text).
defaultSubType The default subtype (plain).
=head2 guess($text)
Tries to guess the type of $text.
=head2 errmsg()
Returns a stack of error messages, if any. Empty string otherwise.
=head1 SEE ALSO
=head1 AUTHOR
lib/Alvis/HTML.pm view on Meta::CPAN
$src_enc,
'utf8');
if (!defined($html))
{
$self->_set_err_state($ERR_UTF8_CONV,
$self->{encodingWiz}->errmsg());
return (undef,\%header); # signals "do not pass on"
}
}
}
else # try guessing the encoding
{
$html=$self->{encodingWiz}->guess_and_convert($html,
'text',
'html',
'utf8');
if (!defined($html))
{
$self->_set_err_state($ERR_GUESS_ENC_UTF8_CONV,
$self->{encodingWiz}->errmsg());
return (undef,\%header); # signals "do not pass on"
}
}
lib/Alvis/HTML.pm view on Meta::CPAN
wml WML
Note: alvisKeep + alvisRemove == remove all HTML 4.01 tags
convertCharEnts convert symbolic character entities to UTF-8 characters.
convertNumEnts convert numerical character entities to UTF-8
characters.
sourceEncoding encoding of the source HTML text (default: 'utf-8')
If not 'utf-8', HTML is converted to UTF-8.
If undefined, the encoding is guessed first.
assertSourceAssumptions
make sure that before any operations the source is
in UTF-8 and contains no null bytes.
=head2 clean(html,options)
Remove unwanted tags from $html (text). $options is
a mechanism for returning the title and base URL of the document and
lib/Alvis/HTML.pm view on Meta::CPAN
baseURL=>1});
In $options you can also set the source and target encodings
(sourceEncoding,targetEncoding).
my ($txt,$header)=$C->clean($html,
{title=>1,
baseURL=>1,
sourceEncoding=>'iso-8859-1'});
This will guess the encoding first:
my ($txt,$header)=$C->clean($html,
{title=>1,
baseURL=>1,
sourceEncoding=>undef});
will convert from 'iso-8859-1' to default output encoding (UTF-8).
=head2 errmsg()
t/test-data/to-split/29.xml view on Meta::CPAN
<documentRecord id="3DAB2F05CBCFBD7765C7E71C63E6FFE8" xmlns="http://alvis.info/enriched/">
<acquisition>
<acquisitionData>
<modifiedDate>1145563212583</modifiedDate>
<httpServer>Apache/2.0</httpServer>
<urls>
<url>http://google.weblogsinc.com/2006/04/20/google-has-been-testing-google-base-in-search-results/</url>
</urls>
</acquisitionData>
<canonicalDocument>
<section>Google is always hard at work fine tuning and trying out new search strategies. Apparantly Google is now hard at work integrating Google Base car searches into the organic results says Jason Dowdell. He recently came across a car sea...
<metaData>
<meta name="title">Google has been Testing Google Base in Search Results</meta>
<meta name="dc:type">text/html</meta>
</metaData>
<links>
<outlinks>
<link type="a">
<anchorText>Google Base</anchorText>
<location>http://base.google.com/</location>
</link>