view release on metacpan or search on metacpan
bin/alvisDecollect view on Meta::CPAN
use Encode;
###################### CONFIGURATION #####################
my $GROUPELEMENT = "documentCollection";
############ END CONFIGURATION ######################
# encoding pragmas follow any includes like "use"
use encoding 'utf8';
use open ':utf8';
my $header = "";
my $collected = 0;
# Copy input lines to STDOUT up to and including the first line that
# contains the opening tag of the group element followed by a space.
# The defined() test is spelled out because Perl only adds it implicitly
# when the readline is the whole loop condition; without it, a final
# input line consisting of "0" with no trailing newline would be treated
# as false and end the loop before that line is printed.
while ( !$collected && defined( $_ = <> ) ) {
    print;
    if ( /<$GROUPELEMENT / ) {
        $collected = 1;
    }
}
bin/alvisSource view on Meta::CPAN
# Feeds docs from DIR/*.xml into the pipeline. Waits a number of minutes before each send.
use strict;
use warnings;
use Getopt::Long;
use Pod::Usage;
use Alvis::Pipeline;
# use Data::Dumper;
use encoding 'utf8';
use open ':utf8';
binmode STDIN, ":utf8";
binmode STDERR, ":utf8";
# Initial values of the script's settings. Presumably these are
# overridden by command-line options parsed further down (not visible
# here) -- TODO confirm.
my ( $PIPE_WRITEPORT, $PIPE_WRITEHOST ) = ( 0, "localhost" );
my $verbose = 0;

# @dirs starts out empty; the remaining settings get explicit defaults.
my @dirs;
my $sleep    = 0;
my $shutdown = 0;
my $host     = 'localhost';
my $port     = 10000;
bin/alvisXMLjoin view on Meta::CPAN
#!/usr/bin/perl -w
use strict;
use Getopt::Long;
use Pod::Usage;
use Encode;
use Alvis::Utils qw(open_file);
use encoding 'utf8';
use open ':utf8';
# Fixed XML fragments: the XML declaration, and the opening and closing
# tags of the documentCollection wrapper element (Alvis "enriched"
# namespace, version 1.1).
my $xml_header = '<?xml version="1.0" encoding="UTF-8"?>';
my $doc_col = '<documentCollection xmlns="http://alvis.info/enriched/" version="1.1">';
my $doc_col_end = '</documentCollection>';
################################################################################
# main
join_files(read_params());
bin/alvisXMLmerge view on Meta::CPAN
#!/usr/bin/perl -w
use strict;
use Getopt::Long;
use Pod::Usage;
use Encode;
use File::Copy;
use File::Path;
use encoding 'utf8';
use open ':utf8';
use Time::HiRes qw(gettimeofday tv_interval);
use Alvis::Utils qw(absolutize_path open_file get_files);
####################### global vars
my $VERBOSE = 1;
my $DEBUG = 0;
################################################################################
# main sub
bin/alvisXMLsplit view on Meta::CPAN
$F=shift @ARGV;
}
}
pod2usage(1) if @ARGV != 2;
my $Size=shift @ARGV;
my $ODir=shift @ARGV;
$report = 0; # switch off STDERR status
# NOTE(review): the "encoding" pragma is deprecated, and per perl5260delta
# its default mode is no longer supported from Perl 5.26 on -- confirm on
# the target perl and consider "use utf8" plus explicit binmode/open
# layers instead.
use encoding 'utf8';
use open ':utf8';
# Open the input as handle W. The records are expected to arrive with
# each documentRecord element starting on a line of its own.
if ( $bz || $bz_in ) {
# Read through bzcat and pipe the result through a perl one-liner that
# inserts a newline before every "<documentRecord".
# NOTE(review): $F is interpolated into a shell command unquoted, so a
# file name containing spaces or shell metacharacters will break the
# command or be interpreted by the shell.
open(W,"bzcat $F | perl -p -e \"s/<documentRecord/\n<documentRecord/g;\" |")
|| die("Unable to open \"$F\"");
} else {
# Otherwise open_file() supplies the handle. The one-per-line rewrite
# below is commented out, so presumably the input is already in that
# form or open_file() takes care of it -- TODO confirm.
*W = open_file($F);
#open(W,"perl -p -e \"s/<documentRecord/\n<documentRecord/g;\" $F |")
# || die("Unable to open \"$F\"");
}
bin/alvisXMLsplit view on Meta::CPAN
# Split the record stream read from W into files of $Size records each.
# Every output file is "$ODir/<index>.xml"; it starts with
# $CollectionHeader and ends with a closing </documentCollection> tag.
# Indices run consecutively from $offset. When $bz is set, each file is
# compressed with bzip2 after it has been written.

# Writes one finished collection text to "$ODir/<index>.xml".
my $write_collection = sub {
    my ($index, $text) = @_;

    my $out_f = "$ODir/$index.xml";
    open(OUT, ">:utf8", $out_f) || die("Unable to open $out_f");
    print OUT $text;
    close(OUT);
    if ( $bz ) {
        # List form: the file name reaches bzip2 as a single argument
        # instead of being re-parsed by a shell.
        system("bzip2", $out_f);
    }
};

my $N = 1;
my $Collection = $CollectionHeader;
while (my $record = &get_next_rec(*W)) {
    $Collection .= $record;
    if ($N % $Size == 0) {
        # $N is a multiple of $Size here, so this is full file number
        # $N/$Size, which gets index $offset + $N/$Size - 1.
        $Collection .= "</documentCollection>\n";
        my $FN = int($N / $Size) + $offset - 1;
        $write_collection->($FN, $Collection);
        $Collection = $CollectionHeader;
    }
    if ( $report ) { print STDERR "$N\r"; }
    $N++;
}
if ( $report ) { print STDERR "\n"; }

# $N - 1 records were read in total; write out a trailing partial
# collection if there is one.
if (($N - 1) % $Size) {
    $Collection .= "</documentCollection>\n";
    # int(($N-1)/$Size) full files exist already, so that count is the next
    # free index relative to $offset. Computing it from $N instead of
    # $N-1 skipped an index whenever the remainder was $Size-1.
    my $FN = int(($N - 1) / $Size) + $offset;
    $write_collection->($FN, $Collection);
}
sub get_next_rec
{
bin/alvisXSL view on Meta::CPAN
# toss out whatever else was included, and add this
my $GROUPELEMENTEXTRA = " xmlns=\"http://alvis.info/enriched/\" version=\"1.1\"";
############ END CONFIGURATION ######################
# autoflush
select((select(STDERR), $| = 1)[0]);
# encoding pragmas follow any includes like "use"
use encoding 'utf8';
use open ':utf8';
# Usage text. $XSL and $XSLARGS are interpolated, so the text shows
# whatever values they hold at this point.
my $USAGE = "alvisXSL [--gzip|--bzip2|--dir] [--xslargs ARGS] [--xsl XSL-FILE] XML-FILE+\n"
. " Runs xsltproc multiple times on inputs. To convert\n"
. " into XML, use alvisDecollect as a post-processor.\n"
. " dir = descend into directories, but not recursively\n"
. " xsl = $XSL\n"
. " xslargs = $XSLARGS\n";
# command line inputs
bin/alvis_wikipedia_add_cats view on Meta::CPAN
# Load the graph dump; on failure stop with the graph object's own
# error message.
$G->load_graph($DumpF)
    or die("Loading the graph dump failed: " . $G->errmsg());
print STDERR "\n";

# Optional list read from $ListF, one entry per line with the line end
# removed. $List stays undefined when no list file was given (or when
# the file has no lines).
my $List;
if ($ListF) {
    print STDERR "Getting the list of categories....\r";
    open(L, "<:utf8", $ListF) || die("Unable to open \"$ListF\"");
    while (defined(my $entry = <L>)) {
        chomp $entry;
        push(@{$List}, $entry);
    }
    close(L);
    print STDERR "\n";
}
print STDERR "Building the path length map....\r";
bin/alvis_wikipedia_add_cats view on Meta::CPAN
&_parse_entries(\@entries,$options,\%alvis_entries);
print " \r";
for my $base_name (keys %alvis_entries)
{
my $alvisXML;
if (exists($alvis_entries{$base_name}{alvisF}))
{
my $f=$alvis_entries{$base_name}{alvisF};
open(W,"<:utf8",$f) || die("Unable to open \"$f\"");
my $out=$f;
$out=~s/^.*\///sgo;
open(OUT,">:utf8","$OutDir/$out") ||
die("Unable to open \"$OutDir/$out\"");
my $new_rec="";
my $N=1;
while (my $record=&_get_next_rec(*W))
{
my $cats=&_get_cats($record);
if (!defined($cats))
{
warn "Getting the categories of record #$N in file " .
bin/html2plain view on Meta::CPAN
my $out_f;
my $dir=$ODir . '/' .
int($outputN / $NPerOurDir);
if ($outputN % $NPerOurDir==0)
{
mkdir($dir);
}
$out_f=$dir . '/' . $outputN . '.' .
$OutSuffix;
if (!defined(open(OUT,">:utf8",$out_f)))
{
warn "Cannot open output file \"$out_f\".\n";
return 0;
}
print OUT $plain_txt;
close(OUT);
$outputN++;
print "$outputN\r";
}
bin/wikipedia2alvis view on Meta::CPAN
my $meta_txt;
$meta_txt.="title\t$title\n";
$meta_txt.="date\t$date\n";
my $ns_txt="";
if ($namespace ne '')
{
$ns_txt="$namespace/";
}
$meta_txt.="url\twikipedia/$ns_txt$title\n";
$alvis_XML=$C->HTML($record_txt,$meta_txt,{sourceEncoding=>'utf8'});
if (!defined($alvis_XML))
{
warn "Obtaining the Alvis version of the " .
"HTML version of an article failed. " . $C->errmsg() if
$Warnings;
$C->clearerr();
return 1;
}
}
lib/Alvis/Buffer.pm view on Meta::CPAN
# $Id: Buffer.pm,v 1.1 2006/12/01 09:40:24 buntine Exp $
package Alvis::Buffer;
use strict;
use warnings;
use Time::Simple;
use encoding 'utf8';
use open ':utf8';
binmode STDIN, ":utf8";
binmode STDERR, ":utf8";
our $VERSION = '0.10';
=head1 NAME
Alvis::Buffer - Perl extension for buffering utilities for the Alvis pipeline
=head1 SYNOPSIS
use Alvis::Buffer;
lib/Alvis/Convert.pm view on Meta::CPAN
$text='<HTML><BODY>' . $text . '</BODY></HTML>';
# Check that the ISO date actually is in ISO format...
if (defined($iso_date))
{
$meta->set('dc:date',$iso_date);
}
my ($can_doc,$header)=
$self->{canonicalConverter}->HTML($text,
{sourceEncoding=>'utf8'});
if (!defined($can_doc))
{
$self->_set_err_state($ERR_CANDOC_CONV,
$self->{canonicalConverter}->errmsg());
return undef;
}
if (defined($title))
{
$meta->set('title',$title);
lib/Alvis/Convert.pm view on Meta::CPAN
# catGraphDumpF category graph dump file
#
sub wikipedia
{
my $self=shift;
my $f=shift;
my $output_cb=shift;
my $opts=shift;
my $progress_cb=shift;
if (!defined(open(WIKIPEDIA,"<:utf8",$f)))
{
$self->_set_err_state($ERR_OPEN_WIKIPEDIA,
"File: \"$f\"");
return 0;
}
if (!$self->{wikipediaConverter}->extract_records(\*WIKIPEDIA,
$output_cb,
$opts,
$progress_cb))
{
lib/Alvis/Convert.pm view on Meta::CPAN
{
my $detected=$meta->get('detectedCharSet');
if ($detected)
{
$src_enc=$detected;
}
}
if (defined($src_enc) && $src_enc=~/^\s*utf\s*\-?\s*8\s*$/i)
{
if (!defined(open(H,"<:utf8",$f)))
{
$self->_set_err_state($ERR_HTML_F,
"File: \"$f\".");
return undef;
}
while (my $l=<H>)
{
$html_txt.=$l;
}
close(H);
lib/Alvis/Convert.pm view on Meta::CPAN
{
my $self=shift;
my $f=shift;
my $meta_txt="";
if (defined($self->{metaEncoding}))
{
if ($self->{metaEncoding}=~/^\s*utf\s*\-?\s*8\s*$/i)
{
if (!defined(open(M,"<:utf8",$f)))
{
$self->_set_err_state($ERR_META_F,
"File: \"$f\".");
return undef;
}
while (my $l=<M>)
{
$meta_txt.=$l;
}
close(M);
lib/Alvis/Convert.pm view on Meta::CPAN
"File: \"$f\".");
return undef;
}
my $meta_txt="";
while (my $l=<M>)
{
$meta_txt.=$l;
}
close(M);
$meta_txt=$self->{encodingWizard}->try_to_convert_to_utf8($meta_txt,
'text',
'plain');
if (!defined($meta_txt))
{
$self->_set_err_state($ERR_UTF8_CONV,
$self->{encodingWizard}->errmsg());
return undef;
}
}
return $meta_txt;
}
sub read_news_XML
{
my $self=shift;
my $f=shift;
if (!defined(open(X,"<:utf8",$f)))
{
$self->_set_err_state($ERR_NEWS_XML_F,
"File: \"$f\".");
return undef;
}
my $txt="";
while (my $l=<X>)
{
$txt.=$l;
}
lib/Alvis/Convert.pm view on Meta::CPAN
return 1;
}
sub _output_set_of_records
{
my $self=shift;
my $set_of_records_txt=shift;
my $path=shift;
if (!defined(open(OUT,">:utf8",$path)))
{
$self->_set_err_state($ERR_WRITING_OUTPUT,"Output file: " .
"\"$path\"");
return 0;
}
print OUT "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
print OUT "<documentCollection xmlns=\"http://alvis.info/enriched/\">\n";
print OUT $set_of_records_txt;
print OUT "</documentCollection>\n";
close(OUT);
lib/Alvis/Convert.pm view on Meta::CPAN
Default: yes.
ainodumpWarnings issue warnings concerning ainodump conversion?
Default: yes.
sourceEncodingFromMeta read source encoding from Meta information?
Default: no.
=head2 HTML()
my $alvisXML=$C->HTML($html_txt,$meta_txt,
{sourceEncoding=>'utf8',
sourceEncodingFromMeta=>0
});
if (!defined($alvisXML))
{
warn $C->errmsg();
$C->clearerr();
next;
}
=head2 newsXML()
lib/Alvis/Document.pm view on Meta::CPAN
my $XML;
my $md5;
if (defined($ingredients->{origText}) && $self->{includeOriginalDocument})
{
$md5=uc(Digest::MD5->new->add($ingredients->{origText})->hexdigest());
}
else
{
$md5=uc(Digest::MD5->new->add(encode_utf8($ingredients->{canDoc}))->hexdigest());
}
$XML.=" <documentRecord id=\"$md5\" xmlns=\"http://alvis.info/enriched/\">\n";
$XML.=" <acquisition>\n";
my $last_modified;
if (defined($ingredients->{meta}->get('date')))
{
$last_modified=$ingredients->{meta}->get('date');
}
lib/Alvis/Document/Encoding.pm view on Meta::CPAN
##########################################################################
#
# Public methods
#
#########################################################################
#
# Tells whether a (decimal) character code is acceptable: returns 1 when
# the code lies in 0..1114111 and is not flagged in %InvalidUtf8Code,
# and 0 otherwise.
#
sub code_is_utf8
{
    my ($self, $dec_code) = @_;

    # Reject codes outside the range first, then the flagged ones.
    return 0 if $dec_code < 0;
    return 0 if $dec_code > 1114111;
    return 0 if $InvalidUtf8Code{$dec_code};

    return 1;
}
#
# Returns 1 if all of the characters of the text are legal UTF-8
# Else, returns 0 and sets an error message specifying the location
# (1..) of the first illegal character code
# If you wish to obtain the position and offending code, pass a
# hash ref
#
sub is_utf8
{
my $self=shift;
my $text=shift;
my $err=shift;
# Go over the text char by char and check for invalid char codes
my @chars=split(//,$text);
my $i=1;
for my $char (@chars)
{
# We test for valid code
#
my $code=ord($char);
if (!$self->code_is_utf8($code))
{
$self->_set_err_state($ERR_ILLEGAL_CODE,
sprintf("Position: #%d, character code: %#x",
$i,$code));
if (defined($err) && ref($err) eq 'HASH')
{
$err->{pos}=$i;
$err->{code}=$code;
}
return 0;
lib/Alvis/Document/Encoding.pm view on Meta::CPAN
my $err=$@;
$err=~s/ at .*$//isgo;
$self->_set_err_state($ERR_WRONG_GUESS,
"source encoding: $source_enc, " .
"target encoding: $target_enc. Why? $err.");
return undef;
}
if ($target_enc=~/^\s*utf-?8\s*$/isgo)
{
# leaves the bl***y UTF-8 flag on
Encode::_utf8_on($text);
}
}
return $text;
}
#
# Should always leave the UTF-8 flag on, if target is UTF-8
#
sub convert
{
my $self=shift;
my $text=shift;
my $source_enc=shift;
my $target_enc=shift;
my %err;
if ($source_enc=~/^\s*utf-?8\s*$/isgo)
{
if (!$self->is_utf8($text,\%err))
{
$self->_set_err_state($ERR_ILLEGAL_CHAR,
" Position: $err{pos}," .
"Code:$err{code}");
return undef;
}
}
my $try=$self->from_to($text,$source_enc,$target_enc);
if (!defined($try))
lib/Alvis/Document/Encoding.pm view on Meta::CPAN
my @enc_guesses=$self->guess($text,$type,$sub_type);
if (scalar(@enc_guesses)==0)
{
$self->_set_err_state($ERR_GUESS);
return undef;
}
my $result;
for my $enc_guess (@enc_guesses)
{
if ( $target_enc eq "utf8" && ( $enc_guess =~ /utf-?8/i ) ) {
return $text;
} else {
$result=$self->convert($text,$enc_guess,$target_enc);
if (defined($result))
{
return $result;
}
}
}
if (!defined($result))
{
# test if its UTF-8 already
&Encode::_utf8_on($text);
if ( &Encode::is_utf8($text) ) {
return $text;
}
&Encode::_utf8_off($text);
$self->_set_err_state($ERR_GUESS_AND_CONVERT);
# print STDERR join("==", @enc_guesses) . " -> $target_enc : undef\n";
# print STDERR "\n$text\n\n";
return undef;
}
return $result;
}
sub guess_typo_fixes
{
my $self=shift;
my $typo=shift;
my @possibilities=($typo);
if ($typo=~/^\s*(?:utf|uft)-?8\s*$/isgo)
{
push(@possibilities,'utf8');
}
if ($typo=~/^\s*(?:utf|uft)-16\s*$/isgo)
{
push(@possibilities,'UTF-16');
}
if ($typo=~/^\s*iso-?8559-?1\s*$/isgo)
{
push(@possibilities,'iso-8859-1');
}
if ($typo=~/^\s*ecu-?(kr|jp|cn|tw|jisx0213)\s*$/isgo)
lib/Alvis/Document/Encoding.pm view on Meta::CPAN
# Create a new instance
my $e=Alvis::Encoding->new();
if (!defined($e))
{
die "Instantiating Alvis::Encoding failed.";
}
# Check that a (decimal) character code is legal UTF-8
my $code=55;
if (!$e->code_is_utf8($code))
{
# The message will contain the position and the offending character's code
die $e->errmsg();
}
# Check that a text is legal UTF-8
my $text;
if (!$e->is_utf8($text))
{
# The message will contain the position and the offending character's code
die $e->errmsg();
}
# If you need to obtain the position (1..) and the offending character,
# pass a placeholder in a hash ref argument:
my %err=();
if (!$e->is_utf8($text,\%err))
{
my $position=$err{pos};
my $code=$err{code};
. . .
}
#
# Guess the encoding of a document given a guess for its type
#
my $type_guesser=Alvis::Document::Type->new();
lib/Alvis/Document/Encoding.pm view on Meta::CPAN
if (!defined($doc_encoding))
{
die('Cannot guess. ' . $e->errmsg());
}
#
# Try converting a document to UTF-8 with only its type known
#
my $type_guesser=Alvis::Document::Type->new();
my ($doc_type,$doc_sub_type)=$type_guesser->guess($text);
my $doc_in_utf8=$e->try_to_convert_to_utf8($text,$doc_type,$doc_sub_type);
if (!defined($doc_in_utf8))
{
die('Cannot guess. ' . $e->errmsg());
}
# Try to guess what was meant
my @possibilities=$e->guess_typo_fixes('uft-8');
=head1 DESCRIPTION
A collection of methods for guessing, confirming and fixing the encoding
lib/Alvis/Document/Encoding.pm view on Meta::CPAN
=head1 METHODS
=head2 new()
Options:
defaultDocType default type for a document. Default: text.
defaultDocSubType default sub type for a document. Default: html.
defaultEncoding default encoding for a document. Default: iso-8859-1.
=head2 code_is_utf8(decimal_code)
Returns 1 if the (decimal) character code is legal UTF-8.
=head2 is_utf8(text,err_hash_ref)
Returns 1 if all of the characters of $text are legal UTF-8.
Otherwise, returns 0 and sets an error message specifying the location
(counted from 1) of the first illegal character code.
If you wish to obtain the position and offending code, pass a
hash ref ($err_hash_ref). The info is in $err_hash_ref->{pos} and
$err_hash_ref->{code}.
=head2 guess(text,doc_type,doc_sub_type)
lib/Alvis/HTML.pm view on Meta::CPAN
elsif (!exists($opts->{sourceEncoding}) && $self->{sourceEncoding})
{
$src_enc=$self->{sourceEncoding};
}
if ($src_enc)
{
if ($src_enc!~/^\s*utf-?8\s*$/)
{
$html=$self->{encodingWiz}->convert($html,
$src_enc,
'utf8');
if (!defined($html))
{
$self->_set_err_state($ERR_UTF8_CONV,
$self->{encodingWiz}->errmsg());
return (undef,\%header); # signals "do not pass on"
}
}
}
else # try guessing the encoding
{
$html=$self->{encodingWiz}->guess_and_convert($html,
'text',
'html',
'utf8');
if (!defined($html))
{
$self->_set_err_state($ERR_GUESS_ENC_UTF8_CONV,
$self->{encodingWiz}->errmsg());
return (undef,\%header); # signals "do not pass on"
}
}
# ex nihilo nihil
#
lib/Alvis/HTML.pm view on Meta::CPAN
{
$self->_set_err_state($ERR_NO_SIGNATURE);
return (undef,\%header); # signals "do not pass on"
}
}
}
if ($self->{assertSourceAssumptions})
{
my %err;
if (!$self->{encodingWiz}->is_utf8($html,\%err))
{
$self->_set_err_state($ERR_SRC_NOT_IN_UTF8,
$self->{encodingWiz}->errmsg());
return (undef,\%header); # signals "do not pass on"
}
# Remove '\0's just in case. Replace by a ' ' just in case they
# separated something meaningful in the original.
$html=~s/[\0]+/ /sgo;
}
lib/Alvis/HTML.pm view on Meta::CPAN
# Private methods
#
###########################################################################
#
# Turns the code of a numeric character entity into the character
# itself. A code rejected by the encoding wizard's code_is_utf8() is
# handed back as entity text ("&#<code>;") instead.
#
sub _num_ent2char
{
    my ($self, $num) = @_;

    # A rejected code must be an error in the source; no attempt is made
    # to fix typos at the moment.
    return "&#$num;"
        unless $self->{encodingWiz}->code_is_utf8($num);

    return pack("U", $num);
}
lib/Alvis/Utils.pm view on Meta::CPAN
package Alvis::Utils;
require Exporter;
use strict;
use open ':utf8';
use File::Find;
use Cwd 'abs_path';
use Carp;
our @ISA = qw(Exporter);
#our @EXPORT = qw(open_file get_files);
our @EXPORT_OK = qw( open_file get_files absolutize_path );
our $VERSION = 0.01;
lib/Alvis/Wikipedia/XMLDump.pm view on Meta::CPAN
$self->{parser}=Alvis::Wikipedia::WikitextParser->new();
if (!defined($self->{parser}))
{
$self->_set_err_state($ERR_PARSER);
return undef;
}
$self->{canDocConverter}=Alvis::Canonical->new(convertCharEnts=>1,
convertNumEnts=>1,
sourceEncoding=>'utf8');
if (!defined($self->{canDocConverter}))
{
$self->_set_err_state($ERR_CAN_DOC_CONV);
return undef;
}
$self->{catGraph}=Alvis::Wikipedia::CatGraph->new();
if (!defined($self->{catGraph}))
{
$self->_set_err_state($ERR_CAT_GRAPH);