Alvis-Convert

 view release on metacpan or  search on metacpan

bin/alvisDecollect  view on Meta::CPAN

use Encode;

###################### CONFIGURATION #####################

my $GROUPELEMENT = "documentCollection";


############ END CONFIGURATION ######################

# encoding pragmas follow any includes like "use"
#
# The "encoding" pragma has been deprecated since Perl 5.18 and croaks at
# compile time from Perl 5.26 on.  What it did for UTF-8 is spelled out
# explicitly instead: UTF-8 source text plus UTF-8 layers on STDIN and
# STDOUT, installed at compile time just as the pragma did.
use utf8;
use open ':utf8';
BEGIN {
  binmode(STDIN,  ':encoding(utf8)');
  binmode(STDOUT, ':encoding(utf8)');
}

my $header = "";
my $collected = 0;

# Echo the input unchanged up to and including the first line that
# contains the opening tag of the group element (the tag name followed
# by a space).  The definedness test keeps a final line consisting of a
# bare "0" without a newline from being mistaken for end of input.
while ( !$collected && defined($_=<>) ) {
  print;
  if ( /<$GROUPELEMENT / ){
    $collected = 1;
  }
}

bin/alvisSource  view on Meta::CPAN

# Feeds docs from DIR/*.xml into the pipeline. Waits a given number of minutes before each send.


use strict;
use warnings;
use Getopt::Long;
use Pod::Usage;
use Alvis::Pipeline;
# use Data::Dumper;

use encoding 'utf8';
use open ':utf8';
binmode STDIN, ":utf8";
binmode STDERR, ":utf8";

# Defaults for the pipeline write endpoint.
my $PIPE_WRITEHOST = "localhost";
my $PIPE_WRITEPORT = 0;

# Script settings, each declared together with its default value.
my @dirs;
my $verbose  = 0;
my $sleep    = 0;
my $shutdown = 0;
my $host     = 'localhost';
my $port     = 10000;

bin/alvisXMLjoin  view on Meta::CPAN

#!/usr/bin/perl -w

use strict;

use Getopt::Long;
use Pod::Usage;
use Encode;

use Alvis::Utils qw(open_file);

use encoding 'utf8';
use open ':utf8';

# Fixed XML fragments: the XML declaration and the opening and closing
# tags of the documentCollection element.
my $xml_header  = q{<?xml version="1.0" encoding="UTF-8"?>};
my $doc_col     = q{<documentCollection xmlns="http://alvis.info/enriched/" version="1.1">};
my $doc_col_end = q{</documentCollection>};

################################################################################
# main

# Whatever read_params() returns is handed straight on to join_files().
join_files( read_params() );

bin/alvisXMLmerge  view on Meta::CPAN

#!/usr/bin/perl -w

use strict;
use Getopt::Long;
use Pod::Usage;
use Encode;
use File::Copy;
use File::Path;

use encoding 'utf8';
use open ':utf8';
use Time::HiRes qw(gettimeofday tv_interval);

use Alvis::Utils qw(absolutize_path open_file get_files);

####################### global vars
# File-scoped flags; only their initial values are visible here.
# NOTE(review): presumably overridden by command-line options
# (Getopt::Long is loaded by this script) -- confirm against the option
# parsing code.
# Initialised to 1.
my $VERBOSE = 1;
# Initialised to 0.
my $DEBUG   = 0;

################################################################################
# main sub

bin/alvisXMLsplit  view on Meta::CPAN

    $F=shift @ARGV;
  }
}

# Exactly two positional arguments must remain: the chunk size and the
# output directory.
pod2usage(1) if @ARGV != 2;

my $Size=shift @ARGV;
my $ODir=shift @ARGV;
$report = 0;    #  switch off STDERR status

# The "encoding" pragma has been deprecated since Perl 5.18 and croaks at
# compile time from Perl 5.26 on.  What it did for UTF-8 is spelled out
# explicitly instead: UTF-8 source text plus UTF-8 layers on STDIN and
# STDOUT, installed at compile time just as the pragma did.
use utf8;
use open ':utf8';
BEGIN {
  binmode(STDIN,  ':encoding(utf8)');
  binmode(STDOUT, ':encoding(utf8)');
}

#  have to make sure documentRecord elements one per line
if ( $bz || $bz_in ) {
  # The pipeline needs a shell, so the file name is single-quoted for it
  # (embedded single quotes escaped); names containing spaces or shell
  # metacharacters are then passed through literally.
  (my $shell_F = $F) =~ s/'/'\\''/g;
  open(W, '-|',
       "bzcat '$shell_F' | perl -p -e \"s/<documentRecord/\n<documentRecord/g;\"")
    || die("Unable to open \"$F\": $!");
} else {
  # Uncompressed input: open_file() supplies the handle, aliased to W.
  *W = open_file($F);
  #open(W,"perl -p -e \"s/<documentRecord/\n<documentRecord/g;\" $F |") 
  #  || die("Unable to open \"$F\"");
} 

bin/alvisXMLsplit  view on Meta::CPAN

my $N = 1;
my $Collection = $CollectionHeader;
# Read the input record by record.  After every $Size-th record the
# accumulated records are closed off as a complete documentCollection,
# written to "$ODir/<number>.xml" (and bzip2-compressed when $bz is set),
# and the buffer is reset to the bare collection header.
while (my $record=&get_next_rec(*W))
{
    $Collection.=$record;
    if ($N%$Size==0)
    {
        $Collection.="</documentCollection>\n";
        # The k-th full chunk gets the file number $offset+k-1.
        my $FN = int($N/$Size) + $offset - 1;
        my $out_f="$ODir/$FN.xml";
        open(my $out_fh,">:utf8",$out_f)
            || die("Unable to open $out_f: $!");
        # Buffered write errors (e.g. a full disk) may only surface at
        # close(), so both print and close are checked.
        print {$out_fh} $Collection
            or die("Unable to write $out_f: $!");
        close($out_fh)
            || die("Unable to write $out_f: $!");
        if ( $bz ) {
            # List form bypasses the shell, so unusual characters in
            # the output path cannot be misinterpreted.
            system("bzip2",$out_f)==0
                || warn("bzip2 failed for $out_f\n");
        }
        $Collection=$CollectionHeader;
    }
    if ( $report ) { print STDERR "$N\r"; }
    $N++;
}
if ( $report ) { print STDERR "\n"; }

# Write out the trailing, partially filled collection, if there is one.
# At this point $N is one more than the number of records read.
if (($N-1)%$Size)
{
    $Collection.="</documentCollection>\n";
    # The full chunks written by the loop occupy the file numbers
    # $offset .. $offset+int(($N-1)/$Size)-1, so the leftover chunk takes
    # the next one.  Computing this from $N instead of $N-1 skipped a
    # number whenever exactly $Size-1 records were left over.
    my $FN = int(($N-1)/$Size) + $offset;
    my $out_f="$ODir/$FN.xml";
    open(my $out_fh,">:utf8",$out_f)
        || die("Unable to open $out_f: $!");
    # Buffered write errors may only surface at close(), so both print
    # and close are checked.
    print {$out_fh} $Collection
        or die("Unable to write $out_f: $!");
    close($out_fh)
        || die("Unable to write $out_f: $!");
    if ( $bz ) {
        # List form bypasses the shell.
        system("bzip2",$out_f)==0
            || warn("bzip2 failed for $out_f\n");
    }
}


sub get_next_rec
{

bin/alvisXSL  view on Meta::CPAN

#  toss out whatever else was included, and add this
my $GROUPELEMENTEXTRA = " xmlns=\"http://alvis.info/enriched/\" version=\"1.1\"";


############ END CONFIGURATION ######################

#  autoflush: the inner select() makes STDERR the current handle so that
#  $| = 1 applies to it; the outer select() restores the previous handle.
select((select(STDERR), $| = 1)[0]);

# encoding pragmas follow any includes like "use"
#
# The "encoding" pragma has been deprecated since Perl 5.18 and croaks at
# compile time from Perl 5.26 on.  What it did for UTF-8 is spelled out
# explicitly instead: UTF-8 source text plus UTF-8 layers on STDIN and
# STDOUT, installed at compile time just as the pragma did.
use utf8;
use open ':utf8';
BEGIN {
  binmode(STDIN,  ':encoding(utf8)');
  binmode(STDOUT, ':encoding(utf8)');
}


# Usage text; the current values of $XSL and $XSLARGS are interpolated.
my $USAGE = "alvisXSL [--gzip|--bzip2|--dir] [--xslargs ARGS] [--xsl XSL-FILE] XML-FILE+\n" 
  . "   Runs xsltproc multiple times on inputs.   To convert\n"
  . "   into XML, use alvisDecollect as a post-processor.\n" 
  . "   dir = descend into directories, but not recursively\n"
  . "   xsl = $XSL\n"
  . "   xslargs = $XSLARGS\n";

#  command line inputs

bin/alvis_wikipedia_add_cats  view on Meta::CPAN

# Load the category graph dump; give up with the graph's own error
# message when that fails.
$G->load_graph($DumpF)
    || die("Loading the graph dump failed: " . $G->errmsg());
print STDERR "\n";

# Optional list of categories, one per line of $ListF.  $List stays
# undefined when no list file is given (or when the file has no lines).
my $List;
if ($ListF)
{
    print STDERR "Getting the list of categories....\r";
    open(L,"<:utf8",$ListF) || die("Unable to open \"$ListF\"");
    while (my $category=<L>)
    {
	chomp($category);
	push(@{$List},$category);
    }
    close(L);
    print STDERR "\n";
}

print STDERR "Building the path length map....\r";

bin/alvis_wikipedia_add_cats  view on Meta::CPAN

    &_parse_entries(\@entries,$options,\%alvis_entries);	
    print "                                       \r";

    for my $base_name (keys %alvis_entries)
    {
	my $alvisXML;
	
	if (exists($alvis_entries{$base_name}{alvisF}))
	{
	    my $f=$alvis_entries{$base_name}{alvisF};
	    open(W,"<:utf8",$f) || die("Unable to open \"$f\"");
	    my $out=$f;
	    $out=~s/^.*\///sgo;
	    open(OUT,">:utf8","$OutDir/$out") || 
		die("Unable to open \"$OutDir/$out\"");

	    my $new_rec="";
	    my $N=1;
	    while (my $record=&_get_next_rec(*W))
	    {
		my $cats=&_get_cats($record);
		if (!defined($cats))
		{
		    warn "Getting the categories of record #$N in file " .

bin/html2plain  view on Meta::CPAN

    my $out_f;
    my $dir=$ODir . '/' . 
	int($outputN / $NPerOurDir);
    if ($outputN % $NPerOurDir==0)
    {
	mkdir($dir);
    }
    $out_f=$dir . '/' . $outputN . '.' .
	$OutSuffix;
    
    if (!defined(open(OUT,">:utf8",$out_f)))
    {
	warn "Cannot open output file \"$out_f\".\n";
	return 0;
    }
    print OUT $plain_txt;
    close(OUT);
    
    $outputN++;
    print "$outputN\r";
}

bin/wikipedia2alvis  view on Meta::CPAN

	my $meta_txt;
	$meta_txt.="title\t$title\n";
	$meta_txt.="date\t$date\n";
	my $ns_txt="";
	if ($namespace ne '')
	{
	    $ns_txt="$namespace/";
	}
	$meta_txt.="url\twikipedia/$ns_txt$title\n";

	$alvis_XML=$C->HTML($record_txt,$meta_txt,{sourceEncoding=>'utf8'});
        if (!defined($alvis_XML))
        {
            warn "Obtaining the Alvis version of the " .
                "HTML version of an article failed. " . $C->errmsg() if
                $Warnings;
            $C->clearerr();
            return 1;
        }

    }

lib/Alvis/Buffer.pm  view on Meta::CPAN

# $Id: Buffer.pm,v 1.1 2006/12/01 09:40:24 buntine Exp $

package Alvis::Buffer;

use strict;
use warnings;
use Time::Simple;

use encoding 'utf8';
use open ':utf8';
binmode STDIN, ":utf8";
binmode STDERR, ":utf8";

our $VERSION = '0.10';

=head1 NAME

Alvis::Buffer - Perl extension for buffering utilities for the Alvis pipeline

=head1 SYNOPSIS

 use Alvis::Buffer;

lib/Alvis/Convert.pm  view on Meta::CPAN

	$text='<HTML><BODY>' . $text . '</BODY></HTML>';
	
	# Check that the ISO date actually is in ISO format...
	if (defined($iso_date))
	{
	    $meta->set('dc:date',$iso_date);
	}
	
	my ($can_doc,$header)=
	    $self->{canonicalConverter}->HTML($text,
					      {sourceEncoding=>'utf8'});
	if (!defined($can_doc))
	{
	    $self->_set_err_state($ERR_CANDOC_CONV,
				  $self->{canonicalConverter}->errmsg());
	    return undef;
	}
	
	if (defined($title))
	{
	    $meta->set('title',$title);

lib/Alvis/Convert.pm  view on Meta::CPAN

#     catGraphDumpF           category graph dump file
#
sub wikipedia
{
    my $self=shift;
    my $f=shift;   
    my $output_cb=shift;  
    my $opts=shift;
    my $progress_cb=shift;

    if (!defined(open(WIKIPEDIA,"<:utf8",$f)))
    {
	$self->_set_err_state($ERR_OPEN_WIKIPEDIA,
			      "File: \"$f\"");
	return 0;
    }
    if (!$self->{wikipediaConverter}->extract_records(\*WIKIPEDIA,
						      $output_cb,
						      $opts,
						      $progress_cb))
    {

lib/Alvis/Convert.pm  view on Meta::CPAN

    {
        my $detected=$meta->get('detectedCharSet');
        if ($detected)
        {
            $src_enc=$detected;
        }
    }

    if (defined($src_enc) && $src_enc=~/^\s*utf\s*\-?\s*8\s*$/i)
    {
	if (!defined(open(H,"<:utf8",$f)))
        {
            $self->_set_err_state($ERR_HTML_F,
                                  "File: \"$f\".");
            return undef;
        }
        while (my $l=<H>)
        {
            $html_txt.=$l;
        }
        close(H);

lib/Alvis/Convert.pm  view on Meta::CPAN

{
    my $self=shift;
    my $f=shift;

    my $meta_txt="";

    if (defined($self->{metaEncoding}))
    {
	if ($self->{metaEncoding}=~/^\s*utf\s*\-?\s*8\s*$/i)
	{
	    if (!defined(open(M,"<:utf8",$f)))
            {
                $self->_set_err_state($ERR_META_F,
                                      "File: \"$f\".");
                return undef;
            }
            while (my $l=<M>)
            {
                $meta_txt.=$l;
            }
            close(M);

lib/Alvis/Convert.pm  view on Meta::CPAN

				  "File: \"$f\".");
	    return undef;
	}
	my $meta_txt="";
	while (my $l=<M>)
	{
	    $meta_txt.=$l;
	}
	close(M);
	
	$meta_txt=$self->{encodingWizard}->try_to_convert_to_utf8($meta_txt,
								  'text',
								  'plain');
	if (!defined($meta_txt))
	{
	    $self->_set_err_state($ERR_UTF8_CONV,
				  $self->{encodingWizard}->errmsg());
	    return undef;
	}
    }

    return $meta_txt;
}

sub read_news_XML
{
    my $self=shift;
    my $f=shift;

    if (!defined(open(X,"<:utf8",$f)))
    {
	$self->_set_err_state($ERR_NEWS_XML_F,
			      "File: \"$f\".");
	return undef;
    }
    my $txt="";
    while (my $l=<X>)
    {
	$txt.=$l;
    }

lib/Alvis/Convert.pm  view on Meta::CPAN


    return 1;
}

sub _output_set_of_records
{
    my $self=shift;
    my $set_of_records_txt=shift;
    my $path=shift;

    if (!defined(open(OUT,">:utf8",$path)))
    {
	$self->_set_err_state($ERR_WRITING_OUTPUT,"Output file: " .
			      "\"$path\"");
	return 0;
    }
    print OUT "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
    print OUT "<documentCollection xmlns=\"http://alvis.info/enriched/\">\n";
    print OUT $set_of_records_txt;
    print OUT "</documentCollection>\n";
    close(OUT);

lib/Alvis/Convert.pm  view on Meta::CPAN

                             Default: yes.
    ainodumpWarnings         issue warnings concerning ainodump conversion?
                             Default: yes.
    sourceEncodingFromMeta   read source encoding from Meta information?
                             Default: no.
    

=head2 HTML()

     my $alvisXML=$C->HTML($html_txt,$meta_txt,
                           {sourceEncoding=>'utf8',
                            sourceEncodingFromMeta=>0
                            });
     if (!defined($alvisXML))
     {
	warn $C->errmsg();
	$C->clearerr();
	next;
     }

=head2 newsXML()

lib/Alvis/Document.pm  view on Meta::CPAN


    my $XML;

    my $md5;
    if (defined($ingredients->{origText}) && $self->{includeOriginalDocument})
    {
	$md5=uc(Digest::MD5->new->add($ingredients->{origText})->hexdigest());
    }
    else
    {
	$md5=uc(Digest::MD5->new->add(encode_utf8($ingredients->{canDoc}))->hexdigest());
    }

    $XML.="  <documentRecord id=\"$md5\" xmlns=\"http://alvis.info/enriched/\">\n";
    $XML.="    <acquisition>\n";

    my $last_modified;
    if (defined($ingredients->{meta}->get('date')))
    {
	$last_modified=$ingredients->{meta}->get('date');
    }

lib/Alvis/Document/Encoding.pm  view on Meta::CPAN


##########################################################################
#
#  Public methods
#
#########################################################################

#
# Returns 1 if the (decimal) character code lies within 0..1114111 and
# is not flagged in %InvalidUtf8Code; returns 0 otherwise.
#
sub code_is_utf8
{
    my ($self,$dec_code)=@_;

    # Reject anything outside the code range, then the explicitly
    # invalid codes.
    return 0 if $dec_code<0;
    return 0 if $dec_code>1114111;
    return 0 if $InvalidUtf8Code{$dec_code};

    return 1;
}

#
# Returns 1 if all of the characters of the text are legal UTF-8
# Else, returns 0 and sets an error message specifying the location
# (1..) of the first illegal character code
# If you wish to obtain the position and offending code, pass a 
# hash ref
#
sub is_utf8
{
    my $self=shift;
    my $text=shift;
    my $err=shift;

    # Go over the text char by char and check for invalid char codes
    my @chars=split(//,$text);
    my $i=1;
    for my $char (@chars)
    {
	# We test for valid code
	#
	my $code=ord($char);
	if (!$self->code_is_utf8($code))
	{
	    $self->_set_err_state($ERR_ILLEGAL_CODE,
				  sprintf("Position: #%d, character code: %#x",
					  $i,$code));
	    if (defined($err) && ref($err) eq 'HASH')
	    {
		$err->{pos}=$i;
		$err->{code}=$code;
	    }
	    return 0;

lib/Alvis/Document/Encoding.pm  view on Meta::CPAN

	    my $err=$@;
	    $err=~s/ at .*$//isgo;
	    $self->_set_err_state($ERR_WRONG_GUESS,
				  "source encoding: $source_enc, " .
				  "target encoding: $target_enc. Why? $err.");
	    return undef;
	}
        if ($target_enc=~/^\s*utf-?8\s*$/isgo)
        {
	    # leaves the bl***y UTF-8 flag on
	    Encode::_utf8_on($text); 
        }
    }
    return $text;
}

# 
# Should always leave the UTF-8 flag on, if target is UTF-8
#
sub convert
{
    my $self=shift;
    my $text=shift;
    my $source_enc=shift;
    my $target_enc=shift;

    my %err;
    if ($source_enc=~/^\s*utf-?8\s*$/isgo)
    {
	if (!$self->is_utf8($text,\%err))
	{
	    $self->_set_err_state($ERR_ILLEGAL_CHAR,
				  " Position: $err{pos}," .
				  "Code:$err{code}");
	    return undef;
	}
    }

    my $try=$self->from_to($text,$source_enc,$target_enc);
    if (!defined($try))

lib/Alvis/Document/Encoding.pm  view on Meta::CPAN

    my @enc_guesses=$self->guess($text,$type,$sub_type);
    if (scalar(@enc_guesses)==0)
    {
	$self->_set_err_state($ERR_GUESS);
	return undef;
    }

    my $result; 
    for my $enc_guess (@enc_guesses)
    {
	if ( $target_enc eq "utf8" && ( $enc_guess =~ /utf-?8/i ) ) {
	   return $text;
        } else {
           $result=$self->convert($text,$enc_guess,$target_enc);
	   if (defined($result))
	   {
	       return $result;
	   }
	}
    }
    if (!defined($result))
    {
        #  test if its UTF-8 already
	&Encode::_utf8_on($text);
	if (  &Encode::is_utf8($text) ) {
		return $text;
        }
         &Encode::_utf8_off($text);
	$self->_set_err_state($ERR_GUESS_AND_CONVERT);
        # print STDERR join("==", @enc_guesses) . " -> $target_enc : undef\n";
	# print STDERR "\n$text\n\n";
	return undef;
    }

    return $result;
}

sub guess_typo_fixes
{
    my $self=shift;
    my $typo=shift;

    my @possibilities=($typo);
    if ($typo=~/^\s*(?:utf|uft)-?8\s*$/isgo)
    {
	push(@possibilities,'utf8');
    }
    if ($typo=~/^\s*(?:utf|uft)-16\s*$/isgo)
    {
	push(@possibilities,'UTF-16');
    }
    if ($typo=~/^\s*iso-?8559-?1\s*$/isgo)
    {
	push(@possibilities,'iso-8859-1');
    }
    if ($typo=~/^\s*ecu-?(kr|jp|cn|tw|jisx0213)\s*$/isgo)

lib/Alvis/Document/Encoding.pm  view on Meta::CPAN


 # Create a new instance
 my $e=Alvis::Encoding->new();
 if (!defined($e))
 {
    die "Instantiating Alvis::Encoding failed.";
 }

 # Check that a (decimal) character code is legal UTF-8
 my $code=55;
 if (!$e->code_is_utf8($code))
 {
    # code_is_utf8() only returns 0 or 1; it does not set an error message
    die "Illegal character code: $code";
 }

 # Check that a text is legal UTF-8
 my $text;
 if (!$e->is_utf8($text))
 {
    # The message will contain the position and the offending character's code 
    die $e->errmsg();
 }

 # If you need to obtain the position (1..) and the offending character,
 # pass a placeholder in a hash ref argument:
 my %err=();
 if (!$e->is_utf8($text,\%err))
 {
    my $position=$err{pos};
    my $code=$err{code};
    . . . 
 }

 # 
 # Guess the encoding of a document given a guess for its type 
 #
 my $type_guesser=Alvis::Document::Type->new();

lib/Alvis/Document/Encoding.pm  view on Meta::CPAN

 if (!defined($doc_encoding))
 {
     die('Cannot guess. ' . $e->errmsg());
 }

 # 
 # Try converting a document to UTF-8 with only its type known
 #
 my $type_guesser=Alvis::Document::Type->new();
 my ($doc_type,$doc_sub_type)=$type_guesser->guess($text);
 my $doc_in_utf8=$e->try_to_convert_to_utf8($text,$doc_type,$doc_sub_type);
 if (!defined($doc_in_utf8))
 {
     die('Cannot guess. ' . $e->errmsg());
 }
 
 # Try to guess what was meant 
 my @possibilities=$e->guess_typo_fixes('uft-8');

=head1 DESCRIPTION

A collection of methods for guessing, confirming and fixing the encoding

lib/Alvis/Document/Encoding.pm  view on Meta::CPAN

=head1 METHODS

=head2 new()

Options:

    defaultDocType       default type for a document. Default: text.
    defaultDocSubType    default sub type for a document. Default: html.
    defaultEncoding      default encoding for a document. Default: iso-8859-1.

=head2 code_is_utf8(decimal_code)

Returns 1 if the (decimal) character code is legal UTF-8.

=head2 is_utf8(text,err_hash_ref)

Returns 1 if all of the characters of $text are legal UTF-8.
Otherwise, returns 0 and sets an error message specifying the location
(1..) of the first illegal character code.
If you wish to obtain the position and offending code, pass a
hash ref ($err_hash_ref). The info is in $err_hash_ref->{pos} and
$err_hash_ref->{code}.

=head2 guess(text,doc_type,doc_sub_type)

lib/Alvis/HTML.pm  view on Meta::CPAN

    elsif (!exists($opts->{sourceEncoding}) && $self->{sourceEncoding})
    {
	$src_enc=$self->{sourceEncoding};
    }
    if ($src_enc)
    {
	if ($src_enc!~/^\s*utf-?8\s*$/)
	{
	    $html=$self->{encodingWiz}->convert($html,
						$src_enc,
						'utf8');
	    if (!defined($html))
	    {
		$self->_set_err_state($ERR_UTF8_CONV,
				      $self->{encodingWiz}->errmsg());
		return (undef,\%header);  # signals "do not pass on"
	    }
	}
    }
    else # try guessing the encoding
    {
	$html=$self->{encodingWiz}->guess_and_convert($html,
						      'text',
						      'html',
						      'utf8');
	if (!defined($html))
	{
	    $self->_set_err_state($ERR_GUESS_ENC_UTF8_CONV,
				  $self->{encodingWiz}->errmsg());
	    return (undef,\%header);  # signals "do not pass on"
	}
    }

    # ex nihilo nihil 
    #

lib/Alvis/HTML.pm  view on Meta::CPAN

	    {
		$self->_set_err_state($ERR_NO_SIGNATURE);
		return (undef,\%header);  # signals "do not pass on"
	    }
	} 
    }

    if ($self->{assertSourceAssumptions})
    {
	my %err;
	if (!$self->{encodingWiz}->is_utf8($html,\%err))
	{
	    $self->_set_err_state($ERR_SRC_NOT_IN_UTF8,
				  $self->{encodingWiz}->errmsg());
	    return (undef,\%header);  # signals "do not pass on"
	}
	# Remove '\0's just in case. Replace by a ' ' just in case they 
	# separated something meaningful in the original. 
	$html=~s/[\0]+/ /sgo;
    }

lib/Alvis/HTML.pm  view on Meta::CPAN

# Private methods
#
###########################################################################

# Turns the number of a numeric character entity into the corresponding
# character.  A code rejected by the encoding wizard's code_is_utf8() is
# handed back unchanged in entity form ("&#NUM;").
sub _num_ent2char
{
    my ($self,$num)=@_;

    # An invalid code must be an error; no attempt is made to fix typos atm
    return $self->{encodingWiz}->code_is_utf8($num)
	? pack("U",$num)
	: "&#$num;";
}

lib/Alvis/Utils.pm  view on Meta::CPAN

package Alvis::Utils;

require Exporter;
use strict;
use open ':utf8';
use File::Find;
use Cwd 'abs_path';
use Carp;

our @ISA = qw(Exporter);

#our @EXPORT    = qw(open_file get_files);
our @EXPORT_OK = qw( open_file get_files absolutize_path );
our $VERSION   = 0.01;

lib/Alvis/Wikipedia/XMLDump.pm  view on Meta::CPAN


    $self->{parser}=Alvis::Wikipedia::WikitextParser->new();
    if (!defined($self->{parser}))
    {
	$self->_set_err_state($ERR_PARSER);
	return undef;
    }

    $self->{canDocConverter}=Alvis::Canonical->new(convertCharEnts=>1,
						   convertNumEnts=>1,
						   sourceEncoding=>'utf8');
    if (!defined($self->{canDocConverter}))
    {
	$self->_set_err_state($ERR_CAN_DOC_CONV);
	return undef;
    }

    $self->{catGraph}=Alvis::Wikipedia::CatGraph->new();
    if (!defined($self->{catGraph}))
    {
	$self->_set_err_state($ERR_CAT_GRAPH);



( run in 0.804 second using v1.01-cache-2.11-cpan-49f99fa48dc )