view release on metacpan or search on metacpan
bin/ainodump2alvis view on Meta::CPAN
for my $e (@$entries)
{
if ($Seen{$e})
{
next;
}
$Seen{$e}=1;
if (-d $e)
{
my @entries=glob("$e/*");;
&_parse_entries(\@entries,$options,$aino_entries);
next;
}
my ($basename,$suffix);
if ($e=~/^(.*)\.([^\.]+)$/)
{
$basename=$1;
$suffix=$2;
}
bin/ainodump2alvis view on Meta::CPAN
$aino_entries->{$basename}{ainoF}=$e;
}
}
}
sub _convert_collection
{
my $root_dir=shift;
my $options=shift;
my @entries=glob("$root_dir/*");
my %dump_entries=();
%Seen=();
print "Parsing the source directory entries...\r";
&_parse_entries(\@entries,$options,\%dump_entries);
print " \r";
for my $base_name (keys %dump_entries)
{
if (!exists($dump_entries{$base_name}{ainoF}))
{
bin/alvisXMLmerge view on Meta::CPAN
use Encode;
use File::Copy;
use File::Path;
use encoding 'utf8';
use open ':utf8';
use Time::HiRes qw(gettimeofday tv_interval);
use Alvis::Utils qw(absolutize_path open_file get_files);
####################### global vars
# Verbosity flag; initialised to on (1).
# NOTE(review): where it is consulted is outside this excerpt.
my $VERBOSE = 1;
# Debug flag; initialised to off (0).
my $DEBUG = 0;
################################################################################
# main sub
# TODO: optimize regexp
# TODO: performance
# Collect the six run-time settings returned by read_params().
# NOTE(review): read_params() is not visible in this excerpt; presumably it
# parses the command line -- confirm against its definition.
my ($orig_dir, $extra_dir, $out_dir, $config_file, $bzip2, $extra_file) =
read_params();
bin/alvisXSL view on Meta::CPAN
return &nextfile();
}
$nf = shift(@files);
#print STDERR "Got $nf\n";
if ( !$nf ) {
return $nf;
}
if ( -d $nf ) {
#print STDERR "Is dir\n";
if ( $usedir ) {
@dirfiles = sort(glob("$nf/*"));
$withdir = $nf;
$usingdir = 1;
return &nextfile();
} else {
#print STDERR "Open on $nf failed\n";
return &nextfile();
}
}
if ( -f $nf ) {
#print STDERR "Done\n";
bin/alvis_wikipedia_add_cats view on Meta::CPAN
for my $e (@$entries)
{
if ($Seen{$e})
{
next;
}
$Seen{$e}=1;
if (-d $e)
{
my @entries=glob("$e/*");;
&_parse_entries(\@entries,$options,$alvis_entries);
next;
}
my ($basename,$suffix);
if ($e=~/^(.*)\.([^\.]+)$/)
{
$basename=$1;
$suffix=$2;
}
bin/alvis_wikipedia_add_cats view on Meta::CPAN
$alvis_entries->{$basename}{alvisF}=$e;
}
}
}
sub _add_cats_to_collection
{
my $root_dir=shift;
my $options=shift;
my @entries=glob("$root_dir/*");
my %alvis_entries=();
%Seen=();
print "Parsing the source directory entries...\r";
&_parse_entries(\@entries,$options,\%alvis_entries);
print " \r";
for my $base_name (keys %alvis_entries)
{
my $alvisXML;
bin/html2alvis view on Meta::CPAN
for my $e (@$entries)
{
if ($Seen{$e})
{
next;
}
$Seen{$e}=1;
if (-d $e)
{
my @entries=glob("$e/*");;
&_parse_entries(\@entries,$options,$html_entries);
next;
}
my ($basename,$suffix);
if ($e=~/^(.*)\.([^\.]+)$/)
{
$basename=$1;
$suffix=$2;
}
bin/html2alvis view on Meta::CPAN
$html_entries->{$basename}{htmlF}=$e;
}
}
}
sub _convert_collection
{
my $root_dir=shift;
my $options=shift;
my @entries=glob("$root_dir/*");
my %html_entries=();
%Seen=();
print "Parsing the source directory entries...\r";
&_parse_entries(\@entries,$options,\%html_entries);
print " \r";
for my $base_name (keys %html_entries)
{
my ($meta_txt,$html_txt);
bin/html2plain view on Meta::CPAN
for my $e (@$entries)
{
if ($Seen{$e})
{
next;
}
$Seen{$e}=1;
if (-d $e)
{
my @entries=glob("$e/*");;
&_parse_entries(\@entries,$options,$html_entries);
next;
}
my ($basename,$suffix);
if ($e=~/^(.*)\.([^\.]+)$/)
{
$basename=$1;
$suffix=$2;
}
bin/html2plain view on Meta::CPAN
$html_entries->{$basename}{htmlF}=$e;
}
}
}
sub _convert_collection
{
my $root_dir=shift;
my $options=shift;
my @entries=glob("$root_dir/*");
my %html_entries=();
%Seen=();
print "Parsing the source directory entries...\r";
&_parse_entries(\@entries,$options,\%html_entries);
print " \r";
for my $base_name (keys %html_entries)
{
my ($html_txt,$plain_txt,$header);
bin/news_xml2alvis view on Meta::CPAN
for my $e (@$entries)
{
if ($Seen{$e})
{
next;
}
$Seen{$e}=1;
if (-d $e)
{
my @entries=glob("$e/*");;
&_parse_entries(\@entries,$options,$news_xml_entries);
next;
}
my ($basename,$suffix);
if ($e=~/^(.*)\.([^\.]+)$/)
{
$basename=$1;
$suffix=$2;
}
bin/news_xml2alvis view on Meta::CPAN
$news_xml_entries->{$basename}{origF}=$e;
}
}
}
sub _convert_collection
{
my $root_dir=shift;
my $options=shift;
my @entries=glob("$root_dir/*");
my %news_xml_entries=();
%Seen=();
print "Parsing the source directory entries...\r";
&_parse_entries(\@entries,$options,\%news_xml_entries);
print " \r";
for my $base_name (keys %news_xml_entries)
{
my ($meta_txt,$xml_txt);
lib/Alvis/Convert.pm view on Meta::CPAN
return 1;
}
#
# output_cb: [\&_output_wikipedia_article,$arg1,$arg2,...]
# will be called like this:
# _output_wikipedia_article($arg1,$arg2,...,
# $title,$output_format,
# $record_txt,$is_redir)
#
# where $output_format is a global defined in Alvis::Wikipedia::XMLDump
# as $OUTPUT_*
#
#
# progress_cb: [\&_wikipedia_progress,$arg1,$arg2,...] OPTIONAL
# will be called like this:
# _wikipedia_progress($arg1,$arg2,...,
# $prog_txt,$N,$n,$mess)
#
# where $N is the total number of records processed and $n the number of hits
#
t/test-data/original/0/101.alvis view on Meta::CPAN
<documentRecord id="0717FBB236A4A067DC9BE4FA48801BE3">
<acquisition>
<acquisitionData>
<modifiedDate>1141065614536</modifiedDate>
<httpServer>Apache/1.3.34 (Unix) DAV/1.0.3 mod_auth_passthrough/1.8 mod_log_bytes/1.2 mod_bwlimited/1.4 PHP/4.4.1 FrontPage/5.0.2.2635 mod_ssl/2.8.25 OpenSSL/0.9.7a</httpServer>
<urls>
<url>http://battellemedia.com/archives/2004_08.php</url>
</urls>
</acquisitionData>
<canonicalDocument>
<section>I'm slow to report the news here (the embargo lifted last night at 9 pm) but today Yahoo launched its local search product. I was on an informal "advisory board" for this product, but I have to admit that my focus on the book did not...
<metaData>
<meta name="title">John Battelle's Searchblog: August 2004 Archives</meta>
<meta name="dc.date">2004-08-03</meta>
<meta name="dc.type">text/html</meta>
</metaData>
<links>
<outlinks>
<link type="a">
<anchorText>CNET coverage...</anchorText>
<location>http://news.com.com/Yahoo%2C+Ask+Jeeves+out+to+lure+locals/2100-1024_3-5294125.html</location>
t/test-data/to-split/29.xml view on Meta::CPAN
<documentRecord id="18C9FD35812DFC4D4CCF0FD6AC1646BC" xmlns="http://alvis.info/enriched/">
<acquisition>
<acquisitionData>
<modifiedDate>1149133052555</modifiedDate>
<httpServer>Apache/1.3.33 (Unix)</httpServer>
<urls>
<url>http://blog.outer-court.com/archive/2006-05-30-n12.html</url>
</urls>
</acquisitionData>
<canonicalDocument>
<section>Some bloggers are complaining that Google didnât have a Memorial day logo yesterday. Memorial Day âcommemorates U.S. men and women who have died in military service,âWikipedia explains. From a comment at Newsbusters by Warner T...
<metaData>
<meta name="title">Complaints Due to Lack of Google Memorial Day Logo</meta>
<meta name="dc:date">Thu, 01 Jun 2006 02:44:56 GMT</meta>
<meta name="dc:type">text/html</meta>
</metaData>
<links>
<outlinks>
<link type="a">
<anchorText>itâs good the way it is</anchorText>
<location>http://blog.lewrockwell.com/lewrw/archives/010666.html</location>