Alvis-Convert

 view release on metacpan or  search on metacpan

bin/ainodump2alvis  view on Meta::CPAN

    for my $e (@$entries)
    {
	if ($Seen{$e})
	{
	    next;
	}
	
	$Seen{$e}=1;
	if (-d $e)
	{
	    my @entries=glob("$e/*");;
	    &_parse_entries(\@entries,$options,$aino_entries);
	    next;
	}

	my ($basename,$suffix);
	if ($e=~/^(.*)\.([^\.]+)$/)
	{
	    $basename=$1;
	    $suffix=$2;
	}

bin/ainodump2alvis  view on Meta::CPAN

	    $aino_entries->{$basename}{ainoF}=$e;
	}
    }
}

sub _convert_collection
{
    my $root_dir=shift;
    my $options=shift;

    my @entries=glob("$root_dir/*");
    my %dump_entries=();
    %Seen=();
    print "Parsing the source directory entries...\r";
    &_parse_entries(\@entries,$options,\%dump_entries);	
    print "                                       \r";

    for my $base_name (keys %dump_entries)
    {
	if (!exists($dump_entries{$base_name}{ainoF}))
	{

bin/alvisXMLmerge  view on Meta::CPAN

use Encode;
use File::Copy;
use File::Path;

use encoding 'utf8';
use open ':utf8';
use Time::HiRes qw(gettimeofday tv_interval);

use Alvis::Utils qw(absolutize_path open_file get_files);

####################### global vars
my $VERBOSE = 1;
my $DEBUG   = 0;

################################################################################
# main sub

# TODO: optimize regexp
# TODO: performance
my ($orig_dir, $extra_dir, $out_dir, $config_file, $bzip2, $extra_file) =
  read_params();

bin/alvisXSL  view on Meta::CPAN

    return &nextfile();
  } 
  $nf = shift(@files);
  #print STDERR "Got $nf\n";
  if ( !$nf ) {
    return $nf;
  }
  if ( -d $nf ) {
    #print STDERR "Is dir\n";
    if ( $usedir ) {
      @dirfiles = sort(glob("$nf/*")); 
      $withdir = $nf;
      $usingdir = 1;
      return &nextfile();
    } else {
      #print STDERR "Open on $nf failed\n";
      return &nextfile();
    }
  }
  if ( -f $nf ) {
    #print STDERR "Done\n";

bin/alvis_wikipedia_add_cats  view on Meta::CPAN

    for my $e (@$entries)
    {
	if ($Seen{$e})
	{
	    next;
	}
	
	$Seen{$e}=1;
	if (-d $e)
	{
	    my @entries=glob("$e/*");;
	    &_parse_entries(\@entries,$options,$alvis_entries);
	    next;
	}

	my ($basename,$suffix);
	if ($e=~/^(.*)\.([^\.]+)$/)
	{
	    $basename=$1;
	    $suffix=$2;
	}

bin/alvis_wikipedia_add_cats  view on Meta::CPAN

	    $alvis_entries->{$basename}{alvisF}=$e;
	}
    }
}

sub _add_cats_to_collection
{
    my $root_dir=shift;
    my $options=shift;

    my @entries=glob("$root_dir/*");
    my %alvis_entries=();
    %Seen=();
    print "Parsing the source directory entries...\r";
    &_parse_entries(\@entries,$options,\%alvis_entries);	
    print "                                       \r";

    for my $base_name (keys %alvis_entries)
    {
	my $alvisXML;
	

bin/html2alvis  view on Meta::CPAN

    for my $e (@$entries)
    {
	if ($Seen{$e})
	{
	    next;
	}
	
	$Seen{$e}=1;
	if (-d $e)
	{
	    my @entries=glob("$e/*");;
	    &_parse_entries(\@entries,$options,$html_entries);
	    next;
	}

	my ($basename,$suffix);
	if ($e=~/^(.*)\.([^\.]+)$/)
	{
	    $basename=$1;
	    $suffix=$2;
	}

bin/html2alvis  view on Meta::CPAN

	    $html_entries->{$basename}{htmlF}=$e;
	}
    }
}

sub _convert_collection
{
    my $root_dir=shift;
    my $options=shift;

    my @entries=glob("$root_dir/*");
    my %html_entries=();
    %Seen=();
    print "Parsing the source directory entries...\r";
    &_parse_entries(\@entries,$options,\%html_entries);	
    print "                                       \r";

    for my $base_name (keys %html_entries)
    {
	my ($meta_txt,$html_txt);

bin/html2plain  view on Meta::CPAN

    for my $e (@$entries)
    {
	if ($Seen{$e})
	{
	    next;
	}
	
	$Seen{$e}=1;
	if (-d $e)
	{
	    my @entries=glob("$e/*");;
	    &_parse_entries(\@entries,$options,$html_entries);
	    next;
	}

	my ($basename,$suffix);
	if ($e=~/^(.*)\.([^\.]+)$/)
	{
	    $basename=$1;
	    $suffix=$2;
	}

bin/html2plain  view on Meta::CPAN

	    $html_entries->{$basename}{htmlF}=$e;
	}
    }
}

sub _convert_collection
{
    my $root_dir=shift;
    my $options=shift;

    my @entries=glob("$root_dir/*");
    my %html_entries=();
    %Seen=();
    print "Parsing the source directory entries...\r";
    &_parse_entries(\@entries,$options,\%html_entries);	
    print "                                       \r";

    for my $base_name (keys %html_entries)
    {
	my ($html_txt,$plain_txt,$header);

bin/news_xml2alvis  view on Meta::CPAN

    for my $e (@$entries)
    {
	if ($Seen{$e})
	{
	    next;
	}
	
	$Seen{$e}=1;
	if (-d $e)
	{
	    my @entries=glob("$e/*");;
	    &_parse_entries(\@entries,$options,$news_xml_entries);
	    next;
	}

	my ($basename,$suffix);
	if ($e=~/^(.*)\.([^\.]+)$/)
	{
	    $basename=$1;
	    $suffix=$2;
	}

bin/news_xml2alvis  view on Meta::CPAN

	    $news_xml_entries->{$basename}{origF}=$e;
	}
    }
}

sub _convert_collection
{
    my $root_dir=shift;
    my $options=shift;

    my @entries=glob("$root_dir/*");
    my %news_xml_entries=();
    %Seen=();
    print "Parsing the source directory entries...\r";
    &_parse_entries(\@entries,$options,\%news_xml_entries);	
    print "                                       \r";

    for my $base_name (keys %news_xml_entries)
    {
	my ($meta_txt,$xml_txt);

lib/Alvis/Convert.pm  view on Meta::CPAN

    return 1;
}

#
# output_cb: [\&_output_wikipedia_article,$arg1,$arg2,...]
#               will be called like this:
#          _output_wikipedia_article($arg1,$arg2,...,
#                                    $title,$output_format,
#                                    $record_txt,$is_redir)
#
#  where $output_format is one of the $OUTPUT_* globals defined in
#  Alvis::Wikipedia::XMLDump
#
#
# progress_cb: [\&_wikipedia_progress,$arg1,$arg2,...]     OPTIONAL
#               will be called like this:
#          _wikipedia_progress($arg1,$arg2,...,
#                              $prog_txt,$N,$n,$mess)
#
#   where $N is the total number of records processed and $n the number of hits
#

t/test-data/original/0/101.alvis  view on Meta::CPAN

  <documentRecord id="0717FBB236A4A067DC9BE4FA48801BE3">
    <acquisition>
      <acquisitionData>
        <modifiedDate>1141065614536</modifiedDate>
        <httpServer>Apache/1.3.34 (Unix) DAV/1.0.3 mod_auth_passthrough/1.8 mod_log_bytes/1.2 mod_bwlimited/1.4 PHP/4.4.1 FrontPage/5.0.2.2635 mod_ssl/2.8.25 OpenSSL/0.9.7a</httpServer>
        <urls>
          <url>http://battellemedia.com/archives/2004_08.php</url>
        </urls>
      </acquisitionData>
      <canonicalDocument>        
        <section>I'm slow to report the news here (the embargo lifted last night at 9 pm) but today Yahoo launched its local search product. I was on an informal "advisory board" for this product, but I have to admit that my focus on the book did not...
      <metaData>
        <meta name="title">John Battelle's Searchblog: August 2004 Archives</meta>
        <meta name="dc.date">2004-08-03</meta>
        <meta name="dc.type">text/html</meta>
      </metaData>
      <links>
        <outlinks>
          <link type="a">
            <anchorText>CNET coverage...</anchorText>
            <location>http://news.com.com/Yahoo%2C+Ask+Jeeves+out+to+lure+locals/2100-1024_3-5294125.html</location>

t/test-data/to-split/29.xml  view on Meta::CPAN

<documentRecord id="18C9FD35812DFC4D4CCF0FD6AC1646BC" xmlns="http://alvis.info/enriched/">
    <acquisition>
      <acquisitionData>
        <modifiedDate>1149133052555</modifiedDate>
        <httpServer>Apache/1.3.33 (Unix)</httpServer>
        <urls>
          <url>http://blog.outer-court.com/archive/2006-05-30-n12.html</url>
        </urls>
      </acquisitionData>
      <canonicalDocument>        
        <section>Some bloggers are complaining that Google didn’t have a Memorial day logo yesterday. Memorial Day “commemorates U.S. men and women who have died in military service,”Wikipedia explains. From a comment at Newsbusters by Warner T...
      <metaData>
        <meta name="title">Complaints Due to Lack of Google Memorial Day Logo</meta>
        <meta name="dc:date">Thu, 01 Jun 2006 02:44:56 GMT</meta>
        <meta name="dc:type">text/html</meta>
      </metaData>
      <links>
        <outlinks>
          <link type="a">
            <anchorText>it’s good the way it is</anchorText>
            <location>http://blog.lewrockwell.com/lewrw/archives/010666.html</location>



( run in 0.690 second using v1.01-cache-2.11-cpan-49f99fa48dc )