Alvis-Convert

 view release on metacpan or  search on metacpan

bin/alvisXMLmerge  view on Meta::CPAN

# TODO: move this to Alvis::Utils
# Derive the output path (and, unless a single extra file was given, the
# matching extra-data path) for one original input file, and make sure the
# directory that will hold the output file exists.
#
# Parameter:
#   $orig_filename - path of the original file; a trailing ".gz" and/or
#                    ".bz2" suffix is dropped from the derived names.
# Reads the file-level settings $orig_dir, $out_dir, $extra_dir and
# $extra_file.
# Returns: the list ($extra_filename, $out_filename).
sub mk_paths
{
    my $orig_filename  = shift;
    my $uorig_filename = $orig_filename;
    $uorig_filename =~ s/\.bz2$//;
    $uorig_filename =~ s/\.gz$//;
    my $extra_filename = $uorig_filename;
    my $out_filename   = $uorig_filename;

    # \Q...\E quotes every regex metacharacter of the source directory, so
    # names containing '.', '(', '[' etc. are matched literally.  The
    # replacement side is a plain string and must NOT be escaped, otherwise
    # the backslashes end up in the resulting path.
    $out_filename   =~ s/\Q$orig_dir\E/$out_dir/;
    $extra_filename =~ s/\Q$orig_dir\E/$extra_dir/
      unless (defined($extra_file));

    # Only trust $1 when the match succeeded; a path without a directory
    # part needs no directory to be created.
    if ($out_filename =~ /(.+)\/(.+?)$/)
    {
        my $target_dir = $1;
        mkpath($target_dir) unless (-e $target_dir);
    }

    return ($extra_filename, $out_filename);
}

# TODO: move this to Alvis::Utils
# Return a copy of the given string in which each of the characters
# ' | " + ? * is preceded by a backslash.  All other characters, including
# backslashes already present, are passed through unchanged.
sub escape
{
    my ($text) = @_;

    # Copy first, then substitute on the copy and hand that back.
    (my $escaped = $text) =~ s{(['|"\+?*])}{\\$1}g;

    return $escaped;
}

################################################################################
sub read_extra_file
{
    my $filename = shift;

lib/Alvis/HTML.pm  view on Meta::CPAN

                       is in any of the recognized dialects.
    keepAll            if 1, pass all documents on regardless of
                       their HTMLness. Non-HTML goes forward as '\n'.

 Options specifying HTML subsets whose tags are removed (set an option to a defined value to enable it):

    alvisKeep          W3's HTML 4.01 tags Alvis::Convert
                       is interested in
    alvisRemove        4.01 tags Alvis::Convert is NOT interested in
    obsolete           HTML <4.01
    proprietary        Net-escape,Exploder,...
    xhtml              XHTML 1.1
    wml                WML

     Note: alvisKeep + alvisRemove == remove all HTML 4.01 tags

    convertCharEnts    convert symbolic character entities to UTF-8 characters.
    convertNumEnts     convert numerical character entities to UTF-8 
                       characters.  

    sourceEncoding     encoding of the source HTML text (default: 'utf-8')

t/test-data/to-split/29.xml  view on Meta::CPAN

<documentRecord id="3DAB2F05CBCFBD7765C7E71C63E6FFE8" xmlns="http://alvis.info/enriched/">
    <acquisition>
      <acquisitionData>
        <modifiedDate>1145563212583</modifiedDate>
        <httpServer>Apache/2.0</httpServer>
        <urls>
          <url>http://google.weblogsinc.com/2006/04/20/google-has-been-testing-google-base-in-search-results/</url>
        </urls>
      </acquisitionData>
      <canonicalDocument>        
        <section>Google is always hard at work fine tuning and trying out new search strategies. Apparantly Google is now hard at work integrating Google Base car searches into the organic results says Jason Dowdell. He recently came across a car sea...
      <metaData>
        <meta name="title">Google has been Testing Google Base in Search Results</meta>
        <meta name="dc:type">text/html</meta>
      </metaData>
      <links>
        <outlinks>
          <link type="a">
            <anchorText>Google Base</anchorText>
            <location>http://base.google.com/</location>
          </link>
          <link type="a">
            <anchorText>ford escape</anchorText>
            <location>http://www.marketingshift.com/images/google-car-search.jpg</location>
          </link>
          <link type="a">
            <anchorText>Jason Dowdell</anchorText>
            <location>http://www.marketingshift.com/2006/04/google-base-car-search-introduced-in.cfm</location>
          </link>
        </outlinks>
      </links>
    </acquisition>
  <linguisticAnalysis>



( run in 0.381 second using v1.01-cache-2.11-cpan-c21f80fb71c )