Alvis-Convert
view release on metacpan or search on metacpan
bin/alvisXMLmerge view on Meta::CPAN
# TODO: move this to Alvis::Utils
# Derive the output path and the "extra" path that correspond to an
# original input file, and make sure the output directory exists.
#
# Argument:
#   $orig_filename - path of the original input file; a trailing ".bz2"
#                    and then a trailing ".gz" suffix are stripped.
#
# Uses variables defined outside this sub: $orig_dir, $out_dir,
# $extra_dir and $extra_file.
#
# Returns the list ($extra_filename, $out_filename):
#   $out_filename   - the uncompressed name with the first occurrence of
#                     $orig_dir replaced by $out_dir.
#   $extra_filename - the same with $extra_dir, unless $extra_file is
#                     defined, in which case the name is left unmapped.
#
# Side effect: creates the directory part of $out_filename with mkpath()
# when it does not exist yet.
sub mk_paths
{
    my $orig_filename = shift;

    # Name of the file once any compression suffix has been removed.
    my $uorig_filename = $orig_filename;
    $uorig_filename =~ s/\.bz2$//;
    $uorig_filename =~ s/\.gz$//;

    my $extra_filename = $uorig_filename;
    my $out_filename = $uorig_filename;

    # \Q...\E quotes every regex metacharacter in the directory name, so
    # it is matched literally. The replacement side is an ordinary
    # interpolated string and must NOT be escaped, otherwise the
    # backslashes would end up in the resulting path.
    $out_filename =~ s/\Q$orig_dir\E/$out_dir/;
    if (!defined($extra_file))
    {
        $extra_filename =~ s/\Q$orig_dir\E/$extra_dir/;
    }

    # Only create a directory when the output path really has a directory
    # component; otherwise the capture would be undefined or stale.
    if ($out_filename =~ m{(.+)/.+?$})
    {
        my $target_dir = $1;
        mkpath($target_dir) unless (-e $target_dir);
    }

    return ($extra_filename, $out_filename);
}
# TODO: move this to Alvis::Utils
# Return a copy of the argument in which each of the characters
# ' | " + ? * is preceded by a backslash. All other characters are
# passed through unchanged.
sub escape
{
    my ($text) = @_;

    $text =~ s{(['|"+?*])}{\\$1}g;

    return $text;
}
################################################################################
sub read_extra_file
{
my $filename = shift;
lib/Alvis/HTML.pm view on Meta::CPAN
is in any of the recognized dialects.
keepAll if 1, pass all documents on regardless of
their HTMLness. Non-HTML goes forward as '\n'.
Options selecting the HTML tag subsets to remove (set an option to any defined value to enable it):
alvisKeep W3's HTML 4.01 tags Alvis::Convert
is interested in
alvisRemove 4.01 tags Alvis::Convert is NOT interested in
obsolete HTML <4.01
proprietary Net-escape,Exploder,...
xhtml XHTML 1.1
wml WML
Note: alvisKeep + alvisRemove == remove all HTML 4.01 tags
convertCharEnts convert symbolic character entities to UTF-8 characters.
convertNumEnts convert numerical character entities to UTF-8
characters.
sourceEncoding encoding of the source HTML text (default: 'utf-8')
t/test-data/to-split/29.xml view on Meta::CPAN
<documentRecord id="3DAB2F05CBCFBD7765C7E71C63E6FFE8" xmlns="http://alvis.info/enriched/">
<acquisition>
<acquisitionData>
<modifiedDate>1145563212583</modifiedDate>
<httpServer>Apache/2.0</httpServer>
<urls>
<url>http://google.weblogsinc.com/2006/04/20/google-has-been-testing-google-base-in-search-results/</url>
</urls>
</acquisitionData>
<canonicalDocument>
<section>Google is always hard at work fine tuning and trying out new search strategies. Apparantly Google is now hard at work integrating Google Base car searches into the organic results says Jason Dowdell. He recently came across a car sea...
<metaData>
<meta name="title">Google has been Testing Google Base in Search Results</meta>
<meta name="dc:type">text/html</meta>
</metaData>
<links>
<outlinks>
<link type="a">
<anchorText>Google Base</anchorText>
<location>http://base.google.com/</location>
</link>
<link type="a">
<anchorText>ford escape</anchorText>
<location>http://www.marketingshift.com/images/google-car-search.jpg</location>
</link>
<link type="a">
<anchorText>Jason Dowdell</anchorText>
<location>http://www.marketingshift.com/2006/04/google-base-car-search-introduced-in.cfm</location>
</link>
</outlinks>
</links>
</acquisition>
<linguisticAnalysis>
( run in 0.381 second using v1.01-cache-2.11-cpan-c21f80fb71c )