Alvis-Convert
view release on metacpan or search on metacpan
bin/html2plain view on Meta::CPAN
Sets the number of records per output directory. Default value: 1000.
=item B<--source-encoding>
Specifies the encoding of the HTML files. Default value: undef,
which means that the encoding is guessed for each document.
=item B<--[no]assert-html>
Specifies whether to check that the document actually looks like
HTML before trying to convert it. Default: yes.
=item B<--[no]symbolic-char-entities-to-chars>
Specifies whether symbolic character entities are converted to
UTF-8 characters. Default: yes.
=item B<--[no]numerical-char-entities-to-chars>
Specifies whether numerical character entities are converted to
lib/Alvis/Document/Type.pm view on Meta::CPAN
{
($type,$sub_type)=split(/\//,$mime_type,-1);
}
# If the result is a generic one, check for our types of interest
# by other means
# BTW, File::Type should make it clear and checkable what its
# "I dunno" reply is
if ($type eq 'application' && $sub_type eq 'octet-stream')
{
if ($self->_looks_like_HTML($text))
{
($type,$sub_type)=('text','html');
}
elsif ($self->_looks_like_RSS($text))
{
# not a MIME type
($type,$sub_type)=('text','rss')
}
}
return ($type,$sub_type);
}
sub _looks_like_HTML
{
my $self=shift;
my $text=shift;
#
# If we're lucky...
#
if ($text=~/<!DOCTYPE\s+(\S+)/isgo)
{
my $type=$1;
lib/Alvis/Document/Type.pm view on Meta::CPAN
# signature start tag will do.
#
if ($text=~/<(?:(?i)html|body)\W/sgo)
{
return 1;
}
return 0;
}
sub _looks_like_RSS
{
my $self=shift;
my $text=shift;
#
# If we're lucky...
#
if ($text=~/<!DOCTYPE\s+(\S+)/isgo)
{
my $type=$1;
lib/Alvis/HTML.pm view on Meta::CPAN
#############################################################################
#############################################################################
#
# Global variables & constants
#
##############################################################################
# Default on/off flags; each is a yes/no answer encoded as 1 (yes) or 0 (no).
# NOTE(review): presumably these are the fallbacks used when the caller does
# not supply the corresponding option -- confirm against the constructor.
#
# Do we assert that our assumptions about the source hold?
my $DEF_SRC_ASS=1;
# Do we check first to see if the document really looks like HTML
# (before trying to convert it)?
my $DEF_ASSERT_HTML=1;
# Do we pass on even non-HTML documents?
my $DEF_KEEP_ALL=0;
# Do we replace character entities with actual characters?
# NOTE(review): presumably this covers the symbolic (named) entities, as
# opposed to the numerical ones controlled by the next flag -- confirm.
my $DEF_CONVERT_CHAR_ENTS=1;
# Do we replace numerical character entities with actual characters?
my $DEF_CONVERT_NUM_ENTS=0;
# Do we try to clean extra whitespace?
my $DEF_CLEAN_WS=0;
# Source encoding
lib/Alvis/HTML.pm view on Meta::CPAN
{
return ("\n",\%header);
}
else
{
$self->_set_err_state($ERR_EMPTY_DOC);
return (undef,\%header); # signals "do not pass on"
}
}
# Check if this really looks like "HTML"
#
if ($self->{assertHTML})
{
#
# If we're lucky...
#
if ($html=~/<!DOCTYPE\s+(\S+)/isgo)
{
my $type=$1;
if ($type!~/(?:html|wml)/igo)
t/test-data/to-split/29.xml view on Meta::CPAN
<documentRecord id="B4158BE3ACF2447B8B2FF1AFFB5361A0" xmlns="http://alvis.info/enriched/">
<acquisition>
<acquisitionData>
<modifiedDate>1147168350172</modifiedDate>
<httpServer>Apache</httpServer>
<urls>
<url>http://searchenginewatch.com/searchday/article.php/3603301</url>
</urls>
</acquisitionData>
<canonicalDocument>
<section>Paying attention to web metrics is an increasingly important aspect of search marketing, with methodologies, processes and tools that can dramatically lift marketing and business performance. A special report from the Search Engine S...
<metaData>
<meta name="title">Multichannel Metrics: Managing the Sea of Data</meta>
<meta name="dc:type">text/html</meta>
</metaData>
<links>
<outlinks>
<link type="a">
<anchorText> Grantastic Designs, Inc.</anchorText>
<location>http://www.grantasticdesigns.com/</location>
</link>
t/test-data/to-split/29.xml view on Meta::CPAN
<documentRecord id="F3F560D7ED8DE899CD17D9302AADE8EF" xmlns="http://alvis.info/enriched/">
<acquisition>
<acquisitionData>
<modifiedDate>1147377627223</modifiedDate>
<httpServer>Apache/1.3.28 (Unix) mod_gzip/1.3.26.1a PHP/4.3.10 mod_ssl/2.8.15 OpenSSL/0.9.7c</httpServer>
<urls>
<url>http://www.seroundtable.com/archives/003799.html</url>
</urls>
</acquisitionData>
<canonicalDocument>
<section>This morning I described what is Google Co-op, but I also promised I would try to implement an example for this site. Well, we have implemented phase one of Google Co-op subscription links for this site. You can subscribe to the coop...
<metaData>
<meta name="title">Dynamic Implementation of Google Co-op for Search Engine Roundtable</meta>
<meta name="dc:date">Thu, 11 May 2006 19:35:25 GMT</meta>
<meta name="dc:type">text/html</meta>
</metaData>
<links>
<outlinks>
<link type="a">
<anchorText>subscribe</anchorText>
<location>http://www.google.com/coop/trust/add?user=015090516856763095929&continue=http://www.google.com/coop/profile?user=015090516856763095929&sig=Y_aOf96WG5HGmgVEImc3p144xnXGY=</location>
( run in 0.386 second using v1.01-cache-2.11-cpan-64827b87656 )