Alvis-Convert

 view release on metacpan or  search on metacpan

bin/html2plain  view on Meta::CPAN


    Sets the number of records per output directory. Default value: 1000.

=item B<--source-encoding>

    Specifies the encoding of the HTML files. Default value: undef,
    which means that the encoding is guessed for each document.

=item B<--[no]assert-html>

    Specifies whether to check that the document actually looks like
    HTML before trying to convert it. Default: yes.

=item B<--[no]symbolic-char-entities-to-chars>

    Specifies whether symbolic character entities are converted to 
    UTF-8 characters. Default: yes.

=item B<--[no]numerical-char-entities-to-chars>

    Specifies whether numerical character entities are converted to 

lib/Alvis/Document/Type.pm  view on Meta::CPAN

    {
	($type,$sub_type)=split(/\//,$mime_type,-1);
    }

    # If the result is a generic one, check for our types of interest
    # by other means
    # BTW, File::Type should make it clear and checkable what its
    # "I dunno" reply is
    if ($type eq 'application' && $sub_type eq 'octet-stream')
    {
	if ($self->_looks_like_HTML($text))
	{
	    ($type,$sub_type)=('text','html');
	}
	elsif ($self->_looks_like_RSS($text))
	{
	    # not a MIME type
	    ($type,$sub_type)=('text','rss')
	}
    }

    return ($type,$sub_type);
}


sub _looks_like_HTML
{
    my $self=shift;
    my $text=shift;

    #
    # If we're lucky...
    #
    if ($text=~/<!DOCTYPE\s+(\S+)/isgo)
    {
	my $type=$1;

lib/Alvis/Document/Type.pm  view on Meta::CPAN

    # signature start tag will do. 
    #
    if ($text=~/<(?:(?i)html|body)\W/sgo)
    {
	return 1;
    }

    return 0;
}

sub _looks_like_RSS
{
    my $self=shift;
    my $text=shift;

    #
    # If we're lucky...
    #
    if ($text=~/<!DOCTYPE\s+(\S+)/isgo)
    {
	my $type=$1;

lib/Alvis/HTML.pm  view on Meta::CPAN

#############################################################################

#############################################################################
#
#     Global variables & constants
#
##############################################################################

# File-scoped boolean flags (1 = yes, 0 = no).
# NOTE(review): the DEF_ prefix suggests these are the default values for
# the corresponding converter options -- confirm against the constructor.

# Do we assert that our assumptions about the source hold?
my $DEF_SRC_ASS=1;
# Do we check first to see if the document really looks like HTML?
my $DEF_ASSERT_HTML=1;
# Do we pass on even non-HTML documents?
my $DEF_KEEP_ALL=0;
# Do we replace character entities with actual characters?
# NOTE(review): presumably this covers the symbolic (named) entities, since
# numerical entities have their own flag below -- confirm.
my $DEF_CONVERT_CHAR_ENTS=1;
# Do we replace numerical character entities with actual characters?
my $DEF_CONVERT_NUM_ENTS=0;
# Do we try to clean extra whitespace?
my $DEF_CLEAN_WS=0;
# Source encoding

lib/Alvis/HTML.pm  view on Meta::CPAN

	{
	    return ("\n",\%header);
	}
	else
	{
	    $self->_set_err_state($ERR_EMPTY_DOC);
	    return (undef,\%header);  # signals "do not pass on"
	}  
    }

    # Check if this really looks like "HTML" 
    #
    if ($self->{assertHTML})
    {
	#
	# If we're lucky...
	#
	if ($html=~/<!DOCTYPE\s+(\S+)/isgo)
	{
	    my $type=$1;
	    if ($type!~/(?:html|wml)/igo)

t/test-data/to-split/29.xml  view on Meta::CPAN

<documentRecord id="B4158BE3ACF2447B8B2FF1AFFB5361A0" xmlns="http://alvis.info/enriched/">
    <acquisition>
      <acquisitionData>
        <modifiedDate>1147168350172</modifiedDate>
        <httpServer>Apache</httpServer>
        <urls>
          <url>http://searchenginewatch.com/searchday/article.php/3603301</url>
        </urls>
      </acquisitionData>
      <canonicalDocument>        
        <section>Paying attention to web metrics is an increasingly important aspect of search marketing, with methodologies, processes and tools that can dramatically lift marketing and business performance. A special report from the Search Engine S...
      <metaData>
        <meta name="title">Multichannel Metrics: Managing the Sea of Data</meta>
        <meta name="dc:type">text/html</meta>
      </metaData>
      <links>
        <outlinks>
          <link type="a">
            <anchorText> Grantastic Designs, Inc.</anchorText>
            <location>http://www.grantasticdesigns.com/</location>
          </link>

t/test-data/to-split/29.xml  view on Meta::CPAN

<documentRecord id="F3F560D7ED8DE899CD17D9302AADE8EF" xmlns="http://alvis.info/enriched/">
    <acquisition>
      <acquisitionData>
        <modifiedDate>1147377627223</modifiedDate>
        <httpServer>Apache/1.3.28 (Unix) mod_gzip/1.3.26.1a PHP/4.3.10 mod_ssl/2.8.15 OpenSSL/0.9.7c</httpServer>
        <urls>
          <url>http://www.seroundtable.com/archives/003799.html</url>
        </urls>
      </acquisitionData>
      <canonicalDocument>        
        <section>This morning I described what is Google Co-op, but I also promised I would try to implement an example for this site. Well, we have implemented phase one of Google Co-op subscription links for this site. You can subscribe to the coop...
      <metaData>
        <meta name="title">Dynamic Implementation of Google Co-op for Search Engine Roundtable</meta>
        <meta name="dc:date">Thu, 11 May 2006 19:35:25 GMT</meta>
        <meta name="dc:type">text/html</meta>
      </metaData>
      <links>
        <outlinks>
          <link type="a">
            <anchorText>subscribe</anchorText>
            <location>http://www.google.com/coop/trust/add?user=015090516856763095929&amp;continue=http://www.google.com/coop/profile?user=015090516856763095929&amp;sig=Y_aOf96WG5HGmgVEImc3p144xnXGY=</location>



( run in 0.386 second using v1.01-cache-2.11-cpan-64827b87656 )