Alvis-Convert

view release on metacpan or search on metacpan

lib/Alvis/Canonical.pm  view on Meta::CPAN

#########################################################################
#
#      Private methods
#
######################################################################

sub _contents2canDoc
{
    my $self=shift;
    my $contents=shift; # contains relevant HTML markup
    my $header=shift;   # will be updated with information like links
    my $source_encoding=shift;    

    my $can_doc;

    if ($DEBUG)
    {
	open(F,">candoc.cleanNXMLSafe");
	print F $contents;
	close(F);
    }

t/test-data/to-split/29.xml  view on Meta::CPAN

<documentRecord id="48FFC0A03C2756C583F6D80C9E527393" xmlns="http://alvis.info/enriched/">
    <acquisition>
      <acquisitionData>
        <modifiedDate>1142422246164</modifiedDate>
        <httpServer>Apache/1.3.33 (Unix)</httpServer>
        <urls>
          <url>http://blog.outer-court.com/archive/2006-03-15-n42.html</url>
        </urls>
      </acquisitionData>
      <canonicalDocument>        
        <section>Google releases their desktop search tool in an updated version today. Among some bugfixes, there’s a new Quick Search box. Hit Ctrl twice to make it appear in the middle of your desktop, and then search for anything – your compu...
      <metaData>
        <meta name="title">Google Desktop's Quick Search Box</meta>
        <meta name="dc:date">Wed, 15 Mar 2006 11:20:57 GMT</meta>
        <meta name="dc:type">text/html</meta>
      </metaData>
      <links>
        <outlinks>
          <link type="a">
            <anchorText>Quick Search box</anchorText>
            <location>http://desktop.google.com/features.html#quicksearch</location>

t/test-data/to-split/29.xml  view on Meta::CPAN

<documentRecord id="E25E5DBF90E6C6A3CDF200F61F6A20E6" xmlns="http://alvis.info/enriched/">
    <acquisition>
      <acquisitionData>
        <modifiedDate>1150315246240</modifiedDate>
        <httpServer>Apache/1.3.36 (Unix) mod_fastcgi/2.4.2 mod_auth_passthrough/1.8 mod_log_bytes/1.2 mod_bwlimited/1.4 PHP/4.4.2 FrontPage/5.0.2.2635.SR1.2 mod_ssl/2.8.27 OpenSSL/0.9.7a</httpServer>
        <urls>
          <url>http://www.searchenginejournal.com/?p=3530</url>
        </urls>
      </acquisitionData>
      <canonicalDocument>        
        <section>RSS - Things That Make You Go Hmmm Why doesn’t the new Yahoo Spark Blog publish an RSS feed? Of any kind? Not even an “add to my Yahoo” button? Why can’t I subscribe to the Technorati Hot Tags widget that’s (supposedly) upd...
      <metaData>
        <meta name="title">RSS - Things That Make You Go Hmmm</meta>
        <meta name="dc:type">text/html; charset=utf-8</meta>
      </metaData>
      <links>
        <outlinks>
          <link type="a">
            <anchorText>Technorati Hot Tags</anchorText>
            <location>http://www.technorati.com/tags/</location>
          </link>



( run in 0.251 seconds using v1.01-cache-2.11-cpan-05444aca049 )