Alvis-Convert

 view release on metacpan or  search on metacpan

t/test-data/to-split/29.xml  view on Meta::CPAN

          <link type="a">
            <anchorText>Yahoo News</anchorText>
            <location>http://news.yahoo.com/</location>
          </link>
          <link type="a">
            <anchorText>stock quote service</anchorText>
            <location>http://www.google.com/help/features.html#stock</location>
          </link>
          <link type="a">
            <anchorText>Google Blog 
Search</anchorText>
            <location>http://blogsearch.google.com/</location>
          </link>
          <link type="a">
            <anchorText>Yahoo Finance</anchorText>
            <location>http://finance.yahoo.com/</location>
          </link>
          <link type="a">
            <anchorText>Google News</anchorText>
            <location>http://news.google.com/</location>
          </link>
        </outlinks>
      </links>
    </acquisition>
  <linguisticAnalysis>
    <semantic_unit_level>
      <semantic_unit><named_entity><form>Chris Sherman</form><named_entity_type>person</named_entity_type></named_entity></semantic_unit>
      <semantic_unit><named_entity><form>Yahoo</form><named_entity_type>comp</named_entity_type></named_entity></semantic_unit>
      <semantic_unit><named_entity><form>NetRatings</form><named_entity_type>comp</named_entity_type></named_entity></semantic_unit>
      <semantic_unit><named_entity><form>Google</form><named_entity_type>comp</named_entity_type></named_entity></semantic_unit>
      <semantic_unit><named_entity><form>Moreover</form><named_entity_type>comp</named_entity_type></named_entity></semantic_unit>
      <semantic_unit><named_entity><form>MSN</form><named_entity_type>comp</named_entity_type></named_entity></semantic_unit>
      <semantic_unit><named_entity><form>Blogger</form><named_entity_type>comp</named_entity_type></named_entity></semantic_unit>
      <semantic_unit><named_entity><form>Google Finance</form><named_entity_type>soft</named_entity_type></named_entity></semantic_unit>
      <semantic_unit><named_entity><form>Google Toolbar</form><named_entity_type>soft</named_entity_type></named_entity></semantic_unit>
      <semantic_unit><named_entity><form>Gmail</form><named_entity_type>soft</named_entity_type></named_entity></semantic_unit>
      <semantic_unit><named_entity><form>Google</form><named_entity_type>soft</named_entity_type></named_entity></semantic_unit>
      <semantic_unit><named_entity><form>MSN</form><named_entity_type>soft</named_entity_type></named_entity></semantic_unit>
      <semantic_unit><named_entity><form>Yahoo News</form><named_entity_type>soft</named_entity_type></named_entity></semantic_unit>
      <semantic_unit><named_entity><form>Yahoo Finance</form><named_entity_type>soft</named_entity_type></named_entity></semantic_unit>
      <semantic_unit><named_entity><form>Google Blog</form><named_entity_type>soft</named_entity_type></named_entity></semantic_unit>
      <semantic_unit><named_entity><form>MSN MoneyCentral</form><named_entity_type>soft</named_entity_type></named_entity></semantic_unit>
      <semantic_unit><named_entity><form>Flash</form><named_entity_type>soft</named_entity_type></named_entity></semantic_unit>
      <semantic_unit><named_entity><form>way Google</form><named_entity_type>soft</named_entity_type></named_entity></semantic_unit>
      <semantic_unit><named_entity><form>Google Blog Search</form><named_entity_type>soft</named_entity_type></named_entity></semantic_unit>
      <semantic_unit><named_entity><form>Google News</form><named_entity_type>soft</named_entity_type></named_entity></semantic_unit>
    </semantic_unit_level>
  </linguisticAnalysis>

  </documentRecord>
<documentRecord id="48FFC0A03C2756C583F6D80C9E527393" xmlns="http://alvis.info/enriched/">
    <acquisition>
      <acquisitionData>
        <modifiedDate>1142422246164</modifiedDate>
        <httpServer>Apache/1.3.33 (Unix)</httpServer>
        <urls>
          <url>http://blog.outer-court.com/archive/2006-03-15-n42.html</url>
        </urls>
      </acquisitionData>
      <canonicalDocument>        
        <section>Google releases their desktop search tool in an updated version today. Among some bugfixes, there’s a new Quick Search box. Hit Ctrl twice to make it appear in the middle of your desktop, and then search for anything – your compu...
      <metaData>
        <meta name="title">Google Desktop's Quick Search Box</meta>
        <meta name="dc:date">Wed, 15 Mar 2006 11:20:57 GMT</meta>
        <meta name="dc:type">text/html</meta>
      </metaData>
      <links>
        <outlinks>
          <link type="a">
            <anchorText>Quick Search box</anchorText>
            <location>http://desktop.google.com/features.html#quicksearch</location>
          </link>
        </outlinks>
      </links>
    </acquisition>
  <linguisticAnalysis>
    <semantic_unit_level>
      <semantic_unit><named_entity><form>Brin</form><named_entity_type>person</named_entity_type></named_entity></semantic_unit>
      <semantic_unit><named_entity><form>Google</form><named_entity_type>comp</named_entity_type></named_entity></semantic_unit>
      <semantic_unit><named_entity><form>Google</form><named_entity_type>soft</named_entity_type></named_entity></semantic_unit>
      <semantic_unit><named_entity><form>Google Desktop</form><named_entity_type>soft</named_entity_type></named_entity></semantic_unit>
    </semantic_unit_level>
  </linguisticAnalysis>

  </documentRecord>
<documentRecord id="18C9FD35812DFC4D4CCF0FD6AC1646BC" xmlns="http://alvis.info/enriched/">
    <acquisition>
      <acquisitionData>
        <modifiedDate>1149133052555</modifiedDate>
        <httpServer>Apache/1.3.33 (Unix)</httpServer>
        <urls>
          <url>http://blog.outer-court.com/archive/2006-05-30-n12.html</url>
        </urls>
      </acquisitionData>
      <canonicalDocument>        
        <section>Some bloggers are complaining that Google didn’t have a Memorial day logo yesterday. Memorial Day “commemorates U.S. men and women who have died in military service,”Wikipedia explains. From a comment at Newsbusters by Warner T...
      <metaData>
        <meta name="title">Complaints Due to Lack of Google Memorial Day Logo</meta>
        <meta name="dc:date">Thu, 01 Jun 2006 02:44:56 GMT</meta>
        <meta name="dc:type">text/html</meta>
      </metaData>
      <links>
        <outlinks>
          <link type="a">
            <anchorText>it’s good the way it is</anchorText>
            <location>http://blog.lewrockwell.com/lewrw/archives/010666.html</location>
          </link>
          <link type="a">
            <anchorText>Wikipedia</anchorText>
            <location>http://en.wikipedia.org/wiki/Memorial_Day</location>
          </link>
          <link type="a">
            <anchorText>a comment at Newsbusters</anchorText>
            <location>http://newsbusters.org/node/5580</location>
          </link>
          <link type="a">
            <anchorText>Some bloggers</anchorText>
            <location>http://technorati.com/search/google%20memorial</location>
          </link>
        </outlinks>
      </links>

t/test-data/to-split/29.xml  view on Meta::CPAN

      <semantic_unit><named_entity><form>Google</form><named_entity_type>soft</named_entity_type></named_entity></semantic_unit>
      <semantic_unit><named_entity><form>Google Search</form><named_entity_type>soft</named_entity_type></named_entity></semantic_unit>
      <semantic_unit><named_entity><form>Google Video</form><named_entity_type>soft</named_entity_type></named_entity></semantic_unit>
    </semantic_unit_level>
  </linguisticAnalysis>

  </documentRecord>
<documentRecord id="7F0D97BDACC9D73DA79364ADF93A9080" xmlns="http://alvis.info/enriched/">
    <acquisition>
      <acquisitionData>
        <modifiedDate>1144768340466</modifiedDate>
        <httpServer>Apache/1.3.28 (Unix) mod_gzip/1.3.26.1a PHP/4.3.10 mod_ssl/2.8.15 OpenSSL/0.9.7c</httpServer>
        <urls>
          <url>http://www.seroundtable.com/archives/003639.html</url>
        </urls>
      </acquisitionData>
      <canonicalDocument>        
        <section>There is a DigitalPoint Forum thread named that discusses a neat PageRank tool at http://www.webmastereyes.com/. The PageRank tool is different from others, in that it will enable you to plug in a URL and it will then place graphical...
      <metaData>
        <meta name="title">New Google PageRank Tool Plots PR Values Overlays On Page</meta>
        <meta name="dc:date">Tue, 11 Apr 2006 12:40:49 GMT</meta>
        <meta name="dc:type">text/html</meta>
      </metaData>
      <links>
        <outlinks>
          <link type="a">
            <anchorText>http://www.webmastereyes.com/</anchorText>
            <location>http://www.webmastereyes.com/</location>
          </link>
          <link type="a">
            <anchorText>thread</anchorText>
            <location>http://forums.digitalpoint.com/showthread.php?t=74054</location>
          </link>
          <link type="a">
            <anchorText>DigitalPoint Forums</anchorText>
            <location>http://forums.digitalpoint.com/showthread.php?t=74054</location>
          </link>
        </outlinks>
      </links>
    </acquisition>
  <linguisticAnalysis>
    <semantic_unit_level>
      <semantic_unit><named_entity><form>Google</form><named_entity_type>comp</named_entity_type></named_entity></semantic_unit>
      <semantic_unit><named_entity><form>Digital</form><named_entity_type>comp</named_entity_type></named_entity></semantic_unit>
      <semantic_unit><named_entity><form>Google</form><named_entity_type>soft</named_entity_type></named_entity></semantic_unit>
      <semantic_unit><named_entity><form>Google PageRank</form><named_entity_type>soft</named_entity_type></named_entity></semantic_unit>
    </semantic_unit_level>
  </linguisticAnalysis>

  </documentRecord>
<documentRecord id="E25E5DBF90E6C6A3CDF200F61F6A20E6" xmlns="http://alvis.info/enriched/">
    <acquisition>
      <acquisitionData>
        <modifiedDate>1150315246240</modifiedDate>
        <httpServer>Apache/1.3.36 (Unix) mod_fastcgi/2.4.2 mod_auth_passthrough/1.8 mod_log_bytes/1.2 mod_bwlimited/1.4 PHP/4.4.2 FrontPage/5.0.2.2635.SR1.2 mod_ssl/2.8.27 OpenSSL/0.9.7a</httpServer>
        <urls>
          <url>http://www.searchenginejournal.com/?p=3530</url>
        </urls>
      </acquisitionData>
      <canonicalDocument>        
        <section>RSS - Things That Make You Go Hmmm Why doesn’t the new Yahoo Spark Blog publish an RSS feed? Of any kind? Not even an “add to my Yahoo” button? Why can’t I subscribe to the Technorati Hot Tags widget that’s (supposedly) upd...
      <metaData>
        <meta name="title">RSS - Things That Make You Go Hmmm</meta>
        <meta name="dc:type">text/html; charset=utf-8</meta>
      </metaData>
      <links>
        <outlinks>
          <link type="a">
            <anchorText>Technorati Hot Tags</anchorText>
            <location>http://www.technorati.com/tags/</location>
          </link>
          <link type="a">
            <anchorText>eBay</anchorText>
            <location>http://www2.ebay.com/aw/core/200603200913002.html</location>
          </link>
          <link type="a">
            <anchorText>Yahoo Spark Blog</anchorText>
            <location>http://dir.yahoo.com/thespark/240/peek-through-the-pinhole</location>
          </link>
        </outlinks>
      </links>
    </acquisition>
  <linguisticAnalysis>
    <semantic_unit_level>
      <semantic_unit><named_entity><form>Yahoo</form><named_entity_type>comp</named_entity_type></named_entity></semantic_unit>
      <semantic_unit><named_entity><form>Technorati</form><named_entity_type>comp</named_entity_type></named_entity></semantic_unit>
    </semantic_unit_level>
  </linguisticAnalysis>

  </documentRecord>
<documentRecord id="070E7EB628CC943FBF90E7C6A703D9B2" xmlns="http://alvis.info/enriched/">
    <acquisition>
      <acquisitionData>
        <modifiedDate>1149606759016</modifiedDate>
        <httpServer>Apache/1.3.28 (Unix) mod_gzip/1.3.26.1a PHP/4.3.10 mod_ssl/2.8.15 OpenSSL/0.9.7c</httpServer>
        <urls>
          <url>http://www.seroundtable.com/archives/003894.html</url>
        </urls>
      </acquisitionData>
      <canonicalDocument>        
        <section>Any SEO/M will tell you their job description sucks because in the process of describing exactly what they do, they nearly always watch the listener's eyes glaze over, waiting for a topic that may make better sense. Same thing with u...
      <metaData>
        <meta name="title">Officer Usability and General SEO</meta>
        <meta name="dc:date">Mon, 05 Jun 2006 11:52:34 GMT</meta>
        <meta name="dc:type">text/html</meta>
      </metaData>
      <links>
        <outlinks>
          <link type="a">
            <anchorText>Should links still be underlined and blue?</anchorText>
            <location>http://www.cre8asiteforums.com/forums/index.php?s=&amp;showtopic=36893&amp;view=findpost&amp;p=181570</location>
          </link>
          <link type="a">
            <anchorText>Contextual Usability?</anchorText>
            <location>http://www.cre8asiteforums.com/forums/index.php?s=&amp;showtopic=37336&amp;view=findpost&amp;p=183860</location>
          </link>
          <link type="a">
            <anchorText>Features don't matter anymore, Welcome to the Age of User Experience</anchorText>
            <location>http://www.cre8asiteforums.com/forums/index.php?s=&amp;showtopic=37237&amp;view=findpost&amp;p=183418</location>
          </link>
        </outlinks>



( run in 2.343 seconds using v1.01-cache-2.11-cpan-f56aa216473 )