Alvis-Convert

 view release on metacpan or  search on metacpan

t/test-data/to-split/29.xml  view on Meta::CPAN

<documentRecord id="35D3C71D8D04A7A782CD2E8CBF17220C" xmlns="http://alvis.info/enriched/">
    <acquisition>
      <acquisitionData>
        <modifiedDate>1144681935588</modifiedDate>
        <httpServer>Apache/1.3.28 (Unix) mod_gzip/1.3.26.1a PHP/4.3.10 mod_ssl/2.8.15 OpenSSL/0.9.7c</httpServer>
        <urls>
          <url>http://www.seroundtable.com/archives/003633.html</url>
        </urls>
      </acquisitionData>
      <canonicalDocument>        
        <section>A featured Search Engine Watch Forum thread named SEO &amp; Newspapers discusses a recent NYTimes article named This Boring Headline Is Written for Google. The first paragraph of the article somes it up; Journalists over the years ha...
      <metaData>
        <meta name="title">New York Times Changes Web Only Headlines To Be Search Engine Friendly</meta>
        <meta name="dc:date">Mon, 10 Apr 2006 13:37:11 GMT</meta>
        <meta name="dc:type">text/html</meta>
      </metaData>
      <links>
        <outlinks>
          <link type="a">
            <anchorText>Search Engine Watch Forums</anchorText>
            <location>http://forums.searchenginewatch.com/showthread.php?threadid=11001</location>
          </link>
          <link type="a">
            <anchorText>SEO &amp; Newspapers</anchorText>
            <location>http://forums.searchenginewatch.com/showthread.php?threadid=11001</location>
          </link>
          <link type="a">
            <anchorText>explains</anchorText>
            <location>http://blog.searchenginewatch.com/blog/060410-090051</location>
          </link>
          <link type="a">
            <anchorText>This Boring Headline Is Written for Google</anchorText>
            <location>http://www.nytimes.com/2006/04/09/weekinreview/09lohr.html?ex=1302235200&amp;en=86fd20f27aa1d645&amp;ei=5090&amp;partner=rssuserland&amp;emc=rss</location>
          </link>
        </outlinks>

t/test-data/to-split/29.xml  view on Meta::CPAN

<documentRecord id="7F0D97BDACC9D73DA79364ADF93A9080" xmlns="http://alvis.info/enriched/">
    <acquisition>
      <acquisitionData>
        <modifiedDate>1144768340466</modifiedDate>
        <httpServer>Apache/1.3.28 (Unix) mod_gzip/1.3.26.1a PHP/4.3.10 mod_ssl/2.8.15 OpenSSL/0.9.7c</httpServer>
        <urls>
          <url>http://www.seroundtable.com/archives/003639.html</url>
        </urls>
      </acquisitionData>
      <canonicalDocument>        
        <section>There is a DigitalPoint Forum thread named that discusses a neat PageRank tool at http://www.webmastereyes.com/. The PageRank tool is different from others, in that it will enable you to plug in a URL and it will then place graphical...
      <metaData>
        <meta name="title">New Google PageRank Tool Plots PR Values Overlays On Page</meta>
        <meta name="dc:date">Tue, 11 Apr 2006 12:40:49 GMT</meta>
        <meta name="dc:type">text/html</meta>
      </metaData>
      <links>
        <outlinks>
          <link type="a">
            <anchorText>http://www.webmastereyes.com/</anchorText>
            <location>http://www.webmastereyes.com/</location>
          </link>
          <link type="a">
            <anchorText>thread</anchorText>
            <location>http://forums.digitalpoint.com/showthread.php?t=74054</location>
          </link>
          <link type="a">
            <anchorText>DigitalPoint Forums</anchorText>
            <location>http://forums.digitalpoint.com/showthread.php?t=74054</location>
          </link>
        </outlinks>
      </links>
    </acquisition>
  <linguisticAnalysis>
    <semantic_unit_level>
      <semantic_unit><named_entity><form>Google</form><named_entity_type>comp</named_entity_type></named_entity></semantic_unit>
      <semantic_unit><named_entity><form>Digital</form><named_entity_type>comp</named_entity_type></named_entity></semantic_unit>
      <semantic_unit><named_entity><form>Google</form><named_entity_type>soft</named_entity_type></named_entity></semantic_unit>
      <semantic_unit><named_entity><form>Google PageRank</form><named_entity_type>soft</named_entity_type></named_entity></semantic_unit>

t/test-data/to-split/29.xml  view on Meta::CPAN

          <link type="a">
            <anchorText>http://about.ask.com/docs/about/televisionads.shtml</anchorText>
            <location>http://about.ask.com/docs/about/televisionads.shtml</location>
          </link>
          <link type="a">
            <anchorText>Ask.com Second TV Blitz Stars Chief Scientist Guru, Apostolos Gerasoulis</anchorText>
            <location>http://blog.searchenginewatch.com/blog/060503-084529</location>
          </link>
          <link type="a">
            <anchorText>Search Engine Roundtable Forums</anchorText>
            <location>http://forums.seroundtable.com/showthread.php?t=699</location>
          </link>
        </outlinks>
      </links>
    </acquisition>
  <linguisticAnalysis>
    <semantic_unit_level>
      <semantic_unit><named_entity><form>Apostolos Gerasoulis</form><named_entity_type>person</named_entity_type></named_entity></semantic_unit>
      <semantic_unit><named_entity><form>Teoma</form><named_entity_type>comp</named_entity_type></named_entity></semantic_unit>
      <semantic_unit><named_entity><form>Rutgers University</form><named_entity_type>comp</named_entity_type></named_entity></semantic_unit>
      <semantic_unit><named_entity><form>Scient</form><named_entity_type>comp</named_entity_type></named_entity></semantic_unit>

t/test-data/to-split/29.xml  view on Meta::CPAN

<documentRecord id="57E3FF55199853DF2777EF6B8DC24516" xmlns="http://alvis.info/enriched/">
    <acquisition>
      <acquisitionData>
        <modifiedDate>1149969689989</modifiedDate>
        <httpServer>Apache</httpServer>
        <urls>
          <url>http://searchenginewatch.com/searchday/article.php/3612406</url>
        </urls>
      </acquisitionData>
      <canonicalDocument>        
        <section>Links to the week's topics from search engine forums across the web. What Top 5 Skills Would You Study to Become a Better SEO? Search Engine Watch Forums "What skills would you put on your Matrix 'must have' list for your career path...
      <metaData>
        <meta name="title">Search Engine Forums Spotlight</meta>
        <meta name="dc:type">text/html</meta>
      </metaData>
      <links>
        <outlinks>
          <link type="a">
            <anchorText>June 2006: Start of the Traditional Summer Slump</anchorText>
            <location>http://www.webmasterworld.com/forum89/14428.htm</location>
          </link>

t/test-data/to-split/29.xml  view on Meta::CPAN

          <link type="a">
            <anchorText>Search Engine Guide</anchorText>
            <location>http://www.searchengineguide.com/</location>
          </link>
          <link type="a">
            <anchorText>Does Citing Sources Help Rankings?</anchorText>
            <location>http://www.v7n.com/forums/google-forum/31501-does-citing-sources-help-rankings.html</location>
          </link>
          <link type="a">
            <anchorText>Cache Problems Growing for Directories?</anchorText>
            <location>http://forums.searchenginewatch.com/showthread.php?threadid=11916</location>
          </link>
          <link type="a">
            <anchorText>What Top 5 Skills Would You Study to Become a Better SEO?</anchorText>
            <location>http://forums.searchenginewatch.com/showthread.php?t=11945</location>
          </link>
          <link type="a">
            <anchorText>Google Office Continued: Spreadsheet Application Launched</anchorText>
            <location>http://www.cre8asiteforums.com/forums/index.php?showtopic=37455</location>
          </link>
          <link type="a">
            <anchorText>Separate Page for PPC?</anchorText>
            <location>http://www.webproworld.com/viewtopic.php?t=64119</location>
          </link>
          <link type="a">



( run in 0.697 second using v1.01-cache-2.11-cpan-49f99fa48dc )