Alvis-Convert

 view release on metacpan or  search on metacpan

t/test-data/to-split/29.xml  view on Meta::CPAN

        </outlinks>
      </links>
    </acquisition>
  <linguisticAnalysis>
    <semantic_unit_level>
      <semantic_unit><named_entity><form>Google</form><named_entity_type>comp</named_entity_type></named_entity></semantic_unit>
      <semantic_unit><named_entity><form>Wikipedia</form><named_entity_type>soft</named_entity_type></named_entity></semantic_unit>
      <semantic_unit><named_entity><form>Google</form><named_entity_type>soft</named_entity_type></named_entity></semantic_unit>
    </semantic_unit_level>
  </linguisticAnalysis>

  </documentRecord>
<documentRecord id="0770964CAC923ACCDC189E0EA4208AE0" xmlns="http://alvis.info/enriched/">
    <acquisition>
      <acquisitionData>
        <modifiedDate>1141993156883</modifiedDate>
        <httpServer>Apache/1.3.34 (Unix) DAV/1.0.3 mod_auth_passthrough/1.8 mod_log_bytes/1.2 mod_bwlimited/1.4 PHP/4.4.1 FrontPage/5.0.2.2635 mod_ssl/2.8.25 OpenSSL/0.9.7a</httpServer>
        <urls>
          <url>http://battellemedia.com/archives/002391.php</url>
        </urls>
      </acquisitionData>
      <canonicalDocument>        
        <section>From a Reuters story: Sen. Ron Wyden on Thursday proposed legislation aimed at preventing high-speed Internet service providers from charging content companies extra so consumers have faster access to their Web sites or receive speci...
      <metaData>
        <meta name="title">Net Neutrality Bill Unveiled</meta>
        <meta name="dc:type">text/html</meta>
      </metaData>
      <links>
        <outlinks>
          <link type="a">
            <anchorText>IPDemocracy</anchorText>
            <location>http://www.ipdemocracy.com/archives/2006/03/02/index.php#001213</location>
          </link>
          <link type="a">
            <anchorText>just joined FM</anchorText>
            <location>http://fmpub.net/archives/2006/03/were_growingmee.php</location>
          </link>
          <link type="a">
            <anchorText>Reuters story</anchorText>
            <location>http://today.reuters.com/news/articleinvesting.aspx?type=governmentFilingsNews&amp;storyid=URI:urn:newsml:reuters.com:20060302:MTFH08897_2006-03-02_22-55-27_N02376259:1</location>
          </link>
        </outlinks>
      </links>
    </acquisition>
  <linguisticAnalysis>
    <semantic_unit_level>
    </semantic_unit_level>
  </linguisticAnalysis>

  </documentRecord>
<documentRecord id="35D3C71D8D04A7A782CD2E8CBF17220C" xmlns="http://alvis.info/enriched/">
    <acquisition>
      <acquisitionData>
        <modifiedDate>1144681935588</modifiedDate>
        <httpServer>Apache/1.3.28 (Unix) mod_gzip/1.3.26.1a PHP/4.3.10 mod_ssl/2.8.15 OpenSSL/0.9.7c</httpServer>
        <urls>
          <url>http://www.seroundtable.com/archives/003633.html</url>
        </urls>
      </acquisitionData>
      <canonicalDocument>        
        <section>A featured Search Engine Watch Forum thread named SEO &amp; Newspapers discusses a recent NYTimes article named This Boring Headline Is Written for Google. The first paragraph of the article somes it up; Journalists over the years ha...
      <metaData>
        <meta name="title">New York Times Changes Web Only Headlines To Be Search Engine Friendly</meta>
        <meta name="dc:date">Mon, 10 Apr 2006 13:37:11 GMT</meta>
        <meta name="dc:type">text/html</meta>
      </metaData>
      <links>
        <outlinks>
          <link type="a">
            <anchorText>Search Engine Watch Forums</anchorText>
            <location>http://forums.searchenginewatch.com/showthread.php?threadid=11001</location>
          </link>
          <link type="a">
            <anchorText>SEO &amp; Newspapers</anchorText>
            <location>http://forums.searchenginewatch.com/showthread.php?threadid=11001</location>
          </link>
          <link type="a">
            <anchorText>explains</anchorText>
            <location>http://blog.searchenginewatch.com/blog/060410-090051</location>
          </link>
          <link type="a">
            <anchorText>This Boring Headline Is Written for Google</anchorText>
            <location>http://www.nytimes.com/2006/04/09/weekinreview/09lohr.html?ex=1302235200&amp;en=86fd20f27aa1d645&amp;ei=5090&amp;partner=rssuserland&amp;emc=rss</location>
          </link>
        </outlinks>
      </links>
    </acquisition>
  <linguisticAnalysis>
    <semantic_unit_level>
      <semantic_unit><named_entity><form>Danny Sullivan</form><named_entity_type>person</named_entity_type></named_entity></semantic_unit>
      <semantic_unit><named_entity><form>Yahoo</form><named_entity_type>comp</named_entity_type></named_entity></semantic_unit>
      <semantic_unit><named_entity><form>Google</form><named_entity_type>comp</named_entity_type></named_entity></semantic_unit>
      <semantic_unit><named_entity><form>NYTimes</form><named_entity_type>comp</named_entity_type></named_entity></semantic_unit>
      <semantic_unit><named_entity><form>MSN</form><named_entity_type>comp</named_entity_type></named_entity></semantic_unit>
      <semantic_unit><named_entity><form>Google</form><named_entity_type>soft</named_entity_type></named_entity></semantic_unit>
      <semantic_unit><named_entity><form>MSN</form><named_entity_type>soft</named_entity_type></named_entity></semantic_unit>
    </semantic_unit_level>
  </linguisticAnalysis>

  </documentRecord>
<documentRecord id="B4158BE3ACF2447B8B2FF1AFFB5361A0" xmlns="http://alvis.info/enriched/">
    <acquisition>
      <acquisitionData>
        <modifiedDate>1147168350172</modifiedDate>
        <httpServer>Apache</httpServer>
        <urls>
          <url>http://searchenginewatch.com/searchday/article.php/3603301</url>
        </urls>
      </acquisitionData>
      <canonicalDocument>        
        <section>Paying attention to web metrics is an increasingly important aspect of search marketing, with methodologies, processes and tools that can dramatically lift marketing and business performance. A special report from the Search Engine S...
      <metaData>
        <meta name="title">Multichannel Metrics: Managing the Sea of Data</meta>
        <meta name="dc:type">text/html</meta>
      </metaData>
      <links>
        <outlinks>
          <link type="a">
            <anchorText> Grantastic Designs, Inc.</anchorText>
            <location>http://www.grantasticdesigns.com/</location>
          </link>



( run in 0.985 second using v1.01-cache-2.11-cpan-13bb782fe5a )