Alvis-Convert
view release on metacpan or search on metacpan
t/test-data/to-split/29.xml view on Meta::CPAN
<documentRecord id="35D3C71D8D04A7A782CD2E8CBF17220C" xmlns="http://alvis.info/enriched/">
<acquisition>
<acquisitionData>
<modifiedDate>1144681935588</modifiedDate>
<httpServer>Apache/1.3.28 (Unix) mod_gzip/1.3.26.1a PHP/4.3.10 mod_ssl/2.8.15 OpenSSL/0.9.7c</httpServer>
<urls>
<url>http://www.seroundtable.com/archives/003633.html</url>
</urls>
</acquisitionData>
<canonicalDocument>
<section>A featured Search Engine Watch Forum thread named SEO &amp; Newspapers discusses a recent NYTimes article named This Boring Headline Is Written for Google. The first paragraph of the article somes it up; Journalists over the years ha...
<metaData>
<meta name="title">New York Times Changes Web Only Headlines To Be Search Engine Friendly</meta>
<meta name="dc:date">Mon, 10 Apr 2006 13:37:11 GMT</meta>
<meta name="dc:type">text/html</meta>
</metaData>
<links>
<outlinks>
<link type="a">
<anchorText>Search Engine Watch Forums</anchorText>
<location>http://forums.searchenginewatch.com/showthread.php?threadid=11001</location>
</link>
<link type="a">
<anchorText>SEO &amp; Newspapers</anchorText>
<location>http://forums.searchenginewatch.com/showthread.php?threadid=11001</location>
</link>
<link type="a">
<anchorText>explains</anchorText>
<location>http://blog.searchenginewatch.com/blog/060410-090051</location>
</link>
<link type="a">
<anchorText>This Boring Headline Is Written for Google</anchorText>
<location>http://www.nytimes.com/2006/04/09/weekinreview/09lohr.html?ex=1302235200&amp;en=86fd20f27aa1d645&amp;ei=5090&amp;partner=rssuserland&amp;emc=rss</location>
</link>
</outlinks>
</links>
</acquisition>
<linguisticAnalysis>
<semantic_unit_level>
<semantic_unit><named_entity><form>Danny Sullivan</form><named_entity_type>person</named_entity_type></named_entity></semantic_unit>
<semantic_unit><named_entity><form>Yahoo</form><named_entity_type>comp</named_entity_type></named_entity></semantic_unit>
<semantic_unit><named_entity><form>Google</form><named_entity_type>comp</named_entity_type></named_entity></semantic_unit>
<semantic_unit><named_entity><form>NYTimes</form><named_entity_type>comp</named_entity_type></named_entity></semantic_unit>
<semantic_unit><named_entity><form>MSN</form><named_entity_type>comp</named_entity_type></named_entity></semantic_unit>
<semantic_unit><named_entity><form>Google</form><named_entity_type>soft</named_entity_type></named_entity></semantic_unit>
<semantic_unit><named_entity><form>MSN</form><named_entity_type>soft</named_entity_type></named_entity></semantic_unit>
</semantic_unit_level>
</linguisticAnalysis>
</documentRecord>
<documentRecord id="B4158BE3ACF2447B8B2FF1AFFB5361A0" xmlns="http://alvis.info/enriched/">
<acquisition>
<acquisitionData>
<modifiedDate>1147168350172</modifiedDate>
<httpServer>Apache</httpServer>
<urls>
<url>http://searchenginewatch.com/searchday/article.php/3603301</url>
</urls>
</acquisitionData>
<canonicalDocument>
<section>Paying attention to web metrics is an increasingly important aspect of search marketing, with methodologies, processes and tools that can dramatically lift marketing and business performance. A special report from the Search Engine S...
<metaData>
<meta name="title">Multichannel Metrics: Managing the Sea of Data</meta>
<meta name="dc:type">text/html</meta>
</metaData>
<links>
<outlinks>
<link type="a">
<anchorText> Grantastic Designs, Inc.</anchorText>
<location>http://www.grantasticdesigns.com/</location>
</link>
<link type="a">
<anchorText>Search Engine Visibility</anchorText>
<location>http://www.searchenginesbook.com</location>
</link>
</outlinks>
</links>
</acquisition>
<linguisticAnalysis>
<semantic_unit_level>
<semantic_unit><named_entity><form>Eric Peterson</form><named_entity_type>person</named_entity_type></named_entity></semantic_unit>
<semantic_unit><named_entity><form>Shari Thurow</form><named_entity_type>person</named_entity_type></named_entity></semantic_unit>
<semantic_unit><named_entity><form>Pete</form><named_entity_type>person</named_entity_type></named_entity></semantic_unit>
<semantic_unit><named_entity><form>WebSideStory</form><named_entity_type>comp</named_entity_type></named_entity></semantic_unit>
<semantic_unit><named_entity><form>Visual Sciences</form><named_entity_type>soft</named_entity_type></named_entity></semantic_unit>
</semantic_unit_level>
</linguisticAnalysis>
</documentRecord>
<documentRecord id="6373E6ED154F42639933FA99BCE915DB" xmlns="http://alvis.info/enriched/">
<acquisition>
<acquisitionData>
<modifiedDate>1149760377185</modifiedDate>
<httpServer>Apache/2.0</httpServer>
<urls>
<url>http://google.weblogsinc.com/2006/06/06/google-getting-sued-in-france-by-book-publisher/</url>
</urls>
</acquisitionData>
<canonicalDocument>
<section>Google is getting sued again from another book publisher. What else is new? These book publishers do not like Google to use excerpts from their books without permission. Even though they might be making additional sales from individu...
<metaData>
<meta name="title">Google getting sued in France by book publisher</meta>
<meta name="dc:type">text/html</meta>
</metaData>
<links>
<outlinks>
<link type="a">
<anchorText>Alexander</anchorText>
<location>http://www.mobileread.com</location>
</link>
</outlinks>
</links>
</acquisition>
<linguisticAnalysis>
<semantic_unit_level>
<semantic_unit><named_entity><form>Google Inc</form><named_entity_type>comp</named_entity_type></named_entity></semantic_unit>
<semantic_unit><named_entity><form>Google</form><named_entity_type>comp</named_entity_type></named_entity></semantic_unit>
<semantic_unit><named_entity><form>Google France</form><named_entity_type>soft</named_entity_type></named_entity></semantic_unit>
<semantic_unit><named_entity><form>Google</form><named_entity_type>soft</named_entity_type></named_entity></semantic_unit>
</semantic_unit_level>
</linguisticAnalysis>
( run in 2.294 seconds using v1.01-cache-2.11-cpan-39bf76dae61 )