select results from the CPAN

select

Alvis-Convert

view release on metacpan or search on metacpan


#  XML element names: presumably one record element per document and
#  one group element wrapping a set of records (confirm against users)
my ($RECORDELEMENT, $GROUPELEMENT) = ('documentRecord', 'documentCollection');
#  attribute text for the group element (note the leading space);
#  whatever else was included gets tossed out, and this is added
my $GROUPELEMENTEXTRA = q{ xmlns="http://alvis.info/enriched/" version="1.1"};


############ END CONFIGURATION ######################

#  autoflush STDERR: the inner select() makes STDERR the current output
#  handle and returns the previous one, $| = 1 turns autoflush on for
#  STDERR, and the outer select() restores the previous handle
select((select(STDERR), $| = 1)[0]);

# encoding pragmas follow any includes like "use"
#  "use encoding" is deprecated since perl 5.18 and its default mode is
#  no longer supported from perl 5.26, so spell out what it was used for:
#  UTF-8 source text plus UTF-8 layers on STDIN and STDOUT.
use utf8;
#  default to UTF-8 layers on handles opened from here on
use open ':utf8';
binmode(STDIN,  ':encoding(utf8)');
binmode(STDOUT, ':encoding(utf8)');


my $USAGE = "alvisXSL [--gzip|--bzip2|--dir] [--xslargs ARGS] [--xsl XSL-FILE] XML-FILE+\n" 
  . "   Runs xsltproc multiple times on inputs.   To convert into\n"
  . "   into XML, use alvisDecollect as a post-processor.\n" 
  . "   dir = descend into directories, but not recursively\n"

lib/Alvis/Buffer.pm view on Meta::CPAN

#  return 0 on fatal error, after printing error message
sub fix() {
  $docs = 0;
  $size = 0;
  if ( ! -f $BUFFER ) {
    #  start new one
    if ( ! open(ABUF,">>$BUFFER") ) {
      print STDERR "Cannot open $BUFFER: $!\n";
      return 0;
    }
    select((select(ABUF), $| = 1)[0]);
    print ABUF $HEADER;
  } else {
    #  check old one first
    if ( ! open(ABUF,"<$BUFFER") ) {
      print STDERR "Cannot open $BUFFER: $!\n";
      return 0;
    }
    #  it's a UTF-8 file, so we have to read it all,
    #  since we cannot start halfway through
    my $last = "";

lib/Alvis/Buffer.pm view on Meta::CPAN

	print STDERR "  has been completed, so move manually\n";
      }
      return 0;
    }
    CORE::close(ABUF);
    # now open for append
    if ( ! open(ABUF,">>$BUFFER") ) {
      print STDERR "Cannot open $BUFFER: $!\n";
      return 0;
    }
    select((select(ABUF), $| = 1)[0]);
  }
  1;
}

############################################
#
#  rename output XML buffer file to xml-add/N.xml for some N
#  and create a new output XML buffer file, name is returned;
#  return undef on fatal error, after printing error message
sub save() {

lib/Alvis/HTML.pm view on Meta::CPAN

    #           3. Pick the leftmost '>' before the start of the next
    #              tag as the end of the tag.
    #           4. Remove all tags.               
    #
    if ($self->{alvisKeep})
    { 
	$html=~s/<\/?(?:(?i)a|frame|iframe|h[1-6]|p|div|dl|ul|ol|table|li|dd|dt|th|td|caption)(?=\W)/\0/sgo;
    }
    if ($self->{alvisRemove})
    {
	$html=~s/<\/?(?:(?i)tr|blockquote|hr|br|dir|menu|form|fieldset|legend|label|input|select|option|textarea|isindex|noframes|frameset|tfoot|body|tbody|html|head|abbr|acronym|address|applet|area|b|base|basefont|bdo|big|button|center|cite|code|col|colgro...
    }
    if ($self->{obsolete})
    {
	$html=~s/<\/?(?:(?i)header|nextid|section|listing|xmp|plaintext)(?=\W)/\0/sgo;
    }
    if ($self->{proprietary})
    {
	$html=~s/<\/?(?:(?i)align|blink|embed|ilayer|keygen|layer|multicol|noembed|nolayer|nosave|spacer|inlineinput|sound|audioscope|blackface|animate|bgsound|comment|marquee|xml|o:p|csaction|csactions|csactiondict|csscriptdict|csactionitem|csobj|wbr|nobr|...
    }
    if ($self->{xhtml})
    {
	$html=~s/<\/?(?:(?i)ruby|rbc|rtc|rb|rt|rp)(?=\W)/\0/sgo;
    }
    if ($self->{wml})
    {
	$html=~s/<\/?(?:(?i)access|card|template|wml|anchor|do|onevent|postfield|go|noop|prev|refresh|fieldset|optgroup|select|setvar|timer)(?=\W)/\0/sgo;
    }

#    $html=~s/=\s*([\"\'])([^\0]*?)\1/&_neutralize_trouble($1,$2)/sgoe;
    $html=~s/(?<=\0).*?>//sgo;
    $html=~s/\0/ /go;
    
    # We have now removed the tags we wanted to remove
    
    # If we have some tags left, do some fixing 
    if (!$self->{alvisKeep}||!$self->{alvisRemove}||!$self->{obsolete}||
	!$self->{proprietary}||!$self->{xhtml}||!$self->{wml})
    {
	# Often we have <TAG ... </TAG>. Fix that.
	$html=~s/(<\/?(?:(?i)a|frame|iframe|h[1-6]|p|div|dl|ul|ol|table|li|dd|dt|th|td|caption|tr|blockquote|hr|br|dir|menu|form|fieldset|legend|label|input|select|option|textarea|isindex|noframes|frameset|tfoot|body|tbody|html|head|abbr|acronym|address|app...
	$html=~s/(?<=\0)([^>]*?)(?=\0)/$1>/sgo;
	$html=~s/(?<=\0)([^\0>]*?)$/$1>/sgo;
	$html=~s/\0/ /go;
    }

    # Alvis needs some finer tuning
    if (!$self->{alvisKeep})
    { 
	# Fix attributes of interest
	$html=~s/(<a\W[^>]*?href\s*=\s*)([\"\'])(\S*?)(\s.*?)?>/$self->_fix_attr($1,$2,$3,$4)/isgoe;

t/test-data/alvisXSL/alvis2titles.xsl view on Meta::CPAN

-->

  <!-- disable all default text node output: this empty template matches
       every text() node and writes nothing for it, so text reaches the
       output only where another template emits it explicitly -->
  <xsl:template match="text()"/>

  <!-- match on alvis xml record; the body below writes one
       space-separated line for each matched a:documentRecord -->
  <xsl:template match="a:documentRecord">

    <!-- First line:  format "D URL DOCID TITLE"  -->
    <!-- literal marker "D" followed by a single space -->
    <xsl:text>D </xsl:text>
    <!-- URL: read from a:acquisition/a:acquisitionData/a:urls/a:url -->
    <xsl:value-of select="a:acquisition/a:acquisitionData/a:urls/a:url"/>
     <xsl:text> </xsl:text>
    <!-- DOCID: the id attribute of the matched record -->
    <xsl:value-of select="@id"/>
     <xsl:text> </xsl:text>
    <!-- TITLE: the a:meta element whose name attribute is 'title' -->
    <xsl:value-of select="a:acquisition/a:metaData/a:meta[@name='title']"/>
          <!-- the xsl:text below contains only a line break, which ends the line -->
          <xsl:text>
</xsl:text>

  </xsl:template>


</xsl:stylesheet>

t/test-data/to-split/29.xml view on Meta::CPAN

<documentRecord id="FF2C88E89A1DDFE4F8CD4845EEC285E3" xmlns="http://alvis.info/enriched/">
    <acquisition>
      <acquisitionData>
        <modifiedDate>1142938329956</modifiedDate>
        <httpServer>Apache</httpServer>
        <urls>
          <url>http://searchenginewatch.com/searchday/article.php/3592876</url>
        </urls>
      </acquisitionData>
      <canonicalDocument>        
        <section>At long last, Google has launched its ownGoogle Finance service. For years, those seeking specialty financial information via Google have been sent to competitors such as Yahoo and MSN. Now Google's providing financial information di...
      <metaData>
        <meta name="title">Google Launches Google Finance</meta>
        <meta name="dc:type">text/html</meta>
      </metaData>
      <links>
        <outlinks>
          <link type="a">
            <anchorText>
wrote</anchorText>
            <location>http://searchenginewatch.com/_subscribers/articles/article.php/3353401</location>

( run in 0.247 second using v1.01-cache-2.11-cpan-0d8aa00de5b )