Alvis-Convert
view release on metacpan or search on metacpan
bin/alvisXSL view on Meta::CPAN
# Name of the element that holds a single record.
my $RECORDELEMENT = "documentRecord";
# Name of the element that groups records together.
my $GROUPELEMENT = "documentCollection";
# toss out whatever else was included, and add this
# (attribute text attached to the group element: Alvis namespace and version)
my $GROUPELEMENTEXTRA = " xmlns=\"http://alvis.info/enriched/\" version=\"1.1\"";
############ END CONFIGURATION ######################
# autoflush STDERR; the inner select() returns the previously selected
# handle, which the outer select() restores.
select((select(STDERR), $| = 1)[0]);
# encoding pragmas follow any includes like "use"
# The "encoding" pragma was deprecated in Perl 5.18 and is fatal from
# Perl 5.26 on, so spell out its effects instead: "use utf8" marks the
# source text as UTF-8, and the binmode calls put a UTF-8 layer on
# STDIN and STDOUT (the two handles the old pragma configured).
use utf8;
use open ':utf8';
binmode(STDIN,  ':encoding(utf8)');
binmode(STDOUT, ':encoding(utf8)');
my $USAGE = "alvisXSL [--gzip|--bzip2|--dir] [--xslargs ARGS] [--xsl XSL-FILE] XML-FILE+\n"
. " Runs xsltproc multiple times on inputs. To convert into\n"
. " into XML, use alvisDecollect as a post-processor.\n"
. " dir = descend into directories, but not recursively\n"
lib/Alvis/Buffer.pm view on Meta::CPAN
# return 0 on fatal error, after printing error message
sub fix() {
$docs = 0;
$size = 0;
if ( ! -f $BUFFER ) {
# start new one
if ( ! open(ABUF,">>$BUFFER") ) {
print STDERR "Cannot open $BUFFER: $!\n";
return 0;
}
select((select(ABUF), $| = 1)[0]);
print ABUF $HEADER;
} else {
# check old one first
if ( ! open(ABUF,"<$BUFFER") ) {
print STDERR "Cannot open $BUFFER: $!\n";
return 0;
}
# it's a UTF-8 file, so we have to read it all,
# since we cannot start halfway through
my $last = "";
lib/Alvis/Buffer.pm view on Meta::CPAN
print STDERR " has been completed, so move manually\n";
}
return 0;
}
CORE::close(ABUF);
# now open for append
if ( ! open(ABUF,">>$BUFFER") ) {
print STDERR "Cannot open $BUFFER: $!\n";
return 0;
}
select((select(ABUF), $| = 1)[0]);
}
1;
}
############################################
#
# rename output XML buffer file to xml-add/N.xml for some N
# and create a new output XML buffer file, name is returned;
# return undef on fatal error, after printing error message
sub save() {
lib/Alvis/HTML.pm view on Meta::CPAN
# 3. Pick the leftmost '>' before the start of the next
# tag as the end of the tag.
# 4. Remove all tags.
#
if ($self->{alvisKeep})
{
$html=~s/<\/?(?:(?i)a|frame|iframe|h[1-6]|p|div|dl|ul|ol|table|li|dd|dt|th|td|caption)(?=\W)/\0/sgo;
}
if ($self->{alvisRemove})
{
$html=~s/<\/?(?:(?i)tr|blockquote|hr|br|dir|menu|form|fieldset|legend|label|input|select|option|textarea|isindex|noframes|frameset|tfoot|body|tbody|html|head|abbr|acronym|address|applet|area|b|base|basefont|bdo|big|button|center|cite|code|col|colgro...
}
if ($self->{obsolete})
{
$html=~s/<\/?(?:(?i)header|nextid|section|listing|xmp|plaintext)(?=\W)/\0/sgo;
}
if ($self->{proprietary})
{
$html=~s/<\/?(?:(?i)align|blink|embed|ilayer|keygen|layer|multicol|noembed|nolayer|nosave|spacer|inlineinput|sound|audioscope|blackface|animate|bgsound|comment|marquee|xml|o:p|csaction|csactions|csactiondict|csscriptdict|csactionitem|csobj|wbr|nobr|...
}
if ($self->{xhtml})
{
$html=~s/<\/?(?:(?i)ruby|rbc|rtc|rb|rt|rp)(?=\W)/\0/sgo;
}
if ($self->{wml})
{
$html=~s/<\/?(?:(?i)access|card|template|wml|anchor|do|onevent|postfield|go|noop|prev|refresh|fieldset|optgroup|select|setvar|timer)(?=\W)/\0/sgo;
}
# $html=~s/=\s*([\"\'])([^\0]*?)\1/&_neutralize_trouble($1,$2)/sgoe;
$html=~s/(?<=\0).*?>//sgo;
$html=~s/\0/ /go;
# We have now removed the tags we wanted to remove.
# If some tags may be left, do some fixing
if (!$self->{alvisKeep}||!$self->{alvisRemove}||!$self->{obsolete}||
!$self->{proprietary}||!$self->{xhtml}||!$self->{wml})
{
# Often we have <TAG ... </TAG>. Fix that.
$html=~s/(<\/?(?:(?i)a|frame|iframe|h[1-6]|p|div|dl|ul|ol|table|li|dd|dt|th|td|caption|tr|blockquote|hr|br|dir|menu|form|fieldset|legend|label|input|select|option|textarea|isindex|noframes|frameset|tfoot|body|tbody|html|head|abbr|acronym|address|app...
$html=~s/(?<=\0)([^>]*?)(?=\0)/$1>/sgo;
$html=~s/(?<=\0)([^\0>]*?)$/$1>/sgo;
$html=~s/\0/ /go;
}
# Alvis needs some finer tuning
if (!$self->{alvisKeep})
{
# Fix attributes of interest
$html=~s/(<a\W[^>]*?href\s*=\s*)([\"\'])(\S*?)(\s.*?)?>/$self->_fix_attr($1,$2,$3,$4)/isgoe;
t/test-data/alvisXSL/alvis2titles.xsl view on Meta::CPAN
-->
<!-- disable all default text node output: this empty template overrides the
built-in rule that would otherwise copy every text node to the result, so
only text written explicitly by other templates is emitted -->
<xsl:template match="text()"/>
<!-- match on alvis xml record -->
<!-- For each a:documentRecord, writes one line of output: the literal "D",
the URL taken from a:acquisition/a:acquisitionData/a:urls/a:url, the
record's id attribute, and the a:meta element whose name is 'title'.
The fields are separated by the xsl:text separators below and the line is
ended by the xsl:text element that contains a line break.
NOTE(review): the "a" prefix is bound outside this excerpt; presumably it
maps to the Alvis namespace - confirm against the stylesheet header. -->
<xsl:template match="a:documentRecord">
<!-- First line: format "D URL DOCID TITLE" -->
<xsl:text>D </xsl:text>
<!-- URL field -->
<xsl:value-of select="a:acquisition/a:acquisitionData/a:urls/a:url"/>
<xsl:text> </xsl:text>
<!-- DOCID field: the id attribute of the matched record -->
<xsl:value-of select="@id"/>
<xsl:text> </xsl:text>
<!-- TITLE field: the meta entry named 'title' -->
<xsl:value-of select="a:acquisition/a:metaData/a:meta[@name='title']"/>
<!-- line terminator: the xsl:text content is a literal line break -->
<xsl:text>
</xsl:text>
</xsl:template>
</xsl:stylesheet>
t/test-data/to-split/29.xml view on Meta::CPAN
<documentRecord id="FF2C88E89A1DDFE4F8CD4845EEC285E3" xmlns="http://alvis.info/enriched/">
<acquisition>
<acquisitionData>
<modifiedDate>1142938329956</modifiedDate>
<httpServer>Apache</httpServer>
<urls>
<url>http://searchenginewatch.com/searchday/article.php/3592876</url>
</urls>
</acquisitionData>
<canonicalDocument>
<section>At long last, Google has launched its ownGoogle Finance service. For years, those seeking specialty financial information via Google have been sent to competitors such as Yahoo and MSN. Now Google's providing financial information di...
<metaData>
<meta name="title">Google Launches Google Finance</meta>
<meta name="dc:type">text/html</meta>
</metaData>
<links>
<outlinks>
<link type="a">
<anchorText>
wrote</anchorText>
<location>http://searchenginewatch.com/_subscribers/articles/article.php/3353401</location>
( run in 1.245 second using v1.01-cache-2.11-cpan-49f99fa48dc )