Alvis-Convert
view release on metacpan or search on metacpan
lib/Alvis/Canonical.pm view on Meta::CPAN
print F $can_doc2;
close(F);
}
# Start from an empty tag stack.  Use an explicit empty array reference:
# assigning the empty list "()" to a scalar slot stores undef, and the code
# then only works because the later pop()/push() autovivify an array there.
$self->{stack}=[];
# Hand every start/end tag of a link (a, frame, iframe -- matched
# case-insensitively) or of an Alvis structure (section, list, item) to
# _fix_links and splice its return value into the document in place of the tag.
$can_doc=~s/(\<(\/)?((?:(?i)A|FRAME|IFRAME)|section|list|item)(?:\s.*?)?\>)/$self->_fix_links($1,$2,$3)/sgoe;
# Close all tags left open.
# warn "STACK:", join("|",@{$self->{stack}});
while (defined(my $open_alvis_tag=pop(@{$self->{stack}})))
{
    if ($open_alvis_tag=~/^(a|frame|iframe)$/o)
    {
        # A link that was never closed: append its end tag to the document.
        $can_doc.="\</$open_alvis_tag\>";
    }
    else
    {
        # Anything other than a link tag remaining here is treated as an
        # internal error.
        die("Should be impossible: non-link opening tag " .
            "($open_alvis_tag) left on stack.");
    }
}
lib/Alvis/Canonical.pm view on Meta::CPAN
if ($DEBUG)
{
warn "ALL:$all";
warn "STACK NOW:",join("|",@{$self->{stack}}) if defined($self->{stack});
}
my $txt="";
# If it's an end tag
if ($end)
{
if ($tag=~/^(a|frame|iframe)$/io)
{
# Close an immediate matching link tag in the context, if any
if (defined(my $context=pop(@{$self->{stack}})))
{
if ($context eq lc($tag))
{
$txt.="\</$context\>";
}
else
{
lib/Alvis/Canonical.pm view on Meta::CPAN
push(@{$self->{stack}},$context);
}
}
# ignore this closing tag, it's misplaced/overruled
}
elsif ($tag=~/^(section|list|item)$/o)
{
# Close an immediate link tag in the context, if any
if (defined(my $context=pop(@{$self->{stack}})))
{
if ($context=~/^(a|frame|iframe)$/)
{
$txt.="\</$context\>";
# close the surrounding structure
if (defined(my $context=pop(@{$self->{stack}})))
{
if ($tag ne $context)
{
die("Should be impossible: mismatch of already " .
"fixed immediate Alvis opening tag ($context) " .
"and closing tag ($tag).");
lib/Alvis/Canonical.pm view on Meta::CPAN
{
die("Should be impossible: unrecognized closing tag type ($tag).");
}
}
else # a start tag
{
# Whatever the tag is,
# close an immediate matching link tag in the context, if any
if (defined(my $context=pop(@{$self->{stack}})))
{
if ($context=~/^(a|frame|iframe)$/)
{
$txt.="\</$context\>";
}
else
{
push(@{$self->{stack}},$context);
}
}
push(@{$self->{stack}},lc($tag)); # remember to normalize
lib/Alvis/Canonical.pm view on Meta::CPAN
# Build the link record for this tag; its type is the lower-cased tag name.
# ($tag, $params, $header and $url come from outside this excerpt.)
my %link=();
$link{type}=lc($tag);
if ($link{type} eq 'a')
{
# Anchors: take the target from a quoted href attribute (single or double
# quotes, matched as a pair via \1) and pass it with the document's base URL
# to _handle_url.
# NOTE(review): /g on a match used as a boolean starts at pos($params);
# presumably pos is unset here -- confirm.
if (defined($params) && $params=~/href\s*=\s*([\"\'])(.*?)\1/isgo)
{
$url=$self->_handle_url($2,$header->{baseURL});
}
}
elsif ($link{type}=~/^(frame|iframe)$/o)
{
# Frames and iframes: same treatment, but the target is in the quoted src
# attribute.
if (defined($params) && $params=~/src\s*=\s*([\'\"])(.*?)\1/isgo)
{
$url=$self->_handle_url($2,$header->{baseURL});
}
}
else
{
# Any other tag type reaching this point is treated as an internal error.
die("Should be impossible: Unrecognized link type ($tag).");
}
lib/Alvis/Convert.pm view on Meta::CPAN
# Copy the anchor text out of the link record when one is present.
# ($link, $text and $type come from outside this excerpt.)
if (exists($link->{text}))
{
$text=$link->{text};
}
# Normalise the link type, ignoring case and surrounding whitespace.
# A type matching none of the patterns below leaves $type unchanged.
if (exists($link->{type}))
{
if ($link->{type}=~/^\s*a\s*$/isgo)
{
$type='a';
}
# Both "frame" and "iframe" are recorded as 'frame'.
elsif ($link->{type}=~/^\s*i?frame\s*$/isgo)
{
$type='frame';
}
elsif ($link->{type}=~/^\s*img\s*$/isgo)
{
$type='img';
}
}
if (!$links->add($url,$text,$type))
{
$self->_set_err_state($ERR_LINK_ADD,
lib/Alvis/HTML.pm view on Meta::CPAN
# 1. Mark & replace legal tag starts with '\0'.
# 2. Go from the start of a tag to the beginning of
# the next one, neutralizing any confusing chars
# inside possible attribute values.
# 3. Pick the leftmost '>' before the start of the next
# tag as the end of the tag.
# 4. Remove all tags.
#
if ($self->{alvisKeep})
{
$html=~s/<\/?(?:(?i)a|frame|iframe|h[1-6]|p|div|dl|ul|ol|table|li|dd|dt|th|td|caption)(?=\W)/\0/sgo;
}
if ($self->{alvisRemove})
{
$html=~s/<\/?(?:(?i)tr|blockquote|hr|br|dir|menu|form|fieldset|legend|label|input|select|option|textarea|isindex|noframes|frameset|tfoot|body|tbody|html|head|abbr|acronym|address|applet|area|b|base|basefont|bdo|big|button|center|cite|code|col|colgro...
}
if ($self->{obsolete})
{
$html=~s/<\/?(?:(?i)header|nextid|section|listing|xmp|plaintext)(?=\W)/\0/sgo;
}
if ($self->{proprietary})
{
$html=~s/<\/?(?:(?i)align|blink|embed|ilayer|keygen|layer|multicol|noembed|nolayer|nosave|spacer|inlineinput|sound|audioscope|blackface|animate|bgsound|comment|marquee|xml|o:p|csaction|csactions|csactiondict|csscriptdict|csactionitem|csobj|wbr|nobr|...
}
if ($self->{xhtml})
lib/Alvis/HTML.pm view on Meta::CPAN
$html=~s/(?<=\0).*?>//sgo;
$html=~s/\0/ /go;
# By now we have removed all the tags we wanted to remove.
# If some tags are left, do some fixing.
if (!$self->{alvisKeep}||!$self->{alvisRemove}||!$self->{obsolete}||
!$self->{proprietary}||!$self->{xhtml}||!$self->{wml})
{
# Often we have <TAG ... </TAG>. Fix that.
$html=~s/(<\/?(?:(?i)a|frame|iframe|h[1-6]|p|div|dl|ul|ol|table|li|dd|dt|th|td|caption|tr|blockquote|hr|br|dir|menu|form|fieldset|legend|label|input|select|option|textarea|isindex|noframes|frameset|tfoot|body|tbody|html|head|abbr|acronym|address|app...
$html=~s/(?<=\0)([^>]*?)(?=\0)/$1>/sgo;
$html=~s/(?<=\0)([^\0>]*?)$/$1>/sgo;
$html=~s/\0/ /go;
}
# Alvis needs some finer tuning.  This only runs when the Alvis-relevant
# tags (see the alvisKeep flag) have NOT been stripped from $html.
if (!$self->{alvisKeep})
{
    # Fix attributes of interest: rewrite the quoted href value of <a> tags
    # through _fix_attr (prefix, quote char, value, rest of the tag).
    $html=~s/(<a\W[^>]*?href\s*=\s*)([\"\'])(\S*?)(\s.*?)?>/$self->_fix_attr($1,$2,$3,$4)/isgoe;
    # Fix attributes of interest: same for the quoted src value of
    # <frame>, <iframe> and <img> tags.
    $html=~s/(<(?:frame|iframe|img)\W[^>]*?src\s*=\s*)([\"\'])(\S*?)(\s.*?)?>/$self->_fix_attr($1,$2,$3,$4)/isgoe;
    # Sometimes "HTML" contains Alvis tags...double safeguard them by
    # escaping their angle brackets as character entities.  Substituting
    # literal "<" and ">" back in would reproduce the match unchanged and
    # safeguard nothing.
    $html=~s/<(\/?(?:(?i)section|list|item|ulink).*?)>/&lt;$1&gt;/sgo;
}
# Debugging aid: dump the cleaned-up HTML to STDERR.
warn $html if ($DEBUG);
lib/Alvis/Wikipedia/Templates.pm view on Meta::CPAN
$var_name=~s/^(subst|int)://isgo;
}
elsif ($var_name=~/^(FULL)?PAGENAMEE?$/)
{
return $self->{currTitle};
}
elsif ($var_name=~/^NAMESPACEE?$/)
{
return $self->{currNamespace};
}
elsif ($var_name=~/^(__NOTOC__|__FORCETOC__|__TOC__|__NOEDITSECTION__|__START__|CURRENT(MONTH|MONTHNAME|MONTHNAMEGEN|MONTHABBREV|DAY|DAYNAME|YEAR|TIME)|NUMBEROFARTICLES|NUMBEROFFILES|PAGENAMEE|NAMESPACE|__END__|thumbnail|thumb|right|left|none|cen...
{
return "$var_name";
}
else
{
return $text;
}
}
sub _substitute_template
t/test-data/to-split/29.xml view on Meta::CPAN
<documentRecord id="FF2C88E89A1DDFE4F8CD4845EEC285E3" xmlns="http://alvis.info/enriched/">
<acquisition>
<acquisitionData>
<modifiedDate>1142938329956</modifiedDate>
<httpServer>Apache</httpServer>
<urls>
<url>http://searchenginewatch.com/searchday/article.php/3592876</url>
</urls>
</acquisitionData>
<canonicalDocument>
<section>At long last, Google has launched its ownGoogle Finance service. For years, those seeking specialty financial information via Google have been sent to competitors such as Yahoo and MSN. Now Google's providing financial information di...
<metaData>
<meta name="title">Google Launches Google Finance</meta>
<meta name="dc:type">text/html</meta>
</metaData>
<links>
<outlinks>
<link type="a">
<anchorText>
wrote</anchorText>
<location>http://searchenginewatch.com/_subscribers/articles/article.php/3353401</location>
t/test-data/to-split/29.xml view on Meta::CPAN
<documentRecord id="B4158BE3ACF2447B8B2FF1AFFB5361A0" xmlns="http://alvis.info/enriched/">
<acquisition>
<acquisitionData>
<modifiedDate>1147168350172</modifiedDate>
<httpServer>Apache</httpServer>
<urls>
<url>http://searchenginewatch.com/searchday/article.php/3603301</url>
</urls>
</acquisitionData>
<canonicalDocument>
<section>Paying attention to web metrics is an increasingly important aspect of search marketing, with methodologies, processes and tools that can dramatically lift marketing and business performance. A special report from the Search Engine S...
<metaData>
<meta name="title">Multichannel Metrics: Managing the Sea of Data</meta>
<meta name="dc:type">text/html</meta>
</metaData>
<links>
<outlinks>
<link type="a">
<anchorText> Grantastic Designs, Inc.</anchorText>
<location>http://www.grantasticdesigns.com/</location>
</link>
( run in 1.374 second using v1.01-cache-2.11-cpan-e1769b4cff6 )