Alvis-Convert

 view release on metacpan or  search on metacpan

lib/Alvis/Canonical.pm  view on Meta::CPAN

	print F $can_doc2;
	close(F);
    }

    $self->{stack}=();
    $can_doc=~s/(\<(\/)?((?:(?i)A|FRAME|IFRAME)|section|list|item)(?:\s.*?)?\>)/$self->_fix_links($1,$2,$3)/sgoe;
    # close all tags left open 
#    warn "STACK:", join("|",@{$self->{stack}});
    while (defined(my $open_alvis_tag=pop(@{$self->{stack}})))
    {
	if ($open_alvis_tag=~/^(a|frame|iframe)$/o)
	{
	    $can_doc.="\</$open_alvis_tag\>";
	}
	else
	{
	    die("Should be impossible: non-link opening tag " .
		"($open_alvis_tag) left on stack.");
	}
    }

lib/Alvis/Canonical.pm  view on Meta::CPAN

    if ($DEBUG)
    {
	warn "ALL:$all";
	warn "STACK NOW:",join("|",@{$self->{stack}}) if defined($self->{stack});
    }

    my $txt="";
    # If it's an end tag
    if ($end)
    {
	if ($tag=~/^(a|frame|iframe)$/io)
	{
	    # Close an immediate matching link tag in the context, if any 
	    if (defined(my $context=pop(@{$self->{stack}})))
	    {
		if ($context eq lc($tag))
		{
		    $txt.="\</$context\>";
		}
		else
		{

lib/Alvis/Canonical.pm  view on Meta::CPAN

		    push(@{$self->{stack}},$context);
		}
	    }
	    # ignore this closing tag, it's misplaced/overruled
	}
	elsif ($tag=~/^(section|list|item)$/o) 
	{
	    # Close an immediate link tag in the context, if any 
	    if (defined(my $context=pop(@{$self->{stack}})))
	    {
		if ($context=~/^(a|frame|iframe)$/)
		{
		    $txt.="\</$context\>";
		    # close the surrounding structure
		    if (defined(my $context=pop(@{$self->{stack}})))
		    {
			if ($tag ne $context)
			{
			    die("Should be impossible: mismatch of already " .
				"fixed immediate Alvis opening tag ($context) " .
				"and closing tag ($tag).");

lib/Alvis/Canonical.pm  view on Meta::CPAN

	{
	    die("Should be impossible: unrecognized closing tag type ($tag).");
	}
    }
    else # a start tag
    {
	# Whatever the new start tag is, close a link tag (a, frame or iframe)
	# that is immediately on top of the context stack, if any
	if (defined(my $context=pop(@{$self->{stack}})))
	{
	    if ($context=~/^(a|frame|iframe)$/)
	    {
		$txt.="\</$context\>";
	    }
	    else
	    {
		push(@{$self->{stack}},$context);
	    }
	}

	push(@{$self->{stack}},lc($tag)); # remember to normalize

lib/Alvis/Canonical.pm  view on Meta::CPAN


    my %link=();
    $link{type}=lc($tag);
    if ($link{type} eq 'a')
    {
	if (defined($params) && $params=~/href\s*=\s*([\"\'])(.*?)\1/isgo)
	{
	    $url=$self->_handle_url($2,$header->{baseURL});
	}
    }
    elsif ($link{type}=~/^(frame|iframe)$/o)
    {
	if (defined($params) && $params=~/src\s*=\s*([\'\"])(.*?)\1/isgo)
	{
	    $url=$self->_handle_url($2,$header->{baseURL});
	}
    }
    else
    {
	die("Should be impossible: Unrecognized link type ($tag).");
    }

lib/Alvis/Convert.pm  view on Meta::CPAN

	if (exists($link->{text}))
	{
	    $text=$link->{text};
	}
	if (exists($link->{type}))
	{
 	    if ($link->{type}=~/^\s*a\s*$/isgo)
	    {
		$type='a';
	    }
 	    elsif ($link->{type}=~/^\s*i?frame\s*$/isgo)
	    {
		$type='frame';
	    }
 	    elsif ($link->{type}=~/^\s*img\s*$/isgo)
	    {
		$type='img';
	    }
	}
	
	if (!$links->add($url,$text,$type))
	{
	    $self->_set_err_state($ERR_LINK_ADD,

lib/Alvis/HTML.pm  view on Meta::CPAN

    #           1. Mark & replace legal tag starts with '\0'.
    #           2. Go from the start of a tag to the beginning of
    #              the next one, neutralizing any confusing chars
    #              inside possible attribute values.
    #           3. Pick the leftmost '>' before the start of the next
    #              tag as the end of the tag.
    #           4. Remove all tags.               
    #
    if ($self->{alvisKeep})
    { 
	$html=~s/<\/?(?:(?i)a|frame|iframe|h[1-6]|p|div|dl|ul|ol|table|li|dd|dt|th|td|caption)(?=\W)/\0/sgo;
    }
    if ($self->{alvisRemove})
    {
	$html=~s/<\/?(?:(?i)tr|blockquote|hr|br|dir|menu|form|fieldset|legend|label|input|select|option|textarea|isindex|noframes|frameset|tfoot|body|tbody|html|head|abbr|acronym|address|applet|area|b|base|basefont|bdo|big|button|center|cite|code|col|colgro...
    }
    if ($self->{obsolete})
    {
	$html=~s/<\/?(?:(?i)header|nextid|section|listing|xmp|plaintext)(?=\W)/\0/sgo;
    }
    if ($self->{proprietary})
    {
	$html=~s/<\/?(?:(?i)align|blink|embed|ilayer|keygen|layer|multicol|noembed|nolayer|nosave|spacer|inlineinput|sound|audioscope|blackface|animate|bgsound|comment|marquee|xml|o:p|csaction|csactions|csactiondict|csscriptdict|csactionitem|csobj|wbr|nobr|...
    }
    if ($self->{xhtml})

lib/Alvis/HTML.pm  view on Meta::CPAN

    $html=~s/(?<=\0).*?>//sgo;
    $html=~s/\0/ /go;
    
    # By now we have removed all the tags we wanted to remove
    
    # If we have some tags left, do some fixing 
    if (!$self->{alvisKeep}||!$self->{alvisRemove}||!$self->{obsolete}||
	!$self->{proprietary}||!$self->{xhtml}||!$self->{wml})
    {
	# Often we have <TAG ... </TAG>. Fix that.
	$html=~s/(<\/?(?:(?i)a|frame|iframe|h[1-6]|p|div|dl|ul|ol|table|li|dd|dt|th|td|caption|tr|blockquote|hr|br|dir|menu|form|fieldset|legend|label|input|select|option|textarea|isindex|noframes|frameset|tfoot|body|tbody|html|head|abbr|acronym|address|app...
	$html=~s/(?<=\0)([^>]*?)(?=\0)/$1>/sgo;
	$html=~s/(?<=\0)([^\0>]*?)$/$1>/sgo;
	$html=~s/\0/ /go;
    }

    # Alvis needs some finer tuning
    if (!$self->{alvisKeep})
    { 
	# Fix the href attribute of <a> tags
	$html=~s/(<a\W[^>]*?href\s*=\s*)([\"\'])(\S*?)(\s.*?)?>/$self->_fix_attr($1,$2,$3,$4)/isgoe;
	# Fix the src attribute of <frame>, <iframe> and <img> tags
	$html=~s/(<(?:frame|iframe|img)\W[^>]*?src\s*=\s*)([\"\'])(\S*?)(\s.*?)?>/$self->_fix_attr($1,$2,$3,$4)/isgoe;

	# Sometimes the "HTML" contains Alvis tags (section, list, item, ulink);
	# as a double safeguard, escape their angle brackets as &lt; and &gt;
	$html=~s/<(\/?(?:(?i)section|list|item|ulink).*?)>/\&lt;$1\&gt;/sgo;
    }
	
    if ($DEBUG)
    {
	warn $html;
    }

lib/Alvis/Wikipedia/Templates.pm  view on Meta::CPAN

	$var_name=~s/^(subst|int)://isgo;
    }
    elsif ($var_name=~/^(FULL)?PAGENAMEE?$/)
    {
	return $self->{currTitle};
    }
    elsif ($var_name=~/^NAMESPACEE?$/)
    {
	return $self->{currNamespace};
    }
    elsif ($var_name=~/^(__NOTOC__|__FORCETOC__|__TOC__|__NOEDITSECTION__|__START__|CURRENT(MONTH|MONTHNAME|MONTHNAMEGEN|MONTHABBREV|DAY|DAYNAME|YEAR|TIME)|NUMBEROFARTICLES|NUMBEROFFILES|PAGENAMEE|NAMESPACE|__END__|thumbnail|thumb|right|left|none|cen...
    {
	return "$var_name";
    }
    else
    {
	return $text;
    }
}

sub _substitute_template

t/test-data/to-split/29.xml  view on Meta::CPAN

<documentRecord id="FF2C88E89A1DDFE4F8CD4845EEC285E3" xmlns="http://alvis.info/enriched/">
    <acquisition>
      <acquisitionData>
        <modifiedDate>1142938329956</modifiedDate>
        <httpServer>Apache</httpServer>
        <urls>
          <url>http://searchenginewatch.com/searchday/article.php/3592876</url>
        </urls>
      </acquisitionData>
      <canonicalDocument>        
        <section>At long last, Google has launched its ownGoogle Finance service. For years, those seeking specialty financial information via Google have been sent to competitors such as Yahoo and MSN. Now Google's providing financial information di...
      <metaData>
        <meta name="title">Google Launches Google Finance</meta>
        <meta name="dc:type">text/html</meta>
      </metaData>
      <links>
        <outlinks>
          <link type="a">
            <anchorText>
wrote</anchorText>
            <location>http://searchenginewatch.com/_subscribers/articles/article.php/3353401</location>

t/test-data/to-split/29.xml  view on Meta::CPAN

<documentRecord id="B4158BE3ACF2447B8B2FF1AFFB5361A0" xmlns="http://alvis.info/enriched/">
    <acquisition>
      <acquisitionData>
        <modifiedDate>1147168350172</modifiedDate>
        <httpServer>Apache</httpServer>
        <urls>
          <url>http://searchenginewatch.com/searchday/article.php/3603301</url>
        </urls>
      </acquisitionData>
      <canonicalDocument>        
        <section>Paying attention to web metrics is an increasingly important aspect of search marketing, with methodologies, processes and tools that can dramatically lift marketing and business performance. A special report from the Search Engine S...
      <metaData>
        <meta name="title">Multichannel Metrics: Managing the Sea of Data</meta>
        <meta name="dc:type">text/html</meta>
      </metaData>
      <links>
        <outlinks>
          <link type="a">
            <anchorText> Grantastic Designs, Inc.</anchorText>
            <location>http://www.grantasticdesigns.com/</location>
          </link>



( run in 3.090 seconds using v1.01-cache-2.11-cpan-e1769b4cff6 )