TAPORlib

 view release on metacpan or  search on metacpan

lib/TAPORlib.pm  view on Meta::CPAN

            next if /<\s*applet\b/im ;


        # These are seldom-used tags, or tags that seldom have URLs in them

        s/(<[^>]*\bsrc\s*=\s*["']?)([^\s"'>]*)/        $1 . &full_url($2,$url) /ime,
            next if /<\s*bgsound\b/im ;  # Microsoft only

        s/(<[^>]*\bcite\s*=\s*["']?)([^\s"'>]*)/       $1 . &full_url($2,$url) /ime,
            next if /<\s*blockquote\b/im ;

        s/(<[^>]*\bcite\s*=\s*["']?)([^\s"'>]*)/       $1 . &full_url($2,$url) /ime,
            next if /<\s*del\b/im ;

        s/(<[^>]*\bsrc\s*=\s*["']?)([^\s"'>]*)/        $1 . &full_url($2,$url) /ime,
            next if /<\s*embed\b/im ;    # Netscape only

        s/(<[^>]*\bsrc\s*=\s*["']?)([^\s"'>]*)/        $1 . &full_url($2,$url) /ime,
        s/(<[^>]*\bimagemap\s*=\s*["']?)([^\s"'>]*)/   $1 . &full_url($2,$url) /ime,
            next if /<\s*fig\b/im ;      # HTML 3.0

        s/(<[^>]*\bsrc\s*=\s*["']?)([^\s"'>]*)/        $1 . &full_url($2,$url) /ime,
            next if /<\s*h[1-6]\b/im ;   # HTML 3.0

        s/(<[^>]*\bprofile\s*=\s*["']?)([^\s"'>]*)/    $1 . &full_url($2,$url) /ime,
            next if /<\s*head\b/im ;

        s/(<[^>]*\bsrc\s*=\s*["']?)([^\s"'>]*)/        $1 . &full_url($2,$url) /ime,
            next if /<\s*hr\b/im ;       # HTML 3.0

        s/(<[^>]*\bsrc\s*=\s*["']?)([^\s"'>]*)/        $1 . &full_url($2,$url) /ime,
        s/(<[^>]*\blongdesc\s*=\s*["']?)([^\s"'>]*)/   $1 . &full_url($2,$url) /ime,
            next if /<\s*iframe\b/im ;

        s/(<[^>]*\bcite\s*=\s*["']?)([^\s"'>]*)/       $1 . &full_url($2,$url) /ime,
            next if /<\s*ins\b/im ;

        s/(<[^>]*\bsrc\s*=\s*["']?)([^\s"'>]*)/        $1 . &full_url($2,$url) /ime,
            next if /<\s*layer\b/im ;

        s/(<[^>]*\bhref\s*=\s*["']?)([^\s"'>]*)/       $1 . &full_url($2,$url) /ime,
        s/(<[^>]*\burn\s*=\s*["']?)([^\s"'>]*)/        $1 . &full_url($2,$url) /ime,
            next if /<\s*link\b/im ;

        s/(<[^>]*\burl\s*=\s*["']?)([^\s"'>]*)/        $1 . &full_url($2,$url) /ime,
            next if /<\s*meta\b/im ;     # Netscape only

        s/(<[^>]*\bsrc\s*=\s*["']?)([^\s"'>]*)/        $1 . &full_url($2,$url) /ime,
            next if /<\s*note\b/im ;     # HTML 3.0

        s/(<[^>]*\busemap\s*=\s*["']?)([^\s"'>]*)/     $1 . &full_url($2,$url) /ime,
        s/(<[^>]*\bcodebase\s*=\s*["']?)([^\s"'>]*)/   $1 . &full_url($2,$url) /ime,
        s/(<[^>]*\bdata\s*=\s*["']?)([^\s"'>]*)/       $1 . &full_url($2,$url) /ime,
        s/(<[^>]*\barchive\s*=\s*["']?)([^\s"'>]*)/    $1 . &full_url($2,$url) /ime,
        s/(<[^>]*\bclassid\s*=\s*["']?)([^\s"'>]*)/    $1 . &full_url($2,$url) /ime,
        s/(<[^>]*\bname\s*=\s*["']?)([^\s"'>]*)/       $1 . &full_url($2,$url) /ime,
            next if /<\s*object\b/im ;

        s/(<[^>]*\bsrc\s*=\s*["']?)([^\s"'>]*)/        $1 . &full_url($2,$url) /ime,
        s/(<[^>]*\bimagemap\s*=\s*["']?)([^\s"'>]*)/   $1 . &full_url($2,$url) /ime,
            next if /<\s*overlay\b/im ;  # HTML 3.0

        s/(<[^>]*\bcite\s*=\s*["']?)([^\s"'>]*)/       $1 . &full_url($2,$url) /ime,
            next if /<\s*q\b/im ;

        s/(<[^>]*\bsrc\s*=\s*["']?)([^\s"'>]*)/        $1 . &full_url($2,$url) /ime,
        s/(<[^>]*\bfor\s*=\s*["']?)([^\s"'>]*)/        $1 . &full_url($2,$url) /ime,
            next if /<\s*script\b/im ;

        s/(<[^>]*\bsrc\s*=\s*["']?)([^\s"'>]*)/        $1 . &full_url($2,$url) /ime,
            next if /<\s*select\b/im ;   # HTML 3.0

        s/(<[^>]*\bsrc\s*=\s*["']?)([^\s"'>]*)/        $1 . &full_url($2,$url) /ime,
            next if /<\s*ul\b/im ;       # HTML 3.0

	}   # foreach (@body)

	my ($commonline) = undef;
	foreach (@body) {$commonline = $commonline . $_ . ">";}
	substr($commonline,-1) = "";

	return $commonline;
}
###############################################################################
sub full_url{
    	my($link,$url)= @_ ;

	my $oldlink=$link;

	my ($baseurl,$relurl,$relurl2) = GetRelativeUrls($url);

	if($link=~ m|^(http://)|i) {goto exit_full_url;}
	if($link=~ m|^(mailto:)|i) {goto exit_full_url;}
	if($link=~ m|^(javascript:)|i) {goto exit_full_url;}
	if($link=~ m|^(#)|i) {goto exit_full_url;}

	$link=~ s|^/|$baseurl/|i;
	$link=~ s|^\./|$relurl/|i;
	$link=~ s|^\.\./|$relurl2/|i;

	if(!($link=~ m|^(http://)|i)) {$link = "$relurl/$link";}

exit_full_url:

	return $link;
}
###############################################################################
sub GetRelativeUrls {
	my($url) = @_;

$url     =~ m|^(http://)([^/\?\r\n]*)|i;
my $host    = $2;

### http://adm.ict.nsc.ru/rus/docs/perl/ - http://adm.ict.nsc.ru
### http://www.irtel.ru/ - http://www.irtel.ru
### http://www.irtel.ru  - http://www.irtel.ru

my $baseurl = "http://$host";

### http://adm.ict.nsc.ru/rus/docs/perl/ - http://adm.ict.nsc.ru/rus/docs/perl
### http://www.irtel.ru/ - http://www.irtel.ru



( run in 1.370 second using v1.01-cache-2.11-cpan-df04353d9ac )