combine

 view release on metacpan or  search on metacpan

Combine/HTMLExtractor.pm  view on Meta::CPAN


## a stack of links for keeping track of TEXT
## which is all of "<a href>text</a>"
    my @TEXT = ();
    $self->{_LINKS} = [];
    my $tottext=''; #All visible text
    my $inHeading=0; my $headtext=''; #All headings
#  ["S",  $tag, $attr, $attrseq, $text]
#  ["E",  $tag, $text]
#  ["T",  $text, $is_data]
#  ["C",  $text]
#  ["D",  $text]
#  ["PI", $token0, $text]

    while (my $T = $self->{_tp}->get_token() ) {
        my $NL; #NewLink
        my $Tag = $T->[1]; #        my $Tag = $T->return_tag;
        my $got_TAGS_IN_NEED=0;
#	Adump($T); #Debug

## Start tag?
        if( $T->[0] eq 'S' ) { #        if($T->is_start_tag) {
	    if ( $Tag =~ /^h\d$/ ) { $inHeading=1; }
	    if (exists $SECTIONTAGS{$Tag}) { $tottext .= "\n\n";}
            next unless exists $TAGS{$Tag};

## Do we have a tag for which we want to capture text?
            $got_TAGS_IN_NEED = 0;
            $got_TAGS_IN_NEED = grep { /^\Q$Tag\E$/i } @TAGS_IN_NEED;

## then check to see if we got things besides META :)
            if(defined $TAGS{ $Tag }) {

                for my $Btag(@{$TAGS{$Tag}}) {
## and we check if they do have one with a value
                    if(exists $T->[2]->{ $Btag }) { #                    if(exists $T->return_attr()->{ $Btag }) {

                        $NL = $T->[2]; #Save all attributes incl ALT in IMG # $NL = $T->return_attr();
## TAGS_IN_NEED are tags in deed (start capturing the <a>STUFF</a>)
                        if($got_TAGS_IN_NEED) {
                            push @TEXT, $NL;
                            $NL->{_TEXT} = "";
                        }
                    }
                }
		if ($Tag eq 'img') {
		    #extract ALT-text
		    if (exists $T->[2]->{alt}) {
			$tottext .= '[' . $T->[2]->{alt} . '] ';
		    } ##else { $tottext .= '[IMG]'; }
		}
            }elsif($Tag eq 'meta') {
                $NL = $T->[2]; #                $NL = $T->return_attr();

                if(defined $$NL{content} and length $$NL{content} and (
                    defined $$NL{'http-equiv'} &&  $$NL{'http-equiv'} =~ /refresh/i
                    or
                    defined $$NL{'name'} &&  $$NL{'name'} =~ /refresh/i
                    ) ) {

                    my( $timeout, $url ) = split m{;\s*?URL=}, $$NL{content},2;
                    my $base = $self->{_base};
                    $$NL{url} = URI->new_abs( $url, $base ) if $base;
                    $$NL{url} = $url unless exists $$NL{url};
                    $$NL{timeout} = $timeout if $timeout;
                }
            }

            ## In case we got nested tags
            if(@TEXT) {
                $TEXT[-1]->{_TEXT} .= $T->[-1]; #                $TEXT[-1]->{_TEXT} .= $T->as_is;
#		my $t=$T->[-1]; print " Nested: $t\n"; #debug
            }

## Text?
        }elsif($T->[0] eq 'T') { #        }elsif($T->is_text) {
            $TEXT[-1]->{_TEXT} .= $T->[-2] if @TEXT; #            $TEXT[-1]->{_TEXT} .= $T->as_is if @TEXT;
	    $tottext .=  $T->[-2] . ' '; #	    $tottext .=  $T->as_is;
	    if ( $inHeading ) { $headtext .= $T->[-2]; } #	    if ( $h ne '' ) { $headtext .= $T->as_is . '; '; }
## Declaration?
        }elsif($T->[0] eq 'D') { #        }elsif($T->is_declaration) {
## We look at declarations, to get anly custom .dtd's (tis linky)
            my $text = $T->[-1]; #            my $text = $T->as_is;
            if( $text =~ m{ SYSTEM \s \" ( [^\"]* ) \" > $ }ix ) {
                $NL = { raw => $text, url => $1, tag => '!doctype' };
            }
## End tag?
        }elsif($T->[0] eq 'E'){ #        }elsif($T->is_end_tag){
	    if ( $Tag =~ /^h\d$/ ) { $inHeading=0; $headtext .= '; '; }
	    if (exists $SECTIONTAGS{$Tag}) { $tottext .= "\n\n";}
## these be ignored (maybe not in between <a...></a> tags
## unless we're stacking (bug #5723)
            if(@TEXT and exists $TAGS{$Tag}) {
                $TEXT[-1]->{_TEXT} .= $T->[-1]; #                $TEXT[-1]->{_TEXT} .= $T->as_is;
                my $pop = pop @TEXT;
                $TEXT[-1]->{_TEXT} .= $pop->{_TEXT} if @TEXT;
                $pop->{_TEXT} = _stripHTML( \$pop->{_TEXT} ) if $self->strip;
#		my $t = $pop->{_TEXT}; print " I endtag stripHTML: $t\n";
                $self->{_cb}->($self, $pop) if exists $self->{_cb};
            }
        }

        if(defined $NL) {
            $$NL{tag} = $Tag;

            my $base = $self->{_base};

            for my $at( @VALID_URL_ATTRIBUTES ) {
                if( exists $$NL{$at} ) {
                    $$NL{$at} = URI->new_abs( $$NL{$at}, $base) if $base;
                }
            }

            if(exists $self->{_cb}) {
                $self->{_cb}->($self, $NL ) if not $got_TAGS_IN_NEED or not @TEXT; #bug#5470
            } else {
                push @{$self->{_LINKS}}, $NL;
#		my $t=$$NL{_TEXT}.';'.$$NL{tag}; print " PushNL: $t\n";
#		foreach $t (keys(%{$NL})) { print " K=$t; V=$$NL{$t}\n";  }
            }
        }



( run in 1.467 second using v1.01-cache-2.11-cpan-71847e10f99 )