HTML-LinkExtractor
view release on metacpan or search on metacpan
LinkExtractor.pm view on Meta::CPAN
}
$this->{_tp} = $tp;
$this->_parsola();
return();
}
sub _parsola {
my $self = shift;
## a stack of links for keeping track of TEXT
## which is all of "<a href>text</a>"
my @TEXT = ();
$self->{_LINKS} = [];
# ["S", $tag, $attr, $attrseq, $text]
# ["E", $tag, $text]
# ["T", $text, $is_data]
# ["C", $text]
# ["D", $text]
# ["PI", $token0, $text]
while (my $T = $self->{_tp}->get_token() ) {
my $NL; #NewLink
my $Tag = $T->[1]; # my $Tag = $T->return_tag;
my $got_TAGS_IN_NEED=0;
## Start tag?
if($T->[0] eq 'S' ) { # if($T->is_start_tag) {
next unless exists $TAGS{$Tag};
## Do we have a tag for which we want to capture text?
$got_TAGS_IN_NEED = 0;
$got_TAGS_IN_NEED = grep { /^\Q$Tag\E$/i } @TAGS_IN_NEED;
## then check to see if we got things besides META :)
if(defined $TAGS{ $Tag }) {
for my $Btag(@{$TAGS{$Tag}}) {
## and we check if they do have one with a value
if(exists $T->[2]->{ $Btag }) { # if(exists $T->return_attr()->{ $Btag }) {
$NL = $T->[2]; # $NL = $T->return_attr();
## TAGS_IN_NEED are tags in deed (start capturing the <a>STUFF</a>)
if($got_TAGS_IN_NEED) {
push @TEXT, $NL;
$NL->{_TEXT} = "";
}
}
}
}elsif($Tag eq 'meta') {
$NL = $T->[2]; # $NL = $T->return_attr();
if(defined $$NL{content} and length $$NL{content} and (
defined $$NL{'http-equiv'} && $$NL{'http-equiv'} =~ /refresh/i
or
defined $$NL{'name'} && $$NL{'name'} =~ /refresh/i
) ) {
my( $timeout, $url ) = split m{;\s*?URL=}i, $$NL{content},2;
my $base = $self->{_base};
$$NL{url} = URI->new_abs( $url, $base ) if $base;
$$NL{url} = $url unless exists $$NL{url};
$$NL{timeout} = $timeout if $timeout;
}
}
## In case we got nested tags
if(@TEXT) {
$TEXT[-1]->{_TEXT} .= $T->[-1] ; # $TEXT[-1]->{_TEXT} .= $T->as_is;
}
## Text?
}elsif($T->[0] eq 'T' ) { # }elsif($T->is_text) {
$TEXT[-1]->{_TEXT} .= $T->[-2] if @TEXT; # $TEXT[-1]->{_TEXT} .= $T->as_is if @TEXT;
## Declaration?
}elsif($T->[0] eq 'D' ) { # }elsif($T->is_declaration) {
## We look at declarations, to get anly custom .dtd's (tis linky)
my $text = $T->[-1] ; # my $text = $T->as_is;
if( $text =~ m{ SYSTEM \s \" ( [^\"]* ) \" > $ }ix ) {
$NL = { raw => $text, url => $1, tag => '!doctype' };
}
## End tag?
}elsif($T->[0] eq 'E' ){ # }elsif($T->is_end_tag){
## these be ignored (maybe not in between <a...></a> tags
## unless we're stacking (bug #5723)
if(@TEXT and exists $TAGS{$Tag}) {
$TEXT[-1]->{_TEXT} .= $T->[-1] ; # $TEXT[-1]->{_TEXT} .= $T->as_is;
my $pop = pop @TEXT;
$TEXT[-1]->{_TEXT} .= $pop->{_TEXT} if @TEXT;
$pop->{_TEXT} = _stripHTML( \$pop->{_TEXT} ) if $self->strip;
$self->{_cb}->($self, $pop) if exists $self->{_cb};
}
}
if(defined $NL) {
$$NL{tag} = $Tag;
my $base = $self->{_base};
for my $at( @VALID_URL_ATTRIBUTES ) {
if( exists $$NL{$at} ) {
$$NL{$at} = URI->new_abs( $$NL{$at}, $base) if $base;
}
}
if(exists $self->{_cb}) {
$self->{_cb}->($self, $NL ) if not $got_TAGS_IN_NEED or not @TEXT; #bug#5470
} else {
push @{$self->{_LINKS}}, $NL;
}
}
}## endof while (my $token = $p->get_token)
undef $self->{_tp};
return();
}
sub links {
my $self = shift;
( run in 2.338 seconds using v1.01-cache-2.11-cpan-71847e10f99 )