WWW-Bookmark-Crawler

 view release on metacpan or  search on metacpan

Crawler.pm  view on Meta::CPAN


package WWW::Bookmark::Crawler;

use 5.006;
use strict;
use warnings;
use Carp;

our $VERSION = '0.01';

use OurNet::FuzzyIndex;
use HTML::LinkExtor;
use LWP::UserAgent;
use HTTP::Request::Common;
use Set::Scalar;

sub new($$) {
    my ($pkg, $arg) = @_;
    my $self = {
	SOURCE  =>
	    -f $arg->{SOURCE} ? 
		$arg->{SOURCE} :
		    ( ref($arg->{SOURCE}) eq "ARRAY" ? $arg->{SOURCE} : '' ),
	DBNAME  => $arg->{DBNAME},
	PEEK    => $arg->{PEEK},
	PROXY   => $arg->{PROXY},
	TIMEOUT => $arg->{TIMEOUT} || 10,
    };
    $self->{TOKENIZER} = ref($arg->{TOKENIZER}) ? $arg->{TOKENIZER} : \&tokenizer;

    if(ref($self->{SOURCE}) eq "ARRAY"){
	$self->{_LINKS} = $self->{SOURCE};
    }
    elsif($self->{SOURCE}){
	my $p = HTML::LinkExtor->new();
	$p->parse_file($self->{SOURCE});
	$self->{_LINKS} = [map{$_->[2]}grep{$_->[0] eq 'a' && $_->[1] eq 'href' }$p->links];
    }

    bless $self, $pkg;
}

sub tokenizer($) {
    caller eq __PACKAGE__ or croak(q/It's private!/);
    my(@t);
    for my $tok (grep {$_} split /\s+/o, $_[0]){
        my %words = OurNet::FuzzyIndex::parse($tok, 0);
        foreach my $m (keys %words){
            push @t, map{"$m$_"} grep { $_ !~ /^[\s\t\d]+$/ } keys %{$words{$m}};
            push @t, $m;
        }
        push @t, $_[0] unless @t;
    }
    return @t;
}

sub crawl($) {
    my $pkg = shift;
    my $ua = LWP::UserAgent->new;
    $ua->agent  ("WWW::Bookmark::Crawler $VERSION");
    $ua->proxy  ($pkg->{HTTP_PROXY});
    $ua->timeout($pkg->{TIMEOUT});
    local $| = 1;
    open (DB, ">".$pkg->{DBNAME}) or croak("cannot write to index file");
    local $SIG{INT} = sub { close DB; exit };

    for my $L (@{$pkg->{_LINKS}}){
	my $request = GET ($L);
	my $response = $ua->request($request);
	$response->is_success or next;

	my $stripper = WWW::Bookmark::TagStripper->new();
	$stripper->strip($response->content);

	if($pkg->{PEEK}){
	    print "{\n  $L\n";
	    print "  ".$stripper->{TITLE}."\n}\n";
	}

	print DB
	    $L, "\x02", $stripper->{TITLE}, "\x02",
	    join(qq/\x01/, $pkg->{TOKENIZER}->($stripper->{TEXT}) ), "\n";
    }
    close DB;
}

sub _loadDB($) {
    my $pkg = shift;
    my $L;
    my ($url, $title, $keywords);
    my $cnt = 0;
    $pkg->{_dbloaded} = 1;
    open (DB, $pkg->{DBNAME}) or croak("index file error");
    while($L = <DB>){
	next unless $L =~ /\x02/o;
	chomp $L;
	($url, $title, $keywords) = split /\x02/, $L;
	$pkg->{_URLS}->[$cnt] = $url;
	$pkg->{_TITLES}->[$cnt] = $title;
	foreach my $k (keys %{ { map {$_,1} split /\x01/, $keywords } }){
	    push @{$pkg->{_KEYWORDS}->{$k}}, $cnt;
	}
	$cnt++;
    }
    close DB;
}

sub query($) {
    my $pkg = shift;
    my $query = shift || croak("Query?");

    $pkg->_loadDB unless $pkg->{_dbloaded};

    my @queries = keys %{{
	map {$_,1}
	sort { @{$pkg->{_KEYWORDS}->{$a}} <=> @{$pkg->{_KEYWORDS}->{$b}} }
	$pkg->{TOKENIZER}->($query)
	}};

    my $seta = Set::Scalar->new(@{$pkg->{_KEYWORDS}->{$queries[0]}});



( run in 0.818 second using v1.01-cache-2.11-cpan-d7a12ab2c7f )