WWW-Bookmark-Crawler
view release on metacpan or search on metacpan
package WWW::Bookmark::Crawler;
use 5.006;
use strict;
use warnings;
use Carp;
our $VERSION = '0.01';
use OurNet::FuzzyIndex;
use HTML::LinkExtor;
use LWP::UserAgent;
use HTTP::Request::Common;
use Set::Scalar;
sub new($$) {
my ($pkg, $arg) = @_;
my $self = {
SOURCE =>
-f $arg->{SOURCE} ?
$arg->{SOURCE} :
( ref($arg->{SOURCE}) eq "ARRAY" ? $arg->{SOURCE} : '' ),
DBNAME => $arg->{DBNAME},
PEEK => $arg->{PEEK},
PROXY => $arg->{PROXY},
TIMEOUT => $arg->{TIMEOUT} || 10,
};
$self->{TOKENIZER} = ref($arg->{TOKENIZER}) ? $arg->{TOKENIZER} : \&tokenizer;
if(ref($self->{SOURCE}) eq "ARRAY"){
$self->{_LINKS} = $self->{SOURCE};
}
elsif($self->{SOURCE}){
my $p = HTML::LinkExtor->new();
$p->parse_file($self->{SOURCE});
$self->{_LINKS} = [map{$_->[2]}grep{$_->[0] eq 'a' && $_->[1] eq 'href' }$p->links];
}
bless $self, $pkg;
}
sub tokenizer($) {
caller eq __PACKAGE__ or croak(q/It's private!/);
my(@t);
for my $tok (grep {$_} split /\s+/o, $_[0]){
my %words = OurNet::FuzzyIndex::parse($tok, 0);
foreach my $m (keys %words){
push @t, map{"$m$_"} grep { $_ !~ /^[\s\t\d]+$/ } keys %{$words{$m}};
push @t, $m;
}
push @t, $_[0] unless @t;
}
return @t;
}
sub crawl($) {
my $pkg = shift;
my $ua = LWP::UserAgent->new;
$ua->agent ("WWW::Bookmark::Crawler $VERSION");
$ua->proxy ($pkg->{HTTP_PROXY});
$ua->timeout($pkg->{TIMEOUT});
local $| = 1;
open (DB, ">".$pkg->{DBNAME}) or croak("cannot write to index file");
local $SIG{INT} = sub { close DB; exit };
for my $L (@{$pkg->{_LINKS}}){
my $request = GET ($L);
my $response = $ua->request($request);
$response->is_success or next;
my $stripper = WWW::Bookmark::TagStripper->new();
$stripper->strip($response->content);
if($pkg->{PEEK}){
print "{\n $L\n";
print " ".$stripper->{TITLE}."\n}\n";
}
print DB
$L, "\x02", $stripper->{TITLE}, "\x02",
join(qq/\x01/, $pkg->{TOKENIZER}->($stripper->{TEXT}) ), "\n";
}
close DB;
}
sub _loadDB($) {
my $pkg = shift;
my $L;
my ($url, $title, $keywords);
my $cnt = 0;
$pkg->{_dbloaded} = 1;
open (DB, $pkg->{DBNAME}) or croak("index file error");
while($L = <DB>){
next unless $L =~ /\x02/o;
chomp $L;
($url, $title, $keywords) = split /\x02/, $L;
$pkg->{_URLS}->[$cnt] = $url;
$pkg->{_TITLES}->[$cnt] = $title;
foreach my $k (keys %{ { map {$_,1} split /\x01/, $keywords } }){
push @{$pkg->{_KEYWORDS}->{$k}}, $cnt;
}
$cnt++;
}
close DB;
}
sub query($) {
my $pkg = shift;
my $query = shift || croak("Query?");
$pkg->_loadDB unless $pkg->{_dbloaded};
my @queries = keys %{{
map {$_,1}
sort { @{$pkg->{_KEYWORDS}->{$a}} <=> @{$pkg->{_KEYWORDS}->{$b}} }
$pkg->{TOKENIZER}->($query)
}};
my $seta = Set::Scalar->new(@{$pkg->{_KEYWORDS}->{$queries[0]}});
( run in 0.818 second using v1.01-cache-2.11-cpan-d7a12ab2c7f )