DBIx-TextSearch
view release on metacpan or search on metacpan
lib/DBIx/TextSearch.pm view on Meta::CPAN
my $ftp = Net::FTP->new($host,
Debug => 1,
Passive => 1);
$ftp->login($username, $passwd);
$ftp->cwd("$dir");
$ftp->ascii();
$ftp->get($remote_file, $local_file);
$ftp->quit();
# file transferred, return its location
$self->say("Local file is: $local_file\n");
return $local_file;
}
######################################################################
sub _http {
    # Fetch a document with an HTTP GET and store the response body in a
    # freshly named local file.
    #
    # Parameters:
    #   $self - the object this method is invoked on
    #   $url  - address of the document to download
    # Returns:
    #   the name of the local file holding the body on success; nothing
    #   (undef in scalar context) when the request does not succeed
    # Dies:
    #   croaks when the local file cannot be created or written
    my ($self, $url) = @_;
    # progress output goes through say(), as in the rest of this file,
    # rather than straight to STDOUT
    $self->say("URL to fetch: $url\n");

    # get unique name for local file
    my $local_file = _get_unique_filename();

    # fetch the file
    my $ua       = LWP::UserAgent->new;
    my $request  = HTTP::Request->new('GET', $url);
    my $response = $ua->request($request);

    unless ($response->is_success) {
        # Report the HTTP status and hand back nothing.  Without the
        # explicit return the sub would yield cluck's own (true) result,
        # which a caller could mistake for a file name.
        cluck $response->status_line;
        return;
    }

    # successful fetch: write the body to disk.  Three-argument open with
    # a lexical handle keeps odd characters in $local_file from being read
    # as an open mode; binmode stops newline translation from altering the
    # bytes of the body.
    CORE::open(my $out, '>', $local_file) or
        croak "Can't save HTML file $url to $local_file: $!";
    binmode $out;
    print {$out} $response->content or
        croak "Can't save HTML file $url to $local_file: $!";
    # buffered write errors only surface when the handle is closed
    CORE::close($out) or
        croak "Can't save HTML file $url to $local_file: $!";

    # file transferred, return its location
    return $local_file;
}
######################################################################
sub _rem_newer {
    # Compare the MD5 checksum of a document's current contents with the
    # checksum recorded for it in the index.
    #
    # Parameters:
    #   $ftype - how to reach the document: 'http', 'ftp' or 'file'
    #   $loc   - the document's URI (for 'file', a file: URI or a plain path)
    # Returns:
    #   the list ($md5, $changed): $md5 is the checksum of the current
    #   contents (undef if they could not be read) and $changed is 1 when
    #   it differs from the indexed checksum, 0 when it is the same.
    my $self = shift();
    my ($ftype, $loc) = @_;
    my $md5_file;                   # checksum of the document as it is now
    my $md5_db = $self->MD5($loc);  # checksum recorded in the index
    # per the original notes, MD5() gives 'none' for a URI not yet indexed
    $self->say("is file newer than already indexed version?\n");

    my $local;        # local file whose contents are to be checksummed
    my $scratch = 0;  # true when $local is a temporary download to delete
    if ($ftype eq 'http') {
        $self->say("checking md5 sum with http\n");
        my $ua = LWP::UserAgent->new(env_proxy  => 1,
                                     keep_alive => 1,
                                     timeout    => 30);
        my $response = $ua->get($loc);
        # NOTE(review): despite the message, a failed request only warns;
        # the body of the error response is still checksummed below.
        # Confirm what callers expect before changing this.
        cluck "Error while getting ", $response->request->uri,
            " -- ", $response->status_line, "\nAborting"
            unless $response->is_success;
        $md5_file = md5_hex($response->content());
    } elsif ($ftype eq 'ftp') {
        # _ftp returns the name of the local copy it downloaded
        $local   = $self->_ftp($loc);
        $scratch = 1;
    } elsif ($ftype eq 'file') {
        # accept either a file: URI or a plain filesystem path
        $local = $loc =~ m{\Afile:}i ? URI->new($loc)->file : $loc;
        cluck "Can't map $loc to a local file" unless defined $local;
    }

    if (defined $local) {
        # Checksum the file's CONTENTS.  Hashing the name instead (as this
        # sub used to) gives a value that never matches for ftp, because
        # each download gets a new name, and never changes for local files.
        if (CORE::open(my $in, '<', $local)) {
            binmode $in;
            $md5_file = Digest::MD5->new->addfile($in)->hexdigest;
            CORE::close($in);
        } else {
            cluck "Can't read $local to checksum $loc: $!";
        }
        unlink($local) if $scratch;
    }

    # an unreadable document has no checksum; show it as empty and treat
    # it as changed, without tripping "uninitialized value" warnings
    my $shown = defined $md5_file ? $md5_file : '';
    $self->say("file checksum : $shown\nindex checksum: $md5_db\n");
    if (!defined $md5_file or $md5_file ne $md5_db) {
        # remote file is different from indexed version
        $self->say("uri is different from indexed version\n");
        return ($md5_file, 1);
    }
    $self->say("uri is is the same as the indexed version\n");
    return ($md5_file, 0);
}
######################################################################
sub index_document {
# given a document URI, add it to the index.
# each word is to be indexed once.
# also, only index if file is newer than database copy.
my ($file, $http_content_type, @head, $toIndex, $md5, $changed, $toRemove);
my $self = shift();
my %params = @_;
$http_content_type = 0;
my $uri = $params{uri};
$self->say("about to index $uri\n");
# get file contents
# if an ftp or http uri, call a sub to fetch the remote file, save
# it somewhere useful (/tmp) and return the name that the file has
# been saved under ($file)
my $url = URI->new($uri) or $self->say("couldn't create URI object to check url options\n");
$self->say("url is $uri\n");
$self->say("URI object is $url\n");
if ($url->scheme() eq 'ftp') {
# an FTP address
# fetch and index only if remote file is newer than db
$self->say("fetching $uri via ftp\n");
($md5, $changed) = $self->_rem_newer('ftp', $uri);
if ($changed == 1) {
$file = $self->_ftp($uri);
}
} elsif ($url->scheme() eq 'http') {
( run in 0.567 second using v1.01-cache-2.11-cpan-39bf76dae61 )