DBIx-TextSearch

 view release on metacpan or  search on metacpan

lib/DBIx/TextSearch.pm  view on Meta::CPAN

    my $ftp = Net::FTP->new($host,
			    Debug => 1,
			    Passive => 1);
    $ftp->login($username, $passwd);
    $ftp->cwd("$dir");
    $ftp->ascii();
    $ftp->get($remote_file, $local_file);
    $ftp->quit();

    # file transferred, return its location
    $self->say("Local file is: $local_file\n");
    return $local_file;
}
######################################################################
sub _http {
    # Fetch a document over HTTP and store it in a uniquely named local file.
    #
    # Parameters:
    #   $self - the object this method is invoked on
    #   $url  - URL of the document to fetch
    #
    # Returns the name of the local file on success.  If the request does
    # not succeed, the HTTP status line is reported with cluck and nothing
    # (undef / the empty list) is returned, so callers can tell that no
    # file was written.
    #
    # Croaks if the local file cannot be opened, written or closed.
    my ($self, $url) = @_;

    # Progress output goes through say(), as in the other methods here,
    # rather than being printed unconditionally to STDOUT.
    $self->say("URL to fetch: $url\n");

    # get unique name for local file
    my $local_file = _get_unique_filename();

    # fetch the file
    my $ua       = LWP::UserAgent->new;
    my $request  = HTTP::Request->new('GET', $url);
    my $response = $ua->request($request);

    unless ($response->is_success) {
        # failed fetch: report the status line and hand back nothing
        cluck $response->status_line;
        return;
    }

    # successful fetch: write the body to disk.
    # Three-argument open with a lexical handle keeps the file name from
    # being interpreted as part of the open mode.
    my $html = $response->content;
    CORE::open(my $out, '>', $local_file)
      or croak "Can't save HTML file $url to $local_file: $!";
    print {$out} $html
      or croak "Can't write HTML file $url to $local_file: $!";
    # buffered write errors only surface at close time
    close($out)
      or croak "Can't finish writing HTML file $url to $local_file: $!";

    # file transferred, return its location
    return $local_file;
}
######################################################################
sub _md5_hex_of_file {
    # Return the hex MD5 digest of the *contents* of the file $path, or
    # undef if $path is undefined or the file cannot be opened.
    my ($path) = @_;
    return undef unless defined $path;

    CORE::open(my $in, '<', $path) or return undef;
    binmode($in);
    my $data = do { local $/; <$in> };    # slurp the whole file
    close($in);

    return md5_hex(defined $data ? $data : '');
}

sub _rem_newer {
    # Compare the MD5 checksum of a document with the checksum recorded
    # in the index for the same location.
    #
    # Parameters:
    #   $self  - the object this method is invoked on
    #   $ftype - how to reach the document: 'http', 'ftp' or 'file'
    #   $loc   - location (URI) of the document
    #
    # Returns a two element list ($md5, $changed): the checksum computed
    # for the document, and 1 if it differs from the indexed checksum or
    # 0 if it is the same.  The original note here says $self->MD5 yields
    # 'none' for a location that is not in the index.
    my $self = shift();
    my ($ftype, $loc) = @_;
    my $md5_file; # checksum of the document as it is now

    my $md5_db = $self->MD5($loc); # checksum recorded in the index
    # $md5_db = 'none' if not in index

    $self->say("is file newer than already indexed version?\n");
    if ($ftype eq 'http') {
        $self->say("checking md5 sum with http\n");
        my $ua = LWP::UserAgent->new(env_proxy => 1,
                                     keep_alive => 1,
                                     timeout => 30);
        my $response = $ua->get($loc);
        # NOTE(review): despite the "Aborting" text, execution continues
        # and the body of the error response is checksummed below.
        cluck "Error while getting ", $response->request->uri,
          " -- ", $response->status_line, "\nAborting"
            unless $response->is_success;
        my $doc = $response->content();
        $md5_file = md5_hex($doc);
    } elsif ($ftype eq 'ftp') {
        # _ftp downloads to a local temporary file and returns its name.
        # Checksum the downloaded contents, not the temporary file name
        # (which would differ on every call).
        my $file = $self->_ftp($loc);
        $md5_file = _md5_hex_of_file($file);
        unless (defined $md5_file) {
            cluck "Can't read $file to checksum it: $!";
            $md5_file = md5_hex($file); # previous behaviour as a fallback
        }
        unlink($file);
    } elsif ($ftype eq 'file') {
        # Checksum the file contents so that local edits are detected.
        # $loc may be a plain path or a file: URI; only the latter is
        # passed through URI to obtain a file system path.
        my $path = $loc;
        if ($loc =~ /\Afile:/i) {
            my $from_uri = URI->new($loc)->file;
            $path = $from_uri if defined $from_uri;
        }
        $md5_file = _md5_hex_of_file($path);
        unless (defined $md5_file) {
            cluck "Can't read $path to checksum it: $!";
            $md5_file = md5_hex($loc); # previous behaviour as a fallback
        }
    }

    $self->say("file checksum : $md5_file\nindex checksum: $md5_db\n");

    if ($md5_file ne $md5_db) {
        # remote file is different from indexed version
        $self->say("uri is different from indexed version\n");
        return ($md5_file, 1);
    } else {
        $self->say("uri is is the same as the indexed version\n");
        return ($md5_file, 0);
    }

}
######################################################################
sub index_document {
    # given a document URI, add it to the index.
    # each word is to be indexed once.
    # also, only index if file is newer than database copy.
    my ($file, $http_content_type, @head, $toIndex, $md5, $changed, $toRemove);
    my $self = shift();
    my %params = @_;
    $http_content_type = 0;

    my $uri = $params{uri};

    $self->say("about to index $uri\n");

    # get file contents

    # if an ftp or http uri, call a sub to fetch the remote file, save
    # it somewhere useful (/tmp) and return the name that the file has
    # been saved under ($file)
    my $url = URI->new($uri) or $self->say("couldn't create URI object to check url options\n");
    $self->say("url is $uri\n");
    $self->say("URI object is $url\n");

    if ($url->scheme() eq 'ftp') {
	# an FTP address
	# fetch and index only if remote file is newer than db
	$self->say("fetching $uri via ftp\n");
	($md5, $changed) = $self->_rem_newer('ftp', $uri);
	if ($changed == 1) {
	    $file = $self->_ftp($uri);
	}
    } elsif ($url->scheme() eq 'http') {



( run in 0.567 second using v1.01-cache-2.11-cpan-39bf76dae61 )