WWW-Search
view release on metacpan or search on metacpan
Programs/AutoSearch view on Meta::CPAN
$URLFilter = $SummaryURLFilter; # use whatever was in the first_index.html file
}
$SummaryURLFilter = $URLFilter;
}
# Debug trace: show the query (and its options, if any) and the URL filter
# that are about to be used.
if ($s_dbg)
  {
  print STDERR qq{Query is : "$SummaryQuery"};
  # Test the element count rather than $#SummaryQueryOptions: the last
  # index is -1 (true) for an empty array and 0 (false) for exactly one
  # option, which is the reverse of what this message needs.
  print STDERR qq{ with "@SummaryQueryOptions"} if (@SummaryQueryOptions);
  print STDERR "\n";
  } # if
print STDERR "URL Filter is \"$URLFilter\"\n" if $v_dbg;
#
# Now locate the weekly format file:
#   1) qid/date.html, or 2) first_date.html, or 3) create one.
&check_date_file($qid,$QueryName,$QueryString); # make qid/date.html
# Read date.html and break it into fields.
# The search results will be made into lists of urls, titles & descriptions
# (sorting is a later step).
# Note: Top & Bottom CAN BE different from index.html.
my($WeeklyTop,
   $AppendedHeading,$AppendedTemplate,$Appended,
   $SuspendedHeading,$SuspendedTemplate,$Suspended,
   $WeeklyBottom)
  = &get_weekly_parts($qid);
# Insert the query name into the HTML Top taken from date.html.
# Usually this is not set up, because when the file was created the data
# was not available yet.
# The pattern anchors on the '>' and '<' of the surrounding tags, so the
# replacement must put them back; otherwise the enclosing markup is broken.
$WeeklyTop =~ s/>AutoSearch WEB Searching</>$QueryName</;
print STDERR " + got weekly parts...\n" if $v_dbg;
my $hits = 0;  # how many results the search returned
my $saved = 0; # how many of those were kept after filtering
my $search;
# Search AltaVista, or whatever the user has specified.
# Direct method-call syntax is used instead of "new WWW::Search(...)":
# indirect object syntax is ambiguous to the parser.
if ($SummarySearchEngine)
  {
  $search = WWW::Search->new($SummarySearchEngine);
  }
else
  {
  # The argument must be undef to get the default backend.
  $search = WWW::Search->new(undef());
  }
# Optional host/port overrides from the -h / -p options.
$search->{_host} = $opts{'h'} if defined($opts{'h'});
$search->{_port} = $opts{'p'} if defined($opts{'p'});
# Proxy selection: an explicit --http_proxy wins over --env_proxy.
if (defined($opts{'http_proxy'}) && ($opts{'http_proxy'} ne ''))
  {
  print STDERR qq{ + applying http_proxy }, Dumper(\$opts{http_proxy}) if $opts{'debug'};
  $search->http_proxy(['http', ] => $opts{'http_proxy'});
  # Proxy credentials are only applied when a proxy user was given.
  if (defined($opts{'http_proxy_user'}) && ($opts{'http_proxy_user'} ne ''))
    {
    print STDERR qq{ + applying $opts{http_proxy_user}...\n} if $opts{'debug'};
    $search->http_proxy_user($opts{'http_proxy_user'});
    $search->http_proxy_pwd($opts{'http_proxy_pwd'});
    } # if
  } # if
elsif ($opts{'env_proxy'})
  {
  $search->env_proxy($opts{'env_proxy'});
  }
# (The OLD code passed $ENV{'HTTP_PROXY'} / $ENV{'http_proxy'} straight to
# http_proxy() here; it was already disabled and is no longer kept as an
# unreachable branch.)
# Submit the search with its options, then log in.
$search->native_query(WWW::Search::escape_query($SummaryQuery), $query_options);
$search->login($opts{'userid'}, $opts{'password'});
# Gather the --ignore_channels argument(s); each occurrence of the option
# may itself carry a comma-separated list.
my @asChannel;
push @asChannel, split(/,/, $_) foreach @{$opts{'ignore_channels'}};
# Hand the list to the search object only if it implements
# ignore_channels() and there is at least one channel to ignore.
$search->ignore_channels(@asChannel)
  if ($search->can('ignore_channels') && @asChannel);
# Working storage used while examining the search results.
my $next_result;
my @new_weekly_url;
my @new_weekly_title;
my @new_weekly_description;
my @aoResult; # Parallel array to new_weekly_url
my @weekly_url;
my @weekly_title;
# Care to see the old summary list?  Enable this:
# print STDERR "old summary:\n";
# foreach $line (@old_summary_url) {
#   print STDERR "$line\n";
# }
# How many hits?
# Convert the latest search results to a list of urls (with descriptions
# & titles), filtered by $SummaryURLFilter, called new_weekly_*
NEXT_URL:
while ($next_result = $search->next_result()) { # page-by-page
$url = $next_result->url;
$hits++; # how many were returned?
if ($local_filter) { # exclude old pages from prev. version?
# let's not display references to our own pages.
next if $url =~ m,www\.isi\.edu/div7/ib/(.+)/(\d+)\.html$,o;
# let's not display references to our old pages.
next if $url =~ m,www\.isi\.edu/div7/ib/jog,o;
}
# let the user filter out URLs.
if ( ($SummaryURLFilter) && ($url =~ m,$SummaryURLFilter,oi) ) {
# print STDERR "filter out $url \n with filter: $SummaryURLFilter\n";
$url_filter_count++;
next;
}
$saved++; # how many were saved?
push(@weekly_url,$url); # the complete set of hits
$title = $next_result->title;
push(@weekly_title,$title);
# Was it in the old summary? If so, don't save it.
# If not, it is a new search results for this week.
foreach $line (@old_summary_url)
{
# See if this url is in the summary; skip this url if it's in
# the summary:
next NEXT_URL if ($url eq $line);
( run in 1.241 second using v1.01-cache-2.11-cpan-d7a12ab2c7f )