WWW-Search

 view release on metacpan or  search on metacpan

Programs/AutoSearch  view on Meta::CPAN

      $URLFilter = $SummaryURLFilter; # use whatever was in the first_index.html file
    }
    $SummaryURLFilter = $URLFilter;
  }
  if ($s_dbg)
    {
    print STDERR qq{Query is : "$SummaryQuery"};
    print STDERR qq{ with "@SummaryQueryOptions"} if ($#SummaryQueryOptions);
    print STDERR "\n";
    } # if
  print STDERR "URL Filter is \"$URLFilter\"\n" if $v_dbg;
#
# now locate the weekly format file.
# 1) qid/date.html, or 2) first_date.html, or 3) create one.
  &check_date_file($qid,$QueryName,$QueryString); #make qid/date.html

# read date.html and break into fields.
# make the search results into a list of (urls, title, & descr).
# (later sort it)
# note: Top & Bottom CAN BE different from index.html.
  my($WeeklyTop,
     $AppendedHeading,$AppendedTemplate,$Appended,
     $SuspendedHeading,$SuspendedTemplate,$Suspended,
     $WeeklyBottom)
   = &get_weekly_parts($qid);
# insert queryname into html Top from date.html
# usually this is not set up, because when we created the file we didn't
# have the data.  Do we have the Query Name?
  $WeeklyTop =~ s/>AutoSearch WEB Searching</$QueryName/;
  $v_dbg && print STDERR " + got weekly parts...\n";

  my $hits = 0; # actual no. of hits.
  my $saved = 0; # actual no. saved.
  my $search;
  # Search AltaVista, or whatever the user has specified.
  if($SummarySearchEngine) {
    $search = new WWW::Search($SummarySearchEngine);
  } else {
    $search = new WWW::Search(undef()); # must be undef to get default.
  }
  $search->{_host} = $opts{'h'} if defined($opts{'h'});
  $search->{_port} = $opts{'p'} if defined($opts{'p'});
  if (defined($opts{'http_proxy'}) && ($opts{'http_proxy'} ne ''))
    {
    print STDERR qq{ + applying http_proxy }, Dumper(\$opts{http_proxy}) if $opts{'debug'};
    $search->http_proxy(['http', ] => $opts{'http_proxy'});
    if (defined($opts{'http_proxy_user'}) && ($opts{'http_proxy_user'} ne ''))
      {
      print STDERR qq{ + applying $opts{http_proxy_user}...\n} if $opts{'debug'};
      $search->http_proxy_user($opts{'http_proxy_user'});
      $search->http_proxy_pwd($opts{'http_proxy_pwd'});
      } # if
    } # if
  elsif ($opts{'env_proxy'})
    {
    $search->env_proxy($opts{'env_proxy'});
    }
  elsif (0)
    {
    # This is the OLD code:
    $search->http_proxy($ENV{'HTTP_PROXY'}) if ($ENV{'HTTP_PROXY'});
    $search->http_proxy($ENV{'http_proxy'}) if ($ENV{'http_proxy'});
    } # if
  # submit search w/options.
  $search->native_query(WWW::Search::escape_query($SummaryQuery), $query_options);
  $search->login($opts{'userid'}, $opts{'password'});
  # Process the --ignore_channels argument(s):
  my @asChannel;
  foreach my $sChannel (@{$opts{'ignore_channels'}})
    {
    push @asChannel, split(/,/, $sChannel);
    } # foreach
  if ($search->can('ignore_channels')
      &&
      scalar(@asChannel)
     )
    {
    $search->ignore_channels(@asChannel);
    } # if
  # examine search results
  my($next_result);
  my(@new_weekly_url,@new_weekly_title,@new_weekly_description);
  my @aoResult;  # Parallel array to new_weekly_url
  my(@weekly_url,@weekly_title);

# care to see the old summary list?
#  print STDERR "old summary:\n";
#  foreach $line (@old_summary_url) {
#    print STDERR "$line\n";
#  }

  # how many hits?
  # convert the latest search results into lists of urls, titles &
  # descriptions (called new_weekly_*), filtered by $SummaryURLFilter
 NEXT_URL:
  while ($next_result = $search->next_result()) { # page-by-page
    $url = $next_result->url;
    $hits++; # how many were returned?
    if ($local_filter) { # exclude old pages from prev. version?
      # let's not display references to our own pages.
      next if $url =~ m,www\.isi\.edu/div7/ib/(.+)/(\d+)\.html$,o;
      # let's not display references to our old pages.
      next if $url =~ m,www\.isi\.edu/div7/ib/jog,o;
    }
    # let the user filter out URLs.
    if ( ($SummaryURLFilter) && ($url =~ m,$SummaryURLFilter,oi) ) {
#      print STDERR "filter out $url \n with filter: $SummaryURLFilter\n";
      $url_filter_count++;
      next;
    }
    $saved++; # how many were saved?
    push(@weekly_url,$url); # the complete set of hits
    $title = $next_result->title;
    push(@weekly_title,$title);
    # Was it in the old summary?  If so, don't save it.
    # If not, it is a new search result for this week.
    foreach $line (@old_summary_url)
      {
      # See if this url is in the summary; skip this url if it's in
      # the summary:
      next NEXT_URL if ($url eq $line);



( run in 1.241 second using v1.01-cache-2.11-cpan-d7a12ab2c7f )