Apache-Logmonster

 view release on metacpan or  search on metacpan

lib/Apache/Logmonster.pm  view on Meta::CPAN

        };

        my $lines = 0;
        $self->_progress_begin("\t parsing entries from $file") if $debug;

        while ( $gz->gzreadline($_) > 0 ) {
            chomp $_;
            $lines++;
            $self->_progress_continue() if ( $debug && $lines =~ /00$/ );

            my %data;
            @data{@captured_fields} = /$re/;  # no need for /o, a compiled regexp

            # make sure the log format has the vhost tag appended
            my $vhost = $data{'vhost'};
            if ( !$vhost || $vhost eq '-' ) {
                #print "Invalid log entries! Read the FAQ!\n" if $debug;
                print $_ . "\n" if $debug > 2;
                $vhost = $conf->{default_vhost};
                $bad++;
            };

            $vhost = lc($vhost);

            $self->spam_check(\%data, \%count);

            if ( ! $fhs{$vhost} ) {
                $self->open_vhost_handle( $vhost );
            };
            if ( $fhs{$vhost} ) {
                my $fh = $fhs{$vhost};
                print $fh "$_\n";
                $count{$vhost}++;
                next;
            };
            print "\nthe main domain for $vhost is missing!\n" if $debug > 1;
            $orphans{$vhost} = $vhost;
        };
        $gz->gzclose();

        $self->_progress_end() if $debug;
    };

    $self->report_matches( \%count, \%orphans);
    $self->report_spam_hits( \%count );
    $self->report_bad_hits( $bad );

    return \%fhs;
};

# Score one parsed log entry for referer-spam likelihood and, when it looks
# like spam, add it to the running spam totals.
#
# Arguments:
#   $data  - hashref of the fields captured from one log line. The keys read
#            here are status, ref, ua and bytes; any of them may be missing.
#   $count - hashref of running totals, updated in place for spam entries:
#            spam, bytes, spam_agents{<ua>} and spam_referers{<ref>}.
#
# Does nothing unless $self->{conf}{spam_check} is true.
#
# Control flow: when the entry is classified as spam this sub does NOT
# return normally. It leaves via `next`, which advances the loop the caller
# is currently running, so the rest of the caller's loop body is skipped for
# that line. It must therefore be called from inside a loop. For non-spam
# entries it simply returns.
sub spam_check {
    my ($self, $data, $count) = @_;
    my $conf = $self->{conf};

    return if ! $conf->{spam_check};

    my $spam_score = 0;

    # HTTP status: the three cases are mutually exclusive.
    if ( $data->{status} ) {
        if ( $data->{status} == 404 ) {
            $spam_score += 1;    # a 404 alone is not a sign of naughtiness
        }
        elsif ( $data->{status} == 412 ) {
            $spam_score += 1;    # httpd config slapping them
        }
        elsif ( $data->{status} == 403 ) {
            $spam_score += 2;    # httpd config slapping them
        }
    }

    # nearly all of my referer spam has a # ending the referer string
    if ( $data->{ref} && $data->{ref} =~ /#$/ ) {
        $spam_score += 2;
    }

    # should check for invalid/suspect useragent strings here
    if ( $data->{ua} ) {
        if ( $data->{ua} =~ /crazy/ixms ) {
            $spam_score += 1;
        }
        elsif ( $data->{ua} =~ /email/i ) {
            $spam_score += 3;
        }
#       elsif ( $data->{ua} =~ /windows/ ) { $spam_score += 1; }   # disabled
    }

    # A total above 2 marks the entry as spam. That takes either a single
    # strong signal (the /email/ user agent scores 3 on its own) or a
    # combination of weaker ones.
    if ( $spam_score > 2 ) {
        $count->{spam}++;

        # Only add byte counts that are purely digits; a "-" placeholder or
        # any other non-numeric value is left out of the total.
        if ( defined $data->{bytes}
            && $data->{bytes} =~ /\A[0-9]+\z/ )
        {
            $count->{bytes} += $data->{bytes};
        }

        # The UA or referer can be absent on a spam hit (e.g. an /email/ UA
        # with no referer). Tally those under the empty string, which is the
        # key undef would stringify to anyway, without the
        # "uninitialized value" warning.
        my $agent   = defined $data->{ua}  ? $data->{ua}  : '';
        my $referer = defined $data->{ref} ? $data->{ref} : '';
        $count->{spam_agents}{$agent}++;
        $count->{spam_referers}{$referer}++;

#				printf "%3s - %30s - %30s \n", $data->{status},
#				$data->{ref}, $data->{ua};

        # Deliberately jump to the next iteration of the CALLER's loop so the
        # line is not processed any further. Perl warns about leaving a sub
        # this way ("Exiting subroutine via next"); silence just that.
        no warnings 'exiting';
        next;    # skips processing the line
    }

# TODO: also keep track of ham referers, and print in referer spam reports, so
# that I can see which UA are entirely spammers and block them in my Apache
# config.
#   else {
#       $count->{ham_referers}{$data->{ref}}++;
#   }

    return;
};

sub open_vhost_handle {
    my $self = shift;
    my $vhost = shift;

    my $fh = new FileHandle;   # create a file handle for each ServerName
    $fhs{$vhost} = $fh;         # store in a hash keyed off the domain name

    my $debug  = $self->{debug};

    my $dir = $self->{conf}{tmpdir};    # normally /var/log/(apache|http)/tmp



( run in 1.294 second using v1.01-cache-2.11-cpan-98e64b0badf )