ApacheLog-Parser

 view release on metacpan or  search on metacpan

bin/loghack  view on Meta::CPAN

  }
}

=head2 reskip

Regenerate the skiplist for a given chunk.

=cut

# do_reskip($opt, @files)
#
# Rebuilds the skiplist for each given file: every line is split on tabs
# and handed to the skipper's matcher; the (1-based) numbers of matching
# lines are recorded through a skip writer created for that file.
#
#   $opt   - options structure, passed through to repo_files(),
#            get_skipper() and skipfilename()
#   @files - files to process (first mapped through repo_files())
#
# Prints one progress line per file ("<nice name> -- HH:MM:SS").
# Dies if one of the files does not exist.
sub do_reskip {
  my ($opt, @files) = @_;

  # NOTE(review): presumably resolves the arguments to paths inside the
  # repository -- confirm against repo_files()
  @files = repo_files($opt, @files);

  my $skipper = get_skipper($opt);
  my $doskip = $skipper->get_matcher;
  foreach my $file (@files) {
    unless(-e $file) {
      die "no such file:\n  $file\n";
    }
    my $fh = open_file($file);

    # progress output: file label plus the current local time
    my $nicename = nice_name($file);
    my $start = time;
    print "$nicename -- ",
      sprintf("%02d:%02d:%02d", (localtime($start))[2,1,0]), "\n";

    my $skipfile = skipfilename($opt, $file);
    my $sw = $skipper->new_writer($skipfile);
    my $lnum = 0;
    while(my $line = <$fh>) {
      $lnum++;
      chomp($line);
      my @v = split(/\t/, $line);
      # create skiplist
      if($doskip and $doskip->(\@v)) {$sw->skip($lnum);}
    }

    # Close the input explicitly, the same way do_prep() closes handles
    # from open_file(), instead of leaving it to the implicit close at
    # scope exit (which would run under whatever CHLD handler is set).
    {local $SIG{CHLD}; close($fh);} # stupid macs
  }

}

=begin notes

The files are split per-hour.  Time zone adjustments are going to be an
issue.  There's also a potential race condition between two nodes, so
the outputs will always have a ".$chunk" appended to them.  The value of
$part is either 0 or 1 (and only switched to 1 at the start of the file.)

And another issue:  delay.  The request init time is what's shown, but
it doesn't get logged until the request completes.  So a 10min request
will not appear until 10min later.  If there are any large downloads,
they could possibly even span a couple of logrotates.

This also means that tomorrow or the next day could conceivably hold a
bit of data from a big download that started 24+ hours ago.  In
practice, logrotate is actually just disposing of this data when it runs
gzip.  That is, a request always goes in the logfile that was open when
the apache process spawned?

Still need to figure out the cleanup pass.  Add the skiplists together
(and/or rename them), figure out where to tie-off the last item, etc.
Probably need some tracking of sources and/or chunks.  Chunks can
probably be treated as closed until further notice as long as a
chunkcount file is maintained somewhere.

=end notes

=begin tznotes

Probably going to just leave the date string unprocessed (but we will
definitely slot it into files according to the adjusted zone.)  Of
course, the date+hour+tz is used to memoize the outgoing date, so taking
the localtime and chunking that back together with the minutes+seconds
wouldn't be a big deal.  We will need to address the dst issue though.

=end tznotes

=cut

=head2 prep

Parse a raw logfile and split it into hourly chunks.

  loghack prep servername/logfile.gz

=cut

sub do_prep {
  my ($opt, @files) = @_;

  my $repo = $opt->{repository} or
    die "must have repository setting for prep()\n";

  my $doskip;
  my $skipper;
  if(-e (my $skipconf = "$repo/.config/skips.conf")) {
    my ($skip) = YAML::LoadFile($skipconf);
    $skipper = ApacheLog::Parser::SkipList->new();
    $skipper->set_config($skip);
    $doskip = $skipper->get_matcher;
  }

  my @loaded;
  foreach my $file (@files) {
    unless(-e $file) {
      my $msg = "no such file:\n  $file\n";
      if($opt->{missok}) { warn $msg; next };
      die $msg;
    }
    my $outpath = repository_path($opt, $file);
    my $fh = open_file($file);

    my $nicename = nice_name($file);

    my $checksum = checksum($fh, 50);
    my $checkfile = "$outpath.loaded/$checksum";
    my $linecount = 0;
    my $ch;
    if(-e "$outpath/.loaded/$checksum") {
      warn "assume $nicename is done\n";
      {local $SIG{CHLD}; close($fh);} # stupid macs
      next;
      # TODO fast-forward support
      # $linecount = $old_linecount; and etc
    }
    else {
      record_source($opt, $file, "$outpath.sources/", $checksum);

      # TODO this could stand to be more atomic
      { # record results
        want_dir("$outpath.loaded");
        my $tag = ($ENV{HOSTNAME} || '') . '.' . $$;
        open($ch, '>', "$checkfile.$tag") or
          die "cannot write '$checkfile.$tag' $!";
        # TODO chmod
        rename("$checkfile.$tag", $checkfile) or
          die "cannot make $checkfile $!";
      }

      # TODO a replayable pipe would be nice
      {local $SIG{CHLD}; close($fh);} # stupid macs
      $fh = open_file($file);
    }

    $opt->{quiet} or print "$nicename -- ",
      sprintf("%02d:%02d:%02d", (localtime)[2,1,0]), "\n";

    my %outhandles;
    my $sw;
    my $out;

    my $chunk = 2;
    my $next_chunk = sub {
      my ($date, $hour, $tz) = @_;

      # might have already started that chunk
      if(my $handles = $outhandles{"$date$hour$tz"}) {
        #warn "back to $date:$hour$tz\n";
        ($out, $sw) = @$handles;
        return;
      }

      # TODO include timezone in this calc
      my $datestring = get_datestring($date);

      # make the tz three digits
      (my $tzout = $tz) =~ s/00$//;
      $tzout = '+' . $tzout if(length($tzout) == 2);

      my $outfile = $outpath . $datestring . ".$hour$tzout.$chunk.tsv.gz";
      push(@loaded, $outfile);
      #warn "writing $outfile\n";
      if(-e $outfile) {
        # XXX how to decide whether to skip completely?
        die "already have $outfile\n";
      }
      $chunk = 1; # from now on
      if($skipper) { # TODO how to reset skipcount?
        my $skipfile = skipfilename($opt, $outfile);
        $sw = $skipper->new_writer($skipfile);
      }
      $out = pipe_out($outfile);
      print $ch File::Basename::basename($outfile), "\n";
      $outhandles{"$date$hour$tz"} = [$out, $sw];
    };

    my $cdate = '';
    my %lc;
    while(my $line = <$fh>) {
      $linecount++;
      chomp($line);
      my $v = parse_line($line);

      # check date/time
      my ($d, $h, $rest) = split(/:/, $v->[dtime], 3);
      my ($tz) = ($rest =~ m/ ([-+]?\d+)/);



( run in 1.530 second using v1.01-cache-2.11-cpan-39bf76dae61 )