ApacheLog-Parser

 view release on metacpan or  search on metacpan

bin/loghack  view on Meta::CPAN


=end tznotes

=cut

=head2 prep

Parse a raw logfile and split it into hourly chunks.

  loghack prep servername/logfile.gz

=cut

# Load each raw logfile in @files into the repository, splitting its
# parsed lines into per-date/hour/timezone chunk files.
#
#   $opt   - options hashref.  This sub reads {repository} (required),
#            {missok} (warn and skip instead of dying on a missing input
#            file) and {quiet} (suppress the progress line); $opt is also
#            handed to the path/record helpers.
#   @files - paths of the logfiles to load.
#
# Returns the list of chunk filenames that were started.
#
# Dies when the repository setting is missing, when an input file does
# not exist (unless {missok}), when a chunk file already exists, or when
# the "loaded" marker file cannot be written.
sub do_prep {
  my ($opt, @files) = @_;

  my $repo = $opt->{repository} or
    die "must have repository setting for prep()\n";

  # The skip matcher is optional: both stay undef without a skips.conf.
  my $doskip;
  my $skipper;
  if(-e (my $skipconf = "$repo/.config/skips.conf")) {
    my ($skip) = YAML::LoadFile($skipconf);
    $skipper = ApacheLog::Parser::SkipList->new();
    $skipper->set_config($skip);
    $doskip = $skipper->get_matcher;
  }

  my @loaded;
  foreach my $file (@files) {
    unless(-e $file) {
      my $msg = "no such file:\n  $file\n";
      if($opt->{missok}) { warn $msg; next };
      die $msg;
    }
    my $outpath = repository_path($opt, $file);
    my $fh = open_file($file);

    my $nicename = nice_name($file);

    # The marker file is named by a checksum taken from the input handle.
    my $checksum = checksum($fh, 50);
    my $checkfile = "$outpath.loaded/$checksum";
    my $linecount = 0;
    my $ch;
    # Test the same path that gets written below, so that the "already
    # loaded" check cannot drift away from the marker location.
    if(-e $checkfile) {
      warn "assume $nicename is done\n";
      {local $SIG{CHLD}; close($fh);} # stupid macs
      next;
      # TODO fast-forward support
      # $linecount = $old_linecount; and etc
    }
    else {
      record_source($opt, $file, "$outpath.sources/", $checksum);

      # TODO this could stand to be more atomic
      { # record results
        want_dir("$outpath.loaded");
        # Write under a host.pid-tagged temporary name, then rename into
        # place; $ch stays open so chunk names can be appended later.
        my $tag = ($ENV{HOSTNAME} || '') . '.' . $$;
        open($ch, '>', "$checkfile.$tag") or
          die "cannot write '$checkfile.$tag' $!";
        # TODO chmod
        rename("$checkfile.$tag", $checkfile) or
          die "cannot make $checkfile $!";
      }

      # The checksum consumed part of the stream, so start over.
      # TODO a replayable pipe would be nice
      {local $SIG{CHLD}; close($fh);} # stupid macs
      $fh = open_file($file);
    }

    $opt->{quiet} or print "$nicename -- ",
      sprintf("%02d:%02d:%02d", (localtime)[2,1,0]), "\n";

    my %outhandles; # "$date$hour$tz" => [output handle, skip writer]
    my $sw;         # skip writer for the current chunk (undef if no skips)
    my $out;        # output handle for the current chunk

    # Only the first chunk opened for this input is numbered 2; every
    # later one is numbered 1 (see the assignment inside the closure).
    my $chunk = 2;
    # Switch $out/$sw to the chunk for ($date, $hour, $tz), creating the
    # chunk file on first use.
    my $next_chunk = sub {
      my ($date, $hour, $tz) = @_;

      # might have already started that chunk
      if(my $handles = $outhandles{"$date$hour$tz"}) {
        #warn "back to $date:$hour$tz\n";
        ($out, $sw) = @$handles;
        return;
      }

      # TODO include timezone in this calc
      my $datestring = get_datestring($date);

      # make the tz three digits
      (my $tzout = $tz) =~ s/00$//;
      $tzout = '+' . $tzout if(length($tzout) == 2);

      my $outfile = $outpath . $datestring . ".$hour$tzout.$chunk.tsv.gz";
      push(@loaded, $outfile);
      #warn "writing $outfile\n";
      if(-e $outfile) {
        # XXX how to decide whether to skip completely?
        die "already have $outfile\n";
      }
      $chunk = 1; # from now on
      if($skipper) { # TODO how to reset skipcount?
        my $skipfile = skipfilename($opt, $outfile);
        $sw = $skipper->new_writer($skipfile);
      }
      $out = pipe_out($outfile);
      # Each started chunk is listed in the marker file by basename.
      print $ch File::Basename::basename($outfile), "\n";
      $outhandles{"$date$hour$tz"} = [$out, $sw];
    };

    my $cdate = ''; # key of the chunk currently being written
    my %lc;         # per-chunk line counters
    while(my $line = <$fh>) {
      $linecount++;
      chomp($line);
      my $v = parse_line($line);

      # check date/time
      my ($d, $h, $rest) = split(/:/, $v->[dtime], 3);
      my ($tz) = ($rest =~ m/ ([-+]?\d+)/);
      if("$d$h$tz" ne $cdate) {
        $next_chunk->($d, $h, $tz);
        $cdate = "$d$h$tz";
        $lc{$cdate} ||= 0;
        #warn "$d $h $tz\n";
      }
      my $lnum = ++$lc{$cdate};

      # create skiplist (only when a skip config was loaded -- without
      # one, $doskip is undef and must not be called)
      if($doskip and $doskip->($v)) {$sw->skip($lnum);}

      print $out join("\t", @$v), "\n";
    }

    # A trailing linecount in the marker file means the load finished.
    print $ch "$linecount\n";
    close($ch) or die "write '$checkfile' failed $!";
    # TODO race checks/chmod

  }
  wait(); # XXX need this?
  return(@loaded);
}

=for doc ###############################################################
Examine the */.loaded files and verify that each one ends with a
linecount (i.e. that loading finished.)
  loghack check */.loaded/*

=cut

# Report every checkfile in @files that run_check() flags as incomplete.
# Prints one "NC" line per such file with the number of parts it lists.
# $opt is accepted but not used here.
sub do_check {
  my ($opt, @files) = @_;

  for my $checkfile (@files) {
    my $parts = run_check($checkfile);
    next unless($parts);

    my $count = scalar(@$parts);
    print "NC $checkfile (", $count, " parts)\n";
  }
}
# For every checkfile in @files that run_check() flags as incomplete:
# print what was found, delete each listed part that exists on disk,
# and then delete the checkfile itself.  Dies if any unlink() fails.
# $opt is accepted but not used here.
sub do_sweep {
  my ($opt, @files) = @_;

  for my $checkfile (@files) {
    my $parts = run_check($checkfile);
    next unless($parts);

    my $count = scalar(@$parts);
    print "NC $checkfile (", $count, " parts)\n";

    for my $chunk (@$parts) {
      print "  $chunk\n";
      next unless(-e $chunk);
      unlink($chunk) or die "cannot unlink('$chunk') $!";
    }

    unlink($checkfile) or die "cannot unlink('$checkfile') $!";
  }
}
# Decide whether the load recorded in $checkfile ran to completion.
#
# The checkfile is read as a list of lines; it counts as complete when
# its final line consists only of digits (the linecount, which may be 0
# for an empty log).
#
# Returns nothing (empty list / undef) when complete.  Otherwise returns
# an arrayref holding every line of the checkfile, each prefixed with the
# directory two levels above $checkfile (the arrayref may be empty).
#
# Dies if $checkfile is a directory or cannot be opened for reading.
sub run_check {
  my ($checkfile) = @_;

  die "'$checkfile' is a directory" if(-d $checkfile);
  open(my $fh, '<', $checkfile) or die "cannot read '$checkfile' $!";
  chomp(my @list = <$fh>);
  close($fh);

  # No separate truth test on the last line: that would reject a
  # legitimate linecount of "0", and the regex already rejects ''.
  return() if(@list and $list[-1] =~ m/^\d+$/);

  my $dir = File::Basename::dirname(File::Basename::dirname($checkfile));
  return([map({"$dir/$_"} @list)]);
}

sub _date_dwim {
  my (@in) = @_;

  my @dates;
  while(@in) {
    my $date = shift(@in);

    if($date eq 'thru') {
      push(@dates, date(pop(@dates))->thru(date(shift(@in))));
      next;
    }



( run in 6.055 seconds using v1.01-cache-2.11-cpan-cdf2f3d4e48 )