ApacheLog-Parser

 view release on metacpan or  search on metacpan

bin/loghack  view on Meta::CPAN


    my $checksum = checksum($fh, 50);
    my $checkfile = "$outpath.loaded/$checksum";
    my $linecount = 0;
    my $ch;
    if(-e "$outpath/.loaded/$checksum") {
      warn "assume $nicename is done\n";
      {local $SIG{CHLD}; close($fh);} # stupid macs
      next;
      # TODO fast-forward support
      # $linecount = $old_linecount; and etc
    }
    else {
      record_source($opt, $file, "$outpath.sources/", $checksum);

      # TODO this could stand to be more atomic
      { # record results
        want_dir("$outpath.loaded");
        my $tag = ($ENV{HOSTNAME} || '') . '.' . $$;
        open($ch, '>', "$checkfile.$tag") or
          die "cannot write '$checkfile.$tag' $!";
        # TODO chmod
        rename("$checkfile.$tag", $checkfile) or
          die "cannot make $checkfile $!";
      }

      # TODO a replayable pipe would be nice
      {local $SIG{CHLD}; close($fh);} # stupid macs
      $fh = open_file($file);
    }

    $opt->{quiet} or print "$nicename -- ",
      sprintf("%02d:%02d:%02d", (localtime)[2,1,0]), "\n";

    my %outhandles;
    my $sw;
    my $out;

    my $chunk = 2;
    my $next_chunk = sub {
      my ($date, $hour, $tz) = @_;

      # might have already started that chunk
      if(my $handles = $outhandles{"$date$hour$tz"}) {
        #warn "back to $date:$hour$tz\n";
        ($out, $sw) = @$handles;
        return;
      }

      # TODO include timezone in this calc
      my $datestring = get_datestring($date);

      # make the tz three digits
      (my $tzout = $tz) =~ s/00$//;
      $tzout = '+' . $tzout if(length($tzout) == 2);

      my $outfile = $outpath . $datestring . ".$hour$tzout.$chunk.tsv.gz";
      push(@loaded, $outfile);
      #warn "writing $outfile\n";
      if(-e $outfile) {
        # XXX how to decide whether to skip completely?
        die "already have $outfile\n";
      }
      $chunk = 1; # from now on
      if($skipper) { # TODO how to reset skipcount?
        my $skipfile = skipfilename($opt, $outfile);
        $sw = $skipper->new_writer($skipfile);
      }
      $out = pipe_out($outfile);
      print $ch File::Basename::basename($outfile), "\n";
      $outhandles{"$date$hour$tz"} = [$out, $sw];
    };

    my $cdate = '';
    my %lc;
    while(my $line = <$fh>) {
      $linecount++;
      chomp($line);
      my $v = parse_line($line);

      # check date/time
      my ($d, $h, $rest) = split(/:/, $v->[dtime], 3);
      my ($tz) = ($rest =~ m/ ([-+]?\d+)/);
      if("$d$h$tz" ne $cdate) {
        $next_chunk->($d, $h, $tz);
        $cdate = "$d$h$tz";
        $lc{$cdate} ||= 0;
        #warn "$d $h $tz\n";
      }
      my $lnum = ++$lc{$cdate};

      # create skiplist
      if($doskip->($v)) {$sw->skip($lnum);}

      print $out join("\t", @$v), "\n";
    }

    print $ch "$linecount\n";
    close($ch) or die "write '$checkfile' failed $!";
    # TODO race checks/chmod

  }
  wait(); # XXX need this?
  return(@loaded);
}

=for doc ###############################################################
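Examine the given */.loaded files and verify that each one has a
linecount (the linecount is written last, so its presence means the load
finished.)  An "NC" line with the number of parts is printed for each
file which fails the check.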
  loghack check */.loaded/*

=cut

sub do_check {
  my ($opt, @files) = @_;

  # Run the check on every named file and report only the ones for which
  # run_check() hands back something true.  That value is dereferenced as
  # an array, and its element count is reported as the number of "parts".
  # NOTE(review): "NC" presumably stands for "not complete" -- confirm.
  for my $loaded (@files) {
    my $problems = run_check($loaded);
    next unless $problems;

    my $nparts = @$problems;
    print "NC $loaded (", $nparts, " parts)\n";
  }
}



( run in 1.421 second using v1.01-cache-2.11-cpan-39bf76dae61 )