ApacheLog-Parser

 view release on metacpan or  search on metacpan

bin/loghack  view on Meta::CPAN

  }
}

# Open a write handle whose output is compressed into $file.
#
# The compressor is chosen from the file extension (.gz => gzip,
# .bz2 => bzip2).  A child process is forked with its STDIN connected to
# the returned handle; the child points its STDOUT at $file and execs
# the compressor with '-c'.
#
# Returns the write handle (parent side of the pipe).
# Dies on an unknown extension, on fork failure, or (in the child) if
# $file cannot be opened or the compressor cannot be exec'd.
sub pipe_out {
  my ($file) = @_;
  $file =~ m/\.(gz|bz2)$/ or die "unknown extension on $file";
  my $ext = $1;
  my %prog = (
    gz  => 'gzip',
    bz2 => 'bzip2',
  );
  my $prog = $prog{$ext} or die "cannot write $ext files";

  my $pid = open(my $fh, '|-');
  # open() yields undef when the fork fails; without this check the
  # parent would fall into the child branch below and exec itself away.
  defined($pid) or die "cannot fork for $prog $!";
  unless($pid) {
    # child: clear any CHLD handler for the duration of this block
    local $SIG{CHLD};
    open(STDOUT, '>', $file) or die "cannot write '$file' $!";
    exec($prog, '-c') or die "ack $!";
  }
  #warn "launch $prog > $file on $pid\n";
  return($fh);
}

# Program entry point: parse options, pick a mode, and dispatch.
#
# @args is the command line.  Options are parsed with Getopt::Helpful
# into %o; the first remaining argument is taken as the mode name and
# the rest are passed on to the mode handler.
#
# Dispatch order:
#   * --daemon    => daemon(\%o, @args), then exit
#   * --cluster   => cluster(\%o, $mode, @args)
#   * otherwise   => do_<mode>(\%o, @args)
#
# Dies with a usage message if the mode is missing or unknown, or if no
# do_<mode> sub exists in this package.
sub main {
  my (@args) = @_;

  my %o = (
    archive    => '',
    repository => '',
    missok     => 0,
    daemon     => '',
    cluster    => '',
    skip       => 1,
    quiet      => 0,
  );
  my $hopt = Getopt::Helpful->new(
    usage => 'CALLER <mode> [options] <arguments>',
    ['a|archive=s', \$o{archive}, '<dir>', 'archive dir'],
    ['r|repository=s', \$o{repository}, '<dir>', 'repository dir'],
    ['missok', \$o{missok}, '', 'skip missing files'],
    ['d|daemon=s', \$o{daemon}, '<dir>', 'daemon mode - needs chdir'],
    ['c|cluster=s', \$o{cluster}, '<hosts>', 'cluster mode'],
    ['s|skip!', \$o{skip}, '', 'use skipper (default yes)'],
    ['q|quiet', \$o{quiet}, '', 'suppress status'],
    '+help',
  );
  # NOTE(review): presumably strips the recognized options out of @args
  # in place, leaving the mode and its arguments -- confirm against
  # Getopt::Helpful.
  $hopt->Get_from(\@args);

  # default the repository to the current directory when it has a .config
  if(not $o{repository}) {
    $o{repository} = '.' if(-e '.config');
  }

  if($o{daemon}) {
    daemon(\%o, @args);
    exit;
  }

  my %modes = map({$_ => 1} qw(
    makelinks
    import
    prep check sweep verify confirm list
    unique day_unique month_unique month_unique2
    compile
    aggregate report date dump tabulate count reskip
  ));
  my $mode = shift(@args);
  # guard with defined() so a missing mode gives the usage message
  # without first tripping an uninitialized-value warning
  (defined($mode) and $modes{$mode}) or
    die "USAGE: mode must be one of ",
      join(", ", sort(keys(%modes))), "\n";

  # TODO deal with the do_ stuff
  if($o{cluster}) {
    cluster(\%o, $mode, @args);
  }
  else {
    my $run = __PACKAGE__->can('do_' . $mode) or
      die "cannot find method 'do_$mode'";
    $run->(\%o, @args);
  }
}

# Reduce a log file name to the YYYY-MM-DD stamp embedded in it.
#
# The date must be followed by a '.'; an optional dotted prefix in front
# of it and everything after it are discarded.  Croaks with
# "weird name -- ..." when no such date is present.
sub name_as_date {
  my ($n) = @_;

  # swap "<prefix.>DATE.<rest>" for just DATE
  unless($n =~ s{(?:.*\.)?(\d{4}-\d{2}-\d{2})\..*}{$1}) {
    croak("weird name -- $n");
  }

  # drop whatever directory part is still in front
  $n =~ s{.*/}{};

  return($n);
}
# Build a short display pattern for a log file path: "*.<date>.*",
# prefixed with the immediate parent directory when the path has one.
# The date comes from name_as_date() applied to the last path component.
sub nice_name {
  my ($name) = @_;

  my @parts = split(m{/+}, $name);
  my $pattern = '*.' . name_as_date(pop(@parts)) . '.*';

  # no directory component left: the pattern alone is the answer
  return($pattern) unless(@parts);

  return(join("/", $parts[-1], $pattern));
}
# Record which source file produced a given checksum.
#
# Ensures $dir exists, then writes the basename of $file into the file
# named $dir . $md5 (plain concatenation, no separator is added).  If
# that file already exists, warns and returns without touching it.
# $opt is accepted but not used here.
sub record_source {
  my ($opt, $file, $dir, $md5) = @_;

  my $writefile = $dir.$md5;

  want_dir($dir);

  # already recorded: leave the existing entry alone
  (-e $writefile) and do {
    warn "skipping $writefile ($file)\n";
    return;
  };

  open(my $fh, '>', $writefile) or die "cannot write '$writefile' $!";
  print {$fh} File::Basename::basename($file), "\n";
  # buffered write errors only show up at close time
  close($fh) or die "cannot write '$writefile' $!";
}
# Make sure $dir exists as a directory, creating it if needed.
# Only the last path component is created (plain mkdir).  Dies with
# "cannot create ..." if the directory is still absent afterwards.
sub want_dir {
  my ($dir) = @_;

  return if(-d $dir);

  # A failed mkdir is only fatal if the directory is still missing
  # (it may have appeared between the check above and the mkdir).
  mkdir($dir) or -d $dir or die "cannot create $dir $!";
}

sub daemon {
  my ($opt, @args) = @_;

bin/loghack  view on Meta::CPAN

      }

      # TODO include timezone in this calc
      my $datestring = get_datestring($date);

      # make the tz three digits
      (my $tzout = $tz) =~ s/00$//;
      $tzout = '+' . $tzout if(length($tzout) == 2);

      my $outfile = $outpath . $datestring . ".$hour$tzout.$chunk.tsv.gz";
      push(@loaded, $outfile);
      #warn "writing $outfile\n";
      if(-e $outfile) {
        # XXX how to decide whether to skip completely?
        die "already have $outfile\n";
      }
      $chunk = 1; # from now on
      if($skipper) { # TODO how to reset skipcount?
        my $skipfile = skipfilename($opt, $outfile);
        $sw = $skipper->new_writer($skipfile);
      }
      $out = pipe_out($outfile);
      print $ch File::Basename::basename($outfile), "\n";
      $outhandles{"$date$hour$tz"} = [$out, $sw];
    };

    my $cdate = '';
    my %lc;
    while(my $line = <$fh>) {
      $linecount++;
      chomp($line);
      my $v = parse_line($line);

      # check date/time
      my ($d, $h, $rest) = split(/:/, $v->[dtime], 3);
      my ($tz) = ($rest =~ m/ ([-+]?\d+)/);
      if("$d$h$tz" ne $cdate) {
        $next_chunk->($d, $h, $tz);
        $cdate = "$d$h$tz";
        $lc{$cdate} ||= 0;
        #warn "$d $h $tz\n";
      }
      my $lnum = ++$lc{$cdate};

      # create skiplist
      if($doskip->($v)) {$sw->skip($lnum);}

      print $out join("\t", @$v), "\n";
    }

    print $ch "$linecount\n";
    close($ch) or die "write '$checkfile' failed $!";
    # TODO race checks/chmod

  }
  wait(); # XXX need this?
  return(@loaded);
}

=for doc ###############################################################
Examine the */.loaded files and verify that each one ends with a
linecount (i.e. it finished loading).
  loghack check */.loaded/*

=cut

# 'check' mode: report every check file that run_check() flags as
# incomplete, along with the number of parts it lists.
sub do_check {
  my ($opt, @files) = @_;

  for my $file (@files) {
    my $err = run_check($file);
    next unless($err);

    my $count = @$err;
    print "NC $file (", $count, " parts)\n";
  }
}
# 'sweep' mode: like 'check', but also cleans up.  For each check file
# that run_check() flags as incomplete, print and delete every listed
# part that exists on disk, then delete the check file itself.
# Dies if any unlink fails.
sub do_sweep {
  my ($opt, @files) = @_;

  for my $file (@files) {
    my $err = run_check($file);
    next unless($err);

    my $count = @$err;
    print "NC $file (", $count, " parts)\n";

    for my $part (@$err) {
      print "  $part\n";
      # a listed part may never have been written
      next unless(-e $part);
      unlink($part) or die "cannot unlink('$part') $!";
    }

    unlink($file) or die "cannot unlink('$file') $!";
  }
}
# Decide whether a check file describes a finished load.
#
# Returns nothing (empty list / undef) when the last line of $checkfile
# is a true, all-digit value.  Otherwise returns an array ref holding
# every line of the file prefixed with the grandparent directory of
# $checkfile.  Dies if $checkfile is a directory or cannot be read.
#
# NOTE(review): because the last line must also be true, a final line of
# "0" is not accepted as a linecount -- confirm that is intended.
sub run_check {
  my ($checkfile) = @_;

  (-d $checkfile) and die "'$checkfile' is a directory";
  open(my $fh, '<', $checkfile) or die "cannot read '$checkfile' $!";
  chomp(my @list = <$fh>);

  if(@list) {
    my $last = $list[-1];
    return() if($last and $last =~ m/^\d+$/);
  }

  # listed names are relative to the directory two levels up
  my $dir = File::Basename::dirname(File::Basename::dirname($checkfile));
  my @parts;
  foreach my $name (@list) {
    push(@parts, "$dir/$name");
  }
  return(\@parts);
}

# Expand a list of date arguments, honoring the 'thru' keyword.
#
# Plain arguments are passed through unchanged.  "A thru B" replaces A
# with whatever date(A)->thru(date(B)) returns.
# NOTE(review): presumably that is the inclusive run of dates from A to
# B -- confirm against the date() implementation.
#
# Returns the resulting list.  Dies if 'thru' appears without a date on
# both sides.
sub _date_dwim {
  my (@in) = @_;

  my @dates;
  while(@in) {
    my $date = shift(@in);

    if($date eq 'thru') {
      # need a start (already collected) and an end (still pending);
      # otherwise date() would be handed undef
      (@dates and @in) or
        die "'thru' needs a date on both sides";
      push(@dates, date(pop(@dates))->thru(date(shift(@in))));
      next;
    }
    push(@dates, $date);
  }
  return(@dates);
}

=for doc ###############################################################
Given a date range, verify that all files + hours for that server are
done (with the exception of those listed in the .MIA file.)

=cut

# 'verify' mode: for every directory in the current working directory
# and every requested date, print "<dir>/<date> <count>" where <count>
# is the number of entries matching "<dir>/<date>*".
# Dies if the arguments expand to no dates.
sub do_verify {
  my ($opt, @in) = @_;

  my @dates = _date_dwim(@in);
  @dates or die "you gave no dates";

  for my $dir (glob('*')) {
    next unless(-d $dir);
    for my $date (@dates) {
      my @got = glob("$dir/$date*");
      my $count = @got;
      print "$dir/$date ", $count, "\n";
    }
  }
}

=for doc ###############################################################
Make sure that all files are claimed somewhere.  This is useful when a
load-in crashed.

  loghack confirm *

=cut

# 'confirm' mode: print every non-dot entry of each given directory
# that is not named in any check file under <dir>/.loaded.
#
# Each check file is read as a list of names; a trailing all-digit line
# (the linecount) is dropped.  A directory without a .loaded
# subdirectory has nothing claimed, so all of its entries are printed.
# Dies if a directory or check file cannot be opened.
sub do_confirm {
  my ($opt, @dirs) = @_;

  foreach my $dir (@dirs) {
    # gather every name claimed by the check files
    my %loaded;
    my $s_dir = $dir . '/.loaded';
    if(-d $s_dir) {
      opendir(my $sdh, $s_dir) or die "cannot opendir '$s_dir' $!";
      foreach my $name (grep({$_ !~ m/^\./} readdir($sdh))) {
        my $file = "$s_dir/$name";
        open(my $fh, '<', $file) or die "cannot read '$file' $!";
        my @list = map({chomp; $_} <$fh>);
        # an empty check file has no last line to inspect
        pop(@list) if(@list and $list[-1] =~ m/^\d+$/);
        $loaded{$_} = 1 for(@list);
      }
    }

    # normalize to exactly one trailing slash for the output paths
    $dir =~ s#/*$#/#;
    opendir(my $dh, $dir) or die "cannot opendir '$dir' $!";
    foreach my $name (grep({$_ !~ m/^\./} readdir($dh))) {
      unless($loaded{$name}) {
        print "$dir$name\n";
      }
    }
  }
  # TODO exit with error?
}

=head2 list

List files in the repository.

  loghack list 2008-01-01 thru 2008-01-31 in *

=cut

sub do_list {



( run in 0.789 second using v1.01-cache-2.11-cpan-39bf76dae61 )