ApacheLog-Parser
view release on metacpan or search on metacpan
bin/loghack view on Meta::CPAN
}
}
=head2 reskip
Regenerate the skiplist for a given chunk.
=cut
# Rebuild the skiplist for each of @files (as resolved by repo_files()).
# Each line of a file is split on tabs and handed to the skipper's
# matcher; for every line the matcher accepts, the line number is passed
# to skip() on a writer opened for that file's skipfile.  Dies if a
# resolved file does not exist.
sub do_reskip {
  my ($opt, @files) = @_;

  @files = repo_files($opt, @files);

  my $skipper = get_skipper($opt);
  my $doskip  = $skipper->get_matcher;

  for my $file (@files) {
    (-e $file) or die "no such file:\n $file\n";

    my $fh       = open_file($file);
    my $nicename = nice_name($file);

    # progress line: the file's nice name plus the time it was started
    my ($hh, $mm, $ss) = (localtime(time))[2,1,0];
    my $stamp = sprintf("%02d:%02d:%02d", $hh, $mm, $ss);
    print "$nicename -- ", $stamp, "\n";

    my $sw = $skipper->new_writer(skipfilename($opt, $file));

    my $line_no = 0;
    while(my $line = <$fh>) {
      $line_no++;

      # without a matcher there is nothing to record for this line
      next unless($doskip);

      chomp($line);
      my @fields = split(/\t/, $line);
      $sw->skip($line_no) if($doskip->(\@fields));
    }
  }
}
=begin notes
The files are split per-hour. Time zone adjustments are going to be an
issue. There's also a potential race condition between two nodes, so
the outputs will always have a ".$chunk" appended to them. The value of
$part is either 0 or 1 (and only switched to 1 at the start of the file.)
And another issue: delay. The request init time is what's shown, but
it doesn't get logged until the request completes. So a 10min request
will not appear until 10min later. If there are any large downloads,
they could possibly even span a couple of logrotates.
This also means that tomorrow or the next day could conceivably hold a
bit of data from a big download that started 24+ hours ago. In
practice, logrotate is actually just disposing of this data when it runs
gzip. That is, a request always goes in the logfile that was open when
the apache process spawned?
Still need to figure out the cleanup pass. Add the skiplists together
(and/or rename them), figure out where to tie-off the last item, etc.
Probably need some tracking of sources and/or chunks. Chunks can
probably be treated as closed until further notice as long as a
chunkcount file is maintained somewhere.
=end notes
=begin tznotes
Probably going to just leave the date string unprocessed (but we will
definitely slot it into files according to the adjusted zone.) Of
course, the date+hour+tz is used to memoize the outgoing date, so taking
the localtime and chunking that back together with the minutes+seconds
wouldn't be a big deal. We will need to address the dst issue though.
=end tznotes
=cut
=head2 prep
Parse a raw logfile and split it into hourly chunks.
loghack prep servername/logfile.gz
=cut
sub do_prep {
my ($opt, @files) = @_;
my $repo = $opt->{repository} or
die "must have repository setting for prep()\n";
my $doskip;
my $skipper;
if(-e (my $skipconf = "$repo/.config/skips.conf")) {
my ($skip) = YAML::LoadFile($skipconf);
$skipper = ApacheLog::Parser::SkipList->new();
$skipper->set_config($skip);
$doskip = $skipper->get_matcher;
}
my @loaded;
foreach my $file (@files) {
unless(-e $file) {
my $msg = "no such file:\n $file\n";
if($opt->{missok}) { warn $msg; next };
die $msg;
}
my $outpath = repository_path($opt, $file);
my $fh = open_file($file);
my $nicename = nice_name($file);
my $checksum = checksum($fh, 50);
my $checkfile = "$outpath.loaded/$checksum";
my $linecount = 0;
my $ch;
if(-e "$outpath/.loaded/$checksum") {
warn "assume $nicename is done\n";
{local $SIG{CHLD}; close($fh);} # stupid macs
next;
# TODO fast-forward support
# $linecount = $old_linecount; and etc
}
else {
record_source($opt, $file, "$outpath.sources/", $checksum);
# TODO this could stand to be more atomic
{ # record results
want_dir("$outpath.loaded");
my $tag = ($ENV{HOSTNAME} || '') . '.' . $$;
open($ch, '>', "$checkfile.$tag") or
die "cannot write '$checkfile.$tag' $!";
# TODO chmod
rename("$checkfile.$tag", $checkfile) or
die "cannot make $checkfile $!";
}
# TODO a replayable pipe would be nice
{local $SIG{CHLD}; close($fh);} # stupid macs
$fh = open_file($file);
}
$opt->{quiet} or print "$nicename -- ",
sprintf("%02d:%02d:%02d", (localtime)[2,1,0]), "\n";
my %outhandles;
my $sw;
my $out;
my $chunk = 2;
my $next_chunk = sub {
my ($date, $hour, $tz) = @_;
# might have already started that chunk
if(my $handles = $outhandles{"$date$hour$tz"}) {
#warn "back to $date:$hour$tz\n";
($out, $sw) = @$handles;
return;
}
# TODO include timezone in this calc
my $datestring = get_datestring($date);
# make the tz three digits
(my $tzout = $tz) =~ s/00$//;
$tzout = '+' . $tzout if(length($tzout) == 2);
my $outfile = $outpath . $datestring . ".$hour$tzout.$chunk.tsv.gz";
push(@loaded, $outfile);
#warn "writing $outfile\n";
if(-e $outfile) {
# XXX how to decide whether to skip completely?
die "already have $outfile\n";
}
$chunk = 1; # from now on
if($skipper) { # TODO how to reset skipcount?
my $skipfile = skipfilename($opt, $outfile);
$sw = $skipper->new_writer($skipfile);
}
$out = pipe_out($outfile);
print $ch File::Basename::basename($outfile), "\n";
$outhandles{"$date$hour$tz"} = [$out, $sw];
};
my $cdate = '';
my %lc;
while(my $line = <$fh>) {
$linecount++;
chomp($line);
my $v = parse_line($line);
# check date/time
my ($d, $h, $rest) = split(/:/, $v->[dtime], 3);
my ($tz) = ($rest =~ m/ ([-+]?\d+)/);
( run in 1.530 second using v1.01-cache-2.11-cpan-39bf76dae61 )