ApacheLog-Compressor

 view release on metacpan or  search on metacpan

lib/ApacheLog/Compressor.pm  view on Meta::CPAN

package ApacheLog::Compressor;
# ABSTRACT: Convert Apache/CLF data to binary format
use strict;
use warnings;

use Socket qw(inet_aton inet_ntoa);
use Date::Parse qw(str2time);
use List::Util qw(min);
use URI;
use URI::Escape qw(uri_unescape);
use DateTime;
use Encode qw(encode_utf8 decode_utf8 FB_DEFAULT is_utf8 FB_CROAK);
use POSIX qw{strftime};

our $VERSION = '0.005';

=head1 NAME

ApacheLog::Compressor - convert Apache / CLF log files into a binary format for transfer

=head1 VERSION

version 0.005

=head1 SYNOPSIS

 use ApacheLog::Compressor;
 use Sys::Hostname qw(hostname);

 # Write all data to bzip2-compressed output file
 open my $out_fh, '>', 'compressed.log.bz2' or die "Failed to create output file: $!";
 binmode $out_fh;
 my $zip = IO::Compress::Bzip2->new($out_fh, BlockSize100K => 9);

 # Provide a callback to send data through to the file
 my $alc = ApacheLog::Compressor->new(
	on_write	=> sub {
		my ($self, $pkt) = @_;
		$zip->write($pkt);
	}
 );

 # Input file - normally use whichever one's just been closed + rotated
 open my $fh, '<', '/var/log/apache2/access.log.1' or die "Failed to open log: $!";

 # Initial packet to identify which server this came from
 $alc->send_packet('server',
 	hostname	=> hostname(),
 );

 # Read and compress all the lines in the files
 while(my $line = <$fh>) {
	 $alc->compress($line);
 }
 close $fh or die $!;
 $zip->close;

 # Dump the stats in case anyone finds them useful
 $alc->stats;

=head1 DESCRIPTION

Converts data from standard Apache log format into a binary stream which is typically 20% - 60% the size of the original file.
Intended for cases where log data needs transferring from multiple high-volume servers for analysis (potentially in realtime
via tail -f).

The log format is a simple dictionary replacement algorithm: each field that cannot be represented in a fixed-width datatype
is replaced with an indexed value, allowing the basic log line packet to be fixed size with additional packets containing the
first instance of each variable-width data item.

lib/ApacheLog/Compressor.pm  view on Meta::CPAN


=head2 default_format

Returns the default format used for parsing log lines.

This is an arrayref containing key => value pairs, see L</FORMAT SPECIFICATION> for
more details.

=cut

sub default_format {
	my $self = shift;
	return [
		type		=> { type => 'C1' },
		vhost		=> { id => 0x03, type => 'n1', regex => qr{([^ ]+)} },
		duration	=> { type => 'N1', regex => qr{(\d+)} },
		ip		=> {
			type => 'N1',
			regex => qr{(\S+)\s+\S+},
			process_in => sub {
				my ($self, $data) = @_;
				$data->{ip} = unpack('N1', inet_aton($data->{ip}));
			},
			process_out => sub {
				my ($self, $data) = @_;
				$data->{ip} = inet_ntoa(pack('N1', $data->{ip}));
			}
		},
		user		=> { id => 0x04, type => 'n1', regex => qr{(\S+)} },
		timestamp	=> {
			id => 0x02,
			regex => qr{\[([^\]]+)\]},
			process_in => sub {
				my ($self, $data) = @_;
				$data->{timestamp} = str2time($data->{timestamp});
			}
		},
		method		=> {
			type => 'C1',
			regex => qr{"([^ ]+)},
			process_in => sub {
				my ($self, $data) = @_;
				$data->{method} = $HTTP_METHOD{$data->{method}};
			},
			process_out => sub {
				my ($self, $data) = @_;
				$data->{method} = $HTTP_METHOD_LIST[$data->{method}];
			}
		},
		url		=> {
			id => 0x07,
			type => 'N1',
			regex => qr{([^ ]+)},
			process_in => sub {
				my ($self, $data) = @_;
				return $data->{url} = '' unless defined $data->{url};

				($data->{url}, $data->{query}) = split /\?/, $data->{url}, 2;
# Dodgy UTF8 handling, currently disabled - no guarantee that URLs are UTF8 anyway
#				if(length $data->{url}) {
				# URI::Escape's uri_unescape but in byte mode so we can check utf8 decoding manually
#					my $txt = $data->{url};
#					$txt = encode_utf8($txt); # turn OFF utf8
#					$txt =~ s/%([0-9A-Fa-f]{2})/pack("C1", hex($1))/ge; # expand
#					$txt = decode_utf8($txt); # turn ON utf8 where applicable
#					$data->{url} = $txt;
#				}
#				if(defined $data->{query} && length $data->{query}) {
				# URI::Escape's uri_unescape but in byte mode so we can check utf8 decoding manually
#					(my $txt = $data->{query}) =~ s/%([0-9A-Fa-f]{2})/pack("C1", hex($1))/eg;
#					$data->{query} = decode_utf8($txt, FB_DEFAULT);
#				}
			}
		},
		query		=> { id => 0x0A, type => 'N1', },
		ver		=> {
			type => 'C1',
			regex => qr{HTTP/(\d+\.\d+)"},
			process_in => sub {
				my ($self, $data) = @_;
				$data->{ver} = ($data->{ver} eq '1.0' ? 0 : 1);
			}, process_out => sub {
				my ($self, $data) = @_;
				$data->{ver} = ($data->{ver} ? '1.1' : '1.0');
			}
		},
		result		=> { type => 'n1', regex => qr{(\d+)} },
		size		=> {
			type => 'N1',
			regex => qr{(\d+|-)},
			process_in => sub {
				my ($self, $data) = @_;
				$data->{size} = ($data->{size} eq '-') ? -1 : $data->{size};
			}, process_out => sub {
				my ($self, $data) = @_;
				$data->{size} = ($data->{size} == 4294967295) ? '-' : $data->{size};
			}
		},
		refer		=> { id => 0x06, type => 'n1', regex => qr{"([^"]*)"} },
		useragent	=> { id => 0x05, type => 'n1', regex => qr{"([^"]*)"} },
	];
}

=head2 update_mapping

Refresh the mapping from format keys and internal definitions.

=cut

sub update_mapping {
	my $self = shift;
	my %fmt = @{ $self->{format} };
	$self->{format_hash} = \%fmt;
	$self->{packet_handler} = {
		0x00 => 'log',
		0x01 => 'server',
		0x80 => 'reset',
		map { $fmt{$_}->{id} => $_ } grep { exists $fmt{$_}->{id} } keys %fmt
	};

# Extract information from format strings so that we know how big the packets are and where the data goes
	my @fmt = @{$self->{format}};
	my $pack_str = '';
	my $log_len = 0;
	my @format_keys;
	my @regex;
	ITEM:
	while(@fmt) {
		my $k = shift(@fmt);



( run in 1.861 second using v1.01-cache-2.11-cpan-39bf76dae61 )