Config-Checker
view release on metacpan or search on metacpan
t/checker.t view on Meta::CPAN
state_variables: '*path to where to keep internal state information in YAML files[PATH]'
parameters: %additional global configuration parameters
sources:
-
name: =name of this source -- will be used later[TEXT]
hosts: '*name of hosts where the data can be found[HOSTNAME]'
path: filesystem path name where the data can be found[PATH]
valid_from: '?date from which this source is valid[DATE]{=parsedate($_[0]) || die "invalid date: $_[0]"}'
valid_to: '?date until which this source is valid[DATE]{=parsedate($_[0]) || die "invalid date: $_[0]"}'
format: 'record format for this source{valid_parser($_[0]) or $error = "invalid parser: <$_[0]> at $context"}'
remove_after: '?data expiration policy{parse_expiration_policy($_[0])}'
jobs:
-
name: =name of this step[WORD]
DISABLED: '?(=0)1 or 0'
source: '+name of input the data[TEXT]'
destination: name of the output data[TEXT]
hosts: '*where to do the work and store the output[HOSTNAME]'
path: where to write the output data[PATH]
valid_from: '?the first day to do this job this way[DATE]{=parsedate($_[0]) || die "invalid date: $_[0]"}'
valid_to: '?the last day to do this job this way[DATE]{=parsedate($_[0]) || die "invalid date: $_[0]"}'
filter: '?perl expression: to apply to choose input[CODE]'
group_by: '?perl expression: re-group input (expand or contract)[CODE]'
sort_by: '*list of job fields to sort by[WORD]'
bucketizer: '?perl expression: returns data to choose bucket[CODE]'
buckets: '?(=1)the total number of output buckets[INTEGER]'
remove_after: '?data expiration policy{parse_expiration_policy($_[0])}'
output_format: 'name of perl module to handle $object -> ascii{valid_parser($_[0]) or $error = "invalid parser: <$_[0]> at $context"}'
config: '%additional configuration parameters'
frequency: '?how often to generate the data, eg "monthly"[FREQUENCY]'
timespan: '?how much data from previous steps to include[TIMESPAN]'
hostsinfo:
hostname[HOSTNAME]:
max_threads: '?(=4)maximum number of processes to run at once[INTEGER]'
max_memory: '?(=5G)maximum amount of memory to use at once[SIZE]'
temporary_storage: '?(=/tmp)Where to keep tmp files[PATH]'
datadir: '?Where relative paths start on this system[PATH]'
END_PROTOTYPE
# Sample configuration for the test: the YAML text in the here-document below
# is handed to Load() and the result is stored in $good_config.  The heredoc
# is single-quoted (<<'END_CONFIG'), so Perl does not interpolate the
# $log->{...} expressions or the %NAME%-style placeholders; they reach Load()
# verbatim.
#
# Top-level keys present in the document: parameters, hostsinfo, master_node,
# headers, metadata, sources, and jobs.
#
# NOTE(review): judging by the variable name this is presumably the config
# that is expected to pass validation -- confirm against the checks that
# follow this statement.
# NOTE(review): the second "client logs" block under sources is not preceded
# by a "-" item marker, and its path uses "%hour%=\d+%" where the first
# source uses "%hour=\d+%" -- confirm whether both are intentional.
# NOTE(review): the "rejoin" job lists "raw weblogs" as a source, but no
# source with that name is defined under sources -- confirm this is expected.
my $good_config = Load(<<'END_CONFIG');
---
parameters:
ignore_IPs:
- 10/8
- 192.168/16
hostsinfo:
www.google.com:
max_threads: 2
max_memory: 1G
temporary_storage: /tmp
datadir: /data1/delogs
master_node: www.yahoo.com
headers: /data2/david/logtmp/headers/%NAME%
metadata: /data2/david/logtmp/metadata/%YYYY%.%MM%.%DD%.%JOBNAME%
sources:
-
# ----------------------------------------------------------------
name: client logs
hosts:
- ds16-r50
path: /data1/delogs/backup/%YYYY%/%MM%/%DD%/clientlog_%loghost=\w+-r\d+%.%YYYY%-%M%-%D%-%hour=\d+%-%=\d+%
remove_after: 90 days
format: JSON
valid_from: 2008-03-03
valid_to: 2008-09-30
# ----------------------------------------------------------------
name: client logs
hosts:
- www.facebook.com
- www.linkedin.com
- www.myspace.com
- www.microsoft.com
- www.oracle.com
path: /data1/delogs/clientlog_%loghost=\w+-r\d+%.%YYYY%-%M%-%D%-%hour%=\d+%-%=\d+%
remove_after: 90 days
format: JSON
valid_from: 2008-10-01
valid_to: now
jobs:
-
# ----------------------------------------------------------------
#
# Rejoin user sessions together
#
name: rejoin
source:
- raw weblogs
- client logs
destination: rejoined sessions
#
# no toolbar logs in this stream
#
filter: $log->{type} ne 'toolbar'
hosts:
- www.sun.com
- www.ibm.com
path: %DATADIR%/bysession/%YYYY%/%MM%/%DD%/sessions.%BUCKET%.dirty
buckets: 16
bucketizer: $log->{user_id} || $log->{machine_id} || $log->{session_id}
valid_from: 2008-03-03
valid_to: yesterday
sort_by: $log->{user_id} || $log->{machine_id} || $log->{session_id}, $log->{timestamp}
output_format: Unified
frequency: daily
-
# ----------------------------------------------------------------
#
# Filter the sessions to remove internal hits, and bots.
# Save as sessions.
#
name: filter
source: rejoined sessions
destination: cleaned sessions
group_by: session_grouper($log)
filter: clean_logs($log)
path: %DATADIR%/bysession/%YYYY%/%MM%/%DD%/sessions.%BUCKET%.clean
valid_from: 2008-03-03
valid_to: yesterday
output_format: Sessions
frequency: daily
END_CONFIG
( run in 1.075 second using v1.01-cache-2.11-cpan-df04353d9ac )