Bio-Grid-Run-SGE
view release on metacpan or search on metacpan
bin/distribution view on Meta::CPAN
readPretalliedTokens();
} else {
readLinesBuildHash();
}
# see if there was input
checkValuesObjects();
my $totalKeys;
my @sortedKeys;
# if we're just graphing a bunch of numbers, no need to sort
# the values dict
if ($numOnly) {
# here we sort on KEYS not VALUES like other cases
@sortedKeys = sortKeysByKey();
# we graph everything we're given - throw away nothing!
$height = $totalObjects;
} else {
# build the sorted dict
@sortedKeys = sortKeysByValueFrequency();
$maxVal = $valuesDict->{$sortedKeys[0]};
}
# if there aren't height # of distinct values, use less
$totalKeys = scalar @sortedKeys;
if ($totalKeys < $height) { $height = $totalKeys; }
# for logarithmic graphs
my $maxLog = 0;
if ($logarithmic) {
$maxLog = log($maxVal);
}
my $i;
my $j;
my $keyText;
my $ctText;
my $pctText;
my $preBarLen;
my $barWidth;
my $maxPreBarLen = 0;
my $maxKeyLen = 0;
# here is the complex part - read it carefully
for ($i = 0; $i < $height; $i++) {
# print the i'th most-common key
$keyText->[$i] = $sortedKeys[$i];
# how many times this key occurred in the input
my $count = $valuesDict->{$sortedKeys[$i]};
# determine the bar width based on key occurence
if ($logarithmic) {
$barWidth->[$i] = log($count) / $maxLog;
} else {
$barWidth->[$i] = $count / $maxVal;
}
# determine the percent of key frequency
my $percentile = $count / $totalValues * 100;
# graph axis labels, really
$ctText->[$i] = sprintf ("%d", $count);
$pctText->[$i] = $count > 0 ? sprintf ("(%3.02f%%)", $percentile) : '';
$preBarLen = length ($ctText->[$i]) + length ($pctText->[$i]);
# determine the longest key name and longest count/percent text for
# aligning the output
if ($preBarLen > $maxPreBarLen) { $maxPreBarLen = $preBarLen; }
if (length ($sortedKeys[$i]) > $maxKeyLen) { $maxKeyLen = length ($sortedKeys[$i]); }
}
my $endTime = time() * 1000;
my $totalMillis = sprintf ("%.2f", ($endTime - $startTime));
if ($verbose) {
print STDERR "tokens/lines examined: $totalObjects\n";
print STDERR " tallied in histogram: $totalValues\n";
print STDERR " histogram entries: $totalKeys\n";
print STDERR " runtime: ${totalMillis}ms\n";
}
outputGraph();
exit 0;
# --------------------------------------------------------------------------- #
# subroutines
# --------------------------------------------------------------------------- #
# get the keys ordered - we'll only print the most common keys
sub sortKeysByValueFrequency {
my @sortedKeys = reverse sort { int($valuesDict->{$a}) <=> int($valuesDict->{$b}) } keys %{$valuesDict};
return @sortedKeys;
}
# get the keys ordered simply by key
sub sortKeysByKey {
my @sortedKeys = sort { int($a) <=> int($b) } keys %{$valuesDict};
return @sortedKeys;
}
# here we just pull in a stream of numerics and graph them, optionally graphing
# the difference between each pair of values
sub readNumerics {
# monotonically-increasing pairs, must have at least one value
my $lastVal = undef;
while ($inLine = <STDIN>) {
chomp ($inLine);
if ($numOnly =~ /^m/i) {
if (defined $lastVal) {
$graphVal= $inLine - $lastVal;
$totalValues += $graphVal;
$totalObjects++;
}
$lastVal = $inLine;
} else {
$graphVal= $inLine;
$totalValues += $inLine;
$totalObjects++;
}
if ($graphVal > $maxVal) { $maxVal = $graphVal; }
# we just build a list where the keys are line num and values
# are the value
bin/distribution view on Meta::CPAN
}
}
if ($verbose && time() > $nextStat) {
print STDERR " Objects Processed: $totalObjects..." . chr(13);
$nextStat = time() + $statInterval;
}
}
if($pseudoCnt) {
map { $valuesDict->{$_}-- } keys %$valuesDict;
$totalObjects -= scalar keys %$valuesDict;
$totalValues -= scalar keys %$valuesDict;
}
if ($verbose) { print STDERR " Objects Processed: $totalObjects \n"; }
}
# here is the case where we don't need to put the input into
# bins and tally - the data is pre-tallied for us
sub readPretalliedTokens {
while ($inLine = <STDIN>) {
chomp ($inLine);
if ($graphValues eq 'vk') {
if ($inLine =~ /^\s*(\d+)\s+(.+)$/) {
$valuesDict->{$2} = $1;
$totalValues += $1;
if ($1 > $maxVal) { $maxVal = $1; }
$totalObjects++;
} else {
print STDERR " E Input line malformed and discarded: $inLine\n";
}
} elsif ($graphValues eq 'kv') {
if ($inLine =~ /^(.+?)\s+(\d+)$/) {
$valuesDict->{$1} = $2;
$totalValues += $2;
if ($2 > $maxVal) { $maxVal = $2; }
$totalObjects++;
} else {
print STDERR " E Input line malformed and discarded: $inLine\n";
}
}
}
}
# see if there was input
sub checkValuesObjects {
# the input may be empty. or we may have been too strict on the
# matching regexp passed in. either way, we were left with nothing.
if ($totalValues == 0) {
if ($totalObjects > 0) {
print STDERR "All input filtered! ";
} else {
print STDERR "No input! ";
}
print STDERR "No histogram for you.\n";
exit 255;
}
}
# the arrays, hashes, variables must be all-correct for this to
# work, TODO: list out which ones they are, convert to functional
# keyText->[] - list of the keys
# pctText->[] - list of the percents
# ctText->[] - list of the counts
# barWidth->[] - list of the widths of the bars
sub outputGraph {
# print a header with alignment from key names
print STDERR "min";
for ($j = 4; $j <= $maxKeyLen; $j++) { print STDERR " "; }
print STDERR "|Ct (Pct)";
for ($j = 7; $j <= $maxPreBarLen; $j++) { print STDERR " "; }
print STDERR "Histogram";
# get ready for the output, but sorting gets hosed if we print the
# colour code before the key, so put it on the line before
print STDERR "$keyColour\n";
# amount of other output reduces possible size of bar - alas
my $maxBarWidth = $width - $maxPreBarLen - $maxKeyLen - 4;
for ($i = 0; $i < $height; $i++) {
# first the key that we aggregated
print $keyText->[$i];
print $regularColour;
# alignment
for ($j = length ($keyText->[$i]); $j < $maxKeyLen; $j++) { print " "; }
# separater between keys and count/pct
print "|";
print $ctColour;
print $ctText->[$i] . " ";
print $pctColour;
print $pctText->[$i];
# spaces 'til time to print the bar
for ($j = length ($ctText->[$i]) + length ($pctText->[$i]); $j <= $maxPreBarLen; $j++) { print " "; }
print $graphColour;
for ($j = 0; $j < int ($barWidth->[$i] * $maxBarWidth); $j++) {
if ($charWidth < 1) {
# print out maximum-width character (always last in array)
print $graphChars->[scalar @$graphChars - 1];
} else {
# we're printing regular non-unicode characters
if (length ($histogramChar) > 1 && !$unicode) {
# but still we have >1 byte! so print initial byte
# for all but the last character (printed outside loop)
print substr ($histogramChar, 0, 1);
} else {
print $histogramChar;
}
}
}
my $count = $valuesDict->{$sortedKeys[$i]};
# print one too many bar characters so <1% gets a single bar character
if($count > 0) {
my $remainder = ($barWidth->[$i] * $maxBarWidth) - int ($barWidth->[$i] * $maxBarWidth);
my $whichChar = int ($remainder / $charWidth);
if ($charWidth < 1) {
# if we have partial-width characters, get higher resolution
if ($barWidth->[$i] * $maxBarWidth > $charWidth) {
# we have more than charWidth remainder, so print out a
# remainder portion
bin/distribution view on Meta::CPAN
if (length ($histogramChar) > 1 && !$unicode) {
# but still we have >1 byte! so print final byte of input string
print substr ($histogramChar, -1, 1);
} else {
print $histogramChar;
}
}
}
# FIXME: even with all these colour-printing antics, still one key will
# be printed with the wrong colour on sorted output most of the time,
# but i have no idea how to fix this other than to implement sorting of
# the output within the script itself.
if ($i == $height - 1) {
# put the terminal back into a normal-colour mode on last entry
print "$regularColour\n";
} else {
# we do these antics of printing $keyColour on the line before
# the key so that piping output to sort will work
print "$keyColour\n";
}
}
}
# usage
sub doArgs {
print "\n";
print "usage: <commandWithOutput> | $0\n";
print " [--rcfile=<rcFile>]\n";
print " [--size={sm|med|lg|full} | --width=<width> --height=<height>]\n";
print " [--color] [--palette=r,k,c,p,g]\n";
print " [--tokenize=<tokenChar>]\n";
print " [--graph[=[kv|vk]] [--numonly[=mon|abs]]\n";
print " [--char=<barChars>|<substitutionString>]\n";
print " [--help] [--verbose]\n";
print " --char=C character(s) to use for histogram character, some substitutions follow:\n";
print " ba (â¬) Bar\n";
print " bl (Î) Building\n";
print " em (â) Emdash\n";
print " me (â¯) Mid-Elipses\n";
print " di (â¦) Diamond\n";
print " dt (â¢) Dot\n";
print " sq (â¡) Square\n";
print " hl Use 1/3-width unicode partial lines to simulate 3x actual terminal width\n";
print " pb Use 1/8-width unicode partial blocks to simulate 8x actual terminal width\n";
print " pc Use 1/2-width unicode partial circles to simulate 2x actual terminal width\n";
print " --color colourise the output\n";
print " --graph[=G] input is already key/value pairs. vk is default:\n";
print " kv input is ordered key then value\n";
print " vk input is ordered value then key\n";
print " --height=N height of histogram, headers non-inclusive, overrides --size\n";
print " --help get help\n";
print " --logarithmic logarithmic graph\n";
print " --match=RE only match lines (or tokens) that match this regexp, some substitutions follow:\n";
print " word ^[A-Z,a-z]+\$ - tokens/lines must be entirely alphabetic\n";
print " num ^\\d+\$ - tokens/lines must be entirely numeric\n";
print " --numonly[=N] input is numerics, simply graph values without labels\n";
print " abs input is absolute values (default)\n";
print " mon input monotonically-increasing, graph differences (of 2nd and later values)\n";
print " --palette=P comma-separated list of ANSI colour values for portions of the output\n";
print " in this order: regular, key, count, percent, graph. implies --color.\n";
print " --rcfile=F use this rcfile instead of \$HOME/.distributionrc - must be first argument!\n";
print " --size=S size of histogram, can abbreviate to single character, overridden by --width/--height\n";
print " small 40x10\n";
print " medium 80x20\n";
print " large 120x30\n";
print " full terminal width x terminal height (approximately)\n";
print " --tokenize=RE split input on regexp RE and make histogram of all resulting tokens\n";
print " word [^\\w] - split on non-word characters like colons, brackets, commas, etc\n";
print " white \\s - split on whitespace\n";
print " --width=N width of the histogram report, N characters, overrides --size\n";
print " --verbose be verbose\n";
print "\n";
print "You can use single-characters options, like so: -h=25 -w=20 -v. You must still include the =\n";
print "\n";
print "Samples:\n";
print " du -sb /etc/* | $0 --palette=0,37,34,33,32 --graph\n";
print " du -sk /etc/* | awk '{print \$2\" \"\$1}' | $0 --graph=kv\n";
print " zcat /var/log/syslog*gz | $0 --char=o --tokenize=white\n";
print " zcat /var/log/syslog*gz | awk '{print \$5}' | $0 --t=word --m-word --h=15 --c=/\n";
print " zcat /var/log/syslog*gz | cut -c 1-9 | $0 --width=60 --height=10 --char=em\n";
print " find /etc -type f | cut -c 6- | $0 --tokenize=/ --w=90 --h=35 --c=dt\n";
print " cat /usr/share/dict/words | awk '{print length(\$1)}' | $0 --c=* --w=50 --h=10 | sort -n\n";
print "\n";
}
( run in 2.046 seconds using v1.01-cache-2.11-cpan-39bf76dae61 )