Bio-Grid-Run-SGE
view release on metacpan or search on metacpan
bin/distribution view on Meta::CPAN
#!/usr/bin/env perl
# distribution - creates histograms based on input
#
# Version 1.0
#
# Written by Tim Ellis of Palomino
# Copyright (c) 2012 by Tim Ellis
# Released under the terms of GNU GPLv2:
#
# This program is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation; version 2 of the License.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along with
# this program; if not, visit http://www.gnu.org/licenses/gpl-2.0.html in your
# web browser.
#
# If you're still living in the 1800's, you can instead write to:
#
# Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor,
# Boston, MA 02110-1301, USA.
use warnings;
use strict;
# if your system is dying on this statement, it should be safe to simply comment
# it out. your timings with --verbose will be very course.
#use Time::HiRes qw( time );
# for determining how long this program ran
my $startTime = time() * 1000;
# defaults - overridden first by distributionrc then by commandline options
my $pseudoCnt = 0;
my $size = '';
my $width = 80;
my $height = 15;
my $widthArg = 0;
my $heightArg = 0;
my $logarithmic = 0;
my $numOnly = '';
# character for ease of eye to scan left/right/up/down
my $histogramChar = "+";
my $unicode = 0;
my $tokenize = 0;
# by default, everything matches (nothing is stripped out)
my $matchRegexp = ".";
# status and summary statistics
my $verbose = 0;
# how often to give status if verbose
my $statInterval = 0.5;
# whether or not to parse input into bins (if data is pre-summarised and we're
# just presenting results)
my $graphValues = "";
# variables to support colourised output
my $colourisedOutput = 0;
my $colourPalette = '0,0,32,35,34';
my $regularColour = "";
my $keyColour = "";
my $ctColour = "";
my $pctColour = "";
my $graphColour = "";
# we'll re-use inLine at various times for reading files
my $inLine;
# if they want an rcfile, must be first arg
my $rcFile = $ENV{"HOME"} . "/.distributionrc";
if ($ARGV && $ARGV[0] =~ /^-+r(cfile)*=(.+)$/) { $rcFile = $2; }
# read in rcfile if it exists - it'll just be args
if (open (IFILE, "< $rcFile")) {
while ($inLine = <IFILE>) {
chomp ($inLine);
# must put args from rcfile at start so options passed on the
# commandline will override them
unshift (@ARGV, $inLine);
}
}
# process input arguments -- any arguments that aren't known switches are stuck
# onto @extraArgs - note that you can have one or two dashes before the option,
# and only need the first character. also options that take values and switches
# can be overloaded, so -c=o -c means use the o character and colourise the
# output. this can be confusing.
my @extraArgs;
foreach my $arg (@ARGV) {
if ($arg =~ /^-+h(elp)*$/) { doArgs(); exit 0; }
elsif ($arg =~ /^-+w(idth)*=(.+)$/) { $widthArg = $2; }
elsif ($arg =~ /^-+h(eight)*=(.+)$/) { $heightArg = $2; }
# can pass --graph without option
elsif ($arg =~ /^-+g(raph)*=(.+)$/) { $graphValues = $2; }
elsif ($arg =~ /^-+g(raph)*$/) { $graphValues = 'vk'; }
# --char takes option...
elsif ($arg =~ /^-+c(har)*=(.+)$/) { $histogramChar = $2; }
# ...--color doesn't.
elsif ($arg =~ /^-+c(olor)*$/) { $colourisedOutput = 1; }
elsif ($arg =~ /^-+l(ogarithmic)*$/) { $logarithmic = 2; }
# can pass --numonly without option
elsif ($arg =~ /^-+n(umonly)$/) { $numOnly = 'abs'; }
elsif ($arg =~ /^-+n(umonly)*=(.+)$/) { $numOnly = $2; }
elsif ($arg =~ /^-+pseudocnt$/) { $pseudoCnt = 1; }
elsif ($arg =~ /^-+p(alette)*=(.+)$/) { $colourPalette = $2; $colourisedOutput = 1; }
elsif ($arg =~ /^-+s(ize)*=(.+)$/) { $size = $2; }
elsif ($arg =~ /^-+t(okenize)*=(.+)$/) { $tokenize = $2; }
elsif ($arg =~ /^-+m(atch)*=(.+)$/) { $matchRegexp = $2; }
elsif ($arg =~ /^-+v(erbose)*$/) { $verbose = 1; }
else { push (@extraArgs, $arg); }
}
# variables to support colourised output
if ($colourisedOutput) {
# input should be comma-separated list of numerals
my @cp = split (',', $colourPalette);
# add ANSI colour commands around the values - final looks like ^[[32m
for (my $i=0; $i < scalar @cp; $i++) {
$cp[$i] = chr(27) . '[' . $cp[$i] . 'm';
}
($regularColour, $keyColour, $ctColour, $pctColour, $graphColour) = @cp;
}
# for advanced graphing
my $verticalBlocks = ["â", "â", "â", "â", "â
", "â", "â", "â"]; # for future?
my $partialBlocks = ["â", "â", "â", "â", "â", "â", "â", "â"]; # char=pb
my $partialCircles = ["â", "â"]; # char=pc
my $partialLines = ["â¸", "â¾", "â"]; # char=hl
# some useful substitutions for prettiness
if ($histogramChar eq "ba") { $unicode = 1; $histogramChar = "â¬"; }
elsif ($histogramChar eq "bl") { $unicode = 1; $histogramChar = "Î"; }
elsif ($histogramChar eq "em") { $unicode = 1; $histogramChar = "â"; }
elsif ($histogramChar eq "me") { $unicode = 1; $histogramChar = "â¯"; }
elsif ($histogramChar eq "di") { $unicode = 1; $histogramChar = "â¦"; }
elsif ($histogramChar eq "dt") { $unicode = 1; $histogramChar = "â¢"; }
elsif ($histogramChar eq "sq") { $unicode = 1; $histogramChar = "â¡"; }
# high-bit set means we're not in ASCIIland anymore
if (ord (substr ($histogramChar, 0, 1)) >= 128) { $unicode = 1; }
# sub-full character width graphing systems
my $charWidth = 1;
my $graphChars = undef;
if ($histogramChar eq "pb") {
$charWidth = 0.125;
$graphChars = $partialBlocks;
} elsif ($histogramChar eq "pc") {
$charWidth = 0.5;
$graphChars = $partialCircles;
} elsif ($histogramChar eq "hl") {
$charWidth = 0.3334;
$graphChars = $partialLines;
}
# some useful regexp replacements
if ($matchRegexp eq 'word') { $matchRegexp = '^[A-Z,a-z]+$'; }
elsif ($matchRegexp eq 'num') { $matchRegexp = '^\d+$'; }
if ($tokenize eq 'word') { $tokenize = '[^\w]'; }
elsif ($tokenize eq 'white') { $tokenize = '\s'; }
# set the size of the histogram
if ($size =~ /^f/) {
$width = int (`tput cols` * 0.99);
if ($width < 80) { $width = 80; }
$height = int (`tput lines` - 3);
if ($verbose) { $height -= 5; }
if ($height < 25) { $height = 25; }
} elsif ($size =~ /^s/) {
$width = 60;
$height = 10;
} elsif ($size =~ /^m/) {
$width = 100;
$height = 20;
} elsif ($size =~ /^l/) {
$width = 140;
$height = 35;
}
# if they passed in --width or --height, they probably meant it moreso than
# defaults or the --size parameter - so code this last
if ($widthArg) { $width = $widthArg; }
if ($heightArg) { $height = $heightArg; }
# build the dict of input values
my $nextStat = time() + $statInterval;
my $valuesDict;
my $totalValues = 0;
my $totalObjects = 0;
my $graphVal = 0;
my $maxVal = 0;
# three modes of operation
if ($numOnly) {
readNumerics();
} elsif ($graphValues) {
readPretalliedTokens();
} else {
readLinesBuildHash();
}
bin/distribution view on Meta::CPAN
}
# see if there was input
sub checkValuesObjects {
# the input may be empty. or we may have been too strict on the
# matching regexp passed in. either way, we were left with nothing.
if ($totalValues == 0) {
if ($totalObjects > 0) {
print STDERR "All input filtered! ";
} else {
print STDERR "No input! ";
}
print STDERR "No histogram for you.\n";
exit 255;
}
}
# the arrays, hashes, variables must be all-correct for this to
# work, TODO: list out which ones they are, convert to functional
# keyText->[] - list of the keys
# pctText->[] - list of the percents
# ctText->[] - list of the counts
# barWidth->[] - list of the widths of the bars
sub outputGraph {
# print a header with alignment from key names
print STDERR "min";
for ($j = 4; $j <= $maxKeyLen; $j++) { print STDERR " "; }
print STDERR "|Ct (Pct)";
for ($j = 7; $j <= $maxPreBarLen; $j++) { print STDERR " "; }
print STDERR "Histogram";
# get ready for the output, but sorting gets hosed if we print the
# colour code before the key, so put it on the line before
print STDERR "$keyColour\n";
# amount of other output reduces possible size of bar - alas
my $maxBarWidth = $width - $maxPreBarLen - $maxKeyLen - 4;
for ($i = 0; $i < $height; $i++) {
# first the key that we aggregated
print $keyText->[$i];
print $regularColour;
# alignment
for ($j = length ($keyText->[$i]); $j < $maxKeyLen; $j++) { print " "; }
# separater between keys and count/pct
print "|";
print $ctColour;
print $ctText->[$i] . " ";
print $pctColour;
print $pctText->[$i];
# spaces 'til time to print the bar
for ($j = length ($ctText->[$i]) + length ($pctText->[$i]); $j <= $maxPreBarLen; $j++) { print " "; }
print $graphColour;
for ($j = 0; $j < int ($barWidth->[$i] * $maxBarWidth); $j++) {
if ($charWidth < 1) {
# print out maximum-width character (always last in array)
print $graphChars->[scalar @$graphChars - 1];
} else {
# we're printing regular non-unicode characters
if (length ($histogramChar) > 1 && !$unicode) {
# but still we have >1 byte! so print initial byte
# for all but the last character (printed outside loop)
print substr ($histogramChar, 0, 1);
} else {
print $histogramChar;
}
}
}
my $count = $valuesDict->{$sortedKeys[$i]};
# print one too many bar characters so <1% gets a single bar character
if($count > 0) {
my $remainder = ($barWidth->[$i] * $maxBarWidth) - int ($barWidth->[$i] * $maxBarWidth);
my $whichChar = int ($remainder / $charWidth);
if ($charWidth < 1) {
# if we have partial-width characters, get higher resolution
if ($barWidth->[$i] * $maxBarWidth > $charWidth) {
# we have more than charWidth remainder, so print out a
# remainder portion
print $graphChars->[$whichChar];
} else {
# we had minimum remainder, so print minimum-width
# character just print the minimum-width portion
print $graphChars->[0];
}
} else {
# we're printing regular non-unicode characters
if (length ($histogramChar) > 1 && !$unicode) {
# but still we have >1 byte! so print final byte of input string
print substr ($histogramChar, -1, 1);
} else {
print $histogramChar;
}
}
}
# FIXME: even with all these colour-printing antics, still one key will
# be printed with the wrong colour on sorted output most of the time,
# but i have no idea how to fix this other than to implement sorting of
# the output within the script itself.
if ($i == $height - 1) {
# put the terminal back into a normal-colour mode on last entry
print "$regularColour\n";
} else {
# we do these antics of printing $keyColour on the line before
# the key so that piping output to sort will work
print "$keyColour\n";
}
}
}
# usage
sub doArgs {
print "\n";
print "usage: <commandWithOutput> | $0\n";
print " [--rcfile=<rcFile>]\n";
print " [--size={sm|med|lg|full} | --width=<width> --height=<height>]\n";
print " [--color] [--palette=r,k,c,p,g]\n";
print " [--tokenize=<tokenChar>]\n";
print " [--graph[=[kv|vk]] [--numonly[=mon|abs]]\n";
print " [--char=<barChars>|<substitutionString>]\n";
print " [--help] [--verbose]\n";
print " --char=C character(s) to use for histogram character, some substitutions follow:\n";
print " ba (â¬) Bar\n";
print " bl (Î) Building\n";
print " em (â) Emdash\n";
print " me (â¯) Mid-Elipses\n";
print " di (â¦) Diamond\n";
print " dt (â¢) Dot\n";
print " sq (â¡) Square\n";
print " hl Use 1/3-width unicode partial lines to simulate 3x actual terminal width\n";
print " pb Use 1/8-width unicode partial blocks to simulate 8x actual terminal width\n";
print " pc Use 1/2-width unicode partial circles to simulate 2x actual terminal width\n";
print " --color colourise the output\n";
print " --graph[=G] input is already key/value pairs. vk is default:\n";
print " kv input is ordered key then value\n";
print " vk input is ordered value then key\n";
print " --height=N height of histogram, headers non-inclusive, overrides --size\n";
print " --help get help\n";
print " --logarithmic logarithmic graph\n";
print " --match=RE only match lines (or tokens) that match this regexp, some substitutions follow:\n";
print " word ^[A-Z,a-z]+\$ - tokens/lines must be entirely alphabetic\n";
print " num ^\\d+\$ - tokens/lines must be entirely numeric\n";
print " --numonly[=N] input is numerics, simply graph values without labels\n";
print " abs input is absolute values (default)\n";
print " mon input monotonically-increasing, graph differences (of 2nd and later values)\n";
print " --palette=P comma-separated list of ANSI colour values for portions of the output\n";
print " in this order: regular, key, count, percent, graph. implies --color.\n";
print " --rcfile=F use this rcfile instead of \$HOME/.distributionrc - must be first argument!\n";
print " --size=S size of histogram, can abbreviate to single character, overridden by --width/--height\n";
print " small 40x10\n";
print " medium 80x20\n";
print " large 120x30\n";
print " full terminal width x terminal height (approximately)\n";
print " --tokenize=RE split input on regexp RE and make histogram of all resulting tokens\n";
print " word [^\\w] - split on non-word characters like colons, brackets, commas, etc\n";
print " white \\s - split on whitespace\n";
print " --width=N width of the histogram report, N characters, overrides --size\n";
print " --verbose be verbose\n";
print "\n";
print "You can use single-characters options, like so: -h=25 -w=20 -v. You must still include the =\n";
print "\n";
print "Samples:\n";
print " du -sb /etc/* | $0 --palette=0,37,34,33,32 --graph\n";
print " du -sk /etc/* | awk '{print \$2\" \"\$1}' | $0 --graph=kv\n";
print " zcat /var/log/syslog*gz | $0 --char=o --tokenize=white\n";
print " zcat /var/log/syslog*gz | awk '{print \$5}' | $0 --t=word --m-word --h=15 --c=/\n";
print " zcat /var/log/syslog*gz | cut -c 1-9 | $0 --width=60 --height=10 --char=em\n";
print " find /etc -type f | cut -c 6- | $0 --tokenize=/ --w=90 --h=35 --c=dt\n";
print " cat /usr/share/dict/words | awk '{print length(\$1)}' | $0 --c=* --w=50 --h=10 | sort -n\n";
print "\n";
}
( run in 0.603 second using v1.01-cache-2.11-cpan-39bf76dae61 )