percent results from the CPAN

percent

Bio-Grid-Run-SGE

view release on metacpan or search on metacpan

	readPretalliedTokens();
} else {
	readLinesBuildHash();
}

# see if there was input
checkValuesObjects();

my $totalKeys;
my @sortedKeys;

# if we're just graphing a bunch of numbers, no need to sort
# the values dict
if ($numOnly) {
	# here we sort on KEYS not VALUES like other cases
	@sortedKeys = sortKeysByKey();

	# we graph everything we're given - throw away nothing!
	$height = $totalObjects;
} else {
	# build the sorted dict
	@sortedKeys = sortKeysByValueFrequency();
	$maxVal = $valuesDict->{$sortedKeys[0]};
}

# if there aren't height # of distinct values, use less
$totalKeys = scalar @sortedKeys;
if ($totalKeys < $height) { $height = $totalKeys; }

# for logarithmic graphs
my $maxLog = 0;
if ($logarithmic) {
	$maxLog = log($maxVal);
}

my $i;
my $j;
my $keyText;
my $ctText;
my $pctText;
my $preBarLen;
my $barWidth;
my $maxPreBarLen = 0;
my $maxKeyLen = 0;

# here is the complex part - read it carefully
for ($i = 0; $i < $height; $i++) {
	# print the i'th most-common key
	$keyText->[$i] = $sortedKeys[$i];

	# how many times this key occurred in the input
	my $count = $valuesDict->{$sortedKeys[$i]};

	# determine the bar width based on key occurence
	if ($logarithmic) {
		$barWidth->[$i] = log($count) / $maxLog;
	} else {
		$barWidth->[$i] = $count / $maxVal;
	}

	# determine the percent of key frequency
	my $percentile = $count / $totalValues * 100;

	# graph axis labels, really
	$ctText->[$i] = sprintf ("%d", $count);
	$pctText->[$i] = $count > 0 ? sprintf ("(%3.02f%%)", $percentile) : '';
	$preBarLen = length ($ctText->[$i]) + length ($pctText->[$i]);

	# determine the longest key name and longest count/percent text for
	# aligning the output
	if ($preBarLen > $maxPreBarLen) { $maxPreBarLen = $preBarLen; }
	if (length ($sortedKeys[$i]) > $maxKeyLen) { $maxKeyLen = length ($sortedKeys[$i]); }
}

my $endTime = time() * 1000;
my $totalMillis = sprintf ("%.2f", ($endTime - $startTime));

if ($verbose) {
	print STDERR "tokens/lines examined: $totalObjects\n";
	print STDERR " tallied in histogram: $totalValues\n";
	print STDERR "    histogram entries: $totalKeys\n";
	print STDERR "              runtime: ${totalMillis}ms\n";
}

outputGraph();

exit 0;

# --------------------------------------------------------------------------- #
#                                 subroutines
# --------------------------------------------------------------------------- #

# get the keys ordered - we'll only print the most common keys
sub sortKeysByValueFrequency {
	my @sortedKeys = reverse sort { int($valuesDict->{$a}) <=> int($valuesDict->{$b}) } keys %{$valuesDict};
	return @sortedKeys;
}

# get the keys ordered simply by key
sub sortKeysByKey {
	my @sortedKeys = sort { int($a) <=> int($b) } keys %{$valuesDict};
	return @sortedKeys;
}

# here we just pull in a stream of numerics and graph them, optionally graphing
# the difference between each pair of values
sub readNumerics {
	# monotonically-increasing pairs, must have at least one value
	my $lastVal = undef;

	while ($inLine = <STDIN>) {
		chomp ($inLine);
		if ($numOnly =~ /^m/i) {
			if (defined $lastVal) {
				$graphVal= $inLine - $lastVal;
				$totalValues += $graphVal;
				$totalObjects++;
			}
			$lastVal = $inLine;
		} else {
			$graphVal= $inLine;
			$totalValues += $inLine;
			$totalObjects++;
		}

		if ($graphVal > $maxVal) { $maxVal = $graphVal; }

		# we just build a list where the keys are line num and values
		# are the value

bin/distribution view on Meta::CPAN

			}
		}
		if ($verbose && time() > $nextStat) {
			print STDERR "    Objects Processed: $totalObjects..." . chr(13);
			$nextStat = time() + $statInterval;
		}
	}
  if($pseudoCnt) {
    map { $valuesDict->{$_}-- } keys %$valuesDict;
    $totalObjects -= scalar keys %$valuesDict;
    $totalValues -= scalar keys %$valuesDict;
  }
	if ($verbose) { print STDERR "    Objects Processed: $totalObjects    \n"; }
}

# here is the case where we don't need to put the input into
# bins and tally - the data is pre-tallied for us
sub readPretalliedTokens {
	while ($inLine = <STDIN>) {
		chomp ($inLine);
		if ($graphValues eq 'vk') {
			if ($inLine =~ /^\s*(\d+)\s+(.+)$/) {
				$valuesDict->{$2} = $1;
				$totalValues += $1;
				if ($1 > $maxVal) { $maxVal = $1; }
				$totalObjects++;
			} else {
				print STDERR " E Input line malformed and discarded: $inLine\n";
			}
		} elsif ($graphValues eq 'kv') {
			if ($inLine =~ /^(.+?)\s+(\d+)$/) {
				$valuesDict->{$1} = $2;
				$totalValues += $2;
				if ($2 > $maxVal) { $maxVal = $2; }
				$totalObjects++;
			} else {
				print STDERR " E Input line malformed and discarded: $inLine\n";
			}
		}
	}
}

# see if there was input
sub checkValuesObjects {
	# the input may be empty. or we may have been too strict on the
	# matching regexp passed in. either way, we were left with nothing.
	if ($totalValues == 0) {
		if ($totalObjects > 0) {
			print STDERR "All input filtered! ";
		} else {
			print STDERR "No input! ";
		}
		print STDERR "No histogram for you.\n";
		exit 255;
	}
}

# the arrays, hashes, variables must be all-correct for this to
# work, TODO: list out which ones they are, convert to functional
# keyText->[]  - list of the keys
# pctText->[]  - list of the percents
# ctText->[]   - list of the counts
# barWidth->[] - list of the widths of the bars
sub outputGraph {
	# print a header with alignment from key names
	print STDERR "min";
	for ($j = 4; $j <= $maxKeyLen; $j++) { print STDERR " "; }
	print STDERR "|Ct (Pct)";
	for ($j = 7; $j <= $maxPreBarLen; $j++) { print STDERR " "; }
	print STDERR "Histogram";

	# get ready for the output, but sorting gets hosed if we print the
	# colour code before the key, so put it on the line before
	print STDERR "$keyColour\n";

	# amount of other output reduces possible size of bar - alas
	my $maxBarWidth = $width - $maxPreBarLen - $maxKeyLen - 4;

	for ($i = 0; $i < $height; $i++) {
		# first the key that we aggregated
		print $keyText->[$i];
		print $regularColour;
		# alignment
		for ($j = length ($keyText->[$i]); $j < $maxKeyLen; $j++) { print " "; }
		# separater between keys and count/pct
		print "|";
		print $ctColour;
		print $ctText->[$i] . " ";
		print $pctColour;
		print $pctText->[$i];

		# spaces 'til time to print the bar
		for ($j = length ($ctText->[$i]) + length ($pctText->[$i]); $j <= $maxPreBarLen; $j++) { print " "; }

		print $graphColour;
		for ($j = 0; $j < int ($barWidth->[$i] * $maxBarWidth); $j++) {
			if ($charWidth < 1) {
				# print out maximum-width character (always last in array)
				print $graphChars->[scalar @$graphChars - 1];
			} else {
				# we're printing regular non-unicode characters
				if (length ($histogramChar) > 1 && !$unicode) {
					# but still we have >1 byte! so print initial byte
					# for all but the last character (printed outside loop)
					print substr ($histogramChar, 0, 1);
				} else {
					print $histogramChar;
				}
			}
		}

    my $count = $valuesDict->{$sortedKeys[$i]};
		# print one too many bar characters so <1% gets a single bar character
    if($count > 0) {
      my $remainder = ($barWidth->[$i] * $maxBarWidth) - int ($barWidth->[$i] * $maxBarWidth);
      my $whichChar = int ($remainder / $charWidth);
      if ($charWidth < 1) {
        # if we have partial-width characters, get higher resolution
        if ($barWidth->[$i] * $maxBarWidth > $charWidth) {
          # we have more than charWidth remainder, so print out a
          # remainder portion

bin/distribution view on Meta::CPAN

        if (length ($histogramChar) > 1 && !$unicode) {
          # but still we have >1 byte! so print final byte of input string
          print substr ($histogramChar, -1, 1);
        } else {
          print $histogramChar;
        }
      }
    }

		# FIXME: even with all these colour-printing antics, still one key will
		# be printed with the wrong colour on sorted output most of the time,
		# but i have no idea how to fix this other than to implement sorting of
		# the output within the script itself.
		if ($i == $height - 1) {
			# put the terminal back into a normal-colour mode on last entry
			print "$regularColour\n";
		} else {
			# we do these antics of printing $keyColour on the line before
			# the key so that piping output to sort will work
			print "$keyColour\n";
		}
	}
}

# usage
sub doArgs {
	print "\n";
	print "usage: <commandWithOutput> | $0\n";
	print "         [--rcfile=<rcFile>]\n";
	print "         [--size={sm|med|lg|full} | --width=<width> --height=<height>]\n";
	print "         [--color] [--palette=r,k,c,p,g]\n";
	print "         [--tokenize=<tokenChar>]\n";
	print "         [--graph[=[kv|vk]] [--numonly[=mon|abs]]\n";
	print "         [--char=<barChars>|<substitutionString>]\n";
	print "         [--help] [--verbose]\n";
	print "  --char=C       character(s) to use for histogram character, some substitutions follow:\n";
	print "        ba       (â–¬) Bar\n";
	print "        bl       (Îž) Building\n";
	print "        em       (â€”) Emdash\n";
	print "        me       (â‹¯) Mid-Elipses\n";
	print "        di       (â™¦) Diamond\n";
	print "        dt       (â€¢) Dot\n";
	print "        sq       (â–¡) Square\n";
	print "        hl       Use 1/3-width unicode partial lines to simulate 3x actual terminal width\n";
	print "        pb       Use 1/8-width unicode partial blocks to simulate 8x actual terminal width\n";
	print "        pc       Use 1/2-width unicode partial circles to simulate 2x actual terminal width\n";
	print "  --color        colourise the output\n";
	print "  --graph[=G]    input is already key/value pairs. vk is default:\n";
	print "        kv       input is ordered key then value\n";
	print "        vk       input is ordered value then key\n";
	print "  --height=N     height of histogram, headers non-inclusive, overrides --size\n";
	print "  --help         get help\n";
	print "  --logarithmic  logarithmic graph\n";
	print "  --match=RE     only match lines (or tokens) that match this regexp, some substitutions follow:\n";
	print "        word     ^[A-Z,a-z]+\$ - tokens/lines must be entirely alphabetic\n";
	print "        num      ^\\d+\$        - tokens/lines must be entirely numeric\n";
	print "  --numonly[=N]  input is numerics, simply graph values without labels\n";
	print "        abs      input is absolute values (default)\n";
	print "        mon      input monotonically-increasing, graph differences (of 2nd and later values)\n";
	print "  --palette=P    comma-separated list of ANSI colour values for portions of the output\n";
	print "                 in this order: regular, key, count, percent, graph. implies --color.\n";
	print "  --rcfile=F     use this rcfile instead of \$HOME/.distributionrc - must be first argument!\n";
	print "  --size=S       size of histogram, can abbreviate to single character, overridden by --width/--height\n";
	print "        small    40x10\n";
	print "        medium   80x20\n";
	print "        large    120x30\n";
	print "        full     terminal width x terminal height (approximately)\n";
	print "  --tokenize=RE  split input on regexp RE and make histogram of all resulting tokens\n";
	print "        word     [^\\w] - split on non-word characters like colons, brackets, commas, etc\n";
	print "        white    \\s    - split on whitespace\n";
	print "  --width=N      width of the histogram report, N characters, overrides --size\n";
	print "  --verbose      be verbose\n";
	print "\n";
	print "You can use single-characters options, like so: -h=25 -w=20 -v. You must still include the =\n";
	print "\n";
	print "Samples:\n";
	print "  du -sb /etc/* | $0 --palette=0,37,34,33,32 --graph\n";
	print "  du -sk /etc/* | awk '{print \$2\" \"\$1}' | $0 --graph=kv\n";
	print "  zcat /var/log/syslog*gz | $0 --char=o --tokenize=white\n";
	print "  zcat /var/log/syslog*gz | awk '{print \$5}'  | $0 --t=word --m-word --h=15 --c=/\n";
	print "  zcat /var/log/syslog*gz | cut -c 1-9        | $0 --width=60 --height=10 --char=em\n";
	print "  find /etc -type f       | cut -c 6-         | $0 --tokenize=/ --w=90 --h=35 --c=dt\n";
	print "  cat /usr/share/dict/words | awk '{print length(\$1)}' | $0 --c=* --w=50 --h=10 | sort -n\n";
	print "\n";
}

( run in 1.660 second using v1.01-cache-2.11-cpan-7fcb06a456a )