App-colsummary
view release on metacpan or search on metacpan
#!/usr/bin/perl
# colsummary : TSVã¾ãã¯CSVãã¡ã¤ã«ã®ååã®å¤ã®æ§åã表示ãããã¨ã¦ã便å©ã
# 2015/05/11 - 2016/07/05 , 2018-03-28 . Shimono Toshiyuki
# 2019/10/24, 2021/06/08, 2021/06/11 ããã«å¤§å¹
ã«æ¸ãæ¿ã
use 5.014 ;
use strict ;
use warnings ; # also confirmed on 5.011 5.014 5.018
use autodie qw [ open ] ;
use Encode qw[ decode_utf8 encode_utf8 ] ;
use FindBin qw [ $Script ] ;
use Getopt::Std ; getopts 'R:c:g:i:jm:r:su:v:z=!@:#:0:2:' => \my %o ;
use List::Util qw/max min maxstr minstr/ ;
use POSIX qw/strtod/ ; # # å¹³åå¤ã®è¨ç®ã«ç¨ãã str to double.
use Scalar::Util qw/looks_like_number/;
use Term::ANSIColor qw/:constants color/ ; $Term::ANSIColor::AUTORESET = 1 ;
use Time::HiRes qw [ gettimeofday tv_interval ] ; my ${ dt_start } = [ gettimeofday ] ;
my $sdt = sprintf '%04d-%02d-%02d %02d:%02d:%02d', do{my @t= @{[localtime]}[5,4,3,2,1,0]; $t[0]+=1900; $t[1]++; @t } ;
eval "use PerlIO::gzip;1" or die "PerlIO::gzip cannot be loaded, so -z does not work. ($Script, $sdt)\n" if $o{z} ;
sub AlignOut ( @ ) ; # åºå ; eachFileã§ãColstatã§ã使ãã
sub ColStat ( $$ ) ; # $colvals->[åçª] 㨠ååã æ¸¡ããããã¦ããã®ä¸èº«ã表示ãããã; eachFileã§ãColstatã§ã使ãã
sub d3 ($) { $_[0] =~ s/(?<=\d)(?=(\d\d\d)+($|\D))/,/gr } ; # æ°ã3æ¡åºåãã«å¤æããã
sub eachFile ( $ ) ;
# 以ä¸3åã¯eachFileããå¼ã°ããã
sub colnames( $ ) ; # -=ã®æã«å
é è¡ã®æ
å ±ãåãåºã
sub filePinfo ; # ãã¡ã¤ã«æ¯ã®2次æ
å ±(ä¸è¡ãµããª)
sub ColFreq ( $$ ) ; # 第ï¼å¤æ°ã¯ãã¡ã¤ã«ãã³ã㫠第ï¼å¤æ°ã¯åç
§ ; ååã®å¤ã®åå¸ãåãåºã
## å
¨ä½ã§ä½¿ã夿°
my $optu0 = exists $o{u} && $o{u} eq 0 ;
* decode = * decode_utf8 ;
* decode = sub ( $ ) { $_[0] } if $optu0 ;
#* decode = (! $optu0) ? * decode_utf8 : sub ($){ $_[0] } ; #* encode = $o{u} ? * encode_utf8 : sub ($){ $_[0] } ;
my $optR0 = defined $o{R} && $o{R} eq 0 ;
* R0proc = $optR0 ? sub {} : sub { s/\r$// } ; # -R0 ãæå®ãããæã®å¦ç
binmode *STDOUT , ':utf8' unless $optu0 ; # ããã ãå
¨ä½ã§ä½¿ã夿°ã¨ã¯è¨ãããããããããªããããªãã·ã§ã³ã®å¦çã¯ãã®ç¯ã®ååã«ããã®ã§ã
$| = 1 if $o{'!'} ;
$o{g} //= 6 ; # if ( ! defined $o{g} ) ; # åãåºãæ°
$o{r} //= "~" ; # ç¯å²ã表ãè¨å·(åºåã§ä½¿ã)
$o{c} //= '|' ; # ç¯å²ã示ã表示ãè¤æ°ç¹ããè¨å·(åºåã§ä½¿ã)
$o{'#'} = decode ( $o{'#'} ) if defined $o{'#'} ; # é¤å¤ããæ£è¦è¡¨ç¾
push @ARGV , '-' unless @ARGV ; # æ¨æºå
¥åã®è¿½å
my $isep = $o{i} // "\t" ; # å
¥åã®åºåãæå $o{','} = do { $o{','} //= "\t" ; eval qq[qq[$o{','}]] } ;
my $sec = $o{'@'} // 15 ; # ä½ç§ãã¨ã«ã¬ãã¼ãã表示ãããã
my $nc = 0 ; # è¨æ°å¯¾è±¡ã¨ããªãã£ãã»ã«ã®æ°ãã«ã¦ã³ãã
my $rl ; # åãã¡ã¤ã«ã®èªãã è¡æ°ãæ ¼ç´ã
my %fOut = (
j => [map{UNDERLINE decode($_)}map{eval"\"$_\""}qw[åçª ç°ãªãå¤ æ°å¤åå¹³å åå å¤ã®ç¯å² æé »å¤ é »åº¦(éè¤)], q[], 'æ¡æ°'],
e => [ map {UNDERLINE $_ } qw[ cpos diff ave. name range frequent frequency(multi) ] , "", "digits" ] ) ;
my $col = undef ; # 0ãªãªã¸ã³ã®ã«ã©ã çªå· ## sub ColFreq å
ã§ä½¿ãã
* negcell = defined $o{'#'} ? sub { if (m/$o{'#'}/ ) { $col ++ ; $nc ++ ; goto EACH_CELL } } : sub {} ; # o{'0'} ãããã
## ã·ã°ãã«ã«å¯¾ããè¨å®
my ${ INT1 } = sub {
&{ $SIG{ALRM} } ;
print STDERR BRIGHT_RED
'Do you want to get the halfway result? Then type Ctrl + C again within 2 seconds. '. "\n" .
'Really want to Quit? Then press Ctrl + "\" or Ctrl + Yen-Mark. (Ctrl+Z may be what you want.) ' . RESET "\n" ;
$SIG{INT} = sub { select *STDERR ; & ColStat ; select *STDOUT ; return } ;
sleep 2 ;
return ;
} ;
$SIG{ INT } = ${ INT1 } ;
$SIG{ ALRM } = sub { say STDERR GREEN + (d3 $rl) . " lines read. " , scalar localtime ; alarm $sec } ;
alarm $sec ;
eachFile $_ for @ARGV ;
exit 0 ;
## 1åãã¤ãã¡ã¤ã«ãèªã¿åãã
sub eachFile ( $ ) {
#my $FH = do { my $t = *STDIN if $_[0] eq '-' ; open $t, '<', $_[0] if!$t ; binmode $t , ':gzip(gzip)' if $o{z} ; $t } ; # ãã¡ã¤ã«ãã³ãã«ã®åå¾
my $FH = do{my$t;if($_[0]eq'-'){$t=*STDIN}else{open$t,'<',$_[0]} ; binmode $t , ':gzip(gzip)' if $o{z} ; $t } ; # ãã¡ã¤ã«ãã³ãã«ã®åå¾
$rl = 0 ; # åãã¡ã¤ã«ã®èªã¿åã£ãè¡æ°
# 1. æåã®ååã®ä¸¦ã³ãèªã¿åã:
my @colnames = colnames $FH if $o{'='} ;
# 2. ãã¼ã¿ã®ä¸èº«ãèªã¿åãã¤ã¤ãè¿ãå¤ã¯ãåæ°ãã
my $maxCols = ColFreq $FH, my $colvals ; #my $colvals ; ååã®åãã¼ã¿å¤ã®åº¦æ°ãéè¨;$colvals->[åçª-1]{ãã¼ã¿å¤}=度æ°
close $FH ;
# 3. åºåããã
AlignOut @{ $fOut{$o{j}?'j':'e'} } if 0 ne ($o{0}//'') ;
defined $colvals->[$_] and ColStat $colvals->[ $_ ] , $colnames[$_] for 0 .. $maxCols - 1 ; # ãªãã·ã§ã³ -0 ã«ããå
¨ã¦ã®å¤ãé¤å¤ããããã¨ã¯èµ·ããããã
# 4. 2次æ
å ±ãåºåããã
filePinfo ;
}
### ãããããååãåå¾ããã -= ãæå®ãããå ´åã®ã¿
sub colnames ( $ ) {
my $FH = $_[0] ;
$_ = <$FH> ;
$rl ++ if defined $_ ;
$_ //= '' ;
& R0proc ; # <-- R0procã§è¡æ«ã®\r対çã
chomp $_ ;
decode ($_) ;
my @F = split /$isep/, decode ($_) , -1 ;
#my @F = split /$isep/, do { my $FH = $_[0] ; my $t = <$FH> ; $rl++ if defined $t ; $t //= '' ; chomp $t ; decode ($t) } , -1
}
###
sub filePinfo {
exit if ($o{2}//'') eq 0 ;
$rl = d3 ($rl // 0) ; # read lines
my $procsec = tv_interval ${ dt_start } ;
my $out = "$rl line(s) read; ";
$out .= "$nc cells are not counted; " if $nc ;
$out .= sprintf '%0.6f seconds (colsummary)', $procsec ; # ãã¾ã«ãã¤ã¯ãç§åä½ã®$procsecã15æ¡ãããã§è¡¨ç¤ºãããã®ã§sprintfã
say STDERR BOLD DARK ITALIC CYAN $out ;
}
### ååã®å¤ã®åå¸ãåãåºã
sub ColFreq ( $$ ) { # 第ï¼å¤æ°ã¯ãã¡ã¤ã«ãã³ã㫠第ï¼å¤æ°ã¯åç
§
#my %zstr ; # é¤å¤ãããæååã®åºç¾é »åº¦ã(ç¹æ¤ç¨ã§ãããã) #my $intflg ; #$SIG{INT} = sub { $intflg = 1 } ;
my $maxCols = 0 ;
#my $col = undef ; # 0ãªãªã¸ã³ã®ã«ã©ã çªå·
## * lenlim = defined $o{v} ? sub { grep { $_ = substr $_, 0, $o{v} } @_ } : sub {} ; # -v ã§é·ãå¶é ## -v ã®å¶éã¯å¥ã®æã§ã
* tailspacetrim = defined $o{s} ? sub { grep { s/\s+$// } @_ } : sub {} ;
#* negcell = defined $o{'#'} ? sub { if (m/$o{'#'}/ ) { $col ++ ; $nc ++ ; goto EACH_CELL } } : sub {} ; # o{'0'} ãããã
for ( my $FH = $_[0] ; <$FH> ; $rl ++ ) { # <-- - ãããããªã³ã¼ããæ¸ããã¨èªåã§æã£ã¦ãã(2021-06-08)
chomp ;
& R0proc ;
my @F = map { decode ( $_ ) } split /$isep/ , $_ , -1 ;
#& lenlim ( @F ) ; # åã»ã«ã®é·ãå¶é
& tailspacetrim ( @F ) ;
$col = 0 ;
EACH_CELL :
while ( defined ($_ = shift @F) ) {
#do { $zstr { $F[$_] } ++ ; next } if exists $o{'0'} && $F[$_] =~ m/$o{'0'}/ ;
& negcell ; #next if exists $o{'0'} && $F[$_] =~ m/$o{'0'}/ ;
++ $_[1] -> [ $col ] { $_ } ; # ååã®åãã¼ã¿å¤ã®åº¦æ°ãéè¨
$col ++ ;
}
$maxCols = $col if $maxCols < $col ;
( run in 0.872 second using v1.01-cache-2.11-cpan-39bf76dae61 )