Acme-Tools
view release on metacpan or search on metacpan
or string occurring before it.
Output: An array of arrayrefs.
C<ht2t()> is a quick and dirty way of scraping (or harvesting as it is
also called) data from a web page. Look too L<HTML::Parse> to do this
more accurate.
Example:
use Acme::Tools;
use LWP::Simple;
my $url = "http://en.wikipedia.org/wiki/List_of_countries_by_population";
for( ht2t( get($url), "Countries" ) ) {
my($rank, $country, $pop) = @$_;
$pop =~ s/,//g;
printf "%3d | %-32s | %9d\n", @$_ if $pop>0;
}
Output:
1 | China | 1367740000
2 | India | 1262090000
3 | United States | 319043000
4 | Indonesia | 252164800
5 | Brazil | 203404000
...and so on.
=cut
sub ht2t {
my($f,$s,$r)=@_; 1>@_||@_>3 and croak; $s='' if @_==1;
$f=~s,.*?($s).*?(<table.*?)</table.*,$2,si;
my $e=0;$e++ while index($f,$s=chr($e))>=$[;
$f=~s/<t(d|r|h).*?>/\l$1$s/gsi;
$f=~s/\s*<.*?>\s*/ /gsi;
my @t=split("r$s",$f);shift @t;
$r||=sub{s/&(#160|nbsp);/ /g;s/&/&/g;s/^\s*(.*?)\s*$/$1/s;
s/(\d) (\d)/$1$2/g if /^[\d \.\,]+$/};
for(@t){my @r=split/[dh]$s/;shift@r;$_=[map{&$r;$_}@r]}
@t;
}
=head1 FILES, DIRECTORIES
=head2 writefile
Justification:
Perl needs three or four operations to make a file out of a string:
open my $FILE, '>', $filename or die $!;
print $FILE $text;
close($FILE);
This is way simpler:
writefile($filename,$text);
Sub writefile opens the file i binary mode (C<binmode()>) and has two usage modes:
B<Input:> Two arguments
B<First argument> is the filename. If the file exists, its overwritten.
If the file can not be opened for writing, a die (a croak really) happens.
B<Second input argument> is one of:
=over 4
=item * Either a scaler. That is a normal string to be written to the file.
=item * Or a reference to a scalar. That referred text is written to the file.
=item * Or a reference to an array of scalars. This array is the written to the
file element by element and C<< \n >> is automatically appended to each element.
=back
Alternativelly, you can write several files at once.
Example, this:
writefile('file1.txt','The text....tjo');
writefile('file2.txt','The text....hip');
writefile('file3.txt','The text....and hop');
...is the same as this:
writefile([
['file1.txt','The text....tjo'],
['file2.txt','The text....hip'],
['file3.txt','The text....and hop'],
]);
Automatic compression:
writefile('file.txt.gz','my text is compressed by /bin/gzip before written to the file');
Extentions C<.gz>, C<.bz2> and C<.xz> are recognized for compression. See also C<readfile()> and C<openstr()>.
B<Output:> Nothing (for the time being). C<die()>s (C<croak($!)> really) if something goes wrong.
=cut
#todo: use openstr() as in readfile(), transparently gzip .gz filenames and so on
sub writefile {
my($filename,$text)=@_;
if(ref($filename) eq 'ARRAY'){
writefile(@$_) for @$filename;
return;
}
open(WRITEFILE,openstr(">$filename")) and binmode(WRITEFILE) or croak($!);
if(!defined $text or !ref($text)){
print WRITEFILE $text;
}
elsif(ref($text) eq 'SCALAR'){
print WRITEFILE $$text;
}
elsif(ref($text) eq 'ARRAY'){
print WRITEFILE "$_\n" for @$text;
}
else {
croak;
}
close(WRITEFILE);
return;
}
=head2 readfile
Just as with L</writefile> you can read in a whole file in one operation with C<readfile()>. Instead of:
open my $FILE,'<', $filename or die $!;
my $data = join"",<$FILE>;
close($FILE);
This is simpler:
my $data = readfile($filename);
B<More examples:>
Reading the content of the file to a scalar variable: (Any content in C<$data> will be overwritten)
my $data;
readfile('filename.txt',\$data);
Reading the lines of a file into an array:
my @lines;
readfile('filnavn.txt',\@lines);
for(@lines){
...
}
Note: Chomp is done on each line. That is, any newlines (C<< \n >>) will be removed.
If C<@lines> is non-empty, this will be lost.
Sub readfile is context aware. If an array is expected it returns an array of the lines without a trailing C<< \n >>.
The last example can be rewritten:
for(readfile('filnavn.txt')){
...
}
With two input arguments, nothing (undef) is returned from C<readfile()>.
Automatic decompression:
my $txt = readfile('file.txt.gz'); #uses /bin/gunzip to decompress content
Extentions C<.gz>, C<.bz2> and C<.xz> are recognized for decompression. See also C<writefile()> and C<openstr()>.
return @filer;
}
=head2 basename
The basename and dirname functions behaves like the *nix shell commands with the same names.
B<Input:> One or two arguments: Filename and an optional suffix
B<Output:> Returns the filename with any directory and (if given) the suffix removed.
basename('/usr/bin/perl') # returns 'perl'
basename('/usr/local/bin/report.pl','.pl') # returns 'report' since .pl at the end is removed
basename('report2.pl','.pl') # returns 'report2'
basename('report2.pl','.\w+') # returns 'report2.pl', probably not what you meant
basename('report2.pl',qr/.\w+/) # returns 'report2', use qr for regex
=head2 dirname
B<Input:> A filename including path
B<Output:> Removes the filename path and returns just the directory path up until but not including
the last /. Return just a one char C<< . >> (period string) if there is no directory in the input.
dirname('/usr/bin/perl') # returns '/usr/bin'
dirname('perl') # returns '.'
=head2 username
Returns the current linux/unix username, for example the string root
print username(); #just (getpwuid($<))[0] but more readable perhaps
=cut
sub basename { my($f,$s)=(@_,'');$s=quotemeta($s)if!ref($s);$f=~m,^(.*/)?([^/]*?)($s)?$,;$2 }
sub dirname { $_[0]=~m,^(.*)/,;defined($1) && length($1) ? $1 : '.' }
sub username { (getpwuid($<))[0] }
=head2 wipe
Deletes a file by "wiping" it on the disk. Overwrites the file before deleting. (May not work properly on SSDs)
B<Input:>
* Arg 1: A filename
* Optional arg 2: number of times to overwrite file. Default is 3 if omitted, 0 or undef
* Optional arg 3: keep (true/false), wipe() but no delete of file
B<Output:> Same as the C<unlink()> (remove file): 1 for success, 0 or false for failure.
See also: L<https://www.google.com/search?q=wipe+file>, L<http://www.dban.org/>
=cut
sub wipe {
my($file,$times,$keep)=@_;
$times||=3;
croak "ERROR: File $file nonexisting\n" if not -f $file or not -e $file;
my $size=-s$file;
open my $WIFH, '+<', $file or croak "ERROR: Unable to open $file: $!\n";
binmode($WIFH);
for(1..$times){
my $block=chr(int(rand(256))) x 1024;#hm
for(0..($size/1024)){
seek($WIFH,$_*1024,0);
print $WIFH $block;
}
}
close($WIFH);
$keep || unlink($file);
}
=head2 chall
Does chmod + utime + chown on one or more files.
Returns the number of files of which those operations was successful.
Mode, uid, gid, atime and mtime are set from the array ref in the first argument.
The first argument references an array which is exactly like an array returned from perls internal C<stat($filename)> -function.
Example:
my @stat=stat($filenameA);
chall( \@stat, $filenameB, $filenameC, ... ); # by stat-array
chall( $filenameA, $filenameB, $filenameC, ... ); # by file name
Copies the chmod, owner, group, access time and modify time from file A to file B and C.
See C<perldoc -f stat>, C<perldoc -f chmod>, C<perldoc -f chown>, C<perldoc -f utime>
=cut
sub chall {
my($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,$atime,$mtime,$ctime,$blksize,$blocks )
= ref($_[0]) ? @{shift()} : stat(shift());
my $successful=0;
for(@_){ chmod($mode,$_) && utime($atime,$mtime,$_) && chown($uid,$gid,$_) && $successful++ }
return $successful;
}
=head2 makedir
Input: One or two arguments.
Works like perls C<mkdir()> except that C<makedir()> will create nesessary parent directories if they dont exists.
First input argument: A directory name (absolute, starting with C< / > or relative).
Second input argument: (optional) permission bits. Using the normal C<< 0777^umask() >> as the default if no second input argument is provided.
Example:
makedir("dirB/dirC")
...will create directory C<dirB> if it does not already exists, to be able to create C<dirC> inside C<dirB>.
Returns true on success, otherwise false.
C<makedir()> memoizes directories it has checked for existence before (trading memory and for speed).
Thus directories removed during running the script is not discovered by makedir.
See also C<< perldoc -f mkdir >>, C<< man umask >>
=cut
our %MAKEDIR;
sub makedir {
my($d,$p,$dd)=@_;
$p=0777^umask() if !defined$p;
(
$MAKEDIR{$d} or -d$d or mkdir($d,$p) #or croak("mkdir $d, $p")
or ($dd)=($d=~m,^(.+)/+([^/]+)$,) and makedir($dd,$p) and mkdir($d,$p) #or die;
) and ++$MAKEDIR{$d};
}
=head2 md5sum
B<Input:> a filename (or a scalar ref to a string, see below)
B<Output:> a string of 32 hexadecimal chars from 0-9 or a-f.
Example, the md5sum gnu/linux command without options could be implementet like this:
use Acme::Tools;
print eval{ md5sum($_)." $_\n" } || $@ for @ARGV;
This sub requires L<Digest::MD5>, which is a core perl-module since
version 5.?.? It does not slurp the files or spawn new processes.
If the input argument is a scalar ref then the MD5 of the string referenced is returned in hex.
=cut
sub md5sum {
require Digest::MD5;
my $fn=shift;
return Digest::MD5::md5_hex($$fn) if ref($fn) eq 'SCALAR';
croak "md5sum: $fn is a directory (no md5sum)" if -d $fn;
open my $FH, '<', $fn or croak "Could not open file $fn for md5sum() $!";
binmode($FH);
my $r = eval { Digest::MD5->new->addfile($FH)->hexdigest };
croak "md5sum on $fn failed ($@)\n" if $@;
$r;
}
=head2 which
Returns the first executable program in $ENV{PATH} paths (split by : colon) with the given name.
echo $PATH
perl -MAcme::Tools -le 'print which("gzip")' # maybe prints /bin/gzip
=head2 read_conf
B<First argument:> A file name or a reference to a string with settings in the format described below.
B<Second argument, optional:> A reference to a hash. This hash will have the settings from the file (or stringref).
The hash do not have to be empty beforehand.
Returns a hash with the settings as in this examples:
my %conf = read_conf('/etc/your/thing.conf');
print $conf{sectionA}{knobble}; #prints ABC if the file is as shown below
print $conf{sectionA}{gobble}; #prints ZZZ, the last gobble
print $conf{switch}; #prints OK here as well, unsectioned value
print $conf{part2}{password}; #prints oh:no= x
File use for the above example:
switch: OK #before first section, the '' (empty) section
[sectionA]
knobble: ABC
gobble: XYZ #this gobble is overwritten by the gobble on the next line
gobble: ZZZ
[part2]
password: oh:no= x #should be better
text: { values starting with { continues
until reaching a line with }
Everything from # and behind is regarded comments and ignored. Comments can be on any line.
To keep a # char, put a \ in front of it.
A C< : > or C< = > separates keys and values. Spaces at the beginning or end of lines are
ignored (after removal of #comments), as are any spaces before and after : and = separators.
Empty lines or lines with no C< : > or C< = > is also ignored. Keys and values can contain
internal spaces and tabs, but not at the beginning or end.
Multi-line values must start and end with { and }. Using { and } keep spaces at the start
or end in both one-line and multi-line values.
Sections are marked with C<< [sectionname] >>. Section names, keys and values is case
sensitive. C<Key:values> above the first section or below and empty C<< [] >> is placed
both in the empty section in the returned hash and as top level key/values.
C<read_conf> can be a simpler alternative to the core module L<Config::Std> which has
its own hassles.
$Acme::Tools::Read_conf_empty_section=1; #default 0 (was 1 in version 0.16)
my %conf = read_conf('/etc/your/thing.conf');
my $perc=!$o{M}&&!$o{C}&&!$o{A}&&!$o{P}?sub{""}:
sub{
my @p=$o{P}?(10,50,90):(50);
my @m=@_>0 ? do {grep$_, split",", $xtime{$_[0]}}
: do {grep$_, map {split","} values %xtime};
my @r=percentile(\@p,@m);
@r=(min(@m),@r,max(@m)) if $o{M}||$o{C}||$o{A};
@r=map int($_), @r;
my $fmt=$o{t}?'YYYY/MM/DD-MM:MI:SS':'YYYY/MM/DD';
@r=map tms($_,$fmt), @r;
" ".join(" ",@r);
};
my $width=max( 10, grep $_, map length($_), @e );
@e=@e[-10..-1] if $o{t} and @e>10; #-t tail
printf("%-*s %8d $f %7.2f%%%s\n",$width,$_,$c{$_},&$s($b{$_}),100*$b{$_}/$bts,&$perc($_)) for @e;
printf("%-*s %8d $f %7.2f%%%s\n",$width,"Sum",$cnt,&$s($bts),100,&$perc());
}
sub cmd_resubst {
my %o;
my $zo="123456789e";
my @argv=opts("f:t:vno:gi$zo",\%o,@_);
if(exists$o{t}){ $o{t}=~s,\\,\$, } else { $o{t}='' }
my($i,$tc,$tbfr,$tbto)=(0,0,0,0);
for my $file (@argv){
my $zopt=join" ",map"-$_",grep$o{$_},split//,$zo;
my $oext=$o{o}?$o{o}:$file=~/\.(gz|bz2|xz)$/i?$1:'';
my $open_out_pre=$oext?"|".openstr_prog({qw/gz gzip bz2 bzip2 xz xz/}->{lc($oext)})." $zopt":'';
my $open_out="$open_out_pre > $file.tmp$$";
my $open_in=openstr($file);
# die srlz(\%o,'o','',1);
open my $I, $open_in or croak"ERR: open $open_in failed. $! $?\n";
open my $O, $open_out or croak"ERR: open $open_out failed. $! $?\n";
my $c=0;
my $mod=join"",grep$o{$_},qw(g i);
eval"while(<\$I>){ \$c+=s/\$o{f}/$o{t}/$mod;print \$O \$_ }";
$tc+=$c;
close($I);close($O);
chall($file,"$file.tmp$$") or croak"ERR: chall $file\n" if !$o{n};
my($bfr,$bto)=(-s$file,-s"$file.tmp$$");
unlink $file or croak"ERR: cant rm $file\n";
my $newfile=$o{o}?repl($file,qr/\.(gz|bz2|xz)$/i,".$oext"):$file;
rename("$file.tmp$$",$newfile) or croak"ERR: rename $file.tmp$$ -> $newfile failed\n";
if($o{v}){
my $pr=$bfr?100*$bto/$bfr:0;
printf "%*d/%d %*s %7d =>%8d b (%2d%%) %s\n",
length(0+@argv), ++$i, 0+@argv, -15, "$tc/$c", $bfr, $bto, $pr, $file;
$tbfr+=$bfr;
$tbto+=$bto;
}
}
if($o{v} and @argv>1){
printf "Replaces: %d Bytes before: %d After: %d Change: %.1f%%\n",
$tc, $tbfr, $tbto, $tbfr?100*($tbto-$tbfr)/$tbfr:0
}
$tc;
}
sub cmd_xcat {
for my $fn (@_){
my $os=openstr($fn);
open my $FH, $os or warn "xcat: cannot open $os ($!)\n" and next;
#binmode($FH);#hm?
print while <$FH>;
close($FH);
}
}
sub cmd_freq {
my(@f,$i);
map $f[$_]++, unpack("C*",$_) while <>;
my $s=" " x 12;map{print"$_$s$_$s$_\n"}("BYTE CHAR COUNT","---- ----- -------");
my %m=(145,"DOS-æ",155,"DOS-ø",134,"DOS-Ã¥",146,"DOS-Ã",157,"DOS-Ã",143,"DOS-Ã
",map{($_," ")}0..31);
printf("%4d %5s%8d".(++$i%3?$s:"\n"),$_,$m{$_}||chr,$f[$_]) for grep$f[$_],0..255;print "\n";
my @no=grep!$f[$_],0..255; print "No bytes for these ".@no.": ".join(" ",@no)."\n";
}
sub cmd_deldup {
cmd_finddup('-d',@_);
}
sub cmd_finddup {
# http://www.commandlinefu.com/commands/view/3555/find-duplicate-files-based-on-size-first-then-md5-hash
# die "todo: finddup not ready yet"
my %o;
my @argv=opts("ak:dhsnqv0P:FMRp",\%o,@_); $o{P}=1024*8 if!defined$o{P}; $o{k}='' if!defined$o{k};
croak"ERR: cannot combine -a with -d, -s or -h" if $o{a} and $o{d}||$o{s}||$o{h};
require File::Find;
@argv=map{
my @f;
if(-d$_){ File::Find::find({follow=>0,wanted=>sub{return if !-f$_;push@f,$File::Find::name;1}},$_) }
else { @f=($_) }
@f;
}@argv;
my %md5sum;
my $md5sum=sub{$md5sum{$_[0]}=md5sum($_[0]) if!defined$md5sum{$_[0]}}; #memoize
my $md5sum_1st_part=sub{
open my $fh, "<", $_[0] or die "ERR: Could not read $_[0]";
binmode($fh);
my $buf; read($fh,$buf,$o{P});
close($fh);
md5sum(\$buf);
};
my @checks=( #todo: stat()[0,1] (or[0,1,7]?) and diff filename => no need for md5, is hardlink! just linux?
sub{-s$_[0]},
sub{-s$_[0]<=$o{P}?md5sum($_[0]):&$md5sum_1st_part($_[0])},
sub{md5sum($_[0])}
);
pop @checks if $o{M}; #4tst
my $i=0;
my %s=map{($_=>++$i)}@argv; #sort
my %f=map{($_=>[$_])}@argv; #also weeds out dupl params
for my $c (@checks){
my @f=map @{$f{$_}}, sort keys %f;
if($o{p} and $c eq $checks[-1]){ #view progress for last check, todo: eta() is wacky here! everywhere?
my $sum=@f?sum(map -s$_,@f):0;
my($corg,$cnt,$cntmb,$mb)=($c,0,0,$sum/1e6);
$c=sub{
$cntmb+=(-s$_[0])/1e6;
my $eol=++$cnt==@f?"\n":"\r";
print STDERR sprintf("%d/%d files checked (%d%%), %d/%d MB (%d%%), ETA in %d sec $eol",
$cnt, 0+@f, 100*$cnt/@f, $cntmb, $mb, 100*$cntmb/$mb,
curb(nvl(eta($cnt,0+@f),time)-time(),0,1e7));
&$corg(@_)
};
}
my %n; push @{$n{&$c($_)}}, $_ for @f;
delete @n{grep@{$n{$_}}<2,keys%n};
%f=%n;
}
return %f if $o{F};
my@r=sort{$s{$$a[0]}<=>$s{$$b[0]}}values%f;
my $si={qw(o 9 n 9 O 8 N 8)}->{$o{k}}; #stat index: 9=mtime, 8=atime
my $sort=lc$o{k} eq 'o' ? sub{sprintf"%011d%9d", (stat($_[0]))[$si],$s{$_[0]}}
:lc$o{k} eq 'n' ? sub{sprintf"%011d%9d",1e11-(stat($_[0]))[$si],$s{$_[0]}}
: sub{sprintf "%9d", $s{$_[0]}};
@$_=map$$_[1],sort{$$a[0]cmp$$b[0]}map[&$sort($_),$_],@$_ for @r;
my %of; #dup of
for my $r (@r){
$of{$_}=$$r[0] for @$r[1..$#$r];
}
my $nl=$o{0}?"\x00":"\n";
my $print=sub{$o{q} or print $_[0]};
my $do=sub{ $o{v} && &$print("$_[0]$nl"); qx($_[0]) };
my $go=sub{ $o{n} ? &$print("$_[0]$nl") : &$do($_[0]) };
&$print(join$nl,map join("",map"$_$nl",@$_),@r) and return if $o{a};
@r=map@$_[1..$#$_],@r;
return @r if $o{R}; #hm
unlink@r if $o{d}||$o{s}||$o{h} and !$o{n}; #delete duplicates
map &$go(qq(rm "$_") ),@r if $o{d}&& $o{n}; #delete duplicates, dryrun
map &$go(qq(ln "$of{$_}" "$_")),@r if $o{h}; #replace duplicates with hardlink
map &$go(qq(ln -s "$of{$_}" "$_")),@r if $o{s}; #replace duplicates with symlink,
#todo: BUG! abc/def/file -> ghi/file should be abc/def/file -> ../../ghi/file
return if $o{q} or $o{n}; #quiet or dryrun
&$print("$_$nl") for @r;
}
#http://stackoverflow.com/questions/11900239/can-i-cache-the-output-of-a-command-on-linux-from-cli
our $Ccmd_cache_dir='/tmp/acme-tools-ccmd-cache';
our $Ccmd_cache_expire=15*60; #default 15 minutes
( run in 1.084 second using v1.01-cache-2.11-cpan-39bf76dae61 )