Data-Prepare
view release on metacpan or search on metacpan
scripts/data-prepare view on Meta::CPAN
cols_non_empty non_unique_cols
key_to_index
make_pk_map pk_col_counts pk_match
chop_lines chop_cols header_merge pk_insert
);
use YAML qw(LoadFile);
use Getopt::Long qw(GetOptions);
# Configuration file read later via LoadFile(); -f replaces this default.
my $config = "data-prepare-conf.yml";

# Destinations for the command-line switches parsed below.
my ($analyse, $uniq_cols, $verbose, $slice_num, $slice_key, $key_analyse, $pk_analyse);

# The "!" specs are negatable booleans, "=s" takes a string, "=i" an integer.
# If parsing fails, show the usage text from the script's POD and stop.
unless (
    GetOptions(
        "a!"   => \$analyse,
        "u!"   => \$uniq_cols,
        "v!"   => \$verbose,
        "f=s"  => \$config,
        "sn=i" => \$slice_num,
        "sk=s" => \$slice_key,
        "k!"   => \$key_analyse,
        "p!"   => \$pk_analyse,
    )
) {
    require Pod::Usage;
    Pod::Usage::pod2usage(1);
}
# -u mode: for every CSV file named on the command line, print the columns
# reported by non_unique_cols() together with the value it gives for each.
# Files for which non_unique_cols() reports nothing are left out.
if ($uniq_cols) {
    die "Usage: -u file..." unless @ARGV;
    # file name => hash-ref returned by non_unique_cols() for that file
    my %f2col2count;
    for my $file (@ARGV) {
        my $data = read_csv($file);
        my $col2count = non_unique_cols($data);
        next if !keys %$col2count;
        $f2col2count{$file} = $col2count;
    }
    # Sort the file names so the report order is the same on every run
    # (plain hash order changes between runs).
    for my $f (sort keys %f2col2count) {
        my $c2c = $f2col2count{$f};
        print "$f:\n", map " '$_': $c2c->{$_}\n", sort keys %$c2c;
    }
    exit;
}
# -a mode: dump, per input file, the list returned by cols_non_empty().
if ($analyse) {
    @ARGV or die "Usage: -a file...";
    my %cols_by_file;
    foreach my $path (@ARGV) {
        my $rows = read_csv($path);
        my @non_empty = cols_non_empty($rows);
        $cols_by_file{$path} = \@non_empty;
    }
    print _dump(\%cols_by_file);
    exit;
}
# Serialise one data structure with Data::Dumper and return the text.
# Sets the package-wide Data::Dumper options Indent, Sortkeys and Terse to 1,
# so they stay in effect for any later Dumper call in this process.
sub _dump {
    my ($data) = @_;
    require Data::Dumper;
    {
        # Each of these globals is mentioned only here, which would otherwise
        # trigger a "used only once" warning.
        no warnings 'once';
        $Data::Dumper::Indent   = 1;
        $Data::Dumper::Sortkeys = 1;
        $Data::Dumper::Terse    = 1;
    }
    return Data::Dumper::Dumper($data);
}
# Open $file as UTF-8 text, skip a leading byte-order mark if there is one,
# and hand the open handle to csv(). Returns whatever csv() returns.
# Dies with "$file: <OS error>" if the file cannot be opened.
sub read_csv {
    my ($file) = @_;
    # Method calls on a lexical handle need IO::Handle loaded; perls older
    # than 5.14 do not load it on demand.
    require IO::Handle;
    open my $fh, "<:encoding(UTF-8)", $file or die "$file: $!";
    my $char = $fh->getc;
    # Put the first character back unless it is the BOM (Text::CSV fails on
    # BOM). On an empty file getc yields undef; pushing that back would
    # inject a NUL character, so there is nothing to restore.
    $fh->ungetc(ord $char) if defined $char and ord($char) != 0xFEFF;
    return csv(in => $fh);
}
# -sn / -sk mode: print one column of every row of each input file, one
# value per line, encoded as UTF-8. -sn gives the column index directly;
# -sk gives a column name that is looked up in each file's first row via
# key_to_index(). When both are given, -sn wins.
if (grep defined, $slice_num, $slice_key) {
    die "Usage: -s[kn] col[num|key] file..." unless @ARGV;
    require Encode;
    for my $file (@ARGV) {
        my $data = read_csv($file);
        # Resolve the index per file: with -sk the same column name may sit
        # at a different position in each file, so the index found for one
        # file must not be reused for the next.
        my $col_index = $slice_num;
        if (!defined $col_index) {
            $col_index = key_to_index($data->[0])->{$slice_key};
            die "Unknown column-name '$slice_key' in '$file'" if !defined $col_index;
        }
        print Encode::encode("UTF-8", $_->[$col_index]), "\n" for @$data;
    }
    exit;
}
# The remaining modes are driven by the YAML configuration file.
my $process_config = LoadFile($config);

# -p mode: for each input file, dump a three-element list:
#   1. the first value returned by pk_col_counts();
#   2. per candidate column, a tally of the values in the second return of
#      pk_match() for the rows pk_col_counts() returned as its second value;
#   3. those of these rows for which pk_match() returned no defined first
#      value in any candidate column.
# Candidate columns are the keys of the first pk_col_counts() result.
if ($pk_analyse) {
    @ARGV or die "Usage: -p file...";
    my $pk_map = make_pk_map(
        read_csv($process_config->{pk_spec}{file}),
        @{ $process_config->{pk_spec} }{qw(primary_key alt_keys)},
    );
    require Data::Dumper;
    foreach my $csv_path (@ARGV) {
        my $rows = read_csv($csv_path);
        my ($exact_counts, $unmatched_rows) = pk_col_counts($rows, $pk_map);
        my $index_of = key_to_index($rows->[0]);
        my %approx_counts;
        my @still_unmatched;
        foreach my $row (@$unmatched_rows) {
            my $found = 0;
            foreach my $col (keys %$exact_counts) {
                my ($best, $codes) = pk_match(
                    $row->[ $index_of->{$col} ],
                    $pk_map,
                    $process_config->{pk_spec}{stopwords},
                );
                foreach my $code (@$codes) {
                    $approx_counts{$col}{$code}++;
                }
                $found = 1 if defined $best;
            }
            push @still_unmatched, $row unless $found;
        }
        print "$csv_path:\n",
            Data::Dumper::Dumper([ $exact_counts, \%approx_counts, \@still_unmatched ]);
    }
    exit;
}
# -k mode: for each input file, dump everything pk_col_counts() returns when
# given the file's rows and the primary-key map built from the configuration.
if ($key_analyse) {
    @ARGV or die "Usage: -k file...";
    my $pk_map = make_pk_map(
        read_csv($process_config->{pk_spec}{file}),
        @{ $process_config->{pk_spec} }{qw(primary_key alt_keys)},
    );
    foreach my $csv_path (@ARGV) {
        my $rows = read_csv($csv_path);
        my @counts = pk_col_counts($rows, $pk_map);
        print "$csv_path: ", _dump(\@counts);
    }
    exit;
}
my %set_to_process; @set_to_process{
(map @{ $_->{files} }, map @{$process_config->{$_} || []}, qw(merge pk_insert)),
( run in 1.145 second using v1.01-cache-2.11-cpan-d8267643d1d )