Data-Prepare

 view release on metacpan or  search on metacpan

scripts/data-prepare  view on Meta::CPAN

  cols_non_empty non_unique_cols
  key_to_index
  make_pk_map pk_col_counts pk_match
  chop_lines chop_cols header_merge pk_insert
);
use YAML qw(LoadFile);
use Getopt::Long qw(GetOptions);

# Configuration file name; this default is replaced when -f is given.
my $config = "data-prepare-conf.yml";
# Mode switches and their arguments, filled in from the command line below.
my ($analyse, $uniq_cols, $verbose, $slice_num, $slice_key, $key_analyse, $pk_analyse);
GetOptions(
  "f=s"  => \$config,
  "v!"   => \$verbose,
  "a!"   => \$analyse,
  "u!"   => \$uniq_cols,
  "k!"   => \$key_analyse,
  "p!"   => \$pk_analyse,
  "sn=i" => \$slice_num,   # column to slice, given as an integer index
  "sk=s" => \$slice_key,   # column to slice, given as a column name
) or do {
  # The command line could not be parsed: hand over to Pod::Usage,
  # loaded only when actually needed.
  require Pod::Usage;
  Pod::Usage::pod2usage(1);
};

# -u mode: for every CSV file named on the command line, print the columns
# reported by non_unique_cols() together with the value it associates with
# each, then exit.  Files for which it reports nothing are left out.
if ($uniq_cols) {
  die "Usage: -u file..." unless @ARGV;
  # file name => hash-ref (column => count) as returned by non_unique_cols
  my %f2col2count;
  for my $file (@ARGV) {
    my $data = read_csv($file);
    my $col2count = non_unique_cols($data);
    next if !keys %$col2count;
    $f2col2count{$file} = $col2count;
  }
  # Sort the file names so the report order is the same on every run.
  for my $f (sort keys %f2col2count) {
    my $c2c = $f2col2count{$f};
    print "$f:\n", map "  '$_': $c2c->{$_}\n", sort keys %$c2c;
  }
  exit;
}

# -a mode: dump, per input file, the list that cols_non_empty() returns for
# that file's CSV data, then exit.
if ($analyse) {
  @ARGV or die "Usage: -a file...";
  my %non_empty_cols_of;
  for my $path (@ARGV) {
    my $rows = read_csv($path);
    my @cols = cols_non_empty($rows);
    $non_empty_cols_of{$path} = \@cols;
  }
  print _dump(\%non_empty_cols_of);
  exit;
}

# Serialise the first argument with Data::Dumper.  The module is loaded on
# demand and its package-wide Indent, Sortkeys and Terse settings are all
# set to 1 (they stay set after the call).
sub _dump {
  require Data::Dumper;
  {
    # Each setting is named only once in this file, which would otherwise
    # draw a "used only once: possible typo" warning.
    no warnings 'once';
    $Data::Dumper::Terse    = 1;
    $Data::Dumper::Sortkeys = 1;
    $Data::Dumper::Indent   = 1;
  }
  return Data::Dumper::Dumper($_[0]);
}

# Open $file as UTF-8 text and return the result of csv() reading from it.
# Dies with "$file: <OS error>" when the file cannot be opened.
# A leading byte-order mark (U+FEFF) is consumed before parsing.
sub read_csv {
  my ($file) = @_;
  open my $fh, "<:encoding(UTF-8)", $file or die "$file: $!";
  my $char = $fh->getc;
  # Text::CSV fails on BOM, so a leading U+FEFF is dropped and any other
  # first character is pushed back.  getc returns undef on an empty file;
  # without the defined() guard, ord(undef) == 0 would push a NUL
  # character into the stream.
  $fh->ungetc(ord $char) if defined $char and ord($char) != 0xFEFF;
  csv(in => $fh);
}

# -sn / -sk mode: print one column of every row of each CSV file, one value
# per line, encoded as UTF-8, then exit.  The column is chosen by index
# (-sn) or by looking its name up in the file's first row (-sk); -sn wins
# when both are given.
if (grep defined, $slice_num, $slice_key) {
  die "Usage: -s[kn] col[num|key] file..." unless @ARGV;
  require Encode;
  for my $file (@ARGV) {
    my $data = read_csv($file);
    # Resolve the index separately for each file: with -sk the named column
    # may sit at a different position (or be absent) in each file, so an
    # index found for one file must not be reused for the next.
    my $index = $slice_num;
    if (!defined $index) {
      $index = key_to_index($data->[0])->{$slice_key};
      die "Unknown column-name '$slice_key' in '$file'" if !defined $index;
    }
    print Encode::encode("UTF-8", $_->[$index]), "\n" for @$data;
  }
  exit;
}

# Load the YAML configuration named by $config (the default name unless -f
# was given).  The modes handled above exit before reaching this point.
my $process_config = LoadFile($config);

# -p mode: for each input file, dump three things and then exit:
#   - the first value returned by pk_col_counts() (its keys are used below
#     as the candidate column names);
#   - per candidate column, how often each entry of pk_match()'s second
#     return value occurred among the rows in pk_col_counts()'s second
#     return value;
#   - those rows for which pk_match() returned no defined first value in
#     any candidate column.
if ($pk_analyse) {
  @ARGV or die "Usage: -p file...";
  my $pk_map = make_pk_map(
    read_csv($process_config->{pk_spec}{file}),
    $process_config->{pk_spec}{primary_key},
    $process_config->{pk_spec}{alt_keys},
  );
  for my $path (@ARGV) {
    my $rows = read_csv($path);
    my ($exact_counts, $unmatched_rows) = pk_col_counts($rows, $pk_map);
    my $index_of = key_to_index($rows->[0]);
    my (%approx_counts, @still_unmatched);
    for my $row (@$unmatched_rows) {
      my $matched_somewhere;
      for my $col (keys %$exact_counts) {
        my ($best, $codes) = pk_match(
          $row->[ $index_of->{$col} ],
          $pk_map,
          $process_config->{pk_spec}{stopwords},
        );
        $approx_counts{$col}{$_}++ for @$codes;
        $matched_somewhere = 1 if defined $best;
      }
      push @still_unmatched, $row unless $matched_somewhere;
    }
    require Data::Dumper;
    my $report = Data::Dumper::Dumper([ $exact_counts, \%approx_counts, \@still_unmatched ]);
    print "$path:\n", $report;
  }
  exit;
}

# -k mode: for each input file, dump everything pk_col_counts() returns for
# that file against the primary-key map built from the configured pk_spec,
# then exit.
if ($key_analyse) {
  @ARGV or die "Usage: -k file...";
  my $pk_map = make_pk_map(
    read_csv($process_config->{pk_spec}{file}),
    $process_config->{pk_spec}{primary_key},
    $process_config->{pk_spec}{alt_keys},
  );
  for my $path (@ARGV) {
    my $rows = read_csv($path);
    my @counts = pk_col_counts($rows, $pk_map);
    print "$path: ", _dump(\@counts);
  }
  exit;
}

my %set_to_process; @set_to_process{
  (map @{ $_->{files} }, map @{$process_config->{$_} || []}, qw(merge pk_insert)),



( run in 1.145 second using v1.01-cache-2.11-cpan-d8267643d1d )