BOM results from the CPAN

Spreadsheet-Edit

view release on metacpan or search on metacpan


1000.027 : Add OPTION subname_override to log_call and friends.
           Initial support for csv -> txt (fixed-width-fields); not fully working yet.
           Removed undocumented tie_col_variables :safe option.
           Doc tweaks.
1000.026 : Require Path::Tiny 0.146; fix Preload glitch with title_rx
1000.025 : Require Data::Dumper::Interp 7.012 to get portability fixes
1000.023 : Yet more debug code to diagnose Windows smoker failures...
1000.022 : Small changes, hunting for Windows smoker failures.
1000.021 : Allow completely-redundant alias definitions; Added btw* test cases.
1000.020 : Fix bug with BOM+empty files.
1000.019 : Fix leading-zero .csv corruption bug which came back in 1000.018
           Fix BOM-related test issues on Windows.
1000.018 : Support reading csv files with a BOM
1000.016 : Avoid warnings about !$foo ne $bar with perl v5.41.5
1000.016 : Allow {data_soruce} as option to read_spreadsheet()
1000.015 : Add sort_indicies() method.   Remove unused t/LpodhTestUtils.pm
           Add btwN, btwbt added to Spreadsheet::Edit::Log
1000.014 : Revised algo used to search for an OO/LL installation
1000.013 : Require Data::Dumper::Interp 7.003 (to fix API change issue)
1000.012 : Put back first/last_data_rx (reverts removal in 1000.009).
           sort_rows() needs them to be useful in multi-section sheets.
1000.011 : Tester fixes for Windows
1000.010 : Misc. fixes; Enhanced btwN with mini-backtrace etc.

Changes view on Meta::CPAN

        Override with 
          input_encoding => "UTF-8",  # for example
          quote_char     => '"',
          sep_char       => ','
        ----
        Support CSV transcoding e.g.
          convert_spreadsheet( inpath => "file_in_utf16.csv", 
                               cvt_to => "csv",
                               output_encoding => "UTF-8" )

        will auto-detect the input encoding (using BOM if present)
        and transcode to a UTF-8 outpath.
        ---
        Separate and generalize the logging functions into
        Spreadsheet::Edit::Log .
        ---
3.009 : Switch tests from Test::More to Test2::V0

META.yml view on Meta::CPAN

  Carp::Always: '0'
  Clone: '0'
  Data::Dumper: '0'
  Data::Dumper::Interp: '7.012'
  Data::Hexify: '0'
  Digest::MD5: '0'
  Encode: '0'
  Exporter: '5.57'
  Exporter::Tiny: 1.001_000
  Fcntl: '0'
  File::BOM: '0'
  File::Basename: '0'
  File::Copy: '0'
  File::Copy::Recursive: '0'
  File::Find: '0'
  File::Glob: '0'
  File::Spec: '0'
  File::Spec::Functions: '0'
  File::Which: '0'
  Guard: '0'
  Import::Into: '0'

Makefile.PL view on Meta::CPAN

    "Carp::Always" => 0,
    "Clone" => 0,
    "Data::Dumper" => 0,
    "Data::Dumper::Interp" => "7.012",
    "Data::Hexify" => 0,
    "Digest::MD5" => 0,
    "Encode" => 0,
    "Exporter" => "5.57",
    "Exporter::Tiny" => "1.001_000",
    "Fcntl" => 0,
    "File::BOM" => 0,
    "File::Basename" => 0,
    "File::Copy" => 0,
    "File::Copy::Recursive" => 0,
    "File::Find" => 0,
    "File::Glob" => 0,
    "File::Spec" => 0,
    "File::Spec::Functions" => 0,
    "File::Which" => 0,
    "Guard" => 0,
    "Import::Into" => 0,

Makefile.PL view on Meta::CPAN

  "Data::Dumper" => 0,
  "Data::Dumper::Interp" => "7.012",
  "Data::Hexify" => 0,
  "Digest::MD5" => 0,
  "Encode" => 0,
  "English" => 0,
  "Env" => 0,
  "Exporter" => "5.57",
  "Exporter::Tiny" => "1.001_000",
  "Fcntl" => 0,
  "File::BOM" => 0,
  "File::Basename" => 0,
  "File::Copy" => 0,
  "File::Copy::Recursive" => 0,
  "File::Find" => 0,
  "File::Glob" => 0,
  "File::Path" => 0,
  "File::Spec" => 0,
  "File::Spec::Functions" => 0,
  "File::Temp" => "0.23",
  "File::Which" => 0,

lib/Spreadsheet/Edit/IO.pm view on Meta::CPAN


use Path::Tiny 0.146 qw/path/;

# Path::Tiny OBVIATES NEED for many but we still need this
use File::Spec ();
use File::Spec::Functions qw/devnull tmpdir rootdir catdir catfile/;

# Still sometimes convenient...
use File::Basename qw(basename);

use File::BOM ();
use File::Which qw/which/;
use URI::file ();
use Guard qw(guard scope_guard);
use Fcntl qw(:flock :seek);
use Symbol qw/qualify_to_ref/;
use Scalar::Util qw/blessed openhandle/;
use List::Util qw/none all notall first min max/;
use Encode qw(encode decode);
use File::Glob qw/bsd_glob GLOB_NOCASE/;
use Digest::MD5 qw/md5_base64/;

lib/Spreadsheet/Edit/IO.pm view on Meta::CPAN

      #   If false, ONLY decimal numbers (thousands separators ok).
      .",true" # default: false (for import)
      # Tokens 9-10: not used on import
      .",,"
      # Token 11: Remove spaces; trim leading & trailing spaces when reading
      .","  # default: false
      # Token 12: not use on import
      .","
      # Token 13: Import "=..." as formulas instead of text?
      .","  # default: false i.e. do not recognize formulas
      # Token 14: "Automatically detected since LibreOffice 7.6" [BOM?]
      .","
    }
    else {
      undef
    }
  };

  my $ofilter = $opts->{soffice_outfilter} //= do{
    # OutputFilterName[:paramtoken,paramtoken,...]
    if ($opts->{cvt_to} =~ /^(?:csv|txt)$/) {

lib/Spreadsheet/Edit/IO.pm view on Meta::CPAN

      .","
      # Token 12: (LO 7.2+) sheet selections:
      #   0 or absent => the "first" sheet
      #   1-N => the Nth sheet (arrgh, can not specify name!!)
      #   -1 => export all sheets to files named filebasenamne.Sheetname.csv
      .",".($opts->{allsheets} ? -1 :
            $opts->{sheetname} ? die("add named-sheet support here") :
            0)
      # Token 13: Not used for export
      .","
      # Token 14: true to include BOM in the result
      #.","
    }
    else {
      undef
    }
  };

  # We can only control the output directory path, not the name of
  # an individual result file.  If $dst is a directory then the result
  # could theoretically output into it directly, but instead we always

lib/Spreadsheet/Edit/IO.pm view on Meta::CPAN

  # Input file basename sans any .suffix
  $opts{ifbase} = $opts{inpath_sans_sheet}->basename(qr/\.[^.]+/);

  %opts
}#_process_args

sub _binmode_slurp_and_log($$$) { # *without trying to seek*
  my ($fh, $ref2octets, $debug) = @_;
  binmode($fh);
  local $/ = undef;
  #$$ref2octets = <$fh>//"";  # Now known to not have a BOM.  <<< IS THIS REALLY TRUE???
  $$ref2octets = <$fh>;
  btwN \3,"Raw slurp-from-0 fh=$fh: ",(defined($$ref2octets) ? visO(substr($$ref2octets,0,300)) : "*undef*")
    if $debug;
  $$ref2octets //= "";
}
sub _slurp_ifnotslurped($$$) {
  my ($fh, $ref2octets, $debug) = @_;
  return if defined $$ref2octets;
  seek($fh, 0, SEEK_SET) or die $!;
  _binmode_slurp_and_log($fh, $ref2octets, $debug);

lib/Spreadsheet/Edit/IO.pm view on Meta::CPAN

  my $chars = eval {
    decode($enc, substr($$ref2octets, $start_pos),
               $check // (Encode::FB_CROAK|Encode::LEAVE_SRC) )
  };
  die($@,visnew->Useqq(0)->dvis('\n$enc $start_pos $ref2octets')) if $@;
  $chars
}


# Detect cvt_from and cvt_to from filename suffixes or by peeking at the data.
# If input is CSV, detect encoding (removing a BOM if present),
#   detect separator and quote characters, and generate default
#   {col_formats} which, e.g. reads items with leading zeroes
#   (e.g. Zip codes) as character data and not numbers.
#
# Unless the input is usable as-is it will be slurped into memory and the
# buffer returned by reference at $$ref2octets.  If slurped data is present
# the caller should use that instead of reading the original input;
# either way {input_encoding} should be used to decode.
#
# (If $$ref2octets was already defined it is assumed to contain

lib/Spreadsheet/Edit/IO.pm view on Meta::CPAN

      #_dump_fh("AAA $path open_input raw");
    }
    if (! seek($fh, 0, SEEK_SET)) {
      oops if defined $$ref2octets;
      _binmode_slurp_and_log($fh, $ref2octets, $debug);
      close $fh;
      $fh = undef;
      open $fh, "<:raw", $ref2octets or confess "BUG:in-mem open:$!";
      #_dump_fh("BBB unseekable, slurped");
    }
    my $bomenc = File::BOM::get_encoding_from_filehandle($fh);
    $start_pos = tell($fh);
    if ($bomenc) {
      btw dvis 'Input has BOM, $bomenc $start_pos' if $debug;
      $opts->{input_encoding} = $bomenc;
      binmode($fh); # unnecessary???
      binmode($fh, ":raw:encoding($bomenc):crlf") or die "binmode: $!";
    }
    #_dump_fh("CCC final");
  }
  my sub determine_input_encoding() {
    # If one encoding was specified by the user or implied by a BOM, use it;
    # otherwise try multiple encodings specified by the user or defaulted
    # until one seems to work.
    $opts->{input_encoding} //= $default_input_encodings;
    my @enclist = split m#,#, $opts->{input_encoding};
    return
      if @enclist == 1;
    _slurp_ifnotslurped($fh, $ref2octets, $debug);
    for my $enc (@enclist) {
      eval { _decode_slurped_data($enc, $ref2octets, $start_pos) };

lib/Spreadsheet/Edit/IO.pm view on Meta::CPAN

      defined($opts->{quote_char}) ? (quote_char=>$opts->{quote_char}) : (),
      defined($opts->{sep_char})   ? (sep_char=>$opts->{sep_char})     : (),
      auto_diag => 2, # throw on error
      @_
    );
    $csvopts{escape_char} = $csvopts{quote_char}; # must always be the same

    my $csv = Text::CSV->new (\%csvopts)
              or croak "Text::CSV->new: ", Text::CSV->error_diag(),
                       dvis('\n## %csvopts\n');
    seek($fh, $start_pos, SEEK_SET) or die $!; # skip over possible BOM
    my $rows;
    while (my $F = $csv->getline( $fh )) {
      push(@$rows, $F);
    }
    $rows
  }

  my sub determine_csv_q_sep($) {
    my ($r2rows) = @_;
    return

lib/Spreadsheet/Edit/IO.pm view on Meta::CPAN

          last Q;
        }
        warn dvis('$q sep=$sep did not work...\n'),vis($@),"\n"
          if $debug;
        _dump_fd("$q sep=$sep did not work") if $debug;
        $$r2rows = undef;
      }
    }
    unless (defined($$r2rows)) {
      #confess "Input file is not valid CSV (or we have a bug)\n"
      seek($fh, $start_pos, SEEK_SET) or die $!; # skip over possible BOM
      my $n = read($fh, my $somechars, 100);
      croak "ERROR READING input: $!" unless defined($n);
      if ($n == 0) {
        warn "File has NO CONTENT.  Treating it like an (empty) CSV\n"
          if $debug;
        $$r2rows = [];
      } else {
        my @layers = PerlIO::get_layers($fh);
        warn dvis '$start_pos @layers $somechars\n';
        if (open my $fh99, "<:raw", $opts->{inpath}) {

lib/Spreadsheet/Edit/IO.pm view on Meta::CPAN

    } else {
      return $outpath;
    }
  }

  my $done;
  if ($opts{cvt_from} eq $opts{cvt_to}) {  # csv to csv
    # Special cases: in & out are both CSVs
    if ($input_enc ne $output_enc || $start_pos != 0) {
      # Special case #1: In & out are CSV but using different encodings,
      #   or the same encoding but the input contained a BOM
      #   (which we never want in the output).
      #   With {allsheets} the output will be inside the {outpath} directory.
      my $dst = get_outcsv_path();
      warn "> Transcoding csv:  $input_enc -> $output_enc into ",qsh($dst),"\n"
        if $opts{debug};
      _slurp_ifnotslurped($fh, \$octets, $opts{debug});
      my $chars = _decode_slurped_data($input_enc, \$octets, $start_pos,
                                       Encode::FB_CROAK);
      $octets = encode($output_enc, $chars, Encode::FB_CROAK);
      $dst->spew_raw($octets);

lib/Spreadsheet/Edit/IO.pm view on Meta::CPAN


If a single argument is given it specifies INPUT; otherwise all arguments must
be specified as key => value pairs, and may include any options supported
by C<convert_spreadsheet>.

INPUT may be a csv or spreadsheet workbook path; if a spreadsheet,
then a single "sheet" is converted, specified by either a !SHEETNAME suffix
in the INPUT path, a separate C<< sheetname => SHEETNAME >> option,
or if unspecified to extract the only sheet (croaks if there is more than one).

The resulting file handle refers to a guaranteed-seekable BOM-less CSV file.
This will either be a temporary file (auto-removed at process exit),
or the original INPUT if it was already a seekable csv file without a BOM.

RETURNS: A ref to a hash containing the following:

 {
  fh        => the resulting open file handle
  csvpath   => the path {fh} refers to, which might be a temporary file
  sheetname => sheet name if the input was a spreadsheet
 }

=head2 convert_spreadsheet INPUT, cvt_to=>suffix, OPTIONS

lib/Spreadsheet/Edit/IO.pm view on Meta::CPAN

If C<outpath =E<gt> OUTPATH> is specifed then results are I<always> saved
to that path.  With C<allsheets> this must be a directory, which will be
created if necessary.

If C<outpath> is NOT specified in OPTIONS then, with one exception,
results are saved to a temporary file or directory and that path is returned
as C<outpath> in the result hash.
The exception is if no conversion is necessary
when the input file itself is returned as C<outpath>
(i.e. C<cvt_from> is the same as C<cvt_to> and, if 'csv',
there was no BOM and an encoding change is not needed).

In all cases C<outpath> in the result hash points to the results.

C<cvt_from> or C<cvt_to> are filename suffixes (sans dot)
e.g. "csv", "xlsx", etc., and are only required if INPATH or C<outpath>
parameters do not contain a .suffix .

OPTIONS may also include:

=over 4

lib/Spreadsheet/Edit/IO.pm view on Meta::CPAN

are converted to separate .csv files named "SHEETNAME.csv" in
the 'outpath' directory.  C<< cvt_to =E<gt> 'csv' >> is also requred.

=item input_encoding => ENCODING

Specifies the encoding of INPUT if it is a csv file.

ENCODING may be a comma-separated list of encoding
names which will be tried in the order until one seems to work.
If only one is specified it will be used without trying it first.
The default is "UTF-8,windows-1252".  If a BOM is present it overrides.

=item output_binmode => "..."

Used when writing csv file(s), defaults to ':raw:encoding(UTF-8):crlf'.

=item output_encoding => ENCODING

(Deprecated) Implies output_binmode => ':raw:encoding(ENCODIING):crlf'.

=item col_formats => [...]

t/io.pl view on Meta::CPAN


### This doesn't break the no-internals-mentioned test in t/60_all.t
### because 'carp' is never (normally) called, i.e. no warnings to users occur
##$Carp::Verbose = 1; # show backtrace on errors

use t_SSUtils;
use Encode qw/encode decode/;

use Spreadsheet::Edit qw/:all logmsg fmt_sheet cx2let let2cx sheet/;
use Spreadsheet::Edit::IO qw/convert_spreadsheet OpenAsCsv/;
use File::BOM qw/open_bom/;
use Data::Hexify qw/Hexify/;

use Test::Deep::NoTest qw/eq_deeply/;

my $cwd = fastgetcwd;
#my $input_xlsx_path = abs2rel(fast_abs_path("$Bin/../tlib/Test.xlsx"), $cwd);
my $tlib = path("$Bin/../tlib")->absolute;
my $input_xlsx_path = $tlib->child("Test.xlsx");

# Is LibreOffice (or some substitute) installed?

t/io.pl view on Meta::CPAN

          ivis('Result:   $got_chars\n') ;
    }
  }
} else {
  warn "# Spreadsheet tests skipped because soffice is not installed\n" unless $silent;
}

########################################################################
# Do lots of tests using specified test data.
# This is called twice, once with real data and again with empty data
# (the latter to verify that logically-empty files i.e. BOM only, work)
########################################################################
sub do_encoding_tests($) {
  my $exp_chars = shift;
  confess dvis 'Unexpected CR in $exp_chars' if $exp_chars =~ /\x{0D}/;
  my $exp_CRLFchars = $exp_chars =~ s/\n/\x0D\x0A/gr;
  my $emptydata = $exp_chars eq "";

  # Well, we can't prevent CRLF line endings being written on Windows
  # (when writing with :crlf).  This prevents comparing :crlf-written
  # results with our golden files written on Linux.

t/io.pl view on Meta::CPAN


  (my $local_testcsv_UTF16 = Path::Tiny->tempfile("local_testcsv_UTF16_XXXXX"))
    ->spew({binmode => ":raw:encoding(UTF-16)"}, $exp_chars);

  (my $local_testcsv_UTF16BECRLF = Path::Tiny->tempfile("local_testcsv_UTF16BECRLF_XXXXX"))
    ->spew({binmode => ":raw:encoding(UTF-16BE)"}, $exp_CRLFchars);

  (my $local_testcsv_UTF16LE = Path::Tiny->tempfile("local_testcsv_UTF16LE_XXXXX"))
    ->spew({binmode => ":raw:encoding(UTF-16LE)"}, $exp_chars);

  my $local_testcsv_UTF8CRLFWithBOM = Path::Tiny->tempfile("local_testcsv_UTF8CRLFWithBOM_XXXXX");
  {
    $local_testcsv_UTF8CRLF->spew({binmode => ":raw:encoding(UTF-8)"}, $exp_CRLFchars);
    $local_testcsv_UTF8CRLFWithBOM->spew({binmode => ":raw:encoding(UTF-8)"}, "\N{U+feff}");
    $local_testcsv_UTF8CRLFWithBOM->append_utf8($exp_CRLFchars);
  }

  # Confirm that conflicting specs are caught
  eval{my $dum=read_spreadsheet {sheetname => "Sheet1", verbose => $verbose}, $input_xlsx_path."!Another Sheet" };
  die "Conflicting sheetname opt and !suffix not caught" if $@ eq "";

  # "Read" a csv; should be a pass-thru without conversion when possible
  foreach (
     #  path              exp_passthru  input_encoding     output_encoding
     [$local_testcsv_UTF8,           1, undef             ,undef           ],
     [$local_testcsv_UTF8CRLF,       1, undef             ,undef           ],
     [$local_testcsv_UTF8,           0, undef             ,"UTF-16BE"      ],
     [$local_testcsv_UTF8CRLFWithBOM,0, undef             ,undef           ],
     [$local_testcsv_UTF8CRLFWithBOM,0,"UTF-32"           ,undef           ], # BOM overrides
     [$local_testcsv_UTF16,          0,"UTF-32"           ,undef           ], # BOM overrides
     [$local_testcsv_UTF16BECRLF,    0,"UTF-32,UTF-16BE"  ,undef           ], # no BOM
     [$local_testcsv_UTF16LE,        1,"UTF-16LE"         ,"UTF-16LE"      ], # no BOM
     [$local_testcsv_UTF16BECRLF,    1,"UTF-16BE"         ,"UTF-16BE"      ], # no BOM
     [$local_testcsv_UTF16BECRLF, (!$emptydata),"UTF-32,UTF-16BE"  ,"UTF-16BE"      ], # no BOM
     [$local_testcsv_UTF16LE,        0,"UTF-16LE,UTF-32"  ,"UTF-32"        ], # no BOM
     [$local_testcsv_UTF16LE,        0,"UTF-16LE,UTF-32"  ,"UTF-16BE"      ], # no BOM
         ) {
    my ($input_csvpath, $exp_passthru, $input_enc, $output_enc) = @$_;
    my @inenc_opts  = ($input_enc  ? (input_encoding  => $input_enc ) : ());
    my @outenc_opts = ($output_enc ? (output_encoding => $output_enc) : ());

    warn "--- Expect passthru=",!!$exp_passthru," for $input_csvpath ienc=",u($input_enc)," oenc=",u($output_enc),"\n"
      if $verbose;

    ### This is sometimes failing to detect $local_testcsv as a CSV on Solaris ;
    ### try to show enough information to debug it...

t/io.pl view on Meta::CPAN

             qx/(set -x; ls -ld $input_csvpath) 2>&1/,
             qx/(set -x; ls -ld $h_cs->{outpath}) 2>&1/,
             qx/(set -x; od -t x1a $h_cs->{outpath}) 2>&1/,
             " "
        );
      }
      # Re-read the file ourself and check the data
      my $input_data = do{
        my ($benc,$spill) = open_bom(my $inFH, $input_csvpath, ":raw");
        oops if $spill;
        my $enc = $benc || ($input_csvpath =~ s/.*UTF/UTF-/r =~ s/WithBOM|CRLF|_.*//gr);
        binmode($inFH, ":raw:encoding($enc):crlf") or die $!;
        warn dvis '##BBB $input_csvpath $enc $benc $spill\n' if $debug;
        local $/; <$inFH>
      };
      my $reslurp_data = do{
        my $oenc = $h_cs->{encoding} // die "{encoding} not set in result";
        path($h_cs->{outpath})->slurp( {binmode => ":raw:encoding($oenc):crlf"} );
      };
      unless ($input_data eq $reslurp_data) {
        die dvis 'Data from slurp with result encoding does not match!\n'

( run in 0.882 second using v1.01-cache-2.11-cpan-131fc08a04b )