Data-TableReader
view release on metacpan or search on metacpan
- Workaround for Libre Office row-max bug
- New Decoder::XLSX attribute 'xls_formatter', useful for casting
local datetimes to a known time zone.
- Prevent loading Text::CSV_XS if its version is too old.
Version 0.007 - 2018-03-15
- Fix various bugs in detect_input_format relevant to CSV files
- Enhanced debug-level logging during the table search algorithm
Version 0.006 - 2018-03-08
- Auto-detect CSV Unicode format by looking for BOM, unless handle
already has an IO layer to specify encoding.
- Improved logging, and logging API. 'log' attribute is no longer public
on Decoder objects.
Version 0.005 - 2017-11-26
- Remove 'filters' feature since it adds overhead and not much value
- Fix bug when creating multiple parallel record iterators
- Sanitize data headers written in logging messages
Version 0.004 - 2017-11-26
lib/Data/TableReader/Decoder/CSV.pm view on Meta::CPAN
}
my @layers= PerlIO::get_layers($fh);
if (($enc)= grep { /^encoding|^utf/ } @layers) {
# extract encoding name
return 'UTF-8' if $enc eq 'utf8';
return uc($1) if $enc =~ /encoding\(([^)]+)\)/;
return uc($enc); # could throw a parse error, but this is probably more useful behavior
}
# fh_start_pos will be set if we have already checked for BOM
if ($self->autodetect_encoding && !defined $self->_fh_start_pos) {
$self->_fh_start_pos(tell $fh or 0);
if (($enc= $self->_autodetect_bom($fh))) {
binmode($fh, ":encoding($enc)");
# re-mark the start after the BOM
$self->_fh_start_pos(tell $fh or 0);
return $enc;
}
}
return '';
}
# Private bookkeeping attributes used by encoding detection and iterator().

# File position (as reported by tell) recorded for the start of the data.
# It is re-marked after a BOM has been consumed, and is the position that
# iterator() seeks back to when the input has to be re-read.  It being
# defined also signals that the BOM check has already been performed.
has _fh_start_pos => ( is => 'rw' );
# The iterator most recently returned by iterator(), held as a weak reference.
# iterator() croaks while this is still set, because multiple iterators on
# one CSV stream are not supported.
has _iterator => ( is => 'rw', weak_ref => 1 );
# Reference to a scalar row counter (initialised to 0) that is shared with the
# iterator.  Its mere presence indicates that an iterator has been created
# before; a non-zero value makes iterator() seek back to _fh_start_pos.
# NOTE(review): presumably the iterator increments this as rows are read --
# confirm in the iterator implementation.
has _row_ref => ( is => 'rw' );
sub iterator {
my $self= shift;
croak "Multiple iterators on CSV stream not supported yet" if $self->_iterator;
my $parser= $self->parser;
my $fh= $self->file_handle;
my $row_ref= $self->_row_ref;
# Keeping this object is just an indication of whether an iterator has been used yet
if (!$row_ref) {
$self->_row_ref($row_ref= \(my $row= 0));
# trigger BOM detection if needed
my $enc= $self->encoding;
$self->_log->('debug', "encoding is ".($enc||'maybe utf8'));
# ensure _fh_start_pos is set
$self->_fh_start_pos(tell $fh or 0);
}
elsif ($$row_ref) {
$self->_log->('debug', 'Seeking back to start of input');
seek($fh, $self->_fh_start_pos, 0)
or die "Can't seek back to start of stream";
$$row_ref= 0;
lib/Data/TableReader/Decoder/CSV.pm view on Meta::CPAN
{
row => $row_ref,
fh => $fh,
origin => $self->_fh_start_pos,
}
);
$self->_iterator($i);
return $i;
}
# This design is simplified from File::BOM in that it ignores UTF-32
# and in any "normal" case it can read from a pipe with only one
# character to push back, avoiding the need to tie the file handle.
# It also checks for whether layers have already been enabled.
# It also avoids seeking to the start of the file handle, in case
# the user has deliberately sought to a particular position.
sub _autodetect_bom {
my ($self, $fh)= @_;
my $fpos= tell($fh);
local $!;
lib/Data/TableReader/Decoder/CSV.pm view on Meta::CPAN
if ($buf eq "\xFF\xFE") {
return 'UTF-16LE';
} elsif ($buf eq "\xFE\xFF") {
return 'UTF-16BE';
} elsif ($buf eq "\xEF\xBB" and read($fh, $buf, 1, 2) and $buf eq "\xEF\xBB\xBF") {
return 'UTF-8';
}
}
}
# It wasn't a BOM. Try to undo our read.
$self->_log->('debug', 'No BOM in stream, seeking back to start');
if (length $buf == 1) {
$fh->ungetc(ord $buf);
} elsif (!seek($fh, $fpos, 0)) {
# Can't seek
if ($fh->can('ungets')) { # support for FileHandle::Unget
$fh->ungets($buf);
} else {
croak "Can't seek input handle after BOM detection; You should set an encoding manually, buffer the entire input, or use FileHandle::Unget";
}
}
return;
}
# If you need to subclass this iterator, don't. Just implement your own.
# i.e. I'm not declaring this implementation stable, yet.
use Data::TableReader::Iterator;
BEGIN { @Data::TableReader::Decoder::CSV::_Iter::ISA= ('Data::TableReader::Iterator'); }
lib/Data/TableReader/Decoder/CSV.pm view on Meta::CPAN
This module makes an attempt at automatic unicode support:
=over
=item *
If the stream has a PerlIO encoding() on it, no additional decoding is done.
=item *
If the stream has a BOM (byte-order mark) for UTF-8 or UTF-16, it adds that
encoding with C<binmode>.
=item *
Else, it lets the parser decide. The default Text::CSV parser will
automatically upgrade UTF-8 sequences that it finds. (and, you can't disable
this without also disabling unicode received from IO layers, which seems like
a bug...)
=back
t/10-decoder-csv.t view on Meta::CPAN
subtest utf_bom => \&test_utf_bom;
done_testing;
# Plain ASCII CSV fixture: one header row followed by one data row,
# each terminated by a newline.
sub ascii {
    my @rows= ('a,b,c,d', '1,2,3,4');
    return join("\n", @rows) . "\n";
}
# UTF-8 fixture returned as raw bytes, starting with a byte-order mark.
# Once decoded, the lines following the BOM are:
#   "test\n"
#   "\x{8A66}\x{3057},1,2,3\n"
#   "\x{27000}\n"
sub utf8_bom {
    my $bom= "\xEF\xBB\xBF";
    my @lines= (
        "test",
        "\xE8\xA9\xA6\xE3\x81\x97,1,2,3",
        "\xF0\xA7\x80\x80",
    );
    return $bom . join("\n", @lines) . "\n";
}
sub utf16_le_bom {
return "\xFF\xFE"
."t\0e\0s\0t\0\n\0"
( run in 0.339 second using v1.01-cache-2.11-cpan-e9daa2b36ef )