BOM results from the CPAN

Data-TableReader

view release on metacpan or search on metacpan

 - Workaround for Libre Office row-max bug
 - New Decoder::XLSX attribute 'xls_formatter', useful for casting
   local datetimes to a known time zone.
 - Prevent loading Text::CSV_XS if its version is too old.

Version 0.007 - 2018-03-15
 - Fix various bugs in detect_input_format relevant to CSV files
 - Enhanced debug-level logging during the table search algorithm

Version 0.006 - 2018-03-08
 - Auto-detect CSV Unicode format by looking for BOM, unless handle
   already has an IO layer to specify encoding.
 - Improved logging, and logging API.  'log' attribute is no longer public
   on Decoder objects.

Version 0.005 - 2017-11-26
 - Remove 'filters' feature since it adds overhead and not much value
 - Fix bug when creating multiple parallel record iterators
 - Sanitize data headers written in logging messages

Version 0.004 - 2017-11-26

lib/Data/TableReader/Decoder/CSV.pm view on Meta::CPAN

	}
	
	my @layers= PerlIO::get_layers($fh);
	if (($enc)= grep { /^encoding|^utf/ } @layers) {
		# extract encoding name
		return 'UTF-8' if $enc eq 'utf8';
		return uc($1) if $enc =~ /encoding\(([^)]+)\)/;
		return uc($enc); # could throw a parse error, but this is probably more useful behavior
	}
	
	# fh_start_pos will be set if we have already checked for BOM
	if ($self->autodetect_encoding && !defined $self->_fh_start_pos) {
		$self->_fh_start_pos(tell $fh or 0);
		if (($enc= $self->_autodetect_bom($fh))) {
			binmode($fh, ":encoding($enc)");
			# re-mark the start after the BOM
			$self->_fh_start_pos(tell $fh or 0);
			return $enc;
		}
	}
	return '';
}


# File position (as reported by tell) at which the CSV data starts.  It is
# re-marked after a BOM has been consumed, and is the seek target used when
# the stream is rewound for a fresh pass.
has _fh_start_pos => ( is => 'rw' );
# Weak reference to the iterator most recently returned by ->iterator.  While
# it is still set, ->iterator croaks instead of creating a second iterator.
has _iterator => ( is => 'rw', weak_ref => 1 );
# Reference to the scalar row counter handed to the iterator.  Its presence
# also indicates that an iterator has already been requested once.
has _row_ref => ( is => 'rw' );
sub iterator {
	my $self= shift;
	croak "Multiple iterators on CSV stream not supported yet" if $self->_iterator;
	my $parser= $self->parser;
	my $fh= $self->file_handle;
	my $row_ref= $self->_row_ref;
	# Keeping this object is just an indication of whether an iterator has been used yet
	if (!$row_ref) {
		$self->_row_ref($row_ref= \(my $row= 0));
		# trigger BOM detection if needed
		my $enc= $self->encoding;
		$self->_log->('debug', "encoding is ".($enc||'maybe utf8'));
		# ensure _fh_start_pos is set
		$self->_fh_start_pos(tell $fh or 0);
	}
	elsif ($$row_ref) {
		$self->_log->('debug', 'Seeking back to start of input');
		seek($fh, $self->_fh_start_pos, 0)
			or die "Can't seek back to start of stream";
		$$row_ref= 0;

lib/Data/TableReader/Decoder/CSV.pm view on Meta::CPAN

		{
			row => $row_ref,
			fh  => $fh,
			origin => $self->_fh_start_pos,
		}
	);
	$self->_iterator($i);
	return $i;
}

# This design is simplified from File::BOM in that it ignores UTF-32
# and in any "normal" case it can read from a pipe with only one
# character to push back, avoiding the need to tie the file handle.
# It also checks for whether layers have already been enabled.
# It also avoids seeking to the start of the file handle, in case
# the user deliberately seeked to a position.
sub _autodetect_bom {
	my ($self, $fh)= @_;
	my $fpos= tell($fh);
	
	local $!;

lib/Data/TableReader/Decoder/CSV.pm view on Meta::CPAN

			if ($buf eq "\xFF\xFE") {
				return 'UTF-16LE';
			} elsif ($buf eq "\xFE\xFF") {
				return 'UTF-16BE';
			} elsif ($buf eq "\xEF\xBB" and read($fh, $buf, 1, 2) and $buf eq "\xEF\xBB\xBF") {
				return 'UTF-8';
			}
		}
	}
	
	# It wasn't a BOM.  Try to undo our read.
	$self->_log->('debug', 'No BOM in stream, seeking back to start');
	if (length $buf == 1) {
		$fh->ungetc(ord $buf);
	} elsif (!seek($fh, $fpos, 0)) {
		# Can't seek
		if ($fh->can('ungets')) { # support for FileHandle::Unget
			$fh->ungets($buf);
		} else {
			croak "Can't seek input handle after BOM detection; You should set an encoding manually, buffer the entire input, or use FileHandle::Unget";
		}
	}
	return;
}

# If you need to subclass this iterator, don't.  Just implement your own.
# i.e. I'm not declaring this implementation stable, yet.
# The private _Iter package inherits from Data::TableReader::Iterator; @ISA is
# assigned inside BEGIN so the relationship is in place at compile time.
use Data::TableReader::Iterator;
BEGIN { @Data::TableReader::Decoder::CSV::_Iter::ISA= ('Data::TableReader::Iterator'); }

lib/Data/TableReader/Decoder/CSV.pm view on Meta::CPAN

This module makes an attempt at automatic unicode support:

=over

=item *

If the stream has a PerlIO encoding() on it, no additional decoding is done.

=item *

If the stream has a BOM (byte-order mark) for UTF-8 or UTF-16, it adds that
encoding with C<binmode>.

=item *

Otherwise, it lets the parser decide.  The default Text::CSV parser will
automatically upgrade UTF-8 sequences that it finds.  (You can't disable
this without also disabling Unicode received from IO layers, which seems like
a bug.)

=back

t/10-decoder-csv.t view on Meta::CPAN

subtest utf_bom => \&test_utf_bom;
done_testing;

# Fixture: plain 7-bit ASCII CSV text with one header row and one data row,
# each terminated by a newline.
sub ascii {
	my @rows= ('a,b,c,d', '1,2,3,4');
	return join '', map { "$_\n" } @rows;
}
# Fixture: raw bytes of a UTF-8 document that begins with a byte-order mark.
# Once decoded, the content after the BOM is:
#   "test\n"
#   "\x{8A66}\x{3057},1,2,3\n"
#   "\x{27000}\n"
sub utf8_bom {
	my $bom= "\xEF\xBB\xBF";
	my @encoded_lines= (
		"test\n",
		"\xE8\xA9\xA6\xE3\x81\x97,1,2,3\n",
		"\xF0\xA7\x80\x80\n",
	);
	return join '', $bom, @encoded_lines;
}
sub utf16_le_bom {
	return "\xFF\xFE"
		."t\0e\0s\0t\0\n\0"

( run in 0.339 second using v1.01-cache-2.11-cpan-e9daa2b36ef )