BOM results from the CPAN

Text-AutoCSV

#!/usr/bin/perl
# ABSTRACT: helper module to automate the use of Text::CSV

# vim:tw=80 :et

# Text/AutoCSV.pm

#
# Written by SÃ©bastien Millet
#   March, July, August, September 2016
#   January - August 2017
#

package Text::AutoCSV;
$Text::AutoCSV::VERSION = '1.2.0';
use strict;
use warnings;

my $PKG = "Text::AutoCSV";

require Exporter;
our @ISA       = 'Exporter';
our @EXPORT_OK = qw(remove_accents);

use Carp;
use Params::Validate qw(validate validate_pos :types);
use List::MoreUtils qw(first_index indexes);
use Fcntl qw(SEEK_SET);
use File::BOM;
use Text::CSV;
use DateTime;

# DateTime::Format::Strptime 1.70 does not work properly with us.
# Actually all version as of 1.63 are fine, except 1.70.
use DateTime::Format::Strptime 1.71;
use Class::Struct;
use Unicode::Normalize;

# lock_keys is used to prevent accessing non existing keys
# Credits: 3381159 on http://stackoverflow.com
#          "make perl shout when trying to access undefined hash key"
use Hash::Util qw(lock_keys);
use Memoize;

# FIXME
# Not needed in release -> should be always commented unless at dev time
#use feature qw(say);
#use Data::Dumper;
#$Data::Dumper::Sortkeys = 1;

# Set to 1 if you wish to turn on debug without touching caller's code
our $ALWAYS_DEBUG = 0;

# Keep it set to 0 unless you know what you're doing!
# Note
#   Taken into account only if debug is set.
my $DEBUG_DATETIME_FORMATS = 0;

# The below is taken into account only if $DEBUG_DATETIME_FORMATS is set.
# It becomes really MASSIVE debug output.
my $DEBUG_DATETIME_FORMATS_EVEN_MORE = 0;

#
# Uncomment to replace carp and croak with cluck and confess, respectively
# Also reachable with perl option:
#   -MCarp=verbose
# See 'perldoc Carp'.
#
#$Carp::Verbose = 1;

# * *************** *
# * BEHAVIOR TUNING *
# * *************** *

# * **************************************************** *
# * ALL THE VARIABLES BELOW ARE RATHER LOW LEVEL.        *
# * IF YOU UPDATE IT, IT WILL LIKELY BREAK THE TEST PLAN *
# * **************************************************** *

my $DEF_SEARCH_CASE           = 0; # Case insensitive search by default
my $DEF_SEARCH_TRIM           = 1; # Trim values by default
my $DEF_SEARCH_IGNORE_ACCENTS = 1; # Ignore accents
my $DEF_SEARCH_IGNORE_EMPTY   = 1; # Ignore empty strings in searches by default
my $DEF_SEARCH_VALUE_IF_NOT_FOUND = undef; # If not found, returned field value is undef
my $DEF_SEARCH_VALUE_IF_AMBIGUOUS = undef; # If more than one record found by search (when a
                                           # unique value is expected), return undef
my $DEF_SEARCH_IGNORE_AMBIGUOUS = 1; # By default, ignore the fact that multiple records are
                                     # found by search and return the first record found

lib/Text/AutoCSV.pm view on Meta::CPAN


    confess "Hey! what is this second argument?" if defined($nonexistent_arg);

    my $new_count = $#{ $self->{_flat} } + 1;

    my $updated_max = 0;
    if ( $new_count > $self->get_max_in_mem_record_count() ) {
        $self->_set_max_in_mem_record_count($new_count);
        $updated_max = 1;
    }

    $self->{_in_mem_record_count} = $new_count;
    if ($_debug) {
        print( $_debugh "_in_mem_record_count updated, set to $new_count" );
        print( $_debugh " (also updated max)" ) if $updated_max;
        print( $_debugh "\n" );
    }
}

sub _detect_inh_encoding {
    my ( $self, $enc, $via, $in_file, $detect_enc ) = @_;
    my $_debug  = $self->{_debug};
    my $_debugh = $self->{_debugh};

    $enc = $DEFAULT_IN_ENCODING if ( !defined $enc ) or $enc eq '';

    my @encodings = split( /\s*,\s*/, $enc );

    confess "Oups! No encoding to try?" if $#encodings < 0;

    print( $_debugh "[ST] _detect_inh_encoding(): start\n" ) if $_debug;

    my $wrn = 0;
    my $m;
    my $m0;
    my $ee;
    for my $e (@encodings) {
        $ee = $e;
        my $viadef = _get_def( $via, '' );
        $m = ":encoding($e)$viadef";
        $m0 = $m unless defined($m0);

        last unless $detect_enc;

        confess "Oups! in_file not defined?"
          if ( !defined $in_file )
          or $in_file eq '';

        print( $_debugh "   Checking encoding '$e' / '$m'\n" ) if $_debug;
        $wrn = 0;

        $self->_register_pass("check $e encoding");

        my $utf8_bom = 0;
        if ( _is_utf8($e) ) {
            if ( open my $fh, '<:raw', $in_file ) {
                my $bom;
                read $fh, $bom, 3;
                if ( length($bom) == 3 and $bom eq "\xef\xbb\xbf" ) {
                    if ( !defined($via) ) {
                        $m .= ":via(File::BOM)";
                    }
                }
                close $fh;
            }
        }

        my $inh;
        if ( !open( $inh, "<", $in_file ) ) { ## no critic (InputOutput::RequireBriefOpen)
            $self->_print_error("unable to open file '$in_file': $!");
            return ( $encodings[0], $m0 );
        }
        binmode $inh, $m;

        # TURN OFF WARNINGS OUTPUT

        {
            local $SIG{__WARN__} = sub {
                $wrn++;

        # Uncomment only for debug!
        # Otherwise you'll get quite a good deal of output at each execution :-)
        #                print(STDERR @_);
            };
            while (<$inh>) { }
        }

        # WARNINGS ARE BACK ON

        close $inh;
        print( $_debugh "     '$m' counts $wrn warning(s)\n" ) if $_debug;

        last if $wrn == 0;
    }

    if ( $wrn >= 1 ) {
        $self->_print_warning(
                "encoding warnings encountered during initial check, "
              . "using '$encodings[0]'" );
        return ( $encodings[0], $m0 );
    }

    confess "Oups! undef encoding string?" unless defined($m);

    print( $_debugh "   Detected encoding string '$ee' / '$m'\n" ) if $_debug;
    return ( $ee, $m );
}

#
# Each of these functions brings status to the next value (current status + 1).
# Each of these functions returns 0 if an error occured, 1 if all good
#
my @status_forward_functions = (
    "_S1_init_input",                 # To go from S0 to S1
    "_S2_init_fields_from_header",    # To go form S1 to S2
    "_S3_init_fields_extra",          # To go from S2 to S3
    "_S4_read_all_in_mem",            # To go from S3 to S4
);

sub _status_reset {
    my $self = shift;

lib/Text/AutoCSV.pm view on Meta::CPAN

        $tmp = $ar->[ $named_fields{$field} ]
          if defined($field)
          and $field ne '';

        # Put here any post-processing of value
        # WARNING
        #   $tmp can be undef
        # ...

        $final[$i] = $tmp;
    }
    $_[1] = [@final];
}

sub write {
    my $self = shift;

    validate_pos(@_);

    return unless $self->_status_forward('S3');

    my $verbose = $self->{verbose};
    my $_debug  = $self->{_debug};
    my $_debugh = $self->{_debugh};

    my $out_file = $self->{out_file};

    my %stats;

    $self->_print("-- $out_file writing start\n") if $verbose;
    my $rows_written = 0;

    my $outh = $self->{outh};

    $self->{_close_outh_when_finished} = 0;
    unless ( defined($outh) ) {
        if ( $out_file eq '' ) {
            $outh = \*STDOUT;
        }
        else {
            unless ( open( $outh, '>', $out_file ) ) { ## no critic (InputOutput::RequireBriefOpen)
                $self->_print_error("unable to open file '$out_file': $!");
                return;
            }
            $self->{_close_outh_when_finished} = 1;
        }
        $self->{outh} = $outh;
    }

    unless ( $self->{_leave_encoding_alone} ) {
        my $enc = (
            defined( $self->{_inh_encoding} )
            ? $self->{_inh_encoding}
            : $DEFAULT_OUT_ENCODING
        );

        # out_encoding option takes precedence
        $enc = $self->{out_encoding} if defined( $self->{out_encoding} );
        my $m = ":encoding($enc)";
        if ( _is_utf8($enc) and $self->{out_utf8_bom} ) {
            $m .= ':via(File::BOM)';
        }

        if ( $OS_IS_PLAIN_WINDOWS and $FIX_PERLMONKS_823214 ) {

  # Tested with UTF-16LE, NOT tested with UTF-16BE (it should be the same story)
            $m = ":raw:perlio:$m:crlf" if $enc =~ /^utf-?16/i;
        }

        binmode $outh, $m;
        print( $_debugh "Encoding string used for output: $m\n" ) if $_debug;
    }

    my $escape_char = $self->{escape_char};
    my $quote_char  = $self->{quote_char};

    my %opts;
    $opts{binary} = 1;
    $opts{eol}    = "\n";

    $opts{sep_char} = $self->{sep_char}     if defined( $self->{sep_char} );
    $opts{sep_char} = $self->{out_sep_char} if defined( $self->{out_sep_char} );

    $opts{quote_char} = $self->{quote_char} if defined( $self->{quote_char} );
    $opts{quote_char} = $self->{out_quote_char}
      if defined( $self->{out_quote_char} );

    $opts{escape_char} = $self->{escape_char}
      if defined( $self->{escape_char} );
    $opts{escape_char} = $self->{out_escape_char}
      if defined( $self->{out_escape_char} );

    $opts{always_quote} = $self->{_is_always_quoted};
    $opts{always_quote} = $self->{out_always_quote}
      if defined( $self->{out_always_quote} );

    my $csvout = Text::CSV->new( {%opts} );
    if ( !defined($csvout) ) {
        $self->_print_error("error creating output Text::CSV object");
        return;
    }

    my $write_filter_hr =
      _get_def( $self->{out_filter}, $self->{write_filter_hr} );

    if (
        (
            $self->{has_headers}
            and (
                !(
                    defined( $self->{out_has_headers} )
                    and ( !$self->{out_has_headers} )
                )
            )
        )
        or $self->{out_has_headers}
      )
    {
        my $ar = [];
        if ( $self->{has_headers} ) {
            $ar = $self->{_headers};

lib/Text/AutoCSV.pm view on Meta::CPAN

another behavior. A header name of 'Phone Number' will get an internal column
name of 'PHONENUMBER' (default behavior, if none of fields_hr, fields_ar and
fields_column_names attributes is specified), and one may prefer 'PHONE NUMBER'
or 'phone number' or whatsoever.

2- By specifying a list of columns using either of fields_hr or fields_ar, you
not only map column names as found in the header line to internal column names:
you also I<request> these columns to be available. If one of the requested
columns cannot be found, Text::AutoCSV will croak (default) or print an error
and return an undef object (if created with C<croak_if_error =E<gt> 0>).

=item fields_column_names

Array ref of column internal names, in the order of columns in file. This
attribute works like the C<column_names> attribute of Text::CSV. It'll just
assign names to columns one by one, regardless of what the header line contains.
It'll work also if the file has no header line.

Example:

    my $csv = Text::AutoCSV->new(in_file => 'in.csv',
        fields_column_names => ['My COL1', '', 'My COL3']);

=item out_file

Output file when executing the L</write> method.

If not specified or empty, write to standard output.

Example:

    my $csv = Text::AutoCSV->new(in_file => 'in.csv', out_file => 'out.csv');

=item outh

File handle to write CSV data to when executing the L</write> method.
Normally you don't want to specify this attribute.

C<outh> is useful if you don't like the way Text::AutoCSV opens the output file
for you.

Example:

    my $outh = open "myin.csv', ">>";
    my $csv = Text::AutoCSV->new(in_file => 'in.csv', has_headers => 0,
        outh => $outh);

=item out_encoding

Enforce the encoding of output.

Value by default: input encoding

Example:

    my $csv = Text::AutoCSV->new(in_file => 'in.csv', out_file => 'out.csv',
        out_encoding => 'UTF-16');

=item out_utf8_bom

Enforce BOM (Byte-Order-Mark) on output, when it is UTF8. If output encoding is
not UTF-8, this attribute is ignored.

B<NOTE>

UTF-8 needs no BOM (there is no Byte-Order in UTF-8), and in practice,
UTF8-encoded files rarely have a BOM.

Using this attribute is not recommended. It is provided for the sake of
completeness, and also to produce Unicode files Microsoft EXCEL will be happy to
read.

At first sight it would seem more logical to make EXCEL happy with something
like this:

    out_encoding => 'UTF-16'

But... While EXCEL will identify UTF-16 and read it as such, it will not take
into account the BOM found at the beginning. In the end the first cell will have
2 useless characters prepended. The only solution the author knows to workaround
this issue if to use UTF-8 as output encoding, and enforce a BOM. That is, use:

    ..., out_encoding => 'UTF-8', out_utf8_bom => 1, ...

=item out_sep_char

Enforce the output CSV separator character.

Value by default: input separator

Example:

    my $csv = Text::AutoCSV->new(in_file => 'in.csv', out_file => 'out.csv',
        out_sep_char => ',');

=item out_quote_char

Enforce the output CSV quote character.

Value by default: input quote character

Example:

    my $csv = Text::AutoCSV->new(in_file => 'in.csv', out_file => 'out.csv',
        out_quote_char => '"');

=item out_escape_char

Enforce the output CSV escape character.

Value by default: input escape character

Example:

    my $csv = Text::AutoCSV->new(in_file => 'in.csv', out_file => 'out.csv',
        out_escape_char_char => '\\');

=item out_always_quote

If true, quote all fields of output (set always_quote of Text::CSV).

If false, don't quote all fields of output (don't set C<always_quote> of
Text::CSV).

Value by default: same as what is found in input

While reading input, Text::AutoCSV works out whether or not all fields were
quoted. If yes, then the output Text::CSV object has the always_quote attribute
set, if no, then the output Text::CSV object does not have this attribute set.

Example:

    my $csv = Text::AutoCSV->new(in_file => 'in.csv', out_file => 'out.csv',
        out_always_quote => 1);

=item out_has_headers

If true, when writing output, write a header line on first line.

If false, when writing output, don't write a header line on first line.

( run in 0.556 second using v1.01-cache-2.11-cpan-39bf76dae61 )