BOM results from the CPAN

Module-Generic

view release on metacpan or search on metacpan

lib/Module/Generic/File.pm view on Meta::CPAN

    $opts->{sep_char} //= $sep;
    my $defaults =
    {
        binary      => 1,
        quote_char  => '"',
        escape_char => '"',
        sep_char    => ',',
        skip_empty_rows => sub{ [] },
    };
    foreach my $k ( keys( %$defaults ) )
    {
        next if( exists( $opts->{ $k } ) );
        $opts->{ $k } = $defaults->{ $k };
    }

    my @core_options = qw(
        accessors allow_loose_escapes allow_loose_quotes always_quote
        allow_unquoted_escape allow_whitespace auto_diag binary blank_is_undef callbacks
        comment_str decode_utf8 diag_verbose eol empty_is_undef escape_char escape_null
        formula formula_handling keep_meta_info quote quote_binary quote_char quote_empty
        quote_space sep sep_char skip_empty_rows strict types undef_str verbatim
    );
    # my @text_csv_opts = grep{ exists( $opts->{ $_ } ) } @core_options;
    my $args = {};
    foreach my $k ( @core_options )
    {
        if( exists( $opts->{ $k } ) )
        {
            # We remove the options from $opts, so we can pass the remaining options to our 'open' method.
            # We could do without, but it is cleaner this way.
            $args->{ $k } = delete( $opts->{ $k } );
        }
    }

    # If the user prefers a callback for each CSV row
    my $callback;
    if( exists( $opts->{callback} ) &&
        ref( $opts->{callback} // '' ) eq 'CODE' )
    {
        $callback = $opts->{callback};
    }

    my $csv;
    # try-catch
    local $@;
    eval
    {
        $csv = Text::CSV->new( $args );
    };
    if( $@ )
    {
        return( $self->error( "Error instantiating a new Text::CSV object: $@" ) );
    }

    my $headers = $opts->{headers} || [];
    my $io;
    my $opened = $io = $self->opened;
    my( $enc, $first_row );
    if( $opts->{detect_bom} )
    {
        $self->__message( 6, "Checking for BOM in CSV file." );
        if( $opened )
        {
            $self->close;
        }
        $self->__message( 6, "Opening the CSV file ${file} in raw mode." );
        $opened = $io = $self->open( '<', { binmode => 'raw' } ) || return( $self->pass_error );
        my $buffer = $self->getline;
        defined( $buffer ) or return( $self->pass_error );
        # See perlunicode
        my $re2enc = [
            # Big endian: 0x00 0x00 0xFE 0xFF
            qr/\x00\x00\xfe\xff/    => 'utf-32be',
            # Little endian: 0xFF 0xFE 0x00 0x00
            qr/\xff\xfe\x00\x00/    => 'utf-32le',
            qr/\xef\xbb\xbf/        => 'utf-8',
            # Big endian: 0xFE 0xFF
            qr/\xfe\xff/            => 'utf-16be',
            # Little endian: 0xFF 0xFE
            qr/\xff\xfe/            => 'utf-16le',
            qr/\xf7\x64\x4c/        => 'utf-1',
            qr/\xdd\x73\x66\x73/    => 'utf-ebcdic',
            qr/\x0e\xfe\xff/        => 'scsu',
            qr/\xfb\xee\x28/        => 'bocu-1',
            qr/\x84\x31\x95\x33/    => 'gb-18030',
            qr/\x{feff}/            => '',
        ];
        my $seek_offset = 0;
        for( my $i = 0; $i < scalar( @$re2enc ); $i += 2 )
        {
            my $re = $re2enc->[$i];
            my $e  = $re2enc->[$i+1];
            if( $buffer =~ s/^($re)// )
            {
                $seek_offset = CORE::length($1);
                $enc = $e;
                last;
            }
        }

        if( !CORE::length( $buffer ) )
        {
            return( $self->error( "This file ${file} is an empty CSV file with a BOM nonetheless." ) );
        }

        if( defined( $enc ) )
        {
            $enc = '' if( $enc eq 'utf-ebcdic' );
        }

        $io->seek( $seek_offset, SEEK_SET ) ||
            return( $self->error( "Unable to seek in file ${file}: $!" ) );
        if( CORE::length( $enc // '' ) )
        {
            # Mark the file handle with the right perl IO layer.
            $io->binmode( ":encoding($enc)" );
        }
        # Default to $opts->{encoding} if it was provided, and we have no encoding found from the BOM.
        elsif( CORE::length( $opts->{binmode} // '' ) )
        {
            $io->binmode( ":encoding($opts->{binmode})" );
        }

        eval
        {
            $first_row = $csv->getline($io);
        };
        if( $@ )
        {
            return( $self->error( "Unable to parse CSV header line: $@" ) );
        }
        elsif( !$first_row )
        {
            # Make sure error_diag() is called in scalar context.
            return( $self->error( "Unable to parse CSV header line in file ${file}: ", scalar( $csv->error_diag ) ) );
        }
    }

    # If we detected the BOM, we will have an $opened (i,e, $io) value set.
    if( !$opened )
    {
        $io = $self->open( '<', $opts ) || return( $self->pass_error );
    }
    elsif( !$self->can_read )
    {
        return( $self->error( "CSV file \"${file}\" is opened, but not in read mode. Cannot load data from it." ) );
    }

    my $cols = $opts->{cols} // $opts->{columns};
    my $process_headers;
    if( !defined( $cols ) || Scalar::Util::reftype( $cols // '' ) ne 'ARRAY' )
    {
        if( $headers eq 'auto' || 
            $headers eq 'uc' || 
            $headers eq 'lc' || 
            ref( $headers ) eq 'CODE' || 
            ref( $headers ) eq 'HASH' )
        {
            local $@;
            eval
            {
                my $row = $first_row ? $first_row : $csv->getline( $io );
                undef( $first_row );
                if( $headers eq 'uc' )
                {
                    my @new = map( uc( $_ ), @$row );
                    $headers = \@new;
                }
                elsif( $headers eq 'lc' )
                {
                    my @new = map( lc( $_ ), @$row );
                    $headers = \@new;
                }
                elsif( ref( $headers ) eq 'CODE' )
                {
                    local $@;
                    my $new = eval{ $headers->( $row ); };
                    if( $@ )
                    {
                        return( $self->error( "Callback returned an error: $@" ) );
                    }
                    if( defined( $new ) )
                    {
                        if( Scalar::Util::reftype( $new ) ne 'ARRAY' )
                        {
                            warn( "Value returned by the callback is not an array reference or an array object." ) if( $self->_is_warnings_enabled( 'Module::Generic' ) );
                            $headers = [];
                        }
                        else
                        {
                            $headers = $new;
                        }
                    }
                }
                elsif( ref( $headers ) eq 'HASH' )
                {
                    my $new = [map{ CORE::length( $headers->{ $_ } // '' ) ? $headers->{ $_ } : $_ } @$row];
                    $headers = $new;
                }

lib/Module/Generic/File.pm view on Meta::CPAN

            my $row_no_cols = scalar( @$row );
            # If the number of columns at this line does not match the number of headers columns, and the warnings is enabled, we warn about it.
            if( $has_cols && 
                $row_no_cols != $no_cols && 
                $is_warnings_enabled &&
                ++$too_many_warnings <= 10 )
            {
                warn( "Number of columns (${row_no_cols}) at line $n does not match the number of columns we have (${no_cols}) in CSV file ${file}" );
            }
            my $ref = {};
            @$ref{ @$cols } = @$row;
            if( defined( $callback ) )
            {
                my $rv = eval
                {
                    $callback->( $ref );
                };
                if( $@ )
                {
                    $self->error( "An error occurred executing callback with data from CSV at line $n", { cause => { data => $ref } } );
                    return(0);
                }
                # Returned value from callback is defined, but explicitly false, we indicate that this is the end
                elsif( defined( $rv ) && !$rv )
                {
                    return( $rv );
                }
            }
            else
            {
                push( @$all, $ref );
            }
        }
        else
        {
            if( defined( $callback ) )
            {
                my $rv = eval
                {
                    $callback->( $row );
                };
                if( $@ )
                {
                    $self->error( "An error occurred executing callback with data from CSV at line $n", { cause => { data => $row } } );
                    return(0);
                }
                # Returned value from callback is defined, but explicitly false, we indicate that this is the end
                elsif( defined( $rv ) && !$rv )
                {
                    return( $rv );
                }
            }
            else
            {
                push( @$all, $row );
            }
        }
        return(1);
    };

    # If we have a initial first raw from our detect BOM operation earlier, then we use that line of data now.
    $process->( $first_row ) if( $first_row );

    # Read each line of CSV data until the end.
    my $row;
    # while( my $row = $csv->getline( $io ) )
    while( $io->opened && ( $row = eval{ $csv->getline( $io ) } ) )
    {
        $n++;
        my $rv = $process->( $row );
        if( !$rv )
        {
            if( $self->error )
            {
                return( $self->pass_error );
            }
            return( $self->new_array );
        }
    }
    if( $@ )
    {
        return( $self->error( "Error trying to load data from CSV file $file: $@" ) );
    }
    elsif( !$csv->eof && ( my $error = $csv->error_diag ) )
    {
        warn( "Warning: error reading a CSV line: ${error}" ) if( $is_warnings_enabled );
    }
    # Return a Module::Generic::Array object.
    return( $self->new_array( $all ) );
}

sub load_json
{
    my $self = shift( @_ );
    my $opts = $self->_get_args_as_hash( @_ );
    # Inherited from Module::Generic
    my $j = $self->new_json || return( $self->pass_error );
    if( exists( $opts->{boolean_values} ) && 
        $self->_is_array( $opts->{boolean_values} ) )
    {
        $j->boolean_values( @{$opts->{boolean_values}} );
    }
    if( exists( $opts->{filter_json_object} ) && 
        ref( $opts->{filter_json_object} ) eq 'CODE' )
    {
        $j->filter_json_object( $opts->{filter_json_object} );
    }
    if( exists( $opts->{filter_json_single_key_object} ) && 
        ref( $opts->{filter_json_single_key_object} ) eq 'HASH' &&
        scalar( keys( %{$opts->{filter_json_single_key_object}} ) ) &&
        ref( $opts->{filter_json_single_key_object}->{ [keys( %{$opts->{filter_json_single_key_object}} ) ]->[0] } ) eq 'CODE' )
    {
        $j->filter_json_single_key_object( %{$opts->{filter_json_single_key_object}} );
    }
    if( exists( $opts->{decode_prefix} ) && 
        defined( $opts->{decode_prefix} ) && 
        CORE::length( $opts->{decode_prefix} ) )
    {
        $j->decode_prefix( $opts->{decode_prefix} );
    }
    my $json = $self->load_utf8;

lib/Module/Generic/File.pm view on Meta::CPAN

See also L</symlink>

=head2 load

Assuming this element is an existing file, this will load its content and return it as a regular string.

If the C<binmode> used on the file is C<:unix>, then this will call L<perlfunc/read> to load the file content, otherwise it localises the input record separator C<$/> and read the entire content in one go. See L<perlvar/$INPUT_RECORD_SEPARATOR>

If this method is called on a directory object, it will return undef.

=head2 load_csv

    my $array_object = $file->load_csv( %options ) ||
        die( $file->error );
    # Passing the options as an hash reference
    my $array_object = $file->load_csv( $options ) ||
        die( $file->error );

This takes an optional hash or hash reference of options, loads the CSV file data, and returns them an L<array object|Module::Generic::Array> of rows. Each row can be either an array reference or a hash reference, depending on the options provided.

If an empty row is found in the CSV, an equally empty row of hash reference, or array reference will be added to the result set. This behavior can be modified with the C<skip_empty_rows> option passed to L<Text::CSV> (default is to return an empty ar...

This method requires L<Text::CSV> to work, and it will attempt to load it dynamically.

If an error occurs, it will set an L<error object|Module::Generic::Exception>, and returns an empty list in list context, or C<undef> in scalar context.

Below are the supported options. Any other options will be passed directly to L<Text::CSV> upon object instantiation.

=over 4

=item * C<binmode>

Optional. Specifies the encoding to apply when reading the file. This defaults to C<utf-8>

=item * C<callback>

Optional. If provided, must be a code reference, such as an anonymous subroutine or a reference to a subroutine. This callback will be called for each parsed CSV row. The callback receives one argument: an hash reference, or an array reference repres...

The type of data the callback receives (array reference or hash reference) depends on other options you provide.

For example, if you do not provide any columns using C<cols> or C<columns>, or if you expect an array reference to be returned, then the callback will be called with an array reference of columns.

Any exception thrown during the execution of the callback will be caught, and an L<error object|Module::Generic::Exception> will be set, and an empty list will be returned in list context, and C<undef> will be returned in scalar context.

=item * C<columns> or C<cols>

Optional. Specifies the exact column names to be returned in the final dataset.

For example:

    my $array_object = $file->load_csv(
        cols => [qw( name age locale zipcode )],
    ) || die( $file->error );

Note that you could also achieve the same result with the C<headers> option.

=item * C<detect_bom>

Optional. Defaults to true.

If set to a true value, this will check the CSV file for a C<BOM> (Byte-order mark), remove it by seeking past it, and use it to determine the necessary underlying encoding of the file to read the rest of the data. The encoding is then applied to the...

Using the C<BOM> detection, it recognises C<bocu-1>, C<gb-18030>, C<utf-ebcdic>, C<utf-1>, C<utf-8>, C<utf-16be>, C<utf-16le>, C<utf-32be>, C<utf-32le>, C<scsu>

=item * C<expects>

Optional. Determines the expected return format.

If not specified, defaults to hash references if C<cols> (or C<columns>) or C<headers> option value provides column names, otherwise array references.

=over 8

=item * C<array>

Returns an L<array object|Module::Generic::Array> of array references.

=item * C<hash>

Returns an L<array object|Module::Generic::Array> of hash references.

=back

For example:

    my $array_object = $file->load_csv(
        headers => [qw( name age locale zipcode )],
        expects => 'array'
    ) || die( $file->error );

This would return an L<array object|Module::Generic::Array> of array references, even though we provided headers names.

However

    my $array_object = $file->load_csv(
        headers => [qw( name age locale zipcode )],
    ) || die( $file->error );

would return an L<array object|Module::Generic::Array> of hash references.

=item * C<headers>

Optional. Defines how the first row of the CSV should be treated. This is only used if the option C<cols> or C<columns> is not used.

Depending on the value, it affects the return value.

It can take the following values:

=over 8

=item * C<auto>

Uses the first row as headers, and returns an L<array object|Module::Generic::Array> of hash references.

=item * C<discard> or C<skip>

Skips the first row, and returns an L<array object|Module::Generic::Array> of array reference, one for each row of CSV data.

=item * C<lc>

Converts headers to lowercase, and returns an L<array object|Module::Generic::Array> of hash references, with all keys thus set to lowercase.

=item * C<uc>

( run in 1.837 second using v1.01-cache-2.11-cpan-5b529ec07f3 )