Data-Frame

 view release on metacpan or  search on metacpan

lib/Data/Frame.pm  view on Meta::CPAN

package Data::Frame;
$Data::Frame::VERSION = '0.006005';
# ABSTRACT: data frame implementation

use 5.016;
use warnings;

use Data::Frame::Class;
with 'MooX::Traits';

use failures qw{
	columns::mismatch columns::length columns::unbalanced
	rows::mismatch rows::length rows::unique
	column::exists column::name::string
	index index::exists
};

use Hash::Ordered;
use PDL::Basic qw(sequence);
use PDL::Core qw(pdl null);
use List::AllUtils 0.19 qw(
  each_arrayref pairgrep pairkeys pairmap pairwise reduce zip
);

use PDL::DateTime  ();
use PDL::Primitive ();
use PDL::Factor    ();
use PDL::SV        ();
use PDL::StringfiableExtension;
use Ref::Util qw(is_plain_arrayref is_plain_hashref);
use Scalar::Util qw(blessed looks_like_number);
use Sereal::Encoder 4.005;
use Text::Table::Tiny;
use Type::Params;
use Types::Standard qw(
  Any ArrayRef CodeRef Enum CycleTuple HashRef Maybe Str
);
use Types::PDL qw(Piddle Piddle1D);

use Data::Frame::Column::Helper;

use Data::Frame::Indexer qw(:all);
use Data::Frame::Types qw(:all);
use Data::Frame::Util qw(:all);

use overload (
    '""' => sub { $_[0]->string },
    '.=' => sub {                    # this is similar to PDL
        my ( $self, $other ) = @_;
        $self->assign($other);
    },
    '==' => sub {
        my ( $self, $other ) = @_;
        $self->_compare( $other, 'eq' );
    },
    'eq' => sub {
        my ( $self, $other ) = @_;
        $self->_compare( $other, 'eq' );
    },
    '!=' => sub {
        my ( $self, $other ) = @_;
        $self->_compare( $other, 'ne' );
    },
    '<' => sub {
        my ( $self, $other, $swap ) = @_;
        $self->_compare( $other, ( $swap ? 'ge' : 'lt' ) );
    },
    '<=' => sub {
        my ( $self, $other, $swap ) = @_;
        $self->_compare( $other, ( $swap ? 'gt' : 'le' ) );
    },
    '>' => sub {    # use '<' overload
        my ( $self, $other, $swap ) = @_;
        $swap ? ( $self < $other ) : ( $other < $self );
    },
    '>=' => sub {    # use '<=' overload
        my ( $self, $other, $swap ) = @_;
        $swap ? ( $self <= $other ) : ( $other <= $self );
    },
    fallback => 1
);

# Relative tolerance. This can be used for data frame comparison.
our $TOLERANCE_REL = undef;
our $doubleformat = '%.8g';

# Check if all columns have same length or have a length of 1.
around BUILDARGS($orig, $class : @args) {
    my %args = @args;   

    if ( my $columns = $args{columns} ) {
        my $columns_is_aref = Ref::Util::is_plain_arrayref($columns);
        my $columns_href;
        if ($columns_is_aref) {
            $columns_href = {@$columns};
        }
        else {
            $columns_href = $columns;
        }

        my @lengths    = map { $_->length } values %$columns_href;
        my $max_length = List::AllUtils::max(@lengths);
        for my $column_name ( sort keys %$columns_href ) {
            my $data = $columns_href->{$column_name};
            if ( $data->length != $max_length ) {
                if ( $data->length == 1 ) {
                    if ($columns_is_aref) {
                        my $idx = List::AllUtils::lastidx {
                            $_ eq $column_name
                        }
                        List::AllUtils::pairkeys(@$columns);
                        $columns->[ 2 * $idx + 1 ] = $data->repeat($max_length);
                    }
                    else {    # hashref
                        $columns->{$column_name} = $data->repeat($max_length);
                    }
                }
                else {
                    die
"Column piddles must all be same length or have a length of 1";
                }
            }
        }
    }
    return $class->$orig(\%args);
}

sub _trait_namespace { 'Data::Frame::Role' } # override for MooX::Traits

has _columns => ( is => 'ro', default => sub { Hash::Ordered->new; } );

has _row_names => ( is => 'rw', predicate => 1 );

with qw(
  Data::Frame::Role::Rlike
  Data::Frame::IO::CSV
  Data::Frame::Partial::Eval
  Data::Frame::Partial::Sugar
);

sub BUILD {
	my ($self, $args) = @_;
	my $colspec = delete $args->{columns};

	if( defined $colspec ) {
		my @columns =
			  ref $colspec eq 'HASH'
			? map { ($_, $colspec->{$_} ) } sort { $a cmp $b } keys %$colspec
			: @$colspec;
		$self->add_columns(@columns);
	}

lib/Data/Frame.pm  view on Meta::CPAN

    }
    else {
        my $names_getter = "${row_or_column}_names";
        my @names        = $self->$names_getter()->flatten;
        return $indexer->indexer->map(
            sub {
                my ($name) = @_;
                my $ridx = List::AllUtils::firstidx { $name eq $_ } @names;
                if ( $ridx < 0 ) {
                    die "Cannot find $row_or_column name '$name'.";
                }
                return $ridx;
            }
        );
    }
}

method _cindexer_to_indices (Indexer $indexer) {
    return $self->_indexer_to_indices( $indexer, 'column' );
}

method _rindexer_to_indices (Indexer $indexer) {
    if ( $indexer->$_DOES('Data::Frame::Indexer::Label') ) {
        die "select_rows() does not yet support label indexer";
    }

    return $self->_indexer_to_indices( $indexer, 'row' );
}

method at (@rest) {
    my ( $rindexer, $cindexer ) = $self->_check_slice_args(@_);

    my $cindex = $cindexer->indexer->[0];
    my $col;
    if ( $cindexer->$_DOES('Data::Frame::Indexer::Integer') ) {
        $col = $self->nth_column($cindex);
    }
    else {    # Label;
        $col = $self->column($cindex);
    }

    if ( defined $rindexer ) {
        return $col->at( $rindexer->indexer->[0] );
    }
    else {
        return $col;
    }
}


method exists ($col_name) {
    $self->_columns->exists($col_name);
}

method delete ($col_name) {
    $self->_columns->delete($col_name);
}

method rename ((HashRef | CodeRef) $href_or_coderef) {
    my $f =
      Ref::Util::is_plain_coderef($href_or_coderef)
      ? $href_or_coderef
      : sub { $href_or_coderef->{ $_[0] } };
    my $new_names = $self->names->map( sub { $f->($_) // $_ } );
    $self->names($new_names);
    return $self;
}


method set ($indexer, $data) {
    state $check =
      Type::Params::compile( Indexer->plus_coercions(IndexerFromLabels),
        ColumnLike->plus_coercions( ArrayRef, sub { PDL::SV->new($_) } ) );
    ($indexer) = $check->( $indexer, $data );

    if ( $data->length == 1 ) {
        $data = $data->repeat( $self->nrow );
    }

    # Only Label indexer can be used to add new columns.
    my $name;
    if ( $indexer->$_DOES('Data::Frame::Indexer::Label') ) {
        $name = $indexer->indexer->[0];
    }
    else {
        my $cidx = $indexer->indexer->[0];
        if ( $cidx >= $self->ncol ) {
            die "Invalid column index: $cidx";
        }
        $name = $self->column_names->at($cidx);
    }

    if ( $self->exists($name) ) {
        $self->_column_validate( $name => $data );
        $self->_columns->set( $name => $data );
    }
    else {
        $self->add_column( $name, $data );
    }
    return;
}


method isempty () { $self->nrow == 0; }


method column_names(@rest) {
    my @colnames =
      (
        @rest == 1
          and ( Ref::Util::is_plain_arrayref( $rest[0] )
            or $rest[0]->$_can('flatten') )
      )
      ? $rest[0]->flatten
      : @rest;

	if( @colnames ) {
        unless (@colnames == $self->length) {
			failure::columns::length->throw({
					msg => "incorrect number of column names",
					trace => failure->croak_trace,
				});
        }
        # rename column names
        my @values = $self->_columns->values;
        $self->_columns->clear;

        # List::AllUtils and List::Util 's zip func are different.
        # See also https://github.com/houseabsolute/List-AllUtils/issues/12
        $self->_columns->push( zip( @colnames, @values ) );
	}
	return [ $self->_columns->keys ];
}

*col_names = \&column_names;
*names = \&column_names;

sub row_names {
	my ($self, @rest) = @_;
	if( @rest ) {
		# setting row names
		my $new_rows;
        if ( ref $rest[0] ) {
            if ( Ref::Util::is_plain_arrayref($rest[0]) ) {
                $new_rows = [ @{$rest[0]} ];
            }
            elsif ( $rest[0]->isa('PDL') ) {

                # TODO just run uniq?
                $new_rows = $rest[0]->unpdl;
            }
        }
        $new_rows //= [ @rest ];

		failure::rows::length->throw({
				msg => "invalid row names length",
				trace => failure->croak_trace,
			}) if $self->number_of_rows != $new_rows->length;
		failure::rows::unique->throw({
				msg => "non-unique row names",
				trace => failure->croak_trace,
			}) if $new_rows->length != $new_rows->uniq->length;

		return $self->_row_names( PDL::SV->new($new_rows) );
	}
	if( not $self->_has_row_names ) {
		# if it has never been set before
		return sequence($self->number_of_rows);
	}
	# else, if row_names has been set
	return $self->_row_names;
}

sub _make_actual_row_names {
	my ($self) = @_;
	if( not $self->_has_row_names ) {
		$self->_row_names( $self->row_names );
	}
}


method column($colname) {
	failure::column::exists->throw({
			msg => "column $colname does not exist",
			trace => failure->croak_trace,
		}) unless $self->exists( $colname );
	return $self->_columns->get($colname);
}

# supports negative indices
method nth_column($index) {
	failure::index->throw({
			msg => "requires index",
			trace => failure->croak_trace
		}) unless defined $index;
	failure::index::exists->throw({
			msg => "column index out of bounds",
			trace => failure->croak_trace,
		}) if $index >= $self->number_of_columns;
	# fine if $index < 0 because negative indices are supported
	return ($self->_columns->values)[$index];
}


lib/Data/Frame.pm  view on Meta::CPAN

            if ( defined $bad_to_val and $column->badflag ) {
                $column = $column->setbadtoval($bad_to_val);
            }

            my $indices_false = PDL::Primitive::which(
                defined $both_bad ? ( !$both_bad & $column ) : $column );
            return $indices_false->unpdl->map( sub { [ $_, $cidx ] } )->flatten;
        }
    );
    return pdl($coordinates);
}


method merge (DataFrame $df) {
    my $class   = ref($self);
    my $columns = [
        $self->names->map( sub { $_ => $self->column($_) } )->flatten,
        $df->names->map( sub { $_ => $df->column($_) } )->flatten
    ];
    return $class->new(
        columns   => $columns,
        row_names => $self->row_names
    );
}
*cbind = \&merge;

method append (DataFrame $df) {
    if ( $df->nrow == 0 ) {                     # $df is empty
        return $self->clone();
    }
    if ( $self->column_names->length == 0) {    # $self has no columns
        return $df->clone;
    }

    my $class   = ref($self);
    my $columns = $self->names->map(
        sub {
            my $col = $self->column($_);
            # use glue() as PDL's append() cannot handle bad values
            $_ => $col->glue( 0, $df->column($_) );
        }
    );
    return $class->new( columns => $columns );
}
*rbind = \&append;



method transform ($func) {
    state $check = Type::Params::compile(
        (
            CodeRef | ( HashRef [ Maybe [CodeRef] ] ) |
              ( CycleTuple [ Str, Maybe [CodeRef] ] )
        )
    );
    ($func) = $check->($func);

    my $class = ref($self);

    my @columns;
    if ( Ref::Util::is_coderef($func) ) {
        @columns =
          $self->names->map( sub {
            $_ => $func->( $self->column($_), $self );
          } )->flatten;
    }
    else {    # hashref or arrayref
        my $column_names = $self->names;
        my $hashref;
        my @new_column_names;
        if ( Ref::Util::is_hashref($func) ) {
            $hashref = $func;
            @new_column_names =
              grep { !$self->exists($_) } sort( keys %$hashref );
        }
        else {    # arrayref
            $hashref = {@$func};
            @new_column_names = grep { !$self->exists($_) } ( pairkeys @$func );
        }

        @columns = $column_names->map(
            sub {
                my $f = exists($hashref->{$_}) ? $hashref->{$_} : sub { $_[0] };
                $f //= sub { undef };
                $_ => $f->( $self->column($_), $self );
            }
        )->flatten;
        push @columns,
          map { my $f = $hashref->{$_}; $_ => $f->( undef, $self ) }
          @new_column_names;
    }

    my %columns_to_drop = @columns;
    %columns_to_drop = pairgrep { not defined $b } %columns_to_drop;

    return $class->new(
        columns   => [ pairgrep { !exists($columns_to_drop{$a}) } @columns ],
        row_names => $self->row_names,
    );
}


method split (ColumnLike $factor) {
    if ($factor->$_DOES('PDL::Factor')) {
        $factor = $factor->{PDL};
    }
    # avoid // as breaks Devel::Cover
    my $uniq_values = $factor->$_call_if_can('uniq');
    $uniq_values = [ List::AllUtils::uniq( $factor->flatten ) ]
      if !defined $uniq_values;

    my @rslt = map {
        my $indices = PDL::Primitive::which( $factor == $_ );
        $_ => $self->select_rows($indices);
    } $uniq_values->flatten;

    return (wantarray ? @rslt : { @rslt });
}


method sort ($by_columns, $ascending=true) {
    return $self->clone if $by_columns->length == 0;

    my $row_indices = $self->sorti( $by_columns, $ascending );
    return $self->select_rows($row_indices);
}

method sorti ($by_columns, $ascending=true) {
    if (Ref::Util::is_plain_arrayref($ascending)) {
        $ascending = logical($ascending);
    }

    return pdl( [ 0 .. $self->nrow - 1 ] ) if $by_columns->length == 0;

    my $is_number = $by_columns->map( sub { $self->is_numeric_column($_) } );
    my $compare = sub {
        my ( $a, $b ) = @_;
        for my $i ( 0 .. $#$is_number ) {
            my $rslt = (
                  $is_number->[$i]
                ? $a->[$i] <=> $b->[$i]
                : $a->[$i] cmp $b->[$i]
            );
            next if $rslt == 0;

            my $this_ascending = $ascending->$_call_if_can( 'at', $i )
              // $ascending;
            return ( $this_ascending ? $rslt : -$rslt );
        }
        return 0;
    };

    my $ea =
      each_arrayref( @{ $by_columns->map( sub { $self->at($_)->unpdl } ) } );
    my @sorted_row_indices = map { $_->[0] }
      sort { $compare->( $a->[1], $b->[1] ) }
      map {
        my @row_data = $ea->();
        [ $_, \@row_data ];
      } ( 0 .. $self->nrow - 1 );

    return pdl( \@sorted_row_indices );
}


my $sereal = Sereal::Encoder->new();

sub _serialize_row {
    my ($self, $i, $columns, $columns_isbad) = @_;

    my @row_data =
      map {
        my $isbad = $columns_isbad->[$_];
        ( defined $isbad and $isbad->at($i) )
          ? ( 1, undef )
          : ( 0, $columns->[$_]->at($i) );
      } ( 0 .. $#$columns );
    return $sereal->encode( \@row_data );
}

method uniq () {
    my %uniq;
    my @uniq_ridx;

    my @columns = map { $self->column($_) } @{ $self->column_names };
    my @columns_isbad = map { $_->badflag ? $_->isbad : undef } @columns;

    for my $i ( 0 .. $self->nrow - 1 ) {
        my $key = $self->_serialize_row($i, \@columns, \@columns_isbad);



( run in 1.047 second using v1.01-cache-2.11-cpan-39bf76dae61 )