Data-Frame
view release on metacpan or search on metacpan
lib/Data/Frame.pm view on Meta::CPAN
package Data::Frame;
$Data::Frame::VERSION = '0.006005';
# ABSTRACT: data frame implementation
use 5.016;
use warnings;
use Data::Frame::Class;
with 'MooX::Traits';
use failures qw{
columns::mismatch columns::length columns::unbalanced
rows::mismatch rows::length rows::unique
column::exists column::name::string
index index::exists
};
use Hash::Ordered;
use PDL::Basic qw(sequence);
use PDL::Core qw(pdl null);
use List::AllUtils 0.19 qw(
each_arrayref pairgrep pairkeys pairmap pairwise reduce zip
);
use PDL::DateTime ();
use PDL::Primitive ();
use PDL::Factor ();
use PDL::SV ();
use PDL::StringfiableExtension;
use Ref::Util qw(is_plain_arrayref is_plain_hashref);
use Scalar::Util qw(blessed looks_like_number);
use Sereal::Encoder 4.005;
use Text::Table::Tiny;
use Type::Params;
use Types::Standard qw(
Any ArrayRef CodeRef Enum CycleTuple HashRef Maybe Str
);
use Types::PDL qw(Piddle Piddle1D);
use Data::Frame::Column::Helper;
use Data::Frame::Indexer qw(:all);
use Data::Frame::Types qw(:all);
use Data::Frame::Util qw(:all);
use overload (
'""' => sub { $_[0]->string },
'.=' => sub { # this is similar to PDL
my ( $self, $other ) = @_;
$self->assign($other);
},
'==' => sub {
my ( $self, $other ) = @_;
$self->_compare( $other, 'eq' );
},
'eq' => sub {
my ( $self, $other ) = @_;
$self->_compare( $other, 'eq' );
},
'!=' => sub {
my ( $self, $other ) = @_;
$self->_compare( $other, 'ne' );
},
'<' => sub {
my ( $self, $other, $swap ) = @_;
$self->_compare( $other, ( $swap ? 'ge' : 'lt' ) );
},
'<=' => sub {
my ( $self, $other, $swap ) = @_;
$self->_compare( $other, ( $swap ? 'gt' : 'le' ) );
},
'>' => sub { # use '<' overload
my ( $self, $other, $swap ) = @_;
$swap ? ( $self < $other ) : ( $other < $self );
},
'>=' => sub { # use '<=' overload
my ( $self, $other, $swap ) = @_;
$swap ? ( $self <= $other ) : ( $other <= $self );
},
fallback => 1
);
# Relative tolerance. This can be used for data frame comparison.
our $TOLERANCE_REL = undef;
our $doubleformat = '%.8g';
# Check if all columns have same length or have a length of 1.
around BUILDARGS($orig, $class : @args) {
my %args = @args;
if ( my $columns = $args{columns} ) {
my $columns_is_aref = Ref::Util::is_plain_arrayref($columns);
my $columns_href;
if ($columns_is_aref) {
$columns_href = {@$columns};
}
else {
$columns_href = $columns;
}
my @lengths = map { $_->length } values %$columns_href;
my $max_length = List::AllUtils::max(@lengths);
for my $column_name ( sort keys %$columns_href ) {
my $data = $columns_href->{$column_name};
if ( $data->length != $max_length ) {
if ( $data->length == 1 ) {
if ($columns_is_aref) {
my $idx = List::AllUtils::lastidx {
$_ eq $column_name
}
List::AllUtils::pairkeys(@$columns);
$columns->[ 2 * $idx + 1 ] = $data->repeat($max_length);
}
else { # hashref
$columns->{$column_name} = $data->repeat($max_length);
}
}
else {
die
"Column piddles must all be same length or have a length of 1";
}
}
}
}
return $class->$orig(\%args);
}
sub _trait_namespace { 'Data::Frame::Role' } # override for MooX::Traits
has _columns => ( is => 'ro', default => sub { Hash::Ordered->new; } );
has _row_names => ( is => 'rw', predicate => 1 );
with qw(
Data::Frame::Role::Rlike
Data::Frame::IO::CSV
Data::Frame::Partial::Eval
Data::Frame::Partial::Sugar
);
sub BUILD {
my ($self, $args) = @_;
my $colspec = delete $args->{columns};
if( defined $colspec ) {
my @columns =
ref $colspec eq 'HASH'
? map { ($_, $colspec->{$_} ) } sort { $a cmp $b } keys %$colspec
: @$colspec;
$self->add_columns(@columns);
}
lib/Data/Frame.pm view on Meta::CPAN
}
else {
my $names_getter = "${row_or_column}_names";
my @names = $self->$names_getter()->flatten;
return $indexer->indexer->map(
sub {
my ($name) = @_;
my $ridx = List::AllUtils::firstidx { $name eq $_ } @names;
if ( $ridx < 0 ) {
die "Cannot find $row_or_column name '$name'.";
}
return $ridx;
}
);
}
}
method _cindexer_to_indices (Indexer $indexer) {
return $self->_indexer_to_indices( $indexer, 'column' );
}
method _rindexer_to_indices (Indexer $indexer) {
if ( $indexer->$_DOES('Data::Frame::Indexer::Label') ) {
die "select_rows() does not yet support label indexer";
}
return $self->_indexer_to_indices( $indexer, 'row' );
}
method at (@rest) {
my ( $rindexer, $cindexer ) = $self->_check_slice_args(@_);
my $cindex = $cindexer->indexer->[0];
my $col;
if ( $cindexer->$_DOES('Data::Frame::Indexer::Integer') ) {
$col = $self->nth_column($cindex);
}
else { # Label;
$col = $self->column($cindex);
}
if ( defined $rindexer ) {
return $col->at( $rindexer->indexer->[0] );
}
else {
return $col;
}
}
method exists ($col_name) {
$self->_columns->exists($col_name);
}
method delete ($col_name) {
$self->_columns->delete($col_name);
}
method rename ((HashRef | CodeRef) $href_or_coderef) {
my $f =
Ref::Util::is_plain_coderef($href_or_coderef)
? $href_or_coderef
: sub { $href_or_coderef->{ $_[0] } };
my $new_names = $self->names->map( sub { $f->($_) // $_ } );
$self->names($new_names);
return $self;
}
method set ($indexer, $data) {
state $check =
Type::Params::compile( Indexer->plus_coercions(IndexerFromLabels),
ColumnLike->plus_coercions( ArrayRef, sub { PDL::SV->new($_) } ) );
($indexer) = $check->( $indexer, $data );
if ( $data->length == 1 ) {
$data = $data->repeat( $self->nrow );
}
# Only Label indexer can be used to add new columns.
my $name;
if ( $indexer->$_DOES('Data::Frame::Indexer::Label') ) {
$name = $indexer->indexer->[0];
}
else {
my $cidx = $indexer->indexer->[0];
if ( $cidx >= $self->ncol ) {
die "Invalid column index: $cidx";
}
$name = $self->column_names->at($cidx);
}
if ( $self->exists($name) ) {
$self->_column_validate( $name => $data );
$self->_columns->set( $name => $data );
}
else {
$self->add_column( $name, $data );
}
return;
}
method isempty () { $self->nrow == 0; }
method column_names(@rest) {
my @colnames =
(
@rest == 1
and ( Ref::Util::is_plain_arrayref( $rest[0] )
or $rest[0]->$_can('flatten') )
)
? $rest[0]->flatten
: @rest;
if( @colnames ) {
unless (@colnames == $self->length) {
failure::columns::length->throw({
msg => "incorrect number of column names",
trace => failure->croak_trace,
});
}
# rename column names
my @values = $self->_columns->values;
$self->_columns->clear;
# List::AllUtils and List::Util 's zip func are different.
# See also https://github.com/houseabsolute/List-AllUtils/issues/12
$self->_columns->push( zip( @colnames, @values ) );
}
return [ $self->_columns->keys ];
}
*col_names = \&column_names;
*names = \&column_names;
sub row_names {
my ($self, @rest) = @_;
if( @rest ) {
# setting row names
my $new_rows;
if ( ref $rest[0] ) {
if ( Ref::Util::is_plain_arrayref($rest[0]) ) {
$new_rows = [ @{$rest[0]} ];
}
elsif ( $rest[0]->isa('PDL') ) {
# TODO just run uniq?
$new_rows = $rest[0]->unpdl;
}
}
$new_rows //= [ @rest ];
failure::rows::length->throw({
msg => "invalid row names length",
trace => failure->croak_trace,
}) if $self->number_of_rows != $new_rows->length;
failure::rows::unique->throw({
msg => "non-unique row names",
trace => failure->croak_trace,
}) if $new_rows->length != $new_rows->uniq->length;
return $self->_row_names( PDL::SV->new($new_rows) );
}
if( not $self->_has_row_names ) {
# if it has never been set before
return sequence($self->number_of_rows);
}
# else, if row_names has been set
return $self->_row_names;
}
sub _make_actual_row_names {
my ($self) = @_;
if( not $self->_has_row_names ) {
$self->_row_names( $self->row_names );
}
}
method column($colname) {
failure::column::exists->throw({
msg => "column $colname does not exist",
trace => failure->croak_trace,
}) unless $self->exists( $colname );
return $self->_columns->get($colname);
}
# supports negative indices
method nth_column($index) {
failure::index->throw({
msg => "requires index",
trace => failure->croak_trace
}) unless defined $index;
failure::index::exists->throw({
msg => "column index out of bounds",
trace => failure->croak_trace,
}) if $index >= $self->number_of_columns;
# fine if $index < 0 because negative indices are supported
return ($self->_columns->values)[$index];
}
lib/Data/Frame.pm view on Meta::CPAN
if ( defined $bad_to_val and $column->badflag ) {
$column = $column->setbadtoval($bad_to_val);
}
my $indices_false = PDL::Primitive::which(
defined $both_bad ? ( !$both_bad & $column ) : $column );
return $indices_false->unpdl->map( sub { [ $_, $cidx ] } )->flatten;
}
);
return pdl($coordinates);
}
method merge (DataFrame $df) {
my $class = ref($self);
my $columns = [
$self->names->map( sub { $_ => $self->column($_) } )->flatten,
$df->names->map( sub { $_ => $df->column($_) } )->flatten
];
return $class->new(
columns => $columns,
row_names => $self->row_names
);
}
*cbind = \&merge;
method append (DataFrame $df) {
if ( $df->nrow == 0 ) { # $df is empty
return $self->clone();
}
if ( $self->column_names->length == 0) { # $self has no columns
return $df->clone;
}
my $class = ref($self);
my $columns = $self->names->map(
sub {
my $col = $self->column($_);
# use glue() as PDL's append() cannot handle bad values
$_ => $col->glue( 0, $df->column($_) );
}
);
return $class->new( columns => $columns );
}
*rbind = \&append;
method transform ($func) {
state $check = Type::Params::compile(
(
CodeRef | ( HashRef [ Maybe [CodeRef] ] ) |
( CycleTuple [ Str, Maybe [CodeRef] ] )
)
);
($func) = $check->($func);
my $class = ref($self);
my @columns;
if ( Ref::Util::is_coderef($func) ) {
@columns =
$self->names->map( sub {
$_ => $func->( $self->column($_), $self );
} )->flatten;
}
else { # hashref or arrayref
my $column_names = $self->names;
my $hashref;
my @new_column_names;
if ( Ref::Util::is_hashref($func) ) {
$hashref = $func;
@new_column_names =
grep { !$self->exists($_) } sort( keys %$hashref );
}
else { # arrayref
$hashref = {@$func};
@new_column_names = grep { !$self->exists($_) } ( pairkeys @$func );
}
@columns = $column_names->map(
sub {
my $f = exists($hashref->{$_}) ? $hashref->{$_} : sub { $_[0] };
$f //= sub { undef };
$_ => $f->( $self->column($_), $self );
}
)->flatten;
push @columns,
map { my $f = $hashref->{$_}; $_ => $f->( undef, $self ) }
@new_column_names;
}
my %columns_to_drop = @columns;
%columns_to_drop = pairgrep { not defined $b } %columns_to_drop;
return $class->new(
columns => [ pairgrep { !exists($columns_to_drop{$a}) } @columns ],
row_names => $self->row_names,
);
}
method split (ColumnLike $factor) {
if ($factor->$_DOES('PDL::Factor')) {
$factor = $factor->{PDL};
}
# avoid // as breaks Devel::Cover
my $uniq_values = $factor->$_call_if_can('uniq');
$uniq_values = [ List::AllUtils::uniq( $factor->flatten ) ]
if !defined $uniq_values;
my @rslt = map {
my $indices = PDL::Primitive::which( $factor == $_ );
$_ => $self->select_rows($indices);
} $uniq_values->flatten;
return (wantarray ? @rslt : { @rslt });
}
method sort ($by_columns, $ascending=true) {
return $self->clone if $by_columns->length == 0;
my $row_indices = $self->sorti( $by_columns, $ascending );
return $self->select_rows($row_indices);
}
method sorti ($by_columns, $ascending=true) {
if (Ref::Util::is_plain_arrayref($ascending)) {
$ascending = logical($ascending);
}
return pdl( [ 0 .. $self->nrow - 1 ] ) if $by_columns->length == 0;
my $is_number = $by_columns->map( sub { $self->is_numeric_column($_) } );
my $compare = sub {
my ( $a, $b ) = @_;
for my $i ( 0 .. $#$is_number ) {
my $rslt = (
$is_number->[$i]
? $a->[$i] <=> $b->[$i]
: $a->[$i] cmp $b->[$i]
);
next if $rslt == 0;
my $this_ascending = $ascending->$_call_if_can( 'at', $i )
// $ascending;
return ( $this_ascending ? $rslt : -$rslt );
}
return 0;
};
my $ea =
each_arrayref( @{ $by_columns->map( sub { $self->at($_)->unpdl } ) } );
my @sorted_row_indices = map { $_->[0] }
sort { $compare->( $a->[1], $b->[1] ) }
map {
my @row_data = $ea->();
[ $_, \@row_data ];
} ( 0 .. $self->nrow - 1 );
return pdl( \@sorted_row_indices );
}
my $sereal = Sereal::Encoder->new();
sub _serialize_row {
my ($self, $i, $columns, $columns_isbad) = @_;
my @row_data =
map {
my $isbad = $columns_isbad->[$_];
( defined $isbad and $isbad->at($i) )
? ( 1, undef )
: ( 0, $columns->[$_]->at($i) );
} ( 0 .. $#$columns );
return $sereal->encode( \@row_data );
}
method uniq () {
my %uniq;
my @uniq_ridx;
my @columns = map { $self->column($_) } @{ $self->column_names };
my @columns_isbad = map { $_->badflag ? $_->isbad : undef } @columns;
for my $i ( 0 .. $self->nrow - 1 ) {
my $key = $self->_serialize_row($i, \@columns, \@columns_isbad);
( run in 1.047 second using v1.01-cache-2.11-cpan-39bf76dae61 )