AI-FANN-Evolving

 view release on metacpan or  search on metacpan

lib/AI/FANN/Evolving/TrainData.pm  view on Meta::CPAN

package AI::FANN::Evolving::TrainData;
use strict;
use List::Util 'shuffle';
use AI::FANN ':all';
use Algorithm::Genetic::Diploid::Base;
use base 'Algorithm::Genetic::Diploid::Base';

our $AUTOLOAD;
my $log = __PACKAGE__->logger;

=head1 NAME

AI::FANN::Evolving::TrainData - wrapper class for FANN data

=head1 METHODS

=over

=item new

Constructor takes named arguments. By default, ignores column
named ID and considers column named CLASS as classifier.

=cut

sub new {
	my $self = shift->SUPER::new(
		'ignore'    => [ 'ID'    ],
		'dependent' => [ 'CLASS' ],
		'header'    => {},
		'table'     => [],
		@_
	);
	my %args  = @_;
	$self->read_data($args{'file'}) if $args{'file'};
	$self->trim_data if $args{'trim'};
	return $self;
}

=item ignore_columns

Getter/setter for column names to ignore in the train data structure. 
For example: an identifier columns named 'ID'

=cut

sub ignore_columns {
	my $self = shift;
	$self->{'ignore'} = \@_ if @_;
	return @{ $self->{'ignore'} };
}

=item dependent_columns

Getter/setter for column name(s) of the output value(s).

=cut

sub dependent_columns {
	my $self = shift;
	$self->{'dependent'} = \@_ if @_;
	return @{ $self->{'dependent'} };
}

=item predictor_columns

Getter for column name(s) of input value(s)

=cut

sub predictor_columns {
	my $self = shift;
	my @others = ( $self->ignore_columns, $self->dependent_columns );
	my %skip = map { $_ => 1 } @others;
	return grep { ! $skip{$_} } keys %{ $self->{'header'} };
}

=item predictor_data

Getter for rows of input values

lib/AI/FANN/Evolving/TrainData.pm  view on Meta::CPAN

sub trim_data {
	my $self = shift;
	my @trimmed;
	ROW: for my $row ( @{ $self->{'table'} } ) {
		next ROW if grep { not defined $_ } @{ $row };
		push @trimmed, $row;
	}
	my $num = $self->{'size'} - scalar @trimmed;
	$log->info("removed $num incomplete rows");
	$self->{'table'} = \@trimmed;
}

=item sample_data

Sample a fraction of the data

=cut

sub sample_data {
	my $self   = shift;
	my $sample = shift || 0.5;
	my $clone1 = $self->clone;
	my $clone2 = $self->clone;
	my $size   = $self->size;
	my @sample;
	$clone2->{'table'} = \@sample;
	while( scalar(@sample) < int( $size * $sample ) ) {
		my @shuffled = shuffle( @{ $clone1->{'table'} } );
		push @sample, shift @shuffled;
		$clone1->{'table'} = \@shuffled;
	}
	return $clone2, $clone1;
}

=item partition_data

Creates two clones that partition the data according to the provided ratio.

=cut

sub partition_data {
	my $self   = shift;
	my $sample = shift || 0.5;
	my $clone1 = $self->clone;
	my $clone2 = $self->clone;
	my $remain = 1 - $sample;
	$log->info("going to partition into $sample : $remain");
		
	# compute number of different dependent patterns and ratios of each
	my @dependents = $self->dependent_data;
	my %seen;
	for my $dep ( @dependents ) {
		my $key = join '/', @{ $dep };
		$seen{$key}++;
	}
	
	# adjust counts to sample size
	for my $key ( keys %seen ) {
		$log->debug("counts: $key => $seen{$key}");
		$seen{$key} = int( $seen{$key} * $sample );
		$log->debug("rescaled: $key => $seen{$key}");
	}

	# start the sampling	
	my @dc = map { $self->{'header'}->{$_} } $self->dependent_columns;
	my @new_table; # we will populate this
	my @table = @{ $clone1->{'table'} }; # work on cloned instance
	
	# as long as there is still sampling to do 
	SAMPLE: while( grep { !!$_ } values %seen ) {
		for my $i ( 0 .. $#table ) {
			my @r = @{ $table[$i] };
			my $key = join '/', @r[@dc];
			if ( $seen{$key} ) {
				my $rand = rand(1);
				if ( $rand < $sample ) {
					push @new_table, \@r;
					splice @table, $i, 1;
					$seen{$key}--;
					$log->debug("still to go for $key: $seen{$key}");
					next SAMPLE;
				}
			}
		}
	}
	$clone2->{'table'} = \@new_table;
	$clone1->{'table'} = \@table;
	return $clone2, $clone1;
}

=item size

Returns the number of data records

=cut

sub size { scalar @{ shift->{'table'} } }

=item to_fann

Packs data into an L<AI::FANN> TrainData structure

=cut

sub to_fann {
	$log->debug("encoding data as FANN struct");
	my $self = shift;
	my @cols = @_ ? @_ : $self->predictor_columns;
	my @deps = $self->dependent_data;
	my @pred = $self->predictor_data( 'cols' => \@cols );
	my @interdigitated;
	for my $i ( 0 .. $#deps ) {
		push @interdigitated, $pred[$i], $deps[$i];
	}
	return AI::FANN::TrainData->new(@interdigitated);
}

=back

=cut



( run in 0.965 second using v1.01-cache-2.11-cpan-0bb4e1dffa6 )