data results from the CPAN

AI-FANN-Evolving

view release on metacpan or search on metacpan

lib/AI/FANN/Evolving/TrainData.pm
LICENSE
Makefile.PL
MANIFEST			This list of files
MYMETA.json
MYMETA.yml
README.md
script/aivolver
t/00-load.t
t/01-run.t
t/02-data.t
t/03-fann-wrapper.t
t/perl-critic.t
t/perlcriticrc
t/pod-coverage.t
t/pod.t
META.yml                                 Module YAML meta-data (added by MakeMaker)
META.json                                Module JSON meta-data (added by MakeMaker)

lib/AI/FANN/Evolving.pm view on Meta::CPAN

=head1 NAME

AI::FANN::Evolving - artificial neural network that evolves

=head1 METHODS

=over

=item new

Constructor requires 'file', or 'data' and 'neurons' arguments. Optionally takes 
'connection_rate' argument for sparse topologies. Returns a wrapper around L<AI::FANN>.

=cut

sub new {
	my $class = shift;
	my %args  = @_;
	my $self  = {};
	bless $self, $class;
	$self->_init(%args);
	
	# de-serialize from a file
	if ( my $file = $args{'file'} ) {
		$self->{'ann'} = AI::FANN->new_from_file($file);
		$log->debug("instantiating from file $file");
		return $self;
	}
	
	# build new topology from input data
	elsif ( my $data = $args{'data'} ) {
		$log->debug("instantiating from data $data");
		$data = $data->to_fann if $data->isa('AI::FANN::Evolving::TrainData');
		
		# prepare arguments
		my $neurons = $args{'neurons'} || ( $data->num_inputs + 1 );
		my @sizes = ( 
			$data->num_inputs, 
			$neurons,
			$data->num_outputs
		);
		
		# build topology
		if ( $args{'connection_rate'} ) {
			$self->{'ann'} = AI::FANN->new_sparse( $args{'connection_rate'}, @sizes );
		}
		else {
			$self->{'ann'} = AI::FANN->new_standard( @sizes );
		}

lib/AI/FANN/Evolving.pm view on Meta::CPAN

			$ann->num_inputs, 
			$ann->num_inputs + 1,
			$ann->num_outputs,
		);
		
		# copy the AI::FANN properties
		$ann->template($self->{'ann'});
		return $self;
	}
	else {
		die "Need 'file', 'data' or 'ann' argument!";
	}
}

=item template

Uses the object as a template for the properties of the argument, e.g.
$ann1->template($ann2) applies the properties of $ann1 to $ann2

=cut

lib/AI/FANN/Evolving.pm view on Meta::CPAN

=cut

sub clone {
	my $self = shift;
	$log->debug("cloning...");
	
	# we delete the reference here so we can use 
	# Algorithm::Genetic::Diploid::Base's cloning method, which
	# dumps and loads from YAML. This wouldn't work if the 
	# reference is still attached because it cannot be 
	# stringified, being an XS data structure
	my $ann = delete $self->{'ann'};
	my $clone = $self->SUPER::clone;
	
	# clone the ANN by writing it to a temp file in "FANN/FLO"
	# format and reading that back in, then delete the file
	my ( $fh, $file ) = tempfile();
	close $fh;
	$ann->save($file);
	$clone->{'ann'} = __PACKAGE__->new_from_file($file);
	unlink $file;
	
	# now re-attach the original ANN to the invocant
	$self->{'ann'} = $ann;
	
	return $clone;
}

=item train

Trains the AI on the provided data object

=cut

sub train {
	my ( $self, $data ) = @_;
	if ( $self->train_type eq 'cascade' ) {
		$log->debug("cascade training");
	
		# set learning curve
		$self->cascade_activation_functions( $self->activation_function );
		
		# train
		$self->{'ann'}->cascadetrain_on_data(
			$data,
			$self->neurons,
			$self->neuron_printfreq,
			$self->error,
		);
	}
	else {
		$log->debug("normal training");
	
		# set learning curves
		$self->hidden_activation_function( $self->activation_function );
		$self->output_activation_function( $self->activation_function );
		
		# train
		$self->{'ann'}->train_on_data(
			$data,
			$self->epochs,
			$self->epoch_printfreq,
			$self->error,
		);	
	}
}

=item enum_properties

Returns a hash whose keys are names of enums and values the possible states for the

lib/AI/FANN/Evolving/Experiment.pm view on Meta::CPAN

		my $value = shift;
		$log->info("assigning new workdir $value");
		$self->{'workdir'} = $value;
	}
	else {
		$log->debug("retrieving workdir");
	}
	return $self->{'workdir'};
}

=item traindata

Getter/setter for the L<AI::FANN::TrainData> object.

=cut

sub traindata {
	my $self = shift;
	if ( @_ ) {
		my $value = shift;
		$log->info("assigning new traindata $value");
		$self->{'traindata'} = $value;
	}
	else {
		$log->debug("retrieving traindata");
	}
	return $self->{'traindata'};
}

=item run

Runs the experiment!

=cut

sub run {
	my $self = shift;

lib/AI/FANN/Evolving/Factory.pm view on Meta::CPAN

use strict;
use Algorithm::Genetic::Diploid;
use base 'Algorithm::Genetic::Diploid::Factory';

our $AUTOLOAD;

my %defaults = (
	'experiment' => 'AI::FANN::Evolving::Experiment',
	'chromosome' => 'AI::FANN::Evolving::Chromosome',
	'gene'       => 'AI::FANN::Evolving::Gene',
	'traindata'  => 'AI::FANN::Evolving::TrainData',
);

=head1 NAME

AI::FANN::Evolving::Factory - creator of objects

=head1 METHODS

=over

=item new

Constructor takes named arguments. Key is a short name (e.g. 'traindata'), value is a 
fully qualified package name (e.g. L<AI::FANN::TrainData>) from which to instantiate 
objects identified by the short name.

=back

=cut

sub new { shift->SUPER::new(%defaults,@_) }

1;

lib/AI/FANN/Evolving/Gene.pm view on Meta::CPAN

Constructor is passed named arguments. Instantiates a trained L<AI::FANN::Evolving> ANN

=cut

sub new {

	# initialize self up the inheritance tree
	my $self = shift->SUPER::new(@_);
			
	# instantiate and train the FANN object
	my $traindata = $self->experiment->traindata;
	$self->ann( AI::FANN::Evolving->new( 'data' => $traindata ) );
	return $self;
}

=item ann

Getter/setter for an L<AI::FANN::Evolving> ANN

=cut

sub ann {

lib/AI/FANN/Evolving/Gene.pm view on Meta::CPAN

sub make_function {
	my $self = shift;
	my $ann = $self->ann;
	my $error_func = $self->experiment->error_func;
	$log->debug("making fitness function");
	
	# build the fitness function
	return sub {		
	
		# train the AI
		$ann->train( $self->experiment->traindata );
	
		# isa TrainingData object, this is what we need to use
		# to make our prognostications. It is a different data 
		# set (out of sample) than the TrainingData object that
		# the AI was trained on.
		my $env = shift;		
		
		# this is a number which we try to keep as near to zero
		# as possible
		my $fitness = 0;
		
		# iterate over the list of input/output pairs
		for my $i ( 0 .. ( $env->length - 1 ) ) {
			my ( $input, $expected ) = $env->data($i);
			my $observed = $ann->run($input);
			
			use Data::Dumper;
			$log->debug("Observed: ".Dumper($observed));
			$log->debug("Expected: ".Dumper($expected));
			
			# invoke the error_func provided by the experiment
			$fitness += $error_func->($observed,$expected);
		}
		$fitness /= $env->length;

lib/AI/FANN/Evolving/TrainData.pm view on Meta::CPAN

use List::Util 'shuffle';
use AI::FANN ':all';
use Algorithm::Genetic::Diploid::Base;
use base 'Algorithm::Genetic::Diploid::Base';

our $AUTOLOAD;
my $log = __PACKAGE__->logger;

=head1 NAME

AI::FANN::Evolving::TrainData - wrapper class for FANN data

=head1 METHODS

=over

=item new

Constructor takes named arguments. By default, ignores column
named ID and considers column named CLASS as classifier.

lib/AI/FANN/Evolving/TrainData.pm view on Meta::CPAN


sub new {
	my $self = shift->SUPER::new(
		'ignore'    => [ 'ID'    ],
		'dependent' => [ 'CLASS' ],
		'header'    => {},
		'table'     => [],
		@_
	);
	my %args  = @_;
	$self->read_data($args{'file'}) if $args{'file'};
	$self->trim_data if $args{'trim'};
	return $self;
}

=item ignore_columns

Getter/setter for column names to ignore in the train data structure. 
For example: an identifier columns named 'ID'

=cut

sub ignore_columns {
	my $self = shift;
	$self->{'ignore'} = \@_ if @_;
	return @{ $self->{'ignore'} };
}

lib/AI/FANN/Evolving/TrainData.pm view on Meta::CPAN


=cut

sub predictor_columns {
	my $self = shift;
	my @others = ( $self->ignore_columns, $self->dependent_columns );
	my %skip = map { $_ => 1 } @others;
	return grep { ! $skip{$_} } keys %{ $self->{'header'} };
}

=item predictor_data

Getter for rows of input values

=cut

sub predictor_data {
	my ( $self, %args ) = @_;
	my $i = $args{'row'};
	my @cols = $args{'cols'} ? @{ $args{'cols'} } : $self->predictor_columns;
	
	# build hash of indices to keep
	my %keep = map { $self->{'header'}->{$_} => 1 } @cols;
	
	# only return a single row
	if ( defined $i ) {
		my @pred;
		for my $j ( 0 .. $#{ $self->{'table'}->[$i] } ) {
			push @pred, $self->{'table'}->[$i]->[$j] if $keep{$j};
		}
		return \@pred;
	}
	else {
		my @preds;
		my $max = $self->size - 1;
		for my $j ( 0 .. $max ) {
			push @preds, $self->predictor_data( 'row' => $j, 'cols' => \@cols);
		}
		return @preds;
	}
}

=item dependent_data

Getter for dependent (classifier) data

=cut

sub dependent_data {
	my ( $self, $i ) = @_;
	my @dc = map { $self->{'header'}->{$_} } $self->dependent_columns;
	if ( defined $i ) {
		return [ map { $self->{'table'}->[$i]->[$_] } @dc ];
	}
	else {
		my @dep;
		for my $j ( 0 .. $self->size - 1 ) {
			push @dep, $self->dependent_data($j);
		}
		return @dep;
	}
}

=item read_data

Reads provided input file

=cut

sub read_data {
	my ( $self, $file ) = @_; # file is tab-delimited
	$log->debug("reading data from file $file");
	open my $fh, '<', $file or die "Can't open $file: $!";
	my ( %header, @table );
	while(<$fh>) {
		chomp;
		next if /^\s*$/;
		my @fields = split /\t/, $_;
		if ( not %header ) {
			my $i = 0;
			%header = map { $_ => $i++ } @fields;
		}
		else {
			push @table, \@fields;
		}
	}
	$self->{'header'} = \%header;
	$self->{'table'}  = \@table;
	return $self;
}

=item write_data

Writes to provided output file

=cut

sub write_data {
	my ( $self, $file ) = @_;
	
	# use file or STDOUT
	my $fh;
	if ( $file ) {
		open $fh, '>', $file or die "Can't write to $file: $!";
		$log->info("writing data to $file");
	}
	else {
		$fh = \*STDOUT;
		$log->info("writing data to STDOUT");
	}
	
	# print header
	my $h = $self->{'header'};
	print $fh join "\t", sort { $h->{$a} <=> $h->{$b} } keys %{ $h };
	print $fh "\n";
	
	# print rows
	for my $row ( @{ $self->{'table'} } ) {
		print $fh join "\t", @{ $row };
		print $fh "\n";
	}
}

=item trim_data

Trims sparse rows with missing values

=cut

sub trim_data {
	my $self = shift;
	my @trimmed;
	ROW: for my $row ( @{ $self->{'table'} } ) {
		next ROW if grep { not defined $_ } @{ $row };
		push @trimmed, $row;
	}
	my $num = $self->{'size'} - scalar @trimmed;
	$log->info("removed $num incomplete rows");
	$self->{'table'} = \@trimmed;
}

=item sample_data

Sample a fraction of the data

=cut

sub sample_data {
	my $self   = shift;
	my $sample = shift || 0.5;
	my $clone1 = $self->clone;
	my $clone2 = $self->clone;
	my $size   = $self->size;
	my @sample;
	$clone2->{'table'} = \@sample;
	while( scalar(@sample) < int( $size * $sample ) ) {
		my @shuffled = shuffle( @{ $clone1->{'table'} } );
		push @sample, shift @shuffled;
		$clone1->{'table'} = \@shuffled;
	}
	return $clone2, $clone1;
}

=item partition_data

Creates two clones that partition the data according to the provided ratio.

=cut

sub partition_data {
	my $self   = shift;
	my $sample = shift || 0.5;
	my $clone1 = $self->clone;
	my $clone2 = $self->clone;
	my $remain = 1 - $sample;
	$log->info("going to partition into $sample : $remain");
		
	# compute number of different dependent patterns and ratios of each
	my @dependents = $self->dependent_data;
	my %seen;
	for my $dep ( @dependents ) {
		my $key = join '/', @{ $dep };
		$seen{$key}++;
	}
	
	# adjust counts to sample size
	for my $key ( keys %seen ) {
		$log->debug("counts: $key => $seen{$key}");
		$seen{$key} = int( $seen{$key} * $sample );

lib/AI/FANN/Evolving/TrainData.pm view on Meta::CPAN

			}
		}
	}
	$clone2->{'table'} = \@new_table;
	$clone1->{'table'} = \@table;
	return $clone2, $clone1;
}

=item size

Returns the number of data records

=cut

sub size { scalar @{ shift->{'table'} } }

=item to_fann

Packs data into an L<AI::FANN> TrainData structure

=cut

sub to_fann {
	$log->debug("encoding data as FANN struct");
	my $self = shift;
	my @cols = @_ ? @_ : $self->predictor_columns;
	my @deps = $self->dependent_data;
	my @pred = $self->predictor_data( 'cols' => \@cols );
	my @interdigitated;
	for my $i ( 0 .. $#deps ) {
		push @interdigitated, $pred[$i], $deps[$i];
	}
	return AI::FANN::TrainData->new(@interdigitated);
}

=back

=cut

script/aivolver view on Meta::CPAN

use YAML::Any 'LoadFile';
use File::Path 'make_path';
use AI::FANN::Evolving;
use AI::FANN::Evolving::TrainData;
use Algorithm::Genetic::Diploid::Logger ':levels';

# initialize config variables
my $verbosity = WARN; # log level
my $formatter = 'simple'; # log formatter
my %initialize;       # settings to start the population
my %data;             # train and test data files
my %experiment;       # experiment settings
my %ann;              # ANN settings
my $outfile;

# there are no arguments
if ( not @ARGV ) {
	pod2usage( '-verbose' => 0 );
}

# first argument is a config file
if ( -e $ARGV[0] ) {
	my $conf = shift;
	my $yaml = LoadFile($conf);
	$outfile    = $yaml->{'outfile'}         if defined $yaml->{'outfile'};
	$verbosity  = $yaml->{'verbosity'}       if defined $yaml->{'verbosity'};
	$formatter  = $yaml->{'formatter'}       if defined $yaml->{'formatter'};
	%initialize = %{ $yaml->{'initialize'} } if defined $yaml->{'initialize'};
	%data       = %{ $yaml->{'data'} }       if defined $yaml->{'data'};
	%experiment = %{ $yaml->{'experiment'} } if defined $yaml->{'experiment'};
	%ann        = %{ $yaml->{'ann'} }        if defined $yaml->{'ann'};
}

# process command line arguments
GetOptions(
	'verbose+'     => \$verbosity,
	'formatter=s'  => \$formatter,
	'outfile=s'    => \$outfile,
	'initialize=s' => \%initialize,
	'data=s'       => \%data,
	'experiment=s' => \%experiment,
	'ann=s'        => \%ann,
	'help|?'       => sub { pod2usage( '-verbose' => 1 ) },
	'manual'       => sub { pod2usage( '-verbose' => 2 ) },
);

# configure ANN
AI::FANN::Evolving->defaults(%ann);

# configure logger
my $log = Algorithm::Genetic::Diploid::Logger->new;
$log->level( 'level' => $verbosity );
$log->formatter( $formatter );

# read input data
my $deps   = join ', ', @{ $data{'dependent'} };
my $ignore = join ', ', @{ $data{'ignore'} };
$log->info("going to read train data $data{file}, ignoring '$ignore', dependent columns are '$deps'");
my $inputdata = AI::FANN::Evolving::TrainData->new(
	'file'      => $data{'file'},
	'dependent' => $data{'dependent'},
	'ignore'    => $data{'ignore'},
);
my ( $traindata, $testdata );
if ( $data{'type'} and lc $data{'type'} eq 'continuous' ) {
	( $traindata, $testdata ) = $inputdata->sample_data( $data{'fraction'} );
}
else {
	( $traindata, $testdata ) = $inputdata->partition_data( $data{'fraction'} );
}

$log->info("number of training data records: ".$traindata->size);
$log->info("number of test data records: ".$testdata->size);

# create first work dir
my $wd  = delete $experiment{'workdir'};
make_path($wd);
$wd .= '/0';

# create the experiment
my $exp = AI::FANN::Evolving::Experiment->new(
	'traindata' => $traindata->to_fann,
	'env'       => $testdata->to_fann,
	'workdir'   => $wd,
	%experiment,
);

# initialize the experiment
$exp->initialize(%initialize);

# run!
my ( $fittest, $fitness ) = $exp->run();
$log->info("*** overall best fitness: $fitness");

script/aivolver view on Meta::CPAN

Prints manual page and exits.

=item B<-v/--verbose>

Increments verbosity of the process. Can be used multiple times.

=item B<-o/--outfile <file.annE<gt>>

File name for the fittest ANN file over all generations.

=item B<-d/--data <key=valueE<gt>>

The C<data> argument is used multiple times, each time followed by a key/value pair
that defines the location of one of the data files. The key/value pairs are as follows:

=over

=item B<file=<data.tsvE<gt>>

Defines the location of a file of input data.

=item B<fraction=<numberE<gt>>

Fraction of input data to use for training (versus testing).

=back

=item B<-i/--initialize <key=valueE<gt>>

The C<initialize> argument is used multiple times, each time followed by a key/value
pair that defines one of the initialization settings for the (genetic) structure of the
evolving population. The key/value pairs are as follows:

=over

script/aivolver view on Meta::CPAN


Output directory.

=back

=back

=head1 DESCRIPTION

Artificial neural networks (ANNs) are decision-making machines that develop their
capabilities by training on input data. During this training, the ANN builds a
topology of input neurons, hidden neurons, and output neurons that respond to signals
in ways (and with sensitivities) that are determined by a variety of parameters. How
these parameters will interact to give rise to the final functionality of the ANN is
hard to predict I<a priori>, but can be optimized in a variety of ways.

C<aivolver> is a program that does this by evolving parameter settings using a genetic
algorithm that runs for a number of generations determined by C<ngens>. During this
process it writes the intermediate ANNs into the C<workdir> until the best result is
written to the C<outfile>.

The genetic algorithm proceeds by simulating a population of C<individual_count> diploid
individuals that each have C<chromosome_count> chromosomes whose C<gene_count> genes
encode the parameters of the ANN. During each generation, each individual is trained
on a sample data set, and the individual's fitness is then calculated by testing its
predictive abilities on an out-of-sample data set. The fittest individuals (whose
fraction of the total is determined by C<reproduction_rate>) are selected for breeding
in proportion to their fitness.

Before breeding, each individual undergoes a process of mutation, where a fraction of
the ANN parameters is randomly perturbed. Both the size of the fraction and the
maximum extent of the perturbation is determined by C<mutation_rate>. Subsequently, the
homologous chromosomes recombine (i.e. exchange parameters) at a rate determined by
C<crossover_rate>, which then results in (haploid) gametes. These gametes are fused with
those of other individuals to give rise to the next generation.

=head1 TRAINING AND TEST DATA

The data that is used for training the ANNs and for subsequently testing their predictive
abilities are provided as tab-separated tables. An example of an input data set is here:

L<https://github.com/naturalis/ai-fann-evolving/blob/master/examples/butterbeetles.tsv>

The tables have a header row, with at least the following columns:

=over

=item B<ID>

The C<ID> column contains a unique identifier (a string) for each record in the data set.

=item B<CLASS>

Each C<CLASS> column (multiple are allowed) specifies the classification that should
emerge from one of the output neurons. Often this would be an integer, for example
either C<1> or C<-1> for a binary classification. The number of C<CLASS> columns
determines the number of outputs in the ANN.

=item B<[others]>

t/01-run.t view on Meta::CPAN

	}
	return '';
});

# set quieter and quicker to give up
AI::FANN::Evolving->defaults( 'epoch_printfreq' => 0, 'epochs' => 200 );

# instantiate factory
my $fac = new_ok('AI::FANN::Evolving::Factory');

# prepare data
my $data = AI::FANN::Evolving::TrainData->new( 
	'file'      => "$Bin/../examples/Cochlopetalum.tsv",
	'ignore'    => [ 'image' ],
	'dependent' => [ 'C1', 'C2', 'C3', 'C4', 'C5' ],	
);
my ( $test, $train ) = $data->partition_data( 0.5 );

# create the experiment
my $exp = $fac->create_experiment( 
	'workdir'          => tempdir( 'CLEANUP' => 1 ),
	'traindata'        => $train->to_fann,
	'factory'          => $fac,
	'env'              => $test->to_fann,
	'mutation_rate'    => 0.1,
	'ngens'            => 2,
);
isa_ok( $exp, 'Algorithm::Genetic::Diploid::Experiment' );

# initialize the experiment
ok( $exp->initialize( 'individual_count' => 2 ), "initialized" );

t/02-data.t view on Meta::CPAN

use FindBin qw($Bin);
use Test::More 'no_plan';
use AI::FANN::Evolving::TrainData;
use Algorithm::Genetic::Diploid::Logger ':levels';
use Data::Dumper;

# instantiate a data object
my $file = "$Bin/../examples/merged.tsv";
my $data = AI::FANN::Evolving::TrainData->new( 
	'file'      => $file,
	'ignore'    => [ 'image' ],
	'dependent' => [ 'C1', 'C2', 'C3', 'C4' ],
);
ok( $data, "instantiate" );

# partition the data
my ( $d1, $d2 ) = $data->partition_data(0.2);
ok( $data->size == $d1->size + $d2->size, "partition" );

# pack data as FANN struct
ok( $d1->to_fann, "packed d1" );
ok( $d2->to_fann, "packed d2" );

t/03-fann-wrapper.t view on Meta::CPAN

use strict;
use warnings;
use Test::More 'no_plan';

BEGIN {
	use_ok('AI::FANN::Evolving');
	use_ok('AI::FANN::Evolving::TrainData');
}

##########################################################################################
# create a trivial data object:
my $data = AI::FANN::Evolving::TrainData->new(
	'header' => {
		'ID'    => 0, # simple integer id for the records
		's1'    => 1, # state 1
		's2'    => 2, # state 2
		'CLASS' => 3, # dependent 'xor' state
	},
	
	# this is the xor example from:
	# http://search.cpan.org/~salva/AI-FANN-0.10/lib/AI/FANN.pm
	'table' => [
		[ 1, -1, -1, -1 ],
		[ 2, -1, +1, +1 ],
		[ 3, +1, -1, +1 ],
		[ 4, +1, +1, -1 ],	
	],
);
ok( $data->size == 4, "instantiate data correctly" );

##########################################################################################
# train the FANN object on trivial data
my $ann = AI::FANN::Evolving->new( 'data' => $data, 'epoch_printfreq' => 0 );
$ann->train($data->to_fann);

# run the network
# this is the xor example from:
# http://search.cpan.org/~salva/AI-FANN-0.10/lib/AI/FANN.pm
my @result = ( -1, +1, +1, -1 );
my @input  = ( [ -1, -1 ], [ -1, +1 ], [ +1, -1 ], [ +1, +1 ] );
for my $i ( 0 .. $#input ) {
	my $output = $ann->run($input[$i]);
	ok( ! ( $result[$i] < 0 xor $output->[0] < 0 ), "observed and expected signs match" );
}

( run in 1.064 second using v1.01-cache-2.11-cpan-c333fce770f )