view release on metacpan or search on metacpan
lib/AI/FANN/Evolving/TrainData.pm
LICENSE
Makefile.PL
MANIFEST This list of files
MYMETA.json
MYMETA.yml
README.md
script/aivolver
t/00-load.t
t/01-run.t
t/02-data.t
t/03-fann-wrapper.t
t/perl-critic.t
t/perlcriticrc
t/pod-coverage.t
t/pod.t
META.yml Module YAML meta-data (added by MakeMaker)
META.json Module JSON meta-data (added by MakeMaker)
lib/AI/FANN/Evolving.pm view on Meta::CPAN
=head1 NAME
AI::FANN::Evolving - artificial neural network that evolves
=head1 METHODS
=over
=item new
Constructor requires 'file', or 'data' and 'neurons' arguments. Optionally takes
'connection_rate' argument for sparse topologies. Returns a wrapper around L<AI::FANN>.
=cut
sub new {
my $class = shift;
my %args = @_;
my $self = {};
bless $self, $class;
$self->_init(%args);
# de-serialize from a file
if ( my $file = $args{'file'} ) {
$self->{'ann'} = AI::FANN->new_from_file($file);
$log->debug("instantiating from file $file");
return $self;
}
# build new topology from input data
elsif ( my $data = $args{'data'} ) {
$log->debug("instantiating from data $data");
$data = $data->to_fann if $data->isa('AI::FANN::Evolving::TrainData');
# prepare arguments
my $neurons = $args{'neurons'} || ( $data->num_inputs + 1 );
my @sizes = (
$data->num_inputs,
$neurons,
$data->num_outputs
);
# build topology
if ( $args{'connection_rate'} ) {
$self->{'ann'} = AI::FANN->new_sparse( $args{'connection_rate'}, @sizes );
}
else {
$self->{'ann'} = AI::FANN->new_standard( @sizes );
}
lib/AI/FANN/Evolving.pm view on Meta::CPAN
$ann->num_inputs,
$ann->num_inputs + 1,
$ann->num_outputs,
);
# copy the AI::FANN properties
$ann->template($self->{'ann'});
return $self;
}
else {
die "Need 'file', 'data' or 'ann' argument!";
}
}
=item template
Uses the object as a template for the properties of the argument, e.g.
$ann1->template($ann2) applies the properties of $ann1 to $ann2
=cut
lib/AI/FANN/Evolving.pm view on Meta::CPAN
=cut
sub clone {
my $self = shift;
$log->debug("cloning...");
# we delete the reference here so we can use
# Algorithm::Genetic::Diploid::Base's cloning method, which
# dumps and loads from YAML. This wouldn't work if the
# reference is still attached because it cannot be
# stringified, being an XS data structure
my $ann = delete $self->{'ann'};
my $clone = $self->SUPER::clone;
# clone the ANN by writing it to a temp file in "FANN/FLO"
# format and reading that back in, then delete the file
my ( $fh, $file ) = tempfile();
close $fh;
$ann->save($file);
$clone->{'ann'} = __PACKAGE__->new_from_file($file);
unlink $file;
# now re-attach the original ANN to the invocant
$self->{'ann'} = $ann;
return $clone;
}
=item train
Trains the AI on the provided data object
=cut
sub train {
my ( $self, $data ) = @_;
if ( $self->train_type eq 'cascade' ) {
$log->debug("cascade training");
# set learning curve
$self->cascade_activation_functions( $self->activation_function );
# train
$self->{'ann'}->cascadetrain_on_data(
$data,
$self->neurons,
$self->neuron_printfreq,
$self->error,
);
}
else {
$log->debug("normal training");
# set learning curves
$self->hidden_activation_function( $self->activation_function );
$self->output_activation_function( $self->activation_function );
# train
$self->{'ann'}->train_on_data(
$data,
$self->epochs,
$self->epoch_printfreq,
$self->error,
);
}
}
=item enum_properties
Returns a hash whose keys are names of enums and values the possible states for the
lib/AI/FANN/Evolving/Experiment.pm view on Meta::CPAN
my $value = shift;
$log->info("assigning new workdir $value");
$self->{'workdir'} = $value;
}
else {
$log->debug("retrieving workdir");
}
return $self->{'workdir'};
}
=item traindata
Getter/setter for the L<AI::FANN::TrainData> object.
=cut
sub traindata {
my $self = shift;
if ( @_ ) {
my $value = shift;
$log->info("assigning new traindata $value");
$self->{'traindata'} = $value;
}
else {
$log->debug("retrieving traindata");
}
return $self->{'traindata'};
}
=item run
Runs the experiment!
=cut
sub run {
my $self = shift;
lib/AI/FANN/Evolving/Factory.pm view on Meta::CPAN
use strict;
use Algorithm::Genetic::Diploid;
use base 'Algorithm::Genetic::Diploid::Factory';
our $AUTOLOAD;
my %defaults = (
'experiment' => 'AI::FANN::Evolving::Experiment',
'chromosome' => 'AI::FANN::Evolving::Chromosome',
'gene' => 'AI::FANN::Evolving::Gene',
'traindata' => 'AI::FANN::Evolving::TrainData',
);
=head1 NAME
AI::FANN::Evolving::Factory - creator of objects
=head1 METHODS
=over
=item new
Constructor takes named arguments. Key is a short name (e.g. 'traindata'), value is a
fully qualified package name (e.g. L<AI::FANN::TrainData>) from which to instantiate
objects identified by the short name.
=back
=cut
sub new { shift->SUPER::new(%defaults,@_) }
1;
lib/AI/FANN/Evolving/Gene.pm view on Meta::CPAN
Constructor is passed named arguments. Instantiates a trained L<AI::FANN::Evolving> ANN
=cut
sub new {
# initialize self up the inheritance tree
my $self = shift->SUPER::new(@_);
# instantiate and train the FANN object
my $traindata = $self->experiment->traindata;
$self->ann( AI::FANN::Evolving->new( 'data' => $traindata ) );
return $self;
}
=item ann
Getter/setter for an L<AI::FANN::Evolving> ANN
=cut
sub ann {
lib/AI/FANN/Evolving/Gene.pm view on Meta::CPAN
sub make_function {
my $self = shift;
my $ann = $self->ann;
my $error_func = $self->experiment->error_func;
$log->debug("making fitness function");
# build the fitness function
return sub {
# train the AI
$ann->train( $self->experiment->traindata );
# isa TrainingData object, this is what we need to use
# to make our prognostications. It is a different data
# set (out of sample) than the TrainingData object that
# the AI was trained on.
my $env = shift;
# this is a number which we try to keep as near to zero
# as possible
my $fitness = 0;
# iterate over the list of input/output pairs
for my $i ( 0 .. ( $env->length - 1 ) ) {
my ( $input, $expected ) = $env->data($i);
my $observed = $ann->run($input);
use Data::Dumper;
$log->debug("Observed: ".Dumper($observed));
$log->debug("Expected: ".Dumper($expected));
# invoke the error_func provided by the experiment
$fitness += $error_func->($observed,$expected);
}
$fitness /= $env->length;
lib/AI/FANN/Evolving/TrainData.pm view on Meta::CPAN
use List::Util 'shuffle';
use AI::FANN ':all';
use Algorithm::Genetic::Diploid::Base;
use base 'Algorithm::Genetic::Diploid::Base';
our $AUTOLOAD;
my $log = __PACKAGE__->logger;
=head1 NAME
AI::FANN::Evolving::TrainData - wrapper class for FANN data
=head1 METHODS
=over
=item new
Constructor takes named arguments. By default, ignores column
named ID and considers column named CLASS as classifier.
lib/AI/FANN/Evolving/TrainData.pm view on Meta::CPAN
sub new {
my $self = shift->SUPER::new(
'ignore' => [ 'ID' ],
'dependent' => [ 'CLASS' ],
'header' => {},
'table' => [],
@_
);
my %args = @_;
$self->read_data($args{'file'}) if $args{'file'};
$self->trim_data if $args{'trim'};
return $self;
}
=item ignore_columns
Getter/setter for column names to ignore in the train data structure.
For example: an identifier columns named 'ID'
=cut
sub ignore_columns {
my $self = shift;
$self->{'ignore'} = \@_ if @_;
return @{ $self->{'ignore'} };
}
lib/AI/FANN/Evolving/TrainData.pm view on Meta::CPAN
=cut
sub predictor_columns {
my $self = shift;
my @others = ( $self->ignore_columns, $self->dependent_columns );
my %skip = map { $_ => 1 } @others;
return grep { ! $skip{$_} } keys %{ $self->{'header'} };
}
=item predictor_data
Getter for rows of input values
=cut
sub predictor_data {
my ( $self, %args ) = @_;
my $i = $args{'row'};
my @cols = $args{'cols'} ? @{ $args{'cols'} } : $self->predictor_columns;
# build hash of indices to keep
my %keep = map { $self->{'header'}->{$_} => 1 } @cols;
# only return a single row
if ( defined $i ) {
my @pred;
for my $j ( 0 .. $#{ $self->{'table'}->[$i] } ) {
push @pred, $self->{'table'}->[$i]->[$j] if $keep{$j};
}
return \@pred;
}
else {
my @preds;
my $max = $self->size - 1;
for my $j ( 0 .. $max ) {
push @preds, $self->predictor_data( 'row' => $j, 'cols' => \@cols);
}
return @preds;
}
}
=item dependent_data
Getter for dependent (classifier) data
=cut
sub dependent_data {
my ( $self, $i ) = @_;
my @dc = map { $self->{'header'}->{$_} } $self->dependent_columns;
if ( defined $i ) {
return [ map { $self->{'table'}->[$i]->[$_] } @dc ];
}
else {
my @dep;
for my $j ( 0 .. $self->size - 1 ) {
push @dep, $self->dependent_data($j);
}
return @dep;
}
}
=item read_data
Reads provided input file
=cut
sub read_data {
my ( $self, $file ) = @_; # file is tab-delimited
$log->debug("reading data from file $file");
open my $fh, '<', $file or die "Can't open $file: $!";
my ( %header, @table );
while(<$fh>) {
chomp;
next if /^\s*$/;
my @fields = split /\t/, $_;
if ( not %header ) {
my $i = 0;
%header = map { $_ => $i++ } @fields;
}
else {
push @table, \@fields;
}
}
$self->{'header'} = \%header;
$self->{'table'} = \@table;
return $self;
}
=item write_data
Writes to provided output file
=cut
sub write_data {
my ( $self, $file ) = @_;
# use file or STDOUT
my $fh;
if ( $file ) {
open $fh, '>', $file or die "Can't write to $file: $!";
$log->info("writing data to $file");
}
else {
$fh = \*STDOUT;
$log->info("writing data to STDOUT");
}
# print header
my $h = $self->{'header'};
print $fh join "\t", sort { $h->{$a} <=> $h->{$b} } keys %{ $h };
print $fh "\n";
# print rows
for my $row ( @{ $self->{'table'} } ) {
print $fh join "\t", @{ $row };
print $fh "\n";
}
}
=item trim_data
Trims sparse rows with missing values
=cut
sub trim_data {
my $self = shift;
my @trimmed;
ROW: for my $row ( @{ $self->{'table'} } ) {
next ROW if grep { not defined $_ } @{ $row };
push @trimmed, $row;
}
my $num = $self->{'size'} - scalar @trimmed;
$log->info("removed $num incomplete rows");
$self->{'table'} = \@trimmed;
}
=item sample_data
Sample a fraction of the data
=cut
sub sample_data {
my $self = shift;
my $sample = shift || 0.5;
my $clone1 = $self->clone;
my $clone2 = $self->clone;
my $size = $self->size;
my @sample;
$clone2->{'table'} = \@sample;
while( scalar(@sample) < int( $size * $sample ) ) {
my @shuffled = shuffle( @{ $clone1->{'table'} } );
push @sample, shift @shuffled;
$clone1->{'table'} = \@shuffled;
}
return $clone2, $clone1;
}
=item partition_data
Creates two clones that partition the data according to the provided ratio.
=cut
sub partition_data {
my $self = shift;
my $sample = shift || 0.5;
my $clone1 = $self->clone;
my $clone2 = $self->clone;
my $remain = 1 - $sample;
$log->info("going to partition into $sample : $remain");
# compute number of different dependent patterns and ratios of each
my @dependents = $self->dependent_data;
my %seen;
for my $dep ( @dependents ) {
my $key = join '/', @{ $dep };
$seen{$key}++;
}
# adjust counts to sample size
for my $key ( keys %seen ) {
$log->debug("counts: $key => $seen{$key}");
$seen{$key} = int( $seen{$key} * $sample );
lib/AI/FANN/Evolving/TrainData.pm view on Meta::CPAN
}
}
}
$clone2->{'table'} = \@new_table;
$clone1->{'table'} = \@table;
return $clone2, $clone1;
}
=item size
Returns the number of data records
=cut
sub size { scalar @{ shift->{'table'} } }
=item to_fann
Packs data into an L<AI::FANN> TrainData structure
=cut
sub to_fann {
$log->debug("encoding data as FANN struct");
my $self = shift;
my @cols = @_ ? @_ : $self->predictor_columns;
my @deps = $self->dependent_data;
my @pred = $self->predictor_data( 'cols' => \@cols );
my @interdigitated;
for my $i ( 0 .. $#deps ) {
push @interdigitated, $pred[$i], $deps[$i];
}
return AI::FANN::TrainData->new(@interdigitated);
}
=back
=cut
script/aivolver view on Meta::CPAN
use YAML::Any 'LoadFile';
use File::Path 'make_path';
use AI::FANN::Evolving;
use AI::FANN::Evolving::TrainData;
use Algorithm::Genetic::Diploid::Logger ':levels';
# initialize config variables
my $verbosity = WARN; # log level
my $formatter = 'simple'; # log formatter
my %initialize; # settings to start the population
my %data; # train and test data files
my %experiment; # experiment settings
my %ann; # ANN settings
my $outfile;
# there are no arguments
if ( not @ARGV ) {
pod2usage( '-verbose' => 0 );
}
# first argument is a config file
if ( -e $ARGV[0] ) {
my $conf = shift;
my $yaml = LoadFile($conf);
$outfile = $yaml->{'outfile'} if defined $yaml->{'outfile'};
$verbosity = $yaml->{'verbosity'} if defined $yaml->{'verbosity'};
$formatter = $yaml->{'formatter'} if defined $yaml->{'formatter'};
%initialize = %{ $yaml->{'initialize'} } if defined $yaml->{'initialize'};
%data = %{ $yaml->{'data'} } if defined $yaml->{'data'};
%experiment = %{ $yaml->{'experiment'} } if defined $yaml->{'experiment'};
%ann = %{ $yaml->{'ann'} } if defined $yaml->{'ann'};
}
# process command line arguments
GetOptions(
'verbose+' => \$verbosity,
'formatter=s' => \$formatter,
'outfile=s' => \$outfile,
'initialize=s' => \%initialize,
'data=s' => \%data,
'experiment=s' => \%experiment,
'ann=s' => \%ann,
'help|?' => sub { pod2usage( '-verbose' => 1 ) },
'manual' => sub { pod2usage( '-verbose' => 2 ) },
);
# configure ANN
AI::FANN::Evolving->defaults(%ann);
# configure logger
my $log = Algorithm::Genetic::Diploid::Logger->new;
$log->level( 'level' => $verbosity );
$log->formatter( $formatter );
# read input data
my $deps = join ', ', @{ $data{'dependent'} };
my $ignore = join ', ', @{ $data{'ignore'} };
$log->info("going to read train data $data{file}, ignoring '$ignore', dependent columns are '$deps'");
my $inputdata = AI::FANN::Evolving::TrainData->new(
'file' => $data{'file'},
'dependent' => $data{'dependent'},
'ignore' => $data{'ignore'},
);
my ( $traindata, $testdata );
if ( $data{'type'} and lc $data{'type'} eq 'continuous' ) {
( $traindata, $testdata ) = $inputdata->sample_data( $data{'fraction'} );
}
else {
( $traindata, $testdata ) = $inputdata->partition_data( $data{'fraction'} );
}
$log->info("number of training data records: ".$traindata->size);
$log->info("number of test data records: ".$testdata->size);
# create first work dir
my $wd = delete $experiment{'workdir'};
make_path($wd);
$wd .= '/0';
# create the experiment
my $exp = AI::FANN::Evolving::Experiment->new(
'traindata' => $traindata->to_fann,
'env' => $testdata->to_fann,
'workdir' => $wd,
%experiment,
);
# initialize the experiment
$exp->initialize(%initialize);
# run!
my ( $fittest, $fitness ) = $exp->run();
$log->info("*** overall best fitness: $fitness");
script/aivolver view on Meta::CPAN
Prints manual page and exits.
=item B<-v/--verbose>
Increments verbosity of the process. Can be used multiple times.
=item B<-o/--outfile <file.annE<gt>>
File name for the fittest ANN file over all generations.
=item B<-d/--data <key=valueE<gt>>
The C<data> argument is used multiple times, each time followed by a key/value pair
that defines the location of one of the data files. The key/value pairs are as follows:
=over
=item B<file=<data.tsvE<gt>>
Defines the location of a file of input data.
=item B<fraction=<numberE<gt>>
Fraction of input data to use for training (versus testing).
=back
=item B<-i/--initialize <key=valueE<gt>>
The C<initialize> argument is used multiple times, each time followed by a key/value
pair that defines one of the initialization settings for the (genetic) structure of the
evolving population. The key/value pairs are as follows:
=over
script/aivolver view on Meta::CPAN
Output directory.
=back
=back
=head1 DESCRIPTION
Artificial neural networks (ANNs) are decision-making machines that develop their
capabilities by training on input data. During this training, the ANN builds a
topology of input neurons, hidden neurons, and output neurons that respond to signals
in ways (and with sensitivities) that are determined by a variety of parameters. How
these parameters will interact to give rise to the final functionality of the ANN is
hard to predict I<a priori>, but can be optimized in a variety of ways.
C<aivolver> is a program that does this by evolving parameter settings using a genetic
algorithm that runs for a number of generations determined by C<ngens>. During this
process it writes the intermediate ANNs into the C<workdir> until the best result is
written to the C<outfile>.
The genetic algorithm proceeds by simulating a population of C<individual_count> diploid
individuals that each have C<chromosome_count> chromosomes whose C<gene_count> genes
encode the parameters of the ANN. During each generation, each individual is trained
on a sample data set, and the individual's fitness is then calculated by testing its
predictive abilities on an out-of-sample data set. The fittest individuals (whose
fraction of the total is determined by C<reproduction_rate>) are selected for breeding
in proportion to their fitness.
Before breeding, each individual undergoes a process of mutation, where a fraction of
the ANN parameters is randomly perturbed. Both the size of the fraction and the
maximum extent of the perturbation is determined by C<mutation_rate>. Subsequently, the
homologous chromosomes recombine (i.e. exchange parameters) at a rate determined by
C<crossover_rate>, which then results in (haploid) gametes. These gametes are fused with
those of other individuals to give rise to the next generation.
=head1 TRAINING AND TEST DATA
The data that is used for training the ANNs and for subsequently testing their predictive
abilities are provided as tab-separated tables. An example of an input data set is here:
L<https://github.com/naturalis/ai-fann-evolving/blob/master/examples/butterbeetles.tsv>
The tables have a header row, with at least the following columns:
=over
=item B<ID>
The C<ID> column contains a unique identifier (a string) for each record in the data set.
=item B<CLASS>
Each C<CLASS> column (multiple are allowed) specifies the classification that should
emerge from one of the output neurons. Often this would be an integer, for example
either C<1> or C<-1> for a binary classification. The number of C<CLASS> columns
determines the number of outputs in the ANN.
=item B<[others]>
}
return '';
});
# set quieter and quicker to give up
AI::FANN::Evolving->defaults( 'epoch_printfreq' => 0, 'epochs' => 200 );
# instantiate factory
my $fac = new_ok('AI::FANN::Evolving::Factory');
# prepare data
my $data = AI::FANN::Evolving::TrainData->new(
'file' => "$Bin/../examples/Cochlopetalum.tsv",
'ignore' => [ 'image' ],
'dependent' => [ 'C1', 'C2', 'C3', 'C4', 'C5' ],
);
my ( $test, $train ) = $data->partition_data( 0.5 );
# create the experiment
my $exp = $fac->create_experiment(
'workdir' => tempdir( 'CLEANUP' => 1 ),
'traindata' => $train->to_fann,
'factory' => $fac,
'env' => $test->to_fann,
'mutation_rate' => 0.1,
'ngens' => 2,
);
isa_ok( $exp, 'Algorithm::Genetic::Diploid::Experiment' );
# initialize the experiment
ok( $exp->initialize( 'individual_count' => 2 ), "initialized" );
t/02-data.t view on Meta::CPAN
use FindBin qw($Bin);
use Test::More 'no_plan';
use AI::FANN::Evolving::TrainData;
use Algorithm::Genetic::Diploid::Logger ':levels';
use Data::Dumper;
# instantiate a data object
my $file = "$Bin/../examples/merged.tsv";
my $data = AI::FANN::Evolving::TrainData->new(
'file' => $file,
'ignore' => [ 'image' ],
'dependent' => [ 'C1', 'C2', 'C3', 'C4' ],
);
ok( $data, "instantiate" );
# partition the data
my ( $d1, $d2 ) = $data->partition_data(0.2);
ok( $data->size == $d1->size + $d2->size, "partition" );
# pack data as FANN struct
ok( $d1->to_fann, "packed d1" );
ok( $d2->to_fann, "packed d2" );
t/03-fann-wrapper.t view on Meta::CPAN
use strict;
use warnings;
use Test::More 'no_plan';
BEGIN {
use_ok('AI::FANN::Evolving');
use_ok('AI::FANN::Evolving::TrainData');
}
##########################################################################################
# create a trivial data object:
my $data = AI::FANN::Evolving::TrainData->new(
'header' => {
'ID' => 0, # simple integer id for the records
's1' => 1, # state 1
's2' => 2, # state 2
'CLASS' => 3, # dependent 'xor' state
},
# this is the xor example from:
# http://search.cpan.org/~salva/AI-FANN-0.10/lib/AI/FANN.pm
'table' => [
[ 1, -1, -1, -1 ],
[ 2, -1, +1, +1 ],
[ 3, +1, -1, +1 ],
[ 4, +1, +1, -1 ],
],
);
ok( $data->size == 4, "instantiate data correctly" );
##########################################################################################
# train the FANN object on trivial data
my $ann = AI::FANN::Evolving->new( 'data' => $data, 'epoch_printfreq' => 0 );
$ann->train($data->to_fann);
# run the network
# this is the xor example from:
# http://search.cpan.org/~salva/AI-FANN-0.10/lib/AI/FANN.pm
my @result = ( -1, +1, +1, -1 );
my @input = ( [ -1, -1 ], [ -1, +1 ], [ +1, -1 ], [ +1, +1 ] );
for my $i ( 0 .. $#input ) {
my $output = $ann->run($input[$i]);
ok( ! ( $result[$i] < 0 xor $output->[0] < 0 ), "observed and expected signs match" );
}