Algorithm-AM

 view release on metacpan or  search on metacpan

lib/Algorithm/AM/DataSet.pm  view on Meta::CPAN

use Algorithm::AM::DataSet::Item;
use Path::Tiny;
use Exporter::Easy (
    OK => ['dataset_from_file']
);

#pod =head1 SYNOPSIS
#pod
#pod  use Algorithm::AM::DataSet 'dataset_from_file';
#pod  use Algorithm::AM::DataSet::Item 'new_item';
#pod  my $dataset = Algorithm::AM::DataSet->new(cardinality => 10);
#pod  # or
#pod  $dataset = dataset_from_file(path => 'finnverb', format => 'nocommas');
#pod  $dataset->add_item(
#pod    new_item(features => [qw(a b c d e f g h i)]));
#pod  my $item = $dataset->get_item(2);
#pod
#pod =head1 DESCRIPTION
#pod
#pod This package contains a list of items that can be used by
#pod L<Algorithm::AM> or L<Algorithm::AM::Batch> for classification.
#pod DataSets can be made one item at a time via the L</add_item> method,
#pod or they can be read from files via the L</dataset_from_file> function.
#pod
#pod =head2 C<new>
#pod
#pod Creates a new DataSet object. You must provide a C<cardinality> argument
#pod indicating the number of features to be contained in each data vector.
#pod You can then add items via the add_item method. Each item will contain
#pod a feature vector, and also optionally a class label and a comment
#pod (also called a "spec").
#pod
#pod =cut
sub new {
    my ($class, %opts) = @_;

    my $new_opts = _check_opts(%opts);

    my $self = bless $new_opts, $class;

    $self->_init;

    return $self;
}

# check the options for validity
# Return an option hash to initialize $self with
# For now only 'cardinality' is allowed/required.
sub _check_opts {
    my (%opts) = @_;

    my %final_opts;

    if(!defined $opts{cardinality}){
        croak q{Failed to provide 'cardinality' parameter};
    }
    $final_opts{cardinality} = $opts{cardinality};
    delete $opts{cardinality};

    if(keys %opts){
        # sort the keys in the error message to make testing possible
        croak 'Unknown parameters in DataSet constructor: ' .
            (join ', ', sort keys %opts);
    }

    return \%final_opts;
}

# initialize internal state
sub _init {
    my ($self) = @_;
    # contains all of the items in the dataset
    $self->{items} = [];

    # map unique class labels to unique integers;
    # these are the indices of the class labels in class_list below;
    # the indices must start at 1 for AM to work, as 0 is reserved
    # for heterogeneity.
    $self->{class_num_index} = {};
    # contains the list of class strings in an order that matches
    # the indices in class_num_index
    $self->{class_list} = [];
    # the total number of different classes contained in the data set
    $self->{num_classes} = 0;
    return;
}

#pod =head2 C<cardinality>
#pod
#pod Returns the number of features contained in the feature vector of a
#pod single item.
#pod
#pod =cut
sub cardinality {
    my ($self) = @_;
    return $self->{cardinality};
}

#pod =head2 C<size>
#pod
#pod Returns the number of items in the data set.
#pod
#pod =cut
sub size {
    my ($self) = @_;
    return scalar @{$self->{items}};
}

#pod =head2 C<classes>
#pod
#pod Returns the list of all unique class labels in the data set.
#pod
#pod =cut
sub classes {
    my ($self) = @_;
    return @{ $self->{class_list} };
}

#pod =head2 C<add_item>
#pod
#pod Adds a new item to the data set. The input may be either an

lib/Algorithm/AM/DataSet.pm  view on Meta::CPAN

#pod whitespace. Each feature value must be a single character with no
#pod separating characters, so here the features are f, e, a, t, u, r,
#pod e, and s.
#pod
#pod Lines beginning with a pound character (C<#>) are ignored.
#pod
#pod =cut
sub dataset_from_file {## no critic (RequireArgUnpacking)
    my (%opts) = (
        unknown => 'UNK',
        null => '=',
        @_
    );

    croak q[Failed to provide 'path' parameter]
        unless exists $opts{path};
    croak q[Failed to provide 'format' parameter]
        unless exists $opts{format};

    my ($path, $format, $unknown, $null) = (
        path($opts{path}), @opts{'format', 'unknown', 'null'});

    croak "Could not find file $path"
        unless $path->exists;

    my ($field_sep, $feature_sep);
    if($format eq 'commas'){
        # class/features/comment separated by a comma
        $field_sep   = qr{\s*,\s*};
        # features separated by space
        $feature_sep = qr{\s+};
    }elsif($format eq 'nocommas'){
        # class/features/comment separated by space
        $field_sep   = qr{\s+};
        # no seps for features; each is a single character
        $feature_sep = qr{};
    }else{
        croak "Unknown value $format for format parameter " .
            q{(should be 'commas' or 'nocommas')};
    }

    if(!defined $unknown){
        croak q[Must provide a defined value for 'unknown' parameter];
    }

    my $reader = _read_data_sub(
        $path, $unknown, $null, $field_sep, $feature_sep);
    my $item = $reader->();
    if(!$item){
        croak "No data found in file $path";
    }
    my $dataset = __PACKAGE__->new(cardinality => $item->cardinality);
    $dataset->add_item($item);
    while($item = $reader->()){
        $dataset->add_item($item);
    }
    return $dataset;
}

# return a sub that returns one Item per call from the given FH,
# and returns undef once the file is done being read. Throws errors
# on bad file contents.
# Input is file (Path::Tiny), string representing unknown class,
# string representing null feature, field separator (class,
# features, comment) and feature separator
sub _read_data_sub {
    my ($data_file, $unknown, $null,
        $field_sep, $feature_sep) = @_;
    my $data_fh = $data_file->openr_utf8;
    my $line_num = 0;
    return sub {
        my $line;
        # grab the next non-blank line from the file
        while($line = <$data_fh>){
            $line_num++;
            # skip comments
            next if $line =~ m/^\s*#/;
            # cross-platform chomp
            $line =~ s/\R$//;
            $line =~ s/^\s+|\s+$//g;
            last if $line;
        }
        return unless $line;
        my ($class, $feats, $comment) = split /$field_sep/, $line, 3;
        # the line has to have at least the class label and features
        if(!defined $feats){
            croak "Couldn't read data at line $line_num in $data_file";
        }
        # if the class is specified as unknown, set it to undef to
        # indicate this to Item
        if($class eq $unknown){
            undef $class;
        }

        my @data_vars = split /$feature_sep/, $feats;
        # set null features to ''
        @data_vars = map {$_ eq $null ? '' : $_} @data_vars;

        return Algorithm::AM::DataSet::Item->new(
            features=> \@data_vars,
            class => $class,
            comment => $comment
        );
    };
}

1;

__END__

=pod

=encoding UTF-8

=head1 NAME

Algorithm::AM::DataSet - Manage data used by Algorithm::AM

=head1 VERSION

version 3.13



( run in 1.159 second using v1.01-cache-2.11-cpan-4991d5b9bd9 )