Algorithm-AM
view release on metacpan or search on metacpan
lib/Algorithm/AM/DataSet.pm view on Meta::CPAN
my (%opts) = (
unknown => 'UNK',
null => '=',
@_
);
croak q[Failed to provide 'path' parameter]
unless exists $opts{path};
croak q[Failed to provide 'format' parameter]
unless exists $opts{format};
my ($path, $format, $unknown, $null) = (
path($opts{path}), @opts{'format', 'unknown', 'null'});
croak "Could not find file $path"
unless $path->exists;
my ($field_sep, $feature_sep);
if($format eq 'commas'){
# class/features/comment separated by a comma
$field_sep = qr{\s*,\s*};
# features separated by space
$feature_sep = qr{\s+};
}elsif($format eq 'nocommas'){
# class/features/comment separated by space
$field_sep = qr{\s+};
# no seps for features; each is a single character
$feature_sep = qr{};
}else{
croak "Unknown value $format for format parameter " .
q{(should be 'commas' or 'nocommas')};
}
if(!defined $unknown){
croak q[Must provide a defined value for 'unknown' parameter];
}
my $reader = _read_data_sub(
$path, $unknown, $null, $field_sep, $feature_sep);
my $item = $reader->();
if(!$item){
croak "No data found in file $path";
}
my $dataset = __PACKAGE__->new(cardinality => $item->cardinality);
$dataset->add_item($item);
while($item = $reader->()){
$dataset->add_item($item);
}
return $dataset;
}
# Build an iterator over the items stored in a data file.
#
# Arguments (positional):
#   $data_file   - file to read; must provide an openr_utf8 method (the
#                  visible caller passes a Path::Tiny object)
#   $unknown     - class label that marks an item's class as unknown
#   $null        - feature string that marks a null feature
#   $field_sep   - pattern separating class, features and comment
#   $feature_sep - pattern separating the individual features
#
# Returns a sub. Each call yields the next Algorithm::AM::DataSet::Item
# read from the file, or nothing once the file has been read completely.
# The sub croaks when a line does not contain both a class label and
# features. Blank lines and lines starting with '#' are skipped, but they
# still count towards the line number reported in error messages.
sub _read_data_sub {
    my ($data_file, $unknown, $null,
        $field_sep, $feature_sep) = @_;
    my $data_fh = $data_file->openr_utf8;
    my $line_num = 0;
    return sub {
        my $line;
        # grab the next non-blank, non-comment line from the file
        while(defined($line = <$data_fh>)){
            $line_num++;
            # skip comments
            next if $line =~ m/^\s*#/;
            # cross-platform chomp
            $line =~ s/\R$//;
            $line =~ s/^\s+|\s+$//g;
            # check the length, not the truth, of the line: a line that
            # holds only "0" is false in Perl but it is not blank, and it
            # must be reported like any other malformed line
            last if length $line;
        }
        # the loop only ends with an undefined line at end of file
        return unless defined $line;
        my ($class, $feats, $comment) = split /$field_sep/, $line, 3;
        # the line has to have at least the class label and features
        if(!defined $feats){
            croak "Couldn't read data at line $line_num in $data_file";
        }
        # if the class is specified as unknown, set it to undef to
        # indicate this to Item
        if($class eq $unknown){
            undef $class;
        }
        my @data_vars = split /$feature_sep/, $feats;
        # set null features to ''
        @data_vars = map {$_ eq $null ? '' : $_} @data_vars;
        return Algorithm::AM::DataSet::Item->new(
            features=> \@data_vars,
            class => $class,
            comment => $comment
        );
    };
}
1;
__END__
=pod
=encoding UTF-8
=head1 NAME
Algorithm::AM::DataSet - Manage data used by Algorithm::AM
=head1 VERSION
version 3.13
=head1 SYNOPSIS
use Algorithm::AM::DataSet 'dataset_from_file';
use Algorithm::AM::DataSet::Item 'new_item';
my $dataset = Algorithm::AM::DataSet->new(cardinality => 10);
# or
$dataset = dataset_from_file(path => 'finnverb', format => 'nocommas');
( run in 1.234 second using v1.01-cache-2.11-cpan-5735350b133 )