Algorithm-AM

 view release on metacpan or  search on metacpan

lib/Algorithm/AM/DataSet.pm  view on Meta::CPAN

    my (%opts) = (
        unknown => 'UNK',
        null => '=',
        @_
    );

    croak q[Failed to provide 'path' parameter]
        unless exists $opts{path};
    croak q[Failed to provide 'format' parameter]
        unless exists $opts{format};

    my ($path, $format, $unknown, $null) = (
        path($opts{path}), @opts{'format', 'unknown', 'null'});

    croak "Could not find file $path"
        unless $path->exists;

    my ($field_sep, $feature_sep);
    if($format eq 'commas'){
        # class/features/comment separated by a comma
        $field_sep   = qr{\s*,\s*};
        # features separated by space
        $feature_sep = qr{\s+};
    }elsif($format eq 'nocommas'){
        # class/features/comment separated by space
        $field_sep   = qr{\s+};
        # no seps for features; each is a single character
        $feature_sep = qr{};
    }else{
        croak "Unknown value $format for format parameter " .
            q{(should be 'commas' or 'nocommas')};
    }

    if(!defined $unknown){
        croak q[Must provide a defined value for 'unknown' parameter];
    }

    my $reader = _read_data_sub(
        $path, $unknown, $null, $field_sep, $feature_sep);
    my $item = $reader->();
    if(!$item){
        croak "No data found in file $path";
    }
    my $dataset = __PACKAGE__->new(cardinality => $item->cardinality);
    $dataset->add_item($item);
    while($item = $reader->()){
        $dataset->add_item($item);
    }
    return $dataset;
}

# return a sub that returns one Item per call from the given FH,
# and returns undef once the file is done being read. Throws errors
# on bad file contents.
# Input is file (Path::Tiny), string representing unknown class,
# string representing null feature, field separator (class,
# features, comment) and feature separator
sub _read_data_sub {
    my ($data_file, $unknown, $null,
        $field_sep, $feature_sep) = @_;
    my $data_fh = $data_file->openr_utf8;
    my $line_num = 0;
    return sub {
        my $line;
        # grab the next non-blank line from the file
        while($line = <$data_fh>){
            $line_num++;
            # skip comments
            next if $line =~ m/^\s*#/;
            # cross-platform chomp
            $line =~ s/\R$//;
            $line =~ s/^\s+|\s+$//g;
            last if $line;
        }
        return unless $line;
        my ($class, $feats, $comment) = split /$field_sep/, $line, 3;
        # the line has to have at least the class label and features
        if(!defined $feats){
            croak "Couldn't read data at line $line_num in $data_file";
        }
        # if the class is specified as unknown, set it to undef to
        # indicate this to Item
        if($class eq $unknown){
            undef $class;
        }

        my @data_vars = split /$feature_sep/, $feats;
        # set null features to ''
        @data_vars = map {$_ eq $null ? '' : $_} @data_vars;

        return Algorithm::AM::DataSet::Item->new(
            features=> \@data_vars,
            class => $class,
            comment => $comment
        );
    };
}

1;

__END__

=pod

=encoding UTF-8

=head1 NAME

Algorithm::AM::DataSet - Manage data used by Algorithm::AM

=head1 VERSION

version 3.13

=head1 SYNOPSIS

 use Algorithm::AM::DataSet 'dataset_from_file';
 use Algorithm::AM::DataSet::Item 'new_item';
 my $dataset = Algorithm::AM::DataSet->new(cardinality => 10);
 # or
 $dataset = dataset_from_file(path => 'finnverb', format => 'nocommas');



( run in 1.234 second using v1.01-cache-2.11-cpan-5735350b133 )