AI-Categorizer
view release on metacpan - search on metacpan
view release on metacpan or search on metacpan
lib/AI/Categorizer/Learner/Weka.pm view on Meta::CPAN
return $scores{1} || 0; # Not sure what weka's scores represent...
}
# Categorizes every document of a collection in one batch.
#
# Named arguments:
#   collection - object whose next() method yields documents until it
#                returns a false value (required; dies if missing).
#
# All documents are written to a single ARFF file, then each per-category
# model listed in $self->{model}{learners} is run over that file through
# do_cmd(). Returns an 'experiment' delayed object holding, per document,
# the assigned category names, the document's own category names, and the
# document name.
sub categorize_collection {
  my ($self, %args) = @_;
  my $collection = $args{collection} or die "No collection provided";

  # Drain the iterator up front so documents can be looked up by the
  # index that appears in the classifier output.
  my @alldocs;
  while (my $doc = $collection->next) {
    push @alldocs, $doc;
  }

  # The category column is written as 0 for every document.
  my $doc_file = $self->create_arff_file("docs", [map { [$_->features, 0] } @alldocs]);

  my $learners = $self->{model}{learners};
  my @assigned;   # $assigned[doc index]{category} = score

  foreach my $cat (keys %$learners) {
    my $machine_file = File::Spec->catfile($self->{model}{_in_dir}, "${cat}_model");

    my @output = $self->do_cmd(
      $self->{java_path},
      @{$self->{java_args}},
      $self->{weka_classifier},
      '-l', $machine_file,
      '-T', $doc_file,
      '-p', 0,
    );

    foreach my $line (@output) {
      next if $line !~ /\S/;

      # Expected fields: index, predicted value, score, one further token,
      # e.g.  0 large.elem 0.4515551620220952 numberth.high
      my ($index, $predicted, $score)
        = $line =~ /^([\d.]+)\s+(\S+)\s+([\d.]+)\s+(\S+)/;
      if (!defined $index) {
        warn "Can't parse line $line";
        next;
      }

      # Not sure what weka's scores represent
      if ($predicted) {
        $assigned[$index]{$cat} = $score;
      }

      if ($self->verbose) {
        my $correct = $alldocs[$index]->is_in_category($cat) ? 1 : 0;
        print STDERR "$index: assigned=($predicted) correct=(", $correct, ")\n";
      }
    }
  }

  my @category_names = map { $_->name } $self->categories;
  my $experiment = $self->create_delayed_object('experiment', categories => \@category_names);

  foreach my $i (0 .. $#alldocs) {
    my $doc = $alldocs[$i];
    # Documents no model claimed have no entry in @assigned.
    my $assigned_for_doc = $assigned[$i] || {};
    $experiment->add_result(
      [keys %$assigned_for_doc],
      [map { $_->name } $doc->categories],
      $doc->name,
    );
  }

  return $experiment;
}
# Runs an external command and returns what it wrote to standard output.
#
# Arguments:
#   @cmd - the program followed by its arguments, passed to exec() as a list.
#
# Returns the output lines (line endings included) in list context.
#
# Dies if the child process cannot be forked. Warns with the command and $?
# when closing the pipe fails. When $self->verbose is true, the command line
# is echoed to STDERR before it runs.
sub do_cmd {
  my ($self, @cmd) = @_;
  print STDERR " % @cmd\n" if $self->verbose;

  # Forking open: the parent gets the child's pid and reads the child's
  # STDOUT from the handle; the child gets 0.
  my $pid = open(my $kid_to_read, "-|");

  # undef means the fork itself failed. Without this check the parent would
  # fall through to the child branch and exec() the command, replacing the
  # running program.
  die "Can't fork to run @cmd: $!" unless defined $pid;

  if ($pid == 0) {  # child
    exec(@cmd) or die "Can't exec @cmd: $!";
  }

  # parent
  my @output = <$kid_to_read>;
  close($kid_to_read) or warn "@cmd exited $?";
  return @output;
}
# Writes a set of documents to a new temporary ARFF file in Weka's sparse
# format and returns the file's name.
#
# Arguments:
#   $name - prefix for the temp file name (the template is "${name}_XXXX"
#           with an ".arff" suffix).
#   $docs - array ref of [$features, $cat] pairs; $features must provide
#           as_hash(), $cat is written as the value of the category attribute.
#   $dir  - directory for the file; defaults to $self->{model}{_in_dir}.
#
# One REAL attribute is declared per entry of $self->{model}{all_features},
# followed by a nominal "category" attribute. Features of a document that are
# not listed in all_features are left out of its data row.
#
# Dies if the file cannot be completely written.
sub create_arff_file {
  my ($self, $name, $docs, $dir) = @_;
  $dir = $self->{model}{_in_dir} unless defined $dir;

  my ($fh, $filename) = File::Temp::tempfile(
    $name . "_XXXX",  # Template
    DIR    => $dir,
    SUFFIX => '.arff',
  );

  print $fh "\@RELATION foo\n\n";

  my $feature_names = $self->{model}{all_features};
  foreach my $feature_name (@$feature_names) {
    print $fh "\@ATTRIBUTE feature-$feature_name REAL\n";
  }
  print $fh "\@ATTRIBUTE category {1, 0}\n\n";

  my %feature_indices = map {$feature_names->[$_], $_} 0..$#{$feature_names};

  # The category attribute is declared right after the features, so its
  # index is the number of declared feature attributes (not the number of
  # distinct names, which would be smaller if a name were repeated).
  my $last_index = scalar @$feature_names;

  # We use the 'sparse' format, see http://www.cs.waikato.ac.nz/~ml/weka/arff.html
  print $fh "\@DATA\n";
  foreach my $doc (@$docs) {
    my ($features, $cat) = @$doc;
    my $f = $features->as_hash;

    # Entries are emitted in ascending attribute-index order.
    my @ordered_keys = (sort {$feature_indices{$a} <=> $feature_indices{$b}}
                        grep {exists $feature_indices{$_}}
                        keys %$f);

    print $fh ("{",
               join(', ', map("$feature_indices{$_} $f->{$_}", @ordered_keys), "$last_index '$cat'"),
               "}\n"
              );
  }

  # Close explicitly: buffered write errors only surface here, and the file
  # is about to be handed to an external process by name.
  close($fh) or die "Couldn't write $filename: $!";

  return $filename;
}
sub save_state {
my ($self, $path) = @_;
{
local $self->{knowledge_set};
$self->SUPER::save_state($path);
}
return unless $self->{model};
my $model_dir = File::Spec->catdir($path, 'models');
mkdir($model_dir, 0777) or die "Couldn't create $model_dir: $!";
view all matches for this distribution
view release on metacpan - search on metacpan
( run in 1.928 second using v1.00-cache-2.02-grep-82fe00e-cpan-72ae3ad1e6da )