AI-Categorizer

 view release on metacpan or  search on metacpan

lib/AI/Categorizer/Learner/Weka.pm  view on Meta::CPAN

  # Create document file
  my $doc_file = $self->create_arff_file('doc', [[$doc->features, 0]], $self->{tmpdir});
  my $machine_file = File::Spec->catfile($self->{model}{_in_dir}, $info->{machine_file});

  my @args = ($self->{java_path},
	      @{$self->{java_args}},
	      $self->{weka_classifier},
	      '-l', $machine_file,
	      '-T', $doc_file,
	      '-p', 0,
	     );

  my @output = $self->do_cmd(@args);

  my %scores;
  foreach (@output) {
    # <doc> <category> <score> <real_category>
    # 0 large.elem 0.4515551620220952 numberth.high
    next unless my ($index, $predicted, $score) = /^([\d.]+)\s+(\S+)\s+([\d.]+)/;
    $scores{$predicted} = $score;
  }

  return $scores{1} || 0;  # Not sure what weka's scores represent...
}

sub categorize_collection {
  my ($self, %args) = @_;
  my $c = $args{collection} or die "No collection provided";
  
  my @alldocs;
  while (my $d = $c->next) {
    push @alldocs, $d;
  }
  my $doc_file = $self->create_arff_file("docs", [map [$_->features, 0], @alldocs]);

  my @assigned;
  
  my $l = $self->{model}{learners};
  foreach my $cat (keys %$l) {
    my $machine_file = File::Spec->catfile($self->{model}{_in_dir}, "${cat}_model");
    my @args = ($self->{java_path},
		@{$self->{java_args}},
		$self->{weka_classifier},
		'-l', $machine_file,
		'-T', $doc_file,
		'-p', 0,
               );

    my @output = $self->do_cmd(@args);

    foreach my $line (@output) {
      next unless $line =~ /\S/;
      
      # 0 large.elem 0.4515551620220952 numberth.high
      unless ( $line =~ /^([\d.]+)\s+(\S+)\s+([\d.]+)\s+(\S+)/ ) {
	warn "Can't parse line $line";
	next;
      }
      my ($index, $predicted, $score) = ($1, $2, $3);
      $assigned[$index]{$cat} = $score if $predicted;  # Not sure what weka's scores represent
      print STDERR "$index: assigned=($predicted) correct=(", $alldocs[$index]->is_in_category($cat) ? 1 : 0, ")\n"
	if $self->verbose;
    }
  }

  my $experiment = $self->create_delayed_object('experiment', categories => [map $_->name, $self->categories]);
  foreach my $i (0..$#alldocs) {
    $experiment->add_result([keys %{$assigned[$i]}], [map $_->name, $alldocs[$i]->categories], $alldocs[$i]->name);
  }

  return $experiment;
}


sub do_cmd {
  my ($self, @cmd) = @_;
  print STDERR " % @cmd\n" if $self->verbose;
  
  my @output;
  local *KID_TO_READ;
  my $pid = open(KID_TO_READ, "-|");
  
  if ($pid) {   # parent
    @output = <KID_TO_READ>;
    close(KID_TO_READ) or warn "@cmd exited $?";
    
  } else {      # child
    exec(@cmd) or die "Can't exec @cmd: $!";
  }
  
  return @output;
}

sub create_arff_file {
  my ($self, $name, $docs, $dir) = @_;
  $dir = $self->{model}{_in_dir} unless defined $dir;

  my ($fh, $filename) = File::Temp::tempfile(
					     $name . "_XXXX",  # Template
					     DIR    => $dir,
					     SUFFIX => '.arff',
					    );
  print $fh "\@RELATION foo\n\n";
  
  my $feature_names = $self->{model}{all_features};
  foreach my $name (@$feature_names) {
    print $fh "\@ATTRIBUTE feature-$name REAL\n";
  }
  print $fh "\@ATTRIBUTE category {1, 0}\n\n";
  
  my %feature_indices = map {$feature_names->[$_], $_} 0..$#{$feature_names};
  my $last_index = keys %feature_indices;
  
  # We use the 'sparse' format, see http://www.cs.waikato.ac.nz/~ml/weka/arff.html
  
  print $fh "\@DATA\n";
  foreach my $doc (@$docs) {
    my ($features, $cat) = @$doc;
    my $f = $features->as_hash;
    my @ordered_keys = (sort {$feature_indices{$a} <=> $feature_indices{$b}} 
			grep {exists $feature_indices{$_}}
			keys %$f);

    print $fh ("{",
	       join(', ', map("$feature_indices{$_} $f->{$_}", @ordered_keys), "$last_index '$cat'"),
	       "}\n"
	      );
  }
  
  return $filename;
}

sub save_state {
  my ($self, $path) = @_;

  {
    local $self->{knowledge_set};
    $self->SUPER::save_state($path);
  }
  return unless $self->{model};

  my $model_dir = File::Spec->catdir($path, 'models');
  mkdir($model_dir, 0777) or die "Couldn't create $model_dir: $!";
  while (my ($name, $learner) = each %{$self->{model}{learners}}) {
    my $oldpath = File::Spec->catdir($self->{model}{_in_dir}, $learner->{machine_file});
    my $newpath = File::Spec->catfile($model_dir, "${name}_model");
    File::Copy::copy($oldpath, $newpath);
  }
  $self->{model}{_in_dir} = $model_dir;
}

sub restore_state {
  my ($pkg, $path) = @_;
  
  my $self = $pkg->SUPER::restore_state($path);

  my $model_dir = File::Spec->catdir($path, 'models');
  return $self unless -e $model_dir;
  $self->{model}{_in_dir} = $model_dir;
  
  return $self;
}

1;

__END__

=head1 NAME

AI::Categorizer::Learner::Weka - Pass-through wrapper to Weka system

=head1 SYNOPSIS

  use AI::Categorizer::Learner::Weka;
  
  # Here $k is an AI::Categorizer::KnowledgeSet object
  
  my $nb = new AI::Categorizer::Learner::Weka(...parameters...);
  $nb->train(knowledge_set => $k);
  $nb->save_state('filename');
  
  ... time passes ...
  
  $nb = AI::Categorizer::Learner->restore_state('filename');
  my $c = new AI::Categorizer::Collection::Files( path => ... );
  while (my $document = $c->next) {
    my $hypothesis = $nb->categorize($document);
    print "Best assigned category: ", $hypothesis->best_category, "\n";
  }

=head1 DESCRIPTION

This class doesn't implement any machine learners of its own, it
merely passes the data through to the Weka machine learning system
(http://www.cs.waikato.ac.nz/~ml/weka/).  This can give you access to
a collection of machine learning algorithms not otherwise implemented
in C<AI::Categorizer>.

Currently this is a simple command-line wrapper that calls C<java>
subprocesses.  In the future this may be converted to an
C<Inline::Java> wrapper for better performance (faster running
times).  However, if you're looking for really great performance,
you're probably looking in the wrong place - this Weka wrapper is
intended more as a way to try lots of different machine learning
methods.

=head1 METHODS

This class inherits from the C<AI::Categorizer::Learner> class, so all
of its methods are available unless explicitly mentioned here.

=head2 new()

Creates a new Weka Learner and returns it.  In addition to the
parameters accepted by the C<AI::Categorizer::Learner> class, the
Weka subclass accepts the following parameters:

=over 4

=item java_path

Specifies where the C<java> executable can be found on this system.
The default is simply C<java>, meaning that it will search your
C<PATH> to find java.

=item java_args

Specifies a list of any additional arguments to give to the java
process.  Commonly it's necessary to allocate more memory than the
default, using an argument like C<-Xmx130MB>.

=item weka_path

Specifies the path to the C<weka.jar> file containing the Weka
bytecode.  If Weka has been installed somewhere in your java
C<CLASSPATH>, you needn't specify a C<weka_path>.

=item weka_classifier

Specifies the Weka class to use for a categorizer.  The default is
C<weka.classifiers.NaiveBayes>.  Consult your Weka documentation for a
list of other classifiers available.

=item weka_args

Specifies a list of any additional arguments to pass to the Weka
classifier class when building the categorizer.



( run in 1.081 second using v1.01-cache-2.11-cpan-39bf76dae61 )