AI-MaxEntropy
lib/AI/MaxEntropy.pm
use strict;
use warnings;
package AI::MaxEntropy;
use Algorithm::LBFGS;
use AI::MaxEntropy::Model;
use XSLoader;
our $VERSION = '0.20';
XSLoader::load('AI::MaxEntropy', $VERSION);
sub new {
    my $class = shift;
    my $self = {
        smoother => {},
        algorithm => {},
        @_,
        samples => [],
        x_bucket => {},
        y_bucket => {},
        x_list => [],
        y_list => [],
        x_num => 0,
        y_num => 0,
        f_num => 0,
        af_num => 0,
        f_freq => [],
        f_map => [],
        last_cut => -1,
        _c => {}
    };
    return bless $self, $class;
}
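
# Usage sketch (illustrative, not taken from the module's docs): new() accepts
# optional "smoother" and "algorithm" hashrefs ahead of the internal state set
# up above; the option names come from this file, the values below are only an
# example.
#
#   my $me = AI::MaxEntropy->new(
#       algorithm => { type => 'gis', epsilon => 1e-3 },
#   );
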
sub see {
    my ($self, $x, $y, $w) = @_;
    $w = 1 if not defined($w);
    my ($x1, $y1) = ([], undef);
    # preprocess if $x is hashref
    $x = [
        map {
            my $attr = $_;
            ref($x->{$attr}) eq 'ARRAY' ?
                map { "$attr:$_" } @{$x->{$attr}} : "$_:$x->{$_}"
        } keys %$x
    ] if ref($x) eq 'HASH';
    # update af_num
    $self->{af_num} = scalar(@$x) if $self->{af_num} == 0;
    $self->{af_num} = -1 if $self->{af_num} != scalar(@$x);
    # convert y from string to ID
    my $y_id = $self->{y_bucket}->{$y};
    # new y
    if (!defined($y_id)) {
        # update y_list, y_num, y_bucket, f_freq
        push @{$self->{y_list}}, $y;
        $self->{y_num} = scalar(@{$self->{y_list}});
        $y_id = $self->{y_num} - 1;
        $self->{y_bucket}->{$y} = $y_id;
        push @{$self->{f_freq}}, [map { 0 } (1 .. $self->{x_num})];
        # save ID
        $y1 = $y_id;
    }
    # old y
    else { $y1 = $y_id }
    # convert x from strings to IDs
    for (@$x) {
        my $x_id = $self->{x_bucket}->{$_};
        # new x
        if (!defined($x_id)) {
            # update x_list, x_num, x_bucket, f_freq
            push @{$self->{x_list}}, $_;
            $self->{x_num} = scalar(@{$self->{x_list}});
            $x_id = $self->{x_num} - 1;
            $self->{x_bucket}->{$_} = $x_id;
            push @{$self->{f_freq}->[$_]}, 0 for (0 .. $self->{y_num} - 1);
            # save ID
            push @$x1, $x_id;
        }
        # old x
        else { push @$x1, $x_id }
        # update f_freq
        $self->{f_freq}->[$y_id]->[$x_id] += $w;
    }
    # add the sample
    push @{$self->{samples}}, [$x1, $y1, $w];
    $self->{last_cut} = -1;
}
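
# Usage sketch (illustrative): besides an arrayref of feature strings, see()
# accepts a hashref of attribute-value pairs; array values are expanded into
# "attr:value" features, so the hypothetical call below amounts to seeing
# ['color:red', 'shape:round', 'shape:smooth'] labeled 'apple' with weight 2.
#
#   $me->see({ color => 'red', shape => ['round', 'smooth'] } => 'apple' => 2);
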
sub cut {
    my ($self, $t) = @_;
    $self->{f_num} = 0;
    for my $y (0 .. $self->{y_num} - 1) {
        for my $x (0 .. $self->{x_num} - 1) {
            if ($self->{f_freq}->[$y]->[$x] >= $t) {
                $self->{f_map}->[$y]->[$x] = $self->{f_num};
                $self->{f_num}++;
            }
            else { $self->{f_map}->[$y]->[$x] = -1 }
        }
    }
    $self->{last_cut} = $t;
}
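
# Usage sketch (illustrative): cut($t) keeps only the (x, y) feature pairs
# whose accumulated weight in f_freq reaches $t and maps the rest to -1;
# learn() calls cut(0) automatically when no cut has been done. For example,
# to drop features seen with a total weight below 2:
#
#   $me->cut(2);
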
sub forget_all {
    my $self = shift;
    $self->{samples} = [];
    $self->{x_bucket} = {};
    $self->{y_bucket} = {};
    $self->{x_num} = 0;
    $self->{y_num} = 0;
    $self->{f_num} = 0;
    $self->{x_list} = [];
    $self->{y_list} = [];
    $self->{af_num} = 0;
    $self->{f_freq} = [];
    $self->{f_map} = [];
    $self->{last_cut} = -1;
    $self->{_c} = {};
}
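
# Usage sketch (illustrative): forget_all() discards every sample and all
# derived counts, returning the learner to its freshly constructed state so
# the same object can be trained on a new problem:
#
#   $me->forget_all;
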
sub _cache {
    my $self = shift;
    $self->_cache_samples;
    $self->_cache_f_map;
}

sub _free_cache {
    my $self = shift;
    $self->_free_cache_samples;
    $self->_free_cache_f_map;
}
sub learn {
    my $self = shift;
    # cut 0 for default
    $self->cut(0) if $self->{last_cut} == -1;
    # initialize
    $self->{lambda} = [map { 0 } (1 .. $self->{f_num})];
    $self->_cache;
    # optimize
    my $type = $self->{algorithm}->{type} || 'lbfgs';
    if ($type eq 'lbfgs') {
        my $o = Algorithm::LBFGS->new(%{$self->{algorithm}});
        $o->fmin(\&_neg_log_likelihood, $self->{lambda},
            $self->{algorithm}->{progress_cb}, $self);
    }
    elsif ($type eq 'gis') {
        die 'GIS is not applicable'
            if $self->{af_num} == -1 or $self->{last_cut} != 0;
        my $progress_cb = $self->{algorithm}->{progress_cb};
        $progress_cb = sub {
            print "$_[0]: |lambda| = $_[3], |d_lambda| = $_[4]\n"; 0;
        } if defined($progress_cb) and $progress_cb eq 'verbose';
        my $epsilon = $self->{algorithm}->{epsilon} || 1e-3;
        $self->{lambda} = $self->_apply_gis($progress_cb, $epsilon);
    }
    else { die "$type is not a valid algorithm type" }
    # finish
    $self->_free_cache;
    return $self->_create_model;
}
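
# Usage sketch (illustrative): the optimizer is selected through the
# "algorithm" option passed to new(); 'lbfgs' is the default, while 'gis'
# additionally requires that every sample has the same number of active
# features and that cut(0) was used. The values below only illustrate the
# options read above.
#
#   my $me = AI::MaxEntropy->new(
#       algorithm => {
#           type        => 'gis',
#           epsilon     => 1e-4,
#           progress_cb => 'verbose',
#       },
#   );
#   my $model = $me->learn;
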
sub _create_model {
    my $self = shift;
    my $model = AI::MaxEntropy::Model->new;
    $model->{$_} = ref($self->{$_}) eq 'ARRAY' ? [@{$self->{$_}}] :
                   ref($self->{$_}) eq 'HASH'  ? {%{$self->{$_}}} :
                   $self->{$_}
        for qw/x_list y_list lambda x_num y_num f_num x_bucket y_bucket/;
    $model->{f_map}->[$_] = [@{$self->{f_map}->[$_]}]
        for (0 .. $self->{y_num} - 1);
    return $model;
}
1;
__END__
=head1 NAME

AI::MaxEntropy - Perl extension for learning Maximum Entropy Models

=head1 SYNOPSIS

  use AI::MaxEntropy;

  # create a maximum entropy learner
  my $me = AI::MaxEntropy->new;

  # the learner sees 2 red, round, smooth apples
  $me->see(['round', 'smooth', 'red'] => 'apple' => 2);

  # the learner sees 3 yellow, long, smooth bananas
  $me->see(['long', 'smooth', 'yellow'] => 'banana' => 3);

  # and more
  # samples needn't have the same number of active features
  $me->see(['rough', 'big'] => 'pomelo');

  # the order of active features does not matter, either
  $me->see(['big', 'rough'] => 'pomelo');

  # ...

  # and, let it learn
  my $model = $me->learn;

  # then, we can make predictions on unseen data

  # ask what a red thing is most likely to be
  print $model->predict(['red'])."\n";
  # the answer is apple, because all red things the learner has ever seen
  # are apples

  # ask what a smooth thing is most likely to be
  print $model->predict(['smooth'])."\n";
  # the answer is banana, because the learner has seen more smooth bananas
  # (weighted 3) than smooth apples (weighted 2)

  # ask what a red, long thing is most likely to be
  print $model->predict(['red', 'long'])."\n";
  # the answer is banana, because the learner has seen more long bananas
  # (weighted 3) than red apples (weighted 2)

  # print out the scores of all possible labels for the features round and red
  for ($model->all_labels) {
      my $s = $model->score(['round', 'red'] => $_);
      print "$_: $s\n";
  }