AI-Categorizer

 view release on metacpan or  search on metacpan

lib/AI/Categorizer.pm  view on Meta::CPAN

  my $package = shift;
  my %args = @_;
  my %defaults;
  if (exists $args{data_root}) {
    $defaults{training_set} = File::Spec->catfile($args{data_root}, 'training');
    $defaults{test_set} = File::Spec->catfile($args{data_root}, 'test');
    $defaults{category_file} = File::Spec->catfile($args{data_root}, 'cats.txt');
    delete $args{data_root};
  }

  return $package->SUPER::new(%defaults, %args);
}

#sub dump_parameters {
#  my $p = shift()->SUPER::dump_parameters;
#  delete $p->{stopwords} if $p->{stopword_file};
#  return $p;
#}

sub knowledge_set { shift->{knowledge_set} }
sub learner       { shift->{learner} }

# Combines several methods in one sub
sub run_experiment {
  my $self = shift;

lib/AI/Categorizer/Category.pm  view on Meta::CPAN

  (
   features => {
		class => 'AI::Categorizer::FeatureVector',
		delayed => 1,
	       },
  );

my %REGISTRY = ();

sub new {
  my $self = shift()->SUPER::new(@_);
  $self->{documents} = new AI::Categorizer::ObjectSet( @{$self->{documents}} );
  $REGISTRY{$self->{name}} = $self;
  return $self;
}

sub by_name {
  my ($class, %args) = @_;
  return $REGISTRY{$args{name}} if exists $REGISTRY{$args{name}};
  return $class->new(%args);
}

lib/AI/Categorizer/Collection.pm  view on Meta::CPAN

  );

sub new {
  my ($class, %args) = @_;
  
  # Optimize so every document doesn't have to convert the stopword list to a hash
  if ($args{stopwords} and UNIVERSAL::isa($args{stopwords}, 'ARRAY')) {
    $args{stopwords} = { map {+$_ => 1} @{ $args{stopwords} } };
  }
  
  my $self = $class->SUPER::new(%args);

  if ($self->{category_file}) {
    local *FH;
    open FH, $self->{category_file} or die "Can't open $self->{category_file}: $!";
    while (<FH>) {
      my ($doc, @cats) = split;
      $self->{category_hash}{$doc} = \@cats;
    }
    close FH;
  }

lib/AI/Categorizer/Collection/DBI.pm  view on Meta::CPAN

  );

__PACKAGE__->contained_objects
  (
   document => { class => 'AI::Categorizer::Document',
		 delayed => 1 },
  );

sub new {
  my $class = shift;
  my $self = $class->SUPER::new(@_);
  
  die "Must provide 'dbh' or 'connection_string' arguments"
    unless $self->{dbh} or $self->{connection_string};
  
  unless ($self->{dbh}) {
    $self->{dbh} = DBI->connect($self->{connection_string}, '', '', {RaiseError => 1})
      or die DBI->errstr;
    delete $self->{connection_string};
  }
  

lib/AI/Categorizer/Collection/Files.pm  view on Meta::CPAN

use File::Spec;

__PACKAGE__->valid_params
  (
   path => { type => SCALAR|ARRAYREF },
   recurse => { type => BOOLEAN, default => 0 },
  );

sub new {
  my $class = shift;
  my $self = $class->SUPER::new(@_);
  
  $self->{dir_fh} = do {local *FH; *FH};  # double *FH avoids a warning

  # Documents are contained in a directory, or list of directories
  $self->{path} = [$self->{path}] unless ref $self->{path};
  $self->{used} = [];

  $self->_next_path;
  return $self;
}

lib/AI/Categorizer/Collection/InMemory.pm  view on Meta::CPAN

use base qw(AI::Categorizer::Collection);

use Params::Validate qw(:types);

__PACKAGE__->valid_params
  (
   data => { type => HASHREF },
  );

sub new {
  my $self = shift()->SUPER::new(@_);
  
  while (my ($name, $params) = each %{$self->{data}}) {
    foreach (@{$params->{categories}}) {
      next if ref $_;
      $_ = AI::Categorizer::Category->by_name(name => $_);
    }
  }

  return $self;
}

lib/AI/Categorizer/Collection/SingleFile.pm  view on Meta::CPAN

  );

__PACKAGE__->contained_objects
  (
   document => { class => 'AI::Categorizer::Document::Text',
		 delayed => 1 },
  );

sub new {
  my $class = shift;
  my $self = $class->SUPER::new(@_);
  
  $self->{fh} = do {local *FH; *FH};  # double *FH avoids a warning

  # Documents are contained in a file, or list of files
  $self->{path} = [$self->{path}] unless ref $self->{path};
  $self->{used} = [];

  $self->_next_path;
  return $self;
}

lib/AI/Categorizer/Document.pm  view on Meta::CPAN

   features => { delayed => 1,
		 class => 'AI::Categorizer::FeatureVector' },
  );

### Constructors

my $NAME = 'a';

sub new {
  my $pkg = shift;
  my $self = $pkg->SUPER::new(name => $NAME++,  # Use a default name
			      @_);

  # Get efficient internal data structures
  $self->{categories} = new AI::Categorizer::ObjectSet( @{$self->{categories}} );

  $self->_fix_stopwords;
  
  # A few different ways for the caller to initialize the content
  if (exists $self->{parse}) {
    $self->parse(content => delete $self->{parse});

lib/AI/Categorizer/Document/XML.pm  view on Meta::CPAN

use strict;
use base qw(XML::SAX::Base);

# Input: a hash which is weights of elements
# Output: object of this class
# Description: this is constructor
sub new{
  my ($class, %args) = @_;

  # call super class such as XML::SAX::Base
  my $self = $class->SUPER::new;

  # save weights of elements which is a hash for pairs <elementName, weight>
  # weight is times duplication of corresponding element
  # It is provided by caller(one of parameters) at construction, and
  # we must save it in order to use doing duplication at end_element
  $self->{weightHash} = $args{weights};

  # It is storage to store the data produced by Text, CDataSection and etc.
  $self->{content} = '';

lib/AI/Categorizer/Document/XML.pm  view on Meta::CPAN


  # The level(depth) of the last called element in XML tree
  # Calling of start_element is the preorder of the tree traversal.
  # The level is the level of current visiting element in tree.
  # the first element is 0-level
  $self->{levelPointer} = 0;

  # all data will be saved into here, initially, it is an empty
  $self->{content} = "";

  #$self->SUPER::start_document($doc);
}

# Input: None
# Output: None
# Description:
# 	it is called whenever the parser ends the document
# 	it will be called at once
#	Nothing to do
sub end_document{
  my ($self, $doc)= @_;

  #$self->SUPER::end_document($doc);
}

# Input
#	LocalName: 	$el->{LocalName}
#	NamespaceURI: 	$el->{NamespaceURI}
#	Name		$el->{Name}
#	Prefix		$el->{Prefix}
#	Attributes	$el->{Attributes}
#	for each attribute
#		LocalName: 	$el->{LocalName}

lib/AI/Categorizer/Document/XML.pm  view on Meta::CPAN

  # its meaning is to append the new data at this location
  my $location= length $self->{content};

  # save the last location of the current content
  # so that at end_element the starting location of data of this element can be known
  $self->{locationArray}[$self->{levelPointer}] = $location;

  # for the next element, increase levelPointer
  $self->{levelPointer}++;

  #$self->SUPER::start_document($el);
}

# Input: None
# Output: None
# Description:
# 	it is called whenever the parser ends the element
sub end_element{
  my ($self, $el)= @_;

  $self->{levelPointer}--;

lib/AI/Categorizer/Document/XML.pm  view on Meta::CPAN

  
  # n - duplicate data by n times
  # get new content
  my $newContent= substr($self->{content}, $location);

  # start to copy
  for(my $i=1; $i<$weight;$i++){
    $self->{content} .= $newContent;
  }

  #$self->SUPER::end_document($el);
}

# Input: a hash which consists of pair <Data, Value>
# Output: None
# Description:
# 	it is called whenever the parser meets the text which comes from Text, CDataSection and etc
#	Value must be saved into content buffer.
sub characters{
  my ($self, $args)= @_;

lib/AI/Categorizer/Experiment.pm  view on Meta::CPAN

sub add_hypothesis {
  my ($self, $h, $correct, $name) = @_;
  die "No hypothesis given to add_hypothesis()" unless $h;
  $name = $h->document_name unless defined $name;
  
  $self->add_result([$h->categories], $correct, $name);
}

sub stats_table {
  my $self = shift;
  $self->SUPER::stats_table($self->{sig_figs});
}

1;

__END__

=head1 NAME

AI::Categorizer::Experiment - Coordinate experimental results

lib/AI/Categorizer/KnowledgeSet.pm  view on Meta::CPAN


sub new {
  my ($pkg, %args) = @_;
  
  # Shortcuts
  if ($args{tfidf_weighting}) {
    @args{'term_weighting', 'collection_weighting', 'normalize_weighting'} = split '', $args{tfidf_weighting};
    delete $args{tfidf_weighting};
  }

  my $self = $pkg->SUPER::new(%args);

  # Convert to AI::Categorizer::ObjectSet sets
  $self->{categories} = new AI::Categorizer::ObjectSet( @{$self->{categories}} );
  $self->{documents}  = new AI::Categorizer::ObjectSet( @{$self->{documents}}  );

  if ($self->{load}) {
    my $args = ref($self->{load}) ? $self->{load} : { path => $self->{load} };
    $self->load(%$args);
    delete $self->{load};
  }

lib/AI/Categorizer/Learner/DecisionTree.pm  view on Meta::CPAN

package AI::Categorizer::Learner::DecisionTree;
$VERSION = '0.01';

use strict;
use AI::DecisionTree;
use AI::Categorizer::Learner::Boolean;
use base qw(AI::Categorizer::Learner::Boolean);

sub create_model {
  my $self = shift;
  $self->SUPER::create_model;
  $self->{model}{first_tree}->do_purge;
  delete $self->{model}{first_tree};
}

sub create_boolean_model {
  my ($self, $positives, $negatives, $cat) = @_;
  
  my $t = new AI::DecisionTree(noise_mode => 'pick_best', 
			       verbose => $self->verbose);

lib/AI/Categorizer/Learner/DecisionTree.pm  view on Meta::CPAN

  }

  print STDERR "\nBuilding tree for category '", $cat->name, "'" if $self->verbose;
  $t->train;
  return $t;
}

sub get_scores {
  my ($self, $doc) = @_;
  local $self->{current_doc} = $doc->features->as_boolean_hash;
  return $self->SUPER::get_scores($doc);
}

sub get_boolean_score {
  my ($self, $doc, $t) = @_;
  return $t->get_result( attributes => $self->{current_doc} ) || 0;
}

1;
__END__

lib/AI/Categorizer/Learner/KNN.pm  view on Meta::CPAN

  return $self->{threshold};
}

sub categorize_collection {
  my $self = shift;
  
  my $f_class = $self->knowledge_set->contained_class('features');
  if ($f_class->can('all_features')) {
    $f_class->all_features([$self->knowledge_set->features->names]);
  }
  $self->SUPER::categorize_collection(@_);
}

sub get_scores {
  my ($self, $newdoc) = @_;
  my $currentDocName = $newdoc->name;
  #print "classifying $currentDocName\n";

  my $features = $newdoc->features->intersection($self->knowledge_set->features)->normalize;
  my $q = AI::Categorizer::Learner::KNN::Queue->new(size => $self->{k_value});

lib/AI/Categorizer/Learner/NaiveBayes.pm  view on Meta::CPAN


sub threshold {
  my $self = shift;
  $self->{threshold} = shift if @_;
  return $self->{threshold};
}

sub save_state {
  my $self = shift;
  local $self->{knowledge_set};  # Don't need the knowledge_set to categorize
  $self->SUPER::save_state(@_);
}

sub categories {
  my $self = shift;
  return map AI::Categorizer::Category->by_name( name => $_ ), $self->{model}->labels;
}

1;

__END__

lib/AI/Categorizer/Learner/Rocchio.pm  view on Meta::CPAN

   threshold        => {type => SCALAR, default => 0.1},
  );

sub create_model {
  my $self = shift;
  foreach my $doc ($self->knowledge_set->documents) {
    $doc->features->normalize;
  }
  
  $self->{model}{all_features} = $self->knowledge_set->features(undef);
  $self->SUPER::create_model(@_);
  delete $self->{knowledge_set};
}

sub create_boolean_model {
  my ($self, $positives, $negatives, $cat) = @_;
  my $posdocnum = @$positives;
  my $negdocnum = @$negatives;
  
  my $beta = $self->{positive_setting};
  my $gamma = $self->{negative_setting};

lib/AI/Categorizer/Learner/SVM.pm  view on Meta::CPAN

  (
   svm_kernel => {type => SCALAR, default => 'linear'},
  );

sub create_model {
  my $self = shift;
  my $f = $self->knowledge_set->features->as_hash;
  my $rmap = [ keys %$f ];
  $self->{model}{feature_map} = { map { $rmap->[$_], $_ } 0..$#$rmap };
  $self->{model}{feature_map_reverse} = $rmap;
  $self->SUPER::create_model(@_);
}

sub _doc_2_dataset {
  my ($self, $doc, $label, $fm) = @_;

  my $ds = new Algorithm::SVM::DataSet(Label => $label);
  my $f = $doc->features->as_hash;
  while (my ($k, $v) = each %$f) {
    next unless exists $fm->{$k};
    $ds->attribute( $fm->{$k}, $v );

lib/AI/Categorizer/Learner/SVM.pm  view on Meta::CPAN

    push @neg, $self->_doc_2_dataset($doc, 0, $self->{model}{feature_map});
  }

  $svm->train(@pos, @neg);
  return $svm;
}

sub get_scores {
  my ($self, $doc) = @_;
  local $self->{current_doc} = $self->_doc_2_dataset($doc, -1, $self->{model}{feature_map});
  return $self->SUPER::get_scores($doc);
}

sub get_boolean_score {
  my ($self, $doc, $svm) = @_;
  return $svm->predict($self->{current_doc});
}

sub save_state {
  my ($self, $path) = @_;
  {
    local $self->{model}{learners};
    local $self->{knowledge_set};
    $self->SUPER::save_state($path);
  }
  return unless $self->{model};
  
  my $svm_dir = File::Spec->catdir($path, 'svms');
  mkdir($svm_dir, 0777) or die "Couldn't create $svm_dir: $!";
  while (my ($name, $learner) = each %{$self->{model}{learners}}) {
    my $path = File::Spec->catfile($svm_dir, $name);
    $learner->save($path);
  }
}

sub restore_state {
  my ($self, $path) = @_;
  $self = $self->SUPER::restore_state($path);
  
  my $svm_dir = File::Spec->catdir($path, 'svms');
  return $self unless -e $svm_dir;
  opendir my($dh), $svm_dir or die "Can't open directory $svm_dir: $!";
  while (defined (my $file = readdir $dh)) {
    my $full_file = File::Spec->catfile($svm_dir, $file);
    next if -d $full_file;
    $self->{model}{learners}{$file} = new Algorithm::SVM(Model => $full_file);
  }
  return $self;

lib/AI/Categorizer/Learner/Weka.pm  view on Meta::CPAN

   tmpdir => {type => SCALAR, default => File::Spec->tmpdir},
  );

__PACKAGE__->contained_objects
  (
   features => {class => 'AI::Categorizer::FeatureVector', delayed => 1},
  );

sub new {
  my $class = shift;
  my $self = $class->SUPER::new(@_);

  for ('java_args', 'weka_args') {
    $self->{$_} = [] unless defined $self->{$_};
    $self->{$_} = [$self->{$_}] unless UNIVERSAL::isa($self->{$_}, 'ARRAY');
  }
  
  if (defined $self->{weka_path}) {
    push @{$self->{java_args}}, '-classpath', $self->{weka_path};
    delete $self->{weka_path};
  }

lib/AI/Categorizer/Learner/Weka.pm  view on Meta::CPAN

sub create_model {
  my ($self) = shift;
  my $m = $self->{model} ||= {};
  $m->{all_features} = [ $self->knowledge_set->features->names ];
  $m->{_in_dir} = File::Temp::tempdir( DIR => $self->{tmpdir} );

  # Create a dummy test file $dummy_file in ARFF format (a kludgey WEKA requirement)
  my $dummy_features = $self->create_delayed_object('features');
  $m->{dummy_file} = $self->create_arff_file("dummy", [[$dummy_features, 0]]);

  $self->SUPER::create_model(@_);
}

sub create_boolean_model {
  my ($self, $pos, $neg, $cat) = @_;

  my @docs = (map([$_->features, 1], @$pos),
	      map([$_->features, 0], @$neg));
  my $train_file = $self->create_arff_file($cat->name . '_train', \@docs);

  my %info = (machine_file => $cat->name . '_model');

lib/AI/Categorizer/Learner/Weka.pm  view on Meta::CPAN

  }
  
  return $filename;
}

sub save_state {
  my ($self, $path) = @_;

  {
    local $self->{knowledge_set};
    $self->SUPER::save_state($path);
  }
  return unless $self->{model};

  my $model_dir = File::Spec->catdir($path, 'models');
  mkdir($model_dir, 0777) or die "Couldn't create $model_dir: $!";
  while (my ($name, $learner) = each %{$self->{model}{learners}}) {
    my $oldpath = File::Spec->catdir($self->{model}{_in_dir}, $learner->{machine_file});
    my $newpath = File::Spec->catfile($model_dir, "${name}_model");
    File::Copy::copy($oldpath, $newpath);
  }
  $self->{model}{_in_dir} = $model_dir;
}

sub restore_state {
  my ($pkg, $path) = @_;
  
  my $self = $pkg->SUPER::restore_state($path);

  my $model_dir = File::Spec->catdir($path, 'models');
  return $self unless -e $model_dir;
  $self->{model}{_in_dir} = $model_dir;
  
  return $self;
}

1;



( run in 1.138 second using v1.01-cache-2.11-cpan-49f99fa48dc )