AI-Categorizer

 view release on metacpan or  search on metacpan

lib/AI/Categorizer/KnowledgeSet.pm  view on Meta::CPAN

  my $self = shift;
  $self->{verbose} = shift if @_;
  return $self->{verbose};
}

# Replaces each document's feature set with its intersection with the
# feature set held by this KnowledgeSet.
sub trim_doc_features {
  my $self = shift;

  for my $document ($self->documents) {
    # Compute the intersection first, then hand it back to the setter.
    my @kept = $document->features->intersection($self->features);
    $document->features(@kept);
  }
}


# Returns a callback that reports progress while iterating over
# $collection.  The callback takes no arguments and is meant to be
# invoked once per document:
#   - not verbose:                  a no-op
#   - Time::Progress unavailable:   prints a '.' to STDERR per call
#   - otherwise:                    prints a Time::Progress report to
#                                   STDERR on every 25th call
sub prog_bar {
  my $self       = shift;
  my $collection = shift;

  # Quiet mode: callers can still invoke the callback unconditionally.
  if (!$self->verbose) {
    return sub {};
  }

  my $have_time_progress = eval "use Time::Progress; 1";
  if (!$have_time_progress) {
    return sub { print STDERR '.' };
  }

  # Collections that cannot report their size get a maximum of 0.
  my $count = 0;
  $count = $collection->count_documents if $collection->can('count_documents');

  my $i = 0;
  my $meter = 'Time::Progress'->new;
  $meter->attr(max => $count);

  return sub {
    # Only redraw on multiples of 25.
    return if ++$i % 25;
    print STDERR $meter->report("%50b %p ($i/$count)\r", $i);
  };
}

# Shared helper for scan_stats(), load(), read(), etc.  Given a hash
# reference of named arguments, returns the object stored under the
# 'collection' key when that is true; otherwise creates a delayed
# 'collection' object, passing all of the arguments along.
sub _make_collection {
  my $self = shift;
  my $args = shift;

  my $supplied = $args->{collection};
  return $supplied if $supplied;

  return $self->create_delayed_object('collection', %$args);
}

# Scans every document of a collection and gathers corpus statistics.
#
# Named arguments: either collection => $object, or whatever arguments
# _make_collection() needs in order to build one.
#
# Returns a hash reference containing:
#  - document_count, token_count, type_count: corpus-wide totals
#  - category_count: number of distinct category names seen
#  - category_count_with_duplicates: sum of per-document category counts
#  - categories_per_document, tokens_per_document, types_per_document
#  - {type,token,document}s_per_category: average over categories
#  - {type,token,document}_skew_by_category: standard deviation across
#    categories divided by that average
#  - categories: { name => { document_count, token_count, type_count } }
#
# An empty collection yields zero counts and zero per-document averages
# instead of dying with a division by zero.
sub scan_stats {
  my ($self, %args) = @_;
  my $collection = $self->_make_collection(\%args);
  my $pb = $self->prog_bar($collection);

  # Start the counters at zero so that an empty collection produces
  # numeric zeros rather than undefined values.
  my %stats = (
    document_count                 => 0,
    token_count                    => 0,
    type_count                     => 0,
    category_count_with_duplicates => 0,
    categories                     => {},
  );

  while (my $doc = $collection->next) {
    $pb->();
    # "+=" evaluates categories() in scalar context; presumably that
    # yields the number of categories on the document -- TODO confirm.
    $stats{category_count_with_duplicates} += $doc->categories;

    my ($sum, $length) = ($doc->features->sum, $doc->features->length);
    $stats{document_count}++;
    $stats{token_count} += $sum;
    $stats{type_count}  += $length;

    foreach my $cat ($doc->categories) {
      my $per_cat = $stats{categories}{$cat->name} ||= {};
      $per_cat->{document_count}++;
      $per_cat->{token_count} += $sum;
      $per_cat->{type_count}  += $length;
    }
  }
  print "\n" if $self->verbose;

  my @cats = keys %{ $stats{categories} };
  my $num_docs = $stats{document_count};

  $stats{category_count} = @cats;

  # Guard the per-document averages: with no documents there is nothing
  # to divide by.
  $stats{categories_per_document} = $num_docs ? $stats{category_count_with_duplicates} / $num_docs : 0;
  $stats{tokens_per_document}     = $num_docs ? $stats{token_count} / $num_docs : 0;
  $stats{types_per_document}      = $num_docs ? $stats{type_count}  / $num_docs : 0;

  foreach my $thing ('type', 'token', 'document') {
    $stats{"${thing}s_per_category"} = AI::Categorizer::Util::average
      ( map { $stats{categories}{$_}{"${thing}_count"} } @cats );

    next unless @cats;

    # Compute the skew: standard deviation over categories, relative to
    # the mean.
    my $mean = $stats{"${thing}s_per_category"};
    my $ssum = 0;
    foreach my $cat (@cats) {
      $ssum += ($stats{categories}{$cat}{"${thing}_count"} - $mean) ** 2;
    }

    # A zero mean (e.g. every document has an empty feature set) means
    # every count is zero, so report no skew rather than dividing by zero.
    $stats{"${thing}_skew_by_category"} = $mean ? sqrt($ssum/@cats) / $mean : 0;
  }

  return \%stats;
}

# Loads documents from a collection into this KnowledgeSet, using one of
# three strategies chosen from the object's settings:
#   features_kept - read everything, then reduce with select_features()
#   scan_first    - scan_features() first, rewind the collection, then read
#   otherwise     - read the data with no feature reduction
# Named arguments are the same as for _make_collection().
sub load {
  my $self = shift;
  my %args = @_;
  my $c = $self->_make_collection(\%args);

  # Read-then-reduce takes precedence over scan-first.
  if ($self->{features_kept}) {
    $self->read( collection => $c );
    return $self->select_features;
  }

  # Work out the feature set on a first pass, then start over.
  if ($self->{scan_first}) {
    $self->scan_features( collection => $c );
    $c->rewind;
  }

  return $self->read( collection => $c );
}

# Pulls every document out of a collection and adds it to this
# KnowledgeSet via add_document(), ticking the progress callback once per
# document.  Named arguments are the same as for _make_collection().
# When verbose, prints a newline afterwards.
sub read {
  my $self = shift;
  my %args = @_;

  my $source = $self->_make_collection(\%args);
  my $tick   = $self->prog_bar($source);

  while (my $document = $source->next) {
    $tick->();
    $self->add_document($document);
  }
  print "\n" if $self->verbose;
}

lib/AI/Categorizer/KnowledgeSet.pm  view on Meta::CPAN

  my $c = $self->_make_collection(\%args);

  my $pb = $self->prog_bar($c);
  my $ranked_features = $self->{feature_selector}->scan_features( collection => $c, prog_bar => $pb );

  $self->delayed_object_params('document', use_features => $ranked_features);
  $self->delayed_object_params('collection', use_features => $ranked_features);
  return $ranked_features;
}

# Asks the feature selector to choose features for this KnowledgeSet and
# stores the result through the features() accessor.
sub select_features {
  my ($self) = @_;

  my $selected = $self->feature_selector->select_features(knowledge_set => $self);
  return $self->features($selected);
}

# Randomly splits this KnowledgeSet's documents into several new
# KnowledgeSets.
#
# @sizes is a list of fractions of the total document count, one per
# desired partition.  Each fraction except the last determines a group of
# int($num_docs * fraction) randomly chosen documents; the last group
# receives whatever documents remain, so the final fraction's value is
# not actually consulted.
#
# Returns a list of new objects of the same class as $self, one per group,
# each constructed with documents => [...].
sub partition {
  my ($self, @sizes) = @_;
  my $num_docs = my @docs = $self->documents;
  my @groups;

  while (@sizes > 1) {
    my $size = int($num_docs * shift @sizes);
    my @group;

    # Draw exactly $size documents: 1..$size, not 0..$size, which would
    # take one document too many for every group.
    for (1 .. $size) {
      # Fractions summing to more than 1 can exhaust the pool early.
      last unless @docs;
      push @group, splice(@docs, rand(@docs), 1);
    }
    push @groups, \@group;
  }

  # Everything not drawn above forms the final group.
  push @groups, \@docs;

  return map { ref($self)->new( documents => $_ ) } @groups;
}

# Builds a document from named arguments and adds it to this KnowledgeSet.
# The 'categories' argument is a reference to a list of category names;
# each name is resolved via the category class's by_name() before being
# passed to the document constructor.  All other arguments are handed to
# the delayed 'document' object unchanged.
sub make_document {
  my $self = shift;
  my %args = @_;

  my $names = delete $args{categories};
  my @categories = map {
    $self->call_method('category', 'by_name', name => $_)
  } @{$names};

  my $document = $self->create_delayed_object(
    'document',
    %args,
    categories => \@categories,
  );
  return $self->add_document($document);
}

# Registers a document with this KnowledgeSet: each of the document's
# categories is told about the document, then the document and its
# categories are inserted into the 'documents' and 'categories' members.
sub add_document {
  my $self = shift;
  my $doc  = shift;

  for my $category ($doc->categories) {
    $category->add_document($doc);
  }

  $self->{documents}->insert($doc);
  return $self->{categories}->insert($doc->categories);
}

# Writes the current feature set to $file, one "name<TAB>value" line per
# feature, preceded by a "# Total: N" comment line.  Features are written
# in order of decreasing value; features with equal values are written in
# name order so that the output is reproducible.
#
# The feature set is taken from $self->{features} if set, otherwise from
# the 'use_features' parameter of the delayed 'document' object.
#
# Croaks if there are no features to save, or if the file cannot be
# created or completely written.
sub save_features {
  my ($self, $file) = @_;

  my $f = ($self->{features} || { $self->delayed_object_params('document') }->{use_features})
    or croak "No features to save";

  # Three-argument open: the file name is taken literally, so whitespace
  # or characters such as '-' and '&' in $file cannot alter the open mode.
  open my $fh, '>', $file or croak "Can't create $file: $!";
  my $h = $f->as_hash;
  print $fh "# Total: ", $f->length, "\n";

  foreach my $k (sort { $h->{$b} <=> $h->{$a} or $a cmp $b } keys %$h) {
    print $fh "$k\t$h->{$k}\n";
  }

  # Buffered write errors (e.g. a full disk) only surface at close time.
  close $fh or croak "Can't write $file: $!";
}

# Reads a feature list in the format written by save_features() and
# installs it as the 'use_features' parameter of the delayed 'document'
# and 'collection' objects.
#
# Lines starting with '#' are skipped; every other line must look like
# "name<TAB>number".  The number may carry a sign and an exponent, since
# Perl prints small or large values in that form (e.g. 1e-05).
#
# If $n is defined, reading stops once input line number $n has been
# processed.
# NOTE(review): $. counts comment lines too, so a file with a "# Total"
# header yields fewer than $n features -- confirm whether that is intended.
#
# Croaks if the file cannot be opened or a line is malformed.
sub restore_features {
  my ($self, $file, $n) = @_;

  # Three-argument open: the file name is taken literally.
  open my $fh, '<', $file or croak "Can't open $file: $!";

  my %hash;
  # Read into a lexical so the caller's $_ is left untouched.
  while (my $line = <$fh>) {
    next if $line =~ /^#/;
    $line =~ /^(.*)\t([-+]?[\d.]+(?:[eE][-+]?\d+)?)$/
      or croak "Malformed line: $line";
    $hash{$1} = $2;
    last if defined $n and $. >= $n;
  }
  close $fh;

  my $features = $self->create_delayed_object('features', features => \%hash);

  $self->delayed_object_params('document',   use_features => $features);
  $self->delayed_object_params('collection', use_features => $features);
}

1;

__END__

=head1 NAME

AI::Categorizer::KnowledgeSet - Encapsulates a set of documents

=head1 SYNOPSIS

 use AI::Categorizer::KnowledgeSet;
 my $k = AI::Categorizer::KnowledgeSet->new(...parameters...);
 my $nb = AI::Categorizer::Learner::NaiveBayes->new(...parameters...);
 $nb->train(knowledge_set => $k);

=head1 DESCRIPTION

The KnowledgeSet class provides an interface to a set of
documents, a set of categories, and a mapping between the two.  Many
parameters for controlling the processing of documents are managed by
the KnowledgeSet class.

=head1 METHODS

=over 4

=item new()

Creates a new KnowledgeSet and returns it.  Accepts the following
parameters:

=over 4

=item load

lib/AI/Categorizer/KnowledgeSet.pm  view on Meta::CPAN

retrieval".  The three characters indicate the three factors that will
be multiplied for each feature to find the final vector value for that
feature.  The default weighting is C<xxx>.

The first character specifies the "term frequency" component, which
can take the following values:

=over 4

=item b

Binary weighting - 1 for terms present in a document, 0 for terms absent.

=item t

Raw term frequency - equal to the number of times a feature occurs in
the document.

=item x

A synonym for 't'.

=item n

Normalized term frequency - 0.5 + 0.5 * t/max(t).  This is the same as
the 't' specification, but with term frequency normalized to lie
between 0.5 and 1.

=back

The second character specifies the "collection frequency" component, which
can take the following values:

=over 4

=item f

Inverse document frequency - multiply term C<t>'s value by C<log(N/n)>,
where C<N> is the total number of documents in the collection, and
C<n> is the number of documents in which term C<t> is found.

=item p

Probabilistic inverse document frequency - multiply term C<t>'s value
by C<log((N-n)/n)> (same variable meanings as above).

=item x

No change - multiply by 1.

=back


The third character specifies the "normalization" component, which
can take the following values:

=over 4

=item c

Apply cosine normalization - multiply by 1/length(document_vector).

=item x

No change - multiply by 1.

=back

The three components may alternatively be specified by the
C<term_weighting>, C<collection_weighting>, and C<normalize_weighting>
parameters respectively.

=item verbose

If set to a true value, some status/debugging information will be
output on C<STDOUT> (progress indicators are written to C<STDERR>).

=back


=item categories()

In a list context returns a list of all Category objects in this
KnowledgeSet.  In a scalar context returns the number of such objects.

=item documents()

In a list context returns a list of all Document objects in this
KnowledgeSet.  In a scalar context returns the number of such objects.

=item document()

Given a document name, returns the Document object with that name, or
C<undef> if no such Document object exists in this KnowledgeSet.

=item features()

Returns a FeatureSet object which represents the features of all the
documents in this KnowledgeSet.

=item verbose()

Returns the C<verbose> parameter of this KnowledgeSet, or sets it with
an optional argument.

=item scan_stats()

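Scans all the documents of a Collection and returns a hash reference
containing several statistics about the Collection.  The hash has the
keys C<document_count>, C<token_count>, C<type_count>,
C<category_count>, and C<category_count_with_duplicates> (corpus-wide
totals); C<categories_per_document>, C<tokens_per_document>, and
C<types_per_document> (per-document averages); and, for each of
C<type>, C<token>, and C<document>, a C<*s_per_category> average and a
C<*_skew_by_category> value (the standard deviation across categories
divided by that average).  The C<categories> key holds a hash, indexed
by category name, of each category's C<document_count>,
C<token_count>, and C<type_count>.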

=item scan_features()

This method scans through a Collection object and determines the
"best" features (words) to use when loading the documents and training
the Learner.  This process is known as "feature selection", and it's a
very important part of categorization.

The Collection object should be specified as a C<collection> parameter,
or by giving the arguments to pass to the Collection's C<new()> method.

The process of feature selection is governed by the



( run in 0.874 second using v1.01-cache-2.11-cpan-39bf76dae61 )