AI-Categorizer

 view release on metacpan or  search on metacpan

lib/AI/Categorizer/Document.pm  view on Meta::CPAN

package AI::Categorizer::Document;

use strict;
use Class::Container;
use base qw(Class::Container);

use Params::Validate qw(:types);
use AI::Categorizer::ObjectSet;
use AI::Categorizer::FeatureVector;

__PACKAGE__->valid_params
  (
   name       => {
		  type => SCALAR, 
		 },
   categories => {
		  type => ARRAYREF,
		  default => [],
		  callbacks => { 'all are Category objects' => 
				 sub { ! grep !UNIVERSAL::isa($_, 'AI::Categorizer::Category'), @{$_[0]} },
			       },
		  public => 0,
		 },
   stopwords => {
		 type => ARRAYREF|HASHREF,
		 default => {},
		},
   content   => {
		 type => HASHREF|SCALAR,
		 default => undef,
		},
   parse => {
	     type => SCALAR,
	     optional => 1,
	    },
   parse_handle => {
		    type => HANDLE,
		    optional => 1,
		   },
   features => {
		isa => 'AI::Categorizer::FeatureVector',
		optional => 1,
	       },
   content_weights => {
		       type => HASHREF,
		       default => {},
		      },
   front_bias => {
		  type => SCALAR,
		  default => 0,
		  },
   use_features => {
		    type => HASHREF|UNDEF,
		    default => undef,
		   },
   stemming => {
		type => SCALAR|UNDEF,
		optional => 1,
	       },
   stopword_behavior => {
			 type => SCALAR,
			 default => "stem",
			},
  );

__PACKAGE__->contained_objects
  (
   features => { delayed => 1,
		 class => 'AI::Categorizer::FeatureVector' },
  );

### Constructors

my $NAME = 'a';

sub new {
  my $pkg = shift;
  my $self = $pkg->SUPER::new(name => $NAME++,  # Use a default name
			      @_);

  # Get efficient internal data structures
  $self->{categories} = new AI::Categorizer::ObjectSet( @{$self->{categories}} );

  $self->_fix_stopwords;
  
  # A few different ways for the caller to initialize the content
  if (exists $self->{parse}) {
    $self->parse(content => delete $self->{parse});
    
  } elsif (exists $self->{parse_handle}) {
    $self->parse_handle(handle => delete $self->{parse_handle});
    
  } elsif (defined $self->{content}) {
    # Allow a simple string as the content
    $self->{content} = { body => $self->{content} } unless ref $self->{content};
  }
  
  $self->finish if $self->{content};
  return $self;
}

sub _fix_stopwords {
  my $self = shift;
  
  # Convert to hash
  $self->{stopwords} = { map {($_ => 1)} @{ $self->{stopwords} } }
    if UNIVERSAL::isa($self->{stopwords}, 'ARRAY');
  
  my $s = $self->{stopwords};

  # May need to perform stemming on the stopwords
  return unless keys %$s; # No point in doing anything if there are no stopwords
  return unless $self->{stopword_behavior} eq 'stem';
  return if !defined($self->{stemming}) or $self->{stemming} eq 'none';
  return if $s->{___stemmed};
  
  my @keys = keys %$s;
  %$s = ();
  $self->stem_words(\@keys);
  $s->{$_} = 1 foreach @keys;
  
  # This flag is attached to the stopword structure itself so that
  # other documents will notice it.
  $s->{___stemmed} = 1;
}

sub finish {
  my $self = shift;
  $self->create_feature_vector;
  
  # Now we're done with all the content stuff
  delete @{$self}{'content', 'content_weights', 'stopwords', 'use_features'};
}


# Parse a document format - a virtual method
sub parse;

sub parse_handle {
  my ($self, %args) = @_;
  my $fh = $args{handle} or die "No 'handle' argument given to parse_handle()";
  return $self->parse( content => join '', <$fh> );
}

### Accessors

sub name { $_[0]->{name} }
sub stopword_behavior { $_[0]->{stopword_behavior} }

sub features {
  my $self = shift;
  if (@_) {
    $self->{features} = shift;
  }
  return $self->{features};
}

sub categories {
  my $c = $_[0]->{categories};
  return wantarray ? $c->members : $c->size;
}


### Workers

sub create_feature_vector {
  my $self = shift;
  my $content = $self->{content};
  my $weights = $self->{content_weights};

  die "'stopword_behavior' must be one of 'stem', 'no_stem', or 'pre_stemmed'"
    unless $self->{stopword_behavior} =~ /^stem|no_stem|pre_stemmed$/;

  $self->{features} = $self->create_delayed_object('features');
  while (my ($name, $data) = each %$content) {
    my $t = $self->tokenize($data);
    $t = $self->_filter_tokens($t) if $self->{stopword_behavior} eq 'no_stem';
    $self->stem_words($t);
    $t = $self->_filter_tokens($t) if $self->{stopword_behavior} =~ /^stem|pre_stemmed$/;
    my $h = $self->vectorize(tokens => $t, weight => exists($weights->{$name}) ? $weights->{$name} : 1 );
    $self->{features}->add($h);
  }
}

sub is_in_category {
  return (ref $_[1]
	  ? $_[0]->{categories}->includes( $_[1] )
	  : $_[0]->{categories}->includes_name( $_[1] ));
    
}

sub tokenize {
  my $self = shift;
  my @tokens;
  while ($_[0] =~ /([-\w]+)/g) {
    my $word = lc $1;
    next unless $word =~ /[a-z]/;
    $word =~ s/^[^a-z]+//;  # Trim leading non-alpha characters (helps with ordinals)
    push @tokens, $word;
  }
  return \@tokens;
}

sub stem_words {
  my ($self, $tokens) = @_;
  return unless $self->{stemming};
  return if $self->{stemming} eq 'none';
  die "Unknown stemming option '$self->{stemming}' - options are 'porter' or 'none'"
    unless $self->{stemming} eq 'porter';
  
  eval {require Lingua::Stem; 1}
    or die "Porter stemming requires the Lingua::Stem module, available from CPAN.\n";

  @$tokens = @{ Lingua::Stem::stem(@$tokens) };
}

sub _filter_tokens {
  my ($self, $tokens_in) = @_;

  if ($self->{use_features}) {
    my $f = $self->{use_features}->as_hash;
    return [ grep  exists($f->{$_}), @$tokens_in ];
  } elsif ($self->{stopwords} and keys %{$self->{stopwords}}) {
    my $s = $self->{stopwords};
    return [ grep !exists($s->{$_}), @$tokens_in ];
  }
  return $tokens_in;
}

sub _weigh_tokens {
  my ($self, $tokens, $weight) = @_;

  my %counts;
  if (my $b = 0+$self->{front_bias}) {
    die "'front_bias' value must be between -1 and 1"
      unless -1 < $b and $b < 1;
    
    my $n = @$tokens;
    my $r = ($b-1)**2 / ($b+1);
    my $mult = $weight * log($r)/($r-1);
    
    my $i = 0;
    foreach my $feature (@$tokens) {
      $counts{$feature} += $mult * $r**($i/$n);
      $i++;
    }
    
  } else {
    foreach my $feature (@$tokens) {
      $counts{$feature} += $weight;
    }
  }

  return \%counts;
}

sub vectorize {
  my ($self, %args) = @_;
  if ($self->{stem_stopwords}) {
    my $s = $self->stem_tokens([keys %{$self->{stopwords}}]);
    $self->{stopwords} = { map {+$_, 1} @$s };
    $args{tokens} = $self->_filter_tokens($args{tokens});
  }
  return $self->_weigh_tokens($args{tokens}, $args{weight});
}

sub read {
  my ($class, %args) = @_;
  my $path = delete $args{path} or die "Must specify 'path' argument to read()";
  
  my $self = $class->new(%args);
  
  open my($fh), "< $path" or die "$path: $!";
  $self->parse_handle(handle => $fh);
  close $fh;
  
  $self->finish;
  return $self;
}

sub dump_features {
  my ($self, %args) = @_;
  my $path = $args{path} or die "No 'path' argument given to dump_features()";
  open my($fh), "> $path" or die "Can't create $path: $!";
  my $f = $self->features->as_hash;
  while (my ($k, $v) = each %$f) {
    print $fh "$k\t$v\n";
  }
}

1;

__END__

=head1 NAME

AI::Categorizer::Document - Embodies a document

=head1 SYNOPSIS

 use AI::Categorizer::Document;
 
 # Simplest way to create a document:
 my $d = new AI::Categorizer::Document(name => $string,
                                       content => $string);
 
 # Other parameters are accepted:
 my $d = new AI::Categorizer::Document(name => $string,
                                       categories => \@category_objects,
                                       content => { subject => $string,
                                                    body => $string2, ... },
                                       content_weights => { subject => 3,
                                                            body => 1, ... },
                                       stopwords => \%skip_these_words,
                                       stemming => $string,
                                       front_bias => $float,
                                       use_features => $feature_vector,
                                      );
 
 # Specify explicit feature vector:
 my $d = new AI::Categorizer::Document(name => $string);
 $d->features( $feature_vector );
 
 # Now pass the document to a categorization algorithm:
 my $learner = AI::Categorizer::Learner::NaiveBayes->restore_state($path);
 my $hypothesis = $learner->categorize($document);

=head1 DESCRIPTION

The Document class embodies the data in a single document, and
contains methods for turning this data into a FeatureVector.  Usually
documents are plain text, but subclasses of the Document class may
handle any kind of data.

=head1 METHODS

=over 4

=item new(%parameters)

Creates a new Document object.  Document objects are used during



( run in 1.227 second using v1.01-cache-2.11-cpan-39bf76dae61 )