AI-Categorizer

 view release on metacpan or  search on metacpan

lib/AI/Categorizer/Document.pm  view on Meta::CPAN

		  default => [],
		  callbacks => { 'all are Category objects' => 
				 sub { ! grep !UNIVERSAL::isa($_, 'AI::Categorizer::Category'), @{$_[0]} },
			       },
		  public => 0,
		 },
   stopwords => {
		 type => ARRAYREF|HASHREF,
		 default => {},
		},
   content   => {
		 type => HASHREF|SCALAR,
		 default => undef,
		},
   parse => {
	     type => SCALAR,
	     optional => 1,
	    },
   parse_handle => {
		    type => HANDLE,
		    optional => 1,
		   },
   features => {
		isa => 'AI::Categorizer::FeatureVector',
		optional => 1,
	       },
   content_weights => {
		       type => HASHREF,
		       default => {},
		      },
   front_bias => {
		  type => SCALAR,
		  default => 0,
		  },
   use_features => {
		    type => HASHREF|UNDEF,
		    default => undef,
		   },
   stemming => {
		type => SCALAR|UNDEF,
		optional => 1,
	       },
   stopword_behavior => {
			 type => SCALAR,
			 default => "stem",
			},
  );

# Register 'features' as a contained object of class
# AI::Categorizer::FeatureVector.  It is flagged as delayed; presumably
# that means the container framework builds it on request rather than
# in the constructor -- confirm against the parent class documentation.
__PACKAGE__->contained_objects(
  features => {
    class   => 'AI::Categorizer::FeatureVector',
    delayed => 1,
  },
);

### Constructors

# Source of default document names.  String auto-increment walks through
# 'a', 'b', ... 'z', 'aa', ... so each document built without an explicit
# name receives a fresh one.
my $NAME = 'a';

# Constructor.
#
# Accepts the named parameters declared for this class.  The default name
# is placed before @_ in the argument list handed to the parent
# constructor, so a caller-supplied 'name' appears later in that list.
#
# The document content may be supplied in one of three ways, checked in
# this order:
#   parse        => a scalar handed to $self->parse(content => ...)
#   parse_handle => a handle handed to $self->parse_handle(handle => ...)
#   content      => a hash reference, or a plain string that is wrapped
#                   as { body => $string }
#
# If $self->{content} is set after that step, finish() is called.
# Returns the new object.
sub new {
  my $pkg = shift;
  my $self = $pkg->SUPER::new(
    name => $NAME++,  # Use a default name
    @_,
  );

  # Get efficient internal data structures.  A direct method call is used
  # instead of the indirect-object form ("new Class(...)"), which the
  # parser has to guess at and which is fragile in a package that defines
  # its own new().
  $self->{categories} = AI::Categorizer::ObjectSet->new( @{$self->{categories}} );

  $self->_fix_stopwords;

  # A few different ways for the caller to initialize the content
  if (exists $self->{parse}) {
    # The parameter is removed from the object as it is passed along.
    $self->parse(content => delete $self->{parse});

  } elsif (exists $self->{parse_handle}) {
    $self->parse_handle(handle => delete $self->{parse_handle});

  } elsif (defined $self->{content}) {
    # Allow a simple string as the content
    $self->{content} = { body => $self->{content} } unless ref $self->{content};
  }

  # Presumably parse()/parse_handle() populate $self->{content}; either
  # way, finishing only happens when there is content to work from.
  $self->finish if $self->{content};
  return $self;
}

# Normalize $self->{stopwords} into a lookup hash and, when configured to
# do so, replace the stopwords with their stemmed forms.
#
# Stemming is applied in place to the hash itself, which is then tagged
# with a '___stemmed' key so the work is not repeated on the same hash.
sub _fix_stopwords {
  my ($self) = @_;

  # A list of stopwords becomes a word => 1 lookup hash.
  if (UNIVERSAL::isa($self->{stopwords}, 'ARRAY')) {
    my %lookup;
    $lookup{$_} = 1 for @{ $self->{stopwords} };
    $self->{stopwords} = \%lookup;
  }

  my $stop = $self->{stopwords};

  # Stemming the stopwords only makes sense when there are some, when the
  # stopword behavior is 'stem', when a stemmer is actually selected, and
  # when this hash has not been processed already.
  return unless keys %$stop;
  return if $self->{stopword_behavior} ne 'stem';
  return unless defined $self->{stemming} and $self->{stemming} ne 'none';
  return if $stop->{___stemmed};

  # Empty the hash in place (other holders of the reference see the
  # change), then refill it from the word list after stem_words has
  # been given that list by reference.
  my @words = keys %$stop;
  %$stop = ();
  $self->stem_words(\@words);
  @{$stop}{@words} = (1) x @words;

  # The marker lives in the stopword structure itself so that other
  # documents sharing it will notice it.
  $stop->{___stemmed} = 1;
}

# Finalize the document: call create_feature_vector (defined elsewhere),
# then drop the raw-input fields from the object.
sub finish {
  my ($self) = @_;

  $self->create_feature_vector;

  # Now we're done with all the content stuff, so remove it from the object.
  my @spent = qw(content content_weights stopwords use_features);
  delete @{$self}{@spent};
}


# Parse a document format - a virtual method.
# This is a forward declaration only; no body is given at this point.
# new() invokes it as $self->parse(content => ...) when a 'parse'
# parameter was supplied, so presumably subclasses are expected to
# provide the implementation.
sub parse;



( run in 2.378 seconds using v1.01-cache-2.11-cpan-437f7b0c052 )