AI-Categorizer

 view release on metacpan or  search on metacpan

lib/AI/Categorizer/Document.pm  view on Meta::CPAN

    my $h = $self->vectorize(tokens => $t, weight => exists($weights->{$name}) ? $weights->{$name} : 1 );
    $self->{features}->add($h);
  }
}

sub is_in_category {
  return (ref $_[1]
	  ? $_[0]->{categories}->includes( $_[1] )
	  : $_[0]->{categories}->includes_name( $_[1] ));
    
}

sub tokenize {
  my $self = shift;
  my @tokens;
  while ($_[0] =~ /([-\w]+)/g) {
    my $word = lc $1;
    next unless $word =~ /[a-z]/;
    $word =~ s/^[^a-z]+//;  # Trim leading non-alpha characters (helps with ordinals)
    push @tokens, $word;
  }
  return \@tokens;
}

sub stem_words {
  my ($self, $tokens) = @_;
  return unless $self->{stemming};
  return if $self->{stemming} eq 'none';
  die "Unknown stemming option '$self->{stemming}' - options are 'porter' or 'none'"
    unless $self->{stemming} eq 'porter';
  
  eval {require Lingua::Stem; 1}
    or die "Porter stemming requires the Lingua::Stem module, available from CPAN.\n";

  @$tokens = @{ Lingua::Stem::stem(@$tokens) };
}

sub _filter_tokens {
  my ($self, $tokens_in) = @_;

  if ($self->{use_features}) {
    my $f = $self->{use_features}->as_hash;
    return [ grep  exists($f->{$_}), @$tokens_in ];
  } elsif ($self->{stopwords} and keys %{$self->{stopwords}}) {
    my $s = $self->{stopwords};
    return [ grep !exists($s->{$_}), @$tokens_in ];
  }
  return $tokens_in;
}

sub _weigh_tokens {
  my ($self, $tokens, $weight) = @_;

  my %counts;
  if (my $b = 0+$self->{front_bias}) {
    die "'front_bias' value must be between -1 and 1"
      unless -1 < $b and $b < 1;
    
    my $n = @$tokens;
    my $r = ($b-1)**2 / ($b+1);
    my $mult = $weight * log($r)/($r-1);
    
    my $i = 0;
    foreach my $feature (@$tokens) {
      $counts{$feature} += $mult * $r**($i/$n);
      $i++;
    }
    
  } else {
    foreach my $feature (@$tokens) {
      $counts{$feature} += $weight;
    }
  }

  return \%counts;
}

sub vectorize {
  my ($self, %args) = @_;
  if ($self->{stem_stopwords}) {
    my $s = $self->stem_tokens([keys %{$self->{stopwords}}]);
    $self->{stopwords} = { map {+$_, 1} @$s };
    $args{tokens} = $self->_filter_tokens($args{tokens});
  }
  return $self->_weigh_tokens($args{tokens}, $args{weight});
}

sub read {
  my ($class, %args) = @_;
  my $path = delete $args{path} or die "Must specify 'path' argument to read()";
  
  my $self = $class->new(%args);
  
  open my($fh), "< $path" or die "$path: $!";
  $self->parse_handle(handle => $fh);
  close $fh;
  
  $self->finish;
  return $self;
}

sub dump_features {
  my ($self, %args) = @_;
  my $path = $args{path} or die "No 'path' argument given to dump_features()";
  open my($fh), "> $path" or die "Can't create $path: $!";
  my $f = $self->features->as_hash;
  while (my ($k, $v) = each %$f) {
    print $fh "$k\t$v\n";
  }
}

1;

__END__

=head1 NAME

AI::Categorizer::Document - Embodies a document

=head1 SYNOPSIS



( run in 0.604 second using v1.01-cache-2.11-cpan-39bf76dae61 )