Acme-Wabby
view release on metacpan or search on metacpan
$@ = "Error retrieving hash from " . $self->{'conf'}{'hash_file'};
}
};
if ($@) {
return undef;
}
%{$self->{'data'}{'hash'}} = %{$ref};
return 1;
}
# A method for adding a block of text to the current state.
# Arguments: Takes a scalar containing text to be added. Embedded newlines,
# random crap, et al are fine, they'll just be stripped out anyway.
# Returns: undef on failure, true on success. The only failure condition is
# currently if an invalid parameter is passed in.
sub add {
my $self = shift;
die "Invalid object" unless (ref($self) eq __PACKAGE__);
# Make sure we actually got something to add
my $text = shift;
unless ($text) {
return undef;
}
# If we don't care about case, lowercase the whole thing to start with
unless ($self->{'conf'}{'case_sensitive'}) {
$text = lc($text);
}
# Split the text into component phrases, which we define as being delimited
# by the characters below. I left the comma out because it seems to lead
# to slightly more coherent results.
my @phrases = split /[.!?;]/, $text;
foreach my $phrase (@phrases) {
# First, strip out any characters we don't want to deal with. We
# replace them with a space so that things like "the+dog" gets treated
# as "the dog".
$phrase =~ s/[^-a-zA-Z0-9 ']/ /g;
# Trim leading and trailing whitespace, and see if we still have
# anything left.
$phrase =~ s/^\s+//;
$phrase =~ s/\s+$//;
next if $phrase eq "";
my $last_word = 0;
my $idx = 0;
# Split the phrase into component words. We're splitting on simple
# whitespace here.
my @words = split /\s+/, $phrase;
# First we're going to loop through the words and clean them up a bit.
# While we're at it, we're going to find the index of the last real
# word in this phrase.
foreach my $word (@words) {
# Clean up the word a little bit. We allow hyphens and
# apostrophies to occur within words, but not at the beginning
# or ends of words.
$word =~ s/^\s+//;
$word =~ s/\s+$//;
$word =~ s/^-+//g;
$word =~ s/^'+//g;
$word =~ s/-+$//g;
$word =~ s/'+$//g;
# Only allow the single-character words of 'a' and 'I'.
# FIXME - Need to be able to configure this so that persons with
# non-english texts can pick values that make sense.
if (length($word) == 1 && lc($word) ne "i" && lc($word) ne "a") {
$word = "";
$idx++;
next;
}
# If this is a valid word, then mark this as a possible last word.
if ($word ne "") {
$last_word = $idx;
}
$idx++;
}
$idx = 0;
my $new_index = 0;
my $old_index = 0;
# Now we loop through the words, recording the transitions between them.
foreach my $word (@words) {
# Shock shock, we're going to ignore non-existent words.
if ($word eq "") {
$idx++;
next;
}
# If this is a new word that we've never seen before
if (!exists($self->{'data'}{'hash'}{$word})) {
# Add this word to the end of the word list, and to the hash,
# taking care to record its index for the next loop iteration.
$new_index = scalar(@{$self->{'data'}{'list'}});
$self->{'data'}{'hash'}{$word} = $new_index;
push @{$self->{'data'}{'list'}}, {word => $word, num => []};
# Add a transition from the previous word to this word.
push @{${$self->{'data'}{'list'}}[$old_index]{'num'}},
$new_index;
# If this word happens to be the last in the phrase, add a -1
# to its possible transitions so that we have the possibility
# of ending sentences here.
if ($idx == $last_word) {
push @{${$self->{'data'}{'list'}}[$new_index]{'num'}}, -1;
}
}
# If we've seen this word before
else {
( run in 2.640 seconds using v1.01-cache-2.11-cpan-63c85eba8c4 )