Acme-Wabby

 view release on metacpan or  search on metacpan

Wabby.pm  view on Meta::CPAN

            $@ = "Error retrieving hash from " . $self->{'conf'}{'hash_file'};
        }
    };
    if ($@) {
        return undef;
    }
    %{$self->{'data'}{'hash'}} = %{$ref};

    return 1;
}

# A method for adding a block of text to the current state.
# Arguments: Takes a scalar containing text to be added.  Embedded newlines,
#            random crap, et al are fine, they'll just be stripped out anyway.
# Returns:   undef on failure, true on success.  The only failure condition is
#            currently if an invalid parameter is passed in.
sub add {
    my $self = shift;
    die "Invalid object" unless (ref($self) eq __PACKAGE__);

    # Make sure we actually got something to add
    my $text = shift;
    unless ($text) {
        return undef;
    }

    # If we don't care about case, lowercase the whole thing to start with
    unless ($self->{'conf'}{'case_sensitive'}) {
        $text = lc($text);
    }

    # Split the text into component phrases, which we define as being delimited
    # by the characters below.  I left the comma out because it seems to lead
    # to slightly more coherent results.
    my @phrases = split /[.!?;]/, $text;
    foreach my $phrase (@phrases) {

        # First, strip out any characters we don't want to deal with.  We
        # replace them with a space so that things like "the+dog" gets treated
        # as "the dog".
        $phrase =~ s/[^-a-zA-Z0-9 ']/ /g;

        # Trim leading and trailing whitespace, and see if we still have
        # anything left.
        $phrase =~ s/^\s+//;
        $phrase =~ s/\s+$//;
        next if $phrase eq "";

        my $last_word = 0;
        my $idx = 0;
        # Split the phrase into component words.  We're splitting on simple
        # whitespace here.
        my @words = split /\s+/, $phrase;

        # First we're going to loop through the words and clean them up a bit.
        # While we're at it, we're going to find the index of the last real
        # word in this phrase.
        foreach my $word (@words) {

            # Clean up the word a little bit.  We allow hyphens and
            # apostrophies to occur within words, but not at the beginning
            # or ends of words.
            $word =~ s/^\s+//;
            $word =~ s/\s+$//;
            $word =~ s/^-+//g;
            $word =~ s/^'+//g;
            $word =~ s/-+$//g;
            $word =~ s/'+$//g;

            # Only allow the single-character words of 'a' and 'I'.
            # FIXME - Need to be able to configure this so that persons with
            # non-english texts can pick values that make sense.
            if (length($word) == 1 && lc($word) ne "i" && lc($word) ne "a") {
                $word = "";
                $idx++;
                next;
            }

            # If this is a valid word, then mark this as a possible last word.
            if ($word ne "") {
                $last_word = $idx;
            }
            $idx++;
        }

        $idx = 0;
        my $new_index = 0;
        my $old_index = 0;

        # Now we loop through the words, recording the transitions between them.
        foreach my $word (@words) {

            # Shock shock, we're going to ignore non-existent words.
            if ($word eq "") {
                $idx++;
                next;
            }

            # If this is a new word that we've never seen before
            if (!exists($self->{'data'}{'hash'}{$word})) {

                # Add this word to the end of the word list, and to the hash,
                # taking care to record its index for the next loop iteration.
                $new_index = scalar(@{$self->{'data'}{'list'}});
                $self->{'data'}{'hash'}{$word} = $new_index;
                push @{$self->{'data'}{'list'}}, {word => $word, num => []};

                # Add a transition from the previous word to this word.
                push @{${$self->{'data'}{'list'}}[$old_index]{'num'}},
                    $new_index;

                # If this word happens to be the last in the phrase, add a -1
                # to its possible transitions so that we have the possibility
                # of ending sentences here.
                if ($idx == $last_word) {
                    push @{${$self->{'data'}{'list'}}[$new_index]{'num'}}, -1;
                }
            }

            # If we've seen this word before
            else {



( run in 2.640 seconds using v1.01-cache-2.11-cpan-63c85eba8c4 )