view release on metacpan or search on metacpan
examples/ticket_preprocessor_and_doc_modeler.pl view on Meta::CPAN
tickets_vocab_db => $tickets_vocab_db,
idf_db => $idf_db,
tkt_doc_vecs_db => $tkt_doc_vecs_db,
tkt_doc_vecs_normed_db => $tkt_doc_vecs_normed_db,
synset_cache_db => $synset_cache_db,
stop_words_file => $stop_words_file,
misspelled_words_file => $misspelled_words_file,
add_synsets_to_tickets => 1,
want_synset_caching => 1,
max_num_syn_words => 3,
min_word_length => 4,
want_stemming => 1,
);
## Extract information from Excel spreadsheets:
$clusterer->get_tickets_from_excel();
## Apply cleanup filters and add synonyms:
$clusterer->delete_markup_from_all_tickets();
$clusterer->apply_filter_to_all_tickets();
$clusterer->expand_all_tickets_with_synonyms();
lib/Algorithm/TicketClusterer.pm view on Meta::CPAN
_stemmed_tickets_db => $args{stemmed_tickets_db},
_inverted_index_db => $args{inverted_index_db},
_tickets_vocab_db => $args{tickets_vocab_db},
_idf_db => $args{idf_db},
_tkt_doc_vecs_db => $args{tkt_doc_vecs_db},
_tkt_doc_vecs_normed_db => $args{tkt_doc_vecs_normed_db},
_clustering_fieldname => $args{clustering_fieldname},
_unique_id_fieldname => $args{unique_id_fieldname},
_stop_words_file => $args{stop_words_file},
_misspelled_words_file => $args{misspelled_words_file},
_min_word_length => $args{min_word_length} || 4,
_add_synsets_to_tickets => $args{add_synsets_to_tickets} || 0,
_want_stemming => $args{want_stemming} || 0,
_how_many_retrievals => $args{how_many_retrievals} || 5,
_min_idf_threshold => $args{min_idf_threshold},
_max_num_syn_words => $args{max_num_syn_words} || 3,
_want_synset_caching => $args{want_synset_caching} || 0,
_stop_words => {},
_all_tickets => [],
_column_headers => [],
_good_columns => [],
lib/Algorithm/TicketClusterer.pm view on Meta::CPAN
if ($self->{_misspelled_words_file}) {
my @misspelled_word_pairs =
@{_fetch_word_pairs_from_file($self->{_misspelled_words_file})};
foreach my $wordpair (@misspelled_word_pairs) {
my ($wrong_word, $good_word) = grep $_, split /\s+/, $wordpair;
$self->{_misspelled_words}->{$wrong_word} = $good_word;
}
}
}
my $record = $self->{_tickets_by_ids}->{$ticket_id};
my $min = $self->{_min_word_length};
my @words = split /\n|\r|\"|\'|\.|\,|\;|\?|\(|\)|\[|\]|\\|\/|\s+|\&/, $record;
my @clean_words = grep $_, map { /([a-z0-9_]{$min,})/i;$1 } @words;
return unless @clean_words;
my @new_words;
foreach my $word (@words) {
$word =~ s/(.+)[.,:!-]$/$1/;
unless (($word eq 'no') or ($word eq 'not')) {
next if length($word) < $self->{_min_word_length};
}
if (exists $self->{_misspelled_words}->{lc($word)}) {
push @new_words, $self->{_misspelled_words}->{$word};
next;
}
push @new_words, $word unless exists $self->{_stop_words}->{lc($word)};
}
my $new_record = join ' ', @new_words;
$self->{_processed_tkts_by_ids}->{$ticket_id} = $new_record;
}
lib/Algorithm/TicketClusterer.pm view on Meta::CPAN
$self->_add_to_words_their_synonyms_one_ticket( $ticket_id );
}
sub _replace_negated_words_with_antonyms_one_ticket {
my $self = shift;
my $ticket_id = shift;
my $record = $self->{_processed_tkts_by_ids}->{$ticket_id};
my @words_negated_with_not = $record =~ /\bnot\s+(\w+)/ig;
foreach my $word (@words_negated_with_not) {
next unless (($word =~ /^\w+$/) &&
(length($word) > $self->{_min_word_length}));
my @antonym_words = @{$self->_get_antonyms_for_word( $word )};
next unless @antonym_words > 0;
$#antonym_words = $self->{_max_num_syn_words} - 1
if @antonym_words > $self->{_max_num_syn_words};
my $antonym_replacement_string = join ' ', @antonym_words;
print "Antonym for $word is $antonym_replacement_string\n"
if $self->{_debug2};
$record =~ s/not\s+$word/$antonym_replacement_string/g;
}
my @words_negated_with_no = $record =~ /\bno\s+(\w+)/ig;
foreach my $word (@words_negated_with_no) {
next unless (($word =~ /^\w+$/) &&
(length($word) > $self->{_min_word_length}));
my @antonym_words = @{$self->_get_antonyms_for_word( $word )};
next unless @antonym_words > 0;
$#antonym_words = $self->{_max_num_syn_words} - 1
if @antonym_words > $self->{_max_num_syn_words};
my $antonym_replacement_string = join ' ', @antonym_words;
print "Antonym for $word is $antonym_replacement_string\n"
if $self->{_debug2};
$record =~ s/no\s+$word/$antonym_replacement_string/g;
}
$self->{_processed_tkts_by_ids}->{$ticket_id} = $record;
lib/Algorithm/TicketClusterer.pm view on Meta::CPAN
sub _add_to_words_their_synonyms_one_ticket {
    my $self = shift;
    my $ticket_id = shift;
    # Expand one processed ticket by appending (lower-cased) WordNet
    # synonyms for each qualifying word, then store the re-sorted record
    # back into _processed_tkts_by_ids.  Synonym lookups are memoized in
    # _synset_cache keyed by the original word.
    my $record = $self->{_processed_tkts_by_ids}->{$ticket_id};
    my @words = split /\s+/, $record;
    my @synonym_bag;
    foreach my $word (@words) {
        # Negation markers are consumed by the antonym-replacement pass,
        # not expanded with synonyms.
        next if $word eq 'no';
        next if $word eq 'not';
        next unless $word =~ /^\w+$/ &&
                    length($word) > $self->{_min_word_length};
        my @synonym_words;
        if (exists $self->{_synset_cache}->{$word}) {
            @synonym_words = @{$self->{_synset_cache}->{$word}};
        } else {
            @synonym_words = @{$self->_get_synonyms_for_word( $word )};
            print "syn-set for $word => @synonym_words\n\n"
                if $self->{_debug2};
            my $word_root;
            if (@synonym_words == 0) {
                # No synonyms for the word as-is: retry with a crude
                # stemmed root (strip trailing "s" or "ing").
                if ((length($word) > 4) && ($word =~ /(.+)s$/)) {
                    $word_root = $1;
                    @synonym_words = @{$self->_get_synonyms_for_word( $word_root )}
                        if length($word_root) >= $self->{_min_word_length};
                } elsif ((length($word) > 6) && ($word =~ /(.+)ing$/)) {
                    $word_root = $1;
                    @synonym_words = @{$self->_get_synonyms_for_word( $word_root )}
                        if length($word_root) >= $self->{_min_word_length};
                }
            }
            print "syn-set for word root $word_root => @synonym_words\n\n"
                if ( $self->{_debug2} && defined $word_root );
            # Shuffle before truncation so the retained subset is not
            # biased toward WordNet's ordering.
            _fisher_yates_shuffle( \@synonym_words ) if @synonym_words > 0;
            $#synonym_words = $self->{_max_num_syn_words} - 1
                if @synonym_words > $self->{_max_num_syn_words};
            print "Retained syn-set for $word => @synonym_words\n\n"
                if $self->{_debug2};
            $self->{_synset_cache}->{$word} = \@synonym_words;
        }
        # BUG FIX: this push used to live inside the cache-miss branch, so
        # a cache hit fetched the synonyms but never added them to the bag
        # -- i.e. caching silently disabled expansion for repeated words.
        push @synonym_bag, @synonym_words;
    }
    foreach my $syn_word (@synonym_bag) {
        # BUG FIX: lower-case the stop-word lookup for consistency with
        # the filtering pass, which also checks lc($word).
        push @words, lc($syn_word)
            unless ((exists $self->{_stop_words}->{lc($syn_word)}) ||
                    (length($syn_word) <= $self->{_min_word_length}));
    }
    my @sorted_words = sort @words;
    my $new_record = join ' ', @sorted_words;
    $self->{_processed_tkts_by_ids}->{$ticket_id} = $new_record;
}
sub store_processed_tickets_on_disk {
my $self = shift;
$self->{_processed_tickets_db} = "processed_tickets.db" unless $self->{_processed_tickets_db};
unlink $self->{_processed_tickets_db};
lib/Algorithm/TicketClusterer.pm view on Meta::CPAN
sub get_ticket_vocabulary_and_construct_inverted_index {
my $self = shift;
my $total_num_of_tickets = keys %{$self->{_processed_tkts_by_ids}};
$self->{_tickets_vocab_db} = "tickets_vocab.db" unless $self->{_tickets_vocab_db};
unlink glob "$self->{_tickets_vocab_db}.*";
my %vocab_hist_on_disk;
tie %vocab_hist_on_disk, 'SDBM_File',
$self->{_tickets_vocab_db}, O_RDWR|O_CREAT, 0640
or die "Can't create DBM files: $!";
my %inverted_index;
my $min = $self->{_min_word_length};
foreach my $ticket_id (sort {$a <=> $b} keys %{$self->{_processed_tkts_by_ids}}) {
my %uniques = ();
my $record = $self->{_processed_tkts_by_ids}->{$ticket_id};
my @brokenup = split /\n|\r|\"|\'|\.|\(|\)|\[|\]|\\|\/|\s+/, $record;
my @clean_words = grep $_, map { /([a-z0-9_]{$min,})/i;$1 } @brokenup;
next unless @clean_words;
@clean_words = grep $_, map &_simple_stemmer($_, $self->{_debug2}),
@clean_words;
map { $vocab_hist_on_disk{"\L$_"}++ } grep $_, @clean_words;
for (@clean_words) { $uniques{"\L$_"}++ };
lib/Algorithm/TicketClusterer.pm view on Meta::CPAN
my $ticket_id = shift;
unless (keys %{$self->{_doc_vector_template}}) {
foreach ( sort keys %{$self->{_vocab_hist}} ) {
$self->{_doc_vector_template}->{$_} = 0;
}
}
my %doc_vector = %{_deep_copy_hash($self->{_doc_vector_template})};
foreach ( sort keys %{$self->{_doc_vector_template}} ) {
$doc_vector{$_} = 0;
}
my $min = $self->{_min_word_length};
my $total_words_in_ticket = 0;
my $record = $self->{_stemmed_tkts_by_ids}->{$ticket_id};
my @clean_words = split /\s+/, $record;
map { $doc_vector{"\L$_"}++ }
grep {exists $self->{_vocab_hist}->{"\L$_"}} @clean_words;
die "Something went wrong. Doc vector size unequal to vocab size"
unless $self->{_vocab_size} == scalar(keys %doc_vector);
foreach (keys %doc_vector) {
$total_words_in_ticket += $doc_vector{$_};
}
lib/Algorithm/TicketClusterer.pm view on Meta::CPAN
stemmed_tickets_db
inverted_index_db
tickets_vocab_db
idf_db
tkt_doc_vecs_db
tkt_doc_vecs_normed_db
synset_cache_db
want_synset_caching
add_synsets_to_tickets
clustering_fieldname
min_word_length
min_idf_threshold
max_num_syn_words
stop_words_file
misspelled_words_file
unique_id_fieldname
want_stemming
how_many_retrievals
debug1
debug2
debug3
lib/Algorithm/TicketClusterer.pm view on Meta::CPAN
my $i = @$arr;
while (--$i) {
my $j = int rand( $i + 1 );
@$arr[$i, $j] = @$arr[$j, $i];
}
}
sub _vec_scalar_product {
    # Compute the dot product of two equal-length numeric vectors passed
    # as array references.  Dies if the lengths differ.
    my ($vec1, $vec2) = @_;
    die "Something is wrong --- the two vectors are of unequal length"
        unless @$vec1 == @$vec2;
    # BUG FIX: initialize to 0 so two empty vectors yield 0 rather than
    # undef (which also triggered "uninitialized value" warnings).
    my $product = 0;
    for my $i (0 .. $#$vec1) {
        $product += $vec1->[$i] * $vec2->[$i];
    }
    return $product;
}
sub _vec_magnitude {
my $vec = shift;
lib/Algorithm/TicketClusterer.pm view on Meta::CPAN
tickets_vocab_db => $tickets_vocab_db,
idf_db => $idf_db,
tkt_doc_vecs_db => $tkt_doc_vecs_db,
tkt_doc_vecs_normed_db => $tkt_doc_vecs_normed_db,
synset_cache_db => $synset_cache_db,
stop_words_file => $stop_words_file,
misspelled_words_file => $misspelled_words_file,
add_synsets_to_tickets => 1,
want_synset_caching => 1,
max_num_syn_words => 3,
min_word_length => 4,
want_stemming => 1,
);
## Extract information from Excel spreadsheets:
$clusterer->get_tickets_from_excel();
## Apply cleanup filters and add synonyms:
$clusterer->delete_markup_from_all_tickets();
$clusterer->apply_filter_to_all_tickets();
$clusterer->expand_all_tickets_with_synonyms();
lib/Algorithm/TicketClusterer.pm view on Meta::CPAN
idf_db => $idf_db,
tkt_doc_vecs_db => $tkt_doc_vecs_db,
tkt_doc_vecs_normed_db => $tkt_doc_vecs_normed_db,
synset_cache_db => $synset_cache_db,
stop_words_file => $stop_words_file,
misspelled_words_file => $misspelled_words_file,
add_synsets_to_tickets => 1,
want_synset_caching => 1,
min_idf_threshold => 2.0,
max_num_syn_words => 3,
min_word_length => 4,
want_stemming => 1,
how_many_retrievals => 5,
debug1 => 1, # for processing, filtering Excel
debug2 => 1, # for doc modeling
debug3 => 1, # for retrieving similar tickets
);
Obviously, before you can invoke the constructor, you must provide values for the
variables shown to the right of the big arrows. As to what these values should be is
lib/Algorithm/TicketClusterer.pm view on Meta::CPAN
measure of the discriminatory power of the word. Let's say you have a word that
occurs in only one out of 1000 tickets. Such a word is obviously highly
discriminatory and its IDF would be the logarithm (to base 10) of the ratio of 1000
to 1, which is 3. On the other hand, for a word that occurs in every one of 1000
tickets, its IDF value would be the logarithm of the ratio of 1000 to 1000, which is
0. So, for the case when you have 1000 tickets, the upper bound on IDF is 3 and the
lower bound 0. This constructor parameter controls which of the query words you will
use for constructing the initial pool of tickets that will be used for matching. The
larger the value of this threshold, the smaller the pool obviously.
=item I<min_word_length:>
This parameter sets the minimum number of characters in a word in order for it to be
included for ticket processing.
=item I<misspelled_words_file:>
The extent to which you can improve ticket retrieval precision with the addition of
synonyms depends on the degree to which you can make corrections on the fly for the
spelling errors that occur frequently in tickets. That fact makes the file you
supply through this constructor parameter very important. For the current version of
lib/Algorithm/TicketClusterer.pm view on Meta::CPAN
By a production-quality tool, I mean a software package that you can I<actually> use
in a production environment for automated or semi-automated ticket routing in your
organization. I am assuming you already have the tools in place that insert in
real-time the new tickets in an Excel spreadsheet.
Turning this module into a production tool will require that you find the best values
to use for the following three parameters that are needed by the constructor: (1)
C<min_idf_threshold> for the minimum C<idf> value for the words in a query ticket in
order for them to be considered for matching with the other tickets; (2)
C<min_word_length> for discarding words that are too short; and (3)
C<max_num_syn_words> for how many synonyms to retain for a word if the number of
synonyms returned by WordNet is too large. In addition, you must also come up with a
misspelled-words file that is appropriate to your application domain and a stop-words
file.
In order to find the best values to use for the parameters that are mentioned above,
I suggest creating a graphical front-end for this module that would allow for
altering the values of the three parameters listed above in response to the
prevailing mis-routing rates for the tickets. The front-end will display to an
operator the latest ticket that needs to be routed and a small set of the
stemmed_tickets_db => "t/__test_stemmed_tickets_db",
inverted_index_db => "t/__test_inverted_index_db",
tickets_vocab_db => "t/__test_tickets_vocab_db",
idf_db => "t/__test_idf_db",
tkt_doc_vecs_db => "t/__test_tkt_doc_vecs_db",
tkt_doc_vecs_normed_db => "t/__test_tkt_doc_vecs_normed_db",
synset_cache_db => "t/__test_synset_cache_db",
add_synsets_to_tickets => 1,
want_synset_caching => 1,
max_num_syn_words => 3,
min_word_length => 4,
want_stemming => 1,
);
my @returned = $tclusterer->_test_excel_for_tickets();
my @should_be = qw/0 4 0 6/;
#ok( @returned ~~ @should_be, 'Able to process Excel' );
my @comparisons = map {$returned[$_] == $should_be[$_] ? 1 : 0} (0..@returned-1);
my $final_compare = 1;
foreach my $i (0..@returned-1) {
$final_compare *= $comparisons[$i]