Algorithm-TicketClusterer

 view release on metacpan or  search on metacpan

lib/Algorithm/TicketClusterer.pm  view on Meta::CPAN

        _synset_cache           =>   {},
        _vocab_hash             =>   {},
        _vocab_idf_hist         =>   {},
        _idf_t                  =>   {},
        _vocab_size             =>   undef,
        _doc_vector_template    =>   {},
        _tkt_doc_vecs           =>   {},
        _tkt_doc_vecs_normed    =>   {},
        _query_ticket_id        =>   undef,
        _inverted_index         =>   {},
        _debug1                 =>   $args{debug1} || 0, # for processing Excel
        _debug2                 =>   $args{debug2} || 0, # for modeling tickets
        _debug3                 =>   $args{debug3} || 0, # for retrieving similar tickets
        _wn                     =>   WordNet::QueryData->new( verbose => 0, 
                                                              noload => 1 ),
    }, $class;
}

#############################  Extract info from Excel  #######################

sub get_tickets_from_excel {
    my $self = shift;
    unlink $self->{_raw_tickets_db} if -s $self->{_raw_tickets_db};

lib/Algorithm/TicketClusterer.pm  view on Meta::CPAN

                $col_headers_row = $row;
                $col_headers_found = 1;
            }
            if ($cell_value eq $clustering_fieldname) {
                $col_index_for_clustering_field = $col;
            }
        }
    }
    $self->{_good_columns} = \@good_columns;
    print "\nThe unique id is in column: $col_index_for_unique_id\n"
        if $self->{_debug1};
    print "The clustering field is in column: " .
                "$col_index_for_clustering_field\n\n" if $self->{_debug1};
    my %Column_Headers;
    foreach my $field_index (0..@good_columns-1) {
        my $key = "field_" . $field_index;
        $Column_Headers{$key} = "";
    }
    my @col_headers = map {
        my $cell = 
           $worksheets[$which_worksheet-1]->get_cell($col_headers_row, $_);
        $cell ? _get_rid_of_wide_chars($cell->value()) : '';
    } @good_columns;
    $self->{_column_headers} = \@col_headers;
    $self->_display_column_headers() if $self->{_debug1};
    my $unique_id_field_index_in_good_columns = 
     _find_index_for_given_element( $col_index_for_unique_id, \@good_columns );
    my $clustering_field_index_in_good_columns =
     _find_index_for_given_element( $col_index_for_clustering_field, 
                             \@good_columns );
    die "Something is wrong with the info extracted from Excel " .
        "as the index for the column with unique IDs is not one of " .
        "good columns\n\n" 
        unless (defined $unique_id_field_index_in_good_columns) &&
               (defined $clustering_field_index_in_good_columns);

lib/Algorithm/TicketClusterer.pm  view on Meta::CPAN

    my $self = shift;
    my %check_hash;
    my @duplicates;
    foreach my $ticket (@{$self->{_all_tickets}}) {
        if (exists $ticket->{$self->{_unique_id_fieldname}}) {
            push @duplicates, $ticket->{$self->{_unique_id_fieldname}} 
               if exists $check_hash{$ticket->{$self->{_unique_id_fieldname}}};
            $check_hash{$ticket->{$self->{_unique_id_fieldname}}} = 1;
        }
    }
    if ($self->{_debug1}) {
        my $num_of_tickets = @{$self->{_all_tickets}};
        my $num_entries_check_hash = keys %check_hash;
        print "Number of tickets: $num_of_tickets\n";
        print "Number of keys in check hash: $num_entries_check_hash\n";
    }
    return \@duplicates;
}

sub show_original_ticket_for_given_id {
    my $self = shift;

lib/Algorithm/TicketClusterer.pm  view on Meta::CPAN

        if ($@) {                                 
           die "Something went wrong with disk storage of synset cache: $@";
        }
    }
}

# Expands one ticket in two passes: negated words are first replaced by
# their antonyms, after which synonyms are added for the words of the
# ticket.  The value of the synonym-adding pass is what gets returned.
sub _expand_one_ticket_with_synonyms {
    my ($self, $ticket_id) = @_;
    if ($self->{_debug2}) {
        print "\n\nEXPANDING TICKET $ticket_id WITH SYN-SETS:\n\n";
    }
    # The antonym pass must run first since it rewrites the stored text
    # that the synonym pass subsequently reads.
    $self->_replace_negated_words_with_antonyms_one_ticket($ticket_id);
    return $self->_add_to_words_their_synonyms_one_ticket($ticket_id);
}

# Rewrites the processed text of one ticket so that negated phrases of the
# form "not WORD" and "no WORD" are replaced by antonyms of WORD.
#
# Parameters:
#   $ticket_id -- key into $self->{_processed_tkts_by_ids}
#
# The "not" phrases are handled first; the "no" phrases are then searched
# for in the already rewritten text.  A negated word is considered only if
# it is longer than $self->{_min_word_length}, and at most
# $self->{_max_num_syn_words} antonyms are substituted for it.  The antonyms
# come from $self->_get_antonyms_for_word(), whose return value is
# dereferenced here as an array reference.
#
# Returns the rewritten text, which is also stored back under the same
# ticket id.  If there is no text for the id, nothing is stored and an
# empty value is returned.
sub _replace_negated_words_with_antonyms_one_ticket {
    my ($self, $ticket_id) = @_;
    my $record = $self->{_processed_tkts_by_ids}->{$ticket_id};
    return unless defined $record;
    foreach my $negator ('not', 'no') {
        my @negated_words = $record =~ /\b$negator\s+(\w+)/ig;
        my %already_handled;
        foreach my $word (@negated_words) {
            next unless length($word) > $self->{_min_word_length};
            # One substitution below takes care of every occurrence of
            # the phrase, so a repeated word needs no second lookup.
            next if $already_handled{lc $word}++;
            my @antonym_words = @{$self->_get_antonyms_for_word( $word )};
            next unless @antonym_words > 0;
            $#antonym_words = $self->{_max_num_syn_words} - 1
                  if @antonym_words > $self->{_max_num_syn_words};
            my $antonym_replacement_string = join ' ', @antonym_words;
            print "Antonym for $word is $antonym_replacement_string\n"
                if $self->{_debug2};
            # The substitution must agree with the detection pattern
            # above: case-insensitive and anchored at word boundaries.
            # Otherwise "Not working" is detected but never replaced,
            # "knot working" is mangled, and "not work" clobbers the
            # beginning of "not working".
            $record =~ 
              s/\b$negator\s+\Q$word\E\b/$antonym_replacement_string/gi;
        }
    }
    $self->{_processed_tkts_by_ids}->{$ticket_id} = $record;
    return $record;
}

sub _add_to_words_their_synonyms_one_ticket {
    my $self = shift;
    my $ticket_id = shift;
    my $record = $self->{_processed_tkts_by_ids}->{$ticket_id};
    my @words = split /\s+/, $record;

lib/Algorithm/TicketClusterer.pm  view on Meta::CPAN

        next if $word eq 'no';
        next if $word eq 'not';
        next unless $word =~ /^\w+$/ && 
                    length($word) > $self->{_min_word_length};
        my @synonym_words;
        @synonym_words = @{$self->{_synset_cache}->{$word}}
                      if exists $self->{_synset_cache}->{$word};        
        unless (exists $self->{_synset_cache}->{$word}) {
            @synonym_words = @{$self->_get_synonyms_for_word( $word )};
            print "syn-set for $word  =>   @synonym_words\n\n"
                if $self->{_debug2};
            my $word_root;
            if (@synonym_words == 0) {
                if ((length($word) > 4) && ($word =~ /(.+)s$/)) {
                    $word_root = $1;
                    @synonym_words = @{$self->_get_synonyms_for_word( $word_root )}
                        if length($word_root) >= $self->{_min_word_length};
                } elsif ((length($word) > 6) && ($word =~ /(.+)ing$/)) {
                    $word_root = $1;
                    @synonym_words = @{$self->_get_synonyms_for_word( $word_root )}
                        if length($word_root) >= $self->{_min_word_length};
                }
            }
            print "syn-set for word root $word_root  =>   @synonym_words\n\n" 
                if ( $self->{_debug2} && defined $word_root );
            _fisher_yates_shuffle( \@synonym_words ) if @synonym_words > 0;
            $#synonym_words = $self->{_max_num_syn_words} - 1
                  if @synonym_words > $self->{_max_num_syn_words};
            print "Retained syn-set for $word  =>   @synonym_words\n\n"
                if $self->{_debug2};
            $self->{_synset_cache}->{$word} = \@synonym_words;
            push @synonym_bag, @synonym_words;
        }
    }
    foreach my $syn_word (@synonym_bag) {
        push @words, lc($syn_word) 
            unless ((exists $self->{_stop_words}->{$syn_word}) || 
                        (length($syn_word) <= $self->{_min_word_length}));
    }
    my @sorted_words = sort @words;

lib/Algorithm/TicketClusterer.pm  view on Meta::CPAN

             $self->{_tickets_vocab_db}, O_RDWR|O_CREAT, 0640
            or die "Can't create DBM files: $!";       
    my %inverted_index;
    my $min = $self->{_min_word_length};
    foreach my $ticket_id (sort {$a <=> $b} keys %{$self->{_processed_tkts_by_ids}}) {
        my %uniques = ();
        my $record = $self->{_processed_tkts_by_ids}->{$ticket_id};
        my @brokenup = split /\n|\r|\"|\'|\.|\(|\)|\[|\]|\\|\/|\s+/, $record;
        my @clean_words = grep $_, map { /([a-z0-9_]{$min,})/i;$1 } @brokenup;
        next unless @clean_words;
        @clean_words = grep $_, map &_simple_stemmer($_, $self->{_debug2}), 
                                                                 @clean_words;
        map { $vocab_hist_on_disk{"\L$_"}++ } grep $_, @clean_words;
        for (@clean_words) { $uniques{"\L$_"}++ };
        map { $self->{_vocab_idf_hist}->{"\L$_"}++ } keys %uniques;
        map { push @{$self->{_inverted_index}->{"\L$_"}}, $ticket_id } 
                                                            keys %uniques;
        $self->{_stemmed_tkts_by_ids}->{$ticket_id} = join ' ', @clean_words;
    }
    foreach (keys %vocab_hist_on_disk) {
        $self->{_vocab_hist}->{$_} = $vocab_hist_on_disk{$_};
    }
    untie %vocab_hist_on_disk;
    $self->{_tkt_vocab_done} = 1;
    $self->{_vocab_size} = scalar( keys %{$self->{_vocab_hist}} );
    print "\n\nVocabulary size:  $self->{_vocab_size}\n\n"
        if $self->{_debug2};
    # Calculate idf(t):
    $self->{_idf_db} = "idf.db" unless $self->{_idf_db};
    unlink glob "$self->{_idf_db}.*";   
    tie my %idf_t_on_disk, 'SDBM_File', $self->{_idf_db}, O_RDWR|O_CREAT, 0640
                                            or die "Can't create DBM files: $!";       
    foreach (keys %{$self->{_vocab_idf_hist}}) {
        $idf_t_on_disk{$_} = abs( (1 + log($total_num_of_tickets
                                           /
                                           (1 + $self->{_vocab_idf_hist}->{$_}))) 
                                           / log(10) ); 

lib/Algorithm/TicketClusterer.pm  view on Meta::CPAN

    my @query_words = grep $_, split /\s+/, $query_record;
    my %relevant_tickets_set;
    die "\n\nYou did not set a value for the constructor parameter min_idf_threshold -- "
        unless $self->{_min_idf_threshold};
    foreach my $qword (@query_words) {
        map {$relevant_tickets_set{$_} = 1} @{$self->{_inverted_index}->{$qword}}
            if $self->{_idf_t}->{$qword} > $self->{_min_idf_threshold};
    }
    my @relevant_tickets = sort {$a <=> $b} keys %relevant_tickets_set;
    print "The relevant tickets for query: @relevant_tickets" 
        if $self->{_debug3};
    my $num_relevant_tkts = @relevant_tickets;
    print "\nThe number of tickets relevant to the query: $num_relevant_tkts\n\n";
    my %retrievals;
    my $rank = 0;
    foreach (sort {$self->_doc_vec_comparator} @relevant_tickets ) {
        $retrievals{$_} = $self->_similarity_to_query_ticket($_);
        $rank++;
        last if $rank == $self->{_how_many_retrievals};
    }
    if ($self->{_debug3}) {
        print "\n\nShowing the VSM retrievals and the similarity scores:\n\n";
        foreach (sort {$retrievals{$b} <=> $retrievals{$a}} keys %retrievals) {
            print "$_   =>   $retrievals{$_}\n";
        }
    }
    return \%retrievals;
}

sub _doc_vec_comparator {
    my $self = shift;

lib/Algorithm/TicketClusterer.pm  view on Meta::CPAN

    my $product = _vec_scalar_product(\@vec, \@qvec);
    $product /= $vec_mag * $qvec_mag;
    return $product;
}


########################  Utility Subroutines  ##########################

# Strips or rewrites common English suffixes of a word by applying a fixed
# list of case-insensitive rules, one after the other, and returns the
# result.  Each rule keeps the captured front part of the word and appends
# the ending listed with it (possibly nothing).  The rules are cumulative:
# a later rule sees the word as left behind by the earlier ones, so their
# order matters.
#
# Parameters:
#   $word  -- the word to be stemmed
#   $debug -- when true, the word is printed before and after stemming
sub _simple_stemmer {
    my ($word, $debug) = @_;
    print "\nStemming the word:        $word\n" if $debug;
    my @suffix_rules = (
        [ qr/(.*[a-z]t)ted$/i,          ''     ],
        [ qr/(.*[a-z]t)ting$/i,         ''     ],
        [ qr/(.*[a-z]l)ling$/i,         ''     ],
        [ qr/(.*[a-z]g)ging$/i,         ''     ],
        [ qr/(.*[a-z]ll)ed$/i,          ''     ],
        [ qr/(.*[a-z][^aeious])s$/i,    ''     ],
        [ qr/(.*[a-z])ies$/i,           'y'    ],
        [ qr/(.*[a-z]s)es$/i,           ''     ],
        [ qr/(.*[a-z][ck])es$/i,        'e'    ],
        [ qr/(.*[a-z]+)tions$/i,        'tion' ],
        [ qr/(.*[a-z]+)mming$/i,        'm'    ],
        [ qr/(.*[a-z]+[^rl])ing$/i,     ''     ],
        [ qr/(.*[a-z]+o[sn])ing$/i,     'e'    ],
        [ qr/(.*[a-z]+)tices$/i,        'tex'  ],
        [ qr/(.*[a-z]+)pes$/i,          'pe'   ],
        [ qr/(.*[a-z]+)sed$/i,          'se'   ],
        [ qr/(.*[a-z]+)ed$/i,           ''     ],
        [ qr/(.*[a-z]+)tation$/i,       't'    ],
    );
    foreach my $rule (@suffix_rules) {
        my ($pattern, $ending) = @{$rule};
        $word =~ s/$pattern/$1$ending/;
    }
    print "Stemmed word:                           $word\n\n" if $debug;
    return $word;
}

sub _exists {
    my $element = shift;
    my $array   = shift;
    my %hash;
    for my $item (@$array) {
        $hash{$item} = 1;
    }

lib/Algorithm/TicketClusterer.pm  view on Meta::CPAN

                            add_synsets_to_tickets
                            clustering_fieldname
                            min_word_length
                            min_idf_threshold
                            max_num_syn_words
                            stop_words_file
                            misspelled_words_file
                            unique_id_fieldname
                            want_stemming
                            how_many_retrievals
                            debug1
                            debug2
                            debug3
                          /;
    my $found_match_flag;
    foreach my $param (@params) {

        foreach my $legal (@legal_params) {
            $found_match_flag = 0;
            if ($param eq $legal) {
                $found_match_flag = 1;
                last;
            }

lib/Algorithm/TicketClusterer.pm  view on Meta::CPAN

tickets.  (It is not uncommon for engineering services to use jargon words
and acronyms that look like spelling errors to those not familiar with the
services.)  The module expects to see a file that is supplied through the
constructor parameter C<misspelled_words_file> that contains misspelled
words in the first column and their corrected versions in the second
column.  An example of such a file is included in the C<examples>
directory.  You would need to create your own version of such a file for
your application domain. Since conjuring up the misspellings that your
ticket submitters are likely to throw at you is futile, you might consider
the following approach, which I prefer to actually reading the tickets for
such errors: Turn on the debugging options in the constructor (C<debug2>
in particular) for some initially collected spreadsheets and watch for the
words for which WordNet is unable to supply any synonyms.  In a large
majority of cases, these would be the misspelled words.

Expanding a ticket with synonyms is made complicated by the fact that some
common words have such a large number of synonyms that they can overwhelm
the relatively small number of words in a ticket.  Adding too many synonyms
in relation to the size of a ticket can not only distort the sense of the
ticket but it can also increase the computational cost of processing all
the tickets.

lib/Algorithm/TicketClusterer.pm  view on Meta::CPAN

                     synset_cache_db           => $synset_cache_db,
                     stop_words_file           => $stop_words_file,
                     misspelled_words_file     => $misspelled_words_file,
                     add_synsets_to_tickets    => 1,
                     want_synset_caching       => 1,
                     min_idf_threshold         => 2.0,
                     max_num_syn_words         => 3,
                     min_word_length           => 4,
                     want_stemming             => 1,
                     how_many_retrievals       => 5,
                     debug1                    => 1,  # for processing, filtering Excel
                     debug2                    => 1,  # for doc modeling
                     debug3                    => 1,  # for retrieving similar tickets

                   );

Obviously, before you can invoke the constructor, you must provide values for the
variables shown to the right of the big arrows.  What these values should be is
made clear by the following alphabetized list that describes each of the constructor
parameters shown above:

=over 24

lib/Algorithm/TicketClusterer.pm  view on Meta::CPAN

You can turn off the addition of synonyms to the tickets by setting this boolean
parameter to 0.

=item I<clustering_fieldname:>

This is for supplying to the constructor the heading of the column in your Excel
spreadsheet that contains the textual data for the tickets.  For example, if the
column heading for the textual content of the tickets is `Description', you must
supply this string as the value for the parameter C<clustering_fieldname>.

=item I<debug1:>

When this parameter is set, the module prints out information regarding what columns
of the spreadsheet it is extracting information from, the headers for those columns,
the index of the column that contains the textual content of the tickets, and of the
column that contains the unique integer identifier for each ticket.  If you are
dealing with spreadsheets with a large number of tickets, it is best to pipe the
output of the module into a file to see the debugging information.

=item I<debug2:>

When this parameter is set, you will see how WordNet is being utilized to generate
word synonyms. This debugging output is also useful to see the extent of misspellings
in the tickets.  If WordNet is unable to find the synonyms for a word, chances are
that the word is not spelled correctly (or that it is a jargon word or a jargon
acronym).

=item I<debug3:>

This debug flag applies to the calculations carried out during the retrieval of
similar tickets.  When this flag is set, the module will display the candidate set of
tickets to be considered for matching with the query ticket.  This candidate set is
chosen by using the inverted index to collect all the tickets that share words with
the query ticket, provided the IDF value for each such word exceeds the threshold set
by the constructor parameter C<min_idf_threshold>.

=item I<excel_filename:>

This is obviously the name of the Excel file that contains the tickets you want to
process.



( run in 1.052 second using v1.01-cache-2.11-cpan-49f99fa48dc )