Algorithm-TicketClusterer

 view release on metacpan or  search on metacpan

lib/Algorithm/TicketClusterer.pm  view on Meta::CPAN

            my $key = "field_" . $field_index;
            die "The Columns Headers hash has no field for index " .
                   "$field_index\n    "
                unless exists $col_headers[$field_index];
            $onerow{$col_headers[$field_index]} = $values[$field_index];
        }
        push @{$self->{_all_tickets}}, \%onerow;
    }
    my @duplicates_for_id_field = @{$self->_check_unique_id_field()};
    if (@duplicates_for_id_field > 0) {
        print "Your supposedly unique ID field values for duplicates: @duplicates_for_id_field\n";
        die "\n\nYour unique id field for tickets contains duplicate id's";
    }
    foreach my $ticket (@{$self->{_all_tickets}}) {    
        $self->{_tickets_by_ids}->{$ticket->{$unique_id_fieldname}} =
            lc($ticket->{$clustering_fieldname});
    }
    $self->{_total_num_tickets} = scalar @{$self->{_all_tickets}};
    $self->store_raw_tickets_on_disk();
}

sub _test_excel_for_tickets {
    my $self = shift;
    use Text::Iconv;
    my $converter = Text::Iconv->new("utf-8", "windows-1251");
    my $filename = $self->{_excel_filename} || die("Excel sheet needed for testing is missing");
    my $workbook = Spreadsheet::XLSX->new( $filename, $converter );
    my @worksheets = $workbook->worksheets();
    my ( $row_min, $row_max ) = $worksheets[0]->row_range();
    my ( $col_min, $col_max ) = $worksheets[0]->col_range();
    return ($row_min, $row_max, $col_min, $col_max);
}

sub _display_column_headers {
    my $self = shift;
    print "\nThe good columns are: @{$self->{_good_columns}}\n\n";
    my $overall_header_string = join '  <>  ', @{$self->{_column_headers}};
    print "The column headers are: $overall_header_string\n\n";
}

sub _check_unique_id_field {
    my $self = shift;
    my %check_hash;
    my @duplicates;
    foreach my $ticket (@{$self->{_all_tickets}}) {
        if (exists $ticket->{$self->{_unique_id_fieldname}}) {
            push @duplicates, $ticket->{$self->{_unique_id_fieldname}} 
               if exists $check_hash{$ticket->{$self->{_unique_id_fieldname}}};
            $check_hash{$ticket->{$self->{_unique_id_fieldname}}} = 1;
        }
    }
    if ($self->{_debug1}) {
        my $num_of_tickets = @{$self->{_all_tickets}};
        my $num_entries_check_hash = keys %check_hash;
        print "Number of tickets: $num_of_tickets\n";
        print "Number of keys in check hash: $num_entries_check_hash\n";
    }
    return \@duplicates;
}

sub show_original_ticket_for_given_id {
    my $self = shift;
    my $id = shift;
    print "\n\nDisplaying the fields for the ticket $id:\n\n";
    foreach my $ticket (@{$self->{_all_tickets}}) {
        if ( $ticket->{$self->{_unique_id_fieldname}} == $id) {
            foreach my $key (sort keys %{$ticket}) {
                my $value = $ticket->{$key};
                $value =~ s/^\s+//;
                $value =~ s/\s+$//;
                printf("%20s  ==>  %s\n", $key, $value);
            }
        }
    }
}

sub show_raw_ticket_clustering_data_for_given_id {
    my $self = shift;
    my $ticket_id = shift;
    my $record = $self->{_tickets_by_ids}->{$ticket_id};
    print "\n\nDISPLAYING THE RAW CLUSTERING DATA FOR TICKET $ticket_id:\n\n" .
        "$record\n\n";
    return $record;
}

# Needed by test.t
sub _raw_ticket_clustering_data_for_given_id {
    my $self = shift;
    my $ticket_id = shift;
    my $record = $self->{_tickets_by_ids}->{$ticket_id};
    return $record;
}


sub show_processed_ticket_clustering_data_for_given_id {
    my $self = shift;
    my $ticket_id = shift;
    my $record = $self->{_processed_tkts_by_ids}->{$ticket_id};
    print "\n\nDISPLAYING PROCESSED CLUSTERING DATA FOR TICKET $ticket_id:\n\n" .
        "$record\n\n";
}

sub show_stemmed_ticket_clustering_data_for_given_id {
    my $self = shift;
    my $ticket_id = shift;
    my $record = $self->{_stemmed_tkts_by_ids}->{$ticket_id};
    print "\n\nDISPLAYING STEMMED CLUSTERING DATA FOR TICKET $ticket_id:\n\n" .
        "$record\n\n";
}

# The following function is a good diagnostic tool to look into the
# array stored in $self->{_all_tickets}.  Each element of this array
# is a record that represents one row of the Excel file.
sub _show_row {
    my $self = shift;
    my $row_num = shift;
    my $total_rows = @{$self->{_all_tickets}};
    print "There are $total_rows items in the \$all_tickets array\n";
    die "The row that you want to see does not exist" 
        unless $row_num < $total_rows;
    my %record = %{$self->{_all_tickets}->[$row_num]};

lib/Algorithm/TicketClusterer.pm  view on Meta::CPAN

    foreach (keys %vocab_hist_on_disk) {
        $self->{_vocab_hist}->{$_} = $vocab_hist_on_disk{$_};
    }
    untie %vocab_hist_on_disk;
    $self->{_tkt_vocab_done} = 1;
    $self->{_vocab_size} = scalar( keys %{$self->{_vocab_hist}} );
    print "\n\nVocabulary size:  $self->{_vocab_size}\n\n"
        if $self->{_debug2};
    # Calculate idf(t):
    $self->{_idf_db} = "idf.db" unless $self->{_idf_db};
    unlink glob "$self->{_idf_db}.*";   
    tie my %idf_t_on_disk, 'SDBM_File', $self->{_idf_db}, O_RDWR|O_CREAT, 0640
                                            or die "Can't create DBM files: $!";       
    foreach (keys %{$self->{_vocab_idf_hist}}) {
        $idf_t_on_disk{$_} = abs( (1 + log($total_num_of_tickets
                                           /
                                           (1 + $self->{_vocab_idf_hist}->{$_}))) 
                                           / log(10) ); 
    }
    foreach (keys %idf_t_on_disk) {
        $self->{_idf_t}->{$_} = $idf_t_on_disk{$_};
    }
    untie %idf_t_on_disk;
}

sub display_tickets_vocab {
    my $self = shift;
    die "tickets vocabulary not yet constructed"
        unless keys %{$self->{_vocab_hist}};
    print "\n\nDisplaying tickets vocabulary (the number shown against each word is the number of times each word appears in ALL the tickets):\n\n";
    foreach (sort keys %{$self->{_vocab_hist}}){
        my $outstring = sprintf("%30s     %d", $_,$self->{_vocab_hist}->{$_});
        print "$outstring\n";
    }
    my $vocab_size = scalar( keys %{$self->{_vocab_hist}} );
    print "\nSize of the tickets vocabulary: $vocab_size\n\n";
}

sub display_inverse_document_frequencies {
    my $self = shift;
    die "tickets vocabulary not yet constructed"
        unless keys %{$self->{_vocab_idf_hist}};
    print "\n\nDisplaying inverse document frequencies (the number of tickets in which each word appears):\n\n";
    foreach ( sort keys %{$self->{_vocab_idf_hist}} ) {               
        my $outstring = sprintf("%30s     %d", 
                       $_, $self->{_vocab_idf_hist}->{$_});
        print "$outstring\n";
    }
    print "\nDisplaying idf(t) = log(D/d(t)) where D is total number of tickets and d(t) the number of tickets with the word t:\n";
    foreach ( sort keys %{$self->{_idf_t}} ) {               
        my $outstring = sprintf("%30s     %f", $_,$self->{_idf_t}->{$_});
        print "$outstring\n";
    }
}

# The following subroutine is useful for diagnostic purposes.  It
# lists the number of tickets that a word appears in and also lists
# the tickets.  But be careful in interpreting its results.  Note
# if you invoke this subroutine after the synsets have been added
# to the tickets, you may find words being attributed to tickets
# that do not actually contain them in the original Excel sheet.
sub list_processed_tickets_for_a_word {
    my $self = shift;
    while (my $word = <STDIN>) {    #enter ctrl-D to exit the loop
        chomp $word;
        my @ticket_list;
        foreach my $ticket_id (sort {$a <=> $b} keys %{$self->{_processed_tkts_by_ids}}) {
            my $record = $self->{_processed_tkts_by_ids}->{$ticket_id};
            push @ticket_list, $ticket_id if $record =~ /\b$word\b/i;
        }
        my $num = @ticket_list;
        print "\nThe number of processed tickets that mention the word `$word': $num\n\n";
        print "The processed tickets: @ticket_list\n\n";
    }
}

sub list_stemmed_tickets_for_a_word {
    my $self = shift;
    while (my $word = <STDIN>) {    #enter ctrl-D to exit the loop
        chomp $word;
        my @ticket_list;
        foreach my $ticket_id (sort {$a <=> $b} keys %{$self->{_stemmed_tkts_by_ids}}) {
            my $record = $self->{_stemmed_tkts_by_ids}->{$ticket_id};
            push @ticket_list, $ticket_id if $record =~ /\b$word\b/i;
        }
        my $num = @ticket_list;
        print "\nThe number of stemmed tickets that mention the word `$word': $num\n\n";
        print "The stemmed tickets: @ticket_list\n\n";
    }
}

##############  Generate Document Vectors for Tickets  ####################

sub construct_doc_vectors_for_all_tickets {
    my $self = shift;
    foreach ( sort keys %{$self->{_vocab_hist}} ) {
        $self->{_doc_vector_template}->{$_} = 0;    
    }
    my $num_of_tickets = keys %{$self->{_stemmed_tkts_by_ids}};
    my $i = 1;
    foreach my $ticket_id (sort {$a <=> $b} keys %{$self->{_stemmed_tkts_by_ids}}) {
        my $doc_vec_ref = 
            $self->_construct_doc_vector_for_one_ticket($ticket_id);
        print "Finished constructing two doc vecs for ticket $ticket_id ($i out of $num_of_tickets)\n";
        $i++;
    }
}

sub _construct_doc_vector_for_one_ticket {
    my $self = shift;
    my $ticket_id = shift;
    unless (keys %{$self->{_doc_vector_template}}) {
        foreach ( sort keys %{$self->{_vocab_hist}} ) {
            $self->{_doc_vector_template}->{$_} = 0;      
        }
    }
    my %doc_vector = %{_deep_copy_hash($self->{_doc_vector_template})};
    foreach ( sort keys %{$self->{_doc_vector_template}} ) {  
        $doc_vector{$_} = 0;    
    }
    my $min = $self->{_min_word_length};

lib/Algorithm/TicketClusterer.pm  view on Meta::CPAN

                         misspelled_words_file     => $misspelled_words_file,
                         add_synsets_to_tickets    => 1,
                         want_synset_caching       => 1,
                         max_num_syn_words         => 3,
                         min_word_length           => 4,
                         want_stemming             => 1,
                    );
    
    ## Extract information from Excel spreadsheets:
    $clusterer->get_tickets_from_excel();
    
    ## Apply cleanup filters and add synonyms:
    $clusterer->delete_markup_from_all_tickets();
    $clusterer->apply_filter_to_all_tickets();
    $clusterer->expand_all_tickets_with_synonyms();
    
    ## Construct the VSM doc model for the tickets:
    $clusterer->get_ticket_vocabulary_and_construct_inverted_index();
    $clusterer->construct_doc_vectors_for_all_tickets();

    #  Of the various constructor parameters shown above, the following two
    #  are critical to how information is extracted from an Excel
    #  spreadsheet: `clustering_fieldname' and `unique_id_fieldname'.  The
    #  first is the heading of the column that contains the textual content
    #  of the tickets.  The second is the heading of the column that
    #  contains a unique integer identifier for each ticket.

    #  The nine database related constructor parameters (these end in the
    #  suffix `_db') are there in order to avoid repeated parsing of the
    #  spreadsheet and preprocessing of the tickets every time you need to
    #  make a retrieval for a new ticket.  The goal here is that after the
    #  ticket information has been ingested from a spreadsheet, you would
    #  want to carry out similar-ticket retrieval in real time.  (Whether
    #  or not real-time retrieval would be feasible in actual practice
    #  would also depend on what hardware you are using, obviously.)

    #  After the above preprocessing and doc modeling steps, you can
    #  extract the most similar past tickets for a new query ticket with a
    #  script in which the constructor call would look like:

    my $clusterer = Algorithm::TicketClusterer->new( 
    
                         clustering_fieldname      => $fieldname_for_clustering,
                         unique_id_fieldname       => $unique_id_fieldname,
                         raw_tickets_db            => $raw_tickets_db,
                         processed_tickets_db      => $processed_tickets_db,
                         stemmed_tickets_db        => $stemmed_tickets_db,
                         inverted_index_db         => $inverted_index_db,
                         tickets_vocab_db          => $tickets_vocab_db,
                         idf_db                    => $idf_db,
                         tkt_doc_vecs_db           => $tkt_doc_vecs_db,
                         tkt_doc_vecs_normed_db    => $tkt_doc_vecs_normed_db,
                         min_idf_threshold         => 1.8,
                         how_many_retrievals       => 5,
                    );
    
    my $query_tkt = 1393548;
    $clusterer->restore_ticket_vectors_and_inverted_index();
    my %retrieved = %{$clusterer->retrieve_similar_tickets_with_vsm($query_tkt)};
    foreach my $tkt_id (sort {$retrieved{$b} <=> $retrieved{$a}} keys %retrieved) {
        $clusterer->show_original_ticket_for_given_id( $tkt_id );
    }

    #  Of all the parameters shown above in the constructor call, the
    #  parameter min_idf_threshold plays a large role in what tickets are
    #  returned by the retrieval function. The value of this parameter
    #  depends on the number of tickets in your Excel spreadsheet.  If the
    #  number of tickets is in the low hundreds, this parameter is likely to
    #  require a value of 1.5 to 1.8.  If the number of tickets is in the
    #  thousands, the value of this parameter is likely to be between 2 and
    #  3. See the writeup on this parameter in the API description in the
    #  rest of this documentation.


=head1 CHANGES

Version 1.01 of the module removes the platform dependency of the functions used for
reading the text files for stop words, misspelled words, etc.


=head1 DESCRIPTION

B<Algorithm::TicketClusterer> is a I<perl5> module for retrieving
previously processed Excel-stored tickets similar to a new ticket.  Routing
decisions made for the past similar tickets can be useful in expediting the
routing of a new ticket.

Tickets are commonly used in software services industry and customer
support businesses to record requests for service, product complaints,
user feedback, and so on.

With regard to the routing of a ticket, you would want each new ticket to
be handled by the tech support individual who is most qualified to address
the issue raised in the ticket.  Identifying the right individual for each
new ticket in real-time is no easy task for organizations that man large
service centers and helpdesks.  So if it were possible to quickly identify
the previously processed tickets that are most similar to a new ticket, one
could think of constructing semi-automated (or, perhaps, even fully
automated) ticket routers.

Identifying old tickets similar to a new ticket is made challenging by the
fact that folks who submit tickets often write them quickly and informally.
The informal style of writing means that different people may use different
colloquial terms to describe the same thing. And the quickness associated
with their submission causes the tickets to frequently contain spelling and
other errors such as conjoined words, fragmentation of long words, and so
on.

This module is an attempt at dealing with these challenges.

The problem of different people using different words to describe the same
thing is taken care of by using WordNet to add to each ticket a designated
number of synonyms for each word in the ticket.  The idea is that after all
the tickets are expanded in this manner, they would become grounded in a
common vocabulary. The synonym expansion of a ticket takes place only after
the negated phrases (that is, the words preceded by 'no' or 'not') are
replaced by their antonyms.

Obviously, expanding a ticket by synonyms makes sense only after it is
corrected for spelling and other errors.  What sort of errors one looks for
and corrects would, in general, depend on the application domain of the

lib/Algorithm/TicketClusterer.pm  view on Meta::CPAN

This method calls on the C<Spreadsheet::ParseExcel> module to extract the tickets
from the old-style Excel spreadsheets and the C<Spreadsheet::XLSX> module for doing
the same from the new-style Excel spreadsheets.

=item  B<get_ticket_vocabulary_and_construct_inverted_index()>

    $clusterer->get_ticket_vocabulary_and_construct_inverted_index()

As mentioned in B<THE THREE STAGES OF PROCESSING>, the second stage of processing ---
doc modeling of the tickets --- starts with the stemming of the words in the tickets,
constructing a vocabulary of all the stemmed words in all the tickets, and
constructing an inverted index for the vocabulary words.  All of these things are
accomplished by this method.

=item  B<restore_processed_tickets_from_disk()>

    $clusterer->restore_processed_tickets_from_disk()

This loads into your script the output of the ticket preprocessing stage.  This
method is called internally by C<restore_ticket_vectors_and_inverted_index()>, which
you would use in your ticket retrieval script, assuming it is separate from the
ticket preprocessing script.

=item B<restore_raw_tickets_from_disk()>

    $clusterer->restore_raw_tickets_from_disk()    

With this method, you are spared the trouble of having to repeatedly parse the same
Excel spreadsheet during the development phase as you are testing the module with
different query tickets.  This method is called internally by
C<restore_ticket_vectors_and_inverted_index()>.

=item  B<restore_stemmed_tickets_from_disk()>

        $clusterer->restore_stemmed_tickets_from_disk();

This method is called internally by
C<restore_ticket_vectors_and_inverted_index()>.

=item  B<restore_ticket_vectors_and_inverted_index()>

    $clusterer->restore_ticket_vectors_and_inverted_index()

If you are going to be doing ticket preprocessing and doc modeling in one script and
ticket retrieval in another, then this is the first method you would need to call in
the latter for the restoration of the VSM model for the tickets and the inverted
index.

=item B<retrieve_similar_tickets_with_vsm()>

    my $retrieved_hash_ref = $clusterer->retrieve_similar_tickets_with_vsm( $ticket_num )

It is this method that retrieves tickets that are most similar to a query ticket.
The method first utilizes the inverted index to construct a candidate list of the
tickets that share words with the query ticket.  Only those words play a role here
whose IDF values exceed C<min_idf_threshold>.  Subsequently, the query ticket vector
is matched with each of the ticket vectors in the candidate list.  The method returns
a reference to a hash whose keys are the IDs for the tickets that match the query
ticket and whose values the cosine similarity distance.

=item B<show_original_ticket_for_given_id()>

    $clusterer->show_original_ticket_for_given_id( $ticket_num )

The argument to the method is the unique integer ID of a ticket for which
you want to see all the fields as stored in the Excel spreadsheet.

=item B<show_raw_ticket_clustering_data_for_given_id()>

While the previous method shows all the fields for a ticket, this method
shows only the textual content --- the content you want to use for
establishing similarity between a query ticket and the other tickets.

=item B<show_processed_ticket_clustering_data_for_given_id()>

    $clusterer->show_processed_ticket_clustering_data_for_given_id( $ticket_num );

This is the method to call if you wish to examine the textual content of a ticket
after it goes through the preprocessing steps.  In particular, you will see the
corrections made, the synonyms added, etc.  You would need to set the argument
C<$ticket_num> to the unique integer ID of the ticket you are interested in.

=item  B<store_processed_tickets_on_disk()>

    $clusterer->store_processed_tickets_on_disk();

This stores in a database file the preprocessed textual content of the
tickets.

=item B<store_raw_tickets_on_disk()>

    $clusterer->store_raw_tickets_on_disk();

This method is called by the C<get_tickets_from_excel()> method to store on the disk
the tickets extracted from the Excel spreadsheet.  Obviously, you can also call it in
your own script for doing the same.

=item  B<store_stemmed_tickets_and_inverted_index_on_disk()>

    $clusterer->store_stemmed_tickets_and_inverted_index_on_disk()

This method stores in a database file the stemmed tickets and the inverted index that
are produced at the end of the second stage of processing.

=item B<show_stemmed_ticket_clustering_data_for_given_id()>

    $clusterer->show_stemmed_ticket_clustering_data_for_given_id( $ticket_num );

If you want to see what sort of a job the stemmer is doing for a ticket, this is the
method to call.  You would need to set the argument C<$ticket_num> to the unique
integer ID of the ticket you are interested in.

=item  B<store_ticket_vectors()>

    $clusterer->store_ticket_vectors()

As the name implies, this call stores the vectors, both regular and normalized, in a
database file on the disk.

=back

=head1 HOW THE MATCHING TICKETS ARE RETRIEVED



( run in 2.294 seconds using v1.01-cache-2.11-cpan-cdf2f3d4e48 )