Algorithm-TicketClusterer

MANIFEST

examples/ExampleExcelFile.xls
examples/SampleTest.xlsx
examples/misspelled_words.txt
examples/README
examples/retrieve_similar_tickets.pl
examples/stop_words.txt
examples/ticket_preprocessor_and_doc_modeler.pl
lib/Algorithm/TicketClusterer.pm
Makefile.PL
MANIFEST			This list of files
README
t/test.t
t/__SampleTest.xlsx

META.yml                                 Module YAML meta-data (added by MakeMaker)
META.json                                Module JSON meta-data (added by MakeMaker)

examples/README

The 'examples' directory:

This directory contains the following scripts that would be
your quickest way to become familiar with this module:

1.    ticket_preprocessor_and_doc_modeler.pl

2.    retrieve_similar_tickets.pl


Run the first script to see ticket preprocessing and doc
modeling being carried out on the (fake) tickets stored in
the Excel file ExampleExcelFile.xls in this directory.

Next, run the second script to retrieve five tickets that
are closest to the query ticket whose integer ID is supplied
to the retrieve_similar_tickets_with_vsm() method in the
script.

If both scripts run fine, go through the statements in the
scripts to see how you need to sequence the different
preprocessing, doc modeling, and retrieval steps for your
tickets.

IMPORTANT1: The spreadsheet 

                        ExampleExcelFile.xls 

            is contrived.  Do not draw any conclusions about 
            the retrieval precision of this module based on
            the tickets in this file.

examples/retrieve_similar_tickets.pl

#!/usr/bin/perl -w

###  retrieve_similar_tickets.pl

###  After the tickets stored in an Excel spreadsheet have been subject to the
###  preprocessing steps listed in the script `ticket_preprocessor_and_doc_modeler.pl',
###  you use the script shown here to retrieve the tickets that are most similar
###  to a given query ticket.

###  For obvious reasons, you would want the names of the database files
###  mentioned in this script to match the names in the ticket
###  preprocessing script.

###  IMPORTANT  IMPORTANT  IMPORTANT  IMPORTANT  IMPORTANT:
###
###  The parameter
###
###                 min_idf_threshold 
###
###
###  depends on the number of tickets in your Excel spreadsheet.  If the
###  number of tickets is in the low hundreds, this parameter is likely to

examples/retrieve_similar_tickets.pl



#use lib '../blib/lib', '../blib/arch';

use strict;
use Algorithm::TicketClusterer;

my $fieldname_for_clustering = "Description";
my $unique_id_fieldname = "Request No";
my $raw_tickets_db = "raw_tickets.db";
my $processed_tickets_db = "processed_tickets.db";
my $stemmed_tickets_db = "stemmed_tickets.db";
my $inverted_index_db = "inverted_index.db";
my $tickets_vocab_db = "tickets_vocab.db";
my $idf_db = "idf.db";
my $tkt_doc_vecs_db = "tkt_doc_vecs.db";
my $tkt_doc_vecs_normed_db = "tkt_doc_vecs_normed.db";

my $clusterer = Algorithm::TicketClusterer->new( 

                     clustering_fieldname      => $fieldname_for_clustering,
                     unique_id_fieldname       => $unique_id_fieldname,
                     raw_tickets_db            => $raw_tickets_db,
                     processed_tickets_db      => $processed_tickets_db,
                     stemmed_tickets_db        => $stemmed_tickets_db,
                     inverted_index_db         => $inverted_index_db,
                     tickets_vocab_db          => $tickets_vocab_db,
                     idf_db                    => $idf_db,
                     tkt_doc_vecs_db           => $tkt_doc_vecs_db,
                     tkt_doc_vecs_normed_db    => $tkt_doc_vecs_normed_db,
                     min_idf_threshold         => 1.3,
                     how_many_retrievals       => 5,
                );

examples/retrieve_similar_tickets.pl

my $retrieved_hash_ref = $clusterer->retrieve_similar_tickets_with_vsm( $ticket_num );

print "\nDisplaying the tickets considered most similar to the query ticket $ticket_num\n\n";

my %retrieved_hash = %{$retrieved_hash_ref};
my $rank = 1;
foreach my $ticket_id (sort { $retrieved_hash{$b} <=> $retrieved_hash{$a} } 
                                                          keys %retrieved_hash) {
    my $similarity_score = $retrieved_hash{$ticket_id};
    print "\n\n\n --------- Retrieved ticket at similarity rank $rank   (simlarity score: $similarity_score) ---------\n";
    $clusterer->show_processed_ticket_clustering_data_for_given_id( $ticket_id );    
    $clusterer->show_original_ticket_for_given_id( $ticket_id );
    $rank++;
}

examples/ticket_preprocessor_and_doc_modeler.pl

#!/usr/bin/perl -w

### ticket_preprocessor_and_doc_modeler.pl

###  This is the script you must run on a new Excel spreadsheet before you
###  can retrieve similar tickets from the tickets stored in the
###  spreadsheet.

###  This script requires the user to specify names for the nine database files
###  that are created for the tickets.  This is to avoid having to process
###  all the tickets every time you need to make a retrieval for a new
###  ticket.


#use lib '../blib/lib', '../blib/arch';

use strict;
use Algorithm::TicketClusterer;

my $excel_filename = "ExampleExcelFile.xls";
#my $excel_filename = "SampleTest.xlsx";
my $fieldname_for_clustering = "Description";
my $unique_id_fieldname = "Request No";
my $raw_tickets_db = "raw_tickets.db";
my $processed_tickets_db = "processed_tickets.db";
my $stemmed_tickets_db = "stemmed_tickets.db";
my $inverted_index_db = "inverted_index.db";
my $tickets_vocab_db = "tickets_vocab.db";
my $idf_db = "idf.db";
my $tkt_doc_vecs_db = "tkt_doc_vecs.db";
my $tkt_doc_vecs_normed_db = "tkt_doc_vecs_normed.db";
my $synset_cache_db = "synset_cache.db";
my $stop_words_file = "stop_words.txt";
my $misspelled_words_file = "misspelled_words.txt";

my $clusterer = Algorithm::TicketClusterer->new( 

                     excel_filename            => $excel_filename,
                     which_worksheet           => 1,
                     clustering_fieldname      => $fieldname_for_clustering,
                     unique_id_fieldname       => $unique_id_fieldname,
                     raw_tickets_db            => $raw_tickets_db,
                     processed_tickets_db      => $processed_tickets_db,
                     stemmed_tickets_db        => $stemmed_tickets_db,
                     inverted_index_db         => $inverted_index_db,
                     tickets_vocab_db          => $tickets_vocab_db,
                     idf_db                    => $idf_db,
                     tkt_doc_vecs_db           => $tkt_doc_vecs_db,
                     tkt_doc_vecs_normed_db    => $tkt_doc_vecs_normed_db,
                     synset_cache_db           => $synset_cache_db,
                     stop_words_file           => $stop_words_file,
                     misspelled_words_file     => $misspelled_words_file,
                     add_synsets_to_tickets    => 1,

examples/ticket_preprocessor_and_doc_modeler.pl

                     want_stemming             => 1,
                );

## Extract information from Excel spreadsheets:
$clusterer->get_tickets_from_excel();

## Apply cleanup filters and add synonyms:
$clusterer->delete_markup_from_all_tickets();
$clusterer->apply_filter_to_all_tickets();
$clusterer->expand_all_tickets_with_synonyms();
$clusterer->store_processed_tickets_on_disk();

## Construct the VSM doc model for the tickets:
$clusterer->get_ticket_vocabulary_and_construct_inverted_index();
$clusterer->construct_doc_vectors_for_all_tickets();
$clusterer->store_stemmed_tickets_and_inverted_index_on_disk();
$clusterer->store_ticket_vectors();

lib/Algorithm/TicketClusterer.pm

sub new { 
    my ($class, %args) = @_;
    my @params = keys %args;
    croak "\nYou have used a wrong name for a keyword argument " .
          "--- perhaps a misspelling\n" 
          if _check_for_illegal_params(@params) == 0;
    bless {
        _excel_filename         =>   $args{excel_filename}, 
        _which_worksheet        =>   $args{which_worksheet},
        _raw_tickets_db         =>   $args{raw_tickets_db}, 
        _processed_tickets_db   =>   $args{processed_tickets_db}, 
        _synset_cache_db        =>   $args{synset_cache_db}, 
        _stemmed_tickets_db     =>   $args{stemmed_tickets_db}, 
        _inverted_index_db      =>   $args{inverted_index_db},
        _tickets_vocab_db       =>   $args{tickets_vocab_db},
        _idf_db                 =>   $args{idf_db}, 
        _tkt_doc_vecs_db        =>   $args{tkt_doc_vecs_db},
        _tkt_doc_vecs_normed_db =>   $args{tkt_doc_vecs_normed_db},
        _clustering_fieldname   =>   $args{clustering_fieldname}, 
        _unique_id_fieldname    =>   $args{unique_id_fieldname}, 
        _stop_words_file        =>   $args{stop_words_file},

lib/Algorithm/TicketClusterer.pm

        _want_stemming          =>   $args{want_stemming} || 0,
        _how_many_retrievals    =>   $args{how_many_retrievals} || 5,
        _min_idf_threshold      =>   $args{min_idf_threshold},
        _max_num_syn_words      =>   $args{max_num_syn_words} || 3,
        _want_synset_caching    =>   $args{want_synset_caching} || 0,
        _stop_words             =>   {},
        _all_tickets            =>   [],
        _column_headers         =>   [],
        _good_columns           =>   [],
        _tickets_by_ids         =>   {},
        _processed_tkts_by_ids  =>   {},
        _stemmed_tkts_by_ids    =>   {},
        _misspelled_words       =>   {},
        _total_num_tickets      =>   0,
        _synset_cache           =>   {},
        _vocab_hash             =>   {},
        _vocab_idf_hist         =>   {},
        _idf_t                  =>   {},
        _vocab_size             =>   undef,
        _doc_vector_template    =>   {},
        _tkt_doc_vecs           =>   {},
        _tkt_doc_vecs_normed    =>   {},
        _query_ticket_id        =>   undef,
        _inverted_index         =>   {},
        _debug1                 =>   $args{debug1} || 0, # for processing Excel
        _debug2                 =>   $args{debug2} || 0, # for modeling tickets
        _debug3                 =>   $args{debug3} || 0, # for retrieving similar tickets
        _wn                     =>   WordNet::QueryData->new( verbose => 0, 
                                                              noload => 1 ),
    }, $class;
}

#############################  Extract info from Excel  #######################

sub get_tickets_from_excel {
    my $self = shift;
    unlink $self->{_raw_tickets_db} if -s $self->{_raw_tickets_db};
    unlink $self->{_processed_tickets_db} if -s $self->{_processed_tickets_db};
    unlink $self->{_synset_cache_db} if -s $self->{_synset_cache_db};
    unlink $self->{_stemmed_tickets_db} if -s $self->{_stemmed_tickets_db};
    unlink $self->{_inverted_index_db} if -s $self->{_inverted_index_db};
    unlink $self->{_tkt_doc_vecs_db} if -s $self->{_tkt_doc_vecs_db};
    unlink $self->{_tkt_doc_vecs_normed_db} if -s $self->{_tkt_doc_vecs_normed_db};   
    unlink glob "$self->{_tickets_vocab_db}.*";   
    unlink glob "$self->{_idf_db}.*";
    my $filename = $self->{_excel_filename} || die("Excel file required");
    my $clustering_fieldname = $self->{_clustering_fieldname} 
      || die("\nYou forgot to specify a value for the constructor parameter clustering_fieldname that points to the data to be clustered in your Excel sheet -- ");

lib/Algorithm/TicketClusterer.pm


# Needed by test.t
sub _raw_ticket_clustering_data_for_given_id {
    my $self = shift;
    my $ticket_id = shift;
    my $record = $self->{_tickets_by_ids}->{$ticket_id};
    return $record;
}


sub show_processed_ticket_clustering_data_for_given_id {
    my $self = shift;
    my $ticket_id = shift;
    my $record = $self->{_processed_tkts_by_ids}->{$ticket_id};
    print "\n\nDISPLAYING PROCESSED CLUSTERING DATA FOR TICKET $ticket_id:\n\n" .
        "$record\n\n";
}

sub show_stemmed_ticket_clustering_data_for_given_id {
    my $self = shift;
    my $ticket_id = shift;
    my $record = $self->{_stemmed_tkts_by_ids}->{$ticket_id};
    print "\n\nDISPLAYING STEMMED CLUSTERING DATA FOR TICKET $ticket_id:\n\n" .
        "$record\n\n";

lib/Algorithm/TicketClusterer.pm

        unless (($word eq 'no') or ($word eq 'not')) {
            next if length($word) < $self->{_min_word_length};
        }
        if (exists $self->{_misspelled_words}->{lc($word)}) {
            push @new_words, $self->{_misspelled_words}->{lc($word)}; 
            next;
        }
        push @new_words, $word unless exists $self->{_stop_words}->{lc($word)};
    }
    my $new_record = join ' ', @new_words;
    $self->{_processed_tkts_by_ids}->{$ticket_id} = $new_record;
}

sub _get_synonyms_for_word {
    my $self = shift;
    my $word = shift;
    my $no_sense_indicators = 1;
    my $wn = $self->{_wn};
    my @parts_of_speech = $wn->querySense("$word");  

    my %noun_synonyms;

lib/Algorithm/TicketClusterer.pm

    my $num_of_tickets = $self->{_total_num_tickets};
    if ($self->{_want_synset_caching} && -s $self->{_synset_cache_db}) {
        eval {
            $self->{_synset_cache} = retrieve( $self->{_synset_cache_db} );
        };
        if ($@) {
           print "Something went wrong with restoration of synset cache: $@";
        }
    }
    my $i = 1;
    foreach my $ticket_id (sort {$a <=> $b} keys %{$self->{_processed_tkts_by_ids}}) {
        $self->_expand_one_ticket_with_synonyms($ticket_id);
        print "Finished syn expansion of ticket $ticket_id ($i out of $num_of_tickets)\n";
        $i++;
    }
    if ($self->{_want_synset_caching}) {
        $self->{_synset_cache_db} = "synset_cache.db" unless $self->{_synset_cache_db};
        eval {                    
            store( $self->{_synset_cache}, $self->{_synset_cache_db} ); 
        };
        if ($@) {                                 

lib/Algorithm/TicketClusterer.pm

    my $ticket_id = shift;
    print "\n\nEXPANDING TICKET $ticket_id WITH SYN-SETS:\n\n" 
                                              if $self->{_debug2};
    $self->_replace_negated_words_with_antonyms_one_ticket( $ticket_id );
    $self->_add_to_words_their_synonyms_one_ticket( $ticket_id );
}

sub _replace_negated_words_with_antonyms_one_ticket {
    my $self = shift;
    my $ticket_id = shift;
    my $record = $self->{_processed_tkts_by_ids}->{$ticket_id};
    my @words_negated_with_not = $record =~ /\bnot\s+(\w+)/ig;
    foreach my $word (@words_negated_with_not) {
        next unless (($word =~ /^\w+$/) && 
                     (length($word) > $self->{_min_word_length}));
        my @antonym_words = @{$self->_get_antonyms_for_word( $word )};
        next unless @antonym_words > 0;
        $#antonym_words = $self->{_max_num_syn_words} - 1
              if @antonym_words > $self->{_max_num_syn_words};
        my $antonym_replacement_string = join ' ', @antonym_words;
        print "Antonym for $word is $antonym_replacement_string\n"

lib/Algorithm/TicketClusterer.pm

                    (length($word) > $self->{_min_word_length}));
        my @antonym_words = @{$self->_get_antonyms_for_word( $word )};
        next unless @antonym_words > 0;
        $#antonym_words = $self->{_max_num_syn_words} - 1
              if @antonym_words > $self->{_max_num_syn_words};
        my $antonym_replacement_string = join ' ', @antonym_words;
        print "Antonym for $word is $antonym_replacement_string\n"
            if $self->{_debug2};
        $record =~ s/no\s+$word/$antonym_replacement_string/g;
    }
    $self->{_processed_tkts_by_ids}->{$ticket_id} = $record;
}

sub _add_to_words_their_synonyms_one_ticket {
    my $self = shift;
    my $ticket_id = shift;
    my $record = $self->{_processed_tkts_by_ids}->{$ticket_id};
    my @words = split /\s+/, $record;
    my @synonym_bag;
    foreach my $word (@words) {
        next if $word eq 'no';
        next if $word eq 'not';
        next unless $word =~ /^\w+$/ && 
                    length($word) > $self->{_min_word_length};
        my @synonym_words;
        @synonym_words = @{$self->{_synset_cache}->{$word}}
                      if exists $self->{_synset_cache}->{$word};        

lib/Algorithm/TicketClusterer.pm

            push @synonym_bag, @synonym_words;
        }
    }
    foreach my $syn_word (@synonym_bag) {
        push @words, lc($syn_word) 
            unless ((exists $self->{_stop_words}->{$syn_word}) || 
                        (length($syn_word) <= $self->{_min_word_length}));
    }
    my @sorted_words = sort @words;
    my $new_record = join ' ', @sorted_words;
    $self->{_processed_tkts_by_ids}->{$ticket_id} = $new_record;
}

sub store_processed_tickets_on_disk {
    my $self = shift;
    $self->{_processed_tickets_db} = "processed_tickets.db" unless $self->{_processed_tickets_db};
    unlink $self->{_processed_tickets_db};
    eval {                    
        store( $self->{_processed_tkts_by_ids}, $self->{_processed_tickets_db} ); 
    };
    if ($@) {                                 
       die "Something went wrong with disk storage of processed tickets: $@";
    }
}

sub store_stemmed_tickets_and_inverted_index_on_disk {
    my $self = shift;
    $self->{_stemmed_tickets_db} = "stemmed_tickets.db" unless $self->{_stemmed_tickets_db};
    unlink $self->{_stemmed_tickets_db};
    eval {                    
        print "\n\nStoring stemmed tickets on disk\n\n";
        store( $self->{_stemmed_tkts_by_ids}, $self->{_stemmed_tickets_db} ); 

lib/Algorithm/TicketClusterer.pm

    unlink $self->{_inverted_index_db};
    eval { 
        print "\nStoring inverted index on disk\n\n";
        store( $self->{_inverted_index}, $self->{_inverted_index_db} ); 
    };
    if ($@) {                                 
       die "Something went wrong with disk storage of the inverted index: $@";
    }
}

sub restore_processed_tickets_from_disk {
    my $self = shift;
    eval {
        $self->{_processed_tkts_by_ids} = retrieve( $self->{_processed_tickets_db} );
    };
    if ($@) {                                 
       die "Something went wrong with restoration of processed tickets: $@";
    }
}

sub restore_stemmed_tickets_from_disk {
    my $self = shift;
    eval {
        $self->{_stemmed_tkts_by_ids} = retrieve( $self->{_stemmed_tickets_db} );
    };
    if ($@) {                                 
       die "Something went wrong with restoration of stemmed tickets: $@";
    }
}

####################  Get Ticket Vocabulary and Word Counts #################

sub get_ticket_vocabulary_and_construct_inverted_index {
    my $self = shift;
    my $total_num_of_tickets = keys %{$self->{_processed_tkts_by_ids}};
    $self->{_tickets_vocab_db} = "tickets_vocab.db" unless $self->{_tickets_vocab_db};
    unlink glob "$self->{_tickets_vocab_db}.*";   
    my %vocab_hist_on_disk;
    tie %vocab_hist_on_disk, 'SDBM_File',  
             $self->{_tickets_vocab_db}, O_RDWR|O_CREAT, 0640
            or die "Can't create DBM files: $!";       
    my %inverted_index;
    my $min = $self->{_min_word_length};
    foreach my $ticket_id (sort {$a <=> $b} keys %{$self->{_processed_tkts_by_ids}}) {
        my %uniques = ();
        my $record = $self->{_processed_tkts_by_ids}->{$ticket_id};
        # Break the record apart on whitespace and punctuation:
        my @brokenup = split /\n|\r|\"|\'|\.|\(|\)|\[|\]|\\|\/|\s+/, $record;
        # Keep only tokens with at least $min word characters:
        my @clean_words = grep $_, map { /([a-z0-9_]{$min,})/i;$1 } @brokenup;
        next unless @clean_words;
        @clean_words = grep $_, map &_simple_stemmer($_, $self->{_debug2}), 
                                                                 @clean_words;
        # Corpus-wide word-frequency histogram (lowercased):
        map { $vocab_hist_on_disk{"\L$_"}++ } grep $_, @clean_words;
        # Unique words in this ticket, for the IDF counts and the inverted index:
        for (@clean_words) { $uniques{"\L$_"}++ };
        map { $self->{_vocab_idf_hist}->{"\L$_"}++ } keys %uniques;
        map { push @{$self->{_inverted_index}->{"\L$_"}}, $ticket_id } 
                                                            keys %uniques;

lib/Algorithm/TicketClusterer.pm

        print "$outstring\n";
    }
}

# The following subroutine is useful for diagnostic purposes.  It
# lists the number of tickets that a word appears in and also lists
# the tickets.  But be careful in interpreting its results: if
# you invoke this subroutine after the synsets have been added
# to the tickets, you may find words being attributed to tickets
# that do not actually contain them in the original Excel sheet.
sub list_processed_tickets_for_a_word {
    my $self = shift;
    while (my $word = <STDIN>) {    #enter ctrl-D to exit the loop
        chomp $word;
        my @ticket_list;
        foreach my $ticket_id (sort {$a <=> $b} keys %{$self->{_processed_tkts_by_ids}}) {
            my $record = $self->{_processed_tkts_by_ids}->{$ticket_id};
            push @ticket_list, $ticket_id if $record =~ /\b$word\b/i;
        }
        my $num = @ticket_list;
        print "\nThe number of processed tickets that mention the word `$word': $num\n\n";
        print "The processed tickets: @ticket_list\n\n";
    }
}

sub list_stemmed_tickets_for_a_word {
    my $self = shift;
    while (my $word = <STDIN>) {    #enter ctrl-D to exit the loop
        chomp $word;
        my @ticket_list;
        foreach my $ticket_id (sort {$a <=> $b} keys %{$self->{_stemmed_tkts_by_ids}}) {
            my $record = $self->{_stemmed_tkts_by_ids}->{$ticket_id};

lib/Algorithm/TicketClusterer.pm

        store($self->{_tkt_doc_vecs_normed}, $self->{_tkt_doc_vecs_normed_db});
    };
    if ($@) {
        die "Something wrong with disk storage of normalized doc vecs: $@";
    }
}

sub restore_ticket_vectors_and_inverted_index {
    my $self = shift;
    $self->restore_raw_tickets_from_disk();
    $self->restore_processed_tickets_from_disk();
    $self->restore_stemmed_tickets_from_disk();
    tie my %vocab_hist_on_disk, 'SDBM_File', $self->{_tickets_vocab_db}, O_RDONLY, 0640
            or die "Can't connect with DBM file: $!";       
    foreach (keys %vocab_hist_on_disk) {
        $self->{_vocab_hist}->{$_} = $vocab_hist_on_disk{$_};
    }
    untie %vocab_hist_on_disk;
    tie my %idf_t_on_disk, 'SDBM_File', $self->{_idf_db}, O_RDONLY, 0640
            or die "Can't connect with DBM file: $!";       
    foreach (keys %idf_t_on_disk) {

lib/Algorithm/TicketClusterer.pm

    foreach my $i (0..@{$array_ref}-1) {
        return $i if $ele == $array_ref->[$i];
    }
}

sub _check_for_illegal_params {
    my @params = @_;
    my @legal_params = qw / excel_filename
                            which_worksheet
                            raw_tickets_db
                            processed_tickets_db
                            stemmed_tickets_db
                            inverted_index_db
                            tickets_vocab_db
                            idf_db
                            tkt_doc_vecs_db
                            tkt_doc_vecs_normed_db
                            synset_cache_db
                            want_synset_caching
                            add_synsets_to_tickets
                            clustering_fieldname

lib/Algorithm/TicketClusterer.pm

Algorithm::TicketClusterer - A Perl module for retrieving Excel-stored past
tickets that are most similar to a new ticket.  Tickets are commonly used
in the software services industry and in customer support businesses to record
requests for service, product complaints, user feedback, and so on.

=head1 SYNOPSIS

    use Algorithm::TicketClusterer;

    #  Extract the tickets from the Excel spreadsheet and subject the
    #  textual content of the tickets to various preprocessing and doc
    #  modeling steps.  The preprocessing steps consist of removing markup,
    #  dropping the words in a stop list, correcting spelling errors,
    #  detecting the need for antonyms, and, finally, adding word synonyms
    #  to the tickets in order to ground the tickets in a common
    #  vocabulary. The doc modeling steps consist of fitting a standard
    #  vector space model to the tickets.

    my $clusterer = Algorithm::TicketClusterer->new( 
    
                         excel_filename            => $excel_filename,
                         clustering_fieldname      => $fieldname_for_clustering,
                         which_worksheet           => $which_worksheet,
                         unique_id_fieldname       => $unique_id_fieldname,
                         raw_tickets_db            => $raw_tickets_db,
                         processed_tickets_db      => $processed_tickets_db,
                         stemmed_tickets_db        => $stemmed_tickets_db,
                         inverted_index_db         => $inverted_index_db,
                         tickets_vocab_db          => $tickets_vocab_db,
                         idf_db                    => $idf_db,
                         tkt_doc_vecs_db           => $tkt_doc_vecs_db,
                         tkt_doc_vecs_normed_db    => $tkt_doc_vecs_normed_db,
                         synset_cache_db           => $synset_cache_db,
                         stop_words_file           => $stop_words_file,
                         misspelled_words_file     => $misspelled_words_file,
                         add_synsets_to_tickets    => 1,

lib/Algorithm/TicketClusterer.pm


    #  Of the various constructor parameters shown above, the following two
    #  are critical to how information is extracted from an Excel
    #  spreadsheet: `clustering_fieldname' and `unique_id_fieldname'.  The
    #  first is the heading of the column that contains the textual content
    #  of the tickets.  The second is the heading of the column that
    #  contains a unique integer identifier for each ticket.

    #  The nine database-related constructor parameters (these end in the
    #  suffix `_db') are there in order to avoid repeated parsing of the
    #  spreadsheet and preprocessing of the tickets every time you need to
    #  make a retrieval for a new ticket.  The goal here is that after the
    #  ticket information has been ingested from a spreadsheet, you would
    #  want to carry out similar-ticket retrieval in real time.  (Whether
    #  or not real-time retrieval would be feasible in actual practice
    #  would also depend on what hardware you are using, obviously.)

    #  After the above preprocessing and doc modeling steps, you can
    #  extract the most similar past tickets for a new query ticket with a
    #  script in which the constructor call would look like:

    my $clusterer = Algorithm::TicketClusterer->new( 
    
                         clustering_fieldname      => $fieldname_for_clustering,
                         unique_id_fieldname       => $unique_id_fieldname,
                         raw_tickets_db            => $raw_tickets_db,
                         processed_tickets_db      => $processed_tickets_db,
                         stemmed_tickets_db        => $stemmed_tickets_db,
                         inverted_index_db         => $inverted_index_db,
                         tickets_vocab_db          => $tickets_vocab_db,
                         idf_db                    => $idf_db,
                         tkt_doc_vecs_db           => $tkt_doc_vecs_db,
                         tkt_doc_vecs_normed_db    => $tkt_doc_vecs_normed_db,
                         min_idf_threshold         => 1.8,
                         how_many_retrievals       => 5,
                    );
    

lib/Algorithm/TicketClusterer.pm


=head1 CHANGES

Version 1.01 of the module removes the platform dependency of the functions used for
reading the text files for stop words, misspelled words, etc.


=head1 DESCRIPTION

B<Algorithm::TicketClusterer> is a I<perl5> module for retrieving
previously processed Excel-stored tickets similar to a new ticket.  Routing
decisions made for the past similar tickets can be useful in expediting the
routing of a new ticket.

Tickets are commonly used in the software services industry and in customer
support businesses to record requests for service, product complaints,
user feedback, and so on.

With regard to the routing of a ticket, you would want each new ticket to
be handled by the tech support individual who is most qualified to address
the issue raised in the ticket.  Identifying the right individual for each
new ticket in real time is no easy task for organizations that run large
service centers and helpdesks.  So if it were possible to quickly identify
the previously processed tickets that are most similar to a new ticket, one
could think of constructing semi-automated (or, perhaps, even fully
automated) ticket routers.

Identifying old tickets similar to a new ticket is made challenging by the
fact that folks who submit tickets often write them quickly and informally.
The informal style of writing means that different people may use different
colloquial terms to describe the same thing. And the quickness associated
with their submission causes the tickets to frequently contain spelling and
other errors such as conjoined words, fragmentation of long words, and so
on.

lib/Algorithm/TicketClusterer.pm

using the following approach which I prefer to actually reading the tickets
for such errors: Turn on the debugging options in the constructor for some
initially collected spreadsheets and watch what sort of words WordNet
is not able to supply any synonyms for.  In a large majority of cases,
these would be the misspelled words.

Expanding a ticket with synonyms is made complicated by the fact that some
common words have such a large number of synonyms that they can overwhelm
the relatively small number of words in a ticket.  Adding too many synonyms
in relation to the size of a ticket can not only distort the sense of the
ticket but it can also increase the computational cost of processing all
the tickets.

In order to deal with the pros and the cons of using synonyms, the present
module strikes a middle ground: You can specify how many synonyms to use
for a word (assuming that the number of synonyms supplied by WordNet is
larger than the number specified).  This allows you to experiment with
retrieval precision by altering the number of synonyms used.  The retained
synonyms are selected randomly from those supplied by WordNet.  (A smarter
way to select synonyms would be to base them on the context.  For example,
you would not want to use the synonym `programmer' for the noun `developer'
if your application domain is real-estate.  However, such context-dependent
selection of synonyms would take us into the realm of ontologies that I
have chosen to stay away from in this first version of the module.)
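
The random retention of synonyms amounts to shuffling the synonym list and
keeping a prefix of it.  Here is a minimal sketch of the idea, with
hypothetical C<@synonyms> and C<$max_num_syn_words> variables standing in for
the values involved (this is an illustration, not the module's actual code):

    use List::Util qw(shuffle);

    # Randomize the order of the synonyms supplied by WordNet:
    my @retained = shuffle @synonyms;
    # Keep at most $max_num_syn_words of them:
    splice @retained, $max_num_syn_words if @retained > $max_num_syn_words;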

Another issue related to the overall run-time performance of this module is
the computational cost of the calls to WordNet through its Perl interface
C<WordNet::QueryData>.  This module uses what I have referred to as
I<synset caching> to make this process as efficient as possible.  The
result of each WordNet lookup is cached in a database file whose name you
supply through the constructor option C<synset_cache_db>.  If you are doing
a good job of catching spelling errors, the module will carry out a
decreasing number of WordNet lookups as the tickets are scanned for
expansion with synonyms.  In an experiment with a spreadsheet that
contained over 1400 real tickets, the last several hundred resulted in
hardly any calls to WordNet.

As currently programmed, the synset cache is deleted and then created
afresh at every call to the function that extracts information from an
Excel spreadsheet. You would want to change this behavior of the module if
you are planning to use it in a production environment where the different
spreadsheets are likely to deal with the same application domain.  To give
greater persistence to the synset cache, comment out the C<unlink
$self->{_synset_cache_db}> line in the method C<get_tickets_from_excel()>.
After a few updates of the synset cache, the module would almost never need
to make direct calls to WordNet, which would enhance the speed of the
module even further.

The textual content of the tickets, as produced by the preprocessing steps,
is used for document modeling, and the doc model thus created is used
subsequently for retrieving similar tickets.  The doc modeling is carried
out using the Vector Space Model (VSM) in which each ticket is represented
by a vector whose size equals the size of the vocabulary used in all the
tickets and whose elements represent the word frequencies in the
ticket. After such a model is constructed, a query ticket is compared with
the other tickets on the basis of the cosine similarity distance between
the corresponding vectors.

My decision to use the simplest of the text models --- the Vector Space

lib/Algorithm/TicketClusterer.pm

(3) The module uses stemming to reduce the variants of the same word to a
common root in order to limit the size of the vocabulary.  The stemming
used in the current module is rudimentary.  However, it would be easy to
plug into the module more powerful stemmers through their Perl interfaces.
Future versions of this module may do exactly that.



=head1 THE THREE STAGES OF PROCESSING

The tickets are processed in the following three stages:

=over

=item B<Ticket Preprocessing:>

This stage involves extracting the textual content of each ticket from the
Excel spreadsheet and subjecting it to the following steps: (1) deleting
markup; (2) dropping the stop words supplied through a file whose name is
provided as a value for the constructor parameter C<stop_words_file>; (3)
correcting spelling errors through the `bad-word good-word' entries in a
file whose name is supplied as a value for the constructor parameter
C<misspelled_words_file>; (4) replacing negated words with their antonyms;
and, finally, (5) adding synonyms.

=item B<Doc Modeling:>

Doc modeling consists of creating a Vector Space Model for the tickets
after they have been processed as described above.  VSM modeling involves
scanning the preprocessed tickets, stemming the words, and constructing a
vocabulary for all of the stemmed words in all the tickets.  Subsequently,
the alphabetized list of the vocabulary serves as a vector template for the
tickets. Each ticket is represented by a vector whose dimensionality equals
the size of the vocabulary; each element of this vector is an integer that
is the frequency of the vocabulary word corresponding to the index of the
element.  Doc modeling also involves calculating the inverse document
frequencies (IDF) values for the words and the inverted index for the
words.  The IDF values are used to diminish the importance of the words
that carry little discriminatory power vis-a-vis the tickets.  IDF for a
word is the logarithm of the ratio of the total number of tickets to the
number of tickets in which the word appears.  Obviously, if a word were to
appear in all the tickets, its IDF value would be zero.  The inverted index
entry for a word is the list of all the tickets that contain that word.
The inverted index greatly expedites the retrieval of tickets similar to a
given query ticket.
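
In code, constructing the vector for one ticket might look something like the
following sketch (illustrative only; C<%vocab> stands for the vocabulary hash
and C<$stemmed_ticket> for a ticket's stemmed text, and the module's actual
implementation differs in its details):

    # One slot per vocabulary word, initialized to zero:
    my %doc_vector = map { $_ => 0 } keys %vocab;
    # Raw term frequencies for the words present in the ticket:
    for my $word (split /\s+/, lc $stemmed_ticket) {
        $doc_vector{$word}++ if exists $doc_vector{$word};
    }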

=item B<Similarity Based Retrieval:>

A query ticket is subject to the same preprocessing steps as all other
tickets.  Subsequently, it is also represented by a vector in the same
manner as the other tickets.  Using the stemmed words in the query ticket,
the inverted index is used to create a candidate list of ticket vectors for
matching with the query ticket vector.  For this, only those query words
are chosen whose IDF values exceed a threshold.  Finally, we compute the
cosine similarity distance between the query ticket vector and the ticket
vectors in the candidate list.  The matching ticket vectors are returned in
the order of decreasing similarity.
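
The cosine similarity between two such vectors is their dot product divided by
the product of their magnitudes.  A self-contained sketch of this calculation
(with C<$vec_a> and C<$vec_b> as hash refs mapping words to frequencies; not
the module's actual code):

    sub cosine_similarity {
        my ($vec_a, $vec_b) = @_;
        my ($dot, $mag_a, $mag_b) = (0, 0, 0);
        for my $word (keys %$vec_a) {
            $dot   += $vec_a->{$word} * ($vec_b->{$word} || 0);
            $mag_a += $vec_a->{$word} ** 2;
        }
        $mag_b += $_ ** 2 for values %$vec_b;
        return 0 unless $mag_a && $mag_b;
        return $dot / sqrt($mag_a * $mag_b);
    }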

=back

=begin html

<br>

=end html

=head1 METHODS

The module provides the following methods for ticket preprocessing and for the
retrieval of tickets most similar to a given ticket:

=over

=item B<new()>

A call to C<new()> constructs a new instance of the C<Algorithm::TicketClusterer>
class:

    my $clusterer = Algorithm::TicketClusterer->new( 

                     excel_filename            => $excel_filename,
                     clustering_fieldname      => $fieldname_for_clustering,
                     unique_id_fieldname       => $unique_id_fieldname,
                     which_worksheet           => $which_worksheet,
                     raw_tickets_db            => $raw_tickets_db,
                     processed_tickets_db      => $processed_tickets_db,
                     stemmed_tickets_db        => $stemmed_tickets_db,
                     inverted_index_db         => $inverted_index_db,
                     tickets_vocab_db          => $tickets_vocab_db,
                     idf_db                    => $idf_db,
                     tkt_doc_vecs_db           => $tkt_doc_vecs_db,
                     tkt_doc_vecs_normed_db    => $tkt_doc_vecs_normed_db,
                     synset_cache_db           => $synset_cache_db,
                     stop_words_file           => $stop_words_file,
                     misspelled_words_file     => $misspelled_words_file,
                     add_synsets_to_tickets    => 1,
                     want_synset_caching       => 1,
                     min_idf_threshold         => 2.0,
                     max_num_syn_words         => 3,
                     min_word_length           => 4,
                     want_stemming             => 1,
                     how_many_retrievals       => 5,
                     debug1                    => 1,  # for processing, filtering Excel
                     debug2                    => 1,  # for doc modeling
                     debug3                    => 1,  # for retrieving similar tickets

                   );

Obviously, before you can invoke the constructor, you must provide values for the
variables shown to the right of the big arrows.  What these values should be is
made clear by the following alphabetized list that describes each of the constructor
parameters shown above:

lib/Algorithm/TicketClusterer.pm

This debug flag applies to the calculations carried out during the retrieval of
similar tickets.  When this flag is set, the module will display the candidate set of
tickets to be considered for matching with the query ticket.  This candidate set is
chosen by using the inverted index to collect all the tickets that share words with
the query ticket, provided the IDF value for each such word exceeds the threshold set by
the constructor parameter C<min_idf_threshold>.

=item I<excel_filename:>

This is obviously the name of the Excel file that contains the tickets you want to
process.

=item I<how_many_retrievals:>

The integer value supplied for this parameter determines how many of the tickets
most similar to a query ticket will be returned.

=item I<idf_db:>

You store the inverse document frequencies for the vocabulary words in a database
file whose name is supplied through this constructor parameter.  As mentioned
earlier, the IDF for a word is, in principle, the logarithm of the ratio of the total
number of tickets to the DF (Document Frequency) for the word.  The DF of a word is
the number of tickets in which the word appears.

=item I<inverted_index_db:>

If you plan to create separate scripts for the three stages of processing described
earlier, you must store the inverted index in a database file so that it can be used
by the script whose job is to carry out similarity based ticket retrieval. The
inverted index is stored in a database file whose name is supplied through this
constructor parameter.
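
Conceptually, the inverted index is just a hash that maps each word to the
list of IDs of the tickets containing it, along the lines of this sketch (the
C<%processed_tickets> hash, mapping ticket IDs to their textual content, is a
hypothetical stand-in):

    my %inverted_index;
    while (my ($ticket_id, $text) = each %processed_tickets) {
        my %seen;
        for my $word (split /\s+/, lc $text) {
            # List each ticket at most once per word:
            push @{ $inverted_index{$word} }, $ticket_id unless $seen{$word}++;
        }
    }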

=item I<max_num_syn_words:>

As mentioned in B<DESCRIPTION>, some words can have a very large number of synonyms
--- much larger than the number of words that may exist in a typical ticket.  If you
were to add all such synonyms to a ticket, you run the danger of altering the sense
of the ticket, besides unnecessarily increasing the size of the vocabulary. This
parameter limits the number of synonyms chosen to the value used for the parameter.
When the number of synonyms returned by WordNet is greater than the value set for
this parameter, the synonyms retained are chosen randomly from the list returned by
WordNet.

=item I<min_idf_threshold:>

First recall that IDF stands for Inverse Document Frequency.  It is calculated during
the second of the three-stage processing of the tickets as described in the section
B<THE THREE STAGES OF PROCESSING TICKETS>.  The IDF value of a word gives us a
measure of the discriminatory power of the word.  Let's say you have a word that
occurs in only one out of 1000 tickets.  Such a word is obviously highly
discriminatory and its IDF would be the logarithm (to base 10) of the ratio of 1000
to 1, which is 3.  On the other hand, for a word that occurs in every one of 1000
tickets, its IDF value would be the logarithm of the ratio of 1000 to 1000, which is
0.  So, for the case when you have 1000 tickets, the upper bound on IDF is 3 and the
lower bound 0. This constructor parameter controls which of the query words you will
use for constructing the initial pool of tickets that will be used for matching.  The
larger the value of this threshold, the smaller the pool obviously.
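
The arithmetic behind these bounds is easy to reproduce; the sketch below
computes the two IDF values used above (Perl's C<log()> is the natural log,
hence the division by C<log(10)>):

    my $num_tickets = 1000;
    my $idf = sub { log( $num_tickets / $_[0] ) / log(10) };
    printf "IDF of a word in just 1 ticket: %.1f\n", $idf->(1);     # 3.0
    printf "IDF of a word in all tickets:   %.1f\n", $idf->(1000);  # 0.0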

=item I<min_word_length:> 

This parameter sets the minimum number of characters in a word in order for it to be
included for ticket processing.

=item I<misspelled_words_file:>

The extent to which you can improve ticket retrieval precision with the addition of
synonyms depends on the degree to which you can make corrections on the fly for the
spelling errors that occur frequently in tickets.  That fact makes the file you
supply through this constructor parameter very important.  For the current version of
the module, this file must contain exactly two columns, with the first entry in each
row the misspelled word and the second entry the correctly spelled word.  See this
file in the C<examples> directory for how to format it.
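
For illustration, the two-column format looks like this (these entries are
made up; see the actual C<misspelled_words.txt> in the C<examples> directory):

    recieve    receive
    sofware    software
    unabel     unable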

=item I<processed_tickets_db:>

As mentioned earlier in B<DESCRIPTION>, the tickets must be subject to various
preprocessing steps before they can be used for document modeling for the purpose of
retrieval. Preprocessing consists of stop words removal, spelling corrections,
antonym detection, synonym addition, etc.  The tickets resulting from preprocessing
are stored in a database file whose name you supply through this constructor
parameter.

=item I<raw_tickets_db:>

The raw tickets extracted from the Excel spreadsheet are stored in a database file
whose name you supply through this constructor parameter.  The idea here is that we
do not want to process an Excel spreadsheet for each new attempt at matching a query
ticket with the previously recorded tickets in the same spreadsheet.  It is much
faster to load the database back into the runtime environment than to process a large
spreadsheet.

=item I<stemmed_tickets_db:>

As mentioned in the section B<THE THREE STAGES OF PROCESSING>, one of the first
things you do in the second stage of processing is to stem the words in the tickets.
Stemming is important because it reduces the size of the vocabulary.  To illustrate,
stemming would reduce both the words `programming' and `programmed' to the common
root `program'.  This module uses a very simple stemmer whose rules can be found in
the utility subroutine C<_simple_stemmer()>.  It would be trivial to expand on these
rules, or, for that matter, to use the Perl module C<Lingua::Stem::En> for a full
application of the Porter Stemming Algorithm.  The stemmed tickets are saved in a
database file whose name is supplied through this constructor parameter.
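
If you wanted to substitute a full Porter stemmer for the built-in rules, the
swap might look something like the sketch below, which is based on the
C<Lingua::Stem::En> interface (check that module's documentation for the exact
calling conventions):

    use Lingua::Stem::En;

    my @words = qw/ programming programmed programs /;
    # stem() takes a ref to a word list and returns a ref to the stemmed list:
    my $stemmed = Lingua::Stem::En::stem({ -words => \@words, -locale => 'en' });
    print "@$stemmed\n";    # all three reduce to the same root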

=item I<stop_words_file:>

lib/Algorithm/TicketClusterer.pm

total number of words in the ticket.

=item  B<delete_markup_from_all_tickets()>

    $clusterer->delete_markup_from_all_tickets()

It is not uncommon for the textual content of a ticket to contain HTML markup. This
method deletes such strings.  Note that this method is not capable of deleting
complex markup that may include HTML comment blocks or cross line boundaries, or
of handling textual content with angle brackets that denote "less than" or "greater
than".  If your tickets require more sophisticated processing for the removal of
markup, you might consider using the C<HTML::Restrict> module.
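
For reference, stripping all markup with C<HTML::Restrict> takes only a couple
of lines, since by default it lets no tags through (a sketch, not part of this
module):

    use HTML::Restrict;

    my $hr = HTML::Restrict->new;
    my $plain = $hr->process('<p>printer <b>not</b> responding</p>');
    # $plain is now 'printer not responding'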


=item  B<display_all_doc_vectors()>

=item  B<display_all_normalized_doc_vectors()>

These two methods are useful for troubleshooting if things don't look right with
regard to retrieval.

lib/Algorithm/TicketClusterer.pm


This method displays the ticket vocabulary constructed by a call to
C<get_ticket_vocabulary_and_construct_inverted_index()>.  The vocabulary display
consists of an alphabetized list of the words in all the tickets along with the
frequency of each word.

=item  B<expand_all_tickets_with_synonyms()>

    $clusterer->expand_all_tickets_with_synonyms();

This is the final step in the preprocessing of the tickets before they are ready for
the doc modeling stage.  This method calls other functions internal to the module
that ultimately make calls to WordNet through the Perl interface provided by the
C<WordNet::QueryData> module.
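
For the curious, a direct WordNet lookup through C<WordNet::QueryData>
proceeds from a word to its parts of speech, then to its senses, then to a
synset, roughly as in this sketch (consult that module's documentation for the
details):

    use WordNet::QueryData;

    my $wn = WordNet::QueryData->new( noload => 1 );
    my @pos_forms = $wn->querySense('failure');        # e.g. 'failure#n'
    my @senses    = $wn->querySense($pos_forms[0]);    # e.g. 'failure#n#1', ...
    my @synset    = $wn->querySense($senses[0], 'syns');
    print "@synset\n";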

=item B<get_tickets_from_excel()>

    $clusterer->get_tickets_from_excel()

This method calls on the C<Spreadsheet::ParseExcel> module to extract the tickets
from the old-style Excel spreadsheets and the C<Spreadsheet::XLSX> module for doing
the same from the new-style Excel spreadsheets.

=item  B<get_ticket_vocabulary_and_construct_inverted_index()>

    $clusterer->get_ticket_vocabulary_and_construct_inverted_index()

As mentioned in B<THE THREE STAGES OF PROCESSING>, the second stage of processing ---
doc modeling of the tickets --- starts with the stemming of the words in the tickets,
constructing a vocabulary of all the stemmed words in all the tickets, and
constructing an inverted index for the vocabulary words.  All of these things are
accomplished by this method.

=item  B<restore_processed_tickets_from_disk()>

    $clusterer->restore_processed_tickets_from_disk()

This loads into your script the output of the ticket preprocessing stage.  This
method is called internally by C<restore_ticket_vectors_and_inverted_index()>, which
you would use in your ticket retrieval script, assuming it is separate from the
ticket preprocessing script.

=item B<restore_raw_tickets_from_disk()>

    $clusterer->restore_raw_tickets_from_disk()    

With this method, you are spared the trouble of having to repeatedly parse the same
Excel spreadsheet during the development phase as you are testing the module with
different query tickets.  This method is called internally by
C<restore_ticket_vectors_and_inverted_index()>.

lib/Algorithm/TicketClusterer.pm


        $clusterer->restore_stemmed_tickets_from_disk();

This method is called internally by
C<restore_ticket_vectors_and_inverted_index()>.

=item  B<restore_ticket_vectors_and_inverted_index()>

    $clusterer->restore_ticket_vectors_and_inverted_index()

If you are going to be doing ticket preprocessing and doc modeling in one script and
ticket retrieval in another, then this is the first method you would need to call in
the latter for the restoration of the VSM model for the tickets and the inverted
index.

=item B<retrieve_similar_tickets_with_vsm()>

    my $retrieved_hash_ref = $clusterer->retrieve_similar_tickets_with_vsm( $ticket_num )

It is this method that retrieves tickets that are most similar to a query ticket.
The method first utilizes the inverted index to construct a candidate list of the

lib/Algorithm/TicketClusterer.pm


The argument to the method is the unique integer ID of a ticket for which
you want to see all the fields as stored in the Excel spreadsheet.

=item B<show_raw_ticket_clustering_data_for_given_id()>

While the previous method shows all the fields for a ticket, this method
shows only the textual content --- the content you want to use for
establishing similarity between a query ticket and the other tickets.

=item B<show_processed_ticket_clustering_data_for_given_id()>

    $clusterer->show_processed_ticket_clustering_data_for_given_id( $ticket_num );

This is the method to call if you wish to examine the textual content of a ticket
after it goes through the preprocessing steps.  In particular, you will see the
corrections made, the synonyms added, etc.  You would need to set the argument
C<$ticket_num> to the unique integer ID of the ticket you are interested in.

=item  B<store_processed_tickets_on_disk()>

    $clusterer->store_processed_tickets_on_disk();

This stores in a database file the preprocessed textual content of the
tickets.

=item B<store_raw_tickets_on_disk()>

    $clusterer->store_raw_tickets_on_disk();

This method is called by the C<get_tickets_from_excel()> method to store on the disk
the tickets extracted from the Excel spreadsheet.  Obviously, you can also call it in
your own script for doing the same.

=item  B<store_stemmed_tickets_and_inverted_index_on_disk()>

    $clusterer->store_stemmed_tickets_and_inverted_index_on_disk()

This method stores in a database file the stemmed tickets and the inverted index that
are produced at the end of the second stage of processing.

=item B<show_stemmed_ticket_clustering_data_for_given_id()>

    $clusterer->show_stemmed_ticket_clustering_data_for_given_id( $ticket_num );

If you want to see what sort of a job the stemmer is doing for a ticket, this is the
method to call.  You would need to set the argument C<$ticket_num> to the unique
integer ID of the ticket you are interested in.

=item  B<store_ticket_vectors()>

lib/Algorithm/TicketClusterer.pm

ID of the query ticket.


=head1 THE C<examples> DIRECTORY

The C<examples> directory contains the following two scripts that would be your
quickest way to become familiar with this module:

=over

=item B<For ticket preprocessing and doc modeling:>

Run the script

    ticket_preprocessor_and_doc_modeler.pl

This will carry out preprocessing and doc modeling of the tickets that are stored in
the Excel file C<ExampleExcelFile.xls> that you will find in the same directory.

=item B<For retrieving similar tickets:>

Next, run the script

    retrieve_similar_tickets.pl

to retrieve five tickets that are closest to the query ticket whose integer ID is
supplied to the C<retrieve_similar_tickets_with_vsm()> method in the script.

t/test.t

use Algorithm::TicketClusterer;

# Test 1 (Read Excel):

my $tclusterer = Algorithm::TicketClusterer->new( 
                     excel_filename            => "t/__SampleTest.xlsx",
                     which_worksheet           => 1,
                     clustering_fieldname      => "Description",
                     unique_id_fieldname       => "Request No",
                     raw_tickets_db            => "t/__test_raw_tickets_db",
                     processed_tickets_db      => "t/__test_processed_tickets_db",
                     stemmed_tickets_db        => "t/__test_stemmed_tickets_db",
                     inverted_index_db         => "t/__test_inverted_index_db",
                     tickets_vocab_db          => "t/__test_tickets_vocab_db",
                     idf_db                    => "t/__test_idf_db",
                     tkt_doc_vecs_db           => "t/__test_tkt_doc_vecs_db",
                     tkt_doc_vecs_normed_db    => "t/__test_tkt_doc_vecs_normed_db",
                     synset_cache_db           => "t/__test_synset_cache_db",
                     add_synsets_to_tickets    => 1,
                     want_synset_caching       => 1,
                     max_num_syn_words         => 3,
                     min_word_length           => 4,
                     want_stemming             => 1,
                 );

my @returned = $tclusterer->_test_excel_for_tickets();
my @should_be = qw/0 4 0 6/;
# Element-wise comparison (the smartmatch version below is commented out
# because the ~~ operator is experimental):
#ok( @returned ~~ @should_be, 'Able to process Excel' );
my @comparisons = map {$returned[$_] == $should_be[$_] ? 1 : 0} (0..@returned-1);
my $final_compare = 1;
foreach my $i (0..@returned-1) {
    $final_compare *= $comparisons[$i];
}
ok( $final_compare, 'Able to process Excel' );

## Test 2 (Check Clustering Data):

$tclusterer->get_tickets_from_excel();
my $clustering_data = $tclusterer->_raw_ticket_clustering_data_for_given_id(101);

ok( $clustering_data =~ /i am unable/, 'Able to extract the clustering field from Excel' );


## Test 3 (Check Synset Extraction from WordNet):


