view release on metacpan or search on metacpan
examples/ticket_preprocessor_and_doc_modeler.pl view on Meta::CPAN
tickets_vocab_db => $tickets_vocab_db,
idf_db => $idf_db,
tkt_doc_vecs_db => $tkt_doc_vecs_db,
tkt_doc_vecs_normed_db => $tkt_doc_vecs_normed_db,
synset_cache_db => $synset_cache_db,
stop_words_file => $stop_words_file,
misspelled_words_file => $misspelled_words_file,
add_synsets_to_tickets => 1,
want_synset_caching => 1,
max_num_syn_words => 3,
min_word_length => 4,
want_stemming => 1,
);
## Extract information from Excel spreadsheets:
$clusterer->get_tickets_from_excel();
## Apply cleanup filters and add synonyms:
$clusterer->delete_markup_from_all_tickets();
$clusterer->apply_filter_to_all_tickets();
$clusterer->expand_all_tickets_with_synonyms();
lib/Algorithm/TicketClusterer.pm view on Meta::CPAN
_stemmed_tickets_db => $args{stemmed_tickets_db},
_inverted_index_db => $args{inverted_index_db},
_tickets_vocab_db => $args{tickets_vocab_db},
_idf_db => $args{idf_db},
_tkt_doc_vecs_db => $args{tkt_doc_vecs_db},
_tkt_doc_vecs_normed_db => $args{tkt_doc_vecs_normed_db},
_clustering_fieldname => $args{clustering_fieldname},
_unique_id_fieldname => $args{unique_id_fieldname},
_stop_words_file => $args{stop_words_file},
_misspelled_words_file => $args{misspelled_words_file},
_min_word_length => $args{min_word_length} || 4,
_add_synsets_to_tickets => $args{add_synsets_to_tickets} || 0,
_want_stemming => $args{want_stemming} || 0,
_how_many_retrievals => $args{how_many_retrievals} || 5,
_min_idf_threshold => $args{min_idf_threshold},
_max_num_syn_words => $args{max_num_syn_words} || 3,
_want_synset_caching => $args{want_synset_caching} || 0,
_stop_words => {},
_all_tickets => [],
_column_headers => [],
_good_columns => [],
lib/Algorithm/TicketClusterer.pm view on Meta::CPAN
if ($self->{_misspelled_words_file}) {
my @misspelled_word_pairs =
@{_fetch_word_pairs_from_file($self->{_misspelled_words_file})};
foreach my $wordpair (@misspelled_word_pairs) {
my ($wrong_word, $good_word) = grep $_, split /\s+/, $wordpair;
$self->{_misspelled_words}->{$wrong_word} = $good_word;
}
}
}
my $record = $self->{_tickets_by_ids}->{$ticket_id};
my $min = $self->{_min_word_length};
my @words = split /\n|\r|\"|\'|\.|\,|\;|\?|\(|\)|\[|\]|\\|\/|\s+|\&/, $record;
my @clean_words = grep $_, map { /([a-z0-9_]{$min,})/i;$1 } @words;
return unless @clean_words;
my @new_words;
foreach my $word (@words) {
$word =~ s/(.+)[.,:!-]$/$1/;
unless (($word eq 'no') or ($word eq 'not')) {
next if length($word) < $self->{_min_word_length};
}
if (exists $self->{_misspelled_words}->{lc($word)}) {
push @new_words, $self->{_misspelled_words}->{$word};
next;
}
push @new_words, $word unless exists $self->{_stop_words}->{lc($word)};
}
my $new_record = join ' ', @new_words;
$self->{_processed_tkts_by_ids}->{$ticket_id} = $new_record;
}
lib/Algorithm/TicketClusterer.pm view on Meta::CPAN
$self->_add_to_words_their_synonyms_one_ticket( $ticket_id );
}
sub _replace_negated_words_with_antonyms_one_ticket {
my $self = shift;
my $ticket_id = shift;
my $record = $self->{_processed_tkts_by_ids}->{$ticket_id};
my @words_negated_with_not = $record =~ /\bnot\s+(\w+)/ig;
foreach my $word (@words_negated_with_not) {
next unless (($word =~ /^\w+$/) &&
(length($word) > $self->{_min_word_length}));
my @antonym_words = @{$self->_get_antonyms_for_word( $word )};
next unless @antonym_words > 0;
$#antonym_words = $self->{_max_num_syn_words} - 1
if @antonym_words > $self->{_max_num_syn_words};
my $antonym_replacement_string = join ' ', @antonym_words;
print "Antonym for $word is $antonym_replacement_string\n"
if $self->{_debug2};
$record =~ s/not\s+$word/$antonym_replacement_string/g;
}
my @words_negated_with_no = $record =~ /\bno\s+(\w+)/ig;
foreach my $word (@words_negated_with_no) {
next unless (($word =~ /^\w+$/) &&
(length($word) > $self->{_min_word_length}));
my @antonym_words = @{$self->_get_antonyms_for_word( $word )};
next unless @antonym_words > 0;
$#antonym_words = $self->{_max_num_syn_words} - 1
if @antonym_words > $self->{_max_num_syn_words};
my $antonym_replacement_string = join ' ', @antonym_words;
print "Antonym for $word is $antonym_replacement_string\n"
if $self->{_debug2};
$record =~ s/no\s+$word/$antonym_replacement_string/g;
}
$self->{_processed_tkts_by_ids}->{$ticket_id} = $record;
lib/Algorithm/TicketClusterer.pm view on Meta::CPAN
sub _add_to_words_their_synonyms_one_ticket {
    my $self = shift;
    my $ticket_id = shift;
    # Expand one processed ticket by appending (lower-cased) WordNet
    # synonyms for each qualifying word, then store the re-sorted record
    # back into _processed_tkts_by_ids.  Synonym lookups are memoized in
    # _synset_cache keyed by the original word.
    my $record = $self->{_processed_tkts_by_ids}->{$ticket_id};
    my @words = split /\s+/, $record;
    my @synonym_bag;
    foreach my $word (@words) {
        # Negation markers are consumed by the antonym-replacement pass,
        # not expanded with synonyms.
        next if $word eq 'no';
        next if $word eq 'not';
        next unless $word =~ /^\w+$/ &&
                    length($word) > $self->{_min_word_length};
        my @synonym_words;
        if (exists $self->{_synset_cache}->{$word}) {
            @synonym_words = @{$self->{_synset_cache}->{$word}};
        } else {
            @synonym_words = @{$self->_get_synonyms_for_word( $word )};
            print "syn-set for $word => @synonym_words\n\n"
                if $self->{_debug2};
            my $word_root;
            if (@synonym_words == 0) {
                # No synonyms for the word as-is: retry with a crude
                # stemmed root (strip trailing "s" or "ing").
                if ((length($word) > 4) && ($word =~ /(.+)s$/)) {
                    $word_root = $1;
                    @synonym_words = @{$self->_get_synonyms_for_word( $word_root )}
                        if length($word_root) >= $self->{_min_word_length};
                } elsif ((length($word) > 6) && ($word =~ /(.+)ing$/)) {
                    $word_root = $1;
                    @synonym_words = @{$self->_get_synonyms_for_word( $word_root )}
                        if length($word_root) >= $self->{_min_word_length};
                }
            }
            print "syn-set for word root $word_root => @synonym_words\n\n"
                if ( $self->{_debug2} && defined $word_root );
            # Shuffle before truncation so the retained subset is not
            # biased toward WordNet's ordering.
            _fisher_yates_shuffle( \@synonym_words ) if @synonym_words > 0;
            $#synonym_words = $self->{_max_num_syn_words} - 1
                if @synonym_words > $self->{_max_num_syn_words};
            print "Retained syn-set for $word => @synonym_words\n\n"
                if $self->{_debug2};
            $self->{_synset_cache}->{$word} = \@synonym_words;
        }
        # BUG FIX: this push used to live inside the cache-miss branch, so
        # a cache hit fetched the synonyms but never added them to the bag
        # -- i.e. caching silently disabled expansion for repeated words.
        push @synonym_bag, @synonym_words;
    }
    foreach my $syn_word (@synonym_bag) {
        # BUG FIX: lower-case the stop-word lookup for consistency with
        # the filtering pass, which also checks lc($word).
        push @words, lc($syn_word)
            unless ((exists $self->{_stop_words}->{lc($syn_word)}) ||
                    (length($syn_word) <= $self->{_min_word_length}));
    }
    my @sorted_words = sort @words;
    my $new_record = join ' ', @sorted_words;
    $self->{_processed_tkts_by_ids}->{$ticket_id} = $new_record;
}
sub store_processed_tickets_on_disk {
my $self = shift;
$self->{_processed_tickets_db} = "processed_tickets.db" unless $self->{_processed_tickets_db};
unlink $self->{_processed_tickets_db};
lib/Algorithm/TicketClusterer.pm view on Meta::CPAN
sub get_ticket_vocabulary_and_construct_inverted_index {
my $self = shift;
my $total_num_of_tickets = keys %{$self->{_processed_tkts_by_ids}};
$self->{_tickets_vocab_db} = "tickets_vocab.db" unless $self->{_tickets_vocab_db};
unlink glob "$self->{_tickets_vocab_db}.*";
my %vocab_hist_on_disk;
tie %vocab_hist_on_disk, 'SDBM_File',
$self->{_tickets_vocab_db}, O_RDWR|O_CREAT, 0640
or die "Can't create DBM files: $!";
my %inverted_index;
my $min = $self->{_min_word_length};
foreach my $ticket_id (sort {$a <=> $b} keys %{$self->{_processed_tkts_by_ids}}) {
my %uniques = ();
my $record = $self->{_processed_tkts_by_ids}->{$ticket_id};
my @brokenup = split /\n|\r|\"|\'|\.|\(|\)|\[|\]|\\|\/|\s+/, $record;
my @clean_words = grep $_, map { /([a-z0-9_]{$min,})/i;$1 } @brokenup;
next unless @clean_words;
@clean_words = grep $_, map &_simple_stemmer($_, $self->{_debug2}),
@clean_words;
map { $vocab_hist_on_disk{"\L$_"}++ } grep $_, @clean_words;
for (@clean_words) { $uniques{"\L$_"}++ };
lib/Algorithm/TicketClusterer.pm view on Meta::CPAN
my $ticket_id = shift;
unless (keys %{$self->{_doc_vector_template}}) {
foreach ( sort keys %{$self->{_vocab_hist}} ) {
$self->{_doc_vector_template}->{$_} = 0;
}
}
my %doc_vector = %{_deep_copy_hash($self->{_doc_vector_template})};
foreach ( sort keys %{$self->{_doc_vector_template}} ) {
$doc_vector{$_} = 0;
}
my $min = $self->{_min_word_length};
my $total_words_in_ticket = 0;
my $record = $self->{_stemmed_tkts_by_ids}->{$ticket_id};
my @clean_words = split /\s+/, $record;
map { $doc_vector{"\L$_"}++ }
grep {exists $self->{_vocab_hist}->{"\L$_"}} @clean_words;
die "Something went wrong. Doc vector size unequal to vocab size"
unless $self->{_vocab_size} == scalar(keys %doc_vector);
foreach (keys %doc_vector) {
$total_words_in_ticket += $doc_vector{$_};
}
lib/Algorithm/TicketClusterer.pm view on Meta::CPAN
stemmed_tickets_db
inverted_index_db
tickets_vocab_db
idf_db
tkt_doc_vecs_db
tkt_doc_vecs_normed_db
synset_cache_db
want_synset_caching
add_synsets_to_tickets
clustering_fieldname
min_word_length
min_idf_threshold
max_num_syn_words
stop_words_file
misspelled_words_file
unique_id_fieldname
want_stemming
how_many_retrievals
debug1
debug2
debug3
lib/Algorithm/TicketClusterer.pm view on Meta::CPAN
my $i = @$arr;
while (--$i) {
my $j = int rand( $i + 1 );
@$arr[$i, $j] = @$arr[$j, $i];
}
}
sub _vec_scalar_product {
    # Compute the dot product of two equal-length numeric vectors passed
    # as array references.  Dies if the lengths differ.
    my ($vec1, $vec2) = @_;
    die "Something is wrong --- the two vectors are of unequal length"
        unless @$vec1 == @$vec2;
    # BUG FIX: initialize to 0 so two empty vectors yield 0 rather than
    # undef (which also triggered "uninitialized value" warnings).
    my $product = 0;
    for my $i (0 .. $#$vec1) {
        $product += $vec1->[$i] * $vec2->[$i];
    }
    return $product;
}
sub _vec_magnitude {
my $vec = shift;
lib/Algorithm/TicketClusterer.pm view on Meta::CPAN
tickets_vocab_db => $tickets_vocab_db,
idf_db => $idf_db,
tkt_doc_vecs_db => $tkt_doc_vecs_db,
tkt_doc_vecs_normed_db => $tkt_doc_vecs_normed_db,
synset_cache_db => $synset_cache_db,
stop_words_file => $stop_words_file,
misspelled_words_file => $misspelled_words_file,
add_synsets_to_tickets => 1,
want_synset_caching => 1,
max_num_syn_words => 3,
min_word_length => 4,
want_stemming => 1,
);
## Extract information from Excel spreadsheets:
$clusterer->get_tickets_from_excel();
## Apply cleanup filters and add synonyms:
$clusterer->delete_markup_from_all_tickets();
$clusterer->apply_filter_to_all_tickets();
$clusterer->expand_all_tickets_with_synonyms();
lib/Algorithm/TicketClusterer.pm view on Meta::CPAN
idf_db => $idf_db,
tkt_doc_vecs_db => $tkt_doc_vecs_db,
tkt_doc_vecs_normed_db => $tkt_doc_vecs_normed_db,
synset_cache_db => $synset_cache_db,
stop_words_file => $stop_words_file,
misspelled_words_file => $misspelled_words_file,
add_synsets_to_tickets => 1,
want_synset_caching => 1,
min_idf_threshold => 2.0,
max_num_syn_words => 3,
min_word_length => 4,
want_stemming => 1,
how_many_retrievals => 5,
debug1 => 1, # for processing, filtering Excel
debug2 => 1, # for doc modeling
debug3 => 1, # for retrieving similar tickets
);
Obviously, before you can invoke the constructor, you must provide values for the
variables shown to the right of the big arrows. As to what these values should be is
lib/Algorithm/TicketClusterer.pm view on Meta::CPAN
measure of the discriminatory power of the word. Let's say you have a word that
occurs in only one out of 1000 tickets. Such a word is obviously highly
discriminatory and its IDF would be the logarithm (to base 10) of the ratio of 1000
to 1, which is 3. On the other hand, for a word that occurs in every one of 1000
tickets, its IDF value would be the logarithm of the ratio of 1000 to 1000, which is
0. So, for the case when you have 1000 tickets, the upper bound on IDF is 3 and the
lower bound 0. This constructor parameter controls which of the query words you will
use for constructing the initial pool of tickets that will be used for matching. The
larger the value of this threshold, the smaller the pool obviously.
=item I<min_word_length:>
This parameter sets the minimum number of characters in a word in order for it to be
included for ticket processing.
=item I<misspelled_words_file:>
The extent to which you can improve ticket retrieval precision with the addition of
synonyms depends on the degree to which you can make corrections on the fly for the
spelling errors that occur frequently in tickets. That fact makes the file you
supply through this constructor parameter very important. For the current version of
lib/Algorithm/TicketClusterer.pm view on Meta::CPAN
By a production-quality tool, I mean a software package that you can I<actually> use
in a production environment for automated or semi-automated ticket routing in your
organization. I am assuming you already have the tools in place that insert in
real-time the new tickets in an Excel spreadsheet.
Turning this module into a production tool will require that you find the best values
to use for the following three parameters that are needed by the constructor: (1)
C<min_idf_threshold> for the minimum C<idf> value for the words in a query ticket in
order for them to be considered for matching with the other tickets; (2)
C<min_word_length> for discarding words that are too short; and (3)
C<max_num_syn_words> for how many synonyms to retain for a word if the number of
synonyms returned by WordNet is too large. In addition, you must also come up with a
misspelled-words file that is appropriate to your application domain and a stop-words
file.
In order to find the best values to use for the parameters that are mentioned above,
I suggest creating a graphical front-end for this module that would allow for
altering the values of the three parameters listed above in response to the
prevailing mis-routing rates for the tickets. The front-end will display to an
operator the latest ticket that needs to be routed and a small set of the
stemmed_tickets_db => "t/__test_stemmed_tickets_db",
inverted_index_db => "t/__test_inverted_index_db",
tickets_vocab_db => "t/__test_tickets_vocab_db",
idf_db => "t/__test_idf_db",
tkt_doc_vecs_db => "t/__test_tkt_doc_vecs_db",
tkt_doc_vecs_normed_db => "t/__test_tkt_doc_vecs_normed_db",
synset_cache_db => "t/__test_synset_cache_db",
add_synsets_to_tickets => 1,
want_synset_caching => 1,
max_num_syn_words => 3,
min_word_length => 4,
want_stemming => 1,
);
my @returned = $tclusterer->_test_excel_for_tickets();
my @should_be = qw/0 4 0 6/;
#ok( @returned ~~ @should_be, 'Able to process Excel' );
my @comparisons = map {$returned[$_] == $should_be[$_] ? 1 : 0} (0..@returned-1);
my $final_compare = 1;
foreach my $i (0..@returned-1) {
$final_compare *= $comparisons[$i]