Algorithm-TicketClusterer
view release on metacpan or search on metacpan
lib/Algorithm/TicketClusterer.pm view on Meta::CPAN
package Algorithm::TicketClusterer;
#---------------------------------------------------------------------------
# Copyright (c) 2014 Avinash Kak. All rights reserved. This program is
# free software. You may modify and/or distribute it under the same terms
# as Perl itself. This copyright notice must remain attached to the file.
#
# Algorithm::TicketClusterer is a Perl module for retrieving Excel-stored
# past tickets that are most similar to a new ticket. Tickets are commonly
# used in the software services industry and customer support businesses to
# record requests for service, product complaints, user feedback, and so
# on.
# ---------------------------------------------------------------------------
use 5.10.0;
use strict;
use warnings;
use Carp;
use Storable;
use Spreadsheet::ParseExcel;
use Spreadsheet::XLSX;
use WordNet::QueryData;
use Text::Iconv;
use SDBM_File;
use Fcntl;
our $VERSION = '1.01';
############################### The Constructor #############################
# Constructor.  Takes keyword arguments (name => value pairs) and returns a
# new object blessed into $class.
#
# Croaks if _check_for_illegal_params() returns 0 for the supplied keyword
# names.  Options that are simply copied are stored as given (undef when
# absent); options with a fallback use the fallback whenever the supplied
# value is missing or false.
sub new {
    my ($class, %args) = @_;

    my @supplied_keywords = keys %args;
    if (_check_for_illegal_params(@supplied_keywords) == 0) {
        croak "\nYou have used a wrong name for a keyword argument " .
              "--- perhaps a misspelling\n";
    }

    # Caller-supplied options stored verbatim under a leading-underscore key.
    my @copied_options = qw(
        excel_filename
        which_worksheet
        raw_tickets_db
        processed_tickets_db
        synset_cache_db
        stemmed_tickets_db
        inverted_index_db
        tickets_vocab_db
        idf_db
        tkt_doc_vecs_db
        tkt_doc_vecs_normed_db
        clustering_fieldname
        unique_id_fieldname
        stop_words_file
        misspelled_words_file
        min_idf_threshold
    );

    # Options that fall back to a default when missing or false.
    my %fallback_for = (
        min_word_length        => 4,
        add_synsets_to_tickets => 0,
        want_stemming          => 0,
        how_many_retrievals    => 5,
        max_num_syn_words      => 3,
        want_synset_caching    => 0,
        debug1                 => 0,    # for processing Excel
        debug2                 => 0,    # for modeling tickets
        debug3                 => 0,    # for retrieving similar tickets
    );

    my %self;
    for my $option (@copied_options) {
        $self{"_$option"} = $args{$option};
    }
    for my $option (keys %fallback_for) {
        $self{"_$option"} = $args{$option} || $fallback_for{$option};
    }

    # Internal state: every object starts with its own empty containers.
    for my $hash_slot (qw(
        _stop_words
        _tickets_by_ids
        _processed_tkts_by_ids
        _stemmed_tkts_by_ids
        _misspelled_words
        _synset_cache
        _vocab_hash
        _vocab_idf_hist
        _idf_t
        _doc_vector_template
        _tkt_doc_vecs
        _tkt_doc_vecs_normed
        _inverted_index
    )) {
        $self{$hash_slot} = {};
    }
    for my $array_slot (qw(_all_tickets _column_headers _good_columns)) {
        $self{$array_slot} = [];
    }
    $self{_total_num_tickets} = 0;
    $self{_vocab_size}        = undef;
    $self{_query_ticket_id}   = undef;

    # WordNet handle, created with verbose off and noload on.
    $self{_wn} = WordNet::QueryData->new(
        verbose => 0,
        noload  => 1,
    );

    return bless \%self, $class;
}
############################# Extract info from Excel #######################
sub get_tickets_from_excel {
my $self = shift;
unlink $self->{_raw_tickets_db} if -s $self->{_raw_tickets_db};
unlink $self->{_processed_tickets_db} if -s $self->{_processed_tickets_db};
unlink $self->{_synset_cache_db} if -s $self->{_synset_cache_db};
unlink $self->{_stemmed_tickets_db} if -s $self->{_stemmed_tickets_db};
unlink $self->{_inverted_index_db} if -s $self->{_inverted_index_db};
unlink $self->{_tkt_doc_vecs_db} if -s $self->{_tkt_doc_vecs_db};
unlink $self->{_tkt_doc_vecs_normed_db} if -s $self->{_tkt_doc_vecs_normed_db};
( run in 1.026 second using v1.01-cache-2.11-cpan-39bf76dae61 )