Algorithm-TicketClusterer
view release on metacpan or search on metacpan
lib/Algorithm/TicketClusterer.pm view on Meta::CPAN
_excel_filename => $args{excel_filename},
_which_worksheet => $args{which_worksheet},
_raw_tickets_db => $args{raw_tickets_db},
_processed_tickets_db => $args{processed_tickets_db},
_synset_cache_db => $args{synset_cache_db},
_stemmed_tickets_db => $args{stemmed_tickets_db},
_inverted_index_db => $args{inverted_index_db},
_tickets_vocab_db => $args{tickets_vocab_db},
_idf_db => $args{idf_db},
_tkt_doc_vecs_db => $args{tkt_doc_vecs_db},
_tkt_doc_vecs_normed_db => $args{tkt_doc_vecs_normed_db},
_clustering_fieldname => $args{clustering_fieldname},
_unique_id_fieldname => $args{unique_id_fieldname},
_stop_words_file => $args{stop_words_file},
_misspelled_words_file => $args{misspelled_words_file},
_min_word_length => $args{min_word_length} || 4,
_add_synsets_to_tickets => $args{add_synsets_to_tickets} || 0,
_want_stemming => $args{want_stemming} || 0,
_how_many_retrievals => $args{how_many_retrievals} || 5,
_min_idf_threshold => $args{min_idf_threshold},
_max_num_syn_words => $args{max_num_syn_words} || 3,
_want_synset_caching => $args{want_synset_caching} || 0,
_stop_words => {},
_all_tickets => [],
_column_headers => [],
_good_columns => [],
_tickets_by_ids => {},
_processed_tkts_by_ids => {},
_stemmed_tkts_by_ids => {},
_misspelled_words => {},
_total_num_tickets => 0,
_synset_cache => {},
_vocab_hash => {},
_vocab_idf_hist => {},
_idf_t => {},
_vocab_size => undef,
_doc_vector_template => {},
_tkt_doc_vecs => {},
_tkt_doc_vecs_normed => {},
_query_ticket_id => undef,
_inverted_index => {},
_debug1 => $args{debug1} || 0, # for processing Excel
_debug2 => $args{debug2} || 0, # for modeling tickets
_debug3 => $args{debug3} || 0, # for retrieving similar tickets
_wn => WordNet::QueryData->new( verbose => 0,
noload => 1 ),
}, $class;
}
############################# Extract info from Excel #######################
sub get_tickets_from_excel {
my $self = shift;
unlink $self->{_raw_tickets_db} if -s $self->{_raw_tickets_db};
unlink $self->{_processed_tickets_db} if -s $self->{_processed_tickets_db};
unlink $self->{_synset_cache_db} if -s $self->{_synset_cache_db};
unlink $self->{_stemmed_tickets_db} if -s $self->{_stemmed_tickets_db};
unlink $self->{_inverted_index_db} if -s $self->{_inverted_index_db};
unlink $self->{_tkt_doc_vecs_db} if -s $self->{_tkt_doc_vecs_db};
unlink $self->{_tkt_doc_vecs_normed_db} if -s $self->{_tkt_doc_vecs_normed_db};
unlink glob "$self->{_tickets_vocab_db}.*";
unlink glob "$self->{_idf_db}.*";
my $filename = $self->{_excel_filename} || die("Excel file required"),
my $clustering_fieldname = $self->{_clustering_fieldname}
|| die("\nYou forgot to specify a value for the constructor parameter clustering_fieldname that points to the data to be clustered in your Excel sheet -- ");
my $unique_id_fieldname = $self->{_unique_id_fieldname}
|| die("\nYou forgot to specify a value for the constructor parameter unique_id_fieldname that is a unique integer identifier for the rows of your Excel sheet -- ");
my $workbook;
if ($filename =~ /\.xls$/) {
my $parser = Spreadsheet::ParseExcel->new();
$workbook = $parser->parse($filename);
die $parser->error() unless defined $workbook;
} elsif ($filename =~ /\.xlsx$/) {
# use Text::Iconv;
my $converter = Text::Iconv->new("utf-8", "windows-1251");
$workbook = Spreadsheet::XLSX->new($filename, $converter);
} else {
die "File suffix on the Excel file not recognized";
}
my @worksheets = $workbook->worksheets();
my $which_worksheet = $self->{_which_worksheet} ||
die "\nYou have not specified which Excel worksheet contains the tickets\n";
my ( $row_min, $row_max ) = $worksheets[$which_worksheet-1]->row_range();
my ( $col_min, $col_max ) = $worksheets[$which_worksheet-1]->col_range();
my @good_columns;
my $col_headers_row;
my $col_headers_found = 0;
my $col_index_for_unique_id;
my $col_index_for_clustering_field;
for my $row ( $row_min .. $row_max ) {
last if $col_headers_found;
@good_columns = ();
for my $col ( $col_min .. $col_max ) {
my $cell =
$worksheets[$which_worksheet-1]->get_cell( $row, $col );
next unless $cell;
my $cell_value = _get_rid_of_wide_chars($cell->value());
push @good_columns, $col if $cell_value;
if ($cell_value eq $unique_id_fieldname) {
$col_index_for_unique_id = $col;
$col_headers_row = $row;
$col_headers_found = 1;
}
if ($cell_value eq $clustering_fieldname) {
$col_index_for_clustering_field = $col;
}
}
}
$self->{_good_columns} = \@good_columns;
print "\nThe unique id is in column: $col_index_for_unique_id\n"
if $self->{_debug1};
print "The clustering field is in column: " .
"$col_index_for_clustering_field\n\n" if $self->{_debug1};
my %Column_Headers;
foreach my $field_index (0..@good_columns-1) {
my $key = "field_" . $field_index;
$Column_Headers{$key} = "";
}
my @col_headers = map {
my $cell =
$worksheets[$which_worksheet-1]->get_cell($col_headers_row, $_);
$cell ? _get_rid_of_wide_chars($cell->value()) : '';
lib/Algorithm/TicketClusterer.pm view on Meta::CPAN
# Persist the processed tickets (the structure referenced by
# $self->{_processed_tkts_by_ids}) to disk with Storable's store().
#
# The target file is $self->{_processed_tickets_db}; when that attribute is
# unset it is set to "processed_tickets.db".  Any existing file of that name
# is removed before the new one is written.
#
# Dies if store() throws, or if it signals a failure by returning a false
# value.
sub store_processed_tickets_on_disk {
    my $self = shift;
    $self->{_processed_tickets_db} = "processed_tickets.db" unless $self->{_processed_tickets_db};
    my $db_file = $self->{_processed_tickets_db};
    unlink $db_file;
    # store() croaks on serious errors but only returns undef on I/O
    # problems, so both the exception and the return value are checked.
    my $stored = eval { store( $self->{_processed_tkts_by_ids}, $db_file ) };
    my $error  = $@;
    if ( $error || !$stored ) {
        $error ||= "store() reported a failure for '$db_file': $!";
        die "Something went wrong with disk storage of processed tickets: $error";
    }
    return;
}
# Persist two structures to disk with Storable's store(), in this order:
#
#   1. the stemmed tickets ($self->{_stemmed_tkts_by_ids}) into
#      $self->{_stemmed_tickets_db}  (set to "stemmed_tickets.db" if unset)
#   2. the inverted index  ($self->{_inverted_index}) into
#      $self->{_inverted_index_db}   (set to "inverted_index.db" if unset)
#
# For each one, any existing file is removed first and a progress banner is
# printed to the currently selected output handle.
#
# Dies if store() throws, or if it signals a failure by returning a false
# value.  A failure on the first structure prevents the second from being
# written.
sub store_stemmed_tickets_and_inverted_index_on_disk {
    my $self = shift;
    my @jobs = (
        {
            db_attr    => '_stemmed_tickets_db',
            default_db => "stemmed_tickets.db",
            data_attr  => '_stemmed_tkts_by_ids',
            banner     => "\n\nStoring stemmed tickets on disk\n\n",
            what       => "stemmed tickets",
        },
        {
            db_attr    => '_inverted_index_db',
            default_db => "inverted_index.db",
            data_attr  => '_inverted_index',
            banner     => "\nStoring inverted index on disk\n\n",
            what       => "the inverted index",
        },
    );
    for my $job (@jobs) {
        my $db_attr = $job->{db_attr};
        $self->{$db_attr} = $job->{default_db} unless $self->{$db_attr};
        my $db_file = $self->{$db_attr};
        unlink $db_file;
        # store() croaks on serious errors but only returns undef on I/O
        # problems, so both the exception and the return value are checked.
        my $stored = eval {
            print $job->{banner};
            store( $self->{ $job->{data_attr} }, $db_file );
        };
        my $error = $@;
        if ( $error || !$stored ) {
            $error ||= "store() reported a failure for '$db_file': $!";
            die "Something went wrong with disk storage of $job->{what}: $error";
        }
    }
    return;
}
# Load the processed tickets back from disk with Storable's retrieve() and
# install them as $self->{_processed_tkts_by_ids}.
#
# The file read is $self->{_processed_tickets_db}; when that attribute is
# unset it is set to "processed_tickets.db", the same default that
# store_processed_tickets_on_disk() uses when writing.
#
# Dies if retrieve() throws or returns undef.  In either case the tickets
# already held in memory are left untouched.
sub restore_processed_tickets_from_disk {
    my $self = shift;
    $self->{_processed_tickets_db} = "processed_tickets.db" unless $self->{_processed_tickets_db};
    my $db_file = $self->{_processed_tickets_db};
    # retrieve() may return undef on I/O problems instead of throwing, so
    # the result is validated before it replaces the in-memory data.
    my $tickets = eval { retrieve($db_file) };
    my $error   = $@;
    if ( $error || !defined $tickets ) {
        $error ||= "retrieve() returned nothing for '$db_file': $!";
        die "Something went wrong with restoration of processed tickets: $error";
    }
    $self->{_processed_tkts_by_ids} = $tickets;
    return;
}
# Load the stemmed tickets back from disk with Storable's retrieve() and
# install them as $self->{_stemmed_tkts_by_ids}.
#
# The file read is $self->{_stemmed_tickets_db}; when that attribute is
# unset it is set to "stemmed_tickets.db", the same default that
# store_stemmed_tickets_and_inverted_index_on_disk() uses when writing.
#
# Dies if retrieve() throws or returns undef.  In either case the stemmed
# tickets already held in memory are left untouched.
sub restore_stemmed_tickets_from_disk {
    my $self = shift;
    $self->{_stemmed_tickets_db} = "stemmed_tickets.db" unless $self->{_stemmed_tickets_db};
    my $db_file = $self->{_stemmed_tickets_db};
    # retrieve() may return undef on I/O problems instead of throwing, so
    # the result is validated before it replaces the in-memory data.
    my $tickets = eval { retrieve($db_file) };
    my $error   = $@;
    if ( $error || !defined $tickets ) {
        $error ||= "retrieve() returned nothing for '$db_file': $!";
        die "Something went wrong with restoration of stemmed tickets: $error";
    }
    $self->{_stemmed_tkts_by_ids} = $tickets;
    return;
}
#################### Get Ticket Vocabulary and Word Counts #################
# Build the vocabulary statistics and the inverted index from the processed
# tickets in $self->{_processed_tkts_by_ids}.
#
# For every ticket (visited in ascending numeric id order) the text is split
# into tokens, the first run of at least $self->{_min_word_length} characters
# from [a-z0-9_] (case-insensitive) is kept from each token, and the result
# is passed through _simple_stemmer().  From the surviving words the method
# updates:
#
#   _vocab_hist          lower-cased word => total occurrences
#   _vocab_idf_hist      lower-cased word => number of tickets containing it
#   _inverted_index      lower-cased word => list of ticket ids containing it
#   _stemmed_tkts_by_ids ticket id => the stemmed words joined by a space
#   _vocab_size          number of keys in _vocab_hist
#   _idf_t               lower-cased word => idf value (formula below)
#   _tkt_vocab_done      set to 1
#
# The word counts and idf values are also written to SDBM files named by
# $self->{_tickets_vocab_db} (default "tickets_vocab.db") and
# $self->{_idf_db} (default "idf.db"); existing files matching "<name>.*"
# are deleted first.
#
# Dies if either SDBM file cannot be tied.
#
# NOTE(review): _vocab_idf_hist, _inverted_index and _vocab_hist are not
# cleared here, so a second call adds to whatever they already hold --
# confirm that callers invoke this only once per object.
sub get_ticket_vocabulary_and_construct_inverted_index {
    my $self = shift;
    my $total_num_of_tickets = keys %{$self->{_processed_tkts_by_ids}};
    $self->{_tickets_vocab_db} = "tickets_vocab.db" unless $self->{_tickets_vocab_db};
    unlink glob "$self->{_tickets_vocab_db}.*";
    my %vocab_hist_on_disk;
    tie %vocab_hist_on_disk, 'SDBM_File',
        $self->{_tickets_vocab_db}, O_RDWR|O_CREAT, 0640
            or die "Can't create DBM files: $!";
    my $min = $self->{_min_word_length};
    foreach my $ticket_id (sort {$a <=> $b} keys %{$self->{_processed_tkts_by_ids}}) {
        my $record = $self->{_processed_tkts_by_ids}->{$ticket_id};
        my @brokenup = split /\n|\r|\"|\'|\.|\(|\)|\[|\]|\\|\/|\s+/, $record;
        # Use $1 only when the match succeeded: after a failed match $1 still
        # holds the capture of an earlier successful match, which would let
        # a token that does not qualify contribute a stale word.
        my @clean_words = grep { $_ }
                          map { /([a-z0-9_]{$min,})/i ? $1 : () } @brokenup;
        next unless @clean_words;
        @clean_words = grep { $_ }
                       map { _simple_stemmer($_, $self->{_debug2}) } @clean_words;
        my %uniques;
        for my $word (@clean_words) {
            my $lc_word = lc $word;
            $vocab_hist_on_disk{$lc_word}++;    # total occurrences
            $uniques{$lc_word}++;               # distinct words in this ticket
        }
        for my $lc_word (keys %uniques) {
            $self->{_vocab_idf_hist}->{$lc_word}++;
            push @{$self->{_inverted_index}->{$lc_word}}, $ticket_id;
        }
        $self->{_stemmed_tkts_by_ids}->{$ticket_id} = join ' ', @clean_words;
    }
    foreach my $word (keys %vocab_hist_on_disk) {
        $self->{_vocab_hist}->{$word} = $vocab_hist_on_disk{$word};
    }
    untie %vocab_hist_on_disk;
    $self->{_tkt_vocab_done} = 1;
    $self->{_vocab_size} = scalar( keys %{$self->{_vocab_hist}} );
    print "\n\nVocabulary size: $self->{_vocab_size}\n\n"
        if $self->{_debug2};
    # Calculate idf(t) = | (1 + ln( D / (1 + d(t)) )) / ln(10) |, where D is
    # the number of processed tickets and d(t) the number of tickets that
    # contain the word t.
    $self->{_idf_db} = "idf.db" unless $self->{_idf_db};
    unlink glob "$self->{_idf_db}.*";
    tie my %idf_t_on_disk, 'SDBM_File', $self->{_idf_db}, O_RDWR|O_CREAT, 0640
        or die "Can't create DBM files: $!";
    foreach my $word (keys %{$self->{_vocab_idf_hist}}) {
        my $doc_freq = $self->{_vocab_idf_hist}->{$word};
        $idf_t_on_disk{$word} = abs( (1 + log($total_num_of_tickets
                                              /
                                              (1 + $doc_freq)))
                                     / log(10) );
    }
    # The in-memory copy is read back from the DBM file, so it holds the
    # values exactly as they were stored on disk.
    foreach my $word (keys %idf_t_on_disk) {
        $self->{_idf_t}->{$word} = $idf_t_on_disk{$word};
    }
    untie %idf_t_on_disk;
}
# Print the tickets vocabulary held in $self->{_vocab_hist}: one line per
# word in sorted order, showing the word right-aligned in a 30-character
# column followed by its count, and finally the number of distinct words.
#
# Dies with "tickets vocabulary not yet constructed" when the vocabulary
# hash has no keys.
sub display_tickets_vocab {
    my ($self) = @_;
    unless ( keys %{ $self->{_vocab_hist} } ) {
        die "tickets vocabulary not yet constructed";
    }
    my $count_for = $self->{_vocab_hist};
    print "\n\nDisplaying tickets vocabulary (the number shown against each word is the number of times each word appears in ALL the tickets):\n\n";
    for my $word ( sort keys %{$count_for} ) {
        print sprintf( "%30s %d", $word, $count_for->{$word} ) . "\n";
    }
    my $how_many_words = keys %{$count_for};
    print "\nSize of the tickets vocabulary: $how_many_words\n\n";
}
# Print two sorted tables:
#   1. for every word in $self->{_vocab_idf_hist}, the number of tickets in
#      which the word appears;
#   2. for every word in $self->{_idf_t}, its idf value.
# Each line shows the word right-aligned in a 30-character column followed
# by the number.
#
# Dies with "tickets vocabulary not yet constructed" when _vocab_idf_hist
# has no keys.
#
# NOTE(review): the heading of the second table describes idf(t) as
# log(D/d(t)), whereas get_ticket_vocabulary_and_construct_inverted_index()
# stores abs((1 + log(D/(1 + d(t)))) / log(10)).  The printed text is kept
# unchanged here.
sub display_inverse_document_frequencies {
    my ($self) = @_;
    unless ( keys %{ $self->{_vocab_idf_hist} } ) {
        die "tickets vocabulary not yet constructed";
    }
    my $tickets_with = $self->{_vocab_idf_hist};
    print "\n\nDisplaying inverse document frequencies (the number of tickets in which each word appears):\n\n";
    for my $word ( sort keys %{$tickets_with} ) {
        print sprintf( "%30s %d", $word, $tickets_with->{$word} ) . "\n";
    }
    print "\nDisplaying idf(t) = log(D/d(t)) where D is total number of tickets and d(t) the number of tickets with the word t:\n";
    for my $word ( sort keys %{ $self->{_idf_t} } ) {
        print sprintf( "%30s %f", $word, $self->{_idf_t}->{$word} ) . "\n";
    }
}
# The following subroutine is useful for diagnostic purposes. It
# lists the number of tickets that a word appears in and also lists
# the tickets. But be careful in interpreting its results. Note
# that if you invoke this subroutine after the synsets have been added
# to the tickets, you may find words being attributed to tickets
# that do not actually contain them in the original Excel sheet.
sub list_processed_tickets_for_a_word {
my $self = shift;
while (my $word = <STDIN>) { #enter ctrl-D to exit the loop
chomp $word;
my @ticket_list;
foreach my $ticket_id (sort {$a <=> $b} keys %{$self->{_processed_tkts_by_ids}}) {
my $record = $self->{_processed_tkts_by_ids}->{$ticket_id};
push @ticket_list, $ticket_id if $record =~ /\b$word\b/i;
}
my $num = @ticket_list;
( run in 1.646 second using v1.01-cache-2.11-cpan-5b529ec07f3 )