Algorithm-TicketClusterer
view release on metacpan or search on metacpan
lib/Algorithm/TicketClusterer.pm view on Meta::CPAN
_misspelled_words_file => $args{misspelled_words_file},
_min_word_length => $args{min_word_length} || 4,
_add_synsets_to_tickets => $args{add_synsets_to_tickets} || 0,
_want_stemming => $args{want_stemming} || 0,
_how_many_retrievals => $args{how_many_retrievals} || 5,
_min_idf_threshold => $args{min_idf_threshold},
_max_num_syn_words => $args{max_num_syn_words} || 3,
_want_synset_caching => $args{want_synset_caching} || 0,
_stop_words => {},
_all_tickets => [],
_column_headers => [],
_good_columns => [],
_tickets_by_ids => {},
_processed_tkts_by_ids => {},
_stemmed_tkts_by_ids => {},
_misspelled_words => {},
_total_num_tickets => 0,
_synset_cache => {},
_vocab_hash => {},
_vocab_idf_hist => {},
_idf_t => {},
_vocab_size => undef,
_doc_vector_template => {},
_tkt_doc_vecs => {},
_tkt_doc_vecs_normed => {},
_query_ticket_id => undef,
_inverted_index => {},
_debug1 => $args{debug1} || 0, # for processing Excel
_debug2 => $args{debug2} || 0, # for modeling tickets
_debug3 => $args{debug3} || 0, # for retrieving similar tickets
_wn => WordNet::QueryData->new( verbose => 0,
noload => 1 ),
}, $class;
}
############################# Extract info from Excel #######################
# Reads the tickets out of the Excel workbook named by the _excel_filename
# attribute and caches them in the object.
#
# What happens, in order:
#   1. The on-disk database files named by the object's *_db attributes are
#      deleted so that stale data from an earlier run cannot be picked up.
#   2. The workbook is opened (".xls" through Spreadsheet::ParseExcel, ".xlsx"
#      through Spreadsheet::XLSX) and worksheet number _which_worksheet
#      (counted from 1) is selected.
#   3. Rows are scanned from the top until a cell equal to the unique-id
#      fieldname is seen; that row is taken to be the header row, and its
#      cells holding a true value define the "good" columns.
#   4. Every row below the header whose id cell contains a digit and whose
#      clustering cell contains a word character becomes a hash
#      (header => cell value) appended to _all_tickets.
#   5. Duplicate ids are fatal.  Otherwise _tickets_by_ids is filled with the
#      lower-cased clustering text keyed by id, _total_num_tickets is set, and
#      store_raw_tickets_on_disk() is called.
#
# Dies when a required constructor parameter is missing, when the file suffix
# is neither .xls nor .xlsx, when the requested worksheet does not exist, when
# the header cells cannot be located, or when the id column has duplicates.
sub get_tickets_from_excel {
    my $self = shift;

    # Databases stored as single files: remove them only if they exist with a
    # non-zero size.  Attributes that were never set are skipped instead of
    # being handed to -s as undef.
    foreach my $db_key (qw( _raw_tickets_db
                            _processed_tickets_db
                            _synset_cache_db
                            _stemmed_tickets_db
                            _inverted_index_db
                            _tkt_doc_vecs_db
                            _tkt_doc_vecs_normed_db )) {
        my $db_file = $self->{$db_key};
        next unless defined $db_file && length $db_file;
        unlink $db_file if -s $db_file;
    }

    # Databases stored as "<prefix>.<suffix>" files.  The guard matters: with
    # an undefined or empty prefix the pattern would collapse to ".*" and
    # match every dot-file in the current directory.
    foreach my $db_key (qw( _tickets_vocab_db _idf_db )) {
        my $db_prefix = $self->{$db_key};
        next unless defined $db_prefix && length $db_prefix;
        unlink glob "$db_prefix.*";
    }

    my $filename = $self->{_excel_filename} || die("Excel file required");
    my $clustering_fieldname = $self->{_clustering_fieldname}
        || die("\nYou forgot to specify a value for the constructor parameter clustering_fieldname that points to the data to be clustered in your Excel sheet -- ");
    my $unique_id_fieldname = $self->{_unique_id_fieldname}
        || die("\nYou forgot to specify a value for the constructor parameter unique_id_fieldname that is a unique integer identifier for the rows of your Excel sheet -- ");

    my $workbook;
    if ($filename =~ /\.xls$/) {
        my $parser = Spreadsheet::ParseExcel->new();
        $workbook = $parser->parse($filename);
        die $parser->error() unless defined $workbook;
    } elsif ($filename =~ /\.xlsx$/) {
        my $converter = Text::Iconv->new("utf-8", "windows-1251");
        $workbook = Spreadsheet::XLSX->new($filename, $converter);
    } else {
        die "File suffix on the Excel file not recognized";
    }

    my @worksheets = $workbook->worksheets();
    my $which_worksheet = $self->{_which_worksheet} ||
        die "\nYou have not specified which Excel worksheet contains the tickets\n";
    # The user counts worksheets from 1; the array is indexed from 0.
    my $worksheet = $worksheets[$which_worksheet-1];
    die "\nThe Excel file does not contain a worksheet number $which_worksheet\n"
        unless defined $worksheet;
    my ( $row_min, $row_max ) = $worksheet->row_range();
    my ( $col_min, $col_max ) = $worksheet->col_range();

    # Locate the header row: the first row that has a cell equal to the
    # unique-id fieldname.  @good_columns is rebuilt for each scanned row so
    # that, once the scan stops, it describes the header row only.
    my @good_columns;
    my $col_headers_row;
    my $col_index_for_unique_id;
    my $col_index_for_clustering_field;
    ROW:
    for my $row ( $row_min .. $row_max ) {
        @good_columns = ();
        for my $col ( $col_min .. $col_max ) {
            my $cell = $worksheet->get_cell( $row, $col );
            next unless $cell;
            my $cell_value = _get_rid_of_wide_chars($cell->value());
            push @good_columns, $col if $cell_value;
            if ($cell_value eq $unique_id_fieldname) {
                $col_index_for_unique_id = $col;
                $col_headers_row = $row;
            }
            if ($cell_value eq $clustering_fieldname) {
                $col_index_for_clustering_field = $col;
            }
        }
        # Row index 0 is legitimate, hence defined() rather than truth.
        last ROW if defined $col_headers_row;
    }

    # Fail here with a specific message rather than carrying undef row and
    # column indices into the cell lookups below.
    die "Something is wrong with the info extracted from Excel " .
        "as no column header named '$unique_id_fieldname' was found\n\n"
        unless defined $col_index_for_unique_id;
    die "Something is wrong with the info extracted from Excel " .
        "as no column header named '$clustering_fieldname' was found\n\n"
        unless defined $col_index_for_clustering_field;

    $self->{_good_columns} = \@good_columns;
    print "\nThe unique id is in column: $col_index_for_unique_id\n"
        if $self->{_debug1};
    print "The clustering field is in column: " .
          "$col_index_for_clustering_field\n\n" if $self->{_debug1};

    # Header text for each good column, in column order.
    my @col_headers = map {
        my $cell = $worksheet->get_cell($col_headers_row, $_);
        $cell ? _get_rid_of_wide_chars($cell->value()) : '';
    } @good_columns;
    $self->{_column_headers} = \@col_headers;
    $self->_display_column_headers() if $self->{_debug1};

    my $unique_id_field_index_in_good_columns =
        _find_index_for_given_element( $col_index_for_unique_id, \@good_columns );
    my $clustering_field_index_in_good_columns =
        _find_index_for_given_element( $col_index_for_clustering_field,
                                       \@good_columns );
    die "Something is wrong with the info extracted from Excel " .
        "as the index for the column with unique IDs is not one of " .
        "good columns\n\n"
        unless (defined $unique_id_field_index_in_good_columns) &&
               (defined $clustering_field_index_in_good_columns);

    # $row_max is the index of the last used row (the header scan above also
    # runs through it), so the range must include it or the final ticket in
    # the sheet is silently lost.
    for my $row_index ( $col_headers_row+1 .. $row_max ) {
        my @values = map {
            my $cell = $worksheet->get_cell($row_index, $_);
            $cell ? _get_rid_of_wide_chars($cell->value()) : '';
        } @good_columns;
        # Skip rows that carry no usable id or no text to cluster on.
        next unless $values[$unique_id_field_index_in_good_columns] =~ /\d+/;
        next unless $values[$clustering_field_index_in_good_columns] =~ /\w+/;
        # @col_headers and @values are both built from @good_columns, so they
        # line up element for element.
        my %onerow;
        @onerow{@col_headers} = @values;
        push @{$self->{_all_tickets}}, \%onerow;
    }

    my @duplicates_for_id_field = @{$self->_check_unique_id_field()};
    if (@duplicates_for_id_field > 0) {
        print "Your supposedly unique ID field values for duplicates: @duplicates_for_id_field\n";
        die "\n\nYour unique id field for tickets contains duplicate id's";
    }
    foreach my $ticket (@{$self->{_all_tickets}}) {
        $self->{_tickets_by_ids}->{$ticket->{$unique_id_fieldname}} =
            lc($ticket->{$clustering_fieldname});
    }
    $self->{_total_num_tickets} = scalar @{$self->{_all_tickets}};
    $self->store_raw_tickets_on_disk();
}
# Opens the workbook named by the _excel_filename attribute with
# Spreadsheet::XLSX and reports the extents of its first worksheet as the
# list (row_min, row_max, col_min, col_max).  Dies if _excel_filename holds
# no true value.
sub _test_excel_for_tickets {
    my ($self) = @_;
    use Text::Iconv;
    my $iconv = Text::Iconv->new("utf-8", "windows-1251");
    my $xlsx_path = $self->{_excel_filename};
    die("Excel sheet needed for testing is missing") unless $xlsx_path;
    my $book = Spreadsheet::XLSX->new( $xlsx_path, $iconv );
    # Only the first worksheet is inspected.
    my ($first_sheet) = $book->worksheets();
    my ( $first_row, $last_row ) = $first_sheet->row_range();
    my ( $first_col, $last_col ) = $first_sheet->col_range();
    return ($first_row, $last_row, $first_col, $last_col);
}
# Debugging aid: prints the list stored in _good_columns followed by the
# entries of _column_headers joined with " <> ".
sub _display_column_headers {
    my ($self) = @_;
    my @good_cols = @{ $self->{_good_columns} };
    print "\nThe good columns are: @good_cols\n\n";
    print "The column headers are: " . join(' <> ', @{ $self->{_column_headers} }) . "\n\n";
}
# Walks _all_tickets and collects every id value that has already been seen
# on an earlier ticket (so an id occurring three times is reported twice).
# Tickets that lack the unique-id field altogether are ignored.  With _debug1
# set, the ticket count and the number of distinct ids are printed.
# Returns a reference to the array of repeated ids (empty when all ids are
# unique).
sub _check_unique_id_field {
    my ($self) = @_;
    my $id_field = $self->{_unique_id_fieldname};
    my %seen_ids;
    my @repeated_ids;
    for my $tkt (@{ $self->{_all_tickets} }) {
        next unless exists $tkt->{$id_field};
        my $id = $tkt->{$id_field};
        # Post-increment yields the previous count: false on first sighting.
        push @repeated_ids, $id if $seen_ids{$id}++;
    }
    if ($self->{_debug1}) {
        my $ticket_count   = scalar @{ $self->{_all_tickets} };
        my $distinct_count = scalar keys %seen_ids;
        print "Number of tickets: $ticket_count\n";
        print "Number of keys in check hash: $distinct_count\n";
    }
    return \@repeated_ids;
}
# Prints every field of each ticket in _all_tickets whose unique-id value is
# numerically equal to the supplied id.  Fields are listed in sorted key
# order, one per line, with leading and trailing whitespace trimmed from the
# displayed value (the stored ticket is not modified).
sub show_original_ticket_for_given_id {
    my ($self, $id) = @_;
    print "\n\nDisplaying the fields for the ticket $id:\n\n";
    my $id_field = $self->{_unique_id_fieldname};
    TICKET:
    for my $tkt (@{ $self->{_all_tickets} }) {
        next TICKET unless $tkt->{$id_field} == $id;
        for my $field (sort keys %{$tkt}) {
            my $text = $tkt->{$field};
            # Trim the local copy only.
            for ($text) {
                s/^\s+//;
                s/\s+$//;
            }
            printf("%20s ==> %s\n", $field, $text);
        }
    }
}
# Looks up the entry stored in _tickets_by_ids under the given ticket id,
# prints it beneath a banner naming the ticket, and returns it.
sub show_raw_ticket_clustering_data_for_given_id {
    my ($self, $ticket_id) = @_;
    my $raw_text = $self->{_tickets_by_ids}->{$ticket_id};
    my $banner = "\n\nDISPLAYING THE RAW CLUSTERING DATA FOR TICKET $ticket_id:\n\n";
    print "$banner$raw_text\n\n";
    return $raw_text;
}
( run in 2.889 seconds using v1.01-cache-2.11-cpan-5735350b133 )