Algorithm-TicketClusterer
view release on metacpan or search on metacpan
lib/Algorithm/TicketClusterer.pm view on Meta::CPAN
# Opens the Excel workbook named by $self->{_excel_filename} and reports the
# extent of its first worksheet.
#
# Returns the list ($row_min, $row_max, $col_min, $col_max) obtained from the
# first worksheet's row_range() and col_range().
# Dies if no Excel filename has been set on the object.
sub _test_excel_for_tickets {
    my ($self) = @_;
    use Text::Iconv;
    # Converter object handed to Spreadsheet::XLSX together with the filename.
    my $converter = Text::Iconv->new("utf-8", "windows-1251");
    my $filename = $self->{_excel_filename};
    die("Excel sheet needed for testing is missing") unless $filename;
    my $workbook = Spreadsheet::XLSX->new( $filename, $converter );
    # Only the first worksheet is examined.
    my ($first_sheet) = $workbook->worksheets();
    my ( $row_min, $row_max ) = $first_sheet->row_range();
    my ( $col_min, $col_max ) = $first_sheet->col_range();
    return ($row_min, $row_max, $col_min, $col_max);
}
# Prints two diagnostic lines to the currently selected output handle: the
# contents of $self->{_good_columns}, followed by the entries of
# $self->{_column_headers} joined with ' <> '.
sub _display_column_headers {
    my ($self) = @_;
    my $good_columns = $self->{_good_columns};
    print "\nThe good columns are: @{$good_columns}\n\n";
    my $headers = $self->{_column_headers};
    print "The column headers are: " . join(' <> ', @{$headers}) . "\n\n";
}
# Scans $self->{_all_tickets} for repeated values in the field named by
# $self->{_unique_id_fieldname}.
#
# Tickets that do not have that field at all are ignored.  Every occurrence
# of an id after its first one is recorded, so an id that appears three times
# is listed twice.
#
# When $self->{_debug1} is set, the ticket count and the number of distinct
# ids seen are printed.
#
# Returns a reference to the array of repeated ids, in order of discovery.
sub _check_unique_id_field {
    my ($self) = @_;
    my $id_field = $self->{_unique_id_fieldname};
    my ( %seen, @repeated );
    for my $ticket ( @{ $self->{_all_tickets} } ) {
        next unless exists $ticket->{$id_field};
        my $id = $ticket->{$id_field};
        # Post-increment is false only the first time an id is met.
        push @repeated, $id if $seen{$id}++;
    }
    if ( $self->{_debug1} ) {
        print "Number of tickets: " . scalar( @{ $self->{_all_tickets} } ) . "\n";
        print "Number of keys in check hash: " . scalar( keys %seen ) . "\n";
    }
    return \@repeated;
}
# Prints every field of each ticket in $self->{_all_tickets} whose unique-id
# field is numerically equal to $id.
#
# A heading naming the requested id is printed first.  Fields are listed in
# sorted key order, one per line, with leading and trailing whitespace
# removed from the displayed value (the stored ticket is not modified).
sub show_original_ticket_for_given_id {
    my ( $self, $id ) = @_;
    print "\n\nDisplaying the fields for the ticket $id:\n\n";
    my $id_field = $self->{_unique_id_fieldname};
  TICKET:
    for my $ticket ( @{ $self->{_all_tickets} } ) {
        # Ids are compared as numbers.
        next TICKET unless $ticket->{$id_field} == $id;
        for my $key ( sort keys %{$ticket} ) {
            my $value = $ticket->{$key};
            # Trim the local copy only.
            for ($value) {
                s/^\s+//;
                s/\s+$//;
            }
            printf( "%20s ==> %s\n", $key, $value );
        }
    }
}
# Prints the entry stored in $self->{_tickets_by_ids} for $ticket_id under a
# heading, and returns that entry.
sub show_raw_ticket_clustering_data_for_given_id {
    my ( $self, $ticket_id ) = @_;
    my $record = $self->{_tickets_by_ids}{$ticket_id};
    my $heading = "\n\nDISPLAYING THE RAW CLUSTERING DATA FOR TICKET $ticket_id:\n\n";
    print $heading . "$record\n\n";
    return $record;
}
# Needed by test.t
# Returns the entry stored in $self->{_tickets_by_ids} for $ticket_id without
# printing anything.
sub _raw_ticket_clustering_data_for_given_id {
    my ( $self, $ticket_id ) = @_;
    return $self->{_tickets_by_ids}{$ticket_id};
}
# Prints the entry stored in $self->{_processed_tkts_by_ids} for $ticket_id
# under a heading.
sub show_processed_ticket_clustering_data_for_given_id {
    my ( $self, $ticket_id ) = @_;
    my $record = $self->{_processed_tkts_by_ids}{$ticket_id};
    my $heading = "\n\nDISPLAYING PROCESSED CLUSTERING DATA FOR TICKET $ticket_id:\n\n";
    print $heading . "$record\n\n";
}
# Prints the entry stored in $self->{_stemmed_tkts_by_ids} for $ticket_id
# under a heading.
sub show_stemmed_ticket_clustering_data_for_given_id {
    my ( $self, $ticket_id ) = @_;
    my $record = $self->{_stemmed_tkts_by_ids}{$ticket_id};
    my $heading = "\n\nDISPLAYING STEMMED CLUSTERING DATA FOR TICKET $ticket_id:\n\n";
    print $heading . "$record\n\n";
}
# Diagnostic helper for inspecting the array stored in $self->{_all_tickets},
# each element of which is a record that represents one row of the Excel
# file.
#
# Prints the number of stored records, then one "field ==> value" line for
# every field of the record at index $row_num, in sorted field order.
# Dies when $row_num is not less than the number of records.
sub _show_row {
    my ( $self, $row_num ) = @_;
    my $tickets    = $self->{_all_tickets};
    my $total_rows = scalar @{$tickets};
    print "There are $total_rows items in the \$all_tickets array\n";
    $row_num < $total_rows
        or die "The row that you want to see does not exist";
    # Work on a shallow copy of the selected record.
    my %fields = %{ $tickets->[$row_num] };
    for my $name ( sort keys %fields ) {
        no warnings;    # a field's value may be undefined
        my $value = $fields{$name};
        print "$name ==> $value\n";
    }
}
# Serializes the raw ticket records held in $self->{_all_tickets} to the file
# named by $self->{_raw_tickets_db} using Storable's store().
#
# If no database filename has been set, "raw_tickets.db" is used and recorded
# on the object.  Any existing file of that name is removed before writing.
#
# Dies if store() throws an exception, or if it reports failure by returning
# a false value (Storable signals ordinary I/O errors that way instead of
# dying, so the return value has to be checked explicitly).
sub store_raw_tickets_on_disk {
    my $self = shift;
    $self->{_raw_tickets_db} = "raw_tickets.db" unless $self->{_raw_tickets_db};
    my $db_file = $self->{_raw_tickets_db};
    unlink $db_file;
    my $stored = eval { store( $self->{_all_tickets}, $db_file ) };
    if ($@) {
        die "Something went wrong with disk storage of ticket data: $@";
    }
    # store() returns a true value on success and undef on an I/O error.
    unless ($stored) {
        die "Something went wrong with disk storage of ticket data: "
          . "store() reported a failure while writing $db_file";
    }
    return;
}
# Reloads the raw ticket records previously written with Storable and
# rebuilds the id-to-text lookup table.
#
# Reads from the file named by $self->{_raw_tickets_db}; when that is not
# set, "raw_tickets.db" is used, the same default that
# store_raw_tickets_on_disk applies.
#
# On success:
#   $self->{_all_tickets}        holds the retrieved array of ticket records
#   $self->{_tickets_by_ids}     maps each ticket's unique-id field value to
#                                the lowercased value of its clustering field
#                                (an empty string when that field is undefined)
#   $self->{_total_num_tickets}  holds the number of keys in that map
#
# Returns the total number of tickets.
#
# Dies if clustering_fieldname or unique_id_fieldname was not supplied to the
# constructor, if retrieve() throws an exception, or if retrieve() returns
# undef (Storable reports I/O and truncated-file errors that way).
sub restore_raw_tickets_from_disk {
    my $self = shift;
    my $clustering_fieldname = $self->{_clustering_fieldname}
        || die("\nYou forgot to specify a value for the constructor parameter clustering_fieldname that points to the data to be clustered in your Excel sheet -- ");
    my $unique_id_fieldname = $self->{_unique_id_fieldname}
        || die("\nYou forgot to specify a value for the constructor parameter unique_id_fieldname that is a unique integer identifier for the rows of your Excel sheet -- ");
    my $db_file = $self->{_raw_tickets_db} || "raw_tickets.db";
    my $tickets = eval { retrieve($db_file) };
    if ($@) {
        die "Unable to retrieve raw tickets from disk: $@";
    }
    # An undef result means the read failed without raising an exception;
    # carrying on would silently leave the object with zero tickets.
    unless ( defined $tickets ) {
        die "Unable to retrieve raw tickets from disk: "
          . "retrieve() returned no data for $db_file";
    }
    $self->{_all_tickets} = $tickets;
    foreach my $ticket ( @{$tickets} ) {
        my $text = $ticket->{$clustering_fieldname};
        # Guard lc() against records that lack the clustering field.
        $self->{_tickets_by_ids}->{ $ticket->{$unique_id_fieldname} } =
            defined $text ? lc($text) : '';
    }
    $self->{_total_num_tickets} = scalar keys %{ $self->{_tickets_by_ids} };
    return $self->{_total_num_tickets};
}
# Calls _delete_markup_from_one_ticket() once for every record in
# $self->{_all_tickets}, passing the value of that record's unique-id field.
sub delete_markup_from_all_tickets {
    my ($self) = @_;
    my $id_field = $self->{_unique_id_fieldname};
    for my $ticket ( @{ $self->{_all_tickets} } ) {
        $self->_delete_markup_from_one_ticket( $ticket->{$id_field} );
    }
}
sub _delete_markup_from_one_ticket {
( run in 1.714 second using v1.01-cache-2.11-cpan-140bd7fdf52 )