Algorithm-TicketClusterer

 view release on metacpan or  search on metacpan

lib/Algorithm/TicketClusterer.pm  view on Meta::CPAN


# Diagnostic helper: opens the workbook named by $self->{_excel_filename}
# with Spreadsheet::XLSX (handing it a Text::Iconv converter built from
# "utf-8" and "windows-1251") and reports the extent of its first worksheet.
#
# Returns the list ($row_min, $row_max, $col_min, $col_max).
# Dies if no Excel filename has been set on the object.
sub _test_excel_for_tickets {
    my ($self) = @_;
    use Text::Iconv;
    my $converter = Text::Iconv->new("utf-8", "windows-1251");
    my $filename = $self->{_excel_filename};
    die("Excel sheet needed for testing is missing") unless $filename;
    my $workbook = Spreadsheet::XLSX->new( $filename, $converter );
    # Only the first worksheet is inspected.
    my ($first_sheet) = $workbook->worksheets();
    my @row_bounds = $first_sheet->row_range();
    my @col_bounds = $first_sheet->col_range();
    return ( $row_bounds[0], $row_bounds[1], $col_bounds[0], $col_bounds[1] );
}

# Prints two diagnostic lines to the currently selected output handle:
# the entries of $self->{_good_columns} (separated by $", as in string
# interpolation) and the entries of $self->{_column_headers} joined
# with '  <>  '.
sub _display_column_headers {
    my ($self) = @_;
    print "\nThe good columns are: "
        . join( $", @{ $self->{_good_columns} } ) . "\n\n";
    print "The column headers are: "
        . join( '  <>  ', @{ $self->{_column_headers} } ) . "\n\n";
}

# Scans $self->{_all_tickets} for repeated values in the field named by
# $self->{_unique_id_fieldname}.  Tickets that do not have that field at
# all are ignored.
#
# Returns a reference to an array holding every repeated ID, one entry per
# repeat (an ID seen three times is listed twice).  When $self->{_debug1}
# is true, the ticket count and the number of distinct IDs are printed.
sub _check_unique_id_field {
    my ($self) = @_;
    my $id_field = $self->{_unique_id_fieldname};
    my ( %seen_ids, @repeated_ids );
    for my $ticket ( @{ $self->{_all_tickets} } ) {
        next unless exists $ticket->{$id_field};
        my $ticket_id = $ticket->{$id_field};
        push @repeated_ids, $ticket_id if exists $seen_ids{$ticket_id};
        $seen_ids{$ticket_id} = 1;
    }
    if ( $self->{_debug1} ) {
        print "Number of tickets: " . scalar( @{ $self->{_all_tickets} } ) . "\n";
        print "Number of keys in check hash: " . scalar( keys %seen_ids ) . "\n";
    }
    return \@repeated_ids;
}

# Prints every field of each ticket in $self->{_all_tickets} whose unique-ID
# field (named by $self->{_unique_id_fieldname}) is numerically equal to $id.
# Fields are listed in sorted key order with surrounding whitespace trimmed
# from the values.
#
# Parameters:
#   $id - the ticket ID to look up (compared with ==).
#
# Tickets that have no value in the ID field are skipped: an undefined ID
# would otherwise compare equal to 0 and be displayed as ticket 0.
sub show_original_ticket_for_given_id {
    my $self = shift;
    my $id = shift;
    print "\n\nDisplaying the fields for the ticket $id:\n\n";
    my $id_field = $self->{_unique_id_fieldname};
    foreach my $ticket (@{$self->{_all_tickets}}) {
        my $ticket_id = $ticket->{$id_field};
        next unless defined $ticket_id;
        next unless $ticket_id == $id;
        foreach my $key (sort keys %{$ticket}) {
            # An empty cell is shown as an empty string rather than being
            # pushed through the trimming substitutions as undef.
            my $value = defined $ticket->{$key} ? $ticket->{$key} : '';
            $value =~ s/^\s+//;
            $value =~ s/\s+$//;
            printf("%20s  ==>  %s\n", $key, $value);
        }
    }
}

# Prints, under a banner, the entry stored for $ticket_id in
# $self->{_tickets_by_ids} and also returns that entry to the caller.
sub show_raw_ticket_clustering_data_for_given_id {
    my ( $self, $ticket_id ) = @_;
    my $raw_text = $self->{_tickets_by_ids}->{$ticket_id};
    my $banner   = "\n\nDISPLAYING THE RAW CLUSTERING DATA FOR TICKET $ticket_id:\n\n";
    print $banner . "$raw_text\n\n";
    return $raw_text;
}

# Needed by test.t
# Silent lookup: returns the entry stored for $ticket_id in
# $self->{_tickets_by_ids} without printing anything.
sub _raw_ticket_clustering_data_for_given_id {
    my ( $self, $ticket_id ) = @_;
    return $self->{_tickets_by_ids}->{$ticket_id};
}


# Prints, under a banner, the entry stored for $ticket_id in
# $self->{_processed_tkts_by_ids}.  The sub's value is whatever the final
# print returns.
sub show_processed_ticket_clustering_data_for_given_id {
    my ( $self, $ticket_id ) = @_;
    my $processed_text = $self->{_processed_tkts_by_ids}->{$ticket_id};
    my $banner = "\n\nDISPLAYING PROCESSED CLUSTERING DATA FOR TICKET $ticket_id:\n\n";
    print $banner . "$processed_text\n\n";
}

# Prints, under a banner, the entry stored for $ticket_id in
# $self->{_stemmed_tkts_by_ids}.  The sub's value is whatever the final
# print returns.
sub show_stemmed_ticket_clustering_data_for_given_id {
    my ( $self, $ticket_id ) = @_;
    my $stemmed_text = $self->{_stemmed_tkts_by_ids}->{$ticket_id};
    my $banner = "\n\nDISPLAYING STEMMED CLUSTERING DATA FOR TICKET $ticket_id:\n\n";
    print $banner . "$stemmed_text\n\n";
}

# Diagnostic aid for inspecting the array held in $self->{_all_tickets},
# where each element is a record representing one row of the Excel file.
# Prints the size of that array followed by every "field  ==>  value" pair
# of the record at index $row_num, in sorted field order.
#
# Dies when $row_num is not smaller than the number of stored records.
sub _show_row {
    my ( $self, $row_num ) = @_;
    my $total_rows = scalar @{ $self->{_all_tickets} };
    print "There are $total_rows items in the \$all_tickets array\n";
    unless ( $row_num < $total_rows ) {
        die "The row that you want to see does not exist";
    }
    my %fields = %{ $self->{_all_tickets}->[$row_num] };
    {
        # Warnings are switched off so that fields with undefined values
        # print as empty strings without complaint.
        no warnings;
        print "$_  ==>  $fields{$_}\n" for sort keys %fields;
    }
}

# Serialises $self->{_all_tickets} to the file named by
# $self->{_raw_tickets_db}, defaulting that name to "raw_tickets.db" when it
# is unset.  Any existing file of that name is removed first.
#
# Dies if the store() call throws OR if it reports failure through a false
# return value.
sub store_raw_tickets_on_disk {
    my $self = shift;
    $self->{_raw_tickets_db} ||= "raw_tickets.db";
    my $db_file = $self->{_raw_tickets_db};
    # The file may legitimately not exist yet, so unlink's result is ignored.
    unlink $db_file;
    # store() is presumably Storable's (imported elsewhere in this file).
    # Storable documents that store() signals I/O problems by returning
    # undef rather than dying, so the return value must be checked as well
    # as $@ -- otherwise a failed write goes unnoticed.
    my $stored = eval { store( $self->{_all_tickets}, $db_file ) };
    if ($@) {
        die "Something went wrong with disk storage of ticket data: $@";
    }
    unless ($stored) {
        die "Something went wrong with disk storage of ticket data: "
          . "store() reported a failure while writing $db_file";
    }
}

# Reloads the ticket records from the file named by $self->{_raw_tickets_db}
# into $self->{_all_tickets}, then rebuilds $self->{_tickets_by_ids} (unique
# ID => lower-cased clustering text) and $self->{_total_num_tickets}.
#
# Dies if either the clustering_fieldname or unique_id_fieldname constructor
# parameter is missing, if retrieve() throws, or if retrieve() reports
# failure by returning undef.
sub restore_raw_tickets_from_disk {
    my $self = shift;
    my $clustering_fieldname = $self->{_clustering_fieldname} 
      || die("\nYou forgot to specify a value for the constructor parameter clustering_fieldname that points to the data to be clustered in your Excel sheet -- ");
    my $unique_id_fieldname = $self->{_unique_id_fieldname} 
      || die("\nYou forgot to specify a value for the constructor parameter unique_id_fieldname that is a unique integer identifier for the rows of your Excel sheet -- ");
    # retrieve() is presumably Storable's (imported elsewhere in this file).
    # Storable documents that retrieve() returns undef on an I/O error
    # instead of dying; without this check the loop below would silently
    # see an empty ticket list.
    my $tickets = eval { retrieve( $self->{_raw_tickets_db} ) };
    if ($@) {                                 
        die "Unable to retrieve raw tickets from disk: $@";
    }
    unless (defined $tickets) {
        die "Unable to retrieve raw tickets from disk: "
          . "retrieve() reported a failure while reading $self->{_raw_tickets_db}";
    }
    # Only replace the in-memory tickets once the read has succeeded.
    $self->{_all_tickets} = $tickets;
    foreach my $ticket (@{$self->{_all_tickets}}) {    
        $self->{_tickets_by_ids}->{$ticket->{$unique_id_fieldname}} =
            lc($ticket->{$clustering_fieldname});
    }
    $self->{_total_num_tickets} = scalar keys %{$self->{_tickets_by_ids}};
}

# Calls $self->_delete_markup_from_one_ticket once for every record in
# $self->{_all_tickets}, passing the value of that record's unique-ID field
# (the field named by $self->{_unique_id_fieldname}).
sub delete_markup_from_all_tickets {
    my ($self) = @_;
    my $id_field = $self->{_unique_id_fieldname};
    for my $ticket ( @{ $self->{_all_tickets} } ) {
        my $ticket_id = $ticket->{$id_field};
        $self->_delete_markup_from_one_ticket($ticket_id);
    }
}

sub _delete_markup_from_one_ticket {



( run in 1.714 second using v1.01-cache-2.11-cpan-140bd7fdf52 )