Algorithm-TicketClusterer
view release on metacpan or search on metacpan
lib/Algorithm/TicketClusterer.pm view on Meta::CPAN
sub get_tickets_from_excel {
my $self = shift;
unlink $self->{_raw_tickets_db} if -s $self->{_raw_tickets_db};
unlink $self->{_processed_tickets_db} if -s $self->{_processed_tickets_db};
unlink $self->{_synset_cache_db} if -s $self->{_synset_cache_db};
unlink $self->{_stemmed_tickets_db} if -s $self->{_stemmed_tickets_db};
unlink $self->{_inverted_index_db} if -s $self->{_inverted_index_db};
unlink $self->{_tkt_doc_vecs_db} if -s $self->{_tkt_doc_vecs_db};
unlink $self->{_tkt_doc_vecs_normed_db} if -s $self->{_tkt_doc_vecs_normed_db};
unlink glob "$self->{_tickets_vocab_db}.*";
unlink glob "$self->{_idf_db}.*";
my $filename = $self->{_excel_filename} || die("Excel file required"),
my $clustering_fieldname = $self->{_clustering_fieldname}
|| die("\nYou forgot to specify a value for the constructor parameter clustering_fieldname that points to the data to be clustered in your Excel sheet -- ");
my $unique_id_fieldname = $self->{_unique_id_fieldname}
|| die("\nYou forgot to specify a value for the constructor parameter unique_id_fieldname that is a unique integer identifier for the rows of your Excel sheet -- ");
my $workbook;
if ($filename =~ /\.xls$/) {
my $parser = Spreadsheet::ParseExcel->new();
$workbook = $parser->parse($filename);
die $parser->error() unless defined $workbook;
lib/Algorithm/TicketClusterer.pm view on Meta::CPAN
die "Something went wrong with restoration of stemmed tickets: $@";
}
}
#################### Get Ticket Vocabulary and Word Counts #################
sub get_ticket_vocabulary_and_construct_inverted_index {
my $self = shift;
my $total_num_of_tickets = keys %{$self->{_processed_tkts_by_ids}};
$self->{_tickets_vocab_db} = "tickets_vocab.db" unless $self->{_tickets_vocab_db};
unlink glob "$self->{_tickets_vocab_db}.*";
my %vocab_hist_on_disk;
tie %vocab_hist_on_disk, 'SDBM_File',
$self->{_tickets_vocab_db}, O_RDWR|O_CREAT, 0640
or die "Can't create DBM files: $!";
my %inverted_index;
my $min = $self->{_min_word_length};
foreach my $ticket_id (sort {$a <=> $b} keys %{$self->{_processed_tkts_by_ids}}) {
my %uniques = ();
my $record = $self->{_processed_tkts_by_ids}->{$ticket_id};
my @brokenup = split /\n|\r|\"|\'|\.|\(|\)|\[|\]|\\|\/|\s+/, $record;
lib/Algorithm/TicketClusterer.pm view on Meta::CPAN
foreach (keys %vocab_hist_on_disk) {
$self->{_vocab_hist}->{$_} = $vocab_hist_on_disk{$_};
}
untie %vocab_hist_on_disk;
$self->{_tkt_vocab_done} = 1;
$self->{_vocab_size} = scalar( keys %{$self->{_vocab_hist}} );
print "\n\nVocabulary size: $self->{_vocab_size}\n\n"
if $self->{_debug2};
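# Calculate idf(t) for every term t in _vocab_idf_hist and persist it in the
# SDBM file named by _idf_db:
#     idf(t) = abs( (1 + ln(N / (1 + h_t))) / ln(10) )
# where N is $total_num_of_tickets and h_t is the term's _vocab_idf_hist entry.
# Perl's log() is the natural log, and the 1 is added before dividing by
# ln(10), so the value equals abs( log10(N / (1 + h_t)) + 1/ln(10) ).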
$self->{_idf_db} = "idf.db" unless $self->{_idf_db};
unlink glob "$self->{_idf_db}.*";
tie my %idf_t_on_disk, 'SDBM_File', $self->{_idf_db}, O_RDWR|O_CREAT, 0640
or die "Can't create DBM files: $!";
foreach (keys %{$self->{_vocab_idf_hist}}) {
$idf_t_on_disk{$_} = abs( (1 + log($total_num_of_tickets
/
(1 + $self->{_vocab_idf_hist}->{$_})))
/ log(10) );
}
foreach (keys %idf_t_on_disk) {
$self->{_idf_t}->{$_} = $idf_t_on_disk{$_};
ok( $clustering_data =~ /i am unable/, 'Able to extract the clustering field from Excel' );
## Test 3 (Check Synset Extraction from WordNet):
$tclusterer->expand_all_tickets_with_synonyms();
ok( -s "t/__test_synset_cache_db" > 20, 'Able to extract synsets from WordNet' );
unlink glob "t/__test_*";
( run in 0.546 second using v1.01-cache-2.11-cpan-49f99fa48dc )