Algorithm-TicketClusterer

 view release on metacpan or  search on metacpan

lib/Algorithm/TicketClusterer.pm  view on Meta::CPAN


sub get_tickets_from_excel {
    my $self = shift;
    unlink $self->{_raw_tickets_db} if -s $self->{_raw_tickets_db};
    unlink $self->{_processed_tickets_db} if -s $self->{_processed_tickets_db};
    unlink $self->{_synset_cache_db} if -s $self->{_synset_cache_db};
    unlink $self->{_stemmed_tickets_db} if -s $self->{_stemmed_tickets_db};
    unlink $self->{_inverted_index_db} if -s $self->{_inverted_index_db};
    unlink $self->{_tkt_doc_vecs_db} if -s $self->{_tkt_doc_vecs_db};
    unlink $self->{_tkt_doc_vecs_normed_db} if -s $self->{_tkt_doc_vecs_normed_db};   
    unlink glob "$self->{_tickets_vocab_db}.*";   
    unlink glob "$self->{_idf_db}.*";
    my $filename = $self->{_excel_filename} || die("Excel file required"),
    my $clustering_fieldname = $self->{_clustering_fieldname} 
      || die("\nYou forgot to specify a value for the constructor parameter clustering_fieldname that points to the data to be clustered in your Excel sheet -- ");
    my $unique_id_fieldname = $self->{_unique_id_fieldname} 
      || die("\nYou forgot to specify a value for the constructor parameter unique_id_fieldname that is a unique integer identifier for the rows of your Excel sheet -- ");
    my $workbook;
    if ($filename =~ /\.xls$/) {
        my $parser = Spreadsheet::ParseExcel->new();
        $workbook = $parser->parse($filename);
        die $parser->error() unless defined $workbook;

lib/Algorithm/TicketClusterer.pm  view on Meta::CPAN

       die "Something went wrong with restoration of stemmed tickets: $@";
    }
}

####################  Get Ticket Vocabulary and Word Counts #################

sub get_ticket_vocabulary_and_construct_inverted_index {
    my $self = shift;
    my $total_num_of_tickets = keys %{$self->{_processed_tkts_by_ids}};
    $self->{_tickets_vocab_db} = "tickets_vocab.db" unless $self->{_tickets_vocab_db};
    unlink glob "$self->{_tickets_vocab_db}.*";   
    my %vocab_hist_on_disk;
    tie %vocab_hist_on_disk, 'SDBM_File',  
             $self->{_tickets_vocab_db}, O_RDWR|O_CREAT, 0640
            or die "Can't create DBM files: $!";       
    my %inverted_index;
    my $min = $self->{_min_word_length};
    foreach my $ticket_id (sort {$a <=> $b} keys %{$self->{_processed_tkts_by_ids}}) {
        my %uniques = ();
        my $record = $self->{_processed_tkts_by_ids}->{$ticket_id};
        my @brokenup = split /\n|\r|\"|\'|\.|\(|\)|\[|\]|\\|\/|\s+/, $record;

lib/Algorithm/TicketClusterer.pm  view on Meta::CPAN

    foreach (keys %vocab_hist_on_disk) {
        $self->{_vocab_hist}->{$_} = $vocab_hist_on_disk{$_};
    }
    untie %vocab_hist_on_disk;
    $self->{_tkt_vocab_done} = 1;
    $self->{_vocab_size} = scalar( keys %{$self->{_vocab_hist}} );
    print "\n\nVocabulary size:  $self->{_vocab_size}\n\n"
        if $self->{_debug2};
    # Calculate idf(t):
    $self->{_idf_db} = "idf.db" unless $self->{_idf_db};
    unlink glob "$self->{_idf_db}.*";   
    tie my %idf_t_on_disk, 'SDBM_File', $self->{_idf_db}, O_RDWR|O_CREAT, 0640
                                            or die "Can't create DBM files: $!";       
    foreach (keys %{$self->{_vocab_idf_hist}}) {
        $idf_t_on_disk{$_} = abs( (1 + log($total_num_of_tickets
                                           /
                                           (1 + $self->{_vocab_idf_hist}->{$_}))) 
                                           / log(10) ); 
    }
    foreach (keys %idf_t_on_disk) {
        $self->{_idf_t}->{$_} = $idf_t_on_disk{$_};

t/test.t  view on Meta::CPAN


ok( $clustering_data =~ /i am unable/, 'Able to extract the clustering field from Excel' );


## Test 3 (Check Synset Extraction from WordNet):

$tclusterer->expand_all_tickets_with_synonyms();

ok( -s "t/__test_synset_cache_db" > 20, 'Able to extract synsets from WordNet' );

unlink glob "t/__test_*";



( run in 0.546 second using v1.01-cache-2.11-cpan-49f99fa48dc )