Algorithm-TicketClusterer

 view release on metacpan or  search on metacpan

examples/ticket_preprocessor_and_doc_modeler.pl  view on Meta::CPAN

###  can retrieve similar tickets from the tickets stored in the
###  spreadsheet.

###  This script calls on a user to specify names for the nine databases
###  that are created for the tickets.  This is to avoid having to process
###  all the tickets every time you need to make a retrieval for a new
###  ticket.


#use lib '../blib/lib', '../blib/arch';

use strict;
use Algorithm::TicketClusterer;

my $excel_filename = "ExampleExcelFile.xls";
#my $excel_filename = "SampleTest.xlsx";
my $fieldname_for_clustering = "Description";
my $unique_id_fieldname = "Request No";
my $raw_tickets_db = "raw_tickets.db";
my $processed_tickets_db = "processed_tickets.db";
my $stemmed_tickets_db = "stemmed_tickets.db";
my $inverted_index_db = "inverted_index.db";
my $tickets_vocab_db = "tickets_vocab.db";
my $idf_db = "idf.db";
my $tkt_doc_vecs_db = "tkt_doc_vecs.db";
my $tkt_doc_vecs_normed_db = "tkt_doc_vecs_normed.db";
my $synset_cache_db = "synset_cache.db";
my $stop_words_file = "stop_words.txt";
my $misspelled_words_file = "misspelled_words.txt";

my $clusterer = Algorithm::TicketClusterer->new( 

                     excel_filename            => $excel_filename,
                     which_worksheet           => 1,
                     clustering_fieldname      => $fieldname_for_clustering,
                     unique_id_fieldname       => $unique_id_fieldname,
                     raw_tickets_db            => $raw_tickets_db,
                     processed_tickets_db      => $processed_tickets_db,
                     stemmed_tickets_db        => $stemmed_tickets_db,
                     inverted_index_db         => $inverted_index_db,
                     tickets_vocab_db          => $tickets_vocab_db,
                     idf_db                    => $idf_db,
                     tkt_doc_vecs_db           => $tkt_doc_vecs_db,
                     tkt_doc_vecs_normed_db    => $tkt_doc_vecs_normed_db,
                     synset_cache_db           => $synset_cache_db,
                     stop_words_file           => $stop_words_file,
                     misspelled_words_file     => $misspelled_words_file,
                     add_synsets_to_tickets    => 1,
                     want_synset_caching       => 1,
                     max_num_syn_words         => 3,
                     min_word_length           => 4,
                     want_stemming             => 1,
                );

## Extract information from Excel spreadsheets:
$clusterer->get_tickets_from_excel();

## Apply cleanup filters and add synonyms:
$clusterer->delete_markup_from_all_tickets();
$clusterer->apply_filter_to_all_tickets();
$clusterer->expand_all_tickets_with_synonyms();
$clusterer->store_processed_tickets_on_disk();

## Construct the VSM doc model for the tickets:
$clusterer->get_ticket_vocabulary_and_construct_inverted_index();
$clusterer->construct_doc_vectors_for_all_tickets();
$clusterer->store_stemmed_tickets_and_inverted_index_on_disk();
$clusterer->store_ticket_vectors();



( run in 1.433 second using v1.01-cache-2.11-cpan-5b529ec07f3 )