Algorithm-TicketClusterer

 view release on metacpan or  search on metacpan

lib/Algorithm/TicketClusterer.pm  view on Meta::CPAN

        _excel_filename         =>   $args{excel_filename}, 
        _which_worksheet        =>   $args{which_worksheet},
        _raw_tickets_db         =>   $args{raw_tickets_db}, 
        _processed_tickets_db   =>   $args{processed_tickets_db}, 
        _synset_cache_db        =>   $args{synset_cache_db}, 
        _stemmed_tickets_db     =>   $args{stemmed_tickets_db}, 
        _inverted_index_db      =>   $args{inverted_index_db},
        _tickets_vocab_db       =>   $args{tickets_vocab_db},
        _idf_db                 =>   $args{idf_db}, 
        _tkt_doc_vecs_db        =>   $args{tkt_doc_vecs_db},
        _tkt_doc_vecs_normed_db =>   $args{tkt_doc_vecs_normed_db},
        _clustering_fieldname   =>   $args{clustering_fieldname}, 
        _unique_id_fieldname    =>   $args{unique_id_fieldname}, 
        _stop_words_file        =>   $args{stop_words_file},
        _misspelled_words_file  =>   $args{misspelled_words_file},
        _min_word_length        =>   $args{min_word_length} || 4,
        _add_synsets_to_tickets =>   $args{add_synsets_to_tickets} || 0,
        _want_stemming          =>   $args{want_stemming} || 0,
        _how_many_retrievals    =>   $args{how_many_retrievals} || 5,
        _min_idf_threshold      =>   $args{min_idf_threshold},
        _max_num_syn_words      =>   $args{max_num_syn_words} || 3,
        _want_synset_caching    =>   $args{want_synset_caching} || 0,
        _stop_words             =>   {},
        _all_tickets            =>   [],
        _column_headers         =>   [],
        _good_columns           =>   [],
        _tickets_by_ids         =>   {},
        _processed_tkts_by_ids  =>   {},
        _stemmed_tkts_by_ids    =>   {},
        _misspelled_words       =>   {},
        _total_num_tickets      =>   0,
        _synset_cache           =>   {},
        _vocab_hash             =>   {},
        _vocab_idf_hist         =>   {},
        _idf_t                  =>   {},
        _vocab_size             =>   undef,
        _doc_vector_template    =>   {},
        _tkt_doc_vecs           =>   {},
        _tkt_doc_vecs_normed    =>   {},
        _query_ticket_id        =>   undef,
        _inverted_index         =>   {},
        _debug1                 =>   $args{debug1} || 0, # for processing Excel
        _debug2                 =>   $args{debug2} || 0, # for modeling tickets
        _debug3                 =>   $args{debug3} || 0, # for retrieving similar tickets
        _wn                     =>   WordNet::QueryData->new( verbose => 0, 
                                                              noload => 1 ),
    }, $class;
}

#############################  Extract info from Excel  #######################

sub get_tickets_from_excel {
    my $self = shift;
    unlink $self->{_raw_tickets_db} if -s $self->{_raw_tickets_db};
    unlink $self->{_processed_tickets_db} if -s $self->{_processed_tickets_db};
    unlink $self->{_synset_cache_db} if -s $self->{_synset_cache_db};
    unlink $self->{_stemmed_tickets_db} if -s $self->{_stemmed_tickets_db};
    unlink $self->{_inverted_index_db} if -s $self->{_inverted_index_db};
    unlink $self->{_tkt_doc_vecs_db} if -s $self->{_tkt_doc_vecs_db};
    unlink $self->{_tkt_doc_vecs_normed_db} if -s $self->{_tkt_doc_vecs_normed_db};   
    unlink glob "$self->{_tickets_vocab_db}.*";   
    unlink glob "$self->{_idf_db}.*";
    my $filename = $self->{_excel_filename} || die("Excel file required"),
    my $clustering_fieldname = $self->{_clustering_fieldname} 
      || die("\nYou forgot to specify a value for the constructor parameter clustering_fieldname that points to the data to be clustered in your Excel sheet -- ");
    my $unique_id_fieldname = $self->{_unique_id_fieldname} 
      || die("\nYou forgot to specify a value for the constructor parameter unique_id_fieldname that is a unique integer identifier for the rows of your Excel sheet -- ");
    my $workbook;
    if ($filename =~ /\.xls$/) {
        my $parser = Spreadsheet::ParseExcel->new();
        $workbook = $parser->parse($filename);
        die $parser->error() unless defined $workbook;
    } elsif ($filename =~ /\.xlsx$/) {
#        use Text::Iconv;
        my $converter = Text::Iconv->new("utf-8", "windows-1251");
        $workbook = Spreadsheet::XLSX->new($filename, $converter);
    } else {
        die "File suffix on the Excel file not recognized";
    }
    my @worksheets = $workbook->worksheets();
    my $which_worksheet = $self->{_which_worksheet} || 
        die "\nYou have not specified which Excel worksheet contains the tickets\n";
    my ( $row_min, $row_max ) = $worksheets[$which_worksheet-1]->row_range();
    my ( $col_min, $col_max ) = $worksheets[$which_worksheet-1]->col_range();
    my @good_columns;
    my $col_headers_row;
    my $col_headers_found = 0;
    my $col_index_for_unique_id;
    my $col_index_for_clustering_field;
    for my $row ( $row_min .. $row_max ) {
        last if $col_headers_found;
        @good_columns = ();
        for my $col ( $col_min .. $col_max ) {
            my $cell = 
                   $worksheets[$which_worksheet-1]->get_cell( $row, $col );
            next unless $cell;
            my $cell_value = _get_rid_of_wide_chars($cell->value());
            push @good_columns, $col if $cell_value;
            if ($cell_value eq $unique_id_fieldname) {
                $col_index_for_unique_id = $col;
                $col_headers_row = $row;
                $col_headers_found = 1;
            }
            if ($cell_value eq $clustering_fieldname) {
                $col_index_for_clustering_field = $col;
            }
        }
    }
    $self->{_good_columns} = \@good_columns;
    print "\nThe unique id is in column: $col_index_for_unique_id\n"
        if $self->{_debug1};
    print "The clustering field is in column: " .
                "$col_index_for_clustering_field\n\n" if $self->{_debug1};
    my %Column_Headers;
    foreach my $field_index (0..@good_columns-1) {
        my $key = "field_" . $field_index;
        $Column_Headers{$key} = "";
    }
    my @col_headers = map {
        my $cell = 
           $worksheets[$which_worksheet-1]->get_cell($col_headers_row, $_);
        $cell ? _get_rid_of_wide_chars($cell->value()) : '';

lib/Algorithm/TicketClusterer.pm  view on Meta::CPAN

# Serializes the processed tickets ($self->{_processed_tkts_by_ids}) to the
# file named by $self->{_processed_tickets_db} using Storable's store().
#
# If no database filename was configured, "processed_tickets.db" is used and
# recorded on the object.  Any previous file of that name is removed first.
#
# Dies if store() throws an exception, and also if store() returns a false
# value, which is how Storable reports an I/O failure; without that second
# check a failed write would go unnoticed.
sub store_processed_tickets_on_disk {
    my $self = shift;
    $self->{_processed_tickets_db} = "processed_tickets.db" unless $self->{_processed_tickets_db};
    unlink $self->{_processed_tickets_db};
    my $stored = eval {
        store( $self->{_processed_tkts_by_ids}, $self->{_processed_tickets_db} );
    };
    if ($@) {
       die "Something went wrong with disk storage of processed tickets: $@";
    }
    # store() signals I/O problems with a false return value instead of dying.
    die "Something went wrong with disk storage of processed tickets: "
      . "store() reported an I/O failure ($!)"
        unless $stored;
    return;
}

# Serializes two in-memory structures to disk with Storable's store():
#
#   * $self->{_stemmed_tkts_by_ids}  -> file named by $self->{_stemmed_tickets_db}
#   * $self->{_inverted_index}       -> file named by $self->{_inverted_index_db}
#
# Missing filenames default to "stemmed_tickets.db" and "inverted_index.db"
# respectively (the defaults are recorded on the object).  Existing files with
# those names are removed before writing.  A progress line is printed before
# each write.
#
# Dies if either store() call throws, and also if either returns a false
# value, which is how Storable reports an I/O failure.
sub store_stemmed_tickets_and_inverted_index_on_disk {
    my $self = shift;
    $self->{_stemmed_tickets_db} = "stemmed_tickets.db" unless $self->{_stemmed_tickets_db};
    unlink $self->{_stemmed_tickets_db};
    my $stored_tickets = eval {
        print "\n\nStoring stemmed tickets on disk\n\n";
        store( $self->{_stemmed_tkts_by_ids}, $self->{_stemmed_tickets_db} );
    };
    if ($@) {
       die "Something went wrong with disk storage of stemmed tickets: $@";
    }
    # store() signals I/O problems with a false return value instead of dying.
    die "Something went wrong with disk storage of stemmed tickets: "
      . "store() reported an I/O failure ($!)"
        unless $stored_tickets;
    $self->{_inverted_index_db} = "inverted_index.db" unless $self->{_inverted_index_db};
    unlink $self->{_inverted_index_db};
    my $stored_index = eval {
        print "\nStoring inverted index on disk\n\n";
        store( $self->{_inverted_index}, $self->{_inverted_index_db} );
    };
    if ($@) {
       die "Something went wrong with disk storage of the inverted index: $@";
    }
    die "Something went wrong with disk storage of the inverted index: "
      . "store() reported an I/O failure ($!)"
        unless $stored_index;
    return;
}

# Reloads the processed tickets from the Storable file named by
# $self->{_processed_tickets_db} into $self->{_processed_tkts_by_ids}.
#
# Dies if retrieve() throws, and also if it returns undef, which Storable
# does for I/O problems such as an empty or truncated file.  The in-memory
# tickets are replaced only after a successful read, so a failed restore
# never leaves the object holding undef in place of its ticket hash.
sub restore_processed_tickets_from_disk {
    my $self = shift;
    my $restored = eval { retrieve( $self->{_processed_tickets_db} ) };
    if ($@) {
       die "Something went wrong with restoration of processed tickets: $@";
    }
    die "Something went wrong with restoration of processed tickets: "
      . "nothing could be read from $self->{_processed_tickets_db}"
        unless defined $restored;
    $self->{_processed_tkts_by_ids} = $restored;
    return;
}

# Reloads the stemmed tickets from the Storable file named by
# $self->{_stemmed_tickets_db} into $self->{_stemmed_tkts_by_ids}.
#
# Dies if retrieve() throws, and also if it returns undef, which Storable
# does for I/O problems such as an empty or truncated file.  The in-memory
# tickets are replaced only after a successful read, so a failed restore
# never leaves the object holding undef in place of its ticket hash.
sub restore_stemmed_tickets_from_disk {
    my $self = shift;
    my $restored = eval { retrieve( $self->{_stemmed_tickets_db} ) };
    if ($@) {
       die "Something went wrong with restoration of stemmed tickets: $@";
    }
    die "Something went wrong with restoration of stemmed tickets: "
      . "nothing could be read from $self->{_stemmed_tickets_db}"
        unless defined $restored;
    $self->{_stemmed_tkts_by_ids} = $restored;
    return;
}

####################  Get Ticket Vocabulary and Word Counts #################

# Builds the word statistics for the processed tickets held in
# $self->{_processed_tkts_by_ids}.  For every ticket (visited in ascending
# numeric id order) the text is split into fragments, each fragment is
# reduced to its first run of at least $self->{_min_word_length} characters
# from [a-z0-9_], and the surviving words are passed through
# _simple_stemmer().  From the lowercased results this routine fills:
#
#   _vocab_hist          word => total number of occurrences in all tickets
#   _vocab_idf_hist      word => number of tickets containing the word
#   _inverted_index      word => [ ids of tickets containing the word ]
#   _stemmed_tkts_by_ids ticket id => space-joined stemmed words
#   _vocab_size          number of distinct words in _vocab_hist
#   _idf_t               word => idf value (see the formula below)
#   _tkt_vocab_done      set to 1
#
# The word counts and idf values are also written to SDBM files named by
# $self->{_tickets_vocab_db} and $self->{_idf_db} (defaulting to
# "tickets_vocab.db" and "idf.db"); existing files matching "<name>.*" are
# removed first.  Dies if either DBM file cannot be created.
sub get_ticket_vocabulary_and_construct_inverted_index {
    my $self = shift;
    my $total_num_of_tickets = keys %{$self->{_processed_tkts_by_ids}};
    $self->{_tickets_vocab_db} = "tickets_vocab.db" unless $self->{_tickets_vocab_db};
    unlink glob "$self->{_tickets_vocab_db}.*";
    my %vocab_hist_on_disk;
    tie %vocab_hist_on_disk, 'SDBM_File',
             $self->{_tickets_vocab_db}, O_RDWR|O_CREAT, 0640
            or die "Can't create DBM files: $!";
    my $min = $self->{_min_word_length};
    foreach my $ticket_id (sort {$a <=> $b} keys %{$self->{_processed_tkts_by_ids}}) {
        my $record = $self->{_processed_tkts_by_ids}->{$ticket_id};
        my @brokenup = split /\n|\r|\"|\'|\.|\(|\)|\[|\]|\\|\/|\s+/, $record;
        # Take $1 only when the match succeeded: after a failed match $1 can
        # still hold the capture from an earlier fragment, which would make
        # that earlier word be counted a second time.
        my @clean_words = grep $_,
                          map { /([a-z0-9_]{$min,})/i ? $1 : () } @brokenup;
        next unless @clean_words;
        @clean_words = grep $_, map &_simple_stemmer($_, $self->{_debug2}),
                                                                 @clean_words;
        # %uniques collects each distinct (lowercased) word of this ticket so
        # that per-ticket statistics are bumped once per ticket, not once per
        # occurrence.
        my %uniques = ();
        for my $word (@clean_words) {
            my $lc_word = "\L$word";
            $vocab_hist_on_disk{$lc_word}++;
            $uniques{$lc_word}++;
        }
        for my $lc_word (keys %uniques) {
            $self->{_vocab_idf_hist}->{$lc_word}++;
            push @{$self->{_inverted_index}->{$lc_word}}, $ticket_id;
        }
        $self->{_stemmed_tkts_by_ids}->{$ticket_id} = join ' ', @clean_words;
    }
    # Copy the on-disk histogram into memory before releasing the tie.
    foreach (keys %vocab_hist_on_disk) {
        $self->{_vocab_hist}->{$_} = $vocab_hist_on_disk{$_};
    }
    untie %vocab_hist_on_disk;
    $self->{_tkt_vocab_done} = 1;
    $self->{_vocab_size} = scalar( keys %{$self->{_vocab_hist}} );
    print "\n\nVocabulary size:  $self->{_vocab_size}\n\n"
        if $self->{_debug2};
    # Calculate idf(t) = | (1 + ln( D / (1 + d(t)) )) / ln(10) |
    # where D is the number of processed tickets and d(t) the number of
    # tickets that contain the word t.
    $self->{_idf_db} = "idf.db" unless $self->{_idf_db};
    unlink glob "$self->{_idf_db}.*";
    tie my %idf_t_on_disk, 'SDBM_File', $self->{_idf_db}, O_RDWR|O_CREAT, 0640
                                            or die "Can't create DBM files: $!";
    foreach (keys %{$self->{_vocab_idf_hist}}) {
        $idf_t_on_disk{$_} = abs( (1 + log($total_num_of_tickets
                                           /
                                           (1 + $self->{_vocab_idf_hist}->{$_})))
                                           / log(10) );
    }
    foreach (keys %idf_t_on_disk) {
        $self->{_idf_t}->{$_} = $idf_t_on_disk{$_};
    }
    untie %idf_t_on_disk;
}

# Prints the tickets vocabulary held in $self->{_vocab_hist}: one line per
# word in sorted order, showing the word right-aligned in a 30-character
# column followed by its count, and finally the number of distinct words.
# Dies if the vocabulary hash is empty.
sub display_tickets_vocab {
    my ($self) = @_;
    keys %{ $self->{_vocab_hist} }
        or die "tickets vocabulary not yet constructed";
    my $word_counts = $self->{_vocab_hist};
    print "\n\nDisplaying tickets vocabulary (the number shown against each word is the number of times each word appears in ALL the tickets):\n\n";
    for my $word ( sort keys %$word_counts ) {
        print sprintf( "%30s     %d", $word, $word_counts->{$word} ) . "\n";
    }
    my $vocab_size = keys %$word_counts;
    print "\nSize of the tickets vocabulary: $vocab_size\n\n";
}

# Prints two tables, each sorted by word:
#   1. the integer stored for each word in $self->{_vocab_idf_hist};
#   2. the floating-point value stored for each word in $self->{_idf_t}.
# Words are right-aligned in a 30-character column.  Dies if
# $self->{_vocab_idf_hist} is empty.
sub display_inverse_document_frequencies {
    my ($self) = @_;
    keys %{ $self->{_vocab_idf_hist} }
        or die "tickets vocabulary not yet constructed";
    my $tickets_per_word = $self->{_vocab_idf_hist};
    print "\n\nDisplaying inverse document frequencies (the number of tickets in which each word appears):\n\n";
    for my $word ( sort keys %$tickets_per_word ) {
        print sprintf( "%30s     %d", $word, $tickets_per_word->{$word} ) . "\n";
    }
    print "\nDisplaying idf(t) = log(D/d(t)) where D is total number of tickets and d(t) the number of tickets with the word t:\n";
    for my $word ( sort keys %{ $self->{_idf_t} } ) {
        print sprintf( "%30s     %f", $word, $self->{_idf_t}->{$word} ) . "\n";
    }
}

# The following subroutine is useful for diagnostic purposes.  It
# lists the number of tickets that a word appears in and also lists
# the tickets.  But be careful in interpreting its results.  Note
# that if you invoke this subroutine after the synsets have been
# added to the tickets, you may find words being attributed to
# tickets that do not actually contain them in the original Excel sheet.
sub list_processed_tickets_for_a_word {
    my $self = shift;
    while (my $word = <STDIN>) {    #enter ctrl-D to exit the loop
        chomp $word;
        my @ticket_list;
        foreach my $ticket_id (sort {$a <=> $b} keys %{$self->{_processed_tkts_by_ids}}) {
            my $record = $self->{_processed_tkts_by_ids}->{$ticket_id};
            push @ticket_list, $ticket_id if $record =~ /\b$word\b/i;
        }
        my $num = @ticket_list;



( run in 1.646 second using v1.01-cache-2.11-cpan-5b529ec07f3 )