Algorithm-TicketClusterer

 view release on metacpan or  search on metacpan

lib/Algorithm/TicketClusterer.pm  view on Meta::CPAN

        _misspelled_words_file  =>   $args{misspelled_words_file},
        _min_word_length        =>   $args{min_word_length} || 4,
        _add_synsets_to_tickets =>   $args{add_synsets_to_tickets} || 0,
        _want_stemming          =>   $args{want_stemming} || 0,
        _how_many_retrievals    =>   $args{how_many_retrievals} || 5,
        _min_idf_threshold      =>   $args{min_idf_threshold},
        _max_num_syn_words      =>   $args{max_num_syn_words} || 3,
        _want_synset_caching    =>   $args{want_synset_caching} || 0,
        _stop_words             =>   {},
        _all_tickets            =>   [],
        _column_headers         =>   [],
        _good_columns           =>   [],
        _tickets_by_ids         =>   {},
        _processed_tkts_by_ids  =>   {},
        _stemmed_tkts_by_ids    =>   {},
        _misspelled_words       =>   {},
        _total_num_tickets      =>   0,
        _synset_cache           =>   {},
        _vocab_hash             =>   {},
        _vocab_idf_hist         =>   {},
        _idf_t                  =>   {},
        _vocab_size             =>   undef,
        _doc_vector_template    =>   {},
        _tkt_doc_vecs           =>   {},
        _tkt_doc_vecs_normed    =>   {},
        _query_ticket_id        =>   undef,
        _inverted_index         =>   {},
        _debug1                 =>   $args{debug1} || 0, # for processing Excel
        _debug2                 =>   $args{debug2} || 0, # for modeling tickets
        _debug3                 =>   $args{debug3} || 0, # for retrieving similar tickets
        _wn                     =>   WordNet::QueryData->new( verbose => 0, 
                                                              noload => 1 ),
    }, $class;
}

#############################  Extract info from Excel  #######################

# Reads the tickets out of the Excel workbook named by the _excel_filename
# attribute and stores them in the object.
#
# Steps, in order:
#   1. Delete the on-disk database files left behind by an earlier run.
#   2. Open the workbook (.xls via Spreadsheet::ParseExcel, .xlsx via
#      Spreadsheet::XLSX) and select worksheet number _which_worksheet
#      (1-based).
#   3. Locate the header row: the first row that contains a cell equal to
#      _unique_id_fieldname.  The non-empty cells of that row define the
#      "good" columns.
#   4. Turn every following row whose ID cell contains a digit and whose
#      clustering cell contains a word character into a hash keyed by
#      column header, and append it to _all_tickets.
#   5. Verify the IDs are unique, fill _tickets_by_ids with the lower-cased
#      clustering text, set _total_num_tickets and call
#      store_raw_tickets_on_disk().
#
# Takes no arguments besides the object.  Dies when a required constructor
# parameter is missing, the file suffix is not recognized, the workbook or
# worksheet cannot be obtained, the header columns cannot be found, or the
# ID column contains duplicates.
sub get_tickets_from_excel {
    my $self = shift;
    # Remove stale single-file databases.  Names that were never configured
    # are skipped so that the file test does not run on undef.
    foreach my $db_key (qw(_raw_tickets_db _processed_tickets_db
                           _synset_cache_db _stemmed_tickets_db
                           _inverted_index_db _tkt_doc_vecs_db
                           _tkt_doc_vecs_normed_db)) {
        my $db_file = $self->{$db_key};
        next unless defined $db_file && length $db_file;
        unlink $db_file if -s $db_file;
    }
    # These two databases are removed by pattern "<name>.*".  An unset name
    # would turn the pattern into ".*" and delete every dot-file in the
    # current directory, hence the guard.
    foreach my $db_key (qw(_tickets_vocab_db _idf_db)) {
        my $db_base = $self->{$db_key};
        next unless defined $db_base && length $db_base;
        unlink glob "$db_base.*";
    }
    my $filename = $self->{_excel_filename} || die("Excel file required");
    my $clustering_fieldname = $self->{_clustering_fieldname} 
      || die("\nYou forgot to specify a value for the constructor parameter clustering_fieldname that points to the data to be clustered in your Excel sheet -- ");
    my $unique_id_fieldname = $self->{_unique_id_fieldname} 
      || die("\nYou forgot to specify a value for the constructor parameter unique_id_fieldname that is a unique integer identifier for the rows of your Excel sheet -- ");
    my $workbook;
    if ($filename =~ /\.xls$/i) {
        my $parser = Spreadsheet::ParseExcel->new();
        $workbook = $parser->parse($filename);
        die $parser->error() unless defined $workbook;
    } elsif ($filename =~ /\.xlsx$/i) {
#        use Text::Iconv;
        my $converter = Text::Iconv->new("utf-8", "windows-1251");
        $workbook = Spreadsheet::XLSX->new($filename, $converter);
        die "Unable to read the Excel file $filename" unless defined $workbook;
    } else {
        die "File suffix on the Excel file not recognized";
    }
    my @worksheets = $workbook->worksheets();
    my $which_worksheet = $self->{_which_worksheet} || 
        die "\nYou have not specified which Excel worksheet contains the tickets\n";
    # Worksheet numbers are 1-based for the user, 0-based in the array.
    my $worksheet = $worksheets[$which_worksheet-1];
    die "\nYour workbook has " . scalar(@worksheets) . " worksheet(s); " .
        "there is no worksheet number $which_worksheet\n"
        unless defined $worksheet;
    my ( $row_min, $row_max ) = $worksheet->row_range();
    my ( $col_min, $col_max ) = $worksheet->col_range();
    my @good_columns;
    my $col_headers_row;
    my $col_index_for_unique_id;
    my $col_index_for_clustering_field;
    for my $row ( $row_min .. $row_max ) {
        # Per-row state: only what is found in the header row itself counts.
        @good_columns = ();
        $col_index_for_clustering_field = undef;
        for my $col ( $col_min .. $col_max ) {
            my $cell = $worksheet->get_cell( $row, $col );
            next unless $cell;
            my $cell_value = _get_rid_of_wide_chars($cell->value());
            next unless defined $cell_value;
            push @good_columns, $col if $cell_value;
            if ($cell_value eq $unique_id_fieldname) {
                $col_index_for_unique_id = $col;
                $col_headers_row = $row;
            }
            if ($cell_value eq $clustering_fieldname) {
                $col_index_for_clustering_field = $col;
            }
        }
        # The whole header row has been scanned; stop looking.
        last if defined $col_headers_row;
    }
    die "\nNo column header named '$unique_id_fieldname' (your " .
        "unique_id_fieldname) was found in worksheet $which_worksheet\n"
        unless defined $col_headers_row;
    die "\nNo column header named '$clustering_fieldname' (your " .
        "clustering_fieldname) was found in the header row of " .
        "worksheet $which_worksheet\n"
        unless defined $col_index_for_clustering_field;
    $self->{_good_columns} = \@good_columns;
    print "\nThe unique id is in column: $col_index_for_unique_id\n"
        if $self->{_debug1};
    print "The clustering field is in column: " .
                "$col_index_for_clustering_field\n\n" if $self->{_debug1};
    my @col_headers = map {
        my $cell = $worksheet->get_cell($col_headers_row, $_);
        $cell ? _get_rid_of_wide_chars($cell->value()) : '';
    } @good_columns;
    $self->{_column_headers} = \@col_headers;
    $self->_display_column_headers() if $self->{_debug1};
    my $unique_id_field_index_in_good_columns = 
     _find_index_for_given_element( $col_index_for_unique_id, \@good_columns );
    my $clustering_field_index_in_good_columns =
     _find_index_for_given_element( $col_index_for_clustering_field, 
                             \@good_columns );
    die "Something is wrong with the info extracted from Excel " .
        "as the index for the column with unique IDs is not one of " .
        "good columns\n\n" 
        unless (defined $unique_id_field_index_in_good_columns) &&
               (defined $clustering_field_index_in_good_columns);
    # row_range() reports the last used row inclusively, so the data rows
    # run from the row after the headers through $row_max itself.
    for my $row_index ( $col_headers_row+1 .. $row_max ) { 
        my @values = map {
            my $cell = $worksheet->get_cell($row_index, $_);
            $cell ? _get_rid_of_wide_chars($cell->value()) : '';
        } @good_columns;
        # Skip rows that carry no usable ID or no text to cluster on.
        next unless $values[$unique_id_field_index_in_good_columns] =~ /\d+/;
        next unless $values[$clustering_field_index_in_good_columns] =~ /\w+/;
        my %onerow;
        foreach my $field_index (0..$#good_columns) {
            $onerow{$col_headers[$field_index]} = $values[$field_index];
        }
        push @{$self->{_all_tickets}}, \%onerow;
    }
    my @duplicates_for_id_field = @{$self->_check_unique_id_field()};
    if (@duplicates_for_id_field > 0) {
        print "Your supposedly unique ID field values for duplicates: @duplicates_for_id_field\n";
        die "\n\nYour unique id field for tickets contains duplicate id's";
    }
    foreach my $ticket (@{$self->{_all_tickets}}) {    
        $self->{_tickets_by_ids}->{$ticket->{$unique_id_fieldname}} =
            lc($ticket->{$clustering_fieldname});
    }
    $self->{_total_num_tickets} = scalar @{$self->{_all_tickets}};
    $self->store_raw_tickets_on_disk();
}

# Test helper: opens the workbook named by _excel_filename with
# Spreadsheet::XLSX and returns the extent of its first worksheet as the
# four-element list (row_min, row_max, col_min, col_max).
# Dies if no Excel filename has been set on the object.
sub _test_excel_for_tickets {
    my ($self) = @_;
    use Text::Iconv;
    my $iconv = Text::Iconv->new("utf-8", "windows-1251");
    my $xlsx_path = $self->{_excel_filename};
    die("Excel sheet needed for testing is missing") unless $xlsx_path;
    my $book = Spreadsheet::XLSX->new( $xlsx_path, $iconv );
    # Only the first worksheet is examined.
    my @sheets = $book->worksheets();
    my $first_sheet = $sheets[0];
    my ( $first_row, $last_row ) = $first_sheet->row_range();
    my ( $first_col, $last_col ) = $first_sheet->col_range();
    return ($first_row, $last_row, $first_col, $last_col);
}

# Debugging aid: prints the indices of the good columns and then the column
# header strings, the latter separated by '  <>  '.
sub _display_column_headers {
    my ($self) = @_;
    my @good_cols = @{ $self->{_good_columns} };
    print "\nThe good columns are: @good_cols\n\n";
    print "The column headers are: "
        . join('  <>  ', @{ $self->{_column_headers} })
        . "\n\n";
}

# Scans _all_tickets for repeated values in the unique-ID field.
# Returns a reference to an array holding each ID value once for every
# time it shows up again after its first occurrence (so an ID seen three
# times is listed twice).  Tickets that lack the ID field are ignored.
# With _debug1 set, also prints the ticket count and the distinct-ID count.
sub _check_unique_id_field {
    my ($self) = @_;
    my $id_field = $self->{_unique_id_fieldname};
    my ( %seen_ids, @repeated_ids );
    for my $tkt ( @{ $self->{_all_tickets} } ) {
        next unless exists $tkt->{$id_field};
        my $id_value = $tkt->{$id_field};
        if ( exists $seen_ids{$id_value} ) {
            push @repeated_ids, $id_value;
        }
        $seen_ids{$id_value} = 1;
    }
    if ( $self->{_debug1} ) {
        my $ticket_count   = scalar @{ $self->{_all_tickets} };
        my $distinct_count = scalar keys %seen_ids;
        print "Number of tickets: $ticket_count\n";
        print "Number of keys in check hash: $distinct_count\n";
    }
    return \@repeated_ids;
}

# Prints every field of the ticket(s) in _all_tickets whose unique-ID field
# matches $id, one "key  ==>  value" line per field with the keys sorted
# and the values stripped of leading and trailing whitespace.
#
# Parameters:
#   $id - the ticket ID to look up.
#
# IDs are compared numerically only when both the stored ID and $id look
# like numbers (so 7 matches "7.0"); otherwise they are compared as
# strings.  A purely numeric comparison would treat all non-numeric IDs
# such as "A1" and "A2" as equal, since both numify to 0.
sub show_original_ticket_for_given_id {
    my $self = shift;
    my $id = shift;
    require Scalar::Util;
    print "\n\nDisplaying the fields for the ticket $id:\n\n";
    my $id_field = $self->{_unique_id_fieldname};
    foreach my $ticket (@{$self->{_all_tickets}}) {
        my $ticket_id = $ticket->{$id_field};
        # A ticket without an ID can never be the one asked for.
        next unless defined $ticket_id && defined $id;
        my $both_numeric = Scalar::Util::looks_like_number($ticket_id)
                        && Scalar::Util::looks_like_number($id);
        my $is_match = $both_numeric ? $ticket_id == $id
                                     : $ticket_id eq $id;
        next unless $is_match;
        foreach my $key (sort keys %{$ticket}) {
            my $value = $ticket->{$key};
            $value = '' unless defined $value;
            $value =~ s/^\s+//;
            $value =~ s/\s+$//;
            printf("%20s  ==>  %s\n", $key, $value);
        }
    }
}

# Looks up the entry stored in _tickets_by_ids for the given ticket ID,
# prints it under a banner line, and returns it.
#
# Parameters:
#   $ticket_id - key into the _tickets_by_ids hash.
# Returns: the stored record for that ID.
sub show_raw_ticket_clustering_data_for_given_id {
    my ($self, $ticket_id) = @_;
    my $raw_text = $self->{_tickets_by_ids}{$ticket_id};
    print "\n\nDISPLAYING THE RAW CLUSTERING DATA FOR TICKET $ticket_id:\n\n$raw_text\n\n";
    return $raw_text;
}



( run in 2.889 seconds using v1.01-cache-2.11-cpan-5735350b133 )