Algorithm-KMeans

 view release on metacpan or  search on metacpan

lib/Algorithm/KMeans.pm  view on Meta::CPAN

        my $clusters_and_determinants = 
                   $self->assign_data_to_clusters_initial_mahalanobis($cluster_centers);  
        $clusters = $clusters_and_determinants->[0];
        my @determinants = @{$clusters_and_determinants->[1]};
        my ($min,$max) = minmax(\@determinants);
        die "In cluster_for_given_K(): min determinant of covariance matrix for at " .
            "least one cluster is too small" if $min / $max < 0.001;
    } else {
        $clusters = $self->assign_data_to_clusters_initial($cluster_centers);  
    }
    my $cluster_nonexistent_flag = 0;
    foreach my $trial (0..2) {
        if ($self->{_use_mahalanobis_metric}) {
            ($clusters, $cluster_centers) = $self->assign_data_to_clusters_mahalanobis($clusters, $K);
        } else {
            ($clusters, $cluster_centers) = $self->assign_data_to_clusters( $clusters, $K );
        }
        my $num_of_clusters_returned = @$clusters;
        foreach my $cluster (@$clusters) {
            $cluster_nonexistent_flag = 1 if ((!defined $cluster) ||  (@$cluster == 0));
        }
        last unless $cluster_nonexistent_flag;
    }
    return ($clusters, $cluster_centers);
}

# This function is used when you set the "cluster_seeding" option to 'random' in the
# constructor.  Returns a set of K random integers.  These serve as indices to reach
# into the data array.  A data element whose index is one of the random numbers
# returned by this routine serves as an initial cluster center.  Note the quality
# check it runs on the list of K random integers constructed.  We first make sure
# that all K random integers are different.  Subsequently, we carry out a quality

lib/Algorithm/KMeans.pm  view on Meta::CPAN

# iterations. If this number reaches 100, we exit the while() loop anyway.  In most
# cases, this limit will not be reached.
sub assign_data_to_clusters {
    my $self = shift;
    my $clusters = shift;
    my $K = shift;
    my $final_cluster_centers;
    my $iteration_index = 0;
    while (1) {
        my $new_clusters;
        my $assignment_changed_flag = 0;
        my $current_cluster_center_index = 0;
        my $cluster_size_zero_condition = 0;
        my $how_many = @$clusters;
        my $cluster_centers = $self->update_cluster_centers( deep_copy_AoA_with_nulls( $clusters ) );
        $iteration_index++;
        foreach my $cluster (@$clusters) {
            my $current_cluster_center = $cluster_centers->[$current_cluster_center_index];
            foreach my $ele (@$cluster) {
                my @dist_from_clust_centers;
                foreach my $center (@$cluster_centers) {
                    push @dist_from_clust_centers, 
                               $self->distance($ele, $center);
                }
                my ($min, $best_center_index) = minimum( \@dist_from_clust_centers );
                my $best_cluster_center = $cluster_centers->[$best_center_index];
                if (vector_equal($current_cluster_center, $best_cluster_center)){
                    push @{$new_clusters->[$current_cluster_center_index]}, $ele;
                } else {
                    $assignment_changed_flag = 1;             
                    push @{$new_clusters->[$best_center_index]}, $ele;
                }
            }
            $current_cluster_center_index++;
        }
        next if ((@$new_clusters != @$clusters) && ($iteration_index < 100));
        # Now make sure that none of the K clusters is an empty cluster:
        foreach my $newcluster (@$new_clusters) {
            $cluster_size_zero_condition = 1 if ((!defined $newcluster) or  (@$newcluster == 0));
        }

lib/Algorithm/KMeans.pm  view on Meta::CPAN

            }
        }        
        foreach my $local_cluster (@$new_clusters) {
            if ( (!defined $local_cluster) || (@$local_cluster == 0) ) {
                push @$local_cluster, pop @$largest_cluster;
            }
        }
        next if (($cluster_size_zero_condition) && ($iteration_index < 100));
        last if $iteration_index == 100;
	$clusters = deep_copy_AoA( $new_clusters );
        last if $assignment_changed_flag == 0;
    }
    $final_cluster_centers = $self->update_cluster_centers( $clusters );
    return ($clusters, $final_cluster_centers);
}

sub assign_data_to_clusters_mahalanobis {
    my $self = shift;
    my $clusters = shift;
    my $K = shift;
    my $final_cluster_centers;
    my $iteration_index = 0;
    while (1) {
        my $new_clusters;
        my $assignment_changed_flag = 0;
        my $current_cluster_center_index = 0;
        my $cluster_size_zero_condition = 0;
        my $how_many = @$clusters;
        my $cluster_centers_and_covariances = 
        $self->update_cluster_centers_and_covariances_mahalanobis(deep_copy_AoA_with_nulls($clusters));
        my $cluster_centers = $cluster_centers_and_covariances->[0];
        my $cluster_covariances = $cluster_centers_and_covariances->[1];
        $iteration_index++;
        foreach my $cluster (@$clusters) {
            my $current_cluster_center = $cluster_centers->[$current_cluster_center_index];

lib/Algorithm/KMeans.pm  view on Meta::CPAN

                    next if $@;
                    push @mahalanobis_dist_from_clust_centers, $maha_distance; 
                }
                my ($min, $best_center_index) = minimum( \@mahalanobis_dist_from_clust_centers );
                die "The Mahalanobis metric may not be appropriate for the data" 
                    unless defined $best_center_index;
                my $best_cluster_center = $cluster_centers->[$best_center_index];
                if (vector_equal($current_cluster_center, $best_cluster_center)){
                    push @{$new_clusters->[$current_cluster_center_index]}, $ele;
                } else {
                    $assignment_changed_flag = 1;             
                    push @{$new_clusters->[$best_center_index]}, $ele;
                }
            }
            $current_cluster_center_index++;
        }
        next if ((@$new_clusters != @$clusters) && ($iteration_index < 100));
        # Now make sure that none of the K clusters is an empty cluster:
        foreach my $newcluster (@$new_clusters) {
            $cluster_size_zero_condition = 1 if ((!defined $newcluster) or  (@$newcluster == 0));
        }

lib/Algorithm/KMeans.pm  view on Meta::CPAN

        }        
        foreach my $local_cluster (@$new_clusters) {
            if ( (!defined $local_cluster) || (@$local_cluster == 0) ) {
                push @$local_cluster, pop @$largest_cluster;
            }
        }
        next if (($cluster_size_zero_condition) && ($iteration_index < 100));
        last if $iteration_index == 100;
        # Now do a deep copy of new_clusters into clusters
	$clusters = deep_copy_AoA( $new_clusters );
        last if $assignment_changed_flag == 0;
    }
    $final_cluster_centers = $self->update_cluster_centers( $clusters );
    return ($clusters, $final_cluster_centers);
}

sub update_cluster_centers_and_covariances_mahalanobis {
    my $self = shift;
    my @clusters = @{ shift @_ };
    my @new_cluster_centers;
    my @new_cluster_covariances;

lib/Algorithm/KMeans.pm  view on Meta::CPAN

                            K
                            Kmin
                            Kmax
                            terminal_output
                            write_clusters_to_files
                            do_variance_normalization
                            cluster_seeding
                            use_mahalanobis_metric
                            debug
                          /;
    my $found_match_flag;
    foreach my $param (@params) {
        foreach my $legal (@legal_params) {
            $found_match_flag = 0;
            if ($param eq $legal) {
                $found_match_flag = 1;
                last;
            }
        }
        last if $found_match_flag == 0;
    }
    return $found_match_flag;
}

sub get_value_index_hash {
    my $arr = shift;
    my %hash;
    foreach my $index (0..@$arr-1) {
        $hash{$arr->[$index]} = $index if $arr->[$index] > 0;
    }
    return \%hash;
}

lib/Algorithm/KMeans.pm  view on Meta::CPAN


  my $clusterer = Algorithm::KMeans->new( datafile => $datafile,
                                          mask     => $mask,
                                          K        => 0,
                                          cluster_seeding => 'random',    # or 'smart'
                                          terminal_output => 1,
                                          write_clusters_to_files => 1,
                                        );

  # Although not shown above, you can obviously set the 'do_variance_normalization'
  # flag here also if you wish.

  # For very large data files, setting K to 0 will result in searching through too
  # many values for K.  For such cases, you can range limit the values of K to search
  # through by

  my $clusterer = Algorithm::KMeans->new( datafile => $datafile,
                                          mask     => "N111",
                                          Kmin     => 3,
                                          Kmax     => 10,
                                          cluster_seeding => 'random',    # or 'smart'



( run in 7.052 seconds using v1.01-cache-2.11-cpan-94b05bcf43c )