Algorithm-KMeans

 view release on metacpan or  search on metacpan

lib/Algorithm/KMeans.pm  view on Meta::CPAN

        foreach my $cluster (@$clusters) {
            my $current_cluster_center = $cluster_centers->[$current_cluster_center_index];
            my $current_cluster_covariance = $cluster_covariances->[$current_cluster_center_index]; 
            foreach my $ele (@$cluster) {
                my @mahalanobis_dist_from_clust_centers;
                foreach my $i (0..@$clusters-1) {
                    my $center = $cluster_centers->[$i];
                    my $covariance = $cluster_covariances->[$i];
                    my $maha_distance;
                    eval {
                        $maha_distance = $self->distance_mahalanobis($ele, $center, $covariance);
                    };
                    next if $@;
                    push @mahalanobis_dist_from_clust_centers, $maha_distance; 
                }
                my ($min, $best_center_index) = minimum( \@mahalanobis_dist_from_clust_centers );
                die "The Mahalanobis metric may not be appropriate for the data" 
                    unless defined $best_center_index;
                my $best_cluster_center = $cluster_centers->[$best_center_index];
                if (vector_equal($current_cluster_center, $best_cluster_center)){
                    push @{$new_clusters->[$current_cluster_center_index]}, $ele;
                } else {
                    $assignment_changed_flag = 1;             
                    push @{$new_clusters->[$best_center_index]}, $ele;
                }
            }
            $current_cluster_center_index++;
        }
        next if ((@$new_clusters != @$clusters) && ($iteration_index < 100));
        # Now make sure that none of the K clusters is an empty cluster:
        foreach my $newcluster (@$new_clusters) {
            $cluster_size_zero_condition = 1 if ((!defined $newcluster) or  (@$newcluster == 0));
        }
        push @$new_clusters, (undef) x ($K - @$new_clusters) if @$new_clusters < $K;
        # During clustering for a fixed K, should a cluster inadvertently
        # become empty, steal a member from the largest cluster to hopefully
        # spawn a new cluster:
        my $largest_cluster;
        foreach my $local_cluster (@$new_clusters) {
            next if !defined $local_cluster;
            $largest_cluster = $local_cluster if !defined $largest_cluster;
            if (@$local_cluster > @$largest_cluster) {
                $largest_cluster = $local_cluster; 
            }
        }        
        foreach my $local_cluster (@$new_clusters) {
            if ( (!defined $local_cluster) || (@$local_cluster == 0) ) {
                push @$local_cluster, pop @$largest_cluster;
            }
        }
        next if (($cluster_size_zero_condition) && ($iteration_index < 100));
        last if $iteration_index == 100;
        # Now do a deep copy of new_clusters into clusters
	$clusters = deep_copy_AoA( $new_clusters );
        last if $assignment_changed_flag == 0;
    }
    $final_cluster_centers = $self->update_cluster_centers( $clusters );
    return ($clusters, $final_cluster_centers);
}

sub update_cluster_centers_and_covariances_mahalanobis {
    my $self = shift;
    my @clusters = @{ shift @_ };
    my @new_cluster_centers;
    my @new_cluster_covariances;
    # During clustering for a fixed K, should a cluster inadvertently become empty,
    # steal a member from the largest cluster to hopefully spawn a new cluster:
    my $largest_cluster;
    foreach my $cluster (@clusters) {
        next if !defined $cluster;
        $largest_cluster = $cluster if !defined $largest_cluster;
        if (@$cluster > @$largest_cluster) {
            $largest_cluster = $cluster; 
        }
    }        
    foreach my $cluster (@clusters) {
        if ( (!defined $cluster) || (@$cluster == 0) ) {
            push @$cluster, pop @$largest_cluster;
        }
    }
    foreach my $cluster (@clusters) {
        die "Cluster became empty --- untenable condition " .
            "for a given K.  Try again. \n" if !defined $cluster;
        my $cluster_size = @$cluster;
        die "Cluster size is zero --- untenable.\n" if $cluster_size == 0;
        my @new_cluster_center = @{$self->add_point_coords( $cluster )};
        @new_cluster_center = map {my $x = $_/$cluster_size; $x} @new_cluster_center;
        push @new_cluster_centers, \@new_cluster_center;
        # for covariance calculation:
        my ($num_rows,$num_cols) = ($self->{_data_dimensions}, scalar(@$cluster));
        my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols);
        my $mean_vec = Math::GSL::Matrix->new($num_rows,1);
        # All the record labels are stored in the array $self->{_data_id_tags}.  The
        # actual data for clustering is stored in a hash at $self->{_data} whose keys are
        # the record labels; the value associated with each key is the array holding the
        # corresponding numerical multidimensional data.
        foreach my $j (0..$num_cols-1) {
            my $tag = $cluster->[$j];            
            my $data = $self->{_data}->{$tag};
            my @diff_from_mean = vector_subtract($data, \@new_cluster_center);
            $matrix->set_col($j, \@diff_from_mean);
        }
        my $transposed = transpose( $matrix );
        my $covariance = matrix_multiply( $matrix, $transposed );
        $covariance *= 1.0 / $num_cols;
        if ($self->{_debug}) {
            print "\nDisplaying the Covariance Matrix for cluster:";
            display_matrix( $covariance );
        }
        push @new_cluster_covariances, $covariance;
    }
    return [\@new_cluster_centers, \@new_cluster_covariances];
}

# After each new assignment of the data points to the clusters on the basis of the
# current values for the cluster centers, we call the routine shown here for updating
# the values of the cluster centers.
sub update_cluster_centers {
    my $self = shift;
    my @clusters = @{ shift @_ };
    my @new_cluster_centers;
    # During clustering for a fixed K, should a cluster inadvertently become empty,
    # steal a member from the largest cluster to hopefully spawn a new cluster:
    my $largest_cluster;
    foreach my $cluster (@clusters) {
        next if !defined $cluster;
        $largest_cluster = $cluster if !defined $largest_cluster;
        if (@$cluster > @$largest_cluster) {
            $largest_cluster = $cluster; 
        }
    }        
    foreach my $cluster (@clusters) {
        if ( (!defined $cluster) || (@$cluster == 0) ) {
            push @$cluster, pop @$largest_cluster;
        }
    }
    foreach my $cluster (@clusters) {
        die "Cluster became empty --- untenable condition " .
            "for a given K.  Try again. \n" if !defined $cluster;
        my $cluster_size = @$cluster;
        die "Cluster size is zero --- untenable.\n" if $cluster_size == 0;
        my @new_cluster_center = @{$self->add_point_coords( $cluster )};
        @new_cluster_center = map {my $x = $_/$cluster_size; $x} 
                                  @new_cluster_center;
        push @new_cluster_centers, \@new_cluster_center;
    }        
    return \@new_cluster_centers;
}

sub which_cluster_for_new_data_element {
    my $self = shift;
    my $ele = shift;
    die "The dimensionality of the new data element is not correct: $!"
        unless @$ele == $self->{_data_dimensions};
    my %distance_to_new_ele_hash;
    foreach my $cluster_id (sort keys %{$self->{_cluster_centers_hash}}) {
        $distance_to_new_ele_hash{$cluster_id} = $self->distance2($ele, 
                                             $self->{_cluster_centers_hash}->{$cluster_id});
    }
    my @values = values %distance_to_new_ele_hash;
    my ($min,$max) = minmax(\@values);
    my $answer;
    foreach my $cluster_id (keys %distance_to_new_ele_hash) {
        $answer = $cluster_id if $distance_to_new_ele_hash{$cluster_id} == $min;
    }
    return $answer;
}

sub which_cluster_for_new_data_element_mahalanobis {
    my $self = shift;
    my $ele = shift;
    die "The dimensionality of the new data element is not correct: $!"
        unless @$ele == $self->{_data_dimensions};
    my %distance_to_new_ele_hash;
    foreach my $cluster_id (sort keys %{$self->{_cluster_centers_hash}}) {
        $distance_to_new_ele_hash{$cluster_id} = 
                $self->distance_mahalanobis2($ele, $self->{_cluster_centers_hash}->{$cluster_id},
                                             $self->{_cluster_covariances_hash}->{$cluster_id});

 view all matches for this distribution
 view release on metacpan -  search on metacpan

( run in 0.433 second using v1.00-cache-2.02-grep-82fe00e-cpan-b63e86051f13 )