Algorithm-Kmeanspp
view release on metacpan or search on metacpan
lib/Algorithm/Kmeanspp.pm view on Meta::CPAN
package Algorithm::Kmeanspp;
use strict;
use warnings;
use base qw(Class::Accessor::Fast);
use Carp qw(croak);
use List::Util qw(shuffle);
our $VERSION = '0.03';
__PACKAGE__->mk_accessors($_) for qw(vectors centroids clusters);
sub new {
my $class = shift;
my $self = $class->SUPER::new( {@_} );
$self->vectors({}) if !$self->vectors;
$self->centroids([]) if !$self->centroids;
$self->clusters([]) if !$self->clusters;
return $self;
}
sub add_document {
my ($self, $id, $vector) = @_;
return if !defined $id || !$vector;
$self->vectors->{$id} = $vector;
}
sub do_clustering {
my ($self, $num_cluster, $num_iter) = @_;
croak 'The number of clusters must be less than the number of input documents.'
if $num_cluster > scalar(keys %{ $self->vectors });
croak 'The number of clusters must be greater than zero.'
if $num_cluster <= 0;
map { push @{ $self->clusters }, [] } (0 .. $num_cluster-1);
$self->_choose_smart_centroids($num_cluster);
my $assignment = $self->_assign_cluster;
for (my $i = 0; $i < $num_iter; $i++) {
$self->_move_centroids;
my $new_assignment = $self->_assign_cluster;
my $is_changed = 0;
foreach my $id (keys %{ $assignment }) {
if ($assignment->{$id} != $new_assignment->{$id}) {
$is_changed = 1;
last;
}
}
last if !$is_changed;
$assignment = $new_assignment if $new_assignment;
}
}
sub _choose_smart_centroids {
my ($self, $num_cluster) = @_;
my $cur_potential = 0;
# choose one random centroid
my $vector = (shuffle values %{ $self->vectors })[0];
push @{ $self->centroids }, $vector;
my %closest_dist;
foreach my $id (keys %{ $self->vectors }) {
$closest_dist{$id} = $self->_squared_euclid_distance(
$self->vectors->{$id}, $vector);
$cur_potential += $closest_dist{$id};
}
# choose each centroid
for (my $i = 1; $i < $num_cluster; $i++) {
my $randval = rand() * $cur_potential;
my $centroid_id;
foreach my $id (keys %{ $self->vectors }) {
$centroid_id = $id;
last if $randval <= $closest_dist{$id};
$randval -= $closest_dist{$id};
}
my $new_potential = 0;
( run in 1.639 second using v1.01-cache-2.11-cpan-39bf76dae61 )