Algorithm-Kmeanspp

 view release on metacpan or  search on metacpan

lib/Algorithm/Kmeanspp.pm  view on Meta::CPAN

package Algorithm::Kmeanspp;

use strict;
use warnings;
use base qw(Class::Accessor::Fast);
use Carp qw(croak);
use List::Util qw(shuffle);

our $VERSION = '0.03';

__PACKAGE__->mk_accessors($_) for qw(vectors centroids clusters);

sub new {
    my $class = shift;
    my $self = $class->SUPER::new( {@_} );
    $self->vectors({})  if !$self->vectors;
    $self->centroids([])  if !$self->centroids;
    $self->clusters([])  if !$self->clusters;
    return $self;
}

sub add_document {
    my ($self, $id, $vector) = @_;
    return if !defined $id || !$vector;
    $self->vectors->{$id} = $vector;
}

sub do_clustering {
    my ($self, $num_cluster, $num_iter) = @_;
    croak 'The number of clusters must be less than the number of input documents.'
        if $num_cluster > scalar(keys %{ $self->vectors });
    croak 'The number of clusters must be greater than zero.'
        if $num_cluster <= 0;

    map { push @{ $self->clusters }, [] } (0 .. $num_cluster-1);
    $self->_choose_smart_centroids($num_cluster);
    my $assignment = $self->_assign_cluster;
    for (my $i = 0; $i < $num_iter; $i++) {
        $self->_move_centroids;
        my $new_assignment = $self->_assign_cluster;
        my $is_changed = 0;
        foreach my $id (keys %{ $assignment }) {
            if ($assignment->{$id} != $new_assignment->{$id}) {
                $is_changed = 1;
                last;
            }
        }
        last if !$is_changed;
        $assignment = $new_assignment if $new_assignment;
    }
}

sub _choose_smart_centroids {
    my ($self, $num_cluster) = @_;
    my $cur_potential = 0;

    # choose one random centroid
    my $vector = (shuffle values %{ $self->vectors })[0];
    push @{ $self->centroids }, $vector;
    my %closest_dist;
    foreach my $id (keys %{ $self->vectors }) {
        $closest_dist{$id} = $self->_squared_euclid_distance(
            $self->vectors->{$id}, $vector);
        $cur_potential += $closest_dist{$id};
    }
    # choose each centroid
    for (my $i = 1; $i < $num_cluster; $i++) {
        my $randval = rand() * $cur_potential;
        my $centroid_id;
        foreach my $id (keys %{ $self->vectors }) {
            $centroid_id = $id;
            last if $randval <= $closest_dist{$id};
            $randval -= $closest_dist{$id};
        }
        my $new_potential = 0;



( run in 1.639 second using v1.01-cache-2.11-cpan-39bf76dae61 )