Algorithm-DBSCAN
view release on metacpan or search on metacpan
lib/Algorithm/DBSCAN.pm view on Meta::CPAN
=cut
=head2 new
The constructor takes 3 parameters:
$dataset: The Algorithm::DBSCAN::Dataset dataset object
Create the Dataset object:
my $dataset = Algorithm::DBSCAN::Dataset->new();
Add points (the first parameter is a point_id the other are point coordinates)
$dataset->AddPoint(new Algorithm::DBSCAN::Point('point_1', 1, 2, 3, 4, 5);
$eps: The epsilon parameter used for region density computation
WARNING: This implementation uses the sqare distance between the points to avoid
a useless square root call. If you want to use the euclidian distance you need to
convert it to the right value yourself.
For example for the previous point with 5 dimensions
$eps = $euclidian_distance * $euclidian_distance * $euclidian_distance * $euclidian_distance * $euclidian_distance;
$min_points: the minimal number of points in a region with a radius of $eps. $eps
and $min_points are the 2 parameters used to compute the denisty of a region. If
the number of points in a region with radius $eps is lower than $min_points the
point is considered as an outlier point that can't be included in any cluster.
=cut
sub new {
my($type, $dataset, $eps, $min_points) = @_;
my $self = {};
$self->{dataset_object} = $dataset;
$self->{dataset} = $dataset->{points};
@{$self->{id_list}} = keys %{$dataset->{points}};
$self->{eps} = $eps;
$self->{min_points} = $min_points;
$self->{current_cluster} = 1;
$self->{use_external_region_index} = 0;
bless($self, $type);
return($self);
}
=head2 FindClusters
The main method that will run the DBSCAN algorithm on the Dataset.
=cut
sub FindClusters {
my ($self, $starting_point_id) = @_;
my $i = 0;
unshift(@{$self->{id_list}}, $starting_point_id) if (defined $starting_point_id);
foreach my $id (@{$self->{id_list}}) {
my $point = $self->{dataset}->{$id};
say "$i";
$i++;
next if ($point->{visited});
$point->{visited} = 1;
$self->_one_more_point_visited();
my $neighborPts = $self->GetRegion($point);
#say Dumper($neighborPts);
if (scalar(@$neighborPts) < $self->{min_points}) {
$point->{cluster_id} = -1;
}
else {
$self->ExpandCluster($point, $neighborPts);
}
}
}
=head2 ExpandCluster
This method will expand the cluster starting by the neighborhood of point $point
=cut
sub ExpandCluster {
my ($self, $point, $neighborPts) = @_;
if (scalar(@$neighborPts) < $self->{min_points}) {
$point->{cluster_id} = -1;
}
else {
$self->{current_cluster}++;
$point->{cluster_id} = $self->{current_cluster};
my %cluster_points;
map { $cluster_points{$_}++ } @$neighborPts;
my $cluster_expanded = 0;
do {
$cluster_expanded = 0;
foreach my $id (keys %cluster_points) {
my $p = $self->{dataset}->{$id};
unless ($p->{visited}) {
$p->{visited} = 1;
$self->_one_more_point_visited();
my $neighborPtsOfClusterMember = $self->GetRegion($p);
if (scalar(@$neighborPtsOfClusterMember) >= $self->{min_points}) {
map { $cluster_points{$_}++ } @$neighborPtsOfClusterMember;
say "Cluster [$self->{current_cluster}] has now [".scalar(keys %cluster_points)."] members, added region of point:[$p->{point_id}]";
$cluster_expanded = 1;
last;
}
}
$p->{cluster_id} = $self->{current_cluster} unless($p->{cluster_id});
}
}
while($cluster_expanded);
}
}
=head2 GetRegion
Find all points in the dataset that are in the neighborhood of $point
=cut
sub GetRegion {
my ($self, $point) = @_;
my $result;
my $coordinate_id = join(',', @{$point->{coordinates}});
if ($self->{use_external_region_index}) {
my $fh = $self->{region_index_filehandle};
seek($fh, $self->{region_seek_index}->{$point->{id}}, 0) or return;
my $region_str = <$fh>;
my @points = split(/\s+/, $region_str);
shift(@points);
$result = \@points;
}
else {
unless ($self->{point_neighbourhood_cache}->{$coordinate_id}) {
my @region;
foreach my $region_candidate_point_id (@{$self->{id_list}}) {
push(@region, $region_candidate_point_id) if ($self->{dataset}->{$region_candidate_point_id}->Distance($point) < $self->{eps});
}
$self->{point_neighbourhood_cache}->{$coordinate_id} = \@region;
}
$result = $self->{point_neighbourhood_cache}->{$coordinate_id};
}
return $result;
}
=head2 UseRegionIndex
For huge datasets a region index can be generated separately (and using multiple cores).
The index is a list of regions for each point in the dataset.
=cut
sub UseRegionIndex {
my ($self, $region_index_filename) = @_;
open(my $fh, "<", $region_index_filename);
my $offset = 0;
while (<$fh>) {
my @points = split(/\s+/, $_);
$self->{region_seek_index}->{$points[0]} = $offset;
$offset = tell($fh);
}
$self->{use_external_region_index} = 1;
$self->{region_index_filehandle} = $fh;
}
=head2 PrintClusters
Will print the contents of the clusters
=cut
sub PrintClusters {
my ($self, $point) = @_;
my %clusters;
foreach my $point (@{$self->{dataset}}) {
push(@{$clusters{$point->{cluster_id}}}, $point->{point_id});
}
foreach my $cluster_id (sort keys %clusters) {
say "CLUSTER: $cluster_id";
foreach my $point_id (sort @{$clusters{$cluster_id}}) {
my $min_distance = 1000000000000;
my $closest_point_id;
foreach my $distance_point_id (sort @{$clusters{$cluster_id}}) {
if ($distance_point_id ne $point_id) {
my $this_point = $self->{dataset_object}->GetPointById($point_id);
my $distance_point = $self->{dataset_object}->GetPointById($distance_point_id);
my $distance = $this_point->Distance($distance_point);
if ($distance < $min_distance) {
$min_distance = $distance;
$closest_point_id = $distance_point_id;
}
}
}
say "\t$point_id : (closest point: $closest_point_id, distance: $min_distance)";
}
}
}
=head2 PrintClustersShort
Will print the contents of the clusters (abreviated version)
=cut
sub PrintClustersShort {
my ($self) = @_;
my %clusters;
foreach my $id (keys %{$self->{dataset}}) {
my $point = $self->{dataset}->{$id};
push(@{$clusters{$point->{cluster_id}}}, $point->{point_id});
}
foreach my $cluster_id (sort keys %clusters) {
say "CLUSTER: $cluster_id, [".scalar(@{$clusters{$cluster_id}})."] points";
my $nb = 0;
foreach my $point_id (sort @{$clusters{$cluster_id}}) {
$nb++;
say "\t$point_id";
last if ($nb >= 100);
}
}
}
=head2 _one_more_point_visited
Simple method used to display progress
=cut
sub _one_more_point_visited {
my ($self) = @_;
$self->{nb_visited_points}++;
$self->{start_time} = time() unless ($self->{start_time});
my $eta = time() + ((time() - $self->{start_time})/$self->{nb_visited_points})*(500000);
my($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime($eta);
say "ETA:".sprintf("%04d-%02d-%02d %02d:%02d:%02d",$year+1900,$mon+1,$mday,$hour,$min,$sec);
say "nb visited:".$self->{nb_visited_points};
}
=head1 AUTHOR
Michal TOMA, C<< <mtoma at cpan.org> >>
=head1 BUGS
Please report any bugs or feature requests on github: L<https://github.com/mtoma/Algorithm-DBSCAN>
By e-mail to C<bug-algorithm-dbscan at rt.cpan.org>, or through
the web interface at L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=Algorithm-DBSCAN>. I will be notified, and then you'll
automatically be notified of progress on your bug as I make changes.
=head1 SUPPORT
You can find documentation for this module with the perldoc command.
perldoc Algorithm::DBSCAN
You can also look for information at:
=over 5
=item * Github: Issues (report bugs here)
L<https://github.com/mtoma/Algorithm-DBSCAN>
=item * RT: CPAN's request tracker (report bugs here)
L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=Algorithm-DBSCAN>
=item * AnnoCPAN: Annotated CPAN documentation
L<http://annocpan.org/dist/Algorithm-DBSCAN>
=item * CPAN Ratings
L<http://cpanratings.perl.org/d/Algorithm-DBSCAN>
=item * Search CPAN
L<http://search.cpan.org/dist/Algorithm-DBSCAN/>
=back
=head1 LICENSE AND COPYRIGHT
Copyright 2016 Michal TOMA.
This program is free software; you can redistribute it and/or modify it
under the terms of the the Artistic License (2.0). You may obtain a
copy of the full license at:
L<http://www.perlfoundation.org/artistic_license_2_0>
( run in 0.667 second using v1.01-cache-2.11-cpan-483215c6ad5 )