Algorithm-DBSCAN
view release on metacpan or search on metacpan
lib/Algorithm/DBSCAN.pm view on Meta::CPAN
For example for the previous point with 5 dimensions
$eps = $euclidian_distance * $euclidian_distance * $euclidian_distance * $euclidian_distance * $euclidian_distance;
$min_points: the minimal number of points in a region with a radius of $eps. $eps
and $min_points are the 2 parameters used to compute the denisty of a region. If
the number of points in a region with radius $eps is lower than $min_points the
point is considered as an outlier point that can't be included in any cluster.
=cut
sub new {
my($type, $dataset, $eps, $min_points) = @_;
my $self = {};
$self->{dataset_object} = $dataset;
$self->{dataset} = $dataset->{points};
@{$self->{id_list}} = keys %{$dataset->{points}};
$self->{eps} = $eps;
$self->{min_points} = $min_points;
$self->{current_cluster} = 1;
$self->{use_external_region_index} = 0;
bless($self, $type);
return($self);
}
=head2 FindClusters
The main method that will run the DBSCAN algorithm on the Dataset.
=cut
sub FindClusters {
my ($self, $starting_point_id) = @_;
my $i = 0;
unshift(@{$self->{id_list}}, $starting_point_id) if (defined $starting_point_id);
foreach my $id (@{$self->{id_list}}) {
my $point = $self->{dataset}->{$id};
say "$i";
$i++;
next if ($point->{visited});
$point->{visited} = 1;
$self->_one_more_point_visited();
my $neighborPts = $self->GetRegion($point);
#say Dumper($neighborPts);
if (scalar(@$neighborPts) < $self->{min_points}) {
$point->{cluster_id} = -1;
}
else {
$self->ExpandCluster($point, $neighborPts);
}
}
}
=head2 ExpandCluster
This method will expand the cluster starting by the neighborhood of point $point
=cut
sub ExpandCluster {
my ($self, $point, $neighborPts) = @_;
if (scalar(@$neighborPts) < $self->{min_points}) {
$point->{cluster_id} = -1;
}
else {
$self->{current_cluster}++;
$point->{cluster_id} = $self->{current_cluster};
my %cluster_points;
map { $cluster_points{$_}++ } @$neighborPts;
my $cluster_expanded = 0;
do {
$cluster_expanded = 0;
foreach my $id (keys %cluster_points) {
my $p = $self->{dataset}->{$id};
unless ($p->{visited}) {
$p->{visited} = 1;
$self->_one_more_point_visited();
my $neighborPtsOfClusterMember = $self->GetRegion($p);
if (scalar(@$neighborPtsOfClusterMember) >= $self->{min_points}) {
map { $cluster_points{$_}++ } @$neighborPtsOfClusterMember;
say "Cluster [$self->{current_cluster}] has now [".scalar(keys %cluster_points)."] members, added region of point:[$p->{point_id}]";
$cluster_expanded = 1;
last;
}
}
$p->{cluster_id} = $self->{current_cluster} unless($p->{cluster_id});
}
}
while($cluster_expanded);
}
}
=head2 GetRegion
Find all points in the dataset that are in the neighborhood of $point
=cut
sub GetRegion {
my ($self, $point) = @_;
my $result;
my $coordinate_id = join(',', @{$point->{coordinates}});
if ($self->{use_external_region_index}) {
my $fh = $self->{region_index_filehandle};
seek($fh, $self->{region_seek_index}->{$point->{id}}, 0) or return;
my $region_str = <$fh>;
my @points = split(/\s+/, $region_str);
shift(@points);
$result = \@points;
}
else {
unless ($self->{point_neighbourhood_cache}->{$coordinate_id}) {
my @region;
foreach my $region_candidate_point_id (@{$self->{id_list}}) {
push(@region, $region_candidate_point_id) if ($self->{dataset}->{$region_candidate_point_id}->Distance($point) < $self->{eps});
}
$self->{point_neighbourhood_cache}->{$coordinate_id} = \@region;
}
$result = $self->{point_neighbourhood_cache}->{$coordinate_id};
}
return $result;
}
=head2 UseRegionIndex
For huge datasets a region index can be generated separately (and using multiple cores).
The index is a list of regions for each point in the dataset.
=cut
sub UseRegionIndex {
my ($self, $region_index_filename) = @_;
open(my $fh, "<", $region_index_filename);
my $offset = 0;
while (<$fh>) {
my @points = split(/\s+/, $_);
$self->{region_seek_index}->{$points[0]} = $offset;
$offset = tell($fh);
}
$self->{use_external_region_index} = 1;
$self->{region_index_filehandle} = $fh;
( run in 1.001 second using v1.01-cache-2.11-cpan-5b529ec07f3 )