Algorithm-DBSCAN

 view release on metacpan or  search on metacpan

lib/Algorithm/DBSCAN.pm  view on Meta::CPAN

        For example for the previous point with 5 dimensions 
        $eps = $euclidian_distance * $euclidian_distance * $euclidian_distance * $euclidian_distance * $euclidian_distance; 
        
    $min_points: the minimal number of points in a region with a radius of $eps. $eps 
    and $min_points are the 2 parameters used to compute the denisty of a region. If 
    the number of points in a region with radius $eps is lower than $min_points the 
    point is considered as an outlier point that can't be included in any cluster.

=cut

sub new {
	my($type, $dataset, $eps, $min_points) = @_;
	
	my $self = {};
	$self->{dataset_object} = $dataset;
	$self->{dataset} = $dataset->{points};
	@{$self->{id_list}} = keys %{$dataset->{points}};
	$self->{eps} = $eps;
	$self->{min_points} = $min_points;
	$self->{current_cluster} = 1;
	$self->{use_external_region_index} = 0;
		
	bless($self, $type);

	return($self);
}

=head2 FindClusters

The main method that will run the DBSCAN algorithm on the Dataset.

=cut

sub FindClusters {
	my ($self, $starting_point_id) = @_;

	my $i = 0;
	unshift(@{$self->{id_list}}, $starting_point_id) if (defined $starting_point_id);
	foreach my $id (@{$self->{id_list}}) {
		my $point = $self->{dataset}->{$id};
		say "$i";
		$i++;
		next if ($point->{visited});
		$point->{visited} = 1;
		$self->_one_more_point_visited();
		
		my $neighborPts = $self->GetRegion($point);
#say Dumper($neighborPts);
		
		if (scalar(@$neighborPts) < $self->{min_points}) {
			$point->{cluster_id} = -1;
		}
		else {
			$self->ExpandCluster($point, $neighborPts);
		}
	}
}

=head2 ExpandCluster

This method will expand the cluster starting by the neighborhood of point $point

=cut

sub ExpandCluster {
	my ($self, $point, $neighborPts) = @_;
	
	if (scalar(@$neighborPts) < $self->{min_points}) {
		$point->{cluster_id} = -1;
	}
	else {
		$self->{current_cluster}++;

		$point->{cluster_id} = $self->{current_cluster};
	
		my %cluster_points;
		map { $cluster_points{$_}++ } @$neighborPts;
		my $cluster_expanded = 0;
		do {
			$cluster_expanded = 0;
			foreach my $id (keys %cluster_points) {
				my $p = $self->{dataset}->{$id};
				unless ($p->{visited}) {
					$p->{visited} = 1;
					$self->_one_more_point_visited();
					
					my $neighborPtsOfClusterMember = $self->GetRegion($p);
					if (scalar(@$neighborPtsOfClusterMember) >= $self->{min_points}) {
						map { $cluster_points{$_}++ } @$neighborPtsOfClusterMember;

say "Cluster [$self->{current_cluster}] has now [".scalar(keys %cluster_points)."] members, added region of point:[$p->{point_id}]";
						$cluster_expanded = 1;
						last;
					}
				}

				$p->{cluster_id} = $self->{current_cluster} unless($p->{cluster_id});
			}
		}
		while($cluster_expanded);
	}
}

=head2 GetRegion

Find all points in the dataset that are in the neighborhood of $point

=cut

sub GetRegion {
	my ($self, $point) = @_;

	my $result; 
	
	my $coordinate_id = join(',', @{$point->{coordinates}});
	if ($self->{use_external_region_index}) {
		my $fh = $self->{region_index_filehandle};
		seek($fh, $self->{region_seek_index}->{$point->{id}}, 0) or return;
		my $region_str = <$fh>;
		my @points = split(/\s+/, $region_str);
		shift(@points);
		$result = \@points;
	}
	else {
		unless ($self->{point_neighbourhood_cache}->{$coordinate_id}) {
			my @region;
			
			foreach my $region_candidate_point_id (@{$self->{id_list}}) {
				push(@region, $region_candidate_point_id) if ($self->{dataset}->{$region_candidate_point_id}->Distance($point) < $self->{eps});
			}
			$self->{point_neighbourhood_cache}->{$coordinate_id} = \@region;
		}
		
		$result = $self->{point_neighbourhood_cache}->{$coordinate_id};
	}
	
	return $result;
}

=head2 UseRegionIndex

For huge datasets a region index can be generated separately (and using multiple cores).
The index is a list of regions for each point in the dataset.

=cut

sub UseRegionIndex {
	my ($self, $region_index_filename) = @_;

	open(my $fh,  "<", $region_index_filename);
	my $offset = 0;

	while (<$fh>) {
		my @points = split(/\s+/, $_);
		$self->{region_seek_index}->{$points[0]} = $offset;
		$offset = tell($fh);
	}
		
	$self->{use_external_region_index} = 1;
	$self->{region_index_filehandle} = $fh;



( run in 1.001 second using v1.01-cache-2.11-cpan-5b529ec07f3 )