Algorithm-DBSCAN

 view release on metacpan or  search on metacpan

lib/Algorithm/DBSCAN.pm  view on Meta::CPAN


=cut

=head2 new

The constructor takes 3 parameters:
    
    $dataset: The Algorithm::DBSCAN::Dataset dataset object
        
        Create the Dataset object:
            my $dataset = Algorithm::DBSCAN::Dataset->new();
        
        Add points (the first parameter is a point_id the other are point coordinates)
            $dataset->AddPoint(new Algorithm::DBSCAN::Point('point_1', 1, 2, 3, 4, 5);
            
    $eps: The epsilon parameter used for region density computation
        WARNING: This implementation uses the sqare distance between the points to avoid 
        a useless square root call. If you want to use the euclidian distance you need to 
        convert it to the right value yourself.
        
        For example for the previous point with 5 dimensions 
        $eps = $euclidian_distance * $euclidian_distance * $euclidian_distance * $euclidian_distance * $euclidian_distance; 
        
    $min_points: the minimal number of points in a region with a radius of $eps. $eps 
    and $min_points are the 2 parameters used to compute the denisty of a region. If 
    the number of points in a region with radius $eps is lower than $min_points the 
    point is considered as an outlier point that can't be included in any cluster.

=cut

sub new {
	my($type, $dataset, $eps, $min_points) = @_;
	
	my $self = {};
	$self->{dataset_object} = $dataset;
	$self->{dataset} = $dataset->{points};
	@{$self->{id_list}} = keys %{$dataset->{points}};
	$self->{eps} = $eps;
	$self->{min_points} = $min_points;
	$self->{current_cluster} = 1;
	$self->{use_external_region_index} = 0;
		
	bless($self, $type);

	return($self);
}

=head2 FindClusters

The main method that will run the DBSCAN algorithm on the Dataset.

=cut

sub FindClusters {
	my ($self, $starting_point_id) = @_;

	my $i = 0;
	unshift(@{$self->{id_list}}, $starting_point_id) if (defined $starting_point_id);
	foreach my $id (@{$self->{id_list}}) {
		my $point = $self->{dataset}->{$id};
		say "$i";
		$i++;
		next if ($point->{visited});
		$point->{visited} = 1;
		$self->_one_more_point_visited();
		
		my $neighborPts = $self->GetRegion($point);
#say Dumper($neighborPts);
		
		if (scalar(@$neighborPts) < $self->{min_points}) {
			$point->{cluster_id} = -1;
		}
		else {
			$self->ExpandCluster($point, $neighborPts);
		}
	}
}

=head2 ExpandCluster

This method will expand the cluster starting by the neighborhood of point $point

=cut

sub ExpandCluster {
	my ($self, $point, $neighborPts) = @_;
	
	if (scalar(@$neighborPts) < $self->{min_points}) {
		$point->{cluster_id} = -1;
	}
	else {
		$self->{current_cluster}++;

		$point->{cluster_id} = $self->{current_cluster};
	
		my %cluster_points;
		map { $cluster_points{$_}++ } @$neighborPts;
		my $cluster_expanded = 0;
		do {
			$cluster_expanded = 0;
			foreach my $id (keys %cluster_points) {
				my $p = $self->{dataset}->{$id};
				unless ($p->{visited}) {
					$p->{visited} = 1;
					$self->_one_more_point_visited();
					
					my $neighborPtsOfClusterMember = $self->GetRegion($p);
					if (scalar(@$neighborPtsOfClusterMember) >= $self->{min_points}) {
						map { $cluster_points{$_}++ } @$neighborPtsOfClusterMember;

say "Cluster [$self->{current_cluster}] has now [".scalar(keys %cluster_points)."] members, added region of point:[$p->{point_id}]";
						$cluster_expanded = 1;
						last;
					}
				}

				$p->{cluster_id} = $self->{current_cluster} unless($p->{cluster_id});
			}
		}
		while($cluster_expanded);
	}
}

=head2 GetRegion

Find all points in the dataset that are in the neighborhood of $point

=cut

sub GetRegion {
	my ($self, $point) = @_;

	my $result; 
	
	my $coordinate_id = join(',', @{$point->{coordinates}});
	if ($self->{use_external_region_index}) {
		my $fh = $self->{region_index_filehandle};
		seek($fh, $self->{region_seek_index}->{$point->{id}}, 0) or return;
		my $region_str = <$fh>;
		my @points = split(/\s+/, $region_str);
		shift(@points);
		$result = \@points;
	}
	else {
		unless ($self->{point_neighbourhood_cache}->{$coordinate_id}) {
			my @region;
			
			foreach my $region_candidate_point_id (@{$self->{id_list}}) {
				push(@region, $region_candidate_point_id) if ($self->{dataset}->{$region_candidate_point_id}->Distance($point) < $self->{eps});
			}
			$self->{point_neighbourhood_cache}->{$coordinate_id} = \@region;
		}
		
		$result = $self->{point_neighbourhood_cache}->{$coordinate_id};
	}
	
	return $result;
}

=head2 UseRegionIndex

For huge datasets a region index can be generated separately (and using multiple cores).
The index is a list of regions for each point in the dataset.

=cut

sub UseRegionIndex {
	my ($self, $region_index_filename) = @_;

	open(my $fh,  "<", $region_index_filename);
	my $offset = 0;

	while (<$fh>) {
		my @points = split(/\s+/, $_);
		$self->{region_seek_index}->{$points[0]} = $offset;
		$offset = tell($fh);
	}
		
	$self->{use_external_region_index} = 1;
	$self->{region_index_filehandle} = $fh;
}

=head2 PrintClusters

Will print the contents of the clusters

=cut

sub PrintClusters {
	my ($self, $point) = @_;

	my %clusters;
	
	foreach my $point (@{$self->{dataset}}) {
		push(@{$clusters{$point->{cluster_id}}}, $point->{point_id});
	}
	
	foreach my $cluster_id (sort keys %clusters) {
		say "CLUSTER: $cluster_id";
		foreach my $point_id (sort @{$clusters{$cluster_id}}) {
			my $min_distance = 1000000000000;
			my $closest_point_id;
			foreach my $distance_point_id (sort @{$clusters{$cluster_id}}) {
				if ($distance_point_id ne $point_id) {
					my $this_point = $self->{dataset_object}->GetPointById($point_id);
					my $distance_point = $self->{dataset_object}->GetPointById($distance_point_id);
					
					my $distance = $this_point->Distance($distance_point);
					
					if ($distance < $min_distance) {
						$min_distance = $distance;
						$closest_point_id = $distance_point_id;
					}
				}
			}
			
			say "\t$point_id : (closest point: $closest_point_id, distance: $min_distance)";
		}
	}
}

=head2 PrintClustersShort

Will print the contents of the clusters (abreviated version)

=cut

sub PrintClustersShort {
	my ($self) = @_;

	my %clusters;

	foreach my $id (keys %{$self->{dataset}}) {
	my $point = $self->{dataset}->{$id};
		push(@{$clusters{$point->{cluster_id}}}, $point->{point_id});
	}

	foreach my $cluster_id (sort keys %clusters) {
	say "CLUSTER: $cluster_id, [".scalar(@{$clusters{$cluster_id}})."] points";
	my $nb = 0;
		foreach my $point_id (sort @{$clusters{$cluster_id}}) {
			$nb++;
			say "\t$point_id";
			last if ($nb >= 100);
		}
	}
}

=head2 _one_more_point_visited

Simple method used to display progress

=cut

sub _one_more_point_visited {
	my ($self) = @_;
	
	$self->{nb_visited_points}++;
	$self->{start_time} = time() unless ($self->{start_time});
	my $eta = time() + ((time() - $self->{start_time})/$self->{nb_visited_points})*(500000);
	my($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime($eta);

	say "ETA:".sprintf("%04d-%02d-%02d %02d:%02d:%02d",$year+1900,$mon+1,$mday,$hour,$min,$sec);
	say "nb visited:".$self->{nb_visited_points};
}

=head1 AUTHOR

Michal TOMA, C<< <mtoma at cpan.org> >>

=head1 BUGS

Please report any bugs or feature requests on github: L<https://github.com/mtoma/Algorithm-DBSCAN>

By e-mail to C<bug-algorithm-dbscan at rt.cpan.org>, or through
the web interface at L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=Algorithm-DBSCAN>.  I will be notified, and then you'll
automatically be notified of progress on your bug as I make changes.




=head1 SUPPORT

You can find documentation for this module with the perldoc command.

    perldoc Algorithm::DBSCAN


You can also look for information at:

=over 5

=item * Github: Issues (report bugs here)

L<https://github.com/mtoma/Algorithm-DBSCAN>

=item * RT: CPAN's request tracker (report bugs here)

L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=Algorithm-DBSCAN>

=item * AnnoCPAN: Annotated CPAN documentation

L<http://annocpan.org/dist/Algorithm-DBSCAN>

=item * CPAN Ratings

L<http://cpanratings.perl.org/d/Algorithm-DBSCAN>

=item * Search CPAN

L<http://search.cpan.org/dist/Algorithm-DBSCAN/>

=back


=head1 LICENSE AND COPYRIGHT

Copyright 2016 Michal TOMA.

This program is free software; you can redistribute it and/or modify it
under the terms of the the Artistic License (2.0). You may obtain a
copy of the full license at:

L<http://www.perlfoundation.org/artistic_license_2_0>



( run in 0.667 second using v1.01-cache-2.11-cpan-483215c6ad5 )