Algorithm-ExpectationMaximization

 view release on metacpan or  search on metacpan

examples/canned_example1.pl  view on Meta::CPAN

#  to be used for clustering.
# 'N' presumably marks the symbolic record-label field and each '1' a numeric
# field to cluster on -- TODO confirm against the module's POD.
my $mask = "N11";    

# Construct the clusterer: 3 Gaussian components, random seeding, at most
# 300 EM iterations, progress printed to the terminal, debug output off.
my $clusterer = Algorithm::ExpectationMaximization->new(
                                datafile            => $datafile,
                                mask                => $mask,
                                K                   => 3,
                                max_em_iterations   => 300,
                                seeding             => 'random',
                                terminal_output     => 1,
                                debug               => 0,
                );

# Load the records from $datafile into the clusterer object.
$clusterer->read_data_from_file();

# For visualizing the raw data:
my $data_visualization_mask = "11";
$clusterer->visualize_data($data_visualization_mask);
$clusterer->plot_hardcopy_data($data_visualization_mask);

# NOTE(review): srand(time) is a low-entropy seed; fine for a demo, but runs
# started within the same second will get identical "random" seedings.
srand(time);

examples/canned_example2.pl  view on Meta::CPAN

# 'N' presumably marks the record-label field and each '1' a numeric field to
# cluster on -- TODO confirm against the module's POD.
my $mask = "N11";                      

# Construct the clusterer for K=2 components; the commented-out line shows
# how to switch from random to kmeans-based seeding.
my $clusterer = Algorithm::ExpectationMaximization->new(
                                datafile            => $datafile,
                                mask                => $mask,
                                K                   => 2,
                                max_em_iterations   => 300,
                                seeding             => 'random',
#                               seeding             => 'kmeans',
                                terminal_output     => 1,
                                debug               => 0,
                );

# Load the records from $datafile into the clusterer object.
$clusterer->read_data_from_file();

# Visualize the raw two-dimensional data on screen and as hardcopy.
my $data_visualization_mask = "11";
$clusterer->visualize_data($data_visualization_mask);
$clusterer->plot_hardcopy_data($data_visualization_mask);

# NOTE(review): srand(time) is a low-entropy seed; runs started within the
# same second will get identical "random" seedings.
srand(time);
$clusterer->seed_the_clusters();

examples/canned_example3.pl  view on Meta::CPAN

# 'N' presumably marks the record-label field and each '1' a numeric field to
# cluster on -- TODO confirm against the module's POD.
my $mask = "N11";  

# Construct the clusterer with *manual* seeding: the records tagged 'a7',
# 'b3' and 'c4' are used as the initial seeds for the K=3 clusters
# (presumably one seed tag per cluster -- verify against the module's POD).
my $clusterer = Algorithm::ExpectationMaximization->new(
                                datafile            => $datafile,
                                mask                => $mask,
                                K                   => 3,
                                max_em_iterations   => 300,
                                seeding             => 'manual',
                                seed_tags           => ['a7', 'b3', 'c4'],
                                terminal_output     => 1,
                                debug               => 0,
                );

# Load the records from $datafile into the clusterer object.
$clusterer->read_data_from_file();

# Visualize the raw two-dimensional data on screen and as hardcopy.
my $data_visualization_mask = "11";
$clusterer->visualize_data($data_visualization_mask);
$clusterer->plot_hardcopy_data($data_visualization_mask);

# NOTE(review): srand(time) is a low-entropy seed; runs started within the
# same second behave identically.
srand(time);
$clusterer->seed_the_clusters();

examples/canned_example4.pl  view on Meta::CPAN


# Three-dimensional data this time: 'N' presumably marks the record-label
# field and the three '1's the coordinates -- TODO confirm against the POD.
my $mask = "N111";  

# Construct the clusterer: 3 Gaussian components, random seeding, at most
# 300 EM iterations.
my $clusterer = Algorithm::ExpectationMaximization->new(
                                datafile            => $datafile,
                                mask                => $mask,
                                K                   => 3,
                                max_em_iterations   => 300,
                                seeding             => 'random',
                                terminal_output     => 1,
                                debug               => 0,
                );

# Load the records from $datafile into the clusterer object.
$clusterer->read_data_from_file();

# Visualize the raw three-dimensional data on screen and as hardcopy.
my $data_visualization_mask = "111";
$clusterer->visualize_data($data_visualization_mask);
$clusterer->plot_hardcopy_data($data_visualization_mask);

# NOTE(review): srand(time) is a low-entropy seed; runs started within the
# same second behave identically.
srand(time);
$clusterer->seed_the_clusters();

examples/canned_example5.pl  view on Meta::CPAN


# Three-dimensional data: 'N' presumably marks the record-label field and the
# three '1's the coordinates -- TODO confirm against the POD.
my $mask = "N111";  

# Construct the clusterer: 3 Gaussian components, random seeding, at most
# 300 EM iterations.
my $clusterer = Algorithm::ExpectationMaximization->new(
                                datafile              => $datafile,
                                mask                  => $mask,
                                K                     => 3,
                                max_em_iterations     => 300,
                                seeding               => 'random',
                                terminal_output       => 1,
                                debug                 => 0,
                );

# Load the records from $datafile into the clusterer object.
$clusterer->read_data_from_file();

# Visualize the raw three-dimensional data on screen and as hardcopy.
my $data_visualization_mask = "111";
$clusterer->visualize_data($data_visualization_mask);
$clusterer->plot_hardcopy_data($data_visualization_mask);

# NOTE(review): srand(time) is a low-entropy seed; runs started within the
# same second behave identically.
srand(time);
$clusterer->seed_the_clusters();

examples/canned_example6.pl  view on Meta::CPAN


# One-dimensional data: 'N' presumably marks the record-label field and the
# single '1' the lone coordinate -- TODO confirm against the POD.
my $mask = "N1";    

# Construct the clusterer: 2 Gaussian components, random seeding, at most
# 300 EM iterations.
my $clusterer = Algorithm::ExpectationMaximization->new(
                                datafile            => $datafile,
                                mask                => $mask,
                                K                   => 2,
                                max_em_iterations   => 300,
                                seeding             => 'random',
                                terminal_output     => 1,
                                debug               => 0,
                );

# Load the records from $datafile into the clusterer object.
$clusterer->read_data_from_file();

# Visualize the raw one-dimensional data on screen and as hardcopy.
my $data_visualization_mask = "1";
$clusterer->visualize_data($data_visualization_mask);
$clusterer->plot_hardcopy_data($data_visualization_mask);

# NOTE(review): srand(time) is a low-entropy seed; runs started within the
# same second behave identically.
srand(time);
$clusterer->seed_the_clusters();

lib/Algorithm/ExpectationMaximization.pm  view on Meta::CPAN

          if check_for_illegal_params(@params) == 0;
    # Build the object hash.  NOTE(review): the "|| croak"/"|| default"
    # pattern rejects falsy-but-meaningful values (e.g. K => 0 would croak,
    # terminal_output => 0 is indistinguishable from absent) -- acceptable
    # here since K must be a positive cluster count anyway.
    bless {
        # Required constructor arguments:
        _datafile         =>   $args{datafile} || croak("datafile required"),
        _mask             =>   $args{mask}     || croak("mask required"),
        _K                =>   $args{K}       || croak("number of clusters required"),
        # Optional arguments with their defaults:
        _terminal_output  =>   $args{terminal_output}   || 0,
        _seeding          =>   $args{seeding}           || 'random',
        _seed_tags        =>   $args{seed_tags}         || [],
        _max_em_iterations=>   $args{max_em_iterations} || 100,
        _class_priors     =>   $args{class_priors}      || [],
        _debug            =>   $args{debug}             || 0,
        # Internal state, presumably filled in by read_data_from_file() and
        # the EM iterations -- confirm against the rest of the module:
        _N                =>   0,
        _data             =>   {},
        _data_id_tags     =>   [],
        _clusters         =>   [],
        _cluster_centers  =>   [],
        _data_dimensions  =>   0,
        _cluster_normalizers            => [],
        _cluster_means                  => [],
        _cluster_covariances            => [],
        _class_labels_for_data          => {},

lib/Algorithm/ExpectationMaximization.pm  view on Meta::CPAN

            # Fresh per-point posterior list for this EM iteration.
            $self->{_expected_class_probs}->{$data_id} = [];
        }
        # --- E step ---
        # Calculate prob(x | C_i) --- this is the prob of data point x as
        # a member of class C_i. You must do this for all K classes:
        foreach my $cluster_index(0..$self->{_K}-1) {
            $self->find_prob_at_each_datapoint_for_given_mean_and_covar(
                $self->{_cluster_means}->[$cluster_index],
                $self->{_cluster_covariances}->[$cluster_index] );
        }
        # Normalizers (effective cluster sizes) are re-accumulated from
        # scratch on every iteration.
        $self->{_cluster_normalizers} = [];
        if ($self->{_debug}) {
            print "\n\nDisplaying prob of a data point vis-a-vis each class:\n\n";
            foreach my $data_id (sort keys %{$self->{_data}}) {
                my $class_probs_at_a_point = 
                      $self->{_class_probs_at_each_data_point}->{$data_id};
                print "Class probs for $data_id: @$class_probs_at_a_point\n"
            }
        }
        # Calculate prob(C_i | x) which is the posterior prob of class
        # considered as a r.v. to be C_i at a given point x.  For a given
        # x, the sum of such probabilities over all C_i must add up to 1:
        $self->find_expected_classes_at_each_datapoint();
        if ($self->{_debug}) {
            print "\n\nDisplaying expected class probs at each data point:\n\n";
            foreach my $data_id (sort keys %{$self->{_expected_class_probs}}) {
                my $expected_classes_at_a_point = 
                                   $self->{_expected_class_probs}->{$data_id};
                print "Expected classes $data_id: @$expected_classes_at_a_point\n";
            }
        }
        # --- M step ---
        # 1. UPDATE MEANS:        
        my @new_means;
        foreach my $cluster_index(0..$self->{_K}-1) {         

lib/Algorithm/ExpectationMaximization.pm  view on Meta::CPAN

                $data_vec->set_col(0,$data_record);
                # Responsibility-weighted running sum of the data vectors:
                # posterior prob of this cluster at this point times the point.
                $new_means[$cluster_index] +=  
                    $self->{_expected_class_probs}->{$data_id}->[$cluster_index] *
                    $data_vec->copy();
                # Normalizer = sum of posteriors = effective size of the cluster.
                $self->{_cluster_normalizers}->[$cluster_index] +=
                    $self->{_expected_class_probs}->{$data_id}->[$cluster_index];
            }
            # New mean = weighted sum / effective cluster size.
            # NOTE(review): divides by zero if a cluster's responsibilities
            # all collapse to 0 -- presumably prevented elsewhere; confirm.
            $new_means[$cluster_index] *= 1.0 / 
                                 $self->{_cluster_normalizers}->[$cluster_index];
        }
        if ($self->{_debug}) {
            foreach my $meanvec (@new_means) {
                display_matrix("At EM Iteration $em_iteration, new mean vector is", 
                                                                          $meanvec);
            }
        }
        $self->{_cluster_means} = \@new_means;
        # 2. UPDATE COVARIANCES:
        my @new_covariances;
        foreach my $cluster_index(0..$self->{_K}-1) {         
            $new_covariances[$cluster_index] = 

lib/Algorithm/ExpectationMaximization.pm  view on Meta::CPAN

        $self->{_cluster_covariances} = \@new_covariances;
        # 3. UPDATE PRIORS:
        # Keep a two-deep history of the priors (previous and
        # previous-to-previous) -- presumably consumed by
        # reached_convergence(); confirm against that method.
        $self->{_old_old_priors} = deep_copy_array( $self->{_old_priors} )
                                    if @{$self->{_old_priors}} > 0;
        $self->{_old_priors} = deep_copy_array( $self->{_class_priors} ); 
        # New prior for each cluster = its effective size / total point count.
        foreach my $cluster_index(0..$self->{_K}-1) {      
            $self->{_class_priors}->[$cluster_index] = 
                 $self->{_cluster_normalizers}->[$cluster_index] / $self->{_N};
        }
        my @priors = @{$self->{_class_priors}};
        print "\nUpdated priors: @priors\n\n\n" if $self->{_debug};
        # Record both clustering-quality measures for this iteration.
        push @{$self->{_fisher_quality_vs_iteration}}, 
                                                 $self->clustering_quality_fisher();
        push @{$self->{_mdl_quality_vs_iteration}}, $self->clustering_quality_mdl();
        # Stop when converged (but never before iteration 6, so the prior
        # history is populated) or when the iteration budget is exhausted.
        if ( ($em_iteration > 5 && $self->reached_convergence()) 
             || ($em_iteration ==  $self->{_max_em_iterations} - 1) ) {
            my @old_old_priors = @{$self->{_old_old_priors}};  
            my @old_priors = @{$self->{_old_priors}};
            print "\n\nPrevious to previous priors:       @old_old_priors\n";
            print "Previous priors:                   @old_priors\n";
            print "Current class priors:              @{$self->{_class_priors}}\n";

lib/Algorithm/ExpectationMaximization.pm  view on Meta::CPAN

        push @probability_distributions, [];
    }
    # For each cluster, collect the IDs of the data points whose Gaussian
    # density under that cluster's (mean, covariance) exceeds the
    # caller-supplied threshold $theta.
    foreach my $cluster_index (0..$self->{_K}-1) {
        my $mean_vec = $self->{_cluster_means}->[$cluster_index];
        my $covar = $self->{_cluster_covariances}->[$cluster_index];
        foreach my $data_id (keys %{$self->{_data}}) {
            # Column vector holding this record's coordinates.
            my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);
            $data_vec->set_col( 0, $self->{_data}->{$data_id});
            my $datavec_minus_mean = $data_vec - $mean_vec;
            display_matrix( "datavec minus mean is ", $datavec_minus_mean )
                                    if $self->{_debug};
            # Mahalanobis exponent: -0.5 * (x-mu)' * inv(Sigma) * (x-mu).
            my $exponent = undef;
            if ($self->{_data_dimensions} > 1) {
                $exponent = -0.5 * vector_matrix_multiply( 
                                               transpose($datavec_minus_mean),
                                matrix_vector_multiply($covar->inverse(), $datavec_minus_mean ) );
            } else {
                # One-dimensional case: the 1x1 "matrices" reduce to scalars.
                my @var_inverse = $covar->inverse()->as_list;
                my $var_inverse_val = $var_inverse[0];
                my @data_minus_mean = $datavec_minus_mean->as_list;
                my $data_minus_mean_val = $data_minus_mean[0];
                $exponent = -0.5 * ($data_minus_mean_val ** 2) * $var_inverse_val;
            }
            print "\nThe value of the exponent is: $exponent\n\n" if $self->{_debug};
            # Gaussian normalizer: 1 / sqrt( (2*pi)^d * |Sigma| ).
            # FIX(review): the original divided by a *reference* (a stray "\"
            # before the parenthesized term, which numifies to a memory
            # address) and applied sqrt() to the determinant only.  This now
            # matches the correct normalizer used in
            # find_prob_at_each_datapoint_for_given_mean_and_covar().
            my $coefficient = 1.0 / 
                 sqrt( ((2 * $Math::GSL::Const::M_PI) ** $self->{_data_dimensions}) * $covar->det() );
            my $prob = $coefficient * exp($exponent);
            # Keep the point for this cluster only if its density clears theta.
            push @{$probability_distributions[$cluster_index]}, $data_id 
                if $prob > $theta;
        }
    }
    return \@probability_distributions;
}

lib/Algorithm/ExpectationMaximization.pm  view on Meta::CPAN

    print "\n\nMDL Quality vs. Iterations: @{$self->{_mdl_quality_vs_iteration}}\n\n";
}

# For every data point, evaluate the Gaussian density N(x; mean, covar) for
# the supplied mean vector and covariance.  The one-dimensional case is
# handled specially: the covariance may arrive either as a 1x1
# Math::GSL::Matrix (blessed) or as a plain scalar variance.
sub find_prob_at_each_datapoint_for_given_mean_and_covar {
    my $self = shift;
    my $mean_vec_ref = shift;
    my $covar_ref = shift;
    foreach my $data_id (keys %{$self->{_data}}) {
        # Column vector holding this record's coordinates.
        my $data_vec = Math::GSL::Matrix->new($self->{_data_dimensions},1);        
        $data_vec->set_col( 0, $self->{_data}->{$data_id});
        if ($self->{_debug}) {
            display_matrix("data vec in find prob function", $data_vec);
            display_matrix("mean vec in find prob function", $mean_vec_ref);
            display_matrix("covariance in find prob function", $covar_ref);
        }
        my $datavec_minus_mean = $data_vec - $mean_vec_ref;
        display_matrix( "datavec minus mean is ", $datavec_minus_mean ) if $self->{_debug};
        # Mahalanobis exponent: -0.5 * (x-mu)' * inv(Sigma) * (x-mu).
        my $exponent = undef;
        if ($self->{_data_dimensions} > 1) {
            $exponent = -0.5 * vector_matrix_multiply( transpose($datavec_minus_mean),
                matrix_vector_multiply( $covar_ref->inverse(), $datavec_minus_mean ) );
        } elsif (defined blessed($covar_ref)) {
            # 1-D, covariance supplied as a 1x1 matrix: extract the scalar
            # variance and compute -0.5 * (x-mu)^2 / var directly.
            my @data_minus_mean = $datavec_minus_mean->as_list;
            my $data_minus_mean_val = $data_minus_mean[0];
            my @covar_as_matrix = $covar_ref->as_list;
            my $covar_val = $covar_as_matrix[0];
            $exponent = -0.5 * ($data_minus_mean_val ** 2) / $covar_val;
        } else {
            # 1-D, covariance supplied as a plain scalar variance.
            my @data_minus_mean = $datavec_minus_mean->as_list;
            my $data_minus_mean_val = $data_minus_mean[0];
            $exponent = -0.5 * ($data_minus_mean_val ** 2) / $covar_ref;
        }
        print "\nThe value of the exponent is: $exponent\n\n" if $self->{_debug};
        # Normalizer: 1/sqrt((2*pi)^d * |Sigma|) in general; 1/sqrt(2*pi*var)
        # in the one-dimensional branches.
        my $coefficient = undef;
        if ($self->{_data_dimensions} > 1) {
            $coefficient = 1.0 / sqrt( ((2 * $Math::GSL::Const::M_PI) ** $self->{_data_dimensions}) * 
                                                                                $covar_ref->det()) ;
        } elsif (!defined blessed($covar_ref)) {
            $coefficient =  1.0 / sqrt(2 * $covar_ref * $Math::GSL::Const::M_PI);
        } else {         
            # Scalar variance pulled out of the 1x1 covariance matrix.
            my @covar_as_matrix = $covar_ref->as_list;
            my $covar_val = $covar_as_matrix[0];
            $coefficient =  1.0 / sqrt(2 * $covar_val * $Math::GSL::Const::M_PI);

lib/Algorithm/ExpectationMaximization.pm  view on Meta::CPAN

        }
        # Mean-center every column (data point) of the matrix.
        foreach my $j (0..$num_cols-1) {
            my @new_col = ($matrix->col($j) - $mean_vec)->as_list;
            $matrix->set_col($j, \@new_col);
        }
        # Covariance of the centered data: (1/n) * X * X' -- the biased
        # (maximum-likelihood) estimator, dividing by n rather than n-1.
        my $transposed = transpose( $matrix );
        my $covariance = matrix_multiply( $matrix, $transposed );
        $covariance *= 1.0 / $num_cols;
        push @seed_based_covariances, $covariance;
        display_matrix("Displaying the seed covariance",  $covariance )
            if $self->{_debug};
    }
    # Parallel lists: one mean vector and one covariance per seed.
    return (\@seed_mean_vecs, \@seed_based_covariances);
}

# The most popular seeding mode for EM is random. We include two other seeding modes
# --- kmeans and manual --- since they do produce good results for specialized cases.
# For example, when the clusters in your data are non-overlapping and not too
# anisotropic, the kmeans based seeding should work at least as well as the random
# seeding.  In such cases --- AND ONLY IN SUCH CASES --- the kmeans based seeding has
# the advantage of avoiding the getting stuck in a local-maximum problem of the EM

lib/Algorithm/ExpectationMaximization.pm  view on Meta::CPAN

    my $K = shift;
    # One-dimensional data: no projection needed -- find K peaks in the
    # histogram of the raw values directly.
    if ($self->{_data_dimensions} == 1) {
        my @one_d_data;
        foreach my $j (0..$self->{_N}-1) {
            my $tag = $self->{_data_id_tags}[$j];     
            push @one_d_data, $self->{_data}->{$tag}->[0];
        }
        my @peak_points = 
                    find_peak_points_in_given_direction(\@one_d_data,$K);
        print "highest points at data values: @peak_points\n" 
                                                         if $self->{_debug};
        # Each 1-D peak becomes a single-coordinate cluster center.
        my @cluster_centers;
        foreach my $peakpoint (@peak_points) {
            push @cluster_centers, [$peakpoint];
        }
        return \@cluster_centers;
    }
    # Multidimensional data: lay the records out as the columns of a
    # dimensions-by-N matrix.
    my ($num_rows,$num_cols) = ($self->{_data_dimensions},$self->{_N});
    my $matrix = Math::GSL::Matrix->new($num_rows,$num_cols);
    my $mean_vec = Math::GSL::Matrix->new($num_rows,1);
    # All the record labels are stored in the array $self->{_data_id_tags}.
    # The actual data for clustering is stored in a hash at $self->{_data}
    # whose keys are the record labels; the value associated with each
    # key is the array holding the corresponding numerical multidimensional
    # data.
    foreach my $j (0..$num_cols-1) {
        my $tag = $self->{_data_id_tags}[$j];     
        my $data = $self->{_data}->{$tag};
        $matrix->set_col($j, $data);
    }
    if ($self->{_debug}) {
        print "\nDisplaying the original data as a matrix:";
        display_matrix( $matrix ); 
    }
    # Global mean vector of the data, then subtract it from every column so
    # the data is centered at the origin.
    foreach my $j (0..$num_cols-1) {
        $mean_vec += $matrix->col($j);
    }
    $mean_vec *=  1.0 / $num_cols;
    if ($self->{_debug}) {
        print "Displaying the mean vector for the data:";
        display_matrix( $mean_vec );
    }
    foreach my $j (0..$num_cols-1) {
        my @new_col = ($matrix->col($j) - $mean_vec)->as_list;
        $matrix->set_col($j, \@new_col);
    }
    if ($self->{_debug}) {
        print "Displaying mean subtracted data as a matrix:";
        display_matrix( $matrix ); 
    }
    my $transposed = transpose( $matrix );
    if ($self->{_debug}) {
        print "Displaying transposed data matrix:";
        display_matrix( $transposed );
    }
    # Biased covariance estimate of the centered data: (1/n) * X * X'.
    my $covariance = matrix_multiply( $matrix, $transposed );
    $covariance *= 1.0 / $num_cols;
    if ($self->{_debug}) {
        print "\nDisplaying the Covariance Matrix for your data:";
        display_matrix( $covariance );
    }
    # Eigen-decompose the covariance; the eigenvector with the largest
    # eigenvalue is the direction of maximum variance (first principal
    # component).
    my ($eigenvalues, $eigenvectors) = $covariance->eigenpair;
    my $num_of_eigens = @$eigenvalues;     
    my $largest_eigen_index = 0;
    my $smallest_eigen_index = 0;
    print "Eigenvalue 0:   $eigenvalues->[0]\n" if $self->{_debug};
    foreach my $i (1..$num_of_eigens-1) {
        $largest_eigen_index = $i if $eigenvalues->[$i] > 
                                     $eigenvalues->[$largest_eigen_index];
        $smallest_eigen_index = $i if $eigenvalues->[$i] < 
                                     $eigenvalues->[$smallest_eigen_index];
        print "Eigenvalue $i:   $eigenvalues->[$i]\n" if $self->{_debug};
    }
    print "\nlargest eigen index: $largest_eigen_index\n" if $self->{_debug};
    print "\nsmallest eigen index: $smallest_eigen_index\n\n" 
                                                          if $self->{_debug};
    foreach my $i (0..$num_of_eigens-1) {
        my @vec = $eigenvectors->[$i]->as_list;
        print "Eigenvector $i:   @vec\n" if $self->{_debug};
    }
    my @largest_eigen_vec = $eigenvectors->[$largest_eigen_index]->as_list;
    print "\nLargest eigenvector:   @largest_eigen_vec\n" if $self->{_debug};
    my @max_var_direction;
    # Each element of the array @largest_eigen_vec is a Math::Complex object
    # stringified as "[magnitude,theta]"; a real eigenvector component has
    # theta of 0 (positive) or pi (negative).
    # NOTE(review): the regex requires a decimal point in the magnitude
    # (\d*\.\d+), so a magnitude stringified as a bare integer would not
    # match -- confirm Math::Complex always emits a fractional part here.
    foreach my $k (0..@largest_eigen_vec-1) {
        my ($mag, $theta) = $largest_eigen_vec[$k] =~ /\[(\d*\.\d+),(\S+)\]/;
        if ($theta eq '0') {
            $max_var_direction[$k] = $mag;
        } elsif ($theta eq 'pi') {
            $max_var_direction[$k] = -1.0 * $mag;
        } else {
            die "eigendecomposition of covariance matrix produced a complex eigenvector --- something is wrong";
        }
    }
    # Report the maximum variance direction when debugging.
    print "\nMaximum Variance Direction: @max_var_direction\n\n" 
                                                 if $self->{_debug};
    # We now project all data points on the largest eigenvector.
    # Each projection will yield a single point on the eigenvector.
    my @projections;
    foreach my $j (0..$self->{_N}-1) {
        my $tag = $self->{_data_id_tags}[$j];     
        my $data = $self->{_data}->{$tag};
        die "Dimensionality of the largest eigenvector does not "
            . "match the dimensionality of the data" 
          unless @max_var_direction == $self->{_data_dimensions};
        my $projection = vector_multiply($data, \@max_var_direction);
        push @projections, $projection;
    }
    print "All projection points: @projections\n" if $self->{_debug};
    # Find K peaks along the principal direction, then map each 1-D peak
    # back into the original space by scaling the direction vector.
    my @peak_points = find_peak_points_in_given_direction(\@projections, $K);
    print "highest points at points along largest eigenvec: @peak_points\n"
                                              if $self->{_debug};
    my @cluster_centers;
    foreach my $peakpoint (@peak_points) {
        my @actual_peak_coords = map {$peakpoint * $_} @max_var_direction;
        push @cluster_centers, \@actual_peak_coords;
    }
    return \@cluster_centers;
}

# Used by the kmeans part of the code: This method is called by the previous method
# to locate K peaks in a smoothed histogram of the data points projected onto the

lib/Algorithm/ExpectationMaximization.pm  view on Meta::CPAN

# Validate the option names handed to the constructor: each one must appear
# in the list of documented parameters for new().
sub check_for_illegal_params {
    my @params = @_;
    # Complete set of option names accepted by new().
    my @legal_params = qw/ datafile mask K terminal_output max_em_iterations
                           seeding class_priors seed_tags debug /;
    my $found_match_flag;
    foreach my $param (@params) {
        # Assume no match, then scan the legal list; a hit sets the flag
        # and short-circuits the scan.
        $found_match_flag = 0;
        foreach my $legal (@legal_params) {
            if ($param eq $legal) {
                $found_match_flag = 1;
                last;
            }
        }



( run in 1.466 second using v1.01-cache-2.11-cpan-49f99fa48dc )