Algorithm-TrunkClassifier


Algorithm/TrunkClassifier/FeatureSelection.xs

#include "EXTERN.h"
#include "perl.h"
#include "XSUB.h"

#include "ppport.h"
#include "src/feature_selection.h"

MODULE = Algorithm::TrunkClassifier::FeatureSelection			PACKAGE = Algorithm::TrunkClassifier::FeatureSelection

int
indTTest(expData, numFeatures, numSamples, sampleNames, normal, malign)
	double ** 	expData
	int 		numFeatures
	int 		numSamples
	char ** 	sampleNames
	char * 		normal
	char * 		malign

Algorithm/TrunkClassifier/ppport.h

ck_null|||
ck_open|||
ck_readline|||
ck_repeat|||
ck_require|||
ck_retarget|||
ck_return|||
ck_rfun|||
ck_rvconst|||
ck_sassign|||
ck_select|||
ck_shift|||
ck_sort|||
ck_spair|||
ck_split|||
ck_subr|||
ck_substr|||
ck_svconst|||
ck_trunc|||
ck_unpack|||
ckwarn_d||5.009003|

Algorithm/TrunkClassifier/src/feature_selection.c

/*
 * feature_selection.c (originally MaxClassPol.cpp)
 *
 *  Created on: 26 Feb 2012
 *      Author: Wolfcastle
 *
 *  Description
 *		This library implements the independent t-test
 */

#include "feature_selection.h"

/*
Description: Feature selection method that uses the independent t-test to select a feature
Parameters: (1) 2D expression data matrix, (2) number of matrix rows, (3) number of matrix columns, (4) list of column names,
            (5) first group symbol and (6) second group symbol
Return value: Index of the top t-value feature (row in data matrix)
*/
int indTTest(double** expData, int numFeatures, int numSamples, char** sampleNames, char* NORMAL, char* MALIGN){
	
	//Determine class sizes
	int sample;
	int numNormal = 0;
	int numMalign = 0;
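
The excerpt above cuts off before the statistic itself is computed. As a rough Perl-side
sketch of what indTTest does per feature (assuming the unpooled two-sample t statistic;
the C implementation may differ in detail, e.g. by pooling the variances). Note that the
sampleNames parameter carries the per-column class labels, as the call in
Classification.pm below shows; mean() and var() are local helpers:

    # For each row, split the values into the two classes using the class
    # labels, compute |t|, and return the row index with the largest |t|.
    sub ind_t_test {
        my ($expData, $classLabels, $normal, $malign) = @_;
        my ($bestIndex, $bestT) = (0, -1);
        for my $row (0 .. $#{$expData}) {
            my (@a, @b);
            for my $col (0 .. $#{$classLabels}) {
                push(@a, $expData->[$row][$col]) if $classLabels->[$col] eq $normal;
                push(@b, $expData->[$row][$col]) if $classLabels->[$col] eq $malign;
            }
            my $t = abs(mean(\@a) - mean(\@b))
                  / sqrt(var(\@a) / @a + var(\@b) / @b);
            ($bestIndex, $bestT) = ($row, $t) if $t > $bestT;
        }
        return $bestIndex;    # index of the top t-value feature
    }
    sub mean { my $s = 0; $s += $_ for @{$_[0]}; return $s / @{$_[0]}; }
    sub var  { my $m = mean($_[0]); my $s = 0; $s += ($_ - $m) ** 2 for @{$_[0]}; return $s / (@{$_[0]} - 1); }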

Changes

Revision history for Perl extension Algorithm::TrunkClassifier.

v1.0.1 March 2013
	- fixed inspect command output
	- cleaned up code in feature selection library
	- added Windows installation instructions to README
	- added PubMed link in documentation

v1.0.0 November 2012
	- initial release distribution of the module was built

MANIFEST

lib/Algorithm/TrunkClassifier.pm
lib/Algorithm/TrunkClassifier/CommandProcessor.pm
lib/Algorithm/TrunkClassifier/DataWrapper.pm
lib/Algorithm/TrunkClassifier/Classification.pm
lib/Algorithm/TrunkClassifier/FeatureSelection.pm
lib/Algorithm/TrunkClassifier/DecisionTrunk.pm
lib/Algorithm/TrunkClassifier/Util.pm
Algorithm/TrunkClassifier/FeatureSelection.xs
Algorithm/TrunkClassifier/typemap
Algorithm/TrunkClassifier/ppport.h
Algorithm/TrunkClassifier/src/feature_selection.h
Algorithm/TrunkClassifier/src/feature_selection.c
t/run_classifier.pl
t/test_data.txt
t/test_supp.txt
pod/TrunkClassifier.pod

lib/Algorithm/TrunkClassifier.pm

#Classifier arguments
my $CLASSIFY = "loocv";		#Classification procedure (loocv|split|dual)
my $SPLITPERCENT = 20;		#Percentage of samples to use as test set when using -p split
my $TESTSET = "";			#Name of test dataset when using -p dual
my $CLASSNAME = "TISSUE";	#Name of classification variable
my $OUTPUT = ".";			#Name of output folder
my $LEVELS = 0;				#Number of levels in decision trunks (forced)
my $PROSPECT = "";			#Check input data without running classifier
my $SUPPFILE = "";			#File containing class information
my $VERBOSE = 0;			#Report progress during classifier run
my $USEALL = 0;				#Circumvent level selection and use all trunks for classification
my $DATAFILE = "";			#File containing input data

#Description: Wrapper function for running the decision trunk classifier
#Parameters: Command line arguments
#Return value: None
sub runClassifier{
	#Handle command line arguments
	my $processor = Algorithm::TrunkClassifier::CommandProcessor->new(\$CLASSIFY, \$SPLITPERCENT, \$TESTSET, \$CLASSNAME, \$OUTPUT, \$LEVELS, \$PROSPECT, \$SUPPFILE, \$VERBOSE, \$USEALL, \$DATAFILE);
	$processor->processCmd(@_);
	

lib/Algorithm/TrunkClassifier/Classification.pm

			}
		}
	}
	elsif($CLASSIFY eq "dual"){
		$trainingSet = $dataWrapper->copy();
		$testSet = $testset->copy();
	}
	
	#Build trunks using leave-one-out
	my %featureOccurrence;
	my %selectedFeatures;
	my %looTrunks = ("1" => [], "2" => [], "3" => [], "4" => [], "5" => []);
	my $levelBreak = 0;
	for(my $levelLimit = 1; $levelLimit <= 5; $levelLimit++){
		if($VERBOSE){
			print("Trunk classifier: Building decision trunks with $levelLimit level(s) using leave-one-out\n");
		}
		
		#Build one trunk for each left out sample
		for(my $sampleIndex = 0; $sampleIndex < $trainingSet->getNumSamples(); $sampleIndex++){
			if($VERBOSE){
				print("Trunk classifier: Fold ", $sampleIndex + 1, " of ", $dataWrapper->getNumSamples(), "\n");
			}
			my $buildSet = $trainingSet->copy();
			$buildSet->leaveOneOut($sampleIndex);
			my $decisionTrunk = buildTrunk($buildSet, $levelLimit, $sampleIndex, \%featureOccurrence, \%selectedFeatures, \$levelBreak, $VERBOSE);
			
			#Add trunk to hash
			push(@{$looTrunks{$levelLimit}}, $decisionTrunk);
			
		}
		
		if($levelBreak){
			undef $featureOccurrence{$levelLimit};
			$looTrunks{$levelLimit} = [];
			last;

lib/Algorithm/TrunkClassifier/Classification.pm

	close(CTS_TRUNKS);
	close(REPORT);
	close(LOG);
	if($VERBOSE){
		print("Trunk classifier: Job finished\n");
	}
}

#Description: Wrapper for the trunk build loop
#Parameters: (1) Training dataset, (2) level limit, (3) sample index, (4) feature occurrence hash ref,
#            (5) selected features hash ref, (6) level break flag ref, (7) verbose flag
#Return value: Decision trunk object
sub buildTrunk($ $ $ $ $ $ $){
	my ($buildSet, $levelLimit, $sampleIndex, $featOccurRef, $selFeatRef, $levelBreakRef, $VERBOSE) = @_;
	
	#Trunk build loop
	my $decisionTrunk = Algorithm::TrunkClassifier::DecisionTrunk->new();
	my $noSampleBreak = 0;
	for(my $levelIndex = 1; $levelIndex <= $levelLimit; $levelIndex++){
	
		#Perform feature selection
		my $featureName;
		my $featureIndex;
		my @expRow;
		if(!$selFeatRef->{$sampleIndex}{$levelIndex}){
			$featureIndex = Algorithm::TrunkClassifier::FeatureSelection::indTTest(
				$buildSet->getDataMatrix(), $buildSet->getNumProbes(),
				$buildSet->getNumSamples(), $buildSet->getClassVector(),
				$buildSet->getClassOneName(), $buildSet->getClassTwoName());
			$featureName = $buildSet->getProbeName($featureIndex);
			@expRow = $buildSet->getMatrixRow($featureIndex);

lib/Algorithm/TrunkClassifier/Classification.pm

		#Add level to decision trunk
		$decisionTrunk->addLevel($featureName, $lowerDecision, $higherDecision, $lowerClass, $higherClass);
		
		if($noSampleBreak){
			last;
		}
	}
	return $decisionTrunk;
}

#Description: Determine the decision trunk level with highest feature selection stability
#Parameters: (1) Hash reference containing selected features, (2) number of samples in the dataset
#Return value: Number of decision trunk levels to use for classification
sub stabilityCheck($ $){
	my ($hashRef, $numSamples) = @_;
	my %featOccurrence = %{$hashRef};
	my $numThresh = 6;
	my $chosenLevel = 0;
	foreach my $levelIndex (1 .. 5){
		if(!$featOccurrence{$levelIndex}){
			next;
		}
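
The excerpt cuts off before the selection logic itself. As an illustration only (this
is one plausible stability criterion, not necessarily the one the module implements),
a level could be scored by how often its most frequently selected feature recurs
across the LOOCV folds:

    # Hypothetical sketch: choose the level whose most frequently selected
    # feature appears in the largest fraction of LOOCV folds.
    sub pickStableLevel {
        my ($featOccurrence, $numSamples) = @_;
        my ($chosenLevel, $bestScore) = (0, 0);
        foreach my $level (1 .. 5) {
            next unless $featOccurrence->{$level};
            my ($top) = sort { $b <=> $a } values %{$featOccurrence->{$level}};
            ($chosenLevel, $bestScore) = ($level, $top / $numSamples)
                if $top / $numSamples > $bestScore;
        }
        return $chosenLevel;
    }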

lib/Algorithm/TrunkClassifier/CommandProcessor.pm

Options
    -p, --procedure     Classification procedure to use [loocv|split|dual]
    -e, --split         Percentage of samples to use as test set when using -p split
    -t, --testset       Dataset to classify when using -p dual
    -c, --classvar      Name of the classification variable to use
    -o, --output        Name of the output folder
    -l, --levels        Force classifier to use trunks with X levels for classification
    -i, --inspect       Check data file before running [samples|probes|classes]
    -s, --supp          Supplementary file containing class information
    -v, --verbose       Report progress during classifier run
    -u, --useall        Circumvent level selection and use all trunks for classification
    -h, --help          Print command line help

Output
    performance:   Classification accuracy for each LOOCV fold, as well as average accuracy
    loo_trunks:    Structures of leave-one-out decision trunks
    cts_trunks:    Structure of trunks built with complete training set
    class_report:  Classification of all test samples
    log:           Arguments used
END
	die $doc;

pod/TrunkClassifier.pod

=head1 SYNOPSIS

  use Algorithm::TrunkClassifier qw(runClassifier);
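
A call sketch follows (the argument list mirrors the command line documented below;
C<results> here is a hypothetical output folder, and the data files are the ones
bundled under F<t/>):

  runClassifier("-v", "-o", "results", "-s", "test_supp.txt", "test_data.txt");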

=head1 DESCRIPTION

This module contains the implementation of the Decision Trunk Classifier. The algorithm
can be used to perform binary classification on numeric data, e.g. the result of a
gene expression profiling experiment. Classification is based on so-called decision
trunks, which consist of a sequence of decision levels, represented as nodes in the
trunk. For each decision level, a probe is selected from the input data, and two decision
thresholds are calculated. These thresholds are associated with two outgoing edges from
the decision level. One edge represents the first class and the other edge represents
the second class.

During classification, the decision levels of a trunk are considered one at a time. To
classify a sample, its expression value for the probe at the decision level is compared
to the thresholds of the outgoing edges. If the expression is less than the first
threshold, class1 is assigned to the sample. If, on the other hand, the expression is
greater than the second threshold, class2 is assigned to the sample. If the expression
falls in-between the thresholds, the algorithm proceeds to the next decision level of
the trunk.
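
A minimal sketch of this decision rule in Perl (the trunk layout and field names here
are illustrative assumptions, not the module's actual internals):

  # Walk the trunk one level at a time; each level carries a probe name,
  # two thresholds, and the class label attached to each outgoing edge.
  sub classifySample {
      my ($trunk, $expression) = @_;
      foreach my $level (@{$trunk}) {
          my $value = $expression->{ $level->{probe} };
          return $level->{class1} if $value <= $level->{threshold1};
          return $level->{class2} if $value >  $level->{threshold2};
          # in-between: fall through to the next decision level
      }
      return undef;    # sketch fallback; in the module the last level decides
  }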

pod/TrunkClassifier.pod


The value should be the name of the classification variable to use. Default is TISSUE.

=item C<-o value>

The value should be the name of the output folder, which is created in the current
directory if it does not exist. Default is the current directory.

=item C<-l value>

By default, the algorithm selects the number of decision levels to use for
classification. To override this, supply the -l option and an integer from 1 to 5. This
will force the algorithm to use that number of decision levels.
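
For example, to force trunks with three decision levels (using the bundled test data):

  perl run_classifier.pl -l 3 -s test_supp.txt test_data.txt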

=item C<-i value>

This option can be used to inspect the dataset without running the classifier.
The option takes one of three possible values: C<samples>, C<probes> or C<classes>.

samples: prints the number of samples in each class for the classification variable
probes:  prints the number of probes in the dataset

pod/TrunkClassifier.pod


Note: If the C<-p dual> option is used, two datasets must be supplied. In this case the
supplementary file needs to contain the class information of all samples in both datasets.

=item C<-v>

This option makes the algorithm report its progress to the terminal during a run.

=item C<-u>

This option circumvents selection of decision levels and makes the algorithm use trunks
with 1, 2, 3, 4 and 5 decision levels during classification.

=item C<-h>

This option causes argument documentation to be printed to the terminal.

=back

=head2 OUTPUT

The algorithm produces five files as output: F<performance.txt>, F<loo_trunks.txt>,
F<cts_trunks.txt>, F<class_report.txt> and F<log.txt>. The classification accuracy
can be found in F<performance.txt>. In case of leave-one-out cross-validation, the
accuracy for each fold is reported along with the average accuracy across all folds.
Since the test set consists of a single sample, the accuracy of one LOOCV fold is either
0 % (wrong) or 100 % (correct). For split-sample and dual-dataset classification, only
the average accuracy is reported since there is only one test set.

The F<loo_trunks.txt> file contains the decision trunks resulting from leave-one-out
training on the training set. Since the training set is different in each fold,
different probes may be selected in the trunks. The decision levels of a trunk are shown
in order starting with the first level at the top. Each level consists of two rows:
the first row shows the name of the probe and the second row contains the decision
thresholds and the associated class labels. An illustration of a decision trunk with
three levels is shown here

              Probe X
  <= A (class1)     > B (class2)
  
              Probe Y
  <= C (class1)     > D (class2)

pod/TrunkClassifier.pod

of 0 and standard deviation of 0.5 (normal distribution) for all genes, while the remaining
100 samples (malignant) have a mean of 1 and standard deviation of 0.5. F<test_supp.txt>
is a supplementary file containing the class information associated with the random dataset.
To run the algorithm with this dataset, use the following command.

C<perl run_classifier.pl -v -o test_set_tissue -s test_supp.txt test_data.txt>

Since a supplementary file is given, a new data file with class information will be
written. Following this, the algorithm will build decision trunks and determine how many
decision levels to use for classification. Finally, LOOCV will be performed using the
selected trunks and output written. If no classification variable is explicitly given,
the algorithm will default to TISSUE. For the random dataset, this variable states if the
sample comes from healthy tissue or from a tumor. The supplementary file labels healthy
samples as T_HEALTHY and tumor samples as T_MALIGN. By looking in the supplementary file
it can also be seen that the random dataset comes with a second classification variable:
GRADE. This variable states whether a tumor sample comes from a low- or high-grade tumor.
This is indicated by G_LOW and G_HIGH. Since the healthy samples do not come from tumors,
they do not have GRADE classes. To indicate this, #NA is used. The #NA symbol is
interpreted by the algorithm as a null class, causing the sample to be excluded if GRADE
is given as the classification variable. To test this, use the following command.

C<perl run_classifier.pl -v -c GRADE -o test_set_stage -s test_supp.txt test_data.txt>

By comparing the output files, differences can be seen in how many folds of LOOCV have
been carried out, and in which probes were selected for the decision trunks. The log
file will also reflect that a different classification variable was used. Accuracy will
be good when classifying TISSUE, because the healthy and tumor samples have sufficiently
different gene expression values. For GRADE, however, all tumor samples have the same mean
and standard deviation, so the algorithm is not able to separate them.

=head2 WARNINGS AND ERROR MESSAGES

If an invalid argument is given, or if there is something wrong with the input data file
or supplementary file, the algorithm will output a warning or error message. Warnings
will not prevent the algorithm from running, but errors will. Here is a list of all


