Algorithm-TrunkClassifier

 view release on metacpan or  search on metacpan

lib/Algorithm/TrunkClassifier/Classification.pm  view on Meta::CPAN

package Algorithm::TrunkClassifier::Classification;

use warnings;
use strict;

use Algorithm::TrunkClassifier::DataWrapper;
use Algorithm::TrunkClassifier::FeatureSelection;
use Algorithm::TrunkClassifier::DecisionTrunk;
use Algorithm::TrunkClassifier::Util;
use POSIX;

our $VERSION = "v1.0.1";

#Description: Function responsible for building decision trunks and classifying test samples using LOOCV
#Parameters: (1) Package, (2) input dataset, (3) test dataset, (4) classification procedure, (5) split percent,
#            (6) testset data file name, (7) classification variable name, (8) output folder name,
#            (9) number of levels, (10) verbose flag, (11) input data file name (12) useall flag
#Return value: None
sub trainAndClassify($ $ $ $ $ $ $ $ $ $ $ $ $){
	shift(@_);
	my ($dataWrapper, $testset, $CLASSIFY, $SPLITPERCENT, $TESTFILE, $CLASSNAME, $OUTPUT, $LEVELS, $VERBOSE, $DATAFILE, $USEALL) = @_;
	
	#Create output files
	if(!-e $OUTPUT && $OUTPUT ne "."){
		system("mkdir $OUTPUT");
	}
	open(PERFORMANCE, ">$OUTPUT/performance.txt") or die "Error: Unable to create output file\n";
	open(LOO_TRUNKS, ">$OUTPUT/loo_trunks.txt") or die "Error: Unable to create output file\n";
	open(CTS_TRUNKS, ">$OUTPUT/cts_trunks.txt") or die "Error: Unable to create output file\n";
	open(REPORT, ">$OUTPUT/class_report.txt") or die "Error: Unable to create output file\n";
	open(LOG, ">$OUTPUT/log.txt") or die "Error: Unable to create output file\n";
	
	#Establish training and test set
	my $trainingSet;
	my $testSet;
	if($CLASSIFY eq "loocv"){
		$trainingSet = $dataWrapper->copy();
	}
	elsif($CLASSIFY eq "split"){
		my $containsBoth = 0;
		while(!$containsBoth){
			$trainingSet = $dataWrapper->copy();
			$testSet = $trainingSet->splitSamples($SPLITPERCENT);
			my $class1 = $trainingSet->getClassOneName();
			my $class2 = $trainingSet->getClassTwoName();
			if($trainingSet->getClassSize($class1) && $trainingSet->getClassSize($class2)){
				$containsBoth = 1;
			}
		}
	}
	elsif($CLASSIFY eq "dual"){
		$trainingSet = $dataWrapper->copy();
		$testSet = $testset->copy();
	}
	
	#Build trunks using leave-one-out
	my %featureOccurrence;
	my %selectedFeatures;
	my %looTrunks = ("1" => [], "2" => [], "3" => [], "4" => [], "5" => []);
	my $levelBreak = 0;
	for(my $levelLimit = 1; $levelLimit <= 5; $levelLimit++){
		if($VERBOSE){
			print("Trunk classifier: Building decision trunks with $levelLimit level(s) using leave-one-out\n");
		}
		
		#Build one trunk for each left out sample
		for(my $sampleIndex = 0; $sampleIndex < $trainingSet->getNumSamples(); $sampleIndex++){
			if($VERBOSE){
				print("Trunk classifier: Fold ", $sampleIndex + 1, " of ", $dataWrapper->getNumSamples(), "\n");
			}
			my $buildSet = $trainingSet->copy();
			$buildSet->leaveOneOut($sampleIndex);
			my $decisionTrunk = buildTrunk($buildSet, $levelLimit, $sampleIndex, \%featureOccurrence, \%selectedFeatures, \$levelBreak, $VERBOSE);
			
			#Add trunk to hash
			push(@{$looTrunks{$levelLimit}}, $decisionTrunk);
			
		}
		
		if($levelBreak){
			undef $featureOccurrence{$levelLimit};
			$looTrunks{$levelLimit} = [];
			last;
		}
	}
	
	#Build trunks using complete training set
	my %ctsTrunks = ("1" => 0, "2" => 0, "3" => 0, "4" => 0, "5" => 0);
	my %selFeats;
	my %dummyHash;
	$levelBreak = 0;
	if($VERBOSE){
			print("Trunk classifier: Building decision trunks using complete training set\n");
		}
	for(my $levelLimit = 1; $levelLimit <= 5; $levelLimit++){
		my $buildSet = $trainingSet->copy();
		my $decisionTrunk = buildTrunk($buildSet, $levelLimit, 0, \%dummyHash, \%selFeats, \$levelBreak, $VERBOSE);
		
		#Add trunk to hash
		$ctsTrunks{$levelLimit} = $decisionTrunk;
		
		if($levelBreak){
			last;
		}
	}
	
	#Determine number of levels to use for classification
	my @numTrunkLevels;
	my $trunkType = "";
	if($CLASSIFY eq "loocv"){
		$trunkType = "LOO";
	}
	elsif($CLASSIFY eq "split" || $CLASSIFY eq "dual"){
		$trunkType = "CTS"
	}
	if(!$USEALL && $LEVELS){
		if(!@{$looTrunks{$LEVELS}}){
			for(my $levelIndex = $LEVELS - 1; $levelIndex > 0; $levelIndex--){
				if(@{$looTrunks{$LEVELS}}){
					push(@numTrunkLevels, $levelIndex);
					warn "Warning: Supplied level is to high, using $trunkType trunks with $levelIndex level(s) instead\n";
				}
			}
		}
		else{
			push(@numTrunkLevels, $LEVELS);
			if($VERBOSE){
				print("Trunk classifier: Using $trunkType trunks with $numTrunkLevels[0] level(s) (forced)\n");
			}
		}
	}
	elsif(!$USEALL){
		push(@numTrunkLevels, stabilityCheck(\%featureOccurrence, $dataWrapper->getNumSamples()));

lib/Algorithm/TrunkClassifier/Classification.pm  view on Meta::CPAN

		print(LOO_TRUNKS ">Trunks with $numLevels level(s)\n\n");
		foreach my $trunk (@{$looTrunks{$numLevels}}){
			$trunkCount++;
			print(LOO_TRUNKS ">Trunk $trunkCount\n", $trunk->report());
		}
		print(CTS_TRUNKS ">Trunk with $numLevels level(s)\n\n");
		print(CTS_TRUNKS $ctsTrunks{$numLevels}->report());
	}
	if($USEALL){
		$numTrunkLevels[0] = "USEALL";
	}
	print(PERFORMANCE join("\n", @performance));
	print(REPORT join("\n", @classReport));
	if($CLASSIFY ne "dual"){
		$TESTFILE = "NA";
	}
	if($CLASSIFY ne "split"){
		$SPLITPERCENT = "NA";
	}
	my $name1 = $dataWrapper->getClassOneName();
	my $name2 = $dataWrapper->getClassTwoName();
	my $log = "Trunk classifier log\n";
	$log .= "Input data file: $DATAFILE\n";
	$log .= "Testset data file: $TESTFILE\n";
	$log .= "Procedure: $CLASSIFY\n";
	$log .= "Split percent: $SPLITPERCENT\n";
	$log .= "Number of levels: $numTrunkLevels[0]\n";
	$log .= "Classification variable: $CLASSNAME\n";
	$log .= "Training set classes:\n";
	if($CLASSIFY eq "loocv"){
		$log .= "\tClass one size: " . $dataWrapper->getClassSize($name1) . " ($name1)\n";
		$log .= "\tClass two size: " . $dataWrapper->getClassSize($name2) . " ($name2)\n";
	}
	else{
		$log .= "\tClass one size: " . $trainingSet->getClassSize($name1) . " ($name1)\n";
		$log .= "\tClass two size: " . $trainingSet->getClassSize($name2) . " ($name2)\n";
	}
	$log .= "Test set classes:\n";
	if($CLASSIFY eq "loocv"){
		$log .= "\tClass one size: NA\n";
		$log .= "\tClass two size: NA\n";
	}
	else{
		$log .= "\tClass one size: " . $testSet->getClassSize($name1) . " ($name1)\n";
		$log .= "\tClass two size: " . $testSet->getClassSize($name2) . " ($name2)\n";
	}
	$log .= "Version: $VERSION";
	print(LOG $log);
	close(PERFORMANCE);
	close(LOO_TRUNKS);
	close(CTS_TRUNKS);
	close(REPORT);
	close(LOG);
	if($VERBOSE){
		print("Trunk classifier: Job finished\n");
	}
}

#Description: Wrapper for the trunk build loop
#Parameters: (1) Training dataset, (2) level limit, (3) sample index, (4) feature occurrence hash ref,
#            (5) selected features hash ref, (6) level break flag ref, (7) verbose flag
#Return value: Decision trunk object
sub buildTrunk($ $ $ $ $ $ $){
	my ($buildSet, $levelLimit, $sampleIndex, $featOccurRef, $selFeatRef, $levelBreakRef, $VERBOSE) = @_;
	
	#Trunk build loop
	my $decisionTrunk = Algorithm::TrunkClassifier::DecisionTrunk->new();
	my $noSampleBreak = 0;
	for(my $levelIndex = 1; $levelIndex <= $levelLimit; $levelIndex++){
	
		#Perform feature selection
		my $featureName;
		my $featureIndex;
		my @expRow;
		if(!$selFeatRef->{$sampleIndex}{$levelIndex}){
			$featureIndex = Algorithm::TrunkClassifier::FeatureSelection::indTTest(
				$buildSet->getDataMatrix(), $buildSet->getNumProbes(),
				$buildSet->getNumSamples(), $buildSet->getClassVector(),
				$buildSet->getClassOneName(), $buildSet->getClassTwoName());
			$featureName = $buildSet->getProbeName($featureIndex);
			@expRow = $buildSet->getMatrixRow($featureIndex);
			my @savedRow = $buildSet->getMatrixRow($featureIndex);
			$buildSet->removeProbe($featureIndex);
			$selFeatRef->{$sampleIndex}{$levelIndex} = {"feature" => $featureName, "index" => $featureIndex, "row" => \@savedRow};
			if(!$featOccurRef->{$levelIndex}{$featureName}){
				$featOccurRef->{$levelIndex}{$featureName} = 1;
			}
			else{
				$featOccurRef->{$levelIndex}{$featureName}++;
			}
		}
		else{
			$featureName = $selFeatRef->{$sampleIndex}{$levelIndex}{"feature"};
			$featureIndex = $selFeatRef->{$sampleIndex}{$levelIndex}{"index"};
			@expRow = @{$selFeatRef->{$sampleIndex}{$levelIndex}{"row"}};
			$buildSet->removeProbe($featureIndex);
		}
		
		#Initialise variables
		my @expBuffer = @expRow;
		my @classSetInd = (0 .. ($buildSet->getNumSamples() - 1));
		my @classVector = @{$buildSet->getClassVector()};
		my $numSamples = $buildSet->getNumSamples();
		Algorithm::TrunkClassifier::Util::dataSort(\@expRow, \@classVector);
		Algorithm::TrunkClassifier::Util::dataSort(\@expBuffer, \@classSetInd);
		
		#Determine quartile thresholds
		my $quantStep = $numSamples / 4;
		my $lowerThresh;
		my $higherThresh;
		my $lowFloor = floor($quantStep);
		$lowerThresh = ($expRow[$lowFloor] + $expRow[$lowFloor+1]) / 2;
		my $highFloor = floor($quantStep * 3);
		if(!$expRow[$highFloor+1]){
			$higherThresh = $expRow[$highFloor];
		}
		else{
			$higherThresh = ($expRow[$highFloor] + $expRow[$highFloor+1]) / 2;
		}
		
		#Determine low and high class
		my $lowerClass = "";
		my $higherClass = "";
		if($classVector[0] eq $buildSet->getClassOneName()){
			$lowerClass = $buildSet->getClassOneName();
			$higherClass = $buildSet->getClassTwoName();
		}
		elsif($classVector[0] eq $buildSet->getClassTwoName()){
			$lowerClass = $buildSet->getClassTwoName();
			$higherClass = $buildSet->getClassOneName();
		}

lib/Algorithm/TrunkClassifier/Classification.pm  view on Meta::CPAN

					last;
				}
			}
			$higherDecision = $decisionBuffer;
		}
		else{
			#Do not use quartiles at last level of trunk
			$decisionBuffer = "";
			for(my $classSample = 0; $buildSet->getNumSamples(); $classSample++){
				if($classVector[$classSample] ne $lowerClass){
					$decisionBuffer = ($expRow[$classSample - 1] + $expRow[$classSample]) / 2;
					last;
				}
			}
			$lowerDecision = $decisionBuffer;
			for(my $classSample = $numSamples - 1; $classSample >= 0; $classSample--){
				if($classVector[$classSample] ne $higherClass){
					if($classSample != $numSamples - 1){
						$decisionBuffer = ($expRow[$classSample] + $expRow[$classSample + 1]) / 2;
					}
					else{
						$decisionBuffer = $expRow[$classSample];
					}
					last;
				}
			}
			$higherDecision = $decisionBuffer;
			$lowerDecision = ($lowerDecision + $higherDecision) / 2;
			$higherDecision = $lowerDecision;
		}
		
		#Remove samples in quartiles
		@indToRemove = sort {$b <=> $a} @indToRemove;
		foreach my $index (@indToRemove){
			$buildSet->removeSample($index);
		}
		
		#Check that there are at least one sample left in each class and at least four samples left in total
		my $classOneSize = $buildSet->getClassSize($buildSet->getClassOneName());
		my $classTwoSize = $buildSet->getClassSize($buildSet->getClassTwoName());
		if($levelIndex < $levelLimit && ($classOneSize < 1 || $classTwoSize < 1 || $classOneSize + $classTwoSize < 4)){
			if($VERBOSE){
				print("Trunk classifier: Not enough samples, stopping at level $levelIndex of $levelLimit\n");
			}
			$noSampleBreak = 1;
			${$levelBreakRef} = 1;
			$lowerDecision = ($lowerDecision + $higherDecision) / 2;
			$higherDecision = $lowerDecision;
		}
		
		#Add level to decision trunk
		$decisionTrunk->addLevel($featureName, $lowerDecision, $higherDecision, $lowerClass, $higherClass);
		
		if($noSampleBreak){
			last;
		}
	}
	return $decisionTrunk;
}

#Description: Determine the decision trunk level with highest feature selection stability
#Parameters: (1) Hash reference containing selected features, (2) number of samples in the dataset
#Return value: Number of decision trunk levels to use for classification
sub stabilityCheck($ $){
	my ($hashRef, $numSamples) = @_;
	my %featOccurrence = %{$hashRef};
	my $numThresh = 6;
	my $chosenLevel = 0;
	foreach my $levelIndex (1 .. 5){
		if(!$featOccurrence{$levelIndex}){
			next;
		}
		my %features = %{$featOccurrence{$levelIndex}};
		my $numFeats = scalar(keys(%features));
		if($numFeats > $numThresh){
			next;
		}
		$chosenLevel = $levelIndex;
	}
	if(!$chosenLevel){
		$chosenLevel = 1;
	}
	return $chosenLevel;
}

return 1;



( run in 0.761 second using v1.01-cache-2.11-cpan-efa8479b9fe )