Algorithm-TrunkClassifier
view release on metacpan or search on metacpan
lib/Algorithm/TrunkClassifier/Classification.pm view on Meta::CPAN
package Algorithm::TrunkClassifier::Classification;
use warnings;
use strict;
use Algorithm::TrunkClassifier::DataWrapper;
use Algorithm::TrunkClassifier::FeatureSelection;
use Algorithm::TrunkClassifier::DecisionTrunk;
use Algorithm::TrunkClassifier::Util;
use POSIX;
our $VERSION = "v1.0.1";
#Description: Function responsible for building decision trunks using leave-one-out and classifying test
#             samples. The classification procedure argument selects how the training and test sets are
#             established: "loocv", "split" or "dual". Output is written to performance.txt, loo_trunks.txt,
#             cts_trunks.txt, class_report.txt and log.txt in the output folder.
#Parameters: (1) Package, (2) input dataset, (3) test dataset, (4) classification procedure, (5) split percent,
#            (6) testset data file name, (7) classification variable name, (8) output folder name,
#            (9) number of levels, (10) verbose flag, (11) input data file name, (12) useall flag
#Return value: None
#Errors: Dies with "Error: Unable to create output file" if any of the output files cannot be opened
#NOTE(review): The prototype lists 13 scalars while 12 parameters are documented. Perl does not apply
#              prototypes to method calls, so this is harmless if callers use Package->trainAndClassify(...)
#              - confirm against the call site
sub trainAndClassify($ $ $ $ $ $ $ $ $ $ $ $ $){
#Discard the package name, then unpack the remaining arguments
shift(@_);
my ($dataWrapper, $testset, $CLASSIFY, $SPLITPERCENT, $TESTFILE, $CLASSNAME, $OUTPUT, $LEVELS, $VERBOSE, $DATAFILE, $USEALL) = @_;
#Create output files
#The output folder is only created when it does not exist and is not the current directory
#NOTE(review): $OUTPUT is interpolated unquoted into a shell command and the result of mkdir is not
#              checked; folder names with spaces or shell metacharacters will misbehave
if(!-e $OUTPUT && $OUTPUT ne "."){
system("mkdir $OUTPUT");
}
#Each file is opened for writing, replacing any existing content
#NOTE(review): 2-arg open with bareword handles; the die message does not say which file failed or why ($!)
open(PERFORMANCE, ">$OUTPUT/performance.txt") or die "Error: Unable to create output file\n";
open(LOO_TRUNKS, ">$OUTPUT/loo_trunks.txt") or die "Error: Unable to create output file\n";
open(CTS_TRUNKS, ">$OUTPUT/cts_trunks.txt") or die "Error: Unable to create output file\n";
open(REPORT, ">$OUTPUT/class_report.txt") or die "Error: Unable to create output file\n";
open(LOG, ">$OUTPUT/log.txt") or die "Error: Unable to create output file\n";
#Establish training and test set
my $trainingSet;
my $testSet;
#loocv: train on a copy of the input dataset; $testSet is left undefined
if($CLASSIFY eq "loocv"){
$trainingSet = $dataWrapper->copy();
}
#split: start from a fresh copy and split off a test set, repeating until the training set
#has at least one sample of each class
#(presumably splitSamples removes the returned samples from $trainingSet - confirm in DataWrapper)
elsif($CLASSIFY eq "split"){
my $containsBoth = 0;
while(!$containsBoth){
$trainingSet = $dataWrapper->copy();
$testSet = $trainingSet->splitSamples($SPLITPERCENT);
my $class1 = $trainingSet->getClassOneName();
my $class2 = $trainingSet->getClassTwoName();
if($trainingSet->getClassSize($class1) && $trainingSet->getClassSize($class2)){
$containsBoth = 1;
}
}
}
#dual: train on a copy of the input dataset and test on a copy of the separate test dataset
elsif($CLASSIFY eq "dual"){
$trainingSet = $dataWrapper->copy();
$testSet = $testset->copy();
}
#NOTE(review): there is no fallback branch; any other $CLASSIFY value leaves $trainingSet undefined
#              (presumably the caller validates the procedure name - confirm)
#Build trunks using leave-one-out
#%featureOccurrence, %selectedFeatures and $levelBreak are shared across all buildTrunk calls by reference
my %featureOccurrence;
my %selectedFeatures;
#Trunks are collected per number of levels; the keys match the range of the level loop below
my %looTrunks = ("1" => [], "2" => [], "3" => [], "4" => [], "5" => []);
my $levelBreak = 0;
#One pass per trunk size; the upper bound of 5 is hard-coded
for(my $levelLimit = 1; $levelLimit <= 5; $levelLimit++){
if($VERBOSE){
print("Trunk classifier: Building decision trunks with $levelLimit level(s) using leave-one-out\n");
}
#Build one trunk for each left out sample
for(my $sampleIndex = 0; $sampleIndex < $trainingSet->getNumSamples(); $sampleIndex++){
#NOTE(review): the fold total printed here comes from $dataWrapper while the loop bound uses
#              $trainingSet; the two may differ for the "split" procedure - confirm
if($VERBOSE){
print("Trunk classifier: Fold ", $sampleIndex + 1, " of ", $dataWrapper->getNumSamples(), "\n");
}
#Work on a fresh copy of the training set so that leaving a sample out does not affect other folds
my $buildSet = $trainingSet->copy();
$buildSet->leaveOneOut($sampleIndex);
my $decisionTrunk = buildTrunk($buildSet, $levelLimit, $sampleIndex, \%featureOccurrence, \%selectedFeatures, \$levelBreak, $VERBOSE);
#Add trunk to hash
push(@{$looTrunks{$levelLimit}}, $decisionTrunk);
lib/Algorithm/TrunkClassifier/Classification.pm view on Meta::CPAN
#Write the report of every leave-one-out trunk stored for $numLevels, numbered by $trunkCount
print(LOO_TRUNKS ">Trunks with $numLevels level(s)\n\n");
foreach my $trunk (@{$looTrunks{$numLevels}}){
$trunkCount++;
print(LOO_TRUNKS ">Trunk $trunkCount\n", $trunk->report());
}
#Write the report of the trunk stored in %ctsTrunks for the same number of levels
print(CTS_TRUNKS ">Trunk with $numLevels level(s)\n\n");
print(CTS_TRUNKS $ctsTrunks{$numLevels}->report());
}
#With the useall flag set, the log reports "USEALL" instead of a number of levels
if($USEALL){
$numTrunkLevels[0] = "USEALL";
}
#Performance and classification report entries are written joined by newlines
print(PERFORMANCE join("\n", @performance));
print(REPORT join("\n", @classReport));
#Settings that do not apply to the chosen procedure are logged as "NA"
if($CLASSIFY ne "dual"){
$TESTFILE = "NA";
}
if($CLASSIFY ne "split"){
$SPLITPERCENT = "NA";
}
#Class names are taken from the input dataset and used for all class size lookups below
my $name1 = $dataWrapper->getClassOneName();
my $name2 = $dataWrapper->getClassTwoName();
#Assemble the log text
my $log = "Trunk classifier log\n";
$log .= "Input data file: $DATAFILE\n";
$log .= "Testset data file: $TESTFILE\n";
$log .= "Procedure: $CLASSIFY\n";
$log .= "Split percent: $SPLITPERCENT\n";
$log .= "Number of levels: $numTrunkLevels[0]\n";
$log .= "Classification variable: $CLASSNAME\n";
#Training set class sizes: read from the input dataset for loocv, otherwise from the training set
$log .= "Training set classes:\n";
if($CLASSIFY eq "loocv"){
$log .= "\tClass one size: " . $dataWrapper->getClassSize($name1) . " ($name1)\n";
$log .= "\tClass two size: " . $dataWrapper->getClassSize($name2) . " ($name2)\n";
}
else{
$log .= "\tClass one size: " . $trainingSet->getClassSize($name1) . " ($name1)\n";
$log .= "\tClass two size: " . $trainingSet->getClassSize($name2) . " ($name2)\n";
}
#Test set class sizes: loocv has no separate test set, so "NA" is logged
$log .= "Test set classes:\n";
if($CLASSIFY eq "loocv"){
$log .= "\tClass one size: NA\n";
$log .= "\tClass two size: NA\n";
}
else{
$log .= "\tClass one size: " . $testSet->getClassSize($name1) . " ($name1)\n";
$log .= "\tClass two size: " . $testSet->getClassSize($name2) . " ($name2)\n";
}
#The version line is the last line of the log and is written without a trailing newline
$log .= "Version: $VERSION";
print(LOG $log);
#Close all output files
#NOTE(review): the results of close are not checked, so write errors reported at close go unnoticed
close(PERFORMANCE);
close(LOO_TRUNKS);
close(CTS_TRUNKS);
close(REPORT);
close(LOG);
if($VERBOSE){
print("Trunk classifier: Job finished\n");
}
}
#Description: Wrapper for the trunk build loop
#Parameters: (1) Training dataset, (2) level limit, (3) sample index, (4) feature occurrence hash ref,
# (5) selected features hash ref, (6) level break flag ref, (7) verbose flag
#Return value: Decision trunk object
sub buildTrunk($ $ $ $ $ $ $){
my ($buildSet, $levelLimit, $sampleIndex, $featOccurRef, $selFeatRef, $levelBreakRef, $VERBOSE) = @_;
#Trunk build loop
my $decisionTrunk = Algorithm::TrunkClassifier::DecisionTrunk->new();
my $noSampleBreak = 0;
for(my $levelIndex = 1; $levelIndex <= $levelLimit; $levelIndex++){
#Perform feature selection
my $featureName;
my $featureIndex;
my @expRow;
if(!$selFeatRef->{$sampleIndex}{$levelIndex}){
$featureIndex = Algorithm::TrunkClassifier::FeatureSelection::indTTest(
$buildSet->getDataMatrix(), $buildSet->getNumProbes(),
$buildSet->getNumSamples(), $buildSet->getClassVector(),
$buildSet->getClassOneName(), $buildSet->getClassTwoName());
$featureName = $buildSet->getProbeName($featureIndex);
@expRow = $buildSet->getMatrixRow($featureIndex);
my @savedRow = $buildSet->getMatrixRow($featureIndex);
$buildSet->removeProbe($featureIndex);
$selFeatRef->{$sampleIndex}{$levelIndex} = {"feature" => $featureName, "index" => $featureIndex, "row" => \@savedRow};
if(!$featOccurRef->{$levelIndex}{$featureName}){
$featOccurRef->{$levelIndex}{$featureName} = 1;
}
else{
$featOccurRef->{$levelIndex}{$featureName}++;
}
}
else{
$featureName = $selFeatRef->{$sampleIndex}{$levelIndex}{"feature"};
$featureIndex = $selFeatRef->{$sampleIndex}{$levelIndex}{"index"};
@expRow = @{$selFeatRef->{$sampleIndex}{$levelIndex}{"row"}};
$buildSet->removeProbe($featureIndex);
}
#Initialise variables
my @expBuffer = @expRow;
my @classSetInd = (0 .. ($buildSet->getNumSamples() - 1));
my @classVector = @{$buildSet->getClassVector()};
my $numSamples = $buildSet->getNumSamples();
Algorithm::TrunkClassifier::Util::dataSort(\@expRow, \@classVector);
Algorithm::TrunkClassifier::Util::dataSort(\@expBuffer, \@classSetInd);
#Determine quartile thresholds
my $quantStep = $numSamples / 4;
my $lowerThresh;
my $higherThresh;
my $lowFloor = floor($quantStep);
$lowerThresh = ($expRow[$lowFloor] + $expRow[$lowFloor+1]) / 2;
my $highFloor = floor($quantStep * 3);
if(!$expRow[$highFloor+1]){
$higherThresh = $expRow[$highFloor];
}
else{
$higherThresh = ($expRow[$highFloor] + $expRow[$highFloor+1]) / 2;
}
#Determine low and high class
( run in 1.785 second using v1.01-cache-2.11-cpan-f5b5a18a01a )