Algorithm-TrunkClassifier
view release on metacpan or search on metacpan
lib/Algorithm/TrunkClassifier/DataWrapper.pm view on Meta::CPAN
package Algorithm::TrunkClassifier::DataWrapper;
use warnings;
use strict;
use POSIX;
our $VERSION = "v1.0.1";
my $NULL_CLASS = "#NA";
my $PROSPECT_SAMPLES = "samples";
my $PROSPECT_PROBES = "probes";
my $PROSPECT_CLASSES = "classes";
#Description: TrunkClassifier::DataWrapper constructor
#Parameters: (1) TrunkClassifier::DataWrapper, (2) classification variable name, (3) prospect flag,
# (4) supplementary file name, (5) input data file name, (6) verbose flag, (7) dataset type
#Return value: TrunkClassifier::DataWrapper object
sub new{
my ($class, $className, $prospect, $suppFileName, $dataFileName, $VERBOSE, $datasetType) = @_;
my $self = {
"colnames" => "",
"rownames" => "",
"data_matrix" => "",
"class_vector" => "",
"class_one" => "",
"class_two" => ""
};
bless($self, $class);
if(scalar(@_) == 1){
return $self;
}
#If supplementary file is given, write new input data file with meta data
if($suppFileName){
$dataFileName = readSuppFile($suppFileName, $dataFileName, $VERBOSE, $datasetType);
}
#Read input data file
readExpData($self, $className, $prospect, $dataFileName, $datasetType);
return $self;
}
#Description: Reads the supplementary file and writes new input data file with meta data
#Parameters: (1) supplementary file name, (2) input data file name, (3) dataset type
#Return value: New input data file name
sub readSuppFile($ $ $ $){
my ($suppFileName, $dataFileName, $VERBOSE, $datasetType) = @_;
#Read supplementary file
open(SUPP_FILE, $suppFileName) or die "Error: Unable to open supplementary file '$suppFileName'\n";
my @suppFile = <SUPP_FILE>;
my $content = join("", @suppFile);
$content =~ s/\r|\n\r|\r\n/\n/g;
@suppFile = split(/\n+/, $content);
close(SUPP_FILE);
#Extract classification variable names
my @classNames = split(/\t/, shift(@suppFile));
my $numCols = scalar(@classNames);
shift(@classNames);
if($numCols < 2){
warn "Warning: No classification variable names found in supplementary file\n";
return $dataFileName;
}
foreach my $className (@classNames){
$className = uc($className);
}
my %classes;
foreach my $classVar (@classNames){
$classes{$classVar} = {};
}
#Determine classes of each classification variable and assign classes to samples
my %sampleClasses;
for(my $lineIndex = 0; $lineIndex < scalar(@suppFile); $lineIndex++){
if($suppFile[$lineIndex] =~ /^\s*$/){
lib/Algorithm/TrunkClassifier/DataWrapper.pm view on Meta::CPAN
my $lineWarn = $classVarIndex + 1;
warn "Warning: Missing class in supplmentary file at line $lineWarn, replacing with #NA\n";
$class = $NULL_CLASS;
}
$sampleClasses{$sampleName}[$classVarIndex] = $class;
if($class ne $NULL_CLASS && !$classes{$classNames[$classVarIndex]}{$class}){
$classes{$classNames[$classVarIndex]}{$class} = 1;
}
}
}
foreach my $classVar (keys(%classes)){
if(scalar(keys(%{$classes{$classVar}})) != 2){
die "Error: Class variable $classVar in supplementary file does not have two classes\n";
}
}
if(!%sampleClasses){
warn "Warning: No sample classes found in supplementary file\n";
return $dataFileName;
}
#Read input data file and write new data file with classification info
open(DATA_FILE, $dataFileName) or die "Unable to open $datasetType '$dataFileName'\n";
my @dataFile = <DATA_FILE>;
close(DATA_FILE);
my @header = split(/\t/, $dataFile[0]);
shift(@header);
chomp(@header);
foreach my $sampleName (@header){
$sampleName = uc($sampleName);
$sampleName =~ s/\n|\r//g;
}
$dataFileName =~ s/\.[^.]+$/_wmeta.txt/;
if($VERBOSE){
print("Trunk classifier: Supplementary file supplied, writing new $datasetType with meta data\n");
}
open(DATA_FILE, ">$dataFileName") or die "Unable to create new data file '$dataFileName'\n";
my $meta = "";
for(my $classVarIndex = 0; $classVarIndex < scalar(@classNames); $classVarIndex++){
my $className = $classNames[$classVarIndex];
my @classKeys = keys(%{$classes{$className}});
$meta .= "#CLASSVAR $className @classKeys\n";
$meta .= "#CLASSMEM $className";
foreach my $sampleName (@header){
if(!$sampleClasses{$sampleName}[$classVarIndex]){
warn "Warning: Sample '$sampleName' has no '$className' class in supplementary file\n";
$meta .= " " . "#NA";
}
else{
$meta .= " " . $sampleClasses{$sampleName}[$classVarIndex];
}
}
$meta .= "\n";
}
print(DATA_FILE $meta . join("", @dataFile));
close(DATA_FILE);
return $dataFileName;
}
#Description: Reads input data file with expression values and meta data
#Parameters: (1) TrunkClassifier::DataWrapper object, (2) classification variable name
# (3) prospect flag, (4) input data file name, (5) dataset type
#Return value: None
sub readExpData($ $ $ $ $){
my ($self, $className, $prospect, $dataFileName, $datasetType) = @_;
$className = uc($className);
#Read input data file
if(!open(DATA_FILE, $dataFileName)){
die "Error: Unable to open $datasetType '$dataFileName'\n";
}
my @dataFile = <DATA_FILE>;
close(DATA_FILE);
my $content = join("", @dataFile);
$content =~ s/\r|\n\r|\r\n/\n/g;
@dataFile = split(/\n+/, $content);
foreach my $row (@dataFile){
$row =~ s/\n//g;
}
#Extract meta data rows
my @metarows;
my $rowcounter = 0;
while($rowcounter < scalar(@dataFile)){
$dataFile[$rowcounter] =~ s/^\s+//;
if($dataFile[$rowcounter] =~ /^\s*$/){
shift(@dataFile);
}
elsif($dataFile[$rowcounter] =~ /^#/){
push(@metarows, shift(@dataFile));
}
else{
$rowcounter++;
}
}
#Extract samples
my @samples = split(/\t/, shift(@dataFile));
shift(@samples);
my $totNumSamples = scalar(@samples);
if(!$totNumSamples){
die "Error: No samples in $datasetType\n";
}
#Check that class variable exists and that all samples have valid class membership
my %classes;
my %membership;
foreach my $row (@metarows){
if($row =~ /^#CLASSVAR/){
my @cols = split(/\s+/, $row);
shift(@cols);
if(!$cols[0]){
warn "Warning: CLASSVAR name missing in meta data of $datasetType\n";
next;
}
if(!$cols[1] || !$cols[2]){
warn "Warning: CLASSVAR class labels for '$cols[0]' missing in meta data of $datasetType\n";
next;
}
if($cols[1] eq $NULL_CLASS || $cols[2] eq $NULL_CLASS){
die "Error: CLASSVAR class label equals NULL CLASS in $datasetType\n";
lib/Algorithm/TrunkClassifier/DataWrapper.pm view on Meta::CPAN
my $classTwoCount = 0;
for(my $sampleIndex = 0; $sampleIndex < $totNumSamples; $sampleIndex++){
if($classVector[$sampleIndex] eq $classOne){
$classOneCount++;
push(@includedInd, $sampleIndex);
}
elsif($classVector[$sampleIndex] eq $classTwo){
$classTwoCount++;
push(@includedInd, $sampleIndex);
}
}
if(!$classOneCount){
die "Error: Class '$classOne' for classification variable '$className' has zero members in $datasetType\n";
}
if(!$classTwoCount){
die "Error: Class '$classTwo' for classification variable '$className' has zero members in $datasetType\n";
}
my $numIncInd = scalar(@includedInd);
#Check for sample duplicates
for(my $outer = 0; $outer < $totNumSamples - 1; $outer++){
for(my $inner = $outer + 1; $inner < $totNumSamples; $inner++){
if($samples[$outer] eq $samples[$inner]){
warn "Warning: Duplicate sample name '$samples[$outer]' at positions ", $outer + 1, " and ", $inner + 1, " in $datasetType\n";
}
}
}
#Initialise Algorithm::TrunkClassifier::DataWrapper object
my @incSampleNames;
my @incClassVector;
my @probeNames;
my @dataMatrix;
foreach my $index (@includedInd){
push(@incSampleNames, $samples[$index]);
push(@incClassVector, $classVector[$index]);
}
for(my $rowIndex = 0; $rowIndex < scalar(@dataFile); $rowIndex++){
$dataFile[$rowIndex] =~ s/,/./g;
my @cols = split(/\t/, $dataFile[$rowIndex]);
if(scalar(@cols) != $totNumSamples + 1){
die "Error: Wrong number of columns in $datasetType at probe ", $rowIndex + 1, "\n";
}
my $probe = "$rowIndex:" . shift(@cols);
push(@probeNames, $probe);
my @includedCols;
foreach my $index (@includedInd){
$cols[$index] =~ s/\s+//g;
if($cols[$index] !~ /^-?[0-9]+(\.[0-9]+)?([Ee][+\-]?[0-9]+)?$/){
warn "Warning: Missing/invalid value '$cols[$index]' in $datasetType at probe ", $rowIndex + 1, "\n";
$cols[$index] =~ s/[^0-9]+//g;
}
if($cols[$index] !~ /\./){
$cols[$index] .= ".0";
}
push(@includedCols, $cols[$index] + 0);
}
push(@dataMatrix, \@includedCols);
}
#Check prospect flag
if($prospect){
if($prospect eq $PROSPECT_SAMPLES){
die "Number of samples with $className class\n$classOne: $classOneCount\n$classTwo: $classTwoCount\n";
}
elsif($prospect eq $PROSPECT_PROBES){
my $numProbes = scalar(@dataMatrix);
die "Number of probes in dataset: $numProbes\n";
}
elsif($prospect eq $PROSPECT_CLASSES){
my @classKeys = keys(%classes);
die "Classes in the dataset: @classKeys\n";
}
}
$self->{"colnames"} = \@incSampleNames;
$self->{"rownames"} = \@probeNames;
$self->{"data_matrix"} = \@dataMatrix;
$self->{"class_vector"} = \@incClassVector;
$self->{"class_one"} = $classOne;
$self->{"class_two"} = $classTwo;
}
#Description: Returns the number of samples in the dataset
#Parameters: (1) TrunkClassifier::DataWrapper object
#Return value: Number of elements in "colnames" attribute
sub getNumSamples($){
my $self = shift(@_);
return scalar(@{$self->{"colnames"}});
}
#Description: Returns the number of probes in the dataset
#Parameters: (1) TrunkClassifier::DataWrapper object
#Return value: Number of rows in "rownames" array
sub getNumProbes($){
my $self = shift(@_);
return scalar(@{$self->{"rownames"}});
}
#Description: Returns the row names of the DataWrapper object
#Parameters: (1) TrunkClassifier::DataWrapper object
#Return value: Array of row names
sub getProbeList($){
my $self = shift(@_);
return @{$self->{"rownames"}};
}
#Description: Returns a reference to the data matrix
#Parameters: (1) TrunkClassifier::DataWrapper object
#Return value: Array reference
sub getDataMatrix($){
my $self = shift(@_);
return $self->{"data_matrix"};
}
#Description: Returns a reference to the class vector
#Parameters: (1) TrunkClassifier::DataWrapper object
#Return value: Array reference
sub getClassVector($){
my $self = shift(@_);
return $self->{"class_vector"};
}
( run in 1.386 second using v1.01-cache-2.11-cpan-f5b5a18a01a )