Algorithm-TrunkClassifier

 view release on metacpan or  search on metacpan

lib/Algorithm/TrunkClassifier/DataWrapper.pm  view on Meta::CPAN

		if($dataFile[$rowcounter] =~ /^\s*$/){
			shift(@dataFile);
		}
		elsif($dataFile[$rowcounter] =~ /^#/){
			push(@metarows, shift(@dataFile));
		}
		else{
			$rowcounter++;
		}
	}
	
	#Extract samples
	my @samples = split(/\t/, shift(@dataFile));
	shift(@samples);
	my $totNumSamples = scalar(@samples);
	if(!$totNumSamples){
		die "Error: No samples in $datasetType\n";
	}
	
	#Check that class variable exists and that all samples have valid class membership
	my %classes;
	my %membership;
	foreach my $row (@metarows){
		if($row =~ /^#CLASSVAR/){
			my @cols = split(/\s+/, $row);
			shift(@cols);
			if(!$cols[0]){
				warn "Warning: CLASSVAR name missing in meta data of $datasetType\n";
				next;
			}
			if(!$cols[1] || !$cols[2]){
				warn "Warning: CLASSVAR class labels for '$cols[0]' missing in meta data of $datasetType\n";
				next;
			}
			if($cols[1] eq $NULL_CLASS || $cols[2] eq $NULL_CLASS){
				die "Error: CLASSVAR class label equals NULL CLASS in $datasetType\n";
			}
			my $classVarName = uc($cols[0]);
			my $class1 = uc($cols[1]);
			my $class2 = uc($cols[2]);
			$classes{$classVarName} = {$class1 => 1, $class2 => 1};
		}
		if($row =~ /^#CLASSMEM/){
			my @cols = split(/\s+/, $row);
			shift(@cols);
			if(!$cols[0]){
				warn "Warning: CLASSMEM name missing in meta data of $datasetType\n";
				next;
			}
			my $classVarName = uc(shift(@cols));
			foreach my $class (@cols){
				$class = uc($class);
			}
			$membership{$classVarName} = \@cols;
		}
	}
	if(!$classes{$className} || !$membership{$className}){
		die "Error: Missing meta data for classification variable '$className' in $datasetType\n";
	}
	if(scalar(@{$membership{$className}}) != $totNumSamples){
		die "Error: CLASSMEM vector for '$className' and sample vector have different lengths in $datasetType\n";
	}
	foreach my $class (@{$membership{$className}}){
		if($class ne $NULL_CLASS && !$classes{$className}{$class}){
			die "Error: Invalid class label in '$className' CLASSMEM vector in $datasetType\n";
		}
	}
	my @classVector = @{$membership{$className}};
	my @classBuffer = sort(keys(%{$classes{$className}}));
	my $classOne = $classBuffer[0];
	my $classTwo = $classBuffer[1];
	
	#Determine what sample indexes to include
	my @includedInd;
	my $classOneCount = 0;
	my $classTwoCount = 0;
	for(my $sampleIndex = 0; $sampleIndex < $totNumSamples; $sampleIndex++){
		if($classVector[$sampleIndex] eq $classOne){
			$classOneCount++;
			push(@includedInd, $sampleIndex);
		}
		elsif($classVector[$sampleIndex] eq $classTwo){
			$classTwoCount++;
			push(@includedInd, $sampleIndex);
		}
	}
	if(!$classOneCount){
		die "Error: Class '$classOne' for classification variable '$className' has zero members in $datasetType\n";
	}
	if(!$classTwoCount){
		die "Error: Class '$classTwo' for classification variable '$className' has zero members in $datasetType\n";
	}
	my $numIncInd = scalar(@includedInd);
	
	#Check for sample duplicates
	for(my $outer = 0; $outer < $totNumSamples - 1; $outer++){
		for(my $inner = $outer + 1; $inner < $totNumSamples; $inner++){
			if($samples[$outer] eq $samples[$inner]){
				warn "Warning: Duplicate sample name '$samples[$outer]' at positions ", $outer + 1, " and ", $inner + 1, " in $datasetType\n";
			}
		}
	}
	
	#Initialise Algorithm::TrunkClassifier::DataWrapper object
	my @incSampleNames;
	my @incClassVector;
	my @probeNames;
	my @dataMatrix;
	foreach my $index (@includedInd){
		push(@incSampleNames, $samples[$index]);
		push(@incClassVector, $classVector[$index]);
	}
	for(my $rowIndex = 0; $rowIndex < scalar(@dataFile); $rowIndex++){
		$dataFile[$rowIndex] =~ s/,/./g;
		my @cols = split(/\t/, $dataFile[$rowIndex]);
		if(scalar(@cols) != $totNumSamples + 1){
			die "Error: Wrong number of columns in $datasetType at probe ", $rowIndex + 1, "\n";
		}
		my $probe = "$rowIndex:" . shift(@cols);
		push(@probeNames, $probe);
		my @includedCols;



( run in 1.431 second using v1.01-cache-2.11-cpan-0bb4e1dffa6 )