ALBD

 view release on metacpan or  search on metacpan

lib/LiteratureBasedDiscovery/TimeSlicing.pm  view on Meta::CPAN

		$startingMatrix{$key} = ${$explicitMatrixRef}{$key}
	    }
	    $i++;
	}

	#output the cui list if needed
	if (exists ${$lbdOptionsRef}{'cuiListOutputFile'}) {
	    open OUT, ">".${$lbdOptionsRef}{'cuiListOutputFile'} or die ("ERROR: cannot open cuiListOutputFile:".${$lbdOptionsRef}{'cuiListOutputFile'}."\n");
	    foreach my $cui (keys %startingMatrix) {
		print OUT "$cui\n";
	    }
	    close OUT;
	}
    }

    #return the starting matrix
    return \%startingMatrix;
}


# gets and returns a hash of row keys of the specifies semantic types
# input:  $matrixRef <- a ref to a sparse matrix
#         $acceptTypesRef <- a ref to a hash of accept types (TUIs)
#         $umls <- an instance of UMLS::Interface
# output: \%rowsToKeep <- a ref to hash of rows to keep, each key is 
#                         a CUI, and values are 1. All CUIs specify rows
#                         of acceptable semantic types
sub getRowsOfSemanticTypes {
    my $matrixRef = shift;
    my $acceptTypesRef = shift;
    my $umls = shift;
    
    #loop through the matrix and keep the rows that are of the 
    # desired semantic types
    my %rowsToKeep = ();
    foreach my $cui1 (keys %{$matrixRef}) {
	my $typesRef = $umls->getSt($cui1);
	foreach my $type(@{$typesRef}) {
	    my $abr = $umls->getStAbr($type);

	    #check the cui for removal
	    if (exists ${$acceptTypesRef}{$type}) {
		$rowsToKeep{$cui1} = 1;
		last;
	    }
	}
    }

    #return the rowsToKeep
    return \%rowsToKeep
}

# generates a hash of all association scores from the matrix
# the hash keys are $rowKey,$colKey. Hash values are the association scores
# between the $rowKey and $colKey. All co-occurring cui pairs from the matrix
# are calculated
# input:  $matrixRef <- a reference to a sparse matrix
#         $rankingMeasue <- a string specifying the ranking measure to use
#         $umls_association <- an instance of UMLS::Association
# output: \%cuiPairs <- a ref to a hash of CUI pairs and their assocaition
#                       each key of the hash is a comma seperated string 
#                       containing cui1, and cui2 of the pair 
#                       (e.g. 'cui1,cui2'), and each value is their association
#                       score using the specified assocition measure
sub getAssociationScores {
    my $matrixRef = shift;
    my $rankingMeasure = shift;
    my $umls_association = shift;
    print "   getting Association Scores, rankingMeasure = $rankingMeasure\n";
    
    #generate a list of cui pairs in the matrix
    my %cuiPairs = ();
    print "   generating association scores:\n";
    foreach my $rowKey (keys %{$matrixRef}) {
	foreach my $colKey (keys %{${$matrixRef}{$rowKey}}) {
	    $cuiPairs{"$rowKey,$colKey"} = ${${$matrixRef}{$rowKey}}{$colKey};
	}
    }
    
    #get ranks for all the cui pairs in the matrix
    #return a hash of cui pairs and their frequency
    if ($rankingMeasure eq 'frequency') {
	return \%cuiPairs;
    } else {
	#updates values in cuiPairs hash with their association scores and returns
	Rank::getBatchAssociationScores(\%cuiPairs, $matrixRef, $rankingMeasure, $umls_association);
	return \%cuiPairs;
    }
}

# gets the min and max value of a hash
# returns a two element array, where the first value is the min, and
# the second values is the max
# input:  $hashref <- a reference to a hash with numbers as values
# output: ($min, $max) <- the minimum and maximum values in the hash
sub getMinMax {
    my $hashRef = shift;
    
    #loop through each key and record the min/max
    my $min = 999999;
    my $max = -999999;
    foreach my $key (keys %{$hashRef}) {
	my $val = ${$hashRef}{$key};
	if ($val < $min) {
	    $min = $val;
	}
	if ($val > $max) {
	    $max = $val;
	}
    }
    return ($min,$max);
}

# Applies a threshold to a matrix using a corresponding association scores
# hash. Any keys less than the threshold are not copied to the new matrix
# input:  $threshold <- a scalar threshold
#         $assocScoresRef <- a reference to a cui pair hash of association
#                            scores. Each key is a comma seperated cui pair
#                            (e.g. 'cui1,cui2'), values are their association
#                            scores.
#         $matrixRef <- a reference to a co-occurrence sparse matrix that 
#                       corresponds to the assocScoresRef
# output: \%thresholdedMatrix < a ref to a new matrix, built from the 
#         $matrixRef after applying the $threshold
sub applyThreshold {
    my $threshold = shift;
    my $assocScoresRef = shift;
    my $matrixRef = shift;

    #apply the threshold
    my $preKeyCount = scalar keys %{$assocScoresRef};
    my $postKeyCount = 0;
    my %thresholdedMatrix = ();
    my ($cui1, $cui2);
    foreach my $key (keys %{$assocScoresRef}) {

	#add key if val >= threshold
	if (${$assocScoresRef}{$key} >= $threshold) {
	    ($cui1,$cui2) = split(/,/, $key);

	    #create new hash at rowkey location
	    if (!(exists $thresholdedMatrix{$cui1})) {
		my %newHash = ();
		$thresholdedMatrix{$cui1} = \%newHash;
	    }
	    #set key value
	    ${$thresholdedMatrix{$cui1}}{$cui2} = ${${$matrixRef}{$cui1}}{$cui2};
	    $postKeyCount++;
	}
    }

    #return the thresholded matrix
    return \%thresholdedMatrix;
}

# Grabs the K highest ranked samples. This is for thresholding based the number 
# of samples. Used in explicit timeslicing
# input:  $k <- the number of samples to get
#         $assocScoresRef <- a reference to a cui pair hash of association
#                            scores. Each key is a comma seperated cui pair
#                            (e.g. 'cui1,cui2'), values are their association
#                            scores.
#         $matrixRef <- a reference to a co-occurrence sparse matrix that 
#                       corresponds to the assocScoresRef
# output: \%thresholdedMatrix <- a ref to a sparse matrix containing only the 
#                                $k ranked samples (cui pairs)
sub grabKHighestRankedSamples {
    my $k = shift;
    my $assocScoresRef = shift;
    my $matrixRef = shift;
    print "getting $k highest ranked samples\n";

    #apply the threshold
    my $preKeyCount = scalar keys %{$assocScoresRef};
    my $postKeyCount = 0;
    my %thresholdedMatrix = ();

    #get the keys sorted by value in descending order
    my @sortedKeys = sort { $assocScoresRef->{$b} <=> $assocScoresRef->{$a} } keys(%$assocScoresRef);
    my $threshold =  ${$assocScoresRef}{$sortedKeys[$k-1]};
    print " threshold = $threshold\n";

    #add the first k keys to the thresholded matrix
    my ($cui1, $cui2);
    foreach my $key (@sortedKeys) {
	($cui1, $cui2) = split(/,/, $key);

	#create new hash at rowkey location (if needed)
	if (!(exists $thresholdedMatrix{$cui1})) {
	    my %newHash = ();
	    $thresholdedMatrix{$cui1} = \%newHash;
	}

	#set key value for the key pair
	${$thresholdedMatrix{$cui1}}{$cui2} = ${${$matrixRef}{$cui1}}{$cui2};
	$postKeyCount++;

	#stop adding keys when below the threshold
	if (${$assocScoresRef}{$key} < $threshold) {
	    last;
	}
    }
    #return the thresholded matrix
    return \%thresholdedMatrix;
}


# calculates precision and recall at $numIntervals (e.g. 10 for 10%) recall 
# intervals using an implicit ranking threshold
# input:  $trueMatrixRef <- a ref to a hash of true discoveries
#         $rowRanksRef <- a ref to a hash of arrays of ranked predictions. 
#                         Each hash key is a cui,  each hash element is an 
#                         array of ranked predictions for that cui. The ranked 
#                         predictions are cuis are ordered in descending order 
#                         based on association. (from Rank::RankDescending)
#         $numIntervals <- the number of recall intervals to generate
# output: (\%precision, \%recall) <- refs to hashes of precision and recall. 
#                                    Each hash key is the interval number, and 
#                                    the value is the precision and recall 
#                                    respectively



( run in 1.286 second using v1.01-cache-2.11-cpan-f56aa216473 )