ALBD

 view release on metacpan or  search on metacpan

lib/LiteratureBasedDiscovery/TimeSlicing.pm  view on Meta::CPAN

		($truePositive/($truePositive+$falsePositive)); 
	} #else precision += 0 ... nothing needs to be done
	if ((scalar keys %{${$trueMatrixRef}{$rowKey}}) > 0) {
	    $recall += 
		($truePositive/
		 (scalar keys %{${$trueMatrixRef}{$rowKey}}));
	} #else recall += 0
    }

    #calculate the averages (divide by the number of rows 
    #    = the number of terms in the post cutoff matrix)
    $precision /= scalar keys %{$trueMatrixRef};
    $recall /= scalar keys %{$trueMatrixRef};

    #return the average precision and recall
    return ($precision, $recall);
}


# loads the post cutoff matrix from file. Only loads rows corresponding
# to rows in the starting matrix ref to save memory, and because those are 
# the only rows that are needed.
# input:  $startingMatrixRef <- a ref to the starting sparse matrix
#         $explicitMatrix Ref <- a ref to the explicit sparse matrix
#         $postCutoffFileName <- the filename to the postCutoffMatrix
# output: \%postCutoffMatrix <- a ref to the postCutoff sparse matrix
sub loadPostCutOffMatrix {
    my $startingMatrixRef = shift;
    my $explicitMatrixRef = shift;
    my $postCutoffFileName = shift;
    print "loading postCutoff Matrix\n";
    
    #open the post cutoff file
    open IN, $postCutoffFileName 
	or die ("ERROR: cannot open post cutoff file: $postCutoffFileName");

    #create hash of cuis to grab
    my %cuisToGrab = ();
    foreach my $rowKey (keys %{$startingMatrixRef}) {
	$cuisToGrab{$rowKey} = 1;
    }

    #read in values of the post cutoff matrix for the start terms
    my %postCutoffMatrix = ();
    my ($cui1, $cui2, $val);
    while (my $line = <IN>) {
	#grab values from the line
	chomp $line;
	($cui1, $cui2, $val) = split(/\t/,$line);

	#see if this line contains a key that should be read in 
	if (exists $cuisToGrab{$cui1}) {

	    #add the value
	    if (!(defined $postCutoffMatrix{$cui1})) {
		my %newHash = ();
		$postCutoffMatrix{$cui1} = \%newHash;
	    }

	    #check to ensure that the column cui is in the 
	    #  vocabulary of the pre-cutoff dataset.
	    #  it is impossible to make predictions of words that
	    #  don't already exist
	    #NOTE: this assumes $explicitMatrixRef is a square 
	    #   matrix (so unordered)
	    if (exists ${$explicitMatrixRef}{$cui2}) {
		${$postCutoffMatrix{$cui1}}{$cui2} = $val;
	    }
	}
    }
    close IN;

    #return the post cutoff matrix
    return \%postCutoffMatrix;
}

#TODO numRows should be read from file and sent with the lbdOptionsRef
# generates a starting matrix of numRows randomly selected terms
# input:  $explicitMatrixRef <- a ref to the explicit sparse matrix
#         $lbdOptionsRef <- the LBD options
#         $startTermAcceptTypesRef <- a reference to an hash of accept 
#                                     types for start terms (TUIs)
#         $numRows <- the number of random rows to load (if random)
#         $umls_interface <- an instance of the UMLS::Interface
# output: \%startingMatrix <- a ref to the starting sparse matrix
sub generateStartingMatrix {
    my $explicitMatrixRef = shift;
    my $lbdOptionsRef = shift;
    my $startTermAcceptTypesRef = shift;
    my $numRows = shift;
    my $umls_interface = shift;

    #generate the starting matrix randomly or from a file
    my %startingMatrix = ();

    #check if a file is defined
    if (exists ${$lbdOptionsRef}{'cuiListFileName'}) {
	#grab the rows defined by the cuiListFile
	my $cuisRef = &loadCUIs(${$lbdOptionsRef}{'cuiListFileName'});
	foreach my $cui (keys %{$cuisRef}) {
	    if(exists ${$explicitMatrixRef}{$cui}) {
		$startingMatrix{$cui} = ${$explicitMatrixRef}{$cui};	
	    }
	    else {
		print STDERR "WARNING: CUI from cuiListFileName is not in explicitMatrix: $cui\n";
	    }
	}
    }
    else {
	#randomly grab rows
	#apply semantic filter to the rows (just retreive appropriate rows)
	my $rowsToKeepRef = getRowsOfSemanticTypes(
	    $explicitMatrixRef, $startTermAcceptTypesRef, $umls_interface);
	((scalar keys %{$rowsToKeepRef}) >= $numRows) or die("ERROR: number of acceptable rows starting terms is less than $numRows\n");

	#randomly select 100 rows (to generate the 'starting matrix')
	#generate random numbers from 0 to number of rows in the explicit matrix
	my %rowNumbers = ();
	while ((scalar keys %rowNumbers) < $numRows) {
	    $rowNumbers{int(rand(scalar keys %{$rowsToKeepRef}))} = 1;
	}

lib/LiteratureBasedDiscovery/TimeSlicing.pm  view on Meta::CPAN

	Rank::getBatchAssociationScores(\%cuiPairs, $matrixRef, $rankingMeasure, $umls_association);
	return \%cuiPairs;
    }
}

# gets the min and max value of a hash
# returns a two element array, where the first value is the min, and
# the second values is the max
# input:  $hashref <- a reference to a hash with numbers as values
# output: ($min, $max) <- the minimum and maximum values in the hash
sub getMinMax {
    my $hashRef = shift;
    
    #loop through each key and record the min/max
    my $min = 999999;
    my $max = -999999;
    foreach my $key (keys %{$hashRef}) {
	my $val = ${$hashRef}{$key};
	if ($val < $min) {
	    $min = $val;
	}
	if ($val > $max) {
	    $max = $val;
	}
    }
    return ($min,$max);
}

# Applies a threshold to a matrix using a corresponding association scores
# hash. Any keys less than the threshold are not copied to the new matrix
# input:  $threshold <- a scalar threshold
#         $assocScoresRef <- a reference to a cui pair hash of association
#                            scores. Each key is a comma seperated cui pair
#                            (e.g. 'cui1,cui2'), values are their association
#                            scores.
#         $matrixRef <- a reference to a co-occurrence sparse matrix that 
#                       corresponds to the assocScoresRef
# output: \%thresholdedMatrix < a ref to a new matrix, built from the 
#         $matrixRef after applying the $threshold
sub applyThreshold {
    my $threshold = shift;
    my $assocScoresRef = shift;
    my $matrixRef = shift;

    #apply the threshold
    my $preKeyCount = scalar keys %{$assocScoresRef};
    my $postKeyCount = 0;
    my %thresholdedMatrix = ();
    my ($cui1, $cui2);
    foreach my $key (keys %{$assocScoresRef}) {

	#add key if val >= threshold
	if (${$assocScoresRef}{$key} >= $threshold) {
	    ($cui1,$cui2) = split(/,/, $key);

	    #create new hash at rowkey location
	    if (!(exists $thresholdedMatrix{$cui1})) {
		my %newHash = ();
		$thresholdedMatrix{$cui1} = \%newHash;
	    }
	    #set key value
	    ${$thresholdedMatrix{$cui1}}{$cui2} = ${${$matrixRef}{$cui1}}{$cui2};
	    $postKeyCount++;
	}
    }

    #return the thresholded matrix
    return \%thresholdedMatrix;
}

# Grabs the K highest ranked samples. This is for thresholding based the number 
# of samples. Used in explicit timeslicing
# input:  $k <- the number of samples to get
#         $assocScoresRef <- a reference to a cui pair hash of association
#                            scores. Each key is a comma seperated cui pair
#                            (e.g. 'cui1,cui2'), values are their association
#                            scores.
#         $matrixRef <- a reference to a co-occurrence sparse matrix that 
#                       corresponds to the assocScoresRef
# output: \%thresholdedMatrix <- a ref to a sparse matrix containing only the 
#                                $k ranked samples (cui pairs)
sub grabKHighestRankedSamples {
    my $k = shift;
    my $assocScoresRef = shift;
    my $matrixRef = shift;
    print "getting $k highest ranked samples\n";

    #apply the threshold
    my $preKeyCount = scalar keys %{$assocScoresRef};
    my $postKeyCount = 0;
    my %thresholdedMatrix = ();

    #get the keys sorted by value in descending order
    my @sortedKeys = sort { $assocScoresRef->{$b} <=> $assocScoresRef->{$a} } keys(%$assocScoresRef);
    my $threshold =  ${$assocScoresRef}{$sortedKeys[$k-1]};
    print " threshold = $threshold\n";

    #add the first k keys to the thresholded matrix
    my ($cui1, $cui2);
    foreach my $key (@sortedKeys) {
	($cui1, $cui2) = split(/,/, $key);

	#create new hash at rowkey location (if needed)
	if (!(exists $thresholdedMatrix{$cui1})) {
	    my %newHash = ();
	    $thresholdedMatrix{$cui1} = \%newHash;
	}

	#set key value for the key pair
	${$thresholdedMatrix{$cui1}}{$cui2} = ${${$matrixRef}{$cui1}}{$cui2};
	$postKeyCount++;

	#stop adding keys when below the threshold
	if (${$assocScoresRef}{$key} < $threshold) {
	    last;
	}
    }
    #return the thresholded matrix
    return \%thresholdedMatrix;
}


# calculates precision and recall at $numIntervals (e.g. 10 for 10%) recall 
# intervals using an implicit ranking threshold
# input:  $trueMatrixRef <- a ref to a hash of true discoveries
#         $rowRanksRef <- a ref to a hash of arrays of ranked predictions. 
#                         Each hash key is a cui,  each hash element is an 
#                         array of ranked predictions for that cui. The ranked 
#                         predictions are cuis are ordered in descending order 
#                         based on association. (from Rank::RankDescending)
#         $numIntervals <- the number of recall intervals to generate
# output: (\%precision, \%recall) <- refs to hashes of precision and recall. 
#                                    Each hash key is the interval number, and 
#                                    the value is the precision and recall 
#                                    respectively
sub calculatePrecisionAndRecall_implicit {
    my $trueMatrixRef = shift; #a ref to the true matrix
    my $rowRanksRef = shift; #a ref to ranked predictions, each hash element are the predictions for a single cui, at each element is an array of cuis ordered by their rank
    my $numIntervals = shift; #the recall intervals to test at

    #find precision and recall curves for each cui that is being predicted
    #  take the sum of precisions, then average after the loop
    my %precision = ();
    my %recall = ();
    foreach my $rowKey (keys %{$trueMatrixRef}) {
	my $trueRef = ${$trueMatrixRef}{$rowKey}; #a list of true discoveries
	my $rankedPredictionsRef = ${$rowRanksRef}{$rowKey}; #an array ref of ranked predictions
	
	#get the number of predicted discoveries and true discoveries
	my $numPredictions = scalar @{$rankedPredictionsRef};
	my $numTrue = scalar keys %{$trueRef};

	#skip if there are NO new discoveries for this start term
	if ($numTrue == 0) {
	    next;
	}
	#skip if there are NO predictions for this start term
	if ($numPredictions == 0) {
	    next;
	}

	#determine precision and recall at 10% intervals of the number of 
	#predicted true vaules. This is done by simulating a threshold being
	#applied, so the top $numToTest ranked terms are tested at 10% intervals
	my $interval = $numPredictions/$numIntervals;
	for (my $i = 0; $i <= 1; $i+=(1/$numIntervals)) {
	    
	    #determine the number true to grab
	    my $numTrueForInterval = 1; #at $i = 0, grab just the first term that is true

lib/LiteratureBasedDiscovery/TimeSlicing.pm  view on Meta::CPAN

#                         Each hash key is a cui,  each hash element is an 
#                         array of ranked predictions for that cui. The ranked 
#                         predictions are cuis are ordered in descending order 
#                         based on association. (from Rank::RankDescending)
# output: \%meanPrecision <- a hash of mean preicsions at K, each key is the 
#                            value of k, the the value is the precision at that
#                            k
sub calculatePrecisionAtK {
    #grab the input
    my $trueMatrixRef = shift; # a matrix of true discoveries
    my $rowRanksRef = shift; # a hash of ranked predicted discoveries
  
    #generate precision at k at intervals of 10 for k = 10-100
    my %meanPrecision = ();
    my $interval = 1;
    for (my $k = 1; $k <= 100; $k+=$interval) {
	$meanPrecision{$k} = 0;

	#average the mean precision over all terms
	foreach my $rowKey (keys %{$trueMatrixRef}) {
	    my $rankedPredictionsRef = ${$rowRanksRef}{$rowKey}; #an array ref of ranked predictions
	    
	    #skip for rows that have no predictions
	    if (!defined $rankedPredictionsRef) {
		next;
	    } 
	    my $trueRef = ${$trueMatrixRef}{$rowKey}; #a list of true discoveries
	    #threshold the interval, so that it does not exceed 
	    # the number of predictions
	    my $interval = $k;
	    if ($k > scalar @{$rankedPredictionsRef}) {
		$interval = scalar @{$rankedPredictionsRef};
	    }

	    #find the number of true positives in the top $interval ranked terms
	    my $truePositiveCount = 0;
	    for (my $rank = 0; $rank < $interval; $rank++) {
		my $cui = ${$rankedPredictionsRef}[$rank];
		if (exists ${$trueRef}{$cui}) {
		    $truePositiveCount++;
		}
	    }

	    #add this precision to the mean precisions at k
	    $meanPrecision{$k} += ($truePositiveCount/$interval);
	}
	#take the mean of the precisions
	$meanPrecision{$k} /= (scalar keys %{$trueMatrixRef});

	#after computing precision at 1-10, change interval to 10
	if ($k == 10) {
	    $interval = 10;
	}
    }

    #return the mean precisions at k
    return \%meanPrecision;
}


# calculates the number of co-occurrences in the gold set of the top ranked 
# k predictions at k at intervals of 1, from k = 1-10 and intervals of 10 
# for 10-100. Co-occurrence counts are averaged over each of the starting terms
# input:  $trueMatrixRef <- a ref to a hash of true discoveries
#         $rowRanksRef <- a ref to a hash of arrays of ranked predictions. 
#                         Each hash key is a cui,  each hash element is an 
#                         array of ranked predictions for that cui. The ranked 
#                         predictions are cuis are ordered in descending order 
#                         based on association. (from Rank::RankDescending)
# output: \%meanCooccurrenceCounts <- a hash of mean preicsions at K, each key 
#                                     is the value of k, the the value is the 
#                                     precision at that k
sub calculateMeanCooccurrencesAtK {
    #grab the input
    my $trueMatrixRef = shift; # a matrix of true discoveries
    my $rowRanksRef = shift; # a hash of ranked predicted discoveries
  
    #generate mean cooccurrences at k at intervals of 10 for k = 10-100
    my %meanCooccurrenceCount = (); #count of the number of co-occurrences for each k
    my $interval = 1;
    for (my $k = 1; $k <= 100; $k+=$interval) {
	$meanCooccurrenceCount{$k} = 0;

	#average the mean co-occurrenes over all terms
	#  the true matrix contains only rows for the cuis being tested 
        #  or in time slicing
	foreach my $rowKey (keys %{$trueMatrixRef}) {
	    my $rankedPredictionsRef = ${$rowRanksRef}{$rowKey}; #an array ref of ranked predictions
	    
	    #skip for rows that have no predictions
	    if (!defined $rankedPredictionsRef) {
		next;
	    } 
	    my $trueRef = ${$trueMatrixRef}{$rowKey}; #a list of true discoveries

	    #threshold the interval, so that it does not exceed 
	    # the number of predictions
	    my $interval = $k;
	    if ($k > scalar @{$rankedPredictionsRef}) {
		$interval = scalar @{$rankedPredictionsRef};
	    }

	    #find the number of true co-occurrence for the top $interval 
	    # ranked terms
	    my $cooccurrenceCount = 0;
	    for (my $rank = 0; $rank < $interval; $rank++) {
		my $cui = ${$rankedPredictionsRef}[$rank];
		if (exists ${$trueRef}{$cui}) {
		    $cooccurrenceCount += ${$trueRef}{$cui};
		}
	    }

	    #add this precision to the mean precisions at k
	    $meanCooccurrenceCount{$k} += $cooccurrenceCount;
	}
	#take the mean of the cooccurrence counts
	$meanCooccurrenceCount{$k} /= (scalar keys %{$trueMatrixRef});

	#after computing cooccurrence counts  at 1-10, change interval to 10
	if ($k == 10) {
	    $interval = 10;



( run in 1.697 second using v1.01-cache-2.11-cpan-39bf76dae61 )