ALBD
view release on metacpan or search on metacpan
lib/LiteratureBasedDiscovery/TimeSlicing.pm view on Meta::CPAN
# grabs the $k highest ranked samples (cui pairs) and copies their values
# from the full co-occurrence matrix
# input:  $k              <- the number of top ranked samples to keep
#         $assocScoresRef <- a ref to a hash of association scores; each key
#                            is a comma separated cui pair ("cui1,cui2")
#         $matrixRef      <- a ref to a sparse matrix (hash of hashes) that
#                            corresponds to the assocScoresRef
# output: \%thresholdedMatrix <- a ref to a sparse matrix containing only the
#                                $k top ranked samples (cui pairs); pairs tied
#                                with the k-th score are all kept
sub grabKHighestRankedSamples {
    my $k = shift;
    my $assocScoresRef = shift;
    my $matrixRef = shift;
    print "getting $k highest ranked samples\n";

    #get the keys sorted by score in descending order
    my @sortedKeys = sort { $assocScoresRef->{$b} <=> $assocScoresRef->{$a} } keys(%$assocScoresRef);

    #clamp $k so a request larger than the number of samples cannot index
    # past the end of @sortedKeys (which would make the threshold undef)
    my $numKeys = scalar @sortedKeys;
    my $kIndex = ($k < $numKeys) ? $k : $numKeys;

    #the threshold is the score of the k-th ranked sample
    my $threshold = $assocScoresRef->{$sortedKeys[$kIndex-1]};
    print " threshold = $threshold\n";

    #add keys to the thresholded matrix until the score drops below the
    # threshold. NOTE: the check is done BEFORE inserting, so no
    # below-threshold pair is ever added (previously one extra pair whose
    # score fell below the threshold was included)
    my %thresholdedMatrix = ();
    foreach my $key (@sortedKeys) {
        #stop before adding any key below the threshold
        last if ($assocScoresRef->{$key} < $threshold);
        my ($cui1, $cui2) = split(/,/, $key);
        #create new hash at rowkey location (if needed)
        if (!exists $thresholdedMatrix{$cui1}) {
            $thresholdedMatrix{$cui1} = {};
        }
        #set key value for the key pair
        $thresholdedMatrix{$cui1}{$cui2} = $matrixRef->{$cui1}{$cui2};
    }

    #return the thresholded matrix
    return \%thresholdedMatrix;
}
# calculates precision and recall at $numIntervals (e.g. 10 for 10%) recall
# intervals using an implicit ranking threshold
# input: $trueMatrixRef <- a ref to a hash of true discoveries
# $rowRanksRef <- a ref to a hash of arrays of ranked predictions.
# Each hash key is a cui, each hash element is an
# array of ranked predictions for that cui. The ranked
# predictions are cuis ordered in descending order
# based on association. (from Rank::RankDescending)
# $numIntervals <- the number of recall intervals to generate
# output: (\%precision, \%recall) <- refs to hashes of precision and recall.
# Each hash key is the interval number, and
# the value is the precision and recall
# respectively
sub calculatePrecisionAndRecall_implicit {
    my $trueMatrixRef = shift; #a ref to the true matrix
    my $rowRanksRef = shift; #a ref to ranked predictions, each hash element are the predictions for a single cui, at each element is an array of cuis ordered by their rank
    my $numIntervals = shift; #the recall intervals to test at

    #find precision and recall curves for each cui that is being predicted
    # take the sum of precisions, then average after the loop
    my %precision = ();
    my %recall = ();
    foreach my $rowKey (keys %{$trueMatrixRef}) {
        my $trueRef = ${$trueMatrixRef}{$rowKey}; #a hash ref of true discoveries
        my $rankedPredictionsRef = ${$rowRanksRef}{$rowKey}; #an array ref of ranked predictions

        #skip rows with no predictions entry at all -- dereferencing undef
        # would die under strict refs (the other evaluation subs in this
        # module guard for this the same way)
        if (!defined $rankedPredictionsRef) {
            next;
        }

        #get the number of predicted discoveries and true discoveries
        my $numPredictions = scalar @{$rankedPredictionsRef};
        my $numTrue = scalar keys %{$trueRef};

        #skip if there are NO new discoveries for this start term
        if ($numTrue == 0) {
            next;
        }
        #skip if there are NO predictions for this start term
        if ($numPredictions == 0) {
            next;
        }

        #determine precision and recall at recall intervals of the number of
        #predicted true values. This is done by simulating a threshold being
        #applied, so the top ranked terms are tested at each interval
        #NOTE(review): $i accumulates floating point increments, so for some
        # values of $numIntervals the hash keys carry float rounding error
        # (e.g. 0.30000000000000004); callers iterate the returned keys, so
        # this is tolerated -- changing to exact fractions would change the
        # returned key values
        for (my $i = 0; $i <= 1; $i += (1/$numIntervals)) {
            #determine the number true to grab
            my $numTrueForInterval = 1; #at $i = 0, grab just the first term that is true
            if ($i > 0) {
                $numTrueForInterval = $numTrue*$i;
            }
            #grab true discoveries until the recall rate is exceeded
            my $truePositive = 0;
            my $numChecked = 0;
            for (my $j = 0; $j < $numPredictions; $j++) {
                #get the jth ranked cui and check if it is a true discovery
                my $cui = ${$rankedPredictionsRef}[$j];
                if (exists ${$trueRef}{$cui}) {
                    $truePositive++;
                }
                $numChecked++;
                #stop once the recall target for this interval is exceeded
                if ($truePositive > $numTrueForInterval) {
                    last;
                }
            }
            #sum precision at this interval, average over number of rows is
            # taken outside of the loop
            $precision{$i} += ($truePositive / $numChecked); #number that are selected that are true
            $recall{$i} += ($truePositive / $numTrue); #number of true that are selected
        }
    }

    #calculate the average precision at each interval
    foreach my $i (keys %precision) {
        #divide by the number of rows in the true matrix ref
        # because those are the number of cuis we are testing
        # it is possible that the predictions has rows that are
        # not in the true, and those should be ignored.
        $precision{$i} /= (scalar keys %{$trueMatrixRef});
        $recall{$i} /= (scalar keys %{$trueMatrixRef});
    }

    #return the precision and recall at each recall interval
    return (\%precision, \%recall);
}
# calculates the mean average precision (MAP)
# input: $trueMatrixRef <- a ref to a hash of true discoveries
# $rowRanksRef <- a ref to a hash of arrays of ranked predictions.
# Each hash key is a cui, each hash element is an
# array of ranked predictions for that cui. The ranked
# predictions are cuis ordered in descending order
# based on association. (from Rank::RankDescending)
# output: $map <- a scalar value of mean average precision (MAP)
sub calculateMeanAveragePrecision {
#grab the input
my $trueMatrixRef = shift; # a matrix of true discoveries
my $rowRanksRef = shift; # a hash of ranked predicted discoveries
print "calculating mean average precision\n";
#calculate MAP for each true discovery being predicted
my $map = 0;
foreach my $rowKey (keys %{$trueMatrixRef}) {
my $rankedPredictionsRef = ${$rowRanksRef}{$rowKey}; #an array ref of ranked predictions
#skip for rows that have no predictions
if (!defined $rankedPredictionsRef) {
next;
}
my $trueRef = ${$trueMatrixRef}{$rowKey}; #a list of true discoveries
my $numPredictions = scalar @{$rankedPredictionsRef};
#calculate the average precision of this true cui, by comparing
# the predicted vs. true values ordered and weighted by their rank
my $ap = 0; #average precision
my $truePositiveCount = 0;
#start at 1, since divide by rank...subtract one when indexing
for (my $rank = 1; $rank <= $numPredictions; $rank++) {
my $cui = ${$rankedPredictionsRef}[$rank-1];
if (exists ${$trueRef}{$cui}) {
$truePositiveCount++;
$ap += ($truePositiveCount/($rank));
}
}
#calculate the average precision, and add to map
if ($truePositiveCount > 0) {
$ap /= $truePositiveCount;
} #else, $ap is already 0 so do nothing
$map += $ap;
}
#take the mean of the average precisions
# divide by the number of true discoveries that you summed over
lib/LiteratureBasedDiscovery/TimeSlicing.pm view on Meta::CPAN
next;
}
my $trueRef = ${$trueMatrixRef}{$rowKey}; #a list of true discoveries
#threshold the interval, so that it does not exceed
# the number of predictions
my $interval = $k;
if ($k > scalar @{$rankedPredictionsRef}) {
$interval = scalar @{$rankedPredictionsRef};
}
#find the number of true positives in the top $interval ranked terms
my $truePositiveCount = 0;
for (my $rank = 0; $rank < $interval; $rank++) {
my $cui = ${$rankedPredictionsRef}[$rank];
if (exists ${$trueRef}{$cui}) {
$truePositiveCount++;
}
}
#add this precision to the mean precisions at k
$meanPrecision{$k} += ($truePositiveCount/$interval);
}
#take the mean of the precisions
$meanPrecision{$k} /= (scalar keys %{$trueMatrixRef});
#after computing precision at 1-10, change interval to 10
if ($k == 10) {
$interval = 10;
}
}
#return the mean precisions at k
return \%meanPrecision;
}
# calculates the number of co-occurrences in the gold set of the top ranked
# k predictions at k at intervals of 1, from k = 1-10 and intervals of 10
# for 10-100. Co-occurrence counts are averaged over each of the starting terms
# input: $trueMatrixRef <- a ref to a hash of true discoveries
# $rowRanksRef <- a ref to a hash of arrays of ranked predictions.
# Each hash key is a cui, each hash element is an
# array of ranked predictions for that cui. The ranked
# predictions are cuis are ordered in descending order
# based on association. (from Rank::RankDescending)
# output: \%meanCooccurrenceCounts <- a hash of mean co-occurrence counts at
#                                     K; each key is the value of k, and the
#                                     value is the mean count at that k
# computes, for each k (k = 1..10 by 1, then 20..100 by 10), the mean number
# of gold-set co-occurrences among the top k ranked predictions, averaged
# over every row (starting cui) of the true matrix
# input:  $trueMatrixRef <- a ref to a hash of hashes of true discoveries
#                           (row cui -> predicted cui -> co-occurrence count)
#         $rowRanksRef   <- a ref to a hash of array refs of ranked
#                           predictions (descending association order)
# output: \%meanCooccurrenceCount <- hash mapping each k to its mean count
sub calculateMeanCooccurrencesAtK {
    #grab the input
    my $trueMatrixRef = shift; # a matrix of true discoveries
    my $rowRanksRef   = shift; # a hash of ranked predicted discoveries

    my %meanCooccurrenceCount = ();
    my $numRows = scalar keys %{$trueMatrixRef};

    #step through k = 1..10 one at a time, then 20..100 by tens
    my $step = 1;
    my $k = 1;
    while ($k <= 100) {
        my $total = 0;

        #sum the co-occurrence counts over all rows being tested
        # (the true matrix contains only rows for the cuis under test)
        foreach my $rowCui (keys %{$trueMatrixRef}) {
            my $predictionsRef = $rowRanksRef->{$rowCui};
            #rows with no predictions contribute nothing, but still count
            # toward the mean (divided by all rows below)
            next unless defined $predictionsRef;

            my $goldRef = $trueMatrixRef->{$rowCui};

            #never examine more ranked terms than exist for this row
            my $numAvailable = scalar @{$predictionsRef};
            my $limit = ($numAvailable < $k) ? $numAvailable : $k;

            #accumulate gold co-occurrence counts of the top $limit terms
            for my $rank (0 .. $limit - 1) {
                my $predictedCui = $predictionsRef->[$rank];
                if (exists $goldRef->{$predictedCui}) {
                    $total += $goldRef->{$predictedCui};
                }
            }
        }

        #average over the number of rows in the true matrix
        $meanCooccurrenceCount{$k} = $total / $numRows;

        #after computing counts at 1-10, widen the step to 10
        $step = 10 if ($k == 10);
        $k += $step;
    }

    #return the mean co-occurrence counts at each k
    return \%meanCooccurrenceCount;
}
1;
( run in 0.712 second using v1.01-cache-2.11-cpan-39bf76dae61 )