ALBD

 view release on metacpan or  search on metacpan

lib/LiteratureBasedDiscovery/TimeSlicing.pm  view on Meta::CPAN

# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to
#
# The Free Software Foundation, Inc.,
# 59 Temple Place - Suite 330,
# Boston, MA  02111-1307, USA.

package TimeSlicing;
use strict;
use warnings;

use LiteratureBasedDiscovery::Discovery;


#
# Calculates and outputs to STDOUT Time Slicing evaluation stats of
# precision and recall at $numIntervals intervals, Mean Average Precision
# (MAP), precision at k, and frequency at k
# input:  $trueMatrixRef <- a ref to a hash of true discoveries
#         $rowRanksRef <- a ref to a hash of arrays of ranked predictions. 
#                         Each hash key is a cui,  each hash element is an 
#                         array of ranked predictions for that cui. The ranked 
#                         predictions are cuis are ordered in descending order 
#                         based on association. (from Rank::RankDescending)
#         $numIntervals <- the number of recall intervals to generate
sub outputTimeSlicingResults {
    #grab the input
    my $goldMatrixRef = shift;
    my $rowRanksRef = shift;
    my $numIntervals = shift;


#calculate and output stats
#------------------------------------------

 #calculate precision and recall
    print "calculating precision and recall\n";
    my ($precisionRef, $recallRef) = &calculatePrecisionAndRecall_implicit(
	 $goldMatrixRef, $rowRanksRef, $numIntervals);

    #output precision and recall
    print "----- average precision at 10% recall intervals (i recall precision) ----> \n";
    foreach my $i (sort {$a <=> $b} keys %{$precisionRef}) {
	print "      $i ${$recallRef}{$i} ${$precisionRef}{$i}\n";
    }
    print "\n";
    
#-------------------------------------------
    
    #calculate mean average precision
    my $map = &calculateMeanAveragePrecision(
	$goldMatrixRef, $rowRanksRef);
    #output mean average precision
    print "---------- mean average precision ---------------> \n";
    print "      MAP = $map\n";
    print "\n";

#-------------------------------------------
    
    #calculate precision at k
    print "calculating precision at k\n";
    my $precisionAtKRef = &calculatePrecisionAtK($goldMatrixRef, $rowRanksRef);
    #output precision at k
    print "---------- mean precision at k intervals ---------------> \n";
    foreach my $k (sort {$a <=> $b} keys %{$precisionAtKRef}) {
	print "      $k ${$precisionAtKRef}{$k}\n";
    }
    print "\n";

#-------------------------------------------
    
    #calculate cooccurrences at k
    print "calculating mean cooccurrences at k\n";
    my $cooccurrencesAtKRef = &calculateMeanCooccurrencesAtK($goldMatrixRef, $rowRanksRef);
    #output cooccurrences at k
    print "---------- mean cooccurrences at k intervals ---------------> \n";
    foreach my $k (sort {$a <=> $b} keys %{$cooccurrencesAtKRef}) {
	print "      $k ${$cooccurrencesAtKRef}{$k}\n";
    }
    print "\n";

}


# loads a list of cuis for use in time slicing from file
# the CUI file contains a line seperated list of CUIs
# input:  $cuiFileName <- a string specifying the file to load cuis from
# output: $\%cuis <- a ref to a hash of cuis, each key is a cui, values are 1
sub loadCUIs {
    my $cuiFileName = shift;
    
    #open the file
    open IN, $cuiFileName 
	or die("ERROR: cannot open CUI File: $cuiFileName\n");
    
    #read each line of the file
    my %cuis = ();
    while (my $line = <IN>) {
	chomp $line;
	
	#only add the line if it properly formatted
	if ($line =~ /C\d{7}/) {
	    $cuis{$line} = 1;
	}
    }
    close IN;

    return \%cuis;
}


# calculates average precision and recall of the generated implicit matrix 
# compared to the post cutoff matrix
# input:  $predictionsMatrixRef <- a ref to a sparse matrix of predicted 
#                                  discoveries

lib/LiteratureBasedDiscovery/TimeSlicing.pm  view on Meta::CPAN


	#determine precision and recall at 10% intervals of the number of 
	#predicted true vaules. This is done by simulating a threshold being
	#applied, so the top $numToTest ranked terms are tested at 10% intervals
	my $interval = $numPredictions/$numIntervals;
	for (my $i = 0; $i <= 1; $i+=(1/$numIntervals)) {
	    
	    #determine the number true to grab
	    my $numTrueForInterval = 1; #at $i = 0, grab just the first term that is true
	    if ($i > 0) {
		$numTrueForInterval = $numTrue*$i;
	    }

	    #grab true discoveries until the recall rate is exceeded
	    my $truePositive = 0;
	    my $numChecked = 0;
	    for (my $j = 0; $j < $numPredictions; $j++) {

		#get the jth ranked cui and check if it is a true discovery
		my $cui = ${$rankedPredictionsRef}[$j];
		if (exists ${$trueRef}{$cui}) {
		    $truePositive++;
		}
		$numChecked++;

		#check if the recall rate has been reached
		if ($truePositive > $numTrueForInterval) {
		    last;
		}
	    }
	    #sum precision at this interval, average over number of rows is 
	    # taken outside of the loop
	    $precision{$i} += ($truePositive / $numChecked); #number that are selected that are true
	    $recall{$i} += ($truePositive / $numTrue); #number of true that are selected	
	}
    }

    #calculate the average precision at each interval
    foreach my $i (keys %precision) {
	#divide by the number of rows in the true matrix ref
	# because those are the number of cuis we are testing
	# it is possible that the predictions has rows that are 
	# not in the true, and those should be ignored.
	$precision{$i} /= (scalar keys %{$trueMatrixRef});
	$recall{$i} /= (scalar keys %{$trueMatrixRef});
    }

    #return the precision and recall at 10% intervals
    return (\%precision, \%recall);
}



# calculates the mean average precision (MAP)
# input:  $trueMatrixRef <- a ref to a hash of true discoveries
#         $rowRanksRef <- a ref to a hash of arrays of ranked predictions. 
#                         Each hash key is a cui,  each hash element is an 
#                         array of ranked predictions for that cui. The ranked 
#                         predictions are cuis are ordered in descending order 
#                         based on association. (from Rank::RankDescending)
# output: $map <- a scalar value of mean average precision (MAP)
sub calculateMeanAveragePrecision {
    #grab the input
    my $trueMatrixRef = shift; # a matrix of true discoveries
    my $rowRanksRef = shift; # a hash of ranked predicted discoveries
    print "calculating mean average precision\n";

    #calculate MAP for each true discovery being predicted
    my $map = 0;
    foreach my $rowKey (keys %{$trueMatrixRef}) {
	my $rankedPredictionsRef = ${$rowRanksRef}{$rowKey}; #an array ref of ranked predictions

	#skip for rows that have no predictions
	if (!defined $rankedPredictionsRef) {
	    next;
	} 
	my $trueRef = ${$trueMatrixRef}{$rowKey}; #a list of true discoveries
	my $numPredictions = scalar @{$rankedPredictionsRef};

	#calculate the average precision of this true cui, by comparing 
	# the predicted vs. true values ordered and weighted by their rank
	my $ap = 0; #average precision
	my $truePositiveCount = 0;
	#start at 1, since divide by rank...subtract one when indexing
	for (my $rank = 1; $rank <= $numPredictions; $rank++) {
	    my $cui = ${$rankedPredictionsRef}[$rank-1];
	    if (exists ${$trueRef}{$cui}) {
		$truePositiveCount++;
		$ap += ($truePositiveCount/($rank));
	    }
	}

	#calculate the average precision, and add to map
	if ($truePositiveCount > 0) {
	    $ap /= $truePositiveCount;
	} #else, $ap is already 0 so do nothing
	$map += $ap;
    }

    #take the mean of the average precisions
    # divide by the number of true discoveries that you summed over
    $map /= (scalar keys %{$trueMatrixRef});

    #return the mean average precision
    return $map;
}


# calculates the mean precision at k at intervals of 1, 
# from k = 1-10 and intervals of 10 for 10-100
# input:  $trueMatrixRef <- a ref to a hash of true discoveries
#         $rowRanksRef <- a ref to a hash of arrays of ranked predictions. 
#                         Each hash key is a cui,  each hash element is an 
#                         array of ranked predictions for that cui. The ranked 
#                         predictions are cuis are ordered in descending order 
#                         based on association. (from Rank::RankDescending)
# output: \%meanPrecision <- a hash of mean preicsions at K, each key is the 
#                            value of k, the the value is the precision at that
#                            k
sub calculatePrecisionAtK {
    #grab the input
    my $trueMatrixRef = shift; # a matrix of true discoveries
    my $rowRanksRef = shift; # a hash of ranked predicted discoveries
  
    #generate precision at k at intervals of 10 for k = 10-100
    my %meanPrecision = ();
    my $interval = 1;
    for (my $k = 1; $k <= 100; $k+=$interval) {
	$meanPrecision{$k} = 0;

	#average the mean precision over all terms
	foreach my $rowKey (keys %{$trueMatrixRef}) {
	    my $rankedPredictionsRef = ${$rowRanksRef}{$rowKey}; #an array ref of ranked predictions
	    
	    #skip for rows that have no predictions
	    if (!defined $rankedPredictionsRef) {
		next;
	    } 
	    my $trueRef = ${$trueMatrixRef}{$rowKey}; #a list of true discoveries
	    #threshold the interval, so that it does not exceed 
	    # the number of predictions
	    my $interval = $k;
	    if ($k > scalar @{$rankedPredictionsRef}) {
		$interval = scalar @{$rankedPredictionsRef};
	    }

	    #find the number of true positives in the top $interval ranked terms
	    my $truePositiveCount = 0;
	    for (my $rank = 0; $rank < $interval; $rank++) {
		my $cui = ${$rankedPredictionsRef}[$rank];
		if (exists ${$trueRef}{$cui}) {
		    $truePositiveCount++;
		}
	    }

	    #add this precision to the mean precisions at k
	    $meanPrecision{$k} += ($truePositiveCount/$interval);
	}
	#take the mean of the precisions
	$meanPrecision{$k} /= (scalar keys %{$trueMatrixRef});

	#after computing precision at 1-10, change interval to 10
	if ($k == 10) {
	    $interval = 10;
	}



( run in 0.813 second using v1.01-cache-2.11-cpan-39bf76dae61 )