ALBD

 view release on metacpan or  search on metacpan

lib/LiteratureBasedDiscovery/TimeSlicing.pm  view on Meta::CPAN



# loads the post cutoff matrix from file. Only loads rows corresponding
# to rows in the starting matrix ref to save memory, and because those are 
# the only rows that are needed.
# input:  $startingMatrixRef <- a ref to the starting sparse matrix
#         $explicitMatrix Ref <- a ref to the explicit sparse matrix
#         $postCutoffFileName <- the filename to the postCutoffMatrix
# output: \%postCutoffMatrix <- a ref to the postCutoff sparse matrix
sub loadPostCutOffMatrix {
    my $startingMatrixRef = shift;
    my $explicitMatrixRef = shift;
    my $postCutoffFileName = shift;
    print "loading postCutoff Matrix\n";
    
    #open the post cutoff file
    open IN, $postCutoffFileName 
	or die ("ERROR: cannot open post cutoff file: $postCutoffFileName");

    #create hash of cuis to grab
    my %cuisToGrab = ();
    foreach my $rowKey (keys %{$startingMatrixRef}) {
	$cuisToGrab{$rowKey} = 1;
    }

    #read in values of the post cutoff matrix for the start terms
    my %postCutoffMatrix = ();
    my ($cui1, $cui2, $val);
    while (my $line = <IN>) {
	#grab values from the line
	chomp $line;
	($cui1, $cui2, $val) = split(/\t/,$line);

	#see if this line contains a key that should be read in 
	if (exists $cuisToGrab{$cui1}) {

	    #add the value
	    if (!(defined $postCutoffMatrix{$cui1})) {
		my %newHash = ();
		$postCutoffMatrix{$cui1} = \%newHash;
	    }

	    #check to ensure that the column cui is in the 
	    #  vocabulary of the pre-cutoff dataset.
	    #  it is impossible to make predictions of words that
	    #  don't already exist
	    #NOTE: this assumes $explicitMatrixRef is a square 
	    #   matrix (so unordered)
	    if (exists ${$explicitMatrixRef}{$cui2}) {
		${$postCutoffMatrix{$cui1}}{$cui2} = $val;
	    }
	}
    }
    close IN;

    #return the post cutoff matrix
    return \%postCutoffMatrix;
}

#TODO numRows should be read from file and sent with the lbdOptionsRef
# generates a starting matrix of numRows randomly selected terms
# input:  $explicitMatrixRef <- a ref to the explicit sparse matrix
#         $lbdOptionsRef <- the LBD options
#         $startTermAcceptTypesRef <- a reference to an hash of accept 
#                                     types for start terms (TUIs)
#         $numRows <- the number of random rows to load (if random)
#         $umls_interface <- an instance of the UMLS::Interface
# output: \%startingMatrix <- a ref to the starting sparse matrix
sub generateStartingMatrix {
    my $explicitMatrixRef = shift;
    my $lbdOptionsRef = shift;
    my $startTermAcceptTypesRef = shift;
    my $numRows = shift;
    my $umls_interface = shift;

    #generate the starting matrix randomly or from a file
    my %startingMatrix = ();

    #check if a file is defined
    if (exists ${$lbdOptionsRef}{'cuiListFileName'}) {
	#grab the rows defined by the cuiListFile
	my $cuisRef = &loadCUIs(${$lbdOptionsRef}{'cuiListFileName'});
	foreach my $cui (keys %{$cuisRef}) {
	    if(exists ${$explicitMatrixRef}{$cui}) {
		$startingMatrix{$cui} = ${$explicitMatrixRef}{$cui};	
	    }
	    else {
		print STDERR "WARNING: CUI from cuiListFileName is not in explicitMatrix: $cui\n";
	    }
	}
    }
    else {
	#randomly grab rows
	#apply semantic filter to the rows (just retreive appropriate rows)
	my $rowsToKeepRef = getRowsOfSemanticTypes(
	    $explicitMatrixRef, $startTermAcceptTypesRef, $umls_interface);
	((scalar keys %{$rowsToKeepRef}) >= $numRows) or die("ERROR: number of acceptable rows starting terms is less than $numRows\n");

	#randomly select 100 rows (to generate the 'starting matrix')
	#generate random numbers from 0 to number of rows in the explicit matrix
	my %rowNumbers = ();
	while ((scalar keys %rowNumbers) < $numRows) {
	    $rowNumbers{int(rand(scalar keys %{$rowsToKeepRef}))} = 1;
	}

	#fill starting matrix with keys corresponding to the random numbers 
	my $i = 0;
	foreach my $key (keys %{$rowsToKeepRef}) {
	    if (exists $rowNumbers{$i}) {
		$startingMatrix{$key} = ${$explicitMatrixRef}{$key}
	    }
	    $i++;
	}

	#output the cui list if needed
	if (exists ${$lbdOptionsRef}{'cuiListOutputFile'}) {
	    open OUT, ">".${$lbdOptionsRef}{'cuiListOutputFile'} or die ("ERROR: cannot open cuiListOutputFile:".${$lbdOptionsRef}{'cuiListOutputFile'}."\n");
	    foreach my $cui (keys %startingMatrix) {
		print OUT "$cui\n";
	    }
	    close OUT;
	}
    }

    #return the starting matrix
    return \%startingMatrix;
}


# gets and returns a hash of row keys of the specifies semantic types
# input:  $matrixRef <- a ref to a sparse matrix
#         $acceptTypesRef <- a ref to a hash of accept types (TUIs)
#         $umls <- an instance of UMLS::Interface
# output: \%rowsToKeep <- a ref to hash of rows to keep, each key is 
#                         a CUI, and values are 1. All CUIs specify rows
#                         of acceptable semantic types
sub getRowsOfSemanticTypes {
    my $matrixRef = shift;
    my $acceptTypesRef = shift;
    my $umls = shift;
    
    #loop through the matrix and keep the rows that are of the 
    # desired semantic types
    my %rowsToKeep = ();
    foreach my $cui1 (keys %{$matrixRef}) {
	my $typesRef = $umls->getSt($cui1);
	foreach my $type(@{$typesRef}) {
	    my $abr = $umls->getStAbr($type);

	    #check the cui for removal
	    if (exists ${$acceptTypesRef}{$type}) {
		$rowsToKeep{$cui1} = 1;
		last;
	    }
	}
    }

    #return the rowsToKeep
    return \%rowsToKeep

lib/LiteratureBasedDiscovery/TimeSlicing.pm  view on Meta::CPAN

#                                    Each hash key is the interval number, and 
#                                    the value is the precision and recall 
#                                    respectively
sub calculatePrecisionAndRecall_implicit {
    my $trueMatrixRef = shift; #a ref to the true matrix
    my $rowRanksRef = shift; #a ref to ranked predictions, each hash element are the predictions for a single cui, at each element is an array of cuis ordered by their rank
    my $numIntervals = shift; #the recall intervals to test at

    #find precision and recall curves for each cui that is being predicted
    #  take the sum of precisions, then average after the loop
    my %precision = ();
    my %recall = ();
    foreach my $rowKey (keys %{$trueMatrixRef}) {
	my $trueRef = ${$trueMatrixRef}{$rowKey}; #a list of true discoveries
	my $rankedPredictionsRef = ${$rowRanksRef}{$rowKey}; #an array ref of ranked predictions
	
	#get the number of predicted discoveries and true discoveries
	my $numPredictions = scalar @{$rankedPredictionsRef};
	my $numTrue = scalar keys %{$trueRef};

	#skip if there are NO new discoveries for this start term
	if ($numTrue == 0) {
	    next;
	}
	#skip if there are NO predictions for this start term
	if ($numPredictions == 0) {
	    next;
	}

	#determine precision and recall at 10% intervals of the number of 
	#predicted true vaules. This is done by simulating a threshold being
	#applied, so the top $numToTest ranked terms are tested at 10% intervals
	my $interval = $numPredictions/$numIntervals;
	for (my $i = 0; $i <= 1; $i+=(1/$numIntervals)) {
	    
	    #determine the number true to grab
	    my $numTrueForInterval = 1; #at $i = 0, grab just the first term that is true
	    if ($i > 0) {
		$numTrueForInterval = $numTrue*$i;
	    }

	    #grab true discoveries until the recall rate is exceeded
	    my $truePositive = 0;
	    my $numChecked = 0;
	    for (my $j = 0; $j < $numPredictions; $j++) {

		#get the jth ranked cui and check if it is a true discovery
		my $cui = ${$rankedPredictionsRef}[$j];
		if (exists ${$trueRef}{$cui}) {
		    $truePositive++;
		}
		$numChecked++;

		#check if the recall rate has been reached
		if ($truePositive > $numTrueForInterval) {
		    last;
		}
	    }
	    #sum precision at this interval, average over number of rows is 
	    # taken outside of the loop
	    $precision{$i} += ($truePositive / $numChecked); #number that are selected that are true
	    $recall{$i} += ($truePositive / $numTrue); #number of true that are selected	
	}
    }

    #calculate the average precision at each interval
    foreach my $i (keys %precision) {
	#divide by the number of rows in the true matrix ref
	# because those are the number of cuis we are testing
	# it is possible that the predictions has rows that are 
	# not in the true, and those should be ignored.
	$precision{$i} /= (scalar keys %{$trueMatrixRef});
	$recall{$i} /= (scalar keys %{$trueMatrixRef});
    }

    #return the precision and recall at 10% intervals
    return (\%precision, \%recall);
}



# calculates the mean average precision (MAP)
# input:  $trueMatrixRef <- a ref to a hash of true discoveries
#         $rowRanksRef <- a ref to a hash of arrays of ranked predictions. 
#                         Each hash key is a cui,  each hash element is an 
#                         array of ranked predictions for that cui. The ranked 
#                         predictions are cuis are ordered in descending order 
#                         based on association. (from Rank::RankDescending)
# output: $map <- a scalar value of mean average precision (MAP)
sub calculateMeanAveragePrecision {
    #grab the input
    my $trueMatrixRef = shift; # a matrix of true discoveries
    my $rowRanksRef = shift; # a hash of ranked predicted discoveries
    print "calculating mean average precision\n";

    #calculate MAP for each true discovery being predicted
    my $map = 0;
    foreach my $rowKey (keys %{$trueMatrixRef}) {
	my $rankedPredictionsRef = ${$rowRanksRef}{$rowKey}; #an array ref of ranked predictions

	#skip for rows that have no predictions
	if (!defined $rankedPredictionsRef) {
	    next;
	} 
	my $trueRef = ${$trueMatrixRef}{$rowKey}; #a list of true discoveries
	my $numPredictions = scalar @{$rankedPredictionsRef};

	#calculate the average precision of this true cui, by comparing 
	# the predicted vs. true values ordered and weighted by their rank
	my $ap = 0; #average precision
	my $truePositiveCount = 0;
	#start at 1, since divide by rank...subtract one when indexing
	for (my $rank = 1; $rank <= $numPredictions; $rank++) {
	    my $cui = ${$rankedPredictionsRef}[$rank-1];
	    if (exists ${$trueRef}{$cui}) {
		$truePositiveCount++;
		$ap += ($truePositiveCount/($rank));
	    }
	}

	#calculate the average precision, and add to map
	if ($truePositiveCount > 0) {



( run in 2.393 seconds using v1.01-cache-2.11-cpan-39bf76dae61 )