ALBD
view release on metacpan or search on metacpan
lib/LiteratureBasedDiscovery/Rank.pm view on Meta::CPAN
# Ranks CUIs by score, highest score first.
# input:  $scoresRef <- a hash ref to a hash of cuis and scores
#                       (hash{cui} = score)
# output: an array ref of the ranked cuis in descending score order; cuis
#         whose scores are numerically equal are ordered by ascending
#         CUI string
sub rankDescending {
    #grab the input
    my $scoresRef = shift;

    # Primary key: score, compared numerically, descending.
    # Secondary key: the CUI string itself, ascending. The tiebreaker is
    # arbitrary, but it makes the order of tied CUIs independent of hash
    # ordering so that results can be replicated.
    #
    # NOTE: scores are tied only when they are exactly equal numerically.
    # Two scores that differ by floating-point round-off (e.g. two
    # computations of 0.66666666666667) are not treated as tied, so such
    # items can rank differently from one run to the next. Avoiding that
    # would require a tolerance threshold, which is deliberately not
    # implemented here.
    my @rankedCuis = sort {
        $scoresRef->{$b} <=> $scoresRef->{$a}
            || $a cmp $b
    } keys %{$scoresRef};

    #return the ranked cuis
    return \@rankedCuis;
}
#XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
#XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
# gets association scores for a set of cui pairs, updating the pairs
# hash in place
# input:  $cuiPairsRef <- reference to a hash of pairs of matrix indices
#                         (key = '1,2')
#         $matrixRef   <- a reference to a sparse matrix of n11 values
#         $measure     <- the association measure to perform
#         $association <- an instance of UMLS::Association
#         $n1pRef      <- (optional) hash ref of n1p values
#         $np1Ref      <- (optional) hash ref of np1 values
#         $npp         <- (optional) scalar npp value
#         If any of the three optional values is missing, all three are
#         computed from $matrixRef with getAllStats.
# output: $cuiPairsRef (the same reference that was passed in). Its
#         values are updated to reflect the computed association score,
#         or -1 for a pair with no n11 value in the matrix. When $measure
#         is 'freq' the hash is returned untouched.
sub getBatchAssociationScores {
    my $cuiPairsRef = shift;
    my $matrixRef = shift;
    my $measure = shift;
    my $association = shift;

    #optionally pass in $n1pRef, $np1Ref, and $npp
    # do this if they get calculated multiple times
    # (such as with time slicing)
    my $n1pRef = shift;
    my $np1Ref = shift;
    my $npp = shift;

    #if the measure is frequency, you only need to return
    # the cuiPairs ref which already holds CUI frequencies
    if ($measure eq 'freq') {
        return $cuiPairsRef;
    }

    #calculate stats if needed
    if (!defined $n1pRef || !defined $np1Ref || !defined $npp) {
        ($n1pRef, $np1Ref, $npp) = getAllStats($matrixRef);
    }

    #get association scores for each CUI pair
    foreach my $key (keys %{$cuiPairsRef}) {
        #get the cui indices
        my ($cui1, $cui2) = split(/,/, $key);

        #assume calculation cannot be made
        ${$cuiPairsRef}{$key} = -1;

        #get n11. The row is fetched on its own first: dereferencing
        # $matrixRef->{$cui1}{$cui2} directly would autovivify an empty
        # row in the caller's matrix whenever $cui1 has no row.
        my $rowRef = ${$matrixRef}{$cui1};
        my $n11 = defined $rowRef ? ${$rowRef}{$cui2} : undef;

        #get association if possible (only possible if the terms have
        # co-occurred)
        if (defined $n11) {
            ${$cuiPairsRef}{$key} =
                $association->_calculateAssociation_fromObservedCounts(
                    $n11, ${$n1pRef}{$cui1}, ${$np1Ref}{$cui2},
                    $npp, $measure);
        }
    }

    #always hand back the updated hash so both code paths return the
    # same thing
    return $cuiPairsRef;
}
# gets NP1, N1P, and NPP for all CUIs. This is used in time-
# slicing and makes it much faster than getting stats individually
# for each starting term
# input: $matrixRef <- ref to the co-occurrence matrix (the sparse matrix
# of n11 values)
# output: \@vals <- an array ref of three values:
# \%n1p - a hash ref where the key is a cui and value is n1p
# \%np1 - a hash ref where the key is a cui and value is np1
# $npp - a scalar of npp
sub getAllStats {
my $matrixRef = shift;
#get all np1, n1p, and npp values of values for each cui
my %np1 = ();
my %n1p = ();
my $npp = 0;
( run in 2.033 seconds using v1.01-cache-2.11-cpan-df04353d9ac )