ALBD
view release on metacpan or search on metacpan
lib/LiteratureBasedDiscovery/Rank.pm view on Meta::CPAN
$tiedAMWScores{$cuiPair} = ${$amwScoresRef}{$cuiPair};
}
#add the cui pairs by descending amw score
foreach my $cuiPair (sort {$tiedAMWScores{$b} <=> $tiedAMWScores{$a}} keys %tiedAMWScores) {
$ltcAMWScores{$cuiPair} = $currentRank;
$currentRank--;
}
}
#return the scores
return \%ltcAMWScores;
}
#TODO this is an untested method
# gets the max cosine distance score between all a terms and each cTerm
# input: $startingMatrixRef <- ref to the starting matrix
# $explicitMatrixRef <- ref to the explicit matrix
# $implicitMatrixRef <- ref to the implicit matrix
# output: a hash ref of scores for each implicit key. (hash{cui} = score)
sub score_cosineDistance {
#LBD Info
my $startingMatrixRef = shift;
my $explicitMatrixRef = shift;
my $implicitMatrixRef = shift;
#get all the A->C pairs
my $acPairsRef = &_getACPairs($startingMatrixRef, $implicitMatrixRef);
my %scores = ();
foreach my $pairKey (keys %{$acPairsRef}) {
#get the A and C keys
my ($aKey, $cKey) = split(/,/,$pairKey);
#grab the A and C explicit vectors
my $aVectorRef = ${$explicitMatrixRef}{$aKey};
my $cVectorRef = ${$explicitMatrixRef}{$cKey};
#find the numerator which is the sum of A[i]*C[i] values
my $numerator = 0;
foreach my $key (keys ${$aVectorRef}) {
if (exists ${$cVectorRef}{$key}) {
$numerator += ${$aVectorRef}{$key} * ${$cVectorRef}{$key};
}
}
#find the sum of A squared
my $aSum = 0;
foreach my $key (keys ${$aVectorRef}) {
$aSum += ($key*$key);
}
#find the sum of C squared
my $cSum = 0;
foreach my $key (keys ${$aVectorRef}) {
$cSum += ($key*$key);
}
#find the denominator, which is the product of A and C lengths
my $denom = sqrt($aSum)*sqrt($cSum);
#set the score (maximum score seen for that C term)
my $score = -1;
if ($denom != 0) {
$score = $numerator/$denom;
}
if (exists $scores{$cKey}) {
if ($score > $scores{$cKey}) {
$scores{$cKey} = $score;
}
}
else {
$scores{$cKey} = $score;
}
}
return \%scores;
}
# gets a list of A->C pairs, and sets the value as the implicit matrix value
# input: $startingMatrixRef <- ref to the starting matrix
# $implicitMatrixRef <- ref to the implicit matrix
# output: a hash ref where keys are comma seperated cui pairs hash{'C000,C111'}
# and values are set to the value at that index in the implicit matrix
sub _getACPairs {
my $startingMatrixRef = shift;
my $implicitMatrixRef = shift;
#generate a list of ac pairs
my %acPairs = ();
foreach my $keyA (keys %{$implicitMatrixRef}) {
foreach my $keyC (%{${$implicitMatrixRef}{$keyA}}) {
$acPairs{$keyA,$keyC} = ${${$implicitMatrixRef}{$keyA}}{$keyC};
}
}
return \%acPairs;
}
# scores each implicit CUI based on the number of linking terms between
# it and all starting terms.
# input: $startingMatrixRef <- ref to the starting matrix
# $explicitMatrixRef <- ref to the explicit matrix
# $implicitMatrixRef <- ref to the implicit matrix
# output: a hash ref of scores for each implicit key. (hash{cui} = score)
sub scoreImplicit_linkingTermCount {
#LBD Info
my $startingMatrixRef = shift;
my $explicitMatrixRef = shift;
my $implicitMatrixRef = shift;
#get all bc pairs
my $bcPairsRef = &_getBCPairs($startingMatrixRef, $explicitMatrixRef, $implicitMatrixRef);
# Find the linking term count for each cTerm
my %scores = ();
my ($key1, $key2);
foreach my $pairKey (keys %{$bcPairsRef}) {
#cTerm is the second value ($key2)
($key1, $key2) = split(/,/,$pairKey);
#automatically initializes to 0
$scores{$key2}++;
}
return \%scores;
}
# scores each implicit CUI based on the summed frequency of co-occurrence
# between it and all B terms (A->B frequencies are NOT considered)
# input: $startingMatrixRef <- ref to the starting matrix
# $explicitMatrixRef <- ref to the explicit matrix
# $implicitMatrixRef <- ref to the implicit matrix
# output: a hash ref of scores for each implicit key. (hash{cui} = score)
sub scoreImplicit_frequency {
#LBD Info
my $startingMatrixRef = shift;
my $explicitMatrixRef = shift;
my $implicitMatrixRef = shift;
#get all bc pairs
my $bcPairsRef = &_getBCPairs($startingMatrixRef, $explicitMatrixRef, $implicitMatrixRef);
# Find the frequency count for each cTerm
my %scores = ();
my ($key1, $key2);
foreach my $pairKey (keys %{$bcPairsRef}) {
#cTerm is the second value ($key2)
($key1, $key2) = split(/,/,$pairKey);
#automatically initializes to 0 (with +=)
$scores{$key2} += ${$bcPairsRef}{$pairKey};
}
return \%scores;
}
# scores each implicit CUI using an assocation measure. Score is the maximum
# association between a column in the implicit matrix, and one of the start
# matrix terms (so max between any A and that C term).
# Score is calculated using the implicit matrix
# input: $startCuisRef <- ref to an array of start cuis (A terms)
# $implicitMatrixFileName <- fileName of the implicit matrix
# $measure <- the string of the umls association measure to use
# $association <- an instance of umls association
# output: a hash ref of scores for each implicit key. (hash{cui} = score)
sub scoreImplicit_fromImplicitMatrix {
#LBD Info
my $startCuisRef = shift;
my $implicitMatrixFileName = shift;
my $measure = shift;
my $association = shift;
######################################
#Get hashes for A and C terms
#####################################
#create a hash of starting terms
my %aTerms = ();
foreach my $cui (@{$startCuisRef}) {
$aTerms{$cui} = 1;
}
#get all the target terms (terms that co-occur with aTerms
# in the implicit matrix file = the implicit terms)
open IN, "$implicitMatrixFileName";
my %cTerms = ();
while (my $line = <IN>) {
$line =~ /(C\d{7})\s(C\d{7})/;
if (exists $aTerms{$1}) {
$cTerms{$2} = 1;
}
}
######################################
#Get Co-occurrence values, N11, N1P, NP1, NPP
######################################
#NPP is the number of Co-occurreces total
#@NP1 is the number of co-occurrences of a C term with any term ... so sum of XXX\tCTerm\tVal for each cTerm
#@N1P is the number of co-occurrences of any A term ... so sum of anyATerm\tXXX\t
#N11{Cterm} is the sum of anyATerm\tCTerm\tVal
seek IN, 0,0; #reset to the beginning of the implicit file
#iterate over the lines of interest, and grab values
my %np1 = ();
my %n11 = ();
my $n1p = 0;
my $npp = 0;
my $matchedCuiB = 0;
my ($cuiA, $cuiB, $val);
while (my $line = <IN>) {
#grab data from the line
($cuiA, $cuiB, $val) = split(/\t/,$line);
#see if updates are necessary
if (exists $aTerms{$cuiA} || exists $cTerms{$cuiB}) {
#update npp
$npp += $3;
#update np1
if (exists $cTerms{$cuiB}) {
$np1{$cuiB} += $val;
$matchedCuiB = 1;
}
#update n1p
if (exists $aTerms{$cuiA}) {
$n1p += $val;
#update n11 if needed
if ($matchedCuiB) {
$n11{$cuiB} += $val;
$matchedCuiB = 0;
}
}
}
}
######################################
# Calculate Association for each c term
######################################
my %associationScores = ();
foreach my $cTerm(keys %cTerms) {
$associationScores{$cTerm} =
$association->_calculateAssociation_fromObservedCounts($n11{$cTerm}, $n1p, $np1{$cTerm}, $npp, $measure);
}
return \%associationScores;
}
# scores each implicit CUI using an assocation measure. Score is the maximum
# association between any of the linking terms.
# input: $startingMatrixRef <- ref to the starting matrix
# $explicitMatrixRef <- ref to the explicit matrix
# $implicitMatrixRef <- ref to the implicit matrix
# $measure <- the string of the umls association measure to use
# $association <- an instance of umls association
# output: a hash ref of scores for each implicit key. (hash{cui} = score)
sub scoreImplicit_fromAllPairs {
#LBD Info
lib/LiteratureBasedDiscovery/Rank.pm view on Meta::CPAN
#update max for this implicit cui or create if needed
if (!exists $scores{$key2}) {
$scores{$key2} = ${$bcPairsRef}{$pairKey};
}
elsif (${$bcPairsRef}{$pairKey} > $scores{$key2}) {
$scores{$key2} = ${$bcPairsRef}{$pairKey}
}
}
}
return \%scores;
}
sub scoreImplicit_minimumWeightAssociation {
}
#XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
#XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
# Builds a list of B->C term pairs that also co-occurr with A terms
# Only adds B->C term pairs for C terms that are also present in the
# implicitMatrix.
# The value of the bcPairs Hash is the value in the explicit matrix
# for that pair.
# input: $startingMatrixRef <- ref to the starting matrix
# $explicitMatrixRef <- ref to the explicit matrix
# $implicitMatrixRef <- ref to the implicit matrix
# output: a hash ref of BC term pairs. Each key is "$bTerm,$cTerm",
# value is by default the frequency of BC co-occurrences in the
# matrix
sub _getBCPairs {
my $startingMatrixRef = shift;
my $explicitMatrixRef = shift;
my $implicitMatrixRef = shift;
#get all bTerms
my %bTerms = ();
my $rowRef;
foreach my $rowKey (keys %{$startingMatrixRef}) {
$rowRef = ${$startingMatrixRef}{$rowKey};
foreach my $colKey (keys %{$rowRef}) {
$bTerms{$colKey} = 1;
}
}
#get all the cTerms (unique column values in the implicit matrix)
my %cTerms = ();
foreach my $rowKey(keys %{$implicitMatrixRef}) {
$rowRef = ${$implicitMatrixRef}{$rowKey};
foreach my $colKey (keys %{$rowRef}) {
$cTerms{$colKey} = 1;
}
}
#get all bc pairs, set value to be the frequency of co-occurrence
my %bcPairs = ();
foreach my $bTerm(keys %bTerms) {
$rowRef = ${$explicitMatrixRef}{$bTerm};
if ($rowRef) {
foreach my $cTerm(keys %{$rowRef}) {
if (exists $cTerms{$cTerm}) {
#add because this a->b->c term (%cTerms) is also a b->c term
$bcPairs{"$bTerm,$cTerm"} = ${$rowRef}{$cTerm};
}
}
}
}
return \%bcPairs;
}
# ranks the scores in descending order
# input: $scoresRef <- a hash ref to a hash of cuis and scores (hash{cui} = score)
# output: an array ref of the ranked cuis in descending order
sub rankDescending {
#grab the input
my $scoresRef = shift;
#order in descending order, and use the CUI string as a tiebreaker
my @rankedCuis = ();
my @tiedCuis = ();
my $currentScore = -1;
foreach my $cui (
#sort function to sort by value
sort {${$scoresRef}{$b} <=> ${$scoresRef}{$a}}
keys %{$scoresRef}) {
#see if this cui is tied with previuos
if (${$scoresRef}{$cui} != $currentScore) {
#this cui is not tied with previuos,
# so save all previuos ones to the ranked array
# Here, we sort by key name, so the tie breaker
# is the cui name itself. This is arbitrary but
# allows for results to be precisely replicated.
# UPDATE: Almost precisely replicated. There is
# a numerical stability problem so that the sort
# by value will chunk out differently depending
# on the run. So one run something with a values of
# 0.66666666666667 will be sorted above another item
# with that same value, the next run sorted with it.
# this is essentially unavoidable without implementing
# a tolerance threshold which seems like overkill
foreach my $tiedCui (sort @tiedCuis) {
push @rankedCuis, $tiedCui;
}
#clear the list of tied CUIs
@tiedCuis = ();
}
#add current CUI to the tied CUI list and update the
# current score
$currentScore = ${$scoresRef}{$cui};
push @tiedCuis, $cui;
}
#add any remaining tied cuis to the final list
foreach my $cui (sort @tiedCuis) {
push @rankedCuis, $cui;
}
#return the ranked cuis
return \@rankedCuis;
}
#XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
#XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
# gets association scores for a set of cui pairs
# input: $cuiPairsRef <- reference to a hash of pairs of matrix indeces (key = '1,2')
# $matrixRef <- a reference to a sparse matrix of n11 values
# $measure <- the association measure to perform
# $association <- an instance of UMLS::Association
# output: none, bu the cuiPairs ref has values updated to reflect the
# computed assocation score
sub getBatchAssociationScores {
my $cuiPairsRef = shift;
my $matrixRef = shift;
my $measure = shift;
my $association = shift;
#optionally pass in $n1pRef, $np1Ref, and $npp
# do this if they get calculated multiple times
# (such as with time slicing)
my $n1pRef = shift;
my $np1Ref = shift;
my $npp = shift;
#if the measure is frequency, you only need to return
# the cuiPairs ref which already holds CUI frequencies
if ($measure eq 'freq') {
return $cuiPairsRef;
}
#calculate stats if needed
if (!defined $n1pRef || !defined $np1Ref || !defined $npp) {
($n1pRef, $np1Ref, $npp) = &getAllStats($matrixRef);
}
#get association scores for each CUI pair
my ($n11, $cui1, $cui2);
foreach my $key (keys %{$cuiPairsRef}) {
#get the cui indeces
($cui1, $cui2) = split(/,/,$key);
#assume calculation cannot be made
${$cuiPairsRef}{$key} = -1;
#get n11
$n11 = ${${$matrixRef}{$cui1}}{$cui2};
#get association if possible (only possible if the terms have co-occurred)
if (defined $n11) {
${$cuiPairsRef}{$key} = $association->_calculateAssociation_fromObservedCounts($n11, ${$n1pRef}{$cui1}, ${$np1Ref}{$cui2}, $npp, $measure);
}
}
}
# gets NP1, N1P, and NPP for all CUIs. This is used in time-
# slicing and makes it much faster than getting stats individually
# for each starting term
# input: $matrixRef <- ref to the co-occurrence matrix (the sparse matrix
# of n11 values)
# output: \@vals <- an array ref of three values:
# \%n1p - a hash ref where the key is a cui and value is n1p
# \%np1 - a hash ref where the key is a cui and value is np1
# $npp - a scalar of npp
sub getAllStats {
my $matrixRef = shift;
( run in 1.421 second using v1.01-cache-2.11-cpan-8f98c5d2c55 )