ALBD
view release on metacpan or search on metacpan
lib/LiteratureBasedDiscovery/Rank.pm view on Meta::CPAN
#assume calculation cannot be made
$score{$cTerm} = -1;
#only calculate if np1 > 0
if ($np1{$cTerm} > 0) {
#get score
$score{$cTerm} = $association->_calculateAssociation_fromObservedCounts($n11{$cTerm}, $n1p, $np1{$cTerm}, $npp, $measure);
}
}
return \%score;
}
# scores each implicit CUI using an association measure. The score of a C term
# is the average, over its linking (B) terms, of the minimum of:
#   - the best A->B association score for that B term, and
#   - the B->C association score.
# B->C pairs whose score is -1 (could not be calculated), and B terms that
# have no valid (non -1) A->B score, do not contribute to the average.
# input:  $startingMatrixRef <- ref to the starting matrix
#         $explicitMatrixRef <- ref to the explicit matrix
#         $implicitMatrixRef <- ref to the implicit matrix
#         $measure <- the string of the umls association measure to use
#         $association <- an instance of umls association
#         $abScoresRef <- hashRef of the a to b scores used in AMW
#                         key is the a,b cui pair (e.g. hash{'C00,C11'})
#                         values are their score
#
#         Optional Input for passing in precalculated stats
#         so that they don't have to get recalculated each time
#         such as in timeslicing
#         $n1pRef <- hashRef where key is a cui, value is n1p
#         $np1Ref <- hashRef where key is a cui, value is np1
#         $npp <- scalar = value of npp
# output: a hash ref of scores for each implicit key. (hash{cui} = score)
#         C terms with no usable B term are absent from the hash.
sub scoreImplicit_averageMinimumWeight {
    #grab input
    my $startingMatrixRef = shift;
    my $explicitMatrixRef = shift;
    my $implicitMatrixRef = shift;
    my $measure = shift;
    my $association = shift;
    my $abScoresRef = shift;

    #optionally pass in stats so they don't get recalculated for
    # multiple terms (such as with time slicing)
    my $n1pRef = shift;
    my $np1Ref = shift;
    my $npp = shift;

    #get all BC pairs (call it bcScores because it will hold the scores)
    my $bcScoresRef = &_getBCPairs($startingMatrixRef, $explicitMatrixRef, $implicitMatrixRef);

    #get cui pair scores
    # NOTE(review): the return value is ignored, so this presumably replaces
    # the values of %$bcScoresRef with association scores in place - confirm
    &getBatchAssociationScores(
        $bcScoresRef, $explicitMatrixRef, $measure, $association,
        $n1pRef, $np1Ref, $npp);

    #find the max a->b score (since there can be multiple a terms)
    # only valid scores are stored, so -1 never appears in %maxABScores
    my %maxABScores = ();
    foreach my $pairKey (keys %{$abScoresRef}) {
        my $score = ${$abScoresRef}{$pairKey};

        #only compute for associations that exist
        next if ($score == -1);

        #second value is b term
        my ($aTerm, $bTerm) = split(/,/, $pairKey);
        if (!exists $maxABScores{$bTerm} || $score > $maxABScores{$bTerm}) {
            $maxABScores{$bTerm} = $score;
        }
    }

    # Find the average minimum weight (cScores) for each c term
    #  average of minimum a->b score and b->c score
    my %cScores = ();
    my %counts = ();

    #sum min scores
    foreach my $pairKey (keys %{$bcScoresRef}) {
        my $bcScore = ${$bcScoresRef}{$pairKey};

        #only compute for scores that exist
        next if ($bcScore == -1);

        #first is bTerm, second is cTerm
        my ($bTerm, $cTerm) = split(/,/, $pairKey);

        #check there is an AB value. A missing entry must be tested with
        # exists: comparing undef against -1 is always true, which would
        # count the B term with an A->B score of 0
        next if (!exists $maxABScores{$bTerm});

        #get the minimum between a->b and b->c
        my $min = $bcScore;
        if ($maxABScores{$bTerm} < $min) {
            $min = $maxABScores{$bTerm};
        }

        #increase the sum (automatically initialize to 0)
        $cScores{$cTerm} += $min;
        $counts{$cTerm}++;
    }

    #normalize by counts (every key of %cScores has a count of at least 1)
    foreach my $key (keys %cScores) {
        $cScores{$key} /= $counts{$key};
    }

    return \%cScores;
}
# scores each implicit CUI using linking term count, and AMW as a tie breaker
# input: $startingMatrixRef <- ref to the starting matrix
# $explicitMatrixRef <- ref to the explicit matrix
# $implicitMatrixRef <- ref to the implicit matrix
# $measure <- the string of the umls association measure to use
# $association <- an instance of umls association
# $abScoresRef <- hashRef of the a to b scores used in AMW
# key is the a,b cui pair (e.g. hash{'C00,C11'})
# values are their score
# Optional Input for passing in precalculated stats
# so that they don't have to get recalculated each time
# such as in timeslicing
# $n1pRef <- hashRef where key is a cui, value is n1p
# $np1Ref <- hashRef where key is a cui, value is np1
# $npp <- scalar = value of npp
# output: a hash ref of scores for each implicit key. (hash{cui} = score)
sub scoreImplicit_LTC_AMW {
#grab the input
my $startingMatrixRef = shift;
my $explicitMatrixRef = shift;
my $implicitMatrixRef = shift;
my $measure = shift;
my $association = shift;
my $abScoresRef = shift;
#optionally pass in stats so they don't get recalculated for
# multiple terms (such as with time slicing)
my $n1pRef = shift;
my $np1Ref = shift;
my $nppRef = shift;
#get linking term count scores
my $ltcAssociationsRef = &scoreImplicit_linkingTermCount($startingMatrixRef, $explicitMatrixRef, $implicitMatrixRef);
lib/LiteratureBasedDiscovery/Rank.pm view on Meta::CPAN
#set the score (maximum score seen for that C term)
my $score = -1;
if ($denom != 0) {
$score = $numerator/$denom;
}
if (exists $scores{$cKey}) {
if ($score > $scores{$cKey}) {
$scores{$cKey} = $score;
}
}
else {
$scores{$cKey} = $score;
}
}
return \%scores;
}
# gets a list of A->C pairs, and sets the value as the implicit matrix value.
# Every row key of the implicit matrix is treated as an A term and every
# column key of that row as a C term.
# input:  $startingMatrixRef <- ref to the starting matrix (not used by the
#                               body; kept so the signature is unchanged)
#         $implicitMatrixRef <- ref to the implicit matrix
#                               (hash{cuiA}{cuiC} = value)
# output: a hash ref where keys are comma separated cui pairs hash{'C000,C111'}
#         and values are set to the value at that index in the implicit matrix
sub _getACPairs {
    my $startingMatrixRef = shift;
    my $implicitMatrixRef = shift;

    #generate a list of ac pairs
    my %acPairs = ();
    foreach my $keyA (keys %{$implicitMatrixRef}) {
        my $rowRef = ${$implicitMatrixRef}{$keyA};

        #iterate over the keys only (a bare hash in list context would
        # also yield the values as if they were C terms)
        foreach my $keyC (keys %{$rowRef}) {
            #build the key explicitly: $hash{$a,$c} would join with $;
            # (\034) instead of the documented comma
            $acPairs{"$keyA,$keyC"} = ${$rowRef}{$keyC};
        }
    }
    return \%acPairs;
}
# Linking term count ranking: the score of an implicit CUI (C term) is the
# number of B->C pairs in which it appears as the C term.
# input:  $startingMatrixRef <- ref to the starting matrix
#         $explicitMatrixRef <- ref to the explicit matrix
#         $implicitMatrixRef <- ref to the implicit matrix
# output: a hash ref of scores for each implicit key. (hash{cui} = score)
sub scoreImplicit_linkingTermCount {
    my ($startingMatrixRef, $explicitMatrixRef, $implicitMatrixRef) = @_;

    #collect every B->C pair (keys are 'bTerm,cTerm')
    my $bcPairsRef = &_getBCPairs($startingMatrixRef, $explicitMatrixRef, $implicitMatrixRef);

    #tally how many pairs each C term takes part in
    my %linkingTermCountOf;
    for my $bcPair (keys %{$bcPairsRef}) {
        #the C term is the second field of the pair key
        my (undef, $cTerm) = split(/,/, $bcPair);

        #a missing entry starts from 0 when incremented
        $linkingTermCountOf{$cTerm}++;
    }

    return \%linkingTermCountOf;
}
# Frequency ranking: the score of an implicit CUI (C term) is the sum of the
# values of all B->C pairs in which it is the C term. A->B frequencies are
# NOT considered.
# input:  $startingMatrixRef <- ref to the starting matrix
#         $explicitMatrixRef <- ref to the explicit matrix
#         $implicitMatrixRef <- ref to the implicit matrix
# output: a hash ref of scores for each implicit key. (hash{cui} = score)
sub scoreImplicit_frequency {
    my ($startingMatrixRef, $explicitMatrixRef, $implicitMatrixRef) = @_;

    #collect every B->C pair (keys are 'bTerm,cTerm')
    my $bcPairsRef = &_getBCPairs($startingMatrixRef, $explicitMatrixRef, $implicitMatrixRef);

    #accumulate the pair values per C term
    my %summedFrequencyOf;
    for my $bcPair (keys %{$bcPairsRef}) {
        #the C term is the second field of the pair key
        my (undef, $cTerm) = split(/,/, $bcPair);

        #a missing entry starts from 0 when added to
        $summedFrequencyOf{$cTerm} += $bcPairsRef->{$bcPair};
    }

    return \%summedFrequencyOf;
}
# scores each implicit CUI using an association measure. Score is the maximum
# association between a column in the implicit matrix, and one of the start
# matrix terms (so max between any A and that C term).
# Score is calculated using the implicit matrix
# input: $startCuisRef <- ref to an array of start cuis (A terms)
# $implicitMatrixFileName <- fileName of the implicit matrix
# $measure <- the string of the umls association measure to use
# $association <- an instance of umls association
# output: a hash ref of scores for each implicit key. (hash{cui} = score)
sub scoreImplicit_fromImplicitMatrix {
#LBD Info
my $startCuisRef = shift;
my $implicitMatrixFileName = shift;
my $measure = shift;
my $association = shift;
######################################
#Get hashes for A and C terms
#####################################
#create a hash of starting terms
my %aTerms = ();
foreach my $cui (@{$startCuisRef}) {
$aTerms{$cui} = 1;
}
#get all the target terms (terms that co-occur with aTerms
# in the implicit matrix file = the implicit terms)
open IN, "$implicitMatrixFileName";
my %cTerms = ();
while (my $line = <IN>) {
$line =~ /(C\d{7})\s(C\d{7})/;
if (exists $aTerms{$1}) {
$cTerms{$2} = 1;
}
}
######################################
#Get Co-occurrence values, N11, N1P, NP1, NPP
######################################
#NPP is the total number of co-occurrences
#@NP1 is the number of co-occurrences of a C term with any term ... so sum of XXX\tCTerm\tVal for each cTerm
#@N1P is the number of co-occurrences of any A term ... so sum of anyATerm\tXXX\t
#N11{Cterm} is the sum of anyATerm\tCTerm\tVal
seek IN, 0,0; #reset to the beginning of the implicit file
#iterate over the lines of interest, and grab values
my %np1 = ();
my %n11 = ();
my $n1p = 0;
my $npp = 0;
my $matchedCuiB = 0;
my ($cuiA, $cuiB, $val);
( run in 2.599 seconds using v1.01-cache-2.11-cpan-5a3173703d6 )