ALBD

 view release on metacpan or  search on metacpan

lib/LiteratureBasedDiscovery/Rank.pm  view on Meta::CPAN

	#assume calculation cannot be made
	$score{$cTerm} = -1;
	
	#only calculate if np1 > 0
	if ($np1{$cTerm} > 0) {
	    #get score
	    $score{$cTerm} = $association->_calculateAssociation_fromObservedCounts($n11{$cTerm}, $n1p, $np1{$cTerm}, $npp, $measure);
	}
    }
    
    return \%score;
}


# Scores each implicit CUI (C term) using an association measure.
# For every B->C pair, take the minimum of the best A->B score for that B
# term and the B->C score; a C term's score is the average of those minimums
# over all of its usable B terms (average minimum weight, AMW).
# B->C pairs whose score is -1, and B terms that have no A->B score other
# than -1, are skipped. C terms with no usable B term do not appear in the
# output at all.
# input:  $startingMatrixRef <- ref to the starting matrix
#         $explicitMatrixRef <- ref to the explicit matrix
#         $implicitMatrixRef <- ref to the implicit matrix
#         $measure <- the string of the umls association measure to use
#         $association <- an instance of umls association
#         $abScoresRef <- hashRef of the a to b scores used in AMW
#                         key is the a,b cui pair (e.g. hash{'C00,C11'})
#                         values are their score (-1 = no score available)
#
#         Optional input for passing in precalculated stats
#         so that they don't have to get recalculated each time,
#         such as in timeslicing
#         $n1pRef <- hashRef where key is a cui, value is n1p
#         $np1Ref <- hashRef where key is a cui, value is np1
#         $npp <- scalar = value of npp
# output: a hash ref of scores for each implicit key. (hash{cui} = score)
sub scoreImplicit_averageMinimumWeight {
    #grab input
    my $startingMatrixRef = shift;
    my $explicitMatrixRef = shift;
    my $implicitMatrixRef = shift;
    my $measure = shift;
    my $association = shift;
    my $abScoresRef = shift;

    #optionally pass in stats so they don't get recalculated for
    # multiple terms (such as with time slicing)
    my $n1pRef = shift;
    my $np1Ref = shift;
    my $npp = shift;

    #get all BC pairs (call it bcScores because it will hold the scores)
    my $bcScoresRef = &_getBCPairs($startingMatrixRef, $explicitMatrixRef, $implicitMatrixRef);

    #get cui pair scores. The return value is not used; the scores are
    # read back out of $bcScoresRef below.
    &getBatchAssociationScores(
        $bcScoresRef, $explicitMatrixRef, $measure, $association,
        $n1pRef, $np1Ref, $npp);

    #find the max a->b score for each b term (there can be multiple a terms)
    my %maxABScores = ();
    foreach my $pairKey (keys %{$abScoresRef}) {
        #second value is the b term
        my (undef, $bTerm) = split(/,/, $pairKey);
        my $score = ${$abScoresRef}{$pairKey};

        #only keep associations that exist, so %maxABScores never holds -1
        next if ($score == -1);

        if (!exists $maxABScores{$bTerm} || $score > $maxABScores{$bTerm}) {
            $maxABScores{$bTerm} = $score;
        }
    }

    # Find the average minimum weight (cScores) for each c term:
    # average of min(a->b score, b->c score) over its b terms
    my %cScores = ();
    my %counts = ();
    foreach my $pairKey (keys %{$bcScoresRef}) {
        #only compute for b->c scores that exist
        my $bcScore = ${$bcScoresRef}{$pairKey};
        next if ($bcScore == -1);

        #first is bTerm, second is cTerm
        my ($bTerm, $cTerm) = split(/,/, $pairKey);

        #check there is an AB value. A missing entry (not a -1 value) is
        # what marks "no a->b score", so this has to be an exists test;
        # comparing the undef to -1 would let the pair through as a 0.
        next if (!exists $maxABScores{$bTerm});

        #get the minimum between a->b and b->c
        my $min = $bcScore;
        if ($maxABScores{$bTerm} < $min) {
            $min = $maxABScores{$bTerm};
        }

        #increase the sum (automatically initialized to 0)
        $cScores{$cTerm} += $min;
        $counts{$cTerm}++;
    }

    #normalize by counts (every key in %cScores has a count of at least 1)
    foreach my $cTerm (keys %cScores) {
        $cScores{$cTerm} /= $counts{$cTerm};
    }

    return \%cScores;
}


# scores each implicit CUI using linking term count, and AMW as a tie breaker
# input:  $startingMatrixRef <- ref to the starting matrix
#         $explicitMatrixRef <- ref to the explicit matrix
#         $implicitMatrixRef <- ref to the implicit matrix
#         $measure <- the string of the umls association measure to use
#         $association <- an instance of umls association
#         $abScoresRef <- hashRef of the a to b scores used in AMW
#                         key is the a,b cui pair (e.g. hash{'C00,C11'})
#                         values are their score
#         Optional Input for passing in precalculated stats
#         so that they don't have to get recalculated each time
#         such as in timeslicing
#         $n1pRef <- hashRef where key is a cui, value is n1p
#         $np1Ref <- hashRef where key is a cui, value is np1
#         $npp <- scalar = value of npp
# output: a hash ref of scores for each implicit key. (hash{cui} = score)
sub scoreImplicit_LTC_AMW {
    #grab the input
    my $startingMatrixRef = shift;
    my $explicitMatrixRef = shift;
    my $implicitMatrixRef = shift;
    my $measure = shift;
    my $association = shift;
    my $abScoresRef = shift;

    #optionally pass in stats so they don't get recalculated for
    # multiple terms (such as with time slicing)
    my $n1pRef = shift;
    my $np1Ref = shift;
    my $nppRef = shift;

    #get linking term count scores
    my $ltcAssociationsRef = &scoreImplicit_linkingTermCount($startingMatrixRef, $explicitMatrixRef, $implicitMatrixRef);

lib/LiteratureBasedDiscovery/Rank.pm  view on Meta::CPAN


	#set the score (maximum score seen for that C term)
	my $score = -1;
	if ($denom != 0) {
	    $score = $numerator/$denom;
	}
	if (exists $scores{$cKey}) {
	    if ($score > $scores{$cKey}) {
		$scores{$cKey} = $score;
	    }
	}
	else {
	    $scores{$cKey} = $score;
	}	
    }
    
    return \%scores;
}

# Gets a list of A->C pairs, and sets each value to the implicit matrix value.
# Every (row key, column key) entry of the implicit matrix becomes one pair.
# input:  $startingMatrixRef <- ref to the starting matrix (accepted for
#                               interface compatibility; not read here)
#         $implicitMatrixRef <- ref to the implicit matrix
#                               (hash of hashes: {$keyA}{$keyC} = value)
# output: a hash ref where keys are comma separated cui pairs hash{'C000,C111'}
#         and values are set to the value at that index in the implicit matrix
sub _getACPairs {
    my $startingMatrixRef = shift;
    my $implicitMatrixRef = shift;

    #generate a list of ac pairs
    my %acPairs = ();
    foreach my $keyA (keys %{$implicitMatrixRef}) {
        my $rowRef = ${$implicitMatrixRef}{$keyA};

        #iterate over the keys only; flattening the hash itself would also
        # visit the values and create bogus "cui,value" pairs
        foreach my $keyC (keys %{$rowRef}) {
            #build the key with a literal comma. $hash{$a,$c} would join
            # with $; ("\034"), which split(/,/, ...) cannot take apart
            $acPairs{"$keyA,$keyC"} = ${$rowRef}{$keyC};
        }
    }

    return \%acPairs;
}


# Linking term count (LTC) ranking: each implicit CUI (C term) is scored by
# how many B,C pair keys returned by _getBCPairs have it as their C term.
# input:  $startingMatrixRef <- ref to the starting matrix
#         $explicitMatrixRef <- ref to the explicit matrix
#         $implicitMatrixRef <- ref to the implicit matrix
# output: a hash ref of scores for each implicit key. (hash{cui} = score)
sub scoreImplicit_linkingTermCount {
    my ($startingMatrixRef, $explicitMatrixRef, $implicitMatrixRef) = @_;

    #collect every b,c pair; only the keys matter for this score
    my $bcPairsRef = &_getBCPairs($startingMatrixRef, $explicitMatrixRef, $implicitMatrixRef);

    #tally one linking term per pair, bucketed by c term
    my %linkCountOf = ();
    for my $bcKey (keys %{$bcPairsRef}) {
        #keys look like 'bTerm,cTerm'; the b term itself is not needed
        my (undef, $cTerm) = split(/,/, $bcKey);

        #a missing entry starts from 0
        $linkCountOf{$cTerm}++;
    }

    return \%linkCountOf;
}


# Frequency ranking: each implicit CUI (C term) is scored by the sum of the
# values stored for every B,C pair returned by _getBCPairs that has it as
# the C term (A->B frequencies are NOT considered).
# input:  $startingMatrixRef <- ref to the starting matrix
#         $explicitMatrixRef <- ref to the explicit matrix
#         $implicitMatrixRef <- ref to the implicit matrix
# output: a hash ref of scores for each implicit key. (hash{cui} = score)
sub scoreImplicit_frequency {
    my ($startingMatrixRef, $explicitMatrixRef, $implicitMatrixRef) = @_;

    #collect every b,c pair together with its stored value
    my $bcPairsRef = &_getBCPairs($startingMatrixRef, $explicitMatrixRef, $implicitMatrixRef);

    #add each pair's value into the bucket of its c term
    my %freqSumOf = ();
    for my $bcKey (keys %{$bcPairsRef}) {
        #keys look like 'bTerm,cTerm'; the b term itself is not needed
        my (undef, $cTerm) = split(/,/, $bcKey);

        #a missing entry starts from 0
        $freqSumOf{$cTerm} += $bcPairsRef->{$bcKey};
    }

    return \%freqSumOf;
}

# scores each implicit CUI using an association measure. Score is the maximum
# association between a column in the implicit matrix, and one of the start 
# matrix terms (so max between any A and that C term). 
# Score is calculated using the implicit matrix
# input:  $startCuisRef <- ref to an array of start cuis (A terms)
#         $implicitMatrixFileName <- fileName of the implicit matrix
#         $measure <- the string of the umls association measure to use
#         $association <- an instance of umls association
# output: a hash ref of scores for each implicit key. (hash{cui} = score)
sub scoreImplicit_fromImplicitMatrix {
    #LBD Info
    my $startCuisRef = shift;
    my $implicitMatrixFileName = shift;
    my $measure = shift;
    my $association = shift;

######################################
    #Get hashes for A and C terms
#####################################
    #create a hash of starting terms
    my %aTerms = ();
    foreach my $cui (@{$startCuisRef}) {
	$aTerms{$cui} = 1;
    }

    #get all the target terms (terms that co-occur with aTerms 
    # in the implicit matrix file = the implicit terms)
    open IN, "$implicitMatrixFileName";
    my %cTerms = ();
    while (my $line = <IN>) {
	$line =~ /(C\d{7})\s(C\d{7})/;
	if (exists $aTerms{$1}) {
	    $cTerms{$2} = 1;
	}
    }

######################################
    #Get Co-occurrence values, N11, N1P, NP1, NPP
######################################
    #NPP is the total number of co-occurrences
    #@NP1 is the number of co-occurrences of a C term with any term ... so sum of XXX\tCTerm\tVal for each cTerm
    #@N1P is the number of co-occurrences of any A term ... so sum of anyATerm\tXXX\t
    #N11{Cterm} is the sum of anyATerm\tCTerm\tVal
    seek IN, 0,0; #reset to the beginning of the implicit file

    #iterate over the lines of interest, and grab values
    my %np1 = ();
    my %n11 = ();
    my $n1p = 0;
    my $npp = 0;
    my $matchedCuiB = 0;
    my ($cuiA, $cuiB, $val);



( run in 2.599 seconds using v1.01-cache-2.11-cpan-5a3173703d6 )