ALBD

 view release on metacpan or  search on metacpan

lib/LiteratureBasedDiscovery/Rank.pm  view on Meta::CPAN

# ranks cuis from the highest score to the lowest
# input:  $scoresRef <- a hash ref to a hash of cuis and scores (hash{cui} = score)
# output: an array ref of the cuis in descending score order; cuis whose
#         scores compare numerically equal are ordered by ascending CUI string
sub rankDescending {
    my $scoresRef = shift;

    # Primary key: the score, compared numerically, largest first.
    # Secondary key: the CUI string itself, smallest first. The tiebreaker
    # is arbitrary, but it makes the order of equally scored CUIs
    # independent of hash iteration order, so a given set of scores always
    # produces the same ranking.
    # Only scores that compare exactly equal are treated as ties; no
    # tolerance is applied, so two floating point scores that differ in
    # their last digits are ranked by value rather than by CUI.
    my @orderedCuis = sort {
        ${$scoresRef}{$b} <=> ${$scoresRef}{$a}
            || $a cmp $b
    } keys %{$scoresRef};

    return \@orderedCuis;
}


#XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
#XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX

# gets association scores for a set of cui pairs
# input:  $cuiPairsRef <- reference to a hash of pairs of matrix indices (key = '1,2')
#         $matrixRef <- a reference to a sparse matrix of n11 values
#                       (matrix{cui1}{cui2} = n11)
#         $measure <- the association measure to perform
#         $association <- an instance of UMLS::Association
#         $n1pRef <- (optional) hash ref of n1p values, looked up by the first
#                    cui of each pair
#         $np1Ref <- (optional) hash ref of np1 values, looked up by the second
#                    cui of each pair
#         $npp <- (optional) the npp value
#         If any of the three optional values is missing, all three are
#         computed from $matrixRef with getAllStats.
# output: $cuiPairsRef. The hash it points to is updated in place: each value
#         becomes the computed association score, or -1 if the pair has no
#         n11 entry in the matrix. If $measure is 'freq' the hash is left
#         untouched. The matrix itself is never modified.
sub getBatchAssociationScores {
    my $cuiPairsRef = shift;
    my $matrixRef = shift;
    my $measure = shift;
    my $association = shift;

    #optionally pass in $n1pRef, $np1Ref, and $npp
    # do this if they get calculated multiple times
    # (such as with time slicing)
    my $n1pRef = shift;
    my $np1Ref = shift;
    my $npp = shift;

    #if the measure is frequency, you only need to return
    # the cuiPairs ref which already holds CUI frequencies
    if ($measure eq 'freq') {
        return $cuiPairsRef;
    }

    #calculate stats if needed
    if (!defined $n1pRef || !defined $np1Ref || !defined $npp) {
        ($n1pRef, $np1Ref, $npp) = getAllStats($matrixRef);
    }

    #get association scores for each CUI pair
    foreach my $key (keys %{$cuiPairsRef}) {
        #get the cui indices
        my ($cui1, $cui2) = split(/,/, $key);

        #assume calculation cannot be made
        ${$cuiPairsRef}{$key} = -1;

        #fetch the row first and only then look inside it. Reading
        # matrix{cui1}{cui2} in a single expression would autovivify an
        # empty row for every cui1 that is missing from the matrix, which
        # silently adds keys to the caller's co-occurrence matrix.
        my $rowRef = ${$matrixRef}{$cui1};
        next if !defined $rowRef;

        #get n11 (only defined if the terms have co-occurred)
        my $n11 = ${$rowRef}{$cui2};
        next if !defined $n11;

        ${$cuiPairsRef}{$key} = $association->_calculateAssociation_fromObservedCounts(
            $n11, ${$n1pRef}{$cui1}, ${$np1Ref}{$cui2}, $npp, $measure);
    }

    #return the same ref on every path, matching the 'freq' branch
    return $cuiPairsRef;
}

# gets NP1, N1P, and NPP for all CUIs. This is used in time-
# slicing and makes it much faster than getting stats individually
# for each starting term
# input:  $matrixRef <- ref to the co-occurrence matrix (the sparse matrix 
#                       of n11 values)
# output: \@vals <- an array ref of three values:
#                   \%n1p - a hash ref where the key is a cui and value is n1p
#                   \%np1 - a hash ref where the key is a cui and value is np1
#                   $npp - a scalar of npp
sub getAllStats {
    my $matrixRef = shift;

    #get all np1, n1p, and npp values of values for each cui
    my %np1 = ();
    my %n1p = ();
    my $npp = 0;



( run in 2.033 seconds using v1.01-cache-2.11-cpan-df04353d9ac )