ALBD

 view release on metacpan or  search on metacpan

lib/LiteratureBasedDiscovery/Rank.pm  view on Meta::CPAN

	    $tiedAMWScores{$cuiPair} = ${$amwScoresRef}{$cuiPair};
	}

	#add the cui pairs by descending amw score
	foreach my $cuiPair (sort {$tiedAMWScores{$b} <=> $tiedAMWScores{$a}} keys %tiedAMWScores) {
	    $ltcAMWScores{$cuiPair} = $currentRank;
	    $currentRank--;
	}
    }

    #return the scores
    return \%ltcAMWScores;
}

#TODO this is an untested method
# gets the max cosine distance score between all a terms and each cTerm 
# input:  $startingMatrixRef <- ref to the starting matrix
#         $explicitMatrixRef <- ref to the explicit matrix
#         $implicitMatrixRef <- ref to the implicit matrix
# output: a hash ref of scores for each implicit key. (hash{cui} = score)
sub score_cosineDistance {
    #LBD Info
    my $startingMatrixRef = shift;
    my $explicitMatrixRef = shift;
    my $implicitMatrixRef = shift;

    #get all the A->C pairs
    my $acPairsRef = &_getACPairs($startingMatrixRef, $implicitMatrixRef);
    my %scores = ();
    foreach my $pairKey (keys %{$acPairsRef}) {
	#get the A and C keys
	my ($aKey, $cKey) = split(/,/,$pairKey);

	#grab the A and C explicit vectors
	my $aVectorRef = ${$explicitMatrixRef}{$aKey};
	my $cVectorRef = ${$explicitMatrixRef}{$cKey};

	#find the numerator which is the sum of A[i]*C[i] values
	my $numerator = 0;
	foreach my $key (keys ${$aVectorRef}) {
	    if (exists ${$cVectorRef}{$key}) {
		$numerator += ${$aVectorRef}{$key} * ${$cVectorRef}{$key};
	    }
	}

	#find the sum of A squared
	my $aSum = 0;
	foreach my $key (keys ${$aVectorRef}) {
	    $aSum += ($key*$key);
	}

	#find the sum of C squared
	my $cSum = 0;
	foreach my $key (keys ${$aVectorRef}) {
	    $cSum += ($key*$key);
	}

	#find the denominator, which is the product of A and C lengths
	my $denom = sqrt($aSum)*sqrt($cSum);

	#set the score (maximum score seen for that C term)
	my $score = -1;
	if ($denom != 0) {
	    $score = $numerator/$denom;
	}
	if (exists $scores{$cKey}) {
	    if ($score > $scores{$cKey}) {
		$scores{$cKey} = $score;
	    }
	}
	else {
	    $scores{$cKey} = $score;
	}	
    }
    
    return \%scores;
}

# gets a list of A->C pairs, and sets the value as the implicit matrix value
# input:  $startingMatrixRef <- ref to the starting matrix
#         $implicitMatrixRef <- ref to the implicit matrix
# output: a hash ref where keys are comma seperated cui pairs hash{'C000,C111'}
#         and values are set to the value at that index in the implicit matrix
sub _getACPairs {
    my $startingMatrixRef = shift;
    my $implicitMatrixRef = shift;

    #generate a list of ac pairs
    my %acPairs = ();
    foreach my $keyA (keys %{$implicitMatrixRef}) {
	foreach my $keyC (%{${$implicitMatrixRef}{$keyA}}) {
	    $acPairs{$keyA,$keyC} = ${${$implicitMatrixRef}{$keyA}}{$keyC};
	}
    }
    
    return \%acPairs;

}


# scores each implicit CUI based on the number of linking terms between
# it and all starting terms.
# input:  $startingMatrixRef <- ref to the starting matrix
#         $explicitMatrixRef <- ref to the explicit matrix
#         $implicitMatrixRef <- ref to the implicit matrix
# output: a hash ref of scores for each implicit key. (hash{cui} = score)
sub scoreImplicit_linkingTermCount {
    #LBD Info
    my $startingMatrixRef = shift;
    my $explicitMatrixRef = shift;
    my $implicitMatrixRef = shift;

    #get all bc pairs
    my $bcPairsRef = &_getBCPairs($startingMatrixRef, $explicitMatrixRef, $implicitMatrixRef);

    # Find the linking term count for each cTerm
    my %scores = ();
    my ($key1, $key2);
    foreach my $pairKey (keys %{$bcPairsRef}) {
	#cTerm is the second value ($key2)
	($key1, $key2) = split(/,/,$pairKey);

	#automatically initializes to 0
	$scores{$key2}++;
    }
    return \%scores;
}


# scores each implicit CUI based on the summed frequency of co-occurrence
# between it and all B terms (A->B frequencies are NOT considered)
# input:  $startingMatrixRef <- ref to the starting matrix
#         $explicitMatrixRef <- ref to the explicit matrix
#         $implicitMatrixRef <- ref to the implicit matrix
# output: a hash ref of scores for each implicit key. (hash{cui} = score)
sub scoreImplicit_frequency {
    #LBD Info
    my $startingMatrixRef = shift;
    my $explicitMatrixRef = shift;
    my $implicitMatrixRef = shift;

    #get all bc pairs
    my $bcPairsRef = &_getBCPairs($startingMatrixRef, $explicitMatrixRef, $implicitMatrixRef);

    # Find the frequency count for each cTerm
    my %scores = ();
    my ($key1, $key2);
    foreach my $pairKey (keys %{$bcPairsRef}) {
	#cTerm is the second value ($key2)
	($key1, $key2) = split(/,/,$pairKey);

	#automatically initializes to 0 (with +=)
	$scores{$key2} += ${$bcPairsRef}{$pairKey};
    }
    return \%scores;
}

# scores each implicit CUI using an assocation measure. Score is the maximum 
# association between a column in the implicit matrix, and one of the start 
# matrix terms (so max between any A and that C term). 
# Score is calculated using the implicit matrix
# input:  $startCuisRef <- ref to an array of start cuis (A terms)
#         $implicitMatrixFileName <- fileName of the implicit matrix
#         $measure <- the string of the umls association measure to use
#         $association <- an instance of umls association
# output: a hash ref of scores for each implicit key. (hash{cui} = score)
sub scoreImplicit_fromImplicitMatrix {
    #LBD Info
    my $startCuisRef = shift;
    my $implicitMatrixFileName = shift;
    my $measure = shift;
    my $association = shift;

######################################
    #Get hashes for A and C terms
#####################################
    #create a hash of starting terms
    my %aTerms = ();
    foreach my $cui (@{$startCuisRef}) {
	$aTerms{$cui} = 1;
    }

    #get all the target terms (terms that co-occur with aTerms 
    # in the implicit matrix file = the implicit terms)
    open IN, "$implicitMatrixFileName";
    my %cTerms = ();
    while (my $line = <IN>) {
	$line =~ /(C\d{7})\s(C\d{7})/;
	if (exists $aTerms{$1}) {
	    $cTerms{$2} = 1;
	}
    }

######################################
    #Get Co-occurrence values, N11, N1P, NP1, NPP
######################################
    #NPP is the number of Co-occurreces total
    #@NP1 is the number of co-occurrences of a C term with any term ... so sum of XXX\tCTerm\tVal for each cTerm
    #@N1P is the number of co-occurrences of any A term ... so sum of anyATerm\tXXX\t
    #N11{Cterm} is the sum of anyATerm\tCTerm\tVal
    seek IN, 0,0; #reset to the beginning of the implicit file

    #iterate over the lines of interest, and grab values
    my %np1 = ();
    my %n11 = ();
    my $n1p = 0;
    my $npp = 0;
    my $matchedCuiB = 0;
    my ($cuiA, $cuiB, $val);
    while (my $line = <IN>) {
	#grab data from the line
	($cuiA, $cuiB, $val) = split(/\t/,$line);

	#see if updates are necessary
	if (exists $aTerms{$cuiA} || exists $cTerms{$cuiB}) {

	    #update npp
	    $npp += $3;
	    
	    #update np1
	    if (exists $cTerms{$cuiB}) {
		$np1{$cuiB} += $val;
		$matchedCuiB = 1;
	    }

	    #update n1p
	    if (exists $aTerms{$cuiA}) {
		$n1p += $val;

		#update n11 if needed
		if ($matchedCuiB) {
		    $n11{$cuiB} += $val;
		    $matchedCuiB = 0;
		}
	    }
	}
    }


######################################
    # Calculate Association for each c term
######################################
    my %associationScores = ();
    foreach my $cTerm(keys %cTerms) {
	$associationScores{$cTerm} = 
	    $association->_calculateAssociation_fromObservedCounts($n11{$cTerm}, $n1p, $np1{$cTerm}, $npp, $measure);
    }

    return \%associationScores;
}

# scores each implicit CUI using an assocation measure. Score is the maximum 
# association between any of the linking terms.
# input:  $startingMatrixRef <- ref to the starting matrix
#         $explicitMatrixRef <- ref to the explicit matrix
#         $implicitMatrixRef <- ref to the implicit matrix
#         $measure <- the string of the umls association measure to use
#         $association <- an instance of umls association
# output: a hash ref of scores for each implicit key. (hash{cui} = score)
sub scoreImplicit_fromAllPairs {
    #LBD Info

lib/LiteratureBasedDiscovery/Rank.pm  view on Meta::CPAN


	    #update max for this implicit cui or create if needed
	    if (!exists $scores{$key2}) {
		$scores{$key2} = ${$bcPairsRef}{$pairKey};
	    }
	    elsif (${$bcPairsRef}{$pairKey} > $scores{$key2}) {
		$scores{$key2} = ${$bcPairsRef}{$pairKey}
	    }
	}
    }
    
    return \%scores;
}


sub scoreImplicit_minimumWeightAssociation {

}


#XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
#XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX



# Builds a list of B->C term pairs that also co-occurr with A terms
# Only adds B->C term pairs for C terms that are also present in the 
# implicitMatrix.
# The value of the bcPairs Hash is the value in the explicit matrix 
# for that pair.
# input:  $startingMatrixRef <- ref to the starting matrix
#         $explicitMatrixRef <- ref to the explicit matrix
#         $implicitMatrixRef <- ref to the implicit matrix
# output: a hash ref of BC term pairs. Each key is "$bTerm,$cTerm", 
#         value is by default the frequency of BC co-occurrences in the 
#         matrix
sub _getBCPairs {
    my $startingMatrixRef = shift;
    my $explicitMatrixRef = shift;
    my $implicitMatrixRef = shift;

    #get all bTerms
    my %bTerms = ();
    my $rowRef;
    foreach my $rowKey (keys %{$startingMatrixRef}) {
	$rowRef = ${$startingMatrixRef}{$rowKey};
	foreach my $colKey (keys %{$rowRef}) {
	    $bTerms{$colKey} = 1;
	}
    }

    #get all the cTerms (unique column values in the implicit matrix)
    my %cTerms = ();
    foreach my $rowKey(keys %{$implicitMatrixRef}) {
	$rowRef = ${$implicitMatrixRef}{$rowKey};
	foreach my $colKey (keys %{$rowRef}) {
	    $cTerms{$colKey} = 1;
	}
    }

    #get all bc pairs, set value to be the frequency of co-occurrence
    my %bcPairs = ();
    foreach my $bTerm(keys %bTerms) {
	$rowRef = ${$explicitMatrixRef}{$bTerm};
	if ($rowRef) {
	    foreach my $cTerm(keys %{$rowRef}) {
		if (exists $cTerms{$cTerm}) {
		    #add because this a->b->c term (%cTerms) is also a b->c term
		    $bcPairs{"$bTerm,$cTerm"} = ${$rowRef}{$cTerm};
		}
	    }
	}
    }
    return \%bcPairs;
}


# ranks the scores in descending order
# input: $scoresRef <- a hash ref to a hash of cuis and scores (hash{cui} = score)
# output: an array ref of the ranked cuis in descending order
sub rankDescending {
    #grab the input
    my $scoresRef = shift;

    #order in descending order, and use the CUI string as a tiebreaker
    my @rankedCuis = ();
    my @tiedCuis = ();
    my $currentScore = -1;
    foreach my $cui (
	#sort function to sort by value
	sort {${$scoresRef}{$b} <=> ${$scoresRef}{$a}} 
	keys %{$scoresRef}) {

	#see if this cui is tied with previuos
	if (${$scoresRef}{$cui} != $currentScore) {
	    #this cui is not tied with previuos,
	    # so save all previuos ones to the ranked array
	    # Here, we sort by key name, so the tie breaker
	    # is the cui name itself. This is arbitrary but 
	    # allows for results to be precisely replicated.
	    # UPDATE: Almost precisely replicated. There is 
	    # a numerical stability problem so that the sort
	    # by value will chunk out differently depending 
	    # on the run. So one run something with a values of 
	    # 0.66666666666667 will be sorted above another item
	    # with that same value, the next run sorted with it.
	    # this is essentially unavoidable without implementing
	    # a tolerance threshold which seems like overkill
	    foreach my $tiedCui (sort @tiedCuis) {
		push @rankedCuis, $tiedCui;
	    }

	    #clear the list of tied CUIs
	    @tiedCuis = ();
	}
	#add current CUI to the tied CUI list and update the
	# current score
	$currentScore = ${$scoresRef}{$cui};
	push @tiedCuis, $cui;
    }
    #add any remaining tied cuis to the final list
    foreach my $cui (sort @tiedCuis) {
	push @rankedCuis, $cui;
    }

    #return the ranked cuis
    return \@rankedCuis;
}


#XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
#XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX

# gets association scores for a set of cui pairs 
# input:  $cuiPairsRef <- reference to a hash of pairs of matrix indeces (key = '1,2')
#         $matrixRef <- a reference to a sparse matrix of n11 values
#         $measure <- the association measure to perform
#         $association <- an instance of UMLS::Association
# output: none, bu the cuiPairs ref has values updated to reflect the 
#         computed assocation score
sub getBatchAssociationScores {
    my $cuiPairsRef = shift;
    my $matrixRef = shift;
    my $measure = shift;
    my $association = shift;
    
    #optionally pass in $n1pRef, $np1Ref, and $npp
    # do this if they get calculated multiple times
    # (such as with time slicing)
    my $n1pRef = shift;
    my $np1Ref = shift;
    my $npp = shift;

    #if the measure is frequency, you only need to return 
    # the cuiPairs ref which already holds CUI frequencies
    if ($measure eq 'freq') {
	return $cuiPairsRef;
    }

    #calculate stats if needed
    if (!defined $n1pRef || !defined $np1Ref || !defined $npp) {
	($n1pRef, $np1Ref, $npp) = &getAllStats($matrixRef);
    }
    
    #get association scores for each CUI pair
    my ($n11, $cui1, $cui2);
    foreach my $key (keys %{$cuiPairsRef}) {
	#get the cui indeces
	($cui1, $cui2) = split(/,/,$key);

	#assume calculation cannot be made
	${$cuiPairsRef}{$key} = -1;

	#get n11
	$n11 = ${${$matrixRef}{$cui1}}{$cui2};

	#get association if possible (only possible if the terms have co-occurred)
	if (defined $n11) {
	    ${$cuiPairsRef}{$key} = $association->_calculateAssociation_fromObservedCounts($n11, ${$n1pRef}{$cui1}, ${$np1Ref}{$cui2}, $npp, $measure);
	}
    }
}

# gets NP1, N1P, and NPP for all CUIs. This is used in time-
# slicing and makes it much faster than getting stats individually
# for each starting term
# input:  $matrixRef <- ref to the co-occurrence matrix (the sparse matrix 
#                       of n11 values)
# output: \@vals <- an array ref of three values:
#                   \%n1p - a hash ref where the key is a cui and value is n1p
#                   \%np1 - a hash ref where the key is a cui and value is np1
#                   $npp - a scalar of npp
sub getAllStats {
    my $matrixRef = shift;



( run in 1.421 second using v1.01-cache-2.11-cpan-8f98c5d2c55 )