ALBD
view release on metacpan or search on metacpan
lib/ALBD.pm view on Meta::CPAN
my $umls_association;
#####################################################
####################################################
# performs LBD
# input: none
# output: none, but a results file is written to disk
sub performLBD {
my $self = shift;
my $start; #used to record run times
#implicit matrix ranking requires a different set of procedures
if ($lbdOptions{'rankingProcedure'} eq 'implicitMatrix') {
$self->performLBD_implicitMatrixRanking();
return;
}
if (exists $lbdOptions{'targetCuis'}) {
$self->performLBD_closedDiscovery();
return;
}
if (exists $lbdOptions{'precisionAndRecall_explicit'}) {
$self->timeSlicing_generatePrecisionAndRecall_explicit();
return;
}
if (exists $lbdOptions{'precisionAndRecall_implicit'}) {
$self->timeSlicing_generatePrecisionAndRecall_implicit();
return;
}
print "Open Discovery\n";
print $self->_parametersToString();
#Get inputs
my $startCuisRef = $self->_getStartCuis();
my $linkingAcceptTypesRef = $self->_getAcceptTypes('linking');
my $targetAcceptTypesRef = $self->_getAcceptTypes('target');
print "startCuis = ".(join(',', @{$startCuisRef}))."\n";
print "linkingAcceptTypes = ".(join(',', keys %{$linkingAcceptTypesRef}))."\n";
print "targetAcceptTypes = ".(join(',', keys %{$targetAcceptTypesRef}))."\n";
#Get the Explicit Matrix
$start = time;
my $explicitMatrixRef;
if(!defined $lbdOptions{'explicitInputFile'}) {
die ("ERROR: explicitInputFile must be defined in LBD config file\n");
}
$explicitMatrixRef = Discovery::fileToSparseMatrix($lbdOptions{'explicitInputFile'});
print "Got Explicit Matrix in ".(time() - $start)."\n";
#Get the Starting Matrix
$start = time();
my $startingMatrixRef =
Discovery::getRows($startCuisRef, $explicitMatrixRef);
print "Got Starting Matrix in ".(time() - $start)."\n";
#if using average minimum weight, grab the a->b scores
my %abPairsWithScores = ();
if ($lbdOptions{'rankingProcedure'} eq 'averageMinimumWeight'
|| $lbdOptions{'rankingProcedure'} eq 'ltc_amw') {
#apply semantic type filter to columns only
if ((scalar keys %{$linkingAcceptTypesRef}) > 0) {
Filters::semanticTypeFilter_columns(
$explicitMatrixRef, $linkingAcceptTypesRef, $umls_interface);
}
#initialize the abPairs to frequency of co-occurrence
foreach my $row (keys %{$startingMatrixRef}) {
foreach my $col (keys %{${$startingMatrixRef}{$row}}) {
$abPairsWithScores{"$row,$col"} = ${${$startingMatrixRef}{$row}}{$col};
}
}
Rank::getBatchAssociationScores(\%abPairsWithScores, $explicitMatrixRef, $lbdOptions{'rankingMeasure'}, $umls_association);
}
#Apply Semantic Type Filter to the explicit matrix
if ((scalar keys %{$linkingAcceptTypesRef}) > 0) {
$start = time();
Filters::semanticTypeFilter_rowsAndColumns(
$explicitMatrixRef, $linkingAcceptTypesRef, $umls_interface);
print "Semantic Type Filter in ".(time() - $start)."\n";
}
#Get Implicit Connections
$start = time();
my $implicitMatrixRef;
if (defined $lbdOptions{'implicitInputFile'}) {
$implicitMatrixRef = Discovery::fileToSparseMatrix($lbdOptions{'implicitInputFile'});
} else {
$implicitMatrixRef = Discovery::findImplicit($explicitMatrixRef, $startingMatrixRef);
}
print "Got Implicit Matrix in ".(time() - $start)."\n";
#Remove Known Connections
$start = time();
$implicitMatrixRef = Discovery::removeExplicit($startingMatrixRef, $implicitMatrixRef);
print "Removed Known Connections in ".(time() - $start)."\n";
#Apply Semantic Type Filter
if ((scalar keys %{$targetAcceptTypesRef}) > 0) {
$start = time();
Filters::semanticTypeFilter_columns(
$implicitMatrixRef, $targetAcceptTypesRef, $umls_interface);
print "Semantic Type Filter in ".(time() - $start)."\n";
}
#Score Implicit Connections
$start = time();
my $scoresRef;
if ($lbdOptions{'rankingProcedure'} eq 'allPairs') {
$scoresRef = Rank::scoreImplicit_fromAllPairs($startingMatrixRef, $explicitMatrixRef, $implicitMatrixRef, $lbdOptions{'rankingMeasure'}, $umls_association);
} elsif ($lbdOptions{'rankingProcedure'} eq 'averageMinimumWeight') {
$scoresRef = Rank::scoreImplicit_averageMinimumWeight($startingMatrixRef, $explicitMatrixRef, $implicitMatrixRef, $lbdOptions{'rankingMeasure'}, $umls_association, \%abPairsWithScores);
} elsif ($lbdOptions{'rankingProcedure'} eq 'linkingTermCount') {
$scoresRef = Rank::scoreImplicit_linkingTermCount($startingMatrixRef, $explicitMatrixRef, $implicitMatrixRef);
} elsif ($lbdOptions{'rankingProcedure'} eq 'frequency') {
$scoresRef = Rank::scoreImplicit_frequency($startingMatrixRef, $explicitMatrixRef, $implicitMatrixRef);
} elsif ($lbdOptions{'rankingProcedure'} eq 'ltcAssociation') {
$scoresRef = Rank::scoreImplicit_ltcAssociation($startingMatrixRef, $explicitMatrixRef, $implicitMatrixRef, $lbdOptions{'rankingMeasure'}, $umls_association);
} elsif ($lbdOptions{'rankingProcedure'} eq 'ltc_amw') {
$scoresRef = Rank::scoreImplicit_LTC_AMW($startingMatrixRef, $explicitMatrixRef, $implicitMatrixRef, $lbdOptions{'rankingMeasure'}, $umls_association, \%abPairsWithScores);
} else {
lib/ALBD.pm view on Meta::CPAN
open OUT, ">$lbdOptions{implicitOutputFile}"
or die "unable to open implicit ouput file: "
."$lbdOptions{implicitOutputFile}\n";
my $outputString = $self->_rankedTermsToString($scoresRef, $ranksRef);
my $paramsString = $self->_parametersToString();
print OUT $paramsString;
print OUT $outputString;
close OUT;
#Done
print "DONE!\n\n";
}
=cut
##################################################
################ Time Slicing ####################
##################################################
#NOTE: This function isn't really tested, and is really slow right now
# Generates precision and recall values by varying the threshold
# of the A->B ranking measure.
# input: none
# output: none, but precision and recall values are printed to STDOUT
sub timeSlicing_generatePrecisionAndRecall_explicit {
    # Builds a starting matrix by sampling, ranks its A->B pairs with the
    # configured ranking measure, and then, for a decreasing number of
    # top ranked pairs, generates implicit knowledge and prints the
    # precision and recall of that knowledge against the post cutoff matrix.
    # For the first interval (all pairs kept) it also collects and prints
    # total/min/max/average counts of predicted and true terms per row.
    # input:  none (options are read from %lbdOptions)
    # output: none, everything is printed to STDOUT
    # dies if 'explicitInputFile' is not defined in the options
    my $NUM_SAMPLES = 100; #TODO, read from file number of samples to average over for timeslicing
    my $self = shift;
    print "In timeSlicing_generatePrecisionAndRecall\n";

    my $numIntervals = 10;

    #Get inputs
    my $startAcceptTypesRef = $self->_getAcceptTypes('start');
    #NOTE: the linking accept types are retrieved but not used below
    my $linkingAcceptTypesRef = $self->_getAcceptTypes('linking');
    my $targetAcceptTypesRef = $self->_getAcceptTypes('target');

    #Get the Explicit Matrix
    my $explicitMatrixRef;
    if(!defined $lbdOptions{'explicitInputFile'}) {
        die ("ERROR: explicitInputFile must be defined in LBD config file\n");
    }
    $explicitMatrixRef = Discovery::fileToSparseMatrix($lbdOptions{'explicitInputFile'});

    #------------------------------------------

    #create the starting matrix
    my $startingMatrixRef
        = TimeSlicing::generateStartingMatrix($explicitMatrixRef, \%lbdOptions, $startAcceptTypesRef, $NUM_SAMPLES, $umls_interface);

    #get association scores for the starting matrix
    my $assocScoresRef = TimeSlicing::getAssociationScores(
        $startingMatrixRef, $lbdOptions{'rankingMeasure'}, $umls_association);
    #NOTE: min and max are retrieved but not used below
    my ($min, $max) = TimeSlicing::getMinMax($assocScoresRef);

    #load the post cutoff matrix for the necessary rows
    my $postCutoffMatrixRef
        = TimeSlicing::loadPostCutOffMatrix($startingMatrixRef, $explicitMatrixRef, $lbdOptions{'postCutoffFileName'});

    #apply a semantic type filter to the post cutoff matrix
    if ((scalar keys %{$targetAcceptTypesRef}) > 0) {
        Filters::semanticTypeFilter_columns(
            $postCutoffMatrixRef, $targetAcceptTypesRef, $umls_interface);
    }

    #apply a threshold at $numIntervals% intervals to generate an 11 point
    # interpolated precision/recall curve for linking term ranking/thresholding

    #stats for collecting info about predicted vs. true
    # min/max start undefined and are set from the first row seen, so no
    # sentinel value can leak into (or block updates of) the results
    my $predictedAverage = 0;
    my $trueAverage = 0;
    my ($trueMin, $trueMax);
    my ($predictedMin, $predictedMax);
    my $predictedTotal = 0;
    my $trueTotal = 0;
    my $allPairsCount = scalar keys %{$assocScoresRef};
    for (my $i = $numIntervals; $i >= 0; $i--) {

        #determine the number of samples to threshold
        my $numSamples = $i*($allPairsCount/$numIntervals);
        print "i, numSamples/allPairsCount = $i, $numSamples/$allPairsCount\n";
        #grab samples at just 10 to estimate the final point (this is what
        # makes it an 11 point curve)
        if ($numSamples == 0) {
            $numSamples = 10;
        }

        #apply a threshold (number of samples)
        my $thresholdedStartingMatrixRef = TimeSlicing::grabKHighestRankedSamples($numSamples, $assocScoresRef, $startingMatrixRef);

        #generate implicit knowledge
        my $implicitMatrixRef = Discovery::findImplicit($explicitMatrixRef, $thresholdedStartingMatrixRef);

        #Remove Known Connections
        $implicitMatrixRef
            = Discovery::removeExplicit($startingMatrixRef, $implicitMatrixRef);

        #apply a semantic type filter to the implicit matrix
        if ((scalar keys %{$targetAcceptTypesRef}) > 0) {
            Filters::semanticTypeFilter_columns(
                $implicitMatrixRef, $targetAcceptTypesRef, $umls_interface);
        }

        #calculate precision and recall
        my ($precision, $recall) = TimeSlicing::calculatePrecisionRecall(
            $implicitMatrixRef, $postCutoffMatrixRef);
        print "precision = $precision, recall = $recall\n";

        #calculate averages/min/max only for $i= $numIntervals, which is all terms
        if ($i == $numIntervals) {
            my $numRows = scalar keys %{$implicitMatrixRef};

            foreach my $rowKey (keys %{$implicitMatrixRef}) {

                #get the counts true and predicted for this term (row of matrix)
                my $numPredicted = scalar keys %{${$implicitMatrixRef}{$rowKey}};
                #NOTE(review): if $rowKey has no row in the post cutoff
                # matrix this dereference appears to create an empty row
                # there - confirm that this is acceptable
                my $numTrue = scalar keys %{${$postCutoffMatrixRef}{$rowKey}};

                #sum counts
                $predictedTotal += $numPredicted;
                $trueTotal += $numTrue;

                #update predicted min and max
                if (!defined $predictedMin || $numPredicted < $predictedMin) {
                    $predictedMin = $numPredicted;
                }
                if (!defined $predictedMax || $numPredicted > $predictedMax) {
                    $predictedMax = $numPredicted;
                }

                #update true min and max
                if (!defined $trueMin || $numTrue < $trueMin) {
                    $trueMin = $numTrue;
                }
                if (!defined $trueMax || $numTrue > $trueMax) {
                    $trueMax = $numTrue;
                }
            }

            #take the average over the rows of the implicit matrix, the
            # averages stay 0 when there are no rows (avoids dividing by 0)
            if ($numRows > 0) {
                $predictedAverage = $predictedTotal / $numRows;
                $trueAverage = $trueTotal / $numRows;
            }
        }
    }

    #no rows were seen, report 0 rather than an undefined min/max
    foreach my $stat ($predictedMin, $predictedMax, $trueMin, $trueMax) {
        $stat = 0 if (!defined $stat);
    }

    #output stats
    print "predicted - total, min, max, average = $predictedTotal, $predictedMin, $predictedMax, $predictedAverage\n";
    print "true - total, min, max, average = $trueTotal, $trueMin, $trueMax, $trueAverage\n";
}
# generates precision and recall values by varying the threshold
# of the A->C ranking measure. Also generates precision at k, and
# mean average precision
# input: none
# output: none, but precision, recall, precision at k, and map values
# output to STDOUT
sub timeSlicing_generatePrecisionAndRecall_implicit {
my $NUM_SAMPLES = 200; #TODO, read fomr file number of samples to average over for timeslicing
my $self = shift;
my $start; #used to record run times
print "In timeSlicing_generatePrecisionAndRecall_implicit\n";
#Get inputs
my $startAcceptTypesRef = $self->_getAcceptTypes('start');
my $linkingAcceptTypesRef = $self->_getAcceptTypes('linking');
my $targetAcceptTypesRef = $self->_getAcceptTypes('target');
#-----------
# Starting Matrix Creation
#-----------
#Get the Explicit Matrix
print "loading explicit\n";
my $explicitMatrixRef;
if(!defined $lbdOptions{'explicitInputFile'}) {
die ("ERROR: explicitInputFile must be defined in LBD config file\n");
}
$explicitMatrixRef = Discovery::fileToSparseMatrix($lbdOptions{'explicitInputFile'});
#create the starting matrix
print "generating starting\n";
my $startingMatrixRef
= TimeSlicing::generateStartingMatrix($explicitMatrixRef, \%lbdOptions, $startAcceptTypesRef, $NUM_SAMPLES, $umls_interface);
#----------
#--------
# Gold Loading/Creation
#--------
#load or create the gold matrix
my $goldMatrixRef;
if (exists $lbdOptions{'goldInputFile'}) {
print "inputting gold\n";
$goldMatrixRef = Discovery::fileToSparseMatrix($lbdOptions{'goldInputFile'});
}
else {
print "loading post cutoff\n";
$goldMatrixRef = TimeSlicing::loadPostCutOffMatrix($startingMatrixRef, $explicitMatrixRef, $lbdOptions{'postCutoffFileName'});
#remove explicit knowledge from the post cutoff matrix
$goldMatrixRef = Discovery::removeExplicit($startingMatrixRef, $goldMatrixRef);
#apply a semantic type filter to the post cutoff matrix
print "applying semantic filter to post-cutoff matrix\n";
if ((scalar keys %{$targetAcceptTypesRef}) > 0) {
Filters::semanticTypeFilter_columns(
$goldMatrixRef, $targetAcceptTypesRef, $umls_interface);
}
#TODO why is the gold matrix outputting with an extra line between samples?
#output the gold matrix
if (exists $lbdOptions{'goldOutputFile'}) {
print "outputting gold\n";
Discovery::outputMatrixToFile($lbdOptions{'goldOutputFile'}, $goldMatrixRef);
}
}
#-------
#-------
# AB Scoring (if needed)
#-------
#if using average minimum weight, grab the a->b scores, #TODO this is sloppy here, but it has to be here...how to make it fit better?
my %abPairsWithScores = ();
if ($lbdOptions{'rankingProcedure'} eq 'averageMinimumWeight'
|| $lbdOptions{'rankingProcedure'} eq 'ltc_amw') {
print "getting AB scores\n";
#apply semantic type filter to columns only
if ((scalar keys %{$linkingAcceptTypesRef}) > 0) {
Filters::semanticTypeFilter_columns(
$explicitMatrixRef, $linkingAcceptTypesRef, $umls_interface);
}
#intitialize the abPairs to the frequency of co-ocurrence
foreach my $row (keys %{$startingMatrixRef}) {
foreach my $col (keys %{${$startingMatrixRef}{$row}}) {
$abPairsWithScores{"$row,$col"} = ${${$startingMatrixRef}{$row}}{$col};
}
}
Rank::getBatchAssociationScores(
\%abPairsWithScores, $explicitMatrixRef, $lbdOptions{'rankingMeasure'}, $umls_association);
}
#--------
#------------
# Matrix Filtering/Thresholding
#------------
#load or threshold the matrix
if (exists $lbdOptions{'thresholdedMatrix'}) {
print "loading thresholded matrix\n";
$explicitMatrixRef = (); #clear (for memory)
$explicitMatrixRef = Discovery::fileToSparseMatrix($lbdOptions{'thresholdedMatrix'});
}
#else {#TODO apply a threshold}
#NOTE, we must threshold the entire matrix because that is how we are calculating association scores
#Apply Semantic Type Filter to the explicit matrix
print "applying semantic filter to explicit matrix\n";
if ((scalar keys %{$linkingAcceptTypesRef}) > 0) {
Filters::semanticTypeFilter_rowsAndColumns(
$explicitMatrixRef, $linkingAcceptTypesRef, $umls_interface);
}
#------------
# Prediction Generation
#------------
#load or create the predictions matrix
my $predictionsMatrixRef;
if (exists $lbdOptions{'predictionsInFile'}) {
print "loading predictions\n";
$predictionsMatrixRef = Discovery::fileToSparseMatrix($lbdOptions{'predictionsInFile'});
}
else {
print "generating predictions\n";
#generate implicit knowledge
print "Squaring Matrix\n";
$predictionsMatrixRef = Discovery::findImplicit(
$explicitMatrixRef, $startingMatrixRef);
#Remove Known Connections
print "Removing Known from Predictions\n";
$predictionsMatrixRef
= Discovery::removeExplicit($startingMatrixRef, $predictionsMatrixRef);
#apply a semantic type filter to the predictions matrix
print "Applying Semantic Filter to Predictions\n";
if ((scalar keys %{$targetAcceptTypesRef}) > 0) {
Filters::semanticTypeFilter_columns(
$predictionsMatrixRef, $targetAcceptTypesRef, $umls_interface);
}
#save the implicit knowledge matrix to file
if (exists ($lbdOptions{'predictionsOutFile'})) {
print "outputting predictions\n";
Discovery::outputMatrixToFile($lbdOptions{'predictionsOutFile'}, $predictionsMatrixRef);
}
}
#-------------------------------------------
#At this point, the explicitMatrixRef has been filtered and thresholded
#The predictions matrix Ref has been generated from the filtered and
# thresholded explicitMatrixRef, only rows of starting terms remain, filtered, and
# had explicit removed
#Association scores are generated using the explicitMatrixRef
#--------------
# Get the ranks of all predictions
#--------------
#get the scores and ranks seperately for each row
# thereby generating scores and ranks for each starting
# term individually
my %rowRanks = ();
my ($n1pRef, $np1Ref, $npp);
print "getting row ranks\n";
foreach my $rowKey (keys %{$predictionsMatrixRef}) {
#grab rows from start and implicit matrices
my %startingRow = ();
$startingRow{$rowKey} = ${$startingMatrixRef}{$rowKey};
my %implicitRow = ();
$implicitRow{$rowKey} = ${$predictionsMatrixRef}{$rowKey};
#Score Implicit Connections
my $scoresRef;
if ($lbdOptions{'rankingProcedure'} eq 'allPairs') {
#get stats just a single time
if (!defined $n1pRef || !defined $np1Ref || !defined $npp) {
($n1pRef, $np1Ref, $npp) = Rank::getAllStats($explicitMatrixRef);
}
$scoresRef = Rank::scoreImplicit_fromAllPairs(\%startingRow, $explicitMatrixRef, \%implicitRow, $lbdOptions{'rankingMeasure'}, $umls_association, $n1pRef, $np1Ref, $npp);
} elsif ($lbdOptions{'rankingProcedure'} eq 'averageMinimumWeight') {
#get stats just a single time
if (!defined $n1pRef || !defined $np1Ref || !defined $npp) {
($n1pRef, $np1Ref, $npp) = Rank::getAllStats($explicitMatrixRef);
}
$scoresRef = Rank::scoreImplicit_averageMinimumWeight(\%startingRow, $explicitMatrixRef, \%implicitRow, $lbdOptions{'rankingMeasure'}, $umls_association, \%abPairsWithScores, $n1pRef, $np1Ref, $npp);
} elsif ($lbdOptions{'rankingProcedure'} eq 'linkingTermCount') {
$scoresRef = Rank::scoreImplicit_linkingTermCount(\%startingRow, $explicitMatrixRef, \%implicitRow);
} elsif ($lbdOptions{'rankingProcedure'} eq 'frequency') {
$scoresRef = Rank::scoreImplicit_frequency(\%startingRow, $explicitMatrixRef, \%implicitRow);
} elsif ($lbdOptions{'rankingProcedure'} eq 'ltcAssociation') {
$scoresRef = Rank::scoreImplicit_ltcAssociation(\%startingRow, $explicitMatrixRef, \%implicitRow, $lbdOptions{'rankingMeasure'}, $umls_association);
} elsif ($lbdOptions{'rankingProcedure'} eq 'ltc_amw') {
#get stats just a single time
lib/ALBD.pm view on Meta::CPAN
# Each line of the file contains a new key-value pair (e.g. <key>value)
# If no value is provided, a default value of 1 is set
# output: a hash ref to a hash containing each key value pair
sub _readConfigFile {
    # Reads key-value options from a config file into a hash.
    # Lines whose first character is '#' and lines that contain only
    # whitespace are skipped. Every other line must look like <key>value;
    # lines that do not are reported on STDERR and ignored.
    # input:  $configFileName <- name of the config file to read
    # output: a hash ref to a hash containing each key value pair, a key
    #         with no value gets the default value of 1
    # dies if the config file cannot be opened
    my $self = shift;
    my $configFileName = shift;

    #three argument open with a lexical handle, so the file name can
    # never be interpreted as an open mode and no global handle is used
    open(my $in, '<', $configFileName)
        or die("Error: Cannot open config file: $configFileName\n");

    #read in all options from the config file
    my %optionsHash = ();
    while (my $line = <$in>) {

        #skip comments and blank lines
        next if (substr($line, 0, 1) eq '#');
        next if ($line !~ /[^\s]+/);

        #line contains data, grab the key and value
        if ($line =~ /<([^>]+)>([^\n]*)/) {
            my ($key, $value) = ($1, $2);

            #drop the carriage return left behind by windows line endings
            $value =~ s/\r\z//;

            #compare against the empty string rather than testing truth,
            # so that an explicit value of 0 is kept and not replaced by
            # the default
            if ($value ne '') {
                $optionsHash{$key} = $value;
            }
            else {
                $optionsHash{$key} = 1;
            }
        }
        else {
            #the line did not contain a <key>
            print STDERR
                "Warning: Invalid line in $configFileName: $line\n";
        }
    }
    close $in;

    return \%optionsHash;
}
# transforms the string of start cuis to an array
# input: none
# output: an array ref of CUIs
sub _getStartCuis {
    # Splits the comma separated 'startCuis' option into its CUIs.
    # input:  none (reads $lbdOptions{'startCuis'})
    # output: a ref to a new array holding one CUI per element
    my ($self) = @_;
    return [ split(',', $lbdOptions{'startCuis'}) ];
}
# transforms the string of target cuis to an array
# input: none
# output: an array ref of CUIs
sub _getTargetCuis {
    # Splits the comma separated 'targetCuis' option into its CUIs.
    # input:  none (reads $lbdOptions{'targetCuis'})
    # output: a ref to a new array holding one CUI per element
    my ($self) = @_;
    return [ split(',', $lbdOptions{'targetCuis'}) ];
}
# transforms the string of accept types or groups into a hash of accept TUIs
# input: a string specifying which step's types are being defined
#        ('start', 'linking', or 'target')
# output: a hash of acceptable TUIs
sub _getAcceptTypes {
    # Collects the accepted TUIs for one step of the discovery process.
    # The step name is prefixed to 'AcceptGroups' and 'AcceptTypes' to find
    # the two comma separated options to read from %lbdOptions; either,
    # both, or neither may be defined.
    # input:  $stepString <- the step name, callers in this file pass
    #                        'start', 'linking', or 'target'
    # output: a ref to a hash whose keys are the accepted TUIs (values are
    #         1), empty when neither option is defined
    my ($self, $stepString) = @_;
    my %acceptTypes = ();

    #groups: every type belonging to a listed group is accepted
    my $groupsOption = $stepString.'AcceptGroups';
    if (defined $lbdOptions{$groupsOption}) {
        foreach my $group (split(',', $lbdOptions{$groupsOption})) {
            my $typesRef = Filters::getTypesOfGroup($group, $umls_interface);
            foreach my $type (keys %{$typesRef}) {
                $acceptTypes{$type} = 1;
            }
        }
    }

    #types: each listed abbreviation is converted to its upper cased TUI
    my $typesOption = $stepString.'AcceptTypes';
    if (defined $lbdOptions{$typesOption}) {
        foreach my $abr (split(',', $lbdOptions{$typesOption})) {
            $acceptTypes{ uc($umls_interface->getStTui($abr)) } = 1;
        }
    }

    return \%acceptTypes;
}
##############################################################################
# function to produce output
##############################################################################
# outputs the implicit terms to string
# input: $scoresRef <- a reference to a hash of scores (hash{CUI}=score)
# $ranksRef <- a reference to an array of CUIs ranked by their score
# $printTo <- optional, outputs the $printTo top ranked terms. If not
# specified, all terms are output
# output: a line separated string containing ranked terms, scores, and their
# preferred terms
sub _rankedTermsToString {
my $self = shift;
my $scoresRef = shift;
my $ranksRef = shift;
my $printTo = shift;
#set printTo
if (!$printTo) {
$printTo = scalar @{$ranksRef};
}
#construct the output string
my $string = '';
my $index;
for (my $i = 0; $i < $printTo; $i++) {
#add the rank
$index = $i+1;
$string .= "$index\t";
#add the score
$string .= sprintf "%.5f\t", "${$scoresRef}{${$ranksRef}[$i]}\t";
#add the CUI
$string .= "${$ranksRef}[$i]\t";
#add the name
my $name = $umls_interface->getPreferredTerm(${$ranksRef}[$i]);
#if no preferred name, get anything
if (!defined $name || $name eq '') {
my $termListRef = $umls_interface->getTermList('C0440102');
if (scalar @{$termListRef} > 0) {
$name = '.**'.${$termListRef}[0];
}
}
$string .= "$name\n";
}
#return the string of ranked terms
return $string;
( run in 0.652 second using v1.01-cache-2.11-cpan-df04353d9ac )