ALBD

 view release on metacpan or  search on metacpan

lib/ALBD.pm  view on Meta::CPAN

    my $ranksRef = Rank::rankDescending($scoresRef);
    print "Ranked in: ".(time()-$start)."\n";

#Output The Results
    open OUT, ">$lbdOptions{implicitOutputFile}" 
	or die "unable to open implicit ouput file: "
	."$lbdOptions{implicitOutputFile}\n";
    my $outputString = $self->_rankedTermsToString($scoresRef, $ranksRef);
    my $paramsString = $self->_parametersToString();
    print OUT $paramsString;
    print OUT $outputString;
    close OUT;

#Done
    print "DONE!\n\n";
}
=cut


##################################################
################ Time Slicing ####################
##################################################

#NOTE: This function isn't really tested, and is really slow right now
# Generates precision and recall values by varying the threshold
# of the A->B ranking measure.
#
# Steps:
#   1. load the explicit matrix named by $lbdOptions{'explicitInputFile'}
#      and build a starting matrix from it
#   2. score the starting matrix with $lbdOptions{'rankingMeasure'}
#   3. load the post cutoff matrix ($lbdOptions{'postCutoffFileName'}) and,
#      if target accept types are configured, filter its columns
#   4. for $i = $numIntervals down to 0: threshold the starting matrix,
#      generate implicit knowledge, remove known connections, filter by
#      target accept types and print precision/recall
#   5. print total/min/max/average row sizes of the predicted and true
#      matrices, collected on the un-thresholded ($i == $numIntervals) pass
#
# input:  none
# output: none, but precision and recall values are printed to STDOUT
# dies:   if explicitInputFile is not defined in the LBD config
sub timeSlicing_generatePrecisionAndRecall_explicit {
    my $NUM_SAMPLES = 100; #TODO, read from file the number of samples to average over for timeslicing
    my $self = shift;
    print "In timeSlicing_generatePrecisionAndRecall\n";

    my $numIntervals = 10;

#Get inputs
    my $startAcceptTypesRef = $self->_getAcceptTypes('start');
    #NOTE(review): the linking accept types are fetched but not used by this
    # routine; the call is kept in case _getAcceptTypes validates the config
    my $linkingAcceptTypesRef = $self->_getAcceptTypes('linking');
    my $targetAcceptTypesRef = $self->_getAcceptTypes('target');


#Get the Explicit Matrix
    if (!defined $lbdOptions{'explicitInputFile'}) {
        die ("ERROR: explicitInputFile must be defined in LBD config file\n");
    }
    my $explicitMatrixRef
        = Discovery::fileToSparseMatrix($lbdOptions{'explicitInputFile'});

#------------------------------------------

    #create the starting matrix
    my $startingMatrixRef
        = TimeSlicing::generateStartingMatrix($explicitMatrixRef, \%lbdOptions, $startAcceptTypesRef, $NUM_SAMPLES, $umls_interface);

    #get association scores for the starting matrix
    my $assocScoresRef = TimeSlicing::getAssociationScores(
        $startingMatrixRef, $lbdOptions{'rankingMeasure'}, $umls_association);
    #NOTE(review): min/max are not used below (thresholding is done by
    # sample count, not by score); the call is kept unchanged - confirm
    # whether it can be dropped
    my ($min, $max) = TimeSlicing::getMinMax($assocScoresRef);

    #load the post cutoff matrix for the necessary rows
    my $postCutoffMatrixRef
        = TimeSlicing::loadPostCutOffMatrix($startingMatrixRef, $explicitMatrixRef, $lbdOptions{'postCutoffFileName'});

    #apply a semantic type filter to the post cutoff matrix
    if ((scalar keys %{$targetAcceptTypesRef}) > 0) {
        Filters::semanticTypeFilter_columns(
            $postCutoffMatrixRef, $targetAcceptTypesRef, $umls_interface);
    }

    #apply a threshold at $numIntervals% intervals to generate an 11 point
    # interpolated precision/recall curve for linking term ranking/thresholding
    #stats for collecting info about predicted vs. true
    my $predictedAverage = 0;
    my $trueAverage = 0;
    my $trueMin = 99999;
    my $trueMax = -999999;
    my $predictedMin = 999999;
    #the max sentinel must start BELOW any real count, otherwise it is
    # never replaced by an observed value
    my $predictedMax = -999999;
    my $predictedTotal = 0;
    my $trueTotal = 0;
    my $allPairsCount = scalar keys %{$assocScoresRef};
    for (my $i = $numIntervals; $i >= 0; $i--) {

        #determine the number of samples to threshold
        my $numSamples = $i*($allPairsCount/$numIntervals);
        print "i, numSamples/allPairsCount = $i, $numSamples/$allPairsCount\n";
        #grab samples at just 10 to estimate the final point (this is what
        # makes it an 11 point curve)
        if ($numSamples == 0) {
            $numSamples = 10;
        }

        #apply a threshold (number of samples)
        my $thresholdedStartingMatrixRef = TimeSlicing::grabKHighestRankedSamples($numSamples, $assocScoresRef, $startingMatrixRef);

        #generate implicit knowledge
        my $implicitMatrixRef = Discovery::findImplicit($explicitMatrixRef, $thresholdedStartingMatrixRef);

        #Remove Known Connections
        $implicitMatrixRef
            = Discovery::removeExplicit($startingMatrixRef, $implicitMatrixRef);

        #apply a semantic type filter to the implicit matrix
        if ((scalar keys %{$targetAcceptTypesRef}) > 0) {
            Filters::semanticTypeFilter_columns(
                $implicitMatrixRef, $targetAcceptTypesRef, $umls_interface);
        }

        #calculate precision and recall
        my ($precision, $recall) = TimeSlicing::calculatePrecisionRecall(
            $implicitMatrixRef, $postCutoffMatrixRef);
        print "precision = $precision, recall = $recall\n";

        #calculate averages/min/max only for $i= $numIntervals, which is all terms
        if ($i == $numIntervals) {
            #average over all terms
            foreach my $rowKey (keys %{$implicitMatrixRef}) {
                #get the counts true and predicted for this term (row of matrix)
                my $numPredicted = scalar keys %{${$implicitMatrixRef}{$rowKey}};

                #look the true row up without autovivifying it, so that the
                # post cutoff matrix is not modified before the remaining
                # precision/recall calculations; a missing row counts as 0
                my $trueRowRef = ${$postCutoffMatrixRef}{$rowKey};
                my $numTrue = (defined $trueRowRef)
                    ? scalar keys %{$trueRowRef}
                    : 0;

                #sum counts
                $predictedAverage += $numPredicted;
                $trueAverage += $numTrue;

                #update min and max (predicted and true tracked separately)
                if ($numPredicted < $predictedMin) {
                    $predictedMin = $numPredicted;
                }
                if ($numPredicted > $predictedMax) {
                    $predictedMax = $numPredicted;
                }
                if ($numTrue < $trueMin) {
                    $trueMin = $numTrue;
                }
                if ($numTrue > $trueMax) {
                    $trueMax = $numTrue;
                }

                $predictedTotal += $numPredicted;
                $trueTotal += $numTrue;
            }
            #take the average over the rows of the implicit matrix; skip the
            # division when there are no rows (averages then stay 0) rather
            # than dying with a division by zero
            my $numRows = scalar keys %{$implicitMatrixRef};
            if ($numRows > 0) {
                $predictedAverage /= $numRows;
                $trueAverage /= $numRows;
            }
        }
    }

    #output stats
    print "predicted - total, min, max, average = $predictedTotal, $predictedMin, $predictedMax, $predictedAverage\n";
    print "true - total, min, max, average = $trueTotal, $trueMin, $trueMax, $trueAverage\n";
}


# Generates precision and recall values by varying the threshold
# of the A->C ranking measure. Also generates precision at k and
# mean average precision (MAP).
# input:  none
# output: none, but precision, recall, precision at k, and MAP values
#         are printed to STDOUT
sub timeSlicing_generatePrecisionAndRecall_implicit {
    my $NUM_SAMPLES = 200; #TODO, read fomr file number of samples to average over for timeslicing
    my $self = shift;
    my $start; #used to record run times
    print "In timeSlicing_generatePrecisionAndRecall_implicit\n";

    #Get inputs
    my $startAcceptTypesRef = $self->_getAcceptTypes('start');
    my $linkingAcceptTypesRef = $self->_getAcceptTypes('linking');
    my $targetAcceptTypesRef = $self->_getAcceptTypes('target');

#-----------
# Starting Matrix Creation
#-----------
    #Get the Explicit Matrix
    print "loading explicit\n";
    my $explicitMatrixRef;
    if(!defined $lbdOptions{'explicitInputFile'}) {
	die ("ERROR: explicitInputFile must be defined in LBD config file\n");
    }
    $explicitMatrixRef = Discovery::fileToSparseMatrix($lbdOptions{'explicitInputFile'});

    #create the starting matrix
    print "generating starting\n";
    my $startingMatrixRef 
	= TimeSlicing::generateStartingMatrix($explicitMatrixRef, \%lbdOptions, $startAcceptTypesRef, $NUM_SAMPLES, $umls_interface);
#----------
    

#--------
# Gold Loading/Creation
#--------
    #load or create the gold matrix
    my $goldMatrixRef;
    if (exists $lbdOptions{'goldInputFile'}) {
	print "inputting gold\n";
	$goldMatrixRef = Discovery::fileToSparseMatrix($lbdOptions{'goldInputFile'});
    }
    else {
	print "loading post cutoff\n";
	$goldMatrixRef = TimeSlicing::loadPostCutOffMatrix($startingMatrixRef, $explicitMatrixRef, $lbdOptions{'postCutoffFileName'});

	#remove explicit knowledge from the post cutoff matrix
	$goldMatrixRef = Discovery::removeExplicit($startingMatrixRef, $goldMatrixRef);

	#apply a semantic type filter to the post cutoff matrix
	print "applying semantic filter to post-cutoff matrix\n";
	if ((scalar keys %{$targetAcceptTypesRef}) > 0) {
	    Filters::semanticTypeFilter_columns(
		$goldMatrixRef, $targetAcceptTypesRef, $umls_interface);
	}

	#TODO why is the gold matrix outputting with an extra line between samples?
	#output the gold matrix
	if (exists $lbdOptions{'goldOutputFile'}) {
	    print "outputting gold\n";
	    Discovery::outputMatrixToFile($lbdOptions{'goldOutputFile'}, $goldMatrixRef); 
	}
    }
#-------
  
#-------
# AB Scoring (if needed)
#-------
    #if using average minimum weight, grab the a->b scores, #TODO this is sloppy here, but it has to be here...how to make it fit better?
    my %abPairsWithScores = ();
    if ($lbdOptions{'rankingProcedure'} eq 'averageMinimumWeight'
		|| $lbdOptions{'rankingProcedure'} eq 'ltc_amw') {
	print "getting AB scores\n";

	#apply semantic type filter to columns only
	if ((scalar keys %{$linkingAcceptTypesRef}) > 0) {
	    Filters::semanticTypeFilter_columns(
		$explicitMatrixRef, $linkingAcceptTypesRef, $umls_interface);
	}
	#intitialize the abPairs to the frequency of co-ocurrence
	foreach my $row (keys %{$startingMatrixRef}) {
	    foreach my $col (keys %{${$startingMatrixRef}{$row}}) {
		$abPairsWithScores{"$row,$col"} = ${${$startingMatrixRef}{$row}}{$col}; 
	    }
	}
	Rank::getBatchAssociationScores(
	    \%abPairsWithScores, $explicitMatrixRef, $lbdOptions{'rankingMeasure'}, $umls_association);
    }
#--------

#------------
# Matrix Filtering/Thresholding
#------------
    #load or threshold the matrix
    if (exists $lbdOptions{'thresholdedMatrix'}) {
	print "loading thresholded matrix\n";
	$explicitMatrixRef = (); #clear (for memory)
	$explicitMatrixRef = Discovery::fileToSparseMatrix($lbdOptions{'thresholdedMatrix'});
    }
    #else {#TODO apply a threshold}
    #NOTE, we must threshold the entire matrix because that is how we are calculating association scores

    #Apply Semantic Type Filter to the explicit matrix
    print "applying semantic filter to explicit matrix\n";
    if ((scalar keys %{$linkingAcceptTypesRef}) > 0) {
	Filters::semanticTypeFilter_rowsAndColumns(
	    $explicitMatrixRef, $linkingAcceptTypesRef, $umls_interface);
    }

#------------
# Prediction Generation



( run in 0.808 second using v1.01-cache-2.11-cpan-ceb78f64989 )