ALBD

 view release on metacpan or  search on metacpan

MANIFEST  view on Meta::CPAN

config/interface
config/interfaceConfig
config/lbd
lib/ALBD.pm
lib/LiteratureBasedDiscovery/Discovery.pm
lib/LiteratureBasedDiscovery/Evaluation.pm
lib/LiteratureBasedDiscovery/Filters.pm
lib/LiteratureBasedDiscovery/Rank.pm
lib/LiteratureBasedDiscovery/TimeSlicing.pm
samples/lbdConfig
samples/postCutoffMatrix
samples/runSample.pl
samples/sampleExplicitMatrix
samples/sampleGoldMatrix
samples/timeSliceCuiList
samples/timeSlicingConfig
samples/configFileSamples/UMLSAssociationConfig
samples/configFileSamples/UMLSInterfaceConfig
samples/configFileSamples/UMLSInterfaceInternalConfig
t/test.t
t/goldSampleOutput

README  view on Meta::CPAN

    generates co-occurrence files where order does matter, so the sentence
    'cui1 cui2' will only mark a co-occurrence between cui1 and cui2, but
    not between cui2 and cui1).

    removeCUIPair.pl -- removes all occurrences of the specified CUI pair
    from the co-occurrence matrix

    removeExplicit.pl -- removes any keys that occur in an explicit
    co-occurrence matrix from another co-occurrence matrix (typically the
    squared explicit co-occurrence matrix itself, which generates a
    prediction matrix, or the post cutoff matrix used in time slicing to
    generate a gold standard file)

    testMatrixEquality.pl -- checks to see if two co-occurrence matrix files
    contain the same data

    Also included are several subfolders with more specific purposes. Within
    the dataStats subfolder are scripts to collect various statistics about
    the co-occurrence matrices used in LBD. These scripts include:

    getCUICooccurrences.pl -- a data statistics file that gets the number of

lib/ALBD.pm  view on Meta::CPAN

    #create the starting matrix
    my $startingMatrixRef 
	= TimeSlicing::generateStartingMatrix($explicitMatrixRef, \%lbdOptions, $startAcceptTypesRef, $NUM_SAMPLES, $umls_interface);

    #get association scores for the starting matrix
    my $assocScoresRef = TimeSlicing::getAssociationScores(
	$startingMatrixRef, $lbdOptions{'rankingMeasure'}, $umls_association);
    my ($min, $max) = TimeSlicing::getMinMax($assocScoresRef);
    my $range = $max-$min;

    #load the post cutoff matrix for the necessary rows
    my $postCutoffMatrixRef 
	= TimeSlicing::loadPostCutOffMatrix($startingMatrixRef, $explicitMatrixRef, $lbdOptions{'postCutoffFileName'});

    #apply a semantic type filter to the post cutoff matrix
    if ((scalar keys %{$targetAcceptTypesRef}) > 0) {
	    Filters::semanticTypeFilter_columns(
		$postCutoffMatrixRef, $targetAcceptTypesRef, $umls_interface);
    }

    #apply a threshold at $numIntervals% intervals to generate an 11 point
    # interpolated precision/recall curve for linking term ranking/thresholding
    #stats for collecting info about predicted vs. true
    my $predictedAverage = 0;
    my $trueAverage = 0; 
    my $trueMin = 99999;
    my $trueMax = -999999; 
    my $predictedMin = 999999;

lib/ALBD.pm  view on Meta::CPAN

	    = Discovery::removeExplicit($startingMatrixRef, $implicitMatrixRef);

	#apply a semantic type filter to the implicit matrix
	if ((scalar keys %{$targetAcceptTypesRef}) > 0) {
	    Filters::semanticTypeFilter_columns(
		$implicitMatrixRef, $targetAcceptTypesRef, $umls_interface);
	}

	#calculate precision and recall
	my ($precision, $recall) = TimeSlicing::calculatePrecisionRecall(
	    $implicitMatrixRef, $postCutoffMatrixRef);
	print "precision = $precision, recall = $recall\n";

	#calculate averages/min/max only for $i= $numIntervals, which is all terms
	if ($i == $numIntervals) {
	    #average over all terms
	    foreach my $rowKey(keys %{$implicitMatrixRef}) {
		#get the counts true and predicted for this term (row of matrix)
		my $numPredicted = scalar keys %{${$implicitMatrixRef}{$rowKey}};
		my $numTrue = scalar keys %{${$postCutoffMatrixRef}{$rowKey}};

		#sum counts
		$predictedAverage += $numPredicted;
		$trueAverage += $numTrue;
		
		#update min and max
		if ($numPredicted < $predictedMin) {
		    $predictedMin = $numPredicted;
		}
		if ($numPredicted > $predictedMax) {

lib/ALBD.pm  view on Meta::CPAN

#--------
# Gold Loading/Creation
#--------
    #load or create the gold matrix
    my $goldMatrixRef;
    if (exists $lbdOptions{'goldInputFile'}) {
	print "inputting gold\n";
	$goldMatrixRef = Discovery::fileToSparseMatrix($lbdOptions{'goldInputFile'});
    }
    else {
	print "loading post cutoff\n";
	$goldMatrixRef = TimeSlicing::loadPostCutOffMatrix($startingMatrixRef, $explicitMatrixRef, $lbdOptions{'postCutoffFileName'});

	#remove explicit knowledge from the post cutoff matrix
	$goldMatrixRef = Discovery::removeExplicit($startingMatrixRef, $goldMatrixRef);

	#apply a semantic type filter to the post cutoff matrix
	print "applying semantic filter to post-cutoff matrix\n";
	if ((scalar keys %{$targetAcceptTypesRef}) > 0) {
	    Filters::semanticTypeFilter_columns(
		$goldMatrixRef, $targetAcceptTypesRef, $umls_interface);
	}

	#TODO why is the gold matrix outputting with an extra line between samples?
	#output the gold matrix
	if (exists $lbdOptions{'goldOutputFile'}) {
	    print "outputting gold\n";
	    Discovery::outputMatrixToFile($lbdOptions{'goldOutputFile'}, $goldMatrixRef); 

lib/LiteratureBasedDiscovery/Evaluation.pm  view on Meta::CPAN

# ALBD::Evaluation.pm
#
# Provides functionality to evaluate LBD systems
# Key components are:
# Results Matrix <- all new knowledge generated by an LBD system (e.g.
#                   all proposed discoveries of a system from pre-cutoff
#                   data).
# Gold Standard Matrix <- the gold standard knowledge matrix (e.g. all
#                         knowledge present in the post-cutoff dataset
#                         that is not present in the pre-cutoff dataset).
#
# Copyright (c) 2017
#
# Sam Henry
# henryst at vcu.edu
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2

lib/LiteratureBasedDiscovery/TimeSlicing.pm  view on Meta::CPAN

	    $cuis{$line} = 1;
	}
    }
    close IN;

    return \%cuis;
}


# calculates average precision and recall of the generated implicit matrix 
# compared to the post cutoff matrix
# input:  $predictionsMatrixRef <- a ref to a sparse matrix of predicted 
#                                  discoveries
#         $trueMatrixRef <- a ref to a sparse matrix of true discoveries
# output: ($precision, $recall) <- two scalar values specifying the precision 
#                                  and recall
sub calculatePrecisionRecall {
    my $predictionsMatrixRef = shift; #a matrix of predicted discoveries
    my $trueMatrixRef = shift; #a matrix of true discoveries
    print "calculating precision and recall\n";

lib/LiteratureBasedDiscovery/TimeSlicing.pm  view on Meta::CPAN

		($truePositive/($truePositive+$falsePositive)); 
	} #else precision += 0 ... nothing needs to be done
	if ((scalar keys %{${$trueMatrixRef}{$rowKey}}) > 0) {
	    $recall += 
		($truePositive/
		 (scalar keys %{${$trueMatrixRef}{$rowKey}}));
	} #else recall += 0
    }

    #calculate the averages (divide by the number of rows 
    #    = the number of terms in the post cutoff matrix)
    $precision /= scalar keys %{$trueMatrixRef};
    $recall /= scalar keys %{$trueMatrixRef};

    #return the average precision and recall
    return ($precision, $recall);
}


# loads the post cutoff matrix from file. Only loads rows corresponding
# to rows in the starting matrix ref to save memory, and because those are 
# the only rows that are needed.
# Each line of the file is read as tab separated fields: cui1, cui2, value.
# A row is created for every start term (cui1) found in the file, even if 
# none of its columns are kept, so a start term can map to an empty row.
# Columns (cui2) are only kept if they are a row key of the explicit matrix.
# input:  $startingMatrixRef <- a ref to the starting sparse matrix
#         $explicitMatrixRef <- a ref to the explicit sparse matrix
#         $postCutoffFileName <- the filename to the postCutoffMatrix
# output: \%postCutoffMatrix <- a ref to the postCutoff sparse matrix
# dies if the post cutoff file cannot be opened
sub loadPostCutOffMatrix {
    my $startingMatrixRef = shift;
    my $explicitMatrixRef = shift;
    my $postCutoffFileName = shift;
    print "loading postCutoff Matrix\n";
    
    #open the post cutoff file. The 3 argument form of open is used so 
    # that the file name is never interpreted as a mode or a command, and
    # a lexical file handle is used so no global handle is clobbered
    open(my $postCutoffFh, '<', $postCutoffFileName)
	or die ("ERROR: cannot open post cutoff file: $postCutoffFileName: $!");

    #read in values of the post cutoff matrix for the start terms
    my %postCutoffMatrix = ();
    while (my $line = <$postCutoffFh>) {
	#grab values from the line
	chomp $line;
	my ($cui1, $cui2, $val) = split(/\t/,$line);

	#blank lines contain no fields at all, skip them
	next if (!(defined $cui1));

	#see if this line contains a key that should be read in 
	# (only rows of the starting matrix are needed)
	next if (!(exists ${$startingMatrixRef}{$cui1}));

	#create the row (if needed)
	if (!(defined $postCutoffMatrix{$cui1})) {
	    my %newHash = ();
	    $postCutoffMatrix{$cui1} = \%newHash;
	}

	#check to ensure that the column cui is in the 
	#  vocabulary of the pre-cutoff dataset.
	#  it is impossible to make predictions of words that
	#  don't already exist
	#NOTE: this assumes $explicitMatrixRef is a square 
	#   matrix (so unordered)
	#lines without a second field have no column to add
	if ((defined $cui2) && (exists ${$explicitMatrixRef}{$cui2})) {
	    ${$postCutoffMatrix{$cui1}}{$cui2} = $val;
	}
    }
    close $postCutoffFh;

    #return the post cutoff matrix
    return \%postCutoffMatrix;
}

#TODO numRows should be read from file and sent with the lbdOptionsRef
# generates a starting matrix of numRows randomly selected terms
# input:  $explicitMatrixRef <- a ref to the explicit sparse matrix
#         $lbdOptionsRef <- the LBD options
#         $startTermAcceptTypesRef <- a reference to an hash of accept 
#                                     types for start terms (TUIs)
#         $numRows <- the number of random rows to load (if random)
#         $umls_interface <- an instance of the UMLS::Interface

lib/LiteratureBasedDiscovery/TimeSlicing.pm  view on Meta::CPAN

#                       corresponds to the assocScoresRef
# output: \%thresholdedMatrix <- a ref to a new matrix, built from the 
#         $matrixRef after applying the $threshold
sub applyThreshold {
    #arguments, in order: the minimum score to keep, a ref to the hash of
    # association scores (keys are 'cui1,cui2' pairs), and a ref to the 
    # sparse matrix that the kept values are copied from
    my ($threshold, $assocScoresRef, $matrixRef) = @_;

    #build a new matrix containing only the pairs that pass the threshold
    my %thresholdedMatrix;
    for my $pairKey (keys %$assocScoresRef) {

	#only pairs scoring at or above the threshold are kept
	next unless $assocScoresRef->{$pairKey} >= $threshold;

	#the key holds the row cui and the column cui, comma separated
	my ($rowCui, $colCui) = split(/,/, $pairKey);

	#start a new row the first time this row cui is seen
	$thresholdedMatrix{$rowCui} = {}
	    unless exists $thresholdedMatrix{$rowCui};

	#copy the value over from the source matrix
	$thresholdedMatrix{$rowCui}{$colCui} = $matrixRef->{$rowCui}{$colCui};
    }

    #hand back a ref to the new matrix
    return \%thresholdedMatrix;
}

# Grabs the K highest ranked samples. This is for thresholding based on the 
# number of samples. Used in explicit timeslicing
# input:  $k <- the number of samples to get

lib/LiteratureBasedDiscovery/TimeSlicing.pm  view on Meta::CPAN

# output: \%thresholdedMatrix <- a ref to a sparse matrix containing only the 
#                                $k ranked samples (cui pairs)
sub grabKHighestRankedSamples {
    #arguments, in order: 
    #  $k <- the number of samples to get
    #  $assocScoresRef <- a ref to a hash of scores, keys are 'cui1,cui2'
    #  $matrixRef <- a ref to the sparse matrix the values are copied from
    #returns a ref to a new sparse matrix containing the $k highest scoring
    # cui pairs. Pairs that tie with the score of the kth pair are all 
    # included, so more than $k pairs may be returned when there are ties.
    # If $k is larger than the number of scored pairs, all pairs are 
    # returned. If $k is less than 1, or there are no scores, an empty 
    # matrix is returned.
    my $k = shift;
    my $assocScoresRef = shift;
    my $matrixRef = shift;
    print "getting $k highest ranked samples\n";

    my %thresholdedMatrix = ();

    #get the keys sorted by value in descending order
    my @sortedKeys = sort { $assocScoresRef->{$b} <=> $assocScoresRef->{$a} } keys(%$assocScoresRef);

    #nothing can be selected if no samples are requested or none exist
    # (without this check $k = 0 would index the last, lowest ranked, key 
    #  and every sample would be returned)
    my $numSamples = scalar @sortedKeys;
    if ($k < 1 || $numSamples < 1) {
	return \%thresholdedMatrix;
    }

    #the threshold is the score of the kth ranked sample, or of the lowest
    # ranked sample if fewer than k samples exist
    my $thresholdIndex = ($k > $numSamples) ? $numSamples-1 : $k-1;
    my $threshold =  ${$assocScoresRef}{$sortedKeys[$thresholdIndex]};
    print " threshold = $threshold\n";

    #add keys to the thresholded matrix, highest ranked first
    my ($cui1, $cui2);
    foreach my $key (@sortedKeys) {

	#stop adding keys when below the threshold. This check must come 
	# before the key is added, otherwise the first sample below the 
	# threshold is included as well
	if (${$assocScoresRef}{$key} < $threshold) {
	    last;
	}

	($cui1, $cui2) = split(/,/, $key);

	#create new hash at rowkey location (if needed)
	if (!(exists $thresholdedMatrix{$cui1})) {
	    my %newHash = ();
	    $thresholdedMatrix{$cui1} = \%newHash;
	}

	#set key value for the key pair
	${$thresholdedMatrix{$cui1}}{$cui2} = ${${$matrixRef}{$cui1}}{$cui2};
    }

    #return the thresholded matrix
    return \%thresholdedMatrix;
}

samples/runSample.pl  view on Meta::CPAN

#Demo file, showing how to run open discovery using the sample data, and how 
# to perform time slicing evaluation using the sample data.
# The relative paths used below (../utils/..., lbdConfig, timeSlicingConfig)
# are resolved against the current working directory, so this script 
# presumably needs to be run from within the samples directory.

# run a sample lbd using the parameters in the lbd configuration file
# NOTE: the backticks capture the standard output of the command, and the 
# captured text is discarded here, so nothing from runDiscovery.pl is 
# echoed by this call
print "\n           OPEN DISCOVERY          \n";
`perl ../utils/runDiscovery.pl lbdConfig`;
print "LBD Open discovery results output to sampleOutput\n\n";

# run a sample time slicing
# first remove the co-occurrences of the precutoff matrix (in this case it is 
# the sampleExplicitMatrix) from the post cutoff matrix. This generates a gold 
# standard discovery matrix from which time slicing may be performed.
# This requires modifying removeExplicit.pl, which we have done for you. 
# The variables for this example in removeExplicit.pl are:
#  my $matrixFileName = 'sampleExplicitMatrix';
#  my $squaredMatrixFileName = 'postCutoffMatrix';
#  my $outputFileName = 'sampleGoldMatrix';
# (the step is left commented out below, so it is not run by this demo)
#`perl ../utils/datasetCreator/removeExplicit.pl`;

# next, run time slicing 
# the standard output of runDiscovery.pl is redirected by the shell into 
# the file sampleTimeSliceOutput
print "          TIME SLICING          \n";
`perl ../utils/runDiscovery.pl timeSlicingConfig > sampleTimeSliceOutput`;
print "LBD Time Slicing results output to sampleTimeSliceOutput\n";

utils/datasetCreator/removeExplicit.pl  view on Meta::CPAN

#removes the explicit co-occurrence matrix from the squared explicit 
# co-occurrence matrix. This generates a gold standard true discovery file
# NOTE: the file names are hard coded below, edit them to run on other data.
# As configured here, the sample explicit matrix is removed from the sample 
# post cutoff matrix, and the result is written to the sample gold matrix.

#the explicit co-occurrence matrix (the knowledge to remove)
my $matrixFileName = '../../samples/sampleExplicitMatrix';
#the matrix that the explicit knowledge is removed from
my $squaredMatrixFileName = '../../samples/postCutoffMatrix';
#where the resulting matrix is written
my $outputFileName = '../../samples/sampleGoldMatrix';

#run the removal using the file names above
&removeExplicit($matrixFileName, $squaredMatrixFileName, $outputFileName);

###############################
###############################

#removes explicit knowledge ($matrixFileName) from the implicit 
# knowledge ($squaredMatrixFileName)
sub removeExplicit {



( run in 0.769 second using v1.01-cache-2.11-cpan-5a3173703d6 )