view release on metacpan or search on metacpan
config/interface
config/interfaceConfig
config/lbd
lib/ALBD.pm
lib/LiteratureBasedDiscovery/Discovery.pm
lib/LiteratureBasedDiscovery/Evaluation.pm
lib/LiteratureBasedDiscovery/Filters.pm
lib/LiteratureBasedDiscovery/Rank.pm
lib/LiteratureBasedDiscovery/TimeSlicing.pm
samples/lbdConfig
samples/postCutoffMatrix
samples/runSample.pl
samples/sampleExplicitMatrix
samples/sampleGoldMatrix
samples/timeSliceCuiList
samples/timeSlicingConfig
samples/configFileSamples/UMLSAssociationConfig
samples/configFileSamples/UMLSInterfaceConfig
samples/configFileSamples/UMLSInterfaceInternalConfig
t/test.t
t/goldSampleOutput
generates co-occurrence files where order does matter, so the sentence
'cui1 cui2' will only mark a co-occurrence between cui1 and cui2, but
not between cui2 and cui1).
removeCUIPair.pl -- removes all occurrences of the specified CUI pair
from the co-occurrence matrix
removeExplicit.pl -- removes any keys that occur in an explicit
co-occurrence matrix from another co-occurrence matrix (typically the
squared explicit co-occurrence matrix itself, which generates a
prediction matrix, or the post cutoff matrix used in time slicing to
generate a gold standard file)
testMatrixEquality.pl -- checks to see if two co-occurrence matrix files
contain the same data
Also included are several subfolders with more specific purposes. Within
the dataStats subfolder are scripts to collect various statistics about
the co-occurrence matrices used in LBD. These scripts include:
getCUICooccurrences.pl -- a data statistics file that gets the number of
lib/ALBD.pm view on Meta::CPAN
#create the starting matrix
my $startingMatrixRef
= TimeSlicing::generateStartingMatrix($explicitMatrixRef, \%lbdOptions, $startAcceptTypesRef, $NUM_SAMPLES, $umls_interface);
#get association scores for the starting matrix
my $assocScoresRef = TimeSlicing::getAssociationScores(
$startingMatrixRef, $lbdOptions{'rankingMeasure'}, $umls_association);
my ($min, $max) = TimeSlicing::getMinMax($assocScoresRef);
my $range = $max-$min;
#load the post cutoff matrix for the necessary rows
my $postCutoffMatrixRef
= TimeSlicing::loadPostCutOffMatrix($startingMatrixRef, $explicitMatrixRef, $lbdOptions{'postCutoffFileName'});
#apply a semantic type filter to the post cutoff matrix
if ((scalar keys %{$targetAcceptTypesRef}) > 0) {
Filters::semanticTypeFilter_columns(
$postCutoffMatrixRef, $targetAcceptTypesRef, $umls_interface);
}
#apply a threshold at $numIntervals% intervals to generate an 11 point
# interpolated precision/recall curve for linking term ranking/thresholding
#stats for collecting info about predicted vs. true
my $predictedAverage = 0;
my $trueAverage = 0;
my $trueMin = 99999;
my $trueMax = -999999;
my $predictedMin = 999999;
lib/ALBD.pm view on Meta::CPAN
= Discovery::removeExplicit($startingMatrixRef, $implicitMatrixRef);
#apply a semantic type filter to the implicit matrix
if ((scalar keys %{$targetAcceptTypesRef}) > 0) {
Filters::semanticTypeFilter_columns(
$implicitMatrixRef, $targetAcceptTypesRef, $umls_interface);
}
#calculate precision and recall
my ($precision, $recall) = TimeSlicing::calculatePrecisionRecall(
$implicitMatrixRef, $postCutoffMatrixRef);
print "precision = $precision, recall = $recall\n";
#calculate averages/min/max only for $i= $numIntervals, which is all terms
if ($i == $numIntervals) {
#average over all terms
foreach my $rowKey(keys %{$implicitMatrixRef}) {
#get the counts true and predicted for this term (row of matrix)
my $numPredicted = scalar keys %{${$implicitMatrixRef}{$rowKey}};
my $numTrue = scalar keys %{${$postCutoffMatrixRef}{$rowKey}};
#sum counts
$predictedAverage += $numPredicted;
$trueAverage += $numTrue;
#update min and max
if ($numPredicted < $predictedMin) {
$predictedMin = $numPredicted;
}
if ($numPredicted > $predictedMax) {
lib/ALBD.pm view on Meta::CPAN
#--------
# Gold Loading/Creation
#--------
#load or create the gold matrix
my $goldMatrixRef;
if (exists $lbdOptions{'goldInputFile'}) {
print "inputting gold\n";
$goldMatrixRef = Discovery::fileToSparseMatrix($lbdOptions{'goldInputFile'});
}
else {
print "loading post cutoff\n";
$goldMatrixRef = TimeSlicing::loadPostCutOffMatrix($startingMatrixRef, $explicitMatrixRef, $lbdOptions{'postCutoffFileName'});
#remove explicit knowledge from the post cutoff matrix
$goldMatrixRef = Discovery::removeExplicit($startingMatrixRef, $goldMatrixRef);
#apply a semantic type filter to the post cutoff matrix
print "applying semantic filter to post-cutoff matrix\n";
if ((scalar keys %{$targetAcceptTypesRef}) > 0) {
Filters::semanticTypeFilter_columns(
$goldMatrixRef, $targetAcceptTypesRef, $umls_interface);
}
#TODO why is the gold matrix outputting with an extra line between samples?
#output the gold matrix
if (exists $lbdOptions{'goldOutputFile'}) {
print "outputting gold\n";
Discovery::outputMatrixToFile($lbdOptions{'goldOutputFile'}, $goldMatrixRef);
lib/LiteratureBasedDiscovery/Evaluation.pm view on Meta::CPAN
# ALBD::Evaluation.pm
#
# Provides functionality to evaluate LBD systems
# Key components are:
# Results Matrix <- all new knowledge generated by an LBD system (e.g.
# all proposed discoveries of a system from pre-cutoff
# data).
# Gold Standard Matrix <- the gold standard knowledge matrix (e.g. all
# knowledge present in the post-cutoff dataset
# that is not present in the pre-cutoff dataset).
#
# Copyright (c) 2017
#
# Sam Henry
# henryst at vcu.edu
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
lib/LiteratureBasedDiscovery/TimeSlicing.pm view on Meta::CPAN
$cuis{$line} = 1;
}
}
close IN;
return \%cuis;
}
# calculates average precision and recall of the generated implicit matrix
# compared to the post cutoff matrix
# input: $predictionsMatrixRef <- a ref to a sparse matrix of predicted
# discoveries
# $trueMatrixRef <- a ref to a sparse matrix of true discoveries
# output: ($precision, $recall) <- two scalar values specifying the precision
# and recall
sub calculatePrecisionRecall {
my $predictionsMatrixRef = shift; #a matrix of predicted discoveries
my $trueMatrixRef = shift; #a matrix of true discoveries
print "calculating precision and recall\n";
lib/LiteratureBasedDiscovery/TimeSlicing.pm view on Meta::CPAN
($truePositive/($truePositive+$falsePositive));
} #else precision += 0 ... nothing needs to be done
if ((scalar keys %{${$trueMatrixRef}{$rowKey}}) > 0) {
$recall +=
($truePositive/
(scalar keys %{${$trueMatrixRef}{$rowKey}}));
} #else recall += 0
}
#calculate the averages (divide by the number of rows
# = the number of terms in the post cutoff matrix)
$precision /= scalar keys %{$trueMatrixRef};
$recall /= scalar keys %{$trueMatrixRef};
#return the average precision and recall
return ($precision, $recall);
}
# loads the post cutoff matrix from file. Only loads rows corresponding
# to rows in the starting matrix ref to save memory, and because those are
# the only rows that are needed.
# Each line of the file is read as three tab-separated fields:
#     cui1 <TAB> cui2 <TAB> value
# A row is created for every start term that appears as cui1 in the file,
# even when none of its columns pass the vocabulary check below.
# input:  $startingMatrixRef <- a ref to the starting sparse matrix
#         $explicitMatrixRef <- a ref to the explicit sparse matrix
#         $postCutoffFileName <- the filename to the postCutoffMatrix
# output: \%postCutoffMatrix <- a ref to the postCutoff sparse matrix
# dies if the post cutoff file cannot be opened
sub loadPostCutOffMatrix {
    my $startingMatrixRef = shift;
    my $explicitMatrixRef = shift;
    my $postCutoffFileName = shift;
    print "loading postCutoff Matrix\n";

    #open the post cutoff file. The 3-argument form with a lexical handle
    # keeps characters in the file name from being read as an open mode,
    # and avoids clobbering a global IN handle used elsewhere
    open(my $postCutoffFh, '<', $postCutoffFileName)
        or die ("ERROR: cannot open post cutoff file: $postCutoffFileName: $!");

    #read in values of the post cutoff matrix for the start terms
    my %postCutoffMatrix = ();
    while (my $line = <$postCutoffFh>) {
        #grab values from the line
        chomp $line;
        my ($cui1, $cui2, $val) = split(/\t/, $line);

        #blank lines produce no fields at all; nothing to read in
        next if !defined $cui1;

        #only rows of the starting matrix are needed
        next if !exists ${$startingMatrixRef}{$cui1};

        #create the row the first time this start term is seen
        if (!(defined $postCutoffMatrix{$cui1})) {
            $postCutoffMatrix{$cui1} = {};
        }

        #check to ensure that the column cui is in the
        # vocabulary of the pre-cutoff dataset.
        # it is impossible to make predictions of words that
        # don't already exist
        #NOTE: this assumes $explicitMatrixRef is a square
        # matrix (so unordered)
        if (defined $cui2 && exists ${$explicitMatrixRef}{$cui2}) {
            ${$postCutoffMatrix{$cui1}}{$cui2} = $val;
        }
    }
    close $postCutoffFh;

    #return the post cutoff matrix
    return \%postCutoffMatrix;
}
#TODO numRows should be read from file and sent with the lbdOptionsRef
# generates a starting matrix of numRows randomly selected terms
# input: $explicitMatrixRef <- a ref to the explicit sparse matrix
# $lbdOptionsRef <- the LBD options
# $startTermAcceptTypesRef <- a reference to an hash of accept
# types for start terms (TUIs)
# $numRows <- the number of random rows to load (if random)
# $umls_interface <- an instance of the UMLS::Interface
lib/LiteratureBasedDiscovery/TimeSlicing.pm view on Meta::CPAN
# corresponds to the assocScoresRef
# output: \%thresholdedMatrix < a ref to a new matrix, built from the
# $matrixRef after applying the $threshold
sub applyThreshold {
    # Builds a new sparse matrix holding only the cui pairs whose association
    # score is at or above the threshold.
    #   $threshold      <- the minimum score a pair needs in order to be kept
    #   $assocScoresRef <- hash ref of 'cui1,cui2' => score
    #   $matrixRef      <- sparse matrix the kept values are copied from
    # returns a ref to the newly built matrix; values are copied from
    # $matrixRef, keyed by the two halves of each score key
    my ($threshold, $assocScoresRef, $matrixRef) = @_;

    #select the score keys that meet the threshold
    my @passingKeys = grep { $assocScoresRef->{$_} >= $threshold }
                      keys %{$assocScoresRef};

    #copy the matching cells into a fresh matrix
    my %kept;
    for my $pairKey (@passingKeys) {
        my ($rowCui, $colCui) = split(/,/, $pairKey);

        #make sure the row exists before filling it
        $kept{$rowCui} = {} unless exists $kept{$rowCui};
        $kept{$rowCui}{$colCui} = $matrixRef->{$rowCui}{$colCui};
    }

    return \%kept;
}
# Grabs the K highest ranked samples. This is for thresholding based the number
# of samples. Used in explicit timeslicing
# input: $k <- the number of samples to get
lib/LiteratureBasedDiscovery/TimeSlicing.pm view on Meta::CPAN
# output: \%thresholdedMatrix <- a ref to a sparse matrix containing only the
# $k ranked samples (cui pairs)
sub grabKHighestRankedSamples {
    # Keeps the $k highest scoring cui pairs. The score of the k-th ranked
    # pair is used as a threshold, so pairs tied with it are kept as well
    # and the result may hold more than $k pairs.
    #   $k              <- the number of samples to get
    #   $assocScoresRef <- hash ref of 'cui1,cui2' => score
    #   $matrixRef      <- sparse matrix the kept values are copied from
    # returns a ref to a new sparse matrix containing only the selected
    # pairs. The matrix is empty when $k < 1 or there are no scores, and
    # holds every pair when $k exceeds the number of scored pairs.
    my $k = shift;
    my $assocScoresRef = shift;
    my $matrixRef = shift;
    print "getting $k highest ranked samples\n";

    my %thresholdedMatrix = ();

    #get the keys sorted by value in descending order
    my @sortedKeys = sort { $assocScoresRef->{$b} <=> $assocScoresRef->{$a} }
                     keys %{$assocScoresRef};

    #nothing to select. Without this guard $k == 0 would index the last
    # (lowest) element and select everything
    if ($k < 1 || scalar @sortedKeys == 0) {
        return \%thresholdedMatrix;
    }

    #the threshold is the score of the k-th ranked pair, or of the lowest
    # ranked pair when fewer than $k pairs are available
    my $thresholdIndex = ($k > scalar @sortedKeys) ? $#sortedKeys : $k - 1;
    my $threshold = ${$assocScoresRef}{$sortedKeys[$thresholdIndex]};
    print " threshold = $threshold\n";

    #add keys in rank order until the scores fall below the threshold
    foreach my $key (@sortedKeys) {
        #stop BEFORE adding a key that is below the threshold, otherwise the
        # first below-threshold pair leaks into the result
        if (${$assocScoresRef}{$key} < $threshold) {
            last;
        }

        my ($cui1, $cui2) = split(/,/, $key);

        #create new hash at rowkey location (if needed)
        if (!(exists $thresholdedMatrix{$cui1})) {
            $thresholdedMatrix{$cui1} = {};
        }

        #set key value for the key pair
        ${$thresholdedMatrix{$cui1}}{$cui2} = ${${$matrixRef}{$cui1}}{$cui2};
    }

    #return the thresholded matrix
    return \%thresholdedMatrix;
}
samples/runSample.pl view on Meta::CPAN
#Demo file, showing how to run open discovery using the sample data, and how
# to perform time slicing evaluation using the sample data
#NOTE: the commands below use relative paths (../utils/..., lbdConfig,
# timeSlicingConfig), so they resolve only when this script is run from the
# directory that contains those config files

# run a sample lbd using the parameters in the lbd configuration file
print "\n OPEN DISCOVERY \n";
#the command's standard output is captured by the backticks and discarded
`perl ../utils/runDiscovery.pl lbdConfig`;
#$? holds the wait status of the command; only report success when it is 0
if ($? == 0) {
    print "LBD Open discovery results output to sampleOutput\n\n";
}
else {
    warn "WARNING: open discovery run failed (wait status $?)\n";
}

# run a sample time slicing
# first remove the co-occurrences of the precutoff matrix (in this case it is
# the sampleExplicitMatrix) from the post cutoff matrix. This generates a gold
# standard discovery matrix from which time slicing may be performed
# This requires modifying the removeExplicit.pl, which we have done for you.
# The variables for this example in removeExplicit.pl are:
# my $matrixFileName = 'sampleExplicitMatrix';
# my $squaredMatrixFileName = 'postCutoffMatrix';
# my $outputFileName = 'sampleGoldMatrix';
#`perl ../utils/datasetCreator/removeExplicit.pl`;

# next, run time slicing
print " TIME SLICING \n";
#the shell redirect inside the command writes the results file
`perl ../utils/runDiscovery.pl timeSlicingConfig > sampleTimeSliceOutput`;
if ($? == 0) {
    print "LBD Time Slicing results output to sampleTimeSliceOutput\n";
}
else {
    warn "WARNING: time slicing run failed (wait status $?)\n";
}
utils/datasetCreator/removeExplicit.pl view on Meta::CPAN
#removes the explicit co-occurrence matrix from another co-occurrence matrix
# (the variable is named for the squared explicit co-occurrence matrix, but
# as configured below it is the sample post cutoff matrix). With these
# settings the output is the sample gold standard true discovery file
my $matrixFileName = '../../samples/sampleExplicitMatrix';
my $squaredMatrixFileName = '../../samples/postCutoffMatrix';
my $outputFileName = '../../samples/sampleGoldMatrix';

#called with parentheses and without the '&' sigil, which is the
# conventional call form and works for a sub defined later in the file
removeExplicit($matrixFileName, $squaredMatrixFileName, $outputFileName);
###############################
###############################
#removes explicit knowledge ($matrixFileName) from the implicit
# knowledge ($squaredMatrixFileName)
sub removeExplicit {