ALBD
view release on metacpan or search on metacpan
lib/LiteratureBasedDiscovery/TimeSlicing.pm view on Meta::CPAN
# the only rows that are needed.
# input: $startingMatrixRef <- a ref to the starting sparse matrix
# $explicitMatrixRef <- a ref to the explicit sparse matrix
# $postCutoffFileName <- the filename to the postCutoffMatrix
# output: \%postCutoffMatrix <- a ref to the postCutoff sparse matrix
sub loadPostCutOffMatrix {
    my $startingMatrixRef = shift;
    my $explicitMatrixRef = shift;
    my $postCutoffFileName = shift;
    print "loading postCutoff Matrix\n";

    # Open the post cutoff file read-only. The three-argument form with a
    # lexical handle keeps the file name from being interpreted as an open
    # mode and avoids clobbering a global bareword handle.
    open(my $postCutoffFh, '<', $postCutoffFileName)
        or die ("ERROR: cannot open post cutoff file: $postCutoffFileName ($!)");

    # Read in the values of the post cutoff matrix for the start terms.
    # Each line is expected to hold three tab separated fields:
    # row cui, column cui, value.
    my %postCutoffMatrix = ();
    while (my $line = <$postCutoffFh>) {
        chomp $line;
        my ($cui1, $cui2, $val) = split(/\t/, $line);

        # blank lines yield no fields at all; skip them quietly
        next if !defined $cui1;

        # only rows whose key is a row of the starting matrix are kept
        next if !exists ${$startingMatrixRef}{$cui1};

        # the row is created as soon as a start term is seen, even if none
        # of its columns end up passing the vocabulary check below
        if (!defined $postCutoffMatrix{$cui1}) {
            $postCutoffMatrix{$cui1} = {};
        }

        # check to ensure that the column cui is in the vocabulary of the
        # pre-cutoff dataset. It is impossible to make predictions of words
        # that don't already exist.
        # NOTE: this assumes $explicitMatrixRef is a square matrix
        # (so unordered)
        if (defined $cui2 && exists ${$explicitMatrixRef}{$cui2}) {
            ${$postCutoffMatrix{$cui1}}{$cui2} = $val;
        }
    }
    close $postCutoffFh;

    # return the post cutoff matrix
    return \%postCutoffMatrix;
}
# Generates the starting matrix, either from the CUIs listed in a file
# (when the 'cuiListFileName' option exists) or from $numRows randomly
# selected rows of acceptable semantic types.
# TODO: numRows should be read from file and sent with the lbdOptionsRef
# input:  $explicitMatrixRef <- a ref to the explicit sparse matrix
#         $lbdOptionsRef <- the LBD options
#         $startTermAcceptTypesRef <- a reference to a hash of accept
#                                     types for start terms (TUIs)
#         $numRows <- the number of random rows to load (if random)
#         $umls_interface <- an instance of the UMLS::Interface
# output: \%startingMatrix <- a ref to the starting sparse matrix
sub generateStartingMatrix {
    my $explicitMatrixRef = shift;
    my $lbdOptionsRef = shift;
    my $startTermAcceptTypesRef = shift;
    my $numRows = shift;
    my $umls_interface = shift;

    # generate the starting matrix randomly or from a file
    my %startingMatrix = ();

    # check if a file is defined
    if (exists ${$lbdOptionsRef}{'cuiListFileName'}) {
        # grab the rows defined by the cuiListFile; rows are shared with
        # (not copied from) the explicit matrix
        my $cuisRef = loadCUIs(${$lbdOptionsRef}{'cuiListFileName'});
        foreach my $cui (keys %{$cuisRef}) {
            if (exists ${$explicitMatrixRef}{$cui}) {
                $startingMatrix{$cui} = ${$explicitMatrixRef}{$cui};
            }
            else {
                print STDERR "WARNING: CUI from cuiListFileName is not in explicitMatrix: $cui\n";
            }
        }
    }
    else {
        # randomly grab rows
        # apply semantic filter to the rows (just retrieve appropriate rows)
        my $rowsToKeepRef = getRowsOfSemanticTypes(
            $explicitMatrixRef, $startTermAcceptTypesRef, $umls_interface);
        my $numCandidates = scalar keys %{$rowsToKeepRef};
        ($numCandidates >= $numRows) or die("ERROR: number of acceptable rows starting terms is less than $numRows\n");

        # randomly select $numRows distinct row indices in the range
        # 0 .. (number of acceptable rows - 1). Duplicates collapse in the
        # hash, so the loop runs until enough distinct indices are drawn.
        my %rowNumbers = ();
        while ((scalar keys %rowNumbers) < $numRows) {
            $rowNumbers{int(rand($numCandidates))} = 1;
        }

        # fill starting matrix with keys corresponding to the random numbers
        my $i = 0;
        foreach my $key (keys %{$rowsToKeepRef}) {
            if (exists $rowNumbers{$i}) {
                $startingMatrix{$key} = ${$explicitMatrixRef}{$key};
            }
            $i++;
        }

        # output the cui list if needed
        if (exists ${$lbdOptionsRef}{'cuiListOutputFile'}) {
            my $cuiListOutputFile = ${$lbdOptionsRef}{'cuiListOutputFile'};

            # three-argument open with a lexical handle so the file name
            # can never be interpreted as part of the open mode
            open(my $cuiListFh, '>', $cuiListOutputFile)
                or die ("ERROR: cannot open cuiListOutputFile:".$cuiListOutputFile." ($!)\n");
            foreach my $cui (keys %startingMatrix) {
                print {$cuiListFh} "$cui\n";
            }

            # buffered write errors only surface when the handle is closed
            close($cuiListFh)
                or die ("ERROR: cannot close cuiListOutputFile:".$cuiListOutputFile." ($!)\n");
        }
    }

    # return the starting matrix
    return \%startingMatrix;
}
# Gets and returns a hash of the row keys of the specified semantic types.
# input:  $matrixRef <- a ref to a sparse matrix
#         $acceptTypesRef <- a ref to a hash of accept types (TUIs)
#         $umls <- an instance of UMLS::Interface
# output: \%rowsToKeep <- a ref to hash of rows to keep, each key is
#                         a CUI, and values are 1. All CUIs specify rows
#                         of acceptable semantic types
sub getRowsOfSemanticTypes {
    my $matrixRef = shift;
    my $acceptTypesRef = shift;
    my $umls = shift;

    # loop through the matrix and keep the rows that are of the
    # desired semantic types
    my %rowsToKeep = ();
    foreach my $cui1 (keys %{$matrixRef}) {
        my $typesRef = $umls->getSt($cui1);
        foreach my $type (@{$typesRef}) {
            # The accept hash is keyed by the type exactly as getSt returns
            # it, so no abbreviation lookup is needed to make the decision.
            if (exists ${$acceptTypesRef}{$type}) {
                $rowsToKeep{$cui1} = 1;
                # one acceptable type is enough to keep the row
                last;
            }
        }
    }

    # return the rowsToKeep
    return \%rowsToKeep;
}
# Builds a hash of association scores for every co-occurring CUI pair in
# the matrix. Keys are the strings "$rowKey,$colKey"; values start out as
# the matrix value of the pair and, unless the ranking measure is
# 'frequency', are then updated by Rank::getBatchAssociationScores.
# input:  $matrixRef <- a reference to a sparse matrix
#         $rankingMeasure <- a string specifying the ranking measure to use
#         $umls_association <- an instance of UMLS::Association
# output: \%scoreOfPair <- a ref to a hash of CUI pairs and their
#                          association. Each key of the hash is a comma
#                          separated string containing cui1 and cui2 of the
#                          pair (e.g. 'cui1,cui2'), and each value is their
#                          association score using the specified
#                          association measure
sub getAssociationScores {
    my ($matrixRef, $rankingMeasure, $umls_association) = @_;
    print " getting Association Scores, rankingMeasure = $rankingMeasure\n";

    # flatten the sparse matrix into "row,col" => value entries
    my %scoreOfPair;
    print " generating association scores:\n";
    for my $row (keys %{$matrixRef}) {
        for my $col (keys %{ $matrixRef->{$row} }) {
            $scoreOfPair{"$row,$col"} = $matrixRef->{$row}{$col};
        }
    }

    # for 'frequency' the raw matrix values already are the scores; any
    # other measure updates the hash values in place
    if ($rankingMeasure ne 'frequency') {
        Rank::getBatchAssociationScores(\%scoreOfPair, $matrixRef, $rankingMeasure, $umls_association);
    }
    return \%scoreOfPair;
}
# gets the min and max value of a hash
# returns a two element array, where the first value is the min, and
# the second value is the max
# input: $hashref <- a reference to a hash with numbers as values
# output: ($min, $max) <- the minimum and maximum values in the hash
sub getMinMax {
my $hashRef = shift;
#loop through each key and record the min/max
my $min = 999999;
my $max = -999999;
foreach my $key (keys %{$hashRef}) {
( run in 1.426 second using v1.01-cache-2.11-cpan-df04353d9ac )