lib/ALBD.pm view on Meta::CPAN
#references to other packages
my $umls_interface;
my $umls_association;
#####################################################
####################################################
# performs LBD
# input: none
# output: none, but a results file is written to disk
sub performLBD {
my $self = shift;
my $start; #used to record run times
#implicit matrix ranking requires a different set of procedures
if ($lbdOptions{'rankingProcedure'} eq 'implicitMatrix') {
$self->performLBD_implicitMatrixRanking();
return;
}
if (exists $lbdOptions{'targetCuis'}) {
$self->performLBD_closedDiscovery();
lib/ALBD.pm view on Meta::CPAN
#Done
print "DONE!\n\n";
}
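# For reference, a hedged sketch of the core open-discovery steps using
# routines defined in this distribution (the exact sequence inside
# performLBD may differ; $explicitMatrixRef is assumed to be a sparse
# co-occurrence matrix, e.g. loaded with Discovery::fileToSparseMatrix):
#   my $startCuisRef = $self->_getStartCuis();
#   my $startingMatrixRef = Discovery::getRows($startCuisRef, $explicitMatrixRef);
#   my $implicitMatrixRef = Discovery::findImplicit($explicitMatrixRef, $startingMatrixRef);
#   Discovery::removeExplicit($explicitMatrixRef, $implicitMatrixRef);
#   my $scoresRef = Rank::scoreImplicit_fromAllPairs($startingMatrixRef,
#       $explicitMatrixRef, $implicitMatrixRef, $lbdOptions{rankingMethod}, $umls_association);
#   my $ranksRef = Rank::rankDescending($scoresRef);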
#----------------------------------------------------------------------------
# performs LBD, closed discovery
# input: none
# output: none, but a results file is written to disk
sub performLBD_closedDiscovery {
my $self = shift;
my $start; #used to record run times
print "Closed Discovery\n";
print $self->_parametersToString();
#Get inputs
my $startCuisRef = $self->_getStartCuis();
my $targetCuisRef = $self->_getTargetCuis();
my $linkingAcceptTypesRef = $self->_getAcceptTypes('linking');
lib/ALBD.pm view on Meta::CPAN
# primarily memory or time constraints now, because this
# requires that the entire implicit matrix be computed. This can be done, but
# access to it is then slow, and supporting it would require a major redo of the code
#
=comment
# performs LBD, but using implicit matrix ranking schemes.
# Since the order of operations for those methods is slightly different,
# a new method has been created.
# input: none
# output: none, but a results file is written to disk
sub performLBD_implicitMatrixRanking {
my $self = shift;
my $start; #used to record run times
print $self->_parametersToString();
print "In Implicit Ranking\n";
#Get inputs
my $startCuisRef = $self->_getStartCuis();
my $linkingAcceptTypesRef = $self->_getAcceptTypes('linking');
my $targetAcceptTypesRef = $self->_getAcceptTypes('target');
print "startCuis = ".(join(',', @{$startCuisRef}))."\n";
lib/ALBD.pm view on Meta::CPAN
##################################################
################ Time Slicing ####################
##################################################
#NOTE: This function is not well tested and is currently very slow
# Generates precision and recall values by varying the threshold
# of the A->B ranking measure.
# input: none
# output: none, but precision and recall values are printed to STDOUT
sub timeSlicing_generatePrecisionAndRecall_explicit {
my $NUM_SAMPLES = 100; #TODO: read the number of samples to average over for time slicing from a file
my $self = shift;
print "In timeSlicing_generatePrecisionAndRecall\n";
my $numIntervals = 10;
#Get inputs
my $startAcceptTypesRef = $self->_getAcceptTypes('start');
my $linkingAcceptTypesRef = $self->_getAcceptTypes('linking');
my $targetAcceptTypesRef = $self->_getAcceptTypes('target');
lib/ALBD.pm view on Meta::CPAN
print "true - total, min, max, average = $trueTotal, $trueMin, $trueMax, $trueAverage\n";
}
# generates precision and recall values by varying the threshold
# of the A->C ranking measure. Also generates precision at k, and
# mean average precision
# input: none
# output: none, but precision, recall, precision at k, and MAP values
#         are output to STDOUT
sub timeSlicing_generatePrecisionAndRecall_implicit {
my $NUM_SAMPLES = 200; #TODO: read the number of samples to average over for time slicing from a file
my $self = shift;
my $start; #used to record run times
print "In timeSlicing_generatePrecisionAndRecall_implicit\n";
#Get inputs
my $startAcceptTypesRef = $self->_getAcceptTypes('start');
my $linkingAcceptTypesRef = $self->_getAcceptTypes('linking');
my $targetAcceptTypesRef = $self->_getAcceptTypes('target');
lib/ALBD.pm view on Meta::CPAN
}
##############################################################################
# functions to grab parameters and initialize all input
##############################################################################
# method to create a new LiteratureBasedDiscovery object
# input: $optionsHashRef <- a reference to an LBD options hash
# output: a new LBD object
sub new {
my $self = {};
my $className = shift;
my $optionsHashRef = shift;
bless($self, $className);
$self->_initialize($optionsHashRef);
return $self;
}
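# For example (this mirrors the usage in utils/runDiscovery.pl later in this
# distribution; the config file path is a placeholder):
#   my %options = ();
#   $options{'lbdConfig'} = 'lbd.config';
#   my $lbd = ALBD->new(\%options);
#   $lbd->performLBD();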
# Initializes everything needed for Literature Based Discovery
# input: $optionsHashRef <- reference to LBD options hash (command line input)
# output: none, but global parameters are set
sub _initialize {
my $self = shift;
my $optionsHashRef = shift;
#initialize UMLS::Interface
my %tHash = ();
$tHash{'t'} = 1; #default option t=1 (silences module output)
my $componentOptions = \%tHash;
if (${$optionsHashRef}{'interfaceConfig'} ne '') {
#read the configuration file if it's defined
$componentOptions =
lib/ALBD.pm view on Meta::CPAN
%lbdOptions = %{$self->_readConfigFile(${$optionsHashRef}{'lbdConfig'})};
}
# Reads the config file in as an options hash
# input: the name of a configuration file that has key fields in '<>'s.
#        The '>' is followed directly by the value for that key (no space).
#        Each line of the file contains one key-value pair (e.g. <key>value).
#        If no value is provided, a default value of 1 is set
# output: a hash ref to a hash containing each key value pair
sub _readConfigFile {
my $self = shift;
my $configFileName = shift;
#read in all options from the config file
open IN, $configFileName or die("Error: Cannot open config file: $configFileName\n");
my %optionsHash = ();
my $firstChar;
while (my $line = <IN>) {
#check if it's a comment or blank line
$firstChar = substr $line, 0, 1;
lib/ALBD.pm view on Meta::CPAN
}
}
close IN;
return \%optionsHash;
}
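# For example, a config file of the form described above might contain
# (the key names appear elsewhere in this module; the values here are
# illustrative only):
#   <rankingProcedure>implicitMatrix
#   <startCuis>C0021665,C0003765
#   <targetCuis>C0011860
# A key given with no value (e.g. a bare <verbose> line, hypothetical name)
# would be stored with a default value of 1.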
# transforms the string of start cuis to an array
# input: none
# output: an array ref of CUIs
sub _getStartCuis {
my $self = shift;
my @startCuis = split(',',$lbdOptions{'startCuis'});
return \@startCuis;
}
# transforms the string of target cuis to an array
# input: none
# output: an array ref of CUIs
sub _getTargetCuis {
my $self = shift;
my @targetCuis = split(',',$lbdOptions{'targetCuis'});
return \@targetCuis;
}
# transforms the string of accept types or groups into a hash of accept TUIs
# input: a string specifying whether linking or target types are being defined
# output: a hash of acceptable TUIs
sub _getAcceptTypes {
my $self = shift;
my $stepString = shift; #either 'linking' or 'target'
#get the accept types
my %acceptTypes = ();
#add all types for groups specified
my $string = $stepString.'AcceptGroups';
if (defined $lbdOptions{$string}) {
#accept groups were specified
lib/ALBD.pm view on Meta::CPAN
##############################################################################
# function to produce output
##############################################################################
# outputs the implicit terms to string
# input: $scoresRef <- a reference to a hash of scores (hash{CUI}=score)
# $ranksRef <- a reference to an array of CUIs ranked by their score
# $printTo <- optional, outputs the $printTo top ranked terms. If not
# specified, all terms are output
# output: a newline-separated string containing ranked terms, scores, and
#         their preferred terms
sub _rankedTermsToString {
my $self = shift;
my $scoresRef = shift;
my $ranksRef = shift;
my $printTo = shift;
#set printTo
if (!$printTo) {
$printTo = scalar @{$ranksRef};
}
lib/ALBD.pm view on Meta::CPAN
$string .= "$name\n";
}
#return the string of ranked terms
return $string;
}
# converts the current object's parameters to a string
# input : none
# output: a string of parameters that were used for LBD
sub _parametersToString {
my $self = shift;
#LBD options
my $paramsString = "Parameters:\n";
foreach my $key (sort keys %lbdOptions) {
$paramsString .= "$key -> $lbdOptions{$key}\n";
}
$paramsString .= "\n";
return $paramsString;
#association options? TODO
#interface options? TODO
}
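# For example, if %lbdOptions contains startCuis => 'C0021665' and
# targetCuis => 'C0003765', the returned string would contain:
#   Parameters:
#   startCuis -> C0021665
#   targetCuis -> C0003765
# followed by a blank line.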
# returns the version currently being used
# input : none
# output: the version number being used
sub version {
my $self = shift;
return $VERSION;
}
##############################################################################
# functions for debugging
##############################################################################
=comment
sub debugLBD {
my $self = shift;
my $startingCuisRef = shift;
print "Starting CUIs = ".(join(',', @{$startingCuisRef}))."\n";
#Get the Explicit Matrix
my ($explicitMatrixRef, $cuiToIndexRef, $indexToCuiRef, $matrixSize) =
Discovery::tableToSparseMatrix('N_11', $cuiFinder);
print "Explicit Matrix:\n";
_printMatrix($explicitMatrixRef, $matrixSize, $indexToCuiRef);
lib/ALBD.pm view on Meta::CPAN
#Test other rank methods
my $scoresRef = Rank::scoreImplicit_fromAllPairs($startingMatrixRef, $explicitMatrixRef, $implicitMatrixRef, $lbdOptions{rankingMethod}, $umls_association);
my $ranksRef = Rank::rankDescending($scoresRef);
print "Scores: \n";
foreach my $cui (keys %{$scoresRef}) {
print " scores{$cui} = ${$scoresRef}{$cui}\n";
}
print "Ranks = ".join(',', @{$ranksRef})."\n";
}
sub _printMatrix {
my $matrixRef = shift;
my $matrixSize = shift;
my $indexToCuiRef = shift;
for (my $i = 0; $i < $matrixSize; $i++) {
my $index1 = ${$indexToCuiRef}{$i};
for (my $j = 0; $j < $matrixSize; $j++) {
my $printed = 0;
my $index2 = ${$indexToCuiRef}{$j};
my $hash1Ref = ${$matrixRef}{$index1};
lib/LiteratureBasedDiscovery/Discovery.pm view on Meta::CPAN
######################################################################
# Functions to perform Literature Based Discovery
######################################################################
# gets the rows of the cuis from the matrix
# input: $cuisRef <- an array reference to a list of CUIs
# $matrixRef <- a reference to a co-occurrence matrix
# output: a hash ref to a sparse matrix containing just the rows retrieved
sub getRows {
my $cuisRef = shift;
my $matrixRef = shift;
my %rows = ();
my $rowRef;
#add each cui row to the starting matrix
foreach my $cui(@{$cuisRef}) {
#if there is a row for this cui
if (exists ${$matrixRef}{$cui}) {
$rowRef = ${$matrixRef}{$cui};
lib/LiteratureBasedDiscovery/Discovery.pm view on Meta::CPAN
#NOTE: this calculates B*A. Is that appropriate? I think it is; B*A is nice because it means the implicit matrix does not keep track of non-starting cui rows, but whether the resulting values are appropriate is still an open question.
# finds the implicit connections for all CUIs (based on squaring)
# It does this by multiplying $matrixB*$matrixA. If $matrixB is the starting
# matrix, and $matrixA is the explicit matrix, this method works correctly and
# efficiently. $matrixA and $matrixB may also both be the explicit matrix, but
# this is less efficient.
# input: $matrixARef <- ref to a sparse matrix
# $matrixBRef <- ref to a sparse matrix
# output: ref to a sparse matrix of the product of B*A
sub findImplicit {
my $matrixARef = shift;
my $matrixBRef = shift;
my %product = ();
#loop over the rows of the B matrix
foreach my $key0 (keys %{$matrixBRef}) {
#loop over row
foreach my $key1 (keys %{$matrixARef}) {
lib/LiteratureBasedDiscovery/Discovery.pm view on Meta::CPAN
return \%product;
}
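# A minimal sketch of the elided accumulation step (hedged; this is the
# standard hash-of-hashes sparse product for product = B*A, and the actual
# elided loop body may be organized differently):
#   foreach my $key0 (keys %{$matrixBRef}) {                    #rows of B
#       foreach my $key1 (keys %{${$matrixBRef}{$key0}}) {      #cols of B / rows of A
#           next unless exists ${$matrixARef}{$key1};
#           foreach my $key2 (keys %{${$matrixARef}{$key1}}) {  #cols of A
#               ${$product{$key0}}{$key2} +=
#                   ${${$matrixBRef}{$key0}}{$key1} * ${${$matrixARef}{$key1}}{$key2};
#           }
#       }
#   }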
# removes explicit connections from the matrix of implicit connections by
# removing keys (O(k), where k is the number of keys in the explicit matrix,
# we expect the explicit k to be smaller than the implicit k)
# input: $explicitMatrixRef <- reference to the explicit knowledge matrix
# $implicitMatrixRef <- reference to the implicit knowledge matrix
# output: ref to the implicit matrix with explicit knowledge removed
sub removeExplicit {
my $explicitMatrixRef = shift;
my $implicitMatrixRef = shift;
#Check each key of the explicit matrix to see if it exists
# in the implicit matrix
foreach my $key1(keys %{$explicitMatrixRef}) {
if (exists ${$implicitMatrixRef}{$key1}) {
foreach my $key2(keys %{${$explicitMatrixRef}{$key1}}) {
if (exists ${${$implicitMatrixRef}{$key1}}{$key2}) {
delete ${${$implicitMatrixRef}{$key1}}{$key2};
lib/LiteratureBasedDiscovery/Discovery.pm view on Meta::CPAN
}
}
return $implicitMatrixRef;
}
# loads a tab-separated file as a sparse matrix (a hash of hashes)
# each line of the file contains CUI1 <TAB> CUI2 <TAB> Count
# input: the filename containing the data
# output: a hash ref to the sparse matrix (${$hash{$index1}}{$index2} = value)
sub fileToSparseMatrix {
my $fileName = shift;
open IN, $fileName or die ("unable to open file: $fileName\n");
my %matrix = ();
my ($cui1,$cui2,$val);
while (my $line = <IN>) {
chomp $line;
($cui1,$cui2,$val) = split(/\t/,$line);
if (!exists $matrix{$cui1}) {
lib/LiteratureBasedDiscovery/Discovery.pm view on Meta::CPAN
}
close IN;
return \%matrix;
}
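# For example, given an input line (CUIs chosen only for illustration):
#   C0021665<TAB>C0003765<TAB>12
# the returned ref satisfies ${${$matrixRef}{'C0021665'}}{'C0003765'} == 12,
# matching the sparse matrix layout described above.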
# outputs the matrix to the output file in sparse matrix format, which
# is a file containing rowKey\tcolKey\tvalue
# input: $outFile - a string specifying the output file
# $matrixRef - a ref to the sparse matrix containing the data
# output: nothing, but the matrix is output to file
sub outputMatrixToFile {
my $outFile = shift;
my $matrixRef = shift;
#open the output file and output the matrix
open OUT, ">$outFile" or die ("Error opening matrix output file: $outFile\n");
my $rowRef;
foreach my $rowKey (keys %{$matrixRef}) {
$rowRef = ${$matrixRef}{$rowKey};
foreach my $colKey (keys %{$rowRef}) {
print OUT "$rowKey\t$colKey\t${$rowRef}{$colKey}\n";
lib/LiteratureBasedDiscovery/Discovery.pm view on Meta::CPAN
}
#Note: tableToSparseMatrix is no longer used, but could be useful in the future
=comment
# retrieve a table from MySQL and convert it to a sparse matrix (a hash of
# hashes)
# input : $tableName <- the name of the table to output
#         $cuiFinder <- an instance of UMLS::Interface::CuiFinder
# output: a hash ref to the sparse matrix (${$hash{$index1}}{$index2} = value)
sub tableToSparseMatrix {
my $tableName = shift;
my $cuiFinder = shift;
# check tableName
#TODO check that the table exists in the database
# or die "Error: table does not exist: $tableName\n";
# set up database
my $db = $cuiFinder->_getDB();
lib/LiteratureBasedDiscovery/Evaluation.pm view on Meta::CPAN
package Evaluation;
use strict;
use warnings;
# Timeslicing evaluation that calculates the precision of LBD
# (O(k), where k is the number of keys in results)
# input: $resultsMatrixRef <- ref a matrix of LBD results
# $goldMatrixRef <- ref to a gold standard matrix
# output: the precision of results
sub calculatePrecision {
my $resultsMatrixRef = shift;
my $goldMatrixRef = shift;
# calculate the precision, which is the percentage of results that
# are in the gold standard
# (percent of generated that is gold)
my $count = 0;
foreach my $key(keys %{$resultsMatrixRef}) {
if (exists ${$goldMatrixRef}{$key}) {
$count++;
lib/LiteratureBasedDiscovery/Evaluation.pm view on Meta::CPAN
}
return $count/(scalar keys %{$resultsMatrixRef});
}
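# For example, if the results matrix contains 10 keys and 3 of them also
# appear in the gold standard matrix, calculatePrecision returns 3/10 = 0.3.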
# Timeslicing evaluation that calculates the recall of LBD
# (O(k), where k is the number of keys in gold)
# input: $resultsMatrixRef <- ref a matrix of LBD results
# $goldMatrixRef <- ref to a gold standard matrix
# output: the recall of results
sub calculateRecall {
my $resultsMatrixRef = shift;
my $goldMatrixRef = shift;
# calculate the recall which is the percentage of knowledge in the gold
# standard that was generated by the LBD system
# (percent of gold that is generated)
my $count = 0;
foreach my $key(keys %{$goldMatrixRef}) {
if (exists ${$resultsMatrixRef}{$key}) {
$count++;
lib/LiteratureBasedDiscovery/Filters.pm view on Meta::CPAN
use UMLS::Interface;
# applies a semantic group filter to the matrix by removing keys that
# are not of an allowed semantic type. Eliminates both rows and columns, so
# it is applied to the full explicit matrix
# input: $matrixRef <- ref to a sparse matrix to be filtered
# $acceptTypesRef <- a ref to a hash of accept type strings
# $umls <- an instance of UMLS::Interface
# output: none, but $matrixRef is updated
sub semanticTypeFilter_rowsAndColumns {
my $matrixRef = shift;
my $acceptTypesRef = shift;
my $umls = shift;
=comment
#Count the number of keys before and after filtering (for debugging)
my %termsHash = ();
foreach my $key1 (keys %{$matrixRef}) {
foreach my $key2 (keys %{${$matrixRef}{$key1}}) {
$termsHash{$key2} = 1;
lib/LiteratureBasedDiscovery/Filters.pm view on Meta::CPAN
# applies a semantic group filter to the matrix by removing keys that
# are not of an allowed semantic type. Only removes types from rows,
# so it is applied for time slicing, before randomly selecting terms of
# one semantic type
# input: $matrixRef <- ref to a sparse matrix to be filtered
# $acceptTypesRef <- a ref to a hash of accept type strings
# $umls <- an instance of UMLS::Interface
# output: none, but $matrixRef is updated
sub semanticTypeFilter_rows {
my $matrixRef = shift;
my $acceptTypesRef = shift;
my $umls = shift;
=comment
#Count the number of keys before and after filtering (for debugging)
my %termsHash = ();
foreach my $key1 (keys %{$matrixRef}) {
foreach my $key2 (keys %{${$matrixRef}{$key1}}) {
$termsHash{$key2} = 1;
lib/LiteratureBasedDiscovery/Filters.pm view on Meta::CPAN
# applies a semantic group filter to the matrix by removing keys that
# are not of an allowed semantic type. Only removes types from columns,
# so it is applied to the implicit matrix (starting term rows with implicit
# columns).
# input: $matrixRef <- ref to a sparse matrix to be filtered
# $acceptTypesRef <- a ref to a hash of accept type strings
# $umls <- an instance of UMLS::Interface
# output: none, but $matrixRef is updated
sub semanticTypeFilter_columns {
my $matrixRef = shift;
my $acceptTypesRef = shift;
my $umls = shift;
=comment
#Count the number of keys before and after filtering (for debugging)
my %termsHash = ();
foreach my $key1 (keys %{$matrixRef}) {
foreach my $key2 (keys %{${$matrixRef}{$key1}}) {
$termsHash{$key2} = 1;
lib/LiteratureBasedDiscovery/Filters.pm view on Meta::CPAN
}
print " number of keys after filtering = ".(scalar keys %termsHash)."\n";
=cut
}
# gets the semantic types of the group
# input: $group <- a string specifying a semantic group
# $umls <- an instance of UMLS::Interface
# output: a ref to a hash of TUIs
sub getTypesOfGroup {
my $group = shift;
my $umls = shift;
#add each type of the group to the set of accept types
my %acceptTuis = ();
my @groupTypes = @{ $umls->getStsFromSg($group) };
foreach my $abr(@groupTypes) {
#check that it is defined (types that are no longer in
#the UMLS may be returned as part of the group)
if (defined $abr) {
lib/LiteratureBasedDiscovery/Filters.pm view on Meta::CPAN
$acceptTuis{$tui} = 1;
}
}
return \%acceptTuis;
}
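# For example (a sketch; 'CHEM' is assumed here to be a semantic group
# abbreviation accepted by UMLS::Interface's getStsFromSg, and Filters is
# assumed to be this file's package name):
#   my $acceptTuisRef = Filters::getTypesOfGroup('CHEM', $umls);
#   #keys of %{$acceptTuisRef} are the TUIs in that group, each with value 1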
# gets all semantic types of the UMLS
# input: $umls <- an instance of UMLS::Interface
# output: a ref to an array of TUIs
sub getAllTypes {
my $umls = shift;
my $abrRef = $umls->getAllSts();
my @tuis = ();
foreach my $abr(@{$abrRef}) {
push @tuis, uc $umls->getStTui($abr);
}
return \@tuis;
}
# gets all semantic groups of the UMLS
# input: $umls <- an instance of UMLS::Interface
# output: a ref to a hash of semantic groups
sub getAllGroups {
my $umls = shift;
my $groupsRef = $umls->getAllSemanticGroups();
return $groupsRef;
}
1;
lib/LiteratureBasedDiscovery/Rank.pm view on Meta::CPAN
# scores each implicit CUI using an association measure, but the input to
# the association measure is based on linking term counts, rather than
# co-occurrence counts.
# input: $startingMatrixRef <- ref to the starting matrix
# $explicitMatrixRef <- ref to the explicit matrix
# $implicitMatrixRef <- ref to the implicit matrix
# $measure <- the string of the umls association measure to use
# $association <- an instance of umls association
# output: a hash ref of scores for each implicit key. (hash{cui} = score)
sub scoreImplicit_ltcAssociation {
my $startingMatrixRef = shift;
my $explicitMatrixRef = shift;
my $implicitMatrixRef = shift;
my $measure = shift;
my $association = shift;
#bTerms to calculate n1p (number of unique co-occurring terms)
my %bTerms = ();
my $rowRef;
foreach my $rowKey (keys %{$startingMatrixRef}) {
lib/LiteratureBasedDiscovery/Rank.pm view on Meta::CPAN
# key is the a,b cui pair (e.g. hash{'C00,C11'})
# values are their score
#
# Optional Input for passing in precalculated stats
# so that they don't have to get recalculated each time,
# such as in time slicing
# $n1pRef <- hashRef where key is a cui, value is n1p
# $np1Ref <- hashRef where key is a cui, value is np1
# $npp <- scalar = value of npp
# output: a hash ref of scores for each implicit key. (hash{cui} = score)
sub scoreImplicit_averageMinimumWeight {
#grab input
my $startingMatrixRef = shift;
my $explicitMatrixRef = shift;
my $implicitMatrixRef = shift;
my $measure = shift;
my $association = shift;
my $abScoresRef = shift;
#optionally pass in stats so they don't get recalculated for
# multiple terms (such as with time slicing)
lib/LiteratureBasedDiscovery/Rank.pm view on Meta::CPAN
# $abScoresRef <- hashRef of the a to b scores used in AMW
# key is the a,b cui pair (e.g. hash{'C00,C11'})
# values are their score
# Optional Input for passing in precalculated stats
# so that they don't have to get recalculated each time,
# such as in time slicing
# $n1pRef <- hashRef where key is a cui, value is n1p
# $np1Ref <- hashRef where key is a cui, value is np1
# $npp <- scalar = value of npp
# output: a hash ref of scores for each implicit key. (hash{cui} = score)
sub scoreImplicit_LTC_AMW {
#grab the input
my $startingMatrixRef = shift;
my $explicitMatrixRef = shift;
my $implicitMatrixRef = shift;
my $measure = shift;
my $association = shift;
my $abScoresRef = shift;
#optionally pass in stats so they don't get recalculated for
# multiple terms (such as with time slicing)
lib/LiteratureBasedDiscovery/Rank.pm view on Meta::CPAN
#return the scores
return \%ltcAMWScores;
}
#TODO this is an untested method
# gets the max cosine distance score between all A terms and each cTerm
# input: $startingMatrixRef <- ref to the starting matrix
# $explicitMatrixRef <- ref to the explicit matrix
# $implicitMatrixRef <- ref to the implicit matrix
# output: a hash ref of scores for each implicit key. (hash{cui} = score)
sub score_cosineDistance {
#LBD Info
my $startingMatrixRef = shift;
my $explicitMatrixRef = shift;
my $implicitMatrixRef = shift;
#get all the A->C pairs
my $acPairsRef = &_getACPairs($startingMatrixRef, $implicitMatrixRef);
my %scores = ();
foreach my $pairKey (keys %{$acPairsRef}) {
#get the A and C keys
lib/LiteratureBasedDiscovery/Rank.pm view on Meta::CPAN
}
return \%scores;
}
# gets a list of A->C pairs, and sets the value as the implicit matrix value
# input: $startingMatrixRef <- ref to the starting matrix
# $implicitMatrixRef <- ref to the implicit matrix
# output: a hash ref where keys are comma-separated cui pairs (e.g. hash{'C000,C111'})
# and values are set to the value at that index in the implicit matrix
sub _getACPairs {
my $startingMatrixRef = shift;
my $implicitMatrixRef = shift;
#generate a list of ac pairs
my %acPairs = ();
foreach my $keyA (keys %{$implicitMatrixRef}) {
foreach my $keyC (keys %{${$implicitMatrixRef}{$keyA}}) {
$acPairs{"$keyA,$keyC"} = ${${$implicitMatrixRef}{$keyA}}{$keyC}; #key is 'cuiA,cuiC'
}
}
lib/LiteratureBasedDiscovery/Rank.pm view on Meta::CPAN
}
# scores each implicit CUI based on the number of linking terms between
# it and all starting terms.
# input: $startingMatrixRef <- ref to the starting matrix
# $explicitMatrixRef <- ref to the explicit matrix
# $implicitMatrixRef <- ref to the implicit matrix
# output: a hash ref of scores for each implicit key. (hash{cui} = score)
sub scoreImplicit_linkingTermCount {
#LBD Info
my $startingMatrixRef = shift;
my $explicitMatrixRef = shift;
my $implicitMatrixRef = shift;
#get all bc pairs
my $bcPairsRef = &_getBCPairs($startingMatrixRef, $explicitMatrixRef, $implicitMatrixRef);
# Find the linking term count for each cTerm
my %scores = ();
lib/LiteratureBasedDiscovery/Rank.pm view on Meta::CPAN
return \%scores;
}
# scores each implicit CUI based on the summed frequency of co-occurrence
# between it and all B terms (A->B frequencies are NOT considered)
# input: $startingMatrixRef <- ref to the starting matrix
# $explicitMatrixRef <- ref to the explicit matrix
# $implicitMatrixRef <- ref to the implicit matrix
# output: a hash ref of scores for each implicit key. (hash{cui} = score)
sub scoreImplicit_frequency {
#LBD Info
my $startingMatrixRef = shift;
my $explicitMatrixRef = shift;
my $implicitMatrixRef = shift;
#get all bc pairs
my $bcPairsRef = &_getBCPairs($startingMatrixRef, $explicitMatrixRef, $implicitMatrixRef);
# Find the frequency count for each cTerm
my %scores = ();
lib/LiteratureBasedDiscovery/Rank.pm view on Meta::CPAN
# scores each implicit CUI using an association measure. Score is the maximum
# association between a column in the implicit matrix, and one of the start
# matrix terms (so max between any A and that C term).
# Score is calculated using the implicit matrix
# input: $startCuisRef <- ref to an array of start cuis (A terms)
# $implicitMatrixFileName <- fileName of the implicit matrix
# $measure <- the string of the umls association measure to use
# $association <- an instance of umls association
# output: a hash ref of scores for each implicit key. (hash{cui} = score)
sub scoreImplicit_fromImplicitMatrix {
#LBD Info
my $startCuisRef = shift;
my $implicitMatrixFileName = shift;
my $measure = shift;
my $association = shift;
######################################
#Get hashes for A and C terms
#####################################
#create a hash of starting terms
lib/LiteratureBasedDiscovery/Rank.pm view on Meta::CPAN
}
# scores each implicit CUI using an association measure. Score is the maximum
# association between any of the linking terms.
# input: $startingMatrixRef <- ref to the starting matrix
# $explicitMatrixRef <- ref to the explicit matrix
# $implicitMatrixRef <- ref to the implicit matrix
# $measure <- the string of the umls association measure to use
# $association <- an instance of umls association
# output: a hash ref of scores for each implicit key. (hash{cui} = score)
sub scoreImplicit_fromAllPairs {
#LBD Info
my $startingMatrixRef = shift;
my $explicitMatrixRef = shift;
my $implicitMatrixRef = shift;
my $measure = shift;
my $association = shift;
#optionally pass in stats so they don't get recalculated for
# multiple terms (such as with time slicing)
my $n1pRef = shift;
lib/LiteratureBasedDiscovery/Rank.pm view on Meta::CPAN
elsif (${$bcPairsRef}{$pairKey} > $scores{$key2}) {
$scores{$key2} = ${$bcPairsRef}{$pairKey}
}
}
}
return \%scores;
}
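# For example (a hedged reading of the loop above): if a C term is reached
# through two B->C pairs whose association scores are 0.2 and 0.7, its final
# score in the returned hash is 0.7, the maximum over its linking pairs.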
#NOTE: empty stub, not yet implemented
sub scoreImplicit_minimumWeightAssociation {
}
#XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
#XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
# Builds a list of B->C term pairs that also co-occur with A terms.
# Only adds B->C term pairs for C terms that are also present in the
# implicitMatrix.
# The value of the bcPairs hash is the value in the explicit matrix
# for that pair.
# input: $startingMatrixRef <- ref to the starting matrix
# $explicitMatrixRef <- ref to the explicit matrix
# $implicitMatrixRef <- ref to the implicit matrix
# output: a hash ref of BC term pairs. Each key is "$bTerm,$cTerm",
# value is by default the frequency of BC co-occurrences in the
# matrix
sub _getBCPairs {
my $startingMatrixRef = shift;
my $explicitMatrixRef = shift;
my $implicitMatrixRef = shift;
#get all bTerms
my %bTerms = ();
my $rowRef;
foreach my $rowKey (keys %{$startingMatrixRef}) {
$rowRef = ${$startingMatrixRef}{$rowKey};
foreach my $colKey (keys %{$rowRef}) {
lib/LiteratureBasedDiscovery/Rank.pm view on Meta::CPAN
}
}
}
return \%bcPairs;
}
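# For example, if B term C0003765 co-occurs 12 times with a C term C0011860
# (CUIs chosen only for illustration) and that C term appears in the implicit
# matrix, the returned hash contains the entry 'C0003765,C0011860' => 12.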
# ranks the scores in descending order
# input: $scoresRef <- a hash ref to a hash of cuis and scores (hash{cui} = score)
# output: an array ref of the ranked cuis in descending order
sub rankDescending {
#grab the input
my $scoresRef = shift;
#order in descending order, and use the CUI string as a tiebreaker
my @rankedCuis = ();
my @tiedCuis = ();
my $currentScore = -1;
foreach my $cui (
#sort function to sort by value
sort {${$scoresRef}{$b} <=> ${$scoresRef}{$a}}
lib/LiteratureBasedDiscovery/Rank.pm view on Meta::CPAN
#XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
#XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
# gets association scores for a set of cui pairs
# input: $cuiPairsRef <- reference to a hash of pairs of matrix indices (key = '1,2')
# $matrixRef <- a reference to a sparse matrix of n11 values
# $measure <- the association measure to perform
# $association <- an instance of UMLS::Association
# output: none, but the cuiPairs ref has values updated to reflect the
#         computed association score
sub getBatchAssociationScores {
my $cuiPairsRef = shift;
my $matrixRef = shift;
my $measure = shift;
my $association = shift;
#optionally pass in $n1pRef, $np1Ref, and $npp
# do this if they get calculated multiple times
# (such as with time slicing)
my $n1pRef = shift;
my $np1Ref = shift;
lib/LiteratureBasedDiscovery/Rank.pm view on Meta::CPAN
# gets NP1, N1P, and NPP for all CUIs. This is used in time-
# slicing and makes it much faster than getting stats individually
# for each starting term
# input: $matrixRef <- ref to the co-occurrence matrix (the sparse matrix
# of n11 values)
# output: \@vals <- an array ref of three values:
# \%n1p - a hash ref where the key is a cui and value is n1p
# \%np1 - a hash ref where the key is a cui and value is np1
# $npp - a scalar of npp
sub getAllStats {
my $matrixRef = shift;
#get the np1, n1p, and npp values for each cui
my %np1 = ();
my %n1p = ();
my $npp = 0;
my $val;
foreach my $key1 (keys %{$matrixRef}) {
foreach my $key2 (keys %{${$matrixRef}{$key1}}) {
$val = ${${$matrixRef}{$key1}}{$key2};
lib/LiteratureBasedDiscovery/TimeSlicing.pm view on Meta::CPAN
# Calculates and outputs to STDOUT Time Slicing evaluation stats of
# precision and recall at $numIntervals intervals, Mean Average Precision
# (MAP), precision at k, and frequency at k
# input: $trueMatrixRef <- a ref to a hash of true discoveries
# $rowRanksRef <- a ref to a hash of arrays of ranked predictions.
# Each hash key is a cui, each hash element is an
# array of ranked predictions for that cui. The ranked
# predictions are CUIs ordered in descending order
# based on association. (from Rank::rankDescending)
# $numIntervals <- the number of recall intervals to generate
sub outputTimeSlicingResults {
#grab the input
my $goldMatrixRef = shift;
my $rowRanksRef = shift;
my $numIntervals = shift;
#calculate and output stats
#------------------------------------------
#calculate precision and recall
lib/LiteratureBasedDiscovery/TimeSlicing.pm view on Meta::CPAN
}
print "\n";
}
# loads a list of cuis for use in time slicing from file
# the CUI file contains a newline-separated list of CUIs
# input: $cuiFileName <- a string specifying the file to load cuis from
# output: \%cuis <- a ref to a hash of cuis, each key is a cui, values are 1
sub loadCUIs {
my $cuiFileName = shift;
#open the file
open IN, $cuiFileName
or die("ERROR: cannot open CUI File: $cuiFileName\n");
#read each line of the file
my %cuis = ();
while (my $line = <IN>) {
chomp $line;
lib/LiteratureBasedDiscovery/TimeSlicing.pm view on Meta::CPAN
}
# calculates average precision and recall of the generated implicit matrix
# compared to the post cutoff matrix
# input: $predictionsMatrixRef <- a ref to a sparse matrix of predicted
# discoveries
# $trueMatrixRef <- a ref to a sparse matrix of true discoveries
# output: ($precision, $recall) <- two scalar values specifying the precision
# and recall
sub calculatePrecisionRecall {
my $predictionsMatrixRef = shift; #a matrix of predicted discoveries
my $trueMatrixRef = shift; #a matrix of true discoveries
print "calculating precision and recall\n";
#bounds check, the predictions matrix must contain keys
if ((scalar keys %{$predictionsMatrixRef}) < 1) {
return (0,0); #precision and recall are both zero
}
#calculate precision and recall averaged over each cui
lib/LiteratureBasedDiscovery/TimeSlicing.pm view on Meta::CPAN
}
# loads the post cutoff matrix from file. Only loads rows corresponding
# to rows in the starting matrix ref to save memory, and because those are
# the only rows that are needed.
# input: $startingMatrixRef <- a ref to the starting sparse matrix
# $explicitMatrixRef <- a ref to the explicit sparse matrix
# $postCutoffFileName <- the filename to the postCutoffMatrix
# output: \%postCutoffMatrix <- a ref to the postCutoff sparse matrix
sub loadPostCutOffMatrix {
my $startingMatrixRef = shift;
my $explicitMatrixRef = shift;
my $postCutoffFileName = shift;
print "loading postCutoff Matrix\n";
#open the post cutoff file
open IN, $postCutoffFileName
or die ("ERROR: cannot open post cutoff file: $postCutoffFileName");
#create hash of cuis to grab
lib/LiteratureBasedDiscovery/TimeSlicing.pm view on Meta::CPAN
#TODO numRows should be read from file and sent with the lbdOptionsRef
# generates a starting matrix of numRows randomly selected terms
# input: $explicitMatrixRef <- a ref to the explicit sparse matrix
# $lbdOptionsRef <- the LBD options
# $startTermAcceptTypesRef <- a reference to a hash of accept
# types for start terms (TUIs)
# $numRows <- the number of random rows to load (if random)
# $umls_interface <- an instance of the UMLS::Interface
# output: \%startingMatrix <- a ref to the starting sparse matrix
sub generateStartingMatrix {
my $explicitMatrixRef = shift;
my $lbdOptionsRef = shift;
my $startTermAcceptTypesRef = shift;
my $numRows = shift;
my $umls_interface = shift;
#generate the starting matrix randomly or from a file
my %startingMatrix = ();
#check if a file is defined
lib/LiteratureBasedDiscovery/TimeSlicing.pm view on Meta::CPAN
}
# gets and returns a hash of row keys of the specified semantic types
# input: $matrixRef <- a ref to a sparse matrix
# $acceptTypesRef <- a ref to a hash of accept types (TUIs)
# $umls <- an instance of UMLS::Interface
# output: \%rowsToKeep <- a ref to hash of rows to keep, each key is
# a CUI, and values are 1. All CUIs specify rows
# of acceptable semantic types
sub getRowsOfSemanticTypes {
my $matrixRef = shift;
my $acceptTypesRef = shift;
my $umls = shift;
#loop through the matrix and keep the rows that are of the
# desired semantic types
my %rowsToKeep = ();
foreach my $cui1 (keys %{$matrixRef}) {
my $typesRef = $umls->getSt($cui1);
foreach my $type(@{$typesRef}) {
lib/LiteratureBasedDiscovery/TimeSlicing.pm view on Meta::CPAN
# between the $rowKey and $colKey. All co-occurring cui pairs from the matrix
# are calculated
# input: $matrixRef <- a reference to a sparse matrix
# $rankingMeasure <- a string specifying the ranking measure to use
# $umls_association <- an instance of UMLS::Association
# output: \%cuiPairs <- a ref to a hash of CUI pairs and their association
#         scores. Each key of the hash is a comma-separated string
#         containing cui1 and cui2 of the pair
#         (e.g. 'cui1,cui2'), and each value is their association
#         score using the specified association measure
sub getAssociationScores {
my $matrixRef = shift;
my $rankingMeasure = shift;
my $umls_association = shift;
print " getting Association Scores, rankingMeasure = $rankingMeasure\n";
#generate a list of cui pairs in the matrix
my %cuiPairs = ();
print " generating association scores:\n";
foreach my $rowKey (keys %{$matrixRef}) {
foreach my $colKey (keys %{${$matrixRef}{$rowKey}}) {
lib/LiteratureBasedDiscovery/TimeSlicing.pm view on Meta::CPAN
Rank::getBatchAssociationScores(\%cuiPairs, $matrixRef, $rankingMeasure, $umls_association);
return \%cuiPairs;
}
}
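# For example (a sketch; 'x2' is assumed to be a valid UMLS::Association
# measure string, TimeSlicing is assumed to be this file's package name,
# and getMinMax is defined below):
#   my $pairsRef = TimeSlicing::getAssociationScores($matrixRef, 'x2', $umls_association);
#   my ($min, $max) = TimeSlicing::getMinMax($pairsRef);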
# gets the min and max values of a hash
# returns a two-element array, where the first value is the min, and
# the second value is the max
# input: $hashref <- a reference to a hash with numbers as values
# output: ($min, $max) <- the minimum and maximum values in the hash
sub getMinMax {
my $hashRef = shift;
#loop through each key and record the min/max
my $min = 999999;
my $max = -999999;
foreach my $key (keys %{$hashRef}) {
my $val = ${$hashRef}{$key};
if ($val < $min) {
$min = $val;
}
lib/LiteratureBasedDiscovery/TimeSlicing.pm view on Meta::CPAN
# hash. Any keys with scores below the threshold are not copied to the new matrix
# input: $threshold <- a scalar threshold
# $assocScoresRef <- a reference to a cui pair hash of association
# scores. Each key is a comma-separated cui pair
# (e.g. 'cui1,cui2'), values are their association
# scores.
# $matrixRef <- a reference to a co-occurrence sparse matrix that
# corresponds to the assocScoresRef
# output: \%thresholdedMatrix <- a ref to a new matrix, built from the
# $matrixRef after applying the $threshold
sub applyThreshold {
my $threshold = shift;
my $assocScoresRef = shift;
my $matrixRef = shift;
#apply the threshold
my $preKeyCount = scalar keys %{$assocScoresRef};
my $postKeyCount = 0;
my %thresholdedMatrix = ();
my ($cui1, $cui2);
foreach my $key (keys %{$assocScoresRef}) {
lib/LiteratureBasedDiscovery/TimeSlicing.pm view on Meta::CPAN
# of samples. Used in explicit timeslicing
# input: $k <- the number of samples to get
# $assocScoresRef <- a reference to a cui pair hash of association
# scores. Each key is a comma-separated cui pair
# (e.g. 'cui1,cui2'), values are their association
# scores.
# $matrixRef <- a reference to a co-occurrence sparse matrix that
# corresponds to the assocScoresRef
# output: \%thresholdedMatrix <- a ref to a sparse matrix containing only the
#         $k highest-ranked samples (cui pairs)
sub grabKHighestRankedSamples {
my $k = shift;
my $assocScoresRef = shift;
my $matrixRef = shift;
print "getting $k highest ranked samples\n";
#apply the threshold
my $preKeyCount = scalar keys %{$assocScoresRef};
my $postKeyCount = 0;
my %thresholdedMatrix = ();
lib/LiteratureBasedDiscovery/TimeSlicing.pm view on Meta::CPAN
# $rowRanksRef <- a ref to a hash of arrays of ranked predictions.
# Each hash key is a cui, each hash element is an
# array of ranked predictions for that cui. The ranked
# predictions are CUIs ordered in descending order
# based on association. (from Rank::rankDescending)
# $numIntervals <- the number of recall intervals to generate
# output: (\%precision, \%recall) <- refs to hashes of precision and recall.
# Each hash key is the interval number, and
# the value is the precision and recall
# respectively
sub calculatePrecisionAndRecall_implicit {
my $trueMatrixRef = shift; #a ref to the true matrix
my $rowRanksRef = shift; #a ref to ranked predictions; each hash key is a CUI, and each element is an array of CUIs ordered by their rank
my $numIntervals = shift; #the number of recall intervals to test at
#find precision and recall curves for each cui that is being predicted
# take the sum of precisions, then average after the loop
my %precision = ();
my %recall = ();
foreach my $rowKey (keys %{$trueMatrixRef}) {
my $trueRef = ${$trueMatrixRef}{$rowKey}; #a list of true discoveries
lib/LiteratureBasedDiscovery/TimeSlicing.pm view on Meta::CPAN
# calculates the mean average precision (MAP)
# input: $trueMatrixRef <- a ref to a hash of true discoveries
# $rowRanksRef <- a ref to a hash of arrays of ranked predictions.
# Each hash key is a cui, each hash element is an
# array of ranked predictions for that cui. The ranked
# predictions are CUIs ordered in descending order
# based on association. (from Rank::rankDescending)
# output: $map <- a scalar value of mean average precision (MAP)
sub calculateMeanAveragePrecision {
#grab the input
my $trueMatrixRef = shift; # a matrix of true discoveries
my $rowRanksRef = shift; # a hash of ranked predicted discoveries
print "calculating mean average precision\n";
#calculate MAP for each true discovery being predicted
my $map = 0;
foreach my $rowKey (keys %{$trueMatrixRef}) {
my $rankedPredictionsRef = ${$rowRanksRef}{$rowKey}; #an array ref of ranked predictions
lib/LiteratureBasedDiscovery/TimeSlicing.pm view on Meta::CPAN
# from k = 1-10 and intervals of 10 for 10-100
# input: $trueMatrixRef <- a ref to a hash of true discoveries
# $rowRanksRef <- a ref to a hash of arrays of ranked predictions.
# Each hash key is a cui, each hash element is an
# array of ranked predictions for that cui. The ranked
# predictions are CUIs ordered in descending order
# based on association. (from Rank::rankDescending)
# output: \%meanPrecision <- a hash of mean precisions at k; each key is the
#         value of k, and the value is the precision at that
#         k
sub calculatePrecisionAtK {
#grab the input
my $trueMatrixRef = shift; # a matrix of true discoveries
my $rowRanksRef = shift; # a hash of ranked predicted discoveries
#generate precision at k at intervals of 1 for k = 1-10 and 10 for k = 10-100
my %meanPrecision = ();
my $interval = 1;
for (my $k = 1; $k <= 100; $k+=$interval) {
$meanPrecision{$k} = 0;
lib/LiteratureBasedDiscovery/TimeSlicing.pm view on Meta::CPAN
# for 10-100. Co-occurrence counts are averaged over each of the starting terms
# input: $trueMatrixRef <- a ref to a hash of true discoveries
# $rowRanksRef <- a ref to a hash of arrays of ranked predictions.
# Each hash key is a cui, each hash element is an
# array of ranked predictions for that cui. The ranked
# predictions are CUIs ordered in descending order
# based on association. (from Rank::rankDescending)
# output: \%meanCooccurrenceCounts <- a hash of mean co-occurrence counts at k;
#         each key is the value of k, and the value is the mean co-occurrence
#         count at that k
sub calculateMeanCooccurrencesAtK {
#grab the input
my $trueMatrixRef = shift; # a matrix of true discoveries
my $rowRanksRef = shift; # a hash of ranked predicted discoveries
#generate mean co-occurrences at k at intervals of 1 for k = 1-10 and 10 for k = 10-100
my %meanCooccurrenceCount = (); #count of the number of co-occurrences for each k
my $interval = 1;
for (my $k = 1; $k <= 100; $k+=$interval) {
$meanCooccurrenceCount{$k} = 0;
}
}
ok($fAtKSame == 1, "Frequency at K Matches");
print "Done with Time Slicing Tests\n";
############################################################
#function to read in time slicing data values
sub readTimeSlicingData {
my $fileName = shift;
#read in the gold time slicing values
my @APScores = ();
my $MAP;
my @PAtKScores = ();
my @FAtKScores = ();
open IN, "$fileName"
#open IN, './t/goldSampleTimeSliceOutput'
or die ("Error: Cannot open timeSliceOutput: $fileName\n");
utils/datasetCreator/applyMaxThreshold.pl view on Meta::CPAN
my $inputFile = '/home/henryst/lbdData/groupedData/reg/1975_1999_window8_noOrder';
my $outputFile = '/home/henryst/lbdData/groupedData/1975_1999_window8_noOrder_threshold5000u';
my $maxThreshold = 5000;
my $applyToUnique = 1;
my $countRef = &getStats($inputFile, $applyToUnique);
&applyMaxThreshold($inputFile, $outputFile, $maxThreshold, $countRef);
# gets co-occurrence stats, returns a hash of (unique) co-occurrence counts
# for each CUI. (count is unique or not depending on $applyToUnique)
sub getStats {
my $inputFile = shift;
my $applyToUnique = shift;
#open files
open IN, $inputFile or die("ERROR: unable to open inputFile\n");
print "Getting Stats\n";
#count stats for each line of the file
my ($cui1, $cui2, $val);
my %count = (); #a count of the number of (unique) co-occurrences
utils/datasetCreator/applyMaxThreshold.pl view on Meta::CPAN
#does not matter, the matrix will have been pre-processed to ensure
#the second cui will appear first in the key. In the case where order
#does matter we just shouldn't be counting it anyway
}
close IN;
return \%count;
}
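# A minimal sketch of the elided counting loop above (hedged; it assumes each
# input line has the form cui1<TAB>cui2<TAB>count, as elsewhere in this
# distribution):
#   while (my $line = <IN>) {
#       chomp $line;
#       ($cui1, $cui2, $val) = split(/\t/, $line);
#       $count{$cui1} += ($applyToUnique ? 1 : $val);
#   }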
#applies a maxThreshold, $countRef is the output of getStats
sub applyMaxThreshold {
my $inputFile = shift;
my $outputFile = shift;
my $maxThreshold = shift;
my $countRef = shift;
#open the input and output
open IN, $inputFile or die("ERROR: unable to open inputFile\n");
open OUT, ">$outputFile"
or die ("ERROR: unable to open outputFile: $outputFile\n");
utils/datasetCreator/applyMinThreshold.pl view on Meta::CPAN
#$minThreshold number of co-occurrences
my $minThreshold = 5;
my $inputFile = '/home/henryst/1975_2015_window8_noOrder_preThresh';
my $outputFile = '/home/henryst/1975_2015_window8_noOrder_threshold'.$minThreshold;
&applyMinThreshold($minThreshold, $inputFile, $outputFile);
############
sub applyMinThreshold {
#grab the input
my $minThreshold = shift;
my $inputFile = shift;
my $outputFile = shift;
#open files
open IN, $inputFile or die("ERROR: unable to open inputFile\n");
open OUT, ">$outputFile"
or die ("ERROR: unable to open outputFile: $outputFile\n");
utils/datasetCreator/applySemanticFilter.pl view on Meta::CPAN
&applySemanticFilter($matrixFileName, $outputFileName,
$acceptTypesString, $acceptGroupsString,
###################################################################
###################################################################
# Applies the semantic type filter
sub applySemanticFilter {
#grab the input
my $matrixFileName = shift;
my $outputFileName = shift;
my $acceptTypesString = shift;
my $acceptGroupsString = shift;
my $interfaceConfig = shift;
my $columnsOnly = shift;
print STDERR "Applying Semantic Filter to $matrixFileName\n";
utils/datasetCreator/applySemanticFilter.pl view on Meta::CPAN
#TODO re-enable this and then try to run again
#disconnect from the database and return
#$umls_interface->disconnect();
}
# transforms the string of accept types or groups into a hash of accept TUIs
# input: a UMLS::Interface instance, a string of accept types, and a string of accept groups
# output: a hash of acceptable TUIs
sub getAcceptTypes {
my $umls_interface = shift;
my $acceptTypesString = shift;
my $acceptGroupsString = shift;
#get the accept types
my %acceptTypes = ();
#add all types for groups specified
#accept groups were specified
my @acceptGroups = split(',',$acceptGroupsString);
utils/datasetCreator/combineCooccurrenceMatrices.pl view on Meta::CPAN
#user input
$dataFolder = '/home/henryst/hadoopByYear/output/';
$startYear = '1983';
$endYear = '1985';
$windowSize = 8;
&combineFiles($startYear,$endYear,$windowSize);
#####################################################
####### Program Start ########
sub combineFiles {
my $startYear = shift;
my $endYear = shift;
my $windowSize = shift;
#Check on I/O
my $outFileName = "$startYear".'_'."$endYear".'_window'."$windowSize";
(!(-e $outFileName))
or die ("ERROR: output file already exists: $outFileName\n");
open OUT, ">$outFileName"
or die ("ERROR: unable to open output file: $outFileName\n");
utils/datasetCreator/dataStats/getMatrixStats.pl view on Meta::CPAN
#gets matrix stats for a matrix file
# (number of rows, number of columns, number of keys)
&getStats('/home/henryst/lbdData/groupedData/1852_window1_squared_inParts');
#############################################
# gets the stats for the matrix
#############################################
sub getStats {
my $fileName = shift;
print STDERR "$fileName\n";
#read in the matrix
open IN, $fileName or die ("unable to open file: $fileName\n");
my %matrix = ();
my $numCooccurrences = 0;
while (my $line = <IN>) {
#$line =~ /([^\t]+)\t([^\t]+)\t([\d]+)/;
$line =~ /([^\s]+)\s([^\s]+)\s([\d]+)/;
utils/datasetCreator/dataStats/metaAnalysis.pl view on Meta::CPAN
my $dataFolder = '/home/henryst/lbdData/dataByYear/1960_1989';
my $startYear = '1809';
my $endYear = '2015';
my $windowSize = 1;
my $statsOutFileName = '/home/henryst/lbdData/stats_window1';
&folderMetaAnalysis($startYear, $endYear, $windowSize, $statsOutFileName, $dataFolder);
#####################
# runs meta analysis on a set of files
sub folderMetaAnalysis {
my $startYear = shift;
my $endYear = shift;
my $windowSize = shift;
my $statsOutFileName= shift;
my $dataFolder = shift;
#Check on I/O
open OUT, ">$statsOutFileName"
or die ("ERROR: unable to open stats out file: $statsOutFileName\n");
utils/datasetCreator/dataStats/metaAnalysis.pl view on Meta::CPAN
print " ERROR: unable to open $inFile\n";
}
}
close OUT;
print "Done getting stats\n";
}
##############################
# runs meta analysis on a single file
sub metaAnalysis {
my $fileName = shift;
open IN, $fileName or die ("unable to open file: $fileName\n");
my $numCooccurrences = 0;
my %rowKeys = (); #number of rows
my %colKeys = (); #number of columns
my %uniqueKeys = (); #vocabulary size
while (my $line = <IN>) {
$line =~ /([^\t]+)\t([^\t]+)\t([\d]+)/;
utils/datasetCreator/removeCUIPair.pl view on Meta::CPAN
my $cuiA = 'C0021665'; #somatomedin C
my $cuiB = 'C0003765'; #arginine
my $matrixFileName = '/home/henryst/lbdData/groupedData/1960_1989_window8_ordered';
my $matrixOutFileName = $matrixFileName.'_removed';
&removeCuiPair($cuiA, $cuiB, $matrixFileName, $matrixOutFileName);
print STDERR "DONE\n";
###########################################
# remove the CUI pair from the dataset
sub removeCuiPair {
my $cuiA = shift;
my $cuiB = shift;
my $matrixFileName = shift;
my $matrixOutFileName = shift;
print STDERR "removing $cuiA,$cuiB from $matrixFileName\n";
#open the in and out files
open IN, $matrixFileName
or die ("ERROR: cannot open matrix in file: $matrixFileName\n");
open OUT, ">$matrixOutFileName"
utils/datasetCreator/removeExplicit.pl view on Meta::CPAN
my $squaredMatrixFileName = '../../samples/postCutoffMatrix';
my $outputFileName = '../../samples/sampleGoldMatrix';
&removeExplicit($matrixFileName, $squaredMatrixFileName, $outputFileName);
###############################
###############################
#removes explicit knowledge ($matrixFileName) from the implicit
# knowledge ($squaredMatrixFileName)
sub removeExplicit {
my $matrixFileName = shift; #the explicit knowledge matrix (usually not filtered)
my $squaredMatrixFileName = shift; #the implicit with explicit knowledge matrix (filtered squared)
my $outputFileName = shift; #the implicit knowledge matrix output file
print STDERR "Removing Explicit from $matrixFileName\n";
#read in the matrix
open IN, $matrixFileName
or die("ERROR: unable to open matrix input file: $matrixFileName\n");
my %matrix = ();
my $numCooccurrences = 0;
utils/datasetCreator/squaring/convertForSquaring_MATLAB.pl view on Meta::CPAN
#convert from MATLAB sparse format
$fileName = "1980_1984_window1_ordered_filtered";
&convertFrom("/home/henryst/lbdData/groupedData/squared/$fileName".'_squared', "/home/henryst/lbdData/groupedData/squared/$fileName".'_squared_convertedBack',"/home/henryst/lbdData/groupedData/forSquaring/".$fileName.'_keys');
########################################
########################################
#converts the matrix to format for squaring in MATLAB
sub convertTo {
#grab input
my $inFile = shift;
my $matrixOutFile = shift;
my $keyOutFile = shift;
print STDERR "converting $inFile\n";
#open all the files
open IN, $inFile
or die ("ERROR: unable to open inFile: $inFile\n");
open MATRIX_OUT, ">$matrixOutFile"
utils/datasetCreator/squaring/convertForSquaring_MATLAB.pl view on Meta::CPAN
#output the keys file
print " Outputting keys\n";
foreach my $key (sort keys %keyHash) {
print KEY_OUT "$key\t$keyHash{$key}\n";
}
close KEY_OUT;
print " DONE!\n";
}
#converts back from the format used for squaring in MATLAB
sub convertFrom {
#grab input
my $matrixInFile = shift;
my $matrixOutFile = shift;
my $keyInFile = shift;
print "converting $matrixInFile\n";
#open all the files
open IN, $matrixInFile
or die ("ERROR: unable to open matrixInFile: $matrixInFile\n");
open MATRIX_OUT, ">$matrixOutFile"
utils/datasetCreator/squaring/squareMatrix_perl.pl view on Meta::CPAN
&outputMatrix(\%product, $options{'outputFile'});
print STDERR "DONE!\n";
#########################################################
# Helper Functions
#########################################################
sub outputMatrix {
my $matrixRef = shift;
my $outputFile = shift;
#append to the output file
print STDERR "outputFile = $outputFile\n";
open OUT, '>>'.$outputFile or die ("ERROR: unable to open output file: $outputFile\n");
#output the matrix
foreach my $key0 (keys %{$matrixRef}) {
foreach my $key1 (keys %{${$matrixRef}{$key0}}) {
utils/datasetCreator/squaring/squareMatrix_perl.pl view on Meta::CPAN
}
#clear the matrix (empty the caller's hash rather than just rebinding the local ref)
%{$matrixRef} = ();
close OUT;
}
sub fileToSparseMatrix {
my $fileName = shift;
open IN, $fileName or die ("unable to open file: $fileName\n");
my %matrix = ();
while (my $line = <IN>) {
chomp $line;
$line =~ /([^\t]+)\t([^\t]+)\t([\d]+)/;
if (!exists $matrix{$1}) {
my %hash = ();
$matrix{$1} = \%hash;
utils/runDiscovery.pl view on Meta::CPAN
$options{'lbdConfig'} = shift;
defined $options{'lbdConfig'} or die ($usage);
my $lbd = ALBD->new(\%options);
$lbd->performLBD();
############################################################################
# function to output help messages for this program
############################################################################
sub showHelp() {
print "This utility takes an lbd configuration file and outputs\n";
print "the results of lbd to file. The parameters for LBD are\n";
print "specified in the input file. Please see samples/lbd or\n";
print "samples/thresholding for sample input files and descriptions\n";
print "of parameters and full details on what can be in an LBD input\n";
print "file.\n";
print "\n";
print "Usage: runDiscovery.pl LBD_CONFIG_FILE [OPTIONS]\n";