ALBD

 view release on metacpan or  search on metacpan

lib/ALBD.pm  view on Meta::CPAN

#references to other packages
my $umls_interface;
my $umls_association;

#####################################################
####################################################

# performs LBD
# input:  none
# output: none, but a results file is written to disk
sub performLBD {
    my $self = shift;
    my $start; #used to record run times

    #implicit matrix ranking requires a different set of procedures
    if ($lbdOptions{'rankingProcedure'} eq 'implicitMatrix') { 
	$self->performLBD_implicitMatrixRanking();
	return;
    }
    if (exists $lbdOptions{'targetCuis'}) {
	$self->performLBD_closedDiscovery();

lib/ALBD.pm  view on Meta::CPAN


#Done
    print "DONE!\n\n";
}

#----------------------------------------------------------------------------

# performs LBD, closed discovery
# input:  none
# output: none, but a results file is written to disk
sub performLBD_closedDiscovery {
    my $self = shift;
    my $start; #used to record run times

    print "Closed Discovery\n";
    print $self->_parametersToString();

#Get inputs
    my $startCuisRef = $self->_getStartCuis();
    my $targetCuisRef = $self->_getTargetCuis();
    my $linkingAcceptTypesRef = $self->_getAcceptTypes('linking');

lib/ALBD.pm  view on Meta::CPAN

# primarily memory constraints or time constraints now, because this
# requires the entire implicit matrix be computed. This can be done, but
# access to it is then slow. Would require a major redo of the code
#
=comment
# performs LBD, but using implicit matrix ranking schemes.
# Since the order of operations for those methods is slightly different,
# a new method has been created.
# input:  none
# output: none, but a results file is written to disk
sub performLBD_implicitMatrixRanking {
    my $self = shift;
    my $start; #used to record run times
    print  $self->_parametersToString();
    print "In Implicit Ranking\n";
    
#Get inputs
    my $startCuisRef = $self->_getStartCuis();
    my $linkingAcceptTypesRef = $self->_getAcceptTypes('linking');
    my $targetAcceptTypesRef = $self->_getAcceptTypes('target');
    print "startCuis = ".(join(',', @{$startCuisRef}))."\n";

lib/ALBD.pm  view on Meta::CPAN


##################################################
################ Time Slicing ####################
##################################################

#NOTE: This function isn't really tested, and is really slow right now
# Generates precision and recall values by varying the threshold
# of the A->B ranking measure.
# input:  none
# output: none, but precision and recall values are printed to STDOUT
sub timeSlicing_generatePrecisionAndRecall_explicit {
    my $NUM_SAMPLES = 100; #TODO, read fomr file number of samples to average over for timeslicing
    my $self = shift;
    print "In timeSlicing_generatePrecisionAndRecall\n";

    my $numIntervals = 10;

#Get inputs
    my $startAcceptTypesRef = $self->_getAcceptTypes('start');
    my $linkingAcceptTypesRef = $self->_getAcceptTypes('linking');
    my $targetAcceptTypesRef = $self->_getAcceptTypes('target');

lib/ALBD.pm  view on Meta::CPAN

    print "true - total, min, max, average = $trueTotal, $trueMin, $trueMax, $trueAverage\n";
}


# generates precision and recall values by varying the threshold
# of the A->C ranking measure. Also generates precision at k, and
# mean average precision
# input:  none
# output: none, but precision, recall, precision at k, and map values
#         output to STDOUT
sub timeSlicing_generatePrecisionAndRecall_implicit {
    my $NUM_SAMPLES = 200; #TODO, read fomr file number of samples to average over for timeslicing
    my $self = shift;
    my $start; #used to record run times
    print "In timeSlicing_generatePrecisionAndRecall_implicit\n";

    #Get inputs
    my $startAcceptTypesRef = $self->_getAcceptTypes('start');
    my $linkingAcceptTypesRef = $self->_getAcceptTypes('linking');
    my $targetAcceptTypesRef = $self->_getAcceptTypes('target');

lib/ALBD.pm  view on Meta::CPAN

}



##############################################################################
#        functions to grab parameters and initialize all input
##############################################################################
# constructor: builds a new LiteratureBasedDiscovery object
# input:  $optionsHashRef <- a reference to an LBD options hash; it is passed
#                            on unchanged to _initialize
# output: the newly blessed LBD object, after _initialize has run on it
sub new {
    my ($className, $optionsHashRef) = @_;

    #bless an empty hash into the requested class, then let _initialize
    # do the set up using the supplied options
    my $self = bless {}, $className;
    $self->_initialize($optionsHashRef);

    return $self;
}

# Initializes everything needed for Literature Based Discovery
# input: $optionsHashRef <- reference to LBD options hash (command line input)
# output: none, but global parameters are set
sub _initialize {
    my $self = shift;
    my $optionsHashRef = shift; 

    #initialize UMLS::Interface
    my %tHash = ();
    $tHash{'t'} = 1; #default hash values are with t=1 (silence module output)
    my $componentOptions = \%tHash;
    if (${$optionsHashRef}{'interfaceConfig'} ne '') {
	#read configuration file if its defined
	$componentOptions = 

lib/ALBD.pm  view on Meta::CPAN

    %lbdOptions = %{$self->_readConfigFile(${$optionsHashRef}{'lbdConfig'})};
    
}    

# Reads the config file in as an options hash
# input: the name of a configuration file that has key fields in '<>'s, 
#        The '>' is followed directly by the value for that key, no space.
#        Each line of the file contains a new key-value pair (e.g. <key>value)
#        If no value is provided, a default value of 1 is set
# output: a hash ref to a hash containing each key value pair
sub _readConfigFile {
    my $self = shift;
    my $configFileName = shift;
    
    #read in all options from the config file
    open IN, $configFileName or die("Error: Cannot open config file: $configFileName\n");
    my %optionsHash = ();
    my $firstChar;
    while (my $line = <IN>) {
	#check if its a comment or blank line
	$firstChar = substr $line, 0, 1;

lib/ALBD.pm  view on Meta::CPAN

	}
    }
    close IN;

    return \%optionsHash;
}

# splits the comma separated 'startCuis' option into individual CUIs
# input:  none (the value is read from %lbdOptions)
# output: an array ref holding the start CUIs
sub _getStartCuis {
    my ($self) = @_;

    #break the option string on commas and hand the pieces back as a ref
    return [ split(',', $lbdOptions{'startCuis'}) ];
}

# splits the comma separated 'targetCuis' option into individual CUIs
# input:  none (the value is read from %lbdOptions)
# output: an array ref holding the target CUIs
sub _getTargetCuis {
    my ($self) = @_;

    #break the option string on commas and hand the pieces back as a ref
    return [ split(',', $lbdOptions{'targetCuis'}) ];
}

# transforms the string of accept types or groups into a hash of accept TUIs
# input:  a string specifying whether linking or target types are being defined
# output: a hash of acceptable TUIs
sub _getAcceptTypes {
    my $self = shift;
    my $stepString = shift; #either 'linking' or 'target'

    #get the accept types 
    my %acceptTypes = ();

    #add all types for groups specified
    my $string = $stepString.'AcceptGroups';
    if (defined $lbdOptions{$string}) {
	#accept groups were specified

lib/ALBD.pm  view on Meta::CPAN

##############################################################################
#        function to produce output
##############################################################################
# outputs the implicit terms to string
# input:  $scoresRef <- a reference to a hash of scores (hash{CUI}=score)
#         $ranksRef <- a reference to an array of CUIs ranked by their score
#         $printTo <- optional, outputs the $printTo top ranked terms. If not
#                     specified, all terms are output
# output: a line separated string containing ranked terms, scores, and their
#         preferred terms
sub _rankedTermsToString {
    my $self = shift;
    my $scoresRef = shift;
    my $ranksRef = shift;
    my $printTo = shift;

    #set printTo
    if (!$printTo) {
	$printTo = scalar @{$ranksRef};
    }
    

lib/ALBD.pm  view on Meta::CPAN

	$string .= "$name\n";
    }

    #return the string of ranked terms
    return $string;
}

# renders the LBD options currently held in %lbdOptions as text
# input : none
# output: a string starting with "Parameters:", followed by one
#         "key -> value" line per option (sorted by key) and a blank line
sub _parametersToString {
    my ($self) = @_;

    #LBD options, one line each, ordered by key
    my @optionLines = map {
	my $key = $_;
	"$key -> $lbdOptions{$key}\n";
    } sort keys %lbdOptions;

    #header, then the option lines, then the closing blank line
    return join('', "Parameters:\n", @optionLines, "\n");
    #association options? TODO
    #interface options? TODO
}


# reports the version of this module
# input : none
# output: the value of $VERSION
sub version {
    my ($self) = @_;    #invocant is accepted but not consulted
    return $VERSION;
}

##############################################################################
#        functions for debugging
##############################################################################
=comment
sub debugLBD {
    my $self = shift;
    my $startingCuisRef = shift;

    print "Starting CUIs = ".(join(',', @{$startingCuisRef}))."\n";

#Get the Explicit Matrix
    my ($explicitMatrixRef, $cuiToIndexRef, $indexToCuiRef, $matrixSize) = 
	Discovery::tableToSparseMatrix('N_11', $cuiFinder);
    print "Explicit Matrix:\n";
    _printMatrix($explicitMatrixRef, $matrixSize, $indexToCuiRef);

lib/ALBD.pm  view on Meta::CPAN

#Test other rank methods
    my $scoresRef = Rank::scoreImplicit_fromAllPairs($startingMatrixRef, $explicitMatrixRef, $implicitMatrixRef, $lbdOptions{rankingMethod}, $umls_association);
    my $ranksRef = Rank::rankDescending($scoresRef);
    print "Scores: \n";
    foreach my $cui (keys %{$scoresRef}) {
	print "   scores{$cui} = ${$scoresRef}{$cui}\n";
    }
    print "Ranks = ".join(',', @{$ranksRef})."\n";
}

sub _printMatrix {
    my $matrixRef = shift;
    my $matrixSize = shift;
    my $indexToCuiRef = shift;
    
    for (my $i = 0; $i < $matrixSize; $i++) {
	my $index1 = ${$indexToCuiRef}{$i};
	for (my $j = 0; $j < $matrixSize; $j++) {
	    my $printed = 0;
	    my $index2 = ${$indexToCuiRef}{$j};
	    my $hash1Ref =  ${$matrixRef}{$index1};

lib/LiteratureBasedDiscovery/Discovery.pm  view on Meta::CPAN


######################################################################
#           Functions to perform Literature Based Discovery
######################################################################


# gets the rows of the cuis from the matrix
# input:  $cuisRef <- an array reference to a list of CUIs
#         $matrixRef <- a reference to a co-occurrence matrix
# output: a hash ref to a sparse matrix containing just the rows retrieved
sub getRows {
    my $cuisRef = shift;
    my $matrixRef = shift;

    my %rows = ();
    my $rowRef;
    #add each cui row to the starting matrix
    foreach my $cui(@{$cuisRef}) {
	#if there is a row for this cui
	if (exists ${$matrixRef}{$cui}) {
	    $rowRef = ${$matrixRef}{$cui};

lib/LiteratureBasedDiscovery/Discovery.pm  view on Meta::CPAN

#NOTE...this is calculating B*A ... but is that appropriate?  ... I think that it is, but the values are maybe not so appropriate    ... B*A is nice because it makes the implicit matrix not keep track of non-starting cui rows.   ...but the values are...

# finds the implicit connections for all CUIs (based on squaring)
# It does this by multiplying $matrixB*$matrixA. If $matrix B is the starting
# matrix, and $matrixA is the explicitMatrix, this method works correctly and
# efficiently. $matrixA and $matrixB may also be the explicit matrix but
# this is more inefficient.
# input:  $matrixARef <- ref to a sparse matrix
#         $matrixBRef <- ref to a sparse matrix
# output: ref to a sparse matrix of the product of B*A
sub findImplicit {
    my $matrixARef = shift; 
    my $matrixBRef = shift;

    my %product = ();
    #loop over the rows of the B matrix
    foreach my $key0 (keys %{$matrixBRef}) {  

	#loop over row
	foreach my $key1 (keys %{$matrixARef}) {	

lib/LiteratureBasedDiscovery/Discovery.pm  view on Meta::CPAN

    return \%product;
}


# removes explicit connections from the matrix of implicit connections by 
# removing keys (O(k), where k is the number of keys in the explicit matrix,
# we expect the explicit k to be smaller than the implicit k)
# input: $explicitMatrixRef <- reference to the explicit knowledge matrix
#        $implicitMatrixRef <- reference to the implicit knowledge matrix
# output: ref to the implicit matrix with explicit knowledge removed
sub removeExplicit {
    my $explicitMatrixRef = shift;
    my $implicitMatrixRef = shift;

    #Check each key of the explicit matrix to see if it exists
    # in the implicit matrix
    foreach my $key1(keys %{$explicitMatrixRef}) {
	if (exists ${$implicitMatrixRef}{$key1}) {
	    foreach my $key2(keys %{${$explicitMatrixRef}{$key1}}) {
		if (exists ${${$implicitMatrixRef}{$key1}}{$key2}) {
		    delete ${${$implicitMatrixRef}{$key1}}{$key2};

lib/LiteratureBasedDiscovery/Discovery.pm  view on Meta::CPAN

	}
    }
    return $implicitMatrixRef;
}


# loads a tab separated file as a sparse matrix (a hash of hashes)
#    each line of the file contains CUI1 <TAB> CUI2 <TAB> Count
# input:  the filename containing the data
# output: a hash ref to the sparse matrix (${$hash{$index1}}{$index2} = value)
sub fileToSparseMatrix {
    my $fileName = shift;

    open IN, $fileName or die ("unable to open file: $fileName\n");
    my %matrix = ();
    my ($cui1,$cui2,$val);
    while (my $line = <IN>) {
	chomp $line;
	($cui1,$cui2,$val) = split(/\t/,$line);
	
	if (!exists $matrix{$cui1}) {

lib/LiteratureBasedDiscovery/Discovery.pm  view on Meta::CPAN

    }
    close IN;
    return \%matrix;
}

# outputs the matrix to the output file in sparse matrix format, which
# is a file containing rowKey\tcolKey\tvalue
# input:  $outFile - a string specifying the output file
#         $matrixRef - a ref to the sparse matrix containing the data
# output: nothing, but the matrix is output to file
sub outputMatrixToFile {
    my $outFile = shift;
    my $matrixRef = shift;
    
    #open the output file and output fhe matrx
    open OUT, ">$outFile" or die ("Error opening matrix output file: $outFile\n");
    my $rowRef;
    foreach my $rowKey (keys %{$matrixRef}) {
	$rowRef = ${$matrixRef}{$rowKey};
	foreach my $colKey (keys %{$rowRef}) {
	    print OUT "$rowKey\t$colKey\t${$rowRef}{$colKey}\n";

lib/LiteratureBasedDiscovery/Discovery.pm  view on Meta::CPAN

}


#Note: Table to sparse is no longer used, but could be useful in the future
=comment
#  retrieve a table from mysql and convert it to a sparse matrix (a hash of
#     hashes)
#  input : $tableName <- the name of the table to output
#          $cuiFinder <- an instance of UMLS::Interface::CuiFinder
#  output: a hash ref to the sparse matrix (${$hash{$index1}}{$index2} = value)
sub tableToSparseMatrix {
    my $tableName = shift;
    my $cuiFinder = shift;

    # check tableName
    #TODO check that the table exists in the database
    # or die "Error: table does not exist: $tableName\n";

    #  set up database
    my $db = $cuiFinder->_getDB(); 
    

lib/LiteratureBasedDiscovery/Evaluation.pm  view on Meta::CPAN


package Evaluation;
use strict;
use warnings;

# Timeslicing evaluation that calculates the precision of LBD 
# (O(k), where k is the number of keys in results)
# input: $resultsMatrixRef <- ref a matrix of LBD results
#        $goldMatrixRef <- ref to a gold standard matrix
# output: the precision of results
sub calculatePrecision {
    my $resultsMatrixRef = shift;
    my $goldMatrixRef = shift;

    # calculate the precision which is the percentage of results that are 
    # are in the gold standard
    # (percent of generated that is gold)
    my $count = 0;
    foreach my $key(keys %{$resultsMatrixRef}) {
	if (exists ${$goldMatrixRef}{$key}) {
	    $count++;

lib/LiteratureBasedDiscovery/Evaluation.pm  view on Meta::CPAN

    }
    return $count/(scalar keys %{$resultsMatrixRef});

}

# Timeslicing evaluation that calculates the recall of LBD
# (O(k), where k is the number of keys in gold)
# input: $resultsMatrixRef <- ref a matrix of LBD results
#        $goldMatrixRef <- ref to a gold standard matrix
# output: the recall of results
sub calculateRecall {
    my $resultsMatrixRef = shift;
    my $goldMatrixRef = shift;
    
    # calculate the recall which is the percentage of knowledge in the gold
    # standard that was generated by the LBD system 
    # (percent of gold that is generated)
    my $count = 0;
    foreach my $key(keys %{$goldMatrixRef}) {
	if (exists ${$resultsMatrixRef}{$key}) {
	    $count++;

lib/LiteratureBasedDiscovery/Filters.pm  view on Meta::CPAN


use UMLS::Interface;

# applies a semantic group filter to the matrix, by removing keys that
# are not of an allowed semantic type. Eliminates both rows and columns, so
# it is applied to the full explicit matrix
# input:  $matrixRef <- ref to a sparse matrix to be filtered
#         $acceptTypesRef <- a ref to a hash of accept type strings
#         $umls <- an instance of UMLS::Interface
# output: None, but $matrixRef is updated
sub semanticTypeFilter_rowsAndColumns {
    my $matrixRef = shift;
    my $acceptTypesRef = shift;
    my $umls = shift;
 
=comment   
    #Count the number of keys before and after filtering (for debugging)
    my %termsHash = ();
    foreach my $key1 (keys %{$matrixRef}) {
	foreach my $key2 (keys %{${$matrixRef}{$key1}}) {
	    $termsHash{$key2} = 1;

lib/LiteratureBasedDiscovery/Filters.pm  view on Meta::CPAN



# applies a semantic group filter to the matrix, by removing keys that
# are not of an allowed semantic type. Only removes types from rows,
# so it is applied for time slicing, before randomly selecting terms of
# one semantic type
# input:  $matrixRef <- ref to a sparse matrix to be filtered
#         $acceptTypesRef <- a ref to a hash of accept type strings
#         $umls <- an instance of UMLS::Interface
# output: None, but $matrixRef is updated
sub semanticTypeFilter_rows {
    my $matrixRef = shift;
    my $acceptTypesRef = shift;
    my $umls = shift;
    
=comment
    #Count the number of keys before and after filtering (for debugging)
    my %termsHash = ();
    foreach my $key1 (keys %{$matrixRef}) {
	foreach my $key2 (keys %{${$matrixRef}{$key1}}) {
	    $termsHash{$key2} = 1;

lib/LiteratureBasedDiscovery/Filters.pm  view on Meta::CPAN



# applies a semantic group filter to the matrix, by removing keys that
# are not of an allowed semantic type. Only removes types from columns,
# so it is applied to the implicit matrix (starting term rows with implicit
# columns).
# input:  $matrixRef <- ref to a sparse matrix to be filtered
#         $acceptTypesRef <- a ref to a hash of accept type strings
#         $umls <- an instance of UMLS::Interface
# output: None, but $matrixRef is updated
sub semanticTypeFilter_columns {
    my $matrixRef = shift;
    my $acceptTypesRef = shift;
    my $umls = shift;
 
=comment   
    #Count the number of keys before and after filtering (for debugging)
    my %termsHash = ();
    foreach my $key1 (keys %{$matrixRef}) {
	foreach my $key2 (keys %{${$matrixRef}{$key1}}) {
	    $termsHash{$key2} = 1;

lib/LiteratureBasedDiscovery/Filters.pm  view on Meta::CPAN

    }
    print "   number of keys after filtering = ".(scalar keys %termsHash)."\n";
=cut

}

# gets the semantic types of the group
# input:  $group <- a string specifying a semantic group
#         $umls <- an instance of UMLS::Interface
# output: a ref to a hash of TUIs
sub getTypesOfGroup {
    my $group = shift;
    my $umls = shift;

    #add each type of the group to the set of accept types
    my %acceptTuis = ();
    my @groupTypes = @{ $umls->getStsFromSg($group) };
    foreach my $abr(@groupTypes) {
	#check that it is defined (types that are no longer in 
	#the UMLS may be returned as part of the group)
	if (defined $abr) {

lib/LiteratureBasedDiscovery/Filters.pm  view on Meta::CPAN

	    $acceptTuis{$tui} = 1;
	}
    }

    return \%acceptTuis;
}

# collects the TUI of every semantic type known to the UMLS
# input:  $umls <- an instance of UMLS::Interface
# output: a ref to an array of upper-cased TUIs, in the order in which
#         getAllSts lists the semantic type abbreviations
sub getAllTypes {
    my ($umls) = @_;

    #getAllSts yields a ref to the list of semantic type abbreviations
    my $abrRef = $umls->getAllSts();

    #look up the TUI of each abbreviation and upper-case it
    my @tuis = map { uc($umls->getStTui($_)) } @{$abrRef};

    return \@tuis;
}

# fetches all semantic groups of the UMLS
# input:  $umls <- an instance of UMLS::Interface
# output: whatever getAllSemanticGroups returns when called in scalar
#         context (a ref to a hash of semantic groups)
sub getAllGroups {
    my ($umls) = @_;

    #force scalar context so exactly one value is passed back to the caller
    return scalar $umls->getAllSemanticGroups();
}

1;

lib/LiteratureBasedDiscovery/Rank.pm  view on Meta::CPAN


# scores each implicit CUI using an association measure, but the input to
# the association measure is based on linking term counts, rather than
# co-occurrence counts.
# input:  $startingMatrixRef <- ref to the starting matrix
#         $explicitMatrixRef <- ref to the explicit matrix
#         $implicitMatrixRef <- ref to the implicit matrix
#         $measure <- the string of the umls association measure to use
#         $association <- an instance of umls association
# output: a hash ref of scores for each implicit key. (hash{cui} = score)
sub scoreImplicit_ltcAssociation {
    my $startingMatrixRef = shift;
    my $explicitMatrixRef = shift;
    my $implicitMatrixRef = shift;
    my $measure = shift;
    my $association = shift;

    #bTerms to calculate n1p (number of unique co-occurring terms)
    my %bTerms = ();
    my $rowRef;
    foreach my $rowKey (keys %{$startingMatrixRef}) {

lib/LiteratureBasedDiscovery/Rank.pm  view on Meta::CPAN

#                         key is the a,b cui pair (e.g. hash{'C00,C11'})
#                         values are their score
#
#         Optional Input for passing in precalculated stats
#         so that they don't have to get recalculated each time
#         such as in timeslicing
#         $n1pRef <- hashRef where key is a cui, value is n1p
#         $np1Ref <- hashRef where key is a cui, value is np1
#         $npp <- scalar = value of npp
# output: a hash ref of scores for each implicit key. (hash{cui} = score)
sub scoreImplicit_averageMinimumWeight {
    #grab input
    my $startingMatrixRef = shift;
    my $explicitMatrixRef = shift;
    my $implicitMatrixRef = shift;
    my $measure = shift;
    my $association = shift;
    my $abScoresRef = shift;

    #optionally pass in stats so they don't get recalculated for
    # multiple terms (such as with time slicing)

lib/LiteratureBasedDiscovery/Rank.pm  view on Meta::CPAN

#         $abScoresRef <- hashRef of the a to b scores used in AMW
#                         key is the a,b cui pair (e.g. hash{'C00,C11'})
#                         values are their score
#         Optional Input for passing in precalculated stats
#         so that they don't have to get recalculated each time
#         such as in timeslicing
#         $n1pRef <- hashRef where key is a cui, value is n1p
#         $np1Ref <- hashRef where key is a cui, value is np1
#         $npp <- scalar = value of npp
# output: a hash ref of scores for each implicit key. (hash{cui} = score)
sub scoreImplicit_LTC_AMW {
    #grab the input
    my $startingMatrixRef = shift;
    my $explicitMatrixRef = shift;
    my $implicitMatrixRef = shift;
    my $measure = shift;
    my $association = shift;
    my $abScoresRef = shift;

    #optionally pass in stats so they don't get recalculated for
    # multiple terms (such as with time slicing)

lib/LiteratureBasedDiscovery/Rank.pm  view on Meta::CPAN

    #return the scores
    return \%ltcAMWScores;
}

#TODO this is an untested method
# gets the max cosine distance score between all a terms and each cTerm 
# input:  $startingMatrixRef <- ref to the starting matrix
#         $explicitMatrixRef <- ref to the explicit matrix
#         $implicitMatrixRef <- ref to the implicit matrix
# output: a hash ref of scores for each implicit key. (hash{cui} = score)
sub score_cosineDistance {
    #LBD Info
    my $startingMatrixRef = shift;
    my $explicitMatrixRef = shift;
    my $implicitMatrixRef = shift;

    #get all the A->C pairs
    my $acPairsRef = &_getACPairs($startingMatrixRef, $implicitMatrixRef);
    my %scores = ();
    foreach my $pairKey (keys %{$acPairsRef}) {
	#get the A and C keys

lib/LiteratureBasedDiscovery/Rank.pm  view on Meta::CPAN

    }
    
    return \%scores;
}

# gets a list of A->C pairs, and sets the value as the implicit matrix value
# input:  $startingMatrixRef <- ref to the starting matrix
#         $implicitMatrixRef <- ref to the implicit matrix
# output: a hash ref where keys are comma separated cui pairs hash{'C000,C111'}
#         and values are set to the value at that index in the implicit matrix
sub _getACPairs {
    my $startingMatrixRef = shift;
    my $implicitMatrixRef = shift;

    #generate a list of ac pairs
    my %acPairs = ();
    foreach my $keyA (keys %{$implicitMatrixRef}) {
	foreach my $keyC (%{${$implicitMatrixRef}{$keyA}}) {
	    $acPairs{$keyA,$keyC} = ${${$implicitMatrixRef}{$keyA}}{$keyC};
	}
    }

lib/LiteratureBasedDiscovery/Rank.pm  view on Meta::CPAN


}


# scores each implicit CUI based on the number of linking terms between
# it and all starting terms.
# input:  $startingMatrixRef <- ref to the starting matrix
#         $explicitMatrixRef <- ref to the explicit matrix
#         $implicitMatrixRef <- ref to the implicit matrix
# output: a hash ref of scores for each implicit key. (hash{cui} = score)
sub scoreImplicit_linkingTermCount {
    #LBD Info
    my $startingMatrixRef = shift;
    my $explicitMatrixRef = shift;
    my $implicitMatrixRef = shift;

    #get all bc pairs
    my $bcPairsRef = &_getBCPairs($startingMatrixRef, $explicitMatrixRef, $implicitMatrixRef);

    # Find the linking term count for each cTerm
    my %scores = ();

lib/LiteratureBasedDiscovery/Rank.pm  view on Meta::CPAN

    return \%scores;
}


# scores each implicit CUI based on the summed frequency of co-occurrence
# between it and all B terms (A->B frequencies are NOT considered)
# input:  $startingMatrixRef <- ref to the starting matrix
#         $explicitMatrixRef <- ref to the explicit matrix
#         $implicitMatrixRef <- ref to the implicit matrix
# output: a hash ref of scores for each implicit key. (hash{cui} = score)
sub scoreImplicit_frequency {
    #LBD Info
    my $startingMatrixRef = shift;
    my $explicitMatrixRef = shift;
    my $implicitMatrixRef = shift;

    #get all bc pairs
    my $bcPairsRef = &_getBCPairs($startingMatrixRef, $explicitMatrixRef, $implicitMatrixRef);

    # Find the frequency count for each cTerm
    my %scores = ();

lib/LiteratureBasedDiscovery/Rank.pm  view on Meta::CPAN


# scores each implicit CUI using an association measure. Score is the maximum
# association between a column in the implicit matrix, and one of the start 
# matrix terms (so max between any A and that C term). 
# Score is calculated using the implicit matrix
# input:  $startCuisRef <- ref to an array of start cuis (A terms)
#         $implicitMatrixFileName <- fileName of the implicit matrix
#         $measure <- the string of the umls association measure to use
#         $association <- an instance of umls association
# output: a hash ref of scores for each implicit key. (hash{cui} = score)
sub scoreImplicit_fromImplicitMatrix {
    #LBD Info
    my $startCuisRef = shift;
    my $implicitMatrixFileName = shift;
    my $measure = shift;
    my $association = shift;

######################################
    #Get hashes for A and C terms
#####################################
    #create a hash of starting terms

lib/LiteratureBasedDiscovery/Rank.pm  view on Meta::CPAN

}

# scores each implicit CUI using an association measure. Score is the maximum
# association between any of the linking terms.
# input:  $startingMatrixRef <- ref to the starting matrix
#         $explicitMatrixRef <- ref to the explicit matrix
#         $implicitMatrixRef <- ref to the implicit matrix
#         $measure <- the string of the umls association measure to use
#         $association <- an instance of umls association
# output: a hash ref of scores for each implicit key. (hash{cui} = score)
sub scoreImplicit_fromAllPairs {
    #LBD Info
    my $startingMatrixRef = shift;
    my $explicitMatrixRef = shift;
    my $implicitMatrixRef = shift;
    my $measure = shift;
    my $association = shift;

    #optionally pass in stats so they don't get recalculated for
    # multiple terms (such as with time slicing)
    my $n1pRef = shift;

lib/LiteratureBasedDiscovery/Rank.pm  view on Meta::CPAN

	    elsif (${$bcPairsRef}{$pairKey} > $scores{$key2}) {
		$scores{$key2} = ${$bcPairsRef}{$pairKey}
	    }
	}
    }
    
    return \%scores;
}


# placeholder: this sub has no implementation; it ignores any arguments
# and returns nothing (empty list / undef in scalar context)
sub scoreImplicit_minimumWeightAssociation {
    return;
}


#XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
#XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX



# Builds a list of B->C term pairs that also co-occur with A terms
# Only adds B->C term pairs for C terms that are also present in the 
# implicitMatrix.
# The value of the bcPairs Hash is the value in the explicit matrix 
# for that pair.
# input:  $startingMatrixRef <- ref to the starting matrix
#         $explicitMatrixRef <- ref to the explicit matrix
#         $implicitMatrixRef <- ref to the implicit matrix
# output: a hash ref of BC term pairs. Each key is "$bTerm,$cTerm", 
#         value is by default the frequency of BC co-occurrences in the 
#         matrix
sub _getBCPairs {
    my $startingMatrixRef = shift;
    my $explicitMatrixRef = shift;
    my $implicitMatrixRef = shift;

    #get all bTerms
    my %bTerms = ();
    my $rowRef;
    foreach my $rowKey (keys %{$startingMatrixRef}) {
	$rowRef = ${$startingMatrixRef}{$rowKey};
	foreach my $colKey (keys %{$rowRef}) {

lib/LiteratureBasedDiscovery/Rank.pm  view on Meta::CPAN

	    }
	}
    }
    return \%bcPairs;
}


# ranks the scores in descending order
# input: $scoresRef <- a hash ref to a hash of cuis and scores (hash{cui} = score)
# output: an array ref of the ranked cuis in descending order
sub rankDescending {
    #grab the input
    my $scoresRef = shift;

    #order in descending order, and use the CUI string as a tiebreaker
    my @rankedCuis = ();
    my @tiedCuis = ();
    my $currentScore = -1;
    foreach my $cui (
	#sort function to sort by value
	sort {${$scoresRef}{$b} <=> ${$scoresRef}{$a}} 

lib/LiteratureBasedDiscovery/Rank.pm  view on Meta::CPAN

#XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
#XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX

# gets association scores for a set of cui pairs 
# input:  $cuiPairsRef <- reference to a hash of pairs of matrix indices (key = '1,2')
#         $matrixRef <- a reference to a sparse matrix of n11 values
#         $measure <- the association measure to perform
#         $association <- an instance of UMLS::Association
# output: none, but the cuiPairs ref has values updated to reflect the
#         computed association score
sub getBatchAssociationScores {
    my $cuiPairsRef = shift;
    my $matrixRef = shift;
    my $measure = shift;
    my $association = shift;
    
    #optionally pass in $n1pRef, $np1Ref, and $npp
    # do this if they get calculated multiple times
    # (such as with time slicing)
    my $n1pRef = shift;
    my $np1Ref = shift;

lib/LiteratureBasedDiscovery/Rank.pm  view on Meta::CPAN


# gets NP1, N1P, and NPP for all CUIs. This is used in time-
# slicing and makes it much faster than getting stats individually
# for each starting term
# input:  $matrixRef <- ref to the co-occurrence matrix (the sparse matrix 
#                       of n11 values)
# output: \@vals <- an array ref of three values:
#                   \%n1p - a hash ref where the key is a cui and value is n1p
#                   \%np1 - a hash ref where the key is a cui and value is np1
#                   $npp - a scalar of npp
sub getAllStats {
    my $matrixRef = shift;

    #get all np1, n1p, and npp values of values for each cui
    my %np1 = ();
    my %n1p = ();
    my $npp = 0;
    my $val;
    foreach my $key1 (keys %{$matrixRef}) {
	foreach my $key2 (keys %{${$matrixRef}{$key1}}) {
	    $val = ${${$matrixRef}{$key1}}{$key2};

lib/LiteratureBasedDiscovery/TimeSlicing.pm  view on Meta::CPAN

# Calculates and outputs to STDOUT Time Slicing evaluation stats of
# precision and recall at $numIntervals intervals, Mean Average Precision
# (MAP), precision at k, and frequency at k
# input:  $trueMatrixRef <- a ref to a hash of true discoveries
#         $rowRanksRef <- a ref to a hash of arrays of ranked predictions. 
#                         Each hash key is a cui,  each hash element is an 
#                         array of ranked predictions for that cui. The ranked 
#                         predictions are cuis are ordered in descending order 
#                         based on association. (from Rank::rankDescending)
#         $numIntervals <- the number of recall intervals to generate
sub outputTimeSlicingResults {
    #grab the input
    my $goldMatrixRef = shift;
    my $rowRanksRef = shift;
    my $numIntervals = shift;


#calculate and output stats
#------------------------------------------

 #calculate precision and recall

lib/LiteratureBasedDiscovery/TimeSlicing.pm  view on Meta::CPAN

    }
    print "\n";

}


# loads a list of cuis for use in time slicing from file
# the CUI file contains a line-separated list of CUIs
# input:  $cuiFileName <- a string specifying the file to load cuis from
# output: \%cuis <- a ref to a hash of cuis, each key is a cui, values are 1
sub loadCUIs {
    my $cuiFileName = shift;
    
    #open the file
    open IN, $cuiFileName 
	or die("ERROR: cannot open CUI File: $cuiFileName\n");
    
    #read each line of the file
    my %cuis = ();
    while (my $line = <IN>) {
	chomp $line;

lib/LiteratureBasedDiscovery/TimeSlicing.pm  view on Meta::CPAN

}


# calculates average precision and recall of the generated implicit matrix 
# compared to the post cutoff matrix
# input:  $predictionsMatrixRef <- a ref to a sparse matrix of predicted 
#                                  discoveries
#         $trueMatrixRef <- a ref to a sparse matrix of true discoveries
# output: ($precision, $recall) <- two scalar values specifying the precision 
#                                  and recall
sub calculatePrecisionRecall {
    my $predictionsMatrixRef = shift; #a matrix of predicted discoveries
    my $trueMatrixRef = shift; #a matrix of true discoveries
    print "calculating precision and recall\n";

    #bounds check, the predictions matrix must contain keys
    if ((scalar keys %{$predictionsMatrixRef}) < 1) {
	return (0,0); #precision and recall are both zero
    }

    #calculate precision and recall averaged over each cui

lib/LiteratureBasedDiscovery/TimeSlicing.pm  view on Meta::CPAN

}


# loads the post cutoff matrix from file. Only loads rows corresponding
# to rows in the starting matrix ref to save memory, and because those are 
# the only rows that are needed.
# input:  $startingMatrixRef <- a ref to the starting sparse matrix
#         $explicitMatrixRef <- a ref to the explicit sparse matrix
#         $postCutoffFileName <- the filename to the postCutoffMatrix
# output: \%postCutoffMatrix <- a ref to the postCutoff sparse matrix
sub loadPostCutOffMatrix {
    my $startingMatrixRef = shift;
    my $explicitMatrixRef = shift;
    my $postCutoffFileName = shift;
    print "loading postCutoff Matrix\n";
    
    #open the post cutoff file
    open IN, $postCutoffFileName 
	or die ("ERROR: cannot open post cutoff file: $postCutoffFileName");

    #create hash of cuis to grab

lib/LiteratureBasedDiscovery/TimeSlicing.pm  view on Meta::CPAN


#TODO numRows should be read from file and sent with the lbdOptionsRef
# generates a starting matrix of numRows randomly selected terms
# input:  $explicitMatrixRef <- a ref to the explicit sparse matrix
#         $lbdOptionsRef <- the LBD options
#         $startTermAcceptTypesRef <- a reference to a hash of accept
#                                     types for start terms (TUIs)
#         $numRows <- the number of random rows to load (if random)
#         $umls_interface <- an instance of the UMLS::Interface
# output: \%startingMatrix <- a ref to the starting sparse matrix
sub generateStartingMatrix {
    my $explicitMatrixRef = shift;
    my $lbdOptionsRef = shift;
    my $startTermAcceptTypesRef = shift;
    my $numRows = shift;
    my $umls_interface = shift;

    #generate the starting matrix randomly or from a file
    my %startingMatrix = ();

    #check if a file is defined

lib/LiteratureBasedDiscovery/TimeSlicing.pm  view on Meta::CPAN

}


# gets and returns a hash of row keys of the specified semantic types
# input:  $matrixRef <- a ref to a sparse matrix
#         $acceptTypesRef <- a ref to a hash of accept types (TUIs)
#         $umls <- an instance of UMLS::Interface
# output: \%rowsToKeep <- a ref to hash of rows to keep, each key is 
#                         a CUI, and values are 1. All CUIs specify rows
#                         of acceptable semantic types
sub getRowsOfSemanticTypes {
    my $matrixRef = shift;
    my $acceptTypesRef = shift;
    my $umls = shift;
    
    #loop through the matrix and keep the rows that are of the 
    # desired semantic types
    my %rowsToKeep = ();
    foreach my $cui1 (keys %{$matrixRef}) {
	my $typesRef = $umls->getSt($cui1);
	foreach my $type(@{$typesRef}) {

lib/LiteratureBasedDiscovery/TimeSlicing.pm  view on Meta::CPAN

# between the $rowKey and $colKey. All co-occurring cui pairs from the matrix
# are calculated
# input:  $matrixRef <- a reference to a sparse matrix
#         $rankingMeasure <- a string specifying the ranking measure to use
#         $umls_association <- an instance of UMLS::Association
# output: \%cuiPairs <- a ref to a hash of CUI pairs and their association
#                       scores. Each key of the hash is a comma-separated
#                       string containing cui1 and cui2 of the pair
#                       (e.g. 'cui1,cui2'), and each value is their association
#                       score using the specified association measure
sub getAssociationScores {
    my $matrixRef = shift;
    my $rankingMeasure = shift;
    my $umls_association = shift;
    print "   getting Association Scores, rankingMeasure = $rankingMeasure\n";
    
    #generate a list of cui pairs in the matrix
    my %cuiPairs = ();
    print "   generating association scores:\n";
    foreach my $rowKey (keys %{$matrixRef}) {
	foreach my $colKey (keys %{${$matrixRef}{$rowKey}}) {

lib/LiteratureBasedDiscovery/TimeSlicing.pm  view on Meta::CPAN

	Rank::getBatchAssociationScores(\%cuiPairs, $matrixRef, $rankingMeasure, $umls_association);
	return \%cuiPairs;
    }
}

# gets the min and max value of a hash
# returns a two element array, where the first value is the min, and
# the second value is the max
# input:  $hashref <- a reference to a hash with numbers as values
# output: ($min, $max) <- the minimum and maximum values in the hash
sub getMinMax {
    my $hashRef = shift;
    
    #loop through each key and record the min/max
    my $min = 999999;
    my $max = -999999;
    foreach my $key (keys %{$hashRef}) {
	my $val = ${$hashRef}{$key};
	if ($val < $min) {
	    $min = $val;
	}

lib/LiteratureBasedDiscovery/TimeSlicing.pm  view on Meta::CPAN

# hash. Any keys less than the threshold are not copied to the new matrix
# input:  $threshold <- a scalar threshold
#         $assocScoresRef <- a reference to a cui pair hash of association
#                            scores. Each key is a comma-separated cui pair
#                            (e.g. 'cui1,cui2'), values are their association
#                            scores.
#         $matrixRef <- a reference to a co-occurrence sparse matrix that 
#                       corresponds to the assocScoresRef
# output: \%thresholdedMatrix <- a ref to a new matrix, built from the
#         $matrixRef after applying the $threshold
sub applyThreshold {
    my $threshold = shift;
    my $assocScoresRef = shift;
    my $matrixRef = shift;

    #apply the threshold
    my $preKeyCount = scalar keys %{$assocScoresRef};
    my $postKeyCount = 0;
    my %thresholdedMatrix = ();
    my ($cui1, $cui2);
    foreach my $key (keys %{$assocScoresRef}) {

lib/LiteratureBasedDiscovery/TimeSlicing.pm  view on Meta::CPAN

# of samples. Used in explicit timeslicing
# input:  $k <- the number of samples to get
#         $assocScoresRef <- a reference to a cui pair hash of association
#                            scores. Each key is a comma-separated cui pair
#                            (e.g. 'cui1,cui2'), values are their association
#                            scores.
#         $matrixRef <- a reference to a co-occurrence sparse matrix that 
#                       corresponds to the assocScoresRef
# output: \%thresholdedMatrix <- a ref to a sparse matrix containing only the 
#                                $k ranked samples (cui pairs)
sub grabKHighestRankedSamples {
    my $k = shift;
    my $assocScoresRef = shift;
    my $matrixRef = shift;
    print "getting $k highest ranked samples\n";

    #apply the threshold
    my $preKeyCount = scalar keys %{$assocScoresRef};
    my $postKeyCount = 0;
    my %thresholdedMatrix = ();

lib/LiteratureBasedDiscovery/TimeSlicing.pm  view on Meta::CPAN

#         $rowRanksRef <- a ref to a hash of arrays of ranked predictions. 
#                         Each hash key is a cui,  each hash element is an 
#                         array of ranked predictions for that cui. The ranked
#                         predictions are cuis ordered in descending order
#                         based on association. (from Rank::RankDescending)
#         $numIntervals <- the number of recall intervals to generate
# output: (\%precision, \%recall) <- refs to hashes of precision and recall. 
#                                    Each hash key is the interval number, and 
#                                    the value is the precision and recall 
#                                    respectively
sub calculatePrecisionAndRecall_implicit {
    my $trueMatrixRef = shift; #a ref to the true matrix
    my $rowRanksRef = shift; #a ref to ranked predictions, each hash element are the predictions for a single cui, at each element is an array of cuis ordered by their rank
    my $numIntervals = shift; #the recall intervals to test at

    #find precision and recall curves for each cui that is being predicted
    #  take the sum of precisions, then average after the loop
    my %precision = ();
    my %recall = ();
    foreach my $rowKey (keys %{$trueMatrixRef}) {
	my $trueRef = ${$trueMatrixRef}{$rowKey}; #a list of true discoveries

lib/LiteratureBasedDiscovery/TimeSlicing.pm  view on Meta::CPAN



# calculates the mean average precision (MAP)
# input:  $trueMatrixRef <- a ref to a hash of true discoveries
#         $rowRanksRef <- a ref to a hash of arrays of ranked predictions. 
#                         Each hash key is a cui,  each hash element is an 
#                         array of ranked predictions for that cui. The ranked
#                         predictions are cuis ordered in descending order
#                         based on association. (from Rank::RankDescending)
# output: $map <- a scalar value of mean average precision (MAP)
sub calculateMeanAveragePrecision {
    #grab the input
    my $trueMatrixRef = shift; # a matrix of true discoveries
    my $rowRanksRef = shift; # a hash of ranked predicted discoveries
    print "calculating mean average precision\n";

    #calculate MAP for each true discovery being predicted
    my $map = 0;
    foreach my $rowKey (keys %{$trueMatrixRef}) {
	my $rankedPredictionsRef = ${$rowRanksRef}{$rowKey}; #an array ref of ranked predictions

lib/LiteratureBasedDiscovery/TimeSlicing.pm  view on Meta::CPAN

# from k = 1-10 and intervals of 10 for 10-100
# input:  $trueMatrixRef <- a ref to a hash of true discoveries
#         $rowRanksRef <- a ref to a hash of arrays of ranked predictions. 
#                         Each hash key is a cui,  each hash element is an 
#                         array of ranked predictions for that cui. The ranked
#                         predictions are cuis ordered in descending order
#                         based on association. (from Rank::RankDescending)
# output: \%meanPrecision <- a hash of mean precisions at k, each key is the
#                            value of k, and the value is the precision at that
#                            k
sub calculatePrecisionAtK {
    #grab the input
    my $trueMatrixRef = shift; # a matrix of true discoveries
    my $rowRanksRef = shift; # a hash of ranked predicted discoveries
  
    #generate precision at k at intervals of 10 for k = 10-100
    my %meanPrecision = ();
    my $interval = 1;
    for (my $k = 1; $k <= 100; $k+=$interval) {
	$meanPrecision{$k} = 0;

lib/LiteratureBasedDiscovery/TimeSlicing.pm  view on Meta::CPAN

# for 10-100. Co-occurrence counts are averaged over each of the starting terms
# input:  $trueMatrixRef <- a ref to a hash of true discoveries
#         $rowRanksRef <- a ref to a hash of arrays of ranked predictions. 
#                         Each hash key is a cui,  each hash element is an 
#                         array of ranked predictions for that cui. The ranked
#                         predictions are cuis ordered in descending order
#                         based on association. (from Rank::RankDescending)
# output: \%meanCooccurrenceCounts <- a hash of mean co-occurrence counts at
#                                     k, each key is the value of k, and the
#                                     value is the mean co-occurrence count
#                                     at that k
sub calculateMeanCooccurrencesAtK {
    #grab the input
    my $trueMatrixRef = shift; # a matrix of true discoveries
    my $rowRanksRef = shift; # a hash of ranked predicted discoveries
  
    #generate mean cooccurrences at k at intervals of 10 for k = 10-100
    my %meanCooccurrenceCount = (); #count of the number of co-occurrences for each k
    my $interval = 1;
    for (my $k = 1; $k <= 100; $k+=$interval) {
	$meanCooccurrenceCount{$k} = 0;

t/test.t  view on Meta::CPAN

    }
}
ok($fAtKSame == 1, "Frequency at K Matches");

print "Done with Time Slicing Tests\n";



############################################################
#function to read in time slicing data values
sub readTimeSlicingData {
    my $fileName = shift;

    #read in the gold time slicing values
    my @APScores = ();
    my $MAP;
    my @PAtKScores = ();
    my @FAtKScores = ();
    open IN, "$fileName" 
    #open IN, './t/goldSampleTimeSliceOutput'
	or die ("Error: Cannot open timeSliceOutput: $fileName\n");

utils/datasetCreator/applyMaxThreshold.pl  view on Meta::CPAN

my $inputFile = '/home/henryst/lbdData/groupedData/reg/1975_1999_window8_noOrder';
my $outputFile = '/home/henryst/lbdData/groupedData/1975_1999_window8_noOrder_threshold5000u';
my $maxThreshold = 5000;
my $applyToUnique = 1;
my $countRef = &getStats($inputFile, $applyToUnique);
&applyMaxThreshold($inputFile, $outputFile, $maxThreshold, $countRef);


# gets co-occurrence stats, returns a hash of (unique) co-occurrence counts 
# for each CUI. (count is unique or not depending on $applyToUnique)
sub getStats {
    my $inputFile = shift;
    my $applyToUnique = shift;

    #open files
    open IN, $inputFile or die("ERROR: unable to open inputFile\n");   

    print "Getting Stats\n";
    #count stats for each line of the file
    my ($cui1, $cui2, $val);
    my %count = (); #a count of the number of (unique) co-occurrences

utils/datasetCreator/applyMaxThreshold.pl  view on Meta::CPAN

	#does not matter, the matrix will have been pre-processed to ensure 
	#the second cui will appear first in the key. In the case where order 
	#does matter we just shouldnt be counting it anyway
    }
    close IN;

    return \%count;
}

#applies a maxThreshold, $countRef is the output of getStats
sub applyMaxThreshold {
    my $inputFile = shift;
    my $outputFile = shift;
    my $maxThreshold = shift;
    my $countRef = shift;

    #open the input and output
    open IN, $inputFile or die("ERROR: unable to open inputFile\n");
    open OUT, ">$outputFile" 
	or die ("ERROR: unable to open outputFile: $outputFile\n");

utils/datasetCreator/applyMinThreshold.pl  view on Meta::CPAN

#$minThreshold number of co-occurrences

my $minThreshold = 5;
my $inputFile = '/home/henryst/1975_2015_window8_noOrder_preThresh';
my $outputFile = '/home/henryst/1975_2015_window8_noOrder_threshold'.$minThreshold;
&applyMinThreshold($minThreshold, $inputFile, $outputFile);


############

sub applyMinThreshold {
    #grab the input
    my $minThreshold = shift;
    my $inputFile = shift;
    my $outputFile = shift;

    #open files
    open IN, $inputFile or die("ERROR: unable to open inputFile\n");
    open OUT, ">$outputFile" 
	or die ("ERROR: unable to open outputFile: $outputFile\n");

utils/datasetCreator/applySemanticFilter.pl  view on Meta::CPAN


&applySemanticFilter($matrixFileName, $outputFileName, 
		     $acceptTypesString, $acceptGroupsString,



###################################################################
###################################################################

# Applies the semantic type filter
sub applySemanticFilter {
    #grab the input
    my $matrixFileName = shift;
    my $outputFileName = shift;
    my $acceptTypesString = shift;
    my $acceptGroupsString = shift;
    my $interfaceConfig = shift;
    my $columnsOnly = shift;

    print STDERR "Applying Semantic Filter to $matrixFileName\n";

utils/datasetCreator/applySemanticFilter.pl  view on Meta::CPAN


    #TODO re-enable this and then try to run again
    #disconnect from the database and return
    #$umls_interface->disconnect();
}


# transforms the string of accept types or groups into a hash of accept TUIs
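# input:  $umls_interface <- the UMLS interface object (presumably an
#                            instance of UMLS::Interface)
#         $acceptTypesString <- a string specifying the accept types
#         $acceptGroupsString <- a comma-separated string specifying the
#                                accept groups
# output: a hash of acceptable TUIs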
sub getAcceptTypes {
    my $umls_interface = shift;
    my $acceptTypesString = shift;
    my $acceptGroupsString = shift;

    #get the accept types 
    my %acceptTypes = ();

    #add all types for groups specified
    #accept groups were specified
    my @acceptGroups = split(',',$acceptGroupsString);

utils/datasetCreator/combineCooccurrenceMatrices.pl  view on Meta::CPAN

#user input
$dataFolder = '/home/henryst/hadoopByYear/output/';
$startYear = '1983';
$endYear = '1985';
$windowSize = 8;
&combineFiles($startYear,$endYear,$windowSize);


#####################################################
####### Program Start ########
sub combineFiles {
    my $startYear = shift;
    my $endYear = shift;
    my $windowSize = shift;

#Check on I/O
    my $outFileName = "$startYear".'_'."$endYear".'_window'."$windowSize";
(!(-e $outFileName)) 
    or die ("ERROR: output file already exists: $outFileName\n");
open OUT, ">$outFileName" 
    or die ("ERROR: unable to open output file: $outFileName\n");

utils/datasetCreator/dataStats/getMatrixStats.pl  view on Meta::CPAN

#gets matrix stats for a matrix file 
# (number of rows, number of columns, number of keys)

&getStats('/home/henryst/lbdData/groupedData/1852_window1_squared_inParts');


#############################################
# gets the stats for the matrix
#############################################
sub getStats {
    my $fileName = shift;
    print STDERR "$fileName\n";

#read in the matrix
    open IN, $fileName or die ("unable to open file: $fileName\n");
    my %matrix = ();
    my $numCooccurrences = 0;
    while (my $line = <IN>) {
	#$line =~ /([^\t]+)\t([^\t]+)\t([\d]+)/;
	$line =~ /([^\s]+)\s([^\s]+)\s([\d]+)/;

utils/datasetCreator/dataStats/metaAnalysis.pl  view on Meta::CPAN

my $dataFolder = '/home/henryst/lbdData/dataByYear/1960_1989';
my $startYear = '1809';
my $endYear = '2015';
my $windowSize = 1;
my $statsOutFileName = '/home/henryst/lbdData/stats_window1';
&folderMetaAnalysis($startYear, $endYear, $windowSize, $statsOutFileName, $dataFolder);


#####################
# runs meta analysis on a set of files
sub folderMetaAnalysis {
    my $startYear = shift;
    my $endYear = shift;
    my $windowSize = shift;
    my $statsOutFileName= shift;
    my $dataFolder = shift;

    #Check on I/O
    open OUT, ">$statsOutFileName" 
	or die ("ERROR: unable to open stats out file: $statsOutFileName\n");

utils/datasetCreator/dataStats/metaAnalysis.pl  view on Meta::CPAN

	    print "   ERROR: unable to open $inFile\n";
	}
    }
    close OUT;
    print "Done getting stats\n";
}


##############################
# runs meta analysis on a single file
sub metaAnalysis {
    my $fileName = shift;
    
    open IN, $fileName or die ("unable to open file: $fileName\n");
    
    my $numCooccurrences = 0; 
    my %rowKeys = ();  #number of rows
    my %colKeys = ();  #number of columns
    my %uniqueKeys = (); #vocabulary size
    while (my $line = <IN>) {
	$line =~ /([^\t]+)\t([^\t]+)\t([\d]+)/;

utils/datasetCreator/removeCUIPair.pl  view on Meta::CPAN

my $cuiA = 'C0021665'; #somatomedic c
my $cuiB = 'C0003765'; #arginine
my $matrixFileName = '/home/henryst/lbdData/groupedData/1960_1989_window8_ordered';
my $matrixOutFileName = $matrixFileName.'_removed';
&removeCuiPair($cuiA, $cuiB, $matrixFileName, $matrixOutFileName);

print STDERR "DONE\n";

###########################################
# remove the CUI pair from the dataset
sub removeCuiPair {
    my $cuiA = shift;
    my $cuiB = shift;
    my $matrixFileName = shift;
    my $matrixOutFileName = shift;
    print STDERR "removing $cuiA,$cuiB from $matrixFileName\n";
    
    #open the in and out files
    open IN, $matrixFileName 
	or die ("ERROR: cannot open matrix in file: $matrixFileName\n");
    open OUT, ">$matrixOutFileName" 

utils/datasetCreator/removeExplicit.pl  view on Meta::CPAN

my $squaredMatrixFileName = '../../samples/postCutoffMatrix';
my $outputFileName = '../../samples/sampleGoldMatrix';

&removeExplicit($matrixFileName, $squaredMatrixFileName, $outputFileName);

###############################
###############################

#removes explicit knowledge ($matrixFileName) from the implicit 
# knowledge ($squaredMatrixFileName)
sub removeExplicit {
    my $matrixFileName = shift;  #the explicit knowledge matrix (usually not filtered)
    my $squaredMatrixFileName = shift;  #the implicit with explicit knowledge matrix (filtered squared)
    my $outputFileName = shift; #the implicit knowledge matrix output file
    print STDERR "Removing Explicit from $matrixFileName\n";

    #read in the matrix
    open IN, $matrixFileName 
	or die("ERROR: unable to open matrix input file: $matrixFileName\n");
    my %matrix = ();
    my $numCooccurrences = 0;

utils/datasetCreator/squaring/convertForSquaring_MATLAB.pl  view on Meta::CPAN


#convert from MATLAB sparse format
$fileName = "1980_1984_window1_ordered_filtered";
&convertFrom("/home/henryst/lbdData/groupedData/squared/$fileName".'_squared', "/home/henryst/lbdData/groupedData/squared/$fileName".'_squared_convertedBack',"/home/henryst/lbdData/groupedData/forSquaring/".$fileName.'_keys');


########################################
########################################

#converts the matrix to format for squaring in MATLAB
sub convertTo {
    #grab input
    my $inFile = shift;
    my $matrixOutFile = shift;
    my $keyOutFile = shift;
    print STDERR "converting $inFile\n";
  
    #open all the files
    open IN, $inFile
	or die ("ERROR: unable to open inFile: $inFile\n");
    open MATRIX_OUT, ">$matrixOutFile" 

utils/datasetCreator/squaring/convertForSquaring_MATLAB.pl  view on Meta::CPAN

    #output the keys file
    print "   Outputting keys\n";
    foreach my $key (sort keys %keyHash) {
	print KEY_OUT "$key\t$keyHash{$key}\n";
    }
    close KEY_OUT;
    print "   DONE!\n";
}

#converts the matrix from the format used for squaring in MATLAB
sub convertFrom {
    #grab input
    my $matrixInFile = shift;
    my $matrixOutFile = shift;
    my $keyInFile = shift;
    print "converting $matrixInFile\n";
  
    #open all the files
    open IN, $matrixInFile
	or die ("ERROR: unable to open matrixInFile: $matrixInFile\n");
    open MATRIX_OUT, ">$matrixOutFile" 

utils/datasetCreator/squaring/squareMatrix_perl.pl  view on Meta::CPAN

&outputMatrix(\%product, $options{'outputFile'});
print STDERR "DONE!\n";




#########################################################
# Helper Functions
#########################################################

sub outputMatrix {
    my $matrixRef = shift;
    my $outputFile = shift;

    #append to the output file
    print STDERR "outputFile = $outputFile\n";
    open OUT, '>>'.$outputFile or die ("ERROR: unable to open output file: $options{outputFile}\n");

    #output the matrix
    foreach my $key0 (keys %{$matrixRef}) {  
	foreach my $key1 (keys %{$product{$key0}}) {

utils/datasetCreator/squaring/squareMatrix_perl.pl  view on Meta::CPAN

    }
    
    #clear the matrix
    my %newHash = ();
    $matrixRef = \%newHash;

    close OUT;
}


sub fileToSparseMatrix {
    my $fileName = shift;

    open IN, $fileName or die ("unable to open file: $fileName\n");
    my %matrix = ();
    while (my $line = <IN>) {
	chomp $line;
	$line =~ /([^\t]+)\t([^\t]+)\t([\d]+)/;
	if (!exists $matrix{$1}) {
	    my %hash = ();
	    $matrix{$1} = \%hash;

utils/runDiscovery.pl  view on Meta::CPAN


$options{'lbdConfig'} = shift;
defined $options{'lbdConfig'} or die ($usage);

my $lbd = ALBD->new(\%options);
$lbd->performLBD();

############################################################################
#  function to output help messages for this program
############################################################################
sub showHelp() {
        
    print "This utility takes an lbd configuration file and outputs\n";
    print "the results of lbd to file. The parameters for LBD are\n";
    print "specified in the input file. Please see samples/lbd or\n";
    print "samples/thresholding for sample input files and descriptions\n";
    print "of parameters and full details on what can be in an LBD input\n";
    print "file.\n";
    
    print "\n";
    print "Usage: runDiscovery.pl LBD_CONFIG_FILE [OPTIONS]\n";



( run in 0.937 second using v1.01-cache-2.11-cpan-4d50c553e7e )