ALBD

 view release on metacpan or  search on metacpan

GPL.txt  view on Meta::CPAN


    a) You must cause the modified files to carry prominent notices
    stating that you changed the files and the date of any change.

    b) You must cause any work that you distribute or publish, that in
    whole or in part contains or is derived from the Program or any
    part thereof, to be licensed as a whole at no charge to all third
    parties under the terms of this License.

    c) If the modified program normally reads commands interactively
    when run, you must cause it, when started running for such
    interactive use in the most ordinary way, to print or display an
    announcement including an appropriate copyright notice and a
    notice that there is no warranty (or else, saying that you provide
    a warranty) and that users may redistribute the program under
    these conditions, and telling the user how to view a copy of this
    License.  (Exception: if the Program itself is interactive but
    does not normally print such an announcement, your work based on
    the Program is not required to print an announcement.)

These requirements apply to the modified work as a whole.  If

GPL.txt  view on Meta::CPAN


		     END OF TERMS AND CONDITIONS

	    How to Apply These Terms to Your New Programs

  If you develop a new program, and you want it to be of the greatest
possible use to the public, the best way to achieve this is to make it
free software which everyone can redistribute and change under these terms.

  To do so, attach the following notices to the program.  It is safest
to attach them to the start of each source file to most effectively
convey the exclusion of warranty; and each file should have at least
the "copyright" line and a pointer to where the full notice is found.

    <one line to give the program's name and a brief idea of what it does.>
    Copyright (C) <year>  <name of author>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

GPL.txt  view on Meta::CPAN

    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA


Also add information on how to contact you by electronic and paper mail.

If the program is interactive, make it output a short notice like this
when it starts in an interactive mode:

    Gnomovision version 69, Copyright (C) year name of author
    Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
    This is free software, and you are welcome to redistribute it
    under certain conditions; type `show c' for details.

The hypothetical commands `show w' and `show c' should show the appropriate
parts of the General Public License.  Of course, the commands you use may
be called something other than `show w' and `show c'; they could even be
mouse-clicks or menu items--whatever suits your program.

config/lbd  view on Meta::CPAN

# For parameters where no value is needed, just write the name of the
# parameter in '<>' (e.g. '<debug>')

<rankingProcedure>averageMinimumWeight
<rankingMethod>chi
<implicitOutputFile>../samples/sampleLBDOutput

<linkingAcceptGroups>CHEM,DISO,GENE,PHYS,ANAT
<targetAcceptGroups>CHEM,GENE

<startCuis>C0034734,C0034735

lib/ALBD.pm  view on Meta::CPAN

# row corresponding to a co-occurrence with a single CUI. Since the matrices 
# tend to be sparse, they are stored as hashes of hashes, where the first
# key is for a row, and the second key is for a column. The keys of each hash
# are the indices within the matrix. The hash values are the number of
# co-occurrences for that CUI pair (e.g. ${$explicit{C0000000}}{C1111111} = 10
# means that CUI C0000000 and C1111111 co-occurred 10 times).
#
# Now with an understanding of the data structures, below is a brief
# description of each:
#
# startingMatrix <- A matrix containing the explicit matrix rows for all of the
#                   start terms. This makes it easy to have multiple start terms
#                   and using this matrix as opposed to the entire explicit 
#                   matrix drastically improves performance.
# explicitMatrix <- A matrix containing explicit connections (known connections)
#                   for every CUI in the dataset.            
# implicitMatrix <- A matrix containing implicit connections (discovered 
#                   connections) for every CUI in the dataset


package ALBD;

lib/ALBD.pm  view on Meta::CPAN

my $umls_association;

#####################################################
####################################################

# performs LBD
# input:  none
# output: none, but a results file is written to disk
sub performLBD {
    my $self = shift;
    my $start; #used to record run times

    #implicit matrix ranking requires a different set of procedures
    if ($lbdOptions{'rankingProcedure'} eq 'implicitMatrix') { 
	$self->performLBD_implicitMatrixRanking();
	return;
    }
    if (exists $lbdOptions{'targetCuis'}) {
	$self->performLBD_closedDiscovery();
	return;
    }

lib/ALBD.pm  view on Meta::CPAN

	return;
    }
    if (exists $lbdOptions{'precisionAndRecall_implicit'}) {
	$self->timeSlicing_generatePrecisionAndRecall_implicit();
	return;
    }
    print "Open Discovery\n";
    print $self->_parametersToString();

#Get inputs
    my $startCuisRef = $self->_getStartCuis();
    my $linkingAcceptTypesRef = $self->_getAcceptTypes('linking');
    my $targetAcceptTypesRef = $self->_getAcceptTypes('target');
    print "startCuis = ".(join(',', @{$startCuisRef}))."\n";
    print "linkingAcceptTypes = ".(join(',', keys %{$linkingAcceptTypesRef}))."\n";
    print "targetAcceptTypes = ".(join(',', keys %{$targetAcceptTypesRef}))."\n";

#Get the Explicit Matrix
    $start = time;
    my $explicitMatrixRef;
    if(!defined $lbdOptions{'explicitInputFile'}) {
	die ("ERROR: explicitInputFile must be defined in LBD config file\n");
    }
    $explicitMatrixRef = Discovery::fileToSparseMatrix($lbdOptions{'explicitInputFile'});
    print "Got Explicit Matrix in ".(time() - $start)."\n";
    
#Get the Starting Matrix
    $start = time();
    my $startingMatrixRef = 
	Discovery::getRows($startCuisRef, $explicitMatrixRef);
    print "Got Starting Matrix in ".(time() - $start)."\n";

    #if using average minimum weight, grab the a->b scores
    my %abPairsWithScores = ();
    if ($lbdOptions{'rankingProcedure'} eq 'averageMinimumWeight' 
	|| $lbdOptions{'rankingProcedure'} eq 'ltc_amw') {

	#apply semantic type filter to columns only
	if ((scalar keys %{$linkingAcceptTypesRef}) > 0) {
	    Filters::semanticTypeFilter_columns(
		$explicitMatrixRef, $linkingAcceptTypesRef, $umls_interface);
	}
	#initialize the abPairs to frequency of co-occurrence
	foreach my $row (keys %{$startingMatrixRef}) {
	    foreach my $col (keys %{${$startingMatrixRef}{$row}}) {
		$abPairsWithScores{"$row,$col"} = ${${$startingMatrixRef}{$row}}{$col};
	    }
	}
        Rank::getBatchAssociationScores(\%abPairsWithScores, $explicitMatrixRef, $lbdOptions{'rankingMeasure'}, $umls_association);
    }

    #Apply Semantic Type Filter to the explicit matrix
    if ((scalar keys %{$linkingAcceptTypesRef}) > 0) {
	$start = time();
	Filters::semanticTypeFilter_rowsAndColumns(
	    $explicitMatrixRef, $linkingAcceptTypesRef, $umls_interface);
	print "Semantic Type Filter in ".(time() - $start)."\n";
    }
    
#Get Implicit Connections
    $start = time();
    my $implicitMatrixRef;
    if (defined $lbdOptions{'implicitInputFile'}) {
	$implicitMatrixRef = Discovery::fileToSparseMatrix($lbdOptions{'implicitInputFile'});
    } else {
	$implicitMatrixRef = Discovery::findImplicit($explicitMatrixRef, $startingMatrixRef);
    }
    print "Got Implicit Matrix in ".(time() - $start)."\n";

#Remove Known Connections 
     $start = time();
     $implicitMatrixRef = Discovery::removeExplicit($startingMatrixRef, $implicitMatrixRef);
     print "Removed Known Connections in ".(time() - $start)."\n";
 
#Apply Semantic Type Filter
    if ((scalar keys %{$targetAcceptTypesRef}) > 0) {
	$start = time();
	Filters::semanticTypeFilter_columns(
	    $implicitMatrixRef, $targetAcceptTypesRef, $umls_interface);
	print "Semantic Type Filter in ".(time() - $start)."\n";
    }

#Score Implicit Connections
    $start = time();	
    my $scoresRef;
    if ($lbdOptions{'rankingProcedure'} eq 'allPairs') {
	$scoresRef = Rank::scoreImplicit_fromAllPairs($startingMatrixRef, $explicitMatrixRef, $implicitMatrixRef, $lbdOptions{'rankingMeasure'}, $umls_association);
    } elsif ($lbdOptions{'rankingProcedure'} eq 'averageMinimumWeight') {
	$scoresRef = Rank::scoreImplicit_averageMinimumWeight($startingMatrixRef, $explicitMatrixRef, $implicitMatrixRef, $lbdOptions{'rankingMeasure'}, $umls_association, \%abPairsWithScores);
    } elsif ($lbdOptions{'rankingProcedure'} eq 'linkingTermCount') {
	$scoresRef = Rank::scoreImplicit_linkingTermCount($startingMatrixRef, $explicitMatrixRef, $implicitMatrixRef);
    } elsif ($lbdOptions{'rankingProcedure'} eq 'frequency') {
	$scoresRef = Rank::scoreImplicit_frequency($startingMatrixRef, $explicitMatrixRef, $implicitMatrixRef);
    } elsif ($lbdOptions{'rankingProcedure'} eq 'ltcAssociation') {
	$scoresRef = Rank::scoreImplicit_ltcAssociation($startingMatrixRef, $explicitMatrixRef, $implicitMatrixRef, $lbdOptions{'rankingMeasure'}, $umls_association);
    } elsif ($lbdOptions{'rankingProcedure'} eq 'ltc_amw') {
	$scoresRef = Rank::scoreImplicit_LTC_AMW($startingMatrixRef, $explicitMatrixRef, $implicitMatrixRef, $lbdOptions{'rankingMeasure'}, $umls_association, \%abPairsWithScores);
    } else {
	die ("Error: Invalid Ranking Procedure\n");
    }    
    print "Scored in: ".(time()-$start)."\n";
  
#Rank Implicit Connections
    $start = time();
    my $ranksRef = Rank::rankDescending($scoresRef);
    print "Ranked in: ".(time()-$start)."\n";

#Output The Results
    open OUT, ">$lbdOptions{implicitOutputFile}" 
	or die "unable to open implicit ouput file: "
	."$lbdOptions{implicitOutputFile}\n";
    my $outputString = $self->_rankedTermsToString($scoresRef, $ranksRef);
    my $paramsString = $self->_parametersToString();
    print OUT $paramsString;
    print OUT $outputString;
    close OUT;

lib/ALBD.pm  view on Meta::CPAN

    print "DONE!\n\n";
}

#----------------------------------------------------------------------------

# Performs closed-discovery LBD: collects the linking terms that co-occur
# with the start CUIs and with the target CUIs, scores every linking term the
# two sides have in common by summed co-occurrence frequency, ranks them, and
# writes the result to $lbdOptions{implicitOutputFile}.
# input:  none (all settings are read from %lbdOptions)
# output: none, but a results file is written to disk
# dies if explicitInputFile is not configured, or if the output file cannot
# be opened or closed.
sub performLBD_closedDiscovery {
    my $self = shift;
    my $start; #used to record run times

    print "Closed Discovery\n";
    print $self->_parametersToString();

#Get inputs
    my $startCuisRef = $self->_getStartCuis();
    my $targetCuisRef = $self->_getTargetCuis();
    my $linkingAcceptTypesRef = $self->_getAcceptTypes('linking');

#Get the Explicit Matrix
    $start = time;
    my $explicitMatrixRef;
    if(!defined $lbdOptions{'explicitInputFile'}) {
        die ("ERROR: explicitInputFile must be defined in LBD config file\n");
    }
    $explicitMatrixRef = Discovery::fileToSparseMatrix($lbdOptions{'explicitInputFile'});
    print "Got Explicit Matrix in ".(time() - $start)."\n";

#Get the Starting Matrix
    # taken before the semantic type filter below is applied to the
    # explicit matrix
    $start = time();
    my $startingMatrixRef =
        Discovery::getRows($startCuisRef, $explicitMatrixRef);
    print "Got Starting Matrix in ".(time() - $start)."\n";
    print "   numRows in startMatrix = ".(scalar keys %{$startingMatrixRef})."\n";

    #Apply Semantic Type Filter to the explicit matrix
    if ((scalar keys %{$linkingAcceptTypesRef}) > 0) {
        $start = time();
        Filters::semanticTypeFilter_rowsAndColumns(
            $explicitMatrixRef, $linkingAcceptTypesRef, $umls_interface);
        print "Semantic Type Filter in ".(time() - $start)."\n";
    }

#Get the Target Matrix
    # taken after the filter call above, unlike the starting matrix
    $start = time();
    my $targetMatrixRef =
        Discovery::getRows($targetCuisRef, $explicitMatrixRef);
    print "Got Target Matrix in ".(time() - $start)."\n";
    print "   numRows in targetMatrix = ".(scalar keys %{$targetMatrixRef})."\n";

#find the linking terms in common for starting and target matrices
    print "Finding terms in common\n";
    #get starting linking terms
    # Frequencies are accumulated with += so that a linking term shared by
    # several start CUIs gets the sum of its counts. Plain assignment kept
    # only the count of whichever row happened to be visited last, which
    # depends on hash iteration order.
    my %startLinks = ();
    foreach my $row (keys %{$startingMatrixRef}) {
        foreach my $col (keys %{${$startingMatrixRef}{$row}}) {
            $startLinks{$col} += ${${$startingMatrixRef}{$row}}{$col};
        }
    }
    print "   num start links = ".(scalar keys %startLinks)."\n";
    #get target linking terms (accumulated the same way)
    my %targetLinks = ();
    foreach my $row (keys %{$targetMatrixRef}) {
        foreach my $col (keys %{${$targetMatrixRef}{$row}}) {
            $targetLinks{$col} += ${${$targetMatrixRef}{$row}}{$col};
        }
    }
    print "   num target links = ".(scalar keys %targetLinks)."\n";
    #find linking terms in common
    my %inCommon = ();
    foreach my $startLink (keys %startLinks) {
        if (exists $targetLinks{$startLink}) {
            $inCommon{$startLink} = $startLinks{$startLink} + $targetLinks{$startLink};
        }
    }
    print "   num in common = ".(scalar keys %inCommon)."\n";

#Score and Rank
    #Score the linking terms in common
    my $scoresRef = \%inCommon;
    #TODO score is just summed frequency right now

    #Rank Implicit Connections
    $start = time();
    my $ranksRef = Rank::rankDescending($scoresRef);
    print "Ranked in: ".(time()-$start)."\n";

#Output The Results
    # Three-argument open with a lexical handle: the file name is never
    # interpreted as part of the open mode, and the handle cannot clash with
    # another bareword OUT elsewhere in the program.
    my $outFile = $lbdOptions{implicitOutputFile};
    open(my $out, '>', $outFile)
        or die "unable to open implicit ouput file: $outFile: $!\n";
    my $outputString = $self->_rankedTermsToString($scoresRef, $ranksRef);
    my $paramsString = $self->_parametersToString();
    print {$out} $paramsString;
    print {$out} $outputString;

    print {$out} "\n\n---------------------------------------\n\n";
    print {$out} "starting linking terms:\n";
    print {$out} join("\n", keys %startLinks);

    print {$out} "\n\n---------------------------------------\n\n";
    print {$out} "target linking terms:\n";
    print {$out} join("\n", keys %targetLinks);

    # buffered write errors only surface when the handle is closed
    close($out)
        or die "unable to close implicit output file: $outFile: $!\n";

#Done
    print "DONE!\n\n";
}

lib/ALBD.pm  view on Meta::CPAN

# access to it is then slow. Would require a major redo of the code
#
=comment
# performs LBD, but using implicit matrix ranking schemes.
# Since the order of operations for those methods is slightly different
# a new method has been created.
# input:  none
# output: none, but a results file is written to disk
sub performLBD_implicitMatrixRanking {
    my $self = shift;
    my $start; #used to record run times
    print  $self->_parametersToString();
    print "In Implicit Ranking\n";
    
#Get inputs
    my $startCuisRef = $self->_getStartCuis();
    my $linkingAcceptTypesRef = $self->_getAcceptTypes('linking');
    my $targetAcceptTypesRef = $self->_getAcceptTypes('target');
    print "startCuis = ".(join(',', @{$startCuisRef}))."\n";
    print "linkingAcceptTypes = ".(join(',', keys %{$linkingAcceptTypesRef}))."\n";
    print "targetAcceptTypes = ".(join(',', keys %{$targetAcceptTypesRef}))."\n";

#Score Implicit Connections
    $start = time();	
    my $scoresRef;
    $scoresRef = Rank::scoreImplicit_fromImplicitMatrix($startCuisRef,  $lbdOptions{'implicitInputFile'}, $lbdOptions{rankingMeasue}, $umls_association);
    print "Scored in: ".(time()-$start)."\n";
  
#Rank Implicit Connections
    $start = time();
    my $ranksRef = Rank::rankDescending($scoresRef);
    print "Ranked in: ".(time()-$start)."\n";

#Output The Results
    open OUT, ">$lbdOptions{implicitOutputFile}" 
	or die "unable to open implicit ouput file: "
	."$lbdOptions{implicitOutputFile}\n";
    my $outputString = $self->_rankedTermsToString($scoresRef, $ranksRef);
    my $paramsString = $self->_parametersToString();
    print OUT $paramsString;
    print OUT $outputString;
    close OUT;

lib/ALBD.pm  view on Meta::CPAN

# input:  none
# output: none, but precision and recall values are printed to STDOUT
sub timeSlicing_generatePrecisionAndRecall_explicit {
    my $NUM_SAMPLES = 100; #TODO, read fomr file number of samples to average over for timeslicing
    my $self = shift;
    print "In timeSlicing_generatePrecisionAndRecall\n";

    my $numIntervals = 10;

#Get inputs
    my $startAcceptTypesRef = $self->_getAcceptTypes('start');
    my $linkingAcceptTypesRef = $self->_getAcceptTypes('linking');
    my $targetAcceptTypesRef = $self->_getAcceptTypes('target');


#Get the Explicit Matrix
    my $explicitMatrixRef;
    if(!defined $lbdOptions{'explicitInputFile'}) {
	die ("ERROR: explicitInputFile must be defined in LBD config file\n");
    }
    $explicitMatrixRef = Discovery::fileToSparseMatrix($lbdOptions{'explicitInputFile'});

#------------------------------------------

    #create the starting matrix
    my $startingMatrixRef 
	= TimeSlicing::generateStartingMatrix($explicitMatrixRef, \%lbdOptions, $startAcceptTypesRef, $NUM_SAMPLES, $umls_interface);

    #get association scores for the starting matrix
    my $assocScoresRef = TimeSlicing::getAssociationScores(
	$startingMatrixRef, $lbdOptions{'rankingMeasure'}, $umls_association);
    my ($min, $max) = TimeSlicing::getMinMax($assocScoresRef);
    my $range = $max-$min;

    #load the post cutoff matrix for the necessary rows
    my $postCutoffMatrixRef 
	= TimeSlicing::loadPostCutOffMatrix($startingMatrixRef, $explicitMatrixRef, $lbdOptions{'postCutoffFileName'});

    #apply a semantic type filter to the post cutoff matrix
    if ((scalar keys %{$targetAcceptTypesRef}) > 0) {
	    Filters::semanticTypeFilter_columns(
		$postCutoffMatrixRef, $targetAcceptTypesRef, $umls_interface);
    }

    #apply a threshold at $numIntervals% intervals to generate an 11 point
    # interpolated precision/recall curve for linking term ranking/thresholding
    #stats for collecting info about predicted vs. true

lib/ALBD.pm  view on Meta::CPAN

	#determine the number of samples to threshold
	my $numSamples = $i*($allPairsCount/$numIntervals);
	print "i, numSamples/allPairsCount = $i, $numSamples/$allPairsCount\n";
	#grab samples at just 10 to estimate the final point (this is what 
	# makes it an 11 point curve)
	if ($numSamples == 0) {
	    $numSamples = 10;
	}

	#apply a threshold (number of samples)
	my $thresholdedStartingMatrixRef = TimeSlicing::grabKHighestRankedSamples($numSamples, $assocScoresRef, $startingMatrixRef);

	#generate implicit knowledge
	my $implicitMatrixRef = Discovery::findImplicit($explicitMatrixRef, $thresholdedStartingMatrixRef);

	#Remove Known Connections
	$implicitMatrixRef 
	    = Discovery::removeExplicit($startingMatrixRef, $implicitMatrixRef);

	#apply a semantic type filter to the implicit matrix
	if ((scalar keys %{$targetAcceptTypesRef}) > 0) {
	    Filters::semanticTypeFilter_columns(
		$implicitMatrixRef, $targetAcceptTypesRef, $umls_interface);
	}

	#calculate precision and recall
	my ($precision, $recall) = TimeSlicing::calculatePrecisionRecall(
	    $implicitMatrixRef, $postCutoffMatrixRef);

lib/ALBD.pm  view on Meta::CPAN


# generates precision and recall values by varying the threshold
# of the A->C ranking measure. Also generates precision at k, and
# mean average precision
# input:  none
# output: none, but precision, recall, precision at k, and map values
#         output to STDOUT
sub timeSlicing_generatePrecisionAndRecall_implicit {
    my $NUM_SAMPLES = 200; #TODO, read fomr file number of samples to average over for timeslicing
    my $self = shift;
    my $start; #used to record run times
    print "In timeSlicing_generatePrecisionAndRecall_implicit\n";

    #Get inputs
    my $startAcceptTypesRef = $self->_getAcceptTypes('start');
    my $linkingAcceptTypesRef = $self->_getAcceptTypes('linking');
    my $targetAcceptTypesRef = $self->_getAcceptTypes('target');

#-----------
# Starting Matrix Creation
#-----------
    #Get the Explicit Matrix
    print "loading explicit\n";
    my $explicitMatrixRef;
    if(!defined $lbdOptions{'explicitInputFile'}) {
	die ("ERROR: explicitInputFile must be defined in LBD config file\n");
    }
    $explicitMatrixRef = Discovery::fileToSparseMatrix($lbdOptions{'explicitInputFile'});

    #create the starting matrix
    print "generating starting\n";
    my $startingMatrixRef 
	= TimeSlicing::generateStartingMatrix($explicitMatrixRef, \%lbdOptions, $startAcceptTypesRef, $NUM_SAMPLES, $umls_interface);
#----------
    

#--------
# Gold Loading/Creation
#--------
    #load or create the gold matrix
    my $goldMatrixRef;
    if (exists $lbdOptions{'goldInputFile'}) {
	print "inputting gold\n";
	$goldMatrixRef = Discovery::fileToSparseMatrix($lbdOptions{'goldInputFile'});
    }
    else {
	print "loading post cutoff\n";
	$goldMatrixRef = TimeSlicing::loadPostCutOffMatrix($startingMatrixRef, $explicitMatrixRef, $lbdOptions{'postCutoffFileName'});

	#remove explicit knowledge from the post cutoff matrix
	$goldMatrixRef = Discovery::removeExplicit($startingMatrixRef, $goldMatrixRef);

	#apply a semantic type filter to the post cutoff matrix
	print "applying semantic filter to post-cutoff matrix\n";
	if ((scalar keys %{$targetAcceptTypesRef}) > 0) {
	    Filters::semanticTypeFilter_columns(
		$goldMatrixRef, $targetAcceptTypesRef, $umls_interface);
	}

	#TODO why is the gold matrix outputting with an extra line between samples?
	#output the gold matrix

lib/ALBD.pm  view on Meta::CPAN

    if ($lbdOptions{'rankingProcedure'} eq 'averageMinimumWeight'
		|| $lbdOptions{'rankingProcedure'} eq 'ltc_amw') {
	print "getting AB scores\n";

	#apply semantic type filter to columns only
	if ((scalar keys %{$linkingAcceptTypesRef}) > 0) {
	    Filters::semanticTypeFilter_columns(
		$explicitMatrixRef, $linkingAcceptTypesRef, $umls_interface);
	}
	#intitialize the abPairs to the frequency of co-ocurrence
	foreach my $row (keys %{$startingMatrixRef}) {
	    foreach my $col (keys %{${$startingMatrixRef}{$row}}) {
		$abPairsWithScores{"$row,$col"} = ${${$startingMatrixRef}{$row}}{$col}; 
	    }
	}
	Rank::getBatchAssociationScores(
	    \%abPairsWithScores, $explicitMatrixRef, $lbdOptions{'rankingMeasure'}, $umls_association);
    }
#--------

#------------
# Matrix Filtering/Thresholding
#------------

lib/ALBD.pm  view on Meta::CPAN

    if (exists $lbdOptions{'predictionsInFile'}) {
	print "loading predictions\n";
	$predictionsMatrixRef = Discovery::fileToSparseMatrix($lbdOptions{'predictionsInFile'});
    }
    else {
	print "generating predictions\n";

	#generate implicit knowledge
	print "Squaring Matrix\n";
	$predictionsMatrixRef = Discovery::findImplicit(
	    $explicitMatrixRef, $startingMatrixRef);

	#Remove Known Connections
	print "Removing Known from Predictions\n";
	$predictionsMatrixRef 
	    = Discovery::removeExplicit($startingMatrixRef, $predictionsMatrixRef);

	#apply a semantic type filter to the predictions matrix
	print "Applying Semantic Filter to Predictions\n";
	if ((scalar keys %{$targetAcceptTypesRef}) > 0) {
	    Filters::semanticTypeFilter_columns(
		$predictionsMatrixRef, $targetAcceptTypesRef, $umls_interface);
	}

	#save the implicit knowledge matrix to file
	if (exists ($lbdOptions{'predictionsOutFile'})) {
	    print "outputting predictions\n";
	    Discovery::outputMatrixToFile($lbdOptions{'predictionsOutFile'}, $predictionsMatrixRef);
	}
    }

#-------------------------------------------

    #At this point, the explicitMatrixRef has been filtered and thresholded
    #The predictions matrix Ref has been generated from the filtered and 
    #  thresholded explicitMatrixRef, only rows of starting terms remain, filtered, and 
    #  had explicit removed
    #Association scores are generated using the explicitMatrixRef


#--------------
# Get the ranks of all predictions
#--------------
    #get the scores and ranks separately for each row
    # thereby generating scores and ranks for each starting
    # term individually
    my %rowRanks = ();
    my ($n1pRef, $np1Ref, $npp);
    print "getting row ranks\n";
    foreach my $rowKey (keys %{$predictionsMatrixRef}) { 
	#grab rows from start and implicit matrices
	my %startingRow = ();
	$startingRow{$rowKey} = ${$startingMatrixRef}{$rowKey};
	my %implicitRow = ();
	$implicitRow{$rowKey} = ${$predictionsMatrixRef}{$rowKey};

	#Score Implicit Connections	
	my $scoresRef;
	if ($lbdOptions{'rankingProcedure'} eq 'allPairs') {
	    #get stats just a single time
	    if (!defined $n1pRef || !defined $np1Ref || !defined $npp) {
		($n1pRef, $np1Ref, $npp) = Rank::getAllStats($explicitMatrixRef);
	    }
	    $scoresRef = Rank::scoreImplicit_fromAllPairs(\%startingRow, $explicitMatrixRef, \%implicitRow, $lbdOptions{'rankingMeasure'}, $umls_association, $n1pRef, $np1Ref, $npp);
	} elsif ($lbdOptions{'rankingProcedure'} eq 'averageMinimumWeight') {
	    #get stats just a single time
	    if (!defined $n1pRef || !defined $np1Ref || !defined $npp) {
		($n1pRef, $np1Ref, $npp) = Rank::getAllStats($explicitMatrixRef);
	    }
	    $scoresRef = Rank::scoreImplicit_averageMinimumWeight(\%startingRow, $explicitMatrixRef, \%implicitRow, $lbdOptions{'rankingMeasure'}, $umls_association, \%abPairsWithScores, $n1pRef, $np1Ref, $npp);
	} elsif ($lbdOptions{'rankingProcedure'} eq 'linkingTermCount') {
	    $scoresRef = Rank::scoreImplicit_linkingTermCount(\%startingRow, $explicitMatrixRef, \%implicitRow);
	} elsif ($lbdOptions{'rankingProcedure'} eq 'frequency') {
	    $scoresRef = Rank::scoreImplicit_frequency(\%startingRow, $explicitMatrixRef, \%implicitRow);
	} elsif ($lbdOptions{'rankingProcedure'} eq 'ltcAssociation') {
	    $scoresRef = Rank::scoreImplicit_ltcAssociation(\%startingRow, $explicitMatrixRef, \%implicitRow, $lbdOptions{'rankingMeasure'}, $umls_association);
	} elsif ($lbdOptions{'rankingProcedure'} eq 'ltc_amw') {
	    #get stats just a single time
	    if (!defined $n1pRef || !defined $np1Ref || !defined $npp) {
		($n1pRef, $np1Ref, $npp) = Rank::getAllStats($explicitMatrixRef);
	    }
	    $scoresRef = Rank::scoreImplicit_LTC_AMW(\%startingRow, $explicitMatrixRef, \%implicitRow, $lbdOptions{'rankingMeasure'}, $umls_association, \%abPairsWithScores, $n1pRef, $np1Ref, $npp);
	}  else {
	    die ("Error: Invalid Ranking Procedure\n");
	}    
	
	#Rank Implicit Connections
	my $ranksRef = Rank::rankDescending($scoresRef);

	#save the row ranks
	$rowRanks{$rowKey} = $ranksRef;
    }

lib/ALBD.pm  view on Meta::CPAN

		    $optionsHash{$1} = 1;
		}
	    }
	}
    }
    close IN;

    return \%optionsHash;
}

# splits the comma-separated 'startCuis' option into individual CUIs
# input:  none (reads $lbdOptions{'startCuis'})
# output: an array ref of CUIs
sub _getStartCuis {
    my ($self) = @_;
    return [ split(',', $lbdOptions{'startCuis'}) ];
}

# splits the comma-separated 'targetCuis' option into individual CUIs
# input:  none (reads $lbdOptions{'targetCuis'})
# output: an array ref of CUIs
sub _getTargetCuis {
    my ($self) = @_;
    return [ split(',', $lbdOptions{'targetCuis'}) ];
}

lib/ALBD.pm  view on Meta::CPAN

    my $self = shift;
    return $VERSION;
}

##############################################################################
#        functions for debugging
##############################################################################
=comment
sub debugLBD {
    my $self = shift;
    my $startingCuisRef = shift;

    print "Starting CUIs = ".(join(',', @{$startingCuisRef}))."\n";

#Get the Explicit Matrix
    my ($explicitMatrixRef, $cuiToIndexRef, $indexToCuiRef, $matrixSize) = 
	Discovery::tableToSparseMatrix('N_11', $cuiFinder);
    print "Explicit Matrix:\n";
    _printMatrix($explicitMatrixRef, $matrixSize, $indexToCuiRef);
    print "-----------------------\n";

#Get the Starting Matrix
    my $startingMatrixRef = 
	Discovery::getRows($startingCuisRef, $explicitMatrixRef);
    print "Starting Matrix:\n";
    _printMatrix($startingMatrixRef, $matrixSize, $indexToCuiRef);
    print "-----------------------\n";
    
#Get Implicit Connections
    my $implicitMatrixRef 
	= Discovery::findImplicit($explicitMatrixRef, $startingMatrixRef, 
				  $indexToCuiRef, $matrixSize);
    print "Implicit Matrix:\n";
    _printMatrix($implicitMatrixRef, $matrixSize, $indexToCuiRef);
    print "-----------------------\n";

#Remove Known Connections
    $implicitMatrixRef = Discovery::removeExplicit($explicitMatrixRef, 
						   $implicitMatrixRef);
    print "Implicit Matrix with Explicit Removed\n";
    _printMatrix($implicitMatrixRef, $matrixSize, $indexToCuiRef);

lib/ALBD.pm  view on Meta::CPAN

    my $npp = Rank::getNPP($explicitMatrixRef);
    my $n1p = Rank::getN1P('C0', $explicitMatrixRef);
    my $np1 = Rank::getNP1('C2', $explicitMatrixRef); 
    print "Contingency Table Values from Explicit Matrix\n";
    print "n11 = $n11\n";
    print "npp = $npp\n";
    print "n1p = $n1p\n";
    print "np1 = $np1\n";

#Test other rank methods
    my $scoresRef = Rank::scoreImplicit_fromAllPairs($startingMatrixRef, $explicitMatrixRef, $implicitMatrixRef, $lbdOptions{rankingMethod}, $umls_association);
    my $ranksRef = Rank::rankDescending($scoresRef);
    print "Scores: \n";
    foreach my $cui (keys %{$scoresRef}) {
	print "   scores{$cui} = ${$scoresRef}{$cui}\n";
    }
    print "Ranks = ".join(',', @{$ranksRef})."\n";
}

sub _printMatrix {
    my $matrixRef = shift;

lib/LiteratureBasedDiscovery/Discovery.pm  view on Meta::CPAN

# copies the rows belonging to the requested CUIs out of a sparse matrix
# input:  $cuisRef <- an array reference to a list of CUIs
#         $matrixRef <- a reference to a co-occurrence matrix
# output: a hash ref to a new sparse matrix holding only the requested rows;
#         CUIs with no row in $matrixRef, or whose row has no entries,
#         do not appear in the result
sub getRows {
    my ($cuisRef, $matrixRef) = @_;

    my %selected;
    CUI:
    for my $cui (@{$cuisRef}) {
        #skip cuis that have no row in the matrix
        next CUI unless exists $matrixRef->{$cui};

        #copy the row cell by cell, so the result shares no storage
        # with the source matrix
        my $sourceRow = $matrixRef->{$cui};
        for my $col (keys %{$sourceRow}) {
            $selected{$cui}{$col} = $sourceRow->{$col};
        }
    }
    return \%selected;
}


#NOTE...this is calculating B*A ... but is that appropriate?  ... I think that it is, but the values are maybe not so appropriate    ... B*A is nice because it makes the implicit matrix not keep track of non-starting cui rows.   ...but the values are...

# finds the implicit connections for all CUIs (based on squaring)
# It does this by multiplying $matrixB*$matrixA. If $matrix B is the starting
# matrix, and $matrixA is the explicitMatrix, this method works correctly and
# efficiently. $matrixA and $matrixB may also be the explicit matrix but
# this is more inefficient.
# input:  $matrixARef <- ref to a sparse matrix
#         $matrixBRef <- ref to a sparse matrix
# output: ref to a sparse matrix of the product of B*A
sub findImplicit {
    my $matrixARef = shift; 
    my $matrixBRef = shift;

lib/LiteratureBasedDiscovery/Filters.pm  view on Meta::CPAN

	    $termsHash{$key2} = 1;
	}
    }
    print "   number of keys after filtering = ".(scalar keys %termsHash)."\n";
=cut
}


# applies a semantic group filter to the matrix, by removing keys that 
# are not allowed semantic type. Only removes types from columns, 
# so is applied to the implicit matrix (starting term rows with implicit
# columns).
# input:  $matrixRef <- ref to a sparse matrix to be filtered
#         $acceptTypesRef <- a ref to a hash of accept type strings
#         $umls <- an instance of UMLS::Interface
# output: None, but $matrixRef is updated
sub semanticTypeFilter_columns {
    my $matrixRef = shift;
    my $acceptTypesRef = shift;
    my $umls = shift;
 

lib/LiteratureBasedDiscovery/Rank.pm  view on Meta::CPAN

# 59 Temple Place - Suite 330,
# Boston, MA  02111-1307, USA.

package Rank;
use strict;
use warnings;

# scores each implicit CUI using an association measure, but the input to
# the association measure is based on linking term counts, rather than
# co-occurrence counts.
# input:  $startingMatrixRef <- ref to the starting matrix
#         $explicitMatrixRef <- ref to the explicit matrix
#         $implicitMatrixRef <- ref to the implicit matrix
#         $measure <- the string of the umls association measure to use
#         $association <- an instance of umls association
# output: a hash ref of scores for each implicit key. (hash{cui} = score)
sub scoreImplicit_ltcAssociation {
    my $startingMatrixRef = shift;
    my $explicitMatrixRef = shift;
    my $implicitMatrixRef = shift;
    my $measure = shift;
    my $association = shift;

    #bTerms to calculate n1p (number of unique co-occurring terms)
    my %bTerms = ();
    my $rowRef;
    foreach my $rowKey (keys %{$startingMatrixRef}) {
	$rowRef = ${$startingMatrixRef}{$rowKey};
	foreach my $colKey (keys %{$rowRef}) {
	    $bTerms{$colKey} = 1;
	}
    }
    my $n1p = scalar keys %bTerms;

    #get all the cTerms (unique column values in the implicit matrix)
    my %cTerms = ();
    foreach my $rowKey(keys %{$implicitMatrixRef}) {
	$rowRef = ${$implicitMatrixRef}{$rowKey};

lib/LiteratureBasedDiscovery/Rank.pm  view on Meta::CPAN

	    #get score
	    $score{$cTerm} = $association->_calculateAssociation_fromObservedCounts($n11{$cTerm}, $n1p, $np1{$cTerm}, $npp, $measure);
	}
    }
    
    return \%score;
}


# scores each implicit CUI using an association measure. Score is the average
# of the minimum between association score between start and linking, and
# linking and target.
# input:  $startingMatrixRef <- ref to the starting matrix
#         $explicitMatrixRef <- ref to the explicit matrix
#         $implicitMatrixRef <- ref to the implicit matrix
#         $measure <- the string of the umls association measure to use
#         $association <- an instance of umls association
#         $abScoresRef <- hashRef of the a to b scores used in AMW
#                         key is the a,b cui pair (e.g. hash{'C00,C11'})
#                         values are their score
#
#         Optional Input for passing in precalculated stats
#         so that they don't have to get recalculated each time
#         such as in timeslicing
#         $n1pRef <- hashRef where key is a cui, value is n1p
#         $np1Ref <- hashRef where key is a cui, value is np1
#         $npp <- scalar = value of npp
# output: a hash ref of scores for each implicit key. (hash{cui} = score)
sub scoreImplicit_averageMinimumWeight {
    #grab input
    my $startingMatrixRef = shift;
    my $explicitMatrixRef = shift;
    my $implicitMatrixRef = shift;
    my $measure = shift;
    my $association = shift;
    my $abScoresRef = shift;

    #optionally pass in stats so they don't get recalculated for
    # multiple terms (such as with time slicing)
    my $n1pRef = shift;
    my $np1Ref = shift;
    my $npp = shift;

    #get all BC pairs (call it bcScores because it will hold the scores)
    my $bcScoresRef = &_getBCPairs($startingMatrixRef, $explicitMatrixRef, $implicitMatrixRef);

    #get cui pair scores
    &getBatchAssociationScores(
	$bcScoresRef, $explicitMatrixRef, $measure, $association,
	$n1pRef, $np1Ref, $npp);

    #find the max a->b score (since there can be multiple a terms)
    my %maxABScores = ();
    my ($key1, $key2, $score);
    foreach my $pairKey (keys %{$abScoresRef}) {

lib/LiteratureBasedDiscovery/Rank.pm  view on Meta::CPAN

    #normalize by counts
    foreach my $key (keys %cScores) {
	$cScores{$key} /= $counts{$key}
    }
 
    return \%cScores;
}


# scores each implicit CUI using linking term count, and AMW as a tie breaker
# input:  $startingMatrixRef <- ref to the starting matrix
#         $explicitMatrixRef <- ref to the explicit matrix
#         $implicitMatrixRef <- ref to the implicit matrix
#         $measure <- the string of the umls association measure to use
#         $association <- an instance of umls association
#         $abScoresRef <- hashRef of the a to b scores used in AMW
#                         key is the a,b cui pair (e.g. hash{'C00,C11'})
#                         values are their score
#         Optional Input for passing in precalculated stats
#         so that they don't have to get recalculated each time
#         such as in timeslicing
#         $n1pRef <- hashRef where key is a cui, value is n1p
#         $np1Ref <- hashRef where key is a cui, value is np1
#         $npp <- scalar = value of npp
# output: a hash ref of scores for each implicit key. (hash{cui} = score)
sub scoreImplicit_LTC_AMW {
    #grab the input
    my $startingMatrixRef = shift;
    my $explicitMatrixRef = shift;
    my $implicitMatrixRef = shift;
    my $measure = shift;
    my $association = shift;
    my $abScoresRef = shift;

    #optionally pass in stats so they don't get recalculated for
    # multiple terms (such as with time slicing)
    my $n1pRef = shift;
    my $np1Ref = shift;
    my $nppRef = shift;

    #get linking term count scores
    my $ltcAssociationsRef = &scoreImplicit_linkingTermCount($startingMatrixRef, $explicitMatrixRef, $implicitMatrixRef);

    #get average minimum weight scores
    my $amwScoresRef = &scoreImplicit_averageMinimumWeight($startingMatrixRef, $explicitMatrixRef, $implicitMatrixRef, $measure, $association, $abScoresRef, $n1pRef, $np1Ref, $nppRef); 

    #create a hash of cui pairs for which the key is the ltc, and the value is an array of cui pairs that have that LTC
    my %ltcHash = ();
    foreach my $pairKey (keys %{$ltcAssociationsRef}) {
		
	#get the LTC we will be tie breaking
	my $currentLTC = ${$ltcAssociationsRef}{$pairKey};
	if (!exists $ltcHash{$currentLTC}) {
	    my @newArray = ();
	    $ltcHash{$currentLTC} = \@newArray;

lib/LiteratureBasedDiscovery/Rank.pm  view on Meta::CPAN

	    $currentRank--;
	}
    }

    #return the scores
    return \%ltcAMWScores;
}

#TODO this is an untested method
# gets the max cosine distance score between all a terms and each cTerm 
# input:  $startingMatrixRef <- ref to the starting matrix
#         $explicitMatrixRef <- ref to the explicit matrix
#         $implicitMatrixRef <- ref to the implicit matrix
# output: a hash ref of scores for each implicit key. (hash{cui} = score)
sub score_cosineDistance {
    #LBD Info
    my $startingMatrixRef = shift;
    my $explicitMatrixRef = shift;
    my $implicitMatrixRef = shift;

    #get all the A->C pairs
    my $acPairsRef = &_getACPairs($startingMatrixRef, $implicitMatrixRef);
    my %scores = ();
    foreach my $pairKey (keys %{$acPairsRef}) {
	#get the A and C keys
	my ($aKey, $cKey) = split(/,/,$pairKey);

	#grab the A and C explicit vectors
	my $aVectorRef = ${$explicitMatrixRef}{$aKey};
	my $cVectorRef = ${$explicitMatrixRef}{$cKey};

	#find the numerator which is the sum of A[i]*C[i] values

lib/LiteratureBasedDiscovery/Rank.pm  view on Meta::CPAN

	}
	else {
	    $scores{$cKey} = $score;
	}	
    }
    
    return \%scores;
}

# gets a list of A->C pairs, and sets the value as the implicit matrix value
# input:  $startingMatrixRef <- ref to the starting matrix (accepted to keep
#                               the calling convention; it is not read here,
#                               the A terms are the implicit matrix row keys)
#         $implicitMatrixRef <- ref to the implicit matrix
# output: a hash ref where keys are comma separated cui pairs hash{'C000,C111'}
#         and values are set to the value at that index in the implicit matrix
sub _getACPairs {
    my $startingMatrixRef = shift;
    my $implicitMatrixRef = shift;

    #generate a list of ac pairs
    my %acPairs = ();
    foreach my $keyA (keys %{$implicitMatrixRef}) {
	my $rowRef = ${$implicitMatrixRef}{$keyA};

	#iterate over the column KEYS only; iterating the flattened hash
	# would also visit every value and record it as if it were a C term
	foreach my $keyC (keys %{$rowRef}) {
	    #join with an explicit comma. A $hash{$a,$c} subscript would join
	    # the keys with $; instead, and the pair could not be recovered
	    # with split(/,/, ...)
	    $acPairs{"$keyA,$keyC"} = ${$rowRef}{$keyC};
	}
    }

    return \%acPairs;
}


# linking term count (LTC) score: for every implicit (C) CUI, counts how
# many B->C pairs returned by _getBCPairs end in that CUI.
# input:  $startingMatrixRef <- ref to the starting matrix
#         $explicitMatrixRef <- ref to the explicit matrix
#         $implicitMatrixRef <- ref to the implicit matrix
# output: a hash ref of scores for each implicit key. (hash{cui} = score)
sub scoreImplicit_linkingTermCount {
    my ($startingMatrixRef, $explicitMatrixRef, $implicitMatrixRef) = @_;

    #each key of the returned hash is a "bTerm,cTerm" pair
    my $linkedPairsRef = &_getBCPairs(
	$startingMatrixRef, $explicitMatrixRef, $implicitMatrixRef);

    #tally one per pair against the C term (the second field of the key)
    my %ltcOf;
    for my $bcKey (keys %{$linkedPairsRef}) {
	my (undef, $cTerm) = split(/,/, $bcKey);
	$ltcOf{$cTerm} += 1;
    }

    return \%ltcOf;
}


# frequency score: for every implicit (C) CUI, sums the values of all
# B->C pairs returned by _getBCPairs that end in that CUI.
# (A->B frequencies are NOT considered)
# input:  $startingMatrixRef <- ref to the starting matrix
#         $explicitMatrixRef <- ref to the explicit matrix
#         $implicitMatrixRef <- ref to the implicit matrix
# output: a hash ref of scores for each implicit key. (hash{cui} = score)
sub scoreImplicit_frequency {
    my ($startingMatrixRef, $explicitMatrixRef, $implicitMatrixRef) = @_;

    #each key of the returned hash is a "bTerm,cTerm" pair
    my $linkedPairsRef = &_getBCPairs(
	$startingMatrixRef, $explicitMatrixRef, $implicitMatrixRef);

    #accumulate each pair's value against the C term (second field of the key)
    my %frequencyOf;
    for my $bcKey (keys %{$linkedPairsRef}) {
	my (undef, $cTerm) = split(/,/, $bcKey);
	$frequencyOf{$cTerm} += ${$linkedPairsRef}{$bcKey};
    }

    return \%frequencyOf;
}

# scores each implicit CUI using an association measure. Score is the maximum
# association between a column in the implicit matrix, and one of the start 
# matrix terms (so max between any A and that C term). 
# Score is calculated using the implicit matrix
# input:  $startCuisRef <- ref to an array of start cuis (A terms)
#         $implicitMatrixFileName <- fileName of the implicit matrix
#         $measure <- the string of the umls association measure to use
#         $association <- an instance of umls association
# output: a hash ref of scores for each implicit key. (hash{cui} = score)
sub scoreImplicit_fromImplicitMatrix {
    #LBD Info
    my $startCuisRef = shift;
    my $implicitMatrixFileName = shift;
    my $measure = shift;
    my $association = shift;

######################################
    #Get hashes for A and C terms
#####################################
    #create a hash of starting terms
    my %aTerms = ();
    foreach my $cui (@{$startCuisRef}) {
	$aTerms{$cui} = 1;
    }

    #get all the target terms (terms that co-occur with aTerms 
    # in the implicit matrix file = the implicit terms)
    open IN, "$implicitMatrixFileName";
    my %cTerms = ();
    while (my $line = <IN>) {
	$line =~ /(C\d{7})\s(C\d{7})/;
	if (exists $aTerms{$1}) {

lib/LiteratureBasedDiscovery/Rank.pm  view on Meta::CPAN

    foreach my $cTerm(keys %cTerms) {
	$associationScores{$cTerm} = 
	    $association->_calculateAssociation_fromObservedCounts($n11{$cTerm}, $n1p, $np1{$cTerm}, $npp, $measure);
    }

    return \%associationScores;
}

# scores each implicit CUI using an association measure. Score is the maximum
# association between any of the linking terms.
# input:  $startingMatrixRef <- ref to the starting matrix
#         $explicitMatrixRef <- ref to the explicit matrix
#         $implicitMatrixRef <- ref to the implicit matrix
#         $measure <- the string of the umls association measure to use
#         $association <- an instance of umls association
# output: a hash ref of scores for each implicit key. (hash{cui} = score)
sub scoreImplicit_fromAllPairs {
    #LBD Info
    my $startingMatrixRef = shift;
    my $explicitMatrixRef = shift;
    my $implicitMatrixRef = shift;
    my $measure = shift;
    my $association = shift;

    #optionally pass in stats so they don't get recalculated for
    # multiple terms (such as with time slicing)
    my $n1pRef = shift;
    my $np1Ref = shift;
    my $npp = shift;

    #get all bc pairs
    my $bcPairsRef = &_getBCPairs($startingMatrixRef, 
				  $explicitMatrixRef, $implicitMatrixRef);

    #get bc pairs scores
    &getBatchAssociationScores(
	$bcPairsRef, $explicitMatrixRef, $measure, $association,
	$n1pRef, $np1Ref, $npp);


    # Find the max explicitCUI,implicitCUI association for each implicit CUI. 
    # The association score is the maximum value between a C term and all 

lib/LiteratureBasedDiscovery/Rank.pm  view on Meta::CPAN

#XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
#XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX



# Builds a list of B->C term pairs that also co-occur with A terms
# Only adds B->C term pairs for C terms that are also present in the 
# implicitMatrix.
# The value of the bcPairs Hash is the value in the explicit matrix 
# for that pair.
# input:  $startingMatrixRef <- ref to the starting matrix
#         $explicitMatrixRef <- ref to the explicit matrix
#         $implicitMatrixRef <- ref to the implicit matrix
# output: a hash ref of BC term pairs. Each key is "$bTerm,$cTerm", 
#         value is by default the frequency of BC co-occurrences in the 
#         matrix
sub _getBCPairs {
    my $startingMatrixRef = shift;
    my $explicitMatrixRef = shift;
    my $implicitMatrixRef = shift;

    #get all bTerms
    my %bTerms = ();
    my $rowRef;
    foreach my $rowKey (keys %{$startingMatrixRef}) {
	$rowRef = ${$startingMatrixRef}{$rowKey};
	foreach my $colKey (keys %{$rowRef}) {
	    $bTerms{$colKey} = 1;
	}
    }

    #get all the cTerms (unique column values in the implicit matrix)
    my %cTerms = ();
    foreach my $rowKey(keys %{$implicitMatrixRef}) {
	$rowRef = ${$implicitMatrixRef}{$rowKey};
	foreach my $colKey (keys %{$rowRef}) {

lib/LiteratureBasedDiscovery/Rank.pm  view on Meta::CPAN


	#get association if possible (only possible if the terms have co-occurred)
	if (defined $n11) {
	    ${$cuiPairsRef}{$key} = $association->_calculateAssociation_fromObservedCounts($n11, ${$n1pRef}{$cui1}, ${$np1Ref}{$cui2}, $npp, $measure);
	}
    }
}

# gets NP1, N1P, and NPP for all CUIs. This is used in time-
# slicing and makes it much faster than getting stats individually
# for each starting term
# input:  $matrixRef <- ref to the co-occurrence matrix (the sparse matrix 
#                       of n11 values)
# output: \@vals <- an array ref of three values:
#                   \%n1p - a hash ref where the key is a cui and value is n1p
#                   \%np1 - a hash ref where the key is a cui and value is np1
#                   $npp - a scalar of npp
sub getAllStats {
    my $matrixRef = shift;

    #get all np1, n1p, and npp values of values for each cui

lib/LiteratureBasedDiscovery/TimeSlicing.pm  view on Meta::CPAN

    #    = the number of terms in the post cutoff matrix)
    $precision /= scalar keys %{$trueMatrixRef};
    $recall /= scalar keys %{$trueMatrixRef};

    #return the average precision and recall
    return ($precision, $recall);
}


# loads the post cutoff matrix from file. Only loads rows corresponding
# to rows in the starting matrix ref to save memory, and because those are 
# the only rows that are needed.
# input:  $startingMatrixRef <- a ref to the starting sparse matrix
#         $explicitMatrixRef <- a ref to the explicit sparse matrix
#         $postCutoffFileName <- the filename to the postCutoffMatrix
# output: \%postCutoffMatrix <- a ref to the postCutoff sparse matrix
sub loadPostCutOffMatrix {
    my $startingMatrixRef = shift;
    my $explicitMatrixRef = shift;
    my $postCutoffFileName = shift;
    print "loading postCutoff Matrix\n";
    
    #open the post cutoff file
    open IN, $postCutoffFileName 
	or die ("ERROR: cannot open post cutoff file: $postCutoffFileName");

    #create hash of cuis to grab
    my %cuisToGrab = ();
    foreach my $rowKey (keys %{$startingMatrixRef}) {
	$cuisToGrab{$rowKey} = 1;
    }

    #read in values of the post cutoff matrix for the start terms
    my %postCutoffMatrix = ();
    my ($cui1, $cui2, $val);
    while (my $line = <IN>) {
	#grab values from the line
	chomp $line;
	($cui1, $cui2, $val) = split(/\t/,$line);

	#see if this line contains a key that should be read in 
	if (exists $cuisToGrab{$cui1}) {

lib/LiteratureBasedDiscovery/TimeSlicing.pm  view on Meta::CPAN

	    }
	}
    }
    close IN;

    #return the post cutoff matrix
    return \%postCutoffMatrix;
}

#TODO numRows should be read from file and sent with the lbdOptionsRef
# generates a starting matrix, either from the CUIs listed in a file, or
# from numRows randomly selected rows of an acceptable semantic type
# input:  $explicitMatrixRef <- a ref to the explicit sparse matrix
#         $lbdOptionsRef <- the LBD options. Keys read here:
#                           'cuiListFileName' - if present, the rows are
#                              the CUIs loaded from this file (no random
#                              selection is done)
#                           'cuiListOutputFile' - if present (random mode
#                              only), the selected CUIs are written to it,
#                              one per line
#         $startTermAcceptTypesRef <- a reference to an hash of accept 
#                                     types for start terms (TUIs)
#         $numRows <- the number of random rows to load (if random)
#         $umls_interface <- an instance of the UMLS::Interface
# output: \%startingMatrix <- a ref to the starting sparse matrix. Each row
#                             value is the row ref held by the explicit
#                             matrix (rows are shared, not copied)
# dies if there are fewer acceptable rows than $numRows, or if the
# cuiListOutputFile cannot be opened
sub generateStartingMatrix {
    my $explicitMatrixRef = shift;
    my $lbdOptionsRef = shift;
    my $startTermAcceptTypesRef = shift;
    my $numRows = shift;
    my $umls_interface = shift;

    #generate the starting matrix randomly or from a file
    my %startingMatrix = ();

    #check if a file is defined
    if (exists ${$lbdOptionsRef}{'cuiListFileName'}) {
	#grab the rows defined by the cuiListFile
	my $cuisRef = &loadCUIs(${$lbdOptionsRef}{'cuiListFileName'});
	foreach my $cui (keys %{$cuisRef}) {
	    if(exists ${$explicitMatrixRef}{$cui}) {
		$startingMatrix{$cui} = ${$explicitMatrixRef}{$cui};	
	    }
	    else {
		print STDERR "WARNING: CUI from cuiListFileName is not in explicitMatrix: $cui\n";
	    }
	}
    }
    else {
	#randomly grab rows
	#apply semantic filter to the rows (just retrieve appropriate rows)
	my $rowsToKeepRef = getRowsOfSemanticTypes(
	    $explicitMatrixRef, $startTermAcceptTypesRef, $umls_interface);
	my $numCandidates = scalar keys %{$rowsToKeepRef};
	($numCandidates >= $numRows) or die("ERROR: number of acceptable rows starting terms is less than $numRows\n");

	#randomly select $numRows rows (to generate the 'starting matrix')
	#generate distinct random numbers from 0 to (number of acceptable 
	# rows - 1). Duplicates collapse in the hash, so keep drawing until
	# $numRows distinct indices have been collected
	my %rowNumbers = ();
	while ((scalar keys %rowNumbers) < $numRows) {
	    $rowNumbers{int(rand($numCandidates))} = 1;
	}

	#fill starting matrix with keys corresponding to the random numbers 
	# ($i is the position of the key in this traversal of the hash)
	my $i = 0;
	foreach my $key (keys %{$rowsToKeepRef}) {
	    if (exists $rowNumbers{$i}) {
		$startingMatrix{$key} = ${$explicitMatrixRef}{$key}
	    }
	    $i++;
	}

	#output the cui list if needed
	if (exists ${$lbdOptionsRef}{'cuiListOutputFile'}) {
	    my $cuiListOutputFile = ${$lbdOptionsRef}{'cuiListOutputFile'};

	    #three-argument open with a lexical handle, so characters in the
	    # file name are never interpreted as part of the open mode
	    open(my $outFh, '>', $cuiListOutputFile) 
		or die ("ERROR: cannot open cuiListOutputFile:$cuiListOutputFile: $!\n");
	    foreach my $cui (keys %startingMatrix) {
		print $outFh "$cui\n";
	    }

	    #buffered write errors only surface at close, so report them
	    close($outFh) 
		or print STDERR "WARNING: error closing cuiListOutputFile:$cuiListOutputFile: $!\n";
	}
    }

    #return the starting matrix
    return \%startingMatrix;
}


# gets and returns a hash of row keys of the specified semantic types
# input:  $matrixRef <- a ref to a sparse matrix
#         $acceptTypesRef <- a ref to a hash of accept types (TUIs)
#         $umls <- an instance of UMLS::Interface
# output: \%rowsToKeep <- a ref to hash of rows to keep, each key is 
#                         a CUI, and values are 1. All CUIs specify rows
#                         of acceptable semantic types

lib/LiteratureBasedDiscovery/TimeSlicing.pm  view on Meta::CPAN

    my %precision = ();
    my %recall = ();
    foreach my $rowKey (keys %{$trueMatrixRef}) {
	my $trueRef = ${$trueMatrixRef}{$rowKey}; #a list of true discoveries
	my $rankedPredictionsRef = ${$rowRanksRef}{$rowKey}; #an array ref of ranked predictions
	
	#get the number of predicted discoveries and true discoveries
	my $numPredictions = scalar @{$rankedPredictionsRef};
	my $numTrue = scalar keys %{$trueRef};

	#skip if there are NO new discoveries for this start term
	if ($numTrue == 0) {
	    next;
	}
	#skip if there are NO predictions for this start term
	if ($numPredictions == 0) {
	    next;
	}

	#determine precision and recall at 10% intervals of the number of 
	#predicted true values. This is done by simulating a threshold being
	#applied, so the top $numToTest ranked terms are tested at 10% intervals
	my $interval = $numPredictions/$numIntervals;
	for (my $i = 0; $i <= 1; $i+=(1/$numIntervals)) {
	    

lib/LiteratureBasedDiscovery/TimeSlicing.pm  view on Meta::CPAN

	if (!defined $rankedPredictionsRef) {
	    next;
	} 
	my $trueRef = ${$trueMatrixRef}{$rowKey}; #a list of true discoveries
	my $numPredictions = scalar @{$rankedPredictionsRef};

	#calculate the average precision of this true cui, by comparing 
	# the predicted vs. true values ordered and weighted by their rank
	my $ap = 0; #average precision
	my $truePositiveCount = 0;
	#start at 1, since divide by rank...subtract one when indexing
	for (my $rank = 1; $rank <= $numPredictions; $rank++) {
	    my $cui = ${$rankedPredictionsRef}[$rank-1];
	    if (exists ${$trueRef}{$cui}) {
		$truePositiveCount++;
		$ap += ($truePositiveCount/($rank));
	    }
	}

	#calculate the average precision, and add to map
	if ($truePositiveCount > 0) {

lib/LiteratureBasedDiscovery/TimeSlicing.pm  view on Meta::CPAN

	}
    }

    #return the mean precisions at k
    return \%meanPrecision;
}


# calculates the number of co-occurrences in the gold set of the top ranked 
# k predictions at k at intervals of 1, from k = 1-10 and intervals of 10 
# for 10-100. Co-occurrence counts are averaged over each of the starting terms
# input:  $trueMatrixRef <- a ref to a hash of true discoveries
#         $rowRanksRef <- a ref to a hash of arrays of ranked predictions. 
#                         Each hash key is a cui,  each hash element is an 
#                         array of ranked predictions for that cui. The ranked 
#                         predictions are cuis are ordered in descending order 
#                         based on association. (from Rank::RankDescending)
# output: \%meanCooccurrenceCounts <- a hash of mean co-occurrence counts 
#                                     at k, each key is the value of k, and 
#                                     the value is the mean count at that k
sub calculateMeanCooccurrencesAtK {

samples/lbdConfig  view on Meta::CPAN

#            Configuration File for Literature Based Discovery
##############################################################################
# All the options in this file are parsed and used as parameters for
# Literature Based Discovery
# Options keys are in <>'s, and values follow directly after with no space. 
# As an example, the line "<rankingMethod>ll" will set the 'rankingMethod' 
# parameter with a value of 'll' for literature based discovery
#
# For parameters where no value is needed, just write the name of the
# parameter in '<>' (e.g. '<debug>')
# lines started with a # are skipped and may be used for comments

# The ranking procedure to use for LBD
# valid ranking procedures are:
#   allPairs (maxBC) - maximum B to C term value
#   averageMinimumWeight (AMW) - average of minimum A to B and B to C values
#   linkingTermCount* (LTC) - count of shared linking terms
#   frequency* (freq) - sum of A to B co-occurrences of shared B terms
#   ltcAssociation (LTA) - association measures that use linking terms as inputs
#   ltc_AMW - linking term count with AMW as a tie-breaker
#

samples/lbdConfig  view on Meta::CPAN

#<targetAcceptGroups>CHEM,GENE

# similar to targetAcceptGroups, this restricts the acceptable target (C) 
# terms to terms within the semantic types listed
# See http://metampa.nlm.gov/Docs/SemanticTypes_2013AA.txt for a list
#<linkingAcceptGroups>clnd,chem

# Input file path for the explicit co-occurrence matrix used in LBD
<explicitInputFile>sampleExplicitMatrix

# A comma separated list of starting (A) cuis used in LBD
<startCuis>C0001554,C1961131

# A comma separated list of target (C) cuis. If specified, system enters closed
# discovery mode
#<targetCuis>

samples/timeSlicingConfig  view on Meta::CPAN

#            Configuration File for Literature Based Discovery
##############################################################################
# All the options in this file are parsed and used as parameters for
# Literature Based Discovery
# Options keys are in <>'s, and values follow directly after with no space. 
# As an example, the line "<rankingMethod>ll" will set the 'rankingMethod' 
# parameter with a value of 'll' for literature based discovery
#
# For parameters where no value is needed, just write the name of the
# parameter in '<>' (e.g. '<debug>')
# lines started with a # are skipped and may be used for comments

#----- Time Slicing Specific Parameters ------------------------

#Tell LBD to enter precision and recall mode (time slicing)
<precisionAndRecall_implicit>

# name of the file that contains a new line separated list of cuis
# each cui serves as a start term. The average of all cuis in this
# file are what is reported for precision and recall
<cuiListFileName>timeSliceCuiList

# A list of starting accept types. This is used to randomly generate 100
# starting terms if a cuiListFileName is not specified. All starting terms
# will be of the types listed
<startAcceptTypes>dsyn


#--------------------------------------

# The ranking procedure to use for LBD
# valid ranking procedures are:
#   allPairs (maxBC) - maximum B to C term value
#   averageMinimumWeight (AMW) - average of minimum A to B and B to C values
#   linkingTermCount* (LTC) - count of shared linking terms
#   frequency* (freq) - sum of A to B co-occurrences of shared B terms

t/goldSampleOutput  view on Meta::CPAN

Parameters:
explicitInputFile -> sampleExplicitMatrix
implicitOutputFile -> sampleOutput
rankingMeasure -> ll
rankingProcedure -> averageMinimumWeight
startCuis -> C0001554,C1961131

1	10.78580	C0302583	Iron
2	2.70240	C1880198	
3	2.33680	C0027651	Neoplasm
4	2.33680	C0034494	
5	2.33680	C0242151	
6	2.33680	C0274281	
7	2.33680	C0332157	
8	2.33680	C0340639	
9	2.33680	C0741969	

t/goldSampleTimeSliceOutput  view on Meta::CPAN

In timeSlicing_generatePrecisionAndRecall_implicit
loading explicit
generating starting
inputting gold
getting AB scores
applying semantic filter to explicit matrix
generating predictions
Squaring Matrix
Removing Known from Predictions
Applying Semantic Filter to Predictions
outputting predictions
getting row ranks
calculating precision and recall

utils/datasetCreator/applySemanticFilter.pl  view on Meta::CPAN


####### User input
my $matrixFileName = '/home/henryst/lbdData/groupedData/1975_1999_window8_noOrder_threshold5';
my $outputFileName = $matrixFileName.'_filtered';
my $acceptTypesString = ''; #leave blank if none are applied
my $acceptGroupsString = 'CHEM,DISO,GENE,PHYS,ANAT'; #for the explicit matrix
my $interfaceConfig = '/home/share/packages/ALBD/config/interface';

#apply the filter to rows and columns or columns only
# apply to just columns generally for the implicit matrix
#   ...if the rows are just the starting terms
# apply to rows and columns generally for the explicit matrix
my $columnsOnly = 0; #apply to columns only, or rows and columns

&applySemanticFilter($matrixFileName, $outputFileName, 
		     $acceptTypesString, $acceptGroupsString,



###################################################################
###################################################################

utils/datasetCreator/combineCooccurrenceMatrices.pl  view on Meta::CPAN

# created separately for each year, and stored in a single folder. Creating
# co-occurrence matrices in this manner is useful because it makes running
# the CUICollector faster, and because files can be easily combined for
# different time slicing or discovery replication results. We ran CUI Collector
# separately for each year of the MetaMapped MEDLINE baseline and stored each
# co-occurrence matrix in a single folder "hadoopByYear/output/". That folder 
# contained files named by the year and window size used (e.g. 1975_window8).
# The code may need to be modified slightly for other purposes.
use strict;
use warnings;
# File-scoped settings for the run. $dataFolder is not passed to
# combineFiles; the sub reads this file-scoped variable directly.
my $startYear;
my $endYear;
my $windowSize;
my $dataFolder;

#user input
# $dataFolder is concatenated directly with the year when the input file 
# names are built ($dataFolder.$year.'_window'.$windowSize), so it must 
# end with a path separator
$dataFolder = '/home/henryst/hadoopByYear/output/';
# first and last year (inclusive) of the per-year files to combine
$startYear = '1983';
$endYear = '1985';
# window size, used in both the input and the output file names
$windowSize = 8;
# writes "<startYear>_<endYear>_window<windowSize>" (a relative path) and 
# dies if that file already exists
&combineFiles($startYear,$endYear,$windowSize);


#####################################################
####### Program Start ########
sub combineFiles {
    my $startYear = shift;
    my $endYear = shift;
    my $windowSize = shift;

#Check on I/O
    my $outFileName = "$startYear".'_'."$endYear".'_window'."$windowSize";
(!(-e $outFileName)) 
    or die ("ERROR: output file already exists: $outFileName\n");
open OUT, ">$outFileName" 
    or die ("ERROR: unable to open output file: $outFileName\n");

#combine the files
my %matrix = ();
for(my $year = $startYear; $year <= $endYear; $year++) {
    print "reading $year\n";
    my $inFile = $dataFolder.$year.'_window'.$windowSize;
    if (!(open IN, $inFile)) {
	print "   ERROR: unable to open $inFile\n";
	next;
    }

    #read each line of the file and add to the matrix
    while (my $line = <IN>) {
	#read values from the line

utils/datasetCreator/dataStats/metaAnalysis.pl  view on Meta::CPAN

use strict;
use warnings;

#perform meta-analysis on a single co-occurrence matrix
# (the values returned by this call are not used)
&metaAnalysis('/home/henryst/lbdData/groupedData/1960_1989_window8_noOrder');

#perform meta-analysis on a date range of co-occurrence matrices in a folder
# this expects a folder to contain a co-occurrence matrix for every year
# specified within the date range
# NOTE(review): folderMetaAnalysis builds each input path as
# $dataFolder.$year.'_window'.$windowSize with no separator in between, and
# this value has no trailing '/'. As written the paths look like
# '.../1960_19891809_window1' -- confirm whether a trailing '/' is intended.
# Years whose file cannot be opened are reported and skipped.
my $dataFolder = '/home/henryst/lbdData/dataByYear/1960_1989';
my $startYear = '1809';
my $endYear = '2015';
my $windowSize = 1;
# tab separated stats file; one header row, then one row per readable year
my $statsOutFileName = '/home/henryst/lbdData/stats_window1';
&folderMetaAnalysis($startYear, $endYear, $windowSize, $statsOutFileName, $dataFolder);


#####################
# runs meta analysis on a set of files
sub folderMetaAnalysis {
    my $startYear = shift;
    my $endYear = shift;
    my $windowSize = shift;
    my $statsOutFileName= shift;
    my $dataFolder = shift;

    #Check on I/O
    open OUT, ">$statsOutFileName" 
	or die ("ERROR: unable to open stats out file: $statsOutFileName\n");

    #print header row
    print OUT "year\tnumRows\tnumCols\tvocabularySize\tnumCooccurrences\n";

    #get stats for each file and output to file
    for(my $year = $startYear; $year <= $endYear; $year++) {
	print "reading $year\n";
	my $inFile = $dataFolder.$year.'_window'.$windowSize;
	if (open IN, $inFile) {
	    (my $numRows, my $numCols, my $vocabularySize, my $numCooccurrences)
		= &metaAnalysis($inFile);
	    print OUT "$year\t$numRows\t$numCols\t$vocabularySize\t$numCooccurrences\n"	
	}
	else {
	    #just skip the file
	    print "   ERROR: unable to open $inFile\n";



( run in 0.352 second using v1.01-cache-2.11-cpan-0d8aa00de5b )