ALBD

 view release on metacpan or  search on metacpan

lib/ALBD.pm  view on Meta::CPAN

#
# LiteratureBasedDiscovery.pm - provides functionality to perform LBD
#
# Matrix Representation:
# LBD is performed using Matrix and Vector operations. The major components 
# are an explicit knowledge matrix, which is squared to find the implicit 
# knowledge matrix.
#
# The explicit knowledge is read from the UMLS::Association N11 matrix. This 
# matrix contains the co-occurrence counts for all CUI pairs. The 
# UMLS::Association database is completely independent from this 
# implementation, so any dataset, window size, or anything else may be used. 
# Data is read in as a sparse matrix using the Discovery::tableToSparseMatrix 
# function. This returns the primary data structures and variables used 
# throughout LBD.
#
# Matrix representation: 
# This module uses a matrix representation for LBD. All operations are 
# performed either as matrix or vector operations. The core data structures
# are the co-occurrence matrices explicitMatrix and implicitMatrix. These
# matrices have dimensions vocabulary size by vocabulary size. Each row 
# corresponds to all of the co-occurrences for a single CUI. Each column of 
# that row corresponds to a co-occurrence with a single CUI. Since the 
# matrices tend to be sparse, they are stored as hashes of hashes, where the 
# first key is for a row, and the second key is for a column. The keys of 
# each hash are the indices within the matrix. The hash values are the number 
# of co-occurrences for that CUI pair (e.g. ${$explicit{C0000000}}{C1111111} = 10 
# means that CUI C0000000 and C1111111 co-occurred 10 times).
#
# Now with an understanding of the data structures, below is a brief 
# description of each: 
#
# startingMatrix <- A matrix containing the explicit matrix rows for all of the
#                   start terms. This makes it easy to have multiple start terms
#                   and using this matrix as opposed to the entire explicit 
#                   matrix drastically improves performance.
# explicitMatrix <- A matrix containing explicit connections (known connections)
#                   for every CUI in the dataset.            
# implicitMatrix <- A matrix containing implicit connections (discovered 
#                   connections) for every CUI in the dataset.


package ALBD;

use strict;
use warnings;

use LiteratureBasedDiscovery::Discovery;
use LiteratureBasedDiscovery::Evaluation;
use LiteratureBasedDiscovery::Rank;
use LiteratureBasedDiscovery::Filters;
use LiteratureBasedDiscovery::TimeSlicing;

use UMLS::Association;
use UMLS::Interface;

#### UPDATE VERSION HERE #######
use vars qw($VERSION);
$VERSION = 0.05;

#global variables (file-scoped lexicals shared by every sub in this package)

#debug flag, off by default
# NOTE(review): not referenced in the visible portion of this file - confirm
# it is still used before relying on it
my $DEBUG = 0;

#presumably the name of the UMLS::Association N11 co-occurrence table
# NOTE(review): not referenced in the visible portion of this file - TODO confirm
my $N11_TABLE = 'N_11';

#the LBD options, filled in by _initialize from the file named by the 
# 'lbdConfig' option. Keys include:
my %lbdOptions = ();
   #rankingProcedure <-- the procedure to use for ranking
   #rankingMeasure <-- the association measure to use for ranking 
   #implicitOutputFile  <--- the output file of results
   #explicitInputFile <-- file to load explicit matrix from
   #implicitInputFile <-- load implicit from file rather than calculating
   #targetCuis <-- if present, performLBD runs closed discovery
   #precisionAndRecall_explicit <-- if present, performLBD runs the explicit 
   #                                time slicing precision and recall
   #precisionAndRecall_implicit <-- if present, performLBD runs the implicit 
   #                                time slicing precision and recall

#references to other packages, both are created in _initialize
my $umls_interface;    #the UMLS::Interface object
my $umls_association;  #the UMLS::Association object

#####################################################
####################################################

# performs LBD
# input:  none
# output: none, but a results file is written to disk
sub performLBD {
    my $self = shift;
    my $start; #used to record run times

    #implicit matrix ranking requires a different set of procedures
    if ($lbdOptions{'rankingProcedure'} eq 'implicitMatrix') { 
	$self->performLBD_implicitMatrixRanking();
	return;
    }
    if (exists $lbdOptions{'targetCuis'}) {
	$self->performLBD_closedDiscovery();
	return;
    }
    if (exists $lbdOptions{'precisionAndRecall_explicit'}) {
	$self->timeSlicing_generatePrecisionAndRecall_explicit();
	return;
    }
    if (exists $lbdOptions{'precisionAndRecall_implicit'}) {
	$self->timeSlicing_generatePrecisionAndRecall_implicit();
	return;
    }
    print "Open Discovery\n";
    print $self->_parametersToString();

#Get inputs
    my $startCuisRef = $self->_getStartCuis();
    my $linkingAcceptTypesRef = $self->_getAcceptTypes('linking');
    my $targetAcceptTypesRef = $self->_getAcceptTypes('target');
    print "startCuis = ".(join(',', @{$startCuisRef}))."\n";
    print "linkingAcceptTypes = ".(join(',', keys %{$linkingAcceptTypesRef}))."\n";
    print "targetAcceptTypes = ".(join(',', keys %{$targetAcceptTypesRef}))."\n";

#Get the Explicit Matrix
    $start = time;
    my $explicitMatrixRef;
    if(!defined $lbdOptions{'explicitInputFile'}) {
	die ("ERROR: explicitInputFile must be defined in LBD config file\n");
    }
    $explicitMatrixRef = Discovery::fileToSparseMatrix($lbdOptions{'explicitInputFile'});
    print "Got Explicit Matrix in ".(time() - $start)."\n";
    

lib/ALBD.pm  view on Meta::CPAN

	my $scoresRef;
	if ($lbdOptions{'rankingProcedure'} eq 'allPairs') {
	    #get stats just a single time
	    if (!defined $n1pRef || !defined $np1Ref || !defined $npp) {
		($n1pRef, $np1Ref, $npp) = Rank::getAllStats($explicitMatrixRef);
	    }
	    $scoresRef = Rank::scoreImplicit_fromAllPairs(\%startingRow, $explicitMatrixRef, \%implicitRow, $lbdOptions{'rankingMeasure'}, $umls_association, $n1pRef, $np1Ref, $npp);
	} elsif ($lbdOptions{'rankingProcedure'} eq 'averageMinimumWeight') {
	    #get stats just a single time
	    if (!defined $n1pRef || !defined $np1Ref || !defined $npp) {
		($n1pRef, $np1Ref, $npp) = Rank::getAllStats($explicitMatrixRef);
	    }
	    $scoresRef = Rank::scoreImplicit_averageMinimumWeight(\%startingRow, $explicitMatrixRef, \%implicitRow, $lbdOptions{'rankingMeasure'}, $umls_association, \%abPairsWithScores, $n1pRef, $np1Ref, $npp);
	} elsif ($lbdOptions{'rankingProcedure'} eq 'linkingTermCount') {
	    $scoresRef = Rank::scoreImplicit_linkingTermCount(\%startingRow, $explicitMatrixRef, \%implicitRow);
	} elsif ($lbdOptions{'rankingProcedure'} eq 'frequency') {
	    $scoresRef = Rank::scoreImplicit_frequency(\%startingRow, $explicitMatrixRef, \%implicitRow);
	} elsif ($lbdOptions{'rankingProcedure'} eq 'ltcAssociation') {
	    $scoresRef = Rank::scoreImplicit_ltcAssociation(\%startingRow, $explicitMatrixRef, \%implicitRow, $lbdOptions{'rankingMeasure'}, $umls_association);
	} elsif ($lbdOptions{'rankingProcedure'} eq 'ltc_amw') {
	    #get stats just a single time
	    if (!defined $n1pRef || !defined $np1Ref || !defined $npp) {
		($n1pRef, $np1Ref, $npp) = Rank::getAllStats($explicitMatrixRef);
	    }
	    $scoresRef = Rank::scoreImplicit_LTC_AMW(\%startingRow, $explicitMatrixRef, \%implicitRow, $lbdOptions{'rankingMeasure'}, $umls_association, \%abPairsWithScores, $n1pRef, $np1Ref, $npp);
	}  else {
	    die ("Error: Invalid Ranking Procedure\n");
	}    
	
	#Rank Implicit Connections
	my $ranksRef = Rank::rankDescending($scoresRef);

	#save the row ranks
	$rowRanks{$rowKey} = $ranksRef;
    }

    #output the results at 10 intervals
    TimeSlicing::outputTimeSlicingResults($goldMatrixRef, \%rowRanks, 10);
}



##############################################################################
#        functions to grab parameters and initialize all input
##############################################################################
# constructor for a LiteratureBasedDiscovery object
# input:  $optionsHashRef <- a reference to an LBD options hash, it is passed
#                            unchanged to _initialize
# output: a new LBD object (an empty hash blessed into the invoking class)
sub new {
    my ($className, $optionsHashRef) = @_;

    #bless before initializing so that _initialize can be called as a method
    my $self = bless {}, $className;
    $self->_initialize($optionsHashRef);

    return $self;
}

# Initializes everything needed for Literature Based Discovery
# input:  $optionsHashRef <- reference to LBD options hash (command line input)
#           keys read here:
#             'interfaceConfig' <- optional, config file for UMLS::Interface
#             'assocConfig'     <- optional, config file for UMLS::Association
#             'lbdConfig'       <- required, config file of LBD parameters
# output: none, but the file-level $umls_interface, $umls_association, and 
#         %lbdOptions are set
# dies:   if 'lbdConfig' is missing or empty, if a config file cannot be read,
#         or if either UMLS object cannot be created
sub _initialize {
    my $self = shift;
    my $optionsHashRef = shift;

    #fail early, with a clear message, when the required LBD config file is
    # missing rather than after the UMLS objects have been constructed
    my $lbdConfigFile = ${$optionsHashRef}{'lbdConfig'};
    if (!defined $lbdConfigFile || $lbdConfigFile eq '') {
        die ("Error: lbdConfig must be defined in the LBD options hash\n");
    }

    #default hash values are with t=1 (silence module output)
    my %tHash = ();
    $tHash{'t'} = 1;

    #returns the options for a single component: the contents of its 
    # configuration file if one was given, otherwise the defaults. The 
    # defined check avoids an 'uninitialized value' warning when the 
    # optional key is absent from the options hash
    my $getComponentOptions = sub {
        my $optionName = shift;
        my $configFileName = ${$optionsHashRef}{$optionName};
        if (defined $configFileName && $configFileName ne '') {
            #read configuration file if its defined
            return $self->_readConfigFile($configFileName);
        }
        #else use default configuration
        return \%tHash;
    };

    #initialize UMLS::Interface
    $umls_interface = UMLS::Interface->new(
        $getComponentOptions->('interfaceConfig'))
        or die "Error: Unable to create UMLS::Interface object.\n";

    #initialize UMLS::Association
    $umls_association = UMLS::Association->new(
        $getComponentOptions->('assocConfig'))
        or die "Error: Unable to create UMLS::Association object.\n";

    #initialize LBD parameters
    %lbdOptions = %{$self->_readConfigFile($lbdConfigFile)};

    return;
}

# Reads the config file in as an options hash
# input: the name of a configuration file that has key fields in '<>'s, 
#        The '>' is followed directly by the value for that key, no space.
#        Each line of the file contains a new key-value pair (e.g. <key>value)
#        If no value is provided, a default value of 1 is set
# output: a hash ref to a hash containing each key value pair
sub _readConfigFile {
    my $self = shift;
    my $configFileName = shift;
    
    #read in all options from the config file
    open IN, $configFileName or die("Error: Cannot open config file: $configFileName\n");
    my %optionsHash = ();
    my $firstChar;
    while (my $line = <IN>) {
	#check if its a comment or blank line
	$firstChar = substr $line, 0, 1;
	
	if ($firstChar ne '#' && $line =~ /[^\s]+/) {
	    #line contains data, grab the key and value
	    $line =~ /<([^>]+)>([^\n]*)/;	  

	    #make sure the data was read in correctly
	    if (!$1) {
		print STDERR 
		    "Warning: Invalid line in $configFileName: $line\n";
	    }



( run in 1.333 second using v1.01-cache-2.11-cpan-99c4e6809bf )