ALBD
view release on metacpan or search on metacpan
lib/ALBD.pm view on Meta::CPAN
#
# LiteratureBasedDiscovery.pm - provides functionality to perform LBD
#
# Matrix Representation:
# LBD is performed using Matrix and Vector operations. The major components
# are an explicit knowledge matrix, which is squared to find the implicit
# knowledge matrix.
#
# The explicit knowledge is read from UMLS::Association N11 matrix. This
# matrix contains the co-occurrence counts for all CUI pairs. The
# UMLS::Association database is completely independent from
# implementation, so any dataset, window size, or anything else may be used.
# Data is read in as a sparse matrix using the Discovery::tableToSparseMatrix
# function. This returns the primary data structures and variables used
# throughout LBD.
#
# Matrix representation:
# This module uses a matrix representation for LBD. All operations are
# performed either as matrix or vector operations. The core data structures
# are the co-occurrence matrices explicitMatrix and implicitMatrix. These
# matrices have dimensions vocabulary size by vocabulary size. Each row
# corresponds to all co-occurrences for a single CUI. Each column of that
# row corresponds to a co-occurrence with a single CUI. Since the matrices
# tend to be sparse, they are stored as hashes of hashes, where the first
# key is for a row, and the second key is for a column. The keys of each hash
# are the indices within the matrix. The hash values are the number of
# co-occurrences for that CUI pair (e.g. ${$explicit{C0000000}}{C1111111} = 10
#
# Now with an understanding of the data structures, below is a brief
# description of each:
#
# startingMatrix <- A matrix containing the explicit matrix rows for all of the
# start terms. This makes it easy to have multiple start terms
# and using this matrix as opposed to the entire explicit
# matrix drastically improves performance.
# explicitMatrix <- A matrix containing explicit connections (known connections)
# for every CUI in the dataset.
# implicitMatrix <- A matrix containing implicit connections (discovered
# connections) for every CUI in the dataset.
package ALBD;
use strict;
use warnings;
use LiteratureBasedDiscovery::Discovery;
use LiteratureBasedDiscovery::Evaluation;
use LiteratureBasedDiscovery::Rank;
use LiteratureBasedDiscovery::Filters;
use LiteratureBasedDiscovery::TimeSlicing;
use UMLS::Association;
use UMLS::Interface;
#### UPDATE VERSION HERE #######
use vars qw($VERSION);
$VERSION = 0.05;
#global variables
# NOTE(review): $DEBUG and $N11_TABLE are not referenced in the portion of the
# file visible here - presumably a debug switch and the name of the
# UMLS::Association co-occurrence table; confirm against the rest of the file.
my $DEBUG = 0;
my $N11_TABLE = 'N_11';
# The LBD parameters. Starts empty and is filled by _initialize from the
# configuration file named by the 'lbdConfig' option. Known keys:
my %lbdOptions = ();
#rankingProcedure <-- the procedure to use for ranking
#rankingMeasure <-- the association measure to use for ranking
#implicitOutputFile <--- the output file of results
#explicitInputFile <-- file to load explicit matrix from
#implicitInputFile <-- load implicit from file rather than calculating
#targetCuis <-- if present, performLBD runs closed discovery instead
#precisionAndRecall_explicit <-- if present, performLBD runs the explicit
#                                time-slicing precision and recall routine
#precisionAndRecall_implicit <-- if present, performLBD runs the implicit
#                                time-slicing precision and recall routine
#references to other packages
# Both are undefined until _initialize creates them with UMLS::Interface->new
# and UMLS::Association->new respectively.
my $umls_interface;
my $umls_association;
#####################################################
####################################################
# performs LBD
# input: none
# output: none, but a results file is written to disk
sub performLBD {
my $self = shift;
my $start; #used to record run times
#implicit matrix ranking requires a different set of procedures
if ($lbdOptions{'rankingProcedure'} eq 'implicitMatrix') {
$self->performLBD_implicitMatrixRanking();
return;
}
if (exists $lbdOptions{'targetCuis'}) {
$self->performLBD_closedDiscovery();
return;
}
if (exists $lbdOptions{'precisionAndRecall_explicit'}) {
$self->timeSlicing_generatePrecisionAndRecall_explicit();
return;
}
if (exists $lbdOptions{'precisionAndRecall_implicit'}) {
$self->timeSlicing_generatePrecisionAndRecall_implicit();
return;
}
print "Open Discovery\n";
print $self->_parametersToString();
#Get inputs
my $startCuisRef = $self->_getStartCuis();
my $linkingAcceptTypesRef = $self->_getAcceptTypes('linking');
my $targetAcceptTypesRef = $self->_getAcceptTypes('target');
print "startCuis = ".(join(',', @{$startCuisRef}))."\n";
print "linkingAcceptTypes = ".(join(',', keys %{$linkingAcceptTypesRef}))."\n";
print "targetAcceptTypes = ".(join(',', keys %{$targetAcceptTypesRef}))."\n";
#Get the Explicit Matrix
$start = time;
my $explicitMatrixRef;
if(!defined $lbdOptions{'explicitInputFile'}) {
die ("ERROR: explicitInputFile must be defined in LBD config file\n");
}
$explicitMatrixRef = Discovery::fileToSparseMatrix($lbdOptions{'explicitInputFile'});
print "Got Explicit Matrix in ".(time() - $start)."\n";
lib/ALBD.pm view on Meta::CPAN
my $scoresRef;
if ($lbdOptions{'rankingProcedure'} eq 'allPairs') {
#get stats just a single time
if (!defined $n1pRef || !defined $np1Ref || !defined $npp) {
($n1pRef, $np1Ref, $npp) = Rank::getAllStats($explicitMatrixRef);
}
$scoresRef = Rank::scoreImplicit_fromAllPairs(\%startingRow, $explicitMatrixRef, \%implicitRow, $lbdOptions{'rankingMeasure'}, $umls_association, $n1pRef, $np1Ref, $npp);
} elsif ($lbdOptions{'rankingProcedure'} eq 'averageMinimumWeight') {
#get stats just a single time
if (!defined $n1pRef || !defined $np1Ref || !defined $npp) {
($n1pRef, $np1Ref, $npp) = Rank::getAllStats($explicitMatrixRef);
}
$scoresRef = Rank::scoreImplicit_averageMinimumWeight(\%startingRow, $explicitMatrixRef, \%implicitRow, $lbdOptions{'rankingMeasure'}, $umls_association, \%abPairsWithScores, $n1pRef, $np1Ref, $npp);
} elsif ($lbdOptions{'rankingProcedure'} eq 'linkingTermCount') {
$scoresRef = Rank::scoreImplicit_linkingTermCount(\%startingRow, $explicitMatrixRef, \%implicitRow);
} elsif ($lbdOptions{'rankingProcedure'} eq 'frequency') {
$scoresRef = Rank::scoreImplicit_frequency(\%startingRow, $explicitMatrixRef, \%implicitRow);
} elsif ($lbdOptions{'rankingProcedure'} eq 'ltcAssociation') {
$scoresRef = Rank::scoreImplicit_ltcAssociation(\%startingRow, $explicitMatrixRef, \%implicitRow, $lbdOptions{'rankingMeasure'}, $umls_association);
} elsif ($lbdOptions{'rankingProcedure'} eq 'ltc_amw') {
#get stats just a single time
if (!defined $n1pRef || !defined $np1Ref || !defined $npp) {
($n1pRef, $np1Ref, $npp) = Rank::getAllStats($explicitMatrixRef);
}
$scoresRef = Rank::scoreImplicit_LTC_AMW(\%startingRow, $explicitMatrixRef, \%implicitRow, $lbdOptions{'rankingMeasure'}, $umls_association, \%abPairsWithScores, $n1pRef, $np1Ref, $npp);
} else {
die ("Error: Invalid Ranking Procedure\n");
}
#Rank Implicit Connections
my $ranksRef = Rank::rankDescending($scoresRef);
#save the row ranks
$rowRanks{$rowKey} = $ranksRef;
}
#output the results at 10 intervals
TimeSlicing::outputTimeSlicingResults($goldMatrixRef, \%rowRanks, 10);
}
##############################################################################
# functions to grab parameters and initialize all input
##############################################################################
# Constructor for a LiteratureBasedDiscovery object.
# The object itself is an empty blessed hash; all configuration is handled by
# _initialize, which is called with the options hash before the object is
# returned.
# input:  $optionsHashRef <- a reference to an LBD options hash
# output: a new LBD object
sub new {
    my ($className, $optionsHashRef) = @_;

    #create the (empty) object, then let _initialize do the set-up work
    my $self = bless {}, $className;
    $self->_initialize($optionsHashRef);

    return $self;
}
# Initializes everything needed for Literature Based Discovery.
# Creates the UMLS::Interface and UMLS::Association objects and loads the LBD
# parameters, storing all three in the file-level variables $umls_interface,
# $umls_association and %lbdOptions.
# input:  $optionsHashRef <- reference to LBD options hash (command line input)
#           keys read here:
#             interfaceConfig <- optional, config file for UMLS::Interface
#             assocConfig     <- optional, config file for UMLS::Association
#             lbdConfig       <- required, config file with the LBD parameters
# output: none, but global parameters are set
# dies:   if either UMLS object cannot be created, or if lbdConfig is missing
sub _initialize {
    my $self = shift;
    my $optionsHashRef = shift;

    #default component options are with t=1 (silence module output)
    my %tHash = ();
    $tHash{'t'} = 1;

    #initialize UMLS::Interface
    my $componentOptions = \%tHash;
    my $interfaceConfig = ${$optionsHashRef}{'interfaceConfig'};
    #test definedness first so that an absent option does not trigger an
    #"uninitialized value" warning in the string comparison
    if (defined $interfaceConfig && $interfaceConfig ne '') {
        #read configuration file if its defined
        $componentOptions = $self->_readConfigFile($interfaceConfig);
    }
    #else use default configuration
    $umls_interface = UMLS::Interface->new($componentOptions)
        or die "Error: Unable to create UMLS::Interface object.\n";

    #initialize UMLS::Association
    $componentOptions = \%tHash;
    my $assocConfig = ${$optionsHashRef}{'assocConfig'};
    if (defined $assocConfig && $assocConfig ne '') {
        #read configuration file if its defined
        $componentOptions = $self->_readConfigFile($assocConfig);
    }
    #else use default configuration
    $umls_association = UMLS::Association->new($componentOptions)
        or die "Error: Unable to create UMLS::Association object.\n";

    #initialize LBD parameters
    my $lbdConfig = ${$optionsHashRef}{'lbdConfig'};
    #without this check a missing option only fails later, when the config
    #reader tries to open an empty file name
    if (!defined $lbdConfig || $lbdConfig eq '') {
        die "Error: lbdConfig must be defined in the LBD options hash\n";
    }
    %lbdOptions = %{$self->_readConfigFile($lbdConfig)};
}
# Reads the config file in as an options hash
# input: the name of a configuration file that has key fields in '<>'s,
# The '>' is followed directly by the value for that key, no space.
# Each line of the file contains a new key-value pair (e.g. <key>value)
# If no value is provided, a default value of 1 is set
# output: a hash ref to a hash containing each key value pair
sub _readConfigFile {
my $self = shift;
my $configFileName = shift;
#read in all options from the config file
open IN, $configFileName or die("Error: Cannot open config file: $configFileName\n");
my %optionsHash = ();
my $firstChar;
while (my $line = <IN>) {
#check if its a comment or blank line
$firstChar = substr $line, 0, 1;
if ($firstChar ne '#' && $line =~ /[^\s]+/) {
#line contains data, grab the key and value
$line =~ /<([^>]+)>([^\n]*)/;
#make sure the data was read in correctly
if (!$1) {
print STDERR
"Warning: Invalid line in $configFileName: $line\n";
}
( run in 1.333 second using v1.01-cache-2.11-cpan-99c4e6809bf )