ALBD

 view release on metacpan or  search on metacpan

lib/ALBD.pm  view on Meta::CPAN


=head1 ABSTRACT

      This package consists of Perl modules along with supporting Perl
      programs that perform Literature Based Discovery (LBD). The core 
      data from which LBD is performed are co-occurrences matrices 
      generated from UMLS::Association. ALBD is based on the ABC
      co-occurrence model. Many options can be specified, and many
      ranking methods are available. The novel ranking methods that use
      association measure are available as well as frequency based
      ranking methods. See samples/lbd for more info. Can perform open and
      closed LBD as well as time slicing evaluation.

=head1 INSTALL

To install the module, run the following magic commands:

  perl Makefile.PL
  make
  make test
  make install

This will install the module in the standard location. You will, most
probably, require root privileges to install in standard system
directories. To install in a non-standard directory, specify a prefix
during the 'perl Makefile.PL' stage as:

  perl Makefile.PL PREFIX=/home/sid

It is possible to modify other parameters during installation. The
details of these can be found in the ExtUtils::MakeMaker
documentation. However, it is highly recommended not messing around
with other parameters, unless you know what you're doing.

=head1 CONFIGURATION FILE

There are many parameters that can be specified, both for open and
close discovery as well as time slicing evaluation. Please see the 
samples folder for info and sample configuration files.

=cut


######################################################################
#                          Description
######################################################################
#
# This is a description heared more towards understanding or modifying
# the code, rather than using the program.
#
# LiteratureBasedDiscovery.pm - provides functionality to perform LBD
#
# Matrix Representation:
# LBD is performed using Matrix and Vector operations. The major components 
# are an explicit knowledge matrix, which is squared to find the implicit 
# knowledge matrix.
#
# The explicit knowledge is read from UMLS::Association N11 matrix. This 
# matrix contains the co-occurrence counts for all CUI pairs. The 
# UMLS::Association database is completely independent from 
# implementation, so any dataset, window size, or anything else may be used. 
# Data is read in as a sparse matrix using the Discovery::tableToSparseMatrix 
# function. This returns the primary data structures and variables used 
# throughtout LBD.
#
# Matrix representation: 
# This module uses a matrix representation for LBD. All operations are 
# performed either as matrix or vector operations. The core data structure
# are the co-occurrence matrices explicitMatrix and implicitMatrix. These
# matrices have dimensions vocabulary size by vocabulary size. Each row 
# corresponds to the all co-occurrences for a single CUI. Each column of that 
# row corresponding to a co-occurrence with a single CUI. Since the matrices 
# tend to be sparse, they are stored as hashes of hashes, where the the first 
# key is for a row, and the second key is for a column. The keys of each hash 
# are the indeces within the matrix. The hash values are the number of 
# co-ocurrences for that CUI pair (e.g. ${${$explicit{C0000000}}{C1111111} = 10 
# means that CUI C0000000 and C1111111 co-occurred 10 times).
#
# Now with an understanding of the data strucutres, below is a breif 
# description of each: 
#
# startingMatrix <- A matrix containing the explicit matrix rows for all of the
#                   start terms. This makes it easy to have multiple start terms
#                   and using this matrix as opposed to the entire explicit 
#                   matrix drastically improves performance.
# explicitMatrix <- A matrix containing explicit connections (known connections)
#                   for every CUI in the dataset.            
# implicitMatrix <- A matrix containing implicit connections (discovered 
#                   connections) for every CUI in the datast


package ALBD;

use strict;
use warnings;

use LiteratureBasedDiscovery::Discovery;
use LiteratureBasedDiscovery::Evaluation;
use LiteratureBasedDiscovery::Rank;
use LiteratureBasedDiscovery::Filters;
use LiteratureBasedDiscovery::TimeSlicing;

use UMLS::Association;
use UMLS::Interface;

#### UPDATE VERSION HERE #######
use vars qw($VERSION);
$VERSION = 0.05;

#global variables
my $DEBUG = 0;
my $N11_TABLE = 'N_11';
my %lbdOptions = ();
   #rankingProcedure <-- the procedure to use for ranking
   #rankingMeasure <-- the association measure to use for ranking 
   #implicitOutputFile  <--- the output file of results
   #explicitInputFile <-- file to load explicit matrix from
   #implicitInputFile <-- load implicit from file rather than calculating

#references to other packages
my $umls_interface;
my $umls_association;

#####################################################
####################################################

# performs LBD
# input:  none
# ouptut: none, but a results file is written to disk
sub performLBD {
    my $self = shift;
    my $start; #used to record run times

    #implicit matrix ranking requires a different set of procedures
    if ($lbdOptions{'rankingProcedure'} eq 'implicitMatrix') { 
	$self->performLBD_implicitMatrixRanking();
	return;
    }
    if (exists $lbdOptions{'targetCuis'}) {
	$self->performLBD_closedDiscovery();
	return;
    }
    if (exists $lbdOptions{'precisionAndRecall_explicit'}) {
	$self->timeSlicing_generatePrecisionAndRecall_explicit();
	return;
    }
    if (exists $lbdOptions{'precisionAndRecall_implicit'}) {
	$self->timeSlicing_generatePrecisionAndRecall_implicit();
	return;
    }
    print "Open Discovery\n";
    print $self->_parametersToString();

#Get inputs
    my $startCuisRef = $self->_getStartCuis();
    my $linkingAcceptTypesRef = $self->_getAcceptTypes('linking');
    my $targetAcceptTypesRef = $self->_getAcceptTypes('target');
    print "startCuis = ".(join(',', @{$startCuisRef}))."\n";
    print "linkingAcceptTypes = ".(join(',', keys %{$linkingAcceptTypesRef}))."\n";
    print "targetAcceptTypes = ".(join(',', keys %{$targetAcceptTypesRef}))."\n";

#Get the Explicit Matrix
    $start = time;
    my $explicitMatrixRef;
    if(!defined $lbdOptions{'explicitInputFile'}) {
	die ("ERROR: explicitInputFile must be defined in LBD config file\n");
    }
    $explicitMatrixRef = Discovery::fileToSparseMatrix($lbdOptions{'explicitInputFile'});
    print "Got Explicit Matrix in ".(time() - $start)."\n";
    
#Get the Starting Matrix
    $start = time();
    my $startingMatrixRef = 
	Discovery::getRows($startCuisRef, $explicitMatrixRef);
    print "Got Starting Matrix in ".(time() - $start)."\n";

    #if using average minimum weight, grab the a->b scores
    my %abPairsWithScores = ();
    if ($lbdOptions{'rankingProcedure'} eq 'averageMinimumWeight' 
	|| $lbdOptions{'rankingProcedure'} eq 'ltc_amw') {

	#apply semantic type filter to columns only
	if ((scalar keys %{$linkingAcceptTypesRef}) > 0) {
	    Filters::semanticTypeFilter_columns(
		$explicitMatrixRef, $linkingAcceptTypesRef, $umls_interface);
	}
	#initialize the abPairs to frequency of co-occurrence
	foreach my $row (keys %{$startingMatrixRef}) {
	    foreach my $col (keys %{${$startingMatrixRef}{$row}}) {
		$abPairsWithScores{"$row,$col"} = ${${$startingMatrixRef}{$row}}{$col};
	    }
	}
        Rank::getBatchAssociationScores(\%abPairsWithScores, $explicitMatrixRef, $lbdOptions{'rankingMeasure'}, $umls_association);
    }

lib/ALBD.pm  view on Meta::CPAN

	my $scoresRef;
	if ($lbdOptions{'rankingProcedure'} eq 'allPairs') {
	    #get stats just a single time
	    if (!defined $n1pRef || !defined $np1Ref || !defined $npp) {
		($n1pRef, $np1Ref, $npp) = Rank::getAllStats($explicitMatrixRef);
	    }
	    $scoresRef = Rank::scoreImplicit_fromAllPairs(\%startingRow, $explicitMatrixRef, \%implicitRow, $lbdOptions{'rankingMeasure'}, $umls_association, $n1pRef, $np1Ref, $npp);
	} elsif ($lbdOptions{'rankingProcedure'} eq 'averageMinimumWeight') {
	    #get stats just a single time
	    if (!defined $n1pRef || !defined $np1Ref || !defined $npp) {
		($n1pRef, $np1Ref, $npp) = Rank::getAllStats($explicitMatrixRef);
	    }
	    $scoresRef = Rank::scoreImplicit_averageMinimumWeight(\%startingRow, $explicitMatrixRef, \%implicitRow, $lbdOptions{'rankingMeasure'}, $umls_association, \%abPairsWithScores, $n1pRef, $np1Ref, $npp);
	} elsif ($lbdOptions{'rankingProcedure'} eq 'linkingTermCount') {
	    $scoresRef = Rank::scoreImplicit_linkingTermCount(\%startingRow, $explicitMatrixRef, \%implicitRow);
	} elsif ($lbdOptions{'rankingProcedure'} eq 'frequency') {
	    $scoresRef = Rank::scoreImplicit_frequency(\%startingRow, $explicitMatrixRef, \%implicitRow);
	} elsif ($lbdOptions{'rankingProcedure'} eq 'ltcAssociation') {
	    $scoresRef = Rank::scoreImplicit_ltcAssociation(\%startingRow, $explicitMatrixRef, \%implicitRow, $lbdOptions{'rankingMeasure'}, $umls_association);
	} elsif ($lbdOptions{'rankingProcedure'} eq 'ltc_amw') {
	    #get stats just a single time
	    if (!defined $n1pRef || !defined $np1Ref || !defined $npp) {
		($n1pRef, $np1Ref, $npp) = Rank::getAllStats($explicitMatrixRef);
	    }
	    $scoresRef = Rank::scoreImplicit_LTC_AMW(\%startingRow, $explicitMatrixRef, \%implicitRow, $lbdOptions{'rankingMeasure'}, $umls_association, \%abPairsWithScores, $n1pRef, $np1Ref, $npp);
	}  else {
	    die ("Error: Invalid Ranking Procedure\n");
	}    
	
	#Rank Implicit Connections
	my $ranksRef = Rank::rankDescending($scoresRef);

	#save the row ranks
	$rowRanks{$rowKey} = $ranksRef;
    }

    #output the results at 10 intervals
    TimeSlicing::outputTimeSlicingResults($goldMatrixRef, \%rowRanks, 10);
}



##############################################################################
#        functions to grab parameters and inialize all input
##############################################################################
# method to create a new LiteratureBasedDiscovery object
# input: $optionsHashRef <- a reference to an LBD options hash
# output: a new LBD object
sub new {
    my $self = {};
    my $className = shift;
    my $optionsHashRef = shift;
    bless($self, $className);

    $self->_initialize($optionsHashRef);
    return $self;
}

# Initializes everything needed for Literature Based Discovery
# input: $optionsHashRef <- reference to LBD options hash (command line input)
# output: none, but global parameters are set
sub _initialize {
    my $self = shift;
    my $optionsHashRef = shift; 

    #initialize UMLS::Interface
    my %tHash = ();
    $tHash{'t'} = 1; #default hash values are with t=1 (silence module output)
    my $componentOptions = \%tHash;
    if (${$optionsHashRef}{'interfaceConfig'} ne '') {
	#read configuration file if its defined
	$componentOptions = 
	    $self->_readConfigFile(${$optionsHashRef}{'interfaceConfig'});
    }
    #else use default configuration
    $umls_interface = UMLS::Interface->new($componentOptions) 
	or die "Error: Unable to create UMLS::Interface object.\n";

    #initialize UMLS::Association
    $componentOptions = \%tHash;
    if (${$optionsHashRef}{'assocConfig'} ne '') {
	#read configuration file if its defined
	$componentOptions = 
	    $self->_readConfigFile(${$optionsHashRef}{'assocConfig'});
    }
    #else use default configuation
    $umls_association = UMLS::Association->new($componentOptions) or 
	die "Error: Unable to create UMLS::Association object.\n";

    #initialize LBD parameters
    %lbdOptions = %{$self->_readConfigFile(${$optionsHashRef}{'lbdConfig'})};
    
}    

# Reads the config file in as an options hash
# input: the name of a configuration file that has key fields in '<>'s, 
#        The '>' is followed directly by the value for that key, no space.
#        Each line of the file contains a new key-value pair (e.g. <key>value)
#        If no value is provided, a default value of 1 is set
# output: a hash ref to a hash containing each key value pair
sub _readConfigFile {
    my $self = shift;
    my $configFileName = shift;
    
    #read in all options from the config file
    open IN, $configFileName or die("Error: Cannot open config file: $configFileName\n");
    my %optionsHash = ();
    my $firstChar;
    while (my $line = <IN>) {
	#check if its a comment or blank line
	$firstChar = substr $line, 0, 1;
	
	if ($firstChar ne '#' && $line =~ /[^\s]+/) {
	    #line contains data, grab the key and value
	    $line =~ /<([^>]+)>([^\n]*)/;	  

	    #make sure the data was read in correctly
	    if (!$1) {
		print STDERR 
		    "Warning: Invalid line in $configFileName: $line\n";
	    }
	    else {
		#data was grabbed from the line, add to hash
		if ($2) {
		    #add key and value to the optionsHash
		    $optionsHash{$1} = $2;
		}
		else {
		    #add key and set default value to the optionsHash
		    $optionsHash{$1} = 1;
		}
	    }
	}
    }
    close IN;

    return \%optionsHash;
}

# transforms the string of start cuis to an array
# input:  none
# output: an array ref of CUIs
sub _getStartCuis {
    my $self = shift;
    my @startCuis = split(',',$lbdOptions{'startCuis'});
    return \@startCuis;
}

# transforms the string of target cuis to an array
# input:  none
# output: an array ref of CUIs
sub _getTargetCuis {
    my $self = shift;
    my @targetCuis = split(',',$lbdOptions{'targetCuis'});
    return \@targetCuis;
}

# transforms the string of accept types or groups into a hash of accept TUIs
# input:  a string specifying whether linking or target types are being defined
# output: a hash of acceptable TUIs
sub _getAcceptTypes {
    my $self = shift;
    my $stepString = shift; #either 'linking' or 'target'

    #get the accept types 
    my %acceptTypes = ();

    #add all types for groups specified
    my $string = $stepString.'AcceptGroups';
    if (defined $lbdOptions{$string}) {
	#accept groups were specified
	my @acceptGroups = split(',',$lbdOptions{$string});

	#add all the types of each group
	foreach my $group(@acceptGroups) {
	    my $typesRef = Filters::getTypesOfGroup($group, $umls_interface);
	    foreach my $key(keys %{$typesRef}) {
		$acceptTypes{$key} = 1;
	    }
	}
    }

    #add all types specified
    $string = $stepString.'AcceptTypes';
    if (defined $lbdOptions{$string}) {
	#convert each type to a tui and add
	my $tui;
	my @acceptTypes = split(',',$lbdOptions{$string});
	foreach my $abr(@acceptTypes) {
	    $tui = uc $umls_interface->getStTui($abr);
	    $acceptTypes{$tui} = 1;
	}
    }
    
    return \%acceptTypes;
}



##############################################################################
#        function to produce output
##############################################################################
# outputs the implicit terms to string
# input:  $scoresRef <- a reference to a hash of scores (hash{CUI}=score)
#         $ranksRef <- a reference to an array of CUIs ranked by their score
#         $printTo <- optional, outputs the $printTo top ranked terms. If not
#                     specified, all terms are output
# output: a line seperated string containing ranked terms, scores, and thier
#         preferred terms
sub _rankedTermsToString {
    my $self = shift;
    my $scoresRef = shift;
    my $ranksRef = shift;
    my $printTo = shift;

    #set printTo
    if (!$printTo) {
	$printTo = scalar @{$ranksRef};
    }
    
    #construct the output string
    my $string = '';
    my $index;
    for (my $i = 0; $i < $printTo; $i++) {
	#add the rank
	$index = $i+1;
	$string .= "$index\t";
	#add the score
	$string .= sprintf "%.5f\t", "${$scoresRef}{${$ranksRef}[$i]}\t";
	#add the CUI
	$string .= "${$ranksRef}[$i]\t";
	#add the name
	my $name = $umls_interface->getPreferredTerm(${$ranksRef}[$i]);
	#if no preferred name, get anything
	if (!defined $name || $name eq '') {
	    my $termListRef = $umls_interface->getTermList('C0440102');
	    if (scalar @{$termListRef} > 0) {
		$name = '.**'.${$termListRef}[0];
	    }
	}

	$string .= "$name\n";
    }

    #return the string of ranked terms
    return $string;
}

# converts the current objects parameters to a string
# input : none
# output: a string of parameters that were used for LBD
sub _parametersToString {
    my $self = shift;
        
    #LBD options
    my $paramsString = "Parameters:\n";
    foreach my $key (sort keys %lbdOptions) {
	$paramsString .= "$key -> $lbdOptions{$key}\n";
    }
    $paramsString .= "\n";
    return $paramsString;
    #association options? TODO
    #interface options? TODO
}


# returns the version currently being used
# input : none
# output: the version number being used
sub version {
    my $self = shift;
    return $VERSION;
}

##############################################################################
#        functions for debugging



( run in 1.716 second using v1.01-cache-2.11-cpan-cdf2f3d4e48 )