ALBD

 view release on metacpan or  search on metacpan

lib/ALBD.pm  view on Meta::CPAN

# ALBD::ALBD
#
# Primary module
# This module contains only the top level functions. So a step by step methods
# Filtering, everything else is in different modules. 
#
# Copyright (c) 2017
#
# Sam Henry
# henryst at vcu.edu
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to
#
# The Free Software Foundation, Inc.,
# 59 Temple Place - Suite 330,
# Boston, MA  02111-1307, USA.


=head1 NAME

ALBD - a perl implementation of Literature Based Discovery

=head1 SYNOPSIS
    
    use ALBD;
    %options = ();
    $options{'lbdConfig'} = 'configFile'
    my $lbd = LiteratureBasedDiscovery->new(\%options);
    $lbd->performLBD();

=head1 ABSTRACT

      This package consists of Perl modules along with supporting Perl
      programs that perform Literature Based Discovery (LBD). The core 
      data from which LBD is performed are co-occurrences matrices 
      generated from UMLS::Association. ALBD is based on the ABC
      co-occurrence model. Many options can be specified, and many
      ranking methods are available. The novel ranking methods that use
      association measure are available as well as frequency based
      ranking methods. See samples/lbd for more info. Can perform open and
      closed LBD as well as time slicing evaluation.

=head1 INSTALL

To install the module, run the following magic commands:

  perl Makefile.PL
  make
  make test
  make install

This will install the module in the standard location. You will, most
probably, require root privileges to install in standard system
directories. To install in a non-standard directory, specify a prefix
during the 'perl Makefile.PL' stage as:

  perl Makefile.PL PREFIX=/home/sid

It is possible to modify other parameters during installation. The
details of these can be found in the ExtUtils::MakeMaker
documentation. However, it is highly recommended not messing around
with other parameters, unless you know what you're doing.

=head1 CONFIGURATION FILE

There are many parameters that can be specified, both for open and
close discovery as well as time slicing evaluation. Please see the 
samples folder for info and sample configuration files.

=cut


######################################################################
#                          Description
######################################################################
#
# This is a description heared more towards understanding or modifying
# the code, rather than using the program.
#
# LiteratureBasedDiscovery.pm - provides functionality to perform LBD
#
# Matrix Representation:
# LBD is performed using Matrix and Vector operations. The major components 
# are an explicit knowledge matrix, which is squared to find the implicit 
# knowledge matrix.
#
# The explicit knowledge is read from UMLS::Association N11 matrix. This 
# matrix contains the co-occurrence counts for all CUI pairs. The 
# UMLS::Association database is completely independent from 
# implementation, so any dataset, window size, or anything else may be used. 
# Data is read in as a sparse matrix using the Discovery::tableToSparseMatrix 
# function. This returns the primary data structures and variables used 
# throughtout LBD.
#
# Matrix representation: 
# This module uses a matrix representation for LBD. All operations are 
# performed either as matrix or vector operations. The core data structure
# are the co-occurrence matrices explicitMatrix and implicitMatrix. These
# matrices have dimensions vocabulary size by vocabulary size. Each row 
# corresponds to the all co-occurrences for a single CUI. Each column of that 
# row corresponding to a co-occurrence with a single CUI. Since the matrices 
# tend to be sparse, they are stored as hashes of hashes, where the the first 
# key is for a row, and the second key is for a column. The keys of each hash 
# are the indeces within the matrix. The hash values are the number of 
# co-ocurrences for that CUI pair (e.g. ${${$explicit{C0000000}}{C1111111} = 10 

lib/ALBD.pm  view on Meta::CPAN

	#Score Implicit Connections	
	my $scoresRef;
	if ($lbdOptions{'rankingProcedure'} eq 'allPairs') {
	    #get stats just a single time
	    if (!defined $n1pRef || !defined $np1Ref || !defined $npp) {
		($n1pRef, $np1Ref, $npp) = Rank::getAllStats($explicitMatrixRef);
	    }
	    $scoresRef = Rank::scoreImplicit_fromAllPairs(\%startingRow, $explicitMatrixRef, \%implicitRow, $lbdOptions{'rankingMeasure'}, $umls_association, $n1pRef, $np1Ref, $npp);
	} elsif ($lbdOptions{'rankingProcedure'} eq 'averageMinimumWeight') {
	    #get stats just a single time
	    if (!defined $n1pRef || !defined $np1Ref || !defined $npp) {
		($n1pRef, $np1Ref, $npp) = Rank::getAllStats($explicitMatrixRef);
	    }
	    $scoresRef = Rank::scoreImplicit_averageMinimumWeight(\%startingRow, $explicitMatrixRef, \%implicitRow, $lbdOptions{'rankingMeasure'}, $umls_association, \%abPairsWithScores, $n1pRef, $np1Ref, $npp);
	} elsif ($lbdOptions{'rankingProcedure'} eq 'linkingTermCount') {
	    $scoresRef = Rank::scoreImplicit_linkingTermCount(\%startingRow, $explicitMatrixRef, \%implicitRow);
	} elsif ($lbdOptions{'rankingProcedure'} eq 'frequency') {
	    $scoresRef = Rank::scoreImplicit_frequency(\%startingRow, $explicitMatrixRef, \%implicitRow);
	} elsif ($lbdOptions{'rankingProcedure'} eq 'ltcAssociation') {
	    $scoresRef = Rank::scoreImplicit_ltcAssociation(\%startingRow, $explicitMatrixRef, \%implicitRow, $lbdOptions{'rankingMeasure'}, $umls_association);
	} elsif ($lbdOptions{'rankingProcedure'} eq 'ltc_amw') {
	    #get stats just a single time
	    if (!defined $n1pRef || !defined $np1Ref || !defined $npp) {
		($n1pRef, $np1Ref, $npp) = Rank::getAllStats($explicitMatrixRef);
	    }
	    $scoresRef = Rank::scoreImplicit_LTC_AMW(\%startingRow, $explicitMatrixRef, \%implicitRow, $lbdOptions{'rankingMeasure'}, $umls_association, \%abPairsWithScores, $n1pRef, $np1Ref, $npp);
	}  else {
	    die ("Error: Invalid Ranking Procedure\n");
	}    
	
	#Rank Implicit Connections
	my $ranksRef = Rank::rankDescending($scoresRef);

	#save the row ranks
	$rowRanks{$rowKey} = $ranksRef;
    }

    #output the results at 10 intervals
    TimeSlicing::outputTimeSlicingResults($goldMatrixRef, \%rowRanks, 10);
}



##############################################################################
#        functions to grab parameters and inialize all input
##############################################################################
# method to create a new LiteratureBasedDiscovery object
# input: $optionsHashRef <- a reference to an LBD options hash
# output: a new LBD object
sub new {
    my $self = {};
    my $className = shift;
    my $optionsHashRef = shift;
    bless($self, $className);

    $self->_initialize($optionsHashRef);
    return $self;
}

# Initializes everything needed for Literature Based Discovery
# input: $optionsHashRef <- reference to LBD options hash (command line input)
# output: none, but global parameters are set
sub _initialize {
    my $self = shift;
    my $optionsHashRef = shift; 

    #initialize UMLS::Interface
    my %tHash = ();
    $tHash{'t'} = 1; #default hash values are with t=1 (silence module output)
    my $componentOptions = \%tHash;
    if (${$optionsHashRef}{'interfaceConfig'} ne '') {
	#read configuration file if its defined
	$componentOptions = 
	    $self->_readConfigFile(${$optionsHashRef}{'interfaceConfig'});
    }
    #else use default configuration
    $umls_interface = UMLS::Interface->new($componentOptions) 
	or die "Error: Unable to create UMLS::Interface object.\n";

    #initialize UMLS::Association
    $componentOptions = \%tHash;
    if (${$optionsHashRef}{'assocConfig'} ne '') {
	#read configuration file if its defined
	$componentOptions = 
	    $self->_readConfigFile(${$optionsHashRef}{'assocConfig'});
    }
    #else use default configuation
    $umls_association = UMLS::Association->new($componentOptions) or 
	die "Error: Unable to create UMLS::Association object.\n";

    #initialize LBD parameters
    %lbdOptions = %{$self->_readConfigFile(${$optionsHashRef}{'lbdConfig'})};
    
}    

# Reads the config file in as an options hash
# input: the name of a configuration file that has key fields in '<>'s, 
#        The '>' is followed directly by the value for that key, no space.
#        Each line of the file contains a new key-value pair (e.g. <key>value)
#        If no value is provided, a default value of 1 is set
# output: a hash ref to a hash containing each key value pair
sub _readConfigFile {
    my $self = shift;
    my $configFileName = shift;
    
    #read in all options from the config file
    open IN, $configFileName or die("Error: Cannot open config file: $configFileName\n");
    my %optionsHash = ();
    my $firstChar;
    while (my $line = <IN>) {
	#check if its a comment or blank line
	$firstChar = substr $line, 0, 1;
	
	if ($firstChar ne '#' && $line =~ /[^\s]+/) {
	    #line contains data, grab the key and value
	    $line =~ /<([^>]+)>([^\n]*)/;	  

	    #make sure the data was read in correctly
	    if (!$1) {
		print STDERR 
		    "Warning: Invalid line in $configFileName: $line\n";



( run in 2.801 seconds using v1.01-cache-2.11-cpan-d8267643d1d )