ALBD

 view release on metacpan or  search on metacpan

lib/LiteratureBasedDiscovery/Discovery.pm  view on Meta::CPAN

# ALBD::Disovery
#
# Perl module for performing Literature Based Discovery based using
# CUI associations as the primary linking and ranking method.
#
# Copyright (c) 2017
#
# Sam Henry
# henryst at vcu.edu
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to
#
# The Free Software Foundation, Inc.,
# 59 Temple Place - Suite 330,
# Boston, MA  02111-1307, USA.

package Discovery;
use strict;
use warnings;
use DBI;

######################################################################
#                        MySQL Notes
######################################################################
#TODO I think some of these notes should be elsewhere
# A Note about the database structure expected
#   Each LBD database is expected to have:
#   PreCutoff_N11
#   PostCutoff_N11
#   PreCutoff_Implicit
#
# Both PreCutoff_N11 and PostCutoff_N11 should
# be generated manually using CUI_Collector
# PreCutoff_Implicit is generated using the tableToSparseMatrix
# function here, which exports a sparse matrix. That matrix 
# can then be imported into matlab, squared, and reloaded into
# a mysql database. With these 3 tables LBD can be performed


######################################################################
#                          Description
######################################################################
# Discovery.pm - provides matrix operations from  n11 counts from 
# UMLS::Association
#
#TODO I think some of these notes should be elsewhere
# 'B' term filtering may be applied by removing elements from the 
# explicit knowledge matrix before squaring. It is important to 
# replicate the original matrix before filtering so that explicit 
# knowledge can be removed from the implicit matrix.
# 'C' term filtering may be applied directly to the implicit
# knowledge matrix.
#
# A Typical workflow may look like:
# 1) load explicit knowledge from UMLS::Association
# 2) clone explicit knowledge (for removal from implicit)
# 3) apply filtering to explicit knowledge
# 4) square explicit knowledge to generate implicit knowledge
# 5) remove explicit knowledge from implicit knowledge
# 6) filter impicit knowledge
# 
# which has code as:
# TODO insert sample code

#NOTE: CUI merging/term expansion can also be easily done by adding
#   two or more explicit vectors, then generating explicit knowledge from
#   them.  BUT also interesting is that term expansion, etc... is 
#   unnecassary if we just rank against every term. We may however need 
#   to modify the ranking metrics to account for synonyms, etc.. (max value
#   of a set of synonyms or something)


######################################################################
#           Functions to perform Literature Based Discovery
######################################################################


# gets the rows of the cuis from the matrix
# input:  $cuisRef <- an array reference to a list of CUIs
#         $matrixRef <- a reference to a co-occurrence matrix
# output: a hash ref to a sparse matrix containing just the rows retrieved
sub getRows {
    my $cuisRef = shift;
    my $matrixRef = shift;

    my %rows = ();
    my $rowRef;
    #add each cui row to the starting matrix
    foreach my $cui(@{$cuisRef}) {
	#if there is a row for this cui
	if (exists ${$matrixRef}{$cui}) {
	    $rowRef = ${$matrixRef}{$cui};

	    #add each row value to the starting matrix
	    foreach my $key(keys %{$rowRef}) {
		${$rows{$cui}}{$key} = ${$rowRef}{$key};
	    }

lib/LiteratureBasedDiscovery/Discovery.pm  view on Meta::CPAN

# output: ref to a sparse matrix of the product of B*A
sub findImplicit {
    my $matrixARef = shift; 
    my $matrixBRef = shift;

    my %product = ();
    #loop over the rows of the B matrix
    foreach my $key0 (keys %{$matrixBRef}) {  

	#loop over row
	foreach my $key1 (keys %{$matrixARef}) {	

	    #loop over column
	    foreach my $key2 (keys %{${$matrixARef}{$key1}}) {
		#update values
		if (exists ${${$matrixBRef}{$key0}}{$key1}) {

		    #update
		    if (!exists ${$product{$key0}}{$key2}) {
			${$product{$key0}}{$key2} = 0;			
		    }
		    ${$product{$key0}}{$key2} += 
			${${$matrixBRef}{$key0}}{$key1} * 
			${${$matrixARef}{$key1}}{$key2};
		    
		}
	    }
	}
    }
    return \%product;
}


# removes explicit connections from the matrix of implicit connections by 
# removing keys (O(k), where k is the number of keys in the explicit matrix,
# we expect the explicit k to be smaller than the implicit k)
# input: $explicitMatrixRef <- reference to the explicit knowledge matrix
#        $implicitMatrixRef <- reference to the implicit knowledge matrix
# output: ref to the implicit matrix with explicit knowledge removed
sub removeExplicit {
    my $explicitMatrixRef = shift;
    my $implicitMatrixRef = shift;

    #Check each key of the explicit matrix to see if it exists
    # in the implicit matrix
    foreach my $key1(keys %{$explicitMatrixRef}) {
	if (exists ${$implicitMatrixRef}{$key1}) {
	    foreach my $key2(keys %{${$explicitMatrixRef}{$key1}}) {
		if (exists ${${$implicitMatrixRef}{$key1}}{$key2}) {
		    delete ${${$implicitMatrixRef}{$key1}}{$key2};
		}
	    }
	}
    }
    return $implicitMatrixRef;
}


# loads a tab seperated file as a sparse matrix (a hash of hashes)
#    each line of the file contains CUI1 <TAB> CUI2 <TAB> Count
# input:  the filename containing the data
# output: a hash ref to the sparse matrix (${$hash{$index1}}{$index2} = value)
sub fileToSparseMatrix {
    my $fileName = shift;

    open IN, $fileName or die ("unable to open file: $fileName\n");
    my %matrix = ();
    my ($cui1,$cui2,$val);
    while (my $line = <IN>) {
	chomp $line;
	($cui1,$cui2,$val) = split(/\t/,$line);
	
	if (!exists $matrix{$cui1}) {
	    my %hash = ();
	    $matrix{$cui1} = \%hash;
	}
	$matrix{$cui1}{$cui2} = $val;
    }
    close IN;
    return \%matrix;
}

# outputs the matrix to the output file in sparse matrix format, which
# is a file containing rowKey\tcolKey\tvalue
# input:  $outFile - a string specifying the output file
#         $matrixRef - a ref to the sparse matrix containing the data
# output: nothing, but the matrix is output to file
sub outputMatrixToFile {
    my $outFile = shift;
    my $matrixRef = shift;
    
    #open the output file and output fhe matrx
    open OUT, ">$outFile" or die ("Error opening matrix output file: $outFile\n");
    my $rowRef;
    foreach my $rowKey (keys %{$matrixRef}) {
	$rowRef = ${$matrixRef}{$rowKey};
	foreach my $colKey (keys %{$rowRef}) {
	    print OUT "$rowKey\t$colKey\t${$rowRef}{$colKey}\n";
	}
    }
}


#Note: Table to sparse is no longer used, but could be useful in the future
=comment
#  retreive a table from mysql and convert it to a sparse matrix (a hash of 
#     hashes)
#  input : $tableName <- the name of the table to output
#          #cuiFinder <- an instance of UMLS::Interface::CuiFinder
#  output: a hash ref to the sparse matrix (${$hash{$index1}}{$index2} = value)
sub tableToSparseMatrix {
    my $tableName = shift;
    my $cuiFinder = shift;

    # check tableName
    #TODO check that the table exists in the database
    # or die "Error: table does not exist: $tableName\n";

    #  set up database
    my $db = $cuiFinder->_getDB(); 
    
    # retreive the table as a nested hash where keys are CUI1, 
    # then CUI2, value is N11
     my @keyFields = ('cui_1', 'cui_2');
     my $matrixRef = $db->selectall_hashref(
	"select * from $tableName", \@keyFields);

    # set values of the loaded table to n_11
    # ...default is hash of hash of hash
    foreach my $key1(keys %{$matrixRef}) {
	foreach my $key2(keys %{${$matrixRef}{$key1}}) {
	    ${${$matrixRef}{$key1}}{$key2} = ${${${$matrixRef}{$key1}}{$key2}}{'n_11'};
	}
    }
    return $matrixRef;
}
=cut

1;



( run in 0.493 second using v1.01-cache-2.11-cpan-39bf76dae61 )