ALBD

 view release on metacpan or  search on metacpan

lib/LiteratureBasedDiscovery/Discovery.pm  view on Meta::CPAN

#   PreCutoff_N11
#   PostCutoff_N11
#   PreCutoff_Implicit
#
# Both PreCutoff_N11 and PostCutoff_N11 should
# be generated manually using CUI_Collector
# PreCutoff_Implicit is generated using the tableToSparseMatrix
# function here, which exports a sparse matrix. That matrix 
# can then be imported into matlab, squared, and reloaded into
# a mysql database. With these 3 tables LBD can be performed


######################################################################
#                          Description
######################################################################
# Discovery.pm - provides matrix operations on n11 counts from
# UMLS::Association
#
#TODO I think some of these notes should be elsewhere
# 'B' term filtering may be applied by removing elements from the 
# explicit knowledge matrix before squaring. It is important to 
# replicate the original matrix before filtering so that explicit 
# knowledge can be removed from the implicit matrix.
# 'C' term filtering may be applied directly to the implicit
# knowledge matrix.
#
# A Typical workflow may look like:
# 1) load explicit knowledge from UMLS::Association
# 2) clone explicit knowledge (for removal from implicit)
# 3) apply filtering to explicit knowledge
# 4) square explicit knowledge to generate implicit knowledge
# 5) remove explicit knowledge from implicit knowledge
# 6) filter implicit knowledge
# 
# which has code as:
# TODO insert sample code

#NOTE: CUI merging/term expansion can also be easily done by adding
#   two or more explicit vectors, then generating explicit knowledge from
#   them.  BUT also interesting is that term expansion, etc. is
#   unnecessary if we just rank against every term. We may however need
#   to modify the ranking metrics to account for synonyms, etc. (max value
#   of a set of synonyms or something)


######################################################################
#           Functions to perform Literature Based Discovery
######################################################################


# Extracts the rows belonging to the given CUIs from a sparse matrix.
# Each selected row is copied cell by cell, so editing the result does
# not touch the source matrix. CUIs without a row in the matrix are
# skipped, and a row that holds no entries yields no row in the output.
# input:  $cuisRef <- an array reference to a list of CUIs
#         $matrixRef <- a reference to a co-occurrence matrix
# output: a hash ref to a sparse matrix containing just the rows retrieved
sub getRows {
    my ($cuisRef, $matrixRef) = @_;

    my %selected;
    for my $cui (@{$cuisRef}) {
        #nothing to copy when the matrix has no row for this cui
        next unless exists $matrixRef->{$cui};

        my $sourceRow = $matrixRef->{$cui};
        my @columns = keys %{$sourceRow};

        #an empty row must not create an (empty) row in the result
        next unless @columns;

        #copy every cell of the row in one slice assignment
        @{ $selected{$cui} }{@columns} = @{$sourceRow}{@columns};
    }
    return \%selected;
}


#NOTE...this is calculating B*A ... but is that appropriate?  ... I think that it is, but the values are maybe not so appropriate    ... B*A is nice because it makes the implicit matrix not keep track of non-starting cui rows.   ...but the values are...

# Finds the implicit connections for all CUIs by computing the sparse
# matrix product $matrixB * $matrixA. When $matrixB is the starting matrix
# (only the rows of interest) and $matrixA is the explicit matrix, the
# result holds implicit knowledge for the starting rows only. Both
# arguments may also be the same explicit matrix (i.e. squaring it).
#
# Only the non-zero entries of each B row are visited, and the matching
# A row is looked up directly, so the work is proportional to the entries
# that actually contribute to the product rather than to the whole of A
# for every row of B.
# input:  $matrixARef <- ref to a sparse matrix (hash of hashes)
#         $matrixBRef <- ref to a sparse matrix (hash of hashes)
# output: ref to a sparse matrix of the product of B*A; a row appears in
#         the result only if at least one term contributed to it
sub findImplicit {
    my $matrixARef = shift;
    my $matrixBRef = shift;

    my %product = ();
    #loop over the rows of the B matrix
    foreach my $rowKey (keys %{$matrixBRef}) {
        my $rowB = $matrixBRef->{$rowKey};

        #loop over the non-zero entries of this B row
        foreach my $linkKey (keys %{$rowB}) {

            #a B entry only contributes if A has a row for its column
            next unless exists $matrixARef->{$linkKey};

            my $rowA = $matrixARef->{$linkKey};
            my $weight = $rowB->{$linkKey};

            #accumulate weight * A[link][col] into product[row][col];
            #'+=' starts a missing cell at 0, so no explicit init is needed
            foreach my $colKey (keys %{$rowA}) {
                $product{$rowKey}{$colKey} += $weight * $rowA->{$colKey};
            }
        }
    }
    return \%product;
}


# removes explicit connections from the matrix of implicit connections by 
# removing keys (O(k), where k is the number of keys in the explicit matrix,
# we expect the explicit k to be smaller than the implicit k)
# input: $explicitMatrixRef <- reference to the explicit knowledge matrix
#        $implicitMatrixRef <- reference to the implicit knowledge matrix
# output: ref to the implicit matrix with explicit knowledge removed
sub removeExplicit {
    my $explicitMatrixRef = shift;
    my $implicitMatrixRef = shift;

    #Check each key of the explicit matrix to see if it exists
    # in the implicit matrix
    foreach my $key1(keys %{$explicitMatrixRef}) {
	if (exists ${$implicitMatrixRef}{$key1}) {
	    foreach my $key2(keys %{${$explicitMatrixRef}{$key1}}) {
		if (exists ${${$implicitMatrixRef}{$key1}}{$key2}) {
		    delete ${${$implicitMatrixRef}{$key1}}{$key2};
		}
	    }
	}
    }
    return $implicitMatrixRef;



( run in 3.726 seconds using v1.01-cache-2.11-cpan-d8267643d1d )