ALBD

 view release on metacpan or  search on metacpan

lib/LiteratureBasedDiscovery/TimeSlicing.pm  view on Meta::CPAN

		($truePositive/($truePositive+$falsePositive)); 
	} #else precision += 0 ... nothing needs to be done
	if ((scalar keys %{${$trueMatrixRef}{$rowKey}}) > 0) {
	    $recall += 
		($truePositive/
		 (scalar keys %{${$trueMatrixRef}{$rowKey}}));
	} #else recall += 0
    }

    #calculate the averages (divide by the number of rows 
    #    = the number of terms in the post cutoff matrix)
    $precision /= scalar keys %{$trueMatrixRef};
    $recall /= scalar keys %{$trueMatrixRef};

    #return the average precision and recall
    return ($precision, $recall);
}


# Loads the post cutoff matrix from file. Only rows whose row CUI is also a
# row key of the starting matrix are loaded, to save memory and because those
# are the only rows that are needed.
#
# Each line of the file is split on tabs into: cui1, cui2, value.
# A row is created for every start CUI that appears as cui1 in the file,
# even when none of its columns are kept. A column (cui2) is only kept when
# it is a key of the explicit matrix, since it is impossible to predict
# terms that are not in the vocabulary of the pre-cutoff dataset.
#
# input:  $startingMatrixRef <- a ref to the starting sparse matrix
#         $explicitMatrixRef <- a ref to the explicit sparse matrix
#         $postCutoffFileName <- the filename to the postCutoffMatrix
# output: \%postCutoffMatrix <- a ref to the postCutoff sparse matrix
# dies if the post cutoff file cannot be opened
sub loadPostCutOffMatrix {
    my $startingMatrixRef = shift;
    my $explicitMatrixRef = shift;
    my $postCutoffFileName = shift;
    print "loading postCutoff Matrix\n";

    #open the post cutoff file. The three-argument form with a lexical
    #  handle is used so that the file name is taken literally (no mode
    #  characters or whitespace stripping) and so that no package-wide
    #  bareword handle is clobbered. $! reports why the open failed.
    open(my $postCutoffFh, '<', $postCutoffFileName)
        or die ("ERROR: cannot open post cutoff file: $postCutoffFileName: $!");

    #read in values of the post cutoff matrix for the start terms
    my %postCutoffMatrix = ();
    while (my $line = <$postCutoffFh>) {
        #grab values from the line
        chomp $line;
        my ($cui1, $cui2, $val) = split(/\t/, $line);

        #blank lines yield no fields at all, there is nothing to read
        next if !defined $cui1;

        #see if this line contains a key that should be read in
        #  (the row keys of the starting matrix are the cuis to grab)
        next if !exists ${$startingMatrixRef}{$cui1};

        #make sure the row exists, even if no column ends up being added
        if (!defined $postCutoffMatrix{$cui1}) {
            $postCutoffMatrix{$cui1} = {};
        }

        #check to ensure that the column cui is in the
        #  vocabulary of the pre-cutoff dataset.
        #  it is impossible to make predictions of words that
        #  don't already exist
        #NOTE: this assumes $explicitMatrixRef is a square
        #   matrix (so unordered)
        if (defined $cui2 && exists ${$explicitMatrixRef}{$cui2}) {
            ${$postCutoffMatrix{$cui1}}{$cui2} = $val;
        }
    }
    close $postCutoffFh;

    #return the post cutoff matrix
    return \%postCutoffMatrix;
}

#TODO numRows should be read from file and sent with the lbdOptionsRef
# generates a starting matrix, either from the CUIs listed in the
# cuiListFileName option (when it is set) or from numRows randomly selected terms
# input:  $explicitMatrixRef <- a ref to the explicit sparse matrix
#         $lbdOptionsRef <- the LBD options
#         $startTermAcceptTypesRef <- a reference to an hash of accept 
#                                     types for start terms (TUIs)
#         $numRows <- the number of random rows to load (if random)
#         $umls_interface <- an instance of the UMLS::Interface
# output: \%startingMatrix <- a ref to the starting sparse matrix
sub generateStartingMatrix {
    my $explicitMatrixRef = shift;
    my $lbdOptionsRef = shift;
    my $startTermAcceptTypesRef = shift;
    my $numRows = shift;
    my $umls_interface = shift;

    #generate the starting matrix randomly or from a file
    my %startingMatrix = ();

    #check if a file is defined
    if (exists ${$lbdOptionsRef}{'cuiListFileName'}) {
	#grab the rows defined by the cuiListFile
	my $cuisRef = &loadCUIs(${$lbdOptionsRef}{'cuiListFileName'});
	foreach my $cui (keys %{$cuisRef}) {
	    if(exists ${$explicitMatrixRef}{$cui}) {
		$startingMatrix{$cui} = ${$explicitMatrixRef}{$cui};	
	    }
	    else {
		print STDERR "WARNING: CUI from cuiListFileName is not in explicitMatrix: $cui\n";
	    }
	}
    }
    else {
	#randomly grab rows
	#apply semantic filter to the rows (just retreive appropriate rows)
	my $rowsToKeepRef = getRowsOfSemanticTypes(
	    $explicitMatrixRef, $startTermAcceptTypesRef, $umls_interface);
	((scalar keys %{$rowsToKeepRef}) >= $numRows) or die("ERROR: number of acceptable rows starting terms is less than $numRows\n");

	#randomly select 100 rows (to generate the 'starting matrix')
	#generate random numbers from 0 to number of rows in the explicit matrix
	my %rowNumbers = ();
	while ((scalar keys %rowNumbers) < $numRows) {
	    $rowNumbers{int(rand(scalar keys %{$rowsToKeepRef}))} = 1;
	}



( run in 1.427 second using v1.01-cache-2.11-cpan-39bf76dae61 )