ALBD
view release on metacpan or search on metacpan
utils/datasetCreator/combineCooccurrenceMatrices.pl view on Meta::CPAN
# Combines the co-occurrence counts for the year range specified (inclusive;
# e.g. 1983-1985 will combine the counts from the files for 1983, 1984, and
# 1985). This file is intended to run on co-occurrence matrices
# created separately for each year, and stored in a single folder. Creating
# co-occurrence matrices in this manner is useful because it makes running
# the CUICollector faster, and because files can be easily combined for
# different time slicing or discovery replication results. We ran CUI Collector
# separately for each year of the MetaMapped MEDLINE baseline and stored each
# co-occurrence matrix in a single folder "hadoopByYear/output/". That folder
# contained files named by the year and window size used (e.g. 1975_window8).
# The code may need to be modified slightly for other purposes.
use strict;
use warnings;
# User-configurable settings -- edit these values before running the script.

# Folder holding the per-year co-occurrence matrices. The trailing slash is
# required: input file names are appended directly to this string.
my $dataFolder = '/home/henryst/hadoopByYear/output/';

# Window size that appears in the input file names (e.g. 1983_window8).
my $windowSize = 8;

# First and last year of the range to combine (both inclusive).
my $startYear = '1983';
my $endYear = '1985';

# Sum the matrices of every year in the range into one output file.
combineFiles($startYear, $endYear, $windowSize);
#####################################################
####### Program Start ########
# Sums the co-occurrence counts of every per-year matrix in the inclusive
# range [$startYear, $endYear] and writes the combined matrix to a new file
# named "<startYear>_<endYear>_window<windowSize>" in the current directory.
#
# Input files are read from the file-level $dataFolder and must be named
# "<year>_window<windowSize>". Each input line is expected to contain three
# tab-separated, whitespace-free fields: row key, column key, and count.
# Lines that do not have that shape are skipped.
#
# input:  $startYear  <- first year to combine (inclusive)
#         $endYear    <- last year to combine (inclusive)
#         $windowSize <- window size used in the input/output file names
# output: none (the combined matrix is written to disk)
# dies:   if the output file already exists, cannot be opened, or cannot be
#         closed cleanly. A year whose input file cannot be opened is
#         reported on STDOUT and skipped.
sub combineFiles {
    my ($startYear, $endYear, $windowSize) = @_;

    #Check on I/O
    my $outFileName = "$startYear".'_'."$endYear".'_window'."$windowSize";
    (!(-e $outFileName))
        or die ("ERROR: output file already exists: $outFileName\n");
    # three-argument open with a lexical handle: the mode can never be
    # taken from the file name, and the handle is not a package global
    open my $outFh, '>', $outFileName
        or die ("ERROR: unable to open output file: $outFileName\n");

    #combine the files
    my %matrix = ();
    for my $year ($startYear .. $endYear) {
        print "reading $year\n";
        my $inFile = $dataFolder.$year.'_window'.$windowSize;
        my $inFh;
        if (!(open $inFh, '<', $inFile)) {
            print " ERROR: unable to open $inFile\n";
            next;
        }

        #read each line of the file and add to the matrix
        while (my $line = <$inFh>) {
            # Capture directly from the match and skip the line when it
            # fails. Reading $1..$3 after a failed match would yield the
            # captures of the previous successful match, so a blank or
            # malformed line would silently count the previous entry again.
            my ($rowKey, $colKey, $val)
                = $line =~ /([^\s]+)\t([^\s]+)\t([^\s]+)/
                or next;

            # autovivification creates the row hash and the cell on demand
            $matrix{$rowKey}{$colKey} += $val;
        }
        close $inFh;
    }

    #output the matrix
    print "outputting the matrix\n";
    foreach my $rowKey (keys %matrix) {
        my $row = $matrix{$rowKey};
        foreach my $colKey (keys %{$row}) {
            print {$outFh} "$rowKey\t$colKey\t$row->{$colKey}\n";
        }
    }
    # buffered write errors (e.g. a full disk) only surface at close time
    close $outFh
        or die ("ERROR: unable to finish writing output file: $outFileName: $!\n");
    print "DONE!\n";
}
( run in 0.696 second using v1.01-cache-2.11-cpan-39bf76dae61 )