ALBD
view release on metacpan or search on metacpan
utils/datasetCreator/dataStats/metaAnalysis.pl view on Meta::CPAN
# determines the number of rows, columns, vocabulary size, and total number of
# co-occurrences of a co-occurrence file, or set of co-occurrence files
use strict;
use warnings;
# Single-matrix run: print the stats of one co-occurrence matrix to STDOUT.
metaAnalysis('/home/henryst/lbdData/groupedData/1960_1989_window8_noOrder');

# Date-range run: collect the stats of one co-occurrence matrix per year and
# write them to a stats file. The data folder is expected to contain a
# co-occurrence matrix for every year within the date range; years whose
# file cannot be opened are reported and skipped by folderMetaAnalysis.
# NOTE(review): this path has no trailing '/'; confirm it matches how
# folderMetaAnalysis builds the per-year file names.
my $dataFolder = '/home/henryst/lbdData/dataByYear/1960_1989';
my $startYear = '1809';
my $endYear = '2015';
my $windowSize = 1;
my $statsOutFileName = '/home/henryst/lbdData/stats_window1';
folderMetaAnalysis(
    $startYear, $endYear, $windowSize, $statsOutFileName, $dataFolder,
);
#####################
# Runs meta analysis on a set of per-year co-occurrence files and writes one
# tab-separated stats row per readable file.
#
# Parameters (positional):
#   $startYear        - first year to process (inclusive)
#   $endYear          - last year to process (inclusive)
#   $windowSize       - window size used in the input file names
#   $statsOutFileName - path of the stats file to (over)write
#   $dataFolder       - folder (or path prefix) holding the per-year files
#
# Each input file is looked up as "<dataFolder><year>_window<windowSize>".
# Years whose file cannot be opened are reported on STDOUT and skipped.
# Dies if the stats file cannot be opened or closed.
sub folderMetaAnalysis {
    my $startYear        = shift;
    my $endYear          = shift;
    my $windowSize       = shift;
    my $statsOutFileName = shift;
    my $dataFolder       = shift;

    # The input path is built by plain concatenation, so a directory passed
    # without a trailing '/' would resolve to "<folder><year>_window<n>"
    # (a sibling of the folder) rather than a file inside it. Only existing
    # directories are adjusted, so a deliberate path prefix keeps working.
    if (-d $dataFolder && $dataFolder !~ m{/\z}) {
        $dataFolder .= '/';
    }

    # Check on I/O. 3-arg open with a lexical handle: the file name can no
    # longer be misread as an open mode, and no global handle is clobbered.
    open my $out, '>', $statsOutFileName
        or die ("ERROR: unable to open stats out file: $statsOutFileName\n");

    # print header row
    print {$out} "year\tnumRows\tnumCols\tvocabularySize\tnumCooccurrences\n";

    # get stats for each file and output to file
    for (my $year = $startYear; $year <= $endYear; $year++) {
        print "reading $year\n";
        my $inFile = $dataFolder.$year.'_window'.$windowSize;

        # Probe that the file can be opened, then release the handle right
        # away; metaAnalysis opens the file itself.
        if (open my $probe, '<', $inFile) {
            close $probe;
            my ($numRows, $numCols, $vocabularySize, $numCooccurrences)
                = metaAnalysis($inFile);
            print {$out} "$year\t$numRows\t$numCols\t$vocabularySize\t$numCooccurrences\n";
        }
        else {
            # just skip the file
            print " ERROR: unable to open $inFile\n";
        }
    }

    # Buffered write errors (e.g. a full disk) only surface at close time.
    close $out
        or die ("ERROR: unable to close stats out file: $statsOutFileName: $!\n");
    print "Done getting stats\n";
}
##############################
# Runs meta analysis on a single co-occurrence file.
#
# Every line is expected to contain "<row>\t<col>\t<integer value>". Lines
# without that pattern (blank or malformed lines) are skipped, so they can
# neither inflate the line count nor register stale or undefined keys.
#
# Parameter:
#   $fileName - path of the co-occurrence file to read
#
# Returns the list (numRows, numCols, vocabularySize, numCooccurrences):
#   numRows          - number of distinct row keys
#   numCols          - number of distinct column keys
#   vocabularySize   - number of distinct keys seen as a row or a column
#   numCooccurrences - number of matching lines
# The same four numbers are also printed to STDOUT, prefixed by the file name.
# Dies if the file cannot be opened.
sub metaAnalysis {
    my $fileName = shift;

    # 3-arg open with a lexical handle: the file name is never interpreted
    # as an open mode, and no global bareword handle is shared with callers.
    open my $in, '<', $fileName or die ("unable to open file: $fileName\n");

    my $numCooccurrences = 0;
    my %rowKeys    = ();    # distinct row keys    -> number of rows
    my %colKeys    = ();    # distinct column keys -> number of columns
    my %uniqueKeys = ();    # distinct keys overall -> vocabulary size

    while (my $line = <$in>) {
        # A failed match does not reset $1/$2, so the captures are only
        # trusted after a successful match.
        next unless $line =~ /([^\t]+)\t([^\t]+)\t([\d]+)/;
        my ($row, $col) = ($1, $2);    # the value ($3) is not used

        $rowKeys{$row}    = 1;
        $colKeys{$col}    = 1;
        $uniqueKeys{$row} = 1;
        $uniqueKeys{$col} = 1;

        # NOTE(review): this counts one per matching line; the value field
        # is not summed -- confirm that is the intended "number of
        # co-occurrences".
        $numCooccurrences++;
    }
    close $in;

    my $numRows        = scalar keys %rowKeys;
    my $numCols        = scalar keys %colKeys;
    my $vocabularySize = scalar keys %uniqueKeys;
    print "$fileName: $numRows, $numCols, $vocabularySize, $numCooccurrences\n";
    return $numRows, $numCols, $vocabularySize, $numCooccurrences;
}
( run in 1.232 second using v1.01-cache-2.11-cpan-39bf76dae61 )