App-GeoCancerPrognosticDatasetsRetriever
view release on metacpan or search on metacpan
bin/geoCancerPrognosticDatasetsRetriever view on Meta::CPAN
print FH "$data";
close(FH);
return $geo_datasets_file;
}
############################ SUBROUTINE 6 #######################################################
#This subroutine performs minor formatting of a GEO input file to merge the title and abstract
#lines together to prevent the regex lines from missing potential keyword hits in the 'title'
#line.
sub format_input {
    #Arguments:
    #  $_[0] - name of the raw GEO input file, read from "$prog_path/data/".
    #  $_[1] - name of the reformatted file, written to "$prog_path/data/".
    #Returns 1 on success. Dies if either file cannot be opened or if the
    #reformatted file cannot be closed cleanly.
    my ($raw_input, $out_file) = @_;
    my $concatenate = "";   #pending (chomped) title line waiting for its abstract line

    #Report the file that is actually being formatted, i.e. the argument of this call.
    print color ("green"), "Formatting Input: $raw_input...", color("reset");

    #3-argument open with an explicit mode, so the file name can never be interpreted
    #as a mode/pipe; lexical handles so nothing leaks into other subroutines.
    open (my $in, '<', "$prog_path/data/$raw_input") or die "Cannot open file for reformatting: $raw_input. $!.\n";
    open (my $out, '>', "$prog_path/data/$out_file") or die "Cannot open file for writing reformatted data: $out_file $!\n";

    while (my $line = <$in>) {
        #title line: starts with the entry number, e.g. "12. Some title"
        if ($line =~ m/^\d+\.\s+/) {
            #a previous title that never received an abstract line is written
            #out on its own instead of being silently overwritten.
            print {$out} "$concatenate\n" if length $concatenate;
            $concatenate = $line;
            chomp($concatenate);
        }
        #any other line: the first one after a title is the abstract and is glued
        #to the title; later lines pass through unchanged ($concatenate is empty).
        else {
            print {$out} $concatenate . $line;
            $concatenate = "";   #reinitialize variable for next entry.
        }
    }

    #a title on the very last line of the input has no abstract: keep it anyway.
    print {$out} "$concatenate\n" if length $concatenate;

    close ($in);
    #buffered write errors (e.g. disk full) only surface when the handle is closed.
    close ($out) or die "Cannot close reformatted file: $out_file $!\n";

    print color ("green"), "done\n", color("reset");
    return 1;
}
############################ SUBROUTINE 7 #######################################################
#Check for the presence of curl in the $PATH. If it is not found, install it on Ubuntu/Ubuntu-based
#systems; on any other system, prompt the user to install it manually.
sub check_curl {
    #Takes no arguments. The return value is not meaningful; the subroutine only
    #reports on (and, on Ubuntu, attempts to repair) a missing curl binary.

    #check for the presence of curl binary: nothing to do when 'which' prints a path
    my $check = qx{which curl};
    return if $check;

    #no curl binary was found: check if current system is Ubuntu/or Ubuntu-based
    my $ubuntu = qx{uname -a};
    #qx yields undef when the command could not be run at all; treat that as "not Ubuntu".
    if (defined $ubuntu && $ubuntu =~ /.+ubuntu.+/i) {
        print color ("red"), "curl binary was not found: follow onscreen instructions/input your password for its installation...\n\n", color("reset");
        #system() returns 0 only when the command ran and exited successfully,
        #so "done" is reported only for an installation that really succeeded.
        if (system("sudo apt -y install curl") == 0) {
            print "done\n";
        }
        else {
            print color ("red"), "curl installation failed: install it on your system manually.\n", color("reset");
        }
    }
    else {
        print "curl is not found on this system: install it on your system.\n";
    }
    return;
}
############################ SUBROUTINE 8 #######################################################
#This subroutine runs the main processing steps, while running other subroutines to continue the
#processing pipeline.
sub main {
my $main_formatted_input_file = $_[0];
my $main_output_file = $_[1];
print color ("green"), "Analyzing Input: $main_formatted_input_file file...\n", color("reset");
#open input file
open (FH, "$prog_path/data/$main_formatted_input_file") or die "Cannot open file: $main_formatted_input_file $!\n";
#open output file
open (FH2, '>', "$prog_path/results/$main_output_file") or die "Cannot open file for writing data: $!\n";
while ($line = <FH>) {
if ($line =~ m/.*(prognosis|prognostic|prognostically|prognosticator|survival|survive|survives|survived|surviving).*/ig) {
#if ($line =~ m/.*(progno.+\s?|surviv.+\s?).*/ig) {
$flag = 1;
$prog_flag = 1;
next;
}
#this conditional activates when the above keywords are not found and only "more..." is found.
#Note this conditional implicitly doesn't get executed if both the desired keyword and "more..." are found.
elsif ($line =~ m/.+(more\.\.\.)/ig) {
$flag = 1;
$wget_flag = 1;
next;
}
elsif ($line =~ m/(^Organism:\s+Homo\s+sapiens.*)/ig) {
$simple_hash{'Organism_line'} = "$1";
$human_flag = 1;
next;
}
elsif ($line =~ m/^Type:.+/) { next; }
#elsif ($line =~ m/.*Platform.?:\s+GPL(570|96|97)\s+.+/) {
elsif ($line =~ m/.*Platform.?:\s+GPL($regex_platform)\s+.+/) {
$i++;
$flag = 1;
print "$i.\n$line$simple_hash{'Organism_line'}\n";
print FH2 "$i.\n$line$simple_hash{'Organism_line'}\n";
( run in 0.593 second using v1.01-cache-2.11-cpan-39bf76dae61 )