App-geoCancerPrognosticDatasetsRetriever
view release on metacpan or search on metacpan
bin/geoCancerPrognosticDatasetsRetriever view on Meta::CPAN
}
#else no "interrupted" run directory was found. Start a new run.
else {
new_run($query_term_1);
}
# new_run($cancer)
# Starts a fresh run: downloads the GEO input file, creates the run directory,
# derives the platform regex and the output file name, and verifies that the
# downloaded input file exists on disk.
#   $cancer - cancer name; in this sub it is only used in the progress message.
# Writes to $input_file, $regex_platform and $output_file, none of which are
# declared with `my` here (presumably file-level variables; verify against the
# rest of the script).
# Exits the program if the input file is not found under $prog_path/data.
sub new_run {
my $cancer = $_[0];
print color ("green"), "Downloading input file for \"$cancer\" cancer from GeoDatasets...", color("reset");
# NOTE(review): the download is driven by $options{d}, not by $cancer;
# presumably both carry the same query term - confirm against the caller.
$input_file = download_geo_input($options{d});
print color ("green"), "done\n", color("reset");
# NOTE(review): $run_dir is interpolated into a shell command and the exit
# status of mkdir is not checked.
system ("mkdir $run_dir"); #create results output directory
# Remove every "GPL" substring from $platform_gpl, then turn the remaining
# space-separated tokens into a regex alternation ("a b" -> "a|b").
my $regex1 = join( '', ( split(/GPL/, $platform_gpl) ) );
$regex_platform = join( '|', ( split(/ /, $regex1) ) );
# The output file is named after $cancer_type (set outside this sub).
$output_file = "$cancer_type.out";
#Check for the presence of the input file.
unless (-e "$prog_path/data/$input_file") {
print color ("red"), "Input file: $input_file was not found.\n", color("reset");
exit;
}
}
}
}
############################ SUBROUTINE 5 #######################################################
# The following code was reused from the NCBI's NBK25501 reference textbook.
# See: https://www.ncbi.nlm.nih.gov/books/NBK25501/
# It was adapted in this subroutine with additional modifications.
# download_geo_input($query)
# Runs an ESearch for $query against the GEO DataSets ('gds') database using the
# Entrez history server, fetches the matching records as plain-text abstracts
# with EFetch, and saves them under "$prog_path/data/".
#   $query  - search term; its first space-separated word is used as the
#             file-name prefix.
#   returns - the name (not the full path) of the file that was written:
#             "<first word>_cancer_GEO_<$current_date_time>.txt".
# Prints an error and exits when either E-utilities request returns nothing
# usable; dies when the output file cannot be opened or closed.
# NOTE(review): $query is interpolated into the URL as-is; characters such as
# '&' or '#' in the term would alter the request - confirm callers pass plain
# terms.
sub download_geo_input {
    my $query = $_[0];
    my ($cancer) = split(/ /, $query);
    my $geo_db = 'gds';
    my $base = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/';

    # Shared failure path for both requests: report and stop the run.
    my $abort_run = sub {
        print color ("red"), "\nThe download from GeoDatasets was not successful...\nA GeoDatasets timeout error was detected: current run aborted...\nPlease restart the run...\n", color("reset");
        exit; #abort current run
    };

    #run the esearch and keep the results on the history server
    my $url = $base . "esearch.fcgi?db=$geo_db&term=$query&usehistory=y";
    my $output = get($url);
    $abort_run->() unless defined $output;

    # List-context matches leave the variable undefined when the tag is
    # missing; this avoids the undefined behaviour of "my $x = ... if COND".
    my ($web) = $output =~ /<WebEnv>(\S+)<\/WebEnv>/;
    my ($key) = $output =~ /<QueryKey>(\d+)<\/QueryKey>/;

    # Without both history tokens the efetch request cannot be built.
    $abort_run->() unless defined $web && defined $key;

    #assemble the efetch URL
    $url = $base . "efetch.fcgi?db=$geo_db&query_key=$key&WebEnv=$web";
    $url .= "&rettype=abstract&retmode=text";
    my $data = get($url);

    #Check for a GeoDatasets timeout error and abort run, if found.
    $abort_run->() if !$data;

    #add date & time to current input file download
    my $geo_datasets_file = "$cancer\_cancer_GEO_$current_date_time.txt";

    # Lexical handle: does not collide with the bareword FH used elsewhere.
    open(my $gds_fh, '>:encoding(UTF-8)', "$prog_path/data/$geo_datasets_file") or die "Cannot open file for writing the GDS input:$!\n";
    print {$gds_fh} $data;
    # Buffered write errors (e.g. a full disk) only surface at close.
    close($gds_fh) or die "Cannot close file after writing the GDS input:$!\n";

    return $geo_datasets_file;
}
############################ SUBROUTINE 6 #######################################################
#This subroutine performs minor formatting of a GEO input file to merge the title and abstract
#lines together to prevent the regex lines from missing potential keyword hits in the 'title'
#line.
# format_input($raw_input, $out_file)
# Copies "$prog_path/data/$raw_input" to "$prog_path/data/$out_file", merging
# each title line (a line starting with "<digits>." followed by whitespace)
# with the line that follows it.
#   $raw_input - name of the downloaded GEO file inside $prog_path/data
#   $out_file  - name of the reformatted file to create in the same directory
# The title is chomped and the next non-title line is appended directly to it
# (no separator is inserted). All other lines are copied unchanged. A title
# line that is not followed by a non-title line is not written.
# Dies when either file cannot be opened or the output cannot be closed.
# NOTE(review): the progress message prints the file-level $input_file rather
# than $raw_input - presumably the same file; confirm against the caller.
sub format_input {
    my $raw_input = $_[0];
    my $out_file = $_[1];
    my $concatenate;

    print color ("green"), "Formatting Input: $input_file...", color("reset");

    # Three-argument open with lexical handles: the file name can never be
    # parsed as an open mode, and no global IN/OUT handle is left behind.
    open(my $in_fh, '<', "$prog_path/data/$raw_input") or die "Cannot open file for reformatting: $raw_input. $!.\n";
    open(my $out_fh, '>', "$prog_path/data/$out_file") or die "Cannot open file for writing reformatted data: $out_file $!\n";

    # $line is lexical so the loop does not clobber a file-level $line.
    while (my $line = <$in_fh>) {
        #title line check only
        if ($line =~ m/(^\d+\.\s+.*)/) {
            # Hold the title (without its newline) until the next line arrives.
            $concatenate = $line;
            chomp($concatenate);
        }
        #abstract line (or any other non-title line)
        else {
            $concatenate .= $line;
            print {$out_fh} $concatenate;
            $concatenate = ""; #reinitialize variable for next entry.
        }
    }

    close($in_fh);
    # Buffered write errors only surface when the output handle is closed.
    close($out_fh) or die "Cannot close reformatted data file: $out_file $!\n";

    print color ("green"), "done\n", color("reset");
}
############################ SUBROUTINE 7 #######################################################
#This subroutine runs the main processing steps, while running other subroutines to continue the
#processing pipeline.
sub main {
my $main_formatted_input_file = $_[0];
my $main_output_file = $_[1];
print color ("green"), "Analyzing Input: $main_formatted_input_file file...\n", color("reset");
#open input file
open (FH, "$prog_path/data/$main_formatted_input_file") or die "Cannot open file: $main_formatted_input_file $!\n";
#open output file
open (FH2, '>', "$results_subdir$main_output_file") or die "Cannot open file for writing data: $!\n";
( run in 1.376 second using v1.01-cache-2.11-cpan-39bf76dae61 )