App-GeoCancerPrognosticDatasetsRetriever
view release on metacpan or search on metacpan
bin/geoCancerPrognosticDatasetsRetriever view on Meta::CPAN
new_run($query_term_1);
}
# Starts a fresh run: downloads the GEO input file, creates the results
# directory, derives the platform regex from the -p option and verifies that
# the downloaded input file is present on disk.
#
# Parameters:
#   $_[0] - cancer name; used here only in the progress message (the actual
#           download query is taken from $options{d}).
#
# Globals written: $input_file, $platform_gpl, $regex_platform, $output_file.
# Globals read:    %options, $run_dir, $prog_path, $cancer_type.
# NOTE(review): $output_file is named after $cancer_type, which is not set in
# this subroutine -- presumably assigned by the caller; confirm.
#
# Exits the program if the input file cannot be found after the download.
sub new_run {
    my ($cancer) = @_;

    # Download step, bracketed by progress messages.
    print color('green'), "Downloading input file for \"$cancer\" cancer from GeoDatasets...", color('reset');
    $input_file = download_geo_input($options{d});
    print color('green'), "done\n", color('reset');

    # Create the results output directory.
    system("mkdir $run_dir");

    # Strip every "GPL" prefix from the upper-cased platform list, then turn
    # the space-separated IDs into a regex alternation (e.g. "570|96").
    $platform_gpl = uc $options{p};
    my @gpl_stripped_parts = split /GPL/, $platform_gpl;
    my $regex1             = join '', @gpl_stripped_parts;
    my @platform_ids       = split / /, $regex1;
    $regex_platform        = join '|', @platform_ids;

    $output_file = "$cancer_type.out";

    # Check for the presence of the input file.
    my $input_path = "$prog_path/data/$input_file";
    unless (-e $input_path) {
        print color('red'), "Input file: $input_file was not found.\n", color('reset');
        exit;
    }
}
}
my $local_query = $options{d};
my $local_gpl = $options{p};
$input_command_line = "User input command: ./geoCancerPrognosticDatasetsRetriever -d \"$local_query\" -p \"$local_gpl\"";
}
############################ SUBROUTINE 5 #######################################################
# The following code was reused from the NCBI's NBK25501 reference textbook.
# See: https://www.ncbi.nlm.nih.gov/books/NBK25501/
# It was adapted in this subroutine with additional modifications.
# Downloads the GEO DataSets (gds) records matching a query through the NCBI
# E-utilities (esearch with history, followed by efetch) and stores them as a
# text file under "$prog_path/data".
#
# Parameters:
#   $_[0] - the search term sent to esearch. It is interpolated into the URL
#           as given. NOTE(review): no URL-escaping is applied here; whether
#           get() escapes the URL itself cannot be told from this subroutine.
#
# Returns: the base name of the file that was written
#          ("<first word of query>_cancer_GEO_<$current_date_time>.txt").
#
# Exits the program (after printing a red message) when either E-utilities
# request yields no usable data; dies if the output file cannot be written.
#
# Globals read: $prog_path, $current_date_time.
sub download_geo_input {
    my ($query) = @_;

    # Only the first space-separated word of the query names the output file.
    my ($cancer) = split(/ /, $query);
    my $geo_db   = 'gds';
    my $base     = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/';

    # Step 1: esearch, storing the result set on the NCBI history server.
    my $url    = $base . "esearch.fcgi?db=$geo_db&term=$query&usehistory=y";
    my $output = get($url);

    # Extract the history handles. The captures are taken in list context
    # instead of "my $x = $1 if ...", whose behaviour is undefined in Perl.
    my ($web, $key);
    if (defined $output) {
        ($web) = $output =~ /<WebEnv>(\S+)<\/WebEnv>/;
        ($key) = $output =~ /<QueryKey>(\d+)<\/QueryKey>/;
    }

    # Without both handles the efetch URL would be malformed, so stop here.
    if (!defined $web || !defined $key) {
        print color ("red"), "\nThe search on GeoDatasets was not successful...\nNo WebEnv/QueryKey was returned for the query: current run aborted...\nPlease restart the run...\n", color("reset");
        exit; #abort current run
    }

    #assemble the efetch URL
    $url = $base . "efetch.fcgi?db=$geo_db&query_key=$key&WebEnv=$web";
    $url .= "&rettype=abstract&retmode=text";
    my $data = get($url);

    #Check for a GeoDatasets timeout error and abort run, if found.
    if (!$data) {
        print color ("red"), "\nThe download from GeoDatasets was not successful...\nA GeoDatasets timeout error was detected: current run aborted...\nPlease restart the run...\n", color("reset");
        exit; #abort current run
    }

    #add date & time to current input file download
    my $geo_datasets_file = "$cancer\_cancer_GEO_$current_date_time.txt";
    my $geo_datasets_path = "$prog_path/data/$geo_datasets_file";

    # Three-argument open with a lexical handle; the :utf8 layer replaces the
    # separate binmode call.
    open(my $out_fh, '>:utf8', $geo_datasets_path) or die "Cannot open file for writing the GDS input:$!\n";
    print {$out_fh} $data;
    # Buffered write errors only surface at close, so check it.
    close($out_fh) or die "Cannot close file after writing the GDS input:$!\n";

    return $geo_datasets_file;
}
############################ SUBROUTINE 6 #######################################################
#This subroutine performs minor formatting of a GEO input file to merge the title and abstract
#lines together to prevent the regex lines from missing potential keyword hits in the 'title'
#line.
# Reformats a GEO input file so that each title line (a line starting with
# "<number>. ") is joined with the line that follows it; all other lines are
# copied through unchanged.
#
# Parameters:
#   $_[0] - name of the raw input file, read from "$prog_path/data".
#   $_[1] - name of the reformatted file, written to "$prog_path/data".
#
# Returns: nothing meaningful. Dies if either file cannot be opened or if the
# output file cannot be closed.
#
# Globals read: $prog_path, $input_file (the latter only for the progress
# message).
sub format_input {
    my ($raw_input, $out_file) = @_;

    # Holds a chomped title line until the following line arrives.
    my $concatenate = '';

    print color ("green"), "Formatting Input: $input_file...", color("reset");

    # Three-argument open with lexical handles: the file name can no longer be
    # interpreted as an open mode or a pipe.
    open(my $in_fh, '<', "$prog_path/data/$raw_input") or die "Cannot open file for reformatting: $raw_input. $!.\n";
    open(my $out_fh, '>', "$prog_path/data/$out_file") or die "Cannot open file for writing reformatted data: $out_file $!\n";

    while (my $line = <$in_fh>) {
        #title line check only
        if ($line =~ m/(^\d+\.\s+.*)/) {
            # A title directly followed by another title used to be silently
            # overwritten; write the pending one out on its own line instead.
            print {$out_fh} "$concatenate\n" if length $concatenate;
            $concatenate = $line;
            chomp($concatenate);
        }
        #abstract line (or any other non-title line)
        else {
            print {$out_fh} $concatenate . $line;
            $concatenate = ''; #reinitialize variable for next entry.
        }
    }

    # A title on the very last line has no following line to merge with;
    # emit it rather than dropping it.
    print {$out_fh} "$concatenate\n" if length $concatenate;

    close($in_fh);
    # Buffered write errors only surface at close, so check it.
    close($out_fh) or die "Cannot close file after writing reformatted data: $out_file $!\n";

    print color ("green"), "done\n", color("reset");
}
############################ SUBROUTINE 7 #######################################################
#Check for the presence of curl in the $PATH. If not found, install on an Ubuntu system or if
#not Ubuntu, prompt user to install it manually.
sub check_curl {
#check for the presence of curl binary
my $check = qx{which curl};
#if no curl binary was found, install it on Ubuntu/Ubuntu-based systems
if (!$check) {
#check if current system is Ubuntu/or Ubuntu-based
my $ubuntu = qx{uname -a};
if ($ubuntu=~ /.+ubuntu.+/ig) {
( run in 3.181 seconds using v1.01-cache-2.11-cpan-140bd7fdf52 )