App-geoCancerPrognosticDatasetsRetriever

 view release on metacpan or  search on metacpan

bin/geoCancerPrognosticDatasetsRetriever  view on Meta::CPAN

		}

		#else no "interrupted" run directory was found. Start a new run.
		else {
			
			new_run($query_term_1);
		}

		sub new_run {

			# Start a brand-new run: download a fresh GEO DataSets input file,
			# create the results directory for this run and set up the platform
			# regex and the output file name.
			#
			# Argument: $_[0] - cancer name; only used in the progress message
			#                   (the download itself is driven by $options{d}).
			# Side effects: assigns $input_file, $regex_platform and $output_file,
			# which are declared outside this sub.
			# Exits the program if the downloaded input file is not on disk.
			my $cancer = $_[0];

			print color ("green"), "Downloading input file for \"$cancer\" cancer from GeoDatasets...", color("reset");
			$input_file = download_geo_input($options{d});
			print color ("green"), "done\n", color("reset");

			#create results output directory
			# The builtin mkdir avoids passing an interpolated path through the
			# shell. A failure is reported but, as before, does not stop the run.
			unless (-d $run_dir) {
				mkdir($run_dir) or warn "Cannot create results directory: $run_dir. $!\n";
			}

			# Remove every literal "GPL" from the platform list and join the
			# remaining IDs with '|'. split(' ') skips leading and repeated
			# whitespace, so a stray extra space cannot yield an empty alternative
			# (presumably $regex_platform is later used as a regex alternation).
			(my $platform_ids = $platform_gpl) =~ s/GPL//g;
			$regex_platform   = join( '|', split( ' ', $platform_ids ) );
			$output_file      = "$cancer_type.out";

			#Check for the presence of the input file.
			unless (-e "$prog_path/data/$input_file") {

				print color ("red"), "Input file: $input_file was not found.\n", color("reset");
				exit;
			}
		}
	}
}
############################ SUBROUTINE 5 #######################################################
# The following code was reused from the NCBI's NBK25501 reference textbook.
# See: https://www.ncbi.nlm.nih.gov/books/NBK25501/
# It was adapted in this subroutine with additional modifications.
#
# Runs an E-utilities ESearch against the 'gds' database with usehistory=y,
# then uses the returned WebEnv/QueryKey pair in an EFetch request for the
# matching records (rettype=abstract, retmode=text) and saves the reply under
# "$prog_path/data/".
#
# Argument: $_[0] - query term; its first space-separated word becomes the
#                   prefix of the saved file name.
# Returns:  the name (not the full path) of the file that was written.
# Exits the program when either request yields no usable data; dies when the
# output file cannot be opened or completely written.
sub download_geo_input {

    my $query    = $_[0];
    my ($cancer) = split(/ /, $query);
    my $geo_db   = 'gds';
    my $base     = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/';

    # NOTE(review): $query goes into the URL unescaped; characters such as
    # '&' or '#' in a query would corrupt the request - confirm what callers pass.
    my $url      = $base . "esearch.fcgi?db=$geo_db&term=$query&usehistory=y";
    my $output   = get($url);

    # Pull the history-server handles out of the ESearch reply. A list-context
    # match leaves the variable undefined when the tag is absent; the former
    # "my $x = $1 if (...)" form has undefined behaviour (see perlsyn).
    my ($web, $key);
    if (defined $output) {
        ($web) = $output =~ /<WebEnv>(\S+)<\/WebEnv>/;
        ($key) = $output =~ /<QueryKey>(\d+)<\/QueryKey>/;
    }

    # Only issue the efetch request when the search step produced both handles;
    # otherwise $data stays undefined and the run is aborted below.
    my $data;
    if (defined $web && defined $key) {

        #assemble the efetch URL
        $url   = $base . "efetch.fcgi?db=$geo_db&query_key=$key&WebEnv=$web";
        $url  .= "&rettype=abstract&retmode=text";

        $data  = get($url);
    }

    #Check for a GeoDatasets timeout error and abort run, if found.
    if (!$data) {

        print color ("red"), "\nThe download from GeoDatasets was not successful...\nA GeoDatasets timeout error was detected: current run aborted...\nPlease restart the run...\n", color("reset");
        exit; #abort current run
    }

    #add date & time to current input file download
    my $geo_datasets_file = "$cancer\_cancer_GEO_$current_date_time.txt";

    # A lexical handle keeps this sub from clobbering the global FH handle
    # that other parts of the script also open.
    open(my $out_fh, '>', "$prog_path/data/$geo_datasets_file") or die "Cannot open file for writing the GDS input:$!\n";
    binmode($out_fh, ":utf8");

    print {$out_fh} $data;

    # Buffered write errors only surface at close, so check it.
    close($out_fh) or die "Cannot finish writing the GDS input:$!\n";

    return $geo_datasets_file;
}
############################ SUBROUTINE 6 #######################################################
#This subroutine performs minor formatting of a GEO input file to merge the title and abstract 
#lines together to prevent the regex lines from missing potential keyword hits in the 'title' 
#line.
#
# Arguments: $_[0] - name of the raw input file inside "$prog_path/data/"
#            $_[1] - name of the reformatted file to write in the same directory
# A title line is one that starts with digits, a dot and whitespace ("12. ...").
# It is held back without its newline and written out glued to the line that
# follows it; every other line is copied through unchanged.
# Dies if either file cannot be opened or the output cannot be completely written.
sub format_input {
	
	my $raw_input 	=	$_[0];
	my $out_file  	=	$_[1];
	my $concatenate;
	
	# Report the file that is actually being formatted (the first argument).
	print color ("green"), "Formatting Input: $raw_input...", color("reset");
	
	# Three-argument open with lexical handles: the file name can never be
	# taken as an open mode, and no global handles are clobbered.
	open (my $in_fh, '<', "$prog_path/data/$raw_input") or die "Cannot open file for reformatting: $raw_input. $!.\n";
	open (my $out_fh, '>', "$prog_path/data/$out_file") or die "Cannot open file for writing reformatted data: $out_file $!\n";
	
	while (my $line = <$in_fh>) {
			
		#title line check only
		if ($line =~ m/^\d+\.\s+/) { 
			
			# Hold the title back until the following line arrives.
			$concatenate = $line;
			chomp($concatenate);
		}
 			
		#abstract line (or any other non-title line)
		else {
			
			$concatenate .= $line;
			print {$out_fh} $concatenate;
			$concatenate  = ""; #reinitialize variable for next entry.
		}	
	}
	
	close ($in_fh);
	# Buffered write errors only surface at close, so check it.
	close ($out_fh) or die "Cannot finish writing reformatted data: $out_file $!\n";
	
	print color ("green"), "done\n", color("reset");
}
############################ SUBROUTINE 7 #######################################################
#This subroutine runs the main processing steps, while running other subroutines to continue the 
#processing pipeline.
sub main {
	
	my $main_formatted_input_file  = $_[0];
	my $main_output_file           = $_[1];
	
	print color ("green"), "Analyzing Input: $main_formatted_input_file file...\n", color("reset");

	#open input file
	open (FH, "$prog_path/data/$main_formatted_input_file") or die "Cannot open file: $main_formatted_input_file $!\n";

	#open output file
	open (FH2, '>', "$results_subdir$main_output_file") or die "Cannot open file for writing data: $!\n";



( run in 1.376 second using v1.01-cache-2.11-cpan-39bf76dae61 )