App-GeoCancerPrognosticDatasetsRetriever

 view release on metacpan or  search on metacpan

bin/geoCancerPrognosticDatasetsRetriever  view on Meta::CPAN

    print FH "$data";

    close(FH);
    
    return $geo_datasets_file;
}
############################ SUBROUTINE 6 #######################################################
#This subroutine performs minor formatting of a GEO input file to merge the title and abstract 
#lines together to prevent the regex lines from missing potential keyword hits in the 'title' 
#line.
sub format_input {
	
	#Joins every numbered GEO title line (e.g. "1. Some title") with the line that 
	#follows it, so that a title and the text after it end up on a single line of 
	#the reformatted file. All other lines are copied through unchanged.
	#
	#Arguments:
	#  $_[0] - name of the raw input file, read from $prog_path/data/
	#  $_[1] - name of the reformatted file, written to $prog_path/data/
	#
	#Dies if either file cannot be opened or if the output file cannot be closed.
	#Returns 1 on success.
	my ($raw_input, $out_file) = @_;
	
	#a title line starts with the entry number, a period and whitespace
	my $title_regex = qr/^\d+\.\s+/;
	
	#holds a title line (without its newline) until the following line is read
	my $pending_title = "";
	
	#report the file that is actually being formatted, i.e. the first argument
	print color ("green"), "Formatting Input: $raw_input...", color("reset");
	
	#three-argument open with lexical handles: the file name is never interpreted 
	#as an open mode and the handles do not leak into the rest of the program
	open (my $in_fh, '<', "$prog_path/data/$raw_input") or die "Cannot open file for reformatting: $raw_input. $!.\n";
	open (my $out_fh, '>', "$prog_path/data/$out_file") or die "Cannot open file for writing reformatted data: $out_file $!\n";
	
	while (my $line = <$in_fh>) {
		
		#title line
		if ($line =~ $title_regex) {
			
			#a previous title that was not followed by any other line would 
			#otherwise be overwritten and lost: write it out on its own line
			print {$out_fh} "$pending_title\n" if length $pending_title;
			
			chomp($line);
			$pending_title = $line;
		}
		
		#any other line (abstract, organism, platform, blank, ...)
		else {
			
			print {$out_fh} $pending_title . $line;
			$pending_title = ""; #reinitialize variable for next entry.
		}
	}
	
	#the input ended on a title line: do not drop it
	print {$out_fh} "$pending_title\n" if length $pending_title;
	
	close ($in_fh);
	
	#buffered write errors only surface when the handle is closed
	close ($out_fh) or die "Cannot close reformatted data file: $out_file $!\n";
	
	print color ("green"), "done\n", color("reset");
	
	return 1;
}
############################ SUBROUTINE 7 #######################################################
#Check for the presence of the curl binary in the $PATH. If it is not found, install it on 
#Ubuntu/Ubuntu-based systems; on any other system, prompt the user to install it manually.
sub check_curl {
	
	#Makes sure the curl binary can be found. When it is missing, an installation 
	#through apt is attempted if the system identifies itself as Ubuntu; otherwise 
	#the user is asked to install curl manually. Takes no arguments; the return 
	#value is not meaningful.
	
	#'which' prints the location of curl when it is in the $PATH, nothing otherwise
	my $curl_location = qx{which curl};
	
	#curl is already available: nothing to do
	return if $curl_location;
	
	#check if current system is Ubuntu/or Ubuntu-based
	my $system_info = qx{uname -a};
	
	if (defined $system_info && $system_info =~ /.+ubuntu.+/i) {
		
		print color ("red"), "curl binary was not found: follow onscreen instructions/input your password for its installation...\n\n", color("reset");
		
		#list form of system() runs the command without going through a shell; 
		#a return value of 0 means the command ran and exited successfully
		if (system("sudo", "apt", "-y", "install", "curl") == 0) {
			
			print "done\n";
		}
		
		#do not claim success when apt/sudo failed or could not be started
		else {
			
			print "curl installation failed: install it manually on your system.\n";
		}
	}
	
	else {
		
		print "curl is not found on this system: install it on your system.\n";
	}
}
############################ SUBROUTINE 8 #######################################################
#This subroutine runs the main processing steps, while running other subroutines to continue the 
#processing pipeline.
sub main {
	
	my $main_formatted_input_file  = $_[0];
	my $main_output_file           = $_[1];
	
	print color ("green"), "Analyzing Input: $main_formatted_input_file file...\n", color("reset");

	#open input file
	open (FH, "$prog_path/data/$main_formatted_input_file") or die "Cannot open file: $main_formatted_input_file $!\n";

	#open output file
	open (FH2, '>', "$prog_path/results/$main_output_file") or die "Cannot open file for writing data: $!\n";
    
	while ($line = <FH>) {
			
		if ($line =~ m/.*(prognosis|prognostic|prognostically|prognosticator|survival|survive|survives|survived|surviving).*/ig) { 
		#if ($line =~ m/.*(progno.+\s?|surviv.+\s?).*/ig) {
			
			$flag = 1; 
			$prog_flag = 1; 
			next; 
		}
		#this conditional activates when the above keywords are not found and only "more..." is found. 
		#Note this conditional implicitly doesn't get executed if both the desired keyword and "more..." are found.
		elsif ($line =~ m/.+(more\.\.\.)/ig) { 	
			
			$flag = 1; 
			$wget_flag = 1; 
			next;
		}
		
		elsif ($line =~ m/(^Organism:\s+Homo\s+sapiens.*)/ig) { 
			
			$simple_hash{'Organism_line'} = "$1";
			$human_flag = 1; 
			next; 
		}
		
		elsif ($line =~ m/^Type:.+/) { next; }
		
		#elsif ($line =~ m/.*Platform.?:\s+GPL(570|96|97)\s+.+/) {
		elsif ($line =~ m/.*Platform.?:\s+GPL($regex_platform)\s+.+/) {	
			
			$i++;
			$flag = 1;
			print "$i.\n$line$simple_hash{'Organism_line'}\n";
			print FH2 "$i.\n$line$simple_hash{'Organism_line'}\n";



( run in 0.593 second using v1.01-cache-2.11-cpan-39bf76dae61 )