Bio-DB-GFF

 view release on metacpan or  search on metacpan

bin/bp_bulk_load_gff  view on Meta::CPAN


my $db = Bio::DB::GFF->new(-adaptor=>$faux_adaptor,-dsn => $DSN,@auth)
  or die "Can't open database: ",Bio::DB::GFF->error,"\n";

$db->gff3_name_munging(1) if $MUNGE;

$MAX_BIN ? $db->initialize(-erase=>1,-MAX_BIN=>$MAX_BIN) : $db->initialize(1);
$MAX_BIN ||= $db->meta('max_bin') || 100_000_000;

# deal with really long lists of files
if ($LONG_LIST) {
  -d $LONG_LIST or die "The --long_list argument must be a directory\n";
  opendir GFFDIR,$LONG_LIST or die "Could not open $LONG_LIST for reading: $!";
  @ARGV = map { "$LONG_LIST\/$_" } readdir GFFDIR;
  closedir GFFDIR;

  if (defined $FASTA && -d $FASTA) {
    opendir FASTA,$FASTA or die "Could not open $FASTA for reading: $!";
    push @ARGV, map { "$FASTA\/$_" } readdir FASTA;
    closedir FASTA;
  }
  elsif (defined $FASTA && -f $FASTA) {
    push @ARGV, $FASTA;
  }
}

foreach (@ARGV) {
  $_ = "gunzip -c $_ |" if /\.gz$/;
  $_ = "uncompress -c $_ |" if /\.Z$/;
  $_ = "bunzip2 -c $_ |" if /\.bz2$/;
}

my (@gff,@fasta);
foreach (@ARGV) {
  if (/\.(fa|fasta|dna|seq|fast)(?:$|\.)/i) {
    push @fasta,$_;
  } else {
    push @gff,$_;
  }
}
@ARGV = @gff;
push @fasta,$FASTA if defined $FASTA;

# drop everything that was there before
my %FH;
my $tmpdir = File::Spec->tmpdir() || '/tmp';
$tmpdir =~ s!\\!\\\\!g if $bWINDOWS; #eliminates backslash mis-interpretation
-d $tmpdir or die <<END;
I could not find a suitable temporary directory to write scratch files into ($tmpdir by default).
Please select a directory and indicate its location by setting the TMP environment variable, or
by using the --Temporary switch.
END
my @fasta_files_to_be_unlinked;
my @files = (FDATA,FTYPE,FGROUP,FDNA,FATTRIBUTE,FATTRIBUTE_TO_FEATURE);
foreach (@files) {
  $FH{$_} = IO::File->new(">$tmpdir/$_.$$") or die $_,": $!";
  $FH{$_}->autoflush;
}

if ( $use_pg ) {
  $FH{FDATA()                }->print("COPY fdata (fid, fref, fstart, fstop, fbin, ftypeid, fscore, fstrand, fphase, gid, ftarget_start, ftarget_stop) FROM stdin;\n");
  $FH{FTYPE()                }->print("COPY ftype (ftypeid, fmethod, fsource) FROM stdin;\n");
  $FH{FGROUP()               }->print("COPY fgroup (gid, gclass, gname) FROM stdin;\n");
  $FH{FATTRIBUTE()           }->print("COPY fattribute (fattribute_id, fattribute_name) FROM stdin;\n");
  $FH{FATTRIBUTE_TO_FEATURE()}->print("COPY fattribute_to_feature (fid, fattribute_id, fattribute_value) FROM stdin;\n");
}
my $FID     = 1;
my $GID     = 1;
my $FTYPEID = 1;
my $ATTRIBUTEID = 1;
my %GROUPID     = ();
my %FTYPEID     = ();
my %ATTRIBUTEID = ();
my %DONE        = ();
my $FEATURES    = 0;

my %tmpfiles; # keep track of temporary fasta files
my $count;
my $fasta_sequence_id;
my $gff3;
my $current_file; #used to reset GFF3 flag in mix of GFF and GFF3 files

$db->preferred_groups(split (/[,\s]+/,$GROUP_TAG)) if defined $GROUP_TAG;

my $last  = Time::HiRes::time() if $timer;
my $start = $last;

  # avoid hanging on standalone --fasta load
if (!@ARGV) {
    $FH{NULL} = IO::File->new(">$tmpdir/null");
    push @ARGV, "$tmpdir/null";
}

my ($cmap_db);
if ($use_mysqlcmap){
  my $options = {
		 AutoCommit       => 1,
		 FetchHashKeyName => 'NAME_lc',
		 LongReadLen      => 3000,
		 LongTruncOk      => 1,
		 RaiseError       => 1,
		};

  $cmap_db = DBI->connect( $DSN, $USER, $PASSWORD, $options );
}
# Only load CMap::Utils if using cmap
unless (!$use_mysqlcmap or
	eval {
	  require Bio::GMOD::CMap::Utils;
	  Bio::GMOD::CMap::Utils->import('next_number');
	  1;
	}
       ) {
  print STDERR "Error loading Bio::GMOD::CMap::Utils\n";
}


while (<>) {

  $current_file ||= $ARGV;

  # reset GFF3 flag if new filehandle
  unless($current_file eq $ARGV){

bin/bp_bulk_load_gff  view on Meta::CPAN

    $FH{FASTA} = IO::File->new(">$tmpdir/$1\.fa") or die "FASTA: $!\n";
    $FH{FASTA}->print("$_\n");
    print STDERR "Preparing embedded sequence $1\n";
    push @fasta, "$tmpdir/$1\.fa";
    push @fasta_files_to_be_unlinked,"$tmpdir/$1\.fa";
    $tmpfiles{"$tmpdir/$1\.fa"}++;
    next;
  }

  elsif (/^\#\#\s*gff-version\s+(\d+)/) {
    $gff3 = ($1 >= 3);
    $db->print_gff3_warning() if $gff3;
    next;
  }

  elsif (/^\#\#\s*group-tags\s+(.+)/) {
    $db->preferred_groups(split(/\s+/,$1));
    next;
  }

  elsif (/^\#\#\s*sequence-region\s+(\S+)\s+(\d+)\s+(\d+)/i) { # header line
    ($ref,$source,$method,$start,$stop,$score,$strand,$phase,$group) =
	($1,'reference','Component',$2,$3,'.','.','.',$gff3 ? "ID=Sequence:$1": qq(Sequence "$1"));
  }

  elsif (/^\#/) {
    next;
  }

  else {
    ($ref,$source,$method,$start,$stop,$score,$strand,$phase,$group) = split "\t";
  }
  if ( not defined( $ref ) or length ($ref) == 0) {
    warn "\$ref is null.  source = $source, method = $method, group = $group\n";
    next;
  }
  $FEATURES++;
  my $size = $stop-$start+1;
  warn "Feature $group ($size) is larger than $MAX_BIN. You will have trouble retrieving this feature.\nRerun script with --maxfeature set to a higher power of 10.\n" if $size > $MAX_BIN;

  $source = '\N' unless defined $source;
  $score  = '\N' if $score  eq '.';
  $strand = '\N' if $strand eq '.';
  $phase  = '\N' if $phase  eq '.';

  my ($group_class,$group_name,$target_start,$target_stop,$attributes) = $db->split_group($group,$gff3);

  # GFF2/3 transition
  $group_class = [$group_class] unless ref $group_class;
  $group_name  = [$group_name]  unless ref $group_name;

  for (my $i=0; $i < @$group_name; $i++) {
    $group_class->[$i]  ||= '\N';
    $group_name->[$i]   ||= '\N';
    $target_start ||= '\N';
    $target_stop  ||= '\N';
    $method       ||= '\N';
    $source       ||= '\N';

    my $fid     = $FID++;
    my $gid     = $GROUPID{lc join('',$group_class->[$i],$group_name->[$i])}  ||= $GID++;
    my $ftypeid = $FTYPEID{lc join('',$source,$method)}                       ||= $FTYPEID++;

    my $bin = bin($start,$stop,$db->min_bin);
    $FH{ FDATA()  }->print(    join("\t",$fid,$ref,$start,$stop,$bin,$ftypeid,$score,$strand,$phase,$gid,$target_start,$target_stop),"\n"   );
    if ($use_mysqlcmap){
      my $feature_id    = next_number(
				      db         => $cmap_db,
				      table_name => 'cmap_feature',
				      id_field   => 'feature_id',
				     )
	or die 'No feature id';
      my $direction = $strand eq '-' ? -1:1;
      $FH{ FGROUP() }->print(
			     join("\t",$feature_id,$feature_id,'NULL',0, $group_name->[$i],0,0,'NULL',1,$direction, $group_class->[$i],)
			     ,"\n"
			    ) unless $DONE{"G$gid"}++;
    }
    else {
      $FH{ FGROUP() }->print(    join("\t",$gid,$group_class->[$i],$group_name->[$i]),"\n") unless $DONE{"G$gid"}++;
    }
    $FH{ FTYPE()  }->print(    join("\t",$ftypeid,$method,$source),"\n"                   ) unless $DONE{"T$ftypeid"}++;

    foreach (@$attributes) {
      my ($key,$value) = @$_;
      my $attributeid = $ATTRIBUTEID{$key}   ||= $ATTRIBUTEID++;
      $FH{ FATTRIBUTE() }->print( join("\t",$attributeid,$key),"\n"                       ) unless $DONE{"A$attributeid"}++;
      $FH{ FATTRIBUTE_TO_FEATURE() }->print( join("\t",$fid,$attributeid,$value),"\n");
    }

    if ( $fid % 1000 == 0) {
      my $now    = Time::HiRes::time() if $timer;
      my $elapsed = $timer ? sprintf(" in %5.2fs",$now - $last) : '';
      $last = $now;
      print STDERR "$fid features parsed$elapsed...";
      print STDERR -t STDOUT && !$ENV{EMACS} ? "\r" : "\n";
    }
  }
}

$FH{FASTA}->close if exists $FH{FASTA};

for my $file (@fasta) {
  warn "Preparing DNA file $file....\n";
  if ($use_pg){
    $FH{FDNA() }->print("COPY fdna (fref, foffset, fdna) FROM stdin;\n");
  }
  my $old = select($FH{FDNA()});
  $db->load_fasta($file) or warn "Couldn't load fasta file $file: $!";
  if ($use_pg){
    $FH{FDNA() }->print("\\.\n\n");
  }
  warn "done...\n";
  select $old;
  unlink $file if $tmpfiles{$file};
}

if ($use_pg) {
  $FH{FDATA()                }->print("\\.\n\n");
  $FH{FTYPE()                }->print("\\.\n\n");
  $FH{FGROUP()               }->print("\\.\n\n");
  $FH{FATTRIBUTE()           }->print("\\.\n\n");
  $FH{FATTRIBUTE_TO_FEATURE()}->print("\\.\n\n");
}


$_->close foreach values %FH;
printf STDERR "Total parse time %5.2fs\n",(Time::HiRes::time() - $start) if $timer;
warn "Loading feature data and analyzing tables.  You may see RDBMS messages here...\n";

if ($use_pg){
  warn "Loading feature data.  You may see Postgres comments...\n";

  foreach (@files) {
    my $file = "$tmpdir/$_.$$";

    $AUTH ? system("psql $AUTH -f $file $DBNAME")
          : system('psql','-f', $file, $DBNAME);

    unlink $file;



( run in 2.065 seconds using v1.01-cache-2.11-cpan-5735350b133 )