Bio-DB-GFF

 view release on metacpan or  search on metacpan

bin/bp_fast_load_gff  view on Meta::CPAN

  }

  elsif (/^>(\S+)/) {  # uh oh, sequence coming
      $FH{FASTA} = IO::File->new(">$tmpdir/$1\.fa") or die "FASTA: $!\n";
      $FH{FASTA}->print("$_\n");
      push @fasta, "$tmpdir/$1\.fa";
      push @fasta_files_to_be_unlinked,"$tmpdir/$1\.fa";
      print STDERR "Processing embedded sequence $1\n";
      next;
  }

  elsif (/^\#\#\s*group-tags\s+(.+)/) {
    $db->preferred_groups(split(/\s+/,$1));
    next;
  }

  elsif (/^\#\#\s*gff-version\s+(\d+)/) {
    $gff3 = ($1 >= 3);
    $db->print_gff3_warning() if $gff3;
    next;
  }

  elsif (/^\#\#\s*sequence-region\s+(\S+)\s+(\d+)\s+(\d+)/i) { # header line
    ($ref,$source,$method,$start,$stop,$score,$strand,$phase,$group) = 
      ($1,'reference','Component',$2,$3,'.','.','.',$gff3 ? "ID=Sequence:$1": qq(Sequence "$1"));
  }

  elsif (/^\#/) {
    next;
  }

  else {
    ($ref,$source,$method,$start,$stop,$score,$strand,$phase,$group) = split "\t";
  }
  next unless defined $ref;
  $FEATURES++;

  warn "Feature $group is larger than $MAX_BIN. You will have trouble retrieving this feature.\nRerun script with --maxfeature set to a higher power of 10.\n" if $stop-$start+1 > $MAX_BIN;

  $source = '\N' unless defined $source;
  $score  = '\N' if $score  eq '.';
  $strand = '\N' if $strand eq '.';
  $phase  = '\N' if $phase  eq '.';

  my ($gclass,$gname,$target_start,$target_stop,$attributes) = $db->split_group($group,$gff3);
  # GFF2/3 transition
  $gclass = [$gclass] unless ref $gclass;
  $gname  = [$gname]  unless ref $gname;

  for (my $i=0; $i < @$gname; $i++) {
    my $group_class = $gclass->[$i];
    my $group_name  = $gname->[$i];
    $group_class  ||= '\N';
    $group_name   ||= '\N';
    $target_start ||= '\N';
    $target_stop  ||= '\N';
    $method       ||= '\N';
    $source       ||= '\N';

    my $fid     = $FID++;
    my $gid     = $GROUPID{lc join($;,$group_class,$group_name)} ||= $GID++;
    my $ftypeid = $FTYPEID{lc join($;,$source,$method)}          ||= $FTYPEID++;

    my $bin = bin($start,$stop,$db->min_bin);
    $FH{ FDATA()  }->print(    join("\t",$fid,$ref,$start,$stop,$bin,$ftypeid,$score,$strand,$phase,$gid,$target_start,$target_stop),"\n"   );
    $FH{ FGROUP() }->print(    join("\t",$gid,$group_class,$group_name),"\n"              ) unless $DONE{"fgroup$;$gid"}++;
    $FH{ FTYPE()  }->print(    join("\t",$ftypeid,$method,$source),"\n"                   ) unless $DONE{"ftype$;$ftypeid"}++;

    foreach (@$attributes) {
      my ($key,$value) = @$_;
      my $attributeid = $ATTRIBUTEID{lc $key}   ||= $ATTRIBUTEID++;
      $FH{ FATTRIBUTE() }->print( join("\t",$attributeid,$key),"\n"                       ) unless $DONE{"fattribute$;$attributeid"}++;
      $FH{ FATTRIBUTE_TO_FEATURE() }->print( join("\t",$fid,$attributeid,$value),"\n");
    }

    if ( $FEATURES % 1000 == 0) {
      my $now    = Time::HiRes::time() if $timer;
      my $elapsed = $timer ? sprintf(" in %5.2fs",$now - $last) : '';
      $last = $now;
      print STDERR "$fid features parsed$elapsed...";
      print STDERR -t STDOUT && !$ENV{EMACS} ? "\r" : "\n";
    }
  }
}

$FH{FASTA}->close if exists $FH{FASTA};

printf STDERR "Feature load time %5.2fs\n",(Time::HiRes::time() - $start) if $timer;
$start = time();

for my $fasta (@fasta) {
  warn "Loading fasta ",(-d $fasta?"directory":"file"), " $fasta\n";
  my $old = select($FH{FDNA()});
  my $loaded = $db->load_fasta($fasta);
  warn "$fasta: $loaded records loaded\n";
  select $old;
}

printf STDERR "Fasta load time %5.2fs\n",(Time::HiRes::time() - $start) if $timer;
$start = time();

my $success = 1;
if ($DO_FAST) {
  warn "Indexing and analyzing tables.  This may take some time (you may see database messages during the process)...\n";
}

$_->close foreach values %FH;

if (!$DO_FAST) {
  warn "Loading feature data and analyzing tables.  You may see database messages here...\n";
  $success &&= system($COMMAND{$_}) == 0 foreach @files;
}

# wait for children
while (%PID) {
  sleep;
}
$success &&= !$FAILED;

cleanup();

printf STDERR "Total parse & load time %5.2fs\n",(Time::HiRes::time() - $beginning) if $timer;

if ($success) {
  print "SUCCESS: $FEATURES features successfully loaded\n";
  exit 0;
} else {
  print "FAILURE: Please see standard error for details\n";
  exit -1;
}

if ($SUMMARY_STATS) {
    warn "Building summary statistics for coverage histograms...\n";
    $db->build_summary_statistics;
}

exit 0;

sub cleanup {
  foreach (@files,@fasta_files_to_be_unlinked) {
    unlink "$tmpdir/$_.$$";
  }
}

# load copies of some of the tables into memory
sub load_tables {
  my $dbh = shift;
  print STDERR "loading normalized group, type and attribute information...";
  $FID         = 1 + get_max_id($dbh,'fdata','fid');
  $GID         = 1 + get_max_id($dbh,'fgroup','gid');
  $FTYPEID     = 1 + get_max_id($dbh,'ftype','ftypeid');
  $ATTRIBUTEID = 1 + get_max_id($dbh,'fattribute','fattribute_id');
  get_ids($dbh,\%DONE,\%GROUPID,'fgroup','gid','gclass','gname');
  get_ids($dbh,\%DONE,\%FTYPEID,'ftype','ftypeid','fsource','fmethod');
  get_ids($dbh,\%DONE,\%ATTRIBUTEID,'fattribute','fattribute_id','fattribute_name');
  print STDERR "ok\n";
}

sub get_max_id {
  my $dbh = shift;
  my ($table,$id) = @_;
  my $sql = "select max($id) from $table";
  my $result = $dbh->selectcol_arrayref($sql) or die $dbh->errstr;
  $result->[0];
}

sub get_ids {
  my $dbh = shift;
  my ($done,$idhash,$table,$id,@columns) = @_;
  my $columns = join ',',$id,@columns;
  my $sql = "select $columns from $table";
  my $sth = $dbh->prepare($sql) or die $dbh->errstr;
  $sth->execute or die $dbh->errstr;
  while (my($id,@cols) = $sth->fetchrow_array) {
    my $key = lc join $;,@cols;
    $idhash->{$key} = $id;
    $done->{$table,$id}++;
  }
}

__END__



( run in 1.571 second using v1.01-cache-2.11-cpan-5735350b133 )