Bio-DB-GFF
view release on metacpan or search on metacpan
bin/bp_fast_load_gff view on Meta::CPAN
}
elsif (/^>(\S+)/) { # uh oh, sequence coming
$FH{FASTA} = IO::File->new(">$tmpdir/$1\.fa") or die "FASTA: $!\n";
$FH{FASTA}->print("$_\n");
push @fasta, "$tmpdir/$1\.fa";
push @fasta_files_to_be_unlinked,"$tmpdir/$1\.fa";
print STDERR "Processing embedded sequence $1\n";
next;
}
elsif (/^\#\#\s*group-tags\s+(.+)/) {
$db->preferred_groups(split(/\s+/,$1));
next;
}
elsif (/^\#\#\s*gff-version\s+(\d+)/) {
$gff3 = ($1 >= 3);
$db->print_gff3_warning() if $gff3;
next;
}
elsif (/^\#\#\s*sequence-region\s+(\S+)\s+(\d+)\s+(\d+)/i) { # header line
($ref,$source,$method,$start,$stop,$score,$strand,$phase,$group) =
($1,'reference','Component',$2,$3,'.','.','.',$gff3 ? "ID=Sequence:$1": qq(Sequence "$1"));
}
elsif (/^\#/) {
next;
}
else {
($ref,$source,$method,$start,$stop,$score,$strand,$phase,$group) = split "\t";
}
next unless defined $ref;
$FEATURES++;
warn "Feature $group is larger than $MAX_BIN. You will have trouble retrieving this feature.\nRerun script with --maxfeature set to a higher power of 10.\n" if $stop-$start+1 > $MAX_BIN;
$source = '\N' unless defined $source;
$score = '\N' if $score eq '.';
$strand = '\N' if $strand eq '.';
$phase = '\N' if $phase eq '.';
my ($gclass,$gname,$target_start,$target_stop,$attributes) = $db->split_group($group,$gff3);
# GFF2/3 transition
$gclass = [$gclass] unless ref $gclass;
$gname = [$gname] unless ref $gname;
for (my $i=0; $i < @$gname; $i++) {
my $group_class = $gclass->[$i];
my $group_name = $gname->[$i];
$group_class ||= '\N';
$group_name ||= '\N';
$target_start ||= '\N';
$target_stop ||= '\N';
$method ||= '\N';
$source ||= '\N';
my $fid = $FID++;
my $gid = $GROUPID{lc join($;,$group_class,$group_name)} ||= $GID++;
my $ftypeid = $FTYPEID{lc join($;,$source,$method)} ||= $FTYPEID++;
my $bin = bin($start,$stop,$db->min_bin);
$FH{ FDATA() }->print( join("\t",$fid,$ref,$start,$stop,$bin,$ftypeid,$score,$strand,$phase,$gid,$target_start,$target_stop),"\n" );
$FH{ FGROUP() }->print( join("\t",$gid,$group_class,$group_name),"\n" ) unless $DONE{"fgroup$;$gid"}++;
$FH{ FTYPE() }->print( join("\t",$ftypeid,$method,$source),"\n" ) unless $DONE{"ftype$;$ftypeid"}++;
foreach (@$attributes) {
my ($key,$value) = @$_;
my $attributeid = $ATTRIBUTEID{lc $key} ||= $ATTRIBUTEID++;
$FH{ FATTRIBUTE() }->print( join("\t",$attributeid,$key),"\n" ) unless $DONE{"fattribute$;$attributeid"}++;
$FH{ FATTRIBUTE_TO_FEATURE() }->print( join("\t",$fid,$attributeid,$value),"\n");
}
if ( $FEATURES % 1000 == 0) {
my $now = Time::HiRes::time() if $timer;
my $elapsed = $timer ? sprintf(" in %5.2fs",$now - $last) : '';
$last = $now;
print STDERR "$fid features parsed$elapsed...";
print STDERR -t STDOUT && !$ENV{EMACS} ? "\r" : "\n";
}
}
}
$FH{FASTA}->close if exists $FH{FASTA};
printf STDERR "Feature load time %5.2fs\n",(Time::HiRes::time() - $start) if $timer;
$start = time();
for my $fasta (@fasta) {
warn "Loading fasta ",(-d $fasta?"directory":"file"), " $fasta\n";
my $old = select($FH{FDNA()});
my $loaded = $db->load_fasta($fasta);
warn "$fasta: $loaded records loaded\n";
select $old;
}
printf STDERR "Fasta load time %5.2fs\n",(Time::HiRes::time() - $start) if $timer;
$start = time();
my $success = 1;
if ($DO_FAST) {
warn "Indexing and analyzing tables. This may take some time (you may see database messages during the process)...\n";
}
$_->close foreach values %FH;
if (!$DO_FAST) {
warn "Loading feature data and analyzing tables. You may see database messages here...\n";
$success &&= system($COMMAND{$_}) == 0 foreach @files;
}
# wait for children
while (%PID) {
sleep;
}
$success &&= !$FAILED;
cleanup();
printf STDERR "Total parse & load time %5.2fs\n",(Time::HiRes::time() - $beginning) if $timer;
if ($success) {
print "SUCCESS: $FEATURES features successfully loaded\n";
exit 0;
} else {
print "FAILURE: Please see standard error for details\n";
exit -1;
}
if ($SUMMARY_STATS) {
warn "Building summary statistics for coverage histograms...\n";
$db->build_summary_statistics;
}
exit 0;
sub cleanup {
foreach (@files,@fasta_files_to_be_unlinked) {
unlink "$tmpdir/$_.$$";
}
}
# load copies of some of the tables into memory
sub load_tables {
my $dbh = shift;
print STDERR "loading normalized group, type and attribute information...";
$FID = 1 + get_max_id($dbh,'fdata','fid');
$GID = 1 + get_max_id($dbh,'fgroup','gid');
$FTYPEID = 1 + get_max_id($dbh,'ftype','ftypeid');
$ATTRIBUTEID = 1 + get_max_id($dbh,'fattribute','fattribute_id');
get_ids($dbh,\%DONE,\%GROUPID,'fgroup','gid','gclass','gname');
get_ids($dbh,\%DONE,\%FTYPEID,'ftype','ftypeid','fsource','fmethod');
get_ids($dbh,\%DONE,\%ATTRIBUTEID,'fattribute','fattribute_id','fattribute_name');
print STDERR "ok\n";
}
sub get_max_id {
my $dbh = shift;
my ($table,$id) = @_;
my $sql = "select max($id) from $table";
my $result = $dbh->selectcol_arrayref($sql) or die $dbh->errstr;
$result->[0];
}
sub get_ids {
my $dbh = shift;
my ($done,$idhash,$table,$id,@columns) = @_;
my $columns = join ',',$id,@columns;
my $sql = "select $columns from $table";
my $sth = $dbh->prepare($sql) or die $dbh->errstr;
$sth->execute or die $dbh->errstr;
while (my($id,@cols) = $sth->fetchrow_array) {
my $key = lc join $;,@cols;
$idhash->{$key} = $id;
$done->{$table,$id}++;
}
}
__END__
( run in 1.571 second using v1.01-cache-2.11-cpan-5735350b133 )