Bio-DB-GFF
view release on metacpan or search on metacpan
bin/bp_bulk_load_gff view on Meta::CPAN
my $db = Bio::DB::GFF->new(-adaptor=>$faux_adaptor,-dsn => $DSN,@auth)
or die "Can't open database: ",Bio::DB::GFF->error,"\n";
$db->gff3_name_munging(1) if $MUNGE;
$MAX_BIN ? $db->initialize(-erase=>1,-MAX_BIN=>$MAX_BIN) : $db->initialize(1);
$MAX_BIN ||= $db->meta('max_bin') || 100_000_000;
# deal with really long lists of files
if ($LONG_LIST) {
-d $LONG_LIST or die "The --long_list argument must be a directory\n";
opendir GFFDIR,$LONG_LIST or die "Could not open $LONG_LIST for reading: $!";
@ARGV = map { "$LONG_LIST\/$_" } readdir GFFDIR;
closedir GFFDIR;
if (defined $FASTA && -d $FASTA) {
opendir FASTA,$FASTA or die "Could not open $FASTA for reading: $!";
push @ARGV, map { "$FASTA\/$_" } readdir FASTA;
closedir FASTA;
}
elsif (defined $FASTA && -f $FASTA) {
push @ARGV, $FASTA;
}
}
foreach (@ARGV) {
$_ = "gunzip -c $_ |" if /\.gz$/;
$_ = "uncompress -c $_ |" if /\.Z$/;
$_ = "bunzip2 -c $_ |" if /\.bz2$/;
}
my (@gff,@fasta);
foreach (@ARGV) {
if (/\.(fa|fasta|dna|seq|fast)(?:$|\.)/i) {
push @fasta,$_;
} else {
push @gff,$_;
}
}
@ARGV = @gff;
push @fasta,$FASTA if defined $FASTA;
# drop everything that was there before
my %FH;
my $tmpdir = File::Spec->tmpdir() || '/tmp';
$tmpdir =~ s!\\!\\\\!g if $bWINDOWS; #eliminates backslash mis-interpretation
-d $tmpdir or die <<END;
I could not find a suitable temporary directory to write scratch files into ($tmpdir by default).
Please select a directory and indicate its location by setting the TMP environment variable, or
by using the --Temporary switch.
END
my @fasta_files_to_be_unlinked;
my @files = (FDATA,FTYPE,FGROUP,FDNA,FATTRIBUTE,FATTRIBUTE_TO_FEATURE);
foreach (@files) {
$FH{$_} = IO::File->new(">$tmpdir/$_.$$") or die $_,": $!";
$FH{$_}->autoflush;
}
if ( $use_pg ) {
$FH{FDATA() }->print("COPY fdata (fid, fref, fstart, fstop, fbin, ftypeid, fscore, fstrand, fphase, gid, ftarget_start, ftarget_stop) FROM stdin;\n");
$FH{FTYPE() }->print("COPY ftype (ftypeid, fmethod, fsource) FROM stdin;\n");
$FH{FGROUP() }->print("COPY fgroup (gid, gclass, gname) FROM stdin;\n");
$FH{FATTRIBUTE() }->print("COPY fattribute (fattribute_id, fattribute_name) FROM stdin;\n");
$FH{FATTRIBUTE_TO_FEATURE()}->print("COPY fattribute_to_feature (fid, fattribute_id, fattribute_value) FROM stdin;\n");
}
my $FID = 1;
my $GID = 1;
my $FTYPEID = 1;
my $ATTRIBUTEID = 1;
my %GROUPID = ();
my %FTYPEID = ();
my %ATTRIBUTEID = ();
my %DONE = ();
my $FEATURES = 0;
my %tmpfiles; # keep track of temporary fasta files
my $count;
my $fasta_sequence_id;
my $gff3;
my $current_file; #used to reset GFF3 flag in mix of GFF and GFF3 files
$db->preferred_groups(split (/[,\s]+/,$GROUP_TAG)) if defined $GROUP_TAG;
my $last = Time::HiRes::time() if $timer;
my $start = $last;
# avoid hanging on standalone --fasta load
if (!@ARGV) {
$FH{NULL} = IO::File->new(">$tmpdir/null");
push @ARGV, "$tmpdir/null";
}
my ($cmap_db);
if ($use_mysqlcmap){
my $options = {
AutoCommit => 1,
FetchHashKeyName => 'NAME_lc',
LongReadLen => 3000,
LongTruncOk => 1,
RaiseError => 1,
};
$cmap_db = DBI->connect( $DSN, $USER, $PASSWORD, $options );
}
# Only load CMap::Utils if using cmap
unless (!$use_mysqlcmap or
eval {
require Bio::GMOD::CMap::Utils;
Bio::GMOD::CMap::Utils->import('next_number');
1;
}
) {
print STDERR "Error loading Bio::GMOD::CMap::Utils\n";
}
while (<>) {
$current_file ||= $ARGV;
# reset GFF3 flag if new filehandle
unless($current_file eq $ARGV){
bin/bp_bulk_load_gff view on Meta::CPAN
$FH{FASTA} = IO::File->new(">$tmpdir/$1\.fa") or die "FASTA: $!\n";
$FH{FASTA}->print("$_\n");
print STDERR "Preparing embedded sequence $1\n";
push @fasta, "$tmpdir/$1\.fa";
push @fasta_files_to_be_unlinked,"$tmpdir/$1\.fa";
$tmpfiles{"$tmpdir/$1\.fa"}++;
next;
}
elsif (/^\#\#\s*gff-version\s+(\d+)/) {
$gff3 = ($1 >= 3);
$db->print_gff3_warning() if $gff3;
next;
}
elsif (/^\#\#\s*group-tags\s+(.+)/) {
$db->preferred_groups(split(/\s+/,$1));
next;
}
elsif (/^\#\#\s*sequence-region\s+(\S+)\s+(\d+)\s+(\d+)/i) { # header line
($ref,$source,$method,$start,$stop,$score,$strand,$phase,$group) =
($1,'reference','Component',$2,$3,'.','.','.',$gff3 ? "ID=Sequence:$1": qq(Sequence "$1"));
}
elsif (/^\#/) {
next;
}
else {
($ref,$source,$method,$start,$stop,$score,$strand,$phase,$group) = split "\t";
}
if ( not defined( $ref ) or length ($ref) == 0) {
warn "\$ref is null. source = $source, method = $method, group = $group\n";
next;
}
$FEATURES++;
my $size = $stop-$start+1;
warn "Feature $group ($size) is larger than $MAX_BIN. You will have trouble retrieving this feature.\nRerun script with --maxfeature set to a higher power of 10.\n" if $size > $MAX_BIN;
$source = '\N' unless defined $source;
$score = '\N' if $score eq '.';
$strand = '\N' if $strand eq '.';
$phase = '\N' if $phase eq '.';
my ($group_class,$group_name,$target_start,$target_stop,$attributes) = $db->split_group($group,$gff3);
# GFF2/3 transition
$group_class = [$group_class] unless ref $group_class;
$group_name = [$group_name] unless ref $group_name;
for (my $i=0; $i < @$group_name; $i++) {
$group_class->[$i] ||= '\N';
$group_name->[$i] ||= '\N';
$target_start ||= '\N';
$target_stop ||= '\N';
$method ||= '\N';
$source ||= '\N';
my $fid = $FID++;
my $gid = $GROUPID{lc join('',$group_class->[$i],$group_name->[$i])} ||= $GID++;
my $ftypeid = $FTYPEID{lc join('',$source,$method)} ||= $FTYPEID++;
my $bin = bin($start,$stop,$db->min_bin);
$FH{ FDATA() }->print( join("\t",$fid,$ref,$start,$stop,$bin,$ftypeid,$score,$strand,$phase,$gid,$target_start,$target_stop),"\n" );
if ($use_mysqlcmap){
my $feature_id = next_number(
db => $cmap_db,
table_name => 'cmap_feature',
id_field => 'feature_id',
)
or die 'No feature id';
my $direction = $strand eq '-' ? -1:1;
$FH{ FGROUP() }->print(
join("\t",$feature_id,$feature_id,'NULL',0, $group_name->[$i],0,0,'NULL',1,$direction, $group_class->[$i],)
,"\n"
) unless $DONE{"G$gid"}++;
}
else {
$FH{ FGROUP() }->print( join("\t",$gid,$group_class->[$i],$group_name->[$i]),"\n") unless $DONE{"G$gid"}++;
}
$FH{ FTYPE() }->print( join("\t",$ftypeid,$method,$source),"\n" ) unless $DONE{"T$ftypeid"}++;
foreach (@$attributes) {
my ($key,$value) = @$_;
my $attributeid = $ATTRIBUTEID{$key} ||= $ATTRIBUTEID++;
$FH{ FATTRIBUTE() }->print( join("\t",$attributeid,$key),"\n" ) unless $DONE{"A$attributeid"}++;
$FH{ FATTRIBUTE_TO_FEATURE() }->print( join("\t",$fid,$attributeid,$value),"\n");
}
if ( $fid % 1000 == 0) {
my $now = Time::HiRes::time() if $timer;
my $elapsed = $timer ? sprintf(" in %5.2fs",$now - $last) : '';
$last = $now;
print STDERR "$fid features parsed$elapsed...";
print STDERR -t STDOUT && !$ENV{EMACS} ? "\r" : "\n";
}
}
}
$FH{FASTA}->close if exists $FH{FASTA};
for my $file (@fasta) {
warn "Preparing DNA file $file....\n";
if ($use_pg){
$FH{FDNA() }->print("COPY fdna (fref, foffset, fdna) FROM stdin;\n");
}
my $old = select($FH{FDNA()});
$db->load_fasta($file) or warn "Couldn't load fasta file $file: $!";
if ($use_pg){
$FH{FDNA() }->print("\\.\n\n");
}
warn "done...\n";
select $old;
unlink $file if $tmpfiles{$file};
}
if ($use_pg) {
$FH{FDATA() }->print("\\.\n\n");
$FH{FTYPE() }->print("\\.\n\n");
$FH{FGROUP() }->print("\\.\n\n");
$FH{FATTRIBUTE() }->print("\\.\n\n");
$FH{FATTRIBUTE_TO_FEATURE()}->print("\\.\n\n");
}
$_->close foreach values %FH;
printf STDERR "Total parse time %5.2fs\n",(Time::HiRes::time() - $start) if $timer;
warn "Loading feature data and analyzing tables. You may see RDBMS messages here...\n";
if ($use_pg){
warn "Loading feature data. You may see Postgres comments...\n";
foreach (@files) {
my $file = "$tmpdir/$_.$$";
$AUTH ? system("psql $AUTH -f $file $DBNAME")
: system('psql','-f', $file, $DBNAME);
unlink $file;
( run in 2.065 seconds using v1.01-cache-2.11-cpan-5735350b133 )