BioPerl
view release on metacpan or search on metacpan
scripts/Bio-DB-GFF/bp_fast_load_gff.pl view on Meta::CPAN
If the list of GFF or fasta files exceeds the kernel limit for the
maximum number of command-line arguments, use the
--long_list /path/to/files option.
The adaptor used is dbi::mysqlopt. There is currently no way to
change this.
=head1 COMMAND-LINE OPTIONS
Command-line options can be abbreviated to single-letter options.
e.g. -d instead of --database.
--database <dsn> Mysql database name
--create Reinitialize/create data tables without asking
--local Try to load a remote database using local data.
--user Username to log in as
--fasta File or directory containing fasta files to load
--password Password to use for authentication
--long_list Directory containing a very large number of
GFF and/or FASTA files
--maxfeature Set the value of the maximum feature size (default 100Mb; must be a power of 10)
--group A list of one or more tag names (comma or space separated)
to be used for grouping in the 9th column.
--gff3_munge Activate GFF3 name munging (see Bio::DB::GFF)
--summary Generate summary statistics for drawing coverage histograms.
This can be run on a previously loaded database or during
the load.
--Temporary Location of a writable scratch directory
=head1 SEE ALSO
L<Bio::DB::GFF>, L<bulk_load_gff.pl>, L<load_gff.pl>
=head1 AUTHOR
Lincoln Stein, lstein@cshl.org
Copyright (c) 2002 Cold Spring Harbor Laboratory
This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself. See DISCLAIMER.txt for
disclaimers of warranty.
=cut
package Bio::DB::GFF::Adaptor::faux;
use Bio::DB::GFF::Adaptor::dbi::mysqlopt;
use vars '@ISA';
@ISA = 'Bio::DB::GFF::Adaptor::dbi::mysqlopt';
sub insert_sequence {
my $self = shift;
my ($id,$offset,$seq) = @_;
print join "\t",$id,$offset,$seq,"\n";
}
package main;
eval "use Time::HiRes"; undef $@;
my $timer = defined &Time::HiRes::time;
my ($DSN,$CREATE,$USER,$PASSWORD,$FASTA,$FAILED,$LOCAL,%PID,$MAX_BIN,$GROUP_TAG,$LONG_LIST,$MUNGE,$TMPDIR,$SUMMARY_STATS);
if ($DO_FAST) {
$SIG{CHLD} = sub {
while ((my $child = waitpid(-1,&WNOHANG)) > 0) {
delete $PID{$child} or next;
$FAILED++ if $? != 0;
}
}
};
$SIG{INT} = $SIG{TERM} = sub {cleanup(); exit -1};
GetOptions ('database:s' => \$DSN,
'create' => \$CREATE,
'user:s' => \$USER,
'local' => \$LOCAL,
'password:s' => \$PASSWORD,
'fasta:s' => \$FASTA,
'group:s' => \$GROUP_TAG,
'long_list:s' => \$LONG_LIST,
'maxbin|maxfeature:s' => \$MAX_BIN,
'gff3_munge' => \$MUNGE,
'summary' => \$SUMMARY_STATS,
'Temporary:s' => \$TMPDIR,
) or (system('pod2text',$0), exit -1);
$DSN ||= 'test';
$MAX_BIN ||= 1_000_000_000; # to accomodate human-sized chromosomes
my (@args,$AUTH);
if (defined $USER) {
push @args,(-user=>$USER);
$AUTH .= " -u$USER";
}
if (defined $PASSWORD) {
push @args,(-pass=>$PASSWORD);
$AUTH .= " -p$PASSWORD";
}
push @args,(-preferred_groups=>[split(/[,\s+]+/,$GROUP_TAG)]) if defined $GROUP_TAG;
my $db = Bio::DB::GFF->new(-adaptor=>'faux',-dsn => $DSN,@args)
or die "Can't open database: ",Bio::DB::GFF->error,"\n";
$db->gff3_name_munging(1) if $MUNGE;
if ($CREATE) {
$SUMMARY_STATS++;
$MAX_BIN ? $db->initialize(-erase=>1,-MAX_BIN=>$MAX_BIN) : $db->initialize(1);
}
$MAX_BIN ||= $db->meta('max_bin') || 100_000_000;
# deal with really long lists of files
if ($LONG_LIST) {
-d $LONG_LIST or die "The --long_list argument must be a directory\n";
opendir GFFDIR,$LONG_LIST or die "Could not open $LONG_LIST for reading: $!";
@ARGV = map { "$LONG_LIST\/$_" } readdir GFFDIR;
closedir GFFDIR;
scripts/Bio-DB-GFF/bp_fast_load_gff.pl view on Meta::CPAN
my %DONE = ();
my $FEATURES = 0;
load_tables($db->dbh) unless $CREATE;
my ($major,$minor,$sub) = split /\./,$db->dbh->get_info(18); # SQL_DBMS_VER
my $can_disable_indexes = ($major >= 4 and $minor >= 0);
# open up pipes to the database
my (%FH,%COMMAND);
my $MYSQL = MYSQL;
my $tmpdir = $TMPDIR || $ENV{TMPDIR} || $ENV{TMP} || File::Spec->tmpdir();
-d $tmpdir or die <<END;
I could not find a suitable temporary directory to write scratch files into ($tmpdir by default).
Please select a directory and indicate its location by setting the TMP environment variable, or
by using the --Temporary switch.
END
my @fasta_files_to_be_unlinked;
my @files = (FDATA,FTYPE,FGROUP,FDNA,FATTRIBUTE,FATTRIBUTE_TO_FEATURE);
foreach (@files) {
my $file = "$tmpdir/$_.$$";
print STDERR "creating load file $file...";
$DO_FAST &&= (system("mkfifo $file") == 0); # for system(), 0 = success
print STDERR "ok\n";
my $delete = $CREATE ? "delete from $_" : '';
my $local = $LOCAL ? 'local' : '';
my $analyze = "analyze table $_";
my $command =<<END;
$MYSQL $AUTH
-N
-s
-e "lock tables $_ write; $delete; load data $local infile '$file' replace into table $_; unlock tables; $analyze"
$DSN
END
;
$command =~ s/\n/ /g;
$COMMAND{$_} = $command;
if ($DO_FAST) {
if (my $pid = fork) {
$PID{$pid} = $_;
print STDERR "pausing for 0.5 sec..." if $DO_FAST;
select(undef,undef,undef,0.50); # work around a race condition
print STDERR "ok\n";
} else { # THIS IS IN CHILD PROCESS
die "Couldn't fork: $!" unless defined $pid;
exec $command || die "Couldn't exec: $!";
exit 0;
}
}
print STDERR "opening load file for writing...";
$FH{$_} = IO::File->new($file,'>') or die $_,": $!";
print STDERR "ok\n";
$FH{$_}->autoflush;
}
print STDERR "Fast loading enabled\n" if $DO_FAST;
my ($count,$gff3,$last,$start,$beginning,$current_file);
$last = Time::HiRes::time() if $timer;
$beginning = $start = $last;
# avoid hanging on standalone --fasta load
if (!@ARGV) {
$FH{NULL} = IO::File->new(">$tmpdir/null");
push @ARGV, "$tmpdir/null";
}
while (<>) {
# reset GFF3 flag if new filehandle
$current_file ||= $ARGV;
unless ($current_file eq $ARGV) {
undef $gff3;
$current_file = $ARGV;
}
chomp;
my ($ref,$source,$method,$start,$stop,$score,$strand,$phase,$group);
# close sequence filehandle if required
if ( /^\#|\s+|^$|^>|\t/ && defined $FH{FASTA}) {
$FH{FASTA}->close;
delete $FH{FASTA};
}
# print to fasta file if the handle is open
if ( defined $FH{FASTA} ) {
$FH{FASTA}->print("$_\n");
next;
}
elsif (/^>(\S+)/) { # uh oh, sequence coming
$FH{FASTA} = IO::File->new(">$tmpdir/$1\.fa") or die "FASTA: $!\n";
$FH{FASTA}->print("$_\n");
push @fasta, "$tmpdir/$1\.fa";
push @fasta_files_to_be_unlinked,"$tmpdir/$1\.fa";
print STDERR "Processing embedded sequence $1\n";
next;
}
elsif (/^\#\#\s*group-tags\s+(.+)/) {
$db->preferred_groups(split(/\s+/,$1));
next;
}
elsif (/^\#\#\s*gff-version\s+(\d+)/) {
$gff3 = ($1 >= 3);
$db->print_gff3_warning() if $gff3;
next;
}
elsif (/^\#\#\s*sequence-region\s+(\S+)\s+(\d+)\s+(\d+)/i) { # header line
($ref,$source,$method,$start,$stop,$score,$strand,$phase,$group) =
($1,'reference','Component',$2,$3,'.','.','.',$gff3 ? "ID=Sequence:$1": qq(Sequence "$1"));
}
elsif (/^\#/) {
next;
}
else {
($ref,$source,$method,$start,$stop,$score,$strand,$phase,$group) = split "\t";
}
next unless defined $ref;
$FEATURES++;
warn "Feature $group is larger than $MAX_BIN. You will have trouble retrieving this feature.\nRerun script with --maxfeature set to a higher power of 10.\n" if $stop-$start+1 > $MAX_BIN;
$source = '\N' unless defined $source;
$score = '\N' if $score eq '.';
$strand = '\N' if $strand eq '.';
$phase = '\N' if $phase eq '.';
my ($gclass,$gname,$target_start,$target_stop,$attributes) = $db->split_group($group,$gff3);
# GFF2/3 transition
$gclass = [$gclass] unless ref $gclass;
$gname = [$gname] unless ref $gname;
for (my $i=0; $i < @$gname; $i++) {
my $group_class = $gclass->[$i];
my $group_name = $gname->[$i];
$group_class ||= '\N';
$group_name ||= '\N';
$target_start ||= '\N';
$target_stop ||= '\N';
$method ||= '\N';
$source ||= '\N';
my $fid = $FID++;
my $gid = $GROUPID{lc join($;,$group_class,$group_name)} ||= $GID++;
my $ftypeid = $FTYPEID{lc join($;,$source,$method)} ||= $FTYPEID++;
my $bin = bin($start,$stop,$db->min_bin);
$FH{ FDATA() }->print( join("\t",$fid,$ref,$start,$stop,$bin,$ftypeid,$score,$strand,$phase,$gid,$target_start,$target_stop),"\n" );
$FH{ FGROUP() }->print( join("\t",$gid,$group_class,$group_name),"\n" ) unless $DONE{"fgroup$;$gid"}++;
$FH{ FTYPE() }->print( join("\t",$ftypeid,$method,$source),"\n" ) unless $DONE{"ftype$;$ftypeid"}++;
foreach (@$attributes) {
my ($key,$value) = @$_;
my $attributeid = $ATTRIBUTEID{lc $key} ||= $ATTRIBUTEID++;
$FH{ FATTRIBUTE() }->print( join("\t",$attributeid,$key),"\n" ) unless $DONE{"fattribute$;$attributeid"}++;
$FH{ FATTRIBUTE_TO_FEATURE() }->print( join("\t",$fid,$attributeid,$value),"\n");
}
if ( $FEATURES % 1000 == 0) {
my $now = Time::HiRes::time() if $timer;
my $elapsed = $timer ? sprintf(" in %5.2fs",$now - $last) : '';
$last = $now;
print STDERR "$fid features parsed$elapsed...";
print STDERR -t STDOUT && !$ENV{EMACS} ? "\r" : "\n";
}
}
}
$FH{FASTA}->close if exists $FH{FASTA};
printf STDERR "Feature load time %5.2fs\n",(Time::HiRes::time() - $start) if $timer;
$start = time();
for my $fasta (@fasta) {
warn "Loading fasta ",(-d $fasta?"directory":"file"), " $fasta\n";
my $old = select($FH{FDNA()});
my $loaded = $db->load_fasta($fasta);
warn "$fasta: $loaded records loaded\n";
select $old;
}
printf STDERR "Fasta load time %5.2fs\n",(Time::HiRes::time() - $start) if $timer;
$start = time();
my $success = 1;
if ($DO_FAST) {
warn "Indexing and analyzing tables. This may take some time (you may see database messages during the process)...\n";
}
$_->close foreach values %FH;
if (!$DO_FAST) {
warn "Loading feature data and analyzing tables. You may see database messages here...\n";
$success &&= system($COMMAND{$_}) == 0 foreach @files;
}
# wait for children
while (%PID) {
sleep;
}
$success &&= !$FAILED;
cleanup();
printf STDERR "Total parse & load time %5.2fs\n",(Time::HiRes::time() - $beginning) if $timer;
if ($success) {
print "SUCCESS: $FEATURES features successfully loaded\n";
exit 0;
} else {
print "FAILURE: Please see standard error for details\n";
exit -1;
}
if ($SUMMARY_STATS) {
warn "Building summary statistics for coverage histograms...\n";
$db->build_summary_statistics;
}
exit 0;
sub cleanup {
foreach (@files,@fasta_files_to_be_unlinked) {
unlink "$tmpdir/$_.$$";
}
}
# load copies of some of the tables into memory
sub load_tables {
my $dbh = shift;
print STDERR "loading normalized group, type and attribute information...";
$FID = 1 + get_max_id($dbh,'fdata','fid');
$GID = 1 + get_max_id($dbh,'fgroup','gid');
$FTYPEID = 1 + get_max_id($dbh,'ftype','ftypeid');
$ATTRIBUTEID = 1 + get_max_id($dbh,'fattribute','fattribute_id');
get_ids($dbh,\%DONE,\%GROUPID,'fgroup','gid','gclass','gname');
get_ids($dbh,\%DONE,\%FTYPEID,'ftype','ftypeid','fsource','fmethod');
get_ids($dbh,\%DONE,\%ATTRIBUTEID,'fattribute','fattribute_id','fattribute_name');
print STDERR "ok\n";
}
sub get_max_id {
my $dbh = shift;
my ($table,$id) = @_;
my $sql = "select max($id) from $table";
my $result = $dbh->selectcol_arrayref($sql) or die $dbh->errstr;
$result->[0];
}
sub get_ids {
my $dbh = shift;
my ($done,$idhash,$table,$id,@columns) = @_;
my $columns = join ',',$id,@columns;
my $sql = "select $columns from $table";
my $sth = $dbh->prepare($sql) or die $dbh->errstr;
$sth->execute or die $dbh->errstr;
while (my($id,@cols) = $sth->fetchrow_array) {
my $key = lc join $;,@cols;
$idhash->{$key} = $id;
$done->{$table,$id}++;
}
}
__END__
( run in 1.283 second using v1.01-cache-2.11-cpan-39bf76dae61 )