BioPerl
view release on metacpan or search on metacpan
scripts/Bio-DB-GFF/bp_bulk_load_gff.pl view on Meta::CPAN
sub insert_sequence {
my $self = shift;
my ($id,$offset,$seq) = @_;
print join("\t",$id,$offset,$seq),"\n";
};
package Bio::DB::GFF::Adaptor::fauxmysqlcmap;
use Bio::DB::GFF::Adaptor::dbi::mysqlcmap;
use vars '@ISA';
@ISA = 'Bio::DB::GFF::Adaptor::dbi::mysqlcmap';
sub insert_sequence {
my $self = shift;
my ($id,$offset,$seq) = @_;
print join("\t",$id,$offset,$seq),"\n";
};
package Bio::DB::GFF::Adaptor::fauxpg;
use Bio::DB::GFF::Adaptor::dbi::pg;
use vars '@ISA';
@ISA = 'Bio::DB::GFF::Adaptor::dbi::pg';
#these two subs are to separate the table creation from the
#index creation
sub do_initialize {
my $self = shift;
my $erase = shift;
$self->drop_all if $erase;
my $dbh = $self->features_db;
my $schema = $self->schema;
foreach my $table_name ($self->tables) {
my $create_table_stmt = $schema->{$table_name}{table} ;
$dbh->do($create_table_stmt) || warn $dbh->errstr;
# $self->create_other_schema_objects(\%{$schema->{$table_name}});
}
1;
}
sub _create_indexes_etc {
my $self = shift;
my $dbh = $self->features_db;
my $schema = $self->schema;
foreach my $table_name ($self->tables) {
$self->create_other_schema_objects(\%{$schema->{$table_name}});
}
}
sub insert_sequence {
my $self = shift;
my ($id,$offset,$seq) = @_;
print "$id\t$offset\t$seq\n";
}
package main;
eval "use Time::HiRes"; undef $@;
my $timer = defined &Time::HiRes::time;
my $bWINDOWS = 0; # Boolean: is this a MSWindows operating system?
if ($^O =~ /MSWin32/i) {
$bWINDOWS = 1;
}
my ($DSN,$ADAPTOR,$FORCE,$USER,$PASSWORD,$FASTA,$LOCAL,$MAX_BIN,$GROUP_TAG,$LONG_LIST,$MUNGE,$TMPDIR);
GetOptions ('database:s' => \$DSN,
'adaptor:s' => \$ADAPTOR,
'create' => \$FORCE,
'user:s' => \$USER,
'password:s' => \$PASSWORD,
'fasta:s' => \$FASTA,
'local' => \$LOCAL,
'maxbin|maxfeature:s' => \$MAX_BIN,
'group:s' => \$GROUP_TAG,
'long_list:s' => \$LONG_LIST,
'gff3_munge' => \$MUNGE,
'Temporary:s' => \$TMPDIR,
) or (system('pod2text', $0), exit -1);
# If called as pg_bulk_load_gff.pl behave as that did.
if ($0 =~/pg_bulk_load_gff.pl/){
$ADAPTOR ||= 'Pg';
$DSN ||= 'test';
}
$DSN ||= 'dbi:mysql:test';
$MAX_BIN ||= 1_000_000_000; # to accomodate human-sized chromosomes
if ($bWINDOWS && not $FORCE) {
die "Note that Windows users must use the --create option.\n";
}
unless ($FORCE) {
die "This will delete all existing data in database $DSN. If you want to do this, rerun with the --create option.\n"
if $bWINDOWS;
open (TTY,"/dev/tty") or die "/dev/tty: $!\n"; #TTY use removed for win compatability
print STDERR "This operation will delete all existing data in database $DSN. Continue? ";
my $f = <TTY>;
die "Aborted\n" unless $f =~ /^[yY]/;
close TTY;
}
# postgres DBD::Pg allows 'database', but also 'dbname', and 'db':
# and it must be Pg (not pg)
$DSN=~s/pg:database=/Pg:/i;
$DSN=~s/pg:dbname=/Pg:/i;
$DSN=~s/pg:db=/Pg:/i;
# leave these lines for mysql
$DSN=~s/database=//i;
$DSN=~s/;host=/:/i; #cater for dsn in the form of "dbi:mysql:database=$dbname;host=$host"
my($DBI,$DBD,$DBNAME,$HOST)=split /:/,$DSN;
$DBNAME=$DSN unless $DSN=~/:/;
$ADAPTOR ||= $DBD;
$ADAPTOR ||= 'mysql';
scripts/Bio-DB-GFF/bp_bulk_load_gff.pl view on Meta::CPAN
}
foreach (@ARGV) {
$_ = "gunzip -c $_ |" if /\.gz$/;
$_ = "uncompress -c $_ |" if /\.Z$/;
$_ = "bunzip2 -c $_ |" if /\.bz2$/;
}
my (@gff,@fasta);
foreach (@ARGV) {
if (/\.(fa|fasta|dna|seq|fast)(?:$|\.)/i) {
push @fasta,$_;
} else {
push @gff,$_;
}
}
@ARGV = @gff;
push @fasta,$FASTA if defined $FASTA;
# drop everything that was there before
my %FH;
my $tmpdir = File::Spec->tmpdir() || '/tmp';
$tmpdir =~ s!\\!\\\\!g if $bWINDOWS; #eliminates backslash mis-interpretation
-d $tmpdir or die <<END;
I could not find a suitable temporary directory to write scratch files into ($tmpdir by default).
Please select a directory and indicate its location by setting the TMP environment variable, or
by using the --Temporary switch.
END
my @fasta_files_to_be_unlinked;
my @files = (FDATA,FTYPE,FGROUP,FDNA,FATTRIBUTE,FATTRIBUTE_TO_FEATURE);
foreach (@files) {
$FH{$_} = IO::File->new(">$tmpdir/$_.$$") or die $_,": $!";
$FH{$_}->autoflush;
}
if ( $use_pg ) {
$FH{FDATA() }->print("COPY fdata (fid, fref, fstart, fstop, fbin, ftypeid, fscore, fstrand, fphase, gid, ftarget_start, ftarget_stop) FROM stdin;\n");
$FH{FTYPE() }->print("COPY ftype (ftypeid, fmethod, fsource) FROM stdin;\n");
$FH{FGROUP() }->print("COPY fgroup (gid, gclass, gname) FROM stdin;\n");
$FH{FATTRIBUTE() }->print("COPY fattribute (fattribute_id, fattribute_name) FROM stdin;\n");
$FH{FATTRIBUTE_TO_FEATURE()}->print("COPY fattribute_to_feature (fid, fattribute_id, fattribute_value) FROM stdin;\n");
}
my $FID = 1;
my $GID = 1;
my $FTYPEID = 1;
my $ATTRIBUTEID = 1;
my %GROUPID = ();
my %FTYPEID = ();
my %ATTRIBUTEID = ();
my %DONE = ();
my $FEATURES = 0;
my %tmpfiles; # keep track of temporary fasta files
my $count;
my $fasta_sequence_id;
my $gff3;
my $current_file; #used to reset GFF3 flag in mix of GFF and GFF3 files
$db->preferred_groups(split (/[,\s]+/,$GROUP_TAG)) if defined $GROUP_TAG;
my $last = Time::HiRes::time() if $timer;
my $start = $last;
# avoid hanging on standalone --fasta load
if (!@ARGV) {
$FH{NULL} = IO::File->new(">$tmpdir/null");
push @ARGV, "$tmpdir/null";
}
my ($cmap_db);
if ($use_mysqlcmap){
my $options = {
AutoCommit => 1,
FetchHashKeyName => 'NAME_lc',
LongReadLen => 3000,
LongTruncOk => 1,
RaiseError => 1,
};
$cmap_db = DBI->connect( $DSN, $USER, $PASSWORD, $options );
}
# Only load CMap::Utils if using cmap
unless (!$use_mysqlcmap or
eval {
require Bio::GMOD::CMap::Utils;
Bio::GMOD::CMap::Utils->import('next_number');
1;
}
) {
print STDERR "Error loading Bio::GMOD::CMap::Utils\n";
}
while (<>) {
$current_file ||= $ARGV;
# reset GFF3 flag if new filehandle
unless($current_file eq $ARGV){
undef $gff3;
$current_file = $ARGV;
}
chomp;
my ($ref,$source,$method,$start,$stop,$score,$strand,$phase,$group);
# close sequence filehandle if required
if ( /^\#|\s+|^$|^>|\t/ && defined $FH{FASTA}) {
$FH{FASTA}->close;
delete $FH{FASTA};
}
# print to fasta file if the handle is open
if ( defined $FH{FASTA} ) {
$FH{FASTA}->print("$_\n");
next;
}
elsif (/^>(\S+)/) { # uh oh, sequence coming
$FH{FASTA} = IO::File->new(">$tmpdir/$1\.fa") or die "FASTA: $!\n";
$FH{FASTA}->print("$_\n");
scripts/Bio-DB-GFF/bp_bulk_load_gff.pl view on Meta::CPAN
}
if ( not defined( $ref ) or length ($ref) == 0) {
warn "\$ref is null. source = $source, method = $method, group = $group\n";
next;
}
$FEATURES++;
my $size = $stop-$start+1;
warn "Feature $group ($size) is larger than $MAX_BIN. You will have trouble retrieving this feature.\nRerun script with --maxfeature set to a higher power of 10.\n" if $size > $MAX_BIN;
$source = '\N' unless defined $source;
$score = '\N' if $score eq '.';
$strand = '\N' if $strand eq '.';
$phase = '\N' if $phase eq '.';
my ($group_class,$group_name,$target_start,$target_stop,$attributes) = $db->split_group($group,$gff3);
# GFF2/3 transition
$group_class = [$group_class] unless ref $group_class;
$group_name = [$group_name] unless ref $group_name;
for (my $i=0; $i < @$group_name; $i++) {
$group_class->[$i] ||= '\N';
$group_name->[$i] ||= '\N';
$target_start ||= '\N';
$target_stop ||= '\N';
$method ||= '\N';
$source ||= '\N';
my $fid = $FID++;
my $gid = $GROUPID{lc join('',$group_class->[$i],$group_name->[$i])} ||= $GID++;
my $ftypeid = $FTYPEID{lc join('',$source,$method)} ||= $FTYPEID++;
my $bin = bin($start,$stop,$db->min_bin);
$FH{ FDATA() }->print( join("\t",$fid,$ref,$start,$stop,$bin,$ftypeid,$score,$strand,$phase,$gid,$target_start,$target_stop),"\n" );
if ($use_mysqlcmap){
my $feature_id = next_number(
db => $cmap_db,
table_name => 'cmap_feature',
id_field => 'feature_id',
)
or die 'No feature id';
my $direction = $strand eq '-' ? -1:1;
$FH{ FGROUP() }->print(
join("\t",$feature_id,$feature_id,'NULL',0, $group_name->[$i],0,0,'NULL',1,$direction, $group_class->[$i],)
,"\n"
) unless $DONE{"G$gid"}++;
}
else {
$FH{ FGROUP() }->print( join("\t",$gid,$group_class->[$i],$group_name->[$i]),"\n") unless $DONE{"G$gid"}++;
}
$FH{ FTYPE() }->print( join("\t",$ftypeid,$method,$source),"\n" ) unless $DONE{"T$ftypeid"}++;
foreach (@$attributes) {
my ($key,$value) = @$_;
my $attributeid = $ATTRIBUTEID{$key} ||= $ATTRIBUTEID++;
$FH{ FATTRIBUTE() }->print( join("\t",$attributeid,$key),"\n" ) unless $DONE{"A$attributeid"}++;
$FH{ FATTRIBUTE_TO_FEATURE() }->print( join("\t",$fid,$attributeid,$value),"\n");
}
if ( $fid % 1000 == 0) {
my $now = Time::HiRes::time() if $timer;
my $elapsed = $timer ? sprintf(" in %5.2fs",$now - $last) : '';
$last = $now;
print STDERR "$fid features parsed$elapsed...";
print STDERR -t STDOUT && !$ENV{EMACS} ? "\r" : "\n";
}
}
}
$FH{FASTA}->close if exists $FH{FASTA};
for my $file (@fasta) {
warn "Preparing DNA file $file....\n";
if ($use_pg){
$FH{FDNA() }->print("COPY fdna (fref, foffset, fdna) FROM stdin;\n");
}
my $old = select($FH{FDNA()});
$db->load_fasta($file) or warn "Couldn't load fasta file $file: $!";
if ($use_pg){
$FH{FDNA() }->print("\\.\n\n");
}
warn "done...\n";
select $old;
unlink $file if $tmpfiles{$file};
}
if ($use_pg) {
$FH{FDATA() }->print("\\.\n\n");
$FH{FTYPE() }->print("\\.\n\n");
$FH{FGROUP() }->print("\\.\n\n");
$FH{FATTRIBUTE() }->print("\\.\n\n");
$FH{FATTRIBUTE_TO_FEATURE()}->print("\\.\n\n");
}
$_->close foreach values %FH;
printf STDERR "Total parse time %5.2fs\n",(Time::HiRes::time() - $start) if $timer;
warn "Loading feature data and analyzing tables. You may see RDBMS messages here...\n";
if ($use_pg){
warn "Loading feature data. You may see Postgres comments...\n";
foreach (@files) {
my $file = "$tmpdir/$_.$$";
$AUTH ? system("psql $AUTH -f $file $DBNAME")
: system('psql','-f', $file, $DBNAME);
unlink $file;
}
warn "Updating sequences ...\n";
$db->update_sequences();
warn "Creating indexes ...\n";
$db->_create_indexes_etc();
warn "done...\n";
}
elsif( $use_mysql or $use_mysqlcmap ) {
$start = time();
my $success = 1;
my $TERMINATEDBY = $bWINDOWS ? q( LINES TERMINATED BY '\r\n') : '';
for my $f (@files) {
my $table = function_to_table($f,$ADAPTOR);
my $sql = join ('; ',
"lock tables $table write",
"delete from $table",
"load data $LOCAL infile '$tmpdir/$f.$$' replace into table $table $TERMINATEDBY",
"unlock tables");
my $command = MYSQL . qq[$AUTH -s -e "$sql"];
$command =~ s/\n/ /g;
$success &&= system($command) == 0;
unlink "$tmpdir/$f.$$";
}
printf STDERR "Total load time %5.2fs\n",(time() - $start) if $timer;
print STDERR "done...\n";
print STDERR "Analyzing/optimizing tables. You will see database messages...\n";
$start = time();
my $sql = '';
for my $f (@files) {
my $table = function_to_table($f,$ADAPTOR);
$sql .= "analyze table $table;";
}
my $command = MYSQL . qq[$AUTH -N -s -e "$sql"];
$success &&= system($command) == 0;
printf STDERR "Optimization time time %5.2fs\n",(time() - $start);
if ($success) {
print "$FEATURES features successfully loaded\n";
} else {
print "FAILURE: Please see standard error for details\n";
exit -1;
}
}
foreach (@fasta_files_to_be_unlinked) {
unlink "$tmpdir/$_.$$";
}
warn "Building summary statistics for coverage histograms...\n";
my (@args,$AUTH);
if (defined $USER) {
push @args,(-user=>$USER);
$AUTH .= " -u$USER";
}
if (defined $PASSWORD) {
push @args,(-pass=>$PASSWORD);
$AUTH .= " -p$PASSWORD";
}
push @args,(-preferred_groups=>[split(/[,\s+]+/,$GROUP_TAG)]) if defined $GROUP_TAG;
my $db = Bio::DB::GFF->new(-adaptor=>"dbi::$ADAPTOR",-dsn => $DSN,@args)
or die "Can't open database: ",Bio::DB::GFF->error,"\n";
$db->build_summary_statistics;
exit 0;
sub function_to_table {
my $function = shift;
my $adaptor = shift;
if ($function eq 'fdata'){
return 'fdata';
}
elsif ($function eq 'ftype'){
return 'ftype';
}
elsif ($function eq 'fgroup'){
return 'cmap_feature' if ($adaptor eq 'mysqlcmap');
return 'fgroup';
}
elsif ($function eq 'fdna'){
return 'fdna';
}
elsif ($function eq 'fattribute'){
( run in 1.063 second using v1.01-cache-2.11-cpan-39bf76dae61 )