AI-TensorFlow-Libtensorflow
view release on metacpan or search on metacpan
lib/AI/TensorFlow/Libtensorflow/Manual/Notebook/InferenceUsingTFHubEnformerGeneExprPredModel.pod view on Meta::CPAN
undef,
[$puts{inputs_args_0}], [$t],
[$puts{outputs_human}], \@outputs_t,
undef,
undef,
$s
);
AssertOK($s);
return $outputs_t[0];
};
undef;
use PDL;
our $SHOW_ENCODER = 1;
# Converts a DNA string into a one-hot encoded float ndarray.
#
# Parameters:
#   $seq - sequence made up of the bases A, C, G, T or N, in either case.
#
# Returns:
#   A float PDL ndarray holding one row of four values per base: a single 1
#   in the position for A, C, G or T (in that order), or all zeros for N.
#
# Dies if $seq contains a character outside the alphabet, or if the
# generated transliteration cannot be compiled.
#
# Side effect: prints the encoder matrix when $SHOW_ENCODER is true.
sub one_hot_dna {
    my ($seq) = @_;

    # bytes::length() is only callable once the bytes module has been loaded;
    # load it here instead of relying on some other module having done so.
    # (require does not switch on byte semantics.)
    require bytes;

    my $from_alphabet = "NACGT";
    my $to_alphabet   = pack "C*", 0..length($from_alphabet)-1;

    # sequences from UCSC genome have both uppercase and lowercase bases
    my $from_alphabet_tr = $from_alphabet . lc $from_alphabet;
    my $to_alphabet_tr   = $to_alphabet x 2;

    # A character outside the alphabet would pass through tr/// unchanged and
    # then be used as an index into the encoder below; reject it up front.
    if( $seq =~ /([^\Q$from_alphabet_tr\E])/ ) {
        die sprintf "Unexpected character '%s' at offset %d in DNA sequence",
            $1, $-[1];
    }

    # Write the sequence straight into the ndarray's data buffer, then
    # translate each base to its numeric code in place.
    my $p = zeros(byte, bytes::length($seq));
    my $p_dataref = $p->get_dataref;
    ${ $p_dataref } = $seq;

    # tr/// does not interpolate variables, hence the string eval. Both
    # alphabets are constants built above, never caller-supplied text.
    # The trailing "; 1" makes a successful eval true even when tr/// counts
    # zero characters (empty sequence).
    for ( ${ $p_dataref } ) {
        eval "tr/$from_alphabet_tr/$to_alphabet_tr/; 1"
            or die "Could not transliterate bases: $@";
    }
    $p->upd_data;

    # Encoder: a leading zero entry (code 0, i.e. N) followed by an identity
    # matrix for the remaining codes.
    my $encoder = append(float(0), identity(float(length($from_alphabet)-1)) );
    say "Encoder is\n", $encoder->info, $encoder if $SHOW_ENCODER;

    # Look up each base code in the encoder.
    my $encoded = $encoder->index( $p->dummy(0) );
    return $encoded;
}
####
# Smoke test: one-hot encode a short mixed-case sequence and print the result.
{
    say "Testing one-hot encoding:\n";

    my $dna     = "ACGTNtgcan";
    my $one_hot = one_hot_dna($dna);

    # Switch off the encoder printout for any later call to one_hot_dna.
    $SHOW_ENCODER = 0;

    say sprintf "One-hot encoding of sequence '%s' is:", $dna;
    say $one_hot->info, $one_hot;
}
# A Bio::Location::Simple subclass that can report its center position,
# produce a resized copy of itself, and stringify as "seq_id:location".
package Interval {
    use Bio::Location::Simple ();
    use parent qw(Bio::Location::Simple);

    # Returns the midpoint of start and end; when their sum is odd the
    # midpoint is rounded up.
    sub center {
        my ($self) = @_;
        my $sum = $self->start + $self->end;
        return int( $sum / 2 ) + $sum % 2;
    }

    # Returns a clone spanning $width positions around this interval's
    # center. When $width is even, the extra position goes before the center.
    sub resize {
        my ($self, $width) = @_;

        my $span  = $width - 1;          # positions other than the center
        my $after = int( $span / 2 );
        my $before = $after + $span % 2;

        my $resized = $self->clone;
        my $mid     = $self->center;
        $resized->start( $mid - $before );
        $resized->end( $mid + $after );
        return $resized;
    }

    use overload '""' => \&_op_stringify;

    # Stringification handler: sequence id (or a placeholder when there is
    # none), a colon, then the feature-table style location string.
    sub _op_stringify {
        my ($self) = @_;
        my $seq_id = $self->seq_id // "(no sequence)";
        return sprintf "%s:%s", $seq_id, $self->to_FTstring;
    }
}
#####
# Exercises Interval->resize: each test interval is resized to its own length
# plus 0..5 positions, and every result is checked and reported.
{
    say "Testing interval resizing:\n";

    # Resizes $interval to $to positions, dies if the result has a different
    # length, and otherwise prints one report line ending in $msg.
    sub _debug_resize {
        my ($interval, $to, $msg) = @_;

        my $resized = $interval->resize($to);
        if( $resized->length != $to ) {
            die sprintf "Wrong interval size for %s --(%s)--> %s",
                $interval, $to, $resized;
        }

        say sprintf "Interval: %s -> %s, length %2d : %s",
            $interval, $resized, $resized->length, $msg;
    }

    my @endpoints = ( [4, 8], [5, 8], [5, 9], [6, 9] );
    for my $pair (@endpoints) {
        my $probe = Interval->new(
            -seq_id => 'chr11',
            -start  => $pair->[0],
            -end    => $pair->[1],
        );

        say sprintf "Testing interval %s with length %d", $probe, $probe->length;
        say "-----";

        for my $extra (0..5) {
            my $base = $probe->length;
            my $to   = $base + $extra;
            _debug_resize( $probe, $to, "$base -> $to (+ $extra)" );
        }
        say "";
    }
}
lib/AI/TensorFlow/Libtensorflow/Manual/Notebook/InferenceUsingTFHubEnformerGeneExprPredModel.pod view on Meta::CPAN
my $onehot_test_seq = "ACGTNtgcan";
my $test_encoded = one_hot_dna( $onehot_test_seq );
$SHOW_ENCODER = 0;
say "One-hot encoding of sequence '$onehot_test_seq' is:";
say $test_encoded->info, $test_encoded;
}
B<STREAM (STDOUT)>:
Testing one-hot encoding:
Encoder is
PDL: Float D [5,4]
[
[0 1 0 0 0]
[0 0 1 0 0]
[0 0 0 1 0]
[0 0 0 0 1]
]
One-hot encoding of sequence 'ACGTNtgcan' is:
PDL: Float D [4,10]
[
[1 0 0 0]
[0 1 0 0]
[0 0 1 0]
[0 0 0 1]
[0 0 0 0]
[0 0 0 1]
[0 0 1 0]
[0 1 0 0]
[1 0 0 0]
[0 0 0 0]
]
B<RESULT>:
1
Note that in the above, the PDL ndarray's
=over
=item *
first dimension is 4 which matches the last dimension of the input C<TFTensor>;
=item *
second dimension is the sequence length which matches the penultimate dimension of the input C<TFTensor>.
=back
Now we need a way to deal with the sequence interval. We're going to use 1-based coordinates as BioPerl does. In fact, we'll extend a BioPerl class.
# A Bio::Location::Simple subclass that can report its center position,
# produce a resized copy of itself, and stringify as "seq_id:location".
package Interval {
    use Bio::Location::Simple ();
    use parent qw(Bio::Location::Simple);

    # Returns the midpoint of start and end; when their sum is odd the
    # midpoint is rounded up.
    sub center {
        my ($self) = @_;
        my $sum = $self->start + $self->end;
        return int( $sum / 2 ) + $sum % 2;
    }

    # Returns a clone spanning $width positions around this interval's
    # center. When $width is even, the extra position goes before the center.
    sub resize {
        my ($self, $width) = @_;

        my $span  = $width - 1;          # positions other than the center
        my $after = int( $span / 2 );
        my $before = $after + $span % 2;

        my $resized = $self->clone;
        my $mid     = $self->center;
        $resized->start( $mid - $before );
        $resized->end( $mid + $after );
        return $resized;
    }

    use overload '""' => \&_op_stringify;

    # Stringification handler: sequence id (or a placeholder when there is
    # none), a colon, then the feature-table style location string.
    sub _op_stringify {
        my ($self) = @_;
        my $seq_id = $self->seq_id // "(no sequence)";
        return sprintf "%s:%s", $seq_id, $self->to_FTstring;
    }
}
#####
# Exercises Interval->resize: each test interval is resized to its own length
# plus 0..5 positions, and every result is checked and reported.
{
    say "Testing interval resizing:\n";

    # Resizes $interval to $to positions, dies if the result has a different
    # length, and otherwise prints one report line ending in $msg.
    sub _debug_resize {
        my ($interval, $to, $msg) = @_;

        my $resized = $interval->resize($to);
        if( $resized->length != $to ) {
            die sprintf "Wrong interval size for %s --(%s)--> %s",
                $interval, $to, $resized;
        }

        say sprintf "Interval: %s -> %s, length %2d : %s",
            $interval, $resized, $resized->length, $msg;
    }

    my @endpoints = ( [4, 8], [5, 8], [5, 9], [6, 9] );
    for my $pair (@endpoints) {
        my $probe = Interval->new(
            -seq_id => 'chr11',
            -start  => $pair->[0],
            -end    => $pair->[1],
        );

        say sprintf "Testing interval %s with length %d", $probe, $probe->length;
        say "-----";

        for my $extra (0..5) {
            my $base = $probe->length;
            my $to   = $base + $extra;
            _debug_resize( $probe, $to, "$base -> $to (+ $extra)" );
        }
        say "";
    }
}
lib/AI/TensorFlow/Libtensorflow/Manual/Notebook/InferenceUsingTFHubEnformerGeneExprPredModel.pod view on Meta::CPAN
my $v = Bio::DB::HTS::VCF->new( filename => $clinvar_path );
$v->num_variants
COMMENT
undef;
=head1 RESOURCE USAGE
use Filesys::DiskUsage qw/du/;

# Paths whose combined disk usage is reported.
my @measured_paths = (
    $model_archive_path, $model_base, $new_model_base,
    $targets_path,
    $hg_gz_path,
    $hg_bgz_path, $hg_bgz_fai_path,
    $clinvar_path,
    $plot_output_path,
);

# Ask du for a human-readable total and have it dereference links.
my %du_options = ( 'human-readable' => 1, dereference => 1 );
my $total = du( \%du_options, @measured_paths );

say "Disk space usage: $total"; undef;
B<STREAM (STDOUT)>:
Disk space usage: 4.66G
=head1 CPANFILE
requires 'AI::TensorFlow::Libtensorflow';
requires 'AI::TensorFlow::Libtensorflow::DataType';
requires 'Archive::Extract';
requires 'Bio::DB::HTS::Faidx';
requires 'Bio::Location::Simple';
requires 'Bio::Tools::Run::Samtools';
requires 'Data::Frame';
requires 'Data::Printer';
requires 'Data::Printer::Filter::PDL';
requires 'Devel::Timer';
requires 'Digest::file';
requires 'FFI::Platypus::Buffer';
requires 'FFI::Platypus::Memory';
requires 'File::Which';
requires 'Filesys::DiskUsage';
requires 'HTTP::Tiny';
requires 'IPC::Run';
requires 'List::Util';
requires 'PDL';
requires 'PDL::Graphics::Gnuplot';
requires 'Path::Tiny';
requires 'Syntax::Construct';
requires 'Text::Table::Tiny';
requires 'URI';
requires 'constant';
requires 'feature';
requires 'lib::projectroot';
requires 'overload';
requires 'parent';
requires 'strict';
requires 'utf8';
requires 'warnings';
=head1 AUTHOR
Zakariyya Mughal <zmughal@cpan.org>
=head1 COPYRIGHT AND LICENSE
This software is Copyright (c) 2022-2023 by Auto-Parallel Technologies, Inc.
This is free software, licensed under:
The Apache License, Version 2.0, January 2004
=cut
( run in 1.249 second using v1.01-cache-2.11-cpan-39bf76dae61 )