AI-TensorFlow-Libtensorflow
view release on metacpan or search on metacpan
lib/AI/TensorFlow/Libtensorflow/Manual/Notebook/InferenceUsingTFHubEnformerGeneExprPredModel.pod view on Meta::CPAN
# Named handles onto the graph endpoints used for inference. Each row is
# [ label, operation name, output index ]; the rows are turned into
# AI::TensorFlow::Libtensorflow::Output objects keyed by label.
my %puts = map {
    my ($label, $operation_name, $output_index) = @$_;
    $label => AI::TensorFlow::Libtensorflow::Output->New({
        oper  => $graph->OperationByName($operation_name),
        index => $output_index,
    });
} (
    ## Inputs
    [ inputs_args_0 => 'serving_default_args_0',  0 ],
    ## Outputs
    [ outputs_human => 'StatefulPartitionedCall', 0 ],
    [ outputs_mouse => 'StatefulPartitionedCall', 1 ],
);
# Dump the resulting table (p is presumably Data::Printer's p -- confirm
# against the notebook preamble).
p %puts;
# Run one inference pass and hand back only the human-track prediction.
#
# Arguments: the session to run and the input tensor to feed into
# $puts{inputs_args_0}.
# Returns:   the first (and only) fetched output, i.e. the value of
#            $puts{outputs_human}.
#
# Relies on the file-level %puts table, the shared status object $s and
# AssertOK(), which is called on $s after the run.
my $predict_on_batch = sub {
    my ($tf_session, $input_tensor) = @_;

    my @feed_ports   = ( $puts{inputs_args_0} );
    my @feed_values  = ( $input_tensor );
    my @fetch_ports  = ( $puts{outputs_human} );
    my @fetch_values;    # read back after Run(); see the return below

    $tf_session->Run(
        undef,    # unused here
        \@feed_ports,  \@feed_values,
        \@fetch_ports, \@fetch_values,
        undef,    # unused here
        undef,    # unused here
        $s
    );
    AssertOK($s);

    return $fetch_values[0];
};
undef;
use PDL;
our $SHOW_ENCODER = 1;
# one_hot_dna( $seq )
#
# One-hot encode a string of DNA bases.
#
# Each base is first mapped to a small integer code (N => 0, A => 1, C => 2,
# G => 3, T => 4, case-insensitively) and that code is then used to look up
# a column of the encoder table, so that A, C, G and T each get a distinct
# unit vector and N gets the all-zero vector.
#
# Parameters:
#   $seq - string made up of the characters N, A, C, G, T in either case.
#
# Returns:
#   a float ndarray holding one length-4 vector per base of $seq.
#
# Dies:
#   if $seq is undefined, if the translation step cannot be compiled/run, or
#   if $seq contains a symbol that has no encoding (which would otherwise
#   only show up later as an out-of-range index in the lookup).
#
# Side effects:
#   prints the encoder table when $SHOW_ENCODER is true.
sub one_hot_dna {
    my ($seq) = @_;
    die "one_hot_dna: sequence must be defined" unless defined $seq;

    my $from_alphabet = "NACGT";
    my $to_alphabet = pack "C*", 0..length($from_alphabet)-1;

    # sequences from UCSC genome have both uppercase and lowercase bases
    my $from_alphabet_tr = $from_alphabet . lc $from_alphabet;
    my $to_alphabet_tr = $to_alphabet x 2;

    # Write the sequence straight into the ndarray's data buffer and
    # translate it in place, avoiding a per-base Perl loop.
    my $p = zeros(byte, bytes::length($seq));
    my $p_dataref = $p->get_dataref;
    ${ $p_dataref } = $seq;
    for ( ${ $p_dataref } ) {
        # tr/// does not interpolate variables, hence the string eval. The
        # trailing "1" lets a failed eval be told apart from a tr/// that
        # legitimately translated zero characters (empty sequence).
        eval "tr/$from_alphabet_tr/$to_alphabet_tr/; 1"
            or die "one_hot_dna: could not translate sequence: $@";
    }

    # Anything that is not one of the integer codes by now was not part of
    # the alphabet; report it here with the offending symbol rather than
    # letting the index lookup below fail on a raw character value.
    if( ${ $p_dataref } =~ /[^\Q$to_alphabet\E]/ ) {
        my $offset = $-[0];
        die sprintf "one_hot_dna: unsupported base '%s' at offset %d; expected one of [%s]",
            substr($seq, $offset, 1), $offset, $from_alphabet_tr;
    }
    $p->upd_data;

    # Lookup table: a zero column for N followed by the identity matrix
    # for A, C, G, T.
    my $encoder = append(float(0), identity(float(length($from_alphabet)-1)) );
    say "Encoder is\n", $encoder->info, $encoder if $SHOW_ENCODER;

    my $encoded = $encoder->index( $p->dummy(0) );
    return $encoded;
}
####
{
    # Smoke test for one_hot_dna(): encode a short sequence that mixes
    # upper- and lowercase bases and includes N, then print the result.
    say "Testing one-hot encoding:\n";

    my $sample_seq = "ACGTNtgcan";
    my $sample_encoded = one_hot_dna( $sample_seq );

    # The encoder table has been shown once by the call above; turn the
    # flag off so that later calls do not print it again.
    $SHOW_ENCODER = 0;

    say "One-hot encoding of sequence '$sample_seq' is:";
    say $sample_encoded->info, $sample_encoded;
}
# Interval: a Bio::Location::Simple with helpers for finding its midpoint,
# producing a copy of a given width around that midpoint, and printing
# itself as "seq_id:location".
package Interval {
    use Bio::Location::Simple ();
    use parent qw(Bio::Location::Simple);

    # Midpoint of the interval: half of (start + end), with the leftover
    # unit added back when that sum is odd.
    sub center {
        my ($self) = @_;
        my $endpoint_sum = $self->start + $self->end;
        return int( $endpoint_sum / 2 ) + $endpoint_sum % 2;
    }

    # Return a clone of this interval that is $width positions wide and
    # placed around the current center. The invocant is left untouched.
    # When $width is even the extra position goes on the start side.
    sub resize {
        my ($self, $width) = @_;

        my $span          = $width - 1;
        my $after_center  = int( $span / 2 );
        my $before_center = $after_center + $span % 2;

        my $midpoint = $self->center;
        my $resized  = $self->clone;
        $resized->start( $midpoint - $before_center );
        $resized->end( $midpoint + $after_center );

        return $resized;
    }

    # Stringification: "<seq_id>:<feature-table string>", using a
    # placeholder when no seq_id has been set.
    use overload '""' => \&_op_stringify;

    sub _op_stringify {
        my ($self) = @_;
        my $seq_label = $self->seq_id // "(no sequence)";
        return sprintf "%s:%s", $seq_label, $self->to_FTstring;
    }
}
#####
{
lib/AI/TensorFlow/Libtensorflow/Manual/Notebook/InferenceUsingTFHubEnformerGeneExprPredModel.pod view on Meta::CPAN
</span><span style="color: #6666cc;">OpType</span><span style=""> </span><span style="color: #33ccff;"> </span><span style="color: #33ccff;">"</span><span style="color: #669933;">StatefulPartitionedCall</span><span style="color:...
</span><span style="color: #33ccff;">}</span><span style="">
</span><span style="color: #33ccff;">}</span><span style="">
</span><span style="color: #33ccff;">}</span><span style="">
</span></code></pre></span>
We need a helper to simplify running the session and getting just the predictions that we want.
# Run one inference pass and hand back only the human-track prediction.
#
# Arguments: the session to run and the input tensor to feed into
# $puts{inputs_args_0}.
# Returns:   the first (and only) fetched output, i.e. the value of
#            $puts{outputs_human}.
#
# Relies on the file-level %puts table, the shared status object $s and
# AssertOK(), which is called on $s after the run.
my $predict_on_batch = sub {
    my ($tf_session, $input_tensor) = @_;

    my @feed_ports   = ( $puts{inputs_args_0} );
    my @feed_values  = ( $input_tensor );
    my @fetch_ports  = ( $puts{outputs_human} );
    my @fetch_values;    # read back after Run(); see the return below

    $tf_session->Run(
        undef,    # unused here
        \@feed_ports,  \@feed_values,
        \@fetch_ports, \@fetch_values,
        undef,    # unused here
        undef,    # unused here
        $s
    );
    AssertOK($s);

    return $fetch_values[0];
};
undef;
=head2 Encoding the data
The model specifies that the way to get a sequence of DNA bases into a C<TFTensor> is to use L<one-hot encoding|https://en.wikipedia.org/wiki/One-hot#Machine_learning_and_statistics> in the order C<ACGT>.
This means that the bases are represented as vectors of length 4:
| base | vector encoding |
|------|-----------------|
| A | C<[1 0 0 0]> |
| C | C<[0 1 0 0]> |
| G | C<[0 0 1 0]> |
| T | C<[0 0 0 1]> |
| N | C<[0 0 0 0]> |
We can achieve this encoding by creating a lookup table with a PDL ndarray. This could be done by creating a byte PDL ndarray of dimensions C<[ 256 4 ]> to directly look up the numeric value of characters 0-255, but here we'll go with a smaller C...
use PDL;
our $SHOW_ENCODER = 1;
# one_hot_dna( $seq )
#
# One-hot encode a string of DNA bases.
#
# Each base is first mapped to a small integer code (N => 0, A => 1, C => 2,
# G => 3, T => 4, case-insensitively) and that code is then used to look up
# a column of the encoder table, so that A, C, G and T each get a distinct
# unit vector and N gets the all-zero vector.
#
# Parameters:
#   $seq - string made up of the characters N, A, C, G, T in either case.
#
# Returns:
#   a float ndarray holding one length-4 vector per base of $seq.
#
# Dies:
#   if $seq is undefined, if the translation step cannot be compiled/run, or
#   if $seq contains a symbol that has no encoding (which would otherwise
#   only show up later as an out-of-range index in the lookup).
#
# Side effects:
#   prints the encoder table when $SHOW_ENCODER is true.
sub one_hot_dna {
    my ($seq) = @_;
    die "one_hot_dna: sequence must be defined" unless defined $seq;

    my $from_alphabet = "NACGT";
    my $to_alphabet = pack "C*", 0..length($from_alphabet)-1;

    # sequences from UCSC genome have both uppercase and lowercase bases
    my $from_alphabet_tr = $from_alphabet . lc $from_alphabet;
    my $to_alphabet_tr = $to_alphabet x 2;

    # Write the sequence straight into the ndarray's data buffer and
    # translate it in place, avoiding a per-base Perl loop.
    my $p = zeros(byte, bytes::length($seq));
    my $p_dataref = $p->get_dataref;
    ${ $p_dataref } = $seq;
    for ( ${ $p_dataref } ) {
        # tr/// does not interpolate variables, hence the string eval. The
        # trailing "1" lets a failed eval be told apart from a tr/// that
        # legitimately translated zero characters (empty sequence).
        eval "tr/$from_alphabet_tr/$to_alphabet_tr/; 1"
            or die "one_hot_dna: could not translate sequence: $@";
    }

    # Anything that is not one of the integer codes by now was not part of
    # the alphabet; report it here with the offending symbol rather than
    # letting the index lookup below fail on a raw character value.
    if( ${ $p_dataref } =~ /[^\Q$to_alphabet\E]/ ) {
        my $offset = $-[0];
        die sprintf "one_hot_dna: unsupported base '%s' at offset %d; expected one of [%s]",
            substr($seq, $offset, 1), $offset, $from_alphabet_tr;
    }
    $p->upd_data;

    # Lookup table: a zero column for N followed by the identity matrix
    # for A, C, G, T.
    my $encoder = append(float(0), identity(float(length($from_alphabet)-1)) );
    say "Encoder is\n", $encoder->info, $encoder if $SHOW_ENCODER;

    my $encoded = $encoder->index( $p->dummy(0) );
    return $encoded;
}
####
{
    # Smoke test for one_hot_dna(): encode a short sequence that mixes
    # upper- and lowercase bases and includes N, then print the result.
    say "Testing one-hot encoding:\n";

    my $sample_seq = "ACGTNtgcan";
    my $sample_encoded = one_hot_dna( $sample_seq );

    # The encoder table has been shown once by the call above; turn the
    # flag off so that later calls do not print it again.
    $SHOW_ENCODER = 0;

    say "One-hot encoding of sequence '$sample_seq' is:";
    say $sample_encoded->info, $sample_encoded;
}
B<STREAM (STDOUT)>:
Testing one-hot encoding:
Encoder is
PDL: Float D [5,4]
[
[0 1 0 0 0]
[0 0 1 0 0]
[0 0 0 1 0]
[0 0 0 0 1]
]
One-hot encoding of sequence 'ACGTNtgcan' is:
PDL: Float D [4,10]
[
[1 0 0 0]
[0 1 0 0]
[0 0 1 0]
[0 0 0 1]
[0 0 0 0]
[0 0 0 1]
[0 0 1 0]
[0 1 0 0]
[1 0 0 0]
[0 0 0 0]
]
B<RESULT>:
1
Note that in the above, the PDL ndarray's
=over
( run in 0.673 second using v1.01-cache-2.11-cpan-5a3173703d6 )