AI-TensorFlow-Libtensorflow
view release on metacpan or search on metacpan
lib/AI/TensorFlow/Libtensorflow/Manual/Notebook/InferenceUsingTFHubEnformerGeneExprPredModel.pod view on Meta::CPAN
outputs_mouse =>
AI::TensorFlow::Libtensorflow::Output->New({
oper => $graph->OperationByName('StatefulPartitionedCall'),
index => 1,
}),
);
p %puts;
# Run the Enformer session on one input tensor and return only the
# 'human' output head. NOTE(review): captures %puts and the status
# object $s from the enclosing scope; $t is presumably a float32
# TFTensor of shape [batch, 393216, 4] per the SavedModel signature —
# confirm at the call site.
my $predict_on_batch = sub {
my ($session, $t) = @_;
my @outputs_t;
# Positional Run() arguments: run options, input ports + input tensors,
# output ports + array to receive output tensors, target operations,
# run metadata, status object.
$session->Run(
undef,
[$puts{inputs_args_0}], [$t],
[$puts{outputs_human}], \@outputs_t,
undef,
undef,
$s
);
AssertOK($s);
# Single requested output, so the prediction is the first element.
return $outputs_t[0];
};
undef;
use PDL;
our $SHOW_ENCODER = 1;
# One-hot encode a DNA sequence string into a float PDL ndarray with
# first dimension 4 (base channel) and second dimension the sequence
# length. Base index 0 (N, or anything unrecognized) maps to all zeros;
# A, C, G, T map to the four unit vectors, as the model expects.
sub one_hot_dna {
my ($seq) = @_;
# Translate bytes N,A,C,G,T -> small integers 0..4.
my $from_alphabet = "NACGT";
my $to_alphabet = pack "C*", 0..length($from_alphabet)-1;
# sequences from UCSC genome have both uppercase and lowercase bases
my $from_alphabet_tr = $from_alphabet . lc $from_alphabet;
my $to_alphabet_tr = $to_alphabet x 2;
# NOTE(review): bytes::length is called without a visible `use bytes ()`
# here — confirm bytes.pm is loaded elsewhere before this runs.
my $p = zeros(byte, bytes::length($seq));
my $p_dataref = $p->get_dataref;
# Copy the sequence directly into the ndarray's underlying buffer.
${ $p_dataref } = $seq;
# tr/// does not interpolate variables, so the translation table is
# compiled via string eval; this rewrites the buffer in place.
eval "tr/$from_alphabet_tr/$to_alphabet_tr/" for ${ $p_dataref };
$p->upd_data;
# Lookup table: an all-zeros vector for N prepended to a 4x4 identity.
my $encoder = append(float(0), identity(float(length($from_alphabet)-1)) );
say "Encoder is\n", $encoder->info, $encoder if $SHOW_ENCODER;
# Select one encoder column per base index.
my $encoded = $encoder->index( $p->dummy(0) );
return $encoded;
}
####
# Smoke-test the encoder on a mixed-case sequence that covers every
# base; the bare block keeps the test lexicals from leaking.
{
say "Testing one-hot encoding:\n";
my $onehot_test_seq = "ACGTNtgcan";
my $test_encoded = one_hot_dna( $onehot_test_seq );
# Only print the encoder table on this first call.
$SHOW_ENCODER = 0;
say "One-hot encoding of sequence '$onehot_test_seq' is:";
say $test_encoded->info, $test_encoded;
}
# A 1-based genomic interval, extending Bio::Location::Simple with
# center/resize operations used to prepare model input windows.
package Interval {
use Bio::Location::Simple ();
use parent qw(Bio::Location::Simple);
# Integer midpoint of the interval. When start+end is odd (even-length
# interval) the result is rounded up, choosing the right-of-middle base.
sub center {
my $self = shift;
my $center = int( ($self->start + $self->end ) / 2 );
my $delta = ($self->start + $self->end ) % 2;
return $center + $delta;
}
# Return a clone resized to exactly $width bases around the same
# center; for even widths the extra base goes on the start side
# (start = center - half - offset, end = center + half, so the
# resulting length is always 2*half + offset + 1 == $width).
sub resize {
my ($self, $width) = @_;
my $new_interval = $self->clone;
my $center = $self->center;
my $half = int( ($width-1) / 2 );
my $offset = ($width-1) % 2;
$new_interval->start( $center - $half - $offset );
$new_interval->end( $center + $half );
return $new_interval;
}
# Stringify as "seq_id:range", e.g. "chr11:4..8".
use overload '""' => \&_op_stringify;
sub _op_stringify { sprintf "%s:%s", $_[0]->seq_id // "(no sequence)", $_[0]->to_FTstring }
}
#####
{
say "Testing interval resizing:\n";
sub _debug_resize {
my ($interval, $to, $msg) = @_;
my $resized_interval = $interval->resize($to);
die "Wrong interval size for $interval --($to)--> $resized_interval"
unless $resized_interval->length == $to;
say sprintf "Interval: %s -> %s, length %2d : %s",
$interval,
$resized_interval, $resized_interval->length,
$msg;
}
for my $interval_spec ( [4, 8], [5, 8], [5, 9], [6, 9]) {
my ($start, $end) = @$interval_spec;
my $test_interval = Interval->new( -seq_id => 'chr11', -start => $start, -end => $end );
say sprintf "Testing interval %s with length %d", $test_interval, $test_interval->length;
say "-----";
for(0..5) {
lib/AI/TensorFlow/Libtensorflow/Manual/Notebook/InferenceUsingTFHubEnformerGeneExprPredModel.pod view on Meta::CPAN
ytics => { @tics_opts, locations => [ ceil(($y->max-$y->min)/2)->sclr ] },
( $i == $#tracks
? ( xtics => { format => '%.3f', @tics_opts } )
: ( xtics => 0 ) ),
( $i == $#tracks ? ( xlabel => 'location ({/Symbol \264}10^7 bases)' ) : () ),
},
with => 'filledcurves',
#'lc' => '#086eb5',
# $x scaled by 1e7; filled curve between $y and the x-axis
$x / 1e7, $y, pdl(0)
);
}
$gp->end_multi;
$gp->close;
if( IN_IPERL ) {
IPerl->png( bytestream => path($plot_output_path)->slurp_raw );
}
# Some code that could be used for working with variants.
# The single-quoted heredoc below turns the snippet into an unexecuted
# string literal ("1 if <string>" is a no-op) — a quick way to keep a
# multi-line chunk of code around without running it.
1 if <<'COMMENT';
use Bio::DB::HTS::VCF;
my $clinvar_tbi_path = "${clinvar_path}.tbi";
unless( -f $clinvar_tbi_path ) {
system( qw(tabix), $clinvar_path );
}
my $v = Bio::DB::HTS::VCF->new( filename => $clinvar_path );
$v->num_variants
COMMENT
undef;
use Filesys::DiskUsage qw/du/;
# Tally the on-disk footprint of every artifact downloaded or generated
# above (model archive, genomes, targets, ClinVar data, plot output).
my @artifacts = (
$model_archive_path, $model_base, $new_model_base,
$targets_path,
$hg_gz_path,
$hg_bgz_path, $hg_bgz_fai_path,
$clinvar_path,
$plot_output_path,
);
# dereference => 1 follows symlinks; human-readable formats the total.
my $total = du( { 'human-readable' => 1, dereference => 1 }, @artifacts );
say "Disk space usage: $total"; undef;
__END__
=pod
=encoding UTF-8
=head1 NAME
AI::TensorFlow::Libtensorflow::Manual::Notebook::InferenceUsingTFHubEnformerGeneExprPredModel - Using TensorFlow to do gene expression prediction using a pre-trained model
=head1 SYNOPSIS
The following tutorial is based on the L<Enformer usage notebook|https://github.com/deepmind/deepmind-research/blob/master/enformer/enformer-usage.ipynb>. It uses a pre-trained model based on a transformer architecture trained as described in Avsec e...
Running the code requires an Internet connection to download the model (from Google servers) and datasets (from GitHub, UCSC, and NIH).
Some of this code is identical to that of the C<InferenceUsingTFHubMobileNetV2Model> notebook. Please look there for an explanation of that code. As stated there, this will later be wrapped up into a high-level library to hide the details behind an API.
B<NOTE>: If running this model, please be aware that
=over
=item *
the Docker image takes 3 GB or more of disk space;
=item *
the model and data take 5 GB or more of disk space.
=back
meaning that you will need a total of B<8 GB> of disk space. You may need at least B<4 GB> of free memory to run the model.
=head1 COLOPHON
The following document is either a POD file which can additionally be run as a Perl script or a Jupyter Notebook which can be run in L<IPerl|https://p3rl.org/Devel::IPerl> (viewable online at L<nbviewer|https://nbviewer.org/github/EntropyOrg/perl-AI-...
You will also need the executables C<gunzip>, C<bgzip>, and C<samtools>. Furthermore,
=over
=item *
C<Bio::DB::HTS> requires C<libhts> and
=item *
C<PDL::Graphics::Gnuplot> requires C<gnuplot>.
=back
If you are running the code, you may optionally install the L<C<tensorflow> Python package|https://www.tensorflow.org/install/pip> in order to access the C<saved_model_cli> command, but this is only used for informational purposes.
=head1 TUTORIAL
=head2 Load the library
First, we need to load the C<AI::TensorFlow::Libtensorflow> library and more helpers. We then create an C<AI::TensorFlow::Libtensorflow::Status> object and helper function to make sure that the calls to the C<libtensorflow> C library are working prop...
use strict;
use warnings;
use utf8;
use constant IN_IPERL => !! $ENV{PERL_IPERL_RUNNING};
no if IN_IPERL, warnings => 'redefine'; # fewer messages when re-running cells
lib/AI/TensorFlow/Libtensorflow/Manual/Notebook/InferenceUsingTFHubEnformerGeneExprPredModel.pod view on Meta::CPAN
The given SavedModel SignatureDef contains the following input(s):
The given SavedModel SignatureDef contains the following output(s):
outputs['__saved_model_init_op'] tensor_info:
dtype: DT_INVALID
shape: unknown_rank
name: NoOp
Method name is:
signature_def['serving_default']:
The given SavedModel SignatureDef contains the following input(s):
inputs['args_0'] tensor_info:
dtype: DT_FLOAT
shape: (-1, 393216, 4)
name: serving_default_args_0:0
The given SavedModel SignatureDef contains the following output(s):
outputs['human'] tensor_info:
dtype: DT_FLOAT
shape: (-1, 896, 5313)
name: StatefulPartitionedCall:0
outputs['mouse'] tensor_info:
dtype: DT_FLOAT
shape: (-1, 896, 1643)
name: StatefulPartitionedCall:1
Method name is: tensorflow/serving/predict
Concrete Functions:
Function Name: 'predict_on_batch'
Option #1
Callable with:
Argument #1
args_0: TensorSpec(shape=(None, 393216, 4), dtype=tf.float32, name='args_0')
B<RESULT>:
1
We want to use the C<serve> tag-set and
=over
=item *
the input C<args_0> which has the name C<serving_default_args_0:0> and
=item *
the output C<human> which has the name C<StatefulPartitionedCall:0>.
=back
all of which are C<DT_FLOAT>.
Make note of the shapes that those take. Per the L<model description|https://tfhub.dev/deepmind/enformer/1> at TensorFlow Hub:
=over 2
The input sequence length is 393,216 with the prediction corresponding to 128 base pair windows for the center 114,688 base pairs. The input sequence is one hot encoded using the order of indices corresponding to 'ACGT' with N values being all zeros.
=back
The input shape C<(-1, 393216, 4)> thus represents dimensions C<[batch size] x [sequence length] x [one-hot encoding of ACGT]>.
The output shape C<(-1, 896, 5313)> represents dimensions C<[batch size] x [ predictions along 114,688 base pairs / 128 base pair windows ] x [ human target by index ]>. We can confirm this by doing some calculations:
my $model_central_base_pairs_length = 114_688; # bp
my $model_central_base_pair_window_size = 128; # bp / prediction
say "Number of predictions: ", $model_central_base_pairs_length / $model_central_base_pair_window_size;
B<STREAM (STDOUT)>:
Number of predictions: 896
B<RESULT>:
1
and by looking at the targets file:
use Data::Frame;
my $df = Data::Frame->from_csv( $targets_path, sep => "\t" )
->transform({
file => sub {
my ($col, $df) = @_;
# clean up the paths in 'file' column
[map { join "/", (split('/', $_))[7..8] } $col->list];
}
});
say "Number of targets: ", $df->nrow;
say "";
say "First 5:";
say $df->head(5);
B<STREAM (STDOUT)>:
Number of targets: 5313
First 5:
------------------------------------------------------------------------------------------------------------------------------------------------
index genome identifier file clip scale sum_stat description
------------------------------------------------------------------------------------------------------------------------------------------------
0 0 0 ENCFF833POA encode/ENCSR000EIJ 32 2 mean DNASE:cerebellum male adult (27 years) and male adult (35 years)
1 1 0 ENCFF110QGM encode/ENCSR000EIK 32 2 mean DNASE:frontal cortex male adult (27 years) and male adult (35 years)
2 2 0 ENCFF880MKD encode/ENCSR000EIL 32 2 mean DNASE:chorion
3 3 0 ENCFF463ZLQ encode/ENCSR000EIP 32 2 mean DNASE:Ishikawa treated with 0.02% dimethyl sulfoxide for 1 hour
4 4 0 ENCFF890OGQ encode/ENCSR000EIS 32 2 mean DNASE:GM03348
------------------------------------------------------------------------------------------------------------------------------------------------
B<RESULT>:
1
=head2 Load the model
Let's now load the model in Perl and get the inputs and outputs into a data structure by name.
my $opt = AI::TensorFlow::Libtensorflow::SessionOptions->New;
lib/AI/TensorFlow/Libtensorflow/Manual/Notebook/InferenceUsingTFHubEnformerGeneExprPredModel.pod view on Meta::CPAN
);
p %puts;
B<STREAM (STDERR)>:
=for html <span style="display:inline-block;margin-left:1em;"><pre style="display: block"><code><span style="color: #33ccff;">{</span><span style="">
</span><span style="color: #6666cc;">inputs_args_0</span><span style="color: #33ccff;"> </span><span style="color: #cc66cc;">AI::TensorFlow::Libtensorflow::Output</span><span style=""> </span><span style="color: #33ccff;">{</span><span style=""...
</span><span style="color: #6666cc;">index</span><span style="color: #33ccff;"> </span><span style="color: #ff6633;">0</span><span style="color: #33ccff;">,</span><span style="">
</span><span style="color: #6666cc;">oper</span><span style=""> </span><span style="color: #33ccff;"> </span><span style="color: #cc66cc;">AI::TensorFlow::Libtensorflow::Operation</span><span style=""> </span><span style="color: #33ccff;">{...
</span><span style="color: #6666cc;">Name</span><span style=""> </span><span style="color: #33ccff;"> </span><span style="color: #33ccff;">"</span><span style="color: #669933;">serving_default_args_0</span><span style="color: ...
</span><span style="color: #6666cc;">NumInputs</span><span style=""> </span><span style="color: #33ccff;"> </span><span style="color: #ff6633;">0</span><span style="color: #33ccff;">,</span><span style="">
</span><span style="color: #6666cc;">NumOutputs</span><span style="color: #33ccff;"> </span><span style="color: #ff6633;">1</span><span style="color: #33ccff;">,</span><span style="">
</span><span style="color: #6666cc;">OpType</span><span style=""> </span><span style="color: #33ccff;"> </span><span style="color: #33ccff;">"</span><span style="color: #669933;">Placeholder</span><span style="color: #33ccff;">&...
</span><span style="color: #33ccff;">}</span><span style="">
</span><span style="color: #33ccff;">}</span><span style="color: #33ccff;">,</span><span style="">
</span><span style="color: #6666cc;">outputs_human</span><span style="color: #33ccff;"> </span><span style="color: #cc66cc;">AI::TensorFlow::Libtensorflow::Output</span><span style=""> </span><span style="color: #33ccff;">{</span><span style=""...
</span><span style="color: #6666cc;">index</span><span style="color: #33ccff;"> </span><span style="color: #ff6633;">0</span><span style="color: #33ccff;">,</span><span style="">
</span><span style="color: #6666cc;">oper</span><span style=""> </span><span style="color: #33ccff;"> </span><span style="color: #cc66cc;">AI::TensorFlow::Libtensorflow::Operation</span><span style=""> </span><span style="color: #33ccff;">{...
</span><span style="color: #6666cc;">Name</span><span style=""> </span><span style="color: #33ccff;"> </span><span style="color: #33ccff;">"</span><span style="color: #669933;">StatefulPartitionedCall</span><span style="color:...
</span><span style="color: #6666cc;">NumInputs</span><span style=""> </span><span style="color: #33ccff;"> </span><span style="color: #ff6633;">274</span><span style="color: #33ccff;">,</span><span style="">
</span><span style="color: #6666cc;">NumOutputs</span><span style="color: #33ccff;"> </span><span style="color: #ff6633;">2</span><span style="color: #33ccff;">,</span><span style="">
</span><span style="color: #6666cc;">OpType</span><span style=""> </span><span style="color: #33ccff;"> </span><span style="color: #33ccff;">"</span><span style="color: #669933;">StatefulPartitionedCall</span><span style="color:...
</span><span style="color: #33ccff;">}</span><span style="">
</span><span style="color: #33ccff;">}</span><span style="color: #33ccff;">,</span><span style="">
</span><span style="color: #6666cc;">outputs_mouse</span><span style="color: #33ccff;"> </span><span style="color: #cc66cc;">AI::TensorFlow::Libtensorflow::Output</span><span style=""> </span><span style="color: #33ccff;">{</span><span style=""...
</span><span style="color: #6666cc;">index</span><span style="color: #33ccff;"> </span><span style="color: #ff6633;">1</span><span style="color: #33ccff;">,</span><span style="">
</span><span style="color: #6666cc;">oper</span><span style=""> </span><span style="color: #33ccff;"> </span><span style="color: #cc66cc;">AI::TensorFlow::Libtensorflow::Operation</span><span style=""> </span><span style="color: #33ccff;">{...
</span><span style="color: #6666cc;">Name</span><span style=""> </span><span style="color: #33ccff;"> </span><span style="color: #33ccff;">"</span><span style="color: #669933;">StatefulPartitionedCall</span><span style="color:...
</span><span style="color: #6666cc;">NumInputs</span><span style=""> </span><span style="color: #33ccff;"> </span><span style="color: #ff6633;">274</span><span style="color: #33ccff;">,</span><span style="">
</span><span style="color: #6666cc;">NumOutputs</span><span style="color: #33ccff;"> </span><span style="color: #ff6633;">2</span><span style="color: #33ccff;">,</span><span style="">
</span><span style="color: #6666cc;">OpType</span><span style=""> </span><span style="color: #33ccff;"> </span><span style="color: #33ccff;">"</span><span style="color: #669933;">StatefulPartitionedCall</span><span style="color:...
</span><span style="color: #33ccff;">}</span><span style="">
</span><span style="color: #33ccff;">}</span><span style="">
</span><span style="color: #33ccff;">}</span><span style="">
</span></code></pre></span>
We need a helper to simplify running the session and getting just the predictions that we want.
# Run the Enformer session on one input tensor and return only the
# 'human' output head. NOTE(review): captures %puts and the status
# object $s from the enclosing scope; $t is presumably a float32
# TFTensor of shape [batch, 393216, 4] per the SavedModel signature —
# confirm at the call site.
my $predict_on_batch = sub {
my ($session, $t) = @_;
my @outputs_t;
# Positional Run() arguments: run options, input ports + input tensors,
# output ports + array to receive output tensors, target operations,
# run metadata, status object.
$session->Run(
undef,
[$puts{inputs_args_0}], [$t],
[$puts{outputs_human}], \@outputs_t,
undef,
undef,
$s
);
AssertOK($s);
# Single requested output, so the prediction is the first element.
return $outputs_t[0];
};
undef;
=head2 Encoding the data
The model specifies that the way to get a sequence of DNA bases into a C<TFTensor> is to use L<one-hot encoding|https://en.wikipedia.org/wiki/One-hot#Machine_learning_and_statistics> in the order C<ACGT>.
This means that the bases are represented as vectors of length 4:
| base | vector encoding |
|------|-----------------|
| A | C<[1 0 0 0]> |
| C | C<[0 1 0 0]> |
| G | C<[0 0 1 0]> |
| T | C<[0 0 0 1]> |
| N | C<[0 0 0 0]> |
We can achieve this encoding by creating a lookup table with a PDL ndarray. This could be done by creating a byte PDL ndarray of dimensions C<[ 256 4 ]> to directly look up the numeric value of characters 0-255, but here we'll go with a smaller C...
use PDL;
our $SHOW_ENCODER = 1;
# One-hot encode a DNA sequence string into a float PDL ndarray with
# first dimension 4 (base channel) and second dimension the sequence
# length. Base index 0 (N, or anything unrecognized) maps to all zeros;
# A, C, G, T map to the four unit vectors, as the model expects.
sub one_hot_dna {
my ($seq) = @_;
# Translate bytes N,A,C,G,T -> small integers 0..4.
my $from_alphabet = "NACGT";
my $to_alphabet = pack "C*", 0..length($from_alphabet)-1;
# sequences from UCSC genome have both uppercase and lowercase bases
my $from_alphabet_tr = $from_alphabet . lc $from_alphabet;
my $to_alphabet_tr = $to_alphabet x 2;
# NOTE(review): bytes::length is called without a visible `use bytes ()`
# here — confirm bytes.pm is loaded elsewhere before this runs.
my $p = zeros(byte, bytes::length($seq));
my $p_dataref = $p->get_dataref;
# Copy the sequence directly into the ndarray's underlying buffer.
${ $p_dataref } = $seq;
# tr/// does not interpolate variables, so the translation table is
# compiled via string eval; this rewrites the buffer in place.
eval "tr/$from_alphabet_tr/$to_alphabet_tr/" for ${ $p_dataref };
$p->upd_data;
# Lookup table: an all-zeros vector for N prepended to a 4x4 identity.
my $encoder = append(float(0), identity(float(length($from_alphabet)-1)) );
say "Encoder is\n", $encoder->info, $encoder if $SHOW_ENCODER;
# Select one encoder column per base index.
my $encoded = $encoder->index( $p->dummy(0) );
return $encoded;
}
####
# Smoke-test the encoder on a mixed-case sequence that covers every
# base; the bare block keeps the test lexicals from leaking.
{
say "Testing one-hot encoding:\n";
my $onehot_test_seq = "ACGTNtgcan";
my $test_encoded = one_hot_dna( $onehot_test_seq );
# Only print the encoder table on this first call.
$SHOW_ENCODER = 0;
say "One-hot encoding of sequence '$onehot_test_seq' is:";
say $test_encoded->info, $test_encoded;
}
B<STREAM (STDOUT)>:
Testing one-hot encoding:
Encoder is
PDL: Float D [5,4]
[
[0 1 0 0 0]
[0 0 1 0 0]
[0 0 0 1 0]
[0 0 0 0 1]
]
One-hot encoding of sequence 'ACGTNtgcan' is:
PDL: Float D [4,10]
[
[1 0 0 0]
[0 1 0 0]
[0 0 1 0]
[0 0 0 1]
[0 0 0 0]
[0 0 0 1]
[0 0 1 0]
[0 1 0 0]
[1 0 0 0]
[0 0 0 0]
]
B<RESULT>:
1
Note that in the above, the PDL ndarray's
=over
=item *
first dimension is 4 which matches the last dimension of the input C<TFTensor>;
=item *
second dimension is the sequence length which matches the penultimate dimension of the input C<TFTensor>.
=back
Now we need a way to deal with the sequence interval. We're going to use 1-based coordinates as BioPerl does. In fact, we'll extend a BioPerl class.
package Interval {
use Bio::Location::Simple ();
use parent qw(Bio::Location::Simple);
sub center {
my $self = shift;
my $center = int( ($self->start + $self->end ) / 2 );
my $delta = ($self->start + $self->end ) % 2;
return $center + $delta;
}
sub resize {
my ($self, $width) = @_;
my $new_interval = $self->clone;
my $center = $self->center;
my $half = int( ($width-1) / 2 );
my $offset = ($width-1) % 2;
$new_interval->start( $center - $half - $offset );
$new_interval->end( $center + $half );
return $new_interval;
}
( run in 0.753 second using v1.01-cache-2.11-cpan-4991d5b9bd9 )