AI-TensorFlow-Libtensorflow
view release on metacpan or search on metacpan
lib/AI/TensorFlow/Libtensorflow/Manual/Notebook/InferenceUsingTFHubEnformerGeneExprPredModel.pod view on Meta::CPAN
say "First 5:";
say $df->head(5);
B<STREAM (STDOUT)>:
Number of targets: 5313
First 5:
------------------------------------------------------------------------------------------------------------------------------------------------
index genome identifier file clip scale sum_stat description
------------------------------------------------------------------------------------------------------------------------------------------------
0 0 0 ENCFF833POA encode/ENCSR000EIJ 32 2 mean DNASE:cerebellum male adult (27 years) and male adult (35 years)
1 1 0 ENCFF110QGM encode/ENCSR000EIK 32 2 mean DNASE:frontal cortex male adult (27 years) and male adult (35 years)
2 2 0 ENCFF880MKD encode/ENCSR000EIL 32 2 mean DNASE:chorion
3 3 0 ENCFF463ZLQ encode/ENCSR000EIP 32 2 mean DNASE:Ishikawa treated with 0.02% dimethyl sulfoxide for 1 hour
4 4 0 ENCFF890OGQ encode/ENCSR000EIS 32 2 mean DNASE:GM03348
------------------------------------------------------------------------------------------------------------------------------------------------
B<RESULT>:
1
=head2 Load the model
Let's now load the model in Perl and get the inputs and outputs into a data structure by name.
my $opt = AI::TensorFlow::Libtensorflow::SessionOptions->New;
my @tags = ( 'serve' );
my $graph = AI::TensorFlow::Libtensorflow::Graph->New;
my $session = AI::TensorFlow::Libtensorflow::Session->LoadFromSavedModel(
$opt, undef, $new_model_base, \@tags, $graph, undef, $s
);
AssertOK($s);
my %puts = (
## Inputs
inputs_args_0 =>
AI::TensorFlow::Libtensorflow::Output->New({
oper => $graph->OperationByName('serving_default_args_0'),
index => 0,
}),
## Outputs
outputs_human =>
AI::TensorFlow::Libtensorflow::Output->New({
oper => $graph->OperationByName('StatefulPartitionedCall'),
index => 0,
}),
outputs_mouse =>
AI::TensorFlow::Libtensorflow::Output->New({
oper => $graph->OperationByName('StatefulPartitionedCall'),
index => 1,
}),
);
p %puts;
B<STREAM (STDERR)>:
=for html <span style="display:inline-block;margin-left:1em;"><pre style="display: block"><code><span style="color: #33ccff;">{</span><span style="">
</span><span style="color: #6666cc;">inputs_args_0</span><span style="color: #33ccff;"> </span><span style="color: #cc66cc;">AI::TensorFlow::Libtensorflow::Output</span><span style=""> </span><span style="color: #33ccff;">{</span><span style=""...
</span><span style="color: #6666cc;">index</span><span style="color: #33ccff;"> </span><span style="color: #ff6633;">0</span><span style="color: #33ccff;">,</span><span style="">
</span><span style="color: #6666cc;">oper</span><span style=""> </span><span style="color: #33ccff;"> </span><span style="color: #cc66cc;">AI::TensorFlow::Libtensorflow::Operation</span><span style=""> </span><span style="color: #33ccff;">{...
</span><span style="color: #6666cc;">Name</span><span style=""> </span><span style="color: #33ccff;"> </span><span style="color: #33ccff;">"</span><span style="color: #669933;">serving_default_args_0</span><span style="color: ...
</span><span style="color: #6666cc;">NumInputs</span><span style=""> </span><span style="color: #33ccff;"> </span><span style="color: #ff6633;">0</span><span style="color: #33ccff;">,</span><span style="">
</span><span style="color: #6666cc;">NumOutputs</span><span style="color: #33ccff;"> </span><span style="color: #ff6633;">1</span><span style="color: #33ccff;">,</span><span style="">
</span><span style="color: #6666cc;">OpType</span><span style=""> </span><span style="color: #33ccff;"> </span><span style="color: #33ccff;">"</span><span style="color: #669933;">Placeholder</span><span style="color: #33ccff;">&...
</span><span style="color: #33ccff;">}</span><span style="">
</span><span style="color: #33ccff;">}</span><span style="color: #33ccff;">,</span><span style="">
</span><span style="color: #6666cc;">outputs_human</span><span style="color: #33ccff;"> </span><span style="color: #cc66cc;">AI::TensorFlow::Libtensorflow::Output</span><span style=""> </span><span style="color: #33ccff;">{</span><span style=""...
</span><span style="color: #6666cc;">index</span><span style="color: #33ccff;"> </span><span style="color: #ff6633;">0</span><span style="color: #33ccff;">,</span><span style="">
</span><span style="color: #6666cc;">oper</span><span style=""> </span><span style="color: #33ccff;"> </span><span style="color: #cc66cc;">AI::TensorFlow::Libtensorflow::Operation</span><span style=""> </span><span style="color: #33ccff;">{...
</span><span style="color: #6666cc;">Name</span><span style=""> </span><span style="color: #33ccff;"> </span><span style="color: #33ccff;">"</span><span style="color: #669933;">StatefulPartitionedCall</span><span style="color:...
</span><span style="color: #6666cc;">NumInputs</span><span style=""> </span><span style="color: #33ccff;"> </span><span style="color: #ff6633;">274</span><span style="color: #33ccff;">,</span><span style="">
</span><span style="color: #6666cc;">NumOutputs</span><span style="color: #33ccff;"> </span><span style="color: #ff6633;">2</span><span style="color: #33ccff;">,</span><span style="">
</span><span style="color: #6666cc;">OpType</span><span style=""> </span><span style="color: #33ccff;"> </span><span style="color: #33ccff;">"</span><span style="color: #669933;">StatefulPartitionedCall</span><span style="color:...
</span><span style="color: #33ccff;">}</span><span style="">
</span><span style="color: #33ccff;">}</span><span style="color: #33ccff;">,</span><span style="">
</span><span style="color: #6666cc;">outputs_mouse</span><span style="color: #33ccff;"> </span><span style="color: #cc66cc;">AI::TensorFlow::Libtensorflow::Output</span><span style=""> </span><span style="color: #33ccff;">{</span><span style=""...
</span><span style="color: #6666cc;">index</span><span style="color: #33ccff;"> </span><span style="color: #ff6633;">1</span><span style="color: #33ccff;">,</span><span style="">
</span><span style="color: #6666cc;">oper</span><span style=""> </span><span style="color: #33ccff;"> </span><span style="color: #cc66cc;">AI::TensorFlow::Libtensorflow::Operation</span><span style=""> </span><span style="color: #33ccff;">{...
</span><span style="color: #6666cc;">Name</span><span style=""> </span><span style="color: #33ccff;"> </span><span style="color: #33ccff;">"</span><span style="color: #669933;">StatefulPartitionedCall</span><span style="color:...
</span><span style="color: #6666cc;">NumInputs</span><span style=""> </span><span style="color: #33ccff;"> </span><span style="color: #ff6633;">274</span><span style="color: #33ccff;">,</span><span style="">
</span><span style="color: #6666cc;">NumOutputs</span><span style="color: #33ccff;"> </span><span style="color: #ff6633;">2</span><span style="color: #33ccff;">,</span><span style="">
</span><span style="color: #6666cc;">OpType</span><span style=""> </span><span style="color: #33ccff;"> </span><span style="color: #33ccff;">"</span><span style="color: #669933;">StatefulPartitionedCall</span><span style="color:...
</span><span style="color: #33ccff;">}</span><span style="">
</span><span style="color: #33ccff;">}</span><span style="">
</span><span style="color: #33ccff;">}</span><span style="">
</span></code></pre></span>
We need a helper to simplify running the session and getting just the predictions that we want.
my $predict_on_batch = sub {
my ($session, $t) = @_;
my @outputs_t;
$session->Run(
undef,
[$puts{inputs_args_0}], [$t],
[$puts{outputs_human}], \@outputs_t,
undef,
undef,
$s
);
AssertOK($s);
return $outputs_t[0];
};
undef;
=head2 Encoding the data
The model specifies that the way to get a sequence of DNA bases into a C<TFTensor> is to use L<one-hot encoding|https://en.wikipedia.org/wiki/One-hot#Machine_learning_and_statistics> in the order C<ACGT>.
This means that the bases are represented as vectors of length 4:
| base | vector encoding |
|------|-----------------|
| A | C<[1 0 0 0]> |
| C | C<[0 1 0 0]> |
| G | C<[0 0 1 0]> |
| T | C<[0 0 0 1]> |
| N | C<[0 0 0 0]> |
We can achieve this encoding by creating a lookup table with a PDL ndarray. This could be done by creating a byte PDL ndarray of dimensions C<[ 256 4 ]> to directly look up the the numeric value of characters 0-255, but here we'll go with a smaller C...
use PDL;
our $SHOW_ENCODER = 1;
sub one_hot_dna {
my ($seq) = @_;
my $from_alphabet = "NACGT";
my $to_alphabet = pack "C*", 0..length($from_alphabet)-1;
# sequences from UCSC genome have both uppercase and lowercase bases
my $from_alphabet_tr = $from_alphabet . lc $from_alphabet;
my $to_alphabet_tr = $to_alphabet x 2;
my $p = zeros(byte, bytes::length($seq));
my $p_dataref = $p->get_dataref;
${ $p_dataref } = $seq;
eval "tr/$from_alphabet_tr/$to_alphabet_tr/" for ${ $p_dataref };
$p->upd_data;
my $encoder = append(float(0), identity(float(length($from_alphabet)-1)) );
lib/AI/TensorFlow/Libtensorflow/Manual/Notebook/InferenceUsingTFHubEnformerGeneExprPredModel.pod view on Meta::CPAN
die "Target interval is not $model_central_base_pairs_length bp long"
unless $target_interval->length == $model_central_base_pairs_length;
say "Target sequence is ", seq_info extract_sequence( $hg_db, $target_interval );
say "";
my $resized_interval = $target_interval->resize( $model_sequence_length );
say "Resized interval: $resized_interval with length @{[ $resized_interval->length ]}";
die "resize() is not working properly!" unless $resized_interval->length == $model_sequence_length;
my $seq = extract_sequence( $hg_db, $resized_interval );
say "Resized sequence is ", seq_info($seq);
B<STREAM (STDOUT)>:
Target interval: chr11:35082743..35197430 with length 114688
Target sequence is GGTGGCAGCC...ATCTCCTTTT (length 114688)
Resized interval: chr11:34943479..35336694 with length 393216
Resized sequence is ACTAGTTCTA...GGCCCAAATC (length 393216)
B<RESULT>:
1
To prepare the input we have to one-hot encode this resized sequence and give it a dummy dimension at the end to indicate that it is is a batch with a single sequence. Then we can turn the PDL ndarray into a C<TFTensor> and pass it to our prediction ...
my $sequence_one_hot = one_hot_dna( $seq )->dummy(-1);
say $sequence_one_hot->info; undef;
B<STREAM (STDOUT)>:
PDL: Float D [4,393216,1]
use Devel::Timer;
my $t = Devel::Timer->new;
$t->mark('prediction of sequence');
my $predictions = $predict_on_batch->( $session, FloatPDLTOTFTensor( $sequence_one_hot ) );
$t->mark('End of prediction of sequence');
p $predictions;
$t->mark('END');
$t->report();
B<STREAM (STDERR)>:
=begin html
<span style="display:inline-block;margin-left:1em;"><pre style="display: block"><code><span style="color: #cc66cc;">AI::TensorFlow::Libtensorflow::Tensor</span><span style=""> </span><span style="color: #33ccff;">{</span><span style="">
</span><span style="color: #6666cc;">Type </span><span style=""> </span><span style="color: #cc66cc;">FLOAT</span><span style="">
</span><span style="color: #6666cc;">Dims </span><span style=""> </span><span style="color: #33ccff;">[</span><span style=""> </span><span style="color: #ff6633;">1</span><span style=""> </span><span style="color: #ff6633;">896</span><s...
</span><span style="color: #6666cc;">NumDims </span><span style=""> </span><span style="color: #ff6633;">3</span><span style="">
</span><span style="color: #6666cc;">ElementCount </span><span style=""> </span><span style="color: #ff6633;">4760448</span><span style="">
</span><span style="color: #33ccff;">}</span><span style="">
Devel::Timer Report -- Total time: 14.5641 secs
Interval Time Percent
----------------------------------------------
01 -> 02 14.5634 100.00% prediction of sequence -> End of prediction of sequence
02 -> 03 0.0007 0.00% End of prediction of sequence -> END
00 -> 01 0.0000 0.00% INIT -> prediction of sequence
</span></code></pre></span>
=end html
Now we turn the C<TFTensor> output into a PDL ndarray.
my $predictions_p = FloatTFTensorToPDL($predictions)->slice(',,(0)');
say $predictions_p->info; undef;
B<STREAM (STDOUT)>:
PDL: Float D [5313,896]
=head2 Plot predicted tracks
These predictions can be plotted
my @tracks = (
[ 'DNASE:CD14-positive monocyte female' => 41 => $predictions_p->slice('(41)') ],
[ 'DNASE:keratinocyte female' => 42 => $predictions_p->slice('(42)') ],
[ 'CHIP:H3K27ac:keratinocyte female' => 706 => $predictions_p->slice('(706)')],
[ 'CAGE:Keratinocyte - epidermal' => 4799 => log10(1 + $predictions_p->slice('(4799)')) ],
);
use PDL::Graphics::Gnuplot;
my $plot_output_path = 'enformer-target-interval-tracks.png';
my $gp = gpwin('pngcairo', font => ",10", output => $plot_output_path, size => [10,2. * @tracks], aa => 2 );
$gp->multiplot( layout => [1, scalar @tracks], title => $target_interval );
$gp->options(
offsets => [ graph => "0.01, 0, 0, 0" ],
lmargin => "at screen 0.05",
);
my $x = zeroes($predictions_p->dim(1))->xlinvals($target_interval->start, $target_interval->end);
my @tics_opts = (mirror => 0, out => 1);
for my $i (0..$#tracks) {
my ($title, $id, $y) = @{$tracks[$i]};
$gp->plot( {
title => $title,
border => [2],
ytics => { @tics_opts, locations => [ ceil(($y->max-$y->min)/2)->sclr ] },
( $i == $#tracks
? ( xtics => { format => '%.3f', @tics_opts } )
: ( xtics => 0 ) ),
( $i == $#tracks ? ( xlabel => 'location ({/Symbol \264}10^7 bases)' ) : () ),
},
with => 'filledcurves',
#'lc' => '#086eb5',
# $x scaled by 1e7; filled curve between $y and the x-axis
$x / 1e7, $y, pdl(0)
);
}
$gp->end_multi;
$gp->close;
if( IN_IPERL ) {
IPerl->png( bytestream => path($plot_output_path)->slurp_raw );
}
B<DISPLAY>:
=for html <span style="display:inline-block;margin-left:1em;"><p><img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAA+gAAAMgCAIAAAA/et9qAAAgAElEQVR4nOzdd2AUVeIH8Ddb0jshBAIEpSo1GjoIpyAgCOqd3uGdoGBBUQQFRUVBRbkTf9gOBQucqFiwUhSSgJQYCCSBkJBAet1k...
=head2 Parts of the original notebook that fall outside the scope
In the orignal notebook, there are several more steps that have not been ported here:
=over
=item 1.
"Compute contribution scores":
This task requires implementing C<@tf.function> to compile gradients.
=item 2.
"Predict the effect of a genetic variant" and "Score multiple variants":
The first task is possible, but the second task requires loading a pre-processing pipeline for scikit-learn and unfortunately this pipeline is stored as a pickle file that is valid for an older version of scikit-learn (version 0.23.2) and as such its...
=back
# Some code that could be used for working with variants.
1 if <<'COMMENT';
use Bio::DB::HTS::VCF;
my $clinvar_tbi_path = "${clinvar_path}.tbi";
unless( -f $clinvar_tbi_path ) {
system( qw(tabix), $clinvar_path );
}
my $v = Bio::DB::HTS::VCF->new( filename => $clinvar_path );
$v->num_variants
COMMENT
undef;
=head1 RESOURCE USAGE
use Filesys::DiskUsage qw/du/;
my $total = du( { 'human-readable' => 1, dereference => 1 },
$model_archive_path, $model_base, $new_model_base,
$targets_path,
$hg_gz_path,
$hg_bgz_path, $hg_bgz_fai_path,
$clinvar_path,
$plot_output_path,
);
say "Disk space usage: $total"; undef;
B<STREAM (STDOUT)>:
Disk space usage: 4.66G
( run in 0.728 second using v1.01-cache-2.11-cpan-39bf76dae61 )