AI-TensorFlow-Libtensorflow
view release on metacpan or search on metacpan
lib/AI/TensorFlow/Libtensorflow/Manual/Notebook/InferenceUsingTFHubEnformerGeneExprPredModel.pod view on Meta::CPAN
);
# Show the signatures that the original model directory exposes.
saved_model_cli( qw(show),
    qw(--dir) => $model_base,
    qw(--all),
);

# Re-save the model under "${model_base}_new" using TensorFlow in Python.
# The Python script loads the SavedModel at the first path and saves the
# `.model` attribute of the loaded object to the second path. The step is
# skipped when the target path already exists.
my $new_model_base = "${model_base}_new";
if( ! -e $new_model_base ) {
    # Non-interpolating heredoc: the Python source is handed to python3
    # verbatim, so Perl never tries to expand anything inside it.
    my $resave_script = <<'EOF';
import sys
import tensorflow as tf
in_path, out_path = sys.argv[1:3]
imported_model = tf.saved_model.load(in_path).model
tf.saved_model.save( imported_model , out_path )
EOF

    # List-form system() avoids the shell. Fail loudly here instead of
    # letting a missing model directory surface later as an unrelated error.
    my $status = system( qw(python3), qw(-c) => $resave_script, $model_base, $new_model_base );
    die "Could not run python3 to re-save the model: $!"
        if $status == -1;
    die sprintf( "Re-saving the model to %s failed (python3 wait status %d)", $new_model_base, $status )
        if $status != 0;
}

# Show the signatures of the re-saved model.
saved_model_cli( qw(show),
    qw(--dir) => $new_model_base,
    qw(--all),
);
# Number of predictions along the sequence axis: the central region length
# divided by the size of each prediction window.
my $model_central_base_pairs_length     = 114_688; # bp
my $model_central_base_pair_window_size = 128;     # bp / prediction
say "Number of predictions: ", $model_central_base_pairs_length / $model_central_base_pair_window_size;

use Data::Frame;

# Read the tab-separated targets file and rewrite its 'file' column so that
# each entry keeps only the '/'-separated path components at indices 7 and 8.
my $df = Data::Frame->from_csv( $targets_path, sep => "\t" )
    ->transform({
        file => sub {
            my ($file_column) = @_;
            my @short_paths;
            for my $path ($file_column->list) {
                my @kept = (split m{/}, $path)[7 .. 8];
                push @short_paths, join( "/", @kept );
            }
            return \@short_paths;
        }
    });

say "Number of targets: ", $df->nrow;
say "";
say "First 5:";
say $df->head(5);
# Load the re-saved model into a fresh graph using the 'serve' tag-set and
# check the status object afterwards.
my $opt   = AI::TensorFlow::Libtensorflow::SessionOptions->New;
my @tags  = ( 'serve' );
my $graph = AI::TensorFlow::Libtensorflow::Graph->New;
my $session = AI::TensorFlow::Libtensorflow::Session->LoadFromSavedModel(
    $opt, undef, $new_model_base, \@tags, $graph, undef, $s
);
AssertOK($s);

# Named inputs and outputs of the model. Each row of the table below is
# [ key in %puts, graph operation name, output index on that operation ].
my %puts = map {
    my ($put_name, $operation_name, $output_index) = @$_;
    (
        $put_name => AI::TensorFlow::Libtensorflow::Output->New({
            oper  => $graph->OperationByName($operation_name),
            index => $output_index,
        })
    );
} (
    ## Inputs
    [ inputs_args_0 => 'serving_default_args_0',  0 ],
    ## Outputs
    [ outputs_human => 'StatefulPartitionedCall', 0 ],
    [ outputs_mouse => 'StatefulPartitionedCall', 1 ],
);

p %puts;
# Run the session on one input tensor and return the first fetched output,
# which is the tensor requested through $puts{outputs_human}.
#
#   $session  session object on which Run is called
#   $t        tensor fed to $puts{inputs_args_0}
#
# The shared status object $s is checked with AssertOK after the run.
my $predict_on_batch = sub {
    my ($session, $t) = @_;

    my @feed_ports   = ( $puts{inputs_args_0} );
    my @feed_values  = ( $t );
    my @fetch_ports  = ( $puts{outputs_human} );
    my @outputs_t;

    $session->Run(
        undef,
        \@feed_ports,  \@feed_values,
        \@fetch_ports, \@outputs_t,
        undef,
        undef,
        $s
    );
    AssertOK($s);

    my ($human_predictions) = @outputs_t;
    return $human_predictions;
};

# NOTE(review): presumably here so the notebook cell does not display the
# code reference as its result; confirm.
undef;
use PDL;
our $SHOW_ENCODER = 1;
sub one_hot_dna {
my ($seq) = @_;
my $from_alphabet = "NACGT";
my $to_alphabet = pack "C*", 0..length($from_alphabet)-1;
# sequences from UCSC genome have both uppercase and lowercase bases
my $from_alphabet_tr = $from_alphabet . lc $from_alphabet;
my $to_alphabet_tr = $to_alphabet x 2;
my $p = zeros(byte, bytes::length($seq));
my $p_dataref = $p->get_dataref;
${ $p_dataref } = $seq;
eval "tr/$from_alphabet_tr/$to_alphabet_tr/" for ${ $p_dataref };
$p->upd_data;
my $encoder = append(float(0), identity(float(length($from_alphabet)-1)) );
say "Encoder is\n", $encoder->info, $encoder if $SHOW_ENCODER;
lib/AI/TensorFlow/Libtensorflow/Manual/Notebook/InferenceUsingTFHubEnformerGeneExprPredModel.pod view on Meta::CPAN
In order to quickly look up sequences in the reference genome FASTA file, we
=over
=item 1.
convert the gzip'd file into a block gzip'd file and
=item 2.
index that C<.bgz> file using C<faidx> from C<samtools>.
=back
# Derive the block-gzip path from the gzip'd genome path by replacing the
# trailing ".gz" with ".bgz".
(my $hg_uncompressed_path = $hg_gz_path) =~ s/\.gz$//;
my $hg_bgz_path = "${hg_uncompressed_path}.bgz";

use IPC::Run;

# Recompress the genome with bgzip unless the .bgz file already exists.
if( ! -e $hg_bgz_path ) {
    # Write to a temporary path and rename only on success. Otherwise a
    # failed or interrupted pipeline would leave a partial .bgz behind, and
    # the existence check above would treat it as complete on the next run.
    my $hg_bgz_tmp_path = "${hg_bgz_path}.tmp";
    my $pipeline_ok = IPC::Run::run(
        [ qw(gunzip -c) ], '<', $hg_gz_path,
        '|',
        [ qw(bgzip -c)  ], '>', $hg_bgz_tmp_path
    );
    if( ! $pipeline_ok ) {
        unlink $hg_bgz_tmp_path;
        die "Could not convert $hg_gz_path to block gzip file $hg_bgz_path";
    }
    rename $hg_bgz_tmp_path, $hg_bgz_path
        or die "Could not rename $hg_bgz_tmp_path to $hg_bgz_path: $!";
}

use Bio::Tools::Run::Samtools;

# Build the faidx index next to the .bgz file unless it already exists.
my $hg_bgz_fai_path = "${hg_bgz_path}.fai";
if( ! -e $hg_bgz_fai_path ) {
    my $faidx_tool = Bio::Tools::Run::Samtools->new( -command => 'faidx' );
    $faidx_tool->run( -fas => $hg_bgz_path )
        or die "Could not index FASTA file $hg_bgz_path: " . $faidx_tool->error_string;
}
=head2 Model input and output specification
Now we create a helper to call C<saved_model_cli> and use it to run C<saved_model_cli scan> to ensure that the model is I/O-free for security reasons.
# Run the `saved_model_cli` command-line tool with the given arguments.
#
# Arguments:
#   @rest  arguments passed through unchanged to `saved_model_cli`
#
# Returns:
#   1   when the command ran and exited with status 0
#   -1  (after a warning) when `saved_model_cli` is not found by File::Which
#
# Dies when the command cannot be started or exits with a non-zero status.
sub saved_model_cli {
    my (@rest) = @_;

    if( ! File::Which::which('saved_model_cli') ) {
        warn "saved_model_cli(): Install the tensorflow Python package to get the `saved_model_cli` command.\n";
        return -1;
    }

    # List-form system() runs the command without going through a shell.
    my $status = system( qw(saved_model_cli), @rest );

    # -1 means the command could not be started at all; $! holds the reason.
    die "Could not run saved_model_cli: $!"
        if $status == -1;

    # Any other non-zero wait status: report exit code and signal separately.
    die sprintf( "Could not run saved_model_cli (exit code %d, signal %d)",
        $status >> 8, $status & 127 )
        if $status != 0;

    return 1;
}
# Announce and run `saved_model_cli scan` on the model directory in
# $model_base.
say "Checking with saved_model_cli scan:";
saved_model_cli( qw(scan),
qw(--dir) => $model_base,
);
B<STREAM (STDOUT)>:
Checking with saved_model_cli scan:
MetaGraph with tag set ['serve'] does not contain the default denylisted ops: {'ReadFile', 'PrintV2', 'WriteFile'}
B<RESULT>:
1
We need to see what the inputs and outputs of this model are so C<saved_model_cli show> should show us that:
# Run `saved_model_cli show --all` on the model directory in $model_base to
# list its signatures.
saved_model_cli( qw(show),
qw(--dir) => $model_base,
qw(--all),
);
B<STREAM (STDOUT)>:
MetaGraphDef with tag-set: 'serve' contains the following SignatureDefs:
signature_def['__saved_model_init_op']:
The given SavedModel SignatureDef contains the following input(s):
The given SavedModel SignatureDef contains the following output(s):
outputs['__saved_model_init_op'] tensor_info:
dtype: DT_INVALID
shape: unknown_rank
name: NoOp
Method name is:
Concrete Functions:
B<RESULT>:
1
It appears that it does not! What we can do is load the model using C<tensorflow> in Python and then save it to a new path. Now when we run C<saved_model_cli show> on this new model path, it shows the correct inputs and outputs.
# Re-save the model under "${model_base}_new" using TensorFlow in Python.
# The Python script loads the SavedModel at the first path and saves the
# `.model` attribute of the loaded object to the second path. The step is
# skipped when the target path already exists.
my $new_model_base = "${model_base}_new";
if( ! -e $new_model_base ) {
    # Non-interpolating heredoc: the Python source is handed to python3
    # verbatim, so Perl never tries to expand anything inside it.
    my $resave_script = <<'EOF';
import sys
import tensorflow as tf
in_path, out_path = sys.argv[1:3]
imported_model = tf.saved_model.load(in_path).model
tf.saved_model.save( imported_model , out_path )
EOF

    # List-form system() avoids the shell. Fail loudly here instead of
    # letting a missing model directory surface later as an unrelated error.
    my $status = system( qw(python3), qw(-c) => $resave_script, $model_base, $new_model_base );
    die "Could not run python3 to re-save the model: $!"
        if $status == -1;
    die sprintf( "Re-saving the model to %s failed (python3 wait status %d)", $new_model_base, $status )
        if $status != 0;
}

# Show the signatures of the re-saved model.
saved_model_cli( qw(show),
    qw(--dir) => $new_model_base,
    qw(--all),
);
B<STREAM (STDOUT)>:
MetaGraphDef with tag-set: 'serve' contains the following SignatureDefs:
signature_def['__saved_model_init_op']:
The given SavedModel SignatureDef contains the following input(s):
The given SavedModel SignatureDef contains the following output(s):
outputs['__saved_model_init_op'] tensor_info:
dtype: DT_INVALID
shape: unknown_rank
name: NoOp
Method name is:
signature_def['serving_default']:
The given SavedModel SignatureDef contains the following input(s):
inputs['args_0'] tensor_info:
dtype: DT_FLOAT
shape: (-1, 393216, 4)
name: serving_default_args_0:0
The given SavedModel SignatureDef contains the following output(s):
outputs['human'] tensor_info:
dtype: DT_FLOAT
shape: (-1, 896, 5313)
name: StatefulPartitionedCall:0
outputs['mouse'] tensor_info:
dtype: DT_FLOAT
shape: (-1, 896, 1643)
name: StatefulPartitionedCall:1
Method name is: tensorflow/serving/predict
Concrete Functions:
Function Name: 'predict_on_batch'
Option #1
Callable with:
Argument #1
args_0: TensorSpec(shape=(None, 393216, 4), dtype=tf.float32, name='args_0')
B<RESULT>:
1
We want to use the C<serve> tag-set and
=over
=item *
the input C<args_0> which has the name C<serving_default_args_0:0> and
=item *
the output C<human> which has the name C<StatefulPartitionedCall:0>.
=back
all of which are C<DT_FLOAT>.
Make note of the shapes that those take. Per the L<model description|https://tfhub.dev/deepmind/enformer/1> at TensorFlow Hub:
=over 2
The input sequence length is 393,216 with the prediction corresponding to 128 base pair windows for the center 114,688 base pairs. The input sequence is one hot encoded using the order of indices corresponding to 'ACGT' with N values being all zeros.
=back
The input shape C<(-1, 393216, 4)> thus represents dimensions C<[batch size] x [sequence length] x [one-hot encoding of ACGT]>.
The output shape C<(-1, 896, 5313)> represents dimensions C<[batch size] x [ predictions along 114,688 base pairs / 128 base pair windows ] x [ human target by index ]>. We can confirm this by doing some calculations:
# Number of predictions along the sequence axis: the central region length
# divided by the size of each prediction window.
my $model_central_base_pairs_length = 114_688; # bp
my $model_central_base_pair_window_size = 128; # bp / prediction
say "Number of predictions: ", $model_central_base_pairs_length / $model_central_base_pair_window_size;
B<STREAM (STDOUT)>:
Number of predictions: 896
B<RESULT>:
1
and by looking at the targets file:
use Data::Frame;

# Read the tab-separated targets file and rewrite its 'file' column so that
# each entry keeps only the '/'-separated path components at indices 7 and 8.
my $df = Data::Frame->from_csv( $targets_path, sep => "\t" )
    ->transform({
        file => sub {
            my ($file_column) = @_;
            my @short_paths;
            for my $path ($file_column->list) {
                my @kept = (split m{/}, $path)[7 .. 8];
                push @short_paths, join( "/", @kept );
            }
            return \@short_paths;
        }
    });

say "Number of targets: ", $df->nrow;
say "";
say "First 5:";
say $df->head(5);
B<STREAM (STDOUT)>:
Number of targets: 5313
First 5:
------------------------------------------------------------------------------------------------------------------------------------------------
index genome identifier file clip scale sum_stat description
------------------------------------------------------------------------------------------------------------------------------------------------
0 0 0 ENCFF833POA encode/ENCSR000EIJ 32 2 mean DNASE:cerebellum male adult (27 years) and male adult (35 years)
1 1 0 ENCFF110QGM encode/ENCSR000EIK 32 2 mean DNASE:frontal cortex male adult (27 years) and male adult (35 years)
2 2 0 ENCFF880MKD encode/ENCSR000EIL 32 2 mean DNASE:chorion
3 3 0 ENCFF463ZLQ encode/ENCSR000EIP 32 2 mean DNASE:Ishikawa treated with 0.02% dimethyl sulfoxide for 1 hour
4 4 0 ENCFF890OGQ encode/ENCSR000EIS 32 2 mean DNASE:GM03348
------------------------------------------------------------------------------------------------------------------------------------------------
B<RESULT>:
1
=head2 Load the model
Let's now load the model in Perl and get the inputs and outputs into a data structure by name.
# Load the re-saved model into a fresh graph using the 'serve' tag-set and
# check the status object afterwards.
my $opt   = AI::TensorFlow::Libtensorflow::SessionOptions->New;
my @tags  = ( 'serve' );
my $graph = AI::TensorFlow::Libtensorflow::Graph->New;
my $session = AI::TensorFlow::Libtensorflow::Session->LoadFromSavedModel(
    $opt, undef, $new_model_base, \@tags, $graph, undef, $s
);
AssertOK($s);

# Named inputs and outputs of the model. Each row of the table below is
# [ key in %puts, graph operation name, output index on that operation ].
my %puts = map {
    my ($put_name, $operation_name, $output_index) = @$_;
    (
        $put_name => AI::TensorFlow::Libtensorflow::Output->New({
            oper  => $graph->OperationByName($operation_name),
            index => $output_index,
        })
    );
} (
    ## Inputs
    [ inputs_args_0 => 'serving_default_args_0',  0 ],
    ## Outputs
    [ outputs_human => 'StatefulPartitionedCall', 0 ],
    [ outputs_mouse => 'StatefulPartitionedCall', 1 ],
);

p %puts;
B<STREAM (STDERR)>:
=for html <span style="display:inline-block;margin-left:1em;"><pre style="display: block"><code><span style="color: #33ccff;">{</span><span style="">
</span><span style="color: #6666cc;">inputs_args_0</span><span style="color: #33ccff;"> </span><span style="color: #cc66cc;">AI::TensorFlow::Libtensorflow::Output</span><span style=""> </span><span style="color: #33ccff;">{</span><span style=""...
</span><span style="color: #6666cc;">index</span><span style="color: #33ccff;"> </span><span style="color: #ff6633;">0</span><span style="color: #33ccff;">,</span><span style="">
</span><span style="color: #6666cc;">oper</span><span style=""> </span><span style="color: #33ccff;"> </span><span style="color: #cc66cc;">AI::TensorFlow::Libtensorflow::Operation</span><span style=""> </span><span style="color: #33ccff;">{...
</span><span style="color: #6666cc;">Name</span><span style=""> </span><span style="color: #33ccff;"> </span><span style="color: #33ccff;">"</span><span style="color: #669933;">serving_default_args_0</span><span style="color: ...
</span><span style="color: #6666cc;">NumInputs</span><span style=""> </span><span style="color: #33ccff;"> </span><span style="color: #ff6633;">0</span><span style="color: #33ccff;">,</span><span style="">
</span><span style="color: #6666cc;">NumOutputs</span><span style="color: #33ccff;"> </span><span style="color: #ff6633;">1</span><span style="color: #33ccff;">,</span><span style="">
</span><span style="color: #6666cc;">OpType</span><span style=""> </span><span style="color: #33ccff;"> </span><span style="color: #33ccff;">"</span><span style="color: #669933;">Placeholder</span><span style="color: #33ccff;">&...
</span><span style="color: #33ccff;">}</span><span style="">
</span><span style="color: #33ccff;">}</span><span style="color: #33ccff;">,</span><span style="">
</span><span style="color: #6666cc;">outputs_human</span><span style="color: #33ccff;"> </span><span style="color: #cc66cc;">AI::TensorFlow::Libtensorflow::Output</span><span style=""> </span><span style="color: #33ccff;">{</span><span style=""...
</span><span style="color: #6666cc;">index</span><span style="color: #33ccff;"> </span><span style="color: #ff6633;">0</span><span style="color: #33ccff;">,</span><span style="">
</span><span style="color: #6666cc;">oper</span><span style=""> </span><span style="color: #33ccff;"> </span><span style="color: #cc66cc;">AI::TensorFlow::Libtensorflow::Operation</span><span style=""> </span><span style="color: #33ccff;">{...
</span><span style="color: #6666cc;">Name</span><span style=""> </span><span style="color: #33ccff;"> </span><span style="color: #33ccff;">"</span><span style="color: #669933;">StatefulPartitionedCall</span><span style="color:...
</span><span style="color: #6666cc;">NumInputs</span><span style=""> </span><span style="color: #33ccff;"> </span><span style="color: #ff6633;">274</span><span style="color: #33ccff;">,</span><span style="">
</span><span style="color: #6666cc;">NumOutputs</span><span style="color: #33ccff;"> </span><span style="color: #ff6633;">2</span><span style="color: #33ccff;">,</span><span style="">
</span><span style="color: #6666cc;">OpType</span><span style=""> </span><span style="color: #33ccff;"> </span><span style="color: #33ccff;">"</span><span style="color: #669933;">StatefulPartitionedCall</span><span style="color:...
</span><span style="color: #33ccff;">}</span><span style="">
</span><span style="color: #33ccff;">}</span><span style="color: #33ccff;">,</span><span style="">
</span><span style="color: #6666cc;">outputs_mouse</span><span style="color: #33ccff;"> </span><span style="color: #cc66cc;">AI::TensorFlow::Libtensorflow::Output</span><span style=""> </span><span style="color: #33ccff;">{</span><span style=""...
</span><span style="color: #6666cc;">index</span><span style="color: #33ccff;"> </span><span style="color: #ff6633;">1</span><span style="color: #33ccff;">,</span><span style="">
</span><span style="color: #6666cc;">oper</span><span style=""> </span><span style="color: #33ccff;"> </span><span style="color: #cc66cc;">AI::TensorFlow::Libtensorflow::Operation</span><span style=""> </span><span style="color: #33ccff;">{...
</span><span style="color: #6666cc;">Name</span><span style=""> </span><span style="color: #33ccff;"> </span><span style="color: #33ccff;">"</span><span style="color: #669933;">StatefulPartitionedCall</span><span style="color:...
</span><span style="color: #6666cc;">NumInputs</span><span style=""> </span><span style="color: #33ccff;"> </span><span style="color: #ff6633;">274</span><span style="color: #33ccff;">,</span><span style="">
</span><span style="color: #6666cc;">NumOutputs</span><span style="color: #33ccff;"> </span><span style="color: #ff6633;">2</span><span style="color: #33ccff;">,</span><span style="">
</span><span style="color: #6666cc;">OpType</span><span style=""> </span><span style="color: #33ccff;"> </span><span style="color: #33ccff;">"</span><span style="color: #669933;">StatefulPartitionedCall</span><span style="color:...
</span><span style="color: #33ccff;">}</span><span style="">
</span><span style="color: #33ccff;">}</span><span style="">
</span><span style="color: #33ccff;">}</span><span style="">
</span></code></pre></span>
We need a helper to simplify running the session and getting just the predictions that we want.
# Run the session on one input tensor and return the first fetched output,
# which is the tensor requested through $puts{outputs_human}.
#
#   $session  session object on which Run is called
#   $t        tensor fed to $puts{inputs_args_0}
#
# The shared status object $s is checked with AssertOK after the run.
my $predict_on_batch = sub {
    my ($session, $t) = @_;

    my @feed_ports   = ( $puts{inputs_args_0} );
    my @feed_values  = ( $t );
    my @fetch_ports  = ( $puts{outputs_human} );
    my @outputs_t;

    $session->Run(
        undef,
        \@feed_ports,  \@feed_values,
        \@fetch_ports, \@outputs_t,
        undef,
        undef,
        $s
    );
    AssertOK($s);

    my ($human_predictions) = @outputs_t;
    return $human_predictions;
};

# NOTE(review): presumably here so the notebook cell does not display the
# code reference as its result; confirm.
undef;
=head2 Encoding the data
The model specifies that the way to get a sequence of DNA bases into a C<TFTensor> is to use L<one-hot encoding|https://en.wikipedia.org/wiki/One-hot#Machine_learning_and_statistics> in the order C<ACGT>.
This means that the bases are represented as vectors of length 4:
| base | vector encoding |
|------|-----------------|
| A | C<[1 0 0 0]> |
| C | C<[0 1 0 0]> |
| G | C<[0 0 1 0]> |
| T | C<[0 0 0 1]> |
| N | C<[0 0 0 0]> |
( run in 0.871 second using v1.01-cache-2.11-cpan-13bb782fe5a )