default results from the CPAN

AI-TensorFlow-Libtensorflow

);

# Ask saved_model_cli to show everything (--all) it knows about the
# SavedModel stored under $model_base.
saved_model_cli( 'show', '--dir', $model_base, '--all' );

# Directory that receives the re-saved copy of the model.
my $new_model_base = "${model_base}_new";

# Load the model with Python's tensorflow and save its `.model` attribute to
# $new_model_base.  Skipped when $new_model_base already exists.
if( ! -e $new_model_base ) {
    # Single-quoted heredoc: the Python source is handed to python3 verbatim,
    # so Perl never interpolates anything inside it.  The exit status is
    # checked so that a failed export stops here instead of surfacing later
    # as a confusing model-loading error.
    system( qw(python3), qw(-c) => <<'EOF', $model_base, $new_model_base ) == 0
import sys
import tensorflow as tf

in_path, out_path  = sys.argv[1:3]

imported_model = tf.saved_model.load(in_path).model
tf.saved_model.save( imported_model , out_path )
EOF
        or die "Could not re-save model $model_base to $new_model_base with python3 (wait status: $?)";
}

# Show the signatures of the re-saved model.
saved_model_cli( qw(show),
    qw(--dir) => $new_model_base,
    qw(--all),
);

# Length of the central region (in bp) and the width of one prediction
# window (in bp per prediction); their ratio is the number of predictions.
my ( $model_central_base_pairs_length, $model_central_base_pair_window_size )
    = ( 114_688, 128 );

say(
    "Number of predictions: ",
    $model_central_base_pairs_length / $model_central_base_pair_window_size,
);

use Data::Frame;

# Read the tab-separated targets table and shorten every entry of its
# 'file' column to path components 7 and 8 (0-based, split on '/').
my $df = do {
    my $raw_targets = Data::Frame->from_csv( $targets_path, sep => "\t" );

    $raw_targets->transform({
        file => sub {
            my ($file_col) = @_;

            my @short_paths;
            for my $path ( $file_col->list ) {
                push @short_paths, join( "/", ( split '/', $path )[7..8] );
            }
            return \@short_paths;
        },
    });
};

say "Number of targets: ", $df->nrow;

# Blank separator line followed by the heading for the preview.
say "\nFirst 5:";
say $df->head(5);

my $opt   = AI::TensorFlow::Libtensorflow::SessionOptions->New;
my $graph = AI::TensorFlow::Libtensorflow::Graph->New;
my @tags  = ( 'serve' );

# Load the re-saved model into $graph; the status object $s is checked
# right after the call.
my $session = AI::TensorFlow::Libtensorflow::Session->LoadFromSavedModel(
    $opt,
    undef,
    $new_model_base,
    \@tags,
    $graph,
    undef,
    $s,
);
AssertOK($s);

# Named handles for the model's input and outputs.  Each row is
# [ key in %puts, graph operation name, output index of that operation ].
my %puts = map {
    my ( $key, $op_name, $index ) = @$_;
    (
        $key => AI::TensorFlow::Libtensorflow::Output->New({
            oper  => $graph->OperationByName($op_name),
            index => $index,
        })
    );
} (
    ## Inputs
    [ inputs_args_0 => 'serving_default_args_0',  0 ],

    ## Outputs
    [ outputs_human => 'StatefulPartitionedCall', 0 ],
    [ outputs_mouse => 'StatefulPartitionedCall', 1 ],
);

p %puts;

# Run the session with one input tensor bound to `inputs_args_0`, request
# only `outputs_human`, and return the first (and only) requested output.
# Dies through AssertOK if the status object $s reports a failure.
my $predict_on_batch = sub {
    my ( $tf_session, $input_tensor ) = @_;

    my @fetched;    # passed by reference to Run; first element is returned
    $tf_session->Run(
        undef,
        [ $puts{inputs_args_0} ], [ $input_tensor ],
        [ $puts{outputs_human} ], \@fetched,
        undef,
        undef,
        $s,
    );
    AssertOK($s);

    my ($human_output) = @fetched;
    return $human_output;
};

undef;

use PDL;

our $SHOW_ENCODER = 1;

sub one_hot_dna {
    my ($seq) = @_;

    my $from_alphabet = "NACGT";
    my $to_alphabet   = pack "C*", 0..length($from_alphabet)-1;

    # sequences from UCSC genome have both uppercase and lowercase bases
    my $from_alphabet_tr = $from_alphabet . lc $from_alphabet;
    my $to_alphabet_tr   = $to_alphabet x 2;

    my $p = zeros(byte, bytes::length($seq));
    my $p_dataref = $p->get_dataref;
    ${ $p_dataref } = $seq;
    eval "tr/$from_alphabet_tr/$to_alphabet_tr/" for ${ $p_dataref };
    $p->upd_data;

    my $encoder = append(float(0), identity(float(length($from_alphabet)-1)) );
    say "Encoder is\n", $encoder->info, $encoder if $SHOW_ENCODER;

lib/AI/TensorFlow/Libtensorflow/Manual/Notebook/InferenceUsingTFHubEnformerGeneExprPredModel.pod view on Meta::CPAN


In order to quickly seek to sequences in the reference genome FASTA file, we

=over

=item 1.

convert the gzip'd file into a block gzip'd file and

=item 2.

index that C<.bgz> file using C<faidx> from C<samtools>.

=back

  # Derive the block-gzip path from the gzip path: drop a trailing ".gz"
  # and append ".bgz".
  (my $hg_uncompressed_path = $hg_gz_path) =~ s/\.gz$//;
  my $hg_bgz_path = "${hg_uncompressed_path}.bgz";
  
  use IPC::Run;
  
  if( ! -e $hg_bgz_path ) {
      # run() returns false when a command in the pipeline fails.  Remove the
      # partial output in that case; otherwise the -e test above would accept
      # the truncated file as complete on the next run.
      IPC::Run::run(
          [ qw(gunzip -c) ], '<', $hg_gz_path,
          '|',
          [ qw(bgzip -c)  ], '>', $hg_bgz_path
      ) or do {
          unlink $hg_bgz_path;
          die "Could not convert $hg_gz_path to block gzip file $hg_bgz_path (status: $?)";
      };
  }
  
  use Bio::Tools::Run::Samtools;
  
  # Build the faidx index next to the .bgz file unless it is already there.
  my $hg_bgz_fai_path = "${hg_bgz_path}.fai";
  if( ! -e $hg_bgz_fai_path ) {
      my $faidx_tool = Bio::Tools::Run::Samtools->new( -command => 'faidx' );
      $faidx_tool->run( -fas => $hg_bgz_path )
          or die "Could not index FASTA file $hg_bgz_path: " . $faidx_tool->error_string;
  }

=head2 Model input and output specification

Now we create a helper to call C<saved_model_cli> and call C<saved_model_cli scan> to ensure that the model is I/O-free for security reasons.

  # Thin wrapper around the `saved_model_cli` command line tool.
  #
  # Arguments: the sub-command and its options, passed through unchanged.
  # Returns -1 (after a warning) when File::Which cannot locate the command;
  # dies when the command exits with a non-zero status.
  sub saved_model_cli {
      my (@cli_args) = @_;
      unless( File::Which::which('saved_model_cli') ) {
          warn "saved_model_cli(): Install the tensorflow Python package to get the `saved_model_cli` command.\n";
          return -1;
      }
      system( 'saved_model_cli', @cli_args ) == 0
          or die "Could not run saved_model_cli";
  }
  
  say "Checking with saved_model_cli scan:";
  saved_model_cli( 'scan', '--dir', $model_base );

B<STREAM (STDOUT)>:

  Checking with saved_model_cli scan:
  MetaGraph with tag set ['serve'] does not contain the default denylisted ops: {'ReadFile', 'PrintV2', 'WriteFile'}

B<RESULT>:

  1

We need to see what the inputs and outputs of this model are so C<saved_model_cli show> should show us that:

  saved_model_cli( qw(show),
      qw(--dir) => $model_base,
      qw(--all),
  );

B<STREAM (STDOUT)>:

  MetaGraphDef with tag-set: 'serve' contains the following SignatureDefs:
  
  signature_def['__saved_model_init_op']:
    The given SavedModel SignatureDef contains the following input(s):
    The given SavedModel SignatureDef contains the following output(s):
      outputs['__saved_model_init_op'] tensor_info:
          dtype: DT_INVALID
          shape: unknown_rank
          name: NoOp
    Method name is: 
  
  Concrete Functions:

B<RESULT>:

  1

It appears that it does not! What we can do is load the model using C<tensorflow> in Python and then save it to a new path. Now when we run C<saved_model_cli show> on this new model path, it shows the correct inputs and outputs.

  # Directory that receives the re-saved copy of the model.
  my $new_model_base = "${model_base}_new";
  
  # Load the model with Python's tensorflow and save its `.model` attribute to
  # $new_model_base.  Skipped when $new_model_base already exists.
  if( ! -e $new_model_base ) {
      # Single-quoted heredoc: the Python source is handed to python3 verbatim.
      # The exit status is checked so that a failed export stops here.
      system( qw(python3), qw(-c) => <<'EOF', $model_base, $new_model_base ) == 0
  import sys
  import tensorflow as tf
  
  in_path, out_path  = sys.argv[1:3]
  
  imported_model = tf.saved_model.load(in_path).model
  tf.saved_model.save( imported_model , out_path )
  EOF
          or die "Could not re-save model $model_base to $new_model_base with python3 (wait status: $?)";
  }
  
  saved_model_cli( qw(show),
      qw(--dir) => $new_model_base,
      qw(--all),
  );

B<STREAM (STDOUT)>:

  MetaGraphDef with tag-set: 'serve' contains the following SignatureDefs:
  
  signature_def['__saved_model_init_op']:
    The given SavedModel SignatureDef contains the following input(s):
    The given SavedModel SignatureDef contains the following output(s):
      outputs['__saved_model_init_op'] tensor_info:
          dtype: DT_INVALID
          shape: unknown_rank
          name: NoOp
    Method name is: 
  
  signature_def['serving_default']:
    The given SavedModel SignatureDef contains the following input(s):
      inputs['args_0'] tensor_info:
          dtype: DT_FLOAT
          shape: (-1, 393216, 4)
          name: serving_default_args_0:0
    The given SavedModel SignatureDef contains the following output(s):
      outputs['human'] tensor_info:
          dtype: DT_FLOAT
          shape: (-1, 896, 5313)
          name: StatefulPartitionedCall:0
      outputs['mouse'] tensor_info:
          dtype: DT_FLOAT
          shape: (-1, 896, 1643)
          name: StatefulPartitionedCall:1
    Method name is: tensorflow/serving/predict
  
  Concrete Functions:
    Function Name: 'predict_on_batch'
      Option #1
        Callable with:
          Argument #1
            args_0: TensorSpec(shape=(None, 393216, 4), dtype=tf.float32, name='args_0')

B<RESULT>:

  1

We want to use the C<serve> tag-set and

=over

=item *

the input C<args_0> which has the name C<serving_default_args_0:0> and

=item *

the output C<human> which has the name C<StatefulPartitionedCall:0>.

=back

all of which are C<DT_FLOAT>.

Make note of the shapes that those take. Per the L<model description|https://tfhub.dev/deepmind/enformer/1> at TensorFlow Hub:

=over 2

The input sequence length is 393,216 with the prediction corresponding to 128 base pair windows for the center 114,688 base pairs. The input sequence is one hot encoded using the order of indices corresponding to 'ACGT' with N values being all zeros.

=back

The input shape C<(-1, 393216, 4)> thus represents dimensions C<[batch size] x [sequence length] x [one-hot encoding of ACGT]>.

The output shape C<(-1, 896, 5313)> represents dimensions C<[batch size] x [ predictions along 114,688 base pairs / 128 base pair windows ] x [ human target by index ]>. We can confirm this by doing some calculations:

  my $model_central_base_pairs_length     = 114_688; # bp
  my $model_central_base_pair_window_size = 128;     # bp / prediction
  
  say "Number of predictions: ", $model_central_base_pairs_length / $model_central_base_pair_window_size;

B<STREAM (STDOUT)>:

  Number of predictions: 896

B<RESULT>:

  1

and by looking at the targets file:

  use Data::Frame;
  
  my $df = Data::Frame->from_csv( $targets_path, sep => "\t" )
      ->transform({
          file => sub {
              my ($col, $df) = @_;
              # clean up the paths in 'file' column
              [map { join "/", (split('/', $_))[7..8] } $col->list];
          }
      });
  
  say "Number of targets: ", $df->nrow;
  
  say "";
  
  say "First 5:";
  say $df->head(5);

B<STREAM (STDOUT)>:

  Number of targets: 5313
  
  First 5:
  ------------------------------------------------------------------------------------------------------------------------------------------------
      index  genome  identifier   file                clip  scale  sum_stat  description                                                          
  ------------------------------------------------------------------------------------------------------------------------------------------------
   0  0      0       ENCFF833POA  encode/ENCSR000EIJ  32    2      mean      DNASE:cerebellum male adult (27 years) and male adult (35 years)     
   1  1      0       ENCFF110QGM  encode/ENCSR000EIK  32    2      mean      DNASE:frontal cortex male adult (27 years) and male adult (35 years) 
   2  2      0       ENCFF880MKD  encode/ENCSR000EIL  32    2      mean      DNASE:chorion                                                        
   3  3      0       ENCFF463ZLQ  encode/ENCSR000EIP  32    2      mean      DNASE:Ishikawa treated with 0.02% dimethyl sulfoxide for 1 hour      
   4  4      0       ENCFF890OGQ  encode/ENCSR000EIS  32    2      mean      DNASE:GM03348                                                        
  ------------------------------------------------------------------------------------------------------------------------------------------------

B<RESULT>:

  1

=head2 Load the model

Let's now load the model in Perl and get the inputs and outputs into a data structure by name.

  my $opt = AI::TensorFlow::Libtensorflow::SessionOptions->New;
  
  my @tags = ( 'serve' );
  my $graph = AI::TensorFlow::Libtensorflow::Graph->New;
  my $session = AI::TensorFlow::Libtensorflow::Session->LoadFromSavedModel(
      $opt, undef, $new_model_base, \@tags, $graph, undef, $s
  );
  AssertOK($s);
  
  my %puts = (
      ## Inputs
      inputs_args_0 =>
          AI::TensorFlow::Libtensorflow::Output->New({
              oper => $graph->OperationByName('serving_default_args_0'),
              index => 0,
          }),
  
      ## Outputs
      outputs_human  =>
          AI::TensorFlow::Libtensorflow::Output->New({
              oper => $graph->OperationByName('StatefulPartitionedCall'),
              index => 0,
          }),
      outputs_mouse  =>
          AI::TensorFlow::Libtensorflow::Output->New({
              oper => $graph->OperationByName('StatefulPartitionedCall'),
              index => 1,
      }),
  );
  
  p %puts;

B<STREAM (STDERR)>:

=for html <span style="display:inline-block;margin-left:1em;"><pre style="display: block"><code><span style="color: #33ccff;">{</span><span style="">
    </span><span style="color: #6666cc;">inputs_args_0</span><span style="color: #33ccff;">   </span><span style="color: #cc66cc;">AI::TensorFlow::Libtensorflow::Output</span><span style=""> </span><span style="color: #33ccff;">{</span><span style=""...
        </span><span style="color: #6666cc;">index</span><span style="color: #33ccff;">   </span><span style="color: #ff6633;">0</span><span style="color: #33ccff;">,</span><span style="">
        </span><span style="color: #6666cc;">oper</span><span style=""> </span><span style="color: #33ccff;">   </span><span style="color: #cc66cc;">AI::TensorFlow::Libtensorflow::Operation</span><span style=""> </span><span style="color: #33ccff;">{...
            </span><span style="color: #6666cc;">Name</span><span style="">      </span><span style="color: #33ccff;">   </span><span style="color: #33ccff;">&quot;</span><span style="color: #669933;">serving_default_args_0</span><span style="color: ...
            </span><span style="color: #6666cc;">NumInputs</span><span style=""> </span><span style="color: #33ccff;">   </span><span style="color: #ff6633;">0</span><span style="color: #33ccff;">,</span><span style="">
            </span><span style="color: #6666cc;">NumOutputs</span><span style="color: #33ccff;">   </span><span style="color: #ff6633;">1</span><span style="color: #33ccff;">,</span><span style="">
            </span><span style="color: #6666cc;">OpType</span><span style="">    </span><span style="color: #33ccff;">   </span><span style="color: #33ccff;">&quot;</span><span style="color: #669933;">Placeholder</span><span style="color: #33ccff;">&...
        </span><span style="color: #33ccff;">}</span><span style="">
    </span><span style="color: #33ccff;">}</span><span style="color: #33ccff;">,</span><span style="">
    </span><span style="color: #6666cc;">outputs_human</span><span style="color: #33ccff;">   </span><span style="color: #cc66cc;">AI::TensorFlow::Libtensorflow::Output</span><span style=""> </span><span style="color: #33ccff;">{</span><span style=""...
        </span><span style="color: #6666cc;">index</span><span style="color: #33ccff;">   </span><span style="color: #ff6633;">0</span><span style="color: #33ccff;">,</span><span style="">
        </span><span style="color: #6666cc;">oper</span><span style=""> </span><span style="color: #33ccff;">   </span><span style="color: #cc66cc;">AI::TensorFlow::Libtensorflow::Operation</span><span style=""> </span><span style="color: #33ccff;">{...
            </span><span style="color: #6666cc;">Name</span><span style="">      </span><span style="color: #33ccff;">   </span><span style="color: #33ccff;">&quot;</span><span style="color: #669933;">StatefulPartitionedCall</span><span style="color:...
            </span><span style="color: #6666cc;">NumInputs</span><span style=""> </span><span style="color: #33ccff;">   </span><span style="color: #ff6633;">274</span><span style="color: #33ccff;">,</span><span style="">
            </span><span style="color: #6666cc;">NumOutputs</span><span style="color: #33ccff;">   </span><span style="color: #ff6633;">2</span><span style="color: #33ccff;">,</span><span style="">
            </span><span style="color: #6666cc;">OpType</span><span style="">    </span><span style="color: #33ccff;">   </span><span style="color: #33ccff;">&quot;</span><span style="color: #669933;">StatefulPartitionedCall</span><span style="color:...
        </span><span style="color: #33ccff;">}</span><span style="">
    </span><span style="color: #33ccff;">}</span><span style="color: #33ccff;">,</span><span style="">
    </span><span style="color: #6666cc;">outputs_mouse</span><span style="color: #33ccff;">   </span><span style="color: #cc66cc;">AI::TensorFlow::Libtensorflow::Output</span><span style=""> </span><span style="color: #33ccff;">{</span><span style=""...
        </span><span style="color: #6666cc;">index</span><span style="color: #33ccff;">   </span><span style="color: #ff6633;">1</span><span style="color: #33ccff;">,</span><span style="">
        </span><span style="color: #6666cc;">oper</span><span style=""> </span><span style="color: #33ccff;">   </span><span style="color: #cc66cc;">AI::TensorFlow::Libtensorflow::Operation</span><span style=""> </span><span style="color: #33ccff;">{...
            </span><span style="color: #6666cc;">Name</span><span style="">      </span><span style="color: #33ccff;">   </span><span style="color: #33ccff;">&quot;</span><span style="color: #669933;">StatefulPartitionedCall</span><span style="color:...
            </span><span style="color: #6666cc;">NumInputs</span><span style=""> </span><span style="color: #33ccff;">   </span><span style="color: #ff6633;">274</span><span style="color: #33ccff;">,</span><span style="">
            </span><span style="color: #6666cc;">NumOutputs</span><span style="color: #33ccff;">   </span><span style="color: #ff6633;">2</span><span style="color: #33ccff;">,</span><span style="">
            </span><span style="color: #6666cc;">OpType</span><span style="">    </span><span style="color: #33ccff;">   </span><span style="color: #33ccff;">&quot;</span><span style="color: #669933;">StatefulPartitionedCall</span><span style="color:...
        </span><span style="color: #33ccff;">}</span><span style="">
    </span><span style="color: #33ccff;">}</span><span style="">
</span><span style="color: #33ccff;">}</span><span style="">
</span></code></pre></span>

We need a helper to simplify running the session and getting just the predictions that we want.

  my $predict_on_batch = sub {
      my ($session, $t) = @_;
      my @outputs_t;
  
      $session->Run(
          undef,
          [$puts{inputs_args_0}], [$t],
          [$puts{outputs_human}], \@outputs_t,
          undef,
          undef,
          $s
      );
      AssertOK($s);
  
      return $outputs_t[0];
  };
  
  undef;

=head2 Encoding the data

The model specifies that the way to get a sequence of DNA bases into a C<TFTensor> is to use L<one-hot encoding|https://en.wikipedia.org/wiki/One-hot#Machine_learning_and_statistics> in the order C<ACGT>.

This means that the bases are represented as vectors of length 4:

  base | vector encoding
  -----+----------------
  A    | [1 0 0 0]
  C    | [0 1 0 0]
  G    | [0 0 1 0]
  T    | [0 0 0 1]
  N    | [0 0 0 0]

( run in 0.871 second using v1.01-cache-2.11-cpan-13bb782fe5a )