AI-TensorFlow-Libtensorflow

 view release on metacpan or  search on metacpan

lib/AI/TensorFlow/Libtensorflow/Manual/Notebook/InferenceUsingTFHubEnformerGeneExprPredModel.pod  view on Meta::CPAN

  say "First 5:";
  say $df->head(5);

B<STREAM (STDOUT)>:

  Number of targets: 5313
  
  First 5:
  ------------------------------------------------------------------------------------------------------------------------------------------------
      index  genome  identifier   file                clip  scale  sum_stat  description                                                          
  ------------------------------------------------------------------------------------------------------------------------------------------------
   0  0      0       ENCFF833POA  encode/ENCSR000EIJ  32    2      mean      DNASE:cerebellum male adult (27 years) and male adult (35 years)     
   1  1      0       ENCFF110QGM  encode/ENCSR000EIK  32    2      mean      DNASE:frontal cortex male adult (27 years) and male adult (35 years) 
   2  2      0       ENCFF880MKD  encode/ENCSR000EIL  32    2      mean      DNASE:chorion                                                        
   3  3      0       ENCFF463ZLQ  encode/ENCSR000EIP  32    2      mean      DNASE:Ishikawa treated with 0.02% dimethyl sulfoxide for 1 hour      
   4  4      0       ENCFF890OGQ  encode/ENCSR000EIS  32    2      mean      DNASE:GM03348                                                        
  ------------------------------------------------------------------------------------------------------------------------------------------------

B<RESULT>:

  1

=head2 Load the model

Let's now load the model in Perl and get the inputs and outputs into a data structure by name.

  my $opt = AI::TensorFlow::Libtensorflow::SessionOptions->New;
  
  my @tags = ( 'serve' );
  my $graph = AI::TensorFlow::Libtensorflow::Graph->New;
  my $session = AI::TensorFlow::Libtensorflow::Session->LoadFromSavedModel(
      $opt, undef, $new_model_base, \@tags, $graph, undef, $s
  );
  AssertOK($s);
  
  my %puts = (
      ## Inputs
      inputs_args_0 =>
          AI::TensorFlow::Libtensorflow::Output->New({
              oper => $graph->OperationByName('serving_default_args_0'),
              index => 0,
          }),
  
      ## Outputs
      outputs_human  =>
          AI::TensorFlow::Libtensorflow::Output->New({
              oper => $graph->OperationByName('StatefulPartitionedCall'),
              index => 0,
          }),
      outputs_mouse  =>
          AI::TensorFlow::Libtensorflow::Output->New({
              oper => $graph->OperationByName('StatefulPartitionedCall'),
              index => 1,
      }),
  );
  
  p %puts;

B<STREAM (STDERR)>:

=for html <span style="display:inline-block;margin-left:1em;"><pre style="display: block"><code><span style="color: #33ccff;">{</span><span style="">
    </span><span style="color: #6666cc;">inputs_args_0</span><span style="color: #33ccff;">   </span><span style="color: #cc66cc;">AI::TensorFlow::Libtensorflow::Output</span><span style=""> </span><span style="color: #33ccff;">{</span><span style=""...
        </span><span style="color: #6666cc;">index</span><span style="color: #33ccff;">   </span><span style="color: #ff6633;">0</span><span style="color: #33ccff;">,</span><span style="">
        </span><span style="color: #6666cc;">oper</span><span style=""> </span><span style="color: #33ccff;">   </span><span style="color: #cc66cc;">AI::TensorFlow::Libtensorflow::Operation</span><span style=""> </span><span style="color: #33ccff;">{...
            </span><span style="color: #6666cc;">Name</span><span style="">      </span><span style="color: #33ccff;">   </span><span style="color: #33ccff;">&quot;</span><span style="color: #669933;">serving_default_args_0</span><span style="color: ...
            </span><span style="color: #6666cc;">NumInputs</span><span style=""> </span><span style="color: #33ccff;">   </span><span style="color: #ff6633;">0</span><span style="color: #33ccff;">,</span><span style="">
            </span><span style="color: #6666cc;">NumOutputs</span><span style="color: #33ccff;">   </span><span style="color: #ff6633;">1</span><span style="color: #33ccff;">,</span><span style="">
            </span><span style="color: #6666cc;">OpType</span><span style="">    </span><span style="color: #33ccff;">   </span><span style="color: #33ccff;">&quot;</span><span style="color: #669933;">Placeholder</span><span style="color: #33ccff;">&...
        </span><span style="color: #33ccff;">}</span><span style="">
    </span><span style="color: #33ccff;">}</span><span style="color: #33ccff;">,</span><span style="">
    </span><span style="color: #6666cc;">outputs_human</span><span style="color: #33ccff;">   </span><span style="color: #cc66cc;">AI::TensorFlow::Libtensorflow::Output</span><span style=""> </span><span style="color: #33ccff;">{</span><span style=""...
        </span><span style="color: #6666cc;">index</span><span style="color: #33ccff;">   </span><span style="color: #ff6633;">0</span><span style="color: #33ccff;">,</span><span style="">
        </span><span style="color: #6666cc;">oper</span><span style=""> </span><span style="color: #33ccff;">   </span><span style="color: #cc66cc;">AI::TensorFlow::Libtensorflow::Operation</span><span style=""> </span><span style="color: #33ccff;">{...
            </span><span style="color: #6666cc;">Name</span><span style="">      </span><span style="color: #33ccff;">   </span><span style="color: #33ccff;">&quot;</span><span style="color: #669933;">StatefulPartitionedCall</span><span style="color:...
            </span><span style="color: #6666cc;">NumInputs</span><span style=""> </span><span style="color: #33ccff;">   </span><span style="color: #ff6633;">274</span><span style="color: #33ccff;">,</span><span style="">
            </span><span style="color: #6666cc;">NumOutputs</span><span style="color: #33ccff;">   </span><span style="color: #ff6633;">2</span><span style="color: #33ccff;">,</span><span style="">
            </span><span style="color: #6666cc;">OpType</span><span style="">    </span><span style="color: #33ccff;">   </span><span style="color: #33ccff;">&quot;</span><span style="color: #669933;">StatefulPartitionedCall</span><span style="color:...
        </span><span style="color: #33ccff;">}</span><span style="">
    </span><span style="color: #33ccff;">}</span><span style="color: #33ccff;">,</span><span style="">
    </span><span style="color: #6666cc;">outputs_mouse</span><span style="color: #33ccff;">   </span><span style="color: #cc66cc;">AI::TensorFlow::Libtensorflow::Output</span><span style=""> </span><span style="color: #33ccff;">{</span><span style=""...
        </span><span style="color: #6666cc;">index</span><span style="color: #33ccff;">   </span><span style="color: #ff6633;">1</span><span style="color: #33ccff;">,</span><span style="">
        </span><span style="color: #6666cc;">oper</span><span style=""> </span><span style="color: #33ccff;">   </span><span style="color: #cc66cc;">AI::TensorFlow::Libtensorflow::Operation</span><span style=""> </span><span style="color: #33ccff;">{...
            </span><span style="color: #6666cc;">Name</span><span style="">      </span><span style="color: #33ccff;">   </span><span style="color: #33ccff;">&quot;</span><span style="color: #669933;">StatefulPartitionedCall</span><span style="color:...
            </span><span style="color: #6666cc;">NumInputs</span><span style=""> </span><span style="color: #33ccff;">   </span><span style="color: #ff6633;">274</span><span style="color: #33ccff;">,</span><span style="">
            </span><span style="color: #6666cc;">NumOutputs</span><span style="color: #33ccff;">   </span><span style="color: #ff6633;">2</span><span style="color: #33ccff;">,</span><span style="">
            </span><span style="color: #6666cc;">OpType</span><span style="">    </span><span style="color: #33ccff;">   </span><span style="color: #33ccff;">&quot;</span><span style="color: #669933;">StatefulPartitionedCall</span><span style="color:...
        </span><span style="color: #33ccff;">}</span><span style="">
    </span><span style="color: #33ccff;">}</span><span style="">
</span><span style="color: #33ccff;">}</span><span style="">
</span></code></pre></span>

We need a helper to simplify running the session and getting just the predictions that we want.

  my $predict_on_batch = sub {
      my ($session, $t) = @_;
      my @outputs_t;
  
      $session->Run(
          undef,
          [$puts{inputs_args_0}], [$t],
          [$puts{outputs_human}], \@outputs_t,
          undef,
          undef,
          $s
      );
      AssertOK($s);
  
      return $outputs_t[0];
  };
  
  undef;

=head2 Encoding the data

The model specifies that the way to get a sequence of DNA bases into a C<TFTensor> is to use L<one-hot encoding|https://en.wikipedia.org/wiki/One-hot#Machine_learning_and_statistics> in the order C<ACGT>.

This means that the bases are represented as vectors of length 4:

| base | vector encoding |
|------|-----------------|
| A    | C<[1 0 0 0]>     |
| C    | C<[0 1 0 0]>     |
| G    | C<[0 0 1 0]>     |
| T    | C<[0 0 0 1]>     |
| N    | C<[0 0 0 0]>     |

We can achieve this encoding by creating a lookup table with a PDL ndarray. This could be done by creating a byte PDL ndarray of dimensions C<[ 256 4 ]> to directly look up the the numeric value of characters 0-255, but here we'll go with a smaller C...

  use PDL;
  
  our $SHOW_ENCODER = 1;
  
  sub one_hot_dna {
      my ($seq) = @_;
  
      my $from_alphabet = "NACGT";
      my $to_alphabet   = pack "C*", 0..length($from_alphabet)-1;
  
      # sequences from UCSC genome have both uppercase and lowercase bases
      my $from_alphabet_tr = $from_alphabet . lc $from_alphabet;
      my $to_alphabet_tr   = $to_alphabet x 2;
  
      my $p = zeros(byte, bytes::length($seq));
      my $p_dataref = $p->get_dataref;
      ${ $p_dataref } = $seq;
      eval "tr/$from_alphabet_tr/$to_alphabet_tr/" for ${ $p_dataref };
      $p->upd_data;
  
      my $encoder = append(float(0), identity(float(length($from_alphabet)-1)) );

lib/AI/TensorFlow/Libtensorflow/Manual/Notebook/InferenceUsingTFHubEnformerGeneExprPredModel.pod  view on Meta::CPAN

  
  die "Target interval is not $model_central_base_pairs_length bp long"
      unless $target_interval->length == $model_central_base_pairs_length;
  
  say "Target sequence is ", seq_info extract_sequence( $hg_db, $target_interval );
  
  
  say "";
  
  
  my $resized_interval = $target_interval->resize( $model_sequence_length );
  say "Resized interval: $resized_interval with length @{[ $resized_interval->length ]}";
  
  die "resize() is not working properly!" unless $resized_interval->length == $model_sequence_length;
  
  my $seq = extract_sequence( $hg_db, $resized_interval );
  
  say "Resized sequence is ", seq_info($seq);

B<STREAM (STDOUT)>:

  Target interval: chr11:35082743..35197430 with length 114688
  Target sequence is GGTGGCAGCC...ATCTCCTTTT (length 114688)
  
  Resized interval: chr11:34943479..35336694 with length 393216
  Resized sequence is ACTAGTTCTA...GGCCCAAATC (length 393216)

B<RESULT>:

  1

To prepare the input we have to one-hot encode this resized sequence and give it a dummy dimension at the end to indicate that it is is a batch with a single sequence. Then we can turn the PDL ndarray into a C<TFTensor> and pass it to our prediction ...

  my $sequence_one_hot = one_hot_dna( $seq )->dummy(-1);
  
  say $sequence_one_hot->info; undef;

B<STREAM (STDOUT)>:

  PDL: Float D [4,393216,1]


  use Devel::Timer;
  my $t = Devel::Timer->new;
  
  $t->mark('prediction of sequence');
  
  my $predictions = $predict_on_batch->( $session, FloatPDLTOTFTensor( $sequence_one_hot ) );
  
  $t->mark('End of prediction of sequence');
  
  p $predictions;
  
  $t->mark('END');
  $t->report();

B<STREAM (STDERR)>:

=begin html

<span style="display:inline-block;margin-left:1em;"><pre style="display: block"><code><span style="color: #cc66cc;">AI::TensorFlow::Libtensorflow::Tensor</span><span style=""> </span><span style="color: #33ccff;">{</span><span style="">
    </span><span style="color: #6666cc;">Type           </span><span style=""> </span><span style="color: #cc66cc;">FLOAT</span><span style="">
    </span><span style="color: #6666cc;">Dims           </span><span style=""> </span><span style="color: #33ccff;">[</span><span style=""> </span><span style="color: #ff6633;">1</span><span style=""> </span><span style="color: #ff6633;">896</span><s...
    </span><span style="color: #6666cc;">NumDims        </span><span style=""> </span><span style="color: #ff6633;">3</span><span style="">
    </span><span style="color: #6666cc;">ElementCount   </span><span style=""> </span><span style="color: #ff6633;">4760448</span><span style="">
</span><span style="color: #33ccff;">}</span><span style="">


Devel::Timer Report -- Total time: 14.5641 secs
Interval  Time    Percent
----------------------------------------------
01 -&gt; 02  14.5634  100.00%  prediction of sequence -&gt; End of prediction of sequence
02 -&gt; 03  0.0007   0.00%  End of prediction of sequence -&gt; END
00 -&gt; 01  0.0000   0.00%  INIT -&gt; prediction of sequence
</span></code></pre></span>

=end html

Now we turn the C<TFTensor> output into a PDL ndarray.

  my $predictions_p = FloatTFTensorToPDL($predictions)->slice(',,(0)');
  say $predictions_p->info; undef;

B<STREAM (STDOUT)>:

  PDL: Float D [5313,896]

=head2 Plot predicted tracks

These predictions can be plotted 

  my @tracks = (
      [ 'DNASE:CD14-positive monocyte female' =>   41 => $predictions_p->slice('(41)') ],
      [ 'DNASE:keratinocyte female'           =>   42 => $predictions_p->slice('(42)') ],
      [ 'CHIP:H3K27ac:keratinocyte female'    =>  706 => $predictions_p->slice('(706)')],
      [ 'CAGE:Keratinocyte - epidermal'       => 4799 => log10(1 + $predictions_p->slice('(4799)')) ],
  );
  
  use PDL::Graphics::Gnuplot;
  
  my $plot_output_path = 'enformer-target-interval-tracks.png';
  my $gp = gpwin('pngcairo', font => ",10", output => $plot_output_path, size => [10,2. * @tracks], aa => 2 );
  
  $gp->multiplot( layout => [1, scalar @tracks], title => $target_interval );
  
  $gp->options(
      offsets => [ graph => "0.01, 0, 0, 0" ],
      lmargin => "at screen 0.05",
  );
  
  my $x = zeroes($predictions_p->dim(1))->xlinvals($target_interval->start, $target_interval->end);
  
  my @tics_opts = (mirror => 0, out => 1);
  
  for my $i (0..$#tracks) {
      my ($title, $id, $y) = @{$tracks[$i]};
      $gp->plot( {
              title => $title,
              border => [2],
              ytics => { @tics_opts, locations => [ ceil(($y->max-$y->min)/2)->sclr ] },
              ( $i == $#tracks
                  ? ( xtics => { format => '%.3f', @tics_opts } )
                  : ( xtics => 0 ) ),
              ( $i == $#tracks ? ( xlabel => 'location ({/Symbol \264}10^7 bases)' ) : ()  ),
  
          },
          with => 'filledcurves',
          #'lc' => '#086eb5',
  
          # $x scaled by 1e7; filled curve between $y and the x-axis
          $x / 1e7, $y, pdl(0)
      );
  }
  
  $gp->end_multi;
  
  $gp->close;
  
  if( IN_IPERL ) {
      IPerl->png( bytestream => path($plot_output_path)->slurp_raw );
  }

B<DISPLAY>:

=for html <span style="display:inline-block;margin-left:1em;"><p><img						src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAA+gAAAMgCAIAAAA/et9qAAAgAElEQVR4nOzdd2AUVeIH8Ddb0jshBAIEpSo1GjoIpyAgCOqd3uGdoGBBUQQFRUVBRbkTf9gOBQucqFiwUhSSgJQYCCSBkJBAet1k...

=head2 Parts of the original notebook that fall outside the scope

In the orignal notebook, there are several more steps that have not been ported here:

=over

=item 1.

"Compute contribution scores":

This task requires implementing C<@tf.function> to compile gradients.

=item 2.

"Predict the effect of a genetic variant" and "Score multiple variants":

The first task is possible, but the second task requires loading a pre-processing pipeline for scikit-learn and unfortunately this pipeline is stored as a pickle file that is valid for an older version of scikit-learn (version 0.23.2) and as such its...

=back

  # Some code that could be used for working with variants.
  1 if <<'COMMENT';
  
  use Bio::DB::HTS::VCF;
  
  my $clinvar_tbi_path = "${clinvar_path}.tbi";
  unless( -f $clinvar_tbi_path ) {
      system( qw(tabix), $clinvar_path );
  }
  my $v = Bio::DB::HTS::VCF->new( filename => $clinvar_path );
  $v->num_variants
  
  COMMENT
  
  undef;

=head1 RESOURCE USAGE

  use Filesys::DiskUsage qw/du/;
  
  my $total = du( { 'human-readable' => 1, dereference => 1 },
      $model_archive_path, $model_base, $new_model_base,
  
      $targets_path,
  
      $hg_gz_path,
      $hg_bgz_path, $hg_bgz_fai_path,
  
      $clinvar_path,
  
      $plot_output_path,
  );
  
  say "Disk space usage: $total"; undef;

B<STREAM (STDOUT)>:

  Disk space usage: 4.66G



( run in 0.728 second using v1.01-cache-2.11-cpan-39bf76dae61 )