encoding results from the CPAN

encoding

AI-TensorFlow-Libtensorflow

view release on metacpan or search on metacpan

lib/AI/TensorFlow/Libtensorflow/Manual/Notebook/InferenceUsingTFHubEnformerGeneExprPredModel.pod view on Meta::CPAN

    outputs_mouse  =>
        AI::TensorFlow::Libtensorflow::Output->New({
            oper => $graph->OperationByName('StatefulPartitionedCall'),
            index => 1,
    }),
);

p %puts;

my $predict_on_batch = sub {
    my ($session, $t) = @_;
    my @outputs_t;

    $session->Run(
        undef,
        [$puts{inputs_args_0}], [$t],
        [$puts{outputs_human}], \@outputs_t,
        undef,
        undef,
        $s
    );
    AssertOK($s);

    return $outputs_t[0];
};

undef;

use PDL;

our $SHOW_ENCODER = 1;

sub one_hot_dna {
    my ($seq) = @_;

    my $from_alphabet = "NACGT";
    my $to_alphabet   = pack "C*", 0..length($from_alphabet)-1;

    # sequences from UCSC genome have both uppercase and lowercase bases
    my $from_alphabet_tr = $from_alphabet . lc $from_alphabet;
    my $to_alphabet_tr   = $to_alphabet x 2;

    my $p = zeros(byte, bytes::length($seq));
    my $p_dataref = $p->get_dataref;
    ${ $p_dataref } = $seq;
    eval "tr/$from_alphabet_tr/$to_alphabet_tr/" for ${ $p_dataref };
    $p->upd_data;

    my $encoder = append(float(0), identity(float(length($from_alphabet)-1)) );
    say "Encoder is\n", $encoder->info, $encoder if $SHOW_ENCODER;

    my $encoded  = $encoder->index( $p->dummy(0) );

    return $encoded;
}

####

{

say "Testing one-hot encoding:\n";

my $onehot_test_seq = "ACGTNtgcan";
my $test_encoded = one_hot_dna( $onehot_test_seq );
$SHOW_ENCODER = 0;

say "One-hot encoding of sequence '$onehot_test_seq' is:";
say $test_encoded->info, $test_encoded;

}

package Interval {
    use Bio::Location::Simple ();

    use parent qw(Bio::Location::Simple);

    sub center {
        my $self = shift;
        my $center = int( ($self->start + $self->end ) / 2 );
        my $delta = ($self->start + $self->end ) % 2;
        return $center + $delta;
    }

    sub resize {
        my ($self, $width) = @_;
        my $new_interval = $self->clone;

        my $center = $self->center;
        my $half   = int( ($width-1) / 2 );
        my $offset = ($width-1) % 2;

        $new_interval->start( $center - $half - $offset );
        $new_interval->end(   $center + $half  );

        return $new_interval;
    }

    use overload '""' => \&_op_stringify;

    sub _op_stringify { sprintf "%s:%s", $_[0]->seq_id // "(no sequence)", $_[0]->to_FTstring }
}

#####

{

say "Testing interval resizing:\n";
sub _debug_resize {
    my ($interval, $to, $msg) = @_;

    my $resized_interval = $interval->resize($to);

    die "Wrong interval size for $interval --($to)--> $resized_interval"
        unless $resized_interval->length == $to;

    say sprintf "Interval: %s -> %s, length %2d : %s",
        $interval,
        $resized_interval, $resized_interval->length,
        $msg;
}

for my $interval_spec ( [4, 8], [5, 8], [5, 9], [6, 9]) {
    my ($start, $end) = @$interval_spec;
    my $test_interval = Interval->new( -seq_id => 'chr11', -start => $start, -end => $end );
    say sprintf "Testing interval %s with length %d", $test_interval, $test_interval->length;
    say "-----";
    for(0..5) {

lib/AI/TensorFlow/Libtensorflow/Manual/Notebook/InferenceUsingTFHubEnformerGeneExprPredModel.pod view on Meta::CPAN

            ytics => { @tics_opts, locations => [ ceil(($y->max-$y->min)/2)->sclr ] },
            ( $i == $#tracks
                ? ( xtics => { format => '%.3f', @tics_opts } )
                : ( xtics => 0 ) ),
            ( $i == $#tracks ? ( xlabel => 'location ({/Symbol \264}10^7 bases)' ) : ()  ),

        },
        with => 'filledcurves',
        #'lc' => '#086eb5',

        # $x scaled by 1e7; filled curve between $y and the x-axis
        $x / 1e7, $y, pdl(0)
    );
}

$gp->end_multi;

$gp->close;

if( IN_IPERL ) {
    IPerl->png( bytestream => path($plot_output_path)->slurp_raw );
}

# Some code that could be used for working with variants.
1 if <<'COMMENT';

use Bio::DB::HTS::VCF;

my $clinvar_tbi_path = "${clinvar_path}.tbi";
unless( -f $clinvar_tbi_path ) {
    system( qw(tabix), $clinvar_path );
}
my $v = Bio::DB::HTS::VCF->new( filename => $clinvar_path );
$v->num_variants

COMMENT

undef;

use Filesys::DiskUsage qw/du/;

my $total = du( { 'human-readable' => 1, dereference => 1 },
    $model_archive_path, $model_base, $new_model_base,

    $targets_path,

    $hg_gz_path,
    $hg_bgz_path, $hg_bgz_fai_path,

    $clinvar_path,

    $plot_output_path,
);

say "Disk space usage: $total"; undef;

__END__

=pod

=encoding UTF-8

=head1 NAME

AI::TensorFlow::Libtensorflow::Manual::Notebook::InferenceUsingTFHubEnformerGeneExprPredModel - Using TensorFlow to do gene expression prediction using a pre-trained model

=head1 SYNOPSIS

The following tutorial is based on the L<Enformer usage notebook|https://github.com/deepmind/deepmind-research/blob/master/enformer/enformer-usage.ipynb>. It uses a pre-trained model based on a transformer architecture trained as described in Avsec e...

Running the code requires an Internet connection to download the model (from Google servers) and datasets (from GitHub, UCSC, and NIH).

Some of this code is identical to that of C<InferenceUsingTFHubMobileNetV2Model> notebook. Please look there for explanation for that code. As stated there, this will later be wrapped up into a high-level library to hide the details behind an API.

B<NOTE>: If running this model, please be aware that

=over

=item *

the Docker image takes 3 GB or more of disk space;

=item *

the model and data takes 5 GB or more of disk space.

=back

meaning that you will need a total of B<8 GB> of disk space. You may need at least B<4 GB> of free memory to run the model.

=head1 COLOPHON

The following document is either a POD file which can additionally be run as a Perl script or a Jupyter Notebook which can be run in L<IPerl|https://p3rl.org/Devel::IPerl> (viewable online at L<nbviewer|https://nbviewer.org/github/EntropyOrg/perl-AI-...

You will also need the executables C<gunzip>, C<bgzip>, and C<samtools>. Furthermore,

=over

=item *

C<Bio::DB::HTS> requires C<libhts> and

=item *

C<PDL::Graphics::Gnuplot> requires C<gnuplot>.

=back

If you are running the code, you may optionally install the L<C<tensorflow> Python package|https://www.tensorflow.org/install/pip> in order to access the C<saved_model_cli> command, but this is only used for informational purposes.

=head1 TUTORIAL

=head2 Load the library

First, we need to load the C<AI::TensorFlow::Libtensorflow> library and more helpers. We then create an C<AI::TensorFlow::Libtensorflow::Status> object and helper function to make sure that the calls to the C<libtensorflow> C library are working prop...

  use strict;
  use warnings;
  use utf8;
  use constant IN_IPERL => !! $ENV{PERL_IPERL_RUNNING};
  no if IN_IPERL, warnings => 'redefine'; # fewer messages when re-running cells

lib/AI/TensorFlow/Libtensorflow/Manual/Notebook/InferenceUsingTFHubEnformerGeneExprPredModel.pod view on Meta::CPAN

    The given SavedModel SignatureDef contains the following input(s):
    The given SavedModel SignatureDef contains the following output(s):
      outputs['__saved_model_init_op'] tensor_info:
          dtype: DT_INVALID
          shape: unknown_rank
          name: NoOp
    Method name is: 
  
  signature_def['serving_default']:
    The given SavedModel SignatureDef contains the following input(s):
      inputs['args_0'] tensor_info:
          dtype: DT_FLOAT
          shape: (-1, 393216, 4)
          name: serving_default_args_0:0
    The given SavedModel SignatureDef contains the following output(s):
      outputs['human'] tensor_info:
          dtype: DT_FLOAT
          shape: (-1, 896, 5313)
          name: StatefulPartitionedCall:0
      outputs['mouse'] tensor_info:
          dtype: DT_FLOAT
          shape: (-1, 896, 1643)
          name: StatefulPartitionedCall:1
    Method name is: tensorflow/serving/predict
  
  Concrete Functions:
    Function Name: 'predict_on_batch'
      Option #1
        Callable with:
          Argument #1
            args_0: TensorSpec(shape=(None, 393216, 4), dtype=tf.float32, name='args_0')

B<RESULT>:

  1

We want to use the C<serve> tag-set and

=over

=item *

the input C<args_0> which has the name C<serving_default_args_0:0> and

=item *

the output C<human> which has the name C<StatefulPartitionedCall:0>.

=back

all of which are C<DT_FLOAT>.

Make note of the shapes that those take. Per the L<model description|https://tfhub.dev/deepmind/enformer/1> at TensorFlow Hub:

=over 2

The input sequence length is 393,216 with the prediction corresponding to 128 base pair windows for the center 114,688 base pairs. The input sequence is one hot encoded using the order of indices corresponding to 'ACGT' with N values being all zeros.

=back

The input shape C<(-1, 393216, 4)> thus represents dimensions C<[batch size] x [sequence length] x [one-hot encoding of ACGT]>.

The output shape C<(-1, 896, 5313)> represents dimensions C<[batch size] x [ predictions along 114,688 base pairs / 128 base pair windows ] x [ human target by index ]>. We can confirm this by doing some calculations:

  my $model_central_base_pairs_length     = 114_688; # bp
  my $model_central_base_pair_window_size = 128;     # bp / prediction
  
  say "Number of predictions: ", $model_central_base_pairs_length / $model_central_base_pair_window_size;

B<STREAM (STDOUT)>:

  Number of predictions: 896

B<RESULT>:

  1

and by looking at the targets file:

  use Data::Frame;
  
  my $df = Data::Frame->from_csv( $targets_path, sep => "\t" )
      ->transform({
          file => sub {
              my ($col, $df) = @_;
              # clean up the paths in 'file' column
              [map { join "/", (split('/', $_))[7..8] } $col->list];
          }
      });
  
  say "Number of targets: ", $df->nrow;
  
  say "";
  
  say "First 5:";
  say $df->head(5);

B<STREAM (STDOUT)>:

  Number of targets: 5313
  
  First 5:
  ------------------------------------------------------------------------------------------------------------------------------------------------
      index  genome  identifier   file                clip  scale  sum_stat  description                                                          
  ------------------------------------------------------------------------------------------------------------------------------------------------
   0  0      0       ENCFF833POA  encode/ENCSR000EIJ  32    2      mean      DNASE:cerebellum male adult (27 years) and male adult (35 years)     
   1  1      0       ENCFF110QGM  encode/ENCSR000EIK  32    2      mean      DNASE:frontal cortex male adult (27 years) and male adult (35 years) 
   2  2      0       ENCFF880MKD  encode/ENCSR000EIL  32    2      mean      DNASE:chorion                                                        
   3  3      0       ENCFF463ZLQ  encode/ENCSR000EIP  32    2      mean      DNASE:Ishikawa treated with 0.02% dimethyl sulfoxide for 1 hour      
   4  4      0       ENCFF890OGQ  encode/ENCSR000EIS  32    2      mean      DNASE:GM03348                                                        
  ------------------------------------------------------------------------------------------------------------------------------------------------

B<RESULT>:

  1

=head2 Load the model

Let's now load the model in Perl and get the inputs and outputs into a data structure by name.

  my $opt = AI::TensorFlow::Libtensorflow::SessionOptions->New;

lib/AI/TensorFlow/Libtensorflow/Manual/Notebook/InferenceUsingTFHubEnformerGeneExprPredModel.pod view on Meta::CPAN

  );
  
  p %puts;

B<STREAM (STDERR)>:

=for html <span style="display:inline-block;margin-left:1em;"><pre style="display: block"><code><span style="color: #33ccff;">{</span><span style="">
    </span><span style="color: #6666cc;">inputs_args_0</span><span style="color: #33ccff;">   </span><span style="color: #cc66cc;">AI::TensorFlow::Libtensorflow::Output</span><span style=""> </span><span style="color: #33ccff;">{</span><span style=""...
        </span><span style="color: #6666cc;">index</span><span style="color: #33ccff;">   </span><span style="color: #ff6633;">0</span><span style="color: #33ccff;">,</span><span style="">
        </span><span style="color: #6666cc;">oper</span><span style=""> </span><span style="color: #33ccff;">   </span><span style="color: #cc66cc;">AI::TensorFlow::Libtensorflow::Operation</span><span style=""> </span><span style="color: #33ccff;">{...
            </span><span style="color: #6666cc;">Name</span><span style="">      </span><span style="color: #33ccff;">   </span><span style="color: #33ccff;">&quot;</span><span style="color: #669933;">serving_default_args_0</span><span style="color: ...
            </span><span style="color: #6666cc;">NumInputs</span><span style=""> </span><span style="color: #33ccff;">   </span><span style="color: #ff6633;">0</span><span style="color: #33ccff;">,</span><span style="">
            </span><span style="color: #6666cc;">NumOutputs</span><span style="color: #33ccff;">   </span><span style="color: #ff6633;">1</span><span style="color: #33ccff;">,</span><span style="">
            </span><span style="color: #6666cc;">OpType</span><span style="">    </span><span style="color: #33ccff;">   </span><span style="color: #33ccff;">&quot;</span><span style="color: #669933;">Placeholder</span><span style="color: #33ccff;">&...
        </span><span style="color: #33ccff;">}</span><span style="">
    </span><span style="color: #33ccff;">}</span><span style="color: #33ccff;">,</span><span style="">
    </span><span style="color: #6666cc;">outputs_human</span><span style="color: #33ccff;">   </span><span style="color: #cc66cc;">AI::TensorFlow::Libtensorflow::Output</span><span style=""> </span><span style="color: #33ccff;">{</span><span style=""...
        </span><span style="color: #6666cc;">index</span><span style="color: #33ccff;">   </span><span style="color: #ff6633;">0</span><span style="color: #33ccff;">,</span><span style="">
        </span><span style="color: #6666cc;">oper</span><span style=""> </span><span style="color: #33ccff;">   </span><span style="color: #cc66cc;">AI::TensorFlow::Libtensorflow::Operation</span><span style=""> </span><span style="color: #33ccff;">{...
            </span><span style="color: #6666cc;">Name</span><span style="">      </span><span style="color: #33ccff;">   </span><span style="color: #33ccff;">&quot;</span><span style="color: #669933;">StatefulPartitionedCall</span><span style="color:...
            </span><span style="color: #6666cc;">NumInputs</span><span style=""> </span><span style="color: #33ccff;">   </span><span style="color: #ff6633;">274</span><span style="color: #33ccff;">,</span><span style="">
            </span><span style="color: #6666cc;">NumOutputs</span><span style="color: #33ccff;">   </span><span style="color: #ff6633;">2</span><span style="color: #33ccff;">,</span><span style="">
            </span><span style="color: #6666cc;">OpType</span><span style="">    </span><span style="color: #33ccff;">   </span><span style="color: #33ccff;">&quot;</span><span style="color: #669933;">StatefulPartitionedCall</span><span style="color:...
        </span><span style="color: #33ccff;">}</span><span style="">
    </span><span style="color: #33ccff;">}</span><span style="color: #33ccff;">,</span><span style="">
    </span><span style="color: #6666cc;">outputs_mouse</span><span style="color: #33ccff;">   </span><span style="color: #cc66cc;">AI::TensorFlow::Libtensorflow::Output</span><span style=""> </span><span style="color: #33ccff;">{</span><span style=""...
        </span><span style="color: #6666cc;">index</span><span style="color: #33ccff;">   </span><span style="color: #ff6633;">1</span><span style="color: #33ccff;">,</span><span style="">
        </span><span style="color: #6666cc;">oper</span><span style=""> </span><span style="color: #33ccff;">   </span><span style="color: #cc66cc;">AI::TensorFlow::Libtensorflow::Operation</span><span style=""> </span><span style="color: #33ccff;">{...
            </span><span style="color: #6666cc;">Name</span><span style="">      </span><span style="color: #33ccff;">   </span><span style="color: #33ccff;">&quot;</span><span style="color: #669933;">StatefulPartitionedCall</span><span style="color:...
            </span><span style="color: #6666cc;">NumInputs</span><span style=""> </span><span style="color: #33ccff;">   </span><span style="color: #ff6633;">274</span><span style="color: #33ccff;">,</span><span style="">
            </span><span style="color: #6666cc;">NumOutputs</span><span style="color: #33ccff;">   </span><span style="color: #ff6633;">2</span><span style="color: #33ccff;">,</span><span style="">
            </span><span style="color: #6666cc;">OpType</span><span style="">    </span><span style="color: #33ccff;">   </span><span style="color: #33ccff;">&quot;</span><span style="color: #669933;">StatefulPartitionedCall</span><span style="color:...
        </span><span style="color: #33ccff;">}</span><span style="">
    </span><span style="color: #33ccff;">}</span><span style="">
</span><span style="color: #33ccff;">}</span><span style="">
</span></code></pre></span>

We need a helper to simplify running the session and getting just the predictions that we want.

  my $predict_on_batch = sub {
      my ($session, $t) = @_;
      my @outputs_t;
  
      $session->Run(
          undef,
          [$puts{inputs_args_0}], [$t],
          [$puts{outputs_human}], \@outputs_t,
          undef,
          undef,
          $s
      );
      AssertOK($s);
  
      return $outputs_t[0];
  };
  
  undef;

=head2 Encoding the data

The model specifies that the way to get a sequence of DNA bases into a C<TFTensor> is to use L<one-hot encoding|https://en.wikipedia.org/wiki/One-hot#Machine_learning_and_statistics> in the order C<ACGT>.

This means that the bases are represented as vectors of length 4:

| base | vector encoding |
|------|-----------------|
| A    | C<[1 0 0 0]>     |
| C    | C<[0 1 0 0]>     |
| G    | C<[0 0 1 0]>     |
| T    | C<[0 0 0 1]>     |
| N    | C<[0 0 0 0]>     |

We can achieve this encoding by creating a lookup table with a PDL ndarray. This could be done by creating a byte PDL ndarray of dimensions C<[ 256 4 ]> to directly look up the the numeric value of characters 0-255, but here we'll go with a smaller C...

  use PDL;
  
  our $SHOW_ENCODER = 1;
  
  sub one_hot_dna {
      my ($seq) = @_;
  
      my $from_alphabet = "NACGT";
      my $to_alphabet   = pack "C*", 0..length($from_alphabet)-1;
  
      # sequences from UCSC genome have both uppercase and lowercase bases
      my $from_alphabet_tr = $from_alphabet . lc $from_alphabet;
      my $to_alphabet_tr   = $to_alphabet x 2;
  
      my $p = zeros(byte, bytes::length($seq));
      my $p_dataref = $p->get_dataref;
      ${ $p_dataref } = $seq;
      eval "tr/$from_alphabet_tr/$to_alphabet_tr/" for ${ $p_dataref };
      $p->upd_data;
  
      my $encoder = append(float(0), identity(float(length($from_alphabet)-1)) );
      say "Encoder is\n", $encoder->info, $encoder if $SHOW_ENCODER;
  
      my $encoded  = $encoder->index( $p->dummy(0) );
  
      return $encoded;
  }
  
  ####
  
  {
  
  say "Testing one-hot encoding:\n";
  
  my $onehot_test_seq = "ACGTNtgcan";
  my $test_encoded = one_hot_dna( $onehot_test_seq );
  $SHOW_ENCODER = 0;
  
  say "One-hot encoding of sequence '$onehot_test_seq' is:";
  say $test_encoded->info, $test_encoded;
  
  }

B<STREAM (STDOUT)>:

  Testing one-hot encoding:
  
  Encoder is
  PDL: Float D [5,4]
  [
   [0 1 0 0 0]
   [0 0 1 0 0]
   [0 0 0 1 0]
   [0 0 0 0 1]
  ]
  
  One-hot encoding of sequence 'ACGTNtgcan' is:
  PDL: Float D [4,10]
  [
   [1 0 0 0]
   [0 1 0 0]
   [0 0 1 0]
   [0 0 0 1]
   [0 0 0 0]
   [0 0 0 1]
   [0 0 1 0]
   [0 1 0 0]
   [1 0 0 0]
   [0 0 0 0]
  ]

B<RESULT>:

  1

Note that in the above, the PDL ndarray's

=over

=item *

first dimension is 4 which matches the last dimension of the input C<TFTensor>;

=item *

second dimension is the sequence length which matches the penultimate dimension of the input C<TFTensor>.

=back

Now we need a way to deal with the sequence interval. We're going to use 1-based coordinates as BioPerl does. In fact, we'll extend a BioPerl class.

  package Interval {
      use Bio::Location::Simple ();
  
      use parent qw(Bio::Location::Simple);
  
      sub center {
          my $self = shift;
          my $center = int( ($self->start + $self->end ) / 2 );
          my $delta = ($self->start + $self->end ) % 2;
          return $center + $delta;
      }
  
      sub resize {
          my ($self, $width) = @_;
          my $new_interval = $self->clone;
  
          my $center = $self->center;
          my $half   = int( ($width-1) / 2 );
          my $offset = ($width-1) % 2;
  
          $new_interval->start( $center - $half - $offset );
          $new_interval->end(   $center + $half  );
  
          return $new_interval;
      }

( run in 0.948 second using v1.01-cache-2.11-cpan-f4a522933cf )