AI-TensorFlow-Libtensorflow

 view release on metacpan or  search on metacpan

lib/AI/TensorFlow/Libtensorflow/Manual/Notebook/InferenceUsingTFHubEnformerGeneExprPredModel.pod  view on Meta::CPAN


my %puts = (
    ## Inputs
    inputs_args_0 =>
        AI::TensorFlow::Libtensorflow::Output->New({
            oper => $graph->OperationByName('serving_default_args_0'),
            index => 0,
        }),

    ## Outputs
    outputs_human  =>
        AI::TensorFlow::Libtensorflow::Output->New({
            oper => $graph->OperationByName('StatefulPartitionedCall'),
            index => 0,
        }),
    outputs_mouse  =>
        AI::TensorFlow::Libtensorflow::Output->New({
            oper => $graph->OperationByName('StatefulPartitionedCall'),
            index => 1,
    }),
);

p %puts;

my $predict_on_batch = sub {
    my ($session, $t) = @_;
    my @outputs_t;

    $session->Run(
        undef,
        [$puts{inputs_args_0}], [$t],
        [$puts{outputs_human}], \@outputs_t,
        undef,
        undef,
        $s
    );
    AssertOK($s);

    return $outputs_t[0];
};

undef;

use PDL;

our $SHOW_ENCODER = 1;

sub one_hot_dna {
    my ($seq) = @_;

    my $from_alphabet = "NACGT";
    my $to_alphabet   = pack "C*", 0..length($from_alphabet)-1;

    # sequences from UCSC genome have both uppercase and lowercase bases
    my $from_alphabet_tr = $from_alphabet . lc $from_alphabet;
    my $to_alphabet_tr   = $to_alphabet x 2;

    my $p = zeros(byte, bytes::length($seq));
    my $p_dataref = $p->get_dataref;
    ${ $p_dataref } = $seq;
    eval "tr/$from_alphabet_tr/$to_alphabet_tr/" for ${ $p_dataref };
    $p->upd_data;

    my $encoder = append(float(0), identity(float(length($from_alphabet)-1)) );
    say "Encoder is\n", $encoder->info, $encoder if $SHOW_ENCODER;

    my $encoded  = $encoder->index( $p->dummy(0) );

    return $encoded;
}

####

{

say "Testing one-hot encoding:\n";

my $onehot_test_seq = "ACGTNtgcan";
my $test_encoded = one_hot_dna( $onehot_test_seq );
$SHOW_ENCODER = 0;

say "One-hot encoding of sequence '$onehot_test_seq' is:";
say $test_encoded->info, $test_encoded;

}

package Interval {
    use Bio::Location::Simple ();

    use parent qw(Bio::Location::Simple);

    sub center {
        my $self = shift;
        my $center = int( ($self->start + $self->end ) / 2 );
        my $delta = ($self->start + $self->end ) % 2;
        return $center + $delta;
    }

    sub resize {
        my ($self, $width) = @_;
        my $new_interval = $self->clone;

        my $center = $self->center;
        my $half   = int( ($width-1) / 2 );
        my $offset = ($width-1) % 2;

        $new_interval->start( $center - $half - $offset );
        $new_interval->end(   $center + $half  );

        return $new_interval;
    }

    use overload '""' => \&_op_stringify;

    sub _op_stringify { sprintf "%s:%s", $_[0]->seq_id // "(no sequence)", $_[0]->to_FTstring }
}

#####

{

lib/AI/TensorFlow/Libtensorflow/Manual/Notebook/InferenceUsingTFHubEnformerGeneExprPredModel.pod  view on Meta::CPAN

            </span><span style="color: #6666cc;">OpType</span><span style="">    </span><span style="color: #33ccff;">   </span><span style="color: #33ccff;">&quot;</span><span style="color: #669933;">StatefulPartitionedCall</span><span style="color:...
        </span><span style="color: #33ccff;">}</span><span style="">
    </span><span style="color: #33ccff;">}</span><span style="">
</span><span style="color: #33ccff;">}</span><span style="">
</span></code></pre></span>

We need a helper to simplify running the session and getting just the predictions that we want.

  my $predict_on_batch = sub {
      my ($session, $t) = @_;
      my @outputs_t;
  
      $session->Run(
          undef,
          [$puts{inputs_args_0}], [$t],
          [$puts{outputs_human}], \@outputs_t,
          undef,
          undef,
          $s
      );
      AssertOK($s);
  
      return $outputs_t[0];
  };
  
  undef;

=head2 Encoding the data

The model specifies that the way to get a sequence of DNA bases into a C<TFTensor> is to use L<one-hot encoding|https://en.wikipedia.org/wiki/One-hot#Machine_learning_and_statistics> in the order C<ACGT>.

This means that the bases are represented as vectors of length 4:

| base | vector encoding |
|------|-----------------|
| A    | C<[1 0 0 0]>     |
| C    | C<[0 1 0 0]>     |
| G    | C<[0 0 1 0]>     |
| T    | C<[0 0 0 1]>     |
| N    | C<[0 0 0 0]>     |

We can achieve this encoding by creating a lookup table with a PDL ndarray. This could be done by creating a byte PDL ndarray of dimensions C<[ 256 4 ]> to directly look up the the numeric value of characters 0-255, but here we'll go with a smaller C...

  use PDL;
  
  our $SHOW_ENCODER = 1;
  
  sub one_hot_dna {
      my ($seq) = @_;
  
      my $from_alphabet = "NACGT";
      my $to_alphabet   = pack "C*", 0..length($from_alphabet)-1;
  
      # sequences from UCSC genome have both uppercase and lowercase bases
      my $from_alphabet_tr = $from_alphabet . lc $from_alphabet;
      my $to_alphabet_tr   = $to_alphabet x 2;
  
      my $p = zeros(byte, bytes::length($seq));
      my $p_dataref = $p->get_dataref;
      ${ $p_dataref } = $seq;
      eval "tr/$from_alphabet_tr/$to_alphabet_tr/" for ${ $p_dataref };
      $p->upd_data;
  
      my $encoder = append(float(0), identity(float(length($from_alphabet)-1)) );
      say "Encoder is\n", $encoder->info, $encoder if $SHOW_ENCODER;
  
      my $encoded  = $encoder->index( $p->dummy(0) );
  
      return $encoded;
  }
  
  ####
  
  {
  
  say "Testing one-hot encoding:\n";
  
  my $onehot_test_seq = "ACGTNtgcan";
  my $test_encoded = one_hot_dna( $onehot_test_seq );
  $SHOW_ENCODER = 0;
  
  say "One-hot encoding of sequence '$onehot_test_seq' is:";
  say $test_encoded->info, $test_encoded;
  
  }

B<STREAM (STDOUT)>:

  Testing one-hot encoding:
  
  Encoder is
  PDL: Float D [5,4]
  [
   [0 1 0 0 0]
   [0 0 1 0 0]
   [0 0 0 1 0]
   [0 0 0 0 1]
  ]
  
  One-hot encoding of sequence 'ACGTNtgcan' is:
  PDL: Float D [4,10]
  [
   [1 0 0 0]
   [0 1 0 0]
   [0 0 1 0]
   [0 0 0 1]
   [0 0 0 0]
   [0 0 0 1]
   [0 0 1 0]
   [0 1 0 0]
   [1 0 0 0]
   [0 0 0 0]
  ]

B<RESULT>:

  1

Note that in the above, the PDL ndarray's

=over



( run in 0.673 second using v1.01-cache-2.11-cpan-5a3173703d6 )