AI-TensorFlow-Libtensorflow

 view release on metacpan or  search on metacpan

lib/AI/TensorFlow/Libtensorflow/Manual/Notebook/InferenceUsingTFHubEnformerGeneExprPredModel.pod  view on Meta::CPAN

            oper => $graph->OperationByName('StatefulPartitionedCall'),
            index => 1,
    }),
);

p %puts;

my $predict_on_batch = sub {
    my ($session, $t) = @_;
    my @outputs_t;

    $session->Run(
        undef,
        [$puts{inputs_args_0}], [$t],
        [$puts{outputs_human}], \@outputs_t,
        undef,
        undef,
        $s
    );
    AssertOK($s);

    return $outputs_t[0];
};

undef;

use PDL;

our $SHOW_ENCODER = 1;

sub one_hot_dna {
    my ($seq) = @_;

    my $from_alphabet = "NACGT";
    my $to_alphabet   = pack "C*", 0..length($from_alphabet)-1;

    # sequences from UCSC genome have both uppercase and lowercase bases
    my $from_alphabet_tr = $from_alphabet . lc $from_alphabet;
    my $to_alphabet_tr   = $to_alphabet x 2;

    my $p = zeros(byte, bytes::length($seq));
    my $p_dataref = $p->get_dataref;
    ${ $p_dataref } = $seq;
    eval "tr/$from_alphabet_tr/$to_alphabet_tr/" for ${ $p_dataref };
    $p->upd_data;

    my $encoder = append(float(0), identity(float(length($from_alphabet)-1)) );
    say "Encoder is\n", $encoder->info, $encoder if $SHOW_ENCODER;

    my $encoded  = $encoder->index( $p->dummy(0) );

    return $encoded;
}

####

{

say "Testing one-hot encoding:\n";

my $onehot_test_seq = "ACGTNtgcan";
my $test_encoded = one_hot_dna( $onehot_test_seq );
$SHOW_ENCODER = 0;

say "One-hot encoding of sequence '$onehot_test_seq' is:";
say $test_encoded->info, $test_encoded;

}

package Interval {
    use Bio::Location::Simple ();

    use parent qw(Bio::Location::Simple);

    sub center {
        my $self = shift;
        my $center = int( ($self->start + $self->end ) / 2 );
        my $delta = ($self->start + $self->end ) % 2;
        return $center + $delta;
    }

    sub resize {
        my ($self, $width) = @_;
        my $new_interval = $self->clone;

        my $center = $self->center;
        my $half   = int( ($width-1) / 2 );
        my $offset = ($width-1) % 2;

        $new_interval->start( $center - $half - $offset );
        $new_interval->end(   $center + $half  );

        return $new_interval;
    }

    use overload '""' => \&_op_stringify;

    sub _op_stringify { sprintf "%s:%s", $_[0]->seq_id // "(no sequence)", $_[0]->to_FTstring }
}

#####

{

say "Testing interval resizing:\n";
sub _debug_resize {
    my ($interval, $to, $msg) = @_;

    my $resized_interval = $interval->resize($to);

    die "Wrong interval size for $interval --($to)--> $resized_interval"
        unless $resized_interval->length == $to;

    say sprintf "Interval: %s -> %s, length %2d : %s",
        $interval,
        $resized_interval, $resized_interval->length,
        $msg;
}

for my $interval_spec ( [4, 8], [5, 8], [5, 9], [6, 9]) {
    my ($start, $end) = @$interval_spec;
    my $test_interval = Interval->new( -seq_id => 'chr11', -start => $start, -end => $end );
    say sprintf "Testing interval %s with length %d", $test_interval, $test_interval->length;
    say "-----";
    for(0..5) {
        my $base = $test_interval->length;
        my $to = $base + $_;
        _debug_resize $test_interval, $to, "$base -> $to (+ $_)";
    }
    say "";
}

}

undef;

use Bio::DB::HTS::Faidx;

my $hg_db = Bio::DB::HTS::Faidx->new( $hg_bgz_path );

sub extract_sequence {
    my ($db, $interval) = @_;

    my $chrom_length = $db->length($interval->seq_id);

    my $trimmed_interval = $interval->clone;
    $trimmed_interval->start( List::Util::max( $interval->start, 1               ) );
    $trimmed_interval->end(   List::Util::min( $interval->end  , $chrom_length   ) );

    # Bio::DB::HTS::Faidx is 0-based for both start and end points
    my $seq = $db->get_sequence2_no_length(
        $trimmed_interval->seq_id,
        $trimmed_interval->start - 1,
        $trimmed_interval->end   - 1,
    );

    my $pad_upstream   = 'N' x List::Util::max( -($interval->start-1), 0 );
    my $pad_downstream = 'N' x List::Util::max( $interval->end - $chrom_length, 0 );

    return join '', $pad_upstream, $seq, $pad_downstream;
}

sub seq_info {
    my ($seq, $n) = @_;
    $n ||= 10;
    if( length $seq > $n ) {
        sprintf "%s...%s (length %d)", uc substr($seq, 0, $n), uc substr($seq, -$n), length $seq;
    } else {
        sprintf "%s (length %d)", uc $seq, length $seq;
    }
}

####

{

say "Testing sequence extraction:";

say "1 base: ",   seq_info
    extract_sequence( $hg_db,
        Interval->new( -seq_id => 'chr11',
            -start => 35_082_742 + 1,
            -end   => 35_082_742 + 1 ) );

say "3 bases: ",  seq_info
    extract_sequence( $hg_db,
        Interval->new( -seq_id => 'chr11',
            -start => 1,

lib/AI/TensorFlow/Libtensorflow/Manual/Notebook/InferenceUsingTFHubEnformerGeneExprPredModel.pod  view on Meta::CPAN

my $predictions = $predict_on_batch->( $session, FloatPDLTOTFTensor( $sequence_one_hot ) );

$t->mark('End of prediction of sequence');

p $predictions;

$t->mark('END');
$t->report();

my $predictions_p = FloatTFTensorToPDL($predictions)->slice(',,(0)');
say $predictions_p->info; undef;

my @tracks = (
    [ 'DNASE:CD14-positive monocyte female' =>   41 => $predictions_p->slice('(41)') ],
    [ 'DNASE:keratinocyte female'           =>   42 => $predictions_p->slice('(42)') ],
    [ 'CHIP:H3K27ac:keratinocyte female'    =>  706 => $predictions_p->slice('(706)')],
    [ 'CAGE:Keratinocyte - epidermal'       => 4799 => log10(1 + $predictions_p->slice('(4799)')) ],
);

use PDL::Graphics::Gnuplot;

my $plot_output_path = 'enformer-target-interval-tracks.png';
my $gp = gpwin('pngcairo', font => ",10", output => $plot_output_path, size => [10,2. * @tracks], aa => 2 );

$gp->multiplot( layout => [1, scalar @tracks], title => $target_interval );

$gp->options(
    offsets => [ graph => "0.01, 0, 0, 0" ],
    lmargin => "at screen 0.05",
);

my $x = zeroes($predictions_p->dim(1))->xlinvals($target_interval->start, $target_interval->end);

my @tics_opts = (mirror => 0, out => 1);

for my $i (0..$#tracks) {
    my ($title, $id, $y) = @{$tracks[$i]};
    $gp->plot( {
            title => $title,
            border => [2],
            ytics => { @tics_opts, locations => [ ceil(($y->max-$y->min)/2)->sclr ] },
            ( $i == $#tracks
                ? ( xtics => { format => '%.3f', @tics_opts } )
                : ( xtics => 0 ) ),
            ( $i == $#tracks ? ( xlabel => 'location ({/Symbol \264}10^7 bases)' ) : ()  ),

        },
        with => 'filledcurves',
        #'lc' => '#086eb5',

        # $x scaled by 1e7; filled curve between $y and the x-axis
        $x / 1e7, $y, pdl(0)
    );
}

$gp->end_multi;

$gp->close;

if( IN_IPERL ) {
    IPerl->png( bytestream => path($plot_output_path)->slurp_raw );
}

# Some code that could be used for working with variants.
1 if <<'COMMENT';

use Bio::DB::HTS::VCF;

my $clinvar_tbi_path = "${clinvar_path}.tbi";
unless( -f $clinvar_tbi_path ) {
    system( qw(tabix), $clinvar_path );
}
my $v = Bio::DB::HTS::VCF->new( filename => $clinvar_path );
$v->num_variants

COMMENT

undef;

use Filesys::DiskUsage qw/du/;

my $total = du( { 'human-readable' => 1, dereference => 1 },
    $model_archive_path, $model_base, $new_model_base,

    $targets_path,

    $hg_gz_path,
    $hg_bgz_path, $hg_bgz_fai_path,

    $clinvar_path,

    $plot_output_path,
);

say "Disk space usage: $total"; undef;

__END__

=pod

=encoding UTF-8

=head1 NAME

AI::TensorFlow::Libtensorflow::Manual::Notebook::InferenceUsingTFHubEnformerGeneExprPredModel - Using TensorFlow to do gene expression prediction using a pre-trained model

=head1 SYNOPSIS

The following tutorial is based on the L<Enformer usage notebook|https://github.com/deepmind/deepmind-research/blob/master/enformer/enformer-usage.ipynb>. It uses a pre-trained model based on a transformer architecture trained as described in Avsec e...

Running the code requires an Internet connection to download the model (from Google servers) and datasets (from GitHub, UCSC, and NIH).

Some of this code is identical to that of C<InferenceUsingTFHubMobileNetV2Model> notebook. Please look there for explanation for that code. As stated there, this will later be wrapped up into a high-level library to hide the details behind an API.

B<NOTE>: If running this model, please be aware that

=over

=item *

the Docker image takes 3 GB or more of disk space;

lib/AI/TensorFlow/Libtensorflow/Manual/Notebook/InferenceUsingTFHubEnformerGeneExprPredModel.pod  view on Meta::CPAN

          undef,
          $s
      );
      AssertOK($s);
  
      return $outputs_t[0];
  };
  
  undef;

=head2 Encoding the data

The model specifies that the way to get a sequence of DNA bases into a C<TFTensor> is to use L<one-hot encoding|https://en.wikipedia.org/wiki/One-hot#Machine_learning_and_statistics> in the order C<ACGT>.

This means that the bases are represented as vectors of length 4:

| base | vector encoding |
|------|-----------------|
| A    | C<[1 0 0 0]>     |
| C    | C<[0 1 0 0]>     |
| G    | C<[0 0 1 0]>     |
| T    | C<[0 0 0 1]>     |
| N    | C<[0 0 0 0]>     |

We can achieve this encoding by creating a lookup table with a PDL ndarray. This could be done by creating a byte PDL ndarray of dimensions C<[ 256 4 ]> to directly look up the the numeric value of characters 0-255, but here we'll go with a smaller C...

  use PDL;
  
  our $SHOW_ENCODER = 1;
  
  sub one_hot_dna {
      my ($seq) = @_;
  
      my $from_alphabet = "NACGT";
      my $to_alphabet   = pack "C*", 0..length($from_alphabet)-1;
  
      # sequences from UCSC genome have both uppercase and lowercase bases
      my $from_alphabet_tr = $from_alphabet . lc $from_alphabet;
      my $to_alphabet_tr   = $to_alphabet x 2;
  
      my $p = zeros(byte, bytes::length($seq));
      my $p_dataref = $p->get_dataref;
      ${ $p_dataref } = $seq;
      eval "tr/$from_alphabet_tr/$to_alphabet_tr/" for ${ $p_dataref };
      $p->upd_data;
  
      my $encoder = append(float(0), identity(float(length($from_alphabet)-1)) );
      say "Encoder is\n", $encoder->info, $encoder if $SHOW_ENCODER;
  
      my $encoded  = $encoder->index( $p->dummy(0) );
  
      return $encoded;
  }
  
  ####
  
  {
  
  say "Testing one-hot encoding:\n";
  
  my $onehot_test_seq = "ACGTNtgcan";
  my $test_encoded = one_hot_dna( $onehot_test_seq );
  $SHOW_ENCODER = 0;
  
  say "One-hot encoding of sequence '$onehot_test_seq' is:";
  say $test_encoded->info, $test_encoded;
  
  }

B<STREAM (STDOUT)>:

  Testing one-hot encoding:
  
  Encoder is
  PDL: Float D [5,4]
  [
   [0 1 0 0 0]
   [0 0 1 0 0]
   [0 0 0 1 0]
   [0 0 0 0 1]
  ]
  
  One-hot encoding of sequence 'ACGTNtgcan' is:
  PDL: Float D [4,10]
  [
   [1 0 0 0]
   [0 1 0 0]
   [0 0 1 0]
   [0 0 0 1]
   [0 0 0 0]
   [0 0 0 1]
   [0 0 1 0]
   [0 1 0 0]
   [1 0 0 0]
   [0 0 0 0]
  ]

B<RESULT>:

  1

Note that in the above, the PDL ndarray's

=over

=item *

first dimension is 4 which matches the last dimension of the input C<TFTensor>;

=item *

second dimension is the sequence length which matches the penultimate dimension of the input C<TFTensor>.

=back

Now we need a way to deal with the sequence interval. We're going to use 1-based coordinates as BioPerl does. In fact, we'll extend a BioPerl class.

  package Interval {
      use Bio::Location::Simple ();
  
      use parent qw(Bio::Location::Simple);
  
      sub center {
          my $self = shift;
          my $center = int( ($self->start + $self->end ) / 2 );
          my $delta = ($self->start + $self->end ) % 2;
          return $center + $delta;
      }
  
      sub resize {
          my ($self, $width) = @_;
          my $new_interval = $self->clone;
  
          my $center = $self->center;
          my $half   = int( ($width-1) / 2 );
          my $offset = ($width-1) % 2;
  
          $new_interval->start( $center - $half - $offset );
          $new_interval->end(   $center + $half  );
  
          return $new_interval;
      }
  
      use overload '""' => \&_op_stringify;
  
      sub _op_stringify { sprintf "%s:%s", $_[0]->seq_id // "(no sequence)", $_[0]->to_FTstring }
  }
  
  #####
  
  {
  
  say "Testing interval resizing:\n";
  sub _debug_resize {
      my ($interval, $to, $msg) = @_;
  
      my $resized_interval = $interval->resize($to);
  
      die "Wrong interval size for $interval --($to)--> $resized_interval"
          unless $resized_interval->length == $to;
  
      say sprintf "Interval: %s -> %s, length %2d : %s",
          $interval,
          $resized_interval, $resized_interval->length,
          $msg;
  }
  
  for my $interval_spec ( [4, 8], [5, 8], [5, 9], [6, 9]) {
      my ($start, $end) = @$interval_spec;
      my $test_interval = Interval->new( -seq_id => 'chr11', -start => $start, -end => $end );
      say sprintf "Testing interval %s with length %d", $test_interval, $test_interval->length;
      say "-----";
      for(0..5) {
          my $base = $test_interval->length;
          my $to = $base + $_;
          _debug_resize $test_interval, $to, "$base -> $to (+ $_)";
      }
      say "";
  }
  
  }
  
  undef;

B<STREAM (STDOUT)>:

  Testing interval resizing:
  
  Testing interval chr11:4..8 with length 5
  -----
  Interval: chr11:4..8 -> chr11:4..8, length  5 : 5 -> 5 (+ 0)
  Interval: chr11:4..8 -> chr11:3..8, length  6 : 5 -> 6 (+ 1)
  Interval: chr11:4..8 -> chr11:3..9, length  7 : 5 -> 7 (+ 2)
  Interval: chr11:4..8 -> chr11:2..9, length  8 : 5 -> 8 (+ 3)
  Interval: chr11:4..8 -> chr11:2..10, length  9 : 5 -> 9 (+ 4)
  Interval: chr11:4..8 -> chr11:1..10, length 10 : 5 -> 10 (+ 5)
  
  Testing interval chr11:5..8 with length 4
  -----
  Interval: chr11:5..8 -> chr11:5..8, length  4 : 4 -> 4 (+ 0)
  Interval: chr11:5..8 -> chr11:5..9, length  5 : 4 -> 5 (+ 1)
  Interval: chr11:5..8 -> chr11:4..9, length  6 : 4 -> 6 (+ 2)
  Interval: chr11:5..8 -> chr11:4..10, length  7 : 4 -> 7 (+ 3)
  Interval: chr11:5..8 -> chr11:3..10, length  8 : 4 -> 8 (+ 4)
  Interval: chr11:5..8 -> chr11:3..11, length  9 : 4 -> 9 (+ 5)
  
  Testing interval chr11:5..9 with length 5
  -----
  Interval: chr11:5..9 -> chr11:5..9, length  5 : 5 -> 5 (+ 0)
  Interval: chr11:5..9 -> chr11:4..9, length  6 : 5 -> 6 (+ 1)
  Interval: chr11:5..9 -> chr11:4..10, length  7 : 5 -> 7 (+ 2)
  Interval: chr11:5..9 -> chr11:3..10, length  8 : 5 -> 8 (+ 3)
  Interval: chr11:5..9 -> chr11:3..11, length  9 : 5 -> 9 (+ 4)
  Interval: chr11:5..9 -> chr11:2..11, length 10 : 5 -> 10 (+ 5)
  
  Testing interval chr11:6..9 with length 4
  -----
  Interval: chr11:6..9 -> chr11:6..9, length  4 : 4 -> 4 (+ 0)
  Interval: chr11:6..9 -> chr11:6..10, length  5 : 4 -> 5 (+ 1)
  Interval: chr11:6..9 -> chr11:5..10, length  6 : 4 -> 6 (+ 2)
  Interval: chr11:6..9 -> chr11:5..11, length  7 : 4 -> 7 (+ 3)
  Interval: chr11:6..9 -> chr11:4..11, length  8 : 4 -> 8 (+ 4)
  Interval: chr11:6..9 -> chr11:4..12, length  9 : 4 -> 9 (+ 5)
  


  use Bio::DB::HTS::Faidx;
  
  my $hg_db = Bio::DB::HTS::Faidx->new( $hg_bgz_path );
  
  sub extract_sequence {
      my ($db, $interval) = @_;
  
      my $chrom_length = $db->length($interval->seq_id);
  
      my $trimmed_interval = $interval->clone;

lib/AI/TensorFlow/Libtensorflow/Manual/Notebook/InferenceUsingTFHubEnformerGeneExprPredModel.pod  view on Meta::CPAN


  my $predictions_p = FloatTFTensorToPDL($predictions)->slice(',,(0)');
  say $predictions_p->info; undef;

B<STREAM (STDOUT)>:

  PDL: Float D [5313,896]

=head2 Plot predicted tracks

These predictions can be plotted 

  my @tracks = (
      [ 'DNASE:CD14-positive monocyte female' =>   41 => $predictions_p->slice('(41)') ],
      [ 'DNASE:keratinocyte female'           =>   42 => $predictions_p->slice('(42)') ],
      [ 'CHIP:H3K27ac:keratinocyte female'    =>  706 => $predictions_p->slice('(706)')],
      [ 'CAGE:Keratinocyte - epidermal'       => 4799 => log10(1 + $predictions_p->slice('(4799)')) ],
  );
  
  use PDL::Graphics::Gnuplot;
  
  my $plot_output_path = 'enformer-target-interval-tracks.png';
  my $gp = gpwin('pngcairo', font => ",10", output => $plot_output_path, size => [10,2. * @tracks], aa => 2 );
  
  $gp->multiplot( layout => [1, scalar @tracks], title => $target_interval );
  
  $gp->options(
      offsets => [ graph => "0.01, 0, 0, 0" ],
      lmargin => "at screen 0.05",
  );
  
  my $x = zeroes($predictions_p->dim(1))->xlinvals($target_interval->start, $target_interval->end);
  
  my @tics_opts = (mirror => 0, out => 1);
  
  for my $i (0..$#tracks) {
      my ($title, $id, $y) = @{$tracks[$i]};
      $gp->plot( {
              title => $title,
              border => [2],
              ytics => { @tics_opts, locations => [ ceil(($y->max-$y->min)/2)->sclr ] },
              ( $i == $#tracks
                  ? ( xtics => { format => '%.3f', @tics_opts } )
                  : ( xtics => 0 ) ),
              ( $i == $#tracks ? ( xlabel => 'location ({/Symbol \264}10^7 bases)' ) : ()  ),
  
          },
          with => 'filledcurves',
          #'lc' => '#086eb5',
  
          # $x scaled by 1e7; filled curve between $y and the x-axis
          $x / 1e7, $y, pdl(0)
      );
  }
  
  $gp->end_multi;
  
  $gp->close;
  
  if( IN_IPERL ) {
      IPerl->png( bytestream => path($plot_output_path)->slurp_raw );
  }

B<DISPLAY>:

=for html <span style="display:inline-block;margin-left:1em;"><p><img						src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAA+gAAAMgCAIAAAA/et9qAAAgAElEQVR4nOzdd2AUVeIH8Ddb0jshBAIEpSo1GjoIpyAgCOqd3uGdoGBBUQQFRUVBRbkTf9gOBQucqFiwUhSSgJQYCCSBkJBAet1k...

=head2 Parts of the original notebook that fall outside the scope

In the orignal notebook, there are several more steps that have not been ported here:

=over

=item 1.

"Compute contribution scores":

This task requires implementing C<@tf.function> to compile gradients.

=item 2.

"Predict the effect of a genetic variant" and "Score multiple variants":

The first task is possible, but the second task requires loading a pre-processing pipeline for scikit-learn and unfortunately this pipeline is stored as a pickle file that is valid for an older version of scikit-learn (version 0.23.2) and as such its...

=back

  # Some code that could be used for working with variants.
  1 if <<'COMMENT';
  
  use Bio::DB::HTS::VCF;
  
  my $clinvar_tbi_path = "${clinvar_path}.tbi";
  unless( -f $clinvar_tbi_path ) {
      system( qw(tabix), $clinvar_path );
  }
  my $v = Bio::DB::HTS::VCF->new( filename => $clinvar_path );
  $v->num_variants
  
  COMMENT
  
  undef;

=head1 RESOURCE USAGE

  use Filesys::DiskUsage qw/du/;
  
  my $total = du( { 'human-readable' => 1, dereference => 1 },
      $model_archive_path, $model_base, $new_model_base,
  
      $targets_path,
  
      $hg_gz_path,
      $hg_bgz_path, $hg_bgz_fai_path,
  
      $clinvar_path,
  
      $plot_output_path,
  );
  
  say "Disk space usage: $total"; undef;



( run in 1.117 second using v1.01-cache-2.11-cpan-62a16548d74 )