png results from the CPAN

AI-TensorFlow-Libtensorflow

    extract_sequence( $hg_db,
        Interval->new( -seq_id => 'chr11',
            -start => 1,
            -end   => $hg_db->length('chr11') )->resize( $hg_db->length('chr11') ) );
}

my $target_interval = Interval->new( -seq_id => 'chr11',
    -start => 35_082_742 +  1, # BioPerl is 1-based
    -end   => 35_197_430 );

say "Target interval: $target_interval with length @{[ $target_interval->length ]}";

die "Target interval is not $model_central_base_pairs_length bp long"
    unless $target_interval->length == $model_central_base_pairs_length;

say "Target sequence is ", seq_info extract_sequence( $hg_db, $target_interval );


say "";


my $resized_interval = $target_interval->resize( $model_sequence_length );
say "Resized interval: $resized_interval with length @{[ $resized_interval->length ]}";

die "resize() is not working properly!" unless $resized_interval->length == $model_sequence_length;

my $seq = extract_sequence( $hg_db, $resized_interval );

say "Resized sequence is ", seq_info($seq);

my $sequence_one_hot = one_hot_dna( $seq )->dummy(-1);

say $sequence_one_hot->info; undef;

use Devel::Timer;
my $t = Devel::Timer->new;

$t->mark('prediction of sequence');

my $predictions = $predict_on_batch->( $session, FloatPDLTOTFTensor( $sequence_one_hot ) );

$t->mark('End of prediction of sequence');

p $predictions;

$t->mark('END');
$t->report();

my $predictions_p = FloatTFTensorToPDL($predictions)->slice(',,(0)');
say $predictions_p->info; undef;

my @tracks = (
    [ 'DNASE:CD14-positive monocyte female' =>   41 => $predictions_p->slice('(41)') ],
    [ 'DNASE:keratinocyte female'           =>   42 => $predictions_p->slice('(42)') ],
    [ 'CHIP:H3K27ac:keratinocyte female'    =>  706 => $predictions_p->slice('(706)')],
    [ 'CAGE:Keratinocyte - epidermal'       => 4799 => log10(1 + $predictions_p->slice('(4799)')) ],
);

use PDL::Graphics::Gnuplot;

my $plot_output_path = 'enformer-target-interval-tracks.png';
my $gp = gpwin('pngcairo', font => ",10", output => $plot_output_path, size => [10,2. * @tracks], aa => 2 );

$gp->multiplot( layout => [1, scalar @tracks], title => $target_interval );

$gp->options(
    offsets => [ graph => "0.01, 0, 0, 0" ],
    lmargin => "at screen 0.05",
);

my $x = zeroes($predictions_p->dim(1))->xlinvals($target_interval->start, $target_interval->end);

my @tics_opts = (mirror => 0, out => 1);

for my $i (0..$#tracks) {
    my ($title, $id, $y) = @{$tracks[$i]};
    $gp->plot( {
            title => $title,
            border => [2],
            ytics => { @tics_opts, locations => [ ceil(($y->max-$y->min)/2)->sclr ] },
            ( $i == $#tracks
                ? ( xtics => { format => '%.3f', @tics_opts } )
                : ( xtics => 0 ) ),
            ( $i == $#tracks ? ( xlabel => 'location ({/Symbol \264}10^7 bases)' ) : ()  ),

        },
        with => 'filledcurves',
        #'lc' => '#086eb5',

        # $x scaled by 1e7; filled curve between $y and the x-axis
        $x / 1e7, $y, pdl(0)
    );
}

$gp->end_multi;

$gp->close;

if( IN_IPERL ) {
    IPerl->png( bytestream => path($plot_output_path)->slurp_raw );
}

# Some code that could be used for working with variants.
1 if <<'COMMENT';

use Bio::DB::HTS::VCF;

my $clinvar_tbi_path = "${clinvar_path}.tbi";
unless( -f $clinvar_tbi_path ) {
    system( qw(tabix), $clinvar_path );
}
my $v = Bio::DB::HTS::VCF->new( filename => $clinvar_path );
$v->num_variants

COMMENT

undef;

use Filesys::DiskUsage qw/du/;

my $total = du( { 'human-readable' => 1, dereference => 1 },
    $model_archive_path, $model_base, $new_model_base,

    $targets_path,

    $hg_gz_path,
    $hg_bgz_path, $hg_bgz_fai_path,

    $clinvar_path,

    $plot_output_path,
);

say "Disk space usage: $total"; undef;

__END__

=pod

=encoding UTF-8

=head1 NAME

AI::TensorFlow::Libtensorflow::Manual::Notebook::InferenceUsingTFHubEnformerGeneExprPredModel - Using TensorFlow to do gene expression prediction using a pre-trained model

=head1 SYNOPSIS

The following tutorial is based on the L<Enformer usage notebook|https://github.com/deepmind/deepmind-research/blob/master/enformer/enformer-usage.ipynb>. It uses a pre-trained model based on a transformer architecture trained as described in Avsec e...

Running the code requires an Internet connection to download the model (from Google servers) and datasets (from GitHub, UCSC, and NIH).

Some of this code is identical to that of C<InferenceUsingTFHubMobileNetV2Model> notebook. Please look there for explanation for that code. As stated there, this will later be wrapped up into a high-level library to hide the details behind an API.

B<NOTE>: If running this model, please be aware that

=over

=item *

the Docker image takes 3 GB or more of disk space;

lib/AI/TensorFlow/Libtensorflow/Manual/Notebook/InferenceUsingTFHubEnformerGeneExprPredModel.pod view on Meta::CPAN



  use Devel::Timer;
  my $t = Devel::Timer->new;
  
  $t->mark('prediction of sequence');
  
  my $predictions = $predict_on_batch->( $session, FloatPDLTOTFTensor( $sequence_one_hot ) );
  
  $t->mark('End of prediction of sequence');
  
  p $predictions;
  
  $t->mark('END');
  $t->report();

B<STREAM (STDERR)>:

=begin html

<span style="display:inline-block;margin-left:1em;"><pre style="display: block"><code><span style="color: #cc66cc;">AI::TensorFlow::Libtensorflow::Tensor</span><span style=""> </span><span style="color: #33ccff;">{</span><span style="">
    </span><span style="color: #6666cc;">Type           </span><span style=""> </span><span style="color: #cc66cc;">FLOAT</span><span style="">
    </span><span style="color: #6666cc;">Dims           </span><span style=""> </span><span style="color: #33ccff;">[</span><span style=""> </span><span style="color: #ff6633;">1</span><span style=""> </span><span style="color: #ff6633;">896</span><s...
    </span><span style="color: #6666cc;">NumDims        </span><span style=""> </span><span style="color: #ff6633;">3</span><span style="">
    </span><span style="color: #6666cc;">ElementCount   </span><span style=""> </span><span style="color: #ff6633;">4760448</span><span style="">
</span><span style="color: #33ccff;">}</span><span style="">


Devel::Timer Report -- Total time: 14.5641 secs
Interval  Time    Percent
----------------------------------------------
01 -&gt; 02  14.5634  100.00%  prediction of sequence -&gt; End of prediction of sequence
02 -&gt; 03  0.0007   0.00%  End of prediction of sequence -&gt; END
00 -&gt; 01  0.0000   0.00%  INIT -&gt; prediction of sequence
</span></code></pre></span>

=end html

Now we turn the C<TFTensor> output into a PDL ndarray.

  my $predictions_p = FloatTFTensorToPDL($predictions)->slice(',,(0)');
  say $predictions_p->info; undef;

B<STREAM (STDOUT)>:

  PDL: Float D [5313,896]

=head2 Plot predicted tracks

These predictions can be plotted 

  my @tracks = (
      [ 'DNASE:CD14-positive monocyte female' =>   41 => $predictions_p->slice('(41)') ],
      [ 'DNASE:keratinocyte female'           =>   42 => $predictions_p->slice('(42)') ],
      [ 'CHIP:H3K27ac:keratinocyte female'    =>  706 => $predictions_p->slice('(706)')],
      [ 'CAGE:Keratinocyte - epidermal'       => 4799 => log10(1 + $predictions_p->slice('(4799)')) ],
  );
  
  use PDL::Graphics::Gnuplot;
  
  my $plot_output_path = 'enformer-target-interval-tracks.png';
  my $gp = gpwin('pngcairo', font => ",10", output => $plot_output_path, size => [10,2. * @tracks], aa => 2 );
  
  $gp->multiplot( layout => [1, scalar @tracks], title => $target_interval );
  
  $gp->options(
      offsets => [ graph => "0.01, 0, 0, 0" ],
      lmargin => "at screen 0.05",
  );
  
  my $x = zeroes($predictions_p->dim(1))->xlinvals($target_interval->start, $target_interval->end);
  
  my @tics_opts = (mirror => 0, out => 1);
  
  for my $i (0..$#tracks) {
      my ($title, $id, $y) = @{$tracks[$i]};
      $gp->plot( {
              title => $title,
              border => [2],
              ytics => { @tics_opts, locations => [ ceil(($y->max-$y->min)/2)->sclr ] },
              ( $i == $#tracks
                  ? ( xtics => { format => '%.3f', @tics_opts } )
                  : ( xtics => 0 ) ),
              ( $i == $#tracks ? ( xlabel => 'location ({/Symbol \264}10^7 bases)' ) : ()  ),
  
          },
          with => 'filledcurves',
          #'lc' => '#086eb5',
  
          # $x scaled by 1e7; filled curve between $y and the x-axis
          $x / 1e7, $y, pdl(0)
      );
  }
  
  $gp->end_multi;
  
  $gp->close;
  
  if( IN_IPERL ) {
      IPerl->png( bytestream => path($plot_output_path)->slurp_raw );
  }

B<DISPLAY>:

=for html <span style="display:inline-block;margin-left:1em;"><p><img						src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAA+gAAAMgCAIAAAA/et9qAAAgAElEQVR4nOzdd2AUVeIH8Ddb0jshBAIEpSo1GjoIpyAgCOqd3uGdoGBBUQQFRUVBRbkTf9gOBQucqFiwUhSSgJQYCCSBkJBAet1k...

=head2 Parts of the original notebook that fall outside the scope

In the orignal notebook, there are several more steps that have not been ported here:

=over

=item 1.

"Compute contribution scores":

This task requires implementing C<@tf.function> to compile gradients.

=item 2.

"Predict the effect of a genetic variant" and "Score multiple variants":

The first task is possible, but the second task requires loading a pre-processing pipeline for scikit-learn and unfortunately this pipeline is stored as a pickle file that is valid for an older version of scikit-learn (version 0.23.2) and as such its...

=back

  # Some code that could be used for working with variants.
  1 if <<'COMMENT';
  
  use Bio::DB::HTS::VCF;
  
  my $clinvar_tbi_path = "${clinvar_path}.tbi";
  unless( -f $clinvar_tbi_path ) {
      system( qw(tabix), $clinvar_path );
  }
  my $v = Bio::DB::HTS::VCF->new( filename => $clinvar_path );
  $v->num_variants
  
  COMMENT
  
  undef;

=head1 RESOURCE USAGE

  use Filesys::DiskUsage qw/du/;
  
  my $total = du( { 'human-readable' => 1, dereference => 1 },
      $model_archive_path, $model_base, $new_model_base,
  
      $targets_path,
  
      $hg_gz_path,
      $hg_bgz_path, $hg_bgz_fai_path,
  
      $clinvar_path,
  
      $plot_output_path,
  );
  
  say "Disk space usage: $total"; undef;

B<STREAM (STDOUT)>:

  Disk space usage: 4.66G

( run in 0.460 second using v1.01-cache-2.11-cpan-df04353d9ac )