sub results from the CPAN

AI-TensorFlow-Libtensorflow

# PODNAME: AI::TensorFlow::Libtensorflow::Manual::Notebook::InferenceUsingTFHubEnformerGeneExprPredModel


## DO NOT EDIT. Generated from notebook/InferenceUsingTFHubEnformerGeneExprPredModel.ipynb using ./maint/process-notebook.pl.

use strict;
use warnings;
use utf8;
use constant IN_IPERL => !! $ENV{PERL_IPERL_RUNNING};
no if IN_IPERL, warnings => 'redefine'; # fewer messages when re-running cells

use feature qw(say);
use Syntax::Construct qw( // );

use lib::projectroot qw(lib);

BEGIN {
    if( IN_IPERL ) {
        $ENV{TF_CPP_MIN_LOG_LEVEL} = 3;
    }
    require AI::TensorFlow::Libtensorflow;
}

use URI ();
use HTTP::Tiny ();
use Path::Tiny qw(path);

use File::Which ();

use List::Util ();

use Data::Printer ( output => 'stderr', return_value => 'void', filters => ['PDL'] );
use Data::Printer::Filter::PDL ();
use Text::Table::Tiny qw(generate_table);

my $s = AI::TensorFlow::Libtensorflow::Status->New;
sub AssertOK {
    die "Status $_[0]: " . $_[0]->Message
        unless $_[0]->GetCode == AI::TensorFlow::Libtensorflow::Status::OK;
    return;
}
AssertOK($s);

use PDL;
use AI::TensorFlow::Libtensorflow::DataType qw(FLOAT);

use FFI::Platypus::Memory qw(memcpy);
use FFI::Platypus::Buffer qw(scalar_to_pointer);

sub FloatPDLTOTFTensor {
    my ($p) = @_;
    return AI::TensorFlow::Libtensorflow::Tensor->New(
        FLOAT, [ reverse $p->dims ], $p->get_dataref, sub { undef $p }
    );
}

sub FloatTFTensorToPDL {
    my ($t) = @_;

    my $pdl = zeros(float,reverse( map $t->Dim($_), 0..$t->NumDims-1 ) );

    memcpy scalar_to_pointer( ${$pdl->get_dataref} ),
        scalar_to_pointer( ${$t->Data} ),
        $t->ByteSize;
    $pdl->upd_data;

    $pdl;
}

# Model handle
my $model_uri = URI->new( 'https://tfhub.dev/deepmind/enformer/1' );
$model_uri->query_form( 'tf-hub-format' => 'compressed' );
my $model_base = substr( $model_uri->path, 1 ) =~ s,/,_,gr;
my $model_archive_path = "${model_base}.tar.gz";
my $model_sequence_length = 393_216; # bp

# Human targets from Basenji2 dataset
my $targets_uri  = URI->new('https://raw.githubusercontent.com/calico/basenji/master/manuscripts/cross2020/targets_human.txt');
my $targets_path = 'targets_human.txt';

# Human reference genome
my $hg_uri    = URI->new("http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz");
my $hg_gz_path   = "hg38.fa.gz";
# From http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/md5sum.txt
my $hg_md5_digest = "1c9dcaddfa41027f17cd8f7a82c7293b";

my $clinvar_uri  = URI->new('https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz');
my $clinvar_path = 'clinvar.vcf.gz';

my $http = HTTP::Tiny->new;

for my $download ( [ $model_uri   => $model_archive_path ],
                   [ $targets_uri => $targets_path       ],
                   [ $hg_uri      => $hg_gz_path            ],
                   [ $clinvar_uri => $clinvar_path       ],) {
    my ($uri, $path) = @$download;
    say "Downloading $uri to $path";
    next if -e $path;
    $http->mirror( $uri, $path );
}

use Archive::Extract;
$Archive::Extract::DEBUG      = 1;
$Archive::Extract::PREFER_BIN = 1; # for the larger model, prefer bin
if( ! -e $model_base ) {
    my $ae = Archive::Extract->new( archive => $model_archive_path );
    die "Could not extract archive" unless $ae->extract( to => $model_base );
}

use Digest::file qw(digest_file_hex);
if( digest_file_hex( $hg_gz_path, "MD5" ) eq $hg_md5_digest ) {
    say "MD5 sum for $hg_gz_path OK";
} else {
    die "Digest for $hg_gz_path failed";
}

(my $hg_uncompressed_path = $hg_gz_path) =~ s/\.gz$//;
my $hg_bgz_path = "${hg_uncompressed_path}.bgz";

use IPC::Run;

if( ! -e $hg_bgz_path ) {
    IPC::Run::run(
        [ qw(gunzip -c) ], '<', $hg_gz_path,
        '|',
        [ qw(bgzip -c)  ], '>', $hg_bgz_path
    );
}

use Bio::Tools::Run::Samtools;

my $hg_bgz_fai_path = "${hg_bgz_path}.fai";
if( ! -e $hg_bgz_fai_path ) {
    my $faidx_tool = Bio::Tools::Run::Samtools->new( -command => 'faidx' );
    $faidx_tool->run( -fas => $hg_bgz_path )
        or die "Could not index FASTA file $hg_bgz_path: " . $faidx_tool->error_string;
}

sub saved_model_cli {
    my (@rest) = @_;
    if( File::Which::which('saved_model_cli')) {
        system(qw(saved_model_cli), @rest ) == 0
            or die "Could not run saved_model_cli";
    } else {
        warn "saved_model_cli(): Install the tensorflow Python package to get the `saved_model_cli` command.\n";
        return -1;
    }
}

say "Checking with saved_model_cli scan:";
saved_model_cli( qw(scan),
    qw(--dir) => $model_base,
);

saved_model_cli( qw(show),
    qw(--dir) => $model_base,
    qw(--all),
);

my $new_model_base = "${model_base}_new";

system( qw(python3), qw(-c) => <<EOF, $model_base, $new_model_base ) unless -e $new_model_base;
import sys
import tensorflow as tf

in_path, out_path  = sys.argv[1:3]

imported_model = tf.saved_model.load(in_path).model
tf.saved_model.save( imported_model , out_path )
EOF

saved_model_cli( qw(show),
    qw(--dir) => $new_model_base,
    qw(--all),
);

my $model_central_base_pairs_length     = 114_688; # bp
my $model_central_base_pair_window_size = 128;     # bp / prediction

say "Number of predictions: ", $model_central_base_pairs_length / $model_central_base_pair_window_size;

use Data::Frame;

my $df = Data::Frame->from_csv( $targets_path, sep => "\t" )
    ->transform({
        file => sub {
            my ($col, $df) = @_;
            # clean up the paths in 'file' column
            [map { join "/", (split('/', $_))[7..8] } $col->list];
        }
    });

say "Number of targets: ", $df->nrow;

say "";

say "First 5:";
say $df->head(5);

my $opt = AI::TensorFlow::Libtensorflow::SessionOptions->New;

my @tags = ( 'serve' );
my $graph = AI::TensorFlow::Libtensorflow::Graph->New;
my $session = AI::TensorFlow::Libtensorflow::Session->LoadFromSavedModel(
    $opt, undef, $new_model_base, \@tags, $graph, undef, $s
);
AssertOK($s);

my %puts = (
    ## Inputs
    inputs_args_0 =>
        AI::TensorFlow::Libtensorflow::Output->New({
            oper => $graph->OperationByName('serving_default_args_0'),
            index => 0,
        }),

    ## Outputs
    outputs_human  =>
        AI::TensorFlow::Libtensorflow::Output->New({
            oper => $graph->OperationByName('StatefulPartitionedCall'),
            index => 0,
        }),
    outputs_mouse  =>
        AI::TensorFlow::Libtensorflow::Output->New({
            oper => $graph->OperationByName('StatefulPartitionedCall'),
            index => 1,
    }),
);

p %puts;

my $predict_on_batch = sub {
    my ($session, $t) = @_;
    my @outputs_t;

    $session->Run(
        undef,
        [$puts{inputs_args_0}], [$t],
        [$puts{outputs_human}], \@outputs_t,
        undef,
        undef,
        $s
    );
    AssertOK($s);

    return $outputs_t[0];
};

undef;

use PDL;

our $SHOW_ENCODER = 1;

sub one_hot_dna {
    my ($seq) = @_;

    my $from_alphabet = "NACGT";
    my $to_alphabet   = pack "C*", 0..length($from_alphabet)-1;

    # sequences from UCSC genome have both uppercase and lowercase bases
    my $from_alphabet_tr = $from_alphabet . lc $from_alphabet;
    my $to_alphabet_tr   = $to_alphabet x 2;

    my $p = zeros(byte, bytes::length($seq));
    my $p_dataref = $p->get_dataref;
    ${ $p_dataref } = $seq;
    eval "tr/$from_alphabet_tr/$to_alphabet_tr/" for ${ $p_dataref };
    $p->upd_data;

    my $encoder = append(float(0), identity(float(length($from_alphabet)-1)) );
    say "Encoder is\n", $encoder->info, $encoder if $SHOW_ENCODER;

    my $encoded  = $encoder->index( $p->dummy(0) );

    return $encoded;
}

####

{

say "Testing one-hot encoding:\n";

my $onehot_test_seq = "ACGTNtgcan";
my $test_encoded = one_hot_dna( $onehot_test_seq );
$SHOW_ENCODER = 0;

say "One-hot encoding of sequence '$onehot_test_seq' is:";
say $test_encoded->info, $test_encoded;

}

package Interval {
    use Bio::Location::Simple ();

    use parent qw(Bio::Location::Simple);

    sub center {
        my $self = shift;
        my $center = int( ($self->start + $self->end ) / 2 );
        my $delta = ($self->start + $self->end ) % 2;
        return $center + $delta;
    }

    sub resize {
        my ($self, $width) = @_;
        my $new_interval = $self->clone;

        my $center = $self->center;
        my $half   = int( ($width-1) / 2 );
        my $offset = ($width-1) % 2;

        $new_interval->start( $center - $half - $offset );
        $new_interval->end(   $center + $half  );

        return $new_interval;
    }

    use overload '""' => \&_op_stringify;

    sub _op_stringify { sprintf "%s:%s", $_[0]->seq_id // "(no sequence)", $_[0]->to_FTstring }
}

#####

{

say "Testing interval resizing:\n";
sub _debug_resize {
    my ($interval, $to, $msg) = @_;

    my $resized_interval = $interval->resize($to);

    die "Wrong interval size for $interval --($to)--> $resized_interval"
        unless $resized_interval->length == $to;

    say sprintf "Interval: %s -> %s, length %2d : %s",
        $interval,
        $resized_interval, $resized_interval->length,
        $msg;
}

for my $interval_spec ( [4, 8], [5, 8], [5, 9], [6, 9]) {
    my ($start, $end) = @$interval_spec;
    my $test_interval = Interval->new( -seq_id => 'chr11', -start => $start, -end => $end );
    say sprintf "Testing interval %s with length %d", $test_interval, $test_interval->length;
    say "-----";
    for(0..5) {
        my $base = $test_interval->length;
        my $to = $base + $_;
        _debug_resize $test_interval, $to, "$base -> $to (+ $_)";
    }
    say "";
}

}

undef;

use Bio::DB::HTS::Faidx;

my $hg_db = Bio::DB::HTS::Faidx->new( $hg_bgz_path );

sub extract_sequence {
    my ($db, $interval) = @_;

    my $chrom_length = $db->length($interval->seq_id);

    my $trimmed_interval = $interval->clone;
    $trimmed_interval->start( List::Util::max( $interval->start, 1               ) );
    $trimmed_interval->end(   List::Util::min( $interval->end  , $chrom_length   ) );

    # Bio::DB::HTS::Faidx is 0-based for both start and end points
    my $seq = $db->get_sequence2_no_length(
        $trimmed_interval->seq_id,
        $trimmed_interval->start - 1,
        $trimmed_interval->end   - 1,
    );

    my $pad_upstream   = 'N' x List::Util::max( -($interval->start-1), 0 );
    my $pad_downstream = 'N' x List::Util::max( $interval->end - $chrom_length, 0 );

    return join '', $pad_upstream, $seq, $pad_downstream;
}

sub seq_info {
    my ($seq, $n) = @_;
    $n ||= 10;
    if( length $seq > $n ) {
        sprintf "%s...%s (length %d)", uc substr($seq, 0, $n), uc substr($seq, -$n), length $seq;
    } else {
        sprintf "%s (length %d)", uc $seq, length $seq;
    }
}

####

{

say "Testing sequence extraction:";

say "1 base: ",   seq_info
    extract_sequence( $hg_db,
        Interval->new( -seq_id => 'chr11',
            -start => 35_082_742 + 1,
            -end   => 35_082_742 + 1 ) );

say "3 bases: ",  seq_info
    extract_sequence( $hg_db,
        Interval->new( -seq_id => 'chr11',
            -start => 1,
            -end   => 1 )->resize(3) );

say "5 bases: ", seq_info
    extract_sequence( $hg_db,
        Interval->new( -seq_id => 'chr11',
            -start => $hg_db->length('chr11'),
            -end   => $hg_db->length('chr11') )->resize(5) );

say "chr11 is of length ", $hg_db->length('chr11');
say "chr11 bases: ", seq_info
    extract_sequence( $hg_db,
        Interval->new( -seq_id => 'chr11',
            -start => 1,
            -end   => $hg_db->length('chr11') )->resize( $hg_db->length('chr11') ) );
}

my $target_interval = Interval->new( -seq_id => 'chr11',
    -start => 35_082_742 +  1, # BioPerl is 1-based
    -end   => 35_197_430 );

say "Target interval: $target_interval with length @{[ $target_interval->length ]}";

die "Target interval is not $model_central_base_pairs_length bp long"
    unless $target_interval->length == $model_central_base_pairs_length;

say "Target sequence is ", seq_info extract_sequence( $hg_db, $target_interval );


say "";


my $resized_interval = $target_interval->resize( $model_sequence_length );
say "Resized interval: $resized_interval with length @{[ $resized_interval->length ]}";

die "resize() is not working properly!" unless $resized_interval->length == $model_sequence_length;

my $seq = extract_sequence( $hg_db, $resized_interval );

say "Resized sequence is ", seq_info($seq);

lib/AI/TensorFlow/Libtensorflow/Manual/Notebook/InferenceUsingTFHubEnformerGeneExprPredModel.pod view on Meta::CPAN


meaning that you will need a total of B<8 GB> of disk space. You may need at least B<4 GB> of free memory to run the model.

=head1 COLOPHON

The following document is either a POD file which can additionally be run as a Perl script or a Jupyter Notebook which can be run in L<IPerl|https://p3rl.org/Devel::IPerl> (viewable online at L<nbviewer|https://nbviewer.org/github/EntropyOrg/perl-AI-...

You will also need the executables C<gunzip>, C<bgzip>, and C<samtools>. Furthermore,

=over

=item *

C<Bio::DB::HTS> requires C<libhts> and

=item *

C<PDL::Graphics::Gnuplot> requires C<gnuplot>.

=back

If you are running the code, you may optionally install the L<C<tensorflow> Python package|https://www.tensorflow.org/install/pip> in order to access the C<saved_model_cli> command, but this is only used for informational purposes.

=head1 TUTORIAL

=head2 Load the library

First, we need to load the C<AI::TensorFlow::Libtensorflow> library and more helpers. We then create an C<AI::TensorFlow::Libtensorflow::Status> object and helper function to make sure that the calls to the C<libtensorflow> C library are working prop...

  use strict;
  use warnings;
  use utf8;
  use constant IN_IPERL => !! $ENV{PERL_IPERL_RUNNING};
  no if IN_IPERL, warnings => 'redefine'; # fewer messages when re-running cells
  
  use feature qw(say);
  use Syntax::Construct qw( // );
  
  use lib::projectroot qw(lib);
  
  BEGIN {
      if( IN_IPERL ) {
          $ENV{TF_CPP_MIN_LOG_LEVEL} = 3;
      }
      require AI::TensorFlow::Libtensorflow;
  }
  
  use URI ();
  use HTTP::Tiny ();
  use Path::Tiny qw(path);
  
  use File::Which ();
  
  use List::Util ();
  
  use Data::Printer ( output => 'stderr', return_value => 'void', filters => ['PDL'] );
  use Data::Printer::Filter::PDL ();
  use Text::Table::Tiny qw(generate_table);
  
  my $s = AI::TensorFlow::Libtensorflow::Status->New;
  sub AssertOK {
      die "Status $_[0]: " . $_[0]->Message
          unless $_[0]->GetCode == AI::TensorFlow::Libtensorflow::Status::OK;
      return;
  }
  AssertOK($s);

And create helpers for converting between C<PDL> ndarrays and C<TFTensor> ndarrays.

  use PDL;
  use AI::TensorFlow::Libtensorflow::DataType qw(FLOAT);
  
  use FFI::Platypus::Memory qw(memcpy);
  use FFI::Platypus::Buffer qw(scalar_to_pointer);
  
  sub FloatPDLTOTFTensor {
      my ($p) = @_;
      return AI::TensorFlow::Libtensorflow::Tensor->New(
          FLOAT, [ reverse $p->dims ], $p->get_dataref, sub { undef $p }
      );
  }
  
  sub FloatTFTensorToPDL {
      my ($t) = @_;
  
      my $pdl = zeros(float,reverse( map $t->Dim($_), 0..$t->NumDims-1 ) );
  
      memcpy scalar_to_pointer( ${$pdl->get_dataref} ),
          scalar_to_pointer( ${$t->Data} ),
          $t->ByteSize;
      $pdl->upd_data;
  
      $pdl;
  }

=head2 Download model and data

=over

=item *

L<Enformer model|https://tfhub.dev/deepmind/enformer/1> from

  > Avsec Å½, Agarwal V, Visentin D, Ledsam JR, Grabska-Barwinska A, Taylor KR, Assael Y, Jumper J, Kohli P, Kelley DR. Effective gene expression prediction from sequence by integrating long-range interactions. I<Nat Methods>. 2021 Oct;B<18(10)>:1196...

=item *

L<Human target dataset|https://github.com/calico/basenji/tree/master/manuscripts/cross2020> from

  > Kelley DR. Cross-species regulatory sequence activity prediction. I<PLoS Comput Biol>. 2020 Jul 20;B<16(7)>:e1008050. doi: L<10.1371/journal.pcbi.1008050|https://doi.org/10.1371/journal.pcbi.1008050>. PMID: L<32687525|https://pubmed.ncbi.nlm.nih....

=item *

L<UCSC hg38 genome|https://www.ncbi.nlm.nih.gov/assembly/GCA_000001405.15>. More info at L<http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/>; L<Genome Reference Consortium Human Build 38|https://www.ncbi.nlm.nih.gov/assembly/GCF_000001405.26/>...

  > Schneider VA, Graves-Lindsay T, Howe K, Bouk N, Chen HC, Kitts PA, Murphy TD, Pruitt KD, Thibaud-Nissen F, Albracht D, Fulton RS, Kremitzki M, Magrini V, Markovic C, McGrath S, Steinberg KM, Auger K, Chow W, Collins J, Harden G, Hubbard T, Pelan ...

=item *

L<ClinVar|https://www.ncbi.nlm.nih.gov/clinvar/> file

  > Landrum MJ, Lee JM, Benson M, Brown GR, Chao C, Chitipiralla S, Gu B, Hart J, Hoffman D, Jang W, Karapetyan K, Katz K, Liu C, Maddipatla Z, Malheiro A, McDaniel K, Ovetsky M, Riley G, Zhou G, Holmes JB, Kattman BL, Maglott DR. ClinVar: improving ...

=back

  # Model handle
  my $model_uri = URI->new( 'https://tfhub.dev/deepmind/enformer/1' );
  $model_uri->query_form( 'tf-hub-format' => 'compressed' );
  my $model_base = substr( $model_uri->path, 1 ) =~ s,/,_,gr;
  my $model_archive_path = "${model_base}.tar.gz";
  my $model_sequence_length = 393_216; # bp
  
  # Human targets from Basenji2 dataset
  my $targets_uri  = URI->new('https://raw.githubusercontent.com/calico/basenji/master/manuscripts/cross2020/targets_human.txt');
  my $targets_path = 'targets_human.txt';
  
  # Human reference genome
  my $hg_uri    = URI->new("http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz");
  my $hg_gz_path   = "hg38.fa.gz";
  # From http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/md5sum.txt
  my $hg_md5_digest = "1c9dcaddfa41027f17cd8f7a82c7293b";
  
  my $clinvar_uri  = URI->new('https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz');
  my $clinvar_path = 'clinvar.vcf.gz';
  
  my $http = HTTP::Tiny->new;
  
  for my $download ( [ $model_uri   => $model_archive_path ],
                     [ $targets_uri => $targets_path       ],
                     [ $hg_uri      => $hg_gz_path            ],
                     [ $clinvar_uri => $clinvar_path       ],) {
      my ($uri, $path) = @$download;
      say "Downloading $uri to $path";
      next if -e $path;
      $http->mirror( $uri, $path );
  }

B<STREAM (STDOUT)>:

  Downloading https://tfhub.dev/deepmind/enformer/1?tf-hub-format=compressed to deepmind_enformer_1.tar.gz
  Downloading https://raw.githubusercontent.com/calico/basenji/master/manuscripts/cross2020/targets_human.txt to targets_human.txt
  Downloading http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz to hg38.fa.gz
  Downloading https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz to clinvar.vcf.gz

Now we

=over

=item 1.

extract the saved model so that we can load it and

=item 2.

check the MD5 sum of the reference genome to make sure it was downloaded correctly.

=back

  use Archive::Extract;
  $Archive::Extract::DEBUG      = 1;
  $Archive::Extract::PREFER_BIN = 1; # for the larger model, prefer bin
  if( ! -e $model_base ) {
      my $ae = Archive::Extract->new( archive => $model_archive_path );
      die "Could not extract archive" unless $ae->extract( to => $model_base );
  }
  
  use Digest::file qw(digest_file_hex);
  if( digest_file_hex( $hg_gz_path, "MD5" ) eq $hg_md5_digest ) {
      say "MD5 sum for $hg_gz_path OK";
  } else {
      die "Digest for $hg_gz_path failed";
  }

B<STREAM (STDOUT)>:

  MD5 sum for hg38.fa.gz OK

B<RESULT>:

  1

In order to quickly seek for sequences in the reference genome FASTA, we

=over

=item 1.

convert the gzip'd file into a block gzip'd file and

=item 2.

index that C<.bgz> file using C<faidx> from C<samtools>.

=back

  (my $hg_uncompressed_path = $hg_gz_path) =~ s/\.gz$//;
  my $hg_bgz_path = "${hg_uncompressed_path}.bgz";
  
  use IPC::Run;
  
  if( ! -e $hg_bgz_path ) {
      IPC::Run::run(
          [ qw(gunzip -c) ], '<', $hg_gz_path,
          '|',
          [ qw(bgzip -c)  ], '>', $hg_bgz_path
      );
  }
  
  use Bio::Tools::Run::Samtools;
  
  my $hg_bgz_fai_path = "${hg_bgz_path}.fai";
  if( ! -e $hg_bgz_fai_path ) {
      my $faidx_tool = Bio::Tools::Run::Samtools->new( -command => 'faidx' );
      $faidx_tool->run( -fas => $hg_bgz_path )
          or die "Could not index FASTA file $hg_bgz_path: " . $faidx_tool->error_string;
  }

=head2 Model input and output specification

Now we create a helper to call C<saved_model_cli> and called C<saved_model_cli scan> to ensure that the model is I/O-free for security reasons.

  sub saved_model_cli {
      my (@rest) = @_;
      if( File::Which::which('saved_model_cli')) {
          system(qw(saved_model_cli), @rest ) == 0
              or die "Could not run saved_model_cli";
      } else {
          warn "saved_model_cli(): Install the tensorflow Python package to get the `saved_model_cli` command.\n";
          return -1;
      }
  }
  
  say "Checking with saved_model_cli scan:";
  saved_model_cli( qw(scan),
      qw(--dir) => $model_base,
  );

B<STREAM (STDOUT)>:

  Checking with saved_model_cli scan:
  MetaGraph with tag set ['serve'] does not contain the default denylisted ops: {'ReadFile', 'PrintV2', 'WriteFile'}

B<RESULT>:

  1

We need to see what the inputs and outputs of this model are so C<saved_model_cli show> should show us that:

  saved_model_cli( qw(show),
      qw(--dir) => $model_base,
      qw(--all),
  );

B<STREAM (STDOUT)>:

  MetaGraphDef with tag-set: 'serve' contains the following SignatureDefs:
  
  signature_def['__saved_model_init_op']:
    The given SavedModel SignatureDef contains the following input(s):
    The given SavedModel SignatureDef contains the following output(s):
      outputs['__saved_model_init_op'] tensor_info:
          dtype: DT_INVALID
          shape: unknown_rank
          name: NoOp
    Method name is: 
  
  Concrete Functions:
B<RESULT>:

  1

It appears that it does not! What we can do is load the model using C<tensorflow> in Python and then save it to a new path. Now when we run C<saved_model_cli show> on this new model path, it shows the correct inputs and outputs.

  my $new_model_base = "${model_base}_new";
  
  system( qw(python3), qw(-c) => <<EOF, $model_base, $new_model_base ) unless -e $new_model_base;
  import sys
  import tensorflow as tf
  
  in_path, out_path  = sys.argv[1:3]
  
  imported_model = tf.saved_model.load(in_path).model

lib/AI/TensorFlow/Libtensorflow/Manual/Notebook/InferenceUsingTFHubEnformerGeneExprPredModel.pod view on Meta::CPAN

    Method name is: tensorflow/serving/predict
  
  Concrete Functions:
    Function Name: 'predict_on_batch'
      Option #1
        Callable with:
          Argument #1
            args_0: TensorSpec(shape=(None, 393216, 4), dtype=tf.float32, name='args_0')

B<RESULT>:

  1

We want to use the C<serve> tag-set and

=over

=item *

the input C<args_0> which has the name C<serving_default_args_0:0> and

=item *

the output C<human> which has the name C<StatefulPartitionedCall:0>.

=back

all of which are C<DT_FLOAT>.

Make note of the shapes that those take. Per the L<model description|https://tfhub.dev/deepmind/enformer/1> at TensorFlow Hub:

=over 2

The input sequence length is 393,216 with the prediction corresponding to 128 base pair windows for the center 114,688 base pairs. The input sequence is one hot encoded using the order of indices corresponding to 'ACGT' with N values being all zeros.

=back

The input shape C<(-1, 393216, 4)> thus represents dimensions C<[batch size] x [sequence length] x [one-hot encoding of ACGT]>.

The output shape C<(-1, 896, 5313)> represents dimensions C<[batch size] x [ predictions along 114,688 base pairs / 128 base pair windows ] x [ human target by index ]>. We can confirm this by doing some calculations:

  my $model_central_base_pairs_length     = 114_688; # bp
  my $model_central_base_pair_window_size = 128;     # bp / prediction
  
  say "Number of predictions: ", $model_central_base_pairs_length / $model_central_base_pair_window_size;

B<STREAM (STDOUT)>:

  Number of predictions: 896

B<RESULT>:

  1

and by looking at the targets file:

  use Data::Frame;
  
  my $df = Data::Frame->from_csv( $targets_path, sep => "\t" )
      ->transform({
          file => sub {
              my ($col, $df) = @_;
              # clean up the paths in 'file' column
              [map { join "/", (split('/', $_))[7..8] } $col->list];
          }
      });
  
  say "Number of targets: ", $df->nrow;
  
  say "";
  
  say "First 5:";
  say $df->head(5);

B<STREAM (STDOUT)>:

  Number of targets: 5313
  
  First 5:
  ------------------------------------------------------------------------------------------------------------------------------------------------
      index  genome  identifier   file                clip  scale  sum_stat  description                                                          
  ------------------------------------------------------------------------------------------------------------------------------------------------
   0  0      0       ENCFF833POA  encode/ENCSR000EIJ  32    2      mean      DNASE:cerebellum male adult (27 years) and male adult (35 years)     
   1  1      0       ENCFF110QGM  encode/ENCSR000EIK  32    2      mean      DNASE:frontal cortex male adult (27 years) and male adult (35 years) 
   2  2      0       ENCFF880MKD  encode/ENCSR000EIL  32    2      mean      DNASE:chorion                                                        
   3  3      0       ENCFF463ZLQ  encode/ENCSR000EIP  32    2      mean      DNASE:Ishikawa treated with 0.02% dimethyl sulfoxide for 1 hour      
   4  4      0       ENCFF890OGQ  encode/ENCSR000EIS  32    2      mean      DNASE:GM03348                                                        
  ------------------------------------------------------------------------------------------------------------------------------------------------

B<RESULT>:

  1

=head2 Load the model

Let's now load the model in Perl and get the inputs and outputs into a data structure by name.

  my $opt = AI::TensorFlow::Libtensorflow::SessionOptions->New;
  
  my @tags = ( 'serve' );
  my $graph = AI::TensorFlow::Libtensorflow::Graph->New;
  my $session = AI::TensorFlow::Libtensorflow::Session->LoadFromSavedModel(
      $opt, undef, $new_model_base, \@tags, $graph, undef, $s
  );
  AssertOK($s);
  
  my %puts = (
      ## Inputs
      inputs_args_0 =>
          AI::TensorFlow::Libtensorflow::Output->New({
              oper => $graph->OperationByName('serving_default_args_0'),
              index => 0,
          }),
  
      ## Outputs
      outputs_human  =>
          AI::TensorFlow::Libtensorflow::Output->New({
              oper => $graph->OperationByName('StatefulPartitionedCall'),
              index => 0,
          }),
      outputs_mouse  =>
          AI::TensorFlow::Libtensorflow::Output->New({
              oper => $graph->OperationByName('StatefulPartitionedCall'),
              index => 1,
      }),
  );
  
  p %puts;

B<STREAM (STDERR)>:

=for html <span style="display:inline-block;margin-left:1em;"><pre style="display: block"><code><span style="color: #33ccff;">{</span><span style="">
    </span><span style="color: #6666cc;">inputs_args_0</span><span style="color: #33ccff;">   </span><span style="color: #cc66cc;">AI::TensorFlow::Libtensorflow::Output</span><span style=""> </span><span style="color: #33ccff;">{</span><span style=""...
        </span><span style="color: #6666cc;">index</span><span style="color: #33ccff;">   </span><span style="color: #ff6633;">0</span><span style="color: #33ccff;">,</span><span style="">
        </span><span style="color: #6666cc;">oper</span><span style=""> </span><span style="color: #33ccff;">   </span><span style="color: #cc66cc;">AI::TensorFlow::Libtensorflow::Operation</span><span style=""> </span><span style="color: #33ccff;">{...
            </span><span style="color: #6666cc;">Name</span><span style="">      </span><span style="color: #33ccff;">   </span><span style="color: #33ccff;">&quot;</span><span style="color: #669933;">serving_default_args_0</span><span style="color: ...
            </span><span style="color: #6666cc;">NumInputs</span><span style=""> </span><span style="color: #33ccff;">   </span><span style="color: #ff6633;">0</span><span style="color: #33ccff;">,</span><span style="">
            </span><span style="color: #6666cc;">NumOutputs</span><span style="color: #33ccff;">   </span><span style="color: #ff6633;">1</span><span style="color: #33ccff;">,</span><span style="">
            </span><span style="color: #6666cc;">OpType</span><span style="">    </span><span style="color: #33ccff;">   </span><span style="color: #33ccff;">&quot;</span><span style="color: #669933;">Placeholder</span><span style="color: #33ccff;">&...
        </span><span style="color: #33ccff;">}</span><span style="">
    </span><span style="color: #33ccff;">}</span><span style="color: #33ccff;">,</span><span style="">
    </span><span style="color: #6666cc;">outputs_human</span><span style="color: #33ccff;">   </span><span style="color: #cc66cc;">AI::TensorFlow::Libtensorflow::Output</span><span style=""> </span><span style="color: #33ccff;">{</span><span style=""...
        </span><span style="color: #6666cc;">index</span><span style="color: #33ccff;">   </span><span style="color: #ff6633;">0</span><span style="color: #33ccff;">,</span><span style="">
        </span><span style="color: #6666cc;">oper</span><span style=""> </span><span style="color: #33ccff;">   </span><span style="color: #cc66cc;">AI::TensorFlow::Libtensorflow::Operation</span><span style=""> </span><span style="color: #33ccff;">{...
            </span><span style="color: #6666cc;">Name</span><span style="">      </span><span style="color: #33ccff;">   </span><span style="color: #33ccff;">&quot;</span><span style="color: #669933;">StatefulPartitionedCall</span><span style="color:...
            </span><span style="color: #6666cc;">NumInputs</span><span style=""> </span><span style="color: #33ccff;">   </span><span style="color: #ff6633;">274</span><span style="color: #33ccff;">,</span><span style="">
            </span><span style="color: #6666cc;">NumOutputs</span><span style="color: #33ccff;">   </span><span style="color: #ff6633;">2</span><span style="color: #33ccff;">,</span><span style="">
            </span><span style="color: #6666cc;">OpType</span><span style="">    </span><span style="color: #33ccff;">   </span><span style="color: #33ccff;">&quot;</span><span style="color: #669933;">StatefulPartitionedCall</span><span style="color:...
        </span><span style="color: #33ccff;">}</span><span style="">
    </span><span style="color: #33ccff;">}</span><span style="color: #33ccff;">,</span><span style="">
    </span><span style="color: #6666cc;">outputs_mouse</span><span style="color: #33ccff;">   </span><span style="color: #cc66cc;">AI::TensorFlow::Libtensorflow::Output</span><span style=""> </span><span style="color: #33ccff;">{</span><span style=""...
        </span><span style="color: #6666cc;">index</span><span style="color: #33ccff;">   </span><span style="color: #ff6633;">1</span><span style="color: #33ccff;">,</span><span style="">
        </span><span style="color: #6666cc;">oper</span><span style=""> </span><span style="color: #33ccff;">   </span><span style="color: #cc66cc;">AI::TensorFlow::Libtensorflow::Operation</span><span style=""> </span><span style="color: #33ccff;">{...
            </span><span style="color: #6666cc;">Name</span><span style="">      </span><span style="color: #33ccff;">   </span><span style="color: #33ccff;">&quot;</span><span style="color: #669933;">StatefulPartitionedCall</span><span style="color:...
            </span><span style="color: #6666cc;">NumInputs</span><span style=""> </span><span style="color: #33ccff;">   </span><span style="color: #ff6633;">274</span><span style="color: #33ccff;">,</span><span style="">
            </span><span style="color: #6666cc;">NumOutputs</span><span style="color: #33ccff;">   </span><span style="color: #ff6633;">2</span><span style="color: #33ccff;">,</span><span style="">
            </span><span style="color: #6666cc;">OpType</span><span style="">    </span><span style="color: #33ccff;">   </span><span style="color: #33ccff;">&quot;</span><span style="color: #669933;">StatefulPartitionedCall</span><span style="color:...
        </span><span style="color: #33ccff;">}</span><span style="">
    </span><span style="color: #33ccff;">}</span><span style="">
</span><span style="color: #33ccff;">}</span><span style="">
</span></code></pre></span>

We need a helper to simplify running the session and getting just the predictions that we want.

  my $predict_on_batch = sub {
      my ($session, $t) = @_;
      my @outputs_t;
  
      $session->Run(
          undef,
          [$puts{inputs_args_0}], [$t],
          [$puts{outputs_human}], \@outputs_t,
          undef,
          undef,
          $s
      );
      AssertOK($s);
  
      return $outputs_t[0];
  };
  
  undef;

=head2 Encoding the data

The model specifies that the way to get a sequence of DNA bases into a C<TFTensor> is to use L<one-hot encoding|https://en.wikipedia.org/wiki/One-hot#Machine_learning_and_statistics> in the order C<ACGT>.

This means that the bases are represented as vectors of length 4:

| base | vector encoding |
|------|-----------------|
| A    | C<[1 0 0 0]>     |
| C    | C<[0 1 0 0]>     |
| G    | C<[0 0 1 0]>     |
| T    | C<[0 0 0 1]>     |
| N    | C<[0 0 0 0]>     |

We can achieve this encoding by creating a lookup table with a PDL ndarray. This could be done by creating a byte PDL ndarray of dimensions C<[ 256 4 ]> to directly look up the the numeric value of characters 0-255, but here we'll go with a smaller C...

  use PDL;
  
  our $SHOW_ENCODER = 1;
  
  sub one_hot_dna {
      my ($seq) = @_;
  
      my $from_alphabet = "NACGT";
      my $to_alphabet   = pack "C*", 0..length($from_alphabet)-1;
  
      # sequences from UCSC genome have both uppercase and lowercase bases
      my $from_alphabet_tr = $from_alphabet . lc $from_alphabet;
      my $to_alphabet_tr   = $to_alphabet x 2;
  
      my $p = zeros(byte, bytes::length($seq));
      my $p_dataref = $p->get_dataref;
      ${ $p_dataref } = $seq;
      eval "tr/$from_alphabet_tr/$to_alphabet_tr/" for ${ $p_dataref };
      $p->upd_data;
  
      my $encoder = append(float(0), identity(float(length($from_alphabet)-1)) );
      say "Encoder is\n", $encoder->info, $encoder if $SHOW_ENCODER;
  
      my $encoded  = $encoder->index( $p->dummy(0) );
  
      return $encoded;
  }
  
  ####
  
  {
  
  say "Testing one-hot encoding:\n";
  
  my $onehot_test_seq = "ACGTNtgcan";
  my $test_encoded = one_hot_dna( $onehot_test_seq );
  $SHOW_ENCODER = 0;
  
  say "One-hot encoding of sequence '$onehot_test_seq' is:";
  say $test_encoded->info, $test_encoded;
  
  }

B<STREAM (STDOUT)>:

  Testing one-hot encoding:
  
  Encoder is
  PDL: Float D [5,4]
  [
   [0 1 0 0 0]
   [0 0 1 0 0]
   [0 0 0 1 0]
   [0 0 0 0 1]
  ]
  
  One-hot encoding of sequence 'ACGTNtgcan' is:
  PDL: Float D [4,10]
  [
   [1 0 0 0]
   [0 1 0 0]
   [0 0 1 0]
   [0 0 0 1]
   [0 0 0 0]
   [0 0 0 1]
   [0 0 1 0]
   [0 1 0 0]
   [1 0 0 0]
   [0 0 0 0]
  ]

B<RESULT>:

  1

Note that in the above, the PDL ndarray's

=over

=item *

first dimension is 4 which matches the last dimension of the input C<TFTensor>;

=item *

second dimension is the sequence length which matches the penultimate dimension of the input C<TFTensor>.

=back

Now we need a way to deal with the sequence interval. We're going to use 1-based coordinates as BioPerl does. In fact, we'll extend a BioPerl class.

  package Interval {
      use Bio::Location::Simple ();
  
      use parent qw(Bio::Location::Simple);
  
      sub center {
          my $self = shift;
          my $center = int( ($self->start + $self->end ) / 2 );
          my $delta = ($self->start + $self->end ) % 2;
          return $center + $delta;
      }
  
      sub resize {
          my ($self, $width) = @_;
          my $new_interval = $self->clone;
  
          my $center = $self->center;
          my $half   = int( ($width-1) / 2 );
          my $offset = ($width-1) % 2;
  
          $new_interval->start( $center - $half - $offset );
          $new_interval->end(   $center + $half  );
  
          return $new_interval;
      }
  
      use overload '""' => \&_op_stringify;
  
      sub _op_stringify { sprintf "%s:%s", $_[0]->seq_id // "(no sequence)", $_[0]->to_FTstring }
  }
  
  #####
  
  {
  
  say "Testing interval resizing:\n";
  sub _debug_resize {
      my ($interval, $to, $msg) = @_;
  
      my $resized_interval = $interval->resize($to);
  
      die "Wrong interval size for $interval --($to)--> $resized_interval"
          unless $resized_interval->length == $to;
  
      say sprintf "Interval: %s -> %s, length %2d : %s",
          $interval,
          $resized_interval, $resized_interval->length,
          $msg;
  }
  
  for my $interval_spec ( [4, 8], [5, 8], [5, 9], [6, 9]) {
      my ($start, $end) = @$interval_spec;
      my $test_interval = Interval->new( -seq_id => 'chr11', -start => $start, -end => $end );
      say sprintf "Testing interval %s with length %d", $test_interval, $test_interval->length;
      say "-----";
      for(0..5) {
          my $base = $test_interval->length;
          my $to = $base + $_;
          _debug_resize $test_interval, $to, "$base -> $to (+ $_)";
      }
      say "";
  }
  
  }
  
  undef;

B<STREAM (STDOUT)>:

  Testing interval resizing:
  
  Testing interval chr11:4..8 with length 5
  -----
  Interval: chr11:4..8 -> chr11:4..8, length  5 : 5 -> 5 (+ 0)
  Interval: chr11:4..8 -> chr11:3..8, length  6 : 5 -> 6 (+ 1)
  Interval: chr11:4..8 -> chr11:3..9, length  7 : 5 -> 7 (+ 2)
  Interval: chr11:4..8 -> chr11:2..9, length  8 : 5 -> 8 (+ 3)
  Interval: chr11:4..8 -> chr11:2..10, length  9 : 5 -> 9 (+ 4)
  Interval: chr11:4..8 -> chr11:1..10, length 10 : 5 -> 10 (+ 5)
  
  Testing interval chr11:5..8 with length 4
  -----
  Interval: chr11:5..8 -> chr11:5..8, length  4 : 4 -> 4 (+ 0)
  Interval: chr11:5..8 -> chr11:5..9, length  5 : 4 -> 5 (+ 1)
  Interval: chr11:5..8 -> chr11:4..9, length  6 : 4 -> 6 (+ 2)
  Interval: chr11:5..8 -> chr11:4..10, length  7 : 4 -> 7 (+ 3)
  Interval: chr11:5..8 -> chr11:3..10, length  8 : 4 -> 8 (+ 4)
  Interval: chr11:5..8 -> chr11:3..11, length  9 : 4 -> 9 (+ 5)
  
  Testing interval chr11:5..9 with length 5
  -----
  Interval: chr11:5..9 -> chr11:5..9, length  5 : 5 -> 5 (+ 0)
  Interval: chr11:5..9 -> chr11:4..9, length  6 : 5 -> 6 (+ 1)
  Interval: chr11:5..9 -> chr11:4..10, length  7 : 5 -> 7 (+ 2)
  Interval: chr11:5..9 -> chr11:3..10, length  8 : 5 -> 8 (+ 3)
  Interval: chr11:5..9 -> chr11:3..11, length  9 : 5 -> 9 (+ 4)
  Interval: chr11:5..9 -> chr11:2..11, length 10 : 5 -> 10 (+ 5)
  
  Testing interval chr11:6..9 with length 4
  -----
  Interval: chr11:6..9 -> chr11:6..9, length  4 : 4 -> 4 (+ 0)
  Interval: chr11:6..9 -> chr11:6..10, length  5 : 4 -> 5 (+ 1)
  Interval: chr11:6..9 -> chr11:5..10, length  6 : 4 -> 6 (+ 2)
  Interval: chr11:6..9 -> chr11:5..11, length  7 : 4 -> 7 (+ 3)
  Interval: chr11:6..9 -> chr11:4..11, length  8 : 4 -> 8 (+ 4)
  Interval: chr11:6..9 -> chr11:4..12, length  9 : 4 -> 9 (+ 5)
  


  use Bio::DB::HTS::Faidx;
  
  my $hg_db = Bio::DB::HTS::Faidx->new( $hg_bgz_path );
  
  sub extract_sequence {
      my ($db, $interval) = @_;
  
      my $chrom_length = $db->length($interval->seq_id);
  
      my $trimmed_interval = $interval->clone;
      $trimmed_interval->start( List::Util::max( $interval->start, 1               ) );
      $trimmed_interval->end(   List::Util::min( $interval->end  , $chrom_length   ) );
  
      # Bio::DB::HTS::Faidx is 0-based for both start and end points
      my $seq = $db->get_sequence2_no_length(
          $trimmed_interval->seq_id,
          $trimmed_interval->start - 1,
          $trimmed_interval->end   - 1,
      );
  
      my $pad_upstream   = 'N' x List::Util::max( -($interval->start-1), 0 );
      my $pad_downstream = 'N' x List::Util::max( $interval->end - $chrom_length, 0 );
  
      return join '', $pad_upstream, $seq, $pad_downstream;
  }
  
  sub seq_info {
      my ($seq, $n) = @_;
      $n ||= 10;
      if( length $seq > $n ) {
          sprintf "%s...%s (length %d)", uc substr($seq, 0, $n), uc substr($seq, -$n), length $seq;
      } else {
          sprintf "%s (length %d)", uc $seq, length $seq;
      }
  }
  
  ####
  
  {
  
  say "Testing sequence extraction:";
  
  say "1 base: ",   seq_info
      extract_sequence( $hg_db,
          Interval->new( -seq_id => 'chr11',
              -start => 35_082_742 + 1,
              -end   => 35_082_742 + 1 ) );
  
  say "3 bases: ",  seq_info
      extract_sequence( $hg_db,
          Interval->new( -seq_id => 'chr11',
              -start => 1,
              -end   => 1 )->resize(3) );
  
  say "5 bases: ", seq_info
      extract_sequence( $hg_db,
          Interval->new( -seq_id => 'chr11',
              -start => $hg_db->length('chr11'),
              -end   => $hg_db->length('chr11') )->resize(5) );
  
  say "chr11 is of length ", $hg_db->length('chr11');
  say "chr11 bases: ", seq_info
      extract_sequence( $hg_db,
          Interval->new( -seq_id => 'chr11',
              -start => 1,
              -end   => $hg_db->length('chr11') )->resize( $hg_db->length('chr11') ) );
  }

B<STREAM (STDOUT)>:

  Testing sequence extraction:
  1 base: G (length 1)
  3 bases: NNN (length 3)
  5 bases: NNNNN (length 5)
  chr11 is of length 135086622
  chr11 bases: NNNNNNNNNN...NNNNNNNNNN (length 135086622)

B<RESULT>:

  1

Now we can use the same target interval that is used in the example notebook which recreates part of L<figure 1|https://www.nature.com/articles/s41592-021-01252-x/figures/1> from the Enformer paper.

  my $target_interval = Interval->new( -seq_id => 'chr11',
      -start => 35_082_742 +  1, # BioPerl is 1-based
      -end   => 35_197_430 );
  
  say "Target interval: $target_interval with length @{[ $target_interval->length ]}";
  
  die "Target interval is not $model_central_base_pairs_length bp long"
      unless $target_interval->length == $model_central_base_pairs_length;

( run in 0.715 second using v1.01-cache-2.11-cpan-9581c071862 )