AI-TensorFlow-Libtensorflow


lib/AI/TensorFlow/Libtensorflow/Manual/Notebook/InferenceUsingTFHubEnformerGeneExprPredModel.pod

# PODNAME: AI::TensorFlow::Libtensorflow::Manual::Notebook::InferenceUsingTFHubEnformerGeneExprPredModel


## DO NOT EDIT. Generated from notebook/InferenceUsingTFHubEnformerGeneExprPredModel.ipynb using ./maint/process-notebook.pl.

use strict;
use warnings;
use utf8;
use constant IN_IPERL => !! $ENV{PERL_IPERL_RUNNING};
no if IN_IPERL, warnings => 'redefine'; # fewer messages when re-running cells

use feature qw(say);
use Syntax::Construct qw( // );

use lib::projectroot qw(lib);

BEGIN {
    if( IN_IPERL ) {
        $ENV{TF_CPP_MIN_LOG_LEVEL} = 3;
    }
    require AI::TensorFlow::Libtensorflow;
}

use URI ();
use HTTP::Tiny ();
use Path::Tiny qw(path);

use File::Which ();

use List::Util ();

use Data::Printer ( output => 'stderr', return_value => 'void', filters => ['PDL'] );
use Data::Printer::Filter::PDL ();
use Text::Table::Tiny qw(generate_table);

my $s = AI::TensorFlow::Libtensorflow::Status->New;
sub AssertOK {
    die "Status $_[0]: " . $_[0]->Message
        unless $_[0]->GetCode == AI::TensorFlow::Libtensorflow::Status::OK;
    return;
}
AssertOK($s);

use PDL;
use AI::TensorFlow::Libtensorflow::DataType qw(FLOAT);

use FFI::Platypus::Memory qw(memcpy);
use FFI::Platypus::Buffer qw(scalar_to_pointer);

sub FloatPDLTOTFTensor {
    my ($p) = @_;
    return AI::TensorFlow::Libtensorflow::Tensor->New(
        FLOAT, [ reverse $p->dims ], $p->get_dataref, sub { undef $p }
    );
}

sub FloatTFTensorToPDL {
    my ($t) = @_;

    my $pdl = zeros(float,reverse( map $t->Dim($_), 0..$t->NumDims-1 ) );

    memcpy scalar_to_pointer( ${$pdl->get_dataref} ),
        scalar_to_pointer( ${$t->Data} ),
        $t->ByteSize;
    $pdl->upd_data;

    $pdl;
}

# Model handle
my $model_uri = URI->new( 'https://tfhub.dev/deepmind/enformer/1' );
$model_uri->query_form( 'tf-hub-format' => 'compressed' );
my $model_base = substr( $model_uri->path, 1 ) =~ s,/,_,gr;
my $model_archive_path = "${model_base}.tar.gz";
my $model_sequence_length = 393_216; # bp

# Human targets from Basenji2 dataset
my $targets_uri  = URI->new('https://raw.githubusercontent.com/calico/basenji/master/manuscripts/cross2020/targets_human.txt');
my $targets_path = 'targets_human.txt';

# Human reference genome
my $hg_uri    = URI->new("http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz");
my $hg_gz_path   = "hg38.fa.gz";
# From http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/md5sum.txt
my $hg_md5_digest = "1c9dcaddfa41027f17cd8f7a82c7293b";

my $clinvar_uri  = URI->new('https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz');
my $clinvar_path = 'clinvar.vcf.gz';

my $http = HTTP::Tiny->new;

for my $download ( [ $model_uri   => $model_archive_path ],
                   [ $targets_uri => $targets_path       ],
                   [ $hg_uri      => $hg_gz_path         ],
                   [ $clinvar_uri => $clinvar_path       ],) {
    my ($uri, $path) = @$download;
    say "Downloading $uri to $path";
    next if -e $path;
    $http->mirror( $uri, $path );
}

use Archive::Extract;
$Archive::Extract::DEBUG      = 1;
$Archive::Extract::PREFER_BIN = 1; # for the larger model, prefer bin
if( ! -e $model_base ) {
    my $ae = Archive::Extract->new( archive => $model_archive_path );
    die "Could not extract archive" unless $ae->extract( to => $model_base );
}

use Digest::file qw(digest_file_hex);
if( digest_file_hex( $hg_gz_path, "MD5" ) eq $hg_md5_digest ) {
    say "MD5 sum for $hg_gz_path OK";
} else {
    die "Digest for $hg_gz_path failed";
}

(my $hg_uncompressed_path = $hg_gz_path) =~ s/\.gz$//;
my $hg_bgz_path = "${hg_uncompressed_path}.bgz";

use IPC::Run;

if( ! -e $hg_bgz_path ) {
    IPC::Run::run(
        [ qw(gunzip -c) ], '<', $hg_gz_path,
        '|',
        [ qw(bgzip -c)  ], '>', $hg_bgz_path
    ) or die "Could not convert $hg_gz_path to $hg_bgz_path";
}

use Bio::Tools::Run::Samtools;

my $hg_bgz_fai_path = "${hg_bgz_path}.fai";
if( ! -e $hg_bgz_fai_path ) {
    my $faidx_tool = Bio::Tools::Run::Samtools->new( -command => 'faidx' );
    $faidx_tool->run( -fas => $hg_bgz_path )
        or die "Could not index FASTA file $hg_bgz_path: " . $faidx_tool->error_string;
}

sub saved_model_cli {
    my (@rest) = @_;
    if( File::Which::which('saved_model_cli')) {
        system(qw(saved_model_cli), @rest ) == 0
            or die "Could not run saved_model_cli";
    } else {
        warn "saved_model_cli(): Install the tensorflow Python package to get the `saved_model_cli` command.\n";
        return -1;
    }
}

say "Checking with saved_model_cli scan:";
saved_model_cli( qw(scan),
    qw(--dir) => $model_base,
);

saved_model_cli( qw(show),
    qw(--dir) => $model_base,
    qw(--all),
);

my $new_model_base = "${model_base}_new";

system( qw(python3), qw(-c) => <<EOF, $model_base, $new_model_base ) unless -e $new_model_base;
import sys
import tensorflow as tf

in_path, out_path  = sys.argv[1:3]

imported_model = tf.saved_model.load(in_path).model
tf.saved_model.save( imported_model , out_path )
EOF

saved_model_cli( qw(show),
    qw(--dir) => $new_model_base,
    qw(--all),
);

my $model_central_base_pairs_length     = 114_688; # bp
my $model_central_base_pair_window_size = 128;     # bp / prediction

say "Number of predictions: ", $model_central_base_pairs_length / $model_central_base_pair_window_size;

use Data::Frame;

my $df = Data::Frame->from_csv( $targets_path, sep => "\t" )
    ->transform({
        file => sub {
            my ($col, $df) = @_;
            # clean up the paths in 'file' column
            [map { join "/", (split('/', $_))[7..8] } $col->list];
        }
    });

say "Number of targets: ", $df->nrow;

say "";


=item *

the model and data take 5 GB or more of disk space.

=back

meaning that you will need a total of B<8 GB> of disk space. You may need at least B<4 GB> of free memory to run the model.

=head1 COLOPHON

The following document is either a POD file which can additionally be run as a Perl script or a Jupyter Notebook which can be run in L<IPerl|https://p3rl.org/Devel::IPerl> (viewable online at L<nbviewer|https://nbviewer.org/github/EntropyOrg/perl-AI-...

You will also need the executables C<gunzip>, C<bgzip>, and C<samtools>. Furthermore,

=over

=item *

C<Bio::DB::HTS> requires C<libhts> and

=item *

C<PDL::Graphics::Gnuplot> requires C<gnuplot>.

=back

If you are running the code, you may optionally install the L<C<tensorflow> Python package|https://www.tensorflow.org/install/pip> in order to access the C<saved_model_cli> command, but this is only used for informational purposes.

=head1 TUTORIAL

=head2 Load the library

First, we need to load the C<AI::TensorFlow::Libtensorflow> library along with some helper modules. We then create an C<AI::TensorFlow::Libtensorflow::Status> object and a helper function to make sure that the calls to the C<libtensorflow> C library are working properly.

  use strict;
  use warnings;
  use utf8;
  use constant IN_IPERL => !! $ENV{PERL_IPERL_RUNNING};
  no if IN_IPERL, warnings => 'redefine'; # fewer messages when re-running cells
  
  use feature qw(say);
  use Syntax::Construct qw( // );
  
  use lib::projectroot qw(lib);
  
  BEGIN {
      if( IN_IPERL ) {
          $ENV{TF_CPP_MIN_LOG_LEVEL} = 3;
      }
      require AI::TensorFlow::Libtensorflow;
  }
  
  use URI ();
  use HTTP::Tiny ();
  use Path::Tiny qw(path);
  
  use File::Which ();
  
  use List::Util ();
  
  use Data::Printer ( output => 'stderr', return_value => 'void', filters => ['PDL'] );
  use Data::Printer::Filter::PDL ();
  use Text::Table::Tiny qw(generate_table);
  
  my $s = AI::TensorFlow::Libtensorflow::Status->New;
  sub AssertOK {
      die "Status $_[0]: " . $_[0]->Message
          unless $_[0]->GetCode == AI::TensorFlow::Libtensorflow::Status::OK;
      return;
  }
  AssertOK($s);

And create helpers for converting between C<PDL> ndarrays and C<TFTensor> ndarrays.

  use PDL;
  use AI::TensorFlow::Libtensorflow::DataType qw(FLOAT);
  
  use FFI::Platypus::Memory qw(memcpy);
  use FFI::Platypus::Buffer qw(scalar_to_pointer);
  
  sub FloatPDLTOTFTensor {
      my ($p) = @_;
      return AI::TensorFlow::Libtensorflow::Tensor->New(
          FLOAT, [ reverse $p->dims ], $p->get_dataref, sub { undef $p }
      );
  }
  
  sub FloatTFTensorToPDL {
      my ($t) = @_;
  
      my $pdl = zeros(float,reverse( map $t->Dim($_), 0..$t->NumDims-1 ) );
  
      memcpy scalar_to_pointer( ${$pdl->get_dataref} ),
          scalar_to_pointer( ${$t->Data} ),
          $t->ByteSize;
      $pdl->upd_data;
  
      $pdl;
  }
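
To see how these helpers fit together, here is a small round-trip check (an illustrative sketch, not part of the original notebook):

  my $p  = sequence(float, 2, 3);    # PDL ndarray with dims [2,3]
  my $t  = FloatPDLTOTFTensor($p);   # TFTensor with dims [3,2]; the dims are
                                     # reversed because TFTensors are row-major
                                     # while PDL dims are column-major
  my $p2 = FloatTFTensorToPDL($t);   # back to a PDL ndarray with dims [2,3]
  # $p2 now contains the same six float values as $p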

=head2 Download model and data

=over

=item *

L<Enformer model|https://tfhub.dev/deepmind/enformer/1> from

  > Avsec Ž, Agarwal V, Visentin D, Ledsam JR, Grabska-Barwinska A, Taylor KR, Assael Y, Jumper J, Kohli P, Kelley DR. Effective gene expression prediction from sequence by integrating long-range interactions. I<Nat Methods>. 2021 Oct;B<18(10)>:1196...

=item *

L<Human target dataset|https://github.com/calico/basenji/tree/master/manuscripts/cross2020> from

  > Kelley DR. Cross-species regulatory sequence activity prediction. I<PLoS Comput Biol>. 2020 Jul 20;B<16(7)>:e1008050. doi: L<10.1371/journal.pcbi.1008050|https://doi.org/10.1371/journal.pcbi.1008050>. PMID: L<32687525|https://pubmed.ncbi.nlm.nih....

=item *

L<UCSC hg38 genome|https://www.ncbi.nlm.nih.gov/assembly/GCA_000001405.15>. More info at L<http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/>; L<Genome Reference Consortium Human Build 38|https://www.ncbi.nlm.nih.gov/assembly/GCF_000001405.26/>...

  > Schneider VA, Graves-Lindsay T, Howe K, Bouk N, Chen HC, Kitts PA, Murphy TD, Pruitt KD, Thibaud-Nissen F, Albracht D, Fulton RS, Kremitzki M, Magrini V, Markovic C, McGrath S, Steinberg KM, Auger K, Chow W, Collins J, Harden G, Hubbard T, Pelan ...

lib/AI/TensorFlow/Libtensorflow/Manual/Notebook/InferenceUsingTFHubEnformerGeneExprPredModel.pod  view on Meta::CPAN

We check the MD5 sum of the reference genome to make sure it was downloaded correctly.

=back
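
The download locations are set up and each file is mirrored locally, skipping any that already exist (reproduced from the script portion of this document):

  # Model handle
  my $model_uri = URI->new( 'https://tfhub.dev/deepmind/enformer/1' );
  $model_uri->query_form( 'tf-hub-format' => 'compressed' );
  my $model_base = substr( $model_uri->path, 1 ) =~ s,/,_,gr;
  my $model_archive_path = "${model_base}.tar.gz";
  my $model_sequence_length = 393_216; # bp

  # Human targets from Basenji2 dataset
  my $targets_uri  = URI->new('https://raw.githubusercontent.com/calico/basenji/master/manuscripts/cross2020/targets_human.txt');
  my $targets_path = 'targets_human.txt';

  # Human reference genome and its published MD5 checksum
  my $hg_uri        = URI->new("http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz");
  my $hg_gz_path    = "hg38.fa.gz";
  # From http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/md5sum.txt
  my $hg_md5_digest = "1c9dcaddfa41027f17cd8f7a82c7293b";

  # ClinVar variants (used by the commented-out variant code below)
  my $clinvar_uri  = URI->new('https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz');
  my $clinvar_path = 'clinvar.vcf.gz';

  my $http = HTTP::Tiny->new;
  for my $download ( [ $model_uri   => $model_archive_path ],
                     [ $targets_uri => $targets_path       ],
                     [ $hg_uri      => $hg_gz_path         ],
                     [ $clinvar_uri => $clinvar_path       ],) {
      my ($uri, $path) = @$download;
      next if -e $path;
      say "Downloading $uri to $path";
      $http->mirror( $uri, $path );
  }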

  use Archive::Extract;
  $Archive::Extract::DEBUG      = 1;
  $Archive::Extract::PREFER_BIN = 1; # for the larger model, prefer bin
  if( ! -e $model_base ) {
      my $ae = Archive::Extract->new( archive => $model_archive_path );
      die "Could not extract archive" unless $ae->extract( to => $model_base );
  }
  
  use Digest::file qw(digest_file_hex);
  if( digest_file_hex( $hg_gz_path, "MD5" ) eq $hg_md5_digest ) {
      say "MD5 sum for $hg_gz_path OK";
  } else {
      die "Digest for $hg_gz_path failed";
  }

B<STREAM (STDOUT)>:

  MD5 sum for hg38.fa.gz OK

B<RESULT>:

  1

In order to quickly look up sequences in the reference genome FASTA file, we

=over

=item 1.

convert the gzip'd file into a block gzip'd file and

=item 2.

index that C<.bgz> file using C<faidx> from C<samtools>.

=back

  (my $hg_uncompressed_path = $hg_gz_path) =~ s/\.gz$//;
  my $hg_bgz_path = "${hg_uncompressed_path}.bgz";
  
  use IPC::Run;
  
  if( ! -e $hg_bgz_path ) {
      IPC::Run::run(
          [ qw(gunzip -c) ], '<', $hg_gz_path,
          '|',
          [ qw(bgzip -c)  ], '>', $hg_bgz_path
      ) or die "Could not convert $hg_gz_path to $hg_bgz_path";
  }
  
  use Bio::Tools::Run::Samtools;
  
  my $hg_bgz_fai_path = "${hg_bgz_path}.fai";
  if( ! -e $hg_bgz_fai_path ) {
      my $faidx_tool = Bio::Tools::Run::Samtools->new( -command => 'faidx' );
      $faidx_tool->run( -fas => $hg_bgz_path )
          or die "Could not index FASTA file $hg_bgz_path: " . $faidx_tool->error_string;
  }
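
With the C<.fai> index in place, subsequences can be fetched by region string. Here is a minimal sketch using the C<Bio::DB::HTS::Faidx> interface that comes with C<Bio::DB::HTS> (the region below is an arbitrary example, not one used by the model):

  use Bio::DB::HTS::Faidx;

  my $faidx = Bio::DB::HTS::Faidx->new( $hg_bgz_path );
  # get_sequence() returns the sequence and its length in list context.
  my ($seq, $len) = $faidx->get_sequence( 'chr22:1000000-1000100' );
  say "Fetched $len bases";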

=head2 Model input and output specification

Now we create a helper that calls C<saved_model_cli>, then run C<saved_model_cli scan> to check that the model does not use any denylisted I/O operations (a security precaution).

  sub saved_model_cli {
      my (@rest) = @_;
      if( File::Which::which('saved_model_cli')) {
          system(qw(saved_model_cli), @rest ) == 0
              or die "Could not run saved_model_cli";
      } else {
          warn "saved_model_cli(): Install the tensorflow Python package to get the `saved_model_cli` command.\n";
          return -1;
      }
  }
  
  say "Checking with saved_model_cli scan:";
  saved_model_cli( qw(scan),
      qw(--dir) => $model_base,
  );

B<STREAM (STDOUT)>:

  Checking with saved_model_cli scan:
  MetaGraph with tag set ['serve'] does not contain the default denylisted ops: {'ReadFile', 'PrintV2', 'WriteFile'}

B<RESULT>:

  1

We need to see what the inputs and outputs of this model are, so we run C<saved_model_cli show>:

  saved_model_cli( qw(show),
      qw(--dir) => $model_base,
      qw(--all),
  );

B<STREAM (STDOUT)>:

  MetaGraphDef with tag-set: 'serve' contains the following SignatureDefs:
  
  signature_def['__saved_model_init_op']:
    The given SavedModel SignatureDef contains the following input(s):
    The given SavedModel SignatureDef contains the following output(s):
      outputs['__saved_model_init_op'] tensor_info:
          dtype: DT_INVALID
          shape: unknown_rank
          name: NoOp
    Method name is: 
  
  Concrete Functions:

B<RESULT>:

  1

It appears that it does not! The only SignatureDef shown is the init op, with no usable inputs or outputs. What we can do is load the model using C<tensorflow> in Python and then save it out to a new path. When we then run C<saved_model_cli show> on this new model path, it shows the correct inputs and outputs.

  my $new_model_base = "${model_base}_new";
  

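The re-export calls Python via C<system> (reproduced here from the script portion of this document):

  system( qw(python3), qw(-c) => <<EOF, $model_base, $new_model_base ) unless -e $new_model_base;
  import sys
  import tensorflow as tf

  in_path, out_path  = sys.argv[1:3]

  imported_model = tf.saved_model.load(in_path).model
  tf.saved_model.save( imported_model , out_path )
  EOF

and then C<saved_model_cli show> is run against the new model path:

  saved_model_cli( qw(show),
      qw(--dir) => $new_model_base,
      qw(--all),
  );

The cells that load the re-saved model and run inference are not reproduced in this copy. As a rough sketch of the loading step, following the pattern used by the other notebooks in this manual (the operation names below are placeholders; read the real ones from the C<saved_model_cli show> output):

  # A sketch only: load the re-saved model into a graph and session.
  my $opt     = AI::TensorFlow::Libtensorflow::SessionOptions->New;
  my $graph   = AI::TensorFlow::Libtensorflow::Graph->New;
  my $session = AI::TensorFlow::Libtensorflow::Session->LoadFromSavedModel(
      $opt, undef, $new_model_base, [ 'serve' ], $graph, undef, $s
  );
  AssertOK($s);

  # The input is a one-hot encoded sequence of $model_sequence_length
  # base pairs; 'serving_default_args_0' is a placeholder name.
  my $input_op  = $graph->OperationByName('serving_default_args_0');
  my $output_op = $graph->OperationByName('StatefulPartitionedCall');

After running the session on the one-hot encoded input for a target interval and converting the human head's output with C<FloatTFTensorToPDL>, the predictions ndarray has 5313 tracks of 896 bins each: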


  PDL: Float D [5313,896]

=head2 Plot predicted tracks

These predictions can be plotted as genomic tracks using C<PDL::Graphics::Gnuplot>:

  my @tracks = (
      [ 'DNASE:CD14-positive monocyte female' =>   41 => $predictions_p->slice('(41)') ],
      [ 'DNASE:keratinocyte female'           =>   42 => $predictions_p->slice('(42)') ],
      [ 'CHIP:H3K27ac:keratinocyte female'    =>  706 => $predictions_p->slice('(706)')],
      [ 'CAGE:Keratinocyte - epidermal'       => 4799 => log10(1 + $predictions_p->slice('(4799)')) ],
  );
  
  use PDL::Graphics::Gnuplot;
  
  my $plot_output_path = 'enformer-target-interval-tracks.png';
  my $gp = gpwin('pngcairo', font => ",10", output => $plot_output_path, size => [10,2. * @tracks], aa => 2 );
  
  $gp->multiplot( layout => [1, scalar @tracks], title => $target_interval );
  
  $gp->options(
      offsets => [ graph => "0.01, 0, 0, 0" ],
      lmargin => "at screen 0.05",
  );
  
  my $x = zeroes($predictions_p->dim(1))->xlinvals($target_interval->start, $target_interval->end);
  
  my @tics_opts = (mirror => 0, out => 1);
  
  for my $i (0..$#tracks) {
      my ($title, $id, $y) = @{$tracks[$i]};
      $gp->plot( {
              title => $title,
              border => [2],
              ytics => { @tics_opts, locations => [ ceil(($y->max-$y->min)/2)->sclr ] },
              ( $i == $#tracks
                  ? ( xtics => { format => '%.3f', @tics_opts } )
                  : ( xtics => 0 ) ),
              ( $i == $#tracks ? ( xlabel => 'location ({/Symbol \264}10^7 bases)' ) : ()  ),
  
          },
          with => 'filledcurves',
          #'lc' => '#086eb5',
  
          # $x scaled by 1e7; filled curve between $y and the x-axis
          $x / 1e7, $y, pdl(0)
      );
  }
  
  $gp->end_multi;
  
  $gp->close;
  
  if( IN_IPERL ) {
      IPerl->png( bytestream => path($plot_output_path)->slurp_raw );
  }

B<DISPLAY>:

=for html <span style="display:inline-block;margin-left:1em;"><p><img alt="Predicted DNASE, CHIP, and CAGE tracks over the target interval" src="enformer-target-interval-tracks.png"/></p></span>

=head2 Parts of the original notebook that fall outside the scope

In the original notebook, there are several more steps that have not been ported here:

=over

=item 1.

"Compute contribution scores":

This task requires implementing support for C<@tf.function> in order to compute gradients.

=item 2.

"Predict the effect of a genetic variant" and "Score multiple variants":

The first task is possible, but the second requires loading a scikit-learn pre-processing pipeline, and unfortunately that pipeline is stored as a pickle file that is only valid for an older version of scikit-learn (version 0.23.2), and as such it is out of scope here.

=back

  # Some code that could be used for working with variants
  # (kept inside a here-doc string so that it is not executed).
  1 if <<'COMMENT';
  
  use Bio::DB::HTS::VCF;
  
  my $clinvar_tbi_path = "${clinvar_path}.tbi";
  unless( -f $clinvar_tbi_path ) {
      system( qw(tabix), $clinvar_path );
  }
  my $v = Bio::DB::HTS::VCF->new( filename => $clinvar_path );
  $v->num_variants
  
  COMMENT
  
  undef;

=head1 RESOURCE USAGE

  use Filesys::DiskUsage qw/du/;
  
  my $total = du( { 'human-readable' => 1, dereference => 1 },
      $model_archive_path, $model_base, $new_model_base,
  
      $targets_path,
  
      $hg_gz_path,
      $hg_bgz_path, $hg_bgz_fai_path,
  
      $clinvar_path,
  
      $plot_output_path,
  );
  
  say "Disk space usage: $total"; undef;

B<STREAM (STDOUT)>:

  Disk space usage: 4.66G


