comma results from the CPAN

comma

AI-TensorFlow-Libtensorflow

view release on metacpan or search on metacpan

lib/AI/TensorFlow/Libtensorflow/Manual/Notebook/InferenceUsingTFHubEnformerGeneExprPredModel.pod view on Meta::CPAN

my $model_archive_path = "${model_base}.tar.gz";
my $model_sequence_length = 393_216; # bp

# Human targets from Basenji2 dataset
my $targets_uri  = URI->new('https://raw.githubusercontent.com/calico/basenji/master/manuscripts/cross2020/targets_human.txt');
my $targets_path = 'targets_human.txt';

# Human reference genome
my $hg_uri    = URI->new("http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz");
my $hg_gz_path   = "hg38.fa.gz";
# From http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/md5sum.txt
my $hg_md5_digest = "1c9dcaddfa41027f17cd8f7a82c7293b";

my $clinvar_uri  = URI->new('https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz');
my $clinvar_path = 'clinvar.vcf.gz';

my $http = HTTP::Tiny->new;

for my $download ( [ $model_uri   => $model_archive_path ],
                   [ $targets_uri => $targets_path       ],
                   [ $hg_uri      => $hg_gz_path            ],
                   [ $clinvar_uri => $clinvar_path       ],) {
    my ($uri, $path) = @$download;
    say "Downloading $uri to $path";
    next if -e $path;
    $http->mirror( $uri, $path );
}

use Archive::Extract;
$Archive::Extract::DEBUG      = 1;
$Archive::Extract::PREFER_BIN = 1; # for the larger model, prefer bin
if( ! -e $model_base ) {
    my $ae = Archive::Extract->new( archive => $model_archive_path );
    die "Could not extract archive" unless $ae->extract( to => $model_base );
}

use Digest::file qw(digest_file_hex);
if( digest_file_hex( $hg_gz_path, "MD5" ) eq $hg_md5_digest ) {
    say "MD5 sum for $hg_gz_path OK";
} else {
    die "Digest for $hg_gz_path failed";
}

(my $hg_uncompressed_path = $hg_gz_path) =~ s/\.gz$//;
my $hg_bgz_path = "${hg_uncompressed_path}.bgz";

use IPC::Run;

if( ! -e $hg_bgz_path ) {
    IPC::Run::run(
        [ qw(gunzip -c) ], '<', $hg_gz_path,
        '|',
        [ qw(bgzip -c)  ], '>', $hg_bgz_path
    );
}

use Bio::Tools::Run::Samtools;

my $hg_bgz_fai_path = "${hg_bgz_path}.fai";
if( ! -e $hg_bgz_fai_path ) {
    my $faidx_tool = Bio::Tools::Run::Samtools->new( -command => 'faidx' );
    $faidx_tool->run( -fas => $hg_bgz_path )
        or die "Could not index FASTA file $hg_bgz_path: " . $faidx_tool->error_string;
}

sub saved_model_cli {
    my (@rest) = @_;
    if( File::Which::which('saved_model_cli')) {
        system(qw(saved_model_cli), @rest ) == 0
            or die "Could not run saved_model_cli";
    } else {
        warn "saved_model_cli(): Install the tensorflow Python package to get the `saved_model_cli` command.\n";
        return -1;
    }
}

say "Checking with saved_model_cli scan:";
saved_model_cli( qw(scan),
    qw(--dir) => $model_base,
);

saved_model_cli( qw(show),
    qw(--dir) => $model_base,
    qw(--all),
);

my $new_model_base = "${model_base}_new";

system( qw(python3), qw(-c) => <<EOF, $model_base, $new_model_base ) unless -e $new_model_base;
import sys
import tensorflow as tf

in_path, out_path  = sys.argv[1:3]

imported_model = tf.saved_model.load(in_path).model
tf.saved_model.save( imported_model , out_path )
EOF

saved_model_cli( qw(show),
    qw(--dir) => $new_model_base,
    qw(--all),
);

my $model_central_base_pairs_length     = 114_688; # bp
my $model_central_base_pair_window_size = 128;     # bp / prediction

say "Number of predictions: ", $model_central_base_pairs_length / $model_central_base_pair_window_size;

use Data::Frame;

my $df = Data::Frame->from_csv( $targets_path, sep => "\t" )
    ->transform({
        file => sub {
            my ($col, $df) = @_;
            # clean up the paths in 'file' column
            [map { join "/", (split('/', $_))[7..8] } $col->list];
        }
    });

say "Number of targets: ", $df->nrow;

say "";

say "First 5:";
say $df->head(5);

my $opt = AI::TensorFlow::Libtensorflow::SessionOptions->New;

my @tags = ( 'serve' );
my $graph = AI::TensorFlow::Libtensorflow::Graph->New;
my $session = AI::TensorFlow::Libtensorflow::Session->LoadFromSavedModel(
    $opt, undef, $new_model_base, \@tags, $graph, undef, $s

lib/AI/TensorFlow/Libtensorflow/Manual/Notebook/InferenceUsingTFHubEnformerGeneExprPredModel.pod view on Meta::CPAN


    $clinvar_path,

    $plot_output_path,
);

say "Disk space usage: $total"; undef;

__END__

=pod

=encoding UTF-8

=head1 NAME

AI::TensorFlow::Libtensorflow::Manual::Notebook::InferenceUsingTFHubEnformerGeneExprPredModel - Using TensorFlow to do gene expression prediction using a pre-trained model

=head1 SYNOPSIS

The following tutorial is based on the L<Enformer usage notebook|https://github.com/deepmind/deepmind-research/blob/master/enformer/enformer-usage.ipynb>. It uses a pre-trained model based on a transformer architecture trained as described in Avsec e...

Running the code requires an Internet connection to download the model (from Google servers) and datasets (from GitHub, UCSC, and NIH).

Some of this code is identical to that of C<InferenceUsingTFHubMobileNetV2Model> notebook. Please look there for explanation for that code. As stated there, this will later be wrapped up into a high-level library to hide the details behind an API.

B<NOTE>: If running this model, please be aware that

=over

=item *

the Docker image takes 3 GB or more of disk space;

=item *

the model and data takes 5 GB or more of disk space.

=back

meaning that you will need a total of B<8 GB> of disk space. You may need at least B<4 GB> of free memory to run the model.

=head1 COLOPHON

The following document is either a POD file which can additionally be run as a Perl script or a Jupyter Notebook which can be run in L<IPerl|https://p3rl.org/Devel::IPerl> (viewable online at L<nbviewer|https://nbviewer.org/github/EntropyOrg/perl-AI-...

You will also need the executables C<gunzip>, C<bgzip>, and C<samtools>. Furthermore,

=over

=item *

C<Bio::DB::HTS> requires C<libhts> and

=item *

C<PDL::Graphics::Gnuplot> requires C<gnuplot>.

=back

If you are running the code, you may optionally install the L<C<tensorflow> Python package|https://www.tensorflow.org/install/pip> in order to access the C<saved_model_cli> command, but this is only used for informational purposes.

=head1 TUTORIAL

=head2 Load the library

First, we need to load the C<AI::TensorFlow::Libtensorflow> library and more helpers. We then create an C<AI::TensorFlow::Libtensorflow::Status> object and helper function to make sure that the calls to the C<libtensorflow> C library are working prop...

  use strict;
  use warnings;
  use utf8;
  use constant IN_IPERL => !! $ENV{PERL_IPERL_RUNNING};
  no if IN_IPERL, warnings => 'redefine'; # fewer messages when re-running cells
  
  use feature qw(say);
  use Syntax::Construct qw( // );
  
  use lib::projectroot qw(lib);
  
  BEGIN {
      if( IN_IPERL ) {
          $ENV{TF_CPP_MIN_LOG_LEVEL} = 3;
      }
      require AI::TensorFlow::Libtensorflow;
  }
  
  use URI ();
  use HTTP::Tiny ();
  use Path::Tiny qw(path);
  
  use File::Which ();
  
  use List::Util ();
  
  use Data::Printer ( output => 'stderr', return_value => 'void', filters => ['PDL'] );
  use Data::Printer::Filter::PDL ();
  use Text::Table::Tiny qw(generate_table);
  
  my $s = AI::TensorFlow::Libtensorflow::Status->New;
  sub AssertOK {
      die "Status $_[0]: " . $_[0]->Message
          unless $_[0]->GetCode == AI::TensorFlow::Libtensorflow::Status::OK;
      return;
  }
  AssertOK($s);

And create helpers for converting between C<PDL> ndarrays and C<TFTensor> ndarrays.

  use PDL;
  use AI::TensorFlow::Libtensorflow::DataType qw(FLOAT);
  
  use FFI::Platypus::Memory qw(memcpy);
  use FFI::Platypus::Buffer qw(scalar_to_pointer);
  
  sub FloatPDLTOTFTensor {
      my ($p) = @_;
      return AI::TensorFlow::Libtensorflow::Tensor->New(
          FLOAT, [ reverse $p->dims ], $p->get_dataref, sub { undef $p }
      );
  }

lib/AI/TensorFlow/Libtensorflow/Manual/Notebook/InferenceUsingTFHubEnformerGeneExprPredModel.pod view on Meta::CPAN

=item 2.

check the MD5 sum of the reference genome to make sure it was downloaded correctly.

=back

  use Archive::Extract;
  $Archive::Extract::DEBUG      = 1;
  $Archive::Extract::PREFER_BIN = 1; # for the larger model, prefer bin
  if( ! -e $model_base ) {
      my $ae = Archive::Extract->new( archive => $model_archive_path );
      die "Could not extract archive" unless $ae->extract( to => $model_base );
  }
  
  use Digest::file qw(digest_file_hex);
  if( digest_file_hex( $hg_gz_path, "MD5" ) eq $hg_md5_digest ) {
      say "MD5 sum for $hg_gz_path OK";
  } else {
      die "Digest for $hg_gz_path failed";
  }

B<STREAM (STDOUT)>:

  MD5 sum for hg38.fa.gz OK

B<RESULT>:

  1

In order to quickly seek for sequences in the reference genome FASTA, we

=over

=item 1.

convert the gzip'd file into a block gzip'd file and

=item 2.

index that C<.bgz> file using C<faidx> from C<samtools>.

=back

  (my $hg_uncompressed_path = $hg_gz_path) =~ s/\.gz$//;
  my $hg_bgz_path = "${hg_uncompressed_path}.bgz";
  
  use IPC::Run;
  
  if( ! -e $hg_bgz_path ) {
      IPC::Run::run(
          [ qw(gunzip -c) ], '<', $hg_gz_path,
          '|',
          [ qw(bgzip -c)  ], '>', $hg_bgz_path
      );
  }
  
  use Bio::Tools::Run::Samtools;
  
  my $hg_bgz_fai_path = "${hg_bgz_path}.fai";
  if( ! -e $hg_bgz_fai_path ) {
      my $faidx_tool = Bio::Tools::Run::Samtools->new( -command => 'faidx' );
      $faidx_tool->run( -fas => $hg_bgz_path )
          or die "Could not index FASTA file $hg_bgz_path: " . $faidx_tool->error_string;
  }

=head2 Model input and output specification

Now we create a helper to call C<saved_model_cli> and called C<saved_model_cli scan> to ensure that the model is I/O-free for security reasons.

  sub saved_model_cli {
      my (@rest) = @_;
      if( File::Which::which('saved_model_cli')) {
          system(qw(saved_model_cli), @rest ) == 0
              or die "Could not run saved_model_cli";
      } else {
          warn "saved_model_cli(): Install the tensorflow Python package to get the `saved_model_cli` command.\n";
          return -1;
      }
  }
  
  say "Checking with saved_model_cli scan:";
  saved_model_cli( qw(scan),
      qw(--dir) => $model_base,
  );

B<STREAM (STDOUT)>:

  Checking with saved_model_cli scan:
  MetaGraph with tag set ['serve'] does not contain the default denylisted ops: {'ReadFile', 'PrintV2', 'WriteFile'}

B<RESULT>:

  1

We need to see what the inputs and outputs of this model are so C<saved_model_cli show> should show us that:

  saved_model_cli( qw(show),
      qw(--dir) => $model_base,
      qw(--all),
  );

B<STREAM (STDOUT)>:

  MetaGraphDef with tag-set: 'serve' contains the following SignatureDefs:
  
  signature_def['__saved_model_init_op']:
    The given SavedModel SignatureDef contains the following input(s):
    The given SavedModel SignatureDef contains the following output(s):
      outputs['__saved_model_init_op'] tensor_info:
          dtype: DT_INVALID
          shape: unknown_rank
          name: NoOp
    Method name is: 
  
  Concrete Functions:
B<RESULT>:

  1

It appears that it does not! What we can do is load the model using C<tensorflow> in Python and then save it to a new path. Now when we run C<saved_model_cli show> on this new model path, it shows the correct inputs and outputs.

  my $new_model_base = "${model_base}_new";
  
  system( qw(python3), qw(-c) => <<EOF, $model_base, $new_model_base ) unless -e $new_model_base;
  import sys
  import tensorflow as tf
  
  in_path, out_path  = sys.argv[1:3]
  
  imported_model = tf.saved_model.load(in_path).model
  tf.saved_model.save( imported_model , out_path )
  EOF
  
  saved_model_cli( qw(show),
      qw(--dir) => $new_model_base,
      qw(--all),

( run in 0.901 second using v1.01-cache-2.11-cpan-fe3c2283af0 )