AI-TensorFlow-Libtensorflow
view release on metacpan or search on metacpan
lib/AI/TensorFlow/Libtensorflow/Manual/Notebook/InferenceUsingTFHubEnformerGeneExprPredModel.pod view on Meta::CPAN
# PODNAME: AI::TensorFlow::Libtensorflow::Manual::Notebook::InferenceUsingTFHubEnformerGeneExprPredModel
## DO NOT EDIT. Generated from notebook/InferenceUsingTFHubEnformerGeneExprPredModel.ipynb using ./maint/process-notebook.pl.
use strict;
use warnings;
use utf8;
use constant IN_IPERL => !! $ENV{PERL_IPERL_RUNNING};
no if IN_IPERL, warnings => 'redefine'; # fewer messages when re-running cells
use feature qw(say);
use Syntax::Construct qw( // );
use lib::projectroot qw(lib);
BEGIN {
if( IN_IPERL ) {
$ENV{TF_CPP_MIN_LOG_LEVEL} = 3;
}
require AI::TensorFlow::Libtensorflow;
}
use URI ();
use HTTP::Tiny ();
use Path::Tiny qw(path);
use File::Which ();
use List::Util ();
use Data::Printer ( output => 'stderr', return_value => 'void', filters => ['PDL'] );
use Data::Printer::Filter::PDL ();
use Text::Table::Tiny qw(generate_table);
my $s = AI::TensorFlow::Libtensorflow::Status->New;
sub AssertOK {
die "Status $_[0]: " . $_[0]->Message
unless $_[0]->GetCode == AI::TensorFlow::Libtensorflow::Status::OK;
return;
}
AssertOK($s);
use PDL;
use AI::TensorFlow::Libtensorflow::DataType qw(FLOAT);
use FFI::Platypus::Memory qw(memcpy);
use FFI::Platypus::Buffer qw(scalar_to_pointer);
sub FloatPDLTOTFTensor {
my ($p) = @_;
return AI::TensorFlow::Libtensorflow::Tensor->New(
FLOAT, [ reverse $p->dims ], $p->get_dataref, sub { undef $p }
);
}
sub FloatTFTensorToPDL {
my ($t) = @_;
my $pdl = zeros(float,reverse( map $t->Dim($_), 0..$t->NumDims-1 ) );
memcpy scalar_to_pointer( ${$pdl->get_dataref} ),
scalar_to_pointer( ${$t->Data} ),
$t->ByteSize;
$pdl->upd_data;
$pdl;
}
# Model handle
my $model_uri = URI->new( 'https://tfhub.dev/deepmind/enformer/1' );
$model_uri->query_form( 'tf-hub-format' => 'compressed' );
my $model_base = substr( $model_uri->path, 1 ) =~ s,/,_,gr;
my $model_archive_path = "${model_base}.tar.gz";
my $model_sequence_length = 393_216; # bp
# Human targets from Basenji2 dataset
my $targets_uri = URI->new('https://raw.githubusercontent.com/calico/basenji/master/manuscripts/cross2020/targets_human.txt');
my $targets_path = 'targets_human.txt';
# Human reference genome
my $hg_uri = URI->new("http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz");
my $hg_gz_path = "hg38.fa.gz";
# From http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/md5sum.txt
my $hg_md5_digest = "1c9dcaddfa41027f17cd8f7a82c7293b";
my $clinvar_uri = URI->new('https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz');
my $clinvar_path = 'clinvar.vcf.gz';
my $http = HTTP::Tiny->new;
for my $download ( [ $model_uri => $model_archive_path ],
[ $targets_uri => $targets_path ],
[ $hg_uri => $hg_gz_path ],
[ $clinvar_uri => $clinvar_path ],) {
my ($uri, $path) = @$download;
say "Downloading $uri to $path";
next if -e $path;
$http->mirror( $uri, $path );
}
use Archive::Extract;
$Archive::Extract::DEBUG = 1;
$Archive::Extract::PREFER_BIN = 1; # for the larger model, prefer bin
if( ! -e $model_base ) {
my $ae = Archive::Extract->new( archive => $model_archive_path );
die "Could not extract archive" unless $ae->extract( to => $model_base );
}
use Digest::file qw(digest_file_hex);
if( digest_file_hex( $hg_gz_path, "MD5" ) eq $hg_md5_digest ) {
say "MD5 sum for $hg_gz_path OK";
} else {
die "Digest for $hg_gz_path failed";
}
(my $hg_uncompressed_path = $hg_gz_path) =~ s/\.gz$//;
my $hg_bgz_path = "${hg_uncompressed_path}.bgz";
use IPC::Run;
lib/AI/TensorFlow/Libtensorflow/Manual/Notebook/InferenceUsingTFHubEnformerGeneExprPredModel.pod view on Meta::CPAN
use Bio::Tools::Run::Samtools;
my $hg_bgz_fai_path = "${hg_bgz_path}.fai";
if( ! -e $hg_bgz_fai_path ) {
my $faidx_tool = Bio::Tools::Run::Samtools->new( -command => 'faidx' );
$faidx_tool->run( -fas => $hg_bgz_path )
or die "Could not index FASTA file $hg_bgz_path: " . $faidx_tool->error_string;
}
sub saved_model_cli {
my (@rest) = @_;
if( File::Which::which('saved_model_cli')) {
system(qw(saved_model_cli), @rest ) == 0
or die "Could not run saved_model_cli";
} else {
warn "saved_model_cli(): Install the tensorflow Python package to get the `saved_model_cli` command.\n";
return -1;
}
}
say "Checking with saved_model_cli scan:";
saved_model_cli( qw(scan),
qw(--dir) => $model_base,
);
saved_model_cli( qw(show),
qw(--dir) => $model_base,
qw(--all),
);
my $new_model_base = "${model_base}_new";
system( qw(python3), qw(-c) => <<EOF, $model_base, $new_model_base ) unless -e $new_model_base;
import sys
import tensorflow as tf
in_path, out_path = sys.argv[1:3]
imported_model = tf.saved_model.load(in_path).model
tf.saved_model.save( imported_model , out_path )
EOF
saved_model_cli( qw(show),
qw(--dir) => $new_model_base,
qw(--all),
);
my $model_central_base_pairs_length = 114_688; # bp
my $model_central_base_pair_window_size = 128; # bp / prediction
say "Number of predictions: ", $model_central_base_pairs_length / $model_central_base_pair_window_size;
use Data::Frame;
my $df = Data::Frame->from_csv( $targets_path, sep => "\t" )
->transform({
file => sub {
my ($col, $df) = @_;
# clean up the paths in 'file' column
[map { join "/", (split('/', $_))[7..8] } $col->list];
}
});
say "Number of targets: ", $df->nrow;
say "";
say "First 5:";
say $df->head(5);
my $opt = AI::TensorFlow::Libtensorflow::SessionOptions->New;
my @tags = ( 'serve' );
my $graph = AI::TensorFlow::Libtensorflow::Graph->New;
my $session = AI::TensorFlow::Libtensorflow::Session->LoadFromSavedModel(
$opt, undef, $new_model_base, \@tags, $graph, undef, $s
);
AssertOK($s);
my %puts = (
## Inputs
inputs_args_0 =>
AI::TensorFlow::Libtensorflow::Output->New({
oper => $graph->OperationByName('serving_default_args_0'),
index => 0,
}),
## Outputs
outputs_human =>
AI::TensorFlow::Libtensorflow::Output->New({
oper => $graph->OperationByName('StatefulPartitionedCall'),
index => 0,
}),
outputs_mouse =>
AI::TensorFlow::Libtensorflow::Output->New({
oper => $graph->OperationByName('StatefulPartitionedCall'),
index => 1,
}),
);
p %puts;
my $predict_on_batch = sub {
my ($session, $t) = @_;
my @outputs_t;
$session->Run(
undef,
[$puts{inputs_args_0}], [$t],
[$puts{outputs_human}], \@outputs_t,
undef,
undef,
$s
);
AssertOK($s);
return $outputs_t[0];
};
undef;
lib/AI/TensorFlow/Libtensorflow/Manual/Notebook/InferenceUsingTFHubEnformerGeneExprPredModel.pod view on Meta::CPAN
=head2 Load the library
First, we need to load the C<AI::TensorFlow::Libtensorflow> library and more helpers. We then create an C<AI::TensorFlow::Libtensorflow::Status> object and helper function to make sure that the calls to the C<libtensorflow> C library are working prop...
use strict;
use warnings;
use utf8;
use constant IN_IPERL => !! $ENV{PERL_IPERL_RUNNING};
no if IN_IPERL, warnings => 'redefine'; # fewer messages when re-running cells
use feature qw(say);
use Syntax::Construct qw( // );
use lib::projectroot qw(lib);
BEGIN {
if( IN_IPERL ) {
$ENV{TF_CPP_MIN_LOG_LEVEL} = 3;
}
require AI::TensorFlow::Libtensorflow;
}
use URI ();
use HTTP::Tiny ();
use Path::Tiny qw(path);
use File::Which ();
use List::Util ();
use Data::Printer ( output => 'stderr', return_value => 'void', filters => ['PDL'] );
use Data::Printer::Filter::PDL ();
use Text::Table::Tiny qw(generate_table);
my $s = AI::TensorFlow::Libtensorflow::Status->New;
sub AssertOK {
die "Status $_[0]: " . $_[0]->Message
unless $_[0]->GetCode == AI::TensorFlow::Libtensorflow::Status::OK;
return;
}
AssertOK($s);
And create helpers for converting between C<PDL> ndarrays and C<TFTensor> ndarrays.
use PDL;
use AI::TensorFlow::Libtensorflow::DataType qw(FLOAT);
use FFI::Platypus::Memory qw(memcpy);
use FFI::Platypus::Buffer qw(scalar_to_pointer);
sub FloatPDLTOTFTensor {
my ($p) = @_;
return AI::TensorFlow::Libtensorflow::Tensor->New(
FLOAT, [ reverse $p->dims ], $p->get_dataref, sub { undef $p }
);
}
sub FloatTFTensorToPDL {
my ($t) = @_;
my $pdl = zeros(float,reverse( map $t->Dim($_), 0..$t->NumDims-1 ) );
memcpy scalar_to_pointer( ${$pdl->get_dataref} ),
scalar_to_pointer( ${$t->Data} ),
$t->ByteSize;
$pdl->upd_data;
$pdl;
}
=head2 Download model and data
=over
=item *
L<Enformer model|https://tfhub.dev/deepmind/enformer/1> from
> Avsec Ž, Agarwal V, Visentin D, Ledsam JR, Grabska-Barwinska A, Taylor KR, Assael Y, Jumper J, Kohli P, Kelley DR. Effective gene expression prediction from sequence by integrating long-range interactions. I<Nat Methods>. 2021 Oct;B<18(10)>:1196...
=item *
L<Human target dataset|https://github.com/calico/basenji/tree/master/manuscripts/cross2020> from
> Kelley DR. Cross-species regulatory sequence activity prediction. I<PLoS Comput Biol>. 2020 Jul 20;B<16(7)>:e1008050. doi: L<10.1371/journal.pcbi.1008050|https://doi.org/10.1371/journal.pcbi.1008050>. PMID: L<32687525|https://pubmed.ncbi.nlm.nih....
=item *
L<UCSC hg38 genome|https://www.ncbi.nlm.nih.gov/assembly/GCA_000001405.15>. More info at L<http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/>; L<Genome Reference Consortium Human Build 38|https://www.ncbi.nlm.nih.gov/assembly/GCF_000001405.26/>...
> Schneider VA, Graves-Lindsay T, Howe K, Bouk N, Chen HC, Kitts PA, Murphy TD, Pruitt KD, Thibaud-Nissen F, Albracht D, Fulton RS, Kremitzki M, Magrini V, Markovic C, McGrath S, Steinberg KM, Auger K, Chow W, Collins J, Harden G, Hubbard T, Pelan ...
=item *
L<ClinVar|https://www.ncbi.nlm.nih.gov/clinvar/> file
> Landrum MJ, Lee JM, Benson M, Brown GR, Chao C, Chitipiralla S, Gu B, Hart J, Hoffman D, Jang W, Karapetyan K, Katz K, Liu C, Maddipatla Z, Malheiro A, McDaniel K, Ovetsky M, Riley G, Zhou G, Holmes JB, Kattman BL, Maglott DR. ClinVar: improving ...
=back
# Model handle
my $model_uri = URI->new( 'https://tfhub.dev/deepmind/enformer/1' );
$model_uri->query_form( 'tf-hub-format' => 'compressed' );
my $model_base = substr( $model_uri->path, 1 ) =~ s,/,_,gr;
my $model_archive_path = "${model_base}.tar.gz";
my $model_sequence_length = 393_216; # bp
# Human targets from Basenji2 dataset
my $targets_uri = URI->new('https://raw.githubusercontent.com/calico/basenji/master/manuscripts/cross2020/targets_human.txt');
my $targets_path = 'targets_human.txt';
# Human reference genome
my $hg_uri = URI->new("http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz");
my $hg_gz_path = "hg38.fa.gz";
# From http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/md5sum.txt
my $hg_md5_digest = "1c9dcaddfa41027f17cd8f7a82c7293b";
my $clinvar_uri = URI->new('https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz');
my $clinvar_path = 'clinvar.vcf.gz';
my $http = HTTP::Tiny->new;
lib/AI/TensorFlow/Libtensorflow/Manual/Notebook/InferenceUsingTFHubEnformerGeneExprPredModel.pod view on Meta::CPAN
Function Name: 'predict_on_batch'
Option #1
Callable with:
Argument #1
args_0: TensorSpec(shape=(None, 393216, 4), dtype=tf.float32, name='args_0')
B<RESULT>:
1
We want to use the C<serve> tag-set and
=over
=item *
the input C<args_0> which has the name C<serving_default_args_0:0> and
=item *
the output C<human> which has the name C<StatefulPartitionedCall:0>.
=back
all of which are C<DT_FLOAT>.
Make note of the shapes that those take. Per the L<model description|https://tfhub.dev/deepmind/enformer/1> at TensorFlow Hub:
=over 2
The input sequence length is 393,216 with the prediction corresponding to 128 base pair windows for the center 114,688 base pairs. The input sequence is one hot encoded using the order of indices corresponding to 'ACGT' with N values being all zeros.
=back
The input shape C<(-1, 393216, 4)> thus represents dimensions C<[batch size] x [sequence length] x [one-hot encoding of ACGT]>.
The output shape C<(-1, 896, 5313)> represents dimensions C<[batch size] x [ predictions along 114,688 base pairs / 128 base pair windows ] x [ human target by index ]>. We can confirm this by doing some calculations:
my $model_central_base_pairs_length = 114_688; # bp
my $model_central_base_pair_window_size = 128; # bp / prediction
say "Number of predictions: ", $model_central_base_pairs_length / $model_central_base_pair_window_size;
B<STREAM (STDOUT)>:
Number of predictions: 896
B<RESULT>:
1
and by looking at the targets file:
use Data::Frame;
my $df = Data::Frame->from_csv( $targets_path, sep => "\t" )
->transform({
file => sub {
my ($col, $df) = @_;
# clean up the paths in 'file' column
[map { join "/", (split('/', $_))[7..8] } $col->list];
}
});
say "Number of targets: ", $df->nrow;
say "";
say "First 5:";
say $df->head(5);
B<STREAM (STDOUT)>:
Number of targets: 5313
First 5:
------------------------------------------------------------------------------------------------------------------------------------------------
index genome identifier file clip scale sum_stat description
------------------------------------------------------------------------------------------------------------------------------------------------
0 0 0 ENCFF833POA encode/ENCSR000EIJ 32 2 mean DNASE:cerebellum male adult (27 years) and male adult (35 years)
1 1 0 ENCFF110QGM encode/ENCSR000EIK 32 2 mean DNASE:frontal cortex male adult (27 years) and male adult (35 years)
2 2 0 ENCFF880MKD encode/ENCSR000EIL 32 2 mean DNASE:chorion
3 3 0 ENCFF463ZLQ encode/ENCSR000EIP 32 2 mean DNASE:Ishikawa treated with 0.02% dimethyl sulfoxide for 1 hour
4 4 0 ENCFF890OGQ encode/ENCSR000EIS 32 2 mean DNASE:GM03348
------------------------------------------------------------------------------------------------------------------------------------------------
B<RESULT>:
1
=head2 Load the model
Let's now load the model in Perl and get the inputs and outputs into a data structure by name.
my $opt = AI::TensorFlow::Libtensorflow::SessionOptions->New;
my @tags = ( 'serve' );
my $graph = AI::TensorFlow::Libtensorflow::Graph->New;
my $session = AI::TensorFlow::Libtensorflow::Session->LoadFromSavedModel(
$opt, undef, $new_model_base, \@tags, $graph, undef, $s
);
AssertOK($s);
my %puts = (
## Inputs
inputs_args_0 =>
AI::TensorFlow::Libtensorflow::Output->New({
oper => $graph->OperationByName('serving_default_args_0'),
index => 0,
}),
## Outputs
outputs_human =>
AI::TensorFlow::Libtensorflow::Output->New({
oper => $graph->OperationByName('StatefulPartitionedCall'),
index => 0,
}),
outputs_mouse =>
AI::TensorFlow::Libtensorflow::Output->New({
oper => $graph->OperationByName('StatefulPartitionedCall'),
index => 1,
( run in 0.997 second using v1.01-cache-2.11-cpan-140bd7fdf52 )