AI-TensorFlow-Libtensorflow
view release on metacpan or search on metacpan
lib/AI/TensorFlow/Libtensorflow/Manual/Notebook/InferenceUsingTFHubEnformerGeneExprPredModel.pod view on Meta::CPAN
# PODNAME: AI::TensorFlow::Libtensorflow::Manual::Notebook::InferenceUsingTFHubEnformerGeneExprPredModel
## DO NOT EDIT. Generated from notebook/InferenceUsingTFHubEnformerGeneExprPredModel.ipynb using ./maint/process-notebook.pl.
use strict;
use warnings;
use utf8;
use constant IN_IPERL => !! $ENV{PERL_IPERL_RUNNING};
no if IN_IPERL, warnings => 'redefine'; # fewer messages when re-running cells
use feature qw(say);
use Syntax::Construct qw( // );
use lib::projectroot qw(lib);
BEGIN {
if( IN_IPERL ) {
$ENV{TF_CPP_MIN_LOG_LEVEL} = 3;
}
require AI::TensorFlow::Libtensorflow;
}
use URI ();
use HTTP::Tiny ();
use Path::Tiny qw(path);
use File::Which ();
use List::Util ();
use Data::Printer ( output => 'stderr', return_value => 'void', filters => ['PDL'] );
use Data::Printer::Filter::PDL ();
use Text::Table::Tiny qw(generate_table);
my $s = AI::TensorFlow::Libtensorflow::Status->New;
sub AssertOK {
die "Status $_[0]: " . $_[0]->Message
unless $_[0]->GetCode == AI::TensorFlow::Libtensorflow::Status::OK;
return;
}
AssertOK($s);
use PDL;
use AI::TensorFlow::Libtensorflow::DataType qw(FLOAT);
use FFI::Platypus::Memory qw(memcpy);
use FFI::Platypus::Buffer qw(scalar_to_pointer);
sub FloatPDLTOTFTensor {
my ($p) = @_;
return AI::TensorFlow::Libtensorflow::Tensor->New(
FLOAT, [ reverse $p->dims ], $p->get_dataref, sub { undef $p }
);
}
sub FloatTFTensorToPDL {
my ($t) = @_;
my $pdl = zeros(float,reverse( map $t->Dim($_), 0..$t->NumDims-1 ) );
memcpy scalar_to_pointer( ${$pdl->get_dataref} ),
scalar_to_pointer( ${$t->Data} ),
$t->ByteSize;
$pdl->upd_data;
$pdl;
}
# Model handle
my $model_uri = URI->new( 'https://tfhub.dev/deepmind/enformer/1' );
$model_uri->query_form( 'tf-hub-format' => 'compressed' );
my $model_base = substr( $model_uri->path, 1 ) =~ s,/,_,gr;
my $model_archive_path = "${model_base}.tar.gz";
my $model_sequence_length = 393_216; # bp
# Human targets from Basenji2 dataset
my $targets_uri = URI->new('https://raw.githubusercontent.com/calico/basenji/master/manuscripts/cross2020/targets_human.txt');
my $targets_path = 'targets_human.txt';
# Human reference genome
my $hg_uri = URI->new("http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz");
my $hg_gz_path = "hg38.fa.gz";
# From http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/md5sum.txt
my $hg_md5_digest = "1c9dcaddfa41027f17cd8f7a82c7293b";
my $clinvar_uri = URI->new('https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz');
my $clinvar_path = 'clinvar.vcf.gz';
my $http = HTTP::Tiny->new;
for my $download ( [ $model_uri => $model_archive_path ],
[ $targets_uri => $targets_path ],
[ $hg_uri => $hg_gz_path ],
[ $clinvar_uri => $clinvar_path ],) {
my ($uri, $path) = @$download;
say "Downloading $uri to $path";
next if -e $path;
$http->mirror( $uri, $path );
}
use Archive::Extract;
$Archive::Extract::DEBUG = 1;
$Archive::Extract::PREFER_BIN = 1; # for the larger model, prefer bin
if( ! -e $model_base ) {
my $ae = Archive::Extract->new( archive => $model_archive_path );
die "Could not extract archive" unless $ae->extract( to => $model_base );
}
use Digest::file qw(digest_file_hex);
if( digest_file_hex( $hg_gz_path, "MD5" ) eq $hg_md5_digest ) {
say "MD5 sum for $hg_gz_path OK";
} else {
die "Digest for $hg_gz_path failed";
}
(my $hg_uncompressed_path = $hg_gz_path) =~ s/\.gz$//;
my $hg_bgz_path = "${hg_uncompressed_path}.bgz";
use IPC::Run;
if( ! -e $hg_bgz_path ) {
IPC::Run::run(
[ qw(gunzip -c) ], '<', $hg_gz_path,
'|',
[ qw(bgzip -c) ], '>', $hg_bgz_path
);
}
use Bio::Tools::Run::Samtools;
my $hg_bgz_fai_path = "${hg_bgz_path}.fai";
if( ! -e $hg_bgz_fai_path ) {
my $faidx_tool = Bio::Tools::Run::Samtools->new( -command => 'faidx' );
$faidx_tool->run( -fas => $hg_bgz_path )
or die "Could not index FASTA file $hg_bgz_path: " . $faidx_tool->error_string;
}
sub saved_model_cli {
my (@rest) = @_;
if( File::Which::which('saved_model_cli')) {
system(qw(saved_model_cli), @rest ) == 0
or die "Could not run saved_model_cli";
} else {
warn "saved_model_cli(): Install the tensorflow Python package to get the `saved_model_cli` command.\n";
return -1;
}
}
say "Checking with saved_model_cli scan:";
saved_model_cli( qw(scan),
qw(--dir) => $model_base,
);
saved_model_cli( qw(show),
qw(--dir) => $model_base,
qw(--all),
);
my $new_model_base = "${model_base}_new";
system( qw(python3), qw(-c) => <<EOF, $model_base, $new_model_base ) unless -e $new_model_base;
import sys
import tensorflow as tf
in_path, out_path = sys.argv[1:3]
imported_model = tf.saved_model.load(in_path).model
tf.saved_model.save( imported_model , out_path )
EOF
saved_model_cli( qw(show),
qw(--dir) => $new_model_base,
qw(--all),
);
my $model_central_base_pairs_length = 114_688; # bp
my $model_central_base_pair_window_size = 128; # bp / prediction
say "Number of predictions: ", $model_central_base_pairs_length / $model_central_base_pair_window_size;
use Data::Frame;
my $df = Data::Frame->from_csv( $targets_path, sep => "\t" )
->transform({
file => sub {
my ($col, $df) = @_;
# clean up the paths in 'file' column
[map { join "/", (split('/', $_))[7..8] } $col->list];
}
});
say "Number of targets: ", $df->nrow;
say "";
lib/AI/TensorFlow/Libtensorflow/Manual/Notebook/InferenceUsingTFHubEnformerGeneExprPredModel.pod view on Meta::CPAN
=item *
the model and data takes 5 GB or more of disk space.
=back
meaning that you will need a total of B<8 GB> of disk space. You may need at least B<4 GB> of free memory to run the model.
=head1 COLOPHON
The following document is either a POD file which can additionally be run as a Perl script or a Jupyter Notebook which can be run in L<IPerl|https://p3rl.org/Devel::IPerl> (viewable online at L<nbviewer|https://nbviewer.org/github/EntropyOrg/perl-AI-...
You will also need the executables C<gunzip>, C<bgzip>, and C<samtools>. Furthermore,
=over
=item *
C<Bio::DB::HTS> requires C<libhts> and
=item *
C<PDL::Graphics::Gnuplot> requires C<gnuplot>.
=back
If you are running the code, you may optionally install the L<C<tensorflow> Python package|https://www.tensorflow.org/install/pip> in order to access the C<saved_model_cli> command, but this is only used for informational purposes.
=head1 TUTORIAL
=head2 Load the library
First, we need to load the C<AI::TensorFlow::Libtensorflow> library and more helpers. We then create an C<AI::TensorFlow::Libtensorflow::Status> object and helper function to make sure that the calls to the C<libtensorflow> C library are working prop...
use strict;
use warnings;
use utf8;
use constant IN_IPERL => !! $ENV{PERL_IPERL_RUNNING};
no if IN_IPERL, warnings => 'redefine'; # fewer messages when re-running cells
use feature qw(say);
use Syntax::Construct qw( // );
use lib::projectroot qw(lib);
BEGIN {
if( IN_IPERL ) {
$ENV{TF_CPP_MIN_LOG_LEVEL} = 3;
}
require AI::TensorFlow::Libtensorflow;
}
use URI ();
use HTTP::Tiny ();
use Path::Tiny qw(path);
use File::Which ();
use List::Util ();
use Data::Printer ( output => 'stderr', return_value => 'void', filters => ['PDL'] );
use Data::Printer::Filter::PDL ();
use Text::Table::Tiny qw(generate_table);
my $s = AI::TensorFlow::Libtensorflow::Status->New;
sub AssertOK {
die "Status $_[0]: " . $_[0]->Message
unless $_[0]->GetCode == AI::TensorFlow::Libtensorflow::Status::OK;
return;
}
AssertOK($s);
And create helpers for converting between C<PDL> ndarrays and C<TFTensor> ndarrays.
use PDL;
use AI::TensorFlow::Libtensorflow::DataType qw(FLOAT);
use FFI::Platypus::Memory qw(memcpy);
use FFI::Platypus::Buffer qw(scalar_to_pointer);
sub FloatPDLTOTFTensor {
my ($p) = @_;
return AI::TensorFlow::Libtensorflow::Tensor->New(
FLOAT, [ reverse $p->dims ], $p->get_dataref, sub { undef $p }
);
}
sub FloatTFTensorToPDL {
my ($t) = @_;
my $pdl = zeros(float,reverse( map $t->Dim($_), 0..$t->NumDims-1 ) );
memcpy scalar_to_pointer( ${$pdl->get_dataref} ),
scalar_to_pointer( ${$t->Data} ),
$t->ByteSize;
$pdl->upd_data;
$pdl;
}
=head2 Download model and data
=over
=item *
L<Enformer model|https://tfhub.dev/deepmind/enformer/1> from
> Avsec Ž, Agarwal V, Visentin D, Ledsam JR, Grabska-Barwinska A, Taylor KR, Assael Y, Jumper J, Kohli P, Kelley DR. Effective gene expression prediction from sequence by integrating long-range interactions. I<Nat Methods>. 2021 Oct;B<18(10)>:1196...
=item *
L<Human target dataset|https://github.com/calico/basenji/tree/master/manuscripts/cross2020> from
> Kelley DR. Cross-species regulatory sequence activity prediction. I<PLoS Comput Biol>. 2020 Jul 20;B<16(7)>:e1008050. doi: L<10.1371/journal.pcbi.1008050|https://doi.org/10.1371/journal.pcbi.1008050>. PMID: L<32687525|https://pubmed.ncbi.nlm.nih....
=item *
L<UCSC hg38 genome|https://www.ncbi.nlm.nih.gov/assembly/GCA_000001405.15>. More info at L<http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/>; L<Genome Reference Consortium Human Build 38|https://www.ncbi.nlm.nih.gov/assembly/GCF_000001405.26/>...
> Schneider VA, Graves-Lindsay T, Howe K, Bouk N, Chen HC, Kitts PA, Murphy TD, Pruitt KD, Thibaud-Nissen F, Albracht D, Fulton RS, Kremitzki M, Magrini V, Markovic C, McGrath S, Steinberg KM, Auger K, Chow W, Collins J, Harden G, Hubbard T, Pelan ...
lib/AI/TensorFlow/Libtensorflow/Manual/Notebook/InferenceUsingTFHubEnformerGeneExprPredModel.pod view on Meta::CPAN
check the MD5 sum of the reference genome to make sure it was downloaded correctly.
=back
use Archive::Extract;
$Archive::Extract::DEBUG = 1;
$Archive::Extract::PREFER_BIN = 1; # for the larger model, prefer bin
if( ! -e $model_base ) {
my $ae = Archive::Extract->new( archive => $model_archive_path );
die "Could not extract archive" unless $ae->extract( to => $model_base );
}
use Digest::file qw(digest_file_hex);
if( digest_file_hex( $hg_gz_path, "MD5" ) eq $hg_md5_digest ) {
say "MD5 sum for $hg_gz_path OK";
} else {
die "Digest for $hg_gz_path failed";
}
B<STREAM (STDOUT)>:
MD5 sum for hg38.fa.gz OK
B<RESULT>:
1
In order to quickly seek for sequences in the reference genome FASTA, we
=over
=item 1.
convert the gzip'd file into a block gzip'd file and
=item 2.
index that C<.bgz> file using C<faidx> from C<samtools>.
=back
(my $hg_uncompressed_path = $hg_gz_path) =~ s/\.gz$//;
my $hg_bgz_path = "${hg_uncompressed_path}.bgz";
use IPC::Run;
if( ! -e $hg_bgz_path ) {
IPC::Run::run(
[ qw(gunzip -c) ], '<', $hg_gz_path,
'|',
[ qw(bgzip -c) ], '>', $hg_bgz_path
);
}
use Bio::Tools::Run::Samtools;
my $hg_bgz_fai_path = "${hg_bgz_path}.fai";
if( ! -e $hg_bgz_fai_path ) {
my $faidx_tool = Bio::Tools::Run::Samtools->new( -command => 'faidx' );
$faidx_tool->run( -fas => $hg_bgz_path )
or die "Could not index FASTA file $hg_bgz_path: " . $faidx_tool->error_string;
}
=head2 Model input and output specification
Now we create a helper to call C<saved_model_cli> and called C<saved_model_cli scan> to ensure that the model is I/O-free for security reasons.
sub saved_model_cli {
my (@rest) = @_;
if( File::Which::which('saved_model_cli')) {
system(qw(saved_model_cli), @rest ) == 0
or die "Could not run saved_model_cli";
} else {
warn "saved_model_cli(): Install the tensorflow Python package to get the `saved_model_cli` command.\n";
return -1;
}
}
say "Checking with saved_model_cli scan:";
saved_model_cli( qw(scan),
qw(--dir) => $model_base,
);
B<STREAM (STDOUT)>:
Checking with saved_model_cli scan:
MetaGraph with tag set ['serve'] does not contain the default denylisted ops: {'ReadFile', 'PrintV2', 'WriteFile'}
B<RESULT>:
1
We need to see what the inputs and outputs of this model are so C<saved_model_cli show> should show us that:
saved_model_cli( qw(show),
qw(--dir) => $model_base,
qw(--all),
);
B<STREAM (STDOUT)>:
MetaGraphDef with tag-set: 'serve' contains the following SignatureDefs:
signature_def['__saved_model_init_op']:
The given SavedModel SignatureDef contains the following input(s):
The given SavedModel SignatureDef contains the following output(s):
outputs['__saved_model_init_op'] tensor_info:
dtype: DT_INVALID
shape: unknown_rank
name: NoOp
Method name is:
Concrete Functions:
B<RESULT>:
1
It appears that it does not! What we can do is load the model using C<tensorflow> in Python and then save it to a new path. Now when we run C<saved_model_cli show> on this new model path, it shows the correct inputs and outputs.
my $new_model_base = "${model_base}_new";
lib/AI/TensorFlow/Libtensorflow/Manual/Notebook/InferenceUsingTFHubEnformerGeneExprPredModel.pod view on Meta::CPAN
PDL: Float D [5313,896]
=head2 Plot predicted tracks
These predictions can be plotted
my @tracks = (
[ 'DNASE:CD14-positive monocyte female' => 41 => $predictions_p->slice('(41)') ],
[ 'DNASE:keratinocyte female' => 42 => $predictions_p->slice('(42)') ],
[ 'CHIP:H3K27ac:keratinocyte female' => 706 => $predictions_p->slice('(706)')],
[ 'CAGE:Keratinocyte - epidermal' => 4799 => log10(1 + $predictions_p->slice('(4799)')) ],
);
use PDL::Graphics::Gnuplot;
my $plot_output_path = 'enformer-target-interval-tracks.png';
my $gp = gpwin('pngcairo', font => ",10", output => $plot_output_path, size => [10,2. * @tracks], aa => 2 );
$gp->multiplot( layout => [1, scalar @tracks], title => $target_interval );
$gp->options(
offsets => [ graph => "0.01, 0, 0, 0" ],
lmargin => "at screen 0.05",
);
my $x = zeroes($predictions_p->dim(1))->xlinvals($target_interval->start, $target_interval->end);
my @tics_opts = (mirror => 0, out => 1);
for my $i (0..$#tracks) {
my ($title, $id, $y) = @{$tracks[$i]};
$gp->plot( {
title => $title,
border => [2],
ytics => { @tics_opts, locations => [ ceil(($y->max-$y->min)/2)->sclr ] },
( $i == $#tracks
? ( xtics => { format => '%.3f', @tics_opts } )
: ( xtics => 0 ) ),
( $i == $#tracks ? ( xlabel => 'location ({/Symbol \264}10^7 bases)' ) : () ),
},
with => 'filledcurves',
#'lc' => '#086eb5',
# $x scaled by 1e7; filled curve between $y and the x-axis
$x / 1e7, $y, pdl(0)
);
}
$gp->end_multi;
$gp->close;
if( IN_IPERL ) {
IPerl->png( bytestream => path($plot_output_path)->slurp_raw );
}
B<DISPLAY>:
=for html <span style="display:inline-block;margin-left:1em;"><p><img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAA+gAAAMgCAIAAAA/et9qAAAgAElEQVR4nOzdd2AUVeIH8Ddb0jshBAIEpSo1GjoIpyAgCOqd3uGdoGBBUQQFRUVBRbkTf9gOBQucqFiwUhSSgJQYCCSBkJBAet1k...
=head2 Parts of the original notebook that fall outside the scope
In the orignal notebook, there are several more steps that have not been ported here:
=over
=item 1.
"Compute contribution scores":
This task requires implementing C<@tf.function> to compile gradients.
=item 2.
"Predict the effect of a genetic variant" and "Score multiple variants":
The first task is possible, but the second task requires loading a pre-processing pipeline for scikit-learn and unfortunately this pipeline is stored as a pickle file that is valid for an older version of scikit-learn (version 0.23.2) and as such its...
=back
# Some code that could be used for working with variants.
1 if <<'COMMENT';
use Bio::DB::HTS::VCF;
my $clinvar_tbi_path = "${clinvar_path}.tbi";
unless( -f $clinvar_tbi_path ) {
system( qw(tabix), $clinvar_path );
}
my $v = Bio::DB::HTS::VCF->new( filename => $clinvar_path );
$v->num_variants
COMMENT
undef;
=head1 RESOURCE USAGE
use Filesys::DiskUsage qw/du/;
my $total = du( { 'human-readable' => 1, dereference => 1 },
$model_archive_path, $model_base, $new_model_base,
$targets_path,
$hg_gz_path,
$hg_bgz_path, $hg_bgz_fai_path,
$clinvar_path,
$plot_output_path,
);
say "Disk space usage: $total"; undef;
B<STREAM (STDOUT)>:
Disk space usage: 4.66G
( run in 1.197 second using v1.01-cache-2.11-cpan-39bf76dae61 )