AI-TensorFlow-Libtensorflow
view release on metacpan or search on metacpan
lib/AI/TensorFlow/Libtensorflow/Manual/Notebook/InferenceUsingTFHubEnformerGeneExprPredModel.pod view on Meta::CPAN
# Human targets from Basenji2 dataset
my $targets_uri  = URI->new('https://raw.githubusercontent.com/calico/basenji/master/manuscripts/cross2020/targets_human.txt');
my $targets_path = 'targets_human.txt';
# Human reference genome
my $hg_uri     = URI->new("http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz");
my $hg_gz_path = "hg38.fa.gz";
# From http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/md5sum.txt
my $hg_md5_digest = "1c9dcaddfa41027f17cd8f7a82c7293b";
my $clinvar_uri  = URI->new('https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz');
my $clinvar_path = 'clinvar.vcf.gz';
my $http = HTTP::Tiny->new;
# Fetch each ( URI => local path ) pair unless the file is already present.
for my $download ( [ $model_uri   => $model_archive_path ],
                   [ $targets_uri => $targets_path ],
                   [ $hg_uri      => $hg_gz_path ],
                   [ $clinvar_uri => $clinvar_path ], ) {
    my ($uri, $path) = @$download;
    # Check for an existing file BEFORE announcing the download; the
    # original printed "Downloading ..." and only then skipped.
    if( -e $path ) {
        say "Already have $path; skipping download of $uri";
        next;
    }
    say "Downloading $uri to $path";
    # mirror() returns a response hash; the original ignored failures.
    my $response = $http->mirror( $uri, $path );
    $response->{success}
        or die "Failed to download $uri: $response->{status} $response->{reason}";
}
use Archive::Extract;
# Emit extraction diagnostics so large-archive progress is visible.
$Archive::Extract::DEBUG = 1;
$Archive::Extract::PREFER_BIN = 1; # for the larger model, prefer bin
# Unpack the downloaded model archive into $model_base; skip if already done.
if( ! -e $model_base ) {
my $ae = Archive::Extract->new( archive => $model_archive_path );
die "Could not extract archive" unless $ae->extract( to => $model_base );
}
use Digest::file qw(digest_file_hex);
# Verify the reference-genome download against the published MD5 checksum
# (taken from the UCSC md5sum.txt above); abort on mismatch.
if( digest_file_hex( $hg_gz_path, "MD5" ) eq $hg_md5_digest ) {
say "MD5 sum for $hg_gz_path OK";
} else {
die "Digest for $hg_gz_path failed";
}
# Recompress hg38.fa.gz (plain gzip) into hg38.fa.bgz (block gzip / BGZF)
# so the FASTA can be randomly accessed via faidx.
(my $hg_uncompressed_path = $hg_gz_path) =~ s/\.gz$//;
my $hg_bgz_path = "${hg_uncompressed_path}.bgz";
use IPC::Run;
if( ! -e $hg_bgz_path ) {
    # Equivalent of: gunzip -c hg38.fa.gz | bgzip -c > hg38.fa.bgz
    # IPC::Run::run() returns true only when every child exits with
    # status 0; the original discarded this, so a failed pipeline could
    # leave a truncated .bgz behind unnoticed.
    IPC::Run::run(
        [ qw(gunzip -c) ], '<', $hg_gz_path,
        '|',
        [ qw(bgzip -c) ], '>', $hg_bgz_path
    ) or die "Could not convert $hg_gz_path to block gzip $hg_bgz_path";
}
use Bio::Tools::Run::Samtools;
# Index the block-gzipped FASTA with `samtools faidx` so sequences can be
# fetched by region; skip if the .fai index already exists.
my $hg_bgz_fai_path = "${hg_bgz_path}.fai";
if( ! -e $hg_bgz_fai_path ) {
my $faidx_tool = Bio::Tools::Run::Samtools->new( -command => 'faidx' );
$faidx_tool->run( -fas => $hg_bgz_path )
or die "Could not index FASTA file $hg_bgz_path: " . $faidx_tool->error_string;
}
# saved_model_cli(@args)
#
# Run the `saved_model_cli` tool (installed with the tensorflow Python
# package) with the given command-line arguments.
#
# Returns 0 on success, or -1 (after a warning) when the tool is not on
# PATH; dies when the tool is present but exits non-zero. The original
# left the success-path return value implicit.
sub saved_model_cli {
    my (@args) = @_;
    if( File::Which::which('saved_model_cli') ) {
        # List-form system() bypasses the shell entirely.
        system( 'saved_model_cli', @args ) == 0
            or die "Could not run saved_model_cli";
        return 0;
    } else {
        warn "saved_model_cli(): Install the tensorflow Python package to get the `saved_model_cli` command.\n";
        return -1;
    }
}
say "Checking with saved_model_cli scan:";
# `scan` checks the SavedModel for denylisted (I/O-capable) ops.
saved_model_cli( qw(scan),
qw(--dir) => $model_base,
);
# `show --all` dumps the model's SignatureDefs (declared inputs/outputs).
saved_model_cli( qw(show),
qw(--dir) => $model_base,
qw(--all),
);
my $new_model_base = "${model_base}_new";
# The downloaded SavedModel does not expose its signatures to
# `saved_model_cli show`, so load it with TensorFlow in Python and
# re-save it to a new path. Check system()'s exit status — the original
# ignored a failed re-save.
unless( -e $new_model_base ) {
    system( qw(python3 -c), <<'EOF', $model_base, $new_model_base ) == 0 or die "Could not re-save model to $new_model_base";
import sys
import tensorflow as tf
in_path, out_path = sys.argv[1:3]
imported_model = tf.saved_model.load(in_path).model
tf.saved_model.save( imported_model , out_path )
EOF
}
# Inspect the re-saved model; this time the signatures should be visible.
saved_model_cli( qw(show),
qw(--dir) => $new_model_base,
qw(--all),
);
# Enformer predicts over a central window of 114,688 bp, one prediction
# per 128 bp bin.
my $model_central_base_pairs_length = 114_688; # bp
my $model_central_base_pair_window_size = 128; # bp / prediction
say "Number of predictions: ", $model_central_base_pairs_length / $model_central_base_pair_window_size;
use Data::Frame;
# Load the targets table and rewrite the 'file' column so that each entry
# keeps only path components 7 and 8 of the original URL-style path.
my $df = Data::Frame->from_csv( $targets_path, sep => "\t" )
    ->transform({
        file => sub {
            my ($column, $frame) = @_;
            my @cleaned;
            for my $original_path ( $column->list ) {
                my @segments = split m{/}, $original_path;
                push @cleaned, join '/', @segments[7..8];
            }
            return \@cleaned;
        }
    });
say "Number of targets: ", $df->nrow;
say "";
lib/AI/TensorFlow/Libtensorflow/Manual/Notebook/InferenceUsingTFHubEnformerGeneExprPredModel.pod view on Meta::CPAN
check the MD5 sum of the reference genome to make sure it was downloaded correctly.
=back
use Archive::Extract;
# Emit extraction diagnostics so large-archive progress is visible.
$Archive::Extract::DEBUG = 1;
$Archive::Extract::PREFER_BIN = 1; # for the larger model, prefer bin
# Unpack the downloaded model archive into $model_base; skip if already done.
if( ! -e $model_base ) {
my $ae = Archive::Extract->new( archive => $model_archive_path );
die "Could not extract archive" unless $ae->extract( to => $model_base );
}
use Digest::file qw(digest_file_hex);
# Verify the reference-genome download against the published MD5 checksum
# (taken from the UCSC md5sum.txt above); abort on mismatch.
if( digest_file_hex( $hg_gz_path, "MD5" ) eq $hg_md5_digest ) {
say "MD5 sum for $hg_gz_path OK";
} else {
die "Digest for $hg_gz_path failed";
}
B<STREAM (STDOUT)>:
MD5 sum for hg38.fa.gz OK
B<RESULT>:
1
In order to quickly look up sequences in the reference genome FASTA, we
=over
=item 1.
convert the gzip'd file into a block gzip'd file and
=item 2.
index that C<.bgz> file using C<faidx> from C<samtools>.
=back
# Recompress hg38.fa.gz (plain gzip) into hg38.fa.bgz (block gzip / BGZF)
# so the FASTA can be randomly accessed via faidx.
(my $hg_uncompressed_path = $hg_gz_path) =~ s/\.gz$//;
my $hg_bgz_path = "${hg_uncompressed_path}.bgz";
use IPC::Run;
if( ! -e $hg_bgz_path ) {
    # Equivalent of: gunzip -c hg38.fa.gz | bgzip -c > hg38.fa.bgz
    # IPC::Run::run() returns true only when every child exits with
    # status 0; the original discarded this, so a failed pipeline could
    # leave a truncated .bgz behind unnoticed.
    IPC::Run::run(
        [ qw(gunzip -c) ], '<', $hg_gz_path,
        '|',
        [ qw(bgzip -c) ], '>', $hg_bgz_path
    ) or die "Could not convert $hg_gz_path to block gzip $hg_bgz_path";
}
use Bio::Tools::Run::Samtools;
# Index the block-gzipped FASTA with `samtools faidx` so sequences can be
# fetched by region; skip if the .fai index already exists.
my $hg_bgz_fai_path = "${hg_bgz_path}.fai";
if( ! -e $hg_bgz_fai_path ) {
my $faidx_tool = Bio::Tools::Run::Samtools->new( -command => 'faidx' );
$faidx_tool->run( -fas => $hg_bgz_path )
or die "Could not index FASTA file $hg_bgz_path: " . $faidx_tool->error_string;
}
=head2 Model input and output specification
Now we create a helper to call C<saved_model_cli> and call C<saved_model_cli scan> to ensure that the model is I/O-free for security reasons.
# saved_model_cli(@args)
#
# Run the `saved_model_cli` tool (installed with the tensorflow Python
# package) with the given command-line arguments.
#
# Returns 0 on success, or -1 (after a warning) when the tool is not on
# PATH; dies when the tool is present but exits non-zero. The original
# left the success-path return value implicit.
sub saved_model_cli {
    my (@args) = @_;
    if( File::Which::which('saved_model_cli') ) {
        # List-form system() bypasses the shell entirely.
        system( 'saved_model_cli', @args ) == 0
            or die "Could not run saved_model_cli";
        return 0;
    } else {
        warn "saved_model_cli(): Install the tensorflow Python package to get the `saved_model_cli` command.\n";
        return -1;
    }
}
say "Checking with saved_model_cli scan:";
# `scan` checks the SavedModel for denylisted (I/O-capable) ops.
saved_model_cli( qw(scan),
qw(--dir) => $model_base,
);
B<STREAM (STDOUT)>:
Checking with saved_model_cli scan:
MetaGraph with tag set ['serve'] does not contain the default denylisted ops: {'ReadFile', 'PrintV2', 'WriteFile'}
B<RESULT>:
1
We need to see what the inputs and outputs of this model are so C<saved_model_cli show> should show us that:
# `show --all` dumps the model's SignatureDefs (declared inputs/outputs).
saved_model_cli( qw(show),
qw(--dir) => $model_base,
qw(--all),
);
B<STREAM (STDOUT)>:
MetaGraphDef with tag-set: 'serve' contains the following SignatureDefs:
signature_def['__saved_model_init_op']:
The given SavedModel SignatureDef contains the following input(s):
The given SavedModel SignatureDef contains the following output(s):
outputs['__saved_model_init_op'] tensor_info:
dtype: DT_INVALID
shape: unknown_rank
name: NoOp
Method name is:
Concrete Functions:
B<RESULT>:
1
It appears that it does not! What we can do is load the model using C<tensorflow> in Python and then save it to a new path. Now when we run C<saved_model_cli show> on this new model path, it shows the correct inputs and outputs.
my $new_model_base = "${model_base}_new";
( run in 0.447 second using v1.01-cache-2.11-cpan-f6376fbd888 )