AI-TensorFlow-Libtensorflow

 view release on metacpan or  search on metacpan

lib/AI/TensorFlow/Libtensorflow/Manual/Notebook/InferenceUsingTFHubEnformerGeneExprPredModel.pod  view on Meta::CPAN


# Human targets from Basenji2 dataset
my $targets_uri  = URI->new('https://raw.githubusercontent.com/calico/basenji/master/manuscripts/cross2020/targets_human.txt');
my $targets_path = 'targets_human.txt';

# Human reference genome
my $hg_uri     = URI->new("http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz");
my $hg_gz_path = "hg38.fa.gz";
# From http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/md5sum.txt
my $hg_md5_digest = "1c9dcaddfa41027f17cd8f7a82c7293b";

my $clinvar_uri  = URI->new('https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz');
my $clinvar_path = 'clinvar.vcf.gz';

my $http = HTTP::Tiny->new;

# Fetch each remote input unless a local copy already exists.
for my $download ( [ $model_uri   => $model_archive_path ],
                   [ $targets_uri => $targets_path       ],
                   [ $hg_uri      => $hg_gz_path         ],
                   [ $clinvar_uri => $clinvar_path       ], ) {
    my ($uri, $path) = @$download;
    # Skip before announcing: previously "Downloading ..." was printed even
    # for files that were already present and about to be skipped.
    next if -e $path;
    say "Downloading $uri to $path";
    # mirror() returns a response hash; check {success} so a network or
    # server failure is not silently ignored (leaving a missing file that
    # later steps would trip over).
    my $response = $http->mirror( $uri, $path );
    die "Failed to download $uri: $response->{status} $response->{reason}"
        unless $response->{success};
}

use Archive::Extract;
$Archive::Extract::DEBUG      = 1;
$Archive::Extract::PREFER_BIN = 1; # for the larger model, prefer bin
# Unpack the downloaded model archive once; skip if already extracted.
unless( -e $model_base ) {
    my $extractor = Archive::Extract->new( archive => $model_archive_path );
    $extractor->extract( to => $model_base )
        or die "Could not extract archive";
}

use Digest::file qw(digest_file_hex);
# Verify the reference genome download against the published MD5 checksum.
my $actual_md5 = digest_file_hex( $hg_gz_path, "MD5" );
die "Digest for $hg_gz_path failed"
    unless $actual_md5 eq $hg_md5_digest;
say "MD5 sum for $hg_gz_path OK";

(my $hg_uncompressed_path = $hg_gz_path) =~ s/\.gz$//;
my $hg_bgz_path = "${hg_uncompressed_path}.bgz";

use IPC::Run;

# Re-compress the plain-gzip reference genome as block gzip (bgzip) so it
# can be randomly accessed via a FASTA index.
if( ! -e $hg_bgz_path ) {
    my $ok = IPC::Run::run(
        [ qw(gunzip -c) ], '<', $hg_gz_path,
        '|',
        [ qw(bgzip -c)  ], '>', $hg_bgz_path
    );
    # run() returns true only when every child exits 0. Without this check,
    # a failed pipeline would leave a truncated .bgz behind, and the -e
    # guard above would treat it as complete on the next invocation.
    unless( $ok ) {
        unlink $hg_bgz_path;
        die "Could not convert $hg_gz_path to block gzip $hg_bgz_path";
    }
}

use Bio::Tools::Run::Samtools;

# Build a FASTA index (.fai) for the block-gzipped genome unless one exists.
my $hg_bgz_fai_path = "${hg_bgz_path}.fai";
unless( -e $hg_bgz_fai_path ) {
    my $samtools = Bio::Tools::Run::Samtools->new( -command => 'faidx' );
    unless( $samtools->run( -fas => $hg_bgz_path ) ) {
        die "Could not index FASTA file $hg_bgz_path: " . $samtools->error_string;
    }
}

# Run the `saved_model_cli` tool (shipped with the tensorflow Python
# package) with the given arguments, dying if the command exits non-zero.
# Warns and returns -1 when the tool is not on PATH.
sub saved_model_cli {
    my @args = @_;
    unless( File::Which::which('saved_model_cli') ) {
        warn "saved_model_cli(): Install the tensorflow Python package to get the `saved_model_cli` command.\n";
        return -1;
    }
    system( 'saved_model_cli', @args ) == 0
        or die "Could not run saved_model_cli";
}

# Security check: `saved_model_cli scan` reports whether the SavedModel
# contains any denylisted (I/O-capable) ops before we load it.
say "Checking with saved_model_cli scan:";
saved_model_cli( qw(scan),
    qw(--dir) => $model_base,
);

# Inspect the model's SignatureDefs (inputs/outputs) as downloaded.
saved_model_cli( qw(show),
    qw(--dir) => $model_base,
    qw(--all),
);

my $new_model_base = "${model_base}_new";

# The downloaded SavedModel hides the usable model behind a `.model`
# attribute (see the Python below); load it and re-save so the standard
# signatures become visible to saved_model_cli and libtensorflow.
unless( -e $new_model_base ) {
    my $status = system( 'python3', '-c' => <<EOF, $model_base, $new_model_base );
import sys
import tensorflow as tf

in_path, out_path  = sys.argv[1:3]

imported_model = tf.saved_model.load(in_path).model
tf.saved_model.save( imported_model , out_path )
EOF
    # system() was previously unchecked: a failed re-save would be silently
    # accepted and later steps would read a missing/broken model.
    $status == 0
        or die "Could not re-save model from $model_base to $new_model_base";
}

saved_model_cli( qw(show),
    qw(--dir) => $new_model_base,
    qw(--all),
);

# The model predicts over a central region of 114,688 bp, emitting one
# prediction per 128 bp bin (so 114_688 / 128 = 896 predictions).
my $model_central_base_pairs_length     = 114_688; # bp
my $model_central_base_pair_window_size = 128;     # bp / prediction

say "Number of predictions: ", $model_central_base_pairs_length / $model_central_base_pair_window_size;

use Data::Frame;

# Load the Basenji2 targets table (tab-separated) and shorten every entry
# in its 'file' column to path components 7 and 8.
my $df = Data::Frame->from_csv( $targets_path, sep => "\t" )
    ->transform({
        file => sub {
            my ($col, $df) = @_;
            # clean up the paths in 'file' column
            my @cleaned;
            for my $full_path ( $col->list ) {
                my @parts = split '/', $full_path;
                push @cleaned, join '/', @parts[7, 8];
            }
            \@cleaned;
        }
    });

say "Number of targets: ", $df->nrow;

say "";

lib/AI/TensorFlow/Libtensorflow/Manual/Notebook/InferenceUsingTFHubEnformerGeneExprPredModel.pod  view on Meta::CPAN

check the MD5 sum of the reference genome to make sure it was downloaded correctly.

=back

  use Archive::Extract;
  $Archive::Extract::DEBUG      = 1;
  $Archive::Extract::PREFER_BIN = 1; # for the larger model, prefer bin
  if( ! -e $model_base ) {
      my $ae = Archive::Extract->new( archive => $model_archive_path );
      die "Could not extract archive" unless $ae->extract( to => $model_base );
  }
  
  use Digest::file qw(digest_file_hex);
  if( digest_file_hex( $hg_gz_path, "MD5" ) eq $hg_md5_digest ) {
      say "MD5 sum for $hg_gz_path OK";
  } else {
      die "Digest for $hg_gz_path failed";
  }

B<STREAM (STDOUT)>:

  MD5 sum for hg38.fa.gz OK

B<RESULT>:

  1

In order to quickly retrieve sequences from the reference genome FASTA, we

=over

=item 1.

convert the gzip'd file into a block gzip'd file and

=item 2.

index that C<.bgz> file using C<faidx> from C<samtools>.

=back

  (my $hg_uncompressed_path = $hg_gz_path) =~ s/\.gz$//;
  my $hg_bgz_path = "${hg_uncompressed_path}.bgz";
  
  use IPC::Run;
  
  if( ! -e $hg_bgz_path ) {
      IPC::Run::run(
          [ qw(gunzip -c) ], '<', $hg_gz_path,
          '|',
          [ qw(bgzip -c)  ], '>', $hg_bgz_path
      );
  }
  
  use Bio::Tools::Run::Samtools;
  
  my $hg_bgz_fai_path = "${hg_bgz_path}.fai";
  if( ! -e $hg_bgz_fai_path ) {
      my $faidx_tool = Bio::Tools::Run::Samtools->new( -command => 'faidx' );
      $faidx_tool->run( -fas => $hg_bgz_path )
          or die "Could not index FASTA file $hg_bgz_path: " . $faidx_tool->error_string;
  }

=head2 Model input and output specification

Now we create a helper to call C<saved_model_cli>, and use it to run C<saved_model_cli scan> to ensure that the model is I/O-free for security reasons.

  sub saved_model_cli {
      my (@rest) = @_;
      if( File::Which::which('saved_model_cli')) {
          system(qw(saved_model_cli), @rest ) == 0
              or die "Could not run saved_model_cli";
      } else {
          warn "saved_model_cli(): Install the tensorflow Python package to get the `saved_model_cli` command.\n";
          return -1;
      }
  }
  
  say "Checking with saved_model_cli scan:";
  saved_model_cli( qw(scan),
      qw(--dir) => $model_base,
  );

B<STREAM (STDOUT)>:

  Checking with saved_model_cli scan:
  MetaGraph with tag set ['serve'] does not contain the default denylisted ops: {'ReadFile', 'PrintV2', 'WriteFile'}

B<RESULT>:

  1

We need to see what the inputs and outputs of this model are so C<saved_model_cli show> should show us that:

  saved_model_cli( qw(show),
      qw(--dir) => $model_base,
      qw(--all),
  );

B<STREAM (STDOUT)>:

  MetaGraphDef with tag-set: 'serve' contains the following SignatureDefs:
  
  signature_def['__saved_model_init_op']:
    The given SavedModel SignatureDef contains the following input(s):
    The given SavedModel SignatureDef contains the following output(s):
      outputs['__saved_model_init_op'] tensor_info:
          dtype: DT_INVALID
          shape: unknown_rank
          name: NoOp
    Method name is: 
  
  Concrete Functions:
B<RESULT>:

  1

It appears that it does not! What we can do is load the model using C<tensorflow> in Python and then save it to a new path. Now when we run C<saved_model_cli show> on this new model path, it shows the correct inputs and outputs.

  my $new_model_base = "${model_base}_new";
  



( run in 0.447 second using v1.01-cache-2.11-cpan-f6376fbd888 )