AI-TensorFlow-Libtensorflow

 view release on metacpan or  search on metacpan

lib/AI/TensorFlow/Libtensorflow/Manual/Notebook/InferenceUsingTFHubEnformerGeneExprPredModel.pod  view on Meta::CPAN

        undef,
        [$puts{inputs_args_0}], [$t],
        [$puts{outputs_human}], \@outputs_t,
        undef,
        undef,
        $s
    );
    AssertOK($s);

    return $outputs_t[0];
};

undef;

use PDL;

our $SHOW_ENCODER = 1;

sub one_hot_dna {
    my ($seq) = @_;

    my $from_alphabet = "NACGT";
    my $to_alphabet   = pack "C*", 0..length($from_alphabet)-1;

    # sequences from UCSC genome have both uppercase and lowercase bases
    my $from_alphabet_tr = $from_alphabet . lc $from_alphabet;
    my $to_alphabet_tr   = $to_alphabet x 2;

    my $p = zeros(byte, bytes::length($seq));
    my $p_dataref = $p->get_dataref;
    ${ $p_dataref } = $seq;
    eval "tr/$from_alphabet_tr/$to_alphabet_tr/" for ${ $p_dataref };
    $p->upd_data;

    my $encoder = append(float(0), identity(float(length($from_alphabet)-1)) );
    say "Encoder is\n", $encoder->info, $encoder if $SHOW_ENCODER;

    my $encoded  = $encoder->index( $p->dummy(0) );

    return $encoded;
}

####

{

say "Testing one-hot encoding:\n";

my $onehot_test_seq = "ACGTNtgcan";
my $test_encoded = one_hot_dna( $onehot_test_seq );
$SHOW_ENCODER = 0;

say "One-hot encoding of sequence '$onehot_test_seq' is:";
say $test_encoded->info, $test_encoded;

}

package Interval {
    use Bio::Location::Simple ();

    use parent qw(Bio::Location::Simple);

    sub center {
        my $self = shift;
        my $center = int( ($self->start + $self->end ) / 2 );
        my $delta = ($self->start + $self->end ) % 2;
        return $center + $delta;
    }

    sub resize {
        my ($self, $width) = @_;
        my $new_interval = $self->clone;

        my $center = $self->center;
        my $half   = int( ($width-1) / 2 );
        my $offset = ($width-1) % 2;

        $new_interval->start( $center - $half - $offset );
        $new_interval->end(   $center + $half  );

        return $new_interval;
    }

    use overload '""' => \&_op_stringify;

    sub _op_stringify { sprintf "%s:%s", $_[0]->seq_id // "(no sequence)", $_[0]->to_FTstring }
}

#####

{

say "Testing interval resizing:\n";
sub _debug_resize {
    my ($interval, $to, $msg) = @_;

    my $resized_interval = $interval->resize($to);

    die "Wrong interval size for $interval --($to)--> $resized_interval"
        unless $resized_interval->length == $to;

    say sprintf "Interval: %s -> %s, length %2d : %s",
        $interval,
        $resized_interval, $resized_interval->length,
        $msg;
}

for my $interval_spec ( [4, 8], [5, 8], [5, 9], [6, 9]) {
    my ($start, $end) = @$interval_spec;
    my $test_interval = Interval->new( -seq_id => 'chr11', -start => $start, -end => $end );
    say sprintf "Testing interval %s with length %d", $test_interval, $test_interval->length;
    say "-----";
    for(0..5) {
        my $base = $test_interval->length;
        my $to = $base + $_;
        _debug_resize $test_interval, $to, "$base -> $to (+ $_)";
    }
    say "";
}

}

lib/AI/TensorFlow/Libtensorflow/Manual/Notebook/InferenceUsingTFHubEnformerGeneExprPredModel.pod  view on Meta::CPAN

  my $onehot_test_seq = "ACGTNtgcan";
  my $test_encoded = one_hot_dna( $onehot_test_seq );
  $SHOW_ENCODER = 0;
  
  say "One-hot encoding of sequence '$onehot_test_seq' is:";
  say $test_encoded->info, $test_encoded;
  
  }

B<STREAM (STDOUT)>:

  Testing one-hot encoding:
  
  Encoder is
  PDL: Float D [5,4]
  [
   [0 1 0 0 0]
   [0 0 1 0 0]
   [0 0 0 1 0]
   [0 0 0 0 1]
  ]
  
  One-hot encoding of sequence 'ACGTNtgcan' is:
  PDL: Float D [4,10]
  [
   [1 0 0 0]
   [0 1 0 0]
   [0 0 1 0]
   [0 0 0 1]
   [0 0 0 0]
   [0 0 0 1]
   [0 0 1 0]
   [0 1 0 0]
   [1 0 0 0]
   [0 0 0 0]
  ]

B<RESULT>:

  1

Note that in the above, the PDL ndarray's

=over

=item *

first dimension is 4 which matches the last dimension of the input C<TFTensor>;

=item *

second dimension is the sequence length which matches the penultimate dimension of the input C<TFTensor>.

=back

Now we need a way to deal with the sequence interval. We're going to use 1-based coordinates as BioPerl does. In fact, we'll extend a BioPerl class.

  package Interval {
      use Bio::Location::Simple ();
  
      use parent qw(Bio::Location::Simple);
  
      sub center {
          my $self = shift;
          my $center = int( ($self->start + $self->end ) / 2 );
          my $delta = ($self->start + $self->end ) % 2;
          return $center + $delta;
      }
  
      sub resize {
          my ($self, $width) = @_;
          my $new_interval = $self->clone;
  
          my $center = $self->center;
          my $half   = int( ($width-1) / 2 );
          my $offset = ($width-1) % 2;
  
          $new_interval->start( $center - $half - $offset );
          $new_interval->end(   $center + $half  );
  
          return $new_interval;
      }
  
      use overload '""' => \&_op_stringify;
  
      sub _op_stringify { sprintf "%s:%s", $_[0]->seq_id // "(no sequence)", $_[0]->to_FTstring }
  }
  
  #####
  
  {
  
  say "Testing interval resizing:\n";
  sub _debug_resize {
      my ($interval, $to, $msg) = @_;
  
      my $resized_interval = $interval->resize($to);
  
      die "Wrong interval size for $interval --($to)--> $resized_interval"
          unless $resized_interval->length == $to;
  
      say sprintf "Interval: %s -> %s, length %2d : %s",
          $interval,
          $resized_interval, $resized_interval->length,
          $msg;
  }
  
  for my $interval_spec ( [4, 8], [5, 8], [5, 9], [6, 9]) {
      my ($start, $end) = @$interval_spec;
      my $test_interval = Interval->new( -seq_id => 'chr11', -start => $start, -end => $end );
      say sprintf "Testing interval %s with length %d", $test_interval, $test_interval->length;
      say "-----";
      for(0..5) {
          my $base = $test_interval->length;
          my $to = $base + $_;
          _debug_resize $test_interval, $to, "$base -> $to (+ $_)";
      }
      say "";
  }
  
  }

lib/AI/TensorFlow/Libtensorflow/Manual/Notebook/InferenceUsingTFHubEnformerGeneExprPredModel.pod  view on Meta::CPAN

  my $v = Bio::DB::HTS::VCF->new( filename => $clinvar_path );
  $v->num_variants
  
  COMMENT
  
  undef;

=head1 RESOURCE USAGE

  use Filesys::DiskUsage qw/du/;
  
  my $total = du( { 'human-readable' => 1, dereference => 1 },
      $model_archive_path, $model_base, $new_model_base,
  
      $targets_path,
  
      $hg_gz_path,
      $hg_bgz_path, $hg_bgz_fai_path,
  
      $clinvar_path,
  
      $plot_output_path,
  );
  
  say "Disk space usage: $total"; undef;

B<STREAM (STDOUT)>:

  Disk space usage: 4.66G

=head1 CPANFILE

  requires 'AI::TensorFlow::Libtensorflow';
  requires 'AI::TensorFlow::Libtensorflow::DataType';
  requires 'Archive::Extract';
  requires 'Bio::DB::HTS::Faidx';
  requires 'Bio::Location::Simple';
  requires 'Bio::Tools::Run::Samtools';
  requires 'Data::Frame';
  requires 'Data::Printer';
  requires 'Data::Printer::Filter::PDL';
  requires 'Devel::Timer';
  requires 'Digest::file';
  requires 'FFI::Platypus::Buffer';
  requires 'FFI::Platypus::Memory';
  requires 'File::Which';
  requires 'Filesys::DiskUsage';
  requires 'HTTP::Tiny';
  requires 'IPC::Run';
  requires 'List::Util';
  requires 'PDL';
  requires 'PDL::Graphics::Gnuplot';
  requires 'Path::Tiny';
  requires 'Syntax::Construct';
  requires 'Text::Table::Tiny';
  requires 'URI';
  requires 'constant';
  requires 'feature';
  requires 'lib::projectroot';
  requires 'overload';
  requires 'parent';
  requires 'strict';
  requires 'utf8';
  requires 'warnings';

=head1 AUTHOR

Zakariyya Mughal <zmughal@cpan.org>

=head1 COPYRIGHT AND LICENSE

This software is Copyright (c) 2022-2023 by Auto-Parallel Technologies, Inc.

This is free software, licensed under:

  The Apache License, Version 2.0, January 2004

=cut



( run in 1.249 second using v1.01-cache-2.11-cpan-39bf76dae61 )