AI-TensorFlow-Libtensorflow
view release on metacpan or search on metacpan
lib/AI/TensorFlow/Libtensorflow/Manual/Notebook/InferenceUsingTFHubEnformerGeneExprPredModel.pod view on Meta::CPAN
undef,
$s
);
AssertOK($s);
return $outputs_t[0];
};
undef;
use PDL;
our $SHOW_ENCODER = 1;
sub one_hot_dna {
my ($seq) = @_;
my $from_alphabet = "NACGT";
my $to_alphabet = pack "C*", 0..length($from_alphabet)-1;
# sequences from UCSC genome have both uppercase and lowercase bases
my $from_alphabet_tr = $from_alphabet . lc $from_alphabet;
my $to_alphabet_tr = $to_alphabet x 2;
my $p = zeros(byte, bytes::length($seq));
my $p_dataref = $p->get_dataref;
${ $p_dataref } = $seq;
eval "tr/$from_alphabet_tr/$to_alphabet_tr/" for ${ $p_dataref };
$p->upd_data;
my $encoder = append(float(0), identity(float(length($from_alphabet)-1)) );
say "Encoder is\n", $encoder->info, $encoder if $SHOW_ENCODER;
my $encoded = $encoder->index( $p->dummy(0) );
return $encoded;
}
####
{
say "Testing one-hot encoding:\n";
my $onehot_test_seq = "ACGTNtgcan";
my $test_encoded = one_hot_dna( $onehot_test_seq );
$SHOW_ENCODER = 0;
say "One-hot encoding of sequence '$onehot_test_seq' is:";
say $test_encoded->info, $test_encoded;
}
package Interval {
use Bio::Location::Simple ();
use parent qw(Bio::Location::Simple);
sub center {
my $self = shift;
my $center = int( ($self->start + $self->end ) / 2 );
my $delta = ($self->start + $self->end ) % 2;
return $center + $delta;
}
sub resize {
my ($self, $width) = @_;
my $new_interval = $self->clone;
my $center = $self->center;
my $half = int( ($width-1) / 2 );
my $offset = ($width-1) % 2;
$new_interval->start( $center - $half - $offset );
$new_interval->end( $center + $half );
return $new_interval;
}
use overload '""' => \&_op_stringify;
sub _op_stringify { sprintf "%s:%s", $_[0]->seq_id // "(no sequence)", $_[0]->to_FTstring }
}
#####
{
say "Testing interval resizing:\n";
sub _debug_resize {
my ($interval, $to, $msg) = @_;
my $resized_interval = $interval->resize($to);
die "Wrong interval size for $interval --($to)--> $resized_interval"
unless $resized_interval->length == $to;
say sprintf "Interval: %s -> %s, length %2d : %s",
$interval,
$resized_interval, $resized_interval->length,
$msg;
}
for my $interval_spec ( [4, 8], [5, 8], [5, 9], [6, 9]) {
my ($start, $end) = @$interval_spec;
my $test_interval = Interval->new( -seq_id => 'chr11', -start => $start, -end => $end );
say sprintf "Testing interval %s with length %d", $test_interval, $test_interval->length;
say "-----";
for(0..5) {
my $base = $test_interval->length;
my $to = $base + $_;
_debug_resize $test_interval, $to, "$base -> $to (+ $_)";
}
say "";
}
}
undef;
use Bio::DB::HTS::Faidx;
my $hg_db = Bio::DB::HTS::Faidx->new( $hg_bgz_path );
sub extract_sequence {
my ($db, $interval) = @_;
my $chrom_length = $db->length($interval->seq_id);
my $trimmed_interval = $interval->clone;
$trimmed_interval->start( List::Util::max( $interval->start, 1 ) );
$trimmed_interval->end( List::Util::min( $interval->end , $chrom_length ) );
# Bio::DB::HTS::Faidx is 0-based for both start and end points
my $seq = $db->get_sequence2_no_length(
$trimmed_interval->seq_id,
$trimmed_interval->start - 1,
$trimmed_interval->end - 1,
);
my $pad_upstream = 'N' x List::Util::max( -($interval->start-1), 0 );
my $pad_downstream = 'N' x List::Util::max( $interval->end - $chrom_length, 0 );
return join '', $pad_upstream, $seq, $pad_downstream;
}
sub seq_info {
my ($seq, $n) = @_;
$n ||= 10;
if( length $seq > $n ) {
sprintf "%s...%s (length %d)", uc substr($seq, 0, $n), uc substr($seq, -$n), length $seq;
} else {
sprintf "%s (length %d)", uc $seq, length $seq;
}
}
####
{
say "Testing sequence extraction:";
say "1 base: ", seq_info
extract_sequence( $hg_db,
Interval->new( -seq_id => 'chr11',
-start => 35_082_742 + 1,
-end => 35_082_742 + 1 ) );
say "3 bases: ", seq_info
extract_sequence( $hg_db,
Interval->new( -seq_id => 'chr11',
-start => 1,
-end => 1 )->resize(3) );
say "5 bases: ", seq_info
extract_sequence( $hg_db,
Interval->new( -seq_id => 'chr11',
-start => $hg_db->length('chr11'),
-end => $hg_db->length('chr11') )->resize(5) );
say "chr11 is of length ", $hg_db->length('chr11');
say "chr11 bases: ", seq_info
extract_sequence( $hg_db,
Interval->new( -seq_id => 'chr11',
-start => 1,
-end => $hg_db->length('chr11') )->resize( $hg_db->length('chr11') ) );
}
my $target_interval = Interval->new( -seq_id => 'chr11',
-start => 35_082_742 + 1, # BioPerl is 1-based
-end => 35_197_430 );
say "Target interval: $target_interval with length @{[ $target_interval->length ]}";
die "Target interval is not $model_central_base_pairs_length bp long"
unless $target_interval->length == $model_central_base_pairs_length;
say "Target sequence is ", seq_info extract_sequence( $hg_db, $target_interval );
say "";
my $resized_interval = $target_interval->resize( $model_sequence_length );
say "Resized interval: $resized_interval with length @{[ $resized_interval->length ]}";
die "resize() is not working properly!" unless $resized_interval->length == $model_sequence_length;
my $seq = extract_sequence( $hg_db, $resized_interval );
say "Resized sequence is ", seq_info($seq);
my $sequence_one_hot = one_hot_dna( $seq )->dummy(-1);
say $sequence_one_hot->info; undef;
use Devel::Timer;
my $t = Devel::Timer->new;
$t->mark('prediction of sequence');
my $predictions = $predict_on_batch->( $session, FloatPDLTOTFTensor( $sequence_one_hot ) );
$t->mark('End of prediction of sequence');
p $predictions;
$t->mark('END');
$t->report();
my $predictions_p = FloatTFTensorToPDL($predictions)->slice(',,(0)');
say $predictions_p->info; undef;
my @tracks = (
[ 'DNASE:CD14-positive monocyte female' => 41 => $predictions_p->slice('(41)') ],
[ 'DNASE:keratinocyte female' => 42 => $predictions_p->slice('(42)') ],
[ 'CHIP:H3K27ac:keratinocyte female' => 706 => $predictions_p->slice('(706)')],
[ 'CAGE:Keratinocyte - epidermal' => 4799 => log10(1 + $predictions_p->slice('(4799)')) ],
);
use PDL::Graphics::Gnuplot;
my $plot_output_path = 'enformer-target-interval-tracks.png';
my $gp = gpwin('pngcairo', font => ",10", output => $plot_output_path, size => [10,2. * @tracks], aa => 2 );
$gp->multiplot( layout => [1, scalar @tracks], title => $target_interval );
$gp->options(
offsets => [ graph => "0.01, 0, 0, 0" ],
lmargin => "at screen 0.05",
);
my $x = zeroes($predictions_p->dim(1))->xlinvals($target_interval->start, $target_interval->end);
my @tics_opts = (mirror => 0, out => 1);
for my $i (0..$#tracks) {
my ($title, $id, $y) = @{$tracks[$i]};
$gp->plot( {
title => $title,
border => [2],
ytics => { @tics_opts, locations => [ ceil(($y->max-$y->min)/2)->sclr ] },
( $i == $#tracks
? ( xtics => { format => '%.3f', @tics_opts } )
: ( xtics => 0 ) ),
( $i == $#tracks ? ( xlabel => 'location ({/Symbol \264}10^7 bases)' ) : () ),
},
with => 'filledcurves',
#'lc' => '#086eb5',
# $x scaled by 1e7; filled curve between $y and the x-axis
$x / 1e7, $y, pdl(0)
);
}
$gp->end_multi;
$gp->close;
if( IN_IPERL ) {
IPerl->png( bytestream => path($plot_output_path)->slurp_raw );
}
# Some code that could be used for working with variants.
1 if <<'COMMENT';
use Bio::DB::HTS::VCF;
my $clinvar_tbi_path = "${clinvar_path}.tbi";
unless( -f $clinvar_tbi_path ) {
system( qw(tabix), $clinvar_path );
}
my $v = Bio::DB::HTS::VCF->new( filename => $clinvar_path );
$v->num_variants
COMMENT
undef;
use Filesys::DiskUsage qw/du/;
my $total = du( { 'human-readable' => 1, dereference => 1 },
$model_archive_path, $model_base, $new_model_base,
$targets_path,
$hg_gz_path,
$hg_bgz_path, $hg_bgz_fai_path,
$clinvar_path,
$plot_output_path,
lib/AI/TensorFlow/Libtensorflow/Manual/Notebook/InferenceUsingTFHubEnformerGeneExprPredModel.pod view on Meta::CPAN
say "One-hot encoding of sequence '$onehot_test_seq' is:";
say $test_encoded->info, $test_encoded;
}
B<STREAM (STDOUT)>:
Testing one-hot encoding:
Encoder is
PDL: Float D [5,4]
[
[0 1 0 0 0]
[0 0 1 0 0]
[0 0 0 1 0]
[0 0 0 0 1]
]
One-hot encoding of sequence 'ACGTNtgcan' is:
PDL: Float D [4,10]
[
[1 0 0 0]
[0 1 0 0]
[0 0 1 0]
[0 0 0 1]
[0 0 0 0]
[0 0 0 1]
[0 0 1 0]
[0 1 0 0]
[1 0 0 0]
[0 0 0 0]
]
B<RESULT>:
1
Note that in the above, the PDL ndarray's
=over
=item *
first dimension is 4 which matches the last dimension of the input C<TFTensor>;
=item *
second dimension is the sequence length which matches the penultimate dimension of the input C<TFTensor>.
=back
Now we need a way to deal with the sequence interval. We're going to use 1-based coordinates as BioPerl does. In fact, we'll extend a BioPerl class.
package Interval {
use Bio::Location::Simple ();
use parent qw(Bio::Location::Simple);
sub center {
my $self = shift;
my $center = int( ($self->start + $self->end ) / 2 );
my $delta = ($self->start + $self->end ) % 2;
return $center + $delta;
}
sub resize {
my ($self, $width) = @_;
my $new_interval = $self->clone;
my $center = $self->center;
my $half = int( ($width-1) / 2 );
my $offset = ($width-1) % 2;
$new_interval->start( $center - $half - $offset );
$new_interval->end( $center + $half );
return $new_interval;
}
use overload '""' => \&_op_stringify;
sub _op_stringify { sprintf "%s:%s", $_[0]->seq_id // "(no sequence)", $_[0]->to_FTstring }
}
#####
{
say "Testing interval resizing:\n";
sub _debug_resize {
my ($interval, $to, $msg) = @_;
my $resized_interval = $interval->resize($to);
die "Wrong interval size for $interval --($to)--> $resized_interval"
unless $resized_interval->length == $to;
say sprintf "Interval: %s -> %s, length %2d : %s",
$interval,
$resized_interval, $resized_interval->length,
$msg;
}
for my $interval_spec ( [4, 8], [5, 8], [5, 9], [6, 9]) {
my ($start, $end) = @$interval_spec;
my $test_interval = Interval->new( -seq_id => 'chr11', -start => $start, -end => $end );
say sprintf "Testing interval %s with length %d", $test_interval, $test_interval->length;
say "-----";
for(0..5) {
my $base = $test_interval->length;
my $to = $base + $_;
_debug_resize $test_interval, $to, "$base -> $to (+ $_)";
}
say "";
}
}
undef;
B<STREAM (STDOUT)>:
Testing interval resizing:
Testing interval chr11:4..8 with length 5
-----
Interval: chr11:4..8 -> chr11:4..8, length 5 : 5 -> 5 (+ 0)
Interval: chr11:4..8 -> chr11:3..8, length 6 : 5 -> 6 (+ 1)
Interval: chr11:4..8 -> chr11:3..9, length 7 : 5 -> 7 (+ 2)
Interval: chr11:4..8 -> chr11:2..9, length 8 : 5 -> 8 (+ 3)
Interval: chr11:4..8 -> chr11:2..10, length 9 : 5 -> 9 (+ 4)
Interval: chr11:4..8 -> chr11:1..10, length 10 : 5 -> 10 (+ 5)
Testing interval chr11:5..8 with length 4
-----
Interval: chr11:5..8 -> chr11:5..8, length 4 : 4 -> 4 (+ 0)
Interval: chr11:5..8 -> chr11:5..9, length 5 : 4 -> 5 (+ 1)
Interval: chr11:5..8 -> chr11:4..9, length 6 : 4 -> 6 (+ 2)
Interval: chr11:5..8 -> chr11:4..10, length 7 : 4 -> 7 (+ 3)
Interval: chr11:5..8 -> chr11:3..10, length 8 : 4 -> 8 (+ 4)
Interval: chr11:5..8 -> chr11:3..11, length 9 : 4 -> 9 (+ 5)
Testing interval chr11:5..9 with length 5
-----
Interval: chr11:5..9 -> chr11:5..9, length 5 : 5 -> 5 (+ 0)
Interval: chr11:5..9 -> chr11:4..9, length 6 : 5 -> 6 (+ 1)
Interval: chr11:5..9 -> chr11:4..10, length 7 : 5 -> 7 (+ 2)
Interval: chr11:5..9 -> chr11:3..10, length 8 : 5 -> 8 (+ 3)
Interval: chr11:5..9 -> chr11:3..11, length 9 : 5 -> 9 (+ 4)
Interval: chr11:5..9 -> chr11:2..11, length 10 : 5 -> 10 (+ 5)
Testing interval chr11:6..9 with length 4
-----
Interval: chr11:6..9 -> chr11:6..9, length 4 : 4 -> 4 (+ 0)
Interval: chr11:6..9 -> chr11:6..10, length 5 : 4 -> 5 (+ 1)
Interval: chr11:6..9 -> chr11:5..10, length 6 : 4 -> 6 (+ 2)
Interval: chr11:6..9 -> chr11:5..11, length 7 : 4 -> 7 (+ 3)
Interval: chr11:6..9 -> chr11:4..11, length 8 : 4 -> 8 (+ 4)
Interval: chr11:6..9 -> chr11:4..12, length 9 : 4 -> 9 (+ 5)
use Bio::DB::HTS::Faidx;
my $hg_db = Bio::DB::HTS::Faidx->new( $hg_bgz_path );
sub extract_sequence {
my ($db, $interval) = @_;
my $chrom_length = $db->length($interval->seq_id);
my $trimmed_interval = $interval->clone;
$trimmed_interval->start( List::Util::max( $interval->start, 1 ) );
$trimmed_interval->end( List::Util::min( $interval->end , $chrom_length ) );
# Bio::DB::HTS::Faidx is 0-based for both start and end points
my $seq = $db->get_sequence2_no_length(
$trimmed_interval->seq_id,
$trimmed_interval->start - 1,
$trimmed_interval->end - 1,
);
my $pad_upstream = 'N' x List::Util::max( -($interval->start-1), 0 );
my $pad_downstream = 'N' x List::Util::max( $interval->end - $chrom_length, 0 );
return join '', $pad_upstream, $seq, $pad_downstream;
}
sub seq_info {
my ($seq, $n) = @_;
$n ||= 10;
if( length $seq > $n ) {
sprintf "%s...%s (length %d)", uc substr($seq, 0, $n), uc substr($seq, -$n), length $seq;
} else {
sprintf "%s (length %d)", uc $seq, length $seq;
}
}
####
{
say "Testing sequence extraction:";
say "1 base: ", seq_info
extract_sequence( $hg_db,
Interval->new( -seq_id => 'chr11',
-start => 35_082_742 + 1,
-end => 35_082_742 + 1 ) );
say "3 bases: ", seq_info
extract_sequence( $hg_db,
Interval->new( -seq_id => 'chr11',
-start => 1,
-end => 1 )->resize(3) );
say "5 bases: ", seq_info
extract_sequence( $hg_db,
Interval->new( -seq_id => 'chr11',
-start => $hg_db->length('chr11'),
-end => $hg_db->length('chr11') )->resize(5) );
say "chr11 is of length ", $hg_db->length('chr11');
say "chr11 bases: ", seq_info
extract_sequence( $hg_db,
Interval->new( -seq_id => 'chr11',
-start => 1,
-end => $hg_db->length('chr11') )->resize( $hg_db->length('chr11') ) );
}
B<STREAM (STDOUT)>:
Testing sequence extraction:
1 base: G (length 1)
3 bases: NNN (length 3)
5 bases: NNNNN (length 5)
chr11 is of length 135086622
chr11 bases: NNNNNNNNNN...NNNNNNNNNN (length 135086622)
B<RESULT>:
1
Now we can use the same target interval that is used in the example notebook which recreates part of L<figure 1|https://www.nature.com/articles/s41592-021-01252-x/figures/1> from the Enformer paper.
my $target_interval = Interval->new( -seq_id => 'chr11',
-start => 35_082_742 + 1, # BioPerl is 1-based
-end => 35_197_430 );
say "Target interval: $target_interval with length @{[ $target_interval->length ]}";
die "Target interval is not $model_central_base_pairs_length bp long"
unless $target_interval->length == $model_central_base_pairs_length;
say "Target sequence is ", seq_info extract_sequence( $hg_db, $target_interval );
say "";
my $resized_interval = $target_interval->resize( $model_sequence_length );
say "Resized interval: $resized_interval with length @{[ $resized_interval->length ]}";
die "resize() is not working properly!" unless $resized_interval->length == $model_sequence_length;
my $seq = extract_sequence( $hg_db, $resized_interval );
say "Resized sequence is ", seq_info($seq);
B<STREAM (STDOUT)>:
Target interval: chr11:35082743..35197430 with length 114688
Target sequence is GGTGGCAGCC...ATCTCCTTTT (length 114688)
Resized interval: chr11:34943479..35336694 with length 393216
Resized sequence is ACTAGTTCTA...GGCCCAAATC (length 393216)
B<RESULT>:
1
To prepare the input we have to one-hot encode this resized sequence and give it a dummy dimension at the end to indicate that it is is a batch with a single sequence. Then we can turn the PDL ndarray into a C<TFTensor> and pass it to our prediction ...
my $sequence_one_hot = one_hot_dna( $seq )->dummy(-1);
say $sequence_one_hot->info; undef;
B<STREAM (STDOUT)>:
PDL: Float D [4,393216,1]
use Devel::Timer;
my $t = Devel::Timer->new;
$t->mark('prediction of sequence');
my $predictions = $predict_on_batch->( $session, FloatPDLTOTFTensor( $sequence_one_hot ) );
$t->mark('End of prediction of sequence');
p $predictions;
$t->mark('END');
$t->report();
B<STREAM (STDERR)>:
=begin html
<span style="display:inline-block;margin-left:1em;"><pre style="display: block"><code><span style="color: #cc66cc;">AI::TensorFlow::Libtensorflow::Tensor</span><span style=""> </span><span style="color: #33ccff;">{</span><span style="">
</span><span style="color: #6666cc;">Type </span><span style=""> </span><span style="color: #cc66cc;">FLOAT</span><span style="">
</span><span style="color: #6666cc;">Dims </span><span style=""> </span><span style="color: #33ccff;">[</span><span style=""> </span><span style="color: #ff6633;">1</span><span style=""> </span><span style="color: #ff6633;">896</span><s...
</span><span style="color: #6666cc;">NumDims </span><span style=""> </span><span style="color: #ff6633;">3</span><span style="">
</span><span style="color: #6666cc;">ElementCount </span><span style=""> </span><span style="color: #ff6633;">4760448</span><span style="">
</span><span style="color: #33ccff;">}</span><span style="">
Devel::Timer Report -- Total time: 14.5641 secs
Interval Time Percent
----------------------------------------------
01 -> 02 14.5634 100.00% prediction of sequence -> End of prediction of sequence
02 -> 03 0.0007 0.00% End of prediction of sequence -> END
00 -> 01 0.0000 0.00% INIT -> prediction of sequence
</span></code></pre></span>
=end html
Now we turn the C<TFTensor> output into a PDL ndarray.
my $predictions_p = FloatTFTensorToPDL($predictions)->slice(',,(0)');
say $predictions_p->info; undef;
B<STREAM (STDOUT)>:
PDL: Float D [5313,896]
=head2 Plot predicted tracks
These predictions can be plotted
my @tracks = (
[ 'DNASE:CD14-positive monocyte female' => 41 => $predictions_p->slice('(41)') ],
[ 'DNASE:keratinocyte female' => 42 => $predictions_p->slice('(42)') ],
[ 'CHIP:H3K27ac:keratinocyte female' => 706 => $predictions_p->slice('(706)')],
[ 'CAGE:Keratinocyte - epidermal' => 4799 => log10(1 + $predictions_p->slice('(4799)')) ],
);
use PDL::Graphics::Gnuplot;
my $plot_output_path = 'enformer-target-interval-tracks.png';
my $gp = gpwin('pngcairo', font => ",10", output => $plot_output_path, size => [10,2. * @tracks], aa => 2 );
$gp->multiplot( layout => [1, scalar @tracks], title => $target_interval );
$gp->options(
offsets => [ graph => "0.01, 0, 0, 0" ],
lmargin => "at screen 0.05",
);
my $x = zeroes($predictions_p->dim(1))->xlinvals($target_interval->start, $target_interval->end);
my @tics_opts = (mirror => 0, out => 1);
for my $i (0..$#tracks) {
my ($title, $id, $y) = @{$tracks[$i]};
$gp->plot( {
title => $title,
border => [2],
ytics => { @tics_opts, locations => [ ceil(($y->max-$y->min)/2)->sclr ] },
( $i == $#tracks
? ( xtics => { format => '%.3f', @tics_opts } )
: ( xtics => 0 ) ),
( $i == $#tracks ? ( xlabel => 'location ({/Symbol \264}10^7 bases)' ) : () ),
},
with => 'filledcurves',
#'lc' => '#086eb5',
# $x scaled by 1e7; filled curve between $y and the x-axis
$x / 1e7, $y, pdl(0)
);
}
$gp->end_multi;
$gp->close;
if( IN_IPERL ) {
IPerl->png( bytestream => path($plot_output_path)->slurp_raw );
}
B<DISPLAY>:
=for html <span style="display:inline-block;margin-left:1em;"><p><img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAA+gAAAMgCAIAAAA/et9qAAAgAElEQVR4nOzdd2AUVeIH8Ddb0jshBAIEpSo1GjoIpyAgCOqd3uGdoGBBUQQFRUVBRbkTf9gOBQucqFiwUhSSgJQYCCSBkJBAet1k...
=head2 Parts of the original notebook that fall outside the scope
In the orignal notebook, there are several more steps that have not been ported here:
=over
=item 1.
"Compute contribution scores":
This task requires implementing C<@tf.function> to compile gradients.
=item 2.
"Predict the effect of a genetic variant" and "Score multiple variants":
The first task is possible, but the second task requires loading a pre-processing pipeline for scikit-learn and unfortunately this pipeline is stored as a pickle file that is valid for an older version of scikit-learn (version 0.23.2) and as such its...
=back
# Some code that could be used for working with variants.
1 if <<'COMMENT';
use Bio::DB::HTS::VCF;
( run in 0.740 second using v1.01-cache-2.11-cpan-39bf76dae61 )