BioPerl

 view release on metacpan or  search on metacpan

Bio/Assembly/IO/tigr.pm  view on Meta::CPAN

    );

    # Add read location and sequence to contig (in 'gapped consensus' coordinates)
    $$readinfo{'aln_start'} = $$readinfo{'offset'} + 1; # seq offset is in gapped coordinates
    $$readinfo{'aln_end'} = $$readinfo{'aln_start'} + length($$readinfo{'lsequence'}) - 1; # lsequence is aligned seq
    my $alncoord = Bio::SeqFeature::Generic->new(
        -primary     => $readobj->id,
        -start       => $$readinfo{'aln_start'},
        -end         => $$readinfo{'aln_end'},
        -strand      => $$readinfo{'strand'},
        -tag         => { 'contig' => $contigobj->id() }
    );
    $contigobj->set_seq_coord($alncoord, $readobj);
 
    # Add quality clipping read information in contig features
    # (from 'aligned read' to 'gapped consensus' coordinates)
    $$readinfo{'clip_start'} = $contigobj->change_coord('aligned '.$readobj->id, 'gapped consensus', $$readinfo{'seq_lend'});
    $$readinfo{'clip_end'}   = $contigobj->change_coord('aligned '.$readobj->id, 'gapped consensus', $$readinfo{'seq_rend'});
    my $clipcoord = Bio::SeqFeature::Generic->new(
        -primary     => '_quality_clipping',
        -source      => $readobj->id,
        -start       => $$readinfo{'clip_start'},
        -end         => $$readinfo{'clip_end'},
        -strand      => $$readinfo{'strand'}
    );
    $clipcoord->attach_seq($readobj);
    $contigobj->add_features([ $clipcoord ], 0);

    # Add other misc read information as subsequence feature
    my $readtags = Bio::SeqFeature::Generic->new(
        -primary     => '_main_read_feature',
        -source      => $readobj->id,
        -start       => $$readinfo{'aln_start'},
        -end         => $$readinfo{'aln_end'},
        -strand      => $$readinfo{'strand'},
        -tag         => { 'best'    => $$readinfo{'best'},
                          'comment' => $$readinfo{'comment'} }
    );
    $contigobj->get_features_collection->add_features([$readtags]);
    $contigobj->get_features_collection->add_SeqFeature($alncoord, $readtags);

    return $readobj;
}


=head2 _store_singlet

    Title   : _store_singlet
    Usage   : my $singletobj = $self->_store_read(\%readinfo, \%contiginfo);
    Function: store information of a singlet belonging to a scaffold in a singlet object
    Returns : Bio::Assembly::Singlet
    Args    : hash, hash

=cut

sub _store_singlet {
    my ($self, $readinfo, $contiginfo) = @_;
    # Singlets in TIGR_Assembler are represented as a contig of one sequence
    # We try to simulate this duality by playing around with the Singlet object

    my $contigid = $$contiginfo{'asmbl_id'};
    my $readid   = $self->_merge_seq_name_and_db($$readinfo{'seq_name'}, $$readinfo{'db'});

    # Create a sequence object
    #$$contiginfo{'llength'} = length($$contiginfo{'lsequence'});
    my $seqobj = Bio::Seq::Quality->new(
       -primary_id => $readid,
       -display_id => $readid,
       -seq        => $$contiginfo{'lsequence'}, # do not use $$readinfo as ambiguities are uppercase
       -start      => 1,
       -strand     => $$readinfo{'strand'},
       -alphabet   => 'dna',
       -qual       => $self->_qual_hex2dec($$contiginfo{'quality'})    
   );

   # Create singlet from sequence and add it to scaffold
   my $singletobj = Bio::Assembly::Singlet->new(
     -id     => $contigid,
     -seqref => $seqobj
   );

   # Add other misc contig information as features of the singlet
   my $contigtags = Bio::SeqFeature::Generic->new(
        -primary     => '_main_contig_feature',
        -source      => $contigid,
        -start       => 1,
        -end         => $singletobj->get_consensus_length(),
        -strand      => 1,
        -tag         => { 'seq_id'     => $$contiginfo{'seq_id'},
                          'com_name'   => $$contiginfo{'com_name'},
                          'type'       => $$contiginfo{'type'},
                          'method'     => $$contiginfo{'method'},
                          'ed_status'  => $$contiginfo{'ed_status'},
                          'full_cds'   => $$contiginfo{'full_cds'},
                          'cds_start'  => $$contiginfo{'cds_start'},
                          'cds_end'    => $$contiginfo{'cds_end'},
                          'ed_pn'      => $$contiginfo{'ed_pn'},
                          'ed_date'    => $$contiginfo{'ed_date'},
                          'comment'    => $$contiginfo{'comment'},
                          'frameshift' => $$contiginfo{'frameshift'} }
   );
   $singletobj->add_features([ $contigtags ], 1);

   # Add read location and sequence to singlet features (in 'gapped consensus' coordinates)
   $$readinfo{'aln_start'} = $$readinfo{'offset'} + 1; # seq offset is in gapped coordinates
   $$readinfo{'aln_end'} = $$readinfo{'aln_start'} + length($$readinfo{'lsequence'}) - 1; # lsequence is aligned seq

   my $alncoord = Bio::SeqFeature::Generic->new(
       -primary     => '_aligned_coord',
       -source      => $readid,
       -start       => $$readinfo{'aln_start'},
       -end         => $$readinfo{'aln_end'},
       -strand      => $$readinfo{'strand'},
       -tag         => { 'contig' => $contigid }
   );
   $alncoord->attach_seq($singletobj->seqref);
   $singletobj->add_features([ $alncoord ], 0);

   # Add quality clipping read information in singlet features
   # (from 'aligned read' to 'gapped consensus' coordinates)
   $$readinfo{'clip_start'} = $$readinfo{'seq_lend'};
   $$readinfo{'clip_end'}   = $$readinfo{'seq_rend'};
   my $clipcoord = Bio::SeqFeature::Generic->new(
       -primary     => '_quality_clipping',
       -source      => $readid,
       -start       => $$readinfo{'clip_start'},
       -end         => $$readinfo{'clip_end'},
       -strand      => $$readinfo{'strand'},
       -tag         => { 'contig' => $contigid }
   );
   $clipcoord->attach_seq($singletobj->seqref);
   $singletobj->add_features([ $clipcoord ], 0);

   # Add other misc read information as subsequence feature
   my $readtags = Bio::SeqFeature::Generic->new(
       -primary     => '_main_read_feature',
       -source      => $readid,
       -start       => $$readinfo{'aln_start'},
       -end         => $$readinfo{'aln_end'},
       -strand      => $$readinfo{'strand'},
       -tag         => { 'best'    => $$readinfo{'best'},
                         'comment' => $$readinfo{'comment'} }
   );
   $singletobj->get_features_collection->add_features([$readtags]);
   $singletobj->get_features_collection->add_SeqFeature($alncoord, $readtags);

   return $singletobj;
}


=head2 write_assembly

    Title   : write_assembly
    Usage   : $asmio->write_assembly($assembly)
    Function: Write the assembly object in TIGR Assembler compatible format. The
              contig IDs are sorted naturally if the Sort::Naturally module is
              present, or lexically otherwise. Internally, write_assembly use
              the write_contig, write_footer and write_header methods. Use these
              methods if you want more control on the writing process.
    Returns : 1 on success, 0 for error
    Args    : A Bio::Assembly::Scaffold object
              1 to write singlets in the assembly file, 0 otherwise

=cut


=head2 write_contig

    Title   : write_contig
    Usage   : $asmio->write_contig($contig)
    Function: Write a contig or singlet object in TIGR compatible format. Quality
              scores are automatically generated if the contig does not contain
              any
    Returns : 1 on success, 0 for error
    Args    : A Bio::Assembly::Contig or Singlet object

=cut

sub write_contig {
    my ($self, @args) = @_;
    my ($contigobj) = $self->_rearrange([qw(CONTIG)], @args);

    # Sanity check
    if ( !$contigobj || !$contigobj->isa('Bio::Assembly::Contig') ) {
        $self->throw("Must provide a Bio::Assembly::Contig or Singlet object when calling write_contig");
    }

    my $decimal_format = '%.2f';
    my $contigid = $contigobj->id;
    my $numseqs = $contigobj->num_sequences;

    if ( $contigobj->isa('Bio::Assembly::Singlet') ) {
        # This is a singlet
        my $readid     = $contigobj->seqref->id;      
        my $singletobj = $contigobj;

        # Get contig information
        my ($contanno) = $singletobj->get_features_collection->get_features_by_type("_main_contig_feature:$contigid");

        my %contiginfo;
        $contiginfo{'sequence'}   = $singletobj->seqref->seq;
        $contiginfo{'lsequence'}  = $contiginfo{'sequence'};
        $contiginfo{'quality'}    = $self->_qual_dec2hex(
            join ' ', @{$singletobj->get_consensus_quality->qual} );
        $contiginfo{'asmbl_id'}   = $contigid;
        $contiginfo{'seq_id'}     = ($contanno->get_tag_values('seq_id'))[0];   
        $contiginfo{'com_name'}   = ($contanno->get_tag_values('com_name'))[0];
        $contiginfo{'type'}       = ($contanno->get_tag_values('type'))[0];
        $contiginfo{'method'}     = ($contanno->get_tag_values('method'))[0];
        $contiginfo{'ed_status'}  = ($contanno->get_tag_values('ed_status'))[0];
        $contiginfo{'redundancy'} = sprintf($decimal_format, 1);
        $contiginfo{'perc_N'}     = sprintf(
            $decimal_format, $self->_perc_N($contiginfo{'sequence'}));
        $contiginfo{'seqnum'}     = 1;
        $contiginfo{'full_cds'}   = ($contanno->get_tag_values('full_cds'))[0];
        $contiginfo{'cds_start'}  = ($contanno->get_tag_values('cds_start'))[0];
        $contiginfo{'cds_end'}    = ($contanno->get_tag_values('cds_end'))[0];
        $contiginfo{'ed_pn'}      = ($contanno->get_tag_values('ed_pn'))[0];
        $contiginfo{'ed_date'}    = $self->_date_time;
        $contiginfo{'comment'}    = ($contanno->get_tag_values('comment'))[0];
        $contiginfo{'frameshift'} = ($contanno->get_tag_values('frameshift'))[0];

        # Check that no tag value is undef
        $contiginfo{'seq_id'}     = '' unless defined $contiginfo{'seq_id'};
        $contiginfo{'com_name'}   = '' unless defined $contiginfo{'com_name'};
        $contiginfo{'type'}       = '' unless defined $contiginfo{'type'};
        $contiginfo{'method'}     = '' unless defined $contiginfo{'method'};
        $contiginfo{'ed_status'}  = '' unless defined $contiginfo{'ed_status'};
        $contiginfo{'full_cds'}   = '' unless defined $contiginfo{'full_cds'};
        $contiginfo{'cds_start'}  = '' unless defined $contiginfo{'cds_start'};
        $contiginfo{'cds_end'}    = '' unless defined $contiginfo{'cds_end'};
        $contiginfo{'ed_pn'}      = '' unless defined $contiginfo{'ed_pn'};
        $contiginfo{'comment'}    = '' unless defined $contiginfo{'comment'};
        $contiginfo{'frameshift'} = '' unless defined $contiginfo{'frameshift'};
            
        # Print singlet information
        $self->_print(
            "sequence\t$contiginfo{'sequence'}\n".
            "lsequence\t$contiginfo{'lsequence'}\n".
            "quality\t$contiginfo{'quality'}\n".
            "asmbl_id\t$contiginfo{'asmbl_id'}\n".
            "seq_id\t$contiginfo{'seq_id'}\n".
            "com_name\t$contiginfo{'com_name'}\n".
            "type\t$contiginfo{'type'}\n".
            "method\t$contiginfo{'method'}\n".
            "ed_status\t$contiginfo{'ed_status'}\n".
            "redundancy\t$contiginfo{'redundancy'}\n".
            "perc_N\t$contiginfo{'perc_N'}\n".
            "seq#\t$contiginfo{'seqnum'}\n".
            "full_cds\t$contiginfo{'full_cds'}\n".
            "cds_start\t$contiginfo{'cds_start'}\n".
            "cds_end\t$contiginfo{'cds_end'}\n".
            "ed_pn\t$contiginfo{'ed_pn'}\n".
            "ed_date\t$contiginfo{'ed_date'}\n".
            "comment\t$contiginfo{'comment'}\n".
            "frameshift\t$contiginfo{'frameshift'}\n".
            "\n"
        );

        # Get read information
        my ($seq_name, $db) = $self->_split_seq_name_and_db($readid);
        my ($clipcoord) = $singletobj->get_features_collection->get_features_by_type("_quality_clipping:$readid");
        my ($alncoord) = $singletobj->get_features_collection->get_features_by_type("_aligned_coord:$readid");
        my ($readanno) = $singletobj->get_features_collection->get_features_by_type("_main_read_feature:$readid");
        my %readinfo;
        $readinfo{'seq_name'}  = $seq_name;
        $readinfo{'asm_lend'}  = $alncoord->location->start;
        $readinfo{'asm_rend'}  = $alncoord->location->end;
        $readinfo{'seq_lend'}  = $clipcoord->location->start;
        $readinfo{'seq_rend'}  = $clipcoord->location->end;
        $readinfo{'best'}      = ($readanno->get_tag_values('best'))[0];
        $readinfo{'comment'}   = ($readanno->get_tag_values('comment'))[0];
        $readinfo{'db'}        = $db;
        $readinfo{'offset'}    = 0;
        # ambiguities in read sequence are uppercase
        $readinfo{'lsequence'} = uc($contiginfo{'lsequence'});
        
        # Check that no tag value is undef
        $readinfo{'best'}    = '' unless defined $readinfo{'best'};
        $readinfo{'comment'} = '' unless defined $readinfo{'comment'};

        # Print read information
        $self->_print(
            "seq_name\t$readinfo{'seq_name'}\n".
            "asm_lend\t$readinfo{'asm_lend'}\n".
            "asm_rend\t$readinfo{'asm_rend'}\n".
            "seq_lend\t$readinfo{'seq_lend'}\n".
            "seq_rend\t$readinfo{'seq_rend'}\n".
            "best\t$readinfo{'best'}\n".
            "comment\t$readinfo{'comment'}\n".
            "db\t$readinfo{'db'}\n".
            "offset\t$readinfo{'offset'}\n".
            "lsequence\t$readinfo{'lsequence'}\n"
        );
        $self->_print("|\n");

    } else {
        # This is a contig
        # Get contig information
        my ($contanno) = $contigobj->get_features_collection->get_features_by_type("_main_contig_feature:$contigid");
        my %contiginfo;
        $contiginfo{'sequence'}   = $self->_ungap(
            $contigobj->get_consensus_sequence->seq);
        $contiginfo{'lsequence'}  = $contigobj->get_consensus_sequence->seq;
        $contiginfo{'quality'}    = $self->_qual_dec2hex(
            join ' ', @{$contigobj->get_consensus_quality->qual});
        $contiginfo{'asmbl_id'}   = $contigid;
        $contiginfo{'seq_id'}     = ($contanno->get_tag_values('seq_id'))[0];
        $contiginfo{'com_name'}   = ($contanno->get_tag_values('com_name'))[0];
        $contiginfo{'type'}       = ($contanno->get_tag_values('type'))[0];
        $contiginfo{'method'}     = ($contanno->get_tag_values('method'))[0];
        $contiginfo{'ed_status'}  = ($contanno->get_tag_values('ed_status'))[0];
        $contiginfo{'redundancy'} = sprintf(
            $decimal_format, $self->_redundancy($contigobj));
        $contiginfo{'perc_N'}     = sprintf(
            $decimal_format, $self->_perc_N($contiginfo{'sequence'}));
        $contiginfo{'seqnum'}     = $contigobj->num_sequences;
        $contiginfo{'full_cds'}   = ($contanno->get_tag_values('full_cds'))[0];
        $contiginfo{'cds_start'}  = ($contanno->get_tag_values('cds_start'))[0];
        $contiginfo{'cds_end'}    = ($contanno->get_tag_values('cds_end'))[0];
        $contiginfo{'ed_pn'}      = ($contanno->get_tag_values('ed_pn'))[0];
        $contiginfo{'ed_date'}    = $self->_date_time;
        $contiginfo{'comment'}    = ($contanno->get_tag_values('comment'))[0];
        $contiginfo{'frameshift'} = ($contanno->get_tag_values('frameshift'))[0];
            
        # Check that no tag value is undef
        $contiginfo{'seq_id'}     = '' unless defined $contiginfo{'seq_id'};
        $contiginfo{'com_name'}   = '' unless defined $contiginfo{'com_name'};
        $contiginfo{'type'}       = '' unless defined $contiginfo{'type'};
        $contiginfo{'method'}     = '' unless defined $contiginfo{'method'};
        $contiginfo{'ed_status'}  = '' unless defined $contiginfo{'ed_status'};
        $contiginfo{'full_cds'}   = '' unless defined $contiginfo{'full_cds'};
        $contiginfo{'cds_start'}  = '' unless defined $contiginfo{'cds_start'};
        $contiginfo{'cds_end'}    = '' unless defined $contiginfo{'cds_end'};
        $contiginfo{'ed_pn'}      = '' unless defined $contiginfo{'ed_pn'};
        $contiginfo{'comment'}    = '' unless defined $contiginfo{'comment'};
        $contiginfo{'frameshift'} = '' unless defined $contiginfo{'frameshift'};

        # Print contig information
        $self->_print(
            "sequence\t$contiginfo{'sequence'}\n".
            "lsequence\t$contiginfo{'lsequence'}\n".
            "quality\t$contiginfo{'quality'}\n".
            "asmbl_id\t$contiginfo{'asmbl_id'}\n".
            "seq_id\t$contiginfo{'seq_id'}\n".
            "com_name\t$contiginfo{'com_name'}\n".
            "type\t$contiginfo{'type'}\n".
            "method\t$contiginfo{'method'}\n".
            "ed_status\t$contiginfo{'ed_status'}\n".
            "redundancy\t$contiginfo{'redundancy'}\n".
            "perc_N\t$contiginfo{'perc_N'}\n".
            "seq#\t$contiginfo{'seqnum'}\n".
            "full_cds\t$contiginfo{'full_cds'}\n".
            "cds_start\t$contiginfo{'cds_start'}\n".
            "cds_end\t$contiginfo{'cds_end'}\n".
            "ed_pn\t$contiginfo{'ed_pn'}\n".
            "ed_date\t$contiginfo{'ed_date'}\n".
            "comment\t$contiginfo{'comment'}\n".
            "frameshift\t$contiginfo{'frameshift'}\n".
            "\n"
        );
        my $seqno = 0;
        for my $readobj ( $contigobj->each_seq() ) {
            $seqno++;

            # Get read information
            my ($seq_name, $db) = $self->_split_seq_name_and_db($readobj->id);



( run in 0.600 second using v1.01-cache-2.11-cpan-5735350b133 )