Bio-Homology-InterologWalk

 view release on metacpan or  search on metacpan

lib/Bio/Homology/InterologWalk.pm  view on Meta::CPAN

     my $start_data_path   = $args{start_path};
     my $in_path           = $args{input_path};
     my $out_path          = $args{output_path};
     my $onetoone_only     = $args{hq_only};
     
     #MANAGE FILES
     my $dbh = _setup_dbi_connection($in_path);
     open (my $start_data, q{<}, $start_data_path) or croak("Unable to open $start_data_path : $!");
     open (my $out_data, q{>}, $out_path) or croak("Unable to open $out_path : $!");
     #============

     my $query;
     my %old_id_present = ();
     my %new_id_set = ();
     my %all_seen = (); #total number of nodes
     
     #we slurp all the ids in the initial file into a hash========
     my %start_data_set;
     while(<$start_data>) {
          chomp;
          next if($_ eq '');
          $start_data_set{$_} = 1;
     }
     my $number_of_elements_start_ds = keys %start_data_set;
     print("Number of unique ids in original dataset: $number_of_elements_start_ds\n");
     #=========================================================
     
     #now we get both interactors from the putative data set and we look at those that dont appear in the %start_data_set hash
     if($onetoone_only){
          $query = "SELECT    $FN_initid, $FN_interactor_id
                    FROM int
                    WHERE ($FN_odesc_1 like '%one2one%') 
                    AND ($FN_odesc_2 like '%one2one%')";
     }else{ # all orthology types
          $query = "SELECT    $FN_initid, $FN_interactor_id"
                    ." FROM int";
     }
     
     my $sth = $dbh->prepare($query);
     $sth->execute() or die "Cannot execute: " . $sth->errstr();
     
     my $rownumb = $sth->rows;             
     return if ($rownumb == 0); 
     #you need to do this with all subroutines
     
     while (my $row = $sth->fetchrow_hashref) {
          my $idIN = $row->{$FN_initid};
          my $idOUT = $row->{$FN_interactor_id};
     
          #first I count all of the unique ids present (ie network nodes)
          $all_seen{$idIN} = 1;    $all_seen{$idOUT} = 1;
          
          #then I count what fraction of the original ids actually made it to feature any putative interactions
          $old_id_present{$idIN} = 1 if(exists($start_data_set{$idIN}));
          $old_id_present{$idOUT} = 1 if(exists($start_data_set{$idOUT}));
     
          #lastly I store those new ids never seen in the starting dataset
          $new_id_set{$idOUT} = 1 unless(exists($start_data_set{$idOUT})); 
     }
     my $number_of_old_IDs = keys %old_id_present;
     my $percentage = ($number_of_old_IDs / $number_of_elements_start_ds) * 100;
     print("Number of IDs from the original dataset that appear in the network: $number_of_old_IDs\n");
     print("Percentage of IDs from the original dataset that appear in the final dataset: $percentage\n");
     
     my $number_of_new_IDs = keys %new_id_set;
     my $number_of_network_nodes = keys %all_seen;
     $percentage= ($number_of_new_IDs / $number_of_network_nodes) * 100;
     if($onetoone_only){
          print("Number of total UNIQUE IDs in interaction dataset (considering ONE-TO-ONE ortologies only): $number_of_network_nodes\n");
     }else{
          print("Number of total UNIQUE IDs in interaction dataset: $number_of_network_nodes\n");
     }
     print("Number of NEW ids (e.g. not seen in starting data set): $number_of_new_IDs\n");
     print("Percentage of new ids over the total: $percentage\n");
     
     
     #I save all the new ids in a flat file. This might be useful to do some analysis of their functional annotation
     foreach my $id (sort keys %new_id_set){
          #print ("$id\t$new_id_set{$id}\n");
          print $out_data $id, "\n";
     }
     $sth->finish();
     $dbh->disconnect();
     close($start_data);
     close($out_data);
     return 1;
}



####################################
#HELPER routines for internal usage
####################################

#
#_compare_uniprotkbids
#
# Usage     : How to use this function/method
# Purpose   : Suppose we query Intact with the Ensembl ID of a gene of interest, to look for interactors. Intact will not return a simple
#             list of interactors as desidered. It will instead return 0 or more lines of tsv data, each one describing a binary interaction, in MITAB format. 
#             Each lines will contain 2 ids, in uniprot format: one of them being our query id. As the returned format is Uniprot, while the query id was Ensembl,
#             we cannot possibly know which of the two is the query id and which is the interactor. Therefore we must carry out a conversion.
#             This routine looks for a uniprot id of the query id in Ensembl format. It then compares it to both retrieved id to decide which one is the output interactor
#             This is used by get_backward_orthologies().
# Returns   : a candidate id in uniprot format or UNDEF
# Argument  : gene adaptor, Ensembl Id of the query gene, the two uniprotKB accession number returned by Intact
# Throws    : -
# Comment   : This is a sample subroutine header.
#           : -
#
#See Also   : 
sub _compare_uniprotkbids{
     my ($adaptor, $ensembl_id, $uniprot_id_a, $uniprot_id_b) = @_;
     my %uniprotANseen = ();
     my $output_id;
     
     my $query_gene = $adaptor->fetch_by_stable_id($ensembl_id);
     return if(!$query_gene);
     my $uniprot_links = $query_gene->get_all_DBLinks("Uniprot%");
     
     foreach my $link (@$uniprot_links) {
          my $item = $link->primary_id;
          $uniprotANseen{$item} = 1;
     }
     
     if(exists $uniprotANseen{$uniprot_id_a}){
          $output_id = $uniprot_id_b;
     }elsif(exists $uniprotANseen{$uniprot_id_b}){
          $output_id = $uniprot_id_a;
     }else{
          print("WARNING uniprot-ids of $ensembl_id contain neither $uniprot_id_a nor $uniprot_id_b. Skipping..\n");
     }
     return $output_id;
}

lib/Bio/Homology/InterologWalk.pm  view on Meta::CPAN

          #the following filters the orthologies on the basis of their description             
          $DF_odesc = $homology->description;
          next if ($DF_odesc =~ /paralog/);
          #possible_ortholog: dubious duplication/speciation node labelling.
          next if ($DF_odesc =~ /possible_ortholog/);
          next if (($oo_only) && (!($DF_odesc =~ /^ortholog_one2one$/)));
          
          $counter += 1;
          #$osubtype = $homology->subtype; #not sure i need it for now
          #DNDS ratio
          $DF_dnds = $homology->dnds_ratio;#do not apply threshold for now
               
          my $genelist = $homology->gene_list;# genelist is a member object
          my $node_x; my $node_y;

          #genelist will contain ALL the genes in the orthology group: this means it will
          #also contain the original gene I made my query with.
          foreach my $homology_member (@{$genelist}){
               $DF_orthologue_id = $homology_member->stable_id;
               next if(!$DF_orthologue_id);
               my $canonical_pep_member = $homology_member->get_canonical_peptide_Member;
               if($canonical_pep_member){
                    my $mid = $canonical_pep_member->member_id;
                    if($pt_adaptor){
                    #TODO TEMP ADDED BECAUSE OF ERROR REPORTED ON 24/OCT TO ENSEMBL ML
                         if($DF_orthologue_id eq $init_id){ #node_x will contain the one I started with
                              $node_x = $pt_adaptor->fetch_AlignedMember_by_member_id_root_id($mid,1);
                         }else{
                              $node_y = $pt_adaptor->fetch_AlignedMember_by_member_id_root_id($mid,1);
                         } 
                    }
               }
          }
                    
          if((defined($node_x)) && (defined($node_y) )){
               my $root = $node_x->subroot; #method from nested set
                    if (defined ($root)){
               eval{
                    $root->merge_node_via_shared_ancestor($node_y);
                    $ancestor = $node_x->find_first_shared_ancestor($node_y);   
                    };
                    if($@){
                         print "\nAn error occurred ($@), continuing..\n";
                    }
               if (defined $ancestor){
                    $DF_fsa_x = $node_x->distance_to_ancestor($ancestor);
                         $DF_fsa_y = $node_y->distance_to_ancestor($ancestor);
                         $DF_nndist = $node_x->distance_to_node($node_y);
                    }
               }
          }

          foreach my $homology_member (@{$genelist}){
               $DF_orthologue_id = $homology_member->stable_id;
               next if ($DF_orthologue_id eq $init_id);#I dont want to print again the gene name
          
               $DF_oname = $homology_member->display_label;

               #OPI
               my $pairwise_alignment_from_multiple = $homology->get_SimpleAlign;
               $DF_opi = $pairwise_alignment_from_multiple->overall_percentage_identity;
               #$opi = sprintf("%.3f", $overall_pid); #rounded
          
               $DF_orthologue_id = '-' if(!$DF_orthologue_id);
               $DF_oname         = '-' if(!$DF_oname); 
               $DF_odesc         = '-' if(!$DF_odesc);
               $DF_dnds          = '-' if(!$DF_dnds); 
               $DF_opi           = '-' if(!$DF_opi);
               $DF_fsa_x         = '-' if(!$DF_fsa_x); 
               $DF_fsa_y         = '-' if(!$DF_fsa_y);
               $DF_nndist        = '-' if(!$DF_nndist);     
          
               if($old_data_row){ #get_backward_orthologies - species_y is our species of interest
                    $data_line = join("\t",$old_data_row, 
                                   $DF_orthologue_id,$DF_oname, $DF_odesc, 
                                   $DF_opi, $DF_dnds, $DF_nndist,
                                   $DF_fsa_y, $DF_fsa_x);
                    print "Putative interactor found: $DF_oname ($DF_orthologue_id)\n";
               }else{ #get_forward_orthologies - species_x is our species of interest
                    $data_line = join("\t",$init_id, 
                                   $DF_orthologue_id,$DF_oname, $DF_odesc, 
                                   $DF_opi, $DF_dnds, $DF_nndist,
                                   $DF_fsa_x, $DF_fsa_y);
               }
               print $output_file  $data_line, "\n";
               #$atLeastOneRowPrinted = 1;
          }
     }
     return $counter;
}

#
#_setup_dbi_connection
#
# Usage     : $dbh = _setup_dbi_connection($in_path);
# Purpose   : This subroutine sets up a DBD::CSV connection with the input file, which must be in TSV 
#             (Tab Separated Values) format. The file content is then accessed through DBI just like a 
#             Relational Database
# Returns   : A database handle
# Argument  : A string indicating the path to the file
# Throws    : -
# Comment   : -
#
#See Also   : 
sub _setup_dbi_connection{
     my ($path) = @_;
     
     my $dbh = DBI->connect("DBI:CSV:f_dir=.;csv_eol=\n;csv_sep_char=\t;csv_quote_char=;csv_escape_char=") 
     or die("Cannot connect: " . $DBI::errstr);
     
     $dbh->{'RaiseError'} = 1;
     local $@ = '';
     $dbh->{'csv_tables'}->{'int'} = { 'file' => $path};
     
     
     $dbh->{FetchHashKeyName} = 'NAME_uc';
     #weird behaviour..fetchrow_hashref might convert hash keys to lower case
     #got this from http://search.cpan.org/~rehsack/SQL-Statement-1.27/lib/SQL/Statement.pm
     
     return $dbh;
}

lib/Bio/Homology/InterologWalk.pm  view on Meta::CPAN

#See Also   : 
#TODO try to fuse this with the former, maybe passing two db handles, maybe making this optional
sub _get_multiple_taxa_mean_score{
     my ($dbh) = @_;
     my $mean_ME_TAXA_score = 0;
     my $interaction_count = 0;
     my %ME_TAXA_hash = (); #multiple evidence, taxa
     
     my $query = "SELECT $FN_multiple_taxa
                    FROM int";
     my $sth = $dbh->prepare($query);
     $sth->execute() or die "Cannot execute: " . $sth->errstr();
     
     if(!$sth->rows){
          print("_get_multiple_taxa_mean_score(): no data in input file. Skipping..");
          return;
     } # TODO temp solution do a SELECT COUNT etc etc
     
     while (my $row = $sth->fetchrow_hashref){
          $interaction_count += 1;
          my $multiple_taxa_score = $row->{$FN_multiple_taxa};
          $mean_ME_TAXA_score += $multiple_taxa_score;
          $ME_TAXA_hash{$multiple_taxa_score} += 1;
     }
     
     $mean_ME_TAXA_score = $mean_ME_TAXA_score / $interaction_count;
     print "Mean Multiple TAXA Score: ", $mean_ME_TAXA_score, "\n";

     print "Multiple TAXA Evidence Hist\n";
     foreach my $score (sort keys %ME_TAXA_hash){
          $ME_TAXA_hash{$score} = $ME_TAXA_hash{$score}/$interaction_count;
          print "$score\t$ME_TAXA_hash{$score}\n";
     }
     print "\n";
     $sth->finish;
     return $mean_ME_TAXA_score;
}

=head2 compute_prioritisation_index

 Usage     : $RC = Bio::Homology::InterologWalk::Scores::compute_prioritisation_index(
                                                        input_path        => $in_path,
                                                        score_path        => $score_path,
                                                        output_path       => $out_path,
                                                        term_graph        => $onto_graph,
                                                        meanscore_em      => $m_em,
                                                        meanscore_it      => $m_it,
                                                        meanscore_dm      => $m_dm,
                                                        meanscore_me_dm   => $m_mdm,
                                                        meanscore_me_taxa => $m_mtaxa
                                                        );
 Purpose   : This is used to analyse several ancillary data fields obtained alongside the actual 
             putative PPI IDs and collate them into an Interolog Prioritisation Index (IPX), to associate a
             numerical index to each putative PPI based on biological metadata. The index will take into account 
             a number of features related to each of the steps involved in the orthology walk. 
             We can divide the metadata features in two broad classes:
             - features related to the interaction. These include: Interaction Type, Interaction 
               Detection Method, Interaction coming from a SPOKE-expanded complex, interaction recon-
               firmed through multiple taxa, interaction reconfirmed through multiple detection methods
             - features related to the two orthology mappings. These include: orthology type 
               (one-to-one, one-to-many, many-to-one, many-to-many), OPI (percentage identity of the 
               conserved columns - see Bio::SimpleAlign), node to node distance, distance from the 
               first shared ancestor, (under development) dN/dS ratio
             The IPX computation will also involve a normalisation stage. The subroutine requires 
             five arguments (meanscore_x) representing mean values to be used for normalisation.
             The actual means are computed in get_mean_scores(), which is pre-requisite to 
             compute_prioritisation_index().
 Returns   : success/failure
 Argument  : -input_path : path to the input tsv file. A suitable input for this subroutine is the 
             final output of the orthology walk pipeline (see doInterologWalk.pl for usage guidelines).
              input file should have .06out extension
             -(OPTIONAL) score_path : path to  text file where scores will be saved one per row 
              (useful for looking at score distributions eg through matlab)
              output textfile has a .scores extension 
             -output_path : where you want the routine to write the data. Data is in TSV format. 
              File extension is .07out
             -term_graph :  a Go::Parser graph object obtained from parse_ontology() containing a 
              network representation of the PSI-MI controlled vocabulary of terms.
             -meanscore_em : mean experimental method score for normalisation
             -meanscore_it : mean interaction type score for normalisation  
             -meanscore_dm : mean detection method score for normalisation   
             -meanscore_me_dm : mean 'multiple detection methods' score for normalisation
             -meanscore_me_taxa : mean 'multiple taxa' score for normalisation
 Throws    : -
 Comment   : -

See Also   : L<http://search.cpan.org/~cjfields/BioPerl-1.6.1/Bio/SimpleAlign.pm#overall_percentage_identity>, L</get_mean_scores>, C<doScores.pl> for sample usage

=cut


sub compute_prioritisation_index{
     my %args = @_;
     my $in_path                   = $args{input_path};
     my $out_path                  = $args{output_path};
     my $score_path                = $args{score_path};
     my $graph                     = $args{term_graph};
     my $mean_exp_method_score     = $args{meanscore_em};
     my $mean_int_type_score       = $args{meanscore_it};
     my $mean_det_method_score     = $args{meanscore_dm};
     my $mean_multiple_dm_score    = $args{meanscore_me_dm}; #multiple evidence, detection method 
     my $mean_multiple_taxa_score  = $args{meanscore_me_taxa}; #multiple evidence, taxa
     
     if(!$graph){
          print("compute_prioritisation_index(): PSI-MI graph representation not found. Aborting..\n");
          return;
     }
     unless($mean_exp_method_score  && 
            $mean_int_type_score    && 
            $mean_det_method_score  && 
            $mean_multiple_dm_score &&
            $mean_multiple_taxa_score){
                 
                 print("compute_prioritisation_index(): missing mean scores. Aborting..\n");
                 return;
     }
     
     my @nodeNodeDist = ();
     my $MAXNNDISTANCE = 0;
     my $totalDIST = 0;
     my $score_data;
     
     #MANAGE FILES
     my $dbh = Bio::Homology::InterologWalk::_setup_dbi_connection($in_path);
     open (my $out_data, q{>}, $out_path) or croak("Unable to open $out_path : $!");
     if($score_path){
         open ($score_data,    q{>}, $score_path) or croak("Unable to open $score_path : $!"); 
     }
     #============
     
     #header----
     print $out_data $HEADER_FWD_ORTH,  "\t", 
                     $HEADER_INT,        "\t", 
                     $HEADER_BWD_ORTH,   "\t", 
                     $HEADER_MISC,       "\t",
                     $HEADER_SCORE,      "\n";
     
     ##########
     # 1st pass
     ##########
     # 1)need to retrieve the highest node to node distance value for weighting purposes
     # 2)need to retrieve some max/mean value for the multiple taxa score.
     my $query = "SELECT $FN_nndist_1, $FN_nndist_2, $FN_multiple_taxa
                         FROM int";
                    
     my $sth = $dbh->prepare($query);
     $sth->execute() or die "Cannot execute: " . $sth->errstr();



( run in 1.572 second using v1.01-cache-2.11-cpan-98e64b0badf )