Bio-Homology-InterologWalk
view release on metacpan or search on metacpan
lib/Bio/Homology/InterologWalk.pm view on Meta::CPAN
my $start_data_path = $args{start_path};
my $in_path = $args{input_path};
my $out_path = $args{output_path};
my $onetoone_only = $args{hq_only};
#MANAGE FILES
my $dbh = _setup_dbi_connection($in_path);
open (my $start_data, q{<}, $start_data_path) or croak("Unable to open $start_data_path : $!");
open (my $out_data, q{>}, $out_path) or croak("Unable to open $out_path : $!");
#============
my $query;
my %old_id_present = ();
my %new_id_set = ();
my %all_seen = (); #total number of nodes
#we slurp all the ids in the initial file into a hash========
my %start_data_set;
while(<$start_data>) {
chomp;
next if($_ eq '');
$start_data_set{$_} = 1;
}
my $number_of_elements_start_ds = keys %start_data_set;
print("Number of unique ids in original dataset: $number_of_elements_start_ds\n");
#=========================================================
#now we get both interactors from the putative data set and we look at those that dont appear in the %start_data_set hash
if($onetoone_only){
$query = "SELECT $FN_initid, $FN_interactor_id
FROM int
WHERE ($FN_odesc_1 like '%one2one%')
AND ($FN_odesc_2 like '%one2one%')";
}else{ # all orthology types
$query = "SELECT $FN_initid, $FN_interactor_id"
." FROM int";
}
my $sth = $dbh->prepare($query);
$sth->execute() or die "Cannot execute: " . $sth->errstr();
my $rownumb = $sth->rows;
return if ($rownumb == 0);
#you need to do this with all subroutines
while (my $row = $sth->fetchrow_hashref) {
my $idIN = $row->{$FN_initid};
my $idOUT = $row->{$FN_interactor_id};
#first I count all of the unique ids present (ie network nodes)
$all_seen{$idIN} = 1; $all_seen{$idOUT} = 1;
#then I count what fraction of the original ids actually made it to feature any putative interactions
$old_id_present{$idIN} = 1 if(exists($start_data_set{$idIN}));
$old_id_present{$idOUT} = 1 if(exists($start_data_set{$idOUT}));
#lastly I store those new ids never seen in the starting dataset
$new_id_set{$idOUT} = 1 unless(exists($start_data_set{$idOUT}));
}
my $number_of_old_IDs = keys %old_id_present;
my $percentage = ($number_of_old_IDs / $number_of_elements_start_ds) * 100;
print("Number of IDs from the original dataset that appear in the network: $number_of_old_IDs\n");
print("Percentage of IDs from the original dataset that appear in the final dataset: $percentage\n");
my $number_of_new_IDs = keys %new_id_set;
my $number_of_network_nodes = keys %all_seen;
$percentage= ($number_of_new_IDs / $number_of_network_nodes) * 100;
if($onetoone_only){
print("Number of total UNIQUE IDs in interaction dataset (considering ONE-TO-ONE ortologies only): $number_of_network_nodes\n");
}else{
print("Number of total UNIQUE IDs in interaction dataset: $number_of_network_nodes\n");
}
print("Number of NEW ids (e.g. not seen in starting data set): $number_of_new_IDs\n");
print("Percentage of new ids over the total: $percentage\n");
#I save all the new ids in a flat file. This might be useful to do some analysis of their functional annotation
foreach my $id (sort keys %new_id_set){
#print ("$id\t$new_id_set{$id}\n");
print $out_data $id, "\n";
}
$sth->finish();
$dbh->disconnect();
close($start_data);
close($out_data);
return 1;
}
####################################
#HELPER routines for internal usage
####################################
#
#_compare_uniprotkbids
#
# Usage : How to use this function/method
# Purpose : Suppose we query Intact with the Ensembl ID of a gene of interest, to look for interactors. Intact will not return a simple
# list of interactors as desidered. It will instead return 0 or more lines of tsv data, each one describing a binary interaction, in MITAB format.
# Each lines will contain 2 ids, in uniprot format: one of them being our query id. As the returned format is Uniprot, while the query id was Ensembl,
# we cannot possibly know which of the two is the query id and which is the interactor. Therefore we must carry out a conversion.
# This routine looks for a uniprot id of the query id in Ensembl format. It then compares it to both retrieved id to decide which one is the output interactor
# This is used by get_backward_orthologies().
# Returns : a candidate id in uniprot format or UNDEF
# Argument : gene adaptor, Ensembl Id of the query gene, the two uniprotKB accession number returned by Intact
# Throws : -
# Comment : This is a sample subroutine header.
# : -
#
#See Also :
sub _compare_uniprotkbids{
my ($adaptor, $ensembl_id, $uniprot_id_a, $uniprot_id_b) = @_;
my %uniprotANseen = ();
my $output_id;
my $query_gene = $adaptor->fetch_by_stable_id($ensembl_id);
return if(!$query_gene);
my $uniprot_links = $query_gene->get_all_DBLinks("Uniprot%");
foreach my $link (@$uniprot_links) {
my $item = $link->primary_id;
$uniprotANseen{$item} = 1;
}
if(exists $uniprotANseen{$uniprot_id_a}){
$output_id = $uniprot_id_b;
}elsif(exists $uniprotANseen{$uniprot_id_b}){
$output_id = $uniprot_id_a;
}else{
print("WARNING uniprot-ids of $ensembl_id contain neither $uniprot_id_a nor $uniprot_id_b. Skipping..\n");
}
return $output_id;
}
lib/Bio/Homology/InterologWalk.pm view on Meta::CPAN
#the following filters the orthologies on the basis of their description
$DF_odesc = $homology->description;
next if ($DF_odesc =~ /paralog/);
#possible_ortholog: dubious duplication/speciation node labelling.
next if ($DF_odesc =~ /possible_ortholog/);
next if (($oo_only) && (!($DF_odesc =~ /^ortholog_one2one$/)));
$counter += 1;
#$osubtype = $homology->subtype; #not sure i need it for now
#DNDS ratio
$DF_dnds = $homology->dnds_ratio;#do not apply threshold for now
my $genelist = $homology->gene_list;# genelist is a member object
my $node_x; my $node_y;
#genelist will contain ALL the genes in the orthology group: this means it will
#also contain the original gene I made my query with.
foreach my $homology_member (@{$genelist}){
$DF_orthologue_id = $homology_member->stable_id;
next if(!$DF_orthologue_id);
my $canonical_pep_member = $homology_member->get_canonical_peptide_Member;
if($canonical_pep_member){
my $mid = $canonical_pep_member->member_id;
if($pt_adaptor){
#TODO TEMP ADDED BECAUSE OF ERROR REPORTED ON 24/OCT TO ENSEMBL ML
if($DF_orthologue_id eq $init_id){ #node_x will contain the one I started with
$node_x = $pt_adaptor->fetch_AlignedMember_by_member_id_root_id($mid,1);
}else{
$node_y = $pt_adaptor->fetch_AlignedMember_by_member_id_root_id($mid,1);
}
}
}
}
if((defined($node_x)) && (defined($node_y) )){
my $root = $node_x->subroot; #method from nested set
if (defined ($root)){
eval{
$root->merge_node_via_shared_ancestor($node_y);
$ancestor = $node_x->find_first_shared_ancestor($node_y);
};
if($@){
print "\nAn error occurred ($@), continuing..\n";
}
if (defined $ancestor){
$DF_fsa_x = $node_x->distance_to_ancestor($ancestor);
$DF_fsa_y = $node_y->distance_to_ancestor($ancestor);
$DF_nndist = $node_x->distance_to_node($node_y);
}
}
}
foreach my $homology_member (@{$genelist}){
$DF_orthologue_id = $homology_member->stable_id;
next if ($DF_orthologue_id eq $init_id);#I dont want to print again the gene name
$DF_oname = $homology_member->display_label;
#OPI
my $pairwise_alignment_from_multiple = $homology->get_SimpleAlign;
$DF_opi = $pairwise_alignment_from_multiple->overall_percentage_identity;
#$opi = sprintf("%.3f", $overall_pid); #rounded
$DF_orthologue_id = '-' if(!$DF_orthologue_id);
$DF_oname = '-' if(!$DF_oname);
$DF_odesc = '-' if(!$DF_odesc);
$DF_dnds = '-' if(!$DF_dnds);
$DF_opi = '-' if(!$DF_opi);
$DF_fsa_x = '-' if(!$DF_fsa_x);
$DF_fsa_y = '-' if(!$DF_fsa_y);
$DF_nndist = '-' if(!$DF_nndist);
if($old_data_row){ #get_backward_orthologies - species_y is our species of interest
$data_line = join("\t",$old_data_row,
$DF_orthologue_id,$DF_oname, $DF_odesc,
$DF_opi, $DF_dnds, $DF_nndist,
$DF_fsa_y, $DF_fsa_x);
print "Putative interactor found: $DF_oname ($DF_orthologue_id)\n";
}else{ #get_forward_orthologies - species_x is our species of interest
$data_line = join("\t",$init_id,
$DF_orthologue_id,$DF_oname, $DF_odesc,
$DF_opi, $DF_dnds, $DF_nndist,
$DF_fsa_x, $DF_fsa_y);
}
print $output_file $data_line, "\n";
#$atLeastOneRowPrinted = 1;
}
}
return $counter;
}
#
#_setup_dbi_connection
#
# Usage : $dbh = _setup_dbi_connection($in_path);
# Purpose : This subroutine sets up a DBD::CSV connection with the input file, which must be in TSV
# (Tab Separated Values) format. The file content is then accessed through DBI just like a
# Relational Database
# Returns : A database handle
# Argument : A string indicating the path to the file
# Throws : -
# Comment : -
#
#See Also :
sub _setup_dbi_connection{
my ($path) = @_;
my $dbh = DBI->connect("DBI:CSV:f_dir=.;csv_eol=\n;csv_sep_char=\t;csv_quote_char=;csv_escape_char=")
or die("Cannot connect: " . $DBI::errstr);
$dbh->{'RaiseError'} = 1;
local $@ = '';
$dbh->{'csv_tables'}->{'int'} = { 'file' => $path};
$dbh->{FetchHashKeyName} = 'NAME_uc';
#weird behaviour..fetchrow_hashref might convert hash keys to lower case
#got this from http://search.cpan.org/~rehsack/SQL-Statement-1.27/lib/SQL/Statement.pm
return $dbh;
}
lib/Bio/Homology/InterologWalk.pm view on Meta::CPAN
#See Also :
#TODO try to fuse this with the former, maybe passing two db handles, maybe making this optional
sub _get_multiple_taxa_mean_score{
my ($dbh) = @_;
my $mean_ME_TAXA_score = 0;
my $interaction_count = 0;
my %ME_TAXA_hash = (); #multiple evidence, taxa
my $query = "SELECT $FN_multiple_taxa
FROM int";
my $sth = $dbh->prepare($query);
$sth->execute() or die "Cannot execute: " . $sth->errstr();
if(!$sth->rows){
print("_get_multiple_taxa_mean_score(): no data in input file. Skipping..");
return;
} # TODO temp solution do a SELECT COUNT etc etc
while (my $row = $sth->fetchrow_hashref){
$interaction_count += 1;
my $multiple_taxa_score = $row->{$FN_multiple_taxa};
$mean_ME_TAXA_score += $multiple_taxa_score;
$ME_TAXA_hash{$multiple_taxa_score} += 1;
}
$mean_ME_TAXA_score = $mean_ME_TAXA_score / $interaction_count;
print "Mean Multiple TAXA Score: ", $mean_ME_TAXA_score, "\n";
print "Multiple TAXA Evidence Hist\n";
foreach my $score (sort keys %ME_TAXA_hash){
$ME_TAXA_hash{$score} = $ME_TAXA_hash{$score}/$interaction_count;
print "$score\t$ME_TAXA_hash{$score}\n";
}
print "\n";
$sth->finish;
return $mean_ME_TAXA_score;
}
=head2 compute_prioritisation_index
Usage : $RC = Bio::Homology::InterologWalk::Scores::compute_prioritisation_index(
input_path => $in_path,
score_path => $score_path,
output_path => $out_path,
term_graph => $onto_graph,
meanscore_em => $m_em,
meanscore_it => $m_it,
meanscore_dm => $m_dm,
meanscore_me_dm => $m_mdm,
meanscore_me_taxa => $m_mtaxa
);
Purpose : This is used to analyse several ancillary data fields obtained alongside the actual
putative PPI IDs and collate them into an Interolog Prioritisation Index (IPX), to associate a
numerical index to each putative PPI based on biological metadata. The index will take into account
a number of features related to each of the steps involved in the orthology walk.
We can divide the metadata features in two broad classes:
- features related to the interaction. These include: Interaction Type, Interaction
Detection Method, Interaction coming from a SPOKE-expanded complex, interaction recon-
firmed through multiple taxa, interaction reconfirmed through multiple detection methods
- features related to the two orthology mappings. These include: orthology type
(one-to-one, one-to-many, many-to-one, many-to-many), OPI (percentage identity of the
conserved columns - see Bio::SimpleAlign), node to node distance, distance from the
first shared ancestor, (under development) dN/dS ratio
The IPX computation will also involve a normalisation stage. The subroutine requires
five arguments (meanscore_x) representing mean values to be used for normalisation.
The actual means are computed in get_mean_scores(), which is pre-requisite to
compute_prioritisation_index().
Returns : success/failure
Argument : -input_path : path to the input tsv file. A suitable input for this subroutine is the
final output of the orthology walk pipeline (see doInterologWalk.pl for usage guidelines).
input file should have .06out extension
-(OPTIONAL) score_path : path to text file where scores will be saved one per row
(useful for looking at score distributions eg through matlab)
output textfile has a .scores extension
-output_path : where you want the routine to write the data. Data is in TSV format.
File extension is .07out
-term_graph : a Go::Parser graph object obtained from parse_ontology() containing a
network representation of the PSI-MI controlled vocabulary of terms.
-meanscore_em : mean experimental method score for normalisation
-meanscore_it : mean interaction type score for normalisation
-meanscore_dm : mean detection method score for normalisation
-meanscore_me_dm : mean 'multiple detection methods' score for normalisation
-meanscore_me_taxa : mean 'multiple taxa' score for normalisation
Throws : -
Comment : -
See Also : L<http://search.cpan.org/~cjfields/BioPerl-1.6.1/Bio/SimpleAlign.pm#overall_percentage_identity>, L</get_mean_scores>, C<doScores.pl> for sample usage
=cut
sub compute_prioritisation_index{
my %args = @_;
my $in_path = $args{input_path};
my $out_path = $args{output_path};
my $score_path = $args{score_path};
my $graph = $args{term_graph};
my $mean_exp_method_score = $args{meanscore_em};
my $mean_int_type_score = $args{meanscore_it};
my $mean_det_method_score = $args{meanscore_dm};
my $mean_multiple_dm_score = $args{meanscore_me_dm}; #multiple evidence, detection method
my $mean_multiple_taxa_score = $args{meanscore_me_taxa}; #multiple evidence, taxa
if(!$graph){
print("compute_prioritisation_index(): PSI-MI graph representation not found. Aborting..\n");
return;
}
unless($mean_exp_method_score &&
$mean_int_type_score &&
$mean_det_method_score &&
$mean_multiple_dm_score &&
$mean_multiple_taxa_score){
print("compute_prioritisation_index(): missing mean scores. Aborting..\n");
return;
}
my @nodeNodeDist = ();
my $MAXNNDISTANCE = 0;
my $totalDIST = 0;
my $score_data;
#MANAGE FILES
my $dbh = Bio::Homology::InterologWalk::_setup_dbi_connection($in_path);
open (my $out_data, q{>}, $out_path) or croak("Unable to open $out_path : $!");
if($score_path){
open ($score_data, q{>}, $score_path) or croak("Unable to open $score_path : $!");
}
#============
#header----
print $out_data $HEADER_FWD_ORTH, "\t",
$HEADER_INT, "\t",
$HEADER_BWD_ORTH, "\t",
$HEADER_MISC, "\t",
$HEADER_SCORE, "\n";
##########
# 1st pass
##########
# 1)need to retrieve the highest node to node distance value for weighting purposes
# 2)need to retrieve some max/mean value for the multiple taxa score.
my $query = "SELECT $FN_nndist_1, $FN_nndist_2, $FN_multiple_taxa
FROM int";
my $sth = $dbh->prepare($query);
$sth->execute() or die "Cannot execute: " . $sth->errstr();
( run in 1.572 second using v1.01-cache-2.11-cpan-98e64b0badf )