InSilicoSpectro
view release on metacpan or search on metacpan
lib/InSilicoSpectro/InSilico/MassCalculator.pm view on Meta::CPAN
undef(%elMass);
undef(%fragType);
undef(%series);
undef(%loss);
my $twig = XML::Twig->new(twig_handlers=>{oneElement => \&twigAddElement,
oneMolecule => \&twigAddMolecule,
oneAminoAcid => \&twigAddAminoAcid,
oneSeries => \&twigAddSeries,
oneLoss => \&twigAddLoss,
oneFragType => \&twigAddFragType,
oneInternFragType => \&twigAddInternFragType,
#oneModRes => \&twigAddModRes,
pretty_print=>'indented'});
#read all the modres already defined and set a handler to register new one afterwards
foreach(InSilicoSpectro::InSilico::ModRes::getList()){
setModif($_);
}
InSilicoSpectro::InSilico::ModRes::registerModResHandler(\&InSilicoSpectro::InSilico::MassCalculator::setModif);
if (@files){
foreach (@files){
#warn "reading MassCalculator.pm def from $_\n";
$twig->parsefile($_) || croak("Cannot parse [$_]: $!");
}
}
else{
eval{
require Phenyx::Config::GlobalParam;
InSilicoSpectro::Utils::io::croakIt "no [phenyx.config.insilicodef] is set" unless defined Phenyx::Config::GlobalParam::get('phenyx.config.insilicodef');
foreach (split /,/, Phenyx::Config::GlobalParam::get('phenyx.config.insilicodef')){
print STDERR __PACKAGE__." opening [$_]\n" if ($InSilicoSpectro::Utils::io::VERBOSE);
$twig->parsefile($_) || croak("Cannot parse [$_]: $!");
}
};
if ($@){
croak "not possible to open default config files: $@";
}
}
$cTerm = getMass('el_H')+getMass('el_O');
$nTerm = getMass('el_H');
# Adds extra common molecules
setMass('mol_CO', 'CO');
setMass('mol_H3PO4', 'H3PO4');
setMass('mol_HPO3', 'HPO3');
setMass('mol_H5', 'H5');
$isInit = 1;
} # init
# -----------------------------------------------------------------------------
# Gets and sets masses
# -----------------------------------------------------------------------------
=head2 massFromComposition($formula)
Computes monoisotopic and average masses from an atomic composition given
in $formula. Monoisotopic and average masses are returned as a vector of 2
elements. Negative numbers are possible to accommodate with losses or certain
modifications.
Example:
print join(',', massFromComposition('HPO3')), "\n";
=cut
sub massFromComposition
{
my ($formula) = @_;
my $mono = 0;
my $avg = 0;
$formula =~ s/\s+//g;
while ($formula =~ /([A-Z][a-z\+]?)([\-\d]*)/g){
my $num = $2 || 1;
$mono += $num*$elMass{"el_$1"}[0];
$avg += $num*$elMass{"el_$1"}[1];
}
return ($mono, $avg);
} # massFromComposition
=head1 Basic mass functions
=head2 getMass($el, $mt)
Returns the mass, either monoisotopic or average mass depending on the
current setting or the parameter $mt, of elements, amino acids, modifications
and molecules.
=over 4
=item $el
Contains the name of the "object" whose mass is to return. $el must start with
el_ for elements (el_Na, el_H), with aa_ for amino acids (aa_Y, aa_T), with
mol_ for molecules (mol_H2O), and mod_ for modifications (mod_Oxidation).
=item $mt
Contains the mass type (0=monoisotopic, 1=average). When this argument is not
provided (most common function use), the current setting of mass type is used.
=back
=cut
sub getMass
{
# Returns the mass of the given element/molecule, according to $massType or to the second argument if defined
my ($el, $mt) = @_;
$mt = $massType if(!defined($mt));
defined($elMass{$el}) || ( $invalidElementCall && $invalidElementCall->("Unknown element/molecule in getMass: [$el]"));
$elMass{$el}[$mt];
} # getMass
=head2 setMass($symbol, $formula)
This function allows the user to set new molecule, element, amino acid, and modification
masses dynamically.
=over 4
=item $symbol
Contains the identifier of the new object with the appropriate prefix (el_, aa_,
mol_, or mod_).
=item $formula
Contains the atomic composition (it is used for computing monoisotopic and average
masses). Negative numbers of atoms are possible to deal with modifications.
Examples:
setMass('mol_NH3', 'NH3');
setMass('mol_H2O', 'H2O');
=back
=cut
sub setMass
{
my ($symbol, $formula) = @_;
croak("Already defined mass for [$symbol]") if (defined($elMass{$symbol}));
my ($mono, $avg) = massFromComposition($formula);
$elMass{$symbol} = [$mono, $avg];
} # setMass
=head2 setMassType($type)
Sets the current mass type. C<$type> must be equal to 0 (monoisotopic) or 1 (average).
=cut
sub setMassType
{
my ($type) = @_;
croak("Unknown mass type: [$type]") if (($type != 0) && ($type != 1));
$massType = $type;
} # setMassType
=head2 getMassType
Returns the current mass type.
=cut
sub getMassType
{
return $massType;
} # getMassType
=head2 setModif($mr)
add a InSilico::ModRes object
=cut
sub setModif
{
my $mr=shift;
my $name=$mr->get('name');
$elMass{"mod_$name"} = [$mr->{delta_monoisotopic}, $mr->{delta_average}];
} # setModif
# -----------------------------------------------------------
# Peptide functions
# -----------------------------------------------------------
=head1 Peptide-related functions
This section groups all the functions that are directly related to peptides, i.e.
peptide mass computation and protein digestion.
lib/InSilicoSpectro/InSilico/MassCalculator.pm view on Meta::CPAN
=item fixedModif
A reference to a vector of fixed modification names. The
rules for identifying possible amino acids where modifications
occur are read from the configuration file.
All possible modification sites for all fixed modifications are
computed and these sites are combined with the fixed modifications
given by the parameter modif.
Fixed modification always have the priority over variable
modifications and it is not allowed to have two fixed modifications
at the same location.
=item varModif
A reference to a vector of variable modification names. If this parameter
is provided then each generated peptide will be tested for possible
variable modifications and all the combinations will be put in the final
result, i.e. the peptide is present several times with different modification
strings.
=item pmf
If this parameter is set to any value, then the exact modification location
in the result is replaced by the format for PMF, see above.
=item nmc
Maximum number of missed cleavages, default 0.
=item minMass
Minimum peptide mass, default 500 Da.
=item maxMass
Maximum peptide mass, default 999999 Da.
=item enzyme
The enzyme can be specified either by a regular expression or by a CleavEnzyme
object. If this parameter is not provided then the digestion is performed for
trypsin according to the regular expression
(?<=[KR])(?=[^P])
Regular expressions used for specifying an enzyme can be given as strings or
precompiled regular expression such as
$dibasicRegex = qr/(?<=[KR])(?=[KR])/,
which is exported by this module.
When you specify the enzyme by a CleavEnzyme object, the regular expression is
read from the object.
=item CTermGain, NTermGain
In case the enzyme does not add H at N-termini and OH at C-termini, you can
define new formulas for what is added.
When the enzyme comes as a CleavEnzyme object, these values are read from the
enzyme object directly.
=item CTermModif, NTermModif
In case the enzyme does introduce a modification at peptide C-/N-termini, you can
provide the name of the modification to apply.
When the enzyme comes as a CleavEnzyme object, these values are read from the
enzyme object directly.
=item methionine
If methionine is set to any value then, in case the first protein amino acid
is a methionine, supplementary peptides with initial methionine removed are
generated. This may be useful when processing RNA-translated sequences.
=item addProton
This parameter, if set to any value, tells the function to add the mass
of one proton to each peptide mass. This is useful for PMF as it allows
direct comparison of theoretical (charged) masses with experimental
masses.
=item duplicate
When set to any value, and the protein is given as a AASequence object, this
parameter tells the function not to try to save memory when creating the Peptide
object.
=back
The result of the digestion is returned either as a vector of InSilicoSpectro::InSilico::Peptide
objects or as a data structure which is a vector of six references:
result[0] -> vector of peptide sequences
result[1] -> vector of start positions in the original protein string
result[2] -> vector of end positions in the original protein string
result[3] -> vector of number of missed cleavages
result[4] -> vector of peptide masses
result[5] -> vector of modification descriptions
Variables $digestIndexPept=0, $digestIndexStart=1, $digestIndexEnd=2, $digestIndexNmc=3,
$digestIndexMass=4, and $digestIndexModif=5 are exported for convenience.
Example:
my $protein = 'MCTMACTKGIPRKQWWEMMKPCKADFCV';
my $modif = '::Cys_CAM::Oxidation::::::::::::::Oxidation:::::::::::';
my @result = digestByRegExp(protein=>$protein, modif=>$modif, methionine=>1, nmc=>2);
print "$protein\n$modif\n";
for (my $i = 0; $i < @{$result[0]}; $i++){
print "$result[$digestIndexPept][$i]\t$result[$digestIndexStart][$i]\t",
"$result[$digestIndexEnd][$i]\t$result[$digestIndexNmc][$i]\t",
"$result[$digestIndexMass][$i]\t", modifToString($result[$digestIndexModif][$i]), "\n";
}
More examples in InSilicoSpectro/InSilico/test/testCalcDigest.pl and InSilicoSpectro/InSilico/test/testCalcDigestOOP.pl
lib/InSilicoSpectro/InSilico/MassCalculator.pm view on Meta::CPAN
}
}
} # matchSpectrumGreedy
=head2 getSeries($name)
Returns a vector (terminus, monoisotopic delta, average delta,
first fragment, last fragment) that contains the parameters
of series $name, where
=over 4
=item $name
is th name of the series, e.g. b or y.
=item terminus
is equal to 'N' or 'C'.
=item monoisotopic delta
is the mass delta to apply when computing the monoisotopic mass.
It is 0 for a b series and 1.007825 for y.
=item average delta
is the mass delta to apply when computing the average mass.
It is 0 for a b series and 1.007976 for y.
=item first fragment
is the number of the first fragment to compute. For instance,
fragment b1 is generally not detected hence first fragment
should be set to 2 for a b series. The rule is the same for
N- and C-term series.
=item last fragment
is the number of the last fragment counted from the end. For
instance, the last b fragment is normally not detected hence
last fragment should be set to 2. If a fragment containing
all the amino acids of the peptide is possible, then it
should be set to 1. The rule is the same for N- and C-term
series.
=back
=cut
sub getSeries
{
my ($name) = @_;
return ($series{$name}{terminus}, $series{$name}{delta}[0], $series{$name}{delta}[1], $series{$name}{firstFrag}, $series{$name}{lastFrag});
} # getSeries
=head2 setSeries($name, $terminus, $formula, $firstFrag, $lastFrag)
Adds a new series to the series read from the configuration file.
The parameters are defined above in getSeries except formula.
The mass deltas are not given as real numbers but rather as
deltas in atoms. For instance, b series formula is empty because
we do want 0 delta and c series formula is 'N 1 H 3'. Example:
setSeries('c', 'N', 'N 1 H 3', 2, 2);
=cut
sub setSeries
{
my ($name, $terminus, $formula, $firstFrag, $lastFrag) = @_;
croak("Already defined series for [$name]") if (defined($series{$name}));
$series{$name}{terminus} = uc($terminus);
$series{$name}{delta} = [massFromComposition($formula)];
$series{$name}{firstFrag} = $firstFrag;
$series{$name}{lastFrag} = $lastFrag;
} # setSeries
=head2 getLoss($name)
Returns a vector (residues, monoisotopic delta, average delta)
that contains the parameters of a neutral loss, where
=over 4
=item $name
is the name of the neutral loss.
=item residues
is a string containing the one character codes of the amino
acids where the loss is possible.
=item monoisotopic delta
is the mass delta to apply when computing the monoisotopic mass.
=item average delta
is the mass delta to apply when computing the average mass.
=back
=cut
sub getLoss
{
my ($name) = @_;
return (join('', keys(%{$loss{$name}{residues}})), $loss{$name}{delta}[0], $loss{$name}{delta}[1]);
} # getLoss
=head2 setLoss($name, $residues, $formula)
=head2 setLoss($name, $residues, $deltamass_mono, $deltamass_avg)
Adds a new loss to the ones read from the file fragments.xml.
The parameters are defined above in getLoss except formula.
The mass deltas are not given as real numbers but rather as
deltas in atoms. Example:
setLoss('H2O', 'ST', 'H 2 O 1');
or
setLoss('H2O', 'ST', 18.003, 17.927);
=cut
sub setLoss
{
my $name=shift;
my $residues=shift;
if($_[0]=~/^[A-Z]/){
#we have a formula
my $formula=shift;
croak("Already defined loss for [$name]") if (defined($loss{$name}));
$loss{$name}{delta} = [massFromComposition($formula)];
}else{
# we should have two masses
$loss{$name}{delta} = [$_[0], $_[1]];
}
foreach (split(//, $residues)){
$loss{$name}{residues}{$_} = 1;
}
} # setLoss
=head2 getFragType($name)
Returns a vector (series, charge, loss 1, repeat 1, loss 2, repeat 2, ...)
containing the parameters of a fragment type, where
=over 4
=item $name
is the name of the fragment type.
=item series
is the series on which this fragment type is based.
=item charge
is the charge state of a fragment of this type.
=item loss k
in case the fragment type includes losses, loss k is set to the name of
each loss possible for this fragment type.
=item repeat k
the maximum number of each loss, -1 means no maximum.
=back
=cut
sub getFragType
{
my ($name) = @_;
my @tmp = ($fragType{$name}{series}, $fragType{$name}{charge});
if (defined($fragType{$name}{loss})){
for (my $i = 0; $i < @{$fragType{$name}{loss}}; $i++){
push (@tmp, $fragType{$name}{loss}[$i], $fragType{$name}{repeat}[$i]);
}
}
return @tmp;
} # getFragType
=head2 getFragTypeList
lib/InSilicoSpectro/InSilico/MassCalculator.pm view on Meta::CPAN
{
return (join('', keys(%immoAA)), $immoDelta);
} # getImmonium
=head2 getImmoniumMass($aa, $mod)
Computes the mass of an immonium ion given the amino acid one letter
code $aa and a possible modification $mod. In case of Lysine (K), two
values are returned.
=cut
sub getImmoniumMass
{
my ($aa, $mod) = @_;
my $mass = getMass("aa_$aa")+$immoDelta;
$mass += getMass("mod_$mod") if ($mod);
if ($aa eq 'K'){
# Consider a possible extra mass with ammonia loss
return ($mass, $mass-getMass('mol_NH3'))
}
else{
return $mass;
}
} # getImmoniumMass
# ------------------------------------------------------------------------
# XML parsing
# ------------------------------------------------------------------------
sub twigAddElement
{
my ($twig, $el) = @_;
my $symbol = $el->atts->{symbol};
my $mel = $el->first_child('mass');
my $mono = $mel->atts->{monoisotopic};
my $avg = $mel->atts->{average};
$elMass{"el_$symbol"} = [$mono, $avg];
} # twigAddElement
sub twigAddMolecule
{
my ($twig, $el) = @_;
my $symbol = $el->atts->{symbol};
my ($mono, $avg);
if (my $mel = $el->first_child('mass')){
# Mass directly provided, use it
$mono = $mel->atts->{monoisotopic};
$avg = $mel->atts->{average};
}
else{
# Use the formula instead
my $formula = $el->first_child('formula')->text;
($mono, $avg) = massFromComposition($formula);
}
$elMass{"mol_$symbol"} = [$mono, $avg];
} # twigAddMolecule
sub twigAddAminoAcid
{
my ($twig, $el) = @_;
my $symbol = $el->atts->{code1};
my $mel = $el->first_child('mass');
my $mono = $mel->atts->{monoisotopic};
my $avg = $mel->atts->{average};
$elMass{"aa_$symbol"} = [$mono, $avg];
} # twigAddAminoAcid
sub twigAddSeries
{
my ($twig, $el) = @_;
my $name = $el->atts->{name};
my $terminus = $el->first_child('terminus')->text;
my $firstFrag = $el->first_child('firstFragment')->text;
my $lastFrag = $el->first_child('lastFragment')->text;
my $formula = $el->first_child('formula')->text;
setSeries($name, $terminus, $formula, $firstFrag, $lastFrag);
} # twigAddSeries
sub twigAddLoss
{
my ($twig, $el) = @_;
my $name = $el->atts->{name};
my $residues = $el->first_child('residues')->atts->{aa};
my $formula = $el->first_child('formula')&&$el->first_child('formula')->text;
if($formula){
setLoss($name, $residues, $formula);
}else{
my $eldm=$el->first_child('deltaMass')|| die "must specify 'formula' or 'deltamass' in oneLoss definition [$name]";
$eldm->print(\*STDERR);
setLoss($name, $residues, $eldm->atts->{monoisotopic}, $eldm->atts->{average});
}
} # twigAddLoss
sub twigAddFragType
{
my ($twig, $el) = @_;
my $name = $el->atts->{name};
my $series = $el->atts->{series};
my $charge = $el->atts->{charge};
my (@lossName, @repeat);
my @children = $el->children('loss');
foreach (@children){
push(@lossName, $_->atts->{name});
push(@repeat, $_->atts->{repeat});
}
setFragType($name, $series, $charge, \@lossName, \@repeat);
} # twigAddFragType
sub twigAddInternFragType
{
my ($twig, $el) = @_;
my $name = $el->atts->{name};
my $residues = $el->first_child('residues')->text;
my $mono = $el->first_child('delta')->atts->{monoisotopic};
my $avg = $el->first_child('delta')->atts->{average};
if ($name eq 'immo'){
setImmonium($residues, $mono);
}
else{
croak("Unknown internal fragment type [$name]");
}
} # twigAddInternFragType
sub twigAddModRes
{
my ($twig, $el) = @_;
my $name = $el->atts->{id} || $el->atts->{name};
#return added by alex; does not handle regexp definitions
my $rel = $el->first_child('residues') or return;
my $residues = $rel->atts->{aa};
my $residuesAfter = $rel->atts->{aaAfter} || '.';
my $terminus = defined($rel->atts->{nterm}) ? 'N' : (defined($rel->atts->{cterm}) ? 'C' : '-');
my $del = $el->first_child('delta');
my $mono = $del->atts->{monoisotopic};
my $avg = $del->atts->{average};
setModif($name, $mono, $avg, $residues, $residuesAfter, $terminus);
} # twigAddModRes
( run in 0.557 second using v1.01-cache-2.11-cpan-39bf76dae61 )