Peptide-Pubmed
view release on metacpan or search on metacpan
lib/Peptide/Pubmed.pm view on Meta::CPAN
return $_;
}
}
=head2 clean_orphan_parens
Args : string with a sequence to clean up.
Example : $_ = clean_orphan_parens($_);
Description : removes orphan parens from a string.
Returns : cleaned string.
=cut
sub clean_orphan_parens {
my ($str) = @_;
for (my $i = 0; $i < 100; $i++) { # magic number 100 to prevent inf loops
my $num_changes = 0;
$num_changes += s!^\)+!!g; # delete wrong sided parens
$num_changes += s!\(+$!!g; # delete wrong sided parens
$num_changes += s!\(+!\(!g; # change '((GET)' to '(GET)'
$num_changes += s!\)+!\)!g; # change '(GET))' to '(GET)'
$num_changes += s!\([^A-Z]*\)!!g; # remove '(/)', '()'
$num_changes += s!^\((.)\)!$1!; # change '(G)ETRAPL' to 'GETRAPL'
# change 'GETR(A)PL' to 'GETRAPL'
# change 'GET(RAP)L' to 'GETRAPL'
$num_changes += s!\(([^()/]*)\)!$1!g;
# change '(GETR(A)PL)' to 'GETR(A)PL' - delete outer parens in 2 steps:
$num_changes += s!\(([^()]+)\(!$1\(!g; # change '(GETR(A)PL)' to 'GETR(A)PL)'
$num_changes += s!\)([^()]+)\)!\)$1!g; # change 'GETR(A)PL)' to 'GETR(A)PL'
my $num_parens = tr!()!!;
if ($num_parens == 1) {
$num_changes += tr!()!!d; # change 'GE(TRAPL' to 'GETRAPL'
}
last unless $num_changes; # clean stubborn sequences, eg:
# 'G(ET(RA)PL' => 'G(ETRAPL' (after iteration 1), 'GETRAPL' (after iteration 2).
}
return $_;
}
=head2 parse_slashes
Args : string with a sequence.
Example : $_ = parse_slashes($_);
Description : handles slashes. Changes A/B/C/etc to (A/B/C/etc)
which means any of A, B, C, etc. Returns the resulting
string. Exceptions are as follows, based on the the
fact that the result would make no sense. If X occurs
next to '/' or if the result would contain a repeated
character. For exceptions, splits the string on
slashes, and returns the first longest string.
Returns : see above
=cut
sub parse_slashes {
$_ = $_[0];
return $_ unless m!/!;
if (m!/X|X/! or has_repeats_at_slashes($_) ) {
my $str = '';
for (split m!/+!) {
$str = $_ if length($str) < length($_); # change PXXP/PXPXP to PXPXP
}
return $str;
} else {
s!(\w(/\w)+)!($1)!g; # change PXXP/GXPXP to PXX(P/G)XPXP
return $_;
}
}
=head2 has_repeats_at_slashes
Args : string with a sequence.
Example : print 1 if has_repeats_at_slashes($_);
Description : see below
Returns : TRUE if the string has a repeated char next to a series of
slashes and chars, FALSE otherwise.
=cut
sub has_repeats_at_slashes {
for ($_[0]) {
for (m!(\w(?:/\w)+)!g) {
my %seen;
for (/\w/g) {
return 1 if $seen{$_}++;
}
}
}
return;
}
=head2 WordVars
Args : $rh_word - ref to hash with data for the word
Example : $parser->WordVars($rh_word) or return;
Description : computes variables for WordScore, eg WordPropDegen, WordIsDNA, etc.
Returns : TRUE if successful, FALSE otherwise.
=cut
sub WordVars {
my ($self, $rh_word) = @_;
return unless $rh_word and ref $rh_word eq 'HASH';
$_ = $rh_word->{WordSequence};
# keep allowed chars in kmers(): 'a'..'z' in sync with
# WordPropProtein() args : lc tr/A-Z//cd;
( $rh_word->{WordAlpha} = $_ ) =~ tr/A-Z//cd;
if ( $rh_word->{WordSequence} ) {
$rh_word->{WordSeqLen} = length $rh_word->{WordAlpha};
$rh_word->{WordNumDegen} = tr/X//;
$rh_word->{WordNumNotDegen} = $rh_word->{WordSeqLen} - $rh_word->{WordNumDegen};
$rh_word->{WordPropDegen} = $rh_word->{WordNumDegen} / ($rh_word->{WordSeqLen} || 1);
$rh_word->{WordLcSequence} = lc $rh_word->{WordSequence};
}
if ($rh_word->{WordAaSymbols} == 1) {
# Compute WordPropProtein only for 1 letter symbols, because
# it is important for classification if 'GET' looks like an english word,
( run in 0.942 second using v1.01-cache-2.11-cpan-71847e10f99 )