split m results from the CPAN

Peptide-Pubmed
view release on metacpan or search on metacpan
lib/Peptide/Pubmed.pm view on Meta::CPAN
	return $_;
    }
}

=head2 clean_orphan_parens

  Args        : string with a sequence to clean up.
  Example     : $_ = clean_orphan_parens($_);
  Description : removes orphan parens from a string.
  Returns     : cleaned string.

=cut

sub clean_orphan_parens {
    my ($str) = @_;
    for (my $i = 0; $i < 100; $i++) { # magic number 100 to prevent inf loops
	my $num_changes = 0;
	$num_changes += s!^\)+!!g;		# delete wrong sided parens
	$num_changes += s!\(+$!!g;		# delete wrong sided parens
	$num_changes += s!\(+!\(!g;		# change '((GET)' to '(GET)'
	$num_changes += s!\)+!\)!g;		# change '(GET))' to '(GET)'
	$num_changes += s!\([^A-Z]*\)!!g;	# remove '(/)', '()'
	$num_changes += s!^\((.)\)!$1!;		# change '(G)ETRAPL' to 'GETRAPL'
	# change 'GETR(A)PL' to 'GETRAPL'
	# change 'GET(RAP)L' to 'GETRAPL'
	$num_changes += s!\(([^()/]*)\)!$1!g;
	# change '(GETR(A)PL)' to 'GETR(A)PL' - delete outer parens in 2 steps:
	$num_changes += s!\(([^()]+)\(!$1\(!g;	# change '(GETR(A)PL)' to 'GETR(A)PL)'
	$num_changes += s!\)([^()]+)\)!\)$1!g;	# change 'GETR(A)PL)' to 'GETR(A)PL'
	my $num_parens = tr!()!!;
	if ($num_parens == 1) {
	    $num_changes += tr!()!!d;	       	# change 'GE(TRAPL' to 'GETRAPL'
	}
	last unless $num_changes; # clean stubborn sequences, eg: 
	# 'G(ET(RA)PL' => 'G(ETRAPL' (after iteration 1), 'GETRAPL' (after iteration 2).
    }
    return $_;
}


=head2 parse_slashes

  Args        : string with a sequence.
  Example     : $_ = parse_slashes($_);
  Description : handles slashes. Changes A/B/C/etc to (A/B/C/etc)
		which means any of A, B, C, etc. Returns the resulting
		string. Exceptions are as follows, based on the the
		fact that the result would make no sense. If X occurs
		next to '/' or if the result would contain a repeated
		character. For exceptions, splits the string on
		slashes, and returns the first longest string.
  Returns     : see above

=cut

sub parse_slashes {
    $_ = $_[0];
    return $_ unless m!/!;
    if (m!/X|X/! or has_repeats_at_slashes($_) ) {
	my $str = '';
	for (split m!/+!) {
	    $str = $_  if length($str) < length($_); # change PXXP/PXPXP to PXPXP
	}
	return $str;
    } else {
	s!(\w(/\w)+)!($1)!g;	# change PXXP/GXPXP to PXX(P/G)XPXP
	return $_;
    }	
}


=head2 has_repeats_at_slashes

  Args        : string with a sequence.
  Example     : print 1 if has_repeats_at_slashes($_);
  Description : see below
  Returns     : TRUE if the string has a repeated char next to a series of 
		slashes and chars, FALSE otherwise.

=cut

sub has_repeats_at_slashes {
    for ($_[0]) {
	for (m!(\w(?:/\w)+)!g) {
	    my %seen;
	    for (/\w/g) {
		return 1 if $seen{$_}++;
	    }
	}
    }
    return;
}

=head2 WordVars

  Args        : $rh_word - ref to hash with data for the word
  Example     : $parser->WordVars($rh_word) or return;
  Description : computes variables for WordScore, eg WordPropDegen, WordIsDNA, etc.
  Returns     : TRUE if successful, FALSE otherwise.

=cut

sub WordVars {
    my ($self, $rh_word) = @_;
    return unless $rh_word and ref $rh_word eq 'HASH';
    $_ = $rh_word->{WordSequence};
    # keep allowed chars in kmers(): 'a'..'z' in sync with 
    # WordPropProtein() args : lc tr/A-Z//cd;
    ( $rh_word->{WordAlpha} = $_ ) =~ tr/A-Z//cd;
    
    if ( $rh_word->{WordSequence} ) {
	$rh_word->{WordSeqLen} = length $rh_word->{WordAlpha};
	$rh_word->{WordNumDegen} = tr/X//;
	$rh_word->{WordNumNotDegen} = $rh_word->{WordSeqLen} - $rh_word->{WordNumDegen};
	$rh_word->{WordPropDegen} = $rh_word->{WordNumDegen} / ($rh_word->{WordSeqLen} || 1);
	$rh_word->{WordLcSequence} = lc $rh_word->{WordSequence};
    }
    
    if ($rh_word->{WordAaSymbols} == 1) {
	# Compute WordPropProtein only for 1 letter symbols, because
	# it is important for classification if 'GET' looks like an english word,
( run in 0.942 second using v1.01-cache-2.11-cpan-71847e10f99 )