GH

 view release on metacpan or  search on metacpan

GH/Sim4/sim4.2002-03-03/sim4.init.c  view on Meta::CPAN

/*
* sim4 - Align a cDNA sequence with a genomic sequence for it.
*
* The basic command syntax is
*
*       sim4 seq1 seq2 [[AXWRKCDHEPNBS]=]
*
* where sequence1 and sequence2 name files containing DNA sequences; a name
* like "foo[300,400]" refers to sequence entries 300-400 in file "foo".  The
* files are to be in FASTA format.  Thus a typical sequence file might begin:
*
*       >BOVHBPP3I  Bovine beta-globin psi-3 pseudogene, 5' end.
*       GGAGAATAAAGTTTCTGAGTCTAGACACACTGGATCAGCCAATCACAGATGAAGGGCACT
*       GAGGAACAGGAGTGCATCTTACATTCCCCCAAACCAATGAACTTGTATTATGCCCTGGGC
*
* Typically, sequence1 name file contains one DNA sequence, and sequence2
* name file contains a database of sequences in FASTA format. Sim4 can
* compare a genomic sequence with a database of cDNAs or a cDNA sequence
* with a genomic database. If highly accurate sequences are being compared,
* specify option N=1.
*
* The command permits optional additional arguments, e.g. "A=1" and "W=8",
* that reset certain internal parameters.  The available parameters are:
*
*       W gives the word size.
*       X gives the value for terminating word extensions.
*       K gives the MSP score threshold for the first pass.
*       C gives the MSP score threshold for the second pass. 
*       R direction of search; 0 - search the '+' strand only;
*         1 - search the '-' strand only; 2 - search both strands and
*         report the best match. (R=2)
*       D adjusts the range of diagonals in building the exons.
*       H adjusts the re-linking weight factor
*       A specifies the output format: exon endpoints only (A=0),
*         alignment text (A=1), alignment in lav format (A=2) or both
*         exon endpoints and alignment text (A=3, A=4). For A=3, positions
*         in sequence 1 are given in the original sequence, and for A=4 in
*         its reverse complement. A=5 prints the exon and CDS coordinates 
*         (the latter, if known) in the `exon file' format required by PipMaker.
*       N if !=0, highly accurate exon detection is expected, for highly
*         accurate sequence data. 
*       P remove polyA tails; if match on complementary strand, change
*         coordinates in sequence 1 according to the '+' strand and print
*         #lav alignment header for all alignment options.
*       B control the presence of ambiguity codes in sequence data. If
*         1 (default), allow ambiguity codes (ABCDGKMNRCTVWXY); if 0,  
*         restrict the set of accepted characters to ACGTNX.
*       S specifies a known coding region (to be used only with A=5, and for
*         comparing full-mRNA sequences).
*/      


#include "psublast.h"

#include "sim4.h" 
#include "align.h"
#include "poly.h"
               
#ifndef __lint     
/*@unused@*/       
static const char rcsid[] =
"$Id: sim4.init.c,v 1.1 2002/12/03 20:12:37 hartzell Exp $";
#endif         
           

static void init_stats(sim4_stats_t *);
static void sim4_argvals(sim4_args_t *);
static void cds_range(char *,int *,int *);
static char *extract_tok(char *);

static void add_offset_exons(Exon *,int);
static void add_offset_aligns(edit_script_list *,int);
static void print_align_blk(uchar *,uchar *,int,int,edit_script_list **,int,int);
static void print_align_lat(uchar *,uchar *,int,int,edit_script_list **,Exon *,int,int);

static const char Usage[] =
"%s seq1 seq2_db [[WXKCRDHAPNBS]=]\n\n\
       W  -  word size. (W=12)\n\
       X  -  value for terminating word extensions. (X=12)\n\
       K  -  MSP score threshold for the first pass. (e.g., K=16)\n\
       C  -  MSP score threshold for the second pass. (e.g., C=12)\n\
       R  -  direction of search; 0 - search the '+' (direct) strand only; \n\
             1 - search the '-' strand only; 2 - search both strands and \n\
             report the best match. (R=2)\n\
       D  -  bound for the range of diagonals within consecutive msps in an\n\
             exon. (D=10)\n\
       H  -  weight factor for MSP scores in relinking. (H=500)\n\
       A  -  output format: exon endpoints only (A=0), alignment text (A=1),\n\
             alignment in lav (block) format (A=2), or both exon endpoints\n\
             and alignment text (A=3, A=4). If complement match, A=0,1,2,3\n\
             give direct positions in the long sequence and complement \n\
             positions in the short sequence. A=4 gives direct positions in \n\
             the first sequence, regardless of the relative lengths.\n\
             A=5 prints the exon and CDS coordinates (the latter, if known)\n\
             in the `exon file' format required by PipMaker. To be used\n\
             with full-length mRNA sequences.\n\
       P  -  if not 0, remove poly-A tails; report coordinates in the \n\
             '+' (direct) strand for complement matches; use lav alignment \n\
             headers in all display options. (P=0) \n\
       N  -  accuracy of sequences (non-zero for highly accurate). (N=0)\n\
       B  -  if 0, dis-allow ambiguity codes (other than N and X) in the\n\
             sequence data. (B=1)\n\
       S  -  coding region specification (available only with A=5);\n\
             format: S=n1..n2\n";
       
#ifdef _STATS
static void print_stats(sim4_stats_t, char *, int);
static void print_exon_stats(Exon *, char *, int);
#endif

#ifdef DEBUG
static void polystats(int,int,int,int,int,int,int,int);
#endif

       
#include "sim4b1.h"
       
int main(int argc, char *argv[])
{
        uchar  *revseq1=NULL; 
        int    len1, len2, count, dist, match_ori, in_K, in_C, in_H;
        int pA, pT, xpT, xpA, rev_xpT, rev_xpA;
        int cds_from, cds_to;
        uchar *seq1, *seq2;
        char  *h2, *h1, *cds_gene=NULL, *line;
        SEQ   *sf1, *sf2, *rf1=NULL;
        argv_scores_t ds;
        ss_t ss;
       
        Exon *Exons=NULL, *rev_Exons=NULL; 
        edit_script_list *Aligns=NULL, *rev_Aligns=NULL;
       
        if (argc < 3) fatalf(Usage, argv[0]);
        ckargs("AXWRKCDHEPNBST", argc, argv, 2);

        sim4_argvals(&rs);
        DNA_scores(&ds, ss);
       
        /* read seq1 */ 
        sf1 = seq_open(argv[1], 0, (rs.B ? SEQ_ALLOW_AMB : 0));
        if (!seq_read(sf1)) fatalf("Cannot read sequence from %s.", argv[1]);
        
        seq1 = SEQ_CHARS(sf1);
        len1 = SEQ_LEN(sf1);
        h1 = SEQ_HEAD(sf1);
        if (!is_DNA(seq1, len1))
                fatal("The first sequence is not a DNA sequence.");
        seq_toupper(seq1, len1, argv[1]);
        
        /* read seq2 */
        sf2 = seq_open(argv[2], 0, (rs.B ? SEQ_ALLOW_AMB : 0));
        if (!seq_read(sf2)) fatalf("Cannot read sequence from %s.", argv[2]);
       
        seq2 = SEQ_CHARS(sf2);
        len2 = SEQ_LEN(sf2);
        h2 = SEQ_HEAD(sf2);
        if (!is_DNA(seq2, len2))
                fatal("The first sequence is not a DNA sequence.");
        seq_toupper(seq2, len2, argv[2]);
        
        /* determine the type of comparison */
        file_type = (len2<=len1) ? GEN_EST : EST_GEN;
        if (file_type== EST_GEN) {
            rf1 = seq_copy(sf1);
            rf1 = seq_revcomp_inplace(rf1);
            revseq1 = SEQ_CHARS(rf1);

            if (rs.ali_flag==5) {
                if (rs.CDS_to>len1) 
                   fatal("Command line CDS endpoint exceeds sequence length.");
                cds_gene = extract_tok(h1);
                if (cds_gene==NULL) {  /* no FastaA header */
                    cds_from = rs.CDS_from; cds_to = rs.CDS_to;
                } else {
                    line = strstr(h1, "CDS="); 
                    if (line && rs.S) {
                       fprintf(stderr, "Warning: Command line CDS specification overrides header CDS specification."); 
                       cds_from = rs.CDS_from; cds_to = rs.CDS_to;
                    } else if (line) {
                       cds_range(line+4, &cds_from, &cds_to); 
                    } else if (rs.S) {
                       cds_from = rs.CDS_from; cds_to = rs.CDS_to;
                    } else {
                       cds_from = cds_to = 0;
                    }
                }
                if (cds_to>len1) 
                   fatal("CDS endpoints exceed sequence length.");
            }
        }
        
        if (rs.poly_flag && file_type==EST_GEN)  {
            get_polyAT(seq1,len1,&pT,&pA,BOTH_AT);
        } else pT = pA = 0;

        bld_table(seq1-1+pT, len1-pA-pT, rs.W, INIT);
        
        count = 0; 
        while (!count || (seq_read(sf2)>0)) {
           sim4_stats_t st, rev_st; char *tok;
        
           if (count) { /* skip the first seq2, already in memory */
              h2 = SEQ_HEAD(sf2);
              seq2 = SEQ_CHARS(sf2);
              len2 = SEQ_LEN(sf2);
              tok = extract_tok(h2);
        
              if (!is_DNA(seq2, len2)) {
                char tmp[200];
                (void)sprintf(tmp,"%s sequence is not a DNA sequence.", tok);
                perror(tmp); continue;
              } 
              seq_toupper(seq2, len2, argv[2]);
           } else {  /* first sequence in the file, seq2 is already in memory */
               tok = extract_tok(h2);
               if (tok==NULL) { 
                   tok = ckalloc(strlen("(no header)")+1); 
                   strcpy(tok, "(no header)"); 
               } 
           }

           if ((rs.ali_flag==5) && (file_type==GEN_EST)) {
               cds_gene = tok;
               if (!cds_gene && !count) {
                   cds_from = rs.CDS_from; cds_to = rs.CDS_to; 
               } else if (!count) {
                   line = strstr(h2, "CDS=");
                   if (rs.S) {
                       if (line) fprintf(stderr, "Warning: Command line CDS specification overrides header CDS specification.");
                       cds_from = rs.CDS_from; cds_to = rs.CDS_to;
                   } else if (line) {
                       cds_range(line+4, &cds_from, &cds_to); 
                   }
               } else if (count) {
                   line = strstr(h2, "CDS=");
                   if (line) {
                       cds_range(line+4, &cds_from, &cds_to);
                    } else {
                       cds_from = cds_to = 0;
                    }
               }
               if (cds_to>len2) fatal("CDS endpoints exceed sequence length.");
           }

           if (rs.poly_flag && file_type==GEN_EST)  {
               get_polyAT(seq2, len2, &pT, &pA, BOTH_AT);
           }

           ++count; 
           init_stats(&st); init_stats(&rev_st);
           in_K = (rs.set_K==TRUE) ? rs.K:-1;
           in_C = (rs.set_C==TRUE) ? rs.C:-1;
           in_H = (rs.set_H==TRUE) ? rs.weight:-1;

           switch (rs.reverse) {
             case  0: Aligns = (file_type==EST_GEN) ?
                      SIM4(seq2,seq1+pT,len2,len1-pT-pA,rs.W,rs.X,in_K,in_C,
                           in_H,&dist,&xpT,&xpA,&Exons,&st):
                      SIM4(seq1,seq2+pT,len1,len2-pT-pA,rs.W,rs.X,in_K,in_C,
                           in_H,&dist,&xpT,&xpA,&Exons,&st);
                      break;
              
             case  1: sf2 = seq_revcomp_inplace(sf2);
                      seq2 = SEQ_CHARS(sf2);
                      rev_Aligns = (file_type==EST_GEN) ?
                      SIM4(seq2,seq1+pT,len2,len1-pT-pA,rs.W,rs.X,in_K,in_C,
                           in_H,&dist,&rev_xpT,&rev_xpA,&rev_Exons,&rev_st):
                      SIM4(seq1,seq2+pA,len1,len2-pT-pA,rs.W,rs.X,in_K,in_C,
                           in_H,&dist,&rev_xpT,&rev_xpA,&rev_Exons,&rev_st);
                      break; 
              
             case  2: Aligns = (file_type==EST_GEN) ?
                      SIM4(seq2,seq1+pT,len2,len1-pT-pA,rs.W,rs.X,in_K,in_C,
                           in_H,&dist,&xpT,&xpA,&Exons,&st):
                      SIM4(seq1,seq2+pT,len1,len2-pT-pA,rs.W,rs.X,in_K,in_C,
                           in_H,&dist,&xpT,&xpA,&Exons,&st);

                      sf2 = seq_revcomp_inplace(sf2);
                      seq2 = SEQ_CHARS(sf2);
                      rev_Aligns = (file_type==EST_GEN) ?
                      SIM4(seq2,seq1+pT,len2,len1-pT-pA,rs.W,rs.X,in_K,in_C,
                           in_H,&dist,&rev_xpT,&rev_xpA,&rev_Exons,&rev_st):
                      SIM4(seq1,seq2+pA,len1,len2-pT-pA,rs.W,rs.X,in_K,in_C,
                           in_H,&dist,&rev_xpT,&rev_xpA,&rev_Exons,&rev_st);
                      break;
                               
              default: fatal ("Unrecognized request for EST orientation.");
           }
              
           if (st.nmatches>=rev_st.nmatches) {
               /* forward ('+') strand match */
               match_ori = FWD;
               if (rs.reverse && rs.ali_flag) {
                   /* reverse-complement back seq2 for alignment */
                   sf2 = seq_revcomp_inplace(sf2); seq2 = SEQ_CHARS(sf2);
               }
               if (rev_Exons) { free_list(rev_Exons); rev_Exons = NULL; }
               if (rev_Aligns) { free_align(rev_Aligns); rev_Aligns = NULL; }
           } else {            
               match_ori = BWD;
               if (Exons) { free_list(Exons);  Exons = NULL; }
               if (Aligns) { free_align(Aligns); Aligns = NULL; }

GH/Sim4/sim4.2002-03-03/sim4.init.c  view on Meta::CPAN

                if (args->weight<0)
                        fatal("Positive number required for H.");
                args->set_H = TRUE;
        } else {
            /*  args->weight = DEFAULT_WEIGHT; */
                args->set_H = FALSE;
        }
         
                        
/*      
        if (get_fargval('v',&V)) {
                if ((V<.7) || (V>1.0))
                        fatal("V must be in the range [0.7,1.0].");
        } else  
                V = DEFAULT_MIN_COV;
*/      
                
        if (get_argval('W', &(args->W))) {
                if (args->W < 1)
                        fatal("W must be positive.");
                if (args->W > 15)
                        fatal("W must be <= 15.");
        } else  
                args->W = DEFAULT_W;
        
        if (get_argval('X', &(args->X))) {
                if (args->X < 1)
                        fatal("X must be positive.");
        } else  
                args->X = DEFAULT_X;
        
        if (get_argval('K',&(args->K))) {
                if (args->K<0) fatal("K must be positive."); 
                args->set_K = TRUE;
        } else {
                args->K = DEFAULT_K;
                args->set_K = FALSE;
        }

        if (get_argval('C',&(args->C))) { 
                if (args->C<0) fatal("C must be positive.");                    
                args->set_C = TRUE;
        } else {
                args->C = DEFAULT_C;
                args->set_C = FALSE;
        }
        
        if (!get_argval('N', &(args->acc_flag)))
             args->acc_flag = 0;
                
        if (get_argval('B', &(args->B))) {
                 if (args->B && (args->B!=1))
                         fatal("B must be either 0 or 1.");
        } else  
                args->B = 1;
        
        if (get_strargval('S', &(args->S))) {
            cds_range(args->S, &(args->CDS_from), &(args->CDS_to));
            if ((args->CDS_from<=0) || (args->CDS_to<=0) || 
                (args->CDS_from>args->CDS_to))
                fatal("Illegal endpoints for the CDS region.");
        } else 
                args->S = NULL;

        if (args->S && (args->ali_flag!=5))
           fatal ("A=5 must accompany CDS specification.");

        return;
}

/* extract the CDS endpoints from the command line specification <n1>..<n2> */
static void cds_range(char *line, int *from, int *to)
{
     char *s = line;

     if (line == NULL) fatal ("NULL CDS specification.");

     if (!isdigit((int)(*s))) 
         fatal("Non-numerical value in the CDS specification."); 
     while (*s && isdigit((int)(*s))) s++;
     if (*s!='.') fatal ("Illegal CDS specification."); s++;
     if (*s!='.') fatal ("Illegal CDS specification."); s++;
     if (!isdigit((int)(*s))) 
         fatal ("Non-numerical value in the CDS specification.");
     while (*s && isdigit((int)(*s))) s++;   
     if (*s && !isspace((int)(*s))) 
        fatal ("Garbage at the end of the CDS numerical specification."); 
     
     /* now extract the CDS elements */
     if (sscanf(line, "%d..%d", from, to)!=2) 
         fatal ("Error when reading the CDS endpoints.");

     return;
}

static void add_offset_exons(Exon *exons, int offset)
{
    Exon *t;

    if (!offset || !(exons)) return;
 
    t = exons;
    while (t) {
       if (t->to1) { t->from2 += offset; t->to2 += offset; }
       t = t->next_exon;
    }
}

static void add_offset_aligns(edit_script_list *aligns, int offset)
{
    edit_script_list *head;
                 
    if (!offset || !aligns) return;
             
    head = aligns;
    while (head) { head->offset2 += offset; head = head->next_script; }
    
    return;
}

static char *extract_tok(char *h2)
{
    char *s, *tmp, *q;

    if ((h2==NULL) || (*h2=='\0')) return NULL;

    if (*h2!='>') fatal("Not a FASTA header.");
    s = h2+1; while (isspace((int)(*s)) && *s!='\n') s++;
    if (*s=='\n') return NULL;
    q = s; while (*s && !isspace((int)(*s))) s++; 
    tmp = (char *) ckalloc((unsigned int)(s-q+1));
    strncpy(tmp, q, (int)(s-q));
    tmp[s-q] = '\0'; 

    return tmp;
}


/*  ----------  utilities for collecting and reporting statistics ---------- */

static void init_stats(sim4_stats_t *st)
{
       (void)memset(st,0,sizeof(sim4_stats_t));
}

#ifdef _STATS
static void print_exon_stats(Exon *exons, char *seq_name, int len)
{
       FILE *xp;
       Exon *t;



( run in 1.146 second using v1.01-cache-2.11-cpan-39bf76dae61 )