GH
view release on metacpan or search on metacpan
GH/Sim4/sim4.2002-03-03/sim4.init.c view on Meta::CPAN
/*
* sim4 - Align a cDNA sequence with a genomic sequence for it.
*
* The basic command syntax is
*
* sim4 seq1 seq2 [[AXWRKCDHEPNBS]=]
*
* where sequence1 and sequence2 name files containing DNA sequences; a name
* like "foo[300,400]" refers to sequence entries 300-400 in file "foo". The
* files are to be in FASTA format. Thus a typical sequence file might begin:
*
* >BOVHBPP3I Bovine beta-globin psi-3 pseudogene, 5' end.
* GGAGAATAAAGTTTCTGAGTCTAGACACACTGGATCAGCCAATCACAGATGAAGGGCACT
* GAGGAACAGGAGTGCATCTTACATTCCCCCAAACCAATGAACTTGTATTATGCCCTGGGC
*
* Typically, sequence1 name file contains one DNA sequence, and sequence2
* name file contains a database of sequences in FASTA format. Sim4 can
* compare a genomic sequence with a database of cDNAs or a cDNA sequence
* with a genomic database. If highly accurate sequences are being compared,
* specify option N=1.
*
* The command permits optional additional arguments, e.g. "A=1" and "W=8",
* that reset certain internal parameters. The available parameters are:
*
* W gives the word size.
* X gives the value for terminating word extensions.
* K gives the MSP score threshold for the first pass.
* C gives the MSP score threshold for the second pass.
* R direction of search; 0 - search the '+' strand only;
* 1 - search the '-' strand only; 2 - search both strands and
* report the best match. (R=2)
* D adjusts the range of diagonals in building the exons.
* H adjusts the re-linking weight factor
* A specifies the output format: exon endpoints only (A=0),
* alignment text (A=1), alignment in lav format (A=2) or both
* exon endpoints and alignment text (A=3, A=4). For A=3, positions
* in sequence 1 are given in the original sequence, and for A=4 in
* its reverse complement. A=5 prints the exon and CDS coordinates
* (the latter, if known) in the `exon file' format required by PipMaker.
* N if !=0, highly accurate exon detection is expected, for highly
* accurate sequence data.
* P remove polyA tails; if match on complementary strand, change
* coordinates in sequence 1 according to the '+' strand and print
* #lav alignment header for all alignment options.
* B control the presence of ambiguity codes in sequence data. If
* 1 (default), allow ambiguity codes (ABCDGKMNRCTVWXY); if 0,
* restrict the set of accepted characters to ACGTNX.
* S specifies a known coding region (to be used only with A=5, and for
* comparing full-mRNA sequences).
*/
#include "psublast.h"
#include "sim4.h"
#include "align.h"
#include "poly.h"
#ifndef __lint
/*@unused@*/
static const char rcsid[] =
"$Id: sim4.init.c,v 1.1 2002/12/03 20:12:37 hartzell Exp $";
#endif
static void init_stats(sim4_stats_t *);
static void sim4_argvals(sim4_args_t *);
static void cds_range(char *,int *,int *);
static char *extract_tok(char *);
static void add_offset_exons(Exon *,int);
static void add_offset_aligns(edit_script_list *,int);
static void print_align_blk(uchar *,uchar *,int,int,edit_script_list **,int,int);
static void print_align_lat(uchar *,uchar *,int,int,edit_script_list **,Exon *,int,int);
static const char Usage[] =
"%s seq1 seq2_db [[WXKCRDHAPNBS]=]\n\n\
W - word size. (W=12)\n\
X - value for terminating word extensions. (X=12)\n\
K - MSP score threshold for the first pass. (e.g., K=16)\n\
C - MSP score threshold for the second pass. (e.g., C=12)\n\
R - direction of search; 0 - search the '+' (direct) strand only; \n\
1 - search the '-' strand only; 2 - search both strands and \n\
report the best match. (R=2)\n\
D - bound for the range of diagonals within consecutive msps in an\n\
exon. (D=10)\n\
H - weight factor for MSP scores in relinking. (H=500)\n\
A - output format: exon endpoints only (A=0), alignment text (A=1),\n\
alignment in lav (block) format (A=2), or both exon endpoints\n\
and alignment text (A=3, A=4). If complement match, A=0,1,2,3\n\
give direct positions in the long sequence and complement \n\
positions in the short sequence. A=4 gives direct positions in \n\
the first sequence, regardless of the relative lengths.\n\
A=5 prints the exon and CDS coordinates (the latter, if known)\n\
in the `exon file' format required by PipMaker. To be used\n\
with full-length mRNA sequences.\n\
P - if not 0, remove poly-A tails; report coordinates in the \n\
'+' (direct) strand for complement matches; use lav alignment \n\
headers in all display options. (P=0) \n\
N - accuracy of sequences (non-zero for highly accurate). (N=0)\n\
B - if 0, dis-allow ambiguity codes (other than N and X) in the\n\
sequence data. (B=1)\n\
S - coding region specification (available only with A=5);\n\
format: S=n1..n2\n";
#ifdef _STATS
static void print_stats(sim4_stats_t, char *, int);
static void print_exon_stats(Exon *, char *, int);
#endif
#ifdef DEBUG
static void polystats(int,int,int,int,int,int,int,int);
#endif
#include "sim4b1.h"
int main(int argc, char *argv[])
{
uchar *revseq1=NULL;
int len1, len2, count, dist, match_ori, in_K, in_C, in_H;
int pA, pT, xpT, xpA, rev_xpT, rev_xpA;
int cds_from, cds_to;
uchar *seq1, *seq2;
char *h2, *h1, *cds_gene=NULL, *line;
SEQ *sf1, *sf2, *rf1=NULL;
argv_scores_t ds;
ss_t ss;
Exon *Exons=NULL, *rev_Exons=NULL;
edit_script_list *Aligns=NULL, *rev_Aligns=NULL;
if (argc < 3) fatalf(Usage, argv[0]);
ckargs("AXWRKCDHEPNBST", argc, argv, 2);
sim4_argvals(&rs);
DNA_scores(&ds, ss);
/* read seq1 */
sf1 = seq_open(argv[1], 0, (rs.B ? SEQ_ALLOW_AMB : 0));
if (!seq_read(sf1)) fatalf("Cannot read sequence from %s.", argv[1]);
seq1 = SEQ_CHARS(sf1);
len1 = SEQ_LEN(sf1);
h1 = SEQ_HEAD(sf1);
if (!is_DNA(seq1, len1))
fatal("The first sequence is not a DNA sequence.");
seq_toupper(seq1, len1, argv[1]);
/* read seq2 */
sf2 = seq_open(argv[2], 0, (rs.B ? SEQ_ALLOW_AMB : 0));
if (!seq_read(sf2)) fatalf("Cannot read sequence from %s.", argv[2]);
seq2 = SEQ_CHARS(sf2);
len2 = SEQ_LEN(sf2);
h2 = SEQ_HEAD(sf2);
if (!is_DNA(seq2, len2))
fatal("The first sequence is not a DNA sequence.");
seq_toupper(seq2, len2, argv[2]);
/* determine the type of comparison */
file_type = (len2<=len1) ? GEN_EST : EST_GEN;
if (file_type== EST_GEN) {
rf1 = seq_copy(sf1);
rf1 = seq_revcomp_inplace(rf1);
revseq1 = SEQ_CHARS(rf1);
if (rs.ali_flag==5) {
if (rs.CDS_to>len1)
fatal("Command line CDS endpoint exceeds sequence length.");
cds_gene = extract_tok(h1);
if (cds_gene==NULL) { /* no FastaA header */
cds_from = rs.CDS_from; cds_to = rs.CDS_to;
} else {
line = strstr(h1, "CDS=");
if (line && rs.S) {
fprintf(stderr, "Warning: Command line CDS specification overrides header CDS specification.");
cds_from = rs.CDS_from; cds_to = rs.CDS_to;
} else if (line) {
cds_range(line+4, &cds_from, &cds_to);
} else if (rs.S) {
cds_from = rs.CDS_from; cds_to = rs.CDS_to;
} else {
cds_from = cds_to = 0;
}
}
if (cds_to>len1)
fatal("CDS endpoints exceed sequence length.");
}
}
if (rs.poly_flag && file_type==EST_GEN) {
get_polyAT(seq1,len1,&pT,&pA,BOTH_AT);
} else pT = pA = 0;
bld_table(seq1-1+pT, len1-pA-pT, rs.W, INIT);
count = 0;
while (!count || (seq_read(sf2)>0)) {
sim4_stats_t st, rev_st; char *tok;
if (count) { /* skip the first seq2, already in memory */
h2 = SEQ_HEAD(sf2);
seq2 = SEQ_CHARS(sf2);
len2 = SEQ_LEN(sf2);
tok = extract_tok(h2);
if (!is_DNA(seq2, len2)) {
char tmp[200];
(void)sprintf(tmp,"%s sequence is not a DNA sequence.", tok);
perror(tmp); continue;
}
seq_toupper(seq2, len2, argv[2]);
} else { /* first sequence in the file, seq2 is already in memory */
tok = extract_tok(h2);
if (tok==NULL) {
tok = ckalloc(strlen("(no header)")+1);
strcpy(tok, "(no header)");
}
}
if ((rs.ali_flag==5) && (file_type==GEN_EST)) {
cds_gene = tok;
if (!cds_gene && !count) {
cds_from = rs.CDS_from; cds_to = rs.CDS_to;
} else if (!count) {
line = strstr(h2, "CDS=");
if (rs.S) {
if (line) fprintf(stderr, "Warning: Command line CDS specification overrides header CDS specification.");
cds_from = rs.CDS_from; cds_to = rs.CDS_to;
} else if (line) {
cds_range(line+4, &cds_from, &cds_to);
}
} else if (count) {
line = strstr(h2, "CDS=");
if (line) {
cds_range(line+4, &cds_from, &cds_to);
} else {
cds_from = cds_to = 0;
}
}
if (cds_to>len2) fatal("CDS endpoints exceed sequence length.");
}
if (rs.poly_flag && file_type==GEN_EST) {
get_polyAT(seq2, len2, &pT, &pA, BOTH_AT);
}
++count;
init_stats(&st); init_stats(&rev_st);
in_K = (rs.set_K==TRUE) ? rs.K:-1;
in_C = (rs.set_C==TRUE) ? rs.C:-1;
in_H = (rs.set_H==TRUE) ? rs.weight:-1;
switch (rs.reverse) {
case 0: Aligns = (file_type==EST_GEN) ?
SIM4(seq2,seq1+pT,len2,len1-pT-pA,rs.W,rs.X,in_K,in_C,
in_H,&dist,&xpT,&xpA,&Exons,&st):
SIM4(seq1,seq2+pT,len1,len2-pT-pA,rs.W,rs.X,in_K,in_C,
in_H,&dist,&xpT,&xpA,&Exons,&st);
break;
case 1: sf2 = seq_revcomp_inplace(sf2);
seq2 = SEQ_CHARS(sf2);
rev_Aligns = (file_type==EST_GEN) ?
SIM4(seq2,seq1+pT,len2,len1-pT-pA,rs.W,rs.X,in_K,in_C,
in_H,&dist,&rev_xpT,&rev_xpA,&rev_Exons,&rev_st):
SIM4(seq1,seq2+pA,len1,len2-pT-pA,rs.W,rs.X,in_K,in_C,
in_H,&dist,&rev_xpT,&rev_xpA,&rev_Exons,&rev_st);
break;
case 2: Aligns = (file_type==EST_GEN) ?
SIM4(seq2,seq1+pT,len2,len1-pT-pA,rs.W,rs.X,in_K,in_C,
in_H,&dist,&xpT,&xpA,&Exons,&st):
SIM4(seq1,seq2+pT,len1,len2-pT-pA,rs.W,rs.X,in_K,in_C,
in_H,&dist,&xpT,&xpA,&Exons,&st);
sf2 = seq_revcomp_inplace(sf2);
seq2 = SEQ_CHARS(sf2);
rev_Aligns = (file_type==EST_GEN) ?
SIM4(seq2,seq1+pT,len2,len1-pT-pA,rs.W,rs.X,in_K,in_C,
in_H,&dist,&rev_xpT,&rev_xpA,&rev_Exons,&rev_st):
SIM4(seq1,seq2+pA,len1,len2-pT-pA,rs.W,rs.X,in_K,in_C,
in_H,&dist,&rev_xpT,&rev_xpA,&rev_Exons,&rev_st);
break;
default: fatal ("Unrecognized request for EST orientation.");
}
if (st.nmatches>=rev_st.nmatches) {
/* forward ('+') strand match */
match_ori = FWD;
if (rs.reverse && rs.ali_flag) {
/* reverse-complement back seq2 for alignment */
sf2 = seq_revcomp_inplace(sf2); seq2 = SEQ_CHARS(sf2);
}
if (rev_Exons) { free_list(rev_Exons); rev_Exons = NULL; }
if (rev_Aligns) { free_align(rev_Aligns); rev_Aligns = NULL; }
} else {
match_ori = BWD;
if (Exons) { free_list(Exons); Exons = NULL; }
if (Aligns) { free_align(Aligns); Aligns = NULL; }
GH/Sim4/sim4.2002-03-03/sim4.init.c view on Meta::CPAN
if (args->weight<0)
fatal("Positive number required for H.");
args->set_H = TRUE;
} else {
/* args->weight = DEFAULT_WEIGHT; */
args->set_H = FALSE;
}
/*
if (get_fargval('v',&V)) {
if ((V<.7) || (V>1.0))
fatal("V must be in the range [0.7,1.0].");
} else
V = DEFAULT_MIN_COV;
*/
if (get_argval('W', &(args->W))) {
if (args->W < 1)
fatal("W must be positive.");
if (args->W > 15)
fatal("W must be <= 15.");
} else
args->W = DEFAULT_W;
if (get_argval('X', &(args->X))) {
if (args->X < 1)
fatal("X must be positive.");
} else
args->X = DEFAULT_X;
if (get_argval('K',&(args->K))) {
if (args->K<0) fatal("K must be positive.");
args->set_K = TRUE;
} else {
args->K = DEFAULT_K;
args->set_K = FALSE;
}
if (get_argval('C',&(args->C))) {
if (args->C<0) fatal("C must be positive.");
args->set_C = TRUE;
} else {
args->C = DEFAULT_C;
args->set_C = FALSE;
}
if (!get_argval('N', &(args->acc_flag)))
args->acc_flag = 0;
if (get_argval('B', &(args->B))) {
if (args->B && (args->B!=1))
fatal("B must be either 0 or 1.");
} else
args->B = 1;
if (get_strargval('S', &(args->S))) {
cds_range(args->S, &(args->CDS_from), &(args->CDS_to));
if ((args->CDS_from<=0) || (args->CDS_to<=0) ||
(args->CDS_from>args->CDS_to))
fatal("Illegal endpoints for the CDS region.");
} else
args->S = NULL;
if (args->S && (args->ali_flag!=5))
fatal ("A=5 must accompany CDS specification.");
return;
}
/* extract the CDS endpoints from the command line specification <n1>..<n2> */
static void cds_range(char *line, int *from, int *to)
{
char *s = line;
if (line == NULL) fatal ("NULL CDS specification.");
if (!isdigit((int)(*s)))
fatal("Non-numerical value in the CDS specification.");
while (*s && isdigit((int)(*s))) s++;
if (*s!='.') fatal ("Illegal CDS specification."); s++;
if (*s!='.') fatal ("Illegal CDS specification."); s++;
if (!isdigit((int)(*s)))
fatal ("Non-numerical value in the CDS specification.");
while (*s && isdigit((int)(*s))) s++;
if (*s && !isspace((int)(*s)))
fatal ("Garbage at the end of the CDS numerical specification.");
/* now extract the CDS elements */
if (sscanf(line, "%d..%d", from, to)!=2)
fatal ("Error when reading the CDS endpoints.");
return;
}
static void add_offset_exons(Exon *exons, int offset)
{
Exon *t;
if (!offset || !(exons)) return;
t = exons;
while (t) {
if (t->to1) { t->from2 += offset; t->to2 += offset; }
t = t->next_exon;
}
}
static void add_offset_aligns(edit_script_list *aligns, int offset)
{
edit_script_list *head;
if (!offset || !aligns) return;
head = aligns;
while (head) { head->offset2 += offset; head = head->next_script; }
return;
}
static char *extract_tok(char *h2)
{
char *s, *tmp, *q;
if ((h2==NULL) || (*h2=='\0')) return NULL;
if (*h2!='>') fatal("Not a FASTA header.");
s = h2+1; while (isspace((int)(*s)) && *s!='\n') s++;
if (*s=='\n') return NULL;
q = s; while (*s && !isspace((int)(*s))) s++;
tmp = (char *) ckalloc((unsigned int)(s-q+1));
strncpy(tmp, q, (int)(s-q));
tmp[s-q] = '\0';
return tmp;
}
/* ---------- utilities for collecting and reporting statistics ---------- */
static void init_stats(sim4_stats_t *st)
{
(void)memset(st,0,sizeof(sim4_stats_t));
}
#ifdef _STATS
static void print_exon_stats(Exon *exons, char *seq_name, int len)
{
FILE *xp;
Exon *t;
( run in 1.146 second using v1.01-cache-2.11-cpan-39bf76dae61 )