* bugfix in index: large memory when a read pos is 1

Heng Li · Heng Li · commit cb5f44947629 · 2011-08-17T18:26:22.000Z
* bugfix in BAQ: incorrect probability (rare)
 * compute variant distance bias (commited by Petr)
 * bugfix for computing LRT2: LRT2=nan
diff --git a/bam2bcf.c b/bam2bcf.c
@@ -39,6 +39,7 @@ void bcf_call_destroy(bcf_callaux_t *bca)
  * negative if we are looking at an indel. */
 int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t *bca, bcf_callret1_t *r)
 {
+    static int *var_pos = NULL, nvar_pos = 0;
 	int i, n, ref4, is_indel, ori_depth = 0;
 	memset(r, 0, sizeof(bcf_callret1_t));
 	if (ref_base >= 0) {
@@ -94,9 +95,92 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t
 	r->depth = n; r->ori_depth = ori_depth;
 	// glfgen
 	errmod_cal(bca->e, n, 5, bca->bases, r->p);
+
+    // Calculate the Variant Distance Bias (make it optional?)
+    if ( nvar_pos < _n ) {
+        nvar_pos = _n;
+        var_pos = realloc(var_pos,sizeof(int)*nvar_pos);
+    }
+    int alt_dp=0, read_len=0;
+    for (i=0; i<_n; i++) {
+        const bam_pileup1_t *p = pl + i;
+        if ( bam1_seqi(bam1_seq(p->b),p->qpos) == ref_base ) 
+            continue;
+
+        var_pos[alt_dp] = p->qpos;
+        if ( (bam1_cigar(p->b)[0]&BAM_CIGAR_MASK)==4 )
+            var_pos[alt_dp] -= bam1_cigar(p->b)[0]>>BAM_CIGAR_SHIFT;
+
+        alt_dp++;
+        read_len += p->b->core.l_qseq;
+    }
+    float mvd=0;
+    int j;
+    n=0;
+    for (i=0; i<alt_dp; i++) {
+        for (j=0; j<i; j++) {
+            mvd += abs(var_pos[i] - var_pos[j]);
+            n++;
+        }
+    }
+    r->mvd[0] = n ? mvd/n : 0;
+    r->mvd[1] = alt_dp;
+    r->mvd[2] = alt_dp ? read_len/alt_dp : 0;
+
 	return r->depth;
 }
 
+
+void calc_vdb(int n, const bcf_callret1_t *calls, bcf_call_t *call)
+{
+    // Variant distance bias. Samples merged by means of DP-weighted average.
+
+    float weight=0, tot_prob=0;
+
+    int i;
+    for (i=0; i<n; i++)
+    {
+        int mvd      = calls[i].mvd[0];
+        int dp       = calls[i].mvd[1];
+        int read_len = calls[i].mvd[2];
+
+        if ( dp<2 ) continue;
+
+        float prob = 0;
+        if ( dp==2 )
+        {
+            // Exact formula
+            prob = (mvd==0) ? 1.0/read_len : (read_len-mvd)*2.0/read_len/read_len;
+        }
+        else if ( dp==3 )
+        {
+            // Sin, quite accurate approximation
+            float mu = read_len/2.9;
+            prob = mvd>2*mu ? 0 : sin(mvd*3.14/2/mu) / (4*mu/3.14);
+        }
+        else
+        {
+            // Scaled gaussian curve, crude approximation, but behaves well. Using fixed depth for bigger depths.
+            if ( dp>5 )
+                dp = 5;
+            float sigma2 = (read_len/1.9/(dp+1)) * (read_len/1.9/(dp+1));
+            float norm   = 1.125*sqrt(2*3.14*sigma2);
+            float mu     = read_len/2.9;
+            if ( mvd < mu )
+                prob = exp(-(mvd-mu)*(mvd-mu)/2/sigma2)/norm;
+            else
+                prob = exp(-(mvd-mu)*(mvd-mu)/3.125/sigma2)/norm;
+        }
+
+        //fprintf(stderr,"dp=%d mvd=%d read_len=%d -> prob=%f\n", dp,mvd,read_len,prob);
+        tot_prob += prob*dp;
+        weight += dp;
+    }
+    tot_prob = weight ? tot_prob/weight : 1; 
+    //fprintf(stderr,"prob=%f\n", tot_prob);
+    call->vdb = tot_prob;
+}
+
 int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/, bcf_call_t *call)
 {
 	int ref4, i, j, qsum[4];
@@ -170,6 +254,9 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/,
 		call->ori_depth += calls[i].ori_depth;
 		for (j = 0; j < 16; ++j) call->anno[j] += calls[i].anno[j];
 	}
+
+    calc_vdb(n, calls, call);
+
 	return 0;
 }
 
@@ -223,6 +310,10 @@ int bcf_call2bcf(int tid, int pos, bcf_call_t *bc, bcf1_t *b, bcf_callret1_t *bc
 		if (i) kputc(',', &s);
 		kputw(bc->anno[i], &s);
 	}
+    if ( bc->vdb!=1 )
+    {
+        ksprintf(&s, ";VDB=%.4f", bc->vdb);
+    }
 	kputc('\0', &s);
 	// FMT
 	kputs("PL", &s);
diff --git a/bam2bcf.h b/bam2bcf.h
@@ -26,13 +26,15 @@ typedef struct {
 	int depth, ori_depth, qsum[4];
 	int anno[16];
 	float p[25];
+    int mvd[3]; // mean variant distance, number of variant reads, average read length
 } bcf_callret1_t;
 
 typedef struct {
 	int a[5]; // alleles: ref, alt, alt2, alt3
 	int n, n_alleles, shift, ori_ref, unseen;
 	int anno[16], depth, ori_depth;
 	uint8_t *PL;
+    float vdb; // variant distance bias
 } bcf_call_t;
 
 #ifdef __cplusplus
diff --git a/bam_index.c b/bam_index.c
@@ -188,7 +188,7 @@ bam_index_t *bam_index_core(bamFile fp)
 					bam1_qname(b), last_coor, c->pos, c->tid+1);
 			return NULL;
 		}
-		if (c->tid >= 0) insert_offset2(&idx->index2[b->core.tid], b, last_off);
+		if (c->tid >= 0 && !(c->flag & BAM_FUNMAP)) insert_offset2(&idx->index2[b->core.tid], b, last_off);
 		if (c->bin != last_bin) { // then possibly write the binning index
 			if (save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens to the first record
 				insert_offset(idx->index[save_tid], save_bin, save_off, last_off);
diff --git a/bcftools/call1.c b/bcftools/call1.c
@@ -131,7 +131,6 @@ static int update_bcf1(bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p1rst_t *pr,
 		if (em[5] >= 0 && em[6] >= 0) ksprintf(&s, ";AF2=%.4g,%.4g", 1 - em[5], 1 - em[6]);
 		if (em[7] >= 0) ksprintf(&s, ";LRT=%.3g", em[7]);
 		if (em[8] >= 0) ksprintf(&s, ";LRT2=%.3g", em[8]);
-		//if (em[9] >= 0) ksprintf(&s, ";LRT1=%.3g", em[9]);
 	}
 	if (cons_llr > 0) {
 		ksprintf(&s, ";CLR=%d", cons_llr);
@@ -274,6 +273,8 @@ static void write_header(bcf_hdr_t *h)
         kputs("##INFO=<ID=QCHI2,Number=1,Type=Integer,Description=\"Phred scaled PCHI2.\">\n", &str);
     if (!strstr(str.s, "##INFO=<ID=RP,"))
         kputs("##INFO=<ID=PR,Number=1,Type=Integer,Description=\"# permutations yielding a smaller PCHI2.\">\n", &str);
+    if (!strstr(str.s, "##INFO=<ID=VDB,"))
+        kputs("##INFO=<ID=VDB,Number=1,Type=Float,Description=\"Variant Distance Bias\">\n", &str);
     if (!strstr(str.s, "##FORMAT=<ID=GT,"))
         kputs("##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n", &str);
     if (!strstr(str.s, "##FORMAT=<ID=GQ,"))
@@ -285,7 +286,7 @@ static void write_header(bcf_hdr_t *h)
 	if (!strstr(str.s, "##FORMAT=<ID=SP,"))
 		kputs("##FORMAT=<ID=SP,Number=1,Type=Integer,Description=\"Phred-scaled strand bias P-value\">\n", &str);
 	if (!strstr(str.s, "##FORMAT=<ID=PL,"))
-		kputs("##FORMAT=<ID=PL,Number=-1,Type=Integer,Description=\"List of Phred-scaled genotype likelihoods, number of values is (#ALT+1)*(#ALT+2)/2\">\n", &str);
+		kputs("##FORMAT=<ID=PL,Number=G,Type=Integer,Description=\"List of Phred-scaled genotype likelihoods\">\n", &str);
 	h->l_txt = str.l + 1; h->txt = str.s;
 }
 
diff --git a/bcftools/em.c b/bcftools/em.c
@@ -168,7 +168,6 @@ static double lk_ratio_test(int n, int n1, const double *pdg, double f3[3][3])
 // x[5..6]: group1 freq, group2 freq
 // x[7]: 1-degree P-value
 // x[8]: 2-degree P-value
-// x[9]: 1-degree P-value with freq estimated from genotypes
 int bcf_em1(const bcf1_t *b, int n1, int flag, double x[10])
 {
 	double *pdg;
@@ -208,11 +207,13 @@ int bcf_em1(const bcf1_t *b, int n1, int flag, double x[10])
 		x[6] = freqml(x[0], n1, n, pdg);
 	}
 	if ((flag & 1<<7) && n1 > 0 && n1 < n) { // 1-degree P-value
-		double f[3], f3[3][3];
+		double f[3], f3[3][3], tmp;
 		f[0] = x[0]; f[1] = x[5]; f[2] = x[6];
 		for (i = 0; i < 3; ++i)
 			f3[i][0] = (1-f[i])*(1-f[i]), f3[i][1] = 2*f[i]*(1-f[i]), f3[i][2] = f[i]*f[i];
-		x[7] = kf_gammaq(.5, log(lk_ratio_test(n, n1, pdg, f3)));
+		tmp = log(lk_ratio_test(n, n1, pdg, f3));
+		if (tmp < 0) tmp = 0;
+		x[7] = kf_gammaq(.5, tmp);
 	}
 	if ((flag & 3<<8) && n1 > 0 && n1 < n) { // 2-degree P-value
 		double g[3][3], tmp;
@@ -222,8 +223,8 @@ int bcf_em1(const bcf1_t *b, int n1, int flag, double x[10])
 		for (i = 0; i < ITER_MAX; ++i)
 			if (g3_iter(g[2], pdg, n1, n) < EPS) break;
 		tmp = log(lk_ratio_test(n, n1, pdg, g));
+		if (tmp < 0) tmp = 0;
 		x[8] = kf_gammaq(1., tmp);
-		x[9] = kf_gammaq(.5, tmp);
 	}
 	// free
 	free(pdg);
diff --git a/kprobaln.c b/kprobaln.c
@@ -161,7 +161,7 @@ int kpa_glocal(const uint8_t *_ref, int l_ref, const uint8_t *_query, int l_quer
 		double p = 1., Pr1 = 0.;
 		for (i = 0; i <= l_query + 1; ++i) {
 			p *= s[i];
-			if (p < 1e-100) Pr += -4.343 * log(p), p = 1.;
+			if (p < 1e-100) Pr1 += -4.343 * log(p), p = 1.;
 		}
 		Pr1 += -4.343 * log(p * l_ref * l_query);
 		Pr = (int)(Pr1 + .499);
diff --git a/sample.c b/sample.c
@@ -52,14 +52,15 @@ static void add_pair(bam_sample_t *sm, khash_t(sm) *sm2id, const char *key, cons
 int bam_smpl_add(bam_sample_t *sm, const char *fn, const char *txt)
 {
 	const char *p = txt, *q, *r;
-	kstring_t buf;
+	kstring_t buf, first_sm;
 	int n = 0;
 	khash_t(sm) *sm2id = (khash_t(sm)*)sm->sm2id;
 	if (txt == 0) {
 		add_pair(sm, sm2id, fn, fn);
 		return 0;
 	}
 	memset(&buf, 0, sizeof(kstring_t));
+    memset(&first_sm, 0, sizeof(kstring_t));
 	while ((q = strstr(p, "@RG")) != 0) {
 		p = q + 3;
 		r = q = 0;
@@ -73,12 +74,21 @@ int bam_smpl_add(bam_sample_t *sm, const char *fn, const char *txt)
 			oq = *u; or = *v; *u = *v = '\0';
 			buf.l = 0; kputs(fn, &buf); kputc('/', &buf); kputs(q, &buf);
 			add_pair(sm, sm2id, buf.s, r);
+            if ( !first_sm.s )
+                kputs(r,&first_sm); 
 			*u = oq; *v = or;
 		} else break;
 		p = q > r? q : r;
 		++n;
 	}
 	if (n == 0) add_pair(sm, sm2id, fn, fn);
+    // If there is only one RG tag present in the header and reads are not annotated, don't refuse to work but
+    //  use the tag instead.
+    else if ( n==1 && first_sm.s )
+        add_pair(sm,sm2id,fn,first_sm.s);
+    if ( first_sm.s )
+        free(first_sm.s);
+
 //	add_pair(sm, sm2id, fn, fn);
 	free(buf.s);
 	return 0;

Original file line number	Diff line number	Diff line change
`@@ -188,7 +188,7 @@ bam_index_t *bam_index_core(bamFile fp)`
`188`	`188`	`bam1_qname(b), last_coor, c->pos, c->tid+1);`
`189`	`189`	`return NULL;`
`190`	`190`	`}`
`191`		`- if (c->tid >= 0) insert_offset2(&idx->index2[b->core.tid], b, last_off);`
	`191`	`+ if (c->tid >= 0 && !(c->flag & BAM_FUNMAP)) insert_offset2(&idx->index2[b->core.tid], b, last_off);`
`192`	`192`	`if (c->bin != last_bin) { // then possibly write the binning index`
`193`	`193`	`if (save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens to the first record`
`194`	`194`	`insert_offset(idx->index[save_tid], save_bin, save_off, last_off);`
Original file line number	Diff line number	Diff line change
`@@ -161,7 +161,7 @@ int kpa_glocal(const uint8_t _ref, int l_ref, const uint8_t _query, int l_quer`
`161`	`161`	`double p = 1., Pr1 = 0.;`
`162`	`162`	`for (i = 0; i <= l_query + 1; ++i) {`
`163`	`163`	`p *= s[i];`
`164`		`- if (p < 1e-100) Pr += -4.343 * log(p), p = 1.;`
	`164`	`+ if (p < 1e-100) Pr1 += -4.343 * log(p), p = 1.;`
`165`	`165`	`}`
`166`	`166`	`Pr1 += -4.343 * log(p * l_ref * l_query);`
`167`	`167`	`Pr = (int)(Pr1 + .499);`