Skip to content

Commit cb5f449

Browse files
author
Heng Li
committed
* bugfix in index: large memory when a read pos is 1
* bugfix in BAQ: incorrect probability (rare) * compute variant distance bias (commited by Petr) * bugfix for computing LRT2: LRT2=nan
1 parent 2c05c1d commit cb5f449

File tree

7 files changed

+114
-9
lines changed

7 files changed

+114
-9
lines changed

bam2bcf.c

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ void bcf_call_destroy(bcf_callaux_t *bca)
3939
* negative if we are looking at an indel. */
4040
int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t *bca, bcf_callret1_t *r)
4141
{
42+
static int *var_pos = NULL, nvar_pos = 0;
4243
int i, n, ref4, is_indel, ori_depth = 0;
4344
memset(r, 0, sizeof(bcf_callret1_t));
4445
if (ref_base >= 0) {
@@ -94,9 +95,92 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t
9495
r->depth = n; r->ori_depth = ori_depth;
9596
// glfgen
9697
errmod_cal(bca->e, n, 5, bca->bases, r->p);
98+
99+
// Calculate the Variant Distance Bias (make it optional?)
100+
if ( nvar_pos < _n ) {
101+
nvar_pos = _n;
102+
var_pos = realloc(var_pos,sizeof(int)*nvar_pos);
103+
}
104+
int alt_dp=0, read_len=0;
105+
for (i=0; i<_n; i++) {
106+
const bam_pileup1_t *p = pl + i;
107+
if ( bam1_seqi(bam1_seq(p->b),p->qpos) == ref_base )
108+
continue;
109+
110+
var_pos[alt_dp] = p->qpos;
111+
if ( (bam1_cigar(p->b)[0]&BAM_CIGAR_MASK)==4 )
112+
var_pos[alt_dp] -= bam1_cigar(p->b)[0]>>BAM_CIGAR_SHIFT;
113+
114+
alt_dp++;
115+
read_len += p->b->core.l_qseq;
116+
}
117+
float mvd=0;
118+
int j;
119+
n=0;
120+
for (i=0; i<alt_dp; i++) {
121+
for (j=0; j<i; j++) {
122+
mvd += abs(var_pos[i] - var_pos[j]);
123+
n++;
124+
}
125+
}
126+
r->mvd[0] = n ? mvd/n : 0;
127+
r->mvd[1] = alt_dp;
128+
r->mvd[2] = alt_dp ? read_len/alt_dp : 0;
129+
97130
return r->depth;
98131
}
99132

133+
134+
void calc_vdb(int n, const bcf_callret1_t *calls, bcf_call_t *call)
135+
{
136+
// Variant distance bias. Samples merged by means of DP-weighted average.
137+
138+
float weight=0, tot_prob=0;
139+
140+
int i;
141+
for (i=0; i<n; i++)
142+
{
143+
int mvd = calls[i].mvd[0];
144+
int dp = calls[i].mvd[1];
145+
int read_len = calls[i].mvd[2];
146+
147+
if ( dp<2 ) continue;
148+
149+
float prob = 0;
150+
if ( dp==2 )
151+
{
152+
// Exact formula
153+
prob = (mvd==0) ? 1.0/read_len : (read_len-mvd)*2.0/read_len/read_len;
154+
}
155+
else if ( dp==3 )
156+
{
157+
// Sin, quite accurate approximation
158+
float mu = read_len/2.9;
159+
prob = mvd>2*mu ? 0 : sin(mvd*3.14/2/mu) / (4*mu/3.14);
160+
}
161+
else
162+
{
163+
// Scaled gaussian curve, crude approximation, but behaves well. Using fixed depth for bigger depths.
164+
if ( dp>5 )
165+
dp = 5;
166+
float sigma2 = (read_len/1.9/(dp+1)) * (read_len/1.9/(dp+1));
167+
float norm = 1.125*sqrt(2*3.14*sigma2);
168+
float mu = read_len/2.9;
169+
if ( mvd < mu )
170+
prob = exp(-(mvd-mu)*(mvd-mu)/2/sigma2)/norm;
171+
else
172+
prob = exp(-(mvd-mu)*(mvd-mu)/3.125/sigma2)/norm;
173+
}
174+
175+
//fprintf(stderr,"dp=%d mvd=%d read_len=%d -> prob=%f\n", dp,mvd,read_len,prob);
176+
tot_prob += prob*dp;
177+
weight += dp;
178+
}
179+
tot_prob = weight ? tot_prob/weight : 1;
180+
//fprintf(stderr,"prob=%f\n", tot_prob);
181+
call->vdb = tot_prob;
182+
}
183+
100184
int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/, bcf_call_t *call)
101185
{
102186
int ref4, i, j, qsum[4];
@@ -170,6 +254,9 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/,
170254
call->ori_depth += calls[i].ori_depth;
171255
for (j = 0; j < 16; ++j) call->anno[j] += calls[i].anno[j];
172256
}
257+
258+
calc_vdb(n, calls, call);
259+
173260
return 0;
174261
}
175262

@@ -223,6 +310,10 @@ int bcf_call2bcf(int tid, int pos, bcf_call_t *bc, bcf1_t *b, bcf_callret1_t *bc
223310
if (i) kputc(',', &s);
224311
kputw(bc->anno[i], &s);
225312
}
313+
if ( bc->vdb!=1 )
314+
{
315+
ksprintf(&s, ";VDB=%.4f", bc->vdb);
316+
}
226317
kputc('\0', &s);
227318
// FMT
228319
kputs("PL", &s);

bam2bcf.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,13 +26,15 @@ typedef struct {
2626
int depth, ori_depth, qsum[4];
2727
int anno[16];
2828
float p[25];
29+
int mvd[3]; // mean variant distance, number of variant reads, average read length
2930
} bcf_callret1_t;
3031

3132
typedef struct {
3233
int a[5]; // alleles: ref, alt, alt2, alt3
3334
int n, n_alleles, shift, ori_ref, unseen;
3435
int anno[16], depth, ori_depth;
3536
uint8_t *PL;
37+
float vdb; // variant distance bias
3638
} bcf_call_t;
3739

3840
#ifdef __cplusplus

bam_index.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -188,7 +188,7 @@ bam_index_t *bam_index_core(bamFile fp)
188188
bam1_qname(b), last_coor, c->pos, c->tid+1);
189189
return NULL;
190190
}
191-
if (c->tid >= 0) insert_offset2(&idx->index2[b->core.tid], b, last_off);
191+
if (c->tid >= 0 && !(c->flag & BAM_FUNMAP)) insert_offset2(&idx->index2[b->core.tid], b, last_off);
192192
if (c->bin != last_bin) { // then possibly write the binning index
193193
if (save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens to the first record
194194
insert_offset(idx->index[save_tid], save_bin, save_off, last_off);

bcftools/call1.c

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,6 @@ static int update_bcf1(bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p1rst_t *pr,
131131
if (em[5] >= 0 && em[6] >= 0) ksprintf(&s, ";AF2=%.4g,%.4g", 1 - em[5], 1 - em[6]);
132132
if (em[7] >= 0) ksprintf(&s, ";LRT=%.3g", em[7]);
133133
if (em[8] >= 0) ksprintf(&s, ";LRT2=%.3g", em[8]);
134-
//if (em[9] >= 0) ksprintf(&s, ";LRT1=%.3g", em[9]);
135134
}
136135
if (cons_llr > 0) {
137136
ksprintf(&s, ";CLR=%d", cons_llr);
@@ -274,6 +273,8 @@ static void write_header(bcf_hdr_t *h)
274273
kputs("##INFO=<ID=QCHI2,Number=1,Type=Integer,Description=\"Phred scaled PCHI2.\">\n", &str);
275274
if (!strstr(str.s, "##INFO=<ID=RP,"))
276275
kputs("##INFO=<ID=PR,Number=1,Type=Integer,Description=\"# permutations yielding a smaller PCHI2.\">\n", &str);
276+
if (!strstr(str.s, "##INFO=<ID=VDB,"))
277+
kputs("##INFO=<ID=VDB,Number=1,Type=Float,Description=\"Variant Distance Bias\">\n", &str);
277278
if (!strstr(str.s, "##FORMAT=<ID=GT,"))
278279
kputs("##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n", &str);
279280
if (!strstr(str.s, "##FORMAT=<ID=GQ,"))
@@ -285,7 +286,7 @@ static void write_header(bcf_hdr_t *h)
285286
if (!strstr(str.s, "##FORMAT=<ID=SP,"))
286287
kputs("##FORMAT=<ID=SP,Number=1,Type=Integer,Description=\"Phred-scaled strand bias P-value\">\n", &str);
287288
if (!strstr(str.s, "##FORMAT=<ID=PL,"))
288-
kputs("##FORMAT=<ID=PL,Number=-1,Type=Integer,Description=\"List of Phred-scaled genotype likelihoods, number of values is (#ALT+1)*(#ALT+2)/2\">\n", &str);
289+
kputs("##FORMAT=<ID=PL,Number=G,Type=Integer,Description=\"List of Phred-scaled genotype likelihoods\">\n", &str);
289290
h->l_txt = str.l + 1; h->txt = str.s;
290291
}
291292

bcftools/em.c

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,6 @@ static double lk_ratio_test(int n, int n1, const double *pdg, double f3[3][3])
168168
// x[5..6]: group1 freq, group2 freq
169169
// x[7]: 1-degree P-value
170170
// x[8]: 2-degree P-value
171-
// x[9]: 1-degree P-value with freq estimated from genotypes
172171
int bcf_em1(const bcf1_t *b, int n1, int flag, double x[10])
173172
{
174173
double *pdg;
@@ -208,11 +207,13 @@ int bcf_em1(const bcf1_t *b, int n1, int flag, double x[10])
208207
x[6] = freqml(x[0], n1, n, pdg);
209208
}
210209
if ((flag & 1<<7) && n1 > 0 && n1 < n) { // 1-degree P-value
211-
double f[3], f3[3][3];
210+
double f[3], f3[3][3], tmp;
212211
f[0] = x[0]; f[1] = x[5]; f[2] = x[6];
213212
for (i = 0; i < 3; ++i)
214213
f3[i][0] = (1-f[i])*(1-f[i]), f3[i][1] = 2*f[i]*(1-f[i]), f3[i][2] = f[i]*f[i];
215-
x[7] = kf_gammaq(.5, log(lk_ratio_test(n, n1, pdg, f3)));
214+
tmp = log(lk_ratio_test(n, n1, pdg, f3));
215+
if (tmp < 0) tmp = 0;
216+
x[7] = kf_gammaq(.5, tmp);
216217
}
217218
if ((flag & 3<<8) && n1 > 0 && n1 < n) { // 2-degree P-value
218219
double g[3][3], tmp;
@@ -222,8 +223,8 @@ int bcf_em1(const bcf1_t *b, int n1, int flag, double x[10])
222223
for (i = 0; i < ITER_MAX; ++i)
223224
if (g3_iter(g[2], pdg, n1, n) < EPS) break;
224225
tmp = log(lk_ratio_test(n, n1, pdg, g));
226+
if (tmp < 0) tmp = 0;
225227
x[8] = kf_gammaq(1., tmp);
226-
x[9] = kf_gammaq(.5, tmp);
227228
}
228229
// free
229230
free(pdg);

kprobaln.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -161,7 +161,7 @@ int kpa_glocal(const uint8_t *_ref, int l_ref, const uint8_t *_query, int l_quer
161161
double p = 1., Pr1 = 0.;
162162
for (i = 0; i <= l_query + 1; ++i) {
163163
p *= s[i];
164-
if (p < 1e-100) Pr += -4.343 * log(p), p = 1.;
164+
if (p < 1e-100) Pr1 += -4.343 * log(p), p = 1.;
165165
}
166166
Pr1 += -4.343 * log(p * l_ref * l_query);
167167
Pr = (int)(Pr1 + .499);

sample.c

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,14 +52,15 @@ static void add_pair(bam_sample_t *sm, khash_t(sm) *sm2id, const char *key, cons
5252
int bam_smpl_add(bam_sample_t *sm, const char *fn, const char *txt)
5353
{
5454
const char *p = txt, *q, *r;
55-
kstring_t buf;
55+
kstring_t buf, first_sm;
5656
int n = 0;
5757
khash_t(sm) *sm2id = (khash_t(sm)*)sm->sm2id;
5858
if (txt == 0) {
5959
add_pair(sm, sm2id, fn, fn);
6060
return 0;
6161
}
6262
memset(&buf, 0, sizeof(kstring_t));
63+
memset(&first_sm, 0, sizeof(kstring_t));
6364
while ((q = strstr(p, "@RG")) != 0) {
6465
p = q + 3;
6566
r = q = 0;
@@ -73,12 +74,21 @@ int bam_smpl_add(bam_sample_t *sm, const char *fn, const char *txt)
7374
oq = *u; or = *v; *u = *v = '\0';
7475
buf.l = 0; kputs(fn, &buf); kputc('/', &buf); kputs(q, &buf);
7576
add_pair(sm, sm2id, buf.s, r);
77+
if ( !first_sm.s )
78+
kputs(r,&first_sm);
7679
*u = oq; *v = or;
7780
} else break;
7881
p = q > r? q : r;
7982
++n;
8083
}
8184
if (n == 0) add_pair(sm, sm2id, fn, fn);
85+
// If there is only one RG tag present in the header and reads are not annotated, don't refuse to work but
86+
// use the tag instead.
87+
else if ( n==1 && first_sm.s )
88+
add_pair(sm,sm2id,fn,first_sm.s);
89+
if ( first_sm.s )
90+
free(first_sm.s);
91+
8292
// add_pair(sm, sm2id, fn, fn);
8393
free(buf.s);
8494
return 0;

0 commit comments

Comments
 (0)