@@ -39,6 +39,7 @@ void bcf_call_destroy(bcf_callaux_t *bca)
3939 * negative if we are looking at an indel. */
4040int bcf_call_glfgen (int _n , const bam_pileup1_t * pl , int ref_base , bcf_callaux_t * bca , bcf_callret1_t * r )
4141{
42+ static int * var_pos = NULL , nvar_pos = 0 ;
4243 int i , n , ref4 , is_indel , ori_depth = 0 ;
4344 memset (r , 0 , sizeof (bcf_callret1_t ));
4445 if (ref_base >= 0 ) {
@@ -94,9 +95,92 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t
9495 r -> depth = n ; r -> ori_depth = ori_depth ;
9596 // glfgen
9697 errmod_cal (bca -> e , n , 5 , bca -> bases , r -> p );
98+
99+ // Calculate the Variant Distance Bias (make it optional?)
100+ if ( nvar_pos < _n ) {
101+ nvar_pos = _n ;
102+ var_pos = realloc (var_pos ,sizeof (int )* nvar_pos );
103+ }
104+ int alt_dp = 0 , read_len = 0 ;
105+ for (i = 0 ; i < _n ; i ++ ) {
106+ const bam_pileup1_t * p = pl + i ;
107+ if ( bam1_seqi (bam1_seq (p -> b ),p -> qpos ) == ref_base )
108+ continue ;
109+
110+ var_pos [alt_dp ] = p -> qpos ;
111+ if ( (bam1_cigar (p -> b )[0 ]& BAM_CIGAR_MASK )== 4 )
112+ var_pos [alt_dp ] -= bam1_cigar (p -> b )[0 ]>>BAM_CIGAR_SHIFT ;
113+
114+ alt_dp ++ ;
115+ read_len += p -> b -> core .l_qseq ;
116+ }
117+ float mvd = 0 ;
118+ int j ;
119+ n = 0 ;
120+ for (i = 0 ; i < alt_dp ; i ++ ) {
121+ for (j = 0 ; j < i ; j ++ ) {
122+ mvd += abs (var_pos [i ] - var_pos [j ]);
123+ n ++ ;
124+ }
125+ }
126+ r -> mvd [0 ] = n ? mvd /n : 0 ;
127+ r -> mvd [1 ] = alt_dp ;
128+ r -> mvd [2 ] = alt_dp ? read_len /alt_dp : 0 ;
129+
97130 return r -> depth ;
98131}
99132
133+
134+ void calc_vdb (int n , const bcf_callret1_t * calls , bcf_call_t * call )
135+ {
136+ // Variant distance bias. Samples merged by means of DP-weighted average.
137+
138+ float weight = 0 , tot_prob = 0 ;
139+
140+ int i ;
141+ for (i = 0 ; i < n ; i ++ )
142+ {
143+ int mvd = calls [i ].mvd [0 ];
144+ int dp = calls [i ].mvd [1 ];
145+ int read_len = calls [i ].mvd [2 ];
146+
147+ if ( dp < 2 ) continue ;
148+
149+ float prob = 0 ;
150+ if ( dp == 2 )
151+ {
152+ // Exact formula
153+ prob = (mvd == 0 ) ? 1.0 /read_len : (read_len - mvd )* 2.0 /read_len /read_len ;
154+ }
155+ else if ( dp == 3 )
156+ {
157+ // Sin, quite accurate approximation
158+ float mu = read_len /2.9 ;
159+ prob = mvd > 2 * mu ? 0 : sin (mvd * 3.14 /2 /mu ) / (4 * mu /3.14 );
160+ }
161+ else
162+ {
163+ // Scaled gaussian curve, crude approximation, but behaves well. Using fixed depth for bigger depths.
164+ if ( dp > 5 )
165+ dp = 5 ;
166+ float sigma2 = (read_len /1.9 /(dp + 1 )) * (read_len /1.9 /(dp + 1 ));
167+ float norm = 1.125 * sqrt (2 * 3.14 * sigma2 );
168+ float mu = read_len /2.9 ;
169+ if ( mvd < mu )
170+ prob = exp (- (mvd - mu )* (mvd - mu )/2 /sigma2 )/norm ;
171+ else
172+ prob = exp (- (mvd - mu )* (mvd - mu )/3.125 /sigma2 )/norm ;
173+ }
174+
175+ //fprintf(stderr,"dp=%d mvd=%d read_len=%d -> prob=%f\n", dp,mvd,read_len,prob);
176+ tot_prob += prob * dp ;
177+ weight += dp ;
178+ }
179+ tot_prob = weight ? tot_prob /weight : 1 ;
180+ //fprintf(stderr,"prob=%f\n", tot_prob);
181+ call -> vdb = tot_prob ;
182+ }
183+
100184int bcf_call_combine (int n , const bcf_callret1_t * calls , int ref_base /*4-bit*/ , bcf_call_t * call )
101185{
102186 int ref4 , i , j , qsum [4 ];
@@ -170,6 +254,9 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/,
170254 call -> ori_depth += calls [i ].ori_depth ;
171255 for (j = 0 ; j < 16 ; ++ j ) call -> anno [j ] += calls [i ].anno [j ];
172256 }
257+
258+ calc_vdb (n , calls , call );
259+
173260 return 0 ;
174261}
175262
@@ -223,6 +310,10 @@ int bcf_call2bcf(int tid, int pos, bcf_call_t *bc, bcf1_t *b, bcf_callret1_t *bc
223310 if (i ) kputc (',' , & s );
224311 kputw (bc -> anno [i ], & s );
225312 }
313+ if ( bc -> vdb != 1 )
314+ {
315+ ksprintf (& s , ";VDB=%.4f" , bc -> vdb );
316+ }
226317 kputc ('\0' , & s );
227318 // FMT
228319 kputs ("PL" , & s );
0 commit comments