Skip to content

Commit cb94432

Browse files
snurkbrianwalenz
authored andcommitted
OEA: overlap rescoring works; improved microsatellite check; ignore diffs in flanks
1 parent 2cf7a92 commit cb94432

3 files changed

Lines changed: 79 additions & 35 deletions

File tree

src/overlapErrorAdjustment/correctOverlaps-Redo_Olaps.C

Lines changed: 70 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -318,24 +318,29 @@ CollectKmerStat(const char* seq, int32 reg_len, int32 kmer_len, int32 *stats) {
318318

319319
static
320320
bool
321-
CheckTrivialDNA(const char* seq, int32 remaining, int32 offset) {
321+
CheckTrivialDNA(const char* seq, int32 remaining, int32 offset, int32 size_factor, int32 repeat_cnt) {
322322
//TODO configure trivial DNA analysis
323-
static const int32 SIZE_FACTOR = 6;
324-
static const int32 REPEAT_NUM = 5;
323+
//static const int32 SIZE_FACTOR = 6;
324+
//static const int32 REPEAT_NUM = 5;
325+
//static const int32 SIZE_FACTOR = 4;
326+
//static const int32 REPEAT_NUM = 3;
325327
static const int32 MIN_K = 2;
326328
static const int32 MAX_K = 5;
327329

328330
int32 stats_buff[1 << (2 * MAX_K)];
329331
for (int32 k = MIN_K; k <= MAX_K; ++k) {
330332
const int32 possible_kmer_cnt = 1 << (2 * k);
331-
int32 reg_len = k * SIZE_FACTOR;
333+
int32 reg_len = k * size_factor;
332334

333335
//exploring sequence to the right
334-
for (int32 shift = 0; shift < k; ++shift) {
336+
//fprintf(stderr, "checking upstream k=%d, init_shift=%d\n", k, std::max(-k, -offset));
337+
for (int32 shift = std::max(-k, -offset); shift < k; ++shift) {
338+
335339
if (reg_len + shift > remaining)
336340
break;
337341
CollectKmerStat(seq + shift, reg_len, k, stats_buff);
338-
if (*std::max_element(stats_buff, stats_buff + possible_kmer_cnt) >= REPEAT_NUM) {
342+
if (*std::max_element(stats_buff, stats_buff + possible_kmer_cnt) >= repeat_cnt) {
343+
//comment out!
339344
//char subbuff[reg_len + 1];
340345
//memcpy(subbuff, seq + shift, reg_len);
341346
//subbuff[reg_len] = '\0';
@@ -345,12 +350,14 @@ CheckTrivialDNA(const char* seq, int32 remaining, int32 offset) {
345350
}
346351
}
347352

353+
//fprintf(stderr, "checking downstream k=%d, init_shift=%d\n", k, std::max(-k, -remaining));
348354
//exploring sequence to the left
349-
for (int32 shift = 0; shift < k; ++shift) {
355+
for (int32 shift = std::max(-k, -remaining); shift < k; ++shift) {
350356
if (reg_len + shift > offset)
351357
break;
352358
CollectKmerStat(seq - shift - 1, -reg_len, k, stats_buff);
353-
if (*std::max_element(stats_buff, stats_buff + possible_kmer_cnt) >= REPEAT_NUM) {
359+
if (*std::max_element(stats_buff, stats_buff + possible_kmer_cnt) >= repeat_cnt) {
360+
//comment out!
354361
//char subbuff[reg_len + 1];
355362
//memcpy(subbuff, seq - shift - reg_len, reg_len);
356363
//subbuff[reg_len] = '\0';
@@ -365,24 +372,40 @@ CheckTrivialDNA(const char* seq, int32 remaining, int32 offset) {
365372

366373
static
367374
bool
368-
CheckNonTrivialDNA(const char* a_part, const char* b_part,
375+
CheckTrivialDNA(const char* a_part, const char* b_part,
369376
int32 a_len, int32 b_len,
370-
int32 a_pos, int32 b_pos) {
371-
//fprintf(stderr, "Checking for trivial DNA in A around position %d\n", i);
372-
if (CheckTrivialDNA(a_part + a_pos, /*remaining*/a_len - a_pos, /*offset*/a_pos))
373-
return false;
374-
//fprintf(stderr, "Checking for trivial DNA in B around position %d\n", j);
375-
if (CheckTrivialDNA(b_part + b_pos, /*remaining*/b_len - b_pos, /*offset*/b_pos))
376-
return false;
377-
return true;
377+
int32 a_pos, int32 b_pos,
378+
int32 size_factor, int32 repeat_cnt) {
379+
//fprintf(stderr, "Checking for trivial DNA in around positions %d, %d\n", a_pos, b_pos);
380+
381+
if (CheckTrivialDNA(a_part + a_pos, /*remaining*/a_len - a_pos, /*offset*/a_pos, size_factor, repeat_cnt)) {
382+
//fprintf(stderr, "Trivial DNA detected in A around position %d\n", a_pos);
383+
return true;
384+
}
385+
if (CheckTrivialDNA(b_part + b_pos, /*remaining*/b_len - b_pos, /*offset*/b_pos, size_factor, repeat_cnt)) {
386+
//fprintf(stderr, "Trivial DNA detected in B around position %d\n", b_pos);
387+
return true;
388+
}
389+
//fprintf(stderr, "NON-Trivial DNA!!!\n", a_pos);
390+
return false;
378391
}
379392

380393
static
381394
std::pair<size_t, size_t>
382395
ComputeErrors(const char* a_part, const char* b_part,
383396
int32 delta_len, int32 *deltas,
384397
int32 a_len, int32 b_len,
385-
bool check_trivial_dna) {
398+
bool check_trivial_dna,
399+
uint32 ignore_flank) {
400+
401+
static const int32 MM_SIZE_FACTOR = 6;
402+
static const int32 MM_REPEAT_NUM = 5;
403+
404+
//static const int32 IND_SIZE_FACTOR = MM_SIZE_FACTOR;
405+
//static const int32 IND_REPEAT_NUM = MM_REPEAT_NUM;
406+
static const int32 IND_SIZE_FACTOR = 4;
407+
static const int32 IND_REPEAT_NUM = 3;
408+
386409
// Event counter. Each individual (1bp) mismatch/insertion/deletion is an event
387410
int32 all_ct = 0;
388411
// Processed event counter
@@ -394,6 +417,21 @@ ComputeErrors(const char* a_part, const char* b_part,
394417
//position in "alignment" of a_part and b_part
395418
int32 p = 0;
396419

420+
auto cnt_event_f = [&](int32 size_factor, int32 repeat_cnt) {
421+
if (i < ignore_flank ||
422+
j < ignore_flank ||
423+
i + ignore_flank >= a_len ||
424+
j + ignore_flank >= b_len) {
425+
return false;
426+
}
427+
if (check_trivial_dna &&
428+
CheckTrivialDNA(a_part, b_part, a_len, b_len, i, j,
429+
size_factor, repeat_cnt)) {
430+
return false;
431+
}
432+
return true;
433+
};
434+
397435
for (int32 k=0; k < delta_len; k++) {
398436
//fprintf(stderr, "k=%d deltalen=%d i=%d our of %d j=%d out of %d\n", k, wa->ped.deltaLen, i, a_len, j, b_len);
399437

@@ -404,8 +442,7 @@ ComputeErrors(const char* a_part, const char* b_part,
404442
//fprintf(stderr, "SUBST %c -> %c at %d #%d\n", a_part[i], b_part[j], i, p);
405443

406444
all_ct++;
407-
if (!check_trivial_dna || CheckNonTrivialDNA(a_part, b_part, a_len, b_len, i, j))
408-
ct++;
445+
ct += (int32) cnt_event_f(MM_SIZE_FACTOR, MM_REPEAT_NUM);
409446
}
410447

411448
i++; //assert(i <= a_len);
@@ -419,8 +456,7 @@ ComputeErrors(const char* a_part, const char* b_part,
419456
//Insertion at i - 1 in a_part (p in "alignment")
420457
//fprintf(stderr, "INSERT %c at %d #%d\n", b_part[j], i-1, p);
421458
all_ct++;
422-
if (!check_trivial_dna || CheckNonTrivialDNA(a_part, b_part, a_len, b_len, i, j))
423-
ct++;
459+
ct += (int32) cnt_event_f(IND_SIZE_FACTOR, IND_REPEAT_NUM);
424460

425461
j++; //assert(j <= b_len);
426462
p++;
@@ -432,8 +468,7 @@ ComputeErrors(const char* a_part, const char* b_part,
432468
//Deletion at i in a_part (p in "alignment")
433469
//fprintf(stderr, "DELETE %c at %d #%d\n", a_part[i], i, p);
434470
all_ct++;
435-
if (!check_trivial_dna || CheckNonTrivialDNA(a_part, b_part, a_len, b_len, i, j))
436-
ct++;
471+
ct += (int32) cnt_event_f(IND_SIZE_FACTOR, IND_REPEAT_NUM);
437472

438473
i++; //assert(i <= a_len);
439474
p++;
@@ -446,8 +481,7 @@ ComputeErrors(const char* a_part, const char* b_part,
446481
//fprintf(stderr, "SUBST %c -> %c at %d #%d\n", a_part[i], b_part[j], i, p);
447482
//Substitution at i in a_part (p in "alignment")
448483
all_ct++;
449-
if (!check_trivial_dna || CheckNonTrivialDNA(a_part, b_part, a_len, b_len, i, j))
450-
ct++;
484+
ct += (int32) cnt_event_f(MM_SIZE_FACTOR, MM_REPEAT_NUM);
451485
}
452486

453487
i++; //assert(i <= a_len); // Guaranteed, we're looping on this
@@ -459,8 +493,8 @@ ComputeErrors(const char* a_part, const char* b_part,
459493
// fprintf(stderr, "Reported %d out of %d\n", ct, all_ct);
460494
//}
461495

462-
assert(i <= a_len);
463-
assert(j <= b_len);
496+
assert(i == a_len);
497+
assert(j == b_len);
464498

465499
return std::make_pair(ct, p);
466500
}
@@ -519,6 +553,7 @@ ProcessAlignment(int32 a_part_len, const char *a_part, int64 a_hang, int32 b_par
519553
*match_to_end,
520554
ped);
521555

556+
522557
//Adjusting the extremities
523558
//TODO discuss the logic!
524559
//TODO refactor out the code duplication
@@ -560,6 +595,7 @@ ProcessAlignment(int32 a_part_len, const char *a_part, int64 a_hang, int32 b_par
560595
all_errors -= i;
561596
}
562597

598+
//fprintf(stderr, "Showing alignment\n");
563599
//Display_Alignment(a_part, a_end, b_part, b_end, ped->delta, ped->deltaLen);
564600

565601
*invalid_olap = (std::min(a_end, b_end) <= 0);
@@ -571,13 +607,13 @@ ProcessAlignment(int32 a_part_len, const char *a_part, int64 a_hang, int32 b_par
571607
int32 events;
572608
int32 alignment_len;
573609

610+
//fprintf(stderr, "Computing errors\n");
611+
static const uint32 FLANK_IGNORE = 5;
574612
//fprintf(stderr, "Checking for trivial DNA regions: %d\n", check_trivial_dna);
575613
std::tie(events, alignment_len) = ComputeErrors(a_part, b_part, ped->deltaLen, ped->delta,
576-
a_end, b_end, check_trivial_dna);
614+
a_end, b_end, check_trivial_dna, FLANK_IGNORE);
577615

578-
if (!check_trivial_dna && all_errors != events) {
579-
fprintf(stderr, "Old errors %d new events %d\n", all_errors, events);
580-
}
616+
//fprintf(stderr, "Old errors %d new events %d\n", all_errors, events);
581617
assert(check_trivial_dna || all_errors == events);
582618

583619
assert(events >= 0 && alignment_len > 0);
@@ -661,8 +697,8 @@ Redo_Olaps(coParameters *G, /*const*/ sqStore *seqStore) {
661697
for (; thisOvl <= lastOvl && G->olaps[thisOvl].b_iid == curID; thisOvl++) {
662698
const Olap_Info_t &olap = G->olaps[thisOvl];
663699

664-
//if (olap.b_iid != 39861)
665-
// continue;
700+
if (G->secondID != UINT32_MAX && olap.b_iid != G->secondID)
701+
continue;
666702

667703
if (olap.normal) {
668704
// fprintf(stderr, "b_part = fseq %40.40s\n", fseq);

src/overlapErrorAdjustment/correctOverlaps.C

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,9 @@ main(int argc, char **argv) {
4848
G->bgnID = atoi(argv[++arg]);
4949
G->endID = atoi(argv[++arg]);
5050

51+
} else if (strcmp(argv[arg], "-B") == 0) {
52+
G->secondID = atoi(argv[++arg]);
53+
5154
} else if (strcmp(argv[arg], "-O") == 0) { // -F? -S Olap_Path
5255
G->ovlStorePath = argv[++arg];
5356

@@ -96,6 +99,7 @@ main(int argc, char **argv) {
9699
fprintf(stderr, " -S seqStore path to a sequence store\n");
97100
fprintf(stderr, " -O ovlStore path to an overlap store\n");
98101
fprintf(stderr, " -R bgn end only compute for reads bgn-end\n");
102+
fprintf(stderr, " -B id only compute for pairs with second read <id>\n");
99103
fprintf(stderr, "\n");
100104
fprintf(stderr, " -c input-name read corrections from 'input-name'\n");
101105
fprintf(stderr, " -o output-name write updated error rates to 'output-name'\n");

src/overlapErrorAdjustment/correctOverlaps.H

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,8 @@ public:
248248
bgnID = 0;
249249
endID = UINT32_MAX;
250250

251+
secondID = UINT32_MAX;
252+
251253
bases = NULL;
252254
basesLen = 0;
253255

@@ -288,6 +290,8 @@ public:
288290
uint32 bgnID;
289291
uint32 endID;
290292

293+
uint32 secondID;
294+
291295
char *bases;
292296
uint64 basesLen;
293297

@@ -304,7 +308,7 @@ public:
304308

305309
double errorRate;
306310
uint32 minOverlap;
307-
bool checkTrivialDNA;
311+
bool checkTrivialDNA;
308312

309313
pedWorkArea_t ped;
310314

0 commit comments

Comments
 (0)