@@ -318,24 +318,29 @@ CollectKmerStat(const char* seq, int32 reg_len, int32 kmer_len, int32 *stats) {
318318
319319static
320320bool
321- CheckTrivialDNA (const char * seq , int32 remaining , int32 offset ) {
321+ CheckTrivialDNA (const char * seq , int32 remaining , int32 offset , int32 size_factor , int32 repeat_cnt ) {
322322 //TODO configure trivial DNA analysis
323- static const int32 SIZE_FACTOR = 6 ;
324- static const int32 REPEAT_NUM = 5 ;
323+ //static const int32 SIZE_FACTOR = 6;
324+ //static const int32 REPEAT_NUM = 5;
325+ //static const int32 SIZE_FACTOR = 4;
326+ //static const int32 REPEAT_NUM = 3;
325327 static const int32 MIN_K = 2 ;
326328 static const int32 MAX_K = 5 ;
327329
328330 int32 stats_buff [1 << (2 * MAX_K )];
329331 for (int32 k = MIN_K ; k <= MAX_K ; ++ k ) {
330332 const int32 possible_kmer_cnt = 1 << (2 * k );
331- int32 reg_len = k * SIZE_FACTOR ;
333+ int32 reg_len = k * size_factor ;
332334
333335 //exploring sequence to the right
334- for (int32 shift = 0 ; shift < k ; ++ shift ) {
336+ //fprintf(stderr, "checking upstream k=%d, init_shift=%d\n", k, std::max(-k, -offset));
337+ for (int32 shift = std ::max (- k , - offset ); shift < k ; ++ shift ) {
338+
335339 if (reg_len + shift > remaining )
336340 break ;
337341 CollectKmerStat (seq + shift , reg_len , k , stats_buff );
338- if (* std ::max_element (stats_buff , stats_buff + possible_kmer_cnt ) >= REPEAT_NUM ) {
342+ if (* std ::max_element (stats_buff , stats_buff + possible_kmer_cnt ) >= repeat_cnt ) {
343+ //comment out!
339344 //char subbuff[reg_len + 1];
340345 //memcpy(subbuff, seq + shift, reg_len);
341346 //subbuff[reg_len] = '\0';
@@ -345,12 +350,14 @@ CheckTrivialDNA(const char* seq, int32 remaining, int32 offset) {
345350 }
346351 }
347352
353+ //fprintf(stderr, "checking downstream k=%d, init_shift=%d\n", k, std::max(-k, -remaining));
348354 //exploring sequence to the left
349- for (int32 shift = 0 ; shift < k ; ++ shift ) {
355+ for (int32 shift = std :: max ( - k , - remaining ) ; shift < k ; ++ shift ) {
350356 if (reg_len + shift > offset )
351357 break ;
352358 CollectKmerStat (seq - shift - 1 , - reg_len , k , stats_buff );
353- if (* std ::max_element (stats_buff , stats_buff + possible_kmer_cnt ) >= REPEAT_NUM ) {
359+ if (* std ::max_element (stats_buff , stats_buff + possible_kmer_cnt ) >= repeat_cnt ) {
360+ //comment out!
354361 //char subbuff[reg_len + 1];
355362 //memcpy(subbuff, seq - shift - reg_len, reg_len);
356363 //subbuff[reg_len] = '\0';
@@ -365,24 +372,40 @@ CheckTrivialDNA(const char* seq, int32 remaining, int32 offset) {
365372
366373static
367374bool
368- CheckNonTrivialDNA (const char * a_part , const char * b_part ,
375+ CheckTrivialDNA (const char * a_part , const char * b_part ,
369376 int32 a_len , int32 b_len ,
370- int32 a_pos , int32 b_pos ) {
371- //fprintf(stderr, "Checking for trivial DNA in A around position %d\n", i);
372- if (CheckTrivialDNA (a_part + a_pos , /*remaining*/ a_len - a_pos , /*offset*/ a_pos ))
373- return false;
374- //fprintf(stderr, "Checking for trivial DNA in B around position %d\n", j);
375- if (CheckTrivialDNA (b_part + b_pos , /*remaining*/ b_len - b_pos , /*offset*/ b_pos ))
376- return false;
377- return true;
377+ int32 a_pos , int32 b_pos ,
378+ int32 size_factor , int32 repeat_cnt ) {
379+ //fprintf(stderr, "Checking for trivial DNA in around positions %d, %d\n", a_pos, b_pos);
380+
381+ if (CheckTrivialDNA (a_part + a_pos , /*remaining*/ a_len - a_pos , /*offset*/ a_pos , size_factor , repeat_cnt )) {
382+ //fprintf(stderr, "Trivial DNA detected in A around position %d\n", a_pos);
383+ return true;
384+ }
385+ if (CheckTrivialDNA (b_part + b_pos , /*remaining*/ b_len - b_pos , /*offset*/ b_pos , size_factor , repeat_cnt )) {
386+ //fprintf(stderr, "Trivial DNA detected in B around position %d\n", b_pos);
387+ return true;
388+ }
389+ //fprintf(stderr, "NON-Trivial DNA!!!\n", a_pos);
390+ return false;
378391}
379392
380393static
381394std ::pair < size_t , size_t >
382395ComputeErrors (const char * a_part , const char * b_part ,
383396 int32 delta_len , int32 * deltas ,
384397 int32 a_len , int32 b_len ,
385- bool check_trivial_dna ) {
398+ bool check_trivial_dna ,
399+ uint32 ignore_flank ) {
400+
401+ static const int32 MM_SIZE_FACTOR = 6 ;
402+ static const int32 MM_REPEAT_NUM = 5 ;
403+
404+ //static const int32 IND_SIZE_FACTOR = MM_SIZE_FACTOR;
405+ //static const int32 IND_REPEAT_NUM = MM_REPEAT_NUM;
406+ static const int32 IND_SIZE_FACTOR = 4 ;
407+ static const int32 IND_REPEAT_NUM = 3 ;
408+
386409 // Event counter. Each individual (1bp) mismatch/insertion/deletion is an event
387410 int32 all_ct = 0 ;
388411 // Processed event counter
@@ -394,6 +417,21 @@ ComputeErrors(const char* a_part, const char* b_part,
394417 //position in "alignment" of a_part and b_part
395418 int32 p = 0 ;
396419
420+ auto cnt_event_f = [& ](int32 size_factor , int32 repeat_cnt ) {
421+ if (i < ignore_flank ||
422+ j < ignore_flank ||
423+ i + ignore_flank >= a_len ||
424+ j + ignore_flank >= b_len ) {
425+ return false;
426+ }
427+ if (check_trivial_dna &&
428+ CheckTrivialDNA (a_part , b_part , a_len , b_len , i , j ,
429+ size_factor , repeat_cnt )) {
430+ return false;
431+ }
432+ return true;
433+ };
434+
397435 for (int32 k = 0 ; k < delta_len ; k ++ ) {
398436 //fprintf(stderr, "k=%d deltalen=%d i=%d our of %d j=%d out of %d\n", k, wa->ped.deltaLen, i, a_len, j, b_len);
399437
@@ -404,8 +442,7 @@ ComputeErrors(const char* a_part, const char* b_part,
404442 //fprintf(stderr, "SUBST %c -> %c at %d #%d\n", a_part[i], b_part[j], i, p);
405443
406444 all_ct ++ ;
407- if (!check_trivial_dna || CheckNonTrivialDNA (a_part , b_part , a_len , b_len , i , j ))
408- ct ++ ;
445+ ct += (int32 ) cnt_event_f (MM_SIZE_FACTOR , MM_REPEAT_NUM );
409446 }
410447
411448 i ++ ; //assert(i <= a_len);
@@ -419,8 +456,7 @@ ComputeErrors(const char* a_part, const char* b_part,
419456 //Insertion at i - 1 in a_part (p in "alignment")
420457 //fprintf(stderr, "INSERT %c at %d #%d\n", b_part[j], i-1, p);
421458 all_ct ++ ;
422- if (!check_trivial_dna || CheckNonTrivialDNA (a_part , b_part , a_len , b_len , i , j ))
423- ct ++ ;
459+ ct += (int32 ) cnt_event_f (IND_SIZE_FACTOR , IND_REPEAT_NUM );
424460
425461 j ++ ; //assert(j <= b_len);
426462 p ++ ;
@@ -432,8 +468,7 @@ ComputeErrors(const char* a_part, const char* b_part,
432468 //Deletion at i in a_part (p in "alignment")
433469 //fprintf(stderr, "DELETE %c at %d #%d\n", a_part[i], i, p);
434470 all_ct ++ ;
435- if (!check_trivial_dna || CheckNonTrivialDNA (a_part , b_part , a_len , b_len , i , j ))
436- ct ++ ;
471+ ct += (int32 ) cnt_event_f (IND_SIZE_FACTOR , IND_REPEAT_NUM );
437472
438473 i ++ ; //assert(i <= a_len);
439474 p ++ ;
@@ -446,8 +481,7 @@ ComputeErrors(const char* a_part, const char* b_part,
446481 //fprintf(stderr, "SUBST %c -> %c at %d #%d\n", a_part[i], b_part[j], i, p);
447482 //Substitution at i in a_part (p in "alignment")
448483 all_ct ++ ;
449- if (!check_trivial_dna || CheckNonTrivialDNA (a_part , b_part , a_len , b_len , i , j ))
450- ct ++ ;
484+ ct += (int32 ) cnt_event_f (MM_SIZE_FACTOR , MM_REPEAT_NUM );
451485 }
452486
453487 i ++ ; //assert(i <= a_len); // Guaranteed, we're looping on this
@@ -459,8 +493,8 @@ ComputeErrors(const char* a_part, const char* b_part,
459493 // fprintf(stderr, "Reported %d out of %d\n", ct, all_ct);
460494 //}
461495
462- assert (i < = a_len );
463- assert (j < = b_len );
496+ assert (i = = a_len );
497+ assert (j = = b_len );
464498
465499 return std ::make_pair (ct , p );
466500}
@@ -519,6 +553,7 @@ ProcessAlignment(int32 a_part_len, const char *a_part, int64 a_hang, int32 b_par
519553 * match_to_end ,
520554 ped );
521555
556+
522557 //Adjusting the extremities
523558 //TODO discuss the logic!
524559 //TODO refactor out the code duplication
@@ -560,6 +595,7 @@ ProcessAlignment(int32 a_part_len, const char *a_part, int64 a_hang, int32 b_par
560595 all_errors -= i ;
561596 }
562597
598+ //fprintf(stderr, "Showing alignment\n");
563599 //Display_Alignment(a_part, a_end, b_part, b_end, ped->delta, ped->deltaLen);
564600
565601 * invalid_olap = (std ::min (a_end , b_end ) <= 0 );
@@ -571,13 +607,13 @@ ProcessAlignment(int32 a_part_len, const char *a_part, int64 a_hang, int32 b_par
571607 int32 events ;
572608 int32 alignment_len ;
573609
610+ //fprintf(stderr, "Computing errors\n");
611+ static const uint32 FLANK_IGNORE = 5 ;
574612 //fprintf(stderr, "Checking for trivial DNA regions: %d\n", check_trivial_dna);
575613 std ::tie (events , alignment_len ) = ComputeErrors (a_part , b_part , ped -> deltaLen , ped -> delta ,
576- a_end , b_end , check_trivial_dna );
614+ a_end , b_end , check_trivial_dna , FLANK_IGNORE );
577615
578- if (!check_trivial_dna && all_errors != events ) {
579- fprintf (stderr , "Old errors %d new events %d\n" , all_errors , events );
580- }
616+ //fprintf(stderr, "Old errors %d new events %d\n", all_errors, events);
581617 assert (check_trivial_dna || all_errors == events );
582618
583619 assert (events >= 0 && alignment_len > 0 );
@@ -661,8 +697,8 @@ Redo_Olaps(coParameters *G, /*const*/ sqStore *seqStore) {
661697 for (; thisOvl <= lastOvl && G -> olaps [thisOvl ].b_iid == curID ; thisOvl ++ ) {
662698 const Olap_Info_t & olap = G -> olaps [thisOvl ];
663699
664- // if (olap.b_iid != 39861 )
665- // continue;
700+ if (G -> secondID != UINT32_MAX && olap .b_iid != G -> secondID )
701+ continue ;
666702
667703 if (olap .normal ) {
668704 // fprintf(stderr, "b_part = fseq %40.40s\n", fseq);
0 commit comments