From 8df0d64204ecf0e22743980eb15d87149a7746a4 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 4 May 2023 12:22:12 +0100 Subject: [PATCH] Change bounds checking in probaln_glocal In 3 places when filling out forwards and backwards arrays, the "u" array index has bounds checks of "u < 3 || u >= i_dim-3". Understanding this code is tricky however! My hypothesis that the upper bounds check here is because we use u, u+1 and u+2 in array indices, and we iterate with "k <= l_ref" so we can access one beyond the end of the array. However the arrays are allocated to be dimension (l_query+1)*i_dim, so (assuming correctness of l_ref vs l_query in bw/i_dim calculation) we have compensated for this over-step already. This has been validated with address sanitiser. The effect of the i_dim-3 limit is that having band width equal to query length causes the final state element to be incorrectly labelled as an insertion. This hypothesis may however be incorrect, as the lower bound "u < 3" also seems redundant, yet changing this to "u < 0" does give different quality scores in about 1 in 4000 sequences (tested on 10 million illumina short read BAQ calculations). Hence for now this is left unchanged. In normal behaviour using a band, tested using "samtools calmd -r -E" to generate BQ tags, this commit does not change output. Fixes #1605 --- probaln.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/probaln.c b/probaln.c index 192f4b751..8a60372b3 100644 --- a/probaln.c +++ b/probaln.c @@ -245,10 +245,24 @@ int probaln_glocal(const uint8_t *ref, int l_ref, const uint8_t *query, { // f[l_query+1] double sum; double M = 1./s[l_query]; + // Note this goes up to <= l_ref, meaning we are accessing 1 beyond + // the end of the sequence. However we allocated above with + // (l_query+1)*i_dim (plus appropriate l_ref vs l_query in band width) + // so this should be sufficient. + // + // This fixes Issue #1605 where band width equal to sequence length + // gives incorrect alignments, due to the last value not being filled + // out correctly. + // + // I am unsure why the limit was previously set at u >= i_dim - 3, but + // can only conjecture it was due to forgetting the l_query+1 alloc. + // I am also unsure why "u < 3" is used instead of "u < 0", however + // changing that does change behaviour for common usage (unlike + // "idim - 3" to "idim"). for (k = 1, sum = 0.; k <= l_ref; ++k) { int u; set_u(u, bw, l_query, k); - if (u < 3 || u >= i_dim - 3) continue; + if (u < 3 || u >= i_dim) continue; sum += M*f[l_query*i_dim + u+0] * sM + M*f[l_query*i_dim + u+1] * sI; } s[l_query+1] = sum; // the last scaling factor @@ -272,7 +286,7 @@ int probaln_glocal(const uint8_t *ref, int l_ref, const uint8_t *query, int u; double *bi = &b[l_query*i_dim]; set_u(u, bw, l_query, k); - if (u < 3 || u >= i_dim - 3) continue; + if (u < 3 || u >= i_dim) continue; bi[u+0] = sM / s[l_query] / s[l_query+1]; bi[u+1] = sI / s[l_query] / s[l_query+1]; } // b[l_query-1..1] @@ -350,7 +364,7 @@ int probaln_glocal(const uint8_t *ref, int l_ref, const uint8_t *query, int u; double e = (ref[k - 1] > 3 || query[0] > 3)? 1. : ref[k - 1] == query[0]? 1. - qual[0] : qual[0] * EM; set_u(u, bw, 1, k); - if (u < 3 || u >= i_dim - 3) continue; + if (u < 3 || u >= i_dim) continue; sum += e * b[1*i_dim + u+0] * bM + EI * b[1*i_dim + u+1] * bI; } set_u(k, bw, 0, 0);