diff --git a/dorado/utils/sequence_utils.cpp b/dorado/utils/sequence_utils.cpp index 2dbbede71..5273e9b51 100644 --- a/dorado/utils/sequence_utils.cpp +++ b/dorado/utils/sequence_utils.cpp @@ -321,6 +321,9 @@ std::optional compute_overlap(const std::string& query_seq, std::tuple> realign_moves(const std::string& query_sequence, const std::string& target_sequence, const std::vector& moves) { + assert(static_cast(query_sequence.length()) == + std::accumulate(moves.begin(), moves.end(), 0)); + // We are going to compute the overlap between the two reads MmTbufPtr working_buffer; const auto overlap_result = @@ -337,20 +340,23 @@ std::tuple> realign_moves(const std::string& quer const auto query_end = overlap_result->query_end; const auto target_end = overlap_result->target_end; - // Advance the query and target position. - ++query_start; - ++target_start; - while (query_sequence[query_start] != target_sequence[target_start]) { + // Advance the query and target position so that their first nucleotide is identical + while (query_sequence[target_start] != target_sequence[query_start]) { ++query_start; ++target_start; + if (static_cast(target_start) >= query_sequence.length() || + static_cast(query_start) >= target_sequence.length()) { + return failed_realignment; + } } EdlibAlignConfig align_config = edlibDefaultAlignConfig(); align_config.task = EDLIB_TASK_PATH; auto target_sequence_component = - target_sequence.substr(target_start, target_end - target_start); - auto query_sequence_component = query_sequence.substr(query_start, query_end - query_start); + target_sequence.substr(query_start, query_end - query_start + 1); + auto query_sequence_component = + query_sequence.substr(target_start, target_end - target_start + 1); EdlibAlignResult edlib_result = edlibAlign( target_sequence_component.data(), static_cast(target_sequence_component.length()), @@ -366,52 +372,57 @@ std::tuple> realign_moves(const std::string& quer return failed_realignment; } - // Let's keep two cursor positions - one for the new move table and one for the old: + // Let's keep two cursor positions - one for the new move table which we are building, and one for the old where we track where we got to int new_move_cursor = 0; int old_move_cursor = 0; + // First step is to advance the moves table to the start of the aligment in the query. int moves_found = 0; - while (moves_found < int(moves.size()) && moves_found < int(query_start)) { - moves_found += moves[old_move_cursor]; - ++old_move_cursor; + for (int i = 0; i < int(moves.size()); i++) { + moves_found += moves[i]; + if (moves_found == target_start + 1) { + break; + } + old_move_cursor++; } - --old_move_cursor; // We have gone one too far. - int old_moves_offset = old_move_cursor; + + int old_moves_offset = + old_move_cursor; // Cursor indicating where the move table should now start const auto alignment_size = - static_cast(edlib_result.endLocations[0] - edlib_result.startLocations[0]); + static_cast(edlib_result.endLocations[0] - edlib_result.startLocations[0]) + 1; // Now that we have the alignment, we need to compute the new move table, by walking along the alignment std::vector new_moves; for (size_t i = 0; i < alignment_size; i++) { auto alignment_entry = edlib_result.alignment[i]; - if ((alignment_entry == 0) || - (alignment_entry == - 3)) { //Match or mismatch, need to update the new move table and move the cursor of the old move table. + if ((alignment_entry == 0) || (alignment_entry == 3)) { // Match or mismatch + // Need to update the new move table and move the cursor of the old move table. new_moves.push_back(1); // We have a match so we need a 1 (move) new_move_cursor++; old_move_cursor++; - while (moves[old_move_cursor] == 0) { + while ((old_move_cursor < int(moves.size())) && moves[old_move_cursor] == 0) { if (old_move_cursor < (new_move_cursor + old_moves_offset)) { old_move_cursor++; } else { + // If we have a zero in the old move table, we need to add zeros to the new move table to make it up new_moves.push_back(0); new_move_cursor++; old_move_cursor++; } } // Update the Query and target seq cursors - } else if (alignment_entry == 1) { //Insertion to target + } else if (alignment_entry == 1) { // Insertion to target // If we have an insertion in the target, we need to add a 1 to the new move table, and increment the new move table cursor. the old move table cursor and new are now out of sync and need fixing. new_moves.push_back(1); new_move_cursor++; - } else if (alignment_entry == 2) { //Insertion to Query + } else if (alignment_entry == 2) { // Insertion to Query // We have a query insertion, all we need to do is add zeros to the new move table to make it up, the signal can be assigned to the leftmost nucleotide in the sequence. new_moves.push_back(0); new_move_cursor++; old_move_cursor++; - while (moves[old_move_cursor] == 0) { + while ((old_move_cursor < int(moves.size())) && moves[old_move_cursor] == 0) { new_moves.push_back(0); old_move_cursor++; new_move_cursor++; @@ -421,7 +432,7 @@ std::tuple> realign_moves(const std::string& quer edlibFreeAlignResult(edlib_result); - return std::make_tuple(old_moves_offset, target_start - 1, std::move(new_moves)); + return std::make_tuple(old_moves_offset, query_start, std::move(new_moves)); } std::vector move_cum_sums(const std::vector& moves) { diff --git a/dorado/utils/sequence_utils.h b/dorado/utils/sequence_utils.h index b8fefc2ca..3b611b8a7 100644 --- a/dorado/utils/sequence_utils.h +++ b/dorado/utils/sequence_utils.h @@ -50,7 +50,14 @@ class BaseInfo { size_t count_trailing_chars(const std::string_view seq, char c); size_t count_leading_chars(const std::string_view seq, char c); -// Result of overlapping two reads +/** + * @brief Result of overlapping two reads. + * + * The `OverlapResult` struct holds the results of overlapping a query sequence with a target sequence. + * The coordinates provided for `target_start`, `target_end`, `query_start`, and `query_end` indicate positions in the respective sequences. + * + * For example, `query_start` represents the location of the start of the query sequence in the target sequence, and so on. + */ struct OverlapResult { int32_t target_start; int32_t target_end; diff --git a/tests/RealignMovesTest.cpp b/tests/RealignMovesTest.cpp index 3418154d5..7cf8c4d4d 100644 --- a/tests/RealignMovesTest.cpp +++ b/tests/RealignMovesTest.cpp @@ -38,4 +38,148 @@ TEST_CASE("No alignment doesn't produce an error", TEST_GROUP) { CHECK(move_offset == -1); CHECK(target_start == -1); CHECK(new_moves.empty()); +} + +TEST_CASE("Test realign_moves - output moves correct", TEST_GROUP) { + SECTION("Test move table realignment of long identical sequences") { + auto query = + std::string("ACGACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTT"); + auto target = + std::string("ACGACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTT"); + + std::vector input_moves; + + for (char nucleotide : query) { + (void)nucleotide; + input_moves.push_back(1); + input_moves.push_back(0); + } + + auto [old_moves_offset, query_start, new_moves] = + utils::realign_moves(query, target, input_moves); + + CHECK(old_moves_offset == 0); + CHECK(query_start == 0); + CHECK(new_moves.size() == input_moves.size()); + CHECK(new_moves == input_moves); + } + + SECTION("Test move table realignment where target is suffix of query") { + auto query = std::string( + "TTTTACGACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTT"); + auto target = + std::string("ACGACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTT"); + + std::vector input_moves; + + for (char nucleotide : query) { + (void)nucleotide; + input_moves.push_back(1); + input_moves.push_back(0); + } + + auto [old_moves_offset, target_start, new_moves] = + utils::realign_moves(query, target, input_moves); // simplex, duplex, moves + + CHECK(old_moves_offset == 8); + CHECK(target_start == 0); + CHECK(new_moves.size() == input_moves.size() - 4 * 2); + CHECK(std::equal(new_moves.begin(), new_moves.end(), input_moves.begin() + 4 * 2)); + } + + SECTION("Test 2 - move table realignment where target is suffix of query") { + auto query = std::string( + "TTTTACGACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTT"); + auto target = + std::string("ACGACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTT"); + + std::vector input_moves; + + for (char nucleotide : query) { + (void)nucleotide; + input_moves.push_back(1); + input_moves.push_back(0); + input_moves.push_back(0); + } + + auto [old_moves_offset, target_start, new_moves] = + utils::realign_moves(query, target, input_moves); // simplex, duplex, moves + + CHECK(old_moves_offset == 4 * 3); + CHECK(target_start == 0); + CHECK(new_moves.size() == input_moves.size() - 4 * 3); + CHECK(std::equal(new_moves.begin(), new_moves.end(), input_moves.begin() + 4 * 3)); + } + + SECTION("Test move table realignment where query is suffix of target") { + auto query = + std::string("ACGACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTT"); + auto target = std::string( + "TTTTTACGACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTT"); + + std::vector input_moves; + + for (char nucleotide : query) { + (void)nucleotide; + input_moves.push_back(1); + input_moves.push_back(0); + input_moves.push_back(0); + input_moves.push_back(0); + } + + auto [old_moves_offset, target_start, new_moves] = + utils::realign_moves(query, target, input_moves); // simplex, duplex, moves + + CHECK(old_moves_offset == 0); + CHECK(target_start == 5); + CHECK(new_moves.size() == input_moves.size()); + CHECK(std::equal(new_moves.begin(), new_moves.end(), input_moves.begin())); + } + + SECTION("Test move table realignment where target is an infix of query") { + auto query = std::string( + "GGGGGACGACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTTGGGGG"); + auto target = + std::string("ACGACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTT"); + + std::vector input_moves; + + for (char nucleotide : query) { + (void)nucleotide; + input_moves.push_back(1); + input_moves.push_back(0); + input_moves.push_back(0); + input_moves.push_back(0); + } + + auto [old_moves_offset, target_start, new_moves] = + utils::realign_moves(query, target, input_moves); // simplex, duplex, moves + + CHECK(old_moves_offset == 5 * 4); + CHECK(target_start == 0); + CHECK(new_moves.size() == input_moves.size() - (5 + 5) * 4); + } + + SECTION("Test move table realignment where query is an infix of target") { + auto query = + std::string("ACGACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTT"); + auto target = std::string( + "GGGGGACGACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTTGGGGG"); + + std::vector input_moves; + + for (char nucleotide : query) { + (void)nucleotide; + input_moves.push_back(1); + input_moves.push_back(0); + input_moves.push_back(0); + } + + auto [old_moves_offset, target_start, new_moves] = + utils::realign_moves(query, target, input_moves); // simplex, duplex, moves + + CHECK(old_moves_offset == 0); + CHECK(target_start == 5); + CHECK(new_moves.size() == input_moves.size()); + } } \ No newline at end of file diff --git a/tests/SequenceUtilsTest.cpp b/tests/SequenceUtilsTest.cpp index 32f2ab931..a1703668d 100644 --- a/tests/SequenceUtilsTest.cpp +++ b/tests/SequenceUtilsTest.cpp @@ -11,6 +11,36 @@ using std::make_tuple; using namespace dorado::utils; +TEST_CASE(TEST_GROUP ": Test compute_overlap", TEST_GROUP) { + SECTION("Test overlaps of long identical strings") { + auto query = + std::string("ACGACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTT"); + auto target = + std::string("ACGACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTT"); + dorado::MmTbufPtr working_buffer; + auto overlap = compute_overlap(query, "query", target, "target", working_buffer); + + CHECK(overlap->query_start == 0); + CHECK(overlap->query_end == static_cast(query.size()) - 1); + CHECK(overlap->target_start == 0); + CHECK(overlap->target_end == static_cast(target.size()) - 1); + } + + SECTION("Test overlaps of strings where one is a prefix of the other") { + auto query = std::string( + "TTTTTACGACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTT"); + auto target = + std::string("ACGACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTT"); + dorado::MmTbufPtr working_buffer; + auto overlap = compute_overlap(query, "query", target, "target", working_buffer); + + CHECK(overlap->query_start == 0); + CHECK(overlap->query_end == static_cast(target.size()) - 1); + CHECK(overlap->target_start == 5); + CHECK(overlap->target_end == static_cast(query.size()) - 1); + } +} + TEST_CASE(TEST_GROUP ": Test base_to_int", TEST_GROUP) { CHECK(base_to_int('A') == 0); CHECK(base_to_int('C') == 1);