Skip to content

Commit

Permalink
Merge branch 'fix-modbase-duplex-bugs' into 'master'
Browse files Browse the repository at this point in the history
Fix for duplex modbase errors

Closes DOR-746

See merge request machine-learning/dorado!1146
  • Loading branch information
malton-ont committed Aug 16, 2024
2 parents bf2b41a + d5a9e75 commit 6ec77c8
Show file tree
Hide file tree
Showing 4 changed files with 214 additions and 22 deletions.
53 changes: 32 additions & 21 deletions dorado/utils/sequence_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -321,6 +321,9 @@ std::optional<OverlapResult> compute_overlap(const std::string& query_seq,
std::tuple<int, int, std::vector<uint8_t>> realign_moves(const std::string& query_sequence,
const std::string& target_sequence,
const std::vector<uint8_t>& moves) {
assert(static_cast<int>(query_sequence.length()) ==
std::accumulate(moves.begin(), moves.end(), 0));

// We are going to compute the overlap between the two reads
MmTbufPtr working_buffer;
const auto overlap_result =
Expand All @@ -337,20 +340,23 @@ std::tuple<int, int, std::vector<uint8_t>> realign_moves(const std::string& quer
const auto query_end = overlap_result->query_end;
const auto target_end = overlap_result->target_end;

// Advance the query and target position.
++query_start;
++target_start;
while (query_sequence[query_start] != target_sequence[target_start]) {
// Advance the query and target position so that their first nucleotide is identical
while (query_sequence[target_start] != target_sequence[query_start]) {
++query_start;
++target_start;
if (static_cast<size_t>(target_start) >= query_sequence.length() ||
static_cast<size_t>(query_start) >= target_sequence.length()) {
return failed_realignment;
}
}

EdlibAlignConfig align_config = edlibDefaultAlignConfig();
align_config.task = EDLIB_TASK_PATH;

auto target_sequence_component =
target_sequence.substr(target_start, target_end - target_start);
auto query_sequence_component = query_sequence.substr(query_start, query_end - query_start);
target_sequence.substr(query_start, query_end - query_start + 1);
auto query_sequence_component =
query_sequence.substr(target_start, target_end - target_start + 1);

EdlibAlignResult edlib_result = edlibAlign(
target_sequence_component.data(), static_cast<int>(target_sequence_component.length()),
Expand All @@ -366,52 +372,57 @@ std::tuple<int, int, std::vector<uint8_t>> realign_moves(const std::string& quer
return failed_realignment;
}

// Let's keep two cursor positions - one for the new move table and one for the old:
// Let's keep two cursor positions - one for the new move table which we are building, and one for the old where we track where we got to
int new_move_cursor = 0;
int old_move_cursor = 0;

// First step is to advance the moves table to the start of the aligment in the query.
int moves_found = 0;

while (moves_found < int(moves.size()) && moves_found < int(query_start)) {
moves_found += moves[old_move_cursor];
++old_move_cursor;
for (int i = 0; i < int(moves.size()); i++) {
moves_found += moves[i];
if (moves_found == target_start + 1) {
break;
}
old_move_cursor++;
}
--old_move_cursor; // We have gone one too far.
int old_moves_offset = old_move_cursor;

int old_moves_offset =
old_move_cursor; // Cursor indicating where the move table should now start

const auto alignment_size =
static_cast<size_t>(edlib_result.endLocations[0] - edlib_result.startLocations[0]);
static_cast<size_t>(edlib_result.endLocations[0] - edlib_result.startLocations[0]) + 1;
// Now that we have the alignment, we need to compute the new move table, by walking along the alignment
std::vector<uint8_t> new_moves;
for (size_t i = 0; i < alignment_size; i++) {
auto alignment_entry = edlib_result.alignment[i];
if ((alignment_entry == 0) ||
(alignment_entry ==
3)) { //Match or mismatch, need to update the new move table and move the cursor of the old move table.
if ((alignment_entry == 0) || (alignment_entry == 3)) { // Match or mismatch
// Need to update the new move table and move the cursor of the old move table.
new_moves.push_back(1); // We have a match so we need a 1 (move)
new_move_cursor++;
old_move_cursor++;

while (moves[old_move_cursor] == 0) {
while ((old_move_cursor < int(moves.size())) && moves[old_move_cursor] == 0) {
if (old_move_cursor < (new_move_cursor + old_moves_offset)) {
old_move_cursor++;
} else {
// If we have a zero in the old move table, we need to add zeros to the new move table to make it up
new_moves.push_back(0);
new_move_cursor++;
old_move_cursor++;
}
}
// Update the Query and target seq cursors
} else if (alignment_entry == 1) { //Insertion to target
} else if (alignment_entry == 1) { // Insertion to target
// If we have an insertion in the target, we need to add a 1 to the new move table, and increment the new move table cursor. the old move table cursor and new are now out of sync and need fixing.
new_moves.push_back(1);
new_move_cursor++;
} else if (alignment_entry == 2) { //Insertion to Query
} else if (alignment_entry == 2) { // Insertion to Query
// We have a query insertion, all we need to do is add zeros to the new move table to make it up, the signal can be assigned to the leftmost nucleotide in the sequence.
new_moves.push_back(0);
new_move_cursor++;
old_move_cursor++;
while (moves[old_move_cursor] == 0) {
while ((old_move_cursor < int(moves.size())) && moves[old_move_cursor] == 0) {
new_moves.push_back(0);
old_move_cursor++;
new_move_cursor++;
Expand All @@ -421,7 +432,7 @@ std::tuple<int, int, std::vector<uint8_t>> realign_moves(const std::string& quer

edlibFreeAlignResult(edlib_result);

return std::make_tuple(old_moves_offset, target_start - 1, std::move(new_moves));
return std::make_tuple(old_moves_offset, query_start, std::move(new_moves));
}

std::vector<uint64_t> move_cum_sums(const std::vector<uint8_t>& moves) {
Expand Down
9 changes: 8 additions & 1 deletion dorado/utils/sequence_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,14 @@ class BaseInfo {
size_t count_trailing_chars(const std::string_view seq, char c);
size_t count_leading_chars(const std::string_view seq, char c);

// Result of overlapping two reads
/**
* @brief Result of overlapping two reads.
*
* The `OverlapResult` struct holds the results of overlapping a query sequence with a target sequence.
* The coordinates provided for `target_start`, `target_end`, `query_start`, and `query_end` indicate positions in the respective sequences.
*
* For example, `query_start` represents the location of the start of the query sequence in the target sequence, and so on.
*/
struct OverlapResult {
int32_t target_start;
int32_t target_end;
Expand Down
144 changes: 144 additions & 0 deletions tests/RealignMovesTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,4 +38,148 @@ TEST_CASE("No alignment doesn't produce an error", TEST_GROUP) {
CHECK(move_offset == -1);
CHECK(target_start == -1);
CHECK(new_moves.empty());
}

TEST_CASE("Test realign_moves - output moves correct", TEST_GROUP) {
SECTION("Test move table realignment of long identical sequences") {
auto query =
std::string("ACGACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTT");
auto target =
std::string("ACGACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTT");

std::vector<uint8_t> input_moves;

for (char nucleotide : query) {
(void)nucleotide;
input_moves.push_back(1);
input_moves.push_back(0);
}

auto [old_moves_offset, query_start, new_moves] =
utils::realign_moves(query, target, input_moves);

CHECK(old_moves_offset == 0);
CHECK(query_start == 0);
CHECK(new_moves.size() == input_moves.size());
CHECK(new_moves == input_moves);
}

SECTION("Test move table realignment where target is suffix of query") {
auto query = std::string(
"TTTTACGACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTT");
auto target =
std::string("ACGACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTT");

std::vector<uint8_t> input_moves;

for (char nucleotide : query) {
(void)nucleotide;
input_moves.push_back(1);
input_moves.push_back(0);
}

auto [old_moves_offset, target_start, new_moves] =
utils::realign_moves(query, target, input_moves); // simplex, duplex, moves

CHECK(old_moves_offset == 8);
CHECK(target_start == 0);
CHECK(new_moves.size() == input_moves.size() - 4 * 2);
CHECK(std::equal(new_moves.begin(), new_moves.end(), input_moves.begin() + 4 * 2));
}

SECTION("Test 2 - move table realignment where target is suffix of query") {
auto query = std::string(
"TTTTACGACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTT");
auto target =
std::string("ACGACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTT");

std::vector<uint8_t> input_moves;

for (char nucleotide : query) {
(void)nucleotide;
input_moves.push_back(1);
input_moves.push_back(0);
input_moves.push_back(0);
}

auto [old_moves_offset, target_start, new_moves] =
utils::realign_moves(query, target, input_moves); // simplex, duplex, moves

CHECK(old_moves_offset == 4 * 3);
CHECK(target_start == 0);
CHECK(new_moves.size() == input_moves.size() - 4 * 3);
CHECK(std::equal(new_moves.begin(), new_moves.end(), input_moves.begin() + 4 * 3));
}

SECTION("Test move table realignment where query is suffix of target") {
auto query =
std::string("ACGACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTT");
auto target = std::string(
"TTTTTACGACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTT");

std::vector<uint8_t> input_moves;

for (char nucleotide : query) {
(void)nucleotide;
input_moves.push_back(1);
input_moves.push_back(0);
input_moves.push_back(0);
input_moves.push_back(0);
}

auto [old_moves_offset, target_start, new_moves] =
utils::realign_moves(query, target, input_moves); // simplex, duplex, moves

CHECK(old_moves_offset == 0);
CHECK(target_start == 5);
CHECK(new_moves.size() == input_moves.size());
CHECK(std::equal(new_moves.begin(), new_moves.end(), input_moves.begin()));
}

SECTION("Test move table realignment where target is an infix of query") {
auto query = std::string(
"GGGGGACGACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTTGGGGG");
auto target =
std::string("ACGACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTT");

std::vector<uint8_t> input_moves;

for (char nucleotide : query) {
(void)nucleotide;
input_moves.push_back(1);
input_moves.push_back(0);
input_moves.push_back(0);
input_moves.push_back(0);
}

auto [old_moves_offset, target_start, new_moves] =
utils::realign_moves(query, target, input_moves); // simplex, duplex, moves

CHECK(old_moves_offset == 5 * 4);
CHECK(target_start == 0);
CHECK(new_moves.size() == input_moves.size() - (5 + 5) * 4);
}

SECTION("Test move table realignment where query is an infix of target") {
auto query =
std::string("ACGACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTT");
auto target = std::string(
"GGGGGACGACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTTGGGGG");

std::vector<uint8_t> input_moves;

for (char nucleotide : query) {
(void)nucleotide;
input_moves.push_back(1);
input_moves.push_back(0);
input_moves.push_back(0);
}

auto [old_moves_offset, target_start, new_moves] =
utils::realign_moves(query, target, input_moves); // simplex, duplex, moves

CHECK(old_moves_offset == 0);
CHECK(target_start == 5);
CHECK(new_moves.size() == input_moves.size());
}
}
30 changes: 30 additions & 0 deletions tests/SequenceUtilsTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,36 @@
using std::make_tuple;
using namespace dorado::utils;

TEST_CASE(TEST_GROUP ": Test compute_overlap", TEST_GROUP) {
SECTION("Test overlaps of long identical strings") {
auto query =
std::string("ACGACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTT");
auto target =
std::string("ACGACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTT");
dorado::MmTbufPtr working_buffer;
auto overlap = compute_overlap(query, "query", target, "target", working_buffer);

CHECK(overlap->query_start == 0);
CHECK(overlap->query_end == static_cast<int>(query.size()) - 1);
CHECK(overlap->target_start == 0);
CHECK(overlap->target_end == static_cast<int>(target.size()) - 1);
}

SECTION("Test overlaps of strings where one is a prefix of the other") {
auto query = std::string(
"TTTTTACGACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTT");
auto target =
std::string("ACGACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTT");
dorado::MmTbufPtr working_buffer;
auto overlap = compute_overlap(query, "query", target, "target", working_buffer);

CHECK(overlap->query_start == 0);
CHECK(overlap->query_end == static_cast<int>(target.size()) - 1);
CHECK(overlap->target_start == 5);
CHECK(overlap->target_end == static_cast<int>(query.size()) - 1);
}
}

TEST_CASE(TEST_GROUP ": Test base_to_int", TEST_GROUP) {
CHECK(base_to_int('A') == 0);
CHECK(base_to_int('C') == 1);
Expand Down

0 comments on commit 6ec77c8

Please sign in to comment.