From 8ad61e163f78b9b6b5733b26beccba364062894d Mon Sep 17 00:00:00 2001 From: Sergey Nurk Date: Sun, 14 May 2023 21:32:23 +0100 Subject: [PATCH 01/21] relax adapter search params --- dorado/read_pipeline/DuplexSplitNode.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dorado/read_pipeline/DuplexSplitNode.h b/dorado/read_pipeline/DuplexSplitNode.h index 7997d96e8..37f26f5f9 100644 --- a/dorado/read_pipeline/DuplexSplitNode.h +++ b/dorado/read_pipeline/DuplexSplitNode.h @@ -18,8 +18,8 @@ struct DuplexSplitSettings { int flank_edist = 150; int relaxed_flank_edist = 250; int adapter_edist = 4; - int relaxed_adapter_edist = 6; - uint64_t pore_adapter_range = 100; //bp + int relaxed_adapter_edist = 7; + uint64_t pore_adapter_range = 300; //bp //in bases uint64_t expect_adapter_prefix = 200; //in samples From 6c250c4f3c45b591c6da5427547a0b914618654e Mon Sep 17 00:00:00 2001 From: Sergey Nurk Date: Sun, 14 May 2023 21:34:05 +0100 Subject: [PATCH 02/21] switch to scaled pore threshold --- dorado/read_pipeline/DuplexSplitNode.cpp | 9 ++------- dorado/read_pipeline/DuplexSplitNode.h | 4 ++-- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/dorado/read_pipeline/DuplexSplitNode.cpp b/dorado/read_pipeline/DuplexSplitNode.cpp index 833c26ca6..fde4ba697 100644 --- a/dorado/read_pipeline/DuplexSplitNode.cpp +++ b/dorado/read_pipeline/DuplexSplitNode.cpp @@ -207,18 +207,13 @@ DuplexSplitNode::ExtRead::ExtRead(std::shared_ptr r) PosRanges DuplexSplitNode::possible_pore_regions(const DuplexSplitNode::ExtRead& read, float pore_thr) const { - PosRanges pore_regions; - - //pA formula before scaling: - //pA = read->scaling * (raw + read->offset); - //pA formula after scaling: - //pA = read->scale * raw + read->shift spdlog::trace("Analyzing signal in read {}", read.read->read_id); auto pore_sample_ranges = detect_pore_signal( - read.data_as_float32, (pore_thr - read.read->shift) / read.read->scale, + read.data_as_float32, pore_thr, m_settings.pore_cl_dist, m_settings.expect_pore_prefix); + PosRanges pore_regions; for (auto pore_sample_range : pore_sample_ranges) { auto move_start = pore_sample_range.first / read.read->model_stride; auto move_end = pore_sample_range.second / read.read->model_stride; diff --git a/dorado/read_pipeline/DuplexSplitNode.h b/dorado/read_pipeline/DuplexSplitNode.h index 37f26f5f9..e07176143 100644 --- a/dorado/read_pipeline/DuplexSplitNode.h +++ b/dorado/read_pipeline/DuplexSplitNode.h @@ -6,9 +6,9 @@ namespace dorado { struct DuplexSplitSettings { bool enabled = true; bool simplex_mode = false; - float pore_thr = 160.; + float pore_thr = 2.2; size_t pore_cl_dist = 4000; // TODO maybe use frequency * 1sec here? - float relaxed_pore_thr = 150.; + float relaxed_pore_thr = 2.; //usually template read region to the left of potential spacer region size_t end_flank = 1200; //trim potentially erroneous (and/or PCR adapter) bases at end of query From 0831dbb0fde2ef0d4481f86fc953383a6bff4663 Mon Sep 17 00:00:00 2001 From: Sergey Nurk Date: Sun, 14 May 2023 22:07:27 +0100 Subject: [PATCH 03/21] attempt at handling shorter reads --- dorado/read_pipeline/DuplexSplitNode.cpp | 67 ++++++++++++++++++------ dorado/read_pipeline/DuplexSplitNode.h | 8 +-- 2 files changed, 55 insertions(+), 20 deletions(-) diff --git a/dorado/read_pipeline/DuplexSplitNode.cpp b/dorado/read_pipeline/DuplexSplitNode.cpp index fde4ba697..2d3f7e659 100644 --- a/dorado/read_pipeline/DuplexSplitNode.cpp +++ b/dorado/read_pipeline/DuplexSplitNode.cpp @@ -243,20 +243,49 @@ bool DuplexSplitNode::check_nearby_adapter(const Read& read, PosRange r, int ada } //r is potential spacer region -bool DuplexSplitNode::check_flank_match(const Read& read, PosRange r, int dist_thr) const { - return r.first >= m_settings.end_flank && - r.second + m_settings.start_flank <= read.seq.length() && - check_rc_match(read.seq, {r.first - m_settings.end_flank, r.first - m_settings.end_trim}, - //including spacer region in search - {r.first, r.second + m_settings.start_flank}, dist_thr); -} +bool DuplexSplitNode::check_flank_match(const Read& read, PosRange r, float err_thr) const { + const uint64_t rlen = read.seq.length(); + assert(r.first <= r.second && r.second <= rlen); + if (r.first <= m_settings.end_trim || r.second == rlen) { + return false; + } + const auto s1 = (r.first > m_settings.end_flank) ? r.first - m_settings.end_flank : 0; + const auto e1 = r.first - m_settings.end_trim; + assert(s1 < e1); + //including spacer region in search + const auto s2 = r.first; + const auto e2 = std::min(r.second + m_settings.start_flank, rlen); + + if (e2 == rlen) { + //short read mode triggered + if ((e1 - s1) < m_settings.min_short_flank || + e2 - s2 < e1 - s1 || rlen > 3 * (e2 - s2)) { + return false; + } + } + + const int dist_thr = std::round(err_thr * (e1 - s1)); + + return check_rc_match(read.seq, {s1, e1}, + //including spacer region in search + {s2, e2}, dist_thr); + } + +//bool DuplexSplitNode::check_flank_match(const Read& read, PosRange r, int dist_thr) const { +// return r.first >= m_settings.end_flank && +// r.second + m_settings.start_flank <= read.seq.length() && +// check_rc_match(read.seq, {r.first - m_settings.end_flank, r.first - m_settings.end_trim}, +// //including spacer region in search +// {r.first, r.second + m_settings.start_flank}, dist_thr); +//} std::optional DuplexSplitNode::identify_extra_middle_split( const Read& read) const { + assert(m_settings.end_flank > m_settings.end_trim + m_settings.min_short_flank); const auto r_l = read.seq.size(); const auto search_span = std::max(m_settings.middle_adapter_search_span, int(std::round(m_settings.middle_adapter_search_frac * r_l))); - if (r_l < m_settings.end_flank + m_settings.start_flank || r_l < search_span) { + if (r_l < search_span) { return std::nullopt; } @@ -265,13 +294,17 @@ std::optional DuplexSplitNode::identify_extra_middle_ m_settings.adapter, read.seq, m_settings.relaxed_adapter_edist, {r_l / 2 - search_span / 2, r_l / 2 + search_span / 2})) { auto adapter_start = adapter_match->first; - spdlog::trace("Checking middle match & start/end match"); - if (check_flank_match(read, {adapter_start, adapter_start}, - m_settings.relaxed_flank_edist) && - check_rc_match(read.seq, {r_l - m_settings.end_flank, r_l - m_settings.end_trim}, - {0, m_settings.start_flank}, m_settings.relaxed_flank_edist)) { - return PosRange{adapter_start - 1, adapter_start}; + spdlog::trace("Checking middle match & start/end match (unless read is short)"); + if (!check_flank_match(read, {adapter_start, adapter_start}, + m_settings.relaxed_flank_err)) { + return std::nullopt; } + const int dist_thr = std::round(m_settings.relaxed_flank_err * (m_settings.end_flank - m_settings.end_trim)); + if (r_l < 2 * m_settings.end_flank || + check_rc_match(read.seq, {r_l - m_settings.end_flank, r_l - m_settings.end_trim}, + {0, std::min(r_l, m_settings.start_flank)}, dist_thr)) { + return PosRange{adapter_start - 1, adapter_start}; + } } return std::nullopt; } @@ -328,7 +361,7 @@ DuplexSplitNode::build_split_finders() const { filter_ranges(possible_pore_regions(read, m_settings.pore_thr), [&](PosRange r) { return check_flank_match(*read.read, r, - m_settings.flank_edist); + m_settings.flank_err); }), m_settings.end_flank + m_settings.start_flank); }}); @@ -343,7 +376,7 @@ DuplexSplitNode::build_split_finders() const { m_settings.relaxed_adapter_edist) && check_flank_match( *read.read, r, - m_settings.relaxed_flank_edist); + m_settings.relaxed_flank_err); }), m_settings.end_flank + m_settings.start_flank); }}); @@ -356,7 +389,7 @@ DuplexSplitNode::build_split_finders() const { [&](PosRange r) { return check_flank_match(*read.read, {r.first, r.first}, - m_settings.flank_edist); + m_settings.flank_err); }); }}); diff --git a/dorado/read_pipeline/DuplexSplitNode.h b/dorado/read_pipeline/DuplexSplitNode.h index e07176143..211eb81ce 100644 --- a/dorado/read_pipeline/DuplexSplitNode.h +++ b/dorado/read_pipeline/DuplexSplitNode.h @@ -15,8 +15,10 @@ struct DuplexSplitSettings { size_t end_trim = 200; //adjusted to adapter presense and potential loss of bases on query, leading to 'shift' size_t start_flank = 1700; - int flank_edist = 150; - int relaxed_flank_edist = 250; + //minimal query size to consider in "short read" case + size_t min_short_flank = 300; + float flank_err = 0.175; + float relaxed_flank_err = 0.275; int adapter_edist = 4; int relaxed_adapter_edist = 7; uint64_t pore_adapter_range = 300; //bp @@ -62,7 +64,7 @@ class DuplexSplitNode : public MessageSink { std::vector possible_pore_regions(const ExtRead& read, float pore_thr) const; bool check_nearby_adapter(const Read& read, PosRange r, int adapter_edist) const; - bool check_flank_match(const Read& read, PosRange r, int dist_thr) const; + bool check_flank_match(const Read& read, PosRange r, float err_thr) const; std::optional identify_extra_middle_split(const Read& read) const; std::vector> subreads(std::shared_ptr read, From d6549c377bfb43a23ee5c5ce9e7b8c9108214395 Mon Sep 17 00:00:00 2001 From: Sergey Nurk Date: Sun, 14 May 2023 22:09:44 +0100 Subject: [PATCH 04/21] relax flank match identity threshold --- dorado/read_pipeline/DuplexSplitNode.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dorado/read_pipeline/DuplexSplitNode.h b/dorado/read_pipeline/DuplexSplitNode.h index 211eb81ce..602b8ac8f 100644 --- a/dorado/read_pipeline/DuplexSplitNode.h +++ b/dorado/read_pipeline/DuplexSplitNode.h @@ -17,7 +17,7 @@ struct DuplexSplitSettings { size_t start_flank = 1700; //minimal query size to consider in "short read" case size_t min_short_flank = 300; - float flank_err = 0.175; + float flank_err = 0.22; float relaxed_flank_err = 0.275; int adapter_edist = 4; int relaxed_adapter_edist = 7; From 8eebeb58161846e1c63a846585839a20e467f0c7 Mon Sep 17 00:00:00 2001 From: Sergey Nurk Date: Sun, 14 May 2023 22:08:53 +0100 Subject: [PATCH 05/21] adjust flank check region --- dorado/read_pipeline/DuplexSplitNode.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dorado/read_pipeline/DuplexSplitNode.cpp b/dorado/read_pipeline/DuplexSplitNode.cpp index 2d3f7e659..97df2a14b 100644 --- a/dorado/read_pipeline/DuplexSplitNode.cpp +++ b/dorado/read_pipeline/DuplexSplitNode.cpp @@ -254,7 +254,9 @@ bool DuplexSplitNode::check_flank_match(const Read& read, PosRange r, float err_ assert(s1 < e1); //including spacer region in search const auto s2 = r.first; - const auto e2 = std::min(r.second + m_settings.start_flank, rlen); + //(r.second - r.first) adjusts for potentially incorrectly detected split region + //, shifting into correct sequence + const auto e2 = std::min(r.second + m_settings.start_flank + (r.second - r.first), rlen); if (e2 == rlen) { //short read mode triggered From 31f41b20b6ea4914f224594d4c276739114baf64 Mon Sep 17 00:00:00 2001 From: Sergey Nurk Date: Mon, 15 May 2023 01:21:53 +0100 Subject: [PATCH 06/21] no-adapter middle split strategy --- dorado/read_pipeline/DuplexSplitNode.cpp | 79 ++++++++++++++++++++++-- dorado/read_pipeline/DuplexSplitNode.h | 1 + 2 files changed, 74 insertions(+), 6 deletions(-) diff --git a/dorado/read_pipeline/DuplexSplitNode.cpp b/dorado/read_pipeline/DuplexSplitNode.cpp index 97df2a14b..de3b664be 100644 --- a/dorado/read_pipeline/DuplexSplitNode.cpp +++ b/dorado/read_pipeline/DuplexSplitNode.cpp @@ -121,7 +121,9 @@ std::vector find_adapter_matches(const std::string& adapter, } //semi-global alignment of "template region" to "complement region" -bool check_rc_match(const std::string& seq, PosRange templ_r, PosRange compl_r, int dist_thr) { +//returns range in the compl_r +std::optional +check_rc_match(const std::string& seq, PosRange templ_r, PosRange compl_r, int dist_thr) { assert(templ_r.second > templ_r.first); assert(compl_r.second > compl_r.first); assert(dist_thr >= 0); @@ -129,17 +131,24 @@ bool check_rc_match(const std::string& seq, PosRange templ_r, PosRange compl_r, auto rc_compl = dorado::utils::reverse_complement( seq.substr(compl_r.first, compl_r.second - compl_r.first)); - auto edlib_cfg = edlibNewAlignConfig(dist_thr, EDLIB_MODE_HW, EDLIB_TASK_DISTANCE, NULL, 0); + auto edlib_cfg = edlibNewAlignConfig(dist_thr, EDLIB_MODE_HW, EDLIB_TASK_LOC, NULL, 0); auto edlib_result = edlibAlign(seq.c_str() + templ_r.first, templ_r.second - templ_r.first, rc_compl.c_str(), rc_compl.size(), edlib_cfg); assert(edlib_result.status == EDLIB_STATUS_OK); bool match = (edlib_result.status == EDLIB_STATUS_OK) && (edlib_result.editDistance != -1); - assert(!match || edlib_result.editDistance <= dist_thr); + std::optional res = std::nullopt; + if (match) { + assert(edlib_result.editDistance <= dist_thr); + assert(edlib_result.numLocations > 0 && + edlib_result.endLocations[0] < compl_r.second && + edlib_result.startLocations[0] < compl_r.second); + res = PosRange(compl_r.second - edlib_result.endLocations[0], compl_r.second - edlib_result.startLocations[0]); + } edlibFreeAlignResult(edlib_result); - return match; + return res; } //TODO end_reason access? @@ -258,6 +267,7 @@ bool DuplexSplitNode::check_flank_match(const Read& read, PosRange r, float err_ //, shifting into correct sequence const auto e2 = std::min(r.second + m_settings.start_flank + (r.second - r.first), rlen); + //TODO magic constant 3 (determines minimal size of the 'right' region in terms of full read) if (e2 == rlen) { //short read mode triggered if ((e1 - s1) < m_settings.min_short_flank || @@ -270,7 +280,7 @@ bool DuplexSplitNode::check_flank_match(const Read& read, PosRange r, float err_ return check_rc_match(read.seq, {s1, e1}, //including spacer region in search - {s2, e2}, dist_thr); + {s2, e2}, dist_thr).has_value(); } //bool DuplexSplitNode::check_flank_match(const Read& read, PosRange r, int dist_thr) const { @@ -281,7 +291,7 @@ bool DuplexSplitNode::check_flank_match(const Read& read, PosRange r, float err_ // {r.first, r.second + m_settings.start_flank}, dist_thr); //} -std::optional DuplexSplitNode::identify_extra_middle_split( +std::optional DuplexSplitNode::identify_middle_adapter_split( const Read& read) const { assert(m_settings.end_flank > m_settings.end_trim + m_settings.min_short_flank); const auto r_l = read.seq.size(); @@ -311,6 +321,55 @@ std::optional DuplexSplitNode::identify_extra_middle_ return std::nullopt; } +std::optional DuplexSplitNode::identify_extra_middle_split( + const Read& read) const { + const size_t r_l = read.seq.size(); + //TODO parameterize + static const float ext_start_frac = 0.1; + //extend to tolerate some extra length difference + const auto ext_start_flank = std::max(size_t(ext_start_frac * r_l), m_settings.start_flank); + //further consider only reasonably long reads + if (r_l < 2 * m_settings.end_flank) { + return std::nullopt; + } + + int relaxed_flank_edist = std::round(m_settings.relaxed_flank_err * (m_settings.end_flank - m_settings.end_trim)); + + spdlog::trace("Checking start/end match"); + if (auto templ_start_match = check_rc_match(read.seq, {r_l - m_settings.end_flank, r_l - m_settings.end_trim}, + {0, std::min(r_l, ext_start_flank)}, relaxed_flank_edist)) { + //matched region and query overlap + if (templ_start_match->second + m_settings.end_flank >= r_l) { + return std::nullopt; + } + size_t est_middle = (templ_start_match->second + (r_l - m_settings.end_flank)) / 2; + spdlog::trace("Middle estimate {}", est_middle); + //TODO parameterize + static const int min_split_margin = 100; + static const float split_margin_frac = 0.05; + const auto split_margin = std::max(min_split_margin, int(split_margin_frac * r_l)); + + const auto s1 = (est_middle < split_margin + m_settings.end_flank) ? 0 : est_middle - split_margin - m_settings.end_flank; + const auto e1 = (est_middle < split_margin + m_settings.end_trim) ? 0 : est_middle - split_margin - m_settings.end_trim; + + if (e1 - s1 < m_settings.min_short_flank) { + return std::nullopt; + } + + const auto s2 = est_middle - split_margin; + const auto e2 = std::min(r_l, est_middle + 2 * split_margin + m_settings.start_flank); + spdlog::trace("Checking approx middle match"); + + relaxed_flank_edist = std::round(m_settings.relaxed_flank_err * (e1 - s1)); + if (auto compl_start_match = check_rc_match(read.seq, {s1, e1}, {s2, e2}, relaxed_flank_edist)) { + est_middle = (e1 + compl_start_match->first) / 2; + spdlog::trace("Middle re-estimate {}", est_middle); + return PosRange{est_middle - 1, est_middle}; + } + } + return std::nullopt; +} + std::vector> DuplexSplitNode::subreads( std::shared_ptr read, const std::vector& spacers) const { @@ -396,6 +455,14 @@ DuplexSplitNode::build_split_finders() const { }}); split_finders.push_back({"ADAPTER_MIDDLE", [&](const ExtRead& read) { + if (auto split = identify_middle_adapter_split(*read.read)) { + return PosRanges{*split}; + } else { + return PosRanges(); + } + }}); + + split_finders.push_back({"SPLIT_MIDDLE", [&](const ExtRead& read) { if (auto split = identify_extra_middle_split(*read.read)) { return PosRanges{*split}; } else { diff --git a/dorado/read_pipeline/DuplexSplitNode.h b/dorado/read_pipeline/DuplexSplitNode.h index 602b8ac8f..1ee4bd535 100644 --- a/dorado/read_pipeline/DuplexSplitNode.h +++ b/dorado/read_pipeline/DuplexSplitNode.h @@ -65,6 +65,7 @@ class DuplexSplitNode : public MessageSink { std::vector possible_pore_regions(const ExtRead& read, float pore_thr) const; bool check_nearby_adapter(const Read& read, PosRange r, int adapter_edist) const; bool check_flank_match(const Read& read, PosRange r, float err_thr) const; + std::optional identify_middle_adapter_split(const Read& read) const; std::optional identify_extra_middle_split(const Read& read) const; std::vector> subreads(std::shared_ptr read, From cef28d04459756b87096577e5fa71acd41f52413 Mon Sep 17 00:00:00 2001 From: Sergey Nurk Date: Mon, 15 May 2023 11:08:31 +0100 Subject: [PATCH 07/21] configure max pore region size --- dorado/read_pipeline/DuplexSplitNode.cpp | 4 +++- dorado/read_pipeline/DuplexSplitNode.h | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/dorado/read_pipeline/DuplexSplitNode.cpp b/dorado/read_pipeline/DuplexSplitNode.cpp index de3b664be..107bdb82b 100644 --- a/dorado/read_pipeline/DuplexSplitNode.cpp +++ b/dorado/read_pipeline/DuplexSplitNode.cpp @@ -237,7 +237,9 @@ PosRanges DuplexSplitNode::possible_pore_regions(const DuplexSplitNode::ExtRead& //NB. adding adapter length auto end_pos = read.move_sums[move_end]; assert(end_pos > start_pos); - pore_regions.push_back({start_pos, end_pos}); + if (end_pos <= start_pos + m_settings.max_pore_region) { + pore_regions.push_back({start_pos, end_pos}); + } } return pore_regions; diff --git a/dorado/read_pipeline/DuplexSplitNode.h b/dorado/read_pipeline/DuplexSplitNode.h index 1ee4bd535..bbed192ce 100644 --- a/dorado/read_pipeline/DuplexSplitNode.h +++ b/dorado/read_pipeline/DuplexSplitNode.h @@ -9,6 +9,8 @@ struct DuplexSplitSettings { float pore_thr = 2.2; size_t pore_cl_dist = 4000; // TODO maybe use frequency * 1sec here? float relaxed_pore_thr = 2.; + //maximal 'open pore' region to consider (bp) + size_t max_pore_region = 200; //usually template read region to the left of potential spacer region size_t end_flank = 1200; //trim potentially erroneous (and/or PCR adapter) bases at end of query From 20fd3f88e9aebae25dc54a0e23ebf6fb8773e97a Mon Sep 17 00:00:00 2001 From: Sergey Nurk Date: Mon, 15 May 2023 13:23:44 +0100 Subject: [PATCH 08/21] remove relaxed_pore_threshold and precompute potential pore regions --- dorado/read_pipeline/DuplexSplitNode.cpp | 31 +++++++++++++----------- dorado/read_pipeline/DuplexSplitNode.h | 7 +++--- 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/dorado/read_pipeline/DuplexSplitNode.cpp b/dorado/read_pipeline/DuplexSplitNode.cpp index 107bdb82b..fae082385 100644 --- a/dorado/read_pipeline/DuplexSplitNode.cpp +++ b/dorado/read_pipeline/DuplexSplitNode.cpp @@ -206,20 +206,23 @@ std::shared_ptr subread(const Read& read, PosRange seq_range, PosRange sig namespace dorado { -DuplexSplitNode::ExtRead::ExtRead(std::shared_ptr r) - : read(std::move(r)), - data_as_float32(read->raw_data.to(torch::kFloat)), - move_sums(utils::move_cum_sums(read->moves)) { - assert(!move_sums.empty()); - assert(move_sums.back() == read->seq.length()); +DuplexSplitNode::ExtRead +DuplexSplitNode::create_ext_read(std::shared_ptr r) const { + ExtRead ext_read; + ext_read.read = r; + ext_read.move_sums = utils::move_cum_sums(r->moves); + assert(!ext_read.move_sums.empty()); + assert(ext_read.move_sums.back() == r->seq.length()); + ext_read.data_as_float32 = r->raw_data.to(torch::kFloat); + ext_read.possible_pore_regions = possible_pore_regions(ext_read); + return ext_read; } -PosRanges DuplexSplitNode::possible_pore_regions(const DuplexSplitNode::ExtRead& read, - float pore_thr) const { +PosRanges DuplexSplitNode::possible_pore_regions(const DuplexSplitNode::ExtRead& read) const { spdlog::trace("Analyzing signal in read {}", read.read->read_id); auto pore_sample_ranges = detect_pore_signal( - read.data_as_float32, pore_thr, + read.data_as_float32, m_settings.pore_thr, m_settings.pore_cl_dist, m_settings.expect_pore_prefix); PosRanges pore_regions; @@ -412,7 +415,7 @@ DuplexSplitNode::build_split_finders() const { split_finders.push_back( {"PORE_ADAPTER", [&](const ExtRead& read) { return filter_ranges( - possible_pore_regions(read, m_settings.pore_thr), [&](PosRange r) { + read.possible_pore_regions, [&](PosRange r) { return check_nearby_adapter(*read.read, r, m_settings.adapter_edist); }); }}); @@ -421,7 +424,7 @@ DuplexSplitNode::build_split_finders() const { split_finders.push_back( {"PORE_FLANK", [&](const ExtRead& read) { return merge_ranges( - filter_ranges(possible_pore_regions(read, m_settings.pore_thr), + filter_ranges(read.possible_pore_regions, [&](PosRange r) { return check_flank_match(*read.read, r, m_settings.flank_err); @@ -432,7 +435,7 @@ DuplexSplitNode::build_split_finders() const { split_finders.push_back( {"PORE_ALL", [&](const ExtRead& read) { return merge_ranges( - filter_ranges(possible_pore_regions(read, m_settings.relaxed_pore_thr), + filter_ranges(read.possible_pore_regions, [&](PosRange r) { return check_nearby_adapter( *read.read, r, @@ -489,7 +492,7 @@ std::vector> DuplexSplitNode::split(std::shared_ptr return std::vector>{std::move(init_read)}; } - std::vector to_split{ExtRead(init_read)}; + std::vector to_split{create_ext_read(init_read)}; for (const auto& [description, split_f] : m_split_finders) { spdlog::trace("Running {}", description); std::vector split_round_result; @@ -502,7 +505,7 @@ std::vector> DuplexSplitNode::split(std::shared_ptr split_round_result.push_back(std::move(r)); } else { for (auto sr : subreads(r.read, spacers)) { - split_round_result.emplace_back(sr); + split_round_result.push_back(create_ext_read(sr)); } } } diff --git a/dorado/read_pipeline/DuplexSplitNode.h b/dorado/read_pipeline/DuplexSplitNode.h index bbed192ce..d9319a274 100644 --- a/dorado/read_pipeline/DuplexSplitNode.h +++ b/dorado/read_pipeline/DuplexSplitNode.h @@ -8,7 +8,6 @@ struct DuplexSplitSettings { bool simplex_mode = false; float pore_thr = 2.2; size_t pore_cl_dist = 4000; // TODO maybe use frequency * 1sec here? - float relaxed_pore_thr = 2.; //maximal 'open pore' region to consider (bp) size_t max_pore_region = 200; //usually template read region to the left of potential spacer region @@ -58,13 +57,13 @@ class DuplexSplitNode : public MessageSink { std::shared_ptr read; torch::Tensor data_as_float32; std::vector move_sums; - - explicit ExtRead(std::shared_ptr r); + PosRanges possible_pore_regions; }; typedef std::function SplitFinderF; - std::vector possible_pore_regions(const ExtRead& read, float pore_thr) const; + ExtRead create_ext_read(std::shared_ptr r) const; + std::vector possible_pore_regions(const ExtRead& read) const; bool check_nearby_adapter(const Read& read, PosRange r, int adapter_edist) const; bool check_flank_match(const Read& read, PosRange r, float err_thr) const; std::optional identify_middle_adapter_split(const Read& read) const; From 01e5e5df56f1eb29cbcdb4b123e8ef533eb7ef17 Mon Sep 17 00:00:00 2001 From: Sergey Nurk Date: Mon, 15 May 2023 13:27:53 +0100 Subject: [PATCH 09/21] formatting --- dorado/read_pipeline/DuplexSplitNode.cpp | 94 +++++++++++++----------- 1 file changed, 51 insertions(+), 43 deletions(-) diff --git a/dorado/read_pipeline/DuplexSplitNode.cpp b/dorado/read_pipeline/DuplexSplitNode.cpp index fae082385..953df67b7 100644 --- a/dorado/read_pipeline/DuplexSplitNode.cpp +++ b/dorado/read_pipeline/DuplexSplitNode.cpp @@ -122,8 +122,10 @@ std::vector find_adapter_matches(const std::string& adapter, //semi-global alignment of "template region" to "complement region" //returns range in the compl_r -std::optional -check_rc_match(const std::string& seq, PosRange templ_r, PosRange compl_r, int dist_thr) { +std::optional check_rc_match(const std::string& seq, + PosRange templ_r, + PosRange compl_r, + int dist_thr) { assert(templ_r.second > templ_r.first); assert(compl_r.second > compl_r.first); assert(dist_thr >= 0); @@ -141,10 +143,10 @@ check_rc_match(const std::string& seq, PosRange templ_r, PosRange compl_r, int d std::optional res = std::nullopt; if (match) { assert(edlib_result.editDistance <= dist_thr); - assert(edlib_result.numLocations > 0 && - edlib_result.endLocations[0] < compl_r.second && - edlib_result.startLocations[0] < compl_r.second); - res = PosRange(compl_r.second - edlib_result.endLocations[0], compl_r.second - edlib_result.startLocations[0]); + assert(edlib_result.numLocations > 0 && edlib_result.endLocations[0] < compl_r.second && + edlib_result.startLocations[0] < compl_r.second); + res = PosRange(compl_r.second - edlib_result.endLocations[0], + compl_r.second - edlib_result.startLocations[0]); } edlibFreeAlignResult(edlib_result); @@ -206,8 +208,7 @@ std::shared_ptr subread(const Read& read, PosRange seq_range, PosRange sig namespace dorado { -DuplexSplitNode::ExtRead -DuplexSplitNode::create_ext_read(std::shared_ptr r) const { +DuplexSplitNode::ExtRead DuplexSplitNode::create_ext_read(std::shared_ptr r) const { ExtRead ext_read; ext_read.read = r; ext_read.move_sums = utils::move_cum_sums(r->moves); @@ -221,9 +222,9 @@ DuplexSplitNode::create_ext_read(std::shared_ptr r) const { PosRanges DuplexSplitNode::possible_pore_regions(const DuplexSplitNode::ExtRead& read) const { spdlog::trace("Analyzing signal in read {}", read.read->read_id); - auto pore_sample_ranges = detect_pore_signal( - read.data_as_float32, m_settings.pore_thr, - m_settings.pore_cl_dist, m_settings.expect_pore_prefix); + auto pore_sample_ranges = + detect_pore_signal(read.data_as_float32, m_settings.pore_thr, m_settings.pore_cl_dist, + m_settings.expect_pore_prefix); PosRanges pore_regions; for (auto pore_sample_range : pore_sample_ranges) { @@ -275,8 +276,7 @@ bool DuplexSplitNode::check_flank_match(const Read& read, PosRange r, float err_ //TODO magic constant 3 (determines minimal size of the 'right' region in terms of full read) if (e2 == rlen) { //short read mode triggered - if ((e1 - s1) < m_settings.min_short_flank || - e2 - s2 < e1 - s1 || rlen > 3 * (e2 - s2)) { + if ((e1 - s1) < m_settings.min_short_flank || e2 - s2 < e1 - s1 || rlen > 3 * (e2 - s2)) { return false; } } @@ -284,9 +284,10 @@ bool DuplexSplitNode::check_flank_match(const Read& read, PosRange r, float err_ const int dist_thr = std::round(err_thr * (e1 - s1)); return check_rc_match(read.seq, {s1, e1}, - //including spacer region in search - {s2, e2}, dist_thr).has_value(); - } + //including spacer region in search + {s2, e2}, dist_thr) + .has_value(); +} //bool DuplexSplitNode::check_flank_match(const Read& read, PosRange r, int dist_thr) const { // return r.first >= m_settings.end_flank && @@ -313,15 +314,16 @@ std::optional DuplexSplitNode::identify_middle_adapte auto adapter_start = adapter_match->first; spdlog::trace("Checking middle match & start/end match (unless read is short)"); if (!check_flank_match(read, {adapter_start, adapter_start}, - m_settings.relaxed_flank_err)) { + m_settings.relaxed_flank_err)) { return std::nullopt; } - const int dist_thr = std::round(m_settings.relaxed_flank_err * (m_settings.end_flank - m_settings.end_trim)); + const int dist_thr = std::round(m_settings.relaxed_flank_err * + (m_settings.end_flank - m_settings.end_trim)); if (r_l < 2 * m_settings.end_flank || check_rc_match(read.seq, {r_l - m_settings.end_flank, r_l - m_settings.end_trim}, {0, std::min(r_l, m_settings.start_flank)}, dist_thr)) { - return PosRange{adapter_start - 1, adapter_start}; - } + return PosRange{adapter_start - 1, adapter_start}; + } } return std::nullopt; } @@ -338,11 +340,13 @@ std::optional DuplexSplitNode::identify_extra_middle_ return std::nullopt; } - int relaxed_flank_edist = std::round(m_settings.relaxed_flank_err * (m_settings.end_flank - m_settings.end_trim)); + int relaxed_flank_edist = + std::round(m_settings.relaxed_flank_err * (m_settings.end_flank - m_settings.end_trim)); spdlog::trace("Checking start/end match"); - if (auto templ_start_match = check_rc_match(read.seq, {r_l - m_settings.end_flank, r_l - m_settings.end_trim}, - {0, std::min(r_l, ext_start_flank)}, relaxed_flank_edist)) { + if (auto templ_start_match = + check_rc_match(read.seq, {r_l - m_settings.end_flank, r_l - m_settings.end_trim}, + {0, std::min(r_l, ext_start_flank)}, relaxed_flank_edist)) { //matched region and query overlap if (templ_start_match->second + m_settings.end_flank >= r_l) { return std::nullopt; @@ -354,8 +358,12 @@ std::optional DuplexSplitNode::identify_extra_middle_ static const float split_margin_frac = 0.05; const auto split_margin = std::max(min_split_margin, int(split_margin_frac * r_l)); - const auto s1 = (est_middle < split_margin + m_settings.end_flank) ? 0 : est_middle - split_margin - m_settings.end_flank; - const auto e1 = (est_middle < split_margin + m_settings.end_trim) ? 0 : est_middle - split_margin - m_settings.end_trim; + const auto s1 = (est_middle < split_margin + m_settings.end_flank) + ? 0 + : est_middle - split_margin - m_settings.end_flank; + const auto e1 = (est_middle < split_margin + m_settings.end_trim) + ? 0 + : est_middle - split_margin - m_settings.end_trim; if (e1 - s1 < m_settings.min_short_flank) { return std::nullopt; @@ -366,7 +374,8 @@ std::optional DuplexSplitNode::identify_extra_middle_ spdlog::trace("Checking approx middle match"); relaxed_flank_edist = std::round(m_settings.relaxed_flank_err * (e1 - s1)); - if (auto compl_start_match = check_rc_match(read.seq, {s1, e1}, {s2, e2}, relaxed_flank_edist)) { + if (auto compl_start_match = + check_rc_match(read.seq, {s1, e1}, {s2, e2}, relaxed_flank_edist)) { est_middle = (e1 + compl_start_match->first) / 2; spdlog::trace("Middle re-estimate {}", est_middle); return PosRange{est_middle - 1, est_middle}; @@ -412,25 +421,24 @@ std::vector> DuplexSplitNode::subreads( std::vector> DuplexSplitNode::build_split_finders() const { std::vector> split_finders; - split_finders.push_back( - {"PORE_ADAPTER", [&](const ExtRead& read) { - return filter_ranges( - read.possible_pore_regions, [&](PosRange r) { - return check_nearby_adapter(*read.read, r, m_settings.adapter_edist); - }); - }}); + split_finders.push_back({"PORE_ADAPTER", [&](const ExtRead& read) { + return filter_ranges(read.possible_pore_regions, [&](PosRange r) { + return check_nearby_adapter(*read.read, r, + m_settings.adapter_edist); + }); + }}); if (!m_settings.simplex_mode) { - split_finders.push_back( - {"PORE_FLANK", [&](const ExtRead& read) { - return merge_ranges( - filter_ranges(read.possible_pore_regions, - [&](PosRange r) { - return check_flank_match(*read.read, r, - m_settings.flank_err); - }), - m_settings.end_flank + m_settings.start_flank); - }}); + split_finders.push_back({"PORE_FLANK", [&](const ExtRead& read) { + return merge_ranges( + filter_ranges(read.possible_pore_regions, + [&](PosRange r) { + return check_flank_match( + *read.read, r, + m_settings.flank_err); + }), + m_settings.end_flank + m_settings.start_flank); + }}); split_finders.push_back( {"PORE_ALL", [&](const ExtRead& read) { From cc8a40625a2382f9a0f8f7f7f9bd7c8bbb7ca078 Mon Sep 17 00:00:00 2001 From: Sergey Nurk Date: Mon, 15 May 2023 13:42:17 +0100 Subject: [PATCH 10/21] more stringent pore-adapter distance threshold --- dorado/read_pipeline/DuplexSplitNode.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dorado/read_pipeline/DuplexSplitNode.h b/dorado/read_pipeline/DuplexSplitNode.h index d9319a274..a9be368c1 100644 --- a/dorado/read_pipeline/DuplexSplitNode.h +++ b/dorado/read_pipeline/DuplexSplitNode.h @@ -22,7 +22,7 @@ struct DuplexSplitSettings { float relaxed_flank_err = 0.275; int adapter_edist = 4; int relaxed_adapter_edist = 7; - uint64_t pore_adapter_range = 300; //bp + uint64_t pore_adapter_range = 100; //bp //in bases uint64_t expect_adapter_prefix = 200; //in samples From 12ce3580fd01b4c0edcb3c2260fa1b1d856b2f68 Mon Sep 17 00:00:00 2001 From: Sergey Nurk Date: Mon, 15 May 2023 13:49:27 +0100 Subject: [PATCH 11/21] further relaxed relaxed_adapter_edit distance --- dorado/read_pipeline/DuplexSplitNode.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dorado/read_pipeline/DuplexSplitNode.h b/dorado/read_pipeline/DuplexSplitNode.h index a9be368c1..3619caa8f 100644 --- a/dorado/read_pipeline/DuplexSplitNode.h +++ b/dorado/read_pipeline/DuplexSplitNode.h @@ -21,7 +21,7 @@ struct DuplexSplitSettings { float flank_err = 0.22; float relaxed_flank_err = 0.275; int adapter_edist = 4; - int relaxed_adapter_edist = 7; + int relaxed_adapter_edist = 8; uint64_t pore_adapter_range = 100; //bp //in bases uint64_t expect_adapter_prefix = 200; From d2dca2f96dcdd2cceaf0e7cf692a63004c4b32e8 Mon Sep 17 00:00:00 2001 From: Sergey Nurk Date: Mon, 15 May 2023 13:09:08 +0000 Subject: [PATCH 12/21] extra prints --- dorado/read_pipeline/DuplexSplitNode.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/dorado/read_pipeline/DuplexSplitNode.cpp b/dorado/read_pipeline/DuplexSplitNode.cpp index 953df67b7..a2ca41673 100644 --- a/dorado/read_pipeline/DuplexSplitNode.cpp +++ b/dorado/read_pipeline/DuplexSplitNode.cpp @@ -521,15 +521,18 @@ std::vector> DuplexSplitNode::split(std::shared_ptr } std::vector> split_result; + size_t max_sr_len = 0; for (const auto& ext_read : to_split) { + max_sr_len = std::max(ext_read.read->seq.size(), max_sr_len); split_result.push_back(std::move(ext_read.read)); } - spdlog::trace("Read {} split into {} subreads", init_read->read_id, split_result.size()); + spdlog::info("Read {} split into {} subreads. init_len {} max_subread_len {}", + init_read->read_id, split_result.size(), init_read->seq.size(), max_sr_len); auto stop_ts = high_resolution_clock::now(); - spdlog::trace("READ duration: {} microseconds (ID: {})", - duration_cast(stop_ts - start_ts).count(), init_read->read_id); + spdlog::info("READ duration: {} microseconds (ID: {})", + duration_cast(stop_ts - start_ts).count(), init_read->read_id); return split_result; } From e46a92a3a4077fe6bbb29a7fbc6a4d7ac67cff8d Mon Sep 17 00:00:00 2001 From: Sergey Nurk Date: Mon, 15 May 2023 20:22:44 +0000 Subject: [PATCH 13/21] increads max pore region size --- dorado/read_pipeline/DuplexSplitNode.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dorado/read_pipeline/DuplexSplitNode.h b/dorado/read_pipeline/DuplexSplitNode.h index 3619caa8f..c074f24d8 100644 --- a/dorado/read_pipeline/DuplexSplitNode.h +++ b/dorado/read_pipeline/DuplexSplitNode.h @@ -9,7 +9,7 @@ struct DuplexSplitSettings { float pore_thr = 2.2; size_t pore_cl_dist = 4000; // TODO maybe use frequency * 1sec here? //maximal 'open pore' region to consider (bp) - size_t max_pore_region = 200; + size_t max_pore_region = 500; //usually template read region to the left of potential spacer region size_t end_flank = 1200; //trim potentially erroneous (and/or PCR adapter) bases at end of query From 05e168446c17a190f3c0162db30fa6d798c997fd Mon Sep 17 00:00:00 2001 From: Sergey Nurk Date: Mon, 15 May 2023 22:05:31 +0100 Subject: [PATCH 14/21] Revert "extra prints" This reverts commit d2dca2f96dcdd2cceaf0e7cf692a63004c4b32e8. --- dorado/read_pipeline/DuplexSplitNode.cpp | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/dorado/read_pipeline/DuplexSplitNode.cpp b/dorado/read_pipeline/DuplexSplitNode.cpp index a2ca41673..953df67b7 100644 --- a/dorado/read_pipeline/DuplexSplitNode.cpp +++ b/dorado/read_pipeline/DuplexSplitNode.cpp @@ -521,18 +521,15 @@ std::vector> DuplexSplitNode::split(std::shared_ptr } std::vector> split_result; - size_t max_sr_len = 0; for (const auto& ext_read : to_split) { - max_sr_len = std::max(ext_read.read->seq.size(), max_sr_len); split_result.push_back(std::move(ext_read.read)); } - spdlog::info("Read {} split into {} subreads. init_len {} max_subread_len {}", - init_read->read_id, split_result.size(), init_read->seq.size(), max_sr_len); + spdlog::trace("Read {} split into {} subreads", init_read->read_id, split_result.size()); auto stop_ts = high_resolution_clock::now(); - spdlog::info("READ duration: {} microseconds (ID: {})", - duration_cast(stop_ts - start_ts).count(), init_read->read_id); + spdlog::trace("READ duration: {} microseconds (ID: {})", + duration_cast(stop_ts - start_ts).count(), init_read->read_id); return split_result; } From 43b340752e34b6246163cb3b60ad7a942b3646f6 Mon Sep 17 00:00:00 2001 From: Sergey Nurk Date: Mon, 12 Jun 2023 13:37:12 +0000 Subject: [PATCH 15/21] update num_samples attribute in subread --- dorado/read_pipeline/DuplexSplitNode.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/dorado/read_pipeline/DuplexSplitNode.cpp b/dorado/read_pipeline/DuplexSplitNode.cpp index 38be23cf8..7dec87b83 100644 --- a/dorado/read_pipeline/DuplexSplitNode.cpp +++ b/dorado/read_pipeline/DuplexSplitNode.cpp @@ -180,16 +180,17 @@ std::shared_ptr subread(const Read& read, PosRange seq_range, PosRange sig {torch::indexing::Slice(signal_range.first, signal_range.second)}); subread->attributes.read_number = -1; + //we adjust for it in new start time + subread->attributes.num_samples = signal_range.second - signal_range.first; + subread->num_trimmed_samples = 0; subread->start_sample = read.start_sample + read.num_trimmed_samples + signal_range.first; - subread->end_sample = read.start_sample + read.num_trimmed_samples + signal_range.second; + subread->end_sample = subread->start_sample + subread->attributes.num_samples; + auto start_time_ms = read.run_acquisition_start_time_ms + uint64_t(std::round(subread->start_sample * 1000. / subread->sample_rate)); subread->attributes.start_time = utils::get_string_timestamp_from_unix_time(start_time_ms); subread->start_time_ms = start_time_ms; - //we adjust for it in new start time above - subread->num_trimmed_samples = 0; - subread->seq = subread->seq.substr(seq_range.first, seq_range.second - seq_range.first); subread->qstring = subread->qstring.substr(seq_range.first, seq_range.second - seq_range.first); subread->moves = std::vector(subread->moves.begin() + signal_range.first / stride, From 4b3c8d617cfc6b03b791746850ab08b87ec2745b Mon Sep 17 00:00:00 2001 From: Sergey Nurk Date: Tue, 13 Jun 2023 19:59:08 +0000 Subject: [PATCH 16/21] refactor; unify flank region check logic; address review comments --- dorado/read_pipeline/DuplexSplitNode.cpp | 161 +++++++++++------------ dorado/read_pipeline/DuplexSplitNode.h | 12 +- 2 files changed, 86 insertions(+), 87 deletions(-) diff --git a/dorado/read_pipeline/DuplexSplitNode.cpp b/dorado/read_pipeline/DuplexSplitNode.cpp index 7dec87b83..b75c5e5f0 100644 --- a/dorado/read_pipeline/DuplexSplitNode.cpp +++ b/dorado/read_pipeline/DuplexSplitNode.cpp @@ -259,49 +259,46 @@ bool DuplexSplitNode::check_nearby_adapter(const Read& read, PosRange r, int ada .has_value(); } -//r is potential spacer region -bool DuplexSplitNode::check_flank_match(const Read& read, PosRange r, float err_thr) const { +//'spacer' is region potentially containing templ/compl strand boundary +//returns optional pair of matching ranges (first strictly to the left of spacer region) +std::optional> +DuplexSplitNode::check_flank_match(const Read& read, PosRange spacer, float err_thr) const { const uint64_t rlen = read.seq.length(); - assert(r.first <= r.second && r.second <= rlen); - if (r.first <= m_settings.end_trim || r.second == rlen) { - return false; + assert(spacer.first <= spacer.second && spacer.second <= rlen); + if (spacer.first <= m_settings.strand_end_trim || spacer.second == rlen) { + return std::nullopt; } - const auto s1 = (r.first > m_settings.end_flank) ? r.first - m_settings.end_flank : 0; - const auto e1 = r.first - m_settings.end_trim; - assert(s1 < e1); + + const uint64_t left_start = (spacer.first > m_settings.strand_end_flank) + ? spacer.first - m_settings.strand_end_flank + : 0; + const uint64_t left_end = spacer.first - m_settings.strand_end_trim; + assert(left_start < left_end); + const uint64_t left_span = left_end - left_start; + //including spacer region in search - const auto s2 = r.first; + const uint64_t right_start = spacer.first; //(r.second - r.first) adjusts for potentially incorrectly detected split region //, shifting into correct sequence - const auto e2 = std::min(r.second + m_settings.start_flank + (r.second - r.first), rlen); - - //TODO magic constant 3 (determines minimal size of the 'right' region in terms of full read) - if (e2 == rlen) { - //short read mode triggered - if ((e1 - s1) < m_settings.min_short_flank || e2 - s2 < e1 - s1 || rlen > 3 * (e2 - s2)) { - return false; + const uint64_t right_end = std::min( + spacer.second + m_settings.strand_start_flank + (spacer.second - spacer.first), rlen); + assert(right_start < right_end); + const uint64_t right_span = right_end - right_start; + + const int dist_thr = std::round(err_thr * left_span); + if (left_span >= m_settings.min_flank && right_span >= left_span) { + if (auto match = check_rc_match(read.seq, {left_start, left_end}, + //including spacer region in search + {right_start, right_end}, dist_thr)) { + return std::pair{PosRange{left_start, left_end}, *match}; } } - - const int dist_thr = std::round(err_thr * (e1 - s1)); - - return check_rc_match(read.seq, {s1, e1}, - //including spacer region in search - {s2, e2}, dist_thr) - .has_value(); + return std::nullopt; } -//bool DuplexSplitNode::check_flank_match(const Read& read, PosRange r, int dist_thr) const { -// return r.first >= m_settings.end_flank && -// r.second + m_settings.start_flank <= read.seq.length() && -// check_rc_match(read.seq, {r.first - m_settings.end_flank, r.first - m_settings.end_trim}, -// //including spacer region in search -// {r.first, r.second + m_settings.start_flank}, dist_thr); -//} - std::optional DuplexSplitNode::identify_middle_adapter_split( const Read& read) const { - assert(m_settings.end_flank > m_settings.end_trim + m_settings.min_short_flank); + assert(m_settings.strand_end_flank > m_settings.strand_end_trim + m_settings.min_flank); const auto r_l = read.seq.size(); const auto search_span = std::max(m_settings.middle_adapter_search_span, int(std::round(m_settings.middle_adapter_search_frac * r_l))); @@ -313,18 +310,29 @@ std::optional DuplexSplitNode::identify_middle_adapte if (auto adapter_match = find_best_adapter_match( m_settings.adapter, read.seq, m_settings.relaxed_adapter_edist, {r_l / 2 - search_span / 2, r_l / 2 + search_span / 2})) { - auto adapter_start = adapter_match->first; - spdlog::trace("Checking middle match & start/end match (unless read is short)"); - if (!check_flank_match(read, {adapter_start, adapter_start}, - m_settings.relaxed_flank_err)) { - return std::nullopt; - } - const int dist_thr = std::round(m_settings.relaxed_flank_err * - (m_settings.end_flank - m_settings.end_trim)); - if (r_l < 2 * m_settings.end_flank || - check_rc_match(read.seq, {r_l - m_settings.end_flank, r_l - m_settings.end_trim}, - {0, std::min(r_l, m_settings.start_flank)}, dist_thr)) { - return PosRange{adapter_start - 1, adapter_start}; + const uint64_t adapter_start = adapter_match->first; + const uint64_t adapter_end = adapter_match->second; + spdlog::trace("Checking middle match & start/end match"); + //Checking match around adapter + if (check_flank_match(read, {adapter_start, adapter_start}, m_settings.relaxed_flank_err)) { + //Checking start/end match + //some initializations might 'overflow' and not make sense, but not if check_rc_match below actually ends up checked! + const uint64_t query_start = r_l - m_settings.strand_end_flank; + const uint64_t query_end = r_l - m_settings.strand_end_trim; + const uint64_t query_span = query_end - query_start; + const int dist_thr = std::round(m_settings.relaxed_flank_err * query_span); + + const uint64_t template_start = 0; + const uint64_t template_end = std::min(m_settings.strand_start_flank, adapter_start); + const uint64_t template_span = template_end - template_start; + + if (adapter_end + m_settings.strand_end_flank > r_l || template_span < query_span || + check_rc_match( + read.seq, + {r_l - m_settings.strand_end_flank, r_l - m_settings.strand_end_trim}, + {0, std::min(m_settings.strand_start_flank, r_l)}, dist_thr)) { + return PosRange{adapter_start - 1, adapter_start}; + } } } return std::nullopt; @@ -336,49 +344,38 @@ std::optional DuplexSplitNode::identify_extra_middle_ //TODO parameterize static const float ext_start_frac = 0.1; //extend to tolerate some extra length difference - const auto ext_start_flank = std::max(size_t(ext_start_frac * r_l), m_settings.start_flank); + const auto ext_start_flank = + std::max(size_t(ext_start_frac * r_l), m_settings.strand_start_flank); //further consider only reasonably long reads - if (r_l < 2 * m_settings.end_flank) { + if (ext_start_flank + m_settings.strand_end_flank > r_l) { return std::nullopt; } int relaxed_flank_edist = - std::round(m_settings.relaxed_flank_err * (m_settings.end_flank - m_settings.end_trim)); + std::round(m_settings.relaxed_flank_err * + (m_settings.strand_end_flank - m_settings.strand_end_trim)); spdlog::trace("Checking start/end match"); - if (auto templ_start_match = - check_rc_match(read.seq, {r_l - m_settings.end_flank, r_l - m_settings.end_trim}, - {0, std::min(r_l, ext_start_flank)}, relaxed_flank_edist)) { - //matched region and query overlap - if (templ_start_match->second + m_settings.end_flank >= r_l) { + if (auto templ_start_match = check_rc_match( + read.seq, {r_l - m_settings.strand_end_flank, r_l - m_settings.strand_end_trim}, + {0, std::min(r_l, ext_start_flank)}, relaxed_flank_edist)) { + //check if matched region and query overlap + if (templ_start_match->second + m_settings.strand_end_flank > r_l) { return std::nullopt; } - size_t est_middle = (templ_start_match->second + (r_l - m_settings.end_flank)) / 2; + size_t est_middle = (templ_start_match->second + (r_l - m_settings.strand_end_flank)) / 2; spdlog::trace("Middle estimate {}", est_middle); //TODO parameterize static const int min_split_margin = 100; static const float split_margin_frac = 0.05; const auto split_margin = std::max(min_split_margin, int(split_margin_frac * r_l)); - const auto s1 = (est_middle < split_margin + m_settings.end_flank) - ? 0 - : est_middle - split_margin - m_settings.end_flank; - const auto e1 = (est_middle < split_margin + m_settings.end_trim) - ? 0 - : est_middle - split_margin - m_settings.end_trim; - - if (e1 - s1 < m_settings.min_short_flank) { - return std::nullopt; - } - - const auto s2 = est_middle - split_margin; - const auto e2 = std::min(r_l, est_middle + 2 * split_margin + m_settings.start_flank); spdlog::trace("Checking approx middle match"); - - relaxed_flank_edist = std::round(m_settings.relaxed_flank_err * (e1 - s1)); - if (auto compl_start_match = - check_rc_match(read.seq, {s1, e1}, {s2, e2}, relaxed_flank_edist)) { - est_middle = (e1 + compl_start_match->first) / 2; + if (auto middle_match_ranges = + check_flank_match(read, {est_middle - split_margin, est_middle + split_margin}, + m_settings.relaxed_flank_err)) { + est_middle = + (middle_match_ranges->first.second + middle_match_ranges->second.first) / 2; spdlog::trace("Middle re-estimate {}", est_middle); return PosRange{est_middle - 1, est_middle}; } @@ -431,16 +428,16 @@ DuplexSplitNode::build_split_finders() const { }}); if (!m_settings.simplex_mode) { - split_finders.push_back({"PORE_FLANK", [&](const ExtRead& read) { - return merge_ranges( - filter_ranges(read.possible_pore_regions, - [&](PosRange r) { - return check_flank_match( - *read.read, r, - m_settings.flank_err); - }), - m_settings.end_flank + m_settings.start_flank); - }}); + split_finders.push_back( + {"PORE_FLANK", [&](const ExtRead& read) { + return merge_ranges( + filter_ranges(read.possible_pore_regions, + [&](PosRange r) { + return check_flank_match(*read.read, r, + m_settings.flank_err); + }), + m_settings.strand_end_flank + m_settings.strand_start_flank); + }}); split_finders.push_back( {"PORE_ALL", [&](const ExtRead& read) { @@ -454,7 +451,7 @@ DuplexSplitNode::build_split_finders() const { *read.read, r, m_settings.relaxed_flank_err); }), - m_settings.end_flank + m_settings.start_flank); + m_settings.strand_end_flank + m_settings.strand_start_flank); }}); split_finders.push_back( diff --git a/dorado/read_pipeline/DuplexSplitNode.h b/dorado/read_pipeline/DuplexSplitNode.h index 8298c88dd..7a29b0d6d 100644 --- a/dorado/read_pipeline/DuplexSplitNode.h +++ b/dorado/read_pipeline/DuplexSplitNode.h @@ -19,13 +19,13 @@ struct DuplexSplitSettings { //maximal 'open pore' region to consider (bp) size_t max_pore_region = 500; //usually template read region to the left of potential spacer region - size_t end_flank = 1200; + size_t strand_end_flank = 1200; //trim potentially erroneous (and/or PCR adapter) bases at end of query - size_t end_trim = 200; + size_t strand_end_trim = 200; //adjusted to adapter presense and potential loss of bases on query, leading to 'shift' - size_t start_flank = 1700; + size_t strand_start_flank = 1700; //minimal query size to consider in "short read" case - size_t min_short_flank = 300; + size_t min_flank = 300; float flank_err = 0.22; float relaxed_flank_err = 0.275; int adapter_edist = 4; @@ -75,7 +75,9 @@ class DuplexSplitNode : public MessageSink { ExtRead create_ext_read(std::shared_ptr r) const; std::vector possible_pore_regions(const ExtRead& read) const; bool check_nearby_adapter(const Read& read, PosRange r, int adapter_edist) const; - bool check_flank_match(const Read& read, PosRange r, float err_thr) const; + std::optional> check_flank_match(const Read& read, + PosRange r, + float err_thr) const; std::optional identify_middle_adapter_split(const Read& read) const; std::optional identify_extra_middle_split(const Read& read) const; From cb9d463fd9941670cbe00c8d05c929f09a3f2877 Mon Sep 17 00:00:00 2001 From: Sergey Nurk Date: Tue, 13 Jun 2023 21:51:30 +0000 Subject: [PATCH 17/21] check extra fields in split test --- tests/DuplexSplitTest.cpp | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tests/DuplexSplitTest.cpp b/tests/DuplexSplitTest.cpp index f0f39577b..6bbe42435 100644 --- a/tests/DuplexSplitTest.cpp +++ b/tests/DuplexSplitTest.cpp @@ -65,6 +65,19 @@ TEST_CASE("4 subread splitting test", TEST_GROUP) { "2023-02-21T12:46:39.607+00:00", "2023-02-21T12:46:53.105+00:00"}); + std::vector start_time_mss; + for (auto &r : split_res) { + start_time_mss.push_back(r->start_time_ms); + } + REQUIRE(start_time_mss == + std::vector{1676983561529, 1676983585837, 1676983599607, 1676983613105}); + + std::vector num_sampless; + for (auto &r : split_res) { + num_sampless.push_back(r->attributes.num_samples); + } + REQUIRE(num_sampless == std::vector{97125, 55055, 53940, 50475}); + std::set names; for (auto &r : split_res) { names.insert(r->read_id); From c2eb24781f5350ea3268add5726934522b9c4fb9 Mon Sep 17 00:00:00 2001 From: Sergey Nurk Date: Wed, 14 Jun 2023 17:49:45 +0100 Subject: [PATCH 18/21] switch to uint64_t everywhere (also fixes mac os compilation) --- dorado/read_pipeline/DuplexSplitNode.cpp | 26 ++++++++++++------------ dorado/read_pipeline/DuplexSplitNode.h | 14 ++++++------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/dorado/read_pipeline/DuplexSplitNode.cpp b/dorado/read_pipeline/DuplexSplitNode.cpp index b75c5e5f0..584209745 100644 --- a/dorado/read_pipeline/DuplexSplitNode.cpp +++ b/dorado/read_pipeline/DuplexSplitNode.cpp @@ -32,7 +32,7 @@ auto filter_ranges(const PosRanges& ranges, FilterF filter_f) { //merges overlapping ranges and ranges separated by merge_dist or less //ranges supposed to be sorted by start coordinate -PosRanges merge_ranges(const PosRanges& ranges, size_t merge_dist) { +PosRanges merge_ranges(const PosRanges& ranges, uint64_t merge_dist) { PosRanges merged; for (auto& r : ranges) { assert(merged.empty() || r.first >= merged.back().first); @@ -45,16 +45,16 @@ PosRanges merge_ranges(const PosRanges& ranges, size_t merge_dist) { return merged; } -std::vector> detect_pore_signal(const torch::Tensor& signal, +std::vector> detect_pore_signal(const torch::Tensor& signal, float threshold, - size_t cluster_dist, - size_t ignore_prefix) { - std::vector> ans; + uint64_t cluster_dist, + uint64_t ignore_prefix) { + std::vector> ans; auto pore_a = signal.accessor(); int64_t cl_start = -1; int64_t cl_end = -1; - for (size_t i = ignore_prefix; i < pore_a.size(0); i++) { + for (auto i = ignore_prefix; i < pore_a.size(0); i++) { if (pore_a[i] > threshold) { //check if we need to start new cluster if (cl_end == -1 || i > cl_end + cluster_dist) { @@ -299,9 +299,9 @@ DuplexSplitNode::check_flank_match(const Read& read, PosRange spacer, float err_ std::optional DuplexSplitNode::identify_middle_adapter_split( const Read& read) const { assert(m_settings.strand_end_flank > m_settings.strand_end_trim + m_settings.min_flank); - const auto r_l = read.seq.size(); - const auto search_span = std::max(m_settings.middle_adapter_search_span, - int(std::round(m_settings.middle_adapter_search_frac * r_l))); + const uint64_t r_l = read.seq.size(); + const uint64_t search_span = std::max(m_settings.middle_adapter_search_span, + uint64_t(std::round(m_settings.middle_adapter_search_frac * r_l))); if (r_l < search_span) { return std::nullopt; } @@ -340,12 +340,12 @@ std::optional DuplexSplitNode::identify_middle_adapte std::optional DuplexSplitNode::identify_extra_middle_split( const Read& read) const { - const size_t r_l = read.seq.size(); + const uint64_t r_l = read.seq.size(); //TODO parameterize static const float ext_start_frac = 0.1; //extend to tolerate some extra length difference - const auto ext_start_flank = - std::max(size_t(ext_start_frac * r_l), m_settings.strand_start_flank); + const uint64_t ext_start_flank = + std::max(uint64_t(ext_start_frac * r_l), m_settings.strand_start_flank); //further consider only reasonably long reads if (ext_start_flank + m_settings.strand_end_flank > r_l) { return std::nullopt; @@ -363,7 +363,7 @@ std::optional DuplexSplitNode::identify_extra_middle_ if (templ_start_match->second + m_settings.strand_end_flank > r_l) { return std::nullopt; } - size_t est_middle = (templ_start_match->second + (r_l - m_settings.strand_end_flank)) / 2; + uint64_t est_middle = (templ_start_match->second + (r_l - m_settings.strand_end_flank)) / 2; spdlog::trace("Middle estimate {}", est_middle); //TODO parameterize static const int min_split_margin = 100; diff --git a/dorado/read_pipeline/DuplexSplitNode.h b/dorado/read_pipeline/DuplexSplitNode.h index 7a29b0d6d..7492d4cb3 100644 --- a/dorado/read_pipeline/DuplexSplitNode.h +++ b/dorado/read_pipeline/DuplexSplitNode.h @@ -15,17 +15,17 @@ struct DuplexSplitSettings { bool enabled = true; bool simplex_mode = false; float pore_thr = 2.2; - size_t pore_cl_dist = 4000; // TODO maybe use frequency * 1sec here? + uint64_t pore_cl_dist = 4000; // TODO maybe use frequency * 1sec here? //maximal 'open pore' region to consider (bp) - size_t max_pore_region = 500; + uint64_t max_pore_region = 500; //usually template read region to the left of potential spacer region - size_t strand_end_flank = 1200; + uint64_t strand_end_flank = 1200; //trim potentially erroneous (and/or PCR adapter) bases at end of query - size_t strand_end_trim = 200; + uint64_t strand_end_trim = 200; //adjusted to adapter presense and potential loss of bases on query, leading to 'shift' - size_t strand_start_flank = 1700; + uint64_t strand_start_flank = 1700; //minimal query size to consider in "short read" case - size_t min_flank = 300; + uint64_t min_flank = 300; float flank_err = 0.22; float relaxed_flank_err = 0.275; int adapter_edist = 4; @@ -35,7 +35,7 @@ struct DuplexSplitSettings { uint64_t expect_adapter_prefix = 200; //in samples uint64_t expect_pore_prefix = 5000; - int middle_adapter_search_span = 1000; + uint64_t middle_adapter_search_span = 1000; float middle_adapter_search_frac = 0.2; //TODO put in config From 699865b9a7fb02579788939b49a2e8443e314c71 Mon Sep 17 00:00:00 2001 From: Sergey Nurk Date: Wed, 14 Jun 2023 20:13:08 +0100 Subject: [PATCH 19/21] clang-format --- dorado/read_pipeline/DuplexSplitNode.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/dorado/read_pipeline/DuplexSplitNode.cpp b/dorado/read_pipeline/DuplexSplitNode.cpp index 584209745..ff4dc46ba 100644 --- a/dorado/read_pipeline/DuplexSplitNode.cpp +++ b/dorado/read_pipeline/DuplexSplitNode.cpp @@ -46,9 +46,9 @@ PosRanges merge_ranges(const PosRanges& ranges, uint64_t merge_dist) { } std::vector> detect_pore_signal(const torch::Tensor& signal, - float threshold, - uint64_t cluster_dist, - uint64_t ignore_prefix) { + float threshold, + uint64_t cluster_dist, + uint64_t ignore_prefix) { std::vector> ans; auto pore_a = signal.accessor(); int64_t cl_start = -1; @@ -300,8 +300,9 @@ std::optional DuplexSplitNode::identify_middle_adapte const Read& read) const { assert(m_settings.strand_end_flank > m_settings.strand_end_trim + m_settings.min_flank); const uint64_t r_l = read.seq.size(); - const uint64_t search_span = std::max(m_settings.middle_adapter_search_span, - uint64_t(std::round(m_settings.middle_adapter_search_frac * r_l))); + const uint64_t search_span = + std::max(m_settings.middle_adapter_search_span, + uint64_t(std::round(m_settings.middle_adapter_search_frac * r_l))); if (r_l < search_span) { return std::nullopt; } From 9faf75ea8250088316e057605eac5a489d0ebeb3 Mon Sep 17 00:00:00 2001 From: Sergey Nurk Date: Fri, 30 Jun 2023 12:44:44 +0000 Subject: [PATCH 20/21] stricter flank identity --- dorado/read_pipeline/DuplexSplitNode.cpp | 13 ++++++------- dorado/read_pipeline/DuplexSplitNode.h | 2 +- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/dorado/read_pipeline/DuplexSplitNode.cpp b/dorado/read_pipeline/DuplexSplitNode.cpp index ff4dc46ba..08da4d3b9 100644 --- a/dorado/read_pipeline/DuplexSplitNode.cpp +++ b/dorado/read_pipeline/DuplexSplitNode.cpp @@ -315,13 +315,13 @@ std::optional DuplexSplitNode::identify_middle_adapte const uint64_t adapter_end = adapter_match->second; spdlog::trace("Checking middle match & start/end match"); //Checking match around adapter - if (check_flank_match(read, {adapter_start, adapter_start}, m_settings.relaxed_flank_err)) { + if (check_flank_match(read, {adapter_start, adapter_start}, m_settings.flank_err)) { //Checking start/end match //some initializations might 'overflow' and not make sense, but not if check_rc_match below actually ends up checked! const uint64_t query_start = r_l - m_settings.strand_end_flank; const uint64_t query_end = r_l - m_settings.strand_end_trim; const uint64_t query_span = query_end - query_start; - const int dist_thr = std::round(m_settings.relaxed_flank_err * query_span); + const int dist_thr = std::round(m_settings.flank_err * query_span); const uint64_t template_start = 0; const uint64_t template_end = std::min(m_settings.strand_start_flank, adapter_start); @@ -352,14 +352,13 @@ std::optional DuplexSplitNode::identify_extra_middle_ return std::nullopt; } - int relaxed_flank_edist = - std::round(m_settings.relaxed_flank_err * - (m_settings.strand_end_flank - m_settings.strand_end_trim)); + int flank_edist = std::round(m_settings.flank_err * + (m_settings.strand_end_flank - m_settings.strand_end_trim)); spdlog::trace("Checking start/end match"); if (auto templ_start_match = check_rc_match( read.seq, {r_l - m_settings.strand_end_flank, r_l - m_settings.strand_end_trim}, - {0, std::min(r_l, ext_start_flank)}, relaxed_flank_edist)) { + {0, std::min(r_l, ext_start_flank)}, flank_edist)) { //check if matched region and query overlap if (templ_start_match->second + m_settings.strand_end_flank > r_l) { return std::nullopt; @@ -374,7 +373,7 @@ std::optional DuplexSplitNode::identify_extra_middle_ spdlog::trace("Checking approx middle match"); if (auto middle_match_ranges = check_flank_match(read, {est_middle - split_margin, est_middle + split_margin}, - m_settings.relaxed_flank_err)) { + m_settings.flank_err)) { est_middle = (middle_match_ranges->first.second + middle_match_ranges->second.first) / 2; spdlog::trace("Middle re-estimate {}", est_middle); diff --git a/dorado/read_pipeline/DuplexSplitNode.h b/dorado/read_pipeline/DuplexSplitNode.h index 7492d4cb3..ca2b81a99 100644 --- a/dorado/read_pipeline/DuplexSplitNode.h +++ b/dorado/read_pipeline/DuplexSplitNode.h @@ -26,7 +26,7 @@ struct DuplexSplitSettings { uint64_t strand_start_flank = 1700; //minimal query size to consider in "short read" case uint64_t min_flank = 300; - float flank_err = 0.22; + float flank_err = 0.15; float relaxed_flank_err = 0.275; int adapter_edist = 4; int relaxed_adapter_edist = 8; From 22d45ec10a2a7d19e7914214c9f200e12f124abd Mon Sep 17 00:00:00 2001 From: Sergey Nurk Date: Mon, 3 Jul 2023 13:53:49 +0000 Subject: [PATCH 21/21] remove 'static' from compile-time constants --- dorado/read_pipeline/DuplexSplitNode.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dorado/read_pipeline/DuplexSplitNode.cpp b/dorado/read_pipeline/DuplexSplitNode.cpp index 08da4d3b9..3d1f08485 100644 --- a/dorado/read_pipeline/DuplexSplitNode.cpp +++ b/dorado/read_pipeline/DuplexSplitNode.cpp @@ -343,7 +343,7 @@ std::optional DuplexSplitNode::identify_extra_middle_ const Read& read) const { const uint64_t r_l = read.seq.size(); //TODO parameterize - static const float ext_start_frac = 0.1; + const float ext_start_frac = 0.1; //extend to tolerate some extra length difference const uint64_t ext_start_flank = std::max(uint64_t(ext_start_frac * r_l), m_settings.strand_start_flank); @@ -366,8 +366,8 @@ std::optional DuplexSplitNode::identify_extra_middle_ uint64_t est_middle = (templ_start_match->second + (r_l - m_settings.strand_end_flank)) / 2; spdlog::trace("Middle estimate {}", est_middle); //TODO parameterize - static const int min_split_margin = 100; - static const float split_margin_frac = 0.05; + const int min_split_margin = 100; + const float split_margin_frac = 0.05; const auto split_margin = std::max(min_split_margin, int(split_margin_frac * r_l)); spdlog::trace("Checking approx middle match");