From d7118688725e14b7446df4047910f76afaf79a8e Mon Sep 17 00:00:00 2001 From: Stephen Malton Date: Fri, 24 May 2024 15:56:59 +0000 Subject: [PATCH] Merge branch 'smalton/DOR-597-BC-tag-update' into 'master' [DOR-597] Update BC tag instead of adding a new one Closes DOR-597 See merge request machine-learning/dorado!1026 (cherry picked from commit 42480b4d0bc895150dda1cde53bcc7c6022f68df) 5c0b7132 [DOR-597] Update BC tag instead of adding a new one 60309135 Address review commments --- dorado/cli/basecaller.cpp | 5 ++-- dorado/cli/cli_utils.h | 3 +- dorado/cli/demux.cpp | 7 ++--- dorado/cli/duplex.cpp | 2 +- dorado/cli/trim.cpp | 12 ++------ .../read_pipeline/BarcodeClassifierNode.cpp | 2 +- dorado/utils/bam_utils.cpp | 30 ++++++++++++------- dorado/utils/bam_utils.h | 23 ++++++++++++-- tests/BamUtilsTest.cpp | 3 +- 9 files changed, 53 insertions(+), 34 deletions(-) diff --git a/dorado/cli/basecaller.cpp b/dorado/cli/basecaller.cpp index d22894ce0..79284fb77 100644 --- a/dorado/cli/basecaller.cpp +++ b/dorado/cli/basecaller.cpp @@ -195,7 +195,7 @@ void setup(const std::vector& args, barcoding_info != nullptr, adapter_trimming_enabled); SamHdrPtr hdr(sam_hdr_init()); - cli::add_pg_hdr(hdr.get(), args, device); + cli::add_pg_hdr(hdr.get(), "basecaller", args, device); if (barcoding_info) { std::unordered_map custom_barcodes{}; @@ -295,7 +295,8 @@ void setup(const std::vector& args, // Turn off warning logging as header info is fetched. auto initial_hts_log_level = hts_get_log_level(); hts_set_log_level(HTS_LOG_OFF); - auto pg_keys = utils::extract_pg_keys_from_hdr(resume_from_file, {"CL"}); + auto pg_keys = + utils::extract_pg_keys_from_hdr(resume_from_file, {"CL"}, "ID", "basecaller"); hts_set_log_level(initial_hts_log_level); auto tokens = cli::extract_token_from_cli(pg_keys["CL"]); diff --git a/dorado/cli/cli_utils.h b/dorado/cli/cli_utils.h index 9f8084193..5230ca7b7 100644 --- a/dorado/cli/cli_utils.h +++ b/dorado/cli/cli_utils.h @@ -65,10 +65,11 @@ inline std::pair worker_vs_writer_thread_allocation(int available_thre } inline void add_pg_hdr(sam_hdr_t* hdr, + const std::string& pg_id, const std::vector& args, const std::string& device) { utils::add_hd_header_line(hdr); - auto safe_id = sam_hdr_pg_id(hdr, "basecaller"); + auto safe_id = sam_hdr_pg_id(hdr, pg_id.c_str()); std::stringstream pg; pg << "@PG\tID:" << safe_id << "\tPN:dorado\tVN:" << DORADO_VERSION << "\tCL:dorado"; diff --git a/dorado/cli/demux.cpp b/dorado/cli/demux.cpp index 0ed0a7c24..45c12ba89 100644 --- a/dorado/cli/demux.cpp +++ b/dorado/cli/demux.cpp @@ -33,10 +33,6 @@ using namespace std::chrono_literals; namespace { -void add_pg_hdr(sam_hdr_t* hdr) { - sam_hdr_add_pg(hdr, "demux", "PN", "dorado", "VN", DORADO_VERSION, nullptr); -} - // This function allows us to map the reference id from input BAM records to what // they should be in the output file, based on the new ordering of references in // the merged header. @@ -198,6 +194,7 @@ int demuxer(int argc, char* argv[]) { auto threads(parser.visible.get("threads")); auto max_reads(parser.visible.get("max-reads")); auto strip_alignment = !no_trim; + std::vector args(argv, argv + argc); alignment::AlignmentProcessingItems processing_items{reads, recursive_input, output_dir, true}; if (!processing_items.initialise()) { @@ -243,7 +240,7 @@ int demuxer(int argc, char* argv[]) { hdr_merger.finalize_merge(); auto sq_mapping = hdr_merger.get_sq_mapping(); auto header = SamHdrPtr(sam_hdr_dup(hdr_merger.get_merged_header())); - add_pg_hdr(header.get()); + cli::add_pg_hdr(header.get(), "demux", args, "cpu"); auto barcode_sample_sheet = parser.visible.get("--sample-sheet"); std::unique_ptr sample_sheet; diff --git a/dorado/cli/duplex.cpp b/dorado/cli/duplex.cpp index 37d85c60c..4646b9c98 100644 --- a/dorado/cli/duplex.cpp +++ b/dorado/cli/duplex.cpp @@ -411,7 +411,7 @@ int duplex(int argc, char* argv[]) { spdlog::debug("> Reads to process: {}", num_reads); SamHdrPtr hdr(sam_hdr_init()); - cli::add_pg_hdr(hdr.get(), args, device); + cli::add_pg_hdr(hdr.get(), "duplex", args, device); constexpr int WRITER_THREADS = 4; utils::HtsFile hts_file("-", output_mode, WRITER_THREADS, false); diff --git a/dorado/cli/trim.cpp b/dorado/cli/trim.cpp index 26841c8a0..54c97425f 100644 --- a/dorado/cli/trim.cpp +++ b/dorado/cli/trim.cpp @@ -30,14 +30,6 @@ namespace dorado { using OutputMode = dorado::utils::HtsFile::OutputMode; -namespace { - -void add_pg_hdr(sam_hdr_t* hdr) { - sam_hdr_add_pg(hdr, "trim", "PN", "dorado", "VN", DORADO_VERSION, NULL); -} - -} // anonymous namespace - int trim(int argc, char* argv[]) { argparse::ArgumentParser parser("dorado", DORADO_VERSION, argparse::default_arguments::help); parser.add_description("Adapter/primer trimming tool."); @@ -91,6 +83,7 @@ int trim(int argc, char* argv[]) { auto reads(parser.get>("reads")); auto threads(parser.get("threads")); auto max_reads(parser.get("max-reads")); + std::vector args(argv, argv + argc); threads = threads == 0 ? std::thread::hardware_concurrency() : threads; // The input thread is the total number of threads to use for dorado @@ -118,8 +111,7 @@ int trim(int argc, char* argv[]) { HtsReader reader(reads[0], read_list); auto header = SamHdrPtr(sam_hdr_dup(reader.header)); - utils::add_hd_header_line(header.get()); - add_pg_hdr(header.get()); + cli::add_pg_hdr(header.get(), "trim", args, "cpu"); // Always remove alignment information from input header // because at minimum the adapters are trimmed, which // invalidates the alignment record. diff --git a/dorado/read_pipeline/BarcodeClassifierNode.cpp b/dorado/read_pipeline/BarcodeClassifierNode.cpp index 33572f0d7..4c925e3aa 100644 --- a/dorado/read_pipeline/BarcodeClassifierNode.cpp +++ b/dorado/read_pipeline/BarcodeClassifierNode.cpp @@ -92,7 +92,7 @@ void BarcodeClassifierNode::barcode(BamPtr& read, const demux::BarcodingInfo* ba barcoding_info->allowed_barcodes); auto bc = generate_barcode_string(bc_res); spdlog::trace("Barcode for {} is {}", bam_get_qname(irecord), bc); - bam_aux_append(irecord, "BC", 'Z', int(bc.length() + 1), (uint8_t*)bc.c_str()); + bam_aux_update_str(irecord, "BC", int(bc.length() + 1), bc.c_str()); m_num_records++; { std::lock_guard lock(m_barcode_count_mutex); diff --git a/dorado/utils/bam_utils.cpp b/dorado/utils/bam_utils.cpp index 7678400c8..63e0d0433 100644 --- a/dorado/utils/bam_utils.cpp +++ b/dorado/utils/bam_utils.cpp @@ -241,29 +241,37 @@ AlignmentOps get_alignment_op_counts(bam1_t* record) { } std::map extract_pg_keys_from_hdr(const std::string& filename, - const std::vector& keys) { + const std::vector& keys, + const char* ID_key, + const char* ID_val) { std::map pg_keys; - auto file = hts_open(filename.c_str(), "r"); + auto file = HtsFilePtr(hts_open(filename.c_str(), "r")); if (!file) { throw std::runtime_error("Could not open file: " + filename); } - SamHdrPtr header(sam_hdr_read(file)); + SamHdrPtr header(sam_hdr_read(file.get())); if (!header) { throw std::runtime_error("Could not open header from file: " + filename); } + return extract_pg_keys_from_hdr(header.get(), keys, ID_key, ID_val); +} + +std::map extract_pg_keys_from_hdr(sam_hdr_t* header, + const std::vector& keys, + const char* ID_key, + const char* ID_val) { + std::map pg_keys; + if (!header) { + throw std::runtime_error("Provided header cannot be a nullptr."); + } KString val_wrapper(1000000); auto val = val_wrapper.get(); for (auto& k : keys) { - auto ret = sam_hdr_find_tag_id(header.get(), "PG", NULL, NULL, k.c_str(), &val); - if (ret != 0) { - throw std::runtime_error(std::string("Required key ") - .append(k) - .append(" not found in header of ") - .append(filename)); + auto ret = sam_hdr_find_tag_id(header, "PG", ID_key, ID_val, k.c_str(), &val); + if (ret == 0) { + pg_keys[k] = std::string(val.s); } - pg_keys[k] = std::string(val.s); } - hts_close(file); return pg_keys; } diff --git a/dorado/utils/bam_utils.h b/dorado/utils/bam_utils.h index 455fc2bf8..195cefa61 100644 --- a/dorado/utils/bam_utils.h +++ b/dorado/utils/bam_utils.h @@ -83,12 +83,31 @@ AlignmentOps get_alignment_op_counts(bam1_t* record); * Extract keys for PG header from BAM header. * * @param filepath Path to input BAM file. - * @params keys Vector of keys to parse + * @param keys Vector of keys to parse + * @param ID_key Tag defining the line. E.g. "SN". Can be a nullptr. + * @param ID_val Tag value associated with the key above. If key is null, value can be null. + * @return Map of keys to their values + * @throws An error if a key is requested that doesn't exist. + */ +std::map extract_pg_keys_from_hdr(sam_hdr_t* header, + const std::vector& keys, + const char* ID_key, + const char* ID_val); + +/** + * Extract keys for PG header from a BAM file. + * + * @param filepath Path to input BAM file. + * @param keys Vector of keys to parse + * @param ID_key Tag defining the line. E.g. "SN". Can be a nullptr. + * @param ID_val Tag value associated with the key above. If key is null, value can be null. * @return Map of keys to their values * @throws An error if a key is requested that doesn't exist. */ std::map extract_pg_keys_from_hdr(const std::string& filename, - const std::vector& keys); + const std::vector& keys, + const char* ID_key, + const char* ID_val); /* * Extract the sequence string. diff --git a/tests/BamUtilsTest.cpp b/tests/BamUtilsTest.cpp index a7ce85d5c..dc985edb3 100644 --- a/tests/BamUtilsTest.cpp +++ b/tests/BamUtilsTest.cpp @@ -21,7 +21,8 @@ TEST_CASE("BamUtilsTest: fetch keys from PG header", TEST_GROUP) { fs::path aligner_test_dir = fs::path(get_data_dir("aligner_test")); auto sam = aligner_test_dir / "basecall.sam"; - auto keys = utils::extract_pg_keys_from_hdr(sam.string(), {"PN", "CL", "VN"}); + auto keys = + utils::extract_pg_keys_from_hdr(sam.string(), {"PN", "CL", "VN"}, "ID", "basecaller"); CHECK(keys["PN"] == "dorado"); CHECK(keys["VN"] == "0.5.0+5fa4de73+dirty"); CHECK(keys["CL"] ==