Skip to content

Commit

Permalink
Merge branch 'cherry-pick-42480b4d' into 'release-v0.7'
Browse files Browse the repository at this point in the history
Cherry-pick DOR-597 into 'release-0.7'

See merge request machine-learning/dorado!1035
  • Loading branch information
tijyojwad committed May 29, 2024
2 parents dcbbbb1 + d711868 commit 33578e7
Show file tree
Hide file tree
Showing 9 changed files with 53 additions and 34 deletions.
5 changes: 3 additions & 2 deletions dorado/cli/basecaller.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,7 @@ void setup(const std::vector<std::string>& args,
barcoding_info != nullptr, adapter_trimming_enabled);

SamHdrPtr hdr(sam_hdr_init());
cli::add_pg_hdr(hdr.get(), args, device);
cli::add_pg_hdr(hdr.get(), "basecaller", args, device);

if (barcoding_info) {
std::unordered_map<std::string, std::string> custom_barcodes{};
Expand Down Expand Up @@ -295,7 +295,8 @@ void setup(const std::vector<std::string>& args,
// Turn off warning logging as header info is fetched.
auto initial_hts_log_level = hts_get_log_level();
hts_set_log_level(HTS_LOG_OFF);
auto pg_keys = utils::extract_pg_keys_from_hdr(resume_from_file, {"CL"});
auto pg_keys =
utils::extract_pg_keys_from_hdr(resume_from_file, {"CL"}, "ID", "basecaller");
hts_set_log_level(initial_hts_log_level);

auto tokens = cli::extract_token_from_cli(pg_keys["CL"]);
Expand Down
3 changes: 2 additions & 1 deletion dorado/cli/cli_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -65,10 +65,11 @@ inline std::pair<int, int> worker_vs_writer_thread_allocation(int available_thre
}

inline void add_pg_hdr(sam_hdr_t* hdr,
const std::string& pg_id,
const std::vector<std::string>& args,
const std::string& device) {
utils::add_hd_header_line(hdr);
auto safe_id = sam_hdr_pg_id(hdr, "basecaller");
auto safe_id = sam_hdr_pg_id(hdr, pg_id.c_str());

std::stringstream pg;
pg << "@PG\tID:" << safe_id << "\tPN:dorado\tVN:" << DORADO_VERSION << "\tCL:dorado";
Expand Down
7 changes: 2 additions & 5 deletions dorado/cli/demux.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,6 @@ using namespace std::chrono_literals;

namespace {

void add_pg_hdr(sam_hdr_t* hdr) {
sam_hdr_add_pg(hdr, "demux", "PN", "dorado", "VN", DORADO_VERSION, nullptr);
}

// This function allows us to map the reference id from input BAM records to what
// they should be in the output file, based on the new ordering of references in
// the merged header.
Expand Down Expand Up @@ -198,6 +194,7 @@ int demuxer(int argc, char* argv[]) {
auto threads(parser.visible.get<int>("threads"));
auto max_reads(parser.visible.get<int>("max-reads"));
auto strip_alignment = !no_trim;
std::vector<std::string> args(argv, argv + argc);

alignment::AlignmentProcessingItems processing_items{reads, recursive_input, output_dir, true};
if (!processing_items.initialise()) {
Expand Down Expand Up @@ -243,7 +240,7 @@ int demuxer(int argc, char* argv[]) {
hdr_merger.finalize_merge();
auto sq_mapping = hdr_merger.get_sq_mapping();
auto header = SamHdrPtr(sam_hdr_dup(hdr_merger.get_merged_header()));
add_pg_hdr(header.get());
cli::add_pg_hdr(header.get(), "demux", args, "cpu");

auto barcode_sample_sheet = parser.visible.get<std::string>("--sample-sheet");
std::unique_ptr<const utils::SampleSheet> sample_sheet;
Expand Down
2 changes: 1 addition & 1 deletion dorado/cli/duplex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -411,7 +411,7 @@ int duplex(int argc, char* argv[]) {
spdlog::debug("> Reads to process: {}", num_reads);

SamHdrPtr hdr(sam_hdr_init());
cli::add_pg_hdr(hdr.get(), args, device);
cli::add_pg_hdr(hdr.get(), "duplex", args, device);

constexpr int WRITER_THREADS = 4;
utils::HtsFile hts_file("-", output_mode, WRITER_THREADS, false);
Expand Down
12 changes: 2 additions & 10 deletions dorado/cli/trim.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,14 +30,6 @@ namespace dorado {

using OutputMode = dorado::utils::HtsFile::OutputMode;

namespace {

void add_pg_hdr(sam_hdr_t* hdr) {
sam_hdr_add_pg(hdr, "trim", "PN", "dorado", "VN", DORADO_VERSION, NULL);
}

} // anonymous namespace

int trim(int argc, char* argv[]) {
argparse::ArgumentParser parser("dorado", DORADO_VERSION, argparse::default_arguments::help);
parser.add_description("Adapter/primer trimming tool.");
Expand Down Expand Up @@ -91,6 +83,7 @@ int trim(int argc, char* argv[]) {
auto reads(parser.get<std::vector<std::string>>("reads"));
auto threads(parser.get<int>("threads"));
auto max_reads(parser.get<int>("max-reads"));
std::vector<std::string> args(argv, argv + argc);

threads = threads == 0 ? std::thread::hardware_concurrency() : threads;
// The input thread is the total number of threads to use for dorado
Expand Down Expand Up @@ -118,8 +111,7 @@ int trim(int argc, char* argv[]) {

HtsReader reader(reads[0], read_list);
auto header = SamHdrPtr(sam_hdr_dup(reader.header));
utils::add_hd_header_line(header.get());
add_pg_hdr(header.get());
cli::add_pg_hdr(header.get(), "trim", args, "cpu");
// Always remove alignment information from input header
// because at minimum the adapters are trimmed, which
// invalidates the alignment record.
Expand Down
2 changes: 1 addition & 1 deletion dorado/read_pipeline/BarcodeClassifierNode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ void BarcodeClassifierNode::barcode(BamPtr& read, const demux::BarcodingInfo* ba
barcoding_info->allowed_barcodes);
auto bc = generate_barcode_string(bc_res);
spdlog::trace("Barcode for {} is {}", bam_get_qname(irecord), bc);
bam_aux_append(irecord, "BC", 'Z', int(bc.length() + 1), (uint8_t*)bc.c_str());
bam_aux_update_str(irecord, "BC", int(bc.length() + 1), bc.c_str());
m_num_records++;
{
std::lock_guard lock(m_barcode_count_mutex);
Expand Down
30 changes: 19 additions & 11 deletions dorado/utils/bam_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -241,29 +241,37 @@ AlignmentOps get_alignment_op_counts(bam1_t* record) {
}

std::map<std::string, std::string> extract_pg_keys_from_hdr(const std::string& filename,
const std::vector<std::string>& keys) {
const std::vector<std::string>& keys,
const char* ID_key,
const char* ID_val) {
std::map<std::string, std::string> pg_keys;
auto file = hts_open(filename.c_str(), "r");
auto file = HtsFilePtr(hts_open(filename.c_str(), "r"));
if (!file) {
throw std::runtime_error("Could not open file: " + filename);
}
SamHdrPtr header(sam_hdr_read(file));
SamHdrPtr header(sam_hdr_read(file.get()));
if (!header) {
throw std::runtime_error("Could not open header from file: " + filename);
}
return extract_pg_keys_from_hdr(header.get(), keys, ID_key, ID_val);
}

std::map<std::string, std::string> extract_pg_keys_from_hdr(sam_hdr_t* header,
const std::vector<std::string>& keys,
const char* ID_key,
const char* ID_val) {
std::map<std::string, std::string> pg_keys;
if (!header) {
throw std::runtime_error("Provided header cannot be a nullptr.");
}
KString val_wrapper(1000000);
auto val = val_wrapper.get();
for (auto& k : keys) {
auto ret = sam_hdr_find_tag_id(header.get(), "PG", NULL, NULL, k.c_str(), &val);
if (ret != 0) {
throw std::runtime_error(std::string("Required key ")
.append(k)
.append(" not found in header of ")
.append(filename));
auto ret = sam_hdr_find_tag_id(header, "PG", ID_key, ID_val, k.c_str(), &val);
if (ret == 0) {
pg_keys[k] = std::string(val.s);
}
pg_keys[k] = std::string(val.s);
}
hts_close(file);
return pg_keys;
}

Expand Down
23 changes: 21 additions & 2 deletions dorado/utils/bam_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -83,12 +83,31 @@ AlignmentOps get_alignment_op_counts(bam1_t* record);
* Extract keys for PG header from BAM header.
*
* @param filepath Path to input BAM file.
* @params keys Vector of keys to parse
* @param keys Vector of keys to parse
* @param ID_key Tag defining the line. E.g. "SN". Can be a nullptr.
* @param ID_val Tag value associated with the key above. If key is null, value can be null.
* @return Map of keys to their values
* @throws An error if a key is requested that doesn't exist.
*/
std::map<std::string, std::string> extract_pg_keys_from_hdr(sam_hdr_t* header,
const std::vector<std::string>& keys,
const char* ID_key,
const char* ID_val);

/**
* Extract keys for PG header from a BAM file.
*
* @param filepath Path to input BAM file.
* @param keys Vector of keys to parse
* @param ID_key Tag defining the line. E.g. "SN". Can be a nullptr.
* @param ID_val Tag value associated with the key above. If key is null, value can be null.
* @return Map of keys to their values
* @throws An error if a key is requested that doesn't exist.
*/
std::map<std::string, std::string> extract_pg_keys_from_hdr(const std::string& filename,
const std::vector<std::string>& keys);
const std::vector<std::string>& keys,
const char* ID_key,
const char* ID_val);

/*
* Extract the sequence string.
Expand Down
3 changes: 2 additions & 1 deletion tests/BamUtilsTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@ TEST_CASE("BamUtilsTest: fetch keys from PG header", TEST_GROUP) {
fs::path aligner_test_dir = fs::path(get_data_dir("aligner_test"));
auto sam = aligner_test_dir / "basecall.sam";

auto keys = utils::extract_pg_keys_from_hdr(sam.string(), {"PN", "CL", "VN"});
auto keys =
utils::extract_pg_keys_from_hdr(sam.string(), {"PN", "CL", "VN"}, "ID", "basecaller");
CHECK(keys["PN"] == "dorado");
CHECK(keys["VN"] == "0.5.0+5fa4de73+dirty");
CHECK(keys["CL"] ==
Expand Down

0 comments on commit 33578e7

Please sign in to comment.