Skip to content

Commit

Permalink
Merge branch 'DOR-649_output_path' into 'master'
Browse files Browse the repository at this point in the history
DOR-649 add --output-dir arg to basecaller and duplex

Closes DOR-649

See merge request machine-learning/dorado!1062
  • Loading branch information
hpendry-ont committed Jun 13, 2024
2 parents 7d22ac8 + e7229a9 commit df614ab
Show file tree
Hide file tree
Showing 9 changed files with 296 additions and 103 deletions.
2 changes: 2 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -357,6 +357,8 @@ if(NOT DORADO_DISABLE_DORADO)
add_executable(dorado
dorado/main.cpp
dorado/cli/aligner.cpp
dorado/cli/basecall_output_args.cpp
dorado/cli/basecall_output_args.h
dorado/cli/demux.cpp
dorado/cli/duplex.cpp
dorado/cli/trim.cpp
Expand Down
165 changes: 165 additions & 0 deletions dorado/cli/basecall_output_args.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
#include "basecall_output_args.h"

#include "utils/tty_utils.h"

#include <ctime>
#include <filesystem>
#include <mutex>
#include <optional>
#include <sstream>

using OutputMode = dorado::utils::HtsFile::OutputMode;
namespace fs = std::filesystem;

namespace dorado::cli {

namespace {

constexpr std::string_view OUTPUT_DIR_ARG{"--output-dir"};
constexpr std::string_view EMIT_FASTQ_ARG{"--emit-fastq"};
constexpr std::string_view EMIT_SAM_ARG{"--emit-sam"};

constexpr std::string_view OUTPUT_FILE_PREFIX{"calls_"};
constexpr std::string_view FASTQ_EXT{".fastq"};
constexpr std::string_view SAM_EXT{".sam"};
constexpr std::string_view BAM_EXT{".bam"};

bool try_create_output_folder(const std::string& output_folder) {
std::error_code creation_error;
// N.B. No error code if folder already exists.
fs::create_directories(fs::path{output_folder}, creation_error);
if (creation_error) {
spdlog::error("Unable to create output folder {}. ErrorCode({}) {}", output_folder,
creation_error.value(), creation_error.message());
return false;
}
return true;
}

std::tm get_gmtime(const std::time_t* time) {
// gmtime is not threadsafe, so lock.
static std::mutex gmtime_mutex;
std::lock_guard lock(gmtime_mutex);
std::tm* time_buffer = gmtime(time);
return *time_buffer;
}

class HtsFileCreator {
const bool m_emit_fastq;
const bool m_emit_sam;
const bool m_reference_requested;
std::optional<std::string> m_output_dir;

std::string m_output_file{};
OutputMode m_output_mode{OutputMode::BAM};

bool is_output_to_file() const { return m_output_file != "-"; }

bool is_sam_or_bam_output() const {
return m_output_mode == OutputMode::BAM || m_output_mode == OutputMode::SAM;
}

std::string_view get_output_file_extension() {
if (m_emit_fastq) {
return FASTQ_EXT;
}
if (m_emit_sam) {
return SAM_EXT;
}
return BAM_EXT;
}

std::string get_output_filename() {
time_t time_now = time(nullptr);
std::tm gm_time_now = get_gmtime(&time_now);
char timestamp_buffer[32];
strftime(timestamp_buffer, 32, "%F_T%H-%M-%S", &gm_time_now);

std::ostringstream oss{};
oss << OUTPUT_FILE_PREFIX << timestamp_buffer << get_output_file_extension();
return oss.str();
}

bool try_set_output_file() {
if (!m_output_dir) {
m_output_file = "-";
return true;
}
if (!try_create_output_folder(*m_output_dir)) {
return false;
}

m_output_file = (fs::path(*m_output_dir) / get_output_filename()).string();
return true;
}

bool try_set_output_mode() {
if (m_emit_fastq) {
if (m_emit_sam) {
spdlog::error("Only one of --emit-{fastq, sam} can be set (or none).");
return false;
}
if (m_reference_requested) {
spdlog::error(
"--emit-fastq cannot be used with --reference as FASTQ cannot store "
"alignment results.");
return false;
}
spdlog::info(
" - Note: FASTQ output is not recommended as not all data can be preserved.");
m_output_mode = OutputMode::FASTQ;
} else if (m_emit_sam || (m_output_file == "-" && utils::is_fd_tty(stdout))) {
m_output_mode = OutputMode::SAM;
} else if (m_output_file == "-" && utils::is_fd_pipe(stdout)) {
m_output_mode = OutputMode::UBAM;
} else {
m_output_mode = OutputMode::BAM;
}

return true;
}

public:
HtsFileCreator(const utils::arg_parse::ArgParser& parser)
: m_emit_fastq(parser.visible.get<bool>(EMIT_FASTQ_ARG)),
m_emit_sam(parser.visible.get<bool>(EMIT_SAM_ARG)),
m_reference_requested(!parser.visible.get<std::string>("--reference").empty()),
m_output_dir(parser.visible.present<std::string>(OUTPUT_DIR_ARG)) {}

std::unique_ptr<utils::HtsFile> create() {
if (!try_set_output_file()) {
return nullptr;
}
if (!try_set_output_mode()) {
return nullptr;
}

// If writing to a SAM/BAM file and there's a reference then we will sort.
auto sort_bam = is_output_to_file() && is_sam_or_bam_output() && m_reference_requested;

return std::make_unique<utils::HtsFile>(m_output_file, m_output_mode, 0, sort_bam);
}
};

} // namespace

std::unique_ptr<utils::HtsFile> extract_hts_file(const utils::arg_parse::ArgParser& parser) {
HtsFileCreator hts_file_creator(parser);
return hts_file_creator.create();
}

void add_basecaller_output_arguments(utils::arg_parse::ArgParser& parser) {
parser.visible.add_argument(EMIT_FASTQ_ARG)
.help("Output in fastq format.")
.default_value(false)
.implicit_value(true);
parser.visible.add_argument(EMIT_SAM_ARG)
.help("Output in SAM format.")
.default_value(false)
.implicit_value(true);
parser.visible.add_argument("-o", OUTPUT_DIR_ARG)
.help("Optional output folder, if specified output will be written to a calls file "
"(calls_<timestamp>.sam|.bam|.fastq) in the given folder.");
}

} // namespace dorado::cli
14 changes: 14 additions & 0 deletions dorado/cli/basecall_output_args.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#pragma once

#include "utils/arg_parse_ext.h"
#include "utils/hts_file.h"

#include <memory>

namespace dorado::cli {

void add_basecaller_output_arguments(utils::arg_parse::ArgParser& parser);

std::unique_ptr<utils::HtsFile> extract_hts_file(const utils::arg_parse::ArgParser& parser);

} // namespace dorado::cli
66 changes: 21 additions & 45 deletions dorado/cli/basecaller.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#include "api/pipeline_creation.h"
#include "api/runner_creation.h"
#include "basecall/CRFModelConfig.h"
#include "basecall_output_args.h"
#include "cli/cli_utils.h"
#include "data_loader/DataLoader.h"
#include "demux/adapter_info.h"
Expand Down Expand Up @@ -59,6 +60,12 @@
#include <sstream>
#include <thread>

using dorado::utils::default_parameters;
using OutputMode = dorado::utils::HtsFile::OutputMode;
using namespace std::chrono_literals;
using namespace dorado::models;
namespace fs = std::filesystem;

namespace dorado {

namespace {
Expand Down Expand Up @@ -113,12 +120,6 @@ void set_basecaller_params(const argparse::ArgumentParser& arg,

} // namespace

using dorado::utils::default_parameters;
using OutputMode = dorado::utils::HtsFile::OutputMode;
using namespace std::chrono_literals;
using namespace dorado::models;
namespace fs = std::filesystem;

void set_dorado_basecaller_args(utils::arg_parse::ArgParser& parser, int& verbosity) {
parser.visible.add_argument("model").help(
"model selection {fast,hac,sup}@v{version} for automatic model selection including "
Expand Down Expand Up @@ -200,15 +201,6 @@ void set_dorado_basecaller_args(utils::arg_parse::ArgParser& parser, int& verbos
.help("the minimum predicted methylation probability for a modified base to be emitted "
"in an all-context model, [0, 1]");

parser.visible.add_argument("--emit-fastq")
.help("Output in fastq format.")
.default_value(false)
.implicit_value(true);
parser.visible.add_argument("--emit-sam")
.help("Output in SAM format.")
.default_value(false)
.implicit_value(true);

parser.visible.add_argument("--emit-moves").default_value(false).implicit_value(true);

parser.visible.add_argument("--reference")
Expand Down Expand Up @@ -260,6 +252,7 @@ void set_dorado_basecaller_args(utils::arg_parse::ArgParser& parser, int& verbos
.help("Configuration file for PolyA estimation to change default behaviours")
.default_value(std::string(""));

cli::add_basecaller_output_arguments(parser);
cli::add_internal_arguments(parser);
}

Expand All @@ -274,7 +267,7 @@ void setup(const std::vector<std::string>& args,
size_t remora_batch_size,
size_t num_remora_threads,
float methylation_threshold_pct,
OutputMode output_mode,
std::unique_ptr<utils::HtsFile> hts_file,
bool emit_moves,
size_t max_reads,
size_t min_qscore,
Expand Down Expand Up @@ -416,14 +409,14 @@ void setup(const std::vector<std::string>& args,
utils::add_rg_headers(hdr.get(), read_groups);
}

utils::HtsFile hts_file("-", output_mode, thread_allocations.writer_threads, false);
hts_file->set_num_threads(thread_allocations.writer_threads);

PipelineDescriptor pipeline_desc;
std::string gpu_names{};
#if DORADO_CUDA_BUILD
gpu_names = utils::get_cuda_gpu_names(device);
#endif
auto hts_writer = pipeline_desc.add_node<HtsWriter>({}, hts_file, gpu_names);
auto hts_writer = pipeline_desc.add_node<HtsWriter>({}, *hts_file, gpu_names);
auto aligner = PipelineDescriptor::InvalidNodeHandle;
auto current_sink_node = hts_writer;
if (enable_aligner) {
Expand Down Expand Up @@ -494,7 +487,7 @@ void setup(const std::vector<std::string>& args,
const auto& aligner_ref = dynamic_cast<AlignerNode&>(pipeline->get_node_ref(aligner));
utils::add_sq_hdr(hdr.get(), aligner_ref.get_sequence_records_for_header());
}
hts_file.set_header(hdr.get());
hts_file->set_header(hdr.get());

std::unordered_set<std::string> reads_already_processed;
if (!resume_from_file.empty()) {
Expand Down Expand Up @@ -542,7 +535,7 @@ void setup(const std::vector<std::string>& args,
}

// If we're doing alignment, post-processing takes longer due to bam file sorting.
float post_processing_percentage = (hts_file.finalise_is_noop() || ref.empty()) ? 0.0f : 0.5f;
float post_processing_percentage = (hts_file->finalise_is_noop() || ref.empty()) ? 0.0f : 0.5f;

ProgressTracker tracker(int(num_reads), false, post_processing_percentage);
tracker.set_description("Basecalling");
Expand Down Expand Up @@ -576,7 +569,7 @@ void setup(const std::vector<std::string>& args,

// Report progress during output file finalisation.
tracker.set_description("Sorting output files");
hts_file.finalise([&](size_t progress) {
hts_file->finalise([&](size_t progress) {
tracker.update_post_processing_progress(static_cast<float>(progress));
});

Expand Down Expand Up @@ -635,41 +628,24 @@ int basecaller(int argc, char* argv[]) {
return EXIT_FAILURE;
}

auto output_mode = OutputMode::BAM;

auto emit_fastq = parser.visible.get<bool>("--emit-fastq");
auto emit_sam = parser.visible.get<bool>("--emit-sam");

if (emit_fastq && emit_sam) {
spdlog::error("Only one of --emit-{fastq, sam} can be set (or none).");
return EXIT_FAILURE;
}

if (parser.visible.get<std::string>("--reference").empty() &&
!parser.visible.get<std::string>("--bed-file").empty()) {
spdlog::error("--bed-file cannot be used without --reference.");
return EXIT_FAILURE;
}

if (emit_fastq) {
auto hts_file = cli::extract_hts_file(parser);
if (!hts_file) {
return EXIT_FAILURE;
}

if (hts_file->get_output_mode() == OutputMode::FASTQ) {
if (model_selection.has_mods_variant() || !mod_bases.empty() || !mod_bases_models.empty()) {
spdlog::error(
"--emit-fastq cannot be used with modbase models as FASTQ cannot store modbase "
"results.");
return EXIT_FAILURE;
}
if (!parser.visible.get<std::string>("--reference").empty()) {
spdlog::error(
"--emit-fastq cannot be used with --reference as FASTQ cannot store alignment "
"results.");
return EXIT_FAILURE;
}
spdlog::info(" - Note: FASTQ output is not recommended as not all data can be preserved.");
output_mode = OutputMode::FASTQ;
} else if (emit_sam || utils::is_fd_tty(stdout)) {
output_mode = OutputMode::SAM;
} else if (utils::is_fd_pipe(stdout)) {
output_mode = OutputMode::UBAM;
}

bool no_trim_barcodes = false, no_trim_primers = false, no_trim_adapters = false;
Expand Down Expand Up @@ -781,7 +757,7 @@ int basecaller(int argc, char* argv[]) {
parser.visible.get<std::string>("--reference"),
parser.visible.get<std::string>("--bed-file"), default_parameters.num_runners,
default_parameters.remora_batchsize, default_parameters.remora_threads,
methylation_threshold, output_mode, parser.visible.get<bool>("--emit-moves"),
methylation_threshold, std::move(hts_file), parser.visible.get<bool>("--emit-moves"),
parser.visible.get<int>("--max-reads"), parser.visible.get<int>("--min-qscore"),
parser.visible.get<std::string>("--read-ids"), recursive, *minimap_options,
parser.hidden.get<bool>("--skip-model-compatibility-check"),
Expand Down
4 changes: 1 addition & 3 deletions dorado/cli/cli_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,6 @@
#include "utils/arg_parse_ext.h"
#include "utils/bam_utils.h"

#include <optional>
#include <stdexcept>

#ifdef _WIN32
// Unreachable code warnings are emitted from argparse, even though they should be disabled by the
// MSVC /external:W0 setting. This is a limitation of /external: for some C47XX backend warnings. See:
Expand All @@ -34,6 +31,7 @@
#include <iostream>
#include <optional>
#include <sstream>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>
Expand Down
Loading

0 comments on commit df614ab

Please sign in to comment.