Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Misc improvements before index optimisation PR #339

Merged
merged 10 commits into from
Jul 18, 2023
1 change: 1 addition & 0 deletions include/compare_main.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ struct CompareOptions {
fs::path vcf_refs_file;
uint8_t verbosity { 0 };
float error_rate { 0.11 };
uint32_t rng_seed { 0 };
uint32_t genome_size { 5000000 };
uint32_t max_diff { 250 };
bool output_vcf { false };
Expand Down
2 changes: 1 addition & 1 deletion include/denovo_discovery/discover_main.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,11 @@ struct DiscoverOptions {
uint32_t threads { 1 };
uint8_t verbosity { 0 };
float error_rate { 0.11 };
uint32_t rng_seed { 0 };
uint32_t genome_size { 5000000 };
uint32_t max_diff { 250 };
bool output_kg { false };
bool illumina { false };
bool clean { false };
bool binomial { false };
bool do_not_auto_update_params { false };
uint32_t max_covg { 600 };
Expand Down
2 changes: 0 additions & 2 deletions include/kmergraphwithcoverage.h
Original file line number Diff line number Diff line change
Expand Up @@ -152,8 +152,6 @@ class KmerGraphWithCoverage {

void save_covg_dist(const std::string&);

std::vector<std::vector<KmerNodePtr>> get_random_paths(uint32_t);

float prob_path(const std::vector<KmerNodePtr>& kpath, const uint32_t& sample_id,
const std::string& prob_model);

Expand Down
12 changes: 11 additions & 1 deletion include/localPRG.h
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,17 @@ class LocalPRG {
std::vector<LocalNodePtr> find_alt_path(const std::vector<LocalNodePtr>&,
const uint32_t, const std::string&, const std::string&) const;

std::string random_path();
template<typename RNG>
std::string random_path(RNG &&rng) {
std::vector<LocalNodePtr> npath;
npath.push_back(prg.nodes.at(0));
while (not npath.back()->outNodes.empty()) {
uint32_t random_number = rng();
size_t random_neighbour = random_number % npath.back()->outNodes.size();
npath.push_back(npath.back()->outNodes[random_neighbour]);
}
return string_along_path(npath);
}

// TODO: I really feel like these methods are not responsability of a LocalPRG
// TODO: many of them should be in VCF class, or in the KmerGraphWithCoverage or
Expand Down
2 changes: 1 addition & 1 deletion include/map_main.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,12 +35,12 @@ struct MapOptions {
fs::path vcf_refs_file;
uint8_t verbosity { 0 };
float error_rate { 0.11 };
uint32_t rng_seed { 0 };
uint32_t genome_size { 5000000 };
uint32_t max_diff { 250 };
bool output_kg { false };
bool output_vcf { false };
bool illumina { false };
bool clean { false };
bool binomial { false };
bool do_not_auto_update_params { false };
uint32_t max_covg { 300 };
Expand Down
46 changes: 29 additions & 17 deletions include/minihit_clusters.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,26 +18,21 @@
* Querying can only happen after insertions are finalised.
*/
class MinimizerHitClusters {
private:
bool insertion_phase;
std::vector<MinimizerHits> clusters;
std::mt19937 rng; // TODO: make shuffling deterministic?

inline void check_if_can_insert() const {
if (!insertion_phase) {
fatal_error("Tried to insert a cluster in a MinimizerHitClusters when insertion phase was finished");
}
}

inline void check_if_can_query() const {
if (insertion_phase) {
fatal_error("Tried to query a MinimizerHitClusters when insertion phase was not finished");
public:
explicit MinimizerHitClusters(const uint32_t rng_seed) : insertion_phase(true), clusters(){
if (bool deterministic_run = rng_seed > 0; deterministic_run) {
rng = std::mt19937(rng_seed);
} else{
rng = std::mt19937(std::random_device()());
}
}
~MinimizerHitClusters() = default; // Note: this is not virtual as there is no need to

public:
MinimizerHitClusters() : insertion_phase(true), clusters(), rng(std::random_device()()){}
~MinimizerHitClusters() = default;
// enable copy/move
MinimizerHitClusters(const MinimizerHitClusters &) = default; // copy constructor
MinimizerHitClusters& operator=(const MinimizerHitClusters &) = default; // copy assignment operator
MinimizerHitClusters(MinimizerHitClusters &&) = default; // move constructor
MinimizerHitClusters& operator=(MinimizerHitClusters &&) = default; // move assignment operator

inline void insert(const MinimizerHits& cluster) {
check_if_can_insert();
Expand Down Expand Up @@ -81,6 +76,23 @@ class MinimizerHitClusters {
check_if_can_query();
return clusters.end();
}

private:
bool insertion_phase;
std::vector<MinimizerHits> clusters;
std::mt19937 rng;

inline void check_if_can_insert() const {
if (!insertion_phase) {
fatal_error("Tried to insert a cluster in a MinimizerHitClusters when insertion phase was finished");
}
}

inline void check_if_can_query() const {
if (insertion_phase) {
fatal_error("Tried to query a MinimizerHitClusters when insertion phase was not finished");
}
}
};

#endif // PANDORA_MINIHIT_CLUSTERS_H
30 changes: 8 additions & 22 deletions include/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,15 +49,6 @@ template <typename T> struct spointer_values_equal {
bool operator()(const std::shared_ptr<T> other) const { return *to_find == *other; }
};

// pointer less comparator, from
// https://stackoverflow.com/questions/41375232/is-there-an-stl-comparator-for-stdset-or-stdmap-with-shared-ptr-keys-that
struct ptr_less {
template <typename T> bool operator()(T lhs, T rhs) const
{
return std::less<decltype(*lhs)>()(*lhs, *rhs);
}
};

// utility functions
std::string now();

Expand Down Expand Up @@ -91,7 +82,8 @@ MinimizerHitClusters filter_clusters(
const Seq &seq,
const MinimizerHitClusters& clusters_of_hits,
const std::vector<std::string> &prg_names,
ClusterFilterFile& cluster_filter_file
ClusterFilterFile& cluster_filter_file,
const uint32_t rng_seed = 0
);

void add_clusters_to_pangraph(
Expand All @@ -106,19 +98,20 @@ MinimizerHitClusters get_minimizer_hit_clusters(
const std::vector<std::string> &prg_names,
std::shared_ptr<MinimizerHits> minimizer_hits,
std::shared_ptr<pangenome::Graph> pangraph, const int max_diff,
const uint32_t& genome_size, const float& fraction_kmers_required_for_cluster,
const float& fraction_kmers_required_for_cluster,
ClusterDefFile &cluster_def_file,
ClusterFilterFile &cluster_filter_file,
const uint32_t min_cluster_size,
const uint32_t expected_number_kmers_in_read_sketch = std::numeric_limits<uint32_t>::max());
const uint32_t expected_number_kmers_in_read_sketch,
const uint32_t rng_seed);

uint32_t pangraph_from_read_file(const SampleData& sample,
std::shared_ptr<pangenome::Graph> pangraph, Index &index,
const int max_diff, const float& e_rate,
const fs::path& sample_outdir, const uint32_t min_cluster_size = 10,
const uint32_t genome_size = 5000000, const bool illumina = false, const bool clean = false,
const uint32_t max_covg = 300, uint32_t threads = 1,
const bool keep_extra_debugging_files = false);
const uint32_t genome_size = 5000000, const uint32_t max_covg = 300,
leoisl marked this conversation as resolved.
Show resolved Hide resolved
uint32_t threads = 1, const bool keep_extra_debugging_files = false,
const uint32_t rng_seed = 0);

void infer_most_likely_prg_path_for_pannode(
const std::vector<std::shared_ptr<LocalPRG>>&, PanNode*, uint32_t, float);
Expand Down Expand Up @@ -148,12 +141,8 @@ std::vector<SampleData> load_read_index(const fs::path& read_index_fpath);
// Returns filepath
std::pair<int, std::string> build_memfd(const std::string &data);

std::string exec(const char* cmd);

void build_file(const std::string &filepath, const std::string &data);

bool tool_exists(const std::string &command);

void concatenate_text_files(
const fs::path& output_filename, const std::vector<fs::path>& input_filenames,
const std::string &prepend="");
Expand All @@ -164,9 +153,6 @@ inline void to_upper(std::string &str) {
std::transform(str.begin(), str.end(), str.begin(), ::toupper);
}

int random_int();
std::pair<std::vector<std::string>, std::vector<size_t>> split_ambiguous(const std::string& input_string, uint8_t delim = 4);

uintmax_t get_number_of_bytes_in_file(const fs::path &file);

#endif
20 changes: 13 additions & 7 deletions src/compare_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,11 +79,6 @@ void setup_compare_subcommand(CLI::App& app)
"reads")
->group("Preset");

compare_subcmd
->add_flag(
"--clean", opt->clean, "Add a step to clean and detangle the pangraph")
->group("Filtering");

compare_subcmd
->add_flag("--bin", opt->binomial,
"Use binomial model for kmer coverages [default: negative binomial]")
Expand Down Expand Up @@ -217,6 +212,17 @@ void setup_compare_subcommand(CLI::App& app)
"create thousands of files.")
->group("Debugging");

compare_subcmd
->add_option(
"-r,--rng-seed", opt->rng_seed, "RNG seed, an int>0 to force deterministic "
"mapping when multiple optimal mappings are "
"possible. To be avoided except in "
"debugging/investigation scenarios. A value "
"of 0 will be interpreted as no seed given "
"and mapping will not be deterministic.")
->capture_default_str()
->group("Debugging");

compare_subcmd->add_flag(
"-v", opt->verbosity, "Verbosity of logging. Repeat for increased verbosity");

Expand Down Expand Up @@ -287,8 +293,8 @@ int pandora_compare(CompareOptions& opt)
<< sample_fpath << " (this will take a while)";
uint32_t covg = pangraph_from_read_file(sample, pangraph_sample, index,
opt.max_diff, opt.error_rate, sample_outdir,
opt.min_cluster_size, opt.genome_size, opt.illumina, opt.clean,
opt.max_covg, opt.threads, opt.keep_extra_debugging_files);
opt.min_cluster_size, opt.genome_size, opt.max_covg, opt.threads,
opt.keep_extra_debugging_files, opt.rng_seed);

if (pangraph_sample->nodes.empty()) {
BOOST_LOG_TRIVIAL(warning)
Expand Down
20 changes: 13 additions & 7 deletions src/denovo_discovery/discover_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,11 +78,6 @@ void setup_discover_subcommand(CLI::App& app)
"reads")
->group("Preset");

discover_subcmd
->add_flag(
"--clean", opt->clean, "Add a step to clean and detangle the pangraph")
->group("Filtering");

discover_subcmd
->add_flag("--bin", opt->binomial,
"Use binomial model for kmer coverages [default: negative binomial]")
Expand Down Expand Up @@ -169,6 +164,17 @@ void setup_discover_subcommand(CLI::App& app)
discover_subcmd->add_flag(
"-v", opt->verbosity, "Verbosity of logging. Repeat for increased verbosity");

discover_subcmd
->add_option(
"-r,--rng-seed", opt->rng_seed, "RNG seed, an int>0 to force deterministic "
"mapping when multiple optimal mappings are "
"possible. To be avoided except in "
"debugging/investigation scenarios. A value "
"of 0 will be interpreted as no seed given "
"and mapping will not be deterministic.")
->capture_default_str()
->group("Debugging");

// Set the function that will be called when this subcommand is issued.
discover_subcmd->callback([opt]() { pandora_discover(*opt); });
}
Expand All @@ -194,8 +200,8 @@ void pandora_discover_core(const SampleData& sample, Index &index, DiscoverOptio
auto pangraph = std::make_shared<pangenome::Graph>();
uint32_t covg
= pangraph_from_read_file(sample, pangraph, index, opt.max_diff, opt.error_rate, sample_outdir,
opt.min_cluster_size, opt.genome_size, opt.illumina, opt.clean, opt.max_covg,
opt.threads, opt.keep_extra_debugging_files);
opt.min_cluster_size, opt.genome_size, opt.max_covg,
opt.threads, opt.keep_extra_debugging_files, opt.rng_seed);

if (pangraph->nodes.empty()) {
BOOST_LOG_TRIVIAL(warning)
Expand Down
33 changes: 0 additions & 33 deletions src/kmergraphwithcoverage.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
#include <sstream>
#include <limits>
#include <cstdlib> /* srand, rand */
#include <cmath>

#include <boost/math/distributions/negative_binomial.hpp>
Expand Down Expand Up @@ -327,38 +326,6 @@ float KmerGraphWithCoverage::find_max_path(std::vector<KmerNodePtr>& maxpath,
return prob_path(maxpath, sample_id, prob_model);
}

std::vector<std::vector<KmerNodePtr>> KmerGraphWithCoverage::get_random_paths(
uint32_t num_paths)
{
// find a random path through kmergraph picking ~uniformly from the outnodes at each
// point
std::vector<std::vector<KmerNodePtr>> rpaths;
std::vector<KmerNodePtr> rpath;
uint32_t i;

time_t now;
now = time(nullptr);
srand((unsigned int)now);

if (!kmer_prg->nodes.empty()) {
for (uint32_t j = 0; j != num_paths; ++j) {
i = rand() % kmer_prg->nodes[0]->out_nodes.size();
rpath.push_back(kmer_prg->nodes[0]->out_nodes[i].lock());
while (rpath.back() != kmer_prg->nodes[kmer_prg->nodes.size() - 1]) {
if (rpath.back()->out_nodes.size() == 1) {
rpath.push_back(rpath.back()->out_nodes[0].lock());
} else {
i = rand() % rpath.back()->out_nodes.size();
rpath.push_back(rpath.back()->out_nodes[i].lock());
}
}
rpath.pop_back();
rpaths.push_back(rpath);
rpath.clear();
}
}
return rpaths;
}

float KmerGraphWithCoverage::prob_path(const std::vector<KmerNodePtr>& kpath,
const uint32_t& sample_id, const std::string& prob_model)
Expand Down
11 changes: 0 additions & 11 deletions src/localPRG.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1835,17 +1835,6 @@ void LocalPRG::add_variants_to_vcf(VCF& master_vcf, pangenome::NodePtr pnode,
}
}

std::string LocalPRG::random_path()
{
std::vector<LocalNodePtr> npath;
npath.push_back(prg.nodes.at(0));
while (not npath.back()->outNodes.empty()) {
uint32_t random_number = rand() % npath.back()->outNodes.size();
npath.push_back(npath.back()->outNodes[random_number]);
}
return string_along_path(npath);
}

bool operator!=(
const std::vector<KmerNodePtr>& lhs, const std::vector<KmerNodePtr>& rhs)
{
Expand Down
20 changes: 13 additions & 7 deletions src/map_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -84,11 +84,6 @@ void setup_map_subcommand(CLI::App& app)
"reads")
->group("Preset");

map_subcmd
->add_flag(
"--clean", opt->clean, "Add a step to clean and detangle the pangraph")
->group("Filtering");

map_subcmd
->add_flag("--bin", opt->binomial,
"Use binomial model for kmer coverages [default: negative binomial]")
Expand Down Expand Up @@ -225,6 +220,17 @@ void setup_map_subcommand(CLI::App& app)
"create thousands of files.")
->group("Debugging");

map_subcmd
->add_option(
"-r,--rng-seed", opt->rng_seed, "RNG seed, an int>0 to force deterministic "
"mapping when multiple optimal mappings are "
"possible. To be avoided except in "
"debugging/investigation scenarios. A value "
"of 0 will be interpreted as no seed given "
"and mapping will not be deterministic.")
->capture_default_str()
->group("Debugging");

map_subcmd->add_flag(
"-v", opt->verbosity, "Verbosity of logging. Repeat for increased verbosity");

Expand Down Expand Up @@ -283,8 +289,8 @@ int pandora_map(MapOptions& opt)
auto pangraph = std::make_shared<pangenome::Graph>();
uint32_t covg
= pangraph_from_read_file(sample, pangraph, index, opt.max_diff, opt.error_rate,
opt.outdir, opt.min_cluster_size, opt.genome_size, opt.illumina, opt.clean, opt.max_covg,
opt.threads, opt.keep_extra_debugging_files);
opt.outdir, opt.min_cluster_size, opt.genome_size, opt.max_covg,
opt.threads, opt.keep_extra_debugging_files, opt.rng_seed);

if (pangraph->nodes.empty()) {
BOOST_LOG_TRIVIAL(info) << "Found none of the LocalPRGs in the reads.";
Expand Down
Loading
Loading