Skip to content

Commit

Permalink
Merge pull request #344 from leoisl/improved_mapping
Browse files Browse the repository at this point in the history
Improved mapping
  • Loading branch information
leoisl authored Dec 11, 2023
2 parents 487fcbb + 1ac2368 commit 40d4e78
Show file tree
Hide file tree
Showing 13 changed files with 241 additions and 53 deletions.
6 changes: 5 additions & 1 deletion include/cluster_files.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,11 @@ class ClusterFilterFile : public GenericFile {
public:
ClusterFilterFile(const fs::path &filepath, bool is_fake_file = false)
: GenericFile(filepath, is_fake_file){
(*this) << "read\tprg\tcluster_size\tstatus\n";
(*this) << "read\tprg\tnb_of_unique_minimisers\ttarget_cov\tread_start\t"
"read_end\toverlap\tstatus\tfavoured_cluster_prg\t"
"favoured_cluster_nb_of_unique_minimisers\t"
"favoured_cluster_target_cov\tfavoured_cluster_read_start\t"
"favoured_cluster_read_end\n";
}
};

Expand Down
2 changes: 2 additions & 0 deletions include/compare_main.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ struct CompareOptions {
uint32_t rng_seed { 0 };
uint32_t genome_size { 5000000 };
uint32_t max_diff { 250 };
float conflicting_clusters_overlap_threshold { 0.8 };
float conflicting_clusters_minimiser_tolerance { 0.05 };
bool output_vcf { false };
bool illumina { false };
float min_absolute_gene_coverage { 3.0 };
Expand Down
2 changes: 2 additions & 0 deletions include/denovo_discovery/discover_main.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ struct DiscoverOptions {
uint32_t rng_seed { 0 };
uint32_t genome_size { 5000000 };
uint32_t max_diff { 250 };
float conflicting_clusters_overlap_threshold { 0.8 };
float conflicting_clusters_minimiser_tolerance { 0.05 };
bool output_kg { false };
bool illumina { false };
bool binomial { false };
Expand Down
2 changes: 1 addition & 1 deletion include/index.h
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,7 @@ class Index {
return prg_lengths[id];
}

inline const std::vector<uint32_t> & get_prg_max_path_lengths() const {
inline std::vector<uint32_t> & get_prg_max_path_lengths() {
return prg_max_path_lengths;
}
inline uint32_t get_prg_max_path_lengths_given_id(size_t id) const {
Expand Down
2 changes: 2 additions & 0 deletions include/map_main.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ struct MapOptions {
uint32_t rng_seed { 0 };
uint32_t genome_size { 5000000 };
uint32_t max_diff { 250 };
float conflicting_clusters_overlap_threshold { 0.8 };
float conflicting_clusters_minimiser_tolerance { 0.05 };
bool output_kg { false };
bool output_vcf { false };
bool illumina { false };
Expand Down
55 changes: 51 additions & 4 deletions include/minihits.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,11 @@ struct pEq {
class MinimizerHits {
private:
std::set<MinimizerHitPtr, pComp> hits;

uint32_t number_of_equal_read_minimizers;
std::vector<uint32_t> *prg_max_path_lengths; // required to calculate target coverage
public:
MinimizerHits() = default;
MinimizerHits(std::vector<uint32_t> *prg_max_path_lengths) :
hits(), number_of_equal_read_minimizers(0), prg_max_path_lengths(prg_max_path_lengths) {};
~MinimizerHits() = default;

// copy constructors
Expand All @@ -38,7 +40,7 @@ class MinimizerHits {
MinimizerHits(MinimizerHits&& other) = default;
MinimizerHits& operator=(MinimizerHits&& other) = default;

inline void insert(const MinimizerHitPtr minimizer_hit) {
inline void insert(const MinimizerHitPtr &minimizer_hit) {
hits.insert(minimizer_hit);
}

Expand All @@ -64,14 +66,59 @@ class MinimizerHits {
return hits.begin();
}

inline auto rbegin () const {
return hits.rbegin();
}

inline const MinimizerHitPtr& front() const {
return *hits.begin();
}

inline auto end () const {
return hits.end();
}

inline void clear() { hits.clear(); }
inline auto rend () const {
return hits.rend();
}

inline const MinimizerHitPtr& back() const {
return *hits.rbegin();
}

uint32_t read_span_size() const;

inline void clear() {
hits.clear();
number_of_equal_read_minimizers=0;
}

inline void increment_number_of_equal_read_minimizers() {
number_of_equal_read_minimizers++;
}

inline uint32_t get_number_of_equal_read_minimizers() const {
return number_of_equal_read_minimizers;
}

inline size_t get_number_of_unique_mini_in_cluster() const {
return size() - number_of_equal_read_minimizers;
}

bool operator<(const MinimizerHits &rhs) const;

std::pair<uint32_t, uint32_t> get_strand_counts() const;

/**
* Returns a proportion denoting the amount of overlap between this cluster and the
* cluster passed as an argument. The proportion is over the smallest cluster.
*/
double overlap_amount(const MinimizerHits& cluster) const;

double target_coverage() const;

bool is_preferred_to(const MinimizerHits& cluster,
double minimisers_tolerance) const;

};
#endif
10 changes: 7 additions & 3 deletions include/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -70,8 +70,8 @@ void define_clusters(
const std::string &sample_name,
const Seq &seq,
MinimizerHitClusters& clusters_of_hits,
const std::vector<uint32_t> &prg_min_path_lengths,
const std::vector<std::string> &prg_names,
const std::vector<uint32_t> &prg_max_path_lengths,
std::vector<std::string> &prg_names,
std::shared_ptr<MinimizerHits> &minimizer_hits, const int max_diff,
const float& fraction_kmers_required_for_cluster, const uint32_t min_cluster_size,
const uint32_t expected_number_kmers_in_read_sketch,
Expand All @@ -83,6 +83,8 @@ MinimizerHitClusters filter_clusters(
const MinimizerHitClusters& clusters_of_hits,
const std::vector<std::string> &prg_names,
ClusterFilterFile& cluster_filter_file,
const float overlap_threshold,
const float conflicting_clusters_minimiser_tolerance,
const uint32_t rng_seed = 0
);

Expand All @@ -94,7 +96,7 @@ void add_clusters_to_pangraph(
MinimizerHitClusters get_minimizer_hit_clusters(
const std::string &sample_name,
const Seq &seq,
const std::vector<uint32_t> &prg_min_path_lengths,
std::vector<uint32_t> &prg_max_path_lengths,
const std::vector<std::string> &prg_names,
std::shared_ptr<MinimizerHits> &minimizer_hits,
const int max_diff,
Expand All @@ -110,6 +112,8 @@ uint32_t pangraph_from_read_file(const SampleData& sample,
const int max_diff, const float& e_rate,
const fs::path& sample_outdir, const uint32_t min_cluster_size = 10,
const uint32_t genome_size = 5000000, const uint32_t max_covg = 300,
const float conflicting_clusters_overlap_threshold=0.8,
const float conflicting_clusters_minimiser_tolerance=0.05,
uint32_t threads = 1, const bool keep_extra_debugging_files = false,
const uint32_t rng_seed = 0);

Expand Down
24 changes: 22 additions & 2 deletions src/compare_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,16 @@ void setup_compare_subcommand(CLI::App& app)
->type_name("INT")
->group("Mapping");

description
= "When two clusters of hits are conflicting, the one with highest number of unique minimisers "
"will be kept. However, if the difference between the number of unique minimisers is too small, "
"less than this parameter, then we will prefer the cluster that has higher target coverage.";
compare_subcmd
->add_option("--cluster-mini-tolerance", opt->conflicting_clusters_minimiser_tolerance, description)
->capture_default_str()
->type_name("FLOAT")
->group("Mapping");

compare_subcmd
->add_flag("--loci-vcf", opt->output_vcf, "Save a VCF file for each found loci")
->group("Input/Output");
Expand Down Expand Up @@ -159,6 +169,15 @@ void setup_compare_subcommand(CLI::App& app)
->type_name("INT")
->group("Mapping");

description
= "Minimum proportion of overlap between two clusters of hits to consider "
"them conflicting. Only one of the conflicting clusters will be kept.";
compare_subcmd
->add_option("--min-cluster-overlap", opt->conflicting_clusters_overlap_threshold, description)
->capture_default_str()
->type_name("FLOAT")
->group("Mapping");

description = "Maximum number of kmers to average over when selecting the maximum "
"likelihood path";
compare_subcmd->add_option("--kmer-avg", opt->max_num_kmers_to_avg, description)
Expand Down Expand Up @@ -285,8 +304,9 @@ int pandora_compare(CompareOptions& opt)
<< sample_fpath << " (this will take a while)";
uint32_t covg = pangraph_from_read_file(sample, pangraph_sample, index,
opt.max_diff, opt.error_rate, sample_outdir,
opt.min_cluster_size, opt.genome_size, opt.max_covg, opt.threads,
opt.keep_extra_debugging_files, opt.rng_seed);
opt.min_cluster_size, opt.genome_size, opt.max_covg,
opt.conflicting_clusters_overlap_threshold, opt.conflicting_clusters_minimiser_tolerance,
opt.threads, opt.keep_extra_debugging_files, opt.rng_seed);

if (pangraph_sample->nodes.empty()) {
BOOST_LOG_TRIVIAL(warning)
Expand Down
20 changes: 20 additions & 0 deletions src/denovo_discovery/discover_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,25 @@ void setup_discover_subcommand(CLI::App& app)
->type_name("INT")
->group("Mapping");

description
= "Minimum proportion of overlap between two clusters of hits to consider "
"them conflicting. Only one of the conflicting clusters will be kept.";
discover_subcmd
->add_option("--min-cluster-overlap", opt->conflicting_clusters_overlap_threshold, description)
->capture_default_str()
->type_name("FLOAT")
->group("Mapping");

description
= "When two clusters of hits are conflicting, the one with highest number of unique minimisers "
"will be kept. However, if the difference between the number of unique minimisers is too small, "
"less than this parameter, then we will prefer the cluster that has higher target coverage.";
discover_subcmd
->add_option("--cluster-mini-tolerance", opt->conflicting_clusters_minimiser_tolerance, description)
->capture_default_str()
->type_name("FLOAT")
->group("Mapping");

discover_subcmd
->add_flag("--kg", opt->output_kg,
"Save kmer graphs with forward and reverse coverage annotations for found "
Expand Down Expand Up @@ -201,6 +220,7 @@ void pandora_discover_core(const SampleData& sample, Index &index, DiscoverOptio
uint32_t covg
= pangraph_from_read_file(sample, pangraph, index, opt.max_diff, opt.error_rate, sample_outdir,
opt.min_cluster_size, opt.genome_size, opt.max_covg,
opt.conflicting_clusters_overlap_threshold, opt.conflicting_clusters_minimiser_tolerance,
opt.threads, opt.keep_extra_debugging_files, opt.rng_seed);

if (pangraph->nodes.empty()) {
Expand Down
22 changes: 21 additions & 1 deletion src/map_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,25 @@ void setup_map_subcommand(CLI::App& app)
->type_name("INT")
->group("Mapping");

description
= "When two clusters of hits are conflicting, the one with highest number of unique minimisers "
"will be kept. However, if the difference between the number of unique minimisers is too small, "
"less than this parameter, then we will prefer the cluster that has higher target coverage.";
map_subcmd
->add_option("--cluster-mini-tolerance", opt->conflicting_clusters_minimiser_tolerance, description)
->capture_default_str()
->type_name("FLOAT")
->group("Mapping");

description
= "Minimum proportion of overlap between two clusters of hits to consider "
"them conflicting. Only one of the conflicting clusters will be kept.";
map_subcmd
->add_option("--min-cluster-overlap", opt->conflicting_clusters_overlap_threshold, description)
->capture_default_str()
->type_name("FLOAT")
->group("Mapping");

map_subcmd
->add_flag("--kg", opt->output_kg,
"Save kmer graphs with forward and reverse coverage annotations for found "
Expand Down Expand Up @@ -282,7 +301,8 @@ int pandora_map(MapOptions& opt)
uint32_t covg
= pangraph_from_read_file(sample, pangraph, index, opt.max_diff, opt.error_rate,
opt.outdir, opt.min_cluster_size, opt.genome_size, opt.max_covg,
opt.threads, opt.keep_extra_debugging_files, opt.rng_seed);
opt.conflicting_clusters_overlap_threshold, opt.conflicting_clusters_minimiser_tolerance,
opt.threads, opt.keep_extra_debugging_files, opt.rng_seed);

if (pangraph->nodes.empty()) {
BOOST_LOG_TRIVIAL(info) << "Found none of the LocalPRGs in the reads.";
Expand Down
38 changes: 38 additions & 0 deletions src/minihits.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -115,3 +115,41 @@ std::pair<uint32_t, uint32_t> MinimizerHits::get_strand_counts() const {
uint32_t minus_strand_count = strands.size() - plus_strand_count;
return std::make_pair(plus_strand_count, minus_strand_count);
}

uint32_t MinimizerHits::read_span_size() const
{
return back()->get_read_start_position() - front()->get_read_start_position();
}

double MinimizerHits::overlap_amount(const MinimizerHits& cluster) const {
const uint32_t start = std::max(this->front()->get_read_start_position(),
cluster.front()->get_read_start_position());
const uint32_t end = std::min(this->back()->get_read_start_position(),
cluster.back()->get_read_start_position());

const bool no_overlap = start > end;
if (no_overlap) {
return false;
}

const uint32_t overlap = end - start;
const uint32_t shortest_read_span = std::min(this->read_span_size(), cluster.read_span_size());

return (double)overlap / shortest_read_span;
}

double MinimizerHits::target_coverage() const {
return (double)get_number_of_unique_mini_in_cluster() / (double)prg_max_path_lengths->at(front()->get_prg_id());
}

bool MinimizerHits::is_preferred_to(const MinimizerHits& cluster, double minimisers_tolerance) const {
double margin = minimisers_tolerance *
std::max(this->get_number_of_unique_mini_in_cluster(), cluster.get_number_of_unique_mini_in_cluster());
double difference = std::abs((double)(this->get_number_of_unique_mini_in_cluster() - cluster.get_number_of_unique_mini_in_cluster()));

if (difference > margin) {
return this->get_number_of_unique_mini_in_cluster() > cluster.get_number_of_unique_mini_in_cluster();
}

return this->target_coverage() > cluster.target_coverage();
}
3 changes: 1 addition & 2 deletions src/sam_file.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -179,8 +179,7 @@ std::string SAMFile::get_sam_record_from_hit_cluster(
ss << seq.name << "\t"
<< flag << "\t"
<< prg_name << "\t"
<< "0\t"
<< "." << "\t"
<< "1\t"
<< "255\t"
<< cigar << "\t"
<< "*\t0\t0\t"
Expand Down
Loading

0 comments on commit 40d4e78

Please sign in to comment.