Skip to content

Commit

Permalink
Merge pull request #354 from rmcolq/dev
Browse files Browse the repository at this point in the history
Sync dev to master, preparing for release 0.9.3
  • Loading branch information
leoisl authored Dec 14, 2023
2 parents 487fcbb + d81f82d commit 3f66c92
Show file tree
Hide file tree
Showing 16 changed files with 417 additions and 100 deletions.
8 changes: 4 additions & 4 deletions example/run_pandora.sh
Original file line number Diff line number Diff line change
Expand Up @@ -55,20 +55,20 @@ ${make_prg_executable} from_msa --threads 1 --input msas/ --output-prefix out/pr
echo "Running ${pandora_executable} index"
"${pandora_executable}" index --threads 1 out/prgs/pangenome.prg.fa
echo "Running ${pandora_executable} map"
"${pandora_executable}" map --threads 1 --genotype -o out/map_toy_sample_1 --genome-size 700 --min-abs-gene-coverage 0 --min-rel-gene-coverage 0 --max-rel-gene-coverage 1000 out/prgs/pangenome.prg.fa.panidx.zip reads/toy_sample_1/toy_sample_1.100x.random.illumina.fastq
"${pandora_executable}" map --threads 1 --genotype -o out/map_toy_sample_1 --no-gene-coverage-filtering out/prgs/pangenome.prg.fa.panidx.zip reads/toy_sample_1/toy_sample_1.100x.random.illumina.fastq
echo "Running ${pandora_executable} compare"
"${pandora_executable}" compare --threads 1 --genotype -o out/output_toy_example_no_denovo --genome-size 700 --min-abs-gene-coverage 0 --min-rel-gene-coverage 0 --max-rel-gene-coverage 1000 out/prgs/pangenome.prg.fa.panidx.zip reads/read_index.tsv
"${pandora_executable}" compare --threads 1 --genotype -o out/output_toy_example_no_denovo --no-gene-coverage-filtering out/prgs/pangenome.prg.fa.panidx.zip reads/read_index.tsv
echo "Running pandora without denovo - done!"

echo "Running pandora with denovo..."
echo "Running ${pandora_executable} discover"
"${pandora_executable}" discover --threads 1 --outdir out/pandora_discover_out --genome-size 700 --min-abs-gene-coverage 0 --min-rel-gene-coverage 0 --max-rel-gene-coverage 1000 out/prgs/pangenome.prg.fa.panidx.zip reads/read_index.tsv
"${pandora_executable}" discover --threads 1 --outdir out/pandora_discover_out --no-gene-coverage-filtering out/prgs/pangenome.prg.fa.panidx.zip reads/read_index.tsv
echo "Running ${make_prg_executable} update"
${make_prg_executable} update --threads 1 --update-DS out/prgs/pangenome.update_DS.zip --denovo-paths out/pandora_discover_out/denovo_paths.txt --output-prefix out/updated_prgs/pangenome_updated
echo "Running ${pandora_executable} index on updated PRGs"
"${pandora_executable}" index --threads 1 out/updated_prgs/pangenome_updated.prg.fa
echo "Running ${pandora_executable} compare"
"${pandora_executable}" compare --threads 1 --genotype -o out/output_toy_example_with_denovo --genome-size 700 --min-abs-gene-coverage 0 --min-rel-gene-coverage 0 --max-rel-gene-coverage 1000 out/updated_prgs/pangenome_updated.prg.fa.panidx.zip reads/read_index.tsv
"${pandora_executable}" compare --threads 1 --genotype -o out/output_toy_example_with_denovo --no-gene-coverage-filtering out/updated_prgs/pangenome_updated.prg.fa.panidx.zip reads/read_index.tsv
echo "Running pandora with denovo - done!"

# first compare non-zip files
Expand Down
6 changes: 5 additions & 1 deletion include/cluster_files.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,11 @@ class ClusterFilterFile : public GenericFile {
public:
ClusterFilterFile(const fs::path &filepath, bool is_fake_file = false)
: GenericFile(filepath, is_fake_file){
(*this) << "read\tprg\tcluster_size\tstatus\n";
(*this) << "read\tprg\tnb_of_unique_minimisers\ttarget_cov\tread_start\t"
"read_end\toverlap\tstatus\tfavoured_cluster_prg\t"
"favoured_cluster_nb_of_unique_minimisers\t"
"favoured_cluster_target_cov\tfavoured_cluster_read_start\t"
"favoured_cluster_read_end\n";
}
};

Expand Down
5 changes: 5 additions & 0 deletions include/compare_main.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,15 @@ struct CompareOptions {
uint32_t rng_seed { 0 };
uint32_t genome_size { 5000000 };
uint32_t max_diff { 250 };
float conflicting_clusters_overlap_threshold { 0.8 };
float conflicting_clusters_minimiser_tolerance { 0.05 };
bool output_vcf { false };
bool illumina { false };
float min_absolute_gene_coverage { 3.0 };
float min_relative_gene_coverage { 0.05 };
float max_relative_gene_coverage { 100 };
float min_gene_coverage_proportion { 0.8 };
bool no_gene_coverage_filtering { false };
bool binomial { false };
bool do_not_auto_update_params { false };
uint32_t max_covg { 300 };
Expand All @@ -56,6 +60,7 @@ struct CompareOptions {
float min_allele_fraction_covg_gt { 0 };
float genotyping_error_rate { 0.01 };
uint16_t confidence_threshold { 1 };
float partial_matching_lower_bound { 0.5 };
bool keep_extra_debugging_files { false };
};

Expand Down
7 changes: 6 additions & 1 deletion include/denovo_discovery/discover_main.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,16 +25,21 @@ struct DiscoverOptions {
uint32_t rng_seed { 0 };
uint32_t genome_size { 5000000 };
uint32_t max_diff { 250 };
float conflicting_clusters_overlap_threshold { 0.8 };
float conflicting_clusters_minimiser_tolerance { 0.05 };
bool output_kg { false };
bool illumina { false };
bool binomial { false };
bool do_not_auto_update_params { false };
uint32_t max_covg { 600 };
float min_absolute_gene_coverage { 3.0 };
float min_relative_gene_coverage { 0.05 };
float max_relative_gene_coverage { 10 };
float max_relative_gene_coverage { 100 };
float min_gene_coverage_proportion { 0.8 };
bool no_gene_coverage_filtering { false };
uint32_t min_cluster_size { 10 };
uint32_t max_num_kmers_to_avg { 100 };
float partial_matching_lower_bound { 0.5 };
bool keep_extra_debugging_files { false };
};

Expand Down
2 changes: 1 addition & 1 deletion include/index.h
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,7 @@ class Index {
return prg_lengths[id];
}

inline const std::vector<uint32_t> & get_prg_max_path_lengths() const {
inline std::vector<uint32_t> & get_prg_max_path_lengths() {
return prg_max_path_lengths;
}
inline uint32_t get_prg_max_path_lengths_given_id(size_t id) const {
Expand Down
2 changes: 1 addition & 1 deletion include/localPRG.h
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ class LocalPRG {
std::vector<LocalNodePtr>&, const uint32_t, const bool, const uint32_t,
const uint32_t& max_num_kmers_to_average, const uint32_t& sample_id,
float min_absolute_gene_coverage, float min_relative_gene_coverage,
float max_relative_gene_coverage) const;
float max_relative_gene_coverage, float min_gene_coverage_proportion, bool no_gene_coverage_filtering) const;
std::vector<LocalNodePtr> get_valid_vcf_reference(const std::string&) const;

void add_variants_to_vcf(VCF&, pangenome::NodePtr, const std::string&,
Expand Down
5 changes: 5 additions & 0 deletions include/map_main.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ struct MapOptions {
uint32_t rng_seed { 0 };
uint32_t genome_size { 5000000 };
uint32_t max_diff { 250 };
float conflicting_clusters_overlap_threshold { 0.8 };
float conflicting_clusters_minimiser_tolerance { 0.05 };
bool output_kg { false };
bool output_vcf { false };
bool illumina { false };
Expand All @@ -47,6 +49,8 @@ struct MapOptions {
float min_absolute_gene_coverage { 3.0 };
float min_relative_gene_coverage { 0.05 };
float max_relative_gene_coverage { 100 };
float min_gene_coverage_proportion { 0.8 };
bool no_gene_coverage_filtering { false };
bool genotype { false };
bool local_genotype { false };
bool snps_only { false };
Expand All @@ -58,6 +62,7 @@ struct MapOptions {
float min_allele_fraction_covg_gt { 0 };
float genotyping_error_rate { 0.01 };
uint16_t confidence_threshold { 1 };
float partial_matching_lower_bound { 0.5 };
bool keep_extra_debugging_files { false };
};

Expand Down
55 changes: 51 additions & 4 deletions include/minihits.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,11 @@ struct pEq {
class MinimizerHits {
private:
std::set<MinimizerHitPtr, pComp> hits;

uint32_t number_of_equal_read_minimizers;
std::vector<uint32_t> *prg_max_path_lengths; // required to calculate target coverage
public:
MinimizerHits() = default;
MinimizerHits(std::vector<uint32_t> *prg_max_path_lengths) :
hits(), number_of_equal_read_minimizers(0), prg_max_path_lengths(prg_max_path_lengths) {};
~MinimizerHits() = default;

// copy constructors
Expand All @@ -38,7 +40,7 @@ class MinimizerHits {
MinimizerHits(MinimizerHits&& other) = default;
MinimizerHits& operator=(MinimizerHits&& other) = default;

inline void insert(const MinimizerHitPtr minimizer_hit) {
inline void insert(const MinimizerHitPtr &minimizer_hit) {
hits.insert(minimizer_hit);
}

Expand All @@ -64,14 +66,59 @@ class MinimizerHits {
return hits.begin();
}

inline auto rbegin () const {
return hits.rbegin();
}

inline const MinimizerHitPtr& front() const {
return *hits.begin();
}

inline auto end () const {
return hits.end();
}

inline void clear() { hits.clear(); }
inline auto rend () const {
return hits.rend();
}

inline const MinimizerHitPtr& back() const {
return *hits.rbegin();
}

uint32_t read_span_size() const;

inline void clear() {
hits.clear();
number_of_equal_read_minimizers=0;
}

inline void increment_number_of_equal_read_minimizers() {
number_of_equal_read_minimizers++;
}

inline uint32_t get_number_of_equal_read_minimizers() const {
return number_of_equal_read_minimizers;
}

inline size_t get_number_of_unique_mini_in_cluster() const {
return size() - number_of_equal_read_minimizers;
}

bool operator<(const MinimizerHits &rhs) const;

std::pair<uint32_t, uint32_t> get_strand_counts() const;

/**
* Returns a proportion denoting the amount of overlap between this cluster and the
* cluster passed as an argument. The proportion is over the smallest cluster.
*/
double overlap_amount(const MinimizerHits& cluster) const;

double target_coverage() const;

bool is_preferred_to(const MinimizerHits& cluster,
double minimisers_tolerance) const;

};
#endif
12 changes: 8 additions & 4 deletions include/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -70,8 +70,8 @@ void define_clusters(
const std::string &sample_name,
const Seq &seq,
MinimizerHitClusters& clusters_of_hits,
const std::vector<uint32_t> &prg_min_path_lengths,
const std::vector<std::string> &prg_names,
const std::vector<uint32_t> &prg_max_path_lengths,
std::vector<std::string> &prg_names,
std::shared_ptr<MinimizerHits> &minimizer_hits, const int max_diff,
const float& fraction_kmers_required_for_cluster, const uint32_t min_cluster_size,
const uint32_t expected_number_kmers_in_read_sketch,
Expand All @@ -83,6 +83,8 @@ MinimizerHitClusters filter_clusters(
const MinimizerHitClusters& clusters_of_hits,
const std::vector<std::string> &prg_names,
ClusterFilterFile& cluster_filter_file,
const float overlap_threshold,
const float conflicting_clusters_minimiser_tolerance,
const uint32_t rng_seed = 0
);

Expand All @@ -94,7 +96,7 @@ void add_clusters_to_pangraph(
MinimizerHitClusters get_minimizer_hit_clusters(
const std::string &sample_name,
const Seq &seq,
const std::vector<uint32_t> &prg_min_path_lengths,
std::vector<uint32_t> &prg_max_path_lengths,
const std::vector<std::string> &prg_names,
std::shared_ptr<MinimizerHits> &minimizer_hits,
const int max_diff,
Expand All @@ -110,8 +112,10 @@ uint32_t pangraph_from_read_file(const SampleData& sample,
const int max_diff, const float& e_rate,
const fs::path& sample_outdir, const uint32_t min_cluster_size = 10,
const uint32_t genome_size = 5000000, const uint32_t max_covg = 300,
const float conflicting_clusters_overlap_threshold=0.8,
const float conflicting_clusters_minimiser_tolerance=0.05,
uint32_t threads = 1, const bool keep_extra_debugging_files = false,
const uint32_t rng_seed = 0);
const uint32_t rng_seed = 0, const float partial_matching_lower_bound=0.5);

void infer_most_likely_prg_path_for_pannode(
const std::vector<std::shared_ptr<LocalPRG>>&, PanNode*, uint32_t, float);
Expand Down
59 changes: 56 additions & 3 deletions src/compare_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,16 @@ void setup_compare_subcommand(CLI::App& app)
->type_name("INT")
->group("Mapping");

description
= "When two clusters of hits are conflicting, the one with highest number of unique minimisers "
"will be kept. However, if the difference between the number of unique minimisers is too small, "
"less than this parameter, then we will prefer the cluster that has higher target coverage.";
compare_subcmd
->add_option("--cluster-mini-tolerance", opt->conflicting_clusters_minimiser_tolerance, description)
->capture_default_str()
->type_name("FLOAT")
->group("Mapping");

compare_subcmd
->add_flag("--loci-vcf", opt->output_vcf, "Save a VCF file for each found loci")
->group("Input/Output");
Expand Down Expand Up @@ -140,6 +150,26 @@ void setup_compare_subcommand(CLI::App& app)
->type_name("FLOAT")
->group("Filtering");

compare_subcmd
->add_option(
"--min-gene-coverage-proportion", opt->min_gene_coverage_proportion,
"Minimum gene coverage proportion to keep a gene. "
"Given the coverage on the kmers of the maximum likelihood path of a gene, we compute "
"the number of bases that have at least one read covering it. "
"If the proportion of such bases is larger than the value in this "
"parameter, the gene is kept. Otherwise, the gene is filtered out.")
->capture_default_str()
->type_name("FLOAT")
->group("Filtering");

compare_subcmd
->add_flag(
"--no-gene-coverage-filtering", opt->no_gene_coverage_filtering,
"Do not filter genes based on their coverage, effectively ignoring params "
"--min-abs-gene-coverage, --min-rel-gene-coverage, --max-rel-gene-coverage and --min-gene-coverage-proportion. "
"This is useful if you are not using read datasets.")
->group("Filtering");

description = "Add extra step to carefully genotype sites.";
auto* gt_opt = compare_subcmd->add_flag("--genotype", opt->genotype, description)
->group("Consensus/Variant Calling");
Expand All @@ -159,6 +189,26 @@ void setup_compare_subcommand(CLI::App& app)
->type_name("INT")
->group("Mapping");

description
= "Minimum proportion of overlap between two clusters of hits to consider "
"them conflicting. Only one of the conflicting clusters will be kept.";
compare_subcmd
->add_option("--min-cluster-overlap", opt->conflicting_clusters_overlap_threshold, description)
->capture_default_str()
->type_name("FLOAT")
->group("Mapping");

description
= "Allows for partial matching between reads and a PRG. If this value is for e.g. 0.5, it means that "
"pandora will match a read to a PRG if the cluster of hits has size at least 0.5 * expected cluster size "
"for the given error rate and kmer value. Lower values allow for more hits, but possibly for false positive "
"matches.";
compare_subcmd
->add_option("--partial-matching-lower-bound", opt->partial_matching_lower_bound, description)
->capture_default_str()
->type_name("FLOAT")
->group("Mapping");

description = "Maximum number of kmers to average over when selecting the maximum "
"likelihood path";
compare_subcmd->add_option("--kmer-avg", opt->max_num_kmers_to_avg, description)
Expand Down Expand Up @@ -285,8 +335,10 @@ int pandora_compare(CompareOptions& opt)
<< sample_fpath << " (this will take a while)";
uint32_t covg = pangraph_from_read_file(sample, pangraph_sample, index,
opt.max_diff, opt.error_rate, sample_outdir,
opt.min_cluster_size, opt.genome_size, opt.max_covg, opt.threads,
opt.keep_extra_debugging_files, opt.rng_seed);
opt.min_cluster_size, opt.genome_size, opt.max_covg,
opt.conflicting_clusters_overlap_threshold, opt.conflicting_clusters_minimiser_tolerance,
opt.threads, opt.keep_extra_debugging_files, opt.rng_seed,
opt.partial_matching_lower_bound);

if (pangraph_sample->nodes.empty()) {
BOOST_LOG_TRIVIAL(warning)
Expand Down Expand Up @@ -315,7 +367,8 @@ int pandora_compare(CompareOptions& opt)
local_prg->add_consensus_path_to_fastaq(consensus_fq, c->second, kmp, lmp,
index.get_window_size(), opt.binomial, covg, opt.max_num_kmers_to_avg, 0,
opt.min_absolute_gene_coverage, opt.min_relative_gene_coverage,
opt.max_relative_gene_coverage);
opt.max_relative_gene_coverage, opt.min_gene_coverage_proportion,
opt.no_gene_coverage_filtering);

if (kmp.empty()) {
c = pangraph_sample->remove_node(c->second);
Expand Down
Loading

0 comments on commit 3f66c92

Please sign in to comment.