Skip to content

Commit

Permalink
Merge pull request #351 from leoisl/min_gene_coverage_proportion
Browse files Browse the repository at this point in the history
feat: adds parameter --min-gene-coverage-proportion to pandora map, compare and discover
  • Loading branch information
leoisl authored Dec 12, 2023
2 parents 40d4e78 + bfa3404 commit 047a3cc
Show file tree
Hide file tree
Showing 8 changed files with 62 additions and 6 deletions.
1 change: 1 addition & 0 deletions include/compare_main.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ struct CompareOptions {
float min_absolute_gene_coverage { 3.0 };
float min_relative_gene_coverage { 0.05 };
float max_relative_gene_coverage { 100 };
float min_gene_coverage_proportion { 0.8 };
bool binomial { false };
bool do_not_auto_update_params { false };
uint32_t max_covg { 300 };
Expand Down
3 changes: 2 additions & 1 deletion include/denovo_discovery/discover_main.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,8 @@ struct DiscoverOptions {
uint32_t max_covg { 600 };
float min_absolute_gene_coverage { 3.0 };
float min_relative_gene_coverage { 0.05 };
float max_relative_gene_coverage { 10 };
float max_relative_gene_coverage { 100 };
float min_gene_coverage_proportion { 0.8 };
uint32_t min_cluster_size { 10 };
uint32_t max_num_kmers_to_avg { 100 };
bool keep_extra_debugging_files { false };
Expand Down
2 changes: 1 addition & 1 deletion include/localPRG.h
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ class LocalPRG {
std::vector<LocalNodePtr>&, const uint32_t, const bool, const uint32_t,
const uint32_t& max_num_kmers_to_average, const uint32_t& sample_id,
float min_absolute_gene_coverage, float min_relative_gene_coverage,
float max_relative_gene_coverage) const;
float max_relative_gene_coverage, float min_gene_coverage_proportion) const;
std::vector<LocalNodePtr> get_valid_vcf_reference(const std::string&) const;

void add_variants_to_vcf(VCF&, pangenome::NodePtr, const std::string&,
Expand Down
1 change: 1 addition & 0 deletions include/map_main.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ struct MapOptions {
float min_absolute_gene_coverage { 3.0 };
float min_relative_gene_coverage { 0.05 };
float max_relative_gene_coverage { 100 };
float min_gene_coverage_proportion { 0.8 };
bool genotype { false };
bool local_genotype { false };
bool snps_only { false };
Expand Down
14 changes: 13 additions & 1 deletion src/compare_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,18 @@ void setup_compare_subcommand(CLI::App& app)
->type_name("FLOAT")
->group("Filtering");

compare_subcmd
->add_option(
"--min-gene-coverage-proportion", opt->min_gene_coverage_proportion,
"Minimum gene coverage proportion to keep a gene. "
"Given the coverage on the kmers of the maximum likelihood path of a gene, we compute "
"the number of bases that have at least one read covering it. "
"If the proportion of such bases is larger than the value in this "
"parameter, the gene is kept. Otherwise, the gene is filtered out.")
->capture_default_str()
->type_name("FLOAT")
->group("Filtering");

description = "Add extra step to carefully genotype sites.";
auto* gt_opt = compare_subcmd->add_flag("--genotype", opt->genotype, description)
->group("Consensus/Variant Calling");
Expand Down Expand Up @@ -335,7 +347,7 @@ int pandora_compare(CompareOptions& opt)
local_prg->add_consensus_path_to_fastaq(consensus_fq, c->second, kmp, lmp,
index.get_window_size(), opt.binomial, covg, opt.max_num_kmers_to_avg, 0,
opt.min_absolute_gene_coverage, opt.min_relative_gene_coverage,
opt.max_relative_gene_coverage);
opt.max_relative_gene_coverage, opt.min_gene_coverage_proportion);

if (kmp.empty()) {
c = pangraph_sample->remove_node(c->second);
Expand Down
14 changes: 13 additions & 1 deletion src/denovo_discovery/discover_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,18 @@ void setup_discover_subcommand(CLI::App& app)
->type_name("FLOAT")
->group("Filtering");

discover_subcmd
->add_option(
"--min-gene-coverage-proportion", opt->min_gene_coverage_proportion,
"Minimum gene coverage proportion to keep a gene. "
"Given the coverage on the kmers of the maximum likelihood path of a gene, we compute "
"the number of bases that have at least one read covering it. "
"If the proportion of such bases is larger than the value in this "
"parameter, the gene is kept. Otherwise, the gene is filtered out.")
->capture_default_str()
->type_name("FLOAT")
->group("Filtering");

description
= "Minimum size of a cluster of hits between a read and a loci to consider "
"the loci present";
Expand Down Expand Up @@ -293,7 +305,7 @@ void pandora_discover_core(const SampleData& sample, Index &index, DiscoverOptio
pangraph_node, kmp, lmp, index.get_window_size(), opt.binomial, covg,
opt.max_num_kmers_to_avg, 0,
opt.min_absolute_gene_coverage, opt.min_relative_gene_coverage,
opt.max_relative_gene_coverage);
opt.max_relative_gene_coverage, opt.min_gene_coverage_proportion);

if (kmp.empty()) {
// mark the node as to remove
Expand Down
19 changes: 18 additions & 1 deletion src/localPRG.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1699,7 +1699,7 @@ void LocalPRG::add_consensus_path_to_fastaq(Fastaq& output_fq, pangenome::NodePt
const bool bin, const uint32_t global_covg,
const uint32_t& max_num_kmers_to_average, const uint32_t& sample_id,
float min_absolute_gene_coverage, float min_relative_gene_coverage,
float max_relative_gene_coverage) const
float max_relative_gene_coverage, float min_gene_coverage_proportion) const
{
if (pnode->covg == 0) {
BOOST_LOG_TRIVIAL(warning) << "Node " << pnode->get_name() << " has no reads";
Expand All @@ -1719,10 +1719,27 @@ void LocalPRG::add_consensus_path_to_fastaq(Fastaq& output_fq, pangenome::NodePt

std::vector<uint32_t> covgs
= get_covgs_along_localnode_path(pnode, lmp, kmp, sample_id);

auto mean_covg = Maths::mean(covgs.begin(), covgs.end());
BOOST_LOG_TRIVIAL(debug) << "Found global coverage " << global_covg
<< " and path mean " << mean_covg;

// apply the coverage proportion filter
uint32_t amount_of_bases_with_coverage = std::count_if(covgs.begin(), covgs.end(),
[](uint32_t covg){return covg > 0;});
float coverage_proportion = ((float)amount_of_bases_with_coverage) / covgs.size();
const bool coverage_proportion_is_too_low = coverage_proportion < min_gene_coverage_proportion;
if (coverage_proportion_is_too_low) {
BOOST_LOG_TRIVIAL(warning)
<< "Filtering out gene " << name << " due to "
<< "coverage proportion (" << coverage_proportion << ") "
<< "being too low, less than the --min-gene-coverage-proportion parameter ("
<< min_gene_coverage_proportion << ")";
kmp.clear();
return;
}

// apply the min/max coverage filters
if (mean_covg < min_absolute_gene_coverage) {
BOOST_LOG_TRIVIAL(warning)
<< "Filtering out gene " << name << " due to "
Expand Down
14 changes: 13 additions & 1 deletion src/map_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,18 @@ void setup_map_subcommand(CLI::App& app)
->type_name("FLOAT")
->group("Filtering");

map_subcmd
->add_option(
"--min-gene-coverage-proportion", opt->min_gene_coverage_proportion,
"Minimum gene coverage proportion to keep a gene. "
"Given the coverage on the kmers of the maximum likelihood path of a gene, we compute "
"the number of bases that have at least one read covering it. "
"If the proportion of such bases is larger than the value in this "
"parameter, the gene is kept. Otherwise, the gene is filtered out.")
->capture_default_str()
->type_name("FLOAT")
->group("Filtering");

description = "Add extra step to carefully genotype sites.";
auto* gt_opt = map_subcmd->add_flag("--genotype", opt->genotype, description)
->group("Consensus/Variant Calling");
Expand Down Expand Up @@ -377,7 +389,7 @@ int pandora_map(MapOptions& opt)
prg->add_consensus_path_to_fastaq(consensus_fq, pangraph_node, kmp, lmp,
index.get_window_size(), opt.binomial, covg, opt.max_num_kmers_to_avg, 0,
opt.min_absolute_gene_coverage, opt.min_relative_gene_coverage,
opt.max_relative_gene_coverage);
opt.max_relative_gene_coverage, opt.min_gene_coverage_proportion);

if (kmp.empty()) {
#pragma omp critical(nodes_to_remove)
Expand Down

0 comments on commit 047a3cc

Please sign in to comment.