diff --git a/include/compare_main.h b/include/compare_main.h index b32a3680..81718135 100644 --- a/include/compare_main.h +++ b/include/compare_main.h @@ -45,6 +45,7 @@ struct CompareOptions { float min_absolute_gene_coverage { 3.0 }; float min_relative_gene_coverage { 0.05 }; float max_relative_gene_coverage { 100 }; + float min_gene_coverage_proportion { 0.8 }; bool binomial { false }; bool do_not_auto_update_params { false }; uint32_t max_covg { 300 }; diff --git a/include/denovo_discovery/discover_main.h b/include/denovo_discovery/discover_main.h index 89590d18..36c601c1 100644 --- a/include/denovo_discovery/discover_main.h +++ b/include/denovo_discovery/discover_main.h @@ -34,7 +34,8 @@ struct DiscoverOptions { uint32_t max_covg { 600 }; float min_absolute_gene_coverage { 3.0 }; float min_relative_gene_coverage { 0.05 }; - float max_relative_gene_coverage { 10 }; + float max_relative_gene_coverage { 100 }; + float min_gene_coverage_proportion { 0.8 }; uint32_t min_cluster_size { 10 }; uint32_t max_num_kmers_to_avg { 100 }; bool keep_extra_debugging_files { false }; diff --git a/include/localPRG.h b/include/localPRG.h index 5c9e4d80..e3646074 100644 --- a/include/localPRG.h +++ b/include/localPRG.h @@ -157,7 +157,7 @@ class LocalPRG { std::vector&, const uint32_t, const bool, const uint32_t, const uint32_t& max_num_kmers_to_average, const uint32_t& sample_id, float min_absolute_gene_coverage, float min_relative_gene_coverage, - float max_relative_gene_coverage) const; + float max_relative_gene_coverage, float min_gene_coverage_proportion) const; std::vector get_valid_vcf_reference(const std::string&) const; void add_variants_to_vcf(VCF&, pangenome::NodePtr, const std::string&, diff --git a/include/map_main.h b/include/map_main.h index f917e79c..83f64e4b 100644 --- a/include/map_main.h +++ b/include/map_main.h @@ -49,6 +49,7 @@ struct MapOptions { float min_absolute_gene_coverage { 3.0 }; float min_relative_gene_coverage { 0.05 }; float max_relative_gene_coverage { 100 }; + float min_gene_coverage_proportion { 0.8 }; bool genotype { false }; bool local_genotype { false }; bool snps_only { false }; diff --git a/src/compare_main.cpp b/src/compare_main.cpp index 08fddc9e..bdaecc21 100644 --- a/src/compare_main.cpp +++ b/src/compare_main.cpp @@ -150,6 +150,18 @@ void setup_compare_subcommand(CLI::App& app) ->type_name("FLOAT") ->group("Filtering"); + compare_subcmd + ->add_option( + "--min-gene-coverage-proportion", opt->min_gene_coverage_proportion, + "Minimum gene coverage proportion to keep a gene. " + "Given the coverage on the kmers of the maximum likelihood path of a gene, we compute " + "the number of bases that have at least one read covering it. " + "If the proportion of such bases is larger than the value in this " + "parameter, the gene is kept. Otherwise, the gene is filtered out.") + ->capture_default_str() + ->type_name("FLOAT") + ->group("Filtering"); + description = "Add extra step to carefully genotype sites."; auto* gt_opt = compare_subcmd->add_flag("--genotype", opt->genotype, description) ->group("Consensus/Variant Calling"); @@ -335,7 +347,7 @@ int pandora_compare(CompareOptions& opt) local_prg->add_consensus_path_to_fastaq(consensus_fq, c->second, kmp, lmp, index.get_window_size(), opt.binomial, covg, opt.max_num_kmers_to_avg, 0, opt.min_absolute_gene_coverage, opt.min_relative_gene_coverage, - opt.max_relative_gene_coverage); + opt.max_relative_gene_coverage, opt.min_gene_coverage_proportion); if (kmp.empty()) { c = pangraph_sample->remove_node(c->second); diff --git a/src/denovo_discovery/discover_main.cpp b/src/denovo_discovery/discover_main.cpp index c5d57df8..b82c71b2 100644 --- a/src/denovo_discovery/discover_main.cpp +++ b/src/denovo_discovery/discover_main.cpp @@ -158,6 +158,18 @@ void setup_discover_subcommand(CLI::App& app) ->type_name("FLOAT") ->group("Filtering"); + discover_subcmd + ->add_option( + "--min-gene-coverage-proportion", opt->min_gene_coverage_proportion, + "Minimum gene coverage proportion to keep a gene. " + "Given the coverage on the kmers of the maximum likelihood path of a gene, we compute " + "the number of bases that have at least one read covering it. " + "If the proportion of such bases is larger than the value in this " + "parameter, the gene is kept. Otherwise, the gene is filtered out.") + ->capture_default_str() + ->type_name("FLOAT") + ->group("Filtering"); + description = "Minimum size of a cluster of hits between a read and a loci to consider " "the loci present"; @@ -293,7 +305,7 @@ void pandora_discover_core(const SampleData& sample, Index &index, DiscoverOptio pangraph_node, kmp, lmp, index.get_window_size(), opt.binomial, covg, opt.max_num_kmers_to_avg, 0, opt.min_absolute_gene_coverage, opt.min_relative_gene_coverage, - opt.max_relative_gene_coverage); + opt.max_relative_gene_coverage, opt.min_gene_coverage_proportion); if (kmp.empty()) { // mark the node as to remove diff --git a/src/localPRG.cpp b/src/localPRG.cpp index 5048f7e9..70975f58 100644 --- a/src/localPRG.cpp +++ b/src/localPRG.cpp @@ -1699,7 +1699,7 @@ void LocalPRG::add_consensus_path_to_fastaq(Fastaq& output_fq, pangenome::NodePt const bool bin, const uint32_t global_covg, const uint32_t& max_num_kmers_to_average, const uint32_t& sample_id, float min_absolute_gene_coverage, float min_relative_gene_coverage, - float max_relative_gene_coverage) const + float max_relative_gene_coverage, float min_gene_coverage_proportion) const { if (pnode->covg == 0) { BOOST_LOG_TRIVIAL(warning) << "Node " << pnode->get_name() << " has no reads"; @@ -1719,10 +1719,27 @@ void LocalPRG::add_consensus_path_to_fastaq(Fastaq& output_fq, pangenome::NodePt std::vector covgs = get_covgs_along_localnode_path(pnode, lmp, kmp, sample_id); + auto mean_covg = Maths::mean(covgs.begin(), covgs.end()); BOOST_LOG_TRIVIAL(debug) << "Found global coverage " << global_covg << " and path mean " << mean_covg; + // apply the coverage proportion filter + uint32_t amount_of_bases_with_coverage = std::count_if(covgs.begin(), covgs.end(), + [](uint32_t covg){return covg > 0;}); + float coverage_proportion = ((float)amount_of_bases_with_coverage) / covgs.size(); + const bool coverage_proportion_is_too_low = coverage_proportion < min_gene_coverage_proportion; + if (coverage_proportion_is_too_low) { + BOOST_LOG_TRIVIAL(warning) + << "Filtering out gene " << name << " due to " + << "coverage proportion (" << coverage_proportion << ") " + << "being too low, less than the --min-gene-coverage-proportion parameter (" + << min_gene_coverage_proportion << ")"; + kmp.clear(); + return; + } + + // apply the min/max coverage filters if (mean_covg < min_absolute_gene_coverage) { BOOST_LOG_TRIVIAL(warning) << "Filtering out gene " << name << " due to " diff --git a/src/map_main.cpp b/src/map_main.cpp index 5ea63915..dc6ed019 100644 --- a/src/map_main.cpp +++ b/src/map_main.cpp @@ -164,6 +164,18 @@ void setup_map_subcommand(CLI::App& app) ->type_name("FLOAT") ->group("Filtering"); + map_subcmd + ->add_option( + "--min-gene-coverage-proportion", opt->min_gene_coverage_proportion, + "Minimum gene coverage proportion to keep a gene. " + "Given the coverage on the kmers of the maximum likelihood path of a gene, we compute " + "the number of bases that have at least one read covering it. " + "If the proportion of such bases is larger than the value in this " + "parameter, the gene is kept. Otherwise, the gene is filtered out.") + ->capture_default_str() + ->type_name("FLOAT") + ->group("Filtering"); + description = "Add extra step to carefully genotype sites."; auto* gt_opt = map_subcmd->add_flag("--genotype", opt->genotype, description) ->group("Consensus/Variant Calling"); @@ -377,7 +389,7 @@ int pandora_map(MapOptions& opt) prg->add_consensus_path_to_fastaq(consensus_fq, pangraph_node, kmp, lmp, index.get_window_size(), opt.binomial, covg, opt.max_num_kmers_to_avg, 0, opt.min_absolute_gene_coverage, opt.min_relative_gene_coverage, - opt.max_relative_gene_coverage); + opt.max_relative_gene_coverage, opt.min_gene_coverage_proportion); if (kmp.empty()) { #pragma omp critical(nodes_to_remove)