Skip to content

Commit

Permalink
Merge pull request #352 from leoisl/no-gene-cov-filter
Browse files Browse the repository at this point in the history
feat: adds flag --no-gene-coverage-filtering to pandora map, compare and discover
  • Loading branch information
leoisl authored Dec 13, 2023
2 parents 047a3cc + 183c397 commit 6befbdb
Show file tree
Hide file tree
Showing 9 changed files with 90 additions and 57 deletions.
8 changes: 4 additions & 4 deletions example/run_pandora.sh
Original file line number Diff line number Diff line change
Expand Up @@ -55,20 +55,20 @@ ${make_prg_executable} from_msa --threads 1 --input msas/ --output-prefix out/pr
echo "Running ${pandora_executable} index"
"${pandora_executable}" index --threads 1 out/prgs/pangenome.prg.fa
echo "Running ${pandora_executable} map"
"${pandora_executable}" map --threads 1 --genotype -o out/map_toy_sample_1 --genome-size 700 --min-abs-gene-coverage 0 --min-rel-gene-coverage 0 --max-rel-gene-coverage 1000 out/prgs/pangenome.prg.fa.panidx.zip reads/toy_sample_1/toy_sample_1.100x.random.illumina.fastq
"${pandora_executable}" map --threads 1 --genotype -o out/map_toy_sample_1 --no-gene-coverage-filtering out/prgs/pangenome.prg.fa.panidx.zip reads/toy_sample_1/toy_sample_1.100x.random.illumina.fastq
echo "Running ${pandora_executable} compare"
"${pandora_executable}" compare --threads 1 --genotype -o out/output_toy_example_no_denovo --genome-size 700 --min-abs-gene-coverage 0 --min-rel-gene-coverage 0 --max-rel-gene-coverage 1000 out/prgs/pangenome.prg.fa.panidx.zip reads/read_index.tsv
"${pandora_executable}" compare --threads 1 --genotype -o out/output_toy_example_no_denovo --no-gene-coverage-filtering out/prgs/pangenome.prg.fa.panidx.zip reads/read_index.tsv
echo "Running pandora without denovo - done!"

echo "Running pandora with denovo..."
echo "Running ${pandora_executable} discover"
"${pandora_executable}" discover --threads 1 --outdir out/pandora_discover_out --genome-size 700 --min-abs-gene-coverage 0 --min-rel-gene-coverage 0 --max-rel-gene-coverage 1000 out/prgs/pangenome.prg.fa.panidx.zip reads/read_index.tsv
"${pandora_executable}" discover --threads 1 --outdir out/pandora_discover_out --no-gene-coverage-filtering out/prgs/pangenome.prg.fa.panidx.zip reads/read_index.tsv
echo "Running ${make_prg_executable} update"
${make_prg_executable} update --threads 1 --update-DS out/prgs/pangenome.update_DS.zip --denovo-paths out/pandora_discover_out/denovo_paths.txt --output-prefix out/updated_prgs/pangenome_updated
echo "Running ${pandora_executable} index on updated PRGs"
"${pandora_executable}" index --threads 1 out/updated_prgs/pangenome_updated.prg.fa
echo "Running ${pandora_executable} compare"
"${pandora_executable}" compare --threads 1 --genotype -o out/output_toy_example_with_denovo --genome-size 700 --min-abs-gene-coverage 0 --min-rel-gene-coverage 0 --max-rel-gene-coverage 1000 out/updated_prgs/pangenome_updated.prg.fa.panidx.zip reads/read_index.tsv
"${pandora_executable}" compare --threads 1 --genotype -o out/output_toy_example_with_denovo --no-gene-coverage-filtering out/updated_prgs/pangenome_updated.prg.fa.panidx.zip reads/read_index.tsv
echo "Running pandora with denovo - done!"

# first compare non-zip files
Expand Down
1 change: 1 addition & 0 deletions include/compare_main.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ struct CompareOptions {
float min_relative_gene_coverage { 0.05 };
float max_relative_gene_coverage { 100 };
float min_gene_coverage_proportion { 0.8 };
bool no_gene_coverage_filtering { false };
bool binomial { false };
bool do_not_auto_update_params { false };
uint32_t max_covg { 300 };
Expand Down
1 change: 1 addition & 0 deletions include/denovo_discovery/discover_main.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ struct DiscoverOptions {
float min_relative_gene_coverage { 0.05 };
float max_relative_gene_coverage { 100 };
float min_gene_coverage_proportion { 0.8 };
bool no_gene_coverage_filtering { false };
uint32_t min_cluster_size { 10 };
uint32_t max_num_kmers_to_avg { 100 };
bool keep_extra_debugging_files { false };
Expand Down
2 changes: 1 addition & 1 deletion include/localPRG.h
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ class LocalPRG {
std::vector<LocalNodePtr>&, const uint32_t, const bool, const uint32_t,
const uint32_t& max_num_kmers_to_average, const uint32_t& sample_id,
float min_absolute_gene_coverage, float min_relative_gene_coverage,
float max_relative_gene_coverage, float min_gene_coverage_proportion) const;
float max_relative_gene_coverage, float min_gene_coverage_proportion, bool no_gene_coverage_filtering) const;
std::vector<LocalNodePtr> get_valid_vcf_reference(const std::string&) const;

void add_variants_to_vcf(VCF&, pangenome::NodePtr, const std::string&,
Expand Down
1 change: 1 addition & 0 deletions include/map_main.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ struct MapOptions {
float min_relative_gene_coverage { 0.05 };
float max_relative_gene_coverage { 100 };
float min_gene_coverage_proportion { 0.8 };
bool no_gene_coverage_filtering { false };
bool genotype { false };
bool local_genotype { false };
bool snps_only { false };
Expand Down
11 changes: 10 additions & 1 deletion src/compare_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,14 @@ void setup_compare_subcommand(CLI::App& app)
->type_name("FLOAT")
->group("Filtering");

compare_subcmd
->add_flag(
"--no-gene-coverage-filtering", opt->no_gene_coverage_filtering,
"Do not filter genes based on their coverage, effectively ignoring params "
"--min-abs-gene-coverage, --min-rel-gene-coverage, --max-rel-gene-coverage and --min-gene-coverage-proportion. "
"This is useful if you are not using read datasets.")
->group("Filtering");

description = "Add extra step to carefully genotype sites.";
auto* gt_opt = compare_subcmd->add_flag("--genotype", opt->genotype, description)
->group("Consensus/Variant Calling");
Expand Down Expand Up @@ -347,7 +355,8 @@ int pandora_compare(CompareOptions& opt)
local_prg->add_consensus_path_to_fastaq(consensus_fq, c->second, kmp, lmp,
index.get_window_size(), opt.binomial, covg, opt.max_num_kmers_to_avg, 0,
opt.min_absolute_gene_coverage, opt.min_relative_gene_coverage,
opt.max_relative_gene_coverage, opt.min_gene_coverage_proportion);
opt.max_relative_gene_coverage, opt.min_gene_coverage_proportion,
opt.no_gene_coverage_filtering);

if (kmp.empty()) {
c = pangraph_sample->remove_node(c->second);
Expand Down
11 changes: 10 additions & 1 deletion src/denovo_discovery/discover_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,14 @@ void setup_discover_subcommand(CLI::App& app)
->type_name("FLOAT")
->group("Filtering");

discover_subcmd
->add_flag(
"--no-gene-coverage-filtering", opt->no_gene_coverage_filtering,
"Do not filter genes based on their coverage, effectively ignoring params "
"--min-abs-gene-coverage, --min-rel-gene-coverage, --max-rel-gene-coverage and --min-gene-coverage-proportion. "
"This is useful if you are not using read datasets.")
->group("Filtering");

description
= "Minimum size of a cluster of hits between a read and a loci to consider "
"the loci present";
Expand Down Expand Up @@ -305,7 +313,8 @@ void pandora_discover_core(const SampleData& sample, Index &index, DiscoverOptio
pangraph_node, kmp, lmp, index.get_window_size(), opt.binomial, covg,
opt.max_num_kmers_to_avg, 0,
opt.min_absolute_gene_coverage, opt.min_relative_gene_coverage,
opt.max_relative_gene_coverage, opt.min_gene_coverage_proportion);
opt.max_relative_gene_coverage, opt.min_gene_coverage_proportion,
opt.no_gene_coverage_filtering);

if (kmp.empty()) {
// mark the node as to remove
Expand Down
101 changes: 52 additions & 49 deletions src/localPRG.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1699,7 +1699,8 @@ void LocalPRG::add_consensus_path_to_fastaq(Fastaq& output_fq, pangenome::NodePt
const bool bin, const uint32_t global_covg,
const uint32_t& max_num_kmers_to_average, const uint32_t& sample_id,
float min_absolute_gene_coverage, float min_relative_gene_coverage,
float max_relative_gene_coverage, float min_gene_coverage_proportion) const
float max_relative_gene_coverage, float min_gene_coverage_proportion,
bool no_gene_coverage_filtering) const
{
if (pnode->covg == 0) {
BOOST_LOG_TRIVIAL(warning) << "Node " << pnode->get_name() << " has no reads";
Expand All @@ -1724,57 +1725,59 @@ void LocalPRG::add_consensus_path_to_fastaq(Fastaq& output_fq, pangenome::NodePt
BOOST_LOG_TRIVIAL(debug) << "Found global coverage " << global_covg
<< " and path mean " << mean_covg;

// apply the coverage proportion filter
uint32_t amount_of_bases_with_coverage = std::count_if(covgs.begin(), covgs.end(),
[](uint32_t covg){return covg > 0;});
float coverage_proportion = ((float)amount_of_bases_with_coverage) / covgs.size();
const bool coverage_proportion_is_too_low = coverage_proportion < min_gene_coverage_proportion;
if (coverage_proportion_is_too_low) {
BOOST_LOG_TRIVIAL(warning)
<< "Filtering out gene " << name << " due to "
<< "coverage proportion (" << coverage_proportion << ") "
<< "being too low, less than the --min-gene-coverage-proportion parameter ("
<< min_gene_coverage_proportion << ")";
kmp.clear();
return;
}
if (!no_gene_coverage_filtering) {
// apply the coverage proportion filter
uint32_t amount_of_bases_with_coverage = std::count_if(covgs.begin(), covgs.end(),
[](uint32_t covg){return covg > 0;});
float coverage_proportion = ((float)amount_of_bases_with_coverage) / covgs.size();
const bool coverage_proportion_is_too_low = coverage_proportion < min_gene_coverage_proportion;
if (coverage_proportion_is_too_low) {
BOOST_LOG_TRIVIAL(warning)
<< "Filtering out gene " << name << " due to "
<< "coverage proportion (" << coverage_proportion << ") "
<< "being too low, less than the --min-gene-coverage-proportion parameter ("
<< min_gene_coverage_proportion << ")";
kmp.clear();
return;
}

// apply the min/max coverage filters
if (mean_covg < min_absolute_gene_coverage) {
BOOST_LOG_TRIVIAL(warning)
<< "Filtering out gene " << name << " due to "
<< "mean coverage (" << mean_covg << ") "
<< "being too low, less than the --min-abs-gene-coverage parameter (" << min_absolute_gene_coverage << ")";
kmp.clear();
return;
}
// apply the min/max coverage filters
if (mean_covg < min_absolute_gene_coverage) {
BOOST_LOG_TRIVIAL(warning)
<< "Filtering out gene " << name << " due to "
<< "mean coverage (" << mean_covg << ") "
<< "being too low, less than the --min-abs-gene-coverage parameter (" << min_absolute_gene_coverage << ")";
kmp.clear();
return;
}

if (mean_covg < min_relative_gene_coverage*global_covg) {
BOOST_LOG_TRIVIAL(warning)
<< "Filtering out gene " << name << " due to "
<< "mean coverage (" << mean_covg << ") "
<< "being too low, less than the "
<< "--min-rel-gene-coverage * global coverage ("
<< min_relative_gene_coverage << " * " << global_covg << " = "
<< min_relative_gene_coverage * global_covg << "). "
<< "Is global coverage very different from the expected (too low/high)? "
<< "Try setting a better genome length (see --genome-size param).";
kmp.clear();
return;
}
if (mean_covg < min_relative_gene_coverage*global_covg) {
BOOST_LOG_TRIVIAL(warning)
<< "Filtering out gene " << name << " due to "
<< "mean coverage (" << mean_covg << ") "
<< "being too low, less than the "
<< "--min-rel-gene-coverage * global coverage ("
<< min_relative_gene_coverage << " * " << global_covg << " = "
<< min_relative_gene_coverage * global_covg << "). "
<< "Is global coverage very different from the expected (too low/high)? "
<< "Try setting a better genome length (see --genome-size param).";
kmp.clear();
return;
}

if (mean_covg > max_relative_gene_coverage*global_covg) {
BOOST_LOG_TRIVIAL(warning)
<< "Filtering out gene " << name << " due to "
<< "mean coverage (" << mean_covg << ") "
<< "being too high, larger than the "
<< "--max-rel-gene-coverage * global coverage ("
<< max_relative_gene_coverage << " * " << global_covg << " = "
<< max_relative_gene_coverage * global_covg << "). "
<< "Is global coverage very different from the expected (too low/high)? "
<< "Try setting a better genome length (see --genome-size param).";
kmp.clear();
return;
if (mean_covg > max_relative_gene_coverage*global_covg) {
BOOST_LOG_TRIVIAL(warning)
<< "Filtering out gene " << name << " due to "
<< "mean coverage (" << mean_covg << ") "
<< "being too high, larger than the "
<< "--max-rel-gene-coverage * global coverage ("
<< max_relative_gene_coverage << " * " << global_covg << " = "
<< max_relative_gene_coverage * global_covg << "). "
<< "Is global coverage very different from the expected (too low/high)? "
<< "Try setting a better genome length (see --genome-size param).";
kmp.clear();
return;
}
}

std::string fq_name = pnode->get_name();
Expand Down
11 changes: 10 additions & 1 deletion src/map_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,14 @@ void setup_map_subcommand(CLI::App& app)
->type_name("FLOAT")
->group("Filtering");

map_subcmd
->add_flag(
"--no-gene-coverage-filtering", opt->no_gene_coverage_filtering,
"Do not filter genes based on their coverage, effectively ignoring params "
"--min-abs-gene-coverage, --min-rel-gene-coverage, --max-rel-gene-coverage and --min-gene-coverage-proportion. "
"This is useful if you are not using read datasets.")
->group("Filtering");

description = "Add extra step to carefully genotype sites.";
auto* gt_opt = map_subcmd->add_flag("--genotype", opt->genotype, description)
->group("Consensus/Variant Calling");
Expand Down Expand Up @@ -389,7 +397,8 @@ int pandora_map(MapOptions& opt)
prg->add_consensus_path_to_fastaq(consensus_fq, pangraph_node, kmp, lmp,
index.get_window_size(), opt.binomial, covg, opt.max_num_kmers_to_avg, 0,
opt.min_absolute_gene_coverage, opt.min_relative_gene_coverage,
opt.max_relative_gene_coverage, opt.min_gene_coverage_proportion);
opt.max_relative_gene_coverage, opt.min_gene_coverage_proportion,
opt.no_gene_coverage_filtering);

if (kmp.empty()) {
#pragma omp critical(nodes_to_remove)
Expand Down

0 comments on commit 6befbdb

Please sign in to comment.