Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: adds flag --no-gene-coverage-filtering to pandora map, compare and discover #352

Merged
merged 3 commits into from
Dec 13, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions example/run_pandora.sh
Original file line number Diff line number Diff line change
Expand Up @@ -55,20 +55,20 @@ ${make_prg_executable} from_msa --threads 1 --input msas/ --output-prefix out/pr
echo "Running ${pandora_executable} index"
"${pandora_executable}" index --threads 1 out/prgs/pangenome.prg.fa
echo "Running ${pandora_executable} map"
"${pandora_executable}" map --threads 1 --genotype -o out/map_toy_sample_1 --genome-size 700 --min-abs-gene-coverage 0 --min-rel-gene-coverage 0 --max-rel-gene-coverage 1000 out/prgs/pangenome.prg.fa.panidx.zip reads/toy_sample_1/toy_sample_1.100x.random.illumina.fastq
"${pandora_executable}" map --threads 1 --genotype -o out/map_toy_sample_1 --no-gene-coverage-filtering out/prgs/pangenome.prg.fa.panidx.zip reads/toy_sample_1/toy_sample_1.100x.random.illumina.fastq
echo "Running ${pandora_executable} compare"
"${pandora_executable}" compare --threads 1 --genotype -o out/output_toy_example_no_denovo --genome-size 700 --min-abs-gene-coverage 0 --min-rel-gene-coverage 0 --max-rel-gene-coverage 1000 out/prgs/pangenome.prg.fa.panidx.zip reads/read_index.tsv
"${pandora_executable}" compare --threads 1 --genotype -o out/output_toy_example_no_denovo --no-gene-coverage-filtering out/prgs/pangenome.prg.fa.panidx.zip reads/read_index.tsv
echo "Running pandora without denovo - done!"

echo "Running pandora with denovo..."
echo "Running ${pandora_executable} discover"
"${pandora_executable}" discover --threads 1 --outdir out/pandora_discover_out --genome-size 700 --min-abs-gene-coverage 0 --min-rel-gene-coverage 0 --max-rel-gene-coverage 1000 out/prgs/pangenome.prg.fa.panidx.zip reads/read_index.tsv
"${pandora_executable}" discover --threads 1 --outdir out/pandora_discover_out --no-gene-coverage-filtering out/prgs/pangenome.prg.fa.panidx.zip reads/read_index.tsv
echo "Running ${make_prg_executable} update"
${make_prg_executable} update --threads 1 --update-DS out/prgs/pangenome.update_DS.zip --denovo-paths out/pandora_discover_out/denovo_paths.txt --output-prefix out/updated_prgs/pangenome_updated
echo "Running ${pandora_executable} index on updated PRGs"
"${pandora_executable}" index --threads 1 out/updated_prgs/pangenome_updated.prg.fa
echo "Running ${pandora_executable} compare"
"${pandora_executable}" compare --threads 1 --genotype -o out/output_toy_example_with_denovo --genome-size 700 --min-abs-gene-coverage 0 --min-rel-gene-coverage 0 --max-rel-gene-coverage 1000 out/updated_prgs/pangenome_updated.prg.fa.panidx.zip reads/read_index.tsv
"${pandora_executable}" compare --threads 1 --genotype -o out/output_toy_example_with_denovo --no-gene-coverage-filtering out/updated_prgs/pangenome_updated.prg.fa.panidx.zip reads/read_index.tsv
echo "Running pandora with denovo - done!"

# first compare non-zip files
Expand Down
1 change: 1 addition & 0 deletions include/compare_main.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ struct CompareOptions {
float min_relative_gene_coverage { 0.05 };
float max_relative_gene_coverage { 100 };
float min_gene_coverage_proportion { 0.8 };
bool no_gene_coverage_filtering { false };
bool binomial { false };
bool do_not_auto_update_params { false };
uint32_t max_covg { 300 };
Expand Down
1 change: 1 addition & 0 deletions include/denovo_discovery/discover_main.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ struct DiscoverOptions {
float min_relative_gene_coverage { 0.05 };
float max_relative_gene_coverage { 100 };
float min_gene_coverage_proportion { 0.8 };
bool no_gene_coverage_filtering { false };
uint32_t min_cluster_size { 10 };
uint32_t max_num_kmers_to_avg { 100 };
bool keep_extra_debugging_files { false };
Expand Down
2 changes: 1 addition & 1 deletion include/localPRG.h
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ class LocalPRG {
std::vector<LocalNodePtr>&, const uint32_t, const bool, const uint32_t,
const uint32_t& max_num_kmers_to_average, const uint32_t& sample_id,
float min_absolute_gene_coverage, float min_relative_gene_coverage,
float max_relative_gene_coverage, float min_gene_coverage_proportion) const;
float max_relative_gene_coverage, float min_gene_coverage_proportion, bool no_gene_coverage_filtering) const;
std::vector<LocalNodePtr> get_valid_vcf_reference(const std::string&) const;

void add_variants_to_vcf(VCF&, pangenome::NodePtr, const std::string&,
Expand Down
1 change: 1 addition & 0 deletions include/map_main.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ struct MapOptions {
float min_relative_gene_coverage { 0.05 };
float max_relative_gene_coverage { 100 };
float min_gene_coverage_proportion { 0.8 };
bool no_gene_coverage_filtering { false };
bool genotype { false };
bool local_genotype { false };
bool snps_only { false };
Expand Down
11 changes: 10 additions & 1 deletion src/compare_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,14 @@ void setup_compare_subcommand(CLI::App& app)
->type_name("FLOAT")
->group("Filtering");

compare_subcmd
->add_flag(
"--no-gene-coverage-filtering", opt->no_gene_coverage_filtering,
"Do not filter genes based on their coverage, effectively ignoring params "
"--min-abs-gene-coverage, --min-rel-gene-coverage, --max-rel-gene-coverage and --min-gene-coverage-proportion. "
"This is useful if you are not using read datasets.")
->group("Filtering");

description = "Add extra step to carefully genotype sites.";
auto* gt_opt = compare_subcmd->add_flag("--genotype", opt->genotype, description)
->group("Consensus/Variant Calling");
Expand Down Expand Up @@ -347,7 +355,8 @@ int pandora_compare(CompareOptions& opt)
local_prg->add_consensus_path_to_fastaq(consensus_fq, c->second, kmp, lmp,
index.get_window_size(), opt.binomial, covg, opt.max_num_kmers_to_avg, 0,
opt.min_absolute_gene_coverage, opt.min_relative_gene_coverage,
opt.max_relative_gene_coverage, opt.min_gene_coverage_proportion);
opt.max_relative_gene_coverage, opt.min_gene_coverage_proportion,
opt.no_gene_coverage_filtering);

if (kmp.empty()) {
c = pangraph_sample->remove_node(c->second);
Expand Down
11 changes: 10 additions & 1 deletion src/denovo_discovery/discover_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,14 @@ void setup_discover_subcommand(CLI::App& app)
->type_name("FLOAT")
->group("Filtering");

discover_subcmd
->add_flag(
"--no-gene-coverage-filtering", opt->no_gene_coverage_filtering,
"Do not filter genes based on their coverage, effectively ignoring params "
"--min-abs-gene-coverage, --min-rel-gene-coverage, --max-rel-gene-coverage and --min-gene-coverage-proportion. "
"This is useful if you are not using read datasets.")
->group("Filtering");

description
= "Minimum size of a cluster of hits between a read and a loci to consider "
"the loci present";
Expand Down Expand Up @@ -305,7 +313,8 @@ void pandora_discover_core(const SampleData& sample, Index &index, DiscoverOptio
pangraph_node, kmp, lmp, index.get_window_size(), opt.binomial, covg,
opt.max_num_kmers_to_avg, 0,
opt.min_absolute_gene_coverage, opt.min_relative_gene_coverage,
opt.max_relative_gene_coverage, opt.min_gene_coverage_proportion);
opt.max_relative_gene_coverage, opt.min_gene_coverage_proportion,
opt.no_gene_coverage_filtering);

if (kmp.empty()) {
// mark the node as to remove
Expand Down
101 changes: 52 additions & 49 deletions src/localPRG.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1699,7 +1699,8 @@ void LocalPRG::add_consensus_path_to_fastaq(Fastaq& output_fq, pangenome::NodePt
const bool bin, const uint32_t global_covg,
const uint32_t& max_num_kmers_to_average, const uint32_t& sample_id,
float min_absolute_gene_coverage, float min_relative_gene_coverage,
float max_relative_gene_coverage, float min_gene_coverage_proportion) const
float max_relative_gene_coverage, float min_gene_coverage_proportion,
bool no_gene_coverage_filtering) const
{
if (pnode->covg == 0) {
BOOST_LOG_TRIVIAL(warning) << "Node " << pnode->get_name() << " has no reads";
Expand All @@ -1724,57 +1725,59 @@ void LocalPRG::add_consensus_path_to_fastaq(Fastaq& output_fq, pangenome::NodePt
BOOST_LOG_TRIVIAL(debug) << "Found global coverage " << global_covg
<< " and path mean " << mean_covg;

// apply the coverage proportion filter
uint32_t amount_of_bases_with_coverage = std::count_if(covgs.begin(), covgs.end(),
[](uint32_t covg){return covg > 0;});
float coverage_proportion = ((float)amount_of_bases_with_coverage) / covgs.size();
const bool coverage_proportion_is_too_low = coverage_proportion < min_gene_coverage_proportion;
if (coverage_proportion_is_too_low) {
BOOST_LOG_TRIVIAL(warning)
<< "Filtering out gene " << name << " due to "
<< "coverage proportion (" << coverage_proportion << ") "
<< "being too low, less than the --min-gene-coverage-proportion parameter ("
<< min_gene_coverage_proportion << ")";
kmp.clear();
return;
}
if (!no_gene_coverage_filtering) {
// apply the coverage proportion filter
uint32_t amount_of_bases_with_coverage = std::count_if(covgs.begin(), covgs.end(),
[](uint32_t covg){return covg > 0;});
float coverage_proportion = ((float)amount_of_bases_with_coverage) / covgs.size();
const bool coverage_proportion_is_too_low = coverage_proportion < min_gene_coverage_proportion;
if (coverage_proportion_is_too_low) {
BOOST_LOG_TRIVIAL(warning)
<< "Filtering out gene " << name << " due to "
<< "coverage proportion (" << coverage_proportion << ") "
<< "being too low, less than the --min-gene-coverage-proportion parameter ("
<< min_gene_coverage_proportion << ")";
kmp.clear();
return;
}

// apply the min/max coverage filters
if (mean_covg < min_absolute_gene_coverage) {
BOOST_LOG_TRIVIAL(warning)
<< "Filtering out gene " << name << " due to "
<< "mean coverage (" << mean_covg << ") "
<< "being too low, less than the --min-abs-gene-coverage parameter (" << min_absolute_gene_coverage << ")";
kmp.clear();
return;
}
// apply the min/max coverage filters
if (mean_covg < min_absolute_gene_coverage) {
BOOST_LOG_TRIVIAL(warning)
<< "Filtering out gene " << name << " due to "
<< "mean coverage (" << mean_covg << ") "
<< "being too low, less than the --min-abs-gene-coverage parameter (" << min_absolute_gene_coverage << ")";
kmp.clear();
return;
}

if (mean_covg < min_relative_gene_coverage*global_covg) {
BOOST_LOG_TRIVIAL(warning)
<< "Filtering out gene " << name << " due to "
<< "mean coverage (" << mean_covg << ") "
<< "being too low, less than the "
<< "--min-rel-gene-coverage * global coverage ("
<< min_relative_gene_coverage << " * " << global_covg << " = "
<< min_relative_gene_coverage * global_covg << "). "
<< "Is global coverage very different from the expected (too low/high)? "
<< "Try setting a better genome length (see --genome-size param).";
kmp.clear();
return;
}
if (mean_covg < min_relative_gene_coverage*global_covg) {
BOOST_LOG_TRIVIAL(warning)
<< "Filtering out gene " << name << " due to "
<< "mean coverage (" << mean_covg << ") "
<< "being too low, less than the "
<< "--min-rel-gene-coverage * global coverage ("
<< min_relative_gene_coverage << " * " << global_covg << " = "
<< min_relative_gene_coverage * global_covg << "). "
<< "Is global coverage very different from the expected (too low/high)? "
<< "Try setting a better genome length (see --genome-size param).";
kmp.clear();
return;
}

if (mean_covg > max_relative_gene_coverage*global_covg) {
BOOST_LOG_TRIVIAL(warning)
<< "Filtering out gene " << name << " due to "
<< "mean coverage (" << mean_covg << ") "
<< "being too high, larger than the "
<< "--max-rel-gene-coverage * global coverage ("
<< max_relative_gene_coverage << " * " << global_covg << " = "
<< max_relative_gene_coverage * global_covg << "). "
<< "Is global coverage very different from the expected (too low/high)? "
<< "Try setting a better genome length (see --genome-size param).";
kmp.clear();
return;
if (mean_covg > max_relative_gene_coverage*global_covg) {
BOOST_LOG_TRIVIAL(warning)
<< "Filtering out gene " << name << " due to "
<< "mean coverage (" << mean_covg << ") "
<< "being too high, larger than the "
<< "--max-rel-gene-coverage * global coverage ("
<< max_relative_gene_coverage << " * " << global_covg << " = "
<< max_relative_gene_coverage * global_covg << "). "
<< "Is global coverage very different from the expected (too low/high)? "
<< "Try setting a better genome length (see --genome-size param).";
kmp.clear();
return;
}
}

std::string fq_name = pnode->get_name();
Expand Down
11 changes: 10 additions & 1 deletion src/map_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,14 @@ void setup_map_subcommand(CLI::App& app)
->type_name("FLOAT")
->group("Filtering");

map_subcmd
->add_flag(
"--no-gene-coverage-filtering", opt->no_gene_coverage_filtering,
"Do not filter genes based on their coverage, effectively ignoring params "
"--min-abs-gene-coverage, --min-rel-gene-coverage, --max-rel-gene-coverage and --min-gene-coverage-proportion. "
"This is useful if you are not using read datasets.")
->group("Filtering");

description = "Add extra step to carefully genotype sites.";
auto* gt_opt = map_subcmd->add_flag("--genotype", opt->genotype, description)
->group("Consensus/Variant Calling");
Expand Down Expand Up @@ -389,7 +397,8 @@ int pandora_map(MapOptions& opt)
prg->add_consensus_path_to_fastaq(consensus_fq, pangraph_node, kmp, lmp,
index.get_window_size(), opt.binomial, covg, opt.max_num_kmers_to_avg, 0,
opt.min_absolute_gene_coverage, opt.min_relative_gene_coverage,
opt.max_relative_gene_coverage, opt.min_gene_coverage_proportion);
opt.max_relative_gene_coverage, opt.min_gene_coverage_proportion,
opt.no_gene_coverage_filtering);

if (kmp.empty()) {
#pragma omp critical(nodes_to_remove)
Expand Down
Loading