Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Index optimisation #342

Merged
merged 26 commits into from
Jul 20, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
4f02ec8
Removing attribute MiniRecord::path
leoisl May 10, 2023
19a547e
Improving Index loading
leoisl May 10, 2023
ea34940
Fixing segfault bug with getting the wrong pangraph node
leoisl May 19, 2023
b4ceec3
Now outputting unmapped reads back in SAM
leoisl May 19, 2023
a2f9c42
Removing --clean command line option
leoisl May 22, 2023
c2caae0
Adding CLI option --rng-seed
leoisl May 22, 2023
1fd3b98
Improving MinimizerHitClusters to accept RNG seeds
leoisl May 22, 2023
9bbf987
Cleaning up unused code
leoisl May 22, 2023
34f030c
Improving position and section of --rng-seed parameter
leoisl May 22, 2023
29d9009
random_main now using std::mt19937 as random number generator
leoisl May 23, 2023
28c2e42
estimate_parameters() now does not update error rate nor bin model
leoisl May 23, 2023
b19d268
Removing unused argument genome_size from get_minimizer_hit_clusters()
leoisl May 23, 2023
cf06998
Adding debugging to gitignore
leoisl Jun 8, 2023
72cd6a0
Merge branch 'not_keeping_read_info_in_compare' into index_optimisation
leoisl Jun 8, 2023
7f55daa
Fixing issue with index loading introduced in the previous commit
leoisl Jun 12, 2023
f492729
Adding massif profilings
leoisl Jun 9, 2023
27629b3
Finishing properly removing Path from hits
leoisl Jun 12, 2023
1c53eb7
Misc improvements
leoisl Jun 12, 2023
83260a6
Adding massif profiling
leoisl Jun 13, 2023
d6b4373
POS now is always .
leoisl Jun 13, 2023
6055845
POS is now always 0
leoisl Jun 15, 2023
75df5e1
Fixing SAM file headers
leoisl Jun 15, 2023
c277224
Removing if initialisation to not require a C++17 compiler
leoisl Jun 15, 2023
5ed37b3
Merge remote-tracking branch 'origin/dev' into index_optimisation
leoisl Jul 19, 2023
ac3fa74
Removing massif profiling files
leoisl Jul 19, 2023
97a76ea
Removing code to fix error rate if user provided bad arguments
leoisl Jul 19, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -108,3 +108,5 @@ example/make_prg*
.ipynb_checkpoints
scripts/compare_pandora_results/pandora_multisample.matrix*
!test/test_cases/sample_example/pangenome.prg.fa

/debugging/
1 change: 0 additions & 1 deletion include/compare_main.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@ struct CompareOptions {
uint32_t max_diff { 250 };
bool output_vcf { false };
bool illumina { false };
bool clean { false };
float min_absolute_gene_coverage { 3.0 };
float min_relative_gene_coverage { 0.05 };
float max_relative_gene_coverage { 100 };
Expand Down
2 changes: 1 addition & 1 deletion include/fastaq.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ struct Fastaq {
const uint_least16_t, const std::string header = "");

void add_entry(
const std::string&, const std::string&, const std::string header = "");
const std::string&, const std::string&, const std::string &header = "");

void clear();

Expand Down
1 change: 0 additions & 1 deletion include/minihit.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ struct MinimizerHit {
inline uint32_t get_read_id() const { return read_id; }
inline uint32_t get_read_start_position() const { return read_start_position; }
inline uint32_t get_prg_id() const { return minimizer_from_PRG.prg_id; }
inline const prg::Path& get_prg_path() const { return minimizer_from_PRG.path; }
inline uint32_t get_kmer_node_id() const { return minimizer_from_PRG.knode_id; }
inline bool get_prg_kmer_strand() const { return minimizer_from_PRG.strand; }
inline bool same_strands() const
Expand Down
3 changes: 2 additions & 1 deletion include/minihit_clusters.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@
class MinimizerHitClusters {
public:
explicit MinimizerHitClusters(const uint32_t rng_seed) : insertion_phase(true), clusters(){
if (bool deterministic_run = rng_seed > 0; deterministic_run) {
bool deterministic_run = rng_seed > 0;
if (deterministic_run) {
rng = std::mt19937(rng_seed);
} else{
rng = std::mt19937(std::random_device()());
Expand Down
2 changes: 1 addition & 1 deletion include/minimizermatch_file.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ class MinimizerMatchFile : public GenericFile {
public:
MinimizerMatchFile(const fs::path &filepath, const std::vector<std::string> &prg_names,
bool is_fake_file = false);
void write_hits(const Seq &seq, const MinimizerHits &hits);
void write_hits(const Seq &seq, const MinimizerHits &hits, const uint32_t k);
};


Expand Down
1 change: 0 additions & 1 deletion include/minirecord.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
// Minimizer Record - stores information about a minimizer
struct MiniRecord {
uint32_t prg_id; // prg id of the minimizer
prg::Path path; // kmer path of the minimizer in the PRG
uint32_t knode_id; // node id of this record in the KmerGraph
bool strand; // strand of this kmer

Expand Down
5 changes: 3 additions & 2 deletions include/sam_file.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ class SAMFile {
const std::vector<uint32_t> &prg_lengths;
std::set<uint32_t> prg_ids_that_we_mapped_to;
const uint32_t flank_size;
const uint32_t k;

std::vector<bool> get_mapped_positions_bitset(const Seq &seq, const MinimizerHits &cluster) const;
Cigar get_cigar(const std::vector<bool> &mapped_positions_bitset) const;
Expand All @@ -68,10 +69,10 @@ class SAMFile {
void create_final_sam_file();
public:
SAMFile(const fs::path &filepath, const std::vector<std::string> &prg_names,
const std::vector<uint32_t> &prg_lengths, const uint32_t flank_size)
const std::vector<uint32_t> &prg_lengths, const uint32_t flank_size, const uint32_t k)
:filepath(filepath), tmp_filepath(filepath.string() + ".tmp"),
tmp_sam_file(new GenericFile(tmp_filepath)), prg_names(prg_names),
prg_lengths(prg_lengths), flank_size(flank_size){}
prg_lengths(prg_lengths), flank_size(flank_size), k(k) {}
virtual ~SAMFile();

// Note: these two methods could be one, e.g. write_sam_record() could generate the
Expand Down
8 changes: 4 additions & 4 deletions include/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ void define_clusters(
MinimizerHitClusters& clusters_of_hits,
const std::vector<uint32_t> &prg_min_path_lengths,
const std::vector<std::string> &prg_names,
std::shared_ptr<MinimizerHits> minimizer_hits, const int max_diff,
std::shared_ptr<MinimizerHits> &minimizer_hits, const int max_diff,
const float& fraction_kmers_required_for_cluster, const uint32_t min_cluster_size,
const uint32_t expected_number_kmers_in_read_sketch,
ClusterDefFile& cluster_def_file);
Expand All @@ -96,8 +96,8 @@ MinimizerHitClusters get_minimizer_hit_clusters(
const Seq &seq,
const std::vector<uint32_t> &prg_min_path_lengths,
const std::vector<std::string> &prg_names,
std::shared_ptr<MinimizerHits> minimizer_hits,
std::shared_ptr<pangenome::Graph> pangraph, const int max_diff,
std::shared_ptr<MinimizerHits> &minimizer_hits,
const int max_diff,
const float& fraction_kmers_required_for_cluster,
ClusterDefFile &cluster_def_file,
ClusterFilterFile &cluster_filter_file,
Expand All @@ -106,7 +106,7 @@ MinimizerHitClusters get_minimizer_hit_clusters(
const uint32_t rng_seed);

uint32_t pangraph_from_read_file(const SampleData& sample,
std::shared_ptr<pangenome::Graph> pangraph, Index &index,
std::shared_ptr<pangenome::Graph> &pangraph, Index &index,
const int max_diff, const float& e_rate,
const fs::path& sample_outdir, const uint32_t min_cluster_size = 10,
const uint32_t genome_size = 5000000, const uint32_t max_covg = 300,
Expand Down
12 changes: 0 additions & 12 deletions src/compare_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -240,18 +240,6 @@ int pandora_compare(CompareOptions& opt)
}
boost::log::core::get()->set_filter(boost::log::trivial::severity >= log_level);

// =========
// todo: this all seems strange
if (opt.error_rate < 0.01) {
opt.illumina = true;
}
if (opt.error_rate > 0.05 and opt.illumina) {
opt.error_rate = 0.001;
}
if (opt.illumina and opt.error_rate > 0.1) {
opt.error_rate = 0.001;
}
// ==========
if (opt.genotype) {
opt.output_vcf = true;
}
Expand Down
13 changes: 0 additions & 13 deletions src/denovo_discovery/discover_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -411,19 +411,6 @@ int pandora_discover(DiscoverOptions& opt)
}
boost::log::core::get()->set_filter(boost::log::trivial::severity >= log_level);

// =========
// todo: this all seems strange
if (opt.error_rate < 0.01) {
opt.illumina = true;
}
if (opt.error_rate > 0.05 and opt.illumina) {
opt.error_rate = 0.001;
}
if (opt.illumina and opt.error_rate > 0.1) {
opt.error_rate = 0.001;
}
// ==========

BOOST_LOG_TRIVIAL(info) << "Loading Index...";
Index index = Index::load(opt.index_file.string());
BOOST_LOG_TRIVIAL(info) << "Index loaded successfully!";
Expand Down
2 changes: 1 addition & 1 deletion src/fastaq.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ void Fastaq::add_entry(const std::string& name, const std::string& sequence,
}

void Fastaq::add_entry(
const std::string& name, const std::string& sequence, const std::string header)
const std::string& name, const std::string& sequence, const std::string &header)
{
const bool fasta_entry_has_a_name = name.length() > 0;
if (!fasta_entry_has_a_name) {
Expand Down
9 changes: 2 additions & 7 deletions src/index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -146,13 +146,8 @@ void Index::load_minhash() {
zip_in.ignore(1, '\t');
zip_in >> size;
auto* vmr = new std::vector<MiniRecord>;
if (minhash.find(key) != minhash.end()) {
vmr = minhash[key];
vmr->reserve(vmr->size() + size);
} else {
vmr->reserve(size);
minhash[key] = vmr;
}
vmr->reserve(size);
minhash[key] = vmr;
zip_in.ignore(1, '\t');
} else if (c == EOF) {
break;
Expand Down
12 changes: 0 additions & 12 deletions src/map_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -248,18 +248,6 @@ int pandora_map(MapOptions& opt)
}
boost::log::core::get()->set_filter(boost::log::trivial::severity >= log_level);

// =========
// todo: this all seems strange
if (opt.error_rate < 0.01) {
opt.illumina = true;
}
if (opt.error_rate > 0.05 and opt.illumina) {
opt.error_rate = 0.001;
}
if (opt.illumina and opt.error_rate > 0.1) {
opt.error_rate = 0.001;
}
// ==========
if (opt.genotype) {
opt.output_vcf = true;
}
Expand Down
20 changes: 6 additions & 14 deletions src/minihit.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,7 @@ MinimizerHit::MinimizerHit(const uint32_t i, const Minimizer& minimizer_from_rea
, read_start_position { minimizer_from_read.pos_of_kmer_in_read.start }
, read_strand { minimizer_from_read.is_forward_strand }
, minimizer_from_PRG { minimizer_from_PRG }
{
const bool both_minimizers_have_same_length
= minimizer_from_read.pos_of_kmer_in_read.length
== minimizer_from_PRG.path.length();
if (!both_minimizers_have_same_length) {
fatal_error("Error when storing minimizers: minimizer from read/sequence "
"and from PRG have different lengths");
}
}
{ }

bool MinimizerHit::operator==(const MinimizerHit& y) const
{
Expand All @@ -31,7 +23,7 @@ bool MinimizerHit::operator==(const MinimizerHit& y) const
if (get_prg_id() != y.get_prg_id()) {
return false;
}
if (!(get_prg_path() == y.get_prg_path())) {
if (!(get_kmer_node_id() == y.get_kmer_node_id())) {
return false;
}
if (same_strands() != y.same_strands()) {
Expand Down Expand Up @@ -74,11 +66,11 @@ bool MinimizerHit::operator<(const MinimizerHit& y) const
return false;
}

// then by position on target string
if (get_prg_path() < y.get_prg_path()) {
// then by position on the DAG
if (get_kmer_node_id() < y.get_kmer_node_id()) {
return true;
}
if (y.get_prg_path() < get_prg_path()) {
if (y.get_kmer_node_id() < get_kmer_node_id()) {
return false;
}

Expand All @@ -88,7 +80,7 @@ bool MinimizerHit::operator<(const MinimizerHit& y) const
std::ostream& operator<<(std::ostream& out, MinimizerHit const& m)
{
out << "(" << m.get_read_id() << ", " << m.get_read_start_position() << ", "
<< m.get_prg_id() << ", " << m.get_prg_path() << ", " << m.same_strands() << ", "
<< m.get_prg_id() << ", " << ", " << m.same_strands() << ", "
<< m.get_kmer_node_id() << ")";
return out;
}
4 changes: 2 additions & 2 deletions src/minihits.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,10 @@ bool pCompReadPositionFirst::operator()(const MinimizerHitPtr& lhs, const Minimi
if (rhs->get_prg_id() < lhs->get_prg_id()) {
return false;
}
if (lhs->get_prg_path() < rhs->get_prg_path()) {
if (lhs->get_kmer_node_id() < rhs->get_kmer_node_id()) {
return true;
}
if (rhs->get_prg_path() < lhs->get_prg_path()) {
if (rhs->get_kmer_node_id() < lhs->get_kmer_node_id()) {
return false;
}
return false;
Expand Down
9 changes: 4 additions & 5 deletions src/minimizermatch_file.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,17 @@ MinimizerMatchFile::MinimizerMatchFile(const fs::path &filepath,
const std::vector<std::string> &prg_names,
bool is_fake_file) :
GenericFile(filepath, is_fake_file), prg_names(prg_names) {
(*this) << "kmer\tread\tread_start\tread_end\tread_strand\tprg\tprg_path\tprg_strand\n";
(*this) << "kmer\tread\tread_start\tread_end\tread_strand\tprg\tprg_strand\n";
}

void MinimizerMatchFile::write_hits(const Seq &seq, const MinimizerHits &hits) {
void MinimizerMatchFile::write_hits(const Seq &seq, const MinimizerHits &hits, const uint32_t k) {
std::vector<MinimizerHitPtr> sorted_hits;
sorted_hits.insert(sorted_hits.end(), hits.begin(), hits.end());
std::sort(sorted_hits.begin(), sorted_hits.end(), pCompReadPositionFirst());
for (const MinimizerHitPtr &hit : hits) {
const uint32_t read_start_position = hit->get_read_start_position();
const uint32_t read_end_position = hit->get_read_start_position()+hit->get_prg_path().length();
std::string kmer = seq.full_seq.substr(read_start_position, hit->get_prg_path().length());
const uint32_t read_end_position = read_start_position + k;
std::string kmer = seq.full_seq.substr(read_start_position, k);
if (hit->read_strand == 0) {
kmer = reverse_complement(kmer);
}
Expand All @@ -28,7 +28,6 @@ void MinimizerMatchFile::write_hits(const Seq &seq, const MinimizerHits &hits) {
<< read_end_position << "\t"
<< "-+"[hit->read_strand] << "\t"
<< prg_name << "\t"
<< hit->get_prg_path() << "\t"
<< "-+"[hit->get_prg_kmer_strand()] << "\n";
}
}
7 changes: 2 additions & 5 deletions src/minirecord.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ MiniRecord::MiniRecord() {};
MiniRecord::MiniRecord(
const uint32_t p, const prg::Path &q, const uint32_t n, const bool c)
: prg_id(p)
, path(q)
, knode_id(n)
, strand(c) {};

Expand All @@ -18,7 +17,7 @@ bool MiniRecord::operator==(const MiniRecord& y) const
if (prg_id != y.prg_id) {
return false;
}
if (!(path == y.path)) {
if (knode_id != y.knode_id) {
return false;
}
if (strand != y.strand) {
Expand All @@ -29,7 +28,7 @@ bool MiniRecord::operator==(const MiniRecord& y) const

std::ostream& operator<<(std::ostream& out, MiniRecord const& m)
{
out << "(" << m.prg_id << ", " << m.path << ", " << m.knode_id << ", " << m.strand
out << "(" << m.prg_id << ", " << m.knode_id << ", " << m.strand
<< ")";
return out;
}
Expand All @@ -39,8 +38,6 @@ std::istream& operator>>(std::istream& in, MiniRecord& m)
in.ignore(1, '(');
in >> m.prg_id;
in.ignore(2, ' ');
in >> m.path;
in.ignore(2, ' ');
in >> m.knode_id;
in.ignore(2, ' ');
in >> m.strand;
Expand Down
2 changes: 1 addition & 1 deletion src/pangenome/pangraph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ std::vector<LocalNodePtr> pangenome::Graph::infer_node_vcf_reference_path(
const auto& prg = *prg_ptr;
if (vcf_refs.find(prg.name) != vcf_refs.end()) {
const auto& vcf_reference_sequence = vcf_refs.at(prg.name);
const auto reference_path = prg.get_valid_vcf_reference(vcf_reference_sequence);
auto reference_path = prg.get_valid_vcf_reference(vcf_reference_sequence);
if (!reference_path.empty())
return reference_path;
}
Expand Down
21 changes: 6 additions & 15 deletions src/sam_file.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,16 +15,13 @@ std::string SAMFile::get_header() const {
}
ss << "@PG\tID:pandora\tPN:pandora\tVN:" << PANDORA_VERSION
<< "\tCL: " << PandoraGlobals::command_line << "\n";
ss << "@CO\tThe reference length (in @SQ header lines) and the POS field refer to "
"the string representation of the PRGs\n";
ss << "@CO\tThe reference length (LN in @SQ header lines) is the length of the string representation of the PRGs\n";
ss << "@CO\tLF: left flank sequence, the sequence before the first "
"mapped kmer, soft-clipped, max " << flank_size << " bps\n";
ss << "@CO\tRF: right flank sequence, the sequence after the last "
"mapped kmer, soft-clipped, max " << flank_size << " bps\n";
ss << "@CO\tMP: number of minimizer matches on the plus strand in the cluster of hits\n";
ss << "@CO\tMM: number of minimizer matches on the minus strand in the cluster of hits\n";
ss << "@CO\tPP: Prg Paths of the cluster of hits: the PRG path of each "
"hit in considered cluster of hits\n";
ss << "@CO\tNM: Total number of mismatches in the quasi-alignment\n";
ss << "@CO\tAS: Alignment score (number of matches)\n";
ss << "@CO\tnn: Number of ambiguous bases in the quasi-alignment\n";
Expand Down Expand Up @@ -56,7 +53,7 @@ std::vector<bool> SAMFile::get_mapped_positions_bitset(const Seq &seq, const Min
std::vector<bool> mapped_positions_bitset(seq.full_seq.size(), false);
for (const MinimizerHitPtr &hit : cluster) {
const uint32_t read_start_position = hit->get_read_start_position();
const uint32_t read_end_position = hit->get_read_start_position()+hit->get_prg_path().length();
const uint32_t read_end_position = hit->get_read_start_position()+k;
for (uint32_t pos = read_start_position; pos < read_end_position; ++pos) {
mapped_positions_bitset[pos] = true;
}
Expand Down Expand Up @@ -134,8 +131,6 @@ std::string SAMFile::get_sam_record_from_hit_cluster(
const MinimizerHitPtr first_hit = *(cluster.begin());
const std::string &prg_name = prg_names[first_hit->get_prg_id()];

const uint32_t alignment_start = first_hit->get_prg_path().begin()->start + 1;

const std::vector<bool> mapped_positions_bitset =
get_mapped_positions_bitset(seq, cluster);
const Cigar cigar = get_cigar(mapped_positions_bitset);
Expand Down Expand Up @@ -163,11 +158,6 @@ std::string SAMFile::get_sam_record_from_hit_cluster(
std::swap(left_flank, right_flank);
}

std::stringstream cluster_of_hits_prg_paths_ss;
for (const MinimizerHitPtr &hit : cluster) {
cluster_of_hits_prg_paths_ss << hit->get_prg_path() << "->";
}

auto number_ambiguous_bases = std::count_if(
segment_sequence.begin(), segment_sequence.end(),
[](char base) -> bool {
Expand All @@ -189,7 +179,8 @@ std::string SAMFile::get_sam_record_from_hit_cluster(
ss << seq.name << "\t"
<< flag << "\t"
<< prg_name << "\t"
<< alignment_start << "\t"
<< "0\t"
<< "." << "\t"
<< "255\t"
<< cigar << "\t"
<< "*\t0\t0\t"
Expand All @@ -202,8 +193,8 @@ std::string SAMFile::get_sam_record_from_hit_cluster(
<< "MP:i:" << plus_strand_count << "\t"
<< "MM:i:" << minus_strand_count << "\t"
<< "LF:Z:" << left_flank << "\t"
<< "RF:Z:" << right_flank << "\t"
<< "PP:Z:" << cluster_of_hits_prg_paths_ss.str() << "\n";
<< "RF:Z:" << right_flank << "\n";
;

#pragma omp critical(prg_ids_that_we_mapped_to)
{
Expand Down
Loading
Loading