Skip to content

Commit

Permalink
Merge pull request #4436 from vgteam/fix-chunk-gbwt
Browse files Browse the repository at this point in the history
Fix chunk extraction and subsequent GBWT generation
  • Loading branch information
adamnovak authored Nov 7, 2024
2 parents 519656f + a5579c1 commit 62ccb55
Show file tree
Hide file tree
Showing 11 changed files with 220 additions and 316 deletions.
2 changes: 1 addition & 1 deletion deps/gbwtgraph
Submodule gbwtgraph updated 4 files
+1 −1 Makefile
+12 −2 README.md
+344 −0 src/gbz_extract.cpp
+30 −21 src/utils.cpp
160 changes: 97 additions & 63 deletions src/algorithms/subgraph.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#include "subgraph.hpp"
#include "../path.hpp"
#include "../crash.hpp"

namespace vg {
namespace algorithms {
Expand Down Expand Up @@ -290,83 +291,116 @@ void extract_path_range(const PathPositionHandleGraph& source, path_handle_t pat
}
}

/// add subpaths to the subgraph, providing a concatenation of subpaths that are discontiguous over the subgraph
/// based on their order in the path position index provided by the source graph
/// will clear any path found in both graphs before writing the new steps into it
/// if subpath_naming is true, a suffix will be added to each path in the subgraph denoting its offset
/// in the source graph (unless the subpath was not cut up at all)
void add_subpaths_to_subgraph(const PathPositionHandleGraph& source, MutablePathHandleGraph& subgraph,
bool subpath_naming) {
std::unordered_map<std::string, std::map<uint64_t, handle_t> > subpaths;
void add_subpaths_to_subgraph(const PathPositionHandleGraph& source, MutablePathHandleGraph& subgraph) {

// We want to organize all visits by base path. This key type holds the
// sense, sample and locus names, haplotype, and phase block.
using base_metadata_t = std::tuple<PathSense, string, string, size_t, size_t>;

// This stores, for each source graph base path, for each start offset, the handle at that offset on the path.
std::unordered_map<base_metadata_t, std::map<uint64_t, handle_t> > subpaths;

// This stores information about base paths that don't have subranges, and
// their full lengths and circularity flags, so we can avoid generating new
// subrange metadata when we just have all of a path.
std::unordered_map<base_metadata_t, std::pair<size_t, bool>> full_path_info;

subgraph.for_each_handle([&](const handle_t& h) {
handlegraph::nid_t id = subgraph.get_id(h);
if (source.has_node(id)) {
handle_t handle = source.get_handle(id);
source.for_each_step_position_on_handle(handle, [&](const step_handle_t& step, const bool& is_rev, const uint64_t& pos) {
path_handle_t path = source.get_path_handle_of_step(step);
std::string path_name = source.get_path_name(path);
subpaths[path_name][pos] = is_rev ? subgraph.flip(h) : h;
// Figure out the base path this visit is on
base_metadata_t key = {source.get_sense(path), source.get_sample_name(path), source.get_locus_name(path), source.get_haplotype(path), source.get_phase_block(path)};
// Figure out the subrange of the base path it is relative to
subrange_t path_subrange = source.get_subrange(path);
uint64_t visit_offset = pos;
if (path_subrange != PathMetadata::NO_SUBRANGE) {
// If we have the position relative to a subrange, adjust by that subrange's offset.
visit_offset += path_subrange.first;
}
subpaths[key][visit_offset] = is_rev ? subgraph.flip(h) : h;

if (path_subrange == PathMetadata::NO_SUBRANGE) {
// There's no subrange set, so this path is full-length in the source graph.
// See if we know of this path as a full-length path or not
auto it = full_path_info.find(key);
if (it == full_path_info.end()) {
// We haven't recorded its length and circularity yet, so do it.
full_path_info.emplace_hint(it, key, std::make_pair(source.get_path_length(path), source.get_is_circular(path)));
}
}
return true;
});
}
});

function<path_handle_t(const string&, bool, size_t)> new_subpath =
[&subgraph](const string& path_name, bool is_circular, size_t subpath_offset) {
PathSense sense;
string sample;
string locus;
size_t haplotype;
size_t phase_block;
subrange_t subrange;
PathMetadata::parse_path_name(path_name, sense, sample, locus, haplotype, phase_block, subrange);
if (subrange == PathMetadata::NO_SUBRANGE) {
subrange.first = subpath_offset;
} else {
subrange.first += subpath_offset;
}
subrange.first = subpath_offset;
subrange.second = PathMetadata::NO_END_POSITION;
string subpath_name = PathMetadata::create_path_name(sense, sample, locus, haplotype, phase_block, subrange);
if (subgraph.has_path(subpath_name)) {
subgraph.destroy_path(subgraph.get_path_handle(subpath_name));
}
return subgraph.create_path_handle(subpath_name, is_circular);
};
for (auto& base_and_visits : subpaths) {
// For each base path
const base_metadata_t& base_path_metadata = base_and_visits.first;
const auto& start_to_handle = base_and_visits.second;
// If we didn't put anything in the visit collection, it shouldn't be here.
crash_unless(!start_to_handle.empty());

for (auto& subpath : subpaths) {
const std::string& path_name = subpath.first;
path_handle_t source_path_handle = source.get_path_handle(path_name);
// destroy the path if it exists
if (subgraph.has_path(path_name)) {
subgraph.destroy_path(subgraph.get_path_handle(path_name));
}
// create a new path. give it a subpath name if the flag's on and its smaller than original
path_handle_t path;
if (!subpath_naming || subpath.second.size() == source.get_step_count(source_path_handle) ||
subpath.second.empty()) {
path = subgraph.create_path_handle(path_name, source.get_is_circular(source_path_handle));
} else {
path = new_subpath(path_name, source.get_is_circular(source_path_handle), subpath.second.begin()->first);
}
for (auto p = subpath.second.begin(); p != subpath.second.end(); ++p) {
const handle_t& handle = p->second;
if (p != subpath.second.begin() && subpath_naming) {
auto prev = p;
--prev;
const handle_t& prev_handle = prev->second;
// distance from map
size_t delta = p->first - prev->first;
// what the distance should be if they're contiguous depends on relative orienations
size_t cont_delta = subgraph.get_length(prev_handle);
if (delta != cont_delta) {
// we have a discontinuity! we'll make a new path can continue from there
assert(subgraph.get_step_count(path) > 0);
path = new_subpath(path_name, subgraph.get_is_circular(path), p->first);
// We're going to walk over all the visits and find contiguous runs
auto run_start = start_to_handle.begin();
auto run_end = run_start;
size_t start_coordinate = run_start->first;
while (run_end != start_to_handle.end()) {
// Until we run out of runs
// Figure out where this node ends on the path
size_t stop_coordinate = run_end->first + subgraph.get_length(run_end->second);

// Look ahead
++run_end;

if (run_end != start_to_handle.end() && run_end->first == stop_coordinate) {
// The next visit is still contiguous, so advance.
continue;
}

// Otherwise we've reached a break in continuity. We have a
// contiguous run from run_start to run_end, visiting the subrange
// start_coordinate to stop_coordinate.

// Find out if we cover a full source graph path.
subrange_t run_subrange = {start_coordinate, stop_coordinate};
bool is_circular = false;
if (start_coordinate == 0) {
// We might be a full path
auto found_length_and_circularity = full_path_info.find(base_path_metadata);
if (found_length_and_circularity != full_path_info.end() && found_length_and_circularity->second.first == stop_coordinate) {
// We are a full path
run_subrange = PathMetadata::NO_SUBRANGE;
// We can be circular.
is_circular = found_length_and_circularity->second.second;
}
}
//fill in the path information
subgraph.append_step(path, handle);

// Make a path with all the metadata
path_handle_t new_path = subgraph.create_path(
std::get<0>(base_path_metadata),
std::get<1>(base_path_metadata),
std::get<2>(base_path_metadata),
std::get<3>(base_path_metadata),
std::get<4>(base_path_metadata),
run_subrange,
is_circular
);

for (auto it = run_start; it != run_end; ++it) {
// Copy the path's visits
subgraph.append_step(new_path, it->second);
}

// Set up the next subpath.
// Set where it starts.
run_start = run_end;
if (run_start != start_to_handle.end()) {
// And if it will exist, set its start coordinate.
start_coordinate = run_start->first;
}
}
}
}
Expand Down
16 changes: 9 additions & 7 deletions src/algorithms/subgraph.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,13 +34,15 @@ void extract_id_range(const HandleGraph& source, const nid_t& id1, const nid_t&
/// if end < 0, then it will walk to the end of the path
void extract_path_range(const PathPositionHandleGraph& source, path_handle_t path_handle, int64_t start, int64_t end, MutableHandleGraph& subgraph);

/// add subpaths to the subgraph, providing a concatenation of subpaths that are discontiguous over the subgraph
/// based on their order in the path position index provided by the source graph
/// will clear any path found in both graphs before writing the new steps into it
/// if subpath_naming is true, a suffix will be added to each path in the subgraph denoting its offset
/// in the source graph (unless the subpath was not cut up at all)
void add_subpaths_to_subgraph(const PathPositionHandleGraph& source, MutablePathHandleGraph& subgraph,
bool subpath_naming = false);
/// Add subpaths to the subgraph for all paths visiting its nodes in the source
/// graph.
///
/// Always generates correct path metadata, and a path for each contiguous
/// fragment of any base path. Assumes the source graph does not contain any
/// overlapping path fragments on a given base path, and that the subgraph does
/// not already contain any paths on a base path also present in the source
/// graph.
void add_subpaths_to_subgraph(const PathPositionHandleGraph& source, MutablePathHandleGraph& subgraph);

/// We can accumulate a subgraph without accumulating all the edges between its nodes
/// this helper ensures that we get the full set
Expand Down
Loading

1 comment on commit 62ccb55

@adamnovak
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

vg CI tests complete for merge to master. View the full report here.

16 tests passed, 0 tests failed and 0 tests skipped in 17363 seconds

Please sign in to comment.