Skip to content

Commit

Permalink
Output extracted regions on base path and not subpaths
Browse files Browse the repository at this point in the history
  • Loading branch information
adamnovak committed Feb 10, 2025
1 parent fc9735a commit 8bd10a6
Show file tree
Hide file tree
Showing 5 changed files with 117 additions and 75 deletions.
25 changes: 7 additions & 18 deletions src/chunker.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,13 @@ PathChunker::~PathChunker() {

void PathChunker::extract_subgraph(const Region& region, int64_t context, int64_t length, bool forward_only,
MutablePathMutableHandleGraph& subgraph, Region& out_region) {

// Output region must be empty. We fill it in.
crash_unless(out_region.seq.empty());

// extract our path range into the graph
// TODO: Handle incoming names with subranges when they aren't exactly the names of paths we have.
crash_unless(graph->has_path(region.seq));
path_handle_t path_handle = graph->get_path_handle(region.seq);
step_handle_t start_step = graph->get_step_at_position(path_handle, region.start);
handle_t start_handle = graph->get_handle_of_step(start_step);
Expand Down Expand Up @@ -152,27 +156,12 @@ void PathChunker::extract_subgraph(const Region& region, int64_t context, int64_

// TODO: We assume we actually found some of the target path
crash_unless(min_start != std::numeric_limits<size_t>::max());

if (base_path_subrange != PathMetadata::NO_SUBRANGE) {
// Our input region was on a path with a subrange, so we want our output region to be relative to (and within) that.

// Make sure we have an end
populate_subrange_end(*graph, base_path_subrange, path_handle);

// We can't expand beyond its bounds and still say we pulled a range on it, so restrict to them.
min_start = std::max(min_start, base_path_subrange.first);
max_end = std::min(max_end, base_path_subrange.second);

// Remove offset from containing path subrange
min_start -= base_path_subrange.first;
max_end -= base_path_subrange.first;
}

// We can't represent a region with a 0 end-exclusive coordinate.
crash_unless(max_end != 0);

// Produce the output region. Convert coordinates to be 0-based, end-inclusive.
out_region.seq = region.seq;
// Express the output region against the base path, if different from the target path with its subrange.
out_region.seq = get_path_base_name(*graph, path_handle);
// Convert coordinates to be 0-based, end-inclusive.
out_region.start = min_start;
out_region.end = max_end - 1;
}
Expand Down
14 changes: 11 additions & 3 deletions src/chunker.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,15 @@ class PathChunker {
/** Extract subgraph corresponding to given path region into its
* own vg graph, and send it to out_stream. The boundaries of the
* extracted region of the target path (which can be different because we
* expand context and don't cut nodes) are written to out_region. If the
* target path goes through the extracted region multiple times, only the
* extended bounds of the visit containing the target region are produced.
* expand context and don't cut nodes) are written to out_region.
*
* The contig name of out_region is set to the base path name (without any
* subrange) for the path from region. Output coordinates will be relative
* to this base path.
*
* If the target path goes through the extracted region multiple times,
* only the extended bounds of the visit containing the target region are
* produced.
*
* If forward_only set, context is only expanded in the forward direction
*
Expand Down Expand Up @@ -66,6 +72,8 @@ class PathChunker {
*
* Skips nodes in the ID range that do not actually exist in the source
* graph, or that already exist in the target graph.
*
* Populated out_region's start and end with node IDs.
*/
void extract_id_range(vg::id_t start, vg::id_t end, int64_t context, int64_t length, bool forward_only,
MutablePathMutableHandleGraph& subgraph, Region& out_region);
Expand Down
43 changes: 43 additions & 0 deletions src/path.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include "path.hpp"
#include <vg/io/stream.hpp>
#include "region.hpp"
#include "crash.hpp"
#include <sstream>

// #define debug_simplify
Expand Down Expand Up @@ -2535,6 +2536,48 @@ Alignment alignment_from_path(const HandleGraph& graph, const Path& path) {
return aln;
}

bool find_containing_subpath(const PathPositionHandleGraph& graph, Region& region, path_handle_t& path) {
// We might be asking for a part of a subpath, or a part of a base path.
if (graph.has_path(region.seq)) {
// It's a base path we have all of, or a subpath we have exactly.
path = graph.get_path_handle(region.seq);

if (region.end == -1) {
// Infer the region endpoint from the path
region.end = graph.get_path_length(path) - 1;
}

// The region start point will be 0 if not set.
region.start = max((int64_t)0, region.start);

return true;
} else if (region.start < 0 || region.end < 0) {
return false;
} else {
// Maybe it's a base path and we only have a subpath.
bool found_contained = false;

for_each_subpath_of(graph, region.seq, [&](const path_handle_t& candidate) {
// We should have some subrange if we get here. Also the region coordunates are specified.
subrange_t candidate_subrange = graph.get_subrange(candidate);
crash_unless(candidate_subrange != PathMetadata::NO_SUBRANGE);
if (candidate_subrange.first <= region.start && candidate_subrange.first + candidate_subrange.second > region.end + 1) {
// The subranges are 0-based exclusive and the regions are 0-based inclusive.
// This subrange fully contains this region.
path = candidate;

found_contained = true;
// Use first result
return false;
}
return true;
});

// Return whether we found the containing subpath.
return found_contained;
}
}

bool for_each_subpath_of(const PathPositionHandleGraph& graph, const string& path_name, const std::function<bool(const path_handle_t& path)>& iteratee) {
if (graph.has_path(path_name)) {
// Just look at the full path.
Expand Down
10 changes: 9 additions & 1 deletion src/path.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#include "utility.hpp"
#include "types.hpp"
#include "position.hpp"
#include "region.hpp"
#include "nodetraversal.hpp"

//#define debug
Expand Down Expand Up @@ -384,10 +385,17 @@ Path path_from_path_handle(const PathHandleGraph& graph, path_handle_t path_hand
Alignment alignment_from_path(const HandleGraph& graph, const Path& path);

////
// Functiuons for working with path subranges.
// Functions for working with path subranges.
// TODO: Move to libhandlegraph
////

/// Find the subpath containing the given region (possibly a full base path)
/// and return true, or return false if no such subpath can be found.
///
/// If one or both region coordinates are -1, and a full path is found, fills
/// them in from that path.
bool find_containing_subpath(const PathPositionHandleGraph& graph, Region& region, path_handle_t& path);

/// Run the given iteratee for each path that is either the path with the given
/// name (if present), or a subrange of a path with the given name as the base
/// name (otherwise).
Expand Down
100 changes: 47 additions & 53 deletions src/subcommand/chunk_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -609,66 +609,39 @@ int main_chunk(int argc, char** argv) {
for (auto& region : regions) {
// We want to find the path handle and infer or adjust the coordinates on it for the region.
path_handle_t region_path;
bool found_contained = false;
// We might be asking for a part of a subpath, or a part of a base path.
if (graph->has_path(region.seq)) {
// It's a base path we have all of, or a subpath we have exactly.
region_path = graph->get_path_handle(region.seq);

if (region.end == -1) {
// Infer the region endpoint from the path
region.end = graph->get_path_length(region_path) - 1;
}

// The region start point will be 0 if not set.
region.start = max((int64_t)0, region.start);

found_contained = true;
} else if (region.start < 0 || region.end < 0) {
// The region coordinates aren't fully specified but the path with that exact name doesn't exist.
// Guessing what the user wants would be hard, so stop.
cerr << "error[vg chunk]: input path " << region.seq << " not found exactly in graph and region coordinates are not completely specified" << endl;
return 1;
} else {
// Maybe it's a base path and we only have a subpath.
for_each_subpath_of(*graph, region.seq, [&](const path_handle_t& candidate) {
// We should have some subrange if we get here. Also the region coiordunates are specified.
subrange_t candidate_subrange = graph->get_subrange(candidate);
crash_unless(candidate_subrange != PathMetadata::NO_SUBRANGE);
if (candidate_subrange.first <= region.start && candidate_subrange.first + candidate_subrange.second >= region.end) {
// The subranges are 0-based exclusive and the regions are 0-based exclusive.
// TODO: Is this true???
// This subrange fully contains this region.
region_path = candidate;

// Adjust the name to match the subpath exactly
region.seq = graph->get_path_name(region_path);

// Adjust start and end to be relative to the subpath.
// We know they're set to numbers already.
region.start -= candidate_subrange.first;
region.end -= candidate_subrange.first;

found_contained = true;
// Use first result
return false;
}
return true;
});
}

bool found_contained = find_containing_subpath(*graph, region, region_path);
if (!found_contained) {
if (graph->has_path(region.seq)) {
// We can't find this region.
if (region.start < 0 || region.end < 0) {
// The region coordinates aren't fully specified but the path with that exact name doesn't exist.
// Guessing what the user wants would be hard, so stop.
cerr << "error[vg chunk]: input path " << region.seq << " not found exactly in graph and region coordinates are not completely specified" << endl;
return 1;
} else if (graph->has_path(region.seq)) {
// This is just an out of range request
cerr << "error[vg chunk]: input region " << region.seq << ":" << region.start << "-" << region.end
<< " is out of bounds of path " << region.seq
<< " which has length " << graph->get_path_length(graph->get_path_handle(region.seq)) << endl;
} else {
// The path isn't therr or the containing subpath isn't there.
// The path isn't there or the containing subpath isn't there.
cerr << "error[vg chunk]: input region " << region.seq << ":" << region.start << "-" << region.end << " not contained by any graph path" << endl;
}
return 1;
}

subrange_t candidate_subrange = graph->get_subrange(region_path);
if (graph->get_path_name(region_path) != region.seq && candidate_subrange != PathMetadata::NO_SUBRANGE) {
// We found a subpath when we asked for a longer path's region.
// Change the region to be on the subpath.

// Adjust the name to match the subpath exactly
region.seq = graph->get_path_name(region_path);

// Adjust start and end to be relative to the subpath.
// We know they're set to numbers already.
region.start -= candidate_subrange.first;
region.end -= candidate_subrange.first;
}
}
}

Expand Down Expand Up @@ -812,9 +785,30 @@ int main_chunk(int argc, char** argv) {
trace_start = output_regions[i].start;
trace_steps = output_regions[i].end - trace_start;
} else {
path_handle_t path_handle = graph->get_path_handle(output_regions[i].seq);
step_handle_t trace_start_step = graph->get_step_at_position(path_handle, output_regions[i].start);
step_handle_t trace_end_step = graph->get_step_at_position(path_handle, output_regions[i].end);
// TODO: Instead of tracing from the start, just get a sub-GBWT on the subgraph.

// For now, we start tracing at the expanded start point along the target path.
//
// output_regions is in base path space, so its seq may not
// name an extant subpath. But we know the region is contained within one.
path_handle_t path_handle;
// TODO: We know this won't end up rewriting the region bounds but it's still weird to give it a mutable region.
bool found_contained = find_containing_subpath(*graph, output_regions[i], path_handle);
crash_unless(found_contained);

size_t trace_start = output_regions[i].start;
size_t trace_end = output_regions[i].end;

subrange_t candidate_subrange = graph->get_subrange(path_handle);
if (graph->get_path_name(path_handle) != output_regions[i].seq && candidate_subrange != PathMetadata::NO_SUBRANGE) {
// We need to use offsets along the subpath and not the base path.
// But don't rewrite the original region.
trace_start -= candidate_subrange.first;
trace_end -= candidate_subrange.first;
}

step_handle_t trace_start_step = graph->get_step_at_position(path_handle, trace_start);
step_handle_t trace_end_step = graph->get_step_at_position(path_handle, trace_end);
// make sure we don't loop forever in next loop
if (output_regions[i].start > output_regions[i].end) {
swap(trace_start_step, trace_end_step);
Expand Down

1 comment on commit 8bd10a6

@adamnovak
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

vg CI tests complete for branch subpath-chunk. View the full report here.

16 tests passed, 0 tests failed and 0 tests skipped in 17398 seconds

Please sign in to comment.