Skip to content

Commit

Permalink
Merge pull request #4522 from vgteam/subpath-chunk
Browse files Browse the repository at this point in the history
Adjust subpath chunking
  • Loading branch information
adamnovak authored Feb 13, 2025
2 parents e44d6eb + 672dfe9 commit f926d96
Show file tree
Hide file tree
Showing 7 changed files with 173 additions and 53 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/testmac.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ jobs:

steps:
- name: Use cache
uses: actions/cache@v2
uses: actions/cache@v4
with:
path: |
deps
Expand Down
59 changes: 36 additions & 23 deletions src/chunker.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,13 @@ PathChunker::~PathChunker() {

void PathChunker::extract_subgraph(const Region& region, int64_t context, int64_t length, bool forward_only,
MutablePathMutableHandleGraph& subgraph, Region& out_region) {

// Output region must be empty. We fill it in.
crash_unless(out_region.seq.empty());

// extract our path range into the graph
// TODO: Handle incoming names with subranges when they aren't exactly the names of paths we have.
crash_unless(graph->has_path(region.seq));
path_handle_t path_handle = graph->get_path_handle(region.seq);
step_handle_t start_step = graph->get_step_at_position(path_handle, region.start);
handle_t start_handle = graph->get_handle_of_step(start_step);
Expand Down Expand Up @@ -95,31 +99,51 @@ void PathChunker::extract_subgraph(const Region& region, int64_t context, int64_
size_t min_start = std::numeric_limits<size_t>::max();
size_t max_end = 0;

// We use this to make sure we have a path end
auto populate_subrange_end = [](const PathHandleGraph& g, subrange_t& subrange, const path_handle_t& path) {
if (subrange.second == PathMetadata::NO_END_POSITION) {
// Compute a length and use that to get the end.
// TODO: Sniff for an efficient/available get_path_length.
size_t path_length = 0;
for (handle_t handle : g.scan_path(path)) {
path_length += g.get_length(handle);
}
subrange.second = subrange.first + path_length;
}
};

subgraph.for_each_path_matching({sense}, {sample}, {locus}, [&](const path_handle_t subpath) {
if (subgraph.get_haplotype(subpath) != haplotype || subgraph.get_phase_block(subpath) != phase_block) {
// Skip this subpath since it's not the right phase/fragment
return true;
}

#ifdef debug
#pragma omp critical(cerr)
{
std::cerr << "Consider new subpath " << subgraph.get_path_name(subpath) << std::endl;
}
#endif

subrange_t subpath_subrange = subgraph.get_subrange(subpath);
if (subpath_subrange == PathMetadata::NO_SUBRANGE) {
// Fill in a 0 start
subpath_subrange.first = 0;
}
if (subpath_subrange.second == PathMetadata::NO_END_POSITION) {
// Compute a length and use that to get the end.
// TODO: Sniff for an efficient/available get_path_length.
size_t path_length = 0;
for (handle_t handle : subgraph.scan_path(subpath)) {
path_length += subgraph.get_length(handle);
}
subpath_subrange.second = subpath_subrange.first + path_length;
}
populate_subrange_end(subgraph, subpath_subrange, subpath);

if (subpath_subrange.first >= target_subrange.second || subpath_subrange.second <= target_subrange.first) {
// This subpath doesn't actually overlap the selected target base path
// subrange (which is 0-based, end-exclusive), and so shouldn't count
// for extending the selected region along the target path.

#ifdef debug
#pragma omp critical(cerr)
{
std::cerr << "\tSkip it" << std::endl;
}
#endif

return true;
}

Expand All @@ -132,23 +156,12 @@ void PathChunker::extract_subgraph(const Region& region, int64_t context, int64_

// TODO: We assume we actually found some of the target path
crash_unless(min_start != std::numeric_limits<size_t>::max());

// Hackily remove source path subrange offsets if any
subrange_t source_subrange = graph->get_subrange(path_handle);
if (source_subrange != PathMetadata::NO_SUBRANGE) {
// If we requested something on this path region, we can't handle
// finding part of an earlier path region.
// TODO: Handle it.
crash_unless(min_start <= source_subrange.first);
min_start -= source_subrange.first;
max_end -= source_subrange.first;
}

// We can't represent a region with a 0 end-exclusive coordinate.
crash_unless(max_end != 0);

// Produce the output region. Convert coordinates to be 0-based, end-inclusive.
out_region.seq = region.seq;
// Express the output region against the base path, if different from the target path with its subrange.
out_region.seq = get_path_base_name(*graph, path_handle);
// Convert coordinates to be 0-based, end-inclusive.
out_region.start = min_start;
out_region.end = max_end - 1;
}
Expand Down
14 changes: 11 additions & 3 deletions src/chunker.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,15 @@ class PathChunker {
/** Extract subgraph corresponding to given path region into its
* own vg graph, and send it to out_stream. The boundaries of the
* extracted region of the target path (which can be different because we
* expand context and don't cut nodes) are written to out_region. If the
* target path goes through the extracted region multiple times, only the
* extended bounds of the visit containing the target region are produced.
* expand context and don't cut nodes) are written to out_region.
*
* The contig name of out_region is set to the base path name (without any
* subrange) for the path from region. Output coordinates will be relative
* to this base path.
*
* If the target path goes through the extracted region multiple times,
* only the extended bounds of the visit containing the target region are
* produced.
*
* If forward_only set, context is only expanded in the forward direction
*
Expand Down Expand Up @@ -66,6 +72,8 @@ class PathChunker {
*
* Skips nodes in the ID range that do not actually exist in the source
* graph, or that already exist in the target graph.
*
* Populated out_region's start and end with node IDs.
*/
void extract_id_range(vg::id_t start, vg::id_t end, int64_t context, int64_t length, bool forward_only,
MutablePathMutableHandleGraph& subgraph, Region& out_region);
Expand Down
43 changes: 43 additions & 0 deletions src/path.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include "path.hpp"
#include <vg/io/stream.hpp>
#include "region.hpp"
#include "crash.hpp"
#include <sstream>

// #define debug_simplify
Expand Down Expand Up @@ -2535,6 +2536,48 @@ Alignment alignment_from_path(const HandleGraph& graph, const Path& path) {
return aln;
}

bool find_containing_subpath(const PathPositionHandleGraph& graph, Region& region, path_handle_t& path) {
// We might be asking for a part of a subpath, or a part of a base path.
if (graph.has_path(region.seq)) {
// It's a base path we have all of, or a subpath we have exactly.
path = graph.get_path_handle(region.seq);

if (region.end == -1) {
// Infer the region endpoint from the path
region.end = graph.get_path_length(path) - 1;
}

// The region start point will be 0 if not set.
region.start = max((int64_t)0, region.start);

return true;
} else if (region.start < 0 || region.end < 0) {
return false;
} else {
// Maybe it's a base path and we only have a subpath.
bool found_contained = false;

for_each_subpath_of(graph, region.seq, [&](const path_handle_t& candidate) {
// We should have some subrange if we get here. Also the region coordunates are specified.
subrange_t candidate_subrange = graph.get_subrange(candidate);
crash_unless(candidate_subrange != PathMetadata::NO_SUBRANGE);
if (candidate_subrange.first <= region.start && candidate_subrange.first + candidate_subrange.second > region.end + 1) {
// The subranges are 0-based exclusive and the regions are 0-based inclusive.
// This subrange fully contains this region.
path = candidate;

found_contained = true;
// Use first result
return false;
}
return true;
});

// Return whether we found the containing subpath.
return found_contained;
}
}

bool for_each_subpath_of(const PathPositionHandleGraph& graph, const string& path_name, const std::function<bool(const path_handle_t& path)>& iteratee) {
if (graph.has_path(path_name)) {
// Just look at the full path.
Expand Down
10 changes: 9 additions & 1 deletion src/path.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#include "utility.hpp"
#include "types.hpp"
#include "position.hpp"
#include "region.hpp"
#include "nodetraversal.hpp"

//#define debug
Expand Down Expand Up @@ -384,10 +385,17 @@ Path path_from_path_handle(const PathHandleGraph& graph, path_handle_t path_hand
Alignment alignment_from_path(const HandleGraph& graph, const Path& path);

////
// Functiuons for working with path subranges.
// Functions for working with path subranges.
// TODO: Move to libhandlegraph
////

/// Find the subpath containing the given region (possibly a full base path)
/// and return true, or return false if no such subpath can be found.
///
/// If one or both region coordinates are -1, and a full path is found, fills
/// them in from that path.
bool find_containing_subpath(const PathPositionHandleGraph& graph, Region& region, path_handle_t& path);

/// Run the given iteratee for each path that is either the path with the given
/// name (if present), or a subrange of a path with the given name as the base
/// name (otherwise).
Expand Down
89 changes: 66 additions & 23 deletions src/subcommand/chunk_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

#include "subcommand.hpp"

#include "../crash.hpp"
#include "../vg.hpp"
#include <vg/io/stream.hpp>
#include <vg/io/stream_multiplexer.hpp>
Expand All @@ -21,6 +22,7 @@
#include "../stream_index.hpp"
#include "../region.hpp"
#include "../haplotype_extracter.hpp"
#include "../path.hpp"
#include "../algorithms/sorted_id_ranges.hpp"
#include "../algorithms/find_gbwt.hpp"
#include <bdsg/overlays/overlay_helper.hpp>
Expand Down Expand Up @@ -596,29 +598,49 @@ int main_chunk(int argc, char** argv) {
}

// validate and fill in sizes for regions that span entire path
function<size_t(const string&)> get_path_length = [&](const string& path_name) {
function<size_t(const path_handle_t&)> get_path_length = [&](const path_handle_t& path_handle) {
size_t path_length = 0;
for (handle_t handle : graph->scan_path(graph->get_path_handle(path_name))) {
for (handle_t handle : graph->scan_path(path_handle)) {
path_length += graph->get_length(handle);
}
return path_length;
};
if (!id_range) {
for (auto& region : regions) {
if (!graph->has_path(region.seq)) {
cerr << "error[vg chunk]: input path " << region.seq << " not found in graph" << endl;
return 1;
}
region.start = max((int64_t)0, region.start);
if (region.end == -1) {
region.end = get_path_length(region.seq) - 1;
} else if (!id_range && !components) {
if (region.start < 0 || region.end >= get_path_length(region.seq)) {
// We want to find the path handle and infer or adjust the coordinates on it for the region.
path_handle_t region_path;
bool found_contained = find_containing_subpath(*graph, region, region_path);
if (!found_contained) {
// We can't find this region.
if (region.start < 0 || region.end < 0) {
// The region coordinates aren't fully specified but the path with that exact name doesn't exist.
// Guessing what the user wants would be hard, so stop.
cerr << "error[vg chunk]: input path " << region.seq << " not found exactly in graph and region coordinates are not completely specified" << endl;
return 1;
} else if (graph->has_path(region.seq)) {
// This is just an out of range request
cerr << "error[vg chunk]: input region " << region.seq << ":" << region.start << "-" << region.end
<< " is out of bounds of path " << region.seq << " which has length "<< get_path_length(region.seq)
<< endl;
return -1;
<< " is out of bounds of path " << region.seq
<< " which has length " << graph->get_path_length(graph->get_path_handle(region.seq)) << endl;
} else {
// The path isn't there or the containing subpath isn't there.
cerr << "error[vg chunk]: input region " << region.seq << ":" << region.start << "-" << region.end << " not contained by any graph path" << endl;
}
return 1;
}

subrange_t candidate_subrange = graph->get_subrange(region_path);
if (graph->get_path_name(region_path) != region.seq && candidate_subrange != PathMetadata::NO_SUBRANGE) {
// We found a subpath when we asked for a longer path's region.
// Change the region to be on the subpath.

// Adjust the name to match the subpath exactly
region.seq = graph->get_path_name(region_path);

// Adjust start and end to be relative to the subpath.
// We know they're set to numbers already.
region.start -= candidate_subrange.first;
region.end -= candidate_subrange.first;
}
}
}
Expand Down Expand Up @@ -757,32 +779,53 @@ int main_chunk(int argc, char** argv) {

// optionally trace our haplotypes
if (trace && subgraph && gbwt_index) {
int64_t trace_start;
int64_t trace_start_id;
int64_t trace_steps = 0;
if (id_range) {
trace_start = output_regions[i].start;
trace_steps = output_regions[i].end - trace_start;
trace_start_id = output_regions[i].start;
trace_steps = output_regions[i].end - trace_start_id;
} else {
path_handle_t path_handle = graph->get_path_handle(output_regions[i].seq);
step_handle_t trace_start_step = graph->get_step_at_position(path_handle, output_regions[i].start);
step_handle_t trace_end_step = graph->get_step_at_position(path_handle, output_regions[i].end);
// TODO: Instead of tracing from the start, just get a sub-GBWT on the subgraph.

// For now, we start tracing at the expanded start point along the target path.
//
// output_regions is in base path space, so its seq may not
// name an extant subpath. But we know the region is contained within one.
path_handle_t path_handle;
// TODO: We know this won't end up rewriting the region bounds but it's still weird to give it a mutable region.
bool found_contained = find_containing_subpath(*graph, output_regions[i], path_handle);
crash_unless(found_contained);

size_t trace_start_coordinate = output_regions[i].start;
size_t trace_end_coordinate = output_regions[i].end;

subrange_t candidate_subrange = graph->get_subrange(path_handle);
if (graph->get_path_name(path_handle) != output_regions[i].seq && candidate_subrange != PathMetadata::NO_SUBRANGE) {
// We need to use offsets along the subpath and not the base path.
// But don't rewrite the original region.
trace_start_coordinate -= candidate_subrange.first;
trace_end_coordinate -= candidate_subrange.first;
}

step_handle_t trace_start_step = graph->get_step_at_position(path_handle, trace_start_coordinate);
step_handle_t trace_end_step = graph->get_step_at_position(path_handle, trace_end_coordinate);
// make sure we don't loop forever in next loop
if (output_regions[i].start > output_regions[i].end) {
swap(trace_start_step, trace_end_step);
}
trace_start = graph->get_id(graph->get_handle_of_step(trace_start_step));
trace_start_id = graph->get_id(graph->get_handle_of_step(trace_start_step));
for (; trace_start_step != trace_end_step; trace_start_step = graph->get_next_step(trace_start_step)) {
++trace_steps;
}
// haplotype_extender is forward only. until it's made bidirectional, try to
// detect backward paths and trace them backwards. this will not cover all possible cases though.
if (graph->get_is_reverse(graph->get_handle_of_step(trace_start_step)) &&
graph->get_is_reverse(graph->get_handle_of_step(trace_end_step))) {
trace_start = graph->get_id(graph->get_handle_of_step(trace_end_step));
trace_start_id = graph->get_id(graph->get_handle_of_step(trace_end_step));
}
}
Graph g;
trace_haplotypes_and_paths(*graph, *gbwt_index, trace_start, trace_steps,
trace_haplotypes_and_paths(*graph, *gbwt_index, trace_start_id, trace_steps,
g, trace_thread_frequencies, false);
subgraph->for_each_path_handle([&trace_thread_frequencies, &subgraph](path_handle_t path_handle) {
trace_thread_frequencies[subgraph->get_path_name(path_handle)] = 1;});
Expand Down
9 changes: 7 additions & 2 deletions test/t/30_vg_chunk.t
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap

PATH=../bin:$PATH # for vg

plan tests 35
plan tests 37

# Construct a graph with alt paths so we can make a GBWT and a GBZ
vg construct -m 1000 -r small/x.fa -v small/x.vcf.gz -a >x.vg
Expand Down Expand Up @@ -134,7 +134,12 @@ grep "not found in" log.txt
is "$?" "1" "chunking on a haplotype path does not produce a path not found error"
is "$(vg stats -z part.vg | grep nodes | cut -f2)" "4" "chunking on a haplotype produces the correct size graph"

rm -f graph.gbz part.vg log.txt
vg chunk -x graph.gbz -p GRCh38#0#chr1:5-7 -c 1 >part.vg
vg chunk -x part.vg -p GRCh38#0#chr1:5-7 -c 1 >subpart.vg
is "$?" "0" "chunking the same base path range out of a chunk works"
is "$(vg stats -l part.vg)" "$(vg stats -l subpart.vg)" "chunking the same base path range out of a chunk gets the same nodes"

rm -f graph.gbz part.vg subpart.vg log.txt



1 comment on commit f926d96

@adamnovak
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

vg CI tests complete for merge to master. View the full report here.

16 tests passed, 0 tests failed and 0 tests skipped in 17534 seconds

Please sign in to comment.