From 8bd10a614889eff92e5775bef5c9bf7e20a67396 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Mon, 10 Feb 2025 18:40:36 -0500 Subject: [PATCH] Output extracted regions on base path and not subpaths --- src/chunker.cpp | 25 +++------ src/chunker.hpp | 14 ++++- src/path.cpp | 43 +++++++++++++++ src/path.hpp | 10 +++- src/subcommand/chunk_main.cpp | 100 ++++++++++++++++------------------ 5 files changed, 117 insertions(+), 75 deletions(-) diff --git a/src/chunker.cpp b/src/chunker.cpp index bff553016a..318b3f2885 100644 --- a/src/chunker.cpp +++ b/src/chunker.cpp @@ -23,9 +23,13 @@ PathChunker::~PathChunker() { void PathChunker::extract_subgraph(const Region& region, int64_t context, int64_t length, bool forward_only, MutablePathMutableHandleGraph& subgraph, Region& out_region) { + + // Output region must be empty. We fill it in. + crash_unless(out_region.seq.empty()); // extract our path range into the graph // TODO: Handle incoming names with subranges when they aren't exactly the names of paths we have. + crash_unless(graph->has_path(region.seq)); path_handle_t path_handle = graph->get_path_handle(region.seq); step_handle_t start_step = graph->get_step_at_position(path_handle, region.start); handle_t start_handle = graph->get_handle_of_step(start_step); @@ -152,27 +156,12 @@ void PathChunker::extract_subgraph(const Region& region, int64_t context, int64_ // TODO: We assume we actually found some of the target path crash_unless(min_start != std::numeric_limits::max()); - - if (base_path_subrange != PathMetadata::NO_SUBRANGE) { - // Our input region was on a path with a subrange, so we want our output region to be relative to (and within) that. - - // Make sure we have an end - populate_subrange_end(*graph, base_path_subrange, path_handle); - - // We can't expand beyond its bounds and still say we pulled a range on it, so restrict to them. - min_start = std::max(min_start, base_path_subrange.first); - max_end = std::min(max_end, base_path_subrange.second); - - // Remove offset from containing path subrange - min_start -= base_path_subrange.first; - max_end -= base_path_subrange.first; - } - // We can't represent a region with a 0 end-exclusive coordinate. crash_unless(max_end != 0); - // Produce the output region. Convert coordinates to be 0-based, end-inclusive. - out_region.seq = region.seq; + // Express the output region against the base path, if different from the target path with its subrange. + out_region.seq = get_path_base_name(*graph, path_handle); + // Convert coordinates to be 0-based, end-inclusive. out_region.start = min_start; out_region.end = max_end - 1; } diff --git a/src/chunker.hpp b/src/chunker.hpp index 980f4b8e2a..cc9e93d715 100644 --- a/src/chunker.hpp +++ b/src/chunker.hpp @@ -32,9 +32,15 @@ class PathChunker { /** Extract subgraph corresponding to given path region into its * own vg graph, and send it to out_stream. The boundaries of the * extracted region of the target path (which can be different because we - * expand context and don't cut nodes) are written to out_region. If the - * target path goes through the extracted region multiple times, only the - * extended bounds of the visit containing the target region are produced. + * expand context and don't cut nodes) are written to out_region. + * + * The contig name of out_region is set to the base path name (without any + * subrange) for the path from region. Output coordinates will be relative + * to this base path. + * + * If the target path goes through the extracted region multiple times, + * only the extended bounds of the visit containing the target region are + * produced. * * If forward_only set, context is only expanded in the forward direction * @@ -66,6 +72,8 @@ class PathChunker { * * Skips nodes in the ID range that do not actually exist in the source * graph, or that already exist in the target graph. + * + * Populated out_region's start and end with node IDs. */ void extract_id_range(vg::id_t start, vg::id_t end, int64_t context, int64_t length, bool forward_only, MutablePathMutableHandleGraph& subgraph, Region& out_region); diff --git a/src/path.cpp b/src/path.cpp index 089d2c0c78..c81d8c9cc1 100644 --- a/src/path.cpp +++ b/src/path.cpp @@ -1,6 +1,7 @@ #include "path.hpp" #include #include "region.hpp" +#include "crash.hpp" #include // #define debug_simplify @@ -2535,6 +2536,48 @@ Alignment alignment_from_path(const HandleGraph& graph, const Path& path) { return aln; } +bool find_containing_subpath(const PathPositionHandleGraph& graph, Region& region, path_handle_t& path) { + // We might be asking for a part of a subpath, or a part of a base path. + if (graph.has_path(region.seq)) { + // It's a base path we have all of, or a subpath we have exactly. + path = graph.get_path_handle(region.seq); + + if (region.end == -1) { + // Infer the region endpoint from the path + region.end = graph.get_path_length(path) - 1; + } + + // The region start point will be 0 if not set. + region.start = max((int64_t)0, region.start); + + return true; + } else if (region.start < 0 || region.end < 0) { + return false; + } else { + // Maybe it's a base path and we only have a subpath. + bool found_contained = false; + + for_each_subpath_of(graph, region.seq, [&](const path_handle_t& candidate) { + // We should have some subrange if we get here. Also the region coordunates are specified. + subrange_t candidate_subrange = graph.get_subrange(candidate); + crash_unless(candidate_subrange != PathMetadata::NO_SUBRANGE); + if (candidate_subrange.first <= region.start && candidate_subrange.first + candidate_subrange.second > region.end + 1) { + // The subranges are 0-based exclusive and the regions are 0-based inclusive. + // This subrange fully contains this region. + path = candidate; + + found_contained = true; + // Use first result + return false; + } + return true; + }); + + // Return whether we found the containing subpath. + return found_contained; + } +} + bool for_each_subpath_of(const PathPositionHandleGraph& graph, const string& path_name, const std::function& iteratee) { if (graph.has_path(path_name)) { // Just look at the full path. diff --git a/src/path.hpp b/src/path.hpp index 8df0d49c41..ba03385111 100644 --- a/src/path.hpp +++ b/src/path.hpp @@ -14,6 +14,7 @@ #include "utility.hpp" #include "types.hpp" #include "position.hpp" +#include "region.hpp" #include "nodetraversal.hpp" //#define debug @@ -384,10 +385,17 @@ Path path_from_path_handle(const PathHandleGraph& graph, path_handle_t path_hand Alignment alignment_from_path(const HandleGraph& graph, const Path& path); //// -// Functiuons for working with path subranges. +// Functions for working with path subranges. // TODO: Move to libhandlegraph //// +/// Find the subpath containing the given region (possibly a full base path) +/// and return true, or return false if no such subpath can be found. +/// +/// If one or both region coordinates are -1, and a full path is found, fills +/// them in from that path. +bool find_containing_subpath(const PathPositionHandleGraph& graph, Region& region, path_handle_t& path); + /// Run the given iteratee for each path that is either the path with the given /// name (if present), or a subrange of a path with the given name as the base /// name (otherwise). diff --git a/src/subcommand/chunk_main.cpp b/src/subcommand/chunk_main.cpp index abce88e382..b8caac816d 100644 --- a/src/subcommand/chunk_main.cpp +++ b/src/subcommand/chunk_main.cpp @@ -609,66 +609,39 @@ int main_chunk(int argc, char** argv) { for (auto& region : regions) { // We want to find the path handle and infer or adjust the coordinates on it for the region. path_handle_t region_path; - bool found_contained = false; - // We might be asking for a part of a subpath, or a part of a base path. - if (graph->has_path(region.seq)) { - // It's a base path we have all of, or a subpath we have exactly. - region_path = graph->get_path_handle(region.seq); - - if (region.end == -1) { - // Infer the region endpoint from the path - region.end = graph->get_path_length(region_path) - 1; - } - - // The region start point will be 0 if not set. - region.start = max((int64_t)0, region.start); - - found_contained = true; - } else if (region.start < 0 || region.end < 0) { - // The region coordinates aren't fully specified but the path with that exact name doesn't exist. - // Guessing what the user wants would be hard, so stop. - cerr << "error[vg chunk]: input path " << region.seq << " not found exactly in graph and region coordinates are not completely specified" << endl; - return 1; - } else { - // Maybe it's a base path and we only have a subpath. - for_each_subpath_of(*graph, region.seq, [&](const path_handle_t& candidate) { - // We should have some subrange if we get here. Also the region coiordunates are specified. - subrange_t candidate_subrange = graph->get_subrange(candidate); - crash_unless(candidate_subrange != PathMetadata::NO_SUBRANGE); - if (candidate_subrange.first <= region.start && candidate_subrange.first + candidate_subrange.second >= region.end) { - // The subranges are 0-based exclusive and the regions are 0-based exclusive. - // TODO: Is this true??? - // This subrange fully contains this region. - region_path = candidate; - - // Adjust the name to match the subpath exactly - region.seq = graph->get_path_name(region_path); - - // Adjust start and end to be relative to the subpath. - // We know they're set to numbers already. - region.start -= candidate_subrange.first; - region.end -= candidate_subrange.first; - - found_contained = true; - // Use first result - return false; - } - return true; - }); - } - + bool found_contained = find_containing_subpath(*graph, region, region_path); if (!found_contained) { - if (graph->has_path(region.seq)) { + // We can't find this region. + if (region.start < 0 || region.end < 0) { + // The region coordinates aren't fully specified but the path with that exact name doesn't exist. + // Guessing what the user wants would be hard, so stop. + cerr << "error[vg chunk]: input path " << region.seq << " not found exactly in graph and region coordinates are not completely specified" << endl; + return 1; + } else if (graph->has_path(region.seq)) { // This is just an out of range request cerr << "error[vg chunk]: input region " << region.seq << ":" << region.start << "-" << region.end << " is out of bounds of path " << region.seq << " which has length " << graph->get_path_length(graph->get_path_handle(region.seq)) << endl; } else { - // The path isn't therr or the containing subpath isn't there. + // The path isn't there or the containing subpath isn't there. cerr << "error[vg chunk]: input region " << region.seq << ":" << region.start << "-" << region.end << " not contained by any graph path" << endl; } return 1; } + + subrange_t candidate_subrange = graph->get_subrange(region_path); + if (graph->get_path_name(region_path) != region.seq && candidate_subrange != PathMetadata::NO_SUBRANGE) { + // We found a subpath when we asked for a longer path's region. + // Change the region to be on the subpath. + + // Adjust the name to match the subpath exactly + region.seq = graph->get_path_name(region_path); + + // Adjust start and end to be relative to the subpath. + // We know they're set to numbers already. + region.start -= candidate_subrange.first; + region.end -= candidate_subrange.first; + } } } @@ -812,9 +785,30 @@ int main_chunk(int argc, char** argv) { trace_start = output_regions[i].start; trace_steps = output_regions[i].end - trace_start; } else { - path_handle_t path_handle = graph->get_path_handle(output_regions[i].seq); - step_handle_t trace_start_step = graph->get_step_at_position(path_handle, output_regions[i].start); - step_handle_t trace_end_step = graph->get_step_at_position(path_handle, output_regions[i].end); + // TODO: Instead of tracing from the start, just get a sub-GBWT on the subgraph. + + // For now, we start tracing at the expanded start point along the target path. + // + // output_regions is in base path space, so its seq may not + // name an extant subpath. But we know the region is contained within one. + path_handle_t path_handle; + // TODO: We know this won't end up rewriting the region bounds but it's still weird to give it a mutable region. + bool found_contained = find_containing_subpath(*graph, output_regions[i], path_handle); + crash_unless(found_contained); + + size_t trace_start = output_regions[i].start; + size_t trace_end = output_regions[i].end; + + subrange_t candidate_subrange = graph->get_subrange(path_handle); + if (graph->get_path_name(path_handle) != output_regions[i].seq && candidate_subrange != PathMetadata::NO_SUBRANGE) { + // We need to use offsets along the subpath and not the base path. + // But don't rewrite the original region. + trace_start -= candidate_subrange.first; + trace_end -= candidate_subrange.first; + } + + step_handle_t trace_start_step = graph->get_step_at_position(path_handle, trace_start); + step_handle_t trace_end_step = graph->get_step_at_position(path_handle, trace_end); // make sure we don't loop forever in next loop if (output_regions[i].start > output_regions[i].end) { swap(trace_start_step, trace_end_step);