From 82fcc47fb9034838da68b09ec0ba43bc1371fa4f Mon Sep 17 00:00:00 2001 From: "Luis M. Rodriguez-R" Date: Fri, 6 Oct 2023 02:13:47 +0200 Subject: [PATCH] Complete `gfa-paths-to-fasta` --- bin/gfa-paths-to-fasta | 7 +++++ lib/gfa/record/path.rb | 55 +++++++++++++++++++++++++++++++++++++++ lib/gfa/record/segment.rb | 8 ++++++ lib/gfa/version.rb | 2 +- 4 files changed, 71 insertions(+), 1 deletion(-) diff --git a/bin/gfa-paths-to-fasta b/bin/gfa-paths-to-fasta index 8fc4360..5ed968b 100755 --- a/bin/gfa-paths-to-fasta +++ b/bin/gfa-paths-to-fasta @@ -26,4 +26,11 @@ end $stderr.puts "Loading GFA: #{input}" gfa = GFA.load_parallel(input, (threads || 1).to_i) +$stderr.puts "Saving path sequences: #{output}" +File.open(output, 'w') do |fasta| + gfa.paths.set.each do |path| + fasta.puts '>%s' % path.path_name.value + fasta.puts path.sequence(gfa) + end +end diff --git a/lib/gfa/record/path.rb b/lib/gfa/record/path.rb index 17b4d0c..7de7aeb 100644 --- a/lib/gfa/record/path.rb +++ b/lib/gfa/record/path.rb @@ -42,4 +42,59 @@ def include?(segment) segment_names_a.any? { |name| segment.name == name } end + + ## + # Array of GFA::Field::String with the sequences from each segment featuring + # the correct orientation from a +gfa+ (which *must* be indexed) + # + # TODO: Distinguish between a direct path (separated by comma) and a + # jump (separated by semicolon). Jumps include a distance estimate + # (column 6, optional) which could be used to add Ns between segment + # sequences (from GFA 1.2) + def segment_sequences(gfa) + raise "Unindexed GFA" unless gfa.indexed? + segment_names.value.split(/[,;]/).map do |i| + orientation = i[-1] + i[-1] = '' + segment = gfa.segments[i] + + case orientation + when '+' ; segment.sequence + when '-' ; segment.rc + else ; raise "Unknown orientation: #{orientation} (path: #{path_name})" + end + end + end + + ## + # Produce the contiguous path sequence based on the segment sequences and + # orientations from a +gfa+ (which *must* be indexed) + # + # TODO: Estimate gaps (Ns) from Jump distances (see +segment_sequences+) + # + # TODO: Attempt reading CIGAR values from the path first, the corresponding + # links next, and actually performing the pairwise overlap as last resort + # + # TODO: Support ambiguous IUPAC codes for overlap evaluation + def sequence(gfa) + segment_sequences(gfa).map(&:value) + .inject('') { |a, b| a + after_overlap(a, b) } + end + + private + ## + # Find the overlap between sequences +a+ and +b+ (Strings) and return + # only the part of +b+ after the overlap. Assumes that +a+ starts + # at the same point or before +b+. If no overlap is found, returns +b+ + # in its entirety. + def after_overlap(a, b) + (0 .. a.length - 1).each do |a_from| + a_to = b.length + a_from > a.length ? a.length : b.length + a_from + b_to = b.length + a_from > a.length ? a.length - a_from : b.length + if a[a_from .. a_to - 1] == b[0 .. b_to - 1] + return b[b_to .. b.length].to_s + end + end + b + end end diff --git a/lib/gfa/record/segment.rb b/lib/gfa/record/segment.rb index 13bcf28..6ffd4ce 100644 --- a/lib/gfa/record/segment.rb +++ b/lib/gfa/record/segment.rb @@ -32,4 +32,12 @@ def initialize(name, sequence, *opt_fields) def length sequence.value.length end + + ## + # Returns the reverse-complement of the sequence (as a Z field) + def rc + GFA::Field::String.new( + sequence.value.upcase.reverse.tr('ACGTURYSWKMBDHVN', 'TGCAAYRSWMKVHDBN') + ) + end end diff --git a/lib/gfa/version.rb b/lib/gfa/version.rb index 7597b9a..c80654e 100644 --- a/lib/gfa/version.rb +++ b/lib/gfa/version.rb @@ -1,5 +1,5 @@ class GFA - VERSION = '0.6.1' + VERSION = '0.6.2' VERSION_ARRAY = VERSION.split(/\./).map { |x| x.to_i } # :nodoc: VERSION_MAJOR = VERSION_ARRAY[0] # :nodoc: VERSION_MINOR = VERSION_ARRAY[1] # :nodoc: