From 82fcc47fb9034838da68b09ec0ba43bc1371fa4f Mon Sep 17 00:00:00 2001
From: "Luis M. Rodriguez-R" <lmrodriguezr@gmail.com>
Date: Fri, 6 Oct 2023 02:13:47 +0200
Subject: [PATCH] Complete `gfa-paths-to-fasta`

---
 bin/gfa-paths-to-fasta    |  7 +++++
 lib/gfa/record/path.rb    | 55 +++++++++++++++++++++++++++++++++++++++
 lib/gfa/record/segment.rb |  8 ++++++
 lib/gfa/version.rb        |  2 +-
 4 files changed, 71 insertions(+), 1 deletion(-)

diff --git a/bin/gfa-paths-to-fasta b/bin/gfa-paths-to-fasta
index 8fc4360..5ed968b 100755
--- a/bin/gfa-paths-to-fasta
+++ b/bin/gfa-paths-to-fasta
@@ -26,4 +26,11 @@ end
 $stderr.puts "Loading GFA: #{input}"
 gfa = GFA.load_parallel(input, (threads || 1).to_i)
 
+$stderr.puts "Saving path sequences: #{output}"
+File.open(output, 'w') do |fasta|
+  gfa.paths.set.each do |path|
+    fasta.puts '>%s' % path.path_name.value
+    fasta.puts path.sequence(gfa)
+  end
+end
 
diff --git a/lib/gfa/record/path.rb b/lib/gfa/record/path.rb
index 17b4d0c..7de7aeb 100644
--- a/lib/gfa/record/path.rb
+++ b/lib/gfa/record/path.rb
@@ -42,4 +42,59 @@ def include?(segment)
 
     segment_names_a.any? { |name| segment.name == name }
   end
+
+  ##
+  # Array of GFA::Field::String with the sequences from each segment featuring
+  # the correct orientation from a +gfa+ (which *must* be indexed)
+  #
+  # TODO: Distinguish between a direct path (separated by comma) and a
+  # jump (separated by semicolon). Jumps include a distance estimate
+  # (column 6, optional) which could be used to add Ns between segment
+  # sequences (from GFA 1.2)
+  def segment_sequences(gfa)
+    raise "Unindexed GFA" unless gfa.indexed?
+    segment_names.value.split(/[,;]/).map do |i|
+      orientation = i[-1]
+      i[-1] = ''
+      segment = gfa.segments[i]
+
+      case orientation
+        when '+' ; segment.sequence
+        when '-' ; segment.rc
+        else ; raise "Unknown orientation: #{orientation} (path: #{path_name})"
+      end
+    end
+  end
+
+  ##
+  # Produce the contiguous path sequence based on the segment sequences and
+  # orientations from a +gfa+ (which *must* be indexed)
+  #
+  # TODO: Estimate gaps (Ns) from Jump distances (see +segment_sequences+)
+  #
+  # TODO: Attempt reading CIGAR values from the path first, the corresponding
+  # links next, and actually performing the pairwise overlap as last resort
+  #
+  # TODO: Support ambiguous IUPAC codes for overlap evaluation
+  def sequence(gfa)
+    segment_sequences(gfa).map(&:value)
+      .inject('') { |a, b| a + after_overlap(a, b) }
+  end
+
+  private
+    ##
+    # Find the overlap between sequences +a+ and +b+ (Strings) and return
+    # only the part of +b+ after the overlap. Assumes that +a+ starts
+    # at the same point or before +b+. If no overlap is found, returns +b+
+    # in its entirety.
+    def after_overlap(a, b)
+      (0 .. a.length - 1).each do |a_from|
+        a_to = b.length + a_from > a.length ? a.length : b.length + a_from
+        b_to = b.length + a_from > a.length ? a.length - a_from : b.length
+        if a[a_from .. a_to - 1] == b[0 .. b_to - 1]
+          return b[b_to .. b.length].to_s
+        end
+      end
+      b
+    end
 end
diff --git a/lib/gfa/record/segment.rb b/lib/gfa/record/segment.rb
index 13bcf28..6ffd4ce 100644
--- a/lib/gfa/record/segment.rb
+++ b/lib/gfa/record/segment.rb
@@ -32,4 +32,12 @@ def initialize(name, sequence, *opt_fields)
   def length
     sequence.value.length
   end
+
+  ##
+  # Returns the reverse-complement of the sequence (as a Z field)
+  def rc
+    GFA::Field::String.new(
+      sequence.value.upcase.reverse.tr('ACGTURYSWKMBDHVN', 'TGCAAYRSWMKVHDBN')
+    )
+  end
 end
diff --git a/lib/gfa/version.rb b/lib/gfa/version.rb
index 7597b9a..c80654e 100644
--- a/lib/gfa/version.rb
+++ b/lib/gfa/version.rb
@@ -1,5 +1,5 @@
 class GFA
-  VERSION       = '0.6.1'
+  VERSION       = '0.6.2'
   VERSION_ARRAY = VERSION.split(/\./).map { |x| x.to_i } # :nodoc:
   VERSION_MAJOR = VERSION_ARRAY[0] # :nodoc:
   VERSION_MINOR = VERSION_ARRAY[1] # :nodoc: