From 080d9686d046bc57d91d668e1fdbf49b19b3554e Mon Sep 17 00:00:00 2001
From: Etienne Boileau <boileau@uni-heidelberg.de>
Date: Wed, 25 Sep 2024 00:44:59 +0200
Subject: [PATCH] FIX #139

---
 server/src/scimodom/services/bedtools.py | 68 ++++++++++++++++++------
 1 file changed, 53 insertions(+), 15 deletions(-)

diff --git a/server/src/scimodom/services/bedtools.py b/server/src/scimodom/services/bedtools.py
index 7cbc129f..8fa5d0ac 100644
--- a/server/src/scimodom/services/bedtools.py
+++ b/server/src/scimodom/services/bedtools.py
@@ -333,9 +333,14 @@ def ensembl_to_bed_features(
         written to the parent directory of "annotation_file"
         using "features" as stem, and bed as extension.
 
-        NOTE: "extended features" are used to validate consistency
-        with the caller: "conventional features" are extracted
-        on the fly, but intron and intergenic require special treatment.
+        NOTE: To merge features "gene-wise", GTF is first formatted
+        into bed-like format with name in the first field, sorted,
+        merged, then fields are re-ordered into standard bed-like
+        order and sorted. The equivalent command would be e.g.
+        awk 'OFS="\t" {print $4, $2, $3, $1, $5, $6, $7, $8}' exon.tmp |
+        sort -k1,1 -k2,2n | bedtools merge -i - -s -c 4,5,6,7,8 -o distinct |
+        awk 'OFS="\t" {print $4, $2, $3, $1, $5, $6, $7, $8}' |
+        sort -k1,1 -k2,2n > exon.bed.
 
         :param annotation_file: Path to annotation file.
         The format is implicitely assumed to be GTF.
@@ -349,6 +354,39 @@ def ensembl_to_bed_features(
         parent = annotation_file.parent
         annotation_bedtool = pybedtools.BedTool(annotation_file.as_posix()).sort()
 
+        def _get_gtf_attrs_by_name(feature):
+            line = [
+                feature.attrs["gene_name"]
+                if "gene_name" in feature.attrs
+                else feature.attrs["gene_id"],
+                feature.start,
+                feature.end,
+                feature.chrom,
+                feature.score,
+                feature.strand,
+                feature.attrs["gene_id"],
+                feature.attrs["gene_biotype"],
+            ]
+            return pybedtools.cbedtools.create_interval_from_list(line)
+
+        def _get_fields_by_name(feature):
+            line = [
+                feature.name,
+                feature.start,
+                feature.end,
+                feature.chrom,
+                feature.score,
+                feature.strand,
+                feature.fields[6],
+                feature.fields[7],
+            ]
+            return pybedtools.cbedtools.create_interval_from_list(line)
+
+        def _annotation_to_stream(feature, function=_get_gtf_attrs_by_name):
+            return annotation_bedtool.filter(lambda a: a.fields[2] == feature).each(
+                function
+            )
+
         logger.info(f"Preparing annotation and writing to {parent}...")
 
         for feature in features["conventional"]:
@@ -356,25 +394,31 @@ def ensembl_to_bed_features(
 
             file_name = Path(parent, f"{feature}.bed").as_posix()
             _ = (
-                self._annotation_to_stream(annotation_bedtool, feature)
+                _annotation_to_stream(feature)
+                .sort()
                 .merge(s=True, c=[4, 5, 6, 7, 8], o="distinct")
+                .each(_get_fields_by_name)
+                .sort()
                 .moveto(file_name)
             )
 
         file_name = Path(parent, "exon.bed").as_posix()
-        exons = pybedtools.BedTool(file_name)
+        exons = pybedtools.BedTool(file_name).each(_get_fields_by_name).sort()
         file_name = self._check_feature("intron", features, parent)
-        # why is the sort order lost after subtract?
         _ = (
-            self._annotation_to_stream(annotation_bedtool, "gene")
+            _annotation_to_stream("gene")
+            .sort()
             .subtract(exons, s=True, sorted=True)
             .sort()
             .merge(s=True, c=[4, 5, 6, 7, 8], o="distinct")
-        ).moveto(file_name)
+            .each(_get_fields_by_name)
+            .sort()
+            .moveto(file_name)
+        )
 
         file_name = self._check_feature("intergenic", features, parent)
         _ = (
-            self._annotation_to_stream(annotation_bedtool, "gene")
+            _annotation_to_stream("gene", function=_get_gtf_attrs)
             .complement(g=chrom_file.as_posix())
             .moveto(file_name)
         )
@@ -559,12 +603,6 @@ def getfasta(
         sequence = bedtool.sequence(fi=fasta, s=is_strand)
         return sequence.seqfn
 
-    @staticmethod
-    def _annotation_to_stream(annotation_bedtool, feature):
-        return annotation_bedtool.filter(lambda a: a.fields[2] == feature).each(
-            _get_gtf_attrs
-        )
-
     @staticmethod
     def _check_feature(feature, features, parent):
         if feature not in features["extended"]: