Fix for #1334: Refactor transcript annotation

* Place extra data in attributes field * Simplify code path for annotation parsing
ga4gh · Feb 10, 2017 · f492fe5 · f492fe5
1 parent c5306d5
commit f492fe5
Show file tree

Hide file tree

Showing 2 changed files with 104 additions and 144 deletions.
diff --git a/ga4gh/server/datamodel/variants.py b/ga4gh/server/datamodel/variants.py
@@ -29,13 +29,37 @@
 ANNOTATIONS_SNPEFF = "SNPEff"
 
 
+# Utility functions for module
+
 def isUnspecified(str):
     """
     Checks whether a string is None or an
     empty string. Returns a boolean.
     """
     return str == "" or str is None
 
+def deepGetAttr(obj, path):
+    """
+    Utility function which is similiar to std lib's getattr, but works to resolve
+    a dot-delimited path on an object. If path is not found an `AttributeError` will
+    be raised.
+    """
+    return reduce(getattr, path.split('.'), obj)
+
+def deepSetAttr(obj, path, val):
+    """
+    Utility function which is similar to std lib's setattr, but attempts to set
+    a deep attribute on an object by resolving a dot-delimited path. If path does not
+    exist an `AttributeError` will be raised`.
+    """
+    first, sep, rest = path.rpartition('.')
+    return setattr(deepGetAttr(obj, first) if first else obj, rest, val)
+
+_nothing = object()
+
+def isEmptyIter(it):
+    """Return True iff the iterator is empty or exhausted"""
+    return next(it, _nothing) is _nothing
 
 class CallSet(datamodel.DatamodelObject):
     """
@@ -414,22 +438,6 @@ def generateVariant(self, referenceName, position, randomNumberGenerator):
         variant.id = self.getVariantId(variant)
         return variant
 
-
-def _encodeValue(value):
-    if isinstance(value, (list, tuple)):
-        return [common_pb2.AttributeValue(string_value=str(v)) for v in value]
-    else:
-        return [common_pb2.AttributeValue(string_value=str(value))]
-
-
-_nothing = object()
-
-
-def isEmptyIter(it):
-    """Return True iff the iterator is empty or exhausted"""
-    return next(it, _nothing) is _nothing
-
-
 class HtslibVariantSet(datamodel.PysamDatamodelMixin, AbstractVariantSet):
     """
     Class representing a single variant set backed by a directory of indexed
@@ -993,16 +1001,18 @@ def _addTranscriptEffectLocations(self, effect, ann, variant):
     def _getRandomOntologyTerm(self, randomNumberGenerator):
         # TODO more mock options from simulated seqOnt?
         ontologyTuples = [
-            ("intron_variant", "SO:0001627"),
-            ("exon_variant", "SO:0001791")]
+            ("intron_variant", "SO:0001627", "LOW"),
+            ("exon_variant", "SO:0001791", "LOW"),
+            ("misense_variant", "SO:0001583", "MODERATE"),
+            ("stop_gained", "SO:0001587", "HIGH")]
         term = protocol.OntologyTerm()
-        ontologyTuple = randomNumberGenerator.choice(ontologyTuples)
-        term.term, term.term_id = ontologyTuple[0], ontologyTuple[1]
-        return term
+        term.term, term.term_id, impact = randomNumberGenerator.choice(ontologyTuples)
+        return term, impact
 
     def _addTranscriptEffectOntologyTerm(self, effect, randomNumberGenerator):
-        effect.effects.add().CopyFrom(
-            self._getRandomOntologyTerm(randomNumberGenerator))
+        term, impact = self._getRandomOntologyTerm(randomNumberGenerator)
+        effect.effects.add().CopyFrom(term)
+        protocol.setAttribute(effect.attributes.attr["impact"].values, impact)
         return effect
 
     def _generateAnalysisResult(self, effect, ann, randomNumberGenerator):
@@ -1040,6 +1050,33 @@ class HtslibVariantAnnotationSet(AbstractVariantAnnotationSet):
     Class representing a single variant annotation derived from an
     annotated variant set.
     """
+    CSQ_FIELDS = ("alternate_bases", "gene", "feature_id",
+                  "featureType", "effects", "cdnaPos", "cdsPos",
+                  "protPos", "aminos", "codons", "existingVar",
+                  "distance", "strand", "sift", "polyPhen",
+                  "motifName", "motifPos", "highInfPos",
+                  "motifScoreChange")
+    VEP_FIELDS = ("alternate_bases", "effects", "impact", "symbol",
+                  "geneName", "featureType", "feature_id",
+                  "trBiotype", "exon", "intron",
+                  "hgvs_annotation.transcript",
+                  "hgvs_annotation.protein", "cdnaPos", "cdsPos",
+                  "protPos", "aminos", "codons", "existingVar",
+                  "distance", "strand", "symbolSource", "hgncId",
+                  "hgvsOffset")
+    SNPEFF_FIELDS = ("alternate_bases", "effects", "impact",
+                     "geneName", "geneId", "featureType",
+                     "feature_id", "trBiotype", "rank",
+                     "hgvs_annotation.transcript",
+                     "hgvs_annotation.protein", "cdnaPos", "cdsPos",
+                     "protPos", "distance", "errsWarns")
+    EXCLUDED_FIELDS = ("effects", "geneName", "gene", "geneId", "featureType",
+                       "trBiotype", "rank", "cdnaPos", "cdsPos",
+                       "protPos", "distance", "exon", "intron",
+                       "aminos", "codons", "existingVar", "distance",
+                       "strand", "symbolSource", "hgncId",
+                       "hgvsOffset")
+
     def __init__(self, variantSet, localId):
         super(HtslibVariantAnnotationSet, self).__init__(variantSet, localId)
 
@@ -1126,19 +1163,10 @@ def getVariantAnnotations(self, referenceName, startPosition, endPosition):
         :param endPosition:
         :return: generator of protocol.VariantAnnotation
         """
-        # TODO Refactor this so that we use the annotationType information
-        # where it makes most sense, and rename the various methods so that
-        # it's clear what program/version combination they operate on.
         variantIter = self._variantSet.getPysamVariants(
             referenceName, startPosition, endPosition)
-        if self._annotationType == ANNOTATIONS_SNPEFF:
-            transcriptConverter = self.convertTranscriptEffectSnpEff
-        elif self._annotationType == ANNOTATIONS_VEP_V82:
-            transcriptConverter = self.convertTranscriptEffectVEP
-        else:
-            transcriptConverter = self.convertTranscriptEffectCSQ
         for record in variantIter:
-            yield self.convertVariantAnnotation(record, transcriptConverter)
+            yield self.convertVariantAnnotation(record)
 
     def convertLocation(self, pos):
         """
@@ -1244,69 +1272,7 @@ def addLocations(self, effect, protPos, cdnaPos):
         self.addProteinLocation(effect, protPos)
         return effect
 
-    def convertTranscriptEffectCSQ(self, annStr, hgvsG):
-        """
-        Takes the consequence string of an annotated VCF using a
-        CSQ field as opposed to ANN and returns an array of
-        transcript effects.
-        :param annStr: String
-        :param hgvsG: String
-        :return: [protocol.TranscriptEffect]
-        """
-        # Allele|Gene|Feature|Feature_type|Consequence|cDNA_position|
-        # CDS_position|Protein_position|Amino_acids|Codons|Existing_variation|
-        # DISTANCE|STRAND|SIFT|PolyPhen|MOTIF_NAME|MOTIF_POS|HIGH_INF_POS|MOTIF_SCORE_CHANGE
-
-        (alt, gene, featureId, featureType, effects, cdnaPos,
-         cdsPos, protPos, aminos, codons, existingVar, distance,
-         strand, sift, polyPhen, motifName, motifPos,
-         highInfPos, motifScoreChange) = annStr.split('|')
-        terms = effects.split("&")
-        transcriptEffects = []
-        for term in terms:
-            transcriptEffects.append(
-                self._createCsqTranscriptEffect(
-                    alt, term, protPos,
-                    cdnaPos, featureId))
-        return transcriptEffects
-
-    def _createCsqTranscriptEffect(
-            self, alt, term, protPos, cdnaPos, featureId):
-        effect = self._createGaTranscriptEffect()
-        effect.alternate_bases = alt
-        effect.effects.extend(self.convertSeqOntology(term))
-        effect.feature_id = featureId
-        # These are not present in the data
-        self.addLocations(effect, protPos, cdnaPos)
-        effect.id = self.getTranscriptEffectId(effect)
-        return effect
-
-    def convertTranscriptEffectVEP(self, annStr, hgvsG):
-        """
-        Takes the ANN string of a VEP generated VCF, splits it
-        and returns a populated GA4GH transcript effect object.
-        :param annStr: String
-        :param hgvsG: String
-        :return: effect protocol.TranscriptEffect
-        """
-        effect = self._createGaTranscriptEffect()
-        (alt, effects, impact, symbol, geneName, featureType,
-         featureId, trBiotype, exon, intron, hgvsC, hgvsP,
-         cdnaPos, cdsPos, protPos, aminos, codons,
-         existingVar, distance, strand, symbolSource,
-         hgncId, hgvsOffset) = annStr.split('|')
-        effect.alternate_bases = alt
-        effect.effects.extend(self.convertSeqOntology(effects))
-        effect.feature_id = featureId
-        effect.hgvs_annotation.CopyFrom(protocol.HGVSAnnotation())
-        effect.hgvs_annotation.genomic = hgvsG
-        effect.hgvs_annotation.transcript = hgvsC
-        effect.hgvs_annotation.protein = hgvsP
-        self.addLocations(effect, protPos, cdnaPos)
-        effect.id = self.getTranscriptEffectId(effect)
-        return effect
-
-    def convertTranscriptEffectSnpEff(self, annStr, hgvsG):
+    def convertTranscriptEffect(self, annStr, hgvsG):
         """
         Takes the ANN string of a SnpEff generated VCF, splits it
         and returns a populated GA4GH transcript effect object.
@@ -1315,17 +1281,24 @@ def convertTranscriptEffectSnpEff(self, annStr, hgvsG):
         :return: effect protocol.TranscriptEffect()
         """
         effect = self._createGaTranscriptEffect()
-        # SnpEff and VEP don't agree on this :)
-        (alt, effects, impact, geneName, geneId, featureType,
-            featureId, trBiotype, rank, hgvsC, hgvsP, cdnaPos,
-            cdsPos, protPos, distance, errsWarns) = annStr.split('|')
-        effect.alternate_bases = alt
-        effect.effects.extend(self.convertSeqOntology(effects))
-        effect.feature_id = featureId
-        effect.hgvs_annotation.genomic = hgvsG
-        effect.hgvs_annotation.transcript = hgvsC
-        effect.hgvs_annotation.protein = hgvsP
-        self.addLocations(effect, protPos, cdnaPos)
+        effect.hgvs_annotation.CopyFrom(protocol.HGVSAnnotation())
+        annDict = dict()
+        if self._annotationType == ANNOTATIONS_SNPEFF:
+            annDict = dict(zip(self. SNPEFF_FIELDS, annStr.split("|")))
+        elif self._annotationType == ANNOTATIONS_VEP_V82:
+            annDict = dict(zip(self.VEP_FIELDS, annStr.split("|")))
+        else:
+            annDict = dict(zip(self.CSQ_FIELDS, annStr.split("|")))
+        annDict["hgvs_annotation.genomic"] = hgvsG if hgvsG else u''
+        attrDict = {}
+        for key, val in annDict.items():
+            try:
+                deepSetAttr(effect, key, val)
+            except AttributeError:
+                if val and key not in self.EXCLUDED_FIELDS:
+                    protocol.setAttribute(effect.attributes.attr[key].values, val)
+        effect.effects.extend(self.convertSeqOntology(annDict.get('effects')))
+        self.addLocations(effect, annDict.get('protPos'), annDict.get('cdnaPos'))
         effect.id = self.getTranscriptEffectId(effect)
         return effect
 
@@ -1341,7 +1314,7 @@ def convertSeqOntology(self, seqOntStr):
             self._ontology.getGaTermByName(soName)
             for soName in seqOntStr.split('&')]
 
-    def convertVariantAnnotation(self, record, transcriptConverter):
+    def convertVariantAnnotation(self, record):
         """
         Converts the specfied pysam variant record into a GA4GH variant
         annotation object using the specified function to convert the
@@ -1350,33 +1323,15 @@ def convertVariantAnnotation(self, record, transcriptConverter):
         variant = self._variantSet.convertVariant(record, [])
         annotation = self._createGaVariantAnnotation()
         annotation.variant_id = variant.id
+        hgvsGenomicAnnotations = record.info.get(b'HGVS.g')
         # Convert annotations from INFO field into TranscriptEffect
         transcriptEffects = []
-        hgvsG = record.info.get(b'HGVS.g')
-        if transcriptConverter != self.convertTranscriptEffectCSQ:
-            annotations = record.info.get(b'ANN')
-            transcriptEffects = self._convertAnnotations(
-                annotations, variant, hgvsG, transcriptConverter)
-        else:
-            annotations = record.info.get('CSQ'.encode())
-            transcriptEffects = []
-            for ann in annotations:
-                transcriptEffects.extend(
-                    self.convertTranscriptEffectCSQ(ann, hgvsG))
+        annotations = record.info.get(b'ANN') or record.info.get(b'CSQ')
+        if "CSQ" in record.info.keys():
+            pass#import ipdb; ipdb.set_trace()
+        for idx, ann in enumerate(annotations):
+            hgvsG = hgvsGenomicAnnotations[idx % len(variant.alternate_bases)] if hgvsGenomicAnnotations else None
+            transcriptEffects.append(self.convertTranscriptEffect(ann, hgvsG))
         annotation.transcript_effects.extend(transcriptEffects)
         annotation.id = self.getVariantAnnotationId(variant, annotation)
         return variant, annotation
-
-    def _convertAnnotations(
-            self, annotations, variant, hgvsG, transcriptConverter):
-        transcriptEffects = []
-        if annotations is not None:
-            for index, ann in enumerate(annotations):
-                altshgvsG = ""
-                if hgvsG is not None:
-                    # The HGVS.g field contains an element for
-                    # each alternate allele
-                    altshgvsG = hgvsG[index % len(variant.alternate_bases)]
-                transcriptEffects.append(
-                    transcriptConverter(ann, altshgvsG))
-        return transcriptEffects
diff --git a/tests/datadriven/test_variant_annotations.py b/tests/datadriven/test_variant_annotations.py
@@ -109,6 +109,7 @@ def splitAnnField(self, annfield):
                       "hgvsP", "cdnaPos", "cdsPos", "protPos",
                       "distance", "errsWarns"]
         values = annfield.split('|')
+        values[1] = values[1].split('&') # split effects
         return dict(zip(fields, values))
 
     def getDataModelInstance(self, localId, dataPath):
@@ -158,6 +159,12 @@ def _verifyVariantAnnotationsEqual(
                 for pyvcfEffect, gaEffect in \
                         zip(pyvcfAnn, gaVariantAnnotation.transcript_effects):
                     effectDict = self.splitAnnField(pyvcfEffect)
+                    self.assertEqual(
+                        map(lambda e: e.term, gaEffect.effects),
+                        effectDict['effects'])
+                    self.assertEqual(
+                        gaEffect.attributes.attr['impact'].values[0].string_value,
+                        effectDict['impact'])
                     self.assertEqual(
                         gaEffect.alternate_bases, effectDict['alt'])
                     self.assertEqual(
@@ -176,12 +183,13 @@ def _verifyVariantAnnotationsEqual(
                     i += 1
             elif 'CSQ' in pyvcfVariant.INFO:
                 pyvcfAnn = pyvcfVariant.INFO['CSQ']
-                transcriptEffects = []
-                for ann in pyvcfAnn:
-                    transcriptEffects += self._splitCsqEffects(ann)
+                transcriptEffects = [self._splitCsqEffects(ann) for ann in pyvcfAnn]
                 for treff, gaEffect in zip(
                         transcriptEffects,
                         gaVariantAnnotation.transcript_effects):
+                    self.assertEqual(
+                        map(lambda e: e.term, gaEffect.effects),
+                        treff['effects'])
                     self.assertEqual(gaEffect.alternate_bases, treff['alt'])
                     self.assertEqual(gaEffect.feature_id, treff['featureId'])
             self.assertGreater(len(gaVariantAnnotation.transcript_effects), 0)
@@ -192,14 +200,11 @@ def _splitCsqEffects(self, annStr):
             distance, strand, sift, polyPhen, motifName,
             motifPos, highInfPos,
             motifScoreChange) = annStr.split('|')
-        terms = effects.split("&")
-        treffs = []
-        for term in terms:
-            effect = {}
-            effect['featureId'] = featureId
-            effect['alt'] = alt
-            treffs.append(effect)
-        return treffs
+        effect = {}
+        effect['featureId'] = featureId
+        effect['alt'] = alt
+        effect['effects'] = effects.split("&")
+        return effect
 
     def testVariantsValid(self):
         end = datamodel.PysamDatamodelMixin.vcfMax