Fix for #1334: Refactor transcript annotation

* Place extra data in attributes field * Simplify code path for annotation parsing
ga4gh · Feb 16, 2017 · 5792e83 · 5792e83
1 parent cdfd148
commit 5792e83
Show file tree

Hide file tree

Showing 2 changed files with 100 additions and 153 deletions.
diff --git a/ga4gh/server/datamodel/variants.py b/ga4gh/server/datamodel/variants.py
@@ -20,7 +20,6 @@
 import ga4gh.server.datamodel as datamodel
 
 import ga4gh.schemas.pb as pb
-import ga4gh.schemas.ga4gh.common_pb2 as common_pb2
 import ga4gh.schemas.protocol as protocol
 
 
@@ -29,6 +28,8 @@
 ANNOTATIONS_SNPEFF = "SNPEff"
 
 
+# Utility functions for module
+
 def isUnspecified(str):
     """
     Checks whether a string is None or an
@@ -37,6 +38,14 @@ def isUnspecified(str):
     return str == "" or str is None
 
 
+_nothing = object()
+
+
+def isEmptyIter(it):
+    """Return True iff the iterator is empty or exhausted"""
+    return next(it, _nothing) is _nothing
+
+
 class CallSet(datamodel.DatamodelObject):
     """
     Class representing a CallSet. A CallSet basically represents the
@@ -415,21 +424,6 @@ def generateVariant(self, referenceName, position, randomNumberGenerator):
         return variant
 
 
-def _encodeValue(value):
-    if isinstance(value, (list, tuple)):
-        return [common_pb2.AttributeValue(string_value=str(v)) for v in value]
-    else:
-        return [common_pb2.AttributeValue(string_value=str(value))]
-
-
-_nothing = object()
-
-
-def isEmptyIter(it):
-    """Return True iff the iterator is empty or exhausted"""
-    return next(it, _nothing) is _nothing
-
-
 class HtslibVariantSet(datamodel.PysamDatamodelMixin, AbstractVariantSet):
     """
     Class representing a single variant set backed by a directory of indexed
@@ -993,16 +987,19 @@ def _addTranscriptEffectLocations(self, effect, ann, variant):
     def _getRandomOntologyTerm(self, randomNumberGenerator):
         # TODO more mock options from simulated seqOnt?
         ontologyTuples = [
-            ("intron_variant", "SO:0001627"),
-            ("exon_variant", "SO:0001791")]
+            ("intron_variant", "SO:0001627", "LOW"),
+            ("exon_variant", "SO:0001791", "LOW"),
+            ("misense_variant", "SO:0001583", "MODERATE"),
+            ("stop_gained", "SO:0001587", "HIGH")]
         term = protocol.OntologyTerm()
-        ontologyTuple = randomNumberGenerator.choice(ontologyTuples)
-        term.term, term.term_id = ontologyTuple[0], ontologyTuple[1]
-        return term
+        picked = randomNumberGenerator.choice(ontologyTuples)
+        term.term, term.term_id, impact = picked
+        return term, impact
 
     def _addTranscriptEffectOntologyTerm(self, effect, randomNumberGenerator):
-        effect.effects.add().CopyFrom(
-            self._getRandomOntologyTerm(randomNumberGenerator))
+        term, impact = self._getRandomOntologyTerm(randomNumberGenerator)
+        effect.effects.add().CopyFrom(term)
+        protocol.setAttribute(effect.attributes.attr["impact"].values, impact)
         return effect
 
     def _generateAnalysisResult(self, effect, ann, randomNumberGenerator):
@@ -1040,6 +1037,33 @@ class HtslibVariantAnnotationSet(AbstractVariantAnnotationSet):
     Class representing a single variant annotation derived from an
     annotated variant set.
     """
+    CSQ_FIELDS = ("alternate_bases", "gene", "feature_id",
+                  "featureType", "effects", "cdnaPos", "cdsPos",
+                  "protPos", "aminos", "codons", "existingVar",
+                  "distance", "strand", "sift", "polyPhen",
+                  "motifName", "motifPos", "highInfPos",
+                  "motifScoreChange")
+    VEP_FIELDS = ("alternate_bases", "effects", "impact", "symbol",
+                  "geneName", "featureType", "feature_id",
+                  "trBiotype", "exon", "intron",
+                  "hgvs_annotation.transcript",
+                  "hgvs_annotation.protein", "cdnaPos", "cdsPos",
+                  "protPos", "aminos", "codons", "existingVar",
+                  "distance", "strand", "symbolSource", "hgncId",
+                  "hgvsOffset")
+    SNPEFF_FIELDS = ("alternate_bases", "effects", "impact",
+                     "geneName", "geneId", "featureType",
+                     "feature_id", "trBiotype", "rank",
+                     "hgvs_annotation.transcript",
+                     "hgvs_annotation.protein", "cdnaPos", "cdsPos",
+                     "protPos", "distance", "errsWarns")
+    EXCLUDED_FIELDS = ("effects", "geneName", "gene", "geneId", "featureType",
+                       "trBiotype", "rank", "cdnaPos", "cdsPos",
+                       "protPos", "distance", "exon", "intron",
+                       "aminos", "codons", "existingVar", "distance",
+                       "strand", "symbolSource", "hgncId",
+                       "hgvsOffset")
+
     def __init__(self, variantSet, localId):
         super(HtslibVariantAnnotationSet, self).__init__(variantSet, localId)
 
@@ -1126,19 +1150,10 @@ def getVariantAnnotations(self, referenceName, startPosition, endPosition):
         :param endPosition:
         :return: generator of protocol.VariantAnnotation
         """
-        # TODO Refactor this so that we use the annotationType information
-        # where it makes most sense, and rename the various methods so that
-        # it's clear what program/version combination they operate on.
         variantIter = self._variantSet.getPysamVariants(
             referenceName, startPosition, endPosition)
-        if self._annotationType == ANNOTATIONS_SNPEFF:
-            transcriptConverter = self.convertTranscriptEffectSnpEff
-        elif self._annotationType == ANNOTATIONS_VEP_V82:
-            transcriptConverter = self.convertTranscriptEffectVEP
-        else:
-            transcriptConverter = self.convertTranscriptEffectCSQ
         for record in variantIter:
-            yield self.convertVariantAnnotation(record, transcriptConverter)
+            yield self.convertVariantAnnotation(record)
 
     def convertLocation(self, pos):
         """
@@ -1244,69 +1259,7 @@ def addLocations(self, effect, protPos, cdnaPos):
         self.addProteinLocation(effect, protPos)
         return effect
 
-    def convertTranscriptEffectCSQ(self, annStr, hgvsG):
-        """
-        Takes the consequence string of an annotated VCF using a
-        CSQ field as opposed to ANN and returns an array of
-        transcript effects.
-        :param annStr: String
-        :param hgvsG: String
-        :return: [protocol.TranscriptEffect]
-        """
-        # Allele|Gene|Feature|Feature_type|Consequence|cDNA_position|
-        # CDS_position|Protein_position|Amino_acids|Codons|Existing_variation|
-        # DISTANCE|STRAND|SIFT|PolyPhen|MOTIF_NAME|MOTIF_POS|HIGH_INF_POS|MOTIF_SCORE_CHANGE
-
-        (alt, gene, featureId, featureType, effects, cdnaPos,
-         cdsPos, protPos, aminos, codons, existingVar, distance,
-         strand, sift, polyPhen, motifName, motifPos,
-         highInfPos, motifScoreChange) = annStr.split('|')
-        terms = effects.split("&")
-        transcriptEffects = []
-        for term in terms:
-            transcriptEffects.append(
-                self._createCsqTranscriptEffect(
-                    alt, term, protPos,
-                    cdnaPos, featureId))
-        return transcriptEffects
-
-    def _createCsqTranscriptEffect(
-            self, alt, term, protPos, cdnaPos, featureId):
-        effect = self._createGaTranscriptEffect()
-        effect.alternate_bases = alt
-        effect.effects.extend(self.convertSeqOntology(term))
-        effect.feature_id = featureId
-        # These are not present in the data
-        self.addLocations(effect, protPos, cdnaPos)
-        effect.id = self.getTranscriptEffectId(effect)
-        return effect
-
-    def convertTranscriptEffectVEP(self, annStr, hgvsG):
-        """
-        Takes the ANN string of a VEP generated VCF, splits it
-        and returns a populated GA4GH transcript effect object.
-        :param annStr: String
-        :param hgvsG: String
-        :return: effect protocol.TranscriptEffect
-        """
-        effect = self._createGaTranscriptEffect()
-        (alt, effects, impact, symbol, geneName, featureType,
-         featureId, trBiotype, exon, intron, hgvsC, hgvsP,
-         cdnaPos, cdsPos, protPos, aminos, codons,
-         existingVar, distance, strand, symbolSource,
-         hgncId, hgvsOffset) = annStr.split('|')
-        effect.alternate_bases = alt
-        effect.effects.extend(self.convertSeqOntology(effects))
-        effect.feature_id = featureId
-        effect.hgvs_annotation.CopyFrom(protocol.HGVSAnnotation())
-        effect.hgvs_annotation.genomic = hgvsG
-        effect.hgvs_annotation.transcript = hgvsC
-        effect.hgvs_annotation.protein = hgvsP
-        self.addLocations(effect, protPos, cdnaPos)
-        effect.id = self.getTranscriptEffectId(effect)
-        return effect
-
-    def convertTranscriptEffectSnpEff(self, annStr, hgvsG):
+    def convertTranscriptEffect(self, annStr, hgvsG):
         """
         Takes the ANN string of a SnpEff generated VCF, splits it
         and returns a populated GA4GH transcript effect object.
@@ -1315,17 +1268,25 @@ def convertTranscriptEffectSnpEff(self, annStr, hgvsG):
         :return: effect protocol.TranscriptEffect()
         """
         effect = self._createGaTranscriptEffect()
-        # SnpEff and VEP don't agree on this :)
-        (alt, effects, impact, geneName, geneId, featureType,
-            featureId, trBiotype, rank, hgvsC, hgvsP, cdnaPos,
-            cdsPos, protPos, distance, errsWarns) = annStr.split('|')
-        effect.alternate_bases = alt
-        effect.effects.extend(self.convertSeqOntology(effects))
-        effect.feature_id = featureId
-        effect.hgvs_annotation.genomic = hgvsG
-        effect.hgvs_annotation.transcript = hgvsC
-        effect.hgvs_annotation.protein = hgvsP
-        self.addLocations(effect, protPos, cdnaPos)
+        effect.hgvs_annotation.CopyFrom(protocol.HGVSAnnotation())
+        annDict = dict()
+        if self._annotationType == ANNOTATIONS_SNPEFF:
+            annDict = dict(zip(self. SNPEFF_FIELDS, annStr.split("|")))
+        elif self._annotationType == ANNOTATIONS_VEP_V82:
+            annDict = dict(zip(self.VEP_FIELDS, annStr.split("|")))
+        else:
+            annDict = dict(zip(self.CSQ_FIELDS, annStr.split("|")))
+        annDict["hgvs_annotation.genomic"] = hgvsG if hgvsG else u''
+        for key, val in annDict.items():
+            try:
+                protocol.deepSetAttr(effect, key, val)
+            except AttributeError:
+                if val and key not in self.EXCLUDED_FIELDS:
+                    protocol.setAttribute(
+                        effect.attributes.attr[key].values, val)
+        effect.effects.extend(self.convertSeqOntology(annDict.get('effects')))
+        self.addLocations(
+            effect, annDict.get('protPos'), annDict.get('cdnaPos'))
         effect.id = self.getTranscriptEffectId(effect)
         return effect
 
@@ -1341,7 +1302,7 @@ def convertSeqOntology(self, seqOntStr):
             self._ontology.getGaTermByName(soName)
             for soName in seqOntStr.split('&')]
 
-    def convertVariantAnnotation(self, record, transcriptConverter):
+    def convertVariantAnnotation(self, record):
         """
         Converts the specfied pysam variant record into a GA4GH variant
         annotation object using the specified function to convert the
@@ -1350,33 +1311,13 @@ def convertVariantAnnotation(self, record, transcriptConverter):
         variant = self._variantSet.convertVariant(record, [])
         annotation = self._createGaVariantAnnotation()
         annotation.variant_id = variant.id
+        gDots = record.info.get(b'HGVS.g')
         # Convert annotations from INFO field into TranscriptEffect
         transcriptEffects = []
-        hgvsG = record.info.get(b'HGVS.g')
-        if transcriptConverter != self.convertTranscriptEffectCSQ:
-            annotations = record.info.get(b'ANN')
-            transcriptEffects = self._convertAnnotations(
-                annotations, variant, hgvsG, transcriptConverter)
-        else:
-            annotations = record.info.get('CSQ'.encode())
-            transcriptEffects = []
-            for ann in annotations:
-                transcriptEffects.extend(
-                    self.convertTranscriptEffectCSQ(ann, hgvsG))
+        annotations = record.info.get(b'ANN') or record.info.get(b'CSQ')
+        for i, ann in enumerate(annotations):
+            hgvsG = gDots[i % len(variant.alternate_bases)] if gDots else None
+            transcriptEffects.append(self.convertTranscriptEffect(ann, hgvsG))
         annotation.transcript_effects.extend(transcriptEffects)
         annotation.id = self.getVariantAnnotationId(variant, annotation)
         return variant, annotation
-
-    def _convertAnnotations(
-            self, annotations, variant, hgvsG, transcriptConverter):
-        transcriptEffects = []
-        if annotations is not None:
-            for index, ann in enumerate(annotations):
-                altshgvsG = ""
-                if hgvsG is not None:
-                    # The HGVS.g field contains an element for
-                    # each alternate allele
-                    altshgvsG = hgvsG[index % len(variant.alternate_bases)]
-                transcriptEffects.append(
-                    transcriptConverter(ann, altshgvsG))
-        return transcriptEffects
diff --git a/tests/datadriven/test_variant_annotations.py b/tests/datadriven/test_variant_annotations.py
@@ -109,6 +109,7 @@ def splitAnnField(self, annfield):
                       "hgvsP", "cdnaPos", "cdsPos", "protPos",
                       "distance", "errsWarns"]
         values = annfield.split('|')
+        values[1] = values[1].split('&')  # split effects
         return dict(zip(fields, values))
 
     def getDataModelInstance(self, localId, dataPath):
@@ -155,35 +156,43 @@ def _verifyVariantAnnotationsEqual(
             if 'ANN' in pyvcfVariant.INFO:
                 pyvcfAnn = pyvcfVariant.INFO['ANN']
                 i = 0
-                for pyvcfEffect, gaEffect in \
+                for pyvcfEffect, gaEff in \
                         zip(pyvcfAnn, gaVariantAnnotation.transcript_effects):
                     effectDict = self.splitAnnField(pyvcfEffect)
                     self.assertEqual(
-                        gaEffect.alternate_bases, effectDict['alt'])
+                        map(lambda e: e.term, gaEff.effects),
+                        effectDict['effects'])
                     self.assertEqual(
-                        gaEffect.feature_id, effectDict['featureId'])
+                        gaEff.attributes.attr['impact'].values[0].string_value,
+                        effectDict['impact'])
                     self.assertEqual(
-                        gaEffect.hgvs_annotation.transcript,
+                        gaEff.alternate_bases, effectDict['alt'])
+                    self.assertEqual(
+                        gaEff.feature_id, effectDict['featureId'])
+                    self.assertEqual(
+                        gaEff.hgvs_annotation.transcript,
                         effectDict['hgvsC'])
                     self.assertEqual(
-                        gaEffect.hgvs_annotation.protein, effectDict['hgvsP'])
+                        gaEff.hgvs_annotation.protein, effectDict['hgvsP'])
                     if 'HGVS.g' in pyvcfVariant.INFO:
                         # Not all VCF have this field
                         index = i % len(pyvcfVariant.INFO['HGVS.g'])
                         self.assertEqual(
-                            gaEffect.hgvs_annotation.genomic,
+                            gaEff.hgvs_annotation.genomic,
                             pyvcfVariant.INFO['HGVS.g'][index])
                     i += 1
             elif 'CSQ' in pyvcfVariant.INFO:
                 pyvcfAnn = pyvcfVariant.INFO['CSQ']
-                transcriptEffects = []
-                for ann in pyvcfAnn:
-                    transcriptEffects += self._splitCsqEffects(ann)
-                for treff, gaEffect in zip(
+                transcriptEffects = [
+                    self._splitCsqEffects(ann) for ann in pyvcfAnn]
+                for treff, gaEff in zip(
                         transcriptEffects,
                         gaVariantAnnotation.transcript_effects):
-                    self.assertEqual(gaEffect.alternate_bases, treff['alt'])
-                    self.assertEqual(gaEffect.feature_id, treff['featureId'])
+                    self.assertEqual(
+                        map(lambda e: e.term, gaEff.effects),
+                        treff['effects'])
+                    self.assertEqual(gaEff.alternate_bases, treff['alt'])
+                    self.assertEqual(gaEff.feature_id, treff['featureId'])
             self.assertGreater(len(gaVariantAnnotation.transcript_effects), 0)
 
     def _splitCsqEffects(self, annStr):
@@ -192,14 +201,11 @@ def _splitCsqEffects(self, annStr):
             distance, strand, sift, polyPhen, motifName,
             motifPos, highInfPos,
             motifScoreChange) = annStr.split('|')
-        terms = effects.split("&")
-        treffs = []
-        for term in terms:
-            effect = {}
-            effect['featureId'] = featureId
-            effect['alt'] = alt
-            treffs.append(effect)
-        return treffs
+        effect = {}
+        effect['featureId'] = featureId
+        effect['alt'] = alt
+        effect['effects'] = effects.split("&")
+        return effect
 
     def testVariantsValid(self):
         end = datamodel.PysamDatamodelMixin.vcfMax