From bdc0d7e5ea78f84407d617a645ba789e51701a7f Mon Sep 17 00:00:00 2001 From: Andrew Jesaitis Date: Wed, 11 Jan 2017 18:27:18 -0700 Subject: [PATCH] Fix for #1334: Refactor transcript annotation * Place extra data in attributes field * Simplify code path for annotation parsing --- ga4gh/server/datamodel/variants.py | 227 ++++++++----------- tests/datadriven/test_variant_annotations.py | 46 ++-- 2 files changed, 120 insertions(+), 153 deletions(-) diff --git a/ga4gh/server/datamodel/variants.py b/ga4gh/server/datamodel/variants.py index cc585bba1..f2bef1bec 100644 --- a/ga4gh/server/datamodel/variants.py +++ b/ga4gh/server/datamodel/variants.py @@ -20,7 +20,6 @@ import ga4gh.server.datamodel as datamodel import ga4gh.schemas.pb as pb -import ga4gh.schemas.ga4gh.common_pb2 as common_pb2 import ga4gh.schemas.protocol as protocol @@ -29,6 +28,8 @@ ANNOTATIONS_SNPEFF = "SNPEff" +# Utility functions for module + def isUnspecified(str): """ Checks whether a string is None or an @@ -37,6 +38,34 @@ def isUnspecified(str): return str == "" or str is None +def deepGetAttr(obj, path): + """ + Utility function which is similiar to std lib's getattr, but works + to resolve a dot-delimited path on an object. If path is not found + an `AttributeError` will be raised. + """ + return reduce(getattr, path.split('.'), obj) + + +def deepSetAttr(obj, path, val): + """ + Utility function which is similar to std lib's setattr, but + attempts to set a deep attribute on an object by resolving a + dot-delimited path. If path does not exist an `AttributeError` + will be raised`. + """ + first, sep, rest = path.rpartition('.') + return setattr(deepGetAttr(obj, first) if first else obj, rest, val) + + +_nothing = object() + + +def isEmptyIter(it): + """Return True iff the iterator is empty or exhausted""" + return next(it, _nothing) is _nothing + + class CallSet(datamodel.DatamodelObject): """ Class representing a CallSet. A CallSet basically represents the @@ -415,21 +444,6 @@ def generateVariant(self, referenceName, position, randomNumberGenerator): return variant -def _encodeValue(value): - if isinstance(value, (list, tuple)): - return [common_pb2.AttributeValue(string_value=str(v)) for v in value] - else: - return [common_pb2.AttributeValue(string_value=str(value))] - - -_nothing = object() - - -def isEmptyIter(it): - """Return True iff the iterator is empty or exhausted""" - return next(it, _nothing) is _nothing - - class HtslibVariantSet(datamodel.PysamDatamodelMixin, AbstractVariantSet): """ Class representing a single variant set backed by a directory of indexed @@ -993,16 +1007,19 @@ def _addTranscriptEffectLocations(self, effect, ann, variant): def _getRandomOntologyTerm(self, randomNumberGenerator): # TODO more mock options from simulated seqOnt? ontologyTuples = [ - ("intron_variant", "SO:0001627"), - ("exon_variant", "SO:0001791")] + ("intron_variant", "SO:0001627", "LOW"), + ("exon_variant", "SO:0001791", "LOW"), + ("misense_variant", "SO:0001583", "MODERATE"), + ("stop_gained", "SO:0001587", "HIGH")] term = protocol.OntologyTerm() - ontologyTuple = randomNumberGenerator.choice(ontologyTuples) - term.term, term.term_id = ontologyTuple[0], ontologyTuple[1] - return term + picked = randomNumberGenerator.choice(ontologyTuples) + term.term, term.term_id, impact = picked + return term, impact def _addTranscriptEffectOntologyTerm(self, effect, randomNumberGenerator): - effect.effects.add().CopyFrom( - self._getRandomOntologyTerm(randomNumberGenerator)) + term, impact = self._getRandomOntologyTerm(randomNumberGenerator) + effect.effects.add().CopyFrom(term) + protocol.setAttribute(effect.attributes.attr["impact"].values, impact) return effect def _generateAnalysisResult(self, effect, ann, randomNumberGenerator): @@ -1040,6 +1057,33 @@ class HtslibVariantAnnotationSet(AbstractVariantAnnotationSet): Class representing a single variant annotation derived from an annotated variant set. """ + CSQ_FIELDS = ("alternate_bases", "gene", "feature_id", + "featureType", "effects", "cdnaPos", "cdsPos", + "protPos", "aminos", "codons", "existingVar", + "distance", "strand", "sift", "polyPhen", + "motifName", "motifPos", "highInfPos", + "motifScoreChange") + VEP_FIELDS = ("alternate_bases", "effects", "impact", "symbol", + "geneName", "featureType", "feature_id", + "trBiotype", "exon", "intron", + "hgvs_annotation.transcript", + "hgvs_annotation.protein", "cdnaPos", "cdsPos", + "protPos", "aminos", "codons", "existingVar", + "distance", "strand", "symbolSource", "hgncId", + "hgvsOffset") + SNPEFF_FIELDS = ("alternate_bases", "effects", "impact", + "geneName", "geneId", "featureType", + "feature_id", "trBiotype", "rank", + "hgvs_annotation.transcript", + "hgvs_annotation.protein", "cdnaPos", "cdsPos", + "protPos", "distance", "errsWarns") + EXCLUDED_FIELDS = ("effects", "geneName", "gene", "geneId", "featureType", + "trBiotype", "rank", "cdnaPos", "cdsPos", + "protPos", "distance", "exon", "intron", + "aminos", "codons", "existingVar", "distance", + "strand", "symbolSource", "hgncId", + "hgvsOffset") + def __init__(self, variantSet, localId): super(HtslibVariantAnnotationSet, self).__init__(variantSet, localId) @@ -1126,19 +1170,10 @@ def getVariantAnnotations(self, referenceName, startPosition, endPosition): :param endPosition: :return: generator of protocol.VariantAnnotation """ - # TODO Refactor this so that we use the annotationType information - # where it makes most sense, and rename the various methods so that - # it's clear what program/version combination they operate on. variantIter = self._variantSet.getPysamVariants( referenceName, startPosition, endPosition) - if self._annotationType == ANNOTATIONS_SNPEFF: - transcriptConverter = self.convertTranscriptEffectSnpEff - elif self._annotationType == ANNOTATIONS_VEP_V82: - transcriptConverter = self.convertTranscriptEffectVEP - else: - transcriptConverter = self.convertTranscriptEffectCSQ for record in variantIter: - yield self.convertVariantAnnotation(record, transcriptConverter) + yield self.convertVariantAnnotation(record) def convertLocation(self, pos): """ @@ -1244,69 +1279,7 @@ def addLocations(self, effect, protPos, cdnaPos): self.addProteinLocation(effect, protPos) return effect - def convertTranscriptEffectCSQ(self, annStr, hgvsG): - """ - Takes the consequence string of an annotated VCF using a - CSQ field as opposed to ANN and returns an array of - transcript effects. - :param annStr: String - :param hgvsG: String - :return: [protocol.TranscriptEffect] - """ - # Allele|Gene|Feature|Feature_type|Consequence|cDNA_position| - # CDS_position|Protein_position|Amino_acids|Codons|Existing_variation| - # DISTANCE|STRAND|SIFT|PolyPhen|MOTIF_NAME|MOTIF_POS|HIGH_INF_POS|MOTIF_SCORE_CHANGE - - (alt, gene, featureId, featureType, effects, cdnaPos, - cdsPos, protPos, aminos, codons, existingVar, distance, - strand, sift, polyPhen, motifName, motifPos, - highInfPos, motifScoreChange) = annStr.split('|') - terms = effects.split("&") - transcriptEffects = [] - for term in terms: - transcriptEffects.append( - self._createCsqTranscriptEffect( - alt, term, protPos, - cdnaPos, featureId)) - return transcriptEffects - - def _createCsqTranscriptEffect( - self, alt, term, protPos, cdnaPos, featureId): - effect = self._createGaTranscriptEffect() - effect.alternate_bases = alt - effect.effects.extend(self.convertSeqOntology(term)) - effect.feature_id = featureId - # These are not present in the data - self.addLocations(effect, protPos, cdnaPos) - effect.id = self.getTranscriptEffectId(effect) - return effect - - def convertTranscriptEffectVEP(self, annStr, hgvsG): - """ - Takes the ANN string of a VEP generated VCF, splits it - and returns a populated GA4GH transcript effect object. - :param annStr: String - :param hgvsG: String - :return: effect protocol.TranscriptEffect - """ - effect = self._createGaTranscriptEffect() - (alt, effects, impact, symbol, geneName, featureType, - featureId, trBiotype, exon, intron, hgvsC, hgvsP, - cdnaPos, cdsPos, protPos, aminos, codons, - existingVar, distance, strand, symbolSource, - hgncId, hgvsOffset) = annStr.split('|') - effect.alternate_bases = alt - effect.effects.extend(self.convertSeqOntology(effects)) - effect.feature_id = featureId - effect.hgvs_annotation.CopyFrom(protocol.HGVSAnnotation()) - effect.hgvs_annotation.genomic = hgvsG - effect.hgvs_annotation.transcript = hgvsC - effect.hgvs_annotation.protein = hgvsP - self.addLocations(effect, protPos, cdnaPos) - effect.id = self.getTranscriptEffectId(effect) - return effect - - def convertTranscriptEffectSnpEff(self, annStr, hgvsG): + def convertTranscriptEffect(self, annStr, hgvsG): """ Takes the ANN string of a SnpEff generated VCF, splits it and returns a populated GA4GH transcript effect object. @@ -1315,17 +1288,25 @@ def convertTranscriptEffectSnpEff(self, annStr, hgvsG): :return: effect protocol.TranscriptEffect() """ effect = self._createGaTranscriptEffect() - # SnpEff and VEP don't agree on this :) - (alt, effects, impact, geneName, geneId, featureType, - featureId, trBiotype, rank, hgvsC, hgvsP, cdnaPos, - cdsPos, protPos, distance, errsWarns) = annStr.split('|') - effect.alternate_bases = alt - effect.effects.extend(self.convertSeqOntology(effects)) - effect.feature_id = featureId - effect.hgvs_annotation.genomic = hgvsG - effect.hgvs_annotation.transcript = hgvsC - effect.hgvs_annotation.protein = hgvsP - self.addLocations(effect, protPos, cdnaPos) + effect.hgvs_annotation.CopyFrom(protocol.HGVSAnnotation()) + annDict = dict() + if self._annotationType == ANNOTATIONS_SNPEFF: + annDict = dict(zip(self. SNPEFF_FIELDS, annStr.split("|"))) + elif self._annotationType == ANNOTATIONS_VEP_V82: + annDict = dict(zip(self.VEP_FIELDS, annStr.split("|"))) + else: + annDict = dict(zip(self.CSQ_FIELDS, annStr.split("|"))) + annDict["hgvs_annotation.genomic"] = hgvsG if hgvsG else u'' + for key, val in annDict.items(): + try: + deepSetAttr(effect, key, val) + except AttributeError: + if val and key not in self.EXCLUDED_FIELDS: + protocol.setAttribute( + effect.attributes.attr[key].values, val) + effect.effects.extend(self.convertSeqOntology(annDict.get('effects'))) + self.addLocations( + effect, annDict.get('protPos'), annDict.get('cdnaPos')) effect.id = self.getTranscriptEffectId(effect) return effect @@ -1341,7 +1322,7 @@ def convertSeqOntology(self, seqOntStr): self._ontology.getGaTermByName(soName) for soName in seqOntStr.split('&')] - def convertVariantAnnotation(self, record, transcriptConverter): + def convertVariantAnnotation(self, record): """ Converts the specfied pysam variant record into a GA4GH variant annotation object using the specified function to convert the @@ -1350,33 +1331,13 @@ def convertVariantAnnotation(self, record, transcriptConverter): variant = self._variantSet.convertVariant(record, []) annotation = self._createGaVariantAnnotation() annotation.variant_id = variant.id + gDots = record.info.get(b'HGVS.g') # Convert annotations from INFO field into TranscriptEffect transcriptEffects = [] - hgvsG = record.info.get(b'HGVS.g') - if transcriptConverter != self.convertTranscriptEffectCSQ: - annotations = record.info.get(b'ANN') - transcriptEffects = self._convertAnnotations( - annotations, variant, hgvsG, transcriptConverter) - else: - annotations = record.info.get('CSQ'.encode()) - transcriptEffects = [] - for ann in annotations: - transcriptEffects.extend( - self.convertTranscriptEffectCSQ(ann, hgvsG)) + annotations = record.info.get(b'ANN') or record.info.get(b'CSQ') + for i, ann in enumerate(annotations): + hgvsG = gDots[i % len(variant.alternate_bases)] if gDots else None + transcriptEffects.append(self.convertTranscriptEffect(ann, hgvsG)) annotation.transcript_effects.extend(transcriptEffects) annotation.id = self.getVariantAnnotationId(variant, annotation) return variant, annotation - - def _convertAnnotations( - self, annotations, variant, hgvsG, transcriptConverter): - transcriptEffects = [] - if annotations is not None: - for index, ann in enumerate(annotations): - altshgvsG = "" - if hgvsG is not None: - # The HGVS.g field contains an element for - # each alternate allele - altshgvsG = hgvsG[index % len(variant.alternate_bases)] - transcriptEffects.append( - transcriptConverter(ann, altshgvsG)) - return transcriptEffects diff --git a/tests/datadriven/test_variant_annotations.py b/tests/datadriven/test_variant_annotations.py index 68f8c470f..ce1b9f57d 100644 --- a/tests/datadriven/test_variant_annotations.py +++ b/tests/datadriven/test_variant_annotations.py @@ -109,6 +109,7 @@ def splitAnnField(self, annfield): "hgvsP", "cdnaPos", "cdsPos", "protPos", "distance", "errsWarns"] values = annfield.split('|') + values[1] = values[1].split('&') # split effects return dict(zip(fields, values)) def getDataModelInstance(self, localId, dataPath): @@ -155,35 +156,43 @@ def _verifyVariantAnnotationsEqual( if 'ANN' in pyvcfVariant.INFO: pyvcfAnn = pyvcfVariant.INFO['ANN'] i = 0 - for pyvcfEffect, gaEffect in \ + for pyvcfEffect, gaEff in \ zip(pyvcfAnn, gaVariantAnnotation.transcript_effects): effectDict = self.splitAnnField(pyvcfEffect) self.assertEqual( - gaEffect.alternate_bases, effectDict['alt']) + map(lambda e: e.term, gaEff.effects), + effectDict['effects']) self.assertEqual( - gaEffect.feature_id, effectDict['featureId']) + gaEff.attributes.attr['impact'].values[0].string_value, + effectDict['impact']) self.assertEqual( - gaEffect.hgvs_annotation.transcript, + gaEff.alternate_bases, effectDict['alt']) + self.assertEqual( + gaEff.feature_id, effectDict['featureId']) + self.assertEqual( + gaEff.hgvs_annotation.transcript, effectDict['hgvsC']) self.assertEqual( - gaEffect.hgvs_annotation.protein, effectDict['hgvsP']) + gaEff.hgvs_annotation.protein, effectDict['hgvsP']) if 'HGVS.g' in pyvcfVariant.INFO: # Not all VCF have this field index = i % len(pyvcfVariant.INFO['HGVS.g']) self.assertEqual( - gaEffect.hgvs_annotation.genomic, + gaEff.hgvs_annotation.genomic, pyvcfVariant.INFO['HGVS.g'][index]) i += 1 elif 'CSQ' in pyvcfVariant.INFO: pyvcfAnn = pyvcfVariant.INFO['CSQ'] - transcriptEffects = [] - for ann in pyvcfAnn: - transcriptEffects += self._splitCsqEffects(ann) - for treff, gaEffect in zip( + transcriptEffects = [ + self._splitCsqEffects(ann) for ann in pyvcfAnn] + for treff, gaEff in zip( transcriptEffects, gaVariantAnnotation.transcript_effects): - self.assertEqual(gaEffect.alternate_bases, treff['alt']) - self.assertEqual(gaEffect.feature_id, treff['featureId']) + self.assertEqual( + map(lambda e: e.term, gaEff.effects), + treff['effects']) + self.assertEqual(gaEff.alternate_bases, treff['alt']) + self.assertEqual(gaEff.feature_id, treff['featureId']) self.assertGreater(len(gaVariantAnnotation.transcript_effects), 0) def _splitCsqEffects(self, annStr): @@ -192,14 +201,11 @@ def _splitCsqEffects(self, annStr): distance, strand, sift, polyPhen, motifName, motifPos, highInfPos, motifScoreChange) = annStr.split('|') - terms = effects.split("&") - treffs = [] - for term in terms: - effect = {} - effect['featureId'] = featureId - effect['alt'] = alt - treffs.append(effect) - return treffs + effect = {} + effect['featureId'] = featureId + effect['alt'] = alt + effect['effects'] = effects.split("&") + return effect def testVariantsValid(self): end = datamodel.PysamDatamodelMixin.vcfMax