Skip to content
This repository has been archived by the owner on Jan 24, 2018. It is now read-only.

Commit

Permalink
Fix for #1334: Refactor transcript annotation
Browse files Browse the repository at this point in the history
* Place extra data in attributes field
* Simplify code path for annotation parsing
  • Loading branch information
andrewjesaitis committed Feb 10, 2017
1 parent c5306d5 commit f492fe5
Show file tree
Hide file tree
Showing 2 changed files with 104 additions and 144 deletions.
221 changes: 88 additions & 133 deletions ga4gh/server/datamodel/variants.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,37 @@
ANNOTATIONS_SNPEFF = "SNPEff"


# Utility functions for module

def isUnspecified(str):
"""
Checks whether a string is None or an
empty string. Returns a boolean.
"""
return str == "" or str is None

def deepGetAttr(obj, path):
"""
Utility function which is similiar to std lib's getattr, but works to resolve
a dot-delimited path on an object. If path is not found an `AttributeError` will
be raised.
"""
return reduce(getattr, path.split('.'), obj)

def deepSetAttr(obj, path, val):
"""
Utility function which is similar to std lib's setattr, but attempts to set
a deep attribute on an object by resolving a dot-delimited path. If path does not
exist an `AttributeError` will be raised`.
"""
first, sep, rest = path.rpartition('.')
return setattr(deepGetAttr(obj, first) if first else obj, rest, val)

_nothing = object()

def isEmptyIter(it):
"""Return True iff the iterator is empty or exhausted"""
return next(it, _nothing) is _nothing

class CallSet(datamodel.DatamodelObject):
"""
Expand Down Expand Up @@ -414,22 +438,6 @@ def generateVariant(self, referenceName, position, randomNumberGenerator):
variant.id = self.getVariantId(variant)
return variant


def _encodeValue(value):
if isinstance(value, (list, tuple)):
return [common_pb2.AttributeValue(string_value=str(v)) for v in value]
else:
return [common_pb2.AttributeValue(string_value=str(value))]


_nothing = object()


def isEmptyIter(it):
"""Return True iff the iterator is empty or exhausted"""
return next(it, _nothing) is _nothing


class HtslibVariantSet(datamodel.PysamDatamodelMixin, AbstractVariantSet):
"""
Class representing a single variant set backed by a directory of indexed
Expand Down Expand Up @@ -993,16 +1001,18 @@ def _addTranscriptEffectLocations(self, effect, ann, variant):
def _getRandomOntologyTerm(self, randomNumberGenerator):
# TODO more mock options from simulated seqOnt?
ontologyTuples = [
("intron_variant", "SO:0001627"),
("exon_variant", "SO:0001791")]
("intron_variant", "SO:0001627", "LOW"),
("exon_variant", "SO:0001791", "LOW"),
("misense_variant", "SO:0001583", "MODERATE"),
("stop_gained", "SO:0001587", "HIGH")]
term = protocol.OntologyTerm()
ontologyTuple = randomNumberGenerator.choice(ontologyTuples)
term.term, term.term_id = ontologyTuple[0], ontologyTuple[1]
return term
term.term, term.term_id, impact = randomNumberGenerator.choice(ontologyTuples)
return term, impact

def _addTranscriptEffectOntologyTerm(self, effect, randomNumberGenerator):
effect.effects.add().CopyFrom(
self._getRandomOntologyTerm(randomNumberGenerator))
term, impact = self._getRandomOntologyTerm(randomNumberGenerator)
effect.effects.add().CopyFrom(term)
protocol.setAttribute(effect.attributes.attr["impact"].values, impact)
return effect

def _generateAnalysisResult(self, effect, ann, randomNumberGenerator):
Expand Down Expand Up @@ -1040,6 +1050,33 @@ class HtslibVariantAnnotationSet(AbstractVariantAnnotationSet):
Class representing a single variant annotation derived from an
annotated variant set.
"""
CSQ_FIELDS = ("alternate_bases", "gene", "feature_id",
"featureType", "effects", "cdnaPos", "cdsPos",
"protPos", "aminos", "codons", "existingVar",
"distance", "strand", "sift", "polyPhen",
"motifName", "motifPos", "highInfPos",
"motifScoreChange")
VEP_FIELDS = ("alternate_bases", "effects", "impact", "symbol",
"geneName", "featureType", "feature_id",
"trBiotype", "exon", "intron",
"hgvs_annotation.transcript",
"hgvs_annotation.protein", "cdnaPos", "cdsPos",
"protPos", "aminos", "codons", "existingVar",
"distance", "strand", "symbolSource", "hgncId",
"hgvsOffset")
SNPEFF_FIELDS = ("alternate_bases", "effects", "impact",
"geneName", "geneId", "featureType",
"feature_id", "trBiotype", "rank",
"hgvs_annotation.transcript",
"hgvs_annotation.protein", "cdnaPos", "cdsPos",
"protPos", "distance", "errsWarns")
EXCLUDED_FIELDS = ("effects", "geneName", "gene", "geneId", "featureType",
"trBiotype", "rank", "cdnaPos", "cdsPos",
"protPos", "distance", "exon", "intron",
"aminos", "codons", "existingVar", "distance",
"strand", "symbolSource", "hgncId",
"hgvsOffset")

def __init__(self, variantSet, localId):
super(HtslibVariantAnnotationSet, self).__init__(variantSet, localId)

Expand Down Expand Up @@ -1126,19 +1163,10 @@ def getVariantAnnotations(self, referenceName, startPosition, endPosition):
:param endPosition:
:return: generator of protocol.VariantAnnotation
"""
# TODO Refactor this so that we use the annotationType information
# where it makes most sense, and rename the various methods so that
# it's clear what program/version combination they operate on.
variantIter = self._variantSet.getPysamVariants(
referenceName, startPosition, endPosition)
if self._annotationType == ANNOTATIONS_SNPEFF:
transcriptConverter = self.convertTranscriptEffectSnpEff
elif self._annotationType == ANNOTATIONS_VEP_V82:
transcriptConverter = self.convertTranscriptEffectVEP
else:
transcriptConverter = self.convertTranscriptEffectCSQ
for record in variantIter:
yield self.convertVariantAnnotation(record, transcriptConverter)
yield self.convertVariantAnnotation(record)

def convertLocation(self, pos):
"""
Expand Down Expand Up @@ -1244,69 +1272,7 @@ def addLocations(self, effect, protPos, cdnaPos):
self.addProteinLocation(effect, protPos)
return effect

def convertTranscriptEffectCSQ(self, annStr, hgvsG):
"""
Takes the consequence string of an annotated VCF using a
CSQ field as opposed to ANN and returns an array of
transcript effects.
:param annStr: String
:param hgvsG: String
:return: [protocol.TranscriptEffect]
"""
# Allele|Gene|Feature|Feature_type|Consequence|cDNA_position|
# CDS_position|Protein_position|Amino_acids|Codons|Existing_variation|
# DISTANCE|STRAND|SIFT|PolyPhen|MOTIF_NAME|MOTIF_POS|HIGH_INF_POS|MOTIF_SCORE_CHANGE

(alt, gene, featureId, featureType, effects, cdnaPos,
cdsPos, protPos, aminos, codons, existingVar, distance,
strand, sift, polyPhen, motifName, motifPos,
highInfPos, motifScoreChange) = annStr.split('|')
terms = effects.split("&")
transcriptEffects = []
for term in terms:
transcriptEffects.append(
self._createCsqTranscriptEffect(
alt, term, protPos,
cdnaPos, featureId))
return transcriptEffects

def _createCsqTranscriptEffect(
self, alt, term, protPos, cdnaPos, featureId):
effect = self._createGaTranscriptEffect()
effect.alternate_bases = alt
effect.effects.extend(self.convertSeqOntology(term))
effect.feature_id = featureId
# These are not present in the data
self.addLocations(effect, protPos, cdnaPos)
effect.id = self.getTranscriptEffectId(effect)
return effect

def convertTranscriptEffectVEP(self, annStr, hgvsG):
"""
Takes the ANN string of a VEP generated VCF, splits it
and returns a populated GA4GH transcript effect object.
:param annStr: String
:param hgvsG: String
:return: effect protocol.TranscriptEffect
"""
effect = self._createGaTranscriptEffect()
(alt, effects, impact, symbol, geneName, featureType,
featureId, trBiotype, exon, intron, hgvsC, hgvsP,
cdnaPos, cdsPos, protPos, aminos, codons,
existingVar, distance, strand, symbolSource,
hgncId, hgvsOffset) = annStr.split('|')
effect.alternate_bases = alt
effect.effects.extend(self.convertSeqOntology(effects))
effect.feature_id = featureId
effect.hgvs_annotation.CopyFrom(protocol.HGVSAnnotation())
effect.hgvs_annotation.genomic = hgvsG
effect.hgvs_annotation.transcript = hgvsC
effect.hgvs_annotation.protein = hgvsP
self.addLocations(effect, protPos, cdnaPos)
effect.id = self.getTranscriptEffectId(effect)
return effect

def convertTranscriptEffectSnpEff(self, annStr, hgvsG):
def convertTranscriptEffect(self, annStr, hgvsG):
"""
Takes the ANN string of a SnpEff generated VCF, splits it
and returns a populated GA4GH transcript effect object.
Expand All @@ -1315,17 +1281,24 @@ def convertTranscriptEffectSnpEff(self, annStr, hgvsG):
:return: effect protocol.TranscriptEffect()
"""
effect = self._createGaTranscriptEffect()
# SnpEff and VEP don't agree on this :)
(alt, effects, impact, geneName, geneId, featureType,
featureId, trBiotype, rank, hgvsC, hgvsP, cdnaPos,
cdsPos, protPos, distance, errsWarns) = annStr.split('|')
effect.alternate_bases = alt
effect.effects.extend(self.convertSeqOntology(effects))
effect.feature_id = featureId
effect.hgvs_annotation.genomic = hgvsG
effect.hgvs_annotation.transcript = hgvsC
effect.hgvs_annotation.protein = hgvsP
self.addLocations(effect, protPos, cdnaPos)
effect.hgvs_annotation.CopyFrom(protocol.HGVSAnnotation())
annDict = dict()
if self._annotationType == ANNOTATIONS_SNPEFF:
annDict = dict(zip(self. SNPEFF_FIELDS, annStr.split("|")))
elif self._annotationType == ANNOTATIONS_VEP_V82:
annDict = dict(zip(self.VEP_FIELDS, annStr.split("|")))
else:
annDict = dict(zip(self.CSQ_FIELDS, annStr.split("|")))
annDict["hgvs_annotation.genomic"] = hgvsG if hgvsG else u''
attrDict = {}
for key, val in annDict.items():
try:
deepSetAttr(effect, key, val)
except AttributeError:
if val and key not in self.EXCLUDED_FIELDS:
protocol.setAttribute(effect.attributes.attr[key].values, val)
effect.effects.extend(self.convertSeqOntology(annDict.get('effects')))
self.addLocations(effect, annDict.get('protPos'), annDict.get('cdnaPos'))
effect.id = self.getTranscriptEffectId(effect)
return effect

Expand All @@ -1341,7 +1314,7 @@ def convertSeqOntology(self, seqOntStr):
self._ontology.getGaTermByName(soName)
for soName in seqOntStr.split('&')]

def convertVariantAnnotation(self, record, transcriptConverter):
def convertVariantAnnotation(self, record):
"""
Converts the specfied pysam variant record into a GA4GH variant
annotation object using the specified function to convert the
Expand All @@ -1350,33 +1323,15 @@ def convertVariantAnnotation(self, record, transcriptConverter):
variant = self._variantSet.convertVariant(record, [])
annotation = self._createGaVariantAnnotation()
annotation.variant_id = variant.id
hgvsGenomicAnnotations = record.info.get(b'HGVS.g')
# Convert annotations from INFO field into TranscriptEffect
transcriptEffects = []
hgvsG = record.info.get(b'HGVS.g')
if transcriptConverter != self.convertTranscriptEffectCSQ:
annotations = record.info.get(b'ANN')
transcriptEffects = self._convertAnnotations(
annotations, variant, hgvsG, transcriptConverter)
else:
annotations = record.info.get('CSQ'.encode())
transcriptEffects = []
for ann in annotations:
transcriptEffects.extend(
self.convertTranscriptEffectCSQ(ann, hgvsG))
annotations = record.info.get(b'ANN') or record.info.get(b'CSQ')
if "CSQ" in record.info.keys():
pass#import ipdb; ipdb.set_trace()
for idx, ann in enumerate(annotations):
hgvsG = hgvsGenomicAnnotations[idx % len(variant.alternate_bases)] if hgvsGenomicAnnotations else None
transcriptEffects.append(self.convertTranscriptEffect(ann, hgvsG))
annotation.transcript_effects.extend(transcriptEffects)
annotation.id = self.getVariantAnnotationId(variant, annotation)
return variant, annotation

def _convertAnnotations(
self, annotations, variant, hgvsG, transcriptConverter):
transcriptEffects = []
if annotations is not None:
for index, ann in enumerate(annotations):
altshgvsG = ""
if hgvsG is not None:
# The HGVS.g field contains an element for
# each alternate allele
altshgvsG = hgvsG[index % len(variant.alternate_bases)]
transcriptEffects.append(
transcriptConverter(ann, altshgvsG))
return transcriptEffects
27 changes: 16 additions & 11 deletions tests/datadriven/test_variant_annotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ def splitAnnField(self, annfield):
"hgvsP", "cdnaPos", "cdsPos", "protPos",
"distance", "errsWarns"]
values = annfield.split('|')
values[1] = values[1].split('&') # split effects
return dict(zip(fields, values))

def getDataModelInstance(self, localId, dataPath):
Expand Down Expand Up @@ -158,6 +159,12 @@ def _verifyVariantAnnotationsEqual(
for pyvcfEffect, gaEffect in \
zip(pyvcfAnn, gaVariantAnnotation.transcript_effects):
effectDict = self.splitAnnField(pyvcfEffect)
self.assertEqual(
map(lambda e: e.term, gaEffect.effects),
effectDict['effects'])
self.assertEqual(
gaEffect.attributes.attr['impact'].values[0].string_value,
effectDict['impact'])
self.assertEqual(
gaEffect.alternate_bases, effectDict['alt'])
self.assertEqual(
Expand All @@ -176,12 +183,13 @@ def _verifyVariantAnnotationsEqual(
i += 1
elif 'CSQ' in pyvcfVariant.INFO:
pyvcfAnn = pyvcfVariant.INFO['CSQ']
transcriptEffects = []
for ann in pyvcfAnn:
transcriptEffects += self._splitCsqEffects(ann)
transcriptEffects = [self._splitCsqEffects(ann) for ann in pyvcfAnn]
for treff, gaEffect in zip(
transcriptEffects,
gaVariantAnnotation.transcript_effects):
self.assertEqual(
map(lambda e: e.term, gaEffect.effects),
treff['effects'])
self.assertEqual(gaEffect.alternate_bases, treff['alt'])
self.assertEqual(gaEffect.feature_id, treff['featureId'])
self.assertGreater(len(gaVariantAnnotation.transcript_effects), 0)
Expand All @@ -192,14 +200,11 @@ def _splitCsqEffects(self, annStr):
distance, strand, sift, polyPhen, motifName,
motifPos, highInfPos,
motifScoreChange) = annStr.split('|')
terms = effects.split("&")
treffs = []
for term in terms:
effect = {}
effect['featureId'] = featureId
effect['alt'] = alt
treffs.append(effect)
return treffs
effect = {}
effect['featureId'] = featureId
effect['alt'] = alt
effect['effects'] = effects.split("&")
return effect

def testVariantsValid(self):
end = datamodel.PysamDatamodelMixin.vcfMax
Expand Down

0 comments on commit f492fe5

Please sign in to comment.