Skip to content

Commit

Permalink
Fix for ga4gh#1334: Refactor transcript annotation to place extra dat…
Browse files Browse the repository at this point in the history
…a in INFO
  • Loading branch information
andrewjesaitis committed Jan 18, 2017
1 parent 0cb4961 commit d8257d8
Show file tree
Hide file tree
Showing 2 changed files with 107 additions and 144 deletions.
234 changes: 101 additions & 133 deletions ga4gh/server/datamodel/variants.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,50 @@
ANNOTATIONS_SNPEFF = "SNPEff"


# Utility functions for module

def isUnspecified(str):
"""
Checks whether a string is None or an
empty string. Returns a boolean.
"""
return str == "" or str is None

def _encodeValue(value):
if isinstance(value, (list, tuple)):
return [struct_pb2.Value(string_value=str(v)) for v in value]
else:
return [struct_pb2.Value(string_value=str(value))]

def deepGetAttr(obj, path):
"""
Utility function which is similiar to std lib's getattr, but works to resolve
a dot-delimited path on an object. If path is not found an `AttributeError` will
be raised.
"""
return reduce(getattr, path.split('.'), obj)

def deepSetAttr(obj, path, val):
"""
Utility function which is similar to std lib's setattr, but attempts to set
a deep attribute on an object by resolving a dot-delimited path. If path does not
exist an `AttributeError` will be raised`.
"""
first, sep, rest = path.rpartition('.')
return setattr(deepGetAttr(obj, first) if first else obj, rest, val)


def setInfoField(msg, key, val):
"""
Sets the info field on the GA message with the the specified key/value pair
"""
msg.info[key].values.extend(_encodeValue(val))

_nothing = object()

def isEmptyIter(it):
"""Return True iff the iterator is empty or exhausted"""
return next(it, _nothing) is _nothing

class CallSet(datamodel.DatamodelObject):
"""
Expand Down Expand Up @@ -414,22 +451,6 @@ def generateVariant(self, referenceName, position, randomNumberGenerator):
variant.id = self.getVariantId(variant)
return variant


def _encodeValue(value):
if isinstance(value, (list, tuple)):
return [struct_pb2.Value(string_value=str(v)) for v in value]
else:
return [struct_pb2.Value(string_value=str(value))]


_nothing = object()


def isEmptyIter(it):
"""Return True iff the iterator is empty or exhausted"""
return next(it, _nothing) is _nothing


class HtslibVariantSet(datamodel.PysamDatamodelMixin, AbstractVariantSet):
"""
Class representing a single variant set backed by a directory of indexed
Expand Down Expand Up @@ -990,18 +1011,20 @@ def _addTranscriptEffectLocations(self, effect, ann, variant):
def _getRandomOntologyTerm(self, randomNumberGenerator):
# TODO more mock options from simulated seqOnt?
ontologyTuples = [
("intron_variant", "SO:0001627"),
("exon_variant", "SO:0001791")]
("intron_variant", "SO:0001627", "LOW"),
("exon_variant", "SO:0001791", "LOW"),
("misense_variant", "SO:0001583", "MODERATE"),
("stop_gained", "SO:0001587", "HIGH")]
term = protocol.OntologyTerm()
ontologyTuple = randomNumberGenerator.choice(ontologyTuples)
term.term, term.id = ontologyTuple[0], ontologyTuple[1]
term.term, term.id, impact = randomNumberGenerator.choice(ontologyTuples)
term.source_name = "ontology"
term.source_version = "0"
return term
return term, impact

def _addTranscriptEffectOntologyTerm(self, effect, randomNumberGenerator):
effect.effects.add().CopyFrom(
self._getRandomOntologyTerm(randomNumberGenerator))
term, impact = self._getRandomOntologyTerm(randomNumberGenerator)
effect.effects.add().CopyFrom(term)
setInfoField(effect, "impact", impact)
return effect

def _generateAnalysisResult(self, effect, ann, randomNumberGenerator):
Expand Down Expand Up @@ -1039,6 +1062,33 @@ class HtslibVariantAnnotationSet(AbstractVariantAnnotationSet):
Class representing a single variant annotation derived from an
annotated variant set.
"""
CSQ_FIELDS = ("alternate_bases", "gene", "feature_id",
"featureType", "effects", "cdnaPos", "cdsPos",
"protPos", "aminos", "codons", "existingVar",
"distance", "strand", "sift", "polyPhen",
"motifName", "motifPos", "highInfPos",
"motifScoreChange")
VEP_FIELDS = ("alternate_bases", "effects", "impact", "symbol",
"geneName", "featureType", "feature_id",
"trBiotype", "exon", "intron",
"hgvs_annotation.transcript",
"hgvs_annotation.protein", "cdnaPos", "cdsPos",
"protPos", "aminos", "codons", "existingVar",
"distance", "strand", "symbolSource", "hgncId",
"hgvsOffset")
SNPEFF_FIELDS = ("alternate_bases", "effects", "impact",
"geneName", "geneId", "featureType",
"feature_id", "trBiotype", "rank",
"hgvs_annotation.transcript",
"hgvs_annotation.protein", "cdnaPos", "cdsPos",
"protPos", "distance", "errsWarns")
EXCLUDED_FIELDS = ("effects", "geneName", "geneId", "featureType",
"trBiotype", "rank", "cdnaPos", "cdsPos",
"protPos", "distance", "exon", "intron",
"aminos", "codons", "existingVar", "distance",
"strand", "symbolSource", "hgncId",
"hgvsOffset")

def __init__(self, variantSet, localId):
super(HtslibVariantAnnotationSet, self).__init__(variantSet, localId)

Expand Down Expand Up @@ -1122,19 +1172,10 @@ def getVariantAnnotations(self, referenceName, startPosition, endPosition):
:param endPosition:
:return: generator of protocol.VariantAnnotation
"""
# TODO Refactor this so that we use the annotationType information
# where it makes most sense, and rename the various methods so that
# it's clear what program/version combination they operate on.
variantIter = self._variantSet.getPysamVariants(
referenceName, startPosition, endPosition)
if self._annotationType == ANNOTATIONS_SNPEFF:
transcriptConverter = self.convertTranscriptEffectSnpEff
elif self._annotationType == ANNOTATIONS_VEP_V82:
transcriptConverter = self.convertTranscriptEffectVEP
else:
transcriptConverter = self.convertTranscriptEffectCSQ
for record in variantIter:
yield self.convertVariantAnnotation(record, transcriptConverter)
yield self.convertVariantAnnotation(record)

def convertLocation(self, pos):
"""
Expand Down Expand Up @@ -1240,69 +1281,7 @@ def addLocations(self, effect, protPos, cdnaPos):
self.addProteinLocation(effect, protPos)
return effect

def convertTranscriptEffectCSQ(self, annStr, hgvsG):
"""
Takes the consequence string of an annotated VCF using a
CSQ field as opposed to ANN and returns an array of
transcript effects.
:param annStr: String
:param hgvsG: String
:return: [protocol.TranscriptEffect]
"""
# Allele|Gene|Feature|Feature_type|Consequence|cDNA_position|
# CDS_position|Protein_position|Amino_acids|Codons|Existing_variation|
# DISTANCE|STRAND|SIFT|PolyPhen|MOTIF_NAME|MOTIF_POS|HIGH_INF_POS|MOTIF_SCORE_CHANGE

(alt, gene, featureId, featureType, effects, cdnaPos,
cdsPos, protPos, aminos, codons, existingVar, distance,
strand, sift, polyPhen, motifName, motifPos,
highInfPos, motifScoreChange) = annStr.split('|')
terms = effects.split("&")
transcriptEffects = []
for term in terms:
transcriptEffects.append(
self._createCsqTranscriptEffect(
alt, term, protPos,
cdnaPos, featureId))
return transcriptEffects

def _createCsqTranscriptEffect(
self, alt, term, protPos, cdnaPos, featureId):
effect = self._createGaTranscriptEffect()
effect.alternate_bases = alt
effect.effects.extend(self.convertSeqOntology(term))
effect.feature_id = featureId
# These are not present in the data
self.addLocations(effect, protPos, cdnaPos)
effect.id = self.getTranscriptEffectId(effect)
return effect

def convertTranscriptEffectVEP(self, annStr, hgvsG):
"""
Takes the ANN string of a VEP generated VCF, splits it
and returns a populated GA4GH transcript effect object.
:param annStr: String
:param hgvsG: String
:return: effect protocol.TranscriptEffect
"""
effect = self._createGaTranscriptEffect()
(alt, effects, impact, symbol, geneName, featureType,
featureId, trBiotype, exon, intron, hgvsC, hgvsP,
cdnaPos, cdsPos, protPos, aminos, codons,
existingVar, distance, strand, symbolSource,
hgncId, hgvsOffset) = annStr.split('|')
effect.alternate_bases = alt
effect.effects.extend(self.convertSeqOntology(effects))
effect.feature_id = featureId
effect.hgvs_annotation.CopyFrom(protocol.HGVSAnnotation())
effect.hgvs_annotation.genomic = hgvsG
effect.hgvs_annotation.transcript = hgvsC
effect.hgvs_annotation.protein = hgvsP
self.addLocations(effect, protPos, cdnaPos)
effect.id = self.getTranscriptEffectId(effect)
return effect

def convertTranscriptEffectSnpEff(self, annStr, hgvsG):
def convertTranscriptEffect(self, annStr, hgvsG):
"""
Takes the ANN string of a SnpEff generated VCF, splits it
and returns a populated GA4GH transcript effect object.
Expand All @@ -1311,17 +1290,24 @@ def convertTranscriptEffectSnpEff(self, annStr, hgvsG):
:return: effect protocol.TranscriptEffect()
"""
effect = self._createGaTranscriptEffect()
# SnpEff and VEP don't agree on this :)
(alt, effects, impact, geneName, geneId, featureType,
featureId, trBiotype, rank, hgvsC, hgvsP, cdnaPos,
cdsPos, protPos, distance, errsWarns) = annStr.split('|')
effect.alternate_bases = alt
effect.effects.extend(self.convertSeqOntology(effects))
effect.feature_id = featureId
effect.hgvs_annotation.genomic = hgvsG
effect.hgvs_annotation.transcript = hgvsC
effect.hgvs_annotation.protein = hgvsP
self.addLocations(effect, protPos, cdnaPos)
effect.hgvs_annotation.CopyFrom(protocol.HGVSAnnotation())
annDict = dict()
if self._annotationType == ANNOTATIONS_SNPEFF:
annDict = dict(zip(self. SNPEFF_FIELDS, annStr.split("|")))
elif self._annotationType == ANNOTATIONS_VEP_V82:
annDict = dict(zip(self.VEP_FIELDS, annStr.split("|")))
else:
annDict = dict(zip(self.CSQ_FIELDS, annStr.split("|")))
annDict["hgvs_annotation.genomic"] = hgvsG if hgvsG else u''

for key, val in annDict.items():
try:
deepSetAttr(effect, key, val)
except AttributeError:
if val and key not in self.EXCLUDED_FIELDS:
setInfoField(effect, key, val)
effect.effects.extend(self.convertSeqOntology(annDict.get('effects')))
self.addLocations(effect, annDict.get('protPos'), annDict.get('cdnaPos'))
effect.id = self.getTranscriptEffectId(effect)
return effect

Expand All @@ -1337,7 +1323,7 @@ def convertSeqOntology(self, seqOntStr):
self._ontology.getGaTermByName(soName)
for soName in seqOntStr.split('&')]

def convertVariantAnnotation(self, record, transcriptConverter):
def convertVariantAnnotation(self, record):
"""
Converts the specfied pysam variant record into a GA4GH variant
annotation object using the specified function to convert the
Expand All @@ -1346,33 +1332,15 @@ def convertVariantAnnotation(self, record, transcriptConverter):
variant = self._variantSet.convertVariant(record, [])
annotation = self._createGaVariantAnnotation()
annotation.variant_id = variant.id
hgvsGenomicAnnotations = record.info.get(b'HGVS.g')
# Convert annotations from INFO field into TranscriptEffect
transcriptEffects = []
hgvsG = record.info.get(b'HGVS.g')
if transcriptConverter != self.convertTranscriptEffectCSQ:
annotations = record.info.get(b'ANN')
transcriptEffects = self._convertAnnotations(
annotations, variant, hgvsG, transcriptConverter)
else:
annotations = record.info.get('CSQ'.encode())
transcriptEffects = []
for ann in annotations:
transcriptEffects.extend(
self.convertTranscriptEffectCSQ(ann, hgvsG))
annotations = record.info.get(b'ANN') or record.info.get(b'CSQ')
if "CSQ" in record.info.keys():
pass#import ipdb; ipdb.set_trace()
for idx, ann in enumerate(annotations):
hgvsG = hgvsGenomicAnnotations[idx % len(variant.alternate_bases)] if hgvsGenomicAnnotations else None
transcriptEffects.append(self.convertTranscriptEffect(ann, hgvsG))
annotation.transcript_effects.extend(transcriptEffects)
annotation.id = self.getVariantAnnotationId(variant, annotation)
return variant, annotation

def _convertAnnotations(
self, annotations, variant, hgvsG, transcriptConverter):
transcriptEffects = []
if annotations is not None:
for index, ann in enumerate(annotations):
altshgvsG = ""
if hgvsG is not None:
# The HGVS.g field contains an element for
# each alternate allele
altshgvsG = hgvsG[index % len(variant.alternate_bases)]
transcriptEffects.append(
transcriptConverter(ann, altshgvsG))
return transcriptEffects
17 changes: 6 additions & 11 deletions tests/datadriven/test_variant_annotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,9 +175,7 @@ def _verifyVariantAnnotationsEqual(
i += 1
elif 'CSQ' in pyvcfVariant.INFO:
pyvcfAnn = pyvcfVariant.INFO['CSQ']
transcriptEffects = []
for ann in pyvcfAnn:
transcriptEffects += self._splitCsqEffects(ann)
transcriptEffects = [self._splitCsqEffects(ann) for ann in pyvcfAnn]
for treff, gaEffect in zip(
transcriptEffects,
gaVariantAnnotation.transcript_effects):
Expand All @@ -191,14 +189,11 @@ def _splitCsqEffects(self, annStr):
distance, strand, sift, polyPhen, motifName,
motifPos, highInfPos,
motifScoreChange) = annStr.split('|')
terms = effects.split("&")
treffs = []
for term in terms:
effect = {}
effect['featureId'] = featureId
effect['alt'] = alt
treffs.append(effect)
return treffs
effects = effects.split("&")
effect = {}
effect['featureId'] = featureId
effect['alt'] = alt
return effect

def testVariantsValid(self):
end = datamodel.PysamDatamodelMixin.vcfMax
Expand Down

0 comments on commit d8257d8

Please sign in to comment.