Skip to content
This repository has been archived by the owner on Jan 24, 2018. It is now read-only.

Commit

Permalink
remove feature_id from ExpressionLevel (#1574)
Browse files Browse the repository at this point in the history
* remove feature_id from ExpressionLevel

* update tests

* Update rnaseq2ga.py

* Update rnaseq2ga.py

Simple change to force tests to rerun.

* Update rnaseq2ga.py

simple change to force rerun
  • Loading branch information
saupchurch authored and ejacox committed Mar 2, 2017
1 parent 0276ab0 commit d535648
Show file tree
Hide file tree
Showing 4 changed files with 27 additions and 60 deletions.
21 changes: 9 additions & 12 deletions ga4gh/server/datamodel/rna_quantification.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@ def __init__(self, parentContainer, localId):
super(AbstractExpressionLevel, self).__init__(
parentContainer, localId)
self._expression = 0.0
self._featureId = ""
self._isNormalized = ""
self._rawReadCount = 0.0
self._score = 0.0
Expand All @@ -50,7 +49,6 @@ def toProtocolElement(self):
protocolElement = protocol.ExpressionLevel()
protocolElement.id = self.getId()
protocolElement.name = self._name
protocolElement.feature_id = self._featureId
protocolElement.rna_quantification_id = self._parentContainer.getId()
protocolElement.raw_read_count = self._rawReadCount
protocolElement.expression = self._expression
Expand All @@ -71,7 +69,6 @@ def __init__(self, parentContainer, record):
super(SqliteExpressionLevel, self).__init__(
parentContainer, str(record["id"]))
self._expression = record["expression"]
self._featureId = record["feature_id"]
# sqlite stores booleans as int (False = 0, True = 1)
self._isNormalized = bool(record["is_normalized"])
self._rawReadCount = record["raw_read_count"]
Expand Down Expand Up @@ -326,15 +323,15 @@ def getDataUrl(self):
return self._dbFilePath

def getExpressionLevels(
self, threshold=0.0, featureIds=[], startIndex=0, maxResults=0):
self, threshold=0.0, names=[], startIndex=0, maxResults=0):
"""
Returns the list of ExpressionLevels in this RNA Quantification.
"""
rnaQuantificationId = self.getLocalId()
with self._db as dataSource:
expressionsReturned = dataSource.searchExpressionLevelsInDb(
rnaQuantificationId,
featureIds=featureIds,
names=names,
threshold=threshold,
startIndex=startIndex,
maxResults=maxResults)
Expand Down Expand Up @@ -392,7 +389,7 @@ def getRnaQuantificationById(self, rnaQuantificationId):
rnaQuantificationId)

def searchExpressionLevelsInDb(
self, rnaQuantId, featureIds=[], threshold=0.0, startIndex=0,
self, rnaQuantId, names=[], threshold=0.0, startIndex=0,
maxResults=0):
"""
:param rnaQuantId: string restrict search by quantification id
Expand All @@ -403,12 +400,12 @@ def searchExpressionLevelsInDb(
"rna_quantification_id = ? "
"AND expression > ? ")
sql_args = (rnaQuantId, threshold)
if len(featureIds) > 0:
sql += "AND feature_id in ("
sql += ",".join(['?' for featureId in featureIds])
if len(names) > 0:
sql += "AND name in ("
sql += ",".join(['?' for name in names])
sql += ") "
for featureId in featureIds:
sql_args += (featureId,)
for name in names:
sql_args += (name,)
sql += sqlite_backend.limitsSql(
startIndex=startIndex, maxResults=maxResults)
query = self._dbconn.execute(sql, sql_args)
Expand Down Expand Up @@ -468,7 +465,7 @@ def addExpressionLevel(self, expressionLevel):

# TODO this makes very little sense
def getExpressionLevels(
self, threshold=0.0, featureIds=[],
self, threshold=0.0, names=[],
startIndex=0, maxResults=0): # NOQA
return [self._expressionLevelIdMap[id_] for
id_ in self._expressionLevelIds]
Expand Down
2 changes: 1 addition & 1 deletion ga4gh/server/paging.py
Original file line number Diff line number Diff line change
Expand Up @@ -378,7 +378,7 @@ def _initialize(self):
def _search(self):
iterator = list(self._rnaQuant.getExpressionLevels(
threshold=self._request.threshold,
featureIds=self._request.feature_ids,
names=self._request.names,
startIndex=self._startIndex,
maxResults=self._maxResults))
return iterator
Expand Down
54 changes: 13 additions & 41 deletions ga4gh/server/repo/rnaseq2ga.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@ def createTables(self):
id INTEGER,
rna_quantification_id TEXT,
name TEXT,
feature_id TEXT,
expression REAL,
is_normalized BOOLEAN,
raw_read_count REAL,
Expand Down Expand Up @@ -69,7 +68,7 @@ def batchaddRNAQuantification(self):
def addExpression(self, datafields):
"""
Adds an Expression to the db. Datafields is a tuple in the order:
id, rna_quantification_id, name, feature_id, expression,
id, rna_quantification_id, name, expression,
is_normalized, raw_read_count, score, units, conf_low, conf_hi
"""
self._expressionValueList.append(datafields)
Expand All @@ -78,7 +77,7 @@ def addExpression(self, datafields):

def batchAddExpression(self):
if len(self._expressionValueList) > 0:
sql = "INSERT INTO Expression VALUES (?,?,?,?,?,?,?,?,?,?,?)"
sql = "INSERT INTO Expression VALUES (?,?,?,?,?,?,?,?,?,?)"
self._cursor.executemany(sql, self._expressionValueList)
self._dbConn.commit()
self._expressionValueList = []
Expand All @@ -89,8 +88,8 @@ def createIndices(self):
take a long time.
"""

sql = '''CREATE INDEX feature_id_index
ON Expression (feature_id)'''
sql = '''CREATE INDEX name_index
ON Expression (name)'''
self._cursor.execute(sql)
self._dbConn.commit()

Expand All @@ -111,7 +110,6 @@ def __init__(self, rnaDB, featureType="gene", dataset=None):
self._expressionLevelCol = None
self._idCol = None
self._nameCol = None
self._featureCol = None
self._countCol = None
self._confColLow = None
self._confColHi = None
Expand All @@ -135,20 +133,13 @@ def setColNum(self, header, name, defaultNum=None):
"Missing {} column in expression table.".format(name))
return colNum

def writeExpression(self, rnaQuantificationId, quantfilename,
featureSetNames=None):
def writeExpression(self, rnaQuantificationId, quantfilename):
"""
Reads the quantification results file and adds entries to the
specified database.
"""
isNormalized = self._isNormalized
units = self._units
featureSets = None
if self._dataset and featureSetNames:
featureSets = []
for annotationName in featureSetNames.split(","):
featureSets.append(
self._dataset.getFeatureSetByName(annotationName))
with open(quantfilename, "r") as quantFile:
quantificationReader = csv.reader(quantFile, delimiter=b"\t")
header = next(quantificationReader)
Expand All @@ -158,7 +149,6 @@ def writeExpression(self, rnaQuantificationId, quantfilename,
countColNum = self.setColNum(header, self._countCol, -1)
confColLowNum = self.setColNum(header, self._confColLow, -1)
confColHiNum = self.setColNum(header, self._confColHi, -1)
featureColNum = self.setColNum(header, self._featureCol)
expressionId = 0
for expression in quantificationReader:
expressionLevel = expression[expressionLevelColNum]
Expand All @@ -174,21 +164,9 @@ def writeExpression(self, rnaQuantificationId, quantfilename,
confidenceHi = float(expression[confColHiNum])
score = (confidenceLow + confidenceHi)/2

featureName = expression[featureColNum]
featureId = ""
if featureSets is not None:
for featureSet in featureSets:
if featureId == "":
for feature in featureSet.getFeatures(
name=featureName):
featureId = feature.id
break
else:
break
datafields = (expressionId, rnaQuantificationId, name,
featureId, expressionLevel, isNormalized,
rawCount, score, units, confidenceLow,
confidenceHi)
expressionLevel, isNormalized, rawCount, score,
units, confidenceLow, confidenceHi)
self._db.addExpression(datafields)
expressionId += 1
self._db.batchAddExpression()
Expand All @@ -210,8 +188,7 @@ def __init__(self, rnaDB, featureType, units="fpkm", dataset=None):
self._isNormalized = True
self._expressionLevelCol = "FPKM"
self._idCol = "tracking_id"
self._nameCol = "gene_short_name"
self._featureCol = "gene_id"
self._nameCol = "tracking_id"
self._confColLow = "FPKM_conf_lo"
self._confColHi = "FPKM_conf_hi"
self.setUnits(units)
Expand Down Expand Up @@ -241,7 +218,6 @@ def __init__(self, rnaDB, featureType, units="tpm", dataset=None):
rnaDB, featureType=featureType, dataset=dataset)
self._isNormalized = True
self._expressionLevelCol = "TPM"
self._featureCol = "gene_id"
self._confColLow = "TPM_ci_lower_bound"
self._confColHi = "TPM_ci_upper_bound"
self._countCol = "expected_count"
Expand All @@ -268,7 +244,6 @@ def __init__(self, rnaDB, featureType, units="tpm", dataset=None):
self._expressionLevelCol = "tpm"
self._idCol = "target_id"
self._nameCol = "target_id"
self._featureCol = "target_id"
self._countCol = "est_counts"
self.setUnits(units)

Expand All @@ -284,10 +259,9 @@ def writeRnaseqTable(rnaDB, analysisIds, description, annotationId,
rnaDB.batchaddRNAQuantification()


def writeExpressionTable(writer, data, featureSetNames=None):
for rnaQuantId, quantfilename in data:
writer.writeExpression(
rnaQuantId, quantfilename, featureSetNames=featureSetNames)
def writeExpressionTable(writer, data):
for rnaQuantId, quantFilename in data:
writer.writeExpression(rnaQuantId, quantFilename)


def rnaseq2ga(quantificationFilename, sqlFilename, localName, rnaType,
Expand All @@ -299,7 +273,7 @@ def rnaseq2ga(quantificationFilename, sqlFilename, localName, rnaType,
in a sqlite database for use by the GA4GH reference server.
Supports the following quantification output types:
Cufflinks, kallisto, RSEM
Cufflinks, kallisto, RSEM.
"""
readGroupSetName = ""
if readGroupSetNames:
Expand Down Expand Up @@ -330,6 +304,4 @@ def rnaseq2ga(quantificationFilename, sqlFilename, localName, rnaType,
writeRnaseqTable(rnaDB, [localName], description, featureSetIds,
readGroupId=readGroupIds, programs=programs,
biosampleId=biosampleId)
writeExpressionTable(
writer, [(localName, quantificationFilename)],
featureSetNames=featureSetNames)
writeExpressionTable(writer, [(localName, quantificationFilename)])
10 changes: 4 additions & 6 deletions tests/datadriven/test_rna_quantification.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,7 @@
"name": "ENSG00000076984.13",
"rna_quantification_id": "",
"expression": 24.52,
"feature_id": "ENSG00000076984.13",
"feature_ids": ["ENSG00000076984.14", "ENSG00000076984.13"],
"names": ["ENSG00000076984.14", "ENSG00000076984.13"],
"is_normalized": True,
"raw_read_count": 4317.0,
"score": 24.35,
Expand Down Expand Up @@ -143,7 +142,6 @@ def assertExpressionEqual(self, gaExpressionObj, testData):
compoundId = datamodel.ExpressionLevelCompoundId.parse(idString)
self.assertEqual(gaExpression.id, str(compoundId))
self.assertEqual(gaExpression.name, testData["name"])
self.assertEqual(gaExpression.feature_id, testData["feature_id"])
self.assertEqual(
gaExpression.rna_quantification_id,
str(gaExpressionObj.getParentContainer().getCompoundId()))
Expand Down Expand Up @@ -172,11 +170,11 @@ def testSearchExpressionLevels(self):
_expressionTestData["num_entries_over_threshold"],
len(overThreshold))

def testSearchExpressionLevelsWithFeatureIds(self):
def testSearchExpressionLevelsWithNames(self):
rnaQuantification = self._gaObject.getRnaQuantificationByIndex(0)
featureIds = _expressionTestData["feature_ids"]
names = _expressionTestData["names"]
expressionLevels = rnaQuantification.getExpressionLevels(
featureIds=featureIds)
names=names)
self.assertEqual(
_expressionTestData["num_expression_entries"],
len(expressionLevels))
Expand Down

0 comments on commit d535648

Please sign in to comment.