feat(#605): add phenotypes

hpi-dhc · May 4, 2023 · 7c44cd0 · 7c44cd0
1 parent 9c586ce
commit 7c44cd0
Show file tree

Hide file tree

Showing 3 changed files with 77 additions and 25 deletions.
diff --git a/anni/scripts/README.md b/anni/scripts/README.md
@@ -11,12 +11,13 @@ containing a zipped JSON.
 Run `python3 migrate.py <PATH_TO_BACKUP>[.json|.base64]` to receive
 `<PATH_TO_BACKUP>_migrated_<TIMESTAMP>.base64`.
 
-Breaking changes covered:
+(Breaking) changes covered:
 
 * [Add new medications (FDA)](https://github.com/hpi-dhc/PharMe/pull/582)
 * [One annotation per phenotype](https://github.com/hpi-dhc/PharMe/pull/597)
 * [Zipped Anni backup](https://github.com/hpi-dhc/PharMe/pull/599)
 * [Use phenotypes from cpic](https://github.com/hpi-dhc/PharMe/pull/602)
+* [Contract by phenotype first](https://github.com/hpi-dhc/PharMe/pull/604)
 
 ## Decode Base64
 

diff --git a/anni/scripts/common/cpic_data.py b/anni/scripts/common/cpic_data.py
@@ -0,0 +1,29 @@
+import json
+import urllib.request
+import urllib.parse
+
+
+def get_cpic_data(endpoint, params, timeout=60):
+    """Fetch a CPIC API endpoint and return the decoded JSON response.
+
+    Args:
+        endpoint: Path appended to the API base URL, e.g.
+            ``'recommendation'``.
+        params: Query parameters, URL-encoded with
+            ``urllib.parse.urlencode``.
+        timeout: Seconds to wait on blocking network operations; without
+            a limit an unresponsive server would hang the script forever.
+
+    Returns:
+        The response body parsed with ``json.loads``.
+
+    Raises:
+        urllib.error.URLError: If the request fails or times out.
+    """
+    base_url = 'https://api.cpicpgx.org/v1/'
+    url = base_url + endpoint + '?' + urllib.parse.urlencode(params)
+    with urllib.request.urlopen(url, timeout=timeout) as response:
+        return json.loads(response.read())
+
+def get_phenotype_map():
+    """Build a ``{gene: {lookup value: phenotype}}`` map from CPIC data.
+
+    Requests the ``lookupkey`` and ``phenotypes`` fields of the CPIC
+    ``recommendation`` endpoint. For every gene in a recommendation's
+    ``lookupkey`` the lookup value is mapped to that gene's entry in
+    ``phenotypes``; if the gene has no such entry, the lookup value itself
+    is used. The first mapping seen for a gene / lookup value is kept.
+
+    Returns:
+        Nested dict keyed by gene, then by lookup value.
+    """
+    # Would get gene but list of activity scores is not complete
+    recommendations = get_cpic_data('recommendation', params={
+        'select': 'lookupkey,phenotypes',
+    })
+    phenotype_map = {}
+    for recommendation in recommendations:
+        lookupkey = recommendation['lookupkey']
+        for gene in lookupkey:
+            lookup_value = lookupkey[gene]
+            if gene in recommendation['phenotypes']:
+                phenotype = recommendation['phenotypes'][gene]
+            else:
+                phenotype = lookup_value
+            # setdefault never overwrites, so earlier entries win
+            phenotype_map.setdefault(gene, {}).setdefault(
+                lookup_value, phenotype)
+    return phenotype_map
+
diff --git a/anni/scripts/migrate.py b/anni/scripts/migrate.py
@@ -1,6 +1,7 @@
 from common.get_data import get_data
 from common.write_data import write_data
 from common.constants import SCRIPT_POSTFIXES
+from common.cpic_data import get_phenotype_map
 
 # Rename `cpicData` in guidelines to `externalData` (#582)
 # Add `source` field to `externalData` with value 'CPIC' (#582)
@@ -19,30 +20,51 @@ def enlist_external_data(guideline):
     return guideline
 
 # Add phenotypes for guideline (#602)
-def add_phenotypes(guideline):
-    # TODO: get phenotypes from CPIC API based on lookupkey
+def add_phenotypes(guideline, phenotype_map):
+    """Add a ``phenotypes`` entry to ``guideline`` unless it has one.
+
+    For each gene symbol in ``guideline['lookupkey']`` every lookup value
+    is translated through ``phenotype_map[gene_symbol][value]``; a value
+    missing from the map raises ``KeyError``. The guideline is modified in
+    place and returned.
+    """
+    if 'phenotypes' not in guideline:
+        guideline['phenotypes'] = {
+            gene_symbol: [
+                phenotype_map[gene_symbol][gene_result]
+                for gene_result in gene_results
+            ]
+            for gene_symbol, gene_results in guideline['lookupkey'].items()
+        }
     return guideline
 
+# Do not contract different phenotypes (#604)
+def split_phenotypes(guidelines):
+    """Placeholder for splitting guidelines per phenotype (combination).
+
+    Not implemented yet: ``guidelines`` is returned unchanged.
+    """
+    # TODO: split up and copy guidelines per phenotype (combination)
+    return guidelines
+
 # Chain guideline migrations together
-def migrate_guideline(guideline):
-    return add_phenotypes(
-        enlist_external_data(
-            rename_external_data(guideline)))
-
-data = get_data()
-
-# Iterate data for migration of content
-for table_name in data.keys():
-    table_content = data[table_name]
-    if table_name.startswith('AppData'):
-        for row in table_content:
-            drugs = row['drugs']
-            for drug in drugs:
-                guidelines = drug['guidelines']
-                for guideline in guidelines:
-                    guideline = migrate_guideline(guideline)
-    if table_name.startswith('Guideline'):
-        for guideline in table_content:
-            guideline = migrate_guideline(guideline)
-
-write_data(data, postfix=SCRIPT_POSTFIXES['migrate'])
+def migrate_guideline(guideline, phenotype_map):
+    """Apply all per-guideline migrations in order and return the result.
+
+    Order: ``rename_external_data``, ``enlist_external_data``, then
+    ``add_phenotypes`` using ``phenotype_map``.
+
+    ``split_phenotypes`` is deliberately not part of this chain: it takes
+    a list of guidelines, and the caller already applies it to each
+    complete list after the per-guideline migrations.
+    """
+    guideline = rename_external_data(guideline)
+    guideline = enlist_external_data(guideline)
+    return add_phenotypes(guideline, phenotype_map)
+
+# Migrate data
+def migrate_data():
+    """Load the backup, migrate every guideline and write the result.
+
+    Guidelines are migrated in two places: nested under
+    ``row['drugs'][...]['guidelines']`` in tables whose name starts with
+    ``'AppData'``, and as the direct content of tables whose name starts
+    with ``'Guideline'``. The output is written with the ``'migrate'``
+    postfix from ``SCRIPT_POSTFIXES``.
+    """
+    data = get_data()
+    phenotype_map = get_phenotype_map()
+
+    def migrate_guidelines(guidelines):
+        # Collect the returned guidelines instead of rebinding a loop
+        # variable, so migrations that return new objects are not lost
+        return split_phenotypes([
+            migrate_guideline(guideline, phenotype_map)
+            for guideline in guidelines
+        ])
+
+    # Iterate data for migration of content
+    for table_name, table_content in data.items():
+        if table_name.startswith('AppData'):
+            for row in table_content:
+                for drug in row['drugs']:
+                    # Store the result; split_phenotypes may return a
+                    # different list once it is implemented
+                    drug['guidelines'] = migrate_guidelines(
+                        drug['guidelines'])
+        if table_name.startswith('Guideline'):
+            # Replacing the value of an existing key is safe while
+            # iterating, as the dict does not change size
+            data[table_name] = migrate_guidelines(table_content)
+
+    write_data(data, postfix=SCRIPT_POSTFIXES['migrate'])
+
+# Run the migration only when this file is executed as a script
+if __name__ == '__main__':
+    migrate_data()