feat(#605): split guidelines by lookup keys

hpi-dhc · May 5, 2023 · 10ebaeb · 10ebaeb
1 parent ac3a19e
commit 10ebaeb
Show file tree

Hide file tree

Showing 15 changed files with 181 additions and 92 deletions.
diff --git a/anni/scripts/migrate.py b/anni/scripts/migrate.py
diff --git a/pharme.code-workspace b/pharme.code-workspace
@@ -23,6 +23,10 @@
     {
       "name": "🦄 Miscellaneous",
       "path": "miscellaneous"
+    },
+    {
+      "name": "📜 Scripts",
+      "path": "scripts"
     }
   ],
   "settings": {

diff --git a/anni/scripts/.gitignore → scripts/.gitignore b/anni/scripts/.gitignore → scripts/.gitignore
@@ -1,3 +1,5 @@
 *.json
 *.base64
 temp/
+
+.venv/
diff --git a/anni/scripts/README.md → scripts/README.md b/anni/scripts/README.md → scripts/README.md
@@ -1,14 +1,26 @@
 # Anni Data Scripts
 
 These scripts can be used to work on data backups.
-The scripts require `python3` to be installed.
+The scripts require `python` (3.X) and the packages defined in
+`requirements.txt` to be installed.
+
+Setup with `venv` (recommended):
+
+```bash
+python3 -m venv .venv
+source .venv/bin/activate
+pip install -r requirements.txt
+```
+
+🗒️ _Note: for VS Code, you might need to set the Python interpreter for_
+📜 _Scripts to the created `.venv`._
 
 As input, Anni backup data is assumed, either in JSON format, or Base64 format
 containing a zipped JSON.
 
 ## Migrate data
 
-Run `pyhthon3 migrate.py <PATH_TO_BACKUP>[.json|.base64]` to receive
+Run `python migrate.py <PATH_TO_BACKUP>[.json|.base64]` to receive
 `<PATH_TO_BACKUP>_migrated_<TIMESTAMP>.base64`.
 
 **⚠️ Migrating data will remove the data history!**
@@ -21,13 +33,17 @@ Run `pyhthon3 migrate.py <PATH_TO_BACKUP>[.json|.base64]` to receive
 * [Use phenotypes from cpic](https://github.com/hpi-dhc/PharMe/pull/602)
 * [Contract by phenotype first](https://github.com/hpi-dhc/PharMe/pull/604)
 
+🗒️ _Note: contraction by phenotype will not work for data initialized between
+[Use phenotypes from cpic](https://github.com/hpi-dhc/PharMe/pull/602) and
+[Contract by phenotype first](https://github.com/hpi-dhc/PharMe/pull/604)._
+
 ## Decode Base64
 
-Run `python3 decode.py <PATH_TO_BACKUP>.base64` to receive
+Run `python decode.py <PATH_TO_BACKUP>.base64` to receive
 `<PATH_TO_BACKUP>_decoded_<TIMESTAMP>.json`.
 
 ## Clean script outputs
 
-Run `python3 clean.py` to remove the `scripts/temp` directory and all files in
+Run `python clean.py` to remove the `scripts/temp` directory and all files in
 `scripts/` containing a postfix defined in `SCRIPT_POSTFIXES` (see
 `common.constants`).
diff --git a/anni/scripts/clean.py → scripts/clean.py b/anni/scripts/clean.py → scripts/clean.py
diff --git a/anni/scripts/common/constants.py → scripts/common/constants.py b/anni/scripts/common/constants.py → scripts/common/constants.py
diff --git a/anni/scripts/common/cpic_data.py → scripts/common/cpic_data.py b/anni/scripts/common/cpic_data.py → scripts/common/cpic_data.py
diff --git a/anni/scripts/common/get_data.py → scripts/common/get_data.py b/anni/scripts/common/get_data.py → scripts/common/get_data.py
@@ -58,3 +58,44 @@ def get_data():
 def get_guideline_by_id(data, id):
     """Return the guideline in `data['Guideline']` whose `_id` equals `id`.
 
     Raises an Exception if no guideline matches. A bare `next()` would raise
     StopIteration instead, which silently ends the iteration when this
     function is called from inside `map()`.
     """
     guideline = next(
         (guideline for guideline in data['Guideline']
             if guideline['_id'] == id),
         None)
     if guideline is None:
         raise Exception('[ERROR] No guideline found for id {}'.format(id))
     return guideline
+
+def get_phenotype_value_lengths(guideline, expect_same_length = False):
+    """Collect the distinct lengths of a guideline's per-gene value lists.
+
+    Looks at every value of `guideline['lookupkey']` and
+    `guideline['phenotypes']`.
+
+    Returns the sorted list of distinct lengths or, if `expect_same_length`
+    is set, the single common length.
+
+    Raises an Exception if `expect_same_length` is set and there is not
+    exactly one distinct length.
+    """
+    phenotype_values = list(guideline['lookupkey'].values()) + \
+        list(guideline['phenotypes'].values())
+    # Sort for a deterministic result; set iteration order is not guaranteed
+    phenotype_values_lengths = sorted(set(map(len, phenotype_values)))
+    if expect_same_length:
+        if len(phenotype_values_lengths) != 1:
+            raise Exception('[ERROR] Expecting lookupkey and phenotypes per ' \
+                            'gene to have same lengths but lengths differ ' \
+                            'for guideline {}'.format(guideline['_id']))
+        return phenotype_values_lengths[0]
+    return phenotype_values_lengths
+
+def get_phenotype_value(phenotype_values, index):
+    """Return a dict mapping each gene to its list entry at `index`."""
+    return {
+        gene: values[index]
+        for gene, values in phenotype_values.items()
+    }
+
+def dict_to_key(dictionary):
+    """Join the `key value` pairs of a dictionary, sorted by key, by spaces."""
+    key_value_pairs = sorted(dictionary.items())
+    return ' '.join(f'{key} {value}' for key, value in key_value_pairs)
+
+def get_phenotype_key(guideline):
+    """Return the string key built from the guideline's `phenotypes`."""
+    phenotypes = guideline['phenotypes']
+    return dict_to_key(phenotypes)
+
+def get_information_key(guideline):
+    """Build a string key from the guideline's single external data entry.
+
+    The key concatenates `comments` (empty if None), `recommendation`, and
+    the `implications` serialized with `dict_to_key`.
+
+    Raises an Exception if `externalData` does not hold exactly one element.
+    """
+    # Length of guideline['externalData'] should always be 1 as we just
+    # migrated it, but check just to be sure
+    if len(guideline['externalData']) != 1:
+        raise Exception('[ERROR] Expecting externalData to be list with one ' \
+                        'element')
+    external_data = guideline['externalData'][0]
+    comments = external_data['comments']
+    # Identity comparison is the idiomatic None check
+    information_key = comments if comments is not None else ''
+    information_key += external_data['recommendation']
+    information_key += dict_to_key(external_data['implications'])
+    return information_key
+
diff --git a/anni/scripts/common/make_temp_dir.py → scripts/common/make_temp_dir.py b/anni/scripts/common/make_temp_dir.py → scripts/common/make_temp_dir.py
diff --git a/scripts/common/mongo.py b/scripts/common/mongo.py
@@ -0,0 +1,4 @@
+import bson
+
+def get_object_id():
+    """Return a newly generated `bson.ObjectId` as a string."""
+    object_id = bson.ObjectId()
+    return str(object_id)
diff --git a/anni/scripts/common/remove_history.py → scripts/common/remove_history.py b/anni/scripts/common/remove_history.py → scripts/common/remove_history.py
diff --git a/anni/scripts/common/write_data.py → scripts/common/write_data.py b/anni/scripts/common/write_data.py → scripts/common/write_data.py
diff --git a/anni/scripts/decode.py → scripts/decode.py b/anni/scripts/decode.py → scripts/decode.py
diff --git a/scripts/migrate.py b/scripts/migrate.py
@@ -0,0 +1,109 @@
+from common.get_data import get_data, get_information_key, get_guideline_by_id, \
+    get_phenotype_value_lengths, get_phenotype_value, get_phenotype_key
+from common.write_data import write_data
+from common.constants import SCRIPT_POSTFIXES
+from common.cpic_data import get_phenotype_map
+from common.remove_history import remove_history
+from common.mongo import get_object_id
+
+# Rename `cpicData` in guidelines to `externalData` (#582)
+# Add `source` field to `externalData` with value 'CPIC' (#582)
+def rename_external_data(guideline):
+    """Move `cpicData` to `externalData` and tag it with source 'CPIC'.
+
+    Changes the guideline in place and returns it; guidelines without
+    `cpicData` are returned untouched.
+    """
+    if 'cpicData' not in guideline:
+        return guideline
+    external_data = guideline.pop('cpicData')
+    guideline['externalData'] = external_data
+    external_data['source'] = 'CPIC'
+    return guideline
+
+# Change `externalData` to array (#597)
+def enlist_external_data(guideline):
+    """Wrap `externalData` in a list unless it already is one.
+
+    Changes the guideline in place and returns it.
+    """
+    # isinstance is the idiomatic type check and also accepts list subclasses
+    if not isinstance(guideline['externalData'], list):
+        guideline['externalData'] = [guideline['externalData']]
+    return guideline
+
+# Add phenotypes for guideline (#602)
+def add_phenotypes(guideline, phenotype_map):
+    """Add `phenotypes` derived from `lookupkey` unless already present.
+
+    Every lookupkey entry of a gene is translated through
+    `phenotype_map[gene_symbol]`. Changes the guideline in place and
+    returns it.
+    """
+    if 'phenotypes' in guideline:
+        return guideline
+    guideline['phenotypes'] = {
+        gene_symbol: [
+            phenotype_map[gene_symbol][gene_result]
+            for gene_result in gene_results
+        ]
+        for gene_symbol, gene_results in guideline['lookupkey'].items()
+    }
+    return guideline
+
+# Chain single guideline migrations together
+def migrate_guideline(guideline, phenotype_map):
+    """Apply all single-guideline migrations in order and return the result."""
+    guideline = rename_external_data(guideline)
+    guideline = enlist_external_data(guideline)
+    return add_phenotypes(guideline, phenotype_map)
+
+# Contract external data by phenotypes (#597)
+# Split up previously contracted phenotypes (#604)
+# Contraction is implemented here analogous to
+# anni/src/common/database/helpers/cpic-constructors.py
+def contract_phenotypes_per_drug(guidelines):
+    """Split a drug's guidelines per lookupkey entry and group the copies.
+
+    Each guideline is copied once per index of its per-gene value lists; the
+    copies are grouped by phenotype key and then by information key.
+
+    NOTE: the grouping is not used yet (see TODO); the given guidelines are
+    returned unchanged as a new list.
+
+    Raises an Exception if a guideline's value lists differ in length or its
+    `externalData` does not have exactly one element.
+    """
+    # Split up by lookupkeys and group by phenotype and external information
+    grouped_guidelines = {}
+    for guideline in guidelines:
+        split_count = get_phenotype_value_lengths(
+            guideline, expect_same_length=True)
+        for index in range(split_count):
+            # Shallow copy: nested values such as externalData stay shared
+            split_guideline = {
+                **guideline,
+                '_id': get_object_id(),
+                'lookupkey': get_phenotype_value(
+                    guideline['lookupkey'], index),
+                'phenotypes': get_phenotype_value(
+                    guideline['phenotypes'], index),
+            }
+            phenotype_key = get_phenotype_key(split_guideline)
+            information_key = get_information_key(split_guideline)
+            by_information = grouped_guidelines.setdefault(phenotype_key, {})
+            by_information.setdefault(information_key, []).append(
+                split_guideline)
+    # TODO: Contract grouped guidelines
+    return list(guidelines)
+
+# Migrate data
+def migrate_data():
+    """Read the backup, migrate its guidelines, and write the result.
+
+    The data history is removed first. Guidelines in `Guideline` and, if
+    present, those embedded in `AppData` drugs are migrated; if the first
+    guideline has no `phenotypes` (data created before #602), guidelines are
+    additionally passed through `contract_phenotypes_per_drug` per drug.
+    """
+    data = remove_history(get_data())
+    phenotype_map = get_phenotype_map()
+
+    # If phenotypes are not present initially (data was created before #602),
+    # assume that guidelines also need to be contracted by phenotypes (#604);
+    # without any guidelines there is nothing to contract
+    contract_by_phenotypes = bool(data['Guideline']) and \
+        'phenotypes' not in data['Guideline'][0]
+
+    # Iterate data for migration of single guidelines and contract guidelines
+    # per drug afterwards (needs phenotypes); guidelines are migrated in place
+    for guideline in data['Guideline']:
+        migrate_guideline(guideline, phenotype_map)
+
+    if contract_by_phenotypes:
+        migrated_guidelines = []
+        for drug in data['Drug']:
+            # A list comprehension (instead of map) lets lookup errors surface
+            drug_guidelines = [
+                get_guideline_by_id(data, guideline_id)
+                for guideline_id in drug['guidelines']
+            ]
+            # extend keeps `Guideline` a flat list of guidelines
+            migrated_guidelines.extend(
+                contract_phenotypes_per_drug(drug_guidelines))
+        data['Guideline'] = migrated_guidelines
+
+    if 'AppData' in data:
+        for row in data['AppData']:
+            for drug in row['drugs']:
+                for guideline in drug['guidelines']:
+                    migrate_guideline(guideline, phenotype_map)
+                if contract_by_phenotypes:
+                    # contract_phenotypes_per_drug only takes the guidelines;
+                    # store its result back on the drug
+                    drug['guidelines'] = contract_phenotypes_per_drug(
+                        drug['guidelines'])
+
+    write_data(data, postfix=SCRIPT_POSTFIXES['migrate'])
+
+# Only run the migration when this file is executed as a script
+if __name__ == '__main__':
+    migrate_data()
diff --git a/scripts/requirements.txt b/scripts/requirements.txt
@@ -0,0 +1 @@
+pymongo==3.5.1