Skip to content

Commit

Permalink
feat(#605): split guidelines by lookup keys
Browse files Browse the repository at this point in the history
  • Loading branch information
tamslo committed May 5, 2023
1 parent ac3a19e commit 10ebaeb
Show file tree
Hide file tree
Showing 15 changed files with 181 additions and 92 deletions.
88 changes: 0 additions & 88 deletions anni/scripts/migrate.py

This file was deleted.

4 changes: 4 additions & 0 deletions pharme.code-workspace
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@
{
"name": "🦄 Miscellaneous",
"path": "miscellaneous"
},
{
"name": "📜 Scripts",
"path": "scripts"
}
],
"settings": {
Expand Down
2 changes: 2 additions & 0 deletions anni/scripts/.gitignore → scripts/.gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
*.json
*.base64
temp/

.venv/
24 changes: 20 additions & 4 deletions anni/scripts/README.md → scripts/README.md
Original file line number Diff line number Diff line change
@@ -1,14 +1,26 @@
# Anni Data Scripts

These scripts can be used to work on data backups.
The scripts require `python3` to be installed.
The scripts require `python` (3.X) and the packages defined in
`requirements.txt` to be installed.

Setup with `venv` (recommended):

```bash
python3 -m venv .venv
source .venv/bin/activate
pip install -r requirements.txt
```

🗒️ _Note: for VS Code, you might need to set the Python interpreter for_
📜 _Scripts to the created `.venv`._

As input, Anni backup data is assumed, either in JSON format, or Base64 format
containing a zipped JSON.

## Migrate data

Run `pyhthon3 migrate.py <PATH_TO_BACKUP>[.json|.base64]` to receive
Run `python migrate.py <PATH_TO_BACKUP>[.json|.base64]` to receive
`<PATH_TO_BACKUP>_migrated_<TIMESTAMP>.base64`.

**⚠️ Migrating data will remove the data history!**
Expand All @@ -21,13 +33,17 @@ Run `pyhthon3 migrate.py <PATH_TO_BACKUP>[.json|.base64]` to receive
* [Use phenotypes from cpic](https://github.com/hpi-dhc/PharMe/pull/602)
* [Contract by phenotype first](https://github.com/hpi-dhc/PharMe/pull/604)

🗒️ _Note: contraction by phenotype will not work for data initialized between
[Use phenotypes from cpic](https://github.com/hpi-dhc/PharMe/pull/602) and
[Contract by phenotype first](https://github.com/hpi-dhc/PharMe/pull/604)._

## Decode Base64

Run `python3 decode.py <PATH_TO_BACKUP>.base64` to receive
Run `python decode.py <PATH_TO_BACKUP>.base64` to receive
`<PATH_TO_BACKUP>_decoded_<TIMESTAMP>.json`.

## Clean script outputs

Run `python3 clean.py` to remove the `scripts/temp` directory and all files in
Run `python clean.py` to remove the `scripts/temp` directory and all files in
`scripts/` containing a postfix defined in `SCRIPT_POSTFIXES` (see
`common.constants`).
File renamed without changes.
File renamed without changes.
File renamed without changes.
41 changes: 41 additions & 0 deletions anni/scripts/common/get_data.py → scripts/common/get_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,3 +58,44 @@ def get_data():
def get_guideline_by_id(data, id):
    """Return the guideline in ``data['Guideline']`` with the given ``_id``.

    Args:
        data: Backup data; ``data['Guideline']`` is iterated and the ``'_id'``
            entry of each guideline is compared with ``id``.
        id: The guideline ID to look for.

    Returns:
        The first guideline whose ``'_id'`` equals ``id``.

    Raises:
        LookupError: If no guideline has the given ID.
    """
    for guideline in data['Guideline']:
        if guideline['_id'] == id:
            return guideline
    # Raise an explicit error instead of letting `next()` leak a
    # `StopIteration`: when this function is called from within `map()`, a
    # stray `StopIteration` is taken as regular exhaustion by the consumer
    # (e.g., `list()`), which silently truncates the result instead of
    # reporting the missing guideline.
    raise LookupError('[ERROR] No guideline with ID {} found'.format(id))

def get_phenotype_value_lengths(guideline, expect_same_length=False):
    """Collect the distinct lengths of a guideline's per-gene value lists.

    The lengths are taken over all values of ``guideline['lookupkey']`` and
    ``guideline['phenotypes']``.

    Args:
        guideline: Guideline with ``'lookupkey'`` and ``'phenotypes'``
            dictionaries whose values support ``len``; ``'_id'`` is only read
            for the error message.
        expect_same_length: If true, require exactly one distinct length and
            return it as a single number instead of a list.

    Returns:
        The distinct lengths as a list in ascending order, or the single
        common length if ``expect_same_length`` is true.

    Raises:
        Exception: If ``expect_same_length`` is true and there is not exactly
            one distinct length.
    """
    phenotype_values = list(guideline['lookupkey'].values()) + \
        list(guideline['phenotypes'].values())
    # Sort to not depend on the iteration order of the intermediate set
    phenotype_values_lengths = sorted(set(map(len, phenotype_values)))
    if not expect_same_length:
        return phenotype_values_lengths
    if len(phenotype_values_lengths) != 1:
        raise Exception('[ERROR] Expecting lookupkey and phenotypes per '
                        'gene to have same lengths but lengths differ '
                        'for guideline {}'.format(guideline['_id']))
    return phenotype_values_lengths[0]

def get_phenotype_value(phenotype_values, index):
    """Pick the entry at ``index`` from the value of every gene.

    Args:
        phenotype_values: Dictionary mapping each gene to an indexable value.
        index: Position to read from each gene's value.

    Returns:
        A new dictionary with the same genes, each mapped to the entry found
        at ``index``.
    """
    return {
        gene: values[index]
        for gene, values in phenotype_values.items()
    }

def dict_to_key(dictionary):
    """Build a string from a dictionary's entries, ordered by sorted items.

    Each entry is rendered as ``'<key> <value>'`` and the rendered entries are
    joined with single spaces.

    Args:
        dictionary: Dictionary whose items can be sorted.

    Returns:
        The joined string; an empty string for an empty dictionary.
    """
    return ' '.join(
        f'{key} {value}' for key, value in sorted(dictionary.items()))

def get_phenotype_key(guideline):
    """Return ``dict_to_key`` applied to the guideline's ``'phenotypes'``."""
    phenotypes = guideline['phenotypes']
    return dict_to_key(phenotypes)

def get_information_key(guideline):
    """Build a string key from a guideline's single external data entry.

    The key is the plain concatenation (without separators) of the entry's
    ``'comments'`` (skipped if ``None``), its ``'recommendation'``, and
    ``dict_to_key`` of its ``'implications'``.

    Args:
        guideline: Guideline whose ``'externalData'`` is a list with exactly
            one element.

    Returns:
        The concatenated key string.

    Raises:
        Exception: If ``guideline['externalData']`` does not contain exactly
            one element.
    """
    # Length of guideline['externalData'] should always be 1 as we just
    # migrated it, but check just to be sure
    if len(guideline['externalData']) != 1:
        raise Exception('[ERROR] Expecting externalData to be list with one '
                        'element')
    external_data = guideline['externalData'][0]
    comments = external_data['comments']
    # Compare with None by identity, not equality (None is a singleton)
    information_key = comments if comments is not None else ''
    information_key += external_data['recommendation']
    information_key += dict_to_key(external_data['implications'])
    return information_key

File renamed without changes.
4 changes: 4 additions & 0 deletions scripts/common/mongo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
import bson

def get_object_id():
    """Return the string form of a newly constructed ``bson.ObjectId``."""
    object_id = bson.ObjectId()
    return str(object_id)
File renamed without changes.
File renamed without changes.
File renamed without changes.
109 changes: 109 additions & 0 deletions scripts/migrate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
from common.get_data import get_data, get_information_key, get_guideline_by_id, \
get_phenotype_value_lengths, get_phenotype_value, get_phenotype_key
from common.write_data import write_data
from common.constants import SCRIPT_POSTFIXES
from common.cpic_data import get_phenotype_map
from common.remove_history import remove_history
from common.mongo import get_object_id

# Rename `cpicData` in guidelines to `externalData` (#582)
# Add `source` field to `externalData` with value 'CPIC' (#582)
def rename_external_data(guideline):
    """Move a guideline's ``'cpicData'`` entry to ``'externalData'``.

    If ``'cpicData'`` is present, it is removed, stored under
    ``'externalData'``, and its ``'source'`` field is set to ``'CPIC'``.
    Guidelines without ``'cpicData'`` are left untouched.

    Args:
        guideline: Guideline dictionary; modified in place.

    Returns:
        The same guideline object.
    """
    if 'cpicData' not in guideline:
        return guideline
    external_data = guideline.pop('cpicData')
    guideline['externalData'] = external_data
    external_data['source'] = 'CPIC'
    return guideline

# Change `externalData` to array (#597)
def enlist_external_data(guideline):
    """Wrap a guideline's ``'externalData'`` in a list if it is not one yet.

    Args:
        guideline: Guideline dictionary with an ``'externalData'`` entry;
            modified in place.

    Returns:
        The same guideline object.
    """
    # isinstance (instead of an exact type comparison) also recognizes list
    # subclasses as already enlisted, so they are not wrapped a second time
    if not isinstance(guideline['externalData'], list):
        guideline['externalData'] = [guideline['externalData']]
    return guideline

# Add phenotypes for guideline (#602)
def add_phenotypes(guideline, phenotype_map):
    """Add a ``'phenotypes'`` entry to a guideline that does not have one.

    For every gene in ``guideline['lookupkey']``, each listed gene result is
    translated via ``phenotype_map[gene_symbol][gene_result]``. Guidelines
    that already contain ``'phenotypes'`` are left untouched.

    Args:
        guideline: Guideline dictionary; modified in place.
        phenotype_map: Nested mapping from gene symbol and gene result to
            phenotype.

    Returns:
        The same guideline object.
    """
    if 'phenotypes' in guideline:
        return guideline
    guideline['phenotypes'] = {
        gene_symbol: [
            phenotype_map[gene_symbol][gene_result]
            for gene_result in gene_results
        ]
        for gene_symbol, gene_results in guideline['lookupkey'].items()
    }
    return guideline

# Chain single guideline migrations together
def migrate_guideline(guideline, phenotype_map):
    """Apply all single-guideline migrations in order.

    Runs ``rename_external_data``, then ``enlist_external_data``, then
    ``add_phenotypes`` (which receives ``phenotype_map``).

    Returns:
        The result of the final ``add_phenotypes`` call.
    """
    renamed = rename_external_data(guideline)
    enlisted = enlist_external_data(renamed)
    return add_phenotypes(enlisted, phenotype_map)

# Contract external data by phenotypes (#597)
# Split up previously contracted phenotypes (#604)
# Contraction is implemented here analogous to
# anni/src/common/database/helpers/cpic-constructors.py
def contract_phenotypes_per_drug(guidelines):
    """Split guidelines per lookupkey index and group the resulting copies.

    For every index of a guideline's per-gene value lists, a shallow copy of
    the guideline is created with a new ``'_id'`` and with ``'lookupkey'`` and
    ``'phenotypes'`` reduced to the entries at that index. The copies are
    grouped by phenotype key and then by information key.

    The grouping is not used yet (see TODO below); the function currently
    returns a list of the unchanged input guidelines.

    Args:
        guidelines: Guidelines to process.

    Returns:
        ``list(guidelines)``.
    """
    # Split up by lookupkeys and group by phenotype and external information
    grouped_guidelines = {}
    for contracted in guidelines:
        entry_count = get_phenotype_value_lengths(
            contracted, expect_same_length=True)
        for entry_index in range(entry_count):
            single = contracted.copy()
            single['_id'] = get_object_id()
            single['lookupkey'] = get_phenotype_value(
                contracted['lookupkey'], entry_index)
            single['phenotypes'] = get_phenotype_value(
                contracted['phenotypes'], entry_index)
            phenotype_key = get_phenotype_key(single)
            information_key = get_information_key(single)
            by_information = grouped_guidelines.setdefault(phenotype_key, {})
            by_information.setdefault(information_key, []).append(single)
    # TODO: Contract grouped guidelines
    return list(guidelines)

# Migrate data
def migrate_data():
    """Migrate backup data and write the migrated result.

    Steps:
        1. Load the data via ``get_data`` and pass it through
           ``remove_history``.
        2. Migrate every guideline in ``data['Guideline']`` in place.
        3. If the first guideline had no ``'phenotypes'`` initially, pass the
           guidelines of each drug in ``data['Drug']`` through
           ``contract_phenotypes_per_drug`` and replace ``data['Guideline']``
           with the flat list of all results.
        4. If present, handle the guidelines embedded in ``data['AppData']``
           the same way.
        5. Write the data via ``write_data`` with the ``'migrate'`` postfix.
    """
    data = remove_history(get_data())
    phenotype_map = get_phenotype_map()

    # If phenotypes are not present initially (data was created before #602),
    # assume that guidelines also need to be contracted by phenotypes (#604).
    # Without any guideline there is nothing to inspect, so do not contract
    # (and do not fail on indexing an empty list).
    contract_by_phenotypes = bool(data['Guideline']) and \
        'phenotypes' not in data['Guideline'][0]

    # Iterate data for migration of single guidelines and contract guidelines
    # per drug afterwards (needs phenotypes); guidelines are migrated in place

    for guideline in data['Guideline']:
        migrate_guideline(guideline, phenotype_map)

    if contract_by_phenotypes:
        migrated_guidelines = []
        for drug in data['Drug']:
            drug_guidelines = [
                get_guideline_by_id(data, guideline_id)
                for guideline_id in drug['guidelines']]
            # Extend (not append) to keep `data['Guideline']` a flat list of
            # guidelines instead of a list of per-drug lists
            migrated_guidelines.extend(
                contract_phenotypes_per_drug(drug_guidelines))
        data['Guideline'] = migrated_guidelines

    if 'AppData' in data:
        for row in data['AppData']:
            for drug in row['drugs']:
                for guideline in drug['guidelines']:
                    migrate_guideline(guideline, phenotype_map)
                if contract_by_phenotypes:
                    # Write the result back to the drug; assigning it to a
                    # local name only would discard it
                    drug['guidelines'] = contract_phenotypes_per_drug(
                        drug['guidelines'])

    write_data(data, postfix=SCRIPT_POSTFIXES['migrate'])

# Run the migration only if this file is executed as a script
if __name__ == '__main__':
    migrate_data()
1 change: 1 addition & 0 deletions scripts/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
pymongo==3.5.1

0 comments on commit 10ebaeb

Please sign in to comment.