feat(#605): simplify migration

hpi-dhc · May 5, 2023 · b2084d4 · b2084d4
1 parent 10ebaeb
commit b2084d4
Show file tree

Hide file tree

Showing 4 changed files with 36 additions and 33 deletions.
diff --git a/scripts/README.md b/scripts/README.md
@@ -23,7 +23,7 @@ containing a zipped JSON.
 Run `python migrate.py <PATH_TO_BACKUP>[.json|.base64]` to receive
 `<PATH_TO_BACKUP>_migrated_<TIMESTAMP>.base64`.
 
-**⚠️ Migrating data will remove the data history!**
+**⚠️ Migrating data will remove the data history, including published versions!**
 
 (Breaking) changes covered:
 

diff --git a/scripts/common/get_data.py b/scripts/common/get_data.py
@@ -74,16 +74,18 @@ def get_phenotype_value_lengths(guideline, expect_same_length = False):
 def get_phenotype_value(phenotype_values, index):
     phenotype_value = {}
     for gene in phenotype_values:
-        phenotype_value[gene] = phenotype_values[gene][index]
+        phenotype_value[gene] = [phenotype_values[gene][index]]
     return phenotype_value
 
-def dict_to_key(dictionary):
+def dict_to_key(dictionary, format_value=lambda value: value):
     return ' '.join(map(
-        lambda key: f'{key} {dictionary[key]}',
+        lambda key: f'{key} {format_value(dictionary[key])}',
         dict(sorted(dictionary.items())).keys()))
 
 def get_phenotype_key(guideline):
-    return dict_to_key(guideline['phenotypes'])
+    return dict_to_key(
+        guideline['phenotypes'],
+        lambda phenotype_value: ''.join(phenotype_value))
 
 def get_information_key(guideline):
     # Length of guideline['externalData'] should always be 1 as we just migrated

diff --git a/scripts/common/remove_history.py b/scripts/common/remove_history.py
@@ -1,4 +1,6 @@
 def remove_history(data):
+    if 'AppData' in data:
+        data['AppData'] = []
     for table_name in data.keys():
         if table_name.endswith('_History'):
             data[table_name] = []

diff --git a/scripts/migrate.py b/scripts/migrate.py
@@ -40,34 +40,40 @@ def migrate_guideline(guideline, phenotype_map):
         enlist_external_data(rename_external_data(guideline)),
         phenotype_map)
 
-# Contract external data by phenotypes (#597)
-# Split up previously contracted phenotypes (#604)
-# Contraction is implemented here analogous to
-# anni/src/common/database/helpers/cpic-constructors.py
-def contract_phenotypes_per_drug(guidelines):
-    # Split up by lookupkeys and group by phenotype and external information
+# Migrate single guidelines; then split up by lookupkeys and re-contract by
+# phenotype and external information (according to #597 and #604)
+def migrate_drug_guidelines(guidelines, phenotype_map):
     phenotype_guideline_map = {}
     for guideline in guidelines:
+        guideline = migrate_guideline(guideline, phenotype_map)
         contracted_guideline_number = get_phenotype_value_lengths(
             guideline, expect_same_length=True)
         for phenotype_index in range(0, contracted_guideline_number):
             decontracted_guideline = guideline.copy()
-            decontracted_guideline['_id'] = get_object_id()
+            del decontracted_guideline['_id']
             decontracted_guideline['lookupkey'] = get_phenotype_value(
                 guideline['lookupkey'], phenotype_index)
             decontracted_guideline['phenotypes'] = get_phenotype_value(
                 guideline['phenotypes'], phenotype_index)
+            # Contraction is implemented analogously to Anni
+            # (see cpic-constructors.ts)
             phenotype_key = get_phenotype_key(decontracted_guideline)
             information_key = get_information_key(decontracted_guideline)
             if not phenotype_key in phenotype_guideline_map:
                 phenotype_guideline_map[phenotype_key] = {}
             phenotype_guidelines = phenotype_guideline_map[phenotype_key]
             if not information_key in phenotype_guidelines:
                 phenotype_guidelines[information_key] = []
-            phenotype_guidelines[information_key].append(
-                decontracted_guideline)
-    # TODO: Contract grouped guidelines
-    return list(guidelines)
+            phenotype_guidelines[information_key].append(decontracted_guideline)
+    # Re-contract guidelines and assign new IDs
+    recontracted_guidelines = []
+    for phenotype_guidelines in phenotype_guideline_map.values():
+        # TODO: Contract lookupkeys per phenotype
+        # TODO: Contract unique external data per phenotype
+        print(phenotype_guidelines.values())
+        print('')
+
+    return recontracted_guidelines
 
 # Migrate data
 def migrate_data():
@@ -81,28 +87,21 @@ def migrate_data():
     # Iterate data for migration of single guidelines and contract guidelines
     # per drug afterwards (needs phenotypes)
 
-    for guideline in data['Guideline']:
-        guideline = migrate_guideline(guideline, phenotype_map)
-
     if contract_by_phenotypes:
         migrated_guidelines = []
         for drug in data['Drug']:
-            migrated_guidelines.append(contract_phenotypes_per_drug(
-                list(map(
-                    lambda id: get_guideline_by_id(data, id),
-                    drug['guidelines']))))
+            drug_guidelines = list(map(
+                lambda id: get_guideline_by_id(data, id),
+                drug['guidelines']))
+            migrated_drug_guidelines = migrate_drug_guidelines(
+                drug_guidelines, phenotype_map)
+            migrated_guidelines += migrated_drug_guidelines
+            drug['guidelines'] = list(map(
+                lambda guideline: guideline['_id'],
+                migrated_drug_guidelines
+            ))
         data['Guideline'] = migrated_guidelines
 
-    if 'AppData' in data:
-        for row in data['AppData']:
-            for drug in row['drugs']:
-                guidelines = drug['guidelines']
-                for guideline in guidelines:
-                    guideline = migrate_guideline(guideline, phenotype_map)
-                if contract_by_phenotypes:
-                    guidelines = contract_phenotypes_per_drug(
-                        guidelines, phenotype_map)
-
     write_data(data, postfix=SCRIPT_POSTFIXES['migrate'])
 
 if __name__ == '__main__':