From 630faa7dc2b256e9e3287d5cef9f7a9fbfb442a8 Mon Sep 17 00:00:00 2001 From: frehburg Date: Fri, 4 Oct 2024 14:40:26 +0200 Subject: [PATCH 01/39] changed order of operations, added new method --- notebooks/erdri_cds_definition_in_code.ipynb | 1000 +++++++++++++----- src/phenopacket_mapper/mapping/mapper.py | 12 + 2 files changed, 743 insertions(+), 269 deletions(-) diff --git a/notebooks/erdri_cds_definition_in_code.ipynb b/notebooks/erdri_cds_definition_in_code.ipynb index 147bd82..d805a43 100644 --- a/notebooks/erdri_cds_definition_in_code.ipynb +++ b/notebooks/erdri_cds_definition_in_code.ipynb @@ -61,48 +61,44 @@ }, { "cell_type": "code", - "outputs": [], "source": [ "import phenopacket_mapper" ], "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-09-25T10:58:30.392354100Z", - "start_time": "2024-09-25T10:58:30.284940700Z" + "end_time": "2024-10-04T12:31:24.311691Z", + "start_time": "2024-10-04T12:31:24.308967Z" } }, "id": "fc1e29f9e0c18364", - "execution_count": 171 + "outputs": [], + "execution_count": 65 }, { "cell_type": "code", - "outputs": [], - "source": [ - "from phenopacket_mapper.data_standards.code_system import ORDO, ICD9, HGVS, HGNC, OMIM, HPO " - ], + "source": "from phenopacket_mapper.data_standards.code_system import ORDO, ICD9, HGVS, HGNC, OMIM, HPO , SNOMED_CT", "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-09-25T10:58:30.413728800Z", - "start_time": "2024-09-25T10:58:30.399337600Z" + "end_time": "2024-10-04T12:31:24.346999Z", + "start_time": "2024-10-04T12:31:24.343731Z" } }, "id": "a86f24df0f21ca85", - "execution_count": 172 + "outputs": [], + "execution_count": 66 }, { "cell_type": "code", - "execution_count": 173, "id": "initial_id", "metadata": { "collapsed": true, "ExecuteTime": { - "end_time": "2024-09-25T10:58:30.447646100Z", - "start_time": "2024-09-25T10:58:30.431896800Z" + "end_time": "2024-10-04T12:31:24.359730Z", + "start_time": "2024-10-04T12:31:24.355952Z" } }, - "outputs": [], "source": [ "resources = [\n", " ORDO.set_version(\"1.0.19 (2024-08-02)\"),\n", @@ -112,7 +108,9 @@ " OMIM.set_version(\"2024-09-12\"),\n", " HPO.set_version(\"2024-06-07\")\n", "]" - ] + ], + "outputs": [], + "execution_count": 67 }, { "cell_type": "markdown", @@ -126,23 +124,22 @@ }, { "cell_type": "code", - "outputs": [], "source": [ "from phenopacket_mapper.data_standards.code_system import CodeSystem" ], "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-09-25T10:58:30.461842300Z", - "start_time": "2024-09-25T10:58:30.453631Z" + "end_time": "2024-10-04T12:31:24.369740Z", + "start_time": "2024-10-04T12:31:24.367338Z" } }, "id": "c667e5760e734525", - "execution_count": 174 + "outputs": [], + "execution_count": 68 }, { "cell_type": "code", - "outputs": [], "source": [ "alpha = CodeSystem(name='Alpha-ID-SE', namespace_prefix='alpha', url='https://www.bfarm.de/EN/Code-systems/Terminologies/Alpha-ID-SE/_node.html')\n", "icd9cm = CodeSystem(name='International Classification of Diseases 9 Clinical Modification (USA)', namespace_prefix='icd-9-cm', url='http://hl7.org/fhir/sid/icd-9-cm')\n", @@ -151,16 +148,16 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-09-25T10:58:30.490842700Z", - "start_time": "2024-09-25T10:58:30.482238400Z" + "end_time": "2024-10-04T12:31:24.379490Z", + "start_time": "2024-10-04T12:31:24.376978Z" } }, "id": "2cba20f939e9193a", - "execution_count": 175 + "outputs": [], + "execution_count": 69 }, { "cell_type": "code", - "outputs": [], "source": [ "resources.append(alpha)\n", "resources.append(icd9cm)\n", @@ -169,12 +166,13 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-09-25T10:58:30.502643200Z", - "start_time": "2024-09-25T10:58:30.494057900Z" + "end_time": "2024-10-04T12:31:24.389078Z", + "start_time": "2024-10-04T12:31:24.386726Z" } }, "id": "8f763e585095846f", - "execution_count": 176 + "outputs": [], + "execution_count": 70 }, { "cell_type": "markdown", @@ -194,19 +192,19 @@ }, { "cell_type": "code", - "outputs": [], "source": [ "from phenopacket_mapper.data_standards.value_set import ValueSet" ], "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-09-25T10:58:30.516174500Z", - "start_time": "2024-09-25T10:58:30.504640900Z" + "end_time": "2024-10-04T12:31:24.398479Z", + "start_time": "2024-10-04T12:31:24.396419Z" } }, "id": "da099dfff150c4c3", - "execution_count": 177 + "outputs": [], + "execution_count": 71 }, { "cell_type": "markdown", @@ -220,7 +218,6 @@ }, { "cell_type": "code", - "outputs": [], "source": [ "# 1. Pseudonym\n", "# 1.1. Pseudonym\n", @@ -233,12 +230,13 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-09-25T10:58:30.529974400Z", - "start_time": "2024-09-25T10:58:30.522464700Z" + "end_time": "2024-10-04T12:31:24.408016Z", + "start_time": "2024-10-04T12:31:24.405509Z" } }, "id": "858ac0c5006d3eb2", - "execution_count": 178 + "outputs": [], + "execution_count": 72 }, { "cell_type": "markdown", @@ -252,23 +250,22 @@ }, { "cell_type": "code", - "outputs": [], "source": [ "from phenopacket_mapper.data_standards import Date" ], "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-09-25T10:58:30.559274700Z", - "start_time": "2024-09-25T10:58:30.536742400Z" + "end_time": "2024-10-04T12:31:24.426730Z", + "start_time": "2024-10-04T12:31:24.424330Z" } }, "id": "df11e0fe55d695b3", - "execution_count": 179 + "outputs": [], + "execution_count": 73 }, { "cell_type": "code", - "outputs": [], "source": [ "# 2. Personal information\n", "# 2.1. Date of Birth\n", @@ -281,12 +278,13 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-09-25T10:58:30.576576600Z", - "start_time": "2024-09-25T10:58:30.565254300Z" + "end_time": "2024-10-04T12:31:24.435990Z", + "start_time": "2024-10-04T12:31:24.432923Z" } }, "id": "1ce5edd4f1bff64f", - "execution_count": 180 + "outputs": [], + "execution_count": 74 }, { "cell_type": "markdown", @@ -310,7 +308,6 @@ }, { "cell_type": "code", - "outputs": [], "source": [ "# 2.2. Sex\n", "vs_2_2 = ValueSet(\n", @@ -329,12 +326,13 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-09-25T10:58:30.595693800Z", - "start_time": "2024-09-25T10:58:30.588183800Z" + "end_time": "2024-10-04T12:31:24.446473Z", + "start_time": "2024-10-04T12:31:24.443614Z" } }, "id": "d60831a9feb5ebb1", - "execution_count": 181 + "outputs": [], + "execution_count": 75 }, { "cell_type": "markdown", @@ -348,7 +346,6 @@ }, { "cell_type": "code", - "outputs": [], "source": [ "# 3.2. Date of death\n", "vs_3_2 = ValueSet(\n", @@ -360,12 +357,13 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-09-25T10:58:30.612139500Z", - "start_time": "2024-09-25T10:58:30.602696100Z" + "end_time": "2024-10-04T12:31:24.456443Z", + "start_time": "2024-10-04T12:31:24.453333Z" } }, "id": "8a640c47ec2b23f3", - "execution_count": 182 + "outputs": [], + "execution_count": 76 }, { "cell_type": "markdown", @@ -380,7 +378,6 @@ }, { "cell_type": "code", - "outputs": [], "source": [ "# 4. Care Pathway\n", "# 4.1. First contact with specialised centre\n", @@ -393,12 +390,13 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-09-25T10:58:30.644365300Z", - "start_time": "2024-09-25T10:58:30.637141400Z" + "end_time": "2024-10-04T12:31:24.466088Z", + "start_time": "2024-10-04T12:31:24.463543Z" } }, "id": "74e09c708d1324a0", - "execution_count": 183 + "outputs": [], + "execution_count": 77 }, { "cell_type": "markdown", @@ -419,7 +417,6 @@ }, { "cell_type": "code", - "outputs": [], "source": [ "# 5. Disease history\n", "# 5.1. Age at onset\n", @@ -433,12 +430,13 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-09-25T10:58:30.652238300Z", - "start_time": "2024-09-25T10:58:30.647727300Z" + "end_time": "2024-10-04T12:31:24.483517Z", + "start_time": "2024-10-04T12:31:24.480616Z" } }, "id": "3c3c81c9f5602da4", - "execution_count": 184 + "outputs": [], + "execution_count": 78 }, { "cell_type": "markdown", @@ -452,7 +450,6 @@ }, { "cell_type": "code", - "outputs": [], "source": [ "# 6. Diagnosis\n", "# 6.1. Diagnosis of the rare disease\n", @@ -477,12 +474,13 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-09-25T10:58:30.678907700Z", - "start_time": "2024-09-25T10:58:30.654415900Z" + "end_time": "2024-10-04T12:31:24.493990Z", + "start_time": "2024-10-04T12:31:24.491027Z" } }, "id": "9f5bcb742f8e05aa", - "execution_count": 185 + "outputs": [], + "execution_count": 79 }, { "cell_type": "markdown", @@ -499,23 +497,22 @@ }, { "cell_type": "code", - "outputs": [], "source": [ "from phenopacket_mapper.data_standards import DataModel, DataField" ], "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-09-25T10:58:30.680907700Z", - "start_time": "2024-09-25T10:58:30.656884200Z" + "end_time": "2024-10-04T12:31:24.503597Z", + "start_time": "2024-10-04T12:31:24.501288Z" } }, "id": "4a1f2ec5fbdf96c", - "execution_count": 186 + "outputs": [], + "execution_count": 80 }, { "cell_type": "code", - "outputs": [], "source": [ "erdri_cds_data_model = DataModel(\n", " data_model_name=\"ERDRI CDS\",\n", @@ -560,15 +557,27 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-09-25T10:58:30.714389400Z", - "start_time": "2024-09-25T10:58:30.689952100Z" + "end_time": "2024-10-04T12:31:24.515349Z", + "start_time": "2024-10-04T12:31:24.511030Z" } }, "id": "79550015aac215", - "execution_count": 187 + "outputs": [], + "execution_count": 81 }, { "cell_type": "code", + "source": [ + "print(erdri_cds_data_model)" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-10-04T12:31:24.546555Z", + "start_time": "2024-10-04T12:31:24.543356Z" + } + }, + "id": "b89630b2b3d91a3e", "outputs": [ { "name": "stdout", @@ -621,14 +630,14 @@ "\t\tid: age_at_onset,\n", "\t\tsection: 5. Disease history,\n", "\t\tordinal, name: (5.1, Age at onset),\n", - "\t\tvalue_set: ValueSet(elements=['At birth', , 'Undetermined', 'Antenatal'], name='Onset value set', description=''), required: True,\n", + "\t\tvalue_set: ValueSet(elements=['Antenatal', 'Undetermined', 'At birth', ], name='Onset value set', description=''), required: True,\n", "\t\tspecification: \n", "\t)\n", "\tDataField(\n", "\t\tid: age_at_diagnosis,\n", "\t\tsection: 5. Disease history,\n", "\t\tordinal, name: (5.2, Age at diagnosis),\n", - "\t\tvalue_set: ValueSet(elements=['At birth', , 'Undetermined', 'Antenatal'], name='Onset value set', description=''), required: True,\n", + "\t\tvalue_set: ValueSet(elements=['Antenatal', 'Undetermined', 'At birth', ], name='Onset value set', description=''), required: True,\n", "\t\tspecification: \n", "\t)\n", "\tDataField(\n", @@ -666,18 +675,7 @@ ] } ], - "source": [ - "print(erdri_cds_data_model)" - ], - "metadata": { - "collapsed": false, - "ExecuteTime": { - "end_time": "2024-09-25T10:58:30.730896800Z", - "start_time": "2024-09-25T10:58:30.718390600Z" - } - }, - "id": "b89630b2b3d91a3e", - "execution_count": 188 + "execution_count": 82 }, { "cell_type": "markdown", @@ -693,28 +691,30 @@ }, { "cell_type": "code", - "outputs": [ - { - "data": { - "text/plain": "DataField(name='Date of Birth', value_set=ValueSet(elements=[], name='Value set for 2.1. Date of Birth', description='Value set for field 2.1. Date of Birth of the ERDRI CDS data model in section 2. Personal information'), id='date_of_birth', description='', section='2. Personal information', required=True, specification='', ordinal='2.1')" - }, - "execution_count": 189, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ "erdri_cds_data_model.date_of_birth" ], "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-09-25T10:58:30.742026500Z", - "start_time": "2024-09-25T10:58:30.736898800Z" + "end_time": "2024-10-04T12:31:24.595102Z", + "start_time": "2024-10-04T12:31:24.591133Z" } }, "id": "b7123400f775fc59", - "execution_count": 189 + "outputs": [ + { + "data": { + "text/plain": [ + "DataField(name='Date of Birth', value_set=ValueSet(elements=[], name='Value set for 2.1. Date of Birth', description='Value set for field 2.1. Date of Birth of the ERDRI CDS data model in section 2. Personal information'), id='date_of_birth', description='', section='2. Personal information', required=True, specification='', ordinal='2.1')" + ] + }, + "execution_count": 83, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 83 }, { "cell_type": "markdown", @@ -728,31 +728,30 @@ }, { "cell_type": "code", - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['pseudonym', 'date_of_birth', 'sex', 'patient_s_status', 'date_of_death', 'first_contact_with_specialised_centre', 'age_at_onset', 'age_at_diagnosis', 'diagnosis_of_the_rare_disease', 'genetic_diagnosis', 'undiagnosed_case']\n" - ] - } - ], "source": [ "print(erdri_cds_data_model.get_field_ids())" ], "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-09-25T10:58:30.752458800Z", - "start_time": "2024-09-25T10:58:30.746027400Z" + "end_time": "2024-10-04T12:31:24.642751Z", + "start_time": "2024-10-04T12:31:24.639568Z" } }, "id": "eb2e06f9d4005547", - "execution_count": 190 + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['pseudonym', 'date_of_birth', 'sex', 'patient_s_status', 'date_of_death', 'first_contact_with_specialised_centre', 'age_at_onset', 'age_at_diagnosis', 'diagnosis_of_the_rare_disease', 'genetic_diagnosis', 'undiagnosed_case']\n" + ] + } + ], + "execution_count": 84 }, { "cell_type": "code", - "outputs": [], "source": [ "from phenopacket_mapper.pipeline import load_data_using_data_model\n", "from pathlib import Path" @@ -760,31 +759,58 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-09-25T10:58:30.777765200Z", - "start_time": "2024-09-25T10:58:30.756231800Z" + "end_time": "2024-10-04T12:31:24.682926Z", + "start_time": "2024-10-04T12:31:24.680287Z" } }, "id": "d296198b6c5536a0", - "execution_count": 191 + "outputs": [], + "execution_count": 85 }, { "cell_type": "code", - "outputs": [], "source": [ "data_path = Path('../res/test_data/erdri/erdri_cds_test_data.xlsx')" ], "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-09-25T10:58:30.802373Z", - "start_time": "2024-09-25T10:58:30.788763400Z" + "end_time": "2024-10-04T12:31:24.695315Z", + "start_time": "2024-10-04T12:31:24.692813Z" } }, "id": "3ed78087bf5fcdf", - "execution_count": 192 + "outputs": [], + "execution_count": 86 }, { "cell_type": "code", + "source": [ + "ds = erdri_cds_data_model.load_data(\n", + " path = data_path,\n", + " pseudonym_column=\"1.1. Pseudonym\",\n", + " date_of_birth_column= \"2.1. Date of Birth\",\n", + " sex_column= \"2.2. Sex\",\n", + " patient_s_status_column= \"3.1. Patient's status\",\n", + " date_of_death_column= \"3.2. Date of death\",\n", + " first_contact_with_specialised_centre_column= \"4.1. First contact with specialised centre\",\n", + " age_at_onset_column= \"5.1. Age at onset\",\n", + " age_at_diagnosis_column= \"5.2. Age at diagnosis\",\n", + " diagnosis_of_the_rare_disease_column= \"6.1. Diagnosis of the rare disease\",\n", + " genetic_diagnosis_column= \"6.2. Genetic diagnosis\",\n", + " undiagnosed_case_column= None,\n", + " \n", + " compliance='soft' # 'soft' or 'hard'\n", + ")" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-10-04T12:31:24.727757Z", + "start_time": "2024-10-04T12:31:24.715772Z" + } + }, + "id": "2253ecd35c94e0d1", "outputs": [ { "name": "stderr", @@ -803,7 +829,7 @@ "(missing_fields=undiagnosed_case)\n", " warnings.warn(error_msg)\n", "C:\\Users\\filip\\OneDrive\\Documents\\dataspell\\phenopacket_mapper\\src\\phenopacket_mapper\\data_standards\\data_model.py:358: UserWarning: Required fields are missing in the instance. (row 4) \n", - "(missing_fields=undiagnosed_case, genetic_diagnosis)\n", + "(missing_fields=genetic_diagnosis, undiagnosed_case)\n", " warnings.warn(error_msg)\n", "C:\\Users\\filip\\OneDrive\\Documents\\dataspell\\phenopacket_mapper\\src\\phenopacket_mapper\\data_standards\\data_model.py:358: UserWarning: Required fields are missing in the instance. (row 5) \n", "(missing_fields=undiagnosed_case)\n", @@ -820,141 +846,266 @@ "(missing_fields=undiagnosed_case)\n", " warnings.warn(error_msg)\n", "C:\\Users\\filip\\OneDrive\\Documents\\dataspell\\phenopacket_mapper\\src\\phenopacket_mapper\\data_standards\\data_model.py:358: UserWarning: Required fields are missing in the instance. (row 9) \n", - "(missing_fields=undiagnosed_case, diagnosis_of_the_rare_disease)\n", + "(missing_fields=diagnosis_of_the_rare_disease, undiagnosed_case)\n", " warnings.warn(error_msg)\n" ] } ], - "source": [ - "ds = erdri_cds_data_model.load_data(\n", - " path = data_path,\n", - " pseudonym_column=\"1.1. Pseudonym\",\n", - " date_of_birth_column= \"2.1. Date of Birth\",\n", - " sex_column= \"2.2. Sex\",\n", - " patient_s_status_column= \"3.1. Patient's status\",\n", - " date_of_death_column= \"3.2. Date of death\",\n", - " first_contact_with_specialised_centre_column= \"4.1. First contact with specialised centre\",\n", - " age_at_onset_column= \"5.1. Age at onset\",\n", - " age_at_diagnosis_column= \"5.2. Age at diagnosis\",\n", - " diagnosis_of_the_rare_disease_column= \"6.1. Diagnosis of the rare disease\",\n", - " genetic_diagnosis_column= \"6.2. Genetic diagnosis\",\n", - " undiagnosed_case_column= None,\n", - " \n", - " compliance='soft' # 'soft' or 'hard'\n", - ")" - ], - "metadata": { - "collapsed": false, - "ExecuteTime": { - "end_time": "2024-09-25T10:58:30.832047500Z", - "start_time": "2024-09-25T10:58:30.808357100Z" - } - }, - "id": "2253ecd35c94e0d1", - "execution_count": 193 + "execution_count": 87 }, { "cell_type": "code", - "outputs": [ - { - "data": { - "text/plain": " pseudonym date_of_birth sex patient_s_status \\\n0 patient0 2002-02-00T00:00:00Z Female Alive \n1 patient1 1979-06-17T00:00:00Z Male Dead \n2 patient2 2000-00-00T00:00:00Z Male Alive \n3 patient3 2003-01-07T00:00:00Z Female Alive \n4 patient4 2004-08-02T00:00:00Z Undetermined Lost in follow-up \n5 patient5 1923-04-08T00:00:00Z Foetus (Unknown) Opted-out \n6 patient6 1999-00-00T00:00:00Z False Alive \n7 patient7 2003-12-00T00:00:00Z m Alive \n8 patient8 1979-09-00T00:00:00Z Male Dead \n9 patient9 2002-00-00T00:00:00Z Female Alive \n\n date_of_death first_contact_with_specialised_centre \\\n0 None 2019-00-00T00:00:00Z \n1 2010-02-00T00:00:00Z 2019-00-00T00:00:00Z \n2 None 2020-00-00T00:00:00Z \n3 None 2023-00-00T00:00:00Z \n4 None 2020-00-00T00:00:00Z \n5 None 2021-00-00T00:00:00Z \n6 None 2019-00-00T00:00:00Z \n7 None 2024-00-00T00:00:00Z \n8 2017-05-31T00:00:00Z 2022-00-00T00:00:00Z \n9 None 2019-00-00T00:00:00Z \n\n age_at_onset age_at_diagnosis diagnosis_of_the_rare_disease \\\n0 Antenatal Antenatal ORPHA:206638 \n1 At birth At birth ICD9:781 \n2 2005-12-07T00:00:00Z 2005-12-07T00:00:00Z ORPHA:206638 \n3 2009-07-22T00:00:00Z 2010-07-22T00:00:00Z ORPHA:206638 \n4 Undetermined Undetermined ORPHA:206638 \n5 Undetermined Undetermined ORPHA:206638 \n6 2017-00-00T00:00:00Z 2018-00-00T00:00:00Z ORPHA:206638 \n7 2020-05-00T00:00:00Z 2020-05-00T00:00:00Z ORPHA:206638 \n8 2009-00-00T00:00:00Z 2009-00-00T00:00:00Z ORPHA:206638 \n9 2024-00-00T00:00:00Z 2024-00-00T00:00:00Z None \n\n genetic_diagnosis undiagnosed_case \n0 OMIM:614106 None \n1 OMIM:614106 None \n2 OMIM:614106 None \n3 OMIM:614106 None \n4 None None \n5 OMIM:614106 None \n6 None None \n7 OMIM:614106 None \n8 OMIM:614106 None \n9 OMIM:614106 None ", - "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
pseudonymdate_of_birthsexpatient_s_statusdate_of_deathfirst_contact_with_specialised_centreage_at_onsetage_at_diagnosisdiagnosis_of_the_rare_diseasegenetic_diagnosisundiagnosed_case
0patient02002-02-00T00:00:00ZFemaleAliveNone2019-00-00T00:00:00ZAntenatalAntenatalORPHA:206638OMIM:614106None
1patient11979-06-17T00:00:00ZMaleDead2010-02-00T00:00:00Z2019-00-00T00:00:00ZAt birthAt birthICD9:781OMIM:614106None
2patient22000-00-00T00:00:00ZMaleAliveNone2020-00-00T00:00:00Z2005-12-07T00:00:00Z2005-12-07T00:00:00ZORPHA:206638OMIM:614106None
3patient32003-01-07T00:00:00ZFemaleAliveNone2023-00-00T00:00:00Z2009-07-22T00:00:00Z2010-07-22T00:00:00ZORPHA:206638OMIM:614106None
4patient42004-08-02T00:00:00ZUndeterminedLost in follow-upNone2020-00-00T00:00:00ZUndeterminedUndeterminedORPHA:206638NoneNone
5patient51923-04-08T00:00:00ZFoetus (Unknown)Opted-outNone2021-00-00T00:00:00ZUndeterminedUndeterminedORPHA:206638OMIM:614106None
6patient61999-00-00T00:00:00ZFalseAliveNone2019-00-00T00:00:00Z2017-00-00T00:00:00Z2018-00-00T00:00:00ZORPHA:206638NoneNone
7patient72003-12-00T00:00:00ZmAliveNone2024-00-00T00:00:00Z2020-05-00T00:00:00Z2020-05-00T00:00:00ZORPHA:206638OMIM:614106None
8patient81979-09-00T00:00:00ZMaleDead2017-05-31T00:00:00Z2022-00-00T00:00:00Z2009-00-00T00:00:00Z2009-00-00T00:00:00ZORPHA:206638OMIM:614106None
9patient92002-00-00T00:00:00ZFemaleAliveNone2019-00-00T00:00:00Z2024-00-00T00:00:00Z2024-00-00T00:00:00ZNoneOMIM:614106None
\n
" - }, - "execution_count": 194, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ "ds.head(20)" ], "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-09-25T10:58:30.850165600Z", - "start_time": "2024-09-25T10:58:30.836049500Z" + "end_time": "2024-10-04T12:31:24.763736Z", + "start_time": "2024-10-04T12:31:24.755372Z" } }, "id": "fb05e92f9341ad27", - "execution_count": 194 - }, - { - "cell_type": "markdown", - "source": [ - "## 3. Preprocessing the data\n", - "\n", - "It is important to preprocess the data to adhere to the Phenopacket schema. \n", - "\n", - "E.g.: The `phenopackets.VitalStatus` expects an enum value out of 'ALIVE', 'DECEASED', and 'UNKNOWN_STATUS' for the `phenopackets.VitalStatus.status` field. But the ERDRI CDS defined it's data set as 'Alive', 'Dead', 'Lost in follow-up', 'Opted-out'. So we need to map these values to the expected ones." - ], - "metadata": { - "collapsed": false - }, - "id": "da740c3a22fc7f2a" - }, - { - "cell_type": "code", "outputs": [ { "data": { - "text/plain": " pseudonym date_of_birth sex patient_s_status \\\n0 patient0 2002-02-00T00:00:00Z FEMALE ALIVE \n1 patient1 1979-06-17T00:00:00Z MALE DECEASED \n2 patient2 2000-00-00T00:00:00Z MALE ALIVE \n3 patient3 2003-01-07T00:00:00Z FEMALE ALIVE \n4 patient4 2004-08-02T00:00:00Z UNKNOWN_SEX UNKNOWN_STATUS \n5 patient5 1923-04-08T00:00:00Z UNKNOWN_SEX UNKNOWN_STATUS \n6 patient6 1999-00-00T00:00:00Z FEMALE ALIVE \n7 patient7 2003-12-00T00:00:00Z MALE ALIVE \n8 patient8 1979-09-00T00:00:00Z MALE DECEASED \n9 patient9 2002-00-00T00:00:00Z FEMALE ALIVE \n\n date_of_death first_contact_with_specialised_centre \\\n0 None 2019-00-00T00:00:00Z \n1 2010-02-00T00:00:00Z 2019-00-00T00:00:00Z \n2 None 2020-00-00T00:00:00Z \n3 None 2023-00-00T00:00:00Z \n4 None 2020-00-00T00:00:00Z \n5 None 2021-00-00T00:00:00Z \n6 None 2019-00-00T00:00:00Z \n7 None 2024-00-00T00:00:00Z \n8 2017-05-31T00:00:00Z 2022-00-00T00:00:00Z \n9 None 2019-00-00T00:00:00Z \n\n age_at_onset age_at_diagnosis diagnosis_of_the_rare_disease \\\n0 Antenatal Antenatal ORPHA:206638 \n1 At birth At birth ICD9:781 \n2 2005-12-07T00:00:00Z 2005-12-07T00:00:00Z ORPHA:206638 \n3 2009-07-22T00:00:00Z 2010-07-22T00:00:00Z ORPHA:206638 \n4 Undetermined Undetermined ORPHA:206638 \n5 Undetermined Undetermined ORPHA:206638 \n6 2017-00-00T00:00:00Z 2018-00-00T00:00:00Z ORPHA:206638 \n7 2020-05-00T00:00:00Z 2020-05-00T00:00:00Z ORPHA:206638 \n8 2009-00-00T00:00:00Z 2009-00-00T00:00:00Z ORPHA:206638 \n9 2024-00-00T00:00:00Z 2024-00-00T00:00:00Z None \n\n genetic_diagnosis undiagnosed_case \n0 OMIM:614106 None \n1 OMIM:614106 None \n2 OMIM:614106 None \n3 OMIM:614106 None \n4 None None \n5 OMIM:614106 None \n6 None None \n7 OMIM:614106 None \n8 OMIM:614106 None \n9 OMIM:614106 None ", - "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
pseudonymdate_of_birthsexpatient_s_statusdate_of_deathfirst_contact_with_specialised_centreage_at_onsetage_at_diagnosisdiagnosis_of_the_rare_diseasegenetic_diagnosisundiagnosed_case
0patient02002-02-00T00:00:00ZFEMALEALIVENone2019-00-00T00:00:00ZAntenatalAntenatalORPHA:206638OMIM:614106None
1patient11979-06-17T00:00:00ZMALEDECEASED2010-02-00T00:00:00Z2019-00-00T00:00:00ZAt birthAt birthICD9:781OMIM:614106None
2patient22000-00-00T00:00:00ZMALEALIVENone2020-00-00T00:00:00Z2005-12-07T00:00:00Z2005-12-07T00:00:00ZORPHA:206638OMIM:614106None
3patient32003-01-07T00:00:00ZFEMALEALIVENone2023-00-00T00:00:00Z2009-07-22T00:00:00Z2010-07-22T00:00:00ZORPHA:206638OMIM:614106None
4patient42004-08-02T00:00:00ZUNKNOWN_SEXUNKNOWN_STATUSNone2020-00-00T00:00:00ZUndeterminedUndeterminedORPHA:206638NoneNone
5patient51923-04-08T00:00:00ZUNKNOWN_SEXUNKNOWN_STATUSNone2021-00-00T00:00:00ZUndeterminedUndeterminedORPHA:206638OMIM:614106None
6patient61999-00-00T00:00:00ZFEMALEALIVENone2019-00-00T00:00:00Z2017-00-00T00:00:00Z2018-00-00T00:00:00ZORPHA:206638NoneNone
7patient72003-12-00T00:00:00ZMALEALIVENone2024-00-00T00:00:00Z2020-05-00T00:00:00Z2020-05-00T00:00:00ZORPHA:206638OMIM:614106None
8patient81979-09-00T00:00:00ZMALEDECEASED2017-05-31T00:00:00Z2022-00-00T00:00:00Z2009-00-00T00:00:00Z2009-00-00T00:00:00ZORPHA:206638OMIM:614106None
9patient92002-00-00T00:00:00ZFEMALEALIVENone2019-00-00T00:00:00Z2024-00-00T00:00:00Z2024-00-00T00:00:00ZNoneOMIM:614106None
\n
" + "text/plain": [ + " pseudonym date_of_birth sex patient_s_status \\\n", + "0 patient0 2002-02-00T00:00:00Z Female Alive \n", + "1 patient1 1979-06-17T00:00:00Z Male Dead \n", + "2 patient2 2000-00-00T00:00:00Z Male Alive \n", + "3 patient3 2003-01-07T00:00:00Z Female Alive \n", + "4 patient4 2004-08-02T00:00:00Z Undetermined Lost in follow-up \n", + "5 patient5 1923-04-08T00:00:00Z Foetus (Unknown) Opted-out \n", + "6 patient6 1999-00-00T00:00:00Z False Alive \n", + "7 patient7 2003-12-00T00:00:00Z m Alive \n", + "8 patient8 1979-09-00T00:00:00Z Male Dead \n", + "9 patient9 2002-00-00T00:00:00Z Female Alive \n", + "\n", + " date_of_death first_contact_with_specialised_centre \\\n", + "0 None 2019-00-00T00:00:00Z \n", + "1 2010-02-00T00:00:00Z 2019-00-00T00:00:00Z \n", + "2 None 2020-00-00T00:00:00Z \n", + "3 None 2023-00-00T00:00:00Z \n", + "4 None 2020-00-00T00:00:00Z \n", + "5 None 2021-00-00T00:00:00Z \n", + "6 None 2019-00-00T00:00:00Z \n", + "7 None 2024-00-00T00:00:00Z \n", + "8 2017-05-31T00:00:00Z 2022-00-00T00:00:00Z \n", + "9 None 2019-00-00T00:00:00Z \n", + "\n", + " age_at_onset age_at_diagnosis diagnosis_of_the_rare_disease \\\n", + "0 Antenatal Antenatal ORPHA:206638 \n", + "1 At birth At birth ICD9:781 \n", + "2 2005-12-07T00:00:00Z 2005-12-07T00:00:00Z ORPHA:206638 \n", + "3 2009-07-22T00:00:00Z 2010-07-22T00:00:00Z ORPHA:206638 \n", + "4 Undetermined Undetermined ORPHA:206638 \n", + "5 Undetermined Undetermined ORPHA:206638 \n", + "6 2017-00-00T00:00:00Z 2018-00-00T00:00:00Z ORPHA:206638 \n", + "7 2020-05-00T00:00:00Z 2020-05-00T00:00:00Z ORPHA:206638 \n", + "8 2009-00-00T00:00:00Z 2009-00-00T00:00:00Z ORPHA:206638 \n", + "9 2024-00-00T00:00:00Z 2024-00-00T00:00:00Z None \n", + "\n", + " genetic_diagnosis undiagnosed_case \n", + "0 OMIM:614106 None \n", + "1 OMIM:614106 None \n", + "2 OMIM:614106 None \n", + "3 OMIM:614106 None \n", + "4 None None \n", + "5 OMIM:614106 None \n", + "6 None None \n", + "7 OMIM:614106 None \n", + "8 OMIM:614106 None \n", + "9 OMIM:614106 None " + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pseudonymdate_of_birthsexpatient_s_statusdate_of_deathfirst_contact_with_specialised_centreage_at_onsetage_at_diagnosisdiagnosis_of_the_rare_diseasegenetic_diagnosisundiagnosed_case
0patient02002-02-00T00:00:00ZFemaleAliveNone2019-00-00T00:00:00ZAntenatalAntenatalORPHA:206638OMIM:614106None
1patient11979-06-17T00:00:00ZMaleDead2010-02-00T00:00:00Z2019-00-00T00:00:00ZAt birthAt birthICD9:781OMIM:614106None
2patient22000-00-00T00:00:00ZMaleAliveNone2020-00-00T00:00:00Z2005-12-07T00:00:00Z2005-12-07T00:00:00ZORPHA:206638OMIM:614106None
3patient32003-01-07T00:00:00ZFemaleAliveNone2023-00-00T00:00:00Z2009-07-22T00:00:00Z2010-07-22T00:00:00ZORPHA:206638OMIM:614106None
4patient42004-08-02T00:00:00ZUndeterminedLost in follow-upNone2020-00-00T00:00:00ZUndeterminedUndeterminedORPHA:206638NoneNone
5patient51923-04-08T00:00:00ZFoetus (Unknown)Opted-outNone2021-00-00T00:00:00ZUndeterminedUndeterminedORPHA:206638OMIM:614106None
6patient61999-00-00T00:00:00ZFalseAliveNone2019-00-00T00:00:00Z2017-00-00T00:00:00Z2018-00-00T00:00:00ZORPHA:206638NoneNone
7patient72003-12-00T00:00:00ZmAliveNone2024-00-00T00:00:00Z2020-05-00T00:00:00Z2020-05-00T00:00:00ZORPHA:206638OMIM:614106None
8patient81979-09-00T00:00:00ZMaleDead2017-05-31T00:00:00Z2022-00-00T00:00:00Z2009-00-00T00:00:00Z2009-00-00T00:00:00ZORPHA:206638OMIM:614106None
9patient92002-00-00T00:00:00ZFemaleAliveNone2019-00-00T00:00:00Z2024-00-00T00:00:00Z2024-00-00T00:00:00ZNoneOMIM:614106None
\n", + "
" + ] }, - "execution_count": 195, + "execution_count": 88, "metadata": {}, "output_type": "execute_result" } ], - "source": [ - "ds.preprocess(\n", - " fields=erdri_cds_data_model.patient_s_status,\n", - " mapping={\n", - " \"Alive\": \"ALIVE\",\n", - " \"Dead\": \"DECEASED\",\n", - " \"Lost in follow-up\": \"UNKNOWN_STATUS\",\n", - " \"Opted-out\": \"UNKNOWN_STATUS\"\n", - " })\n", - "\n", - "ds.preprocess(\n", - " fields=erdri_cds_data_model.sex,\n", - " mapping={\n", - " 'Female': 'FEMALE',\n", - " 'Male': 'MALE',\n", - " 'Undetermined': 'UNKNOWN_SEX',\n", - " 'Foetus (Unknown)': 'UNKNOWN_SEX',\n", - " 'm': 'MALE',\n", - " False: 'FEMALE',\n", - " }\n", - ")\n", - "\n", - "def preprocess_age_at_diagnosis(values):\n", - " age_at_diagnosis = values['age_at_diagnosis']\n", - " date_of_birth = values['date_of_birth']\n", - " if age_at_diagnosis == 'At birth' or age_at_diagnosis == 'Antenatal':\n", - " return date_of_birth\n", - " elif age_at_diagnosis == 'Undetermined':\n", - " return None\n", - " else:\n", - " return age_at_diagnosis\n", - "\n", - "ds.head(20)" - ], - "metadata": { - "collapsed": false, - "ExecuteTime": { - "end_time": "2024-09-25T10:58:30.881972500Z", - "start_time": "2024-09-25T10:58:30.847163600Z" - } - }, - "id": "7980416431ff905f", - "execution_count": 195 + "execution_count": 88 }, { "cell_type": "markdown", - "source": [ - "## 4. Defining the mapping to Phenopackets" - ], + "source": "## 3. Defining the mapping to Phenopackets", "metadata": { "collapsed": false }, @@ -962,7 +1113,6 @@ }, { "cell_type": "code", - "outputs": [], "source": [ "import phenopackets\n", "\n", @@ -972,16 +1122,16 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-09-25T10:58:30.882976Z", - "start_time": "2024-09-25T10:58:30.861375Z" + "end_time": "2024-10-04T12:31:24.943254Z", + "start_time": "2024-10-04T12:31:24.939989Z" } }, "id": "ea6f88c242723618", - "execution_count": 196 + "outputs": [], + "execution_count": 90 }, { "cell_type": "code", - "outputs": [], "source": [ "added_fields = ['pseudonym', 'date_of_birth', 'patient_s_status', 'date_of_death', 'sex', 'diagnosis_of_the_rare_disease', 'age_at_diagnosis']\n", "\n", @@ -1029,41 +1179,346 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-09-25T10:58:30.898977200Z", - "start_time": "2024-09-25T10:58:30.866942300Z" + "end_time": "2024-10-04T12:31:25.023434Z", + "start_time": "2024-10-04T12:31:25.018928Z" } }, "id": "e214206e87497c92", - "execution_count": 197 + "outputs": [], + "execution_count": 91 }, { + "metadata": { + "collapsed": false + }, "cell_type": "markdown", "source": [ - "## 5. Perform the mapping" + "## 4. Preprocessing the data\n", + "\n", + "It is important to preprocess the data to adhere to the Phenopacket schema. \n", + "\n", + "E.g.: The `phenopackets.VitalStatus` expects an enum value out of 'ALIVE', 'DECEASED', and 'UNKNOWN_STATUS' for the `phenopackets.VitalStatus.status` field. But the ERDRI CDS defined it's data set as 'Alive', 'Dead', 'Lost in follow-up', 'Opted-out'. So we need to map these values to the expected ones." ], + "id": "da740c3a22fc7f2a" + }, + { + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-10-04T12:31:24.832449Z", + "start_time": "2024-10-04T12:31:24.823034Z" + } + }, + "cell_type": "code", + "outputs": [ + { + "data": { + "text/plain": [ + " pseudonym date_of_birth sex patient_s_status \\\n", + "0 patient0 2002-02-00T00:00:00Z FEMALE ALIVE \n", + "1 patient1 1979-06-17T00:00:00Z MALE DECEASED \n", + "2 patient2 2000-00-00T00:00:00Z MALE ALIVE \n", + "3 patient3 2003-01-07T00:00:00Z FEMALE ALIVE \n", + "4 patient4 2004-08-02T00:00:00Z UNKNOWN_SEX UNKNOWN_STATUS \n", + "5 patient5 1923-04-08T00:00:00Z UNKNOWN_SEX UNKNOWN_STATUS \n", + "6 patient6 1999-00-00T00:00:00Z FEMALE ALIVE \n", + "7 patient7 2003-12-00T00:00:00Z MALE ALIVE \n", + "8 patient8 1979-09-00T00:00:00Z MALE DECEASED \n", + "9 patient9 2002-00-00T00:00:00Z FEMALE ALIVE \n", + "\n", + " date_of_death first_contact_with_specialised_centre \\\n", + "0 None 2019-00-00T00:00:00Z \n", + "1 2010-02-00T00:00:00Z 2019-00-00T00:00:00Z \n", + "2 None 2020-00-00T00:00:00Z \n", + "3 None 2023-00-00T00:00:00Z \n", + "4 None 2020-00-00T00:00:00Z \n", + "5 None 2021-00-00T00:00:00Z \n", + "6 None 2019-00-00T00:00:00Z \n", + "7 None 2024-00-00T00:00:00Z \n", + "8 2017-05-31T00:00:00Z 2022-00-00T00:00:00Z \n", + "9 None 2019-00-00T00:00:00Z \n", + "\n", + " age_at_onset age_at_diagnosis diagnosis_of_the_rare_disease \\\n", + "0 Antenatal Antenatal ORPHA:206638 \n", + "1 At birth At birth ICD9:781 \n", + "2 2005-12-07T00:00:00Z 2005-12-07T00:00:00Z ORPHA:206638 \n", + "3 2009-07-22T00:00:00Z 2010-07-22T00:00:00Z ORPHA:206638 \n", + "4 Undetermined Undetermined ORPHA:206638 \n", + "5 Undetermined Undetermined ORPHA:206638 \n", + "6 2017-00-00T00:00:00Z 2018-00-00T00:00:00Z ORPHA:206638 \n", + "7 2020-05-00T00:00:00Z 2020-05-00T00:00:00Z ORPHA:206638 \n", + "8 2009-00-00T00:00:00Z 2009-00-00T00:00:00Z ORPHA:206638 \n", + "9 2024-00-00T00:00:00Z 2024-00-00T00:00:00Z None \n", + "\n", + " genetic_diagnosis undiagnosed_case \n", + "0 OMIM:614106 None \n", + "1 OMIM:614106 None \n", + "2 OMIM:614106 None \n", + "3 OMIM:614106 None \n", + "4 None None \n", + "5 OMIM:614106 None \n", + "6 None None \n", + "7 OMIM:614106 None \n", + "8 OMIM:614106 None \n", + "9 OMIM:614106 None " + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pseudonymdate_of_birthsexpatient_s_statusdate_of_deathfirst_contact_with_specialised_centreage_at_onsetage_at_diagnosisdiagnosis_of_the_rare_diseasegenetic_diagnosisundiagnosed_case
0patient02002-02-00T00:00:00ZFEMALEALIVENone2019-00-00T00:00:00ZAntenatalAntenatalORPHA:206638OMIM:614106None
1patient11979-06-17T00:00:00ZMALEDECEASED2010-02-00T00:00:00Z2019-00-00T00:00:00ZAt birthAt birthICD9:781OMIM:614106None
2patient22000-00-00T00:00:00ZMALEALIVENone2020-00-00T00:00:00Z2005-12-07T00:00:00Z2005-12-07T00:00:00ZORPHA:206638OMIM:614106None
3patient32003-01-07T00:00:00ZFEMALEALIVENone2023-00-00T00:00:00Z2009-07-22T00:00:00Z2010-07-22T00:00:00ZORPHA:206638OMIM:614106None
4patient42004-08-02T00:00:00ZUNKNOWN_SEXUNKNOWN_STATUSNone2020-00-00T00:00:00ZUndeterminedUndeterminedORPHA:206638NoneNone
5patient51923-04-08T00:00:00ZUNKNOWN_SEXUNKNOWN_STATUSNone2021-00-00T00:00:00ZUndeterminedUndeterminedORPHA:206638OMIM:614106None
6patient61999-00-00T00:00:00ZFEMALEALIVENone2019-00-00T00:00:00Z2017-00-00T00:00:00Z2018-00-00T00:00:00ZORPHA:206638NoneNone
7patient72003-12-00T00:00:00ZMALEALIVENone2024-00-00T00:00:00Z2020-05-00T00:00:00Z2020-05-00T00:00:00ZORPHA:206638OMIM:614106None
8patient81979-09-00T00:00:00ZMALEDECEASED2017-05-31T00:00:00Z2022-00-00T00:00:00Z2009-00-00T00:00:00Z2009-00-00T00:00:00ZORPHA:206638OMIM:614106None
9patient92002-00-00T00:00:00ZFEMALEALIVENone2019-00-00T00:00:00Z2024-00-00T00:00:00Z2024-00-00T00:00:00ZNoneOMIM:614106None
\n", + "
" + ] + }, + "execution_count": 89, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 89, + "source": [ + "ds.preprocess(\n", + " fields=erdri_cds_data_model.patient_s_status,\n", + " mapping={\n", + " \"Alive\": \"ALIVE\",\n", + " \"Dead\": \"DECEASED\",\n", + " \"Lost in follow-up\": \"UNKNOWN_STATUS\",\n", + " \"Opted-out\": \"UNKNOWN_STATUS\"\n", + " })\n", + "\n", + "ds.preprocess(\n", + " fields=erdri_cds_data_model.sex,\n", + " mapping={\n", + " 'Female': 'FEMALE',\n", + " 'Male': 'MALE',\n", + " 'Undetermined': 'UNKNOWN_SEX',\n", + " 'Foetus (Unknown)': 'UNKNOWN_SEX',\n", + " 'm': 'MALE',\n", + " False: 'FEMALE',\n", + " }\n", + ")\n", + "\n", + "def preprocess_age_at_diagnosis(values):\n", + " age_at_diagnosis = values['age_at_diagnosis']\n", + " date_of_birth = values['date_of_birth']\n", + " if age_at_diagnosis == 'At birth' or age_at_diagnosis == 'Antenatal':\n", + " return date_of_birth\n", + " elif age_at_diagnosis == 'Undetermined':\n", + " return None\n", + " else:\n", + " return age_at_diagnosis\n", + "\n", + "ds.head(20)" + ], + "id": "7980416431ff905f" + }, + { "metadata": { "collapsed": false }, + "cell_type": "markdown", + "source": "## 5. Perform the mapping", "id": "733d43d8444d3455" }, { "cell_type": "code", - "outputs": [], "source": [ "phenopackets_list = mapper.map(ds)" ], "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-09-25T10:58:30.958059Z", - "start_time": "2024-09-25T10:58:30.902977900Z" + "end_time": "2024-10-04T12:31:25.147344Z", + "start_time": "2024-10-04T12:31:25.142851Z" } }, "id": "6d5a7f4aec128e21", - "execution_count": 198 + "outputs": [], + "execution_count": 92 }, { "cell_type": "code", + "source": [ + "print(phenopackets_list[1])" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-10-04T12:31:25.282627Z", + "start_time": "2024-10-04T12:31:25.279553Z" + } + }, + "id": "32e91d8b41e9217f", "outputs": [ { "name": "stdout", @@ -1096,22 +1551,12 @@ " term {\n", " id: \"ICD9:781\"\n", " }\n", - "}\n" + "}\n", + "\n" ] } ], - "source": [ - "print(phenopackets_list[1])" - ], - "metadata": { - "collapsed": false, - "ExecuteTime": { - "end_time": "2024-09-25T10:58:30.992351900Z", - "start_time": "2024-09-25T10:58:30.984426400Z" - } - }, - "id": "32e91d8b41e9217f", - "execution_count": 199 + "execution_count": 93 }, { "cell_type": "markdown", @@ -1125,23 +1570,22 @@ }, { "cell_type": "code", - "outputs": [], "source": [ "from phenopacket_mapper.pipeline import write" ], "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-09-25T10:58:31.008778900Z", - "start_time": "2024-09-25T10:58:30.994243Z" + "end_time": "2024-10-04T12:31:25.385642Z", + "start_time": "2024-10-04T12:31:25.381610Z" } }, "id": "a52ac804faaf1b48", - "execution_count": 200 + "outputs": [], + "execution_count": 94 }, { "cell_type": "code", - "outputs": [], "source": [ "output_path = Path('../res/test_data/erdri/output/')\n", "\n", @@ -1150,12 +1594,13 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-09-25T10:58:31.024409100Z", - "start_time": "2024-09-25T10:58:31.010755600Z" + "end_time": "2024-10-04T12:31:25.400354Z", + "start_time": "2024-10-04T12:31:25.394090Z" } }, "id": "b851fa9316a33ba8", - "execution_count": 201 + "outputs": [], + "execution_count": 95 }, { "cell_type": "markdown", @@ -1171,7 +1616,6 @@ }, { "cell_type": "code", - "outputs": [], "source": [ "resources = [SNOMED_CT, ORDO, ] # etc.\n", "ds = load_dataset(\n", @@ -1185,9 +1629,27 @@ ")" ], "metadata": { - "collapsed": false + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-10-04T12:31:25.433132Z", + "start_time": "2024-10-04T12:31:25.419796Z" + } }, - "id": "52e5d5a0ae678124" + "id": "52e5d5a0ae678124", + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'load_dataset' is not defined", + "output_type": "error", + "traceback": [ + "\u001B[1;31m---------------------------------------------------------------------------\u001B[0m", + "\u001B[1;31mNameError\u001B[0m Traceback (most recent call last)", + "Cell \u001B[1;32mIn[96], line 2\u001B[0m\n\u001B[0;32m 1\u001B[0m resources \u001B[38;5;241m=\u001B[39m [SNOMED_CT, ORDO, ] \u001B[38;5;66;03m# etc.\u001B[39;00m\n\u001B[1;32m----> 2\u001B[0m ds \u001B[38;5;241m=\u001B[39m \u001B[43mload_dataset\u001B[49m(\n\u001B[0;32m 3\u001B[0m path\u001B[38;5;241m=\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m...\u001B[39m\u001B[38;5;124m\"\u001B[39m,\n\u001B[0;32m 4\u001B[0m infer_data_model\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mTrue\u001B[39;00m,\n\u001B[0;32m 5\u001B[0m )\n\u001B[0;32m 6\u001B[0m PhenopacketMapper\u001B[38;5;241m.\u001B[39mmap(\n\u001B[0;32m 7\u001B[0m data\u001B[38;5;241m=\u001B[39mds,\n\u001B[0;32m 8\u001B[0m resources\u001B[38;5;241m=\u001B[39mresources,\n\u001B[0;32m 9\u001B[0m \u001B[38;5;28mid\u001B[39m\u001B[38;5;241m=\u001B[39m\u001B[38;5;241m.\u001B[39m\u001B[38;5;241m.\u001B[39m\u001B[38;5;241m.\u001B[39m\n\u001B[0;32m 10\u001B[0m )\n", + "\u001B[1;31mNameError\u001B[0m: name 'load_dataset' is not defined" + ] + } + ], + "execution_count": 96 }, { "cell_type": "markdown", @@ -1240,7 +1702,7 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-09-25T10:58:31.059907100Z", + "end_time": "2024-10-04T12:31:25.505858500Z", "start_time": "2024-09-25T10:58:31.038395300Z" } }, @@ -1265,7 +1727,7 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-09-25T10:58:31.060908500Z", + "end_time": "2024-10-04T12:31:25.505858500Z", "start_time": "2024-09-25T10:58:31.044680600Z" } }, @@ -1304,7 +1766,7 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-09-25T10:58:31.093405500Z", + "end_time": "2024-10-04T12:31:25.506864700Z", "start_time": "2024-09-25T10:58:31.067923300Z" } }, @@ -1318,7 +1780,7 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-09-25T10:58:31.105385800Z", + "end_time": "2024-10-04T12:31:25.506864700Z", "start_time": "2024-09-25T10:58:31.097376500Z" } }, diff --git a/src/phenopacket_mapper/mapping/mapper.py b/src/phenopacket_mapper/mapping/mapper.py index a11631d..402dc64 100644 --- a/src/phenopacket_mapper/mapping/mapper.py +++ b/src/phenopacket_mapper/mapping/mapper.py @@ -31,6 +31,18 @@ def __post_init__(self): for e in self.elements.values(): self.check_data_fields_in_model(e) + self.check_fields_adheres_to_phenopacket_allowed_values() + + def check_fields_adheres_to_phenopacket_allowed_values(self): + """Check if the fields in the mapping adhere to the values in the Phenopacket schema + + Check the Phenopacket schema to see if the fields in the mapping adhere to the values allowed by the schema. + Otherwise give precise error messages. + """ + tmp = self.elements.copy() + return True + + def check_data_fields_in_model(self, element: Union[PhenopacketElement, DataField]): if isinstance(element, DataField): field = element From 4343a496277b2f16102cb5b140ef3705036d719b Mon Sep 17 00:00:00 2001 From: frehburg Date: Fri, 11 Oct 2024 10:02:18 +0200 Subject: [PATCH 02/39] made datanode frozen --- src/phenopacket_mapper/_api/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/phenopacket_mapper/_api/__init__.py b/src/phenopacket_mapper/_api/__init__.py index 6f77c4d..a402db6 100644 --- a/src/phenopacket_mapper/_api/__init__.py +++ b/src/phenopacket_mapper/_api/__init__.py @@ -29,7 +29,7 @@ class DataModel(metaclass=abc.ABCMeta): pass -@dataclass +@dataclass(frozen=True) class DataNode(metaclass=abc.ABCMeta): """ This is very much like Jackson (Java) `TreeNode`, From 7c705d6d3d1abaabaf8c0a03f79c23f4e0bafe15 Mon Sep 17 00:00:00 2001 From: frehburg Date: Fri, 11 Oct 2024 10:02:31 +0200 Subject: [PATCH 03/39] made datafield inherit datanode --- src/phenopacket_mapper/data_standards/data_model.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/phenopacket_mapper/data_standards/data_model.py b/src/phenopacket_mapper/data_standards/data_model.py index bd13228..34df6c2 100644 --- a/src/phenopacket_mapper/data_standards/data_model.py +++ b/src/phenopacket_mapper/data_standards/data_model.py @@ -16,6 +16,7 @@ import pandas as pd +from phenopacket_mapper._api import DataNode from phenopacket_mapper.data_standards import CodeSystem from phenopacket_mapper.data_standards.date import Date from phenopacket_mapper.data_standards.value_set import ValueSet @@ -23,7 +24,7 @@ @dataclass(slots=True, frozen=True) -class DataField: +class DataField(DataNode): """This class defines fields used in the definition of a `DataModel` A dataa field is the equivalent of a column in a table. It has a name, a value set, a description, a section, a From 2de9bff59912edcb0c4891c21760ccee96370e11 Mon Sep 17 00:00:00 2001 From: frehburg Date: Fri, 11 Oct 2024 10:07:51 +0200 Subject: [PATCH 04/39] removed section and ordinal from datafield --- src/phenopacket_mapper/data_standards/data_model.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/src/phenopacket_mapper/data_standards/data_model.py b/src/phenopacket_mapper/data_standards/data_model.py index 34df6c2..8c87bec 100644 --- a/src/phenopacket_mapper/data_standards/data_model.py +++ b/src/phenopacket_mapper/data_standards/data_model.py @@ -52,18 +52,14 @@ class DataField(DataNode): :ivar specification: Value set of the field, if the value set is only one type, can also pass that type directly :ivar id: The identifier of the field, adhering to the naming rules stated above :ivar description: Description of the field - :ivar section: Section of the field (Only applicable if the data model is divided into sections) :ivar required: Required flag of the field - :ivar ordinal: Ordinal of the field (E.g. 1.1, 1.2, 2.1, etc.) """ # TODO: change section into path to data name: str = field() specification: Union[ValueSet, type, List[type]] = field() id: str = field(default=None) description: str = field(default='') - section: str = field(default='') required: bool = field(default=True) - ordinal: str = field(default='') def __post_init__(self): if not self.id: @@ -79,8 +75,7 @@ def __post_init__(self): def __str__(self): ret = "DataField(\n" ret += f"\t\tid: {self.id},\n" - ret += f"\t\tsection: {self.section},\n" - ret += f"\t\tordinal, name: ({self.ordinal}, {self.name}),\n" + ret += f"\t\tname: {self.name},\n" ret += f"\t\tvalue_set: {self.specification}, required: {self.required},\n" ret += f"\t\tspecification: {self.specification}\n" ret += "\t)" @@ -253,12 +248,9 @@ def from_file( file_type: Literal['csv', 'excel', 'unknown'] = 'unknown', column_names: Dict[str, str] = MappingProxyType({ DataField.name.__name__: 'data_field_name', - DataField.section.__name__: 'data_model_section', DataField.description.__name__: 'description', DataField.specification.__name__: 'value_set', DataField.required.__name__: 'required', - DataField.specification.__name__: 'specification', - DataField.ordinal.__name__: 'ordinal' }), parse_value_sets: bool = False, remove_line_breaks: bool = False, From a9d1a40f4a957310a3b169463ca6d5e4316915df Mon Sep 17 00:00:00 2001 From: frehburg Date: Fri, 11 Oct 2024 10:08:08 +0200 Subject: [PATCH 05/39] renamed label in datanode to name --- src/phenopacket_mapper/_api/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/phenopacket_mapper/_api/__init__.py b/src/phenopacket_mapper/_api/__init__.py index a402db6..71e8e6f 100644 --- a/src/phenopacket_mapper/_api/__init__.py +++ b/src/phenopacket_mapper/_api/__init__.py @@ -41,7 +41,7 @@ class DataNode(metaclass=abc.ABCMeta): We want to be able to (de)serialize this. """ - label: str + name: str id: str required: bool From c7d8dab856a2db251cc6c821741c2a000b6b04e2 Mon Sep 17 00:00:00 2001 From: frehburg Date: Fri, 11 Oct 2024 10:14:10 +0200 Subject: [PATCH 06/39] changed imports --- src/phenopacket_mapper/_api/__init__.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/phenopacket_mapper/_api/__init__.py b/src/phenopacket_mapper/_api/__init__.py index 71e8e6f..49a8ced 100644 --- a/src/phenopacket_mapper/_api/__init__.py +++ b/src/phenopacket_mapper/_api/__init__.py @@ -2,12 +2,12 @@ This package is intended to expose the PhenopacketMapper API to the user. """ -import abc +from abc import ABCMeta, abstractmethod, abstractproperty from typing import Tuple, Iterable, Iterator from dataclasses import dataclass -class DataModelDefiner(metaclass=abc.ABCMeta): +class DataModelDefiner(metaclass=ABCMeta): """ Take some data model definition and try to load it into :class:`DataModel`. @@ -16,7 +16,7 @@ class DataModelDefiner(metaclass=abc.ABCMeta): pass -class DataModel(metaclass=abc.ABCMeta): +class DataModel(metaclass=ABCMeta): """ Value class. The fields: @@ -29,8 +29,7 @@ class DataModel(metaclass=abc.ABCMeta): pass -@dataclass(frozen=True) -class DataNode(metaclass=abc.ABCMeta): +class DataNode(metaclass=ABCMeta): """ This is very much like Jackson (Java) `TreeNode`, because it can be many things. From 7d34c9fce5fdb8e6926e2fb1aee6baf9abfa6fae Mon Sep 17 00:00:00 2001 From: frehburg Date: Fri, 11 Oct 2024 10:14:24 +0200 Subject: [PATCH 07/39] changed declaration of fields subclass must support --- src/phenopacket_mapper/_api/__init__.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/src/phenopacket_mapper/_api/__init__.py b/src/phenopacket_mapper/_api/__init__.py index 49a8ced..aab6d6d 100644 --- a/src/phenopacket_mapper/_api/__init__.py +++ b/src/phenopacket_mapper/_api/__init__.py @@ -40,16 +40,29 @@ class DataNode(metaclass=ABCMeta): We want to be able to (de)serialize this. """ - name: str - id: str - required: bool + @property + @abstractmethod + def id(self) -> str: + pass + + @property + @abstractmethod + def name(self) -> str: + pass + + @property + @abstractmethod + def required(self) -> bool: + pass + + class DataInstance: pass -class Transformation(metaclass=abc.ABCMeta): +class Transformation(metaclass=ABCMeta): """ """ From c0c8ec41af4e3ae5aee10a30280e43200ec58081 Mon Sep 17 00:00:00 2001 From: frehburg Date: Fri, 11 Oct 2024 10:15:16 +0200 Subject: [PATCH 08/39] removed section and ordinal --- src/phenopacket_mapper/pipeline/input.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/phenopacket_mapper/pipeline/input.py b/src/phenopacket_mapper/pipeline/input.py index 46fba7d..f37347a 100644 --- a/src/phenopacket_mapper/pipeline/input.py +++ b/src/phenopacket_mapper/pipeline/input.py @@ -22,11 +22,9 @@ def read_data_model( file_type: Literal['csv', 'excel', 'unknown'] = 'unknown', column_names: Dict[str, str] = MappingProxyType({ DataField.name.__name__: 'data_field_name', - DataField.section.__name__: 'data_model_section', DataField.description.__name__: 'description', DataField.specification.__name__: 'value_set', DataField.required.__name__: 'required', - DataField.ordinal.__name__: 'ordinal' }), parse_value_sets: bool = False, remove_line_breaks: bool = False, From c8a0baa54d5aca1b8ddc7bb19a2e64da8826f8bd Mon Sep 17 00:00:00 2001 From: frehburg Date: Fri, 11 Oct 2024 10:17:35 +0200 Subject: [PATCH 09/39] typo --- src/phenopacket_mapper/_api/__init__.py | 2 -- src/phenopacket_mapper/data_standards/data_model.py | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/phenopacket_mapper/_api/__init__.py b/src/phenopacket_mapper/_api/__init__.py index aab6d6d..fb75899 100644 --- a/src/phenopacket_mapper/_api/__init__.py +++ b/src/phenopacket_mapper/_api/__init__.py @@ -56,8 +56,6 @@ def required(self) -> bool: pass - - class DataInstance: pass diff --git a/src/phenopacket_mapper/data_standards/data_model.py b/src/phenopacket_mapper/data_standards/data_model.py index 8c87bec..c2abacd 100644 --- a/src/phenopacket_mapper/data_standards/data_model.py +++ b/src/phenopacket_mapper/data_standards/data_model.py @@ -38,7 +38,7 @@ class DataField(DataNode): - The `id` field must be a valid Python identifier - The `id` field must start with a letter or the underscore character - The `id` field must cannot start with a number - - The `id` field can only contain lowercase alpha-numeric characters and underscores (a-z, 0-9, and _ ) + - The `id` field can only contain lowercase alphanumeric characters and underscores (a-z, 0-9, and _ ) - The `id` field cannot be any of the Python keywords (e.g. `in`, `is`, `not`, `class`, etc.). - The `id` field must be unique within a `DataModel` From 63ba119929fc4f005d81f820d5c7f71dc375a3ed Mon Sep 17 00:00:00 2001 From: frehburg Date: Fri, 11 Oct 2024 10:17:43 +0200 Subject: [PATCH 10/39] removed todo --- src/phenopacket_mapper/data_standards/data_model.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/phenopacket_mapper/data_standards/data_model.py b/src/phenopacket_mapper/data_standards/data_model.py index c2abacd..de5c795 100644 --- a/src/phenopacket_mapper/data_standards/data_model.py +++ b/src/phenopacket_mapper/data_standards/data_model.py @@ -54,7 +54,6 @@ class DataField(DataNode): :ivar description: Description of the field :ivar required: Required flag of the field """ - # TODO: change section into path to data name: str = field() specification: Union[ValueSet, type, List[type]] = field() id: str = field(default=None) From 9de26e6e020210037985569905c87a028071f745 Mon Sep 17 00:00:00 2001 From: frehburg Date: Fri, 11 Oct 2024 10:25:47 +0200 Subject: [PATCH 11/39] changed order and added description --- src/phenopacket_mapper/_api/__init__.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/phenopacket_mapper/_api/__init__.py b/src/phenopacket_mapper/_api/__init__.py index fb75899..5631020 100644 --- a/src/phenopacket_mapper/_api/__init__.py +++ b/src/phenopacket_mapper/_api/__init__.py @@ -42,12 +42,12 @@ class DataNode(metaclass=ABCMeta): """ @property @abstractmethod - def id(self) -> str: + def name(self) -> str: pass @property @abstractmethod - def name(self) -> str: + def id(self) -> str: pass @property @@ -56,6 +56,12 @@ def required(self) -> bool: pass + @property + @abstractmethod + def description(self) -> str: + pass + + class DataInstance: pass From 4de8f031fd9ea91af6b5d289f9f12b2849767d25 Mon Sep 17 00:00:00 2001 From: frehburg Date: Fri, 11 Oct 2024 10:25:58 +0200 Subject: [PATCH 12/39] changed order --- src/phenopacket_mapper/data_standards/data_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/phenopacket_mapper/data_standards/data_model.py b/src/phenopacket_mapper/data_standards/data_model.py index de5c795..a0f1f7e 100644 --- a/src/phenopacket_mapper/data_standards/data_model.py +++ b/src/phenopacket_mapper/data_standards/data_model.py @@ -57,8 +57,8 @@ class DataField(DataNode): name: str = field() specification: Union[ValueSet, type, List[type]] = field() id: str = field(default=None) + required: bool = field(default=False) description: str = field(default='') - required: bool = field(default=True) def __post_init__(self): if not self.id: From 138529d5b0f38d691739830ae1e830580c07239d Mon Sep 17 00:00:00 2001 From: frehburg Date: Fri, 11 Oct 2024 10:26:17 +0200 Subject: [PATCH 13/39] added datasection class --- .../data_standards/data_model.py | 22 ++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/src/phenopacket_mapper/data_standards/data_model.py b/src/phenopacket_mapper/data_standards/data_model.py index a0f1f7e..c705e1b 100644 --- a/src/phenopacket_mapper/data_standards/data_model.py +++ b/src/phenopacket_mapper/data_standards/data_model.py @@ -88,7 +88,27 @@ def __eq__(self, other): and self.required == other.required) -@dataclass(slots=True) +@dataclass(slots=True, frozen=True) +class DataSection: + """This class defines a section in a `DataModel` + + A section is a collection of `DataField` or `DataSection` objects. It is used to group related fields in a + `DataModel`. + + :ivar name: Name of the section + :ivar fields: List of `DataField` objects + """ + name: str = field() + id: str = field(default=None) + fields: Tuple[Union[DataField, 'DataSection']] = field(default_factory=tuple) + required: bool = field(default=False) + + def __post_init__(self): + if not self.id: + from phenopacket_mapper.utils import str_to_valid_id + object.__setattr__(self, 'id', str_to_valid_id(self.name)) + +@dataclass(slots=True, frozen=True) class DataFieldValue: """This class defines the value of a `DataField` in a `DataModelInstance` From 2f6d6db70a57c035ddc54b87c868255f792d2c9d Mon Sep 17 00:00:00 2001 From: frehburg Date: Fri, 11 Oct 2024 12:29:33 +0200 Subject: [PATCH 14/39] added new files --- notebooks/hierarchical_data_model.ipynb | 37 +++++++++++++++++++ .../data_standards/cardinality.py | 0 2 files changed, 37 insertions(+) create mode 100644 notebooks/hierarchical_data_model.ipynb create mode 100644 src/phenopacket_mapper/data_standards/cardinality.py diff --git a/notebooks/hierarchical_data_model.ipynb b/notebooks/hierarchical_data_model.ipynb new file mode 100644 index 0000000..54f657b --- /dev/null +++ b/notebooks/hierarchical_data_model.ipynb @@ -0,0 +1,37 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "initial_id", + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/src/phenopacket_mapper/data_standards/cardinality.py b/src/phenopacket_mapper/data_standards/cardinality.py new file mode 100644 index 0000000..e69de29 From 32eff16af19ada3c8674ab827fe61c5b7424a257 Mon Sep 17 00:00:00 2001 From: frehburg Date: Fri, 11 Oct 2024 12:31:20 +0200 Subject: [PATCH 15/39] added Cardinality to abc DataNode --- src/phenopacket_mapper/_api/__init__.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/phenopacket_mapper/_api/__init__.py b/src/phenopacket_mapper/_api/__init__.py index 5631020..3ff3106 100644 --- a/src/phenopacket_mapper/_api/__init__.py +++ b/src/phenopacket_mapper/_api/__init__.py @@ -3,9 +3,11 @@ """ from abc import ABCMeta, abstractmethod, abstractproperty -from typing import Tuple, Iterable, Iterator +from typing import Tuple, Iterable, Iterator, Union, Literal from dataclasses import dataclass +from phenopacket_mapper.data_standards import Cardinality + class DataModelDefiner(metaclass=ABCMeta): """ @@ -61,6 +63,11 @@ def required(self) -> bool: def description(self) -> str: pass + @property + @abstractmethod + def cardinality(self) -> Cardinality: + pass + class DataInstance: pass From a063aeb46c111aa0c5a3e9a46d0604599c3d2d90 Mon Sep 17 00:00:00 2001 From: frehburg Date: Fri, 11 Oct 2024 12:32:03 +0200 Subject: [PATCH 16/39] implemented cardinality class --- src/phenopacket_mapper/data_standards/cardinality.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/phenopacket_mapper/data_standards/cardinality.py b/src/phenopacket_mapper/data_standards/cardinality.py index e69de29..eaad6b0 100644 --- a/src/phenopacket_mapper/data_standards/cardinality.py +++ b/src/phenopacket_mapper/data_standards/cardinality.py @@ -0,0 +1,8 @@ +from dataclasses import dataclass, field +from typing import Union, Literal + + +@dataclass(slots=True, frozen=True) +class Cardinality: + min: int = field(default=0) + max: Union[int, Literal['n']] = field(default='n') \ No newline at end of file From e6e1a89eb01037f54a7627465ad230b1c3e4a87d Mon Sep 17 00:00:00 2001 From: frehburg Date: Fri, 11 Oct 2024 12:32:28 +0200 Subject: [PATCH 17/39] added cardinality to datafield and datasection --- src/phenopacket_mapper/data_standards/data_model.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/phenopacket_mapper/data_standards/data_model.py b/src/phenopacket_mapper/data_standards/data_model.py index c705e1b..7bfcb0d 100644 --- a/src/phenopacket_mapper/data_standards/data_model.py +++ b/src/phenopacket_mapper/data_standards/data_model.py @@ -17,7 +17,7 @@ import pandas as pd from phenopacket_mapper._api import DataNode -from phenopacket_mapper.data_standards import CodeSystem +from phenopacket_mapper.data_standards import CodeSystem, Cardinality from phenopacket_mapper.data_standards.date import Date from phenopacket_mapper.data_standards.value_set import ValueSet from phenopacket_mapper.preprocessing import preprocess, preprocess_method @@ -59,6 +59,7 @@ class DataField(DataNode): id: str = field(default=None) required: bool = field(default=False) description: str = field(default='') + cardinality: Cardinality = field(default_factory=Cardinality) def __post_init__(self): if not self.id: @@ -102,6 +103,7 @@ class DataSection: id: str = field(default=None) fields: Tuple[Union[DataField, 'DataSection']] = field(default_factory=tuple) required: bool = field(default=False) + cardinality: Cardinality = field(default_factory=Cardinality) def __post_init__(self): if not self.id: From 69b9551ce98d9d300ee513df626ef0f4bd1e47d3 Mon Sep 17 00:00:00 2001 From: frehburg Date: Fri, 11 Oct 2024 12:33:03 +0200 Subject: [PATCH 18/39] cardinality in init.py --- src/phenopacket_mapper/data_standards/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/phenopacket_mapper/data_standards/__init__.py b/src/phenopacket_mapper/data_standards/__init__.py index 82c7ec3..f04840b 100644 --- a/src/phenopacket_mapper/data_standards/__init__.py +++ b/src/phenopacket_mapper/data_standards/__init__.py @@ -1,5 +1,6 @@ """This submodule defines the data standards used in the project.""" +from .cardinality import Cardinality from .date import Date from .code_system import CodeSystem, SNOMED_CT, HPO, MONDO, OMIM, ORDO, LOINC from .code import Coding, CodeableConcept @@ -8,6 +9,7 @@ from .value_set import ValueSet __all__ = [ + "Cardinality", "Coding", "CodeableConcept", "DataModel", "DataField", "DataModelInstance", "DataFieldValue", "DataSet", "data_models", From e67c274f8e4e049df13a74836bfc4507f30b11a4 Mon Sep 17 00:00:00 2001 From: frehburg Date: Fri, 11 Oct 2024 12:33:29 +0200 Subject: [PATCH 19/39] started example in new nb --- notebooks/hierarchical_data_model.ipynb | 76 +++++++++++++++++++++++-- 1 file changed, 72 insertions(+), 4 deletions(-) diff --git a/notebooks/hierarchical_data_model.ipynb b/notebooks/hierarchical_data_model.ipynb index 54f657b..a3d826c 100644 --- a/notebooks/hierarchical_data_model.ipynb +++ b/notebooks/hierarchical_data_model.ipynb @@ -2,15 +2,83 @@ "cells": [ { "cell_type": "code", - "execution_count": null, "id": "initial_id", "metadata": { - "collapsed": true + "collapsed": true, + "ExecuteTime": { + "end_time": "2024-10-11T08:27:10.741111Z", + "start_time": "2024-10-11T08:27:09.593853Z" + } }, + "source": [ + "from dataclasses import fields\n", + "\n", + "from build.lib.phenopacket_mapper.data_standards import DataField\n", + "from phenopacket_mapper.data_standards import DataModel, ValueSet, DataSection" + ], "outputs": [], + "execution_count": 1 + }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, "source": [ - "" - ] + "genomic_interpretation = DataModel(\n", + " data_model_name=\"Phenopacket schema Genomic Interpretation\",\n", + " fields=(\n", + " DataField(\n", + " name=\"subject_or_biosample_id\",\n", + " specification=str,\n", + " required=True,\n", + " description=\"The id of the patient or biosample that is the subject being interpreted. REQUIRED.\"\n", + " ),\n", + " \n", + " DataField(\n", + " name=\"interpretation_status\",\n", + " specification=ValueSet(\n", + " name=\"Interpretation Status Value Set\",\n", + " elements=[\"UNKNOWN_STATUS\", \"REJECTED\", \"CANDIDATE\", \"CONTRIBUTORY\", \"CAUSATIVE\"],\n", + " ),\n", + " required=True,\n", + " description=\"status of the interpretation. REQUIRED.\",\n", + " ),\n", + " \n", + " DataSection(\n", + " name=\"call\",\n", + " fields=(\n", + " DataSection(\n", + " name=\"GeneDescriptor\",\n", + " fields=(\n", + " DataField(\n", + " name=\"value_id\",\n", + " specification=str,\n", + " required=True,\n", + " description=\"Official identifier of the gene. REQUIRED.\"\n", + " ),\n", + " \n", + " DataField(\n", + " name=\"symbol\",\n", + " specification=str,\n", + " required=True,\n", + " description=\"Official gene symbol. REQUIRED.\"\n", + " ),\n", + " \n", + " DataField(\n", + " name=\"description\",\n", + " specification=str,\n", + " required=False,\n", + " description=\"A free-text description of the gene\"\n", + " ),\n", + " ),\n", + " ),\n", + " ),\n", + " ),\n", + " )\n", + ")" + ], + "id": "2e979683ae450d9b" } ], "metadata": { From 835f5f1e620f7dc21061f5fb57335dc12721e4c0 Mon Sep 17 00:00:00 2001 From: frehburg Date: Fri, 11 Oct 2024 12:51:54 +0200 Subject: [PATCH 20/39] added tests for cardinality --- tests/data_standards/test_cardinality.py | 32 ++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 tests/data_standards/test_cardinality.py diff --git a/tests/data_standards/test_cardinality.py b/tests/data_standards/test_cardinality.py new file mode 100644 index 0000000..d5737f6 --- /dev/null +++ b/tests/data_standards/test_cardinality.py @@ -0,0 +1,32 @@ +import pytest + +from phenopacket_mapper.data_standards import Cardinality + + +@pytest.mark.parametrize( + "inp, expected", [ + ((0, 1), Cardinality(0, 1)), + ((1, 1), Cardinality(1, 1)), + ((0, 'n'), Cardinality(0, 'n')), + ((1, 'n'), Cardinality(1, 'n')), + ((0, 3), Cardinality(0, 3)), + ] +) +def test_cardinality_instantiation(inp, expected): + assert Cardinality(*inp) == expected + + +@pytest.mark.parametrize( + "inp, exc_", [ + ((-1, 1), ValueError), + ((-1, -1), ValueError), + ((1, -1), ValueError), + ((0, 0), ValueError), + ((0, 'm'), ValueError), + ((1.0, 'n'), ValueError), + ((1.3, 'n'), ValueError), + ] +) +def test_cardinality_instantiation_raises(inp, exc_): + with pytest.raises(exc_): + Cardinality(*inp) \ No newline at end of file From 57bd24be08311ea2aefe7d2cac4c90d662369c5b Mon Sep 17 00:00:00 2001 From: frehburg Date: Fri, 11 Oct 2024 12:53:29 +0200 Subject: [PATCH 21/39] added post init in cardinality to check if instantiation was valid --- .../data_standards/cardinality.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/phenopacket_mapper/data_standards/cardinality.py b/src/phenopacket_mapper/data_standards/cardinality.py index eaad6b0..b8c67fa 100644 --- a/src/phenopacket_mapper/data_standards/cardinality.py +++ b/src/phenopacket_mapper/data_standards/cardinality.py @@ -5,4 +5,15 @@ @dataclass(slots=True, frozen=True) class Cardinality: min: int = field(default=0) - max: Union[int, Literal['n']] = field(default='n') \ No newline at end of file + max: Union[int, Literal['n']] = field(default='n') + + def __post_init__(self): + if not isinstance(self.min, int): + raise ValueError(f"Parameter min must be of type integer. (Not: {type(self.min)})") + elif self.min < 0: + raise ValueError(f"Parameter min must be a non-negative integer. (Not: {self.min})") + if not (isinstance(self.max, int) or self.max == 'n'): + raise ValueError(f"Parameter max must be of type or equal to the literal 'n'. " + f"(Not: {self.min} ({type(self.min)}))") + elif self.max != 'n' and self.max < 1: # has to be an integer + raise ValueError(f"Parameter max must be a positive integer. (Not: {self.min})") \ No newline at end of file From 3e9b1934ec50be2f21e5875655edebe37b543c84 Mon Sep 17 00:00:00 2001 From: frehburg Date: Fri, 11 Oct 2024 12:53:41 +0200 Subject: [PATCH 22/39] added datasection to init --- src/phenopacket_mapper/data_standards/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/phenopacket_mapper/data_standards/__init__.py b/src/phenopacket_mapper/data_standards/__init__.py index f04840b..2f7e10f 100644 --- a/src/phenopacket_mapper/data_standards/__init__.py +++ b/src/phenopacket_mapper/data_standards/__init__.py @@ -4,14 +4,14 @@ from .date import Date from .code_system import CodeSystem, SNOMED_CT, HPO, MONDO, OMIM, ORDO, LOINC from .code import Coding, CodeableConcept -from .data_model import DataModel, DataField, DataModelInstance, DataFieldValue, DataSet +from .data_model import DataModel, DataField, DataModelInstance, DataFieldValue, DataSet, DataSection from . import data_models from .value_set import ValueSet __all__ = [ "Cardinality", "Coding", "CodeableConcept", - "DataModel", "DataField", "DataModelInstance", "DataFieldValue", "DataSet", + "DataModel", "DataField", "DataModelInstance", "DataFieldValue", "DataSet", "DataSection", "data_models", "CodeSystem", "SNOMED_CT", "HPO", "MONDO", "OMIM", "ORDO", "LOINC", From f89a021d48e532c10577433351875b226d6a84e8 Mon Sep 17 00:00:00 2001 From: frehburg Date: Fri, 11 Oct 2024 12:54:45 +0200 Subject: [PATCH 23/39] modified datamodel to take both datafield and datasection --- src/phenopacket_mapper/data_standards/data_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/phenopacket_mapper/data_standards/data_model.py b/src/phenopacket_mapper/data_standards/data_model.py index 7bfcb0d..3b93114 100644 --- a/src/phenopacket_mapper/data_standards/data_model.py +++ b/src/phenopacket_mapper/data_standards/data_model.py @@ -177,7 +177,7 @@ class DataModel: :ivar resources: List of `CodeSystem` objects """ data_model_name: str = field() - fields: Tuple[DataField, ...] = field() + fields: Tuple[Union[DataField, DataSection], ...] = field() resources: List[CodeSystem] = field(default_factory=list) def __post_init__(self): From b8737ee081661dda5f50c9e5e11a4d46da718eed Mon Sep 17 00:00:00 2001 From: frehburg Date: Fri, 11 Oct 2024 12:55:15 +0200 Subject: [PATCH 24/39] typo --- src/phenopacket_mapper/data_standards/data_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/phenopacket_mapper/data_standards/data_model.py b/src/phenopacket_mapper/data_standards/data_model.py index 3b93114..0b3e91d 100644 --- a/src/phenopacket_mapper/data_standards/data_model.py +++ b/src/phenopacket_mapper/data_standards/data_model.py @@ -101,7 +101,7 @@ class DataSection: """ name: str = field() id: str = field(default=None) - fields: Tuple[Union[DataField, 'DataSection']] = field(default_factory=tuple) + fields: Tuple[Union[DataField, 'DataSection'], ...] = field(default_factory=tuple) required: bool = field(default=False) cardinality: Cardinality = field(default_factory=Cardinality) From 5d34667f7df500fc7cd5e06b79c5b30f0726ff36 Mon Sep 17 00:00:00 2001 From: frehburg Date: Fri, 11 Oct 2024 12:58:03 +0200 Subject: [PATCH 25/39] added or group --- src/phenopacket_mapper/data_standards/__init__.py | 2 ++ src/phenopacket_mapper/data_standards/or_group.py | 9 +++++++++ 2 files changed, 11 insertions(+) create mode 100644 src/phenopacket_mapper/data_standards/or_group.py diff --git a/src/phenopacket_mapper/data_standards/__init__.py b/src/phenopacket_mapper/data_standards/__init__.py index 2f7e10f..1c0b980 100644 --- a/src/phenopacket_mapper/data_standards/__init__.py +++ b/src/phenopacket_mapper/data_standards/__init__.py @@ -1,5 +1,6 @@ """This submodule defines the data standards used in the project.""" +from .or_group import OrGroup from .cardinality import Cardinality from .date import Date from .code_system import CodeSystem, SNOMED_CT, HPO, MONDO, OMIM, ORDO, LOINC @@ -9,6 +10,7 @@ from .value_set import ValueSet __all__ = [ + "OrGroup", "Cardinality", "Coding", "CodeableConcept", "DataModel", "DataField", "DataModelInstance", "DataFieldValue", "DataSet", "DataSection", diff --git a/src/phenopacket_mapper/data_standards/or_group.py b/src/phenopacket_mapper/data_standards/or_group.py new file mode 100644 index 0000000..0b75cc1 --- /dev/null +++ b/src/phenopacket_mapper/data_standards/or_group.py @@ -0,0 +1,9 @@ +from dataclasses import dataclass, field +from typing import Union, Tuple + +from phenopacket_mapper.data_standards import DataField, DataSection + + +@dataclass(slots=True, frozen=True) +class OrGroup: + fields: Tuple[Union[DataField, DataSection], ...] \ No newline at end of file From 7db18685eb15f23009a1c558109eb1ceb1437879 Mon Sep 17 00:00:00 2001 From: frehburg Date: Fri, 11 Oct 2024 13:06:49 +0200 Subject: [PATCH 26/39] added or group --- .../data_standards/__init__.py | 4 ++-- .../data_standards/or_group.py | 17 ++++++++++++++--- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/src/phenopacket_mapper/data_standards/__init__.py b/src/phenopacket_mapper/data_standards/__init__.py index 1c0b980..e550fff 100644 --- a/src/phenopacket_mapper/data_standards/__init__.py +++ b/src/phenopacket_mapper/data_standards/__init__.py @@ -1,7 +1,7 @@ """This submodule defines the data standards used in the project.""" -from .or_group import OrGroup from .cardinality import Cardinality +from .or_group import OrGroup from .date import Date from .code_system import CodeSystem, SNOMED_CT, HPO, MONDO, OMIM, ORDO, LOINC from .code import Coding, CodeableConcept @@ -10,8 +10,8 @@ from .value_set import ValueSet __all__ = [ - "OrGroup", "Cardinality", + "OrGroup", "Coding", "CodeableConcept", "DataModel", "DataField", "DataModelInstance", "DataFieldValue", "DataSet", "DataSection", "data_models", diff --git a/src/phenopacket_mapper/data_standards/or_group.py b/src/phenopacket_mapper/data_standards/or_group.py index 0b75cc1..c29d259 100644 --- a/src/phenopacket_mapper/data_standards/or_group.py +++ b/src/phenopacket_mapper/data_standards/or_group.py @@ -1,9 +1,20 @@ from dataclasses import dataclass, field from typing import Union, Tuple -from phenopacket_mapper.data_standards import DataField, DataSection +from phenopacket_mapper._api import DataNode +from phenopacket_mapper.data_standards import Cardinality, data_node_classes @dataclass(slots=True, frozen=True) -class OrGroup: - fields: Tuple[Union[DataField, DataSection], ...] \ No newline at end of file +class OrGroup(DataNode): + fields: Tuple[data_node_classes, ...] + name: str = field(default='Or Group') + id: str = field(default=None) + description: str = field(default='') + required: bool = field(default=False) + cardinality: Cardinality = field(default_factory=Cardinality) + + def __post_init__(self): + if not self.id: + from phenopacket_mapper.utils import str_to_valid_id + object.__setattr__(self, 'id', str_to_valid_id(self.name)) \ No newline at end of file From 54029005feb053c825b3e40b05adb9fef64add36 Mon Sep 17 00:00:00 2001 From: frehburg Date: Fri, 11 Oct 2024 13:07:20 +0200 Subject: [PATCH 27/39] added data_node_classes as union of all types implementing datanodes --- src/phenopacket_mapper/data_standards/__init__.py | 6 +++++- src/phenopacket_mapper/data_standards/data_model.py | 6 +++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/phenopacket_mapper/data_standards/__init__.py b/src/phenopacket_mapper/data_standards/__init__.py index e550fff..57cc044 100644 --- a/src/phenopacket_mapper/data_standards/__init__.py +++ b/src/phenopacket_mapper/data_standards/__init__.py @@ -1,4 +1,5 @@ """This submodule defines the data standards used in the project.""" +from typing import Union from .cardinality import Cardinality from .or_group import OrGroup @@ -9,6 +10,8 @@ from . import data_models from .value_set import ValueSet +data_node_classes = Union[DataField, DataSection, OrGroup] + __all__ = [ "Cardinality", "OrGroup", @@ -18,5 +21,6 @@ "CodeSystem", "SNOMED_CT", "HPO", "MONDO", "OMIM", "ORDO", "LOINC", "Date", - "ValueSet" + "ValueSet", + "data_node_classes", ] diff --git a/src/phenopacket_mapper/data_standards/data_model.py b/src/phenopacket_mapper/data_standards/data_model.py index 0b3e91d..df5f4c9 100644 --- a/src/phenopacket_mapper/data_standards/data_model.py +++ b/src/phenopacket_mapper/data_standards/data_model.py @@ -17,7 +17,7 @@ import pandas as pd from phenopacket_mapper._api import DataNode -from phenopacket_mapper.data_standards import CodeSystem, Cardinality +from phenopacket_mapper.data_standards import CodeSystem, Cardinality, data_node_classes from phenopacket_mapper.data_standards.date import Date from phenopacket_mapper.data_standards.value_set import ValueSet from phenopacket_mapper.preprocessing import preprocess, preprocess_method @@ -101,7 +101,7 @@ class DataSection: """ name: str = field() id: str = field(default=None) - fields: Tuple[Union[DataField, 'DataSection'], ...] = field(default_factory=tuple) + fields: Tuple[data_node_classes, ...] = field(default_factory=tuple) required: bool = field(default=False) cardinality: Cardinality = field(default_factory=Cardinality) @@ -177,7 +177,7 @@ class DataModel: :ivar resources: List of `CodeSystem` objects """ data_model_name: str = field() - fields: Tuple[Union[DataField, DataSection], ...] = field() + fields: Tuple[data_node_classes, ...] = field() resources: List[CodeSystem] = field(default_factory=list) def __post_init__(self): From 04e568ccdd9acc317fcd8c646f5b140cb733b220 Mon Sep 17 00:00:00 2001 From: frehburg Date: Mon, 14 Oct 2024 22:29:55 +0200 Subject: [PATCH 28/39] removed unnecessary imports --- src/phenopacket_mapper/_api/__init__.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/phenopacket_mapper/_api/__init__.py b/src/phenopacket_mapper/_api/__init__.py index 3ff3106..5f3730b 100644 --- a/src/phenopacket_mapper/_api/__init__.py +++ b/src/phenopacket_mapper/_api/__init__.py @@ -2,9 +2,8 @@ This package is intended to expose the PhenopacketMapper API to the user. """ -from abc import ABCMeta, abstractmethod, abstractproperty -from typing import Tuple, Iterable, Iterator, Union, Literal -from dataclasses import dataclass +from abc import ABCMeta, abstractmethod +from typing import Tuple, Iterable, Iterator from phenopacket_mapper.data_standards import Cardinality From afc43f3032da9df4eda21f8a10713e34b838644f Mon Sep 17 00:00:00 2001 From: frehburg Date: Mon, 14 Oct 2024 22:37:12 +0200 Subject: [PATCH 29/39] removed data field classes, circ import --- src/phenopacket_mapper/data_standards/__init__.py | 5 ----- src/phenopacket_mapper/data_standards/data_model.py | 6 +++--- src/phenopacket_mapper/data_standards/or_group.py | 6 +++--- 3 files changed, 6 insertions(+), 11 deletions(-) diff --git a/src/phenopacket_mapper/data_standards/__init__.py b/src/phenopacket_mapper/data_standards/__init__.py index 57cc044..8713c1e 100644 --- a/src/phenopacket_mapper/data_standards/__init__.py +++ b/src/phenopacket_mapper/data_standards/__init__.py @@ -1,6 +1,4 @@ """This submodule defines the data standards used in the project.""" -from typing import Union - from .cardinality import Cardinality from .or_group import OrGroup from .date import Date @@ -10,8 +8,6 @@ from . import data_models from .value_set import ValueSet -data_node_classes = Union[DataField, DataSection, OrGroup] - __all__ = [ "Cardinality", "OrGroup", @@ -22,5 +18,4 @@ "SNOMED_CT", "HPO", "MONDO", "OMIM", "ORDO", "LOINC", "Date", "ValueSet", - "data_node_classes", ] diff --git a/src/phenopacket_mapper/data_standards/data_model.py b/src/phenopacket_mapper/data_standards/data_model.py index df5f4c9..21489ca 100644 --- a/src/phenopacket_mapper/data_standards/data_model.py +++ b/src/phenopacket_mapper/data_standards/data_model.py @@ -17,7 +17,7 @@ import pandas as pd from phenopacket_mapper._api import DataNode -from phenopacket_mapper.data_standards import CodeSystem, Cardinality, data_node_classes +from phenopacket_mapper.data_standards import CodeSystem, Cardinality, OrGroup from phenopacket_mapper.data_standards.date import Date from phenopacket_mapper.data_standards.value_set import ValueSet from phenopacket_mapper.preprocessing import preprocess, preprocess_method @@ -101,7 +101,7 @@ class DataSection: """ name: str = field() id: str = field(default=None) - fields: Tuple[data_node_classes, ...] = field(default_factory=tuple) + fields: Tuple[Union[DataField, 'DataSection', OrGroup], ...] = field(default_factory=tuple) required: bool = field(default=False) cardinality: Cardinality = field(default_factory=Cardinality) @@ -177,7 +177,7 @@ class DataModel: :ivar resources: List of `CodeSystem` objects """ data_model_name: str = field() - fields: Tuple[data_node_classes, ...] = field() + fields: Tuple[Union[DataField, DataSection, OrGroup], ...] = field() resources: List[CodeSystem] = field(default_factory=list) def __post_init__(self): diff --git a/src/phenopacket_mapper/data_standards/or_group.py b/src/phenopacket_mapper/data_standards/or_group.py index c29d259..5ad6184 100644 --- a/src/phenopacket_mapper/data_standards/or_group.py +++ b/src/phenopacket_mapper/data_standards/or_group.py @@ -2,12 +2,12 @@ from typing import Union, Tuple from phenopacket_mapper._api import DataNode -from phenopacket_mapper.data_standards import Cardinality, data_node_classes +from phenopacket_mapper.data_standards import Cardinality, DataField, DataSection @dataclass(slots=True, frozen=True) class OrGroup(DataNode): - fields: Tuple[data_node_classes, ...] + fields: Tuple[Union[DataField, DataSection, 'OrGroup'], ...] name: str = field(default='Or Group') id: str = field(default=None) description: str = field(default='') @@ -17,4 +17,4 @@ class OrGroup(DataNode): def __post_init__(self): if not self.id: from phenopacket_mapper.utils import str_to_valid_id - object.__setattr__(self, 'id', str_to_valid_id(self.name)) \ No newline at end of file + object.__setattr__(self, 'id', str_to_valid_id(self.name)) From cf9d66968eee7179872aad3a731723dec664a5f7 Mon Sep 17 00:00:00 2001 From: frehburg Date: Mon, 14 Oct 2024 22:37:23 +0200 Subject: [PATCH 30/39] instance not immutable --- src/phenopacket_mapper/data_standards/data_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/phenopacket_mapper/data_standards/data_model.py b/src/phenopacket_mapper/data_standards/data_model.py index 21489ca..631f59f 100644 --- a/src/phenopacket_mapper/data_standards/data_model.py +++ b/src/phenopacket_mapper/data_standards/data_model.py @@ -110,7 +110,7 @@ def __post_init__(self): from phenopacket_mapper.utils import str_to_valid_id object.__setattr__(self, 'id', str_to_valid_id(self.name)) -@dataclass(slots=True, frozen=True) +@dataclass(slots=True) class DataFieldValue: """This class defines the value of a `DataField` in a `DataModelInstance` From 83f75564013c3128044dace111abbbad1f3e35db Mon Sep 17 00:00:00 2001 From: frehburg Date: Mon, 14 Oct 2024 22:39:09 +0200 Subject: [PATCH 31/39] moved orgroup to datamodel file --- notebooks/hierarchical_data_model.ipynb | 50 +++++++++++-------- .../data_standards/__init__.py | 4 +- .../data_standards/data_model.py | 21 ++++++-- .../data_standards/or_group.py | 20 -------- 4 files changed, 48 insertions(+), 47 deletions(-) delete mode 100644 src/phenopacket_mapper/data_standards/or_group.py diff --git a/notebooks/hierarchical_data_model.ipynb b/notebooks/hierarchical_data_model.ipynb index a3d826c..a2831f3 100644 --- a/notebooks/hierarchical_data_model.ipynb +++ b/notebooks/hierarchical_data_model.ipynb @@ -14,7 +14,7 @@ "from dataclasses import fields\n", "\n", "from build.lib.phenopacket_mapper.data_standards import DataField\n", - "from phenopacket_mapper.data_standards import DataModel, ValueSet, DataSection" + "from phenopacket_mapper.data_standards import DataModel, ValueSet, DataSection, OrGroup" ], "outputs": [], "execution_count": 1 @@ -48,30 +48,38 @@ " DataSection(\n", " name=\"call\",\n", " fields=(\n", - " DataSection(\n", - " name=\"GeneDescriptor\",\n", + " OrGroup(\n", " fields=(\n", - " DataField(\n", - " name=\"value_id\",\n", - " specification=str,\n", - " required=True,\n", - " description=\"Official identifier of the gene. REQUIRED.\"\n", - " ),\n", - " \n", - " DataField(\n", - " name=\"symbol\",\n", - " specification=str,\n", - " required=True,\n", - " description=\"Official gene symbol. REQUIRED.\"\n", + " DataSection(\n", + " name=\"GeneDescriptor\",\n", + " fields=(\n", + " DataField(\n", + " name=\"value_id\",\n", + " specification=str,\n", + " required=True,\n", + " description=\"Official identifier of the gene. REQUIRED.\"\n", + " ),\n", + "\n", + " DataField(\n", + " name=\"symbol\",\n", + " specification=str,\n", + " required=True,\n", + " description=\"Official gene symbol. REQUIRED.\"\n", + " ),\n", + "\n", + " DataField(\n", + " name=\"description\",\n", + " specification=str,\n", + " required=False,\n", + " description=\"A free-text description of the gene\"\n", + " ),\n", + " ),\n", " ),\n", " \n", - " DataField(\n", - " name=\"description\",\n", - " specification=str,\n", - " required=False,\n", - " description=\"A free-text description of the gene\"\n", + " DataSection(\n", + " \n", " ),\n", - " ),\n", + " )\n", " ),\n", " ),\n", " ),\n", diff --git a/src/phenopacket_mapper/data_standards/__init__.py b/src/phenopacket_mapper/data_standards/__init__.py index 8713c1e..2c722cd 100644 --- a/src/phenopacket_mapper/data_standards/__init__.py +++ b/src/phenopacket_mapper/data_standards/__init__.py @@ -1,16 +1,14 @@ """This submodule defines the data standards used in the project.""" from .cardinality import Cardinality -from .or_group import OrGroup from .date import Date from .code_system import CodeSystem, SNOMED_CT, HPO, MONDO, OMIM, ORDO, LOINC from .code import Coding, CodeableConcept -from .data_model import DataModel, DataField, DataModelInstance, DataFieldValue, DataSet, DataSection +from .data_model import DataModel, DataField, DataModelInstance, DataFieldValue, DataSet, DataSection, OrGroup from . import data_models from .value_set import ValueSet __all__ = [ "Cardinality", - "OrGroup", "Coding", "CodeableConcept", "DataModel", "DataField", "DataModelInstance", "DataFieldValue", "DataSet", "DataSection", "data_models", diff --git a/src/phenopacket_mapper/data_standards/data_model.py b/src/phenopacket_mapper/data_standards/data_model.py index 631f59f..73154b6 100644 --- a/src/phenopacket_mapper/data_standards/data_model.py +++ b/src/phenopacket_mapper/data_standards/data_model.py @@ -17,7 +17,7 @@ import pandas as pd from phenopacket_mapper._api import DataNode -from phenopacket_mapper.data_standards import CodeSystem, Cardinality, OrGroup +from phenopacket_mapper.data_standards import CodeSystem, Cardinality from phenopacket_mapper.data_standards.date import Date from phenopacket_mapper.data_standards.value_set import ValueSet from phenopacket_mapper.preprocessing import preprocess, preprocess_method @@ -101,7 +101,7 @@ class DataSection: """ name: str = field() id: str = field(default=None) - fields: Tuple[Union[DataField, 'DataSection', OrGroup], ...] = field(default_factory=tuple) + fields: Tuple[Union[DataField, 'DataSection', 'OrGroup'], ...] = field(default_factory=tuple) required: bool = field(default=False) cardinality: Cardinality = field(default_factory=Cardinality) @@ -177,7 +177,7 @@ class DataModel: :ivar resources: List of `CodeSystem` objects """ data_model_name: str = field() - fields: Tuple[Union[DataField, DataSection, OrGroup], ...] = field() + fields: Tuple[Union[DataField, DataSection, 'OrGroup'], ...] = field() resources: List[CodeSystem] = field(default_factory=list) def __post_init__(self): @@ -499,6 +499,21 @@ def head(self, n: int = 5): warnings.warn("No data frame object available for this dataset") +@dataclass(slots=True, frozen=True) +class OrGroup(DataNode): + fields: Tuple[Union[DataField, DataSection, 'OrGroup'], ...] + name: str = field(default='Or Group') + id: str = field(default=None) + description: str = field(default='') + required: bool = field(default=False) + cardinality: Cardinality = field(default_factory=Cardinality) + + def __post_init__(self): + if not self.id: + from phenopacket_mapper.utils import str_to_valid_id + object.__setattr__(self, 'id', str_to_valid_id(self.name)) + + if __name__ == "__main__": df = DataField(name="Field 1", specification=int) print(df.specification == ValueSet([int])) \ No newline at end of file diff --git a/src/phenopacket_mapper/data_standards/or_group.py b/src/phenopacket_mapper/data_standards/or_group.py deleted file mode 100644 index 5ad6184..0000000 --- a/src/phenopacket_mapper/data_standards/or_group.py +++ /dev/null @@ -1,20 +0,0 @@ -from dataclasses import dataclass, field -from typing import Union, Tuple - -from phenopacket_mapper._api import DataNode -from phenopacket_mapper.data_standards import Cardinality, DataField, DataSection - - -@dataclass(slots=True, frozen=True) -class OrGroup(DataNode): - fields: Tuple[Union[DataField, DataSection, 'OrGroup'], ...] - name: str = field(default='Or Group') - id: str = field(default=None) - description: str = field(default='') - required: bool = field(default=False) - cardinality: Cardinality = field(default_factory=Cardinality) - - def __post_init__(self): - if not self.id: - from phenopacket_mapper.utils import str_to_valid_id - object.__setattr__(self, 'id', str_to_valid_id(self.name)) From f07e6bfdcbe114d1cfa697424ec76c7e471905b3 Mon Sep 17 00:00:00 2001 From: frehburg Date: Mon, 14 Oct 2024 22:55:59 +0200 Subject: [PATCH 32/39] fixed import --- notebooks/hierarchical_data_model.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/hierarchical_data_model.ipynb b/notebooks/hierarchical_data_model.ipynb index a2831f3..9256f63 100644 --- a/notebooks/hierarchical_data_model.ipynb +++ b/notebooks/hierarchical_data_model.ipynb @@ -13,7 +13,7 @@ "source": [ "from dataclasses import fields\n", "\n", - "from build.lib.phenopacket_mapper.data_standards import DataField\n", + "from phenopacket_mapper.data_standards import DataField\n", "from phenopacket_mapper.data_standards import DataModel, ValueSet, DataSection, OrGroup" ], "outputs": [], From c5495b40d1dc94984480c2dfa53177bc034850ea Mon Sep 17 00:00:00 2001 From: frehburg Date: Mon, 14 Oct 2024 22:56:05 +0200 Subject: [PATCH 33/39] typo --- src/phenopacket_mapper/data_standards/data_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/phenopacket_mapper/data_standards/data_model.py b/src/phenopacket_mapper/data_standards/data_model.py index 73154b6..6e86b6a 100644 --- a/src/phenopacket_mapper/data_standards/data_model.py +++ b/src/phenopacket_mapper/data_standards/data_model.py @@ -27,7 +27,7 @@ class DataField(DataNode): """This class defines fields used in the definition of a `DataModel` - A dataa field is the equivalent of a column in a table. It has a name, a value set, a description, a section, a + A data field is the equivalent of a column in a table. It has a name, a value set, a description, a section, a required flag, a specification, and an ordinal. The string for the `id` field is generated from the `name` field using the `str_to_valid_id` function from the From d7bbcdcd608c62c4ffc350bbf60134c44ee752fb Mon Sep 17 00:00:00 2001 From: frehburg Date: Mon, 14 Oct 2024 22:56:30 +0200 Subject: [PATCH 34/39] added orgroup to all --- src/phenopacket_mapper/data_standards/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/phenopacket_mapper/data_standards/__init__.py b/src/phenopacket_mapper/data_standards/__init__.py index 2c722cd..d12bb64 100644 --- a/src/phenopacket_mapper/data_standards/__init__.py +++ b/src/phenopacket_mapper/data_standards/__init__.py @@ -10,7 +10,7 @@ __all__ = [ "Cardinality", "Coding", "CodeableConcept", - "DataModel", "DataField", "DataModelInstance", "DataFieldValue", "DataSet", "DataSection", + "DataModel", "DataField", "DataModelInstance", "DataFieldValue", "DataSet", "DataSection", "OrGroup", "data_models", "CodeSystem", "SNOMED_CT", "HPO", "MONDO", "OMIM", "ORDO", "LOINC", From 5c78d88695fab07c7db205b0eaf2a8a4fb09c2cf Mon Sep 17 00:00:00 2001 From: frehburg Date: Mon, 14 Oct 2024 23:00:41 +0200 Subject: [PATCH 35/39] sucessfully used orgroup --- notebooks/hierarchical_data_model.ipynb | 107 +++++++++++++++--------- 1 file changed, 69 insertions(+), 38 deletions(-) diff --git a/notebooks/hierarchical_data_model.ipynb b/notebooks/hierarchical_data_model.ipynb index 9256f63..79e97f4 100644 --- a/notebooks/hierarchical_data_model.ipynb +++ b/notebooks/hierarchical_data_model.ipynb @@ -6,24 +6,25 @@ "metadata": { "collapsed": true, "ExecuteTime": { - "end_time": "2024-10-11T08:27:10.741111Z", - "start_time": "2024-10-11T08:27:09.593853Z" + "end_time": "2024-10-14T20:57:38.214684Z", + "start_time": "2024-10-14T20:57:38.210273Z" } }, "source": [ - "from dataclasses import fields\n", - "\n", "from phenopacket_mapper.data_standards import DataField\n", "from phenopacket_mapper.data_standards import DataModel, ValueSet, DataSection, OrGroup" ], "outputs": [], - "execution_count": 1 + "execution_count": 6 }, { - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-10-14T20:57:38.231064Z", + "start_time": "2024-10-14T20:57:38.224448Z" + } + }, "cell_type": "code", - "outputs": [], - "execution_count": null, "source": [ "genomic_interpretation = DataModel(\n", " data_model_name=\"Phenopacket schema Genomic Interpretation\",\n", @@ -45,48 +46,78 @@ " description=\"status of the interpretation. REQUIRED.\",\n", " ),\n", " \n", - " DataSection(\n", + " OrGroup(\n", " name=\"call\",\n", " fields=(\n", - " OrGroup(\n", + " DataSection(\n", + " name=\"GeneDescriptor\",\n", " fields=(\n", - " DataSection(\n", - " name=\"GeneDescriptor\",\n", - " fields=(\n", - " DataField(\n", - " name=\"value_id\",\n", - " specification=str,\n", - " required=True,\n", - " description=\"Official identifier of the gene. REQUIRED.\"\n", - " ),\n", - "\n", - " DataField(\n", - " name=\"symbol\",\n", - " specification=str,\n", - " required=True,\n", - " description=\"Official gene symbol. REQUIRED.\"\n", - " ),\n", + " DataField(\n", + " name=\"value_id\",\n", + " specification=str,\n", + " required=True,\n", + " description=\"Official identifier of the gene. REQUIRED.\"\n", + " ),\n", "\n", - " DataField(\n", - " name=\"description\",\n", - " specification=str,\n", - " required=False,\n", - " description=\"A free-text description of the gene\"\n", - " ),\n", - " ),\n", + " DataField(\n", + " name=\"symbol\",\n", + " specification=str,\n", + " required=True,\n", + " description=\"Official gene symbol. REQUIRED.\"\n", " ),\n", - " \n", - " DataSection(\n", - " \n", + "\n", + " DataField(\n", + " name=\"description\",\n", + " specification=str,\n", + " required=False,\n", + " description=\"A free-text description of the gene\"\n", " ),\n", - " )\n", + " ),\n", " ),\n", " ),\n", " ),\n", " )\n", ")" ], - "id": "2e979683ae450d9b" + "id": "2e979683ae450d9b", + "outputs": [], + "execution_count": 7 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-10-14T20:57:38.295528Z", + "start_time": "2024-10-14T20:57:38.291141Z" + } + }, + "cell_type": "code", + "source": "print(genomic_interpretation)", + "id": "35a697d8b9b8236d", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DataModel(name=Phenopacket schema Genomic Interpretation\n", + "\tDataField(\n", + "\t\tid: subject_or_biosample_id,\n", + "\t\tname: subject_or_biosample_id,\n", + "\t\tvalue_set: ValueSet(elements=[], name='', description=''), required: True,\n", + "\t\tspecification: ValueSet(elements=[], name='', description='')\n", + "\t)\n", + "\tDataField(\n", + "\t\tid: interpretation_status,\n", + "\t\tname: interpretation_status,\n", + "\t\tvalue_set: ValueSet(elements=['UNKNOWN_STATUS', 'REJECTED', 'CANDIDATE', 'CONTRIBUTORY', 'CAUSATIVE'], name='Interpretation Status Value Set', description=''), required: True,\n", + "\t\tspecification: ValueSet(elements=['UNKNOWN_STATUS', 'REJECTED', 'CANDIDATE', 'CONTRIBUTORY', 'CAUSATIVE'], name='Interpretation Status Value Set', description='')\n", + "\t)\n", + "\tDataSection(name='call', id='call', fields=(OrGroup(fields=(DataSection(name='GeneDescriptor', id='genedescriptor', fields=(DataField(name='value_id', specification=ValueSet(elements=[], name='', description=''), id='value_id', required=True, description='Official identifier of the gene. REQUIRED.', cardinality=Cardinality(min=0, max='n')), DataField(name='symbol', specification=ValueSet(elements=[], name='', description=''), id='symbol', required=True, description='Official gene symbol. REQUIRED.', cardinality=Cardinality(min=0, max='n')), DataField(name='description', specification=ValueSet(elements=[], name='', description=''), id='description', required=False, description='A free-text description of the gene', cardinality=Cardinality(min=0, max='n'))), required=False, cardinality=Cardinality(min=0, max='n')),), name='Or Group', id='or_group', description='', required=False, cardinality=Cardinality(min=0, max='n')),), required=False, cardinality=Cardinality(min=0, max='n'))\n", + "---\n", + ")\n" + ] + } + ], + "execution_count": 8 } ], "metadata": { From 1d208dea198cf75fc1744dd5df0d9579dcd0e9fd Mon Sep 17 00:00:00 2001 From: frehburg Date: Mon, 14 Oct 2024 23:20:35 +0200 Subject: [PATCH 36/39] added str to cardinality --- src/phenopacket_mapper/data_standards/cardinality.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/phenopacket_mapper/data_standards/cardinality.py b/src/phenopacket_mapper/data_standards/cardinality.py index b8c67fa..4ecdda1 100644 --- a/src/phenopacket_mapper/data_standards/cardinality.py +++ b/src/phenopacket_mapper/data_standards/cardinality.py @@ -16,4 +16,7 @@ def __post_init__(self): raise ValueError(f"Parameter max must be of type or equal to the literal 'n'. " f"(Not: {self.min} ({type(self.min)}))") elif self.max != 'n' and self.max < 1: # has to be an integer - raise ValueError(f"Parameter max must be a positive integer. (Not: {self.min})") \ No newline at end of file + raise ValueError(f"Parameter max must be a positive integer. (Not: {self.min})") + + def __str__(self): + return f"{self.min}..{self.max}" \ No newline at end of file From 5cd1c96fddbb87bef024b4661d6f4c8eb090bddf Mon Sep 17 00:00:00 2001 From: frehburg Date: Mon, 14 Oct 2024 23:21:12 +0200 Subject: [PATCH 37/39] changed str methods --- .../data_standards/data_model.py | 31 +++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/src/phenopacket_mapper/data_standards/data_model.py b/src/phenopacket_mapper/data_standards/data_model.py index 6e86b6a..242f6aa 100644 --- a/src/phenopacket_mapper/data_standards/data_model.py +++ b/src/phenopacket_mapper/data_standards/data_model.py @@ -76,8 +76,9 @@ def __str__(self): ret = "DataField(\n" ret += f"\t\tid: {self.id},\n" ret += f"\t\tname: {self.name},\n" - ret += f"\t\tvalue_set: {self.specification}, required: {self.required},\n" + ret += f"\t\trequired: {self.required}\n" ret += f"\t\tspecification: {self.specification}\n" + ret += f"\t\tcardinality: {str(self.cardinality)}\n" ret += "\t)" return ret @@ -110,6 +111,18 @@ def __post_init__(self): from phenopacket_mapper.utils import str_to_valid_id object.__setattr__(self, 'id', str_to_valid_id(self.name)) + + def __str__(self): + ret = "DataSection(\n" + ret += f"\t\tid: {self.id},\n" + ret += f"\t\tname: {self.name},\n" + ret += f"\t\trequired: {self.required}\n" + ret += f"\t\tcardinality: {str(self.cardinality)}\n" + for _field in self.fields: + ret += f"\t{str(_field)}\n" + ret += "\t)" + return ret + @dataclass(slots=True) class DataFieldValue: """This class defines the value of a `DataField` in a `DataModelInstance` @@ -191,7 +204,8 @@ def __getattr__(self, var_name: str) -> DataField: raise AttributeError(f"'DataModel' object has no attribute '{var_name}'") def __str__(self): - ret = f"DataModel(name={self.data_model_name}\n" + ret = f"DataModel(\n" + ret += f"\tname: {self.data_model_name}\n" for _field in self.fields: ret += f"\t{str(_field)}\n" ret += "---\n" @@ -514,6 +528,19 @@ def __post_init__(self): object.__setattr__(self, 'id', str_to_valid_id(self.name)) + + def __str__(self): + ret = "OrGroup(\n" + ret += f"\t\tid: {self.id},\n" + ret += f"\t\tname: {self.name},\n" + ret += f"\t\trequired: {self.required}\n" + ret += f"\t\tcardinality: {self.cardinality}\n" + for _field in self.fields: + ret += f"\t{str(_field)}\n" + ret += "\t)" + return ret + + if __name__ == "__main__": df = DataField(name="Field 1", specification=int) print(df.specification == ValueSet([int])) \ No newline at end of file From 53118f358d58b8400e441e01894660bbdeea817d Mon Sep 17 00:00:00 2001 From: frehburg Date: Mon, 14 Oct 2024 23:21:27 +0200 Subject: [PATCH 38/39] auto set cardinality min 1 if required --- src/phenopacket_mapper/data_standards/data_model.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/phenopacket_mapper/data_standards/data_model.py b/src/phenopacket_mapper/data_standards/data_model.py index 242f6aa..6f7cddd 100644 --- a/src/phenopacket_mapper/data_standards/data_model.py +++ b/src/phenopacket_mapper/data_standards/data_model.py @@ -66,6 +66,9 @@ def __post_init__(self): from phenopacket_mapper.utils import str_to_valid_id object.__setattr__(self, 'id', str_to_valid_id(self.name)) + if self.required: + object.__setattr__(self, 'cardinality', Cardinality(min=1, max=self.cardinality.max)) + if isinstance(self.specification, type): object.__setattr__(self, 'specification', ValueSet(elements=[self.specification])) if isinstance(self.specification, list): @@ -111,6 +114,8 @@ def __post_init__(self): from phenopacket_mapper.utils import str_to_valid_id object.__setattr__(self, 'id', str_to_valid_id(self.name)) + if self.required: + object.__setattr__(self, 'cardinality', Cardinality(min=1, max=self.cardinality.max)) def __str__(self): ret = "DataSection(\n" @@ -527,6 +532,8 @@ def __post_init__(self): from phenopacket_mapper.utils import str_to_valid_id object.__setattr__(self, 'id', str_to_valid_id(self.name)) + if self.required: + object.__setattr__(self, 'cardinality', Cardinality(min=1, max=self.cardinality.max)) def __str__(self): From 7f65e2f359c8f5716aaf961b24d0463e233877fc Mon Sep 17 00:00:00 2001 From: frehburg Date: Mon, 14 Oct 2024 23:24:23 +0200 Subject: [PATCH 39/39] removed doctests --- notebooks/hierarchical_data_model.ipynb | 80 +++++++++++++++---- .../data_standards/data_model.py | 8 -- 2 files changed, 66 insertions(+), 22 deletions(-) diff --git a/notebooks/hierarchical_data_model.ipynb b/notebooks/hierarchical_data_model.ipynb index 79e97f4..cc3151e 100644 --- a/notebooks/hierarchical_data_model.ipynb +++ b/notebooks/hierarchical_data_model.ipynb @@ -6,8 +6,8 @@ "metadata": { "collapsed": true, "ExecuteTime": { - "end_time": "2024-10-14T20:57:38.214684Z", - "start_time": "2024-10-14T20:57:38.210273Z" + "end_time": "2024-10-14T21:21:45.314262Z", + "start_time": "2024-10-14T21:21:45.309299Z" } }, "source": [ @@ -15,13 +15,13 @@ "from phenopacket_mapper.data_standards import DataModel, ValueSet, DataSection, OrGroup" ], "outputs": [], - "execution_count": 6 + "execution_count": 4 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-10-14T20:57:38.231064Z", - "start_time": "2024-10-14T20:57:38.224448Z" + "end_time": "2024-10-14T21:21:45.340239Z", + "start_time": "2024-10-14T21:21:45.334880Z" } }, "cell_type": "code", @@ -81,43 +81,95 @@ ], "id": "2e979683ae450d9b", "outputs": [], - "execution_count": 7 + "execution_count": 5 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-10-14T20:57:38.295528Z", - "start_time": "2024-10-14T20:57:38.291141Z" + "end_time": "2024-10-14T21:21:45.352293Z", + "start_time": "2024-10-14T21:21:45.347715Z" } }, "cell_type": "code", - "source": "print(genomic_interpretation)", + "source": [ + "s = str(genomic_interpretation)\n", + "\n", + "print(s)" + ], "id": "35a697d8b9b8236d", "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "DataModel(name=Phenopacket schema Genomic Interpretation\n", + "DataModel(\n", + "\tname: Phenopacket schema Genomic Interpretation\n", "\tDataField(\n", "\t\tid: subject_or_biosample_id,\n", "\t\tname: subject_or_biosample_id,\n", - "\t\tvalue_set: ValueSet(elements=[], name='', description=''), required: True,\n", + "\t\trequired: True\n", "\t\tspecification: ValueSet(elements=[], name='', description='')\n", + "\t\tcardinality: 1..n\n", "\t)\n", "\tDataField(\n", "\t\tid: interpretation_status,\n", "\t\tname: interpretation_status,\n", - "\t\tvalue_set: ValueSet(elements=['UNKNOWN_STATUS', 'REJECTED', 'CANDIDATE', 'CONTRIBUTORY', 'CAUSATIVE'], name='Interpretation Status Value Set', description=''), required: True,\n", + "\t\trequired: True\n", "\t\tspecification: ValueSet(elements=['UNKNOWN_STATUS', 'REJECTED', 'CANDIDATE', 'CONTRIBUTORY', 'CAUSATIVE'], name='Interpretation Status Value Set', description='')\n", + "\t\tcardinality: 1..n\n", + "\t)\n", + "\tOrGroup(\n", + "\t\tid: call,\n", + "\t\tname: call,\n", + "\t\trequired: False\n", + "\t\tcardinality: 0..n\n", + "\tDataSection(\n", + "\t\tid: genedescriptor,\n", + "\t\tname: GeneDescriptor,\n", + "\t\trequired: False\n", + "\t\tcardinality: 0..n\n", + "\tDataField(\n", + "\t\tid: value_id,\n", + "\t\tname: value_id,\n", + "\t\trequired: True\n", + "\t\tspecification: ValueSet(elements=[], name='', description='')\n", + "\t\tcardinality: 1..n\n", + "\t)\n", + "\tDataField(\n", + "\t\tid: symbol,\n", + "\t\tname: symbol,\n", + "\t\trequired: True\n", + "\t\tspecification: ValueSet(elements=[], name='', description='')\n", + "\t\tcardinality: 1..n\n", + "\t)\n", + "\tDataField(\n", + "\t\tid: description,\n", + "\t\tname: description,\n", + "\t\trequired: False\n", + "\t\tspecification: ValueSet(elements=[], name='', description='')\n", + "\t\tcardinality: 0..n\n", + "\t)\n", + "\t)\n", "\t)\n", - "\tDataSection(name='call', id='call', fields=(OrGroup(fields=(DataSection(name='GeneDescriptor', id='genedescriptor', fields=(DataField(name='value_id', specification=ValueSet(elements=[], name='', description=''), id='value_id', required=True, description='Official identifier of the gene. REQUIRED.', cardinality=Cardinality(min=0, max='n')), DataField(name='symbol', specification=ValueSet(elements=[], name='', description=''), id='symbol', required=True, description='Official gene symbol. REQUIRED.', cardinality=Cardinality(min=0, max='n')), DataField(name='description', specification=ValueSet(elements=[], name='', description=''), id='description', required=False, description='A free-text description of the gene', cardinality=Cardinality(min=0, max='n'))), required=False, cardinality=Cardinality(min=0, max='n')),), name='Or Group', id='or_group', description='', required=False, cardinality=Cardinality(min=0, max='n')),), required=False, cardinality=Cardinality(min=0, max='n'))\n", "---\n", ")\n" ] } ], - "execution_count": 8 + "execution_count": 6 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-10-14T21:21:45.378046Z", + "start_time": "2024-10-14T21:21:45.375530Z" + } + }, + "cell_type": "code", + "source": "", + "id": "4c78eb05ea58ff6c", + "outputs": [], + "execution_count": null } ], "metadata": { diff --git a/src/phenopacket_mapper/data_standards/data_model.py b/src/phenopacket_mapper/data_standards/data_model.py index 6f7cddd..8b12e91 100644 --- a/src/phenopacket_mapper/data_standards/data_model.py +++ b/src/phenopacket_mapper/data_standards/data_model.py @@ -44,10 +44,6 @@ class DataField(DataNode): If the `value_set` is a single type, it can be passed directly as the `value_set` parameter. - e.g.: - >>> DataField(name="Field 1", specification=int) - DataField(name='Field 1', specification=ValueSet(elements=[], name='', description=''), id='field_1', description='', section='', required=True, ordinal='') - :ivar name: Name of the field :ivar specification: Value set of the field, if the value set is only one type, can also pass that type directly :ivar id: The identifier of the field, adhering to the naming rules stated above @@ -186,10 +182,6 @@ class DataModel: be accessed using the `id` as an attribute of the `DataModel` object. E.g.: `data_model.date_of_birth`. This is useful in the data reading and mapping processes. - >>> data_model = DataModel("Test data model", (DataField(name="Field 1", specification=ValueSet()),)) - >>> data_model.field_1 - DataField(name='Field 1', specification=ValueSet(elements=[], name='', description=''), id='field_1', description='', section='', required=True, ordinal='') - :ivar data_model_name: Name of the data model :ivar fields: List of `DataField` objects :ivar resources: List of `CodeSystem` objects