Merge pull request #178 from BIH-CEI/177-debug-data-reader

177 debug data reader
BIH-CEI · Oct 16, 2024 · b14ef3e · b14ef3e
2 parents 73c8c5a + ebeae86
commit b14ef3e
Show file tree

Hide file tree

Showing 4 changed files with 231 additions and 18 deletions.
diff --git a/notebooks/hierarchical_data_model.ipynb b/notebooks/hierarchical_data_model.ipynb
@@ -6,22 +6,23 @@
    "metadata": {
     "collapsed": true,
     "ExecuteTime": {
-     "end_time": "2024-10-14T21:21:45.314262Z",
-     "start_time": "2024-10-14T21:21:45.309299Z"
+     "end_time": "2024-10-15T16:49:45.994633Z",
+     "start_time": "2024-10-15T16:49:45.291536Z"
     }
    },
    "source": [
     "from phenopacket_mapper.data_standards import DataField\n",
-    "from phenopacket_mapper.data_standards import DataModel, ValueSet, DataSection, OrGroup"
+    "from phenopacket_mapper.data_standards import DataModel, ValueSet, DataSection, OrGroup\n",
+    "from phenopacket_mapper.utils.io import DataReader"
    ],
    "outputs": [],
-   "execution_count": 4
+   "execution_count": 1
   },
   {
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2024-10-14T21:21:45.340239Z",
-     "start_time": "2024-10-14T21:21:45.334880Z"
+     "end_time": "2024-10-15T16:49:46.001925Z",
+     "start_time": "2024-10-15T16:49:45.997639Z"
     }
    },
    "cell_type": "code",
@@ -81,13 +82,13 @@
    ],
    "id": "2e979683ae450d9b",
    "outputs": [],
-   "execution_count": 5
+   "execution_count": 2
   },
   {
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2024-10-14T21:21:45.352293Z",
-     "start_time": "2024-10-14T21:21:45.347715Z"
+     "end_time": "2024-10-15T16:49:46.057078Z",
+     "start_time": "2024-10-15T16:49:46.053873Z"
     }
    },
    "cell_type": "code",
@@ -156,20 +157,75 @@
      ]
     }
    ],
-   "execution_count": 6
+   "execution_count": 3
   },
   {
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2024-10-14T21:21:45.378046Z",
-     "start_time": "2024-10-14T21:21:45.375530Z"
+     "end_time": "2024-10-15T16:49:46.123667Z",
+     "start_time": "2024-10-15T16:49:46.120256Z"
     }
    },
    "cell_type": "code",
-   "source": "",
+   "source": [
+    "from io import StringIO\n",
+    "\n",
+    "xml_data = \\\n",
+    "    (\n",
+    "        '<?xml version=\"1.0\" encoding=\"UTF-8\" ?> <ODM xmlns=\"http://www.cdisc.org/ns/odm/v1.3\" xmlns:ds=\"http://www.w3.org/2000/09/xmldsig#\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xmlns:redcap=\"https://projectredcap.org\" xsi:schemaLocation=\"http://www.cdisc.org/ns/odm/v1.3 schema/odm/ODM1-3-1.xsd\" ODMVersion=\"1.3.1\" FileOID=\"000-00-0000\" FileType=\"Snapshot\" Description=\"genAdipositas - ALT Demo\" AsOfDateTime=\"2024-10-14T11:57:18\" CreationDateTime=\"2024-10-14T11:57:18\" SourceSystem=\"REDCap\" SourceSystemVersion=\"14.6.9\"> '\n",
+    "        '<ClinicalData StudyOID=\"Project.GenAdipositasALTDemo\" MetaDataVersionOID=\"Metadata.GenAdipositasALTDemo_2024-10-14_1157\">'\n",
+    "        '<SubjectData SubjectKey=\"101\" redcap:RecordIdField=\"record_id\">\t'\n",
+    "        '</SubjectData>'\n",
+    "        '</ClinicalData>'\n",
+    "        '</ODM>'\n",
+    "    )\n",
+    "\n",
+    "buffer = StringIO(xml_data)"
+   ],
    "id": "4c78eb05ea58ff6c",
    "outputs": [],
-   "execution_count": null
+   "execution_count": 4
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-10-15T16:49:46.146860Z",
+     "start_time": "2024-10-15T16:49:46.143098Z"
+    }
+   },
+   "cell_type": "code",
+   "source": "dr = DataReader(buffer, file_extension=\"xml\")",
+   "id": "a9f83d6e46715301",
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "dict_={'ODM': {'@xmlns': 'http://www.cdisc.org/ns/odm/v1.3', '@xmlns:ds': 'http://www.w3.org/2000/09/xmldsig#', '@xmlns:xsi': 'http://www.w3.org/2001/XMLSchema-instance', '@xmlns:redcap': 'https://projectredcap.org', '@xsi:schemaLocation': 'http://www.cdisc.org/ns/odm/v1.3 schema/odm/ODM1-3-1.xsd', '@ODMVersion': '1.3.1', '@FileOID': '000-00-0000', '@FileType': 'Snapshot', '@Description': 'genAdipositas - ALT Demo', '@AsOfDateTime': '2024-10-14T11:57:18', '@CreationDateTime': '2024-10-14T11:57:18', '@SourceSystem': 'REDCap', '@SourceSystemVersion': '14.6.9', 'ClinicalData': {'@StudyOID': 'Project.GenAdipositasALTDemo', '@MetaDataVersionOID': 'Metadata.GenAdipositasALTDemo_2024-10-14_1157', 'SubjectData': {'@SubjectKey': '101', '@redcap:RecordIdField': 'record_id'}}}}, type(dict_)=<class 'dict'>\n",
+      "k='ODM', type(k)=<class 'str'>, v={'@xmlns': 'http://www.cdisc.org/ns/odm/v1.3', '@xmlns:ds': 'http://www.w3.org/2000/09/xmldsig#', '@xmlns:xsi': 'http://www.w3.org/2001/XMLSchema-instance', '@xmlns:redcap': 'https://projectredcap.org', '@xsi:schemaLocation': 'http://www.cdisc.org/ns/odm/v1.3 schema/odm/ODM1-3-1.xsd', '@ODMVersion': '1.3.1', '@FileOID': '000-00-0000', '@FileType': 'Snapshot', '@Description': 'genAdipositas - ALT Demo', '@AsOfDateTime': '2024-10-14T11:57:18', '@CreationDateTime': '2024-10-14T11:57:18', '@SourceSystem': 'REDCap', '@SourceSystemVersion': '14.6.9', 'ClinicalData': {'@StudyOID': 'Project.GenAdipositasALTDemo', '@MetaDataVersionOID': 'Metadata.GenAdipositasALTDemo_2024-10-14_1157', 'SubjectData': {'@SubjectKey': '101', '@redcap:RecordIdField': 'record_id'}}}, type(v)=<class 'dict'>\n",
+      "k='@xmlns', type(k)=<class 'str'>, v='http://www.cdisc.org/ns/odm/v1.3', type(v)=<class 'str'>\n",
+      "k='@xmlns:ds', type(k)=<class 'str'>, v='http://www.w3.org/2000/09/xmldsig#', type(v)=<class 'str'>\n",
+      "k='@xmlns:xsi', type(k)=<class 'str'>, v='http://www.w3.org/2001/XMLSchema-instance', type(v)=<class 'str'>\n",
+      "k='@xmlns:redcap', type(k)=<class 'str'>, v='https://projectredcap.org', type(v)=<class 'str'>\n",
+      "k='@xsi:schemaLocation', type(k)=<class 'str'>, v='http://www.cdisc.org/ns/odm/v1.3 schema/odm/ODM1-3-1.xsd', type(v)=<class 'str'>\n",
+      "k='@ODMVersion', type(k)=<class 'str'>, v='1.3.1', type(v)=<class 'str'>\n",
+      "k='@FileOID', type(k)=<class 'str'>, v='000-00-0000', type(v)=<class 'str'>\n",
+      "k='@FileType', type(k)=<class 'str'>, v='Snapshot', type(v)=<class 'str'>\n",
+      "k='@Description', type(k)=<class 'str'>, v='genAdipositas - ALT Demo', type(v)=<class 'str'>\n",
+      "k='@AsOfDateTime', type(k)=<class 'str'>, v='2024-10-14T11:57:18', type(v)=<class 'str'>\n",
+      "k='@CreationDateTime', type(k)=<class 'str'>, v='2024-10-14T11:57:18', type(v)=<class 'str'>\n",
+      "k='@SourceSystem', type(k)=<class 'str'>, v='REDCap', type(v)=<class 'str'>\n",
+      "k='@SourceSystemVersion', type(k)=<class 'str'>, v='14.6.9', type(v)=<class 'str'>\n",
+      "k='ClinicalData', type(k)=<class 'str'>, v={'@StudyOID': 'Project.GenAdipositasALTDemo', '@MetaDataVersionOID': 'Metadata.GenAdipositasALTDemo_2024-10-14_1157', 'SubjectData': {'@SubjectKey': '101', '@redcap:RecordIdField': 'record_id'}}, type(v)=<class 'dict'>\n",
+      "k='@StudyOID', type(k)=<class 'str'>, v='Project.GenAdipositasALTDemo', type(v)=<class 'str'>\n",
+      "k='@MetaDataVersionOID', type(k)=<class 'str'>, v='Metadata.GenAdipositasALTDemo_2024-10-14_1157', type(v)=<class 'str'>\n",
+      "k='SubjectData', type(k)=<class 'str'>, v={'@SubjectKey': '101', '@redcap:RecordIdField': 'record_id'}, type(v)=<class 'dict'>\n",
+      "k='@SubjectKey', type(k)=<class 'str'>, v='101', type(v)=<class 'str'>\n",
+      "k='@redcap:RecordIdField', type(k)=<class 'str'>, v='record_id', type(v)=<class 'str'>\n"
+     ]
+    }
+   ],
+   "execution_count": 5
   }
  ],
  "metadata": {

diff --git a/src/phenopacket_mapper/utils/io/data_reader.py b/src/phenopacket_mapper/utils/io/data_reader.py
@@ -1,6 +1,6 @@
 from pathlib import Path
 from typing import Union, Tuple, List, Iterable, Literal, Dict
-from io import IOBase, TextIOWrapper, BytesIO, BufferedIOBase, TextIOBase
+from io import IOBase, TextIOWrapper, BytesIO, BufferedIOBase, TextIOBase, StringIO
 
 import pandas as pd
 
@@ -50,15 +50,17 @@ def __init__(
                 self.is_dir = True
 
         elif isinstance(file, IOBase):
-            if isinstance(file, (TextIOWrapper, TextIOBase)):
-                pass
+            if isinstance(file, (TextIOWrapper, TextIOBase, StringIO)):
+                self.file = file
             elif isinstance(file, (BytesIO, BufferedIOBase)):
                 self.file = TextIOWrapper(file, encoding=encoding)
 
             if file_extension is None:
                 raise ValueError("File extension must be provided when passing a file buffer.")
             else:
                 self.handle_file_extension(file_extension)
+        else:
+            raise ValueError(f"Invalid input type {type(file)}.")
 
         self.data, self.iterable = self._read()
 

diff --git a/src/phenopacket_mapper/utils/io/read_xml.py b/src/phenopacket_mapper/utils/io/read_xml.py
@@ -74,7 +74,6 @@ def remove_at_symbols(dict_: Dict) -> Dict:
 def parse_xml(file: IOBase) -> Dict:
     """Parse an XML file into a dictionary with inferred types."""
     dict_ = xmltodict.parse(file.read())
-    print(f"{dict_=}, {type(dict_)=}")
     dict_ = _post_process_xml_dict(dict_)
     dict_ = remove_at_symbols(dict_)
     return dict_

diff --git a/tests/utils/io/test_data_reader.py b/tests/utils/io/test_data_reader.py
@@ -0,0 +1,156 @@
+from io import StringIO
+
+import pandas as pd
+import pytest
+from phenopacket_mapper.utils.io import DataReader
+
+
+@pytest.mark.parametrize(
+    "inp,expected",
+    [
+        ('<?xml version="1.0" encoding="UTF-8" ?> <ODM xmlns="http://www.cdisc.org/ns/odm/v1.3" xmlns:ds="http://www.w3.org/2000/09/xmldsig#" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:redcap="https://projectredcap.org" xsi:schemaLocation="http://www.cdisc.org/ns/odm/v1.3 schema/odm/ODM1-3-1.xsd" ODMVersion="1.3.1" FileOID="000-00-0000" FileType="Snapshot" Description="genAdipositas - ALT Demo" AsOfDateTime="2024-10-14T11:57:18" CreationDateTime="2024-10-14T11:57:18" SourceSystem="REDCap" SourceSystemVersion="14.6.9"> '
+         '<ClinicalData StudyOID="Project.GenAdipositasALTDemo" MetaDataVersionOID="Metadata.GenAdipositasALTDemo_2024-10-14_1157">'
+         '<SubjectData SubjectKey="101" redcap:RecordIdField="record_id">	'
+         '</SubjectData>'
+         '</ClinicalData>'
+         '</ODM>',
+         {'ODM': {'AsOfDateTime': '2024-10-14T11:57:18',
+                  'ClinicalData': {'MetaDataVersionOID': 'Metadata.GenAdipositasALTDemo_2024-10-14_1157',
+                                   'StudyOID': 'Project.GenAdipositasALTDemo',
+                                   'SubjectData': {'SubjectKey': 101,
+                                                   'redcap:RecordIdField': 'record_id'}},
+                  'CreationDateTime': '2024-10-14T11:57:18',
+                  'Description': 'genAdipositas - ALT Demo',
+                  'FileOID': '000-00-0000',
+                  'FileType': 'Snapshot',
+                  'ODMVersion': '1.3.1',
+                  'SourceSystem': 'REDCap',
+                  'SourceSystemVersion': '14.6.9',
+                  'xmlns': 'http://www.cdisc.org/ns/odm/v1.3',
+                  'xmlns:ds': 'http://www.w3.org/2000/09/xmldsig#',
+                  'xmlns:redcap': 'https://projectredcap.org',
+                  'xmlns:xsi': 'http://www.w3.org/2001/XMLSchema-instance',
+                  'xsi:schemaLocation': 'http://www.cdisc.org/ns/odm/v1.3 '
+                                        'schema/odm/ODM1-3-1.xsd'}}
+         ),
+        ('<string>Hello World</string>', {"string": "Hello World"}),
+        ('<object><a>b</a><c>d</c></object>', {"object": {"a": "b", "c": "d"}}),
+        ('<number>123</number>', {"number": 123}),
+        ('<number>-123</number>', {"number": -123}),
+        ('<number>123.4</number>', {"number": 123.4}),
+        ('<null></null>', {"null": None}),  # empty tag
+        ('<null />', {"null": None}),  # empty tag
+        ('<null xsi:nil="true"/>', {"null": None}),  # explicit null
+        ('<color>gold</color>', {"color": "gold"}),
+        ('<boolean>true</boolean>', {"boolean": True}),
+        ('<boolean>false</boolean>', {"boolean": False}),
+        ('<array><item>1</item><item>2</item><item>3</item></array>', {"array": {"item": [1, 2, 3]}}),
+        ('<root>'
+         '<array>'
+         '<item>1</item>'
+         '<item>2</item>'
+         '<item>3</item>'
+         '</array>'
+         '<boolean>true</boolean>'
+         '<color>gold</color>'
+         '<number>123</number>'
+         '<object>'
+         '<a>b</a>'
+         '<c>d</c>'
+         '</object>'
+         '<string>Hello World</string>'
+         '</root>',
+         {
+             "root":{
+                 "array": {
+                     "item": [1, 2, 3]
+                 },
+                 "boolean": True,
+                 "color": "gold",
+                 "number": 123,
+                 "object": {
+                     "a": "b",
+                     "c": "d"
+                 },
+                 "string": "Hello World"
+             }
+         }),
+        ('<ItemData ItemOID="redcap_survey_identifier" Value=""/>', {"ItemData": {"ItemOID": "redcap_survey_identifier", "Value": ""}}),
+
+    ]
+)
+def test_read_xml(inp, expected):
+    """XML text handed over as an in-memory text buffer must parse into the expected dictionary."""
+    xml_buffer = StringIO(inp)
+    reader = DataReader(xml_buffer, file_extension="xml")
+    parsed = reader.data
+    assert parsed == expected
+
+
+@pytest.mark.parametrize(
+    "inp",
+    [
+        '<a b="b_content" c="c_content">	'
+        '</a>',
+        '<a b="b_content@@@" c="c_content">'
+        '</a>',
+    ]
+)
+def test_read_xml_no_at_symbols_in_keys(inp):
+    """
+    Check that no key of the parsed dictionary, at any nesting depth, contains an '@'.
+
+    The XML-to-dictionary reader marks XML attributes by prefixing their keys with '@'. Example:
+    <a b="b_content" c="c_content"></a>
+
+    is returned by the raw reader as:
+    {'a': {'@b': 'b_content', '@c': 'c_content'}}
+
+    The post-processing step is expected to strip those prefixes, so that DataReader exposes:
+    {'a': {'b': 'b_content', 'c': 'c_content'}}.
+
+    '@' characters are still allowed inside the *values*; the second test case (b="b_content@@@") exists to make
+    sure that only keys are inspected here and that such data does not trip the check.
+
+    The attribute keys live below the root element, so inspecting only the top-level keys would never see them;
+    the whole structure is therefore walked recursively.
+    """
+    def _iter_keys(node):
+        """Yield every dictionary key found in ``node``, descending into nested dicts and lists."""
+        if isinstance(node, dict):
+            for key, value in node.items():
+                yield key
+                yield from _iter_keys(value)
+        elif isinstance(node, list):
+            for item in node:
+                yield from _iter_keys(item)
+
+    data_reader = DataReader(StringIO(inp), file_extension="xml")
+    keys_with_at = [key for key in _iter_keys(data_reader.data) if '@' in str(key)]
+    assert keys_with_at == []
+
+
+@pytest.mark.parametrize(
+    "inp, expected",
+    [
+        (
+            "a,b,c,d\n1,1.23,False,hello\n2,-123,FALSE,how\n3,.5,TRUE,#!$%$^@&*/\n4,0.5,True,are\n5,0,true,you",
+            pd.DataFrame(
+                {
+                    "a": [1, 2, 3, 4, 5],
+                    "b": [1.23, -123, 0.5, 0.5, 0],
+                    "c": [False, False, True, True, True],
+                    "d": ["hello", "how", "#!$%$^@&*/", "are", "you"],
+                }
+            )
+        )
+    ]
+)
+def test_reader_csv(inp, expected):
+    """CSV text read from an in-memory buffer must match the expected frame, compared column by column."""
+    frame = DataReader(StringIO(inp), file_extension="csv").data
+    # Only the set of column names has to agree; column order is not compared.
+    assert set(frame.columns) == set(expected.columns)
+    for name in expected.columns:
+        assert frame[name].equals(expected[name])
+
+
+@pytest.mark.parametrize(
+    "inp,expected",
+    [
+        ('{"string": "Hello World"}', {"string": "Hello World"}),
+        ('{"object": {"a": "b","c": "d"}}', {"object": {"a": "b","c": "d"}}),
+        ('{"number": 123}', {"number": 123}),
+        ('{"number": -123}', {"number": -123}),
+        ('{"number": 123.4}', {"number": 123.4}),
+        ('{"null": null}', {"null": None}),
+        ('{"color": "gold"}', {"color": "gold"}),
+        ('{"boolean": true}', {"boolean": True}),
+        ('{"boolean": false}', {"boolean": False}),
+        ('{"array": [1,2,3]}', {"array": [1,2,3]}),
+        ('{"array": [1,2,3],"boolean": true, "color": "gold","null": null,"number": 123, "object": {"a": "b","c": "d"}, "string": "Hello World"}', {"array": [1,2,3],"boolean": True, "color": "gold","null": None, "number": 123, "object": {"a": "b","c": "d"}, "string": "Hello World"})
+    ],
+)
+def test_reader_json(inp, expected):
+    """JSON text handed over as an in-memory text buffer must load into the expected Python object."""
+    json_buffer = StringIO(inp)
+    reader = DataReader(json_buffer, file_extension="json")
+    loaded = reader.data
+    assert loaded == expected