Skip to content

Commit

Permalink
Merge pull request #178 from BIH-CEI/177-debug-data-reader
Browse files Browse the repository at this point in the history
177 debug data reader
  • Loading branch information
frehburg authored Oct 16, 2024
2 parents 73c8c5a + ebeae86 commit b14ef3e
Show file tree
Hide file tree
Showing 4 changed files with 231 additions and 18 deletions.
84 changes: 70 additions & 14 deletions notebooks/hierarchical_data_model.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -6,22 +6,23 @@
"metadata": {
"collapsed": true,
"ExecuteTime": {
"end_time": "2024-10-14T21:21:45.314262Z",
"start_time": "2024-10-14T21:21:45.309299Z"
"end_time": "2024-10-15T16:49:45.994633Z",
"start_time": "2024-10-15T16:49:45.291536Z"
}
},
"source": [
"from phenopacket_mapper.data_standards import DataField\n",
"from phenopacket_mapper.data_standards import DataModel, ValueSet, DataSection, OrGroup"
"from phenopacket_mapper.data_standards import DataModel, ValueSet, DataSection, OrGroup\n",
"from phenopacket_mapper.utils.io import DataReader"
],
"outputs": [],
"execution_count": 4
"execution_count": 1
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-10-14T21:21:45.340239Z",
"start_time": "2024-10-14T21:21:45.334880Z"
"end_time": "2024-10-15T16:49:46.001925Z",
"start_time": "2024-10-15T16:49:45.997639Z"
}
},
"cell_type": "code",
Expand Down Expand Up @@ -81,13 +82,13 @@
],
"id": "2e979683ae450d9b",
"outputs": [],
"execution_count": 5
"execution_count": 2
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-10-14T21:21:45.352293Z",
"start_time": "2024-10-14T21:21:45.347715Z"
"end_time": "2024-10-15T16:49:46.057078Z",
"start_time": "2024-10-15T16:49:46.053873Z"
}
},
"cell_type": "code",
Expand Down Expand Up @@ -156,20 +157,75 @@
]
}
],
"execution_count": 6
"execution_count": 3
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-10-14T21:21:45.378046Z",
"start_time": "2024-10-14T21:21:45.375530Z"
"end_time": "2024-10-15T16:49:46.123667Z",
"start_time": "2024-10-15T16:49:46.120256Z"
}
},
"cell_type": "code",
"source": "",
"source": [
"from io import StringIO\n",
"\n",
"xml_data = \\\n",
" (\n",
" '<?xml version=\"1.0\" encoding=\"UTF-8\" ?> <ODM xmlns=\"http://www.cdisc.org/ns/odm/v1.3\" xmlns:ds=\"http://www.w3.org/2000/09/xmldsig#\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xmlns:redcap=\"https://projectredcap.org\" xsi:schemaLocation=\"http://www.cdisc.org/ns/odm/v1.3 schema/odm/ODM1-3-1.xsd\" ODMVersion=\"1.3.1\" FileOID=\"000-00-0000\" FileType=\"Snapshot\" Description=\"genAdipositas - ALT Demo\" AsOfDateTime=\"2024-10-14T11:57:18\" CreationDateTime=\"2024-10-14T11:57:18\" SourceSystem=\"REDCap\" SourceSystemVersion=\"14.6.9\"> '\n",
" '<ClinicalData StudyOID=\"Project.GenAdipositasALTDemo\" MetaDataVersionOID=\"Metadata.GenAdipositasALTDemo_2024-10-14_1157\">'\n",
" '<SubjectData SubjectKey=\"101\" redcap:RecordIdField=\"record_id\">\t'\n",
" '</SubjectData>'\n",
" '</ClinicalData>'\n",
" '</ODM>'\n",
" )\n",
"\n",
"buffer = StringIO(xml_data)"
],
"id": "4c78eb05ea58ff6c",
"outputs": [],
"execution_count": null
"execution_count": 4
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-10-15T16:49:46.146860Z",
"start_time": "2024-10-15T16:49:46.143098Z"
}
},
"cell_type": "code",
"source": "dr = DataReader(buffer, file_extension=\"xml\")",
"id": "a9f83d6e46715301",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"dict_={'ODM': {'@xmlns': 'http://www.cdisc.org/ns/odm/v1.3', '@xmlns:ds': 'http://www.w3.org/2000/09/xmldsig#', '@xmlns:xsi': 'http://www.w3.org/2001/XMLSchema-instance', '@xmlns:redcap': 'https://projectredcap.org', '@xsi:schemaLocation': 'http://www.cdisc.org/ns/odm/v1.3 schema/odm/ODM1-3-1.xsd', '@ODMVersion': '1.3.1', '@FileOID': '000-00-0000', '@FileType': 'Snapshot', '@Description': 'genAdipositas - ALT Demo', '@AsOfDateTime': '2024-10-14T11:57:18', '@CreationDateTime': '2024-10-14T11:57:18', '@SourceSystem': 'REDCap', '@SourceSystemVersion': '14.6.9', 'ClinicalData': {'@StudyOID': 'Project.GenAdipositasALTDemo', '@MetaDataVersionOID': 'Metadata.GenAdipositasALTDemo_2024-10-14_1157', 'SubjectData': {'@SubjectKey': '101', '@redcap:RecordIdField': 'record_id'}}}}, type(dict_)=<class 'dict'>\n",
"k='ODM', type(k)=<class 'str'>, v={'@xmlns': 'http://www.cdisc.org/ns/odm/v1.3', '@xmlns:ds': 'http://www.w3.org/2000/09/xmldsig#', '@xmlns:xsi': 'http://www.w3.org/2001/XMLSchema-instance', '@xmlns:redcap': 'https://projectredcap.org', '@xsi:schemaLocation': 'http://www.cdisc.org/ns/odm/v1.3 schema/odm/ODM1-3-1.xsd', '@ODMVersion': '1.3.1', '@FileOID': '000-00-0000', '@FileType': 'Snapshot', '@Description': 'genAdipositas - ALT Demo', '@AsOfDateTime': '2024-10-14T11:57:18', '@CreationDateTime': '2024-10-14T11:57:18', '@SourceSystem': 'REDCap', '@SourceSystemVersion': '14.6.9', 'ClinicalData': {'@StudyOID': 'Project.GenAdipositasALTDemo', '@MetaDataVersionOID': 'Metadata.GenAdipositasALTDemo_2024-10-14_1157', 'SubjectData': {'@SubjectKey': '101', '@redcap:RecordIdField': 'record_id'}}}, type(v)=<class 'dict'>\n",
"k='@xmlns', type(k)=<class 'str'>, v='http://www.cdisc.org/ns/odm/v1.3', type(v)=<class 'str'>\n",
"k='@xmlns:ds', type(k)=<class 'str'>, v='http://www.w3.org/2000/09/xmldsig#', type(v)=<class 'str'>\n",
"k='@xmlns:xsi', type(k)=<class 'str'>, v='http://www.w3.org/2001/XMLSchema-instance', type(v)=<class 'str'>\n",
"k='@xmlns:redcap', type(k)=<class 'str'>, v='https://projectredcap.org', type(v)=<class 'str'>\n",
"k='@xsi:schemaLocation', type(k)=<class 'str'>, v='http://www.cdisc.org/ns/odm/v1.3 schema/odm/ODM1-3-1.xsd', type(v)=<class 'str'>\n",
"k='@ODMVersion', type(k)=<class 'str'>, v='1.3.1', type(v)=<class 'str'>\n",
"k='@FileOID', type(k)=<class 'str'>, v='000-00-0000', type(v)=<class 'str'>\n",
"k='@FileType', type(k)=<class 'str'>, v='Snapshot', type(v)=<class 'str'>\n",
"k='@Description', type(k)=<class 'str'>, v='genAdipositas - ALT Demo', type(v)=<class 'str'>\n",
"k='@AsOfDateTime', type(k)=<class 'str'>, v='2024-10-14T11:57:18', type(v)=<class 'str'>\n",
"k='@CreationDateTime', type(k)=<class 'str'>, v='2024-10-14T11:57:18', type(v)=<class 'str'>\n",
"k='@SourceSystem', type(k)=<class 'str'>, v='REDCap', type(v)=<class 'str'>\n",
"k='@SourceSystemVersion', type(k)=<class 'str'>, v='14.6.9', type(v)=<class 'str'>\n",
"k='ClinicalData', type(k)=<class 'str'>, v={'@StudyOID': 'Project.GenAdipositasALTDemo', '@MetaDataVersionOID': 'Metadata.GenAdipositasALTDemo_2024-10-14_1157', 'SubjectData': {'@SubjectKey': '101', '@redcap:RecordIdField': 'record_id'}}, type(v)=<class 'dict'>\n",
"k='@StudyOID', type(k)=<class 'str'>, v='Project.GenAdipositasALTDemo', type(v)=<class 'str'>\n",
"k='@MetaDataVersionOID', type(k)=<class 'str'>, v='Metadata.GenAdipositasALTDemo_2024-10-14_1157', type(v)=<class 'str'>\n",
"k='SubjectData', type(k)=<class 'str'>, v={'@SubjectKey': '101', '@redcap:RecordIdField': 'record_id'}, type(v)=<class 'dict'>\n",
"k='@SubjectKey', type(k)=<class 'str'>, v='101', type(v)=<class 'str'>\n",
"k='@redcap:RecordIdField', type(k)=<class 'str'>, v='record_id', type(v)=<class 'str'>\n"
]
}
],
"execution_count": 5
}
],
"metadata": {
Expand Down
8 changes: 5 additions & 3 deletions src/phenopacket_mapper/utils/io/data_reader.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from pathlib import Path
from typing import Union, Tuple, List, Iterable, Literal, Dict
from io import IOBase, TextIOWrapper, BytesIO, BufferedIOBase, TextIOBase
from io import IOBase, TextIOWrapper, BytesIO, BufferedIOBase, TextIOBase, StringIO

import pandas as pd

Expand Down Expand Up @@ -50,15 +50,17 @@ def __init__(
self.is_dir = True

elif isinstance(file, IOBase):
if isinstance(file, (TextIOWrapper, TextIOBase)):
pass
if isinstance(file, (TextIOWrapper, TextIOBase, StringIO)):
self.file = file
elif isinstance(file, (BytesIO, BufferedIOBase)):
self.file = TextIOWrapper(file, encoding=encoding)

if file_extension is None:
raise ValueError("File extension must be provided when passing a file buffer.")
else:
self.handle_file_extension(file_extension)
else:
raise ValueError(f"Invalid input type {type(file)}.")

self.data, self.iterable = self._read()

Expand Down
1 change: 0 additions & 1 deletion src/phenopacket_mapper/utils/io/read_xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,6 @@ def remove_at_symbols(dict_: Dict) -> Dict:
def parse_xml(file: IOBase) -> Dict:
    """Parse an XML file into a dictionary with inferred types.

    The complete buffer is read and handed to ``xmltodict.parse``; the result
    is then passed through ``_post_process_xml_dict`` and ``remove_at_symbols``
    before being returned.

    :param file: readable buffer containing the XML document
    :return: the post-processed dictionary representation of the document
    """
    # Deliberately no debug print of the parsed document: it floods stdout on
    # every read and echoes the full record content of the input file.
    parsed = xmltodict.parse(file.read())
    return remove_at_symbols(_post_process_xml_dict(parsed))
Expand Down
156 changes: 156 additions & 0 deletions tests/utils/io/test_data_reader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
from io import StringIO

import pandas as pd
import pytest
from phenopacket_mapper.utils.io import DataReader


@pytest.mark.parametrize(
    "inp,expected",
    [
        # REDCap ODM export: attributes on nested elements, numeric attribute value
        (
            '<?xml version="1.0" encoding="UTF-8" ?> <ODM xmlns="http://www.cdisc.org/ns/odm/v1.3" xmlns:ds="http://www.w3.org/2000/09/xmldsig#" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:redcap="https://projectredcap.org" xsi:schemaLocation="http://www.cdisc.org/ns/odm/v1.3 schema/odm/ODM1-3-1.xsd" ODMVersion="1.3.1" FileOID="000-00-0000" FileType="Snapshot" Description="genAdipositas - ALT Demo" AsOfDateTime="2024-10-14T11:57:18" CreationDateTime="2024-10-14T11:57:18" SourceSystem="REDCap" SourceSystemVersion="14.6.9"> '
            '<ClinicalData StudyOID="Project.GenAdipositasALTDemo" MetaDataVersionOID="Metadata.GenAdipositasALTDemo_2024-10-14_1157">'
            '<SubjectData SubjectKey="101" redcap:RecordIdField="record_id"> '
            '</SubjectData>'
            '</ClinicalData>'
            '</ODM>',
            {
                'ODM': {
                    'AsOfDateTime': '2024-10-14T11:57:18',
                    'ClinicalData': {
                        'MetaDataVersionOID': 'Metadata.GenAdipositasALTDemo_2024-10-14_1157',
                        'StudyOID': 'Project.GenAdipositasALTDemo',
                        'SubjectData': {
                            'SubjectKey': 101,
                            'redcap:RecordIdField': 'record_id',
                        },
                    },
                    'CreationDateTime': '2024-10-14T11:57:18',
                    'Description': 'genAdipositas - ALT Demo',
                    'FileOID': '000-00-0000',
                    'FileType': 'Snapshot',
                    'ODMVersion': '1.3.1',
                    'SourceSystem': 'REDCap',
                    'SourceSystemVersion': '14.6.9',
                    'xmlns': 'http://www.cdisc.org/ns/odm/v1.3',
                    'xmlns:ds': 'http://www.w3.org/2000/09/xmldsig#',
                    'xmlns:redcap': 'https://projectredcap.org',
                    'xmlns:xsi': 'http://www.w3.org/2001/XMLSchema-instance',
                    'xsi:schemaLocation': 'http://www.cdisc.org/ns/odm/v1.3 '
                                          'schema/odm/ODM1-3-1.xsd',
                },
            },
        ),
        ('<string>Hello World</string>', {"string": "Hello World"}),
        ('<object><a>b</a><c>d</c></object>', {"object": {"a": "b", "c": "d"}}),
        ('<number>123</number>', {"number": 123}),
        ('<number>-123</number>', {"number": -123}),
        ('<number>123.4</number>', {"number": 123.4}),
        ('<null></null>', {"null": None}),  # empty tag
        ('<null />', {"null": None}),  # empty tag
        ('<null xsi:nil="true"/>', {"null": None}),  # explicit null
        ('<color>gold</color>', {"color": "gold"}),
        ('<boolean>true</boolean>', {"boolean": True}),
        ('<boolean>false</boolean>', {"boolean": False}),
        ('<array><item>1</item><item>2</item><item>3</item></array>', {"array": {"item": [1, 2, 3]}}),
        # all of the simple cases above combined under a single root element
        (
            '<root>'
            '<array>'
            '<item>1</item>'
            '<item>2</item>'
            '<item>3</item>'
            '</array>'
            '<boolean>true</boolean>'
            '<color>gold</color>'
            '<number>123</number>'
            '<object>'
            '<a>b</a>'
            '<c>d</c>'
            '</object>'
            '<string>Hello World</string>'
            '</root>',
            {
                "root": {
                    "array": {
                        "item": [1, 2, 3]
                    },
                    "boolean": True,
                    "color": "gold",
                    "number": 123,
                    "object": {
                        "a": "b",
                        "c": "d"
                    },
                    "string": "Hello World"
                }
            },
        ),
        (
            '<ItemData ItemOID="redcap_survey_identifier" Value=""/>',
            {"ItemData": {"ItemOID": "redcap_survey_identifier", "Value": ""}},
        ),
    ]
)
def test_read_xml(inp, expected):
    """Reading XML from an in-memory text buffer yields the expected dictionary."""
    xml_buffer = StringIO(inp)
    reader = DataReader(xml_buffer, file_extension="xml")
    assert reader.data == expected


def _collect_keys(node):
    """Return every dictionary key found anywhere inside ``node``, nested levels included."""
    if isinstance(node, dict):
        found = []
        for key, value in node.items():
            found.append(key)
            found.extend(_collect_keys(value))
        return found
    if isinstance(node, (list, tuple)):
        # repeated XML elements are represented as lists of values/dicts
        return [key for item in node for key in _collect_keys(item)]
    return []


@pytest.mark.parametrize(
    "inp",
    [
        '<a b="b_content" c="c_content"> '
        '</a>',
        '<a b="b_content@@@" c="c_content">'
        '</a>',
    ]
)
def test_read_xml_no_at_symbols_in_keys(inp):
    """
    At symbols are allowed in the data, but the post processing function has to remove the @ symbols that the
    xml to python dictionary reader puts in front of attribute names. Example:
    <a b="b_content" c="c_content"></a>
    is parsed to:
    {'a': {'@b': 'b_content', '@c': 'c_content'}}
    and this test ensures that the post processing function turns it into:
    {'a': {'b': 'b_content', 'c': 'c_content'}}.
    The second input carries @ symbols in an attribute value, which must not cause a failure because only keys
    are inspected.

    The attribute keys live below the top level, so the keys of every nesting level are checked, not only
    ``data.keys()``.
    """
    data_reader = DataReader(StringIO(inp), file_extension="xml")
    keys_with_at_symbol = [key for key in _collect_keys(data_reader.data) if '@' in str(key)]
    assert keys_with_at_symbol == []


@pytest.mark.parametrize(
    "inp, expected",
    [
        (
            "a,b,c,d\n1,1.23,False,hello\n2,-123,FALSE,how\n3,.5,TRUE,#!$%$^@&*/\n4,0.5,True,are\n5,0,true,you",
            pd.DataFrame(
                {
                    "a": [1, 2, 3, 4, 5],
                    "b": [1.23, -123, 0.5, 0.5, 0],
                    "c": [False, False, True, True, True],
                    "d": ["hello", "how", "#!$%$^@&*/", "are", "you"],
                }
            ),
        )
    ]
)
def test_reader_csv(inp, expected):
    """Reading CSV from an in-memory text buffer yields the expected columns and column contents."""
    frame = DataReader(StringIO(inp), file_extension="csv").data
    # column order is irrelevant, only the set of column names has to match
    assert set(frame.columns) == set(expected.columns)
    for column_name, expected_column in expected.items():
        assert frame[column_name].equals(expected_column)


@pytest.mark.parametrize(
    "inp,expected",
    [
        ('{"string": "Hello World"}', {"string": "Hello World"}),
        ('{"object": {"a": "b","c": "d"}}', {"object": {"a": "b", "c": "d"}}),
        ('{"number": 123}', {"number": 123}),
        ('{"number": -123}', {"number": -123}),
        ('{"number": 123.4}', {"number": 123.4}),
        ('{"null": null}', {"null": None}),
        ('{"color": "gold"}', {"color": "gold"}),
        ('{"boolean": true}', {"boolean": True}),
        ('{"boolean": false}', {"boolean": False}),
        ('{"array": [1,2,3]}', {"array": [1, 2, 3]}),
        # every JSON value kind combined in one document
        (
            '{"array": [1,2,3],"boolean": true, "color": "gold","null": null,"number": 123, "object": {"a": "b","c": "d"}, "string": "Hello World"}',
            {
                "array": [1, 2, 3],
                "boolean": True,
                "color": "gold",
                "null": None,
                "number": 123,
                "object": {"a": "b", "c": "d"},
                "string": "Hello World",
            },
        ),
    ],
)
def test_reader_json(inp, expected):
    """Reading JSON from an in-memory text buffer yields the equivalent Python object."""
    json_buffer = StringIO(inp)
    reader = DataReader(json_buffer, file_extension="json")
    assert reader.data == expected

0 comments on commit b14ef3e

Please sign in to comment.