Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

156 add util method that loads data in whatever format and provides an iterator for instances #174

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
66009bc
Merge pull request #164 from BIH-CEI/develop
frehburg Oct 8, 2024
205eab3
restructured project
frehburg Oct 8, 2024
68a72b2
create data reader
frehburg Oct 8, 2024
918b5c9
added todo
frehburg Oct 8, 2024
1095973
added read json and xml methods
frehburg Oct 8, 2024
29e5178
implement data reader that will read any file type
frehburg Oct 8, 2024
e854c79
renamed file path to path
frehburg Oct 8, 2024
f6a591d
added data reader to init
frehburg Oct 8, 2024
1054182
implemented loading using data loader
frehburg Oct 8, 2024
e78f4d7
change to accept a file
frehburg Oct 10, 2024
c49f086
added files for read json and xml
frehburg Oct 14, 2024
f1e68c7
added test case for read json
frehburg Oct 14, 2024
8f109cc
updated read json
frehburg Oct 14, 2024
d4054a4
wrote test case for xml
frehburg Oct 14, 2024
f7c3a27
updated data reader to work with buffers
frehburg Oct 14, 2024
0c27663
updated read xml, still some bugs
frehburg Oct 14, 2024
d4c2904
implemented logic for loading in directories of files
frehburg Oct 14, 2024
41dd8de
added todos
frehburg Oct 14, 2024
5722fed
syntax error with walrus operator
frehburg Oct 15, 2024
56f8639
updated test cases for xml reading
frehburg Oct 15, 2024
be812a8
updated reading of xmls implementation
frehburg Oct 15, 2024
91b11a8
added xmltodict to pyproject.toml
frehburg Oct 15, 2024
3db6458
refactored toml
frehburg Oct 15, 2024
0b9336b
added parse xml to init
frehburg Oct 15, 2024
69ec03b
merge
frehburg Oct 15, 2024
19e8050
deleted cli package
frehburg Oct 15, 2024
55ad3f1
removed from init
frehburg Oct 15, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,14 @@ classifiers = [
"Topic :: Scientific/Engineering :: Machine Learning"
]
dependencies = [
"loguru", "phenopackets", "pandas", "openpyxl", "jupyter", "requests", "bs4",
"loguru",
"phenopackets",
"pandas",
"openpyxl",
"jupyter",
"requests",
"bs4",
"xmltodict==0.14.1",
]
dynamic = ["version"]

Expand Down
14 changes: 11 additions & 3 deletions src/phenopacket_mapper/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,16 @@

__version__ = "0.0.1"

from . import cli, data_standards, pipeline, preprocessing, api_requests
from . import data_standards, validate, preprocessing, api_requests, mapping, utils

from .pipeline import PhenopacketMapper
from .data_standards import DataModel
from .mapping import PhenopacketMapper

__all__ = ["cli", "data_standards", "pipeline", "PhenopacketMapper", "preprocessing", "api_requests"]
__all__ = [
"data_standards", "DataModel",
"validate",
"preprocessing",
"api_requests",
"mapping", "PhenopacketMapper",
"utils",
]
1 change: 0 additions & 1 deletion src/phenopacket_mapper/cli/__init__.py

This file was deleted.

50 changes: 0 additions & 50 deletions src/phenopacket_mapper/cli/main.py

This file was deleted.

32 changes: 0 additions & 32 deletions src/phenopacket_mapper/cli/mapping_command.py

This file was deleted.

50 changes: 0 additions & 50 deletions src/phenopacket_mapper/cli/quickstart_command.py

This file was deleted.

19 changes: 0 additions & 19 deletions src/phenopacket_mapper/cli/validate_command.py

This file was deleted.

3 changes: 2 additions & 1 deletion src/phenopacket_mapper/data_standards/data_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,7 @@ def load_data(
:param kwargs: Dynamically passed parameters that match {id}_column for each item
:return: A list of `DataModelInstance` objects
"""
# TODO: move the dynamic params to the load method in utils.io
column_names = dict()
for f in self.fields:
column_param = f"{f.id}_column"
Expand All @@ -264,7 +265,7 @@ def load_data(
else:
column_names[f.id] = kwargs[column_param]

from phenopacket_mapper.pipeline import load_data_using_data_model
from phenopacket_mapper.utils.io import load_data_using_data_model
return load_data_using_data_model(
path=path,
data_model=self,
Expand Down
12 changes: 0 additions & 12 deletions src/phenopacket_mapper/pipeline/__init__.py

This file was deleted.

19 changes: 19 additions & 0 deletions src/phenopacket_mapper/utils/io/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
"""This module handles the input and output of data."""

# Re-export the io helpers so callers can import them from the package root.
from .read_json import read_json
from .read_xml import read_xml, parse_xml
from .data_reader import DataReader
from .input import read_data_model, read_phenopackets, read_phenopacket_from_json, load_data_using_data_model
from .output import write

# Public API of the io subpackage: readers first, then the writer.
__all__ = [
    'read_json',
    'read_xml', 'parse_xml',
    'DataReader',
    'read_data_model',
    'read_phenopackets',
    'read_phenopacket_from_json',
    'load_data_using_data_model',

    'write',
]
111 changes: 111 additions & 0 deletions src/phenopacket_mapper/utils/io/data_reader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
from pathlib import Path
from typing import Union, Tuple, List, Iterable, Literal, Dict
from io import IOBase, TextIOWrapper, BytesIO, BufferedIOBase, TextIOBase

import pandas as pd

from phenopacket_mapper.utils.io import read_json, read_xml


class DataReader:
    """Reads tabular or hierarchical data from a file path, directory, or buffer.

    After construction, ``self.data`` holds the parsed content (a
    :class:`pandas.DataFrame` for csv/xlsx, a ``dict``/``list`` for json/xml)
    and ``self.iterable`` holds an iterable of per-instance items.

    NOTE(review): file handles opened here are kept open for the object's
    lifetime and are never closed — consider adding context-manager support.
    """

    def __init__(
            self,
            file: Union[str, Path, IOBase, List[str], List[Path], List[IOBase]],
            encoding: str = 'utf-8',
            file_extension: Literal['csv', 'xlsx', 'json', 'xml'] = None
    ):
        """Initializes the data reader.

        :param file: a `str`, :class:`Path` or :class:`IOBase` to read from. If `str` or :class:`Path`, then the
                    input is interpreted as a path to a local file or directory.
        :param encoding: The encoding to use when reading the file. Default is 'utf-8'.
        :param file_extension: The file extension of the file to read. If `None`, the file extension is inferred
                    from the file path. Default is `None`.
        :raises FileNotFoundError: if a path is given that does not exist.
        :raises ValueError: if the extension is missing for a buffer, is unrecognized,
                    or the input type is unsupported.
        """
        # TODO: fix read xml
        # TODO: add option to pass a list of files to read
        self.is_dir = False
        self.file_extension = None
        self.path = None
        self.file = None

        if isinstance(file, str):
            # Treat plain strings as local paths; unify with the Path branch below
            # so strings get the same existence check.
            file = Path(file)

        if isinstance(file, Path):
            if not file.exists():
                raise FileNotFoundError(f"File {file} does not exist.")
            if file.is_file():
                self.path = file
                self.file = open(self.path, "r", encoding=encoding)

                if file_extension is None:  # extract the file extension from the file path
                    file_extension = self.path.suffix[1:]

                self.handle_file_extension(file_extension)
            elif file.is_dir():
                self.is_dir = True
                # Bug fix: self.path must be set so _read() can iterate the directory.
                self.path = file
        elif isinstance(file, IOBase):
            if isinstance(file, (TextIOWrapper, TextIOBase)):
                # Bug fix: the original left self.file unset for text buffers,
                # which crashed later in _read().
                self.file = file
            elif isinstance(file, (BytesIO, BufferedIOBase)):
                # Wrap binary buffers so downstream readers always see text.
                self.file = TextIOWrapper(file, encoding=encoding)

            if file_extension is None:
                raise ValueError("File extension must be provided when passing a file buffer.")
            self.handle_file_extension(file_extension)
        else:
            # Fail fast instead of crashing later with an AttributeError in _read().
            raise ValueError(f"Unsupported input type for file: {type(file)}")

        self.data, self.iterable = self._read()

    def handle_file_extension(self, fe: str):
        """Normalizes and validates the file extension, storing it on the instance.

        :param fe: extension without the leading dot (case-insensitive).
        :raises ValueError: if the extension is not one of csv, xlsx, json, xml.
        """
        if fe.lower() in ['csv', 'xlsx', 'json', 'xml']:
            self.file_extension = fe.lower()
        else:
            raise ValueError(f"File extension {fe} not recognized.")

    def _read(self) -> Tuple[Union[pd.DataFrame, List, Dict], Iterable]:
        """Reads the data from ``self.file`` (single file/buffer) or ``self.path`` (directory).

        :return: The data and an iterable representation of the data.
        :raises ValueError: on unknown extensions, mixed-extension directories,
                    or empty directories.
        """
        if not self.is_dir:
            # self.file is always a text buffer with the contents of the file here.
            if self.file_extension == 'csv':
                df = pd.read_csv(self.file)
                return df, [row for row in df.iterrows()]
            elif self.file_extension == 'xlsx':
                df = pd.read_excel(self.file)
                return df, [row for row in df.iterrows()]
            elif self.file_extension == 'json':
                return (file_contents := read_json(self.file)), [file_contents]
            elif self.file_extension == 'xml':
                return (file_contents := read_xml(self.file)), [file_contents]
            else:
                raise ValueError(f'Unknown file type with extension {self.file_extension}')
        else:
            # Collect list of all files in the folder; they must all share one extension.
            files: List[Path] = [file for file in self.path.iterdir() if file.is_file()]
            file_extension = list(set([file.suffix[1:] for file in files]))
            if len(file_extension) > 1:
                raise ValueError(f"Cannot read files of different types: {file_extension}")
            elif len(file_extension) == 0:
                # Bug fix: error messages referenced self.file, which is unset
                # for directories; report the directory path instead.
                raise ValueError(f"No files found in the directory specified: {self.path}")

            self.handle_file_extension(file_extension[0])

            if self.file_extension == 'json':
                jsons = [read_json(file) for file in files]
                return jsons, jsons
            elif self.file_extension == 'xml':
                xmls = [read_xml(file) for file in files]
                return xmls, xmls
            else:
                raise ValueError(f"File extension {file_extension} not recognized or not supported for reading files "
                                 f"from a directory. Specified directory: {self.path}. Extensions found: "
                                 f"{file_extension}")
Loading
Loading