Commit

Path reader update (#9)
* fixed bug in versioned dataset and added tests for it

* fixing flake

* fixing bug

* updating so path_reader can read all files in a path if read_args is not provided

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fixing linting

* fixing mypy lint

* fixed so path_reader can now read an entire path if no read_args are provided

* path_reader can now read all files in a path and return them as a dataframe (see the usage sketch below)

* fixing small bug

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fixing mypy

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
ms-cmy and pre-commit-ci[bot] authored Oct 6, 2023
1 parent 07b0ae3 commit 2dab392
Showing 2 changed files with 40 additions and 20 deletions.
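
For context, the user-facing effect of this change is that `read_args` becomes optional on `PathReader`. Below is a minimal usage sketch; the import path and constructor arguments come from the diff that follows, but the directory path and the `history_length` value are made-up placeholders, and calling the private `_load()` directly simply mirrors the new test rather than any documented API.

# Sketch only: assumes a local directory of parquet files; the paths and
# values below are illustrative, not taken from the repository.
from kedro_projetaai.utils.readers import PathReader

# Before this commit, read_args was mandatory and had to carry
# time_scale/history_length, otherwise __init__ raised a ValueError.
filtered_reader = PathReader(
    path="data/01_raw/sales/",                             # hypothetical path
    read_args={"time_scale": "D", "history_length": 30},   # "D" is mapped to "days"
    credentials=None,
)

# After this commit, omitting read_args means "read every file under the path"
# and return the result as a single dataframe.
full_reader = PathReader(path="data/01_raw/sales/", credentials=None)
df = full_reader._load()  # same call the new test uses
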
37 changes: 17 additions & 20 deletions kedro_projetaai/utils/readers.py
@@ -45,6 +45,8 @@
     pickle_dump,
 )
 
+TIME_SCALE_MAP = {"D": "days", "M": "months", "Y": "years"}
+
 
 class DatasetTypes(Enum):
     """Abstracts the read and write methods for each file extension."""
@@ -498,7 +500,7 @@ class PathReader(BaseDataset):
     def __init__(
         self,
         path: str,
-        read_args: Optional[dict[str, Any]],
+        read_args: Optional[dict[str, Any]] = None,
         load_args: Optional[dict[str, Any]] = None,
         credentials: Optional[dict] = None,
         back_date: Optional[str] = None,
@@ -517,36 +519,30 @@ def __init__(
             starting_weekday: int, the starting weekday to
                 use as reference to read data.
         """
-        self.read_args = self.raise_if_read_args_is_none(read_args)
+        self._setting_read_args(read_args)
         super().__init__(
             path=path, load_args=load_args, credentials=credentials, back_date=back_date
         )
-        self._transform_load_config()
 
-    def raise_if_read_args_is_none(
-        self, read_args: Optional[dict[str, Any]]
-    ) -> dict[str, Any]:
+    def _setting_read_args(self, read_args: Optional[dict[str, Any]]):
         """Raises an error if the read_args is None."""
-        if read_args is None:
-            raise ValueError(
-                """read_args must be provided in yml file \n
-                with the following arguments: \n
-                time_scale, history_length"""
-            )
-        return read_args
+        self.read_args = {} if read_args is None else read_args
+        self._transform_time_scale()
+        return
 
-    def _validate_load_config(self) -> str:
+    def _validate_read_args_config(self) -> str:
         """Validates the time_scale argument in the read_args."""
         current_time_scale = self.read_args.get("time_scale", None)
         if current_time_scale is None:
             raise ValueError("time_scale must be provided in yml file")
         return current_time_scale
 
-    def _transform_load_config(self) -> None:
+    def _transform_time_scale(self) -> None:
         """Transforms the time_scale to the pandas time scale."""
-        time_scale_map = {"D": "days", "M": "months", "Y": "years"}
-        current_time_scale = self._validate_load_config()
-        self.read_args["time_scale"] = time_scale_map.get(current_time_scale, "days")
+        if self.read_args:
+            self.read_args["time_scale"] = TIME_SCALE_MAP.get(
+                self._validate_read_args_config(), "days"
+            )
         return
 
     def _is_within_date_range(
@@ -564,12 +560,13 @@ def _is_within_date_range(
 
     def _get_paths(self) -> list[str]:
         path_list = self._filesystem.find(self.path)
-        if path_list is False:
+        if not path_list:
             raise ValueError(
                 f"""No files found in the given path
                 please check if it's correct: {self.path}"""
             )
-        path_list = self._filter(path_list)
+        if self.read_args:
+            path_list = self._filter(path_list)
         return path_list
 
     def _filter(self, path_list: list[str]) -> list[str]:
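
To make the new control flow easier to follow, here is a condensed, self-contained sketch of the read_args handling introduced above. The method bodies mirror the added lines, but the surrounding class (filesystem access, path handling, `_filter`) is stubbed out, so treat this as an illustration rather than the library code itself.

from typing import Any, Optional

TIME_SCALE_MAP = {"D": "days", "M": "months", "Y": "years"}


class ReadArgsSketch:
    """Stand-in for PathReader's new read_args handling (illustrative only)."""

    def __init__(self, read_args: Optional[dict[str, Any]] = None):
        self._setting_read_args(read_args)

    def _setting_read_args(self, read_args: Optional[dict[str, Any]]) -> None:
        # An empty dict now means "no filtering": later, _get_paths skips
        # _filter entirely and every file under the path gets read.
        self.read_args = {} if read_args is None else read_args
        self._transform_time_scale()

    def _validate_read_args_config(self) -> str:
        current_time_scale = self.read_args.get("time_scale", None)
        if current_time_scale is None:
            raise ValueError("time_scale must be provided in yml file")
        return current_time_scale

    def _transform_time_scale(self) -> None:
        # Only translate the short code ("D"/"M"/"Y") when read_args was given.
        if self.read_args:
            self.read_args["time_scale"] = TIME_SCALE_MAP.get(
                self._validate_read_args_config(), "days"
            )


print(ReadArgsSketch({"time_scale": "D"}).read_args)  # {'time_scale': 'days'}
print(ReadArgsSketch().read_args)                     # {} -> read everything
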
23 changes: 23 additions & 0 deletions tests/test_readers.py
@@ -269,6 +269,29 @@ def test_PathReader_with_date_folders(self):
         remove_files()
         return
 
+    def test_PathReader_read_all(self):
+        """
+        Test if the PathReader class is working correctly.
+
+        If read_args is not provided, it reads all files in the path
+        and returns them as a dataframe.
+        """
+        df = generate_dataframe(270, 2)
+        save_files(
+            df.groupby(df["date"].dt.strftime("%Y-%m-%d")),
+            "parquet",
+            TEMP_PREFIX + "test.parquet",
+            name_group_edit=True,
+        )
+        readfile_obj = PathReader(
+            path=TEMP_PREFIX,
+            credentials=None,
+        )
+        df_read = readfile_obj._load()
+        self.assertTrue(df_read.equals(df))
+        remove_files()
+        return
+
     def test_PathReader_back_date(self):
         """Test if the backdate is being applied correctly in PathReader."""
         df = generate_dataframe(300, 2)
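
The new test above depends on module-level helpers (`generate_dataframe`, `save_files`, `remove_files`, `TEMP_PREFIX`) that are not shown in this diff. A rough, self-contained equivalent of the round-trip it exercises, using plain pandas and a temporary directory, is sketched below; it assumes a parquet engine such as pyarrow is installed and only approximates the helpers' behaviour.

import tempfile
from pathlib import Path

import pandas as pd

# Small frame split into one parquet file per day, then reassembled the way a
# read_args-less PathReader would: read every file found under the path.
df = pd.DataFrame(
    {"date": pd.date_range("2023-01-01", periods=6, freq="D"), "value": range(6)}
)

with tempfile.TemporaryDirectory() as tmp:
    for day, chunk in df.groupby(df["date"].dt.strftime("%Y-%m-%d")):
        chunk.to_parquet(Path(tmp) / f"{day}.parquet", index=False)

    files = sorted(Path(tmp).glob("*.parquet"))
    df_read = pd.concat((pd.read_parquet(f) for f in files), ignore_index=True)

# Mirrors the test's df_read.equals(df) assertion.
assert df_read.equals(df)
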
