-
Notifications
You must be signed in to change notification settings - Fork 4
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Added Pydantic validation function and manual validation documentation #87
Changes from 8 commits
1ffb885
389ada9
d24df32
fe79097
c67ca33
dfe23a4
7b0b43e
e1045aa
0480051
94f3207
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
@@ -1,3 +1,18 @@ | ||||||
""" | ||||||
To manually use Pydantic with CBRkit to validate your case base, you can use an appropriate | ||||||
Pydantic model instead of the CBRkit loaders (see example below). | ||||||
Alternatively, the dataframe, path, file and folder accept an optional validation_model argument | ||||||
to validate the Casebase entries. | ||||||
|
||||||
|
||||||
Example: | ||||||
>>> from pydantic import BaseModel, PositiveInt, NonNegativeInt | ||||||
>>> from data.cars_validation_model import Car | ||||||
>>> data = csv("data/cars-1k.csv") | ||||||
>>> for row in data.values(): | ||||||
... assert isinstance(Car.model_validate(row), Car) | ||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Now that we have the validation function, maybe we should just refer to its docstring instead? |
||||||
""" | ||||||
|
||||||
import csv as csvlib | ||||||
import tomllib | ||||||
from collections import abc | ||||||
|
@@ -13,6 +28,7 @@ | |||||
from pandas import DataFrame, Series | ||||||
|
||||||
from cbrkit.typing import Casebase, FilePath | ||||||
from pydantic import BaseModel | ||||||
|
||||||
__all__ = [ | ||||||
"csv", | ||||||
|
@@ -26,6 +42,7 @@ | |||||
"python", | ||||||
"txt", | ||||||
"xml", | ||||||
"validate", | ||||||
] | ||||||
|
||||||
|
||||||
|
@@ -325,6 +342,12 @@ def file(path: Path) -> Casebase[Any, Any] | None: | |||||
>>> from pathlib import Path | ||||||
>>> file_path = Path("./data/cars-1k.csv") | ||||||
>>> result = file(file_path) | ||||||
|
||||||
>>> from pydantic import BaseModel, PositiveInt, NonNegativeInt | ||||||
>>> from pathlib import Path | ||||||
>>> file_path = Path("./data/cars-1k.csv") | ||||||
>>> result = file(file_path) | ||||||
|
||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What does this snippet achieve? The model does not seem to be used here. |
||||||
""" | ||||||
if path.suffix not in _batch_loaders: | ||||||
return None | ||||||
|
@@ -341,23 +364,54 @@ def folder(path: Path, pattern: str) -> Casebase[Any, Any] | None: | |||||
Args: | ||||||
path: Path of the folder. | ||||||
pattern: Relative pattern for the files. | ||||||
|
||||||
Returns: | ||||||
Returns a Casebase. | ||||||
|
||||||
Examples: | ||||||
>>> from pathlib import Path | ||||||
>>> from data.cars_validation_model import Car | ||||||
>>> folder_path = Path("./data") | ||||||
>>> result = folder(folder_path, ".csv") | ||||||
>>> result = folder(folder_path, "*.csv") | ||||||
>>> assert result is not None | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same as above, what is the goal here? |
||||||
""" | ||||||
cb: Casebase[Any, Any] = {} | ||||||
|
||||||
for file in path.glob(pattern): | ||||||
if file.is_file() and file.suffix in _single_loaders: | ||||||
loader = _single_loaders[path.suffix] | ||||||
loader = _single_loaders[file.suffix] | ||||||
cb[file.name] = loader(file) | ||||||
|
||||||
if len(cb) == 0: | ||||||
return None | ||||||
|
||||||
return cb | ||||||
|
||||||
|
||||||
def validate(
    data: dict[str, Any] | object,
    validation_model: type[BaseModel],
) -> None:
    """Validate a case base or a single case against a Pydantic model.

    Args:
        data: Data to validate. Can be an entire case base or a single case.
            A ``DataFrameCasebase`` is first converted to a dict of rows.
            For a ``dict``, every value is validated on its own; any other
            object is validated as a single case.
        validation_model: Pydantic model class (not an instance) whose
            ``model_validate`` is called on each case.

    Raises:
        ValueError: If ``data`` is ``None``.
        pydantic.ValidationError: If the data does not match the model.

    Examples:
        >>> from data.cars_validation_model import Car
        >>> from pathlib import Path
        >>> data = path(Path("data/cars-1k.csv"))
        >>> validate(data, Car)
        >>> import pandas as pd
        >>> df = pd.read_csv("data/cars-1k.csv")
        >>> data = dataframe(df)
        >>> validate(data, Car)
    """
    if data is None:
        raise ValueError("Data is None")

    if isinstance(data, DataFrameCasebase):
        # NOTE(review): materializing the whole frame with to_dict("index")
        # was flagged as slow during review. It was agreed to leave it as is
        # and revisit it if someone reports performance issues.
        data = data.df.to_dict("index")

    if isinstance(data, dict):
        # NOTE: any dict is treated as a case base (key -> case), so a single
        # case passed as a plain dict has each of its values validated.
        for item in data.values():
            validation_model.model_validate(item)
    else:
        validation_model.model_validate(data)
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
"""This module contains the Pydantic model for validating the car data.""" | ||
from pydantic import BaseModel, PositiveInt, NonNegativeInt | ||
from typing import Literal | ||
|
||
class Car(BaseModel):
    """Pydantic model describing one entry of the car data.

    Each attribute corresponds to one field of a car record. Integer fields
    must be non-negative, and categorical fields only accept the literal
    values listed in their annotation.
    """

    # Non-negative integer.
    price: NonNegativeInt
    # Non-negative integer.
    year: NonNegativeInt
    # Free-form text.
    manufacturer: str
    # Free-form text.
    make: str
    # Only "gas" or "diesel" are accepted.
    fuel: Literal["gas", "diesel"]
    # Non-negative integer.
    miles: NonNegativeInt
    # Only "clean" or "rebuilt" are accepted.
    title_status: Literal["clean", "rebuilt"]
    # Only "automatic" or "manual" are accepted.
    transmission: Literal["automatic", "manual"]
    # Only "fwd", "rwd" or "4wd" are accepted.
    drive: Literal["fwd", "rwd", "4wd"]
    # Free-form text; the field name mirrors the data and shadows the
    # builtin ``type`` only inside this class body.
    type: str
    # Free-form text.
    paint_color: str
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -55,6 +55,7 @@ transformers = { version = "^4.35", optional = true } | |||||
typer = { version = ">=0.9, <1.0", extras = ["all"], optional = true } | ||||||
uvicorn = { version = ">=0.24, <1.0", optional = true, extras = ["standard"] } | ||||||
xmltodict = ">=0.13, <1.0" | ||||||
pydantic = { version = ">=2.0.0", optional = true } | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Since
Suggested change
|
||||||
|
||||||
[tool.poetry.group.dev.dependencies] | ||||||
pytest = "^8.0.0" | ||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The description should be updated for the new `cbrkit.loaders.validate` function.