From 10f8fdbe173b9421a545da72a5ad68cddac49b10 Mon Sep 17 00:00:00 2001 From: Philippe Moussalli Date: Thu, 29 Feb 2024 12:49:15 +0100 Subject: [PATCH 1/2] enable timezone type --- src/fondant/core/schema.py | 9 +++++++++ src/fondant/core/schemas/common.json | 4 ++++ tests/component/examples/component_specs/component.yaml | 3 +++ tests/component/test_component.py | 5 +++++ tests/core/test_schema.py | 7 +++++++ 5 files changed, 28 insertions(+) diff --git a/src/fondant/core/schema.py b/src/fondant/core/schema.py index 7108fa4c..14599ea7 100644 --- a/src/fondant/core/schema.py +++ b/src/fondant/core/schema.py @@ -197,7 +197,13 @@ def from_dict(cls, json_schema: dict): fields = [(name, cls.from_dict(prop)) for name, prop in properties.items()] return cls.struct(fields) + if type_name == "timestamp": + return cls(type_name) + if isinstance(type_name, str): + type_format = json_schema.get("format", None) + if type_format == "date-time": + return cls(pa.timestamp("us", tz="UTC")) return cls(type_name) msg = f"Invalid 'type' value: {type_name}" @@ -218,6 +224,9 @@ def to_dict(self) -> dict: fields = [(field.name, Type(field.type).to_dict()) for field in self.value] return {"type": "object", "properties": dict(fields)} + elif isinstance(self.value, pa.TimestampType): + return {"type": "string", "format": "date-time"} + type_ = None for type_name, data_type in _TYPES.items(): if self.value.equals(data_type): diff --git a/src/fondant/core/schemas/common.json b/src/fondant/core/schemas/common.json index b00eb8a0..75744900 100644 --- a/src/fondant/core/schemas/common.json +++ b/src/fondant/core/schemas/common.json @@ -38,6 +38,10 @@ "type": "string", "$ref": "#/definitions/subset_data_type" }, + "format": { + "type": "string", + "description": "additional format information for the field" + }, "properties": { "type": "object", "properties": { diff --git a/tests/component/examples/component_specs/component.yaml b/tests/component/examples/component_specs/component.yaml index 43d1f221..7d4c2a4f 100644 --- a/tests/component/examples/component_specs/component.yaml +++ b/tests/component/examples/component_specs/component.yaml @@ -30,6 +30,9 @@ produces: number: type: int32 + date: + type: string + format: date-time args: flag: diff --git a/tests/component/test_component.py b/tests/component/test_component.py index 75aa7662..4511e1f4 100644 --- a/tests/component/test_component.py +++ b/tests/component/test_component.py @@ -55,6 +55,11 @@ def mocked_load_dataframe(self): return dd.from_dict( { "images_data": [1, 2, 3], + "date": [ + "2024-02-29T12:30:45", + "2024-02-29T12:30:45", + "2024-02-29T12:30:45", + ], "element": [ ("1", 1), ("2", 2), diff --git a/tests/core/test_schema.py b/tests/core/test_schema.py index c1d9df34..752ac291 100644 --- a/tests/core/test_schema.py +++ b/tests/core/test_schema.py @@ -11,6 +11,10 @@ def test_valid_type(): assert Type.list(Type("int8")).value == pa.list_(pa.int8()) assert Type.list(Type.list(Type("string"))).value == pa.list_(pa.list_(pa.string())) assert Type("int8").to_dict() == {"type": "int8"} + assert Type(pa.timestamp("us", tz="UTC")).to_dict() == { + "type": "string", + "format": "date-time", + } assert Type.list("float32").to_dict() == { "type": "array", "items": {"type": "float32"}, @@ -42,6 +46,9 @@ def test_valid_json_schema(): assert Type.from_dict( {"type": "array", "items": {"type": "array", "items": {"type": "int8"}}}, ).value == pa.list_(pa.list_(pa.int8())) + assert Type.from_dict( + {"type": "string", "format": "date-time"}, + ).value == pa.timestamp("us", tz="UTC") assert Type.from_dict( { "type": "object", From a81299d95c9f6b6e7e70a30926ee1f952954e5c8 Mon Sep 17 00:00:00 2001 From: Philippe Moussalli Date: Fri, 1 Mar 2024 10:41:44 +0100 Subject: [PATCH 2/2] implement PR feedback --- src/fondant/core/schema.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/fondant/core/schema.py b/src/fondant/core/schema.py index 14599ea7..728bef5d 100644 --- a/src/fondant/core/schema.py +++ b/src/fondant/core/schema.py @@ -159,6 +159,7 @@ def struct( else: type_ = cls._validate_data_type(data_type) validated_fields.append(pa.field(name, type_)) + return cls(pa.struct(validated_fields)) @classmethod @@ -194,16 +195,17 @@ def from_dict(cls, json_schema: dict): if not isinstance(properties, dict): msg = "Invalid 'properties' type in object schema." raise InvalidTypeSchema(msg) + fields = [(name, cls.from_dict(prop)) for name, prop in properties.items()] - return cls.struct(fields) - if type_name == "timestamp": - return cls(type_name) + return cls.struct(fields) if isinstance(type_name, str): type_format = json_schema.get("format", None) + if type_format == "date-time": return cls(pa.timestamp("us", tz="UTC")) + return cls(type_name) msg = f"Invalid 'type' value: {type_name}" @@ -220,6 +222,7 @@ def to_dict(self) -> dict: items = self.value.value_type if isinstance(items, pa.DataType): return {"type": "array", "items": Type(items).to_dict()} + elif isinstance(self.value, pa.StructType): fields = [(field.name, Type(field.type).to_dict()) for field in self.value] return {"type": "object", "properties": dict(fields)}