Skip to content

Commit

Permalink
Enable timezone type for timestamp pyarrow type (#888)
Browse files: browse the repository at this point in the history
  • Loading branch information
PhilippeMoussalli authored Mar 4, 2024
1 parent 16bcfd7 commit 1ff9b1a
Show file tree
Hide file tree
Showing 5 changed files with 31 additions and 0 deletions.
12 changes: 12 additions & 0 deletions src/fondant/core/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,7 @@ def struct(
else:
type_ = cls._validate_data_type(data_type)
validated_fields.append(pa.field(name, type_))

return cls(pa.struct(validated_fields))

@classmethod
Expand Down Expand Up @@ -194,10 +195,17 @@ def from_dict(cls, json_schema: dict):
if not isinstance(properties, dict):
msg = "Invalid 'properties' type in object schema."
raise InvalidTypeSchema(msg)

fields = [(name, cls.from_dict(prop)) for name, prop in properties.items()]

return cls.struct(fields)

if isinstance(type_name, str):
type_format = json_schema.get("format", None)

if type_format == "date-time":
return cls(pa.timestamp("us", tz="UTC"))

return cls(type_name)

msg = f"Invalid 'type' value: {type_name}"
Expand All @@ -214,10 +222,14 @@ def to_dict(self) -> dict:
items = self.value.value_type
if isinstance(items, pa.DataType):
return {"type": "array", "items": Type(items).to_dict()}

elif isinstance(self.value, pa.StructType):
fields = [(field.name, Type(field.type).to_dict()) for field in self.value]
return {"type": "object", "properties": dict(fields)}

elif isinstance(self.value, pa.TimestampType):
return {"type": "string", "format": "date-time"}

type_ = None
for type_name, data_type in _TYPES.items():
if self.value.equals(data_type):
Expand Down
4 changes: 4 additions & 0 deletions src/fondant/core/schemas/common.json
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,10 @@
"type": "string",
"$ref": "#/definitions/subset_data_type"
},
"format": {
"type": "string",
"description": "additional format information for the field"
},
"properties": {
"type": "object",
"properties": {
Expand Down
3 changes: 3 additions & 0 deletions tests/component/examples/component_specs/component.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,9 @@ produces:
number:
type: int32

date:
type: string
format: date-time

args:
flag:
Expand Down
5 changes: 5 additions & 0 deletions tests/component/test_component.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,11 @@ def mocked_load_dataframe(self):
return dd.from_dict(
{
"images_data": [1, 2, 3],
"date": [
"2024-02-29T12:30:45",
"2024-02-29T12:30:45",
"2024-02-29T12:30:45",
],
"element": [
("1", 1),
("2", 2),
Expand Down
7 changes: 7 additions & 0 deletions tests/core/test_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@ def test_valid_type():
assert Type.list(Type("int8")).value == pa.list_(pa.int8())
assert Type.list(Type.list(Type("string"))).value == pa.list_(pa.list_(pa.string()))
assert Type("int8").to_dict() == {"type": "int8"}
assert Type(pa.timestamp("us", tz="UTC")).to_dict() == {
"type": "string",
"format": "date-time",
}
assert Type.list("float32").to_dict() == {
"type": "array",
"items": {"type": "float32"},
Expand Down Expand Up @@ -42,6 +46,9 @@ def test_valid_json_schema():
assert Type.from_dict(
{"type": "array", "items": {"type": "array", "items": {"type": "int8"}}},
).value == pa.list_(pa.list_(pa.int8()))
assert Type.from_dict(
{"type": "string", "format": "date-time"},
).value == pa.timestamp("us", tz="UTC")
assert Type.from_dict(
{
"type": "object",
Expand Down

0 comments on commit 1ff9b1a

Please sign in to comment.