diff --git a/ci/schema/postgres.sql b/ci/schema/postgres.sql index e94eec27a3f4..5407351254ce 100644 --- a/ci/schema/postgres.sql +++ b/ci/schema/postgres.sql @@ -268,6 +268,26 @@ INSERT INTO json_t VALUES (13, '42'), (14, '37.37'); +DROP TABLE IF EXISTS jsonb_t CASCADE; + +CREATE TABLE IF NOT EXISTS jsonb_t (rowid BIGINT, js JSONB); + +INSERT INTO jsonb_t VALUES + (1, '{"a": [1,2,3,4], "b": 1}'), + (2, '{"a":null,"b":2}'), + (3, '{"a":"foo", "c":null}'), + (4, 'null'), + (5, '[42,47,55]'), + (6, '[]'), + (7, '"a"'), + (8, '""'), + (9, '"b"'), + (10, NULL), + (11, 'true'), + (12, 'false'), + (13, '42'), + (14, '37.37'); + DROP TABLE IF EXISTS win CASCADE; CREATE TABLE win (g TEXT, x BIGINT NOT NULL, y BIGINT); INSERT INTO win VALUES diff --git a/ibis/backends/postgres/tests/test_client.py b/ibis/backends/postgres/tests/test_client.py index bde3637cfac0..496ae0190c34 100644 --- a/ibis/backends/postgres/tests/test_client.py +++ b/ibis/backends/postgres/tests/test_client.py @@ -173,7 +173,7 @@ def test_create_and_drop_table(con, temp_table, params): ("numeric", dt.decimal), ("numeric(3, 2)", dt.Decimal(3, 2)), ("uuid", dt.uuid), - ("jsonb", dt.json), + ("jsonb", dt.jsonb), ("geometry", dt.geometry), ("geography", dt.geography), ] diff --git a/ibis/backends/postgres/tests/test_json.py b/ibis/backends/postgres/tests/test_json.py index 6f6d04b58f7a..219562b81cb8 100644 --- a/ibis/backends/postgres/tests/test_json.py +++ b/ibis/backends/postgres/tests/test_json.py @@ -4,10 +4,20 @@ import json +import numpy as np +import pandas as pd +import pandas.testing as tm import pytest from pytest import param import ibis +import ibis.expr.datatypes as dt +import ibis.expr.types as ir + + +@pytest.fixture(scope="module") +def jsonb_t(con): + return con.table("jsonb_t") @pytest.mark.parametrize("data", [param({"status": True}, id="status")]) @@ -16,3 +26,71 @@ def test_json(data, alltypes): expr = alltypes[[alltypes.id, lit]].head(1) df = expr.execute() assert df["tmp"].iloc[0] == data + + +def test_jsonb_extract_path(con): + json_t = con.table("json_t") + jsonb_t = con.table("jsonb_t") + + assert json_t.js.type() == dt.JSON(binary=False) + assert jsonb_t.js.type() == dt.JSON(binary=True) + + tm.assert_series_equal(jsonb_t.js["a"].execute(), json_t.js["a"].execute()) + + +def test_json_getitem_object(jsonb_t): + expr_fn = lambda t: t.js["a"].name("res") + expected = frozenset([(1, 2, 3, 4), None, "foo"] + [None] * 3) + expr = expr_fn(jsonb_t) + result = frozenset( + expr.execute() + .map(lambda o: tuple(o) if isinstance(o, list) else o) + .replace({np.nan: None}) + ) + assert result == expected + + +def test_json_getitem_array(jsonb_t): + expr_fn = lambda t: t.js[1].name("res") + expected = frozenset([None] * 4 + [47, None]) + expr = expr_fn(jsonb_t) + result = frozenset(expr.execute().replace({np.nan: None})) + assert result == expected + + +def test_json_array(jsonb_t): + expr = jsonb_t.mutate("rowid", res=jsonb_t.js.array).order_by("rowid") + result = expr.execute().res + expected = pd.Series( + [None, None, None, None, [42, 47, 55], []] + [None] * 8, + name="res", + dtype="object", + ) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + ("typ", "expected_data"), + [ + ("str", [None] * 6 + ["a", "", "b"] + [None] * 5), + ("int", [None] * 12 + [42, None]), + ("float", [None] * 12 + [42.0, 37.37]), + ("bool", [None] * 10 + [True, False, None, None]), + ], + ids=["str", "int", "float", "bool"], +) +@pytest.mark.parametrize( + "expr_fn", [getattr, ir.JSONValue.unwrap_as], ids=["getattr", "unwrap_as"] +) +def test_json_unwrap(jsonb_t, typ, expected_data, expr_fn): + expr = expr_fn(jsonb_t.js, typ).name("res") + result = expr.execute() + expected = pd.Series(expected_data, name="res", dtype="object") + tm.assert_series_equal( + result.replace(np.nan, None).fillna(pd.NA).sort_values().reset_index(drop=True), + expected.replace(np.nan, None) + .fillna(pd.NA) + .sort_values() + .reset_index(drop=True), + check_dtype=False, + ) diff --git a/ibis/backends/risingwave/tests/test_client.py b/ibis/backends/risingwave/tests/test_client.py index 255a6264eec7..1d2ce761242d 100644 --- a/ibis/backends/risingwave/tests/test_client.py +++ b/ibis/backends/risingwave/tests/test_client.py @@ -110,7 +110,7 @@ def test_create_and_drop_table(con, temp_table): ("timestamp with time zone", dt.Timestamp("UTC", scale=6)), ("interval", dt.Interval("s")), ("numeric", dt.decimal), - ("jsonb", dt.json), + ("jsonb", dt.JSON(binary=True)), ] ], ) diff --git a/ibis/backends/sql/compilers/postgres.py b/ibis/backends/sql/compilers/postgres.py index e074f9f89c4e..9f8050c4a62d 100644 --- a/ibis/backends/sql/compilers/postgres.py +++ b/ibis/backends/sql/compilers/postgres.py @@ -315,66 +315,70 @@ def visit_StructField(self, op, *, arg, field): # # but also postgres should really support anonymous structs return self.cast( - self.f.jsonb_extract_path(self.f.to_jsonb(arg), sge.convert(f"f{idx:d}")), - op.dtype, + self.f.jsonb_extract_path(self.f.to_jsonb(arg), f"f{idx:d}"), op.dtype + ) + + def json_typeof(self, op, arg): + b = "b" * op.arg.dtype.binary + return self.f[f"json{b}_typeof"](arg) + + def json_extract_path_text(self, op, arg, *rest): + b = "b" * op.arg.dtype.binary + return self.f[f"json{b}_extract_path_text"]( + arg, + *rest, + # this is apparently how you pass in no additional arguments to + # a variadic function, see the "Variadic Function Resolution" + # section in + # https://www.postgresql.org/docs/current/typeconv-func.html + sge.Var(this="VARIADIC ARRAY[]::TEXT[]"), ) def visit_UnwrapJSONString(self, op, *, arg): return self.if_( - self.f.json_typeof(arg).eq(sge.convert("string")), - self.f.json_extract_path_text( - arg, - # this is apparently how you pass in no additional arguments to - # a variadic function, see the "Variadic Function Resolution" - # section in - # https://www.postgresql.org/docs/current/typeconv-func.html - sge.Var(this="VARIADIC ARRAY[]::TEXT[]"), - ), + self.json_typeof(op, arg).eq(sge.convert("string")), + self.json_extract_path_text(op, arg), NULL, ) def visit_UnwrapJSONInt64(self, op, *, arg): - text = self.f.json_extract_path_text( - arg, sge.Var(this="VARIADIC ARRAY[]::TEXT[]") - ) + text = self.json_extract_path_text(op, arg) return self.if_( - self.f.json_typeof(arg).eq(sge.convert("number")), + self.json_typeof(op, arg).eq(sge.convert("number")), self.cast( - self.if_(self.f.regexp_like(text, r"^\d+$", "g"), text, NULL), - op.dtype, + self.if_(self.f.regexp_like(text, r"^\d+$", "g"), text, NULL), op.dtype ), NULL, ) def visit_UnwrapJSONFloat64(self, op, *, arg): - text = self.f.json_extract_path_text( - arg, sge.Var(this="VARIADIC ARRAY[]::TEXT[]") - ) + text = self.json_extract_path_text(op, arg) return self.if_( - self.f.json_typeof(arg).eq(sge.convert("number")), + self.json_typeof(op, arg).eq(sge.convert("number")), self.cast(text, op.dtype), NULL, ) def visit_UnwrapJSONBoolean(self, op, *, arg): return self.if_( - self.f.json_typeof(arg).eq(sge.convert("boolean")), - self.cast( - self.f.json_extract_path_text( - arg, sge.Var(this="VARIADIC ARRAY[]::TEXT[]") - ), - op.dtype, - ), + self.json_typeof(op, arg).eq(sge.convert("boolean")), + self.cast(self.json_extract_path_text(op, arg), op.dtype), NULL, ) + def visit_JSONGetItem(self, op, *, arg, index): + if op.arg.dtype.binary: + return self.f.jsonb_extract_path(arg, self.cast(index, dt.string)) + return super().visit_JSONGetItem(op, arg=arg, index=index) + def visit_StructColumn(self, op, *, names, values): return self.f.row(*map(self.cast, values, op.dtype.types)) def visit_ToJSONArray(self, op, *, arg): + b = "b" * op.arg.dtype.binary return self.if_( - self.f.json_typeof(arg).eq(sge.convert("array")), - self.f.array(sg.select(STAR).from_(self.f.json_array_elements(arg))), + self.json_typeof(op, arg).eq(sge.convert("array")), + self.f.array(sg.select(STAR).from_(self.f[f"json{b}_array_elements"](arg))), NULL, ) diff --git a/ibis/backends/sql/datatypes.py b/ibis/backends/sql/datatypes.py index 62f3a24edb39..c18c97472d65 100644 --- a/ibis/backends/sql/datatypes.py +++ b/ibis/backends/sql/datatypes.py @@ -34,7 +34,7 @@ typecode.INT: dt.Int32, typecode.IPADDRESS: dt.INET, typecode.JSON: dt.JSON, - typecode.JSONB: dt.JSON, + typecode.JSONB: partial(dt.JSON, binary=True), typecode.LONGBLOB: dt.Binary, typecode.LONGTEXT: dt.String, typecode.MEDIUMBLOB: dt.Binary, @@ -115,7 +115,6 @@ dt.Float64: typecode.DOUBLE, dt.String: typecode.VARCHAR, dt.Binary: typecode.VARBINARY, - dt.JSON: typecode.JSON, dt.INET: typecode.INET, dt.UUID: typecode.UUID, dt.MACADDR: typecode.VARCHAR, @@ -325,6 +324,10 @@ def _from_sqlglot_GEOGRAPHY( srid = int(srid.this.this) return typeclass(geotype="geography", nullable=cls.default_nullable, srid=srid) + @classmethod + def _from_ibis_JSON(cls, dtype: dt.JSON) -> sge.DataType: + return sge.DataType(this=typecode.JSONB if dtype.binary else typecode.JSON) + @classmethod def _from_ibis_Interval(cls, dtype: dt.Interval) -> sge.DataType: assert dtype.unit is not None, "interval unit cannot be None" diff --git a/ibis/expr/datatypes/core.py b/ibis/expr/datatypes/core.py index 2c8316de79d7..f7c4324ceb1a 100644 --- a/ibis/expr/datatypes/core.py +++ b/ibis/expr/datatypes/core.py @@ -936,6 +936,13 @@ class JSON(Variadic): scalar = "JSONScalar" column = "JSONColumn" + binary: bool = False + """True if JSON is stored as binary, e.g., JSONB in PostgreSQL.""" + + @property + def _pretty_piece(self) -> str: + return "b" * self.binary + @public class GeoSpatial(DataType): @@ -1066,7 +1073,8 @@ class INET(DataType): multipoint = MultiPoint() multipolygon = MultiPolygon() # json -json = JSON() +json = JSON(binary=False) +jsonb = JSON(binary=True) # special string based data type uuid = UUID() macaddr = MACADDR() @@ -1107,6 +1115,7 @@ class INET(DataType): multipoint=multipoint, multipolygon=multipolygon, json=json, + jsonb=jsonb, uuid=uuid, macaddr=macaddr, inet=inet, diff --git a/ibis/expr/datatypes/parse.py b/ibis/expr/datatypes/parse.py index ac802e7f89bf..22ec355333bb 100644 --- a/ibis/expr/datatypes/parse.py +++ b/ibis/expr/datatypes/parse.py @@ -200,7 +200,7 @@ def geotype_parser(typ: type[dt.DataType]) -> dt.DataType: | array | map | struct - | spaceless_string("json", "uuid", "macaddr", "inet").map( + | spaceless_string("jsonb", "json", "uuid", "macaddr", "inet").map( functools.partial(getattr, dt) ) | spaceless_string("int").result(dt.int64) diff --git a/ibis/expr/operations/json.py b/ibis/expr/operations/json.py index bf5675fad4db..0f009af51b13 100644 --- a/ibis/expr/operations/json.py +++ b/ibis/expr/operations/json.py @@ -6,6 +6,7 @@ import ibis.expr.datatypes as dt import ibis.expr.rules as rlz +from ibis.common.annotations import attribute from ibis.expr.operations import Value @@ -16,7 +17,7 @@ class JSONGetItem(Value): arg: Value[dt.JSON] index: Value[dt.String | dt.Integer] - dtype = dt.json + dtype = rlz.dtype_like("arg") shape = rlz.shape_like("args") @@ -26,9 +27,12 @@ class ToJSONArray(Value): arg: Value[dt.JSON] - dtype = dt.Array(dt.json) shape = rlz.shape_like("arg") + @attribute + def dtype(self) -> dt.DataType: + return dt.Array(self.arg.dtype) + @public class ToJSONMap(Value): @@ -36,9 +40,12 @@ class ToJSONMap(Value): arg: Value[dt.JSON] - dtype = dt.Map(dt.string, dt.json) shape = rlz.shape_like("arg") + @attribute + def dtype(self) -> dt.DataType: + return dt.Map(dt.string, self.arg.dtype) + @public class UnwrapJSONString(Value): diff --git a/ibis/tests/strategies.py b/ibis/tests/strategies.py index d31e1ca035ab..8d2e7cdb14d3 100644 --- a/ibis/tests/strategies.py +++ b/ibis/tests/strategies.py @@ -81,7 +81,7 @@ def binary_dtype(nullable=_nullable): def json_dtype(nullable=_nullable): - return st.builds(dt.JSON, nullable=nullable) + return st.builds(dt.JSON, binary=st.booleans(), nullable=nullable) def inet_dtype(nullable=_nullable):