Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(postgres): support basic jsonb type and existing operations #9630

Merged
merged 1 commit into from
Jul 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions ci/schema/postgres.sql
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,26 @@ INSERT INTO json_t VALUES
(13, '42'),
(14, '37.37');

-- Fixture table for the JSONB tests: (rowid, js) with one row per kind of
-- JSON value. Dropped first so the script can be re-run from scratch.
DROP TABLE IF EXISTS jsonb_t CASCADE;

CREATE TABLE IF NOT EXISTS jsonb_t (rowid BIGINT, js JSONB);

-- Rows 1-3 are objects, 4 is the JSON literal null, 5-6 are arrays,
-- 7-9 are strings, 10 is SQL NULL (no JSON value at all), 11-12 are
-- booleans and 13-14 are numbers.
INSERT INTO jsonb_t VALUES
(1, '{"a": [1,2,3,4], "b": 1}'),
(2, '{"a":null,"b":2}'),
(3, '{"a":"foo", "c":null}'),
(4, 'null'),
(5, '[42,47,55]'),
(6, '[]'),
(7, '"a"'),
(8, '""'),
(9, '"b"'),
(10, NULL),
(11, 'true'),
(12, 'false'),
(13, '42'),
(14, '37.37');

DROP TABLE IF EXISTS win CASCADE;
CREATE TABLE win (g TEXT, x BIGINT NOT NULL, y BIGINT);
INSERT INTO win VALUES
Expand Down
2 changes: 1 addition & 1 deletion ibis/backends/postgres/tests/test_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@ def test_create_and_drop_table(con, temp_table, params):
("numeric", dt.decimal),
("numeric(3, 2)", dt.Decimal(3, 2)),
("uuid", dt.uuid),
("jsonb", dt.json),
("jsonb", dt.jsonb),
("geometry", dt.geometry),
("geography", dt.geography),
]
Expand Down
78 changes: 78 additions & 0 deletions ibis/backends/postgres/tests/test_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,20 @@

import json

import numpy as np
import pandas as pd
import pandas.testing as tm
import pytest
from pytest import param

import ibis
import ibis.expr.datatypes as dt
import ibis.expr.types as ir


@pytest.fixture(scope="module")
def jsonb_t(con):
    """Return the ``jsonb_t`` table expression from the backend connection.

    The fixture is module-scoped, so the lookup happens once and the same
    table expression is shared by every test in this module.
    """
    table = con.table("jsonb_t")
    return table


@pytest.mark.parametrize("data", [param({"status": True}, id="status")])
Expand All @@ -16,3 +26,71 @@ def test_json(data, alltypes):
expr = alltypes[[alltypes.id, lit]].head(1)
df = expr.execute()
assert df["tmp"].iloc[0] == data


def test_jsonb_extract_path(con):
    """Check that ``json`` and ``jsonb`` columns agree when extracting ``"a"``."""
    plain = con.table("json_t")
    binary = con.table("jsonb_t")

    # The two column types differ only in their ``binary`` flag.
    assert plain.js.type() == dt.JSON(binary=False)
    assert binary.js.type() == dt.JSON(binary=True)

    from_jsonb = binary.js["a"].execute()
    from_json = plain.js["a"].execute()
    tm.assert_series_equal(from_jsonb, from_json)


def test_json_getitem_object(jsonb_t):
    """Look up key ``"a"`` on every row and compare the distinct results."""
    expected = frozenset([(1, 2, 3, 4), None, "foo"] + [None] * 3)

    def hashable(value):
        # Lists cannot be members of a frozenset, so turn them into tuples.
        return tuple(value) if isinstance(value, list) else value

    series = jsonb_t.js["a"].name("res").execute()
    result = frozenset(series.map(hashable).replace({np.nan: None}))
    assert result == expected


def test_json_getitem_array(jsonb_t):
    """Index every row at position 1 and compare the distinct results."""
    expected = frozenset([None] * 4 + [47, None])
    series = jsonb_t.js[1].name("res").execute()
    # NaN and None both stand for "missing"; fold NaN into None before comparing.
    result = frozenset(series.replace({np.nan: None}))
    assert result == expected


def test_json_array(jsonb_t):
    """Convert each value with ``.array``, ordered by ``rowid``.

    Only the fifth and sixth entries of the expected result are non-null.
    """
    ordered = jsonb_t.mutate("rowid", res=jsonb_t.js.array).order_by("rowid")
    result = ordered.execute().res

    expected_values = [None, None, None, None, [42, 47, 55], []] + [None] * 8
    expected = pd.Series(expected_values, name="res", dtype="object")
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize(
    ("typ", "expected_data"),
    [
        ("str", [None] * 6 + ["a", "", "b"] + [None] * 5),
        ("int", [None] * 12 + [42, None]),
        ("float", [None] * 12 + [42.0, 37.37]),
        ("bool", [None] * 10 + [True, False, None, None]),
    ],
    ids=["str", "int", "float", "bool"],
)
@pytest.mark.parametrize(
    "expr_fn", [getattr, ir.JSONValue.unwrap_as], ids=["getattr", "unwrap_as"]
)
def test_json_unwrap(jsonb_t, typ, expected_data, expr_fn):
    """Unwrap the JSON column as ``typ`` and compare with ``expected_data``.

    ``expr_fn`` is either ``getattr`` (attribute access on the column) or
    ``JSONValue.unwrap_as``; both are called as ``expr_fn(column, typ)``.
    """

    def canonical(series):
        # Fold the different missing-value spellings together, then sort and
        # renumber so the comparison does not depend on row order.
        return (
            series.replace(np.nan, None)
            .fillna(pd.NA)
            .sort_values()
            .reset_index(drop=True)
        )

    result = expr_fn(jsonb_t.js, typ).name("res").execute()
    expected = pd.Series(expected_data, name="res", dtype="object")
    tm.assert_series_equal(
        canonical(result),
        canonical(expected),
        check_dtype=False,
    )
2 changes: 1 addition & 1 deletion ibis/backends/risingwave/tests/test_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ def test_create_and_drop_table(con, temp_table):
("timestamp with time zone", dt.Timestamp("UTC", scale=6)),
("interval", dt.Interval("s")),
("numeric", dt.decimal),
("jsonb", dt.json),
("jsonb", dt.JSON(binary=True)),
]
],
)
Expand Down
64 changes: 34 additions & 30 deletions ibis/backends/sql/compilers/postgres.py
Original file line number Diff line number Diff line change
Expand Up @@ -315,66 +315,70 @@ def visit_StructField(self, op, *, arg, field):
#
# but also postgres should really support anonymous structs
return self.cast(
self.f.jsonb_extract_path(self.f.to_jsonb(arg), sge.convert(f"f{idx:d}")),
op.dtype,
self.f.jsonb_extract_path(self.f.to_jsonb(arg), f"f{idx:d}"), op.dtype
)

def json_typeof(self, op, arg):
    """Call ``jsonb_typeof`` on ``arg`` for binary JSON, else ``json_typeof``.

    The choice is made from the ``binary`` flag of ``op.arg``'s dtype.
    """
    suffix = "b" if op.arg.dtype.binary else ""
    typeof = self.f[f"json{suffix}_typeof"]
    return typeof(arg)

def json_extract_path_text(self, op, arg, *rest):
    """Call ``json[b]_extract_path_text`` on ``arg`` followed by ``rest``.

    The ``jsonb`` variant is used when ``op.arg``'s dtype is binary. An
    explicitly empty variadic array is always appended as the last argument.
    """
    suffix = "b" if op.arg.dtype.binary else ""
    extract = self.f[f"json{suffix}_extract_path_text"]
    # this is apparently how you pass in no additional arguments to
    # a variadic function, see the "Variadic Function Resolution"
    # section in
    # https://www.postgresql.org/docs/current/typeconv-func.html
    no_more_keys = sge.Var(this="VARIADIC ARRAY[]::TEXT[]")
    return extract(arg, *rest, no_more_keys)

def visit_UnwrapJSONString(self, op, *, arg):
return self.if_(
self.f.json_typeof(arg).eq(sge.convert("string")),
self.f.json_extract_path_text(
arg,
# this is apparently how you pass in no additional arguments to
# a variadic function, see the "Variadic Function Resolution"
# section in
# https://www.postgresql.org/docs/current/typeconv-func.html
sge.Var(this="VARIADIC ARRAY[]::TEXT[]"),
),
self.json_typeof(op, arg).eq(sge.convert("string")),
self.json_extract_path_text(op, arg),
NULL,
)

def visit_UnwrapJSONInt64(self, op, *, arg):
text = self.f.json_extract_path_text(
arg, sge.Var(this="VARIADIC ARRAY[]::TEXT[]")
)
text = self.json_extract_path_text(op, arg)
return self.if_(
self.f.json_typeof(arg).eq(sge.convert("number")),
self.json_typeof(op, arg).eq(sge.convert("number")),
self.cast(
self.if_(self.f.regexp_like(text, r"^\d+$", "g"), text, NULL),
op.dtype,
self.if_(self.f.regexp_like(text, r"^\d+$", "g"), text, NULL), op.dtype
),
NULL,
)

def visit_UnwrapJSONFloat64(self, op, *, arg):
text = self.f.json_extract_path_text(
arg, sge.Var(this="VARIADIC ARRAY[]::TEXT[]")
)
text = self.json_extract_path_text(op, arg)
return self.if_(
self.f.json_typeof(arg).eq(sge.convert("number")),
self.json_typeof(op, arg).eq(sge.convert("number")),
self.cast(text, op.dtype),
NULL,
)

def visit_UnwrapJSONBoolean(self, op, *, arg):
return self.if_(
self.f.json_typeof(arg).eq(sge.convert("boolean")),
self.cast(
self.f.json_extract_path_text(
arg, sge.Var(this="VARIADIC ARRAY[]::TEXT[]")
),
op.dtype,
),
self.json_typeof(op, arg).eq(sge.convert("boolean")),
self.cast(self.json_extract_path_text(op, arg), op.dtype),
NULL,
)

def visit_JSONGetItem(self, op, *, arg, index):
    """Compile a JSON item lookup, special-casing binary JSON.

    Non-binary JSON is delegated to the parent implementation. Binary JSON
    uses ``jsonb_extract_path`` with the index cast to a string.
    """
    if not op.arg.dtype.binary:
        return super().visit_JSONGetItem(op, arg=arg, index=index)
    path_key = self.cast(index, dt.string)
    return self.f.jsonb_extract_path(arg, path_key)

def visit_StructColumn(self, op, *, names, values):
    """Build a ``row(...)`` call from ``values``, each cast to its field type.

    Values are paired positionally with ``op.dtype.types``; ``names`` is not
    used here.
    """
    typed = [self.cast(value, typ) for value, typ in zip(values, op.dtype.types)]
    return self.f.row(*typed)

def visit_ToJSONArray(self, op, *, arg):
b = "b" * op.arg.dtype.binary
return self.if_(
self.f.json_typeof(arg).eq(sge.convert("array")),
self.f.array(sg.select(STAR).from_(self.f.json_array_elements(arg))),
self.json_typeof(op, arg).eq(sge.convert("array")),
self.f.array(sg.select(STAR).from_(self.f[f"json{b}_array_elements"](arg))),
NULL,
)

Expand Down
7 changes: 5 additions & 2 deletions ibis/backends/sql/datatypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
typecode.INT: dt.Int32,
typecode.IPADDRESS: dt.INET,
typecode.JSON: dt.JSON,
typecode.JSONB: dt.JSON,
typecode.JSONB: partial(dt.JSON, binary=True),
typecode.LONGBLOB: dt.Binary,
typecode.LONGTEXT: dt.String,
typecode.MEDIUMBLOB: dt.Binary,
Expand Down Expand Up @@ -115,7 +115,6 @@
dt.Float64: typecode.DOUBLE,
dt.String: typecode.VARCHAR,
dt.Binary: typecode.VARBINARY,
dt.JSON: typecode.JSON,
dt.INET: typecode.INET,
dt.UUID: typecode.UUID,
dt.MACADDR: typecode.VARCHAR,
Expand Down Expand Up @@ -325,6 +324,10 @@ def _from_sqlglot_GEOGRAPHY(
srid = int(srid.this.this)
return typeclass(geotype="geography", nullable=cls.default_nullable, srid=srid)

@classmethod
def _from_ibis_JSON(cls, dtype: dt.JSON) -> sge.DataType:
    """Map an ibis JSON dtype to sqlglot ``JSONB`` when binary, else ``JSON``."""
    if dtype.binary:
        code = typecode.JSONB
    else:
        code = typecode.JSON
    return sge.DataType(this=code)

@classmethod
def _from_ibis_Interval(cls, dtype: dt.Interval) -> sge.DataType:
assert dtype.unit is not None, "interval unit cannot be None"
Expand Down
11 changes: 10 additions & 1 deletion ibis/expr/datatypes/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -936,6 +936,13 @@ class JSON(Variadic):
scalar = "JSONScalar"
column = "JSONColumn"

binary: bool = False
"""True if JSON is stored as binary, e.g., JSONB in PostgreSQL."""

@property
def _pretty_piece(self) -> str:
    """Return ``"b"`` when ``binary`` is set and the empty string otherwise."""
    if self.binary:
        return "b"
    return ""


@public
class GeoSpatial(DataType):
Expand Down Expand Up @@ -1066,7 +1073,8 @@ class INET(DataType):
multipoint = MultiPoint()
multipolygon = MultiPolygon()
# json
json = JSON()
json = JSON(binary=False)
jsonb = JSON(binary=True)
# special string based data type
uuid = UUID()
macaddr = MACADDR()
Expand Down Expand Up @@ -1107,6 +1115,7 @@ class INET(DataType):
multipoint=multipoint,
multipolygon=multipolygon,
json=json,
jsonb=jsonb,
uuid=uuid,
macaddr=macaddr,
inet=inet,
Expand Down
2 changes: 1 addition & 1 deletion ibis/expr/datatypes/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,7 @@ def geotype_parser(typ: type[dt.DataType]) -> dt.DataType:
| array
| map
| struct
| spaceless_string("json", "uuid", "macaddr", "inet").map(
| spaceless_string("jsonb", "json", "uuid", "macaddr", "inet").map(
functools.partial(getattr, dt)
)
| spaceless_string("int").result(dt.int64)
Expand Down
13 changes: 10 additions & 3 deletions ibis/expr/operations/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

import ibis.expr.datatypes as dt
import ibis.expr.rules as rlz
from ibis.common.annotations import attribute
from ibis.expr.operations import Value


Expand All @@ -16,7 +17,7 @@ class JSONGetItem(Value):
arg: Value[dt.JSON]
index: Value[dt.String | dt.Integer]

dtype = dt.json
dtype = rlz.dtype_like("arg")
shape = rlz.shape_like("args")


Expand All @@ -26,19 +27,25 @@ class ToJSONArray(Value):

arg: Value[dt.JSON]

dtype = dt.Array(dt.json)
shape = rlz.shape_like("arg")

@attribute
def dtype(self) -> dt.DataType:
return dt.Array(self.arg.dtype)


@public
class ToJSONMap(Value):
"""Convert a value to a map of string to JSON."""

arg: Value[dt.JSON]

dtype = dt.Map(dt.string, dt.json)
shape = rlz.shape_like("arg")

@attribute
def dtype(self) -> dt.DataType:
return dt.Map(dt.string, self.arg.dtype)


@public
class UnwrapJSONString(Value):
Expand Down
2 changes: 1 addition & 1 deletion ibis/tests/strategies.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def binary_dtype(nullable=_nullable):


def json_dtype(nullable=_nullable):
return st.builds(dt.JSON, nullable=nullable)
return st.builds(dt.JSON, binary=st.booleans(), nullable=nullable)


def inet_dtype(nullable=_nullable):
Expand Down