Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(api): add API for unwrapping JSON values into backend-native values #8958

Merged
merged 1 commit into from
Apr 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion ci/schema/bigquery.sql
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,15 @@ INSERT INTO {dataset}.json_t VALUES
(JSON '{{"a":"foo", "c":null}}'),
(JSON 'null'),
(JSON '[42,47,55]'),
(JSON '[]');
(JSON '[]'),
(JSON '"a"'),
(JSON '""'),
(JSON '"b"'),
(NULL),
(JSON 'true'),
(JSON 'false'),
(JSON '42'),
(JSON '37.37');


LOAD DATA OVERWRITE {dataset}.functional_alltypes (
Expand Down
12 changes: 10 additions & 2 deletions ci/schema/duckdb.sql
Original file line number Diff line number Diff line change
Expand Up @@ -29,15 +29,23 @@ INSERT INTO struct VALUES
(NULL),
({'a': 3.0, 'b': 'orange', 'c': NULL});

-- JSON fixture table; the js column is declared with the JSON type.
-- The superseded `js TEXT` definition and the old `('[]');` terminator were
-- dropped: left in place they ended the INSERT early and orphaned the rows
-- that follow.
CREATE OR REPLACE TABLE json_t (js JSON);

-- Rows cover objects, arrays, JSON null, SQL NULL, strings (including the
-- empty string), booleans, an integer and a float.
INSERT INTO json_t VALUES
    ('{"a": [1,2,3,4], "b": 1}'),
    ('{"a":null,"b":2}'),
    ('{"a":"foo", "c":null}'),
    ('null'),
    ('[42,47,55]'),
    ('[]'),
    ('"a"'),
    ('""'),
    ('"b"'),
    (NULL),
    ('true'),
    ('false'),
    ('42'),
    ('37.37');

CREATE OR REPLACE TABLE win (g TEXT, x BIGINT NOT NULL, y BIGINT);
INSERT INTO win VALUES
Expand Down
10 changes: 9 additions & 1 deletion ci/schema/mysql.sql
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,15 @@ INSERT INTO json_t VALUES
('{"a":"foo", "c":null}'),
('null'),
('[42,47,55]'),
('[]');
('[]'),
('"a"'),
('""'),
('"b"'),
(NULL),
('true'),
('false'),
('42'),
('37.37');

DROP TABLE IF EXISTS win CASCADE;

Expand Down
10 changes: 9 additions & 1 deletion ci/schema/postgres.sql
Original file line number Diff line number Diff line change
Expand Up @@ -273,7 +273,15 @@ INSERT INTO json_t VALUES
('{"a":"foo", "c":null}'),
('null'),
('[42,47,55]'),
('[]');
('[]'),
('"a"'),
('""'),
('"b"'),
(NULL),
('true'),
('false'),
('42'),
('37.37');

DROP TABLE IF EXISTS win CASCADE;
CREATE TABLE win (g TEXT, x BIGINT NOT NULL, y BIGINT);
Expand Down
10 changes: 9 additions & 1 deletion ci/schema/risingwave.sql
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,15 @@ INSERT INTO "json_t" VALUES
('{"a":"foo", "c":null}'),
('null'),
('[42,47,55]'),
('[]');
('[]'),
('"a"'),
('""'),
('"b"'),
(NULL),
('true'),
('false'),
('42'),
('37.37');

DROP TABLE IF EXISTS "win" CASCADE;
CREATE TABLE "win" ("g" TEXT, "x" BIGINT, "y" BIGINT);
Expand Down
10 changes: 9 additions & 1 deletion ci/schema/snowflake.sql
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,15 @@ INSERT INTO "json_t" ("js")
SELECT parse_json('{"a":"foo", "c":null}') UNION
SELECT parse_json('null') UNION
SELECT parse_json('[42,47,55]') UNION
SELECT parse_json('[]');
SELECT parse_json('[]') UNION
SELECT parse_json('"a"') UNION
SELECT parse_json('""') UNION
SELECT parse_json('"b"') UNION
SELECT NULL UNION
SELECT parse_json('true') UNION
SELECT parse_json('false') UNION
SELECT parse_json('42') UNION
SELECT parse_json('37.37');

CREATE OR REPLACE TABLE "win" ("g" TEXT, "x" BIGINT NOT NULL, "y" BIGINT);
INSERT INTO "win" VALUES
Expand Down
10 changes: 9 additions & 1 deletion ci/schema/sqlite.sql
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,15 @@ INSERT INTO json_t VALUES
('{"a":"foo", "c":null}'),
('null'),
('[42,47,55]'),
('[]');
('[]'),
('"a"'),
('""'),
('"b"'),
(NULL),
('true'),
('false'),
('42'),
('37.37');

DROP TABLE IF EXISTS win;
CREATE TABLE win (g TEXT, x BIGINT NOT NULL, y BIGINT);
Expand Down
10 changes: 9 additions & 1 deletion ci/schema/trino.sql
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,15 @@ INSERT INTO memory.default.json_t VALUES
(JSON '{"a":"foo", "c":null}'),
(JSON 'null'),
(JSON '[42,47,55]'),
(JSON '[]');
(JSON '[]'),
(JSON '"a"'),
(JSON '""'),
(JSON '"b"'),
(NULL),
(JSON 'true'),
(JSON 'false'),
(JSON '42'),
(JSON '37.37');

DROP TABLE IF EXISTS win;
CREATE TABLE win (g VARCHAR, x BIGINT, y BIGINT);
Expand Down
12 changes: 12 additions & 0 deletions ibis/backends/bigquery/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -381,6 +381,18 @@
def visit_JSONGetItem(self, op, *, arg, index):
    """Compile ``JSONGetItem`` by subscripting ``arg`` with ``index``."""
    element = arg[index]
    return element

def visit_UnwrapJSONString(self, op, *, arg):
    """Compile ``UnwrapJSONString`` by calling ``safe.string`` on ``arg``.

    NOTE(review): the ``safe.`` prefix presumably turns conversion errors
    into NULL -- confirm against the BigQuery documentation.
    """
    to_string = self.f.anon["safe.string"]
    return to_string(arg)

Check warning on line 385 in ibis/backends/bigquery/compiler.py

View check run for this annotation

Codecov / codecov/patch

ibis/backends/bigquery/compiler.py#L385

Added line #L385 was not covered by tests

def visit_UnwrapJSONInt64(self, op, *, arg):
    """Compile ``UnwrapJSONInt64`` by calling ``safe.int64`` on ``arg``.

    NOTE(review): the ``safe.`` prefix presumably turns conversion errors
    into NULL -- confirm against the BigQuery documentation.
    """
    to_int64 = self.f.anon["safe.int64"]
    return to_int64(arg)

Check warning on line 388 in ibis/backends/bigquery/compiler.py

View check run for this annotation

Codecov / codecov/patch

ibis/backends/bigquery/compiler.py#L388

Added line #L388 was not covered by tests

def visit_UnwrapJSONFloat64(self, op, *, arg):
    """Compile ``UnwrapJSONFloat64`` by calling ``safe.float64`` on ``arg``.

    NOTE(review): the ``safe.`` prefix presumably turns conversion errors
    into NULL -- confirm against the BigQuery documentation.
    """
    to_float64 = self.f.anon["safe.float64"]
    return to_float64(arg)

Check warning on line 391 in ibis/backends/bigquery/compiler.py

View check run for this annotation

Codecov / codecov/patch

ibis/backends/bigquery/compiler.py#L391

Added line #L391 was not covered by tests

def visit_UnwrapJSONBoolean(self, op, *, arg):
    """Compile ``UnwrapJSONBoolean`` by calling ``safe.bool`` on ``arg``.

    NOTE(review): the ``safe.`` prefix presumably turns conversion errors
    into NULL -- confirm against the BigQuery documentation.
    """
    to_bool = self.f.anon["safe.bool"]
    return to_bool(arg)

Check warning on line 394 in ibis/backends/bigquery/compiler.py

View check run for this annotation

Codecov / codecov/patch

ibis/backends/bigquery/compiler.py#L394

Added line #L394 was not covered by tests

def visit_ExtractEpochSeconds(self, op, *, arg):
    """Compile ``ExtractEpochSeconds`` as a ``unix_seconds`` call on ``arg``."""
    epoch_seconds = self.f.unix_seconds(arg)
    return epoch_seconds

Expand Down
36 changes: 34 additions & 2 deletions ibis/backends/duckdb/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,10 +209,42 @@ def visit_MapContains(self, op, *, arg, key):
return self.f.len(self.f.element_at(arg, key)).neq(0)

def visit_ToJSONMap(self, op, *, arg):
    """Compile ``ToJSONMap``: convert ``arg`` to ``op.dtype`` for JSON objects only.

    The value is cast to JSON and then to ``op.dtype``. When
    ``json_type(arg)`` is not ``"OBJECT"`` the expression evaluates to NULL.
    """
    # The superseded ``sge.TryCast`` return that used to precede this
    # expression made the type-guarded cast unreachable, so it was dropped.
    is_object = self.f.json_type(arg).eq("OBJECT")
    as_map = self.cast(self.cast(arg, dt.json), op.dtype)
    return self.if_(is_object, as_map, NULL)

def visit_ToJSONArray(self, op, *, arg):
    """Compile ``ToJSONArray``: convert ``arg`` to ``op.dtype`` for JSON arrays only.

    The value is cast to JSON and then to ``op.dtype``. When
    ``json_type(arg)`` is not ``"ARRAY"`` the expression evaluates to NULL.
    """
    # The superseded delegation to ``visit_ToJSONMap`` that used to precede
    # this expression made the ARRAY-guarded cast unreachable, so it was
    # dropped.
    is_array = self.f.json_type(arg).eq("ARRAY")
    as_array = self.cast(self.cast(arg, dt.json), op.dtype)
    return self.if_(is_array, as_array, NULL)

def visit_UnwrapJSONString(self, op, *, arg):
    """Compile ``UnwrapJSONString``: extract the string at path ``$``.

    Yields NULL unless ``json_type(arg)`` equals ``"VARCHAR"``.
    """
    is_string = self.f.json_type(arg).eq("VARCHAR")
    extracted = self.f.json_extract_string(arg, "$")
    return self.if_(is_string, extracted, NULL)

def visit_UnwrapJSONInt64(self, op, *, arg):
    """Compile ``UnwrapJSONInt64``: cast ``arg`` to ``op.dtype``.

    Yields NULL unless ``json_type(arg)`` is ``"UBIGINT"`` or ``"BIGINT"``.
    """
    is_integer = self.f.json_type(arg).isin("UBIGINT", "BIGINT")
    return self.if_(is_integer, self.cast(arg, op.dtype), NULL)

def visit_UnwrapJSONFloat64(self, op, *, arg):
    """Compile ``UnwrapJSONFloat64``: cast ``arg`` to ``op.dtype``.

    Integer JSON types are accepted alongside ``"DOUBLE"``; any other
    ``json_type`` yields NULL.
    """
    is_numeric = self.f.json_type(arg).isin("UBIGINT", "BIGINT", "DOUBLE")
    return self.if_(is_numeric, self.cast(arg, op.dtype), NULL)

def visit_UnwrapJSONBoolean(self, op, *, arg):
    """Compile ``UnwrapJSONBoolean``: cast ``arg`` to ``op.dtype``.

    Yields NULL unless ``json_type(arg)`` equals ``"BOOLEAN"``.
    """
    is_boolean = self.f.json_type(arg).eq("BOOLEAN")
    return self.if_(is_boolean, self.cast(arg, op.dtype), NULL)

def visit_ArrayConcat(self, op, *, arg):
# TODO(cpcloud): map ArrayConcat to this in sqlglot instead of here
Expand Down
22 changes: 22 additions & 0 deletions ibis/backends/mysql/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -342,3 +342,25 @@ def visit_TimestampAdd(self, op, *, left, right):
this=right.this * 1_000, unit=sge.Var(this="MICROSECOND")
)
return self.f.date_add(left, right, dialect=self.dialect)

def visit_UnwrapJSONString(self, op, *, arg):
    """Compile ``UnwrapJSONString``: ``json_unquote`` the value.

    Yields NULL unless ``json_type(arg)`` equals ``"STRING"``.
    """
    is_string = self.f.json_type(arg).eq("STRING")
    unquoted = self.f.json_unquote(arg)
    return self.if_(is_string, unquoted, NULL)

def visit_UnwrapJSONInt64(self, op, *, arg):
    """Compile ``UnwrapJSONInt64``: cast ``arg`` to ``op.dtype``.

    Yields NULL unless ``json_type(arg)`` equals ``"INTEGER"``.
    """
    is_integer = self.f.json_type(arg).eq("INTEGER")
    as_integer = self.cast(arg, op.dtype)
    return self.if_(is_integer, as_integer, NULL)

def visit_UnwrapJSONFloat64(self, op, *, arg):
    """Compile ``UnwrapJSONFloat64``: cast ``arg`` to ``op.dtype``.

    Both ``"DOUBLE"`` and ``"INTEGER"`` JSON types are accepted; any other
    ``json_type`` yields NULL.
    """
    is_numeric = self.f.json_type(arg).isin("DOUBLE", "INTEGER")
    as_float = self.cast(arg, op.dtype)
    return self.if_(is_numeric, as_float, NULL)

def visit_UnwrapJSONBoolean(self, op, *, arg):
    """Compile ``UnwrapJSONBoolean`` into a 1/0 flag.

    When ``json_type(arg)`` equals ``"BOOLEAN"`` the result is 1 if ``arg``
    compares equal to ``"true"`` and 0 otherwise; any other ``json_type``
    yields NULL.
    """
    is_boolean = self.f.json_type(arg).eq("BOOLEAN")
    as_flag = self.if_(arg.eq("true"), 1, 0)
    return self.if_(is_boolean, as_flag, NULL)
47 changes: 47 additions & 0 deletions ibis/backends/postgres/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,6 +325,53 @@ def visit_StructField(self, op, *, arg, field):
op.dtype,
)

def visit_UnwrapJSONString(self, op, *, arg):
    """Compile ``UnwrapJSONString``: extract the text of a JSON string.

    Yields NULL unless ``json_typeof(arg)`` equals ``"string"``.
    """
    # Passing no additional arguments to a variadic function is spelled
    # with an explicit empty VARIADIC array; see the "Variadic Function
    # Resolution" section in
    # https://www.postgresql.org/docs/current/typeconv-func.html
    no_path = sge.Var(this="VARIADIC ARRAY[]::TEXT[]")
    is_string = self.f.json_typeof(arg).eq("string")
    text = self.f.json_extract_path_text(arg, no_path)
    return self.if_(is_string, text, NULL)

def visit_UnwrapJSONInt64(self, op, *, arg):
    """Compile ``UnwrapJSONInt64``: cast integer-looking JSON numbers.

    The JSON value is rendered as text via ``json_extract_path_text`` with
    an empty path. The text is cast to ``op.dtype`` only when it consists of
    an optional minus sign followed by digits; other numbers (for example
    ones with a fractional part) become NULL. Yields NULL as well unless
    ``json_typeof(arg)`` equals ``"number"``.
    """
    text = self.f.json_extract_path_text(
        arg, sge.Var(this="VARIADIC ARRAY[]::TEXT[]")
    )
    # Accept an optional leading minus sign: the digits-only pattern used
    # previously turned every negative integer into NULL.
    integer_text = self.if_(self.f.regexp_like(text, r"^-?\d+$", "g"), text, NULL)
    return self.if_(
        self.f.json_typeof(arg).eq("number"),
        self.cast(integer_text, op.dtype),
        NULL,
    )

def visit_UnwrapJSONFloat64(self, op, *, arg):
    """Compile ``UnwrapJSONFloat64``: cast a JSON number's text to ``op.dtype``.

    Yields NULL unless ``json_typeof(arg)`` equals ``"number"``.
    """
    no_path = sge.Var(this="VARIADIC ARRAY[]::TEXT[]")
    number_text = self.f.json_extract_path_text(arg, no_path)
    is_number = self.f.json_typeof(arg).eq("number")
    return self.if_(is_number, self.cast(number_text, op.dtype), NULL)

def visit_UnwrapJSONBoolean(self, op, *, arg):
    """Compile ``UnwrapJSONBoolean``: cast a JSON boolean's text to ``op.dtype``.

    Yields NULL unless ``json_typeof(arg)`` equals ``"boolean"``.
    """
    is_boolean = self.f.json_typeof(arg).eq("boolean")
    no_path = sge.Var(this="VARIADIC ARRAY[]::TEXT[]")
    boolean_text = self.f.json_extract_path_text(arg, no_path)
    return self.if_(is_boolean, self.cast(boolean_text, op.dtype), NULL)

def visit_StructColumn(self, op, *, names, values):
    """Compile ``StructColumn`` as a ``row(...)`` call.

    Each value is cast to the field type at the same position in
    ``op.dtype.types``; ``names`` is not used.
    """
    typed_values = (
        self.cast(value, dtype) for value, dtype in zip(values, op.dtype.types)
    )
    return self.f.row(*typed_values)

Expand Down
46 changes: 46 additions & 0 deletions ibis/backends/pyspark/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from pyspark import SparkConf
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.functions import PandasUDFType, pandas_udf
from pyspark.sql.types import BooleanType, DoubleType, LongType, StringType

import ibis.common.exceptions as com
import ibis.config
Expand Down Expand Up @@ -40,6 +41,47 @@
return list(map(util.normalize_filename, source_list))


@pandas_udf(returnType=DoubleType(), functionType=PandasUDFType.SCALAR)
def unwrap_json_float(s: pd.Series) -> pd.Series:
    """Unwrap JSON-encoded numbers, nullifying every other value.

    Each element of ``s`` is parsed with ``json.loads``. The parsed value is
    kept only when its exact type is ``float`` or ``int``; missing inputs
    and values of any other type map to ``None``.
    """
    # NOTE: the Codecov patch report flagged the statements in this body as
    # not covered by tests.
    import json

    import pandas as pd

    def nullify_type_mismatched_value(raw):
        if pd.isna(raw):
            return None

        value = json.loads(raw)
        # exact type check because we want to distinguish between integer
        # and booleans and bool is a subclass of int
        return value if type(value) in (float, int) else None

    return s.map(nullify_type_mismatched_value)

Check warning on line 59 in ibis/backends/pyspark/__init__.py

View check run for this annotation

Codecov / codecov/patch

ibis/backends/pyspark/__init__.py#L59

Added line #L59 was not covered by tests


def unwrap_json(typ):
    """Build a scalar pandas UDF that unwraps JSON values of exactly ``typ``.

    ``typ`` must be ``str``, ``int`` or ``bool``; it selects the Spark
    return type from ``type_mapping`` (any other key raises ``KeyError``).
    The returned UDF parses each element with ``json.loads`` and keeps the
    parsed value only when its exact type is ``typ``; missing inputs and
    mismatched values map to ``None``.
    """
    import json

    import pandas as pd

    type_mapping = {str: StringType(), int: LongType(), bool: BooleanType()}

    @pandas_udf(returnType=type_mapping[typ], functionType=PandasUDFType.SCALAR)
    def unwrap(s: pd.Series) -> pd.Series:
        # NOTE: the Codecov patch report flagged the statements in this body
        # as not covered by tests.
        def nullify_type_mismatched_value(raw):
            if pd.isna(raw):
                return None

            value = json.loads(raw)
            # exact type check because we want to distinguish between integer
            # and booleans and bool is a subclass of int
            return value if type(value) is typ else None

        return s.map(nullify_type_mismatched_value)

    return unwrap


class _PySparkCursor:
"""Spark cursor.

Expand Down Expand Up @@ -252,6 +294,10 @@
spark_udf = pandas_udf(udf_func, udf_return, PandasUDFType.GROUPED_AGG)
self._session.udf.register(udf_name, spark_udf)

for typ in (str, int, bool):
self._session.udf.register(f"unwrap_json_{typ.__name__}", unwrap_json(typ))
self._session.udf.register("unwrap_json_float", unwrap_json_float)

def _register_in_memory_table(self, op: ops.InMemoryTable) -> None:
schema = PySparkSchema.from_ibis(op.schema)
df = self._session.createDataFrame(data=op.data.to_frame(), schema=schema)
Expand Down
4 changes: 4 additions & 0 deletions ibis/backends/pyspark/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,10 @@ class PySparkCompiler(SQLGlotCompiler):
ops.MapMerge: "map_concat",
ops.MapKeys: "map_keys",
ops.MapValues: "map_values",
ops.UnwrapJSONString: "unwrap_json_str",
ops.UnwrapJSONInt64: "unwrap_json_int",
ops.UnwrapJSONFloat64: "unwrap_json_float",
ops.UnwrapJSONBoolean: "unwrap_json_bool",
}

def _aggregate(self, funcname: str, *args, where):
Expand Down
12 changes: 12 additions & 0 deletions ibis/backends/snowflake/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,18 @@
def visit_ToJSONArray(self, op, *, arg):
    """Compile ``ToJSONArray``: pass ``arg`` through when ``is_array(arg)`` holds, else NULL."""
    is_array = self.f.is_array(arg)
    return self.if_(is_array, arg, NULL)

def visit_UnwrapJSONString(self, op, *, arg):
    """Compile ``UnwrapJSONString``: ``as_varchar(arg)`` when ``is_varchar(arg)`` holds, else NULL."""
    is_string = self.f.is_varchar(arg)
    unwrapped = self.f.as_varchar(arg)
    return self.if_(is_string, unwrapped, NULL)

Check warning on line 183 in ibis/backends/snowflake/compiler.py

View check run for this annotation

Codecov / codecov/patch

ibis/backends/snowflake/compiler.py#L183

Added line #L183 was not covered by tests

def visit_UnwrapJSONInt64(self, op, *, arg):
    """Compile ``UnwrapJSONInt64``: ``as_integer(arg)`` when ``is_integer(arg)`` holds, else NULL."""
    is_integer = self.f.is_integer(arg)
    unwrapped = self.f.as_integer(arg)
    return self.if_(is_integer, unwrapped, NULL)

Check warning on line 186 in ibis/backends/snowflake/compiler.py

View check run for this annotation

Codecov / codecov/patch

ibis/backends/snowflake/compiler.py#L186

Added line #L186 was not covered by tests

def visit_UnwrapJSONFloat64(self, op, *, arg):
    """Compile ``UnwrapJSONFloat64``: ``as_double(arg)`` when ``is_double(arg)`` holds, else NULL."""
    is_double = self.f.is_double(arg)
    unwrapped = self.f.as_double(arg)
    return self.if_(is_double, unwrapped, NULL)

Check warning on line 189 in ibis/backends/snowflake/compiler.py

View check run for this annotation

Codecov / codecov/patch

ibis/backends/snowflake/compiler.py#L189

Added line #L189 was not covered by tests

def visit_UnwrapJSONBoolean(self, op, *, arg):
    """Compile ``UnwrapJSONBoolean``: ``as_boolean(arg)`` when ``is_boolean(arg)`` holds, else NULL."""
    is_boolean = self.f.is_boolean(arg)
    unwrapped = self.f.as_boolean(arg)
    return self.if_(is_boolean, unwrapped, NULL)

Check warning on line 192 in ibis/backends/snowflake/compiler.py

View check run for this annotation

Codecov / codecov/patch

ibis/backends/snowflake/compiler.py#L192

Added line #L192 was not covered by tests

def visit_IsNan(self, op, *, arg):
    """Compile ``IsNan`` as an equality comparison of ``arg`` with ``self.NAN``."""
    is_nan = arg.eq(self.NAN)
    return is_nan

Expand Down
Loading
Loading