feat(pyspark): add json string unwrap implementation
cpcloud committed Apr 13, 2024
1 parent d13c533 commit 7d762f2
Showing 4 changed files with 25 additions and 1 deletion.
19 changes: 19 additions & 0 deletions ibis/backends/pyspark/__init__.py
@@ -11,6 +11,7 @@
from pyspark import SparkConf
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.functions import PandasUDFType, pandas_udf
from pyspark.sql.types import StringType

import ibis.common.exceptions as com
import ibis.config
@@ -40,6 +41,22 @@ def normalize_filenames(source_list):
    return list(map(util.normalize_filename, source_list))


@pandas_udf(StringType(), PandasUDFType.SCALAR)
def unwrap_json(s: pd.Series) -> pd.Series:
    import json

    import pandas as pd

    def nullify_non_string(raw):
        if pd.isna(raw):
            return None

        value = json.loads(raw)
        return value if isinstance(value, str) else None

    return s.map(nullify_non_string)


class _PySparkCursor:
    """Spark cursor.
@@ -252,6 +269,8 @@ def _register_udfs(self, expr: ir.Expr) -> None:
            spark_udf = pandas_udf(udf_func, udf_return, PandasUDFType.GROUPED_AGG)
            self._session.udf.register(udf_name, spark_udf)

        self._session.udf.register("unwrap_json", unwrap_json)

    def _register_in_memory_table(self, op: ops.InMemoryTable) -> None:
        schema = PySparkSchema.from_ibis(op.schema)
        df = self._session.createDataFrame(data=op.data.to_frame(), schema=schema)
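For readers skimming the diff, here is a minimal standalone sketch of the unwrap_json semantics, using only pandas and the standard json module (the helper name and sample values are illustrative, not part of the commit): payloads that decode to JSON strings are unwrapped, and everything else, including missing values, maps to null.

import json

import pandas as pd


def unwrap_json_value(raw):
    # Mirrors the UDF's nullify_non_string helper: keep the decoded value
    # only when the JSON payload is a string, otherwise return null.
    if pd.isna(raw):
        return None
    value = json.loads(raw)
    return value if isinstance(value, str) else None


series = pd.Series(['"a"', '""', "42", "null", "[]", None])
print(series.map(unwrap_json_value).tolist())
# ['a', '', None, None, None, None]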
1 change: 1 addition & 0 deletions ibis/backends/pyspark/compiler.py
@@ -77,6 +77,7 @@ class PySparkCompiler(SQLGlotCompiler):
        ops.MapMerge: "map_concat",
        ops.MapKeys: "map_keys",
        ops.MapValues: "map_values",
        ops.UnwrapJSONString: "unwrap_json",
    }

    def _aggregate(self, funcname: str, *args, where):
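As a hedged illustration of what this one-line mapping buys (the expression and rendered SQL below are an approximation, not output captured from the commit): routing ops.UnwrapJSONString through simple_ops makes the PySpark compiler emit a plain function call to the unwrap_json UDF that the backend registers at session setup.

import ibis

# Unbound table purely for compilation; no Spark session is required to
# inspect the generated SQL.
t = ibis.table({"js": "json"}, name="json_t")
expr = t.js.string.name("res")

# With the simple_ops entry above this should render roughly as
#   SELECT UNWRAP_JSON(`js`) AS `res` FROM `json_t`
print(ibis.to_sql(expr, dialect="pyspark"))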
4 changes: 4 additions & 0 deletions ibis/backends/tests/data.py
@@ -100,6 +100,10 @@
"null",
"[42,47,55]",
"[]",
'"a"',
'""',
'"b"',
None,
]
}
)
2 changes: 1 addition & 1 deletion ibis/backends/tests/test_json.py
@@ -103,7 +103,7 @@ def test_json_array(backend, json_t):

@pytest.mark.notimpl(["dask", "pandas", "risingwave"])
@pytest.mark.notyet(
-    ["pyspark", "trino", "flink"], reason="should work but doesn't deserialize JSON"
+    ["trino", "flink"], reason="should work but doesn't deserialize JSON"
)
def test_json_string(backend, json_t):
    expr = json_t.js.string.name("res")
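With the UDF registered and the compiler mapping in place, the pyspark marker can be dropped from notyet. A hedged end-to-end check outside the test suite might look like the following (table and column names are illustrative, a local Spark session with pyarrow installed is assumed, and unwrap_json is imported from the backend module even though it is not a public API):

from pyspark.sql import SparkSession

from ibis.backends.pyspark import unwrap_json  # module-level UDF added above

spark = SparkSession.builder.master("local[1]").getOrCreate()
spark.udf.register("unwrap_json", unwrap_json)

df = spark.createDataFrame([('"a"',), ("[42,47,55]",), (None,)], ["js"])
df.createOrReplaceTempView("json_t")

# Only the first row holds a JSON string, so the other two come back NULL.
spark.sql("SELECT unwrap_json(js) AS res FROM json_t").show()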
