From ba93e4ff0e1b3d30f87dc997c2cd56d0ac8a7380 Mon Sep 17 00:00:00 2001 From: Stefan Kandic Date: Fri, 24 May 2024 12:31:44 +0200 Subject: [PATCH 1/5] initial fix --- python/pyspark/sql/types.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py index b9db59e0a58ac..4815e4360f873 100644 --- a/python/pyspark/sql/types.py +++ b/python/pyspark/sql/types.py @@ -747,8 +747,8 @@ def jsonValue(self) -> Dict[str, Any]: def fromJson( cls, json: Dict[str, Any], - fieldPath: str, - collationsMap: Optional[Dict[str, str]], + fieldPath: str = "", + collationsMap: Optional[Dict[str, str]] = None, ) -> "ArrayType": elementType = _parse_datatype_json_value( json["elementType"], fieldPath + ".element", collationsMap @@ -887,8 +887,8 @@ def jsonValue(self) -> Dict[str, Any]: def fromJson( cls, json: Dict[str, Any], - fieldPath: str, - collationsMap: Optional[Dict[str, str]], + fieldPath: str = "", + collationsMap: Optional[Dict[str, str]] = None, ) -> "MapType": keyType = _parse_datatype_json_value(json["keyType"], fieldPath + ".key", collationsMap) valueType = _parse_datatype_json_value( From c1bd4acc3f6c8351b3d9dd15a376158b512ab9b0 Mon Sep 17 00:00:00 2001 From: Stefan Kandic Date: Tue, 9 Jul 2024 11:38:20 +0200 Subject: [PATCH 2/5] add test for array and map deser --- python/pyspark/sql/tests/test_types.py | 29 ++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/python/pyspark/sql/tests/test_types.py b/python/pyspark/sql/tests/test_types.py index 4810cf40e2315..c5a8cbaf39677 100644 --- a/python/pyspark/sql/tests/test_types.py +++ b/python/pyspark/sql/tests/test_types.py @@ -817,6 +817,35 @@ def test_schema_with_collations_on_non_string_types(self): PySparkTypeError, lambda: _parse_datatype_json_string(collations_in_nested_map_json) ) + def test_array_type_from_json(self): + arrayWithoutCollations = ArrayType(StringType(), True) + arrayWithCollations = ArrayType(StringType("UNICODE"), True) + array_json = '{"type": "array", "elementType": "string", "containsNull": true}' + collationsMap = {".element": "UNICODE"} + + self.assertEqual(arrayWithoutCollations, ArrayType.fromJson(array_json)) + self.assertEqual( + arrayWithCollations, + ArrayType.fromJson(array_json, fieldPath="", collationsMap=collationsMap), + ) + self.assertEqual( + arrayWithCollations, ArrayType.fromJson(array_json, collationsMap=collationsMap) + ) + + def test_map_type_from_json(self): + mapWithoutCollations = MapType(StringType(), StringType(), True) + mapWithCollations = MapType(StringType("UNICODE"), StringType("UNICODE"), True) + map_json = ( + '{"type": "map", "keyType": "string", "valueType": "string", "valueContainsNull": true}' + ) + collationsMap = {".key": "UNICODE", ".value": "UNICODE"} + + self.assertEqual(mapWithoutCollations, MapType.fromJson(map_json)) + self.assertEqual( + mapWithCollations, MapType.fromJson(map_json, fieldPath="", collationsMap=collationsMap) + ) + self.assertEqual(mapWithCollations, MapType.fromJson(map_json, collationsMap=collationsMap)) + def test_schema_with_bad_collations_provider(self): from pyspark.sql.types import _parse_datatype_json_string, _COLLATIONS_METADATA_KEY From 16aeecc1de60d65d5052e46272ec895f866ec05a Mon Sep 17 00:00:00 2001 From: Stefan Kandic Date: Tue, 9 Jul 2024 13:07:56 +0200 Subject: [PATCH 3/5] fix tests --- python/pyspark/sql/tests/test_types.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/python/pyspark/sql/tests/test_types.py b/python/pyspark/sql/tests/test_types.py index c5a8cbaf39677..2f49956cc2640 100644 --- a/python/pyspark/sql/tests/test_types.py +++ b/python/pyspark/sql/tests/test_types.py @@ -820,7 +820,7 @@ def test_schema_with_collations_on_non_string_types(self): def test_array_type_from_json(self): arrayWithoutCollations = ArrayType(StringType(), True) arrayWithCollations = ArrayType(StringType("UNICODE"), True) - array_json = '{"type": "array", "elementType": "string", "containsNull": true}' + array_json = {"type": "array", "elementType": "string", "containsNull": "true"} collationsMap = {".element": "UNICODE"} self.assertEqual(arrayWithoutCollations, ArrayType.fromJson(array_json)) @@ -835,9 +835,7 @@ def test_array_type_from_json(self): def test_map_type_from_json(self): mapWithoutCollations = MapType(StringType(), StringType(), True) mapWithCollations = MapType(StringType("UNICODE"), StringType("UNICODE"), True) - map_json = ( - '{"type": "map", "keyType": "string", "valueType": "string", "valueContainsNull": true}' - ) + map_json = {"type": "map", "keyType": "string", "valueType": "string", "valueContainsNull": "true"} collationsMap = {".key": "UNICODE", ".value": "UNICODE"} self.assertEqual(mapWithoutCollations, MapType.fromJson(map_json)) From 253eabbe6ff03041dde0b3c8642c8ae3e80cd1f6 Mon Sep 17 00:00:00 2001 From: Stefan Kandic Date: Tue, 9 Jul 2024 15:14:52 +0200 Subject: [PATCH 4/5] reformat python --- python/pyspark/sql/tests/test_types.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/python/pyspark/sql/tests/test_types.py b/python/pyspark/sql/tests/test_types.py index 2f49956cc2640..0bb74259f82f9 100644 --- a/python/pyspark/sql/tests/test_types.py +++ b/python/pyspark/sql/tests/test_types.py @@ -820,7 +820,7 @@ def test_schema_with_collations_on_non_string_types(self): def test_array_type_from_json(self): arrayWithoutCollations = ArrayType(StringType(), True) arrayWithCollations = ArrayType(StringType("UNICODE"), True) - array_json = {"type": "array", "elementType": "string", "containsNull": "true"} + array_json = {"type": "array", "elementType": "string", "containsNull": True} collationsMap = {".element": "UNICODE"} self.assertEqual(arrayWithoutCollations, ArrayType.fromJson(array_json)) @@ -835,7 +835,12 @@ def test_array_type_from_json(self): def test_map_type_from_json(self): mapWithoutCollations = MapType(StringType(), StringType(), True) mapWithCollations = MapType(StringType("UNICODE"), StringType("UNICODE"), True) - map_json = {"type": "map", "keyType": "string", "valueType": "string", "valueContainsNull": "true"} + map_json = { + "type": "map", + "keyType": "string", + "valueType": "string", + "valueContainsNull": True, + } collationsMap = {".key": "UNICODE", ".value": "UNICODE"} self.assertEqual(mapWithoutCollations, MapType.fromJson(map_json)) From 0ef8c3e2aa32f526fc71e69e49b10d6ffad3179e Mon Sep 17 00:00:00 2001 From: Stefan Kandic Date: Tue, 9 Jul 2024 15:45:13 +0200 Subject: [PATCH 5/5] special case arr/map types --- python/pyspark/sql/tests/test_types.py | 4 ++-- python/pyspark/sql/types.py | 10 +++++++--- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/python/pyspark/sql/tests/test_types.py b/python/pyspark/sql/tests/test_types.py index 0bb74259f82f9..420f2a887e3b7 100644 --- a/python/pyspark/sql/tests/test_types.py +++ b/python/pyspark/sql/tests/test_types.py @@ -821,7 +821,7 @@ def test_array_type_from_json(self): arrayWithoutCollations = ArrayType(StringType(), True) arrayWithCollations = ArrayType(StringType("UNICODE"), True) array_json = {"type": "array", "elementType": "string", "containsNull": True} - collationsMap = {".element": "UNICODE"} + collationsMap = {"element": "UNICODE"} self.assertEqual(arrayWithoutCollations, ArrayType.fromJson(array_json)) self.assertEqual( @@ -841,7 +841,7 @@ def test_map_type_from_json(self): "valueType": "string", "valueContainsNull": True, } - collationsMap = {".key": "UNICODE", ".value": "UNICODE"} + collationsMap = {"key": "UNICODE", "value": "UNICODE"} self.assertEqual(mapWithoutCollations, MapType.fromJson(map_json)) self.assertEqual( diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py index 7ebd2739500f7..d4286afc1b03c 100644 --- a/python/pyspark/sql/types.py +++ b/python/pyspark/sql/types.py @@ -775,7 +775,9 @@ def fromJson( collationsMap: Optional[Dict[str, str]] = None, ) -> "ArrayType": elementType = _parse_datatype_json_value( - json["elementType"], fieldPath + ".element", collationsMap + json["elementType"], + "element" if fieldPath == "" else fieldPath + ".element", + collationsMap, ) return ArrayType(elementType, json["containsNull"]) @@ -914,9 +916,11 @@ def fromJson( fieldPath: str = "", collationsMap: Optional[Dict[str, str]] = None, ) -> "MapType": - keyType = _parse_datatype_json_value(json["keyType"], fieldPath + ".key", collationsMap) + keyType = _parse_datatype_json_value( + json["keyType"], "key" if fieldPath == "" else fieldPath + ".key", collationsMap + ) valueType = _parse_datatype_json_value( - json["valueType"], fieldPath + ".value", collationsMap + json["valueType"], "value" if fieldPath == "" else fieldPath + ".value", collationsMap ) return MapType( keyType,