diff --git a/integration_tests/src/main/python/json_matrix_test.py b/integration_tests/src/main/python/json_matrix_test.py
index 8b9836673e7..3a8415b06cb 100644
--- a/integration_tests/src/main/python/json_matrix_test.py
+++ b/integration_tests/src/main/python/json_matrix_test.py
@@ -123,20 +123,18 @@ def test_json_tuple_allow_comments_off(std_input_path):
 @allow_non_gpu('FileSourceScanExec')
 @pytest.mark.parametrize('read_func', [read_json_df, read_json_sql])
 def test_scan_json_allow_single_quotes_off(std_input_path, read_func, spark_tmp_table_factory):
-    assert_gpu_fallback_collect(
+    assert_gpu_and_cpu_are_equal_collect(
         read_func(std_input_path + '/' + WITH_SQ_FILE,
         WITH_SQ_SCHEMA,
         spark_tmp_table_factory,
         {"allowSingleQuotes": "false"}),
-        'FileSourceScanExec',
         conf=_enable_all_types_json_scan_conf)

 @allow_non_gpu('ProjectExec', TEXT_INPUT_EXEC)
 def test_from_json_allow_single_quotes_off(std_input_path):
     schema = WITH_SQ_SCHEMA
-    assert_gpu_fallback_collect(
+    assert_gpu_and_cpu_are_equal_collect(
         lambda spark : read_json_as_text(spark, std_input_path + '/' + WITH_SQ_FILE, "json").select(f.col('json'), f.from_json(f.col('json'), schema, {'allowSingleQuotes': "false"})),
-        'JsonToStructs',
         conf =_enable_json_to_structs_conf)

 # On is the default so it really needs to work
diff --git a/integration_tests/src/main/python/json_test.py b/integration_tests/src/main/python/json_test.py
index 39eca296bb5..d21e7c46e63 100644
--- a/integration_tests/src/main/python/json_test.py
+++ b/integration_tests/src/main/python/json_test.py
@@ -679,6 +679,53 @@ def test_from_json_map():
         .select(f.from_json(f.col('a'), 'MAP<STRING,STRING>')),
         conf=_enable_all_types_conf)

+@allow_non_gpu(*non_utc_allow)
+def test_from_json_map_with_invalid():
+    # The test here is working around some inconsistencies in how the keys are parsed for maps
+    # on the GPU the keys are dense, but on the CPU they are sparse
+    json_string_gen = StringGen(r'{"a": "[0-9]{0,5}"(, "b": "[A-Z]{0,5}")?}') \
+        .with_special_pattern('', weight=50) \
+        .with_special_pattern(' ', weight=50) \
+        .with_special_pattern('null', weight=50) \
+        .with_special_pattern('invalid', weight=50) \
+        .with_special_pattern(r'{"a": "[0-9]{0,5}"', weight=50) \
+        .with_special_pattern(r'{"a": "[0-9]{0,5}', weight=50) \
+        .with_special_pattern(r'{"a": "[0-9]{0,5}"}abc', weight=50) \
+        .with_special_pattern(r'{"a": "[0-9]{0,5}"}{"b": "B"}', weight=50)
+    assert_gpu_and_cpu_are_equal_collect(
+        lambda spark : unary_op_df(spark, json_string_gen) \
+            .select(f.from_json(f.col('a'), 'MAP<STRING,STRING>')),
+        conf=_enable_all_types_conf)
+
+@allow_non_gpu(*non_utc_allow)
+@pytest.mark.parametrize('allow_single_quotes', ['true', 'false'])
+@pytest.mark.parametrize('allow_non_numeric_numbers', ['true', 'false'])
+@pytest.mark.parametrize('allow_unquoted_chars', ['true', 'false'])
+def test_from_json_map_with_options(allow_single_quotes,
+                                    allow_non_numeric_numbers,
+                                    allow_unquoted_chars):
+    # Test the input with:
+    # - Double quotes
+    # - Single quotes
+    # - Numbers with leading zeros
+    # - Non-numeric numbers
+    # - Unquoted control characters in quoted strings
+    json_string_gen = StringGen(r'{"a": "[0-9]{0,5}"}') \
+        .with_special_pattern(r"""{'a': "[0-9]{0,5}"}""", weight=50) \
+        .with_special_pattern(r'{"a": 0[0-9]{0,5}}', weight=50) \
+        .with_special_pattern(r'{"a": [+-]?(INF|Infinity|NaN)}', weight=50) \
+        .with_special_pattern(r'{"(a|a\r\n\tb)": "(xyz|01\r\n\t23)"}', weight=50)
+    options = {"allowSingleQuotes": allow_single_quotes,
+               # Cannot test `allowNumericLeadingZeros == true` because the GPU output always
+               # has leading zeros while the CPU output does not, so the test would always fail.
+               "allowNumericLeadingZeros": "false",
+               "allowNonNumericNumbers": allow_non_numeric_numbers,
+               "allowUnquotedControlChars": allow_unquoted_chars}
+    assert_gpu_and_cpu_are_equal_collect(
+        lambda spark : unary_op_df(spark, json_string_gen, length=20) \
+            .select(f.from_json(f.col('a'), 'MAP<STRING,STRING>', options)),
+        conf=_enable_all_types_conf)
+
 @allow_non_gpu('ProjectExec', 'JsonToStructs')
 def test_from_json_map_fallback():
     # The test here is working around some inconsistencies in how the keys are parsed for maps
diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/catalyst/json/rapids/GpuJsonScan.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/catalyst/json/rapids/GpuJsonScan.scala
index dbd23c31a78..e6a2d506e37 100644
--- a/sql-plugin/src/main/scala/org/apache/spark/sql/catalyst/json/rapids/GpuJsonScan.scala
+++ b/sql-plugin/src/main/scala/org/apache/spark/sql/catalyst/json/rapids/GpuJsonScan.scala
@@ -90,11 +90,6 @@ object GpuJsonScan {
       meta.willNotWorkOnGpu(s"$op does not support allowUnquotedFieldNames")
     }

-    // {'name': 'Reynold Xin'} turning single quotes off is not supported by CUDF
-    if (!options.allowSingleQuotes) {
-      meta.willNotWorkOnGpu(s"$op does not support disabling allowSingleQuotes")
-    }
-
     // {"name": "Cazen Lee", "price": "$10"} is not supported by CUDF
     if (options.allowBackslashEscapingAnyCharacter) {
       meta.willNotWorkOnGpu(s"$op does not support allowBackslashEscapingAnyCharacter")
diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/GpuJsonToStructs.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/GpuJsonToStructs.scala
index a62aba24760..9c311c1fb73 100644
--- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/GpuJsonToStructs.scala
+++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/GpuJsonToStructs.scala
@@ -84,7 +84,8 @@ case class GpuJsonToStructs(
   override protected def doColumnar(input: GpuColumnVector): cudf.ColumnVector = {
     withResource(new NvtxRange("GpuJsonToStructs", NvtxColor.YELLOW)) { _ =>
       schema match {
-        case _: MapType => JSONUtils.extractRawMapFromJsonString(input.getBase)
+        case _: MapType =>
+          JSONUtils.extractRawMapFromJsonString(input.getBase, jsonOptionBuilder.build())
         case struct: StructType =>
           // if we ever need to support duplicate keys we need to keep track of the duplicates
           // and make the first one null, but I don't think this will ever happen in practice
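For context on what the test changes above now assert, here is a minimal standalone PySpark sketch; it is not part of the patch, and the session setup and column names are illustrative. With allowSingleQuotes set to "false", Spark's CPU parser treats single-quoted records as malformed, and after this change the plugin keeps from_json on the GPU and must produce the same result as the CPU instead of falling back.

# Standalone sketch (not part of the patch) of the behavior the updated tests
# exercise. Runs with or without the RAPIDS Accelerator; with the plugin
# enabled, the GPU result must now match this CPU behavior.
from pyspark.sql import SparkSession
import pyspark.sql.functions as f

spark = SparkSession.builder.getOrCreate()

df = spark.createDataFrame(
    [("{'name': 'Reynold Xin'}",),   # single-quoted: malformed when the option is off
     ('{"name": "Reynold Xin"}',)],  # double-quoted: always valid
    "json STRING")

df.select(
    f.col("json"),
    f.from_json(f.col("json"), "struct<name:string>",
                {"allowSingleQuotes": "false"}).alias("parsed")
).show(truncate=False)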