Spark 4: Handle ANSI mode in sort_test.py (#11099)

* Spark 4: Handle ANSI mode in sort_test.py Fixes #11027. With ANSI mode enabled (like the default in Spark 4), one sees that some tests in `sort_test.py` fail, because they expect ANSI mode to be off. This commit disables running those tests with ANSI enabled, and add a separate test for ANSI on/off. Signed-off-by: MithunR <mithunr@nvidia.com> * Refactored not to use disable_ansi_mode. These tests need not be revisited. They test all combinations of ANSI mode, including overflow failures. Signed-off-by: MithunR <mithunr@nvidia.com> --------- Signed-off-by: MithunR <mithunr@nvidia.com>
NVIDIA · Jul 1, 2024 · 850365c · 850365c
1 parent f56fe2c
commit 850365c
Showing 1 changed file with 52 additions and 14 deletions.
diff --git a/integration_tests/src/main/python/sort_test.py b/integration_tests/src/main/python/sort_test.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
 
 import pytest
 
-from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_fallback_collect
+from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_error, assert_gpu_fallback_collect
 from conftest import is_not_utc
 from data_gen import *
 from marks import allow_non_gpu
@@ -224,29 +224,67 @@ def test_multi_orderby_with_limit_single_part(data_gen):
     assert_gpu_and_cpu_are_equal_collect(
             lambda spark : binary_op_df(spark, data_gen).coalesce(1).orderBy(f.col('a'), f.col('b').desc()).limit(100))
 
+
 # We are not trying all possibilities, just doing a few with numbers so the query works.
-@pytest.mark.parametrize('data_gen', [byte_gen, long_gen, float_gen], ids=idfn)
-def test_orderby_with_processing(data_gen):
+@pytest.mark.parametrize('is_ansi_enabled', [False, True])
+@pytest.mark.parametrize('data_gen', [ByteGen, LongGen, FloatGen], ids=idfn)
+def test_orderby_with_processing(data_gen, is_ansi_enabled):
+    """
+    Tests the cases where arithmetic overflows don't occur, in ANSI mode.
+    Overflow exceptions are tested in test_orderby_with_ansi_overflow_exceptions.
+    """
+    conf = {'spark.sql.ansi.enabled': is_ansi_enabled}
+    gen = data_gen(min_val=0) if (is_ansi_enabled and data_gen != FloatGen) else data_gen()
     assert_gpu_and_cpu_are_equal_collect(
-            # avoid ambiguity in the order by statement for floating point by including a as a backup ordering column
-            lambda spark : unary_op_df(spark, data_gen).orderBy(f.lit(100) - f.col('a'), f.col('a')))
+            # avoid ambiguity in the order by statement for floating point by including `a` as a backup ordering column
+            lambda spark: unary_op_df(spark, gen).orderBy(f.lit(100) - f.col('a'), f.col('a')),
+            conf=conf)
+
+
+@pytest.mark.parametrize('data_gen', [long_gen], ids=idfn)
+def test_orderby_with_ansi_overflow_exceptions(data_gen):
+    """
+    Test to check that ANSI mode is honoured when there's an order-by with a subtraction expression.
+    With ANSI mode enabled, the subtraction will overflow, causing an ArithmeticException.
+    """
+    def test_function(spark):
+        return unary_op_df(spark, data_gen).orderBy(f.lit(100) - f.col('a'), f.col('a'))
+
+    assert_gpu_and_cpu_error(lambda spark: test_function(spark).collect(),
+                             conf=ansi_enabled_conf,
+                             error_message='ArithmeticException')
+
 
 # We are not trying all possibilities, just doing a few with numbers so the query works.
-@pytest.mark.parametrize('data_gen', [byte_gen, long_gen, float_gen], ids=idfn)
-def test_orderby_with_processing_and_limit(data_gen):
+@pytest.mark.parametrize('is_ansi_enabled', [False, True])
+@pytest.mark.parametrize('data_gen', [ByteGen, LongGen, FloatGen], ids=idfn)
+def test_orderby_with_processing_and_limit(data_gen, is_ansi_enabled):
+    """
+    Tests the cases where arithmetic overflows don't occur, in ANSI mode.
+    Overflow exceptions are tested in test_orderby_with_ansi_overflow_exceptions.
+    """
+    conf = {'spark.sql.ansi.enabled': is_ansi_enabled}
+    gen = data_gen(min_val=0) if (is_ansi_enabled and data_gen != FloatGen) else data_gen()
     assert_gpu_and_cpu_are_equal_collect(
-            # avoid ambiguity in the order by statement for floating point by including a as a backup ordering column
-            lambda spark : unary_op_df(spark, data_gen).orderBy(f.lit(100) - f.col('a'), f.col('a')).limit(100))
+        # avoid ambiguity in the order by statement for floating point by including a as a backup ordering column
+        lambda spark: unary_op_df(spark, gen).orderBy(f.lit(100) - f.col('a'), f.col('a')).limit(100), conf=conf)
 
 
 # We are not trying all possibilities, just doing a few with numbers so the query works.
+@pytest.mark.parametrize('is_ansi_enabled', [False, True])
 @pytest.mark.parametrize('data_gen', [StructGen([('child0', long_gen)])], ids=idfn)
-def test_single_nested_orderby_with_processing_and_limit(data_gen):
+def test_single_nested_orderby_with_processing_and_limit(data_gen, is_ansi_enabled):
+    """
+    Tests the cases where arithmetic overflows don't occur, in ANSI mode.
+    Overflow exceptions are tested in test_orderby_with_ansi_overflow_exceptions.
+    """
+    conf = {'spark.sql.ansi.enabled': is_ansi_enabled}
+    data_gen = StructGen([('child0', LongGen(min_val=0) if is_ansi_enabled else LongGen())])
     assert_gpu_and_cpu_are_equal_collect(
         # avoid ambiguity in the order by statement for floating point by including a as a backup ordering column
-        lambda spark : unary_op_df(spark, data_gen)\
-            .orderBy(f.struct(f.lit(100) - f.col('a.child0')), f.col('a'))\
-            .limit(100))
+        lambda spark: unary_op_df(spark, data_gen)\
+                        .orderBy(f.struct(f.lit(100) - f.col('a.child0')), f.col('a')).limit(100),
+        conf=conf)
 
 # We are not trying all possibilities, just doing a few with numbers so the query works.
 @pytest.mark.parametrize('data_gen', [byte_gen, long_gen, float_gen], ids=idfn)