Skip to content

Commit

Permalink
Spark 4: Handle ANSI mode in sort_test.py (#11099)
Browse files Browse the repository at this point in the history
* Spark 4: Handle ANSI mode in sort_test.py

Fixes #11027.

With ANSI mode enabled (like the default in Spark 4), one sees that some
tests in `sort_test.py` fail, because they expect ANSI mode to be off.

This commit disables running those tests with ANSI enabled, and add a
separate test for ANSI on/off.

Signed-off-by: MithunR <mithunr@nvidia.com>

* Refactored not to use disable_ansi_mode.

These tests need not be revisited.  They test all combinations of ANSI mode,
including overflow failures.

Signed-off-by: MithunR <mithunr@nvidia.com>

---------

Signed-off-by: MithunR <mithunr@nvidia.com>
  • Loading branch information
mythrocks authored Jul 1, 2024
1 parent f56fe2c commit 850365c
Showing 1 changed file with 52 additions and 14 deletions.
66 changes: 52 additions & 14 deletions integration_tests/src/main/python/sort_test.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2023, NVIDIA CORPORATION.
# Copyright (c) 2020-2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand All @@ -14,7 +14,7 @@

import pytest

from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_fallback_collect
from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_error, assert_gpu_fallback_collect
from conftest import is_not_utc
from data_gen import *
from marks import allow_non_gpu
Expand Down Expand Up @@ -224,29 +224,67 @@ def test_multi_orderby_with_limit_single_part(data_gen):
assert_gpu_and_cpu_are_equal_collect(
lambda spark : binary_op_df(spark, data_gen).coalesce(1).orderBy(f.col('a'), f.col('b').desc()).limit(100))


# We are not trying all possibilities, just doing a few with numbers so the query works.
@pytest.mark.parametrize('data_gen', [byte_gen, long_gen, float_gen], ids=idfn)
def test_orderby_with_processing(data_gen):
@pytest.mark.parametrize('is_ansi_enabled', [False, True])
@pytest.mark.parametrize('data_gen', [ByteGen, LongGen, FloatGen], ids=idfn)
def test_orderby_with_processing(data_gen, is_ansi_enabled):
"""
Tests the cases where arithmetic overflows don't occur, in ANSI mode.
Overflow exceptions are tested in test_orderby_with_ansi_overflow_exceptions.
"""
conf = {'spark.sql.ansi.enabled': is_ansi_enabled}
gen = data_gen(min_val=0) if (is_ansi_enabled and data_gen != FloatGen) else data_gen()
assert_gpu_and_cpu_are_equal_collect(
# avoid ambiguity in the order by statement for floating point by including a as a backup ordering column
lambda spark : unary_op_df(spark, data_gen).orderBy(f.lit(100) - f.col('a'), f.col('a')))
# avoid ambiguity in the order by statement for floating point by including `a` as a backup ordering column
lambda spark: unary_op_df(spark, gen).orderBy(f.lit(100) - f.col('a'), f.col('a')),
conf=conf)


@pytest.mark.parametrize('data_gen', [long_gen], ids=idfn)
def test_orderby_with_ansi_overflow_exceptions(data_gen):
"""
Test to check that ANSI mode is honoured when there's an order-by with a subtraction expression.
With ANSI mode enabled, the subtraction will overflow, causing an ArithmeticException.
"""
def test_function(spark):
return unary_op_df(spark, data_gen).orderBy(f.lit(100) - f.col('a'), f.col('a'))

assert_gpu_and_cpu_error(lambda spark: test_function(spark).collect(),
conf=ansi_enabled_conf,
error_message='ArithmeticException')


# We are not trying all possibilities, just doing a few with numbers so the query works.
@pytest.mark.parametrize('data_gen', [byte_gen, long_gen, float_gen], ids=idfn)
def test_orderby_with_processing_and_limit(data_gen):
@pytest.mark.parametrize('is_ansi_enabled', [False, True])
@pytest.mark.parametrize('data_gen', [ByteGen, LongGen, FloatGen], ids=idfn)
def test_orderby_with_processing_and_limit(data_gen, is_ansi_enabled):
"""
Tests the cases where arithmetic overflows don't occur, in ANSI mode.
Overflow exceptions are tested in test_orderby_with_ansi_overflow_exceptions.
"""
conf = {'spark.sql.ansi.enabled': is_ansi_enabled}
gen = data_gen(min_val=0) if (is_ansi_enabled and data_gen != FloatGen) else data_gen()
assert_gpu_and_cpu_are_equal_collect(
# avoid ambiguity in the order by statement for floating point by including a as a backup ordering column
lambda spark : unary_op_df(spark, data_gen).orderBy(f.lit(100) - f.col('a'), f.col('a')).limit(100))
# avoid ambiguity in the order by statement for floating point by including a as a backup ordering column
lambda spark: unary_op_df(spark, gen).orderBy(f.lit(100) - f.col('a'), f.col('a')).limit(100), conf=conf)


# We are not trying all possibilities, just doing a few with numbers so the query works.
@pytest.mark.parametrize('is_ansi_enabled', [False, True])
@pytest.mark.parametrize('data_gen', [StructGen([('child0', long_gen)])], ids=idfn)
def test_single_nested_orderby_with_processing_and_limit(data_gen):
def test_single_nested_orderby_with_processing_and_limit(data_gen, is_ansi_enabled):
"""
Tests the cases where arithmetic overflows don't occur, in ANSI mode.
Overflow exceptions are tested in test_orderby_with_ansi_overflow_exceptions.
"""
conf = {'spark.sql.ansi.enabled': is_ansi_enabled}
data_gen = StructGen([('child0', LongGen(min_val=0) if is_ansi_enabled else LongGen())])
assert_gpu_and_cpu_are_equal_collect(
# avoid ambiguity in the order by statement for floating point by including a as a backup ordering column
lambda spark : unary_op_df(spark, data_gen)\
.orderBy(f.struct(f.lit(100) - f.col('a.child0')), f.col('a'))\
.limit(100))
lambda spark: unary_op_df(spark, data_gen)\
.orderBy(f.struct(f.lit(100) - f.col('a.child0')), f.col('a')).limit(100),
conf=conf)

# We are not trying all possibilities, just doing a few with numbers so the query works.
@pytest.mark.parametrize('data_gen', [byte_gen, long_gen, float_gen], ids=idfn)
Expand Down

0 comments on commit 850365c

Please sign in to comment.