From 386260112c4f1b97a16add62672415c0bf7a727c Mon Sep 17 00:00:00 2001
From: rjzamora
Date: Mon, 29 Jul 2024 10:55:25 -0700
Subject: [PATCH 1/3] adjust tests for 24.04

---
 nvtabular/ops/categorify.py   |  8 +++++++-
 tests/unit/ops/test_lambda.py |  2 +-
 tests/unit/test_dask_nvt.py   | 10 ++++++----
 3 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/nvtabular/ops/categorify.py b/nvtabular/ops/categorify.py
index 4ebc878621a..c7e0db5786a 100644
--- a/nvtabular/ops/categorify.py
+++ b/nvtabular/ops/categorify.py
@@ -39,6 +39,7 @@
 from dask.highlevelgraph import HighLevelGraph
 from dask.utils import parse_bytes
 from fsspec.core import get_fs_token_paths
+from packaging.version import Version
 
 from merlin.core import dispatch
 from merlin.core.dispatch import DataFrameType, annotate, is_cpu_object, nullable_series
@@ -53,6 +54,7 @@
 PAD_OFFSET = 0
 NULL_OFFSET = 1
 OOV_OFFSET = 2
+PA_GE_14 = Version(pa.__version__) >= Version("14.0")
 
 
 class Categorify(StatOperator):
@@ -907,7 +909,11 @@ def _general_concat(
 ):
     # Concatenate DataFrame or pa.Table objects
     if isinstance(frames[0], pa.Table):
-        df = pa.concat_tables(frames, promote=True)
+        if PA_GE_14:
+            df = pa.concat_tables(frames, promote_options="default")
+        else:
+            df = pa.concat_tables(frames, promote=True)
+
         if (
             cardinality_memory_limit
             and col_selector is not None
diff --git a/tests/unit/ops/test_lambda.py b/tests/unit/ops/test_lambda.py
index d50f5d23791..e864fe3b6a7 100644
--- a/tests/unit/ops/test_lambda.py
+++ b/tests/unit/ops/test_lambda.py
@@ -227,7 +227,7 @@ def test_lambdaop_dtype_multi_op_propagation(cpu):
         {
             "a": np.arange(size),
             "b": np.random.choice(["apple", "banana", "orange"], size),
-            "c": np.random.choice([0, 1], size).astype(np.float16),
+            "c": np.random.choice([0, 1], size),
         }
     )
     ddf0 = dd.from_pandas(df0, npartitions=4)
diff --git a/tests/unit/test_dask_nvt.py b/tests/unit/test_dask_nvt.py
index 10b9ed93234..986d63d776b 100644
--- a/tests/unit/test_dask_nvt.py
+++ b/tests/unit/test_dask_nvt.py
@@ -175,10 +175,12 @@ def test_dask_groupby_stats(client, tmpdir, datasets, part_mem_fraction):
     gb_e = expect.groupby("name-cat").aggregate({"name-cat": "count", "x": ["sum", "min", "std"]})
     gb_e.columns = ["count", "sum", "min", "std"]
     df_check = got.merge(gb_e, left_on="name-cat", right_index=True, how="left")
-    assert_eq(df_check["name-cat_count"], df_check["count"], check_names=False)
-    assert_eq(df_check["name-cat_x_sum"], df_check["sum"], check_names=False)
-    assert_eq(df_check["name-cat_x_min"], df_check["min"], check_names=False)
-    assert_eq(df_check["name-cat_x_std"], df_check["std"].astype("float32"), check_names=False)
+    # Names and dtypes don't need to match (just values)
+    options = {"check_names": False, "check_dtype": False}
+    assert_eq(df_check["name-cat_count"], df_check["count"], **options)
+    assert_eq(df_check["name-cat_x_sum"], df_check["sum"], **options)
+    assert_eq(df_check["name-cat_x_min"], df_check["min"], **options)
+    assert_eq(df_check["name-cat_x_std"], df_check["std"], **options)
 
 
 @pytest.mark.parametrize("part_mem_fraction", [0.01])

From 9bd271b0550a1046709a108294dbd70856d7f37b Mon Sep 17 00:00:00 2001
From: rjzamora
Date: Mon, 29 Jul 2024 11:35:25 -0700
Subject: [PATCH 2/3] make sure dtypes are correct

---
 tests/unit/test_dask_nvt.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/tests/unit/test_dask_nvt.py b/tests/unit/test_dask_nvt.py
index 986d63d776b..82f0d3c987d 100644
--- a/tests/unit/test_dask_nvt.py
+++ b/tests/unit/test_dask_nvt.py
@@ -142,6 +142,8 @@ def test_dask_workflow_api_dlrm(
 
 @pytest.mark.parametrize("part_mem_fraction", [0.01])
 def test_dask_groupby_stats(client, tmpdir, datasets, part_mem_fraction):
+    from nvtabular.ops.join_groupby import AGG_DTYPES
+
     set_dask_client(client=client)
     engine = "parquet"
 
@@ -175,12 +177,14 @@ def test_dask_groupby_stats(client, tmpdir, datasets, part_mem_fraction):
     gb_e = expect.groupby("name-cat").aggregate({"name-cat": "count", "x": ["sum", "min", "std"]})
     gb_e.columns = ["count", "sum", "min", "std"]
     df_check = got.merge(gb_e, left_on="name-cat", right_index=True, how="left")
-    # Names and dtypes don't need to match (just values)
-    options = {"check_names": False, "check_dtype": False}
-    assert_eq(df_check["name-cat_count"], df_check["count"], **options)
-    assert_eq(df_check["name-cat_x_sum"], df_check["sum"], **options)
-    assert_eq(df_check["name-cat_x_min"], df_check["min"], **options)
-    assert_eq(df_check["name-cat_x_std"], df_check["std"], **options)
+    assert_eq(
+        df_check["name-cat_count"], df_check["count"].astype(AGG_DTYPES["count"]), check_names=False
+    )
+    assert_eq(df_check["name-cat_x_sum"], df_check["sum"], check_names=False)
+    assert_eq(df_check["name-cat_x_min"], df_check["min"], check_names=False)
+    assert_eq(
+        df_check["name-cat_x_std"], df_check["std"].astype(AGG_DTYPES["std"]), check_names=False
+    )
 
 
 @pytest.mark.parametrize("part_mem_fraction", [0.01])

From 3886cd6ab9587286740e26e6faf503e5505d9b19 Mon Sep 17 00:00:00 2001
From: rjzamora
Date: Mon, 29 Jul 2024 20:03:00 -0700
Subject: [PATCH 3/3] update constant name

---
 nvtabular/ops/categorify.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/nvtabular/ops/categorify.py b/nvtabular/ops/categorify.py
index c7e0db5786a..9c246b12dbc 100644
--- a/nvtabular/ops/categorify.py
+++ b/nvtabular/ops/categorify.py
@@ -54,7 +54,7 @@
 PAD_OFFSET = 0
 NULL_OFFSET = 1
 OOV_OFFSET = 2
-PA_GE_14 = Version(pa.__version__) >= Version("14.0")
+PYARROW_GE_14 = Version(pa.__version__) >= Version("14.0")
 
 
 class Categorify(StatOperator):
@@ -909,7 +909,7 @@ def _general_concat(
 ):
     # Concatenate DataFrame or pa.Table objects
     if isinstance(frames[0], pa.Table):
-        if PA_GE_14:
+        if PYARROW_GE_14:
             df = pa.concat_tables(frames, promote_options="default")
         else:
             df = pa.concat_tables(frames, promote=True)