From 30276b2af11fdd36d848f3a4be6e83cbcad3bf6e Mon Sep 17 00:00:00 2001 From: Leon Luttenberger Date: Wed, 26 Jun 2024 18:01:10 -0500 Subject: [PATCH] fix: pyarrow backend trimming timestamp to date (#2875) --- awswrangler/_data_types.py | 2 +- tests/_utils.py | 29 ++++++++++--------- .../unit/test_pandas_pyarrow_dtype_backend.py | 2 +- 3 files changed, 17 insertions(+), 16 deletions(-) diff --git a/awswrangler/_data_types.py b/awswrangler/_data_types.py index b8cf9ec18..756019b95 100644 --- a/awswrangler/_data_types.py +++ b/awswrangler/_data_types.py @@ -369,7 +369,7 @@ def athena2pandas(dtype: str, dtype_backend: str | None = None) -> str: # noqa: if (dtype == "string") or dtype.startswith("char") or dtype.startswith("varchar"): return "string" if dtype_backend != "pyarrow" else "string[pyarrow]" if dtype in ("timestamp", "timestamp with time zone"): - return "datetime64" if dtype_backend != "pyarrow" else "date64[pyarrow]" + return "datetime64" if dtype_backend != "pyarrow" else "timestamp[ns][pyarrow]" if dtype == "date": return "date" if dtype_backend != "pyarrow" else "date32[pyarrow]" if dtype.startswith("decimal"): diff --git a/tests/_utils.py b/tests/_utils.py index 6521016f1..dcb467dbd 100644 --- a/tests/_utils.py +++ b/tests/_utils.py @@ -334,31 +334,32 @@ def get_df_dtype_backend(dtype_backend: Literal["numpy_nullable", "pyarrow"] = " "int32_nullable": [1, None, 3], "int64_nullable": [1, None, 3], "float_nullable": [0.0, None, 2.2], - # "bool_nullable": [True, None, False], + "bool_nullable": [True, None, False], "string_nullable": ["Washington", None, "Seattle"], - # "date_nullable": [dt("2020-01-01"), None, dt("2020-01-02")], - # "timestamp_nullable": [ts("2020-01-01 00:00:00.0"), None, ts("2020-01-02 00:00:01.0")], + "date_nullable": [dt("2020-01-01"), None, dt("2020-01-02")], + "timestamp_nullable": [ts("2020-01-01 00:00:00.0"), None, ts("2020-01-02 00:00:01.0")], } ) if dtype_backend == "numpy_nullable": - df["int8_nullable"] = df["int8_nullable"].astype("Int8") - df["int16_nullable"] = df["int16_nullable"].astype("Int16") - df["int32_nullable"] = df["int32_nullable"].astype("Int32") - df["int64_nullable"] = df["int64_nullable"].astype("Int64") - df["float_nullable"] = df["float_nullable"].astype("Float64") - # df["bool_nullable"] = df["bool_nullable"].astype("boolean") - # df["date_nullable"] = df["date_nullable"].astype("string[python]") - df["string_nullable"] = df["string_nullable"].astype("string[python]") + df["int8_nullable"] = df["int8_nullable"].astype(pd.Int8Dtype()) + df["int16_nullable"] = df["int16_nullable"].astype(pd.Int16Dtype()) + df["int32_nullable"] = df["int32_nullable"].astype(pd.Int32Dtype()) + df["int64_nullable"] = df["int64_nullable"].astype(pd.Int64Dtype()) + df["float_nullable"] = df["float_nullable"].astype(pd.Float64Dtype()) + df["bool_nullable"] = df["bool_nullable"].astype(pd.BooleanDtype()) + df["string_nullable"] = df["string_nullable"].astype(pd.StringDtype()) + df["timestamp_nullable"] = df["timestamp_nullable"].astype(pd.DatetimeTZDtype()) + df["date_nullable"] = df["date_nullable"].astype(pd.StringDtype()) elif dtype_backend == "pyarrow": df["int8_nullable"] = df["int8_nullable"].astype(pd.ArrowDtype(pa.int8())) df["int16_nullable"] = df["int16_nullable"].astype(pd.ArrowDtype(pa.int16())) df["int32_nullable"] = df["int32_nullable"].astype(pd.ArrowDtype(pa.int32())) df["int64_nullable"] = df["int64_nullable"].astype(pd.ArrowDtype(pa.int64())) df["float_nullable"] = df["float_nullable"].astype(pd.ArrowDtype(pa.float64())) - # df["bool_nullable"] = df["bool_nullable"].astype(pd.ArrowDtype(pa.bool_())) - # df["date_nullable"] = df["date_nullable"].astype(pd.ArrowDtype(pa.string())) + df["bool_nullable"] = df["bool_nullable"].astype(pd.ArrowDtype(pa.bool_())) df["string_nullable"] = df["string_nullable"].astype(pd.ArrowDtype(pa.string())) - # df["timestamp_nullable"] = df["timestamp_nullable"].astype("date64[ms][pyarrow]") + df["date_nullable"] = df["date_nullable"].astype(pd.ArrowDtype(pa.date32())) + df["timestamp_nullable"] = df["timestamp_nullable"].astype(pd.ArrowDtype(pa.timestamp("ns"))) else: raise ValueError(f"Unknown dtype_backend: {dtype_backend}") return df diff --git a/tests/unit/test_pandas_pyarrow_dtype_backend.py b/tests/unit/test_pandas_pyarrow_dtype_backend.py index b54adb8b4..a1541b840 100644 --- a/tests/unit/test_pandas_pyarrow_dtype_backend.py +++ b/tests/unit/test_pandas_pyarrow_dtype_backend.py @@ -105,7 +105,7 @@ def test_athena_csv_dtype_backend( df["string_nullable"] = df["string_nullable"].astype("string[pyarrow]") if ctas_approach or unload_approach: - df2["string_nullable"].replace("", pa.NA, inplace=True) + df2["string_nullable"] = df2["string_nullable"].replace("", pa.NA) assert_pandas_equals(df, df2)