From d0cfd7b3136f0c10f0dfc74fb063db39ce027a24 Mon Sep 17 00:00:00 2001 From: Santiago Rodriguez Date: Fri, 14 Jun 2024 09:36:54 -0500 Subject: [PATCH 1/3] fix: `tz` attribute check, it was checking `dtype` instead of `dt` This will fix the issue reported here: https://github.com/aws/aws-sdk-pandas/issues/2410 --- awswrangler/_arrow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/awswrangler/_arrow.py b/awswrangler/_arrow.py index cbb1727ca..b8e336a6c 100644 --- a/awswrangler/_arrow.py +++ b/awswrangler/_arrow.py @@ -75,7 +75,7 @@ def _apply_timezone(df: pd.DataFrame, metadata: dict[str, Any]) -> pd.DataFrame: if timezone_str: timezone: datetime.tzinfo = pa.lib.string_to_tzinfo(timezone_str) _logger.debug("applying timezone (%s) on column %s", timezone, col_name) - if hasattr(df[col_name].dtype, "tz") is False: + if hasattr(df[col_name].dt, "tz") is False: df[col_name] = df[col_name].dt.tz_localize(tz="UTC") if timezone is not None and timezone != pytz.UTC and hasattr(df[col_name].dt, "tz_convert"): df[col_name] = df[col_name].dt.tz_convert(tz=timezone) From 98d5761c74a622318b1ef33f1e766fed6a7a8dc9 Mon Sep 17 00:00:00 2001 From: Leon Luttenberger Date: Mon, 24 Jun 2024 11:34:01 -0600 Subject: [PATCH 2/3] fix test_timezone_raw_values --- awswrangler/_arrow.py | 6 +++++- tests/unit/test_s3_parquet.py | 5 ++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/awswrangler/_arrow.py b/awswrangler/_arrow.py index b8e336a6c..796b30ddf 100644 --- a/awswrangler/_arrow.py +++ b/awswrangler/_arrow.py @@ -72,13 +72,17 @@ def _apply_timezone(df: pd.DataFrame, metadata: dict[str, Any]) -> pd.DataFrame: if col_name in df.columns and c["pandas_type"] == "datetimetz": column_metadata: dict[str, Any] = c["metadata"] if c.get("metadata") else {} timezone_str: str | None = column_metadata.get("timezone") + if timezone_str: timezone: datetime.tzinfo = pa.lib.string_to_tzinfo(timezone_str) _logger.debug("applying timezone (%s) on column %s", timezone, col_name) - if hasattr(df[col_name].dt, "tz") is False: + + if hasattr(df[col_name].dt, "tz") is False or df[col_name].dt.tz is None: df[col_name] = df[col_name].dt.tz_localize(tz="UTC") + if timezone is not None and timezone != pytz.UTC and hasattr(df[col_name].dt, "tz_convert"): df[col_name] = df[col_name].dt.tz_convert(tz=timezone) + return df diff --git a/tests/unit/test_s3_parquet.py b/tests/unit/test_s3_parquet.py index 03722671e..c3ea04e9d 100644 --- a/tests/unit/test_s3_parquet.py +++ b/tests/unit/test_s3_parquet.py @@ -645,13 +645,16 @@ def test_timezone_raw_values(path): df["c3"] = pd.to_datetime(datetime(2011, 11, 4, 0, 5, 23, tzinfo=timezone(-timedelta(seconds=14400)))) df["c4"] = pd.to_datetime(datetime(2011, 11, 4, 0, 5, 23, tzinfo=timezone(timedelta(hours=-8)))) wr.s3.to_parquet(partition_cols=["par"], df=df, path=path, dataset=True, sanitize_columns=False) + df2 = wr.s3.read_parquet(path, dataset=True, use_threads=False, pyarrow_additional_kwargs={"ignore_metadata": True}) + # Use pandas to read because of Modin "Internal Error: Internal and external indices on axis 1 do not match." import pandas - df3 = pandas.read_parquet(path) + df2["par"] = df2["par"].astype("string") df3["par"] = df3["par"].astype("string") + assert_pandas_equals(df2, df3) From fa68914df9483c7091260d196ddadadf6b97f330 Mon Sep 17 00:00:00 2001 From: Leon Luttenberger Date: Mon, 24 Jun 2024 11:38:41 -0600 Subject: [PATCH 3/3] fix formatting --- tests/unit/test_s3_parquet.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/unit/test_s3_parquet.py b/tests/unit/test_s3_parquet.py index c3ea04e9d..c1cbf14d6 100644 --- a/tests/unit/test_s3_parquet.py +++ b/tests/unit/test_s3_parquet.py @@ -650,6 +650,7 @@ def test_timezone_raw_values(path): # Use pandas to read because of Modin "Internal Error: Internal and external indices on axis 1 do not match." import pandas + df3 = pandas.read_parquet(path) df2["par"] = df2["par"].astype("string")