From 17658068815009e4c95c7bd5d63bc9d08cfeee91 Mon Sep 17 00:00:00 2001
From: Igor Tavares
Date: Tue, 8 Sep 2020 19:47:55 -0300
Subject: [PATCH] Fix bug for read_parquet with offset timezones. #385

---
 README.md                       |  2 +-
 awswrangler/__metadata__.py     |  2 +-
 awswrangler/s3/_read_parquet.py |  7 ++++---
 tests/test_metadata.py          |  2 +-
 tests/test_s3_parquet.py        | 20 ++++++++++++++++++--
 5 files changed, 25 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index fe9d68a82..d7da9c461 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@
 
 > An [AWS Professional Service](https://aws.amazon.com/professional-services/) open source initiative | aws-proserve-opensource@amazon.com
 
-[![Release](https://img.shields.io/badge/release-1.9.2-brightgreen.svg)](https://pypi.org/project/awswrangler/)
+[![Release](https://img.shields.io/badge/release-1.9.3-brightgreen.svg)](https://pypi.org/project/awswrangler/)
 [![Python Version](https://img.shields.io/badge/python-3.6%20%7C%203.7%20%7C%203.8-brightgreen.svg)](https://anaconda.org/conda-forge/awswrangler)
 [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
 [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
diff --git a/awswrangler/__metadata__.py b/awswrangler/__metadata__.py
index 6c03b8615..becab9ee4 100644
--- a/awswrangler/__metadata__.py
+++ b/awswrangler/__metadata__.py
@@ -7,5 +7,5 @@
 
 __title__: str = "awswrangler"
 __description__: str = "Pandas on AWS."
-__version__: str = "1.9.2"
+__version__: str = "1.9.3"
 __license__: str = "Apache License 2.0"
diff --git a/awswrangler/s3/_read_parquet.py b/awswrangler/s3/_read_parquet.py
index 4b5a411da..6d53fd71a 100644
--- a/awswrangler/s3/_read_parquet.py
+++ b/awswrangler/s3/_read_parquet.py
@@ -192,10 +192,11 @@ def _apply_index(df: pd.DataFrame, metadata: Dict[str, Any]) -> pd.DataFrame:
 def _apply_timezone(df: pd.DataFrame, metadata: Dict[str, Any]) -> pd.DataFrame:
     for c in metadata["columns"]:
         if c["field_name"] in df.columns and c["pandas_type"] == "datetimetz":
-            _logger.debug("applying timezone (%s) on column %s", c["metadata"]["timezone"], c["field_name"])
-            if isinstance(df[c["field_name"]].dtype, pd.core.dtypes.dtypes.DatetimeTZDtype) is False:
+            timezone: datetime.tzinfo = pa.lib.string_to_tzinfo(c["metadata"]["timezone"])
+            _logger.debug("applying timezone (%s) on column %s", timezone, c["field_name"])
+            if hasattr(df[c["field_name"]].dtype, "tz") is False:
                 df[c["field_name"]] = df[c["field_name"]].dt.tz_localize(tz="UTC")
-            df[c["field_name"]] = df[c["field_name"]].dt.tz_convert(tz=c["metadata"]["timezone"])
+            df[c["field_name"]] = df[c["field_name"]].dt.tz_convert(tz=timezone)
     return df
 
 
diff --git a/tests/test_metadata.py b/tests/test_metadata.py
index c1254c0fe..bf4f54883 100644
--- a/tests/test_metadata.py
+++ b/tests/test_metadata.py
@@ -2,7 +2,7 @@
 
 
 def test_metadata():
-    assert wr.__version__ == "1.9.2"
+    assert wr.__version__ == "1.9.3"
     assert wr.__title__ == "awswrangler"
     assert wr.__description__ == "Pandas on AWS."
     assert wr.__license__ == "Apache License 2.0"
diff --git a/tests/test_s3_parquet.py b/tests/test_s3_parquet.py
index 9b390616c..5f6e55325 100644
--- a/tests/test_s3_parquet.py
+++ b/tests/test_s3_parquet.py
@@ -1,7 +1,7 @@
 import itertools
 import logging
 import math
-from datetime import datetime
+from datetime import datetime, timedelta, timezone
 
 import boto3
 import numpy as np
@@ -362,7 +362,7 @@ def test_timezone_file(path, use_threads):
     assert df.equals(df2)
 
 
-@pytest.mark.parametrize("use_threads", [False])
+@pytest.mark.parametrize("use_threads", [True, False])
 def test_timezone_file_columns(path, use_threads):
     file_path = f"{path}0.parquet"
     df = pd.DataFrame({"c0": [datetime.utcnow(), datetime.utcnow()], "c1": [1.1, 2.2]})
@@ -371,3 +371,19 @@ def test_timezone_file_columns(path, use_threads):
     wr.s3.wait_objects_exist(paths=[file_path], use_threads=use_threads)
     df2 = wr.s3.read_parquet(path, columns=["c1"], use_threads=use_threads)
     assert df[["c1"]].equals(df2)
+
+
+@pytest.mark.parametrize("use_threads", [True, False])
+def test_timezone_raw_values(path, use_threads):
+    df = pd.DataFrame({"c0": [1.1, 2.2], "par": ["a", "b"]})
+    df["c1"] = pd.to_datetime(datetime.now(timezone.utc))
+    df["c2"] = pd.to_datetime(datetime(2011, 11, 4, 0, 5, 23, tzinfo=timezone(timedelta(seconds=14400))))
+    df["c3"] = pd.to_datetime(datetime(2011, 11, 4, 0, 5, 23, tzinfo=timezone(-timedelta(seconds=14400))))
+    df["c4"] = pd.to_datetime(datetime(2011, 11, 4, 0, 5, 23, tzinfo=timezone(timedelta(hours=-8))))
+    paths = wr.s3.to_parquet(partition_cols=["par"], df=df, path=path, dataset=True, sanitize_columns=False)["paths"]
+    wr.s3.wait_objects_exist(paths, use_threads=use_threads)
+    df2 = wr.s3.read_parquet(path, dataset=True, use_threads=use_threads)
+    df3 = pd.concat([pd.read_parquet(p) for p in paths], ignore_index=True)
+    df2["par"] = df2["par"].astype("string")
+    df3["par"] = df3["par"].astype("string")
+    assert df2.equals(df3)
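Below is a minimal standalone sketch, not part of the patch itself, of the behaviour the _apply_timezone change relies on, assuming only that pandas and pyarrow are installed: pa.lib.string_to_tzinfo maps both IANA zone names and raw UTC-offset strings such as "+04:00" to tzinfo objects, which Series.dt.tz_convert accepts, whereas passing a bare offset string straight to tz_convert is the path that failed for offset timezones before this fix.

# Illustrative sketch only (not from the diff): route the parquet timezone
# string through pyarrow before converting, as the patched _apply_timezone does.
import pandas as pd
import pyarrow as pa

# A UTC-localized series standing in for a column read from parquet.
ts = pd.Series(pd.to_datetime(["2020-09-08 19:47:55"])).dt.tz_localize("UTC")

for tz_string in ["America/Sao_Paulo", "+04:00", "-08:00"]:
    tz = pa.lib.string_to_tzinfo(tz_string)  # tzinfo object, e.g. a fixed offset
    print(tz_string, "->", ts.dt.tz_convert(tz).iloc[0])

The new test_timezone_raw_values test exercises these raw-offset values end to end through wr.s3.to_parquet and wr.s3.read_parquet.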