Skip to content

Commit

Permalink
Fix bug for read_parquet with offset timezones. #385
Browse files Browse the repository at this point in the history
  • Loading branch information
igorborgest committed Sep 8, 2020
1 parent 1bd34c4 commit 1765806
Show file tree
Hide file tree
Showing 5 changed files with 25 additions and 8 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

> An [AWS Professional Service](https://aws.amazon.com/professional-services/) open source initiative | aws-proserve-opensource@amazon.com
[![Release](https://img.shields.io/badge/release-1.9.3-brightgreen.svg)](https://pypi.org/project/awswrangler/)
[![Python Version](https://img.shields.io/badge/python-3.6%20%7C%203.7%20%7C%203.8-brightgreen.svg)](https://anaconda.org/conda-forge/awswrangler)
[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
Expand Down
2 changes: 1 addition & 1 deletion awswrangler/__metadata__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,5 @@

# Package metadata. The version was bumped from 1.9.2 to 1.9.3 for the
# offset-timezone read_parquet fix; the diff rendering had left both the old
# and the new __version__ lines in place — only the new one belongs here.
__title__: str = "awswrangler"
__description__: str = "Pandas on AWS."
__version__: str = "1.9.3"
__license__: str = "Apache License 2.0"
7 changes: 4 additions & 3 deletions awswrangler/s3/_read_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,10 +192,11 @@ def _apply_index(df: pd.DataFrame, metadata: Dict[str, Any]) -> pd.DataFrame:
def _apply_timezone(df: pd.DataFrame, metadata: Dict[str, Any]) -> pd.DataFrame:
    """Apply the timezones recorded in the parquet pandas metadata to *df*.

    For every column whose pandas metadata marks it as ``datetimetz``, the raw
    timezone string is resolved through pyarrow (``pa.lib.string_to_tzinfo``)
    so that fixed-offset zones such as ``"+04:00"`` — not only IANA names —
    are handled correctly. Columns that are still timezone-naive are first
    localized to UTC (parquet stores timestamps in UTC), then converted.

    Parameters
    ----------
    df : pd.DataFrame
        Frame freshly read from parquet; mutated in place.
    metadata : Dict[str, Any]
        The ``pandas`` metadata block from the parquet schema.

    Returns
    -------
    pd.DataFrame
        The same frame with timezone-aware datetime columns.
    """
    for c in metadata["columns"]:
        if c["field_name"] in df.columns and c["pandas_type"] == "datetimetz":
            # Resolve via pyarrow so offset-style zones ("+04:00") work too,
            # not only named zones accepted by pandas directly.
            timezone: datetime.tzinfo = pa.lib.string_to_tzinfo(c["metadata"]["timezone"])
            _logger.debug("applying timezone (%s) on column %s", timezone, c["field_name"])
            # A tz-aware dtype exposes a ``tz`` attribute; naive columns do not.
            if hasattr(df[c["field_name"]].dtype, "tz") is False:
                df[c["field_name"]] = df[c["field_name"]].dt.tz_localize(tz="UTC")
            df[c["field_name"]] = df[c["field_name"]].dt.tz_convert(tz=timezone)
    return df


Expand Down
2 changes: 1 addition & 1 deletion tests/test_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@


def test_metadata():
    """Package metadata must match the released version (bumped to 1.9.3)."""
    assert wr.__version__ == "1.9.3"
    assert wr.__title__ == "awswrangler"
    assert wr.__description__ == "Pandas on AWS."
    assert wr.__license__ == "Apache License 2.0"
20 changes: 18 additions & 2 deletions tests/test_s3_parquet.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import itertools
import logging
import math
from datetime import datetime
from datetime import datetime, timedelta, timezone

import boto3
import numpy as np
Expand Down Expand Up @@ -362,7 +362,7 @@ def test_timezone_file(path, use_threads):
assert df.equals(df2)


@pytest.mark.parametrize("use_threads", [False])
@pytest.mark.parametrize("use_threads", [True, False])
def test_timezone_file_columns(path, use_threads):
file_path = f"{path}0.parquet"
df = pd.DataFrame({"c0": [datetime.utcnow(), datetime.utcnow()], "c1": [1.1, 2.2]})
Expand All @@ -371,3 +371,19 @@ def test_timezone_file_columns(path, use_threads):
wr.s3.wait_objects_exist(paths=[file_path], use_threads=use_threads)
df2 = wr.s3.read_parquet(path, columns=["c1"], use_threads=use_threads)
assert df[["c1"]].equals(df2)


@pytest.mark.parametrize("use_threads", [True, False])
def test_timezone_raw_values(path, use_threads):
    """Round-trip datetimes with fixed-offset timezones through a partitioned
    parquet dataset and check wrangler agrees with plain pandas.

    Builds columns carrying UTC plus three fixed UTC offsets (+4h, -4h, -8h),
    writes them partitioned by ``par``, then compares ``wr.s3.read_parquet``
    against a direct ``pd.read_parquet`` concat of the written files.
    """
    df = pd.DataFrame({"c0": [1.1, 2.2], "par": ["a", "b"]})
    df["c1"] = pd.to_datetime(datetime.now(timezone.utc))
    # Same instant, three fixed-offset zones: +04:00, -04:00 and -08:00.
    fixed_offsets = {
        "c2": timedelta(seconds=14400),
        "c3": -timedelta(seconds=14400),
        "c4": timedelta(hours=-8),
    }
    for column, offset in fixed_offsets.items():
        df[column] = pd.to_datetime(datetime(2011, 11, 4, 0, 5, 23, tzinfo=timezone(offset)))
    paths = wr.s3.to_parquet(partition_cols=["par"], df=df, path=path, dataset=True, sanitize_columns=False)["paths"]
    wr.s3.wait_objects_exist(paths, use_threads=use_threads)
    wrangler_df = wr.s3.read_parquet(path, dataset=True, use_threads=use_threads)
    pandas_df = pd.concat([pd.read_parquet(p) for p in paths], ignore_index=True)
    # Normalize the partition column dtype before comparing the two frames.
    wrangler_df["par"] = wrangler_df["par"].astype("string")
    pandas_df["par"] = pandas_df["par"].astype("string")
    assert wrangler_df.equals(pandas_df)

0 comments on commit 1765806

Please sign in to comment.