Skip to content

Commit

Permalink
[SPARK-23360][SQL][PYTHON] Get local timezone from environment via py…
Browse files Browse the repository at this point in the history
…tz, or dateutil.

## What changes were proposed in this pull request?

Currently we use `tzlocal()` to get Python local timezone, but it sometimes causes unexpected behavior.
I changed the way to get Python local timezone to use pytz if the timezone is specified in environment variable, or timezone file via dateutil .

## How was this patch tested?

Added a test and existing tests.

Author: Takuya UESHIN <ueshin@databricks.com>

Closes #20559 from ueshin/issues/SPARK-23360/master.
  • Loading branch information
ueshin authored and HyukjinKwon committed Feb 10, 2018
1 parent 6d7c383 commit 97a224a
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 4 deletions.
28 changes: 28 additions & 0 deletions python/pyspark/sql/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -2868,6 +2868,34 @@ def test_create_dataframe_required_pandas_not_found(self):
"d": [pd.Timestamp.now().date()]})
self.spark.createDataFrame(pdf)

# Regression test for SPARK-23360
@unittest.skipIf(not _have_pandas, _pandas_requirement_message)
def test_create_dateframe_from_pandas_with_dst(self):
import pandas as pd
from datetime import datetime

pdf = pd.DataFrame({'time': [datetime(2015, 10, 31, 22, 30)]})

df = self.spark.createDataFrame(pdf)
self.assertPandasEqual(pdf, df.toPandas())

orig_env_tz = os.environ.get('TZ', None)
orig_session_tz = self.spark.conf.get('spark.sql.session.timeZone')
try:
tz = 'America/Los_Angeles'
os.environ['TZ'] = tz
time.tzset()
self.spark.conf.set('spark.sql.session.timeZone', tz)

df = self.spark.createDataFrame(pdf)
self.assertPandasEqual(pdf, df.toPandas())
finally:
del os.environ['TZ']
if orig_env_tz is not None:
os.environ['TZ'] = orig_env_tz
time.tzset()
self.spark.conf.set('spark.sql.session.timeZone', orig_session_tz)


class HiveSparkSubmitTests(SparkSubmitTests):

Expand Down
23 changes: 19 additions & 4 deletions python/pyspark/sql/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -1709,6 +1709,21 @@ def _check_dataframe_convert_date(pdf, schema):
return pdf


def _get_local_timezone():
""" Get local timezone using pytz with environment variable, or dateutil.
If there is a 'TZ' environment variable, pass it to pandas to use pytz and use it as timezone
string, otherwise use the special word 'dateutil/:' which means that pandas uses dateutil and
it reads system configuration to know the system local timezone.
See also:
- https://github.com/pandas-dev/pandas/blob/0.19.x/pandas/tslib.pyx#L1753
- https://github.com/dateutil/dateutil/blob/2.6.1/dateutil/tz/tz.py#L1338
"""
import os
return os.environ.get('TZ', 'dateutil/:')


def _check_dataframe_localize_timestamps(pdf, timezone):
"""
Convert timezone aware timestamps to timezone-naive in the specified timezone or local timezone
Expand All @@ -1721,7 +1736,7 @@ def _check_dataframe_localize_timestamps(pdf, timezone):
require_minimum_pandas_version()

from pandas.api.types import is_datetime64tz_dtype
tz = timezone or 'tzlocal()'
tz = timezone or _get_local_timezone()
for column, series in pdf.iteritems():
# TODO: handle nested timestamps, such as ArrayType(TimestampType())?
if is_datetime64tz_dtype(series.dtype):
Expand All @@ -1744,7 +1759,7 @@ def _check_series_convert_timestamps_internal(s, timezone):
from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype
# TODO: handle nested timestamps, such as ArrayType(TimestampType())?
if is_datetime64_dtype(s.dtype):
tz = timezone or 'tzlocal()'
tz = timezone or _get_local_timezone()
return s.dt.tz_localize(tz).dt.tz_convert('UTC')
elif is_datetime64tz_dtype(s.dtype):
return s.dt.tz_convert('UTC')
Expand All @@ -1766,8 +1781,8 @@ def _check_series_convert_timestamps_localize(s, from_timezone, to_timezone):

import pandas as pd
from pandas.api.types import is_datetime64tz_dtype, is_datetime64_dtype
from_tz = from_timezone or 'tzlocal()'
to_tz = to_timezone or 'tzlocal()'
from_tz = from_timezone or _get_local_timezone()
to_tz = to_timezone or _get_local_timezone()
# TODO: handle nested timestamps, such as ArrayType(TimestampType())?
if is_datetime64tz_dtype(s.dtype):
return s.dt.tz_convert(to_tz).dt.tz_localize(None)
Expand Down

0 comments on commit 97a224a

Please sign in to comment.