From 7f67ac9eed344223cafdc3ddd10a3ed30947c911 Mon Sep 17 00:00:00 2001
From: Matt Roeschke
Date: Wed, 6 Sep 2017 07:50:42 -0700
Subject: [PATCH 01/25] move cache into convert_listlike

---
 pandas/core/tools/datetimes.py | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
index ae8aa275b2bae..b4f4fffe1ba8f 100644
--- a/pandas/core/tools/datetimes.py
+++ b/pandas/core/tools/datetimes.py
@@ -38,7 +38,8 @@ def _guess_datetime_format_for_array(arr, **kwargs):

 def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
 utc=None, box=True, format=None, exact=True,
- unit=None, infer_datetime_format=False, origin='unix'):
+ unit=None, infer_datetime_format=False, origin='unix',
+ cache=True):
 """
 Convert argument to datetime.

@@ -111,7 +112,11 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
 origin.

 .. versionadded: 0.20.0
+ cache_datetime : boolean, default True
+ If True, use a cache of unique, converted dates to apply the datetime
+ conversion. Produces significant speed-ups when parsing duplicate dates.

+ .. versionadded: 0.20.2
 Returns
 -------
 ret : datetime if parsing succeeded.
@@ -201,6 +206,16 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,

 def _convert_listlike(arg, box, format, name=None, tz=tz):

+ datetime_cache = None
+ if cache and is_list_like(arg) and not isinstance(arg, DatetimeIndex):
+ unique_dates = algorithms.unique(arg)
+ if len(unique_dates) != len(arg):
+ datetime_cache = Series(pd.to_datetime(unique_dates,
+ errors=errors, dayfirst=dayfirst,
+ yearfirst=yearfirst, utc=utc, box=box, format=format,
+ exact=exact, unit=unit,
+ infer_datetime_format=infer_datetime_format,
+ origin=origin, cache=False), index=unique_dates)

 if isinstance(arg, (list, tuple)):
 arg = np.array(arg, dtype='O')

From a7c65f75757aa823c9250938140b47d00f8acfd1 Mon Sep 17 00:00:00 2001
From: Matt Roeschke
Date: Sat, 9 Sep 2017 10:58:07 -0700
Subject: [PATCH 02/25] Move cache down the stack, explore threshold to
 trigger cache

---
 pandas/core/tools/datetimes.py | 46 +++++++++++++++++++++++-----------
 1 file changed, 31 insertions(+), 15 deletions(-)

diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
index b4f4fffe1ba8f..6a3fbf8176aed 100644
--- a/pandas/core/tools/datetimes.py
+++ b/pandas/core/tools/datetimes.py
@@ -206,16 +206,7 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,

 def _convert_listlike(arg, box, format, name=None, tz=tz):

- datetime_cache = None
- if cache and is_list_like(arg) and not isinstance(arg, DatetimeIndex):
- unique_dates = algorithms.unique(arg)
- if len(unique_dates) != len(arg):
- datetime_cache = Series(pd.to_datetime(unique_dates,
- errors=errors, dayfirst=dayfirst,
- yearfirst=yearfirst, utc=utc, box=box, format=format,
- exact=exact, unit=unit,
- infer_datetime_format=infer_datetime_format,
- origin=origin, cache=False), index=unique_dates)
+
 if isinstance(arg, (list, tuple)):
 arg = np.array(arg, dtype='O')

@@ -381,18 +372,43 @@ def _convert_listlike(arg, box, format, name=None, tz=tz):
 arg = np.asarray(arg)
 arg = arg + offset

+ convert_cache = None
+ if cache and is_list_like(arg) and not isinstance(arg, DatetimeIndex):
+ # unique currently cannot determine dates that are out of bounds
+ # use the cache only if the data is a string and there are more than 10**5 values
+ unique_dates = algorithms.unique(arg)
+ if len(unique_dates) != len(arg):
+ from pandas import Series
+
cache_data = _convert_listlike(unique_dates, True, format) + convert_cache = Series(cache_data, index=unique_dates) + if isinstance(arg, tslib.Timestamp): result = arg elif isinstance(arg, ABCSeries): - from pandas import Series - values = _convert_listlike(arg._values, True, format) - result = Series(values, index=arg.index, name=arg.name) + if convert_cache is not None: + result = arg.map(convert_cache) + else: + from pandas import Series + values = _convert_listlike(arg._values, True, format) + result = Series(values, index=arg.index, name=arg.name) elif isinstance(arg, (ABCDataFrame, MutableMapping)): result = _assemble_from_unit_mappings(arg, errors=errors) elif isinstance(arg, ABCIndexClass): - result = _convert_listlike(arg, box, format, name=arg.name) + if convert_cache is not None: + from pandas import Series + result = Series(arg).map(convert_cache).values + if box: + result = DatetimeIndex(result, tz=tz, name=arg.name) + else: + result = _convert_listlike(arg, box, format, name=arg.name) elif is_list_like(arg): - result = _convert_listlike(arg, box, format) + if convert_cache is not None: + from pandas import Series + result = Series(arg).map(convert_cache).values + if box: + result = DatetimeIndex(result, tz=tz) + else: + result = _convert_listlike(arg, box, format) else: result = _convert_listlike(np.array([arg]), box, format)[0] From 243349aaef0ecf83c477efce26d62493c1a347e3 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Sun, 10 Sep 2017 18:42:38 -0700 Subject: [PATCH 03/25] Add more cache conditions --- asv_bench/benchmarks/timeseries.py | 8 +++++ pandas/core/tools/datetimes.py | 22 +++++++++----- pandas/tests/indexes/datetimes/test_tools.py | 32 ++++++++++++++++++++ 3 files changed, 54 insertions(+), 8 deletions(-) diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py index 779fc0bd20964..0a95ebab8c9a4 100644 --- a/asv_bench/benchmarks/timeseries.py +++ b/asv_bench/benchmarks/timeseries.py @@ -356,6 +356,8 @@ def setup(self): self.s = Series((['19MAY11', '19MAY11:00:00:00'] * 100000)) self.s2 = self.s.str.replace(':\\S+$', '') + self.numeric_data = Series([range(100000)]) + self.datetime_data = [dt.datetime(2010, 1, 1)] * 100000 def time_format_YYYYMMDD(self): to_datetime(self.stringsD, format='%Y%m%d') @@ -381,6 +383,12 @@ def time_format_exact(self): def time_format_no_exact(self): to_datetime(self.s, format='%d%b%y', exact=False) + def time_cache_numeric_data(self): + to_datetime(self.numeric_data) + + def time_cache_datetime_data(self): + to_datetime(self.datetime_data) + class Offsets(object): goal_time = 0.2 diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 6a3fbf8176aed..102f20a746db6 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -21,7 +21,8 @@ is_float, is_list_like, is_scalar, - is_numeric_dtype) + is_numeric_dtype, + is_string_dtype) from pandas.core.dtypes.generic import ( ABCIndexClass, ABCSeries, ABCDataFrame) @@ -373,14 +374,19 @@ def _convert_listlike(arg, box, format, name=None, tz=tz): arg = arg + offset convert_cache = None - if cache and is_list_like(arg) and not isinstance(arg, DatetimeIndex): + if cache and is_list_like(arg): + # Create a cache only if there are more than 10k values and the user + # passes in datestrings + min_cache_threshold = 10**5 + if len(arg) >= min_cache_threshold and is_string_dtype(arg): # unique currently cannot determine dates that are out of bounds - # use the cache only if the data is a string and there are more than 
10**5 values - unique_dates = algorithms.unique(arg) - if len(unique_dates) != len(arg): - from pandas import Series - cache_data = _convert_listlike(unique_dates, True, format) - convert_cache = Series(cache_data, index=unique_dates) + # recurison errors with datetime + unique_dates = algorithms.unique(arg) + # Essentially they need to all be the same value + if len(unique_dates) == 1: + from pandas import Series + cache_data = _convert_listlike(unique_dates, True, format) + convert_cache = Series(cache_data, index=unique_dates) if isinstance(arg, tslib.Timestamp): result = arg diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index 8205b4fde217b..01ae747c35278 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -371,6 +371,38 @@ def test_datetime_invalid_datatype(self): pd.to_datetime(bool) with pytest.raises(TypeError): pd.to_datetime(pd.to_datetime) + + @pytest.mark.parametrize("utc", [True, None]) + @pytest.mark.parametrize("format", ['%Y%m%d %H:%M:%S', None]) + @pytest.mark.parametrize("box", [True, False]) + @pytest.mark.parametrize("constructor", [list, tuple, np.array, pd.Index]) + def test_to_datetime_cache(self, utc, format, box, constructor): + date = '20130101 00:00:00' + test_dates = [date] * 10**5 + data = constructor(test_dates) + result = pd.to_datetime(data, utc=utc, format=format, box=box) + expected = pd.to_datetime(data, utc=utc, format=format, box=box, + cache=False) + if box: + tm.assert_index_equal(result, expected) + else: + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize("utc", [True, None]) + @pytest.mark.parametrize("format", ['%Y%m%d %H:%M:%S', None]) + def test_to_datetime_cache_series(self, utc, format): + date = '20130101 00:00:00' + test_dates = [date] * 10**5 + data = pd.Series(test_dates) + result = pd.to_datetime(data, utc=utc, format=format, cache=True) + expected = pd.to_datetime(data, utc=utc, format=format) + tm.assert_series_equal(result, expected) + + def test_to_datetime_cache_scalar(self): + date = '20130101 00:00:00' + result = pd.to_datetime(date, cache=True) + expected = pd.Timestamp('20130101 00:00:00') + assert result == expected @pytest.mark.parametrize('date, format', [('2017-20', '%Y-%W'), From d154a6dd72d058129253b6a75018b0a85ade0c15 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Sun, 10 Sep 2017 22:54:14 -0700 Subject: [PATCH 04/25] Add some benchmarks --- asv_bench/benchmarks/timeseries.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py index 0a95ebab8c9a4..e084d18a68ed3 100644 --- a/asv_bench/benchmarks/timeseries.py +++ b/asv_bench/benchmarks/timeseries.py @@ -356,8 +356,9 @@ def setup(self): self.s = Series((['19MAY11', '19MAY11:00:00:00'] * 100000)) self.s2 = self.s.str.replace(':\\S+$', '') - self.numeric_data = Series([range(100000)]) - self.datetime_data = [dt.datetime(2010, 1, 1)] * 100000 + self.dup_numeric_data = Series([1000] * 100000) + self.dup_string_data = ['2013-01-01'] * 100000 + self.dup_datetime_data = [dt.datetime(2010, 1, 1)] * 100000 def time_format_YYYYMMDD(self): to_datetime(self.stringsD, format='%Y%m%d') @@ -383,11 +384,14 @@ def time_format_exact(self): def time_format_no_exact(self): to_datetime(self.s, format='%d%b%y', exact=False) - def time_cache_numeric_data(self): - to_datetime(self.numeric_data) + def time_cache_dup_numeric_data(self): + 
to_datetime(self.dup_numeric_data, unit='s') - def time_cache_datetime_data(self): - to_datetime(self.datetime_data) + def time_cache_dup_datetime_data(self): + to_datetime(self.dup_datetime_data) + + def time_cache_dup_string_data(self): + to_datetime(self.dup_string_data) class Offsets(object): From b5e71d25474f7ee3ee054dfba7abbdfb647a248d Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Wed, 13 Sep 2017 21:27:52 -0700 Subject: [PATCH 05/25] Some performance testing --- asv_bench/benchmarks/timeseries.py | 44 ++++++++++++++++++++++++------ pandas/core/indexes/datetimes.py | 2 +- pandas/core/tools/datetimes.py | 18 ++++++------ 3 files changed, 45 insertions(+), 19 deletions(-) diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py index e084d18a68ed3..a1cc822b58f2b 100644 --- a/asv_bench/benchmarks/timeseries.py +++ b/asv_bench/benchmarks/timeseries.py @@ -356,9 +356,17 @@ def setup(self): self.s = Series((['19MAY11', '19MAY11:00:00:00'] * 100000)) self.s2 = self.s.str.replace(':\\S+$', '') - self.dup_numeric_data = Series([1000] * 100000) - self.dup_string_data = ['2013-01-01'] * 100000 - self.dup_datetime_data = [dt.datetime(2010, 1, 1)] * 100000 + self.dup_numeric_data_10_5 = Series([1000] * 100000) + self.dup_string_data_10_5 = ['2013-01-01 01:00:00'] * 100000 + self.dup_datetime_data_10_5 = [dt.datetime(2010, 1, 1)] * 100000 + + self.dup_numeric_data_10_3 = Series([1000] * 100) + self.dup_string_data_10_3 = ['2013-01-01 01:00:00'] * 100 + self.dup_datetime_data_10_3 = [dt.datetime(2010, 1, 1)] * 100 + + self.dup_numeric_data_10_7 = Series([1000] * 10**7) + self.dup_string_data_10_7 = ['2013-01-01 01:00:00'] * 10**7 + self.dup_datetime_data_10_7 = [dt.datetime(2010, 1, 1)] * 10**7 def time_format_YYYYMMDD(self): to_datetime(self.stringsD, format='%Y%m%d') @@ -384,14 +392,32 @@ def time_format_exact(self): def time_format_no_exact(self): to_datetime(self.s, format='%d%b%y', exact=False) - def time_cache_dup_numeric_data(self): - to_datetime(self.dup_numeric_data, unit='s') + def time_cache_dup_numeric_data_10_3(self): + to_datetime(self.dup_numeric_data_10_3, unit='s') + + def time_cache_dup_datetime_data_10_3(self): + to_datetime(self.dup_datetime_data_10_3) + + def time_cache_dup_string_data_10_3(self): + to_datetime(self.dup_string_data_10_3) + + def time_cache_dup_numeric_data_10_5(self): + to_datetime(self.dup_numeric_data_10_5, unit='s') + + def time_cache_dup_datetime_data_10_5(self): + to_datetime(self.dup_datetime_data_10_5) + + def time_cache_dup_string_data_10_5(self): + to_datetime(self.dup_string_data_10_5) + + def time_cache_dup_numeric_data_10_7(self): + to_datetime(self.dup_numeric_data_10_7, unit='s') - def time_cache_dup_datetime_data(self): - to_datetime(self.dup_datetime_data) + def time_cache_dup_datetime_data_10_7(self): + to_datetime(self.dup_datetime_data_10_7) - def time_cache_dup_string_data(self): - to_datetime(self.dup_string_data) + def time_cache_dup_string_data_10_7(self): + to_datetime(self.dup_string_data_10_7) class Offsets(object): diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 78869de318dce..dd1c7306d2c26 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -339,7 +339,7 @@ def __new__(cls, data=None, if not (is_datetime64_dtype(data) or is_datetimetz(data) or is_integer_dtype(data)): data = tools.to_datetime(data, dayfirst=dayfirst, - yearfirst=yearfirst) + yearfirst=yearfirst, cache=False) if issubclass(data.dtype.type, np.datetime64) or 
is_datetimetz(data): diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 102f20a746db6..165c438608493 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -207,7 +207,7 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, def _convert_listlike(arg, box, format, name=None, tz=tz): - + import pdb; pdb.set_trace() if isinstance(arg, (list, tuple)): arg = np.array(arg, dtype='O') @@ -377,16 +377,16 @@ def _convert_listlike(arg, box, format, name=None, tz=tz): if cache and is_list_like(arg): # Create a cache only if there are more than 10k values and the user # passes in datestrings - min_cache_threshold = 10**5 - if len(arg) >= min_cache_threshold and is_string_dtype(arg): + #min_cache_threshold = 10**5 + #if len(arg) >= min_cache_threshold and is_string_dtype(arg): # unique currently cannot determine dates that are out of bounds # recurison errors with datetime - unique_dates = algorithms.unique(arg) - # Essentially they need to all be the same value - if len(unique_dates) == 1: - from pandas import Series - cache_data = _convert_listlike(unique_dates, True, format) - convert_cache = Series(cache_data, index=unique_dates) + unique_dates = algorithms.unique(arg) + # Essentially they need to all be the same value + if len(unique_dates) != len(arg): + from pandas import Series + cache_data = _convert_listlike(unique_dates, False, format) + convert_cache = Series(cache_data, index=unique_dates) if isinstance(arg, tslib.Timestamp): result = arg From fb2e831009ed57c9affd368d3630bb0b40d082a3 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Mon, 25 Sep 2017 16:55:25 -0700 Subject: [PATCH 06/25] Add asvs, modify tests for caches --- asv_bench/benchmarks/timeseries.py | 53 +-- pandas/core/tools/datetimes.py | 21 +- pandas/tests/indexes/datetimes/test_tools.py | 459 +++++++++++-------- 3 files changed, 299 insertions(+), 234 deletions(-) diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py index a1cc822b58f2b..d9d1b48f420e6 100644 --- a/asv_bench/benchmarks/timeseries.py +++ b/asv_bench/benchmarks/timeseries.py @@ -346,27 +346,21 @@ class ToDatetime(object): def setup(self): self.rng = date_range(start='1/1/2000', periods=10000, freq='D') - self.stringsD = Series((((self.rng.year * 10000) + (self.rng.month * 100)) + self.rng.day), dtype=np.int64).apply(str) + self.stringsD = Series(self.rng.strftime('%Y%m%d')) self.rng = date_range(start='1/1/2000', periods=20000, freq='H') - self.strings = [x.strftime('%Y-%m-%d %H:%M:%S') for x in self.rng] - self.strings_nosep = [x.strftime('%Y%m%d %H:%M:%S') for x in self.rng] + self.strings = self.rng.strftime('%Y-%m-%d %H:%M:%S').tolist() + self.strings_nosep = self.rng.strftime('%Y%m%d %H:%M:%S').tolist() self.strings_tz_space = [x.strftime('%Y-%m-%d %H:%M:%S') + ' -0800' for x in self.rng] self.s = Series((['19MAY11', '19MAY11:00:00:00'] * 100000)) self.s2 = self.s.str.replace(':\\S+$', '') - self.dup_numeric_data_10_5 = Series([1000] * 100000) - self.dup_string_data_10_5 = ['2013-01-01 01:00:00'] * 100000 - self.dup_datetime_data_10_5 = [dt.datetime(2010, 1, 1)] * 100000 - self.dup_numeric_data_10_3 = Series([1000] * 100) - self.dup_string_data_10_3 = ['2013-01-01 01:00:00'] * 100 - self.dup_datetime_data_10_3 = [dt.datetime(2010, 1, 1)] * 100 - - self.dup_numeric_data_10_7 = Series([1000] * 10**7) - self.dup_string_data_10_7 = ['2013-01-01 01:00:00'] * 10**7 - self.dup_datetime_data_10_7 = [dt.datetime(2010, 1, 1)] * 10**7 + 
self.unique_numeric_seconds = range(10000)
+ self.dup_numeric_seconds = [1000] * 10000
+ self.dup_string_dates = ['2000-02-11'] * 10000
+ self.dup_string_with_tz = ['2000-02-11 15:00:00-0800'] * 10000

 def time_format_YYYYMMDD(self):
 to_datetime(self.stringsD, format='%Y%m%d')
@@ -392,32 +386,23 @@ def time_format_exact(self):
 def time_format_no_exact(self):
 to_datetime(self.s, format='%d%b%y', exact=False)

- def time_cache_dup_numeric_data_10_3(self):
- to_datetime(self.dup_numeric_data_10_3, unit='s')
-
- def time_cache_dup_datetime_data_10_3(self):
- to_datetime(self.dup_datetime_data_10_3)
-
- def time_cache_dup_string_data_10_3(self):
- to_datetime(self.dup_string_data_10_3)
-
- def time_cache_dup_numeric_data_10_5(self):
- to_datetime(self.dup_numeric_data_10_5, unit='s')
+ def time_cache_with_unique_seconds_and unit(self):
+ to_datetime(self.unique_numeric_seconds, unit='s')

- def time_cache_dup_datetime_data_10_5(self):
- to_datetime(self.dup_datetime_data_10_5)
+ def time_cache_with_dup_seconds_and_unit(self):
+ to_datetime(self.dup_numeric_seconds, unit='s')

- def time_cache_dup_string_data_10_5(self):
- to_datetime(self.dup_string_data_10_5)
+ def time_cache_with_dup_string_dates(self):
+ to_datetime(self.dup_string_dates)

- def time_cache_dup_numeric_data_10_7(self):
- to_datetime(self.dup_numeric_data_10_7, unit='s')
+ def time_cache_with_dup_string_dates_and_format(self):
+ to_datetime(self.dup_string_dates, format='%Y-%m-%d')

- def time_cache_dup_datetime_data_10_7(self):
- to_datetime(self.dup_datetime_data_10_7)
+ def time_cache_with_dup_string_tzoffset_dates(self):
+ to_datetime(self.dup_string_with_tz)

- def time_cache_dup_string_data_10_7(self):
- to_datetime(self.dup_string_data_10_7)
+ def time_cache_with_dup_string_tzoffset_dates_and_format(self):
+ to_datetim(self.dup_string_with_tz, format='%Y-%m-%d %H:%M:%S%z')

 class Offsets(object):

diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
index 102f20a746db6..ac29c7144556c 100644
--- a/pandas/core/tools/datetimes.py
+++ b/pandas/core/tools/datetimes.py
@@ -113,7 +113,7 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
 origin.

 .. versionadded: 0.20.0
- cache_datetime : boolean, default True
+ cache : boolean, default True
 If True, use a cache of unique, converted dates to apply the datetime
 conversion. Produces significant speed-ups when parsing duplicate dates.
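
The strategy this docstring describes is easier to see outside the diff. What follows is a minimal, self-contained sketch of the same unique-then-map idea using only public pandas API; cached_to_datetime is an illustrative name, not the function being patched, whose real logic lives inside _convert_listlike:

import numpy as np
import pandas as pd

def cached_to_datetime(values):
    # values: list-like of date strings, possibly with many repeats
    arr = np.asarray(values, dtype=object)
    unique_vals = pd.unique(arr)
    if len(unique_vals) < len(arr):
        # parse each distinct value exactly once...
        cache = pd.Series(pd.to_datetime(unique_vals), index=unique_vals)
        # ...then broadcast the parsed Timestamps back onto the full input
        return pd.DatetimeIndex(pd.Series(arr).map(cache))
    # all values distinct: a cache would only add an extra unique() pass
    return pd.to_datetime(arr)

On an input like ['2013-01-01 00:00:00'] * 10**5 this performs one string parse instead of 10**5, which is where the speed-up promised above comes from.
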
@@ -207,7 +207,6 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, def _convert_listlike(arg, box, format, name=None, tz=tz): - import pdb; pdb.set_trace() if isinstance(arg, (list, tuple)): arg = np.array(arg, dtype='O') @@ -375,18 +374,12 @@ def _convert_listlike(arg, box, format, name=None, tz=tz): convert_cache = None if cache and is_list_like(arg): - # Create a cache only if there are more than 10k values and the user - # passes in datestrings - #min_cache_threshold = 10**5 - #if len(arg) >= min_cache_threshold and is_string_dtype(arg): - # unique currently cannot determine dates that are out of bounds - # recurison errors with datetime - unique_dates = algorithms.unique(arg) - # Essentially they need to all be the same value - if len(unique_dates) != len(arg): - from pandas import Series - cache_data = _convert_listlike(unique_dates, False, format) - convert_cache = Series(cache_data, index=unique_dates) + if len(arg) >= 1000: + unique_dates = algorithms.unique(arg) + if len(unique_dates) != len(arg): + from pandas import Series + cache_dates = _convert_listlike(unique_dates, False, format) + convert_cache = Series(cache_dates, index=unique_dates) if isinstance(arg, tslib.Timestamp): result = arg diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index 01ae747c35278..1a099fe2d1505 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -28,7 +28,8 @@ class TestTimeConversionFormats(object): - def test_to_datetime_format(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_to_datetime_format(self, cache): values = ['1/1/2000', '1/2/2000', '1/3/2000'] results1 = [Timestamp('20000101'), Timestamp('20000201'), @@ -43,7 +44,7 @@ def test_to_datetime_format(self): (values[2], (results1[2], results2[2]))]: for i, fmt in enumerate(['%d/%m/%Y', '%m/%d/%Y']): - result = to_datetime(vals, format=fmt) + result = to_datetime(vals, format=fmt, cache=cache) expected = expecteds[i] if isinstance(expected, Series): @@ -53,14 +54,15 @@ def test_to_datetime_format(self): else: tm.assert_index_equal(result, expected) - def test_to_datetime_format_YYYYMMDD(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_to_datetime_format_YYYYMMDD(self, cache): s = Series([19801222, 19801222] + [19810105] * 5) expected = Series([Timestamp(x) for x in s.apply(str)]) - result = to_datetime(s, format='%Y%m%d') + result = to_datetime(s, format='%Y%m%d', cache=cache) assert_series_equal(result, expected) - result = to_datetime(s.apply(str), format='%Y%m%d') + result = to_datetime(s.apply(str), format='%Y%m%d', cache=cache) assert_series_equal(result, expected) # with NaT @@ -69,44 +71,48 @@ def test_to_datetime_format_YYYYMMDD(self): expected[2] = np.nan s[2] = np.nan - result = to_datetime(s, format='%Y%m%d') + result = to_datetime(s, format='%Y%m%d', cache=cache) assert_series_equal(result, expected) # string with NaT s = s.apply(str) s[2] = 'nat' - result = to_datetime(s, format='%Y%m%d') + result = to_datetime(s, format='%Y%m%d', cache=cache) assert_series_equal(result, expected) # coercion # GH 7930 s = Series([20121231, 20141231, 99991231]) - result = pd.to_datetime(s, format='%Y%m%d', errors='ignore') + result = pd.to_datetime(s, format='%Y%m%d', errors='ignore', + cache=cache) expected = Series([datetime(2012, 12, 31), datetime(2014, 12, 31), datetime(9999, 12, 31)], dtype=object) tm.assert_series_equal(result, expected) - result = pd.to_datetime(s, 
format='%Y%m%d', errors='coerce') + result = pd.to_datetime(s, format='%Y%m%d', errors='coerce', + cache=cache) expected = Series(['20121231', '20141231', 'NaT'], dtype='M8[ns]') assert_series_equal(result, expected) - # GH 10178 - def test_to_datetime_format_integer(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_to_datetime_format_integer(self, cache): + # GH 10178 s = Series([2000, 2001, 2002]) expected = Series([Timestamp(x) for x in s.apply(str)]) - result = to_datetime(s, format='%Y') + result = to_datetime(s, format='%Y', cache=cache) assert_series_equal(result, expected) s = Series([200001, 200105, 200206]) expected = Series([Timestamp(x[:4] + '-' + x[4:]) for x in s.apply(str) ]) - result = to_datetime(s, format='%Y%m') + result = to_datetime(s, format='%Y%m', cache=cache) assert_series_equal(result, expected) - def test_to_datetime_format_microsecond(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_to_datetime_format_microsecond(self, cache): # these are locale dependent lang, _ = locale.getlocale() @@ -114,11 +120,12 @@ def test_to_datetime_format_microsecond(self): val = '01-{}-2011 00:00:01.978'.format(month_abbr) format = '%d-%b-%Y %H:%M:%S.%f' - result = to_datetime(val, format=format) + result = to_datetime(val, format=format, cache=cache) exp = datetime.strptime(val, format) assert result == exp - def test_to_datetime_format_time(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_to_datetime_format_time(self, cache): data = [ ['01/10/2010 15:20', '%m/%d/%Y %H:%M', Timestamp('2010-01-10 15:20')], @@ -134,9 +141,10 @@ def test_to_datetime_format_time(self): # Timestamp('2010-01-10 09:12:56')] ] for s, format, dt in data: - assert to_datetime(s, format=format) == dt + assert to_datetime(s, format=format, cache=cache) == dt - def test_to_datetime_with_non_exact(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_to_datetime_with_non_exact(self, cache): # GH 10834 tm._skip_if_has_locale() @@ -147,12 +155,13 @@ def test_to_datetime_with_non_exact(self): s = Series(['19MAY11', 'foobar19MAY11', '19MAY11:00:00:00', '19MAY11 00:00:00Z']) - result = to_datetime(s, format='%d%b%y', exact=False) + result = to_datetime(s, format='%d%b%y', exact=False, cache=cache) expected = to_datetime(s.str.extract(r'(\d+\w+\d+)', expand=False), - format='%d%b%y') + format='%d%b%y', cache=cache) assert_series_equal(result, expected) - def test_parse_nanoseconds_with_formula(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_parse_nanoseconds_with_formula(self, cache): # GH8989 # trunctaing the nanoseconds when a format was provided @@ -161,44 +170,48 @@ def test_parse_nanoseconds_with_formula(self): "2012-01-01 09:00:00.001", "2012-01-01 09:00:00.001000", "2012-01-01 09:00:00.001000000", ]: - expected = pd.to_datetime(v) - result = pd.to_datetime(v, format="%Y-%m-%d %H:%M:%S.%f") + expected = pd.to_datetime(v, cache=cache) + result = pd.to_datetime(v, format="%Y-%m-%d %H:%M:%S.%f", + cache=cache) assert result == expected - def test_to_datetime_format_weeks(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_to_datetime_format_weeks(self, cache): data = [ ['2009324', '%Y%W%w', Timestamp('2009-08-13')], ['2013020', '%Y%U%w', Timestamp('2013-01-13')] ] for s, format, dt in data: - assert to_datetime(s, format=format) == dt + assert to_datetime(s, format=format, cache=cache) == dt class TestToDatetime(object): - def test_to_datetime_dt64s(self): + @pytest.mark.parametrize('cache', [True, False]) 
+ def test_to_datetime_dt64s(self, cache): in_bound_dts = [ np.datetime64('2000-01-01'), np.datetime64('2000-01-02'), ] for dt in in_bound_dts: - assert pd.to_datetime(dt) == Timestamp(dt) + assert pd.to_datetime(dt, cache=cache) == Timestamp(dt) oob_dts = [np.datetime64('1000-01-01'), np.datetime64('5000-01-02'), ] for dt in oob_dts: pytest.raises(ValueError, pd.to_datetime, dt, errors='raise') pytest.raises(ValueError, Timestamp, dt) - assert pd.to_datetime(dt, errors='coerce') is NaT + assert pd.to_datetime(dt, errors='coerce', cache=cache) is NaT - def test_to_datetime_array_of_dt64s(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_to_datetime_array_of_dt64s(self, cache): dts = [np.datetime64('2000-01-01'), np.datetime64('2000-01-02'), ] # Assuming all datetimes are in bounds, to_datetime() returns # an array that is equal to Timestamp() parsing tm.assert_numpy_array_equal( - pd.to_datetime(dts, box=False), + pd.to_datetime(dts, box=False, cache=cache), np.array([Timestamp(x).asm8 for x in dts]) ) @@ -209,7 +222,8 @@ def test_to_datetime_array_of_dt64s(self): errors='raise') tm.assert_numpy_array_equal( - pd.to_datetime(dts_with_oob, box=False, errors='coerce'), + pd.to_datetime(dts_with_oob, box=False, errors='coerce', + cache=cache), np.array( [ Timestamp(dts_with_oob[0]).asm8, @@ -224,20 +238,22 @@ def test_to_datetime_array_of_dt64s(self): # are converted to their .item(), which depending on the version of # numpy is either a python datetime.datetime or datetime.date tm.assert_numpy_array_equal( - pd.to_datetime(dts_with_oob, box=False, errors='ignore'), + pd.to_datetime(dts_with_oob, box=False, errors='ignore', + cache=cache), np.array( [dt.item() for dt in dts_with_oob], dtype='O' ) ) - def test_to_datetime_tz(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_to_datetime_tz(self, cache): # xref 8260 # uniform returns a DatetimeIndex arr = [pd.Timestamp('2013-01-01 13:00:00-0800', tz='US/Pacific'), pd.Timestamp('2013-01-02 14:00:00-0800', tz='US/Pacific')] - result = pd.to_datetime(arr) + result = pd.to_datetime(arr, cache=cache) expected = DatetimeIndex( ['2013-01-01 13:00:00', '2013-01-02 14:00:00'], tz='US/Pacific') tm.assert_index_equal(result, expected) @@ -245,9 +261,10 @@ def test_to_datetime_tz(self): # mixed tzs will raise arr = [pd.Timestamp('2013-01-01 13:00:00', tz='US/Pacific'), pd.Timestamp('2013-01-02 14:00:00', tz='US/Eastern')] - pytest.raises(ValueError, lambda: pd.to_datetime(arr)) + pytest.raises(ValueError, lambda: pd.to_datetime(arr, cache=cache)) - def test_to_datetime_tz_pytz(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_to_datetime_tz_pytz(self, cache): # see gh-8260 us_eastern = pytz.timezone('US/Eastern') arr = np.array([us_eastern.localize(datetime(year=2000, month=1, day=1, @@ -255,18 +272,20 @@ def test_to_datetime_tz_pytz(self): us_eastern.localize(datetime(year=2000, month=6, day=1, hour=3, minute=0))], dtype=object) - result = pd.to_datetime(arr, utc=True) + result = pd.to_datetime(arr, utc=True, cache=cache) expected = DatetimeIndex(['2000-01-01 08:00:00+00:00', '2000-06-01 07:00:00+00:00'], dtype='datetime64[ns, UTC]', freq=None) tm.assert_index_equal(result, expected) + @pytest.mark.parametrize('cache', [True, False]) @pytest.mark.parametrize("init_constructor, end_constructor, test_method", [(Index, DatetimeIndex, tm.assert_index_equal), (list, DatetimeIndex, tm.assert_index_equal), (np.array, DatetimeIndex, tm.assert_index_equal), (Series, Series, tm.assert_series_equal)]) def 
test_to_datetime_utc_true(self, + cache, init_constructor, end_constructor, test_method): @@ -277,39 +296,47 @@ def test_to_datetime_utc_true(self, result = pd.to_datetime(init_constructor(data), format='%Y%m%d %H%M%S', - utc=True) + utc=True, + cache=cache) expected = end_constructor(expected_data) test_method(result, expected) # Test scalar case as well for scalar, expected in zip(data, expected_data): - result = pd.to_datetime(scalar, format='%Y%m%d %H%M%S', utc=True) + result = pd.to_datetime(scalar, format='%Y%m%d %H%M%S', utc=True, + cache=cache) assert result == expected - def test_to_datetime_utc_true_with_series_single_value(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_to_datetime_utc_true_with_series_single_value(self, cache): # GH 15760 UTC=True with Series ts = 1.5e18 - result = pd.to_datetime(pd.Series([ts]), utc=True) + result = pd.to_datetime(pd.Series([ts]), utc=True, cache=cache) expected = pd.Series([pd.Timestamp(ts, tz='utc')]) tm.assert_series_equal(result, expected) - def test_to_datetime_utc_true_with_series_tzaware_string(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_to_datetime_utc_true_with_series_tzaware_string(self, cache): ts = '2013-01-01 00:00:00-01:00' expected_ts = '2013-01-01 01:00:00' data = pd.Series([ts] * 3) - result = pd.to_datetime(data, utc=True) + result = pd.to_datetime(data, utc=True, cache=cache) expected = pd.Series([pd.Timestamp(expected_ts, tz='utc')] * 3) tm.assert_series_equal(result, expected) + @pytest.mark.parametrize('cache', [True, False]) @pytest.mark.parametrize('date, dtype', [('2013-01-01 01:00:00', 'datetime64[ns]'), ('2013-01-01 01:00:00', 'datetime64[ns, UTC]')]) - def test_to_datetime_utc_true_with_series_datetime_ns(self, date, dtype): + def test_to_datetime_utc_true_with_series_datetime_ns(self, cache, date, + dtype): expected = pd.Series([pd.Timestamp('2013-01-01 01:00:00', tz='UTC')]) - result = pd.to_datetime(pd.Series([date], dtype=dtype), utc=True) + result = pd.to_datetime(pd.Series([date], dtype=dtype), utc=True, + cache=cache) tm.assert_series_equal(result, expected) - def test_to_datetime_tz_psycopg2(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_to_datetime_tz_psycopg2(self, cache): # xref 8260 try: @@ -324,7 +351,7 @@ def test_to_datetime_tz_psycopg2(self): datetime(2000, 6, 1, 3, 0, tzinfo=tz2)], dtype=object) - result = pd.to_datetime(arr, errors='coerce', utc=True) + result = pd.to_datetime(arr, errors='coerce', utc=True, cache=cache) expected = DatetimeIndex(['2000-01-01 08:00:00+00:00', '2000-06-01 07:00:00+00:00'], dtype='datetime64[ns, UTC]', freq=None) @@ -337,32 +364,35 @@ def test_to_datetime_tz_psycopg2(self): assert is_datetime64_ns_dtype(i) # tz coerceion - result = pd.to_datetime(i, errors='coerce') + result = pd.to_datetime(i, errors='coerce', cache=cache) tm.assert_index_equal(result, i) - result = pd.to_datetime(i, errors='coerce', utc=True) + result = pd.to_datetime(i, errors='coerce', utc=True, cache=cache) expected = pd.DatetimeIndex(['2000-01-01 13:00:00'], dtype='datetime64[ns, UTC]') tm.assert_index_equal(result, expected) - def test_datetime_bool(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_datetime_bool(self, cache): # GH13176 with pytest.raises(TypeError): to_datetime(False) - assert to_datetime(False, errors="coerce") is NaT - assert to_datetime(False, errors="ignore") is False + assert to_datetime(False, errors="coerce", cache=cache) is NaT + assert to_datetime(False, errors="ignore", cache=cache) 
is False with pytest.raises(TypeError): to_datetime(True) - assert to_datetime(True, errors="coerce") is NaT - assert to_datetime(True, errors="ignore") is True + assert to_datetime(True, errors="coerce", cache=cache) is NaT + assert to_datetime(True, errors="ignore", cache=cache) is True with pytest.raises(TypeError): - to_datetime([False, datetime.today()]) + to_datetime([False, datetime.today()], cache=cache) with pytest.raises(TypeError): - to_datetime(['20130101', True]) + to_datetime(['20130101', True], cache=cache) tm.assert_index_equal(to_datetime([0, False, NaT, 0.0], - errors="coerce"), - DatetimeIndex([to_datetime(0), NaT, - NaT, to_datetime(0)])) + errors="coerce", cache=cache), + DatetimeIndex([to_datetime(0, cache=cache), + NaT, + NaT, + to_datetime(0, cache=cache)])) def test_datetime_invalid_datatype(self): # GH13176 @@ -421,71 +451,77 @@ def test_week_without_day_and_calendar_year(self, date, format): class TestToDatetimeUnit(object): - def test_unit(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_unit(self, cache): # GH 11758 # test proper behavior with erros with pytest.raises(ValueError): - to_datetime([1], unit='D', format='%Y%m%d') + to_datetime([1], unit='D', format='%Y%m%d', cache=cache) values = [11111111, 1, 1.0, tslib.iNaT, NaT, np.nan, 'NaT', ''] - result = to_datetime(values, unit='D', errors='ignore') + result = to_datetime(values, unit='D', errors='ignore', cache=cache) expected = Index([11111111, Timestamp('1970-01-02'), Timestamp('1970-01-02'), NaT, NaT, NaT, NaT, NaT], dtype=object) tm.assert_index_equal(result, expected) - result = to_datetime(values, unit='D', errors='coerce') + result = to_datetime(values, unit='D', errors='coerce', cache=cache) expected = DatetimeIndex(['NaT', '1970-01-02', '1970-01-02', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT']) tm.assert_index_equal(result, expected) with pytest.raises(tslib.OutOfBoundsDatetime): - to_datetime(values, unit='D', errors='raise') + to_datetime(values, unit='D', errors='raise', cache=cache) values = [1420043460000, tslib.iNaT, NaT, np.nan, 'NaT'] - result = to_datetime(values, errors='ignore', unit='s') + result = to_datetime(values, errors='ignore', unit='s', cache=cache) expected = Index([1420043460000, NaT, NaT, NaT, NaT], dtype=object) tm.assert_index_equal(result, expected) - result = to_datetime(values, errors='coerce', unit='s') + result = to_datetime(values, errors='coerce', unit='s', cache=cache) expected = DatetimeIndex(['NaT', 'NaT', 'NaT', 'NaT', 'NaT']) tm.assert_index_equal(result, expected) with pytest.raises(tslib.OutOfBoundsDatetime): - to_datetime(values, errors='raise', unit='s') + to_datetime(values, errors='raise', unit='s', cache=cache) # if we have a string, then we raise a ValueError # and NOT an OutOfBoundsDatetime for val in ['foo', Timestamp('20130101')]: try: - to_datetime(val, errors='raise', unit='s') + to_datetime(val, errors='raise', unit='s', cache=cache) except tslib.OutOfBoundsDatetime: raise AssertionError("incorrect exception raised") except ValueError: pass - def test_unit_consistency(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_unit_consistency(self, cache): # consistency of conversions expected = Timestamp('1970-05-09 14:25:11') - result = pd.to_datetime(11111111, unit='s', errors='raise') + result = pd.to_datetime(11111111, unit='s', errors='raise', + cache=cache) assert result == expected assert isinstance(result, Timestamp) - result = pd.to_datetime(11111111, unit='s', errors='coerce') + result = pd.to_datetime(11111111, 
unit='s', errors='coerce', + cache=cache) assert result == expected assert isinstance(result, Timestamp) - result = pd.to_datetime(11111111, unit='s', errors='ignore') + result = pd.to_datetime(11111111, unit='s', errors='ignore', + cache=cache) assert result == expected assert isinstance(result, Timestamp) - def test_unit_with_numeric(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_unit_with_numeric(self, cache): # GH 13180 # coercions from floats/ints are ok @@ -494,10 +530,10 @@ def test_unit_with_numeric(self): arr1 = [1.434692e+18, 1.432766e+18] arr2 = np.array(arr1).astype('int64') for errors in ['ignore', 'raise', 'coerce']: - result = pd.to_datetime(arr1, errors=errors) + result = pd.to_datetime(arr1, errors=errors, cache=cache) tm.assert_index_equal(result, expected) - result = pd.to_datetime(arr2, errors=errors) + result = pd.to_datetime(arr2, errors=errors, cache=cache) tm.assert_index_equal(result, expected) # but we want to make sure that we are coercing @@ -506,7 +542,7 @@ def test_unit_with_numeric(self): '2015-06-19 05:33:20', '2015-05-27 22:33:20']) arr = ['foo', 1.434692e+18, 1.432766e+18] - result = pd.to_datetime(arr, errors='coerce') + result = pd.to_datetime(arr, errors='coerce', cache=cache) tm.assert_index_equal(result, expected) expected = DatetimeIndex(['2015-06-19 05:33:20', @@ -514,31 +550,33 @@ def test_unit_with_numeric(self): 'NaT', 'NaT']) arr = [1.434692e+18, 1.432766e+18, 'foo', 'NaT'] - result = pd.to_datetime(arr, errors='coerce') + result = pd.to_datetime(arr, errors='coerce', cache=cache) tm.assert_index_equal(result, expected) - def test_unit_mixed(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_unit_mixed(self, cache): # mixed integers/datetimes expected = DatetimeIndex(['2013-01-01', 'NaT', 'NaT']) arr = [pd.Timestamp('20130101'), 1.434692e+18, 1.432766e+18] - result = pd.to_datetime(arr, errors='coerce') + result = pd.to_datetime(arr, errors='coerce', cache=cache) tm.assert_index_equal(result, expected) with pytest.raises(ValueError): - pd.to_datetime(arr, errors='raise') + pd.to_datetime(arr, errors='raise', cache=cache) expected = DatetimeIndex(['NaT', 'NaT', '2013-01-01']) arr = [1.434692e+18, 1.432766e+18, pd.Timestamp('20130101')] - result = pd.to_datetime(arr, errors='coerce') + result = pd.to_datetime(arr, errors='coerce', cache=cache) tm.assert_index_equal(result, expected) with pytest.raises(ValueError): - pd.to_datetime(arr, errors='raise') + pd.to_datetime(arr, errors='raise', cache=cache) - def test_dataframe(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_dataframe(self, cache): df = DataFrame({'year': [2015, 2016], 'month': [2, 3], @@ -552,19 +590,20 @@ def test_dataframe(self): result = to_datetime({'year': df['year'], 'month': df['month'], - 'day': df['day']}) + 'day': df['day']}, cache=cache) expected = Series([Timestamp('20150204 00:00:00'), Timestamp('20160305 00:0:00')]) assert_series_equal(result, expected) # dict-like - result = to_datetime(df[['year', 'month', 'day']].to_dict()) + result = to_datetime(df[['year', 'month', 'day']].to_dict(), + cache=cache) assert_series_equal(result, expected) # dict but with constructable df2 = df[['year', 'month', 'day']].to_dict() df2['month'] = 2 - result = to_datetime(df2) + result = to_datetime(df2, cache=cache) expected2 = Series([Timestamp('20150204 00:00:00'), Timestamp('20160205 00:0:00')]) assert_series_equal(result, expected2) @@ -585,7 +624,8 @@ def test_dataframe(self): ] for d in units: - result = 
to_datetime(df[list(d.keys())].rename(columns=d)) + result = to_datetime(df[list(d.keys())].rename(columns=d), + cache=cache) expected = Series([Timestamp('20150204 06:58:10'), Timestamp('20160305 07:59:11')]) assert_series_equal(result, expected) @@ -600,13 +640,13 @@ def test_dataframe(self): 'us': 'us', 'ns': 'ns'} - result = to_datetime(df.rename(columns=d)) + result = to_datetime(df.rename(columns=d), cache=cache) expected = Series([Timestamp('20150204 06:58:10.001002003'), Timestamp('20160305 07:59:11.001002003')]) assert_series_equal(result, expected) # coerce back to int - result = to_datetime(df.astype(str)) + result = to_datetime(df.astype(str), cache=cache) assert_series_equal(result, expected) # passing coerce @@ -617,8 +657,8 @@ def test_dataframe(self): msg = ("cannot assemble the datetimes: time data .+ does not " "match format '%Y%m%d' \(match\)") with tm.assert_raises_regex(ValueError, msg): - to_datetime(df2) - result = to_datetime(df2, errors='coerce') + to_datetime(df2, cache=cache) + result = to_datetime(df2, errors='coerce', cache=cache) expected = Series([Timestamp('20150204 00:00:00'), NaT]) assert_series_equal(result, expected) @@ -629,7 +669,7 @@ def test_dataframe(self): with tm.assert_raises_regex(ValueError, msg): df2 = df.copy() df2['foo'] = 1 - to_datetime(df2) + to_datetime(df2, cache=cache) # not enough msg = ('to assemble mappings requires at least that \[year, month, ' @@ -640,7 +680,7 @@ def test_dataframe(self): ['month', 'day'], ['year', 'day', 'second']]: with tm.assert_raises_regex(ValueError, msg): - to_datetime(df[c]) + to_datetime(df[c], cache=cache) # duplicates msg = 'cannot assemble with duplicate keys' @@ -649,7 +689,7 @@ def test_dataframe(self): 'day': [4, 5]}) df2.columns = ['year', 'year', 'day'] with tm.assert_raises_regex(ValueError, msg): - to_datetime(df2) + to_datetime(df2, cache=cache) df2 = DataFrame({'year': [2015, 2016], 'month': [2, 20], @@ -657,16 +697,17 @@ def test_dataframe(self): 'hour': [4, 5]}) df2.columns = ['year', 'month', 'day', 'day'] with tm.assert_raises_regex(ValueError, msg): - to_datetime(df2) + to_datetime(df2, cache=cache) - def test_dataframe_dtypes(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_dataframe_dtypes(self, cache): # #13451 df = DataFrame({'year': [2015, 2016], 'month': [2, 3], 'day': [4, 5]}) # int16 - result = to_datetime(df.astype('int16')) + result = to_datetime(df.astype('int16'), cache=cache) expected = Series([Timestamp('20150204 00:00:00'), Timestamp('20160305 00:00:00')]) assert_series_equal(result, expected) @@ -674,7 +715,7 @@ def test_dataframe_dtypes(self): # mixed dtypes df['month'] = df['month'].astype('int8') df['day'] = df['day'].astype('int8') - result = to_datetime(df) + result = to_datetime(df, cache=cache) expected = Series([Timestamp('20150204 00:00:00'), Timestamp('20160305 00:00:00')]) assert_series_equal(result, expected) @@ -684,18 +725,19 @@ def test_dataframe_dtypes(self): 'month': [1.5, 1], 'day': [1, 1]}) with pytest.raises(ValueError): - to_datetime(df) + to_datetime(df, cache=cache) class TestToDatetimeMisc(object): - def test_index_to_datetime(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_index_to_datetime(self, cache): idx = Index(['1/1/2000', '1/2/2000', '1/3/2000']) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = idx.to_datetime() - expected = DatetimeIndex(pd.to_datetime(idx.values)) + expected = DatetimeIndex(pd.to_datetime(idx.values, cache=cache)) tm.assert_index_equal(result, 
expected) with tm.assert_produces_warning(FutureWarning, @@ -706,17 +748,19 @@ def test_index_to_datetime(self): expected = DatetimeIndex([today]) tm.assert_index_equal(result, expected) - def test_to_datetime_iso8601(self): - result = to_datetime(["2012-01-01 00:00:00"]) + @pytest.mark.parametrize('cache', [True, False]) + def test_to_datetime_iso8601(self, cache): + result = to_datetime(["2012-01-01 00:00:00"], cache=cache) exp = Timestamp("2012-01-01 00:00:00") assert result[0] == exp - result = to_datetime(['20121001']) # bad iso 8601 + result = to_datetime(['20121001'], cache=cache) # bad iso 8601 exp = Timestamp('2012-10-01') assert result[0] == exp - def test_to_datetime_default(self): - rs = to_datetime('2001') + @pytest.mark.parametrize('cache', [True, False]) + def test_to_datetime_default(self, cache): + rs = to_datetime('2001', cache=cache) xp = datetime(2001, 1, 1) assert rs == xp @@ -726,71 +770,80 @@ def test_to_datetime_default(self): # pytest.raises(ValueError, to_datetime('01-13-2012', # dayfirst=True)) - def test_to_datetime_on_datetime64_series(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_to_datetime_on_datetime64_series(self, cache): # #2699 s = Series(date_range('1/1/2000', periods=10)) - result = to_datetime(s) + result = to_datetime(s, cache=cache) assert result[0] == s[0] - def test_to_datetime_with_space_in_series(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_to_datetime_with_space_in_series(self, cache): # GH 6428 s = Series(['10/18/2006', '10/18/2008', ' ']) - pytest.raises(ValueError, lambda: to_datetime(s, errors='raise')) - result_coerce = to_datetime(s, errors='coerce') + pytest.raises(ValueError, lambda: to_datetime(s, + errors='raise', + cache=cache)) + result_coerce = to_datetime(s, errors='coerce', cache=cache) expected_coerce = Series([datetime(2006, 10, 18), datetime(2008, 10, 18), NaT]) tm.assert_series_equal(result_coerce, expected_coerce) - result_ignore = to_datetime(s, errors='ignore') + result_ignore = to_datetime(s, errors='ignore', cache=cache) tm.assert_series_equal(result_ignore, s) - def test_to_datetime_with_apply(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_to_datetime_with_apply(self, cache): # this is only locale tested with US/None locales tm._skip_if_has_locale() # GH 5195 # with a format and coerce a single item to_datetime fails td = Series(['May 04', 'Jun 02', 'Dec 11'], index=[1, 2, 3]) - expected = pd.to_datetime(td, format='%b %y') - result = td.apply(pd.to_datetime, format='%b %y') + expected = pd.to_datetime(td, format='%b %y', cache=cache) + result = td.apply(pd.to_datetime, format='%b %y', cache=cache) assert_series_equal(result, expected) td = pd.Series(['May 04', 'Jun 02', ''], index=[1, 2, 3]) pytest.raises(ValueError, lambda: pd.to_datetime(td, format='%b %y', - errors='raise')) + errors='raise', + cache=cache)) pytest.raises(ValueError, lambda: td.apply(pd.to_datetime, format='%b %y', - errors='raise')) - expected = pd.to_datetime(td, format='%b %y', errors='coerce') + errors='raise', cache=cache)) + expected = pd.to_datetime(td, format='%b %y', errors='coerce', + cache=cache) result = td.apply( - lambda x: pd.to_datetime(x, format='%b %y', errors='coerce')) + lambda x: pd.to_datetime(x, format='%b %y', errors='coerce', + cache=cache)) assert_series_equal(result, expected) - def test_to_datetime_types(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_to_datetime_types(self, cache): # empty string - result = to_datetime('') + result 
= to_datetime('', cache=cache) assert result is NaT - result = to_datetime(['', '']) + result = to_datetime(['', ''], cache=cache) assert isna(result).all() # ints result = Timestamp(0) - expected = to_datetime(0) + expected = to_datetime(0, cache=cache) assert result == expected # GH 3888 (strings) - expected = to_datetime(['2012'])[0] - result = to_datetime('2012') + expected = to_datetime(['2012'], cache=cache)[0] + result = to_datetime('2012', cache=cache) assert result == expected # array = ['2012','20120101','20120101 12:01:01'] array = ['20120101', '20120101 12:01:01'] - expected = list(to_datetime(array)) + expected = list(to_datetime(array, cache=cache)) result = lmap(Timestamp, array) tm.assert_almost_equal(result, expected) @@ -799,13 +852,15 @@ def test_to_datetime_types(self): # expected = to_datetime('2012') # assert result == expected - def test_to_datetime_unprocessable_input(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_to_datetime_unprocessable_input(self, cache): # GH 4928 tm.assert_numpy_array_equal( - to_datetime([1, '1'], errors='ignore'), + to_datetime([1, '1'], errors='ignore', cache=cache), np.array([1, '1'], dtype='O') ) - pytest.raises(TypeError, to_datetime, [1, '1'], errors='raise') + pytest.raises(TypeError, to_datetime, [1, '1'], errors='raise', + cache=cache) def test_to_datetime_other_datetime64_units(self): # 5/25/2012 @@ -841,7 +896,8 @@ def test_to_datetime_overflow(self): with pytest.raises(OverflowError): date_range(start='1/1/1700', freq='B', periods=100000) - def test_string_na_nat_conversion(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_string_na_nat_conversion(self, cache): # GH #999, #858 from pandas.compat import parse_date @@ -859,7 +915,7 @@ def test_string_na_nat_conversion(self): result = tslib.array_to_datetime(strings) tm.assert_almost_equal(result, expected) - result2 = to_datetime(strings) + result2 = to_datetime(strings, cache=cache) assert isinstance(result2, DatetimeIndex) tm.assert_numpy_array_equal(result, result2.values) @@ -867,22 +923,25 @@ def test_string_na_nat_conversion(self): # GH 10636, default is now 'raise' pytest.raises(ValueError, - lambda: to_datetime(malformed, errors='raise')) + lambda: to_datetime(malformed, errors='raise', + cache=cache)) - result = to_datetime(malformed, errors='ignore') + result = to_datetime(malformed, errors='ignore', cache=cache) tm.assert_numpy_array_equal(result, malformed) - pytest.raises(ValueError, to_datetime, malformed, errors='raise') + pytest.raises(ValueError, to_datetime, malformed, errors='raise', + cache=cache) idx = ['a', 'b', 'c', 'd', 'e'] series = Series(['1/1/2000', np.nan, '1/3/2000', np.nan, '1/5/2000'], index=idx, name='foo') - dseries = Series([to_datetime('1/1/2000'), np.nan, - to_datetime('1/3/2000'), np.nan, - to_datetime('1/5/2000')], index=idx, name='foo') + dseries = Series([to_datetime('1/1/2000', cache=cache), np.nan, + to_datetime('1/3/2000', cache=cache), np.nan, + to_datetime('1/5/2000', cache=cache)], + index=idx, name='foo') - result = to_datetime(series) - dresult = to_datetime(dseries) + result = to_datetime(series, cache=cache) + dresult = to_datetime(dseries, cache=cache) expected = Series(np.empty(5, dtype='M8[ns]'), index=idx) for i in range(5): @@ -890,7 +949,7 @@ def test_string_na_nat_conversion(self): if isna(x): expected[i] = tslib.iNaT else: - expected[i] = to_datetime(x) + expected[i] = to_datetime(x, cache=cache) assert_series_equal(result, expected, check_names=False) assert result.name == 'foo' @@ 
-898,26 +957,29 @@ def test_string_na_nat_conversion(self): assert_series_equal(dresult, expected, check_names=False) assert dresult.name == 'foo' - def test_dti_constructor_numpy_timeunits(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_dti_constructor_numpy_timeunits(self, cache): # GH 9114 - base = pd.to_datetime(['2000-01-01T00:00', '2000-01-02T00:00', 'NaT']) + base = pd.to_datetime(['2000-01-01T00:00', '2000-01-02T00:00', 'NaT'], + cache=cache) for dtype in ['datetime64[h]', 'datetime64[m]', 'datetime64[s]', 'datetime64[ms]', 'datetime64[us]', 'datetime64[ns]']: values = base.values.astype(dtype) tm.assert_index_equal(DatetimeIndex(values), base) - tm.assert_index_equal(to_datetime(values), base) + tm.assert_index_equal(to_datetime(values, cache=cache), base) - def test_dayfirst(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_dayfirst(self, cache): # GH 5917 arr = ['10/02/2014', '11/02/2014', '12/02/2014'] expected = DatetimeIndex([datetime(2014, 2, 10), datetime(2014, 2, 11), datetime(2014, 2, 12)]) idx1 = DatetimeIndex(arr, dayfirst=True) idx2 = DatetimeIndex(np.array(arr), dayfirst=True) - idx3 = to_datetime(arr, dayfirst=True) - idx4 = to_datetime(np.array(arr), dayfirst=True) + idx3 = to_datetime(arr, dayfirst=True, cache=cache) + idx4 = to_datetime(np.array(arr), dayfirst=True, cache=cache) idx5 = DatetimeIndex(Index(arr), dayfirst=True) idx6 = DatetimeIndex(Series(arr), dayfirst=True) tm.assert_index_equal(expected, idx1) @@ -952,7 +1014,8 @@ def test_guess_datetime_format_for_array(self): class TestToDatetimeInferFormat(object): - def test_to_datetime_infer_datetime_format_consistent_format(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_to_datetime_infer_datetime_format_consistent_format(self, cache): s = pd.Series(pd.date_range('20000101', periods=50, freq='H')) test_formats = ['%m-%d-%Y', '%m/%d/%Y %H:%M:%S.%f', @@ -961,90 +1024,113 @@ def test_to_datetime_infer_datetime_format_consistent_format(self): for test_format in test_formats: s_as_dt_strings = s.apply(lambda x: x.strftime(test_format)) - with_format = pd.to_datetime(s_as_dt_strings, format=test_format) + with_format = pd.to_datetime(s_as_dt_strings, format=test_format, + cache=cache) no_infer = pd.to_datetime(s_as_dt_strings, - infer_datetime_format=False) + infer_datetime_format=False, + cache=cache) yes_infer = pd.to_datetime(s_as_dt_strings, - infer_datetime_format=True) + infer_datetime_format=True, + cache=cache) # Whether the format is explicitly passed, it is inferred, or # it is not inferred, the results should all be the same tm.assert_series_equal(with_format, no_infer) tm.assert_series_equal(no_infer, yes_infer) - def test_to_datetime_infer_datetime_format_inconsistent_format(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_to_datetime_infer_datetime_format_inconsistent_format(self, + cache): s = pd.Series(np.array(['01/01/2011 00:00:00', '01-02-2011 00:00:00', '2011-01-03T00:00:00'])) # When the format is inconsistent, infer_datetime_format should just # fallback to the default parsing - tm.assert_series_equal(pd.to_datetime(s, infer_datetime_format=False), - pd.to_datetime(s, infer_datetime_format=True)) + tm.assert_series_equal(pd.to_datetime(s, infer_datetime_format=False, + cache=cache), + pd.to_datetime(s, infer_datetime_format=True, + cache=cache)) s = pd.Series(np.array(['Jan/01/2011', 'Feb/01/2011', 'Mar/01/2011'])) - tm.assert_series_equal(pd.to_datetime(s, infer_datetime_format=False), - pd.to_datetime(s, 
infer_datetime_format=True)) + tm.assert_series_equal(pd.to_datetime(s, infer_datetime_format=False, + cache=cache), + pd.to_datetime(s, infer_datetime_format=True, + cache=cache)) - def test_to_datetime_infer_datetime_format_series_with_nans(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_to_datetime_infer_datetime_format_series_with_nans(self, cache): s = pd.Series(np.array(['01/01/2011 00:00:00', np.nan, '01/03/2011 00:00:00', np.nan])) - tm.assert_series_equal(pd.to_datetime(s, infer_datetime_format=False), - pd.to_datetime(s, infer_datetime_format=True)) - - def test_to_datetime_infer_datetime_format_series_starting_with_nans(self): + tm.assert_series_equal(pd.to_datetime(s, infer_datetime_format=False, + cache=cache), + pd.to_datetime(s, infer_datetime_format=True, + cache=cache)) + + @pytest.mark.parametrize('cache', [True, False]) + def test_to_datetime_infer_datetime_format_series_starting_with_nans(self, + cache): s = pd.Series(np.array([np.nan, np.nan, '01/01/2011 00:00:00', '01/02/2011 00:00:00', '01/03/2011 00:00:00'])) - tm.assert_series_equal(pd.to_datetime(s, infer_datetime_format=False), - pd.to_datetime(s, infer_datetime_format=True)) + tm.assert_series_equal(pd.to_datetime(s, infer_datetime_format=False, + cache=cache), + pd.to_datetime(s, infer_datetime_format=True, + cache=cache)) - def test_to_datetime_iso8601_noleading_0s(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_to_datetime_iso8601_noleading_0s(self, cache): # GH 11871 s = pd.Series(['2014-1-1', '2014-2-2', '2015-3-3']) expected = pd.Series([pd.Timestamp('2014-01-01'), pd.Timestamp('2014-02-02'), pd.Timestamp('2015-03-03')]) - tm.assert_series_equal(pd.to_datetime(s), expected) - tm.assert_series_equal(pd.to_datetime(s, format='%Y-%m-%d'), expected) + tm.assert_series_equal(pd.to_datetime(s, cache=cache), expected) + tm.assert_series_equal(pd.to_datetime(s, format='%Y-%m-%d', + cache=cache), expected) class TestDaysInMonth(object): # tests for issue #10154 - def test_day_not_in_month_coerce(self): - assert isna(to_datetime('2015-02-29', errors='coerce')) + @pytest.mark.parametrize('cache', [True, False]) + def test_day_not_in_month_coerce(self, cache): + assert isna(to_datetime('2015-02-29', errors='coerce', cache=cache)) assert isna(to_datetime('2015-02-29', format="%Y-%m-%d", - errors='coerce')) + errors='coerce', cache=cache)) assert isna(to_datetime('2015-02-32', format="%Y-%m-%d", - errors='coerce')) + errors='coerce', cache=cache)) assert isna(to_datetime('2015-04-31', format="%Y-%m-%d", - errors='coerce')) + errors='coerce', cache=cache)) - def test_day_not_in_month_raise(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_day_not_in_month_raise(self, cache): pytest.raises(ValueError, to_datetime, '2015-02-29', - errors='raise') + errors='raise', cache=cache) pytest.raises(ValueError, to_datetime, '2015-02-29', - errors='raise', format="%Y-%m-%d") + errors='raise', format="%Y-%m-%d", cache=cache) pytest.raises(ValueError, to_datetime, '2015-02-32', - errors='raise', format="%Y-%m-%d") + errors='raise', format="%Y-%m-%d", cache=cache) pytest.raises(ValueError, to_datetime, '2015-04-31', - errors='raise', format="%Y-%m-%d") + errors='raise', format="%Y-%m-%d", cache=cache) - def test_day_not_in_month_ignore(self): - assert to_datetime('2015-02-29', errors='ignore') == '2015-02-29' + @pytest.mark.parametrize('cache', [True, False]) + def test_day_not_in_month_ignore(self, cache): + assert to_datetime('2015-02-29', errors='ignore', + cache=cache) == 
'2015-02-29' assert to_datetime('2015-02-29', errors='ignore', - format="%Y-%m-%d") == '2015-02-29' + format="%Y-%m-%d", cache=cache) == '2015-02-29' assert to_datetime('2015-02-32', errors='ignore', - format="%Y-%m-%d") == '2015-02-32' + format="%Y-%m-%d", cache=cache) == '2015-02-32' assert to_datetime('2015-04-31', errors='ignore', - format="%Y-%m-%d") == '2015-04-31' + format="%Y-%m-%d", cache=cache) == '2015-04-31' class TestDatetimeParsingWrappers(object): - def test_parsers(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_parsers(self, cache): # https://github.com/dateutil/dateutil/issues/217 import dateutil @@ -1108,7 +1194,7 @@ def test_parsers(self): result3 = to_datetime([date_str], yearfirst=yearfirst) # result5 is used below result4 = to_datetime(np.array([date_str], dtype=object), - yearfirst=yearfirst) + yearfirst=yearfirst, cache=cache) result6 = DatetimeIndex([date_str], yearfirst=yearfirst) # result7 is used below result8 = DatetimeIndex(Index([date_str]), yearfirst=yearfirst) @@ -1222,7 +1308,7 @@ def test_parsers_dayfirst_yearfirst(self): assert result2 == expected result3 = to_datetime(date_str, dayfirst=dayfirst, - yearfirst=yearfirst) + yearfirst=yearfirst, cache=cache) result4 = DatetimeIndex([date_str], dayfirst=dayfirst, yearfirst=yearfirst)[0] @@ -1231,7 +1317,8 @@ def test_parsers_dayfirst_yearfirst(self): assert result3 == expected assert result4 == expected - def test_parsers_timestring(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_parsers_timestring(self, cache): # must be the same as dateutil result cases = {'10:15': (parse('10:15'), datetime(1, 1, 1, 10, 15)), '9:05': (parse('9:05'), datetime(1, 1, 1, 9, 5))} @@ -1288,7 +1375,7 @@ def test_parsers_time(self): def test_parsers_timezone_minute_offsets_roundtrip(self): # GH11708 - base = to_datetime("2013-01-01 00:00:00") + base = to_datetime("2013-01-01 00:00:00", cache=cache) dt_strings = [ ('2013-01-01 05:45+0545', "Asia/Katmandu", @@ -1299,7 +1386,7 @@ def test_parsers_timezone_minute_offsets_roundtrip(self): ] for dt_string, tz, dt_string_repr in dt_strings: - dt_time = to_datetime(dt_string) + dt_time = to_datetime(dt_string, cache=cache) assert base == dt_time converted_time = dt_time.tz_localize('UTC').tz_convert(tz) assert dt_string_repr == repr(converted_time) From 33c79d3218cdf61006ccfdffdd717d72ef0218eb Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Mon, 25 Sep 2017 17:51:07 -0700 Subject: [PATCH 07/25] Fix asv errors and condition --- asv_bench/benchmarks/timeseries.py | 15 ++++++--------- pandas/core/tools/datetimes.py | 13 ++++++------- 2 files changed, 12 insertions(+), 16 deletions(-) diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py index d9d1b48f420e6..1ae3601320e1e 100644 --- a/asv_bench/benchmarks/timeseries.py +++ b/asv_bench/benchmarks/timeseries.py @@ -386,23 +386,20 @@ def time_format_exact(self): def time_format_no_exact(self): to_datetime(self.s, format='%d%b%y', exact=False) - def time_cache_with_unique_seconds_and unit(self): - to_datetime(self.unique_numeric_seconds, unit='s') + def time_cache_with_unique_seconds_and_unit(self): + to_datetime(self.unique_numeric_seconds, unit='s') def time_cache_with_dup_seconds_and_unit(self): - to_datetime(self.dup_numeric_seconds, unit='s') + to_datetime(self.dup_numeric_seconds, unit='s') def time_cache_with_dup_string_dates(self): - to_datetime(self.dup_string_dates) + to_datetime(self.dup_string_dates) def time_cache_with_dup_string_dates_and_format(self): - 
to_datetime(self.dup_string_dates, format='%Y-%m-%d') + to_datetime(self.dup_string_dates, format='%Y-%m-%d') def time_cache_with_dup_string_tzoffset_dates(self): - to_datetime(self.dup_string_with_tz) - - def time_cache_with_dup_string_tzoffset_dates_and_format(self): - to_datetim(self.dup_string_with_tz, format='%Y-%m-%d %H:%M:%S%z') + to_datetime(self.dup_string_with_tz) class Offsets(object): diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index ac29c7144556c..da4a37b041a8f 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -373,13 +373,12 @@ def _convert_listlike(arg, box, format, name=None, tz=tz): arg = arg + offset convert_cache = None - if cache and is_list_like(arg): - if len(arg) >= 1000: - unique_dates = algorithms.unique(arg) - if len(unique_dates) != len(arg): - from pandas import Series - cache_dates = _convert_listlike(unique_dates, False, format) - convert_cache = Series(cache_dates, index=unique_dates) + if cache and is_list_like(arg) and len(arg) >= 1000: + unique_dates = algorithms.unique(arg) + if len(unique_dates) != len(arg): + from pandas import Series + cache_dates = _convert_listlike(unique_dates, False, format) + convert_cache = Series(cache_dates, index=unique_dates) if isinstance(arg, tslib.Timestamp): result = arg From dcaafb68d742d17a89d1eba5bed7433a6bdfdeba Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Mon, 25 Sep 2017 18:19:39 -0700 Subject: [PATCH 08/25] Pep8 fixes --- pandas/tests/indexes/datetimes/test_tools.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index 1a099fe2d1505..704b2055b41d7 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -222,7 +222,7 @@ def test_to_datetime_array_of_dt64s(self, cache): errors='raise') tm.assert_numpy_array_equal( - pd.to_datetime(dts_with_oob, box=False, errors='coerce', + pd.to_datetime(dts_with_oob, box=False, errors='coerce', cache=cache), np.array( [ @@ -401,7 +401,7 @@ def test_datetime_invalid_datatype(self): pd.to_datetime(bool) with pytest.raises(TypeError): pd.to_datetime(pd.to_datetime) - + @pytest.mark.parametrize("utc", [True, None]) @pytest.mark.parametrize("format", ['%Y%m%d %H:%M:%S', None]) @pytest.mark.parametrize("box", [True, False]) @@ -1069,8 +1069,8 @@ def test_to_datetime_infer_datetime_format_series_with_nans(self, cache): cache=cache)) @pytest.mark.parametrize('cache', [True, False]) - def test_to_datetime_infer_datetime_format_series_starting_with_nans(self, - cache): + def test_to_datetime_infer_datetime_format_series_start_with_nans(self, + cache): s = pd.Series(np.array([np.nan, np.nan, '01/01/2011 00:00:00', '01/02/2011 00:00:00', '01/03/2011 00:00:00'])) From 04df9d9f5740e9856c5a40df536462ce3d9420ce Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Mon, 25 Sep 2017 20:53:58 -0700 Subject: [PATCH 09/25] Remove unused import --- pandas/core/tools/datetimes.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index da4a37b041a8f..235f9360d1287 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -21,8 +21,7 @@ is_float, is_list_like, is_scalar, - is_numeric_dtype, - is_string_dtype) + is_numeric_dtype) from pandas.core.dtypes.generic import ( ABCIndexClass, ABCSeries, ABCDataFrame) From 34b468fa263f75c1cfe7b794e2a9e116c87d84f4 Mon Sep 
17 00:00:00 2001 From: Matt Roeschke Date: Tue, 26 Sep 2017 12:21:08 -0700 Subject: [PATCH 10/25] Wrap cache logic in a function --- pandas/core/tools/datetimes.py | 57 +++++++++++++++++++++------------- 1 file changed, 35 insertions(+), 22 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 235f9360d1287..1018ac57ab870 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -116,7 +116,7 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, If True, use a cache of unique, converted dates to apply the datetime conversion. Produces signficant speed-ups when parsing duplicate date. - .. versionadded: 0.20.2 + .. versionadded: 0.21.0 Returns ------- ret : datetime if parsing succeeded. @@ -310,6 +310,28 @@ def _convert_listlike(arg, box, format, name=None, tz=tz): except (ValueError, TypeError): raise e + def _maybe_convert_cache(arg, cache): + """Try to convert the datetimelike arg using + a cache of converted dates. + + arg: datetimelike arg from to_datetime + cache: bool whether to convert using a cache + + Result: + Series of converted datetime arg or + None if the conversion failed + """ + if cache and is_list_like(arg) and len(arg) >= 1000: + unique_dates = algorithms.unique(arg) + if len(unique_dates) != len(arg): + from pandas import Series + cache_dates = _convert_listlike(unique_dates, False, format) + convert_cache = Series(cache_dates, index=unique_dates) + if not isinstance(arg, Series): + arg = Series(arg) + return arg.map(convert_cache) + return None + if arg is None: return None @@ -371,41 +393,32 @@ def _convert_listlike(arg, box, format, name=None, tz=tz): arg = np.asarray(arg) arg = arg + offset - convert_cache = None - if cache and is_list_like(arg) and len(arg) >= 1000: - unique_dates = algorithms.unique(arg) - if len(unique_dates) != len(arg): - from pandas import Series - cache_dates = _convert_listlike(unique_dates, False, format) - convert_cache = Series(cache_dates, index=unique_dates) - if isinstance(arg, tslib.Timestamp): result = arg elif isinstance(arg, ABCSeries): - if convert_cache is not None: - result = arg.map(convert_cache) - else: + result = _maybe_convert_cache(arg, cache) + if result is None: from pandas import Series values = _convert_listlike(arg._values, True, format) result = Series(values, index=arg.index, name=arg.name) elif isinstance(arg, (ABCDataFrame, MutableMapping)): result = _assemble_from_unit_mappings(arg, errors=errors) elif isinstance(arg, ABCIndexClass): - if convert_cache is not None: - from pandas import Series - result = Series(arg).map(convert_cache).values + result = _maybe_convert_cache(arg, cache) + if result is None: + result = _convert_listlike(arg, box, format, name=arg.name) + else: + result = result.values if box: result = DatetimeIndex(result, tz=tz, name=arg.name) - else: - result = _convert_listlike(arg, box, format, name=arg.name) elif is_list_like(arg): - if convert_cache is not None: - from pandas import Series - result = Series(arg).map(convert_cache).values + result = _maybe_convert_cache(arg, cache) + if result is None: + result = _convert_listlike(arg, box, format) + else: + result = result.values if box: result = DatetimeIndex(result, tz=tz) - else: - result = _convert_listlike(arg, box, format) else: result = _convert_listlike(np.array([arg]), box, format)[0] From d287cc66c7165d4adbae583bb2129e0455463d7a Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Tue, 26 Sep 2017 13:00:43 -0700 Subject: [PATCH 11/25] Fix Series test 
--- pandas/core/tools/datetimes.py | 11 ++++++----- pandas/tests/indexes/datetimes/test_tools.py | 2 +- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 1018ac57ab870..dc0d6fbf6d61f 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -310,7 +310,7 @@ def _convert_listlike(arg, box, format, name=None, tz=tz): except (ValueError, TypeError): raise e - def _maybe_convert_cache(arg, cache): + def _maybe_convert_cache(arg, cache, tz): """Try to convert the datetimelike arg using a cache of converted dates. @@ -325,7 +325,8 @@ def _maybe_convert_cache(arg, cache): unique_dates = algorithms.unique(arg) if len(unique_dates) != len(arg): from pandas import Series - cache_dates = _convert_listlike(unique_dates, False, format) + cache_dates = _convert_listlike(unique_dates, True, format, + tz=tz) convert_cache = Series(cache_dates, index=unique_dates) if not isinstance(arg, Series): arg = Series(arg) @@ -396,7 +397,7 @@ def _maybe_convert_cache(arg, cache): if isinstance(arg, tslib.Timestamp): result = arg elif isinstance(arg, ABCSeries): - result = _maybe_convert_cache(arg, cache) + result = _maybe_convert_cache(arg, cache, tz) if result is None: from pandas import Series values = _convert_listlike(arg._values, True, format) @@ -404,7 +405,7 @@ def _maybe_convert_cache(arg, cache): elif isinstance(arg, (ABCDataFrame, MutableMapping)): result = _assemble_from_unit_mappings(arg, errors=errors) elif isinstance(arg, ABCIndexClass): - result = _maybe_convert_cache(arg, cache) + result = _maybe_convert_cache(arg, cache, tz) if result is None: result = _convert_listlike(arg, box, format, name=arg.name) else: @@ -412,7 +413,7 @@ def _maybe_convert_cache(arg, cache): if box: result = DatetimeIndex(result, tz=tz, name=arg.name) elif is_list_like(arg): - result = _maybe_convert_cache(arg, cache) + result = _maybe_convert_cache(arg, cache, tz) if result is None: result = _convert_listlike(arg, box, format) else: diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index 704b2055b41d7..47bb1c2c6529b 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -425,7 +425,7 @@ def test_to_datetime_cache_series(self, utc, format): test_dates = [date] * 10**5 data = pd.Series(test_dates) result = pd.to_datetime(data, utc=utc, format=format, cache=True) - expected = pd.to_datetime(data, utc=utc, format=format) + expected = pd.to_datetime(data, utc=utc, format=format, cache=False) tm.assert_series_equal(result, expected) def test_to_datetime_cache_scalar(self): From 1bf4c9dd0ab8be7094d935a9ddc889e81e153ae6 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Tue, 26 Sep 2017 13:36:29 -0700 Subject: [PATCH 12/25] Add whatsnew and small documentation fix --- doc/source/whatsnew/v0.21.0.txt | 1 + pandas/core/tools/datetimes.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 4c460eeb85b82..9b0d9168c5704 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -999,6 +999,7 @@ Performance Improvements - :attr:`Timestamp.microsecond` no longer re-computes on attribute access (:issue:`17331`) - Improved performance of the :class:`CategoricalIndex` for data that is already categorical dtype (:issue:`17513`) - Improved performance of :meth:`RangeIndex.min` and :meth:`RangeIndex.max` by using 
``RangeIndex`` properties to perform the computations (:issue:`17607`) +- Added a keyword argument, `cache`, to :func:`to_datetime` that improved the performance of converting duplicate datetime arguments (:issue: `11665`) .. _whatsnew_0210.docs: diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index dc0d6fbf6d61f..1ea1735e11314 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -114,7 +114,7 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, .. versionadded: 0.20.0 cache : boolean, default False If True, use a cache of unique, converted dates to apply the datetime - conversion. Produces significant speed-ups when parsing duplicate date. + conversion. Produces significant speed-ups when parsing duplicate dates. .. versionadded: 0.21.1 Returns From 3ffdd461e3f02742aedc089ca5f33d4f24d5ef76 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Tue, 26 Sep 2017 13:39:03 -0700 Subject: [PATCH 13/25] pep 8 fixes --- pandas/core/tools/datetimes.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 1ea1735e11314..6009a194ac285 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -311,14 +311,14 @@ def _convert_listlike(arg, box, format, name=None, tz=tz): raise e def _maybe_convert_cache(arg, cache, tz): - """Try to convert the datetimelike arg using + """Try to convert the datetimelike arg using a cache of converted dates. - + arg: datetimelike arg from to_datetime cache: bool whether to convert using a cache - Result: - Series of converted datetime arg or + Returns: + Series of converted datetime arg or None if the conversion failed """ if cache and is_list_like(arg) and len(arg) >= 1000: from pandas import Series cache_dates = _convert_listlike(unique_dates, True, format, tz=tz) convert_cache = Series(cache_dates, index=unique_dates) From a093b88d98fab606f06d1e9a7d2fcf17d00d1434 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Mon, 9 Oct 2017 16:26:55 -0700 Subject: [PATCH 14/25] Move box logic into maybe_convert_cache --- pandas/core/tools/datetimes.py | 62 ++++++++++++++++++++-------------- 1 file changed, 37 insertions(+), 25 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 6009a194ac285..408a96b20ea73 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -112,7 +112,7 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, origin. .. versionadded: 0.20.0 - cache : boolean, default False + cache : boolean, default True If True, use a cache of unique, converted dates to apply the datetime conversion. Produces significant speed-ups when parsing duplicate dates. @@ -310,16 +310,32 @@ def _convert_listlike(arg, box, format, name=None, tz=tz): except (ValueError, TypeError): raise e - def _maybe_convert_cache(arg, cache, tz): - """Try to convert the datetimelike arg using - a cache of converted dates. - - arg: datetimelike arg from to_datetime - cache: bool whether to convert using a cache - - Returns: - Series of converted datetime arg or - None if the conversion failed + def _maybe_convert_cache(arg, cache, box, format, name=None, tz=tz): + """ + Try to convert the datetimelike arg using + a cache of converted dates. 
+ + Parameters + ---------- + arg : integer, float, string, datetime, list, tuple, 1-d array, Series + Datetime argument to convert + cache : boolean + If True, try to convert the dates with a cache + If False, short circuit and return None + Flag whether to cache the converted dates + box : boolean + If True, return a DatetimeIndex + if False, return an ndarray of values + tz : String or None + 'utc' if UTC=True was passed else None + name : String, default None + DatetimeIndex name + Returns + ------- + Series if original argument was a Series + DatetimeIndex if box=True and original argument was not a Series + ndarray if box=False and original argument was not a Series + None if the conversion failed """ if cache and is_list_like(arg) and len(arg) >= 1000: unique_dates = algorithms.unique(arg) @@ -328,9 +344,13 @@ def _maybe_convert_cache(arg, cache, tz): cache_dates = _convert_listlike(unique_dates, True, format, tz=tz) convert_cache = Series(cache_dates, index=unique_dates) - if not isinstance(arg, Series): - arg = Series(arg) - return arg.map(convert_cache) + result = Series(arg, name=name).map(convert_cache) + if isinstance(arg, Series): + return result + elif box: + return DatetimeIndex(result, name=name) + else: + return result.values return None if arg is None: @@ -397,7 +417,7 @@ def _maybe_convert_cache(arg, cache, tz): if isinstance(arg, tslib.Timestamp): result = arg elif isinstance(arg, ABCSeries): - result = _maybe_convert_cache(arg, cache, tz) + result = _maybe_convert_cache(arg, cache, box, format, name=arg.name) if result is None: from pandas import Series values = _convert_listlike(arg._values, True, format) @@ -405,21 +425,13 @@ def _maybe_convert_cache(arg, cache, tz): elif isinstance(arg, (ABCDataFrame, MutableMapping)): result = _assemble_from_unit_mappings(arg, errors=errors) elif isinstance(arg, ABCIndexClass): - result = _maybe_convert_cache(arg, cache, tz) + result = _maybe_convert_cache(arg, cache, box, format, name=arg.name) if result is None: result = _convert_listlike(arg, box, format, name=arg.name) - else: - result = result.values - if box: - result = DatetimeIndex(result, tz=tz, name=arg.name) elif is_list_like(arg): - result = _maybe_convert_cache(arg, cache, tz) + result = _maybe_convert_cache(arg, cache, box, format) if result is None: result = _convert_listlike(arg, box, format) - else: - result = result.values - if box: - result = DatetimeIndex(result, tz=tz) else: result = _convert_listlike(np.array([arg]), box, format)[0] From d1fc211936d45d6c7028e9bda2c8725ab1b115f3 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Thu, 19 Oct 2017 22:53:20 -0700 Subject: [PATCH 15/25] Use quicker unique check --- pandas/core/tools/datetimes.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 408a96b20ea73..1eca26ad9cb91 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -338,8 +338,10 @@ def _maybe_convert_cache(arg, cache, box, format, name=None, tz=tz): None if the conversion failed """ if cache and is_list_like(arg) and len(arg) >= 1000: - unique_dates = algorithms.unique(arg) - if len(unique_dates) != len(arg): + # Perform a quicker unique check + from pandas import Index + if not Index(arg).is_unique: + unique_dates = algorithms.unique(arg) from pandas import Series cache_dates = _convert_listlike(unique_dates, True, format, tz=tz) From 9486df3ef84cf87cad98c18181e9853def28c649 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Sat, 4 
Nov 2017 00:31:50 -0700 Subject: [PATCH 16/25] Move caching function outside to_datetime --- pandas/core/indexes/datetimes.py | 2 +- pandas/core/tools/datetimes.py | 90 +++++++++----------- pandas/tests/indexes/datetimes/test_tools.py | 6 +- 3 files changed, 43 insertions(+), 55 deletions(-) diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index dd1c7306d2c26..78869de318dce 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -339,7 +339,7 @@ def __new__(cls, data=None, if not (is_datetime64_dtype(data) or is_datetimetz(data) or is_integer_dtype(data)): data = tools.to_datetime(data, dayfirst=dayfirst, - yearfirst=yearfirst, cache=False) + yearfirst=yearfirst) if issubclass(data.dtype.type, np.datetime64) or is_datetimetz(data): diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 1eca26ad9cb91..fd96d819ed201 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -35,11 +35,35 @@ def _guess_datetime_format_for_array(arr, **kwargs): if len(non_nan_elements): return _guess_datetime_format(arr[non_nan_elements[0]], **kwargs) +def _maybe_cache(arg, format, cache, tz, _convert_listlike): + """Create a cache of unique dates from an array of dates""" + from pandas import Series + cache_array = Series() + if cache: + # Perform a quicker unique check + from pandas import Index + if not Index(arg).is_unique: + unique_dates = algorithms.unique(arg) + cache_dates = _convert_listlike(unique_dates, True, format, + tz=tz) + cache_array = Series(cache_dates, index=unique_dates) + return cache_array + +def _convert_and_box_cache(arg, cache_array, box, name=None): + """Convert array of dates with a cache and box the result""" + from pandas import Series + from pandas.core.indexes.datetimes import DatetimeIndex + result = Series(arg).map(cache_array) + if box: + result = DatetimeIndex(result, name=name) + else: + result = result.values + return result def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, utc=None, box=True, format=None, exact=True, unit=None, infer_datetime_format=False, origin='unix', - cache=True): + cache=False): """ Convert argument to datetime. @@ -310,51 +334,6 @@ def _convert_listlike(arg, box, format, name=None, tz=tz): except (ValueError, TypeError): raise e - def _maybe_convert_cache(arg, cache, box, format, name=None, tz=tz): - """ - Try to convert the datetimelike arg using - a cache of converted dates. 
- - Parameters - ---------- - arg : integer, float, string, datetime, list, tuple, 1-d array, Series - Datetime argument to convert - cache : boolean - If True, try to convert the dates with a cache - If False, short circuit and return None - Flag whether to cache the converted dates - box : boolean - If True, return a DatetimeIndex - if False, return an ndarray of values - tz : String or None - 'utc' if UTC=True was passed else None - name : String, default None - DatetimeIndex name - Returns - ------- - Series if original argument was a Series - DatetimeIndex if box=True and original argument was not a Series - ndarray if box=False and original argument was not a Series - None if the conversion failed - """ - if cache and is_list_like(arg) and len(arg) >= 1000: - # Perform a quicker unique check - from pandas import Index - if not Index(arg).is_unique: - unique_dates = algorithms.unique(arg) - from pandas import Series - cache_dates = _convert_listlike(unique_dates, True, format, - tz=tz) - convert_cache = Series(cache_dates, index=unique_dates) - result = Series(arg, name=name).map(convert_cache) - if isinstance(arg, Series): - return result - elif box: - return DatetimeIndex(result, name=name) - else: - return result.values - return None - if arg is None: return None @@ -419,20 +398,27 @@ def _maybe_convert_cache(arg, cache, box, format, name=None, tz=tz): if isinstance(arg, tslib.Timestamp): result = arg elif isinstance(arg, ABCSeries): - result = _maybe_convert_cache(arg, cache, box, format, name=arg.name) - if result is None: + cache_array = _maybe_cache(arg, format, cache, tz, _convert_listlike) + if not cache_array.empty: + result = arg.map(cache_array) + else: from pandas import Series values = _convert_listlike(arg._values, True, format) result = Series(values, index=arg.index, name=arg.name) elif isinstance(arg, (ABCDataFrame, MutableMapping)): result = _assemble_from_unit_mappings(arg, errors=errors) elif isinstance(arg, ABCIndexClass): - result = _maybe_convert_cache(arg, cache, box, format, name=arg.name) - if result is None: + cache_array = _maybe_cache(arg, format, cache, tz, _convert_listlike) + if not cache_array.empty: + result = _convert_and_box_cache(arg, cache_array, box, + name=arg.name) + else: result = _convert_listlike(arg, box, format, name=arg.name) elif is_list_like(arg): - result = _maybe_convert_cache(arg, cache, box, format) - if result is None: + cache_array = _maybe_cache(arg, format, cache, tz, _convert_listlike) + if not cache_array.empty: + result = _convert_and_box_cache(arg, cache_array, box) + else: result = _convert_listlike(arg, box, format) else: result = _convert_listlike(np.array([arg]), box, format)[0] diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index 47bb1c2c6529b..ede85a1d97bfd 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -1224,7 +1224,8 @@ def test_parsers(self, cache): assert result3 is tslib.NaT assert result4 is tslib.NaT - def test_parsers_dayfirst_yearfirst(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_parsers_dayfirst_yearfirst(self, cache): # OK # 2.5.1 10-11-12 [dayfirst=0, yearfirst=0] -> 2012-10-11 00:00:00 # 2.5.2 10-11-12 [dayfirst=0, yearfirst=1] -> 2012-10-11 00:00:00 @@ -1373,7 +1374,8 @@ def test_parsers_time(self): assert isinstance(res, list) assert res == expected_arr - def test_parsers_timezone_minute_offsets_roundtrip(self): + @pytest.mark.parametrize('cache', [True, 
False]) + def test_parsers_timezone_minute_offsets_roundtrip(self, cache): # GH11708 base = to_datetime("2013-01-01 00:00:00", cache=cache) dt_strings = [ From d059d44706729f06545859ada4a6efecb2d894b1 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Sat, 4 Nov 2017 22:55:22 -0700 Subject: [PATCH 17/25] Pass most tests --- pandas/core/tools/datetimes.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index fd96d819ed201..9f53cf5b91e51 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -49,15 +49,17 @@ def _maybe_cache(arg, format, cache, tz, _convert_listlike): cache_array = Series(cache_dates, index=unique_dates) return cache_array -def _convert_and_box_cache(arg, cache_array, box, name=None): +def _convert_and_box_cache(arg, cache_array, box, errors, tz, name=None): """Convert array of dates with a cache and box the result""" from pandas import Series from pandas.core.indexes.datetimes import DatetimeIndex result = Series(arg).map(cache_array) if box: - result = DatetimeIndex(result, name=name) - else: - result = result.values + if errors == 'ignore': + from pandas import Index + result = Index(result) + else: + result = DatetimeIndex(result, tz=tz, name=name) return result def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, @@ -410,14 +412,14 @@ def _convert_listlike(arg, box, format, name=None, tz=tz): elif isinstance(arg, ABCIndexClass): cache_array = _maybe_cache(arg, format, cache, tz, _convert_listlike) if not cache_array.empty: - result = _convert_and_box_cache(arg, cache_array, box, + result = _convert_and_box_cache(arg, cache_array, box, errors, tz, name=arg.name) else: result = _convert_listlike(arg, box, format, name=arg.name) elif is_list_like(arg): cache_array = _maybe_cache(arg, format, cache, tz, _convert_listlike) if not cache_array.empty: - result = _convert_and_box_cache(arg, cache_array, box) + result = _convert_and_box_cache(arg, cache_array, box, errors, tz) else: result = _convert_listlike(arg, box, format) else: From 02ab4f38105204d9dc323e71331336ef8fa4982f Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Sun, 5 Nov 2017 12:36:42 -0800 Subject: [PATCH 18/25] Skip test related to GH 18111, lint --- doc/source/whatsnew/v0.21.0.txt | 1 - doc/source/whatsnew/v0.21.1.txt | 2 +- pandas/core/tools/datetimes.py | 9 ++++++--- pandas/tests/indexes/datetimes/test_tools.py | 6 ++++-- 4 files changed, 11 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 9b0d9168c5704..4c460eeb85b82 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -999,7 +999,6 @@ Performance Improvements - :attr:`Timestamp.microsecond` no longer re-computes on attribute access (:issue:`17331`) - Improved performance of the :class:`CategoricalIndex` for data that is already categorical dtype (:issue:`17513`) - Improved performance of :meth:`RangeIndex.min` and :meth:`RangeIndex.max` by using ``RangeIndex`` properties to perform the computations (:issue:`17607`) -- Added a keyword argument, `cache`, to :func:`to_datetime` that improved the performance of converting duplicate datetime arguments (:issue: `11665`) .. 
_whatsnew_0210.docs: diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index 185f08514641f..9534d582591b2 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -39,7 +39,7 @@ Deprecations Performance Improvements ~~~~~~~~~~~~~~~~~~~~~~~~ -- +- Added a keyword argument, `cache`, to :func:`to_datetime` that improved the performance of converting duplicate datetime arguments (:issue: `11665`) - - diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 9f53cf5b91e51..2aecdf35c67c0 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -35,6 +35,7 @@ def _guess_datetime_format_for_array(arr, **kwargs): if len(non_nan_elements): return _guess_datetime_format(arr[non_nan_elements[0]], **kwargs) + def _maybe_cache(arg, format, cache, tz, _convert_listlike): """Create a cache of unique dates from an array of dates""" from pandas import Series @@ -49,6 +50,7 @@ def _maybe_cache(arg, format, cache, tz, _convert_listlike): cache_array = Series(cache_dates, index=unique_dates) return cache_array + def _convert_and_box_cache(arg, cache_array, box, errors, tz, name=None): """Convert array of dates with a cache and box the result""" from pandas import Series @@ -62,6 +64,7 @@ def _convert_and_box_cache(arg, cache_array, box, errors, tz, name=None): result = DatetimeIndex(result, tz=tz, name=name) return result + def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, utc=None, box=True, format=None, exact=True, unit=None, infer_datetime_format=False, origin='unix', @@ -138,11 +141,11 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, origin. .. versionadded: 0.20.0 - cache : boolean, default True - If True, use a cache of unique, converted dates to apply the datetime + cache : boolean, default False + If False, use a cache of unique, converted dates to apply the datetime conversion. Produces significant speed-ups when parsing duplicate dates. - .. versionadded: 0.21.0 + .. versionadded: 0.21.1 Returns ------- ret : datetime if parsing succeeded. diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index 704b2055b41d7..cf2f2f56e55cb 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -372,7 +372,10 @@ def test_to_datetime_tz_psycopg2(self, cache): dtype='datetime64[ns, UTC]') tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('cache', [True, False]) + bool_skip = pytest.mark.skipif(True, reason="GH 18111") + + @pytest.mark.parametrize('cache', [pytest.param(True, marks=bool_skip), + False]) def test_datetime_bool(self, cache): # GH13176 with pytest.raises(TypeError): @@ -450,7 +453,6 @@ def test_week_without_day_and_calendar_year(self, date, format): class TestToDatetimeUnit(object): - @pytest.mark.parametrize('cache', [True, False]) def test_unit(self, cache): # GH 11758 From 82f36d39f00b0f670661ae3c3d93768440ce2503 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Sun, 5 Nov 2017 16:42:48 -0800 Subject: [PATCH 19/25] Update docstring --- pandas/core/tools/datetimes.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 2aecdf35c67c0..aa2dd088999ee 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -143,7 +143,8 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, .. 
versionadded: 0.20.0 cache : boolean, default False If False, use a cache of unique, converted dates to apply the datetime - conversion. Produces significant speed-ups when parsing duplicate dates. + conversion. May produce significant speed-up when parsing duplicate date + strings, especially ones with timezone offsets. .. versionadded: 0.21.1 Returns From 76547e1631094680c3d7c34a2ee1dd79be9d20aa Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Tue, 7 Nov 2017 20:44:17 -0800 Subject: [PATCH 20/25] adjust imports, docs and move whatsnew --- doc/source/whatsnew/v0.21.1.txt | 2 +- doc/source/whatsnew/v0.22.0.txt | 2 +- pandas/core/tools/datetimes.py | 8 +++----- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index 9534d582591b2..4483cb90a6f48 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -39,7 +39,7 @@ Deprecations Performance Improvements ~~~~~~~~~~~~~~~~~~~~~~~~ -- Added a keyword argument, `cache`, to :func:`to_datetime` that improved the performance of converting duplicate datetime arguments (:issue: `11665`) +- - - diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 61679b14a8592..712119caae6f2 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -70,7 +70,7 @@ Performance Improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - Indexers on ``Series`` or ``DataFrame`` no longer create a reference cycle (:issue:`17956`) -- +- Added a keyword argument, ``cache``, to :func:`to_datetime` that improved the performance of converting duplicate datetime arguments (:issue:`11665`) - .. _whatsnew_0220.docs: diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index aa2dd088999ee..8e2894d8d917e 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -53,12 +53,10 @@ def _maybe_cache(arg, format, cache, tz, _convert_listlike): def _convert_and_box_cache(arg, cache_array, box, errors, tz, name=None): """Convert array of dates with a cache and box the result""" - from pandas import Series - from pandas.core.indexes.datetimes import DatetimeIndex + from pandas import Series, DatetimeIndex, Index result = Series(arg).map(cache_array) if box: if errors == 'ignore': - from pandas import Index result = Index(result) else: result = DatetimeIndex(result, tz=tz, name=name) @@ -142,11 +140,11 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, .. versionadded: 0.20.0 cache : boolean, default False - If False, use a cache of unique, converted dates to apply the datetime + If True, use a cache of unique, converted dates to apply the datetime conversion. May produce significant speed-up when parsing duplicate date strings, especially ones with timezone offsets. - .. versionadded: 0.21.1 + .. versionadded: 0.22.0 Returns ------- ret : datetime if parsing succeeded. 
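The two helpers the series has now settled on split the work cleanly: _maybe_cache parses each unique value once into a lookup Series, and _convert_and_box_cache maps the full argument through that Series before boxing the result. For readers following along, here is a minimal standalone sketch of the same map-based technique outside the pandas internals; the name parse_with_cache is illustrative only, and the sketch ignores the error-handling, timezone, box, and length-threshold details the real helpers carry:

    import numpy as np
    import pandas as pd

    def parse_with_cache(arg):
        # Parse each distinct value once; reuse the result for every duplicate.
        unique_dates = pd.unique(np.asarray(arg, dtype=object))
        if len(unique_dates) == len(arg):
            # No duplicates: building a cache would only add overhead.
            return pd.to_datetime(arg)
        cache = pd.Series(pd.to_datetime(unique_dates), index=unique_dates)
        # Series.map does a hash lookup per element instead of a re-parse.
        return pd.DatetimeIndex(pd.Series(arg).map(cache))

    print(parse_with_cache(['2013-01-01', '2013-01-02'] * 3))

For n values containing u unique dates, the expensive string parsing runs u times rather than n; the remaining cost is a single vectorized map.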
From 590c9cc0547ef553394cb31afacad6ed49bbf26c Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Tue, 7 Nov 2017 20:45:57 -0800 Subject: [PATCH 21/25] Remove whitespace --- doc/source/whatsnew/v0.21.1.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index 4483cb90a6f48..185f08514641f 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -39,7 +39,7 @@ Deprecations Performance Improvements ~~~~~~~~~~~~~~~~~~~~~~~~ -- +- - - From 9a985acb1557b88ab8cc8fe4f4cecf60811212dc Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Wed, 8 Nov 2017 22:42:48 -0800 Subject: [PATCH 22/25] Address comments --- asv_bench/benchmarks/timeseries.py | 35 ++++++++++++++------ pandas/core/tools/datetimes.py | 33 ++++++++++++++++-- pandas/tests/indexes/datetimes/test_tools.py | 7 ++-- 3 files changed, 59 insertions(+), 16 deletions(-) diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py index 1ae3601320e1e..9614a63332609 100644 --- a/asv_bench/benchmarks/timeseries.py +++ b/asv_bench/benchmarks/timeseries.py @@ -386,20 +386,35 @@ def time_format_exact(self): def time_format_no_exact(self): to_datetime(self.s, format='%d%b%y', exact=False) - def time_cache_with_unique_seconds_and_unit(self): - to_datetime(self.unique_numeric_seconds, unit='s') + def time_cache_true_with_unique_seconds_and_unit(self): + to_datetime(self.unique_numeric_seconds, unit='s', cache=True) - def time_cache_with_dup_seconds_and_unit(self): - to_datetime(self.dup_numeric_seconds, unit='s') + def time_cache_false_with_unique_seconds_and_unit(self): + to_datetime(self.unique_numeric_seconds, unit='s', cache=False) - def time_cache_with_dup_string_dates(self): - to_datetime(self.dup_string_dates) + def time_cache_true_with_dup_seconds_and_unit(self): + to_datetime(self.dup_numeric_seconds, unit='s', cache=True) - def time_cache_with_dup_string_dates_and_format(self): - to_datetime(self.dup_string_dates, format='%Y-%m-%d') + def time_cache_false_with_dup_seconds_and_unit(self): + to_datetime(self.dup_numeric_seconds, unit='s', cache=False) - def time_cache_with_dup_string_tzoffset_dates(self): - to_datetime(self.dup_string_with_tz) + def time_cache_true_with_dup_string_dates(self): + to_datetime(self.dup_string_dates, cache=True) + + def time_cache_false_with_dup_string_dates(self): + to_datetime(self.dup_string_dates, cache=False) + + def time_cache_true_with_dup_string_dates_and_format(self): + to_datetime(self.dup_string_dates, format='%Y-%m-%d', cache=True) + + def time_cache_false_with_dup_string_dates_and_format(self): + to_datetime(self.dup_string_dates, format='%Y-%m-%d', cache=False) + + def time_cache_true_with_dup_string_tzoffset_dates(self): + to_datetime(self.dup_string_with_tz, cache=True) + + def time_cache_false_with_dup_string_tzoffset_dates(self): + to_datetime(self.dup_string_with_tz, cache=False) class Offsets(object): diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 8e2894d8d917e..95e0c8b552ee1 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -37,7 +37,21 @@ def _guess_datetime_format_for_array(arr, **kwargs): def _maybe_cache(arg, format, cache, tz, _convert_listlike): - """Create a cache of unique dates from an array of dates""" + """ + Create a cache of unique dates from an array of dates + + Parameters + ---------- + arg : integer, float, string, datetime, list, tuple, 1-d array of dates + format : string, 
strftime to parse time + cache: boolean, whether to convert with cache + tz: string, timezone of the dates + _convert_listlike: function, conversion function to apply on dates + + Returns + ------- + cache_array: Series, cache of converted, unique dates, can be empty + """ from pandas import Series cache_array = Series() if cache: @@ -52,7 +66,22 @@ def _maybe_cache(arg, format, cache, tz, _convert_listlike): def _convert_and_box_cache(arg, cache_array, box, errors, tz, name=None): - """Convert array of dates with a cache and box the result""" + """ + Convert array of dates with a cache and box the result + + Parameters + ---------- + arg : integer, float, string, datetime, list, tuple, 1-d array of dates + cache_array: Series, cache of converted, unique dates + box: boolean, True boxes result as an Index-like + errors: string, 'ignore' plus box=True will convert result to Index + tz: string, timezone of the dates + name: string, default None. name for a DatetimeIndex + + Returns + ------- + result: Index-like if box=True else array-like of converted dates + """ from pandas import Series, DatetimeIndex, Index result = Series(arg).map(cache_array) if box: diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index cf2f2f56e55cb..1772c0c098b5d 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -372,10 +372,9 @@ def test_to_datetime_tz_psycopg2(self, cache): dtype='datetime64[ns, UTC]') tm.assert_index_equal(result, expected) - bool_skip = pytest.mark.skipif(True, reason="GH 18111") - - @pytest.mark.parametrize('cache', [pytest.param(True, marks=bool_skip), - False]) + @pytest.mark.parametrize('cache', + [pytest.param(True, marks=pytest.mark.skipif(True, reason="GH 18111")), + False]) def test_datetime_bool(self, cache): # GH13176 with pytest.raises(TypeError): From 85a1f2da24eff47358ec90f8ee5fc803639d0933 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Wed, 8 Nov 2017 22:58:35 -0800 Subject: [PATCH 23/25] Lint fix --- pandas/tests/indexes/datetimes/test_tools.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index 1772c0c098b5d..abaef39cb58ab 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -372,9 +372,11 @@ def test_to_datetime_tz_psycopg2(self, cache): dtype='datetime64[ns, UTC]') tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('cache', - [pytest.param(True, marks=pytest.mark.skipif(True, reason="GH 18111")), - False]) + @pytest.mark.parametrize( + 'cache', + [pytest.param(True, + marks=pytest.mark.skipif(True, reason="GH 18111")), + False]) def test_datetime_bool(self, cache): # GH13176 with pytest.raises(TypeError): From 49f5850148ea5f0904bb2817e2dbc0eca99d3516 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Thu, 9 Nov 2017 20:07:41 -0800 Subject: [PATCH 24/25] Move docs and adjust test --- pandas/core/tools/datetimes.py | 55 ++++++++++++-------- pandas/tests/indexes/datetimes/test_tools.py | 3 +- 2 files changed, 35 insertions(+), 23 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 95e0c8b552ee1..029b166307936 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -36,21 +36,26 @@ def _guess_datetime_format_for_array(arr, **kwargs): return _guess_datetime_format(arr[non_nan_elements[0]], **kwargs) 
-def _maybe_cache(arg, format, cache, tz, _convert_listlike): +def _maybe_cache(arg, format, cache, tz, convert_listlike): """ Create a cache of unique dates from an array of dates Parameters ---------- - arg : integer, float, string, datetime, list, tuple, 1-d array of dates - format : string, strftime to parse time - cache: boolean, whether to convert with cache - tz: string, timezone of the dates - _convert_listlike: function, conversion function to apply on dates + arg : integer, float, string, datetime, list, tuple, 1-d array, Series + format : string + Strftime format to parse time + cache : boolean + True attempts to create a cache of converted values + tz : string + Timezone of the dates + convert_listlike : function + Conversion function to apply on dates Returns ------- - cache_array: Series, cache of converted, unique dates, can be empty + cache_array : Series + Cache of converted, unique dates. Can be empty """ from pandas import Series cache_array = Series() @@ -59,37 +64,43 @@ def _maybe_cache(arg, format, cache, tz, _convert_listlike): from pandas import Index if not Index(arg).is_unique: unique_dates = algorithms.unique(arg) - cache_dates = _convert_listlike(unique_dates, True, format, - tz=tz) + cache_dates = convert_listlike(unique_dates, True, format, tz=tz) cache_array = Series(cache_dates, index=unique_dates) return cache_array -def _convert_and_box_cache(arg, cache_array, box, errors, tz, name=None): +def _convert_and_box_cache(arg, cache_array, box, errors, name=None): """ Convert array of dates with a cache and box the result Parameters ---------- - arg : integer, float, string, datetime, list, tuple, 1-d array of dates - cache_array: Series, cache of converted, unique dates - box: boolean, True boxes result as an Index-like - errors: string, 'ignore' plus box=True will convert result to Index - tz: string, timezone of the dates - name: string, default None. 
name for a DatetimeIndex + arg : integer, float, string, datetime, list, tuple, 1-d array, Series + cache_array : Series + Cache of converted, unique dates + box : boolean + True boxes result as an Index-like, False returns an ndarray + errors : string + 'ignore' plus box=True will convert result to Index + name : string, default None + Name for a DatetimeIndex Returns ------- - result: Index-like if box=True else array-like of converted dates + result : datetime of converted dates + Returns: + + - Index-like if box=True + - ndarray if box=False """ from pandas import Series, DatetimeIndex, Index result = Series(arg).map(cache_array) if box: if errors == 'ignore': - result = Index(result) + return Index(result) else: - result = DatetimeIndex(result, tz=tz, name=name) - return result + return DatetimeIndex(result, name=name) + return result.values def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, @@ -443,14 +454,14 @@ def _convert_listlike(arg, box, format, name=None, tz=tz): elif isinstance(arg, ABCIndexClass): cache_array = _maybe_cache(arg, format, cache, tz, _convert_listlike) if not cache_array.empty: - result = _convert_and_box_cache(arg, cache_array, box, errors, tz, + result = _convert_and_box_cache(arg, cache_array, box, errors, name=arg.name) else: result = _convert_listlike(arg, box, format, name=arg.name) elif is_list_like(arg): cache_array = _maybe_cache(arg, format, cache, tz, _convert_listlike) if not cache_array.empty: - result = _convert_and_box_cache(arg, cache_array, box, errors, tz) + result = _convert_and_box_cache(arg, cache_array, box, errors) else: result = _convert_listlike(arg, box, format) else: diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index abaef39cb58ab..307184cb34e27 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -414,7 +414,8 @@ def test_to_datetime_cache(self, utc, format, box, constructor): date = '20130101 00:00:00' test_dates = [date] * 10**5 data = constructor(test_dates) - result = pd.to_datetime(data, utc=utc, format=format, box=box) + result = pd.to_datetime(data, utc=utc, format=format, box=box, + cache=True) expected = pd.to_datetime(data, utc=utc, format=format, box=box, cache=False) if box: From 07fa22d12302252607cbe2edc3bc31f3a144515e Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Fri, 10 Nov 2017 18:50:21 -0800 Subject: [PATCH 25/25] Lint --- pandas/core/tools/datetimes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 029b166307936..19f7e459d0725 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -87,10 +87,10 @@ def _convert_and_box_cache(arg, cache_array, box, errors, name=None): Returns ------- - result : datetime of converted dates + result : datetime of converted dates Returns: - - Index-like if box=True + - Index-like if box=True - ndarray if box=False """ from pandas import Series, DatetimeIndex, Index
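Taken together, the series leaves to_datetime with an opt-in cache keyword. A usage sketch mirroring the test_to_datetime_cache test added earlier, runnable against a pandas build that includes this branch; the invariant the tests assert is that cached and uncached results are identical, with the cached path only changing the speed:

    import pandas as pd
    import pandas.util.testing as tm

    # 10**5 copies of one timestamp string: with cache=True the string is
    # parsed once and the parsed value is reused for every duplicate.
    data = pd.Series(['20130101 00:00:00'] * 10**5)
    result = pd.to_datetime(data, format='%Y%m%d %H:%M:%S', cache=True)
    expected = pd.to_datetime(data, format='%Y%m%d %H:%M:%S', cache=False)
    tm.assert_series_equal(result, expected)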