
Commit

PERF: Add cache keyword to to_datetime (pandas-dev#11665) (pandas-dev…
mroeschke authored and No-Stream committed Nov 28, 2017
1 parent 7920032 commit 2267b97
Showing 4 changed files with 445 additions and 199 deletions.
41 changes: 38 additions & 3 deletions asv_bench/benchmarks/timeseries.py
@@ -346,17 +346,22 @@ class ToDatetime(object):

def setup(self):
self.rng = date_range(start='1/1/2000', periods=10000, freq='D')
-        self.stringsD = Series((((self.rng.year * 10000) + (self.rng.month * 100)) + self.rng.day), dtype=np.int64).apply(str)
+        self.stringsD = Series(self.rng.strftime('%Y%m%d'))

self.rng = date_range(start='1/1/2000', periods=20000, freq='H')
-        self.strings = [x.strftime('%Y-%m-%d %H:%M:%S') for x in self.rng]
-        self.strings_nosep = [x.strftime('%Y%m%d %H:%M:%S') for x in self.rng]
+        self.strings = self.rng.strftime('%Y-%m-%d %H:%M:%S').tolist()
+        self.strings_nosep = self.rng.strftime('%Y%m%d %H:%M:%S').tolist()
self.strings_tz_space = [x.strftime('%Y-%m-%d %H:%M:%S') + ' -0800'
for x in self.rng]

self.s = Series((['19MAY11', '19MAY11:00:00:00'] * 100000))
self.s2 = self.s.str.replace(':\\S+$', '')

self.unique_numeric_seconds = range(10000)
self.dup_numeric_seconds = [1000] * 10000
self.dup_string_dates = ['2000-02-11'] * 10000
self.dup_string_with_tz = ['2000-02-11 15:00:00-0800'] * 10000

def time_format_YYYYMMDD(self):
to_datetime(self.stringsD, format='%Y%m%d')

@@ -381,6 +386,36 @@ def time_format_exact(self):
def time_format_no_exact(self):
to_datetime(self.s, format='%d%b%y', exact=False)

def time_cache_true_with_unique_seconds_and_unit(self):
to_datetime(self.unique_numeric_seconds, unit='s', cache=True)

def time_cache_false_with_unique_seconds_and_unit(self):
to_datetime(self.unique_numeric_seconds, unit='s', cache=False)

def time_cache_true_with_dup_seconds_and_unit(self):
to_datetime(self.dup_numeric_seconds, unit='s', cache=True)

def time_cache_false_with_dup_seconds_and_unit(self):
to_datetime(self.dup_numeric_seconds, unit='s', cache=False)

def time_cache_true_with_dup_string_dates(self):
to_datetime(self.dup_string_dates, cache=True)

def time_cache_false_with_dup_string_dates(self):
to_datetime(self.dup_string_dates, cache=False)

def time_cache_true_with_dup_string_dates_and_format(self):
to_datetime(self.dup_string_dates, format='%Y-%m-%d', cache=True)

def time_cache_false_with_dup_string_dates_and_format(self):
to_datetime(self.dup_string_dates, format='%Y-%m-%d', cache=False)

def time_cache_true_with_dup_string_tzoffset_dates(self):
to_datetime(self.dup_string_with_tz, cache=True)

def time_cache_false_with_dup_string_tzoffset_dates(self):
to_datetime(self.dup_string_with_tz, cache=False)


class Offsets(object):
goal_time = 0.2
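The new cache benchmarks above time to_datetime over the same inputs with cache=True and cache=False. For a quick check outside of asv, a rough standalone sketch along the following lines (assuming a pandas build that already includes the cache keyword) exercises the same duplicated-string case:

import timeit

import pandas as pd

# Same setup as the dup_string_dates benchmark above.
dup_string_dates = ['2000-02-11'] * 10000

for use_cache in (True, False):
    elapsed = timeit.timeit(
        lambda: pd.to_datetime(dup_string_dates, cache=use_cache),
        number=10,
    )
    print('cache=%s: %.3fs for 10 runs' % (use_cache, elapsed))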
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.22.0.txt
@@ -70,7 +70,7 @@ Performance Improvements
~~~~~~~~~~~~~~~~~~~~~~~~

- Indexers on ``Series`` or ``DataFrame`` no longer create a reference cycle (:issue:`17956`)
-
- Added a keyword argument, ``cache``, to :func:`to_datetime` that improved the performance of converting duplicate datetime arguments (:issue:`11665`)
-

.. _whatsnew_0220.docs:
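As a usage sketch for the cache entry above (illustrative only, not part of the diff): the keyword changes how the conversion is carried out, not its result, so it can be switched on whenever the input is known to contain many repeated values; the updated docstring below singles out duplicate strings with timezone offsets as the case that benefits most.

import pandas as pd

dates = ['2000-02-11'] * 5 + ['2000-02-12'] * 5

with_cache = pd.to_datetime(dates, cache=True)
without_cache = pd.to_datetime(dates, cache=False)

# The cache is a pure performance optimization; both calls return the same index.
assert with_cache.equals(without_cache)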
98 changes: 92 additions & 6 deletions pandas/core/tools/datetimes.py
@@ -36,9 +36,77 @@ def _guess_datetime_format_for_array(arr, **kwargs):
return _guess_datetime_format(arr[non_nan_elements[0]], **kwargs)


def _maybe_cache(arg, format, cache, tz, convert_listlike):
    """
    Create a cache of unique dates from an array of dates

    Parameters
    ----------
    arg : integer, float, string, datetime, list, tuple, 1-d array, Series
    format : string
        Strftime format to parse time
    cache : boolean
        True attempts to create a cache of converted values
    tz : string
        Timezone of the dates
    convert_listlike : function
        Conversion function to apply on dates

    Returns
    -------
    cache_array : Series
        Cache of converted, unique dates. Can be empty
    """
    from pandas import Series
    cache_array = Series()
    if cache:
        # Perform a quicker unique check
        from pandas import Index
        if not Index(arg).is_unique:
            unique_dates = algorithms.unique(arg)
            cache_dates = convert_listlike(unique_dates, True, format, tz=tz)
            cache_array = Series(cache_dates, index=unique_dates)
    return cache_array


def _convert_and_box_cache(arg, cache_array, box, errors, name=None):
    """
    Convert array of dates with a cache and box the result

    Parameters
    ----------
    arg : integer, float, string, datetime, list, tuple, 1-d array, Series
    cache_array : Series
        Cache of converted, unique dates
    box : boolean
        True boxes result as an Index-like, False returns an ndarray
    errors : string
        'ignore' plus box=True will convert result to Index
    name : string, default None
        Name for a DatetimeIndex

    Returns
    -------
    result : datetime of converted dates
        Returns:

        - Index-like if box=True
        - ndarray if box=False
    """
    from pandas import Series, DatetimeIndex, Index
    result = Series(arg).map(cache_array)
    if box:
        if errors == 'ignore':
            return Index(result)
        else:
            return DatetimeIndex(result, name=name)
    return result.values


def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
                utc=None, box=True, format=None, exact=True,
-                unit=None, infer_datetime_format=False, origin='unix'):
+                unit=None, infer_datetime_format=False, origin='unix',
+                cache=False):
"""
Convert argument to datetime.
@@ -111,7 +179,12 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
        origin.

        .. versionadded:: 0.20.0

    cache : boolean, default False
        If True, use a cache of unique, converted dates to apply the datetime
        conversion. May produce significant speed-up when parsing duplicate date
        strings, especially ones with timezone offsets.

        .. versionadded:: 0.22.0

    Returns
    -------
    ret : datetime if parsing succeeded.
@@ -369,15 +442,28 @@ def _convert_listlike(arg, box, format, name=None, tz=tz):
    if isinstance(arg, tslib.Timestamp):
        result = arg
    elif isinstance(arg, ABCSeries):
-        from pandas import Series
-        values = _convert_listlike(arg._values, True, format)
-        result = Series(values, index=arg.index, name=arg.name)
+        cache_array = _maybe_cache(arg, format, cache, tz, _convert_listlike)
+        if not cache_array.empty:
+            result = arg.map(cache_array)
+        else:
+            from pandas import Series
+            values = _convert_listlike(arg._values, True, format)
+            result = Series(values, index=arg.index, name=arg.name)
    elif isinstance(arg, (ABCDataFrame, MutableMapping)):
        result = _assemble_from_unit_mappings(arg, errors=errors)
    elif isinstance(arg, ABCIndexClass):
-        result = _convert_listlike(arg, box, format, name=arg.name)
+        cache_array = _maybe_cache(arg, format, cache, tz, _convert_listlike)
+        if not cache_array.empty:
+            result = _convert_and_box_cache(arg, cache_array, box, errors,
+                                            name=arg.name)
+        else:
+            result = _convert_listlike(arg, box, format, name=arg.name)
    elif is_list_like(arg):
-        result = _convert_listlike(arg, box, format)
+        cache_array = _maybe_cache(arg, format, cache, tz, _convert_listlike)
+        if not cache_array.empty:
+            result = _convert_and_box_cache(arg, cache_array, box, errors)
+        else:
+            result = _convert_listlike(arg, box, format)
    else:
        result = _convert_listlike(np.array([arg]), box, format)[0]

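The helpers added above implement a unique-then-map strategy: when the input contains duplicates, each distinct value is converted once into a lookup Series, and the full input is then mapped through that lookup. A self-contained sketch of the same idea in plain Python (parse_one and parse_many_cached are illustrative names, not pandas API):

from datetime import datetime

def parse_one(value):
    # Stand-in for the expensive per-element datetime parsing.
    return datetime.strptime(value, '%Y-%m-%d')

def parse_many_cached(values):
    unique_values = set(values)
    # No duplicates: a cache cannot help, so convert directly.
    if len(unique_values) == len(values):
        return [parse_one(v) for v in values]
    # Convert each unique value once, then map every element through the cache.
    cache = {v: parse_one(v) for v in unique_values}
    return [cache[v] for v in values]

parsed = parse_many_cached(['2000-02-11', '2000-02-11', '2000-02-12'])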
