
PERF: Add cache keyword to to_datetime (#11665) #17077

Merged: 25 commits, Nov 11, 2017
41 changes: 38 additions & 3 deletions asv_bench/benchmarks/timeseries.py
@@ -346,17 +346,22 @@ class ToDatetime(object):

def setup(self):
self.rng = date_range(start='1/1/2000', periods=10000, freq='D')
self.stringsD = Series((((self.rng.year * 10000) + (self.rng.month * 100)) + self.rng.day), dtype=np.int64).apply(str)
self.stringsD = Series(self.rng.strftime('%Y%m%d'))

self.rng = date_range(start='1/1/2000', periods=20000, freq='H')
self.strings = [x.strftime('%Y-%m-%d %H:%M:%S') for x in self.rng]
self.strings_nosep = [x.strftime('%Y%m%d %H:%M:%S') for x in self.rng]
self.strings = self.rng.strftime('%Y-%m-%d %H:%M:%S').tolist()
self.strings_nosep = self.rng.strftime('%Y%m%d %H:%M:%S').tolist()
self.strings_tz_space = [x.strftime('%Y-%m-%d %H:%M:%S') + ' -0800'
for x in self.rng]

self.s = Series((['19MAY11', '19MAY11:00:00:00'] * 100000))
self.s2 = self.s.str.replace(':\\S+$', '')

self.unique_numeric_seconds = range(10000)
self.dup_numeric_seconds = [1000] * 10000
self.dup_string_dates = ['2000-02-11'] * 10000
self.dup_string_with_tz = ['2000-02-11 15:00:00-0800'] * 10000

def time_format_YYYYMMDD(self):
to_datetime(self.stringsD, format='%Y%m%d')

@@ -381,6 +386,36 @@ def time_format_exact(self):
def time_format_no_exact(self):
to_datetime(self.s, format='%d%b%y', exact=False)

def time_cache_true_with_unique_seconds_and_unit(self):
Contributor: ok, I think there is actually a way to do this with an asv matrix, but it's ok for now

to_datetime(self.unique_numeric_seconds, unit='s', cache=True)

def time_cache_false_with_unique_seconds_and_unit(self):
to_datetime(self.unique_numeric_seconds, unit='s', cache=False)

def time_cache_true_with_dup_seconds_and_unit(self):
to_datetime(self.dup_numeric_seconds, unit='s', cache=True)

def time_cache_false_with_dup_seconds_and_unit(self):
to_datetime(self.dup_numeric_seconds, unit='s', cache=False)

def time_cache_true_with_dup_string_dates(self):
to_datetime(self.dup_string_dates, cache=True)

def time_cache_false_with_dup_string_dates(self):
to_datetime(self.dup_string_dates, cache=False)

def time_cache_true_with_dup_string_dates_and_format(self):
to_datetime(self.dup_string_dates, format='%Y-%m-%d', cache=True)

def time_cache_false_with_dup_string_dates_and_format(self):
to_datetime(self.dup_string_dates, format='%Y-%m-%d', cache=False)

def time_cache_true_with_dup_string_tzoffset_dates(self):
to_datetime(self.dup_string_with_tz, cache=True)

def time_cache_false_with_dup_string_tzoffset_dates(self):
to_datetime(self.dup_string_with_tz, cache=False)
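
As a rough illustration of the asv matrix the reviewer mentions above: asv lets a single benchmark be parametrized over cache through the params/param_names class attributes, so the paired true/false methods could collapse into one. A sketch only, with an illustrative class name, assuming the to_datetime import already available in this module:

class ToDatetimeCacheMatrix(object):
    # Hypothetical parametrized form of the cache benchmarks above
    params = [True, False]
    param_names = ['cache']

    def setup(self, cache):
        self.dup_string_dates = ['2000-02-11'] * 10000

    def time_dup_string_dates(self, cache):
        to_datetime(self.dup_string_dates, cache=cache)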


class Offsets(object):
goal_time = 0.2
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.22.0.txt
@@ -70,7 +70,7 @@ Performance Improvements
~~~~~~~~~~~~~~~~~~~~~~~~

- Indexers on ``Series`` or ``DataFrame`` no longer create a reference cycle (:issue:`17956`)
-
- Added a keyword argument, ``cache``, to :func:`to_datetime` that improves the performance of converting duplicate datetime arguments (:issue:`11665`)
-

.. _whatsnew_0220.docs:
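
The performance claim is easy to check locally; a minimal sketch with timeit on duplicate-heavy input (numbers will vary by machine):

import timeit
import pandas as pd

dup = ['2000-02-11 15:00:00-0800'] * 10000
no_cache = timeit.timeit(lambda: pd.to_datetime(dup, cache=False), number=10)
with_cache = timeit.timeit(lambda: pd.to_datetime(dup, cache=True), number=10)
print(no_cache, with_cache)  # cache=True should be markedly faster here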
98 changes: 92 additions & 6 deletions pandas/core/tools/datetimes.py
@@ -36,9 +36,77 @@ def _guess_datetime_format_for_array(arr, **kwargs):
return _guess_datetime_format(arr[non_nan_elements[0]], **kwargs)


def _maybe_cache(arg, format, cache, tz, convert_listlike):
"""
Create a cache of unique dates from an array of dates

Parameters
----------
arg : integer, float, string, datetime, list, tuple, 1-d array, Series
format : string
Strftime format to parse time
cache : boolean
True attempts to create a cache of converted values
tz : string
Timezone of the dates
convert_listlike : function
Conversion function to apply on dates

Returns
-------
cache_array : Series
Cache of converted, unique dates. Can be empty
"""
from pandas import Series
cache_array = Series()
if cache:
# Perform a quicker unique check
from pandas import Index
if not Index(arg).is_unique:
unique_dates = algorithms.unique(arg)
cache_dates = convert_listlike(unique_dates, True, format, tz=tz)
cache_array = Series(cache_dates, index=unique_dates)
return cache_array
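
The idea behind _maybe_cache, as a standalone sketch (names are illustrative, not the internal code path): convert each distinct input value once and keep the results keyed by the original values.

import pandas as pd

raw = ['2000-02-11', '2000-02-12'] * 5000   # heavily duplicated input
uniques = pd.unique(raw)                    # 2 values instead of 10000
cache = pd.Series(pd.to_datetime(uniques), index=uniques)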


def _convert_and_box_cache(arg, cache_array, box, errors, name=None):
"""
Convert array of dates with a cache and box the result

Parameters
----------
arg : integer, float, string, datetime, list, tuple, 1-d array, Series
cache_array : Series
Cache of converted, unique dates
box : boolean
True boxes result as an Index-like, False returns an ndarray
errors : string
'ignore' plus box=True will convert result to Index
name : string, default None
Name for a DatetimeIndex

Returns
-------
result : datetime of converted dates
Returns:

- Index-like if box=True
- ndarray if box=False
"""
from pandas import Series, DatetimeIndex, Index
result = Series(arg).map(cache_array)
if box:
if errors == 'ignore':
return Index(result)
else:
return DatetimeIndex(result, name=name)
return result.values
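
Continuing that sketch for _convert_and_box_cache: the cached conversions are mapped back onto the full-length input and then boxed, roughly as follows (again illustrative, not the pandas-internal code):

import pandas as pd

raw = ['2000-02-11', '2000-02-12'] * 5000
uniques = pd.unique(raw)
cache = pd.Series(pd.to_datetime(uniques), index=uniques)

result = pd.Series(raw).map(cache)     # per-element lookup, no re-parsing
as_index = pd.DatetimeIndex(result)    # roughly what box=True returns for list-like input
as_ndarray = result.values             # roughly what box=False returns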


def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
utc=None, box=True, format=None, exact=True,
unit=None, infer_datetime_format=False, origin='unix'):
unit=None, infer_datetime_format=False, origin='unix',
cache=False):
"""
Convert argument to datetime.

@@ -111,7 +179,12 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
origin.

.. versionadded: 0.20.0
cache : boolean, default False
Contributor: default is True

If True, use a cache of unique, converted dates to apply the datetime
conversion. May produce significant speed-up when parsing duplicate date
strings, especially ones with timezone offsets.

.. versionadded: 0.22.0
Returns
-------
ret : datetime if parsing succeeded.
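
A minimal usage sketch of the new keyword (the result is identical; only the work done differs):

import pandas as pd

dates = ['2000-02-11', '2000-02-11', '2000-02-12']
pd.to_datetime(dates)              # default, cache=False
pd.to_datetime(dates, cache=True)  # same DatetimeIndex, parsed from 2 unique values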
@@ -369,15 +442,28 @@ def _convert_listlike(arg, box, format, name=None, tz=tz):
if isinstance(arg, tslib.Timestamp):
result = arg
elif isinstance(arg, ABCSeries):
from pandas import Series
values = _convert_listlike(arg._values, True, format)
result = Series(values, index=arg.index, name=arg.name)
cache_array = _maybe_cache(arg, format, cache, tz, _convert_listlike)
if not cache_array.empty:
Contributor: this still looks pretty duplicative, but I guess ok for now.

result = arg.map(cache_array)
else:
from pandas import Series
values = _convert_listlike(arg._values, True, format)
result = Series(values, index=arg.index, name=arg.name)
elif isinstance(arg, (ABCDataFrame, MutableMapping)):
result = _assemble_from_unit_mappings(arg, errors=errors)
elif isinstance(arg, ABCIndexClass):
result = _convert_listlike(arg, box, format, name=arg.name)
cache_array = _maybe_cache(arg, format, cache, tz, _convert_listlike)
if not cache_array.empty:
result = _convert_and_box_cache(arg, cache_array, box, errors,
name=arg.name)
else:
result = _convert_listlike(arg, box, format, name=arg.name)
elif is_list_like(arg):
result = _convert_listlike(arg, box, format)
cache_array = _maybe_cache(arg, format, cache, tz, _convert_listlike)
if not cache_array.empty:
result = _convert_and_box_cache(arg, cache_array, box, errors)
else:
result = _convert_listlike(arg, box, format)
else:
result = _convert_listlike(np.array([arg]), box, format)[0]
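
Sketch of how the cache path keeps the return type of each branch above (default box=True; illustrative calls, not quoted from the diff):

import pandas as pd

dup = ['2000-02-11'] * 3
pd.to_datetime(pd.Series(dup), cache=True)    # Series, original index preserved
pd.to_datetime(pd.Index(dup), cache=True)     # DatetimeIndex, name preserved
pd.to_datetime(dup, cache=True)               # DatetimeIndex (box=True default)
pd.to_datetime(dup, cache=True, box=False)    # ndarray of datetime64 values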
