PERF: changed default value of cache parameter to True in to_datetime function #26043

Merged
merged 22 commits on Jul 4, 2019
Changes from 15 commits
13 changes: 13 additions & 0 deletions asv_bench/benchmarks/timeseries.py
@@ -300,6 +300,19 @@ def time_format_YYYYMMDD(self):
to_datetime(self.stringsD, format='%Y%m%d')


class ToDatetimeCacheSmallCount(object):

params = ([True, False], [50, 500, 5000, 100000])
param_names = ['cache', 'count']

def setup(self, cache, count):
rng = date_range(start='1/1/1971', periods=count)
self.unique_date_strings = rng.strftime('%Y-%m-%d').tolist()

def time_unique_date_strings(self, cache, count):
to_datetime(self.unique_date_strings, cache=cache)


class ToDatetimeISO8601:

def setup(self):
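For reference, the scenario the new ToDatetimeCacheSmallCount benchmark covers can be reproduced outside asv with a rough timing script along these lines (a sketch, not part of the diff; the counts and date format are taken from the benchmark above):

import timeit

import pandas as pd

for count in (50, 500, 5000, 100000):
    rng = pd.date_range(start='1/1/1971', periods=count)
    unique_date_strings = rng.strftime('%Y-%m-%d').tolist()
    for cache in (True, False):
        # Time parsing of all-unique date strings with and without the cache.
        elapsed = timeit.timeit(
            lambda: pd.to_datetime(unique_date_strings, cache=cache),
            number=10)
        print('count={}, cache={}: {:.4f}s'.format(count, cache, elapsed))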
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.25.0.rst
@@ -879,6 +879,7 @@ Performance improvements
- Restored performance of :meth:`DatetimeIndex.__iter__` by re-enabling specialized code path (:issue:`26702`)
- Improved performance when building :class:`MultiIndex` with at least one :class:`CategoricalIndex` level (:issue:`22044`)
- Improved performance by removing the need for a garbage collect when checking for ``SettingWithCopyWarning`` (:issue:`27031`)
- Improved performance of :func:`to_datetime` by changing the default value of the ``cache`` parameter to ``True`` (:issue:`26043`)

.. _whatsnew_0250.bug_fixes:

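The user-visible effect of the whatsnew entry above is simply that to_datetime now parses each unique date string only once unless caching is disabled explicitly. A minimal illustration (repeated timezone-offset strings, the case where caching helps most):

import pandas as pd

# Many duplicate strings with timezone offsets -- the case where caching
# gives the largest speed-up.
dates = ['2019-01-01 00:00:00+01:00'] * 1000 + ['2019-07-04 12:00:00+01:00']

result = pd.to_datetime(dates)                  # cache=True is now the default
no_cache = pd.to_datetime(dates, cache=False)   # opt out, previous behaviour

# Both paths produce the same result; only the parsing work differs.
assert (result == no_cache).all()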
78 changes: 73 additions & 5 deletions pandas/core/tools/datetimes.py
@@ -22,6 +22,14 @@

from pandas._typing import ArrayLike
from pandas.core import algorithms
from pandas.core.algorithms import unique

# ---------------------------------------------------------------------
# types used in annotations

ArrayConvertible = Union[list, tuple, ArrayLike, ABCSeries]

# ---------------------------------------------------------------------

# ---------------------------------------------------------------------
# types used in annotations
@@ -42,13 +50,67 @@ def _guess_datetime_format_for_array(arr, **kwargs):
return _guess_datetime_format(arr[non_nan_elements[0]], **kwargs)


def should_cache(arg: ArrayConvertible, unique_share: float = 0.7,
check_count: Optional[int] = None) -> bool:
"""
Decides whether to do caching.

If the percentage of unique elements among the first `check_count` elements
is less than `unique_share * 100`, then caching is worthwhile.

Parameters
----------
arg: listlike, tuple, 1-d array, Series
unique_share: float, default=0.7, optional
0 < unique_share < 1
check_count: int, optional
0 <= check_count <= len(arg)

Returns
-------
do_caching: bool

Notes
-----
By default, caching is skipped for sequences of 50 or fewer items; for
sequences of up to 5000 items, the first ten percent of the elements are
checked for their uniqueness share; for longer sequences, only the first
500 elements are checked.
All constants were chosen empirically.
"""
do_caching = True

# default heuristic
if check_count is None:
# in this case, the gain from caching is negligible
if len(arg) <= 50:
return False

if len(arg) <= 5000:
check_count = int(len(arg) * 0.1)
else:
check_count = 500
else:
assert 0 <= check_count <= len(arg), \
'check_count must be in next bounds: [0; len(arg)]'
if check_count == 0:
return False

assert 0 < unique_share < 1, 'unique_share must be in next bounds: (0; 1)'

unique_elements = unique(arg[:check_count])
if len(unique_elements) > check_count * unique_share:
do_caching = False
return do_caching


def _maybe_cache(arg, format, cache, convert_listlike):
"""
Create a cache of unique dates from an array of dates

Parameters
----------
arg : integer, float, string, datetime, list, tuple, 1-d array, Series
arg : listlike, tuple, 1-d array, Series
format : string
Strftime format to parse time
cache : boolean
@@ -66,10 +128,13 @@ def _maybe_cache(arg, format, cache, convert_listlike):
if cache:
# Perform a quicker unique check
from pandas import Index

if not should_cache(arg):
return cache_array

unique_dates = Index(arg).unique()
if len(unique_dates) < len(arg):
cache_dates = convert_listlike(unique_dates.to_numpy(),
True, format)
cache_dates = convert_listlike(unique_dates, True, format)
cache_array = Series(cache_dates, index=unique_dates)
return cache_array

@@ -448,7 +513,7 @@ def _adjust_to_origin(arg, origin, unit):
def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
utc=None, box=True, format=None, exact=True,
unit=None, infer_datetime_format=False, origin='unix',
cache=False):
cache=True):
"""
Convert argument to datetime.

@@ -529,13 +594,16 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
origin.

.. versionadded:: 0.20.0
cache : boolean, default False
cache : boolean, default True
If True, use a cache of unique, converted dates to apply the datetime
conversion. May produce significant speed-up when parsing duplicate
date strings, especially ones with timezone offsets.

.. versionadded:: 0.23.0

.. versionchanged:: 0.25.0
- changed default value from False to True

Returns
-------
ret : datetime if parsing succeeded.
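should_cache is an internal helper (pandas.core.tools.datetimes), not public API, but the thresholds described in its Notes section can be checked directly. A small sketch, assuming a pandas build that includes this change:

import pandas as pd
from pandas.core.tools.datetimes import should_cache

# 50 elements or fewer: caching is never attempted.
print(should_cache(['2019-01-01'] * 40))        # False

# 1000 heavily duplicated strings: the first 10% are sampled, the unique
# share is far below 0.7, so caching is used.
print(should_cache(['2019-01-01'] * 1000))      # True

# 1000 all-unique strings: the sampled unique share exceeds 0.7, so the
# cache would not pay off.
unique_strings = pd.date_range('1971-01-01', periods=1000).strftime('%Y-%m-%d').tolist()
print(should_cache(unique_strings))             # False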
20 changes: 20 additions & 0 deletions pandas/tests/indexes/datetimes/test_tools.py
@@ -2032,3 +2032,23 @@ def test_arg_tz_ns_unit(self, offset, utc, exp):
result = to_datetime([arg], unit='ns', utc=utc)
expected = to_datetime([exp])
tm.assert_index_equal(result, expected)


@pytest.mark.parametrize('listlike,do_caching', [
([1, 2, 3, 4, 5, 6, 7, 8, 9, 0], False),
([1, 1, 1, 1, 4, 5, 6, 7, 8, 9], True)
])
def test_should_cache(listlike, do_caching):
assert tools.should_cache(listlike, check_count=len(listlike),
unique_share=0.7) == do_caching


@pytest.mark.parametrize('unique_share,check_count, err_message', [
(0.5, 11, r'check_count must be in next bounds: \[0; len\(arg\)\]'),
(10, 2, r'unique_share must be in next bounds: \(0; 1\)')
])
def test_should_cache_errors(unique_share, check_count, err_message):
arg = [5] * 10

with pytest.raises(AssertionError, match=err_message):
tools.should_cache(arg, unique_share, check_count)
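The error cases exercised by test_should_cache_errors correspond to the two assertions inside should_cache; roughly, under the same assumptions as the sketch above:

from pandas.core.tools.datetimes import should_cache

arg = [5] * 10

# check_count larger than len(arg) trips the first assertion.
try:
    should_cache(arg, unique_share=0.5, check_count=11)
except AssertionError as err:
    print(err)   # check_count must be in next bounds: [0; len(arg)]

# unique_share outside (0, 1) trips the second assertion.
try:
    should_cache(arg, unique_share=10, check_count=2)
except AssertionError as err:
    print(err)   # unique_share must be in next bounds: (0; 1)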