Skip to content

Commit

Permalink
Allow pd.unique to handle tuples, added test to to_datetime
Browse files Browse the repository at this point in the history
Refactor tests and add doc notes

Add whatsnew and some pep8 changes
  • Loading branch information
mroeschke committed Jul 26, 2017
1 parent 13b57cd commit 130406e
Show file tree
Hide file tree
Showing 5 changed files with 99 additions and 7 deletions.
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.21.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,8 @@ Other Enhancements
- :func:`DataFrame.select_dtypes` now accepts scalar values for include/exclude as well as list-like. (:issue:`16855`)
- :func:`date_range` now accepts 'YS' in addition to 'AS' as an alias for start of year (:issue:`9313`)
- :func:`date_range` now accepts 'Y' in addition to 'A' as an alias for end of year (:issue:`9313`)
- :func:`to_datetime` now accepts a ``cache_datetime`` keyword which enables faster parsing of duplicate dates via a cache of unique, converted values (:issue:`11665`)


.. _whatsnew_0210.api_breaking:

Expand Down
17 changes: 17 additions & 0 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -373,6 +373,23 @@ cpdef ndarray[object] list_to_object_array(list obj):
return arr


@cython.wraparound(False)
@cython.boundscheck(False)
cpdef ndarray[object] tuple_to_object_array(tuple obj):
    """
    Convert a tuple into a 1-d object ndarray.

    Mirrors ``list_to_object_array`` for tuple inputs: each element of
    ``obj`` is stored as-is (no coercion) into a freshly allocated
    object-dtype array of the same length.

    Parameters
    ----------
    obj : tuple
        Tuple whose elements become the array's entries.

    Returns
    -------
    ndarray[object]
        Object-dtype array with ``len(obj)`` elements, in order.
    """
    cdef:
        Py_ssize_t i, n = len(obj)
        ndarray[object] arr = np.empty(n, dtype=object)

    # Element-by-element copy; bounds/wraparound checks disabled above
    # since i is always in [0, n).
    for i in range(n):
        arr[i] = obj[i]

    return arr


@cython.wraparound(False)
@cython.boundscheck(False)
def fast_unique(ndarray[object] values):
Expand Down
5 changes: 4 additions & 1 deletion pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,10 @@ def _ensure_arraylike(values):
ABCIndexClass, ABCSeries)):
inferred = lib.infer_dtype(values)
if inferred in ['mixed', 'string', 'unicode']:
values = lib.list_to_object_array(values)
if isinstance(values, tuple):
values = lib.tuple_to_object_array(values)
else:
values = lib.list_to_object_array(values)
else:
values = np.asarray(values)
return values
Expand Down
43 changes: 37 additions & 6 deletions pandas/core/tools/datetimes.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from datetime import datetime, timedelta, time
import numpy as np
import pandas as pd
from collections import MutableMapping

from pandas._libs import lib, tslib
Expand Down Expand Up @@ -183,7 +184,8 @@ def _guess_datetime_format_for_array(arr, **kwargs):

def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
utc=None, box=True, format=None, exact=True,
unit=None, infer_datetime_format=False, origin='unix'):
unit=None, infer_datetime_format=False, origin='unix',
cache_datetime=False):
"""
Convert argument to datetime.
Expand Down Expand Up @@ -257,6 +259,10 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
.. versionadded: 0.20.0
cache_datetime : boolean, default False
If True, use a cache of unique, converted dates to apply the datetime
conversion. Produces significant speed-ups when parsing duplicate dates.
Returns
-------
ret : datetime if parsing succeeded.
Expand Down Expand Up @@ -340,6 +346,19 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,

tz = 'utc' if utc else None

cache = None
if (cache_datetime and is_list_like(arg) and
not isinstance(arg, DatetimeIndex)):
# No need to convert with a cache if the arg is already a DatetimeIndex
unique_dates = pd.unique(arg)
if len(unique_dates) != len(arg):
cache = {d: pd.to_datetime(d, errors=errors, dayfirst=dayfirst,
yearfirst=yearfirst, utc=utc, box=box, format=format,
exact=exact, unit=unit,
infer_datetime_format=infer_datetime_format,
origin=origin, cache_datetime=False)
for d in unique_dates}

def _convert_listlike(arg, box, format, name=None, tz=tz):

if isinstance(arg, (list, tuple)):
Expand Down Expand Up @@ -505,15 +524,27 @@ def _convert_listlike(arg, box, format, name=None, tz=tz):
if isinstance(arg, tslib.Timestamp):
result = arg
elif isinstance(arg, ABCSeries):
from pandas import Series
values = _convert_listlike(arg._values, False, format)
result = Series(values, index=arg.index, name=arg.name)
if cache:
result = arg.map(cache)
else:
values = _convert_listlike(arg._values, False, format)
result = pd.Series(values, index=arg.index, name=arg.name)
elif isinstance(arg, (ABCDataFrame, MutableMapping)):
result = _assemble_from_unit_mappings(arg, errors=errors)
elif isinstance(arg, ABCIndexClass):
result = _convert_listlike(arg, box, format, name=arg.name)
if cache:
result = pd.Series(arg.values).map(cache).values
if box:
result = DatetimeIndex(result, tz=tz, name=arg.name)
else:
result = _convert_listlike(arg, box, format, name=arg.name)
elif is_list_like(arg):
result = _convert_listlike(arg, box, format)
if cache:
result = pd.Series(arg).map(cache).values
if box:
result = DatetimeIndex(result, tz=tz)
else:
result = _convert_listlike(arg, box, format)
else:
result = _convert_listlike(np.array([arg]), box, format)[0]

Expand Down
39 changes: 39 additions & 0 deletions pandas/tests/indexes/datetimes/test_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,6 +306,45 @@ def test_to_datetime_tz_psycopg2(self):
dtype='datetime64[ns, UTC]')
tm.assert_index_equal(result, expected)

@pytest.mark.parametrize("box", [True, False])
@pytest.mark.parametrize("utc", [True, None])
@pytest.mark.parametrize("format", ['%Y%m%d %H:%M:%S', None])
def test_to_datetime_cache_datetime(self, box, utc, format):
    """cache_datetime=True must give the same result as an uncached call
    for every supported list-like input type, for Series, and for scalars.
    """
    # GH 11665
    test_dates = ['20130101 00:00:00'] * 10
    # The uncached conversion is loop-invariant: compute it once.
    expected = pd.to_datetime(test_dates,
                              box=box,
                              utc=utc,
                              format=format)
    # Exercise the cache path for each list-like container type.
    # BUG FIX: the original passed ``test_dates`` (the plain list) to
    # the cached call as well, so ``test_input`` was never used and the
    # tuple/ndarray/Index variants were never actually tested.
    test_inputs = [test_dates, tuple(test_dates), np.array(test_dates),
                   pd.Index(test_dates)]
    for test_input in test_inputs:
        result = pd.to_datetime(test_input,
                                box=box,
                                utc=utc,
                                format=format,
                                cache_datetime=True)
        if box:
            tm.assert_index_equal(result, expected)
        else:
            tm.assert_numpy_array_equal(result, expected)
    # Test Series result
    expected = pd.to_datetime(pd.Series(test_dates),
                              utc=utc,
                              format=format)
    result = pd.to_datetime(pd.Series(test_dates),
                            utc=utc,
                            format=format,
                            cache_datetime=True)
    tm.assert_series_equal(expected, result)
    # Test scalar result: cache_datetime=True should not affect conversion
    test_date = '20130101 00:00:00'
    expected = pd.Timestamp('20130101 00:00:00')
    for scalar_input in [test_date, pd.Timestamp(test_date)]:
        result = pd.to_datetime(scalar_input, cache_datetime=True)
        assert result == expected

def test_datetime_bool(self):
# GH13176
with pytest.raises(TypeError):
Expand Down

0 comments on commit 130406e

Please sign in to comment.