Skip to content

Commit

Permalink
Allow pd.unique to handle tuples, added test to to_datetime
Browse files Browse the repository at this point in the history
Refactor tests and add doc notes

Add whatsnew and some pep8 changes
  • Loading branch information
mroeschke committed Jul 26, 2017
1 parent 13b57cd commit 130406e
Show file tree
Hide file tree
Showing 5 changed files with 99 additions and 7 deletions.
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.21.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,8 @@ Other Enhancements
- :func:`DataFrame.select_dtypes` now accepts scalar values for include/exclude as well as list-like. (:issue:`16855`)
- :func:`date_range` now accepts 'YS' in addition to 'AS' as an alias for start of year (:issue:`9313`)
- :func:`date_range` now accepts 'Y' in addition to 'A' as an alias for end of year (:issue:`9313`)
- :func:`to_datetime` now accepts a ``cache_datetime`` keyword which enables faster parsing of duplicate dates via a cache of unique, converted values (:issue:`11665`)


.. _whatsnew_0210.api_breaking:

Expand Down
17 changes: 17 additions & 0 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -373,6 +373,23 @@ cpdef ndarray[object] list_to_object_array(list obj):
return arr


@cython.wraparound(False)
@cython.boundscheck(False)
cpdef ndarray[object] tuple_to_object_array(tuple obj):
    """
    Convert a tuple into a 1-d object ndarray.

    Mirrors ``list_to_object_array`` for tuple inputs: each element of
    ``obj`` is stored as-is (no coercion) into a freshly allocated
    object-dtype array of the same length.

    Parameters
    ----------
    obj : tuple
        Tuple whose elements become the array's entries.

    Returns
    -------
    ndarray[object]
        Object-dtype array with ``len(obj)`` elements, in order.
    """
    cdef:
        Py_ssize_t i, n = len(obj)
        ndarray[object] arr = np.empty(n, dtype=object)

    # Element-by-element copy; bounds/wraparound checks disabled above
    # since i is always in [0, n).
    for i in range(n):
        arr[i] = obj[i]

    return arr


@cython.wraparound(False)
@cython.boundscheck(False)
def fast_unique(ndarray[object] values):
Expand Down
5 changes: 4 additions & 1 deletion pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,10 @@ def _ensure_arraylike(values):
ABCIndexClass, ABCSeries)):
inferred = lib.infer_dtype(values)
if inferred in ['mixed', 'string', 'unicode']:
values = lib.list_to_object_array(values)
if isinstance(values, tuple):
values = lib.tuple_to_object_array(values)
else:
values = lib.list_to_object_array(values)
else:
values = np.asarray(values)
return values
Expand Down
43 changes: 37 additions & 6 deletions pandas/core/tools/datetimes.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from datetime import datetime, timedelta, time
import numpy as np
import pandas as pd
from collections import MutableMapping

from pandas._libs import lib, tslib
Expand Down Expand Up @@ -183,7 +184,8 @@ def _guess_datetime_format_for_array(arr, **kwargs):

def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
utc=None, box=True, format=None, exact=True,
unit=None, infer_datetime_format=False, origin='unix'):
unit=None, infer_datetime_format=False, origin='unix',
cache_datetime=False):
"""
Convert argument to datetime.
Expand Down Expand Up @@ -257,6 +259,10 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
.. versionadded: 0.20.0
cache_datetime : boolean, default False
If True, use a cache of unique, converted dates to apply the datetime
conversion. Produces significant speed-ups when parsing duplicate dates.
Returns
-------
ret : datetime if parsing succeeded.
Expand Down Expand Up @@ -340,6 +346,19 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,

tz = 'utc' if utc else None

cache = None
if (cache_datetime and is_list_like(arg) and
not isinstance(arg, DatetimeIndex)):
# No need to convert with a cache if the arg is already a DatetimeIndex
unique_dates = pd.unique(arg)
if len(unique_dates) != len(arg):
cache = {d: pd.to_datetime(d, errors=errors, dayfirst=dayfirst,
yearfirst=yearfirst, utc=utc, box=box, format=format,
exact=exact, unit=unit,
infer_datetime_format=infer_datetime_format,
origin=origin, cache_datetime=False)
for d in unique_dates}

def _convert_listlike(arg, box, format, name=None, tz=tz):

if isinstance(arg, (list, tuple)):
Expand Down Expand Up @@ -505,15 +524,27 @@ def _convert_listlike(arg, box, format, name=None, tz=tz):
if isinstance(arg, tslib.Timestamp):
result = arg
elif isinstance(arg, ABCSeries):
from pandas import Series
values = _convert_listlike(arg._values, False, format)
result = Series(values, index=arg.index, name=arg.name)
if cache:
result = arg.map(cache)
else:
values = _convert_listlike(arg._values, False, format)
result = pd.Series(values, index=arg.index, name=arg.name)
elif isinstance(arg, (ABCDataFrame, MutableMapping)):
result = _assemble_from_unit_mappings(arg, errors=errors)
elif isinstance(arg, ABCIndexClass):
result = _convert_listlike(arg, box, format, name=arg.name)
if cache:
result = pd.Series(arg.values).map(cache).values
if box:
result = DatetimeIndex(result, tz=tz, name=arg.name)
else:
result = _convert_listlike(arg, box, format, name=arg.name)
elif is_list_like(arg):
result = _convert_listlike(arg, box, format)
if cache:
result = pd.Series(arg).map(cache).values
if box:
result = DatetimeIndex(result, tz=tz)
else:
result = _convert_listlike(arg, box, format)
else:
result = _convert_listlike(np.array([arg]), box, format)[0]

Expand Down
39 changes: 39 additions & 0 deletions pandas/tests/indexes/datetimes/test_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,6 +306,45 @@ def test_to_datetime_tz_psycopg2(self):
dtype='datetime64[ns, UTC]')
tm.assert_index_equal(result, expected)

@pytest.mark.parametrize("box", [True, False])
@pytest.mark.parametrize("utc", [True, None])
@pytest.mark.parametrize("format", ['%Y%m%d %H:%M:%S', None])
def test_to_datetime_cache_datetime(self, box, utc, format):
    """cache_datetime=True must give the same result as an uncached call
    for every supported list-like input type, for Series, and for scalars.
    """
    # GH 11665
    test_dates = ['20130101 00:00:00'] * 10
    # The uncached conversion is loop-invariant: compute it once.
    expected = pd.to_datetime(test_dates,
                              box=box,
                              utc=utc,
                              format=format)
    # Exercise the cache path for each list-like container type.
    # BUG FIX: the original passed ``test_dates`` (the plain list) to
    # the cached call as well, so ``test_input`` was never used and the
    # tuple/ndarray/Index variants were never actually tested.
    test_inputs = [test_dates, tuple(test_dates), np.array(test_dates),
                   pd.Index(test_dates)]
    for test_input in test_inputs:
        result = pd.to_datetime(test_input,
                                box=box,
                                utc=utc,
                                format=format,
                                cache_datetime=True)
        if box:
            tm.assert_index_equal(result, expected)
        else:
            tm.assert_numpy_array_equal(result, expected)
    # Test Series result
    expected = pd.to_datetime(pd.Series(test_dates),
                              utc=utc,
                              format=format)
    result = pd.to_datetime(pd.Series(test_dates),
                            utc=utc,
                            format=format,
                            cache_datetime=True)
    tm.assert_series_equal(expected, result)
    # Test scalar result: cache_datetime=True should not affect conversion
    test_date = '20130101 00:00:00'
    expected = pd.Timestamp('20130101 00:00:00')
    for scalar_input in [test_date, pd.Timestamp(test_date)]:
        result = pd.to_datetime(scalar_input, cache_datetime=True)
        assert result == expected

def test_datetime_bool(self):
# GH13176
with pytest.raises(TypeError):
Expand Down

0 comments on commit 130406e

Please sign in to comment.