API: #10636, changing default of to_datetime to raise, deprecating coerce in favor of errors
jreback committed Jul 30, 2015
1 parent 8ae292c commit 987b7e7
Showing 16 changed files with 231 additions and 127 deletions.
18 changes: 15 additions & 3 deletions doc/source/timeseries.rst
Original file line number Diff line number Diff line change
@@ -197,18 +197,30 @@ or ``format``, use ``to_datetime`` if these are required.
Invalid Data
~~~~~~~~~~~~

Pass ``coerce=True`` to convert invalid data to ``NaT`` (not a time):
.. note::

In version 0.17.0, the default for ``to_datetime`` is now ``errors='raise'``, rather than ``errors='ignore'``. This means
that invalid parsing will raise rather than return the original input as in previous versions.

Pass ``errors='coerce'`` to convert invalid data to ``NaT`` (not a time):

.. ipython:: python
:okexcept:
# this is the default, raise when unparseable
to_datetime(['2009-07-31', 'asd'], errors='raise')
to_datetime(['2009-07-31', 'asd'])
# return the original input when unparseable
to_datetime(['2009-07-31', 'asd'], errors='ignore')
to_datetime(['2009-07-31', 'asd'], coerce=True)
# return NaT for input when unparseable
to_datetime(['2009-07-31', 'asd'], errors='coerce')
Take care, ``to_datetime`` may not act as you expect on mixed data:

.. ipython:: python
:okexcept:
to_datetime([1, '1'])
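The two error-handling modes documented above can also be sketched as a plain runnable example (assumes a recent pandas, where ``errors='raise'`` and ``errors='coerce'`` are still supported):

```python
import pandas as pd

# A sketch of the error-handling modes described above
good_and_bad = ['2009-07-31', 'asd']

# errors='coerce': unparseable entries become NaT
coerced = pd.to_datetime(good_and_bad, errors='coerce')
print(coerced[0])           # 2009-07-31 00:00:00
print(pd.isna(coerced[1]))  # True

# errors='raise' (the new 0.17.0 default): unparseable input raises
try:
    pd.to_datetime(good_and_bad, errors='raise')
except ValueError as exc:
    print('raised:', type(exc).__name__)
```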
46 changes: 43 additions & 3 deletions doc/source/whatsnew/v0.17.0.txt
@@ -65,10 +65,11 @@ Other enhancements
- Enable `read_hdf` to be used without specifying a key when the HDF file contains a single dataset (:issue:`10443`)

- ``DatetimeIndex`` can be instantiated using strings containing ``NaT`` (:issue:`7599`)
- The string parsing of ``to_datetime``, ``Timestamp`` and ``DatetimeIndex`` has been made consistent" (:issue:`7599`)
- The string parsing of ``to_datetime``, ``Timestamp`` and ``DatetimeIndex`` has been made consistent. (:issue:`7599`)

Prior to v0.17.0, ``Timestamp`` and ``to_datetime`` may parse year-only datetime-string incorrectly using today's date, otherwise ``DatetimeIndex`` uses the beginning of the year.
``Timestamp`` and ``to_datetime`` may raise ``ValueError`` in some types of datetime-string which ``DatetimeIndex`` can parse, such as quarterly string.
Prior to v0.17.0, ``Timestamp`` and ``to_datetime`` may parse year-only datetime-string incorrectly using today's date, otherwise ``DatetimeIndex``
uses the beginning of the year. ``Timestamp`` and ``to_datetime`` may raise ``ValueError`` in some types of datetime-string which ``DatetimeIndex``
can parse, such as a quarterly string.

Previous Behavior

@@ -119,6 +120,45 @@ Backwards incompatible API changes

- Line and kde plot with ``subplots=True`` now uses default colors, not all black. Specify ``color='k'`` to draw all lines in black (:issue:`9894`)

.. _whatsnew_0170.api_breaking.to_datetime:

Changes to to_datetime and to_timedelta
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

The default for ``pd.to_datetime`` error handling has changed to ``errors='raise'``. In prior versions it was ``errors='ignore'``.
Furthermore, the ``coerce`` argument has been deprecated in favor of ``errors='coerce'``. This means that invalid parsing will raise rather than return the original
input as in previous versions. (:issue:`10636`)

Previous Behavior:

.. code-block:: python

In [2]: pd.to_datetime(['2009-07-31', 'asd'])
Out[2]: array(['2009-07-31', 'asd'], dtype=object)

New Behavior:

.. ipython:: python
:okexcept:

pd.to_datetime(['2009-07-31', 'asd'])

Of course, you can coerce this to ``NaT`` as well:

.. ipython:: python

pd.to_datetime(['2009-07-31', 'asd'], errors='coerce')

To keep the previous behavior, you can use ``errors='ignore'``:

.. ipython:: python
:okexcept:

pd.to_datetime(['2009-07-31', 'asd'], errors='ignore')

``pd.to_timedelta`` gained a similar ``errors='raise'|'ignore'|'coerce'`` API, and the ``coerce`` keyword
has been deprecated in favor of ``errors='coerce'``.
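The parallel ``to_timedelta`` behavior can be sketched with the public API alone (assumes a recent pandas):

```python
import pandas as pd

# to_timedelta mirrors the to_datetime error modes: with errors='coerce',
# invalid entries become NaT while valid ones parse normally
td = pd.to_timedelta(['1 day', 'bar', '1 min'], errors='coerce')
print(td[0] == pd.Timedelta(days=1))  # True
print(pd.isna(td[1]))                 # True
```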

.. _whatsnew_0170.api_breaking.convert_objects:

Changes to convert_objects
12 changes: 6 additions & 6 deletions pandas/core/common.py
@@ -1903,9 +1903,9 @@ def _possibly_convert_objects(values,

# Immediate return if coerce
if datetime:
return pd.to_datetime(values, coerce=True, box=False)
return pd.to_datetime(values, errors='coerce', box=False)
elif timedelta:
return pd.to_timedelta(values, coerce=True, box=False)
return pd.to_timedelta(values, errors='coerce', box=False)
elif numeric:
return lib.maybe_convert_numeric(values, set(), coerce_numeric=True)

@@ -1958,7 +1958,7 @@ def _possibly_convert_platform(values):
return values


def _possibly_cast_to_datetime(value, dtype, coerce=False):
def _possibly_cast_to_datetime(value, dtype, errors='raise'):
""" try to cast the array/value to a datetimelike dtype, converting float
nan to iNaT
"""
@@ -2002,9 +2002,9 @@ def _possibly_cast_to_datetime(value, dtype, coerce=False):
elif np.prod(value.shape) and value.dtype != dtype:
try:
if is_datetime64:
value = to_datetime(value, coerce=coerce).values
value = to_datetime(value, errors=errors).values
elif is_timedelta64:
value = to_timedelta(value, coerce=coerce).values
value = to_timedelta(value, errors=errors).values
except (AttributeError, ValueError):
pass

@@ -2066,7 +2066,7 @@ def _possibly_infer_to_datetimelike(value, convert_dates=False):
def _try_datetime(v):
# safe coerce to datetime64
try:
return tslib.array_to_datetime(v, raise_=True).reshape(shape)
return tslib.array_to_datetime(v, errors='raise').reshape(shape)
except:
return v

5 changes: 2 additions & 3 deletions pandas/core/ops.py
@@ -341,7 +341,6 @@ def _convert_to_array(self, values, name=None, other=None):
"""converts values to ndarray"""
from pandas.tseries.timedeltas import to_timedelta

coerce = True
if not is_list_like(values):
values = np.array([values])
inferred_type = lib.infer_dtype(values)
@@ -362,7 +361,7 @@ def _convert_to_array(self, values, name=None, other=None):
values = tslib.array_to_datetime(values)
elif inferred_type in ('timedelta', 'timedelta64'):
# have a timedelta, convert to ns here
values = to_timedelta(values, coerce=coerce)
values = to_timedelta(values, errors='coerce')
elif inferred_type == 'integer':
# py3 compat where dtype is 'm' but is an integer
if values.dtype.kind == 'm':
@@ -381,7 +380,7 @@ def _convert_to_array(self, values, name=None, other=None):
"datetime/timedelta operations [{0}]".format(
', '.join([com.pprint_thing(v)
for v in values[mask]])))
values = to_timedelta(os, coerce=coerce)
values = to_timedelta(os, errors='coerce')
elif inferred_type == 'floating':

# all nan, so ok, use the other dtype (e.g. timedelta or datetime)
6 changes: 4 additions & 2 deletions pandas/io/parsers.py
@@ -2057,14 +2057,15 @@ def converter(*date_cols):
utc=None,
box=False,
dayfirst=dayfirst,
errors='ignore',
infer_datetime_format=infer_datetime_format
)
except:
return tools.to_datetime(
lib.try_parse_dates(strs, dayfirst=dayfirst))
else:
try:
result = tools.to_datetime(date_parser(*date_cols))
result = tools.to_datetime(date_parser(*date_cols), errors='ignore')
if isinstance(result, datetime.datetime):
raise Exception('scalar parser')
return result
@@ -2073,7 +2074,8 @@ def converter(*date_cols):
return tools.to_datetime(
lib.try_parse_dates(_concat_date_cols(date_cols),
parser=date_parser,
dayfirst=dayfirst))
dayfirst=dayfirst),
errors='ignore')
except Exception:
return generic_parser(date_parser, *date_cols)

8 changes: 4 additions & 4 deletions pandas/io/sql.py
@@ -80,17 +80,17 @@ def _convert_params(sql, params):

def _handle_date_column(col, format=None):
if isinstance(format, dict):
return to_datetime(col, **format)
return to_datetime(col, errors='ignore', **format)
else:
if format in ['D', 's', 'ms', 'us', 'ns']:
return to_datetime(col, coerce=True, unit=format, utc=True)
return to_datetime(col, errors='coerce', unit=format, utc=True)
elif (issubclass(col.dtype.type, np.floating)
or issubclass(col.dtype.type, np.integer)):
# parse dates as timestamp
format = 's' if format is None else format
return to_datetime(col, coerce=True, unit=format, utc=True)
return to_datetime(col, errors='coerce', unit=format, utc=True)
else:
return to_datetime(col, coerce=True, format=format, utc=True)
return to_datetime(col, errors='coerce', format=format, utc=True)
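The unit-based branch of ``_handle_date_column`` above can be sketched with the public API: numeric columns are treated as epoch offsets and coerced, so bad values become ``NaT`` (the sample Series below is hypothetical data, not from the source):

```python
import pandas as pd

# Numeric column parsed as epoch seconds; None/NaN coerces to NaT
col = pd.Series([0, 1_500_000_000, None])
parsed = pd.to_datetime(col, errors='coerce', unit='s', utc=True)
print(parsed[0])           # 1970-01-01 00:00:00+00:00
print(pd.isna(parsed[2]))  # True
```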


def _parse_date_columns(data_frame, parse_dates):
6 changes: 3 additions & 3 deletions pandas/io/tests/test_sql.py
@@ -216,7 +216,7 @@ def _get_all_tables(self):

def _close_conn(self):
pass

class PandasSQLTest(unittest.TestCase):
"""
Base class with common private methods for SQLAlchemy and fallback cases.
@@ -1271,7 +1271,7 @@ def test_datetime_NaT(self):
result = sql.read_sql_query('SELECT * FROM test_datetime', self.conn)
if self.flavor == 'sqlite':
self.assertTrue(isinstance(result.loc[0, 'A'], string_types))
result['A'] = to_datetime(result['A'], coerce=True)
result['A'] = to_datetime(result['A'], errors='coerce')
tm.assert_frame_equal(result, df)
else:
tm.assert_frame_equal(result, df)
@@ -1720,7 +1720,7 @@ class TestMySQLAlchemy(_TestMySQLAlchemy, _TestSQLAlchemy):
pass


class TestMySQLAlchemyConn(_TestMySQLAlchemy, _TestSQLAlchemyConn):
class TestMySQLAlchemyConn(_TestMySQLAlchemy, _TestSQLAlchemyConn):
pass


4 changes: 2 additions & 2 deletions pandas/io/tests/test_stata.py
@@ -419,7 +419,7 @@ def test_read_write_reread_dta14(self):
for col in cols:
expected[col] = expected[col].convert_objects(datetime=True, numeric=True)
expected['float_'] = expected['float_'].astype(np.float32)
expected['date_td'] = pd.to_datetime(expected['date_td'], coerce=True)
expected['date_td'] = pd.to_datetime(expected['date_td'], errors='coerce')

parsed_113 = self.read_dta(self.dta14_113)
parsed_113.index.name = 'index'
@@ -464,7 +464,7 @@ def test_timestamp_and_label(self):
data_label = 'This is a data file.'
with tm.ensure_clean() as path:
original.to_stata(path, time_stamp=time_stamp, data_label=data_label)

with StataReader(path) as reader:
parsed_time_stamp = dt.datetime.strptime(reader.time_stamp, ('%d %b %Y %H:%M'))
assert parsed_time_stamp == time_stamp
2 changes: 1 addition & 1 deletion pandas/tseries/tests/test_offsets.py
@@ -68,7 +68,7 @@ def test_to_datetime1():

# unparseable
s = 'Month 1, 1999'
assert to_datetime(s) == s
assert to_datetime(s, errors='ignore') == s


def test_normalize_date():
10 changes: 10 additions & 0 deletions pandas/tseries/tests/test_timedeltas.py
@@ -607,12 +607,22 @@ def testit(unit, transform):
# ms
testit('L',lambda x: 'ms')

def test_to_timedelta_invalid(self):

# these will error
self.assertRaises(ValueError, lambda : to_timedelta([1,2],unit='foo'))
self.assertRaises(ValueError, lambda : to_timedelta(1,unit='foo'))

# time not supported ATM
self.assertRaises(ValueError, lambda :to_timedelta(time(second=1)))
self.assertTrue(to_timedelta(time(second=1), errors='coerce') is pd.NaT)

self.assertRaises(ValueError, lambda : to_timedelta(['foo','bar']))
tm.assert_index_equal(TimedeltaIndex([pd.NaT,pd.NaT]),
to_timedelta(['foo','bar'], errors='coerce'))

tm.assert_index_equal(TimedeltaIndex(['1 day', pd.NaT, '1 min']),
to_timedelta(['1 day','bar','1 min'], errors='coerce'))
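The new assertions above can be reproduced directly against the public API (a sketch; assumes a recent pandas):

```python
import pandas as pd

# An unrecognized unit raises ValueError regardless of errors=
try:
    pd.to_timedelta([1, 2], unit='foo')
except ValueError:
    print('invalid unit raises ValueError')

# With errors='coerce', unparseable strings all become NaT
bad = pd.to_timedelta(['foo', 'bar'], errors='coerce')
print(pd.isna(bad).all())  # True
```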

def test_to_timedelta_via_apply(self):
# GH 5458
