BUG: Better handle larger numbers in to_numeric (#24956)

* BUG: Better handle larger numbers in to_numeric * Warn about lossiness when passing really large numbers that exceed (u)int64 ranges. * Coerce negative numbers to float when requested instead of crashing and returning object. * Consistently parse numbers as integers / floats, even if we know that the resulting container has to be float. This is to ensure consistent error behavior when inputs numbers are too large. Closes gh-24910. * MAINT: Address comments
pandas-dev · Jan 31, 2019 · 149138e · 149138e
1 parent 534bce9
commit 149138e
Show file tree

Hide file tree

Showing 4 changed files with 158 additions and 25 deletions.
diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
@@ -110,6 +110,8 @@ Timezones
 Numeric
 ^^^^^^^
 
+- Bug in :meth:`to_numeric` in which large negative numbers were being improperly handled (:issue:`24910`)
+- Bug in :meth:`to_numeric` in which numbers were being coerced to float, even though ``errors`` was not ``coerce`` (:issue:`24910`)
 -
 -
 -

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
@@ -1828,7 +1828,7 @@ def maybe_convert_numeric(ndarray[object] values, set na_values,
         except (ValueError, OverflowError, TypeError):
             pass
 
-    # otherwise, iterate and do full infererence
+    # Otherwise, iterate and do full inference.
     cdef:
         int status, maybe_int
         Py_ssize_t i, n = values.size
@@ -1865,10 +1865,10 @@ def maybe_convert_numeric(ndarray[object] values, set na_values,
                 else:
                     seen.float_ = True
 
-            if val <= oINT64_MAX:
+            if oINT64_MIN <= val <= oINT64_MAX:
                 ints[i] = val
 
-            if seen.sint_ and seen.uint_:
+            if val < oINT64_MIN or (seen.sint_ and seen.uint_):
                 seen.float_ = True
 
         elif util.is_bool_object(val):
@@ -1910,23 +1910,28 @@ def maybe_convert_numeric(ndarray[object] values, set na_values,
                     else:
                         seen.saw_int(as_int)
 
-                    if not (seen.float_ or as_int in na_values):
+                    if as_int not in na_values:
                         if as_int < oINT64_MIN or as_int > oUINT64_MAX:
-                            raise ValueError('Integer out of range.')
+                            if seen.coerce_numeric:
+                                seen.float_ = True
+                            else:
+                                raise ValueError("Integer out of range.")
+                        else:
+                            if as_int >= 0:
+                                uints[i] = as_int
 
-                        if as_int >= 0:
-                            uints[i] = as_int
-                        if as_int <= oINT64_MAX:
-                            ints[i] = as_int
+                            if as_int <= oINT64_MAX:
+                                ints[i] = as_int
 
                     seen.float_ = seen.float_ or (seen.uint_ and seen.sint_)
                 else:
                     seen.float_ = True
             except (TypeError, ValueError) as e:
                 if not seen.coerce_numeric:
-                    raise type(e)(str(e) + ' at position {pos}'.format(pos=i))
+                    raise type(e)(str(e) + " at position {pos}".format(pos=i))
                 elif "uint64" in str(e):  # Exception from check functions.
                     raise
+
                 seen.saw_null()
                 floats[i] = NaN
 

diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py
@@ -19,6 +19,14 @@ def to_numeric(arg, errors='raise', downcast=None):
     depending on the data supplied. Use the `downcast` parameter
     to obtain other dtypes.
 
+    Please note that precision loss may occur if really large numbers
+    are passed in. Due to the internal limitations of `ndarray`, if
+    numbers smaller than `-9223372036854775808` (np.iinfo(np.int64).min)
+    or larger than `18446744073709551615` (np.iinfo(np.uint64).max) are
+    passed in, it is very likely they will be converted to float so that
+    they can stored in an `ndarray`. These warnings apply similarly to
+    `Series` since it internally leverages `ndarray`.
+
     Parameters
     ----------
     arg : scalar, list, tuple, 1-d array, or Series

diff --git a/pandas/tests/tools/test_numeric.py b/pandas/tests/tools/test_numeric.py
@@ -4,11 +4,50 @@
 from numpy import iinfo
 import pytest
 
+import pandas.compat as compat
+
 import pandas as pd
 from pandas import DataFrame, Index, Series, to_numeric
 from pandas.util import testing as tm
 
 
+@pytest.fixture(params=[None, "ignore", "raise", "coerce"])
+def errors(request):
+    return request.param
+
+
+@pytest.fixture(params=[True, False])
+def signed(request):
+    return request.param
+
+
+@pytest.fixture(params=[lambda x: x, str], ids=["identity", "str"])
+def transform(request):
+    return request.param
+
+
+@pytest.fixture(params=[
+    47393996303418497800,
+    100000000000000000000
+])
+def large_val(request):
+    return request.param
+
+
+@pytest.fixture(params=[True, False])
+def multiple_elts(request):
+    return request.param
+
+
+@pytest.fixture(params=[
+    (lambda x: Index(x, name="idx"), tm.assert_index_equal),
+    (lambda x: Series(x, name="ser"), tm.assert_series_equal),
+    (lambda x: np.array(Index(x).values), tm.assert_numpy_array_equal)
+])
+def transform_assert_equal(request):
+    return request.param
+
+
 @pytest.mark.parametrize("input_kwargs,result_kwargs", [
     (dict(), dict(dtype=np.int64)),
     (dict(errors="coerce", downcast="integer"), dict(dtype=np.int8))
@@ -172,7 +211,6 @@ def test_all_nan():
     tm.assert_series_equal(result, expected)
 
 
-@pytest.mark.parametrize("errors", [None, "ignore", "raise", "coerce"])
 def test_type_check(errors):
     # see gh-11776
     df = DataFrame({"a": [1, -3.14, 7], "b": ["4", "5", "6"]})
@@ -183,11 +221,100 @@ def test_type_check(errors):
         to_numeric(df, **kwargs)
 
 
-@pytest.mark.parametrize("val", [
-    1, 1.1, "1", "1.1", -1.5, "-1.5"
-])
-def test_scalar(val):
-    assert to_numeric(val) == float(val)
+@pytest.mark.parametrize("val", [1, 1.1, 20001])
+def test_scalar(val, signed, transform):
+    val = -val if signed else val
+    assert to_numeric(transform(val)) == float(val)
+
+
+def test_really_large_scalar(large_val, signed, transform, errors):
+    # see gh-24910
+    kwargs = dict(errors=errors) if errors is not None else dict()
+    val = -large_val if signed else large_val
+
+    val = transform(val)
+    val_is_string = isinstance(val, str)
+
+    if val_is_string and errors in (None, "raise"):
+        msg = "Integer out of range. at position 0"
+        with pytest.raises(ValueError, match=msg):
+            to_numeric(val, **kwargs)
+    else:
+        expected = float(val) if (errors == "coerce" and
+                                  val_is_string) else val
+        assert tm.assert_almost_equal(to_numeric(val, **kwargs), expected)
+
+
+def test_really_large_in_arr(large_val, signed, transform,
+                             multiple_elts, errors):
+    # see gh-24910
+    kwargs = dict(errors=errors) if errors is not None else dict()
+    val = -large_val if signed else large_val
+    val = transform(val)
+
+    extra_elt = "string"
+    arr = [val] + multiple_elts * [extra_elt]
+
+    val_is_string = isinstance(val, str)
+    coercing = errors == "coerce"
+
+    if errors in (None, "raise") and (val_is_string or multiple_elts):
+        if val_is_string:
+            msg = "Integer out of range. at position 0"
+        else:
+            msg = 'Unable to parse string "string" at position 1'
+
+        with pytest.raises(ValueError, match=msg):
+            to_numeric(arr, **kwargs)
+    else:
+        result = to_numeric(arr, **kwargs)
+
+        exp_val = float(val) if (coercing and val_is_string) else val
+        expected = [exp_val]
+
+        if multiple_elts:
+            if coercing:
+                expected.append(np.nan)
+                exp_dtype = float
+            else:
+                expected.append(extra_elt)
+                exp_dtype = object
+        else:
+            exp_dtype = float if isinstance(exp_val, (
+                int, compat.long, float)) else object
+
+        tm.assert_almost_equal(result, np.array(expected, dtype=exp_dtype))
+
+
+def test_really_large_in_arr_consistent(large_val, signed,
+                                        multiple_elts, errors):
+    # see gh-24910
+    #
+    # Even if we discover that we have to hold float, does not mean
+    # we should be lenient on subsequent elements that fail to be integer.
+    kwargs = dict(errors=errors) if errors is not None else dict()
+    arr = [str(-large_val if signed else large_val)]
+
+    if multiple_elts:
+        arr.insert(0, large_val)
+
+    if errors in (None, "raise"):
+        index = int(multiple_elts)
+        msg = "Integer out of range. at position {index}".format(index=index)
+
+        with pytest.raises(ValueError, match=msg):
+            to_numeric(arr, **kwargs)
+    else:
+        result = to_numeric(arr, **kwargs)
+
+        if errors == "coerce":
+            expected = [float(i) for i in arr]
+            exp_dtype = float
+        else:
+            expected = arr
+            exp_dtype = object
+
+        tm.assert_almost_equal(result, np.array(expected, dtype=exp_dtype))
 
 
 @pytest.mark.parametrize("errors,checker", [
@@ -205,15 +332,6 @@ def test_scalar_fail(errors, checker):
         assert checker(to_numeric(scalar, errors=errors))
 
 
-@pytest.fixture(params=[
-    (lambda x: Index(x, name="idx"), tm.assert_index_equal),
-    (lambda x: Series(x, name="ser"), tm.assert_series_equal),
-    (lambda x: np.array(Index(x).values), tm.assert_numpy_array_equal)
-])
-def transform_assert_equal(request):
-    return request.param
-
-
 @pytest.mark.parametrize("data", [
     [1, 2, 3],
     [1., np.nan, 3, np.nan]