Merge remote-tracking branch 'upstream/master' into ea-repr
TomAugspurger committed Nov 12, 2018
2 parents 0f4083e + 011b79f commit 9116930
Showing 12 changed files with 227 additions and 28 deletions.
5 changes: 5 additions & 0 deletions doc/source/io.rst
@@ -2854,6 +2854,11 @@ It is often the case that users will insert columns to do temporary computations
in Excel and you may not want to read in those columns. ``read_excel`` takes
a ``usecols`` keyword to allow you to specify a subset of columns to parse.

.. deprecated:: 0.24.0

Passing in an integer for ``usecols`` has been deprecated. Please pass in a list
of ints from 0 to ``usecols`` inclusive instead.

If ``usecols`` is an integer, then it is assumed to indicate the last column
to be parsed.
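For example, assuming a hypothetical workbook ``tmp.xlsx`` whose first three columns are wanted, the deprecated integer form and its list replacement parse the same columns:

>>> import pandas as pd
>>> pd.read_excel('tmp.xlsx', usecols=2)          # deprecated: columns 0, 1 and 2
>>> pd.read_excel('tmp.xlsx', usecols=[0, 1, 2])  # equivalent explicit list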

2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.24.0.txt
@@ -972,6 +972,7 @@ Deprecations
- Deprecated the `nthreads` keyword of :func:`pandas.read_feather` in favor of
`use_threads` to reflect the changes in pyarrow 0.11.0. (:issue:`23053`)
- :meth:`ExtensionArray._formatting_values` is deprecated. Use `ExtensionArray._formatter` instead. (:issue:`23601`)
- :func:`pandas.read_excel` has deprecated accepting ``usecols`` as an integer. Please pass in a list of ints from 0 to ``usecols`` inclusive instead (:issue:`23527`)
- Constructing a :class:`TimedeltaIndex` from data with ``datetime64``-dtyped data is deprecated, will raise ``TypeError`` in a future version (:issue:`23539`)

.. _whatsnew_0240.deprecations.datetimelike_int_ops:
@@ -1301,6 +1302,7 @@ Notice how we now output ``np.nan`` itself instead of a stringified form
- :func:`read_excel()` will correctly show the deprecation warning for previously deprecated ``sheetname`` (:issue:`17994`)
- :func:`read_csv()` and :func:`read_table()` will raise ``UnicodeError`` instead of crashing on badly encoded strings (:issue:`22748`)
- :func:`read_csv()` will correctly parse timezone-aware datetimes (:issue:`22256`)
- Bug in :func:`read_csv()` in which memory management was prematurely optimized for the C engine when the data was being read in chunks (:issue:`23509`)
- :func:`read_sas()` will correctly parse numbers in sas7bdat files with width less than 8 bytes (:issue:`21616`)
- :func:`read_sas()` will correctly parse sas7bdat files with many columns (:issue:`22628`)
- :func:`read_sas()` will correctly parse sas7bdat files with data page types that also have bit 7 set (so page type is 128 + 256 = 384) (:issue:`16615`)
1 change: 1 addition & 0 deletions pandas/_libs/parsers.pyx
@@ -132,6 +132,7 @@ cdef extern from "parser/tokenizer.h":
int64_t *word_starts # where we are in the stream
int64_t words_len
int64_t words_cap
int64_t max_words_cap # maximum word cap encountered

char *pword_start # pointer to stream start of current field
int64_t word_start # position start of current field
33 changes: 31 additions & 2 deletions pandas/_libs/src/parser/tokenizer.c
@@ -197,6 +197,7 @@ int parser_init(parser_t *self) {
sz = sz ? sz : 1;
self->words = (char **)malloc(sz * sizeof(char *));
self->word_starts = (int64_t *)malloc(sz * sizeof(int64_t));
self->max_words_cap = sz;
self->words_cap = sz;
self->words_len = 0;

@@ -247,7 +248,7 @@ void parser_del(parser_t *self) {
}

static int make_stream_space(parser_t *self, size_t nbytes) {
- int64_t i, cap;
+ int64_t i, cap, length;
int status;
void *orig_ptr, *newptr;

@@ -287,8 +288,23 @@ static int make_stream_space(parser_t *self, size_t nbytes) {
*/

cap = self->words_cap;

/**
* If we are reading in chunks, we need to be aware of the maximum number
* of words we have seen in previous chunks (self->max_words_cap), so
* that we can allocate properly when reading subsequent chunks.
*
* Otherwise, we risk a buffer overflow if we mistakenly under-allocate
* just because a recent chunk did not have as many words.
*/
if (self->words_len + nbytes < self->max_words_cap) {
length = self->max_words_cap - nbytes;
} else {
length = self->words_len;
}

self->words =
- (char **)grow_buffer((void *)self->words, self->words_len,
+ (char **)grow_buffer((void *)self->words, length,
(int64_t*)&self->words_cap, nbytes,
sizeof(char *), &status);
TRACE(
@@ -1241,6 +1257,19 @@ int parser_trim_buffers(parser_t *self) {

int64_t i;

/**
* Before we free up space and trim, we should
* save how many words we saw when parsing, if
* it exceeds the maximum number we saw before.
*
* This is important for when we read in chunks,
* so that we can inform subsequent chunk parsing
* as to how many words we could possibly see.
*/
if (self->words_cap > self->max_words_cap) {
self->max_words_cap = self->words_cap;
}

/* trim words, word_starts */
new_cap = _next_pow2(self->words_len) + 1;
if (new_cap < self->words_cap) {
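To see why ``make_stream_space`` above must remember the largest capacity seen across chunks, here is a small Python sketch of its allocation decision (the numbers are hypothetical):

>>> max_words_cap, nbytes, words_len = 1000, 200, 50
>>> # A small recent chunk (50 words) must not shrink the allocation below
>>> # what an earlier 1000-word chunk proved necessary.
>>> max_words_cap - nbytes if words_len + nbytes < max_words_cap else words_len
800

Growing the buffer from 800 by another 200 words restores the proven 1000-word capacity, so a later large chunk cannot overflow it.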
1 change: 1 addition & 0 deletions pandas/_libs/src/parser/tokenizer.h
@@ -142,6 +142,7 @@ typedef struct parser_t {
int64_t *word_starts; // where we are in the stream
int64_t words_len;
int64_t words_cap;
int64_t max_words_cap; // maximum word cap encountered

char *pword_start; // pointer to stream start of current field
int64_t word_start; // position start of current field
105 changes: 91 additions & 14 deletions pandas/core/arrays/sparse.py
@@ -284,6 +284,83 @@ def is_dtype(cls, dtype):
return True
return isinstance(dtype, np.dtype) or dtype == 'Sparse'

def update_dtype(self, dtype):
"""Convert the SparseDtype to a new dtype.
This takes care of converting the ``fill_value``.
Parameters
----------
dtype : Union[str, numpy.dtype, SparseDtype]
The new dtype to use.
* For a SparseDtype, it is simply returned
* For a NumPy dtype (or str), the current fill value
is converted to the new dtype, and a SparseDtype
with `dtype` and the new fill value is returned.
Returns
-------
SparseDtype
A new SparseDtype with the corret `dtype` and fill value
for that `dtype`.
Raises
------
ValueError
When the current fill value cannot be converted to the
new `dtype` (e.g. trying to convert ``np.nan`` to an
integer dtype).
Examples
--------
>>> SparseDtype(int, 0).update_dtype(float)
Sparse[float64, 0.0]
>>> SparseDtype(int, 1).update_dtype(SparseDtype(float, np.nan))
Sparse[float64, nan]
"""
cls = type(self)
dtype = pandas_dtype(dtype)

if not isinstance(dtype, cls):
fill_value = astype_nansafe(np.array(self.fill_value),
dtype).item()
dtype = cls(dtype, fill_value=fill_value)

return dtype

@property
def _subtype_with_str(self):
"""
Whether the SparseDtype's subtype should be considered ``str``.
Typically, pandas will store string data in an object-dtype array.
When converting values to a dtype, e.g. in ``.astype``, we need to
be more specific, we need the actual underlying type.
Returns
-------
>>> SparseDtype(int, 1)._subtype_with_str
dtype('int64')
>>> SparseDtype(object, 1)._subtype_with_str
dtype('O')
>>> dtype = SparseDtype(str, '')
>>> dtype.subtype
dtype('O')
>>> dtype._subtype_with_str
str
"""
if isinstance(self.fill_value, compat.string_types):
return type(self.fill_value)
return self.subtype


# ----------------------------------------------------------------------------
# Array

@@ -614,7 +691,7 @@ def __array__(self, dtype=None, copy=True):
# Can't put pd.NaT in a datetime64[ns]
fill_value = np.datetime64('NaT')
try:
- dtype = np.result_type(self.sp_values.dtype, fill_value)
+ dtype = np.result_type(self.sp_values.dtype, type(fill_value))
except TypeError:
dtype = object

@@ -996,7 +1073,7 @@ def _take_with_fill(self, indices, fill_value=None):
if len(self) == 0:
# Empty... Allow taking only if all empty
if (indices == -1).all():
- dtype = np.result_type(self.sp_values, fill_value)
+ dtype = np.result_type(self.sp_values, type(fill_value))
taken = np.empty_like(indices, dtype=dtype)
taken.fill(fill_value)
return taken
@@ -1009,7 +1086,7 @@ def _take_with_fill(self, indices, fill_value=None):
if self.sp_index.npoints == 0:
# Avoid taking from the empty self.sp_values
taken = np.full(sp_indexer.shape, fill_value=fill_value,
- dtype=np.result_type(fill_value))
+ dtype=np.result_type(type(fill_value)))
else:
taken = self.sp_values.take(sp_indexer)

@@ -1030,12 +1107,13 @@ def _take_with_fill(self, indices, fill_value=None):
result_type = taken.dtype

if m0.any():
- result_type = np.result_type(result_type, self.fill_value)
+ result_type = np.result_type(result_type,
+ type(self.fill_value))
taken = taken.astype(result_type)
taken[old_fill_indices] = self.fill_value

if m1.any():
- result_type = np.result_type(result_type, fill_value)
+ result_type = np.result_type(result_type, type(fill_value))
taken = taken.astype(result_type)
taken[new_fill_indices] = fill_value

@@ -1061,7 +1139,7 @@ def _take_without_fill(self, indices):
# edge case in take...
# I think just return
out = np.full(indices.shape, self.fill_value,
- dtype=np.result_type(self.fill_value))
+ dtype=np.result_type(type(self.fill_value)))
arr, sp_index, fill_value = make_sparse(out,
fill_value=self.fill_value)
return type(self)(arr, sparse_index=sp_index,
@@ -1073,7 +1151,7 @@ def _take_without_fill(self, indices):

if fillable.any():
# TODO: may need to coerce array to fill value
- result_type = np.result_type(taken, self.fill_value)
+ result_type = np.result_type(taken, type(self.fill_value))
taken = taken.astype(result_type)
taken[fillable] = self.fill_value

@@ -1093,7 +1171,9 @@ def _concat_same_type(cls, to_concat):

fill_value = fill_values[0]

- if len(set(fill_values)) > 1:
+ # np.nan isn't a singleton, so we may end up with multiple
+ # NaNs here, so we ignore the all-NA case too.
+ if not (len(set(fill_values)) == 1 or isna(fill_values).all()):
warnings.warn("Concatenating sparse arrays with multiple fill "
"values: '{}'. Picking the first and "
"converting the rest.".format(fill_values),
@@ -1212,13 +1292,10 @@ def astype(self, dtype=None, copy=True):
IntIndex
Indices: array([2, 3], dtype=int32)
"""
- dtype = pandas_dtype(dtype)
-
- if not isinstance(dtype, SparseDtype):
- dtype = SparseDtype(dtype, fill_value=self.fill_value)
-
+ dtype = self.dtype.update_dtype(dtype)
+ subtype = dtype._subtype_with_str
sp_values = astype_nansafe(self.sp_values,
- dtype.subtype,
+ subtype,
copy=copy)
if sp_values is self.sp_values and copy:
sp_values = sp_values.copy()
8 changes: 8 additions & 0 deletions pandas/io/excel.py
@@ -95,6 +95,10 @@
usecols : int, str, list-like, or callable, default None
* If None, then parse all columns,
* If int, then indicates last column to be parsed

.. deprecated:: 0.24.0
   Pass in a list of ints from 0 to `usecols` inclusive instead.

* If string, then indicates comma separated list of Excel column letters
and column ranges (e.g. "A:E" or "A,C,E:F"). Ranges are inclusive of
both sides.
@@ -778,6 +782,10 @@ def _maybe_convert_usecols(usecols):
return usecols

if is_integer(usecols):
warnings.warn(("Passing in an integer for `usecols` has been "
"deprecated. Please pass in a list of ints from "
"0 to `usecols` inclusive instead."),
FutureWarning, stacklevel=2)
return lrange(usecols + 1)

if isinstance(usecols, compat.string_types):
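A quick sketch of the conversion the integer branch above performs (``lrange`` is the pandas compat alias for ``list(range(...))``; the value 3 is an arbitrary example):

>>> from pandas.compat import lrange
>>> lrange(3 + 1)  # usecols=3 now means 'the first four columns'
[0, 1, 2, 3]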
3 changes: 2 additions & 1 deletion pandas/io/stata.py
@@ -461,7 +461,8 @@ def parse_dates_safe(dates, delta=False, year=False, days=False):

excessive_string_length_error = """
Fixed width strings in Stata .dta files are limited to 244 (or fewer)
- characters. Column '%s' does not satisfy this restriction.
+ characters. Column '%s' does not satisfy this restriction. Use the
+ 'version=117' parameter to write the newer (Stata 13 and later) format.
"""


28 changes: 28 additions & 0 deletions pandas/tests/arrays/sparse/test_array.py
@@ -477,6 +477,34 @@ def test_astype_all(self, any_real_dtype):
tm.assert_numpy_array_equal(np.asarray(res.values),
vals.astype(typ))

@pytest.mark.parametrize('array, dtype, expected', [
(SparseArray([0, 1]), 'float',
SparseArray([0., 1.], dtype=SparseDtype(float, 0.0))),
(SparseArray([0, 1]), bool, SparseArray([False, True])),
(SparseArray([0, 1], fill_value=1), bool,
SparseArray([False, True], dtype=SparseDtype(bool, True))),
pytest.param(
SparseArray([0, 1]), 'datetime64[ns]',
SparseArray(np.array([0, 1], dtype='datetime64[ns]'),
dtype=SparseDtype('datetime64[ns]',
pd.Timestamp('1970'))),
marks=[pytest.mark.xfail(reason="NumPy-7619", strict=True)],
),
(SparseArray([0, 1, 10]), str,
SparseArray(['0', '1', '10'], dtype=SparseDtype(str, '0'))),
(SparseArray(['10', '20']), float, SparseArray([10.0, 20.0])),
(SparseArray([0, 1, 0]), object,
SparseArray([0, 1, 0], dtype=SparseDtype(object, 0))),
])
def test_astype_more(self, array, dtype, expected):
result = array.astype(dtype)
tm.assert_sp_array_equal(result, expected)

def test_astype_nan_raises(self):
arr = SparseArray([1.0, np.nan])
with pytest.raises(ValueError, match='Cannot convert non-finite'):
arr.astype(int)

def test_set_fill_value(self):
arr = SparseArray([1., np.nan, 2.], fill_value=np.nan)
arr.fill_value = 2
20 changes: 20 additions & 0 deletions pandas/tests/arrays/sparse/test_dtype.py
@@ -139,3 +139,23 @@ def test_parse_subtype(string, expected):
def test_construct_from_string_fill_value_raises(string):
with pytest.raises(TypeError, match='fill_value in the string is not'):
SparseDtype.construct_from_string(string)


@pytest.mark.parametrize('original, dtype, expected', [
(SparseDtype(int, 0), float, SparseDtype(float, 0.0)),
(SparseDtype(int, 1), float, SparseDtype(float, 1.0)),
(SparseDtype(int, 1), str, SparseDtype(object, '1')),
(SparseDtype(float, 1.5), int, SparseDtype(int, 1)),
])
def test_update_dtype(original, dtype, expected):
result = original.update_dtype(dtype)
assert result == expected


@pytest.mark.parametrize("original, dtype", [
(SparseDtype(float, np.nan), int),
(SparseDtype(str, 'abc'), int),
])
def test_update_dtype_raises(original, dtype):
with pytest.raises(ValueError):
original.update_dtype(dtype)
16 changes: 16 additions & 0 deletions pandas/tests/io/parser/common.py
@@ -458,6 +458,22 @@ def test_read_chunksize_generated_index(self):

tm.assert_frame_equal(pd.concat(reader), df)

def test_read_chunksize_jagged_names(self):
# see gh-23509
data = "\n".join(["0"] * 7 + [",".join(["0"] * 10)])
reader = self.read_csv(StringIO(data), names=range(10), chunksize=4)

expected = DataFrame()

for i in range(10):
if i == 0:
expected[i] = [0] * 8
else:
expected[i] = [np.nan] * 7 + [0]

result = pd.concat(reader)
tm.assert_frame_equal(result, expected)

def test_read_text_list(self):
data = """A,B,C\nfoo,1,2,3\nbar,4,5,6"""
as_list = [['A', 'B', 'C'], ['foo', '1', '2', '3'], ['bar',