diff --git a/doc/source/io.rst b/doc/source/io.rst
index 1a777c3e0b15f..184767015bf93 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -143,15 +143,6 @@ usecols : array-like or callable, default ``None``
        pd.read_csv(StringIO(data), usecols=lambda x: x.upper() in ['COL1', 'COL3'])
 
     Using this parameter results in much faster parsing time and lower memory usage.
-as_recarray : boolean, default ``False``
-    .. deprecated:: 0.18.2
-
-       Please call ``pd.read_csv(...).to_records()`` instead.
-
-       Return a NumPy recarray instead of a DataFrame after parsing the data. If
-       set to ``True``, this option takes precedence over the ``squeeze`` parameter.
-       In addition, as row indices are not available in such a format, the ``index_col``
-       parameter will be ignored.
 squeeze : boolean, default ``False``
     If the parsed data only contains one column then return a Series.
 prefix : str, default ``None``
diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt
index 9dc10a09378f8..7d8770723b160 100644
--- a/doc/source/whatsnew/v0.22.0.txt
+++ b/doc/source/whatsnew/v0.22.0.txt
@@ -221,6 +221,7 @@ Removal of prior version deprecations/changes
   and Series (deprecated since v0.18). Instead, resample before calling the methods. (:issue:`18601` & :issue:`18668`)
 - ``DatetimeIndex.to_datetime``, ``Timestamp.to_datetime``, ``PeriodIndex.to_datetime``, and ``Index.to_datetime`` have been removed (:issue:`8254`, :issue:`14096`, :issue:`14113`)
 - :func:`read_csv` has dropped the ``skip_footer`` parameter (:issue:`13386`)
+- :func:`read_csv` has dropped the ``as_recarray`` parameter (:issue:`13373`)
 
 .. _whatsnew_0220.performance:
 
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
index 8d4f2af19701a..c6899fa527b6e 100644
--- a/pandas/_libs/parsers.pyx
+++ b/pandas/_libs/parsers.pyx
@@ -91,7 +91,6 @@ except NameError:
     basestring = str
 
 cdef extern from "src/numpy_helper.h":
-    object sarr_from_data(cnp.dtype, int length, void* data)
     void transfer_object_column(char *dst, char *src, size_t stride,
                                 size_t length)
 
@@ -302,7 +301,6 @@ cdef class TextReader:
         object delimiter, converters, delim_whitespace
         object na_values
         object memory_map
-        object as_recarray
         object header, orig_header, names, header_start, header_end
         object index_col
         object low_memory
@@ -334,8 +332,6 @@
 
                   converters=None,
 
-                  as_recarray=False,
-
                   skipinitialspace=False,
                   escapechar=None,
                   doublequote=True,
@@ -489,8 +485,6 @@
         self.converters = converters
 
         self.na_filter = na_filter
-        self.as_recarray = as_recarray
-
         self.compact_ints = compact_ints
         self.use_unsigned = use_unsigned
 
@@ -903,14 +897,7 @@ cdef class TextReader:
             # Don't care about memory usage
            columns = self._read_rows(rows, 1)
 
-        if self.as_recarray:
-            self._start_clock()
-            result = _to_structured_array(columns, self.header, self.usecols)
-            self._end_clock('Conversion to structured array')
-
-            return result
-        else:
-            return columns
+        return columns
 
     cdef _read_low_memory(self, rows):
         cdef:
@@ -999,7 +986,7 @@
 
         self._start_clock()
         columns = self._convert_column_data(rows=rows, footer=footer,
-                                            upcast_na=not self.as_recarray)
+                                            upcast_na=True)
         self._end_clock('Type conversion')
 
         self._start_clock()
@@ -2321,77 +2308,6 @@ cdef _apply_converter(object f, parser_t *parser, int64_t col,
     return lib.maybe_convert_objects(result)
 
 
-def _to_structured_array(dict columns, object names, object usecols):
-    cdef:
-        ndarray recs, column
-        cnp.dtype dt
-        dict fields
-
-        object name, fnames, field_type
-        Py_ssize_t i, offset, nfields, length
-        int64_t stride, elsize
-        char *buf
-
-    if names is None:
-        names = ['%d' % i for i in range(len(columns))]
-    else:
-        # single line header
-        names = names[0]
-
-    if usecols is not None:
-        names = [n for i, n in enumerate(names)
-                 if i in usecols or n in usecols]
-
-    dt = np.dtype([(str(name), columns[i].dtype)
-                   for i, name in enumerate(names)])
-    fnames = dt.names
-    fields = dt.fields
-
-    nfields = len(fields)
-
-    if PY3:
-        length = len(list(columns.values())[0])
-    else:
-        length = len(columns.values()[0])
-
-    stride = dt.itemsize
-
-    # We own the data.
-    buf = <char*> malloc(length * stride)
-
-    recs = sarr_from_data(dt, length, buf)
-    assert(recs.flags.owndata)
-
-    for i in range(nfields):
-        # XXX
-        field_type = fields[fnames[i]]
-
-        # (dtype, stride) tuple
-        offset = field_type[1]
-        elsize = field_type[0].itemsize
-        column = columns[i]
-
-        _fill_structured_column(buf + offset, <char*> column.data,
-                                elsize, stride, length,
-                                field_type[0] == np.object_)
-
-    return recs
-
-
-cdef _fill_structured_column(char *dst, char* src, int64_t elsize,
-                             int64_t stride, int64_t length, bint incref):
-    cdef:
-        int64_t i
-
-    if incref:
-        transfer_object_column(dst, src, stride, length)
-    else:
-        for i in range(length):
-            memcpy(dst, src, elsize)
-            dst += stride
-            src += elsize
-
-
 def _maybe_encode(values):
     if values is None:
         return []
diff --git a/pandas/_libs/src/numpy_helper.h b/pandas/_libs/src/numpy_helper.h
index 8a9a05723d9fe..de3486eca3e9b 100644
--- a/pandas/_libs/src/numpy_helper.h
+++ b/pandas/_libs/src/numpy_helper.h
@@ -75,19 +75,6 @@ PANDAS_INLINE PyObject* char_to_string(char* data) {
 #endif
 }
 
-PyObject* sarr_from_data(PyArray_Descr* descr, int length, void* data) {
-  PyArrayObject* result;
-  npy_intp dims[1] = {length};
-  Py_INCREF(descr);  // newfromdescr steals a reference to descr
-  result = (PyArrayObject*)PyArray_NewFromDescr(&PyArray_Type, descr, 1, dims,
-                                                NULL, data, 0, NULL);
-
-  // Returned array doesn't own data by default
-  result->flags |= NPY_OWNDATA;
-
-  return (PyObject*)result;
-}
-
 void transfer_object_column(char* dst, char* src, size_t stride,
                             size_t length) {
   size_t i;
@@ -105,7 +92,6 @@ void transfer_object_column(char* dst, char* src, size_t stride,
   }
 }
 
-
 void set_array_not_contiguous(PyArrayObject* ao) {
   ao->flags &= ~(NPY_C_CONTIGUOUS | NPY_F_CONTIGUOUS);
 }
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 927edbf236366..c2fca1f961222 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -108,14 +108,6 @@
     example of a valid callable argument would be ``lambda x: x.upper() in
     ['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster
     parsing time and lower memory usage.
-as_recarray : boolean, default False
-    .. deprecated:: 0.19.0
-        Please call `pd.read_csv(...).to_records()` instead.
-
-    Return a NumPy recarray instead of a DataFrame after parsing the data.
-    If set to True, this option takes precedence over the `squeeze` parameter.
-    In addition, as row indices are not available in such a format, the
-    `index_col` parameter will be ignored.
 squeeze : boolean, default False
     If the parsed data only contains one column then return a Series
 prefix : str, default None
@@ -506,7 +498,6 @@ def _read(filepath_or_buffer, kwds):
 
 _c_parser_defaults = {
     'delim_whitespace': False,
-    'as_recarray': False,
     'na_filter': True,
     'compact_ints': False,
     'use_unsigned': False,
@@ -532,14 +523,12 @@ def _read(filepath_or_buffer, kwds):
 }
 
 _deprecated_defaults = {
-    'as_recarray': None,
     'buffer_lines': None,
     'compact_ints': None,
     'use_unsigned': None,
     'tupleize_cols': None
 }
 _deprecated_args = {
-    'as_recarray',
     'buffer_lines',
     'compact_ints',
     'use_unsigned',
@@ -614,7 +603,6 @@ def parser_f(filepath_or_buffer,
                  # Internal
                  doublequote=True,
                  delim_whitespace=False,
-                 as_recarray=None,
                  compact_ints=None,
                  use_unsigned=None,
                  low_memory=_c_parser_defaults['low_memory'],
@@ -685,7 +673,6 @@ def parser_f(filepath_or_buffer,
                     compact_ints=compact_ints,
                     use_unsigned=use_unsigned,
                     delim_whitespace=delim_whitespace,
-                    as_recarray=as_recarray,
                     warn_bad_lines=warn_bad_lines,
                     error_bad_lines=error_bad_lines,
                     low_memory=low_memory,
@@ -971,9 +958,7 @@ def _clean_options(self, options, engine):
                    "and will be removed in a future version."
                    .format(arg=arg))
 
-            if arg == 'as_recarray':
-                msg += ' Please call pd.to_csv(...).to_records() instead.'
-            elif arg == 'tupleize_cols':
+            if arg == 'tupleize_cols':
                 msg += (' Column tuples will then '
                         'always be converted to MultiIndex.')
 
@@ -1059,9 +1044,6 @@ def read(self, nrows=None):
 
         ret = self._engine.read(nrows)
 
-        if self.options.get('as_recarray'):
-            return ret
-
         # May alter columns / col_dict
         index, columns, col_dict = self._create_index(ret)
 
@@ -1279,7 +1261,6 @@ def __init__(self, kwds):
 
         self.true_values = kwds.get('true_values')
         self.false_values = kwds.get('false_values')
-        self.as_recarray = kwds.get('as_recarray', False)
         self.tupleize_cols = kwds.get('tupleize_cols', False)
         self.mangle_dupe_cols = kwds.get('mangle_dupe_cols', True)
         self.infer_datetime_format = kwds.pop('infer_datetime_format', False)
@@ -1295,9 +1276,6 @@ def __init__(self, kwds):
         if isinstance(self.header, (list, tuple, np.ndarray)):
             if not all(map(is_integer, self.header)):
                 raise ValueError("header must be integer or list of integers")
-            if kwds.get('as_recarray'):
-                raise ValueError("cannot specify as_recarray when "
-                                 "specifying a multi-index header")
             if kwds.get('usecols'):
                 raise ValueError("cannot specify usecols when "
                                  "specifying a multi-index header")
@@ -1900,10 +1878,6 @@ def read(self, nrows=None):
             # Done with first read, next time raise StopIteration
             self._first_chunk = False
 
-            if self.as_recarray:
-                # what to do if there are leading columns?
-                return data
-
             names = self.names
 
             if self._reader.leading_cols:
@@ -2306,9 +2280,6 @@ def read(self, rows=None):
         columns, data = self._do_date_conversions(columns, data)
 
         data = self._convert_data(data)
-        if self.as_recarray:
-            return self._to_recarray(data, columns)
-
         index, columns = self._make_index(data, alldata, columns, indexnamerow)
 
         return index, columns, data
@@ -2376,19 +2347,6 @@ def _clean_mapping(mapping):
                 clean_na_fvalues, self.verbose,
                 clean_conv, clean_dtypes)
 
-    def _to_recarray(self, data, columns):
-        dtypes = []
-        o = compat.OrderedDict()
-
-        # use the columns to "order" the keys
-        # in the unordered 'data' dictionary
-        for col in columns:
-            dtypes.append((str(col), data[col].dtype))
-            o[col] = data[col]
-
-        tuples = lzip(*o.values())
-        return np.array(tuples, dtypes)
-
     def _infer_columns(self):
         names = self.names
         num_original_columns = 0
diff --git a/pandas/tests/io/parser/c_parser_only.py b/pandas/tests/io/parser/c_parser_only.py
index 7a1fca55dd51e..e0422249289b7 100644
--- a/pandas/tests/io/parser/c_parser_only.py
+++ b/pandas/tests/io/parser/c_parser_only.py
@@ -18,7 +18,6 @@
 import pandas.util.testing as tm
 import pandas.util._test_decorators as td
 from pandas import DataFrame
-from pandas import compat
 from pandas.compat import StringIO, range, lrange
 
 
@@ -161,25 +160,6 @@ def error(val):
         assert sum(precise_errors) <= sum(normal_errors)
         assert max(precise_errors) <= max(normal_errors)
 
-    def test_pass_dtype_as_recarray(self):
-        if compat.is_platform_windows() and self.low_memory:
-            pytest.skip(
-                "segfaults on win-64, only when all tests are run")
-
-        data = """\
-one,two
-1,2.5
-2,3.5
-3,4.5
-4,5.5"""
-
-        with tm.assert_produces_warning(
-                FutureWarning, check_stacklevel=False):
-            result = self.read_csv(StringIO(data), dtype={
-                'one': 'u1', 1: 'S1'}, as_recarray=True)
-            assert result['one'].dtype == 'u1'
-            assert result['two'].dtype == 'S1'
-
     def test_usecols_dtypes(self):
         data = """\
 1,2,3
diff --git a/pandas/tests/io/parser/common.py b/pandas/tests/io/parser/common.py
index 6a996213b28bb..8a1f23d203a32 100644
--- a/pandas/tests/io/parser/common.py
+++ b/pandas/tests/io/parser/common.py
@@ -997,23 +997,6 @@ def test_empty_with_nrows_chunksize(self):
             StringIO('foo,bar\n'), chunksize=10)))
         tm.assert_frame_equal(result, expected)
 
-        with tm.assert_produces_warning(
-                FutureWarning, check_stacklevel=False):
-            result = self.read_csv(StringIO('foo,bar\n'),
-                                   nrows=10, as_recarray=True)
-            result = DataFrame(result[2], columns=result[1],
-                               index=result[0])
-        tm.assert_frame_equal(DataFrame.from_records(
-            result), expected, check_index_type=False)
-
-        with tm.assert_produces_warning(
-                FutureWarning, check_stacklevel=False):
-            result = next(iter(self.read_csv(StringIO('foo,bar\n'),
-                                             chunksize=10, as_recarray=True)))
-            result = DataFrame(result[2], columns=result[1], index=result[0])
-        tm.assert_frame_equal(DataFrame.from_records(result), expected,
-                              check_index_type=False)
-
     def test_eof_states(self):
         # see gh-10728, gh-10548
 
@@ -1431,93 +1414,6 @@ def test_compact_ints_use_unsigned(self):
                              use_unsigned=True)
         tm.assert_frame_equal(out, expected)
 
-    def test_compact_ints_as_recarray(self):
-        data = ('0,1,0,0\n'
-                '1,1,0,0\n'
-                '0,1,0,1')
-
-        with tm.assert_produces_warning(
-                FutureWarning, check_stacklevel=False):
-            result = self.read_csv(StringIO(data), delimiter=',', header=None,
-                                   compact_ints=True, as_recarray=True)
-            ex_dtype = np.dtype([(str(i), 'i1') for i in range(4)])
-            assert result.dtype == ex_dtype
-
-        with tm.assert_produces_warning(
-                FutureWarning, check_stacklevel=False):
-            result = self.read_csv(StringIO(data), delimiter=',', header=None,
-                                   as_recarray=True, compact_ints=True,
-                                   use_unsigned=True)
-            ex_dtype = np.dtype([(str(i), 'u1') for i in range(4)])
-            assert result.dtype == ex_dtype
-
-    def test_as_recarray(self):
-        # basic test
-        with tm.assert_produces_warning(
-                FutureWarning, check_stacklevel=False):
-            data = 'a,b\n1,a\n2,b'
-            expected = np.array([(1, 'a'), (2, 'b')],
-                                dtype=[('a', '=i8'), ('b', 'O')])
-            out = self.read_csv(StringIO(data), as_recarray=True)
-            tm.assert_numpy_array_equal(out, expected)
-
-        # index_col ignored
-        with tm.assert_produces_warning(
-                FutureWarning, check_stacklevel=False):
-            data = 'a,b\n1,a\n2,b'
-            expected = np.array([(1, 'a'), (2, 'b')],
-                                dtype=[('a', '=i8'), ('b', 'O')])
-            out = self.read_csv(StringIO(data), as_recarray=True, index_col=0)
-            tm.assert_numpy_array_equal(out, expected)
-
-        # respects names
-        with tm.assert_produces_warning(
-                FutureWarning, check_stacklevel=False):
-            data = '1,a\n2,b'
-            expected = np.array([(1, 'a'), (2, 'b')],
-                                dtype=[('a', '=i8'), ('b', 'O')])
-            out = self.read_csv(StringIO(data), names=['a', 'b'],
-                                header=None, as_recarray=True)
-            tm.assert_numpy_array_equal(out, expected)
-
-        # header order is respected even though it conflicts
-        # with the natural ordering of the column names
-        with tm.assert_produces_warning(
-                FutureWarning, check_stacklevel=False):
-            data = 'b,a\n1,a\n2,b'
-            expected = np.array([(1, 'a'), (2, 'b')],
-                                dtype=[('b', '=i8'), ('a', 'O')])
-            out = self.read_csv(StringIO(data), as_recarray=True)
-            tm.assert_numpy_array_equal(out, expected)
-
-        # overrides the squeeze parameter
-        with tm.assert_produces_warning(
-                FutureWarning, check_stacklevel=False):
-            data = 'a\n1'
-            expected = np.array([(1,)], dtype=[('a', '=i8')])
-            out = self.read_csv(StringIO(data), as_recarray=True, squeeze=True)
-            tm.assert_numpy_array_equal(out, expected)
-
-        # does data conversions before doing recarray conversion
-        with tm.assert_produces_warning(
-                FutureWarning, check_stacklevel=False):
-            data = 'a,b\n1,a\n2,b'
-            conv = lambda x: int(x) + 1
-            expected = np.array([(2, 'a'), (3, 'b')],
-                                dtype=[('a', '=i8'), ('b', 'O')])
-            out = self.read_csv(StringIO(data), as_recarray=True,
-                                converters={'a': conv})
-            tm.assert_numpy_array_equal(out, expected)
-
-        # filters by usecols before doing recarray conversion
-        with tm.assert_produces_warning(
-                FutureWarning, check_stacklevel=False):
-            data = 'a,b\n1,a\n2,b'
-            expected = np.array([(1,), (2,)], dtype=[('a', '=i8')])
-            out = self.read_csv(StringIO(data), as_recarray=True,
-                                usecols=['a'])
-            tm.assert_numpy_array_equal(out, expected)
-
     def test_memory_map(self):
         mmap_file = os.path.join(self.dirpath, 'test_mmap.csv')
         expected = DataFrame({
diff --git a/pandas/tests/io/parser/header.py b/pandas/tests/io/parser/header.py
index 58dae112c59b7..3fb0650348763 100644
--- a/pandas/tests/io/parser/header.py
+++ b/pandas/tests/io/parser/header.py
@@ -116,13 +116,6 @@ def test_header_multi_index(self):
 
         # INVALID OPTIONS
 
-        # no as_recarray
-        with tm.assert_produces_warning(
-                FutureWarning, check_stacklevel=False):
-            pytest.raises(ValueError, self.read_csv,
-                          StringIO(data), header=[0, 1, 2, 3],
-                          index_col=[0, 1], as_recarray=True)
-
         # names
         pytest.raises(ValueError, self.read_csv,
                       StringIO(data), header=[0, 1, 2, 3],
diff --git a/pandas/tests/io/parser/test_textreader.py b/pandas/tests/io/parser/test_textreader.py
index f66f9ccf065f7..ab4c14034cd20 100644
--- a/pandas/tests/io/parser/test_textreader.py
+++ b/pandas/tests/io/parser/test_textreader.py
@@ -194,33 +194,6 @@ def test_header_not_enough_lines(self):
              2: np.array([3, 6], dtype=np.int64)}
         assert_array_dicts_equal(recs, expected)
 
-        # not enough rows
-        pytest.raises(parser.ParserError, TextReader, StringIO(data),
-                      delimiter=',', header=5, as_recarray=True)
-
-    def test_header_not_enough_lines_as_recarray(self):
-        data = ('skip this\n'
-                'skip this\n'
-                'a,b,c\n'
-                '1,2,3\n'
-                '4,5,6')
-
-        reader = TextReader(StringIO(data), delimiter=',',
-                            header=2, as_recarray=True)
-        header = reader.header
-        expected = [['a', 'b', 'c']]
-        assert header == expected
-
-        recs = reader.read()
-        expected = {'a': np.array([1, 4], dtype=np.int64),
-                    'b': np.array([2, 5], dtype=np.int64),
-                    'c': np.array([3, 6], dtype=np.int64)}
-        assert_array_dicts_equal(expected, recs)
-
-        # not enough rows
-        pytest.raises(parser.ParserError, TextReader, StringIO(data),
-                      delimiter=',', header=5, as_recarray=True)
-
     def test_escapechar(self):
         data = ('\\"hello world\"\n'
                 '\\"hello world\"\n'
@@ -267,25 +240,6 @@ def _make_reader(**kwds):
         assert (result[0] == ex_values).all()
         assert result[1].dtype == 'S4'
 
-    def test_numpy_string_dtype_as_recarray(self):
-        data = """\
-a,1
-aa,2
-aaa,3
-aaaa,4
-aaaaa,5"""
-
-        def _make_reader(**kwds):
-            return TextReader(StringIO(data), delimiter=',', header=None,
-                              **kwds)
-
-        reader = _make_reader(dtype='S4', as_recarray=True)
-        result = reader.read()
-        assert result['0'].dtype == 'S4'
-        ex_values = np.array(['a', 'aa', 'aaa', 'aaaa', 'aaaa'], dtype='S4')
-        assert (result['0'] == ex_values).all()
-        assert result['1'].dtype == 'S4'
-
     def test_pass_dtype(self):
         data = """\
 one,two
diff --git a/pandas/tests/io/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py
index ab5d8a7595c96..b944322b1ed40 100644
--- a/pandas/tests/io/parser/test_unsupported.py
+++ b/pandas/tests/io/parser/test_unsupported.py
@@ -128,9 +128,7 @@ def read(self):
 
 class TestDeprecatedFeatures(object):
     @pytest.mark.parametrize("engine", ["c", "python"])
-    @pytest.mark.parametrize("kwargs", [{"as_recarray": True},
-                                        {"as_recarray": False},
-                                        {"buffer_lines": True},
+    @pytest.mark.parametrize("kwargs", [{"buffer_lines": True},
                                         {"buffer_lines": False},
                                         {"compact_ints": True},
                                         {"compact_ints": False},
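
For anyone migrating off ``as_recarray``, the replacement the deprecation
message points to is to parse normally and convert the result with
``DataFrame.to_records``. A minimal sketch of the migration; the sample data,
the ``io.StringIO`` buffer, and the ``index=False`` choice are illustrative
and not part of this patch:

    import numpy as np
    import pandas as pd
    from io import StringIO

    data = 'a,b\n1,a\n2,b'

    # Before (removed): pd.read_csv(StringIO(data), as_recarray=True)
    # After: parse to a DataFrame, then convert to a record array.
    # index=False mirrors the old behavior, where ``index_col`` was
    # ignored and no index field appeared in the result.
    recs = pd.read_csv(StringIO(data)).to_records(index=False)

    assert recs.dtype.names == ('a', 'b')
    assert recs['a'].dtype.kind == 'i'  # integer column survives the round trip

``to_records`` returns a NumPy recarray, so field- and dtype-based access
behaves like the old ``as_recarray=True`` output.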