CLN: Drop the as_recarray parameter in read_csv #18804

Merged
merged 5 commits on Dec 18, 2017
9 changes: 0 additions & 9 deletions doc/source/io.rst
@@ -143,15 +143,6 @@ usecols : array-like or callable, default ``None``
pd.read_csv(StringIO(data), usecols=lambda x: x.upper() in ['COL1', 'COL3'])

Using this parameter results in much faster parsing time and lower memory usage.
as_recarray : boolean, default ``False``
.. deprecated:: 0.18.2

Please call ``pd.read_csv(...).to_records()`` instead.

Return a NumPy recarray instead of a DataFrame after parsing the data. If
set to ``True``, this option takes precedence over the ``squeeze`` parameter.
In addition, as row indices are not available in such a format, the ``index_col``
parameter will be ignored.
squeeze : boolean, default ``False``
If the parsed data only contains one column then return a Series.
prefix : str, default ``None``
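For anyone migrating, the removed doc text already names the replacement; a minimal sketch with made-up sample data:

```python
from io import StringIO

import pandas as pd

data = "a,b\n1,2.5\n3,4.5"

# Before: pd.read_csv(StringIO(data), as_recarray=True)
# After:  parse to a DataFrame first, then convert to a record array.
records = pd.read_csv(StringIO(data)).to_records(index=False)

print(records.dtype.names)  # ('a', 'b')
print(records[0])           # (1, 2.5)
```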
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.22.0.txt
@@ -221,6 +221,7 @@ Removal of prior version deprecations/changes
and Series (deprecated since v0.18). Instead, resample before calling the methods. (:issue:`18601` & :issue:`18668`)
- ``DatetimeIndex.to_datetime``, ``Timestamp.to_datetime``, ``PeriodIndex.to_datetime``, and ``Index.to_datetime`` have been removed (:issue:`8254`, :issue:`14096`, :issue:`14113`)
- :func:`read_csv` has dropped the ``skip_footer`` parameter (:issue:`13386`)
- :func:`read_csv` has dropped the ``as_recarray`` parameter (:issue:`13373`)

.. _whatsnew_0220.performance:

88 changes: 2 additions & 86 deletions pandas/_libs/parsers.pyx
@@ -91,7 +91,6 @@ except NameError:
basestring = str

cdef extern from "src/numpy_helper.h":
object sarr_from_data(cnp.dtype, int length, void* data)
void transfer_object_column(char *dst, char *src, size_t stride,
size_t length)

@@ -302,7 +301,6 @@ cdef class TextReader:
object delimiter, converters, delim_whitespace
object na_values
object memory_map
object as_recarray
object header, orig_header, names, header_start, header_end
object index_col
object low_memory
@@ -334,8 +332,6 @@

converters=None,

as_recarray=False,

skipinitialspace=False,
escapechar=None,
doublequote=True,
@@ -489,8 +485,6 @@
self.converters = converters

self.na_filter = na_filter
self.as_recarray = as_recarray

self.compact_ints = compact_ints
self.use_unsigned = use_unsigned

@@ -903,14 +897,7 @@
# Don't care about memory usage
columns = self._read_rows(rows, 1)

if self.as_recarray:
self._start_clock()
result = _to_structured_array(columns, self.header, self.usecols)
self._end_clock('Conversion to structured array')

return result
else:
return columns
return columns

cdef _read_low_memory(self, rows):
cdef:
@@ -999,7 +986,7 @@
self._start_clock()
columns = self._convert_column_data(rows=rows,
footer=footer,
upcast_na=not self.as_recarray)
upcast_na=True)
self._end_clock('Type conversion')

self._start_clock()
@@ -2321,77 +2308,6 @@ cdef _apply_converter(object f, parser_t *parser, int64_t col,
return lib.maybe_convert_objects(result)


def _to_structured_array(dict columns, object names, object usecols):
cdef:
ndarray recs, column
cnp.dtype dt
dict fields

object name, fnames, field_type
Py_ssize_t i, offset, nfields, length
int64_t stride, elsize
char *buf

if names is None:
names = ['%d' % i for i in range(len(columns))]
else:
# single line header
names = names[0]

if usecols is not None:
names = [n for i, n in enumerate(names)
if i in usecols or n in usecols]

dt = np.dtype([(str(name), columns[i].dtype)
for i, name in enumerate(names)])
fnames = dt.names
fields = dt.fields

nfields = len(fields)

if PY3:
length = len(list(columns.values())[0])
else:
length = len(columns.values()[0])

stride = dt.itemsize

# We own the data.
buf = <char*> malloc(length * stride)

recs = sarr_from_data(dt, length, buf)
assert(recs.flags.owndata)

Contributor:

I believe you can remove

sarr_from_data
_fill_structured_columns

from the codebase as well.

Member Author:

Hmm...potentially.

Contributor:

We don't have flake8 on Cython that looks for unused things, FYI.

Member Author:

Yeah...that's tougher to check, as flake8 works on a file-by-file basis and has no holistic check on the code-base.

Member Author:

GitHub search seems to agree with your belief though.
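Since flake8 is not run over the Cython and C sources here, one way to confirm that ``sarr_from_data`` and ``_fill_structured_column`` really have no remaining callers is a plain text search over the tree. A rough sketch, assuming it is run from the repository root:

```python
import pathlib

def count_references(symbol, root="pandas"):
    """Count occurrences of `symbol` in the Python/Cython/C sources under `root`."""
    hits = 0
    for path in pathlib.Path(root).rglob("*"):
        if path.is_file() and path.suffix in {".py", ".pyx", ".pxd", ".h", ".c"}:
            hits += path.read_text(errors="ignore").count(symbol)
    return hits

# A nonzero count may still just be the definition itself, so inspect the hits.
for name in ("sarr_from_data", "_fill_structured_column"):
    print(name, count_references(name))
```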

for i in range(nfields):
# XXX
field_type = fields[fnames[i]]

# (dtype, stride) tuple
offset = field_type[1]
elsize = field_type[0].itemsize
column = columns[i]

_fill_structured_column(buf + offset, <char*> column.data,
elsize, stride, length,
field_type[0] == np.object_)

return recs


cdef _fill_structured_column(char *dst, char* src, int64_t elsize,
int64_t stride, int64_t length, bint incref):
cdef:
int64_t i

if incref:
transfer_object_column(dst, src, stride, length)
else:
for i in range(length):
memcpy(dst, src, elsize)
dst += stride
src += elsize


def _maybe_encode(values):
if values is None:
return []
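The deleted ``_to_structured_array`` path built a composite dtype from the per-column dtypes and copied each column into a malloc'd buffer; plain NumPy (or ``DataFrame.to_records``) covers the same conversion. A rough pure-NumPy equivalent with made-up column data:

```python
import numpy as np

columns = {0: np.array([1, 2, 3], dtype="i8"),
           1: np.array([1.5, 2.5, 3.5], dtype="f8")}
names = ["a", "b"]

# Compose the structured dtype from the per-column dtypes, then fill it
# field by field instead of memcpy'ing into a manually allocated buffer.
dt = np.dtype([(name, columns[i].dtype) for i, name in enumerate(names)])
recs = np.empty(len(columns[0]), dtype=dt)
for i, name in enumerate(names):
    recs[name] = columns[i]

print(recs)  # [(1, 1.5) (2, 2.5) (3, 3.5)]
```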
14 changes: 0 additions & 14 deletions pandas/_libs/src/numpy_helper.h
@@ -75,19 +75,6 @@ PANDAS_INLINE PyObject* char_to_string(char* data) {
#endif
}

PyObject* sarr_from_data(PyArray_Descr* descr, int length, void* data) {
PyArrayObject* result;
npy_intp dims[1] = {length};
Py_INCREF(descr); // newfromdescr steals a reference to descr
result = (PyArrayObject*)PyArray_NewFromDescr(&PyArray_Type, descr, 1, dims,
NULL, data, 0, NULL);

// Returned array doesn't own data by default
result->flags |= NPY_OWNDATA;

return (PyObject*)result;
}

void transfer_object_column(char* dst, char* src, size_t stride,
size_t length) {
size_t i;
@@ -105,7 +92,6 @@ void transfer_object_column(char* dst, char* src, size_t stride,
}
}


void set_array_not_contiguous(PyArrayObject* ao) {
ao->flags &= ~(NPY_C_CONTIGUOUS | NPY_F_CONTIGUOUS);
}
44 changes: 1 addition & 43 deletions pandas/io/parsers.py
@@ -108,14 +108,6 @@
example of a valid callable argument would be ``lambda x: x.upper() in
['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster
parsing time and lower memory usage.
as_recarray : boolean, default False
.. deprecated:: 0.19.0
Please call `pd.read_csv(...).to_records()` instead.

Return a NumPy recarray instead of a DataFrame after parsing the data.
If set to True, this option takes precedence over the `squeeze` parameter.
In addition, as row indices are not available in such a format, the
`index_col` parameter will be ignored.
squeeze : boolean, default False
If the parsed data only contains one column then return a Series
prefix : str, default None
@@ -506,7 +498,6 @@ def _read(filepath_or_buffer, kwds):

_c_parser_defaults = {
'delim_whitespace': False,
'as_recarray': False,
'na_filter': True,
'compact_ints': False,
'use_unsigned': False,
@@ -532,14 +523,12 @@ def _read(filepath_or_buffer, kwds):
}

_deprecated_defaults = {
'as_recarray': None,
'buffer_lines': None,
'compact_ints': None,
'use_unsigned': None,
'tupleize_cols': None
}
_deprecated_args = {
'as_recarray',
'buffer_lines',
'compact_ints',
'use_unsigned',
@@ -614,7 +603,6 @@ def parser_f(filepath_or_buffer,
# Internal
doublequote=True,
delim_whitespace=False,
as_recarray=None,
compact_ints=None,
use_unsigned=None,
low_memory=_c_parser_defaults['low_memory'],
@@ -685,7 +673,6 @@ def parser_f(filepath_or_buffer,
compact_ints=compact_ints,
use_unsigned=use_unsigned,
delim_whitespace=delim_whitespace,
as_recarray=as_recarray,
warn_bad_lines=warn_bad_lines,
error_bad_lines=error_bad_lines,
low_memory=low_memory,
@@ -971,9 +958,7 @@ def _clean_options(self, options, engine):
"and will be removed in a future version."
.format(arg=arg))

if arg == 'as_recarray':
msg += ' Please call pd.to_csv(...).to_records() instead.'
elif arg == 'tupleize_cols':
if arg == 'tupleize_cols':
msg += (' Column tuples will then '
'always be converted to MultiIndex.')

@@ -1059,9 +1044,6 @@ def read(self, nrows=None):

ret = self._engine.read(nrows)

if self.options.get('as_recarray'):
return ret

# May alter columns / col_dict
index, columns, col_dict = self._create_index(ret)

@@ -1279,7 +1261,6 @@ def __init__(self, kwds):

self.true_values = kwds.get('true_values')
self.false_values = kwds.get('false_values')
self.as_recarray = kwds.get('as_recarray', False)
self.tupleize_cols = kwds.get('tupleize_cols', False)
self.mangle_dupe_cols = kwds.get('mangle_dupe_cols', True)
self.infer_datetime_format = kwds.pop('infer_datetime_format', False)
@@ -1295,9 +1276,6 @@ def __init__(self, kwds):
if isinstance(self.header, (list, tuple, np.ndarray)):
if not all(map(is_integer, self.header)):
raise ValueError("header must be integer or list of integers")
if kwds.get('as_recarray'):
raise ValueError("cannot specify as_recarray when "
"specifying a multi-index header")
if kwds.get('usecols'):
raise ValueError("cannot specify usecols when "
"specifying a multi-index header")
@@ -1900,10 +1878,6 @@ def read(self, nrows=None):
# Done with first read, next time raise StopIteration
self._first_chunk = False

if self.as_recarray:
# what to do if there are leading columns?
return data

names = self.names

if self._reader.leading_cols:
@@ -2306,9 +2280,6 @@ def read(self, rows=None):
columns, data = self._do_date_conversions(columns, data)

data = self._convert_data(data)
if self.as_recarray:
return self._to_recarray(data, columns)

index, columns = self._make_index(data, alldata, columns, indexnamerow)

return index, columns, data
@@ -2376,19 +2347,6 @@ def _clean_mapping(mapping):
clean_na_fvalues, self.verbose,
clean_conv, clean_dtypes)

def _to_recarray(self, data, columns):
dtypes = []
o = compat.OrderedDict()

# use the columns to "order" the keys
# in the unordered 'data' dictionary
for col in columns:
dtypes.append((str(col), data[col].dtype))
o[col] = data[col]

tuples = lzip(*o.values())
return np.array(tuples, dtypes)

def _infer_columns(self):
names = self.names
num_original_columns = 0
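The pure-Python engine's removed ``_to_recarray`` took the row-tuple route instead: order the column dict, zip the columns into rows, and hand NumPy the tuples plus a compound dtype. A small sketch with made-up data:

```python
from collections import OrderedDict

import numpy as np

data = {"b": np.array([1.5, 2.5]), "a": np.array([1, 2])}
columns = ["a", "b"]  # the parsed column order

# Use `columns` to order the keys of the unordered `data` dict,
# then zip the column arrays into row tuples.
ordered = OrderedDict((col, data[col]) for col in columns)
dtypes = [(str(col), data[col].dtype) for col in columns]
recarray = np.array(list(zip(*ordered.values())), dtype=dtypes)

print(recarray["a"])  # [1 2]
print(recarray["b"])  # [1.5 2.5]
```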
20 changes: 0 additions & 20 deletions pandas/tests/io/parser/c_parser_only.py
@@ -18,7 +18,6 @@
import pandas.util.testing as tm
import pandas.util._test_decorators as td
from pandas import DataFrame
from pandas import compat
from pandas.compat import StringIO, range, lrange


@@ -161,25 +160,6 @@ def error(val):
assert sum(precise_errors) <= sum(normal_errors)
assert max(precise_errors) <= max(normal_errors)

def test_pass_dtype_as_recarray(self):
if compat.is_platform_windows() and self.low_memory:
pytest.skip(
"segfaults on win-64, only when all tests are run")

data = """\
one,two
1,2.5
2,3.5
3,4.5
4,5.5"""

with tm.assert_produces_warning(
FutureWarning, check_stacklevel=False):
result = self.read_csv(StringIO(data), dtype={
'one': 'u1', 1: 'S1'}, as_recarray=True)
assert result['one'].dtype == 'u1'
assert result['two'].dtype == 'S1'

def test_usecols_dtypes(self):
data = """\
1,2,3
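The deleted ``test_pass_dtype_as_recarray`` exercised ``dtype`` together with ``as_recarray``; comparable coverage through the public API might look like the sketch below, simplified to numeric dtypes so the assertions stay unambiguous (not necessarily how the suite was actually reorganized):

```python
from io import StringIO

import pandas as pd

data = """\
one,two
1,2.5
2,3.5
3,4.5
4,5.5"""

# Parse with an explicit dtype, then convert, rather than asking the
# parser for a recarray directly.
records = pd.read_csv(StringIO(data), dtype={"one": "u1"}).to_records(index=False)

assert records["one"].dtype == "u1"
assert records["two"].dtype == "f8"
```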