CLN: Drop the as_recarray parameter in read_csv #18804

Merged
merged 5 commits on Dec 18, 2017
9 changes: 0 additions & 9 deletions doc/source/io.rst
@@ -143,15 +143,6 @@ usecols : array-like or callable, default ``None``
pd.read_csv(StringIO(data), usecols=lambda x: x.upper() in ['COL1', 'COL3'])

Using this parameter results in much faster parsing time and lower memory usage.
as_recarray : boolean, default ``False``
.. deprecated:: 0.18.2

Please call ``pd.read_csv(...).to_records()`` instead.

Return a NumPy recarray instead of a DataFrame after parsing the data. If
set to ``True``, this option takes precedence over the ``squeeze`` parameter.
In addition, as row indices are not available in such a format, the ``index_col``
parameter will be ignored.
squeeze : boolean, default ``False``
If the parsed data only contains one column then return a Series.
prefix : str, default ``None``
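For anyone migrating, the removed doc text already names the replacement; a minimal sketch with made-up sample data:

```python
from io import StringIO

import pandas as pd

data = "a,b\n1,2.5\n3,4.5"

# Before: pd.read_csv(StringIO(data), as_recarray=True)
# After:  parse to a DataFrame first, then convert to a record array.
records = pd.read_csv(StringIO(data)).to_records(index=False)

print(records.dtype.names)  # ('a', 'b')
print(records[0])           # (1, 2.5)
```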
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.22.0.txt
@@ -221,6 +221,7 @@ Removal of prior version deprecations/changes
and Series (deprecated since v0.18). Instead, resample before calling the methods. (:issue:`18601` & :issue:`18668`)
- ``DatetimeIndex.to_datetime``, ``Timestamp.to_datetime``, ``PeriodIndex.to_datetime``, and ``Index.to_datetime`` have been removed (:issue:`8254`, :issue:`14096`, :issue:`14113`)
- :func:`read_csv` has dropped the ``skip_footer`` parameter (:issue:`13386`)
- :func:`read_csv` has dropped the ``as_recarray`` parameter (:issue:`13373`)

.. _whatsnew_0220.performance:

88 changes: 2 additions & 86 deletions pandas/_libs/parsers.pyx
@@ -91,7 +91,6 @@ except NameError:
basestring = str

cdef extern from "src/numpy_helper.h":
object sarr_from_data(cnp.dtype, int length, void* data)
void transfer_object_column(char *dst, char *src, size_t stride,
size_t length)

@@ -302,7 +301,6 @@ cdef class TextReader:
object delimiter, converters, delim_whitespace
object na_values
object memory_map
object as_recarray
object header, orig_header, names, header_start, header_end
object index_col
object low_memory
@@ -334,8 +332,6 @@

converters=None,

as_recarray=False,

skipinitialspace=False,
escapechar=None,
doublequote=True,
@@ -489,8 +485,6 @@
self.converters = converters

self.na_filter = na_filter
self.as_recarray = as_recarray

self.compact_ints = compact_ints
self.use_unsigned = use_unsigned

@@ -903,14 +897,7 @@
# Don't care about memory usage
columns = self._read_rows(rows, 1)

if self.as_recarray:
self._start_clock()
result = _to_structured_array(columns, self.header, self.usecols)
self._end_clock('Conversion to structured array')

return result
else:
return columns
return columns

cdef _read_low_memory(self, rows):
cdef:
@@ -999,7 +986,7 @@
self._start_clock()
columns = self._convert_column_data(rows=rows,
footer=footer,
upcast_na=not self.as_recarray)
upcast_na=True)
self._end_clock('Type conversion')

self._start_clock()
@@ -2321,77 +2308,6 @@ cdef _apply_converter(object f, parser_t *parser, int64_t col,
return lib.maybe_convert_objects(result)


def _to_structured_array(dict columns, object names, object usecols):
cdef:
ndarray recs, column
cnp.dtype dt
dict fields

object name, fnames, field_type
Py_ssize_t i, offset, nfields, length
int64_t stride, elsize
char *buf

if names is None:
names = ['%d' % i for i in range(len(columns))]
else:
# single line header
names = names[0]

if usecols is not None:
names = [n for i, n in enumerate(names)
if i in usecols or n in usecols]

dt = np.dtype([(str(name), columns[i].dtype)
for i, name in enumerate(names)])
fnames = dt.names
fields = dt.fields

nfields = len(fields)

if PY3:
length = len(list(columns.values())[0])
else:
length = len(columns.values()[0])

stride = dt.itemsize

# We own the data.
buf = <char*> malloc(length * stride)

recs = sarr_from_data(dt, length, buf)
assert(recs.flags.owndata)

Contributor:

I believe you can remove

sarr_from_data
_fill_structured_columns

from the codebase as well.

Member Author:

Hmm...potentially.

Contributor:

We don't have flake8 on Cython that looks for unused things, FYI.

Member Author:

Yeah...that's tougher to check, as flake8 works on a file-by-file basis and has no holistic check on the code-base.

Member Author:

GitHub search seems to agree with your belief though.
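Since flake8 is not run over the Cython and C sources here, one way to confirm that ``sarr_from_data`` and ``_fill_structured_column`` really have no remaining callers is a plain text search over the tree. A rough sketch, assuming it is run from the repository root:

```python
import pathlib

def count_references(symbol, root="pandas"):
    """Count occurrences of `symbol` in the Python/Cython/C sources under `root`."""
    hits = 0
    for path in pathlib.Path(root).rglob("*"):
        if path.is_file() and path.suffix in {".py", ".pyx", ".pxd", ".h", ".c"}:
            hits += path.read_text(errors="ignore").count(symbol)
    return hits

# A nonzero count may still just be the definition itself, so inspect the hits.
for name in ("sarr_from_data", "_fill_structured_column"):
    print(name, count_references(name))
```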

for i in range(nfields):
# XXX
field_type = fields[fnames[i]]

# (dtype, stride) tuple
offset = field_type[1]
elsize = field_type[0].itemsize
column = columns[i]

_fill_structured_column(buf + offset, <char*> column.data,
elsize, stride, length,
field_type[0] == np.object_)

return recs


cdef _fill_structured_column(char *dst, char* src, int64_t elsize,
int64_t stride, int64_t length, bint incref):
cdef:
int64_t i

if incref:
transfer_object_column(dst, src, stride, length)
else:
for i in range(length):
memcpy(dst, src, elsize)
dst += stride
src += elsize


def _maybe_encode(values):
if values is None:
return []
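The deleted ``_to_structured_array`` path built a composite dtype from the per-column dtypes and copied each column into a malloc'd buffer; plain NumPy (or ``DataFrame.to_records``) covers the same conversion. A rough pure-NumPy equivalent with made-up column data:

```python
import numpy as np

columns = {0: np.array([1, 2, 3], dtype="i8"),
           1: np.array([1.5, 2.5, 3.5], dtype="f8")}
names = ["a", "b"]

# Compose the structured dtype from the per-column dtypes, then fill it
# field by field instead of memcpy'ing into a manually allocated buffer.
dt = np.dtype([(name, columns[i].dtype) for i, name in enumerate(names)])
recs = np.empty(len(columns[0]), dtype=dt)
for i, name in enumerate(names):
    recs[name] = columns[i]

print(recs)  # [(1, 1.5) (2, 2.5) (3, 3.5)]
```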
14 changes: 0 additions & 14 deletions pandas/_libs/src/numpy_helper.h
@@ -75,19 +75,6 @@ PANDAS_INLINE PyObject* char_to_string(char* data) {
#endif
}

PyObject* sarr_from_data(PyArray_Descr* descr, int length, void* data) {
PyArrayObject* result;
npy_intp dims[1] = {length};
Py_INCREF(descr); // newfromdescr steals a reference to descr
result = (PyArrayObject*)PyArray_NewFromDescr(&PyArray_Type, descr, 1, dims,
NULL, data, 0, NULL);

// Returned array doesn't own data by default
result->flags |= NPY_OWNDATA;

return (PyObject*)result;
}

void transfer_object_column(char* dst, char* src, size_t stride,
size_t length) {
size_t i;
@@ -105,7 +92,6 @@ void transfer_object_column(char* dst, char* src, size_t stride,
}
}


void set_array_not_contiguous(PyArrayObject* ao) {
ao->flags &= ~(NPY_C_CONTIGUOUS | NPY_F_CONTIGUOUS);
}
44 changes: 1 addition & 43 deletions pandas/io/parsers.py
@@ -108,14 +108,6 @@
example of a valid callable argument would be ``lambda x: x.upper() in
['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster
parsing time and lower memory usage.
as_recarray : boolean, default False
.. deprecated:: 0.19.0
Please call `pd.read_csv(...).to_records()` instead.

Return a NumPy recarray instead of a DataFrame after parsing the data.
If set to True, this option takes precedence over the `squeeze` parameter.
In addition, as row indices are not available in such a format, the
`index_col` parameter will be ignored.
squeeze : boolean, default False
If the parsed data only contains one column then return a Series
prefix : str, default None
@@ -506,7 +498,6 @@ def _read(filepath_or_buffer, kwds):

_c_parser_defaults = {
'delim_whitespace': False,
'as_recarray': False,
'na_filter': True,
'compact_ints': False,
'use_unsigned': False,
@@ -532,14 +523,12 @@ def _read(filepath_or_buffer, kwds):
}

_deprecated_defaults = {
'as_recarray': None,
'buffer_lines': None,
'compact_ints': None,
'use_unsigned': None,
'tupleize_cols': None
}
_deprecated_args = {
'as_recarray',
'buffer_lines',
'compact_ints',
'use_unsigned',
@@ -614,7 +603,6 @@ def parser_f(filepath_or_buffer,
# Internal
doublequote=True,
delim_whitespace=False,
as_recarray=None,
compact_ints=None,
use_unsigned=None,
low_memory=_c_parser_defaults['low_memory'],
@@ -685,7 +673,6 @@ def parser_f(filepath_or_buffer,
compact_ints=compact_ints,
use_unsigned=use_unsigned,
delim_whitespace=delim_whitespace,
as_recarray=as_recarray,
warn_bad_lines=warn_bad_lines,
error_bad_lines=error_bad_lines,
low_memory=low_memory,
@@ -971,9 +958,7 @@ def _clean_options(self, options, engine):
"and will be removed in a future version."
.format(arg=arg))

if arg == 'as_recarray':
msg += ' Please call pd.to_csv(...).to_records() instead.'
elif arg == 'tupleize_cols':
if arg == 'tupleize_cols':
msg += (' Column tuples will then '
'always be converted to MultiIndex.')

@@ -1059,9 +1044,6 @@ def read(self, nrows=None):

ret = self._engine.read(nrows)

if self.options.get('as_recarray'):
return ret

# May alter columns / col_dict
index, columns, col_dict = self._create_index(ret)

@@ -1279,7 +1261,6 @@ def __init__(self, kwds):

self.true_values = kwds.get('true_values')
self.false_values = kwds.get('false_values')
self.as_recarray = kwds.get('as_recarray', False)
self.tupleize_cols = kwds.get('tupleize_cols', False)
self.mangle_dupe_cols = kwds.get('mangle_dupe_cols', True)
self.infer_datetime_format = kwds.pop('infer_datetime_format', False)
@@ -1295,9 +1276,6 @@ def __init__(self, kwds):
if isinstance(self.header, (list, tuple, np.ndarray)):
if not all(map(is_integer, self.header)):
raise ValueError("header must be integer or list of integers")
if kwds.get('as_recarray'):
raise ValueError("cannot specify as_recarray when "
"specifying a multi-index header")
if kwds.get('usecols'):
raise ValueError("cannot specify usecols when "
"specifying a multi-index header")
@@ -1900,10 +1878,6 @@ def read(self, nrows=None):
# Done with first read, next time raise StopIteration
self._first_chunk = False

if self.as_recarray:
# what to do if there are leading columns?
return data

names = self.names

if self._reader.leading_cols:
@@ -2306,9 +2280,6 @@ def read(self, rows=None):
columns, data = self._do_date_conversions(columns, data)

data = self._convert_data(data)
if self.as_recarray:
return self._to_recarray(data, columns)

index, columns = self._make_index(data, alldata, columns, indexnamerow)

return index, columns, data
@@ -2376,19 +2347,6 @@ def _clean_mapping(mapping):
clean_na_fvalues, self.verbose,
clean_conv, clean_dtypes)

def _to_recarray(self, data, columns):
dtypes = []
o = compat.OrderedDict()

# use the columns to "order" the keys
# in the unordered 'data' dictionary
for col in columns:
dtypes.append((str(col), data[col].dtype))
o[col] = data[col]

tuples = lzip(*o.values())
return np.array(tuples, dtypes)

def _infer_columns(self):
names = self.names
num_original_columns = 0
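The pure-Python engine's removed ``_to_recarray`` took the row-tuple route instead: order the column dict, zip the columns into rows, and hand NumPy the tuples plus a compound dtype. A small sketch with made-up data:

```python
from collections import OrderedDict

import numpy as np

data = {"b": np.array([1.5, 2.5]), "a": np.array([1, 2])}
columns = ["a", "b"]  # the parsed column order

# Use `columns` to order the keys of the unordered `data` dict,
# then zip the column arrays into row tuples.
ordered = OrderedDict((col, data[col]) for col in columns)
dtypes = [(str(col), data[col].dtype) for col in columns]
recarray = np.array(list(zip(*ordered.values())), dtype=dtypes)

print(recarray["a"])  # [1 2]
print(recarray["b"])  # [1.5 2.5]
```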
20 changes: 0 additions & 20 deletions pandas/tests/io/parser/c_parser_only.py
@@ -18,7 +18,6 @@
import pandas.util.testing as tm
import pandas.util._test_decorators as td
from pandas import DataFrame
from pandas import compat
from pandas.compat import StringIO, range, lrange


@@ -161,25 +160,6 @@ def error(val):
assert sum(precise_errors) <= sum(normal_errors)
assert max(precise_errors) <= max(normal_errors)

def test_pass_dtype_as_recarray(self):
if compat.is_platform_windows() and self.low_memory:
pytest.skip(
"segfaults on win-64, only when all tests are run")

data = """\
one,two
1,2.5
2,3.5
3,4.5
4,5.5"""

with tm.assert_produces_warning(
FutureWarning, check_stacklevel=False):
result = self.read_csv(StringIO(data), dtype={
'one': 'u1', 1: 'S1'}, as_recarray=True)
assert result['one'].dtype == 'u1'
assert result['two'].dtype == 'S1'

def test_usecols_dtypes(self):
data = """\
1,2,3
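The deleted ``test_pass_dtype_as_recarray`` exercised ``dtype`` together with ``as_recarray``; comparable coverage through the public API might look like the sketch below, simplified to numeric dtypes so the assertions stay unambiguous (not necessarily how the suite was actually reorganized):

```python
from io import StringIO

import pandas as pd

data = """\
one,two
1,2.5
2,3.5
3,4.5
4,5.5"""

# Parse with an explicit dtype, then convert, rather than asking the
# parser for a recarray directly.
records = pd.read_csv(StringIO(data), dtype={"one": "u1"}).to_records(index=False)

assert records["one"].dtype == "u1"
assert records["two"].dtype == "f8"
```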