Skip to content

Commit

Permalink
DEPR: error_bad_lines and warn_bad_lines for read_csv (pandas-dev#40413)
Browse files Browse the repository at this point in the history
  • Loading branch information
lithomas1 authored and JulianWgs committed Jul 3, 2021
1 parent ab37ac9 commit 0e51d09
Show file tree
Hide file tree
Showing 14 changed files with 241 additions and 70 deletions.
23 changes: 20 additions & 3 deletions doc/source/user_guide/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -344,16 +344,33 @@ dialect : str or :class:`python:csv.Dialect` instance, default ``None``
Error handling
++++++++++++++

error_bad_lines : boolean, default ``True``
error_bad_lines : boolean, default ``None``
Lines with too many fields (e.g. a csv line with too many commas) will by
default cause an exception to be raised, and no ``DataFrame`` will be
returned. If ``False``, then these "bad lines" will be dropped from the
``DataFrame`` that is returned. See :ref:`bad lines <io.bad_lines>`
below.
warn_bad_lines : boolean, default ``True``

.. deprecated:: 1.3
The ``on_bad_lines`` parameter should be used instead to specify behavior upon
encountering a bad line.
warn_bad_lines : boolean, default ``None``
If error_bad_lines is ``False``, and warn_bad_lines is ``True``, a warning for
each "bad line" will be output.

.. deprecated:: 1.3
The ``on_bad_lines`` parameter should be used instead to specify behavior upon
encountering a bad line.
on_bad_lines : {'error', 'warn', 'skip'}, default 'error'
Specifies what to do upon encountering a bad line (a line with too many fields).
Allowed values are:

- 'error', raise a ParserError when a bad line is encountered.
- 'warn', print a warning when a bad line is encountered and skip that line.
- 'skip', skip bad lines without raising or warning when they are encountered.

.. versionadded:: 1.3

.. _io.dtypes:

Specifying column data types
Expand Down Expand Up @@ -1245,7 +1262,7 @@ You can elect to skip bad lines:

.. code-block:: ipython
In [29]: pd.read_csv(StringIO(data), error_bad_lines=False)
In [29]: pd.read_csv(StringIO(data), on_bad_lines="warn")
Skipping line 3: expected 3 fields, saw 4
Out[29]:
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -669,6 +669,7 @@ Deprecations
- Deprecated casting ``datetime.date`` objects to ``datetime64`` when used as ``fill_value`` in :meth:`DataFrame.unstack`, :meth:`DataFrame.shift`, :meth:`Series.shift`, and :meth:`DataFrame.reindex`, pass ``pd.Timestamp(dateobj)`` instead (:issue:`39767`)
- Deprecated :meth:`.Styler.set_na_rep` and :meth:`.Styler.set_precision` in favour of :meth:`.Styler.format` with ``na_rep`` and ``precision`` as existing and new input arguments respectively (:issue:`40134`, :issue:`40425`)
- Deprecated allowing partial failure in :meth:`Series.transform` and :meth:`DataFrame.transform` when ``func`` is list-like or dict-like and raises anything but ``TypeError``; ``func`` raising anything but a ``TypeError`` will raise in a future version (:issue:`40211`)
- Deprecated arguments ``error_bad_lines`` and ``warn_bad_lines`` in :func:`read_csv` and :func:`read_table` in favor of argument ``on_bad_lines`` (:issue:`15122`)
- Deprecated support for ``np.ma.mrecords.MaskedRecords`` in the :class:`DataFrame` constructor, pass ``{name: data[name] for name in data.dtype.names}`` instead (:issue:`40363`)
- Deprecated using :func:`merge` or :func:`join` on a different number of levels (:issue:`34862`)
- Deprecated the use of ``**kwargs`` in :class:`.ExcelWriter`; use the keyword argument ``engine_kwargs`` instead (:issue:`40430`)
Expand Down
21 changes: 9 additions & 12 deletions pandas/_libs/parsers.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,11 @@ cdef extern from "parser/tokenizer.h":

enum: ERROR_OVERFLOW

ctypedef enum BadLineHandleMethod:
ERROR,
WARN,
SKIP

ctypedef void* (*io_callback)(void *src, size_t nbytes, size_t *bytes_read,
int *status, const char *encoding_errors)
ctypedef int (*io_cleanup)(void *src)
Expand Down Expand Up @@ -198,8 +203,7 @@ cdef extern from "parser/tokenizer.h":
int usecols

int expected_fields
int error_bad_lines
int warn_bad_lines
BadLineHandleMethod on_bad_lines

# floating point options
char decimal
Expand Down Expand Up @@ -351,8 +355,7 @@ cdef class TextReader:
thousands=None, # bytes | str
dtype=None,
usecols=None,
bint error_bad_lines=True,
bint warn_bad_lines=True,
on_bad_lines = ERROR,
bint na_filter=True,
na_values=None,
na_fvalues=None,
Expand Down Expand Up @@ -435,9 +438,7 @@ cdef class TextReader:
raise ValueError('Only length-1 comment characters supported')
self.parser.commentchar = ord(comment)

# error handling of bad lines
self.parser.error_bad_lines = int(error_bad_lines)
self.parser.warn_bad_lines = int(warn_bad_lines)
self.parser.on_bad_lines = on_bad_lines

self.skiprows = skiprows
if skiprows is not None:
Expand All @@ -454,8 +455,7 @@ cdef class TextReader:

# XXX
if skipfooter > 0:
self.parser.error_bad_lines = 0
self.parser.warn_bad_lines = 0
self.parser.on_bad_lines = SKIP

self.delimiter = delimiter

Expand Down Expand Up @@ -570,9 +570,6 @@ cdef class TextReader:
kh_destroy_str_starts(self.false_set)
self.false_set = NULL

def set_error_bad_lines(self, int status) -> None:
self.parser.error_bad_lines = status

def _set_quoting(self, quote_char: str | bytes | None, quoting: int):
if not isinstance(quoting, int):
raise TypeError('"quoting" must be an integer')
Expand Down
7 changes: 3 additions & 4 deletions pandas/_libs/src/parser/tokenizer.c
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,7 @@ void parser_set_default_options(parser_t *self) {
self->allow_embedded_newline = 1;

self->expected_fields = -1;
self->error_bad_lines = 0;
self->warn_bad_lines = 0;
self->on_bad_lines = ERROR;

self->commentchar = '#';
self->thousands = '\0';
Expand Down Expand Up @@ -457,7 +456,7 @@ static int end_line(parser_t *self) {
self->line_fields[self->lines] = 0;

// file_lines is now the actual file line number (starting at 1)
if (self->error_bad_lines) {
if (self->on_bad_lines == ERROR) {
self->error_msg = malloc(bufsize);
snprintf(self->error_msg, bufsize,
"Expected %d fields in line %" PRIu64 ", saw %" PRId64 "\n",
Expand All @@ -468,7 +467,7 @@ static int end_line(parser_t *self) {
return -1;
} else {
// simply skip bad lines
if (self->warn_bad_lines) {
if (self->on_bad_lines == WARN) {
// pass up error message
msg = malloc(bufsize);
snprintf(msg, bufsize,
Expand Down
9 changes: 7 additions & 2 deletions pandas/_libs/src/parser/tokenizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,12 @@ typedef enum {
QUOTE_NONE
} QuoteStyle;

// Policy applied by the tokenizer when a line has more fields than expected
// (a "bad line"); stored in parser_t.on_bad_lines.
typedef enum {
ERROR,  // record an error message and fail the parse (end_line returns -1)
WARN,   // skip the bad line and pass a message about it up to the caller
SKIP    // skip the bad line without producing any message
} BadLineHandleMethod;

typedef void *(*io_callback)(void *src, size_t nbytes, size_t *bytes_read,
int *status, const char *encoding_errors);
typedef int (*io_cleanup)(void *src);
Expand Down Expand Up @@ -136,8 +142,7 @@ typedef struct parser_t {
int usecols; // Boolean: 1: usecols provided, 0: none provided

int expected_fields;
int error_bad_lines;
int warn_bad_lines;
BadLineHandleMethod on_bad_lines;

// floating point options
char decimal;
Expand Down
13 changes: 12 additions & 1 deletion pandas/io/parsers/base_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from collections import defaultdict
import csv
import datetime
from enum import Enum
import itertools
from typing import (
Any,
Expand Down Expand Up @@ -108,10 +109,16 @@
"infer_datetime_format": False,
"skip_blank_lines": True,
"encoding_errors": "strict",
"on_bad_lines": "error",
}


class ParserBase:
class BadLineHandleMethod(Enum):
ERROR = 0
WARN = 1
SKIP = 2

_implicit_index: bool = False
_first_chunk: bool

Expand Down Expand Up @@ -203,9 +210,13 @@ def __init__(self, kwds):

self.handles: IOHandles | None = None

# Fall back to ERROR to pass a sketchy test (test_override_set_noconvert_columns)
# Normally, this arg would get pre-processed earlier on
self.on_bad_lines = kwds.get("on_bad_lines", self.BadLineHandleMethod.ERROR)

def _open_handles(self, src: FilePathOrBuffer, kwds: dict[str, Any]) -> None:
"""
Let the readers open IOHanldes after they are done with their potential raises.
Let the readers open IOHandles after they are done with their potential raises.
"""
self.handles = get_handle(
src,
Expand Down
16 changes: 12 additions & 4 deletions pandas/io/parsers/c_parser_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,18 @@ def __init__(self, src: FilePathOrBuffer, **kwds):
# open handles
self._open_handles(src, kwds)
assert self.handles is not None
for key in ("storage_options", "encoding", "memory_map", "compression"):

# Have to pass int, would break tests using TextReader directly otherwise :(
kwds["on_bad_lines"] = self.on_bad_lines.value

for key in (
"storage_options",
"encoding",
"memory_map",
"compression",
"error_bad_lines",
"warn_bad_lines",
):
kwds.pop(key, None)

kwds["dtype"] = ensure_dtype_objs(kwds.get("dtype", None))
Expand Down Expand Up @@ -206,9 +217,6 @@ def _set_noconvert_columns(self):
for col in noconvert_columns:
self._reader.set_noconvert(col)

def set_error_bad_lines(self, status):
self._reader.set_error_bad_lines(int(status))

def read(self, nrows=None):
try:
if self.low_memory:
Expand Down
26 changes: 15 additions & 11 deletions pandas/io/parsers/python_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,9 +74,6 @@ def __init__(self, f: Union[FilePathOrBuffer, list], **kwds):
self.quoting = kwds["quoting"]
self.skip_blank_lines = kwds["skip_blank_lines"]

self.warn_bad_lines = kwds["warn_bad_lines"]
self.error_bad_lines = kwds["error_bad_lines"]

self.names_passed = kwds["names"] or None

self.has_index_names = False
Expand Down Expand Up @@ -707,10 +704,11 @@ def _next_line(self):

def _alert_malformed(self, msg, row_num):
"""
Alert a user about a malformed row.
Alert a user about a malformed row, depending on value of
`self.on_bad_lines` enum.
If `self.error_bad_lines` is True, the alert will be `ParserError`.
If `self.warn_bad_lines` is True, the alert will be printed out.
If `self.on_bad_lines` is ERROR, the alert will be `ParserError`.
If `self.on_bad_lines` is WARN, the alert will be printed out.
Parameters
----------
Expand All @@ -719,9 +717,9 @@ def _alert_malformed(self, msg, row_num):
Because this row number is displayed, we 1-index,
even though we 0-index internally.
"""
if self.error_bad_lines:
if self.on_bad_lines == self.BadLineHandleMethod.ERROR:
raise ParserError(msg)
elif self.warn_bad_lines:
elif self.on_bad_lines == self.BadLineHandleMethod.WARN:
base = f"Skipping line {row_num}: "
sys.stderr.write(base + msg + "\n")

Expand All @@ -742,7 +740,10 @@ def _next_iter_line(self, row_num):
assert self.data is not None
return next(self.data)
except csv.Error as e:
if self.warn_bad_lines or self.error_bad_lines:
if (
self.on_bad_lines == self.BadLineHandleMethod.ERROR
or self.on_bad_lines == self.BadLineHandleMethod.WARN
):
msg = str(e)

if "NULL byte" in msg or "line contains NUL" in msg:
Expand Down Expand Up @@ -947,11 +948,14 @@ def _rows_to_cols(self, content):
actual_len = len(l)

if actual_len > col_len:
if self.error_bad_lines or self.warn_bad_lines:
if (
self.on_bad_lines == self.BadLineHandleMethod.ERROR
or self.on_bad_lines == self.BadLineHandleMethod.WARN
):
row_num = self.pos - (content_len - i + footers)
bad_lines.append((row_num, actual_len))

if self.error_bad_lines:
if self.on_bad_lines == self.BadLineHandleMethod.ERROR:
break
else:
content.append(l)
Expand Down
Loading

0 comments on commit 0e51d09

Please sign in to comment.