diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 8ca23c68657a1..b4e35d1f22840 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -344,16 +344,33 @@ dialect : str or :class:`python:csv.Dialect` instance, default ``None`` Error handling ++++++++++++++ -error_bad_lines : boolean, default ``True`` +error_bad_lines : boolean, default ``None`` Lines with too many fields (e.g. a csv line with too many commas) will by default cause an exception to be raised, and no ``DataFrame`` will be returned. If ``False``, then these "bad lines" will dropped from the ``DataFrame`` that is returned. See :ref:`bad lines <io.bad_lines>` below. -warn_bad_lines : boolean, default ``True`` + + .. deprecated:: 1.3 + The ``on_bad_lines`` parameter should be used instead to specify behavior upon + encountering a bad line. +warn_bad_lines : boolean, default ``None`` If error_bad_lines is ``False``, and warn_bad_lines is ``True``, a warning for each "bad line" will be output. + .. deprecated:: 1.3 + The ``on_bad_lines`` parameter should be used instead to specify behavior upon + encountering a bad line. +on_bad_lines : {'error', 'warn', 'skip'}, default 'error' + Specifies what to do upon encountering a bad line (a line with too many fields). + Allowed values are: + + - 'error', raise a ParserError when a bad line is encountered. + - 'warn', print a warning when a bad line is encountered and skip that line. + - 'skip', skip bad lines without raising or warning when they are encountered. + + .. versionadded:: 1.3 + .. _io.dtypes: Specifying column data types @@ -1245,7 +1262,7 @@ You can elect to skip bad lines: ..
code-block:: ipython - In [29]: pd.read_csv(StringIO(data), error_bad_lines=False) + In [29]: pd.read_csv(StringIO(data), on_bad_lines="warn") Skipping line 3: expected 3 fields, saw 4 Out[29]: diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 7d31dd9545f2d..c999bc2c79b55 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -669,6 +669,7 @@ Deprecations - Deprecated casting ``datetime.date`` objects to ``datetime64`` when used as ``fill_value`` in :meth:`DataFrame.unstack`, :meth:`DataFrame.shift`, :meth:`Series.shift`, and :meth:`DataFrame.reindex`, pass ``pd.Timestamp(dateobj)`` instead (:issue:`39767`) - Deprecated :meth:`.Styler.set_na_rep` and :meth:`.Styler.set_precision` in favour of :meth:`.Styler.format` with ``na_rep`` and ``precision`` as existing and new input arguments respectively (:issue:`40134`, :issue:`40425`) - Deprecated allowing partial failure in :meth:`Series.transform` and :meth:`DataFrame.transform` when ``func`` is list-like or dict-like and raises anything but ``TypeError``; ``func`` raising anything but a ``TypeError`` will raise in a future version (:issue:`40211`) +- Deprecated arguments ``error_bad_lines`` and ``warn_bad_lines`` in :func:`read_csv` and :func:`read_table` in favor of argument ``on_bad_lines`` (:issue:`15122`) - Deprecated support for ``np.ma.mrecords.MaskedRecords`` in the :class:`DataFrame` constructor, pass ``{name: data[name] for name in data.dtype.names}`` instead (:issue:`40363`) - Deprecated using :func:`merge` or :func:`join` on a different number of levels (:issue:`34862`) - Deprecated the use of ``**kwargs`` in :class:`.ExcelWriter`; use the keyword argument ``engine_kwargs`` instead (:issue:`40430`) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index b2d548e04eab4..7d7074988e5f0 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -146,6 +146,11 @@ cdef extern from "parser/tokenizer.h": enum:
ERROR_OVERFLOW + ctypedef enum BadLineHandleMethod: + ERROR, + WARN, + SKIP + ctypedef void* (*io_callback)(void *src, size_t nbytes, size_t *bytes_read, int *status, const char *encoding_errors) ctypedef int (*io_cleanup)(void *src) @@ -198,8 +203,7 @@ cdef extern from "parser/tokenizer.h": int usecols int expected_fields - int error_bad_lines - int warn_bad_lines + BadLineHandleMethod on_bad_lines # floating point options char decimal @@ -351,8 +355,7 @@ cdef class TextReader: thousands=None, # bytes | str dtype=None, usecols=None, - bint error_bad_lines=True, - bint warn_bad_lines=True, + on_bad_lines = ERROR, bint na_filter=True, na_values=None, na_fvalues=None, @@ -435,9 +438,7 @@ cdef class TextReader: raise ValueError('Only length-1 comment characters supported') self.parser.commentchar = ord(comment) - # error handling of bad lines - self.parser.error_bad_lines = int(error_bad_lines) - self.parser.warn_bad_lines = int(warn_bad_lines) + self.parser.on_bad_lines = on_bad_lines self.skiprows = skiprows if skiprows is not None: @@ -454,8 +455,7 @@ cdef class TextReader: # XXX if skipfooter > 0: - self.parser.error_bad_lines = 0 - self.parser.warn_bad_lines = 0 + self.parser.on_bad_lines = SKIP self.delimiter = delimiter @@ -570,9 +570,6 @@ cdef class TextReader: kh_destroy_str_starts(self.false_set) self.false_set = NULL - def set_error_bad_lines(self, int status) -> None: - self.parser.error_bad_lines = status - def _set_quoting(self, quote_char: str | bytes | None, quoting: int): if not isinstance(quoting, int): raise TypeError('"quoting" must be an integer') diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 49eb1e7855098..49797eea59ddc 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -93,8 +93,7 @@ void parser_set_default_options(parser_t *self) { self->allow_embedded_newline = 1; self->expected_fields = -1; - self->error_bad_lines = 0; - self->warn_bad_lines = 0; + 
self->on_bad_lines = ERROR; self->commentchar = '#'; self->thousands = '\0'; @@ -457,7 +456,7 @@ static int end_line(parser_t *self) { self->line_fields[self->lines] = 0; // file_lines is now the actual file line number (starting at 1) - if (self->error_bad_lines) { + if (self->on_bad_lines == ERROR) { self->error_msg = malloc(bufsize); snprintf(self->error_msg, bufsize, "Expected %d fields in line %" PRIu64 ", saw %" PRId64 "\n", @@ -468,7 +467,7 @@ static int end_line(parser_t *self) { return -1; } else { // simply skip bad lines - if (self->warn_bad_lines) { + if (self->on_bad_lines == WARN) { // pass up error message msg = malloc(bufsize); snprintf(msg, bufsize, diff --git a/pandas/_libs/src/parser/tokenizer.h b/pandas/_libs/src/parser/tokenizer.h index f69fee4993d34..623d3690f252a 100644 --- a/pandas/_libs/src/parser/tokenizer.h +++ b/pandas/_libs/src/parser/tokenizer.h @@ -84,6 +84,12 @@ typedef enum { QUOTE_NONE } QuoteStyle; +typedef enum { + ERROR, + WARN, + SKIP +} BadLineHandleMethod; + typedef void *(*io_callback)(void *src, size_t nbytes, size_t *bytes_read, int *status, const char *encoding_errors); typedef int (*io_cleanup)(void *src); @@ -136,8 +142,7 @@ typedef struct parser_t { int usecols; // Boolean: 1: usecols provided, 0: none provided int expected_fields; - int error_bad_lines; - int warn_bad_lines; + BadLineHandleMethod on_bad_lines; // floating point options char decimal; diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 02084f91d9966..2a86ff13a2edc 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -3,6 +3,7 @@ from collections import defaultdict import csv import datetime +from enum import Enum import itertools from typing import ( Any, @@ -108,10 +109,16 @@ "infer_datetime_format": False, "skip_blank_lines": True, "encoding_errors": "strict", + "on_bad_lines": "error", } class ParserBase: + class BadLineHandleMethod(Enum): + ERROR = 0 + WARN = 1 + SKIP = 2 + 
_implicit_index: bool = False _first_chunk: bool @@ -203,9 +210,13 @@ def __init__(self, kwds): self.handles: IOHandles | None = None + # Fallback to error to pass a sketchy test(test_override_set_noconvert_columns) + # Normally, this arg would get pre-processed earlier on + self.on_bad_lines = kwds.get("on_bad_lines", self.BadLineHandleMethod.ERROR) + def _open_handles(self, src: FilePathOrBuffer, kwds: dict[str, Any]) -> None: """ - Let the readers open IOHanldes after they are done with their potential raises. + Let the readers open IOHandles after they are done with their potential raises. """ self.handles = get_handle( src, diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index 7a0e704d2fbc4..5c1f8f94a72da 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -50,7 +50,18 @@ def __init__(self, src: FilePathOrBuffer, **kwds): # open handles self._open_handles(src, kwds) assert self.handles is not None - for key in ("storage_options", "encoding", "memory_map", "compression"): + + # Have to pass int, would break tests using TextReader directly otherwise :( + kwds["on_bad_lines"] = self.on_bad_lines.value + + for key in ( + "storage_options", + "encoding", + "memory_map", + "compression", + "error_bad_lines", + "warn_bad_lines", + ): kwds.pop(key, None) kwds["dtype"] = ensure_dtype_objs(kwds.get("dtype", None)) @@ -206,9 +217,6 @@ def _set_noconvert_columns(self): for col in noconvert_columns: self._reader.set_noconvert(col) - def set_error_bad_lines(self, status): - self._reader.set_error_bad_lines(int(status)) - def read(self, nrows=None): try: if self.low_memory: diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 8f6d95f001d91..3635d5b32faf4 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -74,9 +74,6 @@ def __init__(self, f: Union[FilePathOrBuffer, list], **kwds): self.quoting = kwds["quoting"] 
self.skip_blank_lines = kwds["skip_blank_lines"] - self.warn_bad_lines = kwds["warn_bad_lines"] - self.error_bad_lines = kwds["error_bad_lines"] - self.names_passed = kwds["names"] or None self.has_index_names = False @@ -707,10 +704,11 @@ def _next_line(self): def _alert_malformed(self, msg, row_num): """ - Alert a user about a malformed row. + Alert a user about a malformed row, depending on value of + `self.on_bad_lines` enum. - If `self.error_bad_lines` is True, the alert will be `ParserError`. - If `self.warn_bad_lines` is True, the alert will be printed out. + If `self.on_bad_lines` is ERROR, the alert will be `ParserError`. + If `self.on_bad_lines` is WARN, the alert will be printed out. Parameters ---------- @@ -719,9 +717,9 @@ def _alert_malformed(self, msg, row_num): Because this row number is displayed, we 1-index, even though we 0-index internally. """ - if self.error_bad_lines: + if self.on_bad_lines == self.BadLineHandleMethod.ERROR: raise ParserError(msg) - elif self.warn_bad_lines: + elif self.on_bad_lines == self.BadLineHandleMethod.WARN: base = f"Skipping line {row_num}: " sys.stderr.write(base + msg + "\n") @@ -742,7 +740,10 @@ def _next_iter_line(self, row_num): assert self.data is not None return next(self.data) except csv.Error as e: - if self.warn_bad_lines or self.error_bad_lines: + if ( + self.on_bad_lines == self.BadLineHandleMethod.ERROR + or self.on_bad_lines == self.BadLineHandleMethod.WARN + ): msg = str(e) if "NULL byte" in msg or "line contains NUL" in msg: @@ -947,11 +948,14 @@ def _rows_to_cols(self, content): actual_len = len(l) if actual_len > col_len: - if self.error_bad_lines or self.warn_bad_lines: + if ( + self.on_bad_lines == self.BadLineHandleMethod.ERROR + or self.on_bad_lines == self.BadLineHandleMethod.WARN + ): row_num = self.pos - (content_len - i + footers) bad_lines.append((row_num, actual_len)) - if self.error_bad_lines: + if self.on_bad_lines == self.BadLineHandleMethod.ERROR: break else: content.append(l) diff 
--git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index d957a669351c1..8bf1ab1260b8e 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -34,6 +34,7 @@ Appender, deprecate_nonkeyword_arguments, ) +from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.common import ( is_file_like, @@ -324,14 +325,32 @@ `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to override values, a ParserWarning will be issued. See csv.Dialect documentation for more details. -error_bad_lines : bool, default True +error_bad_lines : bool, default ``None`` Lines with too many fields (e.g. a csv line with too many commas) will by default cause an exception to be raised, and no DataFrame will be returned. If False, then these "bad lines" will be dropped from the DataFrame that is returned. -warn_bad_lines : bool, default True + + .. deprecated:: 1.3 + The ``on_bad_lines`` parameter should be used instead to specify behavior upon + encountering a bad line. +warn_bad_lines : bool, default ``None`` If error_bad_lines is False, and warn_bad_lines is True, a warning for each "bad line" will be output. + + .. deprecated:: 1.3 + The ``on_bad_lines`` parameter should be used instead to specify behavior upon + encountering a bad line. +on_bad_lines : {{'error', 'warn', 'skip'}}, default 'error' + Specifies what to do upon encountering a bad line (a line with too many fields). + Allowed values are: + + - 'error', raise an Exception when a bad line is encountered. + - 'warn', print a warning when a bad line is encountered and skip that line. + - 'skip', skip bad lines without raising or warning when they are encountered. + + .. versionadded:: 1.3 + delim_whitespace : bool, default False Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be used as the sep. Equivalent to setting ``sep='\\s+'``.
If this option @@ -384,8 +403,8 @@ "na_filter": True, "low_memory": True, "memory_map": False, - "error_bad_lines": True, - "warn_bad_lines": True, + "error_bad_lines": None, + "warn_bad_lines": None, "float_precision": None, } @@ -394,8 +413,8 @@ _c_unsupported = {"skipfooter"} _python_unsupported = {"low_memory", "float_precision"} -_deprecated_defaults: Dict[str, Any] = {} -_deprecated_args: Set[str] = set() +_deprecated_defaults: Dict[str, Any] = {"error_bad_lines": None, "warn_bad_lines": None} +_deprecated_args: Set[str] = {"error_bad_lines", "warn_bad_lines"} def validate_integer(name, val, min_val=0): @@ -538,8 +557,11 @@ def read_csv( encoding_errors: Optional[str] = "strict", dialect=None, # Error Handling - error_bad_lines=True, - warn_bad_lines=True, + error_bad_lines=None, + warn_bad_lines=None, + # TODO (2.0): set on_bad_lines to "error". + # See _refine_defaults_read comment for why we do this. + on_bad_lines=None, # Internal delim_whitespace=False, low_memory=_c_parser_defaults["low_memory"], @@ -558,6 +580,9 @@ def read_csv( delim_whitespace, engine, sep, + error_bad_lines, + warn_bad_lines, + on_bad_lines, names, prefix, defaults={"delimiter": ","}, @@ -626,8 +651,11 @@ def read_table( encoding=None, dialect=None, # Error Handling - error_bad_lines=True, - warn_bad_lines=True, + error_bad_lines=None, + warn_bad_lines=None, + # TODO (2.0): set on_bad_lines to "error". + # See _refine_defaults_read comment for why we do this. 
+ on_bad_lines=None, encoding_errors: Optional[str] = "strict", # Internal delim_whitespace=False, @@ -646,6 +674,9 @@ def read_table( delim_whitespace, engine, sep, + error_bad_lines, + warn_bad_lines, + on_bad_lines, names, prefix, defaults={"delimiter": "\t"}, @@ -947,7 +978,7 @@ def _clean_options(self, options, engine): f"The {arg} argument has been deprecated and will be " "removed in a future version.\n\n" ) - warnings.warn(msg, FutureWarning, stacklevel=2) + warnings.warn(msg, FutureWarning, stacklevel=7) else: result[arg] = parser_default @@ -1195,6 +1226,9 @@ def _refine_defaults_read( delim_whitespace: bool, engine: str, sep: Union[str, object], + error_bad_lines: Optional[bool], + warn_bad_lines: Optional[bool], + on_bad_lines: Optional[str], names: Union[Optional[ArrayLike], object], prefix: Union[Optional[str], object], defaults: Dict[str, Any], @@ -1222,6 +1256,12 @@ def _refine_defaults_read( sep : str or object A delimiter provided by the user (str) or a sentinel value, i.e. pandas._libs.lib.no_default. + error_bad_lines : bool or None + Whether to error on a bad line or not. + warn_bad_lines : bool or None + Whether to warn on a bad line or not. + on_bad_lines : str or None + An option for handling bad lines or a sentinel value (None). names : array-like, optional List of column names to use. If the file contains a header row, then you should explicitly pass ``header=0`` to override the column names. @@ -1238,8 +1278,11 @@ def _refine_defaults_read( Raises ------ - ValueError : If a delimiter was specified with ``sep`` (or ``delimiter``) and + ValueError : + If a delimiter was specified with ``sep`` (or ``delimiter``) and ``delim_whitespace=True``. + If ``on_bad_lines`` is specified (not ``None``) and ``error_bad_lines``/ + ``warn_bad_lines`` is also specified (not ``None``).
""" # fix types for sep, delimiter to Union(str, Any) delim_default = defaults["delimiter"] @@ -1292,6 +1335,48 @@ def _refine_defaults_read( kwds["engine"] = "c" kwds["engine_specified"] = False + # Ensure that on_bad_lines and error_bad_lines/warn_bad_lines + # aren't specified at the same time. If so, raise. Otherwise, + # alias on_bad_lines to "error" if error/warn_bad_lines not set + # and on_bad_lines is not set. on_bad_lines is defaulted to None + # so we can tell if it is set (this is why this hack exists). + if on_bad_lines is not None: + if error_bad_lines is not None or warn_bad_lines is not None: + raise ValueError( + "Both on_bad_lines and error_bad_lines/warn_bad_lines are set. " + "Please only set on_bad_lines." + ) + if on_bad_lines == "error": + kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.ERROR + elif on_bad_lines == "warn": + kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.WARN + elif on_bad_lines == "skip": + kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.SKIP + else: + raise ValueError(f"Argument {on_bad_lines} is invalid for on_bad_lines") + else: + if error_bad_lines is not None: + # Must check is_bool, because other stuff(e.g. 
non-empty lists) eval to true + validate_bool_kwarg(error_bad_lines, "error_bad_lines") + if error_bad_lines: + kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.ERROR + else: + if warn_bad_lines is not None: + # This is the case where error_bad_lines is False + # We can only warn/skip if error_bad_lines is False + # None doesn't work because backwards-compatibility reasons + validate_bool_kwarg(warn_bad_lines, "warn_bad_lines") + if warn_bad_lines: + kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.WARN + else: + kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.SKIP + else: + # Backwards compat, when only error_bad_lines = false, we warn + kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.WARN + else: + # Everything None -> Error + kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.ERROR + return kwds diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index eba5e52516b4c..97b3be1306cd5 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -803,6 +803,19 @@ def test_encoding_surrogatepass(all_parsers): parser.read_csv(path) +@pytest.mark.parametrize("on_bad_lines", ["error", "warn"]) +def test_deprecated_bad_lines_warns(all_parsers, csv1, on_bad_lines): + # GH 15122 + parser = all_parsers + kwds = {f"{on_bad_lines}_bad_lines": False} + with tm.assert_produces_warning( + FutureWarning, + match=f"The {on_bad_lines}_bad_lines argument has been deprecated " + "and will be removed in a future version.\n\n", + ): + parser.read_csv(csv1, **kwds) + + def test_malformed_second_line(all_parsers): # see GH14782 parser = all_parsers diff --git a/pandas/tests/io/parser/common/test_read_errors.py b/pandas/tests/io/parser/common/test_read_errors.py index 4e3d99af685ec..f5438ea3f0296 100644 --- a/pandas/tests/io/parser/common/test_read_errors.py +++ b/pandas/tests/io/parser/common/test_read_errors.py @@ -140,27 +140,37 @@ def 
test_unexpected_keyword_parameter_exception(all_parsers): parser.read_table("foo.tsv", foo=1) -def test_suppress_error_output(all_parsers, capsys): +@pytest.mark.parametrize( + "kwargs", + [ + pytest.param( + {"error_bad_lines": False, "warn_bad_lines": False}, + marks=pytest.mark.filterwarnings("ignore"), + ), + {"on_bad_lines": "skip"}, + ], +) +def test_suppress_error_output(all_parsers, capsys, kwargs): # see gh-15925 parser = all_parsers data = "a\n1\n1,2,3\n4\n5,6,7" expected = DataFrame({"a": [1, 4]}) - result = parser.read_csv( - StringIO(data), error_bad_lines=False, warn_bad_lines=False - ) + result = parser.read_csv(StringIO(data), **kwargs) tm.assert_frame_equal(result, expected) captured = capsys.readouterr() assert captured.err == "" +@pytest.mark.filterwarnings("ignore") @pytest.mark.parametrize( "kwargs", [{}, {"error_bad_lines": True}], # Default is True. # Explicitly pass in. ) @pytest.mark.parametrize( - "warn_kwargs", [{}, {"warn_bad_lines": True}, {"warn_bad_lines": False}] + "warn_kwargs", + [{}, {"warn_bad_lines": True}, {"warn_bad_lines": False}], ) def test_error_bad_lines(all_parsers, kwargs, warn_kwargs): # see gh-15925 @@ -173,13 +183,23 @@ def test_error_bad_lines(all_parsers, kwargs, warn_kwargs): parser.read_csv(StringIO(data), **kwargs) -def test_warn_bad_lines(all_parsers, capsys): +@pytest.mark.parametrize( + "kwargs", + [ + pytest.param( + {"error_bad_lines": False, "warn_bad_lines": True}, + marks=pytest.mark.filterwarnings("ignore"), + ), + {"on_bad_lines": "warn"}, + ], +) +def test_warn_bad_lines(all_parsers, capsys, kwargs): # see gh-15925 parser = all_parsers data = "a\n1\n1,2,3\n4\n5,6,7" expected = DataFrame({"a": [1, 4]}) - result = parser.read_csv(StringIO(data), error_bad_lines=False, warn_bad_lines=True) + result = parser.read_csv(StringIO(data), **kwargs) tm.assert_frame_equal(result, expected) captured = capsys.readouterr() @@ -234,3 +254,24 @@ def test_open_file(all_parsers): with pytest.raises(csv.Error, 
match="Could not determine delimiter"): parser.read_csv(file, sep=None, encoding_errors="replace") assert len(record) == 0, record[0].message + + +def test_invalid_on_bad_line(all_parsers): + parser = all_parsers + data = "a\n1\n1,2,3\n4\n5,6,7" + with pytest.raises(ValueError, match="Argument abc is invalid for on_bad_lines"): + parser.read_csv(StringIO(data), on_bad_lines="abc") + + +@pytest.mark.parametrize("error_bad_lines", [True, False]) +@pytest.mark.parametrize("warn_bad_lines", [True, False]) +def test_conflict_on_bad_line(all_parsers, error_bad_lines, warn_bad_lines): + parser = all_parsers + data = "a\n1\n1,2,3\n4\n5,6,7" + kwds = {"error_bad_lines": error_bad_lines, "warn_bad_lines": warn_bad_lines} + with pytest.raises( + ValueError, + match="Both on_bad_lines and error_bad_lines/warn_bad_lines are set. " + "Please only set on_bad_lines.", + ): + parser.read_csv(StringIO(data), on_bad_lines="error", **kwds) diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py index 5d1fa426ff24c..160e00f5fb930 100644 --- a/pandas/tests/io/parser/test_c_parser_only.py +++ b/pandas/tests/io/parser/test_c_parser_only.py @@ -498,7 +498,7 @@ def test_comment_whitespace_delimited(c_parser_only, capsys): header=None, delimiter="\\s+", skiprows=0, - error_bad_lines=False, + on_bad_lines="warn", ) captured = capsys.readouterr() # skipped lines 2, 3, 4, 9 diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index cf6866946ab76..f62c9fd1349bf 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -276,9 +276,7 @@ def test_none_delimiter(python_parser_only, capsys): # We expect the third line in the data to be # skipped because it is malformed, but we do # not expect any errors to occur. 
- result = parser.read_csv( - StringIO(data), header=0, sep=None, warn_bad_lines=True, error_bad_lines=False - ) + result = parser.read_csv(StringIO(data), header=0, sep=None, on_bad_lines="warn") tm.assert_frame_equal(result, expected) captured = capsys.readouterr() diff --git a/pandas/tests/io/parser/test_textreader.py b/pandas/tests/io/parser/test_textreader.py index 7f84c5e378d16..d594bf8a75d49 100644 --- a/pandas/tests/io/parser/test_textreader.py +++ b/pandas/tests/io/parser/test_textreader.py @@ -140,11 +140,7 @@ def test_skip_bad_lines(self, capsys): reader.read() reader = TextReader( - StringIO(data), - delimiter=":", - header=None, - error_bad_lines=False, - warn_bad_lines=False, + StringIO(data), delimiter=":", header=None, on_bad_lines=2 # Skip ) result = reader.read() expected = { @@ -155,11 +151,7 @@ def test_skip_bad_lines(self, capsys): assert_array_dicts_equal(result, expected) reader = TextReader( - StringIO(data), - delimiter=":", - header=None, - error_bad_lines=False, - warn_bad_lines=True, + StringIO(data), delimiter=":", header=None, on_bad_lines=1 # Warn ) reader.read() captured = capsys.readouterr()