From 57e567fb279a95a94d2e3db4a3ca2111f3b21a24 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Tue, 27 Dec 2022 12:52:59 +0100 Subject: [PATCH] fix(rust, python): csv, read escaped "" as missing (#5912) --- [NOTE(review), commentary only, not part of the diff: (1) In the first buffer.rs hunk, the changed branch now appends null for empty bytes even when ignore_errors is false. (2) In the BooleanChunkedBuilder hunk, the new `&bytes[1..bytes.len() - 1]` panics when needs_escaping is true and bytes.len() < 2; this patch does not show whether callers guarantee both enclosing quotes are present - TODO confirm, or guard with `bytes.len() >= 2`. (3) The added Python test only forces col3 to pl.Int32 and expects no boolean column, so the boolean path changed here is not exercised by it.] polars/polars-io/src/csv/buffer.rs | 13 +++++++------ py-polars/tests/unit/io/test_csv.py | 22 ++++++++++++++++++++++ 2 files changed, 29 insertions(+), 6 deletions(-) diff --git a/polars/polars-io/src/csv/buffer.rs b/polars/polars-io/src/csv/buffer.rs index 3413039a4010c..f4bf9919678c3 100644 --- a/polars/polars-io/src/csv/buffer.rs +++ b/polars/polars-io/src/csv/buffer.rs @@ -95,7 +95,7 @@ where let bytes = skip_whitespace(bytes); return self.parse_bytes(bytes, ignore_errors, needs_escaping); } - if ignore_errors { + if ignore_errors || bytes.is_empty() { self.append_null() } else { return Err(PolarsError::ComputeError("".into())); @@ -349,16 +349,17 @@ impl ParsedBuffer for BooleanChunkedBuilder { &mut self, bytes: &[u8], ignore_errors: bool, - _needs_escaping: bool, + needs_escaping: bool, ) -> PolarsResult<()> { + let bytes = if needs_escaping { + &bytes[1..bytes.len() - 1] + } else { + bytes + }; if bytes.eq_ignore_ascii_case(b"false") { self.append_value(false); } else if bytes.eq_ignore_ascii_case(b"true") { self.append_value(true); - } else if bytes.eq_ignore_ascii_case(b"\"false\"") { - self.append_value(false); - } else if bytes.eq_ignore_ascii_case(b"\"true\"") { - self.append_value(true); } else if ignore_errors || bytes.is_empty() { self.append_null(); } else { diff --git a/py-polars/tests/unit/io/test_csv.py b/py-polars/tests/unit/io/test_csv.py index 492cdca718899..304bf27ccd834 100644 --- a/py-polars/tests/unit/io/test_csv.py +++ b/py-polars/tests/unit/io/test_csv.py @@ -961,3 +961,25 @@ def test_csv_single_categorical_null() -> None: assert df.dtypes == [pl.Utf8, pl.Categorical, pl.Utf8] assert df.to_dict(False) == {"x": ["A"], "y": [None], "z": ["A"]} + + +def test_csv_quoted_missing() -> None: + csv = '''"col1"|"col2"|"col3"|"col4" 
+"0"|"Free text with a line +break"|"123"|"456" +"1"|"Free text without a linebreak"|""|"789" +"0"|"Free text with +two +linebreaks"|"101112"|"131415"''' # noqa: W291 + assert pl.read_csv(csv.encode(), sep="|", dtypes={"col3": pl.Int32}).to_dict( + False + ) == { + "col1": [0, 1, 0], + "col2": [ + "Free text with a line\nbreak", + "Free text without a linebreak", + "Free text with \ntwo \nlinebreaks", + ], + "col3": [123, None, 101112], + "col4": [456, 789, 131415], + }