diff --git a/polars/polars-io/src/csv/parser.rs b/polars/polars-io/src/csv/parser.rs index 5e406a47f7d3..82714fed8bfd 100644 --- a/polars/polars-io/src/csv/parser.rs +++ b/polars/polars-io/src/csv/parser.rs @@ -43,7 +43,7 @@ pub(crate) fn next_line_position( } debug_assert!(pos <= input.len()); let new_input = unsafe { input.get_unchecked(pos..) }; - let line = SplitLines::new(new_input, eol_char).next(); + let line = SplitLines::new(new_input, quote_char.unwrap_or(b'"'), eol_char).next(); match (line, expected_fields) { // count the fields, and determine if they are equal to what we expect from the schema @@ -214,13 +214,15 @@ pub(crate) fn get_line_stats( /// For instance: "This is a valid field\nI have multiples lines" is a valid string field, that contains multiple lines. pub(crate) struct SplitLines<'a> { v: &'a [u8], + quote_char: u8, end_line_char: u8, } impl<'a> SplitLines<'a> { - pub(crate) fn new(slice: &'a [u8], end_line_char: u8) -> Self { + pub(crate) fn new(slice: &'a [u8], quote_char: u8, end_line_char: u8) -> Self { Self { v: slice, + quote_char, end_line_char, } } @@ -244,7 +246,7 @@ impl<'a> Iterator for SplitLines<'a> { Some(&c) => { pos += 1; - if c == b'"' { + if c == self.quote_char { // toggle between string field enclosure // if we encounter a starting '"' -> in_field = true; // if we encounter a closing '"' -> in_field = false; @@ -641,9 +643,15 @@ mod test { #[test] fn test_splitlines() { let input = "1,\"foo\n\"\n2,\"foo\n\"\n"; - let mut lines = SplitLines::new(input.as_bytes(), b'\n'); + let mut lines = SplitLines::new(input.as_bytes(), b'"', b'\n'); assert_eq!(lines.next(), Some("1,\"foo\n\"".as_bytes())); assert_eq!(lines.next(), Some("2,\"foo\n\"".as_bytes())); assert_eq!(lines.next(), None); + + let input2 = "1,'foo\n'\n2,'foo\n'\n"; + let mut lines2 = SplitLines::new(input2.as_bytes(), b'\'', b'\n'); + assert_eq!(lines2.next(), Some("1,'foo\n'".as_bytes())); + assert_eq!(lines2.next(), Some("2,'foo\n'".as_bytes())); + assert_eq!(lines2.next(), None); } } diff --git a/polars/polars-io/src/csv/utils.rs b/polars/polars-io/src/csv/utils.rs index d00d3046c26b..0f70f6735d8e 100644 --- a/polars/polars-io/src/csv/utils.rs +++ b/polars/polars-io/src/csv/utils.rs @@ -191,7 +191,7 @@ pub fn infer_file_schema( if bytes.is_empty() { return Err(PolarsError::NoData("empty csv".into())); } - let mut lines = SplitLines::new(bytes, eol_char).skip(*skip_rows); + let mut lines = SplitLines::new(bytes, quote_char.unwrap_or(b'"'), eol_char).skip(*skip_rows); // it can be that we have a single line without eol char let has_eol = bytes.contains(&eol_char); @@ -295,7 +295,7 @@ pub fn infer_file_schema( }; if !has_header { // re-init lines so that the header is included in type inference. - lines = SplitLines::new(bytes, eol_char).skip(*skip_rows); + lines = SplitLines::new(bytes, quote_char.unwrap_or(b'"'), eol_char).skip(*skip_rows); } let header_length = headers.len(); diff --git a/polars/polars-io/src/ndjson_core/ndjson.rs b/polars/polars-io/src/ndjson_core/ndjson.rs index 08bfe155e7d2..f35aa1fce89d 100644 --- a/polars/polars-io/src/ndjson_core/ndjson.rs +++ b/polars/polars-io/src/ndjson_core/ndjson.rs @@ -324,7 +324,7 @@ fn parse_lines<'a>( let total_bytes = bytes.len(); let mut offset = 0; - for line in SplitLines::new(bytes, NEWLINE) { + for line in SplitLines::new(bytes, QUOTE_CHAR, NEWLINE) { offset += 1; // the newline offset += parse_impl(line, buffers, &mut buf)?; } diff --git a/polars/tests/it/io/csv.rs b/polars/tests/it/io/csv.rs index 32b9ea348901..18070227d01e 100644 --- a/polars/tests/it/io/csv.rs +++ b/polars/tests/it/io/csv.rs @@ -228,6 +228,23 @@ fn test_escape_double_quotes() { ))); } +#[test] +fn test_newline_in_custom_quote_char() { + // newline inside custom quote char (default is ") should parse correctly + let csv = r#"column_1,column_2 + 1,'foo + bar' + 2,'bar' +"#; + + let file = Cursor::new(csv); + let df = CsvReader::new(file) + .with_quote_char(Some(b'\'')) + .finish() + .unwrap(); + assert_eq!(df.shape(), (2, 2)); +} + #[test] fn test_escape_2() { // this is is harder than it looks.