From 22b1f5d3de766d3ad9374744556d216b9aa82c19 Mon Sep 17 00:00:00 2001 From: dmitrybugakov Date: Thu, 16 Nov 2023 16:32:34 +0100 Subject: [PATCH] feature(rust): Implement multi-character comment support in read_csv #10583 --- crates/polars-io/src/csv/mod.rs | 2 +- crates/polars-io/src/csv/parser.rs | 25 ++++++--- crates/polars-io/src/csv/read.rs | 55 ++++++++++++++++--- .../src/csv/read_impl/batched_mmap.rs | 6 +- .../src/csv/read_impl/batched_read.rs | 6 +- crates/polars-io/src/csv/read_impl/mod.rs | 29 +++++----- crates/polars-io/src/csv/utils.rs | 41 +++++++------- .../src/physical_plan/executors/scan/csv.rs | 2 +- crates/polars-lazy/src/scan/csv.rs | 22 +++++--- .../polars-pipe/src/executors/sources/csv.rs | 2 +- .../polars-plan/src/logical_plan/builder.rs | 7 ++- .../polars-plan/src/logical_plan/options.rs | 4 +- crates/polars/tests/it/io/csv.rs | 20 ++++++- py-polars/polars/dataframe/frame.py | 6 +- py-polars/polars/io/csv/batched_reader.py | 4 +- py-polars/polars/io/csv/functions.py | 42 ++++++++------ py-polars/polars/lazyframe/frame.py | 4 +- py-polars/src/batched_csv.rs | 7 +-- py-polars/src/dataframe.rs | 7 +-- py-polars/src/lazyframe.rs | 7 +-- py-polars/tests/unit/io/test_csv.py | 23 +++++++- 21 files changed, 206 insertions(+), 115 deletions(-) diff --git a/crates/polars-io/src/csv/mod.rs b/crates/polars-io/src/csv/mod.rs index f5dd4b1f6e14..a6cf97246fcc 100644 --- a/crates/polars-io/src/csv/mod.rs +++ b/crates/polars-io/src/csv/mod.rs @@ -59,7 +59,7 @@ use polars_core::prelude::*; use polars_time::prelude::*; #[cfg(feature = "temporal")] use rayon::prelude::*; -pub use read::{CsvEncoding, CsvReader, NullValues}; +pub use read::{CommentPrefix, CsvEncoding, CsvReader, NullValues}; #[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; pub use write::{BatchedWriter, CsvWriter, QuoteStyle}; diff --git a/crates/polars-io/src/csv/parser.rs b/crates/polars-io/src/csv/parser.rs index 1b7880f1352e..885f934b78da 100644 --- a/crates/polars-io/src/csv/parser.rs +++ b/crates/polars-io/src/csv/parser.rs @@ -5,6 +5,7 @@ use polars_core::prelude::*; use super::buffer::*; use crate::csv::read::NullValuesCompiled; use crate::csv::splitfields::SplitFields; +use crate::csv::CommentPrefix; /// Skip the utf-8 Byte Order Mark. /// credits to csv-core @@ -16,6 +17,17 @@ pub(crate) fn skip_bom(input: &[u8]) -> &[u8] { } } +/// Checks if a line in a CSV file is a comment based on the given comment prefix configuration. +/// +/// This function is used during CSV parsing to determine whether a line should be ignored based on its starting characters. +pub(crate) fn is_comment_line(line: &[u8], comment_prefix: Option<&CommentPrefix>) -> bool { + match comment_prefix { + Some(CommentPrefix::Single(c)) => line.starts_with(&[*c]), + Some(CommentPrefix::Multi(s)) => line.starts_with(s.as_bytes()), + None => false, + } +} + /// Find the nearest next line position. /// Does not check for new line characters embedded in String fields. pub(crate) fn next_line_position_naive(input: &[u8], eol_char: u8) -> Option { @@ -351,7 +363,7 @@ pub(super) fn parse_lines<'a>( mut bytes: &'a [u8], offset: usize, separator: u8, - comment_char: Option, + comment_prefix: Option<&CommentPrefix>, quote_char: Option, eol_char: u8, missing_is_null: bool, @@ -400,13 +412,10 @@ pub(super) fn parse_lines<'a>( } // deal with comments - if let Some(c) = comment_char { - // line is a comment -> skip - if bytes[0] == c { - let bytes_rem = skip_this_line(bytes, quote_char, eol_char); - bytes = bytes_rem; - continue; - } + if is_comment_line(bytes, comment_prefix) { + let bytes_rem = skip_this_line(bytes, quote_char, eol_char); + bytes = bytes_rem; + continue; } // Every line we only need to parse the columns that are projected. diff --git a/crates/polars-io/src/csv/read.rs b/crates/polars-io/src/csv/read.rs index 4d8527b70b80..2f3d3108ab38 100644 --- a/crates/polars-io/src/csv/read.rs +++ b/crates/polars-io/src/csv/read.rs @@ -25,6 +25,33 @@ pub enum NullValues { Named(Vec<(String, String)>), } +#[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +pub enum CommentPrefix { + /// A single byte character that indicates the start of a comment line. + Single(u8), + /// A string that indicates the start of a comment line. + /// This allows for multiple characters to be used as a comment identifier. + Multi(String), +} + +impl CommentPrefix { + /// Creates a new `CommentPrefix` for the `Single` variant. + pub fn new_single(c: u8) -> Self { + CommentPrefix::Single(c) + } + + /// Creates a new `CommentPrefix`. If `Multi` variant is used and the string is longer + /// than 5 characters, it will return `None`. + pub fn new_multi(s: String) -> Option { + if s.len() <= 5 { + Some(CommentPrefix::Multi(s)) + } else { + None + } + } +} + pub(super) enum NullValuesCompiled { /// A single value that's used for all columns AllColumnsSingle(String), @@ -118,7 +145,7 @@ where dtype_overwrite: Option<&'a [DataType]>, sample_size: usize, chunk_size: usize, - comment_char: Option, + comment_prefix: Option, null_values: Option, predicate: Option>, quote_char: Option, @@ -210,9 +237,21 @@ where self } - /// Set the comment character. Lines starting with this character will be ignored. - pub fn with_comment_char(mut self, comment_char: Option) -> Self { - self.comment_char = comment_char; + /// Set the comment prefix for this instance. Lines starting with this prefix will be ignored. + pub fn with_comment_prefix(mut self, comment_prefix: Option<&str>) -> Self { + self.comment_prefix = comment_prefix.map(|s| { + if s.len() == 1 && s.chars().next().unwrap().is_ascii() { + CommentPrefix::Single(s.as_bytes()[0]) + } else { + CommentPrefix::Multi(s.to_string()) + } + }); + self + } + + /// Sets the comment prefix from `CsvParserOptions` for internal initialization. + pub fn _with_comment_prefix(mut self, comment_prefix: Option) -> Self { + self.comment_prefix = comment_prefix; self } @@ -370,7 +409,7 @@ impl<'a, R: MmapBytesReader + 'a> CsvReader<'a, R> { self.sample_size, self.chunk_size, self.low_memory, - self.comment_char, + std::mem::take(&mut self.comment_prefix), self.quote_char, self.eol_char, std::mem::take(&mut self.null_values), @@ -487,7 +526,7 @@ impl<'a> CsvReader<'a, Box> { None, &mut self.skip_rows_before_header, self.skip_rows_after_header, - self.comment_char, + self.comment_prefix.as_ref(), self.quote_char, self.eol_char, self.null_values.as_ref(), @@ -516,7 +555,7 @@ impl<'a> CsvReader<'a, Box> { None, &mut self.skip_rows_before_header, self.skip_rows_after_header, - self.comment_char, + self.comment_prefix.as_ref(), self.quote_char, self.eol_char, self.null_values.as_ref(), @@ -556,7 +595,7 @@ where sample_size: 1024, chunk_size: 1 << 18, low_memory: false, - comment_char: None, + comment_prefix: None, eol_char: b'\n', null_values: None, missing_is_null: true, diff --git a/crates/polars-io/src/csv/read_impl/batched_mmap.rs b/crates/polars-io/src/csv/read_impl/batched_mmap.rs index f0299ca40fe9..3b0083f51092 100644 --- a/crates/polars-io/src/csv/read_impl/batched_mmap.rs +++ b/crates/polars-io/src/csv/read_impl/batched_mmap.rs @@ -154,7 +154,7 @@ impl<'a> CoreReader<'a> { projection, starting_point_offset, row_count: self.row_count, - comment_char: self.comment_char, + comment_prefix: self.comment_prefix, quote_char: self.quote_char, eol_char: self.eol_char, null_values: self.null_values, @@ -182,7 +182,7 @@ pub struct BatchedCsvReaderMmap<'a> { projection: Vec, starting_point_offset: Option, row_count: Option, - comment_char: Option, + comment_prefix: Option, quote_char: Option, eol_char: u8, null_values: Option, @@ -240,7 +240,7 @@ impl<'a> BatchedCsvReaderMmap<'a> { bytes_offset_thread, self.quote_char, self.eol_char, - self.comment_char, + self.comment_prefix.as_ref(), self.chunk_size, &self.str_capacities, self.encoding, diff --git a/crates/polars-io/src/csv/read_impl/batched_read.rs b/crates/polars-io/src/csv/read_impl/batched_read.rs index 9e8e6b6e6836..753cc281110e 100644 --- a/crates/polars-io/src/csv/read_impl/batched_read.rs +++ b/crates/polars-io/src/csv/read_impl/batched_read.rs @@ -237,7 +237,7 @@ impl<'a> CoreReader<'a> { projection, starting_point_offset, row_count: self.row_count, - comment_char: self.comment_char, + comment_prefix: self.comment_prefix, quote_char: self.quote_char, eol_char: self.eol_char, null_values: self.null_values, @@ -265,7 +265,7 @@ pub struct BatchedCsvReaderRead<'a> { projection: Vec, starting_point_offset: Option, row_count: Option, - comment_char: Option, + comment_prefix: Option, quote_char: Option, eol_char: u8, null_values: Option, @@ -337,7 +337,7 @@ impl<'a> BatchedCsvReaderRead<'a> { 0, self.quote_char, self.eol_char, - self.comment_char, + self.comment_prefix.as_ref(), self.chunk_size, &self.str_capacities, self.encoding, diff --git a/crates/polars-io/src/csv/read_impl/mod.rs b/crates/polars-io/src/csv/read_impl/mod.rs index aa077a9e61a2..29c04bac35a1 100644 --- a/crates/polars-io/src/csv/read_impl/mod.rs +++ b/crates/polars-io/src/csv/read_impl/mod.rs @@ -20,7 +20,7 @@ use rayon::prelude::*; use crate::csv::buffer::*; use crate::csv::parser::*; -use crate::csv::read::NullValuesCompiled; +use crate::csv::read::{CommentPrefix, NullValuesCompiled}; use crate::csv::utils::*; use crate::csv::{CsvEncoding, NullValues}; use crate::mmap::ReaderBytes; @@ -109,7 +109,7 @@ pub(crate) struct CoreReader<'a> { sample_size: usize, chunk_size: usize, low_memory: bool, - comment_char: Option, + comment_prefix: Option, quote_char: Option, eol_char: u8, null_values: Option, @@ -198,7 +198,7 @@ impl<'a> CoreReader<'a> { sample_size: usize, chunk_size: usize, low_memory: bool, - comment_char: Option, + comment_prefix: Option, quote_char: Option, eol_char: u8, null_values: Option, @@ -247,7 +247,7 @@ impl<'a> CoreReader<'a> { schema_overwrite.as_deref(), &mut skip_rows, skip_rows_after_header, - comment_char, + comment_prefix.as_ref(), quote_char, eol_char, null_values.as_ref(), @@ -299,7 +299,7 @@ impl<'a> CoreReader<'a> { sample_size, chunk_size, low_memory, - comment_char, + comment_prefix, quote_char, eol_char, null_values, @@ -342,14 +342,13 @@ impl<'a> CoreReader<'a> { if self.skip_rows_after_header > 0 { for _ in 0..self.skip_rows_after_header { - let pos = match bytes.first() { - Some(first) if Some(*first) == self.comment_char => { - next_line_position_naive(bytes, eol_char) - }, + let pos = if is_comment_line(bytes, self.comment_prefix.as_ref()) { + next_line_position_naive(bytes, eol_char) + } else { // we don't pass expected fields // as we want to skip all rows // no matter the no. of fields - _ => next_line_position(bytes, None, self.separator, self.quote_char, eol_char), + next_line_position(bytes, None, self.separator, self.quote_char, eol_char) } .ok_or_else(|| polars_err!(NoData: "not enough lines to skip"))?; @@ -598,7 +597,7 @@ impl<'a> CoreReader<'a> { local_bytes, offset, self.separator, - self.comment_char, + self.comment_prefix.as_ref(), self.quote_char, self.eol_char, self.missing_is_null, @@ -670,7 +669,7 @@ impl<'a> CoreReader<'a> { bytes_offset_thread, self.quote_char, self.eol_char, - self.comment_char, + self.comment_prefix.as_ref(), capacity, &str_capacities, self.encoding, @@ -716,7 +715,7 @@ impl<'a> CoreReader<'a> { remaining_bytes, 0, self.separator, - self.comment_char, + self.comment_prefix.as_ref(), self.quote_char, self.eol_char, self.missing_is_null, @@ -800,7 +799,7 @@ fn read_chunk( bytes_offset_thread: usize, quote_char: Option, eol_char: u8, - comment_char: Option, + comment_prefix: Option<&CommentPrefix>, capacity: usize, str_capacities: &[RunningSize], encoding: CsvEncoding, @@ -835,7 +834,7 @@ fn read_chunk( local_bytes, offset, separator, - comment_char, + comment_prefix, quote_char, eol_char, missing_is_null, diff --git a/crates/polars-io/src/csv/utils.rs b/crates/polars-io/src/csv/utils.rs index 6f7ef7c054f9..8ff7608eeaec 100644 --- a/crates/polars-io/src/csv/utils.rs +++ b/crates/polars-io/src/csv/utils.rs @@ -16,7 +16,8 @@ use crate::csv::parser::{next_line_position, skip_bom, skip_line_ending, SplitLi use crate::csv::splitfields::SplitFields; use crate::csv::CsvEncoding; use crate::mmap::ReaderBytes; -use crate::prelude::NullValues; +use crate::prelude::parser::is_comment_line; +use crate::prelude::{CommentPrefix, NullValues}; use crate::utils::{BOOLEAN_RE, FLOAT_RE, INTEGER_RE}; pub(crate) fn get_file_chunks( @@ -142,7 +143,7 @@ pub fn infer_file_schema_inner( // on the schema inference skip_rows: &mut usize, skip_rows_after_header: usize, - comment_char: Option, + comment_prefix: Option<&CommentPrefix>, quote_char: Option, eol_char: u8, null_values: Option<&NullValues>, @@ -170,19 +171,19 @@ pub fn infer_file_schema_inner( // skip lines that are comments let mut first_line = None; - if let Some(comment_ch) = comment_char { - for (i, line) in (&mut lines).enumerate() { - if let Some(ch) = line.first() { - if *ch != comment_ch { - first_line = Some(line); - *skip_rows += i; - break; - } - } + + for (i, line) in (&mut lines).enumerate() { + if !is_comment_line(line, comment_prefix) { + first_line = Some(line); + *skip_rows += i; + break; } - } else { + } + + if first_line.is_none() { first_line = lines.next(); } + // edge case where we have a single row, no header and no eol char. if first_line.is_none() && !has_eol && !has_header { first_line = Some(bytes); @@ -254,7 +255,7 @@ pub fn infer_file_schema_inner( schema_overwrite, skip_rows, skip_rows_after_header, - comment_char, + comment_prefix, quote_char, eol_char, null_values, @@ -310,11 +311,9 @@ pub fn infer_file_schema_inner( continue; } - if let Some(c) = comment_char { - // line is a comment -> skip - if line[0] == c { - continue; - } + // line is a comment -> skip + if is_comment_line(line, comment_prefix) { + continue; } let len = line.len(); @@ -448,7 +447,7 @@ pub fn infer_file_schema_inner( schema_overwrite, skip_rows, skip_rows_after_header, - comment_char, + comment_prefix, quote_char, eol_char, null_values, @@ -481,7 +480,7 @@ pub fn infer_file_schema( // on the schema inference skip_rows: &mut usize, skip_rows_after_header: usize, - comment_char: Option, + comment_prefix: Option<&CommentPrefix>, quote_char: Option, eol_char: u8, null_values: Option<&NullValues>, @@ -496,7 +495,7 @@ pub fn infer_file_schema( schema_overwrite, skip_rows, skip_rows_after_header, - comment_char, + comment_prefix, quote_char, eol_char, null_values, diff --git a/crates/polars-lazy/src/physical_plan/executors/scan/csv.rs b/crates/polars-lazy/src/physical_plan/executors/scan/csv.rs index bac591b84f86..f05875342555 100644 --- a/crates/polars-lazy/src/physical_plan/executors/scan/csv.rs +++ b/crates/polars-lazy/src/physical_plan/executors/scan/csv.rs @@ -38,7 +38,7 @@ impl CsvExec { .with_null_values(std::mem::take(&mut self.options.null_values)) .with_predicate(predicate) .with_encoding(CsvEncoding::LossyUtf8) - .with_comment_char(self.options.comment_char) + ._with_comment_prefix(std::mem::take(&mut self.options.comment_prefix)) .with_quote_char(self.options.quote_char) .with_end_of_line_char(self.options.eol_char) .with_encoding(self.options.encoding) diff --git a/crates/polars-lazy/src/scan/csv.rs b/crates/polars-lazy/src/scan/csv.rs index 2543dcbea92b..d0b15218a506 100644 --- a/crates/polars-lazy/src/scan/csv.rs +++ b/crates/polars-lazy/src/scan/csv.rs @@ -2,7 +2,7 @@ use std::path::{Path, PathBuf}; use polars_core::prelude::*; use polars_io::csv::utils::infer_file_schema; -use polars_io::csv::{CsvEncoding, NullValues}; +use polars_io::csv::{CommentPrefix, CsvEncoding, NullValues}; use polars_io::utils::get_reader_bytes; use polars_io::RowCount; @@ -23,7 +23,7 @@ pub struct LazyCsvReader<'a> { schema: Option, schema_overwrite: Option<&'a Schema>, low_memory: bool, - comment_char: Option, + comment_prefix: Option, quote_char: Option, eol_char: u8, null_values: Option, @@ -57,7 +57,7 @@ impl<'a> LazyCsvReader<'a> { schema: None, schema_overwrite: None, low_memory: false, - comment_char: None, + comment_prefix: None, quote_char: Some(b'"'), eol_char: b'\n', null_values: None, @@ -147,10 +147,16 @@ impl<'a> LazyCsvReader<'a> { self } - /// Set the comment character. Lines starting with this character will be ignored. + /// Set the comment prefix for this instance. Lines starting with this prefix will be ignored. #[must_use] - pub fn with_comment_char(mut self, comment_char: Option) -> Self { - self.comment_char = comment_char; + pub fn with_comment_prefix(mut self, comment_prefix: Option<&str>) -> Self { + self.comment_prefix = comment_prefix.map(|s| { + if s.len() == 1 && s.chars().next().unwrap().is_ascii() { + CommentPrefix::Single(s.as_bytes()[0]) + } else { + CommentPrefix::Multi(s.to_string()) + } + }); self } @@ -252,7 +258,7 @@ impl<'a> LazyCsvReader<'a> { None, &mut skip_rows, self.skip_rows_after_header, - self.comment_char, + self.comment_prefix.as_ref(), self.quote_char, self.eol_char, None, @@ -285,7 +291,7 @@ impl LazyFileListReader for LazyCsvReader<'_> { self.schema, self.schema_overwrite, self.low_memory, - self.comment_char, + self.comment_prefix, self.quote_char, self.eol_char, self.null_values, diff --git a/crates/polars-pipe/src/executors/sources/csv.rs b/crates/polars-pipe/src/executors/sources/csv.rs index b1297c39e07c..271cc9d5c621 100644 --- a/crates/polars-pipe/src/executors/sources/csv.rs +++ b/crates/polars-pipe/src/executors/sources/csv.rs @@ -71,7 +71,7 @@ impl CsvSource { .low_memory(options.low_memory) .with_null_values(options.null_values) .with_encoding(CsvEncoding::LossyUtf8) - .with_comment_char(options.comment_char) + ._with_comment_prefix(options.comment_prefix) .with_quote_char(options.quote_char) .with_end_of_line_char(options.eol_char) .with_encoding(options.encoding) diff --git a/crates/polars-plan/src/logical_plan/builder.rs b/crates/polars-plan/src/logical_plan/builder.rs index 3612e05e3f4d..43ca98e91c9d 100644 --- a/crates/polars-plan/src/logical_plan/builder.rs +++ b/crates/polars-plan/src/logical_plan/builder.rs @@ -23,6 +23,7 @@ use polars_io::RowCount; #[cfg(feature = "csv")] use polars_io::{ csv::utils::{infer_file_schema, is_compressed}, + csv::CommentPrefix, csv::CsvEncoding, csv::NullValues, utils::get_reader_bytes, @@ -285,7 +286,7 @@ impl LogicalPlanBuilder { mut schema: Option>, schema_overwrite: Option<&Schema>, low_memory: bool, - comment_char: Option, + comment_prefix: Option, quote_char: Option, eol_char: u8, null_values: Option, @@ -325,7 +326,7 @@ impl LogicalPlanBuilder { schema_overwrite, &mut skip_rows, skip_rows_after_header, - comment_char, + comment_prefix.as_ref(), quote_char, eol_char, null_values.as_ref(), @@ -377,7 +378,7 @@ impl LogicalPlanBuilder { ignore_errors, skip_rows, low_memory, - comment_char, + comment_prefix, quote_char, eol_char, null_values, diff --git a/crates/polars-plan/src/logical_plan/options.rs b/crates/polars-plan/src/logical_plan/options.rs index 4a850411672b..a1282a9ba461 100644 --- a/crates/polars-plan/src/logical_plan/options.rs +++ b/crates/polars-plan/src/logical_plan/options.rs @@ -4,7 +4,7 @@ use polars_core::prelude::*; #[cfg(feature = "csv")] use polars_io::csv::SerializeOptions; #[cfg(feature = "csv")] -use polars_io::csv::{CsvEncoding, NullValues}; +use polars_io::csv::{CommentPrefix, CsvEncoding, NullValues}; #[cfg(feature = "ipc")] use polars_io::ipc::IpcCompression; #[cfg(feature = "parquet")] @@ -25,7 +25,7 @@ pub type FileCount = u32; #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub struct CsvParserOptions { pub separator: u8, - pub comment_char: Option, + pub comment_prefix: Option, pub quote_char: Option, pub eol_char: u8, pub has_header: bool, diff --git a/crates/polars/tests/it/io/csv.rs b/crates/polars/tests/it/io/csv.rs index d8cd5fc99c1c..13804c5fc3c9 100644 --- a/crates/polars/tests/it/io/csv.rs +++ b/crates/polars/tests/it/io/csv.rs @@ -542,7 +542,21 @@ fn test_comment_lines() -> PolarsResult<()> { let file = Cursor::new(csv); let df = CsvReader::new(file) .has_header(false) - .with_comment_char(Some(b'#')) + .with_comment_prefix(Some("#")) + .finish()?; + assert_eq!(df.shape(), (3, 5)); + + let csv = r"!str,2,3,4,5 +!#& this is a comment +!str,2,3,4,5 +!#& this is also a comment +!str,2,3,4,5 +"; + + let file = Cursor::new(csv); + let df = CsvReader::new(file) + .has_header(false) + .with_comment_prefix(Some("!#&")) .finish()?; assert_eq!(df.shape(), (3, 5)); @@ -557,7 +571,7 @@ fn test_comment_lines() -> PolarsResult<()> { let file = Cursor::new(csv); let df = CsvReader::new(file) .has_header(true) - .with_comment_char(Some(b'%')) + .with_comment_prefix(Some("%")) .finish()?; assert_eq!(df.shape(), (3, 5)); @@ -698,7 +712,7 @@ fn test_header_with_comments() -> PolarsResult<()> { let file = Cursor::new(csv); let df = CsvReader::new(file) - .with_comment_char(Some(b'#')) + .with_comment_prefix(Some("#")) .finish()?; // 1 row. assert_eq!(df.shape(), (1, 3)); diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index 2a7e018f44e2..02ad4d21ae6b 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -661,7 +661,7 @@ def _read_csv( has_header: bool = True, columns: Sequence[int] | Sequence[str] | None = None, separator: str = ",", - comment_char: str | None = None, + comment_prefix: str | None = None, quote_char: str | None = '"', skip_rows: int = 0, dtypes: None | (SchemaDict | Sequence[PolarsDataType]) = None, @@ -740,7 +740,7 @@ def _read_csv( source, has_header=has_header, separator=separator, - comment_char=comment_char, + comment_prefix=comment_prefix, quote_char=quote_char, skip_rows=skip_rows, dtypes=dtypes_dict, @@ -789,7 +789,7 @@ def _read_csv( dtype_list, dtype_slice, low_memory, - comment_char, + comment_prefix, quote_char, processed_null_values, missing_utf8_is_empty_string, diff --git a/py-polars/polars/io/csv/batched_reader.py b/py-polars/polars/io/csv/batched_reader.py index ad3964800e5a..f3251137ed01 100644 --- a/py-polars/polars/io/csv/batched_reader.py +++ b/py-polars/polars/io/csv/batched_reader.py @@ -32,7 +32,7 @@ def __init__( has_header: bool = True, columns: Sequence[int] | Sequence[str] | None = None, separator: str = ",", - comment_char: str | None = None, + comment_prefix: str | None = None, quote_char: str | None = '"', skip_rows: int = 0, dtypes: None | (SchemaDict | Sequence[PolarsDataType]) = None, @@ -92,7 +92,7 @@ def __init__( overwrite_dtype=dtype_list, overwrite_dtype_slice=dtype_slice, low_memory=low_memory, - comment_char=comment_char, + comment_prefix=comment_prefix, quote_char=quote_char, null_values=processed_null_values, missing_utf8_is_empty_string=missing_utf8_is_empty_string, diff --git a/py-polars/polars/io/csv/functions.py b/py-polars/polars/io/csv/functions.py index aa0b8bfcaac9..cf0e93522512 100644 --- a/py-polars/polars/io/csv/functions.py +++ b/py-polars/polars/io/csv/functions.py @@ -8,6 +8,7 @@ from polars.io._utils import _prepare_file_arg from polars.io.csv._utils import _check_arg_is_1byte, _update_columns from polars.io.csv.batched_reader import BatchedCsvReader +from polars.utils.deprecation import deprecate_renamed_parameter from polars.utils.various import handle_projection_columns, normalize_filepath if TYPE_CHECKING: @@ -17,6 +18,9 @@ from polars.type_aliases import CsvEncoding, PolarsDataType, SchemaDict +@deprecate_renamed_parameter( + old_name="comment_char", new_name="comment_prefix", version="0.19.14" +) def read_csv( source: str | TextIO | BytesIO | Path | BinaryIO | bytes, *, @@ -24,7 +28,7 @@ def read_csv( columns: Sequence[int] | Sequence[str] | None = None, new_columns: Sequence[str] | None = None, separator: str = ",", - comment_char: str | None = None, + comment_prefix: str | None = None, quote_char: str | None = '"', skip_rows: int = 0, dtypes: Mapping[str, PolarsDataType] | Sequence[PolarsDataType] | None = None, @@ -74,9 +78,9 @@ def read_csv( columns will have their original name. separator Single byte character to use as separator in the file. - comment_char - Single byte character that indicates the start of a comment line, - for instance `#`. + comment_prefix + A string, which can be up to 5 symbols in length, used to indicate + the start of a comment line. For instance, it can be set to `#` or `//`. quote_char Single byte character used for csv quoting, default = `"`. Set to None to turn off special handling and escaping of quotes. @@ -185,7 +189,6 @@ def read_csv( """ _check_arg_is_1byte("separator", separator, can_be_empty=False) - _check_arg_is_1byte("comment_char", comment_char, can_be_empty=False) _check_arg_is_1byte("quote_char", quote_char, can_be_empty=True) _check_arg_is_1byte("eol_char", eol_char, can_be_empty=False) @@ -368,7 +371,7 @@ def read_csv( has_header=has_header, columns=columns if columns else projection, separator=separator, - comment_char=comment_char, + comment_prefix=comment_prefix, quote_char=quote_char, skip_rows=skip_rows, dtypes=dtypes, @@ -398,6 +401,9 @@ def read_csv( return df +@deprecate_renamed_parameter( + old_name="comment_char", new_name="comment_prefix", version="0.19.14" +) def read_csv_batched( source: str | Path, *, @@ -405,7 +411,7 @@ def read_csv_batched( columns: Sequence[int] | Sequence[str] | None = None, new_columns: Sequence[str] | None = None, separator: str = ",", - comment_char: str | None = None, + comment_prefix: str | None = None, quote_char: str | None = '"', skip_rows: int = 0, dtypes: Mapping[str, PolarsDataType] | Sequence[PolarsDataType] | None = None, @@ -455,9 +461,9 @@ def read_csv_batched( columns will have their original name. separator Single byte character to use as separator in the file. - comment_char - Single byte character that indicates the start of a comment line, - for instance `#`. + comment_prefix + A string, which can be up to 5 symbols in length, used to indicate + the start of a comment line. For instance, it can be set to `#` or `//`. quote_char Single byte character used for csv quoting, default = `"`. Set to None to turn off special handling and escaping of quotes. @@ -669,7 +675,7 @@ def read_csv_batched( has_header=has_header, columns=columns if columns else projection, separator=separator, - comment_char=comment_char, + comment_prefix=comment_prefix, quote_char=quote_char, skip_rows=skip_rows, dtypes=dtypes, @@ -694,12 +700,15 @@ def read_csv_batched( ) +@deprecate_renamed_parameter( + old_name="comment_char", new_name="comment_prefix", version="0.19.14" +) def scan_csv( source: str | Path | list[str] | list[Path], *, has_header: bool = True, separator: str = ",", - comment_char: str | None = None, + comment_prefix: str | None = None, quote_char: str | None = '"', skip_rows: int = 0, dtypes: SchemaDict | Sequence[PolarsDataType] | None = None, @@ -741,9 +750,9 @@ def scan_csv( enumeration over every column in the dataset starting at 1. separator Single byte character to use as separator in the file. - comment_char - Single byte character that indicates the start of a comment line, - for instance `#`. + comment_prefix + A string, which can be up to 5 symbols in length, used to indicate + the start of a comment line. For instance, it can be set to `#` or `//`. quote_char Single byte character used for csv quoting, default = `"`. Set to None to turn off special handling and escaping of quotes. @@ -900,7 +909,6 @@ def with_column_names(cols: list[str]) -> list[str]: return new_columns # type: ignore[return-value] _check_arg_is_1byte("separator", separator, can_be_empty=False) - _check_arg_is_1byte("comment_char", comment_char, can_be_empty=False) _check_arg_is_1byte("quote_char", quote_char, can_be_empty=True) if isinstance(source, (str, Path)): @@ -912,7 +920,7 @@ def with_column_names(cols: list[str]) -> list[str]: source, has_header=has_header, separator=separator, - comment_char=comment_char, + comment_prefix=comment_prefix, quote_char=quote_char, skip_rows=skip_rows, dtypes=dtypes, # type: ignore[arg-type] diff --git a/py-polars/polars/lazyframe/frame.py b/py-polars/polars/lazyframe/frame.py index 14284ec75711..99567d3a24d1 100644 --- a/py-polars/polars/lazyframe/frame.py +++ b/py-polars/polars/lazyframe/frame.py @@ -323,7 +323,7 @@ def _scan_csv( *, has_header: bool = True, separator: str = ",", - comment_char: str | None = None, + comment_prefix: str | None = None, quote_char: str | None = '"', skip_rows: int = 0, dtypes: SchemaDict | None = None, @@ -381,7 +381,7 @@ def _scan_csv( cache, dtype_list, low_memory, - comment_char, + comment_prefix, quote_char, processed_null_values, missing_utf8_is_empty_string, diff --git a/py-polars/src/batched_csv.rs b/py-polars/src/batched_csv.rs index db949d6ae99c..5f7a6d2e82bd 100644 --- a/py-polars/src/batched_csv.rs +++ b/py-polars/src/batched_csv.rs @@ -28,7 +28,7 @@ impl PyBatchedCsv { #[pyo3(signature = ( infer_schema_length, chunk_size, has_header, ignore_errors, n_rows, skip_rows, projection, separator, rechunk, columns, encoding, n_threads, path, overwrite_dtype, - overwrite_dtype_slice, low_memory, comment_char, quote_char, null_values, + overwrite_dtype_slice, low_memory, comment_prefix, quote_char, null_values, missing_utf8_is_empty_string, try_parse_dates, skip_rows_after_header, row_count, sample_size, eol_char, raise_if_empty, truncate_ragged_lines) )] @@ -49,7 +49,7 @@ impl PyBatchedCsv { overwrite_dtype: Option)>>, overwrite_dtype_slice: Option>>, low_memory: bool, - comment_char: Option<&str>, + comment_prefix: Option<&str>, quote_char: Option<&str>, null_values: Option>, missing_utf8_is_empty_string: bool, @@ -62,7 +62,6 @@ impl PyBatchedCsv { truncate_ragged_lines: bool, ) -> PyResult { let null_values = null_values.map(|w| w.0); - let comment_char = comment_char.map(|s| s.as_bytes()[0]); let eol_char = eol_char.as_bytes()[0]; let row_count = row_count.map(|(name, offset)| RowCount { name, offset }); let quote_char = if let Some(s) = quote_char { @@ -110,7 +109,7 @@ impl PyBatchedCsv { .with_dtypes_slice(overwrite_dtype_slice.as_deref()) .with_missing_is_null(!missing_utf8_is_empty_string) .low_memory(low_memory) - .with_comment_char(comment_char) + .with_comment_prefix(comment_prefix) .with_null_values(null_values) .with_try_parse_dates(try_parse_dates) .with_quote_char(quote_char) diff --git a/py-polars/src/dataframe.rs b/py-polars/src/dataframe.rs index ab57dbb20db7..2d91d1bd0265 100644 --- a/py-polars/src/dataframe.rs +++ b/py-polars/src/dataframe.rs @@ -172,7 +172,7 @@ impl PyDataFrame { #[pyo3(signature = ( py_f, infer_schema_length, chunk_size, has_header, ignore_errors, n_rows, skip_rows, projection, separator, rechunk, columns, encoding, n_threads, path, - overwrite_dtype, overwrite_dtype_slice, low_memory, comment_char, quote_char, + overwrite_dtype, overwrite_dtype_slice, low_memory, comment_prefix, quote_char, null_values, missing_utf8_is_empty_string, try_parse_dates, skip_rows_after_header, row_count, sample_size, eol_char, raise_if_empty, truncate_ragged_lines, schema) )] @@ -194,7 +194,7 @@ impl PyDataFrame { overwrite_dtype: Option)>>, overwrite_dtype_slice: Option>>, low_memory: bool, - comment_char: Option<&str>, + comment_prefix: Option<&str>, quote_char: Option<&str>, null_values: Option>, missing_utf8_is_empty_string: bool, @@ -208,7 +208,6 @@ impl PyDataFrame { schema: Option>, ) -> PyResult { let null_values = null_values.map(|w| w.0); - let comment_char = comment_char.map(|s| s.as_bytes()[0]); let eol_char = eol_char.as_bytes()[0]; let row_count = row_count.map(|(name, offset)| RowCount { name, offset }); let quote_char = quote_char.and_then(|s| s.as_bytes().first().copied()); @@ -251,7 +250,7 @@ impl PyDataFrame { .low_memory(low_memory) .with_null_values(null_values) .with_missing_is_null(!missing_utf8_is_empty_string) - .with_comment_char(comment_char) + .with_comment_prefix(comment_prefix) .with_try_parse_dates(try_parse_dates) .with_quote_char(quote_char) .with_end_of_line_char(eol_char) diff --git a/py-polars/src/lazyframe.rs b/py-polars/src/lazyframe.rs index 4b2a23d216af..fff4a0d4d014 100644 --- a/py-polars/src/lazyframe.rs +++ b/py-polars/src/lazyframe.rs @@ -148,7 +148,7 @@ impl PyLazyFrame { #[staticmethod] #[cfg(feature = "csv")] #[pyo3(signature = (path, paths, separator, has_header, ignore_errors, skip_rows, n_rows, cache, overwrite_dtype, - low_memory, comment_char, quote_char, null_values, missing_utf8_is_empty_string, + low_memory, comment_prefix, quote_char, null_values, missing_utf8_is_empty_string, infer_schema_length, with_schema_modify, rechunk, skip_rows_after_header, encoding, row_count, try_parse_dates, eol_char, raise_if_empty, truncate_ragged_lines, schema ) @@ -164,7 +164,7 @@ impl PyLazyFrame { cache: bool, overwrite_dtype: Option)>>, low_memory: bool, - comment_char: Option<&str>, + comment_prefix: Option<&str>, quote_char: Option<&str>, null_values: Option>, missing_utf8_is_empty_string: bool, @@ -181,7 +181,6 @@ impl PyLazyFrame { schema: Option>, ) -> PyResult { let null_values = null_values.map(|w| w.0); - let comment_char = comment_char.map(|s| s.as_bytes()[0]); let quote_char = quote_char.map(|s| s.as_bytes()[0]); let separator = separator.as_bytes()[0]; let eol_char = eol_char.as_bytes()[0]; @@ -211,7 +210,7 @@ impl PyLazyFrame { .with_dtype_overwrite(overwrite_dtype.as_ref()) .with_schema(schema.map(|schema| Arc::new(schema.0))) .low_memory(low_memory) - .with_comment_char(comment_char) + .with_comment_prefix(comment_prefix) .with_quote_char(quote_char) .with_end_of_line_char(eol_char) .with_rechunk(rechunk) diff --git a/py-polars/tests/unit/io/test_csv.py b/py-polars/tests/unit/io/test_csv.py index a19e270de850..7285fd25a90a 100644 --- a/py-polars/tests/unit/io/test_csv.py +++ b/py-polars/tests/unit/io/test_csv.py @@ -549,7 +549,7 @@ def test_empty_line_with_single_column() -> None: b"a\n\nb\n", new_columns=["A"], has_header=False, - comment_char="#", + comment_prefix="#", use_pyarrow=False, ) expected = pl.DataFrame({"A": ["a", None, "b"]}) @@ -561,13 +561,32 @@ def test_empty_line_with_multiple_columns() -> None: b"a,b\n\nc,d\n", new_columns=["A", "B"], has_header=False, - comment_char="#", + comment_prefix="#", use_pyarrow=False, ) expected = pl.DataFrame({"A": ["a", "c"], "B": ["b", "d"]}) assert_frame_equal(df, expected) +def test_csv_multi_char_comment() -> None: + csv = textwrap.dedent( + """\ + #a,b + ##c,d + """ + ) + f = io.StringIO(csv) + df = pl.read_csv( + f, + new_columns=["A", "B"], + has_header=False, + comment_prefix="##", + use_pyarrow=False, + ) + expected = pl.DataFrame({"A": ["#a"], "B": ["b"]}) + assert_frame_equal(df, expected) + + def test_csv_quote_char() -> None: expected = pl.DataFrame( [