From 22b1f5d3de766d3ad9374744556d216b9aa82c19 Mon Sep 17 00:00:00 2001
From: dmitrybugakov <bugakov.dima@gmail.com>
Date: Thu, 16 Nov 2023 16:32:34 +0100
Subject: [PATCH] feature(rust): Implement multi-character comment support in
 read_csv #10583

---
 crates/polars-io/src/csv/mod.rs               |  2 +-
 crates/polars-io/src/csv/parser.rs            | 25 ++++++---
 crates/polars-io/src/csv/read.rs              | 55 ++++++++++++++++---
 .../src/csv/read_impl/batched_mmap.rs         |  6 +-
 .../src/csv/read_impl/batched_read.rs         |  6 +-
 crates/polars-io/src/csv/read_impl/mod.rs     | 29 +++++-----
 crates/polars-io/src/csv/utils.rs             | 41 +++++++-------
 .../src/physical_plan/executors/scan/csv.rs   |  2 +-
 crates/polars-lazy/src/scan/csv.rs            | 22 +++++---
 .../polars-pipe/src/executors/sources/csv.rs  |  2 +-
 .../polars-plan/src/logical_plan/builder.rs   |  7 ++-
 .../polars-plan/src/logical_plan/options.rs   |  4 +-
 crates/polars/tests/it/io/csv.rs              | 20 ++++++-
 py-polars/polars/dataframe/frame.py           |  6 +-
 py-polars/polars/io/csv/batched_reader.py     |  4 +-
 py-polars/polars/io/csv/functions.py          | 42 ++++++++------
 py-polars/polars/lazyframe/frame.py           |  4 +-
 py-polars/src/batched_csv.rs                  |  7 +--
 py-polars/src/dataframe.rs                    |  7 +--
 py-polars/src/lazyframe.rs                    |  7 +--
 py-polars/tests/unit/io/test_csv.py           | 23 +++++++-
 21 files changed, 206 insertions(+), 115 deletions(-)
diff --git a/crates/polars-io/src/csv/mod.rs b/crates/polars-io/src/csv/mod.rs
index f5dd4b1f6e14..a6cf97246fcc 100644
--- a/crates/polars-io/src/csv/mod.rs
+++ b/crates/polars-io/src/csv/mod.rs
@@ -59,7 +59,7 @@ use polars_core::prelude::*;
 use polars_time::prelude::*;
 #[cfg(feature = "temporal")]
 use rayon::prelude::*;
-pub use read::{CsvEncoding, CsvReader, NullValues};
+pub use read::{CommentPrefix, CsvEncoding, CsvReader, NullValues};
 #[cfg(feature = "serde")]
 use serde::{Deserialize, Serialize};
 pub use write::{BatchedWriter, CsvWriter, QuoteStyle};
diff --git a/crates/polars-io/src/csv/parser.rs b/crates/polars-io/src/csv/parser.rs
index 1b7880f1352e..885f934b78da 100644
--- a/crates/polars-io/src/csv/parser.rs
+++ b/crates/polars-io/src/csv/parser.rs
@@ -5,6 +5,7 @@ use polars_core::prelude::*;
 use super::buffer::*;
 use crate::csv::read::NullValuesCompiled;
 use crate::csv::splitfields::SplitFields;
+use crate::csv::CommentPrefix;
 
 /// Skip the utf-8 Byte Order Mark.
 /// credits to csv-core
@@ -16,6 +17,17 @@ pub(crate) fn skip_bom(input: &[u8]) -> &[u8] {
     }
 }
 
+/// Returns `true` when `line` starts with the configured comment prefix.
+///
+/// With no prefix configured, or with an empty prefix, no line is treated as a comment.
+pub(crate) fn is_comment_line(line: &[u8], comment_prefix: Option<&CommentPrefix>) -> bool {
+    match comment_prefix {
+        Some(CommentPrefix::Single(c)) => line.first() == Some(c),
+        Some(CommentPrefix::Multi(s)) => !s.is_empty() && line.starts_with(s.as_bytes()),
+        None => false,
+    }
+}
+
 /// Find the nearest next line position.
 /// Does not check for new line characters embedded in String fields.
 pub(crate) fn next_line_position_naive(input: &[u8], eol_char: u8) -> Option<usize> {
@@ -351,7 +363,7 @@ pub(super) fn parse_lines<'a>(
     mut bytes: &'a [u8],
     offset: usize,
     separator: u8,
-    comment_char: Option<u8>,
+    comment_prefix: Option<&CommentPrefix>,
     quote_char: Option<u8>,
     eol_char: u8,
     missing_is_null: bool,
@@ -400,13 +412,10 @@ pub(super) fn parse_lines<'a>(
         }
 
         // deal with comments
-        if let Some(c) = comment_char {
-            // line is a comment -> skip
-            if bytes[0] == c {
-                let bytes_rem = skip_this_line(bytes, quote_char, eol_char);
-                bytes = bytes_rem;
-                continue;
-            }
+        if is_comment_line(bytes, comment_prefix) {
+            let bytes_rem = skip_this_line(bytes, quote_char, eol_char);
+            bytes = bytes_rem;
+            continue;
         }
 
         // Every line we only need to parse the columns that are projected.
diff --git a/crates/polars-io/src/csv/read.rs b/crates/polars-io/src/csv/read.rs
index 4d8527b70b80..2f3d3108ab38 100644
--- a/crates/polars-io/src/csv/read.rs
+++ b/crates/polars-io/src/csv/read.rs
@@ -25,6 +25,33 @@ pub enum NullValues {
     Named(Vec<(String, String)>),
 }
 
+/// Prefix that marks a line of a CSV file as a comment line.
+#[derive(Clone, Debug, Eq, PartialEq)]
+#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
+pub enum CommentPrefix {
+    /// A single byte character that indicates the start of a comment line.
+    Single(u8),
+    /// A string, possibly several characters long, that indicates the start of a comment line.
+    Multi(String),
+}
+
+impl CommentPrefix {
+    /// Creates a new `CommentPrefix` for the `Single` variant.
+    pub fn new_single(c: u8) -> Self {
+        CommentPrefix::Single(c)
+    }
+
+    /// Creates a new `CommentPrefix` for the `Multi` variant. Returns `None` when `s` is empty
+    /// (an empty prefix would match every line) or longer than 5 bytes (not characters).
+    pub fn new_multi(s: String) -> Option<Self> {
+        if s.is_empty() || s.len() > 5 {
+            None
+        } else {
+            Some(CommentPrefix::Multi(s))
+        }
+    }
+}
+
 pub(super) enum NullValuesCompiled {
     /// A single value that's used for all columns
     AllColumnsSingle(String),
@@ -118,7 +145,7 @@ where
     dtype_overwrite: Option<&'a [DataType]>,
     sample_size: usize,
     chunk_size: usize,
-    comment_char: Option<u8>,
+    comment_prefix: Option<CommentPrefix>,
     null_values: Option<NullValues>,
     predicate: Option<Arc<dyn PhysicalIoExpr>>,
     quote_char: Option<u8>,
@@ -210,9 +237,21 @@ where
         self
     }
 
-    /// Set the comment character. Lines starting with this character will be ignored.
-    pub fn with_comment_char(mut self, comment_char: Option<u8>) -> Self {
-        self.comment_char = comment_char;
+    /// Set the comment prefix. Lines starting with this prefix will be ignored.
+    ///
+    /// An empty prefix would match every line, so it is treated like `None`.
+    pub fn with_comment_prefix(mut self, comment_prefix: Option<&str>) -> Self {
+        self.comment_prefix = match comment_prefix {
+            None | Some("") => None,
+            Some(s) if s.len() == 1 => Some(CommentPrefix::Single(s.as_bytes()[0])),
+            Some(s) => Some(CommentPrefix::Multi(s.to_string())),
+        };
+        self
+    }
+
+    /// Sets an already-built `CommentPrefix` as-is; used internally with `CsvParserOptions`.
+    pub fn _with_comment_prefix(mut self, comment_prefix: Option<CommentPrefix>) -> Self {
+        self.comment_prefix = comment_prefix;
         self
     }
 
@@ -370,7 +409,7 @@ impl<'a, R: MmapBytesReader + 'a> CsvReader<'a, R> {
             self.sample_size,
             self.chunk_size,
             self.low_memory,
-            self.comment_char,
+            std::mem::take(&mut self.comment_prefix),
             self.quote_char,
             self.eol_char,
             std::mem::take(&mut self.null_values),
@@ -487,7 +526,7 @@ impl<'a> CsvReader<'a, Box<dyn MmapBytesReader>> {
                     None,
                     &mut self.skip_rows_before_header,
                     self.skip_rows_after_header,
-                    self.comment_char,
+                    self.comment_prefix.as_ref(),
                     self.quote_char,
                     self.eol_char,
                     self.null_values.as_ref(),
@@ -516,7 +555,7 @@ impl<'a> CsvReader<'a, Box<dyn MmapBytesReader>> {
                     None,
                     &mut self.skip_rows_before_header,
                     self.skip_rows_after_header,
-                    self.comment_char,
+                    self.comment_prefix.as_ref(),
                     self.quote_char,
                     self.eol_char,
                     self.null_values.as_ref(),
@@ -556,7 +595,7 @@ where
             sample_size: 1024,
             chunk_size: 1 << 18,
             low_memory: false,
-            comment_char: None,
+            comment_prefix: None,
             eol_char: b'\n',
             null_values: None,
             missing_is_null: true,
diff --git a/crates/polars-io/src/csv/read_impl/batched_mmap.rs b/crates/polars-io/src/csv/read_impl/batched_mmap.rs
index f0299ca40fe9..3b0083f51092 100644
--- a/crates/polars-io/src/csv/read_impl/batched_mmap.rs
+++ b/crates/polars-io/src/csv/read_impl/batched_mmap.rs
@@ -154,7 +154,7 @@ impl<'a> CoreReader<'a> {
             projection,
             starting_point_offset,
             row_count: self.row_count,
-            comment_char: self.comment_char,
+            comment_prefix: self.comment_prefix,
             quote_char: self.quote_char,
             eol_char: self.eol_char,
             null_values: self.null_values,
@@ -182,7 +182,7 @@ pub struct BatchedCsvReaderMmap<'a> {
     projection: Vec<usize>,
     starting_point_offset: Option<usize>,
     row_count: Option<RowCount>,
-    comment_char: Option<u8>,
+    comment_prefix: Option<CommentPrefix>,
     quote_char: Option<u8>,
     eol_char: u8,
     null_values: Option<NullValuesCompiled>,
@@ -240,7 +240,7 @@ impl<'a> BatchedCsvReaderMmap<'a> {
                         bytes_offset_thread,
                         self.quote_char,
                         self.eol_char,
-                        self.comment_char,
+                        self.comment_prefix.as_ref(),
                         self.chunk_size,
                         &self.str_capacities,
                         self.encoding,
diff --git a/crates/polars-io/src/csv/read_impl/batched_read.rs b/crates/polars-io/src/csv/read_impl/batched_read.rs
index 9e8e6b6e6836..753cc281110e 100644
--- a/crates/polars-io/src/csv/read_impl/batched_read.rs
+++ b/crates/polars-io/src/csv/read_impl/batched_read.rs
@@ -237,7 +237,7 @@ impl<'a> CoreReader<'a> {
             projection,
             starting_point_offset,
             row_count: self.row_count,
-            comment_char: self.comment_char,
+            comment_prefix: self.comment_prefix,
             quote_char: self.quote_char,
             eol_char: self.eol_char,
             null_values: self.null_values,
@@ -265,7 +265,7 @@ pub struct BatchedCsvReaderRead<'a> {
     projection: Vec<usize>,
     starting_point_offset: Option<usize>,
     row_count: Option<RowCount>,
-    comment_char: Option<u8>,
+    comment_prefix: Option<CommentPrefix>,
     quote_char: Option<u8>,
     eol_char: u8,
     null_values: Option<NullValuesCompiled>,
@@ -337,7 +337,7 @@ impl<'a> BatchedCsvReaderRead<'a> {
                         0,
                         self.quote_char,
                         self.eol_char,
-                        self.comment_char,
+                        self.comment_prefix.as_ref(),
                         self.chunk_size,
                         &self.str_capacities,
                         self.encoding,
diff --git a/crates/polars-io/src/csv/read_impl/mod.rs b/crates/polars-io/src/csv/read_impl/mod.rs
index aa077a9e61a2..29c04bac35a1 100644
--- a/crates/polars-io/src/csv/read_impl/mod.rs
+++ b/crates/polars-io/src/csv/read_impl/mod.rs
@@ -20,7 +20,7 @@ use rayon::prelude::*;
 
 use crate::csv::buffer::*;
 use crate::csv::parser::*;
-use crate::csv::read::NullValuesCompiled;
+use crate::csv::read::{CommentPrefix, NullValuesCompiled};
 use crate::csv::utils::*;
 use crate::csv::{CsvEncoding, NullValues};
 use crate::mmap::ReaderBytes;
@@ -109,7 +109,7 @@ pub(crate) struct CoreReader<'a> {
     sample_size: usize,
     chunk_size: usize,
     low_memory: bool,
-    comment_char: Option<u8>,
+    comment_prefix: Option<CommentPrefix>,
     quote_char: Option<u8>,
     eol_char: u8,
     null_values: Option<NullValuesCompiled>,
@@ -198,7 +198,7 @@ impl<'a> CoreReader<'a> {
         sample_size: usize,
         chunk_size: usize,
         low_memory: bool,
-        comment_char: Option<u8>,
+        comment_prefix: Option<CommentPrefix>,
         quote_char: Option<u8>,
         eol_char: u8,
         null_values: Option<NullValues>,
@@ -247,7 +247,7 @@ impl<'a> CoreReader<'a> {
                         schema_overwrite.as_deref(),
                         &mut skip_rows,
                         skip_rows_after_header,
-                        comment_char,
+                        comment_prefix.as_ref(),
                         quote_char,
                         eol_char,
                         null_values.as_ref(),
@@ -299,7 +299,7 @@ impl<'a> CoreReader<'a> {
             sample_size,
             chunk_size,
             low_memory,
-            comment_char,
+            comment_prefix,
             quote_char,
             eol_char,
             null_values,
@@ -342,14 +342,13 @@ impl<'a> CoreReader<'a> {
 
         if self.skip_rows_after_header > 0 {
             for _ in 0..self.skip_rows_after_header {
-                let pos = match bytes.first() {
-                    Some(first) if Some(*first) == self.comment_char => {
-                        next_line_position_naive(bytes, eol_char)
-                    },
+                let pos = if is_comment_line(bytes, self.comment_prefix.as_ref()) {
+                    next_line_position_naive(bytes, eol_char)
+                } else {
                     // we don't pass expected fields
                     // as we want to skip all rows
                     // no matter the no. of fields
-                    _ => next_line_position(bytes, None, self.separator, self.quote_char, eol_char),
+                    next_line_position(bytes, None, self.separator, self.quote_char, eol_char)
                 }
                 .ok_or_else(|| polars_err!(NoData: "not enough lines to skip"))?;
 
@@ -598,7 +597,7 @@ impl<'a> CoreReader<'a> {
                                 local_bytes,
                                 offset,
                                 self.separator,
-                                self.comment_char,
+                                self.comment_prefix.as_ref(),
                                 self.quote_char,
                                 self.eol_char,
                                 self.missing_is_null,
@@ -670,7 +669,7 @@ impl<'a> CoreReader<'a> {
                             bytes_offset_thread,
                             self.quote_char,
                             self.eol_char,
-                            self.comment_char,
+                            self.comment_prefix.as_ref(),
                             capacity,
                             &str_capacities,
                             self.encoding,
@@ -716,7 +715,7 @@ impl<'a> CoreReader<'a> {
                                 remaining_bytes,
                                 0,
                                 self.separator,
-                                self.comment_char,
+                                self.comment_prefix.as_ref(),
                                 self.quote_char,
                                 self.eol_char,
                                 self.missing_is_null,
@@ -800,7 +799,7 @@ fn read_chunk(
     bytes_offset_thread: usize,
     quote_char: Option<u8>,
     eol_char: u8,
-    comment_char: Option<u8>,
+    comment_prefix: Option<&CommentPrefix>,
     capacity: usize,
     str_capacities: &[RunningSize],
     encoding: CsvEncoding,
@@ -835,7 +834,7 @@ fn read_chunk(
             local_bytes,
             offset,
             separator,
-            comment_char,
+            comment_prefix,
             quote_char,
             eol_char,
             missing_is_null,
diff --git a/crates/polars-io/src/csv/utils.rs b/crates/polars-io/src/csv/utils.rs
index 6f7ef7c054f9..8ff7608eeaec 100644
--- a/crates/polars-io/src/csv/utils.rs
+++ b/crates/polars-io/src/csv/utils.rs
@@ -16,7 +16,8 @@ use crate::csv::parser::{next_line_position, skip_bom, skip_line_ending, SplitLi
 use crate::csv::splitfields::SplitFields;
 use crate::csv::CsvEncoding;
 use crate::mmap::ReaderBytes;
-use crate::prelude::NullValues;
+use crate::prelude::parser::is_comment_line;
+use crate::prelude::{CommentPrefix, NullValues};
 use crate::utils::{BOOLEAN_RE, FLOAT_RE, INTEGER_RE};
 
 pub(crate) fn get_file_chunks(
@@ -142,7 +143,7 @@ pub fn infer_file_schema_inner(
     // on the schema inference
     skip_rows: &mut usize,
     skip_rows_after_header: usize,
-    comment_char: Option<u8>,
+    comment_prefix: Option<&CommentPrefix>,
     quote_char: Option<u8>,
     eol_char: u8,
     null_values: Option<&NullValues>,
@@ -170,19 +171,19 @@ pub fn infer_file_schema_inner(
 
     // skip lines that are comments
     let mut first_line = None;
-    if let Some(comment_ch) = comment_char {
-        for (i, line) in (&mut lines).enumerate() {
-            if let Some(ch) = line.first() {
-                if *ch != comment_ch {
-                    first_line = Some(line);
-                    *skip_rows += i;
-                    break;
-                }
-            }
+
+    for (i, line) in (&mut lines).enumerate() {
+        if !is_comment_line(line, comment_prefix) {
+            first_line = Some(line);
+            *skip_rows += i;
+            break;
         }
-    } else {
+    }
+
+    if first_line.is_none() {
         first_line = lines.next();
     }
+
     // edge case where we have a single row, no header and no eol char.
     if first_line.is_none() && !has_eol && !has_header {
         first_line = Some(bytes);
@@ -254,7 +255,7 @@ pub fn infer_file_schema_inner(
             schema_overwrite,
             skip_rows,
             skip_rows_after_header,
-            comment_char,
+            comment_prefix,
             quote_char,
             eol_char,
             null_values,
@@ -310,11 +311,9 @@ pub fn infer_file_schema_inner(
             continue;
         }
 
-        if let Some(c) = comment_char {
-            // line is a comment -> skip
-            if line[0] == c {
-                continue;
-            }
+        // line is a comment -> skip
+        if is_comment_line(line, comment_prefix) {
+            continue;
         }
 
         let len = line.len();
@@ -448,7 +447,7 @@ pub fn infer_file_schema_inner(
             schema_overwrite,
             skip_rows,
             skip_rows_after_header,
-            comment_char,
+            comment_prefix,
             quote_char,
             eol_char,
             null_values,
@@ -481,7 +480,7 @@ pub fn infer_file_schema(
     // on the schema inference
     skip_rows: &mut usize,
     skip_rows_after_header: usize,
-    comment_char: Option<u8>,
+    comment_prefix: Option<&CommentPrefix>,
     quote_char: Option<u8>,
     eol_char: u8,
     null_values: Option<&NullValues>,
@@ -496,7 +495,7 @@ pub fn infer_file_schema(
         schema_overwrite,
         skip_rows,
         skip_rows_after_header,
-        comment_char,
+        comment_prefix,
         quote_char,
         eol_char,
         null_values,
diff --git a/crates/polars-lazy/src/physical_plan/executors/scan/csv.rs b/crates/polars-lazy/src/physical_plan/executors/scan/csv.rs
index bac591b84f86..f05875342555 100644
--- a/crates/polars-lazy/src/physical_plan/executors/scan/csv.rs
+++ b/crates/polars-lazy/src/physical_plan/executors/scan/csv.rs
@@ -38,7 +38,7 @@ impl CsvExec {
             .with_null_values(std::mem::take(&mut self.options.null_values))
             .with_predicate(predicate)
             .with_encoding(CsvEncoding::LossyUtf8)
-            .with_comment_char(self.options.comment_char)
+            ._with_comment_prefix(std::mem::take(&mut self.options.comment_prefix))
             .with_quote_char(self.options.quote_char)
             .with_end_of_line_char(self.options.eol_char)
             .with_encoding(self.options.encoding)
diff --git a/crates/polars-lazy/src/scan/csv.rs b/crates/polars-lazy/src/scan/csv.rs
index 2543dcbea92b..d0b15218a506 100644
--- a/crates/polars-lazy/src/scan/csv.rs
+++ b/crates/polars-lazy/src/scan/csv.rs
@@ -2,7 +2,7 @@ use std::path::{Path, PathBuf};
 
 use polars_core::prelude::*;
 use polars_io::csv::utils::infer_file_schema;
-use polars_io::csv::{CsvEncoding, NullValues};
+use polars_io::csv::{CommentPrefix, CsvEncoding, NullValues};
 use polars_io::utils::get_reader_bytes;
 use polars_io::RowCount;
 
@@ -23,7 +23,7 @@ pub struct LazyCsvReader<'a> {
     schema: Option<SchemaRef>,
     schema_overwrite: Option<&'a Schema>,
     low_memory: bool,
-    comment_char: Option<u8>,
+    comment_prefix: Option<CommentPrefix>,
     quote_char: Option<u8>,
     eol_char: u8,
     null_values: Option<NullValues>,
@@ -57,7 +57,7 @@ impl<'a> LazyCsvReader<'a> {
             schema: None,
             schema_overwrite: None,
             low_memory: false,
-            comment_char: None,
+            comment_prefix: None,
             quote_char: Some(b'"'),
             eol_char: b'\n',
             null_values: None,
@@ -147,10 +147,16 @@ impl<'a> LazyCsvReader<'a> {
         self
     }
 
-    /// Set the comment character. Lines starting with this character will be ignored.
+    /// Set the comment prefix for this instance. Lines starting with this prefix will be ignored.
     #[must_use]
-    pub fn with_comment_char(mut self, comment_char: Option<u8>) -> Self {
-        self.comment_char = comment_char;
+    pub fn with_comment_prefix(mut self, comment_prefix: Option<&str>) -> Self {
+        // An empty prefix would match every line, so treat it like `None`.
+        // A one-byte `&str` is always ASCII, so it maps to `Single`.
+        self.comment_prefix = match comment_prefix {
+            None | Some("") => None,
+            Some(s) if s.len() == 1 => Some(CommentPrefix::Single(s.as_bytes()[0])),
+            Some(s) => Some(CommentPrefix::Multi(s.to_string())),
+        };
         self
     }
 
@@ -252,7 +258,7 @@ impl<'a> LazyCsvReader<'a> {
             None,
             &mut skip_rows,
             self.skip_rows_after_header,
-            self.comment_char,
+            self.comment_prefix.as_ref(),
             self.quote_char,
             self.eol_char,
             None,
@@ -285,7 +291,7 @@ impl LazyFileListReader for LazyCsvReader<'_> {
             self.schema,
             self.schema_overwrite,
             self.low_memory,
-            self.comment_char,
+            self.comment_prefix,
             self.quote_char,
             self.eol_char,
             self.null_values,
diff --git a/crates/polars-pipe/src/executors/sources/csv.rs b/crates/polars-pipe/src/executors/sources/csv.rs
index b1297c39e07c..271cc9d5c621 100644
--- a/crates/polars-pipe/src/executors/sources/csv.rs
+++ b/crates/polars-pipe/src/executors/sources/csv.rs
@@ -71,7 +71,7 @@ impl CsvSource {
             .low_memory(options.low_memory)
             .with_null_values(options.null_values)
             .with_encoding(CsvEncoding::LossyUtf8)
-            .with_comment_char(options.comment_char)
+            ._with_comment_prefix(options.comment_prefix)
             .with_quote_char(options.quote_char)
             .with_end_of_line_char(options.eol_char)
             .with_encoding(options.encoding)
diff --git a/crates/polars-plan/src/logical_plan/builder.rs b/crates/polars-plan/src/logical_plan/builder.rs
index 3612e05e3f4d..43ca98e91c9d 100644
--- a/crates/polars-plan/src/logical_plan/builder.rs
+++ b/crates/polars-plan/src/logical_plan/builder.rs
@@ -23,6 +23,7 @@ use polars_io::RowCount;
 #[cfg(feature = "csv")]
 use polars_io::{
     csv::utils::{infer_file_schema, is_compressed},
+    csv::CommentPrefix,
     csv::CsvEncoding,
     csv::NullValues,
     utils::get_reader_bytes,
@@ -285,7 +286,7 @@ impl LogicalPlanBuilder {
         mut schema: Option<Arc<Schema>>,
         schema_overwrite: Option<&Schema>,
         low_memory: bool,
-        comment_char: Option<u8>,
+        comment_prefix: Option<CommentPrefix>,
         quote_char: Option<u8>,
         eol_char: u8,
         null_values: Option<NullValues>,
@@ -325,7 +326,7 @@ impl LogicalPlanBuilder {
             schema_overwrite,
             &mut skip_rows,
             skip_rows_after_header,
-            comment_char,
+            comment_prefix.as_ref(),
             quote_char,
             eol_char,
             null_values.as_ref(),
@@ -377,7 +378,7 @@ impl LogicalPlanBuilder {
                     ignore_errors,
                     skip_rows,
                     low_memory,
-                    comment_char,
+                    comment_prefix,
                     quote_char,
                     eol_char,
                     null_values,
diff --git a/crates/polars-plan/src/logical_plan/options.rs b/crates/polars-plan/src/logical_plan/options.rs
index 4a850411672b..a1282a9ba461 100644
--- a/crates/polars-plan/src/logical_plan/options.rs
+++ b/crates/polars-plan/src/logical_plan/options.rs
@@ -4,7 +4,7 @@ use polars_core::prelude::*;
 #[cfg(feature = "csv")]
 use polars_io::csv::SerializeOptions;
 #[cfg(feature = "csv")]
-use polars_io::csv::{CsvEncoding, NullValues};
+use polars_io::csv::{CommentPrefix, CsvEncoding, NullValues};
 #[cfg(feature = "ipc")]
 use polars_io::ipc::IpcCompression;
 #[cfg(feature = "parquet")]
@@ -25,7 +25,7 @@ pub type FileCount = u32;
 #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
 pub struct CsvParserOptions {
     pub separator: u8,
-    pub comment_char: Option<u8>,
+    pub comment_prefix: Option<CommentPrefix>,
     pub quote_char: Option<u8>,
     pub eol_char: u8,
     pub has_header: bool,
diff --git a/crates/polars/tests/it/io/csv.rs b/crates/polars/tests/it/io/csv.rs
index d8cd5fc99c1c..13804c5fc3c9 100644
--- a/crates/polars/tests/it/io/csv.rs
+++ b/crates/polars/tests/it/io/csv.rs
@@ -542,7 +542,21 @@ fn test_comment_lines() -> PolarsResult<()> {
     let file = Cursor::new(csv);
     let df = CsvReader::new(file)
         .has_header(false)
-        .with_comment_char(Some(b'#'))
+        .with_comment_prefix(Some("#"))
+        .finish()?;
+    assert_eq!(df.shape(), (3, 5));
+
+    let csv = r"!str,2,3,4,5
+!#& this is a comment
+!str,2,3,4,5
+!#& this is also a comment
+!str,2,3,4,5
+";
+
+    let file = Cursor::new(csv);
+    let df = CsvReader::new(file)
+        .has_header(false)
+        .with_comment_prefix(Some("!#&"))
         .finish()?;
     assert_eq!(df.shape(), (3, 5));
 
@@ -557,7 +571,7 @@ fn test_comment_lines() -> PolarsResult<()> {
     let file = Cursor::new(csv);
     let df = CsvReader::new(file)
         .has_header(true)
-        .with_comment_char(Some(b'%'))
+        .with_comment_prefix(Some("%"))
         .finish()?;
     assert_eq!(df.shape(), (3, 5));
 
@@ -698,7 +712,7 @@ fn test_header_with_comments() -> PolarsResult<()> {
 
     let file = Cursor::new(csv);
     let df = CsvReader::new(file)
-        .with_comment_char(Some(b'#'))
+        .with_comment_prefix(Some("#"))
         .finish()?;
     // 1 row.
     assert_eq!(df.shape(), (1, 3));
diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py
index 2a7e018f44e2..02ad4d21ae6b 100644
--- a/py-polars/polars/dataframe/frame.py
+++ b/py-polars/polars/dataframe/frame.py
@@ -661,7 +661,7 @@ def _read_csv(
         has_header: bool = True,
         columns: Sequence[int] | Sequence[str] | None = None,
         separator: str = ",",
-        comment_char: str | None = None,
+        comment_prefix: str | None = None,
         quote_char: str | None = '"',
         skip_rows: int = 0,
         dtypes: None | (SchemaDict | Sequence[PolarsDataType]) = None,
@@ -740,7 +740,7 @@ def _read_csv(
                 source,
                 has_header=has_header,
                 separator=separator,
-                comment_char=comment_char,
+                comment_prefix=comment_prefix,
                 quote_char=quote_char,
                 skip_rows=skip_rows,
                 dtypes=dtypes_dict,
@@ -789,7 +789,7 @@ def _read_csv(
             dtype_list,
             dtype_slice,
             low_memory,
-            comment_char,
+            comment_prefix,
             quote_char,
             processed_null_values,
             missing_utf8_is_empty_string,
diff --git a/py-polars/polars/io/csv/batched_reader.py b/py-polars/polars/io/csv/batched_reader.py
index ad3964800e5a..f3251137ed01 100644
--- a/py-polars/polars/io/csv/batched_reader.py
+++ b/py-polars/polars/io/csv/batched_reader.py
@@ -32,7 +32,7 @@ def __init__(
         has_header: bool = True,
         columns: Sequence[int] | Sequence[str] | None = None,
         separator: str = ",",
-        comment_char: str | None = None,
+        comment_prefix: str | None = None,
         quote_char: str | None = '"',
         skip_rows: int = 0,
         dtypes: None | (SchemaDict | Sequence[PolarsDataType]) = None,
@@ -92,7 +92,7 @@ def __init__(
             overwrite_dtype=dtype_list,
             overwrite_dtype_slice=dtype_slice,
             low_memory=low_memory,
-            comment_char=comment_char,
+            comment_prefix=comment_prefix,
             quote_char=quote_char,
             null_values=processed_null_values,
             missing_utf8_is_empty_string=missing_utf8_is_empty_string,
diff --git a/py-polars/polars/io/csv/functions.py b/py-polars/polars/io/csv/functions.py
index aa0b8bfcaac9..cf0e93522512 100644
--- a/py-polars/polars/io/csv/functions.py
+++ b/py-polars/polars/io/csv/functions.py
@@ -8,6 +8,7 @@
 from polars.io._utils import _prepare_file_arg
 from polars.io.csv._utils import _check_arg_is_1byte, _update_columns
 from polars.io.csv.batched_reader import BatchedCsvReader
+from polars.utils.deprecation import deprecate_renamed_parameter
 from polars.utils.various import handle_projection_columns, normalize_filepath
 
 if TYPE_CHECKING:
@@ -17,6 +18,9 @@
     from polars.type_aliases import CsvEncoding, PolarsDataType, SchemaDict
 
 
+@deprecate_renamed_parameter(
+    old_name="comment_char", new_name="comment_prefix", version="0.19.14"
+)
 def read_csv(
     source: str | TextIO | BytesIO | Path | BinaryIO | bytes,
     *,
@@ -24,7 +28,7 @@ def read_csv(
     columns: Sequence[int] | Sequence[str] | None = None,
     new_columns: Sequence[str] | None = None,
     separator: str = ",",
-    comment_char: str | None = None,
+    comment_prefix: str | None = None,
     quote_char: str | None = '"',
     skip_rows: int = 0,
     dtypes: Mapping[str, PolarsDataType] | Sequence[PolarsDataType] | None = None,
@@ -74,9 +78,9 @@ def read_csv(
         columns will have their original name.
     separator
         Single byte character to use as separator in the file.
-    comment_char
-        Single byte character that indicates the start of a comment line,
-        for instance `#`.
+    comment_prefix
+        A string, which can be up to 5 symbols in length, used to indicate
+        the start of a comment line. For instance, it can be set to `#` or `//`.
     quote_char
         Single byte character used for csv quoting, default = `"`.
         Set to None to turn off special handling and escaping of quotes.
@@ -185,7 +189,6 @@ def read_csv(
 
     """
     _check_arg_is_1byte("separator", separator, can_be_empty=False)
-    _check_arg_is_1byte("comment_char", comment_char, can_be_empty=False)
     _check_arg_is_1byte("quote_char", quote_char, can_be_empty=True)
     _check_arg_is_1byte("eol_char", eol_char, can_be_empty=False)
 
@@ -368,7 +371,7 @@ def read_csv(
             has_header=has_header,
             columns=columns if columns else projection,
             separator=separator,
-            comment_char=comment_char,
+            comment_prefix=comment_prefix,
             quote_char=quote_char,
             skip_rows=skip_rows,
             dtypes=dtypes,
@@ -398,6 +401,9 @@ def read_csv(
     return df
 
 
+@deprecate_renamed_parameter(
+    old_name="comment_char", new_name="comment_prefix", version="0.19.14"
+)
 def read_csv_batched(
     source: str | Path,
     *,
@@ -405,7 +411,7 @@ def read_csv_batched(
     columns: Sequence[int] | Sequence[str] | None = None,
     new_columns: Sequence[str] | None = None,
     separator: str = ",",
-    comment_char: str | None = None,
+    comment_prefix: str | None = None,
     quote_char: str | None = '"',
     skip_rows: int = 0,
     dtypes: Mapping[str, PolarsDataType] | Sequence[PolarsDataType] | None = None,
@@ -455,9 +461,9 @@ def read_csv_batched(
         columns will have their original name.
     separator
         Single byte character to use as separator in the file.
-    comment_char
-        Single byte character that indicates the start of a comment line,
-        for instance `#`.
+    comment_prefix
+        A string of up to 5 characters that marks the start of a comment line,
+        for instance `#` or `//`. Lines beginning with this prefix are skipped.
     quote_char
         Single byte character used for csv quoting, default = `"`.
         Set to None to turn off special handling and escaping of quotes.
@@ -669,7 +675,7 @@ def read_csv_batched(
         has_header=has_header,
         columns=columns if columns else projection,
         separator=separator,
-        comment_char=comment_char,
+        comment_prefix=comment_prefix,
         quote_char=quote_char,
         skip_rows=skip_rows,
         dtypes=dtypes,
@@ -694,12 +700,15 @@ def read_csv_batched(
     )
 
 
+@deprecate_renamed_parameter(
+    old_name="comment_char", new_name="comment_prefix", version="0.19.14"
+)
 def scan_csv(
     source: str | Path | list[str] | list[Path],
     *,
     has_header: bool = True,
     separator: str = ",",
-    comment_char: str | None = None,
+    comment_prefix: str | None = None,
     quote_char: str | None = '"',
     skip_rows: int = 0,
     dtypes: SchemaDict | Sequence[PolarsDataType] | None = None,
@@ -741,9 +750,9 @@ def scan_csv(
         enumeration over every column in the dataset starting at 1.
     separator
         Single byte character to use as separator in the file.
-    comment_char
-        Single byte character that indicates the start of a comment line,
-        for instance `#`.
+    comment_prefix
+        A string of up to 5 characters that marks the start of a comment line,
+        for instance `#` or `//`. Lines beginning with this prefix are skipped.
     quote_char
         Single byte character used for csv quoting, default = `"`.
         Set to None to turn off special handling and escaping of quotes.
@@ -900,7 +909,6 @@ def with_column_names(cols: list[str]) -> list[str]:
                 return new_columns  # type: ignore[return-value]
 
     _check_arg_is_1byte("separator", separator, can_be_empty=False)
-    _check_arg_is_1byte("comment_char", comment_char, can_be_empty=False)
     _check_arg_is_1byte("quote_char", quote_char, can_be_empty=True)
 
     if isinstance(source, (str, Path)):
@@ -912,7 +920,7 @@ def with_column_names(cols: list[str]) -> list[str]:
         source,
         has_header=has_header,
         separator=separator,
-        comment_char=comment_char,
+        comment_prefix=comment_prefix,
         quote_char=quote_char,
         skip_rows=skip_rows,
         dtypes=dtypes,  # type: ignore[arg-type]
diff --git a/py-polars/polars/lazyframe/frame.py b/py-polars/polars/lazyframe/frame.py
index 14284ec75711..99567d3a24d1 100644
--- a/py-polars/polars/lazyframe/frame.py
+++ b/py-polars/polars/lazyframe/frame.py
@@ -323,7 +323,7 @@ def _scan_csv(
         *,
         has_header: bool = True,
         separator: str = ",",
-        comment_char: str | None = None,
+        comment_prefix: str | None = None,
         quote_char: str | None = '"',
         skip_rows: int = 0,
         dtypes: SchemaDict | None = None,
@@ -381,7 +381,7 @@ def _scan_csv(
             cache,
             dtype_list,
             low_memory,
-            comment_char,
+            comment_prefix,
             quote_char,
             processed_null_values,
             missing_utf8_is_empty_string,
diff --git a/py-polars/src/batched_csv.rs b/py-polars/src/batched_csv.rs
index db949d6ae99c..5f7a6d2e82bd 100644
--- a/py-polars/src/batched_csv.rs
+++ b/py-polars/src/batched_csv.rs
@@ -28,7 +28,7 @@ impl PyBatchedCsv {
     #[pyo3(signature = (
         infer_schema_length, chunk_size, has_header, ignore_errors, n_rows, skip_rows,
         projection, separator, rechunk, columns, encoding, n_threads, path, overwrite_dtype,
-        overwrite_dtype_slice, low_memory, comment_char, quote_char, null_values,
+        overwrite_dtype_slice, low_memory, comment_prefix, quote_char, null_values,
         missing_utf8_is_empty_string, try_parse_dates, skip_rows_after_header, row_count,
         sample_size, eol_char, raise_if_empty, truncate_ragged_lines)
     )]
@@ -49,7 +49,7 @@ impl PyBatchedCsv {
         overwrite_dtype: Option<Vec<(&str, Wrap<DataType>)>>,
         overwrite_dtype_slice: Option<Vec<Wrap<DataType>>>,
         low_memory: bool,
-        comment_char: Option<&str>,
+        comment_prefix: Option<&str>,
         quote_char: Option<&str>,
         null_values: Option<Wrap<NullValues>>,
         missing_utf8_is_empty_string: bool,
@@ -62,7 +62,6 @@ impl PyBatchedCsv {
         truncate_ragged_lines: bool,
     ) -> PyResult<PyBatchedCsv> {
         let null_values = null_values.map(|w| w.0);
-        let comment_char = comment_char.map(|s| s.as_bytes()[0]);
         let eol_char = eol_char.as_bytes()[0];
         let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
         let quote_char = if let Some(s) = quote_char {
@@ -110,7 +109,7 @@ impl PyBatchedCsv {
             .with_dtypes_slice(overwrite_dtype_slice.as_deref())
             .with_missing_is_null(!missing_utf8_is_empty_string)
             .low_memory(low_memory)
-            .with_comment_char(comment_char)
+            .with_comment_prefix(comment_prefix)
             .with_null_values(null_values)
             .with_try_parse_dates(try_parse_dates)
             .with_quote_char(quote_char)
diff --git a/py-polars/src/dataframe.rs b/py-polars/src/dataframe.rs
index ab57dbb20db7..2d91d1bd0265 100644
--- a/py-polars/src/dataframe.rs
+++ b/py-polars/src/dataframe.rs
@@ -172,7 +172,7 @@ impl PyDataFrame {
     #[pyo3(signature = (
         py_f, infer_schema_length, chunk_size, has_header, ignore_errors, n_rows,
         skip_rows, projection, separator, rechunk, columns, encoding, n_threads, path,
-        overwrite_dtype, overwrite_dtype_slice, low_memory, comment_char, quote_char,
+        overwrite_dtype, overwrite_dtype_slice, low_memory, comment_prefix, quote_char,
         null_values, missing_utf8_is_empty_string, try_parse_dates, skip_rows_after_header,
         row_count, sample_size, eol_char, raise_if_empty, truncate_ragged_lines, schema)
     )]
@@ -194,7 +194,7 @@ impl PyDataFrame {
         overwrite_dtype: Option<Vec<(&str, Wrap<DataType>)>>,
         overwrite_dtype_slice: Option<Vec<Wrap<DataType>>>,
         low_memory: bool,
-        comment_char: Option<&str>,
+        comment_prefix: Option<&str>,
         quote_char: Option<&str>,
         null_values: Option<Wrap<NullValues>>,
         missing_utf8_is_empty_string: bool,
@@ -208,7 +208,6 @@ impl PyDataFrame {
         schema: Option<Wrap<Schema>>,
     ) -> PyResult<Self> {
         let null_values = null_values.map(|w| w.0);
-        let comment_char = comment_char.map(|s| s.as_bytes()[0]);
         let eol_char = eol_char.as_bytes()[0];
         let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
         let quote_char = quote_char.and_then(|s| s.as_bytes().first().copied());
@@ -251,7 +250,7 @@ impl PyDataFrame {
             .low_memory(low_memory)
             .with_null_values(null_values)
             .with_missing_is_null(!missing_utf8_is_empty_string)
-            .with_comment_char(comment_char)
+            .with_comment_prefix(comment_prefix)
             .with_try_parse_dates(try_parse_dates)
             .with_quote_char(quote_char)
             .with_end_of_line_char(eol_char)
diff --git a/py-polars/src/lazyframe.rs b/py-polars/src/lazyframe.rs
index 4b2a23d216af..fff4a0d4d014 100644
--- a/py-polars/src/lazyframe.rs
+++ b/py-polars/src/lazyframe.rs
@@ -148,7 +148,7 @@ impl PyLazyFrame {
     #[staticmethod]
     #[cfg(feature = "csv")]
     #[pyo3(signature = (path, paths, separator, has_header, ignore_errors, skip_rows, n_rows, cache, overwrite_dtype,
-        low_memory, comment_char, quote_char, null_values, missing_utf8_is_empty_string,
+        low_memory, comment_prefix, quote_char, null_values, missing_utf8_is_empty_string,
         infer_schema_length, with_schema_modify, rechunk, skip_rows_after_header,
         encoding, row_count, try_parse_dates, eol_char, raise_if_empty, truncate_ragged_lines, schema
     )
@@ -164,7 +164,7 @@ impl PyLazyFrame {
         cache: bool,
         overwrite_dtype: Option<Vec<(&str, Wrap<DataType>)>>,
         low_memory: bool,
-        comment_char: Option<&str>,
+        comment_prefix: Option<&str>,
         quote_char: Option<&str>,
         null_values: Option<Wrap<NullValues>>,
         missing_utf8_is_empty_string: bool,
@@ -181,7 +181,6 @@ impl PyLazyFrame {
         schema: Option<Wrap<Schema>>,
     ) -> PyResult<Self> {
         let null_values = null_values.map(|w| w.0);
-        let comment_char = comment_char.map(|s| s.as_bytes()[0]);
         let quote_char = quote_char.map(|s| s.as_bytes()[0]);
         let separator = separator.as_bytes()[0];
         let eol_char = eol_char.as_bytes()[0];
@@ -211,7 +210,7 @@ impl PyLazyFrame {
             .with_dtype_overwrite(overwrite_dtype.as_ref())
             .with_schema(schema.map(|schema| Arc::new(schema.0)))
             .low_memory(low_memory)
-            .with_comment_char(comment_char)
+            .with_comment_prefix(comment_prefix)
             .with_quote_char(quote_char)
             .with_end_of_line_char(eol_char)
             .with_rechunk(rechunk)
diff --git a/py-polars/tests/unit/io/test_csv.py b/py-polars/tests/unit/io/test_csv.py
index a19e270de850..7285fd25a90a 100644
--- a/py-polars/tests/unit/io/test_csv.py
+++ b/py-polars/tests/unit/io/test_csv.py
@@ -549,7 +549,7 @@ def test_empty_line_with_single_column() -> None:
         b"a\n\nb\n",
         new_columns=["A"],
         has_header=False,
-        comment_char="#",
+        comment_prefix="#",
         use_pyarrow=False,
     )
     expected = pl.DataFrame({"A": ["a", None, "b"]})
@@ -561,13 +561,32 @@ def test_empty_line_with_multiple_columns() -> None:
         b"a,b\n\nc,d\n",
         new_columns=["A", "B"],
         has_header=False,
-        comment_char="#",
+        comment_prefix="#",
         use_pyarrow=False,
     )
     expected = pl.DataFrame({"A": ["a", "c"], "B": ["b", "d"]})
     assert_frame_equal(df, expected)
 
 
+def test_csv_multi_char_comment() -> None:
+    csv = textwrap.dedent(
+        """\
+        #a,b
+        ##c,d
+        """
+    )
+    f = io.StringIO(csv)
+    df = pl.read_csv(
+        f,
+        new_columns=["A", "B"],
+        has_header=False,
+        comment_prefix="##",  # two-character prefix; "#a,b" has only one "#"
+        use_pyarrow=False,
+    )
+    expected = pl.DataFrame({"A": ["#a"], "B": ["b"]})  # "##c,d" row is dropped
+    assert_frame_equal(df, expected)
+
+
 def test_csv_quote_char() -> None:
     expected = pl.DataFrame(
         [