fix: Fix CSV skip_rows_after_header for streaming

pola-rs · May 12, 2024 · 95f321f · 95f321f
1 parent 492c9d9
commit 95f321f
Show file tree

Hide file tree

Showing 2 changed files with 22 additions and 0 deletions.
diff --git a/crates/polars-pipe/src/executors/sources/csv.rs b/crates/polars-pipe/src/executors/sources/csv.rs
@@ -61,6 +61,12 @@ impl CsvSource {
         let low_memory = options.low_memory;
 
         let reader: CsvReader<File> = options
+            .with_skip_rows_after_header(
+                // If we don't set it to 0 here, twice the requested number of rows is skipped.
+                // But even when it is set to 0, the requested number of rows is still skipped.
+                // TODO: Find out why. It may have something to do with schema inference.
+                0,
+            )
             .with_schema_overwrite(Some(self.schema.clone()))
             .with_n_rows(n_rows)
             .with_columns(with_columns)

diff --git a/py-polars/tests/unit/io/test_csv.py b/py-polars/tests/unit/io/test_csv.py
@@ -2062,6 +2062,22 @@ def test_csv_escape_cf_15349() -> None:
     assert f.read() == b'test\nnormal\n"with\rcr"\n'
 
 
+@pytest.mark.write_disk()
+@pytest.mark.parametrize("streaming", [True, False])
+def test_skip_rows_after_header(tmp_path: Path, streaming: bool) -> None:
+    """Check `skip_rows_after_header` on `scan_csv` with and without streaming.
+
+    A five-row frame is written to a CSV file, scanned back with two rows
+    skipped after the header, and compared against the same frame with its
+    first two rows sliced off.
+    """
+    # NOTE(review): presumably tmp_path may not exist yet under this suite's
+    # fixtures, hence the explicit mkdir — confirm.
+    tmp_path.mkdir(exist_ok=True)
+    path = tmp_path / "data.csv"
+
+    # Single Int64 column "a" holding 1..5, written out as the CSV under test.
+    df = pl.Series("a", [1, 2, 3, 4, 5], dtype=pl.Int64).to_frame()
+    df.write_csv(path)
+
+    skip = 2
+    # Expected result: the original frame starting at row offset `skip`.
+    expect = df.slice(skip)
+    # `streaming` is forwarded to collect(), so both parametrized cases run
+    # the same lazy query and must agree with `expect`.
+    out = pl.scan_csv(path, skip_rows_after_header=skip).collect(streaming=streaming)
+
+    assert_frame_equal(out, expect)
+
+
 @pytest.mark.parametrize("use_pyarrow", [True, False])
 def test_skip_rows_after_header_pyarrow(use_pyarrow: bool) -> None:
     csv = textwrap.dedent(