fix(rust, python): respect skip_rows in glob parsing csv (#6754)

pola-rs · Feb 9, 2023 · e103b34 · e103b34
1 parent b160f53
commit e103b34
Show file tree

Hide file tree

Showing 3 changed files with 26 additions and 16 deletions.
diff --git a/polars/polars-lazy/src/frame/csv.rs b/polars/polars-lazy/src/frame/csv.rs
@@ -302,10 +302,6 @@ impl<'a> LazyCsvReader<'a> {
                     let path = r.map_err(|e| PolarsError::ComputeError(format!("{e}").into()))?;
                     let mut builder = self.clone();
                     builder.path = path;
-                    if builder.skip_rows > 0 {
-                        builder.skip_rows = 0;
-                        builder.n_rows = None;
-                    }
                     // do not rechunk yet.
                     builder.rechunk = false;
                     builder.finish_impl()
@@ -314,16 +310,6 @@ impl<'a> LazyCsvReader<'a> {
             // set to false, as the csv parser has full thread utilization
             concat_impl(&lfs, self.rechunk, false, true)
                 .map_err(|_| PolarsError::ComputeError("no matching files found".into()))
-                .map(|lf| {
-                    if self.skip_rows != 0 || self.n_rows.is_some() {
-                        lf.slice(
-                            self.skip_rows as i64,
-                            self.n_rows.unwrap_or(usize::MAX) as IdxSize,
-                        )
-                    } else {
-                        lf
-                    }
-                })
         } else {
             self.finish_impl()
         }

diff --git a/py-polars/Cargo.lock b/py-polars/Cargo.lock
diff --git a/py-polars/tests/unit/io/test_lazy_csv.py b/py-polars/tests/unit/io/test_lazy_csv.py
@@ -100,3 +100,26 @@ def test_lazy_n_rows(foods_file_path: Path) -> None:
 def test_scan_slice_streaming(foods_file_path: Path) -> None:
     df = pl.scan_csv(foods_file_path).head(5).collect(streaming=True)
     assert df.shape == (5, 4)
+
+
+def test_glob_skip_rows() -> None:
+    # Checks that `skip_rows` is honoured when `pl.read_csv` is given a glob
+    # pattern: the expected frame below only matches if the non-data preamble
+    # is dropped from each matched file, not just from the combined result.
+    with tempfile.TemporaryDirectory() as temp_dir:
+        # Write two CSV files with the same layout: two preamble lines, then
+        # the `foo,bar,baz` header, then three data rows.
+        for i in range(2):
+            file_path = Path(temp_dir) / f"test_{i}.csv"
+            with open(file_path, "w") as f:
+                # NOTE(review): the literal opens with a newline, so each file
+                # starts with an empty line before the two preamble lines;
+                # presumably the reader does not count it towards `skip_rows`
+                # -- confirm against the CSV parser.
+                f.write(
+                    f"""
+metadata goes here
+file number {i}
+foo,bar,baz
+1,2,3
+4,5,6
+7,8,9
+        """
+                )
+        # Rebind `file_path` to a glob pattern covering both files written above.
+        file_path = Path(temp_dir) / "*.csv"
+        # Each column must contain the three data values twice, once per file.
+        assert pl.read_csv(file_path, skip_rows=2).to_dict(False) == {
+            "foo": [1, 4, 7, 1, 4, 7],
+            "bar": [2, 5, 8, 2, 5, 8],
+            "baz": [3, 6, 9, 3, 6, 9],
+        }