From 04902fd800532fd9afa2024c08dae1abc0c8eca8 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Thu, 9 Feb 2023 14:45:27 +0100 Subject: [PATCH 1/2] fix(rust, python): respect skip_rows in glob parsing csv --- polars/polars-lazy/src/frame/csv.rs | 14 -------------- py-polars/Cargo.lock | 5 +++-- py-polars/tests/unit/io/test_lazy_csv.py | 24 ++++++++++++++++++++++++ 3 files changed, 27 insertions(+), 16 deletions(-) diff --git a/polars/polars-lazy/src/frame/csv.rs b/polars/polars-lazy/src/frame/csv.rs index 5788610b41b2..ea9ccf6c2e36 100644 --- a/polars/polars-lazy/src/frame/csv.rs +++ b/polars/polars-lazy/src/frame/csv.rs @@ -302,10 +302,6 @@ impl<'a> LazyCsvReader<'a> { let path = r.map_err(|e| PolarsError::ComputeError(format!("{e}").into()))?; let mut builder = self.clone(); builder.path = path; - if builder.skip_rows > 0 { - builder.skip_rows = 0; - builder.n_rows = None; - } // do no rechunk yet. builder.rechunk = false; builder.finish_impl() @@ -314,16 +310,6 @@ impl<'a> LazyCsvReader<'a> { // set to false, as the csv parser has full thread utilization concat_impl(&lfs, self.rechunk, false, true) .map_err(|_| PolarsError::ComputeError("no matching files found".into())) - .map(|lf| { - if self.skip_rows != 0 || self.n_rows.is_some() { - lf.slice( - self.skip_rows as i64, - self.n_rows.unwrap_or(usize::MAX) as IdxSize, - ) - } else { - lf - } - }) } else { self.finish_impl() } diff --git a/py-polars/Cargo.lock b/py-polars/Cargo.lock index 687552e27b8e..dab15d6ca642 100644 --- a/py-polars/Cargo.lock +++ b/py-polars/Cargo.lock @@ -84,8 +84,9 @@ dependencies = [ [[package]] name = "arrow2" -version = "0.15.0" -source = "git+https://github.com/ritchie46/arrow2?branch=mmap_slice2#685cf49da02a1c94a501aab65535f6fbbcd7cbd7" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a4c5b03335bc1cb0fd9f5297f8fd3bbfd6fb04f3cb0bc7d6c91b7128cb8336a" dependencies = [ "ahash", "arrow-format", diff --git a/py-polars/tests/unit/io/test_lazy_csv.py b/py-polars/tests/unit/io/test_lazy_csv.py index 65548d1f4266..6022f7590b52 100644 --- a/py-polars/tests/unit/io/test_lazy_csv.py +++ b/py-polars/tests/unit/io/test_lazy_csv.py @@ -100,3 +100,27 @@ def test_lazy_n_rows(foods_file_path: Path) -> None: def test_scan_slice_streaming(foods_file_path: Path) -> None: df = pl.scan_csv(foods_file_path).head(5).collect(streaming=True) assert df.shape == (5, 4) + + +def test_glob_skip_rows() -> None: + + with tempfile.TemporaryDirectory() as temp_dir: + for i in range(2): + file_path = Path(temp_dir) / f"test_{i}.csv" + with open(file_path, "w") as f: + f.write( + f""" +metadata goes here +file number {i} +foo,bar,baz +1,2,3 +4,5,6 +7,8,9 + """ + ) + file_path = Path(temp_dir) / "*.csv" + assert pl.read_csv(file_path, skip_rows=2).to_dict(False) == { + "foo": [1, 4, 7, 1, 4, 7], + "bar": [2, 5, 8, 2, 5, 8], + "baz": [3, 6, 9, 3, 6, 9], + } From 5fd1c97b272b3eb8261c04ec42398e15e3dd53a4 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Thu, 9 Feb 2023 14:58:16 +0100 Subject: [PATCH 2/2] black --- py-polars/tests/unit/io/test_lazy_csv.py | 1 - 1 file changed, 1 deletion(-) diff --git a/py-polars/tests/unit/io/test_lazy_csv.py b/py-polars/tests/unit/io/test_lazy_csv.py index 6022f7590b52..cd7ecfb3a361 100644 --- a/py-polars/tests/unit/io/test_lazy_csv.py +++ b/py-polars/tests/unit/io/test_lazy_csv.py @@ -103,7 +103,6 @@ def test_scan_slice_streaming(foods_file_path: Path) -> None: def test_glob_skip_rows() -> None: - with tempfile.TemporaryDirectory() as temp_dir: for i in range(2): file_path = Path(temp_dir) / f"test_{i}.csv"