Skip to content

Commit

Permalink
fix(rust, python): respect skip_rows in glob parsing csv (#6754)
Browse files (browse the repository at this point in the history)
  • Loading branch information
ritchie46 authored Feb 9, 2023
1 parent b160f53 commit e103b34
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 16 deletions.
14 changes: 0 additions & 14 deletions polars/polars-lazy/src/frame/csv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -302,10 +302,6 @@ impl<'a> LazyCsvReader<'a> {
let path = r.map_err(|e| PolarsError::ComputeError(format!("{e}").into()))?;
let mut builder = self.clone();
builder.path = path;
if builder.skip_rows > 0 {
builder.skip_rows = 0;
builder.n_rows = None;
}
// do no rechunk yet.
builder.rechunk = false;
builder.finish_impl()
Expand All @@ -314,16 +310,6 @@ impl<'a> LazyCsvReader<'a> {
// set to false, as the csv parser has full thread utilization
concat_impl(&lfs, self.rechunk, false, true)
.map_err(|_| PolarsError::ComputeError("no matching files found".into()))
.map(|lf| {
if self.skip_rows != 0 || self.n_rows.is_some() {
lf.slice(
self.skip_rows as i64,
self.n_rows.unwrap_or(usize::MAX) as IdxSize,
)
} else {
lf
}
})
} else {
self.finish_impl()
}
Expand Down
5 changes: 3 additions & 2 deletions py-polars/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

23 changes: 23 additions & 0 deletions py-polars/tests/unit/io/test_lazy_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,3 +100,26 @@ def test_lazy_n_rows(foods_file_path: Path) -> None:
def test_scan_slice_streaming(foods_file_path: Path) -> None:
    """Collect the first five rows of a lazily scanned CSV with ``streaming=True``.

    The collected frame is expected to have shape ``(5, 4)``.
    """
    # Build the lazy query first; nothing is collected until ``collect``.
    limited_scan = pl.scan_csv(foods_file_path).head(5)
    collected = limited_scan.collect(streaming=True)
    assert collected.shape == (5, 4)


def test_glob_skip_rows() -> None:
    """Read several CSV files through a glob pattern with ``skip_rows=2``.

    Two files are written into a temporary directory. Each starts with a
    blank line and two metadata lines ahead of the ``foo,bar,baz`` header.
    Reading them via ``*.csv`` with ``skip_rows=2`` is expected to return the
    three data rows of both files, concatenated.
    """
    with tempfile.TemporaryDirectory() as temp_dir:
        directory = Path(temp_dir)
        for file_number in range(2):
            # The literal deliberately begins with a newline, so the written
            # file opens with an empty line before the metadata lines.
            contents = f"""
metadata goes here
file number {file_number}
foo,bar,baz
1,2,3
4,5,6
7,8,9
"""
            with open(directory / f"test_{file_number}.csv", "w") as handle:
                handle.write(contents)

        expected = {
            "foo": [1, 4, 7, 1, 4, 7],
            "bar": [2, 5, 8, 2, 5, 8],
            "baz": [3, 6, 9, 3, 6, 9],
        }
        pattern = directory / "*.csv"
        frame = pl.read_csv(pattern, skip_rows=2)
        assert frame.to_dict(False) == expected

0 comments on commit e103b34

Please sign in to comment.