Skip to content

Commit

Permalink
fix: Fix CSV skip_rows_after_header for streaming
Browse files Browse the repository at this point in the history
  • Loading branch information
nameexhaustion committed May 12, 2024
1 parent 492c9d9 commit 95f321f
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 0 deletions.
6 changes: 6 additions & 0 deletions crates/polars-pipe/src/executors/sources/csv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,12 @@ impl CsvSource {
let low_memory = options.low_memory;

let reader: CsvReader<File> = options
.with_skip_rows_after_header(
// If we don't set it to 0 here, it will skip double the amount of rows.
// But if we set it to 0, it will still skip the requested amount of rows.
// TODO: Find out why. Maybe has something to do with schema inference.
0,
)
.with_schema_overwrite(Some(self.schema.clone()))
.with_n_rows(n_rows)
.with_columns(with_columns)
Expand Down
16 changes: 16 additions & 0 deletions py-polars/tests/unit/io/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -2062,6 +2062,22 @@ def test_csv_escape_cf_15349() -> None:
assert f.read() == b'test\nnormal\n"with\rcr"\n'


@pytest.mark.write_disk()
@pytest.mark.parametrize("streaming", [True, False])
def test_skip_rows_after_header(tmp_path: Path, streaming: bool) -> None:
    """Check that `skip_rows_after_header` drops exactly the requested rows.

    A five-row, single-column CSV is written to disk and scanned back with
    `skip_rows_after_header=2`. The collected result must equal the original
    frame with its first two rows removed. The test is parametrized so that
    `collect` is called with `streaming=True` and with `streaming=False`.
    """
    tmp_path.mkdir(exist_ok=True)
    csv_file = tmp_path / "data.csv"

    # Source data: one Int64 column named "a" holding the values 1 through 5.
    frame = pl.Series("a", [1, 2, 3, 4, 5], dtype=pl.Int64).to_frame()
    frame.write_csv(csv_file)

    n_skipped = 2
    lazy_scan = pl.scan_csv(csv_file, skip_rows_after_header=n_skipped)
    result = lazy_scan.collect(streaming=streaming)

    # Skipping N rows after the header should match slicing off the first N rows.
    assert_frame_equal(result, frame.slice(n_skipped))


@pytest.mark.parametrize("use_pyarrow", [True, False])
def test_skip_rows_after_header_pyarrow(use_pyarrow: bool) -> None:
csv = textwrap.dedent(
Expand Down

0 comments on commit 95f321f

Please sign in to comment.