From 4911ad5d868fefc59e83f68f21cfe036ba43d2a5 Mon Sep 17 00:00:00 2001 From: Boruch Chalk Date: Thu, 4 Jan 2024 09:34:35 +0200 Subject: [PATCH] fix(rust): Fix hive partitioned files not being skipped (#13358) fix(rust): Fix hive partitioned files not being skipped (#13358) --- .../src/physical_plan/expressions/apply.rs | 6 ++++++ py-polars/tests/unit/io/test_hive.py | 20 +++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/crates/polars-lazy/src/physical_plan/expressions/apply.rs b/crates/polars-lazy/src/physical_plan/expressions/apply.rs index 5bfd6c4a77e80..62db3fc6853ee 100644 --- a/crates/polars-lazy/src/physical_plan/expressions/apply.rs +++ b/crates/polars-lazy/src/physical_plan/expressions/apply.rs @@ -509,6 +509,12 @@ impl ApplyExpr { let min = st.to_min()?; let max = st.to_max()?; + if ChunkCompare::equal(max, min).ok()?.all() { + let one_equals = + |value: &Series| Some(ChunkCompare::equal(input, value).ok()?.any()); + return one_equals(min); + } + let all_smaller = || Some(ChunkCompare::lt(input, min).ok()?.all()); let all_bigger = || Some(ChunkCompare::gt(input, max).ok()?.all()); Some(!all_smaller()? && !all_bigger()?) diff --git a/py-polars/tests/unit/io/test_hive.py b/py-polars/tests/unit/io/test_hive.py index e476d61aae178..f495780752173 100644 --- a/py-polars/tests/unit/io/test_hive.py +++ b/py-polars/tests/unit/io/test_hive.py @@ -65,6 +65,26 @@ def test_hive_partitioned_predicate_pushdown( ) +@pytest.mark.write_disk() +def test_hive_partitioned_predicate_pushdown_skips_correct_number_of_files( + io_files_path: Path, tmp_path: Path, monkeypatch: Any, capfd: Any +) -> None: + monkeypatch.setenv("POLARS_VERBOSE", "1") + df = pl.DataFrame({"d": pl.arange(0, 10_000, eager=True)}).with_columns( + a=pl.col("d") % 100 + ) + root = tmp_path / "test_int_partitions" + df.write_parquet( + root, + use_pyarrow=True, + pyarrow_options={"partition_cols": ["a"]}, + ) + + q = pl.scan_parquet(root / "**/*.parquet", hive_partitioning=True) + assert q.filter(pl.col("a").is_in([10, 99])).collect().shape == (200, 2) + assert "hive partitioning: skipped 98 files" in capfd.readouterr().err + + @pytest.mark.write_disk() def test_hive_partitioned_slice_pushdown(io_files_path: Path, tmp_path: Path) -> None: df = pl.read_ipc(io_files_path / "*.ipc")