From 4911ad5d868fefc59e83f68f21cfe036ba43d2a5 Mon Sep 17 00:00:00 2001
From: Boruch Chalk <boruch.chalk@mobileye.com>
Date: Thu, 4 Jan 2024 09:34:35 +0200
Subject: [PATCH] fix(rust): Fix hive partitioned files not being skipped
 (#13358)

fix(rust): Fix hive partitioned files not being skipped (#13358)
---
 .../src/physical_plan/expressions/apply.rs    |  6 ++++++
 py-polars/tests/unit/io/test_hive.py          | 20 +++++++++++++++++++
 2 files changed, 26 insertions(+)

diff --git a/crates/polars-lazy/src/physical_plan/expressions/apply.rs b/crates/polars-lazy/src/physical_plan/expressions/apply.rs
index 5bfd6c4a77e80..62db3fc6853ee 100644
--- a/crates/polars-lazy/src/physical_plan/expressions/apply.rs
+++ b/crates/polars-lazy/src/physical_plan/expressions/apply.rs
@@ -509,6 +509,12 @@ impl ApplyExpr {
                     let min = st.to_min()?;
                     let max = st.to_max()?;
 
+                    if ChunkCompare::equal(max, min).ok()?.all() {
+                        let one_equals =
+                            |value: &Series| Some(ChunkCompare::equal(input, value).ok()?.any());
+                        return one_equals(min);
+                    }
+
                     let all_smaller = || Some(ChunkCompare::lt(input, min).ok()?.all());
                     let all_bigger = || Some(ChunkCompare::gt(input, max).ok()?.all());
                     Some(!all_smaller()? && !all_bigger()?)
diff --git a/py-polars/tests/unit/io/test_hive.py b/py-polars/tests/unit/io/test_hive.py
index e476d61aae178..f495780752173 100644
--- a/py-polars/tests/unit/io/test_hive.py
+++ b/py-polars/tests/unit/io/test_hive.py
@@ -65,6 +65,26 @@ def test_hive_partitioned_predicate_pushdown(
     )
 
 
+@pytest.mark.write_disk()
+def test_hive_partitioned_predicate_pushdown_skips_correct_number_of_files(
+    io_files_path: Path, tmp_path: Path, monkeypatch: Any, capfd: Any
+) -> None:
+    monkeypatch.setenv("POLARS_VERBOSE", "1")
+    df = pl.DataFrame({"d": pl.arange(0, 10_000, eager=True)}).with_columns(
+        a=pl.col("d") % 100
+    )
+    root = tmp_path / "test_int_partitions"
+    df.write_parquet(
+        root,
+        use_pyarrow=True,
+        pyarrow_options={"partition_cols": ["a"]},
+    )
+
+    q = pl.scan_parquet(root / "**/*.parquet", hive_partitioning=True)
+    assert q.filter(pl.col("a").is_in([10, 99])).collect().shape == (200, 2)
+    assert "hive partitioning: skipped 98 files" in capfd.readouterr().err
+
+
 @pytest.mark.write_disk()
 def test_hive_partitioned_slice_pushdown(io_files_path: Path, tmp_path: Path) -> None:
     df = pl.read_ipc(io_files_path / "*.ipc")