diff --git a/python/deltalake/writer.py b/python/deltalake/writer.py
index d3b956cbfc..29ff262677 100644
--- a/python/deltalake/writer.py
+++ b/python/deltalake/writer.py
@@ -666,7 +666,8 @@ def get_file_stats_from_metadata(
 
     def iter_groups(metadata: Any) -> Iterator[Any]:
         for i in range(metadata.num_row_groups):
-            yield metadata.row_group(i)
+            if metadata.row_group(i).num_rows > 0:
+                yield metadata.row_group(i)
 
     for column_idx in range(metadata.num_columns):
         name = metadata.row_group(0).column(column_idx).path_in_schema
diff --git a/python/tests/test_writer.py b/python/tests/test_writer.py
index 9252dfdd41..186eae0b64 100644
--- a/python/tests/test_writer.py
+++ b/python/tests/test_writer.py
@@ -1251,3 +1251,25 @@ def test_with_deltalake_schema(tmp_path: pathlib.Path, sample_data: pa.Table):
     )
     delta_table = DeltaTable(tmp_path)
     assert delta_table.schema().to_pyarrow() == sample_data.schema
+
+
+def test_write_stats_empty_rowgroups(tmp_path: pathlib.Path):
+    # https://github.com/delta-io/delta-rs/issues/2169
+    data = pa.table(
+        {
+            "data": pa.array(["B"] * 1024 * 33),
+        }
+    )
+    write_deltalake(
+        tmp_path,
+        data,
+        max_rows_per_file=1024 * 32,
+        max_rows_per_group=1024 * 16,
+        min_rows_per_group=8 * 1024,
+        mode="overwrite",
+    )
+    dt = DeltaTable(tmp_path)
+    assert (
+        dt.to_pyarrow_dataset().to_table(filter=(pc.field("data") == "B")).shape[0]
+        == 33792
+    )