Pass schema during chunked parquet reads
kukushking committed Jul 20, 2023
1 parent 81b1e37 commit 34dd8eb
Showing 2 changed files with 8 additions and 3 deletions.
2 changes: 1 addition & 1 deletion awswrangler/s3/_read_parquet.py
@@ -240,7 +240,7 @@ def _read_parquet_chunked(
         batch_size=batch_size, columns=columns, use_threads=use_threads_flag, use_pandas_metadata=False
     )
     table = _add_table_partitions(
-        table=pa.Table.from_batches(chunks),
+        table=pa.Table.from_batches(chunks, schema=pq_file.schema.to_arrow_schema()),
         path=path,
         path_root=path_root,
     )
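Why passing the schema matters: when the batch iterator yields nothing (e.g. a parquet file with zero rows), `pa.Table.from_batches` has no batch to infer column types from and raises, whereas supplying the file's Arrow schema produces a zero-row table with the correct columns. A minimal standalone sketch of that PyArrow behavior (illustrative only, not the awswrangler code path itself):

```python
import pyarrow as pa

# Stand-in for what pq_file.schema.to_arrow_schema() returns for a
# single-column int64 parquet file.
schema = pa.schema([pa.field("a", pa.int64())])

# With an empty batch list and no schema, from_batches cannot infer
# the column types and raises a ValueError.
try:
    pa.Table.from_batches([])
except ValueError as exc:
    print(f"no schema: {exc}")

# Passing the schema explicitly yields an empty table with the
# expected columns, which is what the fix above relies on.
table = pa.Table.from_batches([], schema=schema)
assert table.num_rows == 0
assert table.schema == schema
```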
9 changes: 7 additions & 2 deletions tests/unit/test_s3_parquet.py
@@ -674,12 +674,17 @@ def test_ignore_files(path: str, use_threads: Union[bool, int]) -> None:
         "(ExecutionPlan)[https://github.com/ray-project/ray/blob/ray-2.0.1/python/ray/data/_internal/plan.py#L253]"
     ),
 )
-def test_empty_parquet(path):
+@pytest.mark.parametrize("chunked", [True, False])
+def test_empty_parquet(path, chunked):
     path = f"{path}file.parquet"
     s = pa.schema([pa.field("a", pa.int64())])
     pq.write_table(s.empty_table(), path)
 
-    df = wr.s3.read_parquet(path)
+    df = wr.s3.read_parquet(path, chunked=chunked)
+
+    if chunked:
+        df = pd.concat(list(df))
 
     assert len(df) == 0
     assert len(df.columns) > 0
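For reference, with `chunked=True`, `wr.s3.read_parquet` returns an iterator of DataFrames rather than a single DataFrame, which is why the test concatenates the chunks before asserting. A usage sketch of the pattern the test exercises (the S3 path is a placeholder):

```python
import awswrangler as wr
import pandas as pd

# Placeholder path; assumes a parquet object written elsewhere.
path = "s3://my-bucket/data/file.parquet"

# chunked=True yields DataFrames one batch at a time. After this fix,
# an empty file still yields a zero-row frame carrying the file's
# column schema instead of failing to build the Arrow table.
df = pd.concat(list(wr.s3.read_parquet(path, chunked=True)))
print(len(df), list(df.columns))
```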
