Skip to content

Commit

Permalink
Enable use of dataframe type, in athena2pyarrow type
Browse files Browse the repository at this point in the history
  • Loading branch information
eliabrio committed Sep 5, 2024
1 parent 480a30c commit 76aef70
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 4 deletions.
2 changes: 1 addition & 1 deletion awswrangler/_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ def _df_to_table(
for col_name, col_type in dtype.items():
if col_name in table.column_names:
col_index = table.column_names.index(col_name)
pyarrow_dtype = athena2pyarrow(col_type)
pyarrow_dtype = athena2pyarrow(col_type, df.dtypes.get(col_name))
field = pa.field(name=col_name, type=pyarrow_dtype)
table = table.set_column(col_index, field, table.column(col_name).cast(pyarrow_dtype))
_logger.debug("Casting column %s (%s) to %s (%s)", col_name, col_index, col_type, pyarrow_dtype)
Expand Down
17 changes: 14 additions & 3 deletions awswrangler/_data_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,7 +306,7 @@ def _split_map(s: str) -> list[str]:
return parts


def athena2pyarrow(dtype: str) -> pa.DataType: # noqa: PLR0911,PLR0912
def athena2pyarrow(dtype: str, df_type: str = None) -> pa.DataType: # noqa: PLR0911,PLR0912
"""Athena to PyArrow data types conversion."""
dtype = dtype.strip()
if dtype.startswith(("array", "struct", "map")):
Expand All @@ -329,7 +329,18 @@ def athena2pyarrow(dtype: str) -> pa.DataType: # noqa: PLR0911,PLR0912
if (dtype in ("string", "uuid")) or dtype.startswith("char") or dtype.startswith("varchar"):
return pa.string()
if dtype == "timestamp":
return pa.timestamp(unit="ns")
if df_type:
match df_type:
case "datetime64[s]":
return pa.timestamp(unit="s")
case "datetime64[ms]":
return pa.timestamp(unit="ms")
case "datetime64[us]":
return pa.timestamp(unit="us")
case "datetime64[ns]":
return pa.timestamp(unit="ns")
case _:
return pa.timestamp(unit="ns")
if dtype == "date":
return pa.date32()
if dtype in ("binary" or "varbinary"):
Expand Down Expand Up @@ -701,7 +712,7 @@ def pyarrow_schema_from_pandas(
)
for k, v in casts.items():
if (k not in ignore) and (k in df.columns or _is_index_name(k, df.index)):
columns_types[k] = athena2pyarrow(dtype=v)
columns_types[k] = athena2pyarrow(dtype=v, df_type=df.dtypes.get(k))
columns_types = {k: v for k, v in columns_types.items() if v is not None}
_logger.debug("columns_types: %s", columns_types)
return pa.schema(fields=columns_types)
Expand Down

0 comments on commit 76aef70

Please sign in to comment.