Skip to content

Commit

Permalink
move dtypes mapping code
Browse files Browse the repository at this point in the history
  • Loading branch information
Linchin committed Apr 1, 2024
1 parent b716f98 commit c46c65c
Show file tree
Hide file tree
Showing 2 changed files with 159 additions and 106 deletions.
134 changes: 134 additions & 0 deletions google/cloud/bigquery/_pandas_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
from google.cloud.bigquery import _pyarrow_helpers
from google.cloud.bigquery import _versions_helpers
from google.cloud.bigquery import schema
from google.cloud.bigquery.enums import DefaultPandasDTypes

try:
import pandas # type: ignore
Expand Down Expand Up @@ -109,6 +110,11 @@ def _to_wkb(v):
time_dtype_name: "TIME",
}

_NO_SUPPORTED_DTYPE = (
"The dtype cannot to be converted to a pandas ExtensionArray "
"because the necessary `__from_arrow__` attribute is missing."
)


class _DownloadState(object):
"""Flag to indicate that a thread should exit early."""
Expand Down Expand Up @@ -1010,3 +1016,131 @@ def verify_pandas_imports():
raise ValueError(_NO_PANDAS_ERROR) from pandas_import_exception
if db_dtypes is None:
raise ValueError(_NO_DB_TYPES_ERROR) from db_dtypes_import_exception


def _default_range_dtype(param_name, make_bound_type):
    """Build the default ``pandas.ArrowDtype`` for a RANGE dtype parameter.

    Args:
        param_name (str):
            Name of the dtype parameter being resolved; only used in the
            warning text.
        make_bound_type (Callable[[], Any]):
            Zero-argument callable returning the pyarrow type used for both
            the ``start`` and the ``end`` field of the struct. It is called
            once per field.

    Returns:
        A ``pandas.ArrowDtype`` wrapping a ``start``/``end`` struct, or
        ``None`` (after emitting a warning) when an ``AttributeError`` is
        raised while building it.
    """
    try:
        # The pyarrow type construction is deliberately kept inside the
        # ``try`` so that every attribute lookup involved in building the
        # dtype is covered by the same handler.
        return pandas.ArrowDtype(
            pyarrow.struct(
                [("start", make_bound_type()), ("end", make_bound_type())]
            )
        )
    except AttributeError:
        # pandas.ArrowDtype was introduced in pandas 1.5, but python 3.7
        # only supports upto pandas 1.3. If pandas.ArrowDtype is not
        # present, we raise a warning and fall back to None.
        msg = (
            f"Unable to find class ArrowDtype in pandas, setting "
            f"{param_name} to be None. To use ArrowDtype, please "
            f"use pandas >= 1.5 and python >= 3.8."
        )
        warnings.warn(msg)
        return None


def verify_and_enhance_dtypes(
    bool_dtype,
    int_dtype,
    float_dtype,
    string_dtype,
    date_dtype,
    datetime_dtype,
    time_dtype,
    timestamp_dtype,
    range_date_dtype,
    range_datetime_dtype,
    range_timestamp_dtype,
):
    """Verifies pandas dtypes mapping and convert from sentinel values.

    ``DefaultPandasDTypes`` sentinels are replaced as follows:

    * ``BOOL_DTYPE`` -> ``pandas.BooleanDtype()``
    * ``INT_DTYPE`` -> ``pandas.Int64Dtype()``
    * ``TIME_DTYPE`` -> ``db_dtypes.TimeDtype()``
    * ``RANGE_DATE_DTYPE`` / ``RANGE_DATETIME_DTYPE`` /
      ``RANGE_TIMESTAMP_DTYPE`` -> a ``pandas.ArrowDtype`` over a
      ``start``/``end`` struct, or ``None`` with a warning when that dtype
      cannot be built.

    The ``DATE_DTYPE`` sentinel is passed through unchanged. The three range
    dtypes are not checked for ``__from_arrow__``.

    Args:
        bool_dtype: dtype for boolean columns, a sentinel, or ``None``.
        int_dtype: dtype for integer columns, a sentinel, or ``None``.
        float_dtype: dtype for float columns, or ``None``.
        string_dtype: dtype for string columns, or ``None``.
        date_dtype: dtype for date columns, a sentinel, or ``None``.
        datetime_dtype: dtype for datetime columns, or ``None``.
        time_dtype: dtype for time columns, a sentinel, or ``None``.
        timestamp_dtype: dtype for timestamp columns, or ``None``.
        range_date_dtype: dtype for RANGE<DATE> columns, a sentinel, or
            ``None``.
        range_datetime_dtype: dtype for RANGE<DATETIME> columns, a sentinel,
            or ``None``.
        range_timestamp_dtype: dtype for RANGE<TIMESTAMP> columns, a
            sentinel, or ``None``.

    Returns:
        tuple: The eleven dtypes, in the same order as the parameters, with
        the sentinels resolved as described above.

    Raises:
        ValueError: If any of the eight non-range dtypes is not ``None``
            (and, for ``date_dtype``, not the ``DATE_DTYPE`` sentinel) and
            lacks a ``__from_arrow__`` attribute. The exception args are
            ``(parameter_name, _NO_SUPPORTED_DTYPE)``.
    """
    if bool_dtype is DefaultPandasDTypes.BOOL_DTYPE:
        bool_dtype = pandas.BooleanDtype()

    if int_dtype is DefaultPandasDTypes.INT_DTYPE:
        int_dtype = pandas.Int64Dtype()

    if time_dtype is DefaultPandasDTypes.TIME_DTYPE:
        time_dtype = db_dtypes.TimeDtype()

    # The pyarrow types are wrapped in lambdas so that they are only built
    # inside the helper's ``try`` block.
    if range_date_dtype is DefaultPandasDTypes.RANGE_DATE_DTYPE:
        range_date_dtype = _default_range_dtype(
            "range_date_dtype", lambda: pyarrow.date32()
        )

    if range_datetime_dtype is DefaultPandasDTypes.RANGE_DATETIME_DTYPE:
        range_datetime_dtype = _default_range_dtype(
            "range_datetime_dtype", lambda: pyarrow.timestamp("us")
        )

    if range_timestamp_dtype is DefaultPandasDTypes.RANGE_TIMESTAMP_DTYPE:
        range_timestamp_dtype = _default_range_dtype(
            "range_timestamp_dtype", lambda: pyarrow.timestamp("us", tz="UTC")
        )

    # Checked in this exact order so that the first offending parameter
    # reported matches the parameter order.
    dtypes_to_check = (
        ("bool_dtype", bool_dtype),
        ("int_dtype", int_dtype),
        ("float_dtype", float_dtype),
        ("string_dtype", string_dtype),
        ("date_dtype", date_dtype),
        ("datetime_dtype", datetime_dtype),
        ("time_dtype", time_dtype),
        ("timestamp_dtype", timestamp_dtype),
    )
    for param_name, dtype in dtypes_to_check:
        if dtype is None:
            continue
        # Only ``date_dtype`` may still hold its sentinel at this point: it
        # is not resolved by this function, so it is exempt from the check.
        if param_name == "date_dtype" and dtype is DefaultPandasDTypes.DATE_DTYPE:
            continue
        if not hasattr(dtype, "__from_arrow__"):
            raise ValueError(param_name, _NO_SUPPORTED_DTYPE)

    return (
        bool_dtype,
        int_dtype,
        float_dtype,
        string_dtype,
        date_dtype,
        datetime_dtype,
        time_dtype,
        timestamp_dtype,
        range_date_dtype,
        range_datetime_dtype,
        range_timestamp_dtype,
    )
131 changes: 25 additions & 106 deletions google/cloud/bigquery/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,11 +95,6 @@

_TABLE_HAS_NO_SCHEMA = 'Table has no schema: call "client.get_table()"'

_NO_SUPPORTED_DTYPE = (
"The dtype cannot to be converted to a pandas ExtensionArray "
"because the necessary `__from_arrow__` attribute is missing."
)

# How many of the total rows need to be downloaded already for us to skip
# calling the BQ Storage API?
ALMOST_COMPLETELY_CACHED_RATIO = 0.333
Expand Down Expand Up @@ -2270,107 +2265,31 @@ def to_dataframe(
if geography_as_object and shapely is None:
raise ValueError(_NO_SHAPELY_ERROR)

if bool_dtype is DefaultPandasDTypes.BOOL_DTYPE:
bool_dtype = pandas.BooleanDtype()

if int_dtype is DefaultPandasDTypes.INT_DTYPE:
int_dtype = pandas.Int64Dtype()

if time_dtype is DefaultPandasDTypes.TIME_DTYPE:
time_dtype = db_dtypes.TimeDtype()

if range_date_dtype is DefaultPandasDTypes.RANGE_DATE_DTYPE:
try:
range_date_dtype = pandas.ArrowDtype(
pyarrow.struct(
[("start", pyarrow.date32()), ("end", pyarrow.date32())]
)
)
except AttributeError:
# pandas.ArrowDtype was introduced in pandas 1.5, but python 3.7
# only supports upto pandas 1.3. If pandas.ArrowDtype is not
# present, we raise a warning and set range_date_dtype to None.
msg = (
"Unable ro find class ArrowDtype in pandas, setting "
"range_date_dtype to be None. To use ArrowDtype, please "
"use pandas >= 1.5 and python >= 3.8."
)
warnings.warn(msg)
range_date_dtype = None

if range_datetime_dtype is DefaultPandasDTypes.RANGE_DATETIME_DTYPE:
try:
range_datetime_dtype = pandas.ArrowDtype(
pyarrow.struct(
[
("start", pyarrow.timestamp("us")),
("end", pyarrow.timestamp("us")),
]
)
)
except AttributeError:
# pandas.ArrowDtype was introduced in pandas 1.5, but python 3.7
# only supports upto pandas 1.3. If pandas.ArrowDtype is not
# present, we raise a warning and set range_datetime_dtype to None.
msg = (
"Unable ro find class ArrowDtype in pandas, setting "
"range_datetime_dtype to be None. To use ArrowDtype, "
"please use pandas >= 1.5 and python >= 3.8."
)
warnings.warn(msg)
range_datetime_dtype = None

if range_timestamp_dtype is DefaultPandasDTypes.RANGE_TIMESTAMP_DTYPE:
try:
range_timestamp_dtype = pandas.ArrowDtype(
pyarrow.struct(
[
("start", pyarrow.timestamp("us", tz="UTC")),
("end", pyarrow.timestamp("us", tz="UTC")),
]
)
)
except AttributeError:
# pandas.ArrowDtype was introduced in pandas 1.5, but python 3.7
# only supports upto pandas 1.3. If pandas.ArrowDtype is not
# present, we raise a warning and set range_timestamp_dtype to None.
msg = (
"Unable ro find class ArrowDtype in pandas, setting "
"range_timestamp_dtype to be None. To use ArrowDtype, "
"please use pandas >= 1.5 and python >= 3.8."
)
warnings.warn(msg)
range_timestamp_dtype = None

if bool_dtype is not None and not hasattr(bool_dtype, "__from_arrow__"):
raise ValueError("bool_dtype", _NO_SUPPORTED_DTYPE)

if int_dtype is not None and not hasattr(int_dtype, "__from_arrow__"):
raise ValueError("int_dtype", _NO_SUPPORTED_DTYPE)

if float_dtype is not None and not hasattr(float_dtype, "__from_arrow__"):
raise ValueError("float_dtype", _NO_SUPPORTED_DTYPE)

if string_dtype is not None and not hasattr(string_dtype, "__from_arrow__"):
raise ValueError("string_dtype", _NO_SUPPORTED_DTYPE)

if (
date_dtype is not None
and date_dtype is not DefaultPandasDTypes.DATE_DTYPE
and not hasattr(date_dtype, "__from_arrow__")
):
raise ValueError("date_dtype", _NO_SUPPORTED_DTYPE)

if datetime_dtype is not None and not hasattr(datetime_dtype, "__from_arrow__"):
raise ValueError("datetime_dtype", _NO_SUPPORTED_DTYPE)

if time_dtype is not None and not hasattr(time_dtype, "__from_arrow__"):
raise ValueError("time_dtype", _NO_SUPPORTED_DTYPE)

if timestamp_dtype is not None and not hasattr(
timestamp_dtype, "__from_arrow__"
):
raise ValueError("timestamp_dtype", _NO_SUPPORTED_DTYPE)
(
bool_dtype,
int_dtype,
float_dtype,
string_dtype,
date_dtype,
datetime_dtype,
time_dtype,
timestamp_dtype,
range_date_dtype,
range_datetime_dtype,
range_timestamp_dtype,
) = _pandas_helpers.verify_and_enhance_dtypes(
bool_dtype,
int_dtype,
float_dtype,
string_dtype,
date_dtype,
datetime_dtype,
time_dtype,
timestamp_dtype,
range_date_dtype,
range_datetime_dtype,
range_timestamp_dtype,
)

if dtypes is None:
dtypes = {}
Expand Down

0 comments on commit c46c65c

Please sign in to comment.