feat!: Use pandas custom data types for BigQuery DATE and TIME columns, remove `date_as_object` argument (#972)

* Use new pandas date and time dtypes

* Get rid of date_as_object argument

* added *unit* test for dealing with dates and timestamps that can't fit in datetime64[ns]

* Implemented any, all, min, max and median

* test (and fix) load from dataframe with date and time columns

* Make sure insert_rows_from_dataframe works

* Renamed date and time dtypes to bqdate and bqtime

* make fallback date and time dtype names strings to make pytype happy

* date and time arrays implement __arrow_array__ to facilitate arrow conversion

* Make conversion of date columns from arrow to pandas zero-copy when not date_as_object

* Added date math support

* Support date math with DateOffset scalars

* always use types mapper for conversion from arrow to pandas

* adjust unit tests to use arrow not avro

* avoid "ValueError: need at least one array to concatenate" with empty RecordBatch

* add missing db-dtypes requirement

* avoid arrow_schema on older versions of bqstorage

BREAKING CHANGE: remove `date_as_object` argument from `to_dataframe`. The `dbdate` dtype is used by default, with an automatic fallback to `object` when dates are not within the range of a nanosecond-precision pandas timestamp.
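
For illustration, a minimal sketch of the new default behavior (an arbitrary query, assuming the `db-dtypes` package is installed alongside pandas):

    from google.cloud import bigquery

    client = bigquery.Client()
    df = client.query(
        "SELECT DATE '2021-11-10' AS d, TIME '12:00:00' AS t"
    ).to_dataframe()
    # Expected dtypes: d -> dbdate, t -> dbtime. Dates outside the
    # nanosecond-precision timestamp range fall back to object.
    print(df.dtypes)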

Co-authored-by: Anthonios Partheniou <partheniou@google.com>
Co-authored-by: Tim Swast <swast@google.com>
Co-authored-by: Leah E. Cole <6719667+leahecole@users.noreply.github.com>
4 people committed Nov 10, 2021
1 parent 42d3db6 commit 3d1af95
Showing 16 changed files with 396 additions and 176 deletions.
14 changes: 13 additions & 1 deletion docs/usage/pandas.rst
@@ -50,13 +50,25 @@ The following data types are used when creating a pandas DataFrame.
-
* - DATETIME
- datetime64[ns], object
- object is used when there are values not representable in pandas
- The object dtype is used when there are values not representable in a
pandas nanosecond-precision timestamp.
* - DATE
- dbdate, object
- The object dtype is used when there are values not representable in a
pandas nanosecond-precision timestamp.

Requires the ``db-dtypes`` package. See the `db-dtypes usage guide
<https://googleapis.dev/python/db-dtypes/latest/usage.html>`_
* - FLOAT64
- float64
-
* - INT64
- Int64
-
* - TIME
- dbtime
- Requires the ``db-dtypes`` package. See the `db-dtypes usage guide
<https://googleapis.dev/python/db-dtypes/latest/usage.html>`_
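
For context, `dbdate` and `dbtime` are pandas extension dtypes that the `db-dtypes` package registers on import; a minimal sketch with arbitrary values:

    import pandas
    import db_dtypes  # noqa: F401  # registers the dbdate/dbtime dtypes

    dates = pandas.Series(["2021-11-01", "2021-11-10"], dtype="dbdate")
    times = pandas.Series(["09:30:00", "17:00:00"], dtype="dbtime")
    print(dates.dtype, times.dtype)  # dbdate dbtime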

Retrieve BigQuery GEOGRAPHY data as a GeoPandas GeoDataFrame
------------------------------------------------------------
66 changes: 39 additions & 27 deletions google/cloud/bigquery/_pandas_helpers.py
@@ -18,16 +18,21 @@
import functools
import logging
import queue
from typing import Dict, Sequence
import warnings

try:
import pandas # type: ignore
except ImportError: # pragma: NO COVER
pandas = None
date_dtype_name = time_dtype_name = "" # Use '' rather than None because pytype
else:
import numpy

from db_dtypes import DateDtype, TimeDtype # type: ignore

date_dtype_name = DateDtype.name
time_dtype_name = TimeDtype.name

import pyarrow # type: ignore
import pyarrow.parquet # type: ignore

@@ -77,15 +82,6 @@ def _to_wkb(v):

_MAX_QUEUE_SIZE_DEFAULT = object() # max queue size sentinel for BQ Storage downloads

# If you update the default dtypes, also update the docs at docs/usage/pandas.rst.
_BQ_TO_PANDAS_DTYPE_NULLSAFE = {
"BOOL": "boolean",
"BOOLEAN": "boolean",
"FLOAT": "float64",
"FLOAT64": "float64",
"INT64": "Int64",
"INTEGER": "Int64",
}
_PANDAS_DTYPE_TO_BQ = {
"bool": "BOOLEAN",
"datetime64[ns, UTC]": "TIMESTAMP",
@@ -102,6 +98,8 @@ def _to_wkb(v):
"uint16": "INTEGER",
"uint32": "INTEGER",
"geometry": "GEOGRAPHY",
date_dtype_name: "DATE",
time_dtype_name: "TIME",
}
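
With `dbdate` and `dbtime` in `_PANDAS_DTYPE_TO_BQ`, schema detection for DataFrame load jobs can infer DATE and TIME columns from the new dtypes. A hedged sketch (the destination table ID is hypothetical):

    import pandas
    import db_dtypes  # noqa: F401
    from google.cloud import bigquery

    client = bigquery.Client()
    df = pandas.DataFrame(
        {
            "d": pandas.Series(["2021-11-10"], dtype="dbdate"),
            "t": pandas.Series(["12:00:00"], dtype="dbtime"),
        }
    )
    # The columns should be detected as DATE and TIME rather than STRING.
    client.load_table_from_dataframe(df, "my_dataset.my_table").result()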


@@ -267,26 +265,40 @@ def bq_to_arrow_schema(bq_schema):
return pyarrow.schema(arrow_fields)


def bq_schema_to_nullsafe_pandas_dtypes(
bq_schema: Sequence[schema.SchemaField],
) -> Dict[str, str]:
"""Return the default dtypes to use for columns in a BigQuery schema.
def default_types_mapper(date_as_object: bool = False):
"""Create a mapping from pyarrow types to pandas types.
Only returns default dtypes which are safe to have NULL values. This
includes Int64, which has pandas.NA values and does not result in
loss-of-precision.
This overrides the pandas defaults to use null-safe extension types where
available.
Returns:
A mapping from column names to pandas dtypes.
See: https://arrow.apache.org/docs/python/api/datatypes.html for a list of
data types. See:
tests/unit/test__pandas_helpers.py::test_bq_to_arrow_data_type for
BigQuery to Arrow type mapping.
Note to google-cloud-bigquery developers: If you update the default dtypes,
also update the docs at docs/usage/pandas.rst.
"""
dtypes = {}
for bq_field in bq_schema:
if bq_field.mode.upper() not in {"NULLABLE", "REQUIRED"}:
continue
field_type = bq_field.field_type.upper()
if field_type in _BQ_TO_PANDAS_DTYPE_NULLSAFE:
dtypes[bq_field.name] = _BQ_TO_PANDAS_DTYPE_NULLSAFE[field_type]
return dtypes

def types_mapper(arrow_data_type):
if pyarrow.types.is_boolean(arrow_data_type):
return pandas.BooleanDtype()

elif (
# If date_as_object is True, we know some DATE columns are
# out-of-bounds of what is supported by pandas.
not date_as_object
and pyarrow.types.is_date(arrow_data_type)
):
return DateDtype()

elif pyarrow.types.is_integer(arrow_data_type):
return pandas.Int64Dtype()

elif pyarrow.types.is_time(arrow_data_type):
return TimeDtype()

return types_mapper


def bq_to_arrow_array(series, bq_field):
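The callable returned by `default_types_mapper` is intended for pyarrow's `to_pandas`; types it does not handle yield `None`, so pyarrow falls back to its defaults. A minimal sketch against a hand-built batch (`_pandas_helpers` is a private module, so this is illustration only):

    import pyarrow
    from google.cloud.bigquery import _pandas_helpers

    batch = pyarrow.record_batch(
        [
            pyarrow.array([1, 2, None], type=pyarrow.int64()),
            pyarrow.array([True, None, False], type=pyarrow.bool_()),
        ],
        names=["n", "flag"],
    )
    df = batch.to_pandas(
        types_mapper=_pandas_helpers.default_types_mapper(date_as_object=False)
    )
    print(df.dtypes)  # n -> Int64, flag -> boolean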
16 changes: 0 additions & 16 deletions google/cloud/bigquery/job/query.py
@@ -1556,7 +1556,6 @@ def to_dataframe(
dtypes: Dict[str, Any] = None,
progress_bar_type: str = None,
create_bqstorage_client: bool = True,
date_as_object: bool = True,
max_results: Optional[int] = None,
geography_as_object: bool = False,
) -> "pandas.DataFrame":
@@ -1599,12 +1598,6 @@
.. versionadded:: 1.24.0
date_as_object (Optional[bool]):
If ``True`` (default), cast dates to objects. If ``False``, convert
to datetime64[ns] dtype.
.. versionadded:: 1.26.0
max_results (Optional[int]):
Maximum number of rows to include in the result. No limit by default.
@@ -1638,7 +1631,6 @@
dtypes=dtypes,
progress_bar_type=progress_bar_type,
create_bqstorage_client=create_bqstorage_client,
date_as_object=date_as_object,
geography_as_object=geography_as_object,
)

@@ -1651,7 +1643,6 @@
dtypes: Dict[str, Any] = None,
progress_bar_type: str = None,
create_bqstorage_client: bool = True,
date_as_object: bool = True,
max_results: Optional[int] = None,
geography_column: Optional[str] = None,
) -> "geopandas.GeoDataFrame":
@@ -1694,12 +1685,6 @@
.. versionadded:: 1.24.0
date_as_object (Optional[bool]):
If ``True`` (default), cast dates to objects. If ``False``, convert
to datetime64[ns] dtype.
.. versionadded:: 1.26.0
max_results (Optional[int]):
Maximum number of rows to include in the result. No limit by default.
@@ -1732,7 +1717,6 @@
dtypes=dtypes,
progress_bar_type=progress_bar_type,
create_bqstorage_client=create_bqstorage_client,
date_as_object=date_as_object,
geography_column=geography_column,
)

88 changes: 43 additions & 45 deletions google/cloud/bigquery/table.py
@@ -28,6 +28,8 @@
import pandas # type: ignore
except ImportError: # pragma: NO COVER
pandas = None
else:
import db_dtypes # type: ignore # noqa

import pyarrow # type: ignore

@@ -1815,7 +1817,6 @@ def to_dataframe(
dtypes: Dict[str, Any] = None,
progress_bar_type: str = None,
create_bqstorage_client: bool = True,
date_as_object: bool = True,
geography_as_object: bool = False,
) -> "pandas.DataFrame":
"""Create a pandas DataFrame by loading all pages of a query.
@@ -1865,12 +1866,6 @@
.. versionadded:: 1.24.0
date_as_object (Optional[bool]):
If ``True`` (default), cast dates to objects. If ``False``, convert
to datetime64[ns] dtype.
.. versionadded:: 1.26.0
geography_as_object (Optional[bool]):
If ``True``, convert GEOGRAPHY data to :mod:`shapely`
geometry objects. If ``False`` (default), don't cast
@@ -1912,40 +1907,44 @@
bqstorage_client=bqstorage_client,
create_bqstorage_client=create_bqstorage_client,
)
default_dtypes = _pandas_helpers.bq_schema_to_nullsafe_pandas_dtypes(
self.schema
)

# Let the user-defined dtypes override the default ones.
# https://stackoverflow.com/a/26853961/101923
dtypes = {**default_dtypes, **dtypes}

# When converting timestamp values to nanosecond precision, the result
# When converting date or timestamp values to nanosecond precision, the result
# can be out of pyarrow bounds. To avoid the error when converting to
# Pandas, we set the timestamp_as_object parameter to True, if necessary.
types_to_check = {
pyarrow.timestamp("us"),
pyarrow.timestamp("us", tz=datetime.timezone.utc),
}

for column in record_batch:
if column.type in types_to_check:
try:
column.cast("timestamp[ns]")
except pyarrow.lib.ArrowInvalid:
timestamp_as_object = True
break
else:
timestamp_as_object = False

extra_kwargs = {"timestamp_as_object": timestamp_as_object}
# Pandas, we set the date_as_object or timestamp_as_object parameter to True,
# if necessary.
date_as_object = not all(
self.__can_cast_timestamp_ns(col)
for col in record_batch
# Type can be date32 or date64 (plus units).
# See: https://arrow.apache.org/docs/python/api/datatypes.html
if str(col.type).startswith("date")
)

df = record_batch.to_pandas(
date_as_object=date_as_object, integer_object_nulls=True, **extra_kwargs
timestamp_as_object = not all(
self.__can_cast_timestamp_ns(col)
for col in record_batch
# Type can be timestamp (plus units and time zone).
# See: https://arrow.apache.org/docs/python/api/datatypes.html
if str(col.type).startswith("timestamp")
)

if len(record_batch) > 0:
df = record_batch.to_pandas(
date_as_object=date_as_object,
timestamp_as_object=timestamp_as_object,
integer_object_nulls=True,
types_mapper=_pandas_helpers.default_types_mapper(
date_as_object=date_as_object
),
)
else:
# Avoid "ValueError: need at least one array to concatenate" on
# older versions of pandas when converting empty RecordBatch to
# DataFrame. See: https://github.com/pandas-dev/pandas/issues/41241
df = pandas.DataFrame([], columns=record_batch.schema.names)

for column in dtypes:
df[column] = pandas.Series(df[column], dtype=dtypes[column])
df[column] = pandas.Series(df[column], dtype=dtypes[column], copy=False)

if geography_as_object:
for field in self.schema:
Expand All @@ -1954,6 +1953,15 @@ def to_dataframe(

return df

@staticmethod
def __can_cast_timestamp_ns(column):
try:
column.cast("timestamp[ns]")
except pyarrow.lib.ArrowInvalid:
return False
else:
return True

# If changing the signature of this method, make sure to apply the same
# changes to job.QueryJob.to_geodataframe()
def to_geodataframe(
@@ -1962,7 +1970,6 @@
dtypes: Dict[str, Any] = None,
progress_bar_type: str = None,
create_bqstorage_client: bool = True,
date_as_object: bool = True,
geography_column: Optional[str] = None,
) -> "geopandas.GeoDataFrame":
"""Create a GeoPandas GeoDataFrame by loading all pages of a query.
@@ -2010,10 +2017,6 @@
This argument does nothing if ``bqstorage_client`` is supplied.
date_as_object (Optional[bool]):
If ``True`` (default), cast dates to objects. If ``False``, convert
to datetime64[ns] dtype.
geography_column (Optional[str]):
If there are more than one GEOGRAPHY column,
identifies which one to use to construct a geopandas
@@ -2069,7 +2072,6 @@
dtypes,
progress_bar_type,
create_bqstorage_client,
date_as_object,
geography_as_object=True,
)

@@ -2126,7 +2128,6 @@ def to_dataframe(
dtypes=None,
progress_bar_type=None,
create_bqstorage_client=True,
date_as_object=True,
geography_as_object=False,
) -> "pandas.DataFrame":
"""Create an empty dataframe.
@@ -2136,7 +2137,6 @@
dtypes (Any): Ignored. Added for compatibility with RowIterator.
progress_bar_type (Any): Ignored. Added for compatibility with RowIterator.
create_bqstorage_client (bool): Ignored. Added for compatibility with RowIterator.
date_as_object (bool): Ignored. Added for compatibility with RowIterator.
Returns:
pandas.DataFrame: An empty :class:`~pandas.DataFrame`.
@@ -2151,7 +2151,6 @@ def to_geodataframe(
dtypes=None,
progress_bar_type=None,
create_bqstorage_client=True,
date_as_object=True,
geography_column: Optional[str] = None,
) -> "pandas.DataFrame":
"""Create an empty dataframe.
@@ -2161,7 +2160,6 @@
dtypes (Any): Ignored. Added for compatibility with RowIterator.
progress_bar_type (Any): Ignored. Added for compatibility with RowIterator.
create_bqstorage_client (bool): Ignored. Added for compatibility with RowIterator.
date_as_object (bool): Ignored. Added for compatibility with RowIterator.
Returns:
pandas.DataFrame: An empty :class:`~pandas.DataFrame`.
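The `__can_cast_timestamp_ns` probe above relies on pyarrow's safe-cast overflow check: casting a date or timestamp column to `timestamp[ns]` raises `ArrowInvalid` when any value is outside the nanosecond range, which triggers the object-dtype fallback. A small sketch of the mechanism:

    import datetime
    import pyarrow

    col = pyarrow.array([datetime.date(1, 1, 1)], type=pyarrow.date32())
    try:
        col.cast("timestamp[ns]")  # year 1 overflows datetime64[ns]
    except pyarrow.lib.ArrowInvalid:
        print("out of bounds; to_dataframe falls back to object dtype")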
1 change: 1 addition & 0 deletions samples/geography/requirements.txt
@@ -7,6 +7,7 @@ click==8.0.1
click-plugins==1.1.1
cligj==0.7.2
dataclasses==0.6; python_version < '3.7'
db-dtypes==0.3.0
Fiona==1.8.20
geojson==2.5.0
geopandas==0.9.0
1 change: 1 addition & 0 deletions samples/magics/requirements.txt
@@ -1,3 +1,4 @@
db-dtypes==0.3.0
google-cloud-bigquery-storage==2.9.0
google-auth-oauthlib==0.4.6
grpcio==1.41.0
1 change: 1 addition & 0 deletions samples/snippets/requirements.txt
@@ -1,3 +1,4 @@
db-dtypes==0.3.0
google-cloud-bigquery-storage==2.9.0
google-auth-oauthlib==0.4.6
grpcio==1.41.0
2 changes: 1 addition & 1 deletion setup.py
@@ -50,7 +50,7 @@
# Keep the no-op bqstorage extra for backward compatibility.
# See: https://github.com/googleapis/python-bigquery/issues/757
"bqstorage": [],
"pandas": ["pandas>=1.0.0"],
"pandas": ["pandas>=1.0.0", "db-dtypes>=0.3.0,<2.0.0dev"],
"geopandas": ["geopandas>=0.9.0, <1.0dev", "Shapely>=1.6.0, <2.0dev"],
"tqdm": ["tqdm >= 4.7.4, <5.0.0dev"],
"opentelemetry": [
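After installing with the `pandas` extra, the new dependency should register its dtypes with pandas; a quick, hedged sanity check:

    import db_dtypes  # noqa: F401
    import pandas

    print(pandas.api.types.pandas_dtype("dbdate"))  # DateDtype from db-dtypes
    print(pandas.api.types.pandas_dtype("dbtime"))  # TimeDtype from db-dtypes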