feat!: Use pandas custom data types for BigQuery DATE and TIME columns, remove `date_as_object` argument (#972)

* Use new pandas date and time dtypes

* Get rid of date_as_object argument

* added *unit* test for dealing with dates and timestamps that can't fit in datetime64[ns]

* Implemented any, all, min, max and median

* test (and fix) load from dataframe with date and time columns

* Make sure insert_rows_from_dataframe works

* Renamed date and time dtypes to bqdate and bqtime

* make fallback date and time dtype names strings to make pytype happy

* date and time arrays implement __arrow_array__ to facilitate arrow conversion

* Make conversion of date columns from arrow to pandas zero-copy when not date_as_object

* Added date math support

* Support date math with DateOffset scalars

* always use types mapper for conversion from arrow to pandas

* adjust unit tests to use arrow not avro

* avoid "ValueError: need at least one array to concatenate" with empty RecordBatch

* add missing db-dtypes requirement

* avoid arrow_schema on older versions of bqstorage

BREAKING CHANGE: remove `date_as_object` argument from `to_dataframe`. The `dbdate` dtype is used by default, with an automatic fallback to `object` when dates are not within the range of a nanosecond-precision pandas timestamp.
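
For illustration, a minimal sketch of the new default behavior (an arbitrary query, assuming the `db-dtypes` package is installed alongside pandas):

    from google.cloud import bigquery

    client = bigquery.Client()
    df = client.query(
        "SELECT DATE '2021-11-10' AS d, TIME '12:00:00' AS t"
    ).to_dataframe()
    # Expected dtypes: d -> dbdate, t -> dbtime. Dates outside the
    # nanosecond-precision timestamp range fall back to object.
    print(df.dtypes)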

Co-authored-by: Anthonios Partheniou <partheniou@google.com>
Co-authored-by: Tim Swast <swast@google.com>
Co-authored-by: Leah E. Cole <6719667+leahecole@users.noreply.github.com>
4 people committed Nov 10, 2021
1 parent 42d3db6 commit 3d1af95
Showing 16 changed files with 396 additions and 176 deletions.
14 changes: 13 additions & 1 deletion docs/usage/pandas.rst
@@ -50,13 +50,25 @@ The following data types are used when creating a pandas DataFrame.
-
* - DATETIME
- datetime64[ns], object
- object is used when there are values not representable in pandas
- The object dtype is used when there are values not representable in a
pandas nanosecond-precision timestamp.
* - DATE
- dbdate, object
- The object dtype is used when there are values not representable in a
pandas nanosecond-precision timestamp.

Requires the ``db-dtypes`` package. See the `db-dtypes usage guide
<https://googleapis.dev/python/db-dtypes/latest/usage.html>`_
* - FLOAT64
- float64
-
* - INT64
- Int64
-
* - TIME
- dbtime
- Requires the ``db-dtypes`` package. See the `db-dtypes usage guide
<https://googleapis.dev/python/db-dtypes/latest/usage.html>`_
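
For context, `dbdate` and `dbtime` are pandas extension dtypes that the `db-dtypes` package registers on import; a minimal sketch with arbitrary values:

    import pandas
    import db_dtypes  # noqa: F401  # registers the dbdate/dbtime dtypes

    dates = pandas.Series(["2021-11-01", "2021-11-10"], dtype="dbdate")
    times = pandas.Series(["09:30:00", "17:00:00"], dtype="dbtime")
    print(dates.dtype, times.dtype)  # dbdate dbtime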

Retrieve BigQuery GEOGRAPHY data as a GeoPandas GeoDataFrame
------------------------------------------------------------
66 changes: 39 additions & 27 deletions google/cloud/bigquery/_pandas_helpers.py
@@ -18,16 +18,21 @@
import functools
import logging
import queue
from typing import Dict, Sequence
import warnings

try:
import pandas # type: ignore
except ImportError: # pragma: NO COVER
pandas = None
date_dtype_name = time_dtype_name = "" # Use '' rather than None because pytype
else:
import numpy

from db_dtypes import DateDtype, TimeDtype # type: ignore

date_dtype_name = DateDtype.name
time_dtype_name = TimeDtype.name

import pyarrow # type: ignore
import pyarrow.parquet # type: ignore

@@ -77,15 +82,6 @@ def _to_wkb(v):

_MAX_QUEUE_SIZE_DEFAULT = object() # max queue size sentinel for BQ Storage downloads

# If you update the default dtypes, also update the docs at docs/usage/pandas.rst.
_BQ_TO_PANDAS_DTYPE_NULLSAFE = {
"BOOL": "boolean",
"BOOLEAN": "boolean",
"FLOAT": "float64",
"FLOAT64": "float64",
"INT64": "Int64",
"INTEGER": "Int64",
}
_PANDAS_DTYPE_TO_BQ = {
"bool": "BOOLEAN",
"datetime64[ns, UTC]": "TIMESTAMP",
@@ -102,6 +98,8 @@ def _to_wkb(v):
"uint16": "INTEGER",
"uint32": "INTEGER",
"geometry": "GEOGRAPHY",
date_dtype_name: "DATE",
time_dtype_name: "TIME",
}
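
With `dbdate` and `dbtime` in `_PANDAS_DTYPE_TO_BQ`, schema detection for DataFrame load jobs can infer DATE and TIME columns from the new dtypes. A hedged sketch (the destination table ID is hypothetical):

    import pandas
    import db_dtypes  # noqa: F401
    from google.cloud import bigquery

    client = bigquery.Client()
    df = pandas.DataFrame(
        {
            "d": pandas.Series(["2021-11-10"], dtype="dbdate"),
            "t": pandas.Series(["12:00:00"], dtype="dbtime"),
        }
    )
    # The columns should be detected as DATE and TIME rather than STRING.
    client.load_table_from_dataframe(df, "my_dataset.my_table").result()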


@@ -267,26 +265,40 @@ def bq_to_arrow_schema(bq_schema):
return pyarrow.schema(arrow_fields)


def bq_schema_to_nullsafe_pandas_dtypes(
bq_schema: Sequence[schema.SchemaField],
) -> Dict[str, str]:
"""Return the default dtypes to use for columns in a BigQuery schema.
def default_types_mapper(date_as_object: bool = False):
"""Create a mapping from pyarrow types to pandas types.
Only returns default dtypes which are safe to have NULL values. This
includes Int64, which has pandas.NA values and does not result in
loss-of-precision.
This overrides the pandas defaults to use null-safe extension types where
available.
Returns:
A mapping from column names to pandas dtypes.
See: https://arrow.apache.org/docs/python/api/datatypes.html for a list of
data types. See:
tests/unit/test__pandas_helpers.py::test_bq_to_arrow_data_type for
BigQuery to Arrow type mapping.
Note to google-cloud-bigquery developers: If you update the default dtypes,
also update the docs at docs/usage/pandas.rst.
"""
dtypes = {}
for bq_field in bq_schema:
if bq_field.mode.upper() not in {"NULLABLE", "REQUIRED"}:
continue
field_type = bq_field.field_type.upper()
if field_type in _BQ_TO_PANDAS_DTYPE_NULLSAFE:
dtypes[bq_field.name] = _BQ_TO_PANDAS_DTYPE_NULLSAFE[field_type]
return dtypes

def types_mapper(arrow_data_type):
if pyarrow.types.is_boolean(arrow_data_type):
return pandas.BooleanDtype()

elif (
# If date_as_object is True, we know some DATE columns are
# out-of-bounds of what is supported by pandas.
not date_as_object
and pyarrow.types.is_date(arrow_data_type)
):
return DateDtype()

elif pyarrow.types.is_integer(arrow_data_type):
return pandas.Int64Dtype()

elif pyarrow.types.is_time(arrow_data_type):
return TimeDtype()

return types_mapper


def bq_to_arrow_array(series, bq_field):
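The callable returned by `default_types_mapper` is intended for pyarrow's `to_pandas`; types it does not handle yield `None`, so pyarrow falls back to its defaults. A minimal sketch against a hand-built batch (`_pandas_helpers` is a private module, so this is illustration only):

    import pyarrow
    from google.cloud.bigquery import _pandas_helpers

    batch = pyarrow.record_batch(
        [
            pyarrow.array([1, 2, None], type=pyarrow.int64()),
            pyarrow.array([True, None, False], type=pyarrow.bool_()),
        ],
        names=["n", "flag"],
    )
    df = batch.to_pandas(
        types_mapper=_pandas_helpers.default_types_mapper(date_as_object=False)
    )
    print(df.dtypes)  # n -> Int64, flag -> boolean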
16 changes: 0 additions & 16 deletions google/cloud/bigquery/job/query.py
@@ -1556,7 +1556,6 @@ def to_dataframe(
dtypes: Dict[str, Any] = None,
progress_bar_type: str = None,
create_bqstorage_client: bool = True,
date_as_object: bool = True,
max_results: Optional[int] = None,
geography_as_object: bool = False,
) -> "pandas.DataFrame":
@@ -1599,12 +1598,6 @@
.. versionadded:: 1.24.0
date_as_object (Optional[bool]):
If ``True`` (default), cast dates to objects. If ``False``, convert
to datetime64[ns] dtype.
.. versionadded:: 1.26.0
max_results (Optional[int]):
Maximum number of rows to include in the result. No limit by default.
@@ -1638,7 +1631,6 @@
dtypes=dtypes,
progress_bar_type=progress_bar_type,
create_bqstorage_client=create_bqstorage_client,
date_as_object=date_as_object,
geography_as_object=geography_as_object,
)

@@ -1651,7 +1643,6 @@
dtypes: Dict[str, Any] = None,
progress_bar_type: str = None,
create_bqstorage_client: bool = True,
date_as_object: bool = True,
max_results: Optional[int] = None,
geography_column: Optional[str] = None,
) -> "geopandas.GeoDataFrame":
@@ -1694,12 +1685,6 @@
.. versionadded:: 1.24.0
date_as_object (Optional[bool]):
If ``True`` (default), cast dates to objects. If ``False``, convert
to datetime64[ns] dtype.
.. versionadded:: 1.26.0
max_results (Optional[int]):
Maximum number of rows to include in the result. No limit by default.
@@ -1732,7 +1717,6 @@
dtypes=dtypes,
progress_bar_type=progress_bar_type,
create_bqstorage_client=create_bqstorage_client,
date_as_object=date_as_object,
geography_column=geography_column,
)

88 changes: 43 additions & 45 deletions google/cloud/bigquery/table.py
@@ -28,6 +28,8 @@
import pandas # type: ignore
except ImportError: # pragma: NO COVER
pandas = None
else:
import db_dtypes # type: ignore # noqa

import pyarrow # type: ignore

@@ -1815,7 +1817,6 @@ def to_dataframe(
dtypes: Dict[str, Any] = None,
progress_bar_type: str = None,
create_bqstorage_client: bool = True,
date_as_object: bool = True,
geography_as_object: bool = False,
) -> "pandas.DataFrame":
"""Create a pandas DataFrame by loading all pages of a query.
@@ -1865,12 +1866,6 @@
.. versionadded:: 1.24.0
date_as_object (Optional[bool]):
If ``True`` (default), cast dates to objects. If ``False``, convert
to datetime64[ns] dtype.
.. versionadded:: 1.26.0
geography_as_object (Optional[bool]):
If ``True``, convert GEOGRAPHY data to :mod:`shapely`
geometry objects. If ``False`` (default), don't cast
@@ -1912,40 +1907,44 @@
bqstorage_client=bqstorage_client,
create_bqstorage_client=create_bqstorage_client,
)
default_dtypes = _pandas_helpers.bq_schema_to_nullsafe_pandas_dtypes(
self.schema
)

# Let the user-defined dtypes override the default ones.
# https://stackoverflow.com/a/26853961/101923
dtypes = {**default_dtypes, **dtypes}

# When converting timestamp values to nanosecond precision, the result
# When converting date or timestamp values to nanosecond precision, the result
# can be out of pyarrow bounds. To avoid the error when converting to
# Pandas, we set the timestamp_as_object parameter to True, if necessary.
types_to_check = {
pyarrow.timestamp("us"),
pyarrow.timestamp("us", tz=datetime.timezone.utc),
}

for column in record_batch:
if column.type in types_to_check:
try:
column.cast("timestamp[ns]")
except pyarrow.lib.ArrowInvalid:
timestamp_as_object = True
break
else:
timestamp_as_object = False

extra_kwargs = {"timestamp_as_object": timestamp_as_object}
# Pandas, we set the date_as_object or timestamp_as_object parameter to True,
# if necessary.
date_as_object = not all(
self.__can_cast_timestamp_ns(col)
for col in record_batch
# Type can be date32 or date64 (plus units).
# See: https://arrow.apache.org/docs/python/api/datatypes.html
if str(col.type).startswith("date")
)

df = record_batch.to_pandas(
date_as_object=date_as_object, integer_object_nulls=True, **extra_kwargs
timestamp_as_object = not all(
self.__can_cast_timestamp_ns(col)
for col in record_batch
# Type can be timestamp (plus units and time zone).
# See: https://arrow.apache.org/docs/python/api/datatypes.html
if str(col.type).startswith("timestamp")
)

if len(record_batch) > 0:
df = record_batch.to_pandas(
date_as_object=date_as_object,
timestamp_as_object=timestamp_as_object,
integer_object_nulls=True,
types_mapper=_pandas_helpers.default_types_mapper(
date_as_object=date_as_object
),
)
else:
# Avoid "ValueError: need at least one array to concatenate" on
# older versions of pandas when converting empty RecordBatch to
# DataFrame. See: https://github.com/pandas-dev/pandas/issues/41241
df = pandas.DataFrame([], columns=record_batch.schema.names)

for column in dtypes:
df[column] = pandas.Series(df[column], dtype=dtypes[column])
df[column] = pandas.Series(df[column], dtype=dtypes[column], copy=False)

if geography_as_object:
for field in self.schema:
Expand All @@ -1954,6 +1953,15 @@ def to_dataframe(

return df

@staticmethod
def __can_cast_timestamp_ns(column):
try:
column.cast("timestamp[ns]")
except pyarrow.lib.ArrowInvalid:
return False
else:
return True

# If changing the signature of this method, make sure to apply the same
# changes to job.QueryJob.to_geodataframe()
def to_geodataframe(
@@ -1962,7 +1970,6 @@
dtypes: Dict[str, Any] = None,
progress_bar_type: str = None,
create_bqstorage_client: bool = True,
date_as_object: bool = True,
geography_column: Optional[str] = None,
) -> "geopandas.GeoDataFrame":
"""Create a GeoPandas GeoDataFrame by loading all pages of a query.
@@ -2010,10 +2017,6 @@
This argument does nothing if ``bqstorage_client`` is supplied.
date_as_object (Optional[bool]):
If ``True`` (default), cast dates to objects. If ``False``, convert
to datetime64[ns] dtype.
geography_column (Optional[str]):
If there are more than one GEOGRAPHY column,
identifies which one to use to construct a geopandas
@@ -2069,7 +2072,6 @@
dtypes,
progress_bar_type,
create_bqstorage_client,
date_as_object,
geography_as_object=True,
)

@@ -2126,7 +2128,6 @@ def to_dataframe(
dtypes=None,
progress_bar_type=None,
create_bqstorage_client=True,
date_as_object=True,
geography_as_object=False,
) -> "pandas.DataFrame":
"""Create an empty dataframe.
@@ -2136,7 +2137,6 @@
dtypes (Any): Ignored. Added for compatibility with RowIterator.
progress_bar_type (Any): Ignored. Added for compatibility with RowIterator.
create_bqstorage_client (bool): Ignored. Added for compatibility with RowIterator.
date_as_object (bool): Ignored. Added for compatibility with RowIterator.
Returns:
pandas.DataFrame: An empty :class:`~pandas.DataFrame`.
@@ -2151,7 +2151,6 @@ def to_geodataframe(
dtypes=None,
progress_bar_type=None,
create_bqstorage_client=True,
date_as_object=True,
geography_column: Optional[str] = None,
) -> "pandas.DataFrame":
"""Create an empty dataframe.
@@ -2161,7 +2160,6 @@
dtypes (Any): Ignored. Added for compatibility with RowIterator.
progress_bar_type (Any): Ignored. Added for compatibility with RowIterator.
create_bqstorage_client (bool): Ignored. Added for compatibility with RowIterator.
date_as_object (bool): Ignored. Added for compatibility with RowIterator.
Returns:
pandas.DataFrame: An empty :class:`~pandas.DataFrame`.
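The `__can_cast_timestamp_ns` probe above relies on pyarrow's safe-cast overflow check: casting a date or timestamp column to `timestamp[ns]` raises `ArrowInvalid` when any value is outside the nanosecond range, which triggers the object-dtype fallback. A small sketch of the mechanism:

    import datetime
    import pyarrow

    col = pyarrow.array([datetime.date(1, 1, 1)], type=pyarrow.date32())
    try:
        col.cast("timestamp[ns]")  # year 1 overflows datetime64[ns]
    except pyarrow.lib.ArrowInvalid:
        print("out of bounds; to_dataframe falls back to object dtype")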
1 change: 1 addition & 0 deletions samples/geography/requirements.txt
@@ -7,6 +7,7 @@ click==8.0.1
click-plugins==1.1.1
cligj==0.7.2
dataclasses==0.6; python_version < '3.7'
db-dtypes==0.3.0
Fiona==1.8.20
geojson==2.5.0
geopandas==0.9.0
1 change: 1 addition & 0 deletions samples/magics/requirements.txt
@@ -1,3 +1,4 @@
db-dtypes==0.3.0
google-cloud-bigquery-storage==2.9.0
google-auth-oauthlib==0.4.6
grpcio==1.41.0
1 change: 1 addition & 0 deletions samples/snippets/requirements.txt
@@ -1,3 +1,4 @@
db-dtypes==0.3.0
google-cloud-bigquery-storage==2.9.0
google-auth-oauthlib==0.4.6
grpcio==1.41.0
2 changes: 1 addition & 1 deletion setup.py
@@ -50,7 +50,7 @@
# Keep the no-op bqstorage extra for backward compatibility.
# See: https://github.com/googleapis/python-bigquery/issues/757
"bqstorage": [],
"pandas": ["pandas>=1.0.0"],
"pandas": ["pandas>=1.0.0", "db-dtypes>=0.3.0,<2.0.0dev"],
"geopandas": ["geopandas>=0.9.0, <1.0dev", "Shapely>=1.6.0, <2.0dev"],
"tqdm": ["tqdm >= 4.7.4, <5.0.0dev"],
"opentelemetry": [
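After installing with the `pandas` extra, the new dependency should register its dtypes with pandas; a quick, hedged sanity check:

    import db_dtypes  # noqa: F401
    import pandas

    print(pandas.api.types.pandas_dtype("dbdate"))  # DateDtype from db-dtypes
    print(pandas.api.types.pandas_dtype("dbtime"))  # TimeDtype from db-dtypes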