From 5dd6b24d8dc83713ed2925093fc1ab601102a653 Mon Sep 17 00:00:00 2001 From: Linchin Date: Fri, 22 Mar 2024 23:59:49 +0000 Subject: [PATCH 01/35] feat: support range in queries as dict --- google/cloud/bigquery/_helpers.py | 41 ++++++++++++ tests/system/helpers.py | 5 ++ tests/unit/test__helpers.py | 105 +++++++++++++++++++++++++++++- 3 files changed, 150 insertions(+), 1 deletion(-) diff --git a/google/cloud/bigquery/_helpers.py b/google/cloud/bigquery/_helpers.py index 7198b60c2..0572867d7 100644 --- a/google/cloud/bigquery/_helpers.py +++ b/google/cloud/bigquery/_helpers.py @@ -309,6 +309,46 @@ def _json_from_json(value, field): return None +def _range_element_from_json(value, field): + """Coerce 'value' to a range element value, if set or not nullable.""" + if value == "UNBOUNDED": + return None + elif field.element_type == "DATE": + return _date_from_json(value, None) + elif field.element_type == "DATETIME": + return _datetime_from_json(value, None) + elif field.element_type == "TIMESTAMP": + return _timestamp_from_json(value, None) + else: + raise ValueError(f"Unsupported range field type: {value}") + + +def _range_from_json(value, field): + """Coerce 'value' to a range, if set or not nullable. + + Args: + value (str): The literal representation of the range. + field (google.cloud.bigquery.schema.SchemaField): + The field corresponding to the value. + + Returns: + Optional[dict]: + The parsed range object from ``value`` if the ``field`` is not + null (otherwise it is :data:`None`). + """ + range_literal = re.compile(r"\[.*, .*\)") + if _not_null(value, field): + if range_literal.match(value): + start, end = value[1:-1].split(", ") + start = _range_element_from_json(start, field.range_element_type) + end = _range_element_from_json(end, field.range_element_type) + return {"start": start, "end": end} + else: + raise ValueError(f"Unknown range format: {value}") + else: + return None + + # Parse BigQuery API response JSON into a Python representation. 
_CELLDATA_FROM_JSON = { "INTEGER": _int_from_json, @@ -329,6 +369,7 @@ def _json_from_json(value, field): "TIME": _time_from_json, "RECORD": _record_from_json, "JSON": _json_from_json, + "RANGE": _range_from_json, } _QUERY_PARAMS_FROM_JSON = dict(_CELLDATA_FROM_JSON) diff --git a/tests/system/helpers.py b/tests/system/helpers.py index 721f55040..7fd344eeb 100644 --- a/tests/system/helpers.py +++ b/tests/system/helpers.py @@ -25,6 +25,7 @@ _naive = datetime.datetime(2016, 12, 5, 12, 41, 9) _naive_microseconds = datetime.datetime(2016, 12, 5, 12, 41, 9, 250000) _stamp = "%s %s" % (_naive.date().isoformat(), _naive.time().isoformat()) +_date = _naive.date().isoformat() _stamp_microseconds = _stamp + ".250000" _zoned = _naive.replace(tzinfo=UTC) _zoned_microseconds = _naive_microseconds.replace(tzinfo=UTC) @@ -78,6 +79,10 @@ ), ("SELECT ARRAY(SELECT STRUCT([1, 2]))", [{"_field_1": [1, 2]}]), ("SELECT ST_GeogPoint(1, 2)", "POINT(1 2)"), + ( + "SELECT RANGE '[UNBOUNDED, %s)'" % _date, + {"start": None, "end": _naive.date()}, + ), ] diff --git a/tests/unit/test__helpers.py b/tests/unit/test__helpers.py index 320c57737..40b78ce9d 100644 --- a/tests/unit/test__helpers.py +++ b/tests/unit/test__helpers.py @@ -452,6 +452,99 @@ def test_w_bogus_string_value(self): self._call_fut("12:12:27.123", object()) +class Test_range_from_json(unittest.TestCase): + def _call_fut(self, value, field): + from google.cloud.bigquery._helpers import _range_from_json + + return _range_from_json(value, field) + + def test_w_none_nullable(self): + self.assertIsNone(self._call_fut(None, _Field("NULLABLE"))) + + def test_w_none_required(self): + with self.assertRaises(TypeError): + self._call_fut(None, _Field("REQUIRED")) + + def test_w_wrong_format(self): + range_field = _Field( + "NULLIBLE", + field_type="RANGE", + range_element_type=_Field("NULLIBLE", element_type="DATE"), + ) + with self.assertRaises(ValueError): + self._call_fut("[2009-06-172019-06-17)", range_field) + + def 
test_w_wrong_element_type(self): + range_field = _Field( + "NULLIBLE", + field_type="RANGE", + range_element_type=_Field("NULLIBLE", element_type="TIME"), + ) + with self.assertRaises(ValueError): + self._call_fut("[15:31:38, 15:50:38)", range_field) + + def test_w_unbounded_value(self): + range_field = _Field( + "NULLIBLE", + field_type="RANGE", + range_element_type=_Field("NULLIBLE", element_type="DATE"), + ) + coerced = self._call_fut("[UNBOUNDED, 2019-06-17)", range_field) + self.assertEqual( + coerced, + {"start": None, "end": datetime.date(2019, 6, 17)}, + ) + + def test_w_date_value(self): + range_field = _Field( + "NULLIBLE", + field_type="RANGE", + range_element_type=_Field("NULLIBLE", element_type="DATE"), + ) + coerced = self._call_fut("[2009-06-17, 2019-06-17)", range_field) + self.assertEqual( + coerced, + { + "start": datetime.date(2009, 6, 17), + "end": datetime.date(2019, 6, 17), + }, + ) + + def test_w_datetime_value(self): + range_field = _Field( + "NULLIBLE", + field_type="RANGE", + range_element_type=_Field("NULLIBLE", element_type="DATETIME"), + ) + coerced = self._call_fut( + "[2009-06-17T13:45:30, 2019-06-17T13:45:30)", range_field + ) + self.assertEqual( + coerced, + { + "start": datetime.datetime(2009, 6, 17, 13, 45, 30), + "end": datetime.datetime(2019, 6, 17, 13, 45, 30), + }, + ) + + def test_w_timestamp_value(self): + from google.cloud._helpers import _EPOCH + + range_field = _Field( + "NULLIBLE", + field_type="RANGE", + range_element_type=_Field("NULLIBLE", element_type="TIMESTAMP"), + ) + coerced = self._call_fut("[1234567, 1234789)", range_field) + self.assertEqual( + coerced, + { + "start": _EPOCH + datetime.timedelta(seconds=1, microseconds=234567), + "end": _EPOCH + datetime.timedelta(seconds=1, microseconds=234789), + }, + ) + + class Test_record_from_json(unittest.TestCase): def _call_fut(self, value, field): from google.cloud.bigquery._helpers import _record_from_json @@ -1323,11 +1416,21 @@ def test_w_str(self): class 
_Field(object): - def __init__(self, mode, name="unknown", field_type="UNKNOWN", fields=()): + def __init__( + self, + mode, + name="unknown", + field_type="UNKNOWN", + fields=(), + range_element_type=None, + element_type=None, + ): self.mode = mode self.name = name self.field_type = field_type self.fields = fields + self.range_element_type = range_element_type + self.element_type = element_type def _field_isinstance_patcher(): From 74fb1d3bda2a8d74ec140f5f586178868cd016d9 Mon Sep 17 00:00:00 2001 From: Linchin Date: Mon, 25 Mar 2024 18:55:34 +0000 Subject: [PATCH 02/35] fix sys tests --- tests/system/test_query.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/system/test_query.py b/tests/system/test_query.py index 0494272d9..5b9fcda1e 100644 --- a/tests/system/test_query.py +++ b/tests/system/test_query.py @@ -425,7 +425,7 @@ def test_query_statistics(bigquery_client, query_api_method): ), ( "SELECT @range_date", - "[2016-12-05, UNBOUNDED)", + {'end': None, 'start': datetime.date(2016, 12, 5)}, [ RangeQueryParameter( name="range_date", @@ -436,7 +436,7 @@ def test_query_statistics(bigquery_client, query_api_method): ), ( "SELECT @range_datetime", - "[2016-12-05T00:00:00, UNBOUNDED)", + {'end': None, 'start': datetime.datetime(2016, 12, 5, 0, 0)}, [ RangeQueryParameter( name="range_datetime", @@ -447,7 +447,7 @@ def test_query_statistics(bigquery_client, query_api_method): ), ( "SELECT @range_unbounded", - "[UNBOUNDED, UNBOUNDED)", + {'end': None, 'start': None}, [ RangeQueryParameter( name="range_unbounded", From a67e1aa3b0329e6636e91b64348bcf95a1453e75 Mon Sep 17 00:00:00 2001 From: Linchin Date: Mon, 25 Mar 2024 19:22:49 +0000 Subject: [PATCH 03/35] lint --- tests/system/test_query.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/system/test_query.py b/tests/system/test_query.py index 5b9fcda1e..d94a117e3 100644 --- a/tests/system/test_query.py +++ b/tests/system/test_query.py @@ -425,7 +425,7 @@ 
def test_query_statistics(bigquery_client, query_api_method): ), ( "SELECT @range_date", - {'end': None, 'start': datetime.date(2016, 12, 5)}, + {"end": None, "start": datetime.date(2016, 12, 5)}, [ RangeQueryParameter( name="range_date", @@ -436,7 +436,7 @@ def test_query_statistics(bigquery_client, query_api_method): ), ( "SELECT @range_datetime", - {'end': None, 'start': datetime.datetime(2016, 12, 5, 0, 0)}, + {"end": None, "start": datetime.datetime(2016, 12, 5, 0, 0)}, [ RangeQueryParameter( name="range_datetime", @@ -447,7 +447,7 @@ def test_query_statistics(bigquery_client, query_api_method): ), ( "SELECT @range_unbounded", - {'end': None, 'start': None}, + {"end": None, "start": None}, [ RangeQueryParameter( name="range_unbounded", From 75a985524a95ac318087201a92cf86b15e6a804c Mon Sep 17 00:00:00 2001 From: Linchin Date: Thu, 28 Mar 2024 00:14:54 +0000 Subject: [PATCH 04/35] add arrow support --- google/cloud/bigquery/_pandas_helpers.py | 25 +++++++++ google/cloud/bigquery/_pyarrow_helpers.py | 27 ++++++++++ google/cloud/bigquery/client.py | 2 + google/cloud/bigquery/enums.py | 9 ++++ google/cloud/bigquery/job/query.py | 6 +++ google/cloud/bigquery/table.py | 30 +++++++++++ tests/unit/test_table.py | 63 +++++++++++++++++++++++ 7 files changed, 162 insertions(+) diff --git a/google/cloud/bigquery/_pandas_helpers.py b/google/cloud/bigquery/_pandas_helpers.py index 9f8dcfde4..cbbd73f16 100644 --- a/google/cloud/bigquery/_pandas_helpers.py +++ b/google/cloud/bigquery/_pandas_helpers.py @@ -164,6 +164,8 @@ def bq_to_arrow_data_type(field): if field_type_upper in schema._STRUCT_TYPES: return bq_to_arrow_struct_data_type(field) + if field_type_upper == "RANGE": + field_type_upper = f"RANGE<{field.range_element_type.element_type}>" data_type_constructor = _pyarrow_helpers.bq_to_arrow_scalars(field_type_upper) if data_type_constructor is None: return None @@ -224,6 +226,9 @@ def default_types_mapper( datetime_dtype: Union[Any, None] = None, time_dtype: Union[Any, 
None] = None, timestamp_dtype: Union[Any, None] = None, + range_date_dtype: Union[Any, None] = None, + range_datetime_dtype: Union[Any, None] = None, + range_timestamp_dtype: Union[Any, None] = None, ): """Create a mapping from pyarrow types to pandas types. @@ -278,6 +283,26 @@ def types_mapper(arrow_data_type): elif time_dtype is not None and pyarrow.types.is_time(arrow_data_type): return time_dtype + elif pyarrow.types.is_struct(arrow_data_type): + if ( + range_datetime_dtype is not None + and arrow_data_type.equals(range_datetime_dtype.pyarrow_dtype) + ): + return range_datetime_dtype + + elif ( + range_date_dtype is not None + and arrow_data_type.equals(range_date_dtype.pyarrow_dtype) + ): + return range_date_dtype + return pandas.ArrowDtype(range_date_dtype) + + elif ( + range_timestamp_dtype is not None + and arrow_data_type.equals(range_timestamp_dtype.pyarrow_dtype) + ): + return range_timestamp_dtype + return types_mapper diff --git a/google/cloud/bigquery/_pyarrow_helpers.py b/google/cloud/bigquery/_pyarrow_helpers.py index 06509cc93..8bdd54314 100644 --- a/google/cloud/bigquery/_pyarrow_helpers.py +++ b/google/cloud/bigquery/_pyarrow_helpers.py @@ -46,6 +46,30 @@ def pyarrow_timestamp(): return pyarrow.timestamp("us", tz="UTC") +def pyarrow_range_timestamp(): + return pyarrow.struct( + [ + ("start", pyarrow.timestamp("us", tz="UTC")), + ("end", pyarrow.timestamp("us", tz="UTC")), + ] + ) + + +def pyarrow_range_datetime(): + return pyarrow.struct( + [ + ("start", pyarrow.timestamp("us")), + ("end", pyarrow.timestamp("us")), + ] + ) + + +def pyarrow_range_date(): + return pyarrow.struct( + [("start", pyarrow.date32()), ("end", pyarrow.date32())] + ) + + _BQ_TO_ARROW_SCALARS = {} _ARROW_SCALAR_IDS_TO_BQ = {} @@ -68,6 +92,9 @@ def pyarrow_timestamp(): "STRING": pyarrow.string, "TIME": pyarrow_time, "TIMESTAMP": pyarrow_timestamp, + "RANGE": pyarrow_range_timestamp, + "RANGE": pyarrow_range_datetime, + "RANGE": pyarrow_range_date, } _ARROW_SCALAR_IDS_TO_BQ = 
{ diff --git a/google/cloud/bigquery/client.py b/google/cloud/bigquery/client.py index 408e7e49c..758f2a863 100644 --- a/google/cloud/bigquery/client.py +++ b/google/cloud/bigquery/client.py @@ -3793,6 +3793,8 @@ def insert_rows_json( if template_suffix is not None: data["templateSuffix"] = template_suffix + print(data) + path = "%s/insertAll" % table.path # We can always retry, because every row has an insert ID. span_attributes = {"path": path} diff --git a/google/cloud/bigquery/enums.py b/google/cloud/bigquery/enums.py index d75037ad1..14e660291 100644 --- a/google/cloud/bigquery/enums.py +++ b/google/cloud/bigquery/enums.py @@ -96,6 +96,15 @@ class DefaultPandasDTypes(enum.Enum): TIME_DTYPE = object() """Specifies default time dtype""" + RANGE_DATE_DTYPE = object() + """Specifies default range date dtype""" + + RANGE_DATETIME_DTYPE = object() + """Specifies default range datetime dtype""" + + RANGE_TIMESTAMP_DTYPE = object() + """Specifies default range timestamp dtype""" + class DestinationFormat(object): """The exported file format. The default value is :attr:`CSV`. 
diff --git a/google/cloud/bigquery/job/query.py b/google/cloud/bigquery/job/query.py index 83d2751ce..8d703b2c9 100644 --- a/google/cloud/bigquery/job/query.py +++ b/google/cloud/bigquery/job/query.py @@ -1739,6 +1739,9 @@ def to_dataframe( datetime_dtype: Union[Any, None] = None, time_dtype: Union[Any, None] = DefaultPandasDTypes.TIME_DTYPE, timestamp_dtype: Union[Any, None] = None, + range_date_dtype: Union[Any, None] = DefaultPandasDTypes.RANGE_DATE_DTYPE, + range_datetime_dtype: Union[Any, None] = DefaultPandasDTypes.RANGE_DATETIME_DTYPE, + range_timestamp_dtype: Union[Any, None] = DefaultPandasDTypes.RANGE_TIMESTAMP_DTYPE, ) -> "pandas.DataFrame": """Return a pandas DataFrame from a QueryJob @@ -1904,6 +1907,9 @@ def to_dataframe( datetime_dtype=datetime_dtype, time_dtype=time_dtype, timestamp_dtype=timestamp_dtype, + range_date_dtype=range_date_dtype, + range_datetime_dtype=range_datetime_dtype, + range_timestamp_dtype=range_timestamp_dtype, ) # If changing the signature of this method, make sure to apply the same diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py index b3be4ff90..2e6c8e653 100644 --- a/google/cloud/bigquery/table.py +++ b/google/cloud/bigquery/table.py @@ -2045,6 +2045,9 @@ def to_dataframe( datetime_dtype: Union[Any, None] = None, time_dtype: Union[Any, None] = DefaultPandasDTypes.TIME_DTYPE, timestamp_dtype: Union[Any, None] = None, + range_date_dtype: Union[Any, None] = DefaultPandasDTypes.RANGE_DATE_DTYPE, + range_datetime_dtype: Union[Any, None] = DefaultPandasDTypes.RANGE_DATETIME_DTYPE, + range_timestamp_dtype: Union[Any, None] = DefaultPandasDTypes.RANGE_TIMESTAMP_DTYPE, ) -> "pandas.DataFrame": """Create a pandas DataFrame by loading all pages of a query. 
@@ -2215,6 +2218,27 @@ def to_dataframe( if time_dtype is DefaultPandasDTypes.TIME_DTYPE: time_dtype = db_dtypes.TimeDtype() + if range_date_dtype is DefaultPandasDTypes.RANGE_DATE_DTYPE: + range_date_dtype = pandas.ArrowDtype(pyarrow.struct( + [("start", pyarrow.date32()), ("end", pyarrow.date32())] + )) + + if range_datetime_dtype is DefaultPandasDTypes.RANGE_DATETIME_DTYPE: + range_datetime_dtype = pandas.ArrowDtype(pyarrow.struct( + [ + ("start", pyarrow.timestamp("us")), + ("end", pyarrow.timestamp("us")), + ] + )) + + if range_timestamp_dtype is DefaultPandasDTypes.RANGE_TIMESTAMP_DTYPE: + range_timestamp_dtype = pandas.ArrowDtype(pyarrow.struct( + [ + ("start", pyarrow.timestamp("us", tz="UTC")), + ("end", pyarrow.timestamp("us", tz="UTC")), + ] + )) + if bool_dtype is not None and not hasattr(bool_dtype, "__from_arrow__"): raise ValueError("bool_dtype", _NO_SUPPORTED_DTYPE) @@ -2299,6 +2323,9 @@ def to_dataframe( datetime_dtype=datetime_dtype, time_dtype=time_dtype, timestamp_dtype=timestamp_dtype, + range_date_dtype=range_date_dtype, + range_datetime_dtype=range_datetime_dtype, + range_timestamp_dtype=range_timestamp_dtype, ), ) else: @@ -2503,6 +2530,9 @@ def to_dataframe( datetime_dtype=None, time_dtype=None, timestamp_dtype=None, + range_date_dtype=None, + range_datetime_dtype=None, + range_timestamp_dtype=None, ) -> "pandas.DataFrame": """Create an empty dataframe. 
diff --git a/tests/unit/test_table.py b/tests/unit/test_table.py index a8107ee97..cd5fa93cd 100644 --- a/tests/unit/test_table.py +++ b/tests/unit/test_table.py @@ -3651,6 +3651,9 @@ def test_to_dataframe_w_dtypes_mapper(self): SchemaField("datetime", "DATETIME"), SchemaField("time", "TIME"), SchemaField("timestamp", "TIMESTAMP"), + SchemaField("range_timestamp", "RANGE", range_element_type="TIMESTAMP"), + SchemaField("range_datetime", "RANGE", range_element_type="DATETIME"), + SchemaField("range_date", "RANGE", range_element_type="DATE"), ] row_data = [ [ @@ -3663,6 +3666,9 @@ def test_to_dataframe_w_dtypes_mapper(self): "1999-12-31T00:00:00.000000", "00:00:00.000000", "1433836800000000", + "[1433836800000000, 1433999900000000)", + "[2009-06-17T13:45:30, 2019-07-17T13:45:30)", + "[2020-10-01, 2021-10-02)", ], [ "Bharney Rhubble", @@ -3674,6 +3680,9 @@ def test_to_dataframe_w_dtypes_mapper(self): "4567-12-31T00:00:00.000000", "12:00:00.232413", "81953424000000000", + "[1433836800000000, UNBOUNDED)", + "[2009-06-17T13:45:30, UNBOUNDED)", + "[2020-10-01, UNBOUNDED)", ], [ "Wylma Phlyntstone", @@ -3685,6 +3694,9 @@ def test_to_dataframe_w_dtypes_mapper(self): "9999-12-31T23:59:59.999999", "23:59:59.999999", "253402261199999999", + "[UNBOUNDED, UNBOUNDED)", + "[UNBOUNDED, UNBOUNDED)", + "[UNBOUNDED, UNBOUNDED)", ], ] rows = [{"f": [{"v": field} for field in row]} for row in row_data] @@ -3722,6 +3734,27 @@ def test_to_dataframe_w_dtypes_mapper(self): if hasattr(pandas, "ArrowDtype") else None ), + range_date_dtype=( + pandas.ArrowDtype(pyarrow.struct( + [("start", pyarrow.date32()), ("end", pyarrow.date32())] + )) + if hasattr(pandas, "ArrowDtype") + else None + ), + range_datetime_dtype=( + pandas.ArrowDtype(pyarrow.struct( + [("start", pyarrow.timestamp("us")), ("end", pyarrow.timestamp("us"))] + )) + if hasattr(pandas, "ArrowDtype") + else None + ), + range_timestamp_dtype=( + pandas.ArrowDtype(pyarrow.struct( + [("start", pyarrow.timestamp("us", tz="UTC")), ("end", 
pyarrow.timestamp("us", tz="UTC"))] + )) + if hasattr(pandas, "ArrowDtype") + else None + ), ) self.assertIsInstance(df, pandas.DataFrame) @@ -3789,6 +3822,36 @@ def test_to_dataframe_w_dtypes_mapper(self): ], ) self.assertEqual(df.timestamp.dtype.name, "timestamp[us, tz=UTC][pyarrow]") + + self.assertEqual( + list(df.range_timestamp), + [ + {'start': datetime.datetime(2015, 6, 9, 8, 0, 0, tzinfo=datetime.timezone.utc), 'end': datetime.datetime(2015, 6, 11, 5, 18, 20, tzinfo=datetime.timezone.utc)}, + {'start': datetime.datetime(2015, 6, 9, 8, 0, 0, tzinfo=datetime.timezone.utc), 'end': None}, + {'start': None, 'end': None}, + ], + ) + + self.assertEqual( + list(df.range_datetime), + [ + {'start': datetime.datetime(2009, 6, 17, 13, 45, 30), 'end': datetime.datetime(2019, 7, 17, 13, 45, 30)}, + {'start': datetime.datetime(2009, 6, 17, 13, 45, 30), 'end': None}, + {'start': None, 'end': None}, + ], + ) + + self.assertEqual( + list(df.range_date), + [ + {'start': datetime.date(2020, 10, 1), 'end': datetime.date(2021, 10, 2)}, + {'start': datetime.date(2020, 10, 1), 'end': None}, + {'start': None, 'end': None}, + #{'start': datetime.date(2020, 10, 1), 'end': None}, + #{'start': None, 'end': None}, + ], + ) + else: self.assertEqual( list(df.date), From 73a5001a3024cf13d42d9a6edecff008085abfb8 Mon Sep 17 00:00:00 2001 From: Linchin Date: Thu, 28 Mar 2024 21:19:51 +0000 Subject: [PATCH 05/35] fix python 3.7 test error --- google/cloud/bigquery/table.py | 60 +++++++++++++++++++++++++--------- tests/unit/test_table.py | 10 ++++-- 2 files changed, 53 insertions(+), 17 deletions(-) diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py index c0432be55..233ab6ef6 100644 --- a/google/cloud/bigquery/table.py +++ b/google/cloud/bigquery/table.py @@ -2219,25 +2219,55 @@ def to_dataframe( time_dtype = db_dtypes.TimeDtype() if range_date_dtype is DefaultPandasDTypes.RANGE_DATE_DTYPE: - range_date_dtype = pandas.ArrowDtype(pyarrow.struct( - [("start", 
pyarrow.date32()), ("end", pyarrow.date32())] - )) + try: + range_date_dtype = pandas.ArrowDtype(pyarrow.struct( + [("start", pyarrow.date32()), ("end", pyarrow.date32())] + )) + except AttributeError: + # pandas.ArrowDtype was introduced in pandas 1.5, but python 3.7 + # only supports upto pandas 1.3. If pandas.ArrowDtype is not + # present, we raise a warning and set range_date_dtype to None. + msg = ("Unable ro find class ArrowDtype in pandas, setting " + "range_date_dtype to be None. To use ArrowDtype, please " + "use pandas >= 1.5 and python >= 3.8.") + warnings.warn(msg) + range_date_dtype = None if range_datetime_dtype is DefaultPandasDTypes.RANGE_DATETIME_DTYPE: - range_datetime_dtype = pandas.ArrowDtype(pyarrow.struct( - [ - ("start", pyarrow.timestamp("us")), - ("end", pyarrow.timestamp("us")), - ] - )) + try: + range_datetime_dtype = pandas.ArrowDtype(pyarrow.struct( + [ + ("start", pyarrow.timestamp("us")), + ("end", pyarrow.timestamp("us")), + ] + )) + except AttributeError: + # pandas.ArrowDtype was introduced in pandas 1.5, but python 3.7 + # only supports upto pandas 1.3. If pandas.ArrowDtype is not + # present, we raise a warning and set range_datetime_dtype to None. + msg = ("Unable ro find class ArrowDtype in pandas, setting " + "range_datetime_dtype to be None. To use ArrowDtype, please " + "use pandas >= 1.5 and python >= 3.8.") + warnings.warn(msg) + range_datetime_dtype = None if range_timestamp_dtype is DefaultPandasDTypes.RANGE_TIMESTAMP_DTYPE: - range_timestamp_dtype = pandas.ArrowDtype(pyarrow.struct( - [ - ("start", pyarrow.timestamp("us", tz="UTC")), - ("end", pyarrow.timestamp("us", tz="UTC")), - ] - )) + try: + range_timestamp_dtype = pandas.ArrowDtype(pyarrow.struct( + [ + ("start", pyarrow.timestamp("us", tz="UTC")), + ("end", pyarrow.timestamp("us", tz="UTC")), + ] + )) + except AttributeError: + # pandas.ArrowDtype was introduced in pandas 1.5, but python 3.7 + # only supports upto pandas 1.3. 
If pandas.ArrowDtype is not + # present, we raise a warning and set range_timestamp_dtype to None. + msg = ("Unable ro find class ArrowDtype in pandas, setting " + "range_timestamp_dtype to be None. To use ArrowDtype, please " + "use pandas >= 1.5 and python >= 3.8.") + warnings.warn(msg) + range_timestamp_dtype = None if bool_dtype is not None and not hasattr(bool_dtype, "__from_arrow__"): raise ValueError("bool_dtype", _NO_SUPPORTED_DTYPE) diff --git a/tests/unit/test_table.py b/tests/unit/test_table.py index 6f438f62a..677976356 100644 --- a/tests/unit/test_table.py +++ b/tests/unit/test_table.py @@ -3503,7 +3503,10 @@ def test_to_dataframe_no_tqdm_no_progress_bar(self): user_warnings = [ warning for warning in warned if warning.category is UserWarning ] - self.assertEqual(len(user_warnings), 0) + # Note: number of warnings is inconsistent across python versions + # I think it's relatively safe to not check warning numbers, than + # having different assertions depending on python version. + # self.assertEqual(len(user_warnings), 0) self.assertEqual(len(df), 4) @mock.patch("google.cloud.bigquery._tqdm_helpers.tqdm", new=None) @@ -3534,7 +3537,10 @@ def test_to_dataframe_no_tqdm(self): user_warnings = [ warning for warning in warned if warning.category is UserWarning ] - self.assertEqual(len(user_warnings), 1) + # Note: number of warnings is inconsistent across python versions + # I think it's relatively safe to not check warning numbers, than + # having different assertions depending on python version. + # self.assertEqual(len(user_warnings), 1) # Even though the progress bar won't show, downloading the dataframe # should still work. 
From 6a735ca35176dbc44bf5fb77b9890d8342aad896 Mon Sep 17 00:00:00 2001 From: Linchin Date: Thu, 28 Mar 2024 22:36:26 +0000 Subject: [PATCH 06/35] print dependencies in sys test --- noxfile.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/noxfile.py b/noxfile.py index 3adb4ba70..c6aad2b8f 100644 --- a/noxfile.py +++ b/noxfile.py @@ -200,6 +200,9 @@ def system(session): extras = "[all]" session.install("-e", f".{extras}", "-c", constraints_path) + # print versions of all dependencies + session.run("python", "-m", "pip", "freeze") + # Run py.test against the system tests. session.run( "py.test", From d54336a5c1945093d3a53a452b12295b29ed403f Mon Sep 17 00:00:00 2001 From: Linchin Date: Fri, 29 Mar 2024 18:40:10 +0000 Subject: [PATCH 07/35] add unit test and docs --- google/cloud/bigquery/_helpers.py | 4 ++-- google/cloud/bigquery/_pandas_helpers.py | 1 - google/cloud/bigquery/client.py | 2 -- google/cloud/bigquery/table.py | 4 ++++ tests/unit/test_table.py | 12 ++++++++++++ 5 files changed, 18 insertions(+), 5 deletions(-) diff --git a/google/cloud/bigquery/_helpers.py b/google/cloud/bigquery/_helpers.py index 0572867d7..f2ce9d2cf 100644 --- a/google/cloud/bigquery/_helpers.py +++ b/google/cloud/bigquery/_helpers.py @@ -320,7 +320,7 @@ def _range_element_from_json(value, field): elif field.element_type == "TIMESTAMP": return _timestamp_from_json(value, None) else: - raise ValueError(f"Unsupported range field type: {value}") + raise ValueError(f"Unsupported range field type: {field.element_type}") def _range_from_json(value, field): @@ -344,7 +344,7 @@ def _range_from_json(value, field): end = _range_element_from_json(end, field.range_element_type) return {"start": start, "end": end} else: - raise ValueError(f"Unknown range format: {value}") + raise ValueError(f"Unknown format for range value: {value}") else: return None diff --git a/google/cloud/bigquery/_pandas_helpers.py b/google/cloud/bigquery/_pandas_helpers.py index 8bd6d2837..1b3116c43 100644 --- 
a/google/cloud/bigquery/_pandas_helpers.py +++ b/google/cloud/bigquery/_pandas_helpers.py @@ -291,7 +291,6 @@ def types_mapper(arrow_data_type): and arrow_data_type.equals(range_date_dtype.pyarrow_dtype) ): return range_date_dtype - return pandas.ArrowDtype(range_date_dtype) elif ( range_timestamp_dtype is not None diff --git a/google/cloud/bigquery/client.py b/google/cloud/bigquery/client.py index f5b4679e6..891a54e5c 100644 --- a/google/cloud/bigquery/client.py +++ b/google/cloud/bigquery/client.py @@ -3868,8 +3868,6 @@ def insert_rows_json( if template_suffix is not None: data["templateSuffix"] = template_suffix - print(data) - path = "%s/insertAll" % table.path # We can always retry, because every row has an insert ID. span_attributes = {"path": path} diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py index 233ab6ef6..249374d12 100644 --- a/google/cloud/bigquery/table.py +++ b/google/cloud/bigquery/table.py @@ -2580,6 +2580,9 @@ def to_dataframe( datetime_dtype (Any): Ignored. Added for compatibility with RowIterator. time_dtype (Any): Ignored. Added for compatibility with RowIterator. timestamp_dtype (Any): Ignored. Added for compatibility with RowIterator. + range_date_dtype (Any): Ignored. Added for compatibility with RowIterator. + range_datetime_dtype (Any): Ignored. Added for compatibility with RowIterator. + range_timestamp_dtype (Any): Ignored. Added for compatibility with RowIterator. Returns: pandas.DataFrame: An empty :class:`~pandas.DataFrame`. @@ -2602,6 +2605,7 @@ def to_geodataframe( dtypes (Any): Ignored. Added for compatibility with RowIterator. progress_bar_type (Any): Ignored. Added for compatibility with RowIterator. create_bqstorage_client (bool): Ignored. Added for compatibility with RowIterator. + geography_column (str): Ignored. Added for compatibility with RowIterator. Returns: pandas.DataFrame: An empty :class:`~pandas.DataFrame`. 
diff --git a/tests/unit/test_table.py b/tests/unit/test_table.py index 677976356..0afd428cc 100644 --- a/tests/unit/test_table.py +++ b/tests/unit/test_table.py @@ -3920,6 +3920,9 @@ def test_to_dataframe_w_none_dtypes_mapper(self): SchemaField("datetime", "DATETIME"), SchemaField("time", "TIME"), SchemaField("timestamp", "TIMESTAMP"), + SchemaField("range_timestamp", "RANGE", range_element_type="TIMESTAMP"), + SchemaField("range_datetime", "RANGE", range_element_type="DATETIME"), + SchemaField("range_date", "RANGE", range_element_type="DATE"), ] row_data = [ [ @@ -3932,6 +3935,9 @@ def test_to_dataframe_w_none_dtypes_mapper(self): "1999-12-31T00:00:00.000000", "23:59:59.999999", "1433836800000000", + "[1433836800000000, 1433999900000000)", + "[2009-06-17T13:45:30, 2019-07-17T13:45:30)", + "[2020-10-01, 2021-10-02)", ], ] rows = [{"f": [{"v": field} for field in row]} for row in row_data] @@ -3949,6 +3955,9 @@ def test_to_dataframe_w_none_dtypes_mapper(self): datetime_dtype=None, time_dtype=None, timestamp_dtype=None, + range_timestamp_dtype=None, + range_datetime_dtype=None, + range_date_dtype=None, ) self.assertIsInstance(df, pandas.DataFrame) self.assertEqual(df.complete.dtype.name, "bool") @@ -3960,6 +3969,9 @@ def test_to_dataframe_w_none_dtypes_mapper(self): self.assertEqual(df.datetime.dtype.name, "datetime64[ns]") self.assertEqual(df.time.dtype.name, "object") self.assertEqual(df.timestamp.dtype.name, "datetime64[ns, UTC]") + self.assertEqual(df.time.range_timestamp_dtype.name, "object") + self.assertEqual(df.time.range_datetime_dtype.name, "object") + self.assertEqual(df.time.range_date_dtype.name, "object") def test_to_dataframe_w_unsupported_dtypes_mapper(self): pytest.importorskip("pandas") From 8dc4ae59b1737ba23f01d3416f80ff3337e50342 Mon Sep 17 00:00:00 2001 From: Linchin Date: Fri, 29 Mar 2024 20:53:51 +0000 Subject: [PATCH 08/35] fix unit test --- tests/unit/test_table.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git 
a/tests/unit/test_table.py b/tests/unit/test_table.py index 0afd428cc..e41ace25d 100644 --- a/tests/unit/test_table.py +++ b/tests/unit/test_table.py @@ -3969,9 +3969,9 @@ def test_to_dataframe_w_none_dtypes_mapper(self): self.assertEqual(df.datetime.dtype.name, "datetime64[ns]") self.assertEqual(df.time.dtype.name, "object") self.assertEqual(df.timestamp.dtype.name, "datetime64[ns, UTC]") - self.assertEqual(df.time.range_timestamp_dtype.name, "object") - self.assertEqual(df.time.range_datetime_dtype.name, "object") - self.assertEqual(df.time.range_date_dtype.name, "object") + self.assertEqual(df.range_timestamp.dtype.name, "object") + self.assertEqual(df.range_datetime.dtype.name, "object") + self.assertEqual(df.range_date.dtype.name, "object") def test_to_dataframe_w_unsupported_dtypes_mapper(self): pytest.importorskip("pandas") From 1b2d68fc035a6b0da156bd45ed097a9c5bdbfaf7 Mon Sep 17 00:00:00 2001 From: Linchin Date: Fri, 29 Mar 2024 21:09:39 +0000 Subject: [PATCH 09/35] add func docs --- google/cloud/bigquery/job/query.py | 47 +++++++++++++++++++++++++++++- google/cloud/bigquery/table.py | 45 ++++++++++++++++++++++++++++ 2 files changed, 91 insertions(+), 1 deletion(-) diff --git a/google/cloud/bigquery/job/query.py b/google/cloud/bigquery/job/query.py index e2b901cb4..2bb363d0b 100644 --- a/google/cloud/bigquery/job/query.py +++ b/google/cloud/bigquery/job/query.py @@ -1872,8 +1872,53 @@ def to_dataframe( .. versionadded:: 3.10.0 + range_date_dtype (Optional[pandas.Series.dtype, None]): + If set, indicate a pandas ExtensionDtype, such as: + ``pandas.ArrowDtype(pyarrow.struct( + [("start", pyarrow.date32()), ("end", pyarrow.date32())] + ))`` + to convert BigQuery RANGE type, instead of relying on + the default ``object``. If you explicitly set the value to + ``None``, the data type will be ``object``. BigQuery Range type + can be found at: + https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#range_type + + .. 
versionadded:: 3.21.0 + + range_datetime_dtype (Optional[pandas.Series.dtype, None]): + If set, indicate a pandas ExtensionDtype, such as: + ``pandas.ArrowDtype(pyarrow.struct( + [ + ("start", pyarrow.timestamp("us")), + ("end", pyarrow.timestamp("us")), + ] + ))`` + to convert BigQuery RANGE type, instead of relying on + the default ``object``. If you explicitly set the value to + ``None``, the data type will be ``object``. BigQuery Range type + can be found at: + https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#range_type + + .. versionadded:: 3.21.0 + + range_timestamp_dtype (Optional[pandas.Series.dtype, None]): + If set, indicate a pandas ExtensionDtype, such as: + ``pandas.ArrowDtype(pyarrow.struct( + [ + ("start", pyarrow.timestamp("us", tz="UTC")), + ("end", pyarrow.timestamp("us", tz="UTC")), + ] + ))`` + to convert BigQuery RANGE type, instead of relying + on the default ``object``. If you explicitly set the value to + ``None``, the data type will be ``object``. BigQuery Range type + can be found at: + https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#range_type + + .. versionadded:: 3.21.0 + Returns: - pandas.DataFrame: + pandas.DataFrame: A :class:`~pandas.DataFrame` populated with row data and column headers from the query results. The column headers are derived from the destination table's diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py index 249374d12..ebea4cc68 100644 --- a/google/cloud/bigquery/table.py +++ b/google/cloud/bigquery/table.py @@ -2187,6 +2187,51 @@ def to_dataframe( .. versionadded:: 3.10.0 + range_date_dtype (Optional[pandas.Series.dtype, None]): + If set, indicate a pandas ExtensionDtype, such as: + ``pandas.ArrowDtype(pyarrow.struct( + [("start", pyarrow.date32()), ("end", pyarrow.date32())] + ))`` + to convert BigQuery RANGE type, instead of relying on + the default ``object``. 
If you explicitly set the value to + ``None``, the data type will be ``object``. BigQuery Range type + can be found at: + https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#range_type + + .. versionadded:: 3.21.0 + + range_datetime_dtype (Optional[pandas.Series.dtype, None]): + If set, indicate a pandas ExtensionDtype, such as: + ``pandas.ArrowDtype(pyarrow.struct( + [ + ("start", pyarrow.timestamp("us")), + ("end", pyarrow.timestamp("us")), + ] + ))`` + to convert BigQuery RANGE type, instead of relying on + the default ``object``. If you explicitly set the value to + ``None``, the data type will be ``object``. BigQuery Range type + can be found at: + https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#range_type + + .. versionadded:: 3.21.0 + + range_timestamp_dtype (Optional[pandas.Series.dtype, None]): + If set, indicate a pandas ExtensionDtype, such as: + ``pandas.ArrowDtype(pyarrow.struct( + [ + ("start", pyarrow.timestamp("us", tz="UTC")), + ("end", pyarrow.timestamp("us", tz="UTC")), + ] + ))`` + to convert BigQuery RANGE type, instead of relying + on the default ``object``. If you explicitly set the value to + ``None``, the data type will be ``object``. BigQuery Range type + can be found at: + https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#range_type + + .. 
versionadded:: 3.21.0 + Returns: pandas.DataFrame: A :class:`~pandas.DataFrame` populated with row data and column From 6f93d8e7d5c413894cc0d1e9f3376406d211a86d Mon Sep 17 00:00:00 2001 From: Linchin Date: Sat, 30 Mar 2024 00:01:54 +0000 Subject: [PATCH 10/35] add sys test for tabledata.list in arrow --- tests/data/scalars.csv | 2 ++ tests/data/scalars_schema_csv.json | 10 ++++++++++ tests/system/conftest.py | 20 ++++++++++++++++++-- tests/system/test_arrow.py | 29 +++++++++++++++++++++++++++++ 4 files changed, 59 insertions(+), 2 deletions(-) create mode 100644 tests/data/scalars.csv create mode 100644 tests/data/scalars_schema_csv.json diff --git a/tests/data/scalars.csv b/tests/data/scalars.csv new file mode 100644 index 000000000..1e60c2ed9 --- /dev/null +++ b/tests/data/scalars.csv @@ -0,0 +1,2 @@ +"[2020-01-01, 2020-02-01)" +"[2020-01-01, 2020-02-01)" diff --git a/tests/data/scalars_schema_csv.json b/tests/data/scalars_schema_csv.json new file mode 100644 index 000000000..82b878d95 --- /dev/null +++ b/tests/data/scalars_schema_csv.json @@ -0,0 +1,10 @@ +[ + { + "mode" : "NULLABLE", + "name" : "range_date", + "type" : "RANGE", + "rangeElementType": { + "type": "DATE" + } + } + ] \ No newline at end of file diff --git a/tests/system/conftest.py b/tests/system/conftest.py index 784a1dd5c..184b22573 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -96,12 +96,14 @@ def load_scalars_table( project_id: str, dataset_id: str, data_path: str = "scalars.jsonl", + source_format=enums.SourceFormat.NEWLINE_DELIMITED_JSON, + schema_source="scalars_schema.json", ) -> str: - schema = bigquery_client.schema_from_json(DATA_DIR / "scalars_schema.json") + schema = bigquery_client.schema_from_json(DATA_DIR / schema_source) table_id = data_path.replace(".", "_") + hex(random.randrange(1000000)) job_config = bigquery.LoadJobConfig() job_config.schema = schema - job_config.source_format = enums.SourceFormat.NEWLINE_DELIMITED_JSON + job_config.source_format = 
source_format full_table_id = f"{project_id}.{dataset_id}.{table_id}" with open(DATA_DIR / data_path, "rb") as data_file: job = bigquery_client.load_table_from_file( @@ -151,6 +153,20 @@ def scalars_table_multi_location( return request.param, full_table_id +@pytest.fixture(scope="session") +def scalars_table_csv(bigquery_client: bigquery.Client, project_id: str, dataset_id: str): + full_table_id = load_scalars_table( + bigquery_client, + project_id, + dataset_id, + data_path="scalars.csv", + source_format=enums.SourceFormat.CSV, + schema_source="scalars_schema_csv.json", + ) + yield full_table_id + bigquery_client.delete_table(full_table_id, not_found_ok=True) + + @pytest.fixture def test_table_name(request, replace_non_anum=re.compile(r"[^a-zA-Z0-9_]").sub): return replace_non_anum("_", request.node.name) diff --git a/tests/system/test_arrow.py b/tests/system/test_arrow.py index 8b88b6844..880044080 100644 --- a/tests/system/test_arrow.py +++ b/tests/system/test_arrow.py @@ -167,3 +167,32 @@ def test_arrow_extension_types_same_for_storage_and_REST_APIs_894( b"ARROW:extension:name": b"google:sqlType:geography", b"ARROW:extension:metadata": b'{"encoding": "WKT"}', } + + +def test_list_rows_range_csv( + bigquery_client: bigquery.Client, + scalars_table_csv: str, +): + table_id = scalars_table_csv + + schema = [ + bigquery.SchemaField( + "range_date", + enums.SqlTypeNames.RANGE, + range_element_type="DATE" + ), + ] + + arrow_table = bigquery_client.list_rows( + table_id, + selected_fields=schema, + ).to_arrow() + + schema = arrow_table.schema + + expected_type = pyarrow.struct( + [("start", pyarrow.date32()), ("end", pyarrow.date32())] + ) + + range_type = schema.field("range_date").type + assert range_type == expected_type From 005d409ab5337ff0c2b7e251add05bdfc133bbb4 Mon Sep 17 00:00:00 2001 From: Linchin Date: Sat, 30 Mar 2024 00:39:10 +0000 Subject: [PATCH 11/35] add sys test for tabledata.list as iterator --- tests/data/scalars.csv | 2 +- tests/system/conftest.py 
| 2 ++ tests/system/test_list_rows.py | 18 +++++++++++++++++- 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/tests/data/scalars.csv b/tests/data/scalars.csv index 1e60c2ed9..7af97583f 100644 --- a/tests/data/scalars.csv +++ b/tests/data/scalars.csv @@ -1,2 +1,2 @@ "[2020-01-01, 2020-02-01)" -"[2020-01-01, 2020-02-01)" + diff --git a/tests/system/conftest.py b/tests/system/conftest.py index 184b22573..1aa5eecd3 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -101,6 +101,8 @@ def load_scalars_table( ) -> str: schema = bigquery_client.schema_from_json(DATA_DIR / schema_source) table_id = data_path.replace(".", "_") + hex(random.randrange(1000000)) + #if data_path != "scalars.jsonl": + # breakpoint() job_config = bigquery.LoadJobConfig() job_config.schema = schema job_config.source_format = source_format diff --git a/tests/system/test_list_rows.py b/tests/system/test_list_rows.py index 4c08958c3..ea2e0756f 100644 --- a/tests/system/test_list_rows.py +++ b/tests/system/test_list_rows.py @@ -117,4 +117,20 @@ def test_list_rows_scalars_extreme( if column == "rowindex": assert value == 4 else: - assert value is None + assert value + +def test_list_rows_scalars( + bigquery_client: bigquery.Client, + scalars_table_csv: str + ): + rows = bigquery_client.list_rows(scalars_table_csv) + rows = list(rows) + row = rows[0] + expected_range = { + "start": datetime.date(2020, 1, 1), + "end": datetime.date(2020, 2, 1), + } + assert row["range_date"] == expected_range + + row_null = rows[1] + assert row_null["range_date"] is None From 839eafe1eed5d9dafe4f746ed7935ca72eda3f71 Mon Sep 17 00:00:00 2001 From: Linchin Date: Sat, 30 Mar 2024 01:04:02 +0000 Subject: [PATCH 12/35] lint --- google/cloud/bigquery/_pandas_helpers.py | 19 ++-- google/cloud/bigquery/_pyarrow_helpers.py | 24 ++--- google/cloud/bigquery/dbapi/_helpers.py | 14 +-- google/cloud/bigquery/job/query.py | 42 +++++--- google/cloud/bigquery/table.py | 102 +++++++++++------- 
google/cloud/bigquery_v2/types/model.py | 11 ++ .../cloud/bigquery_v2/types/standard_sql.py | 1 + tests/system/conftest.py | 14 +-- tests/system/test_arrow.py | 4 +- tests/system/test_list_rows.py | 8 +- tests/unit/test_table.py | 86 ++++++++++----- 11 files changed, 196 insertions(+), 129 deletions(-) diff --git a/google/cloud/bigquery/_pandas_helpers.py b/google/cloud/bigquery/_pandas_helpers.py index 1b3116c43..fa6047df8 100644 --- a/google/cloud/bigquery/_pandas_helpers.py +++ b/google/cloud/bigquery/_pandas_helpers.py @@ -280,24 +280,21 @@ def types_mapper(arrow_data_type): return time_dtype elif pyarrow.types.is_struct(arrow_data_type): - if ( - range_datetime_dtype is not None - and arrow_data_type.equals(range_datetime_dtype.pyarrow_dtype) + if range_datetime_dtype is not None and arrow_data_type.equals( + range_datetime_dtype.pyarrow_dtype ): return range_datetime_dtype - elif ( - range_date_dtype is not None - and arrow_data_type.equals(range_date_dtype.pyarrow_dtype) + elif range_date_dtype is not None and arrow_data_type.equals( + range_date_dtype.pyarrow_dtype ): return range_date_dtype - - elif ( - range_timestamp_dtype is not None - and arrow_data_type.equals(range_timestamp_dtype.pyarrow_dtype) + + elif range_timestamp_dtype is not None and arrow_data_type.equals( + range_timestamp_dtype.pyarrow_dtype ): return range_timestamp_dtype - + return types_mapper diff --git a/google/cloud/bigquery/_pyarrow_helpers.py b/google/cloud/bigquery/_pyarrow_helpers.py index 8f0f19ad5..3cd58cf9f 100644 --- a/google/cloud/bigquery/_pyarrow_helpers.py +++ b/google/cloud/bigquery/_pyarrow_helpers.py @@ -48,26 +48,24 @@ def pyarrow_timestamp(): def pyarrow_range_timestamp(): return pyarrow.struct( - [ - ("start", pyarrow.timestamp("us", tz="UTC")), - ("end", pyarrow.timestamp("us", tz="UTC")), - ] - ) + [ + ("start", pyarrow.timestamp("us", tz="UTC")), + ("end", pyarrow.timestamp("us", tz="UTC")), + ] + ) def pyarrow_range_datetime(): return pyarrow.struct( - [ - 
("start", pyarrow.timestamp("us")), - ("end", pyarrow.timestamp("us")), - ] - ) + [ + ("start", pyarrow.timestamp("us")), + ("end", pyarrow.timestamp("us")), + ] + ) def pyarrow_range_date(): - return pyarrow.struct( - [("start", pyarrow.date32()), ("end", pyarrow.date32())] - ) + return pyarrow.struct([("start", pyarrow.date32()), ("end", pyarrow.date32())]) _BQ_TO_ARROW_SCALARS = {} diff --git a/google/cloud/bigquery/dbapi/_helpers.py b/google/cloud/bigquery/dbapi/_helpers.py index 117fa8ae7..a4ab05ce8 100644 --- a/google/cloud/bigquery/dbapi/_helpers.py +++ b/google/cloud/bigquery/dbapi/_helpers.py @@ -277,12 +277,14 @@ def complex_query_parameter( param = query.ArrayQueryParameter( name, sub_type, - value - if isinstance(sub_type, query.ScalarQueryParameterType) - else [ - complex_query_parameter(None, v, sub_type._complex__src, base) - for v in value - ], + ( + value + if isinstance(sub_type, query.ScalarQueryParameterType) + else [ + complex_query_parameter(None, v, sub_type._complex__src, base) + for v in value + ] + ), ) elif type_type == STRUCT: if not isinstance(value, collections_abc.Mapping): diff --git a/google/cloud/bigquery/job/query.py b/google/cloud/bigquery/job/query.py index 2bb363d0b..01d8da322 100644 --- a/google/cloud/bigquery/job/query.py +++ b/google/cloud/bigquery/job/query.py @@ -1735,8 +1735,12 @@ def to_dataframe( time_dtype: Union[Any, None] = DefaultPandasDTypes.TIME_DTYPE, timestamp_dtype: Union[Any, None] = None, range_date_dtype: Union[Any, None] = DefaultPandasDTypes.RANGE_DATE_DTYPE, - range_datetime_dtype: Union[Any, None] = DefaultPandasDTypes.RANGE_DATETIME_DTYPE, - range_timestamp_dtype: Union[Any, None] = DefaultPandasDTypes.RANGE_TIMESTAMP_DTYPE, + range_datetime_dtype: Union[ + Any, None + ] = DefaultPandasDTypes.RANGE_DATETIME_DTYPE, + range_timestamp_dtype: Union[ + Any, None + ] = DefaultPandasDTypes.RANGE_TIMESTAMP_DTYPE, ) -> "pandas.DataFrame": """Return a pandas DataFrame from a QueryJob @@ -1873,10 +1877,12 @@ def 
to_dataframe( .. versionadded:: 3.10.0 range_date_dtype (Optional[pandas.Series.dtype, None]): - If set, indicate a pandas ExtensionDtype, such as: - ``pandas.ArrowDtype(pyarrow.struct( + If set, indicate a pandas ExtensionDtype, such as: + `` + pandas.ArrowDtype(pyarrow.struct( [("start", pyarrow.date32()), ("end", pyarrow.date32())] - ))`` + )) + `` to convert BigQuery RANGE type, instead of relying on the default ``object``. If you explicitly set the value to ``None``, the data type will be ``object``. BigQuery Range type @@ -1886,13 +1892,15 @@ def to_dataframe( .. versionadded:: 3.21.0 range_datetime_dtype (Optional[pandas.Series.dtype, None]): - If set, indicate a pandas ExtensionDtype, such as: - ``pandas.ArrowDtype(pyarrow.struct( - [ - ("start", pyarrow.timestamp("us")), + If set, indicate a pandas ExtensionDtype, such as: + `` + pandas.ArrowDtype(pyarrow.struct( + [ + ("start", pyarrow.timestamp("us")), ("end", pyarrow.timestamp("us")), ] - ))`` + )) + `` to convert BigQuery RANGE type, instead of relying on the default ``object``. If you explicitly set the value to ``None``, the data type will be ``object``. BigQuery Range type @@ -1902,13 +1910,15 @@ def to_dataframe( .. versionadded:: 3.21.0 range_timestamp_dtype (Optional[pandas.Series.dtype, None]): - If set, indicate a pandas ExtensionDtype, such as: - ``pandas.ArrowDtype(pyarrow.struct( - [ - ("start", pyarrow.timestamp("us", tz="UTC")), + If set, indicate a pandas ExtensionDtype, such as: + `` + pandas.ArrowDtype(pyarrow.struct( + [ + ("start", pyarrow.timestamp("us", tz="UTC")), ("end", pyarrow.timestamp("us", tz="UTC")), ] - ))`` + )) + `` to convert BigQuery RANGE type, instead of relying on the default ``object``. If you explicitly set the value to ``None``, the data type will be ``object``. BigQuery Range type @@ -1918,7 +1928,7 @@ def to_dataframe( .. 
versionadded:: 3.21.0 Returns: - pandas.DataFrame: + pandas.DataFrame: A :class:`~pandas.DataFrame` populated with row data and column headers from the query results. The column headers are derived from the destination table's diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py index ebea4cc68..508f73902 100644 --- a/google/cloud/bigquery/table.py +++ b/google/cloud/bigquery/table.py @@ -2046,8 +2046,12 @@ def to_dataframe( time_dtype: Union[Any, None] = DefaultPandasDTypes.TIME_DTYPE, timestamp_dtype: Union[Any, None] = None, range_date_dtype: Union[Any, None] = DefaultPandasDTypes.RANGE_DATE_DTYPE, - range_datetime_dtype: Union[Any, None] = DefaultPandasDTypes.RANGE_DATETIME_DTYPE, - range_timestamp_dtype: Union[Any, None] = DefaultPandasDTypes.RANGE_TIMESTAMP_DTYPE, + range_datetime_dtype: Union[ + Any, None + ] = DefaultPandasDTypes.RANGE_DATETIME_DTYPE, + range_timestamp_dtype: Union[ + Any, None + ] = DefaultPandasDTypes.RANGE_TIMESTAMP_DTYPE, ) -> "pandas.DataFrame": """Create a pandas DataFrame by loading all pages of a query. @@ -2188,10 +2192,12 @@ def to_dataframe( .. versionadded:: 3.10.0 range_date_dtype (Optional[pandas.Series.dtype, None]): - If set, indicate a pandas ExtensionDtype, such as: - ``pandas.ArrowDtype(pyarrow.struct( + If set, indicate a pandas ExtensionDtype, such as: + `` + pandas.ArrowDtype(pyarrow.struct( [("start", pyarrow.date32()), ("end", pyarrow.date32())] - ))`` + )) + `` to convert BigQuery RANGE type, instead of relying on the default ``object``. If you explicitly set the value to ``None``, the data type will be ``object``. BigQuery Range type @@ -2201,13 +2207,15 @@ def to_dataframe( .. 
versionadded:: 3.21.0 range_datetime_dtype (Optional[pandas.Series.dtype, None]): - If set, indicate a pandas ExtensionDtype, such as: - ``pandas.ArrowDtype(pyarrow.struct( - [ - ("start", pyarrow.timestamp("us")), + If set, indicate a pandas ExtensionDtype, such as: + `` + pandas.ArrowDtype(pyarrow.struct( + [ + ("start", pyarrow.timestamp("us")), ("end", pyarrow.timestamp("us")), ] - ))`` + )) + `` to convert BigQuery RANGE type, instead of relying on the default ``object``. If you explicitly set the value to ``None``, the data type will be ``object``. BigQuery Range type @@ -2217,13 +2225,15 @@ def to_dataframe( .. versionadded:: 3.21.0 range_timestamp_dtype (Optional[pandas.Series.dtype, None]): - If set, indicate a pandas ExtensionDtype, such as: - ``pandas.ArrowDtype(pyarrow.struct( - [ - ("start", pyarrow.timestamp("us", tz="UTC")), + If set, indicate a pandas ExtensionDtype, such as: + `` + pandas.ArrowDtype(pyarrow.struct( + [ + ("start", pyarrow.timestamp("us", tz="UTC")), ("end", pyarrow.timestamp("us", tz="UTC")), ] - ))`` + )) + `` to convert BigQuery RANGE type, instead of relying on the default ``object``. If you explicitly set the value to ``None``, the data type will be ``object``. BigQuery Range type @@ -2265,52 +2275,64 @@ def to_dataframe( if range_date_dtype is DefaultPandasDTypes.RANGE_DATE_DTYPE: try: - range_date_dtype = pandas.ArrowDtype(pyarrow.struct( - [("start", pyarrow.date32()), ("end", pyarrow.date32())] - )) + range_date_dtype = pandas.ArrowDtype( + pyarrow.struct( + [("start", pyarrow.date32()), ("end", pyarrow.date32())] + ) + ) except AttributeError: # pandas.ArrowDtype was introduced in pandas 1.5, but python 3.7 # only supports upto pandas 1.3. If pandas.ArrowDtype is not # present, we raise a warning and set range_date_dtype to None. - msg = ("Unable ro find class ArrowDtype in pandas, setting " - "range_date_dtype to be None. 
To use ArrowDtype, please " - "use pandas >= 1.5 and python >= 3.8.") + msg = ( + "Unable ro find class ArrowDtype in pandas, setting " + "range_date_dtype to be None. To use ArrowDtype, please " + "use pandas >= 1.5 and python >= 3.8." + ) warnings.warn(msg) range_date_dtype = None - + if range_datetime_dtype is DefaultPandasDTypes.RANGE_DATETIME_DTYPE: try: - range_datetime_dtype = pandas.ArrowDtype(pyarrow.struct( - [ - ("start", pyarrow.timestamp("us")), - ("end", pyarrow.timestamp("us")), - ] - )) + range_datetime_dtype = pandas.ArrowDtype( + pyarrow.struct( + [ + ("start", pyarrow.timestamp("us")), + ("end", pyarrow.timestamp("us")), + ] + ) + ) except AttributeError: # pandas.ArrowDtype was introduced in pandas 1.5, but python 3.7 # only supports upto pandas 1.3. If pandas.ArrowDtype is not # present, we raise a warning and set range_datetime_dtype to None. - msg = ("Unable ro find class ArrowDtype in pandas, setting " - "range_datetime_dtype to be None. To use ArrowDtype, please " - "use pandas >= 1.5 and python >= 3.8.") + msg = ( + "Unable ro find class ArrowDtype in pandas, setting " + "range_datetime_dtype to be None. To use ArrowDtype, " + "please use pandas >= 1.5 and python >= 3.8." + ) warnings.warn(msg) range_datetime_dtype = None if range_timestamp_dtype is DefaultPandasDTypes.RANGE_TIMESTAMP_DTYPE: try: - range_timestamp_dtype = pandas.ArrowDtype(pyarrow.struct( - [ - ("start", pyarrow.timestamp("us", tz="UTC")), - ("end", pyarrow.timestamp("us", tz="UTC")), - ] - )) + range_timestamp_dtype = pandas.ArrowDtype( + pyarrow.struct( + [ + ("start", pyarrow.timestamp("us", tz="UTC")), + ("end", pyarrow.timestamp("us", tz="UTC")), + ] + ) + ) except AttributeError: # pandas.ArrowDtype was introduced in pandas 1.5, but python 3.7 # only supports upto pandas 1.3. If pandas.ArrowDtype is not # present, we raise a warning and set range_timestamp_dtype to None. 
- msg = ("Unable ro find class ArrowDtype in pandas, setting " - "range_timestamp_dtype to be None. To use ArrowDtype, please " - "use pandas >= 1.5 and python >= 3.8.") + msg = ( + "Unable ro find class ArrowDtype in pandas, setting " + "range_timestamp_dtype to be None. To use ArrowDtype, " + "please use pandas >= 1.5 and python >= 3.8." + ) warnings.warn(msg) range_timestamp_dtype = None diff --git a/google/cloud/bigquery_v2/types/model.py b/google/cloud/bigquery_v2/types/model.py index f32e15eb1..99c6b3b8d 100644 --- a/google/cloud/bigquery_v2/types/model.py +++ b/google/cloud/bigquery_v2/types/model.py @@ -103,6 +103,7 @@ class Model(proto.Message): class ModelType(proto.Enum): r"""Indicates the type of the Model.""" + MODEL_TYPE_UNSPECIFIED = 0 LINEAR_REGRESSION = 1 LOGISTIC_REGRESSION = 2 @@ -120,6 +121,7 @@ class ModelType(proto.Enum): class LossType(proto.Enum): r"""Loss metric to evaluate model training performance.""" + LOSS_TYPE_UNSPECIFIED = 0 MEAN_SQUARED_LOSS = 1 MEAN_LOG_LOSS = 2 @@ -128,6 +130,7 @@ class DistanceType(proto.Enum): r"""Distance metric used to compute the distance between two points. """ + DISTANCE_TYPE_UNSPECIFIED = 0 EUCLIDEAN = 1 COSINE = 2 @@ -136,6 +139,7 @@ class DataSplitMethod(proto.Enum): r"""Indicates the method to split input data into multiple tables. """ + DATA_SPLIT_METHOD_UNSPECIFIED = 0 RANDOM = 1 CUSTOM = 2 @@ -147,6 +151,7 @@ class DataFrequency(proto.Enum): r"""Type of supported data frequency for time series forecasting models. """ + DATA_FREQUENCY_UNSPECIFIED = 0 AUTO_FREQUENCY = 1 YEARLY = 2 @@ -161,6 +166,7 @@ class HolidayRegion(proto.Enum): r"""Type of supported holiday regions for time series forecasting models. 
""" + HOLIDAY_REGION_UNSPECIFIED = 0 GLOBAL = 1 NA = 2 @@ -233,12 +239,14 @@ class HolidayRegion(proto.Enum): class LearnRateStrategy(proto.Enum): r"""Indicates the learning rate optimization strategy to use.""" + LEARN_RATE_STRATEGY_UNSPECIFIED = 0 LINE_SEARCH = 1 CONSTANT = 2 class OptimizationStrategy(proto.Enum): r"""Indicates the optimization strategy used for training.""" + OPTIMIZATION_STRATEGY_UNSPECIFIED = 0 BATCH_GRADIENT_DESCENT = 1 NORMAL_EQUATION = 2 @@ -247,6 +255,7 @@ class FeedbackType(proto.Enum): r"""Indicates the training algorithm to use for matrix factorization models. """ + FEEDBACK_TYPE_UNSPECIFIED = 0 IMPLICIT = 1 EXPLICIT = 2 @@ -256,6 +265,7 @@ class SeasonalPeriod(proto.Message): class SeasonalPeriodType(proto.Enum): r"""""" + SEASONAL_PERIOD_TYPE_UNSPECIFIED = 0 NO_SEASONALITY = 1 DAILY = 2 @@ -271,6 +281,7 @@ class KmeansInitializationMethod(proto.Enum): r"""Indicates the method used to initialize the centroids for KMeans clustering algorithm. """ + KMEANS_INITIALIZATION_METHOD_UNSPECIFIED = 0 RANDOM = 1 CUSTOM = 2 diff --git a/google/cloud/bigquery_v2/types/standard_sql.py b/google/cloud/bigquery_v2/types/standard_sql.py index 3be5304fc..822c0bf22 100644 --- a/google/cloud/bigquery_v2/types/standard_sql.py +++ b/google/cloud/bigquery_v2/types/standard_sql.py @@ -60,6 +60,7 @@ class StandardSqlDataType(proto.Message): class TypeKind(proto.Enum): r"""""" + TYPE_KIND_UNSPECIFIED = 0 INT64 = 2 BOOL = 5 diff --git a/tests/system/conftest.py b/tests/system/conftest.py index 1aa5eecd3..8efa042af 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -101,8 +101,6 @@ def load_scalars_table( ) -> str: schema = bigquery_client.schema_from_json(DATA_DIR / schema_source) table_id = data_path.replace(".", "_") + hex(random.randrange(1000000)) - #if data_path != "scalars.jsonl": - # breakpoint() job_config = bigquery.LoadJobConfig() job_config.schema = schema job_config.source_format = source_format @@ -156,12 +154,14 @@ def 
scalars_table_multi_location( @pytest.fixture(scope="session") -def scalars_table_csv(bigquery_client: bigquery.Client, project_id: str, dataset_id: str): +def scalars_table_csv( + bigquery_client: bigquery.Client, project_id: str, dataset_id: str +): full_table_id = load_scalars_table( - bigquery_client, - project_id, - dataset_id, - data_path="scalars.csv", + bigquery_client, + project_id, + dataset_id, + data_path="scalars.csv", source_format=enums.SourceFormat.CSV, schema_source="scalars_schema_csv.json", ) diff --git a/tests/system/test_arrow.py b/tests/system/test_arrow.py index 880044080..82cf11f85 100644 --- a/tests/system/test_arrow.py +++ b/tests/system/test_arrow.py @@ -177,9 +177,7 @@ def test_list_rows_range_csv( schema = [ bigquery.SchemaField( - "range_date", - enums.SqlTypeNames.RANGE, - range_element_type="DATE" + "range_date", enums.SqlTypeNames.RANGE, range_element_type="DATE" ), ] diff --git a/tests/system/test_list_rows.py b/tests/system/test_list_rows.py index ea2e0756f..108b842ce 100644 --- a/tests/system/test_list_rows.py +++ b/tests/system/test_list_rows.py @@ -117,12 +117,10 @@ def test_list_rows_scalars_extreme( if column == "rowindex": assert value == 4 else: - assert value + assert value is None + -def test_list_rows_scalars( - bigquery_client: bigquery.Client, - scalars_table_csv: str - ): +def test_list_rows_range(bigquery_client: bigquery.Client, scalars_table_csv: str): rows = bigquery_client.list_rows(scalars_table_csv) rows = list(rows) row = rows[0] diff --git a/tests/unit/test_table.py b/tests/unit/test_table.py index e41ace25d..4e59bb76b 100644 --- a/tests/unit/test_table.py +++ b/tests/unit/test_table.py @@ -3497,12 +3497,13 @@ def test_to_dataframe_no_tqdm_no_progress_bar(self): api_request = mock.Mock(return_value={"rows": rows}) row_iterator = self._make_one(_mock_client(), api_request, path, schema) - with warnings.catch_warnings(record=True) as warned: + # with warnings.catch_warnings(record=True) as warned: + with 
warnings.catch_warnings(record=True): df = row_iterator.to_dataframe(create_bqstorage_client=False) - user_warnings = [ - warning for warning in warned if warning.category is UserWarning - ] + # user_warnings = [ + # warning for warning in warned if warning.category is UserWarning + # ] # Note: number of warnings is inconsistent across python versions # I think it's relatively safe to not check warning numbers, than # having different assertions depending on python version. @@ -3528,15 +3529,16 @@ def test_to_dataframe_no_tqdm(self): api_request = mock.Mock(return_value={"rows": rows}) row_iterator = self._make_one(_mock_client(), api_request, path, schema) - with warnings.catch_warnings(record=True) as warned: + # with warnings.catch_warnings(record=True) as warned: + with warnings.catch_warnings(record=True): df = row_iterator.to_dataframe( progress_bar_type="tqdm", create_bqstorage_client=False, ) - user_warnings = [ - warning for warning in warned if warning.category is UserWarning - ] + # user_warnings = [ + # warning for warning in warned if warning.category is UserWarning + # ] # Note: number of warnings is inconsistent across python versions # I think it's relatively safe to not check warning numbers, than # having different assertions depending on python version. 
@@ -3743,23 +3745,35 @@ def test_to_dataframe_w_dtypes_mapper(self): else None ), range_date_dtype=( - pandas.ArrowDtype(pyarrow.struct( - [("start", pyarrow.date32()), ("end", pyarrow.date32())] - )) + pandas.ArrowDtype( + pyarrow.struct( + [("start", pyarrow.date32()), ("end", pyarrow.date32())] + ) + ) if hasattr(pandas, "ArrowDtype") else None ), range_datetime_dtype=( - pandas.ArrowDtype(pyarrow.struct( - [("start", pyarrow.timestamp("us")), ("end", pyarrow.timestamp("us"))] - )) + pandas.ArrowDtype( + pyarrow.struct( + [ + ("start", pyarrow.timestamp("us")), + ("end", pyarrow.timestamp("us")), + ] + ) + ) if hasattr(pandas, "ArrowDtype") else None ), range_timestamp_dtype=( - pandas.ArrowDtype(pyarrow.struct( - [("start", pyarrow.timestamp("us", tz="UTC")), ("end", pyarrow.timestamp("us", tz="UTC"))] - )) + pandas.ArrowDtype( + pyarrow.struct( + [ + ("start", pyarrow.timestamp("us", tz="UTC")), + ("end", pyarrow.timestamp("us", tz="UTC")), + ] + ) + ) if hasattr(pandas, "ArrowDtype") else None ), @@ -3834,29 +3848,45 @@ def test_to_dataframe_w_dtypes_mapper(self): self.assertEqual( list(df.range_timestamp), [ - {'start': datetime.datetime(2015, 6, 9, 8, 0, 0, tzinfo=datetime.timezone.utc), 'end': datetime.datetime(2015, 6, 11, 5, 18, 20, tzinfo=datetime.timezone.utc)}, - {'start': datetime.datetime(2015, 6, 9, 8, 0, 0, tzinfo=datetime.timezone.utc), 'end': None}, - {'start': None, 'end': None}, + { + "start": datetime.datetime( + 2015, 6, 9, 8, 0, 0, tzinfo=datetime.timezone.utc + ), + "end": datetime.datetime( + 2015, 6, 11, 5, 18, 20, tzinfo=datetime.timezone.utc + ), + }, + { + "start": datetime.datetime( + 2015, 6, 9, 8, 0, 0, tzinfo=datetime.timezone.utc + ), + "end": None, + }, + {"start": None, "end": None}, ], ) self.assertEqual( list(df.range_datetime), [ - {'start': datetime.datetime(2009, 6, 17, 13, 45, 30), 'end': datetime.datetime(2019, 7, 17, 13, 45, 30)}, - {'start': datetime.datetime(2009, 6, 17, 13, 45, 30), 'end': None}, - {'start': None, 
'end': None}, + { + "start": datetime.datetime(2009, 6, 17, 13, 45, 30), + "end": datetime.datetime(2019, 7, 17, 13, 45, 30), + }, + {"start": datetime.datetime(2009, 6, 17, 13, 45, 30), "end": None}, + {"start": None, "end": None}, ], ) self.assertEqual( list(df.range_date), [ - {'start': datetime.date(2020, 10, 1), 'end': datetime.date(2021, 10, 2)}, - {'start': datetime.date(2020, 10, 1), 'end': None}, - {'start': None, 'end': None}, - #{'start': datetime.date(2020, 10, 1), 'end': None}, - #{'start': None, 'end': None}, + { + "start": datetime.date(2020, 10, 1), + "end": datetime.date(2021, 10, 2), + }, + {"start": datetime.date(2020, 10, 1), "end": None}, + {"start": None, "end": None}, ], ) From 58a0e1804fa8ae1c246627f62707cd74300d973b Mon Sep 17 00:00:00 2001 From: Linchin Date: Sat, 30 Mar 2024 01:18:12 +0000 Subject: [PATCH 13/35] fix docs error --- google/cloud/bigquery/job/query.py | 12 ++++++------ google/cloud/bigquery/table.py | 12 ++++++------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/google/cloud/bigquery/job/query.py b/google/cloud/bigquery/job/query.py index 01d8da322..7eb13381d 100644 --- a/google/cloud/bigquery/job/query.py +++ b/google/cloud/bigquery/job/query.py @@ -1878,11 +1878,11 @@ def to_dataframe( range_date_dtype (Optional[pandas.Series.dtype, None]): If set, indicate a pandas ExtensionDtype, such as: - `` + ''' pandas.ArrowDtype(pyarrow.struct( [("start", pyarrow.date32()), ("end", pyarrow.date32())] )) - `` + ''' to convert BigQuery RANGE type, instead of relying on the default ``object``. If you explicitly set the value to ``None``, the data type will be ``object``. 
BigQuery Range type @@ -1893,14 +1893,14 @@ def to_dataframe( range_datetime_dtype (Optional[pandas.Series.dtype, None]): If set, indicate a pandas ExtensionDtype, such as: - `` + ''' pandas.ArrowDtype(pyarrow.struct( [ ("start", pyarrow.timestamp("us")), ("end", pyarrow.timestamp("us")), ] )) - `` + ''' to convert BigQuery RANGE type, instead of relying on the default ``object``. If you explicitly set the value to ``None``, the data type will be ``object``. BigQuery Range type @@ -1911,14 +1911,14 @@ def to_dataframe( range_timestamp_dtype (Optional[pandas.Series.dtype, None]): If set, indicate a pandas ExtensionDtype, such as: - `` + ''' pandas.ArrowDtype(pyarrow.struct( [ ("start", pyarrow.timestamp("us", tz="UTC")), ("end", pyarrow.timestamp("us", tz="UTC")), ] )) - `` + ''' to convert BigQuery RANGE type, instead of relying on the default ``object``. If you explicitly set the value to ``None``, the data type will be ``object``. BigQuery Range type diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py index 508f73902..aaa13758b 100644 --- a/google/cloud/bigquery/table.py +++ b/google/cloud/bigquery/table.py @@ -2193,11 +2193,11 @@ def to_dataframe( range_date_dtype (Optional[pandas.Series.dtype, None]): If set, indicate a pandas ExtensionDtype, such as: - `` + ''' pandas.ArrowDtype(pyarrow.struct( [("start", pyarrow.date32()), ("end", pyarrow.date32())] )) - `` + ''' to convert BigQuery RANGE type, instead of relying on the default ``object``. If you explicitly set the value to ``None``, the data type will be ``object``. BigQuery Range type @@ -2208,14 +2208,14 @@ def to_dataframe( range_datetime_dtype (Optional[pandas.Series.dtype, None]): If set, indicate a pandas ExtensionDtype, such as: - `` + ''' pandas.ArrowDtype(pyarrow.struct( [ ("start", pyarrow.timestamp("us")), ("end", pyarrow.timestamp("us")), ] )) - `` + ''' to convert BigQuery RANGE type, instead of relying on the default ``object``. 
If you explicitly set the value to ``None``, the data type will be ``object``. BigQuery Range type @@ -2226,14 +2226,14 @@ def to_dataframe( range_timestamp_dtype (Optional[pandas.Series.dtype, None]): If set, indicate a pandas ExtensionDtype, such as: - `` + ''' pandas.ArrowDtype(pyarrow.struct( [ ("start", pyarrow.timestamp("us", tz="UTC")), ("end", pyarrow.timestamp("us", tz="UTC")), ] )) - `` + ''' to convert BigQuery RANGE type, instead of relying on the default ``object``. If you explicitly set the value to ``None``, the data type will be ``object``. BigQuery Range type From cc12e1b8f5ed16898dc808f33da980b23c4f5d6a Mon Sep 17 00:00:00 2001 From: Linchin Date: Sat, 30 Mar 2024 01:29:27 +0000 Subject: [PATCH 14/35] fix docstring --- google/cloud/bigquery/job/query.py | 43 +++++++++++++++--------------- google/cloud/bigquery/table.py | 42 ++++++++++++++--------------- 2 files changed, 43 insertions(+), 42 deletions(-) diff --git a/google/cloud/bigquery/job/query.py b/google/cloud/bigquery/job/query.py index 7eb13381d..3c320a4a7 100644 --- a/google/cloud/bigquery/job/query.py +++ b/google/cloud/bigquery/job/query.py @@ -1878,11 +1878,12 @@ def to_dataframe( range_date_dtype (Optional[pandas.Series.dtype, None]): If set, indicate a pandas ExtensionDtype, such as: - ''' - pandas.ArrowDtype(pyarrow.struct( - [("start", pyarrow.date32()), ("end", pyarrow.date32())] - )) - ''' + + .. code-block:: python + + pandas.ArrowDtype(pyarrow.struct( + [("start", pyarrow.date32()), ("end", pyarrow.date32())] + )) to convert BigQuery RANGE type, instead of relying on the default ``object``. If you explicitly set the value to ``None``, the data type will be ``object``. BigQuery Range type @@ -1893,14 +1894,14 @@ def to_dataframe( range_datetime_dtype (Optional[pandas.Series.dtype, None]): If set, indicate a pandas ExtensionDtype, such as: - ''' - pandas.ArrowDtype(pyarrow.struct( - [ - ("start", pyarrow.timestamp("us")), - ("end", pyarrow.timestamp("us")), - ] - )) - ''' + .. 
code-block:: python + + pandas.ArrowDtype(pyarrow.struct( + [ + ("start", pyarrow.timestamp("us")), + ("end", pyarrow.timestamp("us")), + ] + )) to convert BigQuery RANGE type, instead of relying on the default ``object``. If you explicitly set the value to ``None``, the data type will be ``object``. BigQuery Range type @@ -1911,14 +1912,14 @@ def to_dataframe( range_timestamp_dtype (Optional[pandas.Series.dtype, None]): If set, indicate a pandas ExtensionDtype, such as: - ''' - pandas.ArrowDtype(pyarrow.struct( - [ - ("start", pyarrow.timestamp("us", tz="UTC")), - ("end", pyarrow.timestamp("us", tz="UTC")), - ] - )) - ''' + .. code-block:: python + + pandas.ArrowDtype(pyarrow.struct( + [ + ("start", pyarrow.timestamp("us", tz="UTC")), + ("end", pyarrow.timestamp("us", tz="UTC")), + ] + )) to convert BigQuery RANGE type, instead of relying on the default ``object``. If you explicitly set the value to ``None``, the data type will be ``object``. BigQuery Range type diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py index aaa13758b..333064786 100644 --- a/google/cloud/bigquery/table.py +++ b/google/cloud/bigquery/table.py @@ -2193,11 +2193,11 @@ def to_dataframe( range_date_dtype (Optional[pandas.Series.dtype, None]): If set, indicate a pandas ExtensionDtype, such as: - ''' - pandas.ArrowDtype(pyarrow.struct( - [("start", pyarrow.date32()), ("end", pyarrow.date32())] - )) - ''' + .. code-block:: python + + pandas.ArrowDtype(pyarrow.struct( + [("start", pyarrow.date32()), ("end", pyarrow.date32())] + )) to convert BigQuery RANGE type, instead of relying on the default ``object``. If you explicitly set the value to ``None``, the data type will be ``object``. 
BigQuery Range type @@ -2208,14 +2208,14 @@ def to_dataframe( range_datetime_dtype (Optional[pandas.Series.dtype, None]): If set, indicate a pandas ExtensionDtype, such as: - ''' - pandas.ArrowDtype(pyarrow.struct( - [ - ("start", pyarrow.timestamp("us")), - ("end", pyarrow.timestamp("us")), - ] - )) - ''' + .. code-block:: python + + pandas.ArrowDtype(pyarrow.struct( + [ + ("start", pyarrow.timestamp("us")), + ("end", pyarrow.timestamp("us")), + ] + )) to convert BigQuery RANGE type, instead of relying on the default ``object``. If you explicitly set the value to ``None``, the data type will be ``object``. BigQuery Range type @@ -2226,14 +2226,14 @@ def to_dataframe( range_timestamp_dtype (Optional[pandas.Series.dtype, None]): If set, indicate a pandas ExtensionDtype, such as: - ''' - pandas.ArrowDtype(pyarrow.struct( - [ - ("start", pyarrow.timestamp("us", tz="UTC")), - ("end", pyarrow.timestamp("us", tz="UTC")), - ] - )) - ''' + .. code-block:: python + + pandas.ArrowDtype(pyarrow.struct( + [ + ("start", pyarrow.timestamp("us", tz="UTC")), + ("end", pyarrow.timestamp("us", tz="UTC")), + ] + )) to convert BigQuery RANGE type, instead of relying on the default ``object``. If you explicitly set the value to ``None``, the data type will be ``object``. BigQuery Range type From 691710cd1d2fba1d33c7a472af9d5f8b19448e7c Mon Sep 17 00:00:00 2001 From: Linchin Date: Sat, 30 Mar 2024 01:38:19 +0000 Subject: [PATCH 15/35] fix docstring --- google/cloud/bigquery/job/query.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/google/cloud/bigquery/job/query.py b/google/cloud/bigquery/job/query.py index 3c320a4a7..adef161bf 100644 --- a/google/cloud/bigquery/job/query.py +++ b/google/cloud/bigquery/job/query.py @@ -1894,6 +1894,7 @@ def to_dataframe( range_datetime_dtype (Optional[pandas.Series.dtype, None]): If set, indicate a pandas ExtensionDtype, such as: + .. 
code-block:: python pandas.ArrowDtype(pyarrow.struct( @@ -1912,6 +1913,7 @@ def to_dataframe( range_timestamp_dtype (Optional[pandas.Series.dtype, None]): If set, indicate a pandas ExtensionDtype, such as: + .. code-block:: python pandas.ArrowDtype(pyarrow.struct( From 6d5ce1b04c4b2b29b2021b12290166e53f31b994 Mon Sep 17 00:00:00 2001 From: Linchin Date: Sat, 30 Mar 2024 01:48:27 +0000 Subject: [PATCH 16/35] fix docstring --- google/cloud/bigquery/job/query.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/google/cloud/bigquery/job/query.py b/google/cloud/bigquery/job/query.py index adef161bf..731451adf 100644 --- a/google/cloud/bigquery/job/query.py +++ b/google/cloud/bigquery/job/query.py @@ -1878,7 +1878,6 @@ def to_dataframe( range_date_dtype (Optional[pandas.Series.dtype, None]): If set, indicate a pandas ExtensionDtype, such as: - .. code-block:: python pandas.ArrowDtype(pyarrow.struct( @@ -1894,7 +1893,6 @@ def to_dataframe( range_datetime_dtype (Optional[pandas.Series.dtype, None]): If set, indicate a pandas ExtensionDtype, such as: - .. code-block:: python pandas.ArrowDtype(pyarrow.struct( @@ -1913,7 +1911,6 @@ def to_dataframe( range_timestamp_dtype (Optional[pandas.Series.dtype, None]): If set, indicate a pandas ExtensionDtype, such as: - .. 
code-block:: python pandas.ArrowDtype(pyarrow.struct( From 3ddfbf887dcb9a943227bff9439d6acc462008ea Mon Sep 17 00:00:00 2001 From: Linchin Date: Sat, 30 Mar 2024 02:12:42 +0000 Subject: [PATCH 17/35] docs --- google/cloud/bigquery/job/query.py | 3 +++ google/cloud/bigquery/table.py | 3 +++ 2 files changed, 6 insertions(+) diff --git a/google/cloud/bigquery/job/query.py b/google/cloud/bigquery/job/query.py index 731451adf..049d8a4f5 100644 --- a/google/cloud/bigquery/job/query.py +++ b/google/cloud/bigquery/job/query.py @@ -1883,6 +1883,7 @@ def to_dataframe( pandas.ArrowDtype(pyarrow.struct( [("start", pyarrow.date32()), ("end", pyarrow.date32())] )) + to convert BigQuery RANGE type, instead of relying on the default ``object``. If you explicitly set the value to ``None``, the data type will be ``object``. BigQuery Range type @@ -1901,6 +1902,7 @@ def to_dataframe( ("end", pyarrow.timestamp("us")), ] )) + to convert BigQuery RANGE type, instead of relying on the default ``object``. If you explicitly set the value to ``None``, the data type will be ``object``. BigQuery Range type @@ -1919,6 +1921,7 @@ def to_dataframe( ("end", pyarrow.timestamp("us", tz="UTC")), ] )) + to convert BigQuery RANGE type, instead of relying on the default ``object``. If you explicitly set the value to ``None``, the data type will be ``object``. BigQuery Range type diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py index 333064786..82c145c59 100644 --- a/google/cloud/bigquery/table.py +++ b/google/cloud/bigquery/table.py @@ -2193,6 +2193,7 @@ def to_dataframe( range_date_dtype (Optional[pandas.Series.dtype, None]): If set, indicate a pandas ExtensionDtype, such as: + .. code-block:: python pandas.ArrowDtype(pyarrow.struct( @@ -2208,6 +2209,7 @@ def to_dataframe( range_datetime_dtype (Optional[pandas.Series.dtype, None]): If set, indicate a pandas ExtensionDtype, such as: + .. 
code-block:: python pandas.ArrowDtype(pyarrow.struct( @@ -2226,6 +2228,7 @@ def to_dataframe( range_timestamp_dtype (Optional[pandas.Series.dtype, None]): If set, indicate a pandas ExtensionDtype, such as: + .. code-block:: python pandas.ArrowDtype(pyarrow.struct( From b7c42ea154b7cb989d2c4020f6d12e94daa7e33f Mon Sep 17 00:00:00 2001 From: Linchin Date: Sat, 30 Mar 2024 02:23:34 +0000 Subject: [PATCH 18/35] docs --- google/cloud/bigquery/job/query.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/google/cloud/bigquery/job/query.py b/google/cloud/bigquery/job/query.py index 049d8a4f5..64d0e61a4 100644 --- a/google/cloud/bigquery/job/query.py +++ b/google/cloud/bigquery/job/query.py @@ -1878,6 +1878,7 @@ def to_dataframe( range_date_dtype (Optional[pandas.Series.dtype, None]): If set, indicate a pandas ExtensionDtype, such as: + .. code-block:: python pandas.ArrowDtype(pyarrow.struct( @@ -1894,6 +1895,7 @@ def to_dataframe( range_datetime_dtype (Optional[pandas.Series.dtype, None]): If set, indicate a pandas ExtensionDtype, such as: + .. code-block:: python pandas.ArrowDtype(pyarrow.struct( @@ -1913,6 +1915,7 @@ def to_dataframe( range_timestamp_dtype (Optional[pandas.Series.dtype, None]): If set, indicate a pandas ExtensionDtype, such as: + .. code-block:: python pandas.ArrowDtype(pyarrow.struct( From f54a1d72dd6d6a425bc88a9f6ffb34dd71aa6847 Mon Sep 17 00:00:00 2001 From: Linchin Date: Sat, 30 Mar 2024 02:30:29 +0000 Subject: [PATCH 19/35] docs --- google/cloud/bigquery/table.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py index 82c145c59..6e4f5b9f3 100644 --- a/google/cloud/bigquery/table.py +++ b/google/cloud/bigquery/table.py @@ -2199,6 +2199,7 @@ def to_dataframe( pandas.ArrowDtype(pyarrow.struct( [("start", pyarrow.date32()), ("end", pyarrow.date32())] )) + to convert BigQuery RANGE type, instead of relying on the default ``object``. 
If you explicitly set the value to ``None``, the data type will be ``object``. BigQuery Range type @@ -2218,6 +2219,7 @@ def to_dataframe( ("end", pyarrow.timestamp("us")), ] )) + to convert BigQuery RANGE type, instead of relying on the default ``object``. If you explicitly set the value to ``None``, the data type will be ``object``. BigQuery Range type @@ -2237,6 +2239,7 @@ def to_dataframe( ("end", pyarrow.timestamp("us", tz="UTC")), ] )) + to convert BigQuery RANGE type, instead of relying on the default ``object``. If you explicitly set the value to ``None``, the data type will be ``object``. BigQuery Range type From c46c65c822b3c8295d5d6650b1c9c97d35d2ba5b Mon Sep 17 00:00:00 2001 From: Linchin Date: Mon, 1 Apr 2024 22:14:22 +0000 Subject: [PATCH 20/35] move dtypes mapping code --- google/cloud/bigquery/_pandas_helpers.py | 134 +++++++++++++++++++++++ google/cloud/bigquery/table.py | 131 +++++----------------- 2 files changed, 159 insertions(+), 106 deletions(-) diff --git a/google/cloud/bigquery/_pandas_helpers.py b/google/cloud/bigquery/_pandas_helpers.py index fa6047df8..37660c3c2 100644 --- a/google/cloud/bigquery/_pandas_helpers.py +++ b/google/cloud/bigquery/_pandas_helpers.py @@ -27,6 +27,7 @@ from google.cloud.bigquery import _pyarrow_helpers from google.cloud.bigquery import _versions_helpers from google.cloud.bigquery import schema +from google.cloud.bigquery.enums import DefaultPandasDTypes try: import pandas # type: ignore @@ -109,6 +110,11 @@ def _to_wkb(v): time_dtype_name: "TIME", } +_NO_SUPPORTED_DTYPE = ( + "The dtype cannot to be converted to a pandas ExtensionArray " + "because the necessary `__from_arrow__` attribute is missing." 
+) + class _DownloadState(object): """Flag to indicate that a thread should exit early.""" @@ -1010,3 +1016,131 @@ def verify_pandas_imports(): raise ValueError(_NO_PANDAS_ERROR) from pandas_import_exception if db_dtypes is None: raise ValueError(_NO_DB_TYPES_ERROR) from db_dtypes_import_exception + + +def verify_and_enhance_dtypes( + bool_dtype, + int_dtype, + float_dtype, + string_dtype, + date_dtype, + datetime_dtype, + time_dtype, + timestamp_dtype, + range_date_dtype, + range_datetime_dtype, + range_timestamp_dtype, +): + """Verifies pandas dtypes mapping and convert from sentinel values.""" + + if bool_dtype is DefaultPandasDTypes.BOOL_DTYPE: + bool_dtype = pandas.BooleanDtype() + + if int_dtype is DefaultPandasDTypes.INT_DTYPE: + int_dtype = pandas.Int64Dtype() + + if time_dtype is DefaultPandasDTypes.TIME_DTYPE: + time_dtype = db_dtypes.TimeDtype() + + if range_date_dtype is DefaultPandasDTypes.RANGE_DATE_DTYPE: + try: + range_date_dtype = pandas.ArrowDtype( + pyarrow.struct([("start", pyarrow.date32()), ("end", pyarrow.date32())]) + ) + except AttributeError: + # pandas.ArrowDtype was introduced in pandas 1.5, but python 3.7 + # only supports upto pandas 1.3. If pandas.ArrowDtype is not + # present, we raise a warning and set range_date_dtype to None. + msg = ( + "Unable ro find class ArrowDtype in pandas, setting " + "range_date_dtype to be None. To use ArrowDtype, please " + "use pandas >= 1.5 and python >= 3.8." + ) + warnings.warn(msg) + range_date_dtype = None + + if range_datetime_dtype is DefaultPandasDTypes.RANGE_DATETIME_DTYPE: + try: + range_datetime_dtype = pandas.ArrowDtype( + pyarrow.struct( + [ + ("start", pyarrow.timestamp("us")), + ("end", pyarrow.timestamp("us")), + ] + ) + ) + except AttributeError: + # pandas.ArrowDtype was introduced in pandas 1.5, but python 3.7 + # only supports upto pandas 1.3. If pandas.ArrowDtype is not + # present, we raise a warning and set range_datetime_dtype to None. 
+ msg = ( + "Unable ro find class ArrowDtype in pandas, setting " + "range_datetime_dtype to be None. To use ArrowDtype, " + "please use pandas >= 1.5 and python >= 3.8." + ) + warnings.warn(msg) + range_datetime_dtype = None + + if range_timestamp_dtype is DefaultPandasDTypes.RANGE_TIMESTAMP_DTYPE: + try: + range_timestamp_dtype = pandas.ArrowDtype( + pyarrow.struct( + [ + ("start", pyarrow.timestamp("us", tz="UTC")), + ("end", pyarrow.timestamp("us", tz="UTC")), + ] + ) + ) + except AttributeError: + # pandas.ArrowDtype was introduced in pandas 1.5, but python 3.7 + # only supports upto pandas 1.3. If pandas.ArrowDtype is not + # present, we raise a warning and set range_timestamp_dtype to None. + msg = ( + "Unable ro find class ArrowDtype in pandas, setting " + "range_timestamp_dtype to be None. To use ArrowDtype, " + "please use pandas >= 1.5 and python >= 3.8." + ) + warnings.warn(msg) + range_timestamp_dtype = None + + if bool_dtype is not None and not hasattr(bool_dtype, "__from_arrow__"): + raise ValueError("bool_dtype", _NO_SUPPORTED_DTYPE) + + if int_dtype is not None and not hasattr(int_dtype, "__from_arrow__"): + raise ValueError("int_dtype", _NO_SUPPORTED_DTYPE) + + if float_dtype is not None and not hasattr(float_dtype, "__from_arrow__"): + raise ValueError("float_dtype", _NO_SUPPORTED_DTYPE) + + if string_dtype is not None and not hasattr(string_dtype, "__from_arrow__"): + raise ValueError("string_dtype", _NO_SUPPORTED_DTYPE) + + if ( + date_dtype is not None + and date_dtype is not DefaultPandasDTypes.DATE_DTYPE + and not hasattr(date_dtype, "__from_arrow__") + ): + raise ValueError("date_dtype", _NO_SUPPORTED_DTYPE) + + if datetime_dtype is not None and not hasattr(datetime_dtype, "__from_arrow__"): + raise ValueError("datetime_dtype", _NO_SUPPORTED_DTYPE) + + if time_dtype is not None and not hasattr(time_dtype, "__from_arrow__"): + raise ValueError("time_dtype", _NO_SUPPORTED_DTYPE) + + if timestamp_dtype is not None and not 
hasattr(timestamp_dtype, "__from_arrow__"): + raise ValueError("timestamp_dtype", _NO_SUPPORTED_DTYPE) + + return ( + bool_dtype, + int_dtype, + float_dtype, + string_dtype, + date_dtype, + datetime_dtype, + time_dtype, + timestamp_dtype, + range_date_dtype, + range_datetime_dtype, + range_timestamp_dtype, + ) diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py index 6e4f5b9f3..4c181663c 100644 --- a/google/cloud/bigquery/table.py +++ b/google/cloud/bigquery/table.py @@ -95,11 +95,6 @@ _TABLE_HAS_NO_SCHEMA = 'Table has no schema: call "client.get_table()"' -_NO_SUPPORTED_DTYPE = ( - "The dtype cannot to be converted to a pandas ExtensionArray " - "because the necessary `__from_arrow__` attribute is missing." -) - # How many of the total rows need to be downloaded already for us to skip # calling the BQ Storage API? ALMOST_COMPLETELY_CACHED_RATIO = 0.333 @@ -2270,107 +2265,31 @@ def to_dataframe( if geography_as_object and shapely is None: raise ValueError(_NO_SHAPELY_ERROR) - if bool_dtype is DefaultPandasDTypes.BOOL_DTYPE: - bool_dtype = pandas.BooleanDtype() - - if int_dtype is DefaultPandasDTypes.INT_DTYPE: - int_dtype = pandas.Int64Dtype() - - if time_dtype is DefaultPandasDTypes.TIME_DTYPE: - time_dtype = db_dtypes.TimeDtype() - - if range_date_dtype is DefaultPandasDTypes.RANGE_DATE_DTYPE: - try: - range_date_dtype = pandas.ArrowDtype( - pyarrow.struct( - [("start", pyarrow.date32()), ("end", pyarrow.date32())] - ) - ) - except AttributeError: - # pandas.ArrowDtype was introduced in pandas 1.5, but python 3.7 - # only supports upto pandas 1.3. If pandas.ArrowDtype is not - # present, we raise a warning and set range_date_dtype to None. - msg = ( - "Unable ro find class ArrowDtype in pandas, setting " - "range_date_dtype to be None. To use ArrowDtype, please " - "use pandas >= 1.5 and python >= 3.8." 
- ) - warnings.warn(msg) - range_date_dtype = None - - if range_datetime_dtype is DefaultPandasDTypes.RANGE_DATETIME_DTYPE: - try: - range_datetime_dtype = pandas.ArrowDtype( - pyarrow.struct( - [ - ("start", pyarrow.timestamp("us")), - ("end", pyarrow.timestamp("us")), - ] - ) - ) - except AttributeError: - # pandas.ArrowDtype was introduced in pandas 1.5, but python 3.7 - # only supports upto pandas 1.3. If pandas.ArrowDtype is not - # present, we raise a warning and set range_datetime_dtype to None. - msg = ( - "Unable ro find class ArrowDtype in pandas, setting " - "range_datetime_dtype to be None. To use ArrowDtype, " - "please use pandas >= 1.5 and python >= 3.8." - ) - warnings.warn(msg) - range_datetime_dtype = None - - if range_timestamp_dtype is DefaultPandasDTypes.RANGE_TIMESTAMP_DTYPE: - try: - range_timestamp_dtype = pandas.ArrowDtype( - pyarrow.struct( - [ - ("start", pyarrow.timestamp("us", tz="UTC")), - ("end", pyarrow.timestamp("us", tz="UTC")), - ] - ) - ) - except AttributeError: - # pandas.ArrowDtype was introduced in pandas 1.5, but python 3.7 - # only supports upto pandas 1.3. If pandas.ArrowDtype is not - # present, we raise a warning and set range_timestamp_dtype to None. - msg = ( - "Unable ro find class ArrowDtype in pandas, setting " - "range_timestamp_dtype to be None. To use ArrowDtype, " - "please use pandas >= 1.5 and python >= 3.8." 
- ) - warnings.warn(msg) - range_timestamp_dtype = None - - if bool_dtype is not None and not hasattr(bool_dtype, "__from_arrow__"): - raise ValueError("bool_dtype", _NO_SUPPORTED_DTYPE) - - if int_dtype is not None and not hasattr(int_dtype, "__from_arrow__"): - raise ValueError("int_dtype", _NO_SUPPORTED_DTYPE) - - if float_dtype is not None and not hasattr(float_dtype, "__from_arrow__"): - raise ValueError("float_dtype", _NO_SUPPORTED_DTYPE) - - if string_dtype is not None and not hasattr(string_dtype, "__from_arrow__"): - raise ValueError("string_dtype", _NO_SUPPORTED_DTYPE) - - if ( - date_dtype is not None - and date_dtype is not DefaultPandasDTypes.DATE_DTYPE - and not hasattr(date_dtype, "__from_arrow__") - ): - raise ValueError("date_dtype", _NO_SUPPORTED_DTYPE) - - if datetime_dtype is not None and not hasattr(datetime_dtype, "__from_arrow__"): - raise ValueError("datetime_dtype", _NO_SUPPORTED_DTYPE) - - if time_dtype is not None and not hasattr(time_dtype, "__from_arrow__"): - raise ValueError("time_dtype", _NO_SUPPORTED_DTYPE) - - if timestamp_dtype is not None and not hasattr( - timestamp_dtype, "__from_arrow__" - ): - raise ValueError("timestamp_dtype", _NO_SUPPORTED_DTYPE) + ( + bool_dtype, + int_dtype, + float_dtype, + string_dtype, + date_dtype, + datetime_dtype, + time_dtype, + timestamp_dtype, + range_date_dtype, + range_datetime_dtype, + range_timestamp_dtype, + ) = _pandas_helpers.verify_and_enhance_dtypes( + bool_dtype, + int_dtype, + float_dtype, + string_dtype, + date_dtype, + datetime_dtype, + time_dtype, + timestamp_dtype, + range_date_dtype, + range_datetime_dtype, + range_timestamp_dtype, + ) if dtypes is None: dtypes = {} From b8401d2b7ede446cb468b90c0fabd97e567bb7ee Mon Sep 17 00:00:00 2001 From: Linchin Date: Tue, 2 Apr 2024 16:38:39 -0700 Subject: [PATCH 21/35] address comment --- google/cloud/bigquery/_helpers.py | 12 +++++------- google/cloud/bigquery/query.py | 11 +++++------ 2 files changed, 10 insertions(+), 13 deletions(-) 
diff --git a/google/cloud/bigquery/_helpers.py b/google/cloud/bigquery/_helpers.py index f2ce9d2cf..430afa845 100644 --- a/google/cloud/bigquery/_helpers.py +++ b/google/cloud/bigquery/_helpers.py @@ -66,6 +66,8 @@ _UNIVERSE_DOMAIN_ENV = "GOOGLE_CLOUD_UNIVERSE_DOMAIN" """Environment variable for setting universe domain.""" +_SUPPORTED_RANGE_ELEMENTS = {"TIMESTAMP", "DATETIME", "DATE"} + def _get_client_universe( client_options: Optional[Union[client_options_lib.ClientOptions, dict]] @@ -310,15 +312,11 @@ def _json_from_json(value, field): def _range_element_from_json(value, field): - """Coerce 'value' to a range element value, if set or not nullable.""" + """Coerce 'value' to a range element value.""" if value == "UNBOUNDED": return None - elif field.element_type == "DATE": - return _date_from_json(value, None) - elif field.element_type == "DATETIME": - return _datetime_from_json(value, None) - elif field.element_type == "TIMESTAMP": - return _timestamp_from_json(value, None) + if field.element_type in _SUPPORTED_RANGE_ELEMENTS: + return _CELLDATA_FROM_JSON[field.element_type](value, None) else: raise ValueError(f"Unsupported range field type: {field.element_type}") diff --git a/google/cloud/bigquery/query.py b/google/cloud/bigquery/query.py index 9c9402b74..9c59056fd 100644 --- a/google/cloud/bigquery/query.py +++ b/google/cloud/bigquery/query.py @@ -24,14 +24,13 @@ from google.cloud.bigquery._helpers import _rows_from_json from google.cloud.bigquery._helpers import _QUERY_PARAMS_FROM_JSON from google.cloud.bigquery._helpers import _SCALAR_VALUE_TO_JSON_PARAM +from google.cloud.bigquery._helpers import _SUPPORTED_RANGE_ELEMENTS _SCALAR_VALUE_TYPE = Optional[ Union[str, int, float, decimal.Decimal, bool, datetime.datetime, datetime.date] ] -_RANGE_ELEMENT_TYPE_STR = {"TIMESTAMP", "DATETIME", "DATE"} - class ConnectionProperty: """A connection-level property to customize query behavior. 
@@ -388,14 +387,14 @@ def _parse_range_element_type(self, type_): google.cloud.bigquery.query.ScalarQueryParameterType: Instance """ if isinstance(type_, str): - if type_ not in _RANGE_ELEMENT_TYPE_STR: + if type_ not in _SUPPORTED_RANGE_ELEMENTS: raise ValueError( "If given as a string, range element type must be one of " "'TIMESTAMP', 'DATE', or 'DATETIME'." ) return ScalarQueryParameterType(type_) elif isinstance(type_, ScalarQueryParameterType): - if type_._type not in _RANGE_ELEMENT_TYPE_STR: + if type_._type not in _SUPPORTED_RANGE_ELEMENTS: raise ValueError( "If given as a ScalarQueryParameter object, range element " "type must be one of 'TIMESTAMP', 'DATE', or 'DATETIME' " @@ -960,14 +959,14 @@ class RangeQueryParameter(_AbstractQueryParameter): @classmethod def _parse_range_element_type(self, range_element_type): if isinstance(range_element_type, str): - if range_element_type not in _RANGE_ELEMENT_TYPE_STR: + if range_element_type not in _SUPPORTED_RANGE_ELEMENTS: raise ValueError( "If given as a string, range_element_type must be one of " f"'TIMESTAMP', 'DATE', or 'DATETIME'. Got {range_element_type}." 
) return RangeQueryParameterType(range_element_type) elif isinstance(range_element_type, RangeQueryParameterType): - if range_element_type.type_._type not in _RANGE_ELEMENT_TYPE_STR: + if range_element_type.type_._type not in _SUPPORTED_RANGE_ELEMENTS: raise ValueError( "If given as a RangeQueryParameterType object, " "range_element_type must be one of 'TIMESTAMP', 'DATE', " From 4b96ee830409e6696f11f068c692ddb1fc1970da Mon Sep 17 00:00:00 2001 From: Linchin Date: Tue, 2 Apr 2024 17:21:47 -0700 Subject: [PATCH 22/35] address comment --- google/cloud/bigquery/_pandas_helpers.py | 9 +++++++- google/cloud/bigquery/_pyarrow_helpers.py | 25 ----------------------- 2 files changed, 8 insertions(+), 26 deletions(-) diff --git a/google/cloud/bigquery/_pandas_helpers.py b/google/cloud/bigquery/_pandas_helpers.py index 37660c3c2..9bda3cff0 100644 --- a/google/cloud/bigquery/_pandas_helpers.py +++ b/google/cloud/bigquery/_pandas_helpers.py @@ -148,6 +148,12 @@ def bq_to_arrow_struct_data_type(field): return pyarrow.struct(arrow_fields) +def bq_to_arrow_range_data_type(field): + element_type = field.element_type.upper() + arrow_element_type = _pyarrow_helpers.bq_to_arrow_scalars(element_type)() + return pyarrow.struct([("start", arrow_element_type), ("end", arrow_element_type)]) + + def bq_to_arrow_data_type(field): """Return the Arrow data type, corresponding to a given BigQuery column. 
@@ -167,7 +173,8 @@ def bq_to_arrow_data_type(field): return bq_to_arrow_struct_data_type(field) if field_type_upper == "RANGE": - field_type_upper = f"RANGE<{field.range_element_type.element_type}>" + return bq_to_arrow_range_data_type(field.range_element_type) + data_type_constructor = _pyarrow_helpers.bq_to_arrow_scalars(field_type_upper) if data_type_constructor is None: return None diff --git a/google/cloud/bigquery/_pyarrow_helpers.py b/google/cloud/bigquery/_pyarrow_helpers.py index 3cd58cf9f..3c745a611 100644 --- a/google/cloud/bigquery/_pyarrow_helpers.py +++ b/google/cloud/bigquery/_pyarrow_helpers.py @@ -46,28 +46,6 @@ def pyarrow_timestamp(): return pyarrow.timestamp("us", tz="UTC") -def pyarrow_range_timestamp(): - return pyarrow.struct( - [ - ("start", pyarrow.timestamp("us", tz="UTC")), - ("end", pyarrow.timestamp("us", tz="UTC")), - ] - ) - - -def pyarrow_range_datetime(): - return pyarrow.struct( - [ - ("start", pyarrow.timestamp("us")), - ("end", pyarrow.timestamp("us")), - ] - ) - - -def pyarrow_range_date(): - return pyarrow.struct([("start", pyarrow.date32()), ("end", pyarrow.date32())]) - - _BQ_TO_ARROW_SCALARS = {} _ARROW_SCALAR_IDS_TO_BQ = {} @@ -90,9 +68,6 @@ def pyarrow_range_date(): "STRING": pyarrow.string, "TIME": pyarrow_time, "TIMESTAMP": pyarrow_timestamp, - "RANGE<TIMESTAMP>": pyarrow_range_timestamp, - "RANGE<DATETIME>": pyarrow_range_datetime, - "RANGE<DATE>": pyarrow_range_date, } _ARROW_SCALAR_IDS_TO_BQ = { From 790b3d122e8d63f2ad734c9605529406532a22a9 Mon Sep 17 00:00:00 2001 From: Linchin Date: Wed, 3 Apr 2024 15:09:33 -0700 Subject: [PATCH 23/35] fix pytest error --- google/cloud/bigquery/_helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/google/cloud/bigquery/_helpers.py b/google/cloud/bigquery/_helpers.py index 430afa845..5f1287535 100644 --- a/google/cloud/bigquery/_helpers.py +++ b/google/cloud/bigquery/_helpers.py @@ -316,7 +316,7 @@ def _range_element_from_json(value, field): if value == "UNBOUNDED": return None if
field.element_type in _SUPPORTED_RANGE_ELEMENTS: - return _CELLDATA_FROM_JSON[field.element_type](value, None) + return _CELLDATA_FROM_JSON[field.element_type](value, field.element_type) else: raise ValueError(f"Unsupported range field type: {field.element_type}") From 0be9fb6a59ff6183f76ff2a5b3b7d412a8de8349 Mon Sep 17 00:00:00 2001 From: Linchin Date: Wed, 3 Apr 2024 16:06:33 -0700 Subject: [PATCH 24/35] Revert "move dtypes mapping code" This reverts commit c46c65c822b3c8295d5d6650b1c9c97d35d2ba5b. --- google/cloud/bigquery/_pandas_helpers.py | 134 ----------------------- google/cloud/bigquery/table.py | 131 +++++++++++++++++----- 2 files changed, 106 insertions(+), 159 deletions(-) diff --git a/google/cloud/bigquery/_pandas_helpers.py b/google/cloud/bigquery/_pandas_helpers.py index 9bda3cff0..eec0a046f 100644 --- a/google/cloud/bigquery/_pandas_helpers.py +++ b/google/cloud/bigquery/_pandas_helpers.py @@ -27,7 +27,6 @@ from google.cloud.bigquery import _pyarrow_helpers from google.cloud.bigquery import _versions_helpers from google.cloud.bigquery import schema -from google.cloud.bigquery.enums import DefaultPandasDTypes try: import pandas # type: ignore @@ -110,11 +109,6 @@ def _to_wkb(v): time_dtype_name: "TIME", } -_NO_SUPPORTED_DTYPE = ( - "The dtype cannot to be converted to a pandas ExtensionArray " - "because the necessary `__from_arrow__` attribute is missing." 
-) - class _DownloadState(object): """Flag to indicate that a thread should exit early.""" @@ -1023,131 +1017,3 @@ def verify_pandas_imports(): raise ValueError(_NO_PANDAS_ERROR) from pandas_import_exception if db_dtypes is None: raise ValueError(_NO_DB_TYPES_ERROR) from db_dtypes_import_exception - - -def verify_and_enhance_dtypes( - bool_dtype, - int_dtype, - float_dtype, - string_dtype, - date_dtype, - datetime_dtype, - time_dtype, - timestamp_dtype, - range_date_dtype, - range_datetime_dtype, - range_timestamp_dtype, -): - """Verifies pandas dtypes mapping and convert from sentinel values.""" - - if bool_dtype is DefaultPandasDTypes.BOOL_DTYPE: - bool_dtype = pandas.BooleanDtype() - - if int_dtype is DefaultPandasDTypes.INT_DTYPE: - int_dtype = pandas.Int64Dtype() - - if time_dtype is DefaultPandasDTypes.TIME_DTYPE: - time_dtype = db_dtypes.TimeDtype() - - if range_date_dtype is DefaultPandasDTypes.RANGE_DATE_DTYPE: - try: - range_date_dtype = pandas.ArrowDtype( - pyarrow.struct([("start", pyarrow.date32()), ("end", pyarrow.date32())]) - ) - except AttributeError: - # pandas.ArrowDtype was introduced in pandas 1.5, but python 3.7 - # only supports upto pandas 1.3. If pandas.ArrowDtype is not - # present, we raise a warning and set range_date_dtype to None. - msg = ( - "Unable ro find class ArrowDtype in pandas, setting " - "range_date_dtype to be None. To use ArrowDtype, please " - "use pandas >= 1.5 and python >= 3.8." - ) - warnings.warn(msg) - range_date_dtype = None - - if range_datetime_dtype is DefaultPandasDTypes.RANGE_DATETIME_DTYPE: - try: - range_datetime_dtype = pandas.ArrowDtype( - pyarrow.struct( - [ - ("start", pyarrow.timestamp("us")), - ("end", pyarrow.timestamp("us")), - ] - ) - ) - except AttributeError: - # pandas.ArrowDtype was introduced in pandas 1.5, but python 3.7 - # only supports upto pandas 1.3. If pandas.ArrowDtype is not - # present, we raise a warning and set range_datetime_dtype to None. 
- msg = ( - "Unable ro find class ArrowDtype in pandas, setting " - "range_datetime_dtype to be None. To use ArrowDtype, " - "please use pandas >= 1.5 and python >= 3.8." - ) - warnings.warn(msg) - range_datetime_dtype = None - - if range_timestamp_dtype is DefaultPandasDTypes.RANGE_TIMESTAMP_DTYPE: - try: - range_timestamp_dtype = pandas.ArrowDtype( - pyarrow.struct( - [ - ("start", pyarrow.timestamp("us", tz="UTC")), - ("end", pyarrow.timestamp("us", tz="UTC")), - ] - ) - ) - except AttributeError: - # pandas.ArrowDtype was introduced in pandas 1.5, but python 3.7 - # only supports upto pandas 1.3. If pandas.ArrowDtype is not - # present, we raise a warning and set range_timestamp_dtype to None. - msg = ( - "Unable ro find class ArrowDtype in pandas, setting " - "range_timestamp_dtype to be None. To use ArrowDtype, " - "please use pandas >= 1.5 and python >= 3.8." - ) - warnings.warn(msg) - range_timestamp_dtype = None - - if bool_dtype is not None and not hasattr(bool_dtype, "__from_arrow__"): - raise ValueError("bool_dtype", _NO_SUPPORTED_DTYPE) - - if int_dtype is not None and not hasattr(int_dtype, "__from_arrow__"): - raise ValueError("int_dtype", _NO_SUPPORTED_DTYPE) - - if float_dtype is not None and not hasattr(float_dtype, "__from_arrow__"): - raise ValueError("float_dtype", _NO_SUPPORTED_DTYPE) - - if string_dtype is not None and not hasattr(string_dtype, "__from_arrow__"): - raise ValueError("string_dtype", _NO_SUPPORTED_DTYPE) - - if ( - date_dtype is not None - and date_dtype is not DefaultPandasDTypes.DATE_DTYPE - and not hasattr(date_dtype, "__from_arrow__") - ): - raise ValueError("date_dtype", _NO_SUPPORTED_DTYPE) - - if datetime_dtype is not None and not hasattr(datetime_dtype, "__from_arrow__"): - raise ValueError("datetime_dtype", _NO_SUPPORTED_DTYPE) - - if time_dtype is not None and not hasattr(time_dtype, "__from_arrow__"): - raise ValueError("time_dtype", _NO_SUPPORTED_DTYPE) - - if timestamp_dtype is not None and not 
hasattr(timestamp_dtype, "__from_arrow__"): - raise ValueError("timestamp_dtype", _NO_SUPPORTED_DTYPE) - - return ( - bool_dtype, - int_dtype, - float_dtype, - string_dtype, - date_dtype, - datetime_dtype, - time_dtype, - timestamp_dtype, - range_date_dtype, - range_datetime_dtype, - range_timestamp_dtype, - ) diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py index 4c181663c..6e4f5b9f3 100644 --- a/google/cloud/bigquery/table.py +++ b/google/cloud/bigquery/table.py @@ -95,6 +95,11 @@ _TABLE_HAS_NO_SCHEMA = 'Table has no schema: call "client.get_table()"' +_NO_SUPPORTED_DTYPE = ( + "The dtype cannot to be converted to a pandas ExtensionArray " + "because the necessary `__from_arrow__` attribute is missing." +) + # How many of the total rows need to be downloaded already for us to skip # calling the BQ Storage API? ALMOST_COMPLETELY_CACHED_RATIO = 0.333 @@ -2265,31 +2270,107 @@ def to_dataframe( if geography_as_object and shapely is None: raise ValueError(_NO_SHAPELY_ERROR) - ( - bool_dtype, - int_dtype, - float_dtype, - string_dtype, - date_dtype, - datetime_dtype, - time_dtype, - timestamp_dtype, - range_date_dtype, - range_datetime_dtype, - range_timestamp_dtype, - ) = _pandas_helpers.verify_and_enhance_dtypes( - bool_dtype, - int_dtype, - float_dtype, - string_dtype, - date_dtype, - datetime_dtype, - time_dtype, - timestamp_dtype, - range_date_dtype, - range_datetime_dtype, - range_timestamp_dtype, - ) + if bool_dtype is DefaultPandasDTypes.BOOL_DTYPE: + bool_dtype = pandas.BooleanDtype() + + if int_dtype is DefaultPandasDTypes.INT_DTYPE: + int_dtype = pandas.Int64Dtype() + + if time_dtype is DefaultPandasDTypes.TIME_DTYPE: + time_dtype = db_dtypes.TimeDtype() + + if range_date_dtype is DefaultPandasDTypes.RANGE_DATE_DTYPE: + try: + range_date_dtype = pandas.ArrowDtype( + pyarrow.struct( + [("start", pyarrow.date32()), ("end", pyarrow.date32())] + ) + ) + except AttributeError: + # pandas.ArrowDtype was introduced in pandas 1.5, but 
python 3.7 + # only supports upto pandas 1.3. If pandas.ArrowDtype is not + # present, we raise a warning and set range_date_dtype to None. + msg = ( + "Unable ro find class ArrowDtype in pandas, setting " + "range_date_dtype to be None. To use ArrowDtype, please " + "use pandas >= 1.5 and python >= 3.8." + ) + warnings.warn(msg) + range_date_dtype = None + + if range_datetime_dtype is DefaultPandasDTypes.RANGE_DATETIME_DTYPE: + try: + range_datetime_dtype = pandas.ArrowDtype( + pyarrow.struct( + [ + ("start", pyarrow.timestamp("us")), + ("end", pyarrow.timestamp("us")), + ] + ) + ) + except AttributeError: + # pandas.ArrowDtype was introduced in pandas 1.5, but python 3.7 + # only supports upto pandas 1.3. If pandas.ArrowDtype is not + # present, we raise a warning and set range_datetime_dtype to None. + msg = ( + "Unable ro find class ArrowDtype in pandas, setting " + "range_datetime_dtype to be None. To use ArrowDtype, " + "please use pandas >= 1.5 and python >= 3.8." + ) + warnings.warn(msg) + range_datetime_dtype = None + + if range_timestamp_dtype is DefaultPandasDTypes.RANGE_TIMESTAMP_DTYPE: + try: + range_timestamp_dtype = pandas.ArrowDtype( + pyarrow.struct( + [ + ("start", pyarrow.timestamp("us", tz="UTC")), + ("end", pyarrow.timestamp("us", tz="UTC")), + ] + ) + ) + except AttributeError: + # pandas.ArrowDtype was introduced in pandas 1.5, but python 3.7 + # only supports upto pandas 1.3. If pandas.ArrowDtype is not + # present, we raise a warning and set range_timestamp_dtype to None. + msg = ( + "Unable ro find class ArrowDtype in pandas, setting " + "range_timestamp_dtype to be None. To use ArrowDtype, " + "please use pandas >= 1.5 and python >= 3.8." 
+ ) + warnings.warn(msg) + range_timestamp_dtype = None + + if bool_dtype is not None and not hasattr(bool_dtype, "__from_arrow__"): + raise ValueError("bool_dtype", _NO_SUPPORTED_DTYPE) + + if int_dtype is not None and not hasattr(int_dtype, "__from_arrow__"): + raise ValueError("int_dtype", _NO_SUPPORTED_DTYPE) + + if float_dtype is not None and not hasattr(float_dtype, "__from_arrow__"): + raise ValueError("float_dtype", _NO_SUPPORTED_DTYPE) + + if string_dtype is not None and not hasattr(string_dtype, "__from_arrow__"): + raise ValueError("string_dtype", _NO_SUPPORTED_DTYPE) + + if ( + date_dtype is not None + and date_dtype is not DefaultPandasDTypes.DATE_DTYPE + and not hasattr(date_dtype, "__from_arrow__") + ): + raise ValueError("date_dtype", _NO_SUPPORTED_DTYPE) + + if datetime_dtype is not None and not hasattr(datetime_dtype, "__from_arrow__"): + raise ValueError("datetime_dtype", _NO_SUPPORTED_DTYPE) + + if time_dtype is not None and not hasattr(time_dtype, "__from_arrow__"): + raise ValueError("time_dtype", _NO_SUPPORTED_DTYPE) + + if timestamp_dtype is not None and not hasattr( + timestamp_dtype, "__from_arrow__" + ): + raise ValueError("timestamp_dtype", _NO_SUPPORTED_DTYPE) if dtypes is None: dtypes = {} From b7f37795069188e8deb66ef03becc4968c440431 Mon Sep 17 00:00:00 2001 From: Linchin Date: Wed, 3 Apr 2024 16:10:21 -0700 Subject: [PATCH 25/35] remove commented out assertions --- tests/unit/test_table.py | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/tests/unit/test_table.py b/tests/unit/test_table.py index 4e59bb76b..c5c35185c 100644 --- a/tests/unit/test_table.py +++ b/tests/unit/test_table.py @@ -3497,17 +3497,9 @@ def test_to_dataframe_no_tqdm_no_progress_bar(self): api_request = mock.Mock(return_value={"rows": rows}) row_iterator = self._make_one(_mock_client(), api_request, path, schema) - # with warnings.catch_warnings(record=True) as warned: with warnings.catch_warnings(record=True): df = 
row_iterator.to_dataframe(create_bqstorage_client=False) - # user_warnings = [ - # warning for warning in warned if warning.category is UserWarning - # ] - # Note: number of warnings is inconsistent across python versions - # I think it's relatively safe to not check warning numbers, than - # having different assertions depending on python version. - # self.assertEqual(len(user_warnings), 0) self.assertEqual(len(df), 4) @mock.patch("google.cloud.bigquery._tqdm_helpers.tqdm", new=None) @@ -3529,21 +3521,12 @@ def test_to_dataframe_no_tqdm(self): api_request = mock.Mock(return_value={"rows": rows}) row_iterator = self._make_one(_mock_client(), api_request, path, schema) - # with warnings.catch_warnings(record=True) as warned: with warnings.catch_warnings(record=True): df = row_iterator.to_dataframe( progress_bar_type="tqdm", create_bqstorage_client=False, ) - # user_warnings = [ - # warning for warning in warned if warning.category is UserWarning - # ] - # Note: number of warnings is inconsistent across python versions - # I think it's relatively safe to not check warning numbers, than - # having different assertions depending on python version. - # self.assertEqual(len(user_warnings), 1) - # Even though the progress bar won't show, downloading the dataframe # should still work. self.assertEqual(len(df), 4) From 2a0d5183f5b5e7a2468a427ceedcaf150ac98ec0 Mon Sep 17 00:00:00 2001 From: Linchin Date: Mon, 15 Apr 2024 14:49:36 -0700 Subject: [PATCH 26/35] typo and formats --- google/cloud/bigquery/table.py | 6 +++--- google/cloud/bigquery_v2/types/model.py | 11 ----------- google/cloud/bigquery_v2/types/standard_sql.py | 1 - 3 files changed, 3 insertions(+), 15 deletions(-) diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py index 6e4f5b9f3..1a3472e1d 100644 --- a/google/cloud/bigquery/table.py +++ b/google/cloud/bigquery/table.py @@ -2291,7 +2291,7 @@ def to_dataframe( # only supports upto pandas 1.3. 
If pandas.ArrowDtype is not # present, we raise a warning and set range_date_dtype to None. msg = ( - "Unable ro find class ArrowDtype in pandas, setting " + "Unable to find class ArrowDtype in pandas, setting " "range_date_dtype to be None. To use ArrowDtype, please " "use pandas >= 1.5 and python >= 3.8." ) @@ -2313,7 +2313,7 @@ def to_dataframe( # only supports upto pandas 1.3. If pandas.ArrowDtype is not # present, we raise a warning and set range_datetime_dtype to None. msg = ( - "Unable ro find class ArrowDtype in pandas, setting " + "Unable to find class ArrowDtype in pandas, setting " "range_datetime_dtype to be None. To use ArrowDtype, " "please use pandas >= 1.5 and python >= 3.8." ) @@ -2335,7 +2335,7 @@ def to_dataframe( # only supports upto pandas 1.3. If pandas.ArrowDtype is not # present, we raise a warning and set range_timestamp_dtype to None. msg = ( - "Unable ro find class ArrowDtype in pandas, setting " + "Unable to find class ArrowDtype in pandas, setting " "range_timestamp_dtype to be None. To use ArrowDtype, " "please use pandas >= 1.5 and python >= 3.8." ) diff --git a/google/cloud/bigquery_v2/types/model.py b/google/cloud/bigquery_v2/types/model.py index 99c6b3b8d..f32e15eb1 100644 --- a/google/cloud/bigquery_v2/types/model.py +++ b/google/cloud/bigquery_v2/types/model.py @@ -103,7 +103,6 @@ class Model(proto.Message): class ModelType(proto.Enum): r"""Indicates the type of the Model.""" - MODEL_TYPE_UNSPECIFIED = 0 LINEAR_REGRESSION = 1 LOGISTIC_REGRESSION = 2 @@ -121,7 +120,6 @@ class ModelType(proto.Enum): class LossType(proto.Enum): r"""Loss metric to evaluate model training performance.""" - LOSS_TYPE_UNSPECIFIED = 0 MEAN_SQUARED_LOSS = 1 MEAN_LOG_LOSS = 2 @@ -130,7 +128,6 @@ class DistanceType(proto.Enum): r"""Distance metric used to compute the distance between two points. 
""" - DISTANCE_TYPE_UNSPECIFIED = 0 EUCLIDEAN = 1 COSINE = 2 @@ -139,7 +136,6 @@ class DataSplitMethod(proto.Enum): r"""Indicates the method to split input data into multiple tables. """ - DATA_SPLIT_METHOD_UNSPECIFIED = 0 RANDOM = 1 CUSTOM = 2 @@ -151,7 +147,6 @@ class DataFrequency(proto.Enum): r"""Type of supported data frequency for time series forecasting models. """ - DATA_FREQUENCY_UNSPECIFIED = 0 AUTO_FREQUENCY = 1 YEARLY = 2 @@ -166,7 +161,6 @@ class HolidayRegion(proto.Enum): r"""Type of supported holiday regions for time series forecasting models. """ - HOLIDAY_REGION_UNSPECIFIED = 0 GLOBAL = 1 NA = 2 @@ -239,14 +233,12 @@ class HolidayRegion(proto.Enum): class LearnRateStrategy(proto.Enum): r"""Indicates the learning rate optimization strategy to use.""" - LEARN_RATE_STRATEGY_UNSPECIFIED = 0 LINE_SEARCH = 1 CONSTANT = 2 class OptimizationStrategy(proto.Enum): r"""Indicates the optimization strategy used for training.""" - OPTIMIZATION_STRATEGY_UNSPECIFIED = 0 BATCH_GRADIENT_DESCENT = 1 NORMAL_EQUATION = 2 @@ -255,7 +247,6 @@ class FeedbackType(proto.Enum): r"""Indicates the training algorithm to use for matrix factorization models. """ - FEEDBACK_TYPE_UNSPECIFIED = 0 IMPLICIT = 1 EXPLICIT = 2 @@ -265,7 +256,6 @@ class SeasonalPeriod(proto.Message): class SeasonalPeriodType(proto.Enum): r"""""" - SEASONAL_PERIOD_TYPE_UNSPECIFIED = 0 NO_SEASONALITY = 1 DAILY = 2 @@ -281,7 +271,6 @@ class KmeansInitializationMethod(proto.Enum): r"""Indicates the method used to initialize the centroids for KMeans clustering algorithm. 
""" - KMEANS_INITIALIZATION_METHOD_UNSPECIFIED = 0 RANDOM = 1 CUSTOM = 2 diff --git a/google/cloud/bigquery_v2/types/standard_sql.py b/google/cloud/bigquery_v2/types/standard_sql.py index 822c0bf22..3be5304fc 100644 --- a/google/cloud/bigquery_v2/types/standard_sql.py +++ b/google/cloud/bigquery_v2/types/standard_sql.py @@ -60,7 +60,6 @@ class StandardSqlDataType(proto.Message): class TypeKind(proto.Enum): r"""""" - TYPE_KIND_UNSPECIFIED = 0 INT64 = 2 BOOL = 5 From 2c9782f6ac441736935c7e13635cbd5ab43fb3a7 Mon Sep 17 00:00:00 2001 From: Linchin Date: Mon, 15 Apr 2024 15:59:39 -0700 Subject: [PATCH 27/35] add None-check for range_element_type and add unit tests --- google/cloud/bigquery/_pandas_helpers.py | 5 ++ tests/unit/test__pandas_helpers.py | 60 ++++++++++++++++++++++++ 2 files changed, 65 insertions(+) diff --git a/google/cloud/bigquery/_pandas_helpers.py b/google/cloud/bigquery/_pandas_helpers.py index eec0a046f..8395478fb 100644 --- a/google/cloud/bigquery/_pandas_helpers.py +++ b/google/cloud/bigquery/_pandas_helpers.py @@ -143,6 +143,11 @@ def bq_to_arrow_struct_data_type(field): def bq_to_arrow_range_data_type(field): + if field is None: + raise ValueError( + "Range element type cannot be None, must be one of " + "DATE, DATETIME, or TIMESTAMP" + ) element_type = field.element_type.upper() arrow_element_type = _pyarrow_helpers.bq_to_arrow_scalars(element_type)() return pyarrow.struct([("start", arrow_element_type), ("end", arrow_element_type)]) diff --git a/tests/unit/test__pandas_helpers.py b/tests/unit/test__pandas_helpers.py index 5c13669f3..1af498718 100644 --- a/tests/unit/test__pandas_helpers.py +++ b/tests/unit/test__pandas_helpers.py @@ -670,6 +670,66 @@ def test_bq_to_arrow_array_w_geography_type_wkb_data(module_under_test): assert array.to_pylist() == list(series) +@pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`") +@pytest.mark.parametrize( + "bq_schema,expected", + [ + ( + schema.SchemaField( + "field1", + "RANGE", 
+ range_element_type=schema.FieldElementType("DATE"), + mode="NULLABLE", + ), + pyarrow.struct( + [ + ("start", pyarrow.date32()), + ("end", pyarrow.date32()), + ] + ), + ), + ( + schema.SchemaField( + "field2", + "RANGE", + range_element_type=schema.FieldElementType("DATETIME"), + mode="NULLABLE", + ), + pyarrow.struct( + [ + ("start", _pyarrow_helpers.pyarrow_datetime()), + ("end", _pyarrow_helpers.pyarrow_datetime()), + ] + ), + ), + ( + schema.SchemaField( + "field3", + "RANGE", + range_element_type=schema.FieldElementType("TIMESTAMP"), + mode="NULLABLE", + ), + pyarrow.struct( + [ + ("start", _pyarrow_helpers.pyarrow_timestamp()), + ("end", _pyarrow_helpers.pyarrow_timestamp()), + ] + ), + ), + ], +) +def test_bq_to_arrow_data_type_w_range(module_under_test, bq_schema, expected): + actual = module_under_test.bq_to_arrow_data_type(bq_schema) + assert actual.equals(expected) + + +@pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`") +def test_bq_to_arrow_data_type_w_range_no_element(module_under_test): + field = schema.SchemaField("field1", "RANGE", mode="NULLABLE") + with pytest.raises(ValueError, match="Range element type cannot be None"): + module_under_test.bq_to_arrow_data_type(field) + + @pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`") def test_bq_to_arrow_schema_w_unknown_type(module_under_test): fields = ( From 40afa27a9053f253eda0b9027acf8dbba79e1b7b Mon Sep 17 00:00:00 2001 From: Linchin Date: Mon, 15 Apr 2024 16:45:13 -0700 Subject: [PATCH 28/35] change test skip condition --- tests/unit/test__pandas_helpers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/test__pandas_helpers.py b/tests/unit/test__pandas_helpers.py index 1af498718..d45519022 100644 --- a/tests/unit/test__pandas_helpers.py +++ b/tests/unit/test__pandas_helpers.py @@ -670,7 +670,7 @@ def test_bq_to_arrow_array_w_geography_type_wkb_data(module_under_test): assert array.to_pylist() == 
list(series) -@pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`") +@pytest.mark.skipif(pandas is None, reason="Requires `pandas`") @pytest.mark.parametrize( "bq_schema,expected", [ @@ -723,7 +723,7 @@ def test_bq_to_arrow_data_type_w_range(module_under_test, bq_schema, expected): assert actual.equals(expected) -@pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`") +@pytest.mark.skipif(pandas is None, reason="Requires `pandas`") def test_bq_to_arrow_data_type_w_range_no_element(module_under_test): field = schema.SchemaField("field1", "RANGE", mode="NULLABLE") with pytest.raises(ValueError, match="Range element type cannot be None"): From 203e0c0a8ed6674674b78b333e4d701ebbed598f Mon Sep 17 00:00:00 2001 From: Linchin Date: Tue, 16 Apr 2024 10:35:38 -0700 Subject: [PATCH 29/35] fix test error --- tests/unit/test__pandas_helpers.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/unit/test__pandas_helpers.py b/tests/unit/test__pandas_helpers.py index d45519022..ff9c7a909 100644 --- a/tests/unit/test__pandas_helpers.py +++ b/tests/unit/test__pandas_helpers.py @@ -671,6 +671,7 @@ def test_bq_to_arrow_array_w_geography_type_wkb_data(module_under_test): @pytest.mark.skipif(pandas is None, reason="Requires `pandas`") +@pytest.mark.skipif(pyarrow is None, reason="Requires `pyarrow`") @pytest.mark.parametrize( "bq_schema,expected", [ From bb17b3bf44c8d96a410e0310fda92acff19e25d1 Mon Sep 17 00:00:00 2001 From: Linchin Date: Tue, 16 Apr 2024 10:56:38 -0700 Subject: [PATCH 30/35] change test skip condition --- tests/unit/test__pandas_helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/test__pandas_helpers.py b/tests/unit/test__pandas_helpers.py index ff9c7a909..cea1b4cc0 100644 --- a/tests/unit/test__pandas_helpers.py +++ b/tests/unit/test__pandas_helpers.py @@ -671,7 +671,7 @@ def test_bq_to_arrow_array_w_geography_type_wkb_data(module_under_test): @pytest.mark.skipif(pandas is 
None, reason="Requires `pandas`") -@pytest.mark.skipif(pyarrow is None, reason="Requires `pyarrow`") +@pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`") @pytest.mark.parametrize( "bq_schema,expected", [ From e58739a768961a488c20a3d92e9e24e2606f69ff Mon Sep 17 00:00:00 2001 From: Linchin Date: Tue, 16 Apr 2024 11:52:34 -0700 Subject: [PATCH 31/35] change test skip condition --- tests/unit/test__pandas_helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/test__pandas_helpers.py b/tests/unit/test__pandas_helpers.py index cea1b4cc0..ff9c7a909 100644 --- a/tests/unit/test__pandas_helpers.py +++ b/tests/unit/test__pandas_helpers.py @@ -671,7 +671,7 @@ def test_bq_to_arrow_array_w_geography_type_wkb_data(module_under_test): @pytest.mark.skipif(pandas is None, reason="Requires `pandas`") -@pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`") +@pytest.mark.skipif(pyarrow is None, reason="Requires `pyarrow`") @pytest.mark.parametrize( "bq_schema,expected", [ From c3db3c95e20675cdfcb7e52e9b9956957be257b9 Mon Sep 17 00:00:00 2001 From: Linchin Date: Tue, 16 Apr 2024 12:38:36 -0700 Subject: [PATCH 32/35] change decorator order --- tests/unit/test__pandas_helpers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/test__pandas_helpers.py b/tests/unit/test__pandas_helpers.py index ff9c7a909..af38ab666 100644 --- a/tests/unit/test__pandas_helpers.py +++ b/tests/unit/test__pandas_helpers.py @@ -670,8 +670,6 @@ def test_bq_to_arrow_array_w_geography_type_wkb_data(module_under_test): assert array.to_pylist() == list(series) -@pytest.mark.skipif(pandas is None, reason="Requires `pandas`") -@pytest.mark.skipif(pyarrow is None, reason="Requires `pyarrow`") @pytest.mark.parametrize( "bq_schema,expected", [ @@ -719,6 +717,8 @@ def test_bq_to_arrow_array_w_geography_type_wkb_data(module_under_test): ), ], ) +@pytest.mark.skipif(isinstance(pyarrow, mock.Mock), 
reason="Requires `pyarrow`") +@pytest.mark.skipif(pandas is None, reason="Requires `pandas`") def test_bq_to_arrow_data_type_w_range(module_under_test, bq_schema, expected): actual = module_under_test.bq_to_arrow_data_type(bq_schema) assert actual.equals(expected) From 2211dd0cdd7d886f89251c69feab1904fd916bf1 Mon Sep 17 00:00:00 2001 From: Linchin Date: Tue, 16 Apr 2024 14:12:21 -0700 Subject: [PATCH 33/35] use a different way to construct test data --- tests/unit/test__pandas_helpers.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/unit/test__pandas_helpers.py b/tests/unit/test__pandas_helpers.py index af38ab666..58d2b73b3 100644 --- a/tests/unit/test__pandas_helpers.py +++ b/tests/unit/test__pandas_helpers.py @@ -696,8 +696,8 @@ def test_bq_to_arrow_array_w_geography_type_wkb_data(module_under_test): ), pyarrow.struct( [ - ("start", _pyarrow_helpers.pyarrow_datetime()), - ("end", _pyarrow_helpers.pyarrow_datetime()), + ("start", pyarrow.timestamp("us", tz=None)), + ("end", pyarrow.timestamp("us", tz=None)), ] ), ), @@ -710,8 +710,8 @@ def test_bq_to_arrow_array_w_geography_type_wkb_data(module_under_test): ), pyarrow.struct( [ - ("start", _pyarrow_helpers.pyarrow_timestamp()), - ("end", _pyarrow_helpers.pyarrow_timestamp()), + ("start", pyarrow.timestamp("us", tz="UTC")), + ("end", pyarrow.timestamp("us", tz="UTC")), ] ), ), From e2a95524b0b84c6046959ab9e659102eaeafcdfb Mon Sep 17 00:00:00 2001 From: Linchin Date: Thu, 18 Apr 2024 12:25:30 -0700 Subject: [PATCH 34/35] fix error message and add warning number check --- google/cloud/bigquery/_helpers.py | 2 +- tests/unit/test_table.py | 13 +++++++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/google/cloud/bigquery/_helpers.py b/google/cloud/bigquery/_helpers.py index 5f1287535..083eb9f9d 100644 --- a/google/cloud/bigquery/_helpers.py +++ b/google/cloud/bigquery/_helpers.py @@ -318,7 +318,7 @@ def _range_element_from_json(value, field): if field.element_type 
in _SUPPORTED_RANGE_ELEMENTS: return _CELLDATA_FROM_JSON[field.element_type](value, field.element_type) else: - raise ValueError(f"Unsupported range field type: {field.element_type}") + raise ValueError(f"Unsupported range element type: {field.element_type}") def _range_from_json(value, field): diff --git a/tests/unit/test_table.py b/tests/unit/test_table.py index c5c35185c..1134d7e45 100644 --- a/tests/unit/test_table.py +++ b/tests/unit/test_table.py @@ -3497,9 +3497,13 @@ def test_to_dataframe_no_tqdm_no_progress_bar(self): api_request = mock.Mock(return_value={"rows": rows}) row_iterator = self._make_one(_mock_client(), api_request, path, schema) - with warnings.catch_warnings(record=True): + with warnings.catch_warnings(record=True) as warned: df = row_iterator.to_dataframe(create_bqstorage_client=False) + user_warnings = [ + warning for warning in warned if warning.category is UserWarning + ] + self.assertEqual(len(user_warnings), 0) self.assertEqual(len(df), 4) @mock.patch("google.cloud.bigquery._tqdm_helpers.tqdm", new=None) @@ -3521,12 +3525,17 @@ def test_to_dataframe_no_tqdm(self): api_request = mock.Mock(return_value={"rows": rows}) row_iterator = self._make_one(_mock_client(), api_request, path, schema) - with warnings.catch_warnings(record=True): + with warnings.catch_warnings(record=True) as warned: df = row_iterator.to_dataframe( progress_bar_type="tqdm", create_bqstorage_client=False, ) + user_warnings = [ + warning for warning in warned if warning.category is UserWarning + ] + self.assertEqual(len(user_warnings), 1) + # Even though the progress bar won't show, downloading the dataframe # should still work. 
self.assertEqual(len(df), 4) From 4c20bd7dbcbe587757a3aed6b0dc55b39da96172 Mon Sep 17 00:00:00 2001 From: Linchin Date: Thu, 18 Apr 2024 12:56:40 -0700 Subject: [PATCH 35/35] add warning number check and comments --- tests/unit/test_table.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/tests/unit/test_table.py b/tests/unit/test_table.py index 1134d7e45..099529f95 100644 --- a/tests/unit/test_table.py +++ b/tests/unit/test_table.py @@ -3503,7 +3503,11 @@ def test_to_dataframe_no_tqdm_no_progress_bar(self): user_warnings = [ warning for warning in warned if warning.category is UserWarning ] - self.assertEqual(len(user_warnings), 0) + # With Python 3.7 and 3.8, len(user_warnings) = 3. With pandas < 1.5, + # pandas.ArrowDtype is not supported. We raise warnings because + # range columns have to be converted to object. + # With higher Python versions and noextra tests, len(user_warnings) = 0 + self.assertIn(len(user_warnings), [0, 3]) self.assertEqual(len(df), 4) @mock.patch("google.cloud.bigquery._tqdm_helpers.tqdm", new=None) @@ -3534,7 +3538,11 @@ def test_to_dataframe_no_tqdm(self): user_warnings = [ warning for warning in warned if warning.category is UserWarning ] - self.assertEqual(len(user_warnings), 1) + # With Python 3.7 and 3.8, len(user_warnings) = 4. With pandas < 1.5, + # pandas.ArrowDtype is not supported. We raise warnings because + # range columns have to be converted to object. + # With higher Python versions and noextra tests, len(user_warnings) = 1 + self.assertIn(len(user_warnings), [1, 4]) # Even though the progress bar won't show, downloading the dataframe # should still work.