From b87ed88c8b42598bfd8849d363386a4ab1d68144 Mon Sep 17 00:00:00 2001 From: Peter Lamut Date: Sun, 18 Aug 2019 20:36:39 +0200 Subject: [PATCH 1/5] Issue warning if no schema when loading from DF --- bigquery/google/cloud/bigquery/client.py | 8 ++++++++ bigquery/tests/unit/test_client.py | 26 ++++++++++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/bigquery/google/cloud/bigquery/client.py b/bigquery/google/cloud/bigquery/client.py index 04c596975eec..f853156b8d12 100644 --- a/bigquery/google/cloud/bigquery/client.py +++ b/bigquery/google/cloud/bigquery/client.py @@ -1548,6 +1548,14 @@ def load_table_from_dataframe( PendingDeprecationWarning, stacklevel=2, ) + else: + warnings.warn( + "Loading from a dataframe without a schema will be " + "deprecated in the future, please provide a schema.", + PendingDeprecationWarning, + stacklevel=2, + ) + dataframe.to_parquet(tmppath, compression=parquet_compression) with open(tmppath, "rb") as parquet_file: diff --git a/bigquery/tests/unit/test_client.py b/bigquery/tests/unit/test_client.py index c4e9c5e830ac..e6ee16fa560d 100644 --- a/bigquery/tests/unit/test_client.py +++ b/bigquery/tests/unit/test_client.py @@ -5328,6 +5328,32 @@ def test_load_table_from_dataframe_w_custom_job_config(self): assert sent_config is job_config assert sent_config.source_format == job.SourceFormat.PARQUET + @unittest.skipIf(pandas is None, "Requires `pandas`") + @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") + def test_load_table_from_dataframe_wo_schema_warning(self): + client = self._make_client() + records = [{"name": "Monty", "age": 100}, {"name": "Python", "age": 60}] + dataframe = pandas.DataFrame(records) + + load_patch = mock.patch( + "google.cloud.bigquery.client.Client.load_table_from_file", autospec=True + ) + pyarrow_patch = mock.patch("google.cloud.bigquery.client.pyarrow", None) + + with load_patch, pyarrow_patch, warnings.catch_warnings(record=True) as warned: + client.load_table_from_dataframe( + dataframe, self.TABLE_REF, location=self.LOCATION + ) + + for warning in warned: + if warning.category in ( + DeprecationWarning, + PendingDeprecationWarning, + ) and "please provide a schema" in str(warning): + break + else: + pytest.fail("A missing schema deprecation warning was not raised.") + @unittest.skipIf(pandas is None, "Requires `pandas`") @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_load_table_from_dataframe_w_schema_wo_pyarrow(self): From effcf7e6fde670eeeae257388af46dfa8329b41e Mon Sep 17 00:00:00 2001 From: Peter Lamut Date: Mon, 19 Aug 2019 00:36:40 +0200 Subject: [PATCH 2/5] Raise error if serializing DF with struct fields --- bigquery/google/cloud/bigquery/client.py | 9 +++++++ bigquery/tests/unit/test_client.py | 34 ++++++++++++++++++++++++ 2 files changed, 43 insertions(+) diff --git a/bigquery/google/cloud/bigquery/client.py b/bigquery/google/cloud/bigquery/client.py index f853156b8d12..53a2df4a7d2a 100644 --- a/bigquery/google/cloud/bigquery/client.py +++ b/bigquery/google/cloud/bigquery/client.py @@ -60,6 +60,7 @@ from google.cloud.bigquery.retry import DEFAULT_RETRY from google.cloud.bigquery.routine import Routine from google.cloud.bigquery.routine import RoutineReference +from google.cloud.bigquery.schema import _STRUCT_TYPES from google.cloud.bigquery.schema import SchemaField from google.cloud.bigquery.table import _table_arg_to_table from google.cloud.bigquery.table import _table_arg_to_table_ref @@ -1529,6 +1530,14 @@ def load_table_from_dataframe( os.close(tmpfd) try: + if job_config.schema: + for field in job_config.schema: + if field.field_type in _STRUCT_TYPES: + raise ValueError( + "Pyarrow does not support serializing dataframes with " + "struct (record) column types." + ) + if pyarrow and job_config.schema: if parquet_compression == "snappy": # adjust the default value parquet_compression = parquet_compression.upper() diff --git a/bigquery/tests/unit/test_client.py b/bigquery/tests/unit/test_client.py index e6ee16fa560d..665507339a12 100644 --- a/bigquery/tests/unit/test_client.py +++ b/bigquery/tests/unit/test_client.py @@ -5328,6 +5328,40 @@ def test_load_table_from_dataframe_w_custom_job_config(self): assert sent_config is job_config assert sent_config.source_format == job.SourceFormat.PARQUET + @unittest.skipIf(pandas is None, "Requires `pandas`") + @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") + def test_load_table_from_dataframe_struct_fields_error(self): + from google.cloud.bigquery import job + from google.cloud.bigquery.schema import SchemaField + + client = self._make_client() + + records = [{"float_column": 3.14, "struct_column": [{"foo": 1}, {"bar": -1}]}] + dataframe = pandas.DataFrame(data=records) + + schema = [ + SchemaField("float_column", "FLOAT"), + SchemaField( + "agg_col", + "RECORD", + fields=[SchemaField("foo", "INTEGER"), SchemaField("bar", "INTEGER")], + ), + ] + job_config = job.LoadJobConfig(schema=schema) + + load_patch = mock.patch( + "google.cloud.bigquery.client.Client.load_table_from_file", autospec=True + ) + + with pytest.raises(ValueError) as exc_info, load_patch: + client.load_table_from_dataframe( + dataframe, self.TABLE_REF, job_config=job_config, location=self.LOCATION + ) + + err_msg = str(exc_info.value) + assert "struct" in err_msg + assert "not support" in err_msg + @unittest.skipIf(pandas is None, "Requires `pandas`") @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_load_table_from_dataframe_wo_schema_warning(self): From 486e98e8ae6c49d1872c2924436ce4ebfd420c5f Mon Sep 17 00:00:00 2001 From: Peter Lamut Date: Mon, 19 Aug 2019 08:18:41 +0200 Subject: [PATCH 3/5] Rewrite test assertion to make coverage happy --- bigquery/tests/unit/test_client.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/bigquery/tests/unit/test_client.py b/bigquery/tests/unit/test_client.py index 665507339a12..12fdf0458e22 100644 --- a/bigquery/tests/unit/test_client.py +++ b/bigquery/tests/unit/test_client.py @@ -5379,14 +5379,13 @@ def test_load_table_from_dataframe_wo_schema_warning(self): dataframe, self.TABLE_REF, location=self.LOCATION ) - for warning in warned: - if warning.category in ( - DeprecationWarning, - PendingDeprecationWarning, - ) and "please provide a schema" in str(warning): - break - else: - pytest.fail("A missing schema deprecation warning was not raised.") + matches = [ + warning + for warning in warned + if warning.category in (DeprecationWarning, PendingDeprecationWarning) + and "please provide a schema" in str(warning) + ] + assert matches, "A missing schema deprecation warning was not raised." @unittest.skipIf(pandas is None, "Requires `pandas`") @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") From 505de5bc0035798a8ff8acc0a426bab0e6277b95 Mon Sep 17 00:00:00 2001 From: Peter Lamut Date: Wed, 21 Aug 2019 09:18:24 +0200 Subject: [PATCH 4/5] Make the unsupported type message more general --- bigquery/google/cloud/bigquery/client.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bigquery/google/cloud/bigquery/client.py b/bigquery/google/cloud/bigquery/client.py index 53a2df4a7d2a..b120110f14f3 100644 --- a/bigquery/google/cloud/bigquery/client.py +++ b/bigquery/google/cloud/bigquery/client.py @@ -1534,8 +1534,9 @@ def load_table_from_dataframe( for field in job_config.schema: if field.field_type in _STRUCT_TYPES: raise ValueError( - "Pyarrow does not support serializing dataframes with " - "struct (record) column types." + "Uploading dataframes with struct (record) column types " + "is not supported. See: " + "https://github.com/googleapis/google-cloud-python/issues/8191" ) if pyarrow and job_config.schema: From 13640b07aee685c27d0cc14c3e0908391bd8a9b4 Mon Sep 17 00:00:00 2001 From: Peter Lamut Date: Wed, 21 Aug 2019 09:22:21 +0200 Subject: [PATCH 5/5] Remove warning on missing schema The warning will be added once the support for partial schemas and automatic schema detection is implemented. --- bigquery/google/cloud/bigquery/client.py | 7 ------- bigquery/tests/unit/test_client.py | 25 ------------------------ 2 files changed, 32 deletions(-) diff --git a/bigquery/google/cloud/bigquery/client.py b/bigquery/google/cloud/bigquery/client.py index b120110f14f3..ae9adb4da15f 100644 --- a/bigquery/google/cloud/bigquery/client.py +++ b/bigquery/google/cloud/bigquery/client.py @@ -1558,13 +1558,6 @@ def load_table_from_dataframe( PendingDeprecationWarning, stacklevel=2, ) - else: - warnings.warn( - "Loading from a dataframe without a schema will be " - "deprecated in the future, please provide a schema.", - PendingDeprecationWarning, - stacklevel=2, - ) dataframe.to_parquet(tmppath, compression=parquet_compression) diff --git a/bigquery/tests/unit/test_client.py b/bigquery/tests/unit/test_client.py index 12fdf0458e22..d7ff3d2a90b3 100644 --- a/bigquery/tests/unit/test_client.py +++ b/bigquery/tests/unit/test_client.py @@ -5362,31 +5362,6 @@ def test_load_table_from_dataframe_struct_fields_error(self): assert "struct" in err_msg assert "not support" in err_msg - @unittest.skipIf(pandas is None, "Requires `pandas`") - @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") - def test_load_table_from_dataframe_wo_schema_warning(self): - client = self._make_client() - records = [{"name": "Monty", "age": 100}, {"name": "Python", "age": 60}] - dataframe = pandas.DataFrame(records) - - load_patch = mock.patch( - "google.cloud.bigquery.client.Client.load_table_from_file", autospec=True - ) - pyarrow_patch = mock.patch("google.cloud.bigquery.client.pyarrow", None) - - with load_patch, pyarrow_patch, warnings.catch_warnings(record=True) as warned: - client.load_table_from_dataframe( - dataframe, self.TABLE_REF, location=self.LOCATION - ) - - matches = [ - warning - for warning in warned - if warning.category in (DeprecationWarning, PendingDeprecationWarning) - and "please provide a schema" in str(warning) - ] - assert matches, "A missing schema deprecation warning was not raised." - @unittest.skipIf(pandas is None, "Requires `pandas`") @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_load_table_from_dataframe_w_schema_wo_pyarrow(self):