perf: use jobs.getQueryResults to download result sets (#363)
* refactor: break job into multiple modules

Original paths are retained for backwards compatibility.

* perf: use `jobs.getQueryResults` to download result sets

Replaces `tabledata.list` when `RowIterator` is used for query results.
This likely also fixes a few edge cases around BigQuery scripting jobs.

* revert unnecessary changes to _get_query_results

* simplify RowIterator: no need to hack the Table object

* fix tests for bqstorage warning

* populate location
tswast authored Nov 4, 2020
1 parent 2849e56 commit 0c3476d
Showing 11 changed files with 256 additions and 131 deletions.
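For orientation, here is a minimal sketch of the user-facing path this commit optimizes (not part of the diff; it assumes default credentials and uses a public dataset): QueryJob.result() returns a RowIterator that, after this change, pages through jobs.getQueryResults instead of tabledata.list.

# Sketch only; the query text and column names are illustrative.
from google.cloud import bigquery

client = bigquery.Client()  # assumes default credentials and project
job = client.query(
    "SELECT name, number"
    " FROM `bigquery-public-data.usa_names.usa_1910_2013` LIMIT 10"
)

rows = job.result()     # RowIterator now backed by jobs.getQueryResults
print(rows.total_rows)  # populated from the getQueryResults response
for row in rows:
    print(row["name"], row["number"])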
16 changes: 8 additions & 8 deletions google/cloud/bigquery/_pandas_helpers.py
@@ -474,7 +474,7 @@ def dataframe_to_parquet(dataframe, bq_schema, filepath, parquet_compression="SN
pyarrow.parquet.write_table(arrow_table, filepath, compression=parquet_compression)


def _tabledata_list_page_to_arrow(page, column_names, arrow_types):
def _row_iterator_page_to_arrow(page, column_names, arrow_types):
# Iterate over the page to force the API request to get the page data.
try:
next(iter(page))
@@ -490,8 +490,8 @@ def _tabledata_list_page_to_arrow(page, column_names, arrow_types):
return pyarrow.RecordBatch.from_arrays(arrays, names=column_names)


def download_arrow_tabledata_list(pages, bq_schema):
"""Use tabledata.list to construct an iterable of RecordBatches.
def download_arrow_row_iterator(pages, bq_schema):
"""Use HTTP JSON RowIterator to construct an iterable of RecordBatches.
Args:
pages (Iterator[:class:`google.api_core.page_iterator.Page`]):
@@ -510,10 +510,10 @@ def download_arrow_tabledata_list(pages, bq_schema):
arrow_types = [bq_to_arrow_data_type(field) for field in bq_schema]

for page in pages:
yield _tabledata_list_page_to_arrow(page, column_names, arrow_types)
yield _row_iterator_page_to_arrow(page, column_names, arrow_types)


def _tabledata_list_page_to_dataframe(page, column_names, dtypes):
def _row_iterator_page_to_dataframe(page, column_names, dtypes):
# Iterate over the page to force the API request to get the page data.
try:
next(iter(page))
@@ -528,8 +528,8 @@ def _tabledata_list_page_to_dataframe(page, column_names, dtypes):
return pandas.DataFrame(columns, columns=column_names)


def download_dataframe_tabledata_list(pages, bq_schema, dtypes):
"""Use (slower, but free) tabledata.list to construct a DataFrame.
def download_dataframe_row_iterator(pages, bq_schema, dtypes):
"""Use HTTP JSON RowIterator to construct a DataFrame.
Args:
pages (Iterator[:class:`google.api_core.page_iterator.Page`]):
@@ -549,7 +549,7 @@ def download_dataframe_tabledata_list(pages, bq_schema, dtypes):
bq_schema = schema._to_schema_fields(bq_schema)
column_names = [field.name for field in bq_schema]
for page in pages:
yield _tabledata_list_page_to_dataframe(page, column_names, dtypes)
yield _row_iterator_page_to_dataframe(page, column_names, dtypes)


def _bqstorage_page_to_arrow(page):
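The renamed helpers above are consumed one result page at a time. A rough sketch of how download_dataframe_row_iterator is used, mirroring the functools.partial call in table.py (row_iterator and the final concatenation step are assumptions of this sketch, not code from this commit):

import pandas

from google.cloud.bigquery import _pandas_helpers

# row_iterator is an existing RowIterator; the helper yields one DataFrame per page.
frames = list(
    _pandas_helpers.download_dataframe_row_iterator(
        iter(row_iterator.pages),  # pages of the HTTP JSON row iterator
        row_iterator.schema,       # BigQuery schema used for type conversion
        dtypes={},                 # optional per-column dtype overrides
    )
)
df = pandas.concat(frames, ignore_index=True) if frames else pandas.DataFrame()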
104 changes: 91 additions & 13 deletions google/cloud/bigquery/client.py
@@ -80,18 +80,19 @@
_MAX_MULTIPART_SIZE = 5 * 1024 * 1024
_DEFAULT_NUM_RETRIES = 6
_BASE_UPLOAD_TEMPLATE = (
u"https://bigquery.googleapis.com/upload/bigquery/v2/projects/"
u"{project}/jobs?uploadType="
"https://bigquery.googleapis.com/upload/bigquery/v2/projects/"
"{project}/jobs?uploadType="
)
_MULTIPART_URL_TEMPLATE = _BASE_UPLOAD_TEMPLATE + u"multipart"
_RESUMABLE_URL_TEMPLATE = _BASE_UPLOAD_TEMPLATE + u"resumable"
_GENERIC_CONTENT_TYPE = u"*/*"
_MULTIPART_URL_TEMPLATE = _BASE_UPLOAD_TEMPLATE + "multipart"
_RESUMABLE_URL_TEMPLATE = _BASE_UPLOAD_TEMPLATE + "resumable"
_GENERIC_CONTENT_TYPE = "*/*"
_READ_LESS_THAN_SIZE = (
"Size {:d} was specified but the file-like object only had " "{:d} bytes remaining."
)
_NEED_TABLE_ARGUMENT = (
"The table argument should be a table ID string, Table, or TableReference"
)
_LIST_ROWS_FROM_QUERY_RESULTS_FIELDS = "jobReference,totalRows,pageToken,rows"


class Project(object):
@@ -293,7 +294,7 @@ def api_request(*args, **kwargs):
span_attributes=span_attributes,
*args,
timeout=timeout,
**kwargs
**kwargs,
)

return page_iterator.HTTPIterator(
@@ -371,7 +372,7 @@ def api_request(*args, **kwargs):
span_attributes=span_attributes,
*args,
timeout=timeout,
**kwargs
**kwargs,
)

return page_iterator.HTTPIterator(
@@ -1129,7 +1130,7 @@ def api_request(*args, **kwargs):
span_attributes=span_attributes,
*args,
timeout=timeout,
**kwargs
**kwargs,
)

result = page_iterator.HTTPIterator(
@@ -1207,7 +1208,7 @@ def api_request(*args, **kwargs):
span_attributes=span_attributes,
*args,
timeout=timeout,
**kwargs
**kwargs,
)

result = page_iterator.HTTPIterator(
@@ -1284,7 +1285,7 @@ def api_request(*args, **kwargs):
span_attributes=span_attributes,
*args,
timeout=timeout,
**kwargs
**kwargs,
)

result = page_iterator.HTTPIterator(
@@ -1510,7 +1511,7 @@ def delete_table(
raise

def _get_query_results(
self, job_id, retry, project=None, timeout_ms=None, location=None, timeout=None
self, job_id, retry, project=None, timeout_ms=None, location=None, timeout=None,
):
"""Get the query results object for a query job.
@@ -1890,7 +1891,7 @@ def api_request(*args, **kwargs):
span_attributes=span_attributes,
*args,
timeout=timeout,
**kwargs
**kwargs,
)

return page_iterator.HTTPIterator(
@@ -2374,7 +2375,7 @@ def load_table_from_json(

destination = _table_arg_to_table_ref(destination, default_project=self.project)

data_str = u"\n".join(json.dumps(item) for item in json_rows)
data_str = "\n".join(json.dumps(item) for item in json_rows)
encoded_str = data_str.encode()
data_file = io.BytesIO(encoded_str)
return self.load_table_from_file(
@@ -3169,6 +3170,83 @@ def list_rows(
# Pass in selected_fields separately from schema so that full
# tables can be fetched without a column filter.
selected_fields=selected_fields,
total_rows=getattr(table, "num_rows", None),
)
return row_iterator

def _list_rows_from_query_results(
self,
job_id,
location,
project,
schema,
total_rows=None,
destination=None,
max_results=None,
start_index=None,
page_size=None,
retry=DEFAULT_RETRY,
timeout=None,
):
"""List the rows of a completed query.
See
https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/getQueryResults
Args:
job_id (str):
ID of a query job.
location (str): Location of the query job.
project (str):
ID of the project where the query job was run.
schema (Sequence[google.cloud.bigquery.schema.SchemaField]):
The fields expected in these query results. Used to convert
from JSON to expected Python types.
total_rows (Optional[int]):
Total number of rows in the query results.
destination (Optional[Union[ \
google.cloud.bigquery.table.Table, \
google.cloud.bigquery.table.TableListItem, \
google.cloud.bigquery.table.TableReference, \
str, \
]]):
Destination table reference. Used to fetch the query results
with the BigQuery Storage API.
max_results (Optional[int]):
Maximum number of rows to return across the whole iterator.
start_index (Optional[int]):
The zero-based index of the starting row to read.
page_size (Optional[int]):
The maximum number of rows in each page of results from this request.
Non-positive values are ignored. Defaults to a sensible value set by the API.
retry (Optional[google.api_core.retry.Retry]):
How to retry the RPC.
timeout (Optional[float]):
The number of seconds to wait for the underlying HTTP transport
before using ``retry``.
If multiple requests are made under the hood, ``timeout``
applies to each individual request.
Returns:
google.cloud.bigquery.table.RowIterator:
Iterator of row data
:class:`~google.cloud.bigquery.table.Row`-s.
"""
params = {
"fields": _LIST_ROWS_FROM_QUERY_RESULTS_FIELDS,
"location": location,
}

if start_index is not None:
params["startIndex"] = start_index

row_iterator = RowIterator(
client=self,
api_request=functools.partial(self._call_api, retry, timeout=timeout),
path=f"/projects/{project}/queries/{job_id}",
schema=schema,
max_results=max_results,
page_size=page_size,
table=destination,
extra_params=params,
total_rows=total_rows,
)
return row_iterator

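The new _list_rows_from_query_results helper drives the standard RowIterator paging over the getQueryResults endpoint. For reference, a hedged sketch of the roughly equivalent raw REST request (the auth setup, job_id, and location values are assumptions of this sketch):

import google.auth
from google.auth.transport.requests import AuthorizedSession

credentials, project = google.auth.default(
    scopes=["https://www.googleapis.com/auth/bigquery.readonly"]
)
session = AuthorizedSession(credentials)

job_id = "some-finished-query-job"  # assumed job ID
response = session.get(
    f"https://bigquery.googleapis.com/bigquery/v2/projects/{project}/queries/{job_id}",
    params={
        # same field filter as _LIST_ROWS_FROM_QUERY_RESULTS_FIELDS above
        "fields": "jobReference,totalRows,pageToken,rows",
        "location": "US",    # job location
        "maxResults": 1000,  # page size
    },
)
page = response.json()  # contains "rows" plus a "pageToken" when more pages remain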
14 changes: 7 additions & 7 deletions google/cloud/bigquery/job/query.py
@@ -38,7 +38,6 @@
from google.cloud.bigquery.table import _EmptyRowIterator
from google.cloud.bigquery.table import RangePartitioning
from google.cloud.bigquery.table import _table_arg_to_table_ref
from google.cloud.bigquery.table import Table
from google.cloud.bigquery.table import TableReference
from google.cloud.bigquery.table import TimePartitioning

@@ -1159,12 +1158,13 @@ def result(
if self._query_results.total_rows is None:
return _EmptyRowIterator()

schema = self._query_results.schema
dest_table_ref = self.destination
dest_table = Table(dest_table_ref, schema=schema)
dest_table._properties["numRows"] = self._query_results.total_rows
rows = self._client.list_rows(
dest_table,
rows = self._client._list_rows_from_query_results(
self._query_results.job_id,
self.location,
self._query_results.project,
self._query_results.schema,
total_rows=self._query_results.total_rows,
destination=self.destination,
page_size=page_size,
max_results=max_results,
start_index=start_index,
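In short, QueryJob.result() no longer builds a throwaway Table just to carry the row count; the schema, total row count, and location come straight from the cached query results. A small sketch of the parameters result() forwards to the new helper (the client and query are assumptions of this sketch):

job = client.query("SELECT 1 AS x")  # assumes an existing bigquery.Client
rows = job.result(page_size=500, max_results=1000, start_index=0)
print(rows.total_rows)  # taken from getQueryResults, not from a synthetic Table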
17 changes: 10 additions & 7 deletions google/cloud/bigquery/table.py
@@ -1306,6 +1306,8 @@ class RowIterator(HTTPIterator):
call the BigQuery Storage API to fetch rows.
selected_fields (Optional[Sequence[google.cloud.bigquery.schema.SchemaField]]):
A subset of columns to select from this table.
total_rows (Optional[int]):
Total number of rows in the table.
"""

@@ -1321,6 +1323,7 @@ def __init__(
extra_params=None,
table=None,
selected_fields=None,
total_rows=None,
):
super(RowIterator, self).__init__(
client,
@@ -1342,7 +1345,7 @@ def __init__(
self._schema = schema
self._selected_fields = selected_fields
self._table = table
self._total_rows = getattr(table, "num_rows", None)
self._total_rows = total_rows

def _get_next_page_response(self):
"""Requests the next page from the path provided.
@@ -1419,7 +1422,7 @@ def _to_arrow_iterable(self, bqstorage_client=None):
selected_fields=self._selected_fields,
)
tabledata_list_download = functools.partial(
_pandas_helpers.download_arrow_tabledata_list, iter(self.pages), self.schema
_pandas_helpers.download_arrow_row_iterator, iter(self.pages), self.schema
)
return self._to_page_iterable(
bqstorage_download,
@@ -1496,7 +1499,7 @@ def to_arrow(
) and self.max_results is not None:
warnings.warn(
"Cannot use bqstorage_client if max_results is set, "
"reverting to fetching data with the tabledata.list endpoint.",
"reverting to fetching data with the REST endpoint.",
stacklevel=2,
)
create_bqstorage_client = False
@@ -1582,7 +1585,7 @@ def to_dataframe_iterable(self, bqstorage_client=None, dtypes=None):
selected_fields=self._selected_fields,
)
tabledata_list_download = functools.partial(
_pandas_helpers.download_dataframe_tabledata_list,
_pandas_helpers.download_dataframe_row_iterator,
iter(self.pages),
self.schema,
dtypes,
@@ -1680,7 +1683,7 @@ def to_dataframe(
) and self.max_results is not None:
warnings.warn(
"Cannot use bqstorage_client if max_results is set, "
"reverting to fetching data with the tabledata.list endpoint.",
"reverting to fetching data with the REST endpoint.",
stacklevel=2,
)
create_bqstorage_client = False
@@ -2167,7 +2170,7 @@ def _item_to_row(iterator, resource):
)


def _tabledata_list_page_columns(schema, response):
def _row_iterator_page_columns(schema, response):
"""Make a generator of all the columns in a page from tabledata.list.
This enables creating a :class:`pandas.DataFrame` and other
@@ -2197,7 +2200,7 @@ def _rows_page_start(iterator, page, response):
"""
# Make a (lazy) copy of the page in column-oriented format for use in data
# science packages.
page._columns = _tabledata_list_page_columns(iterator._schema, response)
page._columns = _row_iterator_page_columns(iterator._schema, response)

total_rows = response.get("totalRows")
if total_rows is not None:
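The reworded warnings fire on the same condition as before: when max_results caps the iterator, the BigQuery Storage API is skipped and rows come back over REST. A sketch of that path (the client, table, and installed pandas extras are assumptions of this sketch):

import warnings

rows = client.list_rows(table, max_results=100)  # capped iterator
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    df = rows.to_dataframe(create_bqstorage_client=True)
# caught holds the "Cannot use bqstorage_client if max_results is set, ..." warning,
# and df is built from the REST responses instead of the Storage API.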
10 changes: 8 additions & 2 deletions tests/unit/job/helpers.py
@@ -60,6 +60,7 @@ def _make_job_resource(
endpoint="https://bigquery.googleapis.com",
job_type="load",
job_id="a-random-id",
location="US",
project_id="some-project",
user_email="bq-user@example.com",
):
@@ -69,7 +70,11 @@
"statistics": {"creationTime": creation_time_ms, job_type: {}},
"etag": etag,
"id": "{}:{}".format(project_id, job_id),
"jobReference": {"projectId": project_id, "jobId": job_id},
"jobReference": {
"projectId": project_id,
"jobId": job_id,
"location": location,
},
"selfLink": "{}/bigquery/v2/projects/{}/jobs/{}".format(
endpoint, project_id, job_id
),
@@ -130,7 +135,7 @@ def _table_ref(self, table_id):

return TableReference(self.DS_REF, table_id)

def _make_resource(self, started=False, ended=False):
def _make_resource(self, started=False, ended=False, location="US"):
self._setUpConstants()
return _make_job_resource(
creation_time_ms=int(self.WHEN_TS * 1000),
@@ -144,6 +149,7 @@ def _make_resource(self, started=False, ended=False):
job_id=self.JOB_ID,
project_id=self.PROJECT,
user_email=self.USER_EMAIL,
location=location,
)

def _verifyInitialReadonlyProperties(self, job):
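With location threaded through, resources built by the test helper now carry a location inside jobReference, matching real API responses. A small sketch of the expected shape (the values shown rely on the helper's defaults and are otherwise assumptions of this sketch):

resource = _make_job_resource(job_type="query", job_id="job-123", location="EU")
assert resource["jobReference"] == {
    "projectId": "some-project",  # helper default
    "jobId": "job-123",
    "location": "EU",
}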