Skip to content

Commit

Permalink
docs(bigquery): document how to achieve higher write limit and add te…
Browse files Browse the repository at this point in the history
…sts (#9574)

* test(bigquery): add insert_rows*() tests w/o row IDs

* Groom the insert_rows_json() method's docstring

* docs: document how to achieve higher insert write limit

* Make method names less confusing for insert IDs
  • Loading branch information
plamut committed Nov 1, 2019
1 parent 48359eb commit 3e8fbae
Show file tree
Hide file tree
Showing 5 changed files with 206 additions and 16 deletions.
14 changes: 14 additions & 0 deletions bigquery/docs/usage/tables.rst
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,20 @@ Insert rows into a table's data with the
:start-after: [START bigquery_table_insert_rows]
:end-before: [END bigquery_table_insert_rows]

Insert rows into a table's data with the
:func:`~google.cloud.bigquery.client.Client.insert_rows` method, achieving
a higher write limit:

.. literalinclude:: ../samples/table_insert_rows_explicit_none_insert_ids.py
:language: python
:dedent: 4
:start-after: [START bigquery_table_insert_rows_explicit_none_insert_ids]
:end-before: [END bigquery_table_insert_rows_explicit_none_insert_ids]

Keep in mind that inserting data with ``None`` row insert IDs can come at the
expense of more duplicate rows being inserted. See also:
`Streaming inserts <https://cloud.google.com/bigquery/quotas#streaming_inserts>`_.

Add an empty column to the existing table with the
:func:`~google.cloud.bigquery.update_table` method:

Expand Down
35 changes: 19 additions & 16 deletions bigquery/google/cloud/bigquery/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -2264,29 +2264,32 @@ def insert_rows_json(
table (Union[ \
google.cloud.bigquery.table.Table, \
google.cloud.bigquery.table.TableReference, \
str, \
str \
]):
The destination table for the row data, or a reference to it.
json_rows (Sequence[Dict]):
Row data to be inserted. Keys must match the table schema fields
and values must be JSON-compatible representations.
row_ids (Sequence[str]):
(Optional) Unique ids, one per row being inserted. If omitted,
unique IDs are created.
skip_invalid_rows (bool):
(Optional) Insert all valid rows of a request, even if invalid
rows exist. The default value is False, which causes the entire
request to fail if any invalid rows exist.
ignore_unknown_values (bool):
(Optional) Accept rows that contain values that do not match the
schema. The unknown values are ignored. Default is False, which
row_ids (Optional[Sequence[Optional[str]]]):
Unique IDs, one per row being inserted. An ID can also be
``None``, indicating that an explicit insert ID should **not**
be used for that row. If the argument is omitted altogether,
unique IDs are created automatically.
skip_invalid_rows (Optional[bool]):
Insert all valid rows of a request, even if invalid rows exist.
The default value is ``False``, which causes the entire request
to fail if any invalid rows exist.
ignore_unknown_values (Optional[bool]):
Accept rows that contain values that do not match the schema.
The unknown values are ignored. Default is ``False``, which
treats unknown values as errors.
template_suffix (str):
(Optional) treat ``name`` as a template table and provide a suffix.
BigQuery will create the table ``<name> + <template_suffix>`` based
on the schema of the template table. See
template_suffix (Optional[str]):
Treat ``name`` as a template table and provide a suffix.
BigQuery will create the table ``<name> + <template_suffix>``
based on the schema of the template table. See
https://cloud.google.com/bigquery/streaming-data-into-bigquery#template-tables
retry (google.api_core.retry.Retry): (Optional) How to retry the RPC.
retry (Optional[google.api_core.retry.Retry]):
How to retry the RPC.
Returns:
Sequence[Mappings]:
Expand Down
36 changes: 36 additions & 0 deletions bigquery/samples/table_insert_rows_explicit_none_insert_ids.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


def table_insert_rows_explicit_none_insert_ids(client, table_id):
    """Insert rows into a table, passing ``None`` as every row's insert ID.

    Prints a confirmation when the insert call reports no errors, and the
    reported errors otherwise.

    Args:
        client: Client object providing ``get_table()`` and ``insert_rows()``.
        table_id (str): ID of the destination table, e.g.
            ``"your-project.your_dataset.your_table"``.
    """

    # [START bigquery_table_insert_rows_explicit_none_insert_ids]
    # TODO(developer): Import the client library.
    # from google.cloud import bigquery

    # TODO(developer): Construct a BigQuery client object.
    # client = bigquery.Client()

    # TODO(developer): Set table_id to the ID of the table to append to.
    # table_id = "your-project.your_dataset.your_table"

    table = client.get_table(table_id)  # Make an API request.
    rows_to_insert = [(u"Phred Phlyntstone", 32), (u"Wylma Phlyntstone", 29)]

    # One ``None`` per row: no explicit insert ID is requested for any row.
    errors = client.insert_rows(
        table, rows_to_insert, row_ids=[None] * len(rows_to_insert)
    )  # Make an API request.
    if errors == []:
        print("New rows have been added.")
    else:
        # Surface failures instead of finishing silently.
        print("Encountered errors while inserting rows: {}".format(errors))
    # [END bigquery_table_insert_rows_explicit_none_insert_ids]
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from google.cloud import bigquery

from .. import table_insert_rows_explicit_none_insert_ids as mut


def test_table_insert_rows_explicit_none_insert_ids(capsys, client, random_table_id):
    """Run the sample against a freshly created table and check its output."""
    # The sample inserts (full_name, age) tuples, so the table needs both columns.
    columns = [
        bigquery.SchemaField("full_name", "STRING", mode="REQUIRED"),
        bigquery.SchemaField("age", "INTEGER", mode="REQUIRED"),
    ]
    client.create_table(bigquery.Table(random_table_id, schema=columns))

    mut.table_insert_rows_explicit_none_insert_ids(client, random_table_id)

    captured_out, _ = capsys.readouterr()
    assert "New rows have been added." in captured_out
104 changes: 104 additions & 0 deletions bigquery/tests/unit/test_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -4572,6 +4572,40 @@ def test_insert_rows_w_record_schema(self):
method="POST", path="/%s" % PATH, data=SENT
)

def test_insert_rows_w_explicit_none_insert_ids(self):
    """``insert_rows()`` sends ``insertId: None`` for rows with ``None`` IDs."""
    from google.cloud.bigquery.schema import SchemaField
    from google.cloud.bigquery.table import Table

    PATH = "projects/{}/datasets/{}/tables/{}/insertAll".format(
        self.PROJECT, self.DS_ID, self.TABLE_ID
    )
    creds = _make_credentials()
    http = object()
    client = self._make_one(project=self.PROJECT, credentials=creds, _http=http)
    conn = client._connection = make_connection({})
    schema = [
        SchemaField("full_name", "STRING", mode="REQUIRED"),
        SchemaField("age", "INTEGER", mode="REQUIRED"),
    ]
    table = Table(self.TABLE_REF, schema=schema)
    ROWS = [
        {"full_name": "Phred Phlyntstone", "age": 32},
        {"full_name": "Bharney Rhubble", "age": 33},
    ]

    def _row_data(row):
        # Build the expected payload from a copy. Mutating ``row`` in place
        # would stringify the ages inside ROWS before insert_rows() sees them,
        # so the test would no longer cover the integer input case.
        expected_row = dict(row)
        expected_row["age"] = str(row["age"])
        return expected_row

    SENT = {"rows": [{"json": _row_data(row), "insertId": None} for row in ROWS]}

    errors = client.insert_rows(table, ROWS, row_ids=[None] * len(ROWS))

    self.assertEqual(len(errors), 0)
    conn.api_request.assert_called_once_with(
        method="POST", path="/{}".format(PATH), data=SENT
    )

def test_insert_rows_errors(self):
from google.cloud.bigquery.table import Table

Expand Down Expand Up @@ -4765,6 +4799,55 @@ def test_insert_rows_from_dataframe_many_columns(self):
assert len(actual_calls) == 1
assert actual_calls[0] == expected_call

@unittest.skipIf(pandas is None, "Requires `pandas`")
def test_insert_rows_from_dataframe_w_explicit_none_insert_ids(self):
    """Dataframe rows with ``None`` row IDs are sent with ``insertId: None``."""
    from google.cloud.bigquery.table import SchemaField
    from google.cloud.bigquery.table import Table

    # Client wired to a fake connection so that no real request is made.
    creds = _make_credentials()
    http = object()
    client = self._make_one(project=self.PROJECT, credentials=creds, _http=http)
    conn = client._connection = make_connection({}, {})

    # Destination table whose schema matches the dataframe columns.
    table = Table(
        self.TABLE_REF,
        schema=[
            SchemaField("name", "STRING", mode="REQUIRED"),
            SchemaField("adult", "BOOLEAN", mode="REQUIRED"),
        ],
    )

    records = [
        {"name": u"Little One", "adult": False},
        {"name": u"Young Gun", "adult": True},
    ]
    dataframe = pandas.DataFrame(records)

    error_info = client.insert_rows_from_dataframe(
        table, dataframe, row_ids=[None] * len(dataframe)
    )

    self.assertEqual(len(error_info), 1)
    assert error_info[0] == []  # no chunk errors

    # Exactly one insertAll request is expected, carrying both rows.
    expected_call = mock.call(
        method="POST",
        path="/projects/{}/datasets/{}/tables/{}/insertAll".format(
            self.PROJECT, self.DS_ID, self.TABLE_REF.table_id
        ),
        data={
            "rows": [
                {"insertId": None, "json": {"name": "Little One", "adult": "false"}},
                {"insertId": None, "json": {"name": "Young Gun", "adult": "true"}},
            ]
        },
    )
    actual_calls = conn.api_request.call_args_list
    assert len(actual_calls) == 1
    assert actual_calls[0] == expected_call

def test_insert_rows_json(self):
from google.cloud.bigquery.table import Table, SchemaField
from google.cloud.bigquery.dataset import DatasetReference
Expand Down Expand Up @@ -4833,6 +4916,27 @@ def test_insert_rows_json_with_string_id(self):
data=expected,
)

def test_insert_rows_json_w_explicit_none_insert_ids(self):
    """``insert_rows_json()`` sends ``insertId: None`` for ``None`` row IDs."""
    json_rows = [{"col1": "val1"}, {"col2": "val2"}]
    client = self._make_one(
        project="default-project", credentials=_make_credentials(), _http=object()
    )
    fake_conn = make_connection({})
    client._connection = fake_conn

    no_insert_ids = [None] * len(json_rows)
    errors = client.insert_rows_json(
        "proj.dset.tbl", json_rows, row_ids=no_insert_ids
    )

    self.assertEqual(len(errors), 0)
    # Every row is forwarded unchanged, paired with a ``None`` insert ID.
    expected_rows = [{"json": row, "insertId": None} for row in json_rows]
    fake_conn.api_request.assert_called_once_with(
        method="POST",
        path="/projects/proj/datasets/dset/tables/tbl/insertAll",
        data={"rows": expected_rows},
    )

def test_list_partitions(self):
from google.cloud.bigquery.table import Table

Expand Down

0 comments on commit 3e8fbae

Please sign in to comment.