Skip to content

Commit

Permalink
perf: remove redundant array deepcopy (#26)
Browse files Browse the repository at this point in the history
* perf(bigquery): remove redundant array deepcopy

`deepcopy` can be a very costly operation when considering large arrays with complex nested objects.
Refactor helpers to allow recursive conversion without copying arrays.

* add check to ignore REPEATED mode

* Update google/cloud/bigquery/_helpers.py

Co-authored-by: Bu Sun Kim <8822365+busunkim96@users.noreply.github.com>

Co-authored-by: Tres Seaver <tseaver@palladion.com>
Co-authored-by: Tim Swast <swast@google.com>
Co-authored-by: Bu Sun Kim <8822365+busunkim96@users.noreply.github.com>
  • Loading branch information
4 people authored Oct 7, 2020
1 parent d1eb8b3 commit b54f867
Show file tree
Hide file tree
Showing 2 changed files with 64 additions and 10 deletions.
39 changes: 29 additions & 10 deletions google/cloud/bigquery/_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
"""Shared helper functions for BigQuery API classes."""

import base64
import copy
import datetime
import decimal
import re
Expand Down Expand Up @@ -397,13 +396,9 @@ def _repeated_field_to_json(field, row_value):
Returns:
List[Any]: A list of JSON-serializable objects.
"""
# Remove the REPEATED, but keep the other fields. This allows us to process
# each item as if it were a top-level field.
item_field = copy.deepcopy(field)
item_field._mode = "NULLABLE"
values = []
for item in row_value:
values.append(_field_to_json(item_field, item))
values.append(_single_field_to_json(field, item))
return values


Expand Down Expand Up @@ -462,6 +457,33 @@ def _record_field_to_json(fields, row_value):
return record


def _single_field_to_json(field, row_value):
    """Convert one (non-repeated) value into a JSON-serializable object.

    The field's mode is never consulted here, so the same SchemaField can
    be reused for every item of an ARRAY / REPEATED field without first
    deep-copying it and resetting its mode. See:
    https://github.com/googleapis/python-bigquery/issues/6

    Args:
        field (google.cloud.bigquery.schema.SchemaField):
            The SchemaField to use for type conversion and field name.
        row_value (Any):
            Scalar or Struct to be inserted. The type
            is inferred from the SchemaField's field_type.

    Returns:
        Any: A JSON-serializable object, or ``None`` when ``row_value``
        is ``None``.
    """
    # A missing value is passed through untouched; the field is not
    # inspected at all in this case.
    if row_value is None:
        return None

    is_record = field.field_type == "RECORD"
    if not is_record:
        return _scalar_field_to_json(field, row_value)

    # Structs are converted sub-field by sub-field.
    return _record_field_to_json(field.fields, row_value)


def _field_to_json(field, row_value):
"""Convert a field into JSON-serializable values.
Expand All @@ -483,10 +505,7 @@ def _field_to_json(field, row_value):
if field.mode == "REPEATED":
return _repeated_field_to_json(field, row_value)

if field.field_type == "RECORD":
return _record_field_to_json(field.fields, row_value)

return _scalar_field_to_json(field, row_value)
return _single_field_to_json(field, row_value)


def _snake_to_camel_case(value):
Expand Down
35 changes: 35 additions & 0 deletions tests/unit/test__helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -806,6 +806,41 @@ def test_w_known_field_type(self):
self.assertEqual(converted, str(original))


class Test_single_field_to_json(unittest.TestCase):
    """Unit tests for ``_helpers._single_field_to_json``."""

    def _call_fut(self, field, value):
        # The function under test is imported inside the helper, as the
        # neighbouring test classes in this module do.
        from google.cloud.bigquery._helpers import _single_field_to_json

        return _single_field_to_json(field, value)

    def test_w_none(self):
        # None is passed through regardless of the field type.
        self.assertIsNone(self._call_fut(_make_field("INT64"), None))

    def test_w_record(self):
        # A RECORD value is converted sub-field by sub-field.
        record_field = _make_field(
            "RECORD",
            fields=[
                _make_field("INT64", name="one"),
                _make_field("STRING", name="two"),
            ],
        )
        row = {"one": 42, "two": "two"}

        result = self._call_fut(record_field, row)

        self.assertEqual(result, {"one": "42", "two": "two"})

    def test_w_scalar(self):
        # INT64 scalars are serialized as their string representation.
        value = 42

        result = self._call_fut(_make_field("INT64"), value)

        self.assertEqual(result, str(value))

    def test_w_scalar_ignores_mode(self):
        # A REPEATED mode must not change how a single item is converted.
        repeated_field = _make_field("STRING", mode="REPEATED")
        value = "hello world"

        result = self._call_fut(repeated_field, value)

        self.assertEqual(result, value)


class Test_repeated_field_to_json(unittest.TestCase):
def _call_fut(self, field, value):
from google.cloud.bigquery._helpers import _repeated_field_to_json
Expand Down

0 comments on commit b54f867

Please sign in to comment.