fix: load_table_from_dataframe now assumes there may be local null values #1735

Merged · 3 commits · Nov 22, 2023
Changes from all commits
10 changes: 8 additions & 2 deletions google/cloud/bigquery/_pandas_helpers.py
@@ -178,12 +178,18 @@ def bq_to_arrow_field(bq_field, array_type=None):
if arrow_type is not None:
if array_type is not None:
arrow_type = array_type # For GEOGRAPHY, at least initially
is_nullable = bq_field.mode.upper() == "NULLABLE"
metadata = BQ_FIELD_TYPE_TO_ARROW_FIELD_METADATA.get(
bq_field.field_type.upper() if bq_field.field_type else ""
)
return pyarrow.field(
bq_field.name, arrow_type, nullable=is_nullable, metadata=metadata
bq_field.name,
arrow_type,
# Even if the remote schema is REQUIRED, there's a chance there's
# local NULL values. Arrow will gladly interpret these NULL values
# as non-NULL and give you an arbitrary value. See:
# https://github.com/googleapis/python-bigquery/issues/1692
nullable=True,
metadata=metadata,
)

warnings.warn("Unable to determine type for field '{}'.".format(bq_field.name))
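
For context, a quick sketch (not part of this PR, and it pokes at the private _pandas_helpers module, so treat it as illustrative only) of what the change to bq_to_arrow_field means in practice: even a REQUIRED BigQuery field now maps to a nullable Arrow field, so local NULLs are carried through to the Arrow data instead of being masked by a non-nullable schema.

# Illustrative sketch only; _pandas_helpers is a private module and its API
# may change without notice.
from google.cloud import bigquery
from google.cloud.bigquery import _pandas_helpers

required_field = bigquery.SchemaField("age", "INTEGER", mode="REQUIRED")
arrow_field = _pandas_helpers.bq_to_arrow_field(required_field)

# Before this change the Arrow field mirrored the BigQuery mode
# (nullable=False for REQUIRED); after it, the field is always nullable so
# that local NULLs are preserved and can be rejected by the backend instead.
print(arrow_field.name, arrow_field.type, arrow_field.nullable)  # age int64 True
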
47 changes: 40 additions & 7 deletions tests/system/test_pandas.py
@@ -428,8 +428,7 @@ def test_load_table_from_dataframe_w_nulls(bigquery_client, dataset_id):


def test_load_table_from_dataframe_w_required(bigquery_client, dataset_id):
"""Test that a DataFrame with required columns can be uploaded if a
BigQuery schema is specified.
"""Test that a DataFrame can be uploaded to a table with required columns.

See: https://github.com/googleapis/google-cloud-python/issues/8093
"""
@@ -440,7 +439,6 @@ def test_load_table_from_dataframe_w_required(bigquery_client, dataset_id):

records = [{"name": "Chip", "age": 2}, {"name": "Dale", "age": 3}]
dataframe = pandas.DataFrame(records, columns=["name", "age"])
job_config = bigquery.LoadJobConfig(schema=table_schema)
table_id = "{}.{}.load_table_from_dataframe_w_required".format(
bigquery_client.project, dataset_id
)
@@ -451,15 +449,50 @@
bigquery.Table(table_id, schema=table_schema)
)

job_config = bigquery.LoadJobConfig(schema=table_schema)
load_job = bigquery_client.load_table_from_dataframe(
dataframe, table_id, job_config=job_config
)
load_job = bigquery_client.load_table_from_dataframe(dataframe, table_id)
load_job.result()

table = bigquery_client.get_table(table)
assert tuple(table.schema) == table_schema
assert table.num_rows == 2
for field in table.schema:
assert field.mode == "REQUIRED"


def test_load_table_from_dataframe_w_required_but_local_nulls_fails(
Inline review comment from the PR author:
I've verified on origin/main that this test currently fails with FAILED tests/system/test_pandas.py::test_load_table_from_dataframe_w_required_but_local_nulls_fails - Failed: DID NOT RAISE <class 'google.api_core.exceptions.BadRequest'> but passes after this fix.

bigquery_client, dataset_id
):
"""Test that a DataFrame with nulls can't be uploaded to a table with
required columns.

See: https://github.com/googleapis/python-bigquery/issues/1692
"""
table_schema = (
bigquery.SchemaField("name", "STRING", mode="REQUIRED"),
bigquery.SchemaField("age", "INTEGER", mode="REQUIRED"),
)

records = [
{"name": "Chip", "age": 2},
{"name": "Dale", "age": 3},
{"name": None, "age": None},
{"name": "Alvin", "age": 4},
]
dataframe = pandas.DataFrame(records, columns=["name", "age"])
table_id = (
"{}.{}.load_table_from_dataframe_w_required_but_local_nulls_fails".format(
bigquery_client.project, dataset_id
)
)

# Create the table before loading so that schema mismatch errors are
# identified.
helpers.retry_403(bigquery_client.create_table)(
bigquery.Table(table_id, schema=table_schema)
)

with pytest.raises(google.api_core.exceptions.BadRequest, match="null"):
bigquery_client.load_table_from_dataframe(dataframe, table_id).result()


def test_load_table_from_dataframe_w_explicit_schema(bigquery_client, dataset_id):
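
The user-visible effect of the system-test change above: loading a DataFrame that contains NULLs into a table whose columns are REQUIRED now fails loudly when the load job completes, rather than silently loading placeholder values. A minimal sketch (the project, dataset, and table names are placeholders, not from this PR; it assumes the destination table already exists with REQUIRED name/age columns):

import pandas

from google.api_core import exceptions
from google.cloud import bigquery

client = bigquery.Client()
dataframe = pandas.DataFrame(
    [{"name": "Chip", "age": 2}, {"name": None, "age": None}]
)

try:
    client.load_table_from_dataframe(
        dataframe, "your-project.your_dataset.people"
    ).result()
except exceptions.BadRequest as exc:
    # After this fix the backend rejects the load because of the NULL values;
    # before, they could be silently replaced with arbitrary values
    # (https://github.com/googleapis/python-bigquery/issues/1692).
    print("Load rejected:", exc)
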
47 changes: 33 additions & 14 deletions tests/unit/test__pandas_helpers.py
@@ -1017,30 +1017,41 @@ def test_dataframe_to_arrow_with_required_fields(module_under_test):
)

data = {
"field01": ["hello", "world"],
"field02": [b"abd", b"efg"],
"field03": [1, 2],
"field04": [3, 4],
"field05": [1.25, 9.75],
"field06": [-1.75, -3.5],
"field07": [decimal.Decimal("1.2345"), decimal.Decimal("6.7891")],
"field01": ["hello", None, "world"],
"field02": [b"abd", b"efg", b"hij"],
"field03": [1, 2, 3],
"field04": [4, None, 5],
"field05": [1.25, 0.0, 9.75],
"field06": [-1.75, None, -3.5],
"field07": [
decimal.Decimal("1.2345"),
decimal.Decimal("6.7891"),
-decimal.Decimal("10.111213"),
],
"field08": [
decimal.Decimal("-{d38}.{d38}".format(d38="9" * 38)),
None,
decimal.Decimal("{d38}.{d38}".format(d38="9" * 38)),
],
"field09": [True, False],
"field10": [False, True],
"field09": [True, False, True],
"field10": [False, True, None],
"field11": [
datetime.datetime(1970, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc),
datetime.datetime(2012, 12, 21, 9, 7, 42, tzinfo=datetime.timezone.utc),
datetime.datetime(2022, 7, 14, 23, 59, 59, tzinfo=datetime.timezone.utc),
],
"field12": [datetime.date(9999, 12, 31), datetime.date(1970, 1, 1)],
"field13": [datetime.time(23, 59, 59, 999999), datetime.time(12, 0, 0)],
"field12": [datetime.date(9999, 12, 31), None, datetime.date(1970, 1, 1)],
"field13": [datetime.time(23, 59, 59, 999999), None, datetime.time(12, 0, 0)],
"field14": [
datetime.datetime(1970, 1, 1, 0, 0, 0),
None,
datetime.datetime(2012, 12, 21, 9, 7, 42),
],
"field15": ["POINT(30 10)", "POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))"],
"field15": [
None,
"POINT(30 10)",
"POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))",
],
}
dataframe = pandas.DataFrame(data)

@@ -1049,7 +1060,11 @@ def test_dataframe_to_arrow_with_required_fields(module_under_test):

assert len(arrow_schema) == len(bq_schema)
for arrow_field in arrow_schema:
assert not arrow_field.nullable
# Even if the remote schema is REQUIRED, there's a chance there's
# local NULL values. Arrow will gladly interpret these NULL values
# as non-NULL and give you an arbitrary value. See:
# https://github.com/googleapis/python-bigquery/issues/1692
assert arrow_field.nullable


@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
@@ -1101,7 +1116,11 @@ def test_dataframe_to_arrow_dict_sequence_schema(module_under_test):
arrow_schema = arrow_table.schema

expected_fields = [
pyarrow.field("field01", "string", nullable=False),
# Even if the remote schema is REQUIRED, there's a chance there's
# local NULL values. Arrow will gladly interpret these NULL values
# as non-NULL and give you an arbitrary value. See:
# https://github.com/googleapis/python-bigquery/issues/1692
pyarrow.field("field01", "string", nullable=True),
pyarrow.field("field02", "bool", nullable=True),
]
assert list(arrow_schema) == expected_fields
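
As background for the comment repeated in these tests, a small pyarrow-level sketch (not from this PR) of how pandas NULLs are tracked once converted to Arrow: the NULL lives in the array's validity bitmap, and marking the field nullable carries that information through schema conversion so downstream consumers honor it, whereas the old non-nullable fields could let the NULL be read back as an arbitrary value (see the code comments above and issue #1692).

import pandas
import pyarrow

series = pandas.Series(["Chip", None, "Dale"])
array = pyarrow.Array.from_pandas(series, type=pyarrow.string())
print(array.null_count)  # 1 -- the local NULL is tracked in the validity bitmap

# A nullable field keeps that NULL visible in the resulting table schema.
nullable_field = pyarrow.field("name", pyarrow.string(), nullable=True)
table = pyarrow.Table.from_arrays([array], schema=pyarrow.schema([nullable_field]))
print(table.schema.field("name").nullable)  # True
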