From 58752d5739a2c89a7e99cb4665d7aa254cad95a8 Mon Sep 17 00:00:00 2001 From: shollyman Date: Fri, 14 Feb 2020 17:04:44 -0800 Subject: [PATCH] fix: address bigquery/bqml test failures (#2920) * fix: bigquery/bqml testing The BQML tests use a non-unique dataset ID for multiple examples and testing currently triggers a lot of concurrent creation/deletions of said dataset. Switch to a dataset that leverages uuid to avoid invocations stomping on one another, which also necessitates parameterizing much of the SQL. There's also an issue with the pandas import currently, possibly due to recent changes in pandas. This change also pins pandas to 0.24.2 and doesn't rely on the dependency being expressed as an extra through google-cloud-bigquery. * whitespace lint * update dependencies in requirements.txt --- bigquery/bqml/data_scientist_tutorial_test.py | 29 +++++++----- bigquery/bqml/ncaa_tutorial_test.py | 47 +++++++++---------- bigquery/bqml/requirements.txt | 3 +- .../bqml/resources/feature_input_query.sql | 6 ++- .../bqml/resources/training_data_query.sql | 5 +- 5 files changed, 50 insertions(+), 40 deletions(-) diff --git a/bigquery/bqml/data_scientist_tutorial_test.py b/bigquery/bqml/data_scientist_tutorial_test.py index 532835294d1b..eb4ef4b821bc 100644 --- a/bigquery/bqml/data_scientist_tutorial_test.py +++ b/bigquery/bqml/data_scientist_tutorial_test.py @@ -16,9 +16,14 @@ from google.cloud import bigquery # [END bqml_data_scientist_tutorial_import_and_client] import pytest +import uuid # [START bqml_data_scientist_tutorial_import_and_client] client = bigquery.Client() +# We use a unique dataset ID for this example to avoid collisions with +# other invocations of this tutorial. In practice, you could leverage +# a persistent dataset and not create/destroy it with each invocation. 
+dataset_id = "bqml_tutorial_{}".format(str(uuid.uuid4().hex)) # [END bqml_data_scientist_tutorial_import_and_client] @@ -26,19 +31,19 @@ def delete_dataset(): yield client.delete_dataset( - client.dataset('bqml_tutorial'), delete_contents=True) + client.dataset(dataset_id), delete_contents=True) def test_data_scientist_tutorial(delete_dataset): # [START bqml_data_scientist_tutorial_create_dataset] - dataset = bigquery.Dataset(client.dataset('bqml_tutorial')) + dataset = bigquery.Dataset(client.dataset(dataset_id)) dataset.location = 'US' client.create_dataset(dataset) # [END bqml_data_scientist_tutorial_create_dataset] # [START bqml_data_scientist_tutorial_create_model] sql = """ - CREATE OR REPLACE MODEL `bqml_tutorial.sample_model` + CREATE OR REPLACE MODEL `{}.sample_model` OPTIONS(model_type='logistic_reg') AS SELECT IF(totals.transactions IS NULL, 0, 1) AS label, @@ -50,7 +55,7 @@ def test_data_scientist_tutorial(delete_dataset): `bigquery-public-data.google_analytics_sample.ga_sessions_*` WHERE _TABLE_SUFFIX BETWEEN '20160801' AND '20170630' - """ + """.format(dataset_id) df = client.query(sql).to_dataframe() print(df) # [END bqml_data_scientist_tutorial_create_model] @@ -60,8 +65,8 @@ def test_data_scientist_tutorial(delete_dataset): SELECT * FROM - ML.TRAINING_INFO(MODEL `bqml_tutorial.sample_model`) - """ + ML.TRAINING_INFO(MODEL `{}.sample_model`) + """.format(dataset_id) df = client.query(sql).to_dataframe() print(df) # [END bqml_data_scientist_tutorial_get_training_statistics] @@ -70,7 +75,7 @@ def test_data_scientist_tutorial(delete_dataset): sql = """ SELECT * - FROM ML.EVALUATE(MODEL `bqml_tutorial.sample_model`, ( + FROM ML.EVALUATE(MODEL `{}.sample_model`, ( SELECT IF(totals.transactions IS NULL, 0, 1) AS label, IFNULL(device.operatingSystem, "") AS os, @@ -81,7 +86,7 @@ def test_data_scientist_tutorial(delete_dataset): `bigquery-public-data.google_analytics_sample.ga_sessions_*` WHERE _TABLE_SUFFIX BETWEEN '20170701' AND '20170801')) - """ + 
""".format(dataset_id) df = client.query(sql).to_dataframe() print(df) # [END bqml_data_scientist_tutorial_evaluate_model] @@ -91,7 +96,7 @@ def test_data_scientist_tutorial(delete_dataset): SELECT country, SUM(predicted_label) as total_predicted_purchases - FROM ML.PREDICT(MODEL `bqml_tutorial.sample_model`, ( + FROM ML.PREDICT(MODEL `{}.sample_model`, ( SELECT IFNULL(device.operatingSystem, "") AS os, device.isMobile AS is_mobile, @@ -104,7 +109,7 @@ def test_data_scientist_tutorial(delete_dataset): GROUP BY country ORDER BY total_predicted_purchases DESC LIMIT 10 - """ + """.format(dataset_id) df = client.query(sql).to_dataframe() print(df) # [END bqml_data_scientist_tutorial_predict_transactions] @@ -114,7 +119,7 @@ def test_data_scientist_tutorial(delete_dataset): SELECT fullVisitorId, SUM(predicted_label) as total_predicted_purchases - FROM ML.PREDICT(MODEL `bqml_tutorial.sample_model`, ( + FROM ML.PREDICT(MODEL `{}.sample_model`, ( SELECT IFNULL(device.operatingSystem, "") AS os, device.isMobile AS is_mobile, @@ -128,7 +133,7 @@ def test_data_scientist_tutorial(delete_dataset): GROUP BY fullVisitorId ORDER BY total_predicted_purchases DESC LIMIT 10 - """ + """.format(dataset_id) df = client.query(sql).to_dataframe() print(df) # [END bqml_data_scientist_tutorial_predict_purchases] diff --git a/bigquery/bqml/ncaa_tutorial_test.py b/bigquery/bqml/ncaa_tutorial_test.py index 5fd96a3961fb..488684d86660 100644 --- a/bigquery/bqml/ncaa_tutorial_test.py +++ b/bigquery/bqml/ncaa_tutorial_test.py @@ -14,6 +14,7 @@ import io import os +import uuid # [START bqml_ncaa_tutorial_import_and_client] from google.cloud import bigquery @@ -22,6 +23,10 @@ # [START bqml_ncaa_tutorial_import_and_client] client = bigquery.Client() +# We use a unique dataset ID for this example to avoid collisions with +# other invocations of this tutorial. In practice, you could leverage +# a persistent dataset and not create/destroy it with each invocation. 
+dataset_id = "bqml_tutorial_{}".format(str(uuid.uuid4().hex)) # [END bqml_ncaa_tutorial_import_and_client] @@ -29,12 +34,12 @@ def delete_dataset(): yield client.delete_dataset( - client.dataset('bqml_tutorial'), delete_contents=True) + client.dataset(dataset_id), delete_contents=True) def test_ncaa_tutorial(delete_dataset): # [START bqml_ncaa_tutorial_create_dataset] - dataset = bigquery.Dataset(client.dataset('bqml_tutorial')) + dataset = bigquery.Dataset(client.dataset(dataset_id)) dataset.location = 'US' client.create_dataset(dataset) # [END bqml_ncaa_tutorial_create_dataset] @@ -42,23 +47,17 @@ def test_ncaa_tutorial(delete_dataset): # Create the tables used by the tutorial # Note: the queries are saved to a file. This should be updated to use the # saved queries once the library supports running saved queries. - query_filepath_to_table_name = { - 'feature_input_query.sql': 'cume_games', - 'training_data_query.sql': 'wide_games' - } + query_files = ['feature_input_query.sql', 'training_data_query.sql'] resources_directory = os.path.join(os.path.dirname(__file__), 'resources') - for query_filepath, table_name in query_filepath_to_table_name.items(): - table_ref = dataset.table(table_name) - job_config = bigquery.QueryJobConfig() - job_config.destination = table_ref + for fname in query_files: query_filepath = os.path.join( - resources_directory, query_filepath) - sql = io.open(query_filepath, 'r', encoding='utf-8').read() - client.query(sql, job_config=job_config).result() + resources_directory, fname) + sql = io.open(query_filepath, 'r', encoding='utf-8').read().format(dataset_id) + client.query(sql).result() # [START bqml_ncaa_tutorial_create_model] sql = """ - CREATE OR REPLACE MODEL `bqml_tutorial.ncaa_model` + CREATE OR REPLACE MODEL `{0}.ncaa_model` OPTIONS ( model_type='linear_reg', max_iteration=50 ) AS @@ -69,11 +68,11 @@ def test_ncaa_tutorial(delete_dataset): total_three_points_att), total_three_points_att as label FROM - `bqml_tutorial.wide_games` 
+ `{0}.wide_games` WHERE # remove the game to predict game_id != 'f1063e80-23c7-486b-9a5e-faa52beb2d83' - """ + """.format(dataset_id) df = client.query(sql).to_dataframe() print(df) # [END bqml_ncaa_tutorial_create_model] @@ -83,8 +82,8 @@ def test_ncaa_tutorial(delete_dataset): SELECT * FROM - ML.TRAINING_INFO(MODEL `bqml_tutorial.ncaa_model`) - """ + ML.TRAINING_INFO(MODEL `{}.ncaa_model`) + """.format(dataset_id) df = client.query(sql).to_dataframe() print(df) # [END bqml_ncaa_tutorial_get_training_statistics] @@ -96,13 +95,13 @@ def test_ncaa_tutorial(delete_dataset): *, total_three_points_att AS label FROM - `bqml_tutorial.wide_games` ) + `{0}.wide_games` ) SELECT * FROM - ML.EVALUATE(MODEL `bqml_tutorial.ncaa_model`, + ML.EVALUATE(MODEL `{0}.ncaa_model`, TABLE eval_table) - """ + """.format(dataset_id) df = client.query(sql).to_dataframe() print(df) # [END bqml_ncaa_tutorial_evaluate_model] @@ -113,7 +112,7 @@ def test_ncaa_tutorial(delete_dataset): SELECT * FROM - `bqml_tutorial.wide_games` + `{0}.wide_games` WHERE game_id='f1063e80-23c7-486b-9a5e-faa52beb2d83' ) SELECT @@ -125,7 +124,7 @@ def test_ncaa_tutorial(delete_dataset): game_id, predicted_label AS predicted_total_three_points_att FROM - ML.PREDICT(MODEL `bqml_tutorial.ncaa_model`, + ML.PREDICT(MODEL `{0}.ncaa_model`, table game_to_predict) ) AS predict JOIN ( SELECT @@ -135,7 +134,7 @@ def test_ncaa_tutorial(delete_dataset): game_to_predict) AS truth ON predict.game_id = truth.game_id - """ + """.format(dataset_id) df = client.query(sql).to_dataframe() print(df) # [END bqml_ncaa_tutorial_predict_outcomes] diff --git a/bigquery/bqml/requirements.txt b/bigquery/bqml/requirements.txt index 86141411cac0..dee3fe259612 100644 --- a/bigquery/bqml/requirements.txt +++ b/bigquery/bqml/requirements.txt @@ -1,3 +1,4 @@ -google-cloud-bigquery[pandas]==1.20.0 +pandas==0.24.2 +google-cloud-bigquery==1.23.1 flaky==3.6.1 mock==3.0.5 diff --git a/bigquery/bqml/resources/feature_input_query.sql 
b/bigquery/bqml/resources/feature_input_query.sql index d54f003425db..6348c7ea462e 100644 --- a/bigquery/bqml/resources/feature_input_query.sql +++ b/bigquery/bqml/resources/feature_input_query.sql @@ -1,4 +1,8 @@ -#standardSQL +# This query creates a sample table using +# the ncaa_basketball public dataset. It +# uses a format string token for setting +# the destination dataset. +CREATE OR REPLACE TABLE `{0}.cume_games` AS SELECT game_id, season, diff --git a/bigquery/bqml/resources/training_data_query.sql b/bigquery/bqml/resources/training_data_query.sql index 74f39e9f0aa1..5c2f6708d4e9 100644 --- a/bigquery/bqml/resources/training_data_query.sql +++ b/bigquery/bqml/resources/training_data_query.sql @@ -1,4 +1,5 @@ #standardSQL +CREATE OR REPLACE TABLE `{0}.wide_games` AS SELECT team.game_id AS game_id, team.season AS season, @@ -768,9 +769,9 @@ SELECT opponent.opp_possessions_std_last_5 AS opponent_opp_possessions_std_last_5, opponent.opp_possessions_std_last_10 AS opponent_opp_possessions_std_last_10 FROM - `bqml_tutorial.cume_games` AS team + `{0}.cume_games` AS team JOIN - `bqml_tutorial.cume_games` AS opponent + `{0}.cume_games` AS opponent ON team.game_id = opponent.game_id AND team.team_id != opponent.team_id WHERE