diff --git a/bigquery/bqml/data_scientist_tutorial_test.py b/bigquery/bqml/data_scientist_tutorial_test.py index 532835294d1b..eb4ef4b821bc 100644 --- a/bigquery/bqml/data_scientist_tutorial_test.py +++ b/bigquery/bqml/data_scientist_tutorial_test.py @@ -16,9 +16,14 @@ from google.cloud import bigquery # [END bqml_data_scientist_tutorial_import_and_client] import pytest +import uuid # [START bqml_data_scientist_tutorial_import_and_client] client = bigquery.Client() +# We use a unique dataset ID for this example to avoid collisions with +# other invocations of this tutorial. In practice, you could leverage +# a persistent dataset and not create/destroy it with each invocation. +dataset_id = "bqml_tutorial_{}".format(str(uuid.uuid4().hex)) # [END bqml_data_scientist_tutorial_import_and_client] @@ -26,19 +31,19 @@ def delete_dataset(): yield client.delete_dataset( - client.dataset('bqml_tutorial'), delete_contents=True) + client.dataset(dataset_id), delete_contents=True) def test_data_scientist_tutorial(delete_dataset): # [START bqml_data_scientist_tutorial_create_dataset] - dataset = bigquery.Dataset(client.dataset('bqml_tutorial')) + dataset = bigquery.Dataset(client.dataset(dataset_id)) dataset.location = 'US' client.create_dataset(dataset) # [END bqml_data_scientist_tutorial_create_dataset] # [START bqml_data_scientist_tutorial_create_model] sql = """ - CREATE OR REPLACE MODEL `bqml_tutorial.sample_model` + CREATE OR REPLACE MODEL `{}.sample_model` OPTIONS(model_type='logistic_reg') AS SELECT IF(totals.transactions IS NULL, 0, 1) AS label, @@ -50,7 +55,7 @@ def test_data_scientist_tutorial(delete_dataset): `bigquery-public-data.google_analytics_sample.ga_sessions_*` WHERE _TABLE_SUFFIX BETWEEN '20160801' AND '20170630' - """ + """.format(dataset_id) df = client.query(sql).to_dataframe() print(df) # [END bqml_data_scientist_tutorial_create_model] @@ -60,8 +65,8 @@ def test_data_scientist_tutorial(delete_dataset): SELECT * FROM - ML.TRAINING_INFO(MODEL `bqml_tutorial.sample_model`) - """ + ML.TRAINING_INFO(MODEL `{}.sample_model`) + """.format(dataset_id) df = client.query(sql).to_dataframe() print(df) # [END bqml_data_scientist_tutorial_get_training_statistics] @@ -70,7 +75,7 @@ def test_data_scientist_tutorial(delete_dataset): sql = """ SELECT * - FROM ML.EVALUATE(MODEL `bqml_tutorial.sample_model`, ( + FROM ML.EVALUATE(MODEL `{}.sample_model`, ( SELECT IF(totals.transactions IS NULL, 0, 1) AS label, IFNULL(device.operatingSystem, "") AS os, @@ -81,7 +86,7 @@ def test_data_scientist_tutorial(delete_dataset): `bigquery-public-data.google_analytics_sample.ga_sessions_*` WHERE _TABLE_SUFFIX BETWEEN '20170701' AND '20170801')) - """ + """.format(dataset_id) df = client.query(sql).to_dataframe() print(df) # [END bqml_data_scientist_tutorial_evaluate_model] @@ -91,7 +96,7 @@ def test_data_scientist_tutorial(delete_dataset): SELECT country, SUM(predicted_label) as total_predicted_purchases - FROM ML.PREDICT(MODEL `bqml_tutorial.sample_model`, ( + FROM ML.PREDICT(MODEL `{}.sample_model`, ( SELECT IFNULL(device.operatingSystem, "") AS os, device.isMobile AS is_mobile, @@ -104,7 +109,7 @@ def test_data_scientist_tutorial(delete_dataset): GROUP BY country ORDER BY total_predicted_purchases DESC LIMIT 10 - """ + """.format(dataset_id) df = client.query(sql).to_dataframe() print(df) # [END bqml_data_scientist_tutorial_predict_transactions] @@ -114,7 +119,7 @@ def test_data_scientist_tutorial(delete_dataset): SELECT fullVisitorId, SUM(predicted_label) as total_predicted_purchases - FROM ML.PREDICT(MODEL `bqml_tutorial.sample_model`, ( + FROM ML.PREDICT(MODEL `{}.sample_model`, ( SELECT IFNULL(device.operatingSystem, "") AS os, device.isMobile AS is_mobile, @@ -128,7 +133,7 @@ def test_data_scientist_tutorial(delete_dataset): GROUP BY fullVisitorId ORDER BY total_predicted_purchases DESC LIMIT 10 - """ + """.format(dataset_id) df = client.query(sql).to_dataframe() print(df) # [END bqml_data_scientist_tutorial_predict_purchases] diff --git a/bigquery/bqml/ncaa_tutorial_test.py b/bigquery/bqml/ncaa_tutorial_test.py index 5fd96a3961fb..488684d86660 100644 --- a/bigquery/bqml/ncaa_tutorial_test.py +++ b/bigquery/bqml/ncaa_tutorial_test.py @@ -14,6 +14,7 @@ import io import os +import uuid # [START bqml_ncaa_tutorial_import_and_client] from google.cloud import bigquery @@ -22,6 +23,10 @@ # [START bqml_ncaa_tutorial_import_and_client] client = bigquery.Client() +# We use a unique dataset ID for this example to avoid collisions with +# other invocations of this tutorial. In practice, you could leverage +# a persistent dataset and not create/destroy it with each invocation. +dataset_id = "bqml_tutorial_{}".format(str(uuid.uuid4().hex)) # [END bqml_ncaa_tutorial_import_and_client] @@ -29,12 +34,12 @@ def delete_dataset(): yield client.delete_dataset( - client.dataset('bqml_tutorial'), delete_contents=True) + client.dataset(dataset_id), delete_contents=True) def test_ncaa_tutorial(delete_dataset): # [START bqml_ncaa_tutorial_create_dataset] - dataset = bigquery.Dataset(client.dataset('bqml_tutorial')) + dataset = bigquery.Dataset(client.dataset(dataset_id)) dataset.location = 'US' client.create_dataset(dataset) # [END bqml_ncaa_tutorial_create_dataset] @@ -42,23 +47,17 @@ def test_ncaa_tutorial(delete_dataset): # Create the tables used by the tutorial # Note: the queries are saved to a file. This should be updated to use the # saved queries once the library supports running saved queries. - query_filepath_to_table_name = { - 'feature_input_query.sql': 'cume_games', - 'training_data_query.sql': 'wide_games' - } + query_files = ['feature_input_query.sql', 'training_data_query.sql'] resources_directory = os.path.join(os.path.dirname(__file__), 'resources') - for query_filepath, table_name in query_filepath_to_table_name.items(): - table_ref = dataset.table(table_name) - job_config = bigquery.QueryJobConfig() - job_config.destination = table_ref + for fname in query_files: query_filepath = os.path.join( - resources_directory, query_filepath) - sql = io.open(query_filepath, 'r', encoding='utf-8').read() - client.query(sql, job_config=job_config).result() + resources_directory, fname) + sql = io.open(query_filepath, 'r', encoding='utf-8').read().format(dataset_id) + client.query(sql).result() # [START bqml_ncaa_tutorial_create_model] sql = """ - CREATE OR REPLACE MODEL `bqml_tutorial.ncaa_model` + CREATE OR REPLACE MODEL `{0}.ncaa_model` OPTIONS ( model_type='linear_reg', max_iteration=50 ) AS @@ -69,11 +68,11 @@ def test_ncaa_tutorial(delete_dataset): total_three_points_att), total_three_points_att as label FROM - `bqml_tutorial.wide_games` + `{0}.wide_games` WHERE # remove the game to predict game_id != 'f1063e80-23c7-486b-9a5e-faa52beb2d83' - """ + """.format(dataset_id) df = client.query(sql).to_dataframe() print(df) # [END bqml_ncaa_tutorial_create_model] @@ -83,8 +82,8 @@ def test_ncaa_tutorial(delete_dataset): SELECT * FROM - ML.TRAINING_INFO(MODEL `bqml_tutorial.ncaa_model`) - """ + ML.TRAINING_INFO(MODEL `{}.ncaa_model`) + """.format(dataset_id) df = client.query(sql).to_dataframe() print(df) # [END bqml_ncaa_tutorial_get_training_statistics] @@ -96,13 +95,13 @@ def test_ncaa_tutorial(delete_dataset): *, total_three_points_att AS label FROM - `bqml_tutorial.wide_games` ) + `{0}.wide_games` ) SELECT * FROM - ML.EVALUATE(MODEL `bqml_tutorial.ncaa_model`, + ML.EVALUATE(MODEL `{0}.ncaa_model`, TABLE eval_table) - """ + """.format(dataset_id) df = client.query(sql).to_dataframe() print(df) # [END bqml_ncaa_tutorial_evaluate_model] @@ -113,7 +112,7 @@ def test_ncaa_tutorial(delete_dataset): SELECT * FROM - `bqml_tutorial.wide_games` + `{0}.wide_games` WHERE game_id='f1063e80-23c7-486b-9a5e-faa52beb2d83' ) SELECT @@ -125,7 +124,7 @@ def test_ncaa_tutorial(delete_dataset): game_id, predicted_label AS predicted_total_three_points_att FROM - ML.PREDICT(MODEL `bqml_tutorial.ncaa_model`, + ML.PREDICT(MODEL `{0}.ncaa_model`, table game_to_predict) ) AS predict JOIN ( SELECT @@ -135,7 +134,7 @@ def test_ncaa_tutorial(delete_dataset): game_to_predict) AS truth ON predict.game_id = truth.game_id - """ + """.format(dataset_id) df = client.query(sql).to_dataframe() print(df) # [END bqml_ncaa_tutorial_predict_outcomes] diff --git a/bigquery/bqml/requirements.txt b/bigquery/bqml/requirements.txt index 86141411cac0..dee3fe259612 100644 --- a/bigquery/bqml/requirements.txt +++ b/bigquery/bqml/requirements.txt @@ -1,3 +1,4 @@ -google-cloud-bigquery[pandas]==1.20.0 +pandas==0.24.2 +google-cloud-bigquery==1.23.1 flaky==3.6.1 mock==3.0.5 diff --git a/bigquery/bqml/resources/feature_input_query.sql b/bigquery/bqml/resources/feature_input_query.sql index d54f003425db..6348c7ea462e 100644 --- a/bigquery/bqml/resources/feature_input_query.sql +++ b/bigquery/bqml/resources/feature_input_query.sql @@ -1,4 +1,8 @@ -#standardSQL +# This query creates a sample table using +# the ncaa_basketball public dataset. It +# uses a format string token for setting +# the destination dataset. +CREATE OR REPLACE TABLE `{0}.cume_games` AS SELECT game_id, season, diff --git a/bigquery/bqml/resources/training_data_query.sql b/bigquery/bqml/resources/training_data_query.sql index 74f39e9f0aa1..5c2f6708d4e9 100644 --- a/bigquery/bqml/resources/training_data_query.sql +++ b/bigquery/bqml/resources/training_data_query.sql @@ -1,4 +1,5 @@ #standardSQL +CREATE OR REPLACE TABLE `{0}.wide_games` AS SELECT team.game_id AS game_id, team.season AS season, @@ -768,9 +769,9 @@ SELECT opponent.opp_possessions_std_last_5 AS opponent_opp_possessions_std_last_5, opponent.opp_possessions_std_last_10 AS opponent_opp_possessions_std_last_10 FROM - `bqml_tutorial.cume_games` AS team + `{0}.cume_games` AS team JOIN - `bqml_tutorial.cume_games` AS opponent + `{0}.cume_games` AS opponent ON team.game_id = opponent.game_id AND team.team_id != opponent.team_id WHERE