diff --git a/samples/tables/notebooks/census_income_prediction/getting_started_notebook.ipynb b/samples/tables/notebooks/census_income_prediction/getting_started_notebook.ipynb index 48af9c77..f98fc1d3 100644 --- a/samples/tables/notebooks/census_income_prediction/getting_started_notebook.ipynb +++ b/samples/tables/notebooks/census_income_prediction/getting_started_notebook.ipynb @@ -295,14 +295,14 @@ "source": [ "## Quickstart for AutoML tables\n", "\n", - "This section of the tutorial walks you through creating an AutoML client." + "This section of the tutorial walks you through creating an AutoML Tables client." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Additionally, one will want to create an instance to the AutoMLClient. \n", + "Additionally, one will want to create an instance to the TablesClient. \n", "This client instance is the HTTP request/response interface between the python script and the GCP AutoML service." ] }, @@ -322,8 +322,7 @@ "metadata": {}, "outputs": [], "source": [ - "client = automl.AutoMlClient()\n", - "prediction_client = automl.PredictionServiceClient()" + "client = automl.TablesClient(project=PROJECT_ID, region=COMPUTE_REGION)" ] }, { @@ -345,8 +344,9 @@ "metadata": {}, "outputs": [], "source": [ - "# client = automl.AutoMlClient.from_service_account_file('/path/to/service_account.json')\n", - "# prediction_client = automl.PredictionServiceClient.from_service_account_file('/path/to/service_account.json')" + "# from google.oauth2 import service_account\n", + "# credentials = service_account.Credentials.from_service_account_file('/path/to/service_account.json')\n", + "# client = automl.TablesClient(project=PROJECT_ID, region=COMPUTE_REGION, credentials=credentials)" ] }, { @@ -356,21 +356,11 @@ "---" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Get the GCP location of your project.\n", - "project_location = client.location_path(PROJECT_ID, 
COMPUTE_REGION)" - ] - }, { "cell_type": "markdown", "metadata": {}, "source": [ - "List datasets in Project:" + "List datasets in your project:" ] }, { @@ -385,7 +375,7 @@ "outputs": [], "source": [ "# List datasets in Project\n", - "list_datasets = client.list_datasets(project_location)\n", + "list_datasets = client.list_datasets()\n", "datasets = { dataset.display_name: dataset.name for dataset in list_datasets }\n", "datasets" ] @@ -411,7 +401,7 @@ }, "outputs": [], "source": [ - "list_models = client.list_models(project_location)\n", + "list_models = client.list_models()\n", "models = { model.display_name: model.name for model in list_models }\n", "models" ] @@ -446,9 +436,7 @@ "id": "_JfZFGSceyE_" }, "source": [ - "Now we are ready to create a dataset instance (on GCP) using the client method create_dataset(). This method takes two parameters, the **project_location** (see above) and *dataset_settings*. \n", - "\n", - "The **dataset_settings** parameter is a dictionary with two keys: **display_name** and **tables_dataset_metadata**. A value must be specified for the display_name, which must be a string consisting only of alphanumeric characters and the underscore. The display name is what one would see through the web UI interface to the AutoML service.\n", + "Now we are ready to create a dataset instance (on GCP) using the client method create_dataset(). This method has one required parameter, the human readable display name `dataset_display_name`.\n", "\n", "Select a dataset display name and pass your table source information to create a new dataset." 
] @@ -466,11 +454,8 @@ "# Create dataset\n", "\n", "dataset_display_name = 'census' \n", - "dataset_settings = {'display_name': dataset_display_name, \n", - " 'tables_dataset_metadata': {}}\n", - "create_dataset_response = client.create_dataset(project_location, dataset_settings)\n", - "dataset_name = create_dataset_response.name\n", - "create_dataset_response" + "dataset = client.create_dataset(dataset_display_name)\n", + "dataset" ] }, { @@ -491,78 +476,26 @@ }, "source": [ "You can import your data to AutoML Tables from GCS or BigQuery. For this tutorial, you can use the [census_income dataset](https://storage.cloud.google.com/cloud-ml-data/automl-tables/notebooks/census_income.csv) \n", - "as your training data. You can create a GCS bucket and upload the data into your bucket.\n", - "\n", - "- The URI for your file is `gs://BUCKET_NAME/filename`. \n", - "\n", - "Alternatively you can create a BigQuery table and upload the data into the table:\n", - "\n", - "- The URI for your table is `bq://PROJECT_ID.DATASET_ID.TABLE_ID`.\n", - "\n", - "Importing data may take a few minutes or hours depending on the size of your data. If your Colab times out, run the following command to retrieve your dataset. Replace `dataset_name` with its actual value obtained in the preceding cells.\n", - "\n", - " dataset = client.get_dataset(dataset_name)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Data source is GCS**" + "as your training data. We provide code below to copy the data into a bucket you own automatically. You are free to adjust the value of `GCS_STORAGE_BUCKET` as needed." 
] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "UIWlq3NTYhOl" - }, - "outputs": [], - "source": [ - "gcs_input_uris = ['gs://cloud-ml-data-tables/notebooks/census_income.csv',]\n", - "\n", - "# Define input configuration.\n", - "input_config = {\n", - " 'gcs_source': {\n", - " 'input_uris': gcs_input_uris\n", - " }\n", - "}" - ] - }, - { - "cell_type": "markdown", "metadata": {}, - "source": [ - "**Data source is BigQuery**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "bB_GdeqCJW5i" - }, "outputs": [], "source": [ - "bq_input_uri = 'bq://bigquery-public-data.ml_datasets.census_adult_income'\n", - "\n", - "# Define input configuration.\n", - "input_config = {\n", - " 'bigquery_source': {\n", - " 'input_uri': bq_input_uri\n", - " }\n", - "}" + "GCS_STORAGE_BUCKET = 'gs://{}-codelab-data-storage'.format(PROJECT_ID)\n", + "GCS_DATASET_URI = '{}/census_income.csv'.format(GCS_STORAGE_BUCKET)\n", + "! gsutil ls $GCS_STORAGE_BUCKET || gsutil mb -l $COMPUTE_REGION $GCS_STORAGE_BUCKET\n", + "! gsutil cp gs://cloud-ml-data-tables/notebooks/census_income.csv $GCS_DATASET_URI" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Import data into the dataset, this process may take a while, depending on your data, once completed, you can verify the status in cell below." + "Import data into the dataset, this process may take a while, depending on your data, once completed, you can verify the status by printing the dataset object. This time pay attention to the example_count field with 32561 records." 
] }, { @@ -575,28 +508,15 @@ }, "outputs": [], "source": [ - "import_data_response = client.import_data(dataset_name, input_config)\n", - "print('Dataset import operation: {}'.format(import_data_response.operation))\n", + "import_data_operation = client.import_data(\n", + " dataset=dataset,\n", + " gcs_input_uris=GCS_DATASET_URI\n", + ")\n", + "print('Dataset import operation: {}'.format(import_data_operation))\n", "\n", "# Synchronous check of operation status. Wait until import is done.\n", - "import_data_result = import_data_response.result()\n", - "import_data_response.done()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Obtain the dataset details, this time pay attention to the `example_count` field with 32561 records." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dataset = client.get_dataset(dataset_name)\n", + "import_data_operation.result()\n", + "dataset = client.get_dataset(dataset_name=dataset.name)\n", "dataset" ] }, @@ -610,19 +530,6 @@ "### Review the data specs" ] }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "RC0PWKqH4jwr" - }, - "source": [ - "Run the following command to see table specs such as row count.\n", - "We can see the different data types (numerical, string or categorical). 
\n", - "\n", - "More information [here](https://cloud.google.com/automl-tables/docs/data-types)" - ] - }, { "cell_type": "code", "execution_count": null, @@ -634,12 +541,11 @@ "outputs": [], "source": [ "# List table specs\n", - "list_table_specs_response = client.list_table_specs(dataset_name)\n", + "list_table_specs_response = client.list_table_specs(dataset=dataset)\n", "table_specs = [s for s in list_table_specs_response]\n", "\n", "# List column specs\n", - "table_spec_name = table_specs[0].name\n", - "list_column_specs_response = client.list_column_specs(table_spec_name)\n", + "list_column_specs_response = client.list_column_specs(dataset=dataset)\n", "column_specs = {s.display_name: s for s in list_column_specs_response}\n", "\n", "# Print Features and data_type:\n", @@ -720,14 +626,12 @@ }, "outputs": [], "source": [ - "update_column_spec_dict = {\n", - " 'name': column_specs['income_bracket'].name,\n", - " 'data_type': {\n", - " 'type_code': 'CATEGORY',\n", - " 'nullable': False\n", - " }\n", - "}\n", - "update_column_response = client.update_column_spec(update_column_spec_dict)\n", + "update_column_response = client.update_column_spec(\n", + " dataset=dataset,\n", + " column_spec_display_name='income',\n", + " type_code='CATEGORY',\n", + " nullable=False,\n", + ")\n", "update_column_response" ] }, @@ -761,19 +665,10 @@ }, "outputs": [], "source": [ - "label_column_name = 'income_bracket'\n", - "label_column_spec = column_specs[label_column_name]\n", - "label_column_id = label_column_spec.name.rsplit('/', 1)[-1]\n", - "print('Label column ID: {}'.format(label_column_id))\n", - "\n", - "# Define the values of the fields to be updated.\n", - "update_dataset_dict = {\n", - " 'name': dataset_name,\n", - " 'tables_dataset_metadata': {\n", - " 'target_column_spec_id': label_column_id\n", - " }\n", - "}\n", - "update_dataset_response = client.update_dataset(update_dataset_dict)\n", + "update_dataset_response = client.set_target_column(\n", + " 
dataset=dataset,\n", + " column_spec_display_name='income',\n", + ")\n", "update_dataset_response" ] }, @@ -806,12 +701,12 @@ "source": [ "Once we have defined our datasets and features we will create a model.\n", "\n", - "Specify the duration of the training. For example, `'train_budget_milli_node_hours': 1000` runs the training for one hour. \n", + "Specify the duration of the training. For example, `train_budget_milli_node_hours=1000` runs the training for one hour. \n", "\n", - "If your Colab times out, use `client.list_models(project_location)` to check whether your model has been created. Then use model name to continue to the next steps. Run the following command to retrieve your model. Replace `model_name` with its actual value.\n", + "If your Colab times out, use `client.list_models()` to check whether your model has been created. Then use model name to continue to the next steps. Run the following command to retrieve your model.\n", "\n", - "```\n", - " model = client.get_model(model_name) \n", + "```python\n", + " model = client.get_model(model_display_name=model_display_name) \n", "```" ] }, @@ -827,34 +722,15 @@ "source": [ "model_display_name = 'census_income_model'\n", "\n", - "model_dict = {\n", - " 'display_name': model_display_name,\n", - " 'dataset_id': dataset_name.rsplit('/', 1)[-1],\n", - " 'tables_model_metadata': {'train_budget_milli_node_hours': 1000}\n", - "}\n", - "create_model_response = client.create_model(project_location, model_dict)\n", - "print('Dataset import operation: {}'.format(create_model_response.operation))\n", + "create_model_response = client.create_model(\n", + " model_display_name,\n", + " dataset=dataset,\n", + " train_budget_milli_node_hours=1000,\n", + ")\n", + "print('Create model operation: {}'.format(create_model_response.operation))\n", "# Wait until model training is done.\n", - "create_model_result = create_model_response.result()\n", - "create_model_result" - ] - }, - { - "cell_type": "markdown", - "metadata": 
{}, - "source": [ - "#### Model status" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Get complete detail of the model.\n", - "model_name = create_model_result.name\n", - "client.get_model(model_name)" + "model = create_model_response.result()\n", + "model" ] }, { @@ -867,36 +743,6 @@ "___" ] }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "xGVGwgwXSZe_" - }, - "source": [ - "Adjust the slides on the right to the desired test values for your online prediction." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "bDzd5GYQSdpa" - }, - "outputs": [], - "source": [ - "#@title Make an online prediction: set the numeric variables{ vertical-output: true }\n", - "\n", - "age = 34 #@param {type:'slider', min:1, max:100, step:1}\n", - "capital_gain = 40000 #@param {type:'slider', min:0, max:100000, step:10000}\n", - "capital_loss = 3.8 #@param {type:'slider', min:0, max:4000, step:0.1}\n", - "fnlwgt = 150000 #@param {type:'slider', min:0, max:1000000, step:50000}\n", - "education_num = 9 #@param {type:'slider', min:1, max:16, step:1}\n", - "hours_per_week = 40 #@param {type:'slider', min:1, max:100, step:1}" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -913,7 +759,7 @@ "source": [ "**Important** : Deploy the model, then wait until the model FINISHES deployment.\n", "\n", - "The model takes a while to deploy online. When the deployment code response = client.deploy_model(model_name) finishes, you will be able to see this on the UI. 
Check the [UI](https://console.cloud.google.com/automl-tables?_ga=2.255483016.-1079099924.1550856636) and navigate to the predict tab of your model, and then to the online prediction portion, to see when it finishes online deployment before running the prediction cell.You should see \"online prediction\" text near the top, click on it, and it will take you to a view of your online prediction interface. You should see \"model deployed\" on the far right of the screen if the model is deployed, or a \"deploying model\" message if it is still deploying. " + "The model takes a while to deploy online. When the deployment code response = client.deploy_model(model_name=model.name) finishes, you will be able to see this on the UI. Check the [UI](https://console.cloud.google.com/automl-tables?_ga=2.255483016.-1079099924.1550856636) and navigate to the predict tab of your model, and then to the online prediction portion, to see when it finishes online deployment before running the prediction cell.You should see \"online prediction\" text near the top, click on it, and it will take you to a view of your online prediction interface. You should see \"model deployed\" on the far right of the screen if the model is deployed, or a \"deploying model\" message if it is still deploying. 
" ] }, { @@ -926,8 +772,7 @@ }, "outputs": [], "source": [ - "deploy_model_response = client.deploy_model(model_name)\n", - "deploy_model_result = deploy_model_response.result()" + "client.deploy_model(model=model).result()" ] }, { @@ -943,7 +788,8 @@ "metadata": {}, "outputs": [], "source": [ - "client.get_model(model_name)" + "model = client.get_model(model_name=model.name)\n", + "model" ] }, { @@ -1002,29 +848,60 @@ "sex_ids = ['Female', 'Male']\n", "native_country_ids = ['United-States', 'Cambodia', 'England', 'Puerto-Rico', 'Canada', 'Germany', 'Outlying-US(Guam-USVI-etc)', 'India', 'Japan', 'Greece', 'South', 'China', 'Cuba', 'Iran', 'Honduras', 'Philippines', 'Italy', 'Poland', 'Jamaica', 'Vietnam', 'Mexico', 'Portugal', 'Ireland', 'France', 'Dominican-Republic', 'Laos', 'Ecuador', 'Taiwan', 'Haiti', 'Columbia', 'Hungary', 'Guatemala', 'Nicaragua', 'Scotland', 'Thailand', 'Yugoslavia', 'El-Salvador', 'Trinadad&Tobago', 'Peru', 'Hong', 'Holand-Netherlands']\n", "\n", - "workclass = widgets.Dropdown(options=workclass_ids, value=workclass_ids[0],\n", - " description='workclass:')\n", - "\n", - "education = widgets.Dropdown(options=education_ids, value=education_ids[0],\n", - " description='education:', width='500px')\n", - "\n", - "marital_status = widgets.Dropdown(options=marital_status_ids, value=marital_status_ids[0],\n", - " description='marital status:', width='500px')\n", - "\n", - "occupation = widgets.Dropdown(options=occupation_ids, value=occupation_ids[0],\n", - " description='occupation:', width='500px')\n", - "\n", - "relationship = widgets.Dropdown(options=relationship_ids, value=relationship_ids[0],\n", - " description='relationship:', width='500px')\n", - "\n", - "race = widgets.Dropdown(options=race_ids, value=race_ids[0],\n", - " description='race:', width='500px')\n", - "\n", - "sex = widgets.Dropdown(options=sex_ids, value=sex_ids[0],\n", - " description='sex:', width='500px')\n", - "\n", - "native_country = 
widgets.Dropdown(options=native_country_ids, value=native_country_ids[0],\n", - " description='native_country:', width='500px')\n", + "workclass = widgets.Dropdown(\n", + " options=workclass_ids, \n", + " value=workclass_ids[0],\n", + " description='workclass:'\n", + ")\n", + "\n", + "education = widgets.Dropdown(\n", + " options=education_ids, \n", + " value=education_ids[0],\n", + " description='education:', \n", + " width='500px'\n", + ")\n", + " \n", + "marital_status = widgets.Dropdown(\n", + " options=marital_status_ids, \n", + " value=marital_status_ids[0],\n", + " description='marital status:', \n", + " width='500px'\n", + ")\n", + "\n", + "occupation = widgets.Dropdown(\n", + " options=occupation_ids, \n", + " value=occupation_ids[0],\n", + " description='occupation:', \n", + " width='500px'\n", + ")\n", + "\n", + "relationship = widgets.Dropdown(\n", + " options=relationship_ids, \n", + " value=relationship_ids[0],\n", + " description='relationship:', \n", + " width='500px'\n", + ")\n", + "\n", + "race = widgets.Dropdown(\n", + " options=race_ids, \n", + " value=race_ids[0], \n", + " description='race:', \n", + " width='500px'\n", + ")\n", + "\n", + "sex = widgets.Dropdown(\n", + " options=sex_ids, \n", + " value=sex_ids[0],\n", + " description='sex:', \n", + " width='500px'\n", + ")\n", + "\n", + "native_country = widgets.Dropdown(\n", + " options=native_country_ids, \n", + " value=native_country_ids[0],\n", + " description='native_country:', \n", + " width='500px'\n", + ")\n", "\n", "display(workclass)\n", "display(education)\n", @@ -1036,6 +913,36 @@ "display(native_country)" ] }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "xGVGwgwXSZe_" + }, + "source": [ + "Adjust the slides on the right to the desired test values for your online prediction." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "bDzd5GYQSdpa" + }, + "outputs": [], + "source": [ + "#@title Make an online prediction: set the numeric variables{ vertical-output: true }\n", + "\n", + "age = 34 #@param {type:'slider', min:1, max:100, step:1}\n", + "capital_gain = 40000 #@param {type:'slider', min:0, max:100000, step:10000}\n", + "capital_loss = 3.8 #@param {type:'slider', min:0, max:4000, step:0.1}\n", + "fnlwgt = 150000 #@param {type:'slider', min:0, max:1000000, step:50000}\n", + "education_num = 9 #@param {type:'slider', min:1, max:16, step:1}\n", + "hours_per_week = 40 #@param {type:'slider', min:1, max:100, step:1}" + ] + }, { "cell_type": "markdown", "metadata": { @@ -1056,28 +963,25 @@ }, "outputs": [], "source": [ - "payload = {\n", - " 'row': { \n", - " 'values': [\n", - " {'number_value': age},\n", - " {'string_value': workclass.value},\n", - " {'number_value': fnlwgt},\n", - " {'string_value': education.value},\n", - " {'number_value': education_num},\n", - " {'string_value': marital_status.value},\n", - " {'string_value': occupation.value},\n", - " {'string_value': relationship.value},\n", - " {'string_value': race.value},\n", - " {'string_value': sex.value},\n", - " {'number_value': capital_gain},\n", - " {'number_value': capital_loss},\n", - " {'number_value': hours_per_week},\n", - " {'string_value': native_country.value}\n", - " ]\n", - " }\n", + "inputs = {\n", + " 'age': age,\n", + " 'workclass': workclass.value,\n", + " 'fnlwgt': fnlwgt,\n", + " 'education': education.value,\n", + " 'education_num': education_num,\n", + " 'marital_status': marital_status.value,\n", + " 'occupation': occupation.value,\n", + " 'relationship': relationship.value,\n", + " 'race': race.value,\n", + " 'sex': sex.value,\n", + " 'capital_gain': capital_gain,\n", + " 'capital_loss': capital_loss,\n", + " 'hours_per_week': hours_per_week,\n", + " 'native_country': 
native_country.value,\n", "}\n", - "prediction_result = prediction_client.predict(model_name, payload)\n", - "print(prediction_result)" + "\n", + "prediction_result = client.predict(model=model, inputs=inputs)\n", + "prediction_result" ] }, { @@ -1120,7 +1024,7 @@ }, "outputs": [], "source": [ - "undeploy_model_response = client.undeploy_model(model_name)" + "undeploy_model_response = client.undeploy_model(model=model)" ] }, { @@ -1177,36 +1081,9 @@ "metadata": {}, "outputs": [], "source": [ - "! gsutil ls -al gs://cloud-ml-data-tables/notebooks/census_income_batch_prediction_input.csv" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "gkF3bH0qu4DU" - }, - "outputs": [], - "source": [ - "#@title Start batch prediction { vertical-output: true, output-height: 200 }\n", - "\n", - "batch_predict_gcs_input_uris = ['gs://cloud-ml-data-tables/notebooks/census_income_batch_prediction_input.csv',] #@param\n", - "batch_predict_gcs_output_uri_prefix = 'gs://automl-tables-pred/' #@param {type:'string'}\n", - "\n", - "# Define input source.\n", - "batch_prediction_input_source = {\n", - " 'gcs_source': {\n", - " 'input_uris': batch_predict_gcs_input_uris\n", - " }\n", - "}\n", - "# Define output target.\n", - "batch_prediction_output_target = {\n", - " 'gcs_destination': {\n", - " 'output_uri_prefix': batch_predict_gcs_output_uri_prefix\n", - " }\n", - "}" + "GCS_BATCH_PREDICT_URI = '{}/census_income_batch_prediction_input.csv'.format(GCS_STORAGE_BUCKET)\n", + "GCS_BATCH_PREDICT_OUTPUT = '{}/census_income_predictions/'.format(GCS_STORAGE_BUCKET)\n", + "! 
gsutil cp gs://cloud-ml-data-tables/notebooks/census_income_batch_prediction_input.csv $GCS_BATCH_PREDICT_URI" ] }, { @@ -1222,8 +1099,11 @@ "metadata": {}, "outputs": [], "source": [ - "batch_predict_response = prediction_client.batch_predict(\n", - " model_name, batch_prediction_input_source, batch_prediction_output_target)\n", + "batch_predict_response = client.batch_predict(\n", + " model=model, \n", + " gcs_input_uris=GCS_BATCH_PREDICT_URI,\n", + " gcs_output_uri_prefix=GCS_BATCH_PREDICT_OUTPUT,\n", + ")\n", "print('Batch prediction operation: {}'.format(batch_predict_response.operation))\n", "# Wait until batch prediction is done.\n", "batch_predict_result = batch_predict_response.result()\n", @@ -1263,7 +1143,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.3" + "version": "3.6.7" } }, "nbformat": 4, diff --git a/samples/tables/notebooks/energy_price_forecasting/energy_price_forecasting.ipynb b/samples/tables/notebooks/energy_price_forecasting/energy_price_forecasting.ipynb index 681ad410..288daabc 100644 --- a/samples/tables/notebooks/energy_price_forecasting/energy_price_forecasting.ipynb +++ b/samples/tables/notebooks/energy_price_forecasting/energy_price_forecasting.ipynb @@ -1,728 +1,702 @@ - { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "KOAz-lD1P7Kx" + }, + "source": [ + "----------------------------------------\n", + "\n", + "Copyright 2018 Google LLC \n", + "\n", + "Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "you may not use this file except in compliance with the License.\n", + "You may obtain a copy of the License at\n", + "\n", + "[http://www.apache.org/licenses/LICENSE-2.0](http://www.apache.org/licenses/LICENSE-2.0)\n", + "\n", + "Unless required by applicable law or agreed to in writing, software\n", + "distributed under the License is distributed on an \"AS 
IS\" BASIS,\n", + "WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "See the License for the specific language governing permissions and limitations under the License.\n", + "\n", + "----------------------------------------" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "m26YhtBMvVWA" + }, + "source": [ + "# Energy Forecasting with AutoML Tables\n", + "\n", + "To use this Colab notebook, copy it to your own Google Drive and open it with [Colaboratory](https://colab.research.google.com/) (or Colab). To run a cell hold the Shift key and press the Enter key (or Return key). Colab automatically displays the return value of the last line in each cell. Refer to [this page](https://colab.research.google.com/notebooks/welcome.ipynb) for more information on Colab.\n", + "\n", + "You can run a Colab notebook on a hosted runtime in the Cloud. The hosted VM times out after 90 minutes of inactivity and you will lose all the data stored in the memory including your authentication data. If your session gets disconnected (for example, because you closed your laptop) for less than the 90 minute inactivity timeout limit, press 'RECONNECT' on the top right corner of your notebook and resume the session. After Colab timeout, you'll need to\n", + "\n", + "1. Re-run the initialization and authentication.\n", + "2. Continue from where you left off. You may need to copy-paste the value of some variables such as the `dataset_name` from the printed output of the previous cells.\n", + "\n", + "Alternatively you can connect your Colab notebook to a [local runtime](https://research.google.com/colaboratory/local-runtimes.html)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "b--5FDDwCG9C" + }, + "source": [ + "## 1. 
Project set up\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "AZs0ICgy4jkQ" + }, + "source": [ + "Follow the [AutoML Tables documentation](https://cloud.google.com/automl-tables/docs/) to\n", + "* Create a Google Cloud Platform (GCP) project.\n", + "* Enable billing.\n", + "* Apply to whitelist your project.\n", + "* Enable AutoML API.\n", + "* Enable AutoML Tables API.\n", + "* Create a service account, grant required permissions, and download the service account private key.\n", + "\n", + "You also need to upload your data into Google Cloud Storage (GCS) or BigQuery. For example, to use GCS as your data source\n", + "* Create a GCS bucket.\n", + "* Upload the training and batch prediction files.\n", + "\n", + "\n", + "**Warning:** Private keys must be kept secret. If you expose your private key it is recommended to revoke it immediately from the Google Cloud Console." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "xZECt1oL429r" + }, + "source": [ + "\n", + "\n", + "---\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "rstRPH9SyZj_" + }, + "source": [ + "## 2. Initialize and authenticate\n", + "This section runs initialization and authentication. It creates an authenticated session which is required for running any of the following sections." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "BR0POq2UzE7e" + }, + "source": [ + "### Install the client library\n", + "Run the following cell to install the client library using `pip`."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { "colab": { - "name": "Energy_Price_Forecasting.ipynb", - "version": "0.3.2", - "provenance": [], - "collapsed_sections": [] - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - } - }, - "cells": [ - { - "metadata": { - "id": "KOAz-lD1P7Kx", - "colab_type": "text" - }, - "cell_type": "markdown", - "source": [ - "----------------------------------------\n", - "\n", - "Copyright 2018 Google LLC \n", - "\n", - "Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "you may not use this file except in compliance with the License.\n", - "You may obtain a copy of the License at\n", - "\n", - "[http://www.apache.org/licenses/LICENSE-2.0](http://www.apache.org/licenses/LICENSE-2.0)\n", - "\n", - "Unless required by applicable law or agreed to in writing, software\n", - "distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", - "See the License for the specific language governing permissions and limitations under the License.\n", - "\n", - "----------------------------------------" - ] - }, - { - "metadata": { - "colab_type": "text", - "id": "m26YhtBMvVWA" - }, - "cell_type": "markdown", - "source": [ - "# Energy Forecasting with AutoML Tables\n", - "\n", - "To use this Colab notebook, copy it to your own Google Drive and open it with [Colaboratory](https://colab.research.google.com/) (or Colab). To run a cell hold the Shift key and press the Enter key (or Return key). Colab automatically displays the return value of the last line in each cell. Refer to [this page](https://colab.research.google.com/notebooks/welcome.ipynb) for more information on Colab.\n", - "\n", - "You can run a Colab notebook on a hosted runtime in the Cloud. 
The hosted VM times out after 90 minutes of inactivity and you will lose all the data stored in the memory including your authentication data. If your session gets disconnected (for example, because you closed your laptop) for less than the 90 minute inactivity timeout limit, press 'RECONNECT' on the top right corner of your notebook and resume the session. After Colab timeout, you'll need to\n", - "\n", - "1. Re-run the initialization and authentication.\n", - "2. Continue from where you left off. You may need to copy-paste the value of some variables such as the `dataset_name` from the printed output of the previous cells.\n", - "\n", - "Alternatively you can connect your Colab notebook to a [local runtime](https://research.google.com/colaboratory/local-runtimes.html)." - ] - }, - { - "metadata": { - "colab_type": "text", - "id": "b--5FDDwCG9C" - }, - "cell_type": "markdown", - "source": [ - "## 1. Project set up\n", - "\n", - "\n", - "\n" - ] - }, - { - "metadata": { - "colab_type": "text", - "id": "AZs0ICgy4jkQ" - }, - "cell_type": "markdown", - "source": [ - "Follow the [AutoML Tables documentation](https://cloud.google.com/automl-tables/docs/) to\n", - "* Create a Google Cloud Platform (GCP) project.\n", - "* Enable billing.\n", - "* Apply to whitelist your project.\n", - "* Enable AutoML API.\n", - "* Enable AutoML Talbes API.\n", - "* Create a service account, grant required permissions, and download the service account private key.\n", - "\n", - "You also need to upload your data into Google Cloud Storage (GCS) or BigQuery. For example, to use GCS as your data source\n", - "* Create a GCS bucket.\n", - "* Upload the training and batch prediction files.\n", - "\n", - "\n", - "**Warning:** Private keys must be kept secret. If you expose your private key it is recommended to revoke it immediately from the Google Cloud Console." 
- ] - }, - { - "metadata": { - "colab_type": "text", - "id": "xZECt1oL429r" - }, - "cell_type": "markdown", - "source": [ - "\n", - "\n", - "---\n", - "\n" - ] - }, - { - "metadata": { - "colab_type": "text", - "id": "rstRPH9SyZj_" - }, - "cell_type": "markdown", - "source": [ - "## 2. Initialize and authenticate\n", - "This section runs intialization and authentication. It creates an authenticated session which is required for running any of the following sections." - ] - }, - { - "metadata": { - "colab_type": "text", - "id": "BR0POq2UzE7e" - }, - "cell_type": "markdown", - "source": [ - "### Install the client library\n", - "Run the following cell to install the client library using `pip`." - ] - }, - { - "metadata": { - "id": "43aXKjDRt_qZ", - "colab_type": "code", - "colab": { - "resources": { - "http://localhost:8080/nbextensions/google.colab/files.js": { - "data": "Ly8gQ29weXJpZ2h0IDIwMTcgR29vZ2xlIExMQwovLwovLyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKLy8geW91IG1heSBub3QgdXNlIHRoaXMgZmlsZSBleGNlcHQgaW4gY29tcGxpYW5jZSB3aXRoIHRoZSBMaWNlbnNlLgovLyBZb3UgbWF5IG9idGFpbiBhIGNvcHkgb2YgdGhlIExpY2Vuc2UgYXQKLy8KLy8gICAgICBodHRwOi8vd3d3LmFwYWNoZS5vcmcvbGljZW5zZXMvTElDRU5TRS0yLjAKLy8KLy8gVW5sZXNzIHJlcXVpcmVkIGJ5IGFwcGxpY2FibGUgbGF3IG9yIGFncmVlZCB0byBpbiB3cml0aW5nLCBzb2Z0d2FyZQovLyBkaXN0cmlidXRlZCB1bmRlciB0aGUgTGljZW5zZSBpcyBkaXN0cmlidXRlZCBvbiBhbiAiQVMgSVMiIEJBU0lTLAovLyBXSVRIT1VUIFdBUlJBTlRJRVMgT1IgQ09ORElUSU9OUyBPRiBBTlkgS0lORCwgZWl0aGVyIGV4cHJlc3Mgb3IgaW1wbGllZC4KLy8gU2VlIHRoZSBMaWNlbnNlIGZvciB0aGUgc3BlY2lmaWMgbGFuZ3VhZ2UgZ292ZXJuaW5nIHBlcm1pc3Npb25zIGFuZAovLyBsaW1pdGF0aW9ucyB1bmRlciB0aGUgTGljZW5zZS4KCi8qKgogKiBAZmlsZW92ZXJ2aWV3IEhlbHBlcnMgZm9yIGdvb2dsZS5jb2xhYiBQeXRob24gbW9kdWxlLgogKi8KKGZ1bmN0aW9uKHNjb3BlKSB7CmZ1bmN0aW9uIHNwYW4odGV4dCwgc3R5bGVBdHRyaWJ1dGVzID0ge30pIHsKICBjb25zdCBlbGVtZW50ID0gZG9jdW1lbnQuY3JlYXRlRWxlbWVudCgnc3BhbicpOwogIGVsZW1lbnQudGV4dENvbnRlbnQgPSB0ZXh0OwogIGZvciAoY29uc3Qga2V5IG9mIE9iamVjdC5rZXlzKHN0eWxlQXR0
cmlidXRlcykpIHsKICAgIGVsZW1lbnQuc3R5bGVba2V5XSA9IHN0eWxlQXR0cmlidXRlc1trZXldOwogIH0KICByZXR1cm4gZWxlbWVudDsKfQoKLy8gTWF4IG51bWJlciBvZiBieXRlcyB3aGljaCB3aWxsIGJlIHVwbG9hZGVkIGF0IGEgdGltZS4KY29uc3QgTUFYX1BBWUxPQURfU0laRSA9IDEwMCAqIDEwMjQ7Ci8vIE1heCBhbW91bnQgb2YgdGltZSB0byBibG9jayB3YWl0aW5nIGZvciB0aGUgdXNlci4KY29uc3QgRklMRV9DSEFOR0VfVElNRU9VVF9NUyA9IDMwICogMTAwMDsKCmZ1bmN0aW9uIF91cGxvYWRGaWxlcyhpbnB1dElkLCBvdXRwdXRJZCkgewogIGNvbnN0IHN0ZXBzID0gdXBsb2FkRmlsZXNTdGVwKGlucHV0SWQsIG91dHB1dElkKTsKICBjb25zdCBvdXRwdXRFbGVtZW50ID0gZG9jdW1lbnQuZ2V0RWxlbWVudEJ5SWQob3V0cHV0SWQpOwogIC8vIENhY2hlIHN0ZXBzIG9uIHRoZSBvdXRwdXRFbGVtZW50IHRvIG1ha2UgaXQgYXZhaWxhYmxlIGZvciB0aGUgbmV4dCBjYWxsCiAgLy8gdG8gdXBsb2FkRmlsZXNDb250aW51ZSBmcm9tIFB5dGhvbi4KICBvdXRwdXRFbGVtZW50LnN0ZXBzID0gc3RlcHM7CgogIHJldHVybiBfdXBsb2FkRmlsZXNDb250aW51ZShvdXRwdXRJZCk7Cn0KCi8vIFRoaXMgaXMgcm91Z2hseSBhbiBhc3luYyBnZW5lcmF0b3IgKG5vdCBzdXBwb3J0ZWQgaW4gdGhlIGJyb3dzZXIgeWV0KSwKLy8gd2hlcmUgdGhlcmUgYXJlIG11bHRpcGxlIGFzeW5jaHJvbm91cyBzdGVwcyBhbmQgdGhlIFB5dGhvbiBzaWRlIGlzIGdvaW5nCi8vIHRvIHBvbGwgZm9yIGNvbXBsZXRpb24gb2YgZWFjaCBzdGVwLgovLyBUaGlzIHVzZXMgYSBQcm9taXNlIHRvIGJsb2NrIHRoZSBweXRob24gc2lkZSBvbiBjb21wbGV0aW9uIG9mIGVhY2ggc3RlcCwKLy8gdGhlbiBwYXNzZXMgdGhlIHJlc3VsdCBvZiB0aGUgcHJldmlvdXMgc3RlcCBhcyB0aGUgaW5wdXQgdG8gdGhlIG5leHQgc3RlcC4KZnVuY3Rpb24gX3VwbG9hZEZpbGVzQ29udGludWUob3V0cHV0SWQpIHsKICBjb25zdCBvdXRwdXRFbGVtZW50ID0gZG9jdW1lbnQuZ2V0RWxlbWVudEJ5SWQob3V0cHV0SWQpOwogIGNvbnN0IHN0ZXBzID0gb3V0cHV0RWxlbWVudC5zdGVwczsKCiAgY29uc3QgbmV4dCA9IHN0ZXBzLm5leHQob3V0cHV0RWxlbWVudC5sYXN0UHJvbWlzZVZhbHVlKTsKICByZXR1cm4gUHJvbWlzZS5yZXNvbHZlKG5leHQudmFsdWUucHJvbWlzZSkudGhlbigodmFsdWUpID0+IHsKICAgIC8vIENhY2hlIHRoZSBsYXN0IHByb21pc2UgdmFsdWUgdG8gbWFrZSBpdCBhdmFpbGFibGUgdG8gdGhlIG5leHQKICAgIC8vIHN0ZXAgb2YgdGhlIGdlbmVyYXRvci4KICAgIG91dHB1dEVsZW1lbnQubGFzdFByb21pc2VWYWx1ZSA9IHZhbHVlOwogICAgcmV0dXJuIG5leHQudmFsdWUucmVzcG9uc2U7CiAgfSk7Cn0KCi8qKgogKiBHZW5lcmF0b3IgZnVuY3Rpb24gd2hpY2ggaXMgY2FsbGVkIGJldHdlZW4gZWFjaCBhc3luYyBzdGVwIG9mIHRoZSB1cGxvYWQKICogcHJvY2Vz
cy4KICogQHBhcmFtIHtzdHJpbmd9IGlucHV0SWQgRWxlbWVudCBJRCBvZiB0aGUgaW5wdXQgZmlsZSBwaWNrZXIgZWxlbWVudC4KICogQHBhcmFtIHtzdHJpbmd9IG91dHB1dElkIEVsZW1lbnQgSUQgb2YgdGhlIG91dHB1dCBkaXNwbGF5LgogKiBAcmV0dXJuIHshSXRlcmFibGU8IU9iamVjdD59IEl0ZXJhYmxlIG9mIG5leHQgc3RlcHMuCiAqLwpmdW5jdGlvbiogdXBsb2FkRmlsZXNTdGVwKGlucHV0SWQsIG91dHB1dElkKSB7CiAgY29uc3QgaW5wdXRFbGVtZW50ID0gZG9jdW1lbnQuZ2V0RWxlbWVudEJ5SWQoaW5wdXRJZCk7CiAgaW5wdXRFbGVtZW50LmRpc2FibGVkID0gZmFsc2U7CgogIGNvbnN0IG91dHB1dEVsZW1lbnQgPSBkb2N1bWVudC5nZXRFbGVtZW50QnlJZChvdXRwdXRJZCk7CiAgb3V0cHV0RWxlbWVudC5pbm5lckhUTUwgPSAnJzsKCiAgY29uc3QgcGlja2VkUHJvbWlzZSA9IG5ldyBQcm9taXNlKChyZXNvbHZlKSA9PiB7CiAgICBpbnB1dEVsZW1lbnQuYWRkRXZlbnRMaXN0ZW5lcignY2hhbmdlJywgKGUpID0+IHsKICAgICAgcmVzb2x2ZShlLnRhcmdldC5maWxlcyk7CiAgICB9KTsKICB9KTsKCiAgY29uc3QgY2FuY2VsID0gZG9jdW1lbnQuY3JlYXRlRWxlbWVudCgnYnV0dG9uJyk7CiAgaW5wdXRFbGVtZW50LnBhcmVudEVsZW1lbnQuYXBwZW5kQ2hpbGQoY2FuY2VsKTsKICBjYW5jZWwudGV4dENvbnRlbnQgPSAnQ2FuY2VsIHVwbG9hZCc7CiAgY29uc3QgY2FuY2VsUHJvbWlzZSA9IG5ldyBQcm9taXNlKChyZXNvbHZlKSA9PiB7CiAgICBjYW5jZWwub25jbGljayA9ICgpID0+IHsKICAgICAgcmVzb2x2ZShudWxsKTsKICAgIH07CiAgfSk7CgogIC8vIENhbmNlbCB1cGxvYWQgaWYgdXNlciBoYXNuJ3QgcGlja2VkIGFueXRoaW5nIGluIHRpbWVvdXQuCiAgY29uc3QgdGltZW91dFByb21pc2UgPSBuZXcgUHJvbWlzZSgocmVzb2x2ZSkgPT4gewogICAgc2V0VGltZW91dCgoKSA9PiB7CiAgICAgIHJlc29sdmUobnVsbCk7CiAgICB9LCBGSUxFX0NIQU5HRV9USU1FT1VUX01TKTsKICB9KTsKCiAgLy8gV2FpdCBmb3IgdGhlIHVzZXIgdG8gcGljayB0aGUgZmlsZXMuCiAgY29uc3QgZmlsZXMgPSB5aWVsZCB7CiAgICBwcm9taXNlOiBQcm9taXNlLnJhY2UoW3BpY2tlZFByb21pc2UsIHRpbWVvdXRQcm9taXNlLCBjYW5jZWxQcm9taXNlXSksCiAgICByZXNwb25zZTogewogICAgICBhY3Rpb246ICdzdGFydGluZycsCiAgICB9CiAgfTsKCiAgaWYgKCFmaWxlcykgewogICAgcmV0dXJuIHsKICAgICAgcmVzcG9uc2U6IHsKICAgICAgICBhY3Rpb246ICdjb21wbGV0ZScsCiAgICAgIH0KICAgIH07CiAgfQoKICBjYW5jZWwucmVtb3ZlKCk7CgogIC8vIERpc2FibGUgdGhlIGlucHV0IGVsZW1lbnQgc2luY2UgZnVydGhlciBwaWNrcyBhcmUgbm90IGFsbG93ZWQuCiAgaW5wdXRFbGVtZW50LmRpc2FibGVkID0gdHJ1ZTsKCiAgZm9yIChjb25zdCBmaWxlIG9mIGZpbGVzKSB7CiAgICBjb25zdCBsaSA9IGRvY3VtZW50LmNyZWF0
ZUVsZW1lbnQoJ2xpJyk7CiAgICBsaS5hcHBlbmQoc3BhbihmaWxlLm5hbWUsIHtmb250V2VpZ2h0OiAnYm9sZCd9KSk7CiAgICBsaS5hcHBlbmQoc3BhbigKICAgICAgICBgKCR7ZmlsZS50eXBlIHx8ICduL2EnfSkgLSAke2ZpbGUuc2l6ZX0gYnl0ZXMsIGAgKwogICAgICAgIGBsYXN0IG1vZGlmaWVkOiAkewogICAgICAgICAgICBmaWxlLmxhc3RNb2RpZmllZERhdGUgPyBmaWxlLmxhc3RNb2RpZmllZERhdGUudG9Mb2NhbGVEYXRlU3RyaW5nKCkgOgogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAnbi9hJ30gLSBgKSk7CiAgICBjb25zdCBwZXJjZW50ID0gc3BhbignMCUgZG9uZScpOwogICAgbGkuYXBwZW5kQ2hpbGQocGVyY2VudCk7CgogICAgb3V0cHV0RWxlbWVudC5hcHBlbmRDaGlsZChsaSk7CgogICAgY29uc3QgZmlsZURhdGFQcm9taXNlID0gbmV3IFByb21pc2UoKHJlc29sdmUpID0+IHsKICAgICAgY29uc3QgcmVhZGVyID0gbmV3IEZpbGVSZWFkZXIoKTsKICAgICAgcmVhZGVyLm9ubG9hZCA9IChlKSA9PiB7CiAgICAgICAgcmVzb2x2ZShlLnRhcmdldC5yZXN1bHQpOwogICAgICB9OwogICAgICByZWFkZXIucmVhZEFzQXJyYXlCdWZmZXIoZmlsZSk7CiAgICB9KTsKICAgIC8vIFdhaXQgZm9yIHRoZSBkYXRhIHRvIGJlIHJlYWR5LgogICAgbGV0IGZpbGVEYXRhID0geWllbGQgewogICAgICBwcm9taXNlOiBmaWxlRGF0YVByb21pc2UsCiAgICAgIHJlc3BvbnNlOiB7CiAgICAgICAgYWN0aW9uOiAnY29udGludWUnLAogICAgICB9CiAgICB9OwoKICAgIC8vIFVzZSBhIGNodW5rZWQgc2VuZGluZyB0byBhdm9pZCBtZXNzYWdlIHNpemUgbGltaXRzLiBTZWUgYi82MjExNTY2MC4KICAgIGxldCBwb3NpdGlvbiA9IDA7CiAgICB3aGlsZSAocG9zaXRpb24gPCBmaWxlRGF0YS5ieXRlTGVuZ3RoKSB7CiAgICAgIGNvbnN0IGxlbmd0aCA9IE1hdGgubWluKGZpbGVEYXRhLmJ5dGVMZW5ndGggLSBwb3NpdGlvbiwgTUFYX1BBWUxPQURfU0laRSk7CiAgICAgIGNvbnN0IGNodW5rID0gbmV3IFVpbnQ4QXJyYXkoZmlsZURhdGEsIHBvc2l0aW9uLCBsZW5ndGgpOwogICAgICBwb3NpdGlvbiArPSBsZW5ndGg7CgogICAgICBjb25zdCBiYXNlNjQgPSBidG9hKFN0cmluZy5mcm9tQ2hhckNvZGUuYXBwbHkobnVsbCwgY2h1bmspKTsKICAgICAgeWllbGQgewogICAgICAgIHJlc3BvbnNlOiB7CiAgICAgICAgICBhY3Rpb246ICdhcHBlbmQnLAogICAgICAgICAgZmlsZTogZmlsZS5uYW1lLAogICAgICAgICAgZGF0YTogYmFzZTY0LAogICAgICAgIH0sCiAgICAgIH07CiAgICAgIHBlcmNlbnQudGV4dENvbnRlbnQgPQogICAgICAgICAgYCR7TWF0aC5yb3VuZCgocG9zaXRpb24gLyBmaWxlRGF0YS5ieXRlTGVuZ3RoKSAqIDEwMCl9JSBkb25lYDsKICAgIH0KICB9CgogIC8vIEFsbCBkb25lLgogIHlpZWxkIHsKICAgIHJlc3BvbnNlOiB7CiAgICAgIGFjdGlvbjogJ2NvbXBsZXRlJywKICAgIH0KICB9Owp9CgpzY29wZS5nb29n
bGUgPSBzY29wZS5nb29nbGUgfHwge307CnNjb3BlLmdvb2dsZS5jb2xhYiA9IHNjb3BlLmdvb2dsZS5jb2xhYiB8fCB7fTsKc2NvcGUuZ29vZ2xlLmNvbGFiLl9maWxlcyA9IHsKICBfdXBsb2FkRmlsZXMsCiAgX3VwbG9hZEZpbGVzQ29udGludWUsCn07Cn0pKHNlbGYpOwo=", - "ok": true, - "headers": [ - [ - "content-type", - "application/javascript" - ] - ], - "status": 200, - "status_text": "" - } - }, - "base_uri": "https://localhost:8080/", - "height": 602 - }, - "outputId": "4d3628f9-e5be-4145-f550-8eaffca97d37" - }, - "cell_type": "code", - "source": [ - "#@title Install AutoML Tables client library { vertical-output: true }\n", - "\n", - "!pip install google-cloud-automl" - ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "colab_type": "text", - "id": "eVFsPPEociwF" - }, - "cell_type": "markdown", - "source": [ - "### Authenticate using service account key\n", - "Run the following cell. Click on the 'Choose Files' button and select the service account private key file. If your Service Account key file or folder is hidden, you can reveal it in a Mac by pressing the Command + Shift + . combo." 
- ] - }, - { - "metadata": { - "id": "u-kCqysAuaJk", - "colab_type": "code", - "colab": { - "resources": { - "http://localhost:8080/nbextensions/google.colab/files.js": { - "data": "Ly8gQ29weXJpZ2h0IDIwMTcgR29vZ2xlIExMQwovLwovLyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKLy8geW91IG1heSBub3QgdXNlIHRoaXMgZmlsZSBleGNlcHQgaW4gY29tcGxpYW5jZSB3aXRoIHRoZSBMaWNlbnNlLgovLyBZb3UgbWF5IG9idGFpbiBhIGNvcHkgb2YgdGhlIExpY2Vuc2UgYXQKLy8KLy8gICAgICBodHRwOi8vd3d3LmFwYWNoZS5vcmcvbGljZW5zZXMvTElDRU5TRS0yLjAKLy8KLy8gVW5sZXNzIHJlcXVpcmVkIGJ5IGFwcGxpY2FibGUgbGF3IG9yIGFncmVlZCB0byBpbiB3cml0aW5nLCBzb2Z0d2FyZQovLyBkaXN0cmlidXRlZCB1bmRlciB0aGUgTGljZW5zZSBpcyBkaXN0cmlidXRlZCBvbiBhbiAiQVMgSVMiIEJBU0lTLAovLyBXSVRIT1VUIFdBUlJBTlRJRVMgT1IgQ09ORElUSU9OUyBPRiBBTlkgS0lORCwgZWl0aGVyIGV4cHJlc3Mgb3IgaW1wbGllZC4KLy8gU2VlIHRoZSBMaWNlbnNlIGZvciB0aGUgc3BlY2lmaWMgbGFuZ3VhZ2UgZ292ZXJuaW5nIHBlcm1pc3Npb25zIGFuZAovLyBsaW1pdGF0aW9ucyB1bmRlciB0aGUgTGljZW5zZS4KCi8qKgogKiBAZmlsZW92ZXJ2aWV3IEhlbHBlcnMgZm9yIGdvb2dsZS5jb2xhYiBQeXRob24gbW9kdWxlLgogKi8KKGZ1bmN0aW9uKHNjb3BlKSB7CmZ1bmN0aW9uIHNwYW4odGV4dCwgc3R5bGVBdHRyaWJ1dGVzID0ge30pIHsKICBjb25zdCBlbGVtZW50ID0gZG9jdW1lbnQuY3JlYXRlRWxlbWVudCgnc3BhbicpOwogIGVsZW1lbnQudGV4dENvbnRlbnQgPSB0ZXh0OwogIGZvciAoY29uc3Qga2V5IG9mIE9iamVjdC5rZXlzKHN0eWxlQXR0cmlidXRlcykpIHsKICAgIGVsZW1lbnQuc3R5bGVba2V5XSA9IHN0eWxlQXR0cmlidXRlc1trZXldOwogIH0KICByZXR1cm4gZWxlbWVudDsKfQoKLy8gTWF4IG51bWJlciBvZiBieXRlcyB3aGljaCB3aWxsIGJlIHVwbG9hZGVkIGF0IGEgdGltZS4KY29uc3QgTUFYX1BBWUxPQURfU0laRSA9IDEwMCAqIDEwMjQ7Ci8vIE1heCBhbW91bnQgb2YgdGltZSB0byBibG9jayB3YWl0aW5nIGZvciB0aGUgdXNlci4KY29uc3QgRklMRV9DSEFOR0VfVElNRU9VVF9NUyA9IDMwICogMTAwMDsKCmZ1bmN0aW9uIF91cGxvYWRGaWxlcyhpbnB1dElkLCBvdXRwdXRJZCkgewogIGNvbnN0IHN0ZXBzID0gdXBsb2FkRmlsZXNTdGVwKGlucHV0SWQsIG91dHB1dElkKTsKICBjb25zdCBvdXRwdXRFbGVtZW50ID0gZG9jdW1lbnQuZ2V0RWxlbWVudEJ5SWQob3V0cHV0SWQpOwogIC8vIENhY2hlIHN0ZXBzIG9uIHRoZSBvdXRwdXRFbGVtZW50IHRvIG1ha2UgaXQgYXZhaWxhYmxlIGZvciB0aGUgbmV4dCBjYWxsCiAgLy8gdG8gdXBsb2FkRmlsZXNDb2
50aW51ZSBmcm9tIFB5dGhvbi4KICBvdXRwdXRFbGVtZW50LnN0ZXBzID0gc3RlcHM7CgogIHJldHVybiBfdXBsb2FkRmlsZXNDb250aW51ZShvdXRwdXRJZCk7Cn0KCi8vIFRoaXMgaXMgcm91Z2hseSBhbiBhc3luYyBnZW5lcmF0b3IgKG5vdCBzdXBwb3J0ZWQgaW4gdGhlIGJyb3dzZXIgeWV0KSwKLy8gd2hlcmUgdGhlcmUgYXJlIG11bHRpcGxlIGFzeW5jaHJvbm91cyBzdGVwcyBhbmQgdGhlIFB5dGhvbiBzaWRlIGlzIGdvaW5nCi8vIHRvIHBvbGwgZm9yIGNvbXBsZXRpb24gb2YgZWFjaCBzdGVwLgovLyBUaGlzIHVzZXMgYSBQcm9taXNlIHRvIGJsb2NrIHRoZSBweXRob24gc2lkZSBvbiBjb21wbGV0aW9uIG9mIGVhY2ggc3RlcCwKLy8gdGhlbiBwYXNzZXMgdGhlIHJlc3VsdCBvZiB0aGUgcHJldmlvdXMgc3RlcCBhcyB0aGUgaW5wdXQgdG8gdGhlIG5leHQgc3RlcC4KZnVuY3Rpb24gX3VwbG9hZEZpbGVzQ29udGludWUob3V0cHV0SWQpIHsKICBjb25zdCBvdXRwdXRFbGVtZW50ID0gZG9jdW1lbnQuZ2V0RWxlbWVudEJ5SWQob3V0cHV0SWQpOwogIGNvbnN0IHN0ZXBzID0gb3V0cHV0RWxlbWVudC5zdGVwczsKCiAgY29uc3QgbmV4dCA9IHN0ZXBzLm5leHQob3V0cHV0RWxlbWVudC5sYXN0UHJvbWlzZVZhbHVlKTsKICByZXR1cm4gUHJvbWlzZS5yZXNvbHZlKG5leHQudmFsdWUucHJvbWlzZSkudGhlbigodmFsdWUpID0+IHsKICAgIC8vIENhY2hlIHRoZSBsYXN0IHByb21pc2UgdmFsdWUgdG8gbWFrZSBpdCBhdmFpbGFibGUgdG8gdGhlIG5leHQKICAgIC8vIHN0ZXAgb2YgdGhlIGdlbmVyYXRvci4KICAgIG91dHB1dEVsZW1lbnQubGFzdFByb21pc2VWYWx1ZSA9IHZhbHVlOwogICAgcmV0dXJuIG5leHQudmFsdWUucmVzcG9uc2U7CiAgfSk7Cn0KCi8qKgogKiBHZW5lcmF0b3IgZnVuY3Rpb24gd2hpY2ggaXMgY2FsbGVkIGJldHdlZW4gZWFjaCBhc3luYyBzdGVwIG9mIHRoZSB1cGxvYWQKICogcHJvY2Vzcy4KICogQHBhcmFtIHtzdHJpbmd9IGlucHV0SWQgRWxlbWVudCBJRCBvZiB0aGUgaW5wdXQgZmlsZSBwaWNrZXIgZWxlbWVudC4KICogQHBhcmFtIHtzdHJpbmd9IG91dHB1dElkIEVsZW1lbnQgSUQgb2YgdGhlIG91dHB1dCBkaXNwbGF5LgogKiBAcmV0dXJuIHshSXRlcmFibGU8IU9iamVjdD59IEl0ZXJhYmxlIG9mIG5leHQgc3RlcHMuCiAqLwpmdW5jdGlvbiogdXBsb2FkRmlsZXNTdGVwKGlucHV0SWQsIG91dHB1dElkKSB7CiAgY29uc3QgaW5wdXRFbGVtZW50ID0gZG9jdW1lbnQuZ2V0RWxlbWVudEJ5SWQoaW5wdXRJZCk7CiAgaW5wdXRFbGVtZW50LmRpc2FibGVkID0gZmFsc2U7CgogIGNvbnN0IG91dHB1dEVsZW1lbnQgPSBkb2N1bWVudC5nZXRFbGVtZW50QnlJZChvdXRwdXRJZCk7CiAgb3V0cHV0RWxlbWVudC5pbm5lckhUTUwgPSAnJzsKCiAgY29uc3QgcGlja2VkUHJvbWlzZSA9IG5ldyBQcm9taXNlKChyZXNvbHZlKSA9PiB7CiAgICBpbnB1dEVsZW1lbnQuYWRkRXZlbnRMaXN0ZW5lcignY2hhbmdlJywgKGUpID
0+IHsKICAgICAgcmVzb2x2ZShlLnRhcmdldC5maWxlcyk7CiAgICB9KTsKICB9KTsKCiAgY29uc3QgY2FuY2VsID0gZG9jdW1lbnQuY3JlYXRlRWxlbWVudCgnYnV0dG9uJyk7CiAgaW5wdXRFbGVtZW50LnBhcmVudEVsZW1lbnQuYXBwZW5kQ2hpbGQoY2FuY2VsKTsKICBjYW5jZWwudGV4dENvbnRlbnQgPSAnQ2FuY2VsIHVwbG9hZCc7CiAgY29uc3QgY2FuY2VsUHJvbWlzZSA9IG5ldyBQcm9taXNlKChyZXNvbHZlKSA9PiB7CiAgICBjYW5jZWwub25jbGljayA9ICgpID0+IHsKICAgICAgcmVzb2x2ZShudWxsKTsKICAgIH07CiAgfSk7CgogIC8vIENhbmNlbCB1cGxvYWQgaWYgdXNlciBoYXNuJ3QgcGlja2VkIGFueXRoaW5nIGluIHRpbWVvdXQuCiAgY29uc3QgdGltZW91dFByb21pc2UgPSBuZXcgUHJvbWlzZSgocmVzb2x2ZSkgPT4gewogICAgc2V0VGltZW91dCgoKSA9PiB7CiAgICAgIHJlc29sdmUobnVsbCk7CiAgICB9LCBGSUxFX0NIQU5HRV9USU1FT1VUX01TKTsKICB9KTsKCiAgLy8gV2FpdCBmb3IgdGhlIHVzZXIgdG8gcGljayB0aGUgZmlsZXMuCiAgY29uc3QgZmlsZXMgPSB5aWVsZCB7CiAgICBwcm9taXNlOiBQcm9taXNlLnJhY2UoW3BpY2tlZFByb21pc2UsIHRpbWVvdXRQcm9taXNlLCBjYW5jZWxQcm9taXNlXSksCiAgICByZXNwb25zZTogewogICAgICBhY3Rpb246ICdzdGFydGluZycsCiAgICB9CiAgfTsKCiAgaWYgKCFmaWxlcykgewogICAgcmV0dXJuIHsKICAgICAgcmVzcG9uc2U6IHsKICAgICAgICBhY3Rpb246ICdjb21wbGV0ZScsCiAgICAgIH0KICAgIH07CiAgfQoKICBjYW5jZWwucmVtb3ZlKCk7CgogIC8vIERpc2FibGUgdGhlIGlucHV0IGVsZW1lbnQgc2luY2UgZnVydGhlciBwaWNrcyBhcmUgbm90IGFsbG93ZWQuCiAgaW5wdXRFbGVtZW50LmRpc2FibGVkID0gdHJ1ZTsKCiAgZm9yIChjb25zdCBmaWxlIG9mIGZpbGVzKSB7CiAgICBjb25zdCBsaSA9IGRvY3VtZW50LmNyZWF0ZUVsZW1lbnQoJ2xpJyk7CiAgICBsaS5hcHBlbmQoc3BhbihmaWxlLm5hbWUsIHtmb250V2VpZ2h0OiAnYm9sZCd9KSk7CiAgICBsaS5hcHBlbmQoc3BhbigKICAgICAgICBgKCR7ZmlsZS50eXBlIHx8ICduL2EnfSkgLSAke2ZpbGUuc2l6ZX0gYnl0ZXMsIGAgKwogICAgICAgIGBsYXN0IG1vZGlmaWVkOiAkewogICAgICAgICAgICBmaWxlLmxhc3RNb2RpZmllZERhdGUgPyBmaWxlLmxhc3RNb2RpZmllZERhdGUudG9Mb2NhbGVEYXRlU3RyaW5nKCkgOgogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAnbi9hJ30gLSBgKSk7CiAgICBjb25zdCBwZXJjZW50ID0gc3BhbignMCUgZG9uZScpOwogICAgbGkuYXBwZW5kQ2hpbGQocGVyY2VudCk7CgogICAgb3V0cHV0RWxlbWVudC5hcHBlbmRDaGlsZChsaSk7CgogICAgY29uc3QgZmlsZURhdGFQcm9taXNlID0gbmV3IFByb21pc2UoKHJlc29sdmUpID0+IHsKICAgICAgY29uc3QgcmVhZGVyID0gbmV3IEZpbGVSZWFkZXIoKTsKICAgICAgcmVhZGVyLm9ubG9hZCA9IC
hlKSA9PiB7CiAgICAgICAgcmVzb2x2ZShlLnRhcmdldC5yZXN1bHQpOwogICAgICB9OwogICAgICByZWFkZXIucmVhZEFzQXJyYXlCdWZmZXIoZmlsZSk7CiAgICB9KTsKICAgIC8vIFdhaXQgZm9yIHRoZSBkYXRhIHRvIGJlIHJlYWR5LgogICAgbGV0IGZpbGVEYXRhID0geWllbGQgewogICAgICBwcm9taXNlOiBmaWxlRGF0YVByb21pc2UsCiAgICAgIHJlc3BvbnNlOiB7CiAgICAgICAgYWN0aW9uOiAnY29udGludWUnLAogICAgICB9CiAgICB9OwoKICAgIC8vIFVzZSBhIGNodW5rZWQgc2VuZGluZyB0byBhdm9pZCBtZXNzYWdlIHNpemUgbGltaXRzLiBTZWUgYi82MjExNTY2MC4KICAgIGxldCBwb3NpdGlvbiA9IDA7CiAgICB3aGlsZSAocG9zaXRpb24gPCBmaWxlRGF0YS5ieXRlTGVuZ3RoKSB7CiAgICAgIGNvbnN0IGxlbmd0aCA9IE1hdGgubWluKGZpbGVEYXRhLmJ5dGVMZW5ndGggLSBwb3NpdGlvbiwgTUFYX1BBWUxPQURfU0laRSk7CiAgICAgIGNvbnN0IGNodW5rID0gbmV3IFVpbnQ4QXJyYXkoZmlsZURhdGEsIHBvc2l0aW9uLCBsZW5ndGgpOwogICAgICBwb3NpdGlvbiArPSBsZW5ndGg7CgogICAgICBjb25zdCBiYXNlNjQgPSBidG9hKFN0cmluZy5mcm9tQ2hhckNvZGUuYXBwbHkobnVsbCwgY2h1bmspKTsKICAgICAgeWllbGQgewogICAgICAgIHJlc3BvbnNlOiB7CiAgICAgICAgICBhY3Rpb246ICdhcHBlbmQnLAogICAgICAgICAgZmlsZTogZmlsZS5uYW1lLAogICAgICAgICAgZGF0YTogYmFzZTY0LAogICAgICAgIH0sCiAgICAgIH07CiAgICAgIHBlcmNlbnQudGV4dENvbnRlbnQgPQogICAgICAgICAgYCR7TWF0aC5yb3VuZCgocG9zaXRpb24gLyBmaWxlRGF0YS5ieXRlTGVuZ3RoKSAqIDEwMCl9JSBkb25lYDsKICAgIH0KICB9CgogIC8vIEFsbCBkb25lLgogIHlpZWxkIHsKICAgIHJlc3BvbnNlOiB7CiAgICAgIGFjdGlvbjogJ2NvbXBsZXRlJywKICAgIH0KICB9Owp9CgpzY29wZS5nb29nbGUgPSBzY29wZS5nb29nbGUgfHwge307CnNjb3BlLmdvb2dsZS5jb2xhYiA9IHNjb3BlLmdvb2dsZS5jb2xhYiB8fCB7fTsKc2NvcGUuZ29vZ2xlLmNvbGFiLl9maWxlcyA9IHsKICBfdXBsb2FkRmlsZXMsCiAgX3VwbG9hZEZpbGVzQ29udGludWUsCn07Cn0pKHNlbGYpOwo=", - "ok": true, - "headers": [ - [ - "content-type", - "application/javascript" - ] - ], - "status": 200, - "status_text": "" - } - }, - "base_uri": "https://localhost:8080/", - "height": 71 - }, - "outputId": "06154a63-f410-435f-b565-cd1599243b88" - }, - "cell_type": "code", - "source": [ - "#@title Authenticate using service account key and create a client. 
{ vertical-output: true }\n", - "\n", - "from google.colab import files\n", - "from google.cloud import automl_v1beta1\n", - "\n", - "# Upload service account key\n", - "keyfile_upload = files.upload()\n", - "keyfile_name = list(keyfile_upload.keys())[0]\n", - "# Authenticate and create an AutoML client.\n", - "client = automl_v1beta1.AutoMlClient.from_service_account_file(keyfile_name)\n", - "# Authenticate and create a prediction service client.\n", - "prediction_client = automl_v1beta1.PredictionServiceClient.from_service_account_file(keyfile_name)" - ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "colab_type": "text", - "id": "s3F2xbEJdDvN" - }, - "cell_type": "markdown", - "source": [ - "### Set Project and Location" - ] - }, - { - "metadata": { - "id": "0uX4aJYUiXh5", - "colab_type": "text" - }, - "cell_type": "markdown", - "source": [ - "Enter your GCP project ID." - ] - }, - { - "metadata": { - "colab_type": "code", - "id": "6R4h5HF1Dtds", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 34 - }, - "outputId": "1e049b34-4683-4755-ab08-aec08de2bc66" - }, - "cell_type": "code", - "source": [ - "#@title GCP project ID and location\n", - "\n", - "project_id = 'energy-forecasting' #@param {type:'string'}\n", - "location = 'us-central1' #@param {type:'string'}\n", - "location_path = client.location_path(project_id, location)\n", - "location_path" - ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "colab_type": "text", - "id": "qozQWMnOu48y" - }, - "cell_type": "markdown", - "source": [ - "\n", - "\n", - "---\n", - "\n" - ] - }, - { - "metadata": { - "colab_type": "text", - "id": "ODt86YuVDZzm" - }, - "cell_type": "markdown", - "source": [ - "## 3. 
Import training data" - ] - }, - { - "metadata": { - "colab_type": "text", - "id": "XwjZc9Q62Fm5" - }, - "cell_type": "markdown", - "source": [ - "### Create dataset" - ] - }, - { - "metadata": { - "colab_type": "text", - "id": "_JfZFGSceyE_" - }, - "cell_type": "markdown", - "source": [ - "Select a dataset display name and pass your table source information to create a new dataset." - ] - }, - { - "metadata": { - "id": "Z_JErW3cw-0J", - "colab_type": "code", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 224 - }, - "outputId": "7fe366df-73ae-4ab1-ceaa-fd6ced4ccdd9" - }, - "cell_type": "code", - "source": [ - "#@title Create dataset { vertical-output: true, output-height: 200 }\n", - "\n", - "dataset_display_name = 'energy_forecasting_solution' #@param {type: 'string'}\n", - "\n", - "create_dataset_response = client.create_dataset(\n", - " location_path,\n", - " {'display_name': dataset_display_name, 'tables_dataset_metadata': {}})\n", - "dataset_name = create_dataset_response.name\n", - "create_dataset_response" - ], - "execution_count":0, - "outputs": [] - }, - { - "metadata": { - "colab_type": "text", - "id": "35YZ9dy34VqJ" - }, - "cell_type": "markdown", - "source": [ - "### Import data" - ] - }, - { - "metadata": { - "colab_type": "text", - "id": "3c0o15gVREAw" - }, - "cell_type": "markdown", - "source": [ - "You can import your data to AutoML Tables from GCS or BigQuery. You can create a GCS bucket and upload the data into your bucket. The URI for your file is `gs://BUCKET_NAME/FOLDER_NAME1/FOLDER_NAME2/.../FILE_NAME`. Alternatively you can create a BigQuery table and upload the data into the table. The URI for your table is `bq://PROJECT_ID.DATASET_ID.TABLE_ID`.\n", - "\n", - "Importing data may take a few minutes or hours depending on the size of your data. If your Colab times out, run the following command to retrieve your dataset. 
Replace `dataset_name` with its actual value obtained in the preceding cells.\n", - "\n", - " dataset = client.get_dataset(dataset_name)" - ] - }, - { - "metadata": { - "id": "bB_GdeqCJW5i", - "colab_type": "code", - "colab": {} - }, - "cell_type": "code", - "source": [ - "#@title Datasource in BigQuery { vertical-output: true }\n", - "\n", - "dataset_bq_input_uri = 'bq://energy-forecasting.Energy.automldata' #@param {type: 'string'}\n", - "# Define input configuration.\n", - "input_config = {\n", - " 'bigquery_source': {\n", - " 'input_uri': dataset_bq_input_uri\n", - " }\n", - "}" - ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "id": "FNVYfpoXJsNB", - "colab_type": "code", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 139 - }, - "outputId": "0ecc8d11-5bf1-4c2e-f688-b6d9be934e3c" - }, - "cell_type": "code", - "source": [ - " #@title Import data { vertical-output: true }\n", - "\n", - "import_data_response = client.import_data(dataset_name, input_config)\n", - "print('Dataset import operation: {}'.format(import_data_response.operation))\n", - "# Wait until import is done.\n", - "import_data_result = import_data_response.result()\n", - "import_data_result" - ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "id": "QdxBI4s44ZRI", - "colab_type": "text" - }, - "cell_type": "markdown", - "source": [ - "### Review the specs" - ] - }, - { - "metadata": { - "id": "RC0PWKqH4jwr", - "colab_type": "text" - }, - "cell_type": "markdown", - "source": [ - "Run the following command to see table specs such as row count." 
- ] - }, - { - "metadata": { - "id": "v2Vzq_gwXxo-", - "colab_type": "code", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 3247 - }, - "outputId": "c89cd7b1-4344-46d9-c4a3-1b012b5b720d" - }, - "cell_type": "code", - "source": [ - "#@title Table schema { vertical-output: true }\n", - "\n", - "import google.cloud.automl_v1beta1.proto.data_types_pb2 as data_types\n", - "\n", - "# List table specs\n", - "list_table_specs_response = client.list_table_specs(dataset_name)\n", - "table_specs = [s for s in list_table_specs_response]\n", - "# List column specs\n", - "table_spec_name = table_specs[0].name\n", - "list_column_specs_response = client.list_column_specs(table_spec_name)\n", - "column_specs = {s.display_name: s for s in list_column_specs_response}\n", - "[(x, data_types.TypeCode.Name(\n", - " column_specs[x].data_type.type_code)) for x in column_specs.keys()]" - ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "id": "vcJP7xoq4yAJ", - "colab_type": "text" - }, - "cell_type": "markdown", - "source": [ - "Run the following command to see column specs such inferred schema." - ] - }, - { - "metadata": { - "id": "FNykW_YOYt6d", - "colab_type": "text" - }, - "cell_type": "markdown", - "source": [ - "___" - ] - }, - { - "metadata": { - "colab_type": "text", - "id": "kNRVJqVOL8h3" - }, - "cell_type": "markdown", - "source": [ - "## 4. Update dataset: assign a label column and enable nullable columns" - ] - }, - { - "metadata": { - "colab_type": "text", - "id": "-57gehId9PQ5" - }, - "cell_type": "markdown", - "source": [ - "AutoML Tables automatically detects your data column type. For example, for the [Iris dataset](https://storage.cloud.google.com/rostam-193618-tutorial/automl-tables-v1beta1/iris.csv) it detects `species` to be categorical and `petal_length`, `petal_width`, `sepal_length`, and `sepal_width` to be numerical. Depending on the type of your label column, AutoML Tables chooses to run a classification or regression model. 
If your label column contains only numerical values, but they represent categories, change your label column type to categorical by updating your schema." - ] - }, - { - "metadata": { - "id": "iRqdQ7Xiq04x", - "colab_type": "text" - }, - "cell_type": "markdown", - "source": [ - "### Update a column: set as categorical" - ] - }, - { - "metadata": { - "id": "OCEUIPKegWrf", - "colab_type": "code", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 34 - }, - "outputId": "44370b2c-f3dc-46bc-cefd-8a6f29f9cabe" - }, - "cell_type": "code", - "source": [ - "#@title Update dataset { vertical-output: true }\n", - "\n", - "column_to_category = 'hour' #@param {type: 'string'}\n", - "\n", - "update_column_spec_dict = {\n", - " \"name\": column_specs[column_to_category].name,\n", - " \"data_type\": {\n", - " \"type_code\": \"CATEGORY\"\n", - " }\n", - "}\n", - "update_column_response = client.update_column_spec(update_column_spec_dict)\n", - "update_column_response.display_name , update_column_response.data_type \n" - ], - "execution_count":0, - "outputs": [] - }, - { - "metadata": { - "colab_type": "text", - "id": "nDMH_chybe4w" - }, - "cell_type": "markdown", - "source": [ - "### Update dataset: assign a label and split column" - ] - }, - { - "metadata": { - "id": "hVIruWg0u33t", - "colab_type": "code", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 360 - }, - "outputId": "eeb5f733-16ec-4191-ea59-c2fab30c8442" - }, - "cell_type": "code", - "source": [ - "#@title Update dataset { vertical-output: true }\n", - "\n", - "label_column_name = 'price' #@param {type: 'string'}\n", - "label_column_spec = column_specs[label_column_name]\n", - "label_column_id = label_column_spec.name.rsplit('/', 1)[-1]\n", - "print('Label column ID: {}'.format(label_column_id))\n", - "\n", - "split_column_name = 'split' #@param {type: 'string'}\n", - "split_column_spec = column_specs[split_column_name]\n", - "split_column_id = split_column_spec.name.rsplit('/', 
1)[-1]\n", - "print('Split column ID: {}'.format(split_column_id))\n", - "# Define the values of the fields to be updated.\n", - "update_dataset_dict = {\n", - " 'name': dataset_name,\n", - " 'tables_dataset_metadata': {\n", - " 'target_column_spec_id': label_column_id,\n", - " 'ml_use_column_spec_id': split_column_id,\n", - " }\n", - "}\n", - "update_dataset_response = client.update_dataset(update_dataset_dict)\n", - "update_dataset_response" - ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "id": "z23NITLrcxmi", - "colab_type": "text" - }, - "cell_type": "markdown", - "source": [ - "___" - ] - }, - { - "metadata": { - "colab_type": "text", - "id": "FcKgvj1-Tbgj" - }, - "cell_type": "markdown", - "source": [ - "## 5. Creating a model" - ] - }, - { - "metadata": { - "colab_type": "text", - "id": "Pnlk8vdQlO_k" - }, - "cell_type": "markdown", - "source": [ - "### Train a model\n", - "Specify the duration of the training. For example, `'train_budget_milli_node_hours': 1000` runs the training for one hour. If your Colab times out, use `client.list_models(location_path)` to check whether your model has been created. Then use model name to continue to the next steps. Run the following command to retrieve your model. 
Replace `model_name` with its actual value.\n", - "\n", - " model = client.get_model(model_name)" - ] - }, - { - "metadata": { - "id": "11izNd6Fu37N", - "colab_type": "code", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 139 - }, - "outputId": "1bca25aa-eb19-4b27-a3fa-7ef137aaf4e2" - }, - "cell_type": "code", - "source": [ - "#@title Create model { vertical-output: true }\n", - "\n", - "\n", - "\n", - "model_display_name = 'energy_model' #@param {type:'string'}\n", - "model_train_hours = 12 #@param {type:'integer'}\n", - "model_optimization_objective = 'MINIMIZE_MAE' #@param {type:'string'}\n", - "column_to_ignore = 'date_utc' #@param {type:'string'}\n", - "\n", - "# Create list of features to use\n", - "feat_list = list(column_specs.keys())\n", - "feat_list.remove(label_column_name)\n", - "feat_list.remove(split_column_name)\n", - "feat_list.remove(column_to_ignore)\n", - "\n", - "model_dict = {\n", - " 'display_name': model_display_name,\n", - " 'dataset_id': dataset_name.rsplit('/', 1)[-1],\n", - " 'tables_model_metadata': {\n", - " 'train_budget_milli_node_hours':model_train_hours * 1000,\n", - " 'optimization_objective': model_optimization_objective,\n", - " 'target_column_spec': column_specs[label_column_name],\n", - " 'input_feature_column_specs': [\n", - " column_specs[x] for x in feat_list]}\n", - " }\n", - " \n", - "create_model_response = client.create_model(location_path, model_dict)\n", - "print('Dataset import operation: {}'.format(create_model_response.operation))\n", - "# Wait until model training is done.\n", - "create_model_result = create_model_response.result()\n", - "model_name = create_model_result.name\n", - "create_model_result" - ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "id": "puVew1GgPfQa", - "colab_type": "code", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 85 - }, - "outputId": "42b9296c-d231-4787-f7fb-4aa1a6ff9bd9" - }, - "cell_type": "code", - "source": [ - "#@title 
Model Metrics {vertical-output: true }\n", - "\n", - "metrics= [x for x in client.list_model_evaluations(model_name)][-1]\n", - "metrics.regression_evaluation_metrics" - ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "id": "YQnfEwyrSt2T", - "colab_type": "text" - }, - "cell_type": "markdown", - "source": [ - "![alt text](https://storage.googleapis.com/images_public/automl_test.png)" - ] - }, - { - "metadata": { - "id": "Vyc8ckbpRMHp", - "colab_type": "code", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 272 - }, - "outputId": "931d4921-2144-4092-dab6-165c1b1c2a88" - }, - "cell_type": "code", - "source": [ - "#@title Feature Importance {vertical-output: true }\n", - "\n", - "model = client.get_model(model_name)\n", - "feat_list = [(x.feature_importance, x.column_display_name) for x in model.tables_model_metadata.tables_model_column_info]\n", - "feat_list.sort(reverse=True)\n", - "feat_list[:15]" - ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "id": "__2gDQ5I5gcj", - "colab_type": "text" - }, - "cell_type": "markdown", - "source": [ - "![alt text](https://storage.googleapis.com/images_public/feature_importance.png)\n", - "![alt text](https://storage.googleapis.com/images_public/loc_portugal.png)\n", - "![alt text](https://storage.googleapis.com/images_public/weather_schema.png)\n", - "![alt text](https://storage.googleapis.com/images_public/training_schema.png)" - ] - }, - { - "metadata": { - "id": "1wS1is9IY5nK", - "colab_type": "text" - }, - "cell_type": "markdown", - "source": [ - "___" - ] - } - ] + "base_uri": "https://localhost:8080/", + "height": 602, + "resources": { + "http://localhost:8080/nbextensions/google.colab/files.js": { + "data": 
"Ly8gQ29weXJpZ2h0IDIwMTcgR29vZ2xlIExMQwovLwovLyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKLy8geW91IG1heSBub3QgdXNlIHRoaXMgZmlsZSBleGNlcHQgaW4gY29tcGxpYW5jZSB3aXRoIHRoZSBMaWNlbnNlLgovLyBZb3UgbWF5IG9idGFpbiBhIGNvcHkgb2YgdGhlIExpY2Vuc2UgYXQKLy8KLy8gICAgICBodHRwOi8vd3d3LmFwYWNoZS5vcmcvbGljZW5zZXMvTElDRU5TRS0yLjAKLy8KLy8gVW5sZXNzIHJlcXVpcmVkIGJ5IGFwcGxpY2FibGUgbGF3IG9yIGFncmVlZCB0byBpbiB3cml0aW5nLCBzb2Z0d2FyZQovLyBkaXN0cmlidXRlZCB1bmRlciB0aGUgTGljZW5zZSBpcyBkaXN0cmlidXRlZCBvbiBhbiAiQVMgSVMiIEJBU0lTLAovLyBXSVRIT1VUIFdBUlJBTlRJRVMgT1IgQ09ORElUSU9OUyBPRiBBTlkgS0lORCwgZWl0aGVyIGV4cHJlc3Mgb3IgaW1wbGllZC4KLy8gU2VlIHRoZSBMaWNlbnNlIGZvciB0aGUgc3BlY2lmaWMgbGFuZ3VhZ2UgZ292ZXJuaW5nIHBlcm1pc3Npb25zIGFuZAovLyBsaW1pdGF0aW9ucyB1bmRlciB0aGUgTGljZW5zZS4KCi8qKgogKiBAZmlsZW92ZXJ2aWV3IEhlbHBlcnMgZm9yIGdvb2dsZS5jb2xhYiBQeXRob24gbW9kdWxlLgogKi8KKGZ1bmN0aW9uKHNjb3BlKSB7CmZ1bmN0aW9uIHNwYW4odGV4dCwgc3R5bGVBdHRyaWJ1dGVzID0ge30pIHsKICBjb25zdCBlbGVtZW50ID0gZG9jdW1lbnQuY3JlYXRlRWxlbWVudCgnc3BhbicpOwogIGVsZW1lbnQudGV4dENvbnRlbnQgPSB0ZXh0OwogIGZvciAoY29uc3Qga2V5IG9mIE9iamVjdC5rZXlzKHN0eWxlQXR0cmlidXRlcykpIHsKICAgIGVsZW1lbnQuc3R5bGVba2V5XSA9IHN0eWxlQXR0cmlidXRlc1trZXldOwogIH0KICByZXR1cm4gZWxlbWVudDsKfQoKLy8gTWF4IG51bWJlciBvZiBieXRlcyB3aGljaCB3aWxsIGJlIHVwbG9hZGVkIGF0IGEgdGltZS4KY29uc3QgTUFYX1BBWUxPQURfU0laRSA9IDEwMCAqIDEwMjQ7Ci8vIE1heCBhbW91bnQgb2YgdGltZSB0byBibG9jayB3YWl0aW5nIGZvciB0aGUgdXNlci4KY29uc3QgRklMRV9DSEFOR0VfVElNRU9VVF9NUyA9IDMwICogMTAwMDsKCmZ1bmN0aW9uIF91cGxvYWRGaWxlcyhpbnB1dElkLCBvdXRwdXRJZCkgewogIGNvbnN0IHN0ZXBzID0gdXBsb2FkRmlsZXNTdGVwKGlucHV0SWQsIG91dHB1dElkKTsKICBjb25zdCBvdXRwdXRFbGVtZW50ID0gZG9jdW1lbnQuZ2V0RWxlbWVudEJ5SWQob3V0cHV0SWQpOwogIC8vIENhY2hlIHN0ZXBzIG9uIHRoZSBvdXRwdXRFbGVtZW50IHRvIG1ha2UgaXQgYXZhaWxhYmxlIGZvciB0aGUgbmV4dCBjYWxsCiAgLy8gdG8gdXBsb2FkRmlsZXNDb250aW51ZSBmcm9tIFB5dGhvbi4KICBvdXRwdXRFbGVtZW50LnN0ZXBzID0gc3RlcHM7CgogIHJldHVybiBfdXBsb2FkRmlsZXNDb250aW51ZShvdXRwdXRJZCk7Cn0KCi8vIFRoaXMgaXMgcm91Z2hseSBhbiBhc3luYyBnZW5lcmF0b3IgKG5
vdCBzdXBwb3J0ZWQgaW4gdGhlIGJyb3dzZXIgeWV0KSwKLy8gd2hlcmUgdGhlcmUgYXJlIG11bHRpcGxlIGFzeW5jaHJvbm91cyBzdGVwcyBhbmQgdGhlIFB5dGhvbiBzaWRlIGlzIGdvaW5nCi8vIHRvIHBvbGwgZm9yIGNvbXBsZXRpb24gb2YgZWFjaCBzdGVwLgovLyBUaGlzIHVzZXMgYSBQcm9taXNlIHRvIGJsb2NrIHRoZSBweXRob24gc2lkZSBvbiBjb21wbGV0aW9uIG9mIGVhY2ggc3RlcCwKLy8gdGhlbiBwYXNzZXMgdGhlIHJlc3VsdCBvZiB0aGUgcHJldmlvdXMgc3RlcCBhcyB0aGUgaW5wdXQgdG8gdGhlIG5leHQgc3RlcC4KZnVuY3Rpb24gX3VwbG9hZEZpbGVzQ29udGludWUob3V0cHV0SWQpIHsKICBjb25zdCBvdXRwdXRFbGVtZW50ID0gZG9jdW1lbnQuZ2V0RWxlbWVudEJ5SWQob3V0cHV0SWQpOwogIGNvbnN0IHN0ZXBzID0gb3V0cHV0RWxlbWVudC5zdGVwczsKCiAgY29uc3QgbmV4dCA9IHN0ZXBzLm5leHQob3V0cHV0RWxlbWVudC5sYXN0UHJvbWlzZVZhbHVlKTsKICByZXR1cm4gUHJvbWlzZS5yZXNvbHZlKG5leHQudmFsdWUucHJvbWlzZSkudGhlbigodmFsdWUpID0+IHsKICAgIC8vIENhY2hlIHRoZSBsYXN0IHByb21pc2UgdmFsdWUgdG8gbWFrZSBpdCBhdmFpbGFibGUgdG8gdGhlIG5leHQKICAgIC8vIHN0ZXAgb2YgdGhlIGdlbmVyYXRvci4KICAgIG91dHB1dEVsZW1lbnQubGFzdFByb21pc2VWYWx1ZSA9IHZhbHVlOwogICAgcmV0dXJuIG5leHQudmFsdWUucmVzcG9uc2U7CiAgfSk7Cn0KCi8qKgogKiBHZW5lcmF0b3IgZnVuY3Rpb24gd2hpY2ggaXMgY2FsbGVkIGJldHdlZW4gZWFjaCBhc3luYyBzdGVwIG9mIHRoZSB1cGxvYWQKICogcHJvY2Vzcy4KICogQHBhcmFtIHtzdHJpbmd9IGlucHV0SWQgRWxlbWVudCBJRCBvZiB0aGUgaW5wdXQgZmlsZSBwaWNrZXIgZWxlbWVudC4KICogQHBhcmFtIHtzdHJpbmd9IG91dHB1dElkIEVsZW1lbnQgSUQgb2YgdGhlIG91dHB1dCBkaXNwbGF5LgogKiBAcmV0dXJuIHshSXRlcmFibGU8IU9iamVjdD59IEl0ZXJhYmxlIG9mIG5leHQgc3RlcHMuCiAqLwpmdW5jdGlvbiogdXBsb2FkRmlsZXNTdGVwKGlucHV0SWQsIG91dHB1dElkKSB7CiAgY29uc3QgaW5wdXRFbGVtZW50ID0gZG9jdW1lbnQuZ2V0RWxlbWVudEJ5SWQoaW5wdXRJZCk7CiAgaW5wdXRFbGVtZW50LmRpc2FibGVkID0gZmFsc2U7CgogIGNvbnN0IG91dHB1dEVsZW1lbnQgPSBkb2N1bWVudC5nZXRFbGVtZW50QnlJZChvdXRwdXRJZCk7CiAgb3V0cHV0RWxlbWVudC5pbm5lckhUTUwgPSAnJzsKCiAgY29uc3QgcGlja2VkUHJvbWlzZSA9IG5ldyBQcm9taXNlKChyZXNvbHZlKSA9PiB7CiAgICBpbnB1dEVsZW1lbnQuYWRkRXZlbnRMaXN0ZW5lcignY2hhbmdlJywgKGUpID0+IHsKICAgICAgcmVzb2x2ZShlLnRhcmdldC5maWxlcyk7CiAgICB9KTsKICB9KTsKCiAgY29uc3QgY2FuY2VsID0gZG9jdW1lbnQuY3JlYXRlRWxlbWVudCgnYnV0dG9uJyk7CiAgaW5wdXRFbGVtZW50LnBhcmVudEVsZW1lbnQuYXBwZW5
kQ2hpbGQoY2FuY2VsKTsKICBjYW5jZWwudGV4dENvbnRlbnQgPSAnQ2FuY2VsIHVwbG9hZCc7CiAgY29uc3QgY2FuY2VsUHJvbWlzZSA9IG5ldyBQcm9taXNlKChyZXNvbHZlKSA9PiB7CiAgICBjYW5jZWwub25jbGljayA9ICgpID0+IHsKICAgICAgcmVzb2x2ZShudWxsKTsKICAgIH07CiAgfSk7CgogIC8vIENhbmNlbCB1cGxvYWQgaWYgdXNlciBoYXNuJ3QgcGlja2VkIGFueXRoaW5nIGluIHRpbWVvdXQuCiAgY29uc3QgdGltZW91dFByb21pc2UgPSBuZXcgUHJvbWlzZSgocmVzb2x2ZSkgPT4gewogICAgc2V0VGltZW91dCgoKSA9PiB7CiAgICAgIHJlc29sdmUobnVsbCk7CiAgICB9LCBGSUxFX0NIQU5HRV9USU1FT1VUX01TKTsKICB9KTsKCiAgLy8gV2FpdCBmb3IgdGhlIHVzZXIgdG8gcGljayB0aGUgZmlsZXMuCiAgY29uc3QgZmlsZXMgPSB5aWVsZCB7CiAgICBwcm9taXNlOiBQcm9taXNlLnJhY2UoW3BpY2tlZFByb21pc2UsIHRpbWVvdXRQcm9taXNlLCBjYW5jZWxQcm9taXNlXSksCiAgICByZXNwb25zZTogewogICAgICBhY3Rpb246ICdzdGFydGluZycsCiAgICB9CiAgfTsKCiAgaWYgKCFmaWxlcykgewogICAgcmV0dXJuIHsKICAgICAgcmVzcG9uc2U6IHsKICAgICAgICBhY3Rpb246ICdjb21wbGV0ZScsCiAgICAgIH0KICAgIH07CiAgfQoKICBjYW5jZWwucmVtb3ZlKCk7CgogIC8vIERpc2FibGUgdGhlIGlucHV0IGVsZW1lbnQgc2luY2UgZnVydGhlciBwaWNrcyBhcmUgbm90IGFsbG93ZWQuCiAgaW5wdXRFbGVtZW50LmRpc2FibGVkID0gdHJ1ZTsKCiAgZm9yIChjb25zdCBmaWxlIG9mIGZpbGVzKSB7CiAgICBjb25zdCBsaSA9IGRvY3VtZW50LmNyZWF0ZUVsZW1lbnQoJ2xpJyk7CiAgICBsaS5hcHBlbmQoc3BhbihmaWxlLm5hbWUsIHtmb250V2VpZ2h0OiAnYm9sZCd9KSk7CiAgICBsaS5hcHBlbmQoc3BhbigKICAgICAgICBgKCR7ZmlsZS50eXBlIHx8ICduL2EnfSkgLSAke2ZpbGUuc2l6ZX0gYnl0ZXMsIGAgKwogICAgICAgIGBsYXN0IG1vZGlmaWVkOiAkewogICAgICAgICAgICBmaWxlLmxhc3RNb2RpZmllZERhdGUgPyBmaWxlLmxhc3RNb2RpZmllZERhdGUudG9Mb2NhbGVEYXRlU3RyaW5nKCkgOgogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAnbi9hJ30gLSBgKSk7CiAgICBjb25zdCBwZXJjZW50ID0gc3BhbignMCUgZG9uZScpOwogICAgbGkuYXBwZW5kQ2hpbGQocGVyY2VudCk7CgogICAgb3V0cHV0RWxlbWVudC5hcHBlbmRDaGlsZChsaSk7CgogICAgY29uc3QgZmlsZURhdGFQcm9taXNlID0gbmV3IFByb21pc2UoKHJlc29sdmUpID0+IHsKICAgICAgY29uc3QgcmVhZGVyID0gbmV3IEZpbGVSZWFkZXIoKTsKICAgICAgcmVhZGVyLm9ubG9hZCA9IChlKSA9PiB7CiAgICAgICAgcmVzb2x2ZShlLnRhcmdldC5yZXN1bHQpOwogICAgICB9OwogICAgICByZWFkZXIucmVhZEFzQXJyYXlCdWZmZXIoZmlsZSk7CiAgICB9KTsKICAgIC8vIFdhaXQgZm9yIHRoZSBkYXRhIHRvIGJlIHJlYWR5Lgo
gICAgbGV0IGZpbGVEYXRhID0geWllbGQgewogICAgICBwcm9taXNlOiBmaWxlRGF0YVByb21pc2UsCiAgICAgIHJlc3BvbnNlOiB7CiAgICAgICAgYWN0aW9uOiAnY29udGludWUnLAogICAgICB9CiAgICB9OwoKICAgIC8vIFVzZSBhIGNodW5rZWQgc2VuZGluZyB0byBhdm9pZCBtZXNzYWdlIHNpemUgbGltaXRzLiBTZWUgYi82MjExNTY2MC4KICAgIGxldCBwb3NpdGlvbiA9IDA7CiAgICB3aGlsZSAocG9zaXRpb24gPCBmaWxlRGF0YS5ieXRlTGVuZ3RoKSB7CiAgICAgIGNvbnN0IGxlbmd0aCA9IE1hdGgubWluKGZpbGVEYXRhLmJ5dGVMZW5ndGggLSBwb3NpdGlvbiwgTUFYX1BBWUxPQURfU0laRSk7CiAgICAgIGNvbnN0IGNodW5rID0gbmV3IFVpbnQ4QXJyYXkoZmlsZURhdGEsIHBvc2l0aW9uLCBsZW5ndGgpOwogICAgICBwb3NpdGlvbiArPSBsZW5ndGg7CgogICAgICBjb25zdCBiYXNlNjQgPSBidG9hKFN0cmluZy5mcm9tQ2hhckNvZGUuYXBwbHkobnVsbCwgY2h1bmspKTsKICAgICAgeWllbGQgewogICAgICAgIHJlc3BvbnNlOiB7CiAgICAgICAgICBhY3Rpb246ICdhcHBlbmQnLAogICAgICAgICAgZmlsZTogZmlsZS5uYW1lLAogICAgICAgICAgZGF0YTogYmFzZTY0LAogICAgICAgIH0sCiAgICAgIH07CiAgICAgIHBlcmNlbnQudGV4dENvbnRlbnQgPQogICAgICAgICAgYCR7TWF0aC5yb3VuZCgocG9zaXRpb24gLyBmaWxlRGF0YS5ieXRlTGVuZ3RoKSAqIDEwMCl9JSBkb25lYDsKICAgIH0KICB9CgogIC8vIEFsbCBkb25lLgogIHlpZWxkIHsKICAgIHJlc3BvbnNlOiB7CiAgICAgIGFjdGlvbjogJ2NvbXBsZXRlJywKICAgIH0KICB9Owp9CgpzY29wZS5nb29nbGUgPSBzY29wZS5nb29nbGUgfHwge307CnNjb3BlLmdvb2dsZS5jb2xhYiA9IHNjb3BlLmdvb2dsZS5jb2xhYiB8fCB7fTsKc2NvcGUuZ29vZ2xlLmNvbGFiLl9maWxlcyA9IHsKICBfdXBsb2FkRmlsZXMsCiAgX3VwbG9hZEZpbGVzQ29udGludWUsCn07Cn0pKHNlbGYpOwo=", + "headers": [ + [ + "content-type", + "application/javascript" + ] + ], + "ok": true, + "status": 200, + "status_text": "" + } + } + }, + "colab_type": "code", + "id": "43aXKjDRt_qZ", + "outputId": "4d3628f9-e5be-4145-f550-8eaffca97d37" + }, + "outputs": [], + "source": [ + "#@title Install AutoML Tables client library { vertical-output: true }\n", + "\n", + "!pip install google-cloud-automl" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "s3F2xbEJdDvN" + }, + "source": [ + "### Set Project and Location" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "0uX4aJYUiXh5" + }, + "source": 
[ + "Enter your GCP project ID." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "colab_type": "code", + "id": "6R4h5HF1Dtds", + "outputId": "1e049b34-4683-4755-ab08-aec08de2bc66" + }, + "outputs": [], + "source": [ + "#@title GCP project ID and location\n", + "\n", + "project_id = 'energy-forecasting' #@param {type:'string'}\n", + "location = 'us-central1' #@param {type:'string'}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "eVFsPPEociwF" + }, + "source": [ + "### Authenticate using service account key\n", + "Run the following cell. Click on the 'Choose Files' button and select the service account private key file. If your Service Account key file or folder is hidden, you can reveal it in a Mac by pressing the Command + Shift + . combo." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 71, + "resources": { + "http://localhost:8080/nbextensions/google.colab/files.js": { + "data": 
"Ly8gQ29weXJpZ2h0IDIwMTcgR29vZ2xlIExMQwovLwovLyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKLy8geW91IG1heSBub3QgdXNlIHRoaXMgZmlsZSBleGNlcHQgaW4gY29tcGxpYW5jZSB3aXRoIHRoZSBMaWNlbnNlLgovLyBZb3UgbWF5IG9idGFpbiBhIGNvcHkgb2YgdGhlIExpY2Vuc2UgYXQKLy8KLy8gICAgICBodHRwOi8vd3d3LmFwYWNoZS5vcmcvbGljZW5zZXMvTElDRU5TRS0yLjAKLy8KLy8gVW5sZXNzIHJlcXVpcmVkIGJ5IGFwcGxpY2FibGUgbGF3IG9yIGFncmVlZCB0byBpbiB3cml0aW5nLCBzb2Z0d2FyZQovLyBkaXN0cmlidXRlZCB1bmRlciB0aGUgTGljZW5zZSBpcyBkaXN0cmlidXRlZCBvbiBhbiAiQVMgSVMiIEJBU0lTLAovLyBXSVRIT1VUIFdBUlJBTlRJRVMgT1IgQ09ORElUSU9OUyBPRiBBTlkgS0lORCwgZWl0aGVyIGV4cHJlc3Mgb3IgaW1wbGllZC4KLy8gU2VlIHRoZSBMaWNlbnNlIGZvciB0aGUgc3BlY2lmaWMgbGFuZ3VhZ2UgZ292ZXJuaW5nIHBlcm1pc3Npb25zIGFuZAovLyBsaW1pdGF0aW9ucyB1bmRlciB0aGUgTGljZW5zZS4KCi8qKgogKiBAZmlsZW92ZXJ2aWV3IEhlbHBlcnMgZm9yIGdvb2dsZS5jb2xhYiBQeXRob24gbW9kdWxlLgogKi8KKGZ1bmN0aW9uKHNjb3BlKSB7CmZ1bmN0aW9uIHNwYW4odGV4dCwgc3R5bGVBdHRyaWJ1dGVzID0ge30pIHsKICBjb25zdCBlbGVtZW50ID0gZG9jdW1lbnQuY3JlYXRlRWxlbWVudCgnc3BhbicpOwogIGVsZW1lbnQudGV4dENvbnRlbnQgPSB0ZXh0OwogIGZvciAoY29uc3Qga2V5IG9mIE9iamVjdC5rZXlzKHN0eWxlQXR0cmlidXRlcykpIHsKICAgIGVsZW1lbnQuc3R5bGVba2V5XSA9IHN0eWxlQXR0cmlidXRlc1trZXldOwogIH0KICByZXR1cm4gZWxlbWVudDsKfQoKLy8gTWF4IG51bWJlciBvZiBieXRlcyB3aGljaCB3aWxsIGJlIHVwbG9hZGVkIGF0IGEgdGltZS4KY29uc3QgTUFYX1BBWUxPQURfU0laRSA9IDEwMCAqIDEwMjQ7Ci8vIE1heCBhbW91bnQgb2YgdGltZSB0byBibG9jayB3YWl0aW5nIGZvciB0aGUgdXNlci4KY29uc3QgRklMRV9DSEFOR0VfVElNRU9VVF9NUyA9IDMwICogMTAwMDsKCmZ1bmN0aW9uIF91cGxvYWRGaWxlcyhpbnB1dElkLCBvdXRwdXRJZCkgewogIGNvbnN0IHN0ZXBzID0gdXBsb2FkRmlsZXNTdGVwKGlucHV0SWQsIG91dHB1dElkKTsKICBjb25zdCBvdXRwdXRFbGVtZW50ID0gZG9jdW1lbnQuZ2V0RWxlbWVudEJ5SWQob3V0cHV0SWQpOwogIC8vIENhY2hlIHN0ZXBzIG9uIHRoZSBvdXRwdXRFbGVtZW50IHRvIG1ha2UgaXQgYXZhaWxhYmxlIGZvciB0aGUgbmV4dCBjYWxsCiAgLy8gdG8gdXBsb2FkRmlsZXNDb250aW51ZSBmcm9tIFB5dGhvbi4KICBvdXRwdXRFbGVtZW50LnN0ZXBzID0gc3RlcHM7CgogIHJldHVybiBfdXBsb2FkRmlsZXNDb250aW51ZShvdXRwdXRJZCk7Cn0KCi8vIFRoaXMgaXMgcm91Z2hseSBhbiBhc3luYyBnZW5lcmF0b3IgKG5
vdCBzdXBwb3J0ZWQgaW4gdGhlIGJyb3dzZXIgeWV0KSwKLy8gd2hlcmUgdGhlcmUgYXJlIG11bHRpcGxlIGFzeW5jaHJvbm91cyBzdGVwcyBhbmQgdGhlIFB5dGhvbiBzaWRlIGlzIGdvaW5nCi8vIHRvIHBvbGwgZm9yIGNvbXBsZXRpb24gb2YgZWFjaCBzdGVwLgovLyBUaGlzIHVzZXMgYSBQcm9taXNlIHRvIGJsb2NrIHRoZSBweXRob24gc2lkZSBvbiBjb21wbGV0aW9uIG9mIGVhY2ggc3RlcCwKLy8gdGhlbiBwYXNzZXMgdGhlIHJlc3VsdCBvZiB0aGUgcHJldmlvdXMgc3RlcCBhcyB0aGUgaW5wdXQgdG8gdGhlIG5leHQgc3RlcC4KZnVuY3Rpb24gX3VwbG9hZEZpbGVzQ29udGludWUob3V0cHV0SWQpIHsKICBjb25zdCBvdXRwdXRFbGVtZW50ID0gZG9jdW1lbnQuZ2V0RWxlbWVudEJ5SWQob3V0cHV0SWQpOwogIGNvbnN0IHN0ZXBzID0gb3V0cHV0RWxlbWVudC5zdGVwczsKCiAgY29uc3QgbmV4dCA9IHN0ZXBzLm5leHQob3V0cHV0RWxlbWVudC5sYXN0UHJvbWlzZVZhbHVlKTsKICByZXR1cm4gUHJvbWlzZS5yZXNvbHZlKG5leHQudmFsdWUucHJvbWlzZSkudGhlbigodmFsdWUpID0+IHsKICAgIC8vIENhY2hlIHRoZSBsYXN0IHByb21pc2UgdmFsdWUgdG8gbWFrZSBpdCBhdmFpbGFibGUgdG8gdGhlIG5leHQKICAgIC8vIHN0ZXAgb2YgdGhlIGdlbmVyYXRvci4KICAgIG91dHB1dEVsZW1lbnQubGFzdFByb21pc2VWYWx1ZSA9IHZhbHVlOwogICAgcmV0dXJuIG5leHQudmFsdWUucmVzcG9uc2U7CiAgfSk7Cn0KCi8qKgogKiBHZW5lcmF0b3IgZnVuY3Rpb24gd2hpY2ggaXMgY2FsbGVkIGJldHdlZW4gZWFjaCBhc3luYyBzdGVwIG9mIHRoZSB1cGxvYWQKICogcHJvY2Vzcy4KICogQHBhcmFtIHtzdHJpbmd9IGlucHV0SWQgRWxlbWVudCBJRCBvZiB0aGUgaW5wdXQgZmlsZSBwaWNrZXIgZWxlbWVudC4KICogQHBhcmFtIHtzdHJpbmd9IG91dHB1dElkIEVsZW1lbnQgSUQgb2YgdGhlIG91dHB1dCBkaXNwbGF5LgogKiBAcmV0dXJuIHshSXRlcmFibGU8IU9iamVjdD59IEl0ZXJhYmxlIG9mIG5leHQgc3RlcHMuCiAqLwpmdW5jdGlvbiogdXBsb2FkRmlsZXNTdGVwKGlucHV0SWQsIG91dHB1dElkKSB7CiAgY29uc3QgaW5wdXRFbGVtZW50ID0gZG9jdW1lbnQuZ2V0RWxlbWVudEJ5SWQoaW5wdXRJZCk7CiAgaW5wdXRFbGVtZW50LmRpc2FibGVkID0gZmFsc2U7CgogIGNvbnN0IG91dHB1dEVsZW1lbnQgPSBkb2N1bWVudC5nZXRFbGVtZW50QnlJZChvdXRwdXRJZCk7CiAgb3V0cHV0RWxlbWVudC5pbm5lckhUTUwgPSAnJzsKCiAgY29uc3QgcGlja2VkUHJvbWlzZSA9IG5ldyBQcm9taXNlKChyZXNvbHZlKSA9PiB7CiAgICBpbnB1dEVsZW1lbnQuYWRkRXZlbnRMaXN0ZW5lcignY2hhbmdlJywgKGUpID0+IHsKICAgICAgcmVzb2x2ZShlLnRhcmdldC5maWxlcyk7CiAgICB9KTsKICB9KTsKCiAgY29uc3QgY2FuY2VsID0gZG9jdW1lbnQuY3JlYXRlRWxlbWVudCgnYnV0dG9uJyk7CiAgaW5wdXRFbGVtZW50LnBhcmVudEVsZW1lbnQuYXBwZW5
kQ2hpbGQoY2FuY2VsKTsKICBjYW5jZWwudGV4dENvbnRlbnQgPSAnQ2FuY2VsIHVwbG9hZCc7CiAgY29uc3QgY2FuY2VsUHJvbWlzZSA9IG5ldyBQcm9taXNlKChyZXNvbHZlKSA9PiB7CiAgICBjYW5jZWwub25jbGljayA9ICgpID0+IHsKICAgICAgcmVzb2x2ZShudWxsKTsKICAgIH07CiAgfSk7CgogIC8vIENhbmNlbCB1cGxvYWQgaWYgdXNlciBoYXNuJ3QgcGlja2VkIGFueXRoaW5nIGluIHRpbWVvdXQuCiAgY29uc3QgdGltZW91dFByb21pc2UgPSBuZXcgUHJvbWlzZSgocmVzb2x2ZSkgPT4gewogICAgc2V0VGltZW91dCgoKSA9PiB7CiAgICAgIHJlc29sdmUobnVsbCk7CiAgICB9LCBGSUxFX0NIQU5HRV9USU1FT1VUX01TKTsKICB9KTsKCiAgLy8gV2FpdCBmb3IgdGhlIHVzZXIgdG8gcGljayB0aGUgZmlsZXMuCiAgY29uc3QgZmlsZXMgPSB5aWVsZCB7CiAgICBwcm9taXNlOiBQcm9taXNlLnJhY2UoW3BpY2tlZFByb21pc2UsIHRpbWVvdXRQcm9taXNlLCBjYW5jZWxQcm9taXNlXSksCiAgICByZXNwb25zZTogewogICAgICBhY3Rpb246ICdzdGFydGluZycsCiAgICB9CiAgfTsKCiAgaWYgKCFmaWxlcykgewogICAgcmV0dXJuIHsKICAgICAgcmVzcG9uc2U6IHsKICAgICAgICBhY3Rpb246ICdjb21wbGV0ZScsCiAgICAgIH0KICAgIH07CiAgfQoKICBjYW5jZWwucmVtb3ZlKCk7CgogIC8vIERpc2FibGUgdGhlIGlucHV0IGVsZW1lbnQgc2luY2UgZnVydGhlciBwaWNrcyBhcmUgbm90IGFsbG93ZWQuCiAgaW5wdXRFbGVtZW50LmRpc2FibGVkID0gdHJ1ZTsKCiAgZm9yIChjb25zdCBmaWxlIG9mIGZpbGVzKSB7CiAgICBjb25zdCBsaSA9IGRvY3VtZW50LmNyZWF0ZUVsZW1lbnQoJ2xpJyk7CiAgICBsaS5hcHBlbmQoc3BhbihmaWxlLm5hbWUsIHtmb250V2VpZ2h0OiAnYm9sZCd9KSk7CiAgICBsaS5hcHBlbmQoc3BhbigKICAgICAgICBgKCR7ZmlsZS50eXBlIHx8ICduL2EnfSkgLSAke2ZpbGUuc2l6ZX0gYnl0ZXMsIGAgKwogICAgICAgIGBsYXN0IG1vZGlmaWVkOiAkewogICAgICAgICAgICBmaWxlLmxhc3RNb2RpZmllZERhdGUgPyBmaWxlLmxhc3RNb2RpZmllZERhdGUudG9Mb2NhbGVEYXRlU3RyaW5nKCkgOgogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAnbi9hJ30gLSBgKSk7CiAgICBjb25zdCBwZXJjZW50ID0gc3BhbignMCUgZG9uZScpOwogICAgbGkuYXBwZW5kQ2hpbGQocGVyY2VudCk7CgogICAgb3V0cHV0RWxlbWVudC5hcHBlbmRDaGlsZChsaSk7CgogICAgY29uc3QgZmlsZURhdGFQcm9taXNlID0gbmV3IFByb21pc2UoKHJlc29sdmUpID0+IHsKICAgICAgY29uc3QgcmVhZGVyID0gbmV3IEZpbGVSZWFkZXIoKTsKICAgICAgcmVhZGVyLm9ubG9hZCA9IChlKSA9PiB7CiAgICAgICAgcmVzb2x2ZShlLnRhcmdldC5yZXN1bHQpOwogICAgICB9OwogICAgICByZWFkZXIucmVhZEFzQXJyYXlCdWZmZXIoZmlsZSk7CiAgICB9KTsKICAgIC8vIFdhaXQgZm9yIHRoZSBkYXRhIHRvIGJlIHJlYWR5Lgo
gICAgbGV0IGZpbGVEYXRhID0geWllbGQgewogICAgICBwcm9taXNlOiBmaWxlRGF0YVByb21pc2UsCiAgICAgIHJlc3BvbnNlOiB7CiAgICAgICAgYWN0aW9uOiAnY29udGludWUnLAogICAgICB9CiAgICB9OwoKICAgIC8vIFVzZSBhIGNodW5rZWQgc2VuZGluZyB0byBhdm9pZCBtZXNzYWdlIHNpemUgbGltaXRzLiBTZWUgYi82MjExNTY2MC4KICAgIGxldCBwb3NpdGlvbiA9IDA7CiAgICB3aGlsZSAocG9zaXRpb24gPCBmaWxlRGF0YS5ieXRlTGVuZ3RoKSB7CiAgICAgIGNvbnN0IGxlbmd0aCA9IE1hdGgubWluKGZpbGVEYXRhLmJ5dGVMZW5ndGggLSBwb3NpdGlvbiwgTUFYX1BBWUxPQURfU0laRSk7CiAgICAgIGNvbnN0IGNodW5rID0gbmV3IFVpbnQ4QXJyYXkoZmlsZURhdGEsIHBvc2l0aW9uLCBsZW5ndGgpOwogICAgICBwb3NpdGlvbiArPSBsZW5ndGg7CgogICAgICBjb25zdCBiYXNlNjQgPSBidG9hKFN0cmluZy5mcm9tQ2hhckNvZGUuYXBwbHkobnVsbCwgY2h1bmspKTsKICAgICAgeWllbGQgewogICAgICAgIHJlc3BvbnNlOiB7CiAgICAgICAgICBhY3Rpb246ICdhcHBlbmQnLAogICAgICAgICAgZmlsZTogZmlsZS5uYW1lLAogICAgICAgICAgZGF0YTogYmFzZTY0LAogICAgICAgIH0sCiAgICAgIH07CiAgICAgIHBlcmNlbnQudGV4dENvbnRlbnQgPQogICAgICAgICAgYCR7TWF0aC5yb3VuZCgocG9zaXRpb24gLyBmaWxlRGF0YS5ieXRlTGVuZ3RoKSAqIDEwMCl9JSBkb25lYDsKICAgIH0KICB9CgogIC8vIEFsbCBkb25lLgogIHlpZWxkIHsKICAgIHJlc3BvbnNlOiB7CiAgICAgIGFjdGlvbjogJ2NvbXBsZXRlJywKICAgIH0KICB9Owp9CgpzY29wZS5nb29nbGUgPSBzY29wZS5nb29nbGUgfHwge307CnNjb3BlLmdvb2dsZS5jb2xhYiA9IHNjb3BlLmdvb2dsZS5jb2xhYiB8fCB7fTsKc2NvcGUuZ29vZ2xlLmNvbGFiLl9maWxlcyA9IHsKICBfdXBsb2FkRmlsZXMsCiAgX3VwbG9hZEZpbGVzQ29udGludWUsCn07Cn0pKHNlbGYpOwo=", + "headers": [ + [ + "content-type", + "application/javascript" + ] + ], + "ok": true, + "status": 200, + "status_text": "" + } + } + }, + "colab_type": "code", + "id": "u-kCqysAuaJk", + "outputId": "06154a63-f410-435f-b565-cd1599243b88" + }, + "outputs": [], + "source": [ + "#@title Authenticate using service account key and create a client. 
{ vertical-output: true }\n", + "\n", + "from google.oauth2 import service_account\n", + "from google.colab import files\n", + "from google.cloud import automl_v1beta1 as automl\n", + "import google.cloud.automl_v1beta1.proto.data_types_pb2 as data_types\n", + "\n", + "# Upload service account key\n", + "keyfile_upload = files.upload()\n", + "keyfile_name = list(keyfile_upload.keys())[0]\n", + "# Authenticate and create an AutoML client.\n", + "credentials = service_account.Credentials.from_service_account_file(keyfile_name)\n", + "client = automl.TablesClient(project=project_id, region=location, credentials=credentials)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "qozQWMnOu48y" + }, + "source": [ + "\n", + "\n", + "---\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "ODt86YuVDZzm" + }, + "source": [ + "## 3. Import training data" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "XwjZc9Q62Fm5" + }, + "source": [ + "### Create dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "_JfZFGSceyE_" + }, + "source": [ + "Select a dataset display name and pass your table source information to create a new dataset." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 224 + }, + "colab_type": "code", + "id": "Z_JErW3cw-0J", + "outputId": "7fe366df-73ae-4ab1-ceaa-fd6ced4ccdd9" + }, + "outputs": [], + "source": [ + "#@title Create dataset { vertical-output: true, output-height: 200 }\n", + "\n", + "dataset_display_name = 'energy_forecasting_solution' \n", + "dataset = client.create_dataset(dataset_display_name)\n", + "print(dataset.name) # unique to this dataset\n", + "dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "35YZ9dy34VqJ" + }, + "source": [ + "### Import data" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "3c0o15gVREAw" + }, + "source": [ + "You can import your data to AutoML Tables from GCS or BigQuery. You can create a GCS bucket and upload the data into your bucket. The URI for your file is `gs://BUCKET_NAME/FOLDER_NAME1/FOLDER_NAME2/.../FILE_NAME`. Alternatively you can create a BigQuery table and upload the data into the table. The URI for your table is `bq://PROJECT_ID.DATASET_ID.TABLE_ID`.\n", + "\n", + "Importing data may take a few minutes or hours depending on the size of your data. __If your Colab times out__, run the following command to retrieve your dataset. 
Replace `dataset_name` with its actual value obtained in the preceding cells.\n", + "\n", + "```python\n", + " # This will work if your display name ('energy_forecasting_solution') is unique to your project.\n", + " dataset = client.get_dataset(dataset_display_name=dataset_display_name)\n", + " # OR, if you have multiple datasets with the same display name ('energy_forecasting_solution'), use the\n", + " # unique identifier acquired from the above cell ( print(dataset.name) ).\n", + " dataset = client.get_dataset(dataset_name=dataset_name)\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 139 + }, + "colab_type": "code", + "id": "FNVYfpoXJsNB", + "outputId": "0ecc8d11-5bf1-4c2e-f688-b6d9be934e3c" + }, + "outputs": [], + "source": [ + " #@title Import data { vertical-output: true }\n", + "\n", + "dataset_bq_input_uri = 'bq://energy-forecasting.Energy.automldata' #@param {type: 'string'}\n", + "\n", + "import_data_operation = client.import_data(\n", + " dataset=dataset,\n", + " bigquery_input_uri=dataset_bq_input_uri\n", + ")\n", + "\n", + "print('Dataset import operation: {}'.format(import_data_operation.operation))\n", + "\n", + "# Wait until import is done.\n", + "import_data_result = import_data_operation.result()\n", + "import_data_result" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "QdxBI4s44ZRI" + }, + "source": [ + "### Review the specs" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "RC0PWKqH4jwr" + }, + "source": [ + "Run the following command to see table specs such as row count." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 3247 + }, + "colab_type": "code", + "id": "v2Vzq_gwXxo-", + "outputId": "c89cd7b1-4344-46d9-c4a3-1b012b5b720d" + }, + "outputs": [], + "source": [ + "#@title Table schema { vertical-output: true }\n", + "\n", + "# List table specs\n", + "list_table_specs_response = client.list_table_specs(dataset=dataset)\n", + "table_specs = [s for s in list_table_specs_response]\n", + "\n", + "# List column specs\n", + "list_column_specs_response = client.list_column_specs(dataset=dataset)\n", + "column_specs = {s.display_name: s for s in list_column_specs_response}\n", + "\n", + "# Print Features and data_type:\n", + "\n", + "features = [(key, data_types.TypeCode.Name(value.data_type.type_code)) for key, value in column_specs.items()]\n", + "print('Feature list:\\n')\n", + "for feature in features:\n", + " print(feature[0],':', feature[1])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "FNykW_YOYt6d" + }, + "source": [ + "___" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "kNRVJqVOL8h3" + }, + "source": [ + "## 4. Update dataset: assign a label column and enable nullable columns" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "-57gehId9PQ5" + }, + "source": [ + "AutoML Tables automatically detects your data column type. For example, for the [Iris dataset](https://storage.cloud.google.com/rostam-193618-tutorial/automl-tables-v1beta1/iris.csv) it detects `species` to be categorical and `petal_length`, `petal_width`, `sepal_length`, and `sepal_width` to be numerical. Depending on the type of your label column, AutoML Tables chooses to run a classification or regression model. 
If your label column contains only numerical values, but they represent categories, change your label column type to categorical by updating your schema." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "iRqdQ7Xiq04x" + }, + "source": [ + "### Update a column: set as categorical" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "colab_type": "code", + "id": "OCEUIPKegWrf", + "outputId": "44370b2c-f3dc-46bc-cefd-8a6f29f9cabe" + }, + "outputs": [], + "source": [ + "#@title Update dataset { vertical-output: true }\n", + "\n", + "column_to_category = 'hour' #@param {type: 'string'}\n", + "\n", + "update_column_response = client.update_column_spec(\n", + " dataset=dataset,\n", + " column_spec_display_name=column_to_category,\n", + " type_code='CATEGORY'\n", + ")\n", + "\n", + "update_column_response.display_name, update_column_response.data_type " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "nDMH_chybe4w" + }, + "source": [ + "### Update dataset: assign a target and split column" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 360 + }, + "colab_type": "code", + "id": "hVIruWg0u33t", + "outputId": "eeb5f733-16ec-4191-ea59-c2fab30c8442" + }, + "outputs": [], + "source": [ + "#@title Update dataset { vertical-output: true }\n", + "\n", + "target_column_name = 'price' #@param {type: 'string'}\n", + "split_column_name = 'split' #@param {type: 'string'}\n", + "\n", + "client.set_target_column(\n", + " dataset=dataset,\n", + " column_spec_display_name=target_column_name,\n", + ")\n", + "\n", + "client.set_test_train_column(\n", + " dataset=dataset,\n", + " column_spec_display_name=split_column_name,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": 
"z23NITLrcxmi" + }, + "source": [ + "___" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "FcKgvj1-Tbgj" + }, + "source": [ + "## 5. Creating a model" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "Pnlk8vdQlO_k" + }, + "source": [ + "### Train a model\n", + "\n", + "Specify the duration of the training. For example, `train_budget_milli_node_hours=1000` runs the training for one hour. \n", + "\n", + "If your Colab times out, use `client.list_models()` to check whether your model has been created. Then use model name to continue to the next steps. Run the following command to retrieve your model.\n", + "\n", + "```python\n", + " model = client.get_model(model_display_name=model_display_name) \n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 139 + }, + "colab_type": "code", + "id": "11izNd6Fu37N", + "outputId": "1bca25aa-eb19-4b27-a3fa-7ef137aaf4e2" + }, + "outputs": [], + "source": [ + "#@title Create model { vertical-output: true }\n", + "\n", + "model_display_name = 'energy_model' #@param {type:'string'}\n", + "model_train_hours = 12 #@param {type:'integer'}\n", + "model_optimization_objective = 'MINIMIZE_MAE' #@param {type:'string'}\n", + "\n", + "create_model_response = client.create_model(\n", + " model_display_name,\n", + " dataset=dataset,\n", + " optimization_objective=model_optimization_objective,\n", + " train_budget_milli_node_hours=model_train_hours * 1000,\n", + ")\n", + "\n", + "print('Dataset import operation: {}'.format(create_model_response.operation))\n", + "# Wait until model training is done.\n", + "model = create_model_response.result()\n", + "model_name = model.name\n", + "model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 85 + }, + "colab_type": "code", + 
"id": "puVew1GgPfQa", + "outputId": "42b9296c-d231-4787-f7fb-4aa1a6ff9bd9" + }, + "outputs": [], + "source": [ + "#@title Model Metrics {vertical-output: true }\n", + "\n", + "metrics= [x for x in client.list_model_evaluations(model=model)][-1]\n", + "metrics.regression_evaluation_metrics" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "YQnfEwyrSt2T" + }, + "source": [ + "![alt text](https://storage.googleapis.com/images_public/automl_test.png)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 272 + }, + "colab_type": "code", + "id": "Vyc8ckbpRMHp", + "outputId": "931d4921-2144-4092-dab6-165c1b1c2a88" + }, + "outputs": [], + "source": [ + "#@title Feature Importance {vertical-output: true }\n", + "\n", + "feat_list = [(x.feature_importance, x.column_display_name) for x in model.tables_model_metadata.tables_model_column_info]\n", + "feat_list.sort(reverse=True)\n", + "feat_list[:15]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "__2gDQ5I5gcj" + }, + "source": [ + "![alt text](https://storage.googleapis.com/images_public/feature_importance.png)\n", + "![alt text](https://storage.googleapis.com/images_public/loc_portugal.png)\n", + "![alt text](https://storage.googleapis.com/images_public/weather_schema.png)\n", + "![alt text](https://storage.googleapis.com/images_public/training_schema.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "1wS1is9IY5nK" + }, + "source": [ + "___" + ] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "name": "Energy_Price_Forecasting.ipynb", + "provenance": [], + "version": "0.3.2" + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + 
"mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.7" + } + }, + "nbformat": 4, + "nbformat_minor": 1 } diff --git a/samples/tables/notebooks/music_recommendation/music_recommendation.ipynb b/samples/tables/notebooks/music_recommendation/music_recommendation.ipynb index e77e479f..112a6310 100644 --- a/samples/tables/notebooks/music_recommendation/music_recommendation.ipynb +++ b/samples/tables/notebooks/music_recommendation/music_recommendation.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -845,7 +845,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.3" + "version": "3.6.7" } }, "nbformat": 4, diff --git a/samples/tables/notebooks/purchase_prediction/purchase_prediction.ipynb b/samples/tables/notebooks/purchase_prediction/purchase_prediction.ipynb index 7148408c..39a4cb8b 100644 --- a/samples/tables/notebooks/purchase_prediction/purchase_prediction.ipynb +++ b/samples/tables/notebooks/purchase_prediction/purchase_prediction.ipynb @@ -302,8 +302,7 @@ "metadata": {}, "outputs": [], "source": [ - "client = automl.AutoMlClient()\n", - "prediction_client = automl.PredictionServiceClient()" + "client = automl.TablesClient(project=PROJECT_ID, region=COMPUTE_REGION)" ] }, { @@ -322,8 +321,9 @@ "metadata": {}, "outputs": [], "source": [ - "# client = automl.AutoMlClient.from_service_account_file('/path/to/service_account.json')\n", - "# prediction_client = automl.PredictionServiceClient.from_service_account_file('/path/to/service_account.json')" + "# from google.oauth2 import service_account\n", + "# credentials = service_account.Credentials.from_service_account_file('/path/to/service_account.json')\n", + "# client = automl.TablesClient(project=PROJECT_ID, region=COMPUTE_REGION, credentials=credentials)" ] }, { @@ -333,16 +333,6 @@ 
"---" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Get the GCP location of your project.\n", - "location_path = client.location_path(PROJECT_ID, COMPUTE_REGION)" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -410,12 +400,8 @@ "\n", "dataset_display_name = 'colab_trial1' #@param {type: 'string'}\n", "\n", - "create_dataset_response = client.create_dataset(\n", - " location_path,\n", - " {'display_name': dataset_display_name, 'tables_dataset_metadata': {}})\n", - "dataset_name = create_dataset_response.name\n", - "create_dataset_response\n", - "\n" + "dataset = client.create_dataset(dataset_display_name)\n", + "dataset" ] }, { @@ -794,32 +780,17 @@ "source": [ "#@title ... take the data source from GCS { vertical-output: true } \n", "\n", - "dataset_gcs_input_uris = ['gs:///training_unnested_balanced_FULL.csv',] #@param\n", - "# Define input configuration.\n", - "input_config = {\n", - " 'gcs_source': {\n", - " 'input_uris': dataset_gcs_input_uris\n", - " }\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "SfXjtAwDsYlV" - }, - "outputs": [], - "source": [ - " #@title Import data { vertical-output: true }\n", + "dataset_gcs_input_uris = ['gs://{}/training_unnested_balanced_FULL.csv'.format(bucket_name),] #@param\n", + "import_data_operation = client.import_data(\n", + " dataset=dataset,\n", + " gcs_input_uris=dataset_gcs_input_uris\n", + ")\n", + "print('Dataset import operation: {}'.format(import_data_operation))\n", "\n", - "import_data_response = client.import_data(dataset_name, input_config)\n", - "print('Dataset import operation: {}'.format(import_data_response.operation))\n", - "# Wait until import is done.\n", - "import_data_result = import_data_response.result()\n", - "import_data_result" + "# Synchronous check of operation status. 
Wait until import is done.\n", + "import_data_operation.result()\n", + "dataset = client.get_dataset(dataset_name=dataset.name)\n", + "dataset" ] }, { @@ -866,24 +837,28 @@ }, "outputs": [], "source": [ - "#@title Table schema { vertical-output: true }\n", - "\n", - "import google.cloud.automl_v1beta1.proto.data_types_pb2 as data_types\n", - "import matplotlib.pyplot as plt\n", - "\n", "# List table specs\n", - "list_table_specs_response = client.list_table_specs(dataset_name)\n", + "list_table_specs_response = client.list_table_specs(dataset=dataset)\n", "table_specs = [s for s in list_table_specs_response]\n", + "\n", "# List column specs\n", - "table_spec_name = table_specs[0].name\n", - "list_column_specs_response = client.list_column_specs(table_spec_name)\n", + "list_column_specs_response = client.list_column_specs(dataset=dataset)\n", "column_specs = {s.display_name: s for s in list_column_specs_response}\n", + "\n", + "# Print Features and data_type:\n", + "\n", + "features = [(key, data_types.TypeCode.Name(value.data_type.type_code)) for key, value in column_specs.items()]\n", + "print('Feature list:\\n')\n", + "for feature in features:\n", + " print(feature[0],':', feature[1])\n", + " \n", "# Table schema pie chart.\n", + "\n", "type_counts = {}\n", "for column_spec in column_specs.values():\n", " type_name = data_types.TypeCode.Name(column_spec.data_type.type_code)\n", " type_counts[type_name] = type_counts.get(type_name, 0) + 1\n", - "\n", + " \n", "plt.pie(x=type_counts.values(), labels=type_counts.keys(), autopct='%1.1f%%')\n", "plt.axis('equal')\n", "plt.show()\n" @@ -901,14 +876,11 @@ "source": [ "#@title Update a column: set to not nullable { vertical-output: true }\n", "\n", - "update_column_spec_dict = {\n", - " 'name': column_specs['totalTransactionRevenue'].name,\n", - " 'data_type': {\n", - " 'type_code': 'CATEGORY',\n", - " 'nullable': False\n", - " }\n", - "}\n", - "update_column_response = 
client.update_column_spec(update_column_spec_dict)\n", + "update_column_response = client.update_column_spec(\n", + " dataset=dataset,\n", + " column_spec_display_name='totalTransactionRevenue',\n", + " nullable=False,\n", + ")\n", "update_column_response" ] }, @@ -919,9 +891,7 @@ "id": "3O9cFko3t3ai" }, "source": [ - "**Tip:** You can use `'type_code': 'CATEGORY'` in the preceding `update_column_spec_dict` to convert the column data type from `FLOAT64` `to `CATEGORY`.\n", - "\n", - "\n" + "**Tip:** You can use kwarg `type_code='CATEGORY'` in the preceding `update_column_spec(..)` call to convert the column data type from `FLOAT64` `to `CATEGORY`." ] }, { @@ -931,9 +901,7 @@ "id": "rR2RaPP7t6y8" }, "source": [ - "### Update dataset: assign a label\n", - "\n", - "Simply update the target column to not nullable, and update the assigned label to ‘totalTransactionRevenue’" + "### Update dataset: assign a target column" ] }, { @@ -948,18 +916,10 @@ "source": [ "#@title Update dataset { vertical-output: true }\n", "\n", - "label_column_name = 'totalTransactionRevenue' #@param {type: 'string'}\n", - "label_column_spec = column_specs[label_column_name]\n", - "label_column_id = label_column_spec.name.rsplit('/', 1)[-1]\n", - "print('Label column ID: {}'.format(label_column_id))\n", - "# Define the values of the fields to be updated.\n", - "update_dataset_dict = {\n", - " 'name': dataset_name,\n", - " 'tables_dataset_metadata': {\n", - " 'target_column_spec_id': label_column_id\n", - " }\n", - "}\n", - "update_dataset_response = client.update_dataset(update_dataset_dict)\n", + "update_dataset_response = client.set_target_column(\n", + " dataset=dataset,\n", + " column_spec_display_name='totalTransactionRevenue',\n", + ")\n", "update_dataset_response" ] }, @@ -988,9 +948,9 @@ "\n", "The decision to divide the sessions along time was made to avoid the model training on future data to predict past data. 
(This can be avoided with a datetime variable in the dataset and by toggling a button in the UI)\n", "\n", - "Training the model may take one hour or more. The following cell keeps running until the training is done. If your Colab times out, use `client.list_models(location_path)` to check whether your model has been created. Then use model name to continue to the next steps. Run the following command to retrieve your model. Replace `model_name` with its actual value.\n", + "Training the model may take one hour or more. The following cell keeps running until the training is done. If your Colab times out, use `client.list_models()` to check whether your model has been created. Then use model name to continue to the next steps. Run the following command to retrieve your model. Replace `model_name` with its actual value.\n", "\n", - " model = client.get_model(model_name)\n", + " model = client.get_model(model_name=model_name)\n", " \n", "Note that we trained on the first 9 months of data and we validate using the last 3." 
] @@ -1009,17 +969,15 @@ "#this will create a model that can be access through the auto ml tables colab\n", "model_display_name = 'trial_1' #@param {type:'string'}\n", "\n", - "model_dict = {\n", - " 'display_name': model_display_name,\n", - " 'dataset_id': dataset_name.rsplit('/', 1)[-1],\n", - " 'tables_model_metadata': {'train_budget_milli_node_hours': 1000}\n", - "}\n", - "create_model_response = client.create_model(location_path, model_dict)\n", - "print('Dataset import operation: {}'.format(create_model_response.operation))\n", + "create_model_response = client.create_model(\n", + " model_display_name,\n", + " dataset=dataset,\n", + " train_budget_milli_node_hours=1000,\n", + ")\n", + "print('Create model operation: {}'.format(create_model_response.operation))\n", "# Wait until model training is done.\n", - "create_model_result = create_model_response.result()\n", - "model_name = create_model_result.name\n", - "print(model_name)" + "model = create_model_response.result()\n", + "model" ] }, { @@ -1068,26 +1026,16 @@ "#@title Start batch prediction { vertical-output: true, output-height: 200 }\n", "\n", "batch_predict_gcs_input_uris = ['gs://cloud-ml-data-tables/notebooks/validation_unnested_FULL.csv',] #@param\n", - "batch_predict_gcs_output_uri_prefix = 'gs://' #@param {type:'string'}\n", - "# Define input source.\n", - "batch_prediction_input_source = {\n", - " 'gcs_source': {\n", - " 'input_uris': batch_predict_gcs_input_uris\n", - " }\n", - "}\n", - "# Define output target.\n", - "batch_prediction_output_target = {\n", - " 'gcs_destination': {\n", - " 'output_uri_prefix': batch_predict_gcs_output_uri_prefix\n", - " }\n", - "}\n", - "batch_predict_response = prediction_client.batch_predict(\n", - " model_name, batch_prediction_input_source, batch_prediction_output_target)\n", + "batch_predict_gcs_output_uri_prefix = 'gs://{}'.format(bucket_name) #@param {type:'string'}\n", + "batch_predict_response = client.batch_predict(\n", + " model=model, \n", + " 
gcs_input_uris=batch_predict_gcs_input_uris,\n", + " gcs_output_uri_prefix=batch_predict_gcs_output_uri_prefix,\n", + ")\n", "print('Batch prediction operation: {}'.format(batch_predict_response.operation))\n", "# Wait until batch prediction is done.\n", "batch_predict_result = batch_predict_response.result()\n", - "batch_predict_response.metadata\n", - "\n" + "batch_predict_response.metadata" ] }, { @@ -1234,7 +1182,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.3" + "version": "3.6.7" } }, "nbformat": 4, diff --git a/samples/tables/notebooks/result_slicing/slicing_eval_results.ipynb b/samples/tables/notebooks/result_slicing/slicing_eval_results.ipynb index bac645db..32a0f568 100644 --- a/samples/tables/notebooks/result_slicing/slicing_eval_results.ipynb +++ b/samples/tables/notebooks/result_slicing/slicing_eval_results.ipynb @@ -1,373 +1,384 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "name": "slicing_eval_results.ipynb", - "version": "0.3.2", - "provenance": [ - { - "file_id": "1goi268plF-1AJ77xjdMwIpapBr1ssb-q", - "timestamp": 1551899111384 - }, - { - "file_id": "/piper/depot/google3/cloud/ml/autoflow/colab/slicing_eval_results.ipynb?workspaceId=simonewu:autoflow-1::citc", - "timestamp": 1547767618990 - }, - { - "file_id": "1fjkKgZq5iMevPnfiIpSHSiSiw5XimZ1C", - "timestamp": 1547596565571 - } - ], - "collapsed_sections": [], - "last_runtime": { - "build_target": "//learning/fairness/colabs:ml_fairness_notebook", - "kind": "shared" - } - }, - "kernelspec": { - "display_name": "Python 2", - "name": "python2" - } + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "jt_Hqb95fRz8" + }, + "source": [ + "# Slicing AutoML Tables Evaluation Results with BigQuery\n", + "\n", + "This colab assumes that you've created a dataset with AutoML Tables, and used that dataset to train a classification model. 
Once the model is done training, you also need to export the results table by using the following instructions. You'll see more detailed setup instructions below.\n", + "\n", + "This colab will walk you through the process of using BigQuery to visualize data slices, showing you one simple way to evaluate your model for bias.\n", + "\n", + "## Setup\n", + "\n", + "To use this Colab, copy it to your own Google Drive or open it in the Playground mode. Follow the instructions in the [AutoML Tables Product docs](https://cloud.google.com/automl-tables/docs/) to create a GCP project, enable the API, and create and download a service account private key, and set up required permission. You'll also need to use the AutoML Tables frontend or service to create a model and export its evaluation results to BigQuery. You should find a link on the Evaluate tab to view your evaluation results in BigQuery once you've finished training your model. Then navigate to BigQuery in your GCP console and you'll see your new results table in the list of tables to which your project has access. \n", + "\n", + "For demo purposes, we'll be using the [Default of Credit Card Clients](https://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients) dataset for analysis. This dataset was collected to help compare different methods of predicting credit card default. Using this colab to analyze your own dataset may require a little adaptation.\n", + "\n", + "The code below will sample if you want it to. Or you can set sample_count to be as large or larger than your dataset to use the whole thing for analysis. 
\n", + "\n", + "Note also that although the data we use in this demo is public, you'll need to enter your own Google Cloud project ID in the parameter below to authenticate to it.\n", + "\n" + ] }, - "cells": [ - { - "metadata": { - "colab_type": "text", - "id": "jt_Hqb95fRz8" - }, - "cell_type": "markdown", - "source": [ - "# Slicing AutoML Tables Evaluation Results with BigQuery\n", - "\n", - "This colab assumes that you've created a dataset with AutoML Tables, and used that dataset to train a classification model. Once the model is done training, you also need to export the results table by using the following instructions. You'll see more detailed setup instructions below.\n", - "\n", - "This colab will walk you through the process of using BigQuery to visualize data slices, showing you one simple way to evaluate your model for bias.\n", - "\n", - "## Setup\n", - "\n", - "To use this Colab, copy it to your own Google Drive or open it in the Playground mode. Follow the instructions in the [AutoML Tables Product docs](https://cloud.google.com/automl-tables/docs/) to create a GCP project, enable the API, and create and download a service account private key, and set up required permission. You'll also need to use the AutoML Tables frontend or service to create a model and export its evaluation results to BigQuery. You should find a link on the Evaluate tab to view your evaluation results in BigQuery once you've finished training your model. Then navigate to BigQuery in your GCP console and you'll see your new results table in the list of tables to which your project has access. \n", - "\n", - "For demo purposes, we'll be using the [Default of Credit Card Clients](https://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients) dataset for analysis. This dataset was collected to help compare different methods of predicting credit card default. 
Using this colab to analyze your own dataset may require a little adaptation.\n", - "\n", - "The code below will sample if you want it to. Or you can set sample_count to be as large or larger than your dataset to use the whole thing for analysis. \n", - "\n", - "Note also that although the data we use in this demo is public, you'll need to enter your own Google Cloud project ID in the parameter below to authenticate to it.\n", - "\n" - ] - }, - { - "metadata": { - "colab_type": "code", - "id": "m2oL8tO-f9rK", - "colab": {} - }, - "cell_type": "code", - "source": [ - "from __future__ import absolute_import\n", - "from __future__ import division\n", - "from __future__ import print_function\n", - "\n", - "from google.colab import auth\n", - "import numpy as np\n", - "import os\n", - "import pandas as pd\n", - "import sys\n", - "sys.path.append('./python')\n", - "from sklearn.metrics import confusion_matrix\n", - "from sklearn.metrics import accuracy_score, roc_curve, roc_auc_score\n", - "from sklearn.metrics import precision_recall_curve\n", - "# For facets\n", - "from IPython.core.display import display, HTML\n", - "import base64\n", - "!pip install --upgrade tf-nightly witwidget\n", - "import witwidget.notebook.visualization as visualization\n", - "!pip install apache-beam\n", - "!pip install --upgrade tensorflow_model_analysis\n", - "!pip install --upgrade tensorflow\n", - "\n", - "import tensorflow as tf\n", - "import tensorflow_model_analysis as tfma\n", - "print('TFMA version: {}'.format(tfma.version.VERSION_STRING))\n", - "\n", - "# https://cloud.google.com/resource-manager/docs/creating-managing-projects\n", - "project_id = '[YOUR PROJECT ID HERE]' #@param {type:\"string\"}\n", - "table_name = 'bigquery-public-data:ml_datasets.credit_card_default' #@param {type:\"string\"}\n", - "os.environ[\"GOOGLE_CLOUD_PROJECT\"]=project_id\n", - "sample_count = 3000 #@param\n", - "row_count = pd.io.gbq.read_gbq('''\n", - " SELECT \n", - " COUNT(*) as total\n", - " FROM 
[%s]''' % (table_name), project_id=project_id, verbose=False).total[0]\n", - "df = pd.io.gbq.read_gbq('''\n", - " SELECT\n", - " *\n", - " FROM\n", - " [%s]\n", - " WHERE RAND() < %d/%d\n", - "''' % (table_name, sample_count, row_count), project_id=project_id, verbose=False)\n", - "print('Full dataset has %d rows' % row_count)\n", - "df.describe()" - ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "colab_type": "text", - "id": "608Fe8PRtj5q" - }, - "cell_type": "markdown", - "source": [ - "##Data Preprocessing\n", - "\n", - "Many of the tools we use to analyze models and data expect to find their inputs in the [tensorflow.Example](https://www.tensorflow.org/tutorials/load_data/tf_records) format. Here, we'll preprocess our data into tf.Examples, and also extract the predicted class from our classifier, which is binary." - ] - }, - { - "metadata": { - "colab_type": "code", - "id": "lqZeO9aGtn2s", - "colab": {} - }, - "cell_type": "code", - "source": [ - "unique_id_field = 'ID' #@param\n", - "prediction_field_score = 'predicted_default_payment_next_month_tables_score' #@param\n", - "prediction_field_value = 'predicted_default_payment_next_month_tables_value' #@param\n", - "\n", - "\n", - "def extract_top_class(prediction_tuples):\n", - " # values from Tables show up as a CSV of individual json (prediction, confidence) objects.\n", - " best_score = 0\n", - " best_class = u''\n", - " for val, sco in prediction_tuples:\n", - " if sco > best_score:\n", - " best_score = sco\n", - " best_class = val\n", - " return (best_class, best_score)\n", - "\n", - "def df_to_examples(df, columns=None):\n", - " examples = []\n", - " if columns == None:\n", - " columns = df.columns.values.tolist()\n", - " for id in df[unique_id_field].unique():\n", - " example = tf.train.Example()\n", - " prediction_tuples = zip(df.loc[df[unique_id_field] == id][prediction_field_value], df.loc[df[unique_id_field] == id][prediction_field_score])\n", - " row = 
df.loc[df[unique_id_field] == id].iloc[0]\n", - " for col in columns:\n", - " if col == prediction_field_score or col == prediction_field_value:\n", - " # Deal with prediction fields separately\n", - " continue\n", - " elif df[col].dtype is np.dtype(np.int64):\n", - " example.features.feature[col].int64_list.value.append(int(row[col]))\n", - " elif df[col].dtype is np.dtype(np.float64):\n", - " example.features.feature[col].float_list.value.append(row[col])\n", - " elif row[col] is None:\n", - " continue\n", - " elif row[col] == row[col]:\n", - " example.features.feature[col].bytes_list.value.append(row[col].encode('utf-8'))\n", - " cla, sco = extract_top_class(prediction_tuples)\n", - " example.features.feature['predicted_class'].int64_list.value.append(cla)\n", - " example.features.feature['predicted_class_score'].float_list.value.append(sco)\n", - " examples.append(example)\n", - " return examples\n", - "\n", - "# Fix up some types so analysis is consistent. This code is specific to the dataset.\n", - "df = df.astype({\"PAY_5\": float, \"PAY_6\": float})\n", - "\n", - "# Converts a dataframe column into a column of 0's and 1's based on the provided test.\n", - "def make_label_column_numeric(df, label_column, test):\n", - " df[label_column] = np.where(test(df[label_column]), 1, 0)\n", - " \n", - "# Convert label types to numeric. This code is specific to the dataset.\n", - "make_label_column_numeric(df, 'predicted_default_payment_next_month_tables_value', lambda val: val == '1')\n", - "make_label_column_numeric(df, 'default_payment_next_month', lambda val: val == '1')\n", - "\n", - "examples = df_to_examples(df)\n", - "print(\"Preprocessing complete!\")" - ], - "execution_count": 0, - "outputs": [] - }, - { - "metadata": { - "colab_type": "text", - "id": "XwnOX_orVZEs" - }, - "cell_type": "markdown", - "source": [ - "## What-If Tool\n", - "\n", - "First, we'll explore the data and predictions using the [What-If Tool](https://pair-code.github.io/what-if-tool/). 
The What-If tool is a powerful visual interface to explore data, models, and predictions. Because we're reading our results from BigQuery, we aren't able to use the features of the What-If Tool that query the model directly. But we can still learn a lot about this dataset from the exploration that the What-If tool enables.\n", - "\n", - "Imagine that you're curious to discover whether there's a discrepancy in the predictive power of your model depending on the marital status of the person whose credit history is being analyzed. You can use the What-If Tool to look at a glance and see the relative sizes of the data samples for each class. In this dataset, the marital statuses are encoded as 1 = married; 2 = single; 3 = divorce; 0=others. You can see using the What-If Tool that there are very few samples for classes other than married or single, which might indicate that performance could be compromised. If this lack of representation concerns you, you could consider collecting more data for underrepresented classes, downsampling overrepresented classes, or upweighting underrepresented data types as you train, depending on your use case and data availability.\n" - ] - }, - { - "metadata": { - "colab_type": "code", - "id": "tjWxGOBkVXQ6", - "colab": {} - }, - "cell_type": "code", - "source": [ - "WitWidget = visualization.WitWidget\n", - "WitConfigBuilder = visualization.WitConfigBuilder\n", - "\n", - "num_datapoints = 2965 #@param {type: \"number\"}\n", - "tool_height_in_px = 700 #@param {type: \"number\"}\n", - "\n", - "# Setup the tool with the test examples and the trained classifier\n", - "config_builder = WitConfigBuilder(examples[:num_datapoints])\n", - "# Need to call this so we have inference_address and model_name initialized\n", - "config_builder = config_builder.set_estimator_and_feature_spec('', '')\n", - "config_builder = config_builder.set_compare_estimator_and_feature_spec('', '')\n", - "wv = WitWidget(config_builder, height=tool_height_in_px)" - ], - 
"execution_count": 0, - "outputs": [] - }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "m2oL8tO-f9rK" + }, + "outputs": [], + "source": [ + "from __future__ import absolute_import\n", + "from __future__ import division\n", + "from __future__ import print_function\n", + "\n", + "from google.colab import auth\n", + "import numpy as np\n", + "import os\n", + "import pandas as pd\n", + "import sys\n", + "sys.path.append('./python')\n", + "from sklearn.metrics import confusion_matrix\n", + "from sklearn.metrics import accuracy_score, roc_curve, roc_auc_score\n", + "from sklearn.metrics import precision_recall_curve\n", + "# For facets\n", + "from IPython.core.display import display, HTML\n", + "import base64\n", + "!pip install --upgrade tf-nightly witwidget\n", + "import witwidget.notebook.visualization as visualization\n", + "!pip install apache-beam\n", + "!pip install --upgrade tensorflow_model_analysis\n", + "!pip install --upgrade tensorflow\n", + "\n", + "import tensorflow as tf\n", + "import tensorflow_model_analysis as tfma\n", + "print('TFMA version: {}'.format(tfma.version.VERSION_STRING))\n", + "\n", + "# https://cloud.google.com/resource-manager/docs/creating-managing-projects\n", + "project_id = '[YOUR PROJECT ID HERE]' #@param {type:\"string\"}\n", + "table_name = 'bigquery-public-data:ml_datasets.credit_card_default' #@param {type:\"string\"}\n", + "os.environ[\"GOOGLE_CLOUD_PROJECT\"]=project_id\n", + "sample_count = 3000 #@param\n", + "row_count = pd.io.gbq.read_gbq('''\n", + " SELECT \n", + " COUNT(*) as total\n", + " FROM [%s]''' % (table_name), project_id=project_id, verbose=False).total[0]\n", + "df = pd.io.gbq.read_gbq('''\n", + " SELECT\n", + " *\n", + " FROM\n", + " [%s]\n", + " WHERE RAND() < %d/%d\n", + "''' % (table_name, sample_count, row_count), project_id=project_id, verbose=False)\n", + "print('Full dataset has %d rows' % row_count)\n", + "df.describe()" + ] + }, + { + 
"cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "608Fe8PRtj5q" + }, + "source": [ + "##Data Preprocessing\n", + "\n", + "Many of the tools we use to analyze models and data expect to find their inputs in the [tensorflow.Example](https://www.tensorflow.org/tutorials/load_data/tf_records) format. Here, we'll preprocess our data into tf.Examples, and also extract the predicted class from our classifier, which is binary." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "lqZeO9aGtn2s" + }, + "outputs": [], + "source": [ + "unique_id_field = 'ID' #@param\n", + "prediction_field_score = 'predicted_default_payment_next_month_tables_score' #@param\n", + "prediction_field_value = 'predicted_default_payment_next_month_tables_value' #@param\n", + "\n", + "\n", + "def extract_top_class(prediction_tuples):\n", + " # values from Tables show up as a CSV of individual json (prediction, confidence) objects.\n", + " best_score = 0\n", + " best_class = u''\n", + " for val, sco in prediction_tuples:\n", + " if sco > best_score:\n", + " best_score = sco\n", + " best_class = val\n", + " return (best_class, best_score)\n", + "\n", + "def df_to_examples(df, columns=None):\n", + " examples = []\n", + " if columns == None:\n", + " columns = df.columns.values.tolist()\n", + " for id in df[unique_id_field].unique():\n", + " example = tf.train.Example()\n", + " prediction_tuples = zip(df.loc[df[unique_id_field] == id][prediction_field_value], df.loc[df[unique_id_field] == id][prediction_field_score])\n", + " row = df.loc[df[unique_id_field] == id].iloc[0]\n", + " for col in columns:\n", + " if col == prediction_field_score or col == prediction_field_value:\n", + " # Deal with prediction fields separately\n", + " continue\n", + " elif df[col].dtype is np.dtype(np.int64):\n", + " example.features.feature[col].int64_list.value.append(int(row[col]))\n", + " elif df[col].dtype is 
np.dtype(np.float64):\n", + " example.features.feature[col].float_list.value.append(row[col])\n", + " elif row[col] is None:\n", + " continue\n", + " elif row[col] == row[col]:\n", + " example.features.feature[col].bytes_list.value.append(row[col].encode('utf-8'))\n", + " cla, sco = extract_top_class(prediction_tuples)\n", + " example.features.feature['predicted_class'].int64_list.value.append(cla)\n", + " example.features.feature['predicted_class_score'].float_list.value.append(sco)\n", + " examples.append(example)\n", + " return examples\n", + "\n", + "# Fix up some types so analysis is consistent. This code is specific to the dataset.\n", + "df = df.astype({\"PAY_5\": float, \"PAY_6\": float})\n", + "\n", + "# Converts a dataframe column into a column of 0's and 1's based on the provided test.\n", + "def make_label_column_numeric(df, label_column, test):\n", + " df[label_column] = np.where(test(df[label_column]), 1, 0)\n", + " \n", + "# Convert label types to numeric. This code is specific to the dataset.\n", + "make_label_column_numeric(df, 'predicted_default_payment_next_month_tables_value', lambda val: val == '1')\n", + "make_label_column_numeric(df, 'default_payment_next_month', lambda val: val == '1')\n", + "\n", + "examples = df_to_examples(df)\n", + "print(\"Preprocessing complete!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "XwnOX_orVZEs" + }, + "source": [ + "## What-If Tool\n", + "\n", + "First, we'll explore the data and predictions using the [What-If Tool](https://pair-code.github.io/what-if-tool/). The What-If tool is a powerful visual interface to explore data, models, and predictions. Because we're reading our results from BigQuery, we aren't able to use the features of the What-If Tool that query the model directly. 
But we can still learn a lot about this dataset from the exploration that the What-If tool enables.\n", + "\n", + "Imagine that you're curious to discover whether there's a discrepancy in the predictive power of your model depending on the marital status of the person whose credit history is being analyzed. You can use the What-If Tool to look at a glance and see the relative sizes of the data samples for each class. In this dataset, the marital statuses are encoded as 1 = married; 2 = single; 3 = divorce; 0=others. You can see using the What-If Tool that there are very few samples for classes other than married or single, which might indicate that performance could be compromised. If this lack of representation concerns you, you could consider collecting more data for underrepresented classes, downsampling overrepresented classes, or upweighting underrepresented data types as you train, depending on your use case and data availability.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "tjWxGOBkVXQ6" + }, + "outputs": [], + "source": [ + "WitWidget = visualization.WitWidget\n", + "WitConfigBuilder = visualization.WitConfigBuilder\n", + "\n", + "num_datapoints = 2965 #@param {type: \"number\"}\n", + "tool_height_in_px = 700 #@param {type: \"number\"}\n", + "\n", + "# Setup the tool with the test examples and the trained classifier\n", + "config_builder = WitConfigBuilder(examples[:num_datapoints])\n", + "# Need to call this so we have inference_address and model_name initialized\n", + "config_builder = config_builder.set_estimator_and_feature_spec('', '')\n", + "config_builder = config_builder.set_compare_estimator_and_feature_spec('', '')\n", + "wv = WitWidget(config_builder, height=tool_height_in_px)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "YHydLAY991Du" + }, + "source": [ + "## Tensorflow Model Analysis\n", + "\n", + "Then, let's examine 
some sliced metrics. This section of the tutorial will use [TFMA](https://github.com/tensorflow/model-analysis) model agnostic analysis capabilities. \n", + "\n", + "TFMA generates sliced metrics graphs and confusion matrices. We can use these to dig deeper into the question of how well this model performs on different classes of marital status. The model was built to optimize for AUC ROC metric, and it does fairly well for all of the classes, though there is a small performance gap for the \"divorced\" category. But when we look at the AUC-PR metric slices, we can see that the \"divorced\" and \"other\" classes are very poorly served by the model compared to the more common classes. AUC-PR is the metric that measures how well the tradeoff between precision and recall is being made in the model's predictions. If we're concerned about this gap, we could consider retraining to use AUC-PR as the optimization metric and see whether that model does a better job making equitable predictions. " + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "ZfU11b0797le" + }, + "outputs": [], + "source": [ + "import apache_beam as beam\n", + "import tempfile\n", + "\n", + "from collections import OrderedDict\n", + "from google.protobuf import text_format\n", + "from tensorflow_model_analysis import post_export_metrics\n", + "from tensorflow_model_analysis import types\n", + "from tensorflow_model_analysis.api import model_eval_lib\n", + "from tensorflow_model_analysis.evaluators import aggregate\n", + "from tensorflow_model_analysis.extractors import slice_key_extractor\n", + "from tensorflow_model_analysis.model_agnostic_eval import model_agnostic_evaluate_graph\n", + "from tensorflow_model_analysis.model_agnostic_eval import model_agnostic_extractor\n", + "from tensorflow_model_analysis.model_agnostic_eval import model_agnostic_predict\n", + "from tensorflow_model_analysis.proto import metrics_for_slice_pb2\n", 
+ "from tensorflow_model_analysis.slicer import slicer\n", + "from tensorflow_model_analysis.view.widget_view import render_slicing_metrics\n", + "\n", + "# To set up model agnostic extraction, need to specify features and labels of\n", + "# interest in a feature map.\n", + "feature_map = OrderedDict();\n", + "\n", + "for i, column in enumerate(df.columns):\n", + " type = df.dtypes[i]\n", + " if column == prediction_field_score or column == prediction_field_value:\n", + " continue\n", + " elif (type == np.dtype(np.float64)):\n", + " feature_map[column] = tf.FixedLenFeature([], tf.float32)\n", + " elif (type == np.dtype(np.object)):\n", + " feature_map[column] = tf.FixedLenFeature([], tf.string)\n", + " elif (type == np.dtype(np.int64)):\n", + " feature_map[column] = tf.FixedLenFeature([], tf.int64)\n", + " elif (type == np.dtype(np.bool)):\n", + " feature_map[column] = tf.FixedLenFeature([], tf.bool)\n", + " elif (type == np.dtype(np.datetime64)):\n", + " feature_map[column] = tf.FixedLenFeature([], tf.timestamp)\n", + "\n", + "feature_map['predicted_class'] = tf.FixedLenFeature([], tf.int64)\n", + "feature_map['predicted_class_score'] = tf.FixedLenFeature([], tf.float32)\n", + "\n", + "serialized_examples = [e.SerializeToString() for e in examples]\n", + "\n", + "BASE_DIR = tempfile.gettempdir()\n", + "OUTPUT_DIR = os.path.join(BASE_DIR, 'output')\n", + "\n", + "slice_column = 'MARRIAGE' #@param\n", + "predicted_labels = 'predicted_class' #@param\n", + "actual_labels = 'default_payment_next_month' #@param\n", + "predicted_class_score = 'predicted_class_score' #@param\n", + "\n", + "with beam.Pipeline() as pipeline:\n", + " model_agnostic_config = model_agnostic_predict.ModelAgnosticConfig(\n", + " label_keys=[actual_labels],\n", + " prediction_keys=[predicted_labels],\n", + " feature_spec=feature_map)\n", + " \n", + " extractors = [\n", + " model_agnostic_extractor.ModelAgnosticExtractor(\n", + " model_agnostic_config=model_agnostic_config,\n", + " 
desired_batch_size=3),\n", + " slice_key_extractor.SliceKeyExtractor([\n", + " slicer.SingleSliceSpec(columns=[slice_column])\n", + " ])\n", + " ]\n", + "\n", + " auc_roc_callback = post_export_metrics.auc(\n", + " labels_key=actual_labels,\n", + " target_prediction_keys=[predicted_labels])\n", + " \n", + " auc_pr_callback = post_export_metrics.auc(\n", + " curve='PR',\n", + " labels_key=actual_labels,\n", + " target_prediction_keys=[predicted_labels])\n", + " \n", + " confusion_matrix_callback = post_export_metrics.confusion_matrix_at_thresholds(\n", + " labels_key=actual_labels,\n", + " target_prediction_keys=[predicted_labels],\n", + " example_weight_key=predicted_class_score,\n", + " thresholds=[0.0, 0.5, 0.8, 1.0])\n", + "\n", + " # Create our model agnostic aggregator.\n", + " eval_shared_model = types.EvalSharedModel(\n", + " construct_fn=model_agnostic_evaluate_graph.make_construct_fn(\n", + " add_metrics_callbacks=[confusion_matrix_callback,\n", + " auc_roc_callback,\n", + " auc_pr_callback,\n", + " post_export_metrics.example_count()],\n", + " fpl_feed_config=model_agnostic_extractor\n", + " .ModelAgnosticGetFPLFeedConfig(model_agnostic_config)))\n", + "\n", + " # Run Model Agnostic Eval.\n", + " _ = (\n", + " pipeline\n", + " | beam.Create(serialized_examples)\n", + " | 'ExtractEvaluateAndWriteResults' >>\n", + " model_eval_lib.ExtractEvaluateAndWriteResults(\n", + " eval_shared_model=eval_shared_model,\n", + " output_path=OUTPUT_DIR,\n", + " extractors=extractors))\n", + " \n", + "\n", + "eval_result = tfma.load_eval_result(output_path=OUTPUT_DIR)\n", + "render_slicing_metrics(eval_result, slicing_column = slice_column)" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "mOotC2D5Onqu" + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "last_runtime": { + "build_target": "//learning/fairness/colabs:ml_fairness_notebook", + 
"kind": "shared" + }, + "name": "slicing_eval_results.ipynb", + "provenance": [ { - "metadata": { - "colab_type": "text", - "id": "YHydLAY991Du" - }, - "cell_type": "markdown", - "source": [ - "## Tensorflow Model Analysis\n", - "\n", - "Then, let's examine some sliced metrics. This section of the tutorial will use [TFMA](https://github.com/tensorflow/model-analysis) model agnostic analysis capabilities. \n", - "\n", - "TFMA generates sliced metrics graphs and confusion matrices. We can use these to dig deeper into the question of how well this model performs on different classes of marital status. The model was built to optimize for AUC ROC metric, and it does fairly well for all of the classes, though there is a small performance gap for the \"divorced\" category. But when we look at the AUC-PR metric slices, we can see that the \"divorced\" and \"other\" classes are very poorly served by the model compared to the more common classes. AUC-PR is the metric that measures how well the tradeoff between precision and recall is being made in the model's predictions. If we're concerned about this gap, we could consider retraining to use AUC-PR as the optimization metric and see whether that model does a better job making equitable predictions. 
" - ] + "file_id": "1goi268plF-1AJ77xjdMwIpapBr1ssb-q", + "timestamp": 1551899111384 }, { - "metadata": { - "colab_type": "code", - "id": "ZfU11b0797le", - "colab": {} - }, - "cell_type": "code", - "source": [ - "import apache_beam as beam\n", - "import tempfile\n", - "\n", - "from collections import OrderedDict\n", - "from google.protobuf import text_format\n", - "from tensorflow_model_analysis import post_export_metrics\n", - "from tensorflow_model_analysis import types\n", - "from tensorflow_model_analysis.api import model_eval_lib\n", - "from tensorflow_model_analysis.evaluators import aggregate\n", - "from tensorflow_model_analysis.extractors import slice_key_extractor\n", - "from tensorflow_model_analysis.model_agnostic_eval import model_agnostic_evaluate_graph\n", - "from tensorflow_model_analysis.model_agnostic_eval import model_agnostic_extractor\n", - "from tensorflow_model_analysis.model_agnostic_eval import model_agnostic_predict\n", - "from tensorflow_model_analysis.proto import metrics_for_slice_pb2\n", - "from tensorflow_model_analysis.slicer import slicer\n", - "from tensorflow_model_analysis.view.widget_view import render_slicing_metrics\n", - "\n", - "# To set up model agnostic extraction, need to specify features and labels of\n", - "# interest in a feature map.\n", - "feature_map = OrderedDict();\n", - "\n", - "for i, column in enumerate(df.columns):\n", - " type = df.dtypes[i]\n", - " if column == prediction_field_score or column == prediction_field_value:\n", - " continue\n", - " elif (type == np.dtype(np.float64)):\n", - " feature_map[column] = tf.FixedLenFeature([], tf.float32)\n", - " elif (type == np.dtype(np.object)):\n", - " feature_map[column] = tf.FixedLenFeature([], tf.string)\n", - " elif (type == np.dtype(np.int64)):\n", - " feature_map[column] = tf.FixedLenFeature([], tf.int64)\n", - " elif (type == np.dtype(np.bool)):\n", - " feature_map[column] = tf.FixedLenFeature([], tf.bool)\n", - " elif (type == np.dtype(np.datetime64)):\n", 
- " feature_map[column] = tf.FixedLenFeature([], tf.timestamp)\n", - "\n", - "feature_map['predicted_class'] = tf.FixedLenFeature([], tf.int64)\n", - "feature_map['predicted_class_score'] = tf.FixedLenFeature([], tf.float32)\n", - "\n", - "serialized_examples = [e.SerializeToString() for e in examples]\n", - "\n", - "BASE_DIR = tempfile.gettempdir()\n", - "OUTPUT_DIR = os.path.join(BASE_DIR, 'output')\n", - "\n", - "slice_column = 'MARRIAGE' #@param\n", - "predicted_labels = 'predicted_class' #@param\n", - "actual_labels = 'default_payment_next_month' #@param\n", - "predicted_class_score = 'predicted_class_score' #@param\n", - "\n", - "with beam.Pipeline() as pipeline:\n", - " model_agnostic_config = model_agnostic_predict.ModelAgnosticConfig(\n", - " label_keys=[actual_labels],\n", - " prediction_keys=[predicted_labels],\n", - " feature_spec=feature_map)\n", - " \n", - " extractors = [\n", - " model_agnostic_extractor.ModelAgnosticExtractor(\n", - " model_agnostic_config=model_agnostic_config,\n", - " desired_batch_size=3),\n", - " slice_key_extractor.SliceKeyExtractor([\n", - " slicer.SingleSliceSpec(columns=[slice_column])\n", - " ])\n", - " ]\n", - "\n", - " auc_roc_callback = post_export_metrics.auc(\n", - " labels_key=actual_labels,\n", - " target_prediction_keys=[predicted_labels])\n", - " \n", - " auc_pr_callback = post_export_metrics.auc(\n", - " curve='PR',\n", - " labels_key=actual_labels,\n", - " target_prediction_keys=[predicted_labels])\n", - " \n", - " confusion_matrix_callback = post_export_metrics.confusion_matrix_at_thresholds(\n", - " labels_key=actual_labels,\n", - " target_prediction_keys=[predicted_labels],\n", - " example_weight_key=predicted_class_score,\n", - " thresholds=[0.0, 0.5, 0.8, 1.0])\n", - "\n", - " # Create our model agnostic aggregator.\n", - " eval_shared_model = types.EvalSharedModel(\n", - " construct_fn=model_agnostic_evaluate_graph.make_construct_fn(\n", - " add_metrics_callbacks=[confusion_matrix_callback,\n", - " 
auc_roc_callback,\n", - " auc_pr_callback,\n", - " post_export_metrics.example_count()],\n", - " fpl_feed_config=model_agnostic_extractor\n", - " .ModelAgnosticGetFPLFeedConfig(model_agnostic_config)))\n", - "\n", - " # Run Model Agnostic Eval.\n", - " _ = (\n", - " pipeline\n", - " | beam.Create(serialized_examples)\n", - " | 'ExtractEvaluateAndWriteResults' >>\n", - " model_eval_lib.ExtractEvaluateAndWriteResults(\n", - " eval_shared_model=eval_shared_model,\n", - " output_path=OUTPUT_DIR,\n", - " extractors=extractors))\n", - " \n", - "\n", - "eval_result = tfma.load_eval_result(output_path=OUTPUT_DIR)\n", - "render_slicing_metrics(eval_result, slicing_column = slice_column)" - ], - "execution_count": 0, - "outputs": [] + "file_id": "/piper/depot/google3/cloud/ml/autoflow/colab/slicing_eval_results.ipynb?workspaceId=simonewu:autoflow-1::citc", + "timestamp": 1547767618990 }, { - "metadata": { - "colab_type": "code", - "id": "mOotC2D5Onqu", - "colab": {} - }, - "cell_type": "code", - "source": [ - "" - ], - "execution_count": 0, - "outputs": [] + "file_id": "1fjkKgZq5iMevPnfiIpSHSiSiw5XimZ1C", + "timestamp": 1547596565571 } - ] -} \ No newline at end of file + ], + "version": "0.3.2" + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.7" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/samples/tables/notebooks/retail_product_stockout_prediction/retail_product_stockout_prediction.ipynb b/samples/tables/notebooks/retail_product_stockout_prediction/retail_product_stockout_prediction.ipynb index 3d695ece..b984679f 100644 --- a/samples/tables/notebooks/retail_product_stockout_prediction/retail_product_stockout_prediction.ipynb +++ 
b/samples/tables/notebooks/retail_product_stockout_prediction/retail_product_stockout_prediction.ipynb @@ -345,11 +345,7 @@ "import google.cloud.automl_v1beta1.proto.data_types_pb2 as data_types\n", "import matplotlib.pyplot as plt\n", "\n", - "client = automl.AutoMlClient()\n", - "prediction_client = automl.PredictionServiceClient()\n", - "\n", - "# Get the GCP location of your project.\n", - "location_path = client.location_path(PROJECT_ID, COMPUTE_REGION)" + "client = automl.TablesClient(project=PROJECT_ID, region=COMPUTE_REGION)" ] }, { @@ -387,8 +383,8 @@ "source": [ "#@title List datasets. { vertical-output: true }\n", "\n", - "list_datasets_response = client.list_datasets(location_path)\n", - "datasets = {dataset.display_name: dataset.name for dataset in list_datasets_response}\n", + "list_datasets = client.list_datasets()\n", + "datasets = { dataset.display_name: dataset.name for dataset in list_datasets }\n", "datasets" ] }, @@ -417,8 +413,8 @@ "source": [ "#@title List models. 
{ vertical-output: true }\n", "\n", - "list_models_response = client.list_models(location_path)\n", - "models = {model.display_name: model.name for model in list_models_response}\n", + "list_models = client.list_models()\n", + "models = { model.display_name: model.name for model in list_models }\n", "models" ] }, @@ -479,32 +475,8 @@ "\n", "dataset_display_name = 'stockout_data' #@param {type: 'string'}\n", "\n", - "dataset_dict = {\n", - " 'display_name': dataset_display_name, \n", - " 'tables_dataset_metadata': {}\n", - "}\n", - "\n", - "create_dataset_response = client.create_dataset(\n", - " location_path,\n", - " dataset_dict\n", - ")\n", - "create_dataset_response" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "RLRgvqzUdxfL" - }, - "outputs": [], - "source": [ - " #@title Get dataset name { vertical-output: true }\n", - "\n", - "dataset_name = create_dataset_response.name\n", - "dataset_name" + "dataset = client.create_dataset(dataset_display_name)\n", + "dataset" ] }, { @@ -531,27 +503,6 @@ "See the table schema and dataset description from the README. " ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "bB_GdeqCJW5i" - }, - "outputs": [], - "source": [ - "#@title ... 
if data source is BigQuery { vertical-output: true }\n", - "\n", - "dataset_bq_input_uri = 'bq://product-stockout.product_stockout.stockout' #@param {type: 'string'}\n", - "# Define input configuration.\n", - "input_config = {\n", - " 'bigquery_source': {\n", - " 'input_uri': dataset_bq_input_uri\n", - " }\n", - "}" - ] - }, { "cell_type": "code", "execution_count": null, @@ -564,26 +515,16 @@ "source": [ "#@title Import data { vertical-output: true }\n", "\n", - "import_data_response = client.import_data(dataset_name, \n", - " input_config)\n", - "print('Dataset import operation: {}'.format(import_data_response.operation))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "1O7tJ8IlefRC" - }, - "outputs": [], - "source": [ - "#@title Check if importing the data is complete { vertical-output: true }\n", + "import_data_operation = client.import_data(\n", + " dataset=dataset,\n", + " bigquery_input_uri=bq_input_uri,\n", + ")\n", + "print('Dataset import operation: {}'.format(import_data_operation))\n", "\n", - "# If returns `False`, you can check back again later.\n", - "# Continue with the rest only if this cell returns a `True`.\n", - "import_data_response.done()" + "# Synchronous check of operation status. 
Wait until import is done.\n", + "import_data_operation.result()\n", + "dataset = client.get_dataset(dataset_name=dataset.name)\n", + "dataset" ] }, { @@ -611,7 +552,7 @@ "outputs": [], "source": [ "# dataset_name = '' #@param {type: 'string'}\n", - "# dataset = client.get_dataset(dataset_name) " + "# dataset = client.get_dataset(dataset_name=dataset_name) " ] }, { @@ -644,24 +585,36 @@ }, "outputs": [], "source": [ - "#@title Table schema { vertical-output: true }\n", - "\n", "# List table specs\n", - "list_table_specs_response = client.list_table_specs(dataset_name)\n", + "list_table_specs_response = client.list_table_specs(dataset=dataset)\n", "table_specs = [s for s in list_table_specs_response]\n", + "\n", "# List column specs\n", - "table_spec_name = table_specs[0].name\n", - "list_column_specs_response = client.list_column_specs(table_spec_name)\n", + "list_column_specs_response = client.list_column_specs(dataset=dataset)\n", "column_specs = {s.display_name: s for s in list_column_specs_response}\n", - "# Table schema pie chart.\n", + "\n", + "# Print Features and data_type:\n", + "\n", + "features = [(key, data_types.TypeCode.Name(value.data_type.type_code)) for key, value in column_specs.items()]\n", + "print('Feature list:\\n')\n", + "for feature in features:\n", + " print(feature[0],':', feature[1])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ "type_counts = {}\n", "for column_spec in column_specs.values():\n", " type_name = data_types.TypeCode.Name(column_spec.data_type.type_code)\n", " type_counts[type_name] = type_counts.get(type_name, 0) + 1\n", - "\n", + " \n", "plt.pie(x=type_counts.values(), labels=type_counts.keys(), autopct='%1.1f%%')\n", "plt.axis('equal')\n", - "plt.show()\n" + "plt.show()" ] }, { @@ -720,23 +673,6 @@ "In addition, AutoML Tables detects `Stockout` to be categorical that chooses to run a classification model. 
" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "Pyku3AHEfSp4" - }, - "outputs": [], - "source": [ - "#@title List table specs { vertical-output: true }\n", - "\n", - "list_table_specs_response = client.list_table_specs(dataset_name)\n", - "table_specs = [s for s in list_table_specs_response]\n", - "table_specs" - ] - }, { "cell_type": "code", "execution_count": null, @@ -749,11 +685,6 @@ "source": [ "#@title Check column data type { vertical-output: true }\n", "\n", - "# Get column specs.\n", - "table_spec_name = table_specs[0].name\n", - "list_column_specs_response = client.list_column_specs(table_spec_name)\n", - "column_specs = {s.display_name: s for s in list_column_specs_response}\n", - "\n", "# Print column data types.\n", "for column in column_specs:\n", " print(column, '-', column_specs[column].data_type)" @@ -772,40 +703,7 @@ "\n", "In this solution, the columns `Item_Number`, `Category`, `Vendor_Number` and `Store_Number` are not nullable, but `Zip_Code` and `County_Number` can take null values.\n", "\n", - "To change the data type, you can update the schema by updating the column spec.\n", - "\n", - "`update_column_response = client.update_column_spec(update_column_spec_dict)`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "gAPg_ymDf4kL" - }, - "outputs": [], - "source": [ - "def create_update_column_sepc_dict(column_name, type_code, nullable):\n", - " \"\"\"\n", - " Create `update_column_spec_dict` with a given column name and target `type_code`.\n", - " Inputs:\n", - " column_name: string. Represents column name.\n", - " type_code: string. Represents variable type. See details: \\\n", - " https://cloud.google.com/automl-tables/docs/reference/rest/v1beta1/projects.locations.datasets.tableSpecs.columnSpecs#typecode\n", - " nullable: boolean. 
If true, this DataType can also be null.\n", - " Return:\n", - " update_column_spec_dict: dictionary. Encodes the target column specs.\n", - " \"\"\"\n", - " update_column_spec_dict = {\n", - " 'name': column_specs[column_name].name,\n", - " 'data_type': {\n", - " 'type_code': type_code,\n", - " 'nullable': nullable\n", - " }\n", - " }\n", - " return update_column_spec_dict" + "To change the data type, you can update the schema by updating the column spec." ] }, { @@ -835,8 +733,12 @@ "for i in range(len(categorical_column_names)):\n", " column_name = categorical_column_names[i]\n", " nullable = is_nullable[i]\n", - " update_column_spec_dict = create_update_column_sepc_dict(column_name, 'CATEGORY', nullable)\n", - " update_column_response = client.update_column_spec(update_column_spec_dict)" + " client.update_column_spec(\n", + " dataset=dataset,\n", + " column_spec_display_name=column_name,\n", + " type_code='CATEGORY',\n", + " nullable=nullable,\n", + " )\n" ] }, { @@ -848,7 +750,7 @@ "source": [ "### Update dataset: assign a label\n", "\n", - "Select the label column and update the dataset." + "Select the target column and update the dataset." 
] }, { @@ -863,19 +765,11 @@ "source": [ "#@title Update dataset { vertical-output: true }\n", "\n", - "label_column_name = 'Stockout' #@param {type: 'string'}\n", - "label_column_spec = column_specs[label_column_name]\n", - "label_column_id = label_column_spec.name.rsplit('/', 1)[-1]\n", - "print('Label column ID: {}'.format(label_column_id))\n", - "# Define the values of the fields to be updated.\n", - "update_dataset_dict = {\n", - " 'name': dataset_name,\n", - " 'tables_dataset_metadata': {\n", - " 'target_column_spec_id': label_column_id\n", - " }\n", - "}\n", - "\n", - "update_dataset_response = client.update_dataset(update_dataset_dict)\n", + "target_column_name = 'Stockout' #@param {type: 'string'}\n", + "update_dataset_response = client.set_target_column(\n", + " dataset=dataset,\n", + " column_spec_display_name=target_column_name,\n", + ")\n", "update_dataset_response" ] }, @@ -926,57 +820,18 @@ "source": [ "#@title Create model { vertical-output: true }\n", "\n", - "feature_list = list(column_specs.keys())\n", - "feature_list.remove('Stockout')\n", - "\n", "model_display_name = 'stockout_model' #@param {type:'string'}\n", - "dataset_id = dataset_name.rsplit('/', 1)[-1]\n", - "\n", - "model_dict = {\n", - " 'display_name': model_display_name,\n", - " 'dataset_id': dataset_id, \n", - " 'tables_model_metadata': {\n", - " 'target_column_spec': column_specs['Stockout'],\n", - " 'input_feature_column_specs': [column_specs[f] for f in feature_list],\n", - " 'optimization_objective': 'MAXIMIZE_AU_PRC',\n", - " 'train_budget_milli_node_hours': 1000\n", - " }, \n", - "}\n", - "\n", - "create_model_response = client.create_model(location_path, model_dict)\n", - "print('Dataset import operation: {}'.format(create_model_response.operation))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "wCQdx9VyhKY5" - }, - "outputs": [], - "source": [ - "#@title Check if model training is complete { 
vertical-output: true }\n", - "# If returns `False`, you can check back again later.\n", - "# Continue with the rest only if this cell returns a `True`.\n", - "create_model_response.done()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "bPiR8zMwhQYO" - }, - "outputs": [], - "source": [ - "#@title Retrieve the model name { vertical-output: true }\n", - "create_model_result = create_model_response.result()\n", - "model_name = create_model_result.name\n", - "model_name" + "\n", + "create_model_response = client.create_model(\n", + " model_display_name,\n", + " dataset=dataset,\n", + " train_budget_milli_node_hours=1000,\n", + " optimization_objective='MAXIMIZE_AU_PRC',\n", + ")\n", + "print('Create model operation: {}'.format(create_model_response.operation))\n", + "# Wait until model training is done.\n", + "model = create_model_response.result()\n", + "model" ] }, { @@ -986,7 +841,7 @@ "id": "neYjToB36q9E" }, "source": [ - "If your Colab times out, use `client.list_models(location_path)` to check whether your model has been created. \n", + "If your Colab times out, use `client.list_models()` to check whether your model has been created. \n", "\n", "Then uncomment the following cell and run the command to retrieve your model. 
" Replace `YOUR_MODEL_NAME` with its actual value obtained in the preceding cell.\n", "\n", @@ -1004,7 +859,7 @@ "outputs": [], "source": [ "# model_name = '' #@param {type: 'string'}\n", - "# model = client.get_model(model_name)" + "# model = client.get_model(model_name=model_name)" ] }, { @@ -1068,53 +923,14 @@ "batch_predict_bq_input_uri = 'bq://product-stockout.product_stockout.batch_prediction_inputs'\n", "batch_predict_gcs_output_uri_prefix = 'gs://' #@param {type:'string'}\n", "\n", - "# Define input source.\n", - "batch_prediction_input_source = {\n", - " 'bigquery_source': {\n", - " 'input_uri': batch_predict_bq_input_uri\n", - " }\n", - "}\n", - "# Define output target.\n", - "batch_prediction_output_target = {\n", - " 'gcs_destination': {\n", - " 'output_uri_prefix': batch_predict_gcs_output_uri_prefix\n", - " }\n", - "}\n", - "batch_predict_response = prediction_client.batch_predict(model_name, \n", - " batch_prediction_input_source, \n", - " batch_prediction_output_target)\n", - "print('Batch prediction operation: {}'.format(batch_predict_response.operation))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "AVJhh_k0PfxD" - }, - "outputs": [], - "source": [ - "#@title Check if batch prediction is complete { vertical-output: true }\n", - "\n", - "# If returns `False`, you can check back again later.\n", - "# Continue with the rest only if this cell returns a `True`.\n", - "batch_predict_response.done()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "8nr5q2M8W2VX" - }, - "outputs": [], - "source": [ - "#@title Retrieve batch prediction metadata { vertical-output: true }\n", - "\n", + "batch_predict_response = client.batch_predict(\n", + " model=model, \n", + " bigquery_input_uri=batch_predict_bq_input_uri,\n", + " gcs_output_uri_prefix=batch_predict_gcs_output_uri_prefix,\n", + ")\n", + "print('Batch 
prediction operation: {}'.format(batch_predict_response.operation))\n", + "# Wait until batch prediction is done.\n", + "batch_predict_result = batch_predict_response.result()\n", "batch_predict_response.metadata" ] }, @@ -1175,7 +991,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.3" + "version": "3.6.7" } }, "nbformat": 4,