diff --git a/tables/automl/automl_tables_dataset.py b/tables/automl/automl_tables_dataset.py
new file mode 100644
index 000000000000..d4be12b3702e
--- /dev/null
+++ b/tables/automl/automl_tables_dataset.py
@@ -0,0 +1,601 @@
+#!/usr/bin/env python
+
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""This application demonstrates how to perform basic operations on datasets
+with the Google AutoML Tables API.
+
+For more information, see the documentation at
+https://cloud.google.com/automl-tables/docs.
+"""
+
+import argparse
+import os
+
+
+def create_dataset(project_id, compute_region, dataset_display_name):
+    """Create a dataset."""
+    # [START automl_tables_create_dataset]
+    # TODO(developer): Uncomment and set the following variables
+    # project_id = 'PROJECT_ID_HERE'
+    # compute_region = 'COMPUTE_REGION_HERE'
+    # dataset_display_name = 'DATASET_DISPLAY_NAME_HERE'
+
+    from google.cloud import automl_v1beta1 as automl
+
+    client = automl.TablesClient(project=project_id, region=compute_region)
+
+    # Create a dataset with the given display name.
+    dataset = client.create_dataset(dataset_display_name)
+
+    # Display the dataset information.
+    print("Dataset name: {}".format(dataset.name))
+    print("Dataset id: {}".format(dataset.name.split("/")[-1]))
+    print("Dataset display name: {}".format(dataset.display_name))
+    print("Dataset metadata:")
+    print("\t{}".format(dataset.tables_dataset_metadata))
+    print("Dataset example count: {}".format(dataset.example_count))
+    print("Dataset create time:")
+    print("\tseconds: {}".format(dataset.create_time.seconds))
+    print("\tnanos: {}".format(dataset.create_time.nanos))
+
+    # [END automl_tables_create_dataset]
+
+    return dataset
+
+
+def list_datasets(project_id, compute_region, filter_=None):
+    """List all datasets."""
+    result = []
+    # [START automl_tables_list_datasets]
+    # TODO(developer): Uncomment and set the following variables
+    # project_id = 'PROJECT_ID_HERE'
+    # compute_region = 'COMPUTE_REGION_HERE'
+    # filter_ = 'filter expression here'
+
+    from google.cloud import automl_v1beta1 as automl
+
+    client = automl.TablesClient(project=project_id, region=compute_region)
+
+    # List all the datasets available in the region by applying filter.
+    response = client.list_datasets(filter_=filter_)
+
+    print("List of datasets:")
+    for dataset in response:
+        # Display the dataset information.
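+        # tables_dataset_metadata identifies the column specs that drive
+        # training: the primary table plus the target (label), weight, and
+        # ml_use (train/validation/test split) columns.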
+ print("Dataset name: {}".format(dataset.name)) + print("Dataset id: {}".format(dataset.name.split("/")[-1])) + print("Dataset display name: {}".format(dataset.display_name)) + metadata = dataset.tables_dataset_metadata + print("Dataset primary table spec id: {}".format( + metadata.primary_table_spec_id)) + print("Dataset target column spec id: {}".format( + metadata.target_column_spec_id)) + print("Dataset target column spec id: {}".format( + metadata.target_column_spec_id)) + print("Dataset weight column spec id: {}".format( + metadata.weight_column_spec_id)) + print("Dataset ml use column spec id: {}".format( + metadata.ml_use_column_spec_id)) + print("Dataset example count: {}".format(dataset.example_count)) + print("Dataset create time:") + print("\tseconds: {}".format(dataset.create_time.seconds)) + print("\tnanos: {}".format(dataset.create_time.nanos)) + print("\n") + + # [END automl_tables_list_datasets] + result.append(dataset) + + return result + + +def list_table_specs( + project_id, compute_region, dataset_display_name, filter_=None +): + """List all table specs.""" + result = [] + # [START automl_tables_list_specs] + # TODO(developer): Uncomment and set the following variables + # project_id = 'PROJECT_ID_HERE' + # compute_region = 'COMPUTE_REGION_HERE' + # dataset_display_name = 'DATASET_DISPLAY_NAME_HERE' + # filter_ = 'filter expression here' + + from google.cloud import automl_v1beta1 as automl + + client = automl.TablesClient(project=project_id, region=compute_region) + + # List all the table specs in the dataset by applying filter. + response = client.list_table_specs( + dataset_display_name=dataset_display_name, filter_=filter_) + + print("List of table specs:") + for table_spec in response: + # Display the table_spec information. + print("Table spec name: {}".format(table_spec.name)) + print("Table spec id: {}".format(table_spec.name.split("/")[-1])) + print("Table spec time column spec id: {}".format( + table_spec.time_column_spec_id)) + print("Table spec row count: {}".format(table_spec.row_count)) + print("Table spec column count: {}".format(table_spec.column_count)) + + # [END automl_tables_list_specs] + result.append(table_spec) + + return result + + +def list_column_specs(project_id, + compute_region, + dataset_display_name, + filter_=None): + """List all column specs.""" + result = [] + # [START automl_tables_list_column_specs] + # TODO(developer): Uncomment and set the following variables + # project_id = 'PROJECT_ID_HERE' + # compute_region = 'COMPUTE_REGION_HERE' + # dataset_display_name = 'DATASET_DISPLAY_NAME_HERE' + # filter_ = 'filter expression here' + + from google.cloud import automl_v1beta1 as automl + + client = automl.TablesClient(project=project_id, region=compute_region) + + # List all the table specs in the dataset by applying filter. + response = client.list_column_specs( + dataset_display_name=dataset_display_name, filter_=filter_) + + print("List of column specs:") + for column_spec in response: + # Display the column_spec information. 
+ print("Column spec name: {}".format(column_spec.name)) + print("Column spec id: {}".format(column_spec.name.split("/")[-1])) + print("Column spec display name: {}".format(column_spec.display_name)) + print("Column spec data type: {}".format(column_spec.data_type)) + + # [END automl_tables_list_column_specs] + result.append(column_spec) + + return result + + +def get_dataset(project_id, compute_region, dataset_display_name): + """Get the dataset.""" + # [START automl_tables_get_dataset] + # TODO(developer): Uncomment and set the following variables + # project_id = 'PROJECT_ID_HERE' + # compute_region = 'COMPUTE_REGION_HERE' + # dataset_display_name = 'DATASET_DISPLAY_NAME_HERE' + + from google.cloud import automl_v1beta1 as automl + + client = automl.TablesClient(project=project_id, region=compute_region) + + # Get complete detail of the dataset. + dataset = client.get_dataset(dataset_display_name=dataset_display_name) + + # Display the dataset information. + print("Dataset name: {}".format(dataset.name)) + print("Dataset id: {}".format(dataset.name.split("/")[-1])) + print("Dataset display name: {}".format(dataset.display_name)) + print("Dataset metadata:") + print("\t{}".format(dataset.tables_dataset_metadata)) + print("Dataset example count: {}".format(dataset.example_count)) + print("Dataset create time:") + print("\tseconds: {}".format(dataset.create_time.seconds)) + print("\tnanos: {}".format(dataset.create_time.nanos)) + + # [END automl_tables_get_dataset] + + return dataset + + +def get_table_spec(project_id, compute_region, dataset_id, table_spec_id): + """Get the table spec.""" + # [START automl_tables_get_table_spec] + # TODO(developer): Uncomment and set the following variables + # project_id = 'PROJECT_ID_HERE' + # compute_region = 'COMPUTE_REGION_HERE' + # dataset_id = 'DATASET_ID_HERE' + # table_spec_id = 'TABLE_SPEC_ID_HERE' + + from google.cloud import automl_v1beta1 as automl + + client = automl.TablesClient(project=project_id, region=compute_region) + + # Get the full path of the table spec. + table_spec_name = client.auto_ml_client.table_spec_path( + project_id, compute_region, dataset_id, table_spec_id + ) + + # Get complete detail of the table spec. + table_spec = client.get_table_spec(table_spec_name) + + # Display the table spec information. + print("Table spec name: {}".format(table_spec.name)) + print("Table spec id: {}".format(table_spec.name.split("/")[-1])) + print("Table spec time column spec id: {}".format( + table_spec.time_column_spec_id)) + print("Table spec row count: {}".format(table_spec.row_count)) + print("Table spec column count: {}".format(table_spec.column_count)) + + # [END automl_tables_get_table_spec] + + +def get_column_spec(project_id, + compute_region, + dataset_id, + table_spec_id, + column_spec_id): + """Get the column spec.""" + # [START automl_tables_get_column_spec] + # TODO(developer): Uncomment and set the following variables + # project_id = 'PROJECT_ID_HERE' + # compute_region = 'COMPUTE_REGION_HERE' + # dataset_id = 'DATASET_ID_HERE' + # table_spec_id = 'TABLE_SPEC_ID_HERE' + # column_spec_id = 'COLUMN_SPEC_ID_HERE' + + from google.cloud import automl_v1beta1 as automl + + client = automl.TablesClient(project=project_id, region=compute_region) + + # Get the full path of the column spec. + column_spec_name = client.auto_ml_client.column_spec_path( + project_id, compute_region, dataset_id, table_spec_id, column_spec_id + ) + + # Get complete detail of the column spec. 
+    column_spec = client.get_column_spec(column_spec_name)
+
+    # Display the column spec information.
+    print("Column spec name: {}".format(column_spec.name))
+    print("Column spec id: {}".format(column_spec.name.split("/")[-1]))
+    print("Column spec display name: {}".format(column_spec.display_name))
+    print("Column spec data type: {}".format(column_spec.data_type))
+    print("Column spec data stats: {}".format(column_spec.data_stats))
+    print("Column spec top correlated columns\n")
+    for column_correlation in column_spec.top_correlated_columns:
+        print(column_correlation)
+
+    # [END automl_tables_get_column_spec]
+
+
+def import_data(project_id, compute_region, dataset_display_name, path):
+    """Import structured data."""
+    # [START automl_tables_import_data]
+    # TODO(developer): Uncomment and set the following variables
+    # project_id = 'PROJECT_ID_HERE'
+    # compute_region = 'COMPUTE_REGION_HERE'
+    # dataset_display_name = 'DATASET_DISPLAY_NAME'
+    # path = 'gs://path/to/file.csv' or 'bq://project_id.dataset.table_id'
+
+    from google.cloud import automl_v1beta1 as automl
+
+    client = automl.TablesClient(project=project_id, region=compute_region)
+
+    response = None
+    if path.startswith('bq'):
+        response = client.import_data(
+            dataset_display_name=dataset_display_name, bigquery_input_uri=path
+        )
+    else:
+        # Get the multiple Google Cloud Storage URIs.
+        input_uris = path.split(",")
+        response = client.import_data(
+            dataset_display_name=dataset_display_name,
+            gcs_input_uris=input_uris
+        )
+
+    print("Processing import...")
+    # synchronous check of operation status.
+    print("Data imported. {}".format(response.result()))
+
+    # [END automl_tables_import_data]
+
+
+def export_data(project_id, compute_region, dataset_display_name, gcs_uri):
+    """Export a dataset to a Google Cloud Storage bucket."""
+    # [START automl_tables_export_data]
+    # TODO(developer): Uncomment and set the following variables
+    # project_id = 'PROJECT_ID_HERE'
+    # compute_region = 'COMPUTE_REGION_HERE'
+    # dataset_display_name = 'DATASET_DISPLAY_NAME_HERE'
+    # gcs_uri = 'GCS_URI_HERE'
+
+    from google.cloud import automl_v1beta1 as automl
+
+    client = automl.TablesClient(project=project_id, region=compute_region)
+
+    # Export the dataset to the output URI.
+    response = client.export_data(dataset_display_name=dataset_display_name,
+                                  gcs_output_uri_prefix=gcs_uri)
+
+    print("Processing export...")
+    # synchronous check of operation status.
+    print("Data exported. {}".format(response.result()))
+
+    # [END automl_tables_export_data]
+
+
+def update_dataset(project_id,
+                   compute_region,
+                   dataset_display_name,
+                   target_column_spec_name=None,
+                   weight_column_spec_name=None,
+                   test_train_column_spec_name=None):
+    """Update dataset."""
+    # [START automl_tables_update_dataset]
+    # TODO(developer): Uncomment and set the following variables
+    # project_id = 'PROJECT_ID_HERE'
+    # compute_region = 'COMPUTE_REGION_HERE'
+    # dataset_display_name = 'DATASET_DISPLAY_NAME_HERE'
+    # target_column_spec_name = 'TARGET_COLUMN_SPEC_NAME_HERE' or None
+    # weight_column_spec_name = 'WEIGHT_COLUMN_SPEC_NAME_HERE' or None
+    # test_train_column_spec_name = 'TEST_TRAIN_COLUMN_SPEC_NAME_HERE' or None
+
+    from google.cloud import automl_v1beta1 as automl
+
+    client = automl.TablesClient(project=project_id, region=compute_region)
+
+    if target_column_spec_name is not None:
+        response = client.set_target_column(
+            dataset_display_name=dataset_display_name,
+            column_spec_display_name=target_column_spec_name
+        )
+        print("Target column updated. {}".format(response))
{}".format(response)) + if weight_column_spec_name is not None: + response = client.set_weight_column( + dataset_display_name=dataset_display_name, + column_spec_display_name=weight_column_spec_name + ) + print("Weight column updated. {}".format(response)) + if test_train_column_spec_name is not None: + response = client.set_test_train_column( + dataset_display_name=dataset_display_name, + column_spec_display_name=test_train_column_spec_name + ) + print("Test/train column updated. {}".format(response)) + + # [END automl_tables_update_dataset] + + +def update_table_spec(project_id, + compute_region, + dataset_display_name, + time_column_spec_display_name): + """Update table spec.""" + # [START automl_tables_update_table_spec] + # TODO(developer): Uncomment and set the following variables + # project_id = 'PROJECT_ID_HERE' + # compute_region = 'COMPUTE_REGION_HERE' + # dataset_display_name = 'DATASET_DISPLAY_NAME_HERE' + # time_column_spec_display_name = 'time_column_spec_display_name_HERE' + + from google.cloud import automl_v1beta1 as automl + + client = automl.TablesClient(project=project_id, region=compute_region) + + response = client.set_time_column( + dataset_display_name=dataset_display_name, + column_spec_display_name=time_column_spec_display_name + ) + + # synchronous check of operation status. + print("Table spec updated. {}".format(response)) + # [END automl_tables_update_table_spec] + + +def update_column_spec(project_id, + compute_region, + dataset_display_name, + column_spec_display_name, + type_code, + nullable=None): + """Update column spec.""" + # [START automl_tables_update_column_spec] + # TODO(developer): Uncomment and set the following variables + # project_id = 'PROJECT_ID_HERE' + # compute_region = 'COMPUTE_REGION_HERE' + # dataset_display_name = 'DATASET_DISPLAY_NAME_HERE' + # column_spec_display_name = 'COLUMN_SPEC_DISPLAY_NAME_HERE' + # type_code = 'TYPE_CODE_HERE' + # nullable = 'NULLABLE_HERE' or None if unchanged + + from google.cloud import automl_v1beta1 as automl + + client = automl.TablesClient(project=project_id, region=compute_region) + + # Update the column spec. + response = client.update_column_spec( + dataset_display_name=dataset_display_name, + column_spec_display_name=column_spec_display_name, + type_code=type_code, nullable=nullable + ) + + # synchronous check of operation status. + print("Table spec updated. {}".format(response)) + # [END automl_tables_update_column_spec] + + +def delete_dataset(project_id, compute_region, dataset_display_name): + """Delete a dataset""" + # [START automl_tables_delete_dataset] + # TODO(developer): Uncomment and set the following variables + # project_id = 'PROJECT_ID_HERE' + # compute_region = 'COMPUTE_REGION_HERE' + # dataset_display_name = 'DATASET_DISPLAY_NAME_HERE + + from google.cloud import automl_v1beta1 as automl + + client = automl.TablesClient(project=project_id, region=compute_region) + + # Delete a dataset. + response = client.delete_dataset(dataset_display_name=dataset_display_name) + + # synchronous check of operation status. + print("Dataset deleted. 
{}".format(response.result())) + # [END automl_tables_delete_dataset] + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + subparsers = parser.add_subparsers(dest="command") + + create_dataset_parser = subparsers.add_parser( + "create_dataset", help=create_dataset.__doc__ + ) + create_dataset_parser.add_argument("--dataset_name") + + list_datasets_parser = subparsers.add_parser( + "list_datasets", help=list_datasets.__doc__ + ) + list_datasets_parser.add_argument("--filter_") + + list_table_specs_parser = subparsers.add_parser( + "list_table_specs", help=list_table_specs.__doc__ + ) + list_table_specs_parser.add_argument("--dataset_display_name") + list_table_specs_parser.add_argument("--filter_") + + list_column_specs_parser = subparsers.add_parser( + "list_column_specs", help=list_column_specs.__doc__ + ) + list_column_specs_parser.add_argument("--dataset_display_name") + list_column_specs_parser.add_argument("--filter_") + + get_dataset_parser = subparsers.add_parser( + "get_dataset", help=get_dataset.__doc__ + ) + get_dataset_parser.add_argument("--dataset_display_name") + + get_table_spec_parser = subparsers.add_parser( + "get_table_spec", help=get_table_spec.__doc__ + ) + get_table_spec_parser.add_argument("--dataset_id") + get_table_spec_parser.add_argument("--table_spec_id") + + get_column_spec_parser = subparsers.add_parser( + "get_column_spec", help=get_column_spec.__doc__ + ) + get_column_spec_parser.add_argument("--dataset_id") + get_column_spec_parser.add_argument("--table_spec_id") + get_column_spec_parser.add_argument("--column_spec_id") + + import_data_parser = subparsers.add_parser( + "import_data", help=import_data.__doc__ + ) + import_data_parser.add_argument("--dataset_display_name") + import_data_parser.add_argument("--path") + + export_data_parser = subparsers.add_parser( + "export_data", help=export_data.__doc__ + ) + export_data_parser.add_argument("--dataset_display_name") + export_data_parser.add_argument("--gcs_uri") + + update_dataset_parser = subparsers.add_parser( + "update_dataset", help=update_dataset.__doc__ + ) + update_dataset_parser.add_argument("--dataset_display_name") + update_dataset_parser.add_argument("--target_column_spec_name") + update_dataset_parser.add_argument("--weight_column_spec_name") + update_dataset_parser.add_argument("--ml_use_column_spec_name") + + update_table_spec_parser = subparsers.add_parser( + "update_table_spec", help=update_table_spec.__doc__ + ) + update_table_spec_parser.add_argument("--dataset_display_name") + update_table_spec_parser.add_argument("--time_column_spec_display_name") + + update_column_spec_parser = subparsers.add_parser( + "update_column_spec", help=update_column_spec.__doc__ + ) + update_column_spec_parser.add_argument("--dataset_display_name") + update_column_spec_parser.add_argument("--column_spec_display_name") + update_column_spec_parser.add_argument("--type_code") + update_column_spec_parser.add_argument("--nullable", type=bool) + + delete_dataset_parser = subparsers.add_parser( + "delete_dataset", help=delete_dataset.__doc__ + ) + delete_dataset_parser.add_argument("--dataset_display_name") + + project_id = os.environ["PROJECT_ID"] + compute_region = os.environ["REGION_NAME"] + + args = parser.parse_args() + if args.command == "create_dataset": + create_dataset(project_id, compute_region, args.dataset_name) + if args.command == "list_datasets": + list_datasets(project_id, compute_region, args.filter_) + 
+    if args.command == "list_table_specs":
+        list_table_specs(project_id,
+                         compute_region,
+                         args.dataset_display_name,
+                         args.filter_)
+    if args.command == "list_column_specs":
+        list_column_specs(project_id,
+                          compute_region,
+                          args.dataset_display_name,
+                          args.filter_)
+    if args.command == "get_dataset":
+        get_dataset(project_id, compute_region, args.dataset_display_name)
+    if args.command == "get_table_spec":
+        get_table_spec(project_id,
+                       compute_region,
+                       args.dataset_id,
+                       args.table_spec_id)
+    if args.command == "get_column_spec":
+        get_column_spec(project_id,
+                        compute_region,
+                        args.dataset_id,
+                        args.table_spec_id,
+                        args.column_spec_id)
+    if args.command == "import_data":
+        import_data(project_id,
+                    compute_region,
+                    args.dataset_display_name,
+                    args.path)
+    if args.command == "export_data":
+        export_data(project_id,
+                    compute_region,
+                    args.dataset_display_name,
+                    args.gcs_uri)
+    if args.command == "update_dataset":
+        update_dataset(project_id,
+                       compute_region,
+                       args.dataset_display_name,
+                       args.target_column_spec_name,
+                       args.weight_column_spec_name,
+                       args.ml_use_column_spec_name)
+    if args.command == "update_table_spec":
+        update_table_spec(project_id,
+                          compute_region,
+                          args.dataset_display_name,
+                          args.time_column_spec_display_name)
+    if args.command == "update_column_spec":
+        update_column_spec(project_id,
+                           compute_region,
+                           args.dataset_display_name,
+                           args.column_spec_display_name,
+                           args.type_code,
+                           args.nullable)
+    if args.command == "delete_dataset":
+        delete_dataset(project_id, compute_region, args.dataset_display_name)
diff --git a/tables/automl/automl_tables_model.py b/tables/automl/automl_tables_model.py
new file mode 100644
index 000000000000..cb8d85dd2275
--- /dev/null
+++ b/tables/automl/automl_tables_model.py
@@ -0,0 +1,486 @@
+#!/usr/bin/env python
+
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""This application demonstrates how to perform basic operations on models
+with the Google AutoML Tables API.
+
+For more information, see the documentation at
+https://cloud.google.com/automl-tables/docs.
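+
+Example usage (a sketch; assumes PROJECT_ID and REGION_NAME are exported
+and 'my_model' is a placeholder):
+    python automl_tables_model.py list_models
+    python automl_tables_model.py deploy_model --model_display_name=my_model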
+""" + +import argparse +import os + + +def create_model(project_id, + compute_region, + dataset_display_name, + model_display_name, + train_budget_milli_node_hours, + include_column_spec_names=None, + exclude_column_spec_names=None): + """Create a model.""" + # [START automl_tables_create_model] + # TODO(developer): Uncomment and set the following variables + # project_id = 'PROJECT_ID_HERE' + # compute_region = 'COMPUTE_REGION_HERE' + # dataset_display_name = 'DATASET_DISPLAY_NAME_HERE' + # model_display_name = 'MODEL_DISPLAY_NAME_HERE' + # train_budget_milli_node_hours = 'TRAIN_BUDGET_MILLI_NODE_HOURS_HERE' + # include_column_spec_names = 'INCLUDE_COLUMN_SPEC_NAMES_HERE' + # or None if unspecified + # exclude_column_spec_names = 'EXCLUDE_COLUMN_SPEC_NAMES_HERE' + # or None if unspecified + + from google.cloud import automl_v1beta1 as automl + + client = automl.TablesClient(project=project_id, region=compute_region) + + # Create a model with the model metadata in the region. + response = client.create_model( + model_display_name, + train_budget_milli_node_hours=train_budget_milli_node_hours, + dataset_display_name=dataset_display_name, + include_column_spec_names=include_column_spec_names, + exclude_column_spec_names=exclude_column_spec_names, + ) + + print("Training model...") + print("Training operation name: {}".format(response.operation.name)) + print("Training completed: {}".format(response.result())) + + # [END automl_tables_create_model] + + +def get_operation_status(operation_full_id): + """Get operation status.""" + # [START automl_tables_get_operation_status] + # TODO(developer): Uncomment and set the following variables + # operation_full_id = + # 'projects//locations//operations/' + + from google.cloud import automl_v1beta1 as automl + + client = automl.TablesClient() + + # Get the latest state of a long-running operation. + op = client.auto_ml_client.transport._operations_client.get_operation( + operation_full_id + ) + + print("Operation status: {}".format(op)) + + # [END automl_tables_get_operation_status] + + +def list_models(project_id, compute_region, filter_=None): + """List all models.""" + result = [] + # [START automl_tables_list_models] + # TODO(developer): Uncomment and set the following variables + # project_id = 'PROJECT_ID_HERE' + # compute_region = 'COMPUTE_REGION_HERE' + # filter_ = 'DATASET_DISPLAY_NAME_HERE' + + from google.cloud import automl_v1beta1 as automl + from google.cloud.automl_v1beta1 import enums + + client = automl.TablesClient(project=project_id, region=compute_region) + + # List all the models available in the region by applying filter. + response = client.list_models(filter_=filter_) + + print("List of models:") + for model in response: + # Retrieve deployment state. + if model.deployment_state == enums.Model.DeploymentState.DEPLOYED: + deployment_state = "deployed" + else: + deployment_state = "undeployed" + + # Display the model information. 
+ print("Model name: {}".format(model.name)) + print("Model id: {}".format(model.name.split("/")[-1])) + print("Model display name: {}".format(model.display_name)) + metadata = model.tables_model_metadata + print("Target column display name: {}".format( + metadata.target_column_spec.display_name)) + print("Training budget in node milli hours: {}".format( + metadata.train_budget_milli_node_hours)) + print("Training cost in node milli hours: {}".format( + metadata.train_cost_milli_node_hours)) + print("Model create time:") + print("\tseconds: {}".format(model.create_time.seconds)) + print("\tnanos: {}".format(model.create_time.nanos)) + print("Model deployment state: {}".format(deployment_state)) + print("\n") + + # [END automl_tables_list_models] + result.append(model) + + return result + + +def get_model(project_id, compute_region, model_display_name): + """Get model details.""" + # [START automl_tables_get_model] + # TODO(developer): Uncomment and set the following variables + # project_id = 'PROJECT_ID_HERE' + # compute_region = 'COMPUTE_REGION_HERE' + # model_display_name = 'MODEL_DISPLAY_NAME_HERE' + + from google.cloud import automl_v1beta1 as automl + from google.cloud.automl_v1beta1 import enums + + client = automl.TablesClient(project=project_id, region=compute_region) + + # Get complete detail of the model. + model = client.get_model(model_display_name=model_display_name) + + # Retrieve deployment state. + if model.deployment_state == enums.Model.DeploymentState.DEPLOYED: + deployment_state = "deployed" + else: + deployment_state = "undeployed" + + # Display the model information. + print("Model name: {}".format(model.name)) + print("Model id: {}".format(model.name.split("/")[-1])) + print("Model display name: {}".format(model.display_name)) + print("Model metadata:") + print(model.tables_model_metadata) + print("Model create time:") + print("\tseconds: {}".format(model.create_time.seconds)) + print("\tnanos: {}".format(model.create_time.nanos)) + print("Model deployment state: {}".format(deployment_state)) + + # [END automl_tables_get_model] + + return model + + +def list_model_evaluations( + project_id, compute_region, model_display_name, filter_=None +): + + """List model evaluations.""" + result = [] + # [START automl_tables_list_model_evaluations] + # TODO(developer): Uncomment and set the following variables + # project_id = 'PROJECT_ID_HERE' + # compute_region = 'COMPUTE_REGION_HERE' + # model_display_name = 'MODEL_DISPLAY_NAME_HERE' + # filter_ = 'filter expression here' + + from google.cloud import automl_v1beta1 as automl + + client = automl.TablesClient(project=project_id, region=compute_region) + + # List all the model evaluations in the model by applying filter. 
+    response = client.list_model_evaluations(
+        model_display_name=model_display_name,
+        filter_=filter_
+    )
+
+    print("List of model evaluations:")
+    for evaluation in response:
+        print("Model evaluation name: {}".format(evaluation.name))
+        print("Model evaluation id: {}".format(
+            evaluation.name.split("/")[-1]))
+        print("Model evaluation example count: {}".format(
+            evaluation.evaluated_example_count))
+        print("Model evaluation time:")
+        print("\tseconds: {}".format(evaluation.create_time.seconds))
+        print("\tnanos: {}".format(evaluation.create_time.nanos))
+        print("\n")
+        # [END automl_tables_list_model_evaluations]
+        result.append(evaluation)
+
+    return result
+
+
+def get_model_evaluation(
+    project_id, compute_region, model_id, model_evaluation_id
+):
+    """Get model evaluation."""
+    # [START automl_tables_get_model_evaluation]
+    # TODO(developer): Uncomment and set the following variables
+    # project_id = 'PROJECT_ID_HERE'
+    # compute_region = 'COMPUTE_REGION_HERE'
+    # model_id = 'MODEL_ID_HERE'
+    # model_evaluation_id = 'MODEL_EVALUATION_ID_HERE'
+
+    from google.cloud import automl_v1beta1 as automl
+
+    client = automl.TablesClient()
+
+    # Get the full path of the model evaluation.
+    model_evaluation_full_id = client.auto_ml_client.model_evaluation_path(
+        project_id, compute_region, model_id, model_evaluation_id
+    )
+
+    # Get complete detail of the model evaluation.
+    response = client.get_model_evaluation(
+        model_evaluation_name=model_evaluation_full_id
+    )
+
+    print(response)
+    # [END automl_tables_get_model_evaluation]
+    return response
+
+
+def display_evaluation(
+    project_id, compute_region, model_display_name, filter_=None
+):
+    """Display evaluation."""
+    # [START automl_tables_display_evaluation]
+    # TODO(developer): Uncomment and set the following variables
+    # project_id = 'PROJECT_ID_HERE'
+    # compute_region = 'COMPUTE_REGION_HERE'
+    # model_display_name = 'MODEL_DISPLAY_NAME_HERE'
+    # filter_ = 'filter expression here'
+
+    from google.cloud import automl_v1beta1 as automl
+
+    client = automl.TablesClient(project=project_id, region=compute_region)
+
+    # List all the model evaluations in the model by applying filter.
+    response = client.list_model_evaluations(
+        model_display_name=model_display_name, filter_=filter_
+    )
+
+    # Iterate through the results.
+    for evaluation in response:
+        # There is an evaluation for each class in a model and one for the
+        # overall model. Get only the evaluation of the overall model.
+        if not evaluation.annotation_spec_id:
+            model_evaluation_name = evaluation.name
+            break
+
+    # Get a model evaluation.
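+    # Note: this assumes the listing above yielded an overall evaluation;
+    # otherwise model_evaluation_name would be unbound here.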
+    model_evaluation = client.get_model_evaluation(
+        model_evaluation_name=model_evaluation_name
+    )
+
+    classification_metrics = model_evaluation.classification_evaluation_metrics
+    if str(classification_metrics):
+        confidence_metrics = classification_metrics.confidence_metrics_entry
+
+        # Showing model score based on a threshold of 0.5
+        print("Model classification metrics (threshold at 0.5):")
+        for confidence_metrics_entry in confidence_metrics:
+            if confidence_metrics_entry.confidence_threshold == 0.5:
+                print(
+                    "Model Precision: {}%".format(
+                        round(confidence_metrics_entry.precision * 100, 2)
+                    )
+                )
+                print(
+                    "Model Recall: {}%".format(
+                        round(confidence_metrics_entry.recall * 100, 2)
+                    )
+                )
+                print(
+                    "Model F1 score: {}%".format(
+                        round(confidence_metrics_entry.f1_score * 100, 2)
+                    )
+                )
+        print("Model AUPRC: {}".format(classification_metrics.au_prc))
+        print("Model AUROC: {}".format(classification_metrics.au_roc))
+        print("Model log loss: {}".format(classification_metrics.log_loss))
+
+    regression_metrics = model_evaluation.regression_evaluation_metrics
+    if str(regression_metrics):
+        print("Model regression metrics:")
+        print("Model RMSE: {}".format(
+            regression_metrics.root_mean_squared_error
+        ))
+        print("Model MAE: {}".format(regression_metrics.mean_absolute_error))
+        print("Model MAPE: {}".format(
+            regression_metrics.mean_absolute_percentage_error))
+        print("Model R^2: {}".format(regression_metrics.r_squared))
+
+    # [END automl_tables_display_evaluation]
+
+
+def deploy_model(project_id, compute_region, model_display_name):
+    """Deploy model."""
+    # [START automl_tables_deploy_model]
+    # TODO(developer): Uncomment and set the following variables
+    # project_id = 'PROJECT_ID_HERE'
+    # compute_region = 'COMPUTE_REGION_HERE'
+    # model_display_name = 'MODEL_DISPLAY_NAME_HERE'
+
+    from google.cloud import automl_v1beta1 as automl
+
+    client = automl.TablesClient(project=project_id, region=compute_region)
+
+    # Deploy model.
+    response = client.deploy_model(model_display_name=model_display_name)
+
+    # synchronous check of operation status.
+    print("Model deployed. {}".format(response.result()))
+
+    # [END automl_tables_deploy_model]
+
+
+def undeploy_model(project_id, compute_region, model_display_name):
+    """Undeploy model."""
+    # [START automl_tables_undeploy_model]
+    # TODO(developer): Uncomment and set the following variables
+    # project_id = 'PROJECT_ID_HERE'
+    # compute_region = 'COMPUTE_REGION_HERE'
+    # model_display_name = 'MODEL_DISPLAY_NAME_HERE'
+
+    from google.cloud import automl_v1beta1 as automl
+
+    client = automl.TablesClient(project=project_id, region=compute_region)
+
+    # Undeploy model.
+    response = client.undeploy_model(model_display_name=model_display_name)
+
+    # synchronous check of operation status.
+    print("Model undeployed. {}".format(response.result()))
+
+    # [END automl_tables_undeploy_model]
+
+
+def delete_model(project_id, compute_region, model_display_name):
+    """Delete a model."""
+    # [START automl_tables_delete_model]
+    # TODO(developer): Uncomment and set the following variables
+    # project_id = 'PROJECT_ID_HERE'
+    # compute_region = 'COMPUTE_REGION_HERE'
+    # model_display_name = 'MODEL_DISPLAY_NAME_HERE'
+
+    from google.cloud import automl_v1beta1 as automl
+
+    client = automl.TablesClient(project=project_id, region=compute_region)
+
+    # Delete model.
+    response = client.delete_model(model_display_name=model_display_name)
+
+    # synchronous check of operation status.
+    print("Model deleted. {}".format(response.result()))
{}".format(response.result())) + + # [END automl_tables_delete_model] + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + subparsers = parser.add_subparsers(dest="command") + + create_model_parser = subparsers.add_parser( + "create_model", help=create_model.__doc__ + ) + create_model_parser.add_argument("--dataset_display_name") + create_model_parser.add_argument("--model_display_name") + create_model_parser.add_argument( + "--train_budget_milli_node_hours", type=int, + ) + + get_operation_status_parser = subparsers.add_parser( + "get_operation_status", help=get_operation_status.__doc__ + ) + get_operation_status_parser.add_argument("--operation_full_id") + + list_models_parser = subparsers.add_parser( + "list_models", help=list_models.__doc__ + ) + list_models_parser.add_argument("--filter_") + + get_model_parser = subparsers.add_parser( + "get_model", help=get_model.__doc__ + ) + get_model_parser.add_argument("--model_display_name") + + list_model_evaluations_parser = subparsers.add_parser( + "list_model_evaluations", help=list_model_evaluations.__doc__ + ) + list_model_evaluations_parser.add_argument("--model_display_name") + list_model_evaluations_parser.add_argument("--filter_") + + get_model_evaluation_parser = subparsers.add_parser( + "get_model_evaluation", help=get_model_evaluation.__doc__ + ) + get_model_evaluation_parser.add_argument("--model_id") + get_model_evaluation_parser.add_argument("--model_evaluation_id") + + display_evaluation_parser = subparsers.add_parser( + "display_evaluation", help=display_evaluation.__doc__ + ) + display_evaluation_parser.add_argument("--model_display_name") + display_evaluation_parser.add_argument("--filter_") + + deploy_model_parser = subparsers.add_parser( + "deploy_model", help=deploy_model.__doc__ + ) + deploy_model_parser.add_argument("--model_display_name") + + undeploy_model_parser = subparsers.add_parser( + "undeploy_model", help=undeploy_model.__doc__ + ) + undeploy_model_parser.add_argument("--model_display_name") + + delete_model_parser = subparsers.add_parser( + "delete_model", help=delete_model.__doc__ + ) + delete_model_parser.add_argument("--model_display_name") + + project_id = os.environ["PROJECT_ID"] + compute_region = os.environ["REGION_NAME"] + + args = parser.parse_args() + + if args.command == "create_model": + create_model( + project_id, + compute_region, + args.dataset_display_name, + args.model_display_name, + args.train_budget_milli_node_hours, + # Input columns are omitted here as argparse does not support + # column spec objects, but it is still included in function def. 
+        )
+    if args.command == "get_operation_status":
+        get_operation_status(args.operation_full_id)
+    if args.command == "list_models":
+        list_models(project_id, compute_region, args.filter_)
+    if args.command == "get_model":
+        get_model(project_id, compute_region, args.model_display_name)
+    if args.command == "list_model_evaluations":
+        list_model_evaluations(
+            project_id, compute_region, args.model_display_name, args.filter_
+        )
+    if args.command == "get_model_evaluation":
+        get_model_evaluation(
+            project_id,
+            compute_region,
+            args.model_id,
+            args.model_evaluation_id
+        )
+    if args.command == "display_evaluation":
+        display_evaluation(
+            project_id, compute_region, args.model_display_name, args.filter_
+        )
+    if args.command == "deploy_model":
+        deploy_model(project_id, compute_region, args.model_display_name)
+    if args.command == "undeploy_model":
+        undeploy_model(project_id, compute_region, args.model_display_name)
+    if args.command == "delete_model":
+        delete_model(project_id, compute_region, args.model_display_name)
diff --git a/tables/automl/automl_tables_predict.py b/tables/automl/automl_tables_predict.py
new file mode 100644
index 000000000000..61643ed66faa
--- /dev/null
+++ b/tables/automl/automl_tables_predict.py
@@ -0,0 +1,123 @@
+#!/usr/bin/env python
+
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""This application demonstrates how to perform basic operations on
+predictions with the Google AutoML Tables API.
+
+For more information, see the documentation at
+https://cloud.google.com/automl-tables/docs.
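+
+Example usage (a sketch; assumes PROJECT_ID and REGION_NAME are exported
+and the names below are placeholders):
+    python automl_tables_predict.py batch_predict
+        --model_display_name=my_model
+        --input_path=gs://my-bucket/input.csv
+        --output_path=gs://my-bucket/output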
+""" + +import argparse +import os + + +def predict(project_id, + compute_region, + model_display_name, + inputs): + """Make a prediction.""" + # [START automl_tables_predict] + # TODO(developer): Uncomment and set the following variables + # project_id = 'PROJECT_ID_HERE' + # compute_region = 'COMPUTE_REGION_HERE' + # model_display_name = 'MODEL_DISPLAY_NAME_HERE' + # inputs = {'value': 3, ...} + + from google.cloud import automl_v1beta1 as automl + + client = automl.TablesClient(project=project_id, region=compute_region) + + response = client.predict( + model_display_name=model_display_name, + inputs=inputs) + print("Prediction results:") + for result in response.payload: + print("Predicted class name: {}".format(result.display_name)) + print("Predicted class score: {}".format( + result.classification.score)) + + # [END automl_tables_predict] + + +def batch_predict(project_id, + compute_region, + model_display_name, + gcs_input_uris, + gcs_output_uri): + """Make a batch of predictions.""" + # [START automl_tables_batch_predict] + # TODO(developer): Uncomment and set the following variables + # project_id = 'PROJECT_ID_HERE' + # compute_region = 'COMPUTE_REGION_HERE' + # model_display_name = 'MODEL_DISPLAY_NAME_HERE' + # gcs_input_uris = ['gs://path/to/file.csv] + # gcs_output_uri = 'gs://path' + + from google.cloud import automl_v1beta1 as automl + + client = automl.TablesClient(project=project_id, region=compute_region) + + # Query model + response = client.batch_predict( + gcs_input_uris, gcs_output_uri, model_display_name=model_display_name + ) + print("Making batch prediction... ") + response.result() + print("Batch prediction complete.\n{}".format(response.metadata)) + + # [END automl_tables_batch_predict] + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + subparsers = parser.add_subparsers(dest="command") + + predict_parser = subparsers.add_parser("predict", help=predict.__doc__) + predict_parser.add_argument("--model_display_name") + predict_parser.add_argument("--file_path") + + batch_predict_parser = subparsers.add_parser( + "batch_predict", help=predict.__doc__ + ) + batch_predict_parser.add_argument("--model_display_name") + batch_predict_parser.add_argument("--input_path") + batch_predict_parser.add_argument("--output_path") + + project_id = os.environ["PROJECT_ID"] + compute_region = os.environ["REGION_NAME"] + + args = parser.parse_args() + + if args.command == "predict": + predict( + project_id, + compute_region, + args.model_display_name, + args.file_path, + ) + + if args.command == "batch_predict": + batch_predict( + project_id, + compute_region, + args.model_display_name, + args.input_path, + args.output_path, + ) diff --git a/tables/automl/dataset_test.py b/tables/automl/dataset_test.py new file mode 100644 index 000000000000..32fd9b08128d --- /dev/null +++ b/tables/automl/dataset_test.py @@ -0,0 +1,134 @@ +#!/usr/bin/env python + +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import random
+import string
+import time
+
+import pytest
+
+from google.api_core import exceptions
+
+import automl_tables_dataset
+
+PROJECT = os.environ["GCLOUD_PROJECT"]
+REGION = "us-central1"
+STATIC_DATASET = "test_dataset_do_not_delete"
+GCS_DATASET = "gs://cloud-ml-tables-data/bank-marketing.csv"
+
+ID = "{rand}_{time}".format(
+    rand="".join(
+        [random.choice(string.ascii_letters + string.digits) for n in range(4)]
+    ),
+    time=int(time.time()),
+)
+
+
+def _id(name):
+    return "{}_{}".format(name, ID)
+
+
+def ensure_dataset_ready():
+    dataset = None
+    name = STATIC_DATASET
+    try:
+        dataset = automl_tables_dataset.get_dataset(PROJECT, REGION, name)
+    except exceptions.NotFound:
+        dataset = automl_tables_dataset.create_dataset(PROJECT, REGION, name)
+
+    if dataset.example_count is None or dataset.example_count == 0:
+        automl_tables_dataset.import_data(PROJECT, REGION, name, GCS_DATASET)
+        dataset = automl_tables_dataset.get_dataset(PROJECT, REGION, name)
+
+    automl_tables_dataset.update_dataset(
+        PROJECT,
+        REGION,
+        dataset.display_name,
+        target_column_spec_name='Deposit',
+    )
+
+    return dataset
+
+
+@pytest.mark.slow
+def test_dataset_create_import_delete(capsys):
+    name = _id("d_cr_dl")
+    dataset = automl_tables_dataset.create_dataset(PROJECT, REGION, name)
+    assert dataset is not None
+    assert dataset.display_name == name
+
+#    automl_tables_dataset.import_data(
+#        PROJECT, REGION, name, GCS_DATASET
+#    )
+
+#    out, _ = capsys.readouterr()
+#    assert "Data imported." in out
+
+    automl_tables_dataset.delete_dataset(PROJECT, REGION, name)
+
+    with pytest.raises(exceptions.NotFound):
+        automl_tables_dataset.get_dataset(PROJECT, REGION, name)
+
+
+def test_dataset_update(capsys):
+    dataset = ensure_dataset_ready()
+    automl_tables_dataset.update_dataset(
+        PROJECT,
+        REGION,
+        dataset.display_name,
+        target_column_spec_name='Deposit',
+        weight_column_spec_name='Balance'
+    )
+
+    out, _ = capsys.readouterr()
+    assert "Target column updated." in out
+    assert "Weight column updated." in out
+
+
+def test_column_update(capsys):
+    dataset = ensure_dataset_ready()
+    automl_tables_dataset.update_column_spec(
+        PROJECT,
+        REGION,
+        dataset.display_name,
+        column_spec_display_name='Job',
+        type_code='CATEGORY',
+        nullable=False
+    )
+
+    out, _ = capsys.readouterr()
+    assert "Column spec updated." in out
+
+
+def test_list_datasets():
+    ensure_dataset_ready()
+    assert next(
+        (
+            d
+            for d
+            in automl_tables_dataset.list_datasets(PROJECT, REGION)
+            if d.display_name == STATIC_DATASET
+        ), None) is not None
+
+
+def test_list_table_specs():
+    dataset = ensure_dataset_ready()
+    ts = automl_tables_dataset.list_table_specs(
+        PROJECT, REGION, dataset.display_name
+    )
+    assert len(ts) > 0
+    for t in ts:
+        assert t.name.startswith(dataset.name)
+
+
+def test_list_column_specs():
+    dataset = ensure_dataset_ready()
+    cs = automl_tables_dataset.list_column_specs(
+        PROJECT, REGION, dataset.display_name
+    )
+    assert len(cs) > 0
+    for c in cs:
+        assert c.name.startswith(dataset.name)
diff --git a/tables/automl/model_test.py b/tables/automl/model_test.py
new file mode 100644
index 000000000000..e76b0437f4d3
--- /dev/null
+++ b/tables/automl/model_test.py
@@ -0,0 +1,84 @@
+#!/usr/bin/env python
+
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import random
+import string
+import time
+
+from google.api_core import exceptions
+
+import automl_tables_model
+import dataset_test
+
+PROJECT = os.environ["GCLOUD_PROJECT"]
+REGION = "us-central1"
+STATIC_MODEL = "test_model_do_not_delete"
+GCS_DATASET = "gs://cloud-ml-tables-data/bank-marketing.csv"
+
+ID = "{rand}_{time}".format(
+    rand="".join(
+        [random.choice(string.ascii_letters + string.digits) for n in range(4)]
+    ),
+    time=int(time.time()),
+)
+
+
+def _id(name):
+    return "{}_{}".format(name, ID)
+
+
+def test_list_models():
+    ensure_model_ready()
+    assert next(
+        (
+            m
+            for m
+            in automl_tables_model.list_models(PROJECT, REGION)
+            if m.display_name == STATIC_MODEL
+        ), None) is not None
+
+
+def test_list_model_evaluations():
+    model = ensure_model_ready()
+    mes = automl_tables_model.list_model_evaluations(
+        PROJECT, REGION, model.display_name
+    )
+    assert len(mes) > 0
+    for me in mes:
+        assert me.name.startswith(model.name)
+
+
+def test_get_model_evaluations():
+    model = ensure_model_ready()
+    me = automl_tables_model.list_model_evaluations(
+        PROJECT, REGION, model.display_name
+    )[0]
+    mep = automl_tables_model.get_model_evaluation(
+        PROJECT,
+        REGION,
+        model.name.rpartition('/')[2],
+        me.name.rpartition('/')[2]
+    )
+
+    assert mep.name == me.name
+
+
+def ensure_model_ready():
+    name = STATIC_MODEL
+    try:
+        return automl_tables_model.get_model(PROJECT, REGION, name)
+    except exceptions.NotFound:
+        pass
+
+    dataset = dataset_test.ensure_dataset_ready()
+    return automl_tables_model.create_model(
+        PROJECT, REGION, dataset.display_name, name, 1000
+    )
diff --git a/tables/automl/predict_test.py b/tables/automl/predict_test.py
new file mode 100644
index 000000000000..66c79c6952f1
--- /dev/null
+++ b/tables/automl/predict_test.py
@@ -0,0 +1,66 @@
+#!/usr/bin/env python
+
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+from google.cloud.automl_v1beta1.gapic import enums
+
+import automl_tables_model
+import automl_tables_predict
+import model_test
+
+PROJECT = os.environ["GCLOUD_PROJECT"]
+REGION = "us-central1"
+STATIC_MODEL = model_test.STATIC_MODEL
+
+
+def test_predict(capsys):
+    inputs = {
+        "Age": 31,
+        "Balance": 200,
+        "Campaign": 2,
+        "Contact": "cellular",
+        "Day": 4,
+        "Default": "no",
+        "Duration": 12,
+        "Education": "primary",
+        "Housing": "yes",
+        "Job": "blue-collar",
+        "Loan": "no",
+        "MaritalStatus": "divorced",
+        "Month": "jul",
+        "PDays": 4,
+        "POutcome": '0',
+        "Previous": 12,
+    }
+
+    ensure_model_online()
+    automl_tables_predict.predict(PROJECT, REGION, STATIC_MODEL, inputs)
+    out, _ = capsys.readouterr()
+    assert 'Predicted class name:' in out
+    assert 'Predicted class score:' in out
+
+
+def ensure_model_online():
+    model = model_test.ensure_model_ready()
+    if model.deployment_state != enums.Model.DeploymentState.DEPLOYED:
+        automl_tables_model.deploy_model(PROJECT, REGION, model.display_name)
+
+    return automl_tables_model.get_model(PROJECT, REGION, model.display_name)