diff --git a/tables/automl/automl_tables_dataset.py b/tables/automl/automl_tables_dataset.py new file mode 100644 index 000000000000..ef501ce1132d --- /dev/null +++ b/tables/automl/automl_tables_dataset.py @@ -0,0 +1,655 @@ +#!/usr/bin/env python + +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""This application demonstrates how to perform basic operations on dataset +with the Google AutoML Tables API. + +For more information, the documentation at +https://cloud.google.com/automl-tables/docs. +""" + +import argparse +import os + + +def create_dataset(project_id, compute_region, dataset_name): + """Create a dataset.""" + # [START automl_tables_create_dataset] + # TODO(developer): Uncomment and set the following variables + # project_id = 'PROJECT_ID_HERE' + # compute_region = 'COMPUTE_REGION_HERE' + # dataset_name = 'DATASET_NAME_HERE' + + from google.cloud import automl_v1beta1 as automl + + client = automl.AutoMlClient() + + # A resource that represents Google Cloud Platform location. + project_location = client.location_path(project_id, compute_region) + + # Set dataset name and metadata of the dataset. + my_dataset = { + "display_name": dataset_name, + "tables_dataset_metadata": {} + } + + # Create a dataset with the dataset metadata in the region. + dataset = client.create_dataset(project_location, my_dataset) + + # Display the dataset information. 
def list_datasets(project_id, compute_region, filter_=None):
    """List all datasets."""
    # NOTE: the one-line docstring above doubles as the argparse help text of
    # the ``list_datasets`` sub-command, so it is intentionally kept short.
    #
    # Args:
    #   project_id: Project used to build the location path.
    #   compute_region: Region used to build the location path.
    #   filter_: Optional filter expression forwarded to the API unchanged.
    #
    # Returns nothing; every dataset in the response is printed to stdout.
    # [START automl_tables_list_datasets]
    # TODO(developer): Uncomment and set the following variables
    # project_id = 'PROJECT_ID_HERE'
    # compute_region = 'COMPUTE_REGION_HERE'
    # filter_ = 'filter expression here'

    from google.cloud import automl_v1beta1 as automl

    client = automl.AutoMlClient()

    # A resource that represents Google Cloud Platform location.
    project_location = client.location_path(project_id, compute_region)

    # List all the datasets available in the region by applying filter.
    # The filter is passed by keyword so it cannot land in another
    # positional slot of the client method.
    response = client.list_datasets(project_location, filter_=filter_)

    print("List of datasets:")
    for dataset in response:
        # Display the dataset information.
        print("Dataset name: {}".format(dataset.name))
        print("Dataset id: {}".format(dataset.name.split("/")[-1]))
        print("Dataset display name: {}".format(dataset.display_name))
        metadata = dataset.tables_dataset_metadata
        print("Dataset primary table spec id: {}".format(
            metadata.primary_table_spec_id))
        # Each column spec id is reported exactly once (the target column
        # line used to be printed twice).
        print("Dataset target column spec id: {}".format(
            metadata.target_column_spec_id))
        print("Dataset weight column spec id: {}".format(
            metadata.weight_column_spec_id))
        print("Dataset ml use column spec id: {}".format(
            metadata.ml_use_column_spec_id))
        print("Dataset example count: {}".format(dataset.example_count))
        print("Dataset create time:")
        print("\tseconds: {}".format(dataset.create_time.seconds))
        print("\tnanos: {}".format(dataset.create_time.nanos))
        print("\n")

    # [END automl_tables_list_datasets]
def list_column_specs(project_id,
                      compute_region,
                      dataset_id,
                      table_spec_id,
                      filter_=None):
    """List all column specs."""
    # NOTE: the one-line docstring above doubles as the argparse help text of
    # the ``list_column_specs`` sub-command, so it is intentionally kept short.
    #
    # Args:
    #   project_id, compute_region, dataset_id, table_spec_id: Components of
    #     the table spec resource path.
    #   filter_: Optional filter expression forwarded to the API unchanged.
    #
    # Returns nothing; every column spec in the response is printed to stdout.
    # [START automl_tables_list_column_specs]
    # TODO(developer): Uncomment and set the following variables
    # project_id = 'PROJECT_ID_HERE'
    # compute_region = 'COMPUTE_REGION_HERE'
    # dataset_id = 'DATASET_ID_HERE'
    # table_spec_id = 'TABLE_SPEC_ID_HERE'
    # filter_ = 'filter expression here'

    from google.cloud import automl_v1beta1 as automl

    client = automl.AutoMlClient()

    # Get the full path of the table_spec.
    table_spec_full_id = client.table_spec_path(
        project_id, compute_region, dataset_id, table_spec_id
    )

    # List all the column specs in the table spec by applying filter.
    # NOTE(review): the second positional parameter of list_column_specs is
    # believed to be ``field_mask`` rather than the filter -- confirm against
    # the client reference. Passing by keyword is correct in either case.
    response = client.list_column_specs(table_spec_full_id, filter_=filter_)

    print("List of column specs:")
    for column_spec in response:
        # Display the column_spec information.
        print("Column spec name: {}".format(column_spec.name))
        print("Column spec id: {}".format(column_spec.name.split("/")[-1]))
        print("Column spec display name: {}".format(column_spec.display_name))
        print("Column spec data type: {}".format(column_spec.data_type))

    # [END automl_tables_list_column_specs]
def get_column_spec(project_id,
                    compute_region,
                    dataset_id,
                    table_spec_id,
                    column_spec_id):
    """Get the column spec."""
    # NOTE: the one-line docstring above doubles as the argparse help text of
    # the ``get_column_spec`` sub-command.
    # [START automl_tables_get_column_spec]
    # TODO(developer): Uncomment and set the following variables
    # project_id = 'PROJECT_ID_HERE'
    # compute_region = 'COMPUTE_REGION_HERE'
    # dataset_id = 'DATASET_ID_HERE'
    # table_spec_id = 'TABLE_SPEC_ID_HERE'
    # column_spec_id = 'COLUMN_SPEC_ID_HERE'

    from google.cloud import automl_v1beta1 as automl

    automl_client = automl.AutoMlClient()

    # Build the full resource path of the column spec and fetch it.
    spec = automl_client.get_column_spec(
        automl_client.column_spec_path(
            project_id,
            compute_region,
            dataset_id,
            table_spec_id,
            column_spec_id
        )
    )

    # Report the column spec, one field per line.
    print("Column spec name: {}".format(spec.name))
    # The id is the last path segment of the resource name.
    spec_id = spec.name.split("/")[-1]
    print("Column spec id: {}".format(spec_id))
    print("Column spec display name: {}".format(spec.display_name))
    print("Column spec data type: {}".format(spec.data_type))
    print("Column spec data stats: {}".format(spec.data_stats))

    # Heading first, then each correlated-column entry on its own line.
    print("Column spec top correlated columns\n")
    for correlated in spec.top_correlated_columns:
        print(correlated)

    # [END automl_tables_get_column_spec]
def export_data(project_id, compute_region, dataset_id, gcs_uri):
    """Export a dataset to a Google Cloud Storage bucket."""
    # NOTE: the one-line docstring above doubles as the argparse help text of
    # the ``export_data`` sub-command.
    # [START automl_tables_export_data]
    # TODO(developer): Uncomment and set the following variables
    # project_id = 'PROJECT_ID_HERE'
    # compute_region = 'COMPUTE_REGION_HERE'
    # dataset_id = 'DATASET_ID_HERE'
    # gcs_uri = 'gs://location/to/export/data'

    from google.cloud import automl_v1beta1 as automl

    automl_client = automl.AutoMlClient()

    # Full resource path of the dataset to export.
    dataset_path = automl_client.dataset_path(
        project_id, compute_region, dataset_id
    )

    # Start the export; gcs_uri is used as the output URI prefix.
    operation = automl_client.export_data(
        dataset_path,
        {"gcs_destination": {"output_uri_prefix": gcs_uri}}
    )

    print("Processing export...")
    # Synchronous check of operation status: result() is evaluated before
    # the final message is printed.
    outcome = operation.result()
    print("Data exported. {}".format(outcome))

    # [END automl_tables_export_data]
+ tables_dataset_metadata = {} + if target_column_spec_id: + tables_dataset_metadata['target_column_spec_id'] = target_column_spec_id + if weight_column_spec_id: + tables_dataset_metadata['weight_column_spec_id'] = weight_column_spec_id + if ml_use_column_spec_id: + tables_dataset_metadata['ml_use_column_spec_id'] = ml_use_column_spec_id + + # Set the updated tables dataset metadata in the dataset. + my_dataset = { + 'name': dataset_full_id, + 'tables_dataset_metadata': tables_dataset_metadata, + } + + # Update the dataset. + response = client.update_dataset(my_dataset) + + # synchronous check of operation status. + print("Dataset updated. {}".format(response)) + # [END automl_tables_update_dataset] + + +def update_table_spec(project_id, + compute_region, + dataset_id, + table_spec_id, + time_column_spec_id): + """Update table spec.""" + # [START automl_tables_update_table_spec] + # TODO(developer): Uncomment and set the following variables + # project_id = 'PROJECT_ID_HERE' + # compute_region = 'COMPUTE_REGION_HERE' + # dataset_spec_id = 'DATASET_ID_HERE' + # table_spec_id = 'TABLE_SPEC_ID_HERE' + # time_column_spec_id = 'TIME_COLUMN_SPEC_ID_HERE' or None if unchanged + + from google.cloud import automl_v1beta1 as automl + + client = automl.AutoMlClient() + + # Get the full path of the table spec. + table_spec_full_id = client.table_spec_path( + project_id, compute_region, dataset_id, table_spec_id + ) + + # Set the updated time column in the table spec. + my_table_spec = { + 'name': table_spec_full_id, + 'time_column_spec_id': time_column_spec_id + } + + # Update the table spec. + response = client.update_table_spec(my_table_spec) + + # synchronous check of operation status. + print("Table spec updated. 
def update_column_spec(project_id,
                       compute_region,
                       dataset_id,
                       table_spec_id,
                       column_spec_id,
                       type_code,
                       nullable=None):
    """Update column spec."""
    # NOTE: the one-line docstring above doubles as the argparse help text of
    # the ``update_column_spec`` sub-command, so it is intentionally kept short.
    #
    # Args:
    #   project_id, compute_region, dataset_id, table_spec_id, column_spec_id:
    #     Components of the column spec resource path.
    #   type_code: Value stored under ``data_type['type_code']``.
    #   nullable: Stored under ``data_type['nullable']`` unless it is None,
    #     in which case the key is omitted from the request.
    # [START automl_tables_update_column_spec]
    # TODO(developer): Uncomment and set the following variables
    # project_id = 'PROJECT_ID_HERE'
    # compute_region = 'COMPUTE_REGION_HERE'
    # dataset_id = 'DATASET_ID_HERE'
    # table_spec_id = 'TABLE_SPEC_ID_HERE'
    # column_spec_id = 'COLUMN_SPEC_ID_HERE'
    # type_code = 'TYPE_CODE_HERE'
    # nullable = 'NULLABLE_HERE' or None if unchanged

    from google.cloud import automl_v1beta1 as automl

    client = automl.AutoMlClient()

    # Get the full path of the column spec.
    column_spec_full_id = client.column_spec_path(
        project_id, compute_region, dataset_id, table_spec_id, column_spec_id
    )

    # Set type code and nullable in data_type.
    # ``is not None`` (not truthiness) so an explicit False is still sent.
    data_type = {'type_code': type_code}
    if nullable is not None:
        data_type['nullable'] = nullable

    # Set the updated data_type in the column_spec.
    my_column_spec = {
        'name': column_spec_full_id,
        'data_type': data_type,
    }

    # Update the column spec.
    response = client.update_column_spec(my_column_spec)

    # Print whatever the update call returned. The message now names the
    # column spec (it previously said "Table spec updated.").
    print("Column spec updated. {}".format(response))
    # [END automl_tables_update_column_spec]
def str_to_bool(value):
    """Convert a command-line string such as 'true' or 'false' to a bool."""
    # argparse's ``type=bool`` calls bool() on the raw string, which is True
    # for every non-empty value (including "False"), so parse it explicitly.
    if isinstance(value, bool):
        return value
    normalized = value.strip().lower()
    if normalized in ("true", "t", "yes", "y", "1"):
        return True
    if normalized in ("false", "f", "no", "n", "0"):
        return False
    raise argparse.ArgumentTypeError(
        "Expected a boolean value, got {!r}".format(value)
    )


if __name__ == "__main__":
    # Command-line entry point: one sub-command per sample function. The
    # project and region come from the PROJECT_ID / REGION_NAME environment
    # variables; everything else comes from the sub-command's flags.
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    subparsers = parser.add_subparsers(dest="command")

    create_dataset_parser = subparsers.add_parser(
        "create_dataset", help=create_dataset.__doc__
    )
    create_dataset_parser.add_argument("--dataset_name")

    list_datasets_parser = subparsers.add_parser(
        "list_datasets", help=list_datasets.__doc__
    )
    list_datasets_parser.add_argument("--filter_")

    list_table_specs_parser = subparsers.add_parser(
        "list_table_specs", help=list_table_specs.__doc__
    )
    list_table_specs_parser.add_argument("--dataset_id")
    list_table_specs_parser.add_argument("--filter_")

    list_column_specs_parser = subparsers.add_parser(
        "list_column_specs", help=list_column_specs.__doc__
    )
    list_column_specs_parser.add_argument("--dataset_id")
    list_column_specs_parser.add_argument("--table_spec_id")
    list_column_specs_parser.add_argument("--filter_")

    get_dataset_parser = subparsers.add_parser(
        "get_dataset", help=get_dataset.__doc__
    )
    get_dataset_parser.add_argument("--dataset_id")

    get_table_spec_parser = subparsers.add_parser(
        "get_table_spec", help=get_table_spec.__doc__
    )
    get_table_spec_parser.add_argument("--dataset_id")
    get_table_spec_parser.add_argument("--table_spec_id")

    get_column_spec_parser = subparsers.add_parser(
        "get_column_spec", help=get_column_spec.__doc__
    )
    get_column_spec_parser.add_argument("--dataset_id")
    get_column_spec_parser.add_argument("--table_spec_id")
    get_column_spec_parser.add_argument("--column_spec_id")

    import_data_parser = subparsers.add_parser(
        "import_data", help=import_data.__doc__
    )
    import_data_parser.add_argument("--dataset_id")
    import_data_parser.add_argument("--path")

    export_data_parser = subparsers.add_parser(
        "export_data", help=export_data.__doc__
    )
    export_data_parser.add_argument("--dataset_id")
    export_data_parser.add_argument("--gcs_uri")

    update_dataset_parser = subparsers.add_parser(
        "update_dataset", help=update_dataset.__doc__
    )
    update_dataset_parser.add_argument("--dataset_id")
    update_dataset_parser.add_argument("--target_column_spec_id")
    update_dataset_parser.add_argument("--weight_column_spec_id")
    update_dataset_parser.add_argument("--ml_use_column_spec_id")

    update_table_spec_parser = subparsers.add_parser(
        "update_table_spec", help=update_table_spec.__doc__
    )
    update_table_spec_parser.add_argument("--dataset_id")
    update_table_spec_parser.add_argument("--table_spec_id")
    update_table_spec_parser.add_argument("--time_column_spec_id")

    update_column_spec_parser = subparsers.add_parser(
        "update_column_spec", help=update_column_spec.__doc__
    )
    update_column_spec_parser.add_argument("--dataset_id")
    update_column_spec_parser.add_argument("--column_spec_id")
    update_column_spec_parser.add_argument("--table_spec_id")
    update_column_spec_parser.add_argument("--type_code")
    # str_to_bool instead of bool: "--nullable False" must yield False.
    update_column_spec_parser.add_argument("--nullable", type=str_to_bool)

    delete_dataset_parser = subparsers.add_parser(
        "delete_dataset", help=delete_dataset.__doc__
    )
    delete_dataset_parser.add_argument("--dataset_id")

    # Parse before touching the environment so that ``--help`` and usage
    # errors are reported even when the variables below are not set.
    args = parser.parse_args()

    project_id = os.environ["PROJECT_ID"]
    compute_region = os.environ["REGION_NAME"]

    # Exactly one sub-command can match, hence an elif chain.
    if args.command == "create_dataset":
        create_dataset(project_id, compute_region, args.dataset_name)
    elif args.command == "list_datasets":
        list_datasets(project_id, compute_region, args.filter_)
    elif args.command == "list_table_specs":
        list_table_specs(project_id,
                         compute_region,
                         args.dataset_id,
                         args.filter_)
    elif args.command == "list_column_specs":
        list_column_specs(project_id,
                          compute_region,
                          args.dataset_id,
                          args.table_spec_id,
                          args.filter_)
    elif args.command == "get_dataset":
        get_dataset(project_id, compute_region, args.dataset_id)
    elif args.command == "get_table_spec":
        get_table_spec(project_id,
                       compute_region,
                       args.dataset_id,
                       args.table_spec_id)
    elif args.command == "get_column_spec":
        get_column_spec(project_id,
                        compute_region,
                        args.dataset_id,
                        args.table_spec_id,
                        args.column_spec_id)
    elif args.command == "import_data":
        import_data(project_id, compute_region, args.dataset_id, args.path)
    elif args.command == "export_data":
        export_data(project_id, compute_region, args.dataset_id, args.gcs_uri)
    elif args.command == "update_dataset":
        update_dataset(project_id,
                       compute_region,
                       args.dataset_id,
                       args.target_column_spec_id,
                       args.weight_column_spec_id,
                       args.ml_use_column_spec_id)
    elif args.command == "update_table_spec":
        update_table_spec(project_id,
                          compute_region,
                          args.dataset_id,
                          args.table_spec_id,
                          args.time_column_spec_id)
    elif args.command == "update_column_spec":
        update_column_spec(project_id,
                           compute_region,
                           args.dataset_id,
                           args.table_spec_id,
                           args.column_spec_id,
                           args.type_code,
                           args.nullable)
    elif args.command == "delete_dataset":
        delete_dataset(project_id, compute_region, args.dataset_id)
def create_model(project_id,
                 compute_region,
                 dataset_id,
                 model_name,
                 train_budget_milli_node_hours,
                 optimization_objective=None,
                 input_feature_column_specs=None):
    """Create a model."""
    # NOTE: the one-line docstring above doubles as the argparse help text of
    # the ``create_model`` sub-command, so it is intentionally kept short.
    #
    # Args:
    #   project_id, compute_region: Components of the location path.
    #   dataset_id: Stored as ``dataset_id`` of the model request.
    #   model_name: Stored as ``display_name`` of the model request.
    #   train_budget_milli_node_hours: Training budget; sent only if truthy.
    #   optimization_objective: Objective name; sent only if truthy.
    #   input_feature_column_specs: Column specs to train on; sent only if
    #     truthy.
    # [START automl_tables_create_model]
    # TODO(developer): Uncomment and set the following variables
    # project_id = 'PROJECT_ID_HERE'
    # compute_region = 'COMPUTE_REGION_HERE'
    # dataset_id = 'DATASET_ID_HERE'
    # model_name = 'MODEL_NAME_HERE'
    # train_budget_milli_node_hours = 'TRAIN_BUDGET_MILLI_NODE_HOURS_HERE'
    # optimization_objective = 'OPTIMIZATION_OBJECTIVE_HERE'
    #   or None if unspecified
    # input_feature_column_specs = 'INPUT_FEATURE_COLUMN_SPECS_HERE'
    #   or None if unspecified

    from google.cloud import automl_v1beta1 as automl

    client = automl.AutoMlClient()

    # A resource that represents Google Cloud Platform location.
    project_location = client.location_path(project_id, compute_region)

    # Sets an (optional) maximum train time, 1000 = 1 hour.
    tables_model_metadata = {}
    if train_budget_milli_node_hours:
        tables_model_metadata['train_budget_milli_node_hours'] = (
            train_budget_milli_node_hours
        )

    # Forward the optimization objective when one was requested. It used to
    # be accepted by this function but silently dropped.
    if optimization_objective:
        tables_model_metadata['optimization_objective'] = (
            optimization_objective
        )

    # Set the columns to use for training, defaults to all but the target and
    # ml use columns if unspecified. Expects a list of column specs, not ids.
    if input_feature_column_specs:
        tables_model_metadata['input_feature_column_specs'] = (
            input_feature_column_specs
        )

    # Set model name, dataset source, and metadata.
    my_model = {
        "display_name": model_name,
        "dataset_id": dataset_id,
        "tables_model_metadata": tables_model_metadata,
    }

    # Create a model with the model metadata in the region.
    response = client.create_model(project_location, my_model)

    print("Training model...")
    print("Training operation name: {}".format(response.operation.name))
    # result() is evaluated before this line is printed (synchronous check
    # of operation status).
    print("Training completed: {}".format(response.result()))

    # [END automl_tables_create_model]
def get_model(project_id, compute_region, model_id):
    """Get model details."""
    # NOTE: the one-line docstring above doubles as the argparse help text of
    # the ``get_model`` sub-command.
    # [START automl_tables_get_model]
    # TODO(developer): Uncomment and set the following variables
    # project_id = 'PROJECT_ID_HERE'
    # compute_region = 'COMPUTE_REGION_HERE'
    # model_id = 'MODEL_ID_HERE'

    from google.cloud import automl_v1beta1 as automl
    from google.cloud.automl_v1beta1 import enums

    automl_client = automl.AutoMlClient()

    # Build the model's full resource path and fetch its details.
    model = automl_client.get_model(
        automl_client.model_path(project_id, compute_region, model_id)
    )

    # Anything other than DEPLOYED is reported as "undeployed".
    is_deployed = (
        model.deployment_state == enums.Model.DeploymentState.DEPLOYED
    )
    deployment_state = "deployed" if is_deployed else "undeployed"

    # Report the model; the id is the last path segment of its name.
    print("Model name: {}".format(model.name))
    model_short_id = model.name.split("/")[-1]
    print("Model id: {}".format(model_short_id))
    print("Model display name: {}".format(model.display_name))
    print("Model metadata:")
    print(model.tables_model_metadata)
    print("Model create time:")
    created = model.create_time
    print("\tseconds: {}".format(created.seconds))
    print("\tnanos: {}".format(created.nanos))
    print("Model deployment state: {}".format(deployment_state))

    # [END automl_tables_get_model]
def display_evaluation(project_id, compute_region, model_id, filter_=None):
    """Display evaluation."""
    # NOTE: the one-line docstring above doubles as the argparse help text of
    # the ``display_evaluation`` sub-command, so it is intentionally kept
    # short.
    #
    # Args:
    #   project_id, compute_region, model_id: Components of the model path.
    #   filter_: Optional filter expression forwarded to the API unchanged.
    #
    # Prints the classification and/or regression metrics of the overall
    # model evaluation, or a notice when no such evaluation exists.
    # [START automl_tables_display_evaluation]
    # TODO(developer): Uncomment and set the following variables
    # project_id = 'PROJECT_ID_HERE'
    # compute_region = 'COMPUTE_REGION_HERE'
    # model_id = 'MODEL_ID_HERE'
    # filter_ = 'filter expression here'

    from google.cloud import automl_v1beta1 as automl

    client = automl.AutoMlClient()

    # Get the full path of the model.
    model_full_id = client.model_path(project_id, compute_region, model_id)

    # List all the model evaluations in the model by applying filter.
    response = client.list_model_evaluations(model_full_id, filter_=filter_)

    # There is evaluation for each class in a model and for overall model.
    # Get only the evaluation of overall model (the one without an
    # annotation spec id); if several match, the last one wins.
    model_evaluation_id = None
    for evaluation in response:
        if not evaluation.annotation_spec_id:
            model_evaluation_id = evaluation.name.split("/")[-1]

    # Without this guard an empty or class-only listing raised NameError on
    # the path construction below.
    if model_evaluation_id is None:
        print("No overall model evaluation found.")
        return

    # Resource name for the model evaluation.
    model_evaluation_full_id = client.model_evaluation_path(
        project_id, compute_region, model_id, model_evaluation_id
    )

    # Get a model evaluation.
    model_evaluation = client.get_model_evaluation(model_evaluation_full_id)

    classification_metrics = model_evaluation.classification_evaluation_metrics
    # The string form is used as the "is populated" check -- presumably it is
    # empty when the message carries no classification metrics.
    if str(classification_metrics):
        confidence_metrics = classification_metrics.confidence_metrics_entry

        # Showing model score based on threshold of 0.5
        print("Model classification metrics (threshold at 0.5):")
        for confidence_metrics_entry in confidence_metrics:
            if confidence_metrics_entry.confidence_threshold == 0.5:
                print(
                    "Model Precision: {}%".format(
                        round(confidence_metrics_entry.precision * 100, 2)
                    )
                )
                print(
                    "Model Recall: {}%".format(
                        round(confidence_metrics_entry.recall * 100, 2)
                    )
                )
                print(
                    "Model F1 score: {}%".format(
                        round(confidence_metrics_entry.f1_score * 100, 2)
                    )
                )
        print("Model AUPRC: {}".format(classification_metrics.au_prc))
        print("Model AUROC: {}".format(classification_metrics.au_roc))
        print("Model log loss: {}".format(classification_metrics.log_loss))

    regression_metrics = model_evaluation.regression_evaluation_metrics
    # Same string-based "is populated" check as for classification.
    if str(regression_metrics):
        print("Model regression metrics:")
        print("Model RMSE: {}".format(
            regression_metrics.root_mean_squared_error))
        print("Model MAE: {}".format(regression_metrics.mean_absolute_error))
        print("Model MAPE: {}".format(
            regression_metrics.mean_absolute_percentage_error))
        print("Model R^2: {}".format(regression_metrics.r_squared))

    # [END automl_tables_display_evaluation]
+ response = client.delete_model(model_full_id) + + # synchronous check of operation status. + print("Model deleted. {}".format(response.result())) + + # [END automl_tables_delete_model] + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + subparsers = parser.add_subparsers(dest="command") + + create_model_parser = subparsers.add_parser( + "create_model", help=create_model.__doc__ + ) + create_model_parser.add_argument("--dataset_id") + create_model_parser.add_argument("--model_name") + create_model_parser.add_argument( + "--train_budget_milli_node_hours", type=int, + ) + create_model_parser.add_argument("--optimization_objective") + + get_operation_status_parser = subparsers.add_parser( + "get_operation_status", help=get_operation_status.__doc__ + ) + get_operation_status_parser.add_argument("--operation_full_id") + + list_models_parser = subparsers.add_parser( + "list_models", help=list_models.__doc__ + ) + list_models_parser.add_argument("--filter_") + + get_model_parser = subparsers.add_parser( + "get_model", help=get_model.__doc__ + ) + get_model_parser.add_argument("--model_id") + + list_model_evaluations_parser = subparsers.add_parser( + "list_model_evaluations", help=list_model_evaluations.__doc__ + ) + list_model_evaluations_parser.add_argument("--model_id") + list_model_evaluations_parser.add_argument("--filter_") + + get_model_evaluation_parser = subparsers.add_parser( + "get_model_evaluation", help=get_model_evaluation.__doc__ + ) + get_model_evaluation_parser.add_argument("--model_id") + get_model_evaluation_parser.add_argument("--model_evaluation_id") + + display_evaluation_parser = subparsers.add_parser( + "display_evaluation", help=display_evaluation.__doc__ + ) + display_evaluation_parser.add_argument("--model_id") + display_evaluation_parser.add_argument("--filter_") + + deploy_model_parser = subparsers.add_parser( + "deploy_model", 
help=deploy_model.__doc__ + ) + deploy_model_parser.add_argument("--model_id") + + undeploy_model_parser = subparsers.add_parser( + "undeploy_model", help=undeploy_model.__doc__ + ) + undeploy_model_parser.add_argument("--model_id") + + delete_model_parser = subparsers.add_parser( + "delete_model", help=delete_model.__doc__ + ) + delete_model_parser.add_argument("--model_id") + + project_id = os.environ["PROJECT_ID"] + compute_region = os.environ["REGION_NAME"] + + args = parser.parse_args() + + if args.command == "create_model": + create_model( + project_id, + compute_region, + args.dataset_id, + args.model_name, + args.train_budget_milli_node_hours, + # Input columns are omitted here as argparse does not support + # column spec objects, but it is still included in function def. + ) + if args.command == "get_operation_status": + get_operation_status(args.operation_full_id) + if args.command == "list_models": + list_models(project_id, compute_region, args.filter_) + if args.command == "get_model": + get_model(project_id, compute_region, args.model_id) + if args.command == "list_model_evaluations": + list_model_evaluations( + project_id, compute_region, args.model_id, args.filter_ + ) + if args.command == "get_model_evaluation": + get_model_evaluation( + project_id, compute_region, args.model_id, args.model_evaluation_id + ) + if args.command == "display_evaluation": + display_evaluation( + project_id, compute_region, args.model_id, args.filter_ + ) + if args.command == "deploy_model": + deploy_model(project_id, compute_region, args.model_id) + if args.command == "undeploy_model": + undeploy_model(project_id, compute_region, args.model_id) + if args.command == "delete_model": + delete_model(project_id, compute_region, args.model_id) diff --git a/tables/automl/automl_tables_predict.py b/tables/automl/automl_tables_predict.py new file mode 100644 index 000000000000..ee46fefdde37 --- /dev/null +++ b/tables/automl/automl_tables_predict.py @@ -0,0 +1,182 @@ 
#!/usr/bin/env python

# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""This application demonstrates how to perform basic operations on prediction
with the Google AutoML Tables API.

For more information, the documentation at
https://cloud.google.com/automl-tables/docs.
"""

import argparse
import os


def predict(project_id,
            compute_region,
            model_id,
            file_path,
            score_threshold=""):
    """Make a prediction.

    Reads the CSV file at ``file_path`` and sends every non-empty row to the
    model as its own online prediction request, printing the class name and
    score of each result.

    Args:
        project_id: Project ID used to build the model resource path.
        compute_region: Region used to build the model resource path.
        model_id: ID of the model to query.
        file_path: Local path of a CSV file. Every cell is sent as a
            ``number_value``, so every cell (including the first row) must
            parse as a float.
        score_threshold: Optional value forwarded to the service as the
            ``score_threshold`` request parameter. Falsy values send no
            extra parameters.

    Raises:
        ValueError: If a CSV cell cannot be converted to ``float``.
    """
    # [START automl_tables_predict]
    # TODO(developer): Uncomment and set the following variables
    # project_id = 'PROJECT_ID_HERE'
    # compute_region = 'COMPUTE_REGION_HERE'
    # model_id = 'MODEL_ID_HERE'
    # file_path = '/local/path/to/file'
    # score_threshold = 'value from 0.0 to 0.5'

    from google.cloud import automl_v1beta1 as automl
    import csv

    automl_client = automl.AutoMlClient()

    # Get the full path of the model.
    model_full_id = automl_client.model_path(
        project_id, compute_region, model_id
    )

    # Create client for prediction service.
    prediction_client = automl.PredictionServiceClient()

    # params is additional domain-specific parameters.
    # score_threshold is used to filter the result.
    # NOTE(review): params used to be built but never sent; it is forwarded
    # now. Confirm the service accepts score_threshold for Tables models.
    params = {}
    if score_threshold:
        params = {"score_threshold": score_threshold}

    with open(file_path, "rt") as csv_file:
        # Read each row of csv
        for row in csv.reader(csv_file):
            # csv.reader yields an empty list for a blank line; there is
            # nothing to predict for it.
            if not row:
                continue

            # Create payload
            values = [{"number_value": float(column)} for column in row]
            payload = {"row": {"values": values}}

            # Query model
            response = prediction_client.predict(
                model_full_id, payload, params=params
            )
            print("Prediction results:")
            for result in response.payload:
                print("Predicted class name: {}".format(result.display_name))
                print(
                    "Predicted class score: {}".format(
                        result.classification.score
                    )
                )

    # [END automl_tables_predict]


def batch_predict(project_id,
                  compute_region,
                  model_id,
                  input_path,
                  output_path):
    """Make a batch of predictions.

    Starts a batch prediction operation, waits for it to finish and prints
    the operation metadata.

    Args:
        project_id: Project ID used to build the model resource path.
        compute_region: Region used to build the model resource path.
        model_id: ID of the model to query.
        input_path: Either a BigQuery URI (any value starting with ``bq``)
            or one or more comma-separated Cloud Storage URIs.
        output_path: Either a BigQuery URI (any value starting with ``bq``)
            or a Cloud Storage URI prefix that receives the results.
    """
    # [START automl_tables_batch_predict]
    # TODO(developer): Uncomment and set the following variables
    # project_id = 'PROJECT_ID_HERE'
    # compute_region = 'COMPUTE_REGION_HERE'
    # model_id = 'MODEL_ID_HERE'
    # input_path = 'gs://path/to/file.csv' or
    #   'bq://project_id.dataset_id.table_id'
    # output_path = 'gs://path' or 'bq://project_id'

    from google.cloud import automl_v1beta1 as automl

    automl_client = automl.AutoMlClient()

    # Get the full path of the model.
    model_full_id = automl_client.model_path(
        project_id, compute_region, model_id
    )

    # Create client for prediction service.
    prediction_client = automl.PredictionServiceClient()

    if input_path.startswith('bq'):
        input_config = {"bigquery_source": {"input_uri": input_path}}
    else:
        # Get the multiple Google Cloud Storage URIs. str.split() returns a
        # list, so whitespace has to be stripped from each URI individually.
        input_uris = [uri.strip() for uri in input_path.split(",")]
        input_config = {"gcs_source": {"input_uris": input_uris}}

    if output_path.startswith('bq'):
        output_config = {"bigquery_destination": {"output_uri": output_path}}
    else:
        # A Cloud Storage destination is a single URI prefix, not a list.
        output_config = {
            "gcs_destination": {"output_uri_prefix": output_path.strip()}
        }

    # Query model
    response = prediction_client.batch_predict(
        model_full_id, input_config, output_config)
    print("Making batch prediction... ")
    try:
        # Wait for the long-running operation to finish.
        response.result()
    except Exception as error:  # pylint: disable=broad-except
        # Best effort: reading the result can fail with an
        # Any-to-BatchPredictResult conversion error. Report the failure
        # instead of hiding it, but keep going to print the metadata.
        print("Could not read batch prediction result: {}".format(error))
    print("Batch prediction complete.\n{}".format(response.metadata))

    # [END automl_tables_batch_predict]


def _summary(func):
    """Return the first docstring line of ``func`` ("" if it has none)."""
    return (func.__doc__ or "").strip().split("\n")[0]


def main(argv=None):
    """Parse the command line and run the selected sample.

    Arguments are parsed before the environment is read, so ``--help`` and
    usage errors work even when ``PROJECT_ID`` / ``REGION_NAME`` are unset.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Raises:
        KeyError: If ``PROJECT_ID`` or ``REGION_NAME`` is not set.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    subparsers = parser.add_subparsers(dest="command")

    predict_parser = subparsers.add_parser("predict", help=_summary(predict))
    predict_parser.add_argument("--model_id")
    predict_parser.add_argument("--file_path")
    predict_parser.add_argument("--score_threshold", nargs="?", default="")

    batch_predict_parser = subparsers.add_parser(
        "batch_predict", help=_summary(batch_predict)
    )
    batch_predict_parser.add_argument("--model_id")
    batch_predict_parser.add_argument("--input_path")
    batch_predict_parser.add_argument("--output_path")

    args = parser.parse_args(argv)

    project_id = os.environ["PROJECT_ID"]
    compute_region = os.environ["REGION_NAME"]

    if args.command == "predict":
        predict(
            project_id,
            compute_region,
            args.model_id,
            args.file_path,
            args.score_threshold,
        )
    elif args.command == "batch_predict":
        batch_predict(
            project_id,
            compute_region,
            args.model_id,
            args.input_path,
            args.output_path,
        )


if __name__ == "__main__":
    main()