From be445cf880fd5b58e211c39f208c750a5d9bc8ea Mon Sep 17 00:00:00 2001 From: bradmiro Date: Mon, 27 Jan 2020 17:20:42 -0500 Subject: [PATCH 1/5] Added CLI functionality to quickstart --- dataproc/quickstart/quickstart.py | 44 ++++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 13 deletions(-) diff --git a/dataproc/quickstart/quickstart.py b/dataproc/quickstart/quickstart.py index fcbda8827d3b..f5709e54547b 100644 --- a/dataproc/quickstart/quickstart.py +++ b/dataproc/quickstart/quickstart.py @@ -15,6 +15,17 @@ # limitations under the License. # [START dataproc_quickstart] +"""This quickstart sample walks a user through creating a Cloud Dataproc + cluster, submitting a PySpark job from Google Cloud Storage to the + cluster, reading the output of the job and deleting the cluster, all + using the Python client library. + + Usage: + python3 quickstart.py --project_id --region \ + --cluster_name --job_file_path +""" + +import argparse import time from google.cloud import dataproc_v1 as dataproc @@ -22,18 +33,6 @@ def quickstart(project_id, region, cluster_name, job_file_path): - """This quickstart sample walks a user through creating a Cloud Dataproc - cluster, submitting a PySpark job from Google Cloud Storage to the - cluster, reading the output of the job and deleting the cluster, all - using the Python client library. - - Args: - project_id (string): Project to use for creating resources. - region (string): Region where the resources should live. - cluster_name (string): Name to use for creating a cluster. - job_file_path (string): Job in GCS to execute against the cluster. - """ - # Create the cluster client. cluster_client = dataproc.ClusterControllerClient(client_options={ 'api_endpoint': '{}-dataproc.googleapis.com:443'.format(region) @@ -125,4 +124,23 @@ def quickstart(project_id, region, cluster_name, job_file_path): operation.result() print('Cluster {} successfully deleted.'.format(cluster_name)) - # [END dataproc_quickstart] + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument('--project_id', type=str, + help='Project to use for creating resources.') + parser.add_argument('--region', type=str, + help='Region where the resources should live.') + parser.add_argument('--cluster_name', type=str, + help='Name to use for creating a cluster') + parser.add_argument('--job_file_path', type=str, + help='Job in GCS to execute against the cluster.') + + args = parser.parse_args() + quickstart(args.project_id, args.region, + args.cluster_name, args.job_file_path) +# [END dataproc_quickstart] From f9de7ddf90f81b0f9f0a7392b091c7e2a555ecab Mon Sep 17 00:00:00 2001 From: bradmiro Date: Mon, 27 Jan 2020 17:26:29 -0500 Subject: [PATCH 2/5] remove python3 from docstring --- dataproc/quickstart/quickstart.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataproc/quickstart/quickstart.py b/dataproc/quickstart/quickstart.py index f5709e54547b..86dee6b89e3f 100644 --- a/dataproc/quickstart/quickstart.py +++ b/dataproc/quickstart/quickstart.py @@ -21,7 +21,7 @@ using the Python client library. Usage: - python3 quickstart.py --project_id --region \ + python quickstart.py --project_id --region \ --cluster_name --job_file_path """ From bd579fee500ab430177f1ac3b86379fa22f8e410 Mon Sep 17 00:00:00 2001 From: bradmiro Date: Tue, 28 Jan 2020 14:35:43 -0500 Subject: [PATCH 3/5] Added test for CLI functionality --- dataproc/quickstart/quickstart.py | 45 +++++++++++++++++--------- dataproc/quickstart/quickstart_test.py | 24 ++++++++------ 2 files changed, 44 insertions(+), 25 deletions(-) diff --git a/dataproc/quickstart/quickstart.py b/dataproc/quickstart/quickstart.py index 86dee6b89e3f..c8880f781fa4 100644 --- a/dataproc/quickstart/quickstart.py +++ b/dataproc/quickstart/quickstart.py @@ -15,14 +15,15 @@ # limitations under the License. # [START dataproc_quickstart] -"""This quickstart sample walks a user through creating a Cloud Dataproc - cluster, submitting a PySpark job from Google Cloud Storage to the - cluster, reading the output of the job and deleting the cluster, all - using the Python client library. - - Usage: - python quickstart.py --project_id --region \ - --cluster_name --job_file_path +""" +This quickstart sample walks a user through creating a Cloud Dataproc +cluster, submitting a PySpark job from Google Cloud Storage to the +cluster, reading the output of the job and deleting the cluster, all +using the Python client library. + +Usage: + python quickstart.py --project_id --region \ + --cluster_name --job_file_path """ import argparse @@ -131,14 +132,26 @@ def quickstart(project_id, region, cluster_name, job_file_path): description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter, ) - parser.add_argument('--project_id', type=str, - help='Project to use for creating resources.') - parser.add_argument('--region', type=str, - help='Region where the resources should live.') - parser.add_argument('--cluster_name', type=str, - help='Name to use for creating a cluster') - parser.add_argument('--job_file_path', type=str, - help='Job in GCS to execute against the cluster.') + parser.add_argument( + '--project_id', + type=str, + required=True, + help='Project to use for creating resources.') + parser.add_argument( + '--region', + type=str, + required=True, + help='Region where the resources should live.') + parser.add_argument( + '--cluster_name', + type=str, + required=True, + help='Name to use for creating a cluster') + parser.add_argument( + '--job_file_path', + type=str, + required=True, + help='Job in GCS to execute against the cluster.') args = parser.parse_args() quickstart(args.project_id, args.region, diff --git a/dataproc/quickstart/quickstart_test.py b/dataproc/quickstart/quickstart_test.py index df488d0abc6f..4145a5ab9179 100644 --- a/dataproc/quickstart/quickstart_test.py +++ b/dataproc/quickstart/quickstart_test.py @@ -15,12 +15,11 @@ import os import uuid import pytest +import subprocess from google.cloud import dataproc_v1 as dataproc from google.cloud import storage -import quickstart - PROJECT_ID = os.environ['GCLOUD_PROJECT'] REGION = 'us-central1' @@ -29,10 +28,10 @@ JOB_FILE_NAME = 'sum.py' JOB_FILE_PATH = 'gs://{}/{}'.format(STAGING_BUCKET, JOB_FILE_NAME) SORT_CODE = ( - "import pyspark\n" - "sc = pyspark.SparkContext()\n" - "rdd = sc.parallelize((1,2,3,4,5))\n" - "sum = rdd.reduce(lambda x, y: x + y)\n" + "import pyspark\n" + "sc = pyspark.SparkContext()\n" + "rdd = sc.parallelize((1,2,3,4,5))\n" + "sum = rdd.reduce(lambda x, y: x + y)\n" ) @@ -60,10 +59,17 @@ def setup_teardown(): blob.delete() -def test_quickstart(capsys): - quickstart.quickstart(PROJECT_ID, REGION, CLUSTER_NAME, JOB_FILE_PATH) +def test_quickstart(): + command = [ + 'python', 'quickstart/quickstart.py', + '--project_id', PROJECT_ID, + '--region', REGION, + '--cluster_name', CLUSTER_NAME, + '--job_file_path', JOB_FILE_PATH + ] + out = subprocess.check_output(command) + out = str(out, "utf-8") - out, _ = capsys.readouterr() assert 'Cluster created successfully' in out assert 'Submitted job' in out assert 'finished with state DONE:' in out From b0b8299ca7d51e032da0ed664e45c7a02e880b52 Mon Sep 17 00:00:00 2001 From: bradmiro Date: Tue, 28 Jan 2020 14:53:18 -0500 Subject: [PATCH 4/5] Fixed quickstart test --- dataproc/quickstart/quickstart_test.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/dataproc/quickstart/quickstart_test.py b/dataproc/quickstart/quickstart_test.py index 4145a5ab9179..b7fe0576676d 100644 --- a/dataproc/quickstart/quickstart_test.py +++ b/dataproc/quickstart/quickstart_test.py @@ -67,8 +67,7 @@ def test_quickstart(): '--cluster_name', CLUSTER_NAME, '--job_file_path', JOB_FILE_PATH ] - out = subprocess.check_output(command) - out = str(out, "utf-8") + out = subprocess.check_output(command).decode("utf-8") assert 'Cluster created successfully' in out assert 'Submitted job' in out From 3c3da277f5f70365da17aad45870526ecff74cf4 Mon Sep 17 00:00:00 2001 From: bradmiro Date: Tue, 28 Jan 2020 15:24:05 -0500 Subject: [PATCH 5/5] Fixed helper text --- dataproc/quickstart/quickstart.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataproc/quickstart/quickstart.py b/dataproc/quickstart/quickstart.py index c8880f781fa4..4159e2815202 100644 --- a/dataproc/quickstart/quickstart.py +++ b/dataproc/quickstart/quickstart.py @@ -146,7 +146,7 @@ def quickstart(project_id, region, cluster_name, job_file_path): '--cluster_name', type=str, required=True, - help='Name to use for creating a cluster') + help='Name to use for creating a cluster.') parser.add_argument( '--job_file_path', type=str,