From 86a0286116431f0ca4327cd221235072bb09de33 Mon Sep 17 00:00:00 2001 From: Tom Care Date: Thu, 2 Apr 2020 12:14:24 -0700 Subject: [PATCH 1/7] Manual style fix pass --- bootstrap/bootstrap.py | 67 +++++++++++++++++++----------------------- 1 file changed, 30 insertions(+), 37 deletions(-) diff --git a/bootstrap/bootstrap.py b/bootstrap/bootstrap.py index e78fa47d..09715c22 100644 --- a/bootstrap/bootstrap.py +++ b/bootstrap/bootstrap.py @@ -2,8 +2,6 @@ import sys import platform import argparse -# import shutil -# from git import Repo class Helper: @@ -25,13 +23,7 @@ def project_name(self): def git_repo(self): return self._git_repo - # def clonerepo(self): - # # Download MLOpsPython repo from git - # Repo.clone_from( - # self._git_repo, self._project_directory, branch="master", depth=1) # NOQA: E501 - # print(self._project_directory) - - def renamefiles(self): + def rename_files(self): # Rename all files starting with diabetes_regression with project name strtoreplace = "diabetes_regression" dirs = [".pipelines", r"ml_service/pipelines"] @@ -42,10 +34,11 @@ def renamefiles(self): if(filename.find(strtoreplace) != -1): src = os.path.join(self._project_directory, normDir, filename) # NOQA: E501 dst = os.path.join(self._project_directory, - normDir, filename.replace(strtoreplace, self._project_name, 1)) # NOQA: E501 + normDir, + filename.replace(strtoreplace, self._project_name, 1)) # NOQA: E501 os.rename(src, dst) - def renamedir(self): + def rename_dir(self): dir = "diabetes_regression" src = os.path.join(self._project_directory, dir) for path, subdirs, files in os.walk(src): @@ -57,7 +50,7 @@ def renamedir(self): new_name = os.path.join(newPath, name) os.rename(file_path, new_name) - def deletedir(self): + def delete_dir(self): # Delete unwanted directories dirs = ["docs", r"diabetes_regression"] if (platform.system() == "Windows"): @@ -65,10 +58,9 @@ def deletedir(self): else: cmd = 'rm -r "{}"' for dir in dirs: - os.system( - cmd.format(os.path.join(self._project_directory, os.path.normpath(dir)))) # NOQA: E501 + os.system(cmd.format(os.path.join(self._project_directory, os.path.normpath(dir)))) # NOQA: E501 - def cleandir(self): + def clean_dir(self): # Clean up directories dirs = ["data", "experimentation"] for dir in dirs: @@ -76,18 +68,15 @@ def cleandir(self): for file in files: os.remove(os.path.join(root, file)) - def validateargs(self): + def validate_args(self): # Validate arguments if (os.path.isdir(self._project_directory) is False): - raise Exception( - "Not a valid directory. Please provide absolute directory path") # NOQA: E501 - # if (len(os.listdir(self._project_directory)) > 0): - # raise Exception("Directory not empty. PLease empty directory") + raise Exception("Not a valid directory. Please provide absolute directory path") # NOQA: E501 if(len(self._project_name) < 3 or len(self._project_name) > 15): raise Exception("Project name should be 3 to 15 chars long") -def replaceprojectname(project_dir, project_name, rename_name): +def replace_project_name(project_dir, project_name, rename_name): # Replace instances of rename_name within files with project_name dirs = [r".env.example", r".pipelines/code-quality-template.yml", @@ -107,42 +96,46 @@ def replaceprojectname(project_dir, project_name, rename_name): r"diabetes_regression/conda_dependencies.yml", r"diabetes_regression/evaluate/evaluate_model.py", r"diabetes_regression/register/register_model.py", - r"diabetes_regression/training/test_train.py"] # NOQA: E501 + r"diabetes_regression/training/test_train.py"] for dir in dirs: file = os.path.join(project_dir, os.path.normpath(dir)) - fin = open(file, - "rt", encoding="utf8") + fin = open(file, "rt", encoding="utf8") data = fin.read() data = data.replace(rename_name, project_name) fin.close() - fin = open(os.path.join(project_dir, file), "wt", encoding="utf8") # NOQA: E501 + fin = open(os.path.join(project_dir, file), "wt", encoding="utf8") fin.write(data) fin.close() def main(args): parser = argparse.ArgumentParser(description='New Template') - parser.add_argument("--d", type=str, + parser.add_argument("--d", + type=str, help="Absolute path to new project direcory") - parser.add_argument( - "--n", type=str, help="Name of the project[3-15 chars] ") + parser.add_argument("--n", + type=str, + help="Name of the project[3-15 chars] ") try: args = parser.parse_args() + project_directory = args.d project_name = args.n + helper = Helper(project_directory, project_name) - helper.validateargs() - # helper.clonerepo() - helper.cleandir() - replaceprojectname(project_directory, project_name, - "diabetes_regression") - replaceprojectname(project_directory, project_name, "diabetes") - helper.renamefiles() - helper.renamedir() - helper.deletedir() + helper.validate_args() + helper.clean_dir() + + replace_project_name(project_directory, project_name, "diabetes_regression") # NOQA: E501 + replace_project_name(project_directory, project_name, "diabetes") + + helper.rename_files() + helper.rename_dir() + helper.delete_dir() except Exception as e: print(e) + return 0 From f113ec2c688ec42f387365a51bf148b002ff9ac5 Mon Sep 17 00:00:00 2001 From: Tom Care Date: Thu, 2 Apr 2020 12:35:42 -0700 Subject: [PATCH 2/7] Bootstrap script: enforce letters and underscores only --- bootstrap/bootstrap.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/bootstrap/bootstrap.py b/bootstrap/bootstrap.py index 09715c22..59baec93 100644 --- a/bootstrap/bootstrap.py +++ b/bootstrap/bootstrap.py @@ -2,7 +2,7 @@ import sys import platform import argparse - +import re class Helper: @@ -72,9 +72,10 @@ def validate_args(self): # Validate arguments if (os.path.isdir(self._project_directory) is False): raise Exception("Not a valid directory. Please provide absolute directory path") # NOQA: E501 - if(len(self._project_name) < 3 or len(self._project_name) > 15): + if (len(self._project_name) < 3 or len(self._project_name) > 15): raise Exception("Project name should be 3 to 15 chars long") - + if (re.search("^[\w_]+$", self._project_name)): + raise Exception("Invalid characters in project name. Use letters and underscores only.") # NOQA: E501 def replace_project_name(project_dir, project_name, rename_name): # Replace instances of rename_name within files with project_name From 944f764a0e0d99b6450ebd48db6cacd06f2c5766 Mon Sep 17 00:00:00 2001 From: Tom Care Date: Thu, 2 Apr 2020 13:44:40 -0700 Subject: [PATCH 3/7] Improve error handling and argument validation --- bootstrap/bootstrap.py | 44 ++++++++++++++++++++++++------------------ 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/bootstrap/bootstrap.py b/bootstrap/bootstrap.py index 59baec93..7c337ed8 100644 --- a/bootstrap/bootstrap.py +++ b/bootstrap/bootstrap.py @@ -71,15 +71,15 @@ def clean_dir(self): def validate_args(self): # Validate arguments if (os.path.isdir(self._project_directory) is False): - raise Exception("Not a valid directory. Please provide absolute directory path") # NOQA: E501 + raise Exception("Not a valid directory. Please provide an absolute directory path.") # NOQA: E501 if (len(self._project_name) < 3 or len(self._project_name) > 15): - raise Exception("Project name should be 3 to 15 chars long") - if (re.search("^[\w_]+$", self._project_name)): - raise Exception("Invalid characters in project name. Use letters and underscores only.") # NOQA: E501 + raise Exception("Invalid project name length. Project name should be 3 to 15 chars long, letters and underscores only.") + if (not re.search("^[\w_]+$", self._project_name)): + raise Exception("Invalid characters in project name. Project name should be 3 to 15 chars long, letters and underscores only.") # NOQA: E501 def replace_project_name(project_dir, project_name, rename_name): # Replace instances of rename_name within files with project_name - dirs = [r".env.example", + files = [r".env.example", r".pipelines/code-quality-template.yml", r".pipelines/pr.yml", r".pipelines/diabetes_regression-ci.yml", @@ -99,30 +99,36 @@ def replace_project_name(project_dir, project_name, rename_name): r"diabetes_regression/register/register_model.py", r"diabetes_regression/training/test_train.py"] - for dir in dirs: - file = os.path.join(project_dir, os.path.normpath(dir)) - fin = open(file, "rt", encoding="utf8") - data = fin.read() - data = data.replace(rename_name, project_name) - fin.close() - fin = open(os.path.join(project_dir, file), "wt", encoding="utf8") - fin.write(data) - fin.close() + for file in files: + path = os.path.join(project_dir, os.path.normpath(file)) + try: + with open(path, "rt", encoding="utf8") as f_in: + data = f_in.read() + data = data.replace(rename_name, project_name) + with open(os.path.join(project_dir, file), "wt", encoding="utf8") as f_out: # NOQA: E501 + f_out.write(data) + except IOError as e: + print("Could not modify \"%s\". Is the MLOpsPython repo already cloned at \"%s\"?" % (path, project_dir)) # NOQA: E501 + raise e def main(args): parser = argparse.ArgumentParser(description='New Template') - parser.add_argument("--d", + parser.add_argument("-d", + "--directory", type=str, + required=True, help="Absolute path to new project direcory") - parser.add_argument("--n", + parser.add_argument("-n", + "--name", type=str, - help="Name of the project[3-15 chars] ") + required=True, + help="Name of the project [3-15 chars, letters and underscores only]") try: args = parser.parse_args() - project_directory = args.d - project_name = args.n + project_directory = args.directory + project_name = args.name helper = Helper(project_directory, project_name) helper.validate_args() From 96d4ad15d7ea64e9a9d7d9bc35957233b99d5077 Mon Sep 17 00:00:00 2001 From: Tom Care Date: Thu, 2 Apr 2020 14:05:31 -0700 Subject: [PATCH 4/7] Avoid sklearn import error after bootstrap script runs --- .../diabetes_regression_build_train_pipeline.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/ml_service/pipelines/diabetes_regression_build_train_pipeline.py b/ml_service/pipelines/diabetes_regression_build_train_pipeline.py index c511f7db..f8600889 100644 --- a/ml_service/pipelines/diabetes_regression_build_train_pipeline.py +++ b/ml_service/pipelines/diabetes_regression_build_train_pipeline.py @@ -6,7 +6,6 @@ from ml_service.util.attach_compute import get_compute from ml_service.util.env_variables import Env from ml_service.util.manage_environment import get_environment -from sklearn.datasets import load_diabetes import pandas as pd import os @@ -57,6 +56,14 @@ def main(): # Check to see if dataset exists if (dataset_name not in aml_workspace.datasets): + # This is a hack for the bootstrap script so that we handle our global + # find/replace of the project name gracefully. + try: + from sklearn.datasets import load_diabetes + except ImportError as e: + print("Project has already been bootstrapped, you must provide your own data.") # NOQA: E501 + raise e + # Create dataset from diabetes sample data sample_data = load_diabetes() df = pd.DataFrame( From f8f88ce46a6f4ad08ad194acd149721057378b8c Mon Sep 17 00:00:00 2001 From: Tom Care Date: Thu, 2 Apr 2020 14:06:22 -0700 Subject: [PATCH 5/7] Update bootstrap README with standardized args --- bootstrap/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bootstrap/README.md b/bootstrap/README.md index 027512bf..27051f2b 100644 --- a/bootstrap/README.md +++ b/bootstrap/README.md @@ -13,6 +13,6 @@ To bootstrap from the existing MLOpsPython repository: 1. Ensure Python 3 is installed locally 1. Clone this repository locally 1. Run bootstrap.py script -`python bootstrap.py --d [dirpath] --n [projectname]` +`python bootstrap.py -d [dirpath] -n [projectname]` * `[dirpath]` is the absolute path to the root of the directory where MLOpsPython is cloned * `[projectname]` is the name of your ML project From 6c9cfa430e045baa29a00bd0146e1667bdd4f1da Mon Sep 17 00:00:00 2001 From: Tom Care Date: Thu, 2 Apr 2020 14:13:48 -0700 Subject: [PATCH 6/7] Linting fixes --- bootstrap/bootstrap.py | 8 +++++--- .../pipelines/diabetes_regression_build_train_pipeline.py | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/bootstrap/bootstrap.py b/bootstrap/bootstrap.py index 7c337ed8..6e51b503 100644 --- a/bootstrap/bootstrap.py +++ b/bootstrap/bootstrap.py @@ -4,6 +4,7 @@ import argparse import re + class Helper: def __init__(self, project_directory, project_name): @@ -73,10 +74,11 @@ def validate_args(self): if (os.path.isdir(self._project_directory) is False): raise Exception("Not a valid directory. Please provide an absolute directory path.") # NOQA: E501 if (len(self._project_name) < 3 or len(self._project_name) > 15): - raise Exception("Invalid project name length. Project name should be 3 to 15 chars long, letters and underscores only.") - if (not re.search("^[\w_]+$", self._project_name)): + raise Exception("Invalid project name length. Project name should be 3 to 15 chars long, letters and underscores only.") # NOQA: E501 + if (not re.search("^[\\w_]+$", self._project_name)): raise Exception("Invalid characters in project name. Project name should be 3 to 15 chars long, letters and underscores only.") # NOQA: E501 + def replace_project_name(project_dir, project_name, rename_name): # Replace instances of rename_name within files with project_name files = [r".env.example", @@ -123,7 +125,7 @@ def main(args): "--name", type=str, required=True, - help="Name of the project [3-15 chars, letters and underscores only]") + help="Name of the project [3-15 chars, letters and underscores only]") # NOQA: E501 try: args = parser.parse_args() diff --git a/ml_service/pipelines/diabetes_regression_build_train_pipeline.py b/ml_service/pipelines/diabetes_regression_build_train_pipeline.py index f8600889..176191d6 100644 --- a/ml_service/pipelines/diabetes_regression_build_train_pipeline.py +++ b/ml_service/pipelines/diabetes_regression_build_train_pipeline.py @@ -61,7 +61,7 @@ def main(): try: from sklearn.datasets import load_diabetes except ImportError as e: - print("Project has already been bootstrapped, you must provide your own data.") # NOQA: E501 + print("Project has already been bootstrapped, you must provide your own data.") # NOQA: E501 raise e # Create dataset from diabetes sample data From 9fdf97ca7ab24b7638d01d2312b4ba1eda13a5c3 Mon Sep 17 00:00:00 2001 From: Tom Care Date: Thu, 2 Apr 2020 16:30:22 -0700 Subject: [PATCH 7/7] Factor out diabetes CSV creation --- ...iabetes_regression_build_train_pipeline.py | 26 +++++++------------ ml_service/pipelines/load_sample_data.py | 16 ++++++++++++ 2 files changed, 26 insertions(+), 16 deletions(-) create mode 100644 ml_service/pipelines/load_sample_data.py diff --git a/ml_service/pipelines/diabetes_regression_build_train_pipeline.py b/ml_service/pipelines/diabetes_regression_build_train_pipeline.py index 176191d6..dfe3f5b3 100644 --- a/ml_service/pipelines/diabetes_regression_build_train_pipeline.py +++ b/ml_service/pipelines/diabetes_regression_build_train_pipeline.py @@ -3,10 +3,10 @@ from azureml.pipeline.core import Pipeline, PipelineData from azureml.core import Workspace, Dataset, Datastore from azureml.core.runconfig import RunConfiguration +from ml_service.pipelines.load_sample_data import create_sample_data_csv from ml_service.util.attach_compute import get_compute from ml_service.util.env_variables import Env from ml_service.util.manage_environment import get_environment -import pandas as pd import os @@ -56,22 +56,16 @@ def main(): # Check to see if dataset exists if (dataset_name not in aml_workspace.datasets): - # This is a hack for the bootstrap script so that we handle our global - # find/replace of the project name gracefully. - try: - from sklearn.datasets import load_diabetes - except ImportError as e: - print("Project has already been bootstrapped, you must provide your own data.") # NOQA: E501 - raise e - - # Create dataset from diabetes sample data - sample_data = load_diabetes() - df = pd.DataFrame( - data=sample_data.data, - columns=sample_data.feature_names) - df['Y'] = sample_data.target + # This call creates an example CSV from sklearn sample data. If you + # have already bootstrapped your project, you can comment this line + # out and use your own CSV. + create_sample_data_csv() + + # Use a CSV to read in the data set. file_name = 'diabetes.csv' - df.to_csv(file_name, index=False) + + if (not os.path.exists(file_name)): + raise Exception("Could not find CSV dataset at \"%s\". If you have bootstrapped your project, you will need to provide a CSV." % file_name) # NOQA: E501 # Upload file to default datastore in workspace datatstore = Datastore.get(aml_workspace, datastore_name) diff --git a/ml_service/pipelines/load_sample_data.py b/ml_service/pipelines/load_sample_data.py new file mode 100644 index 00000000..cad56568 --- /dev/null +++ b/ml_service/pipelines/load_sample_data.py @@ -0,0 +1,16 @@ + +import pandas as pd +from sklearn.datasets import load_diabetes + + +# Loads the diabetes sample data from sklearn and produces a csv file that can +# be used by the build/train pipeline script. +def create_sample_data_csv(file_name): + sample_data = load_diabetes() + df = pd.DataFrame( + data=sample_data.data, + columns=sample_data.feature_names) + df['Y'] = sample_data.target + # Hard code to diabetes so we fail fast if the project has been + # bootstrapped. + df.to_csv('diabetes.csv', index=False)