diff --git a/bootstrap/README.md b/bootstrap/README.md index 027512bf..27051f2b 100644 --- a/bootstrap/README.md +++ b/bootstrap/README.md @@ -13,6 +13,6 @@ To bootstrap from the existing MLOpsPython repository: 1. Ensure Python 3 is installed locally 1. Clone this repository locally 1. Run bootstrap.py script -`python bootstrap.py --d [dirpath] --n [projectname]` +`python bootstrap.py -d [dirpath] -n [projectname]` * `[dirpath]` is the absolute path to the root of the directory where MLOpsPython is cloned * `[projectname]` is the name of your ML project diff --git a/bootstrap/bootstrap.py b/bootstrap/bootstrap.py index e78fa47d..6e51b503 100644 --- a/bootstrap/bootstrap.py +++ b/bootstrap/bootstrap.py @@ -2,8 +2,7 @@ import sys import platform import argparse -# import shutil -# from git import Repo +import re class Helper: @@ -25,13 +24,7 @@ def project_name(self): def git_repo(self): return self._git_repo - # def clonerepo(self): - # # Download MLOpsPython repo from git - # Repo.clone_from( - # self._git_repo, self._project_directory, branch="master", depth=1) # NOQA: E501 - # print(self._project_directory) - - def renamefiles(self): + def rename_files(self): # Rename all files starting with diabetes_regression with project name strtoreplace = "diabetes_regression" dirs = [".pipelines", r"ml_service/pipelines"] @@ -42,10 +35,11 @@ def renamefiles(self): if(filename.find(strtoreplace) != -1): src = os.path.join(self._project_directory, normDir, filename) # NOQA: E501 dst = os.path.join(self._project_directory, - normDir, filename.replace(strtoreplace, self._project_name, 1)) # NOQA: E501 + normDir, + filename.replace(strtoreplace, self._project_name, 1)) # NOQA: E501 os.rename(src, dst) - def renamedir(self): + def rename_dir(self): dir = "diabetes_regression" src = os.path.join(self._project_directory, dir) for path, subdirs, files in os.walk(src): @@ -57,7 +51,7 @@ def renamedir(self): new_name = os.path.join(newPath, name) os.rename(file_path, new_name) - def 
def validate_args(self):
    """Validate the project directory and project name held on this helper.

    Reads ``self._project_directory`` and ``self._project_name``.

    Raises:
        ValueError: if the project directory is not an existing directory,
            if the project name is not 3 to 15 characters long, or if the
            project name contains anything other than word characters.
    """
    if not os.path.isdir(self._project_directory):
        raise ValueError("Not a valid directory. Please provide an absolute directory path.")  # NOQA: E501

    if not 3 <= len(self._project_name) <= 15:
        raise ValueError("Invalid project name length. Project name should be 3 to 15 chars long, letters and underscores only.")  # NOQA: E501

    # fullmatch (rather than search with "^...$") also rejects a name that
    # ends in a newline, because "$" matches just before a trailing "\n".
    # NOTE(review): "\w" admits digits as well as letters and "_", which is
    # broader than the message states - confirm which one is intended.
    if re.fullmatch(r"\w+", self._project_name) is None:
        raise ValueError("Invalid characters in project name. Project name should be 3 to 15 chars long, letters and underscores only.")  # NOQA: E501
def main(args):
    """Bootstrap a new project from an existing MLOpsPython clone.

    Parses ``-d/--directory`` and ``-n/--name`` from the command line,
    validates them, cleans the sample directories, replaces the
    ``diabetes_regression`` / ``diabetes`` names inside files, and then
    renames and deletes the template files and directories.

    Args:
        args: Accepted for interface compatibility. The options are parsed
            from ``sys.argv`` by ``argparse``; this value is not consulted.

    Returns:
        0 when every step succeeded, 1 when any step raised (the error is
        printed first).
    """
    parser = argparse.ArgumentParser(description='New Template')
    parser.add_argument("-d",
                        "--directory",
                        type=str,
                        required=True,
                        help="Absolute path to new project directory")
    parser.add_argument("-n",
                        "--name",
                        type=str,
                        required=True,
                        help="Name of the project [3-15 chars, letters and underscores only]")  # NOQA: E501
    try:
        # Keep the parsed namespace in its own name instead of overwriting
        # the ``args`` parameter.
        parsed = parser.parse_args()

        project_directory = parsed.directory
        project_name = parsed.name

        helper = Helper(project_directory, project_name)
        helper.validate_args()
        helper.clean_dir()

        # Replace the longer name first so that the shorter "diabetes"
        # pass does not split "diabetes_regression" occurrences.
        replace_project_name(project_directory, project_name, "diabetes_regression")  # NOQA: E501
        replace_project_name(project_directory, project_name, "diabetes")

        helper.rename_files()
        helper.rename_dir()
        helper.delete_dir()
    except Exception as e:
        print(e)
        # Report the failure to the caller instead of claiming success.
        return 1

    return 0
# Loads the diabetes sample data from sklearn and produces a csv file that can
# be used by the build/train pipeline script.
def create_sample_data_csv(file_name='diabetes.csv'):
    """Write the sklearn diabetes sample data set to a CSV file.

    The feature columns are named after ``feature_names`` of the loaded
    sample data and the target values are stored in an extra ``Y`` column.
    The row index is not written.

    Args:
        file_name: Path of the CSV file to create. The default is the
            literal ``'diabetes.csv'`` so that a call without arguments
            keeps producing that file, which lets the build/train pipeline
            fail fast once the project has been bootstrapped.
    """
    sample_data = load_diabetes()
    df = pd.DataFrame(
        data=sample_data.data,
        columns=sample_data.feature_names)
    df['Y'] = sample_data.target
    df.to_csv(file_name, index=False)