From 86a0286116431f0ca4327cd221235072bb09de33 Mon Sep 17 00:00:00 2001
From: Tom Care <tcare@microsoft.com>
Date: Thu, 2 Apr 2020 12:14:24 -0700
Subject: [PATCH 1/7] Manual style fix pass

---
 bootstrap/bootstrap.py | 67 +++++++++++++++++++-----------------------
 1 file changed, 30 insertions(+), 37 deletions(-)

diff --git a/bootstrap/bootstrap.py b/bootstrap/bootstrap.py
index e78fa47d..09715c22 100644
--- a/bootstrap/bootstrap.py
+++ b/bootstrap/bootstrap.py
@@ -2,8 +2,6 @@
 import sys
 import platform
 import argparse
-# import shutil
-# from git import Repo
 
 
 class Helper:
@@ -25,13 +23,7 @@ def project_name(self):
     def git_repo(self):
         return self._git_repo
 
-    # def clonerepo(self):
-    #     # Download MLOpsPython repo from git
-    #     Repo.clone_from(
-    #         self._git_repo, self._project_directory, branch="master", depth=1) # NOQA: E501
-    #     print(self._project_directory)
-
-    def renamefiles(self):
+    def rename_files(self):
         # Rename all files starting with diabetes_regression with project name
         strtoreplace = "diabetes_regression"
         dirs = [".pipelines", r"ml_service/pipelines"]
@@ -42,10 +34,11 @@ def renamefiles(self):
                 if(filename.find(strtoreplace) != -1):
                     src = os.path.join(self._project_directory, normDir, filename)  # NOQA: E501
                     dst = os.path.join(self._project_directory,
-                                       normDir, filename.replace(strtoreplace, self._project_name, 1))  # NOQA: E501
+                                       normDir,
+                                       filename.replace(strtoreplace, self._project_name, 1))  # NOQA: E501
                     os.rename(src, dst)
 
-    def renamedir(self):
+    def rename_dir(self):
         dir = "diabetes_regression"
         src = os.path.join(self._project_directory, dir)
         for path, subdirs, files in os.walk(src):
@@ -57,7 +50,7 @@ def renamedir(self):
                 new_name = os.path.join(newPath, name)
                 os.rename(file_path, new_name)
 
-    def deletedir(self):
+    def delete_dir(self):
         # Delete unwanted directories
         dirs = ["docs", r"diabetes_regression"]
         if (platform.system() == "Windows"):
@@ -65,10 +58,9 @@ def deletedir(self):
         else:
             cmd = 'rm -r "{}"'
         for dir in dirs:
-            os.system(
-                cmd.format(os.path.join(self._project_directory, os.path.normpath(dir))))  # NOQA: E501
+            os.system(cmd.format(os.path.join(self._project_directory, os.path.normpath(dir))))  # NOQA: E501
 
-    def cleandir(self):
+    def clean_dir(self):
         # Clean up directories
         dirs = ["data", "experimentation"]
         for dir in dirs:
@@ -76,18 +68,15 @@ def cleandir(self):
                 for file in files:
                     os.remove(os.path.join(root, file))
 
-    def validateargs(self):
+    def validate_args(self):
         # Validate arguments
         if (os.path.isdir(self._project_directory) is False):
-            raise Exception(
-                "Not a valid directory. Please provide absolute directory path")  # NOQA: E501
-        # if (len(os.listdir(self._project_directory)) > 0):
-        #     raise Exception("Directory not empty. PLease empty directory")
+            raise Exception("Not a valid directory. Please provide absolute directory path")  # NOQA: E501
         if(len(self._project_name) < 3 or len(self._project_name) > 15):
             raise Exception("Project name should be 3 to 15 chars long")
 
 
-def replaceprojectname(project_dir, project_name, rename_name):
+def replace_project_name(project_dir, project_name, rename_name):
     # Replace instances of rename_name within files with project_name
     dirs = [r".env.example",
             r".pipelines/code-quality-template.yml",
@@ -107,42 +96,46 @@ def replaceprojectname(project_dir, project_name, rename_name):
             r"diabetes_regression/conda_dependencies.yml",
             r"diabetes_regression/evaluate/evaluate_model.py",
             r"diabetes_regression/register/register_model.py",
-            r"diabetes_regression/training/test_train.py"]  # NOQA: E501
+            r"diabetes_regression/training/test_train.py"]
 
     for dir in dirs:
         file = os.path.join(project_dir, os.path.normpath(dir))
-        fin = open(file,
-                   "rt", encoding="utf8")
+        fin = open(file, "rt", encoding="utf8")
         data = fin.read()
         data = data.replace(rename_name, project_name)
         fin.close()
-        fin = open(os.path.join(project_dir, file), "wt", encoding="utf8")  # NOQA: E501
+        fin = open(os.path.join(project_dir, file), "wt", encoding="utf8")
         fin.write(data)
         fin.close()
 
 
 def main(args):
     parser = argparse.ArgumentParser(description='New Template')
-    parser.add_argument("--d", type=str,
+    parser.add_argument("--d",
+                        type=str,
                         help="Absolute path to new project direcory")
-    parser.add_argument(
-        "--n", type=str, help="Name of the project[3-15 chars] ")
+    parser.add_argument("--n",
+                        type=str,
+                        help="Name of the project[3-15 chars] ")
     try:
         args = parser.parse_args()
+
         project_directory = args.d
         project_name = args.n
+
         helper = Helper(project_directory, project_name)
-        helper.validateargs()
-        # helper.clonerepo()
-        helper.cleandir()
-        replaceprojectname(project_directory, project_name,
-                           "diabetes_regression")
-        replaceprojectname(project_directory, project_name, "diabetes")
-        helper.renamefiles()
-        helper.renamedir()
-        helper.deletedir()
+        helper.validate_args()
+        helper.clean_dir()
+
+        replace_project_name(project_directory, project_name, "diabetes_regression")  # NOQA: E501
+        replace_project_name(project_directory, project_name, "diabetes")
+
+        helper.rename_files()
+        helper.rename_dir()
+        helper.delete_dir()
     except Exception as e:
         print(e)
+
     return 0
 
 

From f113ec2c688ec42f387365a51bf148b002ff9ac5 Mon Sep 17 00:00:00 2001
From: Tom Care <tcare@microsoft.com>
Date: Thu, 2 Apr 2020 12:35:42 -0700
Subject: [PATCH 2/7] Bootstrap script: enforce letters and underscores only

---
 bootstrap/bootstrap.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/bootstrap/bootstrap.py b/bootstrap/bootstrap.py
index 09715c22..59baec93 100644
--- a/bootstrap/bootstrap.py
+++ b/bootstrap/bootstrap.py
@@ -2,7 +2,7 @@
 import sys
 import platform
 import argparse
-
+import re
 
 class Helper:
 
@@ -72,9 +72,10 @@ def validate_args(self):
         # Validate arguments
         if (os.path.isdir(self._project_directory) is False):
             raise Exception("Not a valid directory. Please provide absolute directory path")  # NOQA: E501
-        if(len(self._project_name) < 3 or len(self._project_name) > 15):
+        if (len(self._project_name) < 3 or len(self._project_name) > 15):
             raise Exception("Project name should be 3 to 15 chars long")
-
+        if (re.search("^[\w_]+$", self._project_name)):
+            raise Exception("Invalid characters in project name. Use letters and underscores only.")  # NOQA: E501
 
 def replace_project_name(project_dir, project_name, rename_name):
     # Replace instances of rename_name within files with project_name

From 944f764a0e0d99b6450ebd48db6cacd06f2c5766 Mon Sep 17 00:00:00 2001
From: Tom Care <tcare@microsoft.com>
Date: Thu, 2 Apr 2020 13:44:40 -0700
Subject: [PATCH 3/7] Improve error handling and argument validation

---
 bootstrap/bootstrap.py | 44 ++++++++++++++++++++++++------------------
 1 file changed, 25 insertions(+), 19 deletions(-)

diff --git a/bootstrap/bootstrap.py b/bootstrap/bootstrap.py
index 59baec93..7c337ed8 100644
--- a/bootstrap/bootstrap.py
+++ b/bootstrap/bootstrap.py
@@ -71,15 +71,15 @@ def clean_dir(self):
     def validate_args(self):
         # Validate arguments
         if (os.path.isdir(self._project_directory) is False):
-            raise Exception("Not a valid directory. Please provide absolute directory path")  # NOQA: E501
+            raise Exception("Not a valid directory. Please provide an absolute directory path.")  # NOQA: E501
         if (len(self._project_name) < 3 or len(self._project_name) > 15):
-            raise Exception("Project name should be 3 to 15 chars long")
-        if (re.search("^[\w_]+$", self._project_name)):
-            raise Exception("Invalid characters in project name. Use letters and underscores only.")  # NOQA: E501
+            raise Exception("Invalid project name length. Project name should be 3 to 15 chars long, letters and underscores only.")
+        if (not re.search("^[\w_]+$", self._project_name)):
+            raise Exception("Invalid characters in project name. Project name should be 3 to 15 chars long, letters and underscores only.")  # NOQA: E501
 
 def replace_project_name(project_dir, project_name, rename_name):
     # Replace instances of rename_name within files with project_name
-    dirs = [r".env.example",
+    files = [r".env.example",
             r".pipelines/code-quality-template.yml",
             r".pipelines/pr.yml",
             r".pipelines/diabetes_regression-ci.yml",
@@ -99,30 +99,36 @@ def replace_project_name(project_dir, project_name, rename_name):
             r"diabetes_regression/register/register_model.py",
             r"diabetes_regression/training/test_train.py"]
 
-    for dir in dirs:
-        file = os.path.join(project_dir, os.path.normpath(dir))
-        fin = open(file, "rt", encoding="utf8")
-        data = fin.read()
-        data = data.replace(rename_name, project_name)
-        fin.close()
-        fin = open(os.path.join(project_dir, file), "wt", encoding="utf8")
-        fin.write(data)
-        fin.close()
+    for file in files:
+        path = os.path.join(project_dir, os.path.normpath(file))
+        try:
+            with open(path, "rt", encoding="utf8") as f_in:
+                data = f_in.read()
+            data = data.replace(rename_name, project_name)
+            with open(os.path.join(project_dir, file), "wt", encoding="utf8") as f_out:  # NOQA: E501
+                f_out.write(data)
+        except IOError as e:
+            print("Could not modify \"%s\". Is the MLOpsPython repo already cloned at \"%s\"?" % (path, project_dir))  # NOQA: E501
+            raise e
 
 
 def main(args):
     parser = argparse.ArgumentParser(description='New Template')
-    parser.add_argument("--d",
+    parser.add_argument("-d",
+                        "--directory",
                         type=str,
+                        required=True,
                         help="Absolute path to new project direcory")
-    parser.add_argument("--n",
+    parser.add_argument("-n",
+                        "--name",
                         type=str,
-                        help="Name of the project[3-15 chars] ")
+                        required=True,
+                        help="Name of the project [3-15 chars, letters and underscores only]")
     try:
         args = parser.parse_args()
 
-        project_directory = args.d
-        project_name = args.n
+        project_directory = args.directory
+        project_name = args.name
 
         helper = Helper(project_directory, project_name)
         helper.validate_args()

From 96d4ad15d7ea64e9a9d7d9bc35957233b99d5077 Mon Sep 17 00:00:00 2001
From: Tom Care <tcare@microsoft.com>
Date: Thu, 2 Apr 2020 14:05:31 -0700
Subject: [PATCH 4/7] Avoid sklearn import error after bootstrap script runs

---
 .../diabetes_regression_build_train_pipeline.py          | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/ml_service/pipelines/diabetes_regression_build_train_pipeline.py b/ml_service/pipelines/diabetes_regression_build_train_pipeline.py
index c511f7db..f8600889 100644
--- a/ml_service/pipelines/diabetes_regression_build_train_pipeline.py
+++ b/ml_service/pipelines/diabetes_regression_build_train_pipeline.py
@@ -6,7 +6,6 @@
 from ml_service.util.attach_compute import get_compute
 from ml_service.util.env_variables import Env
 from ml_service.util.manage_environment import get_environment
-from sklearn.datasets import load_diabetes
 import pandas as pd
 import os
 
@@ -57,6 +56,14 @@ def main():
 
     # Check to see if dataset exists
     if (dataset_name not in aml_workspace.datasets):
+        # This is a hack for the bootstrap script so that we handle our global
+        # find/replace of the project name gracefully.
+        try:
+            from sklearn.datasets import load_diabetes
+        except ImportError as e:
+            print("Project has already been bootstrapped, you must provide your own data.") # NOQA: E501
+            raise e
+
         # Create dataset from diabetes sample data
         sample_data = load_diabetes()
         df = pd.DataFrame(

From f8f88ce46a6f4ad08ad194acd149721057378b8c Mon Sep 17 00:00:00 2001
From: Tom Care <tcare@microsoft.com>
Date: Thu, 2 Apr 2020 14:06:22 -0700
Subject: [PATCH 5/7] Update bootstrap README with standardized args

---
 bootstrap/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bootstrap/README.md b/bootstrap/README.md
index 027512bf..27051f2b 100644
--- a/bootstrap/README.md
+++ b/bootstrap/README.md
@@ -13,6 +13,6 @@ To bootstrap from the existing MLOpsPython repository:
 1. Ensure Python 3 is installed locally
 1. Clone this repository locally
 1. Run bootstrap.py script  
-`python bootstrap.py --d [dirpath] --n [projectname]`
+`python bootstrap.py -d [dirpath] -n [projectname]`
     * `[dirpath]` is the absolute path to the root of the directory where MLOpsPython is cloned
     * `[projectname]` is the name of your ML project

From 6c9cfa430e045baa29a00bd0146e1667bdd4f1da Mon Sep 17 00:00:00 2001
From: Tom Care <tcare@microsoft.com>
Date: Thu, 2 Apr 2020 14:13:48 -0700
Subject: [PATCH 6/7] Linting fixes

---
 bootstrap/bootstrap.py                                    | 8 +++++---
 .../pipelines/diabetes_regression_build_train_pipeline.py | 2 +-
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/bootstrap/bootstrap.py b/bootstrap/bootstrap.py
index 7c337ed8..6e51b503 100644
--- a/bootstrap/bootstrap.py
+++ b/bootstrap/bootstrap.py
@@ -4,6 +4,7 @@
 import argparse
 import re
 
+
 class Helper:
 
     def __init__(self, project_directory, project_name):
@@ -73,10 +74,11 @@ def validate_args(self):
         if (os.path.isdir(self._project_directory) is False):
             raise Exception("Not a valid directory. Please provide an absolute directory path.")  # NOQA: E501
         if (len(self._project_name) < 3 or len(self._project_name) > 15):
-            raise Exception("Invalid project name length. Project name should be 3 to 15 chars long, letters and underscores only.")
-        if (not re.search("^[\w_]+$", self._project_name)):
+            raise Exception("Invalid project name length. Project name should be 3 to 15 chars long, letters and underscores only.")  # NOQA: E501
+        if (not re.search("^[\\w_]+$", self._project_name)):
             raise Exception("Invalid characters in project name. Project name should be 3 to 15 chars long, letters and underscores only.")  # NOQA: E501
 
+
 def replace_project_name(project_dir, project_name, rename_name):
     # Replace instances of rename_name within files with project_name
     files = [r".env.example",
@@ -123,7 +125,7 @@ def main(args):
                         "--name",
                         type=str,
                         required=True,
-                        help="Name of the project [3-15 chars, letters and underscores only]")
+                        help="Name of the project [3-15 chars, letters and underscores only]")  # NOQA: E501
     try:
         args = parser.parse_args()
 
diff --git a/ml_service/pipelines/diabetes_regression_build_train_pipeline.py b/ml_service/pipelines/diabetes_regression_build_train_pipeline.py
index f8600889..176191d6 100644
--- a/ml_service/pipelines/diabetes_regression_build_train_pipeline.py
+++ b/ml_service/pipelines/diabetes_regression_build_train_pipeline.py
@@ -61,7 +61,7 @@ def main():
         try:
             from sklearn.datasets import load_diabetes
         except ImportError as e:
-            print("Project has already been bootstrapped, you must provide your own data.") # NOQA: E501
+            print("Project has already been bootstrapped, you must provide your own data.")  # NOQA: E501
             raise e
 
         # Create dataset from diabetes sample data

From 9fdf97ca7ab24b7638d01d2312b4ba1eda13a5c3 Mon Sep 17 00:00:00 2001
From: Tom Care <tcare@microsoft.com>
Date: Thu, 2 Apr 2020 16:30:22 -0700
Subject: [PATCH 7/7] Factor out diabetes CSV creation

---
 ...iabetes_regression_build_train_pipeline.py | 26 +++++++------------
 ml_service/pipelines/load_sample_data.py      | 16 ++++++++++++
 2 files changed, 26 insertions(+), 16 deletions(-)
 create mode 100644 ml_service/pipelines/load_sample_data.py

diff --git a/ml_service/pipelines/diabetes_regression_build_train_pipeline.py b/ml_service/pipelines/diabetes_regression_build_train_pipeline.py
index 176191d6..dfe3f5b3 100644
--- a/ml_service/pipelines/diabetes_regression_build_train_pipeline.py
+++ b/ml_service/pipelines/diabetes_regression_build_train_pipeline.py
@@ -3,10 +3,10 @@
 from azureml.pipeline.core import Pipeline, PipelineData
 from azureml.core import Workspace, Dataset, Datastore
 from azureml.core.runconfig import RunConfiguration
+from ml_service.pipelines.load_sample_data import create_sample_data_csv
 from ml_service.util.attach_compute import get_compute
 from ml_service.util.env_variables import Env
 from ml_service.util.manage_environment import get_environment
-import pandas as pd
 import os
 
 
@@ -56,22 +56,16 @@ def main():
 
     # Check to see if dataset exists
     if (dataset_name not in aml_workspace.datasets):
-        # This is a hack for the bootstrap script so that we handle our global
-        # find/replace of the project name gracefully.
-        try:
-            from sklearn.datasets import load_diabetes
-        except ImportError as e:
-            print("Project has already been bootstrapped, you must provide your own data.")  # NOQA: E501
-            raise e
-
-        # Create dataset from diabetes sample data
-        sample_data = load_diabetes()
-        df = pd.DataFrame(
-            data=sample_data.data,
-            columns=sample_data.feature_names)
-        df['Y'] = sample_data.target
+        # This call creates an example CSV from sklearn sample data. If you
+        # have already bootstrapped your project, you can comment this line
+        # out and use your own CSV.
+        create_sample_data_csv()
+
+        # Use a CSV to read in the data set.
         file_name = 'diabetes.csv'
-        df.to_csv(file_name, index=False)
+
+        if (not os.path.exists(file_name)):
+            raise Exception("Could not find CSV dataset at \"%s\". If you have bootstrapped your project, you will need to provide a CSV." % file_name)  # NOQA: E501
 
         # Upload file to default datastore in workspace
         datatstore = Datastore.get(aml_workspace, datastore_name)
diff --git a/ml_service/pipelines/load_sample_data.py b/ml_service/pipelines/load_sample_data.py
new file mode 100644
index 00000000..cad56568
--- /dev/null
+++ b/ml_service/pipelines/load_sample_data.py
@@ -0,0 +1,16 @@
+
+import pandas as pd
+from sklearn.datasets import load_diabetes
+
+
+# Loads the diabetes sample data from sklearn and produces a csv file that can
+# be used by the build/train pipeline script. file_name is where the csv is
+# written; it defaults to 'diabetes.csv' so callers may omit it.
+def create_sample_data_csv(file_name='diabetes.csv'):
+    sample_data = load_diabetes()
+    df = pd.DataFrame(data=sample_data.data, columns=sample_data.feature_names)
+    # The target values go in a 'Y' column next to the feature columns.
+    df['Y'] = sample_data.target
+    # Keep the default hard coded to diabetes so we fail fast if the project
+    # has been bootstrapped.
+    df.to_csv(file_name, index=False)