Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bootstrap fixes #250

Merged
merged 7 commits into from
Apr 3, 2020
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion bootstrap/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,6 @@ To bootstrap from the existing MLOpsPython repository:
1. Ensure Python 3 is installed locally
1. Clone this repository locally
1. Run bootstrap.py script
`python bootstrap.py --d [dirpath] --n [projectname]`
`python bootstrap.py -d [dirpath] -n [projectname]`
* `[dirpath]` is the absolute path to the root of the directory where MLOpsPython is cloned
* `[projectname]` is the name of your ML project
100 changes: 51 additions & 49 deletions bootstrap/bootstrap.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,7 @@
import sys
import platform
import argparse
# import shutil
# from git import Repo
import re


class Helper:
Expand All @@ -25,13 +24,7 @@ def project_name(self):
def git_repo(self):
return self._git_repo

# def clonerepo(self):
# # Download MLOpsPython repo from git
# Repo.clone_from(
# self._git_repo, self._project_directory, branch="master", depth=1) # NOQA: E501
# print(self._project_directory)

def renamefiles(self):
def rename_files(self):
# Rename all files starting with diabetes_regression with project name
strtoreplace = "diabetes_regression"
dirs = [".pipelines", r"ml_service/pipelines"]
Expand All @@ -42,10 +35,11 @@ def renamefiles(self):
if(filename.find(strtoreplace) != -1):
src = os.path.join(self._project_directory, normDir, filename) # NOQA: E501
dst = os.path.join(self._project_directory,
normDir, filename.replace(strtoreplace, self._project_name, 1)) # NOQA: E501
normDir,
filename.replace(strtoreplace, self._project_name, 1)) # NOQA: E501
os.rename(src, dst)

def renamedir(self):
def rename_dir(self):
dir = "diabetes_regression"
src = os.path.join(self._project_directory, dir)
for path, subdirs, files in os.walk(src):
Expand All @@ -57,39 +51,37 @@ def renamedir(self):
new_name = os.path.join(newPath, name)
os.rename(file_path, new_name)

def delete_dir(self):
    """Remove the directories a bootstrapped project no longer needs.

    Deletes ``docs`` and ``diabetes_regression`` (with everything inside
    them) from ``self._project_directory``.

    Removal is best-effort: a directory that is missing or cannot be
    removed is silently skipped, mirroring the previous behaviour where
    the exit status of the shell command was ignored.
    """
    # Local import so this method does not depend on a file-level import
    # of shutil being present.
    import shutil

    # shutil.rmtree is used instead of a shell "rm -r" / "rmdir /S /Q"
    # command string: it needs no per-platform branch and is not affected
    # by quotes or other shell metacharacters in the project path.
    unwanted = ("docs", "diabetes_regression")
    for name in unwanted:
        target = os.path.join(self._project_directory, os.path.normpath(name))
        shutil.rmtree(target, ignore_errors=True)

def clean_dir(self):
    """Delete every file under the sample-data directories.

    Walks ``data`` and ``experimentation`` inside
    ``self._project_directory`` and removes each file found at any depth.
    The directories themselves (including sub-directories) are left in
    place. A top-level directory that does not exist is skipped, because
    ``os.walk`` yields nothing for it.
    """
    for top in ("data", "experimentation"):
        base = os.path.join(self._project_directory, top)
        # Distinct local names: the walk must not rebind the sequence the
        # outer loop iterates, nor shadow the builtin ``dir``.
        for root, _subdirs, filenames in os.walk(base):
            for filename in filenames:
                os.remove(os.path.join(root, filename))

def validate_args(self):
    """Validate the project directory and project name.

    Raises:
        Exception: if ``self._project_directory`` is not an existing
            directory, if ``self._project_name`` is shorter than 3 or
            longer than 15 characters, or if it contains characters
            outside the ``\\w`` class.
    """
    if not os.path.isdir(self._project_directory):
        raise Exception("Not a valid directory. Please provide an absolute directory path.") # NOQA: E501
    if not 3 <= len(self._project_name) <= 15:
        raise Exception("Invalid project name length. Project name should be 3 to 15 chars long, letters and underscores only.") # NOQA: E501
    # fullmatch instead of search("^...$"): "$" also matches just before a
    # trailing newline, so a name such as "abc\n" used to be accepted.
    # NOTE(review): "\w" also admits digits and non-ASCII word characters,
    # which is broader than the message states; kept as-is so names that
    # were accepted before are still accepted.
    if not re.fullmatch(r"\w+", self._project_name):
        raise Exception("Invalid characters in project name. Project name should be 3 to 15 chars long, letters and underscores only.") # NOQA: E501


def replaceprojectname(project_dir, project_name, rename_name):
def replace_project_name(project_dir, project_name, rename_name):
# Replace instances of rename_name within files with project_name
dirs = [r".env.example",
files = [r".env.example",
r".pipelines/code-quality-template.yml",
r".pipelines/pr.yml",
r".pipelines/diabetes_regression-ci.yml",
Expand All @@ -107,42 +99,52 @@ def replaceprojectname(project_dir, project_name, rename_name):
r"diabetes_regression/conda_dependencies.yml",
r"diabetes_regression/evaluate/evaluate_model.py",
r"diabetes_regression/register/register_model.py",
r"diabetes_regression/training/test_train.py"] # NOQA: E501
r"diabetes_regression/training/test_train.py"]

for dir in dirs:
file = os.path.join(project_dir, os.path.normpath(dir))
fin = open(file,
"rt", encoding="utf8")
data = fin.read()
data = data.replace(rename_name, project_name)
fin.close()
fin = open(os.path.join(project_dir, file), "wt", encoding="utf8") # NOQA: E501
fin.write(data)
fin.close()
for file in files:
path = os.path.join(project_dir, os.path.normpath(file))
try:
with open(path, "rt", encoding="utf8") as f_in:
data = f_in.read()
data = data.replace(rename_name, project_name)
with open(os.path.join(project_dir, file), "wt", encoding="utf8") as f_out: # NOQA: E501
f_out.write(data)
except IOError as e:
print("Could not modify \"%s\". Is the MLOpsPython repo already cloned at \"%s\"?" % (path, project_dir)) # NOQA: E501
raise e


def main(args):
    """Bootstrap a new project from a cloned MLOpsPython repository.

    Parses ``-d/--directory`` and ``-n/--name`` from the command line,
    validates them, empties the sample-data directories, replaces the
    ``diabetes_regression`` and ``diabetes`` names inside the template
    files, then renames and deletes the template files and directories.

    Args:
        args: Unused. Arguments are read from ``sys.argv`` by
            ``parser.parse_args()``; the parameter is kept so existing
            callers keep working.

    Returns:
        0 on success, 1 if any step raised an exception (the error is
        printed). Invalid or missing command-line options make argparse
        exit the process itself.
    """
    parser = argparse.ArgumentParser(description='New Template')
    parser.add_argument("-d",
                        "--directory",
                        type=str,
                        required=True,
                        help="Absolute path to new project directory")
    parser.add_argument("-n",
                        "--name",
                        type=str,
                        required=True,
                        help="Name of the project [3-15 chars, letters and underscores only]") # NOQA: E501
    try:
        parsed = parser.parse_args()

        project_directory = parsed.directory
        project_name = parsed.name

        helper = Helper(project_directory, project_name)
        helper.validate_args()
        helper.clean_dir()

        # Replace the longer name first so "diabetes_regression" is not
        # partially rewritten by the plain "diabetes" replacement.
        replace_project_name(project_directory, project_name, "diabetes_regression") # NOQA: E501
        replace_project_name(project_directory, project_name, "diabetes")

        helper.rename_files()
        helper.rename_dir()
        helper.delete_dir()
    except Exception as e:
        print(e)
        # Report failure to the caller instead of returning success.
        return 1

    return 0


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
from ml_service.util.attach_compute import get_compute
from ml_service.util.env_variables import Env
from ml_service.util.manage_environment import get_environment
from sklearn.datasets import load_diabetes
import pandas as pd
import os

Expand Down Expand Up @@ -57,6 +56,14 @@ def main():

# Check to see if dataset exists
if (dataset_name not in aml_workspace.datasets):
# This is a hack for the bootstrap script so that we handle our global
# find/replace of the project name gracefully.
try:
from sklearn.datasets import load_diabetes
except ImportError as e:
print("Project has already been bootstrapped, you must provide your own data.") # NOQA: E501
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wouldn't word this message so strongly. We don't know the cause of the error for sure; we are only guessing. I would go with something like "Failed to load diabetes dataset, perhaps the project has already ..."

The thing is that it will still rename load_diabetes into load_we_dont_call_his_name, which introduces buggy code (that we handle with this try-except). Perhaps it would make sense to move all of this load_diabetes dataset creation out to a separate module (imported in this file) and exclude that module/file from "files" in replace_project_name.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, separating dataset creation is the right way to go. A quick hotfix is to call replaceprojectname on the training script to rename the specific import.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I thought ImportError would be enough of a limited scope to avoid any weirdness. However you made me realize that this would actually be the first time that we encounter sklearn in the flow, so this could hide a dependency error. At the very least, we need to check that sklearn exists and the loading function doesn't.

Re: moving it out, if we're going to do the non-hacky fix, I feel that this should be generic enough that you don't need to rely on having a predefined dataset to export and rather can use a csv.

raise e

# Create dataset from diabetes sample data
sample_data = load_diabetes()
df = pd.DataFrame(
Expand Down