Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bootstrap fixes #250

Merged
merged 7 commits into from
Apr 3, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion bootstrap/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,6 @@ To bootstrap from the existing MLOpsPython repository:
1. Ensure Python 3 is installed locally
1. Clone this repository locally
1. Run bootstrap.py script
`python bootstrap.py --d [dirpath] --n [projectname]`
`python bootstrap.py -d [dirpath] -n [projectname]`
* `[dirpath]` is the absolute path to the root of the directory where MLOpsPython is cloned
* `[projectname]` is the name of your ML project
100 changes: 51 additions & 49 deletions bootstrap/bootstrap.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,7 @@
import sys
import platform
import argparse
# import shutil
# from git import Repo
import re


class Helper:
Expand All @@ -25,13 +24,7 @@ def project_name(self):
def git_repo(self):
return self._git_repo

# def clonerepo(self):
# # Download MLOpsPython repo from git
# Repo.clone_from(
# self._git_repo, self._project_directory, branch="master", depth=1) # NOQA: E501
# print(self._project_directory)

def renamefiles(self):
def rename_files(self):
# Rename all files starting with diabetes_regression with project name
strtoreplace = "diabetes_regression"
dirs = [".pipelines", r"ml_service/pipelines"]
Expand All @@ -42,10 +35,11 @@ def renamefiles(self):
if(filename.find(strtoreplace) != -1):
src = os.path.join(self._project_directory, normDir, filename) # NOQA: E501
dst = os.path.join(self._project_directory,
normDir, filename.replace(strtoreplace, self._project_name, 1)) # NOQA: E501
normDir,
filename.replace(strtoreplace, self._project_name, 1)) # NOQA: E501
os.rename(src, dst)

def renamedir(self):
def rename_dir(self):
dir = "diabetes_regression"
src = os.path.join(self._project_directory, dir)
for path, subdirs, files in os.walk(src):
Expand All @@ -57,39 +51,37 @@ def renamedir(self):
new_name = os.path.join(newPath, name)
os.rename(file_path, new_name)

def deletedir(self):
def delete_dir(self):
# Delete unwanted directories
dirs = ["docs", r"diabetes_regression"]
if (platform.system() == "Windows"):
cmd = 'rmdir /S /Q "{}"'
else:
cmd = 'rm -r "{}"'
for dir in dirs:
os.system(
cmd.format(os.path.join(self._project_directory, os.path.normpath(dir)))) # NOQA: E501
os.system(cmd.format(os.path.join(self._project_directory, os.path.normpath(dir)))) # NOQA: E501

def cleandir(self):
def clean_dir(self):
# Clean up directories
dirs = ["data", "experimentation"]
for dir in dirs:
for root, dirs, files in os.walk(os.path.join(self._project_directory, dir)): # NOQA: E501
for file in files:
os.remove(os.path.join(root, file))

def validateargs(self):
def validate_args(self):
# Validate arguments
if (os.path.isdir(self._project_directory) is False):
raise Exception(
"Not a valid directory. Please provide absolute directory path") # NOQA: E501
# if (len(os.listdir(self._project_directory)) > 0):
# raise Exception("Directory not empty. PLease empty directory")
if(len(self._project_name) < 3 or len(self._project_name) > 15):
raise Exception("Project name should be 3 to 15 chars long")
raise Exception("Not a valid directory. Please provide an absolute directory path.") # NOQA: E501
if (len(self._project_name) < 3 or len(self._project_name) > 15):
raise Exception("Invalid project name length. Project name should be 3 to 15 chars long, letters and underscores only.") # NOQA: E501
if (not re.search("^[\\w_]+$", self._project_name)):
raise Exception("Invalid characters in project name. Project name should be 3 to 15 chars long, letters and underscores only.") # NOQA: E501


def replaceprojectname(project_dir, project_name, rename_name):
def replace_project_name(project_dir, project_name, rename_name):
# Replace instances of rename_name within files with project_name
dirs = [r".env.example",
files = [r".env.example",
r".pipelines/code-quality-template.yml",
r".pipelines/pr.yml",
r".pipelines/diabetes_regression-ci.yml",
Expand All @@ -107,42 +99,52 @@ def replaceprojectname(project_dir, project_name, rename_name):
r"diabetes_regression/conda_dependencies.yml",
r"diabetes_regression/evaluate/evaluate_model.py",
r"diabetes_regression/register/register_model.py",
r"diabetes_regression/training/test_train.py"] # NOQA: E501
r"diabetes_regression/training/test_train.py"]

for dir in dirs:
file = os.path.join(project_dir, os.path.normpath(dir))
fin = open(file,
"rt", encoding="utf8")
data = fin.read()
data = data.replace(rename_name, project_name)
fin.close()
fin = open(os.path.join(project_dir, file), "wt", encoding="utf8") # NOQA: E501
fin.write(data)
fin.close()
for file in files:
path = os.path.join(project_dir, os.path.normpath(file))
try:
with open(path, "rt", encoding="utf8") as f_in:
data = f_in.read()
data = data.replace(rename_name, project_name)
with open(os.path.join(project_dir, file), "wt", encoding="utf8") as f_out: # NOQA: E501
f_out.write(data)
except IOError as e:
print("Could not modify \"%s\". Is the MLOpsPython repo already cloned at \"%s\"?" % (path, project_dir)) # NOQA: E501
raise e


def main(args):
parser = argparse.ArgumentParser(description='New Template')
parser.add_argument("--d", type=str,
parser.add_argument("-d",
"--directory",
type=str,
required=True,
help="Absolute path to new project direcory")
parser.add_argument(
"--n", type=str, help="Name of the project[3-15 chars] ")
parser.add_argument("-n",
"--name",
type=str,
required=True,
help="Name of the project [3-15 chars, letters and underscores only]") # NOQA: E501
try:
args = parser.parse_args()
project_directory = args.d
project_name = args.n

project_directory = args.directory
project_name = args.name

helper = Helper(project_directory, project_name)
helper.validateargs()
# helper.clonerepo()
helper.cleandir()
replaceprojectname(project_directory, project_name,
"diabetes_regression")
replaceprojectname(project_directory, project_name, "diabetes")
helper.renamefiles()
helper.renamedir()
helper.deletedir()
helper.validate_args()
helper.clean_dir()

replace_project_name(project_directory, project_name, "diabetes_regression") # NOQA: E501
replace_project_name(project_directory, project_name, "diabetes")

helper.rename_files()
helper.rename_dir()
helper.delete_dir()
except Exception as e:
print(e)

return 0


Expand Down
19 changes: 10 additions & 9 deletions ml_service/pipelines/diabetes_regression_build_train_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,10 @@
from azureml.pipeline.core import Pipeline, PipelineData
from azureml.core import Workspace, Dataset, Datastore
from azureml.core.runconfig import RunConfiguration
from ml_service.pipelines.load_sample_data import create_sample_data_csv
from ml_service.util.attach_compute import get_compute
from ml_service.util.env_variables import Env
from ml_service.util.manage_environment import get_environment
from sklearn.datasets import load_diabetes
import pandas as pd
import os


Expand Down Expand Up @@ -57,14 +56,16 @@ def main():

# Check to see if dataset exists
if (dataset_name not in aml_workspace.datasets):
# Create dataset from diabetes sample data
sample_data = load_diabetes()
df = pd.DataFrame(
data=sample_data.data,
columns=sample_data.feature_names)
df['Y'] = sample_data.target
# This call creates an example CSV from sklearn sample data. If you
# have already bootstrapped your project, you can comment this line
# out and use your own CSV.
create_sample_data_csv()

# Use a CSV to read in the data set.
file_name = 'diabetes.csv'
df.to_csv(file_name, index=False)

if (not os.path.exists(file_name)):
raise Exception("Could not find CSV dataset at \"%s\". If you have bootstrapped your project, you will need to provide a CSV." % file_name) # NOQA: E501

# Upload file to default datastore in workspace
datatstore = Datastore.get(aml_workspace, datastore_name)
Expand Down
16 changes: 16 additions & 0 deletions ml_service/pipelines/load_sample_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@

import pandas as pd
from sklearn.datasets import load_diabetes


def create_sample_data_csv(file_name=None):
    """Write the sklearn diabetes sample data to ``diabetes.csv``.

    Loads the diabetes sample data from sklearn and produces a CSV file that
    can be used by the build/train pipeline script. The CSV has one column
    per sklearn feature name plus a ``Y`` column holding the target, and is
    written without the DataFrame index.

    Args:
        file_name: Accepted so callers that pass a name keep working, but
            intentionally ignored (see the comment on the output path
            below). It is optional so the function can also be called with
            no arguments.
    """
    sample_data = load_diabetes()
    df = pd.DataFrame(
        data=sample_data.data,
        columns=sample_data.feature_names)
    df['Y'] = sample_data.target
    # Hard code to diabetes so we fail fast if the project has been
    # bootstrapped.
    df.to_csv('diabetes.csv', index=False)