From d7a62f66b7fa13898ad5d7e48f42b9dd8602bb14 Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Thu, 31 Oct 2019 17:03:03 +0800 Subject: [PATCH 01/12] check pylint for nni_cmd --- tools/nni_cmd/command_utils.py | 2 +- tools/nni_cmd/common_utils.py | 10 +-- tools/nni_cmd/config_schema.py | 104 ++++++++++++++-------------- tools/nni_cmd/config_utils.py | 29 ++++---- tools/nni_cmd/constants.py | 5 +- tools/nni_cmd/launcher.py | 46 ++++++------ tools/nni_cmd/launcher_utils.py | 20 +++--- tools/nni_cmd/nnictl.py | 16 +++-- tools/nni_cmd/nnictl_utils.py | 66 +++++++++--------- tools/nni_cmd/package_management.py | 6 +- tools/nni_cmd/ssh_utils.py | 6 +- tools/nni_cmd/tensorboard_utils.py | 25 +++---- tools/nni_cmd/updater.py | 2 +- tools/nni_cmd/url_utils.py | 6 +- 14 files changed, 171 insertions(+), 172 deletions(-) diff --git a/tools/nni_cmd/command_utils.py b/tools/nni_cmd/command_utils.py index a3bcb81965..cf13f63eae 100644 --- a/tools/nni_cmd/command_utils.py +++ b/tools/nni_cmd/command_utils.py @@ -3,7 +3,7 @@ import os import signal import psutil -from .common_utils import print_error, print_normal, print_warning +from .common_utils import print_error def check_output_command(file_path, head=None, tail=None): diff --git a/tools/nni_cmd/common_utils.py b/tools/nni_cmd/common_utils.py index 3a5e909ca2..af0fe3efa6 100644 --- a/tools/nni_cmd/common_utils.py +++ b/tools/nni_cmd/common_utils.py @@ -21,10 +21,10 @@ import os import sys import json -import ruamel.yaml as yaml -import psutil import socket from pathlib import Path +import ruamel.yaml as yaml +import psutil from .constants import ERROR_INFO, NORMAL_INFO, WARNING_INFO, COLOR_RED_FORMAT, COLOR_YELLOW_FORMAT def get_yml_content(file_path): @@ -34,6 +34,7 @@ def get_yml_content(file_path): return yaml.load(file, Loader=yaml.Loader) except yaml.scanner.ScannerError as err: print_error('yaml file format error!') + print_error(err) exit(1) except Exception as exception: print_error(exception) @@ -46,6 +47,7 @@ def get_json_content(file_path): return json.load(file) except TypeError as err: print_error('json file format error!') + print_error(err) return None def print_error(content): @@ -70,7 +72,7 @@ def detect_process(pid): def detect_port(port): '''Detect if the port is used''' - socket_test = socket.socket(socket.AF_INET,socket.SOCK_STREAM) + socket_test = socket.socket(socket.AF_INET, socket.SOCK_STREAM) try: socket_test.connect(('127.0.0.1', int(port))) socket_test.close() @@ -79,7 +81,7 @@ def detect_port(port): return False def get_user(): - if sys.platform =='win32': + if sys.platform == 'win32': return os.environ['USERNAME'] else: return os.environ['USER'] diff --git a/tools/nni_cmd/config_schema.py b/tools/nni_cmd/config_schema.py index da943564fb..dded8d1e95 100644 --- a/tools/nni_cmd/config_schema.py +++ b/tools/nni_cmd/config_schema.py @@ -19,13 +19,13 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. import os -from schema import Schema, And, Use, Optional, Regex, Or +from schema import Schema, And, Optional, Regex, Or from .constants import SCHEMA_TYPE_ERROR, SCHEMA_RANGE_ERROR, SCHEMA_PATH_ERROR -def setType(key, type): +def setType(key, valueType): '''check key type''' - return And(type, error=SCHEMA_TYPE_ERROR % (key, type.__name__)) + return And(valueType, error=SCHEMA_TYPE_ERROR % (key, valueType.__name__)) def setChoice(key, *args): '''check choice''' @@ -47,7 +47,7 @@ def setPathCheck(key): 'experimentName': setType('experimentName', str), Optional('description'): setType('description', str), 'trialConcurrency': setNumberRange('trialConcurrency', int, 1, 99999), - Optional('maxExecDuration'): And(Regex(r'^[1-9][0-9]*[s|m|h|d]$',error='ERROR: maxExecDuration format is [digit]{s,m,h,d}')), + Optional('maxExecDuration'): And(Regex(r'^[1-9][0-9]*[s|m|h|d]$', error='ERROR: maxExecDuration format is [digit]{s,m,h,d}')), Optional('maxTrialNum'): setNumberRange('maxTrialNum', int, 1, 99999), 'trainingServicePlatform': setChoice('trainingServicePlatform', 'remote', 'local', 'pai', 'kubeflow', 'frameworkcontroller'), Optional('searchSpacePath'): And(os.path.exists, error=SCHEMA_PATH_ERROR % 'searchSpacePath'), @@ -106,7 +106,7 @@ def setPathCheck(key): 'builtinTunerName': 'NetworkMorphism', Optional('classArgs'): { Optional('optimize_mode'): setChoice('optimize_mode', 'maximize', 'minimize'), - Optional('task'): setChoice('task', 'cv','nlp','common'), + Optional('task'): setChoice('task', 'cv', 'nlp', 'common'), Optional('input_width'): setType('input_width', int), Optional('input_channel'): setType('input_channel', int), Optional('n_output_node'): setType('n_output_node', int), @@ -139,7 +139,7 @@ def setPathCheck(key): Optional('selection_num_warm_up'): setType('selection_num_warm_up', int), Optional('selection_num_starting_points'): setType('selection_num_starting_points', int), }, - Optional('includeIntermediateResults'): setType('includeIntermediateResults', bool), + Optional('includeIntermediateResults'): setType('includeIntermediateResults', bool), Optional('gpuIndices'): Or(int, And(str, lambda x: len([int(i) for i in x.split(',')]) > 0), error='gpuIndex format error!'), }, 'PPOTuner': { @@ -232,35 +232,35 @@ def setPathCheck(key): } common_trial_schema = { -'trial':{ - 'command': setType('command', str), - 'codeDir': setPathCheck('codeDir'), - Optional('gpuNum'): setNumberRange('gpuNum', int, 0, 99999), - Optional('nasMode'): setChoice('nasMode', 'classic_mode', 'enas_mode', 'oneshot_mode', 'darts_mode') + 'trial':{ + 'command': setType('command', str), + 'codeDir': setPathCheck('codeDir'), + Optional('gpuNum'): setNumberRange('gpuNum', int, 0, 99999), + Optional('nasMode'): setChoice('nasMode', 'classic_mode', 'enas_mode', 'oneshot_mode', 'darts_mode') } } pai_trial_schema = { -'trial':{ - 'command': setType('command', str), - 'codeDir': setPathCheck('codeDir'), - 'gpuNum': setNumberRange('gpuNum', int, 0, 99999), - 'cpuNum': setNumberRange('cpuNum', int, 0, 99999), - 'memoryMB': setType('memoryMB', int), - 'image': setType('image', str), - Optional('authFile'): And(os.path.exists, error=SCHEMA_PATH_ERROR % 'authFile'), - Optional('shmMB'): setType('shmMB', int), - Optional('dataDir'): And(Regex(r'hdfs://(([0-9]{1,3}.){3}[0-9]{1,3})(:[0-9]{2,5})?(/.*)?'),\ - error='ERROR: dataDir format error, dataDir format is hdfs://xxx.xxx.xxx.xxx:xxx'), - Optional('outputDir'): And(Regex(r'hdfs://(([0-9]{1,3}.){3}[0-9]{1,3})(:[0-9]{2,5})?(/.*)?'),\ - error='ERROR: outputDir format error, outputDir format is hdfs://xxx.xxx.xxx.xxx:xxx'), - Optional('virtualCluster'): setType('virtualCluster', str), - Optional('nasMode'): setChoice('nasMode', 'classic_mode', 'enas_mode', 'oneshot_mode', 'darts_mode'), - Optional('portList'): [{ - "label": setType('label', str), - "beginAt": setType('beginAt', int), - "portNumber": setType('portNumber', int) - }] + 'trial':{ + 'command': setType('command', str), + 'codeDir': setPathCheck('codeDir'), + 'gpuNum': setNumberRange('gpuNum', int, 0, 99999), + 'cpuNum': setNumberRange('cpuNum', int, 0, 99999), + 'memoryMB': setType('memoryMB', int), + 'image': setType('image', str), + Optional('authFile'): And(os.path.exists, error=SCHEMA_PATH_ERROR % 'authFile'), + Optional('shmMB'): setType('shmMB', int), + Optional('dataDir'): And(Regex(r'hdfs://(([0-9]{1,3}.){3}[0-9]{1,3})(:[0-9]{2,5})?(/.*)?'),\ + error='ERROR: dataDir format error, dataDir format is hdfs://xxx.xxx.xxx.xxx:xxx'), + Optional('outputDir'): And(Regex(r'hdfs://(([0-9]{1,3}.){3}[0-9]{1,3})(:[0-9]{2,5})?(/.*)?'),\ + error='ERROR: outputDir format error, outputDir format is hdfs://xxx.xxx.xxx.xxx:xxx'), + Optional('virtualCluster'): setType('virtualCluster', str), + Optional('nasMode'): setChoice('nasMode', 'classic_mode', 'enas_mode', 'oneshot_mode', 'darts_mode'), + Optional('portList'): [{ + "label": setType('label', str), + "beginAt": setType('beginAt', int), + "portNumber": setType('portNumber', int) + }] } } @@ -273,7 +273,7 @@ def setPathCheck(key): } kubeflow_trial_schema = { -'trial':{ + 'trial':{ 'codeDir': setPathCheck('codeDir'), Optional('nasMode'): setChoice('nasMode', 'classic_mode', 'enas_mode', 'oneshot_mode', 'darts_mode'), Optional('ps'): { @@ -315,7 +315,7 @@ def setPathCheck(key): 'server': setType('server', str), 'path': setType('path', str) } - },{ + }, { 'operator': setChoice('operator', 'tf-operator', 'pytorch-operator'), 'apiVersion': setType('apiVersion', str), Optional('storage'): setChoice('storage', 'nfs', 'azureStorage'), @@ -363,7 +363,7 @@ def setPathCheck(key): 'server': setType('server', str), 'path': setType('path', str) } - },{ + }, { Optional('storage'): setChoice('storage', 'nfs', 'azureStorage'), Optional('serviceAccountName'): setType('serviceAccountName', str), 'keyVault': { @@ -383,24 +383,24 @@ def setPathCheck(key): } machine_list_schema = { -Optional('machineList'):[Or({ - 'ip': setType('ip', str), - Optional('port'): setNumberRange('port', int, 1, 65535), - 'username': setType('username', str), - 'passwd': setType('passwd', str), - Optional('gpuIndices'): Or(int, And(str, lambda x: len([int(i) for i in x.split(',')]) > 0), error='gpuIndex format error!'), - Optional('maxTrialNumPerGpu'): setType('maxTrialNumPerGpu', int), - Optional('useActiveGpu'): setType('useActiveGpu', bool) - },{ - 'ip': setType('ip', str), - Optional('port'): setNumberRange('port', int, 1, 65535), - 'username': setType('username', str), - 'sshKeyPath': setPathCheck('sshKeyPath'), - Optional('passphrase'): setType('passphrase', str), - Optional('gpuIndices'): Or(int, And(str, lambda x: len([int(i) for i in x.split(',')]) > 0), error='gpuIndex format error!'), - Optional('maxTrialNumPerGpu'): setType('maxTrialNumPerGpu', int), - Optional('useActiveGpu'): setType('useActiveGpu', bool) -})] + Optional('machineList'):[Or({ + 'ip': setType('ip', str), + Optional('port'): setNumberRange('port', int, 1, 65535), + 'username': setType('username', str), + 'passwd': setType('passwd', str), + Optional('gpuIndices'): Or(int, And(str, lambda x: len([int(i) for i in x.split(',')]) > 0), error='gpuIndex format error!'), + Optional('maxTrialNumPerGpu'): setType('maxTrialNumPerGpu', int), + Optional('useActiveGpu'): setType('useActiveGpu', bool) + }, { + 'ip': setType('ip', str), + Optional('port'): setNumberRange('port', int, 1, 65535), + 'username': setType('username', str), + 'sshKeyPath': setPathCheck('sshKeyPath'), + Optional('passphrase'): setType('passphrase', str), + Optional('gpuIndices'): Or(int, And(str, lambda x: len([int(i) for i in x.split(',')]) > 0), error='gpuIndex format error!'), + Optional('maxTrialNumPerGpu'): setType('maxTrialNumPerGpu', int), + Optional('useActiveGpu'): setType('useActiveGpu', bool) + })] } LOCAL_CONFIG_SCHEMA = Schema({**common_schema, **common_trial_schema}) diff --git a/tools/nni_cmd/config_utils.py b/tools/nni_cmd/config_utils.py index 6b2b8a0cc0..c5a36b374d 100644 --- a/tools/nni_cmd/config_utils.py +++ b/tools/nni_cmd/config_utils.py @@ -21,7 +21,6 @@ import os import json -import shutil from .constants import NNICTL_HOME_DIR class Config: @@ -73,29 +72,29 @@ def __init__(self): self.experiment_file = os.path.join(NNICTL_HOME_DIR, '.experiment') self.experiments = self.read_file() - def add_experiment(self, id, port, time, file_name, platform): + def add_experiment(self, expId, port, time, file_name, platform): '''set {key:value} paris to self.experiment''' - self.experiments[id] = {} - self.experiments[id]['port'] = port - self.experiments[id]['startTime'] = time - self.experiments[id]['endTime'] = 'N/A' - self.experiments[id]['status'] = 'INITIALIZED' - self.experiments[id]['fileName'] = file_name - self.experiments[id]['platform'] = platform + self.experiments[expId] = {} + self.experiments[expId]['port'] = port + self.experiments[expId]['startTime'] = time + self.experiments[expId]['endTime'] = 'N/A' + self.experiments[expId]['status'] = 'INITIALIZED' + self.experiments[expId]['fileName'] = file_name + self.experiments[expId]['platform'] = platform self.write_file() - def update_experiment(self, id, key, value): + def update_experiment(self, expId, key, value): '''Update experiment''' if id not in self.experiments: return False - self.experiments[id][key] = value + self.experiments[expId][key] = value self.write_file() return True - def remove_experiment(self, id): + def remove_experiment(self, expId): '''remove an experiment by id''' if id in self.experiments: - self.experiments.pop(id) + self.experiments.pop(expId) self.write_file() def get_all_experiments(self): @@ -109,7 +108,7 @@ def write_file(self): json.dump(self.experiments, file) except IOError as error: print('Error:', error) - return + return '' def read_file(self): '''load config from local file''' @@ -119,4 +118,4 @@ def read_file(self): return json.load(file) except ValueError: return {} - return {} + return {} diff --git a/tools/nni_cmd/constants.py b/tools/nni_cmd/constants.py index d22a509c46..0777d2db98 100644 --- a/tools/nni_cmd/constants.py +++ b/tools/nni_cmd/constants.py @@ -21,7 +21,7 @@ import os from colorama import Fore -NNICTL_HOME_DIR = os.path.join(os.path.expanduser('~'), '.local', 'nnictl') +NNICTL_HOME_DIR = os.path.join(os.path.expanduser('~'), '.local', 'nnictl') ERROR_INFO = 'ERROR: %s' @@ -58,7 +58,8 @@ '-----------------------------------------------------------------------\n' EXPERIMENT_START_FAILED_INFO = 'There is an experiment running in the port %d, please stop it first or set another port!\n' \ - 'You could use \'nnictl stop --port [PORT]\' command to stop an experiment!\nOr you could use \'nnictl create --config [CONFIG_PATH] --port [PORT]\' to set port!\n' + 'You could use \'nnictl stop --port [PORT]\' command to stop an experiment!\nOr you could ' \ + 'use \'nnictl create --config [CONFIG_PATH] --port [PORT]\' to set port!\n' EXPERIMENT_INFORMATION_FORMAT = '----------------------------------------------------------------------------------------\n' \ ' Experiment information\n' \ diff --git a/tools/nni_cmd/launcher.py b/tools/nni_cmd/launcher.py index e2fac2cb42..f99f8dfe43 100644 --- a/tools/nni_cmd/launcher.py +++ b/tools/nni_cmd/launcher.py @@ -22,22 +22,21 @@ import json import os import sys -import shutil import string -from subprocess import Popen, PIPE, call, check_output, check_call, CalledProcessError +import random +import site +import time import tempfile +from subprocess import Popen, check_call, CalledProcessError +from nni_annotation import expand_annotations, generate_search_space from nni.constants import ModuleName, AdvisorModuleName -from nni_annotation import * from .launcher_utils import validate_all_content -from .rest_utils import rest_put, rest_post, check_rest_server, check_rest_server_quick, check_response +from .rest_utils import rest_put, rest_post, check_rest_server, check_response from .url_utils import cluster_metadata_url, experiment_url, get_local_urls from .config_utils import Config, Experiments -from .common_utils import get_yml_content, get_json_content, print_error, print_normal, print_warning, detect_process, detect_port, get_user, get_python_dir -from .constants import * -import random -import site -import time -from pathlib import Path +from .common_utils import get_yml_content, get_json_content, print_error, print_normal, \ + detect_port, get_user, get_python_dir +from .constants import NNICTL_HOME_DIR, ERROR_INFO, REST_TIME_OUT, EXPERIMENT_SUCCESS_INFO, LOG_HEADER, PACKAGE_REQUIREMENTS from .command_utils import check_output_command, kill_command from .nnictl_utils import update_experiment @@ -83,7 +82,8 @@ def _generate_installation_path(sitepackages_path): python_dir = os.getenv('VIRTUAL_ENV') else: python_sitepackage = site.getsitepackages()[0] - # If system-wide python is used, we will give priority to using `local sitepackage`--"usersitepackages()" given that nni exists there + # If system-wide python is used, we will give priority to using `local sitepackage`--"usersitepackages()" given + # that nni exists there if python_sitepackage.startswith('/usr') or python_sitepackage.startswith('/Library'): python_dir = try_installation_path_sequentially(site.getusersitepackages(), site.getsitepackages()[0]) else: @@ -98,7 +98,6 @@ def _generate_installation_path(sitepackages_path): def start_rest_server(port, platform, mode, config_file_name, experiment_id=None, log_dir=None, log_level=None): '''Run nni manager process''' - nni_config = Config(config_file_name) if detect_port(port): print_error('Port %s is used by another process, please reset the port!\n' \ 'You could use \'nnictl create --help\' to get help information' % port) @@ -114,7 +113,7 @@ def start_rest_server(port, platform, mode, config_file_name, experiment_id=None entry_dir = get_nni_installation_path() entry_file = os.path.join(entry_dir, 'main.js') - + node_command = 'node' if sys.platform == 'win32': node_command = os.path.join(entry_dir[:-3], 'Scripts', 'node.exe') @@ -132,7 +131,7 @@ def start_rest_server(port, platform, mode, config_file_name, experiment_id=None cmds += ['--experiment_id', experiment_id] stdout_full_path, stderr_full_path = get_log_path(config_file_name) with open(stdout_full_path, 'a+') as stdout_file, open(stderr_full_path, 'a+') as stderr_file: - time_now = time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())) + time_now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) #add time information in the header of log files log_header = LOG_HEADER % str(time_now) stdout_file.write(log_header) @@ -212,7 +211,7 @@ def setNNIManagerIp(experiment_config, port, config_file_name): if experiment_config.get('nniManagerIp') is None: return True, None ip_config_dict = dict() - ip_config_dict['nni_manager_ip'] = { 'nniManagerIp' : experiment_config['nniManagerIp'] } + ip_config_dict['nni_manager_ip'] = {'nniManagerIp': experiment_config['nniManagerIp']} response = rest_put(cluster_metadata_url(port), json.dumps(ip_config_dict), REST_TIME_OUT) err_message = None if not response or not response.status_code == 200: @@ -403,11 +402,12 @@ def launch_experiment(args, experiment_config, mode, config_file_name, experimen stdout_full_path, stderr_full_path = get_log_path(config_file_name) with open(stdout_full_path, 'a+') as stdout_file, open(stderr_full_path, 'a+') as stderr_file: check_call([sys.executable, '-c', 'import %s'%(module_name)], stdout=stdout_file, stderr=stderr_file) - except CalledProcessError as e: + except CalledProcessError: print_error('some errors happen when import package %s.' %(package_name)) print_log_content(config_file_name) if package_name in PACKAGE_REQUIREMENTS: - print_error('If %s is not installed, it should be installed through \'nnictl package install --name %s\''%(package_name, package_name)) + print_error('If %s is not installed, it should be installed through '\ + '\'nnictl package install --name %s\''%(package_name, package_name)) exit(1) log_dir = experiment_config['logDir'] if experiment_config.get('logDir') else None log_level = experiment_config['logLevel'] if experiment_config.get('logLevel') else None @@ -416,7 +416,8 @@ def launch_experiment(args, experiment_config, mode, config_file_name, experimen if log_level not in ['trace', 'debug'] and (args.debug or experiment_config.get('debug') is True): log_level = 'debug' # start rest server - rest_process, start_time = start_rest_server(args.port, experiment_config['trainingServicePlatform'], mode, config_file_name, experiment_id, log_dir, log_level) + rest_process, start_time = start_rest_server(args.port, experiment_config['trainingServicePlatform'], \ + mode, config_file_name, experiment_id, log_dir, log_level) nni_config.set_config('restServerPid', rest_process.pid) # Deal with annotation if experiment_config.get('useAnnotation'): @@ -450,8 +451,9 @@ def launch_experiment(args, experiment_config, mode, config_file_name, experimen exit(1) if mode != 'view': # set platform configuration - set_platform_config(experiment_config['trainingServicePlatform'], experiment_config, args.port, config_file_name, rest_process) - + set_platform_config(experiment_config['trainingServicePlatform'], experiment_config, args.port,\ + config_file_name, rest_process) + # start a new experiment print_normal('Starting experiment...') # set debug configuration @@ -478,7 +480,8 @@ def launch_experiment(args, experiment_config, mode, config_file_name, experimen #save experiment information nnictl_experiment_config = Experiments() - nnictl_experiment_config.add_experiment(experiment_id, args.port, start_time, config_file_name, experiment_config['trainingServicePlatform']) + nnictl_experiment_config.add_experiment(experiment_id, args.port, start_time, config_file_name,\ + experiment_config['trainingServicePlatform']) print_normal(EXPERIMENT_SUCCESS_INFO % (experiment_id, ' '.join(web_ui_url_list))) @@ -503,7 +506,6 @@ def manage_stopped_experiment(args, mode): experiment_config = Experiments() experiment_dict = experiment_config.get_all_experiments() experiment_id = None - experiment_endTime = None #find the latest stopped experiment if not args.id: print_error('Please set experiment id! \nYou could use \'nnictl {0} {id}\' to {0} a stopped experiment!\n' \ diff --git a/tools/nni_cmd/launcher_utils.py b/tools/nni_cmd/launcher_utils.py index da6a668064..f6c849abab 100644 --- a/tools/nni_cmd/launcher_utils.py +++ b/tools/nni_cmd/launcher_utils.py @@ -20,11 +20,11 @@ import os import json -from .config_schema import LOCAL_CONFIG_SCHEMA, REMOTE_CONFIG_SCHEMA, PAI_CONFIG_SCHEMA, KUBEFLOW_CONFIG_SCHEMA, FRAMEWORKCONTROLLER_CONFIG_SCHEMA, \ -tuner_schema_dict, advisor_schema_dict, assessor_schema_dict -from schema import SchemaMissingKeyError, SchemaForbiddenKeyError, SchemaUnexpectedTypeError, SchemaWrongKeyError, SchemaError -from .common_utils import get_json_content, print_error, print_warning, print_normal -from schema import Schema, And, Use, Optional, Regex, Or +from schema import SchemaError +from schema import Schema +from .config_schema import LOCAL_CONFIG_SCHEMA, REMOTE_CONFIG_SCHEMA, PAI_CONFIG_SCHEMA, KUBEFLOW_CONFIG_SCHEMA,\ + FRAMEWORKCONTROLLER_CONFIG_SCHEMA, tuner_schema_dict, advisor_schema_dict, assessor_schema_dict +from .common_utils import print_error, print_warning, print_normal def expand_path(experiment_config, key): '''Change '~' to user home directory''' @@ -164,11 +164,11 @@ def validate_common_content(experiment_config): print_error('Please set correct trainingServicePlatform!') exit(1) schema_dict = { - 'local': LOCAL_CONFIG_SCHEMA, - 'remote': REMOTE_CONFIG_SCHEMA, - 'pai': PAI_CONFIG_SCHEMA, - 'kubeflow': KUBEFLOW_CONFIG_SCHEMA, - 'frameworkcontroller': FRAMEWORKCONTROLLER_CONFIG_SCHEMA + 'local': LOCAL_CONFIG_SCHEMA, + 'remote': REMOTE_CONFIG_SCHEMA, + 'pai': PAI_CONFIG_SCHEMA, + 'kubeflow': KUBEFLOW_CONFIG_SCHEMA, + 'frameworkcontroller': FRAMEWORKCONTROLLER_CONFIG_SCHEMA } separate_schema_dict = { 'tuner': tuner_schema_dict, diff --git a/tools/nni_cmd/nnictl.py b/tools/nni_cmd/nnictl.py index 8da30fdfb7..88ee311423 100644 --- a/tools/nni_cmd/nnictl.py +++ b/tools/nni_cmd/nnictl.py @@ -20,14 +20,18 @@ import argparse +import os import pkg_resources +from colorama import init +from .common_utils import print_error from .launcher import create_experiment, resume_experiment, view_experiment from .updater import update_searchspace, update_concurrency, update_duration, update_trialnum, import_data -from .nnictl_utils import * -from .package_management import * -from .constants import * -from .tensorboard_utils import * -from colorama import init +from .nnictl_utils import stop_experiment, trial_ls, trial_kill, list_experiment, experiment_status,\ + log_trial, experiment_clean, platform_clean, experiment_list, \ + monitor_experiment, export_trials_data, trial_codegen, webui_url, get_config, log_stdout, log_stderr +from .package_management import package_install, package_show +from .constants import DEFAULT_REST_PORT +from .tensorboard_utils import start_tensorboard, stop_tensorboard init(autoreset=True) if os.environ.get('COVERAGE_PROCESS_START'): @@ -38,7 +42,7 @@ def nni_info(*args): if args[0].version: try: print(pkg_resources.get_distribution('nni').version) - except pkg_resources.ResolutionError as err: + except pkg_resources.ResolutionError: print_error('Get version failed, please use `pip3 list | grep nni` to check nni version!') else: print('please run "nnictl {positional argument} --help" to see nnictl guidance') diff --git a/tools/nni_cmd/nnictl_utils.py b/tools/nni_cmd/nnictl_utils.py index b6fada56e8..4cadce182d 100644 --- a/tools/nni_cmd/nnictl_utils.py +++ b/tools/nni_cmd/nnictl_utils.py @@ -20,15 +20,13 @@ import csv import os -import psutil import json -from datetime import datetime, timezone import time import re -from pathlib import Path -from pyhdfs import HdfsClient, HdfsFileNotFoundException import shutil -from subprocess import call, check_output +from datetime import datetime, timezone +from pathlib import Path +from pyhdfs import HdfsClient from nni_annotation import expand_annotations from .rest_utils import rest_get, rest_delete, check_rest_server_quick, check_response from .url_utils import trial_jobs_url, experiment_url, trial_job_id_url, export_data_url @@ -102,7 +100,8 @@ def check_experiment_id(args, update=True): experiment_information = "" for key in running_experiment_list: experiment_information += (EXPERIMENT_DETAIL_FORMAT % (key, experiment_dict[key]['status'], \ - experiment_dict[key]['port'], experiment_dict[key].get('platform'), experiment_dict[key]['startTime'], experiment_dict[key]['endTime'])) + experiment_dict[key]['port'], experiment_dict[key].get('platform'), experiment_dict[key]['startTime'],\ + experiment_dict[key]['endTime'])) print(EXPERIMENT_INFORMATION_FORMAT % experiment_information) exit(1) elif not running_experiment_list: @@ -157,23 +156,24 @@ def parse_ids(args): experiment_information = "" for key in running_experiment_list: experiment_information += (EXPERIMENT_DETAIL_FORMAT % (key, experiment_dict[key]['status'], \ - experiment_dict[key]['port'], experiment_dict[key].get('platform'), experiment_dict[key]['startTime'], experiment_dict[key]['endTime'])) + experiment_dict[key]['port'], experiment_dict[key].get('platform'), experiment_dict[key]['startTime'], \ + experiment_dict[key]['endTime'])) print(EXPERIMENT_INFORMATION_FORMAT % experiment_information) exit(1) else: result_list = running_experiment_list elif args.id.endswith('*'): - for id in running_experiment_list: - if id.startswith(args.id[:-1]): - result_list.append(id) + for expId in running_experiment_list: + if expId.startswith(args.id[:-1]): + result_list.append(expId) elif args.id in running_experiment_list: result_list.append(args.id) else: - for id in running_experiment_list: - if id.startswith(args.id): - result_list.append(id) + for expId in running_experiment_list: + if expId.startswith(args.id): + result_list.append(expId) if len(result_list) > 1: - print_error(args.id + ' is ambiguous, please choose ' + ' '.join(result_list) ) + print_error(args.id + ' is ambiguous, please choose ' + ' '.join(result_list)) return None if not result_list and (args.id or args.port): print_error('There are no experiments matched, please set correct experiment id or restful server port') @@ -235,7 +235,6 @@ def stop_experiment(args): for experiment_id in experiment_id_list: print_normal('Stoping experiment %s' % experiment_id) nni_config = Config(experiment_dict[experiment_id]['fileName']) - rest_port = nni_config.get_config('restServerPort') rest_pid = nni_config.get_config('restServerPid') if rest_pid: kill_command(rest_pid) @@ -249,7 +248,7 @@ def stop_experiment(args): nni_config.set_config('tensorboardPidList', []) print_normal('Stop experiment success.') experiment_config.update_experiment(experiment_id, 'status', 'STOPPED') - time_now = time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())) + time_now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) experiment_config.update_experiment(experiment_id, 'endTime', str(time_now)) def trial_ls(args): @@ -401,9 +400,9 @@ def local_clean(directory): print_normal('removing folder {0}'.format(directory)) try: shutil.rmtree(directory) - except FileNotFoundError as err: + except FileNotFoundError: print_error('{0} does not exist.'.format(directory)) - + def remote_clean(machine_list, experiment_id=None): '''clean up remote data''' for machine in machine_list: @@ -418,7 +417,7 @@ def remote_clean(machine_list, experiment_id=None): sftp = create_ssh_sftp_client(host, port, userName, passwd) print_normal('removing folder {0}'.format(host + ':' + str(port) + remote_dir)) remove_remote_directory(sftp, remote_dir) - + def hdfs_clean(host, user_name, output_dir, experiment_id=None): '''clean up hdfs data''' hdfs_client = HdfsClient(hosts='{0}:80'.format(host), user_name=user_name, webhdfs_path='/webhdfs/api/v1', timeout=5) @@ -475,7 +474,7 @@ def experiment_clean(args): machine_list = nni_config.get_config('experimentConfig').get('machineList') remote_clean(machine_list, experiment_id) elif platform == 'pai': - host = nni_config.get_config('experimentConfig').get('paiConfig').get('host') + host = nni_config.get_config('experimentConfig').get('paiConfig').get('host') user_name = nni_config.get_config('experimentConfig').get('paiConfig').get('userName') output_dir = nni_config.get_config('experimentConfig').get('trial').get('outputDir') hdfs_clean(host, user_name, output_dir, experiment_id) @@ -492,7 +491,7 @@ def experiment_clean(args): experiment_config = Experiments() print_normal('removing metadata of experiment {0}'.format(experiment_id)) experiment_config.remove_experiment(experiment_id) - print_normal('Done.') + print_normal('Done.') def get_platform_dir(config_content): '''get the dir list to be deleted''' @@ -505,8 +504,7 @@ def get_platform_dir(config_content): port = machine.get('port') dir_list.append(host + ':' + str(port) + '/tmp/nni') elif platform == 'pai': - pai_config = config_content.get('paiConfig') - host = config_content.get('paiConfig').get('host') + host = config_content.get('paiConfig').get('host') user_name = config_content.get('paiConfig').get('userName') output_dir = config_content.get('trial').get('outputDir') dir_list.append('server: {0}, path: {1}/nni'.format(host, user_name)) @@ -529,17 +527,15 @@ def platform_clean(args): print_normal('platform {0} not supported.'.format(platform)) exit(0) update_experiment() - experiment_config = Experiments() - experiment_dict = experiment_config.get_all_experiments() - id_list = list(experiment_dict.keys()) dir_list = get_platform_dir(config_content) if not dir_list: print_normal('No folder of NNI caches is found.') exit(1) while True: - print_normal('This command will remove below folders of NNI caches. If other users are using experiments on below hosts, it will be broken.') - for dir in dir_list: - print(' ' + dir) + print_normal('This command will remove below folders of NNI caches. If other users are using experiments' \ + ' on below hosts, it will be broken.') + for value in dir_list: + print(' ' + value) inputs = input('INFO: do you want to continue?[y/N]:') if not inputs.lower() or inputs.lower() in ['n', 'no']: exit(0) @@ -549,11 +545,9 @@ def platform_clean(args): break if platform == 'remote': machine_list = config_content.get('machineList') - for machine in machine_list: - remote_clean(machine_list, None) + remote_clean(machine_list, None) elif platform == 'pai': - pai_config = config_content.get('paiConfig') - host = config_content.get('paiConfig').get('host') + host = config_content.get('paiConfig').get('host') user_name = config_content.get('paiConfig').get('userName') output_dir = config_content.get('trial').get('outputDir') hdfs_clean(host, user_name, output_dir, None) @@ -618,7 +612,8 @@ def show_experiment_info(): return for key in experiment_id_list: print(EXPERIMENT_MONITOR_INFO % (key, experiment_dict[key]['status'], experiment_dict[key]['port'], \ - experiment_dict[key].get('platform'), experiment_dict[key]['startTime'], get_time_interval(experiment_dict[key]['startTime'], experiment_dict[key]['endTime']))) + experiment_dict[key].get('platform'), experiment_dict[key]['startTime'], \ + get_time_interval(experiment_dict[key]['startTime'], experiment_dict[key]['endTime']))) print(TRIAL_MONITOR_HEAD) running, response = check_rest_server_quick(experiment_dict[key]['port']) if running: @@ -627,7 +622,8 @@ def show_experiment_info(): content = json.loads(response.text) for index, value in enumerate(content): content[index] = convert_time_stamp_to_date(value) - print(TRIAL_MONITOR_CONTENT % (content[index].get('id'), content[index].get('startTime'), content[index].get('endTime'), content[index].get('status'))) + print(TRIAL_MONITOR_CONTENT % (content[index].get('id'), content[index].get('startTime'), \ + content[index].get('endTime'), content[index].get('status'))) print(TRIAL_MONITOR_TAIL) def monitor_experiment(args): diff --git a/tools/nni_cmd/package_management.py b/tools/nni_cmd/package_management.py index de8dbe62ec..32ed79496d 100644 --- a/tools/nni_cmd/package_management.py +++ b/tools/nni_cmd/package_management.py @@ -18,12 +18,10 @@ # DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -import nni import os -import sys -from subprocess import call +import nni from .constants import PACKAGE_REQUIREMENTS -from .common_utils import print_normal, print_error +from .common_utils import print_error from .command_utils import install_requirements_command def process_install(package_name): diff --git a/tools/nni_cmd/ssh_utils.py b/tools/nni_cmd/ssh_utils.py index da707dac48..7453830323 100644 --- a/tools/nni_cmd/ssh_utils.py +++ b/tools/nni_cmd/ssh_utils.py @@ -20,7 +20,6 @@ import os from .common_utils import print_error -from subprocess import call from .command_utils import install_package_command def check_environment(): @@ -29,6 +28,8 @@ def check_environment(): import paramiko except: install_package_command('paramiko') + import paramiko + return paramiko def copy_remote_directory_to_local(sftp, remote_path, local_path): '''copy remote directory to local machine''' @@ -49,8 +50,7 @@ def copy_remote_directory_to_local(sftp, remote_path, local_path): def create_ssh_sftp_client(host_ip, port, username, password): '''create ssh client''' try: - check_environment() - import paramiko + paramiko = check_environment() conn = paramiko.Transport(host_ip, port) conn.connect(username=username, password=password) sftp = paramiko.SFTPClient.from_transport(conn) diff --git a/tools/nni_cmd/tensorboard_utils.py b/tools/nni_cmd/tensorboard_utils.py index b4578c34b0..9646b4de0e 100644 --- a/tools/nni_cmd/tensorboard_utils.py +++ b/tools/nni_cmd/tensorboard_utils.py @@ -19,21 +19,17 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. import os -import psutil import json -import datetime -import time -from subprocess import call, check_output, Popen, PIPE -from .rest_utils import rest_get, rest_delete, check_rest_server_quick, check_response -from .config_utils import Config, Experiments -from .url_utils import trial_jobs_url, experiment_url, trial_job_id_url, get_local_urls -from .constants import NNICTL_HOME_DIR, EXPERIMENT_INFORMATION_FORMAT, EXPERIMENT_DETAIL_FORMAT, COLOR_GREEN_FORMAT -import time -from .common_utils import print_normal, print_error, print_warning, detect_process, detect_port -from .nnictl_utils import * import re -from .ssh_utils import create_ssh_sftp_client, copy_remote_directory_to_local import tempfile +from subprocess import call, Popen +from .rest_utils import rest_get, check_rest_server_quick, check_response +from .config_utils import Config, Experiments +from .url_utils import trial_jobs_url, get_local_urls +from .constants import COLOR_GREEN_FORMAT, REST_TIME_OUT +from .common_utils import print_normal, print_error, detect_process, detect_port +from .nnictl_utils import check_experiment_id, check_experiment_id +from .ssh_utils import create_ssh_sftp_client, copy_remote_directory_to_local def parse_log_path(args, trial_content): '''parse log path''' @@ -43,7 +39,7 @@ def parse_log_path(args, trial_content): if args.trial_id and args.trial_id != 'all' and trial.get('id') != args.trial_id: continue pattern = r'(?P.+)://(?P.+):(?P.*)' - match = re.search(pattern,trial['logPath']) + match = re.search(pattern, trial['logPath']) if match: path_list.append(match.group('path')) host_list.append(match.group('host')) @@ -94,7 +90,8 @@ def start_tensorboard_process(args, nni_config, path_list, temp_nni_path): if detect_port(args.port): print_error('Port %s is used by another process, please reset port!' % str(args.port)) exit(1) - with open(os.path.join(temp_nni_path, 'tensorboard_stdout'), 'a+') as stdout_file, open(os.path.join(temp_nni_path, 'tensorboard_stderr'), 'a+') as stderr_file: + with open(os.path.join(temp_nni_path, 'tensorboard_stdout'), 'a+') as stdout_file, \ + open(os.path.join(temp_nni_path, 'tensorboard_stderr'), 'a+') as stderr_file: cmds = ['tensorboard', '--logdir', format_tensorboard_log_path(path_list), '--port', str(args.port)] tensorboard_process = Popen(cmds, stdout=stdout_file, stderr=stderr_file) url_list = get_local_urls(args.port) diff --git a/tools/nni_cmd/updater.py b/tools/nni_cmd/updater.py index 9258d73f0a..07ae6123cb 100644 --- a/tools/nni_cmd/updater.py +++ b/tools/nni_cmd/updater.py @@ -25,7 +25,7 @@ from .url_utils import experiment_url, import_data_url from .config_utils import Config from .common_utils import get_json_content, print_normal, print_error, print_warning -from .nnictl_utils import check_experiment_id, get_experiment_port, get_config_filename +from .nnictl_utils import get_experiment_port, get_config_filename from .launcher_utils import parse_time from .constants import REST_TIME_OUT, TUNERS_SUPPORTING_IMPORT_DATA, TUNERS_NO_NEED_TO_IMPORT_DATA diff --git a/tools/nni_cmd/url_utils.py b/tools/nni_cmd/url_utils.py index c50b2551d2..05cfa8e66f 100644 --- a/tools/nni_cmd/url_utils.py +++ b/tools/nni_cmd/url_utils.py @@ -18,8 +18,8 @@ # DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +import socket import psutil -from socket import AddressFamily BASE_URL = 'http://localhost' @@ -83,8 +83,8 @@ def tensorboard_url(port): def get_local_urls(port): '''get urls of local machine''' url_list = [] - for name, info in psutil.net_if_addrs().items(): + for _, info in psutil.net_if_addrs().items(): for addr in info: - if AddressFamily.AF_INET == addr.family: + if socket.AddressFamily.AF_INET == addr.family: url_list.append('http://{}:{}'.format(addr.address, port)) return url_list From e259d109fea97fbce6f81b3081390fcb99d594fa Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Thu, 31 Oct 2019 17:55:41 +0800 Subject: [PATCH 02/12] fix id error --- tools/nni_cmd/config_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/nni_cmd/config_utils.py b/tools/nni_cmd/config_utils.py index c5a36b374d..c7c88bcf3e 100644 --- a/tools/nni_cmd/config_utils.py +++ b/tools/nni_cmd/config_utils.py @@ -85,7 +85,7 @@ def add_experiment(self, expId, port, time, file_name, platform): def update_experiment(self, expId, key, value): '''Update experiment''' - if id not in self.experiments: + if expId not in self.experiments: return False self.experiments[expId][key] = value self.write_file() From 92c2ce7ef6eb231cc16cb4db020faffe40ac924a Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Thu, 16 Jan 2020 17:26:35 +0800 Subject: [PATCH 03/12] add merge config --- .../rest_server/restValidationSchemas.ts | 1 + .../training_service/pai/paiK8S/paiK8SConfig.ts | 4 +++- .../pai/paiK8S/paiK8STrainingService.ts | 13 +++++++++++-- tools/nni_cmd/config_schema.py | 3 ++- tools/nni_cmd/launcher_utils.py | 9 ++++++++- 5 files changed, 25 insertions(+), 5 deletions(-) diff --git a/src/nni_manager/rest_server/restValidationSchemas.ts b/src/nni_manager/rest_server/restValidationSchemas.ts index a9ad8cfd9a..c7fa694fb9 100644 --- a/src/nni_manager/rest_server/restValidationSchemas.ts +++ b/src/nni_manager/rest_server/restValidationSchemas.ts @@ -38,6 +38,7 @@ export namespace ValidationSchemas { authFile: joi.string(), nniManagerNFSMountPath: joi.string().min(1), containerNFSMountPath: joi.string().min(1), + paiConfigPath: joi.string(), paiStoragePlugin: joi.string().min(1), nasMode: joi.string().valid('classic_mode', 'enas_mode', 'oneshot_mode', 'darts_mode'), portList: joi.array().items(joi.object({ diff --git a/src/nni_manager/training_service/pai/paiK8S/paiK8SConfig.ts b/src/nni_manager/training_service/pai/paiK8S/paiK8SConfig.ts index 70f175683e..26ad2901bd 100644 --- a/src/nni_manager/training_service/pai/paiK8S/paiK8SConfig.ts +++ b/src/nni_manager/training_service/pai/paiK8S/paiK8SConfig.ts @@ -31,10 +31,11 @@ export class NNIPAIK8STrialConfig extends TrialConfig { public readonly nniManagerNFSMountPath: string; public readonly containerNFSMountPath: string; public readonly paiStoragePlugin: string; + public readonly paiConfigPath?: string; constructor(command: string, codeDir: string, gpuNum: number, cpuNum: number, memoryMB: number, image: string, nniManagerNFSMountPath: string, containerNFSMountPath: string, - paiStoragePlugin: string, virtualCluster?: string) { + paiStoragePlugin: string, virtualCluster?: string, paiConfigPath?: string) { super(command, codeDir, gpuNum); this.cpuNum = cpuNum; this.memoryMB = memoryMB; @@ -43,5 +44,6 @@ export class NNIPAIK8STrialConfig extends TrialConfig { this.nniManagerNFSMountPath = nniManagerNFSMountPath; this.containerNFSMountPath = containerNFSMountPath; this.paiStoragePlugin = paiStoragePlugin; + this.paiConfigPath = paiConfigPath; } } diff --git a/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts b/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts index fc64d4dbdc..bd83f32747 100644 --- a/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts +++ b/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts @@ -185,7 +185,16 @@ class PAIK8STrainingService extends PAITrainingService { } } - return yaml.safeDump(paiJobConfig); + if (this.paiTrialConfig.paiConfigPath) { + try { + const additionalPAIConfig = yaml.safeLoad(fs.readFileSync(this.paiTrialConfig.paiConfigPath, 'utf8')); + return yaml.safeDump({...paiJobConfig, ...additionalPAIConfig}); + } catch (error) { + this.log.error(`Loading ${this.paiTrialConfig.paiConfigPath} error: ${error}`); + } + } else { + return yaml.safeDump(paiJobConfig); + } } protected async submitTrialJobToPAI(trialJobId: string): Promise { @@ -254,7 +263,7 @@ class PAIK8STrainingService extends PAITrainingService { this.log.info(`nniPAItrial command is ${nniPaiTrialCommand.trim()}`); const paiJobConfig = this.generateJobConfigInYamlFormat(trialJobId, nniPaiTrialCommand); - + this.log.debug(paiJobConfig); // Step 3. Submit PAI job via Rest call // Refer https://github.com/Microsoft/pai/blob/master/docs/rest-server/API.md for more detail about PAI Rest API const submitJobRequest: request.Options = { diff --git a/tools/nni_cmd/config_schema.py b/tools/nni_cmd/config_schema.py index 8017946ce9..d077a968d3 100644 --- a/tools/nni_cmd/config_schema.py +++ b/tools/nni_cmd/config_schema.py @@ -280,7 +280,8 @@ def setPathCheck(key): Optional('virtualCluster'): setType('virtualCluster', str), 'nniManagerNFSMountPath': setPathCheck('nniManagerNFSMountPath'), 'containerNFSMountPath': setType('containerNFSMountPath', str), - 'paiStoragePlugin': setType('paiStoragePlugin', str) + 'paiStoragePlugin': setType('paiStoragePlugin', str), + Optional('paiConfigPath'): And(os.path.exists, error=SCHEMA_PATH_ERROR % 'paiConfigPath') } } diff --git a/tools/nni_cmd/launcher_utils.py b/tools/nni_cmd/launcher_utils.py index f2d2c1d22f..c1aeb8bfc4 100644 --- a/tools/nni_cmd/launcher_utils.py +++ b/tools/nni_cmd/launcher_utils.py @@ -7,7 +7,7 @@ from schema import Schema from .config_schema import LOCAL_CONFIG_SCHEMA, REMOTE_CONFIG_SCHEMA, PAI_CONFIG_SCHEMA, PAI_YARN_CONFIG_SCHEMA, KUBEFLOW_CONFIG_SCHEMA,\ FRAMEWORKCONTROLLER_CONFIG_SCHEMA, tuner_schema_dict, advisor_schema_dict, assessor_schema_dict -from .common_utils import print_error, print_warning, print_normal +from .common_utils import print_error, print_warning, print_normal, get_yml_content def expand_path(experiment_config, key): '''Change '~' to user home directory''' @@ -63,6 +63,8 @@ def parse_path(experiment_config, config_path): if experiment_config.get('machineList'): for index in range(len(experiment_config['machineList'])): expand_path(experiment_config['machineList'][index], 'sshKeyPath') + if experiment_config['trial'].get('paiConfigPath'): + expand_path(experiment_config['trial'], 'paiConfigPath') #if users use relative path, convert it to absolute path root_path = os.path.dirname(config_path) @@ -94,6 +96,8 @@ def parse_path(experiment_config, config_path): if experiment_config.get('machineList'): for index in range(len(experiment_config['machineList'])): parse_relative_path(root_path, experiment_config['machineList'][index], 'sshKeyPath') + if experiment_config['trial'].get('paiConfigPath'): + parse_relative_path(root_path, experiment_config['trial'], 'paiConfigPath') def validate_search_space_content(experiment_config): '''Validate searchspace content, @@ -269,6 +273,9 @@ def validate_pai_trial_conifg(experiment_config): print_warning(warning_information.format('dataDir')) if experiment_config.get('trial').get('outputDir'): print_warning(warning_information.format('outputDir')) + if experiment_config.get('trainingServicePlatform') == 'pai': + if experiment_config.get('trial').get('paiConfigPath'): + get_yml_content(experiment_config['trial']['paiConfigPath']) def validate_all_content(experiment_config, config_path): '''Validate whether experiment_config is valid''' From 956b413ddd91ab049f79bd30c60c5543d98ed3f7 Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Sun, 19 Jan 2020 10:10:47 +0800 Subject: [PATCH 04/12] fix comments --- .../training_service/pai/paiK8S/paiK8STrainingService.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts b/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts index bd83f32747..67643f520f 100644 --- a/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts +++ b/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts @@ -190,7 +190,7 @@ class PAIK8STrainingService extends PAITrainingService { const additionalPAIConfig = yaml.safeLoad(fs.readFileSync(this.paiTrialConfig.paiConfigPath, 'utf8')); return yaml.safeDump({...paiJobConfig, ...additionalPAIConfig}); } catch (error) { - this.log.error(`Loading ${this.paiTrialConfig.paiConfigPath} error: ${error}`); + this.log.error(`Error occurs during loading and merge ${this.paiTrialConfig.paiConfigPath} : ${error}`); } } else { return yaml.safeDump(paiJobConfig); From 0a378209ab2005d2f791815c7eead433aa9de561 Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Sun, 19 Jan 2020 10:47:51 +0800 Subject: [PATCH 05/12] use deepmerge package --- src/nni_manager/package.json | 3 ++- .../training_service/pai/paiK8S/paiK8STrainingService.ts | 5 ++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/nni_manager/package.json b/src/nni_manager/package.json index f79a0a4529..bea81ef16e 100644 --- a/src/nni_manager/package.json +++ b/src/nni_manager/package.json @@ -26,7 +26,8 @@ "ts-deferred": "^1.0.4", "typescript-ioc": "^1.2.4", "typescript-string-operations": "^1.3.1", - "webhdfs": "^1.2.0" + "webhdfs": "^1.2.0", + "deepmerge": "^4.2.2" }, "devDependencies": { "@types/chai": "^4.1.4", diff --git a/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts b/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts index 67643f520f..7a682718be 100644 --- a/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts +++ b/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts @@ -44,6 +44,7 @@ import { PAIClusterConfig, PAITrialJobDetail } from '../paiConfig'; import { PAIJobRestServer } from '../paiJobRestServer'; const yaml = require('js-yaml'); +const deepmerge = require('deepmerge') /** * Training Service implementation for OpenPAI (Open Platform for AI) @@ -188,7 +189,9 @@ class PAIK8STrainingService extends PAITrainingService { if (this.paiTrialConfig.paiConfigPath) { try { const additionalPAIConfig = yaml.safeLoad(fs.readFileSync(this.paiTrialConfig.paiConfigPath, 'utf8')); - return yaml.safeDump({...paiJobConfig, ...additionalPAIConfig}); + //deepmerge(x, y), if an element at the same key is present for both x and y, the value from y will appear in the result. + //refer: https://github.com/TehShrike/deepmerge + return yaml.safeDump(deepmerge(additionalPAIConfig, paiJobConfig)); } catch (error) { this.log.error(`Error occurs during loading and merge ${this.paiTrialConfig.paiConfigPath} : ${error}`); } From d07fec172830720aec235d78bc6824feb536e2d7 Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Sun, 19 Jan 2020 10:53:00 +0800 Subject: [PATCH 06/12] add semicolon --- .../training_service/pai/paiK8S/paiK8STrainingService.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts b/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts index 7a682718be..3c625af9cc 100644 --- a/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts +++ b/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts @@ -44,7 +44,7 @@ import { PAIClusterConfig, PAITrialJobDetail } from '../paiConfig'; import { PAIJobRestServer } from '../paiJobRestServer'; const yaml = require('js-yaml'); -const deepmerge = require('deepmerge') +const deepmerge = require('deepmerge'); /** * Training Service implementation for OpenPAI (Open Platform for AI) From a80368460f15ac1bfd15b820b599577d73cac4e2 Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Sun, 19 Jan 2020 10:55:37 +0800 Subject: [PATCH 07/12] add annotation --- tools/nni_cmd/launcher_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/nni_cmd/launcher_utils.py b/tools/nni_cmd/launcher_utils.py index c1aeb8bfc4..bd01b4b3af 100644 --- a/tools/nni_cmd/launcher_utils.py +++ b/tools/nni_cmd/launcher_utils.py @@ -275,6 +275,7 @@ def validate_pai_trial_conifg(experiment_config): print_warning(warning_information.format('outputDir')) if experiment_config.get('trainingServicePlatform') == 'pai': if experiment_config.get('trial').get('paiConfigPath'): + # validate the file format of paiConfigPath, ensure it is yaml format get_yml_content(experiment_config['trial']['paiConfigPath']) def validate_all_content(experiment_config, config_path): From 1970f155df208ac40a2ac7472defd25a40074be9 Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Sun, 19 Jan 2020 19:50:14 +0800 Subject: [PATCH 08/12] set trial config optional --- .../pai/paiK8S/paiK8STrainingService.ts | 3 +- tools/nni_cmd/config_schema.py | 14 +++--- tools/nni_cmd/launcher_utils.py | 44 +++++++++++++++++-- 3 files changed, 49 insertions(+), 12 deletions(-) diff --git a/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts b/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts index 3c625af9cc..daa1242c8c 100644 --- a/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts +++ b/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts @@ -191,7 +191,8 @@ class PAIK8STrainingService extends PAITrainingService { const additionalPAIConfig = yaml.safeLoad(fs.readFileSync(this.paiTrialConfig.paiConfigPath, 'utf8')); //deepmerge(x, y), if an element at the same key is present for both x and y, the value from y will appear in the result. //refer: https://github.com/TehShrike/deepmerge - return yaml.safeDump(deepmerge(additionalPAIConfig, paiJobConfig)); + const overwriteMerge = (destinationArray: any, sourceArray: any, options: any) => sourceArray; + return yaml.safeDump(deepmerge(additionalPAIConfig, paiJobConfig, { arrayMerge: overwriteMerge })); } catch (error) { this.log.error(`Error occurs during loading and merge ${this.paiTrialConfig.paiConfigPath} : ${error}`); } diff --git a/tools/nni_cmd/config_schema.py b/tools/nni_cmd/config_schema.py index d077a968d3..7efe15c0dc 100644 --- a/tools/nni_cmd/config_schema.py +++ b/tools/nni_cmd/config_schema.py @@ -271,16 +271,16 @@ def setPathCheck(key): pai_trial_schema = { 'trial':{ - 'command': setType('command', str), 'codeDir': setPathCheck('codeDir'), - 'gpuNum': setNumberRange('gpuNum', int, 0, 99999), - 'cpuNum': setNumberRange('cpuNum', int, 0, 99999), - 'memoryMB': setType('memoryMB', int), - 'image': setType('image', str), - Optional('virtualCluster'): setType('virtualCluster', str), 'nniManagerNFSMountPath': setPathCheck('nniManagerNFSMountPath'), 'containerNFSMountPath': setType('containerNFSMountPath', str), - 'paiStoragePlugin': setType('paiStoragePlugin', str), + 'command': setType('command', str), + Optional('gpuNum'): setNumberRange('gpuNum', int, 0, 99999), + Optional('cpuNum'): setNumberRange('cpuNum', int, 0, 99999), + Optional('memoryMB'): setType('memoryMB', int), + Optional('image'): setType('image', str), + Optional('virtualCluster'): setType('virtualCluster', str), + Optional('paiStoragePlugin'): setType('paiStoragePlugin', str), Optional('paiConfigPath'): And(os.path.exists, error=SCHEMA_PATH_ERROR % 'paiConfigPath') } } diff --git a/tools/nni_cmd/launcher_utils.py b/tools/nni_cmd/launcher_utils.py index bd01b4b3af..9301e5bf2b 100644 --- a/tools/nni_cmd/launcher_utils.py +++ b/tools/nni_cmd/launcher_utils.py @@ -258,6 +258,45 @@ def validate_machine_list(experiment_config): print_error('Please set machineList!') exit(1) +def validate_pai_config_path(experiment_config): + '''validate paiConfigPath field''' + if experiment_config.get('trainingServicePlatform') == 'pai': + if experiment_config.get('trial', {}).get('paiConfigPath'): + # validate the file format of paiConfigPath, ensure it is yaml format + pai_config = get_yml_content(experiment_config['trial']['paiConfigPath']) + if experiment_config['trial'].get('image') is None: + if pai_config.get('prerequisites', [{}])[0].get('uri') is None: + print_error('Please set image field, or set image uri in your own paiConfig!') + exit(1) + experiment_config['trial']['image'] = pai_config['prerequisites'][0]['uri'] + if experiment_config['trial'].get('gpuNum') is None: + if pai_config.get('taskRoles', {}).get('taskrole', {}).get('resourcePerInstance', {}).get('gpu') is None: + print_error('Please set gpuNum field, or set resourcePerInstance gpu in your own paiConfig!') + exit(1) + experiment_config['trial']['gpuNum'] = pai_config['taskRoles']['taskrole']['resourcePerInstance']['gpu'] + if experiment_config['trial'].get('cpuNum') is None: + if pai_config.get('taskRoles', {}).get('taskrole', {}).get('resourcePerInstance', {}).get('cpu') is None: + print_error('Please set cpuNum field, or set resourcePerInstance cpu in your own paiConfig!') + exit(1) + experiment_config['trial']['cpuNum'] = pai_config['taskRoles']['taskrole']['resourcePerInstance']['cpu'] + if experiment_config['trial'].get('memoryMB') is None: + if pai_config.get('taskRoles', {}).get('taskrole', {}).get('resourcePerInstance', {}).get('memoryMB', {}) is None: + print_error('Please set memoryMB field, or set resourcePerInstance memoryMB in your own paiConfig!') + exit(1) + experiment_config['trial']['memoryMB'] = pai_config['taskRoles']['taskrole']['resourcePerInstance']['memoryMB'] + if experiment_config['trial'].get('paiStoragePlugin') is None: + if pai_config.get('extras', {}).get('com.microsoft.pai.runtimeplugin', [{}])[0].get('plugin') is None: + print_error('Please set paiStoragePlugin field, or set plugin in your own paiConfig!') + exit(1) + experiment_config['trial']['paiStoragePlugin'] = pai_config['extras']['com.microsoft.pai.runtimeplugin'][0]['plugin'] + else: + pai_trial_fields_required_list = ['image', 'gpuNum', 'cpuNum', 'memoryMB', 'paiStoragePlugin'] + for trial_field in pai_trial_fields_required_list: + if experiment_config['trial'].get(trial_field) is None: + print_error('Please set {0} in trial configuration,\ + or set additional pai configuration file path in paiConfigPath!'.format(trial_field)) + exit(1) + def validate_pai_trial_conifg(experiment_config): '''validate the trial config in pai platform''' if experiment_config.get('trainingServicePlatform') in ['pai', 'paiYarn']: @@ -273,10 +312,7 @@ def validate_pai_trial_conifg(experiment_config): print_warning(warning_information.format('dataDir')) if experiment_config.get('trial').get('outputDir'): print_warning(warning_information.format('outputDir')) - if experiment_config.get('trainingServicePlatform') == 'pai': - if experiment_config.get('trial').get('paiConfigPath'): - # validate the file format of paiConfigPath, ensure it is yaml format - get_yml_content(experiment_config['trial']['paiConfigPath']) + validate_pai_config_path(experiment_config) def validate_all_content(experiment_config, config_path): '''Validate whether experiment_config is valid''' From a1dab9fe7900f6c3559c8b955896d3b252eb1b10 Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Sun, 19 Jan 2020 20:00:22 +0800 Subject: [PATCH 09/12] add doc --- docs/en_US/TrainingService/PaiMode.md | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/docs/en_US/TrainingService/PaiMode.md b/docs/en_US/TrainingService/PaiMode.md index 3174e1079b..535e3e9f96 100644 --- a/docs/en_US/TrainingService/PaiMode.md +++ b/docs/en_US/TrainingService/PaiMode.md @@ -48,12 +48,12 @@ Note: You should set `trainingServicePlatform: pai` in NNI config YAML file if y Compared with [LocalMode](LocalMode.md) and [RemoteMachineMode](RemoteMachineMode.md), trial configuration in pai mode have these additional keys: * cpuNum - * Required key. Should be positive number based on your trial program's CPU requirement + * Optional key. Should be positive number based on your trial program's CPU requirement. If it is not set in trial configuration, it should be set in the config file specified in `paiConfigPath` field. * memoryMB - * Required key. Should be positive number based on your trial program's memory requirement + * Optional key. Should be positive number based on your trial program's memory requirement. If it is not set in trial configuration, it should be set in the config file specified in `paiConfigPath` field. * image - * Required key. In pai mode, your trial program will be scheduled by OpenPAI to run in [Docker container](https://www.docker.com/). This key is used to specify the Docker image used to create the container in which your trial will run. - * We already build a docker image [nnimsra/nni](https://hub.docker.com/r/msranni/nni/) on [Docker Hub](https://hub.docker.com/). It contains NNI python packages, Node modules and javascript artifact files required to start experiment, and all of NNI dependencies. The docker file used to build this image can be found at [here](https://github.com/Microsoft/nni/tree/master/deployment/docker/Dockerfile). You can either use this image directly in your config file, or build your own image based on it. + * Optional key. In pai mode, your trial program will be scheduled by OpenPAI to run in [Docker container](https://www.docker.com/). This key is used to specify the Docker image used to create the container in which your trial will run. + * We already build a docker image [nnimsra/nni](https://hub.docker.com/r/msranni/nni/) on [Docker Hub](https://hub.docker.com/). It contains NNI python packages, Node modules and javascript artifact files required to start experiment, and all of NNI dependencies. The docker file used to build this image can be found at [here](https://github.com/Microsoft/nni/tree/master/deployment/docker/Dockerfile). You can either use this image directly in your config file, or build your own image based on it. If it is not set in trial configuration, it should be set in the config file specified in `paiConfigPath` field. * virtualCluster * Optional key. Set the virtualCluster of OpenPAI. If omitted, the job will run on default virtual cluster. * nniManagerNFSMountPath @@ -61,7 +61,9 @@ Compared with [LocalMode](LocalMode.md) and [RemoteMachineMode](RemoteMachineMod * containerNFSMountPath * Required key. Set the mount path in your container used in PAI. * paiStoragePlugin - * Required key. Set the storage plugin name used in PAI. + * Optional key. Set the storage plugin name used in PAI. If it is not set in trial configuration, it should be set in the config file specified in `paiConfigPath` field. +* paiConfigPath + * Optional key. Set the file path of pai job configuration, the file is in yaml format. Once complete to fill NNI experiment config file and save (for example, save as exp_pai.yml), then run the following command From c58f49bdc703761af83a648d0bd0c32580d7ab91 Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Mon, 20 Jan 2020 10:46:42 +0800 Subject: [PATCH 10/12] sort package.json --- src/nni_manager/package.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/nni_manager/package.json b/src/nni_manager/package.json index bea81ef16e..e837e67991 100644 --- a/src/nni_manager/package.json +++ b/src/nni_manager/package.json @@ -13,6 +13,7 @@ "azure-storage": "^2.10.2", "chai-as-promised": "^7.1.1", "child-process-promise": "^2.2.1", + "deepmerge": "^4.2.2", "express": "^4.16.3", "express-joi-validator": "^2.0.0", "js-base64": "^2.4.9", @@ -26,8 +27,7 @@ "ts-deferred": "^1.0.4", "typescript-ioc": "^1.2.4", "typescript-string-operations": "^1.3.1", - "webhdfs": "^1.2.0", - "deepmerge": "^4.2.2" + "webhdfs": "^1.2.0" }, "devDependencies": { "@types/chai": "^4.1.4", From 4163f267a28dc18cd2fb1bb0accbeb8a1ed75bad Mon Sep 17 00:00:00 2001 From: unknown Date: Wed, 5 Feb 2020 11:48:59 +0800 Subject: [PATCH 11/12] add yarn.lock --- src/nni_manager/yarn.lock | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/nni_manager/yarn.lock b/src/nni_manager/yarn.lock index deac0b5c89..7853580757 100644 --- a/src/nni_manager/yarn.lock +++ b/src/nni_manager/yarn.lock @@ -1108,6 +1108,11 @@ deepmerge@^2.1.1: version "2.2.1" resolved "https://registry.yarnpkg.com/deepmerge/-/deepmerge-2.2.1.tgz#5d3ff22a01c00f645405a2fbc17d0778a1801170" +deepmerge@^4.2.2: + version "4.2.2" + resolved "https://registry.yarnpkg.com/deepmerge/-/deepmerge-4.2.2.tgz#44d2ea3679b8f4d4ffba33f03d865fc1e7bf4955" + integrity sha512-FJ3UgI4gIl+PHZm53knsuSFpE+nESMr7M4v9QcgB7S63Kj/6WqMiFQJpBBYz1Pt+66bZpP3Q7Lye0Oo9MPKEdg== + default-require-extensions@^2.0.0: version "2.0.0" resolved "https://registry.yarnpkg.com/default-require-extensions/-/default-require-extensions-2.0.0.tgz#f5f8fbb18a7d6d50b21f641f649ebb522cfe24f7" @@ -1840,9 +1845,10 @@ growl@1.10.5: version "1.10.5" resolved "https://registry.yarnpkg.com/growl/-/growl-1.10.5.tgz#f2735dc2283674fa67478b10181059355c369e5e" -handlebars@^4.0.11, handlebars@^4.3.0: - version "4.5.3" - resolved "https://registry.yarnpkg.com/handlebars/-/handlebars-4.5.3.tgz#5cf75bd8714f7605713511a56be7c349becb0482" +handlebars@^4.0.11, handlebars@^4.5.3: + version "4.7.2" + resolved "https://registry.yarnpkg.com/handlebars/-/handlebars-4.7.2.tgz#01127b3840156a0927058779482031afe0e730d7" + integrity sha512-4PwqDL2laXtTWZghzzCtunQUTLbo31pcCJrd/B/9JP8XbhVzpS5ZXuKqlOzsd1rtcaLo4KqAn8nl8mkknS4MHw== dependencies: neo-async "^2.6.0" optimist "^0.6.1" From e2ceede5cf65b9e706107e6c3ffc55d185e5abde Mon Sep 17 00:00:00 2001 From: unknown Date: Wed, 5 Feb 2020 11:52:17 +0800 Subject: [PATCH 12/12] revert change --- src/nni_manager/yarn.lock | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/nni_manager/yarn.lock b/src/nni_manager/yarn.lock index 7853580757..08d0eab4db 100644 --- a/src/nni_manager/yarn.lock +++ b/src/nni_manager/yarn.lock @@ -1845,10 +1845,9 @@ growl@1.10.5: version "1.10.5" resolved "https://registry.yarnpkg.com/growl/-/growl-1.10.5.tgz#f2735dc2283674fa67478b10181059355c369e5e" -handlebars@^4.0.11, handlebars@^4.5.3: - version "4.7.2" - resolved "https://registry.yarnpkg.com/handlebars/-/handlebars-4.7.2.tgz#01127b3840156a0927058779482031afe0e730d7" - integrity sha512-4PwqDL2laXtTWZghzzCtunQUTLbo31pcCJrd/B/9JP8XbhVzpS5ZXuKqlOzsd1rtcaLo4KqAn8nl8mkknS4MHw== +handlebars@^4.0.11, handlebars@^4.3.0: + version "4.5.3" + resolved "https://registry.yarnpkg.com/handlebars/-/handlebars-4.5.3.tgz#5cf75bd8714f7605713511a56be7c349becb0482" dependencies: neo-async "^2.6.0" optimist "^0.6.1"