diff --git a/Makefile b/Makefile index 829c68bd3f..20696461a4 100644 --- a/Makefile +++ b/Makefile @@ -167,6 +167,7 @@ install-dependencies: $(NNI_NODE_TARBALL) $(NNI_YARN_TARBALL) .PHONY: install-python-modules install-python-modules: #$(_INFO) Installing Python SDK $(_END) + sed -ie 's/$(NNI_VERSION_TEMPLATE)/$(NNI_VERSION_VALUE)/' src/sdk/pynni/nni/__init__.py sed -ie 's/$(NNI_VERSION_TEMPLATE)/$(NNI_VERSION_VALUE)/' setup.py && $(PIP_INSTALL) $(PIP_MODE) . .PHONY: dev-install-python-modules diff --git a/README.md b/README.md index 7b67efc1ce..27a33c5f18 100644 --- a/README.md +++ b/README.md @@ -342,7 +342,7 @@ With authors' permission, we listed a set of NNI usage examples and relevant art Join IM discussion groups: |Gitter||WeChat| |----|----|----| -|![image](https://user-images.githubusercontent.com/39592018/80665738-e0574a80-8acc-11ea-91bc-0836dc4cbf89.png)| OR |![image](https://github.com/JSong-Jia/NNI-user-group/blob/master/user%20group%20code_0512.jpg)| +|![image](https://user-images.githubusercontent.com/39592018/80665738-e0574a80-8acc-11ea-91bc-0836dc4cbf89.png)| OR |![image](https://github.com/JSong-Jia/NNI-user-group/blob/master/user%20group%20code_0512.png)| ## Related Projects diff --git a/deployment/pypi/Makefile b/deployment/pypi/Makefile index c430b9195e..a9559f7f1e 100644 --- a/deployment/pypi/Makefile +++ b/deployment/pypi/Makefile @@ -47,6 +47,7 @@ build: cp $(CWD)../../src/nni_manager/package.json $(CWD)nni sed -ie 's/$(NNI_VERSION_TEMPLATE)/$(NNI_VERSION_VALUE)/' $(CWD)nni/package.json cd $(CWD)nni && $(NNI_YARN) --prod + sed -ie 's/$(NNI_VERSION_TEMPLATE)/$(NNI_VERSION_VALUE)/' $(CWD)../../src/sdk/pynni/nni/__init__.py cd $(CWD) && sed -ie 's/$(NNI_VERSION_TEMPLATE)/$(NNI_VERSION_VALUE)/' setup.py && python3 setup.py bdist_wheel -p $(WHEEL_SPEC) cd $(CWD) diff --git a/deployment/pypi/install.ps1 b/deployment/pypi/install.ps1 index d8012de094..7332a37489 100644 --- a/deployment/pypi/install.ps1 +++ b/deployment/pypi/install.ps1 @@ -60,6 
+60,8 @@ Copy-Item $CWD\..\..\src\nni_manager\package.json $CWD\nni (Get-Content $CWD\nni\package.json).replace($NNI_VERSION_TEMPLATE, $NNI_VERSION_VALUE) | Set-Content $CWD\nni\package.json cd $CWD\nni yarn --prod +cd $CWD\..\..\src\sdk\pynni\nni +(Get-Content __init__.py).replace($NNI_VERSION_TEMPLATE, $NNI_VERSION_VALUE) | Set-Content __init__.py cd $CWD (Get-Content setup.py).replace($NNI_VERSION_TEMPLATE, $NNI_VERSION_VALUE) | Set-Content setup.py python setup.py bdist_wheel -p $WHEEL_SPEC diff --git a/docs/en_US/TrainingService/PaiMode.md b/docs/en_US/TrainingService/PaiMode.md index c608cc970a..53046cdfcd 100644 --- a/docs/en_US/TrainingService/PaiMode.md +++ b/docs/en_US/TrainingService/PaiMode.md @@ -7,9 +7,9 @@ Step 1. Install NNI, follow the install guide [here](../Tutorial/QuickStart.md). Step 2. Get PAI token. Click `My profile` button in the top-right side of PAI's webprotal. -![](../../img/pai_token_button.jpg) -Find the token management region, copy one of the token as your account token. -![](../../img/pai_token_profile.jpg) +![](../../img/pai_profile.jpg) +Click `copy` button in the page to copy a jwt token. +![](../../img/pai_token.jpg) Step 3. Mount NFS storage to local machine. Click `Submit job` button in PAI's webportal. @@ -19,7 +19,7 @@ Step 3. Mount NFS storage to local machine. The `DEFAULT_STORAGE`field is the path to be mounted in PAI's container when a job is started. The `Preview container paths` is the NFS host and path that PAI provided, you need to mount the corresponding host and path to your local machine first, then NNI could use the PAI's NFS storage. For example, use the following command: ``` -sudo mount nfs://gcr-openpai-infra02:/pai/data /local/mnt +sudo mount -t nfs4 gcr-openpai-infra02:/pai/data /local/mnt ``` Then the `/data` folder in container will be mounted to `/local/mnt` folder in your local machine. 
You could use the following configuration in your NNI's config file: @@ -66,7 +66,7 @@ trial: virtualCluster: default nniManagerNFSMountPath: /home/user/mnt containerNFSMountPath: /mnt/data/user - paiStoragePlugin: team_wise + paiStoragePlugin: teamwise_storage # Configuration to access OpenPAI Cluster paiConfig: userName: your_pai_nni_user @@ -74,7 +74,7 @@ paiConfig: host: 10.1.1.1 ``` -Note: You should set `trainingServicePlatform: pai` in NNI config YAML file if you want to start experiment in pai mode. +Note: You should set `trainingServicePlatform: pai` in NNI config YAML file if you want to start experiment in pai mode. The `host` field in the configuration file is the URI of PAI's job submission page, such as `10.10.5.1`. NNI uses the `http` protocol by default; if your PAI cluster has https enabled, specify the URI in `https://10.10.5.1` format. Compared with [LocalMode](LocalMode.md) and [RemoteMachineMode](RemoteMachineMode.md), trial configuration in pai mode have these additional keys: * cpuNum diff --git a/docs/en_US/TrainingService/RemoteMachineMode.md b/docs/en_US/TrainingService/RemoteMachineMode.md index fb3aeaca7f..bb8c9d5d67 100644 --- a/docs/en_US/TrainingService/RemoteMachineMode.md +++ b/docs/en_US/TrainingService/RemoteMachineMode.md @@ -2,18 +2,54 @@ NNI can run one experiment on multiple remote machines through SSH, called `remote` mode. It's like a lightweight training platform. In this mode, NNI can be started from your computer, and dispatch trials to remote machines in parallel. -## Remote machine requirements +Remote machines can run `Linux`, `Windows 10`, or `Windows Server 2019`. -* It only supports Linux as remote machines, and [linux part in system specification](../Tutorial/InstallationLinux.md) is same as NNI local mode. +## Requirements -* Follow [installation](../Tutorial/InstallationLinux.md) to install NNI on each machine. - -* Make sure remote machines meet environment requirements of your trial code. 
If the default environment does not meet the requirements, the setup script can be added into `command` field of NNI config. +* Make sure the default environment of remote machines meets requirements of your trial code. If the default environment does not meet the requirements, the setup script can be added into `command` field of NNI config. * Make sure remote machines can be accessed through SSH from the machine which runs `nnictl` command. It supports both password and key authentication of SSH. For advanced usages, please refer to [machineList part of configuration](../Tutorial/ExperimentConfig.md). * Make sure the NNI version on each machine is consistent. +* Make sure the trial command is compatible with every remote OS if you want to use remote Linux and Windows machines together. For example, the default Python 3.x executable is called `python3` on Linux and `python` on Windows. + +### Linux + +* Follow [installation](../Tutorial/InstallationLinux.md) to install NNI on the remote machine. + +### Windows + +* Follow [installation](../Tutorial/InstallationWin.md) to install NNI on the remote machine. + +* Install and start `OpenSSH Server`. + + 1. Open `Settings` app on Windows. + + 2. Click `Apps`, then click `Optional features`. + + 3. Click `Add a feature`, search and select `OpenSSH Server`, and then click `Install`. + + 4. Once it's installed, run the commands below to start the service and set it to start automatically. + + ```bat + sc config sshd start=auto + net start sshd + ``` + +* Make sure the remote account is an administrator, so that it can stop running trials. + +* Make sure there is no welcome message beyond the default one, since extra output causes ssh2 to fail in Node.js. For example, if you're using a Data Science VM on Azure, you need to remove the extra echo commands in `C:\dsvm\tools\setup\welcome.bat`. + + Output like the following is OK when opening a new command window. + + ```text + Microsoft Windows [Version 10.0.17763.1192] + (c) 2018 Microsoft Corporation. All rights reserved. 
+ + (py37_default) C:\Users\AzureUser> + ``` + ## Run an experiment e.g. there are three machines, which can be logged in with username and password. diff --git a/docs/en_US/Tutorial/InstallationWin.md b/docs/en_US/Tutorial/InstallationWin.md index a9f3deb3e4..012188a1b4 100644 --- a/docs/en_US/Tutorial/InstallationWin.md +++ b/docs/en_US/Tutorial/InstallationWin.md @@ -1,46 +1,56 @@ # Install on Windows -## Installation +## Prerequisites -Anaconda or Miniconda is highly recommended to manage multiple Python environments. +* Python 3.5 (or above) 64-bit. [Anaconda](https://www.anaconda.com/products/individual) or [Miniconda](https://docs.conda.io/en/latest/miniconda.html) is highly recommended to manage multiple Python environments on Windows. -### Install NNI through pip +* If it's a newly installed Python environment, you need to install [Microsoft C++ Build Tools](https://visualstudio.microsoft.com/visual-cpp-build-tools/) to build NNI dependencies such as `scikit-learn`. - Prerequisites: `python 64-bit >= 3.5` + ```bat + pip install cython wheel + ``` - ```bash - python -m pip install --upgrade nni - ``` +* `git`, for verifying the installation. -### Install NNI through source code +## Install NNI - If you are interested in special or the latest code versions, you can install NNI through source code. +In most cases, you can install and upgrade NNI from the pip package. It's easy and fast. - Prerequisites: `python 64-bit >=3.5`, `git`, `PowerShell`. +If you are interested in special or the latest code versions, you can install NNI through source code. - ```bash - git clone -b v1.5 https://github.com/Microsoft/nni.git - cd nni - powershell -ExecutionPolicy Bypass -file install.ps1 - ``` +If you want to contribute to NNI, refer to [setup development environment](SetupNniDeveloperEnvironment.md). 
+ +* From pip package + + ```bat + python -m pip install --upgrade nni + ``` + +* From source code + + ```bat + git clone -b v1.5 https://github.com/Microsoft/nni.git + cd nni + powershell -ExecutionPolicy Bypass -file install.ps1 + ``` ## Verify installation The following example is built on TensorFlow 1.x. Make sure **TensorFlow 1.x is used** when running it. -* Download the examples via clone the source code. +* Clone the source code to get the examples. - ```bash - git clone -b v1.5 https://github.com/Microsoft/nni.git - ``` + ```bat + git clone -b v1.5 https://github.com/Microsoft/nni.git + ``` * Run the MNIST example. - ```bash - nnictl create --config nni\examples\trials\mnist-tfv1\config_windows.yml - ``` + ```bat + nnictl create --config nni\examples\trials\mnist-tfv1\config_windows.yml + ``` - Note: for other examples you need to change trial command `python3` to `python` in each example YAML, if python3 is called through `python` on your machine. + Note: If you are familiar with other frameworks, you can choose the corresponding example under `examples\trials`. You need to change the trial command `python3` to `python` in each example YAML, since the default installation provides `python.exe`, not `python3.exe`. * Wait for the message `INFO: Successfully started experiment!` in the command line. This message indicates that your experiment has been successfully started. You can explore the experiment using the `Web UI url`. @@ -112,18 +122,20 @@ If there is a stderr file, please check it. Two possible cases are: * forgetting to install experiment dependencies such as TensorFlow, Keras and so on. ### Fail to use BOHB on Windows + Make sure a C++ 14.0 compiler is installed when trying to run `nnictl package install --name=BOHB` to install the dependencies. ### Not supported tuner on Windows + SMAC is not supported currently; for the specific reason refer to this [GitHub issue](https://github.com/automl/SMAC3/issues/483). 
-### Use a Windows server as a remote worker -Currently, you can't. +### Use Windows as a remote worker -Note: +Refer to [Remote Machine mode](../TrainingService/RemoteMachineMode.md). -* If an error like `Segmentation fault` is encountered, please refer to the [FAQ](FAQ.md) +### Segmentation fault (core dumped) when installing +Refer to [FAQ](FAQ.md). ## Further reading diff --git a/docs/en_US/Tutorial/SetupNniDeveloperEnvironment.md b/docs/en_US/Tutorial/SetupNniDeveloperEnvironment.md index 00657f28e3..d4e1cfc072 100644 --- a/docs/en_US/Tutorial/SetupNniDeveloperEnvironment.md +++ b/docs/en_US/Tutorial/SetupNniDeveloperEnvironment.md @@ -1,76 +1,59 @@ -**Set up NNI developer environment** +# Set up NNI development environment -=== +The NNI development environment supports Ubuntu 16.04 (or above) and Windows 10, with 64-bit Python 3. -## Best practice for debug NNI source code +## Installation -For debugging NNI source code, your development environment should be under Ubuntu 16.04 (or above) system with python 3 and pip 3 installed, then follow the below steps. +The installation steps are similar to installing from source code, but the installation links to the code directory, so that code changes are applied to the installation as easily as possible. -### 1. Clone the source code +### 1. Clone source code -Run the command - -``` +```bash git clone https://github.com/Microsoft/nni.git ``` -to clone the source code +Note: if you want to contribute code back, you need to fork the NNI repo and clone from your fork. -### 2. Prepare the debug environment and install dependencies +### 2. Install from source code -Change directory to the source code folder, then run the command +#### Ubuntu +```bash +make dev-easy-install ``` -make install-dependencies -``` - -to install the dependent tools for the environment - -### 3. 
Build source code -Run the command +#### Windows +```bat +powershell -ExecutionPolicy Bypass -file install.ps1 -Development ``` -make build -``` - -to build the source code -### 4. Install NNI to development environment - -Run the command - -``` -make dev-install -``` - -to install the distribution content to development environment, and create cli scripts - -### 5. Check if the environment is ready +### 3. Check if the environment is ready Now, you can try to start an experiment to check if your environment is ready. For example, run the command -``` -nnictl create --config ~/nni/examples/trials/mnist-tfv1/config.yml +```bash +nnictl create --config examples/trials/mnist-tfv1/config.yml ``` And open WebUI to check if everything is OK -### 6. Redeploy - -After the code changes, it may need to redeploy. It depends on what kind of code changed. +### 4. Reload changes #### Python -It doesn't need to redeploy, but the nnictl may need to be restarted. +Nothing to do; the code is already linked to the package folders. #### TypeScript -* If `src/nni_manager` is changed, run `yarn watch` continually under this folder. It will rebuild code instantly. The nnictl may need to be restarted to reload NNI manager. +* If `src/nni_manager` is changed, run `yarn watch` under this folder. It will watch and build code continually. `nnictl` needs to be restarted to reload NNI manager. * If `src/webui` or `src/nasui` are changed, run `yarn start` under the corresponding folder. The web UI will refresh automatically if code is changed. +### 5. Submit Pull Request + +All changes are merged to the master branch from your forked repo. The description of the Pull Request must be meaningful and useful. + +We will review the changes as soon as possible. Once they pass review, we will merge them to the master branch. ---- -At last, wish you have a wonderful day. -For more contribution guidelines on making PR's or issues to NNI source code, you can refer to our [Contributing](Contributing.md) document. 
+For more contribution guidelines and coding styles, you can refer to the [contributing document](Contributing.md). diff --git a/docs/img/pai_job_submission_page.jpg b/docs/img/pai_job_submission_page.jpg index f49a1c267e..377a66f593 100644 Binary files a/docs/img/pai_job_submission_page.jpg and b/docs/img/pai_job_submission_page.jpg differ diff --git a/docs/img/pai_profile.jpg b/docs/img/pai_profile.jpg new file mode 100644 index 0000000000..eadbbeb9fa Binary files /dev/null and b/docs/img/pai_profile.jpg differ diff --git a/docs/img/pai_token.jpg b/docs/img/pai_token.jpg new file mode 100644 index 0000000000..83f388a282 Binary files /dev/null and b/docs/img/pai_token.jpg differ diff --git a/docs/img/pai_token_profile.jpg b/docs/img/pai_token_profile.jpg deleted file mode 100644 index 52d68bb7b5..0000000000 Binary files a/docs/img/pai_token_profile.jpg and /dev/null differ diff --git a/examples/nas/enas-tf/datasets.py b/examples/nas/enas-tf/datasets.py new file mode 100644 index 0000000000..2c5e44902b --- /dev/null +++ b/examples/nas/enas-tf/datasets.py @@ -0,0 +1,12 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +import tensorflow as tf +from tensorflow.data import Dataset + +def get_dataset(): + (x_train, y_train), (x_valid, y_valid) = tf.keras.datasets.cifar10.load_data() + x_train, x_valid = x_train / 255.0, x_valid / 255.0 + train_set = (x_train, y_train) + valid_set = (x_valid, y_valid) + return train_set, valid_set diff --git a/examples/nas/enas-tf/macro.py b/examples/nas/enas-tf/macro.py new file mode 100644 index 0000000000..f0d73c2e69 --- /dev/null +++ b/examples/nas/enas-tf/macro.py @@ -0,0 +1,142 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
+ +import tensorflow as tf +from tensorflow.keras import Model, Sequential +from tensorflow.keras.layers import ( + AveragePooling2D, + BatchNormalization, + Conv2D, + Dense, + Dropout, + GlobalAveragePooling2D, + MaxPool2D, + ReLU, + SeparableConv2D, +) + +from nni.nas.tensorflow.mutables import InputChoice, LayerChoice, MutableScope + + +def build_conv(filters, kernel_size, name=None): + return Sequential([ + Conv2D(filters, kernel_size=1, use_bias=False), + BatchNormalization(trainable=False), + ReLU(), + Conv2D(filters, kernel_size, padding='same'), + BatchNormalization(trainable=False), + ReLU(), + ], name) + +def build_separable_conv(filters, kernel_size, name=None): + return Sequential([ + Conv2D(filters, kernel_size=1, use_bias=False), + BatchNormalization(trainable=False), + ReLU(), + SeparableConv2D(filters, kernel_size, padding='same', use_bias=False), + Conv2D(filters, kernel_size=1, use_bias=False), + BatchNormalization(trainable=False), + ReLU(), + ], name) + +def build_avg_pool(filters, name=None): + return Sequential([ + Conv2D(filters, kernel_size=1, use_bias=False), + BatchNormalization(trainable=False), + ReLU(), + AveragePooling2D(pool_size=3, strides=1, padding='same'), + BatchNormalization(trainable=False), + ], name) + +def build_max_pool(filters, name=None): + return Sequential([ + Conv2D(filters, kernel_size=1, use_bias=False), + BatchNormalization(trainable=False), + ReLU(), + MaxPool2D(pool_size=3, strides=1, padding='same'), + BatchNormalization(trainable=False), + ], name) + + +class FactorizedReduce(Model): + def __init__(self, filters): + super().__init__() + self.conv1 = Conv2D(filters // 2, kernel_size=1, strides=2, use_bias=False) + self.conv2 = Conv2D(filters // 2, kernel_size=1, strides=2, use_bias=False) + self.bn = BatchNormalization(trainable=False) + + def call(self, x): + out1 = self.conv1(x) + out2 = self.conv2(x[:, 1:, 1:, :]) + out = tf.concat([out1, out2], axis=3) + out = self.bn(out) + return out + + +class 
ENASLayer(MutableScope): + def __init__(self, key, prev_labels, filters): + super().__init__(key) + self.mutable = LayerChoice([ + build_conv(filters, 3, 'conv3'), + build_separable_conv(filters, 3, 'sepconv3'), + build_conv(filters, 5, 'conv5'), + build_separable_conv(filters, 5, 'sepconv5'), + build_avg_pool(filters, 'avgpool'), + build_max_pool(filters, 'maxpool'), + ]) + if len(prev_labels) > 0: + self.skipconnect = InputChoice(choose_from=prev_labels, n_chosen=None) + else: + self.skipconnect = None + self.batch_norm = BatchNormalization(trainable=False) + + def call(self, prev_layers): + out = self.mutable(prev_layers[-1]) + if self.skipconnect is not None: + connection = self.skipconnect(prev_layers[:-1]) + if connection is not None: + out += connection + return self.batch_norm(out) + + +class GeneralNetwork(Model): + def __init__(self, num_layers=12, filters=24, num_classes=10, dropout_rate=0.0): + super().__init__() + self.num_layers = num_layers + + self.stem = Sequential([ + Conv2D(filters, kernel_size=3, padding='same', use_bias=False), + BatchNormalization() + ]) + + labels = ['layer_{}'.format(i) for i in range(num_layers)] + self.enas_layers = [] + for i in range(num_layers): + layer = ENASLayer(labels[i], labels[:i], filters) + self.enas_layers.append(layer) + + pool_num = 2 + self.pool_distance = num_layers // (pool_num + 1) + self.pool_layers = [FactorizedReduce(filters) for _ in range(pool_num)] + + self.gap = GlobalAveragePooling2D() + self.dropout = Dropout(dropout_rate) + self.dense = Dense(num_classes) + + def call(self, x): + cur = self.stem(x) + prev_outputs = [cur] + + for i, layer in enumerate(self.enas_layers): + if i > 0 and i % self.pool_distance == 0: + pool = self.pool_layers[i // self.pool_distance - 1] + prev_outputs = [pool(tensor) for tensor in prev_outputs] + cur = prev_outputs[-1] + + cur = layer(prev_outputs) + prev_outputs.append(cur) + + cur = self.gap(cur) + cur = self.dropout(cur) + logits = self.dense(cur) + return logits 
diff --git a/examples/nas/enas-tf/micro.py b/examples/nas/enas-tf/micro.py new file mode 100644 index 0000000000..8c52f4b441 --- /dev/null +++ b/examples/nas/enas-tf/micro.py @@ -0,0 +1,176 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +import tensorflow as tf +from tensorflow.keras import Model, Sequential +from tensorflow.keras.layers import ( + AveragePooling2D, + BatchNormalization, + Conv2D, + Dense, + Dropout, + GlobalAveragePooling2D, + MaxPool2D, + ReLU, + SeparableConv2D, +) + +from nni.nas.tensorflow.mutables import InputChoice, LayerChoice, MutableScope + + +def build_conv_1x1(filters, name=None): + return Sequential([ + Conv2D(filters, kernel_size=1, use_bias=False), + BatchNormalization(trainable=False), + ReLU(), + ], name) + +def build_sep_conv(filters, kernel_size, name=None): + return Sequential([ + ReLU(), + SeparableConv2D(filters, kernel_size, padding='same'), + BatchNormalization(trainable=True), + ], name) + + +class FactorizedReduce(Model): + def __init__(self, filters): + super().__init__() + self.conv1 = Conv2D(filters // 2, kernel_size=1, strides=2, use_bias=False) + self.conv2 = Conv2D(filters // 2, kernel_size=1, strides=2, use_bias=False) + self.bn = BatchNormalization(trainable=False) + + def call(self, x): + out1 = self.conv1(x) + out2 = self.conv2(x[:, 1:, 1:, :]) + out = tf.concat([out1, out2], axis=3) + out = self.bn(out) + return out + + +class ReductionLayer(Model): + def __init__(self, filters): + super().__init__() + self.reduce0 = FactorizedReduce(filters) + self.reduce1 = FactorizedReduce(filters) + + def call(self, prevprev, prev): + return self.reduce0(prevprev), self.reduce1(prev) + + +class Calibration(Model): + def __init__(self, filters): + super().__init__() + self.filters = filters + self.process = None + + def build(self, shape): + assert len(shape) == 4 # batch_size, width, height, filters + if shape[3] != self.filters: + self.process = build_conv_1x1(self.filters) + + def 
call(self, x): + if self.process is None: + return x + return self.process(x) + + +class Cell(Model): + def __init__(self, cell_name, prev_labels, filters): + super().__init__() + self.input_choice = InputChoice(choose_from=prev_labels, n_chosen=1, return_mask=True, key=cell_name + '_input') + self.op_choice = LayerChoice([ + build_sep_conv(filters, 3), + build_sep_conv(filters, 5), + AveragePooling2D(pool_size=3, strides=1, padding='same'), + MaxPool2D(pool_size=3, strides=1, padding='same'), + Sequential(), # Identity + ], key=cell_name + '_op') + + def call(self, prev_layers): + chosen_input, chosen_mask = self.input_choice(prev_layers) + cell_out = self.op_choice(chosen_input) + return cell_out, chosen_mask + + +class Node(MutableScope): + def __init__(self, node_name, prev_node_names, filters): + super().__init__(node_name) + self.cell_x = Cell(node_name + '_x', prev_node_names, filters) + self.cell_y = Cell(node_name + '_y', prev_node_names, filters) + + def call(self, prev_layers): + out_x, mask_x = self.cell_x(prev_layers) + out_y, mask_y = self.cell_y(prev_layers) + return out_x + out_y, mask_x | mask_y + + +class ENASLayer(Model): + def __init__(self, num_nodes, filters, reduction): + super().__init__() + self.preproc0 = Calibration(filters) + self.preproc1 = Calibration(filters) + + self.nodes = [] + node_labels = [InputChoice.NO_KEY, InputChoice.NO_KEY] + name_prefix = 'reduce' if reduction else 'normal' + for i in range(num_nodes): + node_labels.append('{}_node_{}'.format(name_prefix, i)) + self.nodes.append(Node(node_labels[-1], node_labels[:-1], filters)) + + self.conv_ops = [Conv2D(filters, kernel_size=1, padding='same', use_bias=False) for _ in range(num_nodes + 2)] + self.bn = BatchNormalization(trainable=False) + + def call(self, prevprev, prev): + prev_nodes_out = [self.preproc0(prevprev), self.preproc1(prev)] + nodes_used_mask = tf.zeros(len(self.nodes) + 2, dtype=tf.bool) + for i, node in enumerate(self.nodes): + node_out, mask = 
node(prev_nodes_out) + nodes_used_mask |= tf.pad(mask, [[0, nodes_used_mask.shape[0] - mask.shape[0]]]) + prev_nodes_out.append(node_out) + + outputs = [] + for used, out, conv in zip(nodes_used_mask.numpy(), prev_nodes_out, self.conv_ops): + if not used: + outputs.append(conv(out)) + out = tf.add_n(outputs) + return prev, self.bn(out) + + +class MicroNetwork(Model): + def __init__(self, num_layers=6, num_nodes=5, out_channels=20, num_classes=10, dropout_rate=0.1): + super().__init__() + self.num_layers = num_layers + self.stem = Sequential([ + Conv2D(out_channels * 3, kernel_size=3, padding='same', use_bias=False), + BatchNormalization(), + ]) + + pool_distance = num_layers // 3 + pool_layer_indices = [pool_distance, 2 * pool_distance + 1] + + self.enas_layers = [] + + filters = out_channels + for i in range(num_layers + 2): + if i in pool_layer_indices: + reduction = True + filters *= 2 + self.enas_layers.append(ReductionLayer(filters)) + else: + reduction = False + self.enas_layers.append(ENASLayer(num_nodes, filters, reduction)) + + self.gap = GlobalAveragePooling2D() + self.dropout = Dropout(dropout_rate) + self.dense = Dense(num_classes) + + def call(self, x): + prev = cur = self.stem(x) + for layer in self.enas_layers: + prev, cur = layer(prev, cur) + cur = tf.keras.activations.relu(cur) + cur = self.gap(cur) + cur = self.dropout(cur) + logits = self.dense(cur) + return logits diff --git a/examples/nas/enas-tf/search.py b/examples/nas/enas-tf/search.py new file mode 100644 index 0000000000..b68daf62f3 --- /dev/null +++ b/examples/nas/enas-tf/search.py @@ -0,0 +1,35 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
+ + +from tensorflow.keras.losses import Reduction, SparseCategoricalCrossentropy +from tensorflow.keras.optimizers import SGD + +from nni.nas.tensorflow import enas + +import datasets +from macro import GeneralNetwork +from micro import MicroNetwork +from utils import accuracy, accuracy_metrics + + +# TODO: argparse + + +dataset_train, dataset_valid = datasets.get_dataset() +#model = GeneralNetwork() +model = MicroNetwork() + +loss = SparseCategoricalCrossentropy(from_logits=True, reduction=Reduction.NONE) +optimizer = SGD(learning_rate=0.05, momentum=0.9) + +trainer = enas.EnasTrainer(model, + loss=loss, + metrics=accuracy_metrics, + reward_function=accuracy, + optimizer=optimizer, + batch_size=64, + num_epochs=310, + dataset_train=dataset_train, + dataset_valid=dataset_valid) +trainer.train() diff --git a/examples/nas/enas-tf/utils.py b/examples/nas/enas-tf/utils.py new file mode 100644 index 0000000000..dc924a96f3 --- /dev/null +++ b/examples/nas/enas-tf/utils.py @@ -0,0 +1,19 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
+ +import tensorflow as tf + + +def accuracy_metrics(y_true, logits): + return {'enas_acc': accuracy(y_true, logits)} + +def accuracy(y_true, logits): + # y_true: shape=(batch_size) or (batch_size,1), type=integer + # logits: shape=(batch_size, num_of_classes), type=float + # returns float + batch_size = y_true.shape[0] + y_true = tf.squeeze(y_true) + y_pred = tf.math.argmax(logits, axis=1) + y_pred = tf.cast(y_pred, y_true.dtype) + equal = tf.cast(y_pred == y_true, tf.int32) + return tf.math.reduce_sum(equal).numpy() / batch_size diff --git a/examples/nas/naive-tf/train.py b/examples/nas/naive-tf/train.py new file mode 100644 index 0000000000..a4e56a39f8 --- /dev/null +++ b/examples/nas/naive-tf/train.py @@ -0,0 +1,87 @@ +import tensorflow as tf +from tensorflow.keras import Model +from tensorflow.keras.layers import (AveragePooling2D, BatchNormalization, Conv2D, Dense, MaxPool2D) +from tensorflow.keras.losses import Reduction, SparseCategoricalCrossentropy +from tensorflow.keras.optimizers import SGD + +from nni.nas.tensorflow.mutables import LayerChoice, InputChoice +from nni.nas.tensorflow.enas import EnasTrainer + +tf.get_logger().setLevel('ERROR') + + +class Net(Model): + def __init__(self): + super().__init__() + self.conv1 = LayerChoice([ + Conv2D(6, 3, padding='same', activation='relu'), + Conv2D(6, 5, padding='same', activation='relu'), + ]) + self.pool = MaxPool2D(2) + self.conv2 = LayerChoice([ + Conv2D(16, 3, padding='same', activation='relu'), + Conv2D(16, 5, padding='same', activation='relu'), + ]) + self.conv3 = Conv2D(16, 1) + + self.skipconnect = InputChoice(n_candidates=1) + self.bn = BatchNormalization() + + self.gap = AveragePooling2D(2) + self.fc1 = Dense(120, activation='relu') + self.fc2 = Dense(84, activation='relu') + self.fc3 = Dense(10) + + def call(self, x): + bs = x.shape[0] + + t = self.conv1(x) + x = self.pool(t) + x0 = self.conv2(x) + x1 = self.conv3(x0) + + x0 = self.skipconnect([x0]) + if x0 is not None: + x1 += x0 + x = 
self.pool(self.bn(x1)) + + x = self.gap(x) + x = tf.reshape(x, [bs, -1]) + x = self.fc1(x) + x = self.fc2(x) + x = self.fc3(x) + return x + + +def accuracy(output, target): + bs = target.shape[0] + predicted = tf.cast(tf.argmax(output, 1), target.dtype) + target = tf.reshape(target, [-1]) + return tf.reduce_sum(tf.cast(predicted == target, tf.float32)) / bs  # fraction of correct predictions; one tensor op instead of builtin sum() iterating the tensor + + +if __name__ == '__main__': + cifar10 = tf.keras.datasets.cifar10 + (x_train, y_train), (x_test, y_test) = cifar10.load_data() + x_train, x_test = x_train / 255.0, x_test / 255.0 + split = int(len(x_train) * 0.9) + dataset_train = tf.data.Dataset.from_tensor_slices((x_train[:split], y_train[:split])).batch(64) + dataset_valid = tf.data.Dataset.from_tensor_slices((x_train[split:], y_train[split:])).batch(64) + dataset_test = tf.data.Dataset.from_tensor_slices((x_test, y_test)).batch(64) + + net = Net() + trainer = EnasTrainer( + net, + loss=SparseCategoricalCrossentropy(from_logits=True, reduction=Reduction.SUM),  # Net.fc3 has no activation, so the model outputs raw logits + metrics=accuracy, + reward_function=accuracy, + optimizer=SGD(learning_rate=0.001, momentum=0.9), + batch_size=64, + num_epochs=2, + dataset_train=dataset_train, + dataset_valid=dataset_valid, + dataset_test=dataset_test + ) + + trainer.train() + #trainer.export('checkpoint') diff --git a/examples/trials/mnist-batch-tune-keras/mnist-keras.py b/examples/trials/mnist-batch-tune-keras/mnist-keras.py index 9df8f32a35..40aa9f33e4 100644 --- a/examples/trials/mnist-batch-tune-keras/mnist-keras.py +++ b/examples/trials/mnist-batch-tune-keras/mnist-keras.py @@ -84,7 +84,11 @@ def on_epoch_end(self, epoch, logs={}): Run on end of each epoch ''' LOG.debug(logs) - nni.report_intermediate_result(logs["val_acc"]) + # TensorFlow 2.0 API reference claims the key is `val_acc`, but in fact it's `val_accuracy` + if 'val_acc' in logs: + nni.report_intermediate_result(logs['val_acc']) + else: + nni.report_intermediate_result(logs['val_accuracy']) def train(args, params): ''' diff --git a/examples/trials/mnist-keras/mnist-keras.py 
b/examples/trials/mnist-keras/mnist-keras.py index 2d7dac0004..794b7deb2a 100644 --- a/examples/trials/mnist-keras/mnist-keras.py +++ b/examples/trials/mnist-keras/mnist-keras.py @@ -86,7 +86,11 @@ def on_epoch_end(self, epoch, logs={}): Run on end of each epoch ''' LOG.debug(logs) - nni.report_intermediate_result(logs["val_acc"]) + # TensorFlow 2.0 API reference claims the key is `val_acc`, but in fact it's `val_accuracy` + if 'val_acc' in logs: + nni.report_intermediate_result(logs['val_acc']) + else: + nni.report_intermediate_result(logs['val_accuracy']) def train(args, params): ''' diff --git a/examples/trials/network_morphism/FashionMNIST/FashionMNIST_keras.py b/examples/trials/network_morphism/FashionMNIST/FashionMNIST_keras.py index 7d69a0241a..c357e2c4b8 100644 --- a/examples/trials/network_morphism/FashionMNIST/FashionMNIST_keras.py +++ b/examples/trials/network_morphism/FashionMNIST/FashionMNIST_keras.py @@ -152,7 +152,11 @@ def on_epoch_end(self, epoch, logs=None): if logs is None: logs = dict() logger.debug(logs) - nni.report_intermediate_result(logs["val_accuracy"]) + # TensorFlow 2.0 API reference claims the key is `val_acc`, but in fact it's `val_accuracy` + if 'val_acc' in logs: + nni.report_intermediate_result(logs['val_acc']) + else: + nni.report_intermediate_result(logs['val_accuracy']) # Training diff --git a/examples/trials/network_morphism/cifar10/cifar10_keras.py b/examples/trials/network_morphism/cifar10/cifar10_keras.py index ef371b811b..91f9879e4f 100644 --- a/examples/trials/network_morphism/cifar10/cifar10_keras.py +++ b/examples/trials/network_morphism/cifar10/cifar10_keras.py @@ -152,9 +152,11 @@ def on_epoch_end(self, epoch, logs=None): if logs is None: logs = dict() logger.debug(logs) - # accuracy key for keras 2.2.2: val_acc - # for keras 2.3.1: val_accuracy - nni.report_intermediate_result(logs["val_accuracy"]) + # TensorFlow 2.0 API reference claims the key is `val_acc`, but in fact it's `val_accuracy` + if 'val_acc' in logs: + 
nni.report_intermediate_result(logs['val_acc']) + else: + nni.report_intermediate_result(logs['val_accuracy']) # Training diff --git a/install.ps1 b/install.ps1 index f61a3a7046..33cae44cc9 100644 --- a/install.ps1 +++ b/install.ps1 @@ -1,12 +1,14 @@ +param ([Switch] $Development) [Net.ServicePointManager]::SecurityProtocol = [Net.SecurityProtocolType]::Tls12 + $install_node = $true $install_yarn = $true -if([Environment]::Is64BitOperatingSystem){ +if ([Environment]::Is64BitOperatingSystem) { $OS_VERSION = 'win64' } -else{ +else { $OS_VERSION = 'win32' } # nodejs @@ -15,58 +17,58 @@ $yarnUrl = "https://yarnpkg.com/latest.tar.gz" $unzipNodeDir = "node-v*" $unzipYarnDir = "yarn-v*" -$NNI_DEPENDENCY_FOLDER = [System.IO.Path]::GetTempPath()+$env:USERNAME +$NNI_DEPENDENCY_FOLDER = [System.IO.Path]::GetTempPath() + $env:USERNAME $WHICH_PYTHON = where.exe python -if($WHICH_PYTHON -eq $null){ +if ($WHICH_PYTHON -eq $null) { throw "Can not find python" } -else{ +else { $pyVersion = & python -V 2>&1 - $pyVersion = ([string]$pyVersion).substring(7,3) - if([double]$pyVersion -lt 3.5){ + $pyVersion = ([string]$pyVersion).substring(7, 3) + if ([double]$pyVersion -lt 3.5) { throw "python version should >= 3.5" } } $WHICH_PIP = where.exe pip -if($WHICH_PIP -eq $null){ +if ($WHICH_PIP -eq $null) { throw "Can not find pip" } $env:PYTHONIOENCODING = "UTF-8" -if($env:VIRTUAL_ENV){ +if ($env:VIRTUAL_ENV) { $NNI_PYTHON3 = $env:VIRTUAL_ENV + "\Scripts" $NNI_PKG_FOLDER = $env:VIRTUAL_ENV + "\nni" $NNI_PYTHON_SCRIPTS = $NNI_PYTHON3 } -else{ +else { $NNI_PYTHON3 = $(python -c 'import site; from pathlib import Path; print(Path(site.getsitepackages()[0]))') $NNI_PKG_FOLDER = $NNI_PYTHON3 + "\nni" - $NNI_PYTHON_SCRIPTS = $NNI_PYTHON3 + "\Scripts" + $NNI_PYTHON_SCRIPTS = $NNI_PYTHON3 + "\Scripts" } -$PIP_INSTALL = """$NNI_PYTHON3\python"" -m pip install ." 
+$PIP_INSTALL = """$NNI_PYTHON3\python"" -m pip install " -if(!(Test-Path $NNI_DEPENDENCY_FOLDER)){ +if (!(Test-Path $NNI_DEPENDENCY_FOLDER)) { New-Item $NNI_DEPENDENCY_FOLDER -ItemType Directory } -$NNI_NODE_ZIP = $NNI_DEPENDENCY_FOLDER+"\nni-node.zip" -$NNI_NODE_FOLDER = $NNI_DEPENDENCY_FOLDER+"\nni-node" -$NNI_YARN_TARBALL = $NNI_DEPENDENCY_FOLDER+"\nni-yarn.tar.gz" -$NNI_YARN_FOLDER = $NNI_DEPENDENCY_FOLDER+"\nni-yarn" -$NNI_YARN = $NNI_YARN_FOLDER +"\bin\yarn" +$NNI_NODE_ZIP = $NNI_DEPENDENCY_FOLDER + "\nni-node.zip" +$NNI_NODE_FOLDER = $NNI_DEPENDENCY_FOLDER + "\nni-node" +$NNI_YARN_TARBALL = $NNI_DEPENDENCY_FOLDER + "\nni-yarn.tar.gz" +$NNI_YARN_FOLDER = $NNI_DEPENDENCY_FOLDER + "\nni-yarn" +$NNI_YARN = $NNI_YARN_FOLDER + "\bin\yarn" ## Version number $NNI_VERSION_VALUE = $(git describe --tags) $NNI_VERSION_TEMPLATE = "999.0.0-developing" -if(!(Test-Path $NNI_NODE_ZIP)){ +if (!(Test-Path $NNI_NODE_ZIP)) { Write-Host "Downloading Node..." (New-Object Net.WebClient).DownloadFile($nodeUrl, $NNI_NODE_ZIP) } -if(!(Test-Path $NNI_YARN_TARBALL)){ +if (!(Test-Path $NNI_YARN_TARBALL)) { Write-Host "Downloading Yarn..." 
(New-Object Net.WebClient).DownloadFile($yarnUrl, $NNI_YARN_TARBALL) } @@ -74,27 +76,30 @@ if(!(Test-Path $NNI_YARN_TARBALL)){ $NNI_YARN_TARBALL = $NNI_YARN_TARBALL -split '\\' -join '\\' $NNI_DEPENDENCY_FOLDER = $NNI_DEPENDENCY_FOLDER -split '\\' -join '\\' $SCRIPT_PATH = $NNI_DEPENDENCY_FOLDER + '\extract.py' -$SCRIPT = "import tarfile", - ("tar = tarfile.open(""{0}"")" -f $NNI_YARN_TARBALL), - ("tar.extractall(""{0}"")" -f $NNI_DEPENDENCY_FOLDER), +$SCRIPT = "import tarfile", + ("tar = tarfile.open(""{0}"")" -f $NNI_YARN_TARBALL), + ("tar.extractall(""{0}"")" -f $NNI_DEPENDENCY_FOLDER), "tar.close()" [System.IO.File]::WriteAllLines($SCRIPT_PATH, $SCRIPT) Add-Type -AssemblyName System.IO.Compression.FileSystem -function Unzip{ +function Unzip { param([string]$zipfile, [string]$outpath) [System.IO.Compression.ZipFile]::ExtractToDirectory($zipfile, $outpath) } if ($install_node) { ### nodejs install - if(!(Test-Path $NNI_NODE_FOLDER)){ + if (!(Test-Path $NNI_NODE_FOLDER)) { Unzip $NNI_NODE_ZIP $NNI_DEPENDENCY_FOLDER $unzipNodeDir = Get-ChildItem "$NNI_DEPENDENCY_FOLDER\$unzipNodeDir" Rename-Item $unzipNodeDir "nni-node" } Copy-Item "$NNI_NODE_FOLDER\node.exe" $NNI_PYTHON_SCRIPTS -Recurse -Force +} + +if ($install_yarn) { ### yarn install - if(!(Test-Path $NNI_YARN_FOLDER)){ + if (!(Test-Path $NNI_YARN_FOLDER)) { cmd /C """$NNI_PYTHON3\python""" $SCRIPT_PATH $unzipYarnDir = Get-ChildItem "$NNI_DEPENDENCY_FOLDER\$unzipYarnDir" Rename-Item $unzipYarnDir "nni-yarn" @@ -104,37 +109,82 @@ if ($install_node) { ## install-python-modules: ### Installing Python SDK (Get-Content setup.py).replace($NNI_VERSION_TEMPLATE, $NNI_VERSION_VALUE) | Set-Content setup.py -cmd /c $PIP_INSTALL + +if ($Development) { + $PYTHON_BUILD = "build" + if (Test-Path $PYTHON_BUILD) { + # To compat with file and links. 
+ cmd /c rmdir /s /q $PYTHON_BUILD + } + New-Item $PYTHON_BUILD -ItemType Directory + New-Item -ItemType Junction -Path "$($PYTHON_BUILD)\nni" -Target "src\sdk\pynni\nni" + New-Item -ItemType Junction -Path "$($PYTHON_BUILD)\nnicli" -Target "src\sdk\pycli\nnicli" + New-Item -ItemType Junction -Path "$($PYTHON_BUILD)\nni_annotation" -Target "tools\nni_annotation" + New-Item -ItemType Junction -Path "$($PYTHON_BUILD)\nni_cmd" -Target "tools\nni_cmd" + New-Item -ItemType Junction -Path "$($PYTHON_BUILD)\nni_trial_tool" -Target "tools\nni_trial_tool" + New-Item -ItemType Junction -Path "$($PYTHON_BUILD)\nni_gpu_tool" -Target "tools\nni_gpu_tool" + + Copy-Item setup.py $PYTHON_BUILD + Copy-Item README.md $PYTHON_BUILD + + Push-Location build + #update folders in setup file + (Get-Content setup.py).replace("src/sdk/pynni/", "") | Set-Content setup.py + (Get-Content setup.py).replace("src/sdk/pycli/", "") | Set-Content setup.py + (Get-Content setup.py).replace("src/sdk/pynni", ".") | Set-Content setup.py + (Get-Content setup.py).replace("tools/", "") | Set-Content setup.py + # install current folder. + cmd /c $PIP_INSTALL -e . + Pop-Location +} +else { + cmd /c $PIP_INSTALL . +} # Building NNI Manager -$env:PATH=$NNI_PYTHON_SCRIPTS+';'+$env:PATH +$env:PATH = $NNI_PYTHON_SCRIPTS + ';' + $env:PATH cd src\nni_manager cmd /c $NNI_YARN cmd /c $NNI_YARN build Copy-Item config -Destination .\dist\ -Recurse -Force # Building WebUI +# office-ui-fabric-react need longer time. the 180000 is in ms, mean 180 seconds, longer than default 30 seconds. cd ..\webui -cmd /c $NNI_YARN +cmd /c $NNI_YARN --network-timeout 180000 cmd /c $NNI_YARN build # Building NasUI cd ..\nasui -cmd /c $NNI_YARN +cmd /c $NNI_YARN --network-timeout 180000 cmd /c $NNI_YARN build cd ..\.. ## install-node-modules -if(!(Test-Path $NNI_PKG_FOLDER)){ - New-Item $NNI_PKG_FOLDER -ItemType Directory +if (Test-Path $NNI_PKG_FOLDER) { + # it needs to remove the whole folder for following copy. 
+ cmd /c rmdir /s /q $NNI_PKG_FOLDER +} + +$NNI_PKG_FOLDER_STATIC = $NNI_PKG_FOLDER + "\static" +$NASUI_PKG_FOLDER = $NNI_PKG_FOLDER + "\nasui" + +if ($Development) { + New-Item -ItemType Junction -Path $($NNI_PKG_FOLDER) -Target "src\nni_manager\dist" + New-Item -ItemType Junction -Path "$($NNI_PKG_FOLDER)\node_modules" -Target "src\nni_manager\node_modules" + New-Item -ItemType Junction -Path $($NNI_PKG_FOLDER_STATIC) -Target "src\webui\build" + New-Item -ItemType Junction -Path $($NASUI_PKG_FOLDER) -Target "src\nasui\build" +} +else { + Copy-Item "src\nni_manager\dist" $NNI_PKG_FOLDER -Recurse + Copy-Item "src\webui\build" $NNI_PKG_FOLDER_STATIC -Recurse + Copy-Item "src\nasui\build" $NASUI_PKG_FOLDER -Recurse } -Remove-Item $NNI_PKG_FOLDER -Recurse -Force -Copy-Item "src\nni_manager\dist" $NNI_PKG_FOLDER -Recurse + Copy-Item "src\nni_manager\package.json" $NNI_PKG_FOLDER $PKG_JSON = $NNI_PKG_FOLDER + "\package.json" (Get-Content $PKG_JSON).replace($NNI_VERSION_TEMPLATE, $NNI_VERSION_VALUE) | Set-Content $PKG_JSON -cmd /c $NNI_YARN --prod --cwd $NNI_PKG_FOLDER -$NNI_PKG_FOLDER_STATIC = $NNI_PKG_FOLDER + "\static" -$NASUI_PKG_FOLDER = $NNI_PKG_FOLDER + "\nasui" -Copy-Item "src\webui\build" $NNI_PKG_FOLDER_STATIC -Recurse -Copy-Item "src\nasui\build" $NASUI_PKG_FOLDER -Recurse Copy-Item "src\nasui\server.js" $NASUI_PKG_FOLDER -Recurse + +if (!$Development) { + cmd /c $NNI_YARN --prod --cwd $NNI_PKG_FOLDER +} diff --git a/pylintrc b/pylintrc index a5a924ee7c..e23cacfb12 100644 --- a/pylintrc +++ b/pylintrc @@ -45,4 +45,6 @@ enable= unused-wildcard-import, ignore-patterns=test* # List of members which are set dynamically and missed by pylint inference -generated-members=numpy.*,torch.* +generated-members=numpy.*,torch.*,tensorflow.* + +ignored-modules=tensorflow diff --git a/src/nni_manager/common/utils.ts b/src/nni_manager/common/utils.ts index a79484dd3a..413d2ee220 100644 --- a/src/nni_manager/common/utils.ts +++ b/src/nni_manager/common/utils.ts @@ -22,7 +22,7 
@@ import { HyperParameters, TrainingService, TrialJobStatus } from './trainingServ function getExperimentRootDir(): string { return getExperimentStartupInfo() - .getLogDir(); + .getLogDir(); } function getLogDir(): string { @@ -31,7 +31,7 @@ function getLogDir(): string { function getLogLevel(): string { return getExperimentStartupInfo() - .getLogLevel(); + .getLogLevel(); } function getDefaultDatabaseDir(): string { @@ -113,11 +113,16 @@ function uniqueString(len: number): string { return String.fromCharCode(...codes); } +function randomInt(max: number): number { + return Math.floor(Math.random() * max); +} + function randomSelect(a: T[]): T { assert(a !== undefined); return a[Math.floor(Math.random() * a.length)]; } + function parseArg(names: string[]): string { if (process.argv.length >= 4) { for (let i: number = 2; i < process.argv.length - 1; i++) { @@ -132,7 +137,7 @@ function parseArg(names: string[]): string { function getCmdPy(): string { let cmd = 'python3'; - if(process.platform === 'win32'){ + if (process.platform === 'win32') { cmd = 'python'; } return cmd; @@ -160,7 +165,7 @@ function generateParamFileName(hyperParameters: HyperParameters): string { assert(hyperParameters.index >= 0); let paramFileName: string; - if(hyperParameters.index == 0) { + if (hyperParameters.index == 0) { paramFileName = 'parameter.cfg'; } else { paramFileName = `parameter_${hyperParameters.index}.cfg` @@ -211,9 +216,9 @@ function getIPV4Address(): string { return cachedipv4Address; } - if(os.networkInterfaces().eth0) { - for(const item of os.networkInterfaces().eth0) { - if(item.family === 'IPv4') { + if (os.networkInterfaces().eth0) { + for (const item of os.networkInterfaces().eth0) { + if (item.family === 'IPv4') { cachedipv4Address = item.address; return cachedipv4Address; } @@ -225,14 +230,6 @@ function getIPV4Address(): string { throw Error('getIPV4Address() failed because no valid IPv4 address found.') } -function getRemoteTmpDir(osType: string): string { - if 
(osType == 'linux') { - return '/tmp'; - } else { - throw Error(`remote OS ${osType} not supported`); - } -} - /** * Get the status of canceled jobs according to the hint isEarlyStopped */ @@ -245,7 +242,7 @@ function getJobCancelStatus(isEarlyStopped: boolean): TrialJobStatus { * @param directory directory name */ function countFilesRecursively(directory: string): Promise { - if(!fs.existsSync(directory)) { + if (!fs.existsSync(directory)) { throw Error(`Direcotory ${directory} doesn't exist`); } @@ -261,13 +258,13 @@ function countFilesRecursively(directory: string): Promise { let fileCount: number = -1; let cmd: string; - if(process.platform === "win32") { + if (process.platform === "win32") { cmd = `powershell "Get-ChildItem -Path ${directory} -Recurse -File | Measure-Object | %{$_.Count}"` } else { cmd = `find ${directory} -type f | wc -l`; } cpp.exec(cmd).then((result) => { - if(result.stdout && parseInt(result.stdout)) { + if (result.stdout && parseInt(result.stdout)) { fileCount = parseInt(result.stdout); } deferred.resolve(fileCount); @@ -280,20 +277,20 @@ function countFilesRecursively(directory: string): Promise { function validateFileName(fileName: string): boolean { const pattern: string = '^[a-z0-9A-Z._-]+$'; const validateResult = fileName.match(pattern); - if(validateResult) { + if (validateResult) { return true; } return false; } async function validateFileNameRecursively(directory: string): Promise { - if(!fs.existsSync(directory)) { + if (!fs.existsSync(directory)) { throw Error(`Direcotory ${directory} doesn't exist`); } const fileNameArray: string[] = fs.readdirSync(directory); let result = true; - for(const name of fileNameArray){ + for (const name of fileNameArray) { const fullFilePath: string = path.join(directory, name); try { // validate file names and directory names @@ -301,14 +298,14 @@ async function validateFileNameRecursively(directory: string): Promise if (fs.lstatSync(fullFilePath).isDirectory()) { result = result && await 
validateFileNameRecursively(fullFilePath); } - if(!result) { + if (!result) { return Promise.reject(new Error(`file name in ${fullFilePath} is not valid!`)); } - } catch(error) { + } catch (error) { return Promise.reject(error); } } - return Promise.resolve(result); + return Promise.resolve(result); } /** @@ -316,9 +313,9 @@ async function validateFileNameRecursively(directory: string): Promise */ async function getVersion(): Promise { const deferred: Deferred = new Deferred(); - import(path.join(__dirname, '..', 'package.json')).then((pkg)=>{ + import(path.join(__dirname, '..', 'package.json')).then((pkg) => { deferred.resolve(pkg.version); - }).catch((error)=>{ + }).catch((error) => { deferred.reject(error); }); return deferred.promise; @@ -331,9 +328,9 @@ function getTunerProc(command: string, stdio: StdioOptions, newCwd: string, newE let cmd: string = command; let arg: string[] = []; let newShell: boolean = true; - if(process.platform === "win32"){ + if (process.platform === "win32") { cmd = command.split(" ", 1)[0]; - arg = command.substr(cmd.length+1).split(" "); + arg = command.substr(cmd.length + 1).split(" "); newShell = false; } const tunerProc: ChildProcess = spawn(cmd, arg, { @@ -383,7 +380,7 @@ async function killPid(pid: any): Promise { if (process.platform === "win32") { await cpp.exec(`cmd.exe /c taskkill /PID ${pid} /F`); } - else{ + else { await cpp.exec(`kill -9 ${pid}`); } } catch (error) { @@ -397,7 +394,7 @@ function getNewLine(): string { if (process.platform === "win32") { return "\r\n"; } - else{ + else { return "\n"; } } @@ -412,6 +409,8 @@ function unixPathJoin(...paths: any[]): string { return dir; } -export {countFilesRecursively, validateFileNameRecursively, getRemoteTmpDir, generateParamFileName, getMsgDispatcherCommand, getCheckpointDir, +export { + countFilesRecursively, validateFileNameRecursively, generateParamFileName, getMsgDispatcherCommand, getCheckpointDir, getLogDir, getExperimentRootDir, getJobCancelStatus, 
getDefaultDatabaseDir, getIPV4Address, unixPathJoin, - mkDirP, mkDirPSync, delay, prepareUnitTest, parseArg, cleanupUnitTest, uniqueString, randomSelect, getLogLevel, getVersion, getCmdPy, getTunerProc, isAlive, killPid, getNewLine }; + mkDirP, mkDirPSync, delay, prepareUnitTest, parseArg, cleanupUnitTest, uniqueString, randomInt, randomSelect, getLogLevel, getVersion, getCmdPy, getTunerProc, isAlive, killPid, getNewLine +}; diff --git a/src/nni_manager/core/ipcInterface.ts b/src/nni_manager/core/ipcInterface.ts index 340af109bc..e7c45beec6 100644 --- a/src/nni_manager/core/ipcInterface.ts +++ b/src/nni_manager/core/ipcInterface.ts @@ -23,11 +23,7 @@ const ipcIncomingFd: number = 4; */ function encodeCommand(commandType: string, content: string): Buffer { const contentBuffer: Buffer = Buffer.from(content); - if (contentBuffer.length >= 1_000_000) { - throw new RangeError('Command too long'); - } - const contentLengthBuffer: Buffer = Buffer.from(contentBuffer.length.toString().padStart(6, '0')); - + const contentLengthBuffer: Buffer = Buffer.from(contentBuffer.length.toString().padStart(14, '0')); return Buffer.concat([Buffer.from(commandType), contentLengthBuffer, contentBuffer]); } @@ -43,12 +39,12 @@ function decodeCommand(data: Buffer): [boolean, string, string, Buffer] { return [false, '', '', data]; } const commandType: string = data.slice(0, 2).toString(); - const contentLength: number = parseInt(data.slice(2, 8).toString(), 10); - if (data.length < contentLength + 8) { + const contentLength: number = parseInt(data.slice(2, 16).toString(), 10); + if (data.length < contentLength + 16) { return [false, '', '', data]; } - const content: string = data.slice(8, contentLength + 8).toString(); - const remain: Buffer = data.slice(contentLength + 8); + const content: string = data.slice(16, contentLength + 16).toString(); + const remain: Buffer = data.slice(contentLength + 16); return [true, commandType, content, remain]; } diff --git 
a/src/nni_manager/core/nnimanager.ts b/src/nni_manager/core/nnimanager.ts index b276168be1..5b86815ee0 100644 --- a/src/nni_manager/core/nnimanager.ts +++ b/src/nni_manager/core/nnimanager.ts @@ -266,7 +266,7 @@ class NNIManager implements Manager { const delay1: Promise<{}> = new Promise((resolve: Function, reject: Function): void => { timeoutId = setTimeout( () => { reject(new Error('TrainingService setClusterMetadata timeout. Please check your config file.')); }, - 10000); + 30000); }); await Promise.race([delay1, this.trainingService.setClusterMetadata(key, value)]).finally(() => { clearTimeout(timeoutId); @@ -368,7 +368,7 @@ class NNIManager implements Manager { CUDA_VISIBLE_DEVICES: this.getGpuEnvvarValue() }; const newEnv = Object.assign({}, process.env, nniEnv); - const tunerProc: ChildProcess = getTunerProc(command,stdio,newCwd,newEnv); + const tunerProc: ChildProcess = getTunerProc(command, stdio, newCwd, newEnv); this.dispatcherPid = tunerProc.pid; this.dispatcher = createDispatcherInterface(tunerProc); @@ -436,7 +436,9 @@ class NNIManager implements Manager { } await killPid(this.dispatcherPid); const trialJobList: TrialJobDetail[] = await this.trainingService.listTrialJobs(); - // TO DO: to promise all + + // DON'T try to make it in parallel, the training service may not handle it well. + // If there is performance concern, consider to support batch cancellation on training service. for (const trialJob of trialJobList) { if (trialJob.status === 'RUNNING' || trialJob.status === 'WAITING') { @@ -444,7 +446,7 @@ class NNIManager implements Manager { this.log.info(`cancelTrialJob: ${trialJob.id}`); await this.trainingService.cancelTrialJob(trialJob.id); } catch (error) { - // pid does not exist, do nothing here + this.log.debug(`ignorable error on canceling trial ${trialJob.id}. 
${error}`); } } } diff --git a/src/nni_manager/core/test/assessor.py b/src/nni_manager/core/test/assessor.py index 50d1949da3..004283cb51 100644 --- a/src/nni_manager/core/test/assessor.py +++ b/src/nni_manager/core/test/assessor.py @@ -8,13 +8,13 @@ def send(command, data): command = command.encode('utf8') data = data.encode('utf8') - msg = b'%b%06d%b' % (command, len(data), data) + msg = b'%b%14d%b' % (command, len(data), data) _out_file.write(msg) _out_file.flush() def receive(): - header = _in_file.read(8) + header = _in_file.read(16) l = int(header[2:]) command = header[:2].decode('utf8') data = _in_file.read(l).decode('utf8') diff --git a/src/nni_manager/core/test/ipcInterface.test.ts b/src/nni_manager/core/test/ipcInterface.test.ts index 1742303379..4ddeda8a1f 100644 --- a/src/nni_manager/core/test/ipcInterface.test.ts +++ b/src/nni_manager/core/test/ipcInterface.test.ts @@ -14,7 +14,6 @@ import { NNIError } from '../../common/errors'; let sentCommands: { [key: string]: string }[] = []; const receivedCommands: { [key: string]: string }[] = []; -let commandTooLong: Error | undefined; let rejectCommandType: Error | undefined; function runProcess(): Promise { @@ -54,14 +53,7 @@ function runProcess(): Promise { // Command #2: ok dispatcher.sendCommand('ME', '123'); - // Command #3: too long - try { - dispatcher.sendCommand('ME', 'x'.repeat(1_000_000)); - } catch (error) { - commandTooLong = error; - } - - // Command #4: FE is not tuner/assessor command, test the exception type of send non-valid command + // Command #3: FE is not tuner/assessor command, test the exception type of send non-valid command try { dispatcher.sendCommand('FE', '1'); } catch (error) { @@ -88,21 +80,11 @@ describe('core/protocol', (): void => { }); it('sendCommand() should work without content', (): void => { - assert.equal(sentCommands[0], '(\'IN\', \'\')'); + assert.equal(sentCommands[0], "('IN', '')"); }); it('sendCommand() should work with content', (): void => { - 
assert.equal(sentCommands[1], '(\'ME\', \'123\')'); - }); - - it('sendCommand() should throw on too long command', (): void => { - if (commandTooLong === undefined) { - assert.fail('Should throw error') - } else { - const err: Error | undefined = (commandTooLong).cause; - assert(err && err.name === 'RangeError'); - assert(err && err.message === 'Command too long'); - } + assert.equal(sentCommands[1], "('ME', '123')"); }); it('sendCommand() should throw on wrong command type', (): void => { diff --git a/src/nni_manager/training_service/common/util.ts b/src/nni_manager/training_service/common/util.ts index 9328f01e61..dd7e368ac6 100644 --- a/src/nni_manager/training_service/common/util.ts +++ b/src/nni_manager/training_service/common/util.ts @@ -174,10 +174,11 @@ export async function tarAdd(tarPath: string, sourcePath: string): Promise script.push( `import os`, `import tarfile`, - String.Format(`tar = tarfile.open("{0}","w:gz")\r\nfor root,dir,files in os.walk("{1}"):`, tarFilePath, sourceFilePath), + String.Format(`tar = tarfile.open("{0}","w:gz")\r\nroot="{1}"\r\nfor file_path,dir,files in os.walk(root):`, tarFilePath, sourceFilePath), ` for file in files:`, - ` fullpath = os.path.join(root,file)`, - ` tar.add(fullpath, arcname=file)`, + ` full_path = os.path.join(file_path, file)`, + ` file = os.path.relpath(full_path, root)`, + ` tar.add(full_path, arcname=file)`, `tar.close()`); await fs.promises.writeFile(path.join(os.tmpdir(), 'tar.py'), script.join(getNewLine()), { encoding: 'utf8', mode: 0o777 }); const tarScript: string = path.join(os.tmpdir(), 'tar.py'); diff --git a/src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService.ts b/src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService.ts index 87e8ad6c7a..7f57178cdb 100644 --- a/src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService.ts +++ 
b/src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService.ts @@ -3,6 +3,7 @@ 'use strict'; +import * as assert from 'assert'; import * as cpp from 'child-process-promise'; import * as fs from 'fs'; import * as path from 'path'; @@ -72,6 +73,11 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple this.kubernetesRestServerPort = restServer.clusterRestServerPort; } + // wait upload of code Dir to finish + if (this.copyExpCodeDirPromise !== undefined) { + await this.copyExpCodeDirPromise; + } + const trialJobId: string = uniqueString(5); // Set trial's NFS working folder const trialWorkingFolder: string = path.join(this.CONTAINER_MOUNT_PATH, 'nni', getExperimentId(), trialJobId); @@ -81,8 +87,8 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple this.generateContainerPort(); await this.prepareRunScript(trialLocalTempFolder, trialJobId, trialWorkingFolder, form); - //upload code files - const trialJobOutputUrl: string = await this.uploadCodeFiles(trialJobId, trialLocalTempFolder); + //wait upload of script files to finish + const trialJobOutputUrl: string = await this.uploadFolder(trialLocalTempFolder, `nni/${getExperimentId()}/${trialJobId}`); let initStatus: TrialJobStatus = 'WAITING'; if (!trialJobOutputUrl) { initStatus = 'FAILED'; @@ -151,6 +157,8 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple // Validate to make sure codeDir doesn't have too many files try { await validateCodeDir(this.fcTrialConfig.codeDir); + //upload codeDir to storage + this.copyExpCodeDirPromise = this.uploadFolder(this.fcTrialConfig.codeDir, `nni/${getExperimentId()}/nni-code`); } catch (error) { this.log.error(error); @@ -171,41 +179,31 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple } /** - * upload code files to nfs or azureStroage - * @param trialJobId - * @param trialLocalTempFolder - * return: trialJobOutputUrl 
+ * upload local folder to nfs or azureStroage */ - private async uploadCodeFiles(trialJobId: string, trialLocalTempFolder: string): Promise { + private async uploadFolder(srcDirectory: string, destDirectory: string): Promise { if (this.fcClusterConfig === undefined) { throw new Error('Kubeflow Cluster config is not initialized'); } - if (this.fcTrialConfig === undefined) { - throw new Error('Kubeflow trial config is not initialized'); - } - - let trialJobOutputUrl: string = ''; + assert(this.fcClusterConfig.storage === undefined + || this.fcClusterConfig.storage === 'azureStorage' + || this.fcClusterConfig.storage === 'nfs'); - if (this.fcClusterConfig.storageType === 'azureStorage') { - const azureFrameworkControllerClusterConfig: FrameworkControllerClusterConfigAzure = - this.fcClusterConfig; - trialJobOutputUrl = await this.uploadFilesToAzureStorage(trialJobId, trialLocalTempFolder, this.fcTrialConfig.codeDir, - azureFrameworkControllerClusterConfig.uploadRetryCount); - } else if (this.fcClusterConfig.storageType === 'nfs') { - const nfsFrameworkControllerClusterConfig: FrameworkControllerClusterConfigNFS = - this.fcClusterConfig; - // Creat work dir for current trial in NFS directory - await cpp.exec(`mkdir -p ${this.trialLocalNFSTempFolder}/nni/${getExperimentId()}/${trialJobId}`); - // Copy code files from local dir to NFS mounted dir - await cpp.exec(`cp -r ${trialLocalTempFolder}/* ${this.trialLocalNFSTempFolder}/nni/${getExperimentId()}/${trialJobId}/.`); - // Copy codeDir to NFS mounted dir - await cpp.exec(`cp -r ${this.fcTrialConfig.codeDir}/* ${this.trialLocalNFSTempFolder}/nni/${getExperimentId()}/${trialJobId}/.`); - const nfsConfig: NFSConfig = nfsFrameworkControllerClusterConfig.nfs; - trialJobOutputUrl = `nfs://${nfsConfig.server}:${path.join(nfsConfig.path, 'nni', getExperimentId(), trialJobId, 'output')}`; + if (this.fcClusterConfig.storage === 'azureStorage') { + if (this.azureStorageClient === undefined) { + throw new 
Error('azureStorageClient is not initialized'); + } + const fcClusterConfigAzure: FrameworkControllerClusterConfigAzure = this.fcClusterConfig; + return await this.uploadFolderToAzureStorage(srcDirectory, destDirectory, fcClusterConfigAzure.uploadRetryCount); + } else if (this.fcClusterConfig.storage === 'nfs' || this.fcClusterConfig.storage === undefined) { + await cpp.exec(`mkdir -p ${this.trialLocalNFSTempFolder}/${destDirectory}`); + await cpp.exec(`cp -r ${srcDirectory}/* ${this.trialLocalNFSTempFolder}/${destDirectory}/.`); + const fcClusterConfigNFS: FrameworkControllerClusterConfigNFS = this.fcClusterConfig; + const nfsConfig: NFSConfig = fcClusterConfigNFS.nfs; + return `nfs://${nfsConfig.server}:${destDirectory}`; } - - return Promise.resolve(trialJobOutputUrl); + return ''; } /** diff --git a/src/nni_manager/training_service/kubernetes/kubeflow/kubeflowTrainingService.ts b/src/nni_manager/training_service/kubernetes/kubeflow/kubeflowTrainingService.ts index 98c84a30b0..8a082e949f 100644 --- a/src/nni_manager/training_service/kubernetes/kubeflow/kubeflowTrainingService.ts +++ b/src/nni_manager/training_service/kubernetes/kubeflow/kubeflowTrainingService.ts @@ -74,14 +74,20 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber const restServer: KubeflowJobRestServer = component.get(KubeflowJobRestServer); this.kubernetesRestServerPort = restServer.clusterRestServerPort; } + + // upload code Dir to storage + if (this.copyExpCodeDirPromise !== undefined) { + await this.copyExpCodeDirPromise; + } + const trialJobId: string = uniqueString(5); const trialWorkingFolder: string = path.join(this.CONTAINER_MOUNT_PATH, 'nni', getExperimentId(), trialJobId); const kubeflowJobName: string = `nni-exp-${this.experimentId}-trial-${trialJobId}`.toLowerCase(); const trialLocalTempFolder: string = path.join(getExperimentRootDir(), 'trials-local', trialJobId); //prepare the runscript await this.prepareRunScript(trialLocalTempFolder, trialJobId, 
trialWorkingFolder, form); - //upload files to sotrage - const trialJobOutputUrl: string = await this.uploadCodeFiles(trialJobId, trialLocalTempFolder); + //upload script files to sotrage + const trialJobOutputUrl: string = await this.uploadFolder(trialLocalTempFolder, `nni/${getExperimentId()}/${trialJobId}`); let initStatus: TrialJobStatus = 'WAITING'; if (!trialJobOutputUrl) { initStatus = 'FAILED'; @@ -152,6 +158,8 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber // Validate to make sure codeDir doesn't have too many files try { await validateCodeDir(this.kubeflowTrialConfig.codeDir); + //upload codeDir to storage + this.copyExpCodeDirPromise = this.uploadFolder(this.kubeflowTrialConfig.codeDir, `nni/${getExperimentId()}/nni-code`); } catch (error) { this.log.error(error); @@ -172,12 +180,9 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber } /** - * upload code files to nfs or azureStroage - * @param trialJobId - * @param trialLocalTempFolder - * return: trialJobOutputUrl + * upload local folder to nfs or azureStroage */ - private async uploadCodeFiles(trialJobId: string, trialLocalTempFolder: string): Promise { + private async uploadFolder(srcDirectory: string, destDirectory: string): Promise { if (this.kubeflowClusterConfig === undefined) { throw new Error('Kubeflow Cluster config is not initialized'); } @@ -186,8 +191,6 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber throw new Error('Kubeflow Trial config is not initialized'); } - let trialJobOutputUrl: string = ''; - assert(this.kubeflowClusterConfig.storage === undefined || this.kubeflowClusterConfig.storage === 'azureStorage' || this.kubeflowClusterConfig.storage === 'nfs'); @@ -197,20 +200,15 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber throw new Error('azureStorageClient is not initialized'); } const azureKubeflowClusterConfig: KubeflowClusterConfigAzure = 
this.kubeflowClusterConfig; - trialJobOutputUrl = await this.uploadFilesToAzureStorage(trialJobId, trialLocalTempFolder, this.kubeflowTrialConfig.codeDir, azureKubeflowClusterConfig.uploadRetryCount); + return await this.uploadFolderToAzureStorage(srcDirectory, destDirectory, azureKubeflowClusterConfig.uploadRetryCount); } else if (this.kubeflowClusterConfig.storage === 'nfs' || this.kubeflowClusterConfig.storage === undefined) { + await cpp.exec(`mkdir -p ${this.trialLocalNFSTempFolder}/${destDirectory}`); + await cpp.exec(`cp -r ${srcDirectory}/* ${this.trialLocalNFSTempFolder}/${destDirectory}/.`); const nfsKubeflowClusterConfig: KubeflowClusterConfigNFS = this.kubeflowClusterConfig; - // Creat work dir for current trial in NFS directory - await cpp.exec(`mkdir -p ${this.trialLocalNFSTempFolder}/nni/${getExperimentId()}/${trialJobId}`); - // Copy script files from local dir to NFS mounted dir - await cpp.exec(`cp -r ${trialLocalTempFolder}/* ${this.trialLocalNFSTempFolder}/nni/${getExperimentId()}/${trialJobId}/.`); - // Copy codeDir to NFS mounted dir - await cpp.exec(`cp -r ${this.kubeflowTrialConfig.codeDir}/* ${this.trialLocalNFSTempFolder}/nni/${getExperimentId()}/${trialJobId}/.`); const nfsConfig: NFSConfig = nfsKubeflowClusterConfig.nfs; - trialJobOutputUrl = `nfs://${nfsConfig.server}:${path.join(nfsConfig.path, 'nni', getExperimentId(), trialJobId, 'output')}`; + return `nfs://${nfsConfig.server}:${destDirectory}`; } - - return Promise.resolve(trialJobOutputUrl); + return ''; } private async prepareRunScript(trialLocalTempFolder: string, trialJobId: string, trialWorkingFolder: string, diff --git a/src/nni_manager/training_service/kubernetes/kubernetesData.ts b/src/nni_manager/training_service/kubernetes/kubernetesData.ts index 3d3cd3a5b8..7e0729e24c 100644 --- a/src/nni_manager/training_service/kubernetes/kubernetesData.ts +++ b/src/nni_manager/training_service/kubernetes/kubernetesData.ts @@ -39,7 +39,7 @@ export class KubernetesTrialJobDetail 
implements TrialJobDetail { export const kubernetesScriptFormat: string = `#!/bin/bash export NNI_PLATFORM={0} -export NNI_SYS_DIR=$PWD/nni/{1} +export NNI_SYS_DIR={1} export NNI_OUTPUT_DIR={2} export MULTI_PHASE=false export NNI_TRIAL_JOB_ID={3} @@ -49,7 +49,7 @@ export NNI_TRIAL_SEQ_ID={6} {7} mkdir -p $NNI_SYS_DIR mkdir -p $NNI_OUTPUT_DIR -cp -rT $NNI_CODE_DIR $NNI_SYS_DIR +cp -r $NNI_CODE_DIR/. $NNI_SYS_DIR cd $NNI_SYS_DIR sh install_nni.sh python3 -m nni_trial_tool.trial_keeper --trial_command '{8}' --nnimanager_ip {9} --nnimanager_port {10} \ diff --git a/src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts b/src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts index 56870fac97..f21ac9ad69 100644 --- a/src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts +++ b/src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts @@ -49,6 +49,8 @@ abstract class KubernetesTrainingService { protected kubernetesClusterConfig?: KubernetesClusterConfig; protected versionCheck: boolean = true; protected logCollection: string; + protected copyExpCodeDirPromise?: Promise; + protected expContainerCodeFolder: string; constructor() { this.log = getLogger(); @@ -57,6 +59,7 @@ abstract class KubernetesTrainingService { this.trialLocalNFSTempFolder = path.join(getExperimentRootDir(), 'trials-nfs-tmp'); this.experimentId = getExperimentId(); this.CONTAINER_MOUNT_PATH = '/tmp/mount'; + this.expContainerCodeFolder = path.join(this.CONTAINER_MOUNT_PATH, 'nni', this.experimentId, 'nni-code'); this.genericK8sClient = new GeneralK8sClient(); this.logCollection = 'none'; } @@ -272,11 +275,11 @@ abstract class KubernetesTrainingService { const runScript: string = String.Format( kubernetesScriptFormat, platform, - trialJobId, + trialWorkingFolder, path.join(trialWorkingFolder, 'output', `${roleName}_output`), trialJobId, getExperimentId(), - trialWorkingFolder, + this.expContainerCodeFolder, trialSequenceId, 
nvidiaScript, command, @@ -329,51 +332,45 @@ abstract class KubernetesTrainingService { ); return registrySecretName; } - - protected async uploadFilesToAzureStorage(trialJobId: string, trialLocalTempFolder: string, codeDir: string, uploadRetryCount: number | undefined): Promise { + + /** + * upload local directory to azureStorage + * @param srcDirectory the source directory of local folder + * @param destDirectory the target directory in azure + * @param uploadRetryCount the retry time when upload failed + */ + protected async uploadFolderToAzureStorage(srcDirectory: string, destDirectory: string, uploadRetryCount: number | undefined): Promise { if (this.azureStorageClient === undefined) { throw new Error('azureStorageClient is not initialized'); } - let trialJobOutputUrl: string = ''; let retryCount: number = 1; if(uploadRetryCount) { retryCount = uploadRetryCount; } - let resultUploadNNIScript: boolean = false; - let resultUploadCodeFile: boolean = false; + let uploadSuccess: boolean = false; + let folderUriInAzure = ''; try { do { - //upload local files, including scripts for running the trial and configuration (e.g., hyperparameters) for the trial, to azure storage - if(!resultUploadNNIScript) { - resultUploadNNIScript = await AzureStorageClientUtility.uploadDirectory(this.azureStorageClient, - `nni/${getExperimentId()}/${trialJobId}`, this.azureStorageShare, - `${trialLocalTempFolder}`); - } - //upload code files to azure storage - if(!resultUploadCodeFile) { - resultUploadCodeFile = await AzureStorageClientUtility.uploadDirectory(this.azureStorageClient, - `nni/${getExperimentId()}/${trialJobId}`, this.azureStorageShare, - `${codeDir}`); - } - if (resultUploadNNIScript && resultUploadCodeFile) { - trialJobOutputUrl = `https://${this.azureStorageAccountName}.file.core.windows.net/${this.azureStorageShare}` + - `/${path.join('nni', getExperimentId(), trialJobId, 'output')}`; - break; - } else { + uploadSuccess = await AzureStorageClientUtility.uploadDirectory( 
+ this.azureStorageClient, + `${destDirectory}`, + this.azureStorageShare, + `${srcDirectory}`); + if (!uploadSuccess) { //wait for 5 seconds to re-upload files await delay(5000); this.log.info('Upload failed, Retry: upload files to azure-storage'); + } else { + folderUriInAzure = `https://${this.azureStorageAccountName}.file.core.windows.net/${this.azureStorageShare}/${destDirectory}`; + break; } } while (retryCount-- >= 0) } catch (error) { this.log.error(error); //return a empty url when got error - return Promise.resolve(""); - } - if(!trialJobOutputUrl) { - this.log.info(`Retry-count is used up, upload files to azureStorage for trial ${trialJobId} failed!`); + return Promise.resolve(''); } - return Promise.resolve(trialJobOutputUrl); + return Promise.resolve(folderUriInAzure); } } diff --git a/src/nni_manager/training_service/local/localTrainingService.ts b/src/nni_manager/training_service/local/localTrainingService.ts index fed15a3ff7..27bd42c385 100644 --- a/src/nni_manager/training_service/local/localTrainingService.ts +++ b/src/nni_manager/training_service/local/localTrainingService.ts @@ -361,21 +361,25 @@ class LocalTrainingService implements TrainingService { trialJobDetail: TrialJobDetail, resource: { gpuIndices: number[] }, gpuNum: number | undefined): { key: string; value: string }[] { - const envVariables: { key: string; value: string }[] = [ - { key: 'NNI_PLATFORM', value: 'local' }, - { key: 'NNI_EXP_ID', value: this.experimentId }, - { key: 'NNI_SYS_DIR', value: trialJobDetail.workingDirectory }, - { key: 'NNI_TRIAL_JOB_ID', value: trialJobDetail.id }, - { key: 'NNI_OUTPUT_DIR', value: trialJobDetail.workingDirectory }, - { key: 'NNI_TRIAL_SEQ_ID', value: trialJobDetail.form.sequenceId.toString() }, - { key: 'MULTI_PHASE', value: this.isMultiPhase.toString() } - ]; - if (gpuNum !== undefined) { - envVariables.push({ - key: 'CUDA_VISIBLE_DEVICES', - value: this.gpuScheduler === undefined ? 
'-1' : resource.gpuIndices.join(',') - }); - } + if (this.localTrialConfig === undefined) { + throw new Error('localTrialConfig is not initialized!'); + } + const envVariables: { key: string; value: string }[] = [ + { key: 'NNI_PLATFORM', value: 'local' }, + { key: 'NNI_EXP_ID', value: this.experimentId }, + { key: 'NNI_SYS_DIR', value: trialJobDetail.workingDirectory }, + { key: 'NNI_TRIAL_JOB_ID', value: trialJobDetail.id }, + { key: 'NNI_OUTPUT_DIR', value: trialJobDetail.workingDirectory }, + { key: 'NNI_TRIAL_SEQ_ID', value: trialJobDetail.form.sequenceId.toString() }, + { key: 'MULTI_PHASE', value: this.isMultiPhase.toString() }, + { key: 'NNI_CODE_DIR', value: this.localTrialConfig.codeDir} + ]; + if (gpuNum !== undefined) { + envVariables.push({ + key: 'CUDA_VISIBLE_DEVICES', + value: this.gpuScheduler === undefined ? '-1' : resource.gpuIndices.join(',') + }); + } return envVariables; } @@ -473,12 +477,16 @@ class LocalTrainingService implements TrainingService { private getScript(localTrialConfig: TrialConfig, workingDirectory: string): string[] { const script: string[] = []; if (process.platform === 'win32') { + script.push(`Copy-Item $env:NNI_CODE_DIR\\* -Destination $env:NNI_SYS_DIR -Recurse`); + script.push(`cd $env:NNI_SYS_DIR`); script.push( `cmd.exe /c ${localTrialConfig.command} 2>"${path.join(workingDirectory, 'stderr')}"`, `$NOW_DATE = [int64](([datetime]::UtcNow)-(get-date "1/1/1970")).TotalSeconds`, `$NOW_DATE = "$NOW_DATE" + (Get-Date -Format fff).ToString()`, `Write $LASTEXITCODE " " $NOW_DATE | Out-File "${path.join(workingDirectory, '.nni', 'state')}" -NoNewline -encoding utf8`); } else { + script.push(`cp -r $NNI_CODE_DIR/. 
$NNI_SYS_DIR`); + script.push(`cd $NNI_SYS_DIR`); script.push(`eval ${localTrialConfig.command} 2>"${path.join(workingDirectory, 'stderr')}"`); if (process.platform === 'darwin') { // https://superuser.com/questions/599072/how-to-get-bash-execution-time-in-milliseconds-under-mac-os-x @@ -506,7 +514,6 @@ class LocalTrainingService implements TrainingService { if (process.platform !== 'win32') { runScriptContent.push('#!/bin/bash'); } - runScriptContent.push(`cd '${this.localTrialConfig.codeDir}'`); for (const variable of variables) { runScriptContent.push(setEnvironmentVariable(variable)); } diff --git a/src/nni_manager/training_service/pai/paiK8S/paiK8SData.ts b/src/nni_manager/training_service/pai/paiK8S/paiK8SData.ts index a1733f99cd..f75511d826 100644 --- a/src/nni_manager/training_service/pai/paiK8S/paiK8SData.ts +++ b/src/nni_manager/training_service/pai/paiK8S/paiK8SData.ts @@ -31,7 +31,6 @@ fi`; export const PAI_K8S_TRIAL_COMMAND_FORMAT: string = `export NNI_PLATFORM=pai NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3} NNI_TRIAL_SEQ_ID={4} MULTI_PHASE={5} \ -&& ls $NNI_SYS_DIR \ -&& cd $NNI_SYS_DIR && sh install_nni.sh \ -&& python3 -m nni_trial_tool.trial_keeper --trial_command '{6}' --nnimanager_ip '{7}' --nnimanager_port '{8}' \ ---nni_manager_version '{9}' --log_collection '{10}'`; +&& NNI_CODE_DIR={6} && cp -r $NNI_CODE_DIR/. 
$NNI_SYS_DIR && cd $NNI_SYS_DIR && sh install_nni.sh \ +&& python3 -m nni_trial_tool.trial_keeper --trial_command '{7}' --nnimanager_ip '{8}' --nnimanager_port '{9}' \ +--nni_manager_version '{10}' --log_collection '{11}'`; diff --git a/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts b/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts index 48737ead35..a7a02e5cc7 100644 --- a/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts +++ b/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts @@ -53,6 +53,7 @@ const yaml = require('js-yaml'); @component.Singleton class PAIK8STrainingService extends PAITrainingService { protected paiTrialConfig: NNIPAIK8STrialConfig | undefined; + private copyExpCodeDirPromise?: Promise; private paiJobConfig: undefined; private nniVersion: string | undefined; constructor() { @@ -78,7 +79,7 @@ class PAIK8STrainingService extends PAITrainingService { } break; - case TrialConfigMetadataKey.TRIAL_CONFIG: + case TrialConfigMetadataKey.TRIAL_CONFIG: { if (this.paiClusterConfig === undefined) { this.log.error('pai cluster config is not initialized'); break; @@ -86,10 +87,15 @@ class PAIK8STrainingService extends PAITrainingService { this.paiTrialConfig = JSON.parse(value); // Validate to make sure codeDir doesn't have too many files await validateCodeDir(this.paiTrialConfig.codeDir); + const nniManagerNFSExpCodeDir = path.join(this.paiTrialConfig.nniManagerNFSMountPath, this.experimentId, 'nni-code'); + await execMkdir(nniManagerNFSExpCodeDir); + //Copy codeDir files to local working folder + this.copyExpCodeDirPromise = execCopydir(this.paiTrialConfig.codeDir, nniManagerNFSExpCodeDir); if (this.paiTrialConfig.paiConfigPath) { this.paiJobConfig = yaml.safeLoad(fs.readFileSync(this.paiTrialConfig.paiConfigPath, 'utf8')); } break; + } case TrialConfigMetadataKey.VERSION_CHECK: this.versionCheck = (value === 'true' || value === 'True'); this.nniVersion = this.versionCheck ? 
await getVersion() : ''; @@ -152,6 +158,7 @@ class PAIK8STrainingService extends PAITrainingService { if (this.paiTrialConfig === undefined) { throw new Error('trial config is not initialized'); } + const containerNFSExpCodeDir = `${this.paiTrialConfig.containerNFSMountPath}/${this.experimentId}/nni-code`; const containerWorkingDir: string = `${this.paiTrialConfig.containerNFSMountPath}/${this.experimentId}/${trialJobDetail.id}`; const nniManagerIp: string = this.nniManagerIpConfig ? this.nniManagerIpConfig.nniManagerIp : getIPV4Address(); const nniPaiTrialCommand: string = String.Format( @@ -162,6 +169,7 @@ class PAIK8STrainingService extends PAITrainingService { this.experimentId, trialJobDetail.form.sequenceId, this.isMultiPhase, + containerNFSExpCodeDir, command, nniManagerIp, this.paiRestServerPort, @@ -264,15 +272,18 @@ class PAIK8STrainingService extends PAITrainingService { throw new Error('paiJobRestServer is not initialized'); } + // Make sure experiment code files is copied from local to NFS + if (this.copyExpCodeDirPromise !== undefined) { + await this.copyExpCodeDirPromise; + } + this.paiRestServerPort = this.paiJobRestServer.clusterRestServerPort; // Step 1. Prepare PAI job configuration //create trial local working folder locally. 
await execMkdir(trialJobDetail.logPath); - - const runScriptContent: string = CONTAINER_INSTALL_NNI_SHELL_FORMAT; // Write NNI installation file to local files - await fs.promises.writeFile(path.join(trialJobDetail.logPath, 'install_nni.sh'), runScriptContent, { encoding: 'utf8' }); + await fs.promises.writeFile(path.join(trialJobDetail.logPath, 'install_nni.sh'), CONTAINER_INSTALL_NNI_SHELL_FORMAT, { encoding: 'utf8' }); // Write file content ( parameter.cfg ) to local working folders if (trialJobDetail.form !== undefined) { @@ -284,7 +295,7 @@ class PAIK8STrainingService extends PAITrainingService { //Generate Job Configuration in yaml format const paiJobConfig = this.generateJobConfigInYamlFormat(trialJobDetail); this.log.debug(paiJobConfig); - // Step 3. Submit PAI job via Rest call + // Step 2. Submit PAI job via Rest call // Refer https://github.com/Microsoft/pai/blob/master/docs/rest-server/API.md for more detail about PAI Rest API const submitJobRequest: request.Options = { uri: `${this.protocol}://${this.paiClusterConfig.host}/rest-server/api/v2/jobs`, diff --git a/src/nni_manager/training_service/remote_machine/extends/linuxCommands.ts b/src/nni_manager/training_service/remote_machine/extends/linuxCommands.ts index 29bb688d17..a2db2d7515 100644 --- a/src/nni_manager/training_service/remote_machine/extends/linuxCommands.ts +++ b/src/nni_manager/training_service/remote_machine/extends/linuxCommands.ts @@ -7,6 +7,36 @@ import { OsCommands } from "../osCommands"; import { RemoteCommandResult } from "../remoteMachineData"; class LinuxCommands extends OsCommands { + + public getScriptExt(): string { + return "sh"; + } + + public generateStartScript(workingDirectory: string, trialJobId: string, experimentId: string, + trialSequenceId: string, isMultiPhase: boolean, jobIdFileName: string, + command: string, nniManagerAddress: string, nniManagerPort: number, + nniManagerVersion: string, logCollection: string, exitCodeFile: string, + codeDir: string, 
cudaVisibleSetting: string): string { + + return `#!/bin/bash + export NNI_PLATFORM=remote NNI_SYS_DIR=${workingDirectory} NNI_OUTPUT_DIR=${workingDirectory} NNI_TRIAL_JOB_ID=${trialJobId} \ + NNI_EXP_ID=${experimentId} NNI_TRIAL_SEQ_ID=${trialSequenceId} NNI_CODE_DIR=${codeDir} + export MULTI_PHASE=${isMultiPhase} + + cp -r $NNI_CODE_DIR/. $NNI_SYS_DIR + cd $NNI_SYS_DIR + sh install_nni.sh + python3 -m nni_trial_tool.trial_keeper --trial_command '${cudaVisibleSetting} ${command}' --nnimanager_ip '${nniManagerAddress}' \ + --nnimanager_port '${nniManagerPort}' --nni_manager_version '${nniManagerVersion}' \ + --job_id_file ${jobIdFileName} \ + --log_collection '${logCollection}' 1>$NNI_OUTPUT_DIR/trialkeeper_stdout 2>$NNI_OUTPUT_DIR/trialkeeper_stderr + echo $? \`date +%s%3N\` >${exitCodeFile}`; + } + + public generateGpuStatsScript(scriptFolder: string): string { + return `echo $$ > ${scriptFolder}/pid ; METRIC_OUTPUT_DIR=${scriptFolder} python3 -m nni_gpu_tool.gpu_metrics_collector`; + } + public createFolder(folderName: string, sharedFolder: boolean = false): string { let command; if (sharedFolder) { @@ -64,7 +94,19 @@ class LinuxCommands extends OsCommands { } public killChildProcesses(pidFileName: string): string { - const command = `pkill -P \`cat '${pidFileName}'\``; + // prevent trialkeeper to be killed, so it can save exit code. + const command = `list_descendants () + { + local children=$(ps -o pid= --ppid "$1") + + for pid in $children + do + list_descendants "$pid" + done + + echo "$children" + } + kill $(list_descendants \`cat '${pidFileName}'\`)` return command; } diff --git a/src/nni_manager/training_service/remote_machine/extends/windowsCommands.ts b/src/nni_manager/training_service/remote_machine/extends/windowsCommands.ts new file mode 100644 index 0000000000..c47d017168 --- /dev/null +++ b/src/nni_manager/training_service/remote_machine/extends/windowsCommands.ts @@ -0,0 +1,124 @@ +// Copyright (c) Microsoft Corporation. 
+// Licensed under the MIT license. + +'use strict'; + +import { OsCommands } from "../osCommands"; +import { RemoteCommandResult } from "../remoteMachineData"; + +class WindowsCommands extends OsCommands { + + protected pathSpliter: string = '\\'; + + public getScriptExt(): string { + return "cmd"; + } + public generateStartScript(workingDirectory: string, trialJobId: string, experimentId: string, + trialSequenceId: string, isMultiPhase: boolean, jobIdFileName: string, + command: string, nniManagerAddress: string, nniManagerPort: number, + nniManagerVersion: string, logCollection: string, exitCodeFile: string, + codeDir: string, cudaVisibleSetting: string): string { + return `echo off + set NNI_PLATFORM=remote + set NNI_SYS_DIR=${workingDirectory} + set NNI_OUTPUT_DIR=${workingDirectory} + set NNI_TRIAL_JOB_ID=${trialJobId} + set NNI_EXP_ID=${experimentId} + set NNI_TRIAL_SEQ_ID=${trialSequenceId} + set MULTI_PHASE=${isMultiPhase} + set NNI_CODE_DIR=${codeDir} + ${cudaVisibleSetting !== "" ? "set " + cudaVisibleSetting : ""} + + robocopy /s %NNI_CODE_DIR%/. 
%NNI_SYS_DIR% + cd %NNI_SYS_DIR% + python -c "import nni" 2>nul + if not %ERRORLEVEL% EQU 0 ( + echo installing NNI as exit code of "import nni" is %ERRORLEVEL% + python -m pip install --user --upgrade nni + ) + + echo starting script + python -m nni_trial_tool.trial_keeper --trial_command "${command}" --nnimanager_ip "${nniManagerAddress}" --nnimanager_port "${nniManagerPort}" --nni_manager_version "${nniManagerVersion}" --log_collection "${logCollection}" --job_id_file ${jobIdFileName} 1>%NNI_OUTPUT_DIR%/trialkeeper_stdout 2>%NNI_OUTPUT_DIR%/trialkeeper_stderr + + echo save exit code(%ERRORLEVEL%) and time + echo|set /p="%ERRORLEVEL% " > ${exitCodeFile} + powershell -command "Write (((New-TimeSpan -Start (Get-Date "01/01/1970") -End (Get-Date).ToUniversalTime()).TotalMilliseconds).ToString("0")) | Out-file ${exitCodeFile} -Append -NoNewline -encoding utf8"`; + } + + public generateGpuStatsScript(scriptFolder: string): string { + return `powershell -command $env:METRIC_OUTPUT_DIR='${scriptFolder}';$app = Start-Process -FilePath python -NoNewWindow -passthru -ArgumentList '-m nni_gpu_tool.gpu_metrics_collector' -RedirectStandardOutput ${scriptFolder}\\scriptstdout -RedirectStandardError ${scriptFolder}\\scriptstderr;Write $PID ^| Out-File ${scriptFolder}\\pid -NoNewline -encoding utf8;wait-process $app.ID`; + } + + public createFolder(folderName: string, sharedFolder: boolean = false): string { + let command; + if (sharedFolder) { + command = `mkdir "${folderName}"\r\nICACLS "${folderName}" /grant "Users":F`; + } else { + command = `mkdir "${folderName}"`; + } + return command; + } + + public allowPermission(isRecursive: boolean = false, ...folders: string[]): string { + let commands: string = ""; + + folders.forEach(folder => { + commands += `ICACLS "${folder}" /grant "Users":F${isRecursive ? 
" /T" : ""}\r\n` + }); + return commands; + } + + public removeFolder(folderName: string, isRecursive: boolean = false, isForce: boolean = true): string { + let flags = ''; + if (isForce || isRecursive) { + flags = `${isRecursive ? ' /s' : ''}${isForce ? ' /q' : ''}`; + } + + const command = `rmdir${flags} "${folderName}"`; + return command; + } + + public removeFiles(folderName: string, filePattern: string): string { + const files = this.joinPath(folderName, filePattern); + const command = `del "${files}"`; + return command; + } + + public readLastLines(fileName: string, lineCount: number = 1): string { + const command = `powershell.exe Get-Content "${fileName}" -Tail ${lineCount}`; + return command; + } + + public isProcessAliveCommand(pidFileName: string): string { + const command = `powershell.exe Get-Process -Id (get-content "${pidFileName}") -ErrorAction SilentlyContinue`; + return command; + } + + public isProcessAliveProcessOutput(commandResult: RemoteCommandResult): boolean { + let result = true; + if (commandResult.exitCode !== 0) { + result = false; + } + return result; + } + + public killChildProcesses(pidFileName: string): string { + const command = `powershell "$ppid=(type ${pidFileName}); function Kill-Tree {Param([int]$subppid);` + + `Get-CimInstance Win32_Process | Where-Object { $_.ParentProcessId -eq $subppid } | ForEach-Object { Kill-Tree $_.ProcessId }; ` + + `if ($subppid -ne $ppid){Stop-Process -Id $subppid}}` + + `kill-tree $ppid"`; + return command; + } + + public extractFile(tarFileName: string, targetFolder: string): string { + const command = `tar -xf "${tarFileName}" -C "${targetFolder}"`; + return command; + } + + public executeScript(script: string, _isFile: boolean): string { + const command = `${script}`; + return command; + } +} + +export { WindowsCommands }; diff --git a/src/nni_manager/training_service/remote_machine/osCommands.ts b/src/nni_manager/training_service/remote_machine/osCommands.ts index 804d964586..7f9144e435 100644 
--- a/src/nni_manager/training_service/remote_machine/osCommands.ts +++ b/src/nni_manager/training_service/remote_machine/osCommands.ts @@ -8,8 +8,16 @@ import { RemoteCommandResult } from "./remoteMachineData"; abstract class OsCommands { protected pathSpliter: string = '/'; - protected multiplePathSpliter: RegExp = new RegExp(`\\${this.pathSpliter}{2,}`); + protected multiplePathSpliter: RegExp = new RegExp(`[\\\\/]{2,}`); + protected normalizePath: RegExp = new RegExp(`[\\\\/]`); + public abstract getScriptExt(): string; + public abstract generateStartScript(workingDirectory: string, trialJobId: string, experimentId: string, + trialSequenceId: string, isMultiPhase: boolean, jobIdFileName: string, + command: string, nniManagerAddress: string, nniManagerPort: number, + nniManagerVersion: string, logCollection: string, exitCodeFile: string, + codeDir: string, cudaVisibleSetting: string): string; + public abstract generateGpuStatsScript(scriptFolder: string): string; public abstract createFolder(folderName: string, sharedFolder: boolean): string; public abstract allowPermission(isRecursive: boolean, ...folders: string[]): string; public abstract removeFolder(folderName: string, isRecursive: boolean, isForce: boolean): string; @@ -26,6 +34,9 @@ abstract class OsCommands { if (dir === '') { dir = '.'; } else { + // normalize + dir = dir.replace(this.normalizePath, this.pathSpliter); + // reduce duplicate ones dir = dir.replace(this.multiplePathSpliter, this.pathSpliter); } return dir; diff --git a/src/nni_manager/training_service/remote_machine/remoteMachineData.ts b/src/nni_manager/training_service/remote_machine/remoteMachineData.ts index 5f5f6455b7..28d49762f4 100644 --- a/src/nni_manager/training_service/remote_machine/remoteMachineData.ts +++ b/src/nni_manager/training_service/remote_machine/remoteMachineData.ts @@ -85,78 +85,82 @@ export class RemoteMachineTrialJobDetail implements TrialJobDetail { * The remote machine executor manager */ export class 
ExecutorManager { - private readonly executorArray: ShellExecutor[]; - private readonly maxTrialNumberPerConnection: number; + private readonly executorMap: Map = new Map(); private readonly rmMeta: RemoteMachineMeta; - constructor(executorArray: ShellExecutor[], maxTrialNumberPerConnection: number, rmMeta: RemoteMachineMeta) { + + private executors: ShellExecutor[] = []; + + constructor(rmMeta: RemoteMachineMeta) { this.rmMeta = rmMeta; - this.executorArray = executorArray; - this.maxTrialNumberPerConnection = maxTrialNumberPerConnection; } - /** - * find a available executor, if no executor available, return a new one - */ - public async getAvailableExecutor(): Promise { - for (const index of this.executorArray.keys()) { - const connectionNumber: number = this.executorArray[index].getUsedConnectionNumber; - if (connectionNumber < this.maxTrialNumberPerConnection) { - this.executorArray[index].addUsedConnectionNumber(); + public async getExecutor(id: string): Promise { + let isFound = false; + let executor: ShellExecutor | undefined; - return this.executorArray[index]; + // already assigned + if (this.executorMap.has(id)) { + executor = this.executorMap.get(id); + if (executor === undefined) { + throw new Error("executor shouldn't be undefined before return!"); } + return executor; } - //init a new executor if could not get an available one - return await this.initNewShellExecutor(); - } + for (const candidateExecutor of this.executors) { + if (candidateExecutor.addUsage()) { + isFound = true; + executor = candidateExecutor; + break; + } + } + // init a new executor if no free one. 
+ if (!isFound) { + executor = await this.createShellExecutor(); + } - /** - * add a new executor to executorArray - * @param executor ShellExecutor - */ - public addNewShellExecutor(executor: ShellExecutor): void { - this.executorArray.push(executor); - } + if (executor === undefined) { + throw new Error("executor shouldn't be undefined before set!"); + } + this.executorMap.set(id, executor); - /** - * first executor instance is used for gpu collector and host job - */ - public getFirstExecutor(): ShellExecutor { - return this.executorArray[0]; + return executor; } /** * close all of executor */ - public closeAllExecutor(): void { - for (const executor of this.executorArray) { + public releaseAllExecutor(): void { + this.executorMap.clear(); + for (const executor of this.executors) { executor.close(); } + this.executors = []; } /** * retrieve resource, minus a number for given executor * @param executor executor */ - public releaseConnection(executor: ShellExecutor | undefined): void { + public releaseExecutor(id: string): void { + const executor = this.executorMap.get(id); if (executor === undefined) { - throw new Error(`could not release a undefined executor`); - } - for (const index of this.executorArray.keys()) { - if (this.executorArray[index] === executor) { - this.executorArray[index].minusUsedConnectionNumber(); - break; - } + throw new Error(`executor for ${id} is not found`); } + executor.releaseUsage(); + this.executorMap.delete(id); } /** * Create a new connection executor and initialize it */ - private async initNewShellExecutor(): Promise { + private async createShellExecutor(): Promise { const executor = new ShellExecutor(); await executor.initialize(this.rmMeta); + if (!executor.addUsage()) { + throw new Error("failed to add usage on new created Executor! 
It's a wired bug!"); + } + this.executors.push(executor); return executor; } } @@ -175,21 +179,3 @@ export enum ScheduleResultType { // Cannot match requirement even if all GPU are a REQUIRE_EXCEED_TOTAL } - -export const REMOTEMACHINE_TRIAL_COMMAND_FORMAT: string = - `#!/bin/bash -export NNI_PLATFORM=remote NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3} \ -NNI_TRIAL_SEQ_ID={4} export MULTI_PHASE={5} -cd $NNI_SYS_DIR -sh install_nni.sh -echo $$ >{6} -python3 -m nni_trial_tool.trial_keeper --trial_command '{7}' --nnimanager_ip '{8}' --nnimanager_port '{9}' \ ---nni_manager_version '{10}' --log_collection '{11}' 1>$NNI_OUTPUT_DIR/trialkeeper_stdout 2>$NNI_OUTPUT_DIR/trialkeeper_stderr -echo $? \`date +%s%3N\` >{12}`; - -export const HOST_JOB_SHELL_FORMAT: string = - `#!/bin/bash -cd {0} -echo $$ >{1} -eval {2} >stdout 2>stderr -echo $? \`date +%s%3N\` >{3}`; diff --git a/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts b/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts index 157da50d0c..138fa6e0ae 100644 --- a/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts +++ b/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts @@ -8,7 +8,6 @@ import { EventEmitter } from 'events'; import * as fs from 'fs'; import * as path from 'path'; import { Deferred } from 'ts-deferred'; -import { String } from 'typescript-string-operations'; import * as component from '../../common/component'; import { NNIError, NNIErrorNames } from '../../common/errors'; import { getExperimentId } from '../../common/experimentStartupInfo'; @@ -19,17 +18,17 @@ import { TrialJobDetail, TrialJobMetric } from '../../common/trainingService'; import { - delay, generateParamFileName, getExperimentRootDir, getIPV4Address, getJobCancelStatus, getRemoteTmpDir, - getVersion, uniqueString, unixPathJoin + delay, generateParamFileName, getExperimentRootDir, getIPV4Address, 
getJobCancelStatus, + getVersion, uniqueString } from '../../common/utils'; import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../common/containerJobData'; import { GPUSummary } from '../common/gpuData'; import { TrialConfig } from '../common/trialConfig'; import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey'; -import { execCopydir, execMkdir, validateCodeDir, getGpuMetricsCollectorBashScriptContent } from '../common/util'; +import { execMkdir, validateCodeDir } from '../common/util'; import { GPUScheduler } from './gpuScheduler'; import { - REMOTEMACHINE_TRIAL_COMMAND_FORMAT, RemoteMachineMeta, + RemoteMachineMeta, RemoteMachineScheduleInfo, RemoteMachineScheduleResult, RemoteMachineTrialJobDetail, ScheduleResultType, ExecutorManager } from './remoteMachineData'; @@ -41,12 +40,12 @@ import { ShellExecutor } from 'training_service/remote_machine/shellExecutor'; */ @component.Singleton class RemoteMachineTrainingService implements TrainingService { + private readonly initExecutorId = "initConnection"; private readonly machineExecutorManagerMap: Map; //machine excutor map - private readonly trialExecutorMap: Map; //trial excutor map + private readonly machineCopyExpCodeDirPromiseMap: Map>; + private readonly trialExecutorManagerMap: Map; //trial excutor map private readonly trialJobsMap: Map; - private readonly MAX_TRIAL_NUMBER_PER_EXECUTOR: number = 5; // every excutor has a max trial concurrency number private readonly expRootDir: string; - private readonly remoteExpRootDir: string; private trialConfig: TrialConfig | undefined; private gpuScheduler?: GPUScheduler; private readonly jobQueue: string[]; @@ -55,25 +54,21 @@ class RemoteMachineTrainingService implements TrainingService { private readonly metricsEmitter: EventEmitter; private readonly log: Logger; private isMultiPhase: boolean = false; - private trialSequenceId: number; private remoteRestServerPort?: number; - private readonly remoteOS: string; private nniManagerIpConfig?: 
NNIManagerIpConfig; private versionCheck: boolean = true; private logCollection: string; constructor(@component.Inject timer: ObservableTimer) { - this.remoteOS = 'linux'; this.metricsEmitter = new EventEmitter(); this.trialJobsMap = new Map(); - this.trialExecutorMap = new Map(); + this.trialExecutorManagerMap = new Map(); + this.machineCopyExpCodeDirPromiseMap = new Map>(); this.machineExecutorManagerMap = new Map(); this.jobQueue = []; this.expRootDir = getExperimentRootDir(); - this.remoteExpRootDir = this.getRemoteExperimentRootDir(); this.timer = timer; this.log = getLogger(); - this.trialSequenceId = -1; this.logCollection = 'none'; this.log.info('Construct remote machine training service.'); } @@ -106,14 +101,14 @@ class RemoteMachineTrainingService implements TrainingService { } await delay(3000); } - this.log.info('Remote machine training service exit.'); + this.log.info('RemoteMachineTrainingService run loop exited.'); } /** * give trial an executor * @param trial remote machine trial job detail */ - public async allocateExecutorForTrial(trial: RemoteMachineTrialJobDetail): Promise { + public allocateExecutorManagerForTrial(trial: RemoteMachineTrialJobDetail): void { if (trial.rmMeta === undefined) { throw new Error(`rmMeta not set in trial ${trial.id}`); } @@ -121,23 +116,23 @@ class RemoteMachineTrainingService implements TrainingService { if (executorManager === undefined) { throw new Error(`executorManager not initialized`); } - const shellExecutor: ShellExecutor = await executorManager.getAvailableExecutor(); - this.trialExecutorMap.set(trial.id, shellExecutor); + this.trialExecutorManagerMap.set(trial.id, executorManager); } /** * If a trial is finished, release the connection resource * @param trial remote machine trial job detail */ - public releaseTrialExecutor(trial: RemoteMachineTrialJobDetail): void { + public releaseTrialResource(trial: RemoteMachineTrialJobDetail): void { if (trial.rmMeta === undefined) { throw new Error(`rmMeta not set in 
trial ${trial.id}`); } - const executorManager: ExecutorManager | undefined = this.machineExecutorManagerMap.get(trial.rmMeta); + const executorManager = this.trialExecutorManagerMap.get(trial.id); if (executorManager === undefined) { - throw new Error(`executorManager not initialized`); + throw new Error(`ExecutorManager is not assigned for trial ${trial.id}`); } - executorManager.releaseConnection(this.trialExecutorMap.get(trial.id)); + // Note, it still keep reference in trialExecutorManagerMap, as there may be following requests from nni manager. + executorManager.releaseExecutor(trial.id); } /** @@ -170,10 +165,7 @@ class RemoteMachineTrainingService implements TrainingService { if (trialJob.rmMeta === undefined) { throw new Error(`rmMeta not set for submitted job ${trialJobId}`); } - const executor: ShellExecutor | undefined = this.trialExecutorMap.get(trialJob.id); - if (executor === undefined) { - throw new Error(`Invalid job id: ${trialJobId}, cannot find executor`); - } + const executor = await this.getExecutor(trialJob.id); return this.updateTrialJobStatus(trialJob, executor); } else { @@ -208,13 +200,12 @@ class RemoteMachineTrainingService implements TrainingService { // Generate trial job id(random) const trialJobId: string = uniqueString(5); - const trialWorkingFolder: string = unixPathJoin(this.remoteExpRootDir, 'trials', trialJobId); const trialJobDetail: RemoteMachineTrialJobDetail = new RemoteMachineTrialJobDetail( trialJobId, 'WAITING', Date.now(), - trialWorkingFolder, + "unset", form ); this.jobQueue.push(trialJobId); @@ -264,26 +255,23 @@ class RemoteMachineTrainingService implements TrainingService { // Get executor where the job is running if (trialJob.rmMeta !== undefined) { // If the trial job is already scheduled, check its status and kill the trial process in remote machine - const executor: ShellExecutor | undefined = this.trialExecutorMap.get(trialJob.id); - if (executor === undefined) { - throw new Error(`Invalid job id 
${trialJobId}, cannot find executor`); - } + const executor = await this.getExecutor(trialJob.id); if (trialJob.status === 'UNKNOWN') { - this.releaseTrialExecutor(trialJob); trialJob.status = 'USER_CANCELED'; + this.releaseTrialResource(trialJob); return } - const jobpidPath: string = this.getJobPidPath(trialJob.id); + const jobpidPath: string = this.getJobPidPath(executor, trialJob.id); try { // Mark the toEarlyStop tag here trialJob.isEarlyStopped = isEarlyStopped; await executor.killChildProcesses(jobpidPath); - this.releaseTrialExecutor(trialJob); + this.releaseTrialResource(trialJob); } catch (error) { // Not handle the error since pkill failed will not impact trial job's current status - this.log.error(`remoteTrainingService.cancelTrialJob: ${error.message}`); + this.log.error(`remoteTrainingService.cancelTrialJob: ${error}`); } } else { // Job is not scheduled yet, set status to 'USER_CANCELLED' directly @@ -320,9 +308,20 @@ class RemoteMachineTrainingService implements TrainingService { throw new Error(`codeDir ${remoteMachineTrailConfig.codeDir} is not a directory`); } - // Validate to make sure codeDir doesn't have too many files try { + // Validate to make sure codeDir doesn't have too many files await validateCodeDir(remoteMachineTrailConfig.codeDir); + // Copy codeDir to remote machine + for (const [rmMeta, executorManager] of this.machineExecutorManagerMap.entries()) { + const executor: ShellExecutor = await executorManager.getExecutor(this.initExecutorId); + if (executor !== undefined) { + this.machineCopyExpCodeDirPromiseMap.set( + rmMeta, + executor.copyDirectoryToRemote(remoteMachineTrailConfig.codeDir, executor.getRemoteCodePath(getExperimentId())) + ); + } + } + } catch (error) { this.log.error(error); @@ -361,7 +360,15 @@ class RemoteMachineTrainingService implements TrainingService { public async cleanUp(): Promise { this.log.info('Stopping remote machine training service...'); this.stopping = true; - await Promise.race([delay(10000), 
this.cleanupConnections()]); + await this.cleanupConnections(); + } + + private async getExecutor(trialId: string): Promise { + const executorManager = this.trialExecutorManagerMap.get(trialId); + if (executorManager === undefined) { + throw new Error(`ExecutorManager is not assigned for trial ${trialId}`); + } + return await executorManager.getExecutor(trialId); } /** @@ -382,21 +389,19 @@ class RemoteMachineTrainingService implements TrainingService { */ private async cleanupConnections(): Promise { try { - for (const [rmMeta, executorManager] of this.machineExecutorManagerMap.entries()) { - const jobpidPath: string = unixPathJoin(this.getRemoteScriptsPath(rmMeta.username), 'pid'); - const executor: ShellExecutor | undefined = executorManager.getFirstExecutor(); + for (const executorManager of this.machineExecutorManagerMap.values()) { + const executor = await executorManager.getExecutor(this.initExecutorId); if (executor !== undefined) { - await executor.killChildProcesses(jobpidPath); - await executor.removeFolder(this.getRemoteScriptsPath(rmMeta.username)); + this.log.info(`killing gpu metric collector on ${executor.name}`); + const gpuJobPidPath: string = executor.joinPath(executor.getRemoteScriptsPath(getExperimentId()), 'pid'); + await executor.killChildProcesses(gpuJobPidPath); } - executorManager.closeAllExecutor(); + executorManager.releaseAllExecutor(); } } catch (error) { //ignore error, this function is called to cleanup remote connections when experiment is stopping - this.log.error(`Cleanup connection exception, error is ${error.message}`); + this.log.error(`Cleanup connection exception, error is ${error}`); } - - return Promise.resolve(); } private async setupConnections(machineList: string): Promise { @@ -408,10 +413,14 @@ class RemoteMachineTrainingService implements TrainingService { rmMetaList.forEach(async (rmMeta: RemoteMachineMeta) => { rmMeta.occupiedGpuIndexMap = new Map(); - const executorManager: ExecutorManager = new ExecutorManager([], 
this.MAX_TRIAL_NUMBER_PER_EXECUTOR, rmMeta); - const executor: ShellExecutor = await executorManager.getAvailableExecutor(); + const executorManager: ExecutorManager = new ExecutorManager(rmMeta); + this.log.info(`connecting to ${rmMeta.username}@${rmMeta.ip}:${rmMeta.port}`); + const executor: ShellExecutor = await executorManager.getExecutor(this.initExecutorId); + this.log.debug(`reached ${executor.name}`); this.machineExecutorManagerMap.set(rmMeta, executorManager); + this.log.debug(`initializing ${executor.name}`); await this.initRemoteMachineOnConnected(rmMeta, executor); + this.log.info(`connected to ${executor.name}`); if (++connectedRMNum === rmMetaList.length) { deferred.resolve(); } @@ -422,27 +431,36 @@ class RemoteMachineTrainingService implements TrainingService { private async initRemoteMachineOnConnected(rmMeta: RemoteMachineMeta, executor: ShellExecutor): Promise { // Create root working directory after executor is ready - const nniRootDir: string = unixPathJoin(getRemoteTmpDir(this.remoteOS), 'nni'); - await executor.createFolder(this.remoteExpRootDir); + const nniRootDir: string = executor.joinPath(executor.getTempPath(), 'nni'); + await executor.createFolder(executor.getRemoteExperimentRootDir(getExperimentId())); // the directory to store temp scripts in remote machine - const remoteGpuScriptCollectorDir: string = this.getRemoteScriptsPath(rmMeta.username); + const remoteGpuScriptCollectorDir: string = executor.getRemoteScriptsPath(getExperimentId()); + + // clean up previous result. 
await executor.createFolder(remoteGpuScriptCollectorDir, true); await executor.allowPermission(false, nniRootDir, `${nniRootDir}/*`, `${nniRootDir}/scripts/*`); //Begin to execute gpu_metrics_collection scripts - const script = getGpuMetricsCollectorBashScriptContent(remoteGpuScriptCollectorDir); + const script = executor.generateGpuStatsScript(getExperimentId()); executor.executeScript(script, false, true); + // the timer is trigger in 1 second, it causes multiple runs on server. + // So reduce it's freqeunce, only allow one of it run. + const collectingCount: boolean[] = []; const disposable: Rx.IDisposable = this.timer.subscribe( async () => { - const cmdresult = await executor.readLastLines(unixPathJoin(remoteGpuScriptCollectorDir, 'gpu_metrics')); - if (cmdresult !== "") { - rmMeta.gpuSummary = JSON.parse(cmdresult); - if (rmMeta.gpuSummary.gpuCount === 0) { - this.log.warning(`No GPU found on remote machine ${rmMeta.ip}`); - this.timer.unsubscribe(disposable); + if (collectingCount.length == 0) { + collectingCount.push(true); + const cmdresult = await executor.readLastLines(executor.joinPath(remoteGpuScriptCollectorDir, 'gpu_metrics')); + if (cmdresult !== "") { + rmMeta.gpuSummary = JSON.parse(cmdresult); + if (rmMeta.gpuSummary.gpuCount === 0) { + this.log.warning(`No GPU found on remote machine ${rmMeta.ip}`); + this.timer.unsubscribe(disposable); + } } + collectingCount.pop(); } } ); @@ -477,16 +495,23 @@ class RemoteMachineTrainingService implements TrainingService { } else if (rmScheduleResult.resultType === ScheduleResultType.SUCCEED && rmScheduleResult.scheduleInfo !== undefined) { const rmScheduleInfo: RemoteMachineScheduleInfo = rmScheduleResult.scheduleInfo; - const trialWorkingFolder: string = unixPathJoin(this.remoteExpRootDir, 'trials', trialJobId); trialJobDetail.rmMeta = rmScheduleInfo.rmMeta; + const copyExpCodeDirPromise = this.machineCopyExpCodeDirPromiseMap.get(trialJobDetail.rmMeta); + if (copyExpCodeDirPromise !== undefined) { + await 
copyExpCodeDirPromise; + } + + this.allocateExecutorManagerForTrial(trialJobDetail); + const executor = await this.getExecutor(trialJobDetail.id); + + trialJobDetail.workingDirectory = executor.joinPath(executor.getRemoteExperimentRootDir(getExperimentId()), 'trials', trialJobDetail.id); - await this.allocateExecutorForTrial(trialJobDetail); await this.launchTrialOnScheduledMachine( - trialJobId, trialWorkingFolder, trialJobDetail.form, rmScheduleInfo); + trialJobId, trialJobDetail.form, rmScheduleInfo); trialJobDetail.status = 'RUNNING'; - trialJobDetail.url = `file://${rmScheduleInfo.rmMeta.ip}:${trialWorkingFolder}`; + trialJobDetail.url = `file://${rmScheduleInfo.rmMeta.ip}:${trialJobDetail.workingDirectory}`; trialJobDetail.startTime = Date.now(); this.trialJobsMap.set(trialJobId, trialJobDetail); @@ -501,19 +526,13 @@ class RemoteMachineTrainingService implements TrainingService { return deferred.promise; } - private async launchTrialOnScheduledMachine(trialJobId: string, trialWorkingFolder: string, form: TrialJobApplicationForm, + private async launchTrialOnScheduledMachine(trialJobId: string, form: TrialJobApplicationForm, rmScheduleInfo: RemoteMachineScheduleInfo): Promise { if (this.trialConfig === undefined) { throw new Error('trial config is not initialized'); } const cudaVisibleDevice: string = rmScheduleInfo.cudaVisibleDevice; - const executor: ShellExecutor | undefined = this.trialExecutorMap.get(trialJobId); - if (executor === undefined) { - assert(false, 'ShellExecutor is undefined.'); - - // for lint - return; - } + const executor = await this.getExecutor(trialJobId); const trialJobDetail: RemoteMachineTrialJobDetail | undefined = this.trialJobsMap.get(trialJobId); if (trialJobDetail === undefined) { throw new Error(`Can not get trial job detail for job: ${trialJobId}`); @@ -521,23 +540,22 @@ class RemoteMachineTrainingService implements TrainingService { const trialLocalTempFolder: string = path.join(this.expRootDir, 'trials-local', trialJobId); 
- await executor.createFolder(trialWorkingFolder); - await executor.createFolder(unixPathJoin(trialWorkingFolder, '.nni')); + await executor.createFolder(executor.joinPath(trialJobDetail.workingDirectory, '.nni')); // RemoteMachineRunShellFormat is the run shell format string, // See definition in remoteMachineData.ts - let command: string; + let cudaVisible: string; // Set CUDA_VISIBLE_DEVICES environment variable based on cudaVisibleDevice // If no valid cudaVisibleDevice is defined, set CUDA_VISIBLE_DEVICES to empty string to hide GPU device // If gpuNum is undefined, will not set CUDA_VISIBLE_DEVICES in script if (this.trialConfig.gpuNum === undefined) { - command = this.trialConfig.command; + cudaVisible = "" } else { if (typeof cudaVisibleDevice === 'string' && cudaVisibleDevice.length > 0) { - command = `CUDA_VISIBLE_DEVICES=${cudaVisibleDevice} ${this.trialConfig.command}`; + cudaVisible = `CUDA_VISIBLE_DEVICES=${cudaVisibleDevice}`; } else { - command = `CUDA_VISIBLE_DEVICES=" " ${this.trialConfig.command}`; + cudaVisible = `CUDA_VISIBLE_DEVICES=" "`; } } const nniManagerIp: string = this.nniManagerIpConfig ? this.nniManagerIpConfig.nniManagerIp : getIPV4Address(); @@ -546,53 +564,36 @@ class RemoteMachineTrainingService implements TrainingService { this.remoteRestServerPort = restServer.clusterRestServerPort; } const version: string = this.versionCheck ? 
await getVersion() : ''; - const runScriptTrialContent: string = String.Format( - REMOTEMACHINE_TRIAL_COMMAND_FORMAT, - trialWorkingFolder, - trialWorkingFolder, + const runScriptTrialContent: string = executor.generateStartScript( + trialJobDetail.workingDirectory, trialJobId, getExperimentId(), trialJobDetail.form.sequenceId.toString(), this.isMultiPhase, - unixPathJoin(trialWorkingFolder, '.nni', 'jobpid'), - command, + this.trialConfig.command, nniManagerIp, this.remoteRestServerPort, version, - this.logCollection, - unixPathJoin(trialWorkingFolder, '.nni', 'code') - ); + this.logCollection, cudaVisible); //create tmp trial working folder locally. await execMkdir(path.join(trialLocalTempFolder, '.nni')); - //create tmp trial working folder locally. - await execCopydir(this.trialConfig.codeDir, trialLocalTempFolder); - const installScriptContent: string = CONTAINER_INSTALL_NNI_SHELL_FORMAT; - // Write NNI installation file to local tmp files - await fs.promises.writeFile(path.join(trialLocalTempFolder, 'install_nni.sh'), installScriptContent, { encoding: 'utf8' }); + // Write install_nni.sh, it's not used in Windows platform. 
+ await fs.promises.writeFile(path.join(trialLocalTempFolder, executor.getScriptName("install_nni")), CONTAINER_INSTALL_NNI_SHELL_FORMAT, { encoding: 'utf8' }); // Write file content ( run.sh and parameter.cfg ) to local tmp files - await fs.promises.writeFile(path.join(trialLocalTempFolder, 'run.sh'), runScriptTrialContent, { encoding: 'utf8' }); + await fs.promises.writeFile(path.join(trialLocalTempFolder, executor.getScriptName("run")), runScriptTrialContent, { encoding: 'utf8' }); await this.writeParameterFile(trialJobId, form.hyperParameters); // Copy files in codeDir to remote working directory - await executor.copyDirectoryToRemote(trialLocalTempFolder, trialWorkingFolder, this.remoteOS); + await executor.copyDirectoryToRemote(trialLocalTempFolder, trialJobDetail.workingDirectory); // Execute command in remote machine - executor.executeScript(unixPathJoin(trialWorkingFolder, 'run.sh'), true, true); - } - - private getRmMetaByHost(host: string): RemoteMachineMeta { - for (const rmMeta of this.machineExecutorManagerMap.keys()) { - if (rmMeta.ip === host) { - return rmMeta; - } - } - throw new Error(`Host not found: ${host}`); + executor.executeScript(executor.joinPath(trialJobDetail.workingDirectory, executor.getScriptName("run")), true, true); } private async updateTrialJobStatus(trialJob: RemoteMachineTrialJobDetail, executor: ShellExecutor): Promise { const deferred: Deferred = new Deferred(); - const jobpidPath: string = this.getJobPidPath(trialJob.id); - const trialReturnCodeFilePath: string = unixPathJoin(this.remoteExpRootDir, 'trials', trialJob.id, '.nni', 'code'); + const jobpidPath: string = this.getJobPidPath(executor, trialJob.id); + const trialReturnCodeFilePath: string = executor.joinPath(executor.getRemoteExperimentRootDir(getExperimentId()), 'trials', trialJob.id, '.nni', 'code'); /* eslint-disable require-atomic-updates */ try { const isAlive = await executor.isProcessAlive(jobpidPath); @@ -601,7 +602,7 @@ class RemoteMachineTrainingService 
implements TrainingService { const trialReturnCode: string = await executor.getRemoteFileContent(trialReturnCodeFilePath); this.log.debug(`trailjob ${trialJob.id} return code: ${trialReturnCode}`); const match: RegExpMatchArray | null = trialReturnCode.trim() - .match(/^(\d+)\s+(\d+)$/); + .match(/^-?(\d+)\s+(\d+)$/); if (match !== null) { const { 1: code, 2: timestamp } = match; // Update trial job's status based on result code @@ -616,13 +617,13 @@ class RemoteMachineTrainingService implements TrainingService { } } trialJob.endTime = parseInt(timestamp, 10); - this.releaseTrialExecutor(trialJob); + this.releaseTrialResource(trialJob); } this.log.debug(`trailJob status update: ${trialJob.id}, ${trialJob.status}`); } deferred.resolve(trialJob); } catch (error) { - this.log.error(`Update job status exception, error is ${error.message}`); + this.log.debug(`(Ignorable mostly)Update job status exception, error is ${error.message}`); if (error instanceof NNIError && error.name === NNIErrorNames.NOT_FOUND) { deferred.resolve(trialJob); } else { @@ -634,45 +635,30 @@ class RemoteMachineTrainingService implements TrainingService { return deferred.promise; } - private getRemoteScriptsPath(userName: string): string { - return unixPathJoin(getRemoteTmpDir(this.remoteOS), userName, 'nni', 'scripts'); - } - - private getHostJobRemoteDir(jobId: string): string { - return unixPathJoin(this.remoteExpRootDir, 'hostjobs', jobId); - } - - private getRemoteExperimentRootDir(): string { - return unixPathJoin(getRemoteTmpDir(this.remoteOS), 'nni', 'experiments', getExperimentId()); - } - public get MetricsEmitter(): EventEmitter { return this.metricsEmitter; } - private getJobPidPath(jobId: string): string { + private getJobPidPath(executor: ShellExecutor, jobId: string): string { const trialJobDetail: RemoteMachineTrialJobDetail | undefined = this.trialJobsMap.get(jobId); if (trialJobDetail === undefined) { throw new NNIError(NNIErrorNames.INVALID_JOB_DETAIL, `Invalid job detail 
information for trial job ${jobId}`); } - return unixPathJoin(trialJobDetail.workingDirectory, '.nni', 'jobpid'); + return executor.joinPath(trialJobDetail.workingDirectory, '.nni', 'jobpid'); } private async writeParameterFile(trialJobId: string, hyperParameters: HyperParameters): Promise { - const executor: ShellExecutor | undefined = this.trialExecutorMap.get(trialJobId); - if (executor === undefined) { - throw new Error('ShellExecutor is undefined.'); - } + const executor = await this.getExecutor(trialJobId); - const trialWorkingFolder: string = unixPathJoin(this.remoteExpRootDir, 'trials', trialJobId); + const trialWorkingFolder: string = executor.joinPath(executor.getRemoteExperimentRootDir(getExperimentId()), 'trials', trialJobId); const trialLocalTempFolder: string = path.join(this.expRootDir, 'trials-local', trialJobId); const fileName: string = generateParamFileName(hyperParameters); const localFilepath: string = path.join(trialLocalTempFolder, fileName); await fs.promises.writeFile(localFilepath, hyperParameters.value, { encoding: 'utf8' }); - await executor.copyFileToRemote(localFilepath, unixPathJoin(trialWorkingFolder, fileName)); + await executor.copyFileToRemote(localFilepath, executor.joinPath(trialWorkingFolder, fileName)); } } diff --git a/src/nni_manager/training_service/remote_machine/shellExecutor.ts b/src/nni_manager/training_service/remote_machine/shellExecutor.ts index 4b60bd963d..deb9d26b9f 100644 --- a/src/nni_manager/training_service/remote_machine/shellExecutor.ts +++ b/src/nni_manager/training_service/remote_machine/shellExecutor.ts @@ -4,27 +4,39 @@ 'use strict'; import * as assert from 'assert'; +import * as fs from 'fs'; import * as os from 'os'; import * as path from 'path'; -import * as fs from 'fs'; -import { Client, ClientChannel, SFTPWrapper, ConnectConfig } from 'ssh2'; -import { Deferred } from "ts-deferred"; -import { RemoteCommandResult, RemoteMachineMeta } from "./remoteMachineData"; +import { Client, ClientChannel, 
ConnectConfig, SFTPWrapper } from 'ssh2'; import * as stream from 'stream'; -import { OsCommands } from "./osCommands"; -import { LinuxCommands } from "./extends/linuxCommands"; +import { Deferred } from "ts-deferred"; import { getLogger, Logger } from '../../common/log'; -import { NNIError, NNIErrorNames } from '../../common/errors'; +import { uniqueString, randomInt } from '../../common/utils'; import { execRemove, tarAdd } from '../common/util'; -import { getRemoteTmpDir, uniqueString, unixPathJoin } from '../../common/utils'; +import { LinuxCommands } from "./extends/linuxCommands"; +import { WindowsCommands } from './extends/windowsCommands'; +import { OsCommands } from "./osCommands"; +import { RemoteCommandResult, RemoteMachineMeta } from "./remoteMachineData"; +import { NNIError, NNIErrorNames } from '../../common/errors'; class ShellExecutor { - private sshClient: Client = new Client(); - private osCommands: OsCommands | undefined; - private usedConnectionNumber: number = 0; //count the connection number of every client + public name: string = ""; - protected pathSpliter: string = '/'; - protected multiplePathSpliter: RegExp = new RegExp(`\\${this.pathSpliter}{2,}`); + private readonly lineBreaker = new RegExp(`[\r\n]+`); + private readonly maxUsageCount = 5; + + private osCommands: OsCommands | undefined; + private usedCount: number = 0; //count the connection number of every client + private readonly sshClient: Client; + private readonly log: Logger; + private tempPath: string = ""; + private isWindows: boolean = false; + private channelDefaultOutputs: string[] = []; + + constructor() { + this.log = getLogger(); + this.sshClient = new Client(); + } public async initialize(rmMeta: RemoteMachineMeta): Promise { const deferred: Deferred = new Deferred(); @@ -33,8 +45,9 @@ class ShellExecutor { host: rmMeta.ip, port: rmMeta.port, username: rmMeta.username, - tryKeyboard: true + tryKeyboard: true, }; + this.name = 
`${rmMeta.username}@${rmMeta.ip}:${rmMeta.port}`; if (rmMeta.passwd !== undefined) { connectConfig.password = rmMeta.passwd; } else if (rmMeta.sshKeyPath !== undefined) { @@ -49,20 +62,42 @@ class ShellExecutor { } else { deferred.reject(new Error(`No valid passwd or sshKeyPath is configed.`)); } + this.sshClient.on('ready', async () => { // check OS type: windows or else const result = await this.execute("ver"); if (result.exitCode == 0 && result.stdout.search("Windows") > -1) { - // not implement Windows commands yet. - throw new Error("not implement Windows commands yet."); + this.osCommands = new WindowsCommands(); + this.isWindows = true; + + // detect default output and trying to remove it under windows. + // Anaconda has this kind of output. + let defaultResult = await this.execute(""); + if (defaultResult.stdout !== "") { + deferred.reject(new Error(`The windows remote node shouldn't output welcome message, below content should be removed from the command window! \n` + + `${defaultResult.stdout}`)); + } + defaultResult = await this.execute("powershell -command \"\""); + if (defaultResult.stdout !== "") { + this.channelDefaultOutputs.push(defaultResult.stdout); + } + this.log.debug(`set channelDefaultOutput to "${this.channelDefaultOutputs}"`); + + // parse temp folder to expand possible environment variables. + const commandResult = await this.execute("echo %TEMP%"); + this.tempPath = commandResult.stdout.replace(this.lineBreaker, ""); } else { this.osCommands = new LinuxCommands(); + // it's not stable to get tmp path by Linux command, like "echo /tmp" or "ld -d /tmp". + // Sometime it returns empty back, so hard code tmp path here. 
+ this.tempPath = "/tmp"; } + deferred.resolve(); }).on('error', (err: Error) => { // SSH connection error, reject with error message deferred.reject(new Error(err.message)); - }).on("keyboard-interactive", (name, instructions, lang, prompts, finish) => { + }).on("keyboard-interactive", (_name, _instructions, _lang, _prompts, finish) => { finish([rmMeta.passwd]); }).connect(connectConfig); @@ -73,43 +108,108 @@ class ShellExecutor { this.sshClient.end(); } - public get getUsedConnectionNumber(): number { - return this.usedConnectionNumber; + public addUsage(): boolean { + let isAddedSuccess = false; + if (this.usedCount < this.maxUsageCount) { + this.usedCount++; + isAddedSuccess = true; + } + return isAddedSuccess; } - public addUsedConnectionNumber(): void { - this.usedConnectionNumber += 1; + public releaseUsage(): boolean { + let canBeReleased = false; + if (this.usedCount > 0) { + this.usedCount--; + } + if (this.usedCount == 0) { + canBeReleased = true; + } + return canBeReleased; } - public minusUsedConnectionNumber(): void { - this.usedConnectionNumber -= 1; + public getScriptName(mainName: string): string { + if (this.osCommands === undefined) { + throw new Error("osCommands must be initialized!"); + } + return `${mainName}.${this.osCommands.getScriptExt()}`; + } + + public generateStartScript(workingDirectory: string, trialJobId: string, experimentId: string, + trialSequenceId: string, isMultiPhase: boolean, + command: string, nniManagerAddress: string, nniManagerPort: number, + nniManagerVersion: string, logCollection: string, cudaVisibleSetting: string): string { + if (this.osCommands === undefined) { + throw new Error("osCommands must be initialized!"); + } + const jobIdFileName = this.joinPath(workingDirectory, '.nni', 'jobpid'); + const exitCodeFile = this.joinPath(workingDirectory, '.nni', 'code'); + const codeDir = this.getRemoteCodePath(experimentId); + + return this.osCommands.generateStartScript(workingDirectory, trialJobId, experimentId, + 
trialSequenceId, isMultiPhase, jobIdFileName, command, + nniManagerAddress, nniManagerPort, nniManagerVersion, + logCollection, exitCodeFile, codeDir, cudaVisibleSetting); + } + + public generateGpuStatsScript(experimentId: string): string { + if (this.osCommands === undefined) { + throw new Error("osCommands must be initialized!"); + } + return this.osCommands.generateGpuStatsScript(this.getRemoteScriptsPath(experimentId)); + } + + public getTempPath(): string { + if (this.tempPath === "") { + throw new Error("tempPath must be initialized!"); + } + return this.tempPath; + } + + public getRemoteScriptsPath(experimentId: string): string { + return this.joinPath(this.getRemoteExperimentRootDir(experimentId), 'scripts'); + } + + public getRemoteCodePath(experimentId: string): string { + return this.joinPath(this.getRemoteExperimentRootDir(experimentId), 'nni-code'); + } + + public getRemoteExperimentRootDir(experimentId: string): string { + return this.joinPath(this.tempPath, 'nni', 'experiments', experimentId); + } + + public joinPath(...paths: string[]): string { + if (!this.osCommands) { + throw new Error("osCommands must be initialized!"); + } + return this.osCommands.joinPath(...paths); } public async createFolder(folderName: string, sharedFolder: boolean = false): Promise { const commandText = this.osCommands && this.osCommands.createFolder(folderName, sharedFolder); const commandResult = await this.execute(commandText); - const result = commandResult.exitCode >= 0; + const result = commandResult.exitCode == 0; return result; } public async allowPermission(isRecursive: boolean = false, ...folders: string[]): Promise { const commandText = this.osCommands && this.osCommands.allowPermission(isRecursive, ...folders); const commandResult = await this.execute(commandText); - const result = commandResult.exitCode >= 0; + const result = commandResult.exitCode == 0; return result; } public async removeFolder(folderName: string, isRecursive: boolean = false, isForce: 
boolean = true): Promise { const commandText = this.osCommands && this.osCommands.removeFolder(folderName, isRecursive, isForce); const commandResult = await this.execute(commandText); - const result = commandResult.exitCode >= 0; + const result = commandResult.exitCode == 0; return result; } public async removeFiles(folderOrFileName: string, filePattern: string = ""): Promise { const commandText = this.osCommands && this.osCommands.removeFiles(folderOrFileName, filePattern); const commandResult = await this.execute(commandText); - const result = commandResult.exitCode >= 0; + const result = commandResult.exitCode == 0; return result; } @@ -142,10 +242,10 @@ class ShellExecutor { return commandResult.exitCode == 0; } - public async executeScript(script: string, isFile: boolean, isInteractive: boolean = false): Promise { + public async executeScript(script: string, isFile: boolean = false, isInteractive: boolean = false): Promise { const commandText = this.osCommands && this.osCommands.executeScript(script, isFile); const commandResult = await this.execute(commandText, undefined, isInteractive); - return commandResult.exitCode == 0; + return commandResult; } /** @@ -154,13 +254,13 @@ class ShellExecutor { * @param remoteFilePath the target path in remote machine */ public async copyFileToRemote(localFilePath: string, remoteFilePath: string): Promise { - const log: Logger = getLogger(); - log.debug(`copyFileToRemote: localFilePath: ${localFilePath}, remoteFilePath: ${remoteFilePath}`); + const commandIndex = randomInt(10000); + this.log.debug(`copyFileToRemote(${commandIndex}): localFilePath: ${localFilePath}, remoteFilePath: ${remoteFilePath}`); const deferred: Deferred = new Deferred(); this.sshClient.sftp((err: Error, sftp: SFTPWrapper) => { if (err !== undefined && err !== null) { - log.error(`copyFileToRemote: ${err.message}, ${localFilePath}, ${remoteFilePath}`); + this.log.error(`copyFileToRemote(${commandIndex}): ${err}`); deferred.reject(err); return; @@ 
-169,6 +269,7 @@ class ShellExecutor { sftp.fastPut(localFilePath, remoteFilePath, (fastPutErr: Error) => { sftp.end(); if (fastPutErr !== undefined && fastPutErr !== null) { + this.log.error(`copyFileToRemote(${commandIndex}) fastPutErr: ${fastPutErr}, ${localFilePath}, ${remoteFilePath}`); deferred.reject(fastPutErr); } else { deferred.resolve(true); @@ -183,13 +284,17 @@ class ShellExecutor { * Copy files and directories in local directory recursively to remote directory * @param localDirectory local diretory * @param remoteDirectory remote directory - * @param sshClient SSH client */ - public async copyDirectoryToRemote(localDirectory: string, remoteDirectory: string, remoteOS: string): Promise { + public async copyDirectoryToRemote(localDirectory: string, remoteDirectory: string): Promise { const tmpSuffix: string = uniqueString(5); const localTarPath: string = path.join(os.tmpdir(), `nni_tmp_local_${tmpSuffix}.tar.gz`); - const remoteTarPath: string = unixPathJoin(getRemoteTmpDir(remoteOS), `nni_tmp_remote_${tmpSuffix}.tar.gz`); + if (!this.osCommands) { + throw new Error("osCommands must be initialized!"); + } + const remoteTarPath: string = this.osCommands.joinPath(this.tempPath, `nni_tmp_remote_${tmpSuffix}.tar.gz`); + // Create remote directory + await this.createFolder(remoteDirectory); // Compress files in local directory to experiment root directory await tarAdd(localTarPath, localDirectory); // Copy the compressed file to remoteDirectory and delete it @@ -201,12 +306,13 @@ class ShellExecutor { } public async getRemoteFileContent(filePath: string): Promise { + const commandIndex = randomInt(10000); + this.log.debug(`getRemoteFileContent(${commandIndex}): filePath: ${filePath}`); const deferred: Deferred = new Deferred(); this.sshClient.sftp((err: Error, sftp: SFTPWrapper) => { if (err !== undefined && err !== null) { - getLogger() - .error(`getRemoteFileContent: ${err.message}`); - deferred.reject(new Error(`SFTP error: ${err.message}`)); + 
this.log.error(`getRemoteFileContent(${commandIndex}) sftp: ${err}`); + deferred.reject(new Error(`SFTP error: ${err}`)); return; } @@ -227,8 +333,7 @@ class ShellExecutor { deferred.resolve(dataBuffer); }); } catch (error) { - getLogger() - .error(`getRemoteFileContent: ${error.message}`); + this.log.error(`getRemoteFileContent(${commandIndex}): ${error.message}`); sftp.end(); deferred.reject(new Error(`SFTP error: ${error.message}`)); } @@ -238,16 +343,20 @@ class ShellExecutor { } private async execute(command: string | undefined, processOutput: ((input: RemoteCommandResult) => RemoteCommandResult) | undefined = undefined, useShell: boolean = false): Promise { - const log: Logger = getLogger(); - log.debug(`remoteExeCommand: command: [${command}]`); const deferred: Deferred = new Deferred(); let stdout: string = ''; let stderr: string = ''; let exitCode: number; + const commandIndex = randomInt(10000); + this.log.debug(`remoteExeCommand(${commandIndex}): [${command}]`); + + // Windows always uses shell, and it needs to disable to get it works. + useShell = useShell && !this.isWindows; + const callback = (err: Error, channel: ClientChannel): void => { if (err !== undefined && err !== null) { - log.error(`remoteExeCommand: ${err.message}`); + this.log.error(`remoteExeCommand(${commandIndex}): ${err.message}`); deferred.reject(err); return; } @@ -257,7 +366,23 @@ class ShellExecutor { }); channel.on('exit', (code: any) => { exitCode = code; - log.debug(`remoteExeCommand exit(${exitCode})\nstdout: ${stdout}\nstderr: ${stderr}`); + + // remove default output to get stdout correct. 
+ if (this.channelDefaultOutputs.length > 0) { + let modifiedStdout = stdout; + this.channelDefaultOutputs.forEach(defaultOutput => { + if (modifiedStdout.startsWith(defaultOutput)) { + if (modifiedStdout.length > defaultOutput.length) { + modifiedStdout = modifiedStdout.substr(defaultOutput.length); + } else if (modifiedStdout.length === defaultOutput.length) { + modifiedStdout = ""; + } + } + }); + stdout = modifiedStdout; + } + + this.log.debug(`remoteExeCommand(${commandIndex}) exit(${exitCode})\nstdout: ${stdout}\nstderr: ${stderr}`); let result = { stdout: stdout, stderr: stderr, @@ -269,7 +394,7 @@ class ShellExecutor { } deferred.resolve(result); }); - channel.stderr.on('data', function (data) { + channel.stderr.on('data', function (data: any) { stderr += data; }); diff --git a/src/nni_manager/training_service/remote_machine/test/linuxCommands.test.ts b/src/nni_manager/training_service/remote_machine/test/linuxCommands.test.ts index ffe89cbc4f..ee7d1904a9 100644 --- a/src/nni_manager/training_service/remote_machine/test/linuxCommands.test.ts +++ b/src/nni_manager/training_service/remote_machine/test/linuxCommands.test.ts @@ -8,7 +8,6 @@ import * as chaiAsPromised from 'chai-as-promised'; import * as component from '../../../common/component'; import { cleanupUnitTest, prepareUnitTest } from '../../../common/utils'; import { LinuxCommands } from '../extends/linuxCommands'; -// import { TrialConfigMetadataKey } from '../trialConfigMetadataKey'; describe('Unit Test for linuxCommands', () => { @@ -88,10 +87,6 @@ describe('Unit Test for linuxCommands', () => { )).to.equal(false); }) - it('killChildProcesses', async () => { - chai.expect(linuxCommands.killChildProcesses("test")).to.equal("pkill -P `cat 'test'`"); - }) - it('extractFile', async () => { chai.expect(linuxCommands.extractFile("test.tar", "testfolder")).to.equal("tar -oxzf 'test.tar' -C 'testfolder'"); }) diff --git a/src/nni_manager/training_service/remote_machine/test/shellExecutor.test.ts 
b/src/nni_manager/training_service/remote_machine/test/shellExecutor.test.ts index fb8b6bbf2b..4e9d9ffb68 100644 --- a/src/nni_manager/training_service/remote_machine/test/shellExecutor.test.ts +++ b/src/nni_manager/training_service/remote_machine/test/shellExecutor.test.ts @@ -8,29 +8,29 @@ import * as fs from 'fs'; import * as chai from 'chai'; import * as chaiAsPromised from 'chai-as-promised'; -import { Client } from 'ssh2'; import { ShellExecutor } from '../shellExecutor'; import { prepareUnitTest, cleanupUnitTest } from '../../../common/utils'; -const LOCALFILE: string = '/tmp/localSshclientUTData'; -const REMOTEFILE: string = '/tmp/remoteSshclientUTData'; -const REMOTEFOLDER: string = '/tmp/remoteSshclientUTFolder'; +const LOCALFILE: string = 'localSshUTData'; +const REMOTEFILE: string = 'remoteSshUTData'; +const REMOTEFOLDER: string = 'remoteSshUTFolder'; async function copyFile(executor: ShellExecutor): Promise { - await executor.copyFileToRemote(LOCALFILE, REMOTEFILE); + const remoteFullName = executor.joinPath(executor.getTempPath(), REMOTEFILE); + await executor.copyFileToRemote(LOCALFILE, remoteFullName); } async function copyFileToRemoteLoop(executor: ShellExecutor): Promise { - for (let i: number = 0; i < 10; i++) { - // console.log(i); - await executor.copyFileToRemote(LOCALFILE, REMOTEFILE); + const remoteFullName = executor.joinPath(executor.getTempPath(), REMOTEFILE); + for (let i: number = 0; i < 3; i++) { + await executor.copyFileToRemote(LOCALFILE, remoteFullName); } } async function getRemoteFileContentLoop(executor: ShellExecutor): Promise { - for (let i: number = 0; i < 10; i++) { - // console.log(i); - await executor.getRemoteFileContent(REMOTEFILE); + const remoteFullName = executor.joinPath(executor.getTempPath(), REMOTEFILE); + for (let i: number = 0; i < 3; i++) { + await executor.getRemoteFileContent(remoteFullName); } } @@ -41,14 +41,16 @@ describe('ShellExecutor test', () => { rmMeta = 
JSON.parse(fs.readFileSync('../../.vscode/rminfo.json', 'utf8')); console.log(rmMeta); } catch (err) { - console.log(`Please configure rminfo.json to enable remote machine test.${err}`); + console.log(`Please configure rminfo.json to enable remote machine test. ${err}`); skip = true; } before(async () => { chai.should(); chai.use(chaiAsPromised); - await cpp.exec(`echo '1234' > ${LOCALFILE}`); + if (!fs.existsSync(LOCALFILE)){ + await cpp.exec(`echo '1234' > ${LOCALFILE}`); + } prepareUnitTest(); }); @@ -61,26 +63,27 @@ describe('ShellExecutor test', () => { if (skip) { return; } - const shellExecutor: ShellExecutor = new ShellExecutor(); - await shellExecutor.initialize(rmMeta); - let result = await shellExecutor.createFolder(REMOTEFOLDER, false); + const executor: ShellExecutor = new ShellExecutor(); + await executor.initialize(rmMeta); + const remoteFullPath = executor.joinPath(executor.getTempPath(), REMOTEFOLDER); + let result = await executor.createFolder(remoteFullPath, false); chai.expect(result).eq(true); - result = await shellExecutor.removeFolder(REMOTEFOLDER); + const commandResult = await executor.executeScript("dir"); + chai.expect(commandResult.exitCode).eq(0); + result = await executor.removeFolder(remoteFullPath); chai.expect(result).eq(true); + await executor.close(); }); it('Test ShellExecutor', async () => { if (skip) { return; } - const shellExecutor: ShellExecutor = new ShellExecutor(); - await shellExecutor.initialize(rmMeta); - await copyFile(shellExecutor); - await Promise.all([ - copyFileToRemoteLoop(shellExecutor), - copyFileToRemoteLoop(shellExecutor), - copyFileToRemoteLoop(shellExecutor), - getRemoteFileContentLoop(shellExecutor) - ]); + const executor: ShellExecutor = new ShellExecutor(); + await executor.initialize(rmMeta); + await copyFile(executor); + await copyFileToRemoteLoop(executor); + await getRemoteFileContentLoop(executor); + await executor.close(); }); }); diff --git 
a/src/nni_manager/training_service/remote_machine/test/windowsCommands.test.ts b/src/nni_manager/training_service/remote_machine/test/windowsCommands.test.ts new file mode 100644 index 0000000000..2f2408697a --- /dev/null +++ b/src/nni_manager/training_service/remote_machine/test/windowsCommands.test.ts @@ -0,0 +1,102 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +'use strict'; + +import * as chai from 'chai'; +import * as chaiAsPromised from 'chai-as-promised'; +import * as component from '../../../common/component'; +import { cleanupUnitTest, prepareUnitTest } from '../../../common/utils'; +import { WindowsCommands } from '../extends/windowsCommands'; + + +describe('Unit Test for Windows Commands', () => { + + let windowsCommands: WindowsCommands + + before(() => { + chai.should(); + chai.use(chaiAsPromised); + prepareUnitTest(); + }); + + after(() => { + cleanupUnitTest(); + }); + + beforeEach(() => { + windowsCommands = component.get(WindowsCommands); + }); + + afterEach(() => { + }); + + it('joinPath', async () => { + chai.expect(windowsCommands.joinPath("/root/", "\\first")).to.equal("\\root\\first"); + chai.expect(windowsCommands.joinPath("root/", "first")).to.equal("root\\first"); + chai.expect(windowsCommands.joinPath("\\root/", "\\first")).to.equal("\\root\\first"); + chai.expect(windowsCommands.joinPath("\\root\\", "\\first")).to.equal("\\root\\first"); + chai.expect(windowsCommands.joinPath("\\root", "first")).to.equal("\\root\\first"); + chai.expect(windowsCommands.joinPath("\\root\\", "first")).to.equal("\\root\\first"); + chai.expect(windowsCommands.joinPath("root\\", "first")).to.equal("root\\first"); + chai.expect(windowsCommands.joinPath("root\\")).to.equal("root\\"); + chai.expect(windowsCommands.joinPath("root")).to.equal("root"); + chai.expect(windowsCommands.joinPath(".\\root")).to.equal(".\\root"); + chai.expect(windowsCommands.joinPath("")).to.equal("."); + 
chai.expect(windowsCommands.joinPath("..")).to.equal(".."); + }) + + it('createFolder', async () => { + chai.expect(windowsCommands.createFolder("test")).to.equal("mkdir \"test\""); + chai.expect(windowsCommands.createFolder("test", true)).to.equal("mkdir \"test\"\r\nICACLS \"test\" /grant \"Users\":F"); + }) + + it('allowPermission', async () => { + chai.expect(windowsCommands.allowPermission(true, "test", "test1")).to.equal("ICACLS \"test\" /grant \"Users\":F /T\r\nICACLS \"test1\" /grant \"Users\":F /T\r\n"); + chai.expect(windowsCommands.allowPermission(false, "test")).to.equal("ICACLS \"test\" /grant \"Users\":F\r\n"); + }) + + it('removeFolder', async () => { + chai.expect(windowsCommands.removeFolder("test")).to.equal("rmdir /q \"test\""); + chai.expect(windowsCommands.removeFolder("test", true)).to.equal("rmdir /s /q \"test\""); + chai.expect(windowsCommands.removeFolder("test", true, false)).to.equal("rmdir /s \"test\""); + chai.expect(windowsCommands.removeFolder("test", false, false)).to.equal("rmdir \"test\""); + chai.expect(windowsCommands.removeFolder("test", true, true)).to.equal("rmdir /s /q \"test\""); + }) + + it('removeFiles', async () => { + chai.expect(windowsCommands.removeFiles("test", "*.sh")).to.equal("del \"test\\*.sh\""); + chai.expect(windowsCommands.removeFiles("test", "")).to.equal("del \"test\""); + }) + + it('readLastLines', async () => { + chai.expect(windowsCommands.readLastLines("test", 3)).to.equal("powershell.exe Get-Content \"test\" -Tail 3"); + }) + + it('isProcessAlive', async () => { + chai.expect(windowsCommands.isProcessAliveCommand("test")).to.equal("powershell.exe Get-Process -Id (get-content \"test\") -ErrorAction SilentlyContinue"); + chai.expect(windowsCommands.isProcessAliveProcessOutput( + { + exitCode: 0, + stdout: "", + stderr: "" + } + )).to.equal(true); + chai.expect(windowsCommands.isProcessAliveProcessOutput( + { + exitCode: 10, + stdout: "", + stderr: "" + } + )).to.equal(false); + }) + + it('extractFile', 
async () => { + chai.expect(windowsCommands.extractFile("test.tar", "testfolder")).to.equal("tar -xf \"test.tar\" -C \"testfolder\""); + }) + + it('executeScript', async () => { + chai.expect(windowsCommands.executeScript("test.sh", true)).to.equal("test.sh"); + chai.expect(windowsCommands.executeScript("test script'\"", false)).to.equal("test script'\""); + }) +}); diff --git a/src/sdk/__init__.py b/src/sdk/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/src/sdk/pynni/__init__.py b/src/sdk/pynni/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/src/sdk/pynni/nni/__init__.py b/src/sdk/pynni/nni/__init__.py index a8cd78bbf7..c7236adc1c 100644 --- a/src/sdk/pynni/nni/__init__.py +++ b/src/sdk/pynni/nni/__init__.py @@ -1,6 +1,8 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. +__version__ = '999.0.0-developing' + from .env_vars import dispatcher_env_vars if dispatcher_env_vars.SDK_PROCESS != 'dispatcher': diff --git a/src/sdk/pynni/nni/compression/speedup/torch/compressor.py b/src/sdk/pynni/nni/compression/speedup/torch/compressor.py index 835c6f269a..084d5b8ea4 100644 --- a/src/sdk/pynni/nni/compression/speedup/torch/compressor.py +++ b/src/sdk/pynni/nni/compression/speedup/torch/compressor.py @@ -163,9 +163,11 @@ def speedup_model(self): first, do mask/shape inference, second, replace modules """ + training = self.bound_model.training _logger.info("start to speed up the model") _logger.info("infer module masks...") self.infer_modules_masks() _logger.info("replace compressed modules...") self.replace_compressed_modules() + self.bound_model.train(training) _logger.info("speedup done") diff --git a/src/sdk/pynni/nni/nas/tensorflow/base_mutator.py b/src/sdk/pynni/nni/nas/tensorflow/base_mutator.py new file mode 100644 index 0000000000..860680f199 --- /dev/null +++ b/src/sdk/pynni/nni/nas/tensorflow/base_mutator.py @@ -0,0 +1,73 @@ +# Copyright (c) Microsoft Corporation. 
+# Licensed under the MIT license. + +from tensorflow.keras import Model + +from .mutables import Mutable, MutableScope, InputChoice +from .utils import StructuredMutableTreeNode + + +class BaseMutator(Model): + def __init__(self, model): + super().__init__() + self.__dict__['model'] = model + self._structured_mutables = self._parse_search_space(self.model) + + def _parse_search_space(self, module, root=None, prefix='', memo=None, nested_detection=None): + if memo is None: + memo = set() + if root is None: + root = StructuredMutableTreeNode(None) + if module not in memo: + memo.add(module) + if isinstance(module, Mutable): + if nested_detection is not None: + raise RuntimeError('Cannot have nested search space. Error at {} in {}' + .format(module, nested_detection)) + module.name = prefix + module.set_mutator(self) + root = root.add_child(module) + if not isinstance(module, MutableScope): + nested_detection = module + if isinstance(module, InputChoice): + for k in module.choose_from: + if k != InputChoice.NO_KEY and k not in [m.key for m in memo if isinstance(m, Mutable)]: + raise RuntimeError('"{}" required by "{}" not found in keys that appeared before, and is not NO_KEY.' + .format(k, module.key)) + for submodule in module.layers: + if not isinstance(submodule, Model): + continue + submodule_prefix = prefix + ('.' 
if prefix else '') + submodule.name + self._parse_search_space(submodule, root, submodule_prefix, memo=memo, nested_detection=nested_detection) + return root + + @property + def mutables(self): + return self._structured_mutables + + def undedup_mutables(self): + return self._structured_mutables.traverse(deduplicate=False) + + def call(self, *inputs): + raise RuntimeError('Call is undefined for mutators.') + + def __setattr__(self, name, value): + if name == 'model': + raise AttributeError("Attribute `model` can be set at most once, and you shouldn't use `self.model = model` to " + "include your network, as it will include all parameters in model into the mutator.") + return super().__setattr__(name, value) + + def enter_mutable_scope(self, mutable_scope): + pass + + def exit_mutable_scope(self, mutable_scope): + pass + + def on_forward_layer_choice(self, mutable, *inputs): + raise NotImplementedError + + def on_forward_input_choice(self, mutable, tensor_list): + raise NotImplementedError + + def export(self): + raise NotImplementedError diff --git a/src/sdk/pynni/nni/nas/tensorflow/enas/__init__.py b/src/sdk/pynni/nni/nas/tensorflow/enas/__init__.py new file mode 100644 index 0000000000..d3372836eb --- /dev/null +++ b/src/sdk/pynni/nni/nas/tensorflow/enas/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +from .mutator import EnasMutator +from .trainer import EnasTrainer diff --git a/src/sdk/pynni/nni/nas/tensorflow/enas/mutator.py b/src/sdk/pynni/nni/nas/tensorflow/enas/mutator.py new file mode 100644 index 0000000000..de43195fa2 --- /dev/null +++ b/src/sdk/pynni/nni/nas/tensorflow/enas/mutator.py @@ -0,0 +1,160 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
+ +import tensorflow as tf +from tensorflow.keras.layers import Dense, Embedding, LSTMCell, RNN +from tensorflow.keras.losses import SparseCategoricalCrossentropy, Reduction + +from nni.nas.tensorflow.mutator import Mutator +from nni.nas.tensorflow.mutables import LayerChoice, InputChoice, MutableScope + + +class EnasMutator(Mutator): + def __init__(self, model, + lstm_size=64, + lstm_num_layers=1, + tanh_constant=1.5, + cell_exit_extra_step=False, + skip_target=0.4, + temperature=None, + branch_bias=0.25, + entropy_reduction='sum'): + super().__init__(model) + self.tanh_constant = tanh_constant + self.temperature = temperature + self.cell_exit_extra_step = cell_exit_extra_step + + cells = [LSTMCell(units=lstm_size, use_bias=False) for _ in range(lstm_num_layers)] + self.lstm = RNN(cells, stateful=True) + self.g_emb = tf.random.normal((1, 1, lstm_size)) * 0.1 + self.skip_targets = tf.constant([1.0 - skip_target, skip_target]) + + self.max_layer_choice = 0 + self.bias_dict = {} + for mutable in self.mutables: + if isinstance(mutable, LayerChoice): + if self.max_layer_choice == 0: + self.max_layer_choice = len(mutable) + assert self.max_layer_choice == len(mutable), \ + "ENAS mutator requires all layer choice have the same number of candidates." + if 'reduce' in mutable.key: + bias = [] + for choice in mutable.choices: + if 'conv' in str(type(choice)).lower(): + bias.append(branch_bias) + else: + bias.append(-branch_bias) + self.bias_dict[mutable.key] = tf.constant(bias) + + # exposed for trainer + self.sample_log_prob = 0 + self.sample_entropy = 0 + self.sample_skip_penalty = 0 + + # internal nn layers + self.embedding = Embedding(self.max_layer_choice + 1, lstm_size) + self.soft = Dense(self.max_layer_choice, use_bias=False) + self.attn_anchor = Dense(lstm_size, use_bias=False) + self.attn_query = Dense(lstm_size, use_bias=False) + self.v_attn = Dense(1, use_bias=False) + assert entropy_reduction in ['sum', 'mean'], 'Entropy reduction must be one of sum and mean.' 
+ self.entropy_reduction = tf.reduce_sum if entropy_reduction == 'sum' else tf.reduce_mean + self.cross_entropy_loss = SparseCategoricalCrossentropy(from_logits=True, reduction=Reduction.NONE) + + self._first_sample = True + + def sample_search(self): + self._initialize() + self._sample(self.mutables) + self._first_sample = False + return self._choices + + def sample_final(self): + return self.sample_search() + + def _sample(self, tree): + mutable = tree.mutable + if isinstance(mutable, LayerChoice) and mutable.key not in self._choices: + self._choices[mutable.key] = self._sample_layer_choice(mutable) + elif isinstance(mutable, InputChoice) and mutable.key not in self._choices: + self._choices[mutable.key] = self._sample_input_choice(mutable) + for child in tree.children: + self._sample(child) + if self.cell_exit_extra_step and isinstance(mutable, MutableScope) and mutable.key not in self._anchors_hid: + self._anchors_hid[mutable.key] = self.lstm(self._inputs, 1) + + def _initialize(self): + self._choices = {} + self._anchors_hid = {} + self._inputs = self.g_emb + # seems the `input_shape` parameter of RNN does not work + # workaround it by omitting `reset_states` for first run + if not self._first_sample: + self.lstm.reset_states() + self.sample_log_prob = 0 + self.sample_entropy = 0 + self.sample_skip_penalty = 0 + + def _sample_layer_choice(self, mutable): + logit = self.soft(self.lstm(self._inputs)) + if self.temperature is not None: + logit /= self.temperature + if self.tanh_constant is not None: + logit = self.tanh_constant * tf.tanh(logit) + if mutable.key in self.bias_dict: + logit += self.bias_dict[mutable.key] + softmax_logit = tf.math.log(tf.nn.softmax(logit, axis=-1)) + branch_id = tf.reshape(tf.random.categorical(softmax_logit, num_samples=1), [1]) + log_prob = self.cross_entropy_loss(branch_id, logit) + self.sample_log_prob += self.entropy_reduction(log_prob) + entropy = log_prob * tf.math.exp(-log_prob) + self.sample_entropy += 
self.entropy_reduction(entropy) + self._inputs = tf.reshape(self.embedding(branch_id), [1, 1, -1]) + mask = tf.one_hot(branch_id, self.max_layer_choice) + return tf.cast(tf.reshape(mask, [-1]), tf.bool) + + def _sample_input_choice(self, mutable): + query, anchors = [], [] + for label in mutable.choose_from: + if label not in self._anchors_hid: + self._anchors_hid[label] = self.lstm(self._inputs) + query.append(self.attn_anchor(self._anchors_hid[label])) + anchors.append(self._anchors_hid[label]) + query = tf.concat(query, axis=0) + query = tf.tanh(query + self.attn_query(anchors[-1])) + query = self.v_attn(query) + + if self.temperature is not None: + query /= self.temperature + if self.tanh_constant is not None: + query = self.tanh_constant * tf.tanh(query) + + if mutable.n_chosen is None: + logit = tf.concat([-query, query], axis=1) + softmax_logit = tf.math.log(tf.nn.softmax(logit, axis=-1)) + skip = tf.reshape(tf.random.categorical(softmax_logit, num_samples=1), [-1]) + skip_prob = tf.math.sigmoid(logit) + kl = tf.reduce_sum(skip_prob * tf.math.log(skip_prob / self.skip_targets)) + self.sample_skip_penalty += kl + log_prob = self.cross_entropy_loss(skip, logit) + + skip = tf.cast(skip, tf.float32) + inputs = tf.tensordot(skip, tf.concat(anchors, 0), 1) / (1. + tf.reduce_sum(skip)) + self._inputs = tf.reshape(inputs, [1, 1, -1]) + + else: + assert mutable.n_chosen == 1, "Input choice must select exactly one or any in ENAS." 
+ logit = tf.reshape(query, [1, -1]) + softmax_logit = tf.math.log(tf.nn.softmax(logit, axis=-1)) + index = tf.reshape(tf.random.categorical(softmax_logit, num_samples=1), [-1]) + skip = tf.reshape(tf.one_hot(index, mutable.n_candidates), [-1]) + # when the size is 1, tf does not accept tensor here, complaining the shape is wrong + # but using a numpy array seems fine + log_prob = self.cross_entropy_loss(logit, query.numpy()) + self._inputs = tf.reshape(anchors[index.numpy()[0]], [1, 1, -1]) + + self.sample_log_prob += self.entropy_reduction(log_prob) + entropy = log_prob * tf.exp(-log_prob) + self.sample_entropy += self.entropy_reduction(entropy) + assert len(skip) == mutable.n_candidates, (skip, mutable.n_candidates, mutable.n_chosen) + return tf.cast(skip, tf.bool) diff --git a/src/sdk/pynni/nni/nas/tensorflow/enas/trainer.py b/src/sdk/pynni/nni/nas/tensorflow/enas/trainer.py new file mode 100644 index 0000000000..2d0d3cdb5a --- /dev/null +++ b/src/sdk/pynni/nni/nas/tensorflow/enas/trainer.py @@ -0,0 +1,159 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
+ +import logging + +import tensorflow as tf +from tensorflow.data import Dataset +from tensorflow.keras.optimizers import Adam + +from nni.nas.tensorflow.utils import AverageMeterGroup, fill_zero_grads + +from .mutator import EnasMutator + +logger = logging.getLogger(__name__) + + +log_frequency = 100 +entropy_weight = 0.0001 +skip_weight = 0.8 +baseline_decay = 0.999 +child_steps = 500 +mutator_lr = 0.00035 +mutator_steps = 50 +mutator_steps_aggregate = 20 +aux_weight = 0.4 +test_arc_per_epoch = 1 + + +class EnasTrainer: + def __init__(self, model, loss, metrics, reward_function, optimizer, batch_size, num_epochs, + dataset_train, dataset_valid): + self.model = model + self.loss = loss + self.metrics = metrics + self.reward_function = reward_function + self.optimizer = optimizer + self.batch_size = batch_size + self.num_epochs = num_epochs + + x, y = dataset_train + split = int(len(x) * 0.9) + self.train_set = Dataset.from_tensor_slices((x[:split], y[:split])) + self.valid_set = Dataset.from_tensor_slices((x[split:], y[split:])) + self.test_set = Dataset.from_tensor_slices(dataset_valid) + + self.mutator = EnasMutator(model) + self.mutator_optim = Adam(learning_rate=mutator_lr) + + self.baseline = 0. + + + def train(self, validate=True): + for epoch in range(self.num_epochs): + logger.info("Epoch %d Training", epoch + 1) + self.train_one_epoch(epoch) + logger.info("Epoch %d Validating", epoch + 1) + self.validate_one_epoch(epoch) + + def validate(self): + self.validate_one_epoch(-1) + + + def train_one_epoch(self, epoch): + train_loader, valid_loader = self._create_train_loader() + + # Sample model and train + meters = AverageMeterGroup() + + for step in range(1, child_steps + 1): + x, y = next(train_loader) + self.mutator.reset() + + with tf.GradientTape() as tape: + logits = self.model(x, training=True) + if isinstance(logits, tuple): + logits, aux_logits = logits + aux_loss = self.loss(aux_logits, y) + else: + aux_loss = 0. 
+ metrics = self.metrics(y, logits) + loss = self.loss(y, logits) + aux_weight * aux_loss + + grads = tape.gradient(loss, self.model.trainable_weights) + grads = fill_zero_grads(grads, self.model.trainable_weights) + grads, _ = tf.clip_by_global_norm(grads, 5.0) + self.optimizer.apply_gradients(zip(grads, self.model.trainable_weights)) + + metrics['loss'] = tf.reduce_mean(loss).numpy() + meters.update(metrics) + + if log_frequency and step % log_frequency == 0: + logger.info("Model Epoch [%d/%d] Step [%d/%d] %s", epoch + 1, + self.num_epochs, step, child_steps, meters) + + # Train sampler (mutator) + meters = AverageMeterGroup() + for mutator_step in range(1, mutator_steps + 1): + grads_list = [] + for step in range(1, mutator_steps_aggregate + 1): + with tf.GradientTape() as tape: + x, y = next(valid_loader) + self.mutator.reset() + + logits = self.model(x, training=False) + metrics = self.metrics(y, logits) + reward = self.reward_function(y, logits) + entropy_weight * self.mutator.sample_entropy + self.baseline = self.baseline * baseline_decay + reward * (1 - baseline_decay) + loss = self.mutator.sample_log_prob * (reward - self.baseline) + loss += skip_weight * self.mutator.sample_skip_penalty + + meters.update({ + 'reward': reward, + 'loss': tf.reduce_mean(loss).numpy(), + 'ent': self.mutator.sample_entropy.numpy(), + 'log_prob': self.mutator.sample_log_prob.numpy(), + 'baseline': self.baseline, + 'skip': self.mutator.sample_skip_penalty, + }) + + cur_step = step + (mutator_step - 1) * mutator_steps_aggregate + if log_frequency and cur_step % log_frequency == 0: + logger.info("RL Epoch [%d/%d] Step [%d/%d] [%d/%d] %s", epoch + 1, self.num_epochs, + mutator_step, mutator_steps, step, mutator_steps_aggregate, + meters) + + grads = tape.gradient(loss, self.mutator.trainable_weights) + grads = fill_zero_grads(grads, self.mutator.trainable_weights) + grads_list.append(grads) + total_grads = [tf.math.add_n(weight_grads) for weight_grads in zip(*grads_list)] + 
total_grads, _ = tf.clip_by_global_norm(total_grads, 5.0) + self.mutator_optim.apply_gradients(zip(total_grads, self.mutator.trainable_weights)) + + def validate_one_epoch(self, epoch): + test_loader = self._create_validate_loader() + + for arc_id in range(test_arc_per_epoch): + meters = AverageMeterGroup() + for x, y in test_loader: + self.mutator.reset() + logits = self.model(x) + if isinstance(logits, tuple): + logits, _ = logits + metrics = self.metrics(logits, y) + loss = self.loss(y, logits) + metrics['loss'] = tf.reduce_mean(loss).numpy() + meters.update(metrics) + + logger.info("Test Epoch [%d/%d] Arc [%d/%d] Summary %s", + epoch + 1, self.num_epochs, arc_id + 1, test_arc_per_epoch, + meters.summary()) + + + def _create_train_loader(self): + train_set = self.train_set.shuffle(1000000).batch(self.batch_size) + test_set = self.test_set.shuffle(1000000).batch(self.batch_size) + return iter(train_set), iter(test_set) + + def _create_validate_loader(self): + return iter(self.test_set.shuffle(1000000).batch(self.batch_size)) diff --git a/src/sdk/pynni/nni/nas/tensorflow/mutables.py b/src/sdk/pynni/nni/nas/tensorflow/mutables.py new file mode 100644 index 0000000000..1665112732 --- /dev/null +++ b/src/sdk/pynni/nni/nas/tensorflow/mutables.py @@ -0,0 +1,136 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
+ +import logging + +from tensorflow.keras import Model + +from .utils import global_mutable_counting + + +_logger = logging.getLogger(__name__) + + +class Mutable(Model): + def __init__(self, key=None): + super().__init__() + if key is None: + self._key = '{}_{}'.format(type(self).__name__, global_mutable_counting()) + elif isinstance(key, str): + self._key = key + else: + self._key = str(key) + _logger.warning('Key "%s" is not string, converted to string.', key) + self.init_hook = None + self.forward_hook = None + + def __deepcopy__(self, memodict=None): + raise NotImplementedError("Deep copy doesn't work for mutables.") + + def __call__(self, *args, **kwargs): + self._check_built() + return super().__call__(*args, **kwargs) + + def set_mutator(self, mutator): + if 'mutator' in self.__dict__: + raise RuntimeError('`set_mutator is called more than once. ' + 'Did you parse the search space multiple times? ' + 'Or did you apply multiple fixed architectures?') + self.__dict__['mutator'] = mutator + + def call(self, *inputs): + raise NotImplementedError('Method `call` of Mutable must be overridden') + + @property + def key(self): + return self._key + + @property + def name(self): + return self._name if hasattr(self, '_name') else self._key + + @name.setter + def name(self, name): + self._name = name + + def _check_built(self): + if not hasattr(self, 'mutator'): + raise ValueError( + "Mutator not set for {}. You might have forgotten to initialize and apply your mutator. " + "Or did you initialize a mutable on the fly in forward pass? Move to `__init__` " + "so that trainer can locate all your mutables. 
See NNI docs for more details.".format(self)) + + def __repr__(self): + return '{} ({})'.format(self.name, self.key) + + +class MutableScope(Mutable): + def __call__(self, *args, **kwargs): + try: + self._check_built() + self.mutator.enter_mutable_scope(self) + return super().__call__(*args, **kwargs) + finally: + self.mutator.exit_mutable_scope(self) + + +class LayerChoice(Mutable): + def __init__(self, op_candidates, reduction='sum', return_mask=False, key=None): + super().__init__(key=key) + self.length = len(op_candidates) + self.choices = op_candidates + self.reduction = reduction + self.return_mask = return_mask + self._built = False + + def call(self, *inputs): + if not self._built: + for op in self.choices: + if len(inputs) > 1: # FIXME: not tested + op.build([inp.shape for inp in inputs]) + elif len(inputs) == 1: + op.build(inputs[0].shape) + self._built = True + out, mask = self.mutator.on_forward_layer_choice(self, *inputs) + if self.return_mask: + return out, mask + return out + + def __len__(self): + return len(self.choices) + + +class InputChoice(Mutable): + NO_KEY = '' + + def __init__(self, n_candidates=None, choose_from=None, n_chosen=None, reduction='sum', return_mask=False, key=None): + super().__init__(key=key) + assert n_candidates is not None or choose_from is not None, \ + 'At least one of `n_candidates` and `choose_from` must be not None.' + if choose_from is not None and n_candidates is None: + n_candidates = len(choose_from) + elif choose_from is None and n_candidates is not None: + choose_from = [self.NO_KEY] * n_candidates + assert n_candidates == len(choose_from), 'Number of candidates must be equal to the length of `choose_from`.' + assert n_candidates > 0, 'Number of candidates must be greater than 0.' + assert n_chosen is None or 0 <= n_chosen <= n_candidates, \ + 'Expected selected number must be None or no more than number of candidates.' 
+ + self.n_candidates = n_candidates + self.choose_from = choose_from.copy() + self.n_chosen = n_chosen + self.reduction = reduction + self.return_mask = return_mask + + def call(self, optional_inputs): + optional_input_list = optional_inputs + if isinstance(optional_inputs, dict): + optional_input_list = [optional_inputs[tag] for tag in self.choose_from] + assert isinstance(optional_input_list, list), \ + 'Optional input list must be a list, not a {}.'.format(type(optional_input_list)) + assert len(optional_inputs) == self.n_candidates, \ + 'Length of the input list must be equal to number of candidates.' + out, mask = self.mutator.on_forward_input_choice(self, optional_input_list) + if self.return_mask: + return out, mask + return out diff --git a/src/sdk/pynni/nni/nas/tensorflow/mutator.py b/src/sdk/pynni/nni/nas/tensorflow/mutator.py new file mode 100644 index 0000000000..20c57f9405 --- /dev/null +++ b/src/sdk/pynni/nni/nas/tensorflow/mutator.py @@ -0,0 +1,77 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
+ +import logging + +import tensorflow as tf + +from .base_mutator import BaseMutator + + +_logger = logging.getLogger(__name__) + + +class Mutator(BaseMutator): + def __init__(self, model): + super().__init__(model) + self._cache = {} + + def sample_search(self): + raise NotImplementedError('Method `sample_search` must be overridden') + + def sample_final(self): + raise NotImplementedError('Method `sample_final` must be overriden for exporting') + + def reset(self): + self._cache = self.sample_search() + + def export(self): + return self.sample_final() + + # TODO: status + # TODO: graph + + def on_forward_layer_choice(self, mutable, *inputs): + mask = self._get_decision(mutable) + assert len(mask) == len(mutable), \ + 'Invalid mask, expected {} to be of length {}.'.format(mask, len(mutable)) + out = self._select_with_mask(lambda choice: choice(*inputs), mutable.choices, mask) + return self._tensor_reduction(mutable.reduction, out), mask + + def on_forward_input_choice(self, mutable, tensor_list): + mask = self._get_decision(mutable) + assert len(mask) == mutable.n_candidates, \ + 'Invalid mask, expected {} to be of length {}.'.format(mask, mutable.n_candidates) + out = self._select_with_mask(lambda tensor: tensor, tensor_list, mask) + return self._tensor_reduction(mutable.reduction, out), mask + + def _select_with_mask(self, map_fn, candidates, mask): + if mask.dtype.is_bool: + out = [map_fn(cand) for cand, m in zip(candidates, mask) if m] + elif mask.dtype.is_floating: + out = [map_fn(cand) * m for cand, m in zip(candidates, mask) if m] + else: + raise ValueError('Unrecognized mask, dtype is {}'.format(mask.dtype.name)) + return out + + def _tensor_reduction(self, reduction_type, tensor_list): + if reduction_type == 'none': + return tensor_list + if not tensor_list: + return None + if len(tensor_list) == 1: + return tensor_list[0] + if reduction_type == 'sum': + return sum(tensor_list) + if reduction_type == 'mean': + return sum(tensor_list) / len(tensor_list) + 
if reduction_type == 'concat': + return tf.concat(tensor_list, axis=0) + raise ValueError('Unrecognized reduction policy: "{}'.format(reduction_type)) + + def _get_decision(self, mutable): + if mutable.key not in self._cache: + raise ValueError('"{}" not found in decision cache.'.format(mutable.key)) + result = self._cache[mutable.key] + _logger.debug('Decision %s: %s', mutable.key, result) + return result diff --git a/src/sdk/pynni/nni/nas/tensorflow/utils.py b/src/sdk/pynni/nni/nas/tensorflow/utils.py new file mode 100644 index 0000000000..0cfc6e815d --- /dev/null +++ b/src/sdk/pynni/nni/nas/tensorflow/utils.py @@ -0,0 +1,93 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +import tensorflow as tf + +_counter = 0 + +def global_mutable_counting(): + global _counter + _counter += 1 + return _counter + + +class AverageMeter: + def __init__(self, name): + self.name = name + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val): + self.val = val + self.sum += val + self.count += 1 + self.avg = self.sum / self.count + + def __str__(self): + return '{name} {val:4f} ({avg:4f})'.format(**self.__dict__) + + def summary(self): + return '{name}: {avg:4f}'.format(**self.__dict__) + + +class AverageMeterGroup: + def __init__(self): + self.meters = {} + + def update(self, data): + for k, v in data.items(): + if k not in self.meters: + self.meters[k] = AverageMeter(k) + self.meters[k].update(v) + + def __str__(self): + return ' '.join(str(v) for v in self.meters.values()) + + def summary(self): + return ' '.join(v.summary() for v in self.meters.values()) + + +class StructuredMutableTreeNode: + def __init__(self, mutable): + self.mutable = mutable + self.children = [] + + def add_child(self, mutable): + self.children.append(StructuredMutableTreeNode(mutable)) + return self.children[-1] + + def type(self): + return type(self.mutable) + + def __iter__(self): + return self.traverse() + + def traverse(self, order="pre", 
deduplicate=True, memo=None): + if memo is None: + memo = set() + assert order in ["pre", "post"] + if order == "pre": + if self.mutable is not None: + if not deduplicate or self.mutable.key not in memo: + memo.add(self.mutable.key) + yield self.mutable + for child in self.children: + for m in child.traverse(order=order, deduplicate=deduplicate, memo=memo): + yield m + if order == "post": + if self.mutable is not None: + if not deduplicate or self.mutable.key not in memo: + memo.add(self.mutable.key) + yield self.mutable + + +def fill_zero_grads(grads, weights): + ret = [] + for grad, weight in zip(grads, weights): + if grad is not None: + ret.append(grad) + else: + ret.append(tf.zeros_like(weight)) + return ret diff --git a/src/sdk/pynni/nni/ppo_tuner/distri.py b/src/sdk/pynni/nni/ppo_tuner/distri.py index 0c1626730d..9af2e1add9 100644 --- a/src/sdk/pynni/nni/ppo_tuner/distri.py +++ b/src/sdk/pynni/nni/ppo_tuner/distri.py @@ -61,7 +61,7 @@ def sample_placeholder(self, prepend_shape, name=None): class CategoricalPd(Pd): """ - Categorical prossibility distribution + Categorical probability distribution """ def __init__(self, logits, mask_npinf, nsteps, size, is_act_model): self.logits = logits diff --git a/src/sdk/pynni/nni/protocol.py b/src/sdk/pynni/nni/protocol.py index e7330f78cd..ca2d7069d2 100644 --- a/src/sdk/pynni/nni/protocol.py +++ b/src/sdk/pynni/nni/protocol.py @@ -43,8 +43,7 @@ def send(command, data): try: _lock.acquire() data = data.encode('utf8') - assert len(data) < 1000000, 'Command too long' - msg = b'%b%06d%b' % (command.value, len(data), data) + msg = b'%b%014d%b' % (command.value, len(data), data) logging.getLogger(__name__).debug('Sending command, data: [%s]', msg) _out_file.write(msg) _out_file.flush() @@ -56,9 +55,9 @@ def receive(): """Receive a command from Training Service. 
Returns a tuple of command (CommandType) and payload (str) """ - header = _in_file.read(8) + header = _in_file.read(16) logging.getLogger(__name__).debug('Received command, header: [%s]', header) - if header is None or len(header) < 8: + if header is None or len(header) < 16: # Pipe EOF encountered logging.getLogger(__name__).debug('Pipe EOF encountered') return None, None diff --git a/src/sdk/pynni/tests/test_model_speedup.py b/src/sdk/pynni/tests/test_model_speedup.py index 6b9417268b..31cb5712ce 100644 --- a/src/sdk/pynni/tests/test_model_speedup.py +++ b/src/sdk/pynni/tests/test_model_speedup.py @@ -10,9 +10,11 @@ from torchvision.models.resnet import resnet18 from unittest import TestCase, main -from nni.compression.torch import L1FilterPruner +from nni.compression.torch import L1FilterPruner, apply_compression_results from nni.compression.speedup.torch import ModelSpeedup +torch.manual_seed(0) + class BackboneModel1(nn.Module): def __init__(self): super().__init__() @@ -58,7 +60,10 @@ def forward(self, x): x = self.fc3(x) return x +dummy_input = torch.randn(2, 1, 28, 28) SPARSITY = 0.5 +MODEL_FILE, MASK_FILE = './11_model.pth', './l1_mask.pth' + def prune_model_l1(model): config_list = [{ 'sparsity': SPARSITY, @@ -66,14 +71,14 @@ def prune_model_l1(model): }] pruner = L1FilterPruner(model, config_list) pruner.compress() - pruner.export_model(model_path='./11_model.pth', mask_path='./l1_mask.pth') + pruner.export_model(model_path=MODEL_FILE, mask_path=MASK_FILE) class SpeedupTestCase(TestCase): def test_speedup_vgg16(self): prune_model_l1(vgg16()) model = vgg16() model.train() - ms = ModelSpeedup(model, torch.randn(2, 3, 32, 32), './l1_mask.pth') + ms = ModelSpeedup(model, torch.randn(2, 3, 32, 32), MASK_FILE) ms.speedup_model() orig_model = vgg16() @@ -88,20 +93,33 @@ def test_speedup_vgg16(self): def test_speedup_bigmodel(self): prune_model_l1(BigModel()) model = BigModel() + apply_compression_results(model, MASK_FILE, 'cpu') + model.eval() + mask_out = 
model(dummy_input) + model.train() - ms = ModelSpeedup(model, torch.randn(2, 1, 28, 28), './l1_mask.pth') + ms = ModelSpeedup(model, dummy_input, MASK_FILE) ms.speedup_model() + assert model.training + + model.eval() + speedup_out = model(dummy_input) + if not torch.allclose(mask_out, speedup_out, atol=1e-07): + print('input:', dummy_input.size(), torch.abs(dummy_input).sum((2,3))) + print('mask_out:', mask_out) + print('speedup_out:', speedup_out) + raise RuntimeError('model speedup inference result is incorrect!') orig_model = BigModel() - assert model.training + assert model.backbone2.conv1.out_channels == int(orig_model.backbone2.conv1.out_channels * SPARSITY) assert model.backbone2.conv2.in_channels == int(orig_model.backbone2.conv2.in_channels * SPARSITY) assert model.backbone2.conv2.out_channels == int(orig_model.backbone2.conv2.out_channels * SPARSITY) assert model.backbone2.fc1.in_features == int(orig_model.backbone2.fc1.in_features * SPARSITY) def tearDown(self): - os.remove('./11_model.pth') - os.remove('./l1_mask.pth') + os.remove(MODEL_FILE) + os.remove(MASK_FILE) if __name__ == '__main__': main() diff --git a/src/sdk/pynni/tests/test_protocol.py b/src/sdk/pynni/tests/test_protocol.py index cd59144624..5c9ee78eaf 100644 --- a/src/sdk/pynni/tests/test_protocol.py +++ b/src/sdk/pynni/tests/test_protocol.py @@ -20,30 +20,21 @@ class ProtocolTestCase(TestCase): def test_send_en(self): out_file = _prepare_send() send(CommandType.NewTrialJob, 'CONTENT') - self.assertEqual(out_file.getvalue(), b'TR000007CONTENT') + self.assertEqual(out_file.getvalue(), b'TR00000000000007CONTENT') def test_send_zh(self): out_file = _prepare_send() send(CommandType.NewTrialJob, '你好') - self.assertEqual(out_file.getvalue(), 'TR000006你好'.encode('utf8')) - - def test_send_too_large(self): - _prepare_send() - exception = None - try: - send(CommandType.NewTrialJob, ' ' * 1000000) - except AssertionError as e: - exception = e - self.assertIsNotNone(exception) + 
self.assertEqual(out_file.getvalue(), 'TR00000000000006你好'.encode('utf8')) def test_receive_en(self): - _prepare_receive(b'IN000005hello') + _prepare_receive(b'IN00000000000005hello') command, data = receive() self.assertIs(command, CommandType.Initialize) self.assertEqual(data, 'hello') def test_receive_zh(self): - _prepare_receive('IN000006世界'.encode('utf8')) + _prepare_receive('IN00000000000006世界'.encode('utf8')) command, data = receive() self.assertIs(command, CommandType.Initialize) self.assertEqual(data, '世界') diff --git a/src/webui/package.json b/src/webui/package.json index d7400e4f39..ca89200352 100644 --- a/src/webui/package.json +++ b/src/webui/package.json @@ -65,7 +65,6 @@ "@typescript-eslint/eslint-plugin": "^2.11.0", "@typescript-eslint/parser": "^2.11.0", "@uifabric/fluent-theme": "^0.16.7", - "npx": "^10.2.0", "eslint": "^5.16.0", "eslint-config-react-app": "^4.0.0", "eslint-loader": "2.1.2", @@ -74,6 +73,7 @@ "eslint-plugin-jsx-a11y": "6.2.1", "eslint-plugin-react": "7.12.4", "eslint-plugin-react-hooks": "^1.5.0", + "npx": "^10.2.0", "typescript": "3.4.5" }, "scripts": { diff --git a/src/webui/yarn.lock b/src/webui/yarn.lock index 3053a14da9..f6b3ed1cc8 100644 --- a/src/webui/yarn.lock +++ b/src/webui/yarn.lock @@ -5371,8 +5371,9 @@ kind-of@^5.0.0: resolved "https://registry.yarnpkg.com/kind-of/-/kind-of-5.1.0.tgz#729c91e2d857b7a419a1f9aa65685c4c33f5845d" kind-of@^6.0.0, kind-of@^6.0.2: - version "6.0.2" - resolved "https://registry.yarnpkg.com/kind-of/-/kind-of-6.0.2.tgz#01146b36a6218e64e58f3a8d66de5d7fc6f6d051" + version "6.0.3" + resolved "https://registry.yarnpkg.com/kind-of/-/kind-of-6.0.3.tgz#07c05034a6c349fa06e24fa35aa76db4580ce4dd" + integrity sha512-dcS1ul+9tmeD95T+x28/ehLgd9mENa3LsvDTtzm3vyBEO7RPptvAD+t44WVXaUjTBRcrpFeFlC8WCruUR456hw== last-call-webpack-plugin@^3.0.0: version "3.0.0" diff --git a/test/config/examples/mnist-annotation.yml b/test/config/examples/mnist-annotation.yml index 17d28684e6..330400570d 100644 --- 
a/test/config/examples/mnist-annotation.yml +++ b/test/config/examples/mnist-annotation.yml @@ -13,7 +13,6 @@ assessor: trial: codeDir: ../../../examples/trials/mnist-annotation command: python3 mnist.py --batch_num 10 - gpuNum: 0 useAnnotation: true multiPhase: false diff --git a/test/config/examples/mnist-keras.yml b/test/config/examples/mnist-keras.yml index 3743be301e..6bb9e0e999 100644 --- a/test/config/examples/mnist-keras.yml +++ b/test/config/examples/mnist-keras.yml @@ -14,7 +14,6 @@ assessor: trial: codeDir: ../../../examples/trials/mnist-keras command: python3 mnist-keras.py --num_train 200 --epochs 1 - gpuNum: 0 useAnnotation: false multiPhase: false diff --git a/test/config/examples/mnist-nested-search-space.yml b/test/config/examples/mnist-nested-search-space.yml index 9be7c8ef7e..89e51a180a 100644 --- a/test/config/examples/mnist-nested-search-space.yml +++ b/test/config/examples/mnist-nested-search-space.yml @@ -15,7 +15,6 @@ assessor: trial: codeDir: ../../../examples/trials/mnist-nested-search-space command: python3 mnist.py --batch_num 10 - gpuNum: 0 useAnnotation: false multiPhase: false diff --git a/test/config/examples/mnist-pytorch.yml b/test/config/examples/mnist-pytorch.yml index c62f0579d4..570d9de81f 100644 --- a/test/config/examples/mnist-pytorch.yml +++ b/test/config/examples/mnist-pytorch.yml @@ -14,7 +14,6 @@ assessor: trial: codeDir: ../../../examples/trials/mnist-pytorch command: python3 mnist.py --epochs 1 --batch_num 10 - gpuNum: 0 useAnnotation: false multiPhase: false diff --git a/test/config/examples/mnist-tfv1.yml b/test/config/examples/mnist-tfv1.yml index f66e288efc..f8393918ad 100644 --- a/test/config/examples/mnist-tfv1.yml +++ b/test/config/examples/mnist-tfv1.yml @@ -14,7 +14,6 @@ assessor: trial: codeDir: ../../../examples/trials/mnist-tfv1 command: python3 mnist.py --batch_num 10 - gpuNum: 0 useAnnotation: false multiPhase: false diff --git a/test/config/integration_tests.yml b/test/config/integration_tests.yml index 
b89e44285b..c6c5b44fa3 100644 --- a/test/config/integration_tests.yml +++ b/test/config/integration_tests.yml @@ -1,6 +1,6 @@ defaultTestCaseConfig: - launchCommand: nnictl create --config $configFile + launchCommand: nnictl create --config $configFile --debug stopCommand: nnictl stop experimentStatusCheck: True platform: linux darwin win32 @@ -22,7 +22,7 @@ testCases: validator: # launch command, default launch command is 'nnictl create --config $configFile' - launchCommand: nnictl create --config $configFile + launchCommand: nnictl create --config $configFile --debug # stop command, default stop command is 'nnictl stop', empty means no stop command stopCommand: nnictl stop @@ -38,15 +38,24 @@ testCases: - name: mnist-tfv1 configFile: test/config/examples/mnist-tfv1.yml + config: + maxTrialNum: 1 + trialConcurrency: 1 - name: mnist-keras configFile: test/config/examples/mnist-keras.yml + config: + maxTrialNum: 2 + trialConcurrency: 1 - name: mnist-pytorch configFile: test/config/examples/mnist-pytorch.yml - name: mnist-annotation configFile: test/config/examples/mnist-annotation.yml + config: + maxTrialNum: 1 + trialConcurrency: 1 - name: cifar10-pytorch configFile: test/config/examples/cifar10-pytorch.yml diff --git a/test/config/training_service.yml b/test/config/training_service.yml index 65f2676960..5ceb8b2c6e 100644 --- a/test/config/training_service.yml +++ b/test/config/training_service.yml @@ -78,12 +78,15 @@ paiYarn: pai: nniManagerIp: maxExecDuration: 15m + # PAI has job submission limitation, set maxTrialNum=1 to control trial job numbers for PAI + maxTrialNum: 1 + trialConcurrency: 1 paiConfig: host: userName: trainingServicePlatform: pai trial: - gpuNum: 1 + gpuNum: 1 cpuNum: 1 image: memoryMB: 8192 diff --git a/test/nni_test/nnitest/naive_test.py b/test/nni_test/nnitest/naive_test.py index cedd8b4ad4..b998686960 100644 --- a/test/nni_test/nnitest/naive_test.py +++ b/test/nni_test/nnitest/naive_test.py @@ -10,7 +10,7 @@ import time import traceback 
-from utils import is_experiment_done, get_experiment_id, get_nni_log_path, read_last_line, remove_files, setup_experiment, detect_port, snooze +from utils import is_experiment_done, get_experiment_id, get_nni_log_path, read_last_line, remove_files, setup_experiment, detect_port, wait_for_port_available from utils import GREEN, RED, CLEAR, EXPERIMENT_URL NNI_SOURCE_DIR = '..' @@ -71,7 +71,7 @@ def naive_test(args): assert assessor_result == expected, 'Bad assessor result' subprocess.run(['nnictl', 'stop']) - snooze() + wait_for_port_available(8080, 10) def stop_experiment_test(args): config_file = args.config @@ -86,19 +86,20 @@ def stop_experiment_test(args): experiment_id = get_experiment_id(EXPERIMENT_URL) proc = subprocess.run(['nnictl', 'stop', experiment_id]) assert proc.returncode == 0, '`nnictl stop %s` failed with code %d' % (experiment_id, proc.returncode) - snooze() + wait_for_port_available(8080, 10) assert not detect_port(8080), '`nnictl stop %s` failed to stop experiments' % experiment_id # test cmd `nnictl stop --port` proc = subprocess.run(['nnictl', 'stop', '--port', '8990']) assert proc.returncode == 0, '`nnictl stop %s` failed with code %d' % (experiment_id, proc.returncode) - snooze() + wait_for_port_available(8990, 10) assert not detect_port(8990), '`nnictl stop %s` failed to stop experiments' % experiment_id # test cmd `nnictl stop --all` proc = subprocess.run(['nnictl', 'stop', '--all']) assert proc.returncode == 0, '`nnictl stop --all` failed with code %d' % proc.returncode - snooze() + wait_for_port_available(8888, 10) + wait_for_port_available(8989, 10) assert not detect_port(8888) and not detect_port(8989), '`nnictl stop --all` failed to stop experiments' diff --git a/test/nni_test/nnitest/run_tests.py b/test/nni_test/nnitest/run_tests.py index 2b8c4c2e29..d817eb4465 100644 --- a/test/nni_test/nnitest/run_tests.py +++ b/test/nni_test/nnitest/run_tests.py @@ -15,7 +15,7 @@ from utils import get_experiment_status, get_yml_content, 
dump_yml_content, get_experiment_id, \ parse_max_duration_time, get_trial_stats, deep_update, print_trial_job_log, get_failed_trial_jobs, \ get_experiment_dir, print_experiment_log -from utils import GREEN, RED, CLEAR, STATUS_URL, TRIAL_JOBS_URL, EXPERIMENT_URL, REST_ENDPOINT, detect_port +from utils import GREEN, RED, CLEAR, STATUS_URL, TRIAL_JOBS_URL, EXPERIMENT_URL, REST_ENDPOINT, wait_for_port_available import validators it_variables = {} @@ -157,7 +157,7 @@ def launch_test(config_file, training_service, test_case_config): if num_failed > 0: print('failed jobs: ', num_failed) break - time.sleep(3) + time.sleep(1) except: print_experiment_log(experiment_id=experiment_id) raise @@ -168,6 +168,7 @@ def launch_test(config_file, training_service, test_case_config): trial_stats = get_trial_stats(TRIAL_JOBS_URL) print(json.dumps(trial_stats, indent=4), flush=True) if status != 'DONE' or trial_stats['SUCCEEDED'] + trial_stats['EARLY_STOPPED'] < max_trial_num: + print_experiment_log(experiment_id=experiment_id) print_trial_job_log(training_service, TRIAL_JOBS_URL) raise AssertionError('Failed to finish in maxExecDuration') @@ -188,16 +189,6 @@ def case_included(name, cases): return True return False -def wait_for_port_available(port, timeout): - begin_time = time.time() - while True: - if not detect_port(port): - return - if time.time() - begin_time > timeout: - msg = 'port {} is not available in {} seconds.'.format(port, timeout) - raise RuntimeError(msg) - time.sleep(5) - def match_platform(test_case_config): return sys.platform in test_case_config['platform'].split(' ') diff --git a/test/nni_test/nnitest/utils.py b/test/nni_test/nnitest/utils.py index e362b0de07..81031500c1 100644 --- a/test/nni_test/nnitest/utils.py +++ b/test/nni_test/nnitest/utils.py @@ -168,6 +168,13 @@ def detect_port(port): except: return False -def snooze(): - '''Sleep to make sure previous stopped exp has enough time to exit''' - time.sleep(6) + +def wait_for_port_available(port, timeout): + 
begin_time = time.time() + while True: + if not detect_port(port): + return + if time.time() - begin_time > timeout: + msg = 'port {} is not available in {} seconds.'.format(port, timeout) + raise RuntimeError(msg) + time.sleep(1) diff --git a/test/pipelines/pipelines-it-remote.yml b/test/pipelines/pipelines-it-remote-linux-to-linux.yml similarity index 98% rename from test/pipelines/pipelines-it-remote.yml rename to test/pipelines/pipelines-it-remote-linux-to-linux.yml index 4eab1cf650..78b3c9edaa 100644 --- a/test/pipelines/pipelines-it-remote.yml +++ b/test/pipelines/pipelines-it-remote-linux-to-linux.yml @@ -1,5 +1,5 @@ jobs: -- job: 'integration_test_remote' +- job: 'integration_test_remote_linux_to_linux' timeoutInMinutes: 120 steps: diff --git a/test/pipelines/pipelines-it-remote-linux-to-windows.yml b/test/pipelines/pipelines-it-remote-linux-to-windows.yml new file mode 100644 index 0000000000..613db986fc --- /dev/null +++ b/test/pipelines/pipelines-it-remote-linux-to-windows.yml @@ -0,0 +1,48 @@ +jobs: + - job: "integration_test_remote_linux_to_windows" + timeoutInMinutes: 120 + steps: + - script: make clean + displayName: "clean nni source code" + - task: CopyFilesOverSSH@0 + inputs: + sshEndpoint: $(end_point) + contents: | + ** + !**/dist/** + !**/node_modules/** + targetFolder: /tmp/nnitest/$(Build.BuildId) + overwrite: true + displayName: "Copy all files to remote machine" + timeoutInMinutes: 10 + - task: SSH@0 + inputs: + sshEndpoint: $(end_point) + runOptions: commands + commands: cd "\tmp\nnitest\$(Build.BuildId)" && powershell.exe -command "conda activate l2w | .\uninstall.ps1 | .\install.ps1" + failOnStdErr: false + displayName: "install on remote windows" + - script: python3 -m pip install --upgrade pip setuptools --user + displayName: "Install python tools" + - script: make easy-install + displayName: "Install nni via source code" + - script: | + sudo apt-get install swig -y + PATH=$HOME/.local/bin:$PATH nnictl package install --name=SMAC + 
PATH=$HOME/.local/bin:$PATH nnictl package install --name=BOHB + displayName: "Install dependencies for integration tests in remote mode" + - script: | + set -e + cd test + python3 nni_test/nnitest/generate_ts_config.py --ts remote --remote_user $(remote_user) --remote_host $(remote_host) \ + --remote_port $(remote_port) --remote_pwd $(remote_pwd) --nni_manager_ip $(nni_manager_ip) + cat config/training_service.yml + PATH=$HOME/.local/bin:$PATH python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts remote + displayName: "integration test" + - task: SSH@0 + inputs: + sshEndpoint: $(end_point) + runOptions: commands + commands: rmdir /s /q "\\?\c:\tmp\nnitest\$(Build.BuildId)" + condition: always() + displayName: "clean up on remote server" diff --git a/test/pipelines/pipelines-it-remote-windows.yml b/test/pipelines/pipelines-it-remote-windows-to-linux.yml similarity index 96% rename from test/pipelines/pipelines-it-remote-windows.yml rename to test/pipelines/pipelines-it-remote-windows-to-linux.yml index 63bcfb5c41..36a98a9819 100644 --- a/test/pipelines/pipelines-it-remote-windows.yml +++ b/test/pipelines/pipelines-it-remote-windows-to-linux.yml @@ -1,5 +1,5 @@ jobs: -- job: 'integration_test_remote_windows' +- job: 'integration_test_remote_windows_to_linux' timeoutInMinutes: 120 steps: @@ -23,6 +23,7 @@ jobs: sshEndpoint: $(end_point) runOptions: inline inline: cd /tmp/nnitest/$(Build.BuildId)/nni-remote/deployment/pypi;make build + failOnStdErr: false continueOnError: true displayName: 'build nni bdsit_wheel' - task: SSH@0 diff --git a/tools/nni_cmd/launcher.py b/tools/nni_cmd/launcher.py index 00067352ea..6ea6eb348f 100644 --- a/tools/nni_cmd/launcher.py +++ b/tools/nni_cmd/launcher.py @@ -139,7 +139,9 @@ def set_remote_config(experiment_config, port, config_file_name): for i in range(len(request_data['machine_list'])): if isinstance(request_data['machine_list'][i].get('gpuIndices'), int): request_data['machine_list'][i]['gpuIndices'] = 
str(request_data['machine_list'][i].get('gpuIndices')) - response = rest_put(cluster_metadata_url(port), json.dumps(request_data), REST_TIME_OUT) + # It needs to connect all remote machines, the time out of connection is 30 seconds. + # So timeout of this place should be longer. + response = rest_put(cluster_metadata_url(port), json.dumps(request_data), 60, True) err_message = '' if not response or not check_response(response): if response is not None: diff --git a/tools/nni_cmd/nnictl_utils.py b/tools/nni_cmd/nnictl_utils.py index df3922a801..7bccc1085a 100644 --- a/tools/nni_cmd/nnictl_utils.py +++ b/tools/nni_cmd/nnictl_utils.py @@ -227,7 +227,7 @@ def stop_experiment(args): experiment_config = Experiments() experiment_dict = experiment_config.get_all_experiments() for experiment_id in experiment_id_list: - print_normal('Stoping experiment %s' % experiment_id) + print_normal('Stopping experiment %s' % experiment_id) nni_config = Config(experiment_dict[experiment_id]['fileName']) rest_pid = nni_config.get_config('restServerPid') if rest_pid: diff --git a/tools/nni_trial_tool/constants.py b/tools/nni_trial_tool/constants.py index 69d0449036..bef4013370 100644 --- a/tools/nni_trial_tool/constants.py +++ b/tools/nni_trial_tool/constants.py @@ -7,8 +7,6 @@ BASE_URL = 'http://{}' -HOME_DIR = os.path.join(os.environ['HOME'], 'nni') - LOG_DIR = os.environ['NNI_OUTPUT_DIR'] NNI_PLATFORM = os.environ['NNI_PLATFORM'] diff --git a/tools/nni_trial_tool/trial_keeper.py b/tools/nni_trial_tool/trial_keeper.py index 6da3a7bfc2..10ee7af211 100644 --- a/tools/nni_trial_tool/trial_keeper.py +++ b/tools/nni_trial_tool/trial_keeper.py @@ -2,23 +2,27 @@ # Licensed under the MIT license. 
import argparse -import os -from subprocess import Popen -import time +import ctypes +import json import logging -import shlex +import os import re +import shlex import sys -import json import threading -from pyhdfs import HdfsClient +import time +from subprocess import Popen + import pkg_resources -from .rest_utils import rest_post, rest_get -from .url_utils import gen_send_version_url, gen_parameter_meta_url +from pyhdfs import HdfsClient -from .constants import LOG_DIR, NNI_PLATFORM, MULTI_PHASE, NNI_TRIAL_JOB_ID, NNI_SYS_DIR, NNI_EXP_ID -from .hdfsClientUtility import copyDirectoryToHdfs, copyHdfsDirectoryToLocal, copyHdfsFileToLocal -from .log_utils import LogType, nni_log, RemoteLogger, StdOutputType +from .constants import (LOG_DIR, MULTI_PHASE, NNI_EXP_ID, NNI_PLATFORM, + NNI_SYS_DIR, NNI_TRIAL_JOB_ID) +from .hdfsClientUtility import (copyDirectoryToHdfs, copyHdfsDirectoryToLocal, + copyHdfsFileToLocal) +from .log_utils import LogType, RemoteLogger, StdOutputType, nni_log +from .rest_utils import rest_get, rest_post +from .url_utils import gen_parameter_meta_url, gen_send_version_url logger = logging.getLogger('trial_keeper') regular = re.compile('v?(?P[0-9](\.[0-9]){0,1}).*') @@ -80,6 +84,10 @@ def main_loop(args): if hdfs_client is not None: copyHdfsDirectoryToLocal(args.nni_hdfs_exp_dir, os.getcwd(), hdfs_client) + if args.job_id_file: + with open(args.job_id_file, 'w') as job_file: + job_file.write("%d" % os.getpid()) + # Notice: We don't appoint env, which means subprocess wil inherit current environment and that is expected behavior log_pipe_stdout = trial_syslogger_stdout.get_pipelog_reader() process = Popen(args.trial_command, shell=True, stdout=log_pipe_stdout, stderr=log_pipe_stdout) @@ -91,6 +99,9 @@ def main_loop(args): retCode = process.poll() # child worker process exits and all stdout data is read if retCode is not None and log_pipe_stdout.set_process_exit() and log_pipe_stdout.is_read_completed == True: + # In Windows, the retCode -1 is 
4294967295. It's larger than c_long, and raise OverflowError. + # So covert it to int32. + retCode = ctypes.c_long(retCode).value nni_log(LogType.Info, 'subprocess terminated. Exit code is {}. Quit'.format(retCode)) if hdfs_output_dir is not None: # Copy local directory to hdfs for OpenPAI @@ -218,6 +229,7 @@ def run(self): PARSER.add_argument('--webhdfs_path', type=str, help='the webhdfs path used in webhdfs URL') PARSER.add_argument('--nni_manager_version', type=str, help='the nni version transmitted from nniManager') PARSER.add_argument('--log_collection', type=str, help='set the way to collect log in trialkeeper') + PARSER.add_argument('--job_id_file', type=str, help='set job id file for operating and monitoring job.') args, unknown = PARSER.parse_known_args() if args.trial_command is None: exit(1) diff --git a/uninstall.ps1 b/uninstall.ps1 index 0da8f2d7f6..29f8e23483 100644 --- a/uninstall.ps1 +++ b/uninstall.ps1 @@ -4,12 +4,12 @@ $env:PYTHONIOENCODING = "UTF-8" if($env:VIRTUAL_ENV){ $NNI_PYTHON3 = $env:VIRTUAL_ENV + "\Scripts" $NNI_PKG_FOLDER = $env:VIRTUAL_ENV + "\nni" - Remove-Item "$NNI_PYTHON3\node.exe" -Force + cmd /c del "$NNI_PYTHON3\node.exe" } else{ $NNI_PYTHON3 = $(python -c 'import site; from pathlib import Path; print(Path(site.getsitepackages()[0]))') $NNI_PKG_FOLDER = $NNI_PYTHON3 + "\nni" - Remove-Item "$NNI_PYTHON3\Scripts\node.exe" -Force + cmd /c del "$NNI_PYTHON3\Scripts\node.exe" } $PIP_UNINSTALL = """$NNI_PYTHON3\python"" -m pip uninstall -y " @@ -17,13 +17,16 @@ $NNI_NODE_FOLDER = $NNI_DEPENDENCY_FOLDER+"\nni-node" $NNI_YARN_FOLDER = $NNI_DEPENDENCY_FOLDER+"\nni-yarn" # uninstall -Remove-Item $NNI_PKG_FOLDER -Recurse -Force -cmd /C $PIP_UNINSTALL "nni" +cmd /c rmdir /s /q $NNI_PKG_FOLDER +cmd /c $PIP_UNINSTALL "nni" -# clean -Remove-Item "src/nni_manager/dist" -Recurse -Force -Remove-Item "src/nni_manager/node_modules" -Recurse -Force -Remove-Item "src/webui/build" -Recurse -Force -Remove-Item "src/webui/node_modules" -Recurse -Force 
-Remove-Item $NNI_YARN_FOLDER -Recurse -Force -Remove-Item $NNI_NODE_FOLDER -Recurse -Force +# clean up +cmd /c rmdir /s /q "build" +cmd /c rmdir /s /q "src\nni_manager\dist" +cmd /c rmdir /s /q "src\nni_manager\node_modules" +cmd /c rmdir /s /q "src\webui\build" +cmd /c rmdir /s /q "src\webui\node_modules" +cmd /c rmdir /s /q "src\nasui\build" +cmd /c rmdir /s /q "src\nasui\node_modules" +cmd /c rmdir /s /q $NNI_YARN_FOLDER +cmd /c rmdir /s /q $NNI_NODE_FOLDER