From 358bdb1837d92126362a23c6414e7bf5f07d338b Mon Sep 17 00:00:00 2001 From: QuanluZhang Date: Mon, 4 Nov 2019 12:24:00 +0800 Subject: [PATCH 01/11] Dev compress refactor (#1690) * refactor --- docs/en_US/Compressor/AutoCompression.md | 10 +- docs/en_US/Compressor/Overview.md | 85 +++++---- docs/en_US/Compressor/Pruner.md | 16 +- docs/en_US/Compressor/Quantizer.md | 20 +- examples/model_compress/main_tf_pruner.py | 6 +- examples/model_compress/main_tf_quantizer.py | 4 +- examples/model_compress/main_torch_pruner.py | 4 +- .../model_compress/main_torch_quantizer.py | 4 +- .../compression/tensorflow/builtin_pruners.py | 16 +- .../tensorflow/builtin_quantizers.py | 12 +- .../nni/compression/tensorflow/compressor.py | 173 ++++++++++++------ .../nni/compression/torch/builtin_pruners.py | 16 +- .../compression/torch/builtin_quantizers.py | 12 +- .../pynni/nni/compression/torch/compressor.py | 135 ++++++++++---- src/sdk/pynni/tests/test_compressor.py | 8 +- 15 files changed, 327 insertions(+), 194 deletions(-) diff --git a/docs/en_US/Compressor/AutoCompression.md b/docs/en_US/Compressor/AutoCompression.md index a77a5aad0d..013240167a 100644 --- a/docs/en_US/Compressor/AutoCompression.md +++ b/docs/en_US/Compressor/AutoCompression.md @@ -9,13 +9,13 @@ You can easily compress a model with NNI compression. Take pruning for example, ```python from nni.compression.torch import LevelPruner config_list = [{ 'sparsity': 0.8, 'op_types': ['default'] }] -pruner = LevelPruner(config_list) -pruner(model) +pruner = LevelPruner(model, config_list) +pruner.compress() ``` The 'default' op_type stands for the module types defined in [default_layers.py](https://github.com/microsoft/nni/blob/master/src/sdk/pynni/nni/compression/torch/default_layers.py) for pytorch. -Therefore ```{ 'sparsity': 0.8, 'op_types': ['default'] }```means that **all layers with specified op_types will be compressed with the same 0.8 sparsity**. 
When ```pruner(model)``` called, the model is compressed with masks and after that you can normally fine tune this model and **pruned weights won't be updated** which have been masked. +Therefore ```{ 'sparsity': 0.8, 'op_types': ['default'] }```means that **all layers with specified op_types will be compressed with the same 0.8 sparsity**. When ```pruner.compress()``` called, the model is compressed with masks and after that you can normally fine tune this model and **pruned weights won't be updated** which have been masked. ## Then, make this automatic @@ -84,9 +84,9 @@ config_list_agp = [{'initial_sparsity': 0, 'final_sparsity': conv0_sparsity, {'initial_sparsity': 0, 'final_sparsity': conv1_sparsity, 'start_epoch': 0, 'end_epoch': 3, 'frequency': 1,'op_name': 'conv1' },] -PRUNERS = {'level':LevelPruner(config_list_level),'agp':AGP_Pruner(config_list_agp)} +PRUNERS = {'level':LevelPruner(model, config_list_level),'agp':AGP_Pruner(model, config_list_agp)} pruner = PRUNERS(params['prune_method']['_name']) -pruner(model) +pruner.compress() ... # fine tuning acc = evaluate(model) # evaluation nni.report_final_results(acc) diff --git a/docs/en_US/Compressor/Overview.md b/docs/en_US/Compressor/Overview.md index 6b8eef8635..7ee603e3e3 100644 --- a/docs/en_US/Compressor/Overview.md +++ b/docs/en_US/Compressor/Overview.md @@ -25,8 +25,8 @@ Tensorflow code ```python from nni.compression.tensorflow import LevelPruner config_list = [{ 'sparsity': 0.8, 'op_types': ['default'] }] -pruner = LevelPruner(config_list) -pruner(tf.get_default_graph()) +pruner = LevelPruner(tf.get_default_graph(), config_list) +pruner.compress() ``` PyTorch code @@ -34,13 +34,13 @@ PyTorch code ```python from nni.compression.torch import LevelPruner config_list = [{ 'sparsity': 0.8, 'op_types': ['default'] }] -pruner = LevelPruner(config_list) -pruner(model) +pruner = LevelPruner(model, config_list) +pruner.compress() ``` You can use other compression algorithms in the package of `nni.compression`. 
The algorithms are implemented in both PyTorch and Tensorflow, under `nni.compression.torch` and `nni.compression.tensorflow` respectively. You can refer to [Pruner](./Pruner.md) and [Quantizer](./Quantizer.md) for detail description of supported algorithms. -The function call `pruner(model)` receives user defined model (in Tensorflow the model can be obtained with `tf.get_default_graph()`, while in PyTorch the model is the defined model class), and the model is modified with masks inserted. Then when you run the model, the masks take effect. The masks can be adjusted at runtime by the algorithms. +The function call `pruner.compress()` modifies the user-defined model (in Tensorflow the model can be obtained with `tf.get_default_graph()`, while in PyTorch the model is the defined model class) by inserting masks into it. Then when you run the model, the masks take effect. The masks can be adjusted at runtime by the algorithms. When instantiate a compression algorithm, there is `config_list` passed in. We describe how to write this config below. 
@@ -111,20 +111,26 @@ If you want to write a new pruning algorithm, you can write a class that inherit # nni.compression.tensorflow.Pruner with # nni.compression.torch.Pruner class YourPruner(nni.compression.tensorflow.Pruner): - def __init__(self, config_list): - # suggest you to use the NNI defined spec for config - super().__init__(config_list) - - def bind_model(self, model): - # this func can be used to remember the model or its weights - # in member variables, for getting their values during training - pass - - def calc_mask(self, weight, config, **kwargs): - # weight is the target weight tensor - # config is the selected dict object in config_list for this layer - # kwargs contains op, op_types, and op_name - # design your mask and return your mask + def __init__(self, model, config_list): + """ + Suggest you to use the NNI defined spec for config + """ + super().__init__(model, config_list) + + def calc_mask(self, layer, config): + """ + Pruners should overload this method to provide mask for weight tensors. + The mask must have the same shape and type comparing to the weight. + It will be applied with ``mul()`` operation on the weight. + This method is effectively hooked to ``forward()`` method of the model. + + Parameters + ---------- + layer: LayerInfo + calculate mask for ``layer``'s weight + config: dict + the configuration for generating the mask + """ return your_mask # note for pytorch version, there is no sess in input arguments @@ -133,16 +139,18 @@ class YourPruner(nni.compression.tensorflow.Pruner): # note for pytorch version, there is no sess in input arguments def step(self, sess): - # can do some processing based on the model or weights binded - # in the func bind_model + """ + Can do some processing based on the model or weights binded + in the func bind_model + """ pass ``` -For the simplest algorithm, you only need to override `calc_mask`. It receives each layer's weight and selected configuration, as well as op information. 
You generate the mask for this weight in this function and return. Then NNI applies the mask for you. +For the simplest algorithm, you only need to override ``calc_mask``. It receives the to-be-compressed layers one by one along with their compression configuration. You generate the mask for this weight in this function and return. Then NNI applies the mask for you. -Some algorithms generate mask based on training progress, i.e., epoch number. We provide `update_epoch` for the pruner to be aware of the training progress. +Some algorithms generate mask based on training progress, i.e., epoch number. We provide `update_epoch` for the pruner to be aware of the training progress. It should be called at the beginning of each epoch. -Some algorithms may want global information for generating masks, for example, all weights of the model (for statistic information), model optimizer's information. NNI supports this requirement using `bind_model`. `bind_model` receives the complete model, thus, it could record any information (e.g., reference to weights) it cares about. Then `step` can process or update the information according to the algorithm. You can refer to [source code of built-in algorithms](https://github.com/microsoft/nni/tree/master/src/sdk/pynni/nni/compressors) for example implementations. +Some algorithms may want global information for generating masks, for example, all weights of the model (for statistic information). You can use `self.bound_model` in the Pruner class for accessing weights. If you also need optimizer's information (for example in PyTorch), you could override `__init__` to receive more arguments such as model's optimizer. Then `step` can process or update the information according to the algorithm. You can refer to [source code of built-in algorithms](https://github.com/microsoft/nni/tree/master/src/sdk/pynni/nni/compressors) for example implementations. 
### Quantization algorithm @@ -154,20 +162,19 @@ The interface for customizing quantization algorithm is similar to that of pruni # nni.compression.tensorflow.Quantizer with # nni.compression.torch.Quantizer class YourQuantizer(nni.compression.tensorflow.Quantizer): - def __init__(self, config_list): - # suggest you to use the NNI defined spec for config - super().__init__(config_list) - - def bind_model(self, model): - # this func can be used to remember the model or its weights - # in member variables, for getting their values during training - pass + def __init__(self, model, config_list): + """ + Suggest you to use the NNI defined spec for config + """ + super().__init__(model, config_list) def quantize_weight(self, weight, config, **kwargs): - # weight is the target weight tensor - # config is the selected dict object in config_list for this layer - # kwargs contains op, op_types, and op_name - # design your quantizer and return new weight + """ + weight is the target weight tensor + config is the selected dict object in config_list for this layer + kwargs contains op, op_types, and op_name + design your quantizer and return new weight + """ return new_weight # note for pytorch version, there is no sess in input arguments @@ -176,8 +183,10 @@ class YourQuantizer(nni.compression.tensorflow.Quantizer): # note for pytorch version, there is no sess in input arguments def step(self, sess): - # can do some processing based on the model or weights binded - # in the func bind_model + """ + Can do some processing based on the model or weights binded + in the func bind_model + """ pass ``` diff --git a/docs/en_US/Compressor/Pruner.md b/docs/en_US/Compressor/Pruner.md index 6e0a521be2..731503fc2d 100644 --- a/docs/en_US/Compressor/Pruner.md +++ b/docs/en_US/Compressor/Pruner.md @@ -13,16 +13,16 @@ Tensorflow code ``` from nni.compression.tensorflow import LevelPruner config_list = [{ 'sparsity': 0.8, 'op_types': ['default'] }] -pruner = LevelPruner(config_list) 
-pruner(model_graph) +pruner = LevelPruner(model_graph, config_list) +pruner.compress() ``` PyTorch code ``` from nni.compression.torch import LevelPruner config_list = [{ 'sparsity': 0.8, 'op_types': ['default'] }] -pruner = LevelPruner(config_list) -pruner(model) +pruner = LevelPruner(model, config_list) +pruner.compress() ``` #### User configuration for Level Pruner @@ -53,8 +53,8 @@ config_list = [{ 'frequency': 1, 'op_types': 'default' }] -pruner = AGP_Pruner(config_list) -pruner(tf.get_default_graph()) +pruner = AGP_Pruner(tf.get_default_graph(), config_list) +pruner.compress() ``` PyTorch code ```python @@ -67,8 +67,8 @@ config_list = [{ 'frequency': 1, 'op_types': 'default' }] -pruner = AGP_Pruner(config_list) -pruner(model) +pruner = AGP_Pruner(model, config_list) +pruner.compress() ``` Second, you should add code below to update epoch number when you finish one epoch in your training code. diff --git a/docs/en_US/Compressor/Quantizer.md b/docs/en_US/Compressor/Quantizer.md index 3839623024..fb03250b00 100644 --- a/docs/en_US/Compressor/Quantizer.md +++ b/docs/en_US/Compressor/Quantizer.md @@ -8,11 +8,11 @@ We provide Naive Quantizer to quantizer weight to default 8 bits, you can use it ### Usage tensorflow ```python -nni.compressors.tensorflow.NaiveQuantizer()(model_graph) +nni.compressors.tensorflow.NaiveQuantizer(model_graph).compress() ``` pytorch ```python -nni.compressors.torch.NaiveQuantizer()(model) +nni.compressors.torch.NaiveQuantizer(model).compress() ``` *** @@ -32,15 +32,15 @@ Tensorflow code ```python from nni.compressors.tensorflow import QAT_Quantizer config_list = [{ 'q_bits': 8, 'op_types': ['default'] }] -quantizer = QAT_Quantizer(config_list) -quantizer(tf.get_default_graph()) +quantizer = QAT_Quantizer(tf.get_default_graph(), config_list) +quantizer.compress() ``` PyTorch code ```python from nni.compressors.torch import QAT_Quantizer config_list = [{ 'q_bits': 8, 'op_types': ['default'] }] -quantizer = QAT_Quantizer(config_list) 
-quantizer(model) +quantizer = QAT_Quantizer(model, config_list) +quantizer.compress() ``` You can view example for more information @@ -61,15 +61,15 @@ Tensorflow code ```python from nni.compressors.tensorflow import DoReFaQuantizer config_list = [{ 'q_bits': 8, 'op_types': 'default' }] -quantizer = DoReFaQuantizer(config_list) -quantizer(tf.get_default_graph()) +quantizer = DoReFaQuantizer(tf.get_default_graph(), config_list) +quantizer.compress() ``` PyTorch code ```python from nni.compressors.torch import DoReFaQuantizer config_list = [{ 'q_bits': 8, 'op_types': 'default' }] -quantizer = DoReFaQuantizer(config_list) -quantizer(model) +quantizer = DoReFaQuantizer(model, config_list) +quantizer.compress() ``` You can view example for more information diff --git a/examples/model_compress/main_tf_pruner.py b/examples/model_compress/main_tf_pruner.py index 8714e6fa8a..90ef552b69 100644 --- a/examples/model_compress/main_tf_pruner.py +++ b/examples/model_compress/main_tf_pruner.py @@ -93,15 +93,13 @@ def main(): 'frequency': 1, 'op_types': ['default'] }] - pruner = AGP_Pruner(configure_list) + pruner = AGP_Pruner(tf.get_default_graph(), configure_list) # if you want to load from yaml file # configure_file = nni.compressors.tf_compressor._nnimc_tf._tf_default_load_configure_file('configure_example.yaml','AGPruner') # configure_list = configure_file.get('config',[]) # pruner.load_configure(configure_list) # you can also handle it yourself and input an configure list in json - pruner(tf.get_default_graph()) - # you can also use compress(model) or compress_default_graph() for tensorflow compressor - # pruner.compress(tf.get_default_graph()) + pruner.compress() with tf.Session() as sess: sess.run(tf.global_variables_initializer()) diff --git a/examples/model_compress/main_tf_quantizer.py b/examples/model_compress/main_tf_quantizer.py index 0c50f6cb9a..ac74bcf5c0 100644 --- a/examples/model_compress/main_tf_quantizer.py +++ b/examples/model_compress/main_tf_quantizer.py @@ 
-83,8 +83,8 @@ def main(): DoReFaQuantizer(configure_list).compress(tf.get_default_graph()) ''' configure_list = [{'q_bits':8, 'op_types':['default']}] - quantizer = QAT_Quantizer(configure_list) - quantizer(tf.get_default_graph()) + quantizer = QAT_Quantizer(tf.get_default_graph(), configure_list) + quantizer.compress() # you can also use compress(model) or compress_default_graph() # method like QATquantizer(q_bits = 8).compress_default_graph() diff --git a/examples/model_compress/main_torch_pruner.py b/examples/model_compress/main_torch_pruner.py index 6ec8b069ba..e0b1be0251 100644 --- a/examples/model_compress/main_torch_pruner.py +++ b/examples/model_compress/main_torch_pruner.py @@ -79,8 +79,8 @@ def main(): 'op_types': ['default'] }] - pruner = AGP_Pruner(configure_list) - pruner(model) + pruner = AGP_Pruner(model, configure_list) + pruner.compress() # you can also use compress(model) method # like that pruner.compress(model) diff --git a/examples/model_compress/main_torch_quantizer.py b/examples/model_compress/main_torch_quantizer.py index 3e88f2660a..545478f111 100644 --- a/examples/model_compress/main_torch_quantizer.py +++ b/examples/model_compress/main_torch_quantizer.py @@ -69,8 +69,8 @@ def main(): DoReFaQuantizer(configure_list).compress(model) ''' configure_list = [{'q_bits':8, 'op_types':['default']}] - quantizer = QAT_Quantizer(configure_list) - quantizer(model) + quantizer = QAT_Quantizer(model, configure_list) + quantizer.compress() # you can also use compress(model) method # like thaht quantizer.compress(model) diff --git a/src/sdk/pynni/nni/compression/tensorflow/builtin_pruners.py b/src/sdk/pynni/nni/compression/tensorflow/builtin_pruners.py index e3f99e0f98..ed3493dfd5 100644 --- a/src/sdk/pynni/nni/compression/tensorflow/builtin_pruners.py +++ b/src/sdk/pynni/nni/compression/tensorflow/builtin_pruners.py @@ -8,16 +8,18 @@ class LevelPruner(Pruner): - def __init__(self, config_list): + def __init__(self, model, config_list): """ config_list: 
supported keys: - sparsity """ - super().__init__(config_list) + super().__init__(model, config_list) self.mask_list = {} self.if_init_list = {} - def calc_mask(self, weight, config, op_name, **kwargs): + def calc_mask(self, layer, config): + weight = layer.weight + op_name = layer.name if self.if_init_list.get(op_name, True): threshold = tf.contrib.distributions.percentile(tf.abs(weight), config['sparsity'] * 100) mask = tf.cast(tf.math.greater(tf.abs(weight), threshold), weight.dtype) @@ -38,7 +40,7 @@ class AGP_Pruner(Pruner): https://arxiv.org/pdf/1710.01878.pdf """ - def __init__(self, config_list): + def __init__(self, model, config_list): """ config_list: supported keys: - initial_sparsity @@ -47,13 +49,15 @@ def __init__(self, config_list): - end_epoch: end epoch number stop update mask - frequency: if you want update every 2 epoch, you can set it 2 """ - super().__init__(config_list) + super().__init__(model, config_list) self.mask_list = {} self.if_init_list = {} self.now_epoch = tf.Variable(0) self.assign_handler = [] - def calc_mask(self, weight, config, op_name, **kwargs): + def calc_mask(self, layer, config): + weight = layer.weight + op_name = layer.name start_epoch = config.get('start_epoch', 0) freq = config.get('frequency', 1) if self.now_epoch >= start_epoch and self.if_init_list.get(op_name, True) and ( diff --git a/src/sdk/pynni/nni/compression/tensorflow/builtin_quantizers.py b/src/sdk/pynni/nni/compression/tensorflow/builtin_quantizers.py index 8e0b47d28b..fbf6168e39 100644 --- a/src/sdk/pynni/nni/compression/tensorflow/builtin_quantizers.py +++ b/src/sdk/pynni/nni/compression/tensorflow/builtin_quantizers.py @@ -10,8 +10,8 @@ class NaiveQuantizer(Quantizer): """quantize weight to 8 bits """ - def __init__(self, config_list): - super().__init__(config_list) + def __init__(self, model, config_list): + super().__init__(model, config_list) self.layer_scale = {} def quantize_weight(self, weight, config, op_name, **kwargs): @@ -27,12 +27,12 @@ 
class QAT_Quantizer(Quantizer): Quantization and Training of Neural Networks for Efficient Integer-Arithmetic-Only Inference http://openaccess.thecvf.com/content_cvpr_2018/papers/Jacob_Quantization_and_Training_CVPR_2018_paper.pdf """ - def __init__(self, config_list): + def __init__(self, model, config_list): """ config_list: supported keys: - q_bits """ - super().__init__(config_list) + super().__init__(model, config_list) def quantize_weight(self, weight, config, **kwargs): a = tf.stop_gradient(tf.reduce_min(weight)) @@ -52,12 +52,12 @@ class DoReFaQuantizer(Quantizer): Zhou et al., DoReFa-Net: Training Low Bitwidth Convolutional Neural Networks with Low Bitwidth Gradients (https://arxiv.org/abs/1606.06160) """ - def __init__(self, config_list): + def __init__(self, model, config_list): """ config_list: supported keys: - q_bits """ - super().__init__(config_list) + super().__init__(model, config_list) def quantize_weight(self, weight, config, **kwargs): a = tf.math.tanh(weight) diff --git a/src/sdk/pynni/nni/compression/tensorflow/compressor.py b/src/sdk/pynni/nni/compression/tensorflow/compressor.py index f092ebad30..c46e883342 100644 --- a/src/sdk/pynni/nni/compression/tensorflow/compressor.py +++ b/src/sdk/pynni/nni/compression/tensorflow/compressor.py @@ -6,67 +6,85 @@ class LayerInfo: - def __init__(self, op): + def __init__(self, op, weight, weight_op): self.op = op self.name = op.name self.type = op.type + self.weight = weight + self.weight_op = weight_op class Compressor: - """Abstract base TensorFlow compressor""" + """ + Abstract base TensorFlow compressor + """ - def __init__(self, config_list): - self._bound_model = None - self._config_list = config_list + def __init__(self, model, config_list): + """ + Record necessary info in class members + + Parameters + ---------- + model : pytorch model + the model user wants to compress + config_list : list + the configurations that users specify for compression + """ + self.bound_model = model + 
self.config_list = config_list + self.modules_to_compress = [] - def __call__(self, model): - """Compress given graph with algorithm implemented by subclass. - The graph will be editted and returned. + def compress(self): """ - self.compress(model) - return model + Compress the model with algorithm implemented by subclass. - def compress(self, model): - """Compress given graph with algorithm implemented by subclass. - This will edit the graph. + The model will be instrumented and user should never edit it after calling this method. + `self.modules_to_compress` records all the to-be-compressed layers """ - assert self._bound_model is None, "Each NNI compressor instance can only compress one model" - self._bound_model = model - self.bind_model(model) - for op in model.get_operations(): - layer = LayerInfo(op) - config = self._select_config(layer) + for op in self.bound_model.get_operations(): + weight_index = _detect_weight_index(op) + if weight_index is None: + _logger.warning('Failed to detect weight for layer %s', op.name) + return + weight_op = op.inputs[weight_index].op + weight = weight_op.inputs[0] + + layer = LayerInfo(op, weight, weight_op) + config = self.select_config(layer) if config is not None: self._instrument_layer(layer, config) + self.modules_to_compress.append((layer, config)) + return self.bound_model - def compress_default_graph(self): - """Compress the default graph with algorithm implemented by subclass. - This will edit the default graph. + def get_modules_to_compress(self): """ - self.compress(tf.get_default_graph()) - + To obtain all the to-be-compressed layers. - def bind_model(self, model): - """This method is called when a model is bound to the compressor. - Compressors can optionally overload this method to do model-specific initialization. - It is guaranteed that only one model will be bound to each compressor instance. 
+ Returns + ------- + self.modules_to_compress : list + a list of the layers, each of which is a tuple (`layer`, `config`), + `layer` is `LayerInfo`, `config` is a `dict` """ + return self.modules_to_compress - def update_epoch(self, epoch, sess): - """If user want to update mask every epoch, user can override this method + def select_config(self, layer): """ - - def step(self, sess): - """If user want to update mask every step, user can override this method + Find the configuration for `layer` by parsing `self.config_list` + + Parameters + ---------- + layer : LayerInfo + one layer + + Returns + ------- + ret : config or None + the retrieved configuration for this layer, if None, this layer should + not be compressed """ - - - def _instrument_layer(self, layer, config): - raise NotImplementedError() - - def _select_config(self, layer): ret = None - for config in self._config_list: + for config in self.config_list: op_types = config.get('op_types') if op_types == 'default': op_types = default_layers.op_weight_index.keys() @@ -79,35 +97,72 @@ def _select_config(self, layer): return None return ret + def update_epoch(self, epoch, sess): + """ + If user want to update model every epoch, user can override this method. 
+ This method should be called at the beginning of each epoch + + Parameters + ---------- + epoch : num + the current epoch number + """ + + def step(self, sess): + """ + If user want to update mask every step, user can override this method + """ + + + def _instrument_layer(self, layer, config): + """ + This method is implemented in the subclasses, i.e., `Pruner` and `Quantizer` + + Parameters + ---------- + layer : LayerInfo + the layer to instrument the compression operation + config : dict + the configuration for compressing this layer + """ + raise NotImplementedError() + class Pruner(Compressor): """ Abstract base TensorFlow pruner """ - def calc_mask(self, weight, config, op, op_type, op_name): - """Pruners should overload this method to provide mask for weight tensors. + def calc_mask(self, layer, config): + """ + Pruners should overload this method to provide mask for weight tensors. The mask must have the same shape and type comparing to the weight. - It will be applied with `multiply()` operation. - This method works as a subgraph which will be inserted into the bound model. + It will be applied with `mul()` operation on the weight. + This method is effectively hooked to `forward()` method of the model. 
+ + Parameters + ---------- + layer : LayerInfo + calculate mask for `layer`'s weight + config : dict + the configuration for generating the mask """ raise NotImplementedError("Pruners must overload calc_mask()") def _instrument_layer(self, layer, config): - # it seems the graph editor can only swap edges of nodes or remove all edges from a node - # it cannot remove one edge from a node, nor can it assign a new edge to a node - # we assume there is a proxy operation between the weight and the Conv2D layer - # this is true as long as the weight is `tf.Value` - # not sure what will happen if the weight is calculated from other operations - weight_index = _detect_weight_index(layer) - if weight_index is None: - _logger.warning('Failed to detect weight for layer %s', layer.name) - return - weight_op = layer.op.inputs[weight_index].op - weight = weight_op.inputs[0] - mask = self.calc_mask(weight, config, op=layer.op, op_type=layer.type, op_name=layer.name) - new_weight = weight * mask - tf.contrib.graph_editor.swap_outputs(weight_op, new_weight.op) + """ + Create a wrapper forward function to replace the original one. 
+ + Parameters + ---------- + layer : LayerInfo + the layer to instrument the mask + config : dict + the configuration for generating the mask + """ + mask = self.calc_mask(layer, config) + new_weight = layer.weight * mask + tf.contrib.graph_editor.swap_outputs(layer.weight_op, new_weight.op) class Quantizer(Compressor): @@ -133,7 +188,7 @@ def _detect_weight_index(layer): index = default_layers.op_weight_index.get(layer.type) if index is not None: return index - weight_indices = [i for i, op in enumerate(layer.op.inputs) if op.name.endswith('Variable/read')] + weight_indices = [i for i, op in enumerate(layer.inputs) if op.name.endswith('Variable/read')] if len(weight_indices) == 1: return weight_indices[0] return None diff --git a/src/sdk/pynni/nni/compression/torch/builtin_pruners.py b/src/sdk/pynni/nni/compression/torch/builtin_pruners.py index 1ac951c8f9..593ba3f100 100644 --- a/src/sdk/pynni/nni/compression/torch/builtin_pruners.py +++ b/src/sdk/pynni/nni/compression/torch/builtin_pruners.py @@ -11,16 +11,18 @@ class LevelPruner(Pruner): """Prune to an exact pruning level specification """ - def __init__(self, config_list): + def __init__(self, model, config_list): """ config_list: supported keys: - sparsity """ - super().__init__(config_list) + super().__init__(model, config_list) self.mask_list = {} self.if_init_list = {} - def calc_mask(self, weight, config, op_name, **kwargs): + def calc_mask(self, layer, config): + weight = layer.module.weight.data + op_name = layer.name if self.if_init_list.get(op_name, True): w_abs = weight.abs() k = int(weight.numel() * config['sparsity']) @@ -45,7 +47,7 @@ class AGP_Pruner(Pruner): https://arxiv.org/pdf/1710.01878.pdf """ - def __init__(self, config_list): + def __init__(self, model, config_list): """ config_list: supported keys: - initial_sparsity @@ -54,12 +56,14 @@ def __init__(self, config_list): - end_epoch: end epoch number stop update mask, you should make sure start_epoch <= end_epoch - frequency: if you want 
update every 2 epoch, you can set it 2 """ - super().__init__(config_list) + super().__init__(model, config_list) self.mask_list = {} self.now_epoch = 0 self.if_init_list = {} - def calc_mask(self, weight, config, op_name, **kwargs): + def calc_mask(self, layer, config): + weight = layer.module.weight.data + op_name = layer.name start_epoch = config.get('start_epoch', 0) freq = config.get('frequency', 1) if self.now_epoch >= start_epoch and self.if_init_list.get(op_name, True) and ( diff --git a/src/sdk/pynni/nni/compression/torch/builtin_quantizers.py b/src/sdk/pynni/nni/compression/torch/builtin_quantizers.py index 5f4e46626b..bede5b2e59 100644 --- a/src/sdk/pynni/nni/compression/torch/builtin_quantizers.py +++ b/src/sdk/pynni/nni/compression/torch/builtin_quantizers.py @@ -10,8 +10,8 @@ class NaiveQuantizer(Quantizer): """quantize weight to 8 bits """ - def __init__(self, config_list): - super().__init__(config_list) + def __init__(self, model, config_list): + super().__init__(model, config_list) self.layer_scale = {} def quantize_weight(self, weight, config, op_name, **kwargs): @@ -27,12 +27,12 @@ class QAT_Quantizer(Quantizer): Quantization and Training of Neural Networks for Efficient Integer-Arithmetic-Only Inference http://openaccess.thecvf.com/content_cvpr_2018/papers/Jacob_Quantization_and_Training_CVPR_2018_paper.pdf """ - def __init__(self, config_list): + def __init__(self, model, config_list): """ config_list: supported keys: - q_bits """ - super().__init__(config_list) + super().__init__(model, config_list) def quantize_weight(self, weight, config, **kwargs): if config['q_bits'] <= 1: @@ -53,12 +53,12 @@ class DoReFaQuantizer(Quantizer): Zhou et al., DoReFa-Net: Training Low Bitwidth Convolutional Neural Networks with Low Bitwidth Gradients (https://arxiv.org/abs/1606.06160) """ - def __init__(self, config_list): + def __init__(self, model, config_list): """ config_list: supported keys: - q_bits """ - super().__init__(config_list) + 
super().__init__(model, config_list) def quantize_weight(self, weight, config, **kwargs): out = weight.tanh() diff --git a/src/sdk/pynni/nni/compression/torch/compressor.py b/src/sdk/pynni/nni/compression/torch/compressor.py index 5909f3b6a4..580b1c1fac 100644 --- a/src/sdk/pynni/nni/compression/torch/compressor.py +++ b/src/sdk/pynni/nni/compression/torch/compressor.py @@ -15,49 +15,69 @@ def __init__(self, name, module): class Compressor: - """Abstract base PyTorch compressor""" + """ + Abstract base PyTorch compressor + """ - def __init__(self, config_list): - self._bound_model = None - self._config_list = config_list + def __init__(self, model, config_list): + """ + Record necessary info in class members + + Parameters + ---------- + model : pytorch model + the model user wants to compress + config_list : list + the configurations that users specify for compression + """ + self.bound_model = model + self.config_list = config_list + self.modules_to_compress = [] - def __call__(self, model): - self.compress(model) - return model + def compress(self): + """ + Compress the model with algorithm implemented by subclass. - def compress(self, model): - """Compress the model with algorithm implemented by subclass. The model will be instrumented and user should never edit it after calling this method. + `self.modules_to_compress` records all the to-be-compressed layers """ - assert self._bound_model is None, "Each NNI compressor instance can only compress one model" - self._bound_model = model - self.bind_model(model) - for name, module in model.named_modules(): + for name, module in self.bound_model.named_modules(): layer = LayerInfo(name, module) - config = self._select_config(layer) + config = self.select_config(layer) if config is not None: self._instrument_layer(layer, config) + self.modules_to_compress.append((layer, config)) + return self.bound_model - def bind_model(self, model): - """This method is called when a model is bound to the compressor. 
- Users can optionally overload this method to do model-specific initialization. - It is guaranteed that only one model will be bound to each compressor instance. + def get_modules_to_compress(self): """ + To obtain all the to-be-compressed layers. - def update_epoch(self, epoch): - """if user want to update model every epoch, user can override this method + Returns + ------- + self.modules_to_compress : list + a list of the layers, each of which is a tuple (`layer`, `config`), + `layer` is `LayerInfo`, `config` is a `dict` """ + return self.modules_to_compress - def step(self): - """if user want to update model every step, user can override this method + def select_config(self, layer): + """ + Find the configuration for `layer` by parsing `self.config_list` + + Parameters + ---------- + layer : LayerInfo + one layer + + Returns + ------- + ret : config or None + the retrieved configuration for this layer, if None, this layer should + not be compressed """ - - def _instrument_layer(self, layer, config): - raise NotImplementedError() - - def _select_config(self, layer): ret = None - for config in self._config_list: + for config in self.config_list: config['op_types'] = self._expand_config_op_types(config) if layer.type not in config['op_types']: continue @@ -68,6 +88,35 @@ def _select_config(self, layer): return None return ret + def update_epoch(self, epoch): + """ + If user want to update model every epoch, user can override this method. 
+ This method should be called at the beginning of each epoch + + Parameters + ---------- + epoch : num + the current epoch number + """ + + def step(self): + """ + If user want to update model every step, user can override this method + """ + + def _instrument_layer(self, layer, config): + """ + This method is implemented in the subclasses, i.e., `Pruner` and `Quantizer` + + Parameters + ---------- + layer : LayerInfo + the layer to instrument the compression operation + config : dict + the configuration for compressing this layer + """ + raise NotImplementedError() + def _expand_config_op_types(self, config): if config is None: return [] @@ -84,17 +133,33 @@ class Pruner(Compressor): Abstract base PyTorch pruner """ - def calc_mask(self, weight, config, op, op_type, op_name): - """Pruners should overload this method to provide mask for weight tensors. + def calc_mask(self, layer, config): + """ + Pruners should overload this method to provide mask for weight tensors. The mask must have the same shape and type comparing to the weight. - It will be applied with `mul()` operation. + It will be applied with `mul()` operation on the weight. This method is effectively hooked to `forward()` method of the model. + + Parameters + ---------- + layer : LayerInfo + calculate mask for `layer`'s weight + config : dict + the configuration for generating the mask """ raise NotImplementedError("Pruners must overload calc_mask()") def _instrument_layer(self, layer, config): - # TODO: support multiple weight tensors - # create a wrapper forward function to replace the original one + """ + Create a wrapper forward function to replace the original one. 
+ + Parameters + ---------- + layer : LayerInfo + the layer to instrument the mask + config : dict + the configuration for generating the mask + """ assert layer._forward is None, 'Each model can only be compressed once' if not _check_weight(layer.module): _logger.warning('Module %s does not have parameter "weight"', layer.name) @@ -104,12 +169,10 @@ def _instrument_layer(self, layer, config): def new_forward(*inputs): # apply mask to weight old_weight = layer.module.weight.data - mask = self.calc_mask(old_weight, config, op=layer.module, op_type=layer.type, op_name=layer.name) + mask = self.calc_mask(layer, config) layer.module.weight.data = old_weight.mul(mask) # calculate forward ret = layer._forward(*inputs) - # recover original weight - layer.module.weight.data = old_weight return ret layer.module.forward = new_forward diff --git a/src/sdk/pynni/tests/test_compressor.py b/src/sdk/pynni/tests/test_compressor.py index d921f82a75..ca8b628640 100644 --- a/src/sdk/pynni/tests/test_compressor.py +++ b/src/sdk/pynni/tests/test_compressor.py @@ -101,20 +101,20 @@ class CompressorTestCase(TestCase): def test_tf_pruner(self): model = TfMnist() configure_list = [{'sparsity': 0.8, 'op_types': ['default']}] - tf_compressor.LevelPruner(configure_list).compress_default_graph() + tf_compressor.LevelPruner(tf.get_default_graph(), configure_list).compress() def test_tf_quantizer(self): model = TfMnist() - tf_compressor.NaiveQuantizer([{'op_types': ['default']}]).compress_default_graph() + tf_compressor.NaiveQuantizer(tf.get_default_graph(), [{'op_types': ['default']}]).compress() def test_torch_pruner(self): model = TorchMnist() configure_list = [{'sparsity': 0.8, 'op_types': ['default']}] - torch_compressor.LevelPruner(configure_list).compress(model) + torch_compressor.LevelPruner(model, configure_list).compress() def test_torch_quantizer(self): model = TorchMnist() - torch_compressor.NaiveQuantizer([{'op_types': ['default']}]).compress(model) + 
torch_compressor.NaiveQuantizer(model, [{'op_types': ['default']}]).compress() if __name__ == '__main__': From 1f9b76173c1a60f31f29aedb2d5c771dde44c2bf Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Mon, 4 Nov 2019 12:49:35 +0800 Subject: [PATCH 02/11] Add comprehensive tests for tuners (merge into master) (#1681) * Add comprehensive tests for tuners (#1570) --- azure-pipelines.yml | 8 +- docs/en_US/Tuner/BuiltinTuner.md | 6 +- docs/en_US/Tutorial/SearchSpaceSpec.md | 12 +- .../nni/evolution_tuner/evolution_tuner.py | 4 +- src/sdk/pynni/nni/nas_utils.py | 2 + src/sdk/pynni/nni/parameter_expressions.py | 2 +- src/sdk/pynni/tests/assets/search_space.json | 88 ++++++ src/sdk/pynni/tests/test_msg_dispatcher.py | 125 +++++++++ src/sdk/pynni/tests/test_tuner.py | 265 +++++++++++------- test/metrics_test.py | 2 + test/unittest.sh | 4 +- 11 files changed, 410 insertions(+), 108 deletions(-) create mode 100644 src/sdk/pynni/tests/assets/search_space.json create mode 100644 src/sdk/pynni/tests/test_msg_dispatcher.py diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 4d6e23c9f9..33ec934627 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -25,6 +25,8 @@ jobs: displayName: 'Run flake8 tests to find Python syntax errors and undefined names' - script: | cd test + sudo apt install -y swig + PATH=$HOME/.local/bin:$PATH nnictl package install --name=SMAC source unittest.sh displayName: 'Unit test' - script: | @@ -65,7 +67,11 @@ jobs: displayName: 'Install nni toolkit via source code' - script: | cd test - PATH=$HOME/Library/Python/3.7/bin:$PATH && source unittest.sh + ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)" < /dev/null 2> /dev/null + brew install swig@3 + ln -s /usr/local/opt/swig\@3/bin/swig /usr/local/bin/swig + PATH=$HOME/Library/Python/3.7/bin:$PATH nnictl package install --name=SMAC + PATH=$HOME/Library/Python/3.7/bin:$PATH source unittest.sh displayName: 'Unit test' - script: | cd test diff --git 
a/docs/en_US/Tuner/BuiltinTuner.md b/docs/en_US/Tuner/BuiltinTuner.md index b7eca12075..9dd9085e4c 100644 --- a/docs/en_US/Tuner/BuiltinTuner.md +++ b/docs/en_US/Tuner/BuiltinTuner.md @@ -122,7 +122,7 @@ Its requirement of computation resource is relatively high. Specifically, it req * **optimize_mode** (*maximize or minimize, optional, default = maximize*) - If 'maximize', the tuner will target to maximize metrics. If 'minimize', the tuner will target to minimize metrics. -* **population_size** (*int value (should > 0), optional, default = 20*) - the initial size of the population(trial num) in evolution tuner. Suggests `population_size` be much larger than `concurrency`, so users can get the most out of the algorithm (and at least `concurrency`, or the tuner will fail on their first generation of parameters). +* **population_size** (*int value (should > 0), optional, default = 20*) - the initial size of the population (trial num) in evolution tuner. Suggests `population_size` be much larger than `concurrency`, so users can get the most out of the algorithm (and at least `concurrency`, or the tuner will fail on their first generation of parameters). **Usage example** @@ -143,11 +143,11 @@ tuner: > Built-in Tuner Name: **SMAC** -**Please note that SMAC doesn't support running on windows currently. The specific reason can be referred to this [GitHub issue](https://github.com/automl/SMAC3/issues/483).** +**Please note that SMAC doesn't support running on Windows currently. The specific reason can be referred to this [GitHub issue](https://github.com/automl/SMAC3/issues/483).** **Installation** -SMAC need to be installed by following command before first use. +SMAC need to be installed by following command before first use. As a reminder, `swig` is required for SMAC: for Ubuntu `swig` can be installed with `apt`. 
```bash nnictl package install --name=SMAC diff --git a/docs/en_US/Tutorial/SearchSpaceSpec.md b/docs/en_US/Tutorial/SearchSpaceSpec.md index b892a5e1e5..fd1781716f 100644 --- a/docs/en_US/Tutorial/SearchSpaceSpec.md +++ b/docs/en_US/Tutorial/SearchSpaceSpec.md @@ -21,6 +21,8 @@ To define a search space, users should define the name of variable, the type of Take the first line as an example. `dropout_rate` is defined as a variable whose priori distribution is a uniform distribution of a range from `0.1` and `0.5`. +Note that the ability of a search space is highly connected with your tuner. We listed the supported types for each builtin tuner below. For a customized tuner, you don't have to follow our convention and you will have the flexibility to define any type you want. + ## Types All types of sampling strategies and their parameter are listed here: @@ -74,6 +76,8 @@ All types of sampling strategies and their parameter are listed here: * `{"_type": "mutable_layer", "_value": {mutable_layer_infomation}}` * Type for [Neural Architecture Search Space][1]. Value is also a dictionary, which contains key-value pairs representing respectively name and search space of each mutable_layer. * For now, users can only use this type of search space with annotation, which means that there is no need to define a json file for search space since it will be automatically generated according to the annotation in trial code. + * The following HPO tuners can be adapted to tune this search space: TPE, Random, Anneal, Evolution, Grid Search, + Hyperband and BOHB. * For detailed usage, please refer to [General NAS Interfaces][1]. 
## Search Space Types Supported by Each Tuner @@ -86,20 +90,20 @@ All types of sampling strategies and their parameter are listed here: | Evolution Tuner | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | | SMAC Tuner | ✓ | ✓ | ✓ | ✓ | ✓ | | | | | | | Batch Tuner | ✓ | | | | | | | | | | -| Grid Search Tuner | ✓ | ✓ | | ✓ | | | | | | | +| Grid Search Tuner | ✓ | ✓ | | ✓ | | | | | | | | Hyperband Advisor | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | | Metis Tuner | ✓ | ✓ | ✓ | ✓ | | | | | | | -| GP Tuner | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | | | | | +| GP Tuner | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | | | | | Known Limitations: -* GP Tuner and Metis Tuner support only **numerical values** in search space(`choice` type values can be no-numeraical with other tuners, e.g. string values). Both GP Tuner and Metis Tuner use Gaussian Process Regressor(GPR). GPR make predictions based on a kernel function and the 'distance' between different points, it's hard to get the true distance between no-numerical values. +* GP Tuner and Metis Tuner support only **numerical values** in search space (`choice` type values can be no-numeraical with other tuners, e.g. string values). Both GP Tuner and Metis Tuner use Gaussian Process Regressor(GPR). GPR make predictions based on a kernel function and the 'distance' between different points, it's hard to get the true distance between no-numerical values. 
* Note that for nested search space: * Only Random Search/TPE/Anneal/Evolution tuner supports nested search space - * We do not support nested search space "Hyper Parameter" in visualization now, the enhancement is being considered in #1110(https://github.com/microsoft/nni/issues/1110), any suggestions or discussions or contributions are warmly welcomed + * We do not support nested search space "Hyper Parameter" in visualization now, the enhancement is being considered in [#1110](https://github.com/microsoft/nni/issues/1110), any suggestions or discussions or contributions are warmly welcomed [1]: ../AdvancedFeature/GeneralNasInterfaces.md diff --git a/src/sdk/pynni/nni/evolution_tuner/evolution_tuner.py b/src/sdk/pynni/nni/evolution_tuner/evolution_tuner.py index 1191ca2357..8cec6df1ce 100644 --- a/src/sdk/pynni/nni/evolution_tuner/evolution_tuner.py +++ b/src/sdk/pynni/nni/evolution_tuner/evolution_tuner.py @@ -158,11 +158,11 @@ class EvolutionTuner(Tuner): EvolutionTuner is tuner using navie evolution algorithm. """ - def __init__(self, optimize_mode, population_size=32): + def __init__(self, optimize_mode="maximize", population_size=32): """ Parameters ---------- - optimize_mode : str + optimize_mode : str, default 'maximize' population_size : int initial population size. The larger population size, the better evolution performance. 
diff --git a/src/sdk/pynni/nni/nas_utils.py b/src/sdk/pynni/nni/nas_utils.py index cb985536c4..fa3748b882 100644 --- a/src/sdk/pynni/nni/nas_utils.py +++ b/src/sdk/pynni/nni/nas_utils.py @@ -265,6 +265,8 @@ def convert_nas_search_space(search_space): param search_space: raw search space return: the new search space, mutable_layers will be converted into choice """ + if not isinstance(search_space, dict): + return search_space ret = dict() for k, v in search_space.items(): if "_type" not in v: diff --git a/src/sdk/pynni/nni/parameter_expressions.py b/src/sdk/pynni/nni/parameter_expressions.py index 838f1f2484..97142d5c3b 100644 --- a/src/sdk/pynni/nni/parameter_expressions.py +++ b/src/sdk/pynni/nni/parameter_expressions.py @@ -48,7 +48,7 @@ def uniform(low, high, random_state): high: an float that represent an upper bound random_state: an object of numpy.random.RandomState ''' - assert high > low, 'Upper bound must be larger than lower bound' + assert high >= low, 'Upper bound must be larger than lower bound' return random_state.uniform(low, high) diff --git a/src/sdk/pynni/tests/assets/search_space.json b/src/sdk/pynni/tests/assets/search_space.json new file mode 100644 index 0000000000..0e7c7ba9cc --- /dev/null +++ b/src/sdk/pynni/tests/assets/search_space.json @@ -0,0 +1,88 @@ +{ + "choice_str": { + "_type": "choice", + "_value": ["cat", "dog", "elephant", "cow", "sheep", "panda"], + "fail": ["metis", "gp"] + }, + "choice_int": { + "_type": "choice", + "_value": [42, 43, -1] + }, + "choice_mixed": { + "_type": "choice", + "_value": [0.3, "cat", 1, null], + "fail": ["metis", "gp"] + }, + "choice_float": { + "_type": "choice", + "_value": [0.3, 1, 2.0] + }, + "choice_single": { + "_type": "choice", + "_value": [1] + }, + "randint_ok": { + "_type": "randint", + "_value": [-2, 3] + }, + "randint_single": { + "_type": "randint", + "_value": [10, 11] + }, + "randint_fail_equal": { + "_type": "randint", + "_value": [0, 0] + }, + "uniform_ok": { + "_type": "uniform", + 
"_value": [-1.0, 1.5] + }, + "uniform_equal": { + "_type": "uniform", + "_value": [99.9, 99.9] + }, + "quniform_ok": { + "_type": "quniform", + "_value": [0.0, 10.0, 2.5] + }, + "quniform_clip": { + "_type": "quniform", + "_value": [2.0, 10.0, 5.0] + }, + "quniform_clip_2": { + "_type": "quniform", + "_value": [-5.5, -0.5, 6] + }, + "loguniform_ok": { + "_type": "loguniform", + "_value": [0.001, 100] + }, + "loguniform_equal": { + "_type": "loguniform", + "_value": [1, 1] + }, + "qloguniform_ok": { + "_type": "qloguniform", + "_value": [0.001, 100, 1] + }, + "qloguniform_equal": { + "_type": "qloguniform", + "_value": [2, 2, 1] + }, + "normal_ok": { + "_type": "normal", + "_value": [-1.0, 5.0] + }, + "qnormal_ok": { + "_type": "qnormal", + "_value": [-1.5, 5.0, 0.1] + }, + "lognormal_ok": { + "_type": "lognormal", + "_value": [-1.0, 5.0] + }, + "qlognormal_ok": { + "_type": "qlognormal", + "_value": [-1.5, 5.0, 0.1] + } +} \ No newline at end of file diff --git a/src/sdk/pynni/tests/test_msg_dispatcher.py b/src/sdk/pynni/tests/test_msg_dispatcher.py new file mode 100644 index 0000000000..883e2349c3 --- /dev/null +++ b/src/sdk/pynni/tests/test_msg_dispatcher.py @@ -0,0 +1,125 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and +# associated documentation files (the "Software"), to deal in the Software without restriction, +# including without limitation the rights to use, copy, modify, merge, publish, distribute, +# sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or +# substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT +# NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT +# OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# ================================================================================================== + + +import json +from io import BytesIO +from unittest import TestCase, main + +import nni.protocol +from nni.msg_dispatcher import MsgDispatcher +from nni.protocol import CommandType, send, receive +from nni.tuner import Tuner +from nni.utils import extract_scalar_reward + + +class NaiveTuner(Tuner): + def __init__(self): + self.param = 0 + self.trial_results = [] + self.search_space = None + self._accept_customized_trials() + + def generate_parameters(self, parameter_id, **kwargs): + # report Tuner's internal states to generated parameters, + # so we don't need to pause the main loop + self.param += 2 + return { + 'param': self.param, + 'trial_results': self.trial_results, + 'search_space': self.search_space + } + + def receive_trial_result(self, parameter_id, parameters, value, **kwargs): + reward = extract_scalar_reward(value) + self.trial_results.append((parameter_id, parameters['param'], reward, kwargs.get("customized"))) + + def update_search_space(self, search_space): + self.search_space = search_space + + +_in_buf = BytesIO() +_out_buf = BytesIO() + + +def _reverse_io(): + _in_buf.seek(0) + _out_buf.seek(0) + nni.protocol._out_file = _in_buf + nni.protocol._in_file = _out_buf + + +def _restore_io(): + _in_buf.seek(0) + _out_buf.seek(0) + nni.protocol._in_file = _in_buf + nni.protocol._out_file = _out_buf + + +class MsgDispatcherTestCase(TestCase): + def test_msg_dispatcher(self): + 
_reverse_io() # now we are sending to Tuner's incoming stream + send(CommandType.RequestTrialJobs, '2') + send(CommandType.ReportMetricData, '{"parameter_id":0,"type":"PERIODICAL","value":10}') + send(CommandType.ReportMetricData, '{"parameter_id":1,"type":"FINAL","value":11}') + send(CommandType.UpdateSearchSpace, '{"name":"SS0"}') + send(CommandType.AddCustomizedTrialJob, '{"param":-1}') + send(CommandType.ReportMetricData, '{"parameter_id":2,"type":"FINAL","value":22}') + send(CommandType.RequestTrialJobs, '1') + send(CommandType.KillTrialJob, 'null') + _restore_io() + + tuner = NaiveTuner() + dispatcher = MsgDispatcher(tuner) + nni.msg_dispatcher_base._worker_fast_exit_on_terminate = False + + dispatcher.run() + e = dispatcher.worker_exceptions[0] + self.assertIs(type(e), AssertionError) + self.assertEqual(e.args[0], 'Unsupported command: CommandType.KillTrialJob') + + _reverse_io() # now we are receiving from Tuner's outgoing stream + self._assert_params(0, 2, [], None) + self._assert_params(1, 4, [], None) + + command, data = receive() # this one is customized + data = json.loads(data) + self.assertIs(command, CommandType.NewTrialJob) + self.assertEqual(data['parameter_id'], 2) + self.assertEqual(data['parameter_source'], 'customized') + self.assertEqual(data['parameters'], {'param': -1}) + + self._assert_params(3, 6, [[1, 4, 11, False], [2, -1, 22, True]], {'name': 'SS0'}) + + self.assertEqual(len(_out_buf.read()), 0) # no more commands + + def _assert_params(self, parameter_id, param, trial_results, search_space): + command, data = receive() + self.assertIs(command, CommandType.NewTrialJob) + data = json.loads(data) + self.assertEqual(data['parameter_id'], parameter_id) + self.assertEqual(data['parameter_source'], 'algorithm') + self.assertEqual(data['parameters']['param'], param) + self.assertEqual(data['parameters']['trial_results'], trial_results) + self.assertEqual(data['parameters']['search_space'], search_space) + + +if __name__ == '__main__': + 
main() diff --git a/src/sdk/pynni/tests/test_tuner.py b/src/sdk/pynni/tests/test_tuner.py index 57ee6ac70c..04d9f3aaba 100644 --- a/src/sdk/pynni/tests/test_tuner.py +++ b/src/sdk/pynni/tests/test_tuner.py @@ -17,107 +17,184 @@ # DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT # OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. # ================================================================================================== - - -import nni.protocol -from nni.protocol import CommandType, send, receive -from nni.tuner import Tuner -from nni.msg_dispatcher import MsgDispatcher -from nni.utils import extract_scalar_reward -from io import BytesIO +import glob import json +import logging +import os +import shutil +import sys from unittest import TestCase, main +from nni.batch_tuner.batch_tuner import BatchTuner +from nni.evolution_tuner.evolution_tuner import EvolutionTuner +from nni.gp_tuner.gp_tuner import GPTuner +from nni.gridsearch_tuner.gridsearch_tuner import GridSearchTuner +from nni.hyperopt_tuner.hyperopt_tuner import HyperoptTuner +from nni.metis_tuner.metis_tuner import MetisTuner +try: + from nni.smac_tuner.smac_tuner import SMACTuner +except ImportError: + assert sys.platform == "win32" +from nni.tuner import Tuner -class NaiveTuner(Tuner): - def __init__(self): - self.param = 0 - self.trial_results = [] - self.search_space = None - self._accept_customized_trials() - - def generate_parameters(self, parameter_id, **kwargs): - # report Tuner's internal states to generated parameters, - # so we don't need to pause the main loop - self.param += 2 - return { - 'param': self.param, - 'trial_results': self.trial_results, - 'search_space': self.search_space - } - - def receive_trial_result(self, parameter_id, parameters, value, customized, **kwargs): - reward = extract_scalar_reward(value) - self.trial_results.append((parameter_id, parameters['param'], reward, customized)) - - 
def update_search_space(self, search_space): - self.search_space = search_space - - -_in_buf = BytesIO() -_out_buf = BytesIO() - - -def _reverse_io(): - _in_buf.seek(0) - _out_buf.seek(0) - nni.protocol._out_file = _in_buf - nni.protocol._in_file = _out_buf - - -def _restore_io(): - _in_buf.seek(0) - _out_buf.seek(0) - nni.protocol._in_file = _in_buf - nni.protocol._out_file = _out_buf +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger('test_tuner') class TunerTestCase(TestCase): - def test_tuner(self): - _reverse_io() # now we are sending to Tuner's incoming stream - send(CommandType.RequestTrialJobs, '2') - send(CommandType.ReportMetricData, '{"parameter_id":0,"type":"PERIODICAL","value":10}') - send(CommandType.ReportMetricData, '{"parameter_id":1,"type":"FINAL","value":11}') - send(CommandType.UpdateSearchSpace, '{"name":"SS0"}') - send(CommandType.AddCustomizedTrialJob, '{"param":-1}') - send(CommandType.ReportMetricData, '{"parameter_id":2,"type":"FINAL","value":22}') - send(CommandType.RequestTrialJobs, '1') - send(CommandType.KillTrialJob, 'null') - _restore_io() - - tuner = NaiveTuner() - dispatcher = MsgDispatcher(tuner) - nni.msg_dispatcher_base._worker_fast_exit_on_terminate = False - - dispatcher.run() - e = dispatcher.worker_exceptions[0] - self.assertIs(type(e), AssertionError) - self.assertEqual(e.args[0], 'Unsupported command: CommandType.KillTrialJob') - - _reverse_io() # now we are receiving from Tuner's outgoing stream - self._assert_params(0, 2, [], None) - self._assert_params(1, 4, [], None) - - command, data = receive() # this one is customized - data = json.loads(data) - self.assertIs(command, CommandType.NewTrialJob) - self.assertEqual(data['parameter_id'], 2) - self.assertEqual(data['parameter_source'], 'customized') - self.assertEqual(data['parameters'], {'param': -1}) - - self._assert_params(3, 6, [[1, 4, 11, False], [2, -1, 22, True]], {'name': 'SS0'}) - - self.assertEqual(len(_out_buf.read()), 0) # no more commands - 
- def _assert_params(self, parameter_id, param, trial_results, search_space): - command, data = receive() - self.assertIs(command, CommandType.NewTrialJob) - data = json.loads(data) - self.assertEqual(data['parameter_id'], parameter_id) - self.assertEqual(data['parameter_source'], 'algorithm') - self.assertEqual(data['parameters']['param'], param) - self.assertEqual(data['parameters']['trial_results'], trial_results) - self.assertEqual(data['parameters']['search_space'], search_space) + """ + Targeted at testing functions of built-in tuners, including + - [ ] load_checkpoint + - [ ] save_checkpoint + - [X] update_search_space + - [X] generate_multiple_parameters + - [ ] import_data + - [ ] trial_end + - [ ] receive_trial_result + """ + + def search_space_test_one(self, tuner_factory, search_space): + tuner = tuner_factory() + self.assertIsInstance(tuner, Tuner) + tuner.update_search_space(search_space) + + parameters = tuner.generate_multiple_parameters(list(range(0, 50))) + logger.info(parameters) + self.check_range(parameters, search_space) + if not parameters: # TODO: not strict + raise ValueError("No parameters generated") + return parameters + + def check_range(self, generated_params, search_space): + EPS = 1E-6 + for param in generated_params: + if self._testMethodName == "test_batch": + param = {list(search_space.keys())[0]: param} + for k, v in param.items(): + if k.startswith("_mutable_layer"): + _, block, layer, choice = k.split("/") + cand = search_space[block]["_value"][layer].get(choice) + # cand could be None, e.g., optional_inputs_chosen_state + if choice == "layer_choice": + self.assertIn(v, cand) + if choice == "optional_input_size": + if isinstance(cand, int): + self.assertEqual(v, cand) + else: + self.assertGreaterEqual(v, cand[0]) + self.assertLessEqual(v, cand[1]) + if choice == "optional_inputs": + pass # ignore for now + continue + item = search_space[k] + if item["_type"] == "choice": + self.assertIn(v, item["_value"]) + if item["_type"] == 
"randint": + self.assertIsInstance(v, int) + if item["_type"] == "uniform": + self.assertIsInstance(v, float) + if item["_type"] in ("randint", "uniform", "quniform", "loguniform", "qloguniform"): + self.assertGreaterEqual(v, item["_value"][0]) + self.assertLessEqual(v, item["_value"][1]) + if item["_type"].startswith("q"): + multiple = v / item["_value"][2] + print(k, v, multiple, item) + if item["_value"][0] + EPS < v < item["_value"][1] - EPS: + self.assertAlmostEqual(int(round(multiple)), multiple) + if item["_type"] in ("qlognormal", "lognormal"): + self.assertGreaterEqual(v, 0) + if item["_type"] == "mutable_layer": + for layer_name in item["_value"].keys(): + self.assertIn(v[layer_name]["chosen_layer"], item["layer_choice"]) + + def search_space_test_all(self, tuner_factory, supported_types=None, ignore_types=None): + # NOTE(yuge): ignore types + # Supported types are listed in the table. They are meant to be supported and should be correct. + # Other than those, all the rest are "unsupported", which are expected to produce ridiculous results + # or throw some exceptions. However, there are certain types I can't check. For example, generate + # "normal" using GP Tuner returns successfully and results are fine if we check the range (-inf to +inf), + # but they make no sense: it's not a normal distribution. So they are ignored in tests for now. 
+ with open(os.path.join(os.path.dirname(__file__), "assets/search_space.json"), "r") as fp: + search_space_all = json.load(fp) + if supported_types is None: + supported_types = ["choice", "randint", "uniform", "quniform", "loguniform", "qloguniform", + "normal", "qnormal", "lognormal", "qlognormal"] + full_supported_search_space = dict() + for single in search_space_all: + single_keyword = single.split("_") + space = search_space_all[single] + expected_fail = not any([t in single_keyword for t in supported_types]) or "fail" in single_keyword + if ignore_types is not None and any([t in ignore_types for t in single_keyword]): + continue + if "fail" in space: + if self._testMethodName.split("_", 1)[1] in space.pop("fail"): + expected_fail = True + single_search_space = {single: space} + if not expected_fail: + # supports this key + self.search_space_test_one(tuner_factory, single_search_space) + full_supported_search_space.update(single_search_space) + else: + # unsupported key + with self.assertRaises(Exception, msg="Testing {}".format(single)) as cm: + self.search_space_test_one(tuner_factory, single_search_space) + logger.info("%s %s %s", tuner_factory, single, cm.exception) + if not any(t in self._testMethodName for t in ["batch", "grid_search"]): + # grid search fails for too many combinations + logger.info("Full supported search space: %s", full_supported_search_space) + self.search_space_test_one(tuner_factory, full_supported_search_space) + + def test_grid_search(self): + self.search_space_test_all(lambda: GridSearchTuner(), + supported_types=["choice", "randint", "quniform"]) + + def test_tpe(self): + self.search_space_test_all(lambda: HyperoptTuner("tpe")) + + def test_random_search(self): + self.search_space_test_all(lambda: HyperoptTuner("random_search")) + + def test_anneal(self): + self.search_space_test_all(lambda: HyperoptTuner("anneal")) + + def test_smac(self): + if sys.platform == "win32": + return # smac doesn't work on windows + 
self.search_space_test_all(lambda: SMACTuner(), + supported_types=["choice", "randint", "uniform", "quniform", "loguniform"]) + + def test_batch(self): + self.search_space_test_all(lambda: BatchTuner(), + supported_types=["choice"]) + + def test_evolution(self): + # Needs enough population size, otherwise it will throw a runtime error + self.search_space_test_all(lambda: EvolutionTuner(population_size=100)) + + def test_gp(self): + self.search_space_test_all(lambda: GPTuner(), + supported_types=["choice", "randint", "uniform", "quniform", "loguniform", + "qloguniform"], + ignore_types=["normal", "lognormal", "qnormal", "qlognormal"]) + + def test_metis(self): + self.search_space_test_all(lambda: MetisTuner(), + supported_types=["choice", "randint", "uniform", "quniform"]) + + def test_networkmorphism(self): + pass + + def test_ppo(self): + pass + + def tearDown(self): + file_list = glob.glob("smac3*") + ["param_config_space.pcs", "scenario.txt", "model_path"] + for file in file_list: + if os.path.exists(file): + if os.path.isdir(file): + shutil.rmtree(file) + else: + os.remove(file) if __name__ == '__main__': diff --git a/test/metrics_test.py b/test/metrics_test.py index 2a619510d1..1f8f0e8893 100644 --- a/test/metrics_test.py +++ b/test/metrics_test.py @@ -50,6 +50,8 @@ def run_test(): if status == 'DONE': num_succeeded = get_succeeded_trial_num(TRIAL_JOBS_URL) print_stderr(TRIAL_JOBS_URL) + if sys.platform == "win32": + time.sleep(sleep_interval) # Windows seems to have some issues on updating in time assert num_succeeded == max_trial_num, 'only %d succeeded trial jobs, there should be %d' % (num_succeeded, max_trial_num) check_metrics() break diff --git a/test/unittest.sh b/test/unittest.sh index a94beb6017..359f8a31f3 100755 --- a/test/unittest.sh +++ b/test/unittest.sh @@ -20,9 +20,7 @@ echo "===========================Testing: nni_sdk===========================" cd ${CWD}/../src/sdk/pynni/ python3 -m unittest discover -v tests - - -# -------------For 
typescrip unittest------------- +# -------------For typescript unittest------------- cd ${CWD}/../src/nni_manager echo "" echo "===========================Testing: nni_manager===========================" From eea50784c4facbedd15345796973e2281e59bdf7 Mon Sep 17 00:00:00 2001 From: chicm-ms <38930155+chicm-ms@users.noreply.github.com> Date: Mon, 4 Nov 2019 14:27:03 +0800 Subject: [PATCH 03/11] Dev pylint (#1697) Fix pylint errors --- azure-pipelines.yml | 23 +++- src/sdk/pycli/setup.py | 20 ++-- .../nni/compression/tensorflow/compressor.py | 2 +- .../pynni/nni/compression/torch/compressor.py | 2 +- src/sdk/pynni/nni/ppo_tuner/distri.py | 6 +- src/sdk/pynni/nni/ppo_tuner/policy.py | 2 +- src/sdk/pynni/nni/ppo_tuner/ppo_tuner.py | 15 +-- src/sdk/pynni/nni/ppo_tuner/util.py | 2 +- src/sdk/pynni/nni/smac_tuner/__init__.py | 2 +- src/sdk/pynni/nni/smac_tuner/smac_tuner.py | 1 - tools/nni_cmd/command_utils.py | 2 +- tools/nni_cmd/common_utils.py | 10 +- tools/nni_cmd/config_schema.py | 104 +++++++++--------- tools/nni_cmd/config_utils.py | 31 +++--- tools/nni_cmd/constants.py | 5 +- tools/nni_cmd/launcher.py | 46 ++++---- tools/nni_cmd/launcher_utils.py | 20 ++-- tools/nni_cmd/nnictl.py | 16 ++- tools/nni_cmd/nnictl_utils.py | 66 ++++++----- tools/nni_cmd/package_management.py | 6 +- tools/nni_cmd/ssh_utils.py | 6 +- tools/nni_cmd/tensorboard_utils.py | 25 ++--- tools/nni_cmd/updater.py | 2 +- tools/nni_cmd/url_utils.py | 6 +- tools/nni_gpu_tool/gpu_metrics_collector.py | 21 ++-- tools/nni_trial_tool/hdfsClientUtility.py | 4 +- tools/nni_trial_tool/log_utils.py | 16 ++- tools/nni_trial_tool/rest_utils.py | 1 - .../test/test_hdfsClientUtility.py | 14 ++- tools/nni_trial_tool/trial_keeper.py | 64 ++++++----- 30 files changed, 287 insertions(+), 253 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 33ec934627..7b8e6f626c 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -8,16 +8,33 @@ jobs: PYTHON_VERSION: '3.6' steps: - - script: python3 
-m pip install --upgrade pip setuptools --user + - script: | + python3 -m pip install --upgrade pip setuptools --user + python3 -m pip install pylint==2.3.1 astroid==2.2.5 --user + python3 -m pip install coverage --user displayName: 'Install python tools' + - script: | + source install.sh + displayName: 'Install nni toolkit via source code' - script: | python3 -m pip install torch==0.4.1 --user python3 -m pip install torchvision==0.2.1 --user python3 -m pip install tensorflow==1.13.1 --user + python3 -m pip install keras==2.1.6 --user + python3 -m pip install gym onnx --user + sudo apt-get install swig -y + PATH=$HOME/.local/bin:$PATH nnictl package install --name=SMAC + PATH=$HOME/.local/bin:$PATH nnictl package install --name=BOHB displayName: 'Install dependencies' - script: | - source install.sh - displayName: 'Install nni toolkit via source code' + set -e + python3 -m pylint --rcfile pylintrc nni_annotation + python3 -m pylint --rcfile pylintrc nni_cmd + python3 -m pylint --rcfile pylintrc nni_gpu_tool + python3 -m pylint --rcfile pylintrc nni_trial_tool + python3 -m pylint --rcfile pylintrc nni + python3 -m pylint --rcfile pylintrc nnicli + displayName: 'Run pylint' - script: | python3 -m pip install flake8 --user IGNORE=./tools/nni_annotation/testcase/*:F821,./examples/trials/mnist-nas/*/mnist*.py:F821,./examples/trials/nas_cifar10/src/cifar10/general_child.py:F821 diff --git a/src/sdk/pycli/setup.py b/src/sdk/pycli/setup.py index 95a6ff2ff1..b85ef7bca8 100644 --- a/src/sdk/pycli/setup.py +++ b/src/sdk/pycli/setup.py @@ -1,18 +1,18 @@ import setuptools setuptools.setup( - name = 'nnicli', - version = '999.0.0-developing', - packages = setuptools.find_packages(), + name='nnicli', + version='999.0.0-developing', + packages=setuptools.find_packages(), - python_requires = '>=3.5', - install_requires = [ + python_requires='>=3.5', + install_requires=[ 'requests' ], - author = 'Microsoft NNI Team', - author_email = 'nni@microsoft.com', - description = 'nnicli for 
Neural Network Intelligence project', - license = 'MIT', - url = 'https://github.com/Microsoft/nni', + author='Microsoft NNI Team', + author_email='nni@microsoft.com', + description='nnicli for Neural Network Intelligence project', + license='MIT', + url='https://github.com/Microsoft/nni', ) diff --git a/src/sdk/pynni/nni/compression/tensorflow/compressor.py b/src/sdk/pynni/nni/compression/tensorflow/compressor.py index c46e883342..6382c25a8a 100644 --- a/src/sdk/pynni/nni/compression/tensorflow/compressor.py +++ b/src/sdk/pynni/nni/compression/tensorflow/compressor.py @@ -80,7 +80,7 @@ def select_config(self, layer): Returns ------- ret : config or None - the retrieved configuration for this layer, if None, this layer should + the retrieved configuration for this layer, if None, this layer should not be compressed """ ret = None diff --git a/src/sdk/pynni/nni/compression/torch/compressor.py b/src/sdk/pynni/nni/compression/torch/compressor.py index 580b1c1fac..bb9e76e0da 100644 --- a/src/sdk/pynni/nni/compression/torch/compressor.py +++ b/src/sdk/pynni/nni/compression/torch/compressor.py @@ -73,7 +73,7 @@ def select_config(self, layer): Returns ------- ret : config or None - the retrieved configuration for this layer, if None, this layer should + the retrieved configuration for this layer, if None, this layer should not be compressed """ ret = None diff --git a/src/sdk/pynni/nni/ppo_tuner/distri.py b/src/sdk/pynni/nni/ppo_tuner/distri.py index 4666acc2da..5f00843b3e 100644 --- a/src/sdk/pynni/nni/ppo_tuner/distri.py +++ b/src/sdk/pynni/nni/ppo_tuner/distri.py @@ -143,14 +143,14 @@ def sample(self): re_masked_res = tf.reshape(masked_res, [-1, self.size]) u = tf.random_uniform(tf.shape(re_masked_res), dtype=self.logits.dtype) - return tf.argmax(re_masked_res - tf.log(-tf.log(u)), axis=-1) + return tf.argmax(re_masked_res - tf.log(-1*tf.log(u)), axis=-1) else: u = tf.random_uniform(tf.shape(self.logits), dtype=self.logits.dtype) - return tf.argmax(self.logits - 
tf.log(-tf.log(u)), axis=-1) + return tf.argmax(self.logits - tf.log(-1*tf.log(u)), axis=-1) @classmethod def fromflat(cls, flat): - return cls(flat) + return cls(flat) # pylint: disable=no-value-for-parameter class CategoricalPdType(PdType): """ diff --git a/src/sdk/pynni/nni/ppo_tuner/policy.py b/src/sdk/pynni/nni/ppo_tuner/policy.py index 65e2db414e..980959a49e 100644 --- a/src/sdk/pynni/nni/ppo_tuner/policy.py +++ b/src/sdk/pynni/nni/ppo_tuner/policy.py @@ -107,7 +107,7 @@ def _build_model_for_step(self): def sample(logits, mask_npinf): new_logits = tf.math.add(logits, mask_npinf) u = tf.random_uniform(tf.shape(new_logits), dtype=logits.dtype) - return tf.argmax(new_logits - tf.log(-tf.log(u)), axis=-1) + return tf.argmax(new_logits - tf.log(-1*tf.log(u)), axis=-1) def neglogp(logits, x): # return tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=x) diff --git a/src/sdk/pynni/nni/ppo_tuner/ppo_tuner.py b/src/sdk/pynni/nni/ppo_tuner/ppo_tuner.py index 1bc86ae750..4b5009d45d 100644 --- a/src/sdk/pynni/nni/ppo_tuner/ppo_tuner.py +++ b/src/sdk/pynni/nni/ppo_tuner/ppo_tuner.py @@ -22,11 +22,9 @@ class PPOTuner """ -import os import copy import logging import numpy as np -import json_tricks from gym import spaces import nni @@ -236,7 +234,8 @@ def compute_rewards(self, trials_info, trials_result): nextnonterminal = 1.0 - trials_info.dones[t+1] nextvalues = trials_info.values[t+1] delta = mb_rewards[t] + self.model_config.gamma * nextvalues * nextnonterminal - trials_info.values[t] - mb_advs[t] = lastgaelam = delta + self.model_config.gamma * self.model_config.lam * nextnonterminal * lastgaelam + lastgaelam = delta + self.model_config.gamma * self.model_config.lam * nextnonterminal * lastgaelam + mb_advs[t] = lastgaelam # pylint: disable=unsupported-assignment-operation mb_returns = mb_advs + trials_info.values trials_info.update_rewards(mb_rewards, mb_returns) @@ -536,8 +535,10 @@ def _next_round_inference(self): # generate new trials 
self.trials_result = [None for _ in range(self.inf_batch_size)] mb_obs, mb_actions, mb_values, mb_neglogpacs, mb_dones, last_values = self.model.inference(self.inf_batch_size) - self.trials_info = TrialsInfo(mb_obs, mb_actions, mb_values, mb_neglogpacs, - mb_dones, last_values, self.inf_batch_size) + self.trials_info = TrialsInfo(mb_obs, mb_actions, + mb_values, mb_neglogpacs, + mb_dones, last_values, + self.inf_batch_size) # check credit and submit new trials for _ in range(self.credit): trial_info_idx, actions = self.trials_info.get_next() @@ -581,8 +582,8 @@ def trial_end(self, parameter_id, success, **kwargs): assert trial_info_idx is not None # use mean of finished trials as the result of this failed trial values = [val for val in self.trials_result if val is not None] - logger.warning('zql values: {0}'.format(values)) - self.trials_result[trial_info_idx] = (sum(values) / len(values)) if len(values) > 0 else 0 + logger.warning('zql values: %s', values) + self.trials_result[trial_info_idx] = (sum(values) / len(values)) if values else 0 self.finished_trials += 1 if self.finished_trials == self.inf_batch_size: self._next_round_inference() diff --git a/src/sdk/pynni/nni/ppo_tuner/util.py b/src/sdk/pynni/nni/ppo_tuner/util.py index ac958e54de..acf704accc 100644 --- a/src/sdk/pynni/nni/ppo_tuner/util.py +++ b/src/sdk/pynni/nni/ppo_tuner/util.py @@ -56,7 +56,7 @@ def seq_to_batch(h, flat=False): def lstm(xs, ms, s, scope, nh, init_scale=1.0): """lstm cell""" - nbatch, nin = [v.value for v in xs[0].get_shape()] + _, nin = [v.value for v in xs[0].get_shape()] # the first is nbatch with tf.variable_scope(scope): wx = tf.get_variable("wx", [nin, nh*4], initializer=ortho_init(init_scale)) wh = tf.get_variable("wh", [nh, nh*4], initializer=ortho_init(init_scale)) diff --git a/src/sdk/pynni/nni/smac_tuner/__init__.py b/src/sdk/pynni/nni/smac_tuner/__init__.py index 911fe59794..ca8bdc8b8e 100644 --- a/src/sdk/pynni/nni/smac_tuner/__init__.py +++ 
b/src/sdk/pynni/nni/smac_tuner/__init__.py @@ -1 +1 @@ -from .smac_tuner import SMACTuner \ No newline at end of file +from .smac_tuner import SMACTuner diff --git a/src/sdk/pynni/nni/smac_tuner/smac_tuner.py b/src/sdk/pynni/nni/smac_tuner/smac_tuner.py index fccf8c230e..4e2f876b9e 100644 --- a/src/sdk/pynni/nni/smac_tuner/smac_tuner.py +++ b/src/sdk/pynni/nni/smac_tuner/smac_tuner.py @@ -39,7 +39,6 @@ from .convert_ss_to_scenario import generate_scenario - class SMACTuner(Tuner): """ Parameters diff --git a/tools/nni_cmd/command_utils.py b/tools/nni_cmd/command_utils.py index a3bcb81965..cf13f63eae 100644 --- a/tools/nni_cmd/command_utils.py +++ b/tools/nni_cmd/command_utils.py @@ -3,7 +3,7 @@ import os import signal import psutil -from .common_utils import print_error, print_normal, print_warning +from .common_utils import print_error def check_output_command(file_path, head=None, tail=None): diff --git a/tools/nni_cmd/common_utils.py b/tools/nni_cmd/common_utils.py index 3a5e909ca2..af0fe3efa6 100644 --- a/tools/nni_cmd/common_utils.py +++ b/tools/nni_cmd/common_utils.py @@ -21,10 +21,10 @@ import os import sys import json -import ruamel.yaml as yaml -import psutil import socket from pathlib import Path +import ruamel.yaml as yaml +import psutil from .constants import ERROR_INFO, NORMAL_INFO, WARNING_INFO, COLOR_RED_FORMAT, COLOR_YELLOW_FORMAT def get_yml_content(file_path): @@ -34,6 +34,7 @@ def get_yml_content(file_path): return yaml.load(file, Loader=yaml.Loader) except yaml.scanner.ScannerError as err: print_error('yaml file format error!') + print_error(err) exit(1) except Exception as exception: print_error(exception) @@ -46,6 +47,7 @@ def get_json_content(file_path): return json.load(file) except TypeError as err: print_error('json file format error!') + print_error(err) return None def print_error(content): @@ -70,7 +72,7 @@ def detect_process(pid): def detect_port(port): '''Detect if the port is used''' - socket_test = 
socket.socket(socket.AF_INET,socket.SOCK_STREAM) + socket_test = socket.socket(socket.AF_INET, socket.SOCK_STREAM) try: socket_test.connect(('127.0.0.1', int(port))) socket_test.close() @@ -79,7 +81,7 @@ def detect_port(port): return False def get_user(): - if sys.platform =='win32': + if sys.platform == 'win32': return os.environ['USERNAME'] else: return os.environ['USER'] diff --git a/tools/nni_cmd/config_schema.py b/tools/nni_cmd/config_schema.py index da943564fb..dded8d1e95 100644 --- a/tools/nni_cmd/config_schema.py +++ b/tools/nni_cmd/config_schema.py @@ -19,13 +19,13 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. import os -from schema import Schema, And, Use, Optional, Regex, Or +from schema import Schema, And, Optional, Regex, Or from .constants import SCHEMA_TYPE_ERROR, SCHEMA_RANGE_ERROR, SCHEMA_PATH_ERROR -def setType(key, type): +def setType(key, valueType): '''check key type''' - return And(type, error=SCHEMA_TYPE_ERROR % (key, type.__name__)) + return And(valueType, error=SCHEMA_TYPE_ERROR % (key, valueType.__name__)) def setChoice(key, *args): '''check choice''' @@ -47,7 +47,7 @@ def setPathCheck(key): 'experimentName': setType('experimentName', str), Optional('description'): setType('description', str), 'trialConcurrency': setNumberRange('trialConcurrency', int, 1, 99999), - Optional('maxExecDuration'): And(Regex(r'^[1-9][0-9]*[s|m|h|d]$',error='ERROR: maxExecDuration format is [digit]{s,m,h,d}')), + Optional('maxExecDuration'): And(Regex(r'^[1-9][0-9]*[s|m|h|d]$', error='ERROR: maxExecDuration format is [digit]{s,m,h,d}')), Optional('maxTrialNum'): setNumberRange('maxTrialNum', int, 1, 99999), 'trainingServicePlatform': setChoice('trainingServicePlatform', 'remote', 'local', 'pai', 'kubeflow', 'frameworkcontroller'), Optional('searchSpacePath'): And(os.path.exists, error=SCHEMA_PATH_ERROR % 'searchSpacePath'), @@ -106,7 +106,7 @@ def setPathCheck(key): 'builtinTunerName': 'NetworkMorphism', 
Optional('classArgs'): { Optional('optimize_mode'): setChoice('optimize_mode', 'maximize', 'minimize'), - Optional('task'): setChoice('task', 'cv','nlp','common'), + Optional('task'): setChoice('task', 'cv', 'nlp', 'common'), Optional('input_width'): setType('input_width', int), Optional('input_channel'): setType('input_channel', int), Optional('n_output_node'): setType('n_output_node', int), @@ -139,7 +139,7 @@ def setPathCheck(key): Optional('selection_num_warm_up'): setType('selection_num_warm_up', int), Optional('selection_num_starting_points'): setType('selection_num_starting_points', int), }, - Optional('includeIntermediateResults'): setType('includeIntermediateResults', bool), + Optional('includeIntermediateResults'): setType('includeIntermediateResults', bool), Optional('gpuIndices'): Or(int, And(str, lambda x: len([int(i) for i in x.split(',')]) > 0), error='gpuIndex format error!'), }, 'PPOTuner': { @@ -232,35 +232,35 @@ def setPathCheck(key): } common_trial_schema = { -'trial':{ - 'command': setType('command', str), - 'codeDir': setPathCheck('codeDir'), - Optional('gpuNum'): setNumberRange('gpuNum', int, 0, 99999), - Optional('nasMode'): setChoice('nasMode', 'classic_mode', 'enas_mode', 'oneshot_mode', 'darts_mode') + 'trial':{ + 'command': setType('command', str), + 'codeDir': setPathCheck('codeDir'), + Optional('gpuNum'): setNumberRange('gpuNum', int, 0, 99999), + Optional('nasMode'): setChoice('nasMode', 'classic_mode', 'enas_mode', 'oneshot_mode', 'darts_mode') } } pai_trial_schema = { -'trial':{ - 'command': setType('command', str), - 'codeDir': setPathCheck('codeDir'), - 'gpuNum': setNumberRange('gpuNum', int, 0, 99999), - 'cpuNum': setNumberRange('cpuNum', int, 0, 99999), - 'memoryMB': setType('memoryMB', int), - 'image': setType('image', str), - Optional('authFile'): And(os.path.exists, error=SCHEMA_PATH_ERROR % 'authFile'), - Optional('shmMB'): setType('shmMB', int), - Optional('dataDir'): 
And(Regex(r'hdfs://(([0-9]{1,3}.){3}[0-9]{1,3})(:[0-9]{2,5})?(/.*)?'),\ - error='ERROR: dataDir format error, dataDir format is hdfs://xxx.xxx.xxx.xxx:xxx'), - Optional('outputDir'): And(Regex(r'hdfs://(([0-9]{1,3}.){3}[0-9]{1,3})(:[0-9]{2,5})?(/.*)?'),\ - error='ERROR: outputDir format error, outputDir format is hdfs://xxx.xxx.xxx.xxx:xxx'), - Optional('virtualCluster'): setType('virtualCluster', str), - Optional('nasMode'): setChoice('nasMode', 'classic_mode', 'enas_mode', 'oneshot_mode', 'darts_mode'), - Optional('portList'): [{ - "label": setType('label', str), - "beginAt": setType('beginAt', int), - "portNumber": setType('portNumber', int) - }] + 'trial':{ + 'command': setType('command', str), + 'codeDir': setPathCheck('codeDir'), + 'gpuNum': setNumberRange('gpuNum', int, 0, 99999), + 'cpuNum': setNumberRange('cpuNum', int, 0, 99999), + 'memoryMB': setType('memoryMB', int), + 'image': setType('image', str), + Optional('authFile'): And(os.path.exists, error=SCHEMA_PATH_ERROR % 'authFile'), + Optional('shmMB'): setType('shmMB', int), + Optional('dataDir'): And(Regex(r'hdfs://(([0-9]{1,3}.){3}[0-9]{1,3})(:[0-9]{2,5})?(/.*)?'),\ + error='ERROR: dataDir format error, dataDir format is hdfs://xxx.xxx.xxx.xxx:xxx'), + Optional('outputDir'): And(Regex(r'hdfs://(([0-9]{1,3}.){3}[0-9]{1,3})(:[0-9]{2,5})?(/.*)?'),\ + error='ERROR: outputDir format error, outputDir format is hdfs://xxx.xxx.xxx.xxx:xxx'), + Optional('virtualCluster'): setType('virtualCluster', str), + Optional('nasMode'): setChoice('nasMode', 'classic_mode', 'enas_mode', 'oneshot_mode', 'darts_mode'), + Optional('portList'): [{ + "label": setType('label', str), + "beginAt": setType('beginAt', int), + "portNumber": setType('portNumber', int) + }] } } @@ -273,7 +273,7 @@ def setPathCheck(key): } kubeflow_trial_schema = { -'trial':{ + 'trial':{ 'codeDir': setPathCheck('codeDir'), Optional('nasMode'): setChoice('nasMode', 'classic_mode', 'enas_mode', 'oneshot_mode', 'darts_mode'), Optional('ps'): { @@ -315,7 
+315,7 @@ def setPathCheck(key): 'server': setType('server', str), 'path': setType('path', str) } - },{ + }, { 'operator': setChoice('operator', 'tf-operator', 'pytorch-operator'), 'apiVersion': setType('apiVersion', str), Optional('storage'): setChoice('storage', 'nfs', 'azureStorage'), @@ -363,7 +363,7 @@ def setPathCheck(key): 'server': setType('server', str), 'path': setType('path', str) } - },{ + }, { Optional('storage'): setChoice('storage', 'nfs', 'azureStorage'), Optional('serviceAccountName'): setType('serviceAccountName', str), 'keyVault': { @@ -383,24 +383,24 @@ def setPathCheck(key): } machine_list_schema = { -Optional('machineList'):[Or({ - 'ip': setType('ip', str), - Optional('port'): setNumberRange('port', int, 1, 65535), - 'username': setType('username', str), - 'passwd': setType('passwd', str), - Optional('gpuIndices'): Or(int, And(str, lambda x: len([int(i) for i in x.split(',')]) > 0), error='gpuIndex format error!'), - Optional('maxTrialNumPerGpu'): setType('maxTrialNumPerGpu', int), - Optional('useActiveGpu'): setType('useActiveGpu', bool) - },{ - 'ip': setType('ip', str), - Optional('port'): setNumberRange('port', int, 1, 65535), - 'username': setType('username', str), - 'sshKeyPath': setPathCheck('sshKeyPath'), - Optional('passphrase'): setType('passphrase', str), - Optional('gpuIndices'): Or(int, And(str, lambda x: len([int(i) for i in x.split(',')]) > 0), error='gpuIndex format error!'), - Optional('maxTrialNumPerGpu'): setType('maxTrialNumPerGpu', int), - Optional('useActiveGpu'): setType('useActiveGpu', bool) -})] + Optional('machineList'):[Or({ + 'ip': setType('ip', str), + Optional('port'): setNumberRange('port', int, 1, 65535), + 'username': setType('username', str), + 'passwd': setType('passwd', str), + Optional('gpuIndices'): Or(int, And(str, lambda x: len([int(i) for i in x.split(',')]) > 0), error='gpuIndex format error!'), + Optional('maxTrialNumPerGpu'): setType('maxTrialNumPerGpu', int), + Optional('useActiveGpu'): 
setType('useActiveGpu', bool) + }, { + 'ip': setType('ip', str), + Optional('port'): setNumberRange('port', int, 1, 65535), + 'username': setType('username', str), + 'sshKeyPath': setPathCheck('sshKeyPath'), + Optional('passphrase'): setType('passphrase', str), + Optional('gpuIndices'): Or(int, And(str, lambda x: len([int(i) for i in x.split(',')]) > 0), error='gpuIndex format error!'), + Optional('maxTrialNumPerGpu'): setType('maxTrialNumPerGpu', int), + Optional('useActiveGpu'): setType('useActiveGpu', bool) + })] } LOCAL_CONFIG_SCHEMA = Schema({**common_schema, **common_trial_schema}) diff --git a/tools/nni_cmd/config_utils.py b/tools/nni_cmd/config_utils.py index 6b2b8a0cc0..c7c88bcf3e 100644 --- a/tools/nni_cmd/config_utils.py +++ b/tools/nni_cmd/config_utils.py @@ -21,7 +21,6 @@ import os import json -import shutil from .constants import NNICTL_HOME_DIR class Config: @@ -73,29 +72,29 @@ def __init__(self): self.experiment_file = os.path.join(NNICTL_HOME_DIR, '.experiment') self.experiments = self.read_file() - def add_experiment(self, id, port, time, file_name, platform): + def add_experiment(self, expId, port, time, file_name, platform): '''set {key:value} paris to self.experiment''' - self.experiments[id] = {} - self.experiments[id]['port'] = port - self.experiments[id]['startTime'] = time - self.experiments[id]['endTime'] = 'N/A' - self.experiments[id]['status'] = 'INITIALIZED' - self.experiments[id]['fileName'] = file_name - self.experiments[id]['platform'] = platform + self.experiments[expId] = {} + self.experiments[expId]['port'] = port + self.experiments[expId]['startTime'] = time + self.experiments[expId]['endTime'] = 'N/A' + self.experiments[expId]['status'] = 'INITIALIZED' + self.experiments[expId]['fileName'] = file_name + self.experiments[expId]['platform'] = platform self.write_file() - def update_experiment(self, id, key, value): + def update_experiment(self, expId, key, value): '''Update experiment''' - if id not in self.experiments: + if expId 
not in self.experiments: return False - self.experiments[id][key] = value + self.experiments[expId][key] = value self.write_file() return True - def remove_experiment(self, id): + def remove_experiment(self, expId): '''remove an experiment by id''' if id in self.experiments: - self.experiments.pop(id) + self.experiments.pop(expId) self.write_file() def get_all_experiments(self): @@ -109,7 +108,7 @@ def write_file(self): json.dump(self.experiments, file) except IOError as error: print('Error:', error) - return + return '' def read_file(self): '''load config from local file''' @@ -119,4 +118,4 @@ def read_file(self): return json.load(file) except ValueError: return {} - return {} + return {} diff --git a/tools/nni_cmd/constants.py b/tools/nni_cmd/constants.py index d22a509c46..0777d2db98 100644 --- a/tools/nni_cmd/constants.py +++ b/tools/nni_cmd/constants.py @@ -21,7 +21,7 @@ import os from colorama import Fore -NNICTL_HOME_DIR = os.path.join(os.path.expanduser('~'), '.local', 'nnictl') +NNICTL_HOME_DIR = os.path.join(os.path.expanduser('~'), '.local', 'nnictl') ERROR_INFO = 'ERROR: %s' @@ -58,7 +58,8 @@ '-----------------------------------------------------------------------\n' EXPERIMENT_START_FAILED_INFO = 'There is an experiment running in the port %d, please stop it first or set another port!\n' \ - 'You could use \'nnictl stop --port [PORT]\' command to stop an experiment!\nOr you could use \'nnictl create --config [CONFIG_PATH] --port [PORT]\' to set port!\n' + 'You could use \'nnictl stop --port [PORT]\' command to stop an experiment!\nOr you could ' \ + 'use \'nnictl create --config [CONFIG_PATH] --port [PORT]\' to set port!\n' EXPERIMENT_INFORMATION_FORMAT = '----------------------------------------------------------------------------------------\n' \ ' Experiment information\n' \ diff --git a/tools/nni_cmd/launcher.py b/tools/nni_cmd/launcher.py index e2fac2cb42..f99f8dfe43 100644 --- a/tools/nni_cmd/launcher.py +++ b/tools/nni_cmd/launcher.py @@ -22,22 
+22,21 @@ import json import os import sys -import shutil import string -from subprocess import Popen, PIPE, call, check_output, check_call, CalledProcessError +import random +import site +import time import tempfile +from subprocess import Popen, check_call, CalledProcessError +from nni_annotation import expand_annotations, generate_search_space from nni.constants import ModuleName, AdvisorModuleName -from nni_annotation import * from .launcher_utils import validate_all_content -from .rest_utils import rest_put, rest_post, check_rest_server, check_rest_server_quick, check_response +from .rest_utils import rest_put, rest_post, check_rest_server, check_response from .url_utils import cluster_metadata_url, experiment_url, get_local_urls from .config_utils import Config, Experiments -from .common_utils import get_yml_content, get_json_content, print_error, print_normal, print_warning, detect_process, detect_port, get_user, get_python_dir -from .constants import * -import random -import site -import time -from pathlib import Path +from .common_utils import get_yml_content, get_json_content, print_error, print_normal, \ + detect_port, get_user, get_python_dir +from .constants import NNICTL_HOME_DIR, ERROR_INFO, REST_TIME_OUT, EXPERIMENT_SUCCESS_INFO, LOG_HEADER, PACKAGE_REQUIREMENTS from .command_utils import check_output_command, kill_command from .nnictl_utils import update_experiment @@ -83,7 +82,8 @@ def _generate_installation_path(sitepackages_path): python_dir = os.getenv('VIRTUAL_ENV') else: python_sitepackage = site.getsitepackages()[0] - # If system-wide python is used, we will give priority to using `local sitepackage`--"usersitepackages()" given that nni exists there + # If system-wide python is used, we will give priority to using `local sitepackage`--"usersitepackages()" given + # that nni exists there if python_sitepackage.startswith('/usr') or python_sitepackage.startswith('/Library'): python_dir = 
try_installation_path_sequentially(site.getusersitepackages(), site.getsitepackages()[0]) else: @@ -98,7 +98,6 @@ def _generate_installation_path(sitepackages_path): def start_rest_server(port, platform, mode, config_file_name, experiment_id=None, log_dir=None, log_level=None): '''Run nni manager process''' - nni_config = Config(config_file_name) if detect_port(port): print_error('Port %s is used by another process, please reset the port!\n' \ 'You could use \'nnictl create --help\' to get help information' % port) @@ -114,7 +113,7 @@ def start_rest_server(port, platform, mode, config_file_name, experiment_id=None entry_dir = get_nni_installation_path() entry_file = os.path.join(entry_dir, 'main.js') - + node_command = 'node' if sys.platform == 'win32': node_command = os.path.join(entry_dir[:-3], 'Scripts', 'node.exe') @@ -132,7 +131,7 @@ def start_rest_server(port, platform, mode, config_file_name, experiment_id=None cmds += ['--experiment_id', experiment_id] stdout_full_path, stderr_full_path = get_log_path(config_file_name) with open(stdout_full_path, 'a+') as stdout_file, open(stderr_full_path, 'a+') as stderr_file: - time_now = time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())) + time_now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) #add time information in the header of log files log_header = LOG_HEADER % str(time_now) stdout_file.write(log_header) @@ -212,7 +211,7 @@ def setNNIManagerIp(experiment_config, port, config_file_name): if experiment_config.get('nniManagerIp') is None: return True, None ip_config_dict = dict() - ip_config_dict['nni_manager_ip'] = { 'nniManagerIp' : experiment_config['nniManagerIp'] } + ip_config_dict['nni_manager_ip'] = {'nniManagerIp': experiment_config['nniManagerIp']} response = rest_put(cluster_metadata_url(port), json.dumps(ip_config_dict), REST_TIME_OUT) err_message = None if not response or not response.status_code == 200: @@ -403,11 +402,12 @@ def launch_experiment(args, experiment_config, 
mode, config_file_name, experimen stdout_full_path, stderr_full_path = get_log_path(config_file_name) with open(stdout_full_path, 'a+') as stdout_file, open(stderr_full_path, 'a+') as stderr_file: check_call([sys.executable, '-c', 'import %s'%(module_name)], stdout=stdout_file, stderr=stderr_file) - except CalledProcessError as e: + except CalledProcessError: print_error('some errors happen when import package %s.' %(package_name)) print_log_content(config_file_name) if package_name in PACKAGE_REQUIREMENTS: - print_error('If %s is not installed, it should be installed through \'nnictl package install --name %s\''%(package_name, package_name)) + print_error('If %s is not installed, it should be installed through '\ + '\'nnictl package install --name %s\''%(package_name, package_name)) exit(1) log_dir = experiment_config['logDir'] if experiment_config.get('logDir') else None log_level = experiment_config['logLevel'] if experiment_config.get('logLevel') else None @@ -416,7 +416,8 @@ def launch_experiment(args, experiment_config, mode, config_file_name, experimen if log_level not in ['trace', 'debug'] and (args.debug or experiment_config.get('debug') is True): log_level = 'debug' # start rest server - rest_process, start_time = start_rest_server(args.port, experiment_config['trainingServicePlatform'], mode, config_file_name, experiment_id, log_dir, log_level) + rest_process, start_time = start_rest_server(args.port, experiment_config['trainingServicePlatform'], \ + mode, config_file_name, experiment_id, log_dir, log_level) nni_config.set_config('restServerPid', rest_process.pid) # Deal with annotation if experiment_config.get('useAnnotation'): @@ -450,8 +451,9 @@ def launch_experiment(args, experiment_config, mode, config_file_name, experimen exit(1) if mode != 'view': # set platform configuration - set_platform_config(experiment_config['trainingServicePlatform'], experiment_config, args.port, config_file_name, rest_process) - + 
set_platform_config(experiment_config['trainingServicePlatform'], experiment_config, args.port,\ + config_file_name, rest_process) + # start a new experiment print_normal('Starting experiment...') # set debug configuration @@ -478,7 +480,8 @@ def launch_experiment(args, experiment_config, mode, config_file_name, experimen #save experiment information nnictl_experiment_config = Experiments() - nnictl_experiment_config.add_experiment(experiment_id, args.port, start_time, config_file_name, experiment_config['trainingServicePlatform']) + nnictl_experiment_config.add_experiment(experiment_id, args.port, start_time, config_file_name,\ + experiment_config['trainingServicePlatform']) print_normal(EXPERIMENT_SUCCESS_INFO % (experiment_id, ' '.join(web_ui_url_list))) @@ -503,7 +506,6 @@ def manage_stopped_experiment(args, mode): experiment_config = Experiments() experiment_dict = experiment_config.get_all_experiments() experiment_id = None - experiment_endTime = None #find the latest stopped experiment if not args.id: print_error('Please set experiment id! 
\nYou could use \'nnictl {0} {id}\' to {0} a stopped experiment!\n' \ diff --git a/tools/nni_cmd/launcher_utils.py b/tools/nni_cmd/launcher_utils.py index da6a668064..f6c849abab 100644 --- a/tools/nni_cmd/launcher_utils.py +++ b/tools/nni_cmd/launcher_utils.py @@ -20,11 +20,11 @@ import os import json -from .config_schema import LOCAL_CONFIG_SCHEMA, REMOTE_CONFIG_SCHEMA, PAI_CONFIG_SCHEMA, KUBEFLOW_CONFIG_SCHEMA, FRAMEWORKCONTROLLER_CONFIG_SCHEMA, \ -tuner_schema_dict, advisor_schema_dict, assessor_schema_dict -from schema import SchemaMissingKeyError, SchemaForbiddenKeyError, SchemaUnexpectedTypeError, SchemaWrongKeyError, SchemaError -from .common_utils import get_json_content, print_error, print_warning, print_normal -from schema import Schema, And, Use, Optional, Regex, Or +from schema import SchemaError +from schema import Schema +from .config_schema import LOCAL_CONFIG_SCHEMA, REMOTE_CONFIG_SCHEMA, PAI_CONFIG_SCHEMA, KUBEFLOW_CONFIG_SCHEMA,\ + FRAMEWORKCONTROLLER_CONFIG_SCHEMA, tuner_schema_dict, advisor_schema_dict, assessor_schema_dict +from .common_utils import print_error, print_warning, print_normal def expand_path(experiment_config, key): '''Change '~' to user home directory''' @@ -164,11 +164,11 @@ def validate_common_content(experiment_config): print_error('Please set correct trainingServicePlatform!') exit(1) schema_dict = { - 'local': LOCAL_CONFIG_SCHEMA, - 'remote': REMOTE_CONFIG_SCHEMA, - 'pai': PAI_CONFIG_SCHEMA, - 'kubeflow': KUBEFLOW_CONFIG_SCHEMA, - 'frameworkcontroller': FRAMEWORKCONTROLLER_CONFIG_SCHEMA + 'local': LOCAL_CONFIG_SCHEMA, + 'remote': REMOTE_CONFIG_SCHEMA, + 'pai': PAI_CONFIG_SCHEMA, + 'kubeflow': KUBEFLOW_CONFIG_SCHEMA, + 'frameworkcontroller': FRAMEWORKCONTROLLER_CONFIG_SCHEMA } separate_schema_dict = { 'tuner': tuner_schema_dict, diff --git a/tools/nni_cmd/nnictl.py b/tools/nni_cmd/nnictl.py index 8da30fdfb7..88ee311423 100644 --- a/tools/nni_cmd/nnictl.py +++ b/tools/nni_cmd/nnictl.py @@ -20,14 +20,18 @@ import argparse 
+import os import pkg_resources +from colorama import init +from .common_utils import print_error from .launcher import create_experiment, resume_experiment, view_experiment from .updater import update_searchspace, update_concurrency, update_duration, update_trialnum, import_data -from .nnictl_utils import * -from .package_management import * -from .constants import * -from .tensorboard_utils import * -from colorama import init +from .nnictl_utils import stop_experiment, trial_ls, trial_kill, list_experiment, experiment_status,\ + log_trial, experiment_clean, platform_clean, experiment_list, \ + monitor_experiment, export_trials_data, trial_codegen, webui_url, get_config, log_stdout, log_stderr +from .package_management import package_install, package_show +from .constants import DEFAULT_REST_PORT +from .tensorboard_utils import start_tensorboard, stop_tensorboard init(autoreset=True) if os.environ.get('COVERAGE_PROCESS_START'): @@ -38,7 +42,7 @@ def nni_info(*args): if args[0].version: try: print(pkg_resources.get_distribution('nni').version) - except pkg_resources.ResolutionError as err: + except pkg_resources.ResolutionError: print_error('Get version failed, please use `pip3 list | grep nni` to check nni version!') else: print('please run "nnictl {positional argument} --help" to see nnictl guidance') diff --git a/tools/nni_cmd/nnictl_utils.py b/tools/nni_cmd/nnictl_utils.py index b6fada56e8..4cadce182d 100644 --- a/tools/nni_cmd/nnictl_utils.py +++ b/tools/nni_cmd/nnictl_utils.py @@ -20,15 +20,13 @@ import csv import os -import psutil import json -from datetime import datetime, timezone import time import re -from pathlib import Path -from pyhdfs import HdfsClient, HdfsFileNotFoundException import shutil -from subprocess import call, check_output +from datetime import datetime, timezone +from pathlib import Path +from pyhdfs import HdfsClient from nni_annotation import expand_annotations from .rest_utils import rest_get, rest_delete, check_rest_server_quick, 
check_response from .url_utils import trial_jobs_url, experiment_url, trial_job_id_url, export_data_url @@ -102,7 +100,8 @@ def check_experiment_id(args, update=True): experiment_information = "" for key in running_experiment_list: experiment_information += (EXPERIMENT_DETAIL_FORMAT % (key, experiment_dict[key]['status'], \ - experiment_dict[key]['port'], experiment_dict[key].get('platform'), experiment_dict[key]['startTime'], experiment_dict[key]['endTime'])) + experiment_dict[key]['port'], experiment_dict[key].get('platform'), experiment_dict[key]['startTime'],\ + experiment_dict[key]['endTime'])) print(EXPERIMENT_INFORMATION_FORMAT % experiment_information) exit(1) elif not running_experiment_list: @@ -157,23 +156,24 @@ def parse_ids(args): experiment_information = "" for key in running_experiment_list: experiment_information += (EXPERIMENT_DETAIL_FORMAT % (key, experiment_dict[key]['status'], \ - experiment_dict[key]['port'], experiment_dict[key].get('platform'), experiment_dict[key]['startTime'], experiment_dict[key]['endTime'])) + experiment_dict[key]['port'], experiment_dict[key].get('platform'), experiment_dict[key]['startTime'], \ + experiment_dict[key]['endTime'])) print(EXPERIMENT_INFORMATION_FORMAT % experiment_information) exit(1) else: result_list = running_experiment_list elif args.id.endswith('*'): - for id in running_experiment_list: - if id.startswith(args.id[:-1]): - result_list.append(id) + for expId in running_experiment_list: + if expId.startswith(args.id[:-1]): + result_list.append(expId) elif args.id in running_experiment_list: result_list.append(args.id) else: - for id in running_experiment_list: - if id.startswith(args.id): - result_list.append(id) + for expId in running_experiment_list: + if expId.startswith(args.id): + result_list.append(expId) if len(result_list) > 1: - print_error(args.id + ' is ambiguous, please choose ' + ' '.join(result_list) ) + print_error(args.id + ' is ambiguous, please choose ' + ' '.join(result_list)) return 
None if not result_list and (args.id or args.port): print_error('There are no experiments matched, please set correct experiment id or restful server port') @@ -235,7 +235,6 @@ def stop_experiment(args): for experiment_id in experiment_id_list: print_normal('Stoping experiment %s' % experiment_id) nni_config = Config(experiment_dict[experiment_id]['fileName']) - rest_port = nni_config.get_config('restServerPort') rest_pid = nni_config.get_config('restServerPid') if rest_pid: kill_command(rest_pid) @@ -249,7 +248,7 @@ def stop_experiment(args): nni_config.set_config('tensorboardPidList', []) print_normal('Stop experiment success.') experiment_config.update_experiment(experiment_id, 'status', 'STOPPED') - time_now = time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())) + time_now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) experiment_config.update_experiment(experiment_id, 'endTime', str(time_now)) def trial_ls(args): @@ -401,9 +400,9 @@ def local_clean(directory): print_normal('removing folder {0}'.format(directory)) try: shutil.rmtree(directory) - except FileNotFoundError as err: + except FileNotFoundError: print_error('{0} does not exist.'.format(directory)) - + def remote_clean(machine_list, experiment_id=None): '''clean up remote data''' for machine in machine_list: @@ -418,7 +417,7 @@ def remote_clean(machine_list, experiment_id=None): sftp = create_ssh_sftp_client(host, port, userName, passwd) print_normal('removing folder {0}'.format(host + ':' + str(port) + remote_dir)) remove_remote_directory(sftp, remote_dir) - + def hdfs_clean(host, user_name, output_dir, experiment_id=None): '''clean up hdfs data''' hdfs_client = HdfsClient(hosts='{0}:80'.format(host), user_name=user_name, webhdfs_path='/webhdfs/api/v1', timeout=5) @@ -475,7 +474,7 @@ def experiment_clean(args): machine_list = nni_config.get_config('experimentConfig').get('machineList') remote_clean(machine_list, experiment_id) elif platform == 'pai': - host = 
nni_config.get_config('experimentConfig').get('paiConfig').get('host') + host = nni_config.get_config('experimentConfig').get('paiConfig').get('host') user_name = nni_config.get_config('experimentConfig').get('paiConfig').get('userName') output_dir = nni_config.get_config('experimentConfig').get('trial').get('outputDir') hdfs_clean(host, user_name, output_dir, experiment_id) @@ -492,7 +491,7 @@ def experiment_clean(args): experiment_config = Experiments() print_normal('removing metadata of experiment {0}'.format(experiment_id)) experiment_config.remove_experiment(experiment_id) - print_normal('Done.') + print_normal('Done.') def get_platform_dir(config_content): '''get the dir list to be deleted''' @@ -505,8 +504,7 @@ def get_platform_dir(config_content): port = machine.get('port') dir_list.append(host + ':' + str(port) + '/tmp/nni') elif platform == 'pai': - pai_config = config_content.get('paiConfig') - host = config_content.get('paiConfig').get('host') + host = config_content.get('paiConfig').get('host') user_name = config_content.get('paiConfig').get('userName') output_dir = config_content.get('trial').get('outputDir') dir_list.append('server: {0}, path: {1}/nni'.format(host, user_name)) @@ -529,17 +527,15 @@ def platform_clean(args): print_normal('platform {0} not supported.'.format(platform)) exit(0) update_experiment() - experiment_config = Experiments() - experiment_dict = experiment_config.get_all_experiments() - id_list = list(experiment_dict.keys()) dir_list = get_platform_dir(config_content) if not dir_list: print_normal('No folder of NNI caches is found.') exit(1) while True: - print_normal('This command will remove below folders of NNI caches. If other users are using experiments on below hosts, it will be broken.') - for dir in dir_list: - print(' ' + dir) + print_normal('This command will remove below folders of NNI caches. 
If other users are using experiments' \ + ' on below hosts, it will be broken.') + for value in dir_list: + print(' ' + value) inputs = input('INFO: do you want to continue?[y/N]:') if not inputs.lower() or inputs.lower() in ['n', 'no']: exit(0) @@ -549,11 +545,9 @@ def platform_clean(args): break if platform == 'remote': machine_list = config_content.get('machineList') - for machine in machine_list: - remote_clean(machine_list, None) + remote_clean(machine_list, None) elif platform == 'pai': - pai_config = config_content.get('paiConfig') - host = config_content.get('paiConfig').get('host') + host = config_content.get('paiConfig').get('host') user_name = config_content.get('paiConfig').get('userName') output_dir = config_content.get('trial').get('outputDir') hdfs_clean(host, user_name, output_dir, None) @@ -618,7 +612,8 @@ def show_experiment_info(): return for key in experiment_id_list: print(EXPERIMENT_MONITOR_INFO % (key, experiment_dict[key]['status'], experiment_dict[key]['port'], \ - experiment_dict[key].get('platform'), experiment_dict[key]['startTime'], get_time_interval(experiment_dict[key]['startTime'], experiment_dict[key]['endTime']))) + experiment_dict[key].get('platform'), experiment_dict[key]['startTime'], \ + get_time_interval(experiment_dict[key]['startTime'], experiment_dict[key]['endTime']))) print(TRIAL_MONITOR_HEAD) running, response = check_rest_server_quick(experiment_dict[key]['port']) if running: @@ -627,7 +622,8 @@ def show_experiment_info(): content = json.loads(response.text) for index, value in enumerate(content): content[index] = convert_time_stamp_to_date(value) - print(TRIAL_MONITOR_CONTENT % (content[index].get('id'), content[index].get('startTime'), content[index].get('endTime'), content[index].get('status'))) + print(TRIAL_MONITOR_CONTENT % (content[index].get('id'), content[index].get('startTime'), \ + content[index].get('endTime'), content[index].get('status'))) print(TRIAL_MONITOR_TAIL) def monitor_experiment(args): diff --git 
a/tools/nni_cmd/package_management.py b/tools/nni_cmd/package_management.py index de8dbe62ec..32ed79496d 100644 --- a/tools/nni_cmd/package_management.py +++ b/tools/nni_cmd/package_management.py @@ -18,12 +18,10 @@ # DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -import nni import os -import sys -from subprocess import call +import nni from .constants import PACKAGE_REQUIREMENTS -from .common_utils import print_normal, print_error +from .common_utils import print_error from .command_utils import install_requirements_command def process_install(package_name): diff --git a/tools/nni_cmd/ssh_utils.py b/tools/nni_cmd/ssh_utils.py index da707dac48..7453830323 100644 --- a/tools/nni_cmd/ssh_utils.py +++ b/tools/nni_cmd/ssh_utils.py @@ -20,7 +20,6 @@ import os from .common_utils import print_error -from subprocess import call from .command_utils import install_package_command def check_environment(): @@ -29,6 +28,8 @@ def check_environment(): import paramiko except: install_package_command('paramiko') + import paramiko + return paramiko def copy_remote_directory_to_local(sftp, remote_path, local_path): '''copy remote directory to local machine''' @@ -49,8 +50,7 @@ def copy_remote_directory_to_local(sftp, remote_path, local_path): def create_ssh_sftp_client(host_ip, port, username, password): '''create ssh client''' try: - check_environment() - import paramiko + paramiko = check_environment() conn = paramiko.Transport(host_ip, port) conn.connect(username=username, password=password) sftp = paramiko.SFTPClient.from_transport(conn) diff --git a/tools/nni_cmd/tensorboard_utils.py b/tools/nni_cmd/tensorboard_utils.py index b4578c34b0..9646b4de0e 100644 --- a/tools/nni_cmd/tensorboard_utils.py +++ b/tools/nni_cmd/tensorboard_utils.py @@ -19,21 +19,17 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 
SOFTWARE. import os -import psutil import json -import datetime -import time -from subprocess import call, check_output, Popen, PIPE -from .rest_utils import rest_get, rest_delete, check_rest_server_quick, check_response -from .config_utils import Config, Experiments -from .url_utils import trial_jobs_url, experiment_url, trial_job_id_url, get_local_urls -from .constants import NNICTL_HOME_DIR, EXPERIMENT_INFORMATION_FORMAT, EXPERIMENT_DETAIL_FORMAT, COLOR_GREEN_FORMAT -import time -from .common_utils import print_normal, print_error, print_warning, detect_process, detect_port -from .nnictl_utils import * import re -from .ssh_utils import create_ssh_sftp_client, copy_remote_directory_to_local import tempfile +from subprocess import call, Popen +from .rest_utils import rest_get, check_rest_server_quick, check_response +from .config_utils import Config, Experiments +from .url_utils import trial_jobs_url, get_local_urls +from .constants import COLOR_GREEN_FORMAT, REST_TIME_OUT +from .common_utils import print_normal, print_error, detect_process, detect_port +from .nnictl_utils import check_experiment_id, check_experiment_id +from .ssh_utils import create_ssh_sftp_client, copy_remote_directory_to_local def parse_log_path(args, trial_content): '''parse log path''' @@ -43,7 +39,7 @@ def parse_log_path(args, trial_content): if args.trial_id and args.trial_id != 'all' and trial.get('id') != args.trial_id: continue pattern = r'(?P.+)://(?P.+):(?P.*)' - match = re.search(pattern,trial['logPath']) + match = re.search(pattern, trial['logPath']) if match: path_list.append(match.group('path')) host_list.append(match.group('host')) @@ -94,7 +90,8 @@ def start_tensorboard_process(args, nni_config, path_list, temp_nni_path): if detect_port(args.port): print_error('Port %s is used by another process, please reset port!' 
% str(args.port)) exit(1) - with open(os.path.join(temp_nni_path, 'tensorboard_stdout'), 'a+') as stdout_file, open(os.path.join(temp_nni_path, 'tensorboard_stderr'), 'a+') as stderr_file: + with open(os.path.join(temp_nni_path, 'tensorboard_stdout'), 'a+') as stdout_file, \ + open(os.path.join(temp_nni_path, 'tensorboard_stderr'), 'a+') as stderr_file: cmds = ['tensorboard', '--logdir', format_tensorboard_log_path(path_list), '--port', str(args.port)] tensorboard_process = Popen(cmds, stdout=stdout_file, stderr=stderr_file) url_list = get_local_urls(args.port) diff --git a/tools/nni_cmd/updater.py b/tools/nni_cmd/updater.py index 9258d73f0a..07ae6123cb 100644 --- a/tools/nni_cmd/updater.py +++ b/tools/nni_cmd/updater.py @@ -25,7 +25,7 @@ from .url_utils import experiment_url, import_data_url from .config_utils import Config from .common_utils import get_json_content, print_normal, print_error, print_warning -from .nnictl_utils import check_experiment_id, get_experiment_port, get_config_filename +from .nnictl_utils import get_experiment_port, get_config_filename from .launcher_utils import parse_time from .constants import REST_TIME_OUT, TUNERS_SUPPORTING_IMPORT_DATA, TUNERS_NO_NEED_TO_IMPORT_DATA diff --git a/tools/nni_cmd/url_utils.py b/tools/nni_cmd/url_utils.py index c50b2551d2..05cfa8e66f 100644 --- a/tools/nni_cmd/url_utils.py +++ b/tools/nni_cmd/url_utils.py @@ -18,8 +18,8 @@ # DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+import socket import psutil -from socket import AddressFamily BASE_URL = 'http://localhost' @@ -83,8 +83,8 @@ def tensorboard_url(port): def get_local_urls(port): '''get urls of local machine''' url_list = [] - for name, info in psutil.net_if_addrs().items(): + for _, info in psutil.net_if_addrs().items(): for addr in info: - if AddressFamily.AF_INET == addr.family: + if socket.AddressFamily.AF_INET == addr.family: url_list.append('http://{}:{}'.format(addr.address, port)) return url_list diff --git a/tools/nni_gpu_tool/gpu_metrics_collector.py b/tools/nni_gpu_tool/gpu_metrics_collector.py index f58b9b2895..436e1edaaf 100644 --- a/tools/nni_gpu_tool/gpu_metrics_collector.py +++ b/tools/nni_gpu_tool/gpu_metrics_collector.py @@ -27,19 +27,20 @@ def check_ready_to_run(): if sys.platform == 'win32': - pgrep_output = subprocess.check_output('wmic process where "CommandLine like \'%nni_gpu_tool.gpu_metrics_collector%\' and name like \'%python%\'" get processId') + pgrep_output = subprocess.check_output( + 'wmic process where "CommandLine like \'%nni_gpu_tool.gpu_metrics_collector%\' and name like \'%python%\'" get processId') pidList = pgrep_output.decode("utf-8").strip().split() pidList.pop(0) # remove the key word 'ProcessId' pidList = list(map(int, pidList)) pidList.remove(os.getpid()) - return len(pidList) == 0 + return not pidList else: pgrep_output = subprocess.check_output('pgrep -fx \'python3 -m nni_gpu_tool.gpu_metrics_collector\'', shell=True) pidList = [] for pid in pgrep_output.splitlines(): pidList.append(int(pid)) pidList.remove(os.getpid()) - return len(pidList) == 0 + return not pidList def main(argv): metrics_output_dir = os.environ['METRIC_OUTPUT_DIR'] @@ -69,10 +70,14 @@ def parse_nvidia_smi_result(smi, outputDir): outPut["gpuCount"] = len(gpuList) outPut["gpuInfos"] = [] for gpuIndex, gpu in enumerate(gpuList): - gpuInfo ={} + gpuInfo = {} gpuInfo['index'] = gpuIndex - gpuInfo['gpuUtil'] = 
gpu.getElementsByTagName('utilization')[0].getElementsByTagName('gpu_util')[0].childNodes[0].data.replace("%", "").strip() - gpuInfo['gpuMemUtil'] = gpu.getElementsByTagName('utilization')[0].getElementsByTagName('memory_util')[0].childNodes[0].data.replace("%", "").strip() + gpuInfo['gpuUtil'] = gpu.getElementsByTagName('utilization')[0]\ + .getElementsByTagName('gpu_util')[0]\ + .childNodes[0].data.replace("%", "").strip() + gpuInfo['gpuMemUtil'] = gpu.getElementsByTagName('utilization')[0]\ + .getElementsByTagName('memory_util')[0]\ + .childNodes[0].data.replace("%", "").strip() processes = gpu.getElementsByTagName('processes') runningProNumber = len(processes[0].getElementsByTagName('process_info')) gpuInfo['activeProcessNum'] = runningProNumber @@ -81,8 +86,8 @@ def parse_nvidia_smi_result(smi, outputDir): print(outPut) outputFile.write("{}\n".format(json.dumps(outPut, sort_keys=True))) outputFile.flush(); - except : - e_info = sys.exc_info() + except: + # e_info = sys.exc_info() print('xmldoc paring error') finally: os.umask(old_umask) diff --git a/tools/nni_trial_tool/hdfsClientUtility.py b/tools/nni_trial_tool/hdfsClientUtility.py index c732d2507c..9369e77a41 100644 --- a/tools/nni_trial_tool/hdfsClientUtility.py +++ b/tools/nni_trial_tool/hdfsClientUtility.py @@ -20,7 +20,6 @@ import os import posixpath -from pyhdfs import HdfsClient from .log_utils import LogType, nni_log def copyHdfsDirectoryToLocal(hdfsDirectory, localDirectory, hdfsClient): @@ -79,7 +78,8 @@ def copyDirectoryToHdfs(localDirectory, hdfsDirectory, hdfsClient): try: result = result and copyDirectoryToHdfs(file_path, hdfs_directory, hdfsClient) except Exception as exception: - nni_log(LogType.Error, 'Copy local directory {0} to hdfs directory {1} error: {2}'.format(file_path, hdfs_directory, str(exception))) + nni_log(LogType.Error, + 'Copy local directory {0} to hdfs directory {1} error: {2}'.format(file_path, hdfs_directory, str(exception))) result = False else: hdfs_file_path = 
os.path.join(hdfsDirectory, file) diff --git a/tools/nni_trial_tool/log_utils.py b/tools/nni_trial_tool/log_utils.py index 1806b06d79..8b7c5b3dd5 100644 --- a/tools/nni_trial_tool/log_utils.py +++ b/tools/nni_trial_tool/log_utils.py @@ -33,8 +33,7 @@ from queue import Queue -from .rest_utils import rest_get, rest_post, rest_put, rest_delete -from .constants import NNI_EXP_ID, NNI_TRIAL_JOB_ID, STDOUT_API +from .rest_utils import rest_post from .url_utils import gen_send_stdout_url @unique @@ -73,7 +72,7 @@ def emit(self, record): log_entry['msg'] = self.format(record) try: - response = rest_post(gen_send_stdout_url(self.host, self.port), json.dumps(log_entry), 10, True) + rest_post(gen_send_stdout_url(self.host, self.port), json.dumps(log_entry), 10, True) except Exception as e: self.orig_stderr.write(str(e) + '\n') self.orig_stderr.flush() @@ -112,7 +111,7 @@ def write(self, buf): self.orig_stdout.flush() try: self.logger.log(self.log_level, line.rstrip()) - except Exception as e: + except Exception: pass class PipeLogReader(threading.Thread): @@ -147,15 +146,14 @@ def _populateQueue(stream, queue): line = self.queue.get(True, 5) try: self.logger.log(self.log_level, line.rstrip()) - except Exception as e: + except Exception: pass - except Exception as e: + except Exception: if cur_process_exit == True: self._is_read_completed = True break - self.pip_log_reader_thread = threading.Thread(target = _populateQueue, - args = (self.pipeReader, self.queue)) + self.pip_log_reader_thread = threading.Thread(target=_populateQueue, args=(self.pipeReader, self.queue)) self.pip_log_reader_thread.daemon = True self.start() self.pip_log_reader_thread.start() @@ -196,4 +194,4 @@ def is_read_completed(self): def set_process_exit(self): self.process_exit = True - return self.process_exit \ No newline at end of file + return self.process_exit diff --git a/tools/nni_trial_tool/rest_utils.py b/tools/nni_trial_tool/rest_utils.py index 71eb353614..9f6227acbb 100644 --- 
a/tools/nni_trial_tool/rest_utils.py +++ b/tools/nni_trial_tool/rest_utils.py @@ -19,7 +19,6 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -import time import requests def rest_get(url, timeout): diff --git a/tools/nni_trial_tool/test/test_hdfsClientUtility.py b/tools/nni_trial_tool/test/test_hdfsClientUtility.py index 4a54a893c9..68ffe79d8f 100644 --- a/tools/nni_trial_tool/test/test_hdfsClientUtility.py +++ b/tools/nni_trial_tool/test/test_hdfsClientUtility.py @@ -18,16 +18,17 @@ # DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +import os +import shutil +import random +import string import unittest import json import sys from pyhdfs import HdfsClient +from tools.nni_trial_tool.hdfsClientUtility import copyFileToHdfs, copyDirectoryToHdfs sys.path.append("..") -from trial.hdfsClientUtility import copyFileToHdfs, copyDirectoryToHdfs -import os -import shutil -import random -import string + class HDFSClientUtilityTest(unittest.TestCase): '''Unit test for hdfsClientUtility.py''' @@ -82,7 +83,8 @@ def test_copy_directory_run(self): with open('./{0}/{1}'.format(directory_name, file_name), 'w') as file: file.write(file_content) - result = copyDirectoryToHdfs('./{}'.format(directory_name), '/{0}/{1}'.format(self.hdfs_config['userName'], directory_name), self.hdfs_client) + result = copyDirectoryToHdfs('./{}'.format(directory_name), + '/{0}/{1}'.format(self.hdfs_config['userName'], directory_name), self.hdfs_client) self.assertTrue(result) directory_list = self.hdfs_client.listdir('/{0}'.format(self.hdfs_config['userName'])) diff --git a/tools/nni_trial_tool/trial_keeper.py b/tools/nni_trial_tool/trial_keeper.py index 23d9c4f1ab..2ce89cc192 100644 --- a/tools/nni_trial_tool/trial_keeper.py +++ b/tools/nni_trial_tool/trial_keeper.py @@ -18,32 +18,30 @@ # 
============================================================================================================================== # import argparse -import sys import os -from subprocess import Popen, PIPE +from subprocess import Popen import time import logging import shlex import re import sys -import select import json import threading from pyhdfs import HdfsClient import pkg_resources from .rest_utils import rest_post, rest_get -from .url_utils import gen_send_stdout_url, gen_send_version_url, gen_parameter_meta_url +from .url_utils import gen_send_version_url, gen_parameter_meta_url -from .constants import HOME_DIR, LOG_DIR, NNI_PLATFORM, STDOUT_FULL_PATH, STDERR_FULL_PATH, \ - MULTI_PHASE, NNI_TRIAL_JOB_ID, NNI_SYS_DIR, NNI_EXP_ID +from .constants import LOG_DIR, NNI_PLATFORM, MULTI_PHASE, NNI_TRIAL_JOB_ID, NNI_SYS_DIR, NNI_EXP_ID from .hdfsClientUtility import copyDirectoryToHdfs, copyHdfsDirectoryToLocal, copyHdfsFileToLocal -from .log_utils import LogType, nni_log, RemoteLogger, PipeLogReader, StdOutputType +from .log_utils import LogType, nni_log, RemoteLogger, StdOutputType logger = logging.getLogger('trial_keeper') regular = re.compile('v?(?P[0-9](\.[0-9]){0,1}).*') _hdfs_client = None + def get_hdfs_client(args): global _hdfs_client @@ -62,26 +60,29 @@ def get_hdfs_client(args): if hdfs_host is not None and args.nni_hdfs_exp_dir is not None: try: if args.webhdfs_path: - _hdfs_client = HdfsClient(hosts='{0}:80'.format(hdfs_host), user_name=args.pai_user_name, webhdfs_path=args.webhdfs_path, timeout=5) + _hdfs_client = HdfsClient(hosts='{0}:80'.format(hdfs_host), user_name=args.pai_user_name, + webhdfs_path=args.webhdfs_path, timeout=5) else: # backward compatibility - _hdfs_client = HdfsClient(hosts='{0}:{1}'.format(hdfs_host, '50070'), user_name=args.pai_user_name, timeout=5) + _hdfs_client = HdfsClient(hosts='{0}:{1}'.format(hdfs_host, '50070'), user_name=args.pai_user_name, + timeout=5) except Exception as e: nni_log(LogType.Error, 'Create HDFS client 
error: ' + str(e)) raise e return _hdfs_client + def main_loop(args): '''main loop logic for trial keeper''' if not os.path.exists(LOG_DIR): os.makedirs(LOG_DIR) - stdout_file = open(STDOUT_FULL_PATH, 'a+') - stderr_file = open(STDERR_FULL_PATH, 'a+') - trial_keeper_syslogger = RemoteLogger(args.nnimanager_ip, args.nnimanager_port, 'trial_keeper', StdOutputType.Stdout, args.log_collection) + trial_keeper_syslogger = RemoteLogger(args.nnimanager_ip, args.nnimanager_port, 'trial_keeper', + StdOutputType.Stdout, args.log_collection) # redirect trial keeper's stdout and stderr to syslog - trial_syslogger_stdout = RemoteLogger(args.nnimanager_ip, args.nnimanager_port, 'trial', StdOutputType.Stdout, args.log_collection) + trial_syslogger_stdout = RemoteLogger(args.nnimanager_ip, args.nnimanager_port, 'trial', StdOutputType.Stdout, + args.log_collection) sys.stdout = sys.stderr = trial_keeper_syslogger hdfs_output_dir = None @@ -97,8 +98,10 @@ def main_loop(args): # Notice: We don't appoint env, which means subprocess wil inherit current environment and that is expected behavior log_pipe_stdout = trial_syslogger_stdout.get_pipelog_reader() - process = Popen(args.trial_command, shell = True, stdout = log_pipe_stdout, stderr = log_pipe_stdout) - nni_log(LogType.Info, 'Trial keeper spawns a subprocess (pid {0}) to run command: {1}'.format(process.pid, shlex.split(args.trial_command))) + process = Popen(args.trial_command, shell=True, stdout=log_pipe_stdout, stderr=log_pipe_stdout) + nni_log(LogType.Info, 'Trial keeper spawns a subprocess (pid {0}) to run command: {1}'.format(process.pid, + shlex.split( + args.trial_command))) while True: retCode = process.poll() @@ -110,9 +113,11 @@ def main_loop(args): nni_local_output_dir = os.environ['NNI_OUTPUT_DIR'] try: if copyDirectoryToHdfs(nni_local_output_dir, hdfs_output_dir, hdfs_client): - nni_log(LogType.Info, 'copy directory from {0} to {1} success!'.format(nni_local_output_dir, hdfs_output_dir)) + nni_log(LogType.Info, + 
'copy directory from {0} to {1} success!'.format(nni_local_output_dir, hdfs_output_dir)) else: - nni_log(LogType.Info, 'copy directory from {0} to {1} failed!'.format(nni_local_output_dir, hdfs_output_dir)) + nni_log(LogType.Info, + 'copy directory from {0} to {1} failed!'.format(nni_local_output_dir, hdfs_output_dir)) except Exception as e: nni_log(LogType.Error, 'HDFS copy directory got exception: ' + str(e)) raise e @@ -123,14 +128,16 @@ def main_loop(args): time.sleep(2) + def trial_keeper_help_info(*args): print('please run --help to see guidance') + def check_version(args): try: trial_keeper_version = pkg_resources.get_distribution('nni').version except pkg_resources.ResolutionError as err: - #package nni does not exist, try nni-tool package + # package nni does not exist, try nni-tool package nni_log(LogType.Error, 'Package nni does not exist!') os._exit(1) if not args.nni_manager_version: @@ -145,21 +152,26 @@ def check_version(args): log_entry = {} if trial_keeper_version != nni_manager_version: nni_log(LogType.Error, 'Version does not match!') - error_message = 'NNIManager version is {0}, TrialKeeper version is {1}, NNI version does not match!'.format(nni_manager_version, trial_keeper_version) + error_message = 'NNIManager version is {0}, TrialKeeper version is {1}, NNI version does not match!'.format( + nni_manager_version, trial_keeper_version) log_entry['tag'] = 'VCFail' log_entry['msg'] = error_message - rest_post(gen_send_version_url(args.nnimanager_ip, args.nnimanager_port), json.dumps(log_entry), 10, False) + rest_post(gen_send_version_url(args.nnimanager_ip, args.nnimanager_port), json.dumps(log_entry), 10, + False) os._exit(1) else: nni_log(LogType.Info, 'Version match!') log_entry['tag'] = 'VCSuccess' - rest_post(gen_send_version_url(args.nnimanager_ip, args.nnimanager_port), json.dumps(log_entry), 10, False) + rest_post(gen_send_version_url(args.nnimanager_ip, args.nnimanager_port), json.dumps(log_entry), 10, + False) except AttributeError as 
err: nni_log(LogType.Error, err) + def is_multi_phase(): return MULTI_PHASE and (MULTI_PHASE in ['True', 'true']) + def download_parameter(meta_list, args): """ Download parameter file to local working directory. @@ -171,7 +183,8 @@ def download_parameter(meta_list, args): ] """ nni_log(LogType.Debug, str(meta_list)) - nni_log(LogType.Debug, 'NNI_SYS_DIR: {}, trial Id: {}, experiment ID: {}'.format(NNI_SYS_DIR, NNI_TRIAL_JOB_ID, NNI_EXP_ID)) + nni_log(LogType.Debug, + 'NNI_SYS_DIR: {}, trial Id: {}, experiment ID: {}'.format(NNI_SYS_DIR, NNI_TRIAL_JOB_ID, NNI_EXP_ID)) nni_log(LogType.Debug, 'NNI_SYS_DIR files: {}'.format(os.listdir(NNI_SYS_DIR))) for meta in meta_list: if meta['experimentId'] == NNI_EXP_ID and meta['trialId'] == NNI_TRIAL_JOB_ID: @@ -180,6 +193,7 @@ def download_parameter(meta_list, args): hdfs_client = get_hdfs_client(args) copyHdfsFileToLocal(meta['filePath'], param_fp, hdfs_client, override=False) + def fetch_parameter_file(args): class FetchThread(threading.Thread): def __init__(self, args): @@ -203,6 +217,7 @@ def run(self): fetch_file_thread = FetchThread(args) fetch_file_thread.start() + if __name__ == '__main__': '''NNI Trial Keeper main function''' PARSER = argparse.ArgumentParser() @@ -210,9 +225,9 @@ def run(self): PARSER.add_argument('--trial_command', type=str, help='Command to launch trial process') PARSER.add_argument('--nnimanager_ip', type=str, default='localhost', help='NNI manager rest server IP') PARSER.add_argument('--nnimanager_port', type=str, default='8081', help='NNI manager rest server port') - PARSER.add_argument('--pai_hdfs_output_dir', type=str, help='the output dir of pai_hdfs') # backward compatibility + PARSER.add_argument('--pai_hdfs_output_dir', type=str, help='the output dir of pai_hdfs') # backward compatibility PARSER.add_argument('--hdfs_output_dir', type=str, help='the output dir of hdfs') - PARSER.add_argument('--pai_hdfs_host', type=str, help='the host of pai_hdfs') # backward compatibility + 
PARSER.add_argument('--pai_hdfs_host', type=str, help='the host of pai_hdfs') # backward compatibility PARSER.add_argument('--hdfs_host', type=str, help='the host of hdfs') PARSER.add_argument('--pai_user_name', type=str, help='the username of hdfs') PARSER.add_argument('--nni_hdfs_exp_dir', type=str, help='nni experiment directory in hdfs') @@ -233,4 +248,3 @@ def run(self): except Exception as e: nni_log(LogType.Error, 'Exit trial keeper with code 1 because Exception: {} is catched'.format(str(e))) os._exit(1) - From 6210625b9225a13518b74ca50b87a0f648a325b4 Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Mon, 4 Nov 2019 14:52:46 +0800 Subject: [PATCH 04/11] Refine gitignore (#1642) * Refine gitignore * add newline at the end * add more environments --- .gitignore | 17 ++++++++++++++--- deployment/pypi/.gitignore | 4 +++- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index f2d90fd13c..e96b14efc6 100644 --- a/.gitignore +++ b/.gitignore @@ -11,6 +11,9 @@ pids *.seed *.pid.lock +# Build package +dist/ + # Directory for instrumented libs generated by jscoverage/JSCover lib-cov @@ -54,9 +57,6 @@ typings/ # Yarn Integrity file .yarn-integrity -# dotenv environment variables file -.env - # next.js build output .next @@ -67,7 +67,18 @@ typings/ __pycache__ build *.egg-info +setup.pye +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# VSCode .vscode # In case you place source code in ~/nni/ diff --git a/deployment/pypi/.gitignore b/deployment/pypi/.gitignore index 331680e88e..be2c149bdc 100644 --- a/deployment/pypi/.gitignore +++ b/deployment/pypi/.gitignore @@ -1,6 +1,8 @@ nni/ +nni-yarn/ dist/ build/ *.egg-info/ +nni-yarn.tar.gz node-*.tar.xz -node-*/ \ No newline at end of file +node-*/ From 86f8c2abfc66269f3259b5c682aa3bcf14cde44c Mon Sep 17 00:00:00 2001 From: Tang Lang Date: Tue, 5 Nov 2019 11:47:25 +0800 Subject: [PATCH 05/11] pruner export (#1674) --- examples/model_compress/main_torch_pruner.py | 4 
+- .../nni/compression/torch/builtin_pruners.py | 12 ++-- .../pynni/nni/compression/torch/compressor.py | 56 ++++++++++++++++++- 3 files changed, 63 insertions(+), 9 deletions(-) diff --git a/examples/model_compress/main_torch_pruner.py b/examples/model_compress/main_torch_pruner.py index e0b1be0251..2636128c4c 100644 --- a/examples/model_compress/main_torch_pruner.py +++ b/examples/model_compress/main_torch_pruner.py @@ -66,6 +66,7 @@ def main(): batch_size=1000, shuffle=True) model = Mnist() + model.to(device) '''you can change this to LevelPruner to implement it pruner = LevelPruner(configure_list) @@ -80,7 +81,7 @@ def main(): }] pruner = AGP_Pruner(model, configure_list) - pruner.compress() + model = pruner.compress() # you can also use compress(model) method # like that pruner.compress(model) @@ -90,6 +91,7 @@ def main(): print('# Epoch {} #'.format(epoch)) train(model, device, train_loader, optimizer) test(model, device, test_loader) + pruner.export_model('model.pth', 'mask.pth', 'model.onnx', [1, 1, 28, 28]) if __name__ == '__main__': diff --git a/src/sdk/pynni/nni/compression/torch/builtin_pruners.py b/src/sdk/pynni/nni/compression/torch/builtin_pruners.py index 593ba3f100..05e9e1e9bf 100644 --- a/src/sdk/pynni/nni/compression/torch/builtin_pruners.py +++ b/src/sdk/pynni/nni/compression/torch/builtin_pruners.py @@ -17,7 +17,6 @@ def __init__(self, model, config_list): - sparsity """ super().__init__(model, config_list) - self.mask_list = {} self.if_init_list = {} def calc_mask(self, layer, config): @@ -30,10 +29,10 @@ def calc_mask(self, layer, config): return torch.ones(weight.shape).type_as(weight) threshold = torch.topk(w_abs.view(-1), k, largest=False).values.max() mask = torch.gt(w_abs, threshold).type_as(weight) - self.mask_list.update({op_name: mask}) + self.mask_dict.update({op_name: mask}) self.if_init_list.update({op_name: False}) else: - mask = self.mask_list[op_name] + mask = self.mask_dict[op_name] return mask @@ -57,7 +56,6 @@ def 
__init__(self, model, config_list): - frequency: if you want update every 2 epoch, you can set it 2 """ super().__init__(model, config_list) - self.mask_list = {} self.now_epoch = 0 self.if_init_list = {} @@ -68,7 +66,7 @@ def calc_mask(self, layer, config): freq = config.get('frequency', 1) if self.now_epoch >= start_epoch and self.if_init_list.get(op_name, True) and ( self.now_epoch - start_epoch) % freq == 0: - mask = self.mask_list.get(op_name, torch.ones(weight.shape).type_as(weight)) + mask = self.mask_dict.get(op_name, torch.ones(weight.shape).type_as(weight)) target_sparsity = self.compute_target_sparsity(config) k = int(weight.numel() * target_sparsity) if k == 0 or target_sparsity >= 1 or target_sparsity <= 0: @@ -77,10 +75,10 @@ def calc_mask(self, layer, config): w_abs = weight.abs() * mask threshold = torch.topk(w_abs.view(-1), k, largest=False).values.max() new_mask = torch.gt(w_abs, threshold).type_as(weight) - self.mask_list.update({op_name: new_mask}) + self.mask_dict.update({op_name: new_mask}) self.if_init_list.update({op_name: False}) else: - new_mask = self.mask_list.get(op_name, torch.ones(weight.shape).type_as(weight)) + new_mask = self.mask_dict.get(op_name, torch.ones(weight.shape).type_as(weight)) return new_mask def compute_target_sparsity(self, config): diff --git a/src/sdk/pynni/nni/compression/torch/compressor.py b/src/sdk/pynni/nni/compression/torch/compressor.py index bb9e76e0da..6a60a29cf0 100644 --- a/src/sdk/pynni/nni/compression/torch/compressor.py +++ b/src/sdk/pynni/nni/compression/torch/compressor.py @@ -128,11 +128,23 @@ def _expand_config_op_types(self, config): expanded_op_types.append(op_type) return expanded_op_types + class Pruner(Compressor): """ - Abstract base PyTorch pruner + Prune to an exact pruning level specification + + Attributes + ---------- + mask_dict : dict + Dictionary for saving masks, `key` should be layer name and + `value` should be a tensor which has the same shape with layer's weight + """ + def 
__init__(self, model, config_list): + super().__init__(model, config_list) + self.mask_dict = {} + def calc_mask(self, layer, config): """ Pruners should overload this method to provide mask for weight tensors. @@ -177,6 +189,48 @@ def new_forward(*inputs): layer.module.forward = new_forward + def export_model(self, model_path, mask_path=None, onnx_path=None, input_shape=None): + """ + Export pruned model weights, masks and onnx model(optional) + + Parameters + ---------- + model_path : str + path to save pruned model state_dict + mask_path : str + (optional) path to save mask dict + onnx_path : str + (optional) path to save onnx model + input_shape : list or tuple + input shape to onnx model + """ + assert model_path is not None, 'model_path must be specified' + for name, m in self.bound_model.named_modules(): + mask = self.mask_dict.get(name) + if mask is not None: + mask_sum = mask.sum().item() + mask_num = mask.numel() + _logger.info('Layer: %s Sparsity: %.2f', name, 1 - mask_sum / mask_num) + print('Layer: %s Sparsity: %.2f' % (name, 1 - mask_sum / mask_num)) + m.weight.data = m.weight.data.mul(mask) + else: + _logger.info('Layer: %s NOT compressed', name) + print('Layer: %s NOT compressed' % name) + torch.save(self.bound_model.state_dict(), model_path) + _logger.info('Model state_dict saved to %s', model_path) + print('Model state_dict saved to %s' % model_path) + if mask_path is not None: + torch.save(self.mask_dict, mask_path) + _logger.info('Mask dict saved to %s', mask_path) + print('Mask dict saved to %s' % mask_path) + if onnx_path is not None: + assert input_shape is not None, 'input_shape must be specified to export onnx model' + # input info needed + input_data = torch.Tensor(*input_shape) + torch.onnx.export(self.bound_model, input_data, onnx_path) + _logger.info('Model in onnx with input shape %s saved to %s', input_data.shape, onnx_path) + print('Model in onnx with input shape %s saved to %s' % (input_data.shape, onnx_path)) + class 
Quantizer(Compressor): """ From 52b93d0cabc11423428ea748972841c27bc068ee Mon Sep 17 00:00:00 2001 From: chicm-ms <38930155+chicm-ms@users.noreply.github.com> Date: Tue, 5 Nov 2019 14:00:25 +0800 Subject: [PATCH 06/11] Show error log of failed trial jobs for integration tests (#1602) * show failed job log --- src/nni_manager/common/utils.ts | 2 +- .../common/clusterJobRestServer.ts | 6 ++- test/config_test.py | 24 +++++------ test/metrics_test.py | 4 +- test/training_service.yml | 3 ++ test/utils.py | 42 +++++++++++++------ 6 files changed, 52 insertions(+), 29 deletions(-) diff --git a/src/nni_manager/common/utils.ts b/src/nni_manager/common/utils.ts index 446f4d0ab1..1a69c00651 100644 --- a/src/nni_manager/common/utils.ts +++ b/src/nni_manager/common/utils.ts @@ -510,4 +510,4 @@ function unixPathJoin(...paths: any[]): string { export {countFilesRecursively, validateFileNameRecursively, getRemoteTmpDir, generateParamFileName, getMsgDispatcherCommand, getCheckpointDir, getLogDir, getExperimentRootDir, getJobCancelStatus, getDefaultDatabaseDir, getIPV4Address, unixPathJoin, - mkDirP, delay, prepareUnitTest, parseArg, cleanupUnitTest, uniqueString, randomSelect, getLogLevel, getVersion, getCmdPy, getTunerProc, isAlive, killPid, getNewLine }; + mkDirP, mkDirPSync, delay, prepareUnitTest, parseArg, cleanupUnitTest, uniqueString, randomSelect, getLogLevel, getVersion, getCmdPy, getTunerProc, isAlive, killPid, getNewLine }; diff --git a/src/nni_manager/training_service/common/clusterJobRestServer.ts b/src/nni_manager/training_service/common/clusterJobRestServer.ts index 155cfda118..8d5a8772fc 100644 --- a/src/nni_manager/training_service/common/clusterJobRestServer.ts +++ b/src/nni_manager/training_service/common/clusterJobRestServer.ts @@ -30,7 +30,7 @@ import { String } from 'typescript-string-operations'; import * as component from '../../common/component'; import { getBasePort, getExperimentId } from '../../common/experimentStartupInfo'; import { RestServer } from 
'../../common/restServer'; -import { getLogDir } from '../../common/utils'; +import { getExperimentRootDir, mkDirPSync } from '../../common/utils'; /** * Cluster Job Training service Rest server, provides rest API to support Cluster job metrics update @@ -146,7 +146,9 @@ export abstract class ClusterJobRestServer extends RestServer { this.errorMessage = `Version check failed, didn't get version check response from trialKeeper,` + ` please check your NNI version in NNIManager and TrialKeeper!`; } - const trialLogPath: string = path.join(getLogDir(), `trial_${req.params.trialId}.log`); + const trialLogDir: string = path.join(getExperimentRootDir(), 'trials', req.params.trialId); + mkDirPSync(trialLogDir); + const trialLogPath: string = path.join(trialLogDir, 'stdout_log_collection.log'); try { let skipLogging: boolean = false; if (req.body.tag === 'trial' && req.body.msg !== undefined) { diff --git a/test/config_test.py b/test/config_test.py index dece4cf490..50682fed05 100644 --- a/test/config_test.py +++ b/test/config_test.py @@ -24,9 +24,10 @@ import subprocess import time import traceback +import json from utils import setup_experiment, get_experiment_status, get_yml_content, dump_yml_content, \ - parse_max_duration_time, get_succeeded_trial_num, print_stderr, deep_update + parse_max_duration_time, get_succeeded_trial_num, deep_update, print_failed_job_log, get_failed_trial_jobs from utils import GREEN, RED, CLEAR, STATUS_URL, TRIAL_JOBS_URL def gen_new_config(config_file, training_service='local'): @@ -37,18 +38,18 @@ def gen_new_config(config_file, training_service='local'): config = get_yml_content(config_file) new_config_file = config_file + '.tmp' - ts = get_yml_content('training_service.yml')[training_service] - print(ts) + it_config = get_yml_content('training_service.yml') # hack for kubeflow trial config if training_service == 'kubeflow': - ts['trial']['worker']['command'] = config['trial']['command'] + 
it_config[training_service]['trial']['worker']['command'] = config['trial']['command'] config['trial'].pop('command') if 'gpuNum' in config['trial']: config['trial'].pop('gpuNum') - deep_update(config, ts) - print(config) + deep_update(config, it_config['all']) + deep_update(config, it_config[training_service]) + dump_yml_content(new_config_file, config) return new_config_file, config @@ -57,6 +58,7 @@ def run_test(config_file, training_service, local_gpu=False): '''run test per configuration file''' new_config_file, config = gen_new_config(config_file, training_service) + print(json.dumps(config, sort_keys=True, indent=4)) if training_service == 'local' and not local_gpu and config['trial']['gpuNum'] > 0: print('no gpu, skiping: ', config_file) @@ -72,14 +74,12 @@ def run_test(config_file, training_service, local_gpu=False): for _ in range(0, max_duration+30, sleep_interval): time.sleep(sleep_interval) status = get_experiment_status(STATUS_URL) - if status == 'DONE': - num_succeeded = get_succeeded_trial_num(TRIAL_JOBS_URL) - if training_service == 'local': - print_stderr(TRIAL_JOBS_URL) - assert num_succeeded == max_trial_num, 'only %d succeeded trial jobs, there should be %d' % (num_succeeded, max_trial_num) + if status in ['DONE', 'ERROR'] or get_failed_trial_jobs(TRIAL_JOBS_URL): break - assert status == 'DONE', 'Failed to finish in maxExecDuration' + print_failed_job_log(config['trainingServicePlatform'], TRIAL_JOBS_URL) + if status != 'DONE' or get_succeeded_trial_num(TRIAL_JOBS_URL) < max_trial_num: + raise AssertionError('Failed to finish in maxExecDuration') finally: if os.path.exists(new_config_file): os.remove(new_config_file) diff --git a/test/metrics_test.py b/test/metrics_test.py index 1f8f0e8893..bf175c4c32 100644 --- a/test/metrics_test.py +++ b/test/metrics_test.py @@ -26,7 +26,7 @@ import json import requests -from utils import get_experiment_status, get_yml_content, parse_max_duration_time, get_succeeded_trial_num, print_stderr +from utils 
import get_experiment_status, get_yml_content, parse_max_duration_time, get_succeeded_trial_num, print_failed_job_log from utils import GREEN, RED, CLEAR, STATUS_URL, TRIAL_JOBS_URL, METRICS_URL def run_test(): @@ -49,7 +49,7 @@ def run_test(): #print('experiment status:', status) if status == 'DONE': num_succeeded = get_succeeded_trial_num(TRIAL_JOBS_URL) - print_stderr(TRIAL_JOBS_URL) + print_failed_job_log('local', TRIAL_JOBS_URL) if sys.platform == "win32": time.sleep(sleep_interval) # Windows seems to have some issues on updating in time assert num_succeeded == max_trial_num, 'only %d succeeded trial jobs, there should be %d' % (num_succeeded, max_trial_num) diff --git a/test/training_service.yml b/test/training_service.yml index 5742c29987..a68954499a 100644 --- a/test/training_service.yml +++ b/test/training_service.yml @@ -1,3 +1,6 @@ +all: + logCollection: http + kubeflow: maxExecDuration: 15m nniManagerIp: diff --git a/test/utils.py b/test/utils.py index 66f78dad82..4778c5f113 100644 --- a/test/utils.py +++ b/test/utils.py @@ -81,13 +81,18 @@ def get_experiment_id(experiment_url): experiment_id = requests.get(experiment_url).json()['id'] return experiment_id -def get_nni_log_path(experiment_url): - '''get nni's log path from nni's experiment url''' +def get_experiment_dir(experiment_url): + '''get experiment root directory''' experiment_id = get_experiment_id(experiment_url) - experiment_path = os.path.join(os.path.expanduser('~'), 'nni', 'experiments', experiment_id) - nnimanager_log_path = os.path.join(experiment_path, 'log', 'nnimanager.log') + return os.path.join(os.path.expanduser('~'), 'nni', 'experiments', experiment_id) - return nnimanager_log_path +def get_nni_log_dir(experiment_url): + '''get nni's log directory from nni's experiment url''' + return os.path.join(get_experiment_dir(experiment_url), 'log') + +def get_nni_log_path(experiment_url): + '''get nni's log path from nni's experiment url''' + return 
os.path.join(get_nni_log_dir(experiment_url), 'nnimanager.log') def is_experiment_done(nnimanager_log_path): '''check if the experiment is done successfully''' @@ -104,7 +109,6 @@ def get_experiment_status(status_url): def get_succeeded_trial_num(trial_jobs_url): trial_jobs = requests.get(trial_jobs_url).json() - print(trial_jobs) num_succeed = 0 for trial_job in trial_jobs: if trial_job['status'] in ['SUCCEEDED', 'EARLY_STOPPED']: @@ -112,17 +116,31 @@ def get_succeeded_trial_num(trial_jobs_url): print('num_succeed:', num_succeed) return num_succeed -def print_stderr(trial_jobs_url): +def get_failed_trial_jobs(trial_jobs_url): + '''Return failed trial jobs''' trial_jobs = requests.get(trial_jobs_url).json() + failed_jobs = [] + for trial_job in trial_jobs: + if trial_job['status'] in ['FAILED']: + failed_jobs.append(trial_job) + return failed_jobs + +def print_failed_job_log(training_service, trial_jobs_url): + '''Print job log of FAILED trial jobs''' + trial_jobs = get_failed_trial_jobs(trial_jobs_url) for trial_job in trial_jobs: - if trial_job['status'] == 'FAILED': + if training_service == 'local': if sys.platform == "win32": p = trial_job['stderrPath'].split(':') - stderr_path = ':'.join([p[-2], p[-1]]) - subprocess.run(['type', stderr_path], shell=True) + log_filename = ':'.join([p[-2], p[-1]]) else: - stderr_path = trial_job['stderrPath'].split(':')[-1] - subprocess.run(['cat', stderr_path]) + log_filename = trial_job['stderrPath'].split(':')[-1] + else: + log_filename = os.path.join(get_experiment_dir(EXPERIMENT_URL), 'trials', trial_job['id'], 'stdout_log_collection.log') + with open(log_filename, 'r') as f: + log_content = f.read() + print(log_filename, flush=True) + print(log_content, flush=True) def parse_max_duration_time(max_exec_duration): unit = max_exec_duration[-1] From db19946d112ff0659e228d9f5a9f05f3bbb83e41 Mon Sep 17 00:00:00 2001 From: SparkSnail Date: Wed, 6 Nov 2019 18:59:47 +0800 Subject: [PATCH 07/11] Support AAD token login in PAI mode 
(#1660) --- docs/en_US/TrainingService/PaiMode.md | 16 ++++++++++ .../rest_server/restValidationSchemas.ts | 3 +- .../training_service/pai/paiConfig.ts | 7 +++-- .../pai/paiTrainingService.ts | 31 ++++++++++++------- tools/nni_cmd/config_schema.py | 8 +++-- 5 files changed, 48 insertions(+), 17 deletions(-) diff --git a/docs/en_US/TrainingService/PaiMode.md b/docs/en_US/TrainingService/PaiMode.md index 3787f7165d..a5926c6484 100644 --- a/docs/en_US/TrainingService/PaiMode.md +++ b/docs/en_US/TrainingService/PaiMode.md @@ -82,6 +82,22 @@ Compared with [LocalMode](LocalMode.md) and [RemoteMachineMode](RemoteMachineMod portNumber: 1 ``` +NNI supports two kinds of authorization methods in PAI: password and PAI token (see the [PAI authentication documentation](https://github.com/microsoft/pai/blob/b6bd2ab1c8890f91b7ac5859743274d2aa923c22/docs/rest-server/API.md#2-authentication)). The authorization method is configured in the `paiConfig` field. +For password authorization, the `paiConfig` schema is: +``` +paiConfig: + userName: your_pai_nni_user + passWord: your_pai_password + host: 10.1.1.1 +``` +For PAI token authorization, the `paiConfig` schema is: +``` +paiConfig: + userName: your_pai_nni_user + token: your_pai_token + host: 10.1.1.1 +``` + Once complete to fill NNI experiment config file and save (for example, save as exp_pai.yml), then run the following command ``` nnictl create --config exp_pai.yml diff --git a/src/nni_manager/rest_server/restValidationSchemas.ts b/src/nni_manager/rest_server/restValidationSchemas.ts index 99bbe4bb96..69a7ec1d90 100644 --- a/src/nni_manager/rest_server/restValidationSchemas.ts +++ b/src/nni_manager/rest_server/restValidationSchemas.ts @@ -107,7 +107,8 @@ export namespace ValidationSchemas { }), pai_config: joi.object({ userName: joi.string().min(1).required(), - passWord: joi.string().min(1).required(), + passWord: joi.string().min(1), + token: joi.string().min(1), host: joi.string().min(1).required() }), kubeflow_config: joi.object({ diff --git
a/src/nni_manager/training_service/pai/paiConfig.ts b/src/nni_manager/training_service/pai/paiConfig.ts index 43f95f7f9c..d25da7513f 100644 --- a/src/nni_manager/training_service/pai/paiConfig.ts +++ b/src/nni_manager/training_service/pai/paiConfig.ts @@ -107,19 +107,22 @@ export class PAIJobConfig { */ export class PAIClusterConfig { public readonly userName: string; - public readonly passWord: string; + public readonly passWord?: string; public readonly host: string; + public readonly token?: string; /** * Constructor * @param userName User name of PAI Cluster * @param passWord password of PAI Cluster * @param host Host IP of PAI Cluster + * @param token PAI token of PAI Cluster */ - constructor(userName: string, passWord : string, host : string) { + constructor(userName: string, host : string, passWord?: string, token?: string) { this.userName = userName; this.passWord = passWord; this.host = host; + this.token = token; } } diff --git a/src/nni_manager/training_service/pai/paiTrainingService.ts b/src/nni_manager/training_service/pai/paiTrainingService.ts index 27cc8be976..d741931b29 100644 --- a/src/nni_manager/training_service/pai/paiTrainingService.ts +++ b/src/nni_manager/training_service/pai/paiTrainingService.ts @@ -208,7 +208,7 @@ class PAITrainingService implements TrainingService { const stopJobRequest: request.Options = { uri: `http://${this.paiClusterConfig.host}/rest-server/api/v1/user/${this.paiClusterConfig.userName}\ -/jobs/${trialJobDetail.paiJobName}/executionType`, +/jobs/${trialJobDetail.paiJobName}/executionType`, method: 'PUT', json: true, body: {value: 'STOP'}, @@ -256,9 +256,15 @@ class PAITrainingService implements TrainingService { path: '/webhdfs/api/v1', host: this.paiClusterConfig.host }); + if(this.paiClusterConfig.passWord) { + // Get PAI authentication token + await this.updatePaiToken(); + } else if(this.paiClusterConfig.token) { + this.paiToken = this.paiClusterConfig.token; + } else { + deferred.reject(new Error('pai cluster 
config format error, please set password or token!')); + } - // Get PAI authentication token - await this.updatePaiToken(); deferred.resolve(); break; @@ -483,8 +489,7 @@ class PAITrainingService implements TrainingService { request(submitJobRequest, (error: Error, response: request.Response, body: any) => { if ((error !== undefined && error !== null) || response.statusCode >= 400) { const errorMessage : string = (error !== undefined && error !== null) ? error.message : - `Submit trial ${trialJobId} failed, http code:${response.statusCode}, http body: ${response.body}`; - this.log.error(errorMessage); + `Submit trial ${trialJobId} failed, http code:${response.statusCode}, http body: ${response.body.message}`; trialJobDetail.status = 'FAILED'; deferred.resolve(true); } else { @@ -498,13 +503,15 @@ class PAITrainingService implements TrainingService { private async statusCheckingLoop(): Promise { while (!this.stopping) { - try { - await this.updatePaiToken(); - } catch (error) { - this.log.error(`${error}`); - //only throw error when initlize paiToken first time - if (this.paiToken === undefined) { - throw new Error(error); + if(this.paiClusterConfig && this.paiClusterConfig.passWord) { + try { + await this.updatePaiToken(); + } catch (error) { + this.log.error(`${error}`); + //only throw error when initlize paiToken first time + if (this.paiToken === undefined) { + throw new Error(error); + } } } await this.paiJobCollector.retrieveTrialStatus(this.paiToken, this.paiClusterConfig); diff --git a/tools/nni_cmd/config_schema.py b/tools/nni_cmd/config_schema.py index dded8d1e95..5eb9538ad8 100644 --- a/tools/nni_cmd/config_schema.py +++ b/tools/nni_cmd/config_schema.py @@ -265,11 +265,15 @@ def setPathCheck(key): } pai_config_schema = { - 'paiConfig':{ + 'paiConfig': Or({ 'userName': setType('userName', str), 'passWord': setType('passWord', str), 'host': setType('host', str) - } + }, { + 'userName': setType('userName', str), + 'token': setType('token', str), + 'host': 
setType('host', str) + }) } kubeflow_trial_schema = { From 3e0ef0052d40480691304cafa61cf71019c7b233 Mon Sep 17 00:00:00 2001 From: chicm-ms <38930155+chicm-ms@users.noreply.github.com> Date: Fri, 8 Nov 2019 17:44:01 +0800 Subject: [PATCH 08/11] round-robin policy (#1702) --- .../remote_machine/gpuScheduler.ts | 23 ++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/src/nni_manager/training_service/remote_machine/gpuScheduler.ts b/src/nni_manager/training_service/remote_machine/gpuScheduler.ts index 5e7f065971..4244eb8967 100644 --- a/src/nni_manager/training_service/remote_machine/gpuScheduler.ts +++ b/src/nni_manager/training_service/remote_machine/gpuScheduler.ts @@ -28,6 +28,8 @@ import { parseGpuIndices, RemoteMachineMeta, RemoteMachineScheduleResult, RemoteMachineTrialJobDetail, ScheduleResultType, SSHClientManager } from './remoteMachineData'; +type SCHEDULE_POLICY_NAME = 'random' | 'round-robin'; + /** * A simple GPU scheduler implementation */ @@ -35,13 +37,18 @@ export class GPUScheduler { private readonly machineSSHClientMap : Map; private readonly log: Logger = getLogger(); + private readonly policyName: SCHEDULE_POLICY_NAME = 'round-robin'; + private roundRobinIndex: number = 0; + private configuredRMs: RemoteMachineMeta[] = []; /** * Constructor * @param machineSSHClientMap map from remote machine to sshClient */ constructor(machineSSHClientMap : Map) { + assert(machineSSHClientMap.size > 0); this.machineSSHClientMap = machineSSHClientMap; + this.configuredRMs = Array.from(machineSSHClientMap.keys()); } /** @@ -189,7 +196,21 @@ export class GPUScheduler { private selectMachine(rmMetas: RemoteMachineMeta[]): RemoteMachineMeta { assert(rmMetas !== undefined && rmMetas.length > 0); - return randomSelect(rmMetas); + if (this.policyName === 'random') { + return randomSelect(rmMetas); + } else if (this.policyName === 'round-robin') { + return this.roundRobinSelect(rmMetas); + } else { + throw new Error(`Unsupported schedule 
policy: ${this.policyName}`); + } + } + + private roundRobinSelect(rmMetas: RemoteMachineMeta[]): RemoteMachineMeta { + while (!rmMetas.includes(this.configuredRMs[this.roundRobinIndex % this.configuredRMs.length])) { + this.roundRobinIndex++; + } + + return this.configuredRMs[this.roundRobinIndex++ % this.configuredRMs.length]; } private selectGPUsForTrial(gpuInfos: GPUInfo[], requiredGPUNum: number): GPUInfo[] { From fb18f0d4d1da46f23b501db176c3434b876dac4d Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Fri, 8 Nov 2019 20:53:34 +0800 Subject: [PATCH 09/11] Fix pipeline node version for unittests (#1721) * try to fix pipeline * locate npm and node version * override with node in local dir * refactor path set * fix install SMAC on macOS --- Makefile | 6 ++++-- azure-pipelines.yml | 38 +++++++++++++++++++------------------- 2 files changed, 23 insertions(+), 21 deletions(-) diff --git a/Makefile b/Makefile index 8877e5c2ae..4759d56c34 100644 --- a/Makefile +++ b/Makefile @@ -52,6 +52,7 @@ $(shell mkdir -p $(NNI_DEPENDENCY_FOLDER)) NNI_NODE_TARBALL ?= $(NNI_DEPENDENCY_FOLDER)/nni-node-$(OS_SPEC)-x64.tar.xz NNI_NODE_FOLDER = $(NNI_DEPENDENCY_FOLDER)/nni-node-$(OS_SPEC)-x64 NNI_NODE ?= $(BIN_FOLDER)/node +NNI_NPM ?= $(BIN_FOLDER)/npm NNI_YARN_TARBALL ?= $(NNI_DEPENDENCY_FOLDER)/nni-yarn.tar.gz NNI_YARN_FOLDER ?= $(NNI_DEPENDENCY_FOLDER)/nni-yarn NNI_YARN ?= PATH=$(BIN_FOLDER):$${PATH} $(NNI_YARN_FOLDER)/bin/yarn @@ -149,8 +150,9 @@ install-dependencies: $(NNI_NODE_TARBALL) $(NNI_YARN_TARBALL) mkdir $(NNI_NODE_FOLDER) tar -xf $(NNI_NODE_TARBALL) -C $(NNI_NODE_FOLDER) --strip-components 1 mkdir -p $(BIN_FOLDER) - rm -f $(NNI_NODE) - cp $(NNI_NODE_FOLDER)/bin/node $(NNI_NODE) + rm -f $(NNI_NODE) $(NNI_NPM) + ln -s $(NNI_NODE_FOLDER)/bin/node $(NNI_NODE) + ln -s $(NNI_NODE_FOLDER)/bin/npm $(NNI_NPM) #$(_INFO) Extracting Yarn $(_END) rm -rf $(NNI_YARN_FOLDER) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 7b8e6f626c..14ef0197ca 100644 --- 
a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -12,6 +12,7 @@ jobs: python3 -m pip install --upgrade pip setuptools --user python3 -m pip install pylint==2.3.1 astroid==2.2.5 --user python3 -m pip install coverage --user + echo "##vso[task.setvariable variable=PATH]${HOME}/.local/bin:${PATH}" displayName: 'Install python tools' - script: | source install.sh @@ -23,8 +24,8 @@ jobs: python3 -m pip install keras==2.1.6 --user python3 -m pip install gym onnx --user sudo apt-get install swig -y - PATH=$HOME/.local/bin:$PATH nnictl package install --name=SMAC - PATH=$HOME/.local/bin:$PATH nnictl package install --name=BOHB + nnictl package install --name=SMAC + nnictl package install --name=BOHB displayName: 'Install dependencies' - script: | set -e @@ -42,25 +43,23 @@ jobs: displayName: 'Run flake8 tests to find Python syntax errors and undefined names' - script: | cd test - sudo apt install -y swig - PATH=$HOME/.local/bin:$PATH nnictl package install --name=SMAC source unittest.sh displayName: 'Unit test' - script: | cd test - PATH=$HOME/.local/bin:$PATH python3 naive_test.py + python3 naive_test.py displayName: 'Naive test' - script: | cd test - PATH=$HOME/.local/bin:$PATH python3 tuner_test.py + python3 tuner_test.py displayName: 'Built-in tuners / assessors tests' - script: | cd test - PATH=$HOME/.local/bin:$PATH python3 metrics_test.py + python3 metrics_test.py displayName: 'Trial job metrics test' - script: | cd test - PATH=$HOME/.local/bin:$PATH python3 cli_test.py + python3 cli_test.py displayName: 'nnicli test' - job: 'basic_test_pr_macOS' @@ -74,33 +73,34 @@ jobs: steps: - script: python3 -m pip install --upgrade pip setuptools displayName: 'Install python tools' - - script: | - python3 -m pip install torch==0.4.1 --user - python3 -m pip install torchvision==0.2.1 --user - python3 -m pip install tensorflow==1.13.1 --user - displayName: 'Install dependencies' - script: | source install.sh + echo "##vso[task.setvariable 
variable=PATH]${HOME}/Library/Python/3.7/bin:${PATH}" displayName: 'Install nni toolkit via source code' - script: | - cd test + python3 -m pip install torch==0.4.1 --user + python3 -m pip install torchvision==0.2.1 --user + python3 -m pip install tensorflow==1.13.1 --user ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)" < /dev/null 2> /dev/null brew install swig@3 ln -s /usr/local/opt/swig\@3/bin/swig /usr/local/bin/swig - PATH=$HOME/Library/Python/3.7/bin:$PATH nnictl package install --name=SMAC - PATH=$HOME/Library/Python/3.7/bin:$PATH source unittest.sh + nnictl package install --name=SMAC + displayName: 'Install dependencies' + - script: | + cd test + source unittest.sh displayName: 'Unit test' - script: | cd test - PATH=$HOME/Library/Python/3.7/bin:$PATH python3 naive_test.py + python3 naive_test.py displayName: 'Naive test' - script: | cd test - PATH=$HOME/Library/Python/3.7/bin:$PATH python3 tuner_test.py + python3 tuner_test.py displayName: 'Built-in tuners / assessors tests' - script: | cd test - PATH=$HOME/Library/Python/3.7/bin:$PATH python3 cli_test.py + python3 cli_test.py displayName: 'nnicli test' - job: 'basic_test_pr_Windows' From 2d375f435770b256657a4eb76b7a058b0eb16bd9 Mon Sep 17 00:00:00 2001 From: Tang Lang Date: Sat, 9 Nov 2019 11:25:28 +0800 Subject: [PATCH 10/11] add export doc (#1705) * add export doc --- docs/en_US/Compressor/Overview.md | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/docs/en_US/Compressor/Overview.md b/docs/en_US/Compressor/Overview.md index 7ee603e3e3..5fc8e45c5d 100644 --- a/docs/en_US/Compressor/Overview.md +++ b/docs/en_US/Compressor/Overview.md @@ -95,7 +95,17 @@ pruner.update_epoch(epoch) The other is `step`, it can be called with `pruner.step()` after each minibatch. Note that not all algorithms need these two APIs, for those that do not need them, calling them is allowed but has no effect. 
-__[TODO]__ The last API is for users to export the compressed model. You will get a compressed model when you finish the training using this API. It also exports another file storing the values of masks. +If you are pruning your model, you can easily export the compressed model using the following API. The ```state_dict``` of the sparse model weights will be stored in ```model.pth```, which can be loaded by ```torch.load('model.pth')```. + +``` +pruner.export_model(model_path='model.pth') +``` + +The ```mask_dict``` and the pruned model in ```onnx``` format (```input_shape``` needs to be specified) can also be exported like this: + +```python +pruner.export_model(model_path='model.pth', mask_path='mask.pth', onnx_path='model.onnx', input_shape=[1, 1, 28, 28]) +``` ## Customize new compression algorithms From 901012eb90bf7c0dbb3d2566e5d8c0eda1b5b249 Mon Sep 17 00:00:00 2001 From: liuzhe-lz <40699903+liuzhe-lz@users.noreply.github.com> Date: Sat, 9 Nov 2019 12:43:02 +0800 Subject: [PATCH 11/11] docstring fix (#1691) --- src/sdk/pynni/nni/assessor.py | 14 +++++++------- src/sdk/pynni/nni/tuner.py | 20 ++++++++++---------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/src/sdk/pynni/nni/assessor.py b/src/sdk/pynni/nni/assessor.py index 0f1dc95619..01a2abcbe9 100644 --- a/src/sdk/pynni/nni/assessor.py +++ b/src/sdk/pynni/nni/assessor.py @@ -53,14 +53,14 @@ class Assessor(Recoverable): to tell whether this trial can be early stopped or not. This is the abstract base class for all assessors. - Early stopping algorithms should derive this class and override :meth:`assess_trial` method, + Early stopping algorithms should inherit this class and override :meth:`assess_trial` method, which receives intermediate results from trials and give an assessing result. If :meth:`assess_trial` returns :obj:`AssessResult.Bad` for a trial, it hints NNI framework that the trial is likely to result in a poor final accuracy, and therefore should be killed to save resource.
- If an accessor want's to get notified when a trial ends, it can also override :meth:`trial_end`. + If an assessor wants to be notified when a trial ends, it can also override :meth:`trial_end`. To write a new assessor, you can reference :class:`~nni.medianstop_assessor.MedianstopAssessor`'s code as an example. @@ -77,7 +77,7 @@ def assess_trial(self, trial_job_id, trial_history): The NNI framework has little guarantee on ``trial_history``. This method is not guaranteed to be invoked for each time ``trial_history`` get updated. - It is also possible that a trial's history keeps updateing after receiving a bad result. + It is also possible that a trial's history keeps updating after receiving a bad result. And if the trial failed and retried, ``trial_history`` may be inconsistent with its previous value. The only guarantee is that ``trial_history`` is always growing. @@ -96,9 +96,9 @@ def assess_trial(self, trial_job_id, trial_history): Parameters ---------- - trial_job_id: str + trial_job_id : str Unique identifier of the trial. - trial_history: list + trial_history : list Intermediate results of this trial. The element type is decided by trial code. Returns @@ -114,9 +114,9 @@ def trial_end(self, trial_job_id, success): Parameters ---------- - trial_job_id: str + trial_job_id : str Unique identifier of the trial. - success: bool + success : bool True if the trial successfully completed; False if failed or terminated. """ diff --git a/src/sdk/pynni/nni/tuner.py b/src/sdk/pynni/nni/tuner.py index f011022151..177232b7ed 100644 --- a/src/sdk/pynni/nni/tuner.py +++ b/src/sdk/pynni/nni/tuner.py @@ -42,7 +42,7 @@ class Tuner(Recoverable): A new trial will run with this configuration. This is the abstract base class for all tuners.
- Tuning algorithms should derive this class and override :meth:`update_search_space`, :meth:`receive_trial_result`, + Tuning algorithms should inherit this class and override :meth:`update_search_space`, :meth:`receive_trial_result`, as well as :meth:`generate_parameters` or :meth:`generate_multiple_parameters`. After initializing, NNI will first call :meth:`update_search_space` to tell tuner the feasible region, @@ -96,9 +96,9 @@ def generate_parameters(self, parameter_id, **kwargs): Parameters ---------- - parameter_id: int + parameter_id : int Unique identifier for requested hyper-parameters. This will later be used in :meth:`receive_trial_result`. - **kwargs: + **kwargs Unstable parameters which should be ignored by normal users. Returns @@ -129,10 +129,10 @@ def generate_multiple_parameters(self, parameter_id_list, **kwargs): Parameters ---------- - parameter_id_list: list of int + parameter_id_list : list of int Unique identifiers for each set of requested hyper-parameters. These will later be used in :meth:`receive_trial_result`. - **kwargs: + **kwargs Unstable parameters which should be ignored by normal users. Returns @@ -159,13 +159,13 @@ def receive_trial_result(self, parameter_id, parameters, value, **kwargs): Parameters ---------- - parameter_id: int + parameter_id : int Unique identifier of used hyper-parameters, same with :meth:`generate_parameters`. parameters Hyper-parameters generated by :meth:`generate_parameters`. value Result from trial (the return value of :func:`nni.report_final_result`). - **kwargs: + **kwargs Unstable parameters which should be ignored by normal users. """ raise NotImplementedError('Tuner: receive_trial_result not implemented') @@ -186,11 +186,11 @@ def trial_end(self, parameter_id, success, **kwargs): Parameters ---------- - parameter_id: int + parameter_id : int Unique identifier for hyper-parameters used by this trial. 
- success: bool + success : bool True if the trial successfully completed; False if failed or terminated. - **kwargs: + **kwargs Unstable parameters which should be ignored by normal users. """