From 5954dfd20fa4aa954131495c1b0fd11b1e9f396d Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Thu, 4 Feb 2021 21:04:38 +0800 Subject: [PATCH 01/11] Code refactor --- nni/retiarii/execution/base.py | 4 ++-- nni/retiarii/strategies/random_strategy.py | 15 ++++++++------- nni/retiarii/strategies/tpe_strategy.py | 6 ++---- 3 files changed, 12 insertions(+), 13 deletions(-) diff --git a/nni/retiarii/execution/base.py b/nni/retiarii/execution/base.py index 97773d628a..b4fd66e488 100644 --- a/nni/retiarii/execution/base.py +++ b/nni/retiarii/execution/base.py @@ -68,11 +68,11 @@ def _send_trial_callback(self, paramater: dict) -> None: if self.resources <= 0: _logger.warning('There is no available resource, but trial is submitted.') self.resources -= 1 - _logger.info('on_resource_used: %d', self.resources) + _logger.info('Resource used. Remaining: %d', self.resources) def _request_trial_jobs_callback(self, num_trials: int) -> None: self.resources += num_trials - _logger.info('on_resource_available: %d', self.resources) + _logger.info('New resource available. Remaining: %d', self.resources) def _trial_end_callback(self, trial_id: int, success: bool) -> None: model = self._running_models[trial_id] diff --git a/nni/retiarii/strategies/random_strategy.py b/nni/retiarii/strategies/random_strategy.py index 78c1ac13da..4bbae08d35 100644 --- a/nni/retiarii/strategies/random_strategy.py +++ b/nni/retiarii/strategies/random_strategy.py @@ -11,22 +11,23 @@ class RandomSampler(Sampler): def choice(self, candidates, mutator, model, index): return random.choice(candidates) -class RandomStrategy(BaseStrategy): - def __init__(self): - self.random_sampler = RandomSampler() +class OppotunisticStrategy(BaseStrategy): + """ + This strategy submits models once there are resources available, and does not collect metrics after submission. + """ + def __init__(self, sampler): + self.sampler = RandomSampler() def run(self, base_model, applied_mutators): - _logger.info('stargety start...') + _logger.info('Random strategy has been started.') while True: avail_resource = query_available_resources() if avail_resource > 0: model = base_model - _logger.info('apply mutators...') - _logger.info('mutators: %s', str(applied_mutators)) + _logger.info('New model created. Applied mutators: %s', str(applied_mutators)) for mutator in applied_mutators: mutator.bind_sampler(self.random_sampler) model = mutator.apply(model) - # run models submit_models(model) else: time.sleep(2) diff --git a/nni/retiarii/strategies/tpe_strategy.py b/nni/retiarii/strategies/tpe_strategy.py index 9f0fcd2455..1a63d4f8ad 100644 --- a/nni/retiarii/strategies/tpe_strategy.py +++ b/nni/retiarii/strategies/tpe_strategy.py @@ -50,16 +50,14 @@ def run(self, base_model, applied_mutators): sample_space.extend(recorded_candidates) self.tpe_sampler.update_sample_space(sample_space) - _logger.info('stargety start...') + _logger.info('TPE strategy has been started.') while True: avail_resource = query_available_resources() if avail_resource > 0: model = base_model - _logger.info('apply mutators...') - _logger.info('mutators: %s', str(applied_mutators)) + _logger.info('New model created. 
Applied mutators: %s', str(applied_mutators)) self.tpe_sampler.generate_samples(self.model_id) for mutator in applied_mutators: - _logger.info('mutate model...') mutator.bind_sampler(self.tpe_sampler) model = mutator.apply(model) # run models From aff19a88b7426de346188acf4574880fa447b37e Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Thu, 4 Feb 2021 21:48:58 +0800 Subject: [PATCH 02/11] Unfinished work checkpoint --- .../{random_strategy.py => random.py} | 28 +++++++++++++++---- 1 file changed, 22 insertions(+), 6 deletions(-) rename nni/retiarii/strategies/{random_strategy.py => random.py} (62%) diff --git a/nni/retiarii/strategies/random_strategy.py b/nni/retiarii/strategies/random.py similarity index 62% rename from nni/retiarii/strategies/random_strategy.py rename to nni/retiarii/strategies/random.py index 4bbae08d35..3e5cad1a6b 100644 --- a/nni/retiarii/strategies/random_strategy.py +++ b/nni/retiarii/strategies/random.py @@ -7,27 +7,43 @@ _logger = logging.getLogger(__name__) -class RandomSampler(Sampler): - def choice(self, candidates, mutator, model, index): - return random.choice(candidates) -class OppotunisticStrategy(BaseStrategy): +class MetricAgnosticStrategy(BaseStrategy): """ This strategy submits models once there are resources available, and does not collect metrics after submission. """ def __init__(self, sampler): - self.sampler = RandomSampler() + self.sampler = sampler def run(self, base_model, applied_mutators): _logger.info('Random strategy has been started.') + for mutator in applied_mutators: + mutator.bind_sampler(self.sampler) while True: avail_resource = query_available_resources() if avail_resource > 0: model = base_model _logger.info('New model created. Applied mutators: %s', str(applied_mutators)) for mutator in applied_mutators: - mutator.bind_sampler(self.random_sampler) model = mutator.apply(model) submit_models(model) else: time.sleep(2) + + +class RandomSampler(Sampler): + def choice(self, candidates, mutator, model, index): + return random.choice(candidates) + + +class RandomStrategy(MetricAgnosticStrategy): + def __init__(self): + super().__init__(RandomSampler()) + + +class GridSearchSampler(Sampler): + def choice(self, candidates, mutator, model, index): + for mutator in applied_mutators: + recorded_candidates, new_model = mutator.dry_run(new_model) + sample_space.extend(recorded_candidates) + self.tpe_sampler.update_sample_space(sample_space) \ No newline at end of file From a1f92a0017b4badbe7ea02cc0145cdf91bbea8bd Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Tue, 9 Feb 2021 19:10:43 +0800 Subject: [PATCH 03/11] Add brute-force strategies --- nni/retiarii/strategies/bruteforce.py | 111 ++++++++++++++++++++++++++ nni/retiarii/strategies/evolution.py | 0 nni/retiarii/strategies/random.py | 49 ------------ nni/retiarii/strategies/utils.py | 13 +++ 4 files changed, 124 insertions(+), 49 deletions(-) create mode 100644 nni/retiarii/strategies/bruteforce.py create mode 100644 nni/retiarii/strategies/evolution.py delete mode 100644 nni/retiarii/strategies/random.py create mode 100644 nni/retiarii/strategies/utils.py diff --git a/nni/retiarii/strategies/bruteforce.py b/nni/retiarii/strategies/bruteforce.py new file mode 100644 index 0000000000..097f1f6969 --- /dev/null +++ b/nni/retiarii/strategies/bruteforce.py @@ -0,0 +1,111 @@ +import copy +import itertools +import logging +import random +import time +from typing import Any, Dict, List + +from .. 
import Sampler, submit_models, query_available_resources +from .strategy import BaseStrategy +from .utils import dry_run_for_search_space + +_logger = logging.getLogger(__name__) + + +def _generate_with_gridsearch(search_space: Dict[Any, List[Any]], shuffle=True): + keys = list(search_space.keys()) + search_space_values = copy.deepcopy(list(search_space.values())) + if shuffle: + for values in search_space_values: + random.shuffle(values) + for values in itertools.product(*search_space_values): + yield {key: value for key, value in zip(keys, values)} + + +def _generate_with_random(search_space: Dict[Any, List[Any]], dedup=True, retries=500): + keys = list(search_space.keys()) + history = set() + search_space_values = copy.deepcopy(list(search_space.values())) + while True: + for retry_count in range(retries): + selected = [random.choice(v) for v in search_space_values] + if not dedup: + break + selected = tuple(selected) + if selected not in history: + history.add(selected) + break + if retry_count + 1 == retries: + _logger.info('Random generation has run out of patience. There is nothing to search. Exiting.') + return + yield {key: value for key, value in zip(keys, selected)} + + +class _FixedSampler(Sampler): + def __init__(self, sample): + self.sample = sample + + def choice(self, candidates, mutator, model, index): + return self.sample[(mutator, index)] + + +class GridSearch(BaseStrategy): + def __init__(self, shuffle=True): + self._polling_interval = 2. + self.shuffle = shuffle + + def run(self, base_model, applied_mutators): + search_space = dry_run_for_search_space(base_model, applied_mutators) + for sample in _generate_with_gridsearch(search_space, shuffle=self.shuffle): + _logger.info('New model created. Waiting for resource. %s', str(sample)) + if query_available_resources() <= 0: + time.sleep(self._polling_interval) + sampler = _FixedSampler(sample) + model = base_model + for mutator in applied_mutators: + model = mutator.bind_sampler(sampler).apply(model) + submit_models(model) + + +class _RandomSampler(Sampler): + def choice(self, candidates, mutator, model, index): + return random.choice(candidates) + + +class RandomStrategy(BaseStrategy): + def __init__(self, variational=False, dedup=True): + self.variational = variational + self.dedup = dedup + if variational and dedup: + raise ValueError('Dedup is not supported in variational mode.') + self.random_sampler = _RandomSampler() + self._polling_interval = 2. + + def run(self, base_model, applied_mutators): + if self.variational: + _logger.info('Random search running in variational mode.') + sampler = _RandomSampler() + for mutator in applied_mutators: + mutator.bind_sampler(sampler) + while True: + avail_resource = query_available_resources() + if avail_resource > 0: + model = base_model + for mutator in applied_mutators: + model = mutator.apply(model) + _logger.info('New model created. Applied mutators are: %s', str(applied_mutators)) + submit_models(model) + else: + time.sleep(self._polling_interval) + else: + _logger.info('Random search running in fixed size mode. Dedup: %s.', 'on' if self.dedup else 'off') + search_space = dry_run_for_search_space(base_model, applied_mutators) + for sample in _generate_with_random(search_space, dedup=self.dedup): + _logger.info('New model created. Waiting for resource. 
%s', str(sample)) + if query_available_resources() <= 0: + time.sleep(self._polling_interval) + sampler = _FixedSampler(sample) + model = base_model + for mutator in applied_mutators: + model = mutator.bind_sampler(sampler).apply(model) + submit_models(model) diff --git a/nni/retiarii/strategies/evolution.py b/nni/retiarii/strategies/evolution.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/nni/retiarii/strategies/random.py b/nni/retiarii/strategies/random.py deleted file mode 100644 index 3e5cad1a6b..0000000000 --- a/nni/retiarii/strategies/random.py +++ /dev/null @@ -1,49 +0,0 @@ -import logging -import random -import time - -from .. import Sampler, submit_models, query_available_resources -from .strategy import BaseStrategy - -_logger = logging.getLogger(__name__) - - -class MetricAgnosticStrategy(BaseStrategy): - """ - This strategy submits models once there are resources available, and does not collect metrics after submission. - """ - def __init__(self, sampler): - self.sampler = sampler - - def run(self, base_model, applied_mutators): - _logger.info('Random strategy has been started.') - for mutator in applied_mutators: - mutator.bind_sampler(self.sampler) - while True: - avail_resource = query_available_resources() - if avail_resource > 0: - model = base_model - _logger.info('New model created. Applied mutators: %s', str(applied_mutators)) - for mutator in applied_mutators: - model = mutator.apply(model) - submit_models(model) - else: - time.sleep(2) - - -class RandomSampler(Sampler): - def choice(self, candidates, mutator, model, index): - return random.choice(candidates) - - -class RandomStrategy(MetricAgnosticStrategy): - def __init__(self): - super().__init__(RandomSampler()) - - -class GridSearchSampler(Sampler): - def choice(self, candidates, mutator, model, index): - for mutator in applied_mutators: - recorded_candidates, new_model = mutator.dry_run(new_model) - sample_space.extend(recorded_candidates) - self.tpe_sampler.update_sample_space(sample_space) \ No newline at end of file diff --git a/nni/retiarii/strategies/utils.py b/nni/retiarii/strategies/utils.py new file mode 100644 index 0000000000..23552ca416 --- /dev/null +++ b/nni/retiarii/strategies/utils.py @@ -0,0 +1,13 @@ +import collections +from typing import Dict, Any, List +from ..graph import Model +from ..mutator import Mutator + + +def dry_run_for_search_space(model: Model, mutators: List[Mutator]) -> Dict[Any, List[Any]]: + search_space = collections.OrderedDict() + for mutator in mutators: + recorded_candidates, model = mutator.dry_run(model) + for i, candidates in recorded_candidates: + search_space[(id(mutator), i)] = candidates + return search_space From fef7d6bbc9ce07e15dd7eb29f6333bdefb2b5968 Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Tue, 9 Feb 2021 19:16:22 +0800 Subject: [PATCH 04/11] Evolution draft --- nni/retiarii/strategies/evolution.py | 105 +++++++++++++++++++++++++++ 1 file changed, 105 insertions(+) diff --git a/nni/retiarii/strategies/evolution.py b/nni/retiarii/strategies/evolution.py index e69de29bb2..73a30299b7 100644 --- a/nni/retiarii/strategies/evolution.py +++ b/nni/retiarii/strategies/evolution.py @@ -0,0 +1,105 @@ +# TODO: needs to be adapted to new API + + +class RegularizedEvolution: + def __init__(self, search_space, + concurrency, population_size, sample_size, cycles, mutation_prob, + reward_fn, command, setup): + self.search_space = search_space + self.concurrency = concurrency + self.population_size = population_size + self.command = command + self.setup 
= setup + self.population_size = population_size + self.sample_size = sample_size + self.cycles = cycles + self.mutation_prob = mutation_prob + self.reward_fn = reward_fn + assert self.cycles >= self.population_size >= self.sample_size + + self.population = collections.deque() + + def train_and_eval(self, config): + pid = get_trial_manager().submit_new_trial(self.command, config, self.setup) + + while True: + try: + metrics = get_trial_manager().query_metrics(pid) + if metrics is not None: + break + time.sleep(5) + continue + except TrialFailed: + _logger.warning(f'Config: {config}. Trial failed and use -inf as metrics.') + metrics = float('-inf') + break + return self.reward_fn(config, metrics) + + def random_config(self): + config = {} + for k, v in SearchSpaceUtils.flatten_search_space(self.search_space).items(): + config[k] = v.random() + _logger.info(f'Generated random config: {config}') + return SearchSpaceUtils.restore_config(config, self.search_space) + + def mutate_config(self, parent_config): + parent_config = SearchSpaceUtils.flatten_config(parent_config) + config = {} + for k, v in SearchSpaceUtils.flatten_search_space(self.search_space).items(): + config[k] = parent_config[k] + if random.uniform(0, 1) < self.mutation_prob: + config[k] = v.random(excludes=[parent_config[k]]) + _logger.info(f'Generated mutated config: {config}') + return SearchSpaceUtils.restore_config(config, self.search_space) + + def import_(self, individuals): + self.individuals = sorted(individuals, key=lambda i: i.reward)[-self.population_size:] + random.shuffle(self.individuals) + _logger.info(f'Imported individuals: {self.individuals}') + + def _run_random(self): + individual = Individual(self.random_config(), None) + individual.reward = self.train_and_eval(individual.config) + self.population.append(individual) + + def _run_mutation(self): + # Sample randomly chosen models from the current population. + try: + _lock.acquire() + samples = copy.deepcopy(self.population) + finally: + _lock.release() + random.shuffle(samples) + samples = list(samples)[:self.population_size] + parent = max(samples, key=lambda i: i.reward) + + individual = Individual(self.mutate_config(parent.config), None) + individual.reward = self.train_and_eval(individual.config) + try: + _lock.acquire() + self.population.append(individual) + self.population.popleft() + finally: + _lock.release() + + def _wait_for_futures_and_shutdown(self, futures, pool): + for i in futures: + try: + i.result() + except: + traceback.print_exc() + for k in futures: + k.cancel() + pool.shutdown(wait=True) + raise + pool.shutdown() + + def run(self): + # Initialize the population with random models. 
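+        # The two thread-pool phases below are a sketch of "Algorithm 1" in
+        # Real et al., "Regularized Evolution for Image Classifier
+        # Architecture Search": first train population_size random models to
+        # fill the population, then spend the remaining
+        # (cycles - population_size) trials on tournament selection plus
+        # mutation.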
+ pool = concurrent.futures.ThreadPoolExecutor(max_workers=self.concurrency) + fs = [pool.submit(self._run_random) for _ in range(self.population_size - len(self.population))] + self._wait_for_futures_and_shutdown(fs, pool) + + pool = concurrent.futures.ThreadPoolExecutor(max_workers=self.concurrency) + fs = [pool.submit(self._run_mutation) for _ in range(self.cycles - self.population_size)] + self._wait_for_futures_and_shutdown(fs, pool) From 5d38a5f77504dd3ae64c17d80aee4f7c195f8836 Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Wed, 10 Feb 2021 09:42:37 +0800 Subject: [PATCH 05/11] Some renaming --- docs/en_US/NAS/retiarii/Tutorial.rst | 4 ++-- docs/en_US/NAS/retiarii/WriteStrategy.rst | 6 ++++-- nni/retiarii/experiment.py | 2 +- nni/retiarii/strategies/__init__.py | 2 -- nni/retiarii/strategy/__init__.py | 3 +++ nni/retiarii/{strategies/strategy.py => strategy/base.py} | 0 nni/retiarii/{strategies => strategy}/bruteforce.py | 4 ++-- nni/retiarii/{strategies => strategy}/evolution.py | 0 nni/retiarii/{strategies => strategy}/tpe_strategy.py | 0 nni/retiarii/{strategies => strategy}/utils.py | 0 test/retiarii_test/darts/test.py | 7 +++---- test/retiarii_test/darts/test_oneshot.py | 2 +- test/retiarii_test/mnasnet/test.py | 6 +++--- test/retiarii_test/mnist/test.py | 6 +++--- 14 files changed, 22 insertions(+), 20 deletions(-) delete mode 100644 nni/retiarii/strategies/__init__.py create mode 100644 nni/retiarii/strategy/__init__.py rename nni/retiarii/{strategies/strategy.py => strategy/base.py} (100%) rename nni/retiarii/{strategies => strategy}/bruteforce.py (98%) rename nni/retiarii/{strategies => strategy}/evolution.py (100%) rename nni/retiarii/{strategies => strategy}/tpe_strategy.py (100%) rename nni/retiarii/{strategies => strategy}/utils.py (100%) diff --git a/docs/en_US/NAS/retiarii/Tutorial.rst b/docs/en_US/NAS/retiarii/Tutorial.rst index 5572133dd5..1f90e98bd7 100644 --- a/docs/en_US/NAS/retiarii/Tutorial.rst +++ b/docs/en_US/NAS/retiarii/Tutorial.rst @@ -188,7 +188,7 @@ Here is a simple example of using trainer and strategy. dataloader_kwargs={"batch_size": 32}, optimizer_kwargs={"lr": 1e-3}, trainer_kwargs={"max_epochs": 1}) - simple_startegy = RandomStrategy() + simple_strategy = RandomStrategy() Users can refer to `this document <./WriteTrainer.rst>`__ for how to write a new trainer, and refer to `this document <./WriteStrategy.rst>`__ for how to write a new strategy. @@ -199,7 +199,7 @@ After all the above are prepared, it is time to start an experiment to do the mo .. code-block:: python - exp = RetiariiExperiment(base_model, trainer, applied_mutators, simple_startegy) + exp = RetiariiExperiment(base_model, trainer, applied_mutators, simple_strategy) exp_config = RetiariiExeConfig('local') exp_config.experiment_name = 'mnasnet_search' exp_config.trial_concurrency = 2 diff --git a/docs/en_US/NAS/retiarii/WriteStrategy.rst b/docs/en_US/NAS/retiarii/WriteStrategy.rst index d8f10546f8..6f354b3b85 100644 --- a/docs/en_US/NAS/retiarii/WriteStrategy.rst +++ b/docs/en_US/NAS/retiarii/WriteStrategy.rst @@ -3,10 +3,12 @@ Customize A New Strategy To write a new strategy, you should inherit the base strategy class ``BaseStrategy``, then implement the member function ``run``. This member function takes ``base_model`` and ``applied_mutators`` as its input arguments. It can simply apply the user specified mutators in ``applied_mutators`` onto ``base_model`` to generate a new model. When a mutator is applied, it should be bound with a sampler (e.g., ``RandomSampler``). 
Every sampler implements the ``choice`` function which chooses value(s) from candidate values. The ``choice`` functions invoked in mutators are executed with the sampler. -Below is a very simple random strategy, the complete code can be found :githublink:`here `. +Below is a very simple random strategy, which makes the choices completely random. .. code-block:: python + from nni.retiarii import Sampler + class RandomSampler(Sampler): def choice(self, candidates, mutator, model, index): return random.choice(candidates) @@ -31,6 +33,6 @@ Below is a very simple random strategy, the complete code can be found :githubli else: time.sleep(2) -You can find that this strategy does not know the search space beforehand, it passively makes decisions every time ``choice`` is invoked from mutators. If a strategy wants to know the whole search space before making any decision (e.g., TPE, SMAC), it can use ``dry_run`` function provided by ``Mutator`` to obtain the space. An example strategy can be found :githublink:`here `. +You can find that this strategy does not know the search space beforehand, it passively makes decisions every time ``choice`` is invoked from mutators. If a strategy wants to know the whole search space before making any decision (e.g., TPE, SMAC), it can use ``dry_run`` function provided by ``Mutator`` to obtain the space. An example strategy can be found :githublink:`here `. After generating a new model, the strategy can use our provided APIs (e.g., ``submit_models``, ``is_stopped_exec``) to submit the model and get its reported results. More APIs can be found in `API References <./ApiReference.rst>`__. \ No newline at end of file diff --git a/nni/retiarii/experiment.py b/nni/retiarii/experiment.py index 6c535f9e15..a4db7c97f4 100644 --- a/nni/retiarii/experiment.py +++ b/nni/retiarii/experiment.py @@ -17,7 +17,7 @@ from .converter import convert_to_graph from .mutator import Mutator from .trainer.interface import BaseTrainer, BaseOneShotTrainer -from .strategies.strategy import BaseStrategy +from .strategy import BaseStrategy from .trainer import BaseOneShotTrainer _logger = logging.getLogger(__name__) diff --git a/nni/retiarii/strategies/__init__.py b/nni/retiarii/strategies/__init__.py deleted file mode 100644 index 62ef50bf4f..0000000000 --- a/nni/retiarii/strategies/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .tpe_strategy import TPEStrategy -from .random_strategy import RandomStrategy diff --git a/nni/retiarii/strategy/__init__.py b/nni/retiarii/strategy/__init__.py new file mode 100644 index 0000000000..ac88c11cbd --- /dev/null +++ b/nni/retiarii/strategy/__init__.py @@ -0,0 +1,3 @@ +from .base import BaseStrategy +from .bruteforce import Random, GridSearch +from .tpe_strategy import TPEStrategy diff --git a/nni/retiarii/strategies/strategy.py b/nni/retiarii/strategy/base.py similarity index 100% rename from nni/retiarii/strategies/strategy.py rename to nni/retiarii/strategy/base.py diff --git a/nni/retiarii/strategies/bruteforce.py b/nni/retiarii/strategy/bruteforce.py similarity index 98% rename from nni/retiarii/strategies/bruteforce.py rename to nni/retiarii/strategy/bruteforce.py index 097f1f6969..f29a60145f 100644 --- a/nni/retiarii/strategies/bruteforce.py +++ b/nni/retiarii/strategy/bruteforce.py @@ -6,7 +6,7 @@ from typing import Any, Dict, List from .. 
import Sampler, submit_models, query_available_resources -from .strategy import BaseStrategy +from .base import BaseStrategy from .utils import dry_run_for_search_space _logger = logging.getLogger(__name__) @@ -72,7 +72,7 @@ def choice(self, candidates, mutator, model, index): return random.choice(candidates) -class RandomStrategy(BaseStrategy): +class Random(BaseStrategy): def __init__(self, variational=False, dedup=True): self.variational = variational self.dedup = dedup diff --git a/nni/retiarii/strategies/evolution.py b/nni/retiarii/strategy/evolution.py similarity index 100% rename from nni/retiarii/strategies/evolution.py rename to nni/retiarii/strategy/evolution.py diff --git a/nni/retiarii/strategies/tpe_strategy.py b/nni/retiarii/strategy/tpe_strategy.py similarity index 100% rename from nni/retiarii/strategies/tpe_strategy.py rename to nni/retiarii/strategy/tpe_strategy.py diff --git a/nni/retiarii/strategies/utils.py b/nni/retiarii/strategy/utils.py similarity index 100% rename from nni/retiarii/strategies/utils.py rename to nni/retiarii/strategy/utils.py diff --git a/test/retiarii_test/darts/test.py b/test/retiarii_test/darts/test.py index 824230c0f7..d2c2b94f1a 100644 --- a/test/retiarii_test/darts/test.py +++ b/test/retiarii_test/darts/test.py @@ -4,8 +4,8 @@ import torch from pathlib import Path +from nni.retiarii import strategy from nni.retiarii.experiment import RetiariiExperiment, RetiariiExeConfig -from nni.retiarii.strategies import TPEStrategy, RandomStrategy from nni.retiarii.trainer.pytorch import PyTorchImageClassificationTrainer from darts_model import CNN @@ -18,10 +18,9 @@ optimizer_kwargs={"lr": 1e-3}, trainer_kwargs={"max_epochs": 1}) - #simple_startegy = TPEStrategy() - simple_startegy = RandomStrategy() + simple_strategy = strategy.Random() - exp = RetiariiExperiment(base_model, trainer, [], simple_startegy) + exp = RetiariiExperiment(base_model, trainer, [], simple_strategy) exp_config = RetiariiExeConfig('local') exp_config.experiment_name = 'darts_search' diff --git a/test/retiarii_test/darts/test_oneshot.py b/test/retiarii_test/darts/test_oneshot.py index 0b5a3279f2..ed56b2473f 100644 --- a/test/retiarii_test/darts/test_oneshot.py +++ b/test/retiarii_test/darts/test_oneshot.py @@ -9,7 +9,7 @@ from torchvision.datasets import CIFAR10 from nni.retiarii.experiment import RetiariiExperiment, RetiariiExeConfig -from nni.retiarii.strategies import TPEStrategy +from nni.retiarii.strategy import TPEStrategy from nni.retiarii.trainer.pytorch import DartsTrainer from darts_model import CNN diff --git a/test/retiarii_test/mnasnet/test.py b/test/retiarii_test/mnasnet/test.py index 829d1df10a..f67ff76bac 100644 --- a/test/retiarii_test/mnasnet/test.py +++ b/test/retiarii_test/mnasnet/test.py @@ -8,7 +8,7 @@ from base_mnasnet import MNASNet from nni.retiarii.experiment import RetiariiExperiment, RetiariiExeConfig -from nni.retiarii.strategies import TPEStrategy +from nni.retiarii.strategy import TPEStrategy from mutator import BlockMutator if __name__ == '__main__': @@ -31,9 +31,9 @@ applied_mutators.append(BlockMutator('mutable_0')) applied_mutators.append(BlockMutator('mutable_1')) - simple_startegy = TPEStrategy() + simple_strategy = TPEStrategy() - exp = RetiariiExperiment(base_model, trainer, applied_mutators, simple_startegy) + exp = RetiariiExperiment(base_model, trainer, applied_mutators, simple_strategy) exp_config = RetiariiExeConfig('local') exp_config.experiment_name = 'mnasnet_search' diff --git a/test/retiarii_test/mnist/test.py 
b/test/retiarii_test/mnist/test.py index 747dff3185..98d0e2e03f 100644 --- a/test/retiarii_test/mnist/test.py +++ b/test/retiarii_test/mnist/test.py @@ -2,8 +2,8 @@ import nni.retiarii.nn.pytorch as nn import torch.nn.functional as F +from nni.retiarii import strategy from nni.retiarii.experiment import RetiariiExeConfig, RetiariiExperiment -from nni.retiarii.strategies import RandomStrategy from nni.retiarii.trainer.pytorch import PyTorchImageClassificationTrainer @@ -37,9 +37,9 @@ def forward(self, x): optimizer_kwargs={"lr": 1e-3}, trainer_kwargs={"max_epochs": 1}) - simple_startegy = RandomStrategy() + simple_strategy = strategy.Random() - exp = RetiariiExperiment(base_model, trainer, [], simple_startegy) + exp = RetiariiExperiment(base_model, trainer, [], simple_strategy) exp_config = RetiariiExeConfig('local') exp_config.experiment_name = 'mnist_search' From 0a2e78137b63deadaa51d6421777682eaa6d6324 Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Wed, 10 Feb 2021 10:29:12 +0800 Subject: [PATCH 06/11] Evolution (checkpoint) --- nni/retiarii/strategy/bruteforce.py | 30 ++------- nni/retiarii/strategy/evolution.py | 96 ++++++++++++++++++++++++++- nni/retiarii/strategy/tpe_strategy.py | 2 +- nni/retiarii/strategy/utils.py | 18 ++++- 4 files changed, 120 insertions(+), 26 deletions(-) diff --git a/nni/retiarii/strategy/bruteforce.py b/nni/retiarii/strategy/bruteforce.py index f29a60145f..d606688fc9 100644 --- a/nni/retiarii/strategy/bruteforce.py +++ b/nni/retiarii/strategy/bruteforce.py @@ -7,12 +7,12 @@ from .. import Sampler, submit_models, query_available_resources from .base import BaseStrategy -from .utils import dry_run_for_search_space +from .utils import dry_run_for_search_space, get_targeted_model _logger = logging.getLogger(__name__) -def _generate_with_gridsearch(search_space: Dict[Any, List[Any]], shuffle=True): +def grid_generator(search_space: Dict[Any, List[Any]], shuffle=True): keys = list(search_space.keys()) search_space_values = copy.deepcopy(list(search_space.values())) if shuffle: @@ -22,7 +22,7 @@ def _generate_with_gridsearch(search_space: Dict[Any, List[Any]], shuffle=True): yield {key: value for key, value in zip(keys, values)} -def _generate_with_random(search_space: Dict[Any, List[Any]], dedup=True, retries=500): +def random_generator(search_space: Dict[Any, List[Any]], dedup=True, retries=500): keys = list(search_space.keys()) history = set() search_space_values = copy.deepcopy(list(search_space.values())) @@ -41,14 +41,6 @@ def _generate_with_random(search_space: Dict[Any, List[Any]], dedup=True, retrie yield {key: value for key, value in zip(keys, selected)} -class _FixedSampler(Sampler): - def __init__(self, sample): - self.sample = sample - - def choice(self, candidates, mutator, model, index): - return self.sample[(mutator, index)] - - class GridSearch(BaseStrategy): def __init__(self, shuffle=True): self._polling_interval = 2. @@ -56,15 +48,11 @@ def __init__(self, shuffle=True): def run(self, base_model, applied_mutators): search_space = dry_run_for_search_space(base_model, applied_mutators) - for sample in _generate_with_gridsearch(search_space, shuffle=self.shuffle): + for sample in grid_generator(search_space, shuffle=self.shuffle): _logger.info('New model created. Waiting for resource. 
%s', str(sample)) if query_available_resources() <= 0: time.sleep(self._polling_interval) - sampler = _FixedSampler(sample) - model = base_model - for mutator in applied_mutators: - model = mutator.bind_sampler(sampler).apply(model) - submit_models(model) + submit_models(get_targeted_model(base_model, applied_mutators, sample)) class _RandomSampler(Sampler): @@ -100,12 +88,8 @@ def run(self, base_model, applied_mutators): else: _logger.info('Random search running in fixed size mode. Dedup: %s.', 'on' if self.dedup else 'off') search_space = dry_run_for_search_space(base_model, applied_mutators) - for sample in _generate_with_random(search_space, dedup=self.dedup): + for sample in random_generator(search_space, dedup=self.dedup): _logger.info('New model created. Waiting for resource. %s', str(sample)) if query_available_resources() <= 0: time.sleep(self._polling_interval) - sampler = _FixedSampler(sample) - model = base_model - for mutator in applied_mutators: - model = mutator.bind_sampler(sampler).apply(model) - submit_models(model) + submit_models(get_targeted_model(base_model, applied_mutators, sample)) diff --git a/nni/retiarii/strategy/evolution.py b/nni/retiarii/strategy/evolution.py index 73a30299b7..d8232d5371 100644 --- a/nni/retiarii/strategy/evolution.py +++ b/nni/retiarii/strategy/evolution.py @@ -1,4 +1,98 @@ -# TODO: needs to be adapted to new API +import collections +import dataclasses +import logging +import random + +from ..execution import submit_models +from ..graph import ModelStatus +from .base import BaseStrategy +from .utils import dry_run_for_search_space, get_targeted_model + + +_logger = logging.getLogger(__name__) + + +@dataclasses.dataclass +class Individual: + x: dict + y: float + + +class Evolution(BaseStrategy): + def __init__(self, optimize_mode='maximize', population_size=100, sample_size=25, cycles=20000, + mutation_prob=0.05, on_failure='ignore'): + assert optimize_mode in ['maximize', 'minimize'] + assert on_failure in ['ignore', 'worst'] + assert sample_size < population_size + self.optimize_mode = optimize_mode + self.population_size = population_size + self.sample_size = sample_size + self.cycles = cycles + self.mutation_prob = mutation_prob + self.on_failure = on_failure + + self._worst = float('-inf') if self.optimize_mode == 'maximize' else float('inf') + + self._succeed_count = 0 + self._population = collections.deque() + self._running_models = [] + + def random(self, search_space): + return {k: random.choice(v) for k, v in search_space.items()} + + def mutate(self, config, search_space): + new_config = {} + for k, v in config.items(): + if random.uniform(0, 1) < self.mutation_prob: + # NOTE: we do not exclude the original choice here for simplicity, + # which is slightly different from the original paper. 
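+                # random.choice may re-draw the current value, so the
+                # effective probability that this key actually changes is
+                # mutation_prob * (1 - 1 / len(search_space[k])).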
+ new_config[k] = random.choice(search_space[k]) + else: + new_config[k] = v + return new_config + + def run(self, base_model, applied_mutators): + search_space = dry_run_for_search_space(base_model, applied_mutators) + # Run the first population regardless of the resources + _logger.info('Initializing the first population.') + while len(self._population) + len(self._running_models) <= self._population: + # try to submit new models + if len(self._population) + len(self._running_models) < self._population: + random_config = self.random(search_space) + random_model = get_targeted_model(base_model, applied_mutators, random_config) + submit_models(random_model) + self._running_models.append((random_config, random_model)) + # collect results + self._remove_failed_models_from_running_list() + self._move_succeeded_models_to_population() + + def _is_better(self, a, b): + if self.optimize_mode == 'maximize': + return a > b + else: + return a < b + + def _remove_failed_models_from_running_list(self): + if self.on_failure == 'ignore': + number_of_failed_models = len([g for g in self._running_models if g.status == ModelStatus.Failed]) + self._running_models = [g for g in self._running_models if g.status != ModelStatus.Failed] + _logger.info('%d failed models are ignored. Will retry.', number_of_failed_models) + + def _move_succeeded_models_to_population(self): + completed_indices = [] + for i, (config, model) in enumerate(self._running_models): + metric = None + if self.on_failure == 'worst' and model.status == ModelStatus.Failed: + metric = self._worst + elif model.status == ModelStatus.Trained: + metric = model.metric + if metric is not None: + self._population.append(Individual(config, metric)) + if len(self._population) >= self.population_size: + self._population.popleft() + completed_indices.append(i) + for i in completed_indices: + self._running_models.pop(i) class RegularizedEvolution: diff --git a/nni/retiarii/strategy/tpe_strategy.py b/nni/retiarii/strategy/tpe_strategy.py index 1a63d4f8ad..8d823bae11 100644 --- a/nni/retiarii/strategy/tpe_strategy.py +++ b/nni/retiarii/strategy/tpe_strategy.py @@ -4,7 +4,7 @@ from nni.algorithms.hpo.hyperopt_tuner import HyperoptTuner from .. 
import Sampler, submit_models, query_available_resources, is_stopped_exec -from .strategy import BaseStrategy +from .base import BaseStrategy _logger = logging.getLogger(__name__) diff --git a/nni/retiarii/strategy/utils.py b/nni/retiarii/strategy/utils.py index 23552ca416..32219c2b0b 100644 --- a/nni/retiarii/strategy/utils.py +++ b/nni/retiarii/strategy/utils.py @@ -1,7 +1,15 @@ import collections from typing import Dict, Any, List from ..graph import Model -from ..mutator import Mutator +from ..mutator import Mutator, Sampler + + +class _FixedSampler(Sampler): + def __init__(self, sample): + self.sample = sample + + def choice(self, candidates, mutator, model, index): + return self.sample[(mutator, index)] def dry_run_for_search_space(model: Model, mutators: List[Mutator]) -> Dict[Any, List[Any]]: @@ -11,3 +19,11 @@ def dry_run_for_search_space(model: Model, mutators: List[Mutator]) -> Dict[Any, for i, candidates in recorded_candidates: search_space[(id(mutator), i)] = candidates return search_space + + +def get_targeted_model(base_model: Model, mutators: List[Mutator], sample: dict) -> Model: + sampler = _FixedSampler(sample) + model = base_model + for mutator in mutators: + model = mutator.bind_sampler(sampler).apply(model) + return model From d1e20b5846e52fb9f672d9d62734d561e2abfd48 Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Wed, 10 Feb 2021 11:35:24 +0800 Subject: [PATCH 07/11] Refine evolution --- nni/retiarii/strategy/evolution.py | 162 ++++++++--------------------- 1 file changed, 41 insertions(+), 121 deletions(-) diff --git a/nni/retiarii/strategy/evolution.py b/nni/retiarii/strategy/evolution.py index d8232d5371..9236f9c8cf 100644 --- a/nni/retiarii/strategy/evolution.py +++ b/nni/retiarii/strategy/evolution.py @@ -2,6 +2,7 @@ import dataclasses import logging import random +import time from ..execution import submit_models from ..graph import ModelStatus @@ -33,23 +34,34 @@ def __init__(self, optimize_mode='maximize', population_size=100, sample_size=25 self._worst = float('-inf') if self.optimize_mode == 'maximize' else float('inf') - self._succeed_count = 0 + self._success_count = 0 self._population = collections.deque() self._running_models = [] + self._polling_interval = 2. def random(self, search_space): return {k: random.choice(v) for k, v in search_space.items()} - def mutate(self, config, search_space): - new_config = {} - for k, v in config.items(): + def mutate(self, parent, search_space): + child = {} + for k, v in parent.items(): if random.uniform(0, 1) < self.mutation_prob: # NOTE: we do not exclude the original choice here for simplicity, # which is slightly different from the original paper. 
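+                # Keys are resampled independently, so on average about
+                # mutation_prob * len(parent) keys are re-drawn per child.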
- new_config[k] = random.choice(search_space[k]) + child[k] = random.choice(search_space[k]) else: - new_config[k] = v - return new_config + child[k] = v + return child + + def best_parent(self): + samples = [p for p in self.population] # copy population + random.shuffle(samples) + samples = list(samples)[:self.sample_size] + if self.optimize_mode == 'maximize': + parent = max(samples, key=lambda sample: sample.y) + else: + parent = min(samples, key=lambda sample: sample.y) + return parent.x def run(self, base_model, applied_mutators): search_space = dry_run_for_search_space(base_model, applied_mutators) @@ -58,19 +70,30 @@ def run(self, base_model, applied_mutators): while len(self._population) + len(self._running_models) <= self._population: # try to submit new models if len(self._population) + len(self._running_models) < self._population: - random_config = self.random(search_space) - random_model = get_targeted_model(base_model, applied_mutators, random_config) - submit_models(random_model) - self._running_models.append((random_config, random_model)) + config = self.random(search_space) + self._submit_config(config, base_model, applied_mutators) # collect results self._remove_failed_models_from_running_list() self._move_succeeded_models_to_population() + time.sleep(self._polling_interval) - def _is_better(self, a, b): - if self.optimize_mode == 'maximize': - return a > b - else: - return a < b + # Resource-aware mutation of models + _logger.info('Running mutations.') + while self._success_count + len(self._running_models) <= self.cycles: + # try to submit new models + if self._success_count + len(self._running_models) < self.cycles: + config = self.mutate(self.best_parent(), search_space) + self._submit_config(config, base_model, applied_mutators) + # collect results + self._remove_failed_models_from_running_list() + self._move_succeeded_models_to_population() + time.sleep(self._polling_interval) + + def _submit_config(self, config, base_model, mutators): + model = get_targeted_model(base_model, mutators, config) + submit_models(model) + self._running_models.append((config, model)) + return model def _remove_failed_models_from_running_list(self): if self.on_failure == 'ignore': @@ -90,110 +113,7 @@ def _move_succeeded_models_to_population(self): self._population.append(Individual(config, metric)) if len(self._population) >= self.population_size: self._population.popleft() - completed_indices.append(i) + completed_indices.append(i) for i in completed_indices: + self._success_count += 1 self._running_models.pop(i) - - -class RegularizedEvolution: - def __init__(self, search_space, - concurrency, population_size, sample_size, cycles, mutation_prob, - reward_fn, command, setup): - self.search_space = search_space - self.concurrency = concurrency - self.population_size = population_size - self.command = command - self.setup = setup - self.population_size = population_size - self.sample_size = sample_size - self.cycles = cycles - self.mutation_prob = mutation_prob - self.reward_fn = reward_fn - assert self.cycles >= self.population_size >= self.sample_size - - self.population = collections.deque() - - def train_and_eval(self, config): - pid = get_trial_manager().submit_new_trial(self.command, config, self.setup) - - while True: - try: - metrics = get_trial_manager().query_metrics(pid) - if metrics is not None: - break - time.sleep(5) - continue - except TrialFailed: - _logger.warning(f'Config: {config}. 
Trial failed and use -inf as metrics.') - metrics = float('-inf') - break - return self.reward_fn(config, metrics) - - def random_config(self): - config = {} - for k, v in SearchSpaceUtils.flatten_search_space(self.search_space).items(): - config[k] = v.random() - _logger.info(f'Generated random config: {config}') - return SearchSpaceUtils.restore_config(config, self.search_space) - - def mutate_config(self, parent_config): - parent_config = SearchSpaceUtils.flatten_config(parent_config) - config = {} - for k, v in SearchSpaceUtils.flatten_search_space(self.search_space).items(): - config[k] = parent_config[k] - if random.uniform(0, 1) < self.mutation_prob: - config[k] = v.random(excludes=[parent_config[k]]) - _logger.info(f'Generated mutated config: {config}') - return SearchSpaceUtils.restore_config(config, self.search_space) - - def import_(self, individuals): - self.individuals = sorted(individuals, key=lambda i: i.reward)[-self.population_size:] - random.shuffle(self.individuals) - _logger.info(f'Imported individuals: {self.individuals}') - - def _run_random(self): - individual = Individual(self.random_config(), None) - individual.reward = self.train_and_eval(individual.config) - self.population.append(individual) - - def _run_mutation(self): - # Sample randomly chosen models from the current population. - try: - _lock.acquire() - samples = copy.deepcopy(self.population) - finally: - _lock.release() - random.shuffle(samples) - samples = list(samples)[:self.population_size] - parent = max(samples, key=lambda i: i.reward) - - individual = Individual(self.mutate_config(parent.config), None) - individual.reward = self.train_and_eval(individual.config) - try: - _lock.acquire() - self.population.append(individual) - self.population.popleft() - finally: - _lock.release() - - def _wait_for_futures_and_shutdown(self, futures, pool): - for i in futures: - try: - i.result() - except: - traceback.print_exc() - for k in futures: - k.cancel() - pool.shutdown(wait=True) - raise - pool.shutdown() - - def run(self): - # Initialize the population with random models. 
- pool = concurrent.futures.ThreadPoolExecutor(max_workers=self.concurrency) - fs = [pool.submit(self._run_random) for _ in range(self.population_size - len(self.population))] - self._wait_for_futures_and_shutdown(fs, pool) - - pool = concurrent.futures.ThreadPoolExecutor(max_workers=self.concurrency) - fs = [pool.submit(self._run_mutation) for _ in range(self.cycles - self.population_size)] - self._wait_for_futures_and_shutdown(fs, pool) From 48be03e4887202dbe261088684d73b3b8d94d23c Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Wed, 10 Feb 2021 16:12:38 +0800 Subject: [PATCH 08/11] Refine evolution implementation --- nni/retiarii/strategy/evolution.py | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/nni/retiarii/strategy/evolution.py b/nni/retiarii/strategy/evolution.py index 9236f9c8cf..0585953449 100644 --- a/nni/retiarii/strategy/evolution.py +++ b/nni/retiarii/strategy/evolution.py @@ -4,7 +4,7 @@ import random import time -from ..execution import submit_models +from ..execution import query_available_resources, submit_models from ..graph import ModelStatus from .base import BaseStrategy from .utils import dry_run_for_search_space, get_targeted_model @@ -65,7 +65,7 @@ def best_parent(self): def run(self, base_model, applied_mutators): search_space = dry_run_for_search_space(base_model, applied_mutators) - # Run the first population regardless of the resources + # Run the first population regardless concurrency _logger.info('Initializing the first population.') while len(self._population) + len(self._running_models) <= self._population: # try to submit new models @@ -73,20 +73,20 @@ def run(self, base_model, applied_mutators): config = self.random(search_space) self._submit_config(config, base_model, applied_mutators) # collect results - self._remove_failed_models_from_running_list() self._move_succeeded_models_to_population() + self._remove_failed_models_from_running_list() time.sleep(self._polling_interval) # Resource-aware mutation of models _logger.info('Running mutations.') while self._success_count + len(self._running_models) <= self.cycles: # try to submit new models - if self._success_count + len(self._running_models) < self.cycles: + if query_available_resources() > 0 and self._success_count + len(self._running_models) < self.cycles: config = self.mutate(self.best_parent(), search_space) self._submit_config(config, base_model, applied_mutators) # collect results - self._remove_failed_models_from_running_list() self._move_succeeded_models_to_population() + self._remove_failed_models_from_running_list() time.sleep(self._polling_interval) def _submit_config(self, config, base_model, mutators): @@ -95,12 +95,6 @@ def _submit_config(self, config, base_model, mutators): self._running_models.append((config, model)) return model - def _remove_failed_models_from_running_list(self): - if self.on_failure == 'ignore': - number_of_failed_models = len([g for g in self._running_models if g.status == ModelStatus.Failed]) - self._running_models = [g for g in self._running_models if g.status != ModelStatus.Failed] - _logger.info('%d failed models are ignored. 
Will retry.', number_of_failed_models) - def _move_succeeded_models_to_population(self): completed_indices = [] for i, (config, model) in enumerate(self._running_models): @@ -110,10 +104,19 @@ def _move_succeeded_models_to_population(self): elif model.status == ModelStatus.Trained: metric = model.metric if metric is not None: - self._population.append(Individual(config, metric)) + individual = Individual(config, metric) + _logger.info('New individual created: %s', str(individual)) + self._population.append(individual) if len(self._population) >= self.population_size: self._population.popleft() completed_indices.append(i) for i in completed_indices: self._success_count += 1 self._running_models.pop(i) + + def _remove_failed_models_from_running_list(self): + # this is only done when on_failure policy is set to "ignore". + if self.on_failure == 'ignore': + number_of_failed_models = len([g for g in self._running_models if g.status == ModelStatus.Failed]) + self._running_models = [g for g in self._running_models if g.status != ModelStatus.Failed] + _logger.info('%d failed models are ignored. Will retry.', number_of_failed_models) From 2179bfd3e4bd9c6b4da98cd8f60dffa00cc81cb5 Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Mon, 22 Feb 2021 14:38:02 +0800 Subject: [PATCH 09/11] Add unittests --- nni/retiarii/strategy/__init__.py | 1 + nni/retiarii/strategy/bruteforce.py | 20 ++++ nni/retiarii/strategy/evolution.py | 59 +++++++++--- nni/retiarii/strategy/utils.py | 4 +- test/ut/retiarii/test_strategy.py | 139 ++++++++++++++++++++++++++++ 5 files changed, 209 insertions(+), 14 deletions(-) create mode 100644 test/ut/retiarii/test_strategy.py diff --git a/nni/retiarii/strategy/__init__.py b/nni/retiarii/strategy/__init__.py index ac88c11cbd..af8810873b 100644 --- a/nni/retiarii/strategy/__init__.py +++ b/nni/retiarii/strategy/__init__.py @@ -1,3 +1,4 @@ from .base import BaseStrategy from .bruteforce import Random, GridSearch +from .evolution import RegularizedEvolution from .tpe_strategy import TPEStrategy diff --git a/nni/retiarii/strategy/bruteforce.py b/nni/retiarii/strategy/bruteforce.py index d606688fc9..333aaf2be6 100644 --- a/nni/retiarii/strategy/bruteforce.py +++ b/nni/retiarii/strategy/bruteforce.py @@ -42,6 +42,15 @@ def random_generator(search_space: Dict[Any, List[Any]], dedup=True, retries=500 class GridSearch(BaseStrategy): + """ + Traverse the search space and try all the possible combinations one by one. + + Parameters + ---------- + shuffle : bool + Shuffle the order in a candidate list, so that they are tried in a random order. + """ + def __init__(self, shuffle=True): self._polling_interval = 2. self.shuffle = shuffle @@ -61,6 +70,17 @@ def choice(self, candidates, mutator, model, index): class Random(BaseStrategy): + """ + Random search on the search space. + + Parameters + ---------- + variational : bool + Do not dry run to get the full search space. Used when the search space has variational size or candidates. Default: false. + dedup : bool + Do not try the same configuration twice. When variational is true, deduplication is not supported. Default: true. + """ + def __init__(self, variational=False, dedup=True): self.variational = variational self.dedup = dedup diff --git a/nni/retiarii/strategy/evolution.py b/nni/retiarii/strategy/evolution.py index 0585953449..6f7ca5bd03 100644 --- a/nni/retiarii/strategy/evolution.py +++ b/nni/retiarii/strategy/evolution.py @@ -15,11 +15,36 @@ @dataclasses.dataclass class Individual: + """ + A class that represents an individual. 
+ Holds two attributes, where ``x`` is the model and ``y`` is the metric (e.g., accuracy). + """ x: dict y: float -class Evolution(BaseStrategy): +class RegularizedEvolution(BaseStrategy): + """ + Algorithm for regularized evolution (i.e. aging evolution). + Follows "Algorithm 1" in Real et al. "Regularized Evolution for Image Classifier Architecture Search". + + Parameters + ---------- + optimize_mode : str + Can be one of "maximize" and "minimize". + population_size : int + The number of individuals to keep in the population. + cycles : int + The number of cycles (trials) the algorithm should run for. + sample_size : int + The number of individuals that should participate in each tournament. + mutation_prob : float + Probability that mutation happens in each dim. + on_failure : str + Can be one of "ignore" and "worst". If "ignore", simply give up the model and find a new one. + If "worst", mark the model as -inf (if maximize, inf if minimize), so that the algorithm "learns" to avoid such model. + """ + def __init__(self, optimize_mode='maximize', population_size=100, sample_size=25, cycles=20000, mutation_prob=0.05, on_failure='ignore'): assert optimize_mode in ['maximize', 'minimize'] @@ -54,7 +79,7 @@ def mutate(self, parent, search_space): return child def best_parent(self): - samples = [p for p in self.population] # copy population + samples = [p for p in self._population] # copy population random.shuffle(samples) samples = list(samples)[:self.sample_size] if self.optimize_mode == 'maximize': @@ -67,9 +92,9 @@ def run(self, base_model, applied_mutators): search_space = dry_run_for_search_space(base_model, applied_mutators) # Run the first population regardless concurrency _logger.info('Initializing the first population.') - while len(self._population) + len(self._running_models) <= self._population: + while len(self._population) + len(self._running_models) <= self.population_size: # try to submit new models - if len(self._population) + len(self._running_models) < self._population: + while len(self._population) + len(self._running_models) < self.population_size: config = self.random(search_space) self._submit_config(config, base_model, applied_mutators) # collect results @@ -77,11 +102,14 @@ def run(self, base_model, applied_mutators): self._remove_failed_models_from_running_list() time.sleep(self._polling_interval) + if len(self._population) >= self.population_size: + break + # Resource-aware mutation of models _logger.info('Running mutations.') while self._success_count + len(self._running_models) <= self.cycles: # try to submit new models - if query_available_resources() > 0 and self._success_count + len(self._running_models) < self.cycles: + while query_available_resources() > 0 and self._success_count + len(self._running_models) < self.cycles: config = self.mutate(self.best_parent(), search_space) self._submit_config(config, base_model, applied_mutators) # collect results @@ -89,7 +117,11 @@ def run(self, base_model, applied_mutators): self._remove_failed_models_from_running_list() time.sleep(self._polling_interval) + if self._success_count >= self.cycles: + break + def _submit_config(self, config, base_model, mutators): + _logger.info('Model submitted to running queue: %s', config) model = get_targeted_model(base_model, mutators, config) submit_models(model) self._running_models.append((config, model)) @@ -105,18 +137,21 @@ def _move_succeeded_models_to_population(self): metric = model.metric if metric is not None: individual = Individual(config, metric) - _logger.info('New 
individual created: %s', str(individual)) + _logger.info('Individual created: %s', str(individual)) self._population.append(individual) - if len(self._population) >= self.population_size: + if len(self._population) > self.population_size: self._population.popleft() completed_indices.append(i) - for i in completed_indices: + for i in completed_indices[::-1]: + # delete from end to start so that the index number will not be affected. self._success_count += 1 self._running_models.pop(i) def _remove_failed_models_from_running_list(self): - # this is only done when on_failure policy is set to "ignore". + # This is only done when on_failure policy is set to "ignore". + # Otherwise, failed models will be treated as inf when processed. if self.on_failure == 'ignore': - number_of_failed_models = len([g for g in self._running_models if g.status == ModelStatus.Failed]) - self._running_models = [g for g in self._running_models if g.status != ModelStatus.Failed] - _logger.info('%d failed models are ignored. Will retry.', number_of_failed_models) + number_of_failed_models = len([g for g in self._running_models if g[1].status == ModelStatus.Failed]) + self._running_models = [g for g in self._running_models if g[1].status != ModelStatus.Failed] + if number_of_failed_models > 0: + _logger.info('%d failed models are ignored. Will retry.', number_of_failed_models) diff --git a/nni/retiarii/strategy/utils.py b/nni/retiarii/strategy/utils.py index 32219c2b0b..c1055d1707 100644 --- a/nni/retiarii/strategy/utils.py +++ b/nni/retiarii/strategy/utils.py @@ -16,8 +16,8 @@ def dry_run_for_search_space(model: Model, mutators: List[Mutator]) -> Dict[Any, search_space = collections.OrderedDict() for mutator in mutators: recorded_candidates, model = mutator.dry_run(model) - for i, candidates in recorded_candidates: - search_space[(id(mutator), i)] = candidates + for i, candidates in enumerate(recorded_candidates): + search_space[(mutator, i)] = candidates return search_space diff --git a/test/ut/retiarii/test_strategy.py b/test/ut/retiarii/test_strategy.py new file mode 100644 index 0000000000..5f5fe42208 --- /dev/null +++ b/test/ut/retiarii/test_strategy.py @@ -0,0 +1,139 @@ +import random +import time +import threading +from typing import * + +import nni.retiarii.execution.api +import nni.retiarii.nn.pytorch as nn +import nni.retiarii.strategy as strategy +import torch +import torch.nn.functional as F +from nni.retiarii import Model +from nni.retiarii.converter import convert_to_graph +from nni.retiarii.execution import wait_models +from nni.retiarii.execution.interface import AbstractExecutionEngine, WorkerInfo, MetricData, AbstractGraphListener +from nni.retiarii.graph import DebugTraining, ModelStatus +from nni.retiarii.nn.pytorch.mutator import process_inline_mutation + + +class MockExecutionEngine(AbstractExecutionEngine): + def __init__(self, failure_prob=0.): + self.models = [] + self.failure_prob = failure_prob + self._resource_left = 4 + + def _model_complete(self, model: Model): + time.sleep(random.uniform(0, 1)) + if random.uniform(0, 1) < self.failure_prob: + model.status = ModelStatus.Failed + else: + model.metric = random.uniform(0, 1) + model.status = ModelStatus.Trained + self._resource_left += 1 + + def submit_models(self, *models: Model) -> None: + for model in models: + self.models.append(model) + self._resource_left -= 1 + threading.Thread(target=self._model_complete, args=(model, )).start() + + def query_available_resource(self) -> Union[List[WorkerInfo], int]: + return self._resource_left + + def 
diff --git a/test/ut/retiarii/test_strategy.py b/test/ut/retiarii/test_strategy.py
new file mode 100644
index 0000000000..5f5fe42208
--- /dev/null
+++ b/test/ut/retiarii/test_strategy.py
@@ -0,0 +1,139 @@
+import random
+import time
+import threading
+from typing import List, Union
+
+import nni.retiarii.execution.api
+import nni.retiarii.nn.pytorch as nn
+import nni.retiarii.strategy as strategy
+import torch
+import torch.nn.functional as F
+from nni.retiarii import Model
+from nni.retiarii.converter import convert_to_graph
+from nni.retiarii.execution import wait_models
+from nni.retiarii.execution.interface import AbstractExecutionEngine, WorkerInfo, MetricData, AbstractGraphListener
+from nni.retiarii.graph import DebugTraining, ModelStatus
+from nni.retiarii.nn.pytorch.mutator import process_inline_mutation
+
+
+class MockExecutionEngine(AbstractExecutionEngine):
+    def __init__(self, failure_prob=0.):
+        self.models = []
+        self.failure_prob = failure_prob
+        self._resource_left = 4
+
+    def _model_complete(self, model: Model):
+        time.sleep(random.uniform(0, 1))
+        if random.uniform(0, 1) < self.failure_prob:
+            model.status = ModelStatus.Failed
+        else:
+            model.metric = random.uniform(0, 1)
+            model.status = ModelStatus.Trained
+        self._resource_left += 1
+
+    def submit_models(self, *models: Model) -> None:
+        for model in models:
+            self.models.append(model)
+            self._resource_left -= 1
+            threading.Thread(target=self._model_complete, args=(model, )).start()
+
+    def query_available_resource(self) -> Union[List[WorkerInfo], int]:
+        return self._resource_left
+
+    def register_graph_listener(self, listener: AbstractGraphListener) -> None:
+        pass
+
+    def trial_execute_graph(cls) -> MetricData:
+        pass
+
+
+def _reset_execution_engine(engine=None):
+    nni.retiarii.execution.api._execution_engine = engine
+
+
+class Net(nn.Module):
+    def __init__(self, hidden_size=32):
+        super(Net, self).__init__()
+        self.conv1 = nn.Conv2d(1, 20, 5, 1)
+        self.conv2 = nn.Conv2d(20, 50, 5, 1)
+        self.fc1 = nn.LayerChoice([
+            nn.Linear(4*4*50, hidden_size, bias=True),
+            nn.Linear(4*4*50, hidden_size, bias=False)
+        ])
+        self.fc2 = nn.LayerChoice([
+            nn.Linear(hidden_size, 10, bias=False),
+            nn.Linear(hidden_size, 10, bias=True)
+        ])
+
+    def forward(self, x):
+        x = F.relu(self.conv1(x))
+        x = F.max_pool2d(x, 2, 2)
+        x = F.relu(self.conv2(x))
+        x = F.max_pool2d(x, 2, 2)
+        x = x.view(-1, 4*4*50)
+        x = F.relu(self.fc1(x))
+        x = self.fc2(x)
+        return F.log_softmax(x, dim=1)
+
+
+def _get_model_and_mutators():
+    base_model = Net()
+    script_module = torch.jit.script(base_model)
+    base_model_ir = convert_to_graph(script_module, base_model)
+    base_model_ir.training_config = DebugTraining()
+    mutators = process_inline_mutation(base_model_ir)
+    return base_model_ir, mutators
+
+
+def test_grid_search():
+    gridsearch = strategy.GridSearch()
+    engine = MockExecutionEngine()
+    _reset_execution_engine(engine)
+    gridsearch.run(*_get_model_and_mutators())
+    wait_models(*engine.models)
+    selection = set()
+    for model in engine.models:
+        selection.add((
+            model.get_node_by_name('_model__fc1').operation.parameters['bias'],
+            model.get_node_by_name('_model__fc2').operation.parameters['bias']
+        ))
+    assert len(selection) == 4
+    _reset_execution_engine()
+
+
+def test_random_search():
+    random_search = strategy.Random()  # named to avoid shadowing the random module
+    engine = MockExecutionEngine()
+    _reset_execution_engine(engine)
+    random_search.run(*_get_model_and_mutators())
+    wait_models(*engine.models)
+    selection = set()
+    for model in engine.models:
+        selection.add((
+            model.get_node_by_name('_model__fc1').operation.parameters['bias'],
+            model.get_node_by_name('_model__fc2').operation.parameters['bias']
+        ))
+    assert len(selection) == 4
+    _reset_execution_engine()
+
+
+def test_evolution():
+    evolution = strategy.RegularizedEvolution(population_size=5, sample_size=3, cycles=10, mutation_prob=0.5, on_failure='ignore')
+    engine = MockExecutionEngine(failure_prob=0.2)
+    _reset_execution_engine(engine)
+    evolution.run(*_get_model_and_mutators())
+    wait_models(*engine.models)
+    _reset_execution_engine()
+
+    evolution = strategy.RegularizedEvolution(population_size=5, sample_size=3, cycles=10, mutation_prob=0.5, on_failure='worst')
+    engine = MockExecutionEngine(failure_prob=0.4)
+    _reset_execution_engine(engine)
+    evolution.run(*_get_model_and_mutators())
+    wait_models(*engine.models)
+    _reset_execution_engine()
+
+
+if __name__ == '__main__':
+    test_grid_search()
+    test_random_search()
+    test_evolution()
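
Because the mock engine keeps every submitted model in engine.models, failure handling can also be asserted on directly. A short sketch, assuming engine is a MockExecutionEngine from one of the runs above (illustrative only, not part of the test file):

    # Count the outcomes recorded by the mock engine after a strategy run.
    failed = sum(1 for model in engine.models if model.status == ModelStatus.Failed)
    trained = sum(1 for model in engine.models if model.status == ModelStatus.Trained)
    print('%d trained, %d failed, %d submitted in total' % (trained, failed, len(engine.models)))
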
From 6316f771ff52daf013627ab49f6f6e27a213f602 Mon Sep 17 00:00:00 2001
From: Yuge Zhang
Date: Mon, 22 Feb 2021 14:53:51 +0800
Subject: [PATCH 10/11] Refine documents

---
 docs/en_US/NAS/retiarii/ApiReference.rst | 10 ++++++++--
 docs/en_US/NAS/retiarii/Tutorial.rst     |  6 +++---
 nni/retiarii/graph.py                    |  2 +-
 3 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/docs/en_US/NAS/retiarii/ApiReference.rst b/docs/en_US/NAS/retiarii/ApiReference.rst
index 67315e195a..1e49b8fcef 100644
--- a/docs/en_US/NAS/retiarii/ApiReference.rst
+++ b/docs/en_US/NAS/retiarii/ApiReference.rst
@@ -72,10 +72,16 @@ Oneshot Trainers
 Strategies
 ----------
 
-.. autoclass:: nni.retiarii.strategies.RandomStrategy
+.. autoclass:: nni.retiarii.strategy.Random
    :members:
 
-.. autoclass:: nni.retiarii.strategies.TPEStrategy
+.. autoclass:: nni.retiarii.strategy.GridSearch
+   :members:
+
+.. autoclass:: nni.retiarii.strategy.RegularizedEvolution
+   :members:
+
+.. autoclass:: nni.retiarii.strategy.TPEStrategy
    :members:
 
 Retiarii Experiments

diff --git a/docs/en_US/NAS/retiarii/Tutorial.rst b/docs/en_US/NAS/retiarii/Tutorial.rst
index 36f7949ced..9a9305cf96 100644
--- a/docs/en_US/NAS/retiarii/Tutorial.rst
+++ b/docs/en_US/NAS/retiarii/Tutorial.rst
@@ -167,13 +167,13 @@ In the following table, we listed the available trainers and strategies.
      - TPEStrategy
      - DartsTrainer
    * - Regression
-     - RandomStrategy
+     - Random
      - EnasTrainer
    * -
-     -
+     - GridSearch
      - ProxylessTrainer
    * -
-     -
+     - RegularizedEvolution
      - SinglePathTrainer (RandomTrainer)
 
 Their usage and API documentation can be found `here <./ApiReference>`__\.

diff --git a/nni/retiarii/graph.py b/nni/retiarii/graph.py
index f8a99b7eb9..fa1b136ff4 100644
--- a/nni/retiarii/graph.py
+++ b/nni/retiarii/graph.py
@@ -131,7 +131,7 @@ def fork(self) -> 'Model':
         new_model = Model(_internal=True)
         new_model._root_graph_name = self._root_graph_name
         new_model.graphs = {name: graph._fork_to(new_model) for name, graph in self.graphs.items()}
-        new_model.training_config = copy.deepcopy(self.training_config)
+        new_model.training_config = copy.deepcopy(self.training_config)  # TODO: this may be costly when the training config is large
         new_model.history = self.history + [self]
         return new_model

From 0f55c857c2f8f58919cdbe8b71b5962b6ef06522 Mon Sep 17 00:00:00 2001
From: Yuge Zhang
Date: Mon, 22 Feb 2021 17:56:36 +0800
Subject: [PATCH 11/11] Add default in docstring

---
 nni/retiarii/strategy/bruteforce.py |  2 +-
 nni/retiarii/strategy/evolution.py  | 11 ++++++-----
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/nni/retiarii/strategy/bruteforce.py b/nni/retiarii/strategy/bruteforce.py
index 333aaf2be6..a7d965fe91 100644
--- a/nni/retiarii/strategy/bruteforce.py
+++ b/nni/retiarii/strategy/bruteforce.py
@@ -48,7 +48,7 @@ class GridSearch(BaseStrategy):
     Parameters
     ----------
     shuffle : bool
-        Shuffle the order in a candidate list, so that they are tried in a random order.
+        Shuffle the order of each candidate list, so that candidates are tried in a random order. Default: true.
     """
 
     def __init__(self, shuffle=True):
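
To make the documented default concrete, a usage sketch (illustrative only, not part of the patch):

    import nni.retiarii.strategy as strategy

    grid = strategy.GridSearch()                        # shuffle defaults to True: candidates tried in random order
    ordered_grid = strategy.GridSearch(shuffle=False)   # try candidates in their declared order instead
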
diff --git a/nni/retiarii/strategy/evolution.py b/nni/retiarii/strategy/evolution.py
index 6f7ca5bd03..fa365a8382 100644
--- a/nni/retiarii/strategy/evolution.py
+++ b/nni/retiarii/strategy/evolution.py
@@ -31,18 +31,19 @@ class RegularizedEvolution(BaseStrategy):
     Parameters
     ----------
     optimize_mode : str
-        Can be one of "maximize" and "minimize".
+        Can be one of "maximize" and "minimize". Default: maximize.
     population_size : int
-        The number of individuals to keep in the population.
+        The number of individuals to keep in the population. Default: 100.
     cycles : int
-        The number of cycles (trials) the algorithm should run for.
+        The number of cycles (trials) the algorithm should run for. Default: 20000.
     sample_size : int
-        The number of individuals that should participate in each tournament.
+        The number of individuals that should participate in each tournament. Default: 25.
     mutation_prob : float
-        Probability that mutation happens in each dim.
+        Probability that mutation happens in each dimension. Default: 0.05.
     on_failure : str
         Can be one of "ignore" and "worst". If "ignore", simply give up the model and find a new one.
         If "worst", mark the model as -inf (if maximize, inf if minimize), so that the algorithm "learns" to avoid such models.
+        Default: ignore.
     """
 
     def __init__(self, optimize_mode='maximize', population_size=100, sample_size=25, cycles=20000,
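
A usage sketch of the defaults documented above (illustrative only, not part of the patch; the keyword values mirror the docstring):

    import nni.retiarii.strategy as strategy

    # Equivalent to strategy.RegularizedEvolution() with all defaults spelled out.
    evolution = strategy.RegularizedEvolution(
        optimize_mode='maximize',
        population_size=100,
        sample_size=25,
        cycles=20000,
        mutation_prob=0.05,
        on_failure='ignore',
    )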