Skip to content
This repository has been archived by the owner on Sep 18, 2024. It is now read-only.

Update nni exp dir #2686

Merged
merged 76 commits into from
Jul 17, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
76 commits
Select commit Hold shift + click to select a range
3a45961
Merge pull request #31 from microsoft/master
chicm-ms Aug 6, 2019
633db43
Merge pull request #32 from microsoft/master
chicm-ms Sep 9, 2019
3e926f1
Merge pull request #33 from microsoft/master
chicm-ms Oct 8, 2019
f173789
Merge pull request #34 from microsoft/master
chicm-ms Oct 9, 2019
508850a
Merge pull request #35 from microsoft/master
chicm-ms Oct 9, 2019
5a0e9c9
Merge pull request #36 from microsoft/master
chicm-ms Oct 10, 2019
e7df061
Merge pull request #37 from microsoft/master
chicm-ms Oct 23, 2019
2175cef
Merge pull request #38 from microsoft/master
chicm-ms Oct 29, 2019
2ccbfbb
Merge pull request #39 from microsoft/master
chicm-ms Oct 30, 2019
b29cb0b
Merge pull request #40 from microsoft/master
chicm-ms Oct 30, 2019
4a3ba83
Merge pull request #41 from microsoft/master
chicm-ms Nov 4, 2019
c8a1148
Merge pull request #42 from microsoft/master
chicm-ms Nov 4, 2019
73c6101
Merge pull request #43 from microsoft/master
chicm-ms Nov 5, 2019
6a518a9
Merge pull request #44 from microsoft/master
chicm-ms Nov 11, 2019
a0d587f
Merge pull request #45 from microsoft/master
chicm-ms Nov 12, 2019
e905bfe
Merge pull request #46 from microsoft/master
chicm-ms Nov 14, 2019
4b266f3
Merge pull request #47 from microsoft/master
chicm-ms Nov 15, 2019
237ff4b
Merge pull request #48 from microsoft/master
chicm-ms Nov 21, 2019
682be01
Merge pull request #49 from microsoft/master
chicm-ms Nov 25, 2019
133af82
Merge pull request #50 from microsoft/master
chicm-ms Nov 25, 2019
71a8a25
Merge pull request #51 from microsoft/master
chicm-ms Nov 26, 2019
d2a73bc
Merge pull request #52 from microsoft/master
chicm-ms Nov 26, 2019
198cf5e
Merge pull request #53 from microsoft/master
chicm-ms Dec 5, 2019
cdbfaf9
Merge pull request #54 from microsoft/master
chicm-ms Dec 6, 2019
7e9b29e
Merge pull request #55 from microsoft/master
chicm-ms Dec 10, 2019
d00c46d
Merge pull request #56 from microsoft/master
chicm-ms Dec 10, 2019
de7d1fa
Merge pull request #57 from microsoft/master
chicm-ms Dec 11, 2019
1835ab0
Merge pull request #58 from microsoft/master
chicm-ms Dec 12, 2019
24fead6
Merge pull request #59 from microsoft/master
chicm-ms Dec 20, 2019
0b7321e
Merge pull request #60 from microsoft/master
chicm-ms Dec 23, 2019
60058d4
Merge pull request #61 from microsoft/master
chicm-ms Dec 23, 2019
b111a55
Merge pull request #62 from microsoft/master
chicm-ms Dec 24, 2019
611c337
Merge pull request #63 from microsoft/master
chicm-ms Dec 30, 2019
4a1f14a
Merge pull request #64 from microsoft/master
chicm-ms Jan 10, 2020
7a9e604
Merge pull request #65 from microsoft/master
chicm-ms Jan 14, 2020
b8035b0
Merge pull request #66 from microsoft/master
chicm-ms Feb 4, 2020
47567d3
Merge pull request #67 from microsoft/master
chicm-ms Feb 10, 2020
614d427
Merge pull request #68 from microsoft/master
chicm-ms Feb 10, 2020
a0d9ed6
Merge pull request #69 from microsoft/master
chicm-ms Feb 11, 2020
22dc1ad
Merge pull request #70 from microsoft/master
chicm-ms Feb 19, 2020
0856813
Merge pull request #71 from microsoft/master
chicm-ms Feb 22, 2020
9e97bed
Merge pull request #72 from microsoft/master
chicm-ms Feb 25, 2020
16a1b27
Merge pull request #73 from microsoft/master
chicm-ms Mar 3, 2020
e246633
Merge pull request #74 from microsoft/master
chicm-ms Mar 4, 2020
0439bc1
Merge pull request #75 from microsoft/master
chicm-ms Mar 17, 2020
8b5613a
Merge pull request #76 from microsoft/master
chicm-ms Mar 18, 2020
43e8d31
Merge pull request #77 from microsoft/master
chicm-ms Mar 22, 2020
aae448e
Merge pull request #78 from microsoft/master
chicm-ms Mar 25, 2020
7095716
Merge pull request #79 from microsoft/master
chicm-ms Mar 25, 2020
c51263a
Merge pull request #80 from microsoft/master
chicm-ms Apr 11, 2020
9953c70
Merge pull request #81 from microsoft/master
chicm-ms Apr 14, 2020
f9136c4
Merge pull request #82 from microsoft/master
chicm-ms Apr 16, 2020
b384ad2
Merge pull request #83 from microsoft/master
chicm-ms Apr 20, 2020
ff592dd
Merge pull request #84 from microsoft/master
chicm-ms May 12, 2020
0b5378f
Merge pull request #85 from microsoft/master
chicm-ms May 18, 2020
a53e0b0
Merge pull request #86 from microsoft/master
chicm-ms May 25, 2020
3ea0b89
Merge pull request #87 from microsoft/master
chicm-ms May 28, 2020
cf3fb20
Merge pull request #88 from microsoft/master
chicm-ms May 28, 2020
7f4cdcd
Merge pull request #89 from microsoft/master
chicm-ms Jun 4, 2020
574db2c
Merge pull request #90 from microsoft/master
chicm-ms Jun 15, 2020
32bedcc
Merge pull request #91 from microsoft/master
chicm-ms Jun 21, 2020
6155aa4
Merge pull request #92 from microsoft/master
chicm-ms Jun 22, 2020
8139c9c
Merge pull request #93 from microsoft/master
chicm-ms Jun 23, 2020
43419d7
Merge pull request #94 from microsoft/master
chicm-ms Jun 28, 2020
6b6ee55
Merge pull request #95 from microsoft/master
chicm-ms Jun 28, 2020
1b975e0
Merge pull request #96 from microsoft/master
chicm-ms Jun 28, 2020
c8f3c5d
Merge pull request #97 from microsoft/master
chicm-ms Jun 29, 2020
4c306f0
Merge pull request #98 from microsoft/master
chicm-ms Jun 30, 2020
64de4c2
Merge pull request #99 from microsoft/master
chicm-ms Jun 30, 2020
0e5d3ac
Merge pull request #100 from microsoft/master
chicm-ms Jul 1, 2020
4a52608
Merge pull request #101 from microsoft/master
chicm-ms Jul 3, 2020
208b1ee
Merge pull request #102 from microsoft/master
chicm-ms Jul 8, 2020
e7b1a2e
Merge pull request #103 from microsoft/master
chicm-ms Jul 10, 2020
0847f23
update nni experiments dir
chicm-ms Jul 14, 2020
4c71eee
updates
chicm-ms Jul 14, 2020
35faa69
updates
chicm-ms Jul 15, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/en_US/Tutorial/ExperimentConfig.md
Original file line number Diff line number Diff line change
Expand Up @@ -293,7 +293,7 @@ Note: run `ifconfig` on NNI manager's machine to check if eth0 device exists. If

### logDir

Optional. Path to a directory. Default: `<user home directory>/nni/experiment`.
Optional. Path to a directory. Default: `<user home directory>/nni-experiments`.

Configures the directory to store logs and data of the experiment.

Expand Down
2 changes: 1 addition & 1 deletion docs/en_US/Tutorial/FAQ.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ Unable to open the WebUI may have the following reasons:

* `http://127.0.0.1`, `http://172.17.0.1` and `http://10.0.0.15` refer to localhost. If you start your experiment on a server or remote machine, replace the IP with your server IP to view the WebUI, like `http://[your_server_ip]:8080`
* If you still can't see the WebUI after you use the server IP, you can check the proxy and the firewall of your machine. Or use the browser on the machine where you start your NNI experiment.
* Another reason may be your experiment is failed and NNI may fail to get the experiment information. You can check the log of NNIManager in the following directory: `~/nni/experiment/[your_experiment_id]` `/log/nnimanager.log`
* Another reason may be that your experiment has failed, so NNI cannot get the experiment information. You can check the log of NNIManager in the following directory: `~/nni-experiments/[your_experiment_id]` `/log/nnimanager.log`

### Restful server start failed

Expand Down
176 changes: 88 additions & 88 deletions docs/en_US/Tutorial/HowToDebug.md
Original file line number Diff line number Diff line change
@@ -1,89 +1,89 @@
**How to Debug in NNI**
===
## Overview
There are three parts that might have logs in NNI. They are nnimanager, dispatcher and trial. Here we will introduce them succinctly. More information please refer to [Overview](../Overview.md).
- **NNI controller**: NNI controller (nnictl) is the nni command-line tool that is used to manage experiments (e.g., start an experiment).
- **nnimanager**: nnimanager is the core of NNI, whose log is important when the whole experiment fails (e.g., no webUI or training service fails)
- **Dispatcher**: Dispatcher calls the methods of **Tuner** and **Assessor**. Logs of dispatcher are related to the tuner or assessor code.
- **Tuner**: Tuner is an AutoML algorithm, which generates a new configuration for the next try. A new trial will run with this configuration.
- **Assessor**: Assessor analyzes trial's intermediate results (e.g., periodically evaluated accuracy on test dataset) to tell whether this trial can be early stopped or not.
- **Trial**: Trial code is the code you write to run your experiment, which is an individual attempt at applying a new configuration (e.g., a set of hyperparameter values, a specific nerual architecture).
## Where is the log
There are three kinds of log in NNI. When creating a new experiment, you can specify log level as debug by adding `--debug`. Besides, you can set more detailed log level in your configuration file by using
`logLevel` keyword. Available logLevels are: `trace`, `debug`, `info`, `warning`, `error`, `fatal`.
### NNI controller
All possible errors that happen when launching an NNI experiment can be found here.
You can use `nnictl log stderr` to find error information. For more options please refer to [NNICTL](Nnictl.md)
### Experiment Root Directory
Every experiment has a root folder, which is shown on the right-top corner of webUI. Or you could assemble it by replacing the `experiment_id` with your actual experiment_id in path `~/nni/experiment/experiment_id/` in case of webUI failure. `experiment_id` could be seen when you run `nnictl create ...` to create a new experiment.
> For flexibility, we also offer a `logDir` option in your configuration, which specifies the directory to store all experiments (defaults to `~/nni/experiment`). Please refer to [Configuration](ExperimentConfig.md) for more details.
Under that directory, there is another directory named `log`, where `nnimanager.log` and `dispatcher.log` are placed.
### Trial Root Directory
Usually in webUI, you can click `+` in the left of every trial to expand it to see each trial's log path.
Besides, there is another directory under experiment root directory, named `trials`, which stores all the trials.
Every trial has a unique id as its directory name. In this directory, a file named `stderr` records trial error and another named `trial.log` records this trial's log.
## Different kinds of errors
There are different kinds of errors. However, they can be divided into three categories based on their severity. So when nni fails, check each part sequentially.
Generally, if webUI is started successfully, there is a `Status` in the `Overview` tab, serving as a possible indicator of what kind of error happens. Otherwise you should check manually.
### **NNI** Fails
This is the most serious error. When this happens, the whole experiment fails and no trial will be run. Usually this might be related to some installation problem.
When this happens, you should check `nnictl`'s error output file `stderr` (i.e., nnictl log stderr) and then the `nnimanager`'s log to find if there is any error.
### **Dispatcher** Fails
Dispatcher fails. Usually, for some new users of NNI, it means that tuner fails. You could check dispatcher's log to see what happens to your dispatcher. For built-in tuner, some common errors might be invalid search space (unsupported type of search space or inconsistence between initializing args in configuration file and actual tuner's \_\_init\_\_ function args).
Take the later situation as an example. If you write a customized tuner who's \_\_init\_\_ function has an argument called `optimize_mode`, which you do not provide in your configuration file, NNI will fail to run your tuner so the experiment fails. You can see errors in the webUI like:
![](../../img/dispatcher_error.jpg)
Here we can see it is a dispatcher error. So we can check dispatcher's log, which might look like:
```
[2019-02-19 19:36:45] DEBUG (nni.main/MainThread) START
[2019-02-19 19:36:47] ERROR (nni.main/MainThread) __init__() missing 1 required positional arguments: 'optimize_mode'
Traceback (most recent call last):
File "/usr/lib/python3.7/site-packages/nni/__main__.py", line 202, in <module>
main()
File "/usr/lib/python3.7/site-packages/nni/__main__.py", line 164, in main
args.tuner_args)
File "/usr/lib/python3.7/site-packages/nni/__main__.py", line 81, in create_customized_class_instance
instance = class_constructor(**class_args)
TypeError: __init__() missing 1 required positional arguments: 'optimize_mode'.
```
### **Trial** Fails
In this situation, NNI can still run and create new trials.
It means your trial code (which is run by NNI) fails. This kind of error is strongly related to your trial code. Please check trial's log to fix any possible errors shown there.
A common example of this would be run the mnist example without installing tensorflow. Surely there is an Import Error (that is, not installing tensorflow but trying to import it in your trial code) and thus every trial fails.
![](../../img/trial_error.jpg)
As it shows, every trial has a log path, where you can find trial's log and stderr.
**How to Debug in NNI**
===

## Overview

There are three parts that might have logs in NNI. They are nnimanager, dispatcher and trial. Here we will introduce them succinctly. For more information, please refer to [Overview](../Overview.md).

- **NNI controller**: NNI controller (nnictl) is the nni command-line tool that is used to manage experiments (e.g., start an experiment).
- **nnimanager**: nnimanager is the core of NNI, whose log is important when the whole experiment fails (e.g., no webUI or training service fails)
- **Dispatcher**: Dispatcher calls the methods of **Tuner** and **Assessor**. Logs of dispatcher are related to the tuner or assessor code.
- **Tuner**: Tuner is an AutoML algorithm, which generates a new configuration for the next try. A new trial will run with this configuration.
- **Assessor**: Assessor analyzes trial's intermediate results (e.g., periodically evaluated accuracy on test dataset) to tell whether this trial can be early stopped or not.
- **Trial**: Trial code is the code you write to run your experiment, which is an individual attempt at applying a new configuration (e.g., a set of hyperparameter values, a specific neural architecture).

## Where is the log

There are three kinds of log in NNI. When creating a new experiment, you can set the log level to debug by adding `--debug`. You can also set a more specific log level in your configuration file by using
the `logLevel` keyword. Available logLevels are: `trace`, `debug`, `info`, `warning`, `error`, `fatal`.

### NNI controller

All possible errors that happen when launching an NNI experiment can be found here.

You can use `nnictl log stderr` to find error information. For more options please refer to [NNICTL](Nnictl.md)


### Experiment Root Directory
Every experiment has a root folder, which is shown in the top-right corner of the webUI. If the webUI fails, you can assemble the path yourself by replacing `experiment_id` with your actual experiment ID in the path `~/nni-experiments/experiment_id/`. The `experiment_id` is shown when you run `nnictl create ...` to create a new experiment.

> For flexibility, we also offer a `logDir` option in your configuration, which specifies the directory to store all experiments (defaults to `~/nni-experiments`). Please refer to [Configuration](ExperimentConfig.md) for more details.

Under that directory, there is another directory named `log`, where `nnimanager.log` and `dispatcher.log` are placed.

### Trial Root Directory

Usually in webUI, you can click `+` in the left of every trial to expand it to see each trial's log path.

Besides, there is another directory under experiment root directory, named `trials`, which stores all the trials.
Every trial has a unique id as its directory name. In this directory, a file named `stderr` records trial error and another named `trial.log` records this trial's log.

## Different kinds of errors

There are different kinds of errors. However, they can be divided into three categories based on their severity. So when nni fails, check each part sequentially.

Generally, if webUI is started successfully, there is a `Status` in the `Overview` tab, serving as a possible indicator of what kind of error happens. Otherwise you should check manually.

### **NNI** Fails

This is the most serious error. When this happens, the whole experiment fails and no trial will be run. Usually this might be related to some installation problem.

When this happens, you should check `nnictl`'s error output file `stderr` (i.e., nnictl log stderr) and then the `nnimanager`'s log to find if there is any error.


### **Dispatcher** Fails

Dispatcher fails. Usually, for some new users of NNI, it means that the tuner fails. You could check the dispatcher's log to see what happened to your dispatcher. For built-in tuners, some common errors might be an invalid search space (unsupported type of search space or inconsistency between the initializing args in the configuration file and the actual tuner's \_\_init\_\_ function args).

Take the latter situation as an example. If you write a customized tuner whose \_\_init\_\_ function has an argument called `optimize_mode`, which you do not provide in your configuration file, NNI will fail to run your tuner so the experiment fails. You can see errors in the webUI like:

![](../../img/dispatcher_error.jpg)

Here we can see it is a dispatcher error. So we can check dispatcher's log, which might look like:

```
[2019-02-19 19:36:45] DEBUG (nni.main/MainThread) START
[2019-02-19 19:36:47] ERROR (nni.main/MainThread) __init__() missing 1 required positional arguments: 'optimize_mode'
Traceback (most recent call last):
File "/usr/lib/python3.7/site-packages/nni/__main__.py", line 202, in <module>
main()
File "/usr/lib/python3.7/site-packages/nni/__main__.py", line 164, in main
args.tuner_args)
File "/usr/lib/python3.7/site-packages/nni/__main__.py", line 81, in create_customized_class_instance
instance = class_constructor(**class_args)
TypeError: __init__() missing 1 required positional arguments: 'optimize_mode'.
```

### **Trial** Fails

In this situation, NNI can still run and create new trials.

It means your trial code (which is run by NNI) fails. This kind of error is strongly related to your trial code. Please check trial's log to fix any possible errors shown there.

A common example of this would be running the mnist example without installing tensorflow. This causes an Import Error (that is, tensorflow is not installed but your trial code tries to import it), and thus every trial fails.

![](../../img/trial_error.jpg)

As it shows, every trial has a log path, where you can find trial's log and stderr.

In addition to experiment-level debugging, NNI also provides the capability to debug a single trial without the need to start the entire experiment. Refer to [standalone mode](../TrialExample/Trials#standalone-mode-for-debugging) for more information about debugging single trial code.
2 changes: 1 addition & 1 deletion src/nni_manager/common/experimentStartupInfo.ts
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ class ExperimentStartupInfo {
if (logDir !== undefined && logDir.length > 0) {
this.logDir = path.join(path.normalize(logDir), this.getExperimentId());
} else {
this.logDir = path.join(os.homedir(), 'nni', 'experiments', this.getExperimentId());
this.logDir = path.join(os.homedir(), 'nni-experiments', this.getExperimentId());
}

if (logLevel !== undefined && logLevel.length > 1) {
Expand Down
2 changes: 1 addition & 1 deletion src/nni_manager/core/test/nnimanager.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,7 @@ describe('Unit test for nnimanager', function () {
it('test getExperimentProfile', () => {
return nniManager.getExperimentProfile().then((experimentProfile) => {
expect(experimentProfile.id).to.be.equal('unittest');
expect(experimentProfile.logDir).to.be.equal(path.join(os.homedir(),'nni','experiments','unittest'));
expect(experimentProfile.logDir).to.be.equal(path.join(os.homedir(),'nni-experiments','unittest'));

}).catch((error) => {
assert.fail(error);
Expand Down
2 changes: 1 addition & 1 deletion test/nni_test/nnitest/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ def get_experiment_dir(experiment_url=None, experiment_id=None):
assert any([experiment_url, experiment_id])
if experiment_id is None:
experiment_id = get_experiment_id(experiment_url)
return os.path.join(os.path.expanduser('~'), 'nni', 'experiments', experiment_id)
return os.path.join(os.path.expanduser('~'), 'nni-experiments', experiment_id)

def get_nni_log_dir(experiment_url=None, experiment_id=None):
'''get nni's log directory from nni's experiment url'''
Expand Down