-
Notifications
You must be signed in to change notification settings - Fork 1.8k
support display trial log on local mode #2718
Changes from 31 commits
482ef51
15f7706
dff2c2b
7315dac
520d9ad
a73db71
7b45ecf
6cd5e8f
4309a46
0d7974b
7545a83
6c7142d
183da4c
f702e97
607ed21
c71e296
3aa2183
1514a1b
9e0dead
3453bf6
2e98bd6
cd0333c
2f413da
92ef5c6
c0ed056
18ce44f
73989e0
dfabaad
40c0fd9
f7ec902
a01a1e8
5e0a5bf
5fbd017
d21d524
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -57,6 +57,7 @@ class NNIRestHandler { | |
this.getMetricData(router); | ||
this.getMetricDataByRange(router); | ||
this.getLatestMetricData(router); | ||
this.getTrialLog(router); | ||
this.exportData(router); | ||
|
||
// Express-joi-validator configuration | ||
|
@@ -268,6 +269,19 @@ class NNIRestHandler { | |
}); | ||
} | ||
|
||
private getTrialLog(router: Router): void { | ||
router.get('/trial-log/:id/:type', async(req: Request, res: Response) => { | ||
this.nniManager.getTrialLog(req.params.id, req.params.type).then((log: string) => { | ||
if (log === '') { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't suggest sending unstructured infomation through REST API. The idea behind is that REST API is designed for sending "raw data" which can be easily understood by program, rather than "UI string". There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think here |
||
log = 'No logs available.' | ||
} | ||
res.send(log); | ||
}).catch((err: Error) => { | ||
this.handleError(err, res); | ||
}); | ||
}); | ||
} | ||
|
||
private exportData(router: Router): void { | ||
router.get('/export-data', (req: Request, res: Response) => { | ||
this.nniManager.exportData().then((exportedData: string) => { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -14,7 +14,7 @@ import { getExperimentId } from '../../common/experimentStartupInfo'; | |
import { getLogger, Logger } from '../../common/log'; | ||
import { | ||
HyperParameters, TrainingService, TrialJobApplicationForm, | ||
TrialJobDetail, TrialJobMetric, TrialJobStatus | ||
TrialJobDetail, TrialJobMetric, TrialJobStatus, LogType | ||
} from '../../common/trainingService'; | ||
import { | ||
delay, generateParamFileName, getExperimentRootDir, getJobCancelStatus, getNewLine, isAlive, uniqueString | ||
|
@@ -184,6 +184,18 @@ class LocalTrainingService implements TrainingService { | |
return trialJob; | ||
} | ||
|
||
public async getTrialLog(trialJobId: string, logType: LogType): Promise<string> { | ||
QuanluZhang marked this conversation as resolved.
Show resolved
Hide resolved
|
||
let logPath: string; | ||
if (logType === 'TRIAL_LOG') { | ||
logPath = path.join(this.rootDir, 'trials', trialJobId, 'trial.log'); | ||
} else if (logType === 'TRIAL_ERROR') { | ||
logPath = path.join(this.rootDir, 'trials', trialJobId, 'stderr'); | ||
} else { | ||
throw new Error('unexpected log type'); | ||
} | ||
return fs.promises.readFile(logPath, 'utf8'); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. does this readFile() function need to catch exception? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This exception should be catched by |
||
} | ||
|
||
public addTrialJobMetricListener(listener: (metric: TrialJobMetric) => void): void { | ||
this.eventEmitter.on('metric', listener); | ||
} | ||
|
@@ -450,8 +462,8 @@ class LocalTrainingService implements TrainingService { | |
while (!this.stopping) { | ||
while (!this.stopping && this.jobQueue.length !== 0) { | ||
const trialJobId: string = this.jobQueue[0]; | ||
const trialJobDeatil: LocalTrialJobDetail | undefined = this.jobMap.get(trialJobId); | ||
if (trialJobDeatil !== undefined && trialJobDeatil.status === 'WAITING') { | ||
const trialJobDetail: LocalTrialJobDetail | undefined = this.jobMap.get(trialJobId); | ||
if (trialJobDetail !== undefined && trialJobDetail.status === 'WAITING') { | ||
const [success, resource] = this.tryGetAvailableResource(); | ||
if (!success) { | ||
break; | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,13 +10,13 @@ import * as path from 'path'; | |
import { ShellExecutor } from 'training_service/remote_machine/shellExecutor'; | ||
import { Deferred } from 'ts-deferred'; | ||
import * as component from '../../common/component'; | ||
import { NNIError, NNIErrorNames } from '../../common/errors'; | ||
import { NNIError, NNIErrorNames, MethodNotImplementedError } from '../../common/errors'; | ||
import { getExperimentId } from '../../common/experimentStartupInfo'; | ||
import { getLogger, Logger } from '../../common/log'; | ||
import { ObservableTimer } from '../../common/observableTimer'; | ||
import { | ||
HyperParameters, NNIManagerIpConfig, TrainingService, TrialJobApplicationForm, | ||
TrialJobDetail, TrialJobMetric | ||
TrialJobDetail, TrialJobMetric, LogType | ||
} from '../../common/trainingService'; | ||
import { | ||
delay, generateParamFileName, getExperimentRootDir, getIPV4Address, getJobCancelStatus, | ||
|
@@ -180,6 +180,15 @@ class RemoteMachineTrainingService implements TrainingService { | |
} | ||
} | ||
|
||
/** | ||
* Get trial job log | ||
* @param _trialJobId ID of trial job | ||
* @param _logType 'TRIAL_LOG' | 'TRIAL_STDERR' | ||
*/ | ||
public async getTrialLog(_trialJobId: string, _logType: LogType): Promise<string> { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The doc string should be written in base class. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Agreed. I write doc string in this class because I found other functions like |
||
throw new MethodNotImplementedError(); | ||
} | ||
|
||
/** | ||
* Add job metrics listener | ||
* @param listener callback listener | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,10 +7,12 @@ import * as assert from 'assert'; | |
import * as chai from 'chai'; | ||
import * as chaiAsPromised from 'chai-as-promised'; | ||
import * as fs from 'fs'; | ||
import * as path from 'path'; | ||
import * as tmp from 'tmp'; | ||
import * as component from '../../common/component'; | ||
import { TrialJobApplicationForm, TrialJobDetail, TrainingService } from '../../common/trainingService'; | ||
import { cleanupUnitTest, delay, prepareUnitTest } from '../../common/utils'; | ||
import { cleanupUnitTest, delay, prepareUnitTest, getExperimentRootDir } from '../../common/utils'; | ||
import { getExperimentId } from '../../common/experimentStartupInfo' | ||
import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey'; | ||
import { LocalTrainingService } from '../local/localTrainingService'; | ||
|
||
|
@@ -72,6 +74,38 @@ describe('Unit Test for LocalTrainingService', () => { | |
chai.expect(jobDetail.status).to.be.equals('USER_CANCELED'); | ||
}).timeout(20000); | ||
|
||
it('Get trial log', async () => { | ||
// set meta data | ||
const trialConfig: string = `{\"command\":\"python3 mockedTrial.py\", \"codeDir\":\"${localCodeDir}\",\"gpuNum\":0}` | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am wondering whether this There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How about change the command to There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. fixed. now use the original trialConifig in line 23 |
||
await localTrainingService.setClusterMetadata(TrialConfigMetadataKey.TRIAL_CONFIG, trialConfig); | ||
|
||
// submit job | ||
const form: TrialJobApplicationForm = { | ||
sequenceId: 0, | ||
hyperParameters: { | ||
value: 'mock hyperparameters', | ||
index: 0 | ||
} | ||
}; | ||
|
||
const jobDetail: TrialJobDetail = await localTrainingService.submitTrialJob(form); | ||
|
||
// get trial log | ||
const rootDir: string = getExperimentRootDir() | ||
fs.mkdirSync(path.join(rootDir, 'trials')) | ||
fs.mkdirSync(jobDetail.workingDirectory) | ||
fs.writeFileSync(path.join(jobDetail.workingDirectory, 'trial.log'), 'trial log') | ||
fs.writeFileSync(path.join(jobDetail.workingDirectory, 'stderr'), 'trial stderr') | ||
chai.expect(await localTrainingService.getTrialLog(jobDetail.id, 'TRIAL_LOG')).to.be.equals('trial log'); | ||
chai.expect(await localTrainingService.getTrialLog(jobDetail.id, 'TRIAL_ERROR')).to.be.equals('trial stderr'); | ||
fs.unlinkSync(path.join(jobDetail.workingDirectory, 'trial.log')) | ||
fs.unlinkSync(path.join(jobDetail.workingDirectory, 'stderr')) | ||
fs.rmdirSync(jobDetail.workingDirectory) | ||
fs.rmdirSync(path.join(rootDir, 'trials')) | ||
|
||
await localTrainingService.cancelTrialJob(jobDetail.id); | ||
}).timeout(20000); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this is testing training service, do we need to also test nnimanager or restserver? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This UT for There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ok |
||
|
||
it('Read metrics, Add listener, and remove listener', async () => { | ||
// set meta data | ||
const trialConfig: string = `{\"command\":\"python3 mockedTrial.py\", \"codeDir\":\"${localCodeDir}\",\"gpuNum\":0}` | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
it might be better to catch
MethodNotImplementedError
here, then return log message likeXXX training service does not support retrieving log
.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fixed.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
could you explain how did you fix this one?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In the latest commit, I revert this to the original version (i.e., not handle the error here) since the front end will check the training service now. The error here could also be an file operation error, so I let the restHander to catch the error throwed here.