diff --git a/src/nni_manager/common/manager.ts b/src/nni_manager/common/manager.ts index f37745de16..c003598abc 100644 --- a/src/nni_manager/common/manager.ts +++ b/src/nni_manager/common/manager.ts @@ -4,7 +4,7 @@ 'use strict'; import { MetricDataRecord, MetricType, TrialJobInfo } from './datastore'; -import { TrialJobStatus } from './trainingService'; +import { TrialJobStatus, LogType } from './trainingService'; type ProfileUpdateType = 'TRIAL_CONCURRENCY' | 'MAX_EXEC_DURATION' | 'SEARCH_SPACE' | 'MAX_TRIAL_NUM'; type ExperimentStatus = 'INITIALIZED' | 'RUNNING' | 'ERROR' | 'STOPPING' | 'STOPPED' | 'DONE' | 'NO_MORE_TRIAL' | 'TUNER_NO_MORE_TRIAL'; @@ -101,6 +101,8 @@ abstract class Manager { public abstract getMetricDataByRange(minSeqId: number, maxSeqId: number): Promise; public abstract getLatestMetricData(): Promise; + public abstract getTrialLog(trialJobId: string, logType: LogType): Promise; + public abstract getTrialJobStatistics(): Promise; public abstract getStatus(): NNIManagerStatus; } diff --git a/src/nni_manager/common/trainingService.ts b/src/nni_manager/common/trainingService.ts index 83bd51e884..4edcf16ab6 100644 --- a/src/nni_manager/common/trainingService.ts +++ b/src/nni_manager/common/trainingService.ts @@ -8,6 +8,8 @@ */ type TrialJobStatus = 'UNKNOWN' | 'WAITING' | 'RUNNING' | 'SUCCEEDED' | 'FAILED' | 'USER_CANCELED' | 'SYS_CANCELED' | 'EARLY_STOPPED'; +type LogType = 'TRIAL_LOG' | 'TRIAL_ERROR'; + interface TrainingServiceMetadata { readonly key: string; readonly value: string; @@ -79,6 +81,7 @@ abstract class TrainingService { public abstract updateTrialJob(trialJobId: string, form: TrialJobApplicationForm): Promise; public abstract get isMultiPhaseJobSupported(): boolean; public abstract cancelTrialJob(trialJobId: string, isEarlyStopped?: boolean): Promise; + public abstract getTrialLog(trialJobId: string, logType: LogType): Promise; public abstract setClusterMetadata(key: string, value: string): Promise; public abstract 
getClusterMetadata(key: string): Promise; public abstract cleanUp(): Promise; @@ -98,5 +101,5 @@ class NNIManagerIpConfig { export { TrainingService, TrainingServiceError, TrialJobStatus, TrialJobApplicationForm, TrainingServiceMetadata, TrialJobDetail, TrialJobMetric, HyperParameters, - NNIManagerIpConfig + NNIManagerIpConfig, LogType }; diff --git a/src/nni_manager/core/nnimanager.ts b/src/nni_manager/core/nnimanager.ts index 038fe9ef9a..ad243f4835 100644 --- a/src/nni_manager/core/nnimanager.ts +++ b/src/nni_manager/core/nnimanager.ts @@ -16,7 +16,7 @@ import { NNIManagerStatus, ProfileUpdateType, TrialJobStatistics } from '../common/manager'; import { - TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric, TrialJobStatus + TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric, TrialJobStatus, LogType } from '../common/trainingService'; import { delay, getCheckpointDir, getExperimentRootDir, getLogDir, getMsgDispatcherCommand, mkDirP, getTunerProc, getLogLevel, isAlive, killPid } from '../common/utils'; import { @@ -325,6 +325,10 @@ class NNIManager implements Manager { // FIXME: unit test } + public async getTrialLog(trialJobId: string, logType: LogType): Promise { + return this.trainingService.getTrialLog(trialJobId, logType); + } + public getExperimentProfile(): Promise { // TO DO: using Promise.resolve() const deferred: Deferred = new Deferred(); diff --git a/src/nni_manager/core/test/mockedTrainingService.ts b/src/nni_manager/core/test/mockedTrainingService.ts index 546a36e494..5dfec86427 100644 --- a/src/nni_manager/core/test/mockedTrainingService.ts +++ b/src/nni_manager/core/test/mockedTrainingService.ts @@ -7,7 +7,7 @@ import { Deferred } from 'ts-deferred'; import { Provider } from 'typescript-ioc'; import { MethodNotImplementedError } from '../../common/errors'; -import { TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric } from '../../common/trainingService'; +import { TrainingService, 
TrialJobApplicationForm, TrialJobDetail, TrialJobMetric, LogType } from '../../common/trainingService'; const testTrainingServiceProvider: Provider = { get: () => { return new MockedTrainingService(); } @@ -63,6 +63,10 @@ class MockedTrainingService extends TrainingService { return deferred.promise; } + public getTrialLog(trialJobId: string, logType: LogType): Promise { + throw new MethodNotImplementedError(); + } + async run(): Promise { } diff --git a/src/nni_manager/rest_server/restHandler.ts b/src/nni_manager/rest_server/restHandler.ts index 457f154b69..af44d71a01 100644 --- a/src/nni_manager/rest_server/restHandler.ts +++ b/src/nni_manager/rest_server/restHandler.ts @@ -57,6 +57,7 @@ class NNIRestHandler { this.getMetricData(router); this.getMetricDataByRange(router); this.getLatestMetricData(router); + this.getTrialLog(router); this.exportData(router); // Express-joi-validator configuration @@ -268,6 +269,19 @@ class NNIRestHandler { }); } + private getTrialLog(router: Router): void { + router.get('/trial-log/:id/:type', async(req: Request, res: Response) => { + this.nniManager.getTrialLog(req.params.id, req.params.type).then((log: string) => { + if (log === '') { + log = 'No logs available.' 
+ } + res.send(log); + }).catch((err: Error) => { + this.handleError(err, res); + }); + }); + } + private exportData(router: Router): void { router.get('/export-data', (req: Request, res: Response) => { this.nniManager.exportData().then((exportedData: string) => { diff --git a/src/nni_manager/rest_server/test/mockedNNIManager.ts b/src/nni_manager/rest_server/test/mockedNNIManager.ts index 5c8bc267b7..e45819d6cb 100644 --- a/src/nni_manager/rest_server/test/mockedNNIManager.ts +++ b/src/nni_manager/rest_server/test/mockedNNIManager.ts @@ -13,7 +13,7 @@ import { TrialJobStatistics, NNIManagerStatus } from '../../common/manager'; import { - TrialJobApplicationForm, TrialJobDetail, TrialJobStatus + TrialJobApplicationForm, TrialJobDetail, TrialJobStatus, LogType } from '../../common/trainingService'; export const testManagerProvider: Provider = { @@ -118,6 +118,9 @@ export class MockedNNIManager extends Manager { public getLatestMetricData(): Promise { throw new MethodNotImplementedError(); } + public getTrialLog(trialJobId: string, logType: LogType): Promise { + throw new MethodNotImplementedError(); + } public getExperimentProfile(): Promise { const profile: ExperimentProfile = { params: { diff --git a/src/nni_manager/training_service/dlts/dltsTrainingService.ts b/src/nni_manager/training_service/dlts/dltsTrainingService.ts index ba707fbb13..30d8fbcf8d 100644 --- a/src/nni_manager/training_service/dlts/dltsTrainingService.ts +++ b/src/nni_manager/training_service/dlts/dltsTrainingService.ts @@ -12,9 +12,10 @@ import { EventEmitter } from 'events'; import { String } from 'typescript-string-operations'; import { getExperimentId } from '../../common/experimentStartupInfo'; import { getLogger, Logger } from '../../common/log'; +import { MethodNotImplementedError } from '../../common/errors'; import { NNIManagerIpConfig, TrainingService, - TrialJobApplicationForm, TrialJobDetail, TrialJobMetric + TrialJobApplicationForm, TrialJobDetail, TrialJobMetric, LogType } from 
'../../common/trainingService'; import { DLTS_TRIAL_COMMAND_FORMAT } from './dltsData'; import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../common/containerJobData'; @@ -246,6 +247,10 @@ class DLTSTrainingService implements TrainingService { return trialJob } + public async getTrialLog(_trialJobId: string, _logType: LogType): Promise { + throw new MethodNotImplementedError(); + } + public addTrialJobMetricListener(listener: (metric: TrialJobMetric) => void): void { this.metricsEmitter.on('metric', listener); } diff --git a/src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts b/src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts index f21ac9ad69..11a54c453c 100644 --- a/src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts +++ b/src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts @@ -12,8 +12,9 @@ import { Base64 } from 'js-base64'; import { String } from 'typescript-string-operations'; import { getExperimentId } from '../../common/experimentStartupInfo'; import { getLogger, Logger } from '../../common/log'; +import { MethodNotImplementedError } from '../../common/errors'; import { - NNIManagerIpConfig, TrialJobDetail, TrialJobMetric + NNIManagerIpConfig, TrialJobDetail, TrialJobMetric, LogType } from '../../common/trainingService'; import { delay, getExperimentRootDir, getIPV4Address, getJobCancelStatus, getVersion, uniqueString } from '../../common/utils'; import { AzureStorageClientUtility } from './azureStorageClientUtils'; @@ -98,6 +99,10 @@ abstract class KubernetesTrainingService { return Promise.resolve(kubernetesTrialJob); } + public async getTrialLog(_trialJobId: string, _logType: LogType): Promise { + throw new MethodNotImplementedError(); + } + public addTrialJobMetricListener(listener: (metric: TrialJobMetric) => void): void { this.metricsEmitter.on('metric', listener); } diff --git a/src/nni_manager/training_service/local/localTrainingService.ts 
b/src/nni_manager/training_service/local/localTrainingService.ts index 71a1c5719c..a69bff8df8 100644 --- a/src/nni_manager/training_service/local/localTrainingService.ts +++ b/src/nni_manager/training_service/local/localTrainingService.ts @@ -14,7 +14,7 @@ import { getExperimentId } from '../../common/experimentStartupInfo'; import { getLogger, Logger } from '../../common/log'; import { HyperParameters, TrainingService, TrialJobApplicationForm, - TrialJobDetail, TrialJobMetric, TrialJobStatus + TrialJobDetail, TrialJobMetric, TrialJobStatus, LogType } from '../../common/trainingService'; import { delay, generateParamFileName, getExperimentRootDir, getJobCancelStatus, getNewLine, isAlive, uniqueString @@ -184,6 +184,18 @@ class LocalTrainingService implements TrainingService { return trialJob; } + public async getTrialLog(trialJobId: string, logType: LogType): Promise { + let logPath: string; + if (logType === 'TRIAL_LOG') { + logPath = path.join(this.rootDir, 'trials', trialJobId, 'trial.log'); + } else if (logType === 'TRIAL_ERROR') { + logPath = path.join(this.rootDir, 'trials', trialJobId, 'stderr'); + } else { + throw new Error('unexpected log type'); + } + return fs.promises.readFile(logPath, 'utf8'); + } + public addTrialJobMetricListener(listener: (metric: TrialJobMetric) => void): void { this.eventEmitter.on('metric', listener); } @@ -450,8 +462,8 @@ class LocalTrainingService implements TrainingService { while (!this.stopping) { while (!this.stopping && this.jobQueue.length !== 0) { const trialJobId: string = this.jobQueue[0]; - const trialJobDeatil: LocalTrialJobDetail | undefined = this.jobMap.get(trialJobId); - if (trialJobDeatil !== undefined && trialJobDeatil.status === 'WAITING') { + const trialJobDetail: LocalTrialJobDetail | undefined = this.jobMap.get(trialJobId); + if (trialJobDetail !== undefined && trialJobDetail.status === 'WAITING') { const [success, resource] = this.tryGetAvailableResource(); if (!success) { break; diff --git 
a/src/nni_manager/training_service/pai/paiTrainingService.ts b/src/nni_manager/training_service/pai/paiTrainingService.ts index aff583de54..56756b708d 100644 --- a/src/nni_manager/training_service/pai/paiTrainingService.ts +++ b/src/nni_manager/training_service/pai/paiTrainingService.ts @@ -11,9 +11,10 @@ import { EventEmitter } from 'events'; import { Deferred } from 'ts-deferred'; import { getExperimentId } from '../../common/experimentStartupInfo'; import { getLogger, Logger } from '../../common/log'; +import { MethodNotImplementedError } from '../../common/errors'; import { NNIManagerIpConfig, TrainingService, - TrialJobApplicationForm, TrialJobDetail, TrialJobMetric + TrialJobApplicationForm, TrialJobDetail, TrialJobMetric, LogType } from '../../common/trainingService'; import { delay } from '../../common/utils'; import { PAIJobInfoCollector } from './paiJobInfoCollector'; @@ -117,6 +118,10 @@ abstract class PAITrainingService implements TrainingService { return jobs; } + public async getTrialLog(_trialJobId: string, _logType: LogType): Promise { + throw new MethodNotImplementedError(); + } + public async getTrialJob(trialJobId: string): Promise { if (this.paiClusterConfig === undefined) { throw new Error('PAI Cluster config is not initialized'); diff --git a/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts b/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts index b291690a0b..8736bc09b7 100644 --- a/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts +++ b/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts @@ -10,13 +10,13 @@ import * as path from 'path'; import { ShellExecutor } from 'training_service/remote_machine/shellExecutor'; import { Deferred } from 'ts-deferred'; import * as component from '../../common/component'; -import { NNIError, NNIErrorNames } from '../../common/errors'; +import { NNIError, NNIErrorNames, 
MethodNotImplementedError } from '../../common/errors'; import { getExperimentId } from '../../common/experimentStartupInfo'; import { getLogger, Logger } from '../../common/log'; import { ObservableTimer } from '../../common/observableTimer'; import { HyperParameters, NNIManagerIpConfig, TrainingService, TrialJobApplicationForm, - TrialJobDetail, TrialJobMetric + TrialJobDetail, TrialJobMetric, LogType } from '../../common/trainingService'; import { delay, generateParamFileName, getExperimentRootDir, getIPV4Address, getJobCancelStatus, @@ -180,6 +180,15 @@ class RemoteMachineTrainingService implements TrainingService { } } + /** + * Get trial job log + * @param _trialJobId ID of trial job + * @param _logType 'TRIAL_LOG' | 'TRIAL_ERROR' + */ + public async getTrialLog(_trialJobId: string, _logType: LogType): Promise { + throw new MethodNotImplementedError(); + } + /** * Add job metrics listener * @param listener callback listener diff --git a/src/nni_manager/training_service/reusable/routerTrainingService.ts b/src/nni_manager/training_service/reusable/routerTrainingService.ts index 1fd28604be..1e3b75cc86 100644 --- a/src/nni_manager/training_service/reusable/routerTrainingService.ts +++ b/src/nni_manager/training_service/reusable/routerTrainingService.ts @@ -6,7 +6,8 @@ import { Container, Scope } from 'typescript-ioc'; import * as component from '../../common/component'; import { getLogger, Logger } from '../../common/log'; -import { TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric } from '../../common/trainingService'; +import { MethodNotImplementedError } from '../../common/errors' +import { TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric, LogType } from '../../common/trainingService'; import { delay } from '../../common/utils'; import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey'; import { PAIClusterConfig } from '../pai/paiConfig'; @@ -47,6 +48,10 @@ class RouterTrainingService implements 
TrainingService { return await this.internalTrainingService.getTrialJob(trialJobId); } + public async getTrialLog(_trialJobId: string, _logType: LogType): Promise { + throw new MethodNotImplementedError(); + } + public addTrialJobMetricListener(listener: (metric: TrialJobMetric) => void): void { if (this.internalTrainingService === undefined) { throw new Error("TrainingService is not assigned!"); diff --git a/src/nni_manager/training_service/reusable/trialDispatcher.ts b/src/nni_manager/training_service/reusable/trialDispatcher.ts index 1b310ef9e0..046f389ca2 100644 --- a/src/nni_manager/training_service/reusable/trialDispatcher.ts +++ b/src/nni_manager/training_service/reusable/trialDispatcher.ts @@ -9,10 +9,10 @@ import * as path from 'path'; import { Writable } from 'stream'; import { String } from 'typescript-string-operations'; import * as component from '../../common/component'; -import { NNIError, NNIErrorNames } from '../../common/errors'; +import { NNIError, NNIErrorNames, MethodNotImplementedError } from '../../common/errors'; import { getBasePort, getExperimentId, getPlatform } from '../../common/experimentStartupInfo'; import { getLogger, Logger } from '../../common/log'; -import { NNIManagerIpConfig, TrainingService, TrialJobApplicationForm, TrialJobMetric, TrialJobStatus } from '../../common/trainingService'; +import { NNIManagerIpConfig, TrainingService, TrialJobApplicationForm, TrialJobMetric, TrialJobStatus, LogType } from '../../common/trainingService'; import { delay, getExperimentRootDir, getIPV4Address, getLogLevel, getVersion, mkDirPSync, uniqueString } from '../../common/utils'; import { GPU_INFO, INITIALIZED, KILL_TRIAL_JOB, NEW_TRIAL_JOB, REPORT_METRIC_DATA, SEND_TRIAL_JOB_PARAMETER, STDOUT, TRIAL_END, VERSION_CHECK } from '../../core/commands'; import { ScheduleResultType } from '../../training_service/common/gpuData'; @@ -111,6 +111,10 @@ class TrialDispatcher implements TrainingService { return trial; } + public async 
getTrialLog(_trialJobId: string, _logType: LogType): Promise { + throw new MethodNotImplementedError(); + } + public async submitTrialJob(form: TrialJobApplicationForm): Promise { if (this.trialConfig === undefined) { throw new Error(`trialConfig not initialized!`); diff --git a/src/nni_manager/training_service/test/localTrainingService.test.ts b/src/nni_manager/training_service/test/localTrainingService.test.ts index bc47e747ba..fbaaedcd41 100644 --- a/src/nni_manager/training_service/test/localTrainingService.test.ts +++ b/src/nni_manager/training_service/test/localTrainingService.test.ts @@ -3,14 +3,14 @@ 'use strict'; -import * as assert from 'assert'; import * as chai from 'chai'; import * as chaiAsPromised from 'chai-as-promised'; import * as fs from 'fs'; +import * as path from 'path'; import * as tmp from 'tmp'; import * as component from '../../common/component'; -import { TrialJobApplicationForm, TrialJobDetail, TrainingService } from '../../common/trainingService'; -import { cleanupUnitTest, delay, prepareUnitTest } from '../../common/utils'; +import { TrialJobApplicationForm, TrialJobDetail} from '../../common/trainingService'; +import { cleanupUnitTest, delay, prepareUnitTest, getExperimentRootDir } from '../../common/utils'; import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey'; import { LocalTrainingService } from '../local/localTrainingService'; @@ -72,6 +72,36 @@ describe('Unit Test for LocalTrainingService', () => { chai.expect(jobDetail.status).to.be.equals('USER_CANCELED'); }).timeout(20000); + it('Get trial log', async () => { + await localTrainingService.setClusterMetadata(TrialConfigMetadataKey.TRIAL_CONFIG, trialConfig); + + // submit job + const form: TrialJobApplicationForm = { + sequenceId: 0, + hyperParameters: { + value: 'mock hyperparameters', + index: 0 + } + }; + + const jobDetail: TrialJobDetail = await localTrainingService.submitTrialJob(form); + + // get trial log + const rootDir: string = 
getExperimentRootDir() + fs.mkdirSync(path.join(rootDir, 'trials')) + fs.mkdirSync(jobDetail.workingDirectory) + fs.writeFileSync(path.join(jobDetail.workingDirectory, 'trial.log'), 'trial log') + fs.writeFileSync(path.join(jobDetail.workingDirectory, 'stderr'), 'trial stderr') + chai.expect(await localTrainingService.getTrialLog(jobDetail.id, 'TRIAL_LOG')).to.be.equals('trial log'); + chai.expect(await localTrainingService.getTrialLog(jobDetail.id, 'TRIAL_ERROR')).to.be.equals('trial stderr'); + fs.unlinkSync(path.join(jobDetail.workingDirectory, 'trial.log')) + fs.unlinkSync(path.join(jobDetail.workingDirectory, 'stderr')) + fs.rmdirSync(jobDetail.workingDirectory) + fs.rmdirSync(path.join(rootDir, 'trials')) + + await localTrainingService.cancelTrialJob(jobDetail.id); + }).timeout(20000); + it('Read metrics, Add listener, and remove listener', async () => { // set meta data const trialConfig: string = `{\"command\":\"python3 mockedTrial.py\", \"codeDir\":\"${localCodeDir}\",\"gpuNum\":0}` diff --git a/src/webui/src/components/public-child/OpenRow.tsx b/src/webui/src/components/public-child/OpenRow.tsx index a0c6c274c1..a20cf5313f 100644 --- a/src/webui/src/components/public-child/OpenRow.tsx +++ b/src/webui/src/components/public-child/OpenRow.tsx @@ -2,6 +2,7 @@ import * as React from 'react'; import * as copy from 'copy-to-clipboard'; import { Stack, PrimaryButton, Pivot, PivotItem } from 'office-ui-fabric-react'; import { Trial } from '../../static/model/trial'; +import { MANAGER_IP } from '../../static/const'; import { EXPERIMENT, TRIALS } from '../../static/datamodel'; import JSONTree from 'react-json-tree'; import PaiTrialLog from '../public-child/PaiTrialLog'; @@ -9,6 +10,7 @@ import TrialLog from '../public-child/TrialLog'; import MessageInfo from '../Modals/MessageInfo'; import '../../static/style/overview.scss'; import '../../static/style/copyParameter.scss'; +import '../../static/style/openRow.scss'; interface OpenRowProps { trialId: string; @@ -55,6 
+57,10 @@ class OpenRow extends React.Component { } } + openTrialLog = (type: string): void => { + window.open(`${MANAGER_IP}/trial-log/${this.props.trialId}/${type}`); + } + render(): React.ReactNode { const { isHidenInfo, typeInfo, info } = this.state; const trialId = this.props.trialId; @@ -105,7 +111,23 @@ class OpenRow extends React.Component { logCollection={EXPERIMENT.logCollectionEnabled} /> : - +
+ + {/* view each trial log in drawer*/} +
+
+ + +
+
+
}