Skip to content

Commit

Permalink
support display trial log on local mode (microsoft#2718)
Browse files Browse the repository at this point in the history
  • Loading branch information
JunweiSUN authored Aug 12, 2020
1 parent e2a8689 commit 10c177c
Show file tree
Hide file tree
Showing 15 changed files with 147 additions and 20 deletions.
4 changes: 3 additions & 1 deletion src/nni_manager/common/manager.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
'use strict';

import { MetricDataRecord, MetricType, TrialJobInfo } from './datastore';
import { TrialJobStatus } from './trainingService';
import { TrialJobStatus, LogType } from './trainingService';

type ProfileUpdateType = 'TRIAL_CONCURRENCY' | 'MAX_EXEC_DURATION' | 'SEARCH_SPACE' | 'MAX_TRIAL_NUM';
type ExperimentStatus = 'INITIALIZED' | 'RUNNING' | 'ERROR' | 'STOPPING' | 'STOPPED' | 'DONE' | 'NO_MORE_TRIAL' | 'TUNER_NO_MORE_TRIAL';
Expand Down Expand Up @@ -101,6 +101,8 @@ abstract class Manager {
public abstract getMetricDataByRange(minSeqId: number, maxSeqId: number): Promise<MetricDataRecord[]>;
public abstract getLatestMetricData(): Promise<MetricDataRecord[]>;

/**
 * Retrieve a log of the given trial job.
 * @param trialJobId ID of the trial job
 * @param logType which log to retrieve ('TRIAL_LOG' or 'TRIAL_ERROR')
 * @returns promise resolving to the log as a string
 */
public abstract getTrialLog(trialJobId: string, logType: LogType): Promise<string>;

public abstract getTrialJobStatistics(): Promise<TrialJobStatistics[]>;
public abstract getStatus(): NNIManagerStatus;
}
Expand Down
5 changes: 4 additions & 1 deletion src/nni_manager/common/trainingService.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
*/
type TrialJobStatus = 'UNKNOWN' | 'WAITING' | 'RUNNING' | 'SUCCEEDED' | 'FAILED' | 'USER_CANCELED' | 'SYS_CANCELED' | 'EARLY_STOPPED';

/**
 * Kinds of trial log that can be requested through getTrialLog.
 */
type LogType = 'TRIAL_LOG' | 'TRIAL_ERROR';

interface TrainingServiceMetadata {
readonly key: string;
readonly value: string;
Expand Down Expand Up @@ -79,6 +81,7 @@ abstract class TrainingService {
public abstract updateTrialJob(trialJobId: string, form: TrialJobApplicationForm): Promise<TrialJobDetail>;
public abstract get isMultiPhaseJobSupported(): boolean;
public abstract cancelTrialJob(trialJobId: string, isEarlyStopped?: boolean): Promise<void>;
/**
 * Retrieve a log of the given trial job.
 * @param trialJobId ID of the trial job
 * @param logType which log to retrieve ('TRIAL_LOG' or 'TRIAL_ERROR')
 * @returns promise resolving to the log as a string
 */
public abstract getTrialLog(trialJobId: string, logType: LogType): Promise<string>;
public abstract setClusterMetadata(key: string, value: string): Promise<void>;
public abstract getClusterMetadata(key: string): Promise<string>;
public abstract cleanUp(): Promise<void>;
Expand All @@ -98,5 +101,5 @@ class NNIManagerIpConfig {
export {
TrainingService, TrainingServiceError, TrialJobStatus, TrialJobApplicationForm,
TrainingServiceMetadata, TrialJobDetail, TrialJobMetric, HyperParameters,
NNIManagerIpConfig
NNIManagerIpConfig, LogType
};
6 changes: 5 additions & 1 deletion src/nni_manager/core/nnimanager.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ import {
NNIManagerStatus, ProfileUpdateType, TrialJobStatistics
} from '../common/manager';
import {
TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric, TrialJobStatus
TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric, TrialJobStatus, LogType
} from '../common/trainingService';
import { delay, getCheckpointDir, getExperimentRootDir, getLogDir, getMsgDispatcherCommand, mkDirP, getTunerProc, getLogLevel, isAlive, killPid } from '../common/utils';
import {
Expand Down Expand Up @@ -325,6 +325,10 @@ class NNIManager implements Manager {
// FIXME: unit test
}

/**
 * Fetch a trial's log by forwarding the request to the training service.
 * @param trialJobId ID of the trial job
 * @param logType which log to retrieve
 * @returns whatever the training service resolves with
 */
public async getTrialLog(trialJobId: string, logType: LogType): Promise<string> {
    const trialLog: string = await this.trainingService.getTrialLog(trialJobId, logType);

    return trialLog;
}

public getExperimentProfile(): Promise<ExperimentProfile> {
// TO DO: using Promise.resolve()
const deferred: Deferred<ExperimentProfile> = new Deferred<ExperimentProfile>();
Expand Down
6 changes: 5 additions & 1 deletion src/nni_manager/core/test/mockedTrainingService.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import { Deferred } from 'ts-deferred';
import { Provider } from 'typescript-ioc';

import { MethodNotImplementedError } from '../../common/errors';
import { TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric } from '../../common/trainingService';
import { TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric, LogType } from '../../common/trainingService';

const testTrainingServiceProvider: Provider = {
get: () => { return new MockedTrainingService(); }
Expand Down Expand Up @@ -63,6 +63,10 @@ class MockedTrainingService extends TrainingService {
return deferred.promise;
}

/**
 * Not supported by this mock: always throws MethodNotImplementedError.
 * The method is not `async`, so the error is thrown synchronously rather
 * than delivered through the returned promise.
 */
public getTrialLog(trialJobId: string, logType: LogType): Promise<string> {
throw new MethodNotImplementedError();
}

async run(): Promise<void> {

}
Expand Down
14 changes: 14 additions & 0 deletions src/nni_manager/rest_server/restHandler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ class NNIRestHandler {
this.getMetricData(router);
this.getMetricDataByRange(router);
this.getLatestMetricData(router);
this.getTrialLog(router);
this.exportData(router);

// Express-joi-validator configuration
Expand Down Expand Up @@ -268,6 +269,19 @@ class NNIRestHandler {
});
}

/**
 * Register `GET /trial-log/:id/:type`, which responds with the log of a trial.
 *
 * `:id` is passed to the manager as the trial job ID and `:type` as the log
 * type. An empty log is replaced by the text 'No logs available.'.
 *
 * The manager call is awaited inside try/catch so that both a rejected promise
 * and a synchronous throw from the manager are routed to `handleError`;
 * otherwise a synchronous throw would leave the request without a response.
 *
 * @param router router on which the route is registered
 */
private getTrialLog(router: Router): void {
    router.get('/trial-log/:id/:type', async (req: Request, res: Response) => {
        try {
            const log: string = await this.nniManager.getTrialLog(req.params.id, req.params.type);
            res.send(log === '' ? 'No logs available.' : log);
        } catch (err) {
            // Normalise non-Error throwables so handleError always receives an Error.
            this.handleError(err instanceof Error ? err : new Error(String(err)), res);
        }
    });
}

private exportData(router: Router): void {
router.get('/export-data', (req: Request, res: Response) => {
this.nniManager.exportData().then((exportedData: string) => {
Expand Down
5 changes: 4 additions & 1 deletion src/nni_manager/rest_server/test/mockedNNIManager.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ import {
TrialJobStatistics, NNIManagerStatus
} from '../../common/manager';
import {
TrialJobApplicationForm, TrialJobDetail, TrialJobStatus
TrialJobApplicationForm, TrialJobDetail, TrialJobStatus, LogType
} from '../../common/trainingService';

export const testManagerProvider: Provider = {
Expand Down Expand Up @@ -118,6 +118,9 @@ export class MockedNNIManager extends Manager {
public getLatestMetricData(): Promise<MetricDataRecord[]> {
throw new MethodNotImplementedError();
}
/**
 * Not supported by this mock: always throws MethodNotImplementedError.
 * The method is not `async`, so the error is thrown synchronously rather
 * than delivered through the returned promise.
 */
public getTrialLog(trialJobId: string, logType: LogType): Promise<string> {
throw new MethodNotImplementedError();
}
public getExperimentProfile(): Promise<ExperimentProfile> {
const profile: ExperimentProfile = {
params: {
Expand Down
7 changes: 6 additions & 1 deletion src/nni_manager/training_service/dlts/dltsTrainingService.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,10 @@ import { EventEmitter } from 'events';
import { String } from 'typescript-string-operations';
import { getExperimentId } from '../../common/experimentStartupInfo';
import { getLogger, Logger } from '../../common/log';
import { MethodNotImplementedError } from '../../common/errors';
import {
NNIManagerIpConfig, TrainingService,
TrialJobApplicationForm, TrialJobDetail, TrialJobMetric
TrialJobApplicationForm, TrialJobDetail, TrialJobMetric, LogType
} from '../../common/trainingService';
import { DLTS_TRIAL_COMMAND_FORMAT } from './dltsData';
import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../common/containerJobData';
Expand Down Expand Up @@ -246,6 +247,10 @@ class DLTSTrainingService implements TrainingService {
return trialJob
}

/**
 * Trial log retrieval is not implemented for this training service.
 * Both arguments are ignored; the returned promise always rejects with
 * MethodNotImplementedError.
 */
public async getTrialLog(_trialJobId: string, _logType: LogType): Promise<string> {
throw new MethodNotImplementedError();
}

public addTrialJobMetricListener(listener: (metric: TrialJobMetric) => void): void {
this.metricsEmitter.on('metric', listener);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,9 @@ import { Base64 } from 'js-base64';
import { String } from 'typescript-string-operations';
import { getExperimentId } from '../../common/experimentStartupInfo';
import { getLogger, Logger } from '../../common/log';
import { MethodNotImplementedError } from '../../common/errors';
import {
NNIManagerIpConfig, TrialJobDetail, TrialJobMetric
NNIManagerIpConfig, TrialJobDetail, TrialJobMetric, LogType
} from '../../common/trainingService';
import { delay, getExperimentRootDir, getIPV4Address, getJobCancelStatus, getVersion, uniqueString } from '../../common/utils';
import { AzureStorageClientUtility } from './azureStorageClientUtils';
Expand Down Expand Up @@ -98,6 +99,10 @@ abstract class KubernetesTrainingService {
return Promise.resolve(kubernetesTrialJob);
}

/**
 * Trial log retrieval is not implemented for this training service.
 * Both arguments are ignored; the returned promise always rejects with
 * MethodNotImplementedError.
 */
public async getTrialLog(_trialJobId: string, _logType: LogType): Promise<string> {
throw new MethodNotImplementedError();
}

public addTrialJobMetricListener(listener: (metric: TrialJobMetric) => void): void {
this.metricsEmitter.on('metric', listener);
}
Expand Down
18 changes: 15 additions & 3 deletions src/nni_manager/training_service/local/localTrainingService.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ import { getExperimentId } from '../../common/experimentStartupInfo';
import { getLogger, Logger } from '../../common/log';
import {
HyperParameters, TrainingService, TrialJobApplicationForm,
TrialJobDetail, TrialJobMetric, TrialJobStatus
TrialJobDetail, TrialJobMetric, TrialJobStatus, LogType
} from '../../common/trainingService';
import {
delay, generateParamFileName, getExperimentRootDir, getJobCancelStatus, getNewLine, isAlive, uniqueString
Expand Down Expand Up @@ -184,6 +184,18 @@ class LocalTrainingService implements TrainingService {
return trialJob;
}

/**
 * Read one of a trial's log files from the experiment directory.
 *
 * The file is `<rootDir>/trials/<trialJobId>/trial.log` for 'TRIAL_LOG' and
 * `<rootDir>/trials/<trialJobId>/stderr` for 'TRIAL_ERROR'.
 *
 * @param trialJobId ID of the trial job; must resolve to a direct child of the trials directory
 * @param logType which log to read
 * @returns the file content decoded as UTF-8
 * @throws the returned promise rejects with an Error when logType is not a
 *         known log type or trialJobId points outside the trials directory,
 *         and with the underlying fs error when the file cannot be read
 */
public async getTrialLog(trialJobId: string, logType: LogType): Promise<string> {
    let logFileName: string;
    if (logType === 'TRIAL_LOG') {
        logFileName = 'trial.log';
    } else if (logType === 'TRIAL_ERROR') {
        logFileName = 'stderr';
    } else {
        throw new Error('unexpected log type');
    }

    // The ID may originate from a request URL, so refuse anything that is not
    // a single directory name directly under `trials` (e.g. '', '.', '../x', 'a/b').
    const trialsDir: string = path.join(this.rootDir, 'trials');
    const trialDir: string = path.join(trialsDir, trialJobId);
    if (path.dirname(trialDir) !== trialsDir) {
        throw new Error(`invalid trial job id: ${trialJobId}`);
    }

    return fs.promises.readFile(path.join(trialDir, logFileName), 'utf8');
}

public addTrialJobMetricListener(listener: (metric: TrialJobMetric) => void): void {
this.eventEmitter.on('metric', listener);
}
Expand Down Expand Up @@ -450,8 +462,8 @@ class LocalTrainingService implements TrainingService {
while (!this.stopping) {
while (!this.stopping && this.jobQueue.length !== 0) {
const trialJobId: string = this.jobQueue[0];
const trialJobDeatil: LocalTrialJobDetail | undefined = this.jobMap.get(trialJobId);
if (trialJobDeatil !== undefined && trialJobDeatil.status === 'WAITING') {
const trialJobDetail: LocalTrialJobDetail | undefined = this.jobMap.get(trialJobId);
if (trialJobDetail !== undefined && trialJobDetail.status === 'WAITING') {
const [success, resource] = this.tryGetAvailableResource();
if (!success) {
break;
Expand Down
7 changes: 6 additions & 1 deletion src/nni_manager/training_service/pai/paiTrainingService.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,10 @@ import { EventEmitter } from 'events';
import { Deferred } from 'ts-deferred';
import { getExperimentId } from '../../common/experimentStartupInfo';
import { getLogger, Logger } from '../../common/log';
import { MethodNotImplementedError } from '../../common/errors';
import {
NNIManagerIpConfig, TrainingService,
TrialJobApplicationForm, TrialJobDetail, TrialJobMetric
TrialJobApplicationForm, TrialJobDetail, TrialJobMetric, LogType
} from '../../common/trainingService';
import { delay } from '../../common/utils';
import { PAIJobInfoCollector } from './paiJobInfoCollector';
Expand Down Expand Up @@ -117,6 +118,10 @@ abstract class PAITrainingService implements TrainingService {
return jobs;
}

/**
 * Trial log retrieval is not implemented for this training service.
 * Both arguments are ignored; the returned promise always rejects with
 * MethodNotImplementedError.
 */
public async getTrialLog(_trialJobId: string, _logType: LogType): Promise<string> {
throw new MethodNotImplementedError();
}

public async getTrialJob(trialJobId: string): Promise<TrialJobDetail> {
if (this.paiClusterConfig === undefined) {
throw new Error('PAI Cluster config is not initialized');
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,13 @@ import * as path from 'path';
import { ShellExecutor } from 'training_service/remote_machine/shellExecutor';
import { Deferred } from 'ts-deferred';
import * as component from '../../common/component';
import { NNIError, NNIErrorNames } from '../../common/errors';
import { NNIError, NNIErrorNames, MethodNotImplementedError } from '../../common/errors';
import { getExperimentId } from '../../common/experimentStartupInfo';
import { getLogger, Logger } from '../../common/log';
import { ObservableTimer } from '../../common/observableTimer';
import {
HyperParameters, NNIManagerIpConfig, TrainingService, TrialJobApplicationForm,
TrialJobDetail, TrialJobMetric
TrialJobDetail, TrialJobMetric, LogType
} from '../../common/trainingService';
import {
delay, generateParamFileName, getExperimentRootDir, getIPV4Address, getJobCancelStatus,
Expand Down Expand Up @@ -180,6 +180,15 @@ class RemoteMachineTrainingService implements TrainingService {
}
}

/**
* Get trial job log.
* Not implemented for this training service: both arguments are ignored and
* the returned promise always rejects with MethodNotImplementedError.
* @param _trialJobId ID of trial job
* @param _logType 'TRIAL_LOG' | 'TRIAL_ERROR'
*/
public async getTrialLog(_trialJobId: string, _logType: LogType): Promise<string> {
throw new MethodNotImplementedError();
}

/**
* Add job metrics listener
* @param listener callback listener
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@
import { Container, Scope } from 'typescript-ioc';
import * as component from '../../common/component';
import { getLogger, Logger } from '../../common/log';
import { TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric } from '../../common/trainingService';
import { MethodNotImplementedError } from '../../common/errors'
import { TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric, LogType } from '../../common/trainingService';
import { delay } from '../../common/utils';
import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey';
import { PAIClusterConfig } from '../pai/paiConfig';
Expand Down Expand Up @@ -47,6 +48,10 @@ class RouterTrainingService implements TrainingService {
return await this.internalTrainingService.getTrialJob(trialJobId);
}

/**
 * Get a trial's log from the wrapped training service.
 *
 * The request is forwarded unchanged to the internal training service, so
 * whether logs are available depends on that service.
 *
 * @param _trialJobId ID of the trial job
 * @param _logType which log to retrieve ('TRIAL_LOG' or 'TRIAL_ERROR')
 * @returns whatever the internal training service resolves with
 * @throws the returned promise rejects with an Error when no internal training
 *         service has been assigned yet, or with the internal service's own error
 */
public async getTrialLog(_trialJobId: string, _logType: LogType): Promise<string> {
    if (this.internalTrainingService === undefined) {
        throw new Error("TrainingService is not assigned!");
    }

    return await this.internalTrainingService.getTrialLog(_trialJobId, _logType);
}

public addTrialJobMetricListener(listener: (metric: TrialJobMetric) => void): void {
if (this.internalTrainingService === undefined) {
throw new Error("TrainingService is not assigned!");
Expand Down
8 changes: 6 additions & 2 deletions src/nni_manager/training_service/reusable/trialDispatcher.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,10 @@ import * as path from 'path';
import { Writable } from 'stream';
import { String } from 'typescript-string-operations';
import * as component from '../../common/component';
import { NNIError, NNIErrorNames } from '../../common/errors';
import { NNIError, NNIErrorNames, MethodNotImplementedError } from '../../common/errors';
import { getBasePort, getExperimentId, getPlatform } from '../../common/experimentStartupInfo';
import { getLogger, Logger } from '../../common/log';
import { NNIManagerIpConfig, TrainingService, TrialJobApplicationForm, TrialJobMetric, TrialJobStatus } from '../../common/trainingService';
import { NNIManagerIpConfig, TrainingService, TrialJobApplicationForm, TrialJobMetric, TrialJobStatus, LogType } from '../../common/trainingService';
import { delay, getExperimentRootDir, getIPV4Address, getLogLevel, getVersion, mkDirPSync, uniqueString } from '../../common/utils';
import { GPU_INFO, INITIALIZED, KILL_TRIAL_JOB, NEW_TRIAL_JOB, REPORT_METRIC_DATA, SEND_TRIAL_JOB_PARAMETER, STDOUT, TRIAL_END, VERSION_CHECK } from '../../core/commands';
import { ScheduleResultType } from '../../training_service/common/gpuData';
Expand Down Expand Up @@ -111,6 +111,10 @@ class TrialDispatcher implements TrainingService {
return trial;
}

/**
 * Trial log retrieval is not implemented for this training service.
 * Both arguments are ignored; the returned promise always rejects with
 * MethodNotImplementedError.
 */
public async getTrialLog(_trialJobId: string, _logType: LogType): Promise<string> {
throw new MethodNotImplementedError();
}

public async submitTrialJob(form: TrialJobApplicationForm): Promise<TrialDetail> {
if (this.trialConfig === undefined) {
throw new Error(`trialConfig not initialized!`);
Expand Down
36 changes: 33 additions & 3 deletions src/nni_manager/training_service/test/localTrainingService.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,14 @@

'use strict';

import * as assert from 'assert';
import * as chai from 'chai';
import * as chaiAsPromised from 'chai-as-promised';
import * as fs from 'fs';
import * as path from 'path';
import * as tmp from 'tmp';
import * as component from '../../common/component';
import { TrialJobApplicationForm, TrialJobDetail, TrainingService } from '../../common/trainingService';
import { cleanupUnitTest, delay, prepareUnitTest } from '../../common/utils';
import { TrialJobApplicationForm, TrialJobDetail} from '../../common/trainingService';
import { cleanupUnitTest, delay, prepareUnitTest, getExperimentRootDir } from '../../common/utils';
import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey';
import { LocalTrainingService } from '../local/localTrainingService';

Expand Down Expand Up @@ -72,6 +72,36 @@ describe('Unit Test for LocalTrainingService', () => {
chai.expect(jobDetail.status).to.be.equals('USER_CANCELED');
}).timeout(20000);

// Verifies that getTrialLog returns the content of `trial.log` for 'TRIAL_LOG'
// and of `stderr` for 'TRIAL_ERROR', using files written by the test itself
// into the submitted job's working directory.
it('Get trial log', async () => {
    await localTrainingService.setClusterMetadata(TrialConfigMetadataKey.TRIAL_CONFIG, trialConfig);

    // submit job
    const form: TrialJobApplicationForm = {
        sequenceId: 0,
        hyperParameters: {
            value: 'mock hyperparameters',
            index: 0
        }
    };

    const jobDetail: TrialJobDetail = await localTrainingService.submitTrialJob(form);

    const trialsDir: string = path.join(getExperimentRootDir(), 'trials');
    const trialLogPath: string = path.join(jobDetail.workingDirectory, 'trial.log');
    const stderrPath: string = path.join(jobDetail.workingDirectory, 'stderr');

    try {
        try {
            // get trial log
            fs.mkdirSync(trialsDir);
            fs.mkdirSync(jobDetail.workingDirectory);
            fs.writeFileSync(trialLogPath, 'trial log');
            fs.writeFileSync(stderrPath, 'trial stderr');
            chai.expect(await localTrainingService.getTrialLog(jobDetail.id, 'TRIAL_LOG')).to.be.equals('trial log');
            chai.expect(await localTrainingService.getTrialLog(jobDetail.id, 'TRIAL_ERROR')).to.be.equals('trial stderr');
        } finally {
            // Remove the fixtures even when a step above failed; skip entries that
            // were never created so cleanup does not hide the original failure.
            for (const file of [trialLogPath, stderrPath]) {
                if (fs.existsSync(file)) {
                    fs.unlinkSync(file);
                }
            }
            for (const dir of [jobDetail.workingDirectory, trialsDir]) {
                if (fs.existsSync(dir)) {
                    fs.rmdirSync(dir);
                }
            }
        }
    } finally {
        // Always cancel the submitted job so it cannot leak into later tests.
        await localTrainingService.cancelTrialJob(jobDetail.id);
    }
}).timeout(20000);

it('Read metrics, Add listener, and remove listener', async () => {
// set meta data
const trialConfig: string = `{\"command\":\"python3 mockedTrial.py\", \"codeDir\":\"${localCodeDir}\",\"gpuNum\":0}`
Expand Down
Loading

0 comments on commit 10c177c

Please sign in to comment.