From 7db8001c430df41e98a8b2ab6ee9d6af517b98b1 Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Fri, 22 Nov 2019 10:41:57 +0800 Subject: [PATCH] fix conflict --- .../remoteMachineTrainingService.ts | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts b/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts index 11fc85f829..a01b41a8df 100644 --- a/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts +++ b/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts @@ -67,7 +67,7 @@ class RemoteMachineTrainingService implements TrainingService { private readonly expRootDir: string; private readonly remoteExpRootDir: string; private trialConfig: TrialConfig | undefined; - private readonly gpuScheduler: GPUScheduler; + private gpuScheduler?: GPUScheduler; private readonly jobQueue: string[]; private readonly timer: ObservableTimer; private stopping: boolean = false; @@ -87,7 +87,6 @@ class RemoteMachineTrainingService implements TrainingService { this.trialJobsMap = new Map(); this.trialSSHClientMap = new Map(); this.machineSSHClientMap = new Map(); - this.gpuScheduler = new GPUScheduler(this.machineSSHClientMap); this.jobQueue = []; this.expRootDir = getExperimentRootDir(); this.remoteExpRootDir = this.getRemoteExperimentRootDir(); @@ -334,6 +333,7 @@ class RemoteMachineTrainingService implements TrainingService { break; case TrialConfigMetadataKey.MACHINE_LIST: await this.setupConnections(value); + this.gpuScheduler = new GPUScheduler(this.machineSSHClientMap); break; case TrialConfigMetadataKey.TRIAL_CONFIG: const remoteMachineTrailConfig: TrialConfig = JSON.parse(value); @@ -397,9 +397,11 @@ class RemoteMachineTrainingService implements TrainingService { * remove gpu reversion when job is not running */ private updateGpuReservation(): void { - for (const [key, value] of this.trialJobsMap) { - if (!['WAITING', 'RUNNING'].includes(value.status)) { - this.gpuScheduler.removeGpuReservation(key, this.trialJobsMap); + if (this.gpuScheduler) { + for (const [key, value] of this.trialJobsMap) { + if (!['WAITING', 'RUNNING'].includes(value.status)) { + this.gpuScheduler.removeGpuReservation(key, this.trialJobsMap); + } } } } @@ -483,6 +485,9 @@ class RemoteMachineTrainingService implements TrainingService { if (this.trialConfig === undefined) { throw new Error('trial config is not initialized'); } + if (this.gpuScheduler === undefined) { + throw new Error('gpuScheduler is not initialized'); + } const trialJobDetail: RemoteMachineTrialJobDetail | undefined = this.trialJobsMap.get(trialJobId); if (trialJobDetail === undefined) { throw new NNIError(NNIErrorNames.INVALID_JOB_DETAIL, `Invalid job detail information for trial job ${trialJobId}`);