Skip to content
This repository has been archived by the owner on Sep 18, 2024. It is now read-only.

Fix remoteTrainingService gpuScheduler #1749

Merged
merged 18 commits into from
Nov 22, 2019
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ class RemoteMachineTrainingService implements TrainingService {
private readonly expRootDir: string;
private readonly remoteExpRootDir: string;
private trialConfig: TrialConfig | undefined;
private readonly gpuScheduler: GPUScheduler;
private gpuScheduler?: GPUScheduler;
private readonly jobQueue: string[];
private readonly timer: ObservableTimer;
private stopping: boolean = false;
Expand All @@ -87,7 +87,6 @@ class RemoteMachineTrainingService implements TrainingService {
this.trialJobsMap = new Map<string, RemoteMachineTrialJobDetail>();
this.trialSSHClientMap = new Map<string, Client>();
this.machineSSHClientMap = new Map<RemoteMachineMeta, SSHClientManager>();
this.gpuScheduler = new GPUScheduler(this.machineSSHClientMap);
this.jobQueue = [];
this.expRootDir = getExperimentRootDir();
this.remoteExpRootDir = this.getRemoteExperimentRootDir();
Expand Down Expand Up @@ -334,6 +333,7 @@ class RemoteMachineTrainingService implements TrainingService {
break;
case TrialConfigMetadataKey.MACHINE_LIST:
await this.setupConnections(value);
this.gpuScheduler = new GPUScheduler(this.machineSSHClientMap);
break;
case TrialConfigMetadataKey.TRIAL_CONFIG:
const remoteMachineTrailConfig: TrialConfig = <TrialConfig>JSON.parse(value);
Expand Down Expand Up @@ -397,9 +397,11 @@ class RemoteMachineTrainingService implements TrainingService {
* remove gpu reversion when job is not running
*/
private updateGpuReservation(): void {
for (const [key, value] of this.trialJobsMap) {
if (!['WAITING', 'RUNNING'].includes(value.status)) {
this.gpuScheduler.removeGpuReservation(key, this.trialJobsMap);
if (this.gpuScheduler) {
for (const [key, value] of this.trialJobsMap) {
if (!['WAITING', 'RUNNING'].includes(value.status)) {
this.gpuScheduler.removeGpuReservation(key, this.trialJobsMap);
}
}
}
}
Expand Down Expand Up @@ -483,6 +485,9 @@ class RemoteMachineTrainingService implements TrainingService {
if (this.trialConfig === undefined) {
throw new Error('trial config is not initialized');
}
if (this.gpuScheduler === undefined) {
throw new Error('gpuScheduler is not initialized');
}
const trialJobDetail: RemoteMachineTrialJobDetail | undefined = this.trialJobsMap.get(trialJobId);
if (trialJobDetail === undefined) {
throw new NNIError(NNIErrorNames.INVALID_JOB_DETAIL, `Invalid job detail information for trial job ${trialJobId}`);
Expand Down