Skip to content

Commit

Permalink
sharedstorage support remote umount and fix bug (microsoft#3456)
Browse files Browse the repository at this point in the history
  • Loading branch information
J-shang authored and Hao Ni committed Apr 7, 2021
1 parent 836fedc commit 281c043
Show file tree
Hide file tree
Showing 6 changed files with 91 additions and 11 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,12 @@ class LinuxCommands extends OsCommands {
command = `bash '${script}'`;
} else {
script = script.replace(/"/g, '\\"');
const result = script.match(/[^\\]\\\\"/g);
if (result) {
result.forEach((res) => {
script = script.replace(res, res.replace(/"$/g, '\\"'));
})
}
command = `bash -c "${script}"`;
}
return command;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ export class RemoteEnvironmentService extends EnvironmentService {
} else {
environment.setStatus('FAILED');
}
this.releaseEnvironmentResource(environment);
await this.releaseEnvironmentResource(environment);
}
}
}
Expand All @@ -194,7 +194,16 @@ export class RemoteEnvironmentService extends EnvironmentService {
* If a environment is finished, release the connection resource
* @param environment remote machine environment job detail
*/
private releaseEnvironmentResource(environment: EnvironmentInformation): void {
private async releaseEnvironmentResource(environment: EnvironmentInformation): Promise<void> {
if (environment.useSharedStorage) {
const executor = await this.getExecutor(environment.id);
const remoteUmountCommand = component.get<SharedStorageService>(SharedStorageService).remoteUmountCommand;
const result = await executor.executeScript(remoteUmountCommand, false, false);
if (result.exitCode !== 0) {
this.log.error(`Umount shared storage on remote machine failed.\n ERROR: ${result.stderr}`);
}
}

const executorManager = this.environmentExecutorManagerMap.get(environment.id);
if (executorManager === undefined) {
throw new Error(`ExecutorManager is not assigned for environment ${environment.id}`);
Expand Down Expand Up @@ -272,8 +281,11 @@ export class RemoteEnvironmentService extends EnvironmentService {
const executor = await this.getExecutor(environment.id);
if (environment.useSharedStorage) {
this.remoteExperimentRootDir = component.get<SharedStorageService>(SharedStorageService).remoteWorkingRoot;
const remoteMountCommand = component.get<SharedStorageService>(SharedStorageService).remoteMountCommand;
await executor.executeScript(remoteMountCommand, false, false);
const remoteMountCommand = component.get<SharedStorageService>(SharedStorageService).remoteMountCommand.replace(/echo -e /g, `echo `).replace(/echo /g, `echo -e `);
const result = await executor.executeScript(remoteMountCommand, false, false);
if (result.exitCode !== 0) {
throw new Error(`Mount shared storage on remote machine failed.\n ERROR: ${result.stderr}`);
}
} else {
this.remoteExperimentRootDir = executor.getRemoteExperimentRootDir(getExperimentId());
}
Expand Down Expand Up @@ -322,14 +334,14 @@ export class RemoteEnvironmentService extends EnvironmentService {

if (environment.status === 'UNKNOWN') {
environment.status = 'USER_CANCELED';
this.releaseEnvironmentResource(environment);
await this.releaseEnvironmentResource(environment);
return
}

const jobpidPath: string = `${environment.runnerWorkingFolder}/pid`;
try {
await executor.killChildProcesses(jobpidPath);
this.releaseEnvironmentResource(environment);
await this.releaseEnvironmentResource(environment);
} catch (error) {
this.log.error(`stopEnvironment: ${error}`);
}
Expand Down
2 changes: 2 additions & 0 deletions ts/nni_manager/training_service/reusable/sharedStorage.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ export abstract class SharedStorageService {
public abstract get storageService(): StorageService;
public abstract get localMountCommand(): string;
public abstract get remoteMountCommand(): string;
public abstract get remoteUmountCommand(): string;
public abstract get localWorkingRoot(): string;
public abstract get remoteWorkingRoot(): string;
public abstract cleanUp(): Promise<void>;
}
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ export class AzureBlobSharedStorageService extends SharedStorageService {
private log: Logger;
private internalStorageService: MountedStorageService;
private experimentId: string;
private localMounted?: string;

private storageType?: SharedStorageType;
private storageAccountName?: string;
Expand Down Expand Up @@ -113,10 +114,10 @@ export class AzureBlobSharedStorageService extends SharedStorageService {
this.log.error(errorMessage);
return Promise.reject(errorMessage);
}

if (azureblobConfig.localMounted === 'nnimount') {
this.localMounted = azureblobConfig.localMounted;
if (this.localMounted === 'nnimount') {
await this.helpLocalMount();
} else if (azureblobConfig.localMounted === 'nomount') {
} else if (this.localMounted === 'nomount') {
const errorMessage = `${this.storageType} Shared Storage: ${this.storageType} not Support 'nomount' yet.`;
this.log.error(errorMessage);
return Promise.reject(errorMessage);
Expand Down Expand Up @@ -154,6 +155,15 @@ export class AzureBlobSharedStorageService extends SharedStorageService {
}
}

public get remoteUmountCommand(): string {
if (this.remoteMountPoint) {
return `sudo umount -l ${this.remoteMountPoint}`;
} else {
this.log.error(`${this.storageType} Shared Storage: remoteMountPoint is not initialized.`);
return '';
}
}

private getCommand(mountPoint: string): string {
const install = `rm -f nni_install_fuseblob.sh && touch nni_install_fuseblob.sh && echo "${INSTALL_BLOBFUSE.replace(/\$/g, `\\$`).replace(/\n/g, `\\n`).replace(/"/g, `\\"`)}" >> nni_install_fuseblob.sh && bash nni_install_fuseblob.sh`;
const prepare = `sudo mkdir /mnt/resource/nniblobfusetmp -p && rm -f nni_fuse_connection.cfg && touch nni_fuse_connection.cfg && echo "accountName ${this.storageAccountName}\\naccountKey ${this.storageAccountKey}\\ncontainerName ${this.containerName}" >> nni_fuse_connection.cfg`;
Expand Down Expand Up @@ -206,4 +216,21 @@ export class AzureBlobSharedStorageService extends SharedStorageService {
return Promise.reject(errorMessage);
}
}

public async cleanUp(): Promise<void> {
if (this.localMounted !== 'nnimount') {
return Promise.resolve();
}
try {
const result = await cpp.exec(`sudo umount -l ${this.localMountPoint}`);
if (result.stderr) {
throw new Error(result.stderr);
}
} catch (error) {
const errorMessage: string = `${this.storageType} Shared Storage: Umount ${this.localMountPoint} failed, error is ${error}`;
this.log.error(errorMessage);
return Promise.reject(errorMessage);
}
return Promise.resolve();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ export class NFSSharedStorageService extends SharedStorageService {
private log: Logger;
private internalStorageService: MountedStorageService;
private experimentId: string;
private localMounted?: string;

private storageType?: SharedStorageType;
private nfsServer?: string;
Expand All @@ -83,9 +84,10 @@ export class NFSSharedStorageService extends SharedStorageService {
this.storageType = nfsConfig.storageType;
this.nfsServer = nfsConfig.nfsServer;
this.exportedDirectory = nfsConfig.exportedDirectory;
if (nfsConfig.localMounted === 'nnimount') {
this.localMounted = nfsConfig.localMounted;
if (this.localMounted === 'nnimount') {
await this.helpLocalMount();
} else if (nfsConfig.localMounted === 'nomount') {
} else if (this.localMounted === 'nomount') {
const errorMessage = `${this.storageType} Shared Storage: ${this.storageType} not Support 'nomount'.`;
this.log.error(errorMessage);
return Promise.reject(errorMessage);
Expand Down Expand Up @@ -122,6 +124,15 @@ export class NFSSharedStorageService extends SharedStorageService {
}
}

public get remoteUmountCommand(): string {
if (this.remoteMountPoint) {
return `sudo umount -f -l ${this.remoteMountPoint}`;
} else {
this.log.error(`${this.storageType} Shared Storage: remoteMountPoint is not initialized.`);
return '';
}
}

private getCommand(mountPoint: string): string {
const install = `rm -f nni_install_nfsclient.sh && touch nni_install_nfsclient.sh && echo "${INSTALL_NFS_CLIENT.replace(/\$/g, `\\$`).replace(/\n/g, `\\n`).replace(/"/g, `\\"`)}" >> nni_install_nfsclient.sh && bash nni_install_nfsclient.sh`;
const mount = `mkdir -p ${mountPoint} && sudo mount ${this.nfsServer}:${this.exportedDirectory} ${mountPoint}`;
Expand Down Expand Up @@ -157,4 +168,21 @@ export class NFSSharedStorageService extends SharedStorageService {

return Promise.resolve();
}

public async cleanUp(): Promise<void> {
if (this.localMounted !== 'nnimount') {
return Promise.resolve();
}
try {
const result = await cpp.exec(`sudo umount -f -l ${this.localMountPoint}`);
if (result.stderr) {
throw new Error(result.stderr);
}
} catch (error) {
const errorMessage: string = `${this.storageType} Shared Storage: Umount ${this.localMountPoint} failed, error is ${error}`;
this.log.error(errorMessage);
return Promise.reject(errorMessage);
}
return Promise.resolve();
}
}
5 changes: 5 additions & 0 deletions ts/nni_manager/training_service/reusable/trialDispatcher.ts
Original file line number Diff line number Diff line change
Expand Up @@ -350,6 +350,11 @@ class TrialDispatcher implements TrainingService {
for (const commandChannel of this.commandChannelSet) {
await commandChannel.stop();
}
if (this.useSharedStorage) {
this.log.info(`stopping shared storage...`)
await component.get<SharedStorageService>(SharedStorageService).cleanUp();
this.log.info(`shared storage stopped.`)
}
}

private async environmentMaintenanceLoop(): Promise<void> {
Expand Down

0 comments on commit 281c043

Please sign in to comment.