Skip to content
This repository has been archived by the owner on Sep 18, 2024. It is now read-only.

Fix OpenPAI IT #4057

Merged
merged 43 commits into from
Aug 11, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
1cd5c78
add pipeline for paiK8s
Feb 23, 2020
2c4f8c5
fix pipeline
Feb 23, 2020
0881058
fix token
Feb 23, 2020
dad062f
exclude multiphase
Feb 25, 2020
e12bf39
fix comments
Feb 27, 2020
5dee410
fix pipeline
Mar 5, 2020
3190f1b
fix conflict
Mar 5, 2020
1f9eb85
Merge branch 'v1.5' of https://github.com/microsoft/nni into dev-pai-pipeline
SparkSnail Mar 31, 2020
f855f01
fix pai-windows
SparkSnail Mar 31, 2020
7964124
debug
SparkSnail Mar 31, 2020
4b76499
fix error
Mar 31, 2020
a267885
debug
SparkSnail Apr 1, 2020
7b4d59d
debug
SparkSnail Apr 2, 2020
b535918
mount folder
SparkSnail Apr 2, 2020
736aa86
fix execopy
SparkSnail Apr 2, 2020
3f7c093
fix test image
SparkSnail Apr 3, 2020
7e39b24
fix docker image
SparkSnail Apr 3, 2020
5396657
set nfs path in variable
SparkSnail Apr 4, 2020
42572bc
fix openpai v1
Aug 3, 2021
b51d900
Merge branch 'master' into tmp-openpai
Aug 3, 2021
732c6eb
update agent pool
QuanluZhang Aug 3, 2021
23a9178
Merge branch 'master' into tmp-openpai
Aug 4, 2021
5103c21
fix storage service
SparkSnail Aug 5, 2021
5d35809
debug pipeline
SparkSnail Aug 5, 2021
3ddccee
fix pipeline
SparkSnail Aug 5, 2021
7391b9e
fix openpai
Aug 5, 2021
b52163e
fix pai
SparkSnail Aug 5, 2021
5f43cf7
fix pai
SparkSnail Aug 5, 2021
c213f1d
fix pai
SparkSnail Aug 5, 2021
7332688
fix build
SparkSnail Aug 5, 2021
5c14bc4
revert change
SparkSnail Aug 5, 2021
d36454a
Merge branch 'master' of https://github.com/microsoft/nni into dev-pai-pipeline
SparkSnail Aug 5, 2021
bb6b166
Merge branch 'master' of https://github.com/microsoft/nni into dev-pai-pipeline
SparkSnail Aug 6, 2021
bd3b9c9
Merge branch 'master' into tmp-openpai
Aug 9, 2021
c904ac8
fix pai
SparkSnail Aug 9, 2021
ea7abd9
fix pipeline
SparkSnail Aug 10, 2021
bcb2c10
debug
SparkSnail Aug 10, 2021
1a8df3c
Merge remote-tracking branch 'ms/dev-pai-pipeline' into tmp-openpai
Aug 11, 2021
f4aaf0d
.
Aug 11, 2021
d77e311
version check
Aug 11, 2021
4ef5658
debug
Aug 11, 2021
a6c9559
.
Aug 11, 2021
b708612
.
Aug 11, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 6 additions & 5 deletions pipelines/integration-test-openpai-linux.yml
Original file line number Diff line number Diff line change
Expand Up @@ -64,10 +64,11 @@ jobs:
--nni_docker_image nnidev/nni-nightly \
--pai_storage_config_name confignfs-data \
--pai_token $(pai_token) \
--nni_manager_nfs_mount_path /home/quzha/mnt-pai-ne/shinyang3 \
--container_nfs_mount_path /mnt/confignfs-data/shinyang3 \
--nni_manager_nfs_mount_path $(nni_manager_nfs_mount_path) \
--container_nfs_mount_path $(container_nfs_mount_path) \
--nni_manager_ip $(manager_ip) \
--vc nni
--vc nni \
--debug true
python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts pai
displayName: Integration test

Expand All @@ -82,8 +83,8 @@ jobs:
--nni_docker_image nnidev/nni-nightly \
--pai_storage_config_name confignfs-data \
--pai_token $(pai_token) \
--nni_manager_nfs_mount_path /home/quzha/mnt-pai-ne/shinyang3 \
--container_nfs_mount_path /mnt/confignfs-data/shinyang3 \
--nni_manager_nfs_mount_path $(nni_manager_nfs_mount_path) \
--container_nfs_mount_path $(container_nfs_mount_path) \
--nni_manager_ip $(manager_ip) \
--vc nni
python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts pai
Expand Down
3 changes: 3 additions & 0 deletions test/nni_test/nnitest/generate_ts_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ def update_training_service_config(args):
config[args.ts]['trial']['paiStorageConfigName'] = args.pai_storage_config_name
if args.vc is not None:
config[args.ts]['trial']['virtualCluster'] = args.vc
if args.debug is not None:
config[args.ts]['debug'] = args.debug.lower() == 'true'
elif args.ts == 'kubeflow':
if args.nfs_server is not None:
config[args.ts]['kubeflowConfig']['nfs']['server'] = args.nfs_server
Expand Down Expand Up @@ -146,6 +148,7 @@ def update_training_service_config(args):
parser.add_argument("--pai_storage_config_name", type=str)
parser.add_argument("--nni_manager_nfs_mount_path", type=str)
parser.add_argument("--container_nfs_mount_path", type=str)
parser.add_argument("--debug", type=str)
# args for kubeflow and frameworkController
parser.add_argument("--nfs_path", type=str)
parser.add_argument("--keyvault_vaultname", type=str)
Expand Down
2 changes: 1 addition & 1 deletion ts/nni_manager/common/experimentConfig.ts
Original file line number Diff line number Diff line change
Expand Up @@ -228,4 +228,4 @@ export function flattenConfig<T>(config: ExperimentConfig, platform: string): T
Object.assign(flattened, config.trainingService);
}
return <T>flattened;
}
}
3 changes: 2 additions & 1 deletion ts/nni_manager/training_service/pai/paiTrainingService.ts
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ class PAITrainingService implements TrainingService {
this.paiTokenUpdateInterval = 7200000; //2hours
this.log.info('Construct paiBase training service.');
this.config = flattenConfig(config, 'openpai');
this.versionCheck = !this.config.debug;
this.paiJobRestServer = new PAIJobRestServer(this);
this.paiToken = this.config.token;
this.protocol = this.config.host.toLowerCase().startsWith('https://') ? 'https' : 'http';
Expand All @@ -78,7 +79,7 @@ class PAITrainingService implements TrainingService {

private async copyTrialCode(): Promise<void> {
await validateCodeDir(this.config.trialCodeDirectory);
const nniManagerNFSExpCodeDir = path.join(this.config.trialCodeDirectory, this.experimentId, 'nni-code');
const nniManagerNFSExpCodeDir = path.join(this.config.localStorageMountPoint, this.experimentId, 'nni-code');
await execMkdir(nniManagerNFSExpCodeDir);
this.log.info(`Starting copy codeDir data from ${this.config.trialCodeDirectory} to ${nniManagerNFSExpCodeDir}`);
await execCopydir(this.config.trialCodeDirectory, nniManagerNFSExpCodeDir);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,11 @@ class RouterTrainingService implements TrainingService {
const instance = new RouterTrainingService();
instance.log = getLogger('RouterTrainingService');
const platform = Array.isArray(config.trainingService) ? 'hybrid' : config.trainingService.platform;
if (platform === 'remote' && !(<RemoteConfig>config.trainingService).reuseMode) {
if (platform === 'remote' && (<RemoteConfig>config.trainingService).reuseMode === false) {
instance.internalTrainingService = new RemoteMachineTrainingService(config);
} else if (platform === 'openpai' && !(<OpenpaiConfig>config.trainingService).reuseMode) {
} else if (platform === 'openpai' && (<OpenpaiConfig>config.trainingService).reuseMode === false) {
instance.internalTrainingService = new PAITrainingService(config);
} else if (platform === 'kubeflow' && !(<KubeflowConfig>config.trainingService).reuseMode) {
} else if (platform === 'kubeflow' && (<KubeflowConfig>config.trainingService).reuseMode === false) {
instance.internalTrainingService = new KubeflowTrainingService();
} else {
instance.internalTrainingService = await TrialDispatcher.construct(config);
Expand Down