diff --git a/dpgen/arginfo.py b/dpgen/arginfo.py index 52f966316..348451339 100644 --- a/dpgen/arginfo.py +++ b/dpgen/arginfo.py @@ -24,7 +24,7 @@ def general_mdata_arginfo(name: str, tasks: Tuple[str]) -> Argument: doc_api_version = "Please set to 1.0" doc_deepmd_version = "DeePMD-kit version, e.g. 2.1.3" doc_run_mdata = "machine.json file" - arg_api_version = Argument("api_version", str, optional=False, doc=doc_api_version) + arg_api_version = Argument("api_version", str, default="1.0", optional=True, doc=doc_api_version) arg_deepmd_version = Argument( "deepmd_version", str, optional=True, default="2", doc=doc_deepmd_version) diff --git a/dpgen/auto_test/common_equi.py b/dpgen/auto_test/common_equi.py index 863fdbdbe..978407eda 100644 --- a/dpgen/auto_test/common_equi.py +++ b/dpgen/auto_test/common_equi.py @@ -11,7 +11,6 @@ from dpgen import dlog from dpgen.auto_test.calculator import make_calculator from dpgen.auto_test.mpdb import get_structure -from dpgen.dispatcher.Dispatcher import make_dispatcher from packaging.version import Version from dpgen.dispatcher.Dispatcher import make_submission from dpgen.remote.decide_machine import convert_mdata @@ -176,21 +175,9 @@ def run_equi(confs, work_path = os.getcwd() print("%s --> Runing... " % (work_path)) - api_version = mdata.get('api_version', '0.9') + api_version = mdata.get('api_version', '1.0') if Version(api_version) < Version('1.0'): - warnings.warn(f"the dpdispatcher will be updated to new version." - f"And the interface may be changed. Please check the documents for more details") - disp = make_dispatcher(machine, resources, work_path, run_tasks, group_size) - disp.run_jobs(resources, - command, - work_path, - run_tasks, - group_size, - forward_common_files, - forward_files, - backward_files, - outlog='outlog', - errlog='errlog') + raise RuntimeError("API version %s has been removed. Please upgrade to 1.0." % api_version) elif Version(api_version) >= Version('1.0'): submission = make_submission( diff --git a/dpgen/auto_test/common_prop.py b/dpgen/auto_test/common_prop.py index 7a5f61930..0d237d515 100644 --- a/dpgen/auto_test/common_prop.py +++ b/dpgen/auto_test/common_prop.py @@ -13,7 +13,6 @@ from dpgen.auto_test.Vacancy import Vacancy from dpgen.auto_test.Gamma import Gamma from dpgen.auto_test.calculator import make_calculator -from dpgen.dispatcher.Dispatcher import make_dispatcher from dpgen.dispatcher.Dispatcher import make_submission from dpgen.remote.decide_machine import convert_mdata from dpgen.auto_test.lib.utils import create_path @@ -190,21 +189,9 @@ def worker(work_path, inter_type): run_tasks = [os.path.basename(ii) for ii in all_task] machine, resources, command, group_size = util.get_machine_info(mdata, inter_type) - api_version = mdata.get('api_version', '0.9') + api_version = mdata.get('api_version', '1.0') if Version(api_version) < Version('1.0'): - warnings.warn(f"the dpdispatcher will be updated to new version." - f"And the interface may be changed. Please check the documents for more details") - disp = make_dispatcher(machine, resources, work_path, run_tasks, group_size) - disp.run_jobs(resources, - command, - work_path, - run_tasks, - group_size, - forward_common_files, - forward_files, - backward_files, - outlog='outlog', - errlog='errlog') + raise RuntimeError("API version %s has been removed. Please upgrade to 1.0." 
% api_version) elif Version(api_version) >= Version('1.0'): submission = make_submission( mdata_machine=machine, diff --git a/dpgen/auto_test/lib/BatchJob.py b/dpgen/auto_test/lib/BatchJob.py deleted file mode 100644 index 6d68497df..000000000 --- a/dpgen/auto_test/lib/BatchJob.py +++ /dev/null @@ -1,81 +0,0 @@ -#!/usr/bin/env python3 - -import os -import sys -from enum import Enum -from subprocess import Popen, PIPE - -class JobStatus (Enum) : - unsubmitted = 1 - waiting = 2 - running = 3 - terminated = 4 - finished = 5 - unknow = 100 - -class BatchJob (object): - """ - Abstract class of a batch job - It submit a job (leave the id in file tag_jobid) - It check the status of the job (return JobStatus) - NOTICE: I assume that when a job finishes, a tag file named tag_finished should be touched by the user. - TYPICAL USAGE: - job = DERIVED_BatchJob (dir, script) - job.submit () - stat = job.check_status () - """ - def __init__ (self, - job_dir = "", # dir of the job - job_script = "", # name of the job script - job_finish_tag = "tag_finished", # name of the tag for finished job - job_id_file = "tag_jobid") : # job id if making an existing job - self.job_dir = job_dir - self.job_script = job_script - self.job_id_file = job_dir + "/" + job_id_file - self.job_finish_tag = job_dir + "/" + job_finish_tag - self.cwd = os.getcwd() - self.submit_cmd = str(self.submit_command()) - def get_job_id (self) : - if True == os.path.exists (self.job_id_file) : - fp = open (self.job_id_file, 'r') - job_id = fp.read () - return str(job_id) - else : - return "" - def submit_command (self) : - """ - submission is - $ [command] [script] - """ - raise RuntimeError ("submit_command not implemented") - def check_status (self): - raise RuntimeError ("check_status not implemented") - def submit (self) : - if self.get_job_id () != "" : - stat = self.check_status() - if stat != JobStatus.terminated : - if stat == JobStatus.unknow : - raise RuntimeError ("unknown job status, terminate!") - print ("# job %s, dir %s already submitted (waiting, running or finished), would not submit again" % - (self.get_job_id(), self.job_dir)) - return self.get_job_id() - else : - print ("# find terminated job " + self.get_job_id() + ", submit again") - if (False == os.path.isdir (self.job_dir) ) : - raise RuntimeError ("cannot find job dir " + self.job_dir) - abs_job_script = self.job_dir + "/" + self.job_script - if False == os.path.exists (abs_job_script) : - raise RuntimeError ("cannot find job script " + abs_job_script) - cwd = os.getcwd() - os.chdir (self.job_dir) - ret = Popen([self.submit_cmd + " " + self.job_script], stdout=PIPE, stderr=PIPE, shell = True) - stdout, stderr = ret.communicate() - if str(stderr, encoding='ascii') != "": - raise RuntimeError (stderr) - job_id = str(stdout, encoding='ascii').replace('\n','').split()[-1] - print ("# job %s submitted, dir %s " % (job_id, self.job_dir)) - fp = open (self.job_id_file, 'w') - fp.write (job_id) - fp.close() - os.chdir (cwd) - return self.get_job_id() diff --git a/dpgen/auto_test/lib/RemoteJob.py b/dpgen/auto_test/lib/RemoteJob.py deleted file mode 100644 index e66df1351..000000000 --- a/dpgen/auto_test/lib/RemoteJob.py +++ /dev/null @@ -1,565 +0,0 @@ -#!/usr/bin/env python3 - -import os, sys, paramiko, json, uuid, tarfile, time, stat -from enum import Enum - -class JobStatus (Enum) : - unsubmitted = 1 - waiting = 2 - running = 3 - terminated = 4 - finished = 5 - unknow = 100 - -def _default_item(resources, key, value) : - if key not in resources : - resources[key] = value - 
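Note on the migration shown in the common_equi.py and common_prop.py hunks above (and repeated for dpgen/data/gen.py later in this diff): with the 0.9 dispatcher path removed, every caller now goes through the make_submission() helper that survives in dpgen/dispatcher/Dispatcher.py, and api_version defaults to "1.0" per the dpgen/arginfo.py hunk. The sketch below illustrates the replacement call path under stated assumptions: the machine/resources dictionaries and task names are invented for illustration, and keyword names other than mdata_machine (which is visible in the hunk above) are taken from the surviving helper and dpdispatcher's Submission API rather than guaranteed by this diff.

    # Hypothetical example of the dpdispatcher-1.0 call path that replaces the
    # removed make_dispatcher()/run_jobs() flow. The dictionaries below stand in
    # for what dpgen reads from machine.json; "api_version": "1.0" is now the
    # default (see the dpgen/arginfo.py hunk), so it may be omitted.
    from dpgen.dispatcher.Dispatcher import make_submission

    machine = {
        "batch_type": "Slurm",
        "context_type": "SSHContext",
        "local_root": "./",
        "remote_root": "/tmp/dpgen",
        "remote_profile": {"hostname": "cluster", "username": "user"},
    }
    resources = {
        "number_node": 1,
        "cpu_per_node": 8,
        "gpu_per_node": 1,
        "queue_name": "gpu",
        "group_size": 5,
    }

    # Keyword names besides mdata_machine are assumptions based on the
    # make_submission() helper kept in Dispatcher.py; treat this as a sketch.
    submission = make_submission(
        mdata_machine=machine,
        mdata_resources=resources,
        commands=["lmp -i input.lammps"],
        work_path="work_dir",
        run_tasks=["task.000", "task.001"],
        group_size=5,
        forward_common_files=[],
        forward_files=["input.lammps", "conf.lmp"],
        backward_files=["log.lammps", "model_devi.out"],
        outlog="outlog",
        errlog="errlog",
    )
    submission.run_submission()  # blocks until all grouped tasks finish

The old BatchJob/RemoteJob/ALI/AWS classes deleted below provided this scheduling, file staging, and status polling by hand; under api_version 1.0 all of it is delegated to dpdispatcher, which is why the RuntimeError above simply rejects 0.9 configurations instead of falling back.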
-def _set_default_resource(res) : - if res == None : - res = {} - _default_item(res, 'numb_node', 1) - _default_item(res, 'task_per_node', 1) - _default_item(res, 'numb_gpu', 0) - _default_item(res, 'time_limit', '1:0:0') - _default_item(res, 'mem_limit', -1) - _default_item(res, 'partition', '') - _default_item(res, 'account', '') - _default_item(res, 'qos', '') - _default_item(res, 'constraint_list', []) - _default_item(res, 'license_list', []) - _default_item(res, 'exclude_list', []) - _default_item(res, 'module_unload_list', []) - _default_item(res, 'module_list', []) - _default_item(res, 'source_list', []) - _default_item(res, 'envs', None) - _default_item(res, 'with_mpi', False) - - -class SSHSession (object) : - def __init__ (self, jdata) : - self.remote_profile = jdata - # with open(remote_profile) as fp : - # self.remote_profile = json.load(fp) - self.remote_host = self.remote_profile['hostname'] - self.remote_port = self.remote_profile['port'] - self.remote_uname = self.remote_profile['username'] - self.remote_password = self.remote_profile['password'] - self.remote_workpath = self.remote_profile['work_path'] - self.ssh = self._setup_ssh(self.remote_host, self.remote_port, username = self.remote_uname,password=self.remote_password) - - def _setup_ssh(self, - hostname, - port, - username = None, - password = None): - ssh_client = paramiko.SSHClient() - ssh_client.load_system_host_keys() - ssh_client.set_missing_host_key_policy(paramiko.WarningPolicy) - ssh_client.connect(hostname, port=port, username=username, password=password) - assert(ssh_client.get_transport().is_active()) - return ssh_client - - def get_ssh_client(self) : - return self.ssh - - def get_session_root(self) : - return self.remote_workpath - - def close(self) : - self.ssh.close() - - -class RemoteJob (object): - def __init__ (self, - ssh_session, - local_root - ) : - - self.local_root = os.path.abspath(local_root) - self.job_uuid = str(uuid.uuid4()) - # self.job_uuid = 'a21d0017-c9f1-4d29-9a03-97df06965cef' - self.remote_root = os.path.join(ssh_session.get_session_root(), self.job_uuid) - print("local_root is ", local_root) - print("remote_root is", self.remote_root) - self.ssh = ssh_session.get_ssh_client() - sftp = self.ssh.open_sftp() - sftp.mkdir(self.remote_root) - sftp.close() - # open('job_uuid', 'w').write(self.job_uuid) - - def get_job_root(self) : - return self.remote_root - - def upload(self, - job_dirs, - local_up_files, - dereference = True) : - cwd = os.getcwd() - os.chdir(self.local_root) - file_list = [] - for ii in job_dirs : - for jj in local_up_files : - file_list.append(os.path.join(ii,jj)) - self._put_files(file_list, dereference = dereference) - os.chdir(cwd) - - def download(self, - job_dirs, - remote_down_files) : - cwd = os.getcwd() - os.chdir(self.local_root) - file_list = [] - for ii in job_dirs : - for jj in remote_down_files : - file_list.append(os.path.join(ii,jj)) - self._get_files(file_list) - os.chdir(cwd) - - def block_checkcall(self, - cmd) : - stdin, stdout, stderr = self.ssh.exec_command(('cd %s ;' % self.remote_root) + cmd) - exit_status = stdout.channel.recv_exit_status() - if exit_status != 0: - raise RuntimeError("Get error code %d in calling through ssh with job: %s ", (exit_status, self.job_uuid)) - return stdin, stdout, stderr - - def block_call(self, - cmd) : - stdin, stdout, stderr = self.ssh.exec_command(('cd %s ;' % self.remote_root) + cmd) - exit_status = stdout.channel.recv_exit_status() - return exit_status, stdin, stdout, stderr - - def clean(self) : - sftp = 
self.ssh.open_sftp() - self._rmtree(sftp, self.remote_root) - sftp.close() - - def _rmtree(self, sftp, remotepath, level=0, verbose = False): - for f in sftp.listdir_attr(remotepath): - rpath = os.path.join(remotepath, f.filename) - if stat.S_ISDIR(f.st_mode): - self._rmtree(sftp, rpath, level=(level + 1)) - else: - rpath = os.path.join(remotepath, f.filename) - if verbose: print('removing %s%s' % (' ' * level, rpath)) - sftp.remove(rpath) - if verbose: print('removing %s%s' % (' ' * level, remotepath)) - sftp.rmdir(remotepath) - - def _put_files(self, - files, - dereference = True) : - of = self.job_uuid + '.tgz' - # local tar - cwd = os.getcwd() - os.chdir(self.local_root) - if os.path.isfile(of) : - os.remove(of) - with tarfile.open(of, "w:gz", dereference = dereference) as tar: - for ii in files : - tar.add(ii) - os.chdir(cwd) - # trans - from_f = os.path.join(self.local_root, of) - to_f = os.path.join(self.remote_root, of) - sftp = self.ssh.open_sftp() - sftp.put(from_f, to_f) - # remote extract - self.block_checkcall('tar xf %s' % of) - # clean up - os.remove(from_f) - sftp.remove(to_f) - sftp.close() - - def _get_files(self, - files) : - of = self.job_uuid + '.tgz' - flist = "" - for ii in files : - flist += " " + ii - # remote tar - self.block_checkcall('tar czf %s %s' % (of, flist)) - # trans - from_f = os.path.join(self.remote_root, of) - to_f = os.path.join(self.local_root, of) - if os.path.isfile(to_f) : - os.remove(to_f) - sftp = self.ssh.open_sftp() - sftp.get(from_f, to_f) - # extract - cwd = os.getcwd() - os.chdir(self.local_root) - with tarfile.open(of, "r:gz") as tar: - def is_within_directory(directory, target): - - abs_directory = os.path.abspath(directory) - abs_target = os.path.abspath(target) - - prefix = os.path.commonprefix([abs_directory, abs_target]) - - return prefix == abs_directory - - def safe_extract(tar, path=".", members=None, *, numeric_owner=False): - - for member in tar.getmembers(): - member_path = os.path.join(path, member.name) - if not is_within_directory(path, member_path): - raise Exception("Attempted Path Traversal in Tar File") - - tar.extractall(path, members, numeric_owner=numeric_owner) - - - safe_extract(tar) - os.chdir(cwd) - # cleanup - os.remove(to_f) - sftp.remove(from_f) - -class CloudMachineJob (RemoteJob) : - def submit(self, - job_dirs, - cmd, - args = None, - resources = None) : - - #print("Current path is",os.getcwd()) - - #for ii in job_dirs : - # if not os.path.isdir(ii) : - # raise RuntimeError("cannot find dir %s" % ii) - # print(self.remote_root) - script_name = self._make_script(job_dirs, cmd, args, resources) - self.stdin, self.stdout, self.stderr = self.ssh.exec_command(('cd %s; bash %s' % (self.remote_root, script_name))) - # print(self.stderr.read().decode('utf-8')) - # print(self.stdout.read().decode('utf-8')) - - def check_status(self) : - if not self._check_finish(self.stdout) : - return JobStatus.running - elif self._get_exit_status(self.stdout) == 0 : - return JobStatus.finished - else : - return JobStatus.terminated - - def _check_finish(self, stdout) : - return stdout.channel.exit_status_ready() - - def _get_exit_status(self, stdout) : - return stdout.channel.recv_exit_status() - - def _make_script(self, - job_dirs, - cmd, - args = None, - resources = None) : - _set_default_resource(resources) - envs = resources['envs'] - module_list = resources['module_list'] - module_unload_list = resources['module_unload_list'] - task_per_node = resources['task_per_node'] - - script_name = 'run.sh' - if args == None : - args = 
[] - for ii in job_dirs: - args.append('') - script = os.path.join(self.remote_root, script_name) - sftp = self.ssh.open_sftp() - with sftp.open(script, 'w') as fp : - fp.write('#!/bin/bash\n\n') - # fp.write('set -euo pipefail\n') - if envs != None : - for key in envs.keys() : - fp.write('export %s=%s\n' % (key, envs[key])) - fp.write('\n') - if module_unload_list is not None : - for ii in module_unload_list : - fp.write('module unload %s\n' % ii) - fp.write('\n') - if module_list is not None : - for ii in module_list : - fp.write('module load %s\n' % ii) - fp.write('\n') - for ii,jj in zip(job_dirs, args) : - fp.write('cd %s\n' % ii) - fp.write('test $? -ne 0 && exit\n') - if resources['with_mpi'] == True : - fp.write('mpirun -n %d %s %s\n' - % (task_per_node, cmd, jj)) - else : - fp.write('%s %s\n' % (cmd, jj)) - fp.write('test $? -ne 0 && exit\n') - fp.write('cd %s\n' % self.remote_root) - fp.write('test $? -ne 0 && exit\n') - fp.write('\ntouch tag_finished\n') - sftp.close() - return script_name - - -class SlurmJob (RemoteJob) : - def submit(self, - job_dirs, - cmd, - args = None, - resources = None) : - script_name = self._make_script(job_dirs, cmd, args, res = resources) - stdin, stdout, stderr = self.block_checkcall(('cd %s; sbatch %s' % (self.remote_root, script_name))) - subret = (stdout.readlines()) - job_id = subret[0].split()[-1] - sftp = self.ssh.open_sftp() - with sftp.open(os.path.join(self.remote_root, 'job_id'), 'w') as fp: - fp.write(job_id) - sftp.close() - - def check_status(self) : - job_id = self._get_job_id() - if job_id == "" : - raise RuntimeError("job %s is has not been submitted" % self.remote_root) - ret, stdin, stdout, stderr\ - = self.block_call ("squeue --job " + job_id) - err_str = stderr.read().decode('utf-8') - if (ret != 0) : - if str("Invalid job id specified") in err_str : - if self._check_finish_tag() : - return JobStatus.finished - else : - return JobStatus.terminated - else : - raise RuntimeError\ - ("status command squeue fails to execute\nerror message:%s\nreturn code %d\n" % (err_str, ret)) - status_line = stdout.read().decode('utf-8').split ('\n')[-2] - status_word = status_line.split ()[-4] - if status_word in ["PD","CF","S"] : - return JobStatus.waiting - elif status_word in ["R","CG"] : - return JobStatus.running - elif status_word in ["C","E","K","BF","CA","CD","F","NF","PR","SE","ST","TO"] : - if self._check_finish_tag() : - return JobStatus.finished - else : - return JobStatus.terminated - else : - return JobStatus.unknown - - def _get_job_id(self) : - sftp = self.ssh.open_sftp() - with sftp.open(os.path.join(self.remote_root, 'job_id'), 'r') as fp: - ret = fp.read().decode('utf-8') - sftp.close() - return ret - - def _check_finish_tag(self) : - sftp = self.ssh.open_sftp() - try: - sftp.stat(os.path.join(self.remote_root, 'tag_finished')) - ret = True - except IOError: - ret = False - sftp.close() - return ret - - def _make_script(self, - job_dirs, - cmd, - args = None, - res = None) : - _set_default_resource(res) - ret = '' - ret += "#!/bin/bash -l\n" - ret += "#SBATCH -N %d\n" % res['numb_node'] - ret += "#SBATCH --ntasks-per-node %d\n" % res['task_per_node'] - ret += "#SBATCH -t %s\n" % res['time_limit'] - if res['mem_limit'] > 0 : - ret += "#SBATCH --mem %dG \n" % res['mem_limit'] - if len(res['account']) > 0 : - ret += "#SBATCH --account %s \n" % res['account'] - if len(res['partition']) > 0 : - ret += "#SBATCH --partition %s \n" % res['partition'] - if len(res['qos']) > 0 : - ret += "#SBATCH --qos %s \n" % res['qos'] - if 
res['numb_gpu'] > 0 : - ret += "#SBATCH --gres=gpu:%d\n" % res['numb_gpu'] - for ii in res['constraint_list'] : - ret += '#SBATCH -C %s \n' % ii - for ii in res['license_list'] : - ret += '#SBATCH -L %s \n' % ii - for ii in res['exclude_list'] : - ret += '#SBATCH --exclude %s \n' % ii - ret += "\n" - # ret += 'set -euo pipefail\n\n' - for ii in res['module_unload_list'] : - ret += "module unload %s\n" % ii - for ii in res['module_list'] : - ret += "module load %s\n" % ii - ret += "\n" - for ii in res['source_list'] : - ret += "source %s\n" %ii - ret += "\n" - envs = res['envs'] - if envs != None : - for key in envs.keys() : - ret += 'export %s=%s\n' % (key, envs[key]) - ret += '\n' - - if args == None : - args = [] - for ii in job_dirs: - args.append('') - for ii,jj in zip(job_dirs, args) : - ret += 'cd %s\n' % ii - ret += 'test $? -ne 0 && exit\n' - if res['with_mpi'] : - ret += 'mpirun -n %d %s %s\n' % (res['task_per_node'],cmd, jj) - else : - ret += '%s %s\n' % (cmd, jj) - ret += 'test $? -ne 0 && exit\n' - ret += 'cd %s\n' % self.remote_root - ret += 'test $? -ne 0 && exit\n' - ret += '\ntouch tag_finished\n' - - script_name = 'run.sub' - script = os.path.join(self.remote_root, script_name) - sftp = self.ssh.open_sftp() - with sftp.open(script, 'w') as fp : - fp.write(ret) - sftp.close() - - return script_name - - -class PBSJob (RemoteJob) : - def submit(self, - job_dirs, - cmd, - args = None, - resources = None) : - script_name = self._make_script(job_dirs, cmd, args, res = resources) - stdin, stdout, stderr = self.block_checkcall(('cd %s; qsub %s' % (self.remote_root, script_name))) - subret = (stdout.readlines()) - job_id = subret[0].split()[0] - sftp = self.ssh.open_sftp() - with sftp.open(os.path.join(self.remote_root, 'job_id'), 'w') as fp: - fp.write(job_id) - sftp.close() - - def check_status(self) : - job_id = self._get_job_id() - if job_id == "" : - raise RuntimeError("job %s is has not been submitted" % self.remote_root) - ret, stdin, stdout, stderr\ - = self.block_call ("qstat " + job_id) - err_str = stderr.read().decode('utf-8') - if (ret != 0) : - if str("qstat: Unknown Job Id") in err_str : - if self._check_finish_tag() : - return JobStatus.finished - else : - return JobStatus.terminated - else : - raise RuntimeError ("status command qstat fails to execute. 
erro info: %s return code %d" - % (err_str, ret)) - status_line = stdout.read().decode('utf-8').split ('\n')[-2] - status_word = status_line.split ()[-2] -# print (status_word) - if status_word in ["Q","H"] : - return JobStatus.waiting - elif status_word in ["R"] : - return JobStatus.running - elif status_word in ["C","E","K"] : - if self._check_finish_tag() : - return JobStatus.finished - else : - return JobStatus.terminated - else : - return JobStatus.unknown - - def _get_job_id(self) : - sftp = self.ssh.open_sftp() - with sftp.open(os.path.join(self.remote_root, 'job_id'), 'r') as fp: - ret = fp.read().decode('utf-8') - sftp.close() - return ret - - def _check_finish_tag(self) : - sftp = self.ssh.open_sftp() - try: - sftp.stat(os.path.join(self.remote_root, 'tag_finished')) - ret = True - except IOError: - ret = False - sftp.close() - return ret - - def _make_script(self, - job_dirs, - cmd, - args = None, - res = None) : - _set_default_resource(res) - ret = '' - ret += "#!/bin/bash -l\n" - if res['numb_gpu'] == 0: - ret += '#PBS -l nodes=%d:ppn=%d\n' % (res['numb_node'], res['task_per_node']) - else : - ret += '#PBS -l nodes=%d:ppn=%d:gpus=%d\n' % (res['numb_node'], res['task_per_node'], res['numb_gpu']) - ret += '#PBS -l walltime=%s\n' % (res['time_limit']) - if res['mem_limit'] > 0 : - ret += "#PBS -l mem=%dG \n" % res['mem_limit'] - ret += '#PBS -j oe\n' - if len(res['partition']) > 0 : - ret += '#PBS -q %s\n' % res['partition'] - ret += "\n" - for ii in res['module_unload_list'] : - ret += "module unload %s\n" % ii - for ii in res['module_list'] : - ret += "module load %s\n" % ii - ret += "\n" - for ii in res['source_list'] : - ret += "source %s\n" %ii - ret += "\n" - envs = res['envs'] - if envs != None : - for key in envs.keys() : - ret += 'export %s=%s\n' % (key, envs[key]) - ret += '\n' - ret += 'cd $PBS_O_WORKDIR\n\n' - - if args == None : - args = [] - for ii in job_dirs: - args.append('') - for ii,jj in zip(job_dirs, args) : - ret += 'cd %s\n' % ii - ret += 'test $? -ne 0 && exit\n' - if res['with_mpi'] : - ret += 'mpirun -machinefile $PBS_NODEFILE -n %d %s %s\n' % (res['numb_node'] * res['task_per_node'], cmd, jj) - else : - ret += '%s %s\n' % (cmd, jj) - ret += 'test $? -ne 0 && exit\n' - ret += 'cd %s\n' % self.remote_root - ret += 'test $? 
-ne 0 && exit\n' - ret += '\ntouch tag_finished\n' - - script_name = 'run.sub' - script = os.path.join(self.remote_root, script_name) - sftp = self.ssh.open_sftp() - with sftp.open(script, 'w') as fp : - fp.write(ret) - sftp.close() - - return script_name - - -# ssh_session = SSHSession('localhost.json') -# rjob = CloudMachineJob(ssh_session, '.') -# # can upload dirs and normal files -# rjob.upload(['job0', 'job1'], ['batch_exec.py', 'test']) -# rjob.submit(['job0', 'job1'], 'touch a; sleep 2') -# while rjob.check_status() == JobStatus.running : -# print('checked') -# time.sleep(2) -# print(rjob.check_status()) -# # can download dirs and normal files -# rjob.download(['job0', 'job1'], ['a']) -# # rjob.clean() diff --git a/dpgen/auto_test/lib/SlurmJob.py b/dpgen/auto_test/lib/SlurmJob.py deleted file mode 100644 index 3fc49088d..000000000 --- a/dpgen/auto_test/lib/SlurmJob.py +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env python3 - -import os -import sys -from enum import Enum -from subprocess import Popen, PIPE -from dpgen.auto_test.lib.BatchJob import BatchJob -from dpgen.auto_test.lib.BatchJob import JobStatus - -class SlurmJob (BatchJob) : - def submit_command (self): - return "sbatch" - def check_status (self): - job_id = self.get_job_id () - if len(job_id) == 0 : - return JobStatus.unsubmitted - ret = Popen (["squeue --job " + job_id], shell=True, stdout=PIPE, stderr=PIPE) - stdout, stderr = ret.communicate() - if (ret.returncode != 0) : - if str("Invalid job id specified") in str(stderr, encoding='ascii') : - if os.path.exists (self.job_finish_tag) : - return JobStatus.finished - else : - return JobStatus.terminated - else : - Logger.error ("status command " + "squeue" + " fails to execute") - Logger.error ("erro info: " + str(stderr, encoding='ascii')) - Logger.error ("return code: " + str(ret.returncode)) - sys.exit () - status_line = str(stdout, encoding='ascii').split ('\n')[-2] - status_word = status_line.split ()[4] -# status_word = status_line.split ()[-4] -# print ("status line: " + status_line) -# print ("status word: " + status_word) -# print (status_word) - if status_word in ["PD","CF","S"] : - return JobStatus.waiting - elif status_word in ["R","CG"] : - return JobStatus.running - elif status_word in ["C","E","K","BF","CA","CD","F","NF","PR","SE","ST","TO"] : - if os.path.exists (self.job_finish_tag) : - return JobStatus.finished - else : - return JobStatus.terminated - else : - return JobStatus.unknown - -if __name__ == "__main__" : - job = SlurmJob ("/home/han.wang/data/test/string/test", "cu01.sleep") - job.submit () - print ("submit done") - stat = job.check_status () - print ("check done") - print (stat) - diff --git a/dpgen/auto_test/lib/localhost.json b/dpgen/auto_test/lib/localhost.json deleted file mode 100644 index f2feaed5d..000000000 --- a/dpgen/auto_test/lib/localhost.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "hostname" : "localhost", - "port" : 22, - "username": "wanghan", - "work_path" : "/home/wanghan/tmp", - "_comment" : "that's all" -} diff --git a/dpgen/data/gen.py b/dpgen/data/gen.py index 952090793..d34ebda90 100644 --- a/dpgen/data/gen.py +++ b/dpgen/data/gen.py @@ -29,7 +29,7 @@ from pymatgen.io.vasp import Incar from dpgen.remote.decide_machine import convert_mdata from dpgen import ROOT_PATH -from dpgen.dispatcher.Dispatcher import Dispatcher, make_dispatcher, make_submission +from dpgen.dispatcher.Dispatcher import make_submission @@ -1060,19 +1060,9 @@ def run_vasp_relax(jdata, mdata): # relax_run_tasks.append(ii) run_tasks = 
[os.path.basename(ii) for ii in relax_run_tasks] - api_version = mdata.get('api_version', '0.9') + api_version = mdata.get('api_version', '1.0') if Version(api_version) < Version('1.0'): - warnings.warn(f"the dpdispatcher will be updated to new version." - f"And the interface may be changed. Please check the documents for more details") - dispatcher = make_dispatcher(mdata['fp_machine'], mdata['fp_resources'], work_dir, run_tasks, fp_group_size) - dispatcher.run_jobs(fp_resources, - [fp_command], - work_dir, - run_tasks, - fp_group_size, - forward_common_files, - forward_files, - backward_files) + raise RuntimeError("API version %s has been removed. Please upgrade to 1.0." % api_version) elif Version(api_version) >= Version('1.0'): submission = make_submission( @@ -1196,19 +1186,9 @@ def run_abacus_relax(jdata, mdata): # relax_run_tasks.append(ii) run_tasks = [os.path.basename(ii) for ii in relax_run_tasks] - api_version = mdata.get('api_version', '0.9') + api_version = mdata.get('api_version', '1.0') if Version(api_version) < Version('1.0'): - warnings.warn(f"the dpdispatcher will be updated to new version." - f"And the interface may be changed. Please check the documents for more details") - dispatcher = make_dispatcher(mdata['fp_machine'], mdata['fp_resources'], work_dir, run_tasks, fp_group_size) - dispatcher.run_jobs(fp_resources, - [fp_command], - work_dir, - run_tasks, - fp_group_size, - forward_common_files, - forward_files, - backward_files) + raise RuntimeError("API version %s has been removed. Please upgrade to 1.0." % api_version) elif Version(api_version) >= Version('1.0'): submission = make_submission( @@ -1263,19 +1243,9 @@ def run_vasp_md(jdata, mdata): run_tasks = [ii.replace(work_dir+"/", "") for ii in md_run_tasks] #dlog.info("md_work_dir", work_dir) #dlog.info("run_tasks",run_tasks) - api_version = mdata.get('api_version', '0.9') + api_version = mdata.get('api_version', '1.0') if Version(api_version) < Version('1.0'): - warnings.warn(f"the dpdispatcher will be updated to new version." - f"And the interface may be changed. Please check the documents for more details") - dispatcher = make_dispatcher(mdata['fp_machine'], mdata['fp_resources'], work_dir, run_tasks, fp_group_size) - dispatcher.run_jobs(fp_resources, - [fp_command], - work_dir, - run_tasks, - fp_group_size, - forward_common_files, - forward_files, - backward_files) + raise RuntimeError("API version %s has been removed. Please upgrade to 1.0." % api_version) elif Version(api_version) >= Version('1.0'): submission = make_submission( @@ -1343,19 +1313,9 @@ def run_abacus_md(jdata, mdata): run_tasks = [ii.replace(work_dir+"/", "") for ii in md_run_tasks] #dlog.info("md_work_dir", work_dir) #dlog.info("run_tasks",run_tasks) - api_version = mdata.get('api_version', '0.9') + api_version = mdata.get('api_version', '1.0') if Version(api_version) < Version('1.0'): - warnings.warn(f"the dpdispatcher will be updated to new version." - f"And the interface may be changed. Please check the documents for more details") - dispatcher = make_dispatcher(mdata['fp_machine'], mdata['fp_resources'], work_dir, run_tasks, fp_group_size) - dispatcher.run_jobs(fp_resources, - [fp_command], - work_dir, - run_tasks, - fp_group_size, - forward_common_files, - forward_files, - backward_files) + raise RuntimeError("API version %s has been removed. Please upgrade to 1.0." 
% api_version) elif Version(api_version) >= Version('1.0'): submission = make_submission( diff --git a/dpgen/dispatcher/ALI.py b/dpgen/dispatcher/ALI.py deleted file mode 100644 index 2a01ab378..000000000 --- a/dpgen/dispatcher/ALI.py +++ /dev/null @@ -1,514 +0,0 @@ -from aliyunsdkecs.request.v20140526.DescribeInstancesRequest import DescribeInstancesRequest -from aliyunsdkcore.client import AcsClient -import aliyunsdkcore.request -aliyunsdkcore.request.set_default_protocol_type("https") -from aliyunsdkcore.acs_exception.exceptions import ClientException -from aliyunsdkcore.acs_exception.exceptions import ServerException -from aliyunsdkecs.request.v20140526.RunInstancesRequest import RunInstancesRequest -from aliyunsdkecs.request.v20140526.DeleteInstancesRequest import DeleteInstancesRequest -from aliyunsdkecs.request.v20140526.DescribeAutoProvisioningGroupInstancesRequest import DescribeAutoProvisioningGroupInstancesRequest -from aliyunsdkecs.request.v20140526.CreateAutoProvisioningGroupRequest import CreateAutoProvisioningGroupRequest -from aliyunsdkecs.request.v20140526.DeleteAutoProvisioningGroupRequest import DeleteAutoProvisioningGroupRequest -from aliyunsdkecs.request.v20140526.ModifyAutoProvisioningGroupRequest import ModifyAutoProvisioningGroupRequest -from aliyunsdkecs.request.v20140526.DeleteLaunchTemplateRequest import DeleteLaunchTemplateRequest -from aliyunsdkvpc.request.v20160428.DescribeVpcsRequest import DescribeVpcsRequest -from aliyunsdkecs.request.v20140526.DescribeLaunchTemplatesRequest import DescribeLaunchTemplatesRequest -from aliyunsdkecs.request.v20140526.CreateLaunchTemplateRequest import CreateLaunchTemplateRequest -from aliyunsdkecs.request.v20140526.DescribeImagesRequest import DescribeImagesRequest -from aliyunsdkecs.request.v20140526.DescribeSecurityGroupsRequest import DescribeSecurityGroupsRequest -from aliyunsdkvpc.request.v20160428.DescribeVSwitchesRequest import DescribeVSwitchesRequest -import time, json, os, glob, string, random, sys -from dpgen.dispatcher.Dispatcher import Dispatcher, _split_tasks, JobRecord -from dpgen.dispatcher.SSHContext import SSHSession -from dpgen.dispatcher.DispatcherList import DispatcherList, Entity -from os.path import join -from dpgen import dlog -from hashlib import sha1 - -# cloud_resources = {"AccessKey_ID":"", -# "AccessKey_Secret":"", -# "regionID": "cn-shenzhen", -# "img_name": "kit", -# "machine_type_price": [ -# {"machine_type": "ecs.gn6v-c8g1.2xlarge", "price_limit": 20.00, "numb": 1, "priority": 0}, -# {"machine_type": "ecs.gn5-c4g1.xlarge", "price_limit": 20.00, "numb": 1, "priority": 1} -# ], -# "instance_name": "CH4_test_username", -# "pay_strategy": "spot" -# "apg_id": apg_id, -# "template_id": template_id, -# "vsw_id": vsw_id, -# "region_id": region_id, -# "client": client} - -def manual_create(stage, num): - '''running this function in your project root path, which contains machine-ali.json. 
- please ensure your machine name is machine-ali.json - This will create a subdir named manual, which includes apg_id.json''' - root_path = os.getcwd() - fp = open("machine-ali.json") - data = json.load(fp) - if not os.path.exists("manual"): - os.mkdir("manual") - os.chdir("manual") - mdata_machine = data[stage][0]["machine"] - mdata_resources = data[stage][0]["resources"] - cloud_resources = mdata_machine["cloud_resources"] - ali = ALI(mdata_machine, mdata_resources, "work_path", [1], 1, cloud_resources) - img_id = ali.get_image_id(ali.cloud_resources["img_name"]) - sg_id, vpc_id = ali.get_sg_vpc_id() - ali.cloud_resources["template_id"] = ali.create_template(img_id, sg_id, vpc_id) - ali.cloud_resources["vsw_id"] = ali.get_vsw_id(vpc_id) - ali.nchunks_limit = num - ali.cloud_resources["apg_id"] = ali.create_apg() - time.sleep(90) - instance_list = ali.describe_apg_instances() - ip_list = ali.get_ip(instance_list) - print(instance_list) - print(ip_list) - -def manual_delete(stage): - '''running this function in your project root path, which contains machine-ali.json. ''' - if os.path.exists("manual"): - fp = open("machine-ali.json") - data = json.load(fp) - mdata_machine = data[stage][0]["machine"] - mdata_resources = data[stage][0]["resources"] - cloud_resources = mdata_machine["cloud_resources"] - ali = ALI(mdata_machine, mdata_resources, "work_path", [1], 1, cloud_resources) - os.chdir("manual") - fp = open("apg_id.json") - data = json.load(fp) - ali.cloud_resources["apg_id"] = data["apg_id"] - ali.delete_apg() - os.remove("apg_id.json") - print("delete successfully!") - -def delete_apg(stage): - fp = open("machine-ali.json") - data = json.load(fp) - mdata_machine = data[stage][0]["machine"] - mdata_resources = data[stage][0]["resources"] - cloud_resources = mdata_machine["cloud_resources"] - ali = ALI(mdata_machine, mdata_resources, "work_path", [1], 1, cloud_resources) - fp = open("apg_id.json") - data = json.load(fp) - ali.cloud_resources["apg_id"] = data["apg_id"] - ali.delete_apg() - os.remove("apg_id.json") - print("delete successfully!") - - -class ALI(DispatcherList): - def __init__(self, mdata_machine, mdata_resources, work_path, run_tasks, group_size, cloud_resources=None): - super().__init__(mdata_machine, mdata_resources, work_path, run_tasks, group_size, cloud_resources) - self.client = AcsClient(cloud_resources["AccessKey_ID"], cloud_resources["AccessKey_Secret"], cloud_resources["regionID"]) - - def init(self): - self.prepare() - for ii in range(self.nchunks): - self.create(ii) - - def create(self, ii): - if self.dispatcher_list[ii]["dispatcher_status"] == "unallocated" and len(self.ip_pool) > 0: - self.dispatcher_list[ii]["entity"] = Entity(self.ip_pool.pop(0), self.server_pool.pop(0)) - self.make_dispatcher(ii) - - # Derivate - def delete(self, ii): - '''delete one machine''' - request = DeleteInstancesRequest() - request.set_accept_format('json') - request.set_InstanceIds([self.dispatcher_list[ii]["entity"].instance_id]) - request.set_Force(True) - count = 0 - flag = 0 - while count < 10: - try: - response = self.client.do_action_with_exception(request) - flag = 1 - break - except ServerException as e: - time.sleep(10) - count += 1 - - if flag: - status_list = [item["dispatcher_status"] for item in self.dispatcher_list] - running_num = status_list.count("running") - running_num += status_list.count("unsubmitted") - self.change_apg_capasity(running_num) - else: - dlog.info("delete failed, exit") - sys.exit() - - def update(self): - self.server_pool = 
self.get_server_pool() - self.ip_pool = self.get_ip(self.server_pool) - - # Derivate - def catch_dispatcher_exception(self, ii): - '''everything is okay: return 0 - ssh not active : return 1 - machine callback : return 2''' - if self.check_spot_callback(self.dispatcher_list[ii]["entity"].instance_id): - dlog.info("machine %s callback, ip: %s" % (self.dispatcher_list[ii]["entity"].instance_id, self.dispatcher_list[ii]["entity"].ip)) - return 2 - elif not self.dispatcher_list[ii]["dispatcher"].session._check_alive(): - try: - self.dispatcher_list[ii]["dispatcher"].session.ensure_alive() - return 0 - except RuntimeError: - return 1 - else: return 0 - - def get_server_pool(self): - running_server = self.describe_apg_instances() - allocated_server = [] - for ii in range(self.nchunks): - if self.dispatcher_list[ii]["dispatcher_status"] == "running" or self.dispatcher_list[ii]["dispatcher_status"] == "unsubmitted": - allocated_server.append(self.dispatcher_list[ii]["entity"].instance_id) - return list(set(running_server) - set(allocated_server)) - - def clean(self): - self.delete_apg() - self.delete_template() - os.remove("apg_id.json") - - def prepare(self): - restart = False - if os.path.exists('apg_id.json'): - with open('apg_id.json') as fp: - apg = json.load(fp) - self.cloud_resources["apg_id"] = apg["apg_id"] - task_chunks_str = ['+'.join(ii) for ii in self.task_chunks] - task_hashes = [sha1(ii.encode('utf-8')).hexdigest() for ii in task_chunks_str] - for ii in range(self.nchunks): - fn = 'jr.%.06d.json' % ii - if os.path.exists(os.path.join(os.path.abspath(self.work_path), fn)): - cur_hash = task_hashes[ii] - job_record = JobRecord(self.work_path, self.task_chunks[ii], fn) - if not job_record.check_finished(cur_hash): - if not self.check_spot_callback(job_record.record[cur_hash]['context']['instance_id']): - self.dispatcher_list[ii]["entity"] = Entity(job_record.record[cur_hash]['context']['ip'], job_record.record[cur_hash]['context']['instance_id'], job_record) - self.make_dispatcher(ii) - self.dispatcher_list[ii]["dispatcher_status"] = "unsubmitted" - else: - os.remove(os.path.join(os.path.abspath(self.work_path), fn)) - else: - self.dispatcher_list[ii]["dispatcher_status"] = "finished" - self.server_pool = self.get_server_pool() - self.ip_pool = self.get_ip(self.server_pool) - restart = True - img_id = self.get_image_id(self.cloud_resources["img_name"]) - sg_id, vpc_id = self.get_sg_vpc_id() - self.cloud_resources["template_id"] = self.create_template(img_id, sg_id, vpc_id) - self.cloud_resources["vsw_id"] = self.get_vsw_id(vpc_id) - if not restart: - dlog.info("begin to create apg") - self.cloud_resources["apg_id"] = self.create_apg() - time.sleep(120) - self.server_pool = self.get_server_pool() - self.ip_pool = self.get_ip(self.server_pool) - else: dlog.info("restart dpgen") - - def delete_apg(self): - request = DeleteAutoProvisioningGroupRequest() - request.set_accept_format('json') - request.set_AutoProvisioningGroupId(self.cloud_resources["apg_id"]) - request.set_TerminateInstances(True) - count = 0 - flag = 0 - while count < 10: - try: - response = self.client.do_action_with_exception(request) - flag = 1 - break - except ServerException as e: - time.sleep(10) - count += 1 - if not flag: - dlog.info("delete apg failed, exit") - sys.exit() - - - def create_apg(self): - request = CreateAutoProvisioningGroupRequest() - request.set_accept_format('json') - request.set_TotalTargetCapacity(str(self.nchunks_limit)) - request.set_LaunchTemplateId(self.cloud_resources["template_id"]) - 
request.set_AutoProvisioningGroupName(self.cloud_resources["instance_name"] + ''.join(random.choice(string.ascii_uppercase) for _ in range(20))) - request.set_AutoProvisioningGroupType("maintain") - request.set_SpotAllocationStrategy("lowest-price") - request.set_SpotInstanceInterruptionBehavior("terminate") - request.set_SpotInstancePoolsToUseCount(1) - request.set_ExcessCapacityTerminationPolicy("termination") - request.set_TerminateInstances(True) - request.set_PayAsYouGoTargetCapacity("0") - request.set_SpotTargetCapacity(str(self.nchunks_limit)) - config = self.generate_config() - request.set_LaunchTemplateConfigs(config) - - try: - response = self.client.do_action_with_exception(request) - response = json.loads(response) - with open('apg_id.json', 'w') as fp: - json.dump({'apg_id': response["AutoProvisioningGroupId"]}, fp, indent=4) - return response["AutoProvisioningGroupId"] - except ServerException as e: - dlog.info("create apg failed, err msg: %s" % e) - sys.exit() - except ClientException as e: - dlog.info("create apg failed, err msg: %s" % e) - sys.exit() - - def describe_apg_instances(self): - request = DescribeAutoProvisioningGroupInstancesRequest() - request.set_accept_format('json') - request.set_AutoProvisioningGroupId(self.cloud_resources["apg_id"]) - request.set_PageSize(100) - iteration = self.nchunks // 100 - instance_list = [] - for i in range(iteration + 1): - request.set_PageNumber(i+1) - count = 0 - flag = 0 - err_msg = 0 - while count < 10: - try: - response = self.client.do_action_with_exception(request) - response = json.loads(response) - for ins in response["Instances"]["Instance"]: - instance_list.append(ins["InstanceId"]) - flag = 1 - break - except ServerException as e: - # dlog.info(e) - err_msg = e - count += 1 - except ClientException as e: - # dlog.info(e) - err_msg = e - count += 1 - if not flag: - dlog.info("describe_apg_instances failed, err msg: %s" %err_msg) - sys.exit() - return instance_list - - def generate_config(self): - machine_config = self.cloud_resources["machine_type_price"] - config = [] - for conf in machine_config: - for vsw in self.cloud_resources["vsw_id"]: - tmp = { - "InstanceType": conf["machine_type"], - "MaxPrice": str(conf["price_limit"] * conf["numb"]), - "VSwitchId": vsw, - "WeightedCapacity": "1", - "Priority": str(conf["priority"]) - } - config.append(tmp) - return config - - def create_template(self, image_id, sg_id, vpc_id): - request = CreateLaunchTemplateRequest() - request.set_accept_format('json') - request.set_LaunchTemplateName(''.join(random.choice(string.ascii_uppercase) for _ in range(20))) - request.set_ImageId(image_id) - request.set_ImageOwnerAlias("self") - request.set_PasswordInherit(True) - if "address" in self.cloud_resources and self.cloud_resources['address'] == "public": - request.set_InternetMaxBandwidthIn(100) - request.set_InternetMaxBandwidthOut(100) - request.set_InstanceType("ecs.c6.large") - request.set_InstanceName(self.cloud_resources["instance_name"]) - request.set_SecurityGroupId(sg_id) - request.set_VpcId(vpc_id) - request.set_SystemDiskCategory("cloud_efficiency") - request.set_SystemDiskSize(70) - request.set_IoOptimized("optimized") - request.set_InstanceChargeType("PostPaid") - request.set_NetworkType("vpc") - request.set_SpotStrategy("SpotWithPriceLimit") - request.set_SpotPriceLimit(100) - try: - response = self.client.do_action_with_exception(request) - response = json.loads(response) - return response["LaunchTemplateId"] - except ServerException as e: - dlog.info(e) - sys.exit() - 
except ClientException as e: - dlog.info(e) - sys.exit() - - def delete_template(self): - request = DeleteLaunchTemplateRequest() - request.set_accept_format('json') - count = 0 - flag = 0 - while count < 10: - try: - request.set_LaunchTemplateId(self.cloud_resources["template_id"]) - response = self.client.do_action_with_exception(request) - flag = 1 - break - except Exception: - count += 1 - # count = 10 and still failed, continue - - def get_image_id(self, img_name): - request = DescribeImagesRequest() - request.set_accept_format('json') - request.set_ImageOwnerAlias("self") - request.set_PageSize(20) - response = self.client.do_action_with_exception(request) - response = json.loads(response) - totalcount = response["TotalCount"] - - iteration = totalcount // 20 - if iteration * 20 < totalcount: - iteration += 1 - - for ii in range(1, iteration+1): - count = 0 - flag = 0 - request.set_PageNumber(ii) - while count < 10: - try: - response = self.client.do_action_with_exception(request) - response = json.loads(response) - for img in response["Images"]["Image"]: - if img["ImageName"] == img_name: - return img["ImageId"] - flag = 1 - break - except Exception: - count += 1 - time.sleep(10) - if not flag: - dlog.info("get image failed, exit") - sys.exit() - - def get_sg_vpc_id(self): - request = DescribeSecurityGroupsRequest() - request.set_accept_format('json') - response = self.client.do_action_with_exception(request) - response = json.loads(response) - for sg in response["SecurityGroups"]["SecurityGroup"]: - if sg["SecurityGroupName"] == "sg": - return sg["SecurityGroupId"], sg["VpcId"] - - def get_vsw_id(self, vpc_id): - request = DescribeVpcsRequest() - request.set_accept_format('json') - request.set_VpcId(vpc_id) - response = self.client.do_action_with_exception(request) - response = json.loads(response) - for vpc in response["Vpcs"]["Vpc"]: - if vpc["VpcId"] == vpc_id: - vswitchids = vpc["VSwitchIds"]["VSwitchId"] - break - vswitchid_option = [] - if "zone" in self.cloud_resources and self.cloud_resources['zone']: - for zone in self.cloud_resources['zone']: - for vswitchid in vswitchids: - request = DescribeVSwitchesRequest() - request.set_accept_format('json') - request.set_VSwitchId(vswitchid) - zoneid = self.cloud_resources['regionID']+"-"+zone - request.set_ZoneId(zoneid) - response = self.client.do_action_with_exception(request) - response = json.loads(response) - if(response["TotalCount"] == 1): - vswitchid_option.append(vswitchid) - continue - if(vswitchid_option): - return vswitchid_option - else: - return vswitchids - - def change_apg_capasity(self, capasity): - request = ModifyAutoProvisioningGroupRequest() - request.set_accept_format('json') - request.set_AutoProvisioningGroupId(self.cloud_resources["apg_id"]) - request.set_TotalTargetCapacity(str(capasity)) - request.set_SpotTargetCapacity(str(capasity)) - request.set_PayAsYouGoTargetCapacity("0") - count = 0 - flag = 0 - while count < 10: - try: - response = self.client.do_action_with_exception(request) - flag = 1 - break - except Exception: - count += 1 - time.sleep(10) - if not flag: - dlog.info("change_apg_capasity failed, exit") - sys.exit() - - def check_spot_callback(self, instance_id): - request = DescribeInstancesRequest() - request.set_accept_format('json') - request.set_InstanceIds([instance_id]) - status = False - count = 0 - while count < 10: - try: - response = self.client.do_action_with_exception(request) - response = json.loads(response) - if len(response["Instances"]["Instance"]) == 1 and "Recycling" in 
response["Instances"]["Instance"][0]["OperationLocks"]["LockReason"]: - status = True - if instance_id not in self.describe_apg_instances(): - status = True - break - except ServerException as e: - # dlog.info(e) - count += 1 - time.sleep(10) - except ClientException as e: - # dlog.info(e) - count += 1 - time.sleep(10) - return status - - def get_ip(self, instance_list): - request = DescribeInstancesRequest() - request.set_accept_format('json') - ip_list = [] - if len(instance_list) == 0: return ip_list - try: - if len(instance_list) <= 10: - for i in range(len(instance_list)): - request.set_InstanceIds([instance_list[i]]) - response = self.client.do_action_with_exception(request) - response = json.loads(response) - if "address" in self.cloud_resources and self.cloud_resources['address'] == "public": - ip_list.append(response["Instances"]["Instance"][0]["PublicIpAddress"]["IpAddress"][0]) - else: - ip_list.append(response["Instances"]["Instance"][0]["VpcAttributes"]["PrivateIpAddress"]['IpAddress'][0]) - # ip_list.append(response["Instances"]["Instance"][0]["PublicIpAddress"]["IpAddress"][0]) - else: - iteration = len(instance_list) // 10 - for i in range(iteration): - for j in range(10): - request.set_InstanceIds([instance_list[i*10+j]]) - response = self.client.do_action_with_exception(request) - response = json.loads(response) - if "address" in self.cloud_resources and self.cloud_resources['address'] == "public": - ip_list.append(response["Instances"]["Instance"][0]["PublicIpAddress"]["IpAddress"][0]) - else: - ip_list.append(response["Instances"]["Instance"][0]["VpcAttributes"]["PrivateIpAddress"]['IpAddress'][0]) - if len(instance_list) - iteration * 10 != 0: - for j in range(len(instance_list) - iteration * 10): - request.set_InstanceIds([instance_list[iteration*10+j]]) - response = self.client.do_action_with_exception(request) - response = json.loads(response) - if "address" in self.cloud_resources and self.cloud_resources['address'] == "public": - ip_list.append(response["Instances"]["Instance"][0]["PublicIpAddress"]["IpAddress"][0]) - else: - ip_list.append(response["Instances"]["Instance"][0]["VpcAttributes"]["PrivateIpAddress"]['IpAddress'][0]) - return ip_list - except Exception: return [] - diff --git a/dpgen/dispatcher/AWS.py b/dpgen/dispatcher/AWS.py deleted file mode 100644 index 84f2b7cbf..000000000 --- a/dpgen/dispatcher/AWS.py +++ /dev/null @@ -1,142 +0,0 @@ -import os,getpass,time -from datetime import datetime -from itertools import zip_longest -from dpgen.dispatcher.Batch import Batch -from dpgen.dispatcher.JobStatus import JobStatus -from dpgen import dlog - - -class AWS(Batch): - _query_time_interval = 30 - _job_id_map_status = {} - _jobQueue = "" - _query_next_allow_time = datetime.now().timestamp() - - def __init__(self, context, uuid_names=True): - import boto3 - self.batch_client = boto3.client('batch') - super().__init__(context, uuid_names) - - @staticmethod - def map_aws_status_to_dpgen_status(aws_status): - map_dict = {'SUBMITTED': JobStatus.waiting, - 'PENDING': JobStatus.waiting, - 'RUNNABLE': JobStatus.waiting, - 'STARTING': JobStatus.waiting, - 'RUNNING': JobStatus.running, - 'SUCCEEDED': JobStatus.finished, - 'FAILED': JobStatus.terminated, - 'UNKNOWN': JobStatus.unknown} - return map_dict.get(aws_status, JobStatus.unknown) - - @classmethod - def AWS_check_status(cls, job_id=""): - """ - to aviod query jobStatus too often, set a time interval - query_dict example: - {job_id: JobStatus} - - {'40fb24b2-d0ca-4443-8e3a-c0906ea03622': , - 
'41bda50c-0a23-4372-806c-87d16a680d85': } - - """ - query_dict ={} - if datetime.now().timestamp() > cls._query_next_allow_time: - cls._query_next_allow_time=datetime.now().timestamp()+cls._query_time_interval - for status in ['SUBMITTED', 'PENDING', 'RUNNABLE', 'STARTING', 'RUNNING','SUCCEEDED', 'FAILED']: - nextToken = '' - while nextToken is not None: - status_response = self.batch_client.list_jobs(jobQueue=cls._jobQueue, jobStatus=status, maxResults=100, nextToken=nextToken) - status_list=status_response.get('jobSummaryList') - nextToken = status_response.get('nextToken', None) - for job_dict in status_list: - cls._job_id_map_status.update({job_dict['jobId']: cls.map_aws_status_to_dpgen_status(job_dict['status'])}) - dlog.debug('20000:_map: %s' %(cls._job_id_map_status)) - dlog.debug('62000:job_id:%s, _query: %s, _map: %s' %(job_id, query_dict, cls._job_id_map_status)) - if job_id: - return cls._job_id_map_status.get(job_id) - - return cls._job_id_map_status - - @property - def job_id(self): - try: - self._job_id - except AttributeError: - if self.context.check_file_exists(self.job_id_name): - self._job_id = self.context.read_file(self.job_id_name) - response_list = self.batch_client.describe_jobs(jobs=[self._job_id]).get('jobs') - try: - response = response_list[0] - jobQueue = response['jobQueue'] - except IndexError: - pass - else: - self.job_id = (response, jobQueue) - return self._job_id - dlog.debug("50000, self._job_id:%s,_Queue:%s,_map:%s,"%(self._job_id, self.__class__._jobQueue, self.__class__._job_id_map_status )) - return "" - return self._job_id - - @job_id.setter - def job_id(self, values): - response, jobQueue = values - self._job_id = response['jobId'] - self._job_name = response['jobName'] - self.__class__._jobQueue = jobQueue - self.__class__._job_id_map_status[self._job_id] = self.map_aws_status_to_dpgen_status(response.get('status', 'SUBMITTED')) - self.context.write_file(self.job_id_name, self._job_id) - dlog.debug("15000, _job_id:%s, _job_name:%s, _map:%s, _Queue:%s" % (self._job_id, self._job_name, self.__class__._job_id_map_status, self.__class__._jobQueue)) - - def check_status(self): - return self.__class__.AWS_check_status(job_id=self.job_id) - - def sub_script(self, job_dirs, cmd, args, res, outlog, errlog): - if args is None: - args=[] - multi_command = "" - for job_dir in job_dirs: - for idx,t in enumerate(zip_longest(cmd, args, fillvalue='')): - c_str = f"cd {self.context.remote_root}/{job_dir} && ( test -f tag_{idx}_finished || ( ({t[0]} {t[1]} && touch tag_{idx}_finished 2>>{errlog} || exit 52 ) | tee -a {outlog}) ) || exit 51;" - multi_command += c_str - multi_command +="exit 0;" - dlog.debug("10000, %s" % multi_command) - return multi_command - - def default_resources(self, res): - if res == None: - res = {} - else: - # res.setdefault(jobDefinition) - res.setdefault('cpu_num', 32) - res.setdefault('memory_size', 120000) - res.setdefault('jobQueue', 'deepmd_m5_v1_7') - return res - - def do_submit(self, - job_dirs, - cmd, - args = None, - res = None, - outlog = 'log', - errlog = 'err'): - - res = self.default_resources(res) - dlog.debug("2000, params=(%s, %s, %s, %s, %s, %s, )" % (job_dirs, cmd, args, res, outlog, errlog )) - dlog.debug('2200, self.context.remote_root: %s , self.context.local_root: %s' % (self.context.remote_root, self.context.local_root)) - # concreate_command = - script_str = self.sub_script(job_dirs, cmd, args=args, res=res, outlog=outlog, errlog=errlog) - dlog.debug('2300, script_str: %s, self.sub_script_name: %s' % (script_str, 
self.sub_script_name)) - """ - jobName example: - home-ec2-user-Ag_init-run_gen-iter_000000-01_model_devi-task_000_000048 - """ - jobName = os.path.join(self.context.remote_root,job_dirs.pop())[1:].replace('/','-').replace('.','_') - jobName += ("_" + str(self.context.job_uuid)) - response = self.batch_client.submit_job(jobName=jobName, - jobQueue=res['jobQueue'], - jobDefinition=res['jobDefinition'], - parameters={'task_command':script_str}, - containerOverrides={'vcpus':res['cpu_num'], 'memory':res['memory_size']}) - dlog.debug('4000, response:%s' % response) - self.job_id = (response, res['jobQueue']) diff --git a/dpgen/dispatcher/Batch.py b/dpgen/dispatcher/Batch.py deleted file mode 100644 index 1240be9f7..000000000 --- a/dpgen/dispatcher/Batch.py +++ /dev/null @@ -1,166 +0,0 @@ -import os,sys,time - -from dpgen.dispatcher.JobStatus import JobStatus -from dpgen import dlog - - -class Batch(object) : - def __init__ (self, - context, - uuid_names = True) : - self.context = context - self.uuid_names = uuid_names - if uuid_names: - self.upload_tag_name = '%s_tag_upload' % self.context.job_uuid - self.finish_tag_name = '%s_tag_finished' % self.context.job_uuid - self.sub_script_name = '%s.sub' % self.context.job_uuid - self.job_id_name = '%s_job_id' % self.context.job_uuid - else: - self.upload_tag_name = 'tag_upload' - self.finish_tag_name = 'tag_finished' - self.sub_script_name = 'run.sub' - self.job_id_name = 'job_id' - - def check_status(self) : - raise RuntimeError('abstract method check_status should be implemented by derived class') - - def default_resources(self, res) : - raise RuntimeError('abstract method sub_script_head should be implemented by derived class') - - def sub_script_head(self, res) : - raise RuntimeError('abstract method sub_script_head should be implemented by derived class') - - def sub_script_cmd(self, cmd, res): - raise RuntimeError('abstract method sub_script_cmd should be implemented by derived class') - - def do_submit(self, - job_dirs, - cmd, - args = None, - res = None, - outlog = 'log', - errlog = 'err'): - ''' - submit a single job, assuming that no job is running there. - ''' - raise RuntimeError('abstract method check_status should be implemented by derived class') - - def sub_script(self, - job_dirs, - cmd, - args = None, - res = None, - outlog = 'log', - errlog = 'err') : - """ - make submit script - - job_dirs(list): directories of jobs. size: n_job - cmd(list): commands to be executed. size: n_cmd - args(list of list): args of commands. 
size of n_cmd x n_job - can be None - res(dict): resources available - outlog(str): file name for output - errlog(str): file name for error - """ - res = self.default_resources(res) - ret = self.sub_script_head(res) - if not isinstance(cmd, list): - cmd = [cmd] - if args == None : - args = [] - for ii in cmd: - _args = [] - for jj in job_dirs: - _args.append('') - args.append(_args) - # loop over commands - self.cmd_cnt = 0 - try: - self.manual_cuda_devices = res['manual_cuda_devices'] - except KeyError: - self.manual_cuda_devices = 0 - try: - self.manual_cuda_multiplicity = res['manual_cuda_multiplicity'] - except KeyError: - self.manual_cuda_multiplicity = 1 - for ii in range(len(cmd)): - # for one command - ret += self._sub_script_inner(job_dirs, - cmd[ii], - args[ii], - ii, - res, - outlog=outlog, - errlog=errlog) - ret += '\ntouch %s\n' % self.finish_tag_name - return ret - - def submit(self, - job_dirs, - cmd, - args = None, - res = None, - restart = False, - outlog = 'log', - errlog = 'err'): - if restart: - dlog.debug('restart task') - status = self.check_status() - if status in [ JobStatus.unsubmitted, JobStatus.unknown, JobStatus.terminated ]: - dlog.debug('task restart point !!!') - self.do_submit(job_dirs, cmd, args, res, outlog=outlog, errlog=errlog) - elif status==JobStatus.waiting: - dlog.debug('task is waiting') - elif status==JobStatus.running: - dlog.debug('task is running') - elif status==JobStatus.finished: - dlog.debug('task is finished') - else: - raise RuntimeError('unknow job status, must be wrong') - else: - dlog.debug('new task') - self.do_submit(job_dirs, cmd, args, res, outlog=outlog, errlog=errlog) - if res is None: - sleep = 0 - else: - sleep = res.get('submit_wait_time', 0) - time.sleep(sleep) # For preventing the crash of the tasks while submitting - - def check_finish_tag(self) : - return self.context.check_file_exists(self.finish_tag_name) - - def _sub_script_inner(self, - job_dirs, - cmd, - args, - idx, - res, - outlog = 'log', - errlog = 'err') : - ret = "" - allow_failure = res.get('allow_failure', False) - for ii,jj in zip(job_dirs, args) : - ret += 'cd %s\n' % ii - ret += 'test $? -ne 0 && exit 1\n\n' - if self.manual_cuda_devices > 0: - # set CUDA_VISIBLE_DEVICES - ret += 'export CUDA_VISIBLE_DEVICES=%d\n' % (self.cmd_cnt % self.manual_cuda_devices) - ret += '{ if [ ! -f tag_%d_finished ] ;then\n' % idx - ret += ' %s 1>> %s 2>> %s \n' % (self.sub_script_cmd(cmd, jj, res), outlog, errlog) - if res['allow_failure'] is False: - ret += ' if test $? -ne 0; then exit 1; else touch tag_%d_finished; fi \n' % idx - else : - ret += ' if test $? -ne 0; then touch tag_failure_%d; fi \n' % idx - ret += ' touch tag_%d_finished \n' % idx - ret += 'fi }' - if self.manual_cuda_devices > 0: - ret += '&' - self.cmd_cnt += 1 - ret += '\n\n' - ret += 'cd %s\n' % self.context.remote_root - ret += 'test $? 
-ne 0 && exit 1\n' - if self.manual_cuda_devices > 0 and self.cmd_cnt % (self.manual_cuda_devices * self.manual_cuda_multiplicity) == 0: - ret += '\nwait\n\n' - ret += '\nwait\n\n' - return ret diff --git a/dpgen/dispatcher/Dispatcher.py b/dpgen/dispatcher/Dispatcher.py index cb2db0986..2afb8926e 100644 --- a/dpgen/dispatcher/Dispatcher.py +++ b/dpgen/dispatcher/Dispatcher.py @@ -1,349 +1,10 @@ from distutils.version import LooseVersion -import os,sys,time,random,json,glob -import warnings +import os from typing import List from dpdispatcher import Task, Submission, Resources, Machine -from dpgen.dispatcher.LocalContext import LocalSession -from dpgen.dispatcher.LocalContext import LocalContext -from dpgen.dispatcher.LazyLocalContext import LazyLocalContext -from dpgen.dispatcher.SSHContext import SSHSession -from dpgen.dispatcher.SSHContext import SSHContext -from dpgen.dispatcher.Slurm import Slurm -from dpgen.dispatcher.LSF import LSF -from dpgen.dispatcher.PBS import PBS -from dpgen.dispatcher.Shell import Shell -from dpgen.dispatcher.AWS import AWS -from dpgen.dispatcher.JobStatus import JobStatus -from dpgen import dlog -from hashlib import sha1 # import dargs from dargs.dargs import Argument -def _split_tasks(tasks, - group_size): - ntasks = len(tasks) - ngroups = ntasks // group_size - if ngroups * group_size < ntasks: - ngroups += 1 - chunks = [[]] * ngroups - tot = 0 - for ii in range(ngroups) : - chunks[ii] = (tasks[ii::ngroups]) - tot += len(chunks[ii]) - assert(tot == len(tasks)) - return chunks - - -class Dispatcher(object): - def __init__ (self, - remote_profile, - context_type = 'local', - batch_type = 'slurm', - job_record = 'jr.json'): - self.remote_profile = remote_profile - - if context_type == 'local': - self.session = LocalSession(remote_profile) - self.context = LocalContext - self.uuid_names = True - elif context_type == 'lazy-local': - self.session = None - self.context = LazyLocalContext - self.uuid_names = True - elif context_type == 'ssh': - self.session = SSHSession(remote_profile) - self.context = SSHContext - self.uuid_names = True - else : - raise RuntimeError('unknown context') - if batch_type == 'slurm': - self.batch = Slurm - elif batch_type == 'lsf': - self.batch = LSF - elif batch_type == 'pbs': - self.batch = PBS - elif batch_type == 'shell': - self.batch = Shell - elif batch_type == 'aws': - self.batch = AWS - else : - raise RuntimeError('unknown batch ' + batch_type) - self.jrname = job_record - - def run_jobs(self, - resources, - command, - work_path, - tasks, - group_size, - forward_common_files, - forward_task_files, - backward_task_files, - forward_task_deference = True, - mark_failure = False, - outlog = 'log', - errlog = 'err') : - job_handler = self.submit_jobs(resources, - command, - work_path, - tasks, - group_size, - forward_common_files, - forward_task_files, - backward_task_files, - forward_task_deference, - outlog, - errlog) - while not self.all_finished(job_handler, mark_failure) : - time.sleep(60) - # delete path map file when job finish - # _pmap.delete() - - - def submit_jobs(self, - resources, - command, - work_path, - tasks, - group_size, - forward_common_files, - forward_task_files, - backward_task_files, - forward_task_deference = True, - outlog = 'log', - errlog = 'err') : - self.backward_task_files = backward_task_files - # task_chunks = [ - # [os.path.basename(j) for j in tasks[i:i + group_size]] \ - # for i in range(0, len(tasks), group_size) - # ] - task_chunks = _split_tasks(tasks, group_size) - task_chunks_str = 
['+'.join(ii) for ii in task_chunks] - task_hashes = [sha1(ii.encode('utf-8')).hexdigest() for ii in task_chunks_str] - job_record = JobRecord(work_path, task_chunks, fname = self.jrname) - job_record.dump() - nchunks = len(task_chunks) - - job_list = [] - for ii in range(nchunks) : - cur_chunk = task_chunks[ii] - cur_hash = task_hashes[ii] - if not job_record.check_finished(cur_hash): - # chunk is not finished - # check if chunk is submitted - submitted = job_record.check_submitted(cur_hash) - if not submitted: - job_uuid = None - else : - job_uuid = job_record.get_uuid(cur_hash) - dlog.debug("load uuid %s for chunk %s" % (job_uuid, cur_hash)) - # communication context, bach system - context = self.context(work_path, self.session, job_uuid) - batch = self.batch(context, uuid_names = self.uuid_names) - rjob = {'context':context, 'batch':batch} - # upload files - if not rjob['context'].check_file_exists(rjob['batch'].upload_tag_name): - rjob['context'].upload('.', - forward_common_files) - rjob['context'].upload(cur_chunk, - forward_task_files, - dereference = forward_task_deference) - - rjob['context'].write_file(rjob['batch'].upload_tag_name, '') - dlog.debug('uploaded files for %s' % task_chunks_str[ii]) - # submit new or recover old submission - if not submitted: - rjob['batch'].submit(cur_chunk, command, res = resources, outlog=outlog, errlog=errlog) - job_uuid = rjob['context'].job_uuid - dlog.debug('assigned uuid %s for %s ' % (job_uuid, task_chunks_str[ii])) - dlog.info('new submission of %s for chunk %s' % (job_uuid, cur_hash)) - else: - rjob['batch'].submit(cur_chunk, command, res = resources, outlog=outlog, errlog=errlog, restart = True) - dlog.info('restart from old submission %s for chunk %s' % (job_uuid, cur_hash)) - # record job and its remote context - job_list.append(rjob) - ip = None - instance_id = None - if 'cloud_resources' in self.remote_profile: - ip = self.remote_profile['hostname'] - instance_id = self.remote_profile['instance_id'] - job_record.record_remote_context(cur_hash, - context.local_root, - context.remote_root, - job_uuid, - ip, - instance_id) - job_record.dump() - else : - # finished job, append a None to list - job_list.append(None) - assert(len(job_list) == nchunks) - job_handler = { - 'task_chunks': task_chunks, - 'job_list': job_list, - 'job_record': job_record, - 'command': command, - 'resources': resources, - 'outlog': outlog, - 'errlog': errlog, - 'backward_task_files': backward_task_files - } - return job_handler - - - def all_finished(self, - job_handler, - mark_failure, - clean=True): - task_chunks = job_handler['task_chunks'] - task_chunks_str = ['+'.join(ii) for ii in task_chunks] - task_hashes = [sha1(ii.encode('utf-8')).hexdigest() for ii in task_chunks_str] - job_list = job_handler['job_list'] - job_record = job_handler['job_record'] - command = job_handler['command'] - tag_failure_list = ['tag_failure_%d' % ii for ii in range(len(command))] - resources = job_handler['resources'] - outlog = job_handler['outlog'] - errlog = job_handler['errlog'] - backward_task_files = job_handler['backward_task_files'] - dlog.debug('checking jobs') - nchunks = len(task_chunks) - for idx in range(nchunks) : - cur_hash = task_hashes[idx] - rjob = job_list[idx] - if not job_record.check_finished(cur_hash) : - # chunk not finished according to record - status = rjob['batch'].check_status() - job_uuid = rjob['context'].job_uuid - dlog.debug('checked job %s' % job_uuid) - if status == JobStatus.terminated : - job_record.increase_nfail(cur_hash) - if 
job_record.check_nfail(cur_hash) > 3: - raise RuntimeError('Job %s failed for more than 3 times' % job_uuid) - dlog.info('job %s terminated, submit again'% job_uuid) - dlog.debug('try %s times for %s'% (job_record.check_nfail(cur_hash), job_uuid)) - rjob['batch'].submit(task_chunks[idx], command, res = resources, outlog=outlog, errlog=errlog,restart=True) - elif status == JobStatus.finished : - dlog.info('job %s finished' % job_uuid) - if mark_failure: - rjob['context'].download(task_chunks[idx], tag_failure_list, check_exists = True, mark_failure = False) - rjob['context'].download(task_chunks[idx], backward_task_files, check_exists = True) - else: - rjob['context'].download(task_chunks[idx], backward_task_files) - if clean: - rjob['context'].clean() - job_record.record_finish(cur_hash) - job_record.dump() - job_record.dump() - return job_record.check_all_finished() - - -class JobRecord(object): - def __init__ (self, path, task_chunks, fname = 'job_record.json', ip=None): - self.path = os.path.abspath(path) - self.fname = os.path.join(self.path, fname) - self.task_chunks = task_chunks - if not os.path.exists(self.fname): - self._new_record() - else : - self.load() - - def check_submitted(self, chunk_hash): - self.valid_hash(chunk_hash) - return self.record[chunk_hash]['context'] is not None - - def record_remote_context(self, - chunk_hash, - local_root, - remote_root, - job_uuid, - ip=None, - instance_id=None): - self.valid_hash(chunk_hash) - # self.record[chunk_hash]['context'] = [local_root, remote_root, job_uuid, ip, instance_id] - self.record[chunk_hash]['context'] = {} - self.record[chunk_hash]['context']['local_root'] = local_root - self.record[chunk_hash]['context']['remote_root'] = remote_root - self.record[chunk_hash]['context']['job_uuid'] = job_uuid - self.record[chunk_hash]['context']['ip'] = ip - self.record[chunk_hash]['context']['instance_id'] = instance_id - - def get_uuid(self, chunk_hash): - self.valid_hash(chunk_hash) - return self.record[chunk_hash]['context']['job_uuid'] - - def check_finished(self, chunk_hash): - self.valid_hash(chunk_hash) - return self.record[chunk_hash]['finished'] - - def check_all_finished(self): - flist = [self.record[ii]['finished'] for ii in self.record] - return all(flist) - - def record_finish(self, chunk_hash): - self.valid_hash(chunk_hash) - self.record[chunk_hash]['finished'] = True - - def check_nfail(self,chunk_hash): - self.valid_hash(chunk_hash) - return self.record[chunk_hash]['fail_count'] - - def increase_nfail(self,chunk_hash): - self.valid_hash(chunk_hash) - self.record[chunk_hash]['fail_count'] += 1 - - def valid_hash(self, chunk_hash): - if chunk_hash not in self.record.keys(): - raise RuntimeError('chunk hash %s not in record, a invalid record may be used, please check file %s' % (chunk_hash, self.fname)) - - def dump(self): - with open(self.fname, 'w') as fp: - json.dump(self.record, fp, indent=4) - - def load(self): - with open(self.fname) as fp: - self.record = json.load(fp) - - def _new_record(self): - task_chunks_str=['+'.join(ii) for ii in self.task_chunks] - task_hash = [sha1(ii.encode('utf-8')).hexdigest() for ii in task_chunks_str] - self.record = {} - for ii,jj in zip(task_hash, self.task_chunks): - self.record[ii] = { - 'context': None, - 'finished': False, - 'fail_count': 0, - 'task_chunk': jj, - } - - -def make_dispatcher(mdata, mdata_resource=None, work_path=None, run_tasks=None, group_size=None): - if 'cloud_resources' in mdata: - if mdata['cloud_resources']['cloud_platform'] == 'ali': - from 
dpgen.dispatcher.ALI import ALI - dispatcher = ALI(mdata, mdata_resource, work_path, run_tasks, group_size, mdata['cloud_resources']) - dispatcher.init() - return dispatcher - elif mdata['cloud_resources']['cloud_platform'] == 'ucloud': - pass - else: - hostname = mdata.get('hostname', None) - #use_uuid = mdata.get('use_uuid', False) - if hostname: - context_type = 'ssh' - else: - context_type = 'local' - try: - batch_type = mdata['batch'] - except Exception: - dlog.info('cannot find key "batch" in machine file, try to use deprecated key "machine_type"') - batch_type = mdata['machine_type'] - lazy_local = (mdata.get('lazy-local', False)) or (mdata.get('lazy_local', False)) - if lazy_local and context_type == 'local': - dlog.info('Dispatcher switches to the lazy local mode') - context_type = 'lazy-local' - disp = Dispatcher(mdata, context_type=context_type, batch_type=batch_type) - return disp def make_submission(mdata_machine, mdata_resources, commands, work_path, run_tasks, group_size, forward_common_files, forward_files, backward_files, outlog, errlog): @@ -426,11 +87,11 @@ def make_submission_compat( backward_files: List[str], outlog: str="log", errlog: str="err", - api_version: str="0.9", + api_version: str="1.0", ) -> None: """Make submission with compatibility of both dispatcher API v0 and v1. - If `api_version` is less than 1.0, use `make_dispatcher`. If + If `api_version` is less than 1.0, raise RuntimeError. If `api_version` is large than 1.0, use `make_submission`. Parameters @@ -457,23 +118,11 @@ def make_submission_compat( path to log from stdout errlog : str, default=err path to log from stderr - api_version : str, default=0.9 - API version. 1.0 is recommended + api_version : str, default=1.0 + API version. 1.0 is required """ if LooseVersion(api_version) < LooseVersion('1.0'): - warnings.warn(f"the dpdispatcher will be updated to new version." - f"And the interface may be changed. Please check the documents for more details") - dispatcher = make_dispatcher(machine, resources, work_dir, run_tasks, group_size) - dispatcher.run_jobs(resources, - commands, - work_path, - run_tasks, - group_size, - forward_common_files, - forward_files, - backward_files, - outlog=outlog, - errlog=errlog) + raise RuntimeError("API version %s has been removed. Please upgrade to 1.0." 
% api_version) elif LooseVersion(api_version) >= LooseVersion('1.0'): submission = make_submission( diff --git a/dpgen/dispatcher/DispatcherList.py b/dpgen/dispatcher/DispatcherList.py deleted file mode 100644 index 22b77fd50..000000000 --- a/dpgen/dispatcher/DispatcherList.py +++ /dev/null @@ -1,227 +0,0 @@ -from dpgen.dispatcher.Dispatcher import Dispatcher, _split_tasks, JobRecord -from paramiko.ssh_exception import NoValidConnectionsError -import os, time -from dpgen import dlog -class Entity(): - def __init__(self, ip, instance_id, job_record=None, job_handler=None): - self.ip = ip - self.instance_id = instance_id - self.job_record = job_record - self.job_handler = job_handler - -class DispatcherList(): - def __init__(self, mdata_machine, mdata_resources, work_path, run_tasks, group_size, cloud_resources=None): - self.mdata_machine = mdata_machine - self.mdata_resources = mdata_resources - self.task_chunks = _split_tasks(run_tasks, group_size) - self.nchunks = len(self.task_chunks) - self.nchunks_limit = int(self.mdata_machine.get("machine_upper_bound", self.nchunks)) - if(self.nchunks_limit > self.nchunks): - self.nchunks_limit = self.nchunks - self.work_path = work_path - self.cloud_resources = cloud_resources - self.server_pool = [] - self.ip_pool = [] - self.dispatcher_list = list({"dispatcher": None, - "dispatcher_status": "unallocated", - "entity": None} for ii in range(self.nchunks)) - # Derivate - def init(self): - # do something necessary - for ii in range(self.nchunks): - self.create(ii) - - # Base - def run_jobs(self, - resources, - command, - work_path, - tasks, - group_size, - forward_common_files, - forward_task_files, - backward_task_files, - forward_task_deference = True, - mark_failure = False, - outlog = 'log', - errlog = 'err'): - ratio_failure = self.mdata_resources.get("ratio_failure", 0) - while True: - if self.check_all_dispatchers_finished(ratio_failure): - self.clean() - break - self.exception_handling(ratio_failure) - jj = self.nchunks - 1 - for ii in range(self.nchunks): - dispatcher_status = self.check_dispatcher_status(ii) - if dispatcher_status == "unsubmitted": - dlog.info(self.dispatcher_list[ii]["entity"].ip) - self.dispatcher_list[ii]["entity"].job_handler = self.dispatcher_list[ii]["dispatcher"].submit_jobs(resources, - command, - work_path, - self.task_chunks[ii], - group_size, - forward_common_files, - forward_task_files, - backward_task_files, - forward_task_deference, - outlog, - errlog) - self.dispatcher_list[ii]["entity"].job_record = self.dispatcher_list[ii]["entity"].job_handler["job_record"] - self.dispatcher_list[ii]["dispatcher_status"] = "running" - elif dispatcher_status == "finished" and self.dispatcher_list[ii]["entity"]: - # no jobs in queue, delete current machine - # else add current machine to server_pool - entity = self.dispatcher_list[ii]["entity"] - status_list = [item["dispatcher_status"] for item in self.dispatcher_list] - flag = "unallocated" in status_list - if not flag: - self.delete(ii) - self.dispatcher_list[ii]["entity"] = None - else: - self.dispatcher_list[ii]["entity"] = None - self.server_pool.append(entity.instance_id) - self.ip_pool.append(entity.ip) - while(jj>=ii): - if(self.dispatcher_list[jj]["dispatcher_status"] == "unallocated"): - self.create(jj) - if(self.dispatcher_list[jj]["dispatcher_status"] == "unsubmitted"): - dlog.info(self.dispatcher_list[jj]["entity"].ip) - self.dispatcher_list[jj]["entity"].job_handler = self.dispatcher_list[jj]["dispatcher"].submit_jobs(resources, - command, - work_path, - 
self.task_chunks[jj], - group_size, - forward_common_files, - forward_task_files, - backward_task_files, - forward_task_deference, - outlog, - errlog) - self.dispatcher_list[jj]["entity"].job_record = self.dispatcher_list[jj]["entity"].job_handler["job_record"] - self.dispatcher_list[jj]["dispatcher_status"] = "running" - break - jj -=1 - elif dispatcher_status == "running": - pass - elif dispatcher_status == "unallocated": - # if len(server_pool) > 0: make_dispatcher - # else: pass - self.create(ii) - if self.dispatcher_list[ii]["dispatcher_status"] == "unsubmitted": - dlog.info(self.dispatcher_list[ii]["entity"].ip) - self.dispatcher_list[ii]["entity"].job_handler = self.dispatcher_list[ii]["dispatcher"].submit_jobs(resources, - command, - work_path, - self.task_chunks[ii], - group_size, - forward_common_files, - forward_task_files, - backward_task_files, - forward_task_deference, - outlog, - errlog) - self.dispatcher_list[ii]["entity"].job_record = self.dispatcher_list[ii]["entity"].job_handler["job_record"] - self.dispatcher_list[ii]["dispatcher_status"] = "running" - elif dispatcher_status == "terminated": - pass - self.update() - time.sleep(10) - - # Derivate - def create(self, ii): - '''case1: use existed machine(finished) to make_dispatcher - case2: create one machine, then make_dispatcher, change status from unallocated to unsubmitted''' - pass - - # Derivate - def delete(self, ii): - '''delete one machine - if entity is none, means this machine is used by another dispatcher, shouldn't be deleted''' - pass - - # Derivate, delete config like templates, etc. - def clean(self): - pass - - # Derivate - def update(): - pass - - # Base - def check_all_dispatchers_finished(self, ratio_failure): - status_list = [item["dispatcher_status"] for item in self.dispatcher_list] - finished_num = status_list.count("finished") - if finished_num / self.nchunks < (1 - ratio_failure): return False - else: return True - - # Base - def exception_handling(self, ratio_failure): - status_list = [item["dispatcher_status"] for item in self.dispatcher_list] - terminated_num = status_list.count("terminated") - if terminated_num / self.nchunks > ratio_failure: - # self.dispatcher_list = [lambda item["dispatcher_status"]: "finished" for item in self.dispatcher_list if item["dispatcher_status"] == "terminated"] - for ii in range(self.nchunks): - if self.dispatcher_list[ii]["dispatcher_status"] == "terminated": - self.dispatcher_list[ii]["dispatcher_status"] = "unallocated" - # Base - def make_dispatcher(self, ii): - entity = self.dispatcher_list[ii]["entity"] - profile = self.mdata_machine.copy() - profile['hostname'] = entity.ip - profile['instance_id'] = entity.instance_id - count = 0 - flag = 0 - while count < 3: - try: - self.dispatcher_list[ii]["dispatcher"] = Dispatcher(profile, context_type='ssh', batch_type='shell', job_record='jr.%.06d.json' % ii) - self.dispatcher_list[ii]["dispatcher_status"] = "unsubmitted" - flag = 1 - break - except Exception: - count += 1 - time.sleep(60) - if not flag: - # give up this machine, wait other machine in sever_pool. - # this machine will be append into server_pool next time when update apg_instances. 
- self.dispatcher_list[ii]["entity"] = None - - - # Base - def check_dispatcher_status(self, ii, allow_failure=False): - '''catch running dispatcher exception - if no exception occured, check finished''' - if self.dispatcher_list[ii]["dispatcher_status"] == "running": - status = self.catch_dispatcher_exception(ii) - if status == 0: - # param clean: delete remote work_dir or not. - clean = self.mdata_resources.get("clean", False) - try: - # avoid raising ssh exception in download proceess - finished = self.dispatcher_list[ii]["dispatcher"].all_finished(self.dispatcher_list[ii]["entity"].job_handler, allow_failure, clean) - if finished: - self.dispatcher_list[ii]["dispatcher_status"] = "finished" - except Exception: - pass - elif status == 1: - # self.dispatcher_list[ii]["dispatcher_status"] = "terminated" - pass - elif status == 2: - self.dispatcher_list[ii]["dispatcher"] = None - self.dispatcher_list[ii]["dispatcher_status"] = "terminated" - self.dispatcher_list[ii]["entity"] = None - os.remove(os.path.join(self.work_path, "jr.%.06d.json" % ii)) - return self.dispatcher_list[ii]["dispatcher_status"] - - # Derivate - def catch_dispatcher_exception(self, ii): - '''everything is okay: return 0 - ssh not active : return 1 - machine callback : return 2''' - pass - - - - - - diff --git a/dpgen/dispatcher/JobStatus.py b/dpgen/dispatcher/JobStatus.py deleted file mode 100644 index f649e36a0..000000000 --- a/dpgen/dispatcher/JobStatus.py +++ /dev/null @@ -1,11 +0,0 @@ -from enum import Enum - -class JobStatus (Enum) : - unsubmitted = 1 - waiting = 2 - running = 3 - terminated = 4 - finished = 5 - completing = 6 - unknown = 100 - diff --git a/dpgen/dispatcher/LSF.py b/dpgen/dispatcher/LSF.py deleted file mode 100644 index dfde7c5e3..000000000 --- a/dpgen/dispatcher/LSF.py +++ /dev/null @@ -1,190 +0,0 @@ -import os,getpass,time -from dpgen.dispatcher.Batch import Batch -from dpgen.dispatcher.JobStatus import JobStatus - -def _default_item(resources, key, value) : - if key not in resources : - resources[key] = value - -class LSF(Batch) : - - def check_status(self): - try: - job_id = self._get_job_id() - except Exception: - return JobStatus.terminated - if job_id == "" : - raise RuntimeError("job %s has not been submitted" % self.context.remote_root) - ret, stdin, stdout, stderr\ - = self.context.block_call ("bjobs " + job_id) - err_str = stderr.read().decode('utf-8') - if ("Job <%s> is not found" % job_id) in err_str : - if self.check_finish_tag() : - return JobStatus.finished - else : - return JobStatus.terminated - elif ret != 0 : - raise RuntimeError ("status command bjobs fails to execute. 
erro info: %s return code %d" - % (err_str, ret)) - status_out = stdout.read().decode('utf-8').split('\n') - if len(status_out) < 2: - return JobStatus.unknown - else: - status_line = status_out[1] - status_word = status_line.split()[2] - - # ref: https://www.ibm.com/support/knowledgecenter/en/SSETD4_9.1.2/lsf_command_ref/bjobs.1.html - if status_word in ["PEND", "WAIT", "PSUSP"] : - return JobStatus.waiting - elif status_word in ["RUN", "USUSP"] : - return JobStatus.running - elif status_word in ["DONE","EXIT"] : - if self.check_finish_tag() : - return JobStatus.finished - else : - return JobStatus.terminated - else : - return JobStatus.unknown - - - def do_submit(self, - job_dirs, - cmd, - args = None, - res = None, - outlog = 'log', - errlog = 'err'): - if res == None: - res = self.default_resources(res) - if 'task_max' in res and res['task_max'] > 0: - while self._check_sub_limit(task_max=res['task_max']): - time.sleep(60) - script_str = self.sub_script(job_dirs, cmd, args=args, res=res, outlog=outlog, errlog=errlog) - self.context.write_file(self.sub_script_name, script_str) - stdin, stdout, stderr = self.context.block_checkcall('cd %s && %s < %s' % (self.context.remote_root, 'bsub', self.sub_script_name)) - subret = (stdout.readlines()) - job_id = subret[0].split()[1][1:-1] - self.context.write_file(self.job_id_name, job_id) - - - def default_resources(self, res_) : - """ - set default value if a key in res_ is not fhound - """ - if res_ == None : - res = {} - else: - res = res_ - _default_item(res, 'node_cpu', 1) - _default_item(res, 'numb_node', 1) - _default_item(res, 'task_per_node', 1) - _default_item(res, 'cpus_per_task', -1) - _default_item(res, 'numb_gpu', 0) - _default_item(res, 'time_limit', '1:0:0') - _default_item(res, 'mem_limit', -1) - _default_item(res, 'partition', '') - _default_item(res, 'account', '') - _default_item(res, 'qos', '') - _default_item(res, 'constraint_list', []) - _default_item(res, 'license_list', []) - _default_item(res, 'exclude_list', []) - _default_item(res, 'module_unload_list', []) - _default_item(res, 'module_list', []) - _default_item(res, 'source_list', []) - _default_item(res, 'envs', None) - _default_item(res, 'with_mpi', False) - _default_item(res, 'cuda_multi_tasks', False) - _default_item(res, 'allow_failure', False) - _default_item(res, 'cvasp', False) - return res - - def sub_script_head(self, res): - ret = '' - ret += "#!/bin/bash -l\n#BSUB -e %J.err\n#BSUB -o %J.out\n" - if res['numb_gpu'] == 0: - ret += '#BSUB -n %d\n#BSUB -R span[ptile=%d]\n' % ( - res['numb_node'] * res['task_per_node'], res['node_cpu']) - else: - if res['node_cpu']: - ret += '#BSUB -R span[ptile=%d]\n' % res['node_cpu'] - if res.get('new_lsf_gpu', False): - # supported in LSF >= 10.1.0.3 - # ref: https://www.ibm.com/support/knowledgecenter/en/SSWRJV_10.1.0 - # /lsf_resource_sharing/use_gpu_res_reqs.html - if res.get('exclusive', False): - j_exclusive = "no" - else: - j_exclusive = "yes" - ret += '#BSUB -n %d\n#BSUB -gpu "num=%d:mode=shared:j_exclusive=%s"\n' % ( - res['task_per_node'], res['numb_gpu'], j_exclusive) - else: - ret += '#BSUB -n %d\n#BSUB -R "select[ngpus >0] rusage[ngpus_excl_p=%d]"\n' % ( - res['task_per_node'], res['numb_gpu']) - if res['time_limit']: - ret += '#BSUB -W %s\n' % (res['time_limit'].split(':')[ - 0] + ':' + res['time_limit'].split(':')[1]) - if res['mem_limit'] > 0 : - ret += "#BSUB -M %d \n" % (res['mem_limit']) - ret += '#BSUB -J %s\n' % (res['job_name'] if 'job_name' in res else 'dpgen') - if len(res['partition']) > 0 : - ret += 
'#BSUB -q %s\n' % res['partition'] - if len(res['exclude_list']) > 0: - ret += '#BSUB -R "select[' - temp_exclude = [] - for ii in res['exclude_list']: - temp_exclude.append('hname != %s' % ii) - ret += ' && '.join(temp_exclude) - ret += ']"\n' - ret += "\n" - for ii in res['module_unload_list'] : - ret += "module unload %s\n" % ii - for ii in res['module_list'] : - ret += "module load %s\n" % ii - ret += "\n" - for ii in res['source_list'] : - ret += "source %s\n" %ii - ret += "\n" - envs = res['envs'] - if envs != None : - for key in envs.keys() : - ret += 'export %s=%s\n' % (key, envs[key]) - ret += '\n' - return ret - - - def sub_script_cmd(self, - cmd, - arg, - res) : - if res['with_mpi']: - ret = 'mpirun -machinefile $LSB_DJOB_HOSTFILE -n %d %s %s' % ( - res['numb_node'] * res['task_per_node'], cmd, arg) - else : - ret = '%s %s' % (cmd, arg) - return ret - - - def _get_job_id(self) : - if self.context.check_file_exists(self.job_id_name) : - return self.context.read_file(self.job_id_name) - else: - return "" - - - def _check_sub_limit(self, task_max, **kwarg) : - stdin_run, stdout_run, stderr_run = self.context.block_checkcall("bjobs | grep RUN | wc -l") - njobs_run = int(stdout_run.read().decode('utf-8').split ('\n')[0]) - stdin_pend, stdout_pend, stderr_pend = self.context.block_checkcall("bjobs | grep PEND | wc -l") - njobs_pend = int(stdout_pend.read().decode('utf-8').split ('\n')[0]) - if (njobs_pend + njobs_run) < task_max: - return False - else: - return True - - - def _make_squeue(self, mdata1, res): - ret = '' - ret += 'bjobs -u %s ' % mdata1['username'] - ret += '-q %s ' % res['partition'] - ret += '| grep PEND ' - return ret diff --git a/dpgen/dispatcher/LazyLocalContext.py b/dpgen/dispatcher/LazyLocalContext.py deleted file mode 100644 index 0b66335f2..000000000 --- a/dpgen/dispatcher/LazyLocalContext.py +++ /dev/null @@ -1,135 +0,0 @@ -import os,shutil,uuid -import subprocess as sp -from glob import glob -from dpgen import dlog - -class SPRetObj(object) : - def __init__ (self, - ret) : - self.data = ret - - def read(self) : - return self.data - - def readlines(self) : - lines = self.data.decode('utf-8').splitlines() - ret = [] - for aa in lines: - ret.append(aa+'\n') - return ret - -class LazyLocalContext(object) : - def __init__ (self, - local_root, - work_profile = None, - job_uuid = None) : - """ - work_profile: - local_root: - """ - assert(type(local_root) == str) - self.local_root = os.path.abspath(local_root) - self.remote_root = self.local_root - if job_uuid: - self.job_uuid=job_uuid - else: - self.job_uuid = str(uuid.uuid4()) - - def get_job_root(self) : - return self.local_root - - def upload(self, - job_dirs, - local_up_files, - dereference = True) : - pass - - def download(self, - job_dirs, - remote_down_files, - check_exists = False, - mark_failure = True, - back_error=False) : - for ii in job_dirs : - for jj in remote_down_files : - fname = os.path.join(self.local_root, ii, jj) - exists = os.path.exists(fname) - if not exists: - if check_exists: - if mark_failure: - with open(os.path.join(self.local_root, ii, 'tag_failure_download_%s' % jj), 'w') as fp: pass - else: - pass - else: - raise OSError('do not find download file ' + fname) - - - def block_checkcall(self, - cmd) : - cwd = os.getcwd() - os.chdir(self.local_root) - proc = sp.Popen(cmd, shell=True, stdout = sp.PIPE, stderr = sp.PIPE) - o, e = proc.communicate() - stdout = SPRetObj(o) - stderr = SPRetObj(e) - code = proc.returncode - if code != 0: - os.chdir(cwd) - raise RuntimeError("Get error code %d 
in locally calling %s with job: %s ", (code, cmd, self.job_uuid)) - os.chdir(cwd) - return None, stdout, stderr - - def block_call(self, cmd) : - cwd = os.getcwd() - os.chdir(self.local_root) - proc = sp.Popen(cmd, shell=True, stdout = sp.PIPE, stderr = sp.PIPE) - o, e = proc.communicate() - stdout = SPRetObj(o) - stderr = SPRetObj(e) - code = proc.returncode - os.chdir(cwd) - return code, None, stdout, stderr - - def clean(self) : - pass - - def write_file(self, fname, write_str): - with open(os.path.join(self.local_root, fname), 'w') as fp : - fp.write(write_str) - - def read_file(self, fname): - with open(os.path.join(self.local_root, fname), 'r') as fp: - ret = fp.read() - return ret - - def check_file_exists(self, fname): - return os.path.isfile(os.path.join(self.local_root, fname)) - - def call(self, cmd) : - cwd = os.getcwd() - os.chdir(self.local_root) - proc = sp.Popen(cmd, shell=True, stdout = sp.PIPE, stderr = sp.PIPE) - os.chdir(cwd) - return proc - - def kill(self, proc): - proc.kill() - - def check_finish(self, proc): - return (proc.poll() != None) - - def get_return(self, proc): - ret = proc.poll() - if ret is None: - return None, None, None - else : - try: - o, e = proc.communicate() - stdout = SPRetObj(o) - stderr = SPRetObj(e) - except ValueError: - stdout = None - stderr = None - return ret, stdout, stderr - - diff --git a/dpgen/dispatcher/LocalContext.py b/dpgen/dispatcher/LocalContext.py deleted file mode 100644 index 81fbd5007..000000000 --- a/dpgen/dispatcher/LocalContext.py +++ /dev/null @@ -1,210 +0,0 @@ -import os,shutil,uuid,hashlib -import subprocess as sp -from glob import glob -from dpgen import dlog - -class LocalSession (object) : - def __init__ (self, jdata) : - self.work_path = os.path.abspath(jdata['work_path']) - assert(os.path.exists(self.work_path)) - - def get_work_root(self) : - return self.work_path - -class SPRetObj(object) : - def __init__ (self, - ret) : - self.data = ret - - def read(self) : - return self.data - - def readlines(self) : - lines = self.data.decode('utf-8').splitlines() - ret = [] - for aa in lines: - ret.append(aa+'\n') - return ret - -def _check_file_path(fname) : - dirname = os.path.dirname(fname) - if dirname != "": - os.makedirs(dirname, exist_ok=True) - -def _identical_files(fname0, fname1) : - with open(fname0) as fp: - code0 = hashlib.sha1(fp.read().encode('utf-8')).hexdigest() - with open(fname1) as fp: - code1 = hashlib.sha1(fp.read().encode('utf-8')).hexdigest() - return code0 == code1 - - -class LocalContext(object) : - def __init__ (self, - local_root, - work_profile, - job_uuid = None) : - """ - work_profile: - local_root: - """ - assert(type(local_root) == str) - self.local_root = os.path.abspath(local_root) - if job_uuid: - self.job_uuid=job_uuid - else: - self.job_uuid = str(uuid.uuid4()) - - self.remote_root = os.path.join(work_profile.get_work_root(), self.job_uuid) - dlog.debug("local_root is %s"% local_root) - dlog.debug("remote_root is %s"% self.remote_root) - - os.makedirs(self.remote_root, exist_ok = True) - - def get_job_root(self) : - return self.remote_root - - def upload(self, - job_dirs, - local_up_files, - dereference = True) : - cwd = os.getcwd() - for ii in job_dirs : - local_job = os.path.join(self.local_root, ii) - remote_job = os.path.join(self.remote_root, ii) - os.makedirs(remote_job, exist_ok = True) - os.chdir(remote_job) - for jj in local_up_files : - if not os.path.exists(os.path.join(local_job, jj)): - os.chdir(cwd) - raise OSError('cannot find upload file ' + os.path.join(local_job, jj)) - 
if os.path.exists(os.path.join(remote_job, jj)) : - os.remove(os.path.join(remote_job, jj)) - _check_file_path(jj) - os.symlink(os.path.join(local_job, jj), - os.path.join(remote_job, jj)) - os.chdir(cwd) - - def download(self, - job_dirs, - remote_down_files, - check_exists = False, - mark_failure = True, - back_error=False) : - cwd = os.getcwd() - for ii in job_dirs : - local_job = os.path.join(self.local_root, ii) - remote_job = os.path.join(self.remote_root, ii) - flist = remote_down_files - if back_error : - os.chdir(remote_job) - flist += glob('error*') - os.chdir(cwd) - for jj in flist : - rfile = os.path.join(remote_job, jj) - lfile = os.path.join(local_job, jj) - if not os.path.realpath(rfile) == os.path.realpath(lfile) : - if (not os.path.exists(rfile)) and (not os.path.exists(lfile)): - if check_exists : - if mark_failure: - with open(os.path.join(self.local_root, ii, 'tag_failure_download_%s' % jj), 'w') as fp: pass - else : - pass - else : - raise RuntimeError('do not find download file ' + rfile) - elif (not os.path.exists(rfile)) and (os.path.exists(lfile)) : - # already downloaded - pass - elif (os.path.exists(rfile)) and (not os.path.exists(lfile)) : - # trivial case, download happily - # If the file to be downloaded is a softlink, `cp` should be performed instead of `mv`. - # Otherwise, `lfile` is still a file linked to some original file, - # and when this file's removed, `lfile` will be invalid. - if os.path.islink(rfile): - shutil.copyfile(rfile,lfile) - else: - shutil.move(rfile, lfile) - elif (os.path.exists(rfile)) and (os.path.exists(lfile)) : - # both exists, replace! - dlog.info('find existing %s, replacing by %s' % (lfile, rfile)) - if os.path.isdir(lfile): - shutil.rmtree(lfile, ignore_errors=True) - elif os.path.isfile(lfile) or os.path.islink(lfile): - os.remove(lfile) - shutil.move(rfile, lfile) - else : - raise RuntimeError('should not reach here!') - else : - # no nothing in the case of linked files - pass - os.chdir(cwd) - - def block_checkcall(self, - cmd) : - cwd = os.getcwd() - os.chdir(self.remote_root) - proc = sp.Popen(cmd, shell=True, stdout = sp.PIPE, stderr = sp.PIPE) - o, e = proc.communicate() - stdout = SPRetObj(o) - stderr = SPRetObj(e) - code = proc.returncode - if code != 0: - os.chdir(cwd) - raise RuntimeError("Get error code %d in locally calling %s with job: %s ", (code, cmd, self.job_uuid)) - os.chdir(cwd) - return None, stdout, stderr - - def block_call(self, cmd) : - cwd = os.getcwd() - os.chdir(self.remote_root) - proc = sp.Popen(cmd, shell=True, stdout = sp.PIPE, stderr = sp.PIPE) - o, e = proc.communicate() - stdout = SPRetObj(o) - stderr = SPRetObj(e) - code = proc.returncode - os.chdir(cwd) - return code, None, stdout, stderr - - def clean(self) : - shutil.rmtree(self.remote_root, ignore_errors=True) - - def write_file(self, fname, write_str): - with open(os.path.join(self.remote_root, fname), 'w') as fp : - fp.write(write_str) - - def read_file(self, fname): - with open(os.path.join(self.remote_root, fname), 'r') as fp: - ret = fp.read() - return ret - - def check_file_exists(self, fname): - return os.path.isfile(os.path.join(self.remote_root, fname)) - - def call(self, cmd) : - cwd = os.getcwd() - os.chdir(self.remote_root) - proc = sp.Popen(cmd, shell=True, stdout = sp.PIPE, stderr = sp.PIPE) - os.chdir(cwd) - return proc - - def kill(self, proc): - proc.kill() - - def check_finish(self, proc): - return (proc.poll() != None) - - def get_return(self, proc): - ret = proc.poll() - if ret is None: - return None, None, None - else 
: - try: - o, e = proc.communicate() - stdout = SPRetObj(o) - stderr = SPRetObj(e) - except ValueError: - stdout = None - stderr = None - return ret, stdout, stderr - - diff --git a/dpgen/dispatcher/PBS.py b/dpgen/dispatcher/PBS.py deleted file mode 100644 index 0fed5a888..000000000 --- a/dpgen/dispatcher/PBS.py +++ /dev/null @@ -1,137 +0,0 @@ -import os,getpass,time -from dpgen.dispatcher.Batch import Batch -from dpgen.dispatcher.JobStatus import JobStatus - -def _default_item(resources, key, value) : - if key not in resources : - resources[key] = value - -class PBS(Batch) : - - def check_status(self) : - job_id = self._get_job_id() - if job_id == "" : - return JobStatus.unsubmitted - ret, stdin, stdout, stderr\ - = self.context.block_call ("qstat " + job_id) - err_str = stderr.read().decode('utf-8') - if (ret != 0) : - if str("qstat: Unknown Job Id") in err_str or str("Job has finished") in err_str: - if self.check_finish_tag() : - return JobStatus.finished - else : - return JobStatus.terminated - else : - raise RuntimeError ("status command qstat fails to execute. erro info: %s return code %d" - % (err_str, ret)) - status_line = stdout.read().decode('utf-8').split ('\n')[-2] - status_word = status_line.split ()[-2] - # dlog.info (status_word) - if status_word in ["Q","H"] : - return JobStatus.waiting - elif status_word in ["R"] : - return JobStatus.running - elif status_word in ["C","E","K"] : - if self.check_finish_tag() : - return JobStatus.finished - else : - return JobStatus.terminated - else : - return JobStatus.unknown - - def do_submit(self, - job_dirs, - cmd, - args = None, - res = None, - outlog = 'log', - errlog = 'err'): - if res == None: - res = self.default_resources(res) - # if 'task_max' in res and res['task_max'] > 0: - # while self._check_sub_limit(task_max=res['task_max']): - # time.sleep(60) - script_str = self.sub_script(job_dirs, cmd, args=args, res=res, outlog=outlog, errlog=errlog) - self.context.write_file(self.sub_script_name, script_str) - stdin, stdout, stderr = self.context.block_checkcall('cd %s && %s %s' % (self.context.remote_root, 'qsub', self.sub_script_name)) - subret = (stdout.readlines()) - job_id = subret[0].split()[0] - self.context.write_file(self.job_id_name, job_id) - - def default_resources(self, res_) : - """ - set default value if a key in res_ is not fhound - """ - if res_ == None : - res = {} - else: - res = res_ - _default_item(res, 'numb_node', 1) - _default_item(res, 'task_per_node', 1) - _default_item(res, 'cpus_per_task', -1) - _default_item(res, 'numb_gpu', 0) - _default_item(res, 'time_limit', '1:0:0') - _default_item(res, 'mem_limit', -1) - _default_item(res, 'partition', '') - _default_item(res, 'account', '') - _default_item(res, 'qos', '') - _default_item(res, 'constraint_list', []) - _default_item(res, 'license_list', []) - _default_item(res, 'exclude_list', []) - _default_item(res, 'module_unload_list', []) - _default_item(res, 'module_list', []) - _default_item(res, 'source_list', []) - _default_item(res, 'envs', None) - _default_item(res, 'with_mpi', False) - _default_item(res, 'cuda_multi_tasks', False) - _default_item(res, 'allow_failure', True) - _default_item(res, 'cvasp', False) - return res - - def sub_script_head(self, res): - ret = '' - ret += "#!/bin/bash -l\n" - if res['numb_gpu'] == 0: - ret += '#PBS -l nodes=%d:ppn=%d\n' % (res['numb_node'], res['task_per_node']) - else : - ret += '#PBS -l nodes=%d:ppn=%d:gpus=%d\n' % (res['numb_node'], res['task_per_node'], res['numb_gpu']) - ret += '#PBS -l walltime=%s\n' % 
(res['time_limit']) - if res['mem_limit'] > 0 : - ret += "#PBS -l mem=%dG \n" % res['mem_limit'] - ret += '#PBS -j oe\n' - if len(res['partition']) > 0 : - ret += '#PBS -q %s\n' % res['partition'] - ret += "\n" - for ii in res['module_unload_list'] : - ret += "module unload %s\n" % ii - for ii in res['module_list'] : - ret += "module load %s\n" % ii - ret += "\n" - for ii in res['source_list'] : - ret += "source %s\n" %ii - ret += "\n" - envs = res['envs'] - if envs != None : - for key in envs.keys() : - ret += 'export %s=%s\n' % (key, envs[key]) - ret += '\n' - ret += 'cd $PBS_O_WORKDIR\n\n' - return ret - - def sub_script_cmd(self, - cmd, - arg, - res) : - if res['with_mpi']: - ret = 'mpirun -machinefile $PBS_NODEFILE -n %d %s %s' % ( - res['numb_node'] * res['task_per_node'], cmd, arg) - else : - ret = '%s %s' % (cmd, arg) - return ret - - def _get_job_id(self) : - if self.context.check_file_exists(self.job_id_name) : - return self.context.read_file(self.job_id_name) - else: - return "" - diff --git a/dpgen/dispatcher/SSHContext.py b/dpgen/dispatcher/SSHContext.py deleted file mode 100644 index 7f614f31b..000000000 --- a/dpgen/dispatcher/SSHContext.py +++ /dev/null @@ -1,359 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - -import os, sys, paramiko, json, uuid, tarfile, time, stat, shutil -from glob import glob -from dpgen import dlog - -class SSHSession (object) : - def __init__ (self, jdata) : - self.remote_profile = jdata - # with open(remote_profile) as fp : - # self.remote_profile = json.load(fp) - self.remote_host = self.remote_profile['hostname'] - self.remote_uname = self.remote_profile['username'] - self.remote_port = self.remote_profile.get('port', 22) - self.remote_password = self.remote_profile.get('password', None) - self.local_key_filename = self.remote_profile.get('key_filename', None) - self.remote_timeout = self.remote_profile.get('timeout', None) - self.local_key_passphrase = self.remote_profile.get('passphrase', None) - self.remote_workpath = self.remote_profile['work_path'] - self.ssh = None - self._setup_ssh(hostname=self.remote_host, - port=self.remote_port, - username=self.remote_uname, - password=self.remote_password, - key_filename=self.local_key_filename, - timeout=self.remote_timeout, - passphrase=self.local_key_passphrase) - - def ensure_alive(self, - max_check = 10, - sleep_time = 10): - count = 1 - while not self._check_alive(): - if count == max_check: - raise RuntimeError('cannot connect ssh after %d failures at interval %d s' % - (max_check, sleep_time)) - dlog.info('connection check failed, try to reconnect to ' + self.remote_host) - self._setup_ssh(hostname=self.remote_host, - port=self.remote_port, - username=self.remote_uname, - password=self.remote_password, - key_filename=self.local_key_filename, - timeout=self.remote_timeout, - passphrase=self.local_key_passphrase) - count += 1 - time.sleep(sleep_time) - - def _check_alive(self): - if self.ssh == None: - return False - try : - transport = self.ssh.get_transport() - transport.send_ignore() - return True - except EOFError: - return False - - def _setup_ssh(self, - hostname, - port=22, - username=None, - password=None, - key_filename=None, - timeout=None, - passphrase=None): - self.ssh = paramiko.SSHClient() - # ssh_client.load_system_host_keys() - self.ssh.set_missing_host_key_policy(paramiko.WarningPolicy) - self.ssh.connect(hostname=hostname, port=port, - username=username, password=password, - key_filename=key_filename, timeout=timeout, passphrase=passphrase) - 
assert(self.ssh.get_transport().is_active()) - transport = self.ssh.get_transport() - transport.set_keepalive(60) - # reset sftp - self._sftp = None - - def get_ssh_client(self) : - return self.ssh - - def get_session_root(self) : - return self.remote_workpath - - def close(self) : - self.ssh.close() - - def exec_command(self, cmd, retry = 0): - """Calling self.ssh.exec_command but has an exception check.""" - try: - return self.ssh.exec_command(cmd) - except paramiko.ssh_exception.SSHException: - # SSH session not active - # retry for up to 3 times - if retry < 3: - dlog.warning("SSH session not active in calling %s, retry the command..." % cmd) - # ensure alive - self.ensure_alive() - return self.exec_command(cmd, retry = retry+1) - raise RuntimeError("SSH session not active") - - @property - def sftp(self): - """Returns sftp. Open a new one if not existing.""" - if self._sftp is None: - self.ensure_alive() - self._sftp = self.ssh.open_sftp() - return self._sftp - - -class SSHContext (object): - def __init__ (self, - local_root, - ssh_session, - job_uuid=None, - ) : - assert(type(local_root) == str) - self.local_root = os.path.abspath(local_root) - if job_uuid: - self.job_uuid=job_uuid - else: - self.job_uuid = str(uuid.uuid4()) - self.remote_root = os.path.join(ssh_session.get_session_root(), self.job_uuid) - self.ssh_session = ssh_session - self.ssh_session.ensure_alive() - try: - self.sftp.mkdir(self.remote_root) - except Exception: - pass - - @property - def ssh(self): - return self.ssh_session.get_ssh_client() - - @property - def sftp(self): - return self.ssh_session.sftp - - def close(self): - self.ssh_session.close() - - def get_job_root(self) : - return self.remote_root - - def upload(self, - job_dirs, - local_up_files, - dereference = True) : - self.ssh_session.ensure_alive() - cwd = os.getcwd() - os.chdir(self.local_root) - file_list = [] - for ii in job_dirs : - for jj in local_up_files : - file_list.append(os.path.join(ii,jj)) - self._put_files(file_list, dereference = dereference) - os.chdir(cwd) - - def download(self, - job_dirs, - remote_down_files, - check_exists = False, - mark_failure = True, - back_error=False) : - self.ssh_session.ensure_alive() - cwd = os.getcwd() - os.chdir(self.local_root) - file_list = [] - for ii in job_dirs : - for jj in remote_down_files : - file_name = os.path.join(ii,jj) - if check_exists: - if self.check_file_exists(file_name): - file_list.append(file_name) - elif mark_failure : - with open(os.path.join(self.local_root, ii, 'tag_failure_download_%s' % jj), 'w') as fp: pass - else: - pass - else: - file_list.append(file_name) - if back_error: - errors=glob(os.path.join(ii,'error*')) - file_list.extend(errors) - if len(file_list) > 0: - self._get_files(file_list) - os.chdir(cwd) - - def block_checkcall(self, - cmd, - retry=0) : - self.ssh_session.ensure_alive() - stdin, stdout, stderr = self.ssh_session.exec_command(('cd %s ;' % self.remote_root) + cmd) - exit_status = stdout.channel.recv_exit_status() - if exit_status != 0: - if retry<3: - # sleep 60 s - dlog.warning("Get error code %d in calling %s through ssh with job: %s . message: %s" % - (exit_status, cmd, self.job_uuid, stderr.read().decode('utf-8'))) - dlog.warning("Sleep 60 s and retry the command...") - time.sleep(60) - return self.block_checkcall(cmd, retry=retry+1) - raise RuntimeError("Get error code %d in calling %s through ssh with job: %s . 
message: %s" % - (exit_status, cmd, self.job_uuid, stderr.read().decode('utf-8'))) - return stdin, stdout, stderr - - def block_call(self, - cmd) : - self.ssh_session.ensure_alive() - stdin, stdout, stderr = self.ssh_session.exec_command(('cd %s ;' % self.remote_root) + cmd) - exit_status = stdout.channel.recv_exit_status() - return exit_status, stdin, stdout, stderr - - def clean(self) : - self.ssh_session.ensure_alive() - sftp = self.ssh.open_sftp() - self._rmtree(sftp, self.remote_root) - sftp.close() - - def write_file(self, fname, write_str): - self.ssh_session.ensure_alive() - with self.sftp.open(os.path.join(self.remote_root, fname), 'w') as fp : - fp.write(write_str) - - def read_file(self, fname): - self.ssh_session.ensure_alive() - with self.sftp.open(os.path.join(self.remote_root, fname), 'r') as fp: - ret = fp.read().decode('utf-8') - return ret - - def check_file_exists(self, fname): - self.ssh_session.ensure_alive() - try: - self.sftp.stat(os.path.join(self.remote_root, fname)) - ret = True - except IOError: - ret = False - return ret - - def call(self, cmd): - stdin, stdout, stderr = self.ssh_session.exec_command(cmd) - # stdin, stdout, stderr = self.ssh.exec_command('echo $$; exec ' + cmd) - # pid = stdout.readline().strip() - # print(pid) - return {'stdin':stdin, 'stdout':stdout, 'stderr':stderr} - - def check_finish(self, cmd_pipes): - return cmd_pipes['stdout'].channel.exit_status_ready() - - - def get_return(self, cmd_pipes): - if not self.check_finish(cmd_pipes): - return None, None, None - else : - retcode = cmd_pipes['stdout'].channel.recv_exit_status() - return retcode, cmd_pipes['stdout'], cmd_pipes['stderr'] - - def kill(self, cmd_pipes) : - raise RuntimeError('dose not work! we do not know how to kill proc through paramiko.SSHClient') - self.block_checkcall('kill -15 %s' % cmd_pipes['pid']) - - - def _rmtree(self, sftp, remotepath, level=0, verbose = False): - for f in sftp.listdir_attr(remotepath): - rpath = os.path.join(remotepath, f.filename) - if stat.S_ISDIR(f.st_mode): - self._rmtree(sftp, rpath, level=(level + 1)) - else: - rpath = os.path.join(remotepath, f.filename) - if verbose: dlog.info('removing %s%s' % (' ' * level, rpath)) - sftp.remove(rpath) - if verbose: dlog.info('removing %s%s' % (' ' * level, remotepath)) - sftp.rmdir(remotepath) - - def _put_files(self, - files, - dereference = True) : - of = self.job_uuid + '.tgz' - # local tar - cwd = os.getcwd() - os.chdir(self.local_root) - if os.path.isfile(of) : - os.remove(of) - with tarfile.open(of, "w:gz", dereference = dereference, compresslevel=6) as tar: - for ii in files : - tar.add(ii) - os.chdir(cwd) - # trans - from_f = os.path.join(self.local_root, of) - to_f = os.path.join(self.remote_root, of) - try: - self.sftp.put(from_f, to_f) - except FileNotFoundError: - raise FileNotFoundError("from %s to %s Error!"%(from_f,to_f)) - # remote extract - self.block_checkcall('tar xf %s' % of) - # clean up - os.remove(from_f) - self.sftp.remove(to_f) - - def _get_files(self, - files) : - of = self.job_uuid + '.tar.gz' - # remote tar - # If the number of files are large, we may get "Argument list too long" error. - # Thus, we may run tar commands for serveral times and tar only 100 files for - # each time. 
- per_nfile = 100 - ntar = len(files) // per_nfile + 1 - if ntar <= 1: - self.block_checkcall('tar czfh %s %s' % (of, " ".join(files))) - else: - of_tar = self.job_uuid + '.tar' - for ii in range(ntar): - ff = files[per_nfile * ii : per_nfile * (ii+1)] - if ii == 0: - # tar cf for the first time - self.block_checkcall('tar cfh %s %s' % (of_tar, " ".join(ff))) - else: - # append using tar rf - # -r, --append append files to the end of an archive - self.block_checkcall('tar rfh %s %s' % (of_tar, " ".join(ff))) - # compress the tar file using gzip, and will get a tar.gz file - # overwrite considering dpgen may stop and restart - # -f, --force force overwrite of output file and compress links - self.block_checkcall('gzip -f %s' % of_tar) - # trans - from_f = os.path.join(self.remote_root, of) - to_f = os.path.join(self.local_root, of) - if os.path.isfile(to_f) : - os.remove(to_f) - self.sftp.get(from_f, to_f) - # extract - cwd = os.getcwd() - os.chdir(self.local_root) - with tarfile.open(of, "r:gz") as tar: - def is_within_directory(directory, target): - - abs_directory = os.path.abspath(directory) - abs_target = os.path.abspath(target) - - prefix = os.path.commonprefix([abs_directory, abs_target]) - - return prefix == abs_directory - - def safe_extract(tar, path=".", members=None, *, numeric_owner=False): - - for member in tar.getmembers(): - member_path = os.path.join(path, member.name) - if not is_within_directory(path, member_path): - raise Exception("Attempted Path Traversal in Tar File") - - tar.extractall(path, members, numeric_owner=numeric_owner) - - - safe_extract(tar) - os.chdir(cwd) - # cleanup - os.remove(to_f) - self.sftp.remove(from_f) diff --git a/dpgen/dispatcher/Shell.py b/dpgen/dispatcher/Shell.py deleted file mode 100644 index 35a82018d..000000000 --- a/dpgen/dispatcher/Shell.py +++ /dev/null @@ -1,112 +0,0 @@ -import os,getpass,time -from dpgen.dispatcher.Batch import Batch -from dpgen.dispatcher.JobStatus import JobStatus -import datetime - -def _default_item(resources, key, value) : - if key not in resources : - resources[key] = value - - -class Shell(Batch) : - - def check_status(self) : - if self.check_finish_tag(): - return JobStatus.finished - elif self.check_running(): - return JobStatus.running - else: - return JobStatus.terminated - ## warn: cannont distinguish terminated from unsubmitted. 
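(For orientation while reading these deletions: the Shell/PBS/Slurm/LSF batch classes removed here are covered by dpdispatcher's own backends, which is what the retained make_submission path in dpgen/dispatcher/Dispatcher.py builds on via the Machine, Resources, Task and Submission imports kept in that file. The sketch below is only illustrative: the dictionary values and file names are hypothetical examples, not taken from this patch, and run_submission comes from dpdispatcher's documented 1.0 interface rather than from the code shown here.)

    from dpdispatcher import Machine, Resources, Task, Submission

    # hypothetical machine/resources settings; in dpgen these come from machine.json
    machine = Machine.load_from_dict({
        "batch_type": "Shell",
        "context_type": "LocalContext",
        "local_root": "./",
        "remote_root": "/tmp/dpgen_work",
    })
    resources = Resources.load_from_dict({
        "number_node": 1,
        "cpu_per_node": 4,
        "gpu_per_node": 0,
        "queue_name": "",
        "group_size": 1,
    })
    # one Task per task directory, mirroring the forward/backward file lists
    # that the removed sub_script machinery used to handle
    task = Task(
        command="lmp -i input.lammps",
        task_work_path="task.000.000000/",
        forward_files=["input.lammps"],
        backward_files=["log.lammps"],
        outlog="log",
        errlog="err",
    )
    submission = Submission(
        work_base="./",
        machine=machine,
        resources=resources,
        task_list=[task],
        forward_common_files=[],
        backward_common_files=[],
    )
    # dpdispatcher handles submission, status polling and retrieval,
    # replacing the check_status/do_submit loop of the deleted Batch classes
    submission.run_submission()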
- - def do_submit(self, - job_dirs, - cmd, - args = None, - res = None, - outlog = 'log', - errlog = 'err'): - if res == None: - res = {} - script_str = self.sub_script(job_dirs, cmd, args=args, res=res, outlog=outlog, errlog=errlog) - self.context.write_file(self.sub_script_name, script_str) - self.proc = self.context.call('cd %s && exec bash %s' % (self.context.remote_root, self.sub_script_name)) - - def check_running(self): - uuid_names = self.context.job_uuid - ## Check if the uuid.sub is running on remote machine - cnt = 0 - ret, stdin, stdout, stderr = self.context.block_call("ps aux | grep %s"%uuid_names) - response_list = stdout.read().decode('utf-8').split("\n") - for response in response_list: - if uuid_names + ".sub" in response: - return True - return False - - def default_resources(self, res_) : - if res_ == None : - res = {} - else: - res = res_ - _default_item(res, 'task_per_node', 1) - _default_item(res, 'module_list', []) - _default_item(res, 'module_unload_list', []) - _default_item(res, 'source_list', []) - _default_item(res, 'envs', {}) - _default_item(res, 'with_mpi', False) - _default_item(res, 'cuda_multi_tasks', False) - _default_item(res, 'allow_failure', False) - _default_item(res, 'cvasp', False) - return res - - def sub_script_head(self, resources) : - envs = resources['envs'] - module_list = resources['module_list'] - module_unload_list = resources['module_unload_list'] - task_per_node = resources['task_per_node'] - source_list = resources['source_list'] - - ret = '' - ret += ('#!/bin/bash\n\n') - # fp.write('set -euo pipefail\n') - for key in envs.keys() : - ret += ('export %s=%s\n' % (key, envs[key])) - ret += ('\n') - for ii in module_unload_list : - ret += ('module unload %s\n' % ii) - ret += ('\n') - for ii in module_list : - ret += ('module load %s\n' % ii) - ret += ('\n') - for ii in source_list : - ret += ('source %s\n' % ii) - ret += ('\n') - return ret - - - def sub_script_cmd(self, - cmd, - arg, - res) : - try: - cvasp=res['cvasp'] - fp_max_errors = 3 - try: - fp_max_errors = res['fp_max_errors'] - except Exception: - pass - except Exception: - cvasp=False - - _cmd = cmd.split('1>')[0].strip() - if cvasp : - if res['with_mpi']: - _cmd = 'python cvasp.py "mpirun -n %d %s %s" %s' % (res['task_per_node'], _cmd, arg, fp_max_errors) - else : - _cmd = 'python cvasp.py "%s %s" %s' % (_cmd, arg, fp_max_errors) - else : - if res['with_mpi']: - _cmd = 'mpirun -n %d %s %s' % (res['task_per_node'], _cmd, arg) - else : - _cmd = '%s %s' % (_cmd, arg) - return _cmd diff --git a/dpgen/dispatcher/Slurm.py b/dpgen/dispatcher/Slurm.py deleted file mode 100644 index e1d3550e2..000000000 --- a/dpgen/dispatcher/Slurm.py +++ /dev/null @@ -1,209 +0,0 @@ -import os,getpass,time -from dpgen.dispatcher.Batch import Batch -from dpgen.dispatcher.JobStatus import JobStatus - -def _default_item(resources, key, value) : - if key not in resources : - resources[key] = value - -class Slurm(Batch) : - - def check_status(self) : - """ - check the status of a job - """ - job_id = self._get_job_id() - if job_id == '' : - return JobStatus.unsubmitted - while True: - stat = self._check_status_inner(job_id) - if stat != JobStatus.completing: - return stat - else: - time.sleep(5) - - def do_submit(self, - job_dirs, - cmd, - args = None, - res = None, - outlog = 'log', - errlog = 'err'): - if res == None: - res = self.default_resources(res) - if 'task_max' in res and res['task_max'] > 0: - while self._check_sub_limit(task_max=res['task_max']): - time.sleep(60) - script_str = 
self.sub_script(job_dirs, cmd, args=args, res=res, outlog=outlog, errlog=errlog) - self.context.write_file(self.sub_script_name, script_str) - stdin, stdout, stderr = self.context.block_checkcall('cd %s && %s %s' % (self.context.remote_root, 'sbatch', self.sub_script_name)) - subret = (stdout.readlines()) - job_id = subret[0].split()[-1] - self.context.write_file(self.job_id_name, job_id) - - def default_resources(self, res_) : - """ - set default value if a key in res_ is not fhound - """ - if res_ == None : - res = {} - else: - res = res_ - _default_item(res, 'numb_node', 1) - _default_item(res, 'task_per_node', 1) - _default_item(res, 'cpus_per_task', -1) - _default_item(res, 'numb_gpu', 0) - _default_item(res, 'time_limit', '1:0:0') - _default_item(res, 'mem_limit', -1) - _default_item(res, 'partition', '') - _default_item(res, 'account', '') - _default_item(res, 'qos', '') - _default_item(res, 'constraint_list', []) - _default_item(res, 'license_list', []) - _default_item(res, 'exclude_list', []) - _default_item(res, 'module_unload_list', []) - _default_item(res, 'module_list', []) - _default_item(res, 'source_list', []) - _default_item(res, 'envs', None) - _default_item(res, 'with_mpi', False) - _default_item(res, 'cuda_multi_tasks', False) - _default_item(res, 'allow_failure', False) - _default_item(res, 'cvasp', False) - return res - - def sub_script_head(self, res): - ret = '' - ret += "#!/bin/bash -l\n" - ret += "#SBATCH -N %d\n" % res['numb_node'] - ret += "#SBATCH --ntasks-per-node=%d\n" % res['task_per_node'] - if res['cpus_per_task'] > 0 : - ret += "#SBATCH --cpus-per-task=%d\n" % res['cpus_per_task'] - ret += "#SBATCH -t %s\n" % res['time_limit'] - if res['mem_limit'] > 0 : - ret += "#SBATCH --mem=%dG \n" % res['mem_limit'] - if 'job_name' in res: - if len(res['job_name']) > 0: - ret += '#SBATCH --job-name=%s\n' % res['job_name'] - if len(res['account']) > 0 : - ret += "#SBATCH --account=%s \n" % res['account'] - if len(res['partition']) > 0 : - ret += "#SBATCH --partition=%s \n" % res['partition'] - if len(res['qos']) > 0 : - ret += "#SBATCH --qos=%s \n" % res['qos'] - if res['numb_gpu'] > 0 : - ret += "#SBATCH --gres=gpu:%d\n" % res['numb_gpu'] - for ii in res['constraint_list'] : - ret += '#SBATCH -C %s \n' % ii - for ii in res['license_list'] : - ret += '#SBATCH -L %s \n' % ii - if len(res['exclude_list']) >0: - temp_exclude = "" - for ii in res['exclude_list'] : - temp_exclude += ii - temp_exclude += "," - temp_exclude = temp_exclude[:-1] - ret += '#SBATCH --exclude=%s \n' % temp_exclude - for flag in res.get('custom_flags', []): - ret += '#SBATCH %s \n' % flag - ret += "\n" - for ii in res['module_unload_list'] : - ret += "module unload %s\n" % ii - for ii in res['module_list'] : - ret += "module load %s\n" % ii - ret += "\n" - for ii in res['source_list'] : - ret += "source %s\n" %ii - ret += "\n" - envs = res['envs'] - if envs != None : - for key in envs.keys() : - ret += 'export %s=%s\n' % (key, envs[key]) - ret += '\n' - return ret - - def sub_script_cmd(self, - cmd, - arg, - res) : - try: - cvasp=res['cvasp'] - fp_max_errors = 3 - try: - fp_max_errors = res['fp_max_errors'] - except Exception: - pass - except Exception: - cvasp=False - - _cmd = cmd.split('1>')[0].strip() - if cvasp : - if res['with_mpi']: - _cmd = 'python cvasp.py "srun %s %s" %s' % (_cmd, arg, fp_max_errors) - else : - _cmd = 'python cvasp.py "%s %s" %s' % (_cmd, arg, fp_max_errors) - else : - if res['with_mpi']: - _cmd = 'srun %s %s' % (_cmd, arg) - else : - _cmd = '%s %s' % (_cmd, arg) - return 
_cmd - - def _get_job_id(self) : - if self.context.check_file_exists(self.job_id_name) : - return self.context.read_file(self.job_id_name) - else: - return "" - - def _check_status_inner(self, job_id, retry=0): - ret, stdin, stdout, stderr\ - = self.context.block_call ('squeue -o "%.18i %.2t" -j ' + job_id) - if (ret != 0) : - err_str = stderr.read().decode('utf-8') - if str("Invalid job id specified") in err_str : - if self.check_finish_tag() : - return JobStatus.finished - else : - return JobStatus.terminated - else : - # retry 3 times - if retry < 3: - # rest 60s - time.sleep(60) - return self._check_status_inner(job_id, retry=retry+1) - raise RuntimeError\ - ("status command squeue fails to execute\nerror message:%s\nreturn code %d\n" % (err_str, ret)) - status_line = stdout.read().decode('utf-8').split ('\n')[-2] - status_word = status_line.split ()[-1] - if not (len(status_line.split()) == 2 and status_word.isupper()): - raise RuntimeError("Error in getting job status, " + - f"status_line = {status_line}, " + - f"parsed status_word = {status_word}") - if status_word in ["PD","CF","S"] : - return JobStatus.waiting - elif status_word in ["R"] : - return JobStatus.running - elif status_word in ["CG"] : - return JobStatus.completing - elif status_word in ["C","E","K","BF","CA","CD","F","NF","PR","SE","ST","TO"] : - if self.check_finish_tag() : - return JobStatus.finished - else : - return JobStatus.terminated - else : - return JobStatus.unknown - - - def _check_sub_limit(self, task_max, **kwarg) : - if task_max <= 0: - return True - username = getpass.getuser() - stdin, stdout, stderr = self.context.block_checkcall('squeue -u %s -h' % username) - nj = len(stdout.readlines()) - return nj >= task_max - - def _make_squeue(self,mdata1, res): - ret = '' - ret += 'squeue -u %s ' % mdata1['username'] - ret += '-p %s ' % res['partition'] - ret += '| grep PD' - return ret diff --git a/dpgen/generator/lib/run_calypso.py b/dpgen/generator/lib/run_calypso.py index 4f1512ef7..22f166b8c 100644 --- a/dpgen/generator/lib/run_calypso.py +++ b/dpgen/generator/lib/run_calypso.py @@ -24,7 +24,7 @@ from dpgen.generator.lib.utils import create_path from dpgen.generator.lib.utils import make_iter_name from dpgen.generator.lib.parse_calypso import _parse_calypso_input -from dpgen.dispatcher.Dispatcher import make_dispatcher, make_submission +from dpgen.dispatcher.Dispatcher import make_submission train_name = '00.train' model_devi_name = '01.model_devi' @@ -40,7 +40,7 @@ def gen_structures(iter_index, jdata, mdata, caly_run_path, current_idx, length_ model_devi_group_size = mdata['model_devi_group_size'] model_devi_resources = mdata['model_devi_resources'] - api_version = mdata.get('api_version', '0.9') + api_version = mdata.get('api_version', '1.0') iter_name = make_iter_name(iter_index) @@ -123,19 +123,7 @@ def gen_structures(iter_index, jdata, mdata, caly_run_path, current_idx, length_ run_tasks = [os.path.basename(ii) for ii in run_tasks_] if Version(api_version) < Version('1.0'): - warnings.warn(f"the dpdispatcher will be updated to new version." - f"And the interface may be changed. 
Please check the documents for more details") - dispatcher=make_dispatcher(mdata['model_devi_machine'],mdata['model_devi_resources'],'./', run_tasks, model_devi_group_size) - dispatcher.run_jobs(mdata['model_devi_resources'], - commands, - './', - run_tasks, - model_devi_group_size, - model_names, - forward_files, - backward_files, - outlog = 'model_devi.log', - errlog = 'model_devi.log') + raise RuntimeError("API version %s has been removed. Please upgrade to 1.0." % api_version) elif Version(api_version) >= Version('1.0'): os.chdir(cwd) submission = make_submission( @@ -169,8 +157,6 @@ def gen_structures(iter_index, jdata, mdata, caly_run_path, current_idx, length_ # to traj shutil.copyfile(os.path.join('task.%03d'%(jjj),'traj.traj'),os.path.join('traj','%s.traj'%str(jjj+1)),) - if Version(api_version) < Version('1.0'): - os.rename('jr.json','jr_%s.json'%(str(ii))) tlist = glob.glob('task.*') for t in tlist: @@ -233,19 +219,7 @@ def gen_structures(iter_index, jdata, mdata, caly_run_path, current_idx, length_ run_tasks = [os.path.basename(ii) for ii in run_tasks_] if Version(api_version) < Version('1.0'): - warnings.warn(f"the dpdispatcher will be updated to new version." - f"And the interface may be changed. Please check the documents for more details") - dispatcher=make_dispatcher(mdata['model_devi_machine'],mdata['model_devi_resources'],'./', run_tasks, model_devi_group_size) - dispatcher.run_jobs(mdata['model_devi_resources'], - commands, - './', - run_tasks, - model_devi_group_size, - model_names, - forward_files, - backward_files, - outlog = 'model_devi.log', - errlog = 'model_devi.log') + raise RuntimeError("API version %s has been removed. Please upgrade to 1.0." % api_version) elif Version(api_version) >= Version('1.0'): os.chdir(cwd) submission = make_submission( diff --git a/dpgen/generator/run.py b/dpgen/generator/run.py index 40183e5e4..124147f36 100644 --- a/dpgen/generator/run.py +++ b/dpgen/generator/run.py @@ -61,7 +61,7 @@ from dpgen.generator.lib.cp2k import make_cp2k_input, make_cp2k_input_from_external, make_cp2k_xyz from dpgen.generator.lib.ele_temp import NBandsEsti from dpgen.remote.decide_machine import convert_mdata -from dpgen.dispatcher.Dispatcher import Dispatcher, _split_tasks, make_dispatcher, make_submission +from dpgen.dispatcher.Dispatcher import make_submission from dpgen.util import sepline, expand_sys_str, normalize from dpgen import ROOT_PATH from pymatgen.io.vasp import Incar,Kpoints,Potcar @@ -595,25 +595,13 @@ def run_train (iter_index, except Exception: train_group_size = 1 - api_version = mdata.get('api_version', '0.9') + api_version = mdata.get('api_version', '1.0') user_forward_files = mdata.get("train" + "_user_forward_files", []) forward_files += [os.path.basename(file) for file in user_forward_files] backward_files += mdata.get("train" + "_user_backward_files", []) if Version(api_version) < Version('1.0'): - warnings.warn(f"the dpdispatcher will be updated to new version." - f"And the interface may be changed. Please check the documents for more details") - dispatcher = make_dispatcher(mdata['train_machine'], mdata['train_resources'], work_path, run_tasks, train_group_size) - dispatcher.run_jobs(mdata['train_resources'], - commands, - work_path, - run_tasks, - train_group_size, - trans_comm_data, - forward_files, - backward_files, - outlog = 'train.log', - errlog = 'train.log') + raise RuntimeError("API version %s has been removed. Please upgrade to 1.0." 
% api_version) elif Version(api_version) >= Version('1.0'): submission = make_submission( @@ -1597,23 +1585,11 @@ def run_md_model_devi (iter_index, user_forward_files = mdata.get("model_devi" + "_user_forward_files", []) forward_files += [os.path.basename(file) for file in user_forward_files] backward_files += mdata.get("model_devi" + "_user_backward_files", []) - api_version = mdata.get('api_version', '0.9') + api_version = mdata.get('api_version', '1.0') if(len(run_tasks) == 0): raise RuntimeError("run_tasks for model_devi should not be empty! Please check your files.") if Version(api_version) < Version('1.0'): - warnings.warn(f"the dpdispatcher will be updated to new version." - f"And the interface may be changed. Please check the documents for more details") - dispatcher = make_dispatcher(mdata['model_devi_machine'], mdata['model_devi_resources'], work_path, run_tasks, model_devi_group_size) - dispatcher.run_jobs(mdata['model_devi_resources'], - commands, - work_path, - run_tasks, - model_devi_group_size, - model_names, - forward_files, - backward_files, - outlog = 'model_devi.log', - errlog = 'model_devi.log') + raise RuntimeError("API version %s has been removed. Please upgrade to 1.0." % api_version) elif Version(api_version) >= Version('1.0'): submission = make_submission( @@ -3132,22 +3108,9 @@ def run_fp_inner (iter_index, forward_files += [os.path.basename(file) for file in user_forward_files] backward_files += mdata.get("fp" + "_user_backward_files", []) - api_version = mdata.get('api_version', '0.9') + api_version = mdata.get('api_version', '1.0') if Version(api_version) < Version('1.0'): - warnings.warn(f"the dpdispatcher will be updated to new version." - f"And the interface may be changed. Please check the documents for more details") - dispatcher = make_dispatcher(mdata['fp_machine'], mdata['fp_resources'], work_path, run_tasks, fp_group_size) - dispatcher.run_jobs(mdata['fp_resources'], - [fp_command], - work_path, - run_tasks, - fp_group_size, - forward_common_files, - forward_files, - backward_files, - mark_failure = mark_failure, - outlog = log_file, - errlog = log_file) + raise RuntimeError("API version %s has been removed. Please upgrade to 1.0." 
% api_version) elif Version(api_version) >= Version('1.0'): submission = make_submission( diff --git a/dpgen/remote/RemoteJob.py b/dpgen/remote/RemoteJob.py deleted file mode 100644 index 992fb82f4..000000000 --- a/dpgen/remote/RemoteJob.py +++ /dev/null @@ -1,949 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - -import os, sys, paramiko, json, uuid, tarfile, time, stat, shutil -from glob import glob -from enum import Enum -from dpgen import dlog - - -class JobStatus (Enum) : - unsubmitted = 1 - waiting = 2 - running = 3 - terminated = 4 - finished = 5 - unknown = 100 - -class awsMachineJob(object): - def __init__ (self, - remote_root, - work_path, - job_uuid=None, - ) : - self.remote_root=os.path.join(remote_root,work_path) - self.local_root = os.path.abspath(work_path) - if job_uuid: - self.job_uuid=job_uuid - else: - self.job_uuid = str(uuid.uuid4()) - - dlog.info("local_root is %s"% self.local_root) - dlog.info("remote_root is %s"% self.remote_root) - - def upload(self, - job_dir, - local_up_files, - dereference = True) : - cwd = os.getcwd() - print('cwd=',cwd) - os.chdir(self.local_root) - for ii in local_up_files : - print('self.local_root=',self.local_root,'remote_root=',self.remote_root,'job_dir=',job_dir,'ii=',ii) - if os.path.isfile(os.path.join(job_dir,ii)): - if not os.path.exists(os.path.join(self.remote_root,job_dir)): - os.makedirs(os.path.join(self.remote_root,job_dir)) - shutil.copyfile(os.path.join(job_dir,ii),os.path.join(self.remote_root,job_dir,ii)) - elif os.path.isdir(os.path.join(job_dir,ii)): - shutil.copytree(os.path.join(job_dir,ii),os.path.join(self.remote_root,job_dir,ii)) - else: - print('unknownfile','local_root=',self.local_root,'job_dir=',job_dir,'filename=',ii) - os.chdir(cwd) - def download(self, - job_dir, - remote_down_files, - dereference = True) : - for ii in remote_down_files: - # print('self.local_root=',self.local_root,'remote_root=',self.remote_root,'job_dir=',job_dir,'ii=',ii) - file_succ_copy_flag=False - while not file_succ_copy_flag: - if os.path.isfile(os.path.join(self.remote_root,job_dir,ii)): - shutil.copyfile(os.path.join(self.remote_root,job_dir,ii),os.path.join(self.local_root,job_dir,ii)) - file_succ_copy_flag=True - elif os.path.isdir(os.path.join(self.remote_root,job_dir,ii)): - try: - os.rmdir(os.path.join(self.local_root,job_dir,ii)) - except Exception: - print('dir is not empty '+str(os.path.join(self.local_root,job_dir,ii))) - else: - shutil.copytree(os.path.join(self.remote_root,job_dir,ii),os.path.join(self.local_root,job_dir,ii)) - file_succ_copy_flag=True - else: - print('unknownfile,maybe need for waiting for a while','local_root=',self.local_root,'job_dir=',job_dir,'filename=',ii) - time.sleep(5) - -def _default_item(resources, key, value) : - if key not in resources : - resources[key] = value - -def _set_default_resource(res) : - if res == None : - res = {} - _default_item(res, 'numb_node', 1) - _default_item(res, 'task_per_node', 1) - _default_item(res, 'numb_gpu', 0) - _default_item(res, 'time_limit', '1:0:0') - _default_item(res, 'mem_limit', -1) - _default_item(res, 'partition', '') - _default_item(res, 'account', '') - _default_item(res, 'qos', '') - _default_item(res, 'constraint_list', []) - _default_item(res, 'license_list', []) - _default_item(res, 'exclude_list', []) - _default_item(res, 'module_unload_list', []) - _default_item(res, 'module_list', []) - _default_item(res, 'source_list', []) - _default_item(res, 'envs', None) - _default_item(res, 'with_mpi', False) - - -class SSHSession (object) : - def __init__ 
(self, jdata) : - self.remote_profile = jdata - # with open(remote_profile) as fp : - # self.remote_profile = json.load(fp) - self.remote_host = self.remote_profile['hostname'] - self.remote_port = self.remote_profile['port'] - self.remote_uname = self.remote_profile['username'] - self.remote_password = None - if 'password' in self.remote_profile : - self.remote_password = self.remote_profile['password'] - self.local_key_filename = None - if 'key_filename' in self.remote_profile: - self.local_key_filename = self.remote_profile['key_filename'] - self.remote_timeout = None - if 'timeout' in self.remote_profile: - self.remote_timeout = self.remote_profile['timeout'] - self.local_key_passphrase = None - if 'passphrase' in self.remote_profile: - self.local_key_passphrase = self.remote_profile['passphrase'] - self.remote_workpath = self.remote_profile['work_path'] - self.ssh = self._setup_ssh(hostname=self.remote_host, - port=self.remote_port, - username=self.remote_uname, - password=self.remote_password, - key_filename=self.local_key_filename, - timeout=self.remote_timeout, - passphrase=self.local_key_passphrase) - - def _setup_ssh(self, - hostname, - port=22, - username=None, - password=None, - key_filename=None, - timeout=None, - passphrase=None - ): - ssh_client = paramiko.SSHClient() - ssh_client.load_system_host_keys() - ssh_client.set_missing_host_key_policy(paramiko.WarningPolicy) - ssh_client.connect(hostname, port, username, password, - key_filename, timeout, passphrase) - assert(ssh_client.get_transport().is_active()) - return ssh_client - - def get_ssh_client(self) : - return self.ssh - - def get_session_root(self) : - return self.remote_workpath - - def close(self) : - self.ssh.close() - - -class RemoteJob (object): - def __init__ (self, - ssh_session, - local_root, - job_uuid=None, - ) : - self.local_root = os.path.abspath(local_root) - if job_uuid: - self.job_uuid=job_uuid - else: - self.job_uuid = str(uuid.uuid4()) - - self.remote_root = os.path.join(ssh_session.get_session_root(), self.job_uuid) - dlog.info("local_root is %s"% local_root) - dlog.info("remote_root is %s"% self.remote_root) - self.ssh = ssh_session.get_ssh_client() - # keep ssh alive - transport = self.ssh.get_transport() - transport.set_keepalive(60) - try: - sftp = self.ssh.open_sftp() - sftp.mkdir(self.remote_root) - sftp.close() - except Exception: - pass - # open('job_uuid', 'w').write(self.job_uuid) - - def get_job_root(self) : - return self.remote_root - - def upload(self, - job_dirs, - local_up_files, - dereference = True) : - cwd = os.getcwd() - os.chdir(self.local_root) - file_list = [] - for ii in job_dirs : - for jj in local_up_files : - file_list.append(os.path.join(ii,jj)) - self._put_files(file_list, dereference = dereference) - os.chdir(cwd) - - def download(self, - job_dirs, - remote_down_files, - back_error=False) : - cwd = os.getcwd() - os.chdir(self.local_root) - file_list = [] - for ii in job_dirs : - for jj in remote_down_files : - file_list.append(os.path.join(ii,jj)) - if back_error: - errors=glob(os.path.join(ii,'error*')) - file_list.extend(errors) - self._get_files(file_list) - os.chdir(cwd) - - def block_checkcall(self, - cmd) : - stdin, stdout, stderr = self.ssh.exec_command(('cd %s ;' % self.remote_root) + cmd) - exit_status = stdout.channel.recv_exit_status() - if exit_status != 0: - dlog.info("Error info: %s "%(stderr.readlines()[0])) - raise RuntimeError("Get error code %d in calling %s through ssh with job: %s "% (exit_status, cmd, self.job_uuid)) - return stdin, stdout, stderr - 
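With SSHSession, RemoteJob and the scheduler-specific subclasses deleted, the remote-execution plumbing they implemented by hand over paramiko is delegated to dpdispatcher, which make_submission wraps. A minimal sketch of the equivalent flow, assuming the dpdispatcher >= 0.4 public API; every dictionary value, path and command below is an illustrative placeholder, not something taken from this patch:

    # Sketch only: one batch of tasks submitted through dpdispatcher instead of RemoteJob.
    from dpdispatcher import Machine, Resources, Submission, Task

    # Placeholder machine/resources dictionaries; field names follow dpdispatcher's schema.
    machine_dict = {
        "batch_type": "Slurm",
        "context_type": "SSHContext",
        "local_root": "./",
        "remote_root": "/scratch/dpgen/work",
        "remote_profile": {"hostname": "login.example.com", "username": "user"},
    }
    resources_dict = {
        "number_node": 1,
        "cpu_per_node": 4,
        "gpu_per_node": 1,
        "queue_name": "gpu",
        "group_size": 1,
    }
    work_path = "iter.000000/00.train"      # placeholder work base
    run_tasks = ["000", "001"]              # placeholder task dirs under work_path

    machine = Machine.load_from_dict(machine_dict)
    resources = Resources.load_from_dict(resources_dict)
    tasks = [
        Task(
            command="dp train input.json",  # placeholder command
            task_work_path=task_dir,
            forward_files=["input.json"],
            backward_files=["lcurve.out", "train.log"],
            outlog="train.log",
            errlog="train.log",
        )
        for task_dir in run_tasks
    ]
    submission = Submission(
        work_base=work_path,
        machine=machine,
        resources=resources,
        task_list=tasks,
        forward_common_files=[],
        backward_common_files=[],
    )
    submission.run_submission()  # blocks until every task has finished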
- def block_call(self, - cmd) : - stdin, stdout, stderr = self.ssh.exec_command(('cd %s ;' % self.remote_root) + cmd) - exit_status = stdout.channel.recv_exit_status() - return exit_status, stdin, stdout, stderr - - def clean(self) : - sftp = self.ssh.open_sftp() - self._rmtree(sftp, self.remote_root) - sftp.close() - - def _rmtree(self, sftp, remotepath, level=0, verbose = False): - for f in sftp.listdir_attr(remotepath): - rpath = os.path.join(remotepath, f.filename) - if stat.S_ISDIR(f.st_mode): - self._rmtree(sftp, rpath, level=(level + 1)) - else: - rpath = os.path.join(remotepath, f.filename) - if verbose: dlog.info('removing %s%s' % (' ' * level, rpath)) - sftp.remove(rpath) - if verbose: dlog.info('removing %s%s' % (' ' * level, remotepath)) - sftp.rmdir(remotepath) - - def _put_files(self, - files, - dereference = True) : - of = self.job_uuid + '.tgz' - # local tar - cwd = os.getcwd() - os.chdir(self.local_root) - if os.path.isfile(of) : - os.remove(of) - with tarfile.open(of, "w:gz", dereference = dereference) as tar: - for ii in files : - tar.add(ii) - os.chdir(cwd) - # trans - from_f = os.path.join(self.local_root, of) - to_f = os.path.join(self.remote_root, of) - sftp = self.ssh.open_sftp() - sftp.put(from_f, to_f) - # remote extract - self.block_checkcall('tar xf %s' % of) - # clean up - os.remove(from_f) - sftp.remove(to_f) - sftp.close() - - def _get_files(self, - files) : - of = self.job_uuid + '.tgz' - flist = "" - for ii in files : - flist += " " + ii - # remote tar - self.block_checkcall('tar czf %s %s' % (of, flist)) - # trans - from_f = os.path.join(self.remote_root, of) - to_f = os.path.join(self.local_root, of) - if os.path.isfile(to_f) : - os.remove(to_f) - sftp = self.ssh.open_sftp() - sftp.get(from_f, to_f) - # extract - cwd = os.getcwd() - os.chdir(self.local_root) - with tarfile.open(of, "r:gz") as tar: - def is_within_directory(directory, target): - - abs_directory = os.path.abspath(directory) - abs_target = os.path.abspath(target) - - prefix = os.path.commonprefix([abs_directory, abs_target]) - - return prefix == abs_directory - - def safe_extract(tar, path=".", members=None, *, numeric_owner=False): - - for member in tar.getmembers(): - member_path = os.path.join(path, member.name) - if not is_within_directory(path, member_path): - raise Exception("Attempted Path Traversal in Tar File") - - tar.extractall(path, members, numeric_owner=numeric_owner) - - - safe_extract(tar) - os.chdir(cwd) - # cleanup - os.remove(to_f) - sftp.remove(from_f) - -class CloudMachineJob (RemoteJob) : - def submit(self, - job_dirs, - cmd, - args = None, - resources = None) : - - #dlog.info("Current path is",os.getcwd()) - - #for ii in job_dirs : - # if not os.path.isdir(ii) : - # raise RuntimeError("cannot find dir %s" % ii) - # dlog.info(self.remote_root) - script_name = self._make_script(job_dirs, cmd, args, resources) - self.stdin, self.stdout, self.stderr = self.ssh.exec_command(('cd %s; bash %s' % (self.remote_root, script_name))) - # dlog.info(self.stderr.read().decode('utf-8')) - # dlog.info(self.stdout.read().decode('utf-8')) - - def check_status(self) : - if not self._check_finish(self.stdout) : - return JobStatus.running - elif self._get_exit_status(self.stdout) == 0 : - return JobStatus.finished - else : - return JobStatus.terminated - - def _check_finish(self, stdout) : - return stdout.channel.exit_status_ready() - - def _get_exit_status(self, stdout) : - return stdout.channel.recv_exit_status() - - def _make_script(self, - job_dirs, - cmd, - args = None, - resources = 
None) : - _set_default_resource(resources) - envs = resources['envs'] - module_list = resources['module_list'] - module_unload_list = resources['module_unload_list'] - task_per_node = resources['task_per_node'] - - script_name = 'run.sh' - if args == None : - args = [] - for ii in job_dirs: - args.append('') - script = os.path.join(self.remote_root, script_name) - sftp = self.ssh.open_sftp() - with sftp.open(script, 'w') as fp : - fp.write('#!/bin/bash\n\n') - # fp.write('set -euo pipefail\n') - if envs != None : - for key in envs.keys() : - fp.write('export %s=%s\n' % (key, envs[key])) - fp.write('\n') - if module_unload_list is not None : - for ii in module_unload_list : - fp.write('module unload %s\n' % ii) - fp.write('\n') - if module_list is not None : - for ii in module_list : - fp.write('module load %s\n' % ii) - fp.write('\n') - for ii,jj in zip(job_dirs, args) : - fp.write('cd %s\n' % ii) - fp.write('test $? -ne 0 && exit\n') - if resources['with_mpi'] == True : - fp.write('mpirun -n %d %s %s\n' - % (task_per_node, cmd, jj)) - else : - fp.write('%s %s\n' % (cmd, jj)) - if 'allow_failure' not in resources or resources['allow_failure'] is False: - fp.write('test $? -ne 0 && exit\n') - fp.write('cd %s\n' % self.remote_root) - fp.write('test $? -ne 0 && exit\n') - fp.write('\ntouch tag_finished\n') - sftp.close() - return script_name - - -class SlurmJob (RemoteJob) : - def submit(self, - job_dirs, - cmd, - args = None, - resources = None, - restart=False) : - - def _submit(): - script_name = self._make_script(job_dirs, cmd, args, res = resources) - stdin, stdout, stderr = self.block_checkcall(('cd %s; sbatch %s' % (self.remote_root, script_name))) - subret = (stdout.readlines()) - job_id = subret[0].split()[-1] - sftp = self.ssh.open_sftp() - - with sftp.open(os.path.join(self.remote_root, 'job_id'), 'w') as fp: - fp.write(job_id) - sftp.close() - - dlog.debug(restart) - if restart: - try: - status = self.check_status() - dlog.debug(status) - if status in [ JobStatus.unsubmitted, JobStatus.unknown, JobStatus.terminated ]: - dlog.debug('task restart point !!!') - _submit() - elif status==JobStatus.waiting: - dlog.debug('task is waiting') - elif status==JobStatus.running: - dlog.debug('task is running') - else: - dlog.debug('task is finished') - - except Exception: - dlog.debug('no job_id file') - dlog.debug('task restart point !!!') - _submit() - else: - dlog.debug('new task!!!') - _submit() - - def check_status(self) : - job_id = self._get_job_id() - if job_id == "" : - raise RuntimeError("job %s has not been submitted" % self.remote_root) - ret, stdin, stdout, stderr\ - = self.block_call ("squeue --job " + job_id) - err_str = stderr.read().decode('utf-8') - if (ret != 0) : - if str("Invalid job id specified") in err_str : - if self._check_finish_tag() : - return JobStatus.finished - else : - return JobStatus.terminated - else : - raise RuntimeError\ - ("status command squeue fails to execute\nerror message:%s\nreturn code %d\n" % (err_str, ret)) - status_line = stdout.read().decode('utf-8').split ('\n')[-2] - status_word = status_line.split ()[-4] - if status_word in ["PD","CF","S"] : - return JobStatus.waiting - elif status_word in ["R","CG"] : - return JobStatus.running - elif status_word in ["C","E","K","BF","CA","CD","F","NF","PR","SE","ST","TO"] : - if self._check_finish_tag() : - return JobStatus.finished - else : - return JobStatus.terminated - else : - return JobStatus.unknown - - def _get_job_id(self) : - sftp = self.ssh.open_sftp() - with 
sftp.open(os.path.join(self.remote_root, 'job_id'), 'r') as fp: - ret = fp.read().decode('utf-8') - sftp.close() - return ret - - def _check_finish_tag(self) : - sftp = self.ssh.open_sftp() - try: - sftp.stat(os.path.join(self.remote_root, 'tag_finished')) - ret = True - except IOError: - ret = False - sftp.close() - return ret - - def _make_squeue(self,mdata1, res): - ret = '' - ret += 'squeue -u %s ' % mdata1['username'] - ret += '-p %s ' % res['partition'] - ret += '| grep PD' - return ret - - def _make_script(self, - job_dirs, - cmd, - args = None, - res = None) : - _set_default_resource(res) - ret = '' - ret += "#!/bin/bash -l\n" - ret += "#SBATCH -N %d\n" % res['numb_node'] - ret += "#SBATCH --ntasks-per-node %d\n" % res['task_per_node'] - ret += "#SBATCH -t %s\n" % res['time_limit'] - if res['mem_limit'] > 0 : - ret += "#SBATCH --mem %dG \n" % res['mem_limit'] - if len(res['account']) > 0 : - ret += "#SBATCH --account %s \n" % res['account'] - if len(res['partition']) > 0 : - ret += "#SBATCH --partition %s \n" % res['partition'] - if len(res['qos']) > 0 : - ret += "#SBATCH --qos %s \n" % res['qos'] - if res['numb_gpu'] > 0 : - ret += "#SBATCH --gres=gpu:%d\n" % res['numb_gpu'] - for ii in res['constraint_list'] : - ret += '#SBATCH -C %s \n' % ii - for ii in res['license_list'] : - ret += '#SBATCH -L %s \n' % ii - if len(res['exclude_list']) >0: - temp_exclude = "" - for ii in res['exclude_list'] : - temp_exclude += ii - temp_exclude += "," - temp_exclude = temp_exclude[:-1] - ret += '#SBATCH --exclude %s \n' % temp_exclude - ret += "\n" - # ret += 'set -euo pipefail\n\n' - for ii in res['module_unload_list'] : - ret += "module unload %s\n" % ii - for ii in res['module_list'] : - ret += "module load %s\n" % ii - ret += "\n" - for ii in res['source_list'] : - ret += "source %s\n" %ii - ret += "\n" - envs = res['envs'] - if envs != None : - for key in envs.keys() : - ret += 'export %s=%s\n' % (key, envs[key]) - ret += '\n' - - if args == None : - args = [] - for ii in job_dirs: - args.append('') - - try: - cvasp=res['cvasp'] - try: - fp_max_errors = res['fp_max_errors'] - except Exception: - fp_max_errors = 3 - except Exception: - cvasp=False - - for ii,jj in zip(job_dirs, args) : - ret += 'cd %s\n' % ii - ret += 'test $? -ne 0 && exit\n\n' - - if cvasp: - cmd=cmd.split('1>')[0].strip() - if res['with_mpi'] : - ret += 'if [ -f tag_finished ] ;then\n' - ret += ' echo gogogo \n' - ret += 'else\n' - ret += ' python ../cvasp.py "srun %s" %s %s 1>log 2>log\n' % (cmd, fp_max_errors, jj) - ret += ' if test $? -ne 0 \n' - ret += ' then\n' - ret += ' exit\n' - ret += ' else\n' - ret += ' touch tag_finished\n' - ret += ' fi\n' - ret += 'fi\n\n' - else : - ret += 'if [ -f tag_finished ] ;then\n' - ret += ' echo gogogo \n' - ret += 'else\n' - ret += ' python ../cvasp.py "%s" %s %s 1>log 2>log\n' % (cmd, fp_max_errors, jj) - ret += ' if test $? -ne 0 \n' - ret += ' then\n' - ret += ' exit\n' - ret += ' else\n' - ret += ' touch tag_finished\n' - ret += ' fi\n' - ret += 'fi\n\n' - else: - if res['with_mpi'] : - ret += 'if [ -f tag_finished ] ;then\n' - ret += ' echo gogogo \n' - ret += 'else\n' - ret += ' srun %s %s\n' % (cmd, jj) - ret += ' if test $? -ne 0 \n' - ret += ' then\n' - ret += ' exit\n' - ret += ' else\n' - ret += ' touch tag_finished\n' - ret += ' fi\n' - ret += 'fi\n\n' - else : - ret += 'if [ -f tag_finished ] ;then\n' - ret += ' echo gogogo \n' - ret += 'else\n' - ret += ' %s %s\n' % (cmd, jj) - ret += ' if test $? 
-ne 0 \n' - ret += ' then\n' - ret += ' exit\n' - ret += ' else\n' - ret += ' touch tag_finished\n' - ret += ' fi\n' - ret += 'fi\n\n' - if 'allow_failure' not in res or res['allow_failure'] is False: - ret += 'test $? -ne 0 && exit\n' - ret += 'cd %s\n' % self.remote_root - ret += 'test $? -ne 0 && exit\n' - ret += '\ntouch tag_finished\n' - - script_name = 'run.sub' - script = os.path.join(self.remote_root, script_name) - sftp = self.ssh.open_sftp() - with sftp.open(script, 'w') as fp : - fp.write(ret) - sftp.close() - - return script_name - - -class PBSJob (RemoteJob) : - def submit(self, - job_dirs, - cmd, - args = None, - resources = None) : - script_name = self._make_script(job_dirs, cmd, args, res = resources) - stdin, stdout, stderr = self.block_checkcall(('cd %s; qsub %s' % (self.remote_root, script_name))) - subret = (stdout.readlines()) - job_id = subret[0].split()[0] - sftp = self.ssh.open_sftp() - with sftp.open(os.path.join(self.remote_root, 'job_id'), 'w') as fp: - fp.write(job_id) - sftp.close() - - def check_status(self) : - job_id = self._get_job_id() - if job_id == "" : - raise RuntimeError("job %s is has not been submitted" % self.remote_root) - ret, stdin, stdout, stderr\ - = self.block_call ("qstat " + job_id) - err_str = stderr.read().decode('utf-8') - if (ret != 0) : - if str("qstat: Unknown Job Id") in err_str : - if self._check_finish_tag() : - return JobStatus.finished - else : - return JobStatus.terminated - else : - raise RuntimeError ("status command qstat fails to execute. erro info: %s return code %d" - % (err_str, ret)) - status_line = stdout.read().decode('utf-8').split ('\n')[-2] - status_word = status_line.split ()[-2] - # dlog.info (status_word) - if status_word in ["Q","H"] : - return JobStatus.waiting - elif status_word in ["R"] : - return JobStatus.running - elif status_word in ["C","E","K"] : - if self._check_finish_tag() : - return JobStatus.finished - else : - return JobStatus.terminated - else : - return JobStatus.unknown - - def _get_job_id(self) : - sftp = self.ssh.open_sftp() - with sftp.open(os.path.join(self.remote_root, 'job_id'), 'r') as fp: - ret = fp.read().decode('utf-8') - sftp.close() - return ret - - def _check_finish_tag(self) : - sftp = self.ssh.open_sftp() - try: - sftp.stat(os.path.join(self.remote_root, 'tag_finished')) - ret = True - except IOError: - ret = False - sftp.close() - return ret - - def _make_script(self, - job_dirs, - cmd, - args = None, - res = None) : - _set_default_resource(res) - ret = '' - ret += "#!/bin/bash -l\n" - if res['numb_gpu'] == 0: - ret += '#PBS -l nodes=%d:ppn=%d\n' % (res['numb_node'], res['task_per_node']) - else : - ret += '#PBS -l nodes=%d:ppn=%d:gpus=%d\n' % (res['numb_node'], res['task_per_node'], res['numb_gpu']) - ret += '#PBS -l walltime=%s\n' % (res['time_limit']) - if res['mem_limit'] > 0 : - ret += "#PBS -l mem=%dG \n" % res['mem_limit'] - ret += '#PBS -j oe\n' - if len(res['partition']) > 0 : - ret += '#PBS -q %s\n' % res['partition'] - ret += "\n" - for ii in res['module_unload_list'] : - ret += "module unload %s\n" % ii - for ii in res['module_list'] : - ret += "module load %s\n" % ii - ret += "\n" - for ii in res['source_list'] : - ret += "source %s\n" %ii - ret += "\n" - envs = res['envs'] - if envs != None : - for key in envs.keys() : - ret += 'export %s=%s\n' % (key, envs[key]) - ret += '\n' - ret += 'cd $PBS_O_WORKDIR\n\n' - - if args == None : - args = [] - for ii in job_dirs: - args.append('') - for ii,jj in zip(job_dirs, args) : - ret += 'cd %s\n' % ii - ret += 'test $? 
-ne 0 && exit\n' - if res['with_mpi'] : - ret += 'mpirun -machinefile $PBS_NODEFILE -n %d %s %s\n' % (res['numb_node'] * res['task_per_node'], cmd, jj) - else : - ret += '%s %s\n' % (cmd, jj) - if 'allow_failure' not in res or res['allow_failure'] is False: - ret += 'test $? -ne 0 && exit\n' - ret += 'cd %s\n' % self.remote_root - ret += 'test $? -ne 0 && exit\n' - ret += '\ntouch tag_finished\n' - - script_name = 'run.sub' - script = os.path.join(self.remote_root, script_name) - sftp = self.ssh.open_sftp() - with sftp.open(script, 'w') as fp : - fp.write(ret) - sftp.close() - - return script_name - - -# ssh_session = SSHSession('localhost.json') -# rjob = CloudMachineJob(ssh_session, '.') -# # can upload dirs and normal files -# rjob.upload(['job0', 'job1'], ['batch_exec.py', 'test']) -# rjob.submit(['job0', 'job1'], 'touch a; sleep 2') -# while rjob.check_status() == JobStatus.running : -# dlog.info('checked') -# time.sleep(2) -# dlog.info(rjob.check_status()) -# # can download dirs and normal files -# rjob.download(['job0', 'job1'], ['a']) -# # rjob.clean() - - -class LSFJob (RemoteJob) : - def submit(self, - job_dirs, - cmd, - args = None, - resources = None, - restart = False): - dlog.debug(restart) - if restart: - status = self.check_status() - if status in [ JobStatus.unsubmitted, JobStatus.unknown, JobStatus.terminated ]: - dlog.debug('task restart point !!!') - if 'task_max' in resources and resources['task_max'] > 0: - while self.check_limit(task_max=resources['task_max']): - time.sleep(60) - self._submit(job_dirs, cmd, args, resources) - elif status==JobStatus.waiting: - dlog.debug('task is waiting') - elif status==JobStatus.running: - dlog.debug('task is running') - else: - dlog.debug('task is finished') - #except Exception: - #dlog.debug('no job_id file') - #dlog.debug('task restart point !!!') - #self._submit(job_dirs, cmd, args, resources) - else: - dlog.debug('new task!!!') - if 'task_max' in resources and resources['task_max'] > 0: - while self.check_limit(task_max=resources['task_max']): - time.sleep(60) - self._submit(job_dirs, cmd, args, resources) - if resources.get('wait_time', False): - time.sleep(resources['wait_time']) # For preventing the crash of the tasks while submitting. 
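The restart and task_max throttling logic above is likewise retired; every caller touched in this patch now resolves api_version the same way before handing work to dpdispatcher, defaulting to '1.0' and refusing anything older instead of falling back to the deleted classes. A standalone sketch of that guard, assuming a machine.json sits in the working directory:

    import json
    from packaging.version import Version

    with open("machine.json") as f:
        mdata = json.load(f)

    # '1.0' is now the default; anything below 1.0 is rejected outright.
    api_version = mdata.get("api_version", "1.0")
    if Version(api_version) < Version("1.0"):
        raise RuntimeError(
            "API version %s has been removed. Please upgrade to 1.0." % api_version
        )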
- - def _submit(self, - job_dirs, - cmd, - args = None, - resources = None) : - script_name = self._make_script(job_dirs, cmd, args, res = resources) - stdin, stdout, stderr = self.block_checkcall(('cd %s; bsub < %s' % (self.remote_root, script_name))) - subret = (stdout.readlines()) - job_id = subret[0].split()[1][1:-1] - sftp = self.ssh.open_sftp() - with sftp.open(os.path.join(self.remote_root, 'job_id'), 'w') as fp: - fp.write(job_id) - sftp.close() - - def check_limit(self, task_max): - stdin_run, stdout_run, stderr_run = self.block_checkcall("bjobs | grep RUN | wc -l") - njobs_run = int(stdout_run.read().decode('utf-8').split ('\n')[0]) - stdin_pend, stdout_pend, stderr_pend = self.block_checkcall("bjobs | grep PEND | wc -l") - njobs_pend = int(stdout_pend.read().decode('utf-8').split ('\n')[0]) - if (njobs_pend + njobs_run) < task_max: - return False - else: - return True - - def check_status(self) : - try: - job_id = self._get_job_id() - except Exception: - return JobStatus.terminated - if job_id == "" : - raise RuntimeError("job %s is has not been submitted" % self.remote_root) - ret, stdin, stdout, stderr\ - = self.block_call ("bjobs " + job_id) - err_str = stderr.read().decode('utf-8') - if ("Job <%s> is not found" % job_id) in err_str : - if self._check_finish_tag() : - return JobStatus.finished - else : - return JobStatus.terminated - elif ret != 0 : - raise RuntimeError ("status command bjobs fails to execute. erro info: %s return code %d" - % (err_str, ret)) - status_out = stdout.read().decode('utf-8').split('\n') - if len(status_out) < 2: - return JobStatus.unknown - else: - status_line = status_out[1] - status_word = status_line.split()[2] - - # ref: https://www.ibm.com/support/knowledgecenter/en/SSETD4_9.1.2/lsf_command_ref/bjobs.1.html - if status_word in ["PEND", "WAIT", "PSUSP"] : - return JobStatus.waiting - elif status_word in ["RUN", "USUSP"] : - return JobStatus.running - elif status_word in ["DONE","EXIT"] : - if self._check_finish_tag() : - return JobStatus.finished - else : - return JobStatus.terminated - else : - return JobStatus.unknown - - def _get_job_id(self) : - sftp = self.ssh.open_sftp() - with sftp.open(os.path.join(self.remote_root, 'job_id'), 'r') as fp: - ret = fp.read().decode('utf-8') - sftp.close() - return ret - - def _check_finish_tag(self) : - sftp = self.ssh.open_sftp() - try: - sftp.stat(os.path.join(self.remote_root, 'tag_finished')) - ret = True - except IOError: - ret = False - sftp.close() - return ret - - def _make_script(self, - job_dirs, - cmd, - args = None, - res = None) : - _set_default_resource(res) - ret = '' - ret += "#!/bin/bash -l\n#BSUB -e %J.err\n#BSUB -o %J.out\n" - if res['numb_gpu'] == 0: - ret += '#BSUB -n %d\n#BSUB -R span[ptile=%d]\n' % ( - res['numb_node'] * res['task_per_node'], res['node_cpu']) - else: - if res['node_cpu']: - ret += '#BSUB -R span[ptile=%d]\n' % res['node_cpu'] - if 'new_lsf_gpu' in res and res['new_lsf_gpu'] == True: - # supportted in LSF >= 10.1.0 SP6 - # ref: https://www.ibm.com/support/knowledgecenter/en/SSWRJV_10.1.0/lsf_resource_sharing/use_gpu_res_reqs.html - ret += '#BSUB -n %d\n#BSUB -gpu "num=%d:mode=shared:j_exclusive=yes"\n' % ( - res['numb_gpu'], res['task_per_node']) - else: - ret += '#BSUB -n %d\n#BSUB -R "select[ngpus >0] rusage[ngpus_excl_p=%d]"\n' % ( - res['numb_gpu'], res['task_per_node']) - if res['time_limit']: - ret += '#BSUB -W %s\n' % (res['time_limit'].split(':')[ - 0] + ':' + res['time_limit'].split(':')[1]) - if res['mem_limit'] > 0 : - ret += "#BSUB -M %d \n" % 
(res['mem_limit']) - ret += '#BSUB -J %s\n' % (res['job_name'] if 'job_name' in res else 'dpgen') - if len(res['partition']) > 0 : - ret += '#BSUB -q %s\n' % res['partition'] - ret += "\n" - for ii in res['module_unload_list'] : - ret += "module unload %s\n" % ii - for ii in res['module_list'] : - ret += "module load %s\n" % ii - ret += "\n" - for ii in res['source_list'] : - ret += "source %s\n" %ii - ret += "\n" - envs = res['envs'] - if envs != None : - for key in envs.keys() : - ret += 'export %s=%s\n' % (key, envs[key]) - ret += '\n' - - if args == None : - args = [] - for ii in job_dirs: - args.append('') - for ii,jj in zip(job_dirs, args) : - ret += 'cd %s\n' % ii - ret += 'test $? -ne 0 && exit\n' - if res['with_mpi']: - ret += 'mpirun -machinefile $LSB_DJOB_HOSTFILE -n %d %s %s\n' % ( - res['numb_node'] * res['task_per_node'], cmd, jj) - else : - ret += '%s %s\n' % (cmd, jj) - if 'allow_failure' not in res or res['allow_failure'] is False: - ret += 'test $? -ne 0 && exit\n' - ret += 'cd %s\n' % self.remote_root - ret += 'test $? -ne 0 && exit\n' - ret += '\ntouch tag_finished\n' - - script_name = 'run.sub' - script = os.path.join(self.remote_root, script_name) - sftp = self.ssh.open_sftp() - with sftp.open(script, 'w') as fp : - fp.write(ret) - sftp.close() - - return script_name diff --git a/dpgen/remote/decide_machine.py b/dpgen/remote/decide_machine.py index 34486dba3..dc400d455 100644 --- a/dpgen/remote/decide_machine.py +++ b/dpgen/remote/decide_machine.py @@ -1,16 +1,6 @@ #!/usr/bin/env python # coding: utf-8 -from dpgen.dispatcher.SSHContext import SSHSession -from dpgen.dispatcher.SSHContext import SSHContext -from dpgen.dispatcher.Slurm import Slurm -from dpgen.dispatcher.LSF import LSF -from dpgen import dlog -import os -import json -import numpy as np -from packaging.version import Version - def convert_mdata(mdata, task_types=["train", "model_devi", "fp"]): ''' @@ -50,281 +40,3 @@ def convert_mdata(mdata, task_types=["train", "model_devi", "fp"]): group_size = task_data.get("group_size", 1) mdata[task_type + "_" + "group_size"] = group_size return mdata - - - -# def decide_train_machine(mdata): -# if Version(mdata.get('api_version', '0.9')) >= Version('1.0'): -# mdata['train_group_size'] = mdata['train'][0]['resources']['group_size'] -# if 'train' in mdata: -# continue_flag = False -# if 'record.machine' in os.listdir(): -# try: -# with open('record.machine', 'r') as _infile: -# profile = json.load(_infile) -# if profile['purpose'] == 'train': -# mdata['train_machine'] = profile['machine'] -# mdata['train_resources'] = profile['resources'] -# -# if 'python_path' in profile: -# mdata['python_path'] = profile['python_path'] -# if "group_size" in profile: -# mdata["train_group_size"] = profile["group_size"] -# if 'deepmd_version' in profile: -# mdata["deepmd_version"] = profile['deepmd_version'] -# if 'command' in profile: -# mdata['train_command'] = profile["command"] -# continue_flag = True -# except Exception: -# pass -# if ("hostname" not in mdata["train"][0]["machine"]) or (len(mdata["train"]) == 1): -# mdata["train_machine"] = mdata["train"][0]["machine"] -# mdata["train_resources"] = mdata["train"][0]["resources"] -# -# if 'python_path' in mdata["train"][0]: -# mdata["python_path"] = mdata["train"][0]["python_path"] -# if "group_size" in mdata["train"][0]: -# mdata["train_group_size"] = mdata["train"][0]["group_size"] -# if 'deepmd_version' in mdata["train"][0]: -# mdata["deepmd_version"] = mdata["train"][0]["deepmd_version"] -# if 'command' in mdata["train"][0]: 
-# mdata["train_command"] = mdata["train"][0]["command"] -# continue_flag = True -# -# pd_flag = False -# pd_count_list =[] -# # pd for pending job in slurm -# # if we need to launch new machine_idxines -# if not continue_flag: -# -# #assert isinstance(mdata['train']['machine'], list) -# #assert isinstance(mdata['train']['resources'], list) -# #assert len(mdata['train']['machine']) == len(mdata['train']['resources']) -# # mdata['train'] is a list -# for machine_idx in range(len(mdata['train'])): -# temp_machine = mdata['train'][machine_idx]['machine'] -# temp_resources = mdata['train'][machine_idx]['resources'] -# temp_ssh_sess = SSHSession(temp_machine) -# cwd = os.getcwd() -# temp_context = SSHContext(cwd, temp_ssh_sess) -# if temp_machine['machine_type'] == 'lsf': -# temp_batch = LSF(temp_context) -# else: -# temp_batch = Slurm(temp_context) -# # For other type of machines, please add them using 'elif'. -# # Here slurm is selected as the final choice in convinience. -# command = temp_batch._make_squeue(temp_machine, temp_resources) -# ret, stdin, stdout, stderr = temp_batch.context.block_call(command) -# pd_response = stdout.read().decode('utf-8').split("\n") -# pd_count = len(pd_response) -# temp_context.clean() -# ## If there is no need to waiting for allocation -# if pd_count ==1: -# mdata['train_machine'] = temp_machine -# mdata['train_resources'] = temp_resources -# -# if 'python_path' in mdata['train'][machine_idx]: -# mdata['python_path'] = mdata['train'][machine_idx]['python_path'] -# if 'group_size' in mdata['train'][machine_idx]: -# mdata['train_group_size'] = mdata['train'][machine_idx]['group_size'] -# if 'deepmd_version' in mdata['train'][machine_idx]: -# mdata['deepmd_version'] = mdata['train'][machine_idx]['deepmd_version'] -# if 'command' in mdata['train'][machine_idx]: -# mdata['train_command'] = mdata['train'][machine_idx]['command'] -# -# ## No need to wait -# pd_flag = True -# break -# else: -# pd_count_list.append(pd_count) -# if not pd_flag: -# ## All machines need waiting, then compare waiting jobs -# ## Select a machine which has fewest waiting jobs -# min_machine_idx = np.argsort(pd_count_list)[0] -# mdata['train_machine'] = mdata['train'][min_machine_idx]['machine'] -# mdata['train_resources'] = mdata['train'][min_machine_idx]['resources'] -# -# if 'python_path' in mdata['train'][min_machine_idx]: -# mdata['python_path'] = mdata['train'][min_machine_idx]['python_path'] -# if "group_size" in mdata['train'][min_machine_idx]: -# mdata["train_group_size"] = mdata['train'][min_machine_idx]["group_size"] -# if 'deepmd_version' in mdata['train'][min_machine_idx]: -# mdata['deepmd_version'] = mdata['train'][min_machine_idx]["deepmd_version"] -# if 'command' in mdata['train'][min_machine_idx]: -# mdata['train_command'] = mdata['train'][min_machine_idx]['command'] -# -# ## Record which machine is selected -# with open("record.machine","w") as _outfile: -# profile = {} -# profile['purpose'] = 'train' -# profile['machine'] = mdata['train_machine'] -# profile['resources'] = mdata['train_resources'] -# -# if 'python_path' in mdata: -# profile['python_path'] = mdata['python_path'] -# if "train_group_size" in mdata: -# profile["group_size"] = mdata["train_group_size"] -# if 'deepmd_version' in mdata: -# profile['deepmd_version'] = mdata['deepmd_version'] -# if 'train_command' in mdata: -# profile['command'] = mdata['train_command'] -# -# json.dump(profile, _outfile, indent = 4) -# return mdata -# -# def decide_model_devi_machine(mdata): -# if Version(mdata.get('api_version', 
'0.9')) >= Version('1.0'): -# mdata['model_devi_group_size'] = mdata['model_devi'][0]['resources']['group_size'] -# if 'model_devi' in mdata: -# continue_flag = False -# if 'record.machine' in os.listdir(): -# try: -# with open('record.machine', 'r') as _infile: -# profile = json.load(_infile) -# if profile['purpose'] == 'model_devi': -# mdata['model_devi_machine'] = profile['machine'] -# mdata['model_devi_resources'] = profile['resources'] -# mdata['model_devi_command'] = profile['command'] -# mdata['model_devi_group_size'] = profile['group_size'] -# continue_flag = True -# except Exception: -# pass -# if ("hostname" not in mdata["model_devi"][0]["machine"]) or (len(mdata["model_devi"]) == 1): -# mdata["model_devi_machine"] = mdata["model_devi"][0]["machine"] -# mdata["model_devi_resources"] = mdata["model_devi"][0]["resources"] -# mdata["model_devi_command"] = mdata["model_devi"][0]["command"] -# #if "group_size" in mdata["train"][0]: -# mdata["model_devi_group_size"] = mdata["model_devi"][0].get("group_size", 1) -# continue_flag = True -# -# pd_count_list =[] -# pd_flag = False -# if not continue_flag: -# -# #assert isinstance(mdata['model_devi']['machine'], list) -# #ssert isinstance(mdata['model_devi']['resources'], list) -# #assert len(mdata['model_devi']['machine']) == len(mdata['model_devi']['resources']) -# -# for machine_idx in range(len(mdata['model_devi'])): -# temp_machine = mdata['model_devi'][machine_idx]['machine'] -# temp_resources = mdata['model_devi'][machine_idx]['resources'] -# #assert isinstance(temp_machine, dict), "unsupported type of model_devi machine [%d]!" %machine_idx -# #assert isinstance(temp_resources, dict), "unsupported type of model_devi resources [%d]!"%machine_idx -# #assert temp_machine['machine_type'] == 'slurm', "Currently only support for Slurm!" -# temp_ssh_sess = SSHSession(temp_machine) -# cwd = os.getcwd() -# temp_context = SSHContext(cwd, temp_ssh_sess) -# if temp_machine['machine_type'] == 'lsf': -# temp_batch = LSF(temp_context) -# else: -# temp_batch = Slurm(temp_context) -# # For other type of machines, please add them using 'elif'. -# # Here slurm is selected as the final choice in convinience. 
-# command = temp_batch._make_squeue(temp_machine, temp_resources) -# ret, stdin, stdout, stderr = temp_batch.context.block_call(command) -# pd_response = stdout.read().decode('utf-8').split("\n") -# pd_count = len(pd_response) -# temp_context.clean() -# if pd_count ==0: -# mdata['model_devi_machine'] = temp_machine -# mdata['model_devi_resources'] = temp_resources -# mdata['model_devi_command'] = mdata['model_devi'][machine_idx]['command'] -# mdata['model_devi_group_size'] = mdata['model_devi'][machine_idx].get('group_size', 1) -# pd_flag = True -# break -# else: -# pd_count_list.append(pd_count) -# if not pd_flag: -# min_machine_idx = np.argsort(pd_count_list)[0] -# mdata['model_devi_machine'] = mdata['model_devi'][min_machine_idx]['machine'] -# mdata['model_devi_resources'] = mdata['model_devi'][min_machine_idx]['resources'] -# mdata['model_devi_command'] = mdata['model_devi'][min_machine_idx]['command'] -# mdata['model_devi_group_size'] = mdata['model_devi'][min_machine_idx].get('group_size', 1) -# with open("record.machine","w") as _outfile: -# profile = {} -# profile['purpose'] = 'model_devi' -# profile['machine'] = mdata['model_devi_machine'] -# profile['resources'] = mdata['model_devi_resources'] -# profile['group_size'] = mdata['model_devi_group_size'] -# profile['command'] = mdata['model_devi_command'] -# -# json.dump(profile, _outfile, indent = 4) -# return mdata -# def decide_fp_machine(mdata): -# if Version(mdata.get('api_version', '0.9')) >= Version('1.0'): -# mdata['fp_group_size'] = mdata['fp'][0]['resources']['group_size'] -# if 'fp' in mdata: -# #ssert isinstance(mdata['fp']['machine'], list) -# #assert isinstance(mdata['fp']['resources'], list) -# #assert len(mdata['fp']['machine']) == len(mdata['fp']['resources']) -# continue_flag = False -# ## decide whether to use an existing machine -# if 'record.machine' in os.listdir(): -# try: -# with open('record.machine', 'r') as _infile: -# profile = json.load(_infile) -# if profile['purpose'] == 'fp': -# mdata['fp_machine'] = profile['machine'] -# mdata['fp_resources'] = profile['resources'] -# mdata['fp_command'] = profile['command'] -# mdata['fp_group_size'] = profile['group_size'] -# -# continue_flag = True -# except Exception: -# pass -# if ("hostname" not in mdata["fp"][0]["machine"]) or (len(mdata["fp"]) == 1): -# mdata["fp_machine"] = mdata["fp"][0]["machine"] -# mdata["fp_resources"] = mdata["fp"][0]["resources"] -# mdata["fp_command"] = mdata["fp"][0]["command"] -# #if "group_size" in mdata["train"][0]: -# mdata["fp_group_size"] = mdata["fp"][0].get("group_size", 1) -# continue_flag = True -# -# -# pd_count_list =[] -# pd_flag = False -# if not continue_flag: -# for machine_idx in range(len(mdata['fp'])): -# temp_machine = mdata['fp'][machine_idx]['machine'] -# temp_resources = mdata['fp'][machine_idx]['resources'] -# temp_ssh_sess = SSHSession(temp_machine) -# cwd = os.getcwd() -# temp_context = SSHContext(cwd, temp_ssh_sess) -# if temp_machine['machine_type'] == 'lsf': -# temp_batch = LSF(temp_context) -# else: -# temp_batch = Slurm(temp_context) -# # For other type of machines, please add them using 'elif'. -# # Here slurm is selected as the final choice in convinience. 
-# command = temp_batch._make_squeue(temp_machine, temp_resources) -# ret, stdin, stdout, stderr = temp_batch.context.block_call(command) -# pd_response = stdout.read().decode('utf-8').split("\n") -# pd_count = len(pd_response) -# temp_context.clean() -# #dlog.info(temp_machine["username"] + " " + temp_machine["hostname"] + " " + str(pd_count)) -# if pd_count ==0: -# mdata['fp_machine'] = temp_machine -# mdata['fp_resources'] = temp_resources -# mdata['fp_command'] = mdata['fp'][machine_idx]['command'] -# mdata['fp_group_size'] = mdata['fp'][machine_idx].get('group_size', 1) -# pd_flag = True -# break -# else: -# pd_count_list.append(pd_count) -# if not pd_flag: -# min_machine_idx = np.argsort(pd_count_list)[0] -# mdata['fp_machine'] = mdata['fp'][min_machine_idx]['machine'] -# mdata['fp_resources'] = mdata['fp'][min_machine_idx]['resources'] -# mdata['fp_command'] = mdata['fp'][min_machine_idx]['command'] -# mdata['fp_group_size'] = mdata['fp'][min_machine_idx].get('group_size',1) -# -# with open("record.machine","w") as _outfile: -# profile = {} -# profile['purpose'] = 'fp' -# profile['machine'] = mdata['fp_machine'] -# profile['resources'] = mdata['fp_resources'] -# profile['group_size'] = mdata['fp_group_size'] -# profile['command'] = mdata['fp_command'] -# json.dump(profile, _outfile, indent = 4) -# return mdata - diff --git a/dpgen/remote/group_jobs.py b/dpgen/remote/group_jobs.py deleted file mode 100644 index 588bcfbed..000000000 --- a/dpgen/remote/group_jobs.py +++ /dev/null @@ -1,430 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - -import os,sys,glob,time -import numpy as np -import subprocess as sp -from monty.serialization import dumpfn,loadfn -from dpgen.remote.RemoteJob import SlurmJob, PBSJob, CloudMachineJob, JobStatus, awsMachineJob,SSHSession -from dpgen import dlog - -import requests -from hashlib import sha1 - -def _verfy_ac(private_key, params): - items= sorted(params.items()) - - params_data = "" - for key, value in items: - params_data = params_data + str(key) + str(value) - params_data = params_data + private_key - sign = sha1() - sign.update(params_data.encode()) - signature = sign.hexdigest() - return signature - -def aws_submit_jobs(machine, - resources, - command, - work_path, - tasks, - group_size, - forward_common_files, - forward_task_files, - backward_task_files, - forward_task_deference = True): - import boto3 - task_chunks = [ - [os.path.basename(j) for j in tasks[i:i + group_size]] \ - for i in range(0, len(tasks), group_size) - ] - task_chunks = (str(task_chunks).translate((str.maketrans('','',' \'"[]'))).split(',')) - # flatten the task_chunks - print('task_chunks=',task_chunks) - njob = len(task_chunks) - print('njob=',njob) - continue_status = False - ecs=boto3.client('ecs') - ec2=boto3.client('ec2') - status_list=[] - containerInstanceArns=ecs.list_container_instances(cluster="tensorflow") - if containerInstanceArns['containerInstanceArns']: - containerInstances=ecs.describe_container_instances(cluster="tensorflow", \ - containerInstances=containerInstanceArns['containerInstanceArns'])['containerInstances'] - status_list=[container['status'] for container in containerInstances] - - need_apply_num=group_size-len(status_list) - print('need_apply_num=',need_apply_num) - if need_apply_num>0: - for ii in range(need_apply_num) : #apply for machines, - ec2.run_instances(**machine['run_instances']) - machine_fin = False - status_list=[] - while not len(status_list)>=group_size: - containerInstanceArns=ecs.list_container_instances(cluster="tensorflow") - if 
containerInstanceArns['containerInstanceArns']: - containerInstances=ecs.describe_container_instances(cluster="tensorflow", \ - containerInstances=containerInstanceArns['containerInstanceArns'])['containerInstances'] - status_list=[container['status'] for container in containerInstances] - if len(status_list)>=group_size: - break - else: - time.sleep(20) - print('current available containers status_list=',status_list) - print('remote_root=',machine['remote_root']) - rjob = awsMachineJob(machine['remote_root'],work_path) - taskARNs=[] - taskstatus=[] - running_job_num=0 - rjob.upload('.', forward_common_files) - for ijob in range(njob) : #uplaod && submit job - containerInstanceArns=ecs.list_container_instances(cluster="tensorflow") - containerInstances=ecs.describe_container_instances(cluster="tensorflow", \ - containerInstances=containerInstanceArns['containerInstanceArns'])['containerInstances'] - status_list=[container['status'] for container in containerInstances] - print('current available containers status_list=',status_list) - while running_job_num>=group_size: - taskstatus=[task['lastStatus'] for task in ecs.describe_tasks(cluster='tensorflow',tasks=taskARNs)['tasks']] - running_job_num=len(list(filter(lambda str:(str=='PENDING' or str =='RUNNING'),taskstatus))) - print('waiting for running job finished, taskstatus=',taskstatus,'running_job_num=',running_job_num) - time.sleep(10) - chunk = str(task_chunks[ijob]) - print('current task chunk=',chunk) - task_definition=command['task_definition'] - concrete_command=(command['concrete_command'] %(work_path,chunk)) - command_override=command['command_override'] - command_override['containerOverrides'][0]['command'][0]=concrete_command - print('concrete_command=',concrete_command) - rjob.upload(chunk, forward_task_files, - dereference = forward_task_deference) - taskres=ecs.run_task(cluster='tensorflow',\ - taskDefinition=task_definition,overrides=command_override) - while not taskres['tasks'][0]: - print('task submit failed,taskres=',taskres,'trying to re-submit'+str(chunk),) - time.sleep(10) - taskres=ecs.run_task(cluster='tensorflow',\ - taskDefinition=task_definition,overrides=command_override) - - taskARNs.append(taskres['tasks'][0]['taskArn']) - taskstatus=[task['lastStatus'] for task in ecs.describe_tasks(cluster='tensorflow',tasks=taskARNs)['tasks']] - running_job_num=len(list(filter(lambda str:(str=='PENDING' or str =='RUNNING'),taskstatus))) - print('have submitted %s/%s,taskstatus=' %(work_path,chunk) ,taskstatus,'running_job_num=',running_job_num ) - task_fin_flag=False - while not task_fin_flag: - taskstatus=[task['lastStatus'] for task in ecs.describe_tasks(cluster='tensorflow',tasks=taskARNs)['tasks']] - task_fin_flag=all([status=='STOPPED' for status in taskstatus]) - if task_fin_flag: - print('task finished,next step:copy files to local && taskstatus=',taskstatus) - else: - print('all tasks submitted,task running && taskstatus=',taskstatus) - time.sleep(20) - for ii in range(njob): - chunk = task_chunks[ii] - print('downloading '+str(chunk),backward_task_files) - rjob.download(chunk,backward_task_files) - -def _ucloud_remove_machine(machine, UHostId): - ucloud_url = machine['url'] - ucloud_stop_param = {} - ucloud_stop_param['Action'] = "StopUHostInstance" - ucloud_stop_param['Region'] = machine['ucloud_param']['Region'] - ucloud_stop_param['UHostId'] = UHostId - ucloud_stop_param['PublicKey'] = machine['ucloud_param']['PublicKey'] - ucloud_stop_param['Signature'] = _verfy_ac(machine['Private'], ucloud_stop_param) - - - 
req = requests.get(ucloud_url, ucloud_stop_param) - if req.json()['RetCode'] != 0 : - raise RuntimeError ("failed to stop ucloud machine") - - terminate_fin = False - try_time = 0 - while not terminate_fin: - ucloud_delete_param = {} - ucloud_delete_param['Action'] = "TerminateUHostInstance" - ucloud_delete_param['Region'] = machine['ucloud_param']['Region'] - ucloud_delete_param['UHostId'] = UHostId - ucloud_delete_param['PublicKey'] = machine['ucloud_param']['PublicKey'] - ucloud_delete_param['Signature'] = _verfy_ac(machine['Private'], ucloud_delete_param) - req = requests.get(ucloud_url, ucloud_delete_param) - if req.json()['RetCode'] == 0 : - terminate_fin = True - try_time = try_time + 1 - if try_time >= 200: - raise RuntimeError ("failed to terminate ucloud machine") - time.sleep(10) - print("Machine ",UHostId,"has been successfully terminated!") - -def ucloud_submit_jobs(machine, - resources, - command, - work_path, - tasks, - group_size, - forward_common_files, - forward_task_files, - backward_task_files, - forward_task_deference = True) : - task_chunks = [ - [os.path.basename(j) for j in tasks[i:i + group_size]] \ - for i in range(0, len(tasks), group_size) - ] - njob = len(task_chunks) - continue_status = False - if os.path.isfile("record.machine"): - with open ("record.machine", "r") as fr: - record_machine = json.load(fr) - if record_machine["purpose"] == machine["purpose"] and record_machine["njob"] == njob: - continue_status = True - ucloud_machines = record_machine["ucloud_machines"] - ucloud_hostids = record_machine["ucloud_hostids"] - fr.close() - ucloud_url = machine['url'] - if continue_status == False: - assert machine['machine_type'] == 'ucloud' - ucloud_start_param = machine['ucloud_param'] - ucloud_start_param['Action'] = "CreateUHostInstance" - ucloud_start_param['Name'] = "train" - ucloud_start_param['Signature'] = _verfy_ac(machine['Private'], ucloud_start_param) - - - ucloud_machines = [] - ucloud_hostids = [] - for ii in range(njob) : - req = requests.get(ucloud_url, ucloud_start_param) - if req.json()['RetCode'] != 0 : - print(json.dumps(req.json(),indent=2, sort_keys=True)) - raise RuntimeError ("failed to start ucloud machine") - ucloud_machines.append(str(req.json()["IPs"][0])) - ucloud_hostids.append(str(req.json()["UHostIds"][0])) - - new_record_machine = {} - new_record_machine["purpose"] = machine["purpose"] - new_record_machine["njob"] = njob - new_record_machine["ucloud_machines"] = ucloud_machines - new_record_machine["ucloud_hostids"] = ucloud_hostids - with open ("record.machine", "w") as fw: - json.dump(new_record_machine, fw) - fw.close() - - machine_fin = [False for ii in ucloud_machines] - total_machine_num = len(ucloud_machines) - fin_machine_num = 0 - while not all(machine_fin): - for idx,mac in enumerate(ucloud_machines): - if not machine_fin[idx]: - ucloud_check_param = {} - ucloud_check_param['Action'] = "GetUHostInstanceVncInfo" - ucloud_check_param['Region'] = machine['ucloud_param']['Region'] - ucloud_check_param['UHostId'] = ucloud_hostids[idx] - ucloud_check_param['PublicKey'] = machine['ucloud_param']['PublicKey'] - ucloud_check_param['Signature'] = _verfy_ac(machine['Private'], ucloud_check_param) - req = requests.get(ucloud_url, ucloud_check_param) - print("the UHostId is", ucloud_hostids[idx]) - print(json.dumps(req.json(),indent=2, sort_keys=True)) - if req.json()['RetCode'] == 0 : - machine_fin[idx] = True - fin_machine_num = fin_machine_num + 1 - print("Current finish",fin_machine_num,"/", total_machine_num) - - - 
ucloud_check_param1 = {} - ucloud_check_param1['Action'] = "DescribeUHostInstance" - ucloud_check_param1['Region'] = machine['ucloud_param']['Region'] - ucloud_check_param1["Limit"] = 100 - ucloud_check_param1['PublicKey'] = machine['ucloud_param']['PublicKey'] - ucloud_check_param1['Signature'] = _verfy_ac(machine['Private'], ucloud_check_param1) - req1 = requests.get(ucloud_url, ucloud_check_param1).json() - - machine_all_fin = True - for idx1 in range(int(req1["TotalCount"])): - if req1["UHostSet"][idx1]["State"] != "Running": - machine_all_fin = False - break - if machine_all_fin == True: - machine_fin = [True for i in machine_fin] - time.sleep(10) - ssh_sess = [] - ssh_param = {} - ssh_param['port'] = 22 - ssh_param['username'] = 'root' - ssh_param['work_path'] = machine['work_path'] - for ii in ucloud_machines : - ssh_param['hostname'] = ii - ssh_sess.append(SSHSession(ssh_param)) - - job_list = [] - for ii in range(njob) : - chunk = task_chunks[ii] - print("Current machine is", ucloud_machines[ii]) - rjob = CloudMachineJob(ssh_sess[ii], work_path) - rjob.upload('.', forward_common_files) - rjob.upload(chunk, forward_task_files, - dereference = forward_task_deference) - rjob.submit(chunk, command, resources = resources) - job_list.append(rjob) - - job_fin = [False for ii in job_list] - while not all(job_fin) : - for idx,rjob in enumerate(job_list) : - if not job_fin[idx] : - status = rjob.check_status() - if status == JobStatus.terminated : - raise RuntimeError("find unsuccessfully terminated job on machine" % ucloud_machines[idx]) - elif status == JobStatus.finished : - rjob.download(task_chunks[idx], backward_task_files) - rjob.clean() - _ucloud_remove_machine(machine, ucloud_hostids[idx]) - job_fin[idx] = True - time.sleep(10) - os.remove("record.machine") - - -def group_slurm_jobs(ssh_sess, - resources, - command, - work_path, - tasks, - group_size, - forward_common_files, - forward_task_files, - backward_task_files, - remote_job = SlurmJob, - forward_task_deference = True) : - - task_chunks = [ - [os.path.basename(j) for j in tasks[i:i + group_size]] \ - for i in range(0, len(tasks), group_size) - ] - cwd=os.getcwd() - _pmap=PMap(cwd) - path_map=_pmap.load() - dlog.debug("work_path: %s"% work_path) - dlog.debug("curr_path: %s"% cwd) - - job_list = [] - task_chunks_=['+'.join(ii) for ii in task_chunks] - for ii in task_chunks_: - dlog.debug("task_chunk %s" % ii) - - #dlog.debug(path_map) - for ii,chunk in enumerate(task_chunks) : - - # map chunk info. 
to uniq id - chunk_uni=task_chunks_[ii].encode('utf-8') - chunk_sha1=sha1(chunk_uni).hexdigest() - - if chunk_sha1 in path_map: - job_uuid=path_map[chunk_sha1][1].split('/')[-1] - dlog.debug("load uuid %s" % job_uuid) - else: - job_uuid=None - - rjob = remote_job(ssh_sess, work_path, job_uuid) - dlog.debug('uuid %s'%job_uuid) - rjob.upload('.', forward_common_files) - rjob.upload(chunk, forward_task_files, - dereference = forward_task_deference) - if job_uuid: - rjob.submit(chunk, command, resources = resources,restart=True) - else: - rjob.submit(chunk, command, resources = resources) - job_list.append(rjob) - path_map[chunk_sha1]=[rjob.local_root,rjob.remote_root] - _pmap.dump(path_map) - - job_fin = [False for ii in job_list] - lcount=[0]*len(job_list) - count_fail = 0 - while not all(job_fin) : - for idx,rjob in enumerate(job_list) : - if not job_fin[idx] : - try: - status = rjob.check_status() - except Exception: - ssh_sess = SSHSession(ssh_sess.remote_profile) - for _idx,_rjob in enumerate(job_list): - job_list[_idx] = SlurmJob(ssh_sess, work_path, _rjob.job_uuid) - count_fail = count_fail +1 - dlog.info("ssh_sess failed for %d times"%count_fail) - break - if status == JobStatus.terminated : - lcount[idx]+=1 - _job_uuid=rjob.remote_root.split('/')[-1] - dlog.info('Job at %s terminated, submit again'% _job_uuid) - dlog.debug('try %s times for %s'% (lcount[idx], _job_uuid)) - rjob.submit(task_chunks[idx], command, resources = resources,restart=True) - if lcount[idx]>3: - dlog.info('Too many errors for ! %s ' % _job_uuid) - rjob.download(task_chunks[idx], backward_task_files,back_error=True) - rjob.clean() - job_fin[idx] = True - elif status == JobStatus.finished : - rjob.download(task_chunks[idx], backward_task_files) - rjob.clean() - job_fin[idx] = True - time.sleep(10) - dlog.debug('error count') - dlog.debug(lcount) - # delete path map file when job finish - _pmap.delete() - -def group_local_jobs(ssh_sess, - resources, - command, - work_path, - tasks, - group_size, - forward_common_files, - forward_task_files, - backward_task_files, - forward_task_deference = True) : - task_chunks = [ - [os.path.basename(j) for j in tasks[i:i + group_size]] \ - for i in range(0, len(tasks), group_size) - ] - job_list = [] - for chunk in task_chunks : - rjob = CloudMachineJob(ssh_sess, work_path) - rjob.upload('.', forward_common_files) - rjob.upload(chunk, forward_task_files, - dereference = forward_task_deference) - rjob.submit(chunk, command, resources = resources) - job_list.append(rjob) - job_fin = False - while not job_fin : - status = rjob.check_status() - if status == JobStatus.terminated : - raise RuntimeError("find unsuccessfully terminated job in %s" % rjob.get_job_root()) - elif status == JobStatus.finished : - rjob.download(chunk, backward_task_files) - rjob.clean() - job_fin = True - time.sleep(10) - -class PMap(object): - ''' - Path map class to operate {read,write,delte} the pmap.json file - ''' - - def __init__(self,path,fname="pmap.json"): - self.f_path_map=os.path.join(path,fname) - - def load(self): - f_path_map=self.f_path_map - if os.path.isfile(f_path_map): - path_map=loadfn(f_path_map) - else: - path_map={} - return path_map - - def dump(self,pmap,indent=4): - f_path_map=self.f_path_map - dumpfn(pmap,f_path_map,indent=indent) - - def delete(self): - f_path_map=self.f_path_map - try: - os.remove(f_path_map) - except Exception: - pass diff --git a/dpgen/remote/localhost.json b/dpgen/remote/localhost.json deleted file mode 100644 index f2feaed5d..000000000 --- 
a/dpgen/remote/localhost.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "hostname" : "localhost", - "port" : 22, - "username": "wanghan", - "work_path" : "/home/wanghan/tmp", - "_comment" : "that's all" -} diff --git a/dpgen/simplify/simplify.py b/dpgen/simplify/simplify.py index 245b3ff85..fd3c8bb5b 100644 --- a/dpgen/simplify/simplify.py +++ b/dpgen/simplify/simplify.py @@ -13,19 +13,15 @@ import queue import os import json -import argparse -import pickle import glob -import fnmatch import dpdata import numpy as np from typing import Union, List from dpgen import dlog -from dpgen import SHORT_CMD from dpgen.util import sepline, expand_sys_str, normalize from packaging.version import Version -from dpgen.dispatcher.Dispatcher import Dispatcher, _split_tasks, make_dispatcher, make_submission +from dpgen.dispatcher.Dispatcher import make_submission from dpgen.generator.run import make_train, run_train, post_train, run_fp, post_fp, fp_name, model_devi_name, train_name, train_task_fmt, sys_link_fp_vasp_pp, make_fp_vasp_incar, make_fp_vasp_kp, make_fp_vasp_cp_cvasp, data_system_fmt, model_devi_task_fmt, fp_task_fmt # TODO: maybe the following functions can be moved to dpgen.util from dpgen.generator.lib.utils import log_iter, make_iter_name, create_path, record_iter @@ -201,21 +197,9 @@ def run_model_devi(iter_index, jdata, mdata): forward_files = [rest_data_name + ".old"] backward_files = [detail_file_name] - api_version = mdata.get('api_version', '0.9') + api_version = mdata.get('api_version', '1.0') if Version(api_version) < Version('1.0'): - warnings.warn(f"the dpdispatcher will be updated to new version." - f"And the interface may be changed. Please check the documents for more details") - dispatcher = make_dispatcher(mdata['model_devi_machine'], mdata['model_devi_resources'], work_path, run_tasks, model_devi_group_size) - dispatcher.run_jobs(mdata['model_devi_resources'], - commands, - work_path, - run_tasks, - model_devi_group_size, - model_names, - forward_files, - backward_files, - outlog = 'model_devi.log', - errlog = 'model_devi.log') + raise RuntimeError("API version %s has been removed. Please upgrade to 1.0." 
% api_version) elif Version(api_version) >= Version('1.0'): submission = make_submission( diff --git a/examples/machine/DeePMD-kit-1.x/machine-ali.json b/examples/machine/DeePMD-kit-1.x/machine-ali.json deleted file mode 100644 index e78fc9dd4..000000000 --- a/examples/machine/DeePMD-kit-1.x/machine-ali.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "train": - { - "machine": { - "batch": "shell", - "hostname": "", - "password": "PASSWORD", - "port": 22, - "username": "root", - "work_path": "/root/dpgen_work", - "ali_auth": { - "AccessKey_ID":"", - "AccessKey_Secret":"", - "regionID": "cn-shenzhen", - "img_name": "kit", - "machine_type_price": [ - {"machine_type": "ecs.gn6v-c8g1.2xlarge", "price_limit": 20.00, "numb": 1, "priority": 0}, - {"machine_type": "ecs.gn5-c4g1.xlarge", "price_limit": 20.00, "numb": 1, "priority": 1} - ], - "instance_name": "CH4", - "pay_strategy": "spot" - } - }, - "resources": { - "numb_gpu": 1, - "numb_node": 1, - "task_per_node": 12, - "partition": "gpu", - "exclude_list": [], - "mem_limit": 32, - "source_list": [], - "module_list": [], - "time_limit": "23:0:0" - }, - "command": "/root/deepmd-kit/bin/dp", - "group_size": 2 - }, - - "model_devi": - { - "machine": { - "batch": "shell", - "hostname": "", - "password": "PASSWORD", - "port": 22, - "username": "root", - "work_path": "/root/dpgen_work", - "ali_auth": { - "AccessKey_ID":"", - "AccessKey_Secret":"", - "regionID": "cn-shenzhen", - "img_name": "kit", - "machine_type_price": [ - {"machine_type": "ecs.gn6v-c8g1.2xlarge", "price_limit": 20.00, "numb": 1, "priority": 0}, - {"machine_type": "ecs.gn5-c4g1.xlarge", "price_limit": 20.00, "numb": 1, "priority": 1} - ], - "instance_name": "CH4", - "pay_strategy": "spot" - } - }, - "resources": { - "numb_gpu": 1, - "task_per_node": 4, - "partition": "gpu", - "exclude_list": [], - "mem_limit": 11, - "source_list": [], - "module_list": [], - "time_limit": "23:0:0" - }, - "command": "/root/deepmd-kit/bin/lmp", - "group_size": 2 - }, - - "fp": - { - "machine": { - "batch": "shell", - "hostname": "", - "password": "PASSWORD", - "port": 22, - "username": "root", - "work_path": "/root/dpgen_work", - "ali_auth": { - "AccessKey_ID":"", - "AccessKey_Secret":"", - "regionID": "cn-shenzhen", - "img_name": "vasp", - "machine_type_price": [ - {"machine_type": "ecs.c6.4xlarge", "price_limit": 0.2, "numb": 16, "priority": 0}, - {"machine_type": "ecs.g6.4xlarge", "price_limit": 0.2, "numb": 16, "priority": 1} - ], - "instance_name": "CH4", - "pay_strategy": "spot" - } - }, - "resources": { - "numb_gpu": 0, - "task_per_node": 16, - "with_mpi": "false", - "source_list": ["/opt/intel/parallel_studio_xe_2018/psxevars.sh"], - "module_list": [], - "partition": "cpu", - "envs" : {"PATH" : "/root/deepmd-pkg/vasp.5.4.4/bin:$PATH"} - }, - "command": "mpirun -n 16 /root/deepmd-pkg/vasp.5.4.4/bin/vasp_std", - "group_size": 1 - } -} - - - diff --git a/examples/machine/DeePMD-kit-2.x/lebesgue_v2_machine.json b/examples/machine/DeePMD-kit-2.x/lebesgue_v2_machine.json index 0ecba4fa6..ae6ac31ab 100644 --- a/examples/machine/DeePMD-kit-2.x/lebesgue_v2_machine.json +++ b/examples/machine/DeePMD-kit-2.x/lebesgue_v2_machine.json @@ -27,8 +27,8 @@ } }, "resources": { + "batch_type": "DpCloudServer", "number_node": 1, - "local_root":"./", "cpu_per_node": 4, "gpu_per_node": 1, "queue_name": "GPU", @@ -61,8 +61,8 @@ } }, "resources": { + "batch_type": "DpCloudServer", "number_node": 1, - "local_root":"./", "cpu_per_node": 4, "gpu_per_node": 1, "queue_name": "GPU", @@ -95,12 +95,12 @@ } }, "resources": { + 
"batch_type": "DpCloudServer", "number_node": 1, "cpu_per_node": 32, "gpu_per_node": 0, "queue_name": "CPU", "group_size": 5, - "local_root":"./", "source_list": ["/opt/intel/oneapi/setvars.sh"] } } diff --git a/examples/machine/deprecated/DeePMD-kit-0.12/machine-aws.json b/examples/machine/deprecated/DeePMD-kit-0.12/machine-aws.json deleted file mode 100644 index 7d050b548..000000000 --- a/examples/machine/deprecated/DeePMD-kit-0.12/machine-aws.json +++ /dev/null @@ -1,164 +0,0 @@ -{ - "machine_type":"aws", - "train_machine":{ - "machine_type":"aws", - "remote_root":"/home/ec2-user/efs", - "run_instances":{ - "BlockDeviceMappings":[ - { - "DeviceName": "/dev/xvda", - "Ebs": { - "DeleteOnTermination": true, - "VolumeSize": 40, - "VolumeType": "gp2" - } - } - ], - "ImageId":"ami-0329a1fdc914b0c55", - "InstanceType":"t2.small", - "KeyName":"yfb", - "IamInstanceProfile":{ - "Name": "ecsInstanceRole"}, - "MaxCount":1, - "MinCount":1, - "Monitoring":{ - "Enabled": false - }, - "SecurityGroupIds":[ - "sg-0c3e6637acfb70200" - ], - "UserData":"#!/bin/bash\ncloud-init-per once yum_update yum update -y\ncloud-init-per once install_nfs_utils yum install -y nfs-utils\ncloud-init-per once mkdir_efs mkdir /efs\ncloud-init-per once mount_efs echo -e 'fs-96b3e4ef.efs.us-east-2.amazonaws.com:/ /efs nfs4 nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2 0 0' >> /etc/fstab\nmount -a\necho \"ECS_CLUSTER=tensorflow\" >> /etc/ecs/ecs.config" - } - }, - "model_devi_group_size":5, - "model_devi_machine":{ - "machine_type":"aws", - "remote_root":"/home/ec2-user/efs", - "run_instances":{ - "BlockDeviceMappings":[ - { - "DeviceName": "/dev/xvda", - "Ebs": { - "DeleteOnTermination": true, - "VolumeSize": 40, - "VolumeType": "gp2" - } - } - ], - "ImageId":"ami-0329a1fdc914b0c55", - "InstanceType":"t2.small", - "KeyName":"yfb", - "IamInstanceProfile":{ - "Name": "ecsInstanceRole"}, - "MaxCount":1, - "MinCount":1, - "Monitoring":{ - "Enabled": false - }, - "SecurityGroupIds":[ - "sg-0c3e6637acfb70200" - ], - "UserData":"#!/bin/bash\ncloud-init-per once yum_update yum update -y\ncloud-init-per once install_nfs_utils yum install -y nfs-utils\ncloud-init-per once mkdir_efs mkdir /efs\ncloud-init-per once mount_efs echo -e 'fs-96b3e4ef.efs.us-east-2.amazonaws.com:/ /efs nfs4 nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2 0 0' >> /etc/fstab\nmount -a\necho \"ECS_CLUSTER=tensorflow\" >> /etc/ecs/ecs.config" - } - }, - "fp_machine":{ - "machine_type":"aws", - "remote_root":"/home/ec2-user/efs", - "run_instances":{ - "BlockDeviceMappings":[ - { - "DeviceName": "/dev/xvda", - "Ebs": { - "DeleteOnTermination": true, - "VolumeSize": 40, - "VolumeType": "gp2" - } - } - ], - "ImageId":"ami-0329a1fdc914b0c55", - "InstanceType":"t2.small", - "KeyName":"yfb", - "IamInstanceProfile":{ - "Name": "ecsInstanceRole"}, - "MaxCount":1, - "MinCount":1, - "Monitoring":{ - "Enabled":false - }, - "SecurityGroupIds":[ - "sg-0c3e6637acfb70200" - ], - "UserData":"#!/bin/bash\ncloud-init-per once yum_update yum update -y\ncloud-init-per once install_nfs_utils yum install -y nfs-utils\ncloud-init-per once mkdir_efs mkdir /efs\ncloud-init-per once mount_efs echo -e 'fs-96b3e4ef.efs.us-east-2.amazonaws.com:/ /efs nfs4 nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2 0 0' >> /etc/fstab\nmount -a\necho \"ECS_CLUSTER=tensorflow\" >> /etc/ecs/ecs.config" - } - }, - "fp_group_size":5, - "fp_resources":{ - "with_mpi":true - }, - "deepmd_path": "/deepmd_root/", - "model_devi_command":"/usr/bin/lmp_mpi", - 
"fp_command":"/usr/bin/vasp_std", - - "train_resources": {}, - "model_devi_resources":{}, - - "task_definition":{ - "requiresCompatibilities": [ - "EC2" - ], - "containerDefinitions": [{ - "command": [ - "ls /home/ec2-user/efs && cd /deepmd-kit/examples/train && dp_train water_smth_test.json | tee /home/ec2-user/efs/dp_train.log" - ], - "entryPoint": [ - "sh", - "-c" - ], - "name": "deepmd-training-container", - "mountPoints": [{ - "sourceVolume": "efs", - "containerPath": "/home/ec2-user"} - ], - "image": "787517567283.dkr.ecr.us-east-2.amazonaws.com/deepmd:squashed", - "memory": 1800, - "cpu": 1000, - "essential": true, - "portMappings": [{ - "containerPort": 80, - "protocol": "tcp" - }], - "logConfiguration": { - "logDriver": "awslogs", - "options": { - "awslogs-group": "awslogs-tf-ecs", - "awslogs-region": "us-east-2", - "awslogs-stream-prefix": "tf", - "awslogs-create-group": "true" - } - } - }], - "volumes": [{ - "host": { - "sourcePath": "/" }, - "name": "efs" - } - ], - "networkMode": "bridge", - "placementConstraints": [], - "family": "deepmd"}, - "run_train_task_definition":{ - "command_override":{"containerOverrides":[{"name":"deepmd-training-container","command":["concrete_command"]}]}, - "task_definition":"arn:aws:ecs:us-east-2:787517567283:task-definition/run_train:1", - "concrete_command":"cd /home/ec2-user/efs/%s/%s && dp_train input.json && dp_frz" - }, - "model_devi_task_definition":{ - "command_override":{"containerOverrides":[{"name":"deepmd-training-container","command":["concrete_command"]}]}, - "task_definition":"arn:aws:ecs:us-east-2:787517567283:task-definition/run_train:2", - "concrete_command":"cd /home/ec2-user/efs/%s/%s && /usr/bin/lmp_mpi -i input.lammps | tee model_devi.log" - }, - "fp_task_definition":{ - "command_override":{"containerOverrides":[{"name":"deepmd-training-container","command":["concrete_command"]}]}, - "task_definition":"arn:aws:ecs:us-east-2:787517567283:task-definition/run_fp:2", - "concrete_command":"cd /home/ec2-user/efs/%s/%s && mpirun -n 2 --allow-run-as-root /usr/bin/vasp_std | tee fp.log" - } -} diff --git a/examples/machine/deprecated/DeePMD-kit-0.12/machine-local.json b/examples/machine/deprecated/DeePMD-kit-0.12/machine-local.json deleted file mode 100644 index b8e15a625..000000000 --- a/examples/machine/deprecated/DeePMD-kit-0.12/machine-local.json +++ /dev/null @@ -1,43 +0,0 @@ -{ - "_comment": "training on localhost ", - "_comment" : "This is for DeePMD-kit 0.12.4", - "deepmd_path": "/home/wanghan/local/deepmd/0.12.4/", - "train_machine": { - "batch": "shell", - "work_path" : "/home/wanghan/tmp/subs/" - }, - "train_resources": { - "envs": { - "PYTHONPATH" : "/home/wanghan/local/tensorflow/1.8.py/lib/python3.6/site-packages/" - } - }, - - - "_comment": "model_devi on localhost ", - "model_devi_command": "/home/wanghan/local/bin/lmp_mpi_010", - "model_devi_group_size": 5, - "model_devi_machine": { - "batch": "shell", - "_comment" : "If lazy_local is true, calculations are done directly in current folders.", - "lazy_local" : true - }, - "model_devi_resources": { - }, - - "_comment": "fp on localhost ", - "fp_command": "/home/wanghan/local/bin/vasp_std", - "fp_group_size": 2, - "fp_machine": { - "batch": "local", - "work_path" : "/home/wanghan/tmp/subs/", - "_comment" : "that's all" - }, - "fp_resources": { - "module_list": ["mpi"], - "task_per_node":4, - "with_mpi": true, - "_comment": "that's all" - }, - - "_comment": " that's all " -} diff --git a/examples/machine/deprecated/DeePMD-kit-0.12/machine-lsf.json 
b/examples/machine/deprecated/DeePMD-kit-0.12/machine-lsf.json deleted file mode 100644 index d8ebd61ed..000000000 --- a/examples/machine/deprecated/DeePMD-kit-0.12/machine-lsf.json +++ /dev/null @@ -1,93 +0,0 @@ -{ - "train": [ - { - "machine": { - "batch": "lsf", - "hostname": "localhost", - "port": 22, - "username": "ypliu", - "work_path": "/data/home/ypliu/test/deepmd-tutorial/cp2k_dpgen/dpmd" - }, - "resources": { - "_comment": "this part should be modified if GPU resources could be called directly by LSF", - "node_cpu": 4, - "numb_node": 1, - "task_per_node": 4, - "partition": "gpu", - "exclude_list": [], - "mem_limit": 11, - "source_list": [ - "/data/home/ypliu/test/deepmd-tutorial/cp2k_dpgen/source_env.sh", - "/data/home/ypliu/test/deepmd-tutorial/cp2k_dpgen/test_gpu_sub.sh" - ], - "module_list": [ - "vasp/5.4.4", - "cuda" - ], - "time_limit": "23:0:0" - }, - "deepmd_path": "/data/home/ypliu/deepmd/deepmd_root" - } - ], - "model_devi": [ - { - "machine": { - "batch": "lsf", - "hostname": "localhost", - "port": 22, - "username": "ypliu", - "work_path": "/data/home/ypliu/test/deepmd-tutorial/cp2k_dpgen/lammps" - }, - "resources": { - "_comment": "this part should be modified if GPU resources could be called directly by LSF", - "node_cpu": 4, - "numb_node": 1, - "task_per_node": 4, - "partition": "gpu", - "exclude_list": [], - "mem_limit": 11, - "source_list": [ - "/data/home/ypliu/test/deepmd-tutorial/cp2k_dpgen/source_env.sh", - "/data/home/ypliu/test/deepmd-tutorial/cp2k_dpgen/test_gpu_sub.sh" - ], - "module_list": [ - "vasp/5.4.4", - "cuda", - "gcc/4.9.4" - ], - "time_limit": "23:0:0" - }, - "command": "/data/home/ypliu/lammps/lammps-7Aug19/src/lmp_mpi", - "group_size": 10 - } - ], - "fp": [ - { - "machine": { - "batch": "lsf", - "hostname": "localhost", - "port": 22, - "username": "ypliu", - "work_path": "/data/home/ypliu/test/deepmd-tutorial/cp2k_dpgen/cp2k" - }, - "resources": { - "cvasp": false, - "task_per_node": 28, - "node_cpu": 28, - "exclude_list": [], - "mem_limit": 128, - "with_mpi": true, - "source_list": [], - "module_list": [ - "intel/17.0.1", - "mpi/intel/2017.1.132" - ], - "time_limit": "96:0:0", - "partition": "q2680v4m128", - "_comment": "that's Bel" - }, - "command": "/share/apps/cp2k-5.0/Linux-x86-64-intel-host/cp2k.popt -i input.inp", - "group_size": 5 - } - ] -} \ No newline at end of file diff --git a/examples/machine/deprecated/DeePMD-kit-0.12/machine-slurm-vasp-multi.json b/examples/machine/deprecated/DeePMD-kit-0.12/machine-slurm-vasp-multi.json deleted file mode 100644 index e24838077..000000000 --- a/examples/machine/deprecated/DeePMD-kit-0.12/machine-slurm-vasp-multi.json +++ /dev/null @@ -1,241 +0,0 @@ -{ - "train": [ - { - "machine": { - "batch": "slurm", - "hostname": "localhost", - "port": 22, - "username": "1600017784", - "work_path": "/gpfs/share/home/1600017784/generator/Cu/work" - }, - "resources": { - "numb_node": 1, - "numb_gpu": 1, - "task_per_node": 4, - "partition": "GPU", - "exclude_list": [], - "source_list": [ - "/gpfs/share/home/1600017784/env/train_tf112_float.env" - ], - "module_list": [], - "time_limit": "23:0:0", - "qos": "bigdata" - }, - "deepmd_path": "/gpfs/share/software/deepmd-kit/0.12.4/gpu/gcc/4.9.0/tf1120-lowprec" - }, - { - "machine": { - "batch": "slurm", - "hostname": "localhost", - "port": 22, - "username": "1600017784", - "work_path": "/gpfs/share/home/1600017784/generator/Cu/work" - }, - "resources": { - "numb_node": 1, - "numb_gpu": 1, - "task_per_node": 4, - "partition": "AdminGPU", - "exclude_list": [], - 
"source_list": [ - "/gpfs/share/home/1600017784/env/train_tf112_float.env" - ], - "module_list": [], - "time_limit": "23:0:0", - "qos": "bigdata" - }, - "deepmd_path": "/gpfs/share/software/deepmd-kit/0.12.4/gpu/gcc/4.9.0/tf1120-lowprec" - }, - { - "deepmd_path": "/data2/publicsoft/deepmd-kit/0.12.4-s/", - "machine": { - "batch": "slurm", - "hostname": "115.27.161.2", - "port": 22, - "username": "anguse", - "work_path": "/data1/anguse/generator/Cu/work/", - "_comment": "that's all" - }, - "resources": { - "numb_node": 1, - "numb_gpu": 1, - "task_per_node": 4, - "partition": "all", - "mem_limit": 16, - "exclude_list": [ - "gpu06", - "gpu07" - ], - "source_list": [ - "/data1/anguse/env/train.env" - ], - "module_list": [], - "time_limit": "23:0:0", - "_comment": "that's all" - } - } - ], - "model_devi": [ - { - "machine": { - "batch": "slurm", - "hostname": "localhost", - "port": 22, - "username": "1600017784", - "work_path": "/gpfs/share/home/1600017784/generator/Cu/work" - }, - "resources": { - "numb_node": 1, - "numb_gpu": 1, - "task_per_node": 2, - "partition": "GPU", - "exclude_list": [], - "source_list": [ - "/gpfs/share/home/1600017784/env/lmp_tf112_float.env" - ], - "module_list": [], - "time_limit": "23:0:0", - "qos": "bigdata" - }, - "command": "lmp_serial", - "group_size": 10 - }, - { - "machine": { - "batch": "slurm", - "hostname": "localhost", - "port": 22, - "username": "1600017784", - "work_path": "/gpfs/share/home/1600017784/generator/Cu/work" - }, - "resources": { - "numb_node": 1, - "numb_gpu": 1, - "task_per_node": 2, - "partition": "AdminGPU", - "exclude_list": [], - "source_list": [ - "/gpfs/share/home/1600017784/env/lmp_tf112_float.env" - ], - "module_list": [], - "time_limit": "23:0:0", - "qos": "bigdata" - }, - "command": "lmp_serial", - "group_size": 10 - }, - { - "machine": { - "batch": "slurm", - "hostname": "115.27.161.2", - "port": 22, - "username": "anguse", - "work_path": "/data1/anguse/generator/Cu/work/", - "_comment": "that's all" - }, - "resources": { - "numb_node": 1, - "numb_gpu": 1, - "task_per_node": 4, - "partition": "all", - "mem_limit": 16, - "exclude_list": [ - "gpu12" - ], - "source_list": [ - "/data1/anguse/env/lmp.env" - ], - "module_list": [], - "time_limit": "23:0:0", - "_comment": "that's all" - }, - "command": "lmp_serial", - "group_size": 20 - } - ], - "fp": [ - { - "machine": { - "batch": "slurm", - "hostname": "localhost", - "port": 22, - "username": "1600017784", - "work_path": "/gpfs/share/home/1600017784/generator/Cu/work" - }, - "resources": { - "cvasp": true, - "task_per_node": 28, - "numb_gpu": 0, - "exclude_list": [], - "with_mpi": true, - "source_list": [], - "module_list": [ - "intel/2017.1", - "vasp/5.4.4-intel-2017.1" - ], - "time_limit": "120:0:0", - "partition": "C028M256G", - "qos": "bigdata", - "_comment": "that's Bel" - }, - "command": "vasp_std", - "group_size": 5 - }, - { - "machine": { - "batch": "slurm", - "hostname": "162.105.133.134", - "port": 22, - "username": "1600017784", - "work_path": "/gpfs/share/home/1600017784/generator/Cu/work" - }, - "resources": { - "cvasp": true, - "task_per_node": 16, - "numb_gpu": 0, - "exclude_list": [], - "with_mpi": false, - "source_list": [ - "activate dppy" - ], - "module_list": [ - "mpich/3.2.1-intel-2017.1", - "vasp/5.4.4-intel-2017.1" - ], - "time_limit": "120:0:0", - "partition": "C032M0128G", - "_comment": "that's Bel" - }, - "command": "mpirun -n 16 vasp_std", - "group_size": 5 - }, - { - "machine": { - "batch": "slurm", - "hostname": "162.105.133.134", - "port": 22, - 
"username": "1600017784", - "work_path": "/gpfs/share/home/1600017784/generator/Cu/work" - }, - "resources": { - "cvasp": true, - "task_per_node": 16, - "numb_gpu": 0, - "exclude_list": [], - "with_mpi": false, - "source_list": [ - "activate dppy" - ], - "module_list": [ - "mpich/3.2.1-intel-2017.1", - "vasp/5.4.4-intel-2017.1" - ], - "time_limit": "120:0:0", - "partition": "C032M0256G", - "_comment": "that's all" - }, - "command": "mpirun -n 16 vasp_std", - "group_size": 5 - } - ] -} \ No newline at end of file diff --git a/examples/machine/deprecated/DeePMD-kit-0.12/machine-slurm-vasp-multi.yaml b/examples/machine/deprecated/DeePMD-kit-0.12/machine-slurm-vasp-multi.yaml deleted file mode 100644 index 5bd30d186..000000000 --- a/examples/machine/deprecated/DeePMD-kit-0.12/machine-slurm-vasp-multi.yaml +++ /dev/null @@ -1,189 +0,0 @@ ---- -train: -- machine: - batch: slurm - hostname: localhost - port: 22 - username: '1600017784' - work_path: "/gpfs/share/home/1600017784/generator/Cu/work" - resources: - numb_node: 1 - numb_gpu: 1 - task_per_node: 4 - partition: GPU - exclude_list: [] - source_list: - - "/gpfs/share/home/1600017784/env/train_tf112_float.env" - module_list: [] - time_limit: '23:0:0' - qos: bigdata - deepmd_path: "/gpfs/share/software/deepmd-kit/0.12.4/gpu/gcc/4.9.0/tf1120-lowprec" -- machine: - batch: slurm - hostname: localhost - port: 22 - username: '1600017784' - work_path: "/gpfs/share/home/1600017784/generator/Cu/work" - resources: - numb_node: 1 - numb_gpu: 1 - task_per_node: 4 - partition: AdminGPU - exclude_list: [] - source_list: - - "/gpfs/share/home/1600017784/env/train_tf112_float.env" - module_list: [] - time_limit: '23:0:0' - qos: bigdata - deepmd_path: "/gpfs/share/software/deepmd-kit/0.12.4/gpu/gcc/4.9.0/tf1120-lowprec" -- deepmd_path: "/data2/publicsoft/deepmd-kit/0.12.4-s/" - machine: - batch: slurm - hostname: 115.27.161.2 - port: 22 - username: anguse - work_path: "/data1/anguse/generator/Cu/work/" - _comment: that's all - resources: - numb_node: 1 - numb_gpu: 1 - task_per_node: 4 - partition: all - mem_limit: 16 - exclude_list: - - gpu06 - - gpu07 - source_list: - - "/data1/anguse/env/train.env" - module_list: [] - time_limit: '23:0:0' - _comment: that's all -model_devi: -- machine: - batch: slurm - hostname: localhost - port: 22 - username: '1600017784' - work_path: "/gpfs/share/home/1600017784/generator/Cu/work" - resources: - numb_node: 1 - numb_gpu: 1 - task_per_node: 2 - partition: GPU - exclude_list: [] - source_list: - - "/gpfs/share/home/1600017784/env/lmp_tf112_float.env" - module_list: [] - time_limit: '23:0:0' - qos: bigdata - command: lmp_serial - group_size: 10 -- machine: - batch: slurm - hostname: localhost - port: 22 - username: '1600017784' - work_path: "/gpfs/share/home/1600017784/generator/Cu/work" - resources: - numb_node: 1 - numb_gpu: 1 - task_per_node: 2 - partition: AdminGPU - exclude_list: [] - source_list: - - "/gpfs/share/home/1600017784/env/lmp_tf112_float.env" - module_list: [] - time_limit: '23:0:0' - qos: bigdata - command: lmp_serial - group_size: 10 -- machine: - batch: slurm - hostname: 115.27.161.2 - port: 22 - username: anguse - work_path: "/data1/anguse/generator/Cu/work/" - _comment: that's all - resources: - numb_node: 1 - numb_gpu: 1 - task_per_node: 4 - partition: all - mem_limit: 16 - exclude_list: - - gpu12 - source_list: - - "/data1/anguse/env/lmp.env" - module_list: [] - time_limit: '23:0:0' - _comment: that's all - command: lmp_serial - group_size: 20 -fp: -- machine: - batch: slurm - hostname: localhost - 
port: 22 - username: '1600017784' - work_path: "/gpfs/share/home/1600017784/generator/Cu/work" - resources: - cvasp: true - task_per_node: 28 - numb_gpu: 0 - exclude_list: [] - with_mpi: true - source_list: [] - module_list: - - intel/2017.1 - - vasp/5.4.4-intel-2017.1 - time_limit: '120:0:0' - partition: C028M256G - qos: bigdata - _comment: that's Bel - command: vasp_std - group_size: 5 -- machine: - batch: slurm - hostname: 162.105.133.134 - port: 22 - username: '1600017784' - work_path: "/gpfs/share/home/1600017784/generator/Cu/work" - resources: - cvasp: true - task_per_node: 16 - numb_gpu: 0 - exclude_list: [] - with_mpi: false - source_list: - - activate dppy - module_list: - - mpich/3.2.1-intel-2017.1 - - vasp/5.4.4-intel-2017.1 - time_limit: '120:0:0' - partition: C032M0128G - _comment: that's Bel - command: mpirun -n 16 vasp_std - group_size: 5 -- machine: - batch: slurm - hostname: 162.105.133.134 - port: 22 - username: '1600017784' - work_path: "/gpfs/share/home/1600017784/generator/Cu/work" - resources: - cvasp: true - task_per_node: 16 - numb_gpu: 0 - exclude_list: [] - with_mpi: false - source_list: - - activate dppy - module_list: - - mpich/3.2.1-intel-2017.1 - - vasp/5.4.4-intel-2017.1 - time_limit: '120:0:0' - partition: C032M0256G - _comment: that's all - command: mpirun -n 16 vasp_std - group_size: 5 - diff --git a/examples/machine/deprecated/DeePMD-kit-0.12/machine-slurm-vasp-single.json b/examples/machine/deprecated/DeePMD-kit-0.12/machine-slurm-vasp-single.json deleted file mode 100644 index 2dbdafd5e..000000000 --- a/examples/machine/deprecated/DeePMD-kit-0.12/machine-slurm-vasp-single.json +++ /dev/null @@ -1,82 +0,0 @@ -{ - "train": [ - { - "machine": { - "batch": "slurm", - "hostname": "localhost", - "port": 22, - "username": "1600017784", - "work_path": "/gpfs/share/home/1600017784/generator/Cu/work" - }, - "resources": { - "numb_node": 1, - "numb_gpu": 1, - "task_per_node": 4, - "partition": "GPU", - "exclude_list": [], - "source_list": [ - "/gpfs/share/home/1600017784/env/train_tf112_float.env" - ], - "module_list": [], - "time_limit": "23:0:0", - "qos": "bigdata" - }, - "deepmd_path": "/gpfs/share/software/deepmd-kit/0.12.4/gpu/gcc/4.9.0/tf1120-lowprec" - } - ], - "model_devi": [ - { - "machine": { - "batch": "slurm", - "hostname": "localhost", - "port": 22, - "username": "1600017784", - "work_path": "/gpfs/share/home/1600017784/generator/Cu/work" - }, - "resources": { - "numb_node": 1, - "numb_gpu": 1, - "task_per_node": 2, - "partition": "GPU", - "exclude_list": [], - "source_list": [ - "/gpfs/share/home/1600017784/env/lmp_tf112_float.env" - ], - "module_list": [], - "time_limit": "23:0:0", - "qos": "bigdata" - }, - "command": "lmp_serial", - "group_size": 10 - } - ], - "fp": [ - { - "machine": { - "batch": "slurm", - "hostname": "localhost", - "port": 22, - "username": "1600017784", - "work_path": "/gpfs/share/home/1600017784/generator/Cu/work" - }, - "resources": { - "cvasp": true, - "task_per_node": 4, - "numb_gpu": 1, - "exclude_list": [], - "with_mpi": false, - "source_list": [], - "module_list": [ - "mpich/3.2.1-intel-2017.1", - "vasp/5.4.4-intel-2017.1", - "cuda/10.1" - ], - "time_limit": "120:0:0", - "partition": "GPU", - "_comment": "that's All" - }, - "command": "vasp_gpu", - "group_size": 5 - } - ] -} \ No newline at end of file diff --git a/examples/machine/deprecated/DeePMD-kit-0.12/machine-slurm-vasp-single.yaml b/examples/machine/deprecated/DeePMD-kit-0.12/machine-slurm-vasp-single.yaml deleted file mode 100644 index 3b52e52ce..000000000 --- 
a/examples/machine/deprecated/DeePMD-kit-0.12/machine-slurm-vasp-single.yaml +++ /dev/null @@ -1,64 +0,0 @@ ---- -train: -- machine: - batch: slurm - hostname: localhost - port: 22 - username: '1600017784' - work_path: "/gpfs/share/home/1600017784/generator/Cu/work" - resources: - numb_node: 1 - numb_gpu: 1 - task_per_node: 4 - partition: GPU - exclude_list: [] - source_list: - - "/gpfs/share/home/1600017784/env/train_tf112_float.env" - module_list: [] - time_limit: '23:0:0' - qos: bigdata - deepmd_path: "/gpfs/share/software/deepmd-kit/0.12.4/gpu/gcc/4.9.0/tf1120-lowprec" -model_devi: -- machine: - batch: slurm - hostname: localhost - port: 22 - username: '1600017784' - work_path: "/gpfs/share/home/1600017784/generator/Cu/work" - resources: - numb_node: 1 - numb_gpu: 1 - task_per_node: 2 - partition: GPU - exclude_list: [] - source_list: - - "/gpfs/share/home/1600017784/env/lmp_tf112_float.env" - module_list: [] - time_limit: '23:0:0' - qos: bigdata - command: lmp_serial - group_size: 10 -fp: -- machine: - batch: slurm - hostname: localhost - port: 22 - username: '1600017784' - work_path: "/gpfs/share/home/1600017784/generator/Cu/work" - resources: - cvasp: true - task_per_node: 4 - numb_gpu: 1 - exclude_list: [] - with_mpi: false - source_list: [] - module_list: - - mpich/3.2.1-intel-2017.1 - - vasp/5.4.4-intel-2017.1 - - cuda/10.1 - time_limit: '120:0:0' - partition: GPU - _comment: that's All - command: vasp_gpu - group_size: 5 - diff --git a/examples/machine/deprecated/machine-hnu.json b/examples/machine/deprecated/machine-hnu.json deleted file mode 100644 index eb9cb91f2..000000000 --- a/examples/machine/deprecated/machine-hnu.json +++ /dev/null @@ -1,71 +0,0 @@ -{ - "deepmd_path": "/home/llang/dp_v2/local/0.12.0/", - "train_machine": { - "machine_type": "pbs", - "hostname" : "localhost", - "port" : 22, - "username": "llang", - "work_path" : "/home/llang/dp_v2/wanghan/tmp/", - "_comment" : "that's all" - }, - "train_resources": { - "numb_node": 1, - "numb_gpu": 0, - "task_per_node":20, - "source_list": [ "/opt/rh/devtoolset-4/enable" ], - "module_list": [ ], - "envs": { - "OMP_NUM_THREADS": 1 - }, - "time_limit": "12:0:0", - "_comment": "that's all" - }, - - "model_devi_command": "/home/llang/dp_v2/local/bin/lmp_mpi_0_12_0", - "model_devi_group_size": 10, - "_comment": "model_devi on localhost ", - "model_devi_machine": { - "machine_type": "pbs", - "hostname" : "localhost", - "port" : 22, - "username": "llang", - "work_path" : "/home/llang/dp_v2/wanghan/tmp/", - "_comment" : "that's all" - }, - "_comment": " if numb_nodes(nn) = 1 multi-threading rather than mpi is assumed", - "model_devi_resources": { - "numb_node": 1, - "numb_gpu": 0, - "task_per_node":1, - "with_mpi": true, - "source_list": [ "/opt/rh/devtoolset-4/enable" ], - "module_list": [ ], - "time_limit": "2:0:0", - "_comment": "that's all" - }, - - - "_comment": "fp on localhost ", - "fp_command": "/opt/software/vasp.5.4.4/bin/vasp_std", - "fp_group_size": 5, - "fp_machine": { - "machine_type": "pbs", - "hostname" : "localhost", - "port" : 22, - "username": "llang", - "work_path" : "/home/llang/dp_v2/wanghan/tmp/", - "_comment" : "that's all" - }, - "fp_resources": { - "numb_node": 1, - "task_per_node":10, - "numb_gpu": 0, - "with_mpi": true, - "source_list": [ "/opt/rh/devtoolset-4/enable" ], - "module_list": [ ], - "time_limit": "2:0:0", - "_comment": "that's all" - }, - - "_comment": " that's all " -} diff --git a/examples/machine/deprecated/machine-tiger-pwscf-della.json 
b/examples/machine/deprecated/machine-tiger-pwscf-della.json deleted file mode 100644 index 44911f487..000000000 --- a/examples/machine/deprecated/machine-tiger-pwscf-della.json +++ /dev/null @@ -1,70 +0,0 @@ -{ - "deepmd_path": "/home/linfengz/SCR/wanghan/local/deeppot/0.11.0-gpu/", - "train_machine": { - "machine_type": "slurm", - "hostname" : "localhost", - "port" : 22, - "username": "linfengz", - "work_path" : "/home/linfengz/SCR/tmp/", - "_comment" : "that's all" - }, - "train_resources": { - "numb_node": 1, - "numb_gpu": 1, - "task_per_node":7, - "source_list": [ "/home/linfengz/SCR/softwares/tensorflow.gpu.1.6/bin/activate" ], - "module_list": [ "cudatoolkit/9.2", "cudnn/cuda-9.2/7.1.4"], - "time_limit": "6:0:0", - "mem_limit": 32, - "_comment": "that's all" - }, - - "model_devi_command": "/home/linfengz/SCR/wanghan/local/bin/lmp_serial_0110_gpu", - "model_devi_group_size": 20, - "_comment": "model_devi on localhost ", - "model_devi_machine": { - "machine_type": "slurm", - "hostname" : "localhost", - "port" : 22, - "username": "linfengz", - "work_path" : "/home/linfengz/SCR/tmp/", - "_comment" : "that's all" - }, - "_comment": " if use GPU, numb_nodes(nn) should always be 1 ", - "_comment": " if numb_nodes(nn) = 1 multi-threading rather than mpi is assumed", - "model_devi_resources": { - "numb_node": 1, - "numb_gpu": 1, - "task_per_node":7, - "source_list": [ ], - "module_list": [ "cudatoolkit/9.2", "cudnn/cuda-9.2/7.1.4"], - "time_limit": "2:0:0", - "mem_limit": 32, - "_comment": "that's all" - }, - - - "_comment": "fp on localhost ", - "fp_command": "/home/linfengz/local/bin/pw.x < input", - "fp_group_size": 2, - "fp_machine": { - "machine_type": "slurm", - "hostname" : "della.princeton.edu", - "port" : 22, - "username": "linfengz", - "work_path" : "/home/linfengz/data.gpfs/remote.subs", - "_comment" : "that's all" - }, - "fp_resources": { - "numb_node": 1, - "task_per_node":4, - "with_mpi": true, - "source_list": [ ], - "module_list": [ "fftw", "intel", "openmpi" ], - "time_limit": "5:0:0", - "mem_limit": 32, - "_comment": "that's all" - }, - - "_comment": " that's all " -} diff --git a/examples/machine/deprecated/machine-tiger-vasp-della.json b/examples/machine/deprecated/machine-tiger-vasp-della.json deleted file mode 100644 index fa1fdf6e9..000000000 --- a/examples/machine/deprecated/machine-tiger-vasp-della.json +++ /dev/null @@ -1,69 +0,0 @@ -{ - "deepmd_path": "/home/linfengz/SCR/wanghan/local/deeppot/0.11.0-gpu/", - "train_machine": { - "machine_type": "slurm", - "hostname" : "localhost", - "port" : 22, - "username": "yixiaoc", - "work_path" : "/home/yixiaoc/SCR/tmp/", - "_comment" : "that's all" - }, - "train_resources": { - "numb_node": 1, - "numb_gpu": 1, - "task_per_node":7, - "source_list": [ "/home/linfengz/SCR/softwares/tensorflow.gpu.1.6/bin/activate" ], - "module_list": [ "cudatoolkit/9.2", "cudnn/cuda-9.2/7.1.4"], - "time_limit": "12:0:0", - "mem_limit": 32, - "_comment": "that's all" - }, - - "model_devi_command": "/home/linfengz/SCR/wanghan/local/bin/lmp_serial_0110_gpu", - "model_devi_group_size": 10, - "_comment": "model_devi on localhost ", - "model_devi_machine": { - "machine_type": "slurm", - "hostname" : "localhost", - "port" : 22, - "username": "yixiaoc", - "work_path" : "/home/yixiaoc/SCR/tmp/", - "_comment" : "that's all" - }, - "_comment": " if use GPU, numb_nodes(nn) should always be 1 ", - "_comment": " if numb_nodes(nn) = 1 multi-threading rather than mpi is assumed", - "model_devi_resources": { - "numb_node": 1, - "numb_gpu": 1, - 
"task_per_node":7, - "source_list": [ ], - "module_list": [ "cudatoolkit/9.2", "cudnn/cuda-9.2/7.1.4"], - "time_limit": "4:0:0", - "mem_limit": 32, - "_comment": "that's all" - }, - - - "_comment": "fp on localhost ", - "fp_command": "/home/linfengz/local/bin/vasp_cpu_kpt_ptch", - "fp_group_size": 1, - "fp_machine": { - "machine_type": "slurm", - "hostname" : "della.princeton.edu", - "port" : 22, - "username": "linfengz", - "work_path" : "/home/linfengz/data.gpfs/remote.subs", - "_comment" : "that's all" - }, - "fp_resources": { - "numb_node": 1, - "task_per_node":16, - "with_mpi": true, - "source_list": [ ], - "module_list": [ "intel/17.0/64/17.0.5.239", "intel-mpi/intel/2017.5/64", "intel-mkl/2017.4/5/64" ], - "time_limit": "6:0:0", - "_comment": "that's all" - }, - - "_comment": " that's all " -} diff --git a/examples/machine/deprecated/machine-tiger.json b/examples/machine/deprecated/machine-tiger.json deleted file mode 100644 index ccc1b573f..000000000 --- a/examples/machine/deprecated/machine-tiger.json +++ /dev/null @@ -1,69 +0,0 @@ -{ - "deepmd_path": "/home/linfengz/SCR/wanghan/local/deeppot/0.11.0-gpu/", - "train_machine": { - "machine_type": "slurm", - "hostname" : "localhost", - "port" : 22, - "username": "linfengz", - "work_path" : "/home/linfengz/SCR/tmp/", - "_comment" : "that's all" - }, - "train_resources": { - "numb_node": 1, - "numb_gpu": 1, - "task_per_node":7, - "source_list": [ "/home/linfengz/SCR/softwares/tensorflow.gpu.1.6/bin/activate" ], - "module_list": [ "cudatoolkit/9.2", "cudnn/cuda-9.2/7.1.4"], - "time_limit": "6:0:0", - "mem_limit": 32, - "_comment": "that's all" - }, - - "model_devi_command": "/home/linfengz/SCR/wanghan/local/bin/lmp_serial_0110_gpu", - "model_devi_group_size": 20, - "_comment": "model_devi on localhost ", - "model_devi_machine": { - "machine_type": "slurm", - "hostname" : "localhost", - "port" : 22, - "username": "linfengz", - "work_path" : "/home/linfengz/SCR/tmp/", - "_comment" : "that's all" - }, - "_comment": " if use GPU, numb_nodes(nn) should always be 1 ", - "_comment": " if numb_nodes(nn) = 1 multi-threading rather than mpi is assumed", - "model_devi_resources": { - "numb_node": 1, - "numb_gpu": 1, - "task_per_node":7, - "source_list": [ ], - "module_list": [ "cudatoolkit/9.2", "cudnn/cuda-9.2/7.1.4"], - "time_limit": "2:0:0", - "mem_limit": 32, - "_comment": "that's all" - }, - - - "_comment": "fp on localhost ", - "fp_command": "/home/linfengz/SCR/wanghan/local/bin/vasp", - "fp_group_size": 5, - "fp_machine": { - "machine_type": "slurm", - "hostname" : "localhost", - "port" : 22, - "username": "linfengz", - "work_path" : "/home/linfengz/SCR/tmp/", - "_comment" : "that's all" - }, - "fp_resources": { - "numb_node": 1, - "task_per_node":1, - "numb_gpu": 1, - "source_list": [ ], - "module_list": ["cudatoolkit/9.2", "cudnn/cuda-9.2/7.1.4", "intel-mkl/2017.4/5/64", "intel/17.0/64/17.0.5.239"], - "time_limit": "2:0:0", - "_comment": "that's all" - }, - - "_comment": " that's all " -} diff --git a/examples/machine/deprecated/machine-ucloud.json b/examples/machine/deprecated/machine-ucloud.json deleted file mode 100644 index 52e9040c1..000000000 --- a/examples/machine/deprecated/machine-ucloud.json +++ /dev/null @@ -1,92 +0,0 @@ -{ - "deepmd_path": "/home/ubuntu/software/deepmd_float/", - "_comment": "training on ucloud ", - "train_machine": { - "machine_type": "ucloud", - "url": "http://api.ucloud.cn", - "work_path" : "/root/", - "Private": "g5GGyzJM3TdVPK338tkXhcUZ4GuyChs2VONcug9kcYohwLAaWWQAWYMwYtMHrPm2", - "ucloud_param": { - 
"Region" : "cn-bj2", - "Zone" : "cn-bj2-04", - "ImageId" : "uimage-z2tlg4", - "ChargeType": "Month", - "GPU" : "1", - "Name" : "train", - "UHostType" : "G2", - "PublicKey" : "71RUR4l/3cFVntcHsMaoQk8qZo6uWDflDI7EAwdWqvdev0KvJek//w==" , - "LoginMode" : "Password", - "Password": "YW5ndXNlMTk5OA==" - }, - "purpose" : "train" , - "_comment" : "that's all" - }, - "train_resources": { - "envs": { - "PATH" : "/usr/local/cuda-9.0/bin:$PATH", - "LD_LIBRARY_PATH" : "/usr/local/cuda-9.0/lib64:$LD_LIBRARY_PATH" - }, - "_comment": "that's all" - }, - - - "model_devi_command": "/usr/bin/lmp_mpi", - "model_devi_group_size": 20, - "model_devi_machine": { - "machine_type": "ucloud", - "url": "http://api.ucloud.cn", - "work_path" : "/root/", - "Private": "g5GGyzJM3TdVPK338tkXhcUZ4GuyChs2VONcug9kcYohwLAaWWQAWYMwYtMHrPm2", - "ucloud_param": { - "Region" : "cn-bj2", - "Zone" : "cn-bj2-05", - "ImageId": "uimage-tnj2gb", - "ChargeType" : "Month", - "Name" : "model", - "PublicKey" : "71RUR4l/3cFVntcHsMaoQk8qZo6uWDflDI7EAwdWqvdev0KvJek//w==" , - "LoginMode" : "Password", - "Password": "YW5ndXNlMTk5OA==" - }, - "purpose" : "model" , - "_comment" : "that's all" - }, - "model_devi_resources": { - "envs": { - "LD_LIBRARY_PATH" : "/home/ubuntu/software/deepmd_float/lib:$LD_LIBRARY_PATH" - }, - "_comment": "that's all" - }, - - - "_comment": "fp on localhost ", - "fp_command": "/usr/bin/vasp_std", - "fp_group_size": 5, - "fp_machine": { - "machine_type": "ucloud", - "url": "http://api.ucloud.cn", - "work_path" : "/root/", - "Private": "g5GGyzJM3TdVPK338tkXhcUZ4GuyChs2VONcug9kcYohwLAaWWQAWYMwYtMHrPm2", - "ucloud_param": { - "Region" : "cn-bj2", - "Zone" : "cn-bj2-05", - "Name": "fp", - "ImageId": "uimage-tnj2gb", - "ChargeType" : "Month", - "PublicKey" : "71RUR4l/3cFVntcHsMaoQk8qZo6uWDflDI7EAwdWqvdev0KvJek//w==" , - "LoginMode" : "Password", - "Password": "YW5ndXNlMTk5OA==" - }, - "purpose" : "fp" , - "_comment" : "that's all" - }, - "fp_resources": { - "task_per_node":8, - "with_mpi": true, - "envs": { - "LD_LIBRARY_PATH" : "/home/ubuntu/software/deepmd_float/lib:$LD_LIBRARY_PATH" - }, - "_comment": "that's all" - }, - - "_comment": " that's all " -} diff --git a/examples/run/dp2.x-gromacs-gaussian/machine.json b/examples/run/dp2.x-gromacs-gaussian/machine.json deleted file mode 100644 index 0f73b2277..000000000 --- a/examples/run/dp2.x-gromacs-gaussian/machine.json +++ /dev/null @@ -1,69 +0,0 @@ -{ - "deepmd_version" : "2.0", - "train": [ - { - "machine": { - "batch": "slurm", - "work_path": "/work/path" - }, - "resources": { - "numb_node": 1, - "numb_gpu": 1, - "partition": "all", - "time_limit": "120:0:0", - "task_per_node": 8, - "exclude_list": [], - "module_list": [], - "source_list": ["/path/to/dp-2.0.env"] - }, - "command": "dp" - } - ], - "model_devi": [ - { - "machine": { - "batch": "slurm", - "work_path": "/work/path" - }, - "resources": { - "numb_node": 1, - "numb_gpu": 1, - "partition": "all", - "time_limit": "120:0:0", - "task_per_node": 8, - "source_list": [ - "/path/to/gromacs-dp/env" - ], - "module_list": [], - "exclude_list": [], - "envs": { - "GMX_DEEPMD_INPUT_JSON": "input.json" - } - }, - "command": "gmx_mpi", - "group_size": 1 - } - ], - "fp": [ - { - "machine": { - "batch": "slurm", - "work_path": "/work/path" - }, - "resources": { - "numb_node": 1, - "numb_gpu": 0, - "time_limit": "120:0:0", - "task_per_node": 28, - "partition": "cpu", - "exclude_list": [], - "source_list": [ - "/path/to/gaussian/bashrc" - ], - "module_list": [] - }, - "command": "g16 < input", - "group_size": 20 - } - ] -} diff 
--git a/tests/dispatcher/__init__.py b/tests/dispatcher/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/dispatcher/context.py b/tests/dispatcher/context.py deleted file mode 100644 index 1ab29dc9a..000000000 --- a/tests/dispatcher/context.py +++ /dev/null @@ -1,16 +0,0 @@ -import sys,os - -sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..'))) - -from dpgen.dispatcher.LocalContext import LocalSession -from dpgen.dispatcher.LocalContext import LocalContext -from dpgen.dispatcher.LazyLocalContext import LazyLocalContext -from dpgen.dispatcher.SSHContext import SSHSession -from dpgen.dispatcher.SSHContext import SSHContext -# from dpgen.dispatcher.Dispatcher import FinRecord -from dpgen.dispatcher.Dispatcher import _split_tasks - -from dpgen.dispatcher.LocalContext import _identical_files - -def setUpModule(): - os.chdir(os.path.abspath(os.path.dirname(__file__))) diff --git a/tests/dispatcher/loc/task0/dir0/test2 b/tests/dispatcher/loc/task0/dir0/test2 deleted file mode 100644 index 48e9eaa49..000000000 --- a/tests/dispatcher/loc/task0/dir0/test2 +++ /dev/null @@ -1 +0,0 @@ -140c75e5-993c-4644-b877-cd3ceb2b254a \ No newline at end of file diff --git a/tests/dispatcher/loc/task0/test0 b/tests/dispatcher/loc/task0/test0 deleted file mode 100644 index 2271a069f..000000000 --- a/tests/dispatcher/loc/task0/test0 +++ /dev/null @@ -1 +0,0 @@ -dfea7618-49df-42ac-b723-f7c04e349203 \ No newline at end of file diff --git a/tests/dispatcher/loc/task0/test1 b/tests/dispatcher/loc/task0/test1 deleted file mode 100644 index 8b014a575..000000000 --- a/tests/dispatcher/loc/task0/test1 +++ /dev/null @@ -1 +0,0 @@ -99cee2e2-0de4-43ba-a296-805f4e551ace \ No newline at end of file diff --git a/tests/dispatcher/loc/task1/dir0/test2 b/tests/dispatcher/loc/task1/dir0/test2 deleted file mode 100644 index abb717f2c..000000000 --- a/tests/dispatcher/loc/task1/dir0/test2 +++ /dev/null @@ -1 +0,0 @@ -0d7eaf5f-0a04-492a-b9ae-c7d77781c928 \ No newline at end of file diff --git a/tests/dispatcher/loc/task1/test0 b/tests/dispatcher/loc/task1/test0 deleted file mode 100644 index c44e41aff..000000000 --- a/tests/dispatcher/loc/task1/test0 +++ /dev/null @@ -1 +0,0 @@ -b96519be-c495-4150-b634-39b61b54ffd9 \ No newline at end of file diff --git a/tests/dispatcher/loc/task1/test1 b/tests/dispatcher/loc/task1/test1 deleted file mode 100644 index 514520d9d..000000000 --- a/tests/dispatcher/loc/task1/test1 +++ /dev/null @@ -1 +0,0 @@ -00bc5947-dfb6-47e4-909e-3c647b551c82 \ No newline at end of file diff --git a/tests/dispatcher/lsf/context.py b/tests/dispatcher/lsf/context.py deleted file mode 100644 index bddf23c43..000000000 --- a/tests/dispatcher/lsf/context.py +++ /dev/null @@ -1,19 +0,0 @@ -import sys,os - -sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', '..'))) - -from dpgen.dispatcher.LocalContext import LocalSession -from dpgen.dispatcher.SSHContext import SSHSession -from dpgen.dispatcher.LocalContext import LocalContext -from dpgen.dispatcher.SSHContext import SSHContext -from dpgen.dispatcher.LSF import LSF -from dpgen.dispatcher.Dispatcher import Dispatcher -from dpgen.dispatcher.JobStatus import JobStatus - -def my_file_cmp(test, f0, f1): - with open(f0) as fp0 : - with open(f1) as fp1: - test.assertTrue(fp0.read() == fp1.read()) - -def setUpModule(): - os.chdir(os.path.abspath(os.path.dirname(__file__))) diff --git a/tests/dispatcher/lsf/test_dispatcher.py b/tests/dispatcher/lsf/test_dispatcher.py deleted file 
mode 100644 index 7f0cff086..000000000 --- a/tests/dispatcher/lsf/test_dispatcher.py +++ /dev/null @@ -1,46 +0,0 @@ -import os,sys,json,glob,shutil,uuid,time -import unittest - -sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) -__package__ = 'lsf' -from .context import LocalSession -from .context import LocalContext -from .context import LSF -from .context import JobStatus -from .context import Dispatcher -from .context import my_file_cmp -from .context import setUpModule - -class TestDispatcher(unittest.TestCase) : - def setUp(self) : - os.makedirs('loc', exist_ok = True) - os.makedirs('rmt', exist_ok = True) - os.makedirs('loc/task0', exist_ok = True) - os.makedirs('loc/task1', exist_ok = True) - os.makedirs('loc/task2', exist_ok = True) - for ii in ['loc/task0', 'loc/task1', 'loc/task2']: - with open(os.path.join(ii, 'test0'),'w') as fp: - fp.write('this is test0 from ' + ii + '\n') - work_profile = {'work_path':'rmt'} - self.disp = Dispatcher(work_profile, 'local', 'lsf') - - @unittest.skipIf(not shutil.which("bsub"), "requires LSF") - def test_sub_success(self): - tasks = ['task0', 'task1', 'task2'] - self.disp.run_jobs(None, - 'cp test0 test1', - 'loc', - tasks, - 2, - [], - ['test0'], - ['test1', 'hereout.log', 'hereerr.log'], - outlog = 'hereout.log', - errlog = 'hereerr.log') - for ii in tasks: - my_file_cmp(self, - os.path.join('loc', ii, 'test0'), - os.path.join('loc', ii, 'test1')) - self.assertTrue(os.path.isfile(os.path.join('loc', ii, 'hereout.log'))) - self.assertTrue(os.path.isfile(os.path.join('loc', ii, 'hereerr.log'))) - diff --git a/tests/dispatcher/lsf/test_lsf_local.py b/tests/dispatcher/lsf/test_lsf_local.py deleted file mode 100644 index e036042b4..000000000 --- a/tests/dispatcher/lsf/test_lsf_local.py +++ /dev/null @@ -1,106 +0,0 @@ -import os,sys,json,glob,shutil,uuid,time -import unittest - -sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) -__package__ = 'lsf' -from .context import LocalSession -from .context import LocalContext -from .context import LSF -from .context import JobStatus -from .context import setUpModule - -class TestLSF(unittest.TestCase) : - def setUp(self) : - os.makedirs('loc', exist_ok = True) - os.makedirs('rmt', exist_ok = True) - os.makedirs('loc/task0', exist_ok = True) - os.makedirs('loc/task1', exist_ok = True) - for ii in ['loc/task0', 'loc/task1']: - with open(os.path.join(ii, 'test0'),'w') as fp: - fp.write(str(uuid.uuid4())) - work_profile = LocalSession({'work_path':'rmt'}) - self.ctx = LocalContext('loc', work_profile) - self.lsf = LSF(self.ctx) - - def tearDown(self): - shutil.rmtree('loc') - shutil.rmtree('rmt') - if os.path.exists('dpgen.log'): - os.remove('dpgen.log') - - def test_gen_sub_script(self): - job_dirs = ['task0', 'task1'] - self.lsf.context.upload(job_dirs, ['test0']) - ret = self.lsf.sub_script(job_dirs, ['touch test1', 'touch test2']) - self.lsf.context.write_file('run.sub', ret) - with open('run.sub', 'w') as fp: - fp.write(ret) - - @unittest.skipIf(not shutil.which("bsub"), "requires LSF") - def test_sub_success(self) : - job_dirs = ['task0', 'task1'] - self.lsf.context.upload(job_dirs, ['test0']) - self.lsf.submit(job_dirs, ['touch test1', 'touch test2']) - while True: - ret = self.lsf.check_status() - if ret == JobStatus.finished : - break - time.sleep(1) - self.assertTrue(os.path.isfile(os.path.join('rmt', self.lsf.context.remote_root, 'task0/tag_0_finished'))) - self.assertTrue(os.path.isfile(os.path.join('rmt', 
self.lsf.context.remote_root, 'task0/tag_1_finished'))) - self.assertTrue(os.path.isfile(os.path.join('rmt', self.lsf.context.remote_root, 'task1/tag_0_finished'))) - self.assertTrue(os.path.isfile(os.path.join('rmt', self.lsf.context.remote_root, 'task1/tag_1_finished'))) - self.assertTrue(os.path.isfile(os.path.join('rmt', self.lsf.context.remote_root, 'tag_finished'))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.lsf.context.remote_root, 'task0/test1'))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.lsf.context.remote_root, 'task1/test1'))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.lsf.context.remote_root, 'task0/test2'))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.lsf.context.remote_root, 'task1/test2'))) - - @unittest.skipIf(not shutil.which("bsub"), "requires LSF") - def test_sub_bkill(self) : - job_dirs = ['task0', 'task1'] - self.lsf.context.upload(job_dirs, ['test0']) - # sub - self.lsf.submit(job_dirs, ['touch test1', 'sleep 10']) - while True: - ret = self.lsf.check_status() - if ret == JobStatus.finished : - raise RuntimeError('should not finished') - if ret == JobStatus.running : - # wait for file writing - time.sleep(2) - job_id = self.lsf._get_job_id() - os.system('bkill ' + job_id) - break - time.sleep(1) - while True: - ret = self.lsf.check_status() - if ret == JobStatus.terminated : - break - time.sleep(1) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.lsf.context.remote_root, 'task0/tag_0_finished'))) - self.assertFalse(os.path.isfile(os.path.join('rmt', self.lsf.context.remote_root, 'task0/tag_1_finished'))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.lsf.context.remote_root, 'task1/tag_0_finished'))) - self.assertFalse(os.path.isfile(os.path.join('rmt', self.lsf.context.remote_root, 'task1/tag_1_finished'))) - self.assertFalse(os.path.isfile(os.path.join('rmt', self.lsf.context.remote_root, 'tag_finished'))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.lsf.context.remote_root, 'task0/test1'))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.lsf.context.remote_root, 'task1/test1'))) - self.assertFalse(os.path.isfile(os.path.join('rmt', self.lsf.context.remote_root, 'task0/test2'))) - self.assertFalse(os.path.isfile(os.path.join('rmt', self.lsf.context.remote_root, 'task1/test2'))) - # sub restart - self.lsf.submit(job_dirs, ['rm test1', 'touch test2'], restart = True) - while True: - ret = self.lsf.check_status() - if ret == JobStatus.finished : - break - time.sleep(1) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.lsf.context.remote_root, 'task0/tag_0_finished'))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.lsf.context.remote_root, 'task0/tag_1_finished'))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.lsf.context.remote_root, 'task1/tag_0_finished'))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.lsf.context.remote_root, 'task1/tag_1_finished'))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.lsf.context.remote_root, 'tag_finished'))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.lsf.context.remote_root, 'task0/test1'))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.lsf.context.remote_root, 'task1/test1'))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.lsf.context.remote_root, 'task0/test2'))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.lsf.context.remote_root, 'task1/test2'))) - diff --git a/tests/dispatcher/pbs/context.py 
b/tests/dispatcher/pbs/context.py deleted file mode 100644 index b9b96469e..000000000 --- a/tests/dispatcher/pbs/context.py +++ /dev/null @@ -1,19 +0,0 @@ -import sys,os - -sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', '..'))) - -from dpgen.dispatcher.LocalContext import LocalSession -from dpgen.dispatcher.SSHContext import SSHSession -from dpgen.dispatcher.LocalContext import LocalContext -from dpgen.dispatcher.SSHContext import SSHContext -from dpgen.dispatcher.PBS import PBS -from dpgen.dispatcher.Dispatcher import Dispatcher -from dpgen.dispatcher.JobStatus import JobStatus - -def my_file_cmp(test, f0, f1): - with open(f0) as fp0 : - with open(f1) as fp1: - test.assertTrue(fp0.read() == fp1.read()) - -def setUpModule(): - os.chdir(os.path.abspath(os.path.dirname(__file__))) diff --git a/tests/dispatcher/pbs/test_dispatcher.py b/tests/dispatcher/pbs/test_dispatcher.py deleted file mode 100644 index 94832d24a..000000000 --- a/tests/dispatcher/pbs/test_dispatcher.py +++ /dev/null @@ -1,46 +0,0 @@ -import os,sys,json,glob,shutil,uuid,time -import unittest - -sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) -__package__ = 'pbs' -from .context import LocalSession -from .context import LocalContext -from .context import PBS -from .context import JobStatus -from .context import Dispatcher -from .context import my_file_cmp -from .context import setUpModule - -@unittest.skipIf(not shutil.which("qsub"), "requires PBS") -class TestDispatcher(unittest.TestCase) : - def setUp(self) : - os.makedirs('loc', exist_ok = True) - os.makedirs('rmt', exist_ok = True) - os.makedirs('loc/task0', exist_ok = True) - os.makedirs('loc/task1', exist_ok = True) - os.makedirs('loc/task2', exist_ok = True) - for ii in ['loc/task0', 'loc/task1', 'loc/task2']: - with open(os.path.join(ii, 'test0'),'w') as fp: - fp.write('this is test0 from ' + ii + '\n') - work_profile = {'work_path':'rmt'} - self.disp = Dispatcher(work_profile, 'local', 'pbs') - - def test_sub_success(self): - tasks = ['task0', 'task1', 'task2'] - self.disp.run_jobs(None, - 'cp test0 test1', - 'loc', - tasks, - 2, - [], - ['test0'], - ['test1', 'hereout.log', 'hereerr.log'], - outlog = 'hereout.log', - errlog = 'hereerr.log') - for ii in tasks: - my_file_cmp(self, - os.path.join('loc', ii, 'test0'), - os.path.join('loc', ii, 'test1')) - self.assertTrue(os.path.isfile(os.path.join('loc', ii, 'hereout.log'))) - self.assertTrue(os.path.isfile(os.path.join('loc', ii, 'hereerr.log'))) - diff --git a/tests/dispatcher/pbs/test_pbs_local.py b/tests/dispatcher/pbs/test_pbs_local.py deleted file mode 100644 index 9ffc68c47..000000000 --- a/tests/dispatcher/pbs/test_pbs_local.py +++ /dev/null @@ -1,100 +0,0 @@ -import os,sys,json,glob,shutil,uuid,time -import unittest - -sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) -__package__ = 'pbs' -from .context import LocalSession -from .context import LocalContext -from .context import PBS -from .context import JobStatus -from .context import setUpModule - -@unittest.skipIf(not shutil.which("qsub"), "requires PBS") -class TestPBS(unittest.TestCase) : - def setUp(self) : - os.makedirs('loc', exist_ok = True) - os.makedirs('rmt', exist_ok = True) - os.makedirs('loc/task0', exist_ok = True) - os.makedirs('loc/task1', exist_ok = True) - for ii in ['loc/task0', 'loc/task1']: - with open(os.path.join(ii, 'test0'),'w') as fp: - fp.write(str(uuid.uuid4())) - work_profile = LocalSession({'work_path':'rmt'}) - self.ctx = 
LocalContext('loc', work_profile) - self.pbs = PBS(self.ctx) - - def tearDown(self): - shutil.rmtree('loc') - shutil.rmtree('rmt') - if os.path.exists('dpgen.log'): - os.remove('dpgen.log') - - def test_gen_sub_script(self): - job_dirs = ['task0', 'task1'] - self.pbs.context.upload(job_dirs, ['test0']) - ret = self.pbs.sub_script(job_dirs, ['touch test1', 'touch test2']) - self.pbs.context.write_file('run.sub', ret) - with open('run.sub', 'w') as fp: - fp.write(ret) - - # def test_sub_success(self) : - # job_dirs = ['task0', 'task1'] - # self.pbs.context.upload(job_dirs, ['test0']) - # self.pbs.submit(job_dirs, ['touch test1', 'touch test2']) - # while True: - # ret = self.pbs.check_status() - # if ret == JobStatus.finished : - # break - # time.sleep(1) - # self.assertTrue(os.path.isfile(os.path.join('rmt', self.pbs.context.remote_root, 'task0/tag_0_finished'))) - # self.assertTrue(os.path.isfile(os.path.join('rmt', self.pbs.context.remote_root, 'task0/tag_1_finished'))) - # self.assertTrue(os.path.isfile(os.path.join('rmt', self.pbs.context.remote_root, 'task1/tag_0_finished'))) - # self.assertTrue(os.path.isfile(os.path.join('rmt', self.pbs.context.remote_root, 'task1/tag_1_finished'))) - # self.assertTrue(os.path.isfile(os.path.join('rmt', self.pbs.context.remote_root, 'tag_finished'))) - # self.assertTrue (os.path.isfile(os.path.join('rmt', self.pbs.context.remote_root, 'task0/test1'))) - # self.assertTrue (os.path.isfile(os.path.join('rmt', self.pbs.context.remote_root, 'task1/test1'))) - # self.assertTrue (os.path.isfile(os.path.join('rmt', self.pbs.context.remote_root, 'task0/test2'))) - # self.assertTrue (os.path.isfile(os.path.join('rmt', self.pbs.context.remote_root, 'task1/test2'))) - - # def test_sub_scancel(self) : - # job_dirs = ['task0', 'task1'] - # self.pbs.context.upload(job_dirs, ['test0']) - # # sub - # self.pbs.submit(job_dirs, ['touch test1', 'sleep 10']) - # while True: - # ret = self.pbs.check_status() - # if ret == JobStatus.finished : - # raise RuntimeError('should not finished') - # if ret == JobStatus.running : - # # wait for file writing - # time.sleep(2) - # job_id = self.pbs._get_job_id() - # os.system('scancel ' + job_id) - # break - # time.sleep(1) - # self.assertTrue (os.path.isfile(os.path.join('rmt', self.pbs.context.remote_root, 'task0/tag_0_finished'))) - # self.assertFalse(os.path.isfile(os.path.join('rmt', self.pbs.context.remote_root, 'task0/tag_1_finished'))) - # self.assertTrue (os.path.isfile(os.path.join('rmt', self.pbs.context.remote_root, 'task1/tag_0_finished'))) - # self.assertFalse(os.path.isfile(os.path.join('rmt', self.pbs.context.remote_root, 'task1/tag_1_finished'))) - # self.assertFalse(os.path.isfile(os.path.join('rmt', self.pbs.context.remote_root, 'tag_finished'))) - # self.assertTrue (os.path.isfile(os.path.join('rmt', self.pbs.context.remote_root, 'task0/test1'))) - # self.assertTrue (os.path.isfile(os.path.join('rmt', self.pbs.context.remote_root, 'task1/test1'))) - # self.assertFalse(os.path.isfile(os.path.join('rmt', self.pbs.context.remote_root, 'task0/test2'))) - # self.assertFalse(os.path.isfile(os.path.join('rmt', self.pbs.context.remote_root, 'task1/test2'))) - # # sub restart - # self.pbs.submit(job_dirs, ['rm test1', 'touch test2'], restart = True) - # while True: - # ret = self.pbs.check_status() - # if ret == JobStatus.finished : - # break - # time.sleep(1) - # self.assertTrue (os.path.isfile(os.path.join('rmt', self.pbs.context.remote_root, 'task0/tag_0_finished'))) - # self.assertTrue 
(os.path.isfile(os.path.join('rmt', self.pbs.context.remote_root, 'task0/tag_1_finished'))) - # self.assertTrue (os.path.isfile(os.path.join('rmt', self.pbs.context.remote_root, 'task1/tag_0_finished'))) - # self.assertTrue (os.path.isfile(os.path.join('rmt', self.pbs.context.remote_root, 'task1/tag_1_finished'))) - # self.assertTrue (os.path.isfile(os.path.join('rmt', self.pbs.context.remote_root, 'tag_finished'))) - # self.assertTrue (os.path.isfile(os.path.join('rmt', self.pbs.context.remote_root, 'task0/test1'))) - # self.assertTrue (os.path.isfile(os.path.join('rmt', self.pbs.context.remote_root, 'task1/test1'))) - # self.assertTrue (os.path.isfile(os.path.join('rmt', self.pbs.context.remote_root, 'task0/test2'))) - # self.assertTrue (os.path.isfile(os.path.join('rmt', self.pbs.context.remote_root, 'task1/test2'))) - diff --git a/tests/dispatcher/shell/context.py b/tests/dispatcher/shell/context.py deleted file mode 100644 index f9ceec793..000000000 --- a/tests/dispatcher/shell/context.py +++ /dev/null @@ -1,20 +0,0 @@ -import sys,os - -sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', '..'))) - -from dpgen.dispatcher.SSHContext import SSHSession -from dpgen.dispatcher.SSHContext import SSHContext -from dpgen.dispatcher.LocalContext import LocalSession -from dpgen.dispatcher.LocalContext import LocalContext -from dpgen.dispatcher.Shell import Shell -from dpgen.dispatcher.JobStatus import JobStatus -from dpgen.dispatcher.Dispatcher import Dispatcher - -def my_file_cmp(test, f0, f1): - with open(f0) as fp0 : - with open(f1) as fp1: - test.assertTrue(fp0.read() == fp1.read()) - -def setUpModule(): - os.chdir(os.path.abspath(os.path.dirname(__file__))) - diff --git a/tests/dispatcher/shell/test_dispatcher.py b/tests/dispatcher/shell/test_dispatcher.py deleted file mode 100644 index 6d7b642ab..000000000 --- a/tests/dispatcher/shell/test_dispatcher.py +++ /dev/null @@ -1,45 +0,0 @@ -import os,sys,json,glob,shutil,uuid,time -import unittest - -sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) -__package__ = 'shell' -from .context import LocalSession -from .context import LocalContext -from .context import Shell -from .context import JobStatus -from .context import Dispatcher -from .context import my_file_cmp -from .context import setUpModule - -class TestDispatcher(unittest.TestCase) : - def setUp(self) : - os.makedirs('loc', exist_ok = True) - os.makedirs('rmt', exist_ok = True) - os.makedirs('loc/task0', exist_ok = True) - os.makedirs('loc/task1', exist_ok = True) - os.makedirs('loc/task2', exist_ok = True) - for ii in ['loc/task0', 'loc/task1', 'loc/task2']: - with open(os.path.join(ii, 'test0'),'w') as fp: - fp.write('this is test0 from ' + ii + '\n') - work_profile = {'work_path':'rmt'} - self.disp = Dispatcher(work_profile, context_type = 'local', batch_type = 'shell') - - def test_sub_success(self): - tasks = ['task0', 'task1', 'task2'] - self.disp.run_jobs(None, - 'cp test0 test1', - 'loc', - tasks, - 2, - [], - ['test0'], - ['test1', 'hereout.log', 'hereerr.log'], - outlog = 'hereout.log', - errlog = 'hereerr.log') - for ii in tasks: - my_file_cmp(self, - os.path.join('loc', ii, 'test0'), - os.path.join('loc', ii, 'test1')) - self.assertTrue(os.path.isfile(os.path.join('loc', ii, 'hereout.log'))) - self.assertTrue(os.path.isfile(os.path.join('loc', ii, 'hereerr.log'))) - diff --git a/tests/dispatcher/shell/test_shell_local.py b/tests/dispatcher/shell/test_shell_local.py deleted file mode 100644 index 
4c47136c1..000000000 --- a/tests/dispatcher/shell/test_shell_local.py +++ /dev/null @@ -1,125 +0,0 @@ -import os,sys,json,glob,shutil,uuid,time -import unittest - -sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) -__package__ = 'shell' -from .context import LocalSession -from .context import LocalContext -from .context import Shell -from .context import JobStatus -from .context import my_file_cmp -from .context import setUpModule - -class TestShell(unittest.TestCase) : - def setUp(self) : - os.makedirs('loc', exist_ok = True) - os.makedirs('rmt', exist_ok = True) - os.makedirs('loc/task0', exist_ok = True) - os.makedirs('loc/task1', exist_ok = True) - for ii in ['loc/task0', 'loc/task1']: - with open(os.path.join(ii, 'test0'),'w') as fp: - fp.write(str(uuid.uuid4())) - work_profile = LocalSession({'work_path':'rmt'}) - self.ctx = LocalContext('loc', work_profile) - self.shell = Shell(self.ctx) - - def tearDown(self): - shutil.rmtree('loc') - shutil.rmtree('rmt') - if os.path.exists('dpgen.log'): - os.remove('dpgen.log') - if os.path.exists('run.sub'): - os.remove('run.sub') - if os.path.exists('run.sub.1'): - os.remove('run.sub.1') - - def test_manual_cuda_devices(self): - job_dirs = ['task0', 'task1'] - res = {'manual_cuda_devices': 3} - ret = self.shell.sub_script(job_dirs, ['touch test1', 'touch test2'], res = res) - with open('run.sub.gpu', 'w') as fp: - fp.write(ret) - - def test_manual_cuda_multiplicity(self): - job_dirs = ['task0', 'task1', 'task2', 'task3'] - res = {'manual_cuda_devices': 2, 'manual_cuda_multiplicity': 2} - ret = self.shell.sub_script(job_dirs, ['touch test1', 'touch test2'], res = res) - with open('run.sub.gpu.multi', 'w') as fp: - fp.write(ret) - - def test_gen_sub_script(self): - job_dirs = ['task0', 'task1'] - self.shell.context.upload(job_dirs, ['test0']) - ret = self.shell.sub_script(job_dirs, ['touch test1', 'touch test2']) - with open('run.sub', 'w') as fp: - fp.write(ret) - ret1 = self.shell.sub_script(job_dirs, ['touch', 'touch'], args = [['test1 ', 'test1 '], ['test2 ', 'test2 ']]) - with open('run.sub.1', 'w') as fp: - fp.write(ret1) - time.sleep(1) - my_file_cmp(self, 'run.sub.1', 'run.sub') - # with open('run.sub', 'w') as fp: - # fp.write(ret) - - def test_sub_success(self) : - job_dirs = ['task0', 'task1'] - self.shell.context.upload(job_dirs, ['test0']) - self.shell.submit(job_dirs, ['touch test1', 'touch test2']) - while True: - ret = self.shell.check_status() - if ret == JobStatus.finished : - break - time.sleep(1) - self.assertTrue(os.path.isfile(os.path.join('rmt', self.shell.context.remote_root, 'task0/tag_0_finished'))) - self.assertTrue(os.path.isfile(os.path.join('rmt', self.shell.context.remote_root, 'task0/tag_1_finished'))) - self.assertTrue(os.path.isfile(os.path.join('rmt', self.shell.context.remote_root, 'task1/tag_0_finished'))) - self.assertTrue(os.path.isfile(os.path.join('rmt', self.shell.context.remote_root, 'task1/tag_1_finished'))) - self.assertTrue(os.path.isfile(os.path.join('rmt', self.shell.context.remote_root, '%s_tag_finished' % self.shell.context.job_uuid))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.shell.context.remote_root, 'task0/test1'))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.shell.context.remote_root, 'task1/test1'))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.shell.context.remote_root, 'task0/test2'))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.shell.context.remote_root, 'task1/test2'))) - - - def 
test_sub_scancel(self) : - job_dirs = ['task0', 'task1'] - self.shell.context.upload(job_dirs, ['test0']) - # sub - self.shell.submit(job_dirs, ['touch test1', 'sleep 10']) - while True: - ret = self.shell.check_status() - if ret == JobStatus.finished : - raise RuntimeError('should not finished') - if ret == JobStatus.running : - # wait for file writing - time.sleep(2) - # kill job - self.shell.context.kill(self.shell.proc) - break - time.sleep(1) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.shell.context.remote_root, 'task0/tag_0_finished'))) - self.assertFalse(os.path.isfile(os.path.join('rmt', self.shell.context.remote_root, 'task0/tag_1_finished'))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.shell.context.remote_root, 'task1/tag_0_finished'))) - self.assertFalse(os.path.isfile(os.path.join('rmt', self.shell.context.remote_root, 'task1/tag_1_finished'))) - self.assertFalse(os.path.isfile(os.path.join('rmt', self.shell.context.remote_root, '%s_tag_finished' % self.shell.context.job_uuid))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.shell.context.remote_root, 'task0/test1'))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.shell.context.remote_root, 'task1/test1'))) - self.assertFalse(os.path.isfile(os.path.join('rmt', self.shell.context.remote_root, 'task0/test2'))) - self.assertFalse(os.path.isfile(os.path.join('rmt', self.shell.context.remote_root, 'task1/test2'))) - # sub restart - self.shell.submit(job_dirs, ['rm test1', 'touch test2'], restart = True) - while True: - ret = self.shell.check_status() - if ret == JobStatus.finished : - break - time.sleep(1) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.shell.context.remote_root, 'task0/tag_0_finished'))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.shell.context.remote_root, 'task0/tag_1_finished'))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.shell.context.remote_root, 'task1/tag_0_finished'))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.shell.context.remote_root, 'task1/tag_1_finished'))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.shell.context.remote_root, '%s_tag_finished' % self.shell.context.job_uuid))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.shell.context.remote_root, 'task0/test1'))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.shell.context.remote_root, 'task1/test1'))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.shell.context.remote_root, 'task0/test2'))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.shell.context.remote_root, 'task1/test2'))) - diff --git a/tests/dispatcher/shell/test_shell_ssh.py b/tests/dispatcher/shell/test_shell_ssh.py deleted file mode 100644 index 7b9f0773b..000000000 --- a/tests/dispatcher/shell/test_shell_ssh.py +++ /dev/null @@ -1,113 +0,0 @@ -import os,sys,json,glob,shutil,uuid,time,getpass -import unittest - -sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) -__package__ = 'shell' -from .context import SSHSession -from .context import SSHContext -from .context import Shell -from .context import JobStatus -from .context import setUpModule - -class TestShell(unittest.TestCase) : - def setUp(self) : - os.makedirs('loc', exist_ok = True) - os.makedirs('rmt', exist_ok = True) - os.makedirs('loc/task0', exist_ok = True) - os.makedirs('loc/task1', exist_ok = True) - for ii in ['loc/task0', 'loc/task1']: - with open(os.path.join(ii, 'test0'),'w') as fp: - fp.write(str(uuid.uuid4())) - 
port = 22 - try : - ssh_session = SSHSession({'hostname' : 'localhost', - 'port': port, - 'username' : getpass.getuser(), - 'work_path' : os.path.join(os.getcwd(), 'rmt')}) - except Exception: - ssh_session = SSHSession({'hostname' : 'localhost', - 'port': 5566, - 'username' : getpass.getuser(), - 'work_path' : os.path.join(os.getcwd(), 'rmt')}) - self.ctx = SSHContext('loc', ssh_session) - self.shell = Shell(self.ctx) - - def tearDown(self): - shutil.rmtree('loc') - shutil.rmtree('rmt') - if os.path.exists('dpgen.log'): - os.remove('dpgen.log') - - def test_gen_sub_script(self): - job_dirs = ['task0', 'task1'] - self.shell.context.upload(job_dirs, ['test0']) - ret = self.shell.sub_script(job_dirs, ['touch test1', 'touch test2']) - self.shell.context.write_file('run.sub', ret) - # with open('run.sub', 'w') as fp: - # fp.write(ret) - - def test_sub_success(self) : - job_dirs = ['task0', 'task1'] - self.shell.context.upload(job_dirs, ['test0']) - self.shell.submit(job_dirs, ['touch test1', 'touch test2']) - while True: - ret = self.shell.check_status() - if ret == JobStatus.finished : - break - time.sleep(1) - self.assertTrue(os.path.isfile(os.path.join('rmt', self.shell.context.remote_root, 'task0/tag_0_finished'))) - self.assertTrue(os.path.isfile(os.path.join('rmt', self.shell.context.remote_root, 'task0/tag_1_finished'))) - self.assertTrue(os.path.isfile(os.path.join('rmt', self.shell.context.remote_root, 'task1/tag_0_finished'))) - self.assertTrue(os.path.isfile(os.path.join('rmt', self.shell.context.remote_root, 'task1/tag_1_finished'))) - self.assertTrue(os.path.isfile(os.path.join('rmt', self.shell.context.remote_root, '%s_tag_finished' % self.shell.context.job_uuid))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.shell.context.remote_root, 'task0/test1'))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.shell.context.remote_root, 'task1/test1'))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.shell.context.remote_root, 'task0/test2'))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.shell.context.remote_root, 'task1/test2'))) - - - # def test_sub_scancel(self) : - # job_dirs = ['task0', 'task1'] - # self.shell.context.upload(job_dirs, ['test0']) - # # sub - # self.shell.submit(job_dirs, ['touch test1', 'sleep 10']) - # while True: - # ret = self.shell.check_status() - # if ret == JobStatus.finished : - # raise RuntimeError('should not finished') - # if ret == JobStatus.running : - # # wait for file writing - # time.sleep(2) - # # kill job - # ################################################## - # # problematic killing remotly - # ################################################## - # self.shell.context.kill(self.shell.proc) - # break - # time.sleep(1) - # self.assertTrue (os.path.isfile(os.path.join('rmt', self.shell.context.remote_root, 'task0/tag_0_finished'))) - # self.assertFalse(os.path.isfile(os.path.join('rmt', self.shell.context.remote_root, 'task0/tag_1_finished'))) - # self.assertTrue (os.path.isfile(os.path.join('rmt', self.shell.context.remote_root, 'task1/tag_0_finished'))) - # self.assertFalse(os.path.isfile(os.path.join('rmt', self.shell.context.remote_root, 'task1/tag_1_finished'))) - # self.assertFalse(os.path.isfile(os.path.join('rmt', self.shell.context.remote_root, 'tag_finished'))) - # self.assertTrue (os.path.isfile(os.path.join('rmt', self.shell.context.remote_root, 'task0/test1'))) - # self.assertTrue (os.path.isfile(os.path.join('rmt', self.shell.context.remote_root, 'task1/test1'))) - # 
self.assertFalse(os.path.isfile(os.path.join('rmt', self.shell.context.remote_root, 'task0/test2'))) - # self.assertFalse(os.path.isfile(os.path.join('rmt', self.shell.context.remote_root, 'task1/test2'))) - # # sub restart - # self.shell.submit(job_dirs, ['rm test1', 'touch test2'], restart = True) - # while True: - # ret = self.shell.check_status() - # if ret == JobStatus.finished : - # break - # time.sleep(1) - # self.assertTrue (os.path.isfile(os.path.join('rmt', self.shell.context.remote_root, 'task0/tag_0_finished'))) - # self.assertTrue (os.path.isfile(os.path.join('rmt', self.shell.context.remote_root, 'task0/tag_1_finished'))) - # self.assertTrue (os.path.isfile(os.path.join('rmt', self.shell.context.remote_root, 'task1/tag_0_finished'))) - # self.assertTrue (os.path.isfile(os.path.join('rmt', self.shell.context.remote_root, 'task1/tag_1_finished'))) - # self.assertTrue (os.path.isfile(os.path.join('rmt', self.shell.context.remote_root, 'tag_finished'))) - # self.assertTrue (os.path.isfile(os.path.join('rmt', self.shell.context.remote_root, 'task0/test1'))) - # self.assertTrue (os.path.isfile(os.path.join('rmt', self.shell.context.remote_root, 'task1/test1'))) - # self.assertTrue (os.path.isfile(os.path.join('rmt', self.shell.context.remote_root, 'task0/test2'))) - # self.assertTrue (os.path.isfile(os.path.join('rmt', self.shell.context.remote_root, 'task1/test2'))) - diff --git a/tests/dispatcher/slurm/context.py b/tests/dispatcher/slurm/context.py deleted file mode 100644 index e608d2a7a..000000000 --- a/tests/dispatcher/slurm/context.py +++ /dev/null @@ -1,20 +0,0 @@ -import sys,os - -sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', '..'))) - -from dpgen.dispatcher.LocalContext import LocalSession -from dpgen.dispatcher.SSHContext import SSHSession -from dpgen.dispatcher.LocalContext import LocalContext -from dpgen.dispatcher.LazyLocalContext import LazyLocalContext -from dpgen.dispatcher.SSHContext import SSHContext -from dpgen.dispatcher.Slurm import Slurm -from dpgen.dispatcher.Dispatcher import Dispatcher -from dpgen.dispatcher.JobStatus import JobStatus - -def my_file_cmp(test, f0, f1): - with open(f0) as fp0 : - with open(f1) as fp1: - test.assertTrue(fp0.read() == fp1.read()) - -def setUpModule(): - os.chdir(os.path.abspath(os.path.dirname(__file__))) diff --git a/tests/dispatcher/slurm/test_dispatcher.py b/tests/dispatcher/slurm/test_dispatcher.py deleted file mode 100644 index 3009eed8b..000000000 --- a/tests/dispatcher/slurm/test_dispatcher.py +++ /dev/null @@ -1,52 +0,0 @@ -import os,sys,json,glob,shutil,uuid,time -import unittest - -sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) -__package__ = 'slurm' -from .context import LocalSession -from .context import LocalContext -from .context import Slurm -from .context import JobStatus -from .context import Dispatcher -from .context import my_file_cmp -from .context import setUpModule - -@unittest.skipIf(not shutil.which("sbatch"), "requires Slurm") -class TestDispatcher(unittest.TestCase) : - def setUp(self) : - os.makedirs('loc', exist_ok = True) - os.makedirs('rmt', exist_ok = True) - os.makedirs('loc/task0', exist_ok = True) - os.makedirs('loc/task1', exist_ok = True) - os.makedirs('loc/task2', exist_ok = True) - for ii in ['loc/task0', 'loc/task1', 'loc/task2']: - with open(os.path.join(ii, 'test0'),'w') as fp: - fp.write('this is test0 from ' + ii + '\n') - work_profile = {'work_path':'rmt'} - self.disp = Dispatcher(work_profile, 'local', 
'slurm') - - def tearDown(self): - shutil.rmtree('loc') - shutil.rmtree('rmt') - if os.path.exists('dpgen.log'): - os.remove('dpgen.log') - - def test_sub_success(self): - tasks = ['task0', 'task1', 'task2'] - self.disp.run_jobs(None, - 'cp test0 test1', - 'loc', - tasks, - 2, - [], - ['test0'], - ['test1', 'hereout.log', 'hereerr.log'], - outlog = 'hereout.log', - errlog = 'hereerr.log') - for ii in tasks: - my_file_cmp(self, - os.path.join('loc', ii, 'test0'), - os.path.join('loc', ii, 'test1')) - self.assertTrue(os.path.isfile(os.path.join('loc', ii, 'hereout.log'))) - self.assertTrue(os.path.isfile(os.path.join('loc', ii, 'hereerr.log'))) - diff --git a/tests/dispatcher/slurm/test_dispatcher_lazy_local.py b/tests/dispatcher/slurm/test_dispatcher_lazy_local.py deleted file mode 100644 index 89fd9b9a4..000000000 --- a/tests/dispatcher/slurm/test_dispatcher_lazy_local.py +++ /dev/null @@ -1,52 +0,0 @@ -import os,sys,json,glob,shutil,uuid,time -import unittest - -sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) -__package__ = 'slurm' -from .context import LocalSession -from .context import LocalContext -from .context import Slurm -from .context import JobStatus -from .context import Dispatcher -from .context import my_file_cmp -from .context import setUpModule - -@unittest.skipIf(not shutil.which("sbatch"), "requires Slurm") -class TestDispatcher(unittest.TestCase) : - def setUp(self) : - os.makedirs('loc', exist_ok = True) - os.makedirs('rmt', exist_ok = True) - os.makedirs('loc/task0', exist_ok = True) - os.makedirs('loc/task1', exist_ok = True) - os.makedirs('loc/task2', exist_ok = True) - for ii in ['loc/task0', 'loc/task1', 'loc/task2']: - with open(os.path.join(ii, 'test0'),'w') as fp: - fp.write('this is test0 from ' + ii + '\n') - work_profile = {} - self.disp = Dispatcher(work_profile, 'lazy-local', 'slurm') - - def tearDown(self): - shutil.rmtree('loc') - shutil.rmtree('rmt') - if os.path.exists('dpgen.log'): - os.remove('dpgen.log') - - def test_sub_success(self): - tasks = ['task0', 'task1', 'task2'] - self.disp.run_jobs(None, - 'cp test0 test1', - 'loc', - tasks, - 2, - [], - ['test0'], - ['test1', 'hereout.log', 'hereerr.log'], - outlog = 'hereout.log', - errlog = 'hereerr.log') - for ii in tasks: - my_file_cmp(self, - os.path.join('loc', ii, 'test0'), - os.path.join('loc', ii, 'test1')) - self.assertTrue(os.path.isfile(os.path.join('loc', ii, 'hereout.log'))) - self.assertTrue(os.path.isfile(os.path.join('loc', ii, 'hereerr.log'))) - diff --git a/tests/dispatcher/slurm/test_slurm_lazy_local.py b/tests/dispatcher/slurm/test_slurm_lazy_local.py deleted file mode 100644 index ac44886a3..000000000 --- a/tests/dispatcher/slurm/test_slurm_lazy_local.py +++ /dev/null @@ -1,106 +0,0 @@ -import os,sys,json,glob,shutil,uuid,time -import unittest - -sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) -__package__ = 'slurm' -from .context import LazyLocalContext -from .context import Slurm -from .context import JobStatus -from .context import setUpModule - -@unittest.skipIf(not shutil.which("sbatch"), "requires Slurm") -class TestSlurm(unittest.TestCase) : - def setUp(self) : - os.makedirs('loc', exist_ok = True) - os.makedirs('rmt', exist_ok = True) - os.makedirs('loc/task0', exist_ok = True) - os.makedirs('loc/task1', exist_ok = True) - for ii in ['loc/task0', 'loc/task1']: - with open(os.path.join(ii, 'test0'),'w') as fp: - fp.write(str(uuid.uuid4())) - self.ctx = LazyLocalContext('loc') - self.slurm = Slurm(self.ctx, 
uuid_names = True) - - def tearDown(self): - shutil.rmtree('loc') - shutil.rmtree('rmt') - if os.path.exists('dpgen.log'): - os.remove('dpgen.log') - - def test_gen_sub_script(self): - job_dirs = ['task0', 'task1'] - self.slurm.context.upload(job_dirs, ['test0']) - ret = self.slurm.sub_script(job_dirs, ['touch test1', 'touch test2']) - self.slurm.context.write_file('run.sub', ret) - with open('run.sub', 'w') as fp: - fp.write(ret) - - def test_sub_success(self) : - job_dirs = ['task0', 'task1'] - self.slurm.context.upload(job_dirs, ['test0']) - self.slurm.submit(job_dirs, ['touch test1', 'touch test2']) - job_uuid = self.slurm.context.job_uuid - with open(os.path.join('rmt', self.slurm.context.remote_root, '%s_job_id' % job_uuid)) as fp: - tmp_id = fp.read() - self.assertEqual(self.slurm._get_job_id(), tmp_id) - while True: - ret = self.slurm.check_status() - if ret == JobStatus.finished : - break - time.sleep(1) - self.assertTrue(os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task0/tag_0_finished'))) - self.assertTrue(os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task0/tag_1_finished'))) - self.assertTrue(os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task1/tag_0_finished'))) - self.assertTrue(os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task1/tag_1_finished'))) - self.assertTrue(os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, '%s_tag_finished' % job_uuid))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task0/test1'))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task1/test1'))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task0/test2'))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task1/test2'))) - - def test_sub_scancel(self) : - job_dirs = ['task0', 'task1'] - self.slurm.context.upload(job_dirs, ['test0']) - # sub - self.slurm.submit(job_dirs, ['touch test1', 'sleep 10']) - while True: - ret = self.slurm.check_status() - if ret == JobStatus.finished : - raise RuntimeError('should not finished') - if ret == JobStatus.running : - # wait for file writing - time.sleep(2) - job_id = self.slurm._get_job_id() - job_uuid = self.slurm.context.job_uuid - with open(os.path.join('rmt', self.slurm.context.remote_root, '%s_job_id' % job_uuid)) as fp: - tmp_id = fp.read() - self.assertEqual(job_id, tmp_id) - os.system('scancel ' + job_id) - break - time.sleep(1) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task0/tag_0_finished'))) - self.assertFalse(os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task0/tag_1_finished'))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task1/tag_0_finished'))) - self.assertFalse(os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task1/tag_1_finished'))) - self.assertFalse(os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, '%s_tag_finished' % job_uuid))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task0/test1'))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task1/test1'))) - self.assertFalse(os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task0/test2'))) - self.assertFalse(os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task1/test2'))) - # sub 
restart - self.slurm.submit(job_dirs, ['rm test1', 'touch test2'], restart = True) - while True: - ret = self.slurm.check_status() - if ret == JobStatus.finished : - break - time.sleep(1) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task0/tag_0_finished'))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task0/tag_1_finished'))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task1/tag_0_finished'))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task1/tag_1_finished'))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, '%s_tag_finished' % job_uuid))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task0/test1'))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task1/test1'))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task0/test2'))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task1/test2'))) - diff --git a/tests/dispatcher/slurm/test_slurm_local.py b/tests/dispatcher/slurm/test_slurm_local.py deleted file mode 100644 index 0aeca1f75..000000000 --- a/tests/dispatcher/slurm/test_slurm_local.py +++ /dev/null @@ -1,100 +0,0 @@ -import os,sys,json,glob,shutil,uuid,time -import unittest - -sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) -__package__ = 'slurm' -from .context import LocalSession -from .context import LocalContext -from .context import Slurm -from .context import JobStatus -from .context import setUpModule - -@unittest.skipIf(not shutil.which("sbatch"), "requires Slurm") -class TestSlurm(unittest.TestCase) : - def setUp(self) : - os.makedirs('loc', exist_ok = True) - os.makedirs('rmt', exist_ok = True) - os.makedirs('loc/task0', exist_ok = True) - os.makedirs('loc/task1', exist_ok = True) - for ii in ['loc/task0', 'loc/task1']: - with open(os.path.join(ii, 'test0'),'w') as fp: - fp.write(str(uuid.uuid4())) - work_profile = LocalSession({'work_path':'rmt'}) - self.ctx = LocalContext('loc', work_profile) - self.slurm = Slurm(self.ctx) - - def tearDown(self): - shutil.rmtree('loc') - shutil.rmtree('rmt') - if os.path.exists('dpgen.log'): - os.remove('dpgen.log') - - def test_gen_sub_script(self): - job_dirs = ['task0', 'task1'] - self.slurm.context.upload(job_dirs, ['test0']) - ret = self.slurm.sub_script(job_dirs, ['touch test1', 'touch test2']) - self.slurm.context.write_file('run.sub', ret) - with open('run.sub', 'w') as fp: - fp.write(ret) - - def test_sub_success(self) : - job_dirs = ['task0', 'task1'] - self.slurm.context.upload(job_dirs, ['test0']) - self.slurm.submit(job_dirs, ['touch test1', 'touch test2']) - while True: - ret = self.slurm.check_status() - if ret == JobStatus.finished : - break - time.sleep(1) - self.assertTrue(os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task0/tag_0_finished'))) - self.assertTrue(os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task0/tag_1_finished'))) - self.assertTrue(os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task1/tag_0_finished'))) - self.assertTrue(os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task1/tag_1_finished'))) - self.assertTrue(os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, '%s_tag_finished' % self.slurm.context.job_uuid))) - 
self.assertTrue (os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task0/test1'))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task1/test1'))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task0/test2'))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task1/test2'))) - - def test_sub_scancel(self) : - job_dirs = ['task0', 'task1'] - self.slurm.context.upload(job_dirs, ['test0']) - # sub - self.slurm.submit(job_dirs, ['touch test1', 'sleep 10']) - while True: - ret = self.slurm.check_status() - if ret == JobStatus.finished : - raise RuntimeError('should not finished') - if ret == JobStatus.running : - # wait for file writing - time.sleep(2) - job_id = self.slurm._get_job_id() - os.system('scancel ' + job_id) - break - time.sleep(1) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task0/tag_0_finished'))) - self.assertFalse(os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task0/tag_1_finished'))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task1/tag_0_finished'))) - self.assertFalse(os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task1/tag_1_finished'))) - self.assertFalse(os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, '%s_tag_finished' % self.slurm.context.job_uuid))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task0/test1'))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task1/test1'))) - self.assertFalse(os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task0/test2'))) - self.assertFalse(os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task1/test2'))) - # sub restart - self.slurm.submit(job_dirs, ['rm test1', 'touch test2'], restart = True) - while True: - ret = self.slurm.check_status() - if ret == JobStatus.finished : - break - time.sleep(1) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task0/tag_0_finished'))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task0/tag_1_finished'))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task1/tag_0_finished'))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task1/tag_1_finished'))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, '%s_tag_finished' % self.slurm.context.job_uuid))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task0/test1'))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task1/test1'))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task0/test2'))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task1/test2'))) - diff --git a/tests/dispatcher/slurm/test_slurm_ssh.py b/tests/dispatcher/slurm/test_slurm_ssh.py deleted file mode 100644 index 774650110..000000000 --- a/tests/dispatcher/slurm/test_slurm_ssh.py +++ /dev/null @@ -1,105 +0,0 @@ -import os,sys,json,glob,shutil,uuid,time -import unittest - -sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) -__package__ = 'slurm' -from .context import LocalSession -from .context import LocalContext -from 
.context import Slurm -from .context import JobStatus -from .context import my_file_cmp -from .context import setUpModule - -@unittest.skipIf(not shutil.which("sbatch"), "requires Slurm") -class TestSlurm(unittest.TestCase) : - def setUp(self) : - os.makedirs('loc', exist_ok = True) - os.makedirs('rmt', exist_ok = True) - os.makedirs('loc/task0', exist_ok = True) - os.makedirs('loc/task1', exist_ok = True) - for ii in ['loc/task0', 'loc/task1']: - with open(os.path.join(ii, 'test0'),'w') as fp: - fp.write(str(uuid.uuid4())) - work_profile = LocalSession({'work_path':'rmt'}) - self.ctx = LocalContext('loc', work_profile) - self.slurm = Slurm(self.ctx) - - def tearDown(self): - shutil.rmtree('loc') - shutil.rmtree('rmt') - if os.path.exists('dpgen.log'): - os.remove('dpgen.log') - - def test_gen_sub_script(self): - job_dirs = ['task0', 'task1'] - self.slurm.context.upload(job_dirs, ['test0']) - ret = self.slurm.sub_script(job_dirs, ['touch test1', 'touch test2']) - self.slurm.context.write_file('run.sub', ret) - with open('run.sub', 'w') as fp: - fp.write(ret) - ret1 = self.slurm.sub_script(job_dirs, ['touch', 'touch'], [['test1 ', 'test1 '], ['test2 ', 'test2 ']]) - with open('run.sub.1', 'w') as fp: - fp.write(ret1) - my_file_cmp(self, 'run.sub.1', 'run.sub') - - def test_sub_success(self) : - job_dirs = ['task0', 'task1'] - self.slurm.context.upload(job_dirs, ['test0']) - self.slurm.submit(job_dirs, ['touch test1', 'touch test2']) - while True: - ret = self.slurm.check_status() - if ret == JobStatus.finished : - break - time.sleep(1) - self.assertTrue(os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task0/tag_0_finished'))) - self.assertTrue(os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task0/tag_1_finished'))) - self.assertTrue(os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task1/tag_0_finished'))) - self.assertTrue(os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task1/tag_1_finished'))) - self.assertTrue(os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, '%s_tag_finished' % self.slurm.context.job_uuid))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task0/test1'))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task1/test1'))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task0/test2'))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task1/test2'))) - - def test_sub_scancel(self) : - job_dirs = ['task0', 'task1'] - self.slurm.context.upload(job_dirs, ['test0']) - # sub - self.slurm.submit(job_dirs, ['touch test1', 'sleep 10']) - while True: - ret = self.slurm.check_status() - if ret == JobStatus.finished : - raise RuntimeError('should not finished') - if ret == JobStatus.running : - # wait for file writing - time.sleep(2) - job_id = self.slurm._get_job_id() - os.system('scancel ' + job_id) - break - time.sleep(1) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task0/tag_0_finished'))) - self.assertFalse(os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task0/tag_1_finished'))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task1/tag_0_finished'))) - self.assertFalse(os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task1/tag_1_finished'))) - self.assertFalse(os.path.isfile(os.path.join('rmt', 
self.slurm.context.remote_root, '%s_tag_finished' % self.slurm.context.job_uuid))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task0/test1'))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task1/test1'))) - self.assertFalse(os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task0/test2'))) - self.assertFalse(os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task1/test2'))) - # sub restart - self.slurm.submit(job_dirs, ['rm test1', 'touch test2'], restart = True) - while True: - ret = self.slurm.check_status() - if ret == JobStatus.finished : - break - time.sleep(1) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task0/tag_0_finished'))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task0/tag_1_finished'))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task1/tag_0_finished'))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task1/tag_1_finished'))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, '%s_tag_finished' % self.slurm.context.job_uuid))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task0/test1'))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task1/test1'))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task0/test2'))) - self.assertTrue (os.path.isfile(os.path.join('rmt', self.slurm.context.remote_root, 'task1/test2'))) - diff --git a/tests/dispatcher/test_dispatcher_utils.py b/tests/dispatcher/test_dispatcher_utils.py deleted file mode 100644 index 01f0e0a1f..000000000 --- a/tests/dispatcher/test_dispatcher_utils.py +++ /dev/null @@ -1,41 +0,0 @@ -import os,sys,json,glob,shutil,uuid,time -import unittest - -sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) -__package__ = 'dispatcher' -# from .context import FinRecord -from .context import _split_tasks -from .context import setUpModule - -# class TestFinRecord(unittest.TestCase): -# def setUp(self): -# self.njobs = 10 -# self.fr = FinRecord('.', self.njobs) - -# def tearDown(self): -# if os.path.isfile('fin.record'): -# os.remove('fin.record') - -# def test_all_false(self) : -# recd = self.fr.get_record() -# self.assertEqual(recd, [False]*self.njobs) - -# def test_write_read(self) : -# recd = self.fr.get_record() -# recd[self.njobs//3] = True -# self.fr.write_record(recd) -# recd1 = self.fr.get_record() -# self.assertEqual(recd, recd1) - -class TestDispatchSplit(unittest.TestCase): - def test_split(self): - tasks = [ii for ii in range(10)] - chunks = _split_tasks(tasks, 5) - self.assertEqual(chunks, [[0,2,4,6,8],[1,3,5,7,9]]) - - def test_split_1(self): - tasks = [ii for ii in range(13)] - chunks = _split_tasks(tasks, 5) - self.assertEqual(chunks, [[0,3,6,9,12],[1,4,7,10],[2,5,8,11]]) - - diff --git a/tests/dispatcher/test_lazy_local_context.py b/tests/dispatcher/test_lazy_local_context.py deleted file mode 100644 index 87270d836..000000000 --- a/tests/dispatcher/test_lazy_local_context.py +++ /dev/null @@ -1,174 +0,0 @@ -import os,sys,json,glob,shutil,uuid,time -import unittest -from pathlib import Path - -sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) -__package__ = 'dispatcher' -from .context import LazyLocalContext -from 
.context import setUpModule - -class TestLazyLocalContext(unittest.TestCase): - def setUp(self) : - os.makedirs('loc', exist_ok = True) - os.makedirs('loc/task0', exist_ok = True) - os.makedirs('loc/task1', exist_ok = True) - for ii in ['loc/task0', 'loc/task1']: - with open(os.path.join(ii, 'test0'),'w') as fp: - fp.write(str(uuid.uuid4())) - with open(os.path.join(ii, 'test1'),'w') as fp: - fp.write(str(uuid.uuid4())) - with open(os.path.join(ii, 'test2'),'w') as fp: - fp.write(str(uuid.uuid4())) - os.makedirs(os.path.join(ii, 'dir0'), exist_ok = True) - - def tearDown(self): - shutil.rmtree('loc') - - def test_upload(self) : - self.job = LazyLocalContext('loc', None) - self.job1 = LazyLocalContext('loc', None, job_uuid = self.job.job_uuid) - tasks = ['task0', 'task1'] - files = ['test0', 'test1'] - self.job.upload(tasks, files) - self.job1.upload(tasks, files) - - def test_download(self): - # upload files - self.job = LazyLocalContext('loc', None) - tasks = ['task0', 'task1'] - self.job.upload(tasks, ['test0', 'dir0']) - self.job.download(tasks, ['test0', 'dir0']) - - def test_download_check_mark(self): - # upload files - self.job = LazyLocalContext('loc', None) - tasks = ['task0', 'task1'] - self.job.upload(tasks, ['test0', 'dir0']) - record_uuid = [] - # generate extra donwload files - for ii in tasks : - for jj in ['test6', 'test7'] : - if (ii == 'task1' and jj == 'test7') or \ - (ii == 'task0' and jj == 'test6'): - continue - with open(os.path.join('loc',ii,jj), 'w') as fp: - tmp = str(uuid.uuid4()) - fp.write(tmp) - record_uuid.append(tmp) - self.job.download(tasks, ['test6', 'test7', 'dir1'], check_exists = True, mark_failure = True) - # check dlded - cc = 0 - for ii in tasks : - for jj in ['test6', 'test7'] : - if (ii == 'task1' and jj == 'test7') or \ - (ii == 'task0' and jj == 'test6') : - self.assertFalse(os.path.exists(os.path.join('loc', ii, jj)), - msg = 'found ' + os.path.join('loc', ii, jj)) - self.assertTrue(os.path.exists(os.path.join('loc', ii, 'tag_failure_download_%s' % jj)), - msg = 'failed to find ' + os.path.join('loc', ii, 'tag_failure_download_%s' % jj)) - continue - with open(os.path.join('loc',ii,jj), 'r') as fp: - tmp = fp.read() - self.assertEqual(tmp, record_uuid[cc]) - cc += 1 - for ii in tasks : - for jj in ['dir1'] : - self.assertFalse(os.path.exists(os.path.join('loc', ii, jj))) - self.assertTrue(os.path.exists(os.path.join('loc', ii, 'tag_failure_download_%s' % jj))) - - - def test_download_check_nomark(self): - # upload files - self.job = LazyLocalContext('loc', None) - tasks = ['task0', 'task1'] - self.job.upload(tasks, ['test0', 'dir0']) - record_uuid = [] - # generate extra donwload files - for ii in tasks : - for jj in ['test6', 'test7'] : - if (ii == 'task1' and jj == 'test7') or \ - (ii == 'task0' and jj == 'test6'): - continue - with open(os.path.join('loc',ii,jj), 'w') as fp: - tmp = str(uuid.uuid4()) - fp.write(tmp) - record_uuid.append(tmp) - self.job.download(tasks, ['test6', 'test7', 'dir1'], check_exists = True, mark_failure = False) - # check dlded - cc = 0 - for ii in tasks : - for jj in ['test6', 'test7'] : - if (ii == 'task1' and jj == 'test7') or \ - (ii == 'task0' and jj == 'test6') : - self.assertFalse(os.path.exists(os.path.join('loc', ii, jj)), - msg = 'found ' + os.path.join('loc', ii, jj)) - self.assertFalse(os.path.exists(os.path.join('loc', ii, 'tag_failure_download_%s' % jj)), - msg = 'found ' + os.path.join('loc', ii, 'tag_failure_download_%s' % jj)) - continue - with open(os.path.join('loc',ii,jj), 'r') as fp: - tmp = 
fp.read() - self.assertEqual(tmp, record_uuid[cc]) - cc += 1 - for ii in tasks : - for jj in ['dir1'] : - self.assertFalse(os.path.exists(os.path.join('loc', ii, jj))) - self.assertFalse(os.path.exists(os.path.join('loc', ii, 'tag_failure_download_%s' % jj))) - - - - def test_block_call(self) : - self.job = LazyLocalContext('loc', None) - tasks = ['task0', 'task1'] - files = ['test0', 'test1'] - self.job.upload(tasks, files) - # ls - code, stdin, stdout, stderr = self.job.block_call('ls') - self.assertEqual(stdout.read().decode('utf-8'), 'task0\ntask1\n') - self.assertEqual(stdout.readlines(), ['task0\n','task1\n']) - self.assertEqual(code, 0) - code, stdin, stdout, stderr = self.job.block_call('ls a') - self.assertEqual(code, 2) - # self.assertEqual(stderr.read().decode('utf-8'), 'ls: cannot access a: No such file or directory\n') - err_msg = stderr.read().decode('utf-8') - self.assertTrue('ls: cannot access' in err_msg) - self.assertTrue('No such file or directory\n' in err_msg) - - def test_block_checkcall(self) : - self.job = LazyLocalContext('loc', None) - tasks = ['task0', 'task1'] - files = ['test0', 'test1'] - self.job.upload(tasks, files) - # ls - stdin, stdout, stderr = self.job.block_checkcall('ls') - self.assertEqual(stdout.read().decode('utf-8'), 'task0\ntask1\n') - self.assertEqual(stdout.readlines(), ['task0\n','task1\n']) - with self.assertRaises(RuntimeError): - stdin, stdout, stderr = self.job.block_checkcall('ls a') - - def test_file(self) : - self.job = LazyLocalContext('loc', None) - self.assertFalse(self.job.check_file_exists('aaa')) - tmp = str(uuid.uuid4()) - self.job.write_file('aaa', tmp) - self.assertTrue(self.job.check_file_exists('aaa')) - tmp1 = self.job.read_file('aaa') - self.assertEqual(tmp, tmp1) - - - def test_call(self) : - self.job = LazyLocalContext('loc', None) - proc = self.job.call('sleep 1.5') - self.assertFalse(self.job.check_finish(proc)) - time.sleep(1) - self.assertFalse(self.job.check_finish(proc)) - time.sleep(2.5) - self.assertTrue(self.job.check_finish(proc)) - r,o,e=self.job.get_return(proc) - self.assertEqual(r, 0) - self.assertEqual(o.read(), b'') - self.assertEqual(e.read(), b'') - # r,o,e=self.job.get_return(proc) - # self.assertEqual(r, 0) - # self.assertEqual(o, None) - # self.assertEqual(e, None) - diff --git a/tests/dispatcher/test_local_context.py b/tests/dispatcher/test_local_context.py deleted file mode 100644 index c5b046485..000000000 --- a/tests/dispatcher/test_local_context.py +++ /dev/null @@ -1,363 +0,0 @@ -import os,sys,json,glob,shutil,uuid,time -import unittest -from pathlib import Path - -sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) -__package__ = 'dispatcher' -from .context import LocalContext, LocalSession -from .context import setUpModule -from .context import _identical_files - -class TestIdFile(unittest.TestCase) : - def test_id(self) : - with open('f0', 'w') as fp: - fp.write('foo') - with open('f1', 'w') as fp: - fp.write('foo') - self.assertTrue(_identical_files('f0', 'f1')) - os.remove('f0') - os.remove('f1') - - def test_diff(self) : - with open('f0', 'w') as fp: - fp.write('foo') - with open('f1', 'w') as fp: - fp.write('bar') - self.assertFalse(_identical_files('f0', 'f1')) - os.remove('f0') - os.remove('f1') - - -class TestLocalContext(unittest.TestCase): - def setUp(self) : - os.makedirs('loc', exist_ok = True) - os.makedirs('loc/task0', exist_ok = True) - os.makedirs('loc/task1', exist_ok = True) - for ii in ['loc/task0', 'loc/task1']: - with open(os.path.join(ii, 
'test0'),'w') as fp: - fp.write(str(uuid.uuid4())) - with open(os.path.join(ii, 'test1'),'w') as fp: - fp.write(str(uuid.uuid4())) - with open(os.path.join(ii, 'test2'),'w') as fp: - fp.write(str(uuid.uuid4())) - os.makedirs(os.path.join(ii, 'dir0'), exist_ok = True) - os.makedirs(os.path.join(ii, 'dir2'), exist_ok = True) - with open(os.path.join(ii, 'dir2', 'dtest0'),'w') as fp: - fp.write(str(uuid.uuid4())) - os.makedirs('rmt', exist_ok = True) - - def tearDown(self): - shutil.rmtree('loc') - shutil.rmtree('rmt') - - def test_upload_non_exist(self) : - work_profile = LocalSession({'work_path':'rmt'}) - self.job = LocalContext('loc', work_profile) - tasks = ['task0', 'task1'] - # test uploading non-existing file - with self.assertRaises(OSError): - self.job.upload(tasks, ['foo']) - - def test_upload(self) : - work_profile = LocalSession({'work_path':'rmt'}) - self.job = LocalContext('loc', work_profile) - self.job1 = LocalContext('loc', work_profile, job_uuid = self.job.job_uuid) - tasks = ['task0', 'task1'] - files = ['test0', 'test1', 'dir2/dtest0'] - self.job.upload(tasks, files) - for ii in tasks : - for jj in files : - locf = os.path.join('loc', ii, jj) - rmtf = os.path.join('rmt', self.job.job_uuid, ii, jj) - with open(locf) as fp: - locs = fp.read() - with open(rmtf) as fp: - rmts = fp.read() - self.assertEqual(locs, rmts) - self.job.upload(tasks, ['dir0']) - for ii in tasks : - for jj in ['dir0'] : - locf = os.path.join('loc', ii, jj) - rmtf = os.path.join('rmt', self.job.job_uuid, ii, jj) - self.assertEqual(os.path.realpath(locf), - os.path.realpath(rmtf)) - self.job1.upload(tasks, files) - for ii in tasks : - for jj in files : - locf = os.path.join('loc', ii, jj) - rmtf = os.path.join('rmt', self.job.job_uuid, ii, jj) - with open(locf) as fp: - locs = fp.read() - with open(rmtf) as fp: - rmts = fp.read() - self.assertEqual(locs, rmts) - - def test_dl_f_f(self): - # no local, no remote - self.test_download_non_exist() - - def test_dl_t_f(self) : - # has local, no remote - work_profile = LocalSession({'work_path':'rmt'}) - self.job = LocalContext('loc', work_profile) - tasks = ['task0', 'task1'] - record_uuid = [] - for ii in tasks : - for jj in ['dir1'] : - os.makedirs(os.path.join('loc',ii,jj), exist_ok=False) - for kk in ['test6', 'test7']: - with open(os.path.join('loc',ii,jj,kk), 'w') as fp: - tmp = str(uuid.uuid4()) - fp.write(tmp) - record_uuid.append(tmp) - files = ['dir1'] - self.job.download(tasks, files) - cc = 0 - for ii in tasks : - for jj in ['dir1'] : - for kk in ['test6', 'test7']: - with open(os.path.join('loc',ii,jj,kk), 'r') as fp: - tmp = fp.read() - self.assertEqual(tmp, record_uuid[cc]) - cc += 1 - - def test_dl_t_t(self) : - # has local, has remote - work_profile = LocalSession({'work_path':'rmt'}) - self.job = LocalContext('loc', work_profile) - tasks = ['task0', 'task1'] - for ii in tasks : - for jj in ['dir1'] : - os.makedirs(os.path.join('loc',ii,jj), exist_ok=False) - for kk in ['test6', 'test7']: - with open(os.path.join('loc',ii,jj,kk), 'w') as fp: - tmp = str(uuid.uuid4()) - fp.write(tmp) - record_uuid = [] - for ii in tasks : - for jj in ['dir1'] : - os.makedirs(os.path.join('rmt', self.job.job_uuid,ii,jj), exist_ok=False) - for kk in ['test6', 'test7']: - with open(os.path.join('rmt', self.job.job_uuid,ii,jj,kk), 'w') as fp: - tmp = str(uuid.uuid4()) - fp.write(tmp) - record_uuid.append(tmp) - files = ['dir1'] - self.job.download(tasks, files) - cc = 0 - for ii in tasks : - for jj in ['dir1'] : - for kk in ['test6', 'test7']: - with 
open(os.path.join('loc',ii,jj,kk), 'r') as fp: - tmp = fp.read() - self.assertEqual(tmp, record_uuid[cc]) - cc += 1 - - - def test_download_non_exist(self): - work_profile = LocalSession({'work_path':'rmt'}) - self.job = LocalContext('loc', work_profile) - tasks = ['task0', 'task1'] - # down load non-existing file - with self.assertRaises(RuntimeError): - self.job.download(tasks, ['foo']) - - def test_download(self): - # upload files - work_profile = LocalSession({'work_path':'rmt'}) - self.job = LocalContext('loc', work_profile) - tasks = ['task0', 'task1'] - self.job.upload(tasks, ['test0', 'dir0']) - # generate extra donwload files - record_uuid = [] - for ii in tasks : - for jj in ['test4', 'test5'] : - with open(os.path.join('rmt',self.job.job_uuid,ii,jj), 'w') as fp: - tmp = str(uuid.uuid4()) - fp.write(tmp) - record_uuid.append(tmp) - # generate extra donwload dirs and files - for ii in tasks : - for jj in ['dir1'] : - os.makedirs(os.path.join('rmt',self.job.job_uuid,ii,jj), exist_ok=False) - for kk in ['test6']: - with open(os.path.join('rmt',self.job.job_uuid,ii,jj,kk), 'w') as fp: - tmp = str(uuid.uuid4()) - fp.write(tmp) - record_uuid.append(tmp) - # donwload - files = ['test0', 'dir0', 'test4', 'test5', 'dir1'] - self.job.download(tasks, files) - # check dlded - cc = 0 - for ii in tasks : - for jj in ['test4', 'test5'] : - with open(os.path.join('loc',ii,jj), 'r') as fp: - tmp = fp.read() - self.assertEqual(tmp, record_uuid[cc]) - cc += 1 - for ii in tasks : - for jj in ['dir1'] : - for kk in ['test6']: - with open(os.path.join('loc',ii,jj,kk), 'r') as fp: - tmp = fp.read() - self.assertEqual(tmp, record_uuid[cc]) - cc += 1 - # check links preserved - for ii in tasks : - for jj in ['test0'] : - locf = os.path.join('loc', ii, jj) - rmtf = os.path.join('rmt', self.job.job_uuid, ii, jj) - self.assertEqual(os.path.realpath(locf), - os.path.realpath(rmtf)) - for ii in tasks : - for jj in ['dir0'] : - for kk in ['test6'] : - locf = os.path.join('loc', ii, jj, kk) - rmtf = os.path.join('rmt', self.job.job_uuid, ii, jj, kk) - self.assertEqual(os.path.realpath(locf), - os.path.realpath(rmtf)) - - def test_download_check_mark(self): - # upload files - work_profile = LocalSession({'work_path':'rmt'}) - self.job = LocalContext('loc', work_profile) - tasks = ['task0', 'task1'] - self.job.upload(tasks, ['test0', 'dir0']) - # generate extra donwload files - record_uuid = [] - for ii in tasks : - for jj in ['test7', 'test8'] : - if (ii == 'task1' and jj == 'test7') or \ - (ii == 'task0' and jj == 'test6') : - continue - with open(os.path.join('rmt',self.job.job_uuid,ii,jj), 'w') as fp: - tmp = str(uuid.uuid4()) - fp.write(tmp) - record_uuid.append(tmp) - # donwload - files = ['test7', 'test8', 'dir1'] - self.job.download(tasks, files, check_exists = True, mark_failure = True) - # check dlded - cc = 0 - for ii in tasks : - for jj in ['test7', 'test8'] : - if (ii == 'task1' and jj == 'test7') or \ - (ii == 'task0' and jj == 'test6') : - self.assertFalse(os.path.exists(os.path.join('loc', ii, jj)), - msg = 'found ' + os.path.join('loc', ii, jj)) - self.assertTrue(os.path.exists(os.path.join('loc', ii, 'tag_failure_download_%s' % jj)), - msg = 'failed to find ' + os.path.join('loc', ii, 'tag_failure_download_%s' % jj)) - continue - with open(os.path.join('loc',ii,jj), 'r') as fp: - tmp = fp.read() - self.assertEqual(tmp, record_uuid[cc]) - cc += 1 - for ii in tasks : - for jj in ['dir1'] : - self.assertFalse(os.path.exists(os.path.join('loc', ii, jj))) - 
self.assertTrue(os.path.exists(os.path.join('loc', ii, 'tag_failure_download_%s' % jj))) - - - def test_download_check_nomark(self): - # upload files - work_profile = LocalSession({'work_path':'rmt'}) - self.job = LocalContext('loc', work_profile) - tasks = ['task0', 'task1'] - self.job.upload(tasks, ['test0', 'dir0']) - # generate extra donwload files - record_uuid = [] - for ii in tasks : - for jj in ['test7', 'test8'] : - if (ii == 'task1' and jj == 'test7') or \ - (ii == 'task0' and jj == 'test6') : - continue - with open(os.path.join('rmt',self.job.job_uuid,ii,jj), 'w') as fp: - tmp = str(uuid.uuid4()) - fp.write(tmp) - record_uuid.append(tmp) - # donwload - files = ['test7', 'test8', 'dir1'] - self.job.download(tasks, files, check_exists = True, mark_failure = False) - # check dlded - cc = 0 - for ii in tasks : - for jj in ['test7', 'test8'] : - if (ii == 'task1' and jj == 'test7') or \ - (ii == 'task0' and jj == 'test6') : - self.assertFalse(os.path.exists(os.path.join('loc', ii, jj)), - msg = 'found ' + os.path.join('loc', ii, jj)) - self.assertFalse(os.path.exists(os.path.join('loc', ii, 'tag_failure_download_%s' % jj)), - msg = 'found ' + os.path.join('loc', ii, 'tag_failure_download_%s' % jj)) - continue - with open(os.path.join('loc',ii,jj), 'r') as fp: - tmp = fp.read() - self.assertEqual(tmp, record_uuid[cc]) - cc += 1 - for ii in tasks : - for jj in ['dir1'] : - self.assertFalse(os.path.exists(os.path.join('loc', ii, jj))) - self.assertFalse(os.path.exists(os.path.join('loc', ii, 'tag_failure_download_%s' % jj))) - - - def test_block_call(self) : - work_profile = LocalSession({'work_path':'rmt'}) - self.job = LocalContext('loc', work_profile) - tasks = ['task0', 'task1'] - files = ['test0', 'test1'] - self.job.upload(tasks, files) - # ls - code, stdin, stdout, stderr = self.job.block_call('ls') - self.assertEqual(stdout.read().decode('utf-8'), 'task0\ntask1\n') - self.assertEqual(stdout.readlines(), ['task0\n','task1\n']) - self.assertEqual(code, 0) - code, stdin, stdout, stderr = self.job.block_call('ls a') - self.assertEqual(code, 2) - # self.assertEqual(stderr.read().decode('utf-8'), 'ls: cannot access a: No such file or directory\n') - err_msg = stderr.read().decode('utf-8') - self.assertTrue('ls: cannot access' in err_msg) - self.assertTrue('No such file or directory\n' in err_msg) - - - def test_block_checkcall(self) : - work_profile = LocalSession({'work_path':'rmt'}) - self.job = LocalContext('loc', work_profile) - tasks = ['task0', 'task1'] - files = ['test0', 'test1'] - self.job.upload(tasks, files) - # ls - stdin, stdout, stderr = self.job.block_checkcall('ls') - self.assertEqual(stdout.read().decode('utf-8'), 'task0\ntask1\n') - self.assertEqual(stdout.readlines(), ['task0\n','task1\n']) - with self.assertRaises(RuntimeError): - stdin, stdout, stderr = self.job.block_checkcall('ls a') - - def test_file(self) : - work_profile = LocalSession({'work_path':'rmt'}) - self.job = LocalContext('loc', work_profile) - self.assertFalse(self.job.check_file_exists('aaa')) - tmp = str(uuid.uuid4()) - self.job.write_file('aaa', tmp) - self.assertTrue(self.job.check_file_exists('aaa')) - tmp1 = self.job.read_file('aaa') - self.assertEqual(tmp, tmp1) - - - def test_call(self) : - work_profile = LocalSession({'work_path':'rmt'}) - self.job = LocalContext('loc', work_profile) - proc = self.job.call('sleep 1.5') - self.assertFalse(self.job.check_finish(proc)) - time.sleep(1) - self.assertFalse(self.job.check_finish(proc)) - time.sleep(2.5) - 
self.assertTrue(self.job.check_finish(proc)) - r,o,e=self.job.get_return(proc) - self.assertEqual(r, 0) - self.assertEqual(o.read(), b'') - self.assertEqual(e.read(), b'') - # r,o,e=self.job.get_return(proc) - # self.assertEqual(r, 0) - # self.assertEqual(o, None) - # self.assertEqual(e, None) - diff --git a/tests/dispatcher/test_local_session.py b/tests/dispatcher/test_local_session.py deleted file mode 100644 index 6712e639f..000000000 --- a/tests/dispatcher/test_local_session.py +++ /dev/null @@ -1,16 +0,0 @@ -import os,sys,json,glob,shutil -import unittest - -sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) -__package__ = 'dispatcher' -from .context import LocalSession -from .context import setUpModule - -class TestLocalSession(unittest.TestCase): - def test_work_path(self): - cwd = os.getcwd() - wp = LocalSession({'work_path' : cwd}) - self.assertTrue(os.path.abspath(cwd), wp.get_work_root()) - - - diff --git a/tests/dispatcher/test_ssh_context.py b/tests/dispatcher/test_ssh_context.py deleted file mode 100644 index a24e2d653..000000000 --- a/tests/dispatcher/test_ssh_context.py +++ /dev/null @@ -1,231 +0,0 @@ -import os,sys,json,glob,shutil,uuid,getpass -import unittest -from pathlib import Path - -sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) -__package__ = 'dispatcher' -from .context import SSHContext, SSHSession -from .context import setUpModule - -class TestSSHContext(unittest.TestCase): - def setUp(self) : - os.makedirs('loc', exist_ok = True) - os.makedirs('loc/task0', exist_ok = True) - os.makedirs('loc/task1', exist_ok = True) - for ii in ['loc/task0', 'loc/task1']: - with open(os.path.join(ii, 'test0'),'w') as fp: - fp.write(str(uuid.uuid4())) - with open(os.path.join(ii, 'test1'),'w') as fp: - fp.write(str(uuid.uuid4())) - os.makedirs(os.path.join(ii, 'dir0'), exist_ok = True) - with open(os.path.join(ii, 'dir0', 'test2'),'w') as fp: - fp.write(str(uuid.uuid4())) - os.makedirs('rmt', exist_ok = True) - try : - self.ssh_session = SSHSession({'hostname' : 'localhost', - 'port': 22, - 'username' : getpass.getuser(), - 'work_path' : os.path.join(os.getcwd(), 'rmt')}) - except Exception: - # for tianhe-2 - try: - self.ssh_session = SSHSession({'hostname' : 'localhost', - 'port': 5566, - 'username' : getpass.getuser(), - 'work_path' : os.path.join(os.getcwd(), 'rmt')}) - except Exception: - self.skipTest("Network error") - self.job = SSHContext('loc', self.ssh_session) - self.job1 = SSHContext('loc', self.ssh_session, job_uuid = self.job.job_uuid) - - def tearDown(self): - self.job.close() - self.job1.close() - shutil.rmtree('loc') - shutil.rmtree('rmt') - - def test_upload(self) : - tasks = ['task0', 'task1'] - files = ['test0', 'test1'] - self.job.upload(tasks, files) - for ii in tasks : - for jj in files : - locf = os.path.join('loc', ii, jj) - rmtf = os.path.join('rmt', self.job.job_uuid, ii, jj) - with open(locf) as fp: - locs = fp.read() - with open(rmtf) as fp: - rmts = fp.read() - self.assertEqual(locs, rmts) - self.job.upload(tasks, ['dir0']) - for ii in tasks : - for jj in ['dir0'] : - for kk in ['test2'] : - locf = os.path.join('loc', ii, jj, kk) - rmtf = os.path.join('rmt', self.job.job_uuid, ii, jj, kk) - with open(locf) as fp: - locs = fp.read() - with open(rmtf) as fp: - rmts = fp.read() - self.assertEqual(locs, rmts) - - - def test_donwload(self): - tasks = ['task0', 'task1'] - self.job.upload(tasks, ['test0', 'dir0']) - # generate extra donwload files - record_uuid = [] - for ii in tasks : - 
for jj in ['test4', 'test5'] : - with open(os.path.join('rmt',self.job.job_uuid,ii,jj), 'w') as fp: - tmp = str(uuid.uuid4()) - fp.write(tmp) - record_uuid.append(tmp) - # generate extra donwload dirs and files - for ii in tasks : - for jj in ['dir1'] : - os.makedirs(os.path.join('rmt',self.job.job_uuid,ii,jj), exist_ok=False) - for kk in ['test6']: - with open(os.path.join('rmt',self.job.job_uuid,ii,jj,kk), 'w') as fp: - tmp = str(uuid.uuid4()) - fp.write(tmp) - record_uuid.append(tmp) - # donwload - files = ['test4', 'test5', 'dir1'] - self.job.download(tasks, files) - # check dlded - cc = 0 - for ii in tasks : - for jj in ['test4', 'test5'] : - with open(os.path.join('loc',ii,jj), 'r') as fp: - tmp = fp.read() - self.assertEqual(tmp, record_uuid[cc]) - cc += 1 - for ii in tasks : - for jj in ['dir1'] : - for kk in ['test6']: - with open(os.path.join('loc',ii,jj,kk), 'r') as fp: - tmp = fp.read() - self.assertEqual(tmp, record_uuid[cc]) - cc += 1 - - - def test_donwload_check_mark(self): - tasks = ['task0', 'task1'] - self.job.upload(tasks, ['test0', 'dir0']) - # generate extra donwload files - record_uuid = [] - for ii in tasks : - for jj in ['test6', 'test7'] : - if (ii == 'task1' and jj == 'test7') or \ - (ii == 'task0' and jj == 'test6'): - continue - with open(os.path.join('rmt',self.job.job_uuid,ii,jj), 'w') as fp: - tmp = str(uuid.uuid4()) - fp.write(tmp) - record_uuid.append(tmp) - # donwload - files = ['test6', 'test7', 'dir1'] - self.job.download(tasks, files, check_exists = True, mark_failure = True) - # check dlded - cc = 0 - for ii in tasks : - for jj in ['test6', 'test7'] : - if (ii == 'task1' and jj == 'test7') or \ - (ii == 'task0' and jj == 'test6') : - self.assertFalse(os.path.exists(os.path.join('loc', ii, jj)), - msg = 'found ' + os.path.join('loc', ii, jj)) - self.assertTrue(os.path.exists(os.path.join('loc', ii, 'tag_failure_download_%s' % jj)), - msg = 'failed to find ' + os.path.join('loc', ii, 'tag_failure_download_%s' % jj)) - continue - with open(os.path.join('loc',ii,jj), 'r') as fp: - tmp = fp.read() - self.assertEqual(tmp, record_uuid[cc]) - cc += 1 - for ii in tasks : - for jj in ['dir1'] : - self.assertFalse(os.path.exists(os.path.join('loc', ii, jj))) - self.assertTrue(os.path.exists(os.path.join('loc', ii, 'tag_failure_download_%s' % jj))) - - - def test_donwload_check_nomark(self): - tasks = ['task0', 'task1'] - self.job.upload(tasks, ['test0', 'dir0']) - # generate extra donwload files - record_uuid = [] - for ii in tasks : - for jj in ['test6', 'test7'] : - if ii == 'task1' and jj == 'test7' : - continue - if ii == 'task0' and jj == 'test6' : - continue - with open(os.path.join('rmt',self.job.job_uuid,ii,jj), 'w') as fp: - tmp = str(uuid.uuid4()) - fp.write(tmp) - record_uuid.append(tmp) - # donwload - files = ['test6', 'test7', 'dir1'] - self.job.download(tasks, files, check_exists = True, mark_failure = False) - # check dlded - cc = 0 - for ii in tasks : - for jj in ['test6', 'test7'] : - if ii == 'task1' and jj == 'test7' : - self.assertFalse(os.path.exists(os.path.join('loc', ii, jj)), - msg = 'found ' + os.path.join('loc', ii, jj)) - self.assertFalse(os.path.exists(os.path.join('loc', ii, 'tag_failure_download_%s' % jj)), - msg = 'found ' + os.path.join('loc', ii, 'tag_failure_download_%s' % jj)) - continue - if ii == 'task0' and jj == 'test6' : - self.assertFalse(os.path.exists(os.path.join('loc', ii, jj)), - msg = 'found ' + os.path.join('loc', ii, jj)) - self.assertFalse(os.path.exists(os.path.join('loc', ii, 'tag_failure_download_%s' % jj)), 
- msg = 'found ' + os.path.join('loc', ii, 'tag_failure_download_%s' % jj)) - continue - with open(os.path.join('loc',ii,jj), 'r') as fp: - tmp = fp.read() - self.assertEqual(tmp, record_uuid[cc]) - cc += 1 - for ii in tasks : - for jj in ['dir1'] : - self.assertFalse(os.path.exists(os.path.join('loc', ii, jj))) - self.assertFalse(os.path.exists(os.path.join('loc', ii, 'tag_failure_download_%s' % jj))) - - def test_block_call(self) : - tasks = ['task0', 'task1'] - files = ['test0', 'test1'] - self.job.upload(tasks, files) - # ls - code, stdin, stdout, stderr = self.job.block_call('ls') - self.assertEqual(stdout.read(), b'task0\ntask1\n') - self.assertEqual(code, 0) - code, stdin, stdout, stderr = self.job.block_call('ls') - self.assertEqual(stdout.readlines(), ['task0\n','task1\n']) - code, stdin, stdout, stderr = self.job.block_call('ls a') - self.assertEqual(code, 2) - # self.assertEqual(stderr.read().decode('utf-8'), 'ls: cannot access a: No such file or directory\n') - err_msg = stderr.read().decode('utf-8') - self.assertTrue('ls: cannot access' in err_msg) - self.assertTrue('No such file or directory\n' in err_msg) - - def test_block_checkcall(self) : - tasks = ['task0', 'task1'] - files = ['test0', 'test1'] - self.job.upload(tasks, files) - # ls - stdin, stdout, stderr = self.job.block_checkcall('ls') - self.assertEqual(stdout.read(), b'task0\ntask1\n') - stdin, stdout, stderr = self.job.block_checkcall('ls') - self.assertEqual(stdout.readlines(), ['task0\n','task1\n']) - with self.assertRaises(RuntimeError): - stdin, stdout, stderr = self.job.block_checkcall('ls a') - - def test_file(self) : - self.assertFalse(self.job.check_file_exists('aaa')) - tmp = str(uuid.uuid4()) - self.job.write_file('aaa', tmp) - self.assertTrue(self.job.check_file_exists('aaa')) - tmp1 = self.job.read_file('aaa') - self.assertEqual(tmp, tmp1) - - diff --git a/tests/generator/test_make_dispatcher.py b/tests/generator/test_make_dispatcher.py deleted file mode 100644 index 998ed39f9..000000000 --- a/tests/generator/test_make_dispatcher.py +++ /dev/null @@ -1,44 +0,0 @@ -import os,sys,sys -import unittest - -sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) -__package__ = 'generator' -from .context import make_dispatcher - -class TestDispatcher(unittest.TestCase): - # def test_ssh_slurm(self): - # dis = make_dispatcher({ - # 'batch': 'slurm', - # 'hostname': 'localhost', - # 'username': 'wanghan', - # 'port': 22, - # 'work_path': '.', - # }) - # self.assertEqual(dis.context.__name__, 'SSHContext') - # self.assertEqual(dis.batch.__name__, 'Slurm') - - def test_local_slurm(self): - dis = make_dispatcher({ - 'batch': 'slurm', - 'work_path': '.', - }) - self.assertEqual(dis.context.__name__, 'LocalContext') - self.assertEqual(dis.batch.__name__, 'Slurm') - - def test_lazy_local_slurm(self): - dis = make_dispatcher({ - 'batch': 'slurm', - 'lazy_local': True, - 'work_path': '.', - }) - self.assertEqual(dis.context.__name__, 'LazyLocalContext') - self.assertEqual(dis.batch.__name__, 'Slurm') - - def test_dep_lazy_local_slurm(self): - dis = make_dispatcher({ - 'machine_type': 'slurm', - 'lazy_local': True, - 'work_path': '.', - }) - self.assertEqual(dis.context.__name__, 'LazyLocalContext') - self.assertEqual(dis.batch.__name__, 'Slurm') diff --git a/tests/test_check_examples.py b/tests/test_check_examples.py index a8029c644..aadf4b4f0 100644 --- a/tests/test_check_examples.py +++ b/tests/test_check_examples.py @@ -66,24 +66,17 @@ (run_jdata, p_examples / "run" / 
"dp2.x-lammps-gaussian" / "param_C4H16N4_deepmd-kit-2.0.1.json"), (run_jdata, p_examples / "run" / "dprc" / "generator.json"), # machines - #(run_mdata, p_examples / "machine" / "DeePMD-kit-2.x" / "lebesgue_v2_machine.json"), - #(run_mdata, p_examples / "machine" / "DeePMD-kit-1.x" / "machine-ali.json"), + (run_mdata, p_examples / "machine" / "DeePMD-kit-2.x" / "lebesgue_v2_machine.json"), (run_mdata, p_examples / "machine" / "DeePMD-kit-1.x" / "machine-local.json"), (run_mdata, p_examples / "machine" / "DeePMD-kit-1.x" / "machine-lsf-slurm-cp2k.json"), (run_mdata, p_examples / "machine" / "DeePMD-kit-1.x" / "machine-pbs-gaussian.json"), (run_mdata, p_examples / "machine" / "DeePMD-kit-1.x" / "machine-slurm-qe.json"), (run_mdata, p_examples / "machine" / "DeePMD-kit-1.0" / "machine-local-4GPU.json"), - #(run_mdata, p_examples / "machine" / "deprecated" / "machine-hnu.json"), - #(run_mdata, p_examples / "machine" / "deprecated" / "machine-tiger-pwscf-della.json"), - #(run_mdata, p_examples / "machine" / "deprecated" / "machine-tiger-vasp-della.json"), - #(run_mdata, p_examples / "machine" / "deprecated" / "machine-tiger.json"), - #(run_mdata, p_examples / "machine" / "deprecated" / "machine-ucloud.json"), (run_mdata, p_examples / "CH4-refact-dpdispatcher" / "machine-ali-ehpc.json"), (run_mdata, p_examples / "CH4-refact-dpdispatcher" / "machine-dpcloudserver.json"), (run_mdata, p_examples / "run" / "dp2.x-lammps-ABACUS-lcao" / "fcc-al" / "machine.json"), (run_mdata, p_examples / "run" / "dp2.x-lammps-ABACUS-pw" / "fcc-al" / "machine.json"), (run_mdata, p_examples / "run" / "dp2.x-lammps-gaussian" / "machine.json"), - #(run_mdata, p_examples / "run" / "dp2.x-gromacs-gaussian" / "machine.json"), (simplify_mdata, p_examples / "simplify-MAPbI3-scan-lebesgue" / "simplify_example" / "machine.json"), )