From f94277e82655ea0d9dc3a27beb896aee42999244 Mon Sep 17 00:00:00 2001 From: Manuel Giffels Date: Tue, 23 Mar 2021 12:11:21 +0100 Subject: [PATCH 1/5] Rename resource attribute to be more precise --- docs/source/changelog.rst | 4 ++-- tardis/adapters/sites/htcondor.py | 2 +- tardis/adapters/sites/slurm.py | 2 +- tardis/resources/drone.py | 2 +- tests/adapters_t/sites_t/test_htcondorsiteadapter.py | 4 ++-- tests/adapters_t/sites_t/test_slurm.py | 4 ++-- 6 files changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst index c2c4f067..08c7f36c 100644 --- a/docs/source/changelog.rst +++ b/docs/source/changelog.rst @@ -1,4 +1,4 @@ -.. Created by changelog.py at 2021-03-22, command +.. Created by changelog.py at 2021-03-23, command '/Users/giffler/.cache/pre-commit/repor6pnmwlm/py_env-python3.9/bin/changelog docs/source/changes compile --output=docs/source/changelog.rst' based on the format of 'https://keepachangelog.com/' @@ -6,7 +6,7 @@ CHANGELOG ######### -[Unreleased] - 2021-03-22 +[Unreleased] - 2021-03-23 ========================= Added diff --git a/tardis/adapters/sites/htcondor.py b/tardis/adapters/sites/htcondor.py index fa4a7bb4..f91a6949 100644 --- a/tardis/adapters/sites/htcondor.py +++ b/tardis/adapters/sites/htcondor.py @@ -99,7 +99,7 @@ async def deploy_resource( drone_environment = self.drone_environment( resource_attributes.drone_uuid, - resource_attributes.machine_meta_data_translation_mapping, + resource_attributes.obs_machine_meta_data_translation_mapping, ) submit_jdl = jdl_template.substitute( diff --git a/tardis/adapters/sites/slurm.py b/tardis/adapters/sites/slurm.py index 7f79d108..72b4f69e 100644 --- a/tardis/adapters/sites/slurm.py +++ b/tardis/adapters/sites/slurm.py @@ -110,7 +110,7 @@ async def deploy_resource( sbatch_cmdline_option_string = submit_cmd_option_formatter( self.sbatch_cmdline_options( resource_attributes.drone_uuid, - resource_attributes.machine_meta_data_translation_mapping, + resource_attributes.obs_machine_meta_data_translation_mapping, ) ) diff --git a/tardis/resources/drone.py b/tardis/resources/drone.py index 2cb7bbf8..95cd783c 100644 --- a/tardis/resources/drone.py +++ b/tardis/resources/drone.py @@ -40,7 +40,7 @@ def __init__( self.resource_attributes = AttributeDict( site_name=self._site_agent.site_name, machine_type=self.site_agent.machine_type, - machine_meta_data_translation_mapping=self.batch_system_agent.machine_meta_data_translation_mapping, # noqa B950 + obs_machine_meta_data_translation_mapping=self.batch_system_agent.machine_meta_data_translation_mapping, # noqa B950 remote_resource_uuid=remote_resource_uuid, created=created or datetime.now(), updated=updated or datetime.now(), diff --git a/tests/adapters_t/sites_t/test_htcondorsiteadapter.py b/tests/adapters_t/sites_t/test_htcondorsiteadapter.py index a83d3947..05b78f42 100644 --- a/tests/adapters_t/sites_t/test_htcondorsiteadapter.py +++ b/tests/adapters_t/sites_t/test_htcondorsiteadapter.py @@ -99,7 +99,7 @@ def test_deploy_resource(self): self.adapter.deploy_resource, AttributeDict( drone_uuid="test-123", - machine_meta_data_translation_mapping=AttributeDict( + obs_machine_meta_data_translation_mapping=AttributeDict( Cores=1, Memory=1024, Disk=1024 * 1024, @@ -125,7 +125,7 @@ def test_translate_resources_raises_logs(self): self.adapter.deploy_resource, AttributeDict( drone_uuid="test-123", - machine_meta_data_translation_mapping=AttributeDict( + obs_machine_meta_data_translation_mapping=AttributeDict( Cores=1, Memory=1024, Disk=1024 * 1024, diff --git a/tests/adapters_t/sites_t/test_slurm.py b/tests/adapters_t/sites_t/test_slurm.py index 19897eb4..63bdc8b5 100644 --- a/tests/adapters_t/sites_t/test_slurm.py +++ b/tests/adapters_t/sites_t/test_slurm.py @@ -162,7 +162,7 @@ def test_deploy_resource(self): resource_attributes=AttributeDict( machine_type="test2large", site_name="TestSite", - machine_meta_data_translation_mapping=AttributeDict( + obs_machine_meta_data_translation_mapping=AttributeDict( Cores=1, Memory=1000, Disk=1000, @@ -199,7 +199,7 @@ def test_deploy_resource_w_submit_options(self): resource_attributes=AttributeDict( machine_type="test2large", site_name="TestSite", - machine_meta_data_translation_mapping=AttributeDict( + obs_machine_meta_data_translation_mapping=AttributeDict( Cores=1, Memory=1000, Disk=1000, From a2a01b0dd81f815db7345e48476bef715911665f Mon Sep 17 00:00:00 2001 From: Manuel Giffels Date: Tue, 23 Mar 2021 12:24:23 +0100 Subject: [PATCH 2/5] Refactor machine meta data translation --- tardis/interfaces/siteadapter.py | 15 +++++--------- tardis/utilities/utils.py | 27 ++++++++++++++++++++++++++ tests/interfaces_t/test_siteadapter.py | 2 +- 3 files changed, 33 insertions(+), 11 deletions(-) diff --git a/tardis/interfaces/siteadapter.py b/tardis/interfaces/siteadapter.py index 42e91c80..e3b98e79 100644 --- a/tardis/interfaces/siteadapter.py +++ b/tardis/interfaces/siteadapter.py @@ -1,5 +1,6 @@ from ..configuration.configuration import Configuration from ..utilities.attributedict import AttributeDict +from ..utilities.utils import machine_meta_data_translation from abc import ABCMeta, abstractmethod from cobald.utility.primitives import infinity as inf @@ -92,16 +93,10 @@ def drone_environment( :return: Translated :rtype: dict """ - try: - drone_environment = { - key: meta_data_translation_mapping[key] * value - for key, value in self.machine_meta_data.items() - } - except KeyError as ke: - logger.critical(f"drone_environment failed: no translation known for {ke}") - raise - else: - drone_environment["Uuid"] = drone_uuid + drone_environment = machine_meta_data_translation( + self.machine_meta_data, meta_data_translation_mapping + ) + drone_environment["Uuid"] = drone_uuid return drone_environment diff --git a/tardis/utilities/utils.py b/tardis/utilities/utils.py index b4ba1124..20d21eaf 100644 --- a/tardis/utilities/utils.py +++ b/tardis/utilities/utils.py @@ -7,6 +7,9 @@ from typing import List, Tuple import csv +import logging + +logger = logging.getLogger("cobald.runtime.tardis.utilities.utils") async def async_run_command( @@ -82,6 +85,30 @@ def csv_parser( } +def machine_meta_data_translation( + machine_meta_data: AttributeDict, meta_data_translation_mapping: AttributeDict +): + """ + Helper function to translate units of the machine_meta_data to match the + units required by the overlay batch system + :param machine_meta_data: Machine Meta Data (Cores, Memory, Disk) + :param meta_data_translation_mapping: Map used for the translation of meta + data, contains conversion factors + :return: + :rtype: dict + """ + try: + return { + key: meta_data_translation_mapping[key] * value + for key, value in machine_meta_data.items() + } + except KeyError as ke: + logger.critical( + f"machine_meta_data_translation failed: no translation known for {ke}" + ) + raise + + def submit_cmd_option_formatter(options: AttributeDict) -> str: option_prefix = dict(short="-", long="--") option_separator = dict(short=" ", long="=") diff --git a/tests/interfaces_t/test_siteadapter.py b/tests/interfaces_t/test_siteadapter.py index 71eedd64..614b3ac2 100644 --- a/tests/interfaces_t/test_siteadapter.py +++ b/tests/interfaces_t/test_siteadapter.py @@ -65,7 +65,7 @@ def test_drone_environment(self): ) with self.assertLogs( - logger="cobald.runtime.tardis.interfaces.site", level=logging.CRITICAL + logger="cobald.runtime.tardis.utilities.utils", level=logging.CRITICAL ), self.assertRaises(KeyError): self.site_adapter.drone_environment( drone_uuid="test-123", From 912335f646beb0538db86a807b8616bdeecee337 Mon Sep 17 00:00:00 2001 From: Manuel Giffels Date: Tue, 23 Mar 2021 12:30:13 +0100 Subject: [PATCH 3/5] Fix machine meta data translation bug in HTCondor site adapter --- tardis/adapters/sites/htcondor.py | 11 +++++- .../sites_t/test_htcondorsiteadapter.py | 39 +++++++++++++++++-- 2 files changed, 45 insertions(+), 5 deletions(-) diff --git a/tardis/adapters/sites/htcondor.py b/tardis/adapters/sites/htcondor.py index f91a6949..604ed167 100644 --- a/tardis/adapters/sites/htcondor.py +++ b/tardis/adapters/sites/htcondor.py @@ -7,7 +7,7 @@ from ...utilities.attributedict import AttributeDict from ...utilities.staticmapping import StaticMapping from ...utilities.executors.shellexecutor import ShellExecutor -from ...utilities.utils import csv_parser +from ...utilities.utils import csv_parser, machine_meta_data_translation from contextlib import contextmanager from datetime import datetime @@ -58,6 +58,10 @@ async def htcondor_queue_updater(executor): class HTCondorAdapter(SiteAdapter): + htcondor_machine_meta_data_translation_mapping = AttributeDict( + Cores=1, Memory=1024, Disk=1024 * 1024 + ) + def __init__(self, machine_type: str, site_name: str): self._machine_type = machine_type self._site_name = site_name @@ -103,7 +107,10 @@ async def deploy_resource( ) submit_jdl = jdl_template.substitute( - drone_environment, + machine_meta_data_translation( + self.machine_meta_data, + self.htcondor_machine_meta_data_translation_mapping, + ), Environment=";".join( f"TardisDrone{key}={value}" for key, value in drone_environment.items() ), diff --git a/tests/adapters_t/sites_t/test_htcondorsiteadapter.py b/tests/adapters_t/sites_t/test_htcondorsiteadapter.py index 05b78f42..722bee7c 100644 --- a/tests/adapters_t/sites_t/test_htcondorsiteadapter.py +++ b/tests/adapters_t/sites_t/test_htcondorsiteadapter.py @@ -34,7 +34,7 @@ CONDOR_SUSPEND_FAILED_MESSAGE = """Run command condor_suspend 1351043 via ShellExecutor failed""" -CONDOR_SUBMIT_JDL = """executable = start_pilot.sh +CONDOR_SUBMIT_JDL_CONDOR_OBS = """executable = start_pilot.sh transfer_input_files = setup_pilot.sh output = logs/$(cluster).$(process).out error = logs/$(cluster).$(process).err @@ -50,6 +50,22 @@ queue 1""" # noqa: B950 +CONDOR_SUBMIT_JDL_SPARK_OBS = """executable = start_pilot.sh +transfer_input_files = setup_pilot.sh +output = logs/$(cluster).$(process).out +error = logs/$(cluster).$(process).err +log = logs/cluster.log + +accounting_group=tardis + +environment=TardisDroneCores=8;TardisDroneMemory=32;TardisDroneDisk=160;TardisDroneUuid=test-123 + +request_cpus=8 +request_memory=32768 +request_disk=167772160 + +queue 1""" # noqa: B950 + class TestHTCondorSiteAdapter(TestCase): mock_config_patcher = None @@ -94,7 +110,7 @@ def machine_type_configuration(self): ) @mock_executor_run_command(stdout=CONDOR_SUBMIT_OUTPUT) - def test_deploy_resource(self): + def test_deploy_resource_htcondor_obs(self): response = run_async( self.adapter.deploy_resource, AttributeDict( @@ -111,7 +127,24 @@ def test_deploy_resource(self): self.assertFalse(response.updated - datetime.now() > timedelta(seconds=1)) self.mock_executor.return_value.run_command.assert_called_with( - "condor_submit", stdin_input=CONDOR_SUBMIT_JDL + "condor_submit", stdin_input=CONDOR_SUBMIT_JDL_CONDOR_OBS + ) + self.mock_executor.reset() + + run_async( + self.adapter.deploy_resource, + AttributeDict( + drone_uuid="test-123", + obs_machine_meta_data_translation_mapping=AttributeDict( + Cores=1, + Memory=1, + Disk=1, + ), + ), + ) + + self.mock_executor.return_value.run_command.assert_called_with( + "condor_submit", stdin_input=CONDOR_SUBMIT_JDL_SPARK_OBS ) self.mock_executor.reset() From 3718083d4e0b4fca9c1c68869365fddfb9aa3b07 Mon Sep 17 00:00:00 2001 From: Manuel Giffels Date: Tue, 23 Mar 2021 12:48:44 +0100 Subject: [PATCH 4/5] Add changelog for translation bug --- docs/source/changelog.rst | 1 + .../173.fix_meta_data_translation_htcondor.yaml | 12 ++++++++++++ 2 files changed, 13 insertions(+) create mode 100644 docs/source/changes/173.fix_meta_data_translation_htcondor.yaml diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst index 08c7f36c..9bd87053 100644 --- a/docs/source/changelog.rst +++ b/docs/source/changelog.rst @@ -18,6 +18,7 @@ Fixed ----- * Fixes a bug that the drone_minimum_lifetime parameter is not working as described in the documentation +* Fixes a bug in the HTCondor Site Adapter that could leading to wrong requirements when using non HTCondor OBS [0.5.0] - 2020-12-09 ==================== diff --git a/docs/source/changes/173.fix_meta_data_translation_htcondor.yaml b/docs/source/changes/173.fix_meta_data_translation_htcondor.yaml new file mode 100644 index 00000000..6888fe53 --- /dev/null +++ b/docs/source/changes/173.fix_meta_data_translation_htcondor.yaml @@ -0,0 +1,12 @@ +category: fixed +summary: "Fixes a bug in the HTCondor Site Adapter that could leading to wrong requirements when using non HTCondor OBS" +description: | + The HTCondor Site Adapter takes a wrong `machine_meta_data_translation_mapping` into account in some circumstances. + Due to a bug introduced in #157, the HTCondor Site Adapter uses the `machine_meta_data_translation_mapping` of the + Batchsystem Adapter (OBS). In case the OBS is also HTCondor or the OBS has the same translations it does not have any + affect. However, in case the OBS is using different units for memory and disk space --hence different translation + mappings-- the requested Drones have wrong requirements. +pull requests: + - 173 +issues: + - 170 From 7793f5781012df65dd6feea3374e1d6e1384efbf Mon Sep 17 00:00:00 2001 From: Manuel Giffels Date: Tue, 23 Mar 2021 14:58:48 +0100 Subject: [PATCH 5/5] Fix typo in changelog Co-authored-by: Max Fischer --- docs/source/changelog.rst | 2 +- docs/source/changes/173.fix_meta_data_translation_htcondor.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst index 9bd87053..1f951583 100644 --- a/docs/source/changelog.rst +++ b/docs/source/changelog.rst @@ -18,7 +18,7 @@ Fixed ----- * Fixes a bug that the drone_minimum_lifetime parameter is not working as described in the documentation -* Fixes a bug in the HTCondor Site Adapter that could leading to wrong requirements when using non HTCondor OBS +* Fixes a bug in the HTCondor Site Adapter which leads to wrong requirements when using non HTCondor OBS [0.5.0] - 2020-12-09 ==================== diff --git a/docs/source/changes/173.fix_meta_data_translation_htcondor.yaml b/docs/source/changes/173.fix_meta_data_translation_htcondor.yaml index 6888fe53..bf434ebb 100644 --- a/docs/source/changes/173.fix_meta_data_translation_htcondor.yaml +++ b/docs/source/changes/173.fix_meta_data_translation_htcondor.yaml @@ -1,5 +1,5 @@ category: fixed -summary: "Fixes a bug in the HTCondor Site Adapter that could leading to wrong requirements when using non HTCondor OBS" +summary: "Fixes a bug in the HTCondor Site Adapter which leads to wrong requirements when using non HTCondor OBS" description: | The HTCondor Site Adapter takes a wrong `machine_meta_data_translation_mapping` into account in some circumstances. Due to a bug introduced in #157, the HTCondor Site Adapter uses the `machine_meta_data_translation_mapping` of the