diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst index c2c4f067..1f951583 100644 --- a/docs/source/changelog.rst +++ b/docs/source/changelog.rst @@ -1,4 +1,4 @@ -.. Created by changelog.py at 2021-03-22, command +.. Created by changelog.py at 2021-03-23, command '/Users/giffler/.cache/pre-commit/repor6pnmwlm/py_env-python3.9/bin/changelog docs/source/changes compile --output=docs/source/changelog.rst' based on the format of 'https://keepachangelog.com/' @@ -6,7 +6,7 @@ CHANGELOG ######### -[Unreleased] - 2021-03-22 +[Unreleased] - 2021-03-23 ========================= Added @@ -18,6 +18,7 @@ Fixed ----- * Fixes a bug that the drone_minimum_lifetime parameter is not working as described in the documentation +* Fixes a bug in the HTCondor Site Adapter which leads to wrong requirements when using non HTCondor OBS [0.5.0] - 2020-12-09 ==================== diff --git a/docs/source/changes/173.fix_meta_data_translation_htcondor.yaml b/docs/source/changes/173.fix_meta_data_translation_htcondor.yaml new file mode 100644 index 00000000..bf434ebb --- /dev/null +++ b/docs/source/changes/173.fix_meta_data_translation_htcondor.yaml @@ -0,0 +1,12 @@ +category: fixed +summary: "Fixes a bug in the HTCondor Site Adapter which leads to wrong requirements when using non HTCondor OBS" +description: | + The HTCondor Site Adapter takes a wrong `machine_meta_data_translation_mapping` into account in some circumstances. + Due to a bug introduced in #157, the HTCondor Site Adapter uses the `machine_meta_data_translation_mapping` of the + Batchsystem Adapter (OBS). In case the OBS is also HTCondor or the OBS has the same translations it does not have any + affect. However, in case the OBS is using different units for memory and disk space --hence different translation + mappings-- the requested Drones have wrong requirements. +pull requests: + - 173 +issues: + - 170 diff --git a/tardis/adapters/sites/htcondor.py b/tardis/adapters/sites/htcondor.py index fa4a7bb4..604ed167 100644 --- a/tardis/adapters/sites/htcondor.py +++ b/tardis/adapters/sites/htcondor.py @@ -7,7 +7,7 @@ from ...utilities.attributedict import AttributeDict from ...utilities.staticmapping import StaticMapping from ...utilities.executors.shellexecutor import ShellExecutor -from ...utilities.utils import csv_parser +from ...utilities.utils import csv_parser, machine_meta_data_translation from contextlib import contextmanager from datetime import datetime @@ -58,6 +58,10 @@ async def htcondor_queue_updater(executor): class HTCondorAdapter(SiteAdapter): + htcondor_machine_meta_data_translation_mapping = AttributeDict( + Cores=1, Memory=1024, Disk=1024 * 1024 + ) + def __init__(self, machine_type: str, site_name: str): self._machine_type = machine_type self._site_name = site_name @@ -99,11 +103,14 @@ async def deploy_resource( drone_environment = self.drone_environment( resource_attributes.drone_uuid, - resource_attributes.machine_meta_data_translation_mapping, + resource_attributes.obs_machine_meta_data_translation_mapping, ) submit_jdl = jdl_template.substitute( - drone_environment, + machine_meta_data_translation( + self.machine_meta_data, + self.htcondor_machine_meta_data_translation_mapping, + ), Environment=";".join( f"TardisDrone{key}={value}" for key, value in drone_environment.items() ), diff --git a/tardis/adapters/sites/slurm.py b/tardis/adapters/sites/slurm.py index 7f79d108..72b4f69e 100644 --- a/tardis/adapters/sites/slurm.py +++ b/tardis/adapters/sites/slurm.py @@ -110,7 +110,7 @@ async def deploy_resource( sbatch_cmdline_option_string = submit_cmd_option_formatter( self.sbatch_cmdline_options( resource_attributes.drone_uuid, - resource_attributes.machine_meta_data_translation_mapping, + resource_attributes.obs_machine_meta_data_translation_mapping, ) ) diff --git a/tardis/interfaces/siteadapter.py b/tardis/interfaces/siteadapter.py index 42e91c80..e3b98e79 100644 --- a/tardis/interfaces/siteadapter.py +++ b/tardis/interfaces/siteadapter.py @@ -1,5 +1,6 @@ from ..configuration.configuration import Configuration from ..utilities.attributedict import AttributeDict +from ..utilities.utils import machine_meta_data_translation from abc import ABCMeta, abstractmethod from cobald.utility.primitives import infinity as inf @@ -92,16 +93,10 @@ def drone_environment( :return: Translated :rtype: dict """ - try: - drone_environment = { - key: meta_data_translation_mapping[key] * value - for key, value in self.machine_meta_data.items() - } - except KeyError as ke: - logger.critical(f"drone_environment failed: no translation known for {ke}") - raise - else: - drone_environment["Uuid"] = drone_uuid + drone_environment = machine_meta_data_translation( + self.machine_meta_data, meta_data_translation_mapping + ) + drone_environment["Uuid"] = drone_uuid return drone_environment diff --git a/tardis/resources/drone.py b/tardis/resources/drone.py index 2cb7bbf8..95cd783c 100644 --- a/tardis/resources/drone.py +++ b/tardis/resources/drone.py @@ -40,7 +40,7 @@ def __init__( self.resource_attributes = AttributeDict( site_name=self._site_agent.site_name, machine_type=self.site_agent.machine_type, - machine_meta_data_translation_mapping=self.batch_system_agent.machine_meta_data_translation_mapping, # noqa B950 + obs_machine_meta_data_translation_mapping=self.batch_system_agent.machine_meta_data_translation_mapping, # noqa B950 remote_resource_uuid=remote_resource_uuid, created=created or datetime.now(), updated=updated or datetime.now(), diff --git a/tardis/utilities/utils.py b/tardis/utilities/utils.py index b4ba1124..20d21eaf 100644 --- a/tardis/utilities/utils.py +++ b/tardis/utilities/utils.py @@ -7,6 +7,9 @@ from typing import List, Tuple import csv +import logging + +logger = logging.getLogger("cobald.runtime.tardis.utilities.utils") async def async_run_command( @@ -82,6 +85,30 @@ def csv_parser( } +def machine_meta_data_translation( + machine_meta_data: AttributeDict, meta_data_translation_mapping: AttributeDict +): + """ + Helper function to translate units of the machine_meta_data to match the + units required by the overlay batch system + :param machine_meta_data: Machine Meta Data (Cores, Memory, Disk) + :param meta_data_translation_mapping: Map used for the translation of meta + data, contains conversion factors + :return: + :rtype: dict + """ + try: + return { + key: meta_data_translation_mapping[key] * value + for key, value in machine_meta_data.items() + } + except KeyError as ke: + logger.critical( + f"machine_meta_data_translation failed: no translation known for {ke}" + ) + raise + + def submit_cmd_option_formatter(options: AttributeDict) -> str: option_prefix = dict(short="-", long="--") option_separator = dict(short=" ", long="=") diff --git a/tests/adapters_t/sites_t/test_htcondorsiteadapter.py b/tests/adapters_t/sites_t/test_htcondorsiteadapter.py index a83d3947..722bee7c 100644 --- a/tests/adapters_t/sites_t/test_htcondorsiteadapter.py +++ b/tests/adapters_t/sites_t/test_htcondorsiteadapter.py @@ -34,7 +34,7 @@ CONDOR_SUSPEND_FAILED_MESSAGE = """Run command condor_suspend 1351043 via ShellExecutor failed""" -CONDOR_SUBMIT_JDL = """executable = start_pilot.sh +CONDOR_SUBMIT_JDL_CONDOR_OBS = """executable = start_pilot.sh transfer_input_files = setup_pilot.sh output = logs/$(cluster).$(process).out error = logs/$(cluster).$(process).err @@ -50,6 +50,22 @@ queue 1""" # noqa: B950 +CONDOR_SUBMIT_JDL_SPARK_OBS = """executable = start_pilot.sh +transfer_input_files = setup_pilot.sh +output = logs/$(cluster).$(process).out +error = logs/$(cluster).$(process).err +log = logs/cluster.log + +accounting_group=tardis + +environment=TardisDroneCores=8;TardisDroneMemory=32;TardisDroneDisk=160;TardisDroneUuid=test-123 + +request_cpus=8 +request_memory=32768 +request_disk=167772160 + +queue 1""" # noqa: B950 + class TestHTCondorSiteAdapter(TestCase): mock_config_patcher = None @@ -94,12 +110,12 @@ def machine_type_configuration(self): ) @mock_executor_run_command(stdout=CONDOR_SUBMIT_OUTPUT) - def test_deploy_resource(self): + def test_deploy_resource_htcondor_obs(self): response = run_async( self.adapter.deploy_resource, AttributeDict( drone_uuid="test-123", - machine_meta_data_translation_mapping=AttributeDict( + obs_machine_meta_data_translation_mapping=AttributeDict( Cores=1, Memory=1024, Disk=1024 * 1024, @@ -111,7 +127,24 @@ def test_deploy_resource(self): self.assertFalse(response.updated - datetime.now() > timedelta(seconds=1)) self.mock_executor.return_value.run_command.assert_called_with( - "condor_submit", stdin_input=CONDOR_SUBMIT_JDL + "condor_submit", stdin_input=CONDOR_SUBMIT_JDL_CONDOR_OBS + ) + self.mock_executor.reset() + + run_async( + self.adapter.deploy_resource, + AttributeDict( + drone_uuid="test-123", + obs_machine_meta_data_translation_mapping=AttributeDict( + Cores=1, + Memory=1, + Disk=1, + ), + ), + ) + + self.mock_executor.return_value.run_command.assert_called_with( + "condor_submit", stdin_input=CONDOR_SUBMIT_JDL_SPARK_OBS ) self.mock_executor.reset() @@ -125,7 +158,7 @@ def test_translate_resources_raises_logs(self): self.adapter.deploy_resource, AttributeDict( drone_uuid="test-123", - machine_meta_data_translation_mapping=AttributeDict( + obs_machine_meta_data_translation_mapping=AttributeDict( Cores=1, Memory=1024, Disk=1024 * 1024, diff --git a/tests/adapters_t/sites_t/test_slurm.py b/tests/adapters_t/sites_t/test_slurm.py index 19897eb4..63bdc8b5 100644 --- a/tests/adapters_t/sites_t/test_slurm.py +++ b/tests/adapters_t/sites_t/test_slurm.py @@ -162,7 +162,7 @@ def test_deploy_resource(self): resource_attributes=AttributeDict( machine_type="test2large", site_name="TestSite", - machine_meta_data_translation_mapping=AttributeDict( + obs_machine_meta_data_translation_mapping=AttributeDict( Cores=1, Memory=1000, Disk=1000, @@ -199,7 +199,7 @@ def test_deploy_resource_w_submit_options(self): resource_attributes=AttributeDict( machine_type="test2large", site_name="TestSite", - machine_meta_data_translation_mapping=AttributeDict( + obs_machine_meta_data_translation_mapping=AttributeDict( Cores=1, Memory=1000, Disk=1000, diff --git a/tests/interfaces_t/test_siteadapter.py b/tests/interfaces_t/test_siteadapter.py index 71eedd64..614b3ac2 100644 --- a/tests/interfaces_t/test_siteadapter.py +++ b/tests/interfaces_t/test_siteadapter.py @@ -65,7 +65,7 @@ def test_drone_environment(self): ) with self.assertLogs( - logger="cobald.runtime.tardis.interfaces.site", level=logging.CRITICAL + logger="cobald.runtime.tardis.utilities.utils", level=logging.CRITICAL ), self.assertRaises(KeyError): self.site_adapter.drone_environment( drone_uuid="test-123",