Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix htcondor site adapter #173

Merged
merged 5 commits into from
Mar 23, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions docs/source/changelog.rst
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
.. Created by changelog.py at 2021-03-22, command
.. Created by changelog.py at 2021-03-23, command
'/Users/giffler/.cache/pre-commit/repor6pnmwlm/py_env-python3.9/bin/changelog docs/source/changes compile --output=docs/source/changelog.rst'
based on the format of 'https://keepachangelog.com/'

#########
CHANGELOG
#########

[Unreleased] - 2021-03-22
[Unreleased] - 2021-03-23
=========================

Added
Expand All @@ -18,6 +18,7 @@ Fixed
-----

* Fixes a bug that the drone_minimum_lifetime parameter is not working as described in the documentation
* Fixes a bug in the HTCondor Site Adapter which leads to wrong requirements when using non HTCondor OBS

[0.5.0] - 2020-12-09
====================
Expand Down
12 changes: 12 additions & 0 deletions docs/source/changes/173.fix_meta_data_translation_htcondor.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
category: fixed
summary: "Fixes a bug in the HTCondor Site Adapter which leads to wrong requirements when using non HTCondor OBS"
description: |
The HTCondor Site Adapter takes a wrong `machine_meta_data_translation_mapping` into account in some circumstances.
Due to a bug introduced in #157, the HTCondor Site Adapter uses the `machine_meta_data_translation_mapping` of the
Batchsystem Adapter (OBS). In case the OBS is also HTCondor or the OBS has the same translations it does not have any
affect. However, in case the OBS is using different units for memory and disk space --hence different translation
mappings-- the requested Drones have wrong requirements.
pull requests:
- 173
issues:
- 170
13 changes: 10 additions & 3 deletions tardis/adapters/sites/htcondor.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from ...utilities.attributedict import AttributeDict
from ...utilities.staticmapping import StaticMapping
from ...utilities.executors.shellexecutor import ShellExecutor
from ...utilities.utils import csv_parser
from ...utilities.utils import csv_parser, machine_meta_data_translation

from contextlib import contextmanager
from datetime import datetime
Expand Down Expand Up @@ -58,6 +58,10 @@ async def htcondor_queue_updater(executor):


class HTCondorAdapter(SiteAdapter):
htcondor_machine_meta_data_translation_mapping = AttributeDict(
Cores=1, Memory=1024, Disk=1024 * 1024
)

def __init__(self, machine_type: str, site_name: str):
self._machine_type = machine_type
self._site_name = site_name
Expand Down Expand Up @@ -99,11 +103,14 @@ async def deploy_resource(

drone_environment = self.drone_environment(
resource_attributes.drone_uuid,
resource_attributes.machine_meta_data_translation_mapping,
resource_attributes.obs_machine_meta_data_translation_mapping,
)

submit_jdl = jdl_template.substitute(
drone_environment,
machine_meta_data_translation(
self.machine_meta_data,
self.htcondor_machine_meta_data_translation_mapping,
),
Environment=";".join(
f"TardisDrone{key}={value}" for key, value in drone_environment.items()
),
Expand Down
2 changes: 1 addition & 1 deletion tardis/adapters/sites/slurm.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ async def deploy_resource(
sbatch_cmdline_option_string = submit_cmd_option_formatter(
self.sbatch_cmdline_options(
resource_attributes.drone_uuid,
resource_attributes.machine_meta_data_translation_mapping,
resource_attributes.obs_machine_meta_data_translation_mapping,
)
)

Expand Down
15 changes: 5 additions & 10 deletions tardis/interfaces/siteadapter.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from ..configuration.configuration import Configuration
from ..utilities.attributedict import AttributeDict
from ..utilities.utils import machine_meta_data_translation

from abc import ABCMeta, abstractmethod
from cobald.utility.primitives import infinity as inf
Expand Down Expand Up @@ -92,16 +93,10 @@ def drone_environment(
:return: Translated
:rtype: dict
"""
try:
drone_environment = {
key: meta_data_translation_mapping[key] * value
for key, value in self.machine_meta_data.items()
}
except KeyError as ke:
logger.critical(f"drone_environment failed: no translation known for {ke}")
raise
else:
drone_environment["Uuid"] = drone_uuid
drone_environment = machine_meta_data_translation(
self.machine_meta_data, meta_data_translation_mapping
)
drone_environment["Uuid"] = drone_uuid

return drone_environment

Expand Down
2 changes: 1 addition & 1 deletion tardis/resources/drone.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def __init__(
self.resource_attributes = AttributeDict(
site_name=self._site_agent.site_name,
machine_type=self.site_agent.machine_type,
machine_meta_data_translation_mapping=self.batch_system_agent.machine_meta_data_translation_mapping, # noqa B950
obs_machine_meta_data_translation_mapping=self.batch_system_agent.machine_meta_data_translation_mapping, # noqa B950
remote_resource_uuid=remote_resource_uuid,
created=created or datetime.now(),
updated=updated or datetime.now(),
Expand Down
27 changes: 27 additions & 0 deletions tardis/utilities/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@
from typing import List, Tuple

import csv
import logging

logger = logging.getLogger("cobald.runtime.tardis.utilities.utils")


async def async_run_command(
Expand Down Expand Up @@ -82,6 +85,30 @@ def csv_parser(
}


def machine_meta_data_translation(
machine_meta_data: AttributeDict, meta_data_translation_mapping: AttributeDict
):
"""
Helper function to translate units of the machine_meta_data to match the
units required by the overlay batch system
:param machine_meta_data: Machine Meta Data (Cores, Memory, Disk)
:param meta_data_translation_mapping: Map used for the translation of meta
data, contains conversion factors
:return:
:rtype: dict
"""
try:
return {
key: meta_data_translation_mapping[key] * value
for key, value in machine_meta_data.items()
}
except KeyError as ke:
logger.critical(
f"machine_meta_data_translation failed: no translation known for {ke}"
)
raise


def submit_cmd_option_formatter(options: AttributeDict) -> str:
option_prefix = dict(short="-", long="--")
option_separator = dict(short=" ", long="=")
Expand Down
43 changes: 38 additions & 5 deletions tests/adapters_t/sites_t/test_htcondorsiteadapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
CONDOR_SUSPEND_FAILED_MESSAGE = """Run command condor_suspend 1351043 via
ShellExecutor failed"""

CONDOR_SUBMIT_JDL = """executable = start_pilot.sh
CONDOR_SUBMIT_JDL_CONDOR_OBS = """executable = start_pilot.sh
transfer_input_files = setup_pilot.sh
output = logs/$(cluster).$(process).out
error = logs/$(cluster).$(process).err
Expand All @@ -50,6 +50,22 @@

queue 1""" # noqa: B950

CONDOR_SUBMIT_JDL_SPARK_OBS = """executable = start_pilot.sh
transfer_input_files = setup_pilot.sh
output = logs/$(cluster).$(process).out
error = logs/$(cluster).$(process).err
log = logs/cluster.log

accounting_group=tardis

environment=TardisDroneCores=8;TardisDroneMemory=32;TardisDroneDisk=160;TardisDroneUuid=test-123

request_cpus=8
request_memory=32768
request_disk=167772160

queue 1""" # noqa: B950


class TestHTCondorSiteAdapter(TestCase):
mock_config_patcher = None
Expand Down Expand Up @@ -94,12 +110,12 @@ def machine_type_configuration(self):
)

@mock_executor_run_command(stdout=CONDOR_SUBMIT_OUTPUT)
def test_deploy_resource(self):
def test_deploy_resource_htcondor_obs(self):
response = run_async(
self.adapter.deploy_resource,
AttributeDict(
drone_uuid="test-123",
machine_meta_data_translation_mapping=AttributeDict(
obs_machine_meta_data_translation_mapping=AttributeDict(
Cores=1,
Memory=1024,
Disk=1024 * 1024,
Expand All @@ -111,7 +127,24 @@ def test_deploy_resource(self):
self.assertFalse(response.updated - datetime.now() > timedelta(seconds=1))

self.mock_executor.return_value.run_command.assert_called_with(
"condor_submit", stdin_input=CONDOR_SUBMIT_JDL
"condor_submit", stdin_input=CONDOR_SUBMIT_JDL_CONDOR_OBS
)
self.mock_executor.reset()

run_async(
self.adapter.deploy_resource,
AttributeDict(
drone_uuid="test-123",
obs_machine_meta_data_translation_mapping=AttributeDict(
Cores=1,
Memory=1,
Disk=1,
),
),
)

self.mock_executor.return_value.run_command.assert_called_with(
"condor_submit", stdin_input=CONDOR_SUBMIT_JDL_SPARK_OBS
)
self.mock_executor.reset()

Expand All @@ -125,7 +158,7 @@ def test_translate_resources_raises_logs(self):
self.adapter.deploy_resource,
AttributeDict(
drone_uuid="test-123",
machine_meta_data_translation_mapping=AttributeDict(
obs_machine_meta_data_translation_mapping=AttributeDict(
Cores=1,
Memory=1024,
Disk=1024 * 1024,
Expand Down
4 changes: 2 additions & 2 deletions tests/adapters_t/sites_t/test_slurm.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ def test_deploy_resource(self):
resource_attributes=AttributeDict(
machine_type="test2large",
site_name="TestSite",
machine_meta_data_translation_mapping=AttributeDict(
obs_machine_meta_data_translation_mapping=AttributeDict(
Cores=1,
Memory=1000,
Disk=1000,
Expand Down Expand Up @@ -199,7 +199,7 @@ def test_deploy_resource_w_submit_options(self):
resource_attributes=AttributeDict(
machine_type="test2large",
site_name="TestSite",
machine_meta_data_translation_mapping=AttributeDict(
obs_machine_meta_data_translation_mapping=AttributeDict(
Cores=1,
Memory=1000,
Disk=1000,
Expand Down
2 changes: 1 addition & 1 deletion tests/interfaces_t/test_siteadapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def test_drone_environment(self):
)

with self.assertLogs(
logger="cobald.runtime.tardis.interfaces.site", level=logging.CRITICAL
logger="cobald.runtime.tardis.utilities.utils", level=logging.CRITICAL
), self.assertRaises(KeyError):
self.site_adapter.drone_environment(
drone_uuid="test-123",
Expand Down