From 0d1cc4836211b085466845b4ade4f3c89b17b4eb Mon Sep 17 00:00:00 2001 From: Vinicius Moreira Date: Tue, 23 Aug 2022 18:04:59 -0300 Subject: [PATCH 01/12] [optimizer] improvement: map AMD GPU ids as integers --- CHANGELOG.md | 4 ++ guapow/__init__.py | 2 +- guapow/service/optimizer/gpu.py | 38 +++++++++++++------ .../optimizer/gpu/test_amd_gpu_driver.py | 22 ++++++----- 4 files changed, 43 insertions(+), 23 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 157916f..7706f0c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). +## [1.2.2] +### Improvements +- Minor code refactoring + ## [1.2.1] 2022-08-22 ### Fixes - Performance mode not being activated for AMD GPUs from the RX 6XX0 series (tested on kernels >= 5.15) diff --git a/guapow/__init__.py b/guapow/__init__.py index 4c2cea2..e631cb0 100644 --- a/guapow/__init__.py +++ b/guapow/__init__.py @@ -2,4 +2,4 @@ ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) __app_name__ = 'guapow' -__version__ = '1.2.1' +__version__ = '1.2.2' diff --git a/guapow/service/optimizer/gpu.py b/guapow/service/optimizer/gpu.py index 85a77ba..ece40bb 100644 --- a/guapow/service/optimizer/gpu.py +++ b/guapow/service/optimizer/gpu.py @@ -199,6 +199,7 @@ def __init__(self, cache: bool, logger: Logger, gpus_path: str = '/sys/class/drm super(AMDGPUDriver, self).__init__(cache, logger) self._gpus_path = gpus_path self._re_power_mode: Optional[Pattern] = None + self._re_extract_id: Optional[Pattern] = None @classmethod def get_vendor_name(cls) -> str: @@ -214,6 +215,14 @@ def re_power_mode(self) -> Pattern: return self._re_power_mode + @property + def re_extract_id(self) -> Pattern: + if not self._re_extract_id: + gen_pattern = re.compile(r'/\w+{id}').findall(self._gpus_path)[0].replace('{id}', r'(\d+)') + self._re_extract_id = re.compile(gen_pattern) + + return self._re_extract_id + async def get_gpus(self) -> Optional[Set[str]]: required_files = {self.PERFORMANCE_FILE: set(), self.PROFILE_FILE: set()} @@ -237,7 +246,11 @@ async def get_gpus(self) -> Optional[Set[str]]: all_files_available = False if all_files_available: - gpus.add(gpu_dir) + try: + gpus.add(self.re_extract_id.findall(gpu_dir)[0]) + except IndexError: + self._log.error(f"[{self.__class__.__name__}] Could not extract AMD GPU id from directory: " + f"{gpu_dir}") return gpus if gpus else None @@ -263,14 +276,15 @@ def _map_power_mode_output(self, output: str, file_path: str) -> Optional[str]: f"Content: {content_log}") async def _fill_power_mode(self, gpu_id: str, gpu_modes: Dict[str, str]): - control_file = f'{gpu_id}/{self.PERFORMANCE_FILE}' + gpu_dir = self._gpus_path.format(id=gpu_id) + control_file = f'{gpu_dir}/{self.PERFORMANCE_FILE}' control_type = await self._read_file(control_file) self._log.debug(f"{self.get_vendor_name()} GPU file ({control_file}): {control_type}") if not control_type: return - power_file = f'{gpu_id}/{self.PROFILE_FILE}' + power_file = f'{gpu_dir}/{self.PROFILE_FILE}' power_mode = self._map_power_mode_output(await self._read_file(power_file), power_file) self._log.debug(f"{self.get_vendor_name()} GPU file ({power_file}): {power_mode}") @@ -283,7 +297,7 @@ async def get_power_mode(self, gpu_ids: Set[str], user_environment: Optional[Dic -> Optional[Dict[str, str]]: if gpu_ids: res = {} - await asyncio.gather(*tuple(self._fill_power_mode(gpu_id, res) for gpu_id in gpu_ids)) + await asyncio.gather(*tuple(self._fill_power_mode(id_, res) for id_ in gpu_ids)) return res if res else None async def _write_to_file(self, file_path: str, content: str) -> bool: @@ -304,22 +318,22 @@ async def set_power_mode(self, ids_modes: Dict[str, str], res = {} if ids_modes: coros, writes = [], dict() - for gpu_dir, mode_str in ids_modes.items(): + for id_, mode_str in ids_modes.items(): mode = mode_str.split(':') if len(mode) == 2: + gpu_dir = self._gpus_path.format(id=id_) self._log.info(f"Changing {self.get_vendor_name()} GPU ({gpu_dir}) operation mode " f"(performance: {mode[0]}, profile: {mode[1]})") - writes[gpu_dir] = list() - coros.append(self._fill_write_result(f'{gpu_dir}/{self.PERFORMANCE_FILE}', mode[0], gpu_dir, - writes)) - coros.append(self._fill_write_result(f'{gpu_dir}/{self.PROFILE_FILE}', mode[1], gpu_dir, writes)) + writes[id_] = list() + coros.append(self._fill_write_result(f'{gpu_dir}/{self.PERFORMANCE_FILE}', mode[0], id_, writes)) + coros.append(self._fill_write_result(f'{gpu_dir}/{self.PROFILE_FILE}', mode[1], id_, writes)) await asyncio.gather(*coros) - for gpu_dir in ids_modes: - gpu_writes = writes.get(gpu_dir) - res[gpu_dir] = gpu_writes and all(gpu_writes) + for id_ in ids_modes: + gpu_writes = writes.get(id_) + res[id_] = gpu_writes and all(gpu_writes) return res diff --git a/tests/service/optimizer/gpu/test_amd_gpu_driver.py b/tests/service/optimizer/gpu/test_amd_gpu_driver.py index b1729a8..3270447 100644 --- a/tests/service/optimizer/gpu/test_amd_gpu_driver.py +++ b/tests/service/optimizer/gpu/test_amd_gpu_driver.py @@ -1,4 +1,5 @@ import os +import re import shutil import sys import traceback @@ -51,20 +52,19 @@ async def test_get_gpus__return_available_gpus_when_required_files_exist(self): returned = await driver.get_gpus() self.assertIsNotNone(returned) - expected = {TEST_GPU_FOLDER.format(id=n) for n in (1, 2, 3, 4)} - self.assertEqual(expected, returned) + self.assertEqual({'1', '2', '3', '4'}, returned) async def test_get_power_mode__return_a_string_concatenating_the_performance_and_profile_ids(self): driver = AMDGPUDriver(cache=False, logger=Mock(), gpus_path=TEST_GPU_FOLDER) - gpu_dirs = {TEST_GPU_FOLDER.format(id=n) for n in (1, 2, 3, 4)} - actual_modes = await driver.get_power_mode(gpu_dirs) + gpu_ids = {str(n) for n in range(1, 5)} + actual_modes = await driver.get_power_mode(gpu_ids) expected = { - TEST_GPU_FOLDER.format(id=1): 'manual:3', - TEST_GPU_FOLDER.format(id=2): 'manual:5', - TEST_GPU_FOLDER.format(id=3): 'auto:5', - TEST_GPU_FOLDER.format(id=4): 'auto:0' + '1': 'manual:3', + '2': 'manual:5', + '3': 'auto:5', + '4': 'auto:0' } self.assertEqual(expected, actual_modes) @@ -76,8 +76,10 @@ async def test_set_power_mode__write_the_expected_content_within_the_parsed_mode self.fail(f"Could not copy example folder '{example_folder}'") driver = AMDGPUDriver(cache=False, logger=Mock(), gpus_path=TEST_GPU_FOLDER) - res = await driver.set_power_mode({TEMP_GPU_FOLDER: driver.get_performance_mode()}) - self.assertEqual({TEMP_GPU_FOLDER: True}, res) + card_id = re.compile(r'\d+').findall(TEMP_GPU_FOLDER)[0] + + res = await driver.set_power_mode({card_id: driver.get_performance_mode()}) + self.assertEqual({card_id: True}, res) with open(f'{TEMP_GPU_FOLDER}/{AMDGPUDriver.PERFORMANCE_FILE}') as f: control_mode = f.read() From b4988094bd86f5cdea708eafbbc509ac1721dbad Mon Sep 17 00:00:00 2001 From: Vinicius Moreira Date: Tue, 23 Aug 2022 18:26:53 -0300 Subject: [PATCH 02/12] [optimizer] improvement: using tasks instead of coroutines to read/write AMD GPU performance files --- CHANGELOG.md | 2 +- guapow/service/optimizer/gpu.py | 12 +++++++----- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7706f0c..95ebdcc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,7 +6,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). ## [1.2.2] ### Improvements -- Minor code refactoring +- Minor code refactoring and performance improvements (regarding AMD GPU performance management) ## [1.2.1] 2022-08-22 ### Fixes diff --git a/guapow/service/optimizer/gpu.py b/guapow/service/optimizer/gpu.py index ece40bb..3605048 100644 --- a/guapow/service/optimizer/gpu.py +++ b/guapow/service/optimizer/gpu.py @@ -297,7 +297,7 @@ async def get_power_mode(self, gpu_ids: Set[str], user_environment: Optional[Dic -> Optional[Dict[str, str]]: if gpu_ids: res = {} - await asyncio.gather(*tuple(self._fill_power_mode(id_, res) for id_ in gpu_ids)) + await asyncio.gather(*tuple(asyncio.create_task(self._fill_power_mode(id_, res)) for id_ in gpu_ids)) return res if res else None async def _write_to_file(self, file_path: str, content: str) -> bool: @@ -317,7 +317,7 @@ async def set_power_mode(self, ids_modes: Dict[str, str], user_environment: Optional[Dict[str, str]] = None) -> Dict[str, bool]: res = {} if ids_modes: - coros, writes = [], dict() + tasks, writes = [], dict() for id_, mode_str in ids_modes.items(): mode = mode_str.split(':') @@ -326,10 +326,12 @@ async def set_power_mode(self, ids_modes: Dict[str, str], self._log.info(f"Changing {self.get_vendor_name()} GPU ({gpu_dir}) operation mode " f"(performance: {mode[0]}, profile: {mode[1]})") writes[id_] = list() - coros.append(self._fill_write_result(f'{gpu_dir}/{self.PERFORMANCE_FILE}', mode[0], id_, writes)) - coros.append(self._fill_write_result(f'{gpu_dir}/{self.PROFILE_FILE}', mode[1], id_, writes)) + tasks.append(asyncio.create_task(self._fill_write_result(f'{gpu_dir}/{self.PERFORMANCE_FILE}', + mode[0], id_, writes))) + tasks.append(asyncio.create_task(self._fill_write_result(f'{gpu_dir}/{self.PROFILE_FILE}', + mode[1], id_, writes))) - await asyncio.gather(*coros) + await asyncio.gather(*tasks) for id_ in ids_modes: gpu_writes = writes.get(id_) From e3162549dda659e6408474688f33b7da575f3827 Mon Sep 17 00:00:00 2001 From: Vinicius Moreira Date: Wed, 24 Aug 2022 08:19:55 -0300 Subject: [PATCH 03/12] Revert "[optimizer] improvement: using tasks instead of coroutines to read/write AMD GPU performance files" This reverts commit b4988094bd86f5cdea708eafbbc509ac1721dbad. --- CHANGELOG.md | 2 +- guapow/service/optimizer/gpu.py | 12 +++++------- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 95ebdcc..7706f0c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,7 +6,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). ## [1.2.2] ### Improvements -- Minor code refactoring and performance improvements (regarding AMD GPU performance management) +- Minor code refactoring ## [1.2.1] 2022-08-22 ### Fixes diff --git a/guapow/service/optimizer/gpu.py b/guapow/service/optimizer/gpu.py index 3605048..ece40bb 100644 --- a/guapow/service/optimizer/gpu.py +++ b/guapow/service/optimizer/gpu.py @@ -297,7 +297,7 @@ async def get_power_mode(self, gpu_ids: Set[str], user_environment: Optional[Dic -> Optional[Dict[str, str]]: if gpu_ids: res = {} - await asyncio.gather(*tuple(asyncio.create_task(self._fill_power_mode(id_, res)) for id_ in gpu_ids)) + await asyncio.gather(*tuple(self._fill_power_mode(id_, res) for id_ in gpu_ids)) return res if res else None async def _write_to_file(self, file_path: str, content: str) -> bool: @@ -317,7 +317,7 @@ async def set_power_mode(self, ids_modes: Dict[str, str], user_environment: Optional[Dict[str, str]] = None) -> Dict[str, bool]: res = {} if ids_modes: - tasks, writes = [], dict() + coros, writes = [], dict() for id_, mode_str in ids_modes.items(): mode = mode_str.split(':') @@ -326,12 +326,10 @@ async def set_power_mode(self, ids_modes: Dict[str, str], self._log.info(f"Changing {self.get_vendor_name()} GPU ({gpu_dir}) operation mode " f"(performance: {mode[0]}, profile: {mode[1]})") writes[id_] = list() - tasks.append(asyncio.create_task(self._fill_write_result(f'{gpu_dir}/{self.PERFORMANCE_FILE}', - mode[0], id_, writes))) - tasks.append(asyncio.create_task(self._fill_write_result(f'{gpu_dir}/{self.PROFILE_FILE}', - mode[1], id_, writes))) + coros.append(self._fill_write_result(f'{gpu_dir}/{self.PERFORMANCE_FILE}', mode[0], id_, writes)) + coros.append(self._fill_write_result(f'{gpu_dir}/{self.PROFILE_FILE}', mode[1], id_, writes)) - await asyncio.gather(*tasks) + await asyncio.gather(*coros) for id_ in ids_modes: gpu_writes = writes.get(id_) From 1141cb69e403a9ee6688ebf2887a34ef44bb4a45 Mon Sep 17 00:00:00 2001 From: Vinicius Moreira Date: Wed, 24 Aug 2022 17:20:13 -0300 Subject: [PATCH 04/12] [optimizer] fix: AMD GPU id extraction pattern can capture unexpected strings --- guapow/service/optimizer/gpu.py | 3 +-- tests/service/optimizer/gpu/test_amd_gpu_driver.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/guapow/service/optimizer/gpu.py b/guapow/service/optimizer/gpu.py index ece40bb..788991d 100644 --- a/guapow/service/optimizer/gpu.py +++ b/guapow/service/optimizer/gpu.py @@ -218,8 +218,7 @@ def re_power_mode(self) -> Pattern: @property def re_extract_id(self) -> Pattern: if not self._re_extract_id: - gen_pattern = re.compile(r'/\w+{id}').findall(self._gpus_path)[0].replace('{id}', r'(\d+)') - self._re_extract_id = re.compile(gen_pattern) + self._re_extract_id = re.compile(self._gpus_path.replace('{id}', r'(\d+)')) return self._re_extract_id diff --git a/tests/service/optimizer/gpu/test_amd_gpu_driver.py b/tests/service/optimizer/gpu/test_amd_gpu_driver.py index 3270447..09342ce 100644 --- a/tests/service/optimizer/gpu/test_amd_gpu_driver.py +++ b/tests/service/optimizer/gpu/test_amd_gpu_driver.py @@ -76,7 +76,7 @@ async def test_set_power_mode__write_the_expected_content_within_the_parsed_mode self.fail(f"Could not copy example folder '{example_folder}'") driver = AMDGPUDriver(cache=False, logger=Mock(), gpus_path=TEST_GPU_FOLDER) - card_id = re.compile(r'\d+').findall(TEMP_GPU_FOLDER)[0] + card_id = re.compile(r'/card(\d+)/device$').findall(TEMP_GPU_FOLDER)[0] res = await driver.set_power_mode({card_id: driver.get_performance_mode()}) self.assertEqual({card_id: True}, res) From 98fea9349fa6a4d1f9302b61f047edb2cdaccdbd Mon Sep 17 00:00:00 2001 From: Vinicius Moreira Date: Sun, 11 Sep 2022 12:33:05 -0300 Subject: [PATCH 05/12] [optimizer.gpu] refactoring: AMD GPU logging --- CHANGELOG.md | 2 +- guapow/service/optimizer/gpu.py | 30 +++++++++++++++++------------- 2 files changed, 18 insertions(+), 14 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7706f0c..643f194 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,7 +6,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). ## [1.2.2] ### Improvements -- Minor code refactoring +- Minor code refactoring and log improvements regarding AMD GPU management ## [1.2.1] 2022-08-22 ### Fixes diff --git a/guapow/service/optimizer/gpu.py b/guapow/service/optimizer/gpu.py index 788991d..df70acd 100644 --- a/guapow/service/optimizer/gpu.py +++ b/guapow/service/optimizer/gpu.py @@ -256,12 +256,12 @@ async def get_gpus(self) -> Optional[Set[str]]: async def _read_file(self, file_path: str) -> Optional[str]: try: async with aiofiles.open(file_path) as f: - return (await f.read()).strip() + return await f.read() except: err_stack = traceback.format_exc().replace('\n', ' ') self._log.error(f"[{self.__class__.__name__}] Could not read file '{file_path}': {err_stack}") - def _map_power_mode_output(self, output: str, file_path: str) -> Optional[str]: + def _map_power_profile_output(self, output: str, file_path: str) -> Optional[str]: if output is not None: for raw_line in output.split('\n'): if raw_line.startswith(' '): @@ -271,26 +271,26 @@ def _map_power_mode_output(self, output: str, file_path: str) -> Optional[str]: return line[0].strip() content_log = output.replace('\n', ' ') - self._log.error(f"Could not map the {self.get_vendor_name()} power mode from {file_path}. " + self._log.error(f"Could not map the {self.get_vendor_name()} power profile from {file_path}. " f"Content: {content_log}") async def _fill_power_mode(self, gpu_id: str, gpu_modes: Dict[str, str]): gpu_dir = self._gpus_path.format(id=gpu_id) - control_file = f'{gpu_dir}/{self.PERFORMANCE_FILE}' - control_type = await self._read_file(control_file) - self._log.debug(f"{self.get_vendor_name()} GPU file ({control_file}): {control_type}") + performance_level_file = f"{gpu_dir}/{self.PERFORMANCE_FILE}" + performance_level = (await self._read_file(performance_level_file)).strip() + self._log.debug(f"{self.get_vendor_name()} GPU file ({performance_level_file}): {performance_level}") - if not control_type: + if not performance_level: return - power_file = f'{gpu_dir}/{self.PROFILE_FILE}' - power_mode = self._map_power_mode_output(await self._read_file(power_file), power_file) - self._log.debug(f"{self.get_vendor_name()} GPU file ({power_file}): {power_mode}") + power_profile_file = f"{gpu_dir}/{self.PROFILE_FILE}" + power_profile = self._map_power_profile_output(await self._read_file(power_profile_file), power_profile_file) + self._log.debug(f"{self.get_vendor_name()} GPU file ({power_profile_file}): {power_profile}") - if not power_mode: + if not power_profile: return - gpu_modes[gpu_id] = f'{control_type}:{power_mode}' + gpu_modes[gpu_id] = f"{performance_level}:{power_profile}" async def get_power_mode(self, gpu_ids: Set[str], user_environment: Optional[Dict[str, str]] = None) \ -> Optional[Dict[str, str]]: @@ -322,11 +322,15 @@ async def set_power_mode(self, ids_modes: Dict[str, str], if len(mode) == 2: gpu_dir = self._gpus_path.format(id=id_) - self._log.info(f"Changing {self.get_vendor_name()} GPU ({gpu_dir}) operation mode " + self._log.info(f"Changing {self.get_vendor_name()} GPU ({id_}) operation mode " f"(performance: {mode[0]}, profile: {mode[1]})") writes[id_] = list() coros.append(self._fill_write_result(f'{gpu_dir}/{self.PERFORMANCE_FILE}', mode[0], id_, writes)) coros.append(self._fill_write_result(f'{gpu_dir}/{self.PROFILE_FILE}', mode[1], id_, writes)) + else: + self._log.error(f"Could not change {self.get_vendor_name()} GPU operation mode: " + f"unexpected mode format '{mode_str}' " + f"(expected: 'performance_level:power_profile'") await asyncio.gather(*coros) From e3e7655d21a6c9c45b6b75cba5770020c6bf3ba9 Mon Sep 17 00:00:00 2001 From: Vinicius Moreira Date: Thu, 15 Sep 2022 07:53:07 -0300 Subject: [PATCH 06/12] [optimizer.gpu] refactoring: GPU logging format and data structures --- guapow/service/optimizer/gpu.py | 121 +++++++++++------- guapow/service/optimizer/main.py | 2 +- .../service/optimizer/gpu/test_gpu_manager.py | 4 +- 3 files changed, 76 insertions(+), 51 deletions(-) diff --git a/guapow/service/optimizer/gpu.py b/guapow/service/optimizer/gpu.py index df70acd..8abbb85 100644 --- a/guapow/service/optimizer/gpu.py +++ b/guapow/service/optimizer/gpu.py @@ -7,7 +7,7 @@ from asyncio import Lock from copy import deepcopy from glob import glob -from logging import Logger +from logging import Logger, ERROR, DEBUG, WARNING, INFO from re import Pattern from typing import Optional, Tuple, Set, Dict, List, Type, AsyncIterator, Any @@ -27,7 +27,7 @@ class GPUDriver(ABC): @abstractmethod def __init__(self, cache: bool, logger: Logger): - self._log = logger + self._logger = logger self._lock = Lock() self._cache_lock = Lock() if cache else None self._gpus: Optional[Set[str]] = None @@ -78,6 +78,19 @@ def get_default_mode(self) -> Any: def get_performance_mode(self) -> Any: pass + def _log(self, msg: str, level: int = INFO): + final_msg = f"{self.get_vendor_name()}: {msg}" + if level == INFO: + self._logger.info(final_msg) + elif level == DEBUG: + self._logger.debug(final_msg) + elif level == ERROR: + self._logger.error(final_msg) + elif level == WARNING: + self._logger.warning(final_msg) + else: + self._logger.info(final_msg) + class NvidiaPowerMode(CustomEnum): ON_DEMAND = 0 @@ -132,7 +145,7 @@ async def set_power_mode(self, ids_modes: Dict[str, NvidiaPowerMode], cmd = f'nvidia-settings {params}' log_str = {', '.join((f'{i}={ids_modes[i].value}' for i in ids_modes))} - self._log.info(f"Changing {self.get_vendor_name()} GPUs power mode ({log_str}): {cmd}") + self._log(f"changing GPUs power mode ({log_str}): {cmd}") _, output = await system.async_syscall(cmd, custom_env=self._map_env_vars(user_environment)) if output: @@ -142,11 +155,10 @@ async def set_power_mode(self, ids_modes: Dict[str, NvidiaPowerMode], try: return {id_: int(mode) == ids_modes[id_].value for id_, mode in changed_gpus if id_ in ids_modes} except ValueError: - self._log.error(f"[{self.__class__.__name__}] Error while parsing changing modes response: " - f"{output}") + self._log(f"error while parsing changing modes response: {output}", ERROR) err_msg = output.replace('\n', ' ') if output else '' - self._log.error(f"[{self.__class__.__name__}] Could not determine the changing modes response: {err_msg}") + self._log(f"could not determine the changing modes response: {err_msg}", ERROR) return {i: False for i in ids_modes} async def get_power_mode(self, gpu_ids: Set[str], user_environment: Optional[Dict[str, str]] = None) \ @@ -158,7 +170,7 @@ async def get_power_mode(self, gpu_ids: Set[str], user_environment: Optional[Dic if code == 0: if not output: - self._log.warning(f"Could not detect {self.get_vendor_name()} GPUs power mode ({cmd}). No output returned") + self._log(f"could not detect GPUs power mode ({cmd}). No output returned", WARNING) else: modes = self._get_re_get_power().findall(output) @@ -166,12 +178,12 @@ async def get_power_mode(self, gpu_ids: Set[str], user_environment: Optional[Dic try: return {id_: NvidiaPowerMode.from_value(int(mode)) for id_, mode in modes if id_ in gpu_ids} except ValueError: - self._log.error(f"[{self.__class__.__name__}] Error when parsing power modes: {modes}") + self._log(f"error when parsing power modes: {modes}", ERROR) - self._log.error("Could not detect {} GPUs power mode ({}). No modes found in output: {}".format(self.get_vendor_name(), cmd, output)) + self._log(f"could not detect GPUs power mode ({cmd}). No modes found in output: {output}", ERROR) else: output_str = '. Output: {}'.format(output.replace('\n', ' ')) if output else '' - self._log.error(f"Could not detect {self.get_vendor_name()} GPUs power mode ({cmd}){output_str}") + self._log(f"could not detect GPUs power mode ({cmd}){output_str}", ERROR) def can_work(self) -> Tuple[bool, Optional[str]]: if not shutil.which('nvidia-settings'): @@ -191,11 +203,11 @@ def get_performance_mode(self) -> NvidiaPowerMode: class AMDGPUDriver(GPUDriver): - PERFORMANCE_FILE = 'power_dpm_force_performance_level' - PROFILE_FILE = 'pp_power_profile_mode' - VENDOR = 'AMD' + PERFORMANCE_FILE = "power_dpm_force_performance_level" + PROFILE_FILE = "pp_power_profile_mode" + VENDOR = "AMD" - def __init__(self, cache: bool, logger: Logger, gpus_path: str = '/sys/class/drm/card{id}/device'): + def __init__(self, cache: bool, logger: Logger, gpus_path: str = "/sys/class/drm/card{id}/device"): super(AMDGPUDriver, self).__init__(cache, logger) self._gpus_path = gpus_path self._re_power_mode: Optional[Pattern] = None @@ -222,6 +234,12 @@ def re_extract_id(self) -> Pattern: return self._re_extract_id + def extract_gpu_id(self, gpu_path: str) -> Optional[int]: + try: + return self.re_extract_id.findall(gpu_path)[0] + except IndexError: + self._log(f"Could not extract GPU id from path: {gpu_path}", ERROR) + async def get_gpus(self) -> Optional[Set[str]]: required_files = {self.PERFORMANCE_FILE: set(), self.PROFILE_FILE: set()} @@ -229,8 +247,9 @@ async def get_gpus(self) -> Optional[Set[str]]: gpu_file = os.path.basename(gpu_file_path) if gpu_file in required_files: if not os.access(gpu_file_path, mode=os.W_OK): - self._log.warning(f"Writing is not allowed for {self.get_vendor_name()} GPU file {gpu_file_path}. " - f"It will not be possible to set this GPU to performance mode") + id_ = self.extract_gpu_id(gpu_file_path) + self._log(f"Writing is not allowed for file '{gpu_file_path}. It will not be possible to set " + f"the GPU ({id_}) to performance mode", WARNING) else: required_files[gpu_file].add(os.path.dirname(gpu_file_path)) @@ -239,19 +258,24 @@ async def get_gpus(self) -> Optional[Set[str]]: if all_gpu_dirs: gpus = set() for gpu_dir in all_gpu_dirs: - all_files_available = True - for gpu_file_dirs in required_files.values(): + missing_files = set() + for file, gpu_file_dirs in required_files.items(): if gpu_dir not in gpu_file_dirs: - all_files_available = False + missing_files.add(file) + + if missing_files: + self._log(f"not all required files are accessible for mounted GPU in '{gpu_dir}' " + f"(missing: {', '.join(sorted(missing_files))})", WARNING) + else: + self._log(f"all required files are accessible for GPU mounted in '{gpu_dir}'", DEBUG) + gpu_id = self.extract_gpu_id(gpu_dir) - if all_files_available: - try: - gpus.add(self.re_extract_id.findall(gpu_dir)[0]) - except IndexError: - self._log.error(f"[{self.__class__.__name__}] Could not extract AMD GPU id from directory: " - f"{gpu_dir}") + if gpu_id is not None: + gpus.add(gpu_id) return gpus if gpus else None + else: + self._log("no mounted GPU directories", DEBUG) async def _read_file(self, file_path: str) -> Optional[str]: try: @@ -259,7 +283,7 @@ async def _read_file(self, file_path: str) -> Optional[str]: return await f.read() except: err_stack = traceback.format_exc().replace('\n', ' ') - self._log.error(f"[{self.__class__.__name__}] Could not read file '{file_path}': {err_stack}") + self._log(f"Could not read file '{file_path}': {err_stack}", ERROR) def _map_power_profile_output(self, output: str, file_path: str) -> Optional[str]: if output is not None: @@ -271,21 +295,20 @@ def _map_power_profile_output(self, output: str, file_path: str) -> Optional[str return line[0].strip() content_log = output.replace('\n', ' ') - self._log.error(f"Could not map the {self.get_vendor_name()} power profile from {file_path}. " - f"Content: {content_log}") + self._log(f"could not map power profile from {file_path}. Content: {content_log}", WARNING) async def _fill_power_mode(self, gpu_id: str, gpu_modes: Dict[str, str]): gpu_dir = self._gpus_path.format(id=gpu_id) performance_level_file = f"{gpu_dir}/{self.PERFORMANCE_FILE}" performance_level = (await self._read_file(performance_level_file)).strip() - self._log.debug(f"{self.get_vendor_name()} GPU file ({performance_level_file}): {performance_level}") + self._log(f"GPU file ({performance_level_file}): {performance_level}", DEBUG) if not performance_level: return power_profile_file = f"{gpu_dir}/{self.PROFILE_FILE}" power_profile = self._map_power_profile_output(await self._read_file(power_profile_file), power_profile_file) - self._log.debug(f"{self.get_vendor_name()} GPU file ({power_profile_file}): {power_profile}") + self._log(f"GPU file ({power_profile_file}): {power_profile}", DEBUG) if not power_profile: return @@ -305,8 +328,8 @@ async def _write_to_file(self, file_path: str, content: str) -> bool: await f.write(content) return True except: - self._log.error(f"[{self.__class__.__name__}] Could not write '{content}' to file '{file_path}'") - traceback.print_exc() + err_stack = traceback.format_exc().replace('\n', ' ') + self._log(f"could not write '{content}' to file '{file_path}': {err_stack}", ERROR) return False async def _fill_write_result(self, file_path: str, content: str, id_: str, output: Dict[str, List[bool]]): @@ -322,15 +345,13 @@ async def set_power_mode(self, ids_modes: Dict[str, str], if len(mode) == 2: gpu_dir = self._gpus_path.format(id=id_) - self._log.info(f"Changing {self.get_vendor_name()} GPU ({id_}) operation mode " - f"(performance: {mode[0]}, profile: {mode[1]})") + self._log(f"changing GPU ({id_}) operation mode (performance: {mode[0]}, profile: {mode[1]})") writes[id_] = list() coros.append(self._fill_write_result(f'{gpu_dir}/{self.PERFORMANCE_FILE}', mode[0], id_, writes)) coros.append(self._fill_write_result(f'{gpu_dir}/{self.PROFILE_FILE}', mode[1], id_, writes)) else: - self._log.error(f"Could not change {self.get_vendor_name()} GPU operation mode: " - f"unexpected mode format '{mode_str}' " - f"(expected: 'performance_level:power_profile'") + self._log(f"could not change GPU ({id_}) operation mode: unexpected mode format '{mode_str}' " + f"(expected: 'performance_level:power_profile'", ERROR) await asyncio.gather(*coros) @@ -372,7 +393,7 @@ class GPUManager: LOG_CACHE_KEY__WORK = 0 LOG_CACHE_KEY__AVAILABLE = 1 - def __init__(self, logger: Logger, drivers: Optional[List[GPUDriver]] = None, cache_gpus: bool = False): + def __init__(self, logger: Logger, drivers: Optional[Tuple[GPUDriver]] = None, cache_gpus: bool = False): self._log = logger self._drivers = drivers self._drivers_lock = Lock() @@ -380,7 +401,7 @@ def __init__(self, logger: Logger, drivers: Optional[List[GPUDriver]] = None, ca self._gpu_state_cache: Dict[Type[GPUDriver], Dict[str, Any]] = {} self._gpu_state_cache_lock = Lock() self._log_cache: Dict[Type[GPUDriver], Dict[int, object]] = {} # to avoid repetitive logs - self._working_drivers_cache: Optional[List[GPUDriver]] = None # cached working drivers (only when 'cache_gpus') + self._working_drivers_cache: Optional[Tuple[GPUDriver]] = None # only when 'cache_gpus' self._working_drivers_cache_lock = Lock() def is_cache_enabled(self) -> bool: @@ -434,7 +455,7 @@ async def _map_driver_if_gpus(self, driver: GPUDriver) -> Optional[Tuple[GPUDriv return driver, gpus async def _map_drivers_and_gpus(self) -> AsyncIterator[Tuple[GPUDriver, Set[str]]]: - for task in asyncio.as_completed([self._map_driver_if_gpus(driver) for driver in self._drivers]): + for task in asyncio.as_completed(tuple(self._map_driver_if_gpus(driver) for driver in self._drivers)): driver_gpus = await task if driver_gpus: @@ -443,7 +464,8 @@ async def _map_drivers_and_gpus(self) -> AsyncIterator[Tuple[GPUDriver, Set[str] async def map_working_drivers_and_gpus(self) -> AsyncIterator[Tuple[GPUDriver, Set[str]]]: async with self._drivers_lock: if self._drivers is None: - self._drivers = [cls(self._cache_gpus, self._log) for cls in GPUDriver.__subclasses__() if cls != self.__class__] + driver_types = GPUDriver.__subclasses__() + self._drivers = tuple(cls(self._cache_gpus, self._log) for cls in driver_types if cls != self.__class__) if self._drivers: if self._cache_gpus: @@ -452,14 +474,18 @@ async def map_working_drivers_and_gpus(self) -> AsyncIterator[Tuple[GPUDriver, S for driver in self._working_drivers_cache: yield driver, await self._get_driver_gpus(driver) else: - self._working_drivers_cache = [] + working_drivers = [] async for driver, gpus in self._map_drivers_and_gpus(): yield driver, gpus - self._working_drivers_cache.append(driver) + working_drivers.append(driver) + + self._working_drivers_cache = tuple(working_drivers) else: async for driver, gpus in self._map_drivers_and_gpus(): yield driver, gpus + else: + self._log.error("No GPU driver instances available") async def activate_performance(self, user_environment: Optional[Dict[str, str]] = None) \ -> Optional[Dict[Type[GPUDriver], Set[GPUState]]]: @@ -468,7 +494,6 @@ async def activate_performance(self, user_environment: Optional[Dict[str, str]] async for driver, gpus in self.map_working_drivers_and_gpus(): async with driver.lock(): gpu_modes = await driver.get_power_mode(gpus, user_environment) - if gpu_modes: performance_mode = driver.get_performance_mode() async with self._gpu_state_cache_lock: @@ -501,12 +526,12 @@ async def activate_performance(self, user_environment: Optional[Dict[str, str]] return res - def get_drivers(self) -> Optional[List[GPUDriver]]: - return [*self._drivers] if self._drivers is not None else None + def get_drivers(self) -> Optional[Tuple[GPUDriver]]: + return tuple(self._drivers) if self._drivers is not None else None - def get_cached_working_drivers(self) -> Optional[List[GPUDriver]]: + def get_cached_working_drivers(self) -> Optional[Tuple[GPUDriver]]: if self._working_drivers_cache: - return [*self._working_drivers_cache] + return tuple(self._working_drivers_cache) def get_gpu_state_cache_view(self) -> Dict[Type[GPUDriver], Dict[str, Any]]: return deepcopy(self._gpu_state_cache) diff --git a/guapow/service/optimizer/main.py b/guapow/service/optimizer/main.py index 6d070be..5a232e1 100644 --- a/guapow/service/optimizer/main.py +++ b/guapow/service/optimizer/main.py @@ -74,7 +74,7 @@ async def prepare_app() -> Tuple[web.Application, OptimizerConfig]: if gpu_driver: logger.info(f'Pre-defined GPU vendor: {opt_config.gpu_vendor}') - gpu_drivers = [gpu_driver(cache=opt_config.gpu_cache, logger=logger)] + gpu_drivers = (gpu_driver(cache=opt_config.gpu_cache, logger=logger),) else: logger.warning(f'Invalid pre-defined GPU vendor: {opt_config.gpu_vendor}') diff --git a/tests/service/optimizer/gpu/test_gpu_manager.py b/tests/service/optimizer/gpu/test_gpu_manager.py index d3be227..c69322e 100644 --- a/tests/service/optimizer/gpu/test_gpu_manager.py +++ b/tests/service/optimizer/gpu/test_gpu_manager.py @@ -67,7 +67,7 @@ async def test_map_working_drivers_and_gpus__must_lock_concurrent_requests_when_ driver_1.can_work = Mock(return_value=(True, None)) driver_1.get_gpus = AsyncMock(return_value={'0'}) - man = GPUManager(Mock(), drivers=[driver_1], cache_gpus=True) + man = GPUManager(Mock(), drivers=(driver_1,), cache_gpus=True) self.assertIsNone(man.get_cached_working_drivers()) async def mock_map_working_drivers() -> List[Tuple[GPUDriver, Set[str]]]: @@ -79,7 +79,7 @@ async def mock_map_working_drivers() -> List[Tuple[GPUDriver, Set[str]]]: for res in tasks_res: self.assertEqual([(driver_1, {'0'})], res) - self.assertEqual([driver_1], man.get_cached_working_drivers()) + self.assertEqual((driver_1,), man.get_cached_working_drivers()) driver_1.can_work.assert_called_once() # only one call when cache is on (even for concurrent requests) driver_1.get_gpus.assert_called_once() # only one call when cache is on (even for concurrent requests) From 61f66a58d7f9dfd4faaebd38e218147ad2157069 Mon Sep 17 00:00:00 2001 From: Vinicius Moreira Date: Thu, 15 Sep 2022 08:15:45 -0300 Subject: [PATCH 07/12] [optimizer] improvement: property 'check.finished.interval' accepting floats and min allowed is 0.5 --- CHANGELOG.md | 4 +++- README.md | 2 +- guapow/common/config.py | 6 +++--- guapow/service/optimizer/watch.py | 9 ++++++--- tests/common/config/test_optimizer_config.py | 8 ++++---- tests/common/config/test_optimizer_config_reader.py | 2 +- tests/resources/opt_check_finished_interval.conf | 2 +- 7 files changed, 19 insertions(+), 14 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 643f194..c455da3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,9 +4,11 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). -## [1.2.2] +## [NEXT] ### Improvements - Minor code refactoring and log improvements regarding AMD GPU management +- Optimizer: + - configuration property `check.finished.interval` now accepts floats and the minimum value accepted is `0.5`. ## [1.2.1] 2022-08-22 ### Fixes diff --git a/README.md b/README.md index 595a896..527438a 100644 --- a/README.md +++ b/README.md @@ -466,7 +466,7 @@ makepkg -si port = 5087 (TCP port) compositor = (pre-defines the installed compositor. Options: kwin, compiz, marco, picom, compton, nvidia) scripts.allow_root = false (allow custom scripts/commands to run at the root level) - check.finished.interval = 3 (finished applications checking interval in seconds) + check.finished.interval = 3 (finished applications checking interval in seconds. Min accepted value: 0.5) launcher.mapping.timeout = 30 (max time in seconds to find the application mapped to a given launcher. float values are allowed) gpu.cache = false (if 'true': maps all available GPUs on startup. Otherwise, GPUs will be mapped for every request) gpu.vendor = # pre-defines your GPU vendor for faster GPUs mapping. Supported: nvidia, amd diff --git a/guapow/common/config.py b/guapow/common/config.py index 212cfd0..f1a5fca 100644 --- a/guapow/common/config.py +++ b/guapow/common/config.py @@ -73,7 +73,7 @@ class OptimizerConfig(RootFileModel): FILE_MAPPING = {'port': ('port', int, None), 'compositor': ('compositor', str, None), 'scripts.allow_root': ('allow_root_scripts', bool, True), - 'check.finished.interval': ('check_finished_interval', int, None), + 'check.finished.interval': ('check_finished_interval', float, None), 'launcher.mapping.timeout': ('launcher_mapping_timeout', float, None), 'gpu.cache': ('gpu_cache', bool, True), 'gpu.vendor': ('gpu_vendor', str, None), @@ -84,7 +84,7 @@ class OptimizerConfig(RootFileModel): def __init__(self, port: Optional[int] = None, compositor: Optional[str] = None, allow_root_scripts: Optional[bool] = False, - check_finished_interval: Optional[int] = None, launcher_mapping_timeout: Optional[float] = 30, + check_finished_interval: Optional[float] = None, launcher_mapping_timeout: Optional[float] = 30, gpu_cache: Optional[bool] = False, cpu_performance: Optional[bool] = None, profile_cache: Optional[bool] = None, pre_cache_profiles: Optional[bool] = None, gpu_vendor: Optional[str] = None, renicer_interval: Optional[float] = None): @@ -127,7 +127,7 @@ def has_valid_launcher_mapping_timeout(self) -> bool: return self.launcher_mapping_timeout is not None and self.launcher_mapping_timeout >= 0 def has_valid_check_finished_interval(self) -> bool: - return self.check_finished_interval is not None and self.check_finished_interval > 0 + return self.check_finished_interval is not None and self.check_finished_interval >= 0.5 def has_valid_renicer_interval(self) -> bool: return self.renicer_interval is not None and self.renicer_interval > 0 diff --git a/guapow/service/optimizer/watch.py b/guapow/service/optimizer/watch.py index f46c280..b96478d 100644 --- a/guapow/service/optimizer/watch.py +++ b/guapow/service/optimizer/watch.py @@ -12,7 +12,7 @@ class DeadProcessWatcher: - def __init__(self, context: OptimizationContext, restore_man: PostProcessTaskManager, check_interval: int, to_watch: Optional[List[OptimizedProcess]] = None, + def __init__(self, context: OptimizationContext, restore_man: PostProcessTaskManager, check_interval: float, to_watch: Optional[List[OptimizedProcess]] = None, to_relaunch: Optional[Dict[str, str]] = None): self._context = context self._check_interval = check_interval @@ -121,8 +121,11 @@ def get_to_relaunch_view(self) -> Optional[Dict[str, str]]: class DeadProcessWatcherManager: - def __init__(self, check_interval: int, restore_man: PostProcessTaskManager, context: OptimizationContext): - self._watcher = DeadProcessWatcher(context=context, check_interval=check_interval, restore_man=restore_man, to_relaunch={}) + def __init__(self, check_interval: float, restore_man: PostProcessTaskManager, context: OptimizationContext): + self._watcher = DeadProcessWatcher(context=context, + check_interval=check_interval, + restore_man=restore_man, + to_relaunch={}) async def watch(self, process: OptimizedProcess): if process: diff --git a/tests/common/config/test_optimizer_config.py b/tests/common/config/test_optimizer_config.py index 7da7f54..8dae774 100644 --- a/tests/common/config/test_optimizer_config.py +++ b/tests/common/config/test_optimizer_config.py @@ -58,12 +58,12 @@ def test_has_valid_port__false_when_port_is_higher_than_limit(self): config = OptimizerConfig(65536) self.assertFalse(config.has_valid_port()) - def test_has_valid_check_finished_interval__true_when_higher_than_zero(self): - config = OptimizerConfig(check_finished_interval=1) + def test_has_valid_check_finished_interval__true_when_higher_or_equal_point_five(self): + config = OptimizerConfig(check_finished_interval=0.5) self.assertTrue(config.has_valid_check_finished_interval()) - def test_has_valid_check_finished_interval__false_when_zero(self): - config = OptimizerConfig(check_finished_interval=0) + def test_has_valid_check_finished_interval__false_when_less_than_point_five(self): + config = OptimizerConfig(check_finished_interval=0.49) self.assertFalse(config.has_valid_check_finished_interval()) def test_has_valid_check_finished_interval__false_when_negative(self): diff --git a/tests/common/config/test_optimizer_config_reader.py b/tests/common/config/test_optimizer_config_reader.py index 689fe1c..5e8b298 100644 --- a/tests/common/config/test_optimizer_config_reader.py +++ b/tests/common/config/test_optimizer_config_reader.py @@ -78,7 +78,7 @@ async def test_read_valid__return_instance_with_valid_check_finished_interval_va file_path = f'{RESOURCES_DIR}/opt_check_finished_interval.conf' config = await self.reader.read_valid(file_path=file_path) self.assertIsNotNone(config) - self.assertEqual(1, config.check_finished_interval) + self.assertEqual(0.5, config.check_finished_interval) async def test_read_valid__return_instance_with_valid_check_finished_interval_value_for_invalid_definition(self): file_path = f'{RESOURCES_DIR}/opt_invalid_check_interval.conf' diff --git a/tests/resources/opt_check_finished_interval.conf b/tests/resources/opt_check_finished_interval.conf index 0351601..44cdaba 100644 --- a/tests/resources/opt_check_finished_interval.conf +++ b/tests/resources/opt_check_finished_interval.conf @@ -1 +1 @@ -check.finished.interval=1 \ No newline at end of file +check.finished.interval=0.5 \ No newline at end of file From ec16aa7766e584d6ce52c569c92b11bf7ec53031 Mon Sep 17 00:00:00 2001 From: Vinicius Moreira Date: Tue, 20 Sep 2022 07:31:16 -0300 Subject: [PATCH 08/12] [README.md] fix: spelling --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 527438a..cb09f5b 100644 --- a/README.md +++ b/README.md @@ -539,7 +539,7 @@ makepkg -si ``` - Logs: - - Logs are managed through the environemnt variables: + - Logs are managed through the environment variables: - `GUAPOW_WATCH_LOG`: enables/disables logs. Options: **1** (enables, default), **0** (disables). - `GUAPOW_WATCH_LOG_LEVEL`: controls the type of logging that should be printed. Options: `info`, `debug`, `error` and `warning` (`debug` is the most detailed type). Default: `info`. - If the **watcher** is running as a service, these variables can be changed on the definition file (`~.config/systemd/user/guapow-watch.service`) From 68ecdf1cd100eb68514b3f91b41775140b47d253 Mon Sep 17 00:00:00 2001 From: Vinicius Moreira Date: Thu, 22 Sep 2022 07:51:51 -0300 Subject: [PATCH 09/12] [service.optimizer] new property 'gpu.id' to limit which GPUs should be optimized --- CHANGELOG.md | 8 ++- README.md | 3 +- guapow/common/config.py | 5 +- guapow/common/model_util.py | 19 +++++++ guapow/dist/daemon/opt.conf | 1 + guapow/service/optimizer/gpu.py | 27 +++++++++- guapow/service/optimizer/main.py | 17 +++++-- guapow/service/optimizer/task/environment.py | 13 +++-- guapow/service/optimizer/task/model.py | 5 +- .../config/test_optimizer_config_reader.py | 15 ++++++ tests/common/test_class_util.py | 7 +-- tests/resources/opt_gpu_ids.conf | 1 + tests/resources/opt_gpu_ids_invalid.conf | 1 + .../service/optimizer/gpu/test_gpu_manager.py | 50 +++++++++++++++++++ .../optimization/test_environment.py | 24 ++++++++- 15 files changed, 174 insertions(+), 22 deletions(-) create mode 100644 tests/resources/opt_gpu_ids.conf create mode 100644 tests/resources/opt_gpu_ids_invalid.conf diff --git a/CHANGELOG.md b/CHANGELOG.md index c455da3..84504dc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,7 +8,13 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). ### Improvements - Minor code refactoring and log improvements regarding AMD GPU management - Optimizer: - - configuration property `check.finished.interval` now accepts floats and the minimum value accepted is `0.5`. + - configuration property `check.finished.interval` now accepts floats and the minimum value accepted is `0.5` + - new configuration property `gpu.id`: allows to define which GPU cards should be optimized (e.g: `gpus.id = 0,1`). If not defined, all available GPUs are considered (default). + +### Fixes +- optimizer: + - when running as a system service, sometimes the GPU mapped directories are not available during the system startup and affects the correct behavior of the property `gpu.cache` when it is enabled (`true`) + - so now the available GPUs will be cached after a first request when the `optimizer` is running as a system service (otherwise they will be cached normally during the service startup process) ## [1.2.1] 2022-08-22 ### Fixes diff --git a/README.md b/README.md index cb09f5b..c2b14f6 100644 --- a/README.md +++ b/README.md @@ -468,7 +468,8 @@ makepkg -si scripts.allow_root = false (allow custom scripts/commands to run at the root level) check.finished.interval = 3 (finished applications checking interval in seconds. Min accepted value: 0.5) launcher.mapping.timeout = 30 (max time in seconds to find the application mapped to a given launcher. float values are allowed) - gpu.cache = false (if 'true': maps all available GPUs on startup. Otherwise, GPUs will be mapped for every request) + gpu.cache = false (if 'true': maps all available GPUs once after the first request (if running as a system service) or during startup (if not running as system service). Otherwise, GPUs will be mapped for every request) + gpu.id = # comma separated list of integers representing which GPU cards should be optimized (e.g: 0, 1). If not defined, all available GPUs are considered (default) gpu.vendor = # pre-defines your GPU vendor for faster GPUs mapping. Supported: nvidia, amd cpu.performance = false (set cpu governors and energy policy levels to full performance on startup) request.allowed_users = (restricts users that can request optimizations, separated by comma. e.g: root,xpto) diff --git a/guapow/common/config.py b/guapow/common/config.py index f1a5fca..7b9ecc8 100644 --- a/guapow/common/config.py +++ b/guapow/common/config.py @@ -76,6 +76,7 @@ class OptimizerConfig(RootFileModel): 'check.finished.interval': ('check_finished_interval', float, None), 'launcher.mapping.timeout': ('launcher_mapping_timeout', float, None), 'gpu.cache': ('gpu_cache', bool, True), + 'gpu.id': ('gpu_ids', Set[int], None), 'gpu.vendor': ('gpu_vendor', str, None), 'cpu.performance': ('cpu_performance', bool, True), 'profile.cache': ('profile_cache', bool, True), @@ -87,7 +88,8 @@ def __init__(self, port: Optional[int] = None, compositor: Optional[str] = None, check_finished_interval: Optional[float] = None, launcher_mapping_timeout: Optional[float] = 30, gpu_cache: Optional[bool] = False, cpu_performance: Optional[bool] = None, profile_cache: Optional[bool] = None, pre_cache_profiles: Optional[bool] = None, - gpu_vendor: Optional[str] = None, renicer_interval: Optional[float] = None): + gpu_vendor: Optional[str] = None, renicer_interval: Optional[float] = None, + gpu_ids: Optional[Set[int]] = None): self.port = port self.compositor = compositor self.allow_root_scripts = allow_root_scripts @@ -95,6 +97,7 @@ def __init__(self, port: Optional[int] = None, compositor: Optional[str] = None, self.launcher_mapping_timeout = launcher_mapping_timeout self.gpu_cache = gpu_cache self.gpu_vendor = gpu_vendor + self.gpu_ids = gpu_ids self.cpu_performance = cpu_performance self.request = RequestSettings.default() self.profile_cache = profile_cache diff --git a/guapow/common/model_util.py b/guapow/common/model_util.py index 6c253a9..703ab6f 100644 --- a/guapow/common/model_util.py +++ b/guapow/common/model_util.py @@ -136,6 +136,25 @@ def get_raw_type(self) -> type: return list +class IntSetPropertyMapper(FileModelCollectionPropertyMapper): + + def supports(self, prop_type: type) -> bool: + return prop_type == Set[int] + + def map(self, prop_val: str, prop_type: type) -> Optional[Set[int]]: + return {int(n) for n in prop_val.split(',') if n.isdigit()} + + def create_collection(self) -> object: + return set() + + def update_collection(self, collection: object, update: object): + if isinstance(collection, set) and isinstance(update, set): + collection.update(update) + + def get_raw_type(self) -> type: + return set + + class StringListPropertyMapper(FileModelCollectionPropertyMapper): def supports(self, prop_type: type) -> bool: diff --git a/guapow/dist/daemon/opt.conf b/guapow/dist/daemon/opt.conf index d46724b..7eca072 100644 --- a/guapow/dist/daemon/opt.conf +++ b/guapow/dist/daemon/opt.conf @@ -7,6 +7,7 @@ # check.finished.interval = 3 # finished applications checking interval in seconds # launcher.mapping.timeout = 30 # max time in seconds to find the application mapped to a given launcher. float values are allowed # gpu.cache = false # if 'true': maps all available GPUs on startup. Otherwise, GPUs will be mapped for every request +# gpu.id = # comma separated list of integers representing which GPU cards should be optimized (e.g: 0, 1). If not defined, all available GPUs are considered (default) # gpu.vendor = # pre-defines your GPU vendor for faster GPUs mapping. Supported: nvidia, amd # cpu.performance = false # set cpu governors and energy policy levels to full performance on startup # request.allowed_users = # restricts users that can request optimizations, separated by comma. e.g: root,xpto) diff --git a/guapow/service/optimizer/gpu.py b/guapow/service/optimizer/gpu.py index 8abbb85..a3721f1 100644 --- a/guapow/service/optimizer/gpu.py +++ b/guapow/service/optimizer/gpu.py @@ -487,13 +487,36 @@ async def map_working_drivers_and_gpus(self) -> AsyncIterator[Tuple[GPUDriver, S else: self._log.error("No GPU driver instances available") - async def activate_performance(self, user_environment: Optional[Dict[str, str]] = None) \ + async def activate_performance(self, user_environment: Optional[Dict[str, str]] = None, + target_gpu_ids: Optional[Set[str]] = None) \ -> Optional[Dict[Type[GPUDriver], Set[GPUState]]]: + """ + + Args: + user_environment: user environment variables + target_gpu_ids: the target GPU ids to enter in performance mode. If None, all available GPUs will be considered. + + Returns: the GPUs previous states + + """ res = {} async for driver, gpus in self.map_working_drivers_and_gpus(): + if not gpus: + continue + + target_gpus = gpus.intersection(target_gpu_ids) if target_gpu_ids else gpus + + if not target_gpus: + self._log.debug(f"[{driver.get_vendor_name()} GPU] No valid target GPUs available " + f"to enter in performance mode (valid: {', '.join(sorted(gpus))})") + continue + async with driver.lock(): - gpu_modes = await driver.get_power_mode(gpus, user_environment) + if target_gpu_ids and gpus != target_gpu_ids: + self._log.debug(f"Target GPU ids to enter in performance mode: {', '.join(sorted(target_gpus))}") + + gpu_modes = await driver.get_power_mode(target_gpus, user_environment) if gpu_modes: performance_mode = driver.get_performance_mode() async with self._gpu_state_cache_lock: diff --git a/guapow/service/optimizer/main.py b/guapow/service/optimizer/main.py index 5a232e1..7ceb8e2 100644 --- a/guapow/service/optimizer/main.py +++ b/guapow/service/optimizer/main.py @@ -39,8 +39,12 @@ async def prepare_app() -> Tuple[web.Application, OptimizerConfig]: getLogger('aiohttp.server').disabled = True is_service = OptimizerConfig.is_service() + logger = new_logger(name=f'{__app_name__}-opt', service=is_service, enabled=OptimizerConfig.is_log_enabled(), write_to_file=False, level=OptimizerConfig.get_log_level()) + if is_service: + logger.debug("Initializing as a system service") + user_id, user_name = os.getuid(), getpass.getuser() logger.debug(f"Initializing as user '{user_name}' (pid={os.getpid()})") @@ -51,9 +55,6 @@ async def prepare_app() -> Tuple[web.Application, OptimizerConfig]: logger.info(f'Finished process checking interval: {opt_config.check_finished_interval} seconds') logger.info(f'Launcher mapping timeout: {opt_config.launcher_mapping_timeout} seconds') - if not opt_config.gpu_cache: - logger.warning("Available GPUs cache is disabled. Available GPUs will be mapped for every request") - if opt_config.allow_root_scripts: logger.warning("Scripts are allowed to run at root level") @@ -78,6 +79,12 @@ async def prepare_app() -> Tuple[web.Application, OptimizerConfig]: else: logger.warning(f'Invalid pre-defined GPU vendor: {opt_config.gpu_vendor}') + if opt_config.gpu_ids: + logger.info(f"Target GPU ids to be optimized: {', '.join(str(i) for i in sorted(opt_config.gpu_ids))}") + + if not opt_config.gpu_cache: + logger.warning("Available GPUs cache is disabled. Available GPUs will be mapped for every request") + gpu_man = GPUManager(cache_gpus=opt_config.gpu_cache, logger=logger, drivers=gpu_drivers) cpu_count = get_cpu_count() @@ -87,7 +94,9 @@ async def prepare_app() -> Tuple[web.Application, OptimizerConfig]: compositor=compositor, allow_root_scripts=bool(opt_config.allow_root_scripts), launcher_mapping_timeout=opt_config.launcher_mapping_timeout, mouse_man=MouseCursorManager(logger), renicer_interval=opt_config.renicer_interval, - cpuenergy_man=cpu_energy_man, queue=OptimizationQueue.empty()) + cpuenergy_man=cpu_energy_man, queue=OptimizationQueue.empty(), + system_service=opt_config.is_service(), + gpu_ids={str(i) for i in opt_config.gpu_ids} if opt_config.gpu_ids else None) watcher_man = DeadProcessWatcherManager(context=context, restore_man=PostProcessTaskManager(context), check_interval=opt_config.check_finished_interval) diff --git a/guapow/service/optimizer/task/environment.py b/guapow/service/optimizer/task/environment.py index 1ac8acf..2c65220 100644 --- a/guapow/service/optimizer/task/environment.py +++ b/guapow/service/optimizer/task/environment.py @@ -73,17 +73,14 @@ class ChangeGPUModeToPerformance(EnvironmentTask): def __init__(self, context: OptimizationContext): super(ChangeGPUModeToPerformance, self).__init__(context=context) + self._context = context self._log = context.logger self._gpu_man = context.gpu_man - def check_gpus_for_every_request(self): - return not self._gpu_man.is_cache_enabled() - async def is_available(self) -> Tuple[bool, Optional[str]]: - if self.check_gpus_for_every_request(): + if self._context.system_service or not self._gpu_man.is_cache_enabled(): return True, None - - if await self._list_drivers_with_gpus(): + elif await self._list_drivers_with_gpus(): return True, None else: return False, "No manageable GPUs found" @@ -96,7 +93,9 @@ async def should_run(self, process: OptimizedProcess) -> bool: return bool(process.profile.gpu and process.profile.gpu.performance) async def run(self, process: OptimizedProcess): - previous_gpu_states = await self._gpu_man.activate_performance(user_environment=process.request.user_env) + target_gpu_ids = self._context.gpu_ids if self._context.gpu_ids else None + previous_gpu_states = await self._gpu_man.activate_performance(user_environment=process.request.user_env, + target_gpu_ids=target_gpu_ids) if previous_gpu_states: process.previous_gpus_states = previous_gpu_states diff --git a/guapow/service/optimizer/task/model.py b/guapow/service/optimizer/task/model.py index 56b9c90..25b9fed 100644 --- a/guapow/service/optimizer/task/model.py +++ b/guapow/service/optimizer/task/model.py @@ -19,7 +19,8 @@ def __init__(self, gpu_man: Optional[GPUManager], logger: Optional[Logger], mouse_man: Optional[MouseCursorManager], queue: Optional[OptimizationQueue], cpu_count: int, launcher_mapping_timeout: float, renicer_interval: float, compositor: Optional[WindowCompositor] = None, allow_root_scripts: bool = False, - compositor_disabled_context: Optional[dict] = None): + compositor_disabled_context: Optional[dict] = None, + system_service: bool = False, gpu_ids: Optional[Set[str]] = None): self.queue = queue self.gpu_man = gpu_man @@ -33,6 +34,8 @@ def __init__(self, gpu_man: Optional[GPUManager], logger: Optional[Logger], self.launcher_mapping_timeout = launcher_mapping_timeout self.compositor_disabled_context = compositor_disabled_context # if the compositor was disabled by the Optimizer self.renicer_interval = renicer_interval + self.system_service = system_service + self.gpu_ids = gpu_ids @classmethod def empty(cls) -> "OptimizationContext": diff --git a/tests/common/config/test_optimizer_config_reader.py b/tests/common/config/test_optimizer_config_reader.py index 5e8b298..98d31a3 100644 --- a/tests/common/config/test_optimizer_config_reader.py +++ b/tests/common/config/test_optimizer_config_reader.py @@ -29,6 +29,7 @@ async def test_read_valid__return_valid_instance_when_no_existent_property_is_de self.assertIsNotNone(config.request) self.assertTrue(config.request.encrypted) self.assertIsNone(config.request.allowed_users) + self.assertIsNone(config.gpu_ids) async def test_read_valid__return_valid_instance_when_file_does_not_exist(self): file_path = f'{RESOURCES_DIR}/123opt.conf' @@ -156,3 +157,17 @@ async def test_read_valid__return_instance_with_valid_renicer_interval_for_inval self.assertIsNotNone(config) self.assertTrue(config.is_valid()) self.assertEqual(5, config.renicer_interval) # default value is 5 + + async def test_read_valid__return_instance_with_valid_gpu_targets(self): + file_path = f'{RESOURCES_DIR}/opt_gpu_ids.conf' + config = await self.reader.read_valid(file_path=file_path) + self.assertIsNotNone(config) + self.assertTrue(config.is_valid()) + self.assertEqual({1, 2, 3}, config.gpu_ids) + + async def test_read_valid__return_instance_with_no_gpu_target_when_invalid_definition(self): + file_path = f'{RESOURCES_DIR}/opt_gpu_ids_invalid.conf' + config = await self.reader.read_valid(file_path=file_path) + self.assertIsNotNone(config) + self.assertTrue(config.is_valid()) + self.assertIsNone(config.gpu_ids) diff --git a/tests/common/test_class_util.py b/tests/common/test_class_util.py index 34d5cae..48ce353 100644 --- a/tests/common/test_class_util.py +++ b/tests/common/test_class_util.py @@ -3,7 +3,7 @@ from guapow.common import class_util from guapow.common.model_util import FileModelPropertyMapper, StringPropertyMapper, IntPropertyMapper, \ FloatPropertyMapper, BoolPropertyMapper, StringListPropertyMapper, IntListPropertyMapper, CustomEnumPropertyMapper, \ - DictPropertyMapper, StringSetPropertyMapper + DictPropertyMapper, StringSetPropertyMapper, IntSetPropertyMapper class InstantiateSubclassesTest(TestCase): @@ -12,9 +12,10 @@ def test__must_instantiate_all_FileModelPropertyMapper_subclasses(self): subs = class_util.instantiate_subclasses(FileModelPropertyMapper) self.assertIsNotNone(subs) - self.assertEqual(9, len(subs)) + self.assertEqual(10, len(subs)) for mtype in {StringPropertyMapper, IntPropertyMapper, FloatPropertyMapper, BoolPropertyMapper, - StringListPropertyMapper, IntListPropertyMapper, StringSetPropertyMapper, CustomEnumPropertyMapper, DictPropertyMapper}: + StringListPropertyMapper, IntListPropertyMapper, StringSetPropertyMapper, + CustomEnumPropertyMapper, DictPropertyMapper, IntSetPropertyMapper}: instances = [s for s in subs if isinstance(s, mtype)] self.assertEqual(1, len(instances), f"Unexpected number of instances ({len(instances)}) found for {mtype.__name__}") diff --git a/tests/resources/opt_gpu_ids.conf b/tests/resources/opt_gpu_ids.conf new file mode 100644 index 0000000..bcc4925 --- /dev/null +++ b/tests/resources/opt_gpu_ids.conf @@ -0,0 +1 @@ +gpu.id = 1,2,3 diff --git a/tests/resources/opt_gpu_ids_invalid.conf b/tests/resources/opt_gpu_ids_invalid.conf new file mode 100644 index 0000000..9c78fa4 --- /dev/null +++ b/tests/resources/opt_gpu_ids_invalid.conf @@ -0,0 +1 @@ +gpu.id = a,b,c diff --git a/tests/service/optimizer/gpu/test_gpu_manager.py b/tests/service/optimizer/gpu/test_gpu_manager.py index c69322e..bed5337 100644 --- a/tests/service/optimizer/gpu/test_gpu_manager.py +++ b/tests/service/optimizer/gpu/test_gpu_manager.py @@ -133,6 +133,56 @@ async def test_activate_performance__set_all_drivers_gpus_to_performance_when_no self.assertEqual(expected_state_cache, gpu_man.get_gpu_state_cache_view()) + async def test_activate_performance__set_only_target_gpus_when_not_in_performance_first_exec(self): + driver_1 = Mock() + driver_1.__class__ = GPUDriver + driver_1.lock.return_value = Lock() + driver_1.can_work.return_value = True, None + driver_1.get_performance_mode.return_value = NvidiaPowerMode.PERFORMANCE + driver_1.get_cached_gpus = AsyncMock(return_value={'0', '1'}) + driver_1.get_power_mode = AsyncMock(return_value={'1': NvidiaPowerMode.ON_DEMAND}) + driver_1.set_power_mode = AsyncMock(return_value={'1': True}) + + driver_2 = Mock() + driver_2.__class__ = AMDGPUDriver + driver_2.lock.return_value = Lock() + driver_2.can_work.return_value = True, None + driver_2_perf_mode = 'manual:5' + driver_2_def_mode = 'auto:3' + driver_2.get_performance_mode.return_value = driver_2_perf_mode + driver_2.get_cached_gpus = AsyncMock(return_value={'1', '3'}) + driver_2.get_power_mode = AsyncMock(return_value={'1': driver_2_def_mode, '3': driver_2_def_mode}) + driver_2.set_power_mode = AsyncMock(return_value={'1': True, '3': True}) + + gpu_man = GPUManager(logger=Mock(), drivers=[driver_1, driver_2]) + actual_changes = await gpu_man.activate_performance(target_gpu_ids={'1', '3'}) + + driver_1.lock.assert_called_once() + driver_1.get_performance_mode.assert_called() + driver_1.get_cached_gpus.assert_called_once() + driver_1.can_work.assert_called_once() + driver_1.get_power_mode.assert_called_once_with({'1'}, None) + driver_1.set_power_mode.assert_called_once_with({'1': NvidiaPowerMode.PERFORMANCE}, None) + + driver_2.lock.assert_called_once() + driver_2.get_performance_mode.assert_called() + driver_2.get_cached_gpus.assert_called_once() + driver_2.can_work.assert_called_once() + driver_2.get_power_mode.assert_called_once_with({'1', '3'}, None) + driver_2.set_power_mode.assert_called_once_with({'1': driver_2_perf_mode, '3': driver_2_perf_mode}, None) + + self.assertIsNotNone(actual_changes) + + expected_changes = {GPUDriver: {GPUState('1', GPUDriver, NvidiaPowerMode.ON_DEMAND)}, + AMDGPUDriver: {GPUState(str(i), AMDGPUDriver, driver_2_def_mode) for i in [1, 3]}} + + self.assertEqual(expected_changes, actual_changes) + + expected_state_cache = {GPUDriver: {'1': NvidiaPowerMode.ON_DEMAND}, + AMDGPUDriver: {str(i): driver_2_def_mode for i in [1, 3]}} + + self.assertEqual(expected_state_cache, gpu_man.get_gpu_state_cache_view()) + async def test_activate_performance__should_only_activate_gpu_performance_for_concurrent_calls(self): driver_lock = Lock() diff --git a/tests/service/optimizer/optimization/test_environment.py b/tests/service/optimizer/optimization/test_environment.py index 231bc5f..1d86a33 100644 --- a/tests/service/optimizer/optimization/test_environment.py +++ b/tests/service/optimizer/optimization/test_environment.py @@ -222,7 +222,8 @@ async def test_run__should_not_change_governors_when_performance_is_already_set_ class ChangeGPUModeToPerformanceTest(IsolatedAsyncioTestCase): def setUp(self): - self.gpu_man = Mock() + self.gpu_man = MagicMock() + self.gpu_man.activate_performance = AsyncMock() self.context = OptimizationContext.empty() self.context.gpu_man = self.gpu_man @@ -241,12 +242,19 @@ async def test_is_available__true_when_gpu_manager_returns_working_drivers_with_ self.gpu_man.map_working_drivers_and_gpus.assert_called_once() - async def test_is_available__true_when_gpu_cache_is_off(self): + async def test_is_available__true_when_gpu_cache_is_off_and_not_system_service(self): + self.context.system_service = False self.gpu_man.is_cache_enabled.return_value = False self.assertTrue(await self.task.is_available()) # first call self.assertTrue(await self.task.is_available()) # second call (ensuring 'is_cache_enabled' is always called) self.assertEqual(2, self.gpu_man.is_cache_enabled.call_count) + async def test_is_available__true_when_gpu_cache_is_off_and_system_service(self): + self.context.system_service = True + self.gpu_man.is_cache_enabled.return_value = False + self.assertTrue(await self.task.is_available()) + self.assertEqual(0, self.gpu_man.is_cache_enabled.call_count) # no call count, since system service is on + async def test_should_run__true_when_gpu_profile_performance_is_true(self): self.profile.gpu.performance = True self.assertTrue(await self.task.should_run(self.process)) @@ -259,6 +267,18 @@ async def test_should_run__false_when_gpu_profile_performance_is_false(self): self.profile.gpu.performance = False self.assertFalse(await self.task.should_run(self.process)) + async def test_run__it_should_activate_performance_only_for_target_gpus_in_context(self): + self.context.gpu_ids = {'0', '1'} + await self.task.run(self.process) + self.gpu_man.activate_performance.assert_awaited_once_with(user_environment=self.process.request.user_env, + target_gpu_ids={'0', '1'}) + + async def test_run__it_should_activate_performance_for_all_available_gpus(self): + self.context.gpu_ids = None + await self.task.run(self.process) + self.gpu_man.activate_performance.assert_awaited_once_with(user_environment=self.process.request.user_env, + target_gpu_ids=None) + class DisableWindowCompositorTest(IsolatedAsyncioTestCase): From f744bcb2247938767e7eed241a60c193fd92be6e Mon Sep 17 00:00:00 2001 From: Vinicius Moreira Date: Thu, 22 Sep 2022 08:10:09 -0300 Subject: [PATCH 10/12] [service.optimizer] refactoring logs and adding more test cases --- guapow/service/optimizer/gpu.py | 9 ++-- .../service/optimizer/gpu/test_gpu_manager.py | 46 +++++++++++++++++++ 2 files changed, 51 insertions(+), 4 deletions(-) diff --git a/guapow/service/optimizer/gpu.py b/guapow/service/optimizer/gpu.py index a3721f1..2dc0966 100644 --- a/guapow/service/optimizer/gpu.py +++ b/guapow/service/optimizer/gpu.py @@ -508,13 +508,14 @@ async def activate_performance(self, user_environment: Optional[Dict[str, str]] target_gpus = gpus.intersection(target_gpu_ids) if target_gpu_ids else gpus if not target_gpus: - self._log.debug(f"[{driver.get_vendor_name()} GPU] No valid target GPUs available " - f"to enter in performance mode (valid: {', '.join(sorted(gpus))})") + self._log.debug(f"[{driver.get_vendor_name()}] No valid target GPUs available " + f"for performance mode (valid: {', '.join(sorted(gpus))})") continue async with driver.lock(): if target_gpu_ids and gpus != target_gpu_ids: - self._log.debug(f"Target GPU ids to enter in performance mode: {', '.join(sorted(target_gpus))}") + self._log.debug(f"[{driver.get_vendor_name()}] Target GPU ids for performance mode: " + f"{', '.join(sorted(target_gpus))}") gpu_modes = await driver.get_power_mode(target_gpus, user_environment) if gpu_modes: @@ -542,7 +543,7 @@ async def activate_performance(self, user_environment: Optional[Dict[str, str]] not_changed = {gpu for gpu, changed in gpus_changed.items() if not changed} if not_changed: - self._log.error(f"Could not change power mode of {driver.get_vendor_name()} GPUs: " + self._log.error(f"[{driver.get_vendor_name()}] could not change power mode of GPUs: " f"{', '.join(sorted(not_changed))}") res[driver.__class__] = driver_res diff --git a/tests/service/optimizer/gpu/test_gpu_manager.py b/tests/service/optimizer/gpu/test_gpu_manager.py index bed5337..16e4466 100644 --- a/tests/service/optimizer/gpu/test_gpu_manager.py +++ b/tests/service/optimizer/gpu/test_gpu_manager.py @@ -183,6 +183,52 @@ async def test_activate_performance__set_only_target_gpus_when_not_in_performanc self.assertEqual(expected_state_cache, gpu_man.get_gpu_state_cache_view()) + async def test_activate_performance__should_not_try_to_activate_performance_when_target_gpus_dont_match(self): + driver_1 = Mock() + driver_1.__class__ = GPUDriver + driver_1.lock.return_value = Lock() + driver_1.can_work.return_value = True, None + driver_1.get_performance_mode.return_value = NvidiaPowerMode.PERFORMANCE + driver_1.get_cached_gpus = AsyncMock(return_value={'0', '1'}) + driver_1.get_power_mode = AsyncMock() + driver_1.set_power_mode = AsyncMock() + + gpu_man = GPUManager(logger=Mock(), drivers=[driver_1]) + actual_changes = await gpu_man.activate_performance(target_gpu_ids={'3'}) + + driver_1.can_work.assert_called_once() + driver_1.get_cached_gpus.assert_called_once() + driver_1.lock.assert_not_called() + driver_1.get_performance_mode.assert_not_called() + driver_1.get_power_mode.assert_not_called() + driver_1.set_power_mode.assert_not_called() + + self.assertFalse(actual_changes) + self.assertFalse(gpu_man.get_gpu_state_cache_view()) + + async def test_activate_performance__should_try_to_activate_performance_when_no_gpus_available(self): + driver_1 = Mock() + driver_1.__class__ = GPUDriver + driver_1.lock.return_value = Lock() + driver_1.can_work.return_value = True, None + driver_1.get_performance_mode.return_value = NvidiaPowerMode.PERFORMANCE + driver_1.get_cached_gpus = AsyncMock(return_value=set()) + driver_1.get_power_mode = AsyncMock() + driver_1.set_power_mode = AsyncMock() + + gpu_man = GPUManager(logger=Mock(), drivers=[driver_1]) + actual_changes = await gpu_man.activate_performance() + + driver_1.can_work.assert_called_once() + driver_1.get_cached_gpus.assert_called_once() + driver_1.lock.assert_not_called() + driver_1.get_performance_mode.assert_not_called() + driver_1.get_power_mode.assert_not_called() + driver_1.set_power_mode.assert_not_called() + + self.assertFalse(actual_changes) + self.assertFalse(gpu_man.get_gpu_state_cache_view()) + async def test_activate_performance__should_only_activate_gpu_performance_for_concurrent_calls(self): driver_lock = Lock() From 16dd260641de980b98576a6723849a25c28d632d Mon Sep 17 00:00:00 2001 From: Vinicius Moreira Date: Thu, 22 Sep 2022 08:19:39 -0300 Subject: [PATCH 11/12] [service.optimizer] refactoring logs --- guapow/service/optimizer/gpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/guapow/service/optimizer/gpu.py b/guapow/service/optimizer/gpu.py index 2dc0966..a489e06 100644 --- a/guapow/service/optimizer/gpu.py +++ b/guapow/service/optimizer/gpu.py @@ -442,7 +442,7 @@ async def _get_driver_gpus(self, driver: GPUDriver) -> Optional[Set[str]]: if gpus != cached_gpus: gpu_ids = f" (ids={', '.join((str(i) for i in sorted(gpus)))})" if gpus else '' - self._log.debug(f'{driver.get_vendor_name()} GPUs available: {len(gpus)}{gpu_ids}') + self._log.debug(f'[{driver.get_vendor_name()}] GPUs available: {len(gpus)}{gpu_ids}') driver_cache[self.LOG_CACHE_KEY__AVAILABLE] = gpus return gpus From c9113962cbefce85b1e3ef0197304da8bb110739 Mon Sep 17 00:00:00 2001 From: Vinicius Moreira Date: Thu, 22 Sep 2022 08:20:42 -0300 Subject: [PATCH 12/12] [CHANGELOG.md] Updating release date --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 84504dc..9ded1e2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,7 +4,7 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). -## [NEXT] +## [1.2.2] 2022-09-22 ### Improvements - Minor code refactoring and log improvements regarding AMD GPU management - Optimizer: