Skip to content

Commit

Permalink
[optimizer] fix: AMD GPU performance mode not working
Browse files Browse the repository at this point in the history
  • Loading branch information
vinifmor committed Jun 8, 2022
1 parent 603e71f commit d4604a2
Show file tree
Hide file tree
Showing 21 changed files with 278 additions and 227 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,13 @@ All notable changes to this project will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

## [1.1.1] 2022-06-08

### Fixes
- AMD GPU performance mode not working [#1](https://github.com/vinifmor/guapow/issues/1)
- (test context -> GPU: Ryzen 7 5700G, Kernel: 5.15.45, OS: Arch Linux)


## [1.1.0] 2022-06-03

### Features
Expand Down
2 changes: 1 addition & 1 deletion guapow/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@

ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
__app_name__ = 'guapow'
__version__ = '1.1.0'
__version__ = '1.1.1'
223 changes: 137 additions & 86 deletions guapow/service/optimizer/gpu.py

Large diffs are not rendered by default.

17 changes: 11 additions & 6 deletions guapow/service/optimizer/post_process/task.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from guapow.common import system
from guapow.common.scripts import RunScripts
from guapow.common.users import is_root_user
from guapow.service.optimizer.gpu import GPUPowerMode, GPUState, GPUDriver
from guapow.service.optimizer.gpu import GPUState, GPUDriver
from guapow.service.optimizer.post_process.context import PostProcessContext
from guapow.service.optimizer.task.model import OptimizationContext, CPUState

Expand Down Expand Up @@ -52,19 +52,22 @@ async def _restore(self, driver: GPUDriver, states: List[GPUState], user_env: Op
gpus_to_restore = {}

for id_, modes in gpus_states.items():
mode = [*modes][0] if len(modes) == 1 else GPUPowerMode.AUTO # if there is more than one mode mapped to same GPU, AUTO is preferred
# if there is more than one mode mapped to the same GPU, the driver's default mode is preferred
mode = [*modes][0] if len(modes) == 1 else driver.get_default_mode()
current_mode = gpus_current_modes.get(id_)

if mode:
if mode != current_mode:
gpus_to_restore[id_] = mode
else:
self._log.info(f"It is not necessary to restore {driver.get_vendor_name()} GPU ({id_}) to '{mode.name.lower()}' mode")
self._log.info(f"It is not necessary to restore {driver.get_vendor_name()} GPU ({id_}) to "
f"'{mode.name.lower()}' mode")
else:
self._log.error(f"Current mode unknown for {driver.get_vendor_name()} GPU '{id_}'")

if gpus_to_restore:
self._log.debug(f"Restoring power mode of {driver.get_vendor_name()} GPUS: {', '.join(gpus_to_restore)}")
self._log.debug(f"Restoring power mode of {driver.get_vendor_name()} GPUS: "
f"{', '.join(gpus_to_restore)}")

gpus_changed = await driver.set_power_mode(gpus_to_restore, user_env)

Expand All @@ -73,10 +76,12 @@ async def _restore(self, driver: GPUDriver, states: List[GPUState], user_env: Op
not_restored = {gpu for gpu, changed in gpus_changed.items() if not changed}

if not_restored:
self._log.error(f"Could not restore power mode of {driver.get_vendor_name()} GPUS: {', '.join(gpus_changed)}")
self._log.error(f"Could not restore power mode of {driver.get_vendor_name()} GPUS: "
f"{', '.join(gpus_changed)}")

else:
self._log.error(f"Could not restore power mode of {driver.get_vendor_name()} GPUs: {', '.join(gpus_to_restore.keys())}")
self._log.error(f"Could not restore power mode of {driver.get_vendor_name()} GPUs: "
f"{', '.join(gpus_to_restore.keys())}")

async def run(self, context: PostProcessContext):
restore_tasks = []
Expand Down
Empty file.
Empty file.
1 change: 0 additions & 1 deletion tests/resources/amd/gpu/2/pp_compute_power_profile

This file was deleted.

This file was deleted.

1 change: 0 additions & 1 deletion tests/resources/amd/gpu/3/pp_compute_power_profile

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
manual
4 changes: 4 additions & 0 deletions tests/resources/amd/gpu/card1/device/pp_power_profile_mode
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
0 BOOTUP
3 VIDEO*
5 COMPUTE
6 CUSTOM
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
manual
4 changes: 4 additions & 0 deletions tests/resources/amd/gpu/card2/device/pp_power_profile_mode
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
0 BOOTUP
3 VIDEO
5 COMPUTE*
6 CUSTOM
4 changes: 4 additions & 0 deletions tests/resources/amd/gpu/card3/device/pp_power_profile_mode
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
0 BOOTUP
3 VIDEO
5 COMPUTE*
6 CUSTOM
126 changes: 41 additions & 85 deletions tests/service/optimizer/gpu/test_amd_gpu_driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,131 +3,87 @@
import sys
import traceback
from unittest import IsolatedAsyncioTestCase
from unittest.mock import Mock, AsyncMock
from unittest.mock import Mock

from guapow.service.optimizer.gpu import AMDGPUDriver, GPUPowerMode
from guapow.service.optimizer.gpu import AMDGPUDriver
from tests import RESOURCES_DIR

TEST_GPU_FOLDER = f'{RESOURCES_DIR}/amd/gpu'
TEMP_GPU_FOLDER = f'{TEST_GPU_FOLDER}/4'
TEST_GPU_FOLDER = RESOURCES_DIR + '/amd/gpu/card{id}/device'
TEMP_GPU_FOLDER = TEST_GPU_FOLDER.format(id=4)


class AMDGPUDriverTest(IsolatedAsyncioTestCase):

def tearDown(self):
if os.path.exists(TEMP_GPU_FOLDER):
temp_dir = os.path.dirname(TEMP_GPU_FOLDER)

if os.path.exists(temp_dir):
try:
shutil.rmtree(TEMP_GPU_FOLDER)
shutil.rmtree(temp_dir)
except:
sys.stderr.write(f"Could not remove temp AMD gpu folder '{TEMP_GPU_FOLDER}'")
traceback.print_exc()

async def get_cached_gpus__must_always_return_the_same_cached_result_when_cache_is_true(self):
driver = AMDGPUDriver(cache=True, logger=Mock(), gpus_path=RESOURCES_DIR)
driver.get_gpus = AsyncMock(side_effect=[{'a'}, {'b'}])

for _ in range(2):
self.assertEqual({'a'}, await driver.get_cached_gpus())

driver.get_gpus.assert_called_once()

async def get_cached_gpus__must_always_return_the_same_cached_result_even_when_none(self):
driver = AMDGPUDriver(cache=True, logger=Mock(), gpus_path=RESOURCES_DIR)
driver.get_gpus = AsyncMock(side_effect=[None, {'b'}])

for _ in range(2):
self.assertIsNone(await driver.get_cached_gpus())
def test_get_vendor_name__must_return_AMD(self):
self.assertEqual('AMD', AMDGPUDriver.get_vendor_name())

driver.get_gpus.assert_called_once()

async def get_cached_gpus__must_always_call_get_gpus_when_cache_is_false(self):
driver = AMDGPUDriver(cache=False, logger=Mock(), gpus_path=RESOURCES_DIR)
driver.get_gpus = AsyncMock(side_effect=[{'a'}, {'b'}])
async def test_can_work__always_true(self):
driver = AMDGPUDriver(cache=False, logger=Mock(), gpus_path=TEST_GPU_FOLDER)
res, msg = driver.can_work()
self.assertTrue(res)
self.assertIsNone(msg)

self.assertEqual({'a'}, await driver.get_cached_gpus())
self.assertEqual({'b'}, await driver.get_cached_gpus())
def test_get_default_mode__must_return_auto_and_3(self):
driver = AMDGPUDriver(cache=False, logger=Mock(), gpus_path=TEST_GPU_FOLDER)
self.assertEqual('auto:3', driver.get_default_mode())

self.assertEqual(2, driver.get_gpus.call_count)
def test_get_performance_mode__must_return_manual_and_5(self):
driver = AMDGPUDriver(cache=False, logger=Mock(), gpus_path=TEST_GPU_FOLDER)
self.assertEqual('manual:5', driver.get_performance_mode())

async def test_get_gpus__empty_when_there_are_no_files(self):
driver = AMDGPUDriver(cache=False, logger=Mock(), gpus_path=RESOURCES_DIR)
returned = await driver.get_gpus()
self.assertIsNotNone(returned)
self.assertEqual(set(), returned)
self.assertIsNone(returned)

async def test_get_gpus__return_available_gpus_when_required_files_exist(self):
driver = AMDGPUDriver(cache=False, logger=Mock(), gpus_path=TEST_GPU_FOLDER)
returned = await driver.get_gpus()
self.assertIsNotNone(returned)
self.assertEqual({'1', '2', '3'}, returned)

def test_can_work__always_true(self):
driver = AMDGPUDriver(cache=False, logger=Mock(), gpus_path=TEST_GPU_FOLDER)
driver._gpus = {'1'}
res, msg = driver.can_work()
self.assertTrue(res)
self.assertIsNone(msg)
expected = {TEST_GPU_FOLDER.format(id=n) for n in (1, 2, 3)}
self.assertEqual(expected, returned)

async def test_get_power_mode__return_auto_when_both_files_content_not_meet_expected_words(self):
async def test_get_power_mode__return_a_string_concatenating_the_performance_and_profile_ids(self):
driver = AMDGPUDriver(cache=False, logger=Mock(), gpus_path=TEST_GPU_FOLDER)

actual_modes = await driver.get_power_mode({'1', '3'})
self.assertEqual({'1': GPUPowerMode.AUTO, '3': GPUPowerMode.AUTO}, actual_modes)
gpu_dirs = {TEST_GPU_FOLDER.format(id=n) for n in (1, 2, 3)}
actual_modes = await driver.get_power_mode(gpu_dirs)

self.assertIsNotNone(driver._default_performance_level)
self.assertEqual('', driver._default_performance_level['1'])
self.assertNotIn('3', driver._default_performance_level)
expected = {
TEST_GPU_FOLDER.format(id=1): 'manual:3',
TEST_GPU_FOLDER.format(id=2): 'manual:5',
TEST_GPU_FOLDER.format(id=3): 'auto:5'
}
self.assertEqual(expected, actual_modes)

self.assertIsNotNone(driver._default_power_profile)
self.assertEqual('', driver._default_power_profile['1'])
self.assertEqual('unknown', driver._default_power_profile['3'])

async def test_get_power_mode__return_performance_when_both_files_content_meet_expected_words(self):
driver = AMDGPUDriver(cache=False, logger=Mock(), gpus_path=TEST_GPU_FOLDER)
actual_modes = await driver.get_power_mode({'2'})
self.assertEqual({'2': GPUPowerMode.PERFORMANCE}, actual_modes)

async def test_set_power_mode__write_the_expected_performance_related_words_to_both_files(self):
example_folder = f'{TEST_GPU_FOLDER}/1'
async def test_set_power_mode__write_the_expected_content_within_the_parsed_mode_string(self):
example_folder = TEST_GPU_FOLDER.format(id=1)
try:
shutil.copytree(example_folder, TEMP_GPU_FOLDER)
except:
self.fail(f"Could not copy example folder '{example_folder}'")

driver = AMDGPUDriver(cache=False, logger=Mock(), gpus_path=TEST_GPU_FOLDER)
res = await driver.set_power_mode({'4': GPUPowerMode.PERFORMANCE})
self.assertEqual({'4': True}, res)

with open(f'{TEMP_GPU_FOLDER}/{AMDGPUDriver.PERFORMANCE_FILE}') as f:
current_performance = f.read()

self.assertEqual('auto', current_performance)

with open(f'{TEMP_GPU_FOLDER}/{AMDGPUDriver.PROFILE_FILE}') as f:
current_profile = f.read()

self.assertEqual('set', current_profile)

async def test_set_power_mode__write_cached_strings_when_not_performance(self):
example_folder = f'{TEST_GPU_FOLDER}/1'
try:
shutil.copytree(example_folder, TEMP_GPU_FOLDER)
except:
self.fail(f"Could not copy example folder '{example_folder}'")

driver = AMDGPUDriver(cache=False, logger=Mock(), gpus_path=TEST_GPU_FOLDER)
driver._default_performance_level = {'4': 'low'}
driver._default_power_profile = {'4': 'get'}

res = await driver.set_power_mode({'4': GPUPowerMode.AUTO})
self.assertEqual({'4': True}, res)
res = await driver.set_power_mode({TEMP_GPU_FOLDER: driver.get_performance_mode()})
self.assertEqual({TEMP_GPU_FOLDER: True}, res)

with open(f'{TEMP_GPU_FOLDER}/{AMDGPUDriver.PERFORMANCE_FILE}') as f:
current_performance = f.read()
control_mode = f.read()

self.assertEqual(driver._default_performance_level['4'], current_performance)
self.assertEqual('manual', control_mode)

with open(f'{TEMP_GPU_FOLDER}/{AMDGPUDriver.PROFILE_FILE}') as f:
current_profile = f.read()
power_mode = f.read()

self.assertEqual(driver._default_power_profile['4'], current_profile)
self.assertEqual('5', power_mode)
Loading

0 comments on commit d4604a2

Please sign in to comment.