-
Notifications
You must be signed in to change notification settings - Fork 50
/
rocm_smi.py
executable file
·3249 lines (2835 loc) · 133 KB
/
rocm_smi.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python3
"""ROCm_SMI_LIB CLI Tool
This tool acts as a command line interface for manipulating
and monitoring the amdgpu kernel, and is intended to replace
and deprecate the existing rocm_smi.py CLI tool.
It uses Ctypes to call the rocm_smi_lib API.
Recommended: At least one AMD GPU with ROCm driver installed
Required: ROCm SMI library installed (librocm_smi64)
"""
from __future__ import print_function
import argparse
import json
import logging
import os
import sys
import subprocess
import _thread
import time
from time import ctime
from subprocess import check_output
from rsmiBindings import *
# rocmSmiLib_cli version. Increment this as needed.
# Major version - Increment when backwards-compatibility breaks
# Minor version - Increment when adding a new feature, set to 0 when major is incremented
# Patch version - Increment when adding a fix, set to 0 when minor is incremented
SMI_MAJ = 1
SMI_MIN = 4
SMI_PAT = 1
# Dotted 'major.minor.patch' string assembled from the three parts above
__version__ = '%s.%s.%s' % (SMI_MAJ, SMI_MIN, SMI_PAT)
# Set to 1 if an error occurs
RETCODE = 0
# If we want JSON format output instead
PRINT_JSON = False
# Collected output for JSON/CSV mode; formatJson() stores entries under
# 'card<N>' keys for devices and under 'system' for system-level data
JSON_DATA = {}
# Version of the JSON output used to save clocks
CLOCK_JSON_VERSION = 1
# Banner texts (presumably shown at the start and end of the log -- the code
# that prints them is outside this part of the file)
headerString = ' ROCm System Management Interface '
footerString = ' End of ROCm SMI Log '
# Output formatting
# Line width in characters used by printLogSpacer() and printListLog()
appWidth = 80
deviceList = []
# Enable or disable serialized format
OUTPUT_SERIALIZATION = False
# These are the valid clock types that can be returned/modified:
# TODO: "clk_type_names" from rsmiBindings.py should fetch valid clocks from
# the same location as rocm_smi_device.cc instead of hardcoding the values
validClockNames = clk_type_names[1:-2]
# The purpose of the [1:-2] here ^^^^ is to remove the duplicate elements at the
# beginning and end of the clk_type_names list (specifically sclk and mclk)
# Also the "invalid" clock in the list is removed since it isn't a valid clock type
# 'pcie' is not part of clk_type_names, so it is added by hand before sorting
validClockNames.append('pcie')
validClockNames.sort()
def driverInitialized():
    """ Returns true if amdgpu is found in the list of initialized modules

    /proc/modules is read directly instead of spawning a 'cat | grep' shell
    pipeline; the match is still a plain substring search on every line.

    @return: True if any line of /proc/modules contains 'amdgpu', False
    otherwise (also when the file cannot be read)
    """
    try:
        with open('/proc/modules', 'r', errors='replace') as modules:
            return any('amdgpu' in entry for entry in modules)
    except (IOError, OSError):
        # No readable module list: report the driver as not initialized,
        # which is what the failing shell pipeline used to yield
        return False
def formatJson(device, log):
    """ Store 'key: value' lines of a log string in JSON_DATA

    Every line of the log is split on ': '. The first piece becomes the key and
    the second piece (stripped) becomes the value; any further pieces are
    ignored. Lines that do not contain the ': ' separator are dropped.

    @param device: DRM device identifier, or 'system' for system-level data
    @param log: String to parse and output into JSON format
    """
    global JSON_DATA
    # Device data lives under 'card<N>', system data under 'system'
    section = 'system' if str(device) == 'system' else 'card' + str(device)
    for line in log.splitlines():
        logTuple = line.split(': ')
        # Drop any invalid or improperly-formatted data. Checking the number
        # of pieces (rather than just looking for ':') keeps lines such as
        # 'name:' or 'a:b' from raising IndexError below.
        if len(logTuple) < 2:
            continue
        JSON_DATA[section][logTuple[0]] = logTuple[1].strip()
def formatCsv(deviceList):
    """ Return the contents of JSON_DATA formatted as CSV text

    The first line is the header ('device' followed by every key found for the
    requested devices, in first-seen order). One line follows per entry of
    deviceList; keys an entry does not have are reported as 'N/A'.

    @param deviceList: List of DRM devices; may also contain 'system'
    @return: The CSV text, or an empty string if there is no key to report
    """
    global JSON_DATA

    def _sectionName(dev):
        # Device data lives under 'card<N>', system data under 'system'
        return 'system' if str(dev) == 'system' else 'card' + str(dev)

    # Separate device-specific information from system-level information
    headerkeys = []
    for dev in deviceList:
        for key in JSON_DATA[_sectionName(dev)]:
            if key not in headerkeys:
                headerkeys.append(key)
    if not headerkeys:
        return ''
    rows = [','.join(['device'] + headerkeys)]
    for dev in deviceList:
        name = _sectionName(dev)
        # Label the row with the section name so that the system row reads
        # 'system' rather than 'cardsystem'
        cells = [name]
        for key in headerkeys:
            try:
                # Remove commas like the ones in PCIe speed
                cells.append(JSON_DATA[name][key].replace(',', ''))
            except KeyError:
                # If the key doesn't exist (like dcefclock on Fiji, or unsupported functionality)
                cells.append('N/A')
        rows.append(','.join(cells))
    return '\n'.join(rows) + '\n'
def formatMatrixToJSON(deviceList, matrix, metricName):
    """ Log every unordered device pair of a symmetric matrix via printSysLog.

    @param deviceList: List of DRM devices (can be a single-item list)
    @param matrix: symmetric matrix indexed as matrix[deviceA][deviceB], e.g.
    [[0, 40], [40, 0]] for two GPUs
    @param metricName: Format string for the log title; it is formatted with
    the two device identifiers of the pair
    """
    for row_pos, row_dev in enumerate(deviceList):
        # Only the upper triangle is walked: (GPU1, GPU2) equals (GPU2, GPU1)
        for col_dev in deviceList[row_pos + 1:]:
            try:
                # Cells exposing '.value' are unwrapped first
                shown = matrix[row_dev][col_dev].value
            except AttributeError:
                shown = matrix[row_dev][col_dev]
            printSysLog(metricName.format(row_dev, col_dev), shown)
def getBus(device):
    """ Return the bus identifier of a given device

    @param device: DRM device identifier
    @return: String 'DDDD:BB:DD.F' (hexadecimal domain:bus:device.function),
    or None if the PCI id could not be read
    """
    bdfid = c_uint64(0)
    ret = rocmsmi.rsmi_dev_pci_id_get(device, byref(bdfid))
    # The status check must see the DRM device index, so the decoded PCI
    # device number below is kept in its own variable
    if not rsmi_ret_ok(ret, device):
        return None
    # BDFID = ((DOMAIN & 0xffffffff) << 32) | ((BUS & 0xff) << 8) |((DEVICE & 0x1f) <<3 ) | (FUNCTION & 0x7)
    raw = bdfid.value
    domain = (raw >> 32) & 0xffffffff
    bus = (raw >> 8) & 0xff
    pci_device = (raw >> 3) & 0x1f
    function = raw & 0x7
    return '{:04X}:{:02X}:{:02X}.{:0X}'.format(domain, bus, pci_device, function)
def getFanSpeed(device):
    """ Return a tuple (fan speed, percentage of maximum) for a device.

    A reading that cannot be obtained counts as 0. If either the current or
    the maximum speed is 0 the tuple is (current speed, 0).
    @param device: DRM device identifier
    """
    sensor_ind = c_uint32(0)
    current = c_int64()
    maximum = c_int64()
    speed = 0
    ceiling = 0
    status = rocmsmi.rsmi_dev_fan_speed_get(device, sensor_ind, byref(current))
    if rsmi_ret_ok(status, device, None, True):
        speed = current.value
    status = rocmsmi.rsmi_dev_fan_speed_max_get(device, sensor_ind, byref(maximum))
    if rsmi_ret_ok(status, device, None, True):
        ceiling = maximum.value
    if speed != 0 and ceiling != 0:
        return (speed, round((float(speed) / float(ceiling)) * 100, 2))
    # Avoids dividing by zero when a reading is missing or zero
    return (speed, 0)
def getGpuUse(device):
    """ Return the busy percentage reported for a device, or -1 on failure.

    @param device: DRM device identifier
    """
    busy = c_uint32()
    status = rocmsmi.rsmi_dev_busy_percent_get(device, byref(busy))
    if not rsmi_ret_ok(status, device, 'GPU Utilization '):
        return -1
    return busy.value
def getId(device):
    """ Return the hexadecimal value of a device's ID

    @param device: DRM device identifier
    @return: Hex string such as '0x73bf', or None if the ID could not be read
    """
    dv_id = c_short()
    ret = rocmsmi.rsmi_dev_id_get(device, byref(dv_id))
    if rsmi_ret_ok(ret, device):
        # The buffer is a signed 16-bit value; mask it so that IDs of 0x8000
        # and above are not rendered as negative numbers (e.g. '-0x678c')
        return hex(dv_id.value & 0xffff)
    return None
def getMaxPower(device):
    """ Return the power cap of a device divided by 1000000, or -1 on failure.

    @param device: DRM device identifier
    """
    cap = c_uint64()
    status = rocmsmi.rsmi_dev_power_cap_get(device, 0, byref(cap))
    if not rsmi_ret_ok(status, device):
        return -1
    # NOTE(review): the scaling suggests microwatts -> watts; confirm against the library docs
    return cap.value / 1000000
def getMemInfo(device, memType):
    """ Return a tuple (used, total) for one memory type of a device.

    Each element is None if its value could not be read. An unknown memory
    type is logged as an error and yields (None, None).
    @param device: DRM device identifier
    @param memType: Memory type name; it is upper-cased and must be listed in
    memory_type_l
    """
    memType = memType.upper()
    if memType not in memory_type_l:
        printErrLog(device, 'Invalid memory type %s' % (memType))
        return (None, None)
    used_raw = c_uint64()
    total_raw = c_uint64()
    status = rocmsmi.rsmi_dev_memory_usage_get(device, memory_type_l.index(memType), byref(used_raw))
    memUsed = used_raw.value if rsmi_ret_ok(status, device, memType) else None
    status = rocmsmi.rsmi_dev_memory_total_get(device, memory_type_l.index(memType), byref(total_raw))
    memTotal = total_raw.value if rsmi_ret_ok(status, device, memType + ' total') else None
    return (memUsed, memTotal)
def getProcessName(pid):
    """ Get the process name of a specific pid

    The name is obtained from 'ps -p <pid> -o comm='. The command is run
    without a shell and its output is decoded, so names containing quotes or
    non-ASCII characters come back unmangled.

    @param pid: Process ID of a program to be parsed (anything int() accepts)
    @return: The process name, or 'UNKNOWN' if the pid is smaller than 1, the
    process cannot be found or ps cannot be run
    """
    pidNumber = int(pid)
    if pidNumber < 1:
        logging.debug('PID must be greater than 0')
        return 'UNKNOWN'
    try:
        rawName = subprocess.check_output(['ps', '-p', str(pidNumber), '-o', 'comm='])
    except (subprocess.CalledProcessError, OSError):
        # ps exits non-zero for an unknown pid; OSError covers a missing ps binary
        return 'UNKNOWN'
    return rawName.decode('utf-8', 'replace').strip() or 'UNKNOWN'
def getPerfLevel(device):
    """ Return the performance level name of a device, or -1 on failure.

    @param device: DRM device identifier
    """
    level = rsmi_dev_perf_level_t()
    status = rocmsmi.rsmi_dev_perf_level_get(device, byref(level))
    if not rsmi_ret_ok(status, device):
        return -1
    # perf_level_string turns the raw level value into its display name
    return perf_level_string(level.value)
def getPid(name):
    """ Return the raw output of 'pidof <name>'.

    @param name: Process name of a program to be parsed
    """
    command = ['pidof', name]
    return subprocess.check_output(command)
def getPidList():
    """ Return a list of KFD process IDs

    The library is queried twice: first without a buffer to learn how many
    processes exist, then with a buffer that has 10 spare slots in case new
    processes appeared in between.

    @return: List of process IDs as strings, or None if either query fails
    """
    num_items = c_uint32()
    ret = rocmsmi.rsmi_compute_process_info_get(None, byref(num_items))
    if not rsmi_ret_ok(ret):
        return None
    buff_sz = num_items.value + 10
    procs = (rsmi_process_info_t * buff_sz)()
    # num_items is in/out: on input it tells the library how many entries the
    # buffer can hold, so advertise the whole buffer including the spare slots
    num_items.value = buff_sz
    ret = rocmsmi.rsmi_compute_process_info_get(byref(procs), byref(num_items))
    # Do not report buffer contents when the second query failed
    if not rsmi_ret_ok(ret):
        return None
    # Never read past the end of the buffer, whatever count comes back
    count = min(num_items.value, buff_sz)
    return ['%s' % (procs[i].process_id) for i in range(count)]
def getPower(device):
    """ Return the current power level of a given device

    @param device: DRM device identifier
    @return: The average power reading divided by 1000000, or 'N/A' if it
    could not be read
    """
    # rsmi_dev_power_ave_get writes a 64-bit value, so the buffer must be
    # 64 bits wide (checkIfSecondaryDie passes a c_uint64 to the same call)
    power = c_uint64()
    ret = rocmsmi.rsmi_dev_power_ave_get(device, 0, byref(power))
    if rsmi_ret_ok(ret, device, 'power'):
        return power.value / 1000000
    return 'N/A'
def getRasEnablement(device, block):
    """ Return the upper-cased RAS state name of a block, or 'N/A' on failure.

    @param device: DRM device identifier
    @param block: RAS block identifier (a key of rsmi_gpu_block_d)
    """
    ras_state = rsmi_ras_err_state_t()
    status = rocmsmi.rsmi_dev_ecc_status_get(device, rsmi_gpu_block_d[block], byref(ras_state))
    if not rsmi_ret_ok(status, device, block, True):
        return 'N/A'
    return rsmi_ras_err_stale_machine[ras_state.value].upper()
def getTemp(device, sensor):
    """ Return the current reading of a temperature sensor divided by 1000,
    or 'N/A' if it cannot be read.

    @param device: DRM device identifier
    @param sensor: Temperature sensor identifier (an entry of temp_type_lst)
    """
    reading = c_int64(0)
    metric = rsmi_temperature_metric_t.RSMI_TEMP_CURRENT
    status = rocmsmi.rsmi_dev_temp_metric_get(c_uint32(device), temp_type_lst.index(sensor), metric, byref(reading))
    if not rsmi_ret_ok(status, device, sensor, True):
        return 'N/A'
    return reading.value / 1000
def getVbiosVersion(device):
    """ Return the VBIOS version string of a device.

    Nothing is returned (None) if the version cannot be read.
    @param device: DRM device identifier
    """
    version_buf = create_string_buffer(256)
    status = rocmsmi.rsmi_dev_vbios_version_get(device, version_buf, 256)
    if not rsmi_ret_ok(status, device):
        return
    return version_buf.value.decode()
def getVersion(deviceList, component):
    """ Return the version string of a software component, or None on failure.

    @param deviceList: List of DRM devices (not used by this function)
    @param component: Component (currently only driver)
    """
    version_buf = create_string_buffer(256)
    status = rocmsmi.rsmi_version_str_get(component, version_buf, 256)
    if not rsmi_ret_ok(status, None, component):
        return None
    return version_buf.value.decode()
def print2DArray(dataArray):
    """ Print 2D Array with uniform spacing

    Every column is padded to the width of its longest cell. In JSON mode the
    first row is treated as a header and skipped; each later row is logged via
    printSysLog with its first cell as the key (prefixed with 'PID' when the
    header's first cell is 'PID').

    @param dataArray: List of rows; every row needs at least as many cells as
    the first row. Cells may be of any type, they are rendered with str().
    """
    global PRINT_JSON
    # Nothing to print for an empty table
    if not dataArray or not dataArray[0]:
        return
    isPid = str(dataArray[0][0]) == 'PID'
    columnCount = len(dataArray[0])
    # Measure the rendered text so that non-string cells (ints, None) work too
    dataArrayLength = [max(len(str(row[cell])) for row in dataArray) for cell in range(columnCount)]
    for position, row in enumerate(dataArray):
        printString = ''.join(
            str(row[cell]).ljust(dataArrayLength[cell], ' ') + '\t' for cell in range(columnCount))
        if PRINT_JSON:
            printString = ' '.join(printString.split()).lower()
            # partition() copes with single-column rows, where no ' ' exists
            firstElement, _, remainder = printString.partition(' ')
            remainder = remainder.replace(' ', ', ')
            if position > 0:
                if isPid:
                    printSysLog('PID%s' % (firstElement), remainder)
                else:
                    printSysLog(firstElement, remainder)
        else:
            printLog(None, printString, None)
def printEmptyLine():
    """ Print one empty line, unless JSON output is enabled """
    global PRINT_JSON
    if PRINT_JSON:
        return
    print()
def printErrLog(device, err):
    """ Send an error text to the log, one record per line of the text.

    Records are logged at error level, or at debug level when JSON output is
    enabled.
    @param device: DRM device identifier
    @param err: Error string to print
    """
    global PRINT_JSON
    for text in err.split('\n'):
        record = 'GPU[%s] \t\t: %s' % (device, text)
        if PRINT_JSON:
            logging.debug(record)
        else:
            logging.error(record)
def printEventList(device, delay, eventList):
    """ Print notification events of a device as they arrive.

    After a successful setup this function loops forever; it has no exit
    condition of its own.
    @param device: DRM device identifier
    @param delay: Notification delay in ms
    @param eventList: List of event type names (can be a single-item list)
    """
    status = rocmsmi.rsmi_event_notification_init(device)
    if not rsmi_ret_ok(status, device):
        printErrLog(device, 'Unable to initialize event notifications.')
        return
    # One bit per requested event type, at the position of its name in
    # notification_type_names
    mask = 0
    for eventType in eventList:
        mask |= 2 ** notification_type_names.index(eventType.upper())
    status = rocmsmi.rsmi_event_notification_mask_set(device, mask)
    if not rsmi_ret_ok(status, device):
        printErrLog(device, 'Unable to set event notification mask.')
        return
    while True:  # Exit condition from user keyboard input of 'q' or 'ctrl + c'
        num_elements = c_uint32(1)
        data = rsmi_evt_notification_data_t(1)
        rocmsmi.rsmi_event_notification_get(delay, byref(num_elements), byref(data))
        if len(data.message) == 0:
            continue
        label = '\rGPU[%d]:\t' % (device)
        # Fourth field of ctime() is the HH:MM:SS part
        clock = ctime().split()[3]
        eventName = notification_type_names[data.event.value - 1]
        message = data.message.decode('utf8') + '\r'
        print2DArray([[label, clock, eventName, message]])
def printLog(device, metricName, value):
    """ Write one entry to the SMI log.

    In JSON mode the entry is handed to formatJson (entries without a device
    are dropped). Otherwise it is printed, prefixed with the device unless
    device is None.
    @param device: DRM device identifier
    @param metricName: Title of the item to print to the log
    @param value: The item's value to print to the log
    """
    global PRINT_JSON
    if PRINT_JSON:
        if device is not None:
            if value is None:
                formatJson(device, str(metricName))
            else:
                formatJson(device, str(metricName) + ': ' + str(value))
        return
    if value is None:
        logstr = 'GPU[%s]\t\t: %s' % (device, metricName)
    else:
        logstr = 'GPU[%s]\t\t: %s: %s' % (device, metricName, value)
    if device is None:
        # Cuts off the 13 characters of the 'GPU[None]\t\t: ' prefix
        logstr = logstr[13:]
    logging.debug(logstr)
    # Force thread safe printing
    print(logstr + '\n', end='')
def printListLog(metricName, valuesList):
    """ Print a list of values, wrapped to stay below appWidth characters.

    The output starts with '<metricName>:' on its own line. An empty list
    prints just 'None' (without the metric name). Nothing is printed in JSON
    mode.
    @param metricName: Title of the item to print to the log
    @param valuesList: The item's list of values to print to the log
    """
    global PRINT_JSON
    finished = []
    current = metricName + ':\n'
    if valuesList:
        for item in valuesList:
            token = str(item) + ' '
            if (len(current) + len(token)) < appWidth:
                current += token
            else:
                # Line is full: keep it and start the next one with this value
                finished.append(current)
                current = token
    else:
        current = 'None'
    if not PRINT_JSON:
        print('\n'.join(finished + [current]))
def printLogSpacer(displayString=None, fill='='):
    """ Print a spacer line, optionally with a title in its middle.

    Without a title the line is the fill string repeated appWidth times.
    Nothing is printed in JSON mode.
    @param displayString: name of item to be displayed inside of the log spacer
    @param fill: padding string which surrounds the given display string
    """
    global appWidth, PRINT_JSON
    if PRINT_JSON:
        return
    if not displayString:
        print(fill * appWidth)
        return
    if len(displayString) % 2:
        # Odd-length titles get one extra fill appended before centering
        displayString += fill
    padding = fill * int((appWidth - (len(displayString))) / 2)
    print(padding + displayString + padding)
def printSysLog(SysComponentName, value):
    """ Write one system-level entry to the SMI log.

    In JSON mode the entry is stored in the 'system' section of JSON_DATA
    (created on demand); otherwise it is printed as '<name>: <value>'.
    @param SysComponentName: Title of the item to print to the log
    @param value: The item's value to print to the log
    """
    global PRINT_JSON, JSON_DATA
    if not PRINT_JSON:
        logstr = '{}: {}'.format(SysComponentName, value)
        logging.debug(logstr)
        print(logstr)
        return
    JSON_DATA.setdefault('system', {})
    formatJson('system', str(SysComponentName) + ': ' + str(value))
def printTableLog(column_headers, data_matrix, device=None, tableName=None, anchor='>', v_delim=' '):
    """ Print a table: optional title line, header line, then one line per row.

    Nothing is printed when serialized or JSON output is enabled.
    @param column_headers: Header names for each column
    @param data_matrix: Matrix of values
    @param device: DRM device identifier
    @param tableName: Title of the table to print to the log
    @param anchor: Alignment character of the format spec ('>' aligns cells to
    the right, '<' to the left)
    @param v_delim: Boundary String delimiter for the print output
    """
    # Usage: the length of each column header determines the column width.
    # If additional space is needed, pad the corresponding header with spaces.
    global OUTPUT_SERIALIZATION, PRINT_JSON
    if OUTPUT_SERIALIZATION or PRINT_JSON:
        return
    hasDevice = device is not None
    if hasDevice or tableName:
        if hasDevice:
            print('\nGPU[%s]: ' % (device), end='\t')
        if tableName:
            print(tableName, end='')
        printEmptyLine()
    for header in column_headers:
        print('{:>}'.format(header), end=v_delim)
    printEmptyLine()
    for row in data_matrix:
        for index, cell in enumerate(row):
            shown = 'None' if cell is None else cell
            print('{:{anc}{width}}'.format(shown, anc=anchor, width=len(column_headers[index])), end=v_delim)
        printEmptyLine()
def printTableRow(space, displayString, v_delim=" "):
    """ Print one cell of a matrix table, followed by the delimiter.

    @param space: %-style format string applied to the value; if empty or None
    the value is printed as it is
    @param displayString: The item's value to print
    @param v_delim: Boundary String delimiter for the print output
    """
    rendered = space % (displayString) if space else displayString
    print(rendered, end=v_delim)
def checkIfSecondaryDie(device):
    """ Checks if GCD(die) is the secondary die in a MCM.

    Secondary dies lack power management features.
    TODO: switch to more robust way to check for primary/secondary die, when implemented in Kernel and rocm_smi_lib.
    (This function used to be defined twice with identical bodies; only one
    definition is kept.)

    @param device: The device to check
    @return: True if power cap, default power cap and average power can all be
    read and are all zero, False otherwise
    """
    power_cap = c_uint64()
    # secondary die can currently be determined by checking if all power1_* (power cap) values are equal to zero.
    ret = rocmsmi.rsmi_dev_power_cap_get(device, 0, byref(power_cap))
    if not (rsmi_ret_ok(ret, None, None, False) and power_cap.value == 0):
        return False
    ret = rocmsmi.rsmi_dev_power_cap_default_get(device, byref(power_cap))
    if not (rsmi_ret_ok(ret, None, None, False) and power_cap.value == 0):
        return False
    ret = rocmsmi.rsmi_dev_power_ave_get(device, 0, byref(power_cap))
    if not (rsmi_ret_ok(ret, None, None, False) and power_cap.value == 0):
        return False
    return True
def resetClocks(deviceList):
    """ Reset clocks to default.

    For every device three calls are made in order, each with
    rsmi_dev_perf_level_t(0) as its argument and each reported in the log:
    the OverDrive level set, then the performance level set twice (reported
    once as the clock reset and once as the reset to auto).
    @param deviceList: List of DRM devices (can be a single-item list)
    """
    # (library function name, message on success, message on failure)
    steps = (
        ('rsmi_dev_overdrive_level_set', 'OverDrive set to 0', 'Unable to reset OverDrive'),
        ('rsmi_dev_perf_level_set', 'Successfully reset clocks', 'Unable to reset clocks'),
        ('rsmi_dev_perf_level_set', 'Performance level reset to auto',
         'Unable to reset performance level to auto'),
    )
    printLogSpacer(' Reset Clocks ')
    for device in deviceList:
        for funcName, okText, failText in steps:
            status = getattr(rocmsmi, funcName)(device, rsmi_dev_perf_level_t(0))
            if rsmi_ret_ok(status, device):
                printLog(device, okText, None)
            else:
                printLog(device, failText, None)
def resetFans(deviceList):
    """ Hand fan control back to the driver for a list of devices.

    Only successful resets are reported in the log.
    @param deviceList: List of DRM devices (can be a single-item list)
    """
    printLogSpacer(' Reset GPU Fan Speed ')
    for device in deviceList:
        sensor_ind = c_uint32(0)
        status = rocmsmi.rsmi_dev_fan_reset(device, sensor_ind)
        if not rsmi_ret_ok(status, device):
            continue
        printLog(device, 'Successfully reset fan speed to driver control', None)
    printLogSpacer()
def resetPowerOverDrive(deviceList, autoRespond):
    """ Reset Power OverDrive by calling setPowerOverDrive with a value of 0.

    @param deviceList: List of DRM devices (can be a single-item list)
    @param autoRespond: Passed through unchanged to setPowerOverDrive
    """
    defaultValue = 0
    setPowerOverDrive(deviceList, defaultValue, autoRespond)
def resetProfile(deviceList):
    """ Reset the power profile and the performance level of each device.

    Both resets are attempted for every device, whatever the outcome of the
    first one; each outcome is logged.
    @param deviceList: List of DRM devices (can be a single-item list)
    """
    printLogSpacer(' Reset Profile ')
    for device in deviceList:
        profileStatus = rocmsmi.rsmi_dev_power_profile_set(device, 0, profileString('BOOTUP DEFAULT'))
        if not rsmi_ret_ok(profileStatus, device):
            printErrLog(device, 'Unable to reset Power Profile')
        else:
            printLog(device, 'Successfully reset Power Profile', None)
        levelStatus = rocmsmi.rsmi_dev_perf_level_set(device, rsmi_dev_perf_level_t(0))
        if not rsmi_ret_ok(levelStatus, device):
            printErrLog(device, 'Unable to reset Performance Level')
        else:
            printLog(device, 'Successfully reset Performance Level', None)
    printLogSpacer()
def resetXgmiErr(deviceList):
    """ Reset the XGMI error count of each device and log the outcome.

    @param deviceList: Reset XGMI error count for these devices
    """
    printLogSpacer('Reset XGMI Error Status ')
    for device in deviceList:
        status = rocmsmi.rsmi_dev_xgmi_error_reset(device)
        if not rsmi_ret_ok(status, device, 'reset xgmi'):
            logging.error('GPU[%s]\t\t: Unable to reset XGMI error count', device)
            continue
        printLog(device, 'Successfully reset XGMI Error count', None)
    printLogSpacer()
def resetPerfDeterminism(deviceList):
    """ Reset Performance Determinism

    Performance determinism is switched off by setting the performance level
    of each device to rsmi_dev_perf_level_t(0); the outcome is logged.

    @param deviceList: Disable Performance Determinism for these devices
    """
    printLogSpacer('Disable Performance Determinism')
    for device in deviceList:
        ret = rocmsmi.rsmi_dev_perf_level_set(device, rsmi_dev_perf_level_t(0))
        if rsmi_ret_ok(ret, device, 'disable performance determinism'):
            printLog(device, 'Successfully disabled performance determinism', None)
        else:
            # Message used to read "diable"
            logging.error('GPU[%s]\t\t: Unable to disable performance determinism', device)
    printLogSpacer()
def setClockRange(deviceList, clkType, minvalue, maxvalue, autoRespond):
    """ Set the range for the specified clktype in the PowerPlay table for a list of devices.

    RETCODE is set to 1 if the arguments are invalid or the range cannot be
    set on a device.

    Parameters:
    deviceList -- List of DRM devices (can be a single-item list)
    clkType -- [sclk|mclk] Which clock type to apply the range to
    minvalue -- Minimum value to apply to the clock range
    maxvalue -- Maximum value to apply to the clock range
    autoRespond -- Response to automatically provide for all prompts
    """
    global RETCODE
    if clkType not in {'sclk', 'mclk'}:
        printLog(None, 'Invalid range identifier %s' % (clkType), None)
        logging.error('Unsupported range type %s', clkType)
        RETCODE = 1
        return
    try:
        minClock = int(minvalue)
        maxClock = int(maxvalue)
    except (TypeError, ValueError):
        # No device has been selected at this point, so the error is logged
        # without one (referring to 'device' here raised a NameError)
        printErrLog(None, 'Unable to set %s range' % (clkType))
        logging.error('%s or %s is not an integer', minvalue, maxvalue)
        RETCODE = 1
        return
    confirmOutOfSpecWarning(autoRespond)
    printLogSpacer(' Set Valid %s Range ' % (clkType))
    for device in deviceList:
        ret = rocmsmi.rsmi_dev_clk_range_set(device, minClock, maxClock, rsmi_clk_names_dict[clkType])
        if rsmi_ret_ok(ret, device):
            printLog(device, 'Successfully set %s from %s(MHz) to %s(MHz)' % (clkType, minvalue, maxvalue), None)
        else:
            printErrLog(device, 'Unable to set %s from %s(MHz) to %s(MHz)' % (clkType, minvalue, maxvalue))
            RETCODE = 1
def setVoltageCurve(deviceList, point, clk, volt, autoRespond):
    """ Set voltage curve for a point in the PowerPlay table for a list of devices.

    RETCODE is set to 1 if an argument is not an integer or the point cannot
    be set on a device.

    Parameters:
    deviceList -- List of DRM devices (can be a single-item list)
    point -- Point on the voltage curve to modify
    clk -- Clock speed specified for this curve point
    volt -- Voltage specified for this curve point
    autoRespond -- Response to automatically provide for all prompts
    """
    global RETCODE
    value = '%s %s %s' % (point, clk, volt)
    try:
        # Convert each argument on its own. Walking the characters of the
        # joined string stopped at the first non-zero digit and failed on the
        # separating space whenever the string started with '0'.
        pointIndex = int(point)
        clkValue = int(clk)
        voltValue = int(volt)
    except (TypeError, ValueError):
        # printLog(None, ...) is how device-less messages are printed in the
        # rest of this file
        printLog(None, 'Unable to set Voltage curve', None)
        logging.error('Non-integer characters are present in %s', value)
        RETCODE = 1
        return
    confirmOutOfSpecWarning(autoRespond)
    for device in deviceList:
        ret = rocmsmi.rsmi_dev_od_volt_info_set(device, pointIndex, clkValue, voltValue)
        if rsmi_ret_ok(ret, device):
            printLog(device, 'Successfully set voltage point %s to %s(MHz) %s(mV)' % (point, clk, volt), None)
        else:
            printErrLog(device, 'Unable to set voltage point %s to %s(MHz) %s(mV)' % (point, clk, volt))
            RETCODE = 1
def setPowerPlayTableLevel(deviceList, clkType, point, clk, volt, autoRespond):
    """ Set clock frequency and voltage for a level in the PowerPlay table for a list of devices.

    RETCODE is set to 1 if an argument is not an integer, the clock type is
    unsupported or the level cannot be set on a device. The voltage is
    validated and reported in the log, but only point and clk are passed to
    the library call.

    Parameters:
    deviceList -- List of DRM devices (can be a single-item list)
    clkType -- [sclk|mclk] Which clock type to apply the range to
    point -- Point on the voltage curve to modify
    clk -- Clock speed specified for this curve point
    volt -- Voltage specified for this curve point
    autoRespond -- Response to automatically provide for all prompts
    """
    global RETCODE
    value = '%s %s %s' % (point, clk, volt)
    try:
        # Every argument is converted; any(int(...)) stopped at the first
        # non-zero value and let bad values further back slip through
        levelIndex = int(point)
        clkValue = int(clk)
        int(volt)
    except (TypeError, ValueError):
        # printLog(None, ...) is how device-less messages are printed in the
        # rest of this file
        printLog(None, 'Unable to set PowerPlay table level', None)
        logging.error('Non-integer characters are present in %s', value)
        RETCODE = 1
        return
    confirmOutOfSpecWarning(autoRespond)
    for device in deviceList:
        if clkType not in ('sclk', 'mclk'):
            printErrLog(device, 'Unable to set %s range' % (clkType))
            logging.error('Unsupported range type %s', clkType)
            RETCODE = 1
            continue
        # sclk and mclk go through the same call, only the clock id differs
        ret = rocmsmi.rsmi_dev_od_clk_info_set(device, rsmi_freq_ind_t(levelIndex), clkValue,
                                               rsmi_clk_names_dict[clkType])
        if rsmi_ret_ok(ret, device):
            printLog(device, 'Successfully set voltage point %s to %s(MHz) %s(mV)' % (point, clk, volt), None)
        else:
            printErrLog(device, 'Unable to set voltage point %s to %s(MHz) %s(mV)' % (point, clk, volt))
            RETCODE = 1
def setClockOverDrive(deviceList, clktype, value, autoRespond):
    """ Set clock speed to OverDrive for a list of devices

    Values above 20 are clamped to 20, a negative value aborts. mclk is set by
    writing the sysfs file pp_mclk_od, sclk through the library. RETCODE is
    set to 1 whenever OverDrive could not be set.

    @param deviceList: List of DRM devices (can be a single-item list)
    @param clktype: [sclk|mclk] Clock type to set
    @param value: [0-20] OverDrive percentage
    @param autoRespond: Response to automatically provide for all prompts
    """
    global RETCODE
    printLogSpacer(' Set Clock OverDrive (Range: 0% to 20%) ')
    try:
        int(value)
    except (TypeError, ValueError):
        printLog(None, 'Unable to set OverDrive level', None)
        logging.error('%s it is not an integer', value)
        RETCODE = 1
        return
    confirmOutOfSpecWarning(autoRespond)
    for device in deviceList:
        if int(value) < 0:
            printErrLog(device, 'Unable to set OverDrive')
            logging.debug('Overdrive cannot be less than 0%')
            RETCODE = 1
            return
        if int(value) > 20:
            printLog(device, 'Setting OverDrive to 20%', None)
            logging.debug('OverDrive cannot be set to a value greater than 20%')
            value = '20'
        if getPerfLevel(device) != 'MANUAL':
            ret = rocmsmi.rsmi_dev_perf_level_set(device, rsmi_dev_perf_level_t(3))
            if rsmi_ret_ok(ret, device):
                printLog(device, 'Performance level set to manual', None)
            else:
                printErrLog(device, 'Unable to set performance level to manual')
        if clktype == 'mclk':
            fsFile = os.path.join('/sys/class/drm', 'card%d' % (device), 'device', 'pp_mclk_od')
            if not os.path.isfile(fsFile):
                printLog(None, 'Unable to write to sysfs file', None)
                logging.debug('%s does not exist', fsFile)
                # OverDrive was not applied, so the run must not report success
                RETCODE = 1
                continue
            try:
                logging.debug('Writing value \'%s\' to file \'%s\'', value, fsFile)
                with open(fsFile, 'w') as fs:
                    # str() so that an integer value can be written as well
                    fs.write(str(value) + '\n')
            except (IOError, OSError):
                printLog(None, 'Unable to write to sysfs file %s' % fsFile, None)
                logging.warning('IO or OS error')
                RETCODE = 1
                continue
            printLog(device, 'Successfully set %s OverDrive to %s%%' % (clktype, value), None)
        elif clktype == 'sclk':
            ret = rocmsmi.rsmi_dev_overdrive_level_set(device, rsmi_dev_perf_level_t(int(value)))
            if rsmi_ret_ok(ret, device):
                printLog(device, 'Successfully set %s OverDrive to %s%%' % (clktype, value), None)
            else:
                printLog(device, 'Unable to set %s OverDrive to %s%%' % (clktype, value), None)
                # Same as for the other failures in this function
                RETCODE = 1
        else:
            printErrLog(device, 'Unable to set OverDrive')
            logging.error('Unsupported clock type %s', clktype)
            RETCODE = 1
def setClocks(deviceList, clktype, clk):
    """ Set clock frequency levels for a list of devices.

    The levels are combined into a bitmask (bit N set for level N). Devices
    that are not in manual performance level are switched to it first.
    RETCODE is set to 1 if an argument is invalid or a device rejects the
    request.

    @param deviceList: List of DRM devices (can be a single-item list)
    @param clktype: [validClockNames] Clock type to set
    @param clk: List of clock frequency levels to set, each from 0 to 63
    """
    global RETCODE
    if not clk:
        printLog(None, 'Invalid clock frequency', None)
        RETCODE = 1
        return
    if clktype not in validClockNames:
        printErrLog(None, 'Unable to set clock level')
        logging.error('Invalid clock type %s', clktype)
        RETCODE = 1
        return
    try:
        # int(str(...)) accepts ints and digit strings, but rejects floats
        levels = [int(str(level)) for level in clk]
    except (TypeError, ValueError):
        printLog(None, 'Unable to set clock level', None)
        # The message used to reference an undefined name and raised NameError
        logging.error('Non-integer characters are present in value %s', clk)
        RETCODE = 1
        return
    # Generate a frequency bitmask from user input value
    freq_bitmask = 0
    for bit in levels:
        # A negative level would make the shift below raise ValueError
        if bit < 0 or bit > 63:
            printErrLog(None, 'Invalid clock frequency')
            logging.error('Invalid frequency: %s', bit)
            RETCODE = 1
            return
        freq_bitmask |= (1 << bit)
    printLogSpacer(' Set %s Frequency ' % (str(clktype)))
    for device in deviceList:
        # Check if the performance level is manual, if not then set it to manual.
        # getPerfLevel returns -1 on failure, hence the str() before lower().
        if str(getPerfLevel(device)).lower() != 'manual':
            ret = rocmsmi.rsmi_dev_perf_level_set(device, rsmi_dev_perf_level_t(3))
            if rsmi_ret_ok(ret, device):
                printLog(device, 'Performance level was set to manual', None)
            else:
                printErrLog(device, 'Unable to set performance level to manual')
                RETCODE = 1
                return
        if clktype != 'pcie':
            ret = rocmsmi.rsmi_dev_gpu_clk_freq_set(device, rsmi_clk_names_dict[clktype], freq_bitmask)
            if rsmi_ret_ok(ret, device):
                printLog(device, 'Successfully set %s bitmask to' % (clktype), hex(freq_bitmask))
            else:
                printErrLog(device, 'Unable to set %s bitmask to: %s' % (clktype, hex(freq_bitmask)))
                RETCODE = 1
        else:
            ret = rocmsmi.rsmi_dev_pci_bandwidth_set(device, freq_bitmask)
            if rsmi_ret_ok(ret, device):
                printLog(device, 'Successfully set %s to level bitmask' % (clktype), hex(freq_bitmask))
            else:
                printErrLog(device, 'Unable to set %s bitmask to: %s' % (clktype, hex(freq_bitmask)))
                RETCODE = 1
    printLogSpacer()
def setPerfDeterminism(deviceList, clkvalue):
    """ Set clock frequency level for a list of devices to enable performance
    determinism.

    RETCODE is set to 1 if clkvalue is not an integer or a device rejects the
    request.

    @param deviceList: List of DRM devices (can be a single-item list)
    @param clkvalue: Clock frequency level to set
    """
    global RETCODE
    try:
        clockLevel = int(clkvalue)
    except (TypeError, ValueError):
        # No device has been selected at this point, so the error is logged
        # without one (referring to 'device' here raised a NameError)
        printErrLog(None, 'Unable to set Performance Determinism')
        logging.error('%s is not an integer', clkvalue)
        RETCODE = 1
        return
    for device in deviceList:
        ret = rocmsmi.rsmi_perf_determinism_mode_set(device, clockLevel)
        if rsmi_ret_ok(ret, device):
            printLog(device, 'Successfully enabled performance determinism and set GFX clock frequency', str(clkvalue))
        else:
            printErrLog(device, 'Unable to set performance determinism and clock frequency to %s' % (str(clkvalue)))
            RETCODE = 1
def resetGpu(device):
""" Perform a GPU reset on the specified device
@param device: DRM device identifier
"""