forked from fasrc/prometheus-slurm-exporter
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathday_gpu_usage.py
165 lines (128 loc) · 5.9 KB
/
day_gpu_usage.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
import re
import subprocess
import sys
def extract_gres_gpu(string):
    """
    Return the integer that follows the first "gres/gpu=" in the given string.

    Typed entries such as "gres/gpu:a100=4" do not match, because the pattern
    requires "=" directly after "gres/gpu". Returns 0 when there is no match.
    """
    found = re.search(r'gres/gpu=(\d+)', string)
    return int(found.group(1)) if found else 0
def extract_gpu_factor(input_string, weights=None):
    """
    Return the weight factor for the GPU type named in a comma-separated
    attribute string.

    Only attributes that start with "gres/gpu:" are inspected. The text between
    that prefix and the first "=" is treated as the GPU type and matched
    case-insensitively against "h100" first, then "a100". If several typed
    attributes match, the last one wins. Returns 0.0 when no attribute names
    a recognised GPU type.

    Parameters:
        input_string: comma-separated attributes, e.g.
            "cpu=8,gres/gpu:nvidia_a100=1,gres/gpu=1".
        weights: optional mapping with 'h100' and 'a100' keys. When omitted,
            the module-level ``wgpu`` table is used, as before.
    """
    if weights is None:
        # Resolved at call time, so the table only has to exist by the time
        # the function is actually called.
        weights = wgpu

    # Stays 0.0 unless a known GPU type is found.
    factor = 0.0
    for attribute in input_string.split(','):
        if not attribute.startswith('gres/gpu:'):
            continue
        # Keep only the type name: drop the "=<count>" suffix and the prefix.
        gpu_info = attribute.split('=')[0].replace('gres/gpu:', '').lower()
        if 'h100' in gpu_info:
            factor = weights['h100']
        elif 'a100' in gpu_info:
            factor = weights['a100']
    return factor
def convert_to_hours(time_str):
    """
    Convert an elapsed-time string to a number of hours (float).

    Accepted forms:
        "D-HH:MM:SS"  days, hours, minutes, seconds
        "HH:MM:SS"    hours, minutes, seconds
        "MM:SS"       minutes, seconds (hours taken as 0)
    Seconds may carry a fractional part, and surrounding whitespace is ignored.

    Raises:
        ValueError: if the string does not match one of the forms above or a
            component is not numeric.
    """
    text = time_str.strip()

    # An optional "<days>-" prefix precedes the clock part.
    if '-' in text:
        days_part, clock_part = text.split('-', 1)
        days = int(days_part)
    else:
        clock_part = text
        days = 0

    pieces = clock_part.split(':')
    if len(pieces) == 3:
        hours, minutes, seconds = pieces
    elif len(pieces) == 2:
        # Short form without an hours component.
        hours = 0
        minutes, seconds = pieces
    else:
        raise ValueError(f"Unrecognized time format: {time_str!r}")

    total_hours = days * 24 + int(hours) + int(minutes) / 60 + float(seconds) / 3600
    return float(total_hours)
def update_dictionary(data_dict, name, t_time, g_time, g_tr_time):
    """
    Add the given hour totals to the entry for `name` in `data_dict`.

    A missing entry is created with the supplied values; an existing entry has
    each of its three counters incremented. The dictionary is modified in
    place and nothing is returned.
    """
    increments = {'total_hours': t_time, 'gpu_hours': g_time, 'gpu_tres_hours': g_tr_time}

    if name not in data_dict:
        # First time this name is seen: the increments are the initial totals.
        data_dict[name] = increments
        return

    entry = data_dict[name]
    for counter, amount in increments.items():
        entry[counter] += amount
def get_node_names():
    """
    Run the sinfo shell pipeline and return its output lines as a list of node names.

    Returns an empty list when the command raises CalledProcessError.
    NOTE: when the command succeeds but prints nothing, the result is ['']
    (a single empty string), because ''.split('\n') yields one element.
    """
    # sinfo output is filtered to lines containing "kempner"; awk keeps column 1.
    command = "sinfo -p kempner_requeue -N 1 | grep kempner | awk '{print $1}'"
    try:
        output = subprocess.check_output(command, shell=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"Error occurred while running the command: {e}")
        return []
    # One node name per output line.
    return output.strip().split('\n')
def check_kempner_node(n_name, n_list, p_key):
    """
    Return the partition label to account a job under.

    If the job's node name is one of the known nodes in `n_list` but the
    partition key does not contain "kempner", the label "non-kempner" is
    returned; otherwise the partition key is returned unchanged.

    A string is always returned, including when `n_list` is empty, so callers
    can safely run substring checks on the result.

    Parameters:
        n_name: node name taken from the job record.
        n_list: list of known node names.
        p_key: partition name taken from the job record.

    NOTE(review): membership is an exact string match, so a multi-node value
    would not match any single entry in `n_list` — confirm the input format.
    """
    if n_name in n_list and "kempner" not in p_key:
        return "non-kempner"
    return p_key
def print_sorted_dictionary(data_dict):
    """
    Print one line per entry of `data_dict`, ordered by 'gpu_tres_hours'
    from largest to smallest.

    Each value must be a mapping with numeric 'gpu_hours' and 'gpu_tres_hours'
    entries. Entries with equal 'gpu_tres_hours' keep their original order.
    """
    ranked = sorted(data_dict.items(), key=lambda x: x[1]['gpu_tres_hours'], reverse=True)
    print("\nDictionary (sorted by gpu_tres_hours):")
    for k, v in ranked:
        print("name= {} , gpu_hours= {:0.1f}, gpu_tres_hours= {:0.1f}".format(k, v['gpu_hours'], v['gpu_tres_hours']))
def write_dict_to_file(data_dict, file_name):
    """
    Write one text line per entry of `data_dict` to `file_name`, ordered by
    'gpu_tres_hours' from largest to smallest. The file is overwritten.
    """
    with open(file_name, 'w') as out:
        ranked = sorted(data_dict.items(), key=lambda item: item[1]['gpu_tres_hours'], reverse=True)
        out.writelines(
            "name= {} , gpu_hours= {:0.1f}, gpu_tres_hours= {:0.1f} \n".format(
                name, totals['gpu_hours'], totals['gpu_tres_hours']
            )
            for name, totals in ranked
        )
# Get the input file name from the argument. Exit with a usage message rather
# than an IndexError traceback when it is missing.
if len(sys.argv) < 2:
    sys.exit(f"Usage: {sys.argv[0]} <input_file>")
input_file_name = sys.argv[1]

# Accumulated totals, keyed by partition, user and group name respectively.
partition_dict = {}
user_dict = {}
group_dict = {}

# Get the list of kempner nodes
node_list = get_node_names()

# Weight applied per GPU type when scaling GPU hours into 'gpu_tres_hours'.
wgpu = {'a100': 209.1, 'h100': 546.9}

# Open the file and process it line by line.
# NOTE(review): the input is presumably '|'-separated job accounting output;
# the field meanings below are taken from the variable names — confirm.
with open(input_file_name, 'r') as file:
    for line in file:
        # Keep only lines that mention a GPU and are neither running nor pending.
        if "gpu" not in line or "RUNNING" in line or "PENDING" in line:
            continue

        fields = line.strip().split('|')
        # Only fields 2..7 are read below, so eight fields are enough.
        if len(fields) < 8:
            continue

        user_key = fields[2]
        group_key = fields[3].split(',')[0]
        partition_key = fields[4].split(',')[0]
        gpu_tfield = fields[5]
        gpu_count = extract_gres_gpu(fields[6])
        node_name = fields[7]

        # Skip records that did not allocate any GPU.
        if gpu_count <= 0:
            continue

        gpu_thours = convert_to_hours(gpu_tfield)
        tres_factor = extract_gpu_factor(fields[6])
        # Skip records whose GPU type has no weight (neither a100 nor h100).
        if tres_factor <= 0:
            continue

        gpu_hours = gpu_thours * gpu_count
        gpu_tres_hours = gpu_hours * tres_factor

        update_dictionary(user_dict, user_key, gpu_thours, gpu_hours, gpu_tres_hours)

        if "kempner" in group_key:
            update_dictionary(group_dict, group_key, gpu_thours, gpu_hours, gpu_tres_hours)

        # Jobs on a known node but outside a kempner partition are relabelled
        # "non-kempner", which still contains "kempner" and so is counted below.
        partition_name = check_kempner_node(node_name, node_list, partition_key)
        if "kempner" in partition_name:
            update_dictionary(partition_dict, partition_name, gpu_thours, gpu_hours, gpu_tres_hours)

write_dict_to_file(user_dict, "user_dictionary.csv")
write_dict_to_file(group_dict, "group_dictionary.csv")
write_dict_to_file(partition_dict, "partition_dictionary.csv")