Skip to content

Commit

Permalink
Merge pull request #35 from CentML/feb09-energy-permissions
Browse files Browse the repository at this point in the history
Last minute fixes
  • Loading branch information
jimgao1 authored Feb 10, 2023
2 parents 79643fd + 061a1ee commit d0e8aab
Show file tree
Hide file tree
Showing 4 changed files with 42 additions and 12 deletions.
6 changes: 6 additions & 0 deletions skyline/analysis/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,26 @@
import logging
import os

import torch
from skyline.analysis.session import AnalysisSession
from skyline.nvml import NVML


def analyze_project(project_root, entry_point, nvml):
    """Run the analysis stages for a project, yielding one result per stage.

    An ``AnalysisSession`` is created from ``project_root`` and
    ``entry_point``; the generator then yields, in order, the results of
    ``measure_breakdown(nvml)``, ``measure_throughput()``,
    ``habitat_predict()`` and ``energy_compute()``.

    ``torch.cuda.empty_cache()`` is called once before the session is
    created and again each time the generator is resumed after a stage.
    """
    # (session method, positional args, message printed before the stage)
    stages = (
        ("measure_breakdown", (nvml,), None),
        ("measure_throughput", (), None),
        ("habitat_predict", (), "analyze_project: running habitat_predict()"),
        ("energy_compute", (), "analyze_project: running energy_compute()"),
    )

    torch.cuda.empty_cache()
    session = AnalysisSession.new_from(project_root, entry_point)

    for method_name, args, banner in stages:
        if banner is not None:
            print(banner)
        yield getattr(session, method_name)(*args)
        # Runs only once the consumer asks for the next stage.
        torch.cuda.empty_cache()


def main():
Expand Down
26 changes: 20 additions & 6 deletions skyline/analysis/session.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,15 +153,29 @@ def energy_compute(self) -> pm.EnergyResponse:

resp.total_consumption = energy_measurer.total_energy()/float(iterations)

cpu_component = pm.EnergyConsumptionComponent()
cpu_component.component_type = pm.ENERGY_CPU_DRAM
cpu_component.consumption_joules = energy_measurer.cpu_energy()/float(iterations)
components = []
components_joules = []

if energy_measurer.cpu_energy() is not None:
cpu_component = pm.EnergyConsumptionComponent()
cpu_component.component_type = pm.ENERGY_CPU_DRAM
cpu_component.consumption_joules = energy_measurer.cpu_energy()/float(iterations)
components.append(cpu_component)
components_joules.append(0)
else:
cpu_component = pm.EnergyConsumptionComponent()
cpu_component.component_type = pm.ENERGY_CPU_DRAM
cpu_component.consumption_joules = 0
components.append(cpu_component)
components_joules.append(0)

gpu_component = pm.EnergyConsumptionComponent()
gpu_component.component_type = pm.ENERGY_NVIDIA
gpu_component.consumption_joules = energy_measurer.gpu_energy()/float(iterations)
components.append(gpu_component)
components_joules.append(gpu_component.consumption_joules)

resp.components.extend([cpu_component, gpu_component])
resp.components.extend(components)

# get last 10 runs if they exist
path_to_entry_point = os.path.join(self._project_root, self._entry_point)
Expand All @@ -170,7 +184,7 @@ def energy_compute(self) -> pm.EnergyResponse:
resp.past_measurements.extend(_convert_to_energy_responses(past_runs))

# add current run to database
self._energy_table_interface.add_entry([path_to_entry_point, cpu_component.consumption_joules, gpu_component.consumption_joules])
self._energy_table_interface.add_entry([path_to_entry_point] + components_joules)

except PermissionError as err:
# Remind user to set their CPU permissions
Expand Down Expand Up @@ -327,7 +341,7 @@ def measure_throughput(self):

# 1. Measure the throughput at several spots to be able to build a
# prediction model
num_samples = 6
num_samples = 3
samples = self._profiler.sample_run_time_ms_by_batch_size(
start_batch_size=self._batch_size,
memory_usage_percentage=self._memory_usage_percentage,
Expand Down
20 changes: 15 additions & 5 deletions skyline/energy/measurer.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,19 @@ def __init__(self, interval):
self.last_dram = None

def measurer_init(self):
    """Create the CPU energy sensor and record the baseline reading.

    On success ``self.sensor`` holds the sensor, ``self.last_cpu`` holds the
    even-indexed entries of the first ``energy()`` reading and
    ``self.last_dram`` the odd-indexed ones.

    On any failure a warning is printed and ``self.sensor`` is left as
    ``None``; ``self.last_cpu`` / ``self.last_dram`` are not modified.
    """
    # Start from "no sensor" and only publish the sensor once a reading has
    # actually succeeded, so a sensor that can be constructed but not read
    # is never left behind in a half-initialised state.
    self.sensor = None
    try:
        sensor = Sensor()
        energy = sensor.energy()
        baseline_cpu = np.array(energy[0::2])
        baseline_dram = np.array(energy[1::2])
    except Exception:
        # Deliberate best effort: CPU energy is optional, so report and
        # carry on without it rather than aborting the whole measurement.
        print("Warning. Failed to get CPU energy")
        return

    self.sensor = sensor
    self.last_cpu = baseline_cpu
    self.last_dram = baseline_dram

def measurer_measure(self):
# Get energy consumed so far (since last CPU reset)
if self.sensor is None: return

energy = self.sensor.energy()
cpu = np.array(energy[0::2])
dram = np.array(energy[1::2])
Expand All @@ -42,6 +48,8 @@ def measurer_deallocate(self):
pass

def total_energy(self):
    """Return the energy derived from the recorded power samples.

    Returns ``None`` when ``self.power`` is empty; otherwise
    ``self.interval * sum(self.power) / 1000.0``.
    """
    samples = self.power
    if len(samples) == 0:
        return None

    # J = W * s, 1W = 1000 mW
    # (presumably samples are in mW and interval in seconds -- confirm)
    return self.interval * sum(samples) / 1000.0
Expand Down Expand Up @@ -107,7 +115,9 @@ def end_measurement(self):
def total_energy(self):
    """Return the summed energy reported by every registered measurer.

    Each measurer's ``total_energy()`` is called exactly once. A measurer
    that returns ``None`` (no data available) is skipped instead of being
    added, so one unavailable source does not break the total.

    Returns:
        The sum as a float; ``0.0`` when there are no measurers or none of
        them reported a value.
    """
    total = 0.
    for measurer in self.measurers.values():
        reading = measurer.total_energy()
        if reading is not None:
            total += reading
    return total

def cpu_energy(self):
Expand Down
2 changes: 1 addition & 1 deletion skyline/profiler/iteration.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,7 @@ def _select_batch_size(self, lower, upper, is_increasing):

# increase the growth amount for batch sizes to better account for small
# models whose initial batch size is much smaller than the maximum possible.
tiers = [1000, 200, 100, 20, 10, 5]
tiers = [100, 20, 10, 5]
for t in tiers:
if diff >= t: return base + mult * t

Expand Down

0 comments on commit d0e8aab

Please sign in to comment.