diff --git a/README.md b/README.md
index 093ff436..0450ab2d 100644
--- a/README.md
+++ b/README.md
@@ -95,6 +95,14 @@ R. Souza, T. Skluzacek, S. Wilkinson, M. Ziatdinov, and R. da Silva
 ```
 
+## Disclaimer & Get in Touch
+
+Please note that this is research software. We encourage you to give it a try and use it with your own stack. We
+are continuously working on improving the documentation and adding more examples and notebooks, but we are still
+far from having documentation that covers the whole system. If you are interested in using FlowCept in your own
+scientific project, we can give you a jump start if you reach out to us. Feel free to [create an issue](https://github.com/ORNL/flowcept/issues/new),
+[create a new discussion thread](https://github.com/ORNL/flowcept/discussions/new/choose), or drop us an email (we trust you'll find a way to reach out to us :wink: ).
+
 ## Acknowledgement
 
 This research uses resources of the Oak Ridge Leadership Computing Facility
diff --git a/extra_requirements/amd-requirements.txt b/extra_requirements/amd-requirements.txt
new file mode 100644
index 00000000..db5916cb
--- /dev/null
+++ b/extra_requirements/amd-requirements.txt
@@ -0,0 +1 @@
+pyamdgpuinfo==2.1.6
diff --git a/extra_requirements/analytics-requirements.txt b/extra_requirements/analytics-requirements.txt
index be6a0312..5f323650 100644
--- a/extra_requirements/analytics-requirements.txt
+++ b/extra_requirements/analytics-requirements.txt
@@ -1,4 +1,3 @@
 seaborn==0.13.2
-h2o==3.44.0.3
 plotly==5.18.0
 scipy==1.10.1
diff --git a/extra_requirements/data_augmentation-requirements.txt b/extra_requirements/data_augmentation-requirements.txt
new file mode 100644
index 00000000..e8d25e80
--- /dev/null
+++ b/extra_requirements/data_augmentation-requirements.txt
@@ -0,0 +1 @@
+h2o==3.44.0.3
diff --git a/flowcept/__init__.py b/flowcept/__init__.py
index 572616ef..8b684319 100644
--- a/flowcept/__init__.py
+++ b/flowcept/__init__.py
@@ -14,9 +14,8 @@
         model_explainer,
         model_profiler,
     )
-except Exception as _exp:
-    flowcept.commons.logger.exception(_exp)
-
+except:
+    pass
 
 if Vocabulary.Settings.ZAMBEZE_KIND in flowcept.configs.ADAPTERS:
     try:
@@ -50,7 +49,7 @@
             MLFlowInterceptor,
         )
     except Exception as _exp:
-        flowcept.commons.loggerr.error(
+        flowcept.commons.logger.error(
            flowcept.commons._get_adapter_exception_msg(
                Vocabulary.Settings.MLFLOW_KIND
            )
diff --git a/flowcept/commons/flowcept_dataclasses/telemetry.py b/flowcept/commons/flowcept_dataclasses/telemetry.py
index 6ce84ecf..39dc5b8a 100644
--- a/flowcept/commons/flowcept_dataclasses/telemetry.py
+++ b/flowcept/commons/flowcept_dataclasses/telemetry.py
@@ -53,26 +53,26 @@ class Process:
         executable: str
         cmd_line: List[str]
 
-    @dataclass(init=False)
-    class GPU:
-        @dataclass
-        class GPUMetrics:
-            total: int
-            free: int
-            used: int
-            usage_percent: float
-            temperature: float
-            power_usage: float
-
-        gpu_sums: GPUMetrics
-        per_gpu: Dict[int, GPUMetrics] = None
+    # @dataclass(init=False)
+    # class GPU:
+    #     @dataclass
+    #     class GPUMetrics:
+    #         total: int
+    #         free: int
+    #         used: int
+    #         usage_percent: float
+    #         temperature: float
+    #         power_usage: float
+    #
+    #     gpu_sums: GPUMetrics
+    #     per_gpu: Dict[int, GPUMetrics] = None
 
     cpu: CPU = None
     process: Process = None
     memory: Memory = None
     disk: Disk = None
     network: Network = None
-    gpu: GPU = None
+    gpu: Dict = None  # TODO: use dataclasses
 
     def to_dict(self):
         ret = {}
@@ -87,6 +87,7 @@ def to_dict(self):
         if self.network is not None:
             ret["network"] = self.network.__dict__
         if self.gpu is not None:
-            ret["gpu"] = asdict(self.gpu, dict_factory=remove_none_values)
+            # ret["gpu"] = asdict(self.gpu, dict_factory=remove_none_values)
+            ret["gpu"] = self.gpu
         return ret
 
diff --git a/flowcept/commons/utils.py b/flowcept/commons/utils.py
index b9a34a6a..69669b72 100644
--- a/flowcept/commons/utils.py
+++ b/flowcept/commons/utils.py
@@ -83,8 +83,9 @@ def default(self, obj):
 def _get_adapter_exception_msg(adapter_kind):
     return (
         f"You have an adapter for {adapter_kind} in"
-        f" {SETTINGS_PATH} but we couldn't import its interceptor. "
-        f" Consider fixing the following exception or remove that adapter "
+        f" {SETTINGS_PATH} but we couldn't import its interceptor."
+        f" Consider fixing the following exception (e.g., try installing the"
+        f" adapter requirements -- see the README file) or remove that adapter"
         f" from the settings."
         f" Exception:"
     )
diff --git a/flowcept/configs.py b/flowcept/configs.py
index 36c14f52..66ed7c12 100644
--- a/flowcept/configs.py
+++ b/flowcept/configs.py
@@ -107,6 +107,26 @@
 TELEMETRY_CAPTURE = settings["project"].get("telemetry_capture", None)
 
+
+##################################
+# GPU TELEMETRY CAPTURE SETTINGS #
+##################################
+
+N_GPUS = dict()
+if TELEMETRY_CAPTURE.get("gpu", False):
+    try:
+        from pynvml import nvmlDeviceGetCount
+
+        N_GPUS["nvidia"] = nvmlDeviceGetCount()
+    except:
+        pass
+    try:
+        import pyamdgpuinfo
+
+        N_GPUS["amd"] = pyamdgpuinfo.detect_gpus()
+    except:
+        pass
+
 ######################
 # SYS METADATA #
 ######################
diff --git a/flowcept/flowceptor/telemetry_capture.py b/flowcept/flowceptor/telemetry_capture.py
index 23b912da..a8a98634 100644
--- a/flowcept/flowceptor/telemetry_capture.py
+++ b/flowcept/flowceptor/telemetry_capture.py
@@ -1,18 +1,26 @@
 import psutil
-import pynvml
-from pynvml import (
-    nvmlDeviceGetCount,
-    nvmlDeviceGetHandleByIndex,
-    nvmlDeviceGetMemoryInfo,
-    nvmlInit,
-    nvmlShutdown,
-    nvmlDeviceGetTemperature,
-    nvmlDeviceGetPowerUsage,
-    NVML_TEMPERATURE_GPU,
-)
+try:
+    import pynvml
+    from pynvml import (
+        nvmlDeviceGetCount,
+        nvmlDeviceGetHandleByIndex,
+        nvmlDeviceGetMemoryInfo,
+        nvmlDeviceGetName,
+        nvmlInit,
+        nvmlShutdown,
+        nvmlDeviceGetTemperature,
+        nvmlDeviceGetPowerUsage,
+        NVML_TEMPERATURE_GPU,
+    )
+except:
+    pass
+try:
+    import pyamdgpuinfo
+except:
+    pass
 
 from flowcept.commons.flowcept_logger import FlowceptLogger
-from flowcept.configs import TELEMETRY_CAPTURE
+from flowcept.configs import TELEMETRY_CAPTURE, N_GPUS
 from flowcept.commons.flowcept_dataclasses.telemetry import Telemetry
 
 
@@ -26,19 +34,29 @@ def capture(self) -> Telemetry:
             return None
 
         tel = Telemetry()
-        tel.process = self._capture_process_info()
-        tel.cpu = self._capture_cpu()
-        tel.memory = self._capture_memory()
-        tel.network = self._capture_network()
-        tel.disk = self._capture_disk()
-        tel.gpu = self._capture_gpu()
+        if self.conf.get("process_info", False):
+            tel.process = self._capture_process_info()
+
+        capt_cpu = self.conf.get("cpu", False)
+        capt_per_cpu = self.conf.get("per_cpu", False)
+        if capt_cpu or capt_per_cpu:
+            tel.cpu = self._capture_cpu(capt_cpu, capt_per_cpu)
+
+        if self.conf.get("mem", False):
+            tel.memory = self._capture_memory()
+
+        if self.conf.get("network", False):
+            tel.network = self._capture_network()
+
+        if self.conf.get("disk", False):
+            tel.disk = self._capture_disk()
+
+        if self.conf.get("gpu", False):
+            tel.gpu = self._capture_gpu()
 
         return tel
 
     def _capture_disk(self):
-        capt = self.conf.get("disk", False)
-        if not capt:
-            return None
         try:
             disk = Telemetry.Disk()
             disk.disk_usage = psutil.disk_usage("/")._asdict()
@@ -54,9 +72,6 @@ def _capture_disk(self):
             self.logger.exception(e)
 
     def _capture_network(self):
-        capt = self.conf.get("network", False)
-        if not capt:
-            return None
         try:
             net = Telemetry.Network()
             net.netio_sum = psutil.net_io_counters(pernic=False)._asdict()
@@ -70,9 +85,6 @@ def _capture_network(self):
             self.logger.exception(e)
 
     def _capture_memory(self):
-        capt = self.conf.get("mem", False)
-        if not capt:
-            return None
         try:
             mem = Telemetry.Memory()
             mem.virtual = psutil.virtual_memory()._asdict()
@@ -82,9 +94,6 @@ def _capture_memory(self):
             self.logger.exception(e)
 
     def _capture_process_info(self):
-        capt = self.conf.get("process_info", False)
-        if not capt:
-            return None
         try:
             p = Telemetry.Process()
             psutil_p = psutil.Process()
@@ -113,11 +122,7 @@ def _capture_process_info(self):
         except Exception as e:
             self.logger.exception(e)
 
-    def _capture_cpu(self):
-        capt_cpu = self.conf.get("cpu", False)
-        capt_per_cpu = self.conf.get("per_cpu", False)
-        if not (capt_cpu or capt_per_cpu):
-            return None
+    def _capture_cpu(self, capt_cpu, capt_per_cpu):
         try:
             cpu = Telemetry.CPU()
             if capt_cpu:
@@ -133,58 +138,127 @@ def _capture_cpu(self):
             self.logger.exception(e)
             return None
 
-    def _capture_gpu(self):
-        capt = self.conf.get("gpu", False)
-        if not capt:
-            return None
+    def __get_gpu_info_nvidia(self, gpu_ix: int = 0):
+        try:
+            handle = nvmlDeviceGetHandleByIndex(gpu_ix)
+            nvidia_info = nvmlDeviceGetMemoryInfo(handle)
+        except Exception as e:
+            self.logger.exception(e)
+            return {}
+
+        flowcept_gpu_info = {
+            "total": nvidia_info.total,
+            "used": nvidia_info.used,
+            "temperature": nvmlDeviceGetTemperature(
+                handle, NVML_TEMPERATURE_GPU
+            ),
+            "power_usage": nvmlDeviceGetPowerUsage(handle),
+            "name": nvmlDeviceGetName(handle),
+        }
+
+        return flowcept_gpu_info
+
+    def __get_gpu_info_amd(self, gpu_ix: int = 0):
+        flowcept_gpu_info = {}
+        try:
+            amd_info = pyamdgpuinfo.get_gpu(gpu_ix)
+        except Exception as e:
+            self.logger.exception(e)
+            return flowcept_gpu_info
+
+        memory_info = amd_info.memory_info.copy()
+        try:
+            flowcept_gpu_info["total"] = memory_info.pop("vram_size")
+        except Exception as e:
+            self.logger.exception(e)
+
+        try:
+            flowcept_gpu_info["temperature"] = amd_info.query_temperature()
+        except Exception as e:
+            self.logger.exception(e)
+
+        try:
+            flowcept_gpu_info["power_usage"] = amd_info.query_power()
+        except Exception as e:
+            self.logger.exception(e)
 
         try:
-            deviceCount = nvmlDeviceGetCount()
-            handle = nvmlDeviceGetHandleByIndex(0)
-            info = nvmlDeviceGetMemoryInfo(handle)
-            _this_gpu = {
-                "total": info.total,
-                "free": info.free,
-                "used": info.used,
-                "usage_percent": info.used / info.total * 100,
-                "temperature": nvmlDeviceGetTemperature(
-                    handle, NVML_TEMPERATURE_GPU
-                ),
-                "power_usage": nvmlDeviceGetPowerUsage(handle),
-            }
-            gpu = Telemetry.GPU()
-            if deviceCount == 1:
-                gpu.gpu_sums = gpu.GPUMetrics(**_this_gpu)
+            flowcept_gpu_info["used"] = amd_info.query_vram_usage()
+        except Exception as e:
+            self.logger.exception(e)
+
+        try:
+            max_clocks = amd_info.query_max_clocks()
+            flowcept_gpu_info["max_shader_clock"] = max_clocks["sclk_max"]
+            flowcept_gpu_info["max_memory_clock"] = max_clocks["mclk_max"]
+        except Exception as e:
+            self.logger.exception(e)
+
+        try:
+            flowcept_gpu_info["shader_clock"] = amd_info.query_sclk()
+        except Exception as e:
+            self.logger.exception(e)
+
+        try:
+            flowcept_gpu_info["memory_clock"] = amd_info.query_mclk()
+        except Exception as e:
+            self.logger.exception(e)
+
+        try:
+            flowcept_gpu_info["gtt_usage"] = amd_info.query_gtt_usage()
+        except Exception as e:
+            self.logger.exception(e)
+
+        try:
+            flowcept_gpu_info["load"] = amd_info.query_load()
+        except Exception as e:
+            self.logger.exception(e)
+
+        try:
+            flowcept_gpu_info[
+                "graphics_voltage"
+            ] = amd_info.query_graphics_voltage()
+        except Exception as e:
+            self.logger.exception(e)
+
+        flowcept_gpu_info.update(memory_info)
+
+        try:
+            name = amd_info.name
+            if name is not None:
+                flowcept_gpu_info["name"] = name
+        except Exception as e:
+            self.logger.exception(e)
+
+        return flowcept_gpu_info
+
+    def _capture_gpu(self):
+        try:
+            if len(N_GPUS) == 0:
+                self.logger.exception(
+                    "You are trying to capture GPU telemetry, but we"
+                    " couldn't detect any GPU (neither NVIDIA nor AMD)."
+                    " Please set GPU telemetry capture to false."
+                )
+
+            n_nvidia_gpus = N_GPUS.get("nvidia", 0)
+            n_amd_gpus = N_GPUS.get("amd", 0)
+
+            if n_nvidia_gpus > 0:
+                n_gpus = n_nvidia_gpus
+                gpu_capture_func = self.__get_gpu_info_nvidia
+            elif n_amd_gpus > 0:
+                n_gpus = n_amd_gpus
+                gpu_capture_func = self.__get_gpu_info_amd
             else:
-                gpu.per_gpu = {0: gpu.GPUMetrics(**_this_gpu)}
-                sums = _this_gpu.copy()
-                for i in range(1, deviceCount):
-                    handle = nvmlDeviceGetHandleByIndex(i)
-                    info = nvmlDeviceGetMemoryInfo(handle)
-                    _temp = nvmlDeviceGetTemperature(
-                        handle, pynvml.NVML_TEMPERATURE_GPU
-                    )
-                    _pow = nvmlDeviceGetPowerUsage(handle)
-
-                    sums["total"] += info.total
-                    sums["free"] += info.free
-                    sums["used"] += info.used
-                    sums["temperature"] += _temp
-                    sums["power_usage"] += _pow
-
-                    gpu.per_gpu[i] = gpu.GPUMetrics(
-                        total=info.total,
-                        free=info.free,
-                        used=info.used,
-                        usage_percent=info.used / info.total * 100,
-                        temperature=_temp,
-                        power_usage=_pow,
-                    )
-
-            sums["usage_percent"] = sums["used"] / sums["total"] * 100
-            gpu.gpu_sums = gpu.GPUMetrics(**sums)
-
-            return gpu
+                self.logger.exception("This should never happen.")
+                return None
+
+            gpu_telemetry = {}
+            for i in range(0, n_gpus):
+                gpu_telemetry[i] = gpu_capture_func(i)
+
+            return gpu_telemetry
         except Exception as e:
             self.logger.exception(e)
             return None
@@ -192,8 +266,8 @@ def _capture_gpu(self):
     def init_gpu_telemetry(self):
         if self.conf is None:
             return None
-
-        if self.conf.get("gpu", False):
+        # These methods are only needed for NVIDIA GPUs
+        if N_GPUS.get("nvidia", 0) > 0:
             try:
                 nvmlInit()
             except Exception as e:
@@ -203,8 +277,8 @@
     def shutdown_gpu_telemetry(self):
         if self.conf is None:
             return None
-
-        if self.conf.get("gpu", False):
+        # These methods are only needed for NVIDIA GPUs
+        if N_GPUS.get("nvidia", 0) > 0:
             try:
                 nvmlShutdown()
             except Exception as e:
diff --git a/setup.py b/setup.py
index f728430f..35cbb436 100644
--- a/setup.py
+++ b/setup.py
@@ -54,16 +54,21 @@ def create_settings_file():
     "tensorboard",
     "dask",
     "nvidia",
+    "amd",
     "analytics",
     "responsible_ai",
+    "data_augmentation",
 ]
 
+skip_full = {"amd", "nvidia"}
+
 extras_require = dict()
 for req in extras_requirement_keys:
     req_path = f"extra_requirements/{req}-requirements.txt"
     _requirements = get_requirements(req_path)
     extras_require[req] = _requirements
-    full_requirements.extend(_requirements)
+    if req not in skip_full:
+        full_requirements.extend(_requirements)
 
 extras_require["full"] = full_requirements
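Note on the new GPU telemetry flow: GPU vendors are now detected once at import time (the new `N_GPUS` block in `flowcept/configs.py`), and `_capture_gpu()` returns a plain dict keyed by GPU index instead of the removed `Telemetry.GPU` dataclass, which is why `Telemetry.to_dict()` can store it directly. The sketch below is illustrative only and is not part of the patch (the helper name `capture_nvidia_gpu` is made up); it mirrors the detection logic and the per-GPU dict shape for the NVIDIA path, and the AMD path via `pyamdgpuinfo` is analogous.

```python
# Illustrative sketch of the new behavior (not part of the patch).
N_GPUS = {}

try:
    from pynvml import (
        nvmlInit,
        nvmlShutdown,
        nvmlDeviceGetCount,
        nvmlDeviceGetHandleByIndex,
        nvmlDeviceGetMemoryInfo,
        nvmlDeviceGetTemperature,
        nvmlDeviceGetPowerUsage,
        nvmlDeviceGetName,
        NVML_TEMPERATURE_GPU,
    )

    # The patch calls nvmlInit() in init_gpu_telemetry(); done here so the
    # sketch is standalone.
    nvmlInit()
    N_GPUS["nvidia"] = nvmlDeviceGetCount()
except Exception:
    pass

try:
    import pyamdgpuinfo

    N_GPUS["amd"] = pyamdgpuinfo.detect_gpus()
except Exception:
    pass


def capture_nvidia_gpu(gpu_ix: int = 0) -> dict:
    """Collect the same per-device fields the patch gathers for NVIDIA GPUs."""
    handle = nvmlDeviceGetHandleByIndex(gpu_ix)
    mem = nvmlDeviceGetMemoryInfo(handle)
    return {
        "total": mem.total,
        "used": mem.used,
        "temperature": nvmlDeviceGetTemperature(handle, NVML_TEMPERATURE_GPU),
        "power_usage": nvmlDeviceGetPowerUsage(handle),
        "name": nvmlDeviceGetName(handle),
    }


if N_GPUS.get("nvidia", 0) > 0:
    # One entry per GPU index, e.g. {0: {"total": ..., "used": ..., "name": ...}}
    print({i: capture_nvidia_gpu(i) for i in range(N_GPUS["nvidia"])})
    nvmlShutdown()
```

Because `amd` and `nvidia` are excluded from the `full` extra in `setup.py`, the matching GPU library (`pynvml` or `pyamdgpuinfo`) has to be installed explicitly (e.g., via the `nvidia` or `amd` extras) for either branch above to populate `N_GPUS`.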