From 11280f43698ee854c6c9b0cf1402535b090fb4c5 Mon Sep 17 00:00:00 2001 From: chenjian Date: Mon, 28 Nov 2022 20:29:20 +0800 Subject: [PATCH] fix a bug when device info does not exist in json format (#1166) --- visualdl/component/profiler/parser/event_node.py | 14 ++++++++++---- visualdl/component/profiler/profiler_data.py | 2 ++ visualdl/component/profiler/profiler_reader.py | 8 ++++++-- visualdl/component/profiler/profiler_server.py | 4 ++++ visualdl/component/profiler/run_manager.py | 3 --- 5 files changed, 22 insertions(+), 9 deletions(-) diff --git a/visualdl/component/profiler/parser/event_node.py b/visualdl/component/profiler/parser/event_node.py index b7df58307..b3d0eebf7 100644 --- a/visualdl/component/profiler/parser/event_node.py +++ b/visualdl/component/profiler/parser/event_node.py @@ -265,10 +265,16 @@ def __init__(self, data): def parse_json(self, json_data): self.schema_version = json_data['schemaVersion'] self.span_idx = json_data['span_indx'] - self.device_infos = { - device_info['id']: device_info - for device_info in json_data['deviceProperties'] - } + try: + self.device_infos = { + device_info['id']: device_info + for device_info in json_data['deviceProperties'] + } + except Exception: + print( + "paddlepaddle-gpu version is needed to get GPU device informations." 
+ ) + self.device_infos = {} hostnodes = [] runtimenodes = [] devicenodes = [] diff --git a/visualdl/component/profiler/profiler_data.py b/visualdl/component/profiler/profiler_data.py index 100f8a245..e6ce7f75b 100644 --- a/visualdl/component/profiler/profiler_data.py +++ b/visualdl/component/profiler/profiler_data.py @@ -1767,6 +1767,8 @@ def get_distributed_info(self): data = [] for profile_data in self.profile_datas: device_infos = profile_data.device_infos + if not device_infos: + return data gpu_id = int(next(iter(profile_data.gpu_ids))) data.append({ 'worker_name': diff --git a/visualdl/component/profiler/profiler_reader.py b/visualdl/component/profiler/profiler_reader.py index 5bcbbcf7a..a97a18bd5 100644 --- a/visualdl/component/profiler/profiler_reader.py +++ b/visualdl/component/profiler/profiler_reader.py @@ -14,6 +14,7 @@ # ======================================================================= import os import re +from threading import Lock from threading import Thread import packaging.version @@ -28,6 +29,7 @@ from visualdl.io import bfile _name_pattern = re.compile(r"(.+)_time_(.+)\.paddle_trace\.((pb)|(json))") +_lock = Lock() def is_VDLProfiler_file(path): @@ -117,8 +119,10 @@ def runs(self, update=True): self.run_managers[run] = RunManager(run) self.run_managers[run].set_all_filenames(filenames) for filename in filenames: - if self.run_managers[run].has_handled(filename): - continue + with _lock: # we add this to prevent parallel requests for handling a file multiple times + if self.run_managers[run].has_handled(filename): + continue + self.run_managers[run].handled_filenames.add(filename) self._read_data(run, filename) return list(self.walks.keys()) diff --git a/visualdl/component/profiler/profiler_server.py b/visualdl/component/profiler/profiler_server.py index bb4977063..8708a00aa 100644 --- a/visualdl/component/profiler/profiler_server.py +++ b/visualdl/component/profiler/profiler_server.py @@ -194,6 +194,8 @@ def distributed_steps(self, run, 
worker, span): run_manager = self._reader.get_run_manager(run) distributed_profiler_data = run_manager.get_distributed_profiler_data( span) + if distributed_profiler_data is None: + return return distributed_profiler_data.get_distributed_steps() @result() @@ -201,6 +203,8 @@ def distributed_histogram(self, run, worker, span, step, time_unit='ms'): run_manager = self._reader.get_run_manager(run) distributed_profiler_data = run_manager.get_distributed_profiler_data( span) + if distributed_profiler_data is None: + return return distributed_profiler_data.get_distributed_histogram( step, time_unit) diff --git a/visualdl/component/profiler/run_manager.py b/visualdl/component/profiler/run_manager.py index 037ca02e5..418626abf 100644 --- a/visualdl/component/profiler/run_manager.py +++ b/visualdl/component/profiler/run_manager.py @@ -104,11 +104,8 @@ def _parse_file(self, worker_name, result): return def join(self): - if self.has_join: - return for thread in self.threads.values(): thread.join() - self.has_join = True distributed_profiler_data = defaultdict(list) for worker_name, span_data in self.profiler_data.items(): for span_idx, profiler_data in span_data.items():