
Commit

Update trace_linker to use external_id for finding GPU op's parent CPU op
JoongunPark committed Nov 16, 2024
1 parent 396bc6e commit 744108d
Showing 1 changed file with 18 additions and 16 deletions.
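For context, the diff below switches the key used to map a GPU operator to its parent CPU launcher from the launcher's ev_idx to its external_id, both when grouping GPU ops (group_gpu_ops_by_cpu_launchers) and when looking them up (link_ops). The following is a minimal, self-contained sketch of that grouping pattern; SimpleOp, group_by_external_id, and the sample data are hypothetical stand-ins for the project's KinetoOperator handling, not its actual classes.

# Minimal sketch (illustrative only) of grouping GPU ops by the external_id
# of their parent CPU launcher, as this commit adopts.
from dataclasses import dataclass
from typing import Dict, List, Optional

@dataclass
class SimpleOp:
    name: str
    external_id: Optional[int] = None

def group_by_external_id(
    gpu_ops: List[SimpleOp], parent_of: Dict[str, SimpleOp]
) -> Dict[int, List[SimpleOp]]:
    grouped: Dict[int, List[SimpleOp]] = {}
    for gpu_op in gpu_ops:
        parent = parent_of.get(gpu_op.name)
        if parent is None or parent.external_id is None:
            # Mirrors the diff's behavior: skip GPU ops whose launcher cannot be resolved.
            continue
        grouped.setdefault(parent.external_id, []).append(gpu_op)
    return grouped

launcher = SimpleOp("cudaLaunchKernel", external_id=42)
kernels = [SimpleOp("gemm_kernel"), SimpleOp("relu_kernel")]
print(group_by_external_id(kernels, {k.name: launcher for k in kernels}))
# -> {42: [SimpleOp(name='gemm_kernel', ...), SimpleOp(name='relu_kernel', ...)]}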
34 changes: 18 additions & 16 deletions src/trace_link/trace_linker.py
@@ -123,7 +123,6 @@ def load_sync_dependencies(
)
if not success:
logging.error("Failed to load Critical Path Graph")
return sync_dependencies

raw_events = trace_analysis.t.get_raw_trace_for_one_rank(rank=rank)["traceEvents"]
for edge in cp_graph.critical_path_edges_set:
@@ -541,7 +540,7 @@ def map_host_to_device_ops(
]:
"""Map Chakra host operators to corresponding device operators."""
logging.debug("Mapping Charka host operators to corresponding device operators.")
cpu_ev_idx_to_gpu_ops_map = self.group_gpu_ops_by_cpu_launchers(
cpu_external_id_to_gpu_ops_map = self.group_gpu_ops_by_cpu_launchers(
kineto_gpu_ops, kineto_correlation_cuda_runtime_map, sorted_kineto_cpu_ops, sorted_kineto_cpu_op_ts
)

@@ -569,7 +568,7 @@ def map_host_to_device_ops(
) = self.link_ops(
host_op,
kineto_op,
cpu_ev_idx_to_gpu_ops_map,
cpu_external_id_to_gpu_ops_map,
kineto_rf_id_to_device_op_map,
kineto_external_id_to_kineto_op_map,
)
@@ -593,7 +592,7 @@ def group_gpu_ops_by_cpu_launchers(
"""
Group GPU operators based on their corresponding CPU launchers.
This is determined by the 'ev_idx' which links GPU operators to their initiating CPU launcher events.
This is determined by the 'external_id' which links GPU operators to their initiating CPU launcher events.
Args:
kineto_gpu_ops (List[KinetoOperator]): List of Kineto GPU operators.
@@ -607,9 +606,9 @@
Dict[int, List[KinetoOperator]]: Mapping from CPU launch event indices to GPU operators.
Raises:
ValueError: If 'ev_idx' is missing for any GPU operator.
ValueError: If 'external_id' is missing for any GPU operator.
"""
cpu_ev_idx_to_gpu_ops_map = {}
cpu_external_id_to_gpu_ops_map = {}
for gpu_op in kineto_gpu_ops:
parent_cpu_op = self.find_parent_cpu_op(
gpu_op, kineto_correlation_cuda_runtime_map, sorted_kineto_cpu_ops, sorted_kineto_cpu_op_ts
@@ -619,19 +618,19 @@
logging.warning(warning_msg)
continue

if parent_cpu_op.ev_idx == "":
if parent_cpu_op.external_id == "":
error_msg = (
f"Missing 'ev_idx' for CPU operator {parent_cpu_op.name}. "
f"Missing 'external_id' for CPU operator {parent_cpu_op.name}. "
f"Cannot link GPU op {gpu_op.name} to {parent_cpu_op.name}."
)
logging.warning(error_msg)
continue

logging.debug(f"group_gpu_ops_by_cpu_launchers '{parent_cpu_op.name}' -> '{gpu_op.name}'")

cpu_ev_idx_to_gpu_ops_map.setdefault(parent_cpu_op.ev_idx, []).append(gpu_op)
cpu_external_id_to_gpu_ops_map.setdefault(parent_cpu_op.external_id, []).append(gpu_op)

return cpu_ev_idx_to_gpu_ops_map
return cpu_external_id_to_gpu_ops_map

def find_parent_cpu_op(
self,
@@ -735,26 +734,29 @@ def find_closest_op(

# After skipping 'nccl:coalesced', verify that the closest operation is on the same thread
# as the GPU operation
if closest_op.tid == kineto_gpu_op.tid:
if closest_op.tid == kineto_gpu_op.tid and closest_op.external_id == kineto_gpu_op.external_id:
return closest_op

# If the tids do not match, search forward to find the closest matching tid
potential_op = None
for i in range(index - 1, -1, -1):
op = sorted_kineto_cpu_ops[i]
if op.tid == kineto_gpu_op.tid:
if "nccl" in kineto_gpu_op.name.lower() and op.name == "nccl:coalesced":
continue # Skip 'nccl:coalesced' if it's an NCCL-related GPU operation
if op.timestamp <= ts:
if op.external_id == kineto_gpu_op.external_id:
return op
if op.timestamp <= ts and potential_op is None:
potential_op = op

# If no matching tid is found going forward, return None
return None
return closest_op

def link_ops(
self,
host_op: PyTorchOperator,
kineto_op: KinetoOperator,
cpu_ev_idx_to_gpu_ops_map: Dict[int, List[KinetoOperator]],
cpu_external_id_to_gpu_ops_map: Dict[int, List[KinetoOperator]],
kineto_rf_id_to_device_op_map: Dict[int, KinetoOperator],
kineto_external_id_to_kineto_op_map: Dict[int, KinetoOperator],
) -> Tuple[List[KinetoOperator], int, int, int, Optional[int]]:
@@ -764,7 +766,7 @@
Args:
host_op (PyTorchOperator): Chakra host operator to link.
kineto_op (KinetoOperator): Corresponding Kineto operator.
cpu_ev_idx_to_gpu_ops_map (Dict[int, List[KinetoOperator]]): GPU ops mapping.
cpu_external_id_to_gpu_ops_map (Dict[int, List[KinetoOperator]]): GPU ops mapping.
kineto_rf_id_to_device_op_map (Dict[int, KinetoOperator]): Kineto operator mapping.
kineto_external_id_to_kineto_op_map (Dict[int, KinetoOperator]): Mapping from external id to
KinetoOperators.
Expand All @@ -779,7 +781,7 @@ def link_ops(
- List[int]: List of synchronization dependency IDs.
"""
kineto_op.host_op = host_op
linked_gpu_ops = cpu_ev_idx_to_gpu_ops_map.get(kineto_op.ev_idx, [])
linked_gpu_ops = cpu_external_id_to_gpu_ops_map.get(kineto_op.external_id, [])
inclusive_dur = kineto_op.inclusive_dur
exclusive_dur = kineto_op.exclusive_dur
timestamp = kineto_op.timestamp
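The change to find_closest_op gives an exact external_id match priority over a purely timestamp-based match when searching for the GPU op's parent. The sketch below is a simplified, hypothetical illustration of that priority order, not a line-for-line reproduction of the method in the diff; pick_parent and the dict-based operator records are assumptions made for the example.

# Hypothetical illustration of the matching priority introduced above:
# on the GPU op's thread, a CPU op with the same external_id wins outright;
# otherwise the closest op launched at or before the GPU op's timestamp
# is kept as a fallback.
from typing import Dict, List, Optional

def pick_parent(cpu_ops: List[Dict], gpu_op: Dict, ts: int) -> Optional[Dict]:
    fallback: Optional[Dict] = None
    # Scan from the most recent CPU op backwards, as the loop in the diff does.
    for op in reversed(cpu_ops):
        if op["tid"] != gpu_op["tid"]:
            continue
        if op["external_id"] == gpu_op["external_id"]:
            return op  # exact external_id match: unambiguous parent
        if op["ts"] <= ts and fallback is None:
            fallback = op  # closest earlier op on the same thread
    return fallback

cpu_ops = [
    {"name": "cudaLaunchKernel", "tid": 1, "external_id": 7, "ts": 100},
    {"name": "cudaLaunchKernel", "tid": 1, "external_id": 8, "ts": 110},
]
gpu_op = {"name": "gemm_kernel", "tid": 1, "external_id": 8}
print(pick_parent(cpu_ops, gpu_op, ts=115)["external_id"])  # -> 8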
