Use CUPTI activity markers instead of nvtx driver callbacks for NVTX …

…tracking. Add support to parse nvtx range events from CUPTI activity markers, and merge them to form ThreadMarkerRange events for XPlane. PiperOrigin-RevId: 708364236
openxla · Dec 20, 2024 · edcdfcf · edcdfcf
1 parent f6233cf
commit edcdfcf
Show file tree

Hide file tree

Showing 17 changed files with 500 additions and 38 deletions.
diff --git a/xla/backends/profiler/gpu/BUILD b/xla/backends/profiler/gpu/BUILD
@@ -5,6 +5,7 @@ load(
     "//xla/tsl:tsl.bzl",
     "if_google",
     "if_nvcc",
+    "if_oss",
     "internal_visibility",
     "tsl_copts",
     "tsl_gpu_library",
@@ -177,6 +178,7 @@ tsl_gpu_library(
         "//xla/tsl/profiler/utils:per_thread",
         "@com_google_absl//absl/cleanup",
         "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/log",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/strings:string_view",
         "@com_google_absl//absl/types:optional",
@@ -422,3 +424,30 @@ xla_test(
         "@tsl//tsl/profiler/protobuf:xplane_proto_cc",
     ],
 )
+
+# Test-only CUDA library; the nvtx_with_cuda_kernels_test target depends on it.
+cuda_library(
+    name = "nvtx_with_cuda_kernels",
+    testonly = 1,
+    srcs = ["nvtx_with_cuda_kernels.cu.cc"],
+    hdrs = ["nvtx_with_cuda_kernels.h"],
+    # Extra compiler options, selected through if_nvcc.
+    copts = if_nvcc([
+        "-nvcc_options",
+        "ptxas-options=-v",
+    ]),
+    # NVTX_VERSION_3_1 is only defined through if_oss.
+    local_defines = if_oss(["NVTX_VERSION_3_1=1"]),
+    visibility = ["//visibility:public"],
+)
+
+# Small test for the gpu backend that links :nvtx_with_cuda_kernels.
+xla_test(
+    name = "nvtx_with_cuda_kernels_test",
+    size = "small",
+    srcs = ["nvtx_with_cuda_kernels_test.cc"],
+    backends = ["gpu"],
+    copts = tf_profiler_copts() + tsl_copts(),
+    tags = ["no_mac"],
+    deps = [
+        ":nvtx_with_cuda_kernels",
+        "@com_google_googletest//:gtest_main",
+        "@tsl//tsl/platform:test_main",
+    ],
+)
diff --git a/xla/backends/profiler/gpu/cupti_buffer_events.cc b/xla/backends/profiler/gpu/cupti_buffer_events.cc
@@ -18,6 +18,8 @@ limitations under the License.
 #include <cstdint>
 
 #include "absl/strings/str_cat.h"
+#include "absl/strings/string_view.h"
+#include "third_party/gpus/cuda/extras/CUPTI/include/cupti_activity.h"
 #include "third_party/gpus/cuda/include/cuda.h"
 #include "xla/backends/profiler/gpu/cupti_interface.h"
 #include "tsl/platform/errors.h"
@@ -99,6 +101,14 @@ using CuptiActivityMemsetTy = CUpti_ActivityMemset;
 using CuptiActivityGraphTraceTy = CUpti_ActivityGraphTrace;
 #endif  // CUDA_VERSION >= 11070
 
+// Selects the CUPTI marker record type by CUDA version. The matching version
+// constant is what GetActivityMarkerDomain uses to decide whether the record
+// has a domain field to read.
+#if CUDA_VERSION >= 8000
+using CuptiActivityMarkerTy = CUpti_ActivityMarker2;
+constexpr int kCuptiActivityMarkerVersion = 2;
+#else
+using CuptiActivityMarkerTy = CUpti_ActivityMarker;
+constexpr int kCuptiActivityMarkerVersion = 1;
+#endif  // CUDA_VERSION >= 8000
+
 // Maps an OverheadKind enum to a const string.
 const char *getActivityOverheadKindString(CUpti_ActivityOverheadKind kind) {
   switch (kind) {
@@ -208,6 +218,55 @@ void AddGraphTraceActivityEvent(CuptiEventCollectorDelegate &collector,
   });
 }
 
+// Returns the NVTX domain name stored in `marker_trace`; never returns null.
+//
+// Version 1 marker records have no domain, so "" is returned for them. The
+// record type is a deduced template parameter so that `marker_trace->domain`
+// is a dependent expression: the discarded `if constexpr` branch is then not
+// instantiated for record types without a `domain` member. A null domain
+// pointer is normalized to "" so callers can always build a string (view)
+// from the result.
+//
+// Existing calls of the form
+// `GetActivityMarkerDomain<kCuptiActivityMarkerVersion>(marker_trace)` keep
+// working unchanged; the record type is deduced from the argument.
+template <int CuptiActivityMarkerVersion, typename MarkerT>
+const char *GetActivityMarkerDomain(const MarkerT *marker_trace) {
+  if constexpr (CuptiActivityMarkerVersion == 1) {
+    return "";
+  } else {
+    const char *domain = marker_trace->domain;
+    return domain != nullptr ? domain : "";
+  }
+}
+
+// Turns one CUPTI marker activity record into a ThreadMarkerStart or
+// ThreadMarkerEnd tracer event and passes it to `collector`.
+//
+// Only thread-object markers whose flags equal MARKER_START or MARKER_END are
+// forwarded; any other record is ignored. The marker id travels in the
+// event's graph_id slot. Start events also carry the marker name and, in the
+// nvtx_range slot, the marker domain; end events leave both empty.
+void AddMarkerActivityEvent(CuptiEventCollectorDelegate &collector,
+                            CuptiActivityMarkerTy *marker_trace) {
+  // Currently only support thread marker (i.e., nvtx range push/pop)
+  if (marker_trace->objectKind != CUPTI_ACTIVITY_OBJECT_THREAD) return;
+
+  const bool is_start =
+      marker_trace->flags == CUPTI_ACTIVITY_FLAG_MARKER_START;
+  const bool is_end = marker_trace->flags == CUPTI_ACTIVITY_FLAG_MARKER_END;
+  if (!is_start && !is_end) return;
+
+  // Name and domain are only read from start records.
+  const char *marker_name = is_start ? marker_trace->name : "";
+  const char *marker_domain =
+      is_start
+          ? GetActivityMarkerDomain<kCuptiActivityMarkerVersion>(marker_trace)
+          : "";
+
+  collector.receive(CuptiTracerEvent{
+      /* .type = */ is_start ? CuptiTracerEventType::ThreadMarkerStart
+                             : CuptiTracerEventType::ThreadMarkerEnd,
+      /* .source = */ CuptiTracerEventSource::Activity,
+      /* .name = */ marker_name,
+      /* .annotation = */ "",
+      /* .nvtx_range = */ marker_domain,
+      /* .start_time_ns = */ marker_trace->timestamp,
+      /* .end_time_ns = */ marker_trace->timestamp,
+      /* .device_id = */ 0,
+      /* .correlation_id = */ 0,
+      /* .thread_id = */ marker_trace->objectId.pt.threadId,
+      /* .context_id = */ 0,
+      /* .stream_id = */ 0,
+      /* .graph_id = */ marker_trace->id,
+  });
+}
+
 void AddMemcpyActivityEvent(CuptiEventCollectorDelegate &collector,
                             const CuptiActivityMemcpyTy *memcpy) {
   CuptiTracerEvent event{};
@@ -512,6 +571,10 @@ static absl::Status ConvertActivityBuffer(
               collector, reinterpret_cast<CuptiActivityGraphTraceTy *>(record));
           break;
 #endif
+        case CUPTI_ACTIVITY_KIND_MARKER:
+          AddMarkerActivityEvent(
+              collector, reinterpret_cast<CuptiActivityMarkerTy *>(record));
+          break;
         default:
           VLOG(3) << "Activity type " << record->kind << " is not supported.";
           break;

diff --git a/xla/backends/profiler/gpu/cupti_buffer_events.h b/xla/backends/profiler/gpu/cupti_buffer_events.h
@@ -174,6 +174,9 @@ enum class CuptiTracerEventType {
   HostRegister = 13,
   HostUnregister = 14,
   CudaGraph = 15,
+  ThreadMarkerRange = 16,
+  ThreadMarkerStart = 17,
+  ThreadMarkerEnd = 18,
   Generic = 100,
 };
 

diff --git a/xla/backends/profiler/gpu/cupti_collector.cc b/xla/backends/profiler/gpu/cupti_collector.cc
@@ -62,11 +62,14 @@ namespace profiler {
 namespace {
 
 using tensorflow::profiler::XEventMetadata;
+using tensorflow::profiler::XLine;
+using tensorflow::profiler::XPlane;
 using tensorflow::profiler::XSpace;
 using tensorflow::profiler::XStatMetadata;
 using tsl::mutex;
 using tsl::mutex_lock;
 using tsl::profiler::Annotation;
+using tsl::profiler::FindMutablePlaneWithName;
 using tsl::profiler::FindOrAddMutablePlaneWithName;
 using tsl::profiler::GpuPlaneName;
 using tsl::profiler::kCuptiDriverApiPlaneName;
@@ -79,13 +82,27 @@ using tsl::profiler::XEventBuilder;
 using tsl::profiler::XLineBuilder;
 using tsl::profiler::XPlaneBuilder;
 
+// Host-plane line ids in [kNvtxLineIdStart, kNvtxLineIdEnd) are NVTX lines.
+static constexpr int64_t kNvtxLineIdStart = int64_t{1} << 32;
+static constexpr int64_t kNvtxLineIdEnd = int64_t{2} << 32;
+
+// Returns true when `line_id` lies in the half-open NVTX line id range.
+bool IsNvtxLine(int64_t line_id) {
+  if (line_id < kNvtxLineIdStart) return false;
+  return line_id < kNvtxLineIdEnd;
+}
+
 bool IsHostEvent(const CuptiTracerEvent& event, int64_t* line_id) {
   // DriverCallback(i.e. kernel launching) events are host events.
   if (event.source == CuptiTracerEventSource::DriverCallback) {
     *line_id = event.thread_id;
     return true;
   }
-  // Non-overhead activity events are device events.
+  // NVTX marker events from the activity source are host events. They are
+  // placed on a separate line whose id is kNvtxLineIdStart plus the thread id.
+  if (event.source == CuptiTracerEventSource::Activity &&
+      event.type == CuptiTracerEventType::ThreadMarkerRange) {
+    *line_id = kNvtxLineIdStart + event.thread_id;
+    return true;
+  }
+  // Other non-overhead activity events are device events.
   if (event.type != CuptiTracerEventType::Overhead) {
     *line_id = event.stream_id;
     return false;
@@ -106,6 +123,37 @@ bool IsHostEvent(const CuptiTracerEvent& event, int64_t* line_id) {
   }
 }
 
+// Returns the first id at or above `next_line_id` that is not yet in
+// `occupied_line_ids`, and records that id as occupied before returning it.
+int64_t GetNextAvailableLineId(absl::flat_hash_set<int64_t>& occupied_line_ids,
+                               int64_t next_line_id) {
+  // insert() leaves the set untouched and reports false for an id that is
+  // already present, so the first successful insert is the free id.
+  for (;; ++next_line_id) {
+    if (occupied_line_ids.insert(next_line_id).second) return next_line_id;
+  }
+}
+
+// Renumbers the NVTX lines of a host plane, i.e. every line whose id is
+// >= kNvtxLineIdStart. Each such line gets the smallest id, searching upward
+// from 0, that is used neither by a line below kNvtxLineIdStart nor by an
+// already renumbered line. This is to avoid effective line id conflicts,
+// since further analysis only counts on the lower 32 bits of the line id.
+//
+// A null `plane` is accepted and ignored.
+void AdjustHostPlaneNvtxLines(XPlane* plane) {
+  // The caller passes the result of a by-name plane lookup; tolerate a
+  // missing plane instead of dereferencing a null pointer.
+  if (plane == nullptr) return;
+
+  // Get all occupied line ids with value less than kNvtxLineIdStart.
+  absl::flat_hash_set<int64_t> occupied_line_ids;
+  for (const XLine& line : plane->lines()) {
+    if (line.id() < kNvtxLineIdStart) {
+      occupied_line_ids.insert(line.id());
+    }
+  }
+
+  // Move every line whose id is >= kNvtxLineIdStart to a non-occupied line
+  // id. The search resumes from the id assigned last, so the new ids grow
+  // from one NVTX line to the next.
+  int64_t next_line_id = 0;
+  for (XLine& line : *plane->mutable_lines()) {
+    if (line.id() >= kNvtxLineIdStart) {
+      next_line_id = GetNextAvailableLineId(occupied_line_ids, next_line_id);
+      line.set_id(next_line_id);
+    }
+  }
+}
+
 struct DeviceOccupancyParams {
   cudaOccFuncAttributes attributes = {};
   int block_size = 0;
@@ -165,7 +213,7 @@ class PerDeviceCollector {
     return stats;
   }
 
-  void CreateXEvent(const CuptiTracerEvent& event, XPlaneBuilder* plane,
+  void CreateXEvent(CuptiTracerEvent& event, XPlaneBuilder* plane,
                     uint64_t start_gpu_ns, uint64_t end_gpu_ns,
                     XLineBuilder* line) {
     if (event.start_time_ns < start_gpu_ns || event.end_time_ns > end_gpu_ns ||
@@ -183,6 +231,12 @@ class PerDeviceCollector {
     if (event.graph_id != 0 && event.type == CuptiTracerEventType::CudaGraph &&
         event.source == CuptiTracerEventSource::DriverCallback) {
       absl::StrAppend(&kernel_name, " (CudaGraph:", event.graph_id, ")");
+    } else if (event.type == CuptiTracerEventType::ThreadMarkerRange) {
+      kernel_name =
+          event.nvtx_range.empty()
+              ? absl::StrCat("NVTX:", kernel_name)
+              : absl::StrCat("NVTX:", event.nvtx_range, ":", kernel_name);
+      event.nvtx_range = "";
     }
     XEventMetadata* event_metadata =
         plane->GetOrCreateEventMetadata(std::move(kernel_name));
@@ -410,7 +464,15 @@ class PerDeviceCollector {
           GetDeviceXLineName(line.Id(), events_types_per_line[line.Id()]));
     });
     host_plane->ForEachLine([&](XLineBuilder line) {
-      line.SetName(absl::StrCat("Host Threads/", line.Id()));
+      if (IsNvtxLine(line.Id())) {
+        // Lines are ordered by name. Appending a suffix to the normal CUPTI
+        // line name places each NVTX line right after its corresponding
+        // CUPTI line.
+        line.SetName(absl::StrCat("Host Threads/",
+                                  static_cast<uint32_t>(line.Id()), "/NVTX"));
+      } else {
+        line.SetName(absl::StrCat("Host Threads/", line.Id()));
+      }
     });
     size_t num_events = events_.size();
     events_.clear();
@@ -680,6 +742,7 @@ void CuptiTraceCollector::OnTracerCachedActivityBuffers(
 
 // CuptiTraceCollectorImpl store the CuptiTracerEvents from CuptiTracer and
 // eventually convert and filter them to XSpace.
+// It also handles CUPTI activity events for NVTX thread markers.
 class CuptiTraceCollectorImpl : public CuptiTraceCollector {
  public:
   CuptiTraceCollectorImpl(const CuptiTracerCollectorOptions& option,
@@ -699,6 +762,13 @@ class CuptiTraceCollectorImpl : public CuptiTraceCollector {
     } else {
       num_activity_events_++;
     }
+    if (event.type == CuptiTracerEventType::ThreadMarkerStart ||
+        event.type == CuptiTracerEventType::ThreadMarkerEnd) {
+      // Process the NVTX marker, merging a range end with its cached start.
+      // When merged, `event` holds the merged range and continues to the
+      // AddEvent() call below; otherwise nothing more is done with it.
+      if (!AddNvtxMarker(event)) return;
+    }
     per_device_collector_[event.device_id].AddEvent(std::move(event));
   }
   void OnEventsDropped(const std::string& reason,
@@ -745,6 +815,8 @@ class CuptiTraceCollectorImpl : public CuptiTraceCollector {
           start_gpu_ns_, end_gpu_ns, &device_plane, &host_plane);
       NormalizeTimeStamps(&device_plane, start_walltime_ns_);
     }
+    AdjustHostPlaneNvtxLines(
+        FindMutablePlaneWithName(space, kCuptiDriverApiPlaneName));
     NormalizeTimeStamps(&host_plane, start_walltime_ns_);
     return num_events > 0;
   }
@@ -775,6 +847,39 @@ class CuptiTraceCollectorImpl : public CuptiTraceCollector {
   uint64_t start_walltime_ns_;
   uint64_t start_gpu_ns_;
   int num_gpus_;
+  uint32_t num_duplicate_nvtx_marker_start_ = 0;
+  uint32_t num_unmatched_nvtx_marker_end_ = 0;
+
+  // Handles one NVTX thread-marker event, keyed by the marker id carried in
+  // graph_id.
+  //  - ThreadMarkerStart: the event is cached under its marker id. A start
+  //    whose id is already cached is reported as a duplicate and dropped.
+  //  - ThreadMarkerEnd: if a start with the same id is cached, `event` is
+  //    overwritten with that start, retyped as ThreadMarkerRange, ending at
+  //    this event's end time and with graph_id cleared. Otherwise the end is
+  //    reported as unmatched.
+  // Returns true only when `event` now holds a merged range.
+  bool AddNvtxMarker(CuptiTracerEvent& event) {
+    const uint32_t marker_id = event.graph_id;
+    auto cached_start = nvtx_markers_.find(marker_id);
+    const bool has_cached_start = cached_start != nvtx_markers_.end();
+
+    switch (event.type) {
+      case CuptiTracerEventType::ThreadMarkerStart:
+        if (has_cached_start) {
+          LOG_IF(ERROR, ++num_duplicate_nvtx_marker_start_ < 100)
+              << "Duplicate nvtx thread range start marker id: " << marker_id;
+        } else {
+          nvtx_markers_[marker_id] =
+              std::make_unique<CuptiTracerEvent>(std::move(event));
+        }
+        return false;  // Nothing merged yet.
+
+      case CuptiTracerEventType::ThreadMarkerEnd: {
+        if (!has_cached_start) {
+          LOG_IF(ERROR, ++num_unmatched_nvtx_marker_end_ < 100)
+              << "Unmatched nvtx thread range end marker id: " << marker_id;
+          return false;
+        }
+        CuptiTracerEvent& range = *cached_start->second;
+        range.type = CuptiTracerEventType::ThreadMarkerRange;
+        range.end_time_ns = event.end_time_ns;
+        range.graph_id = 0;
+        event = std::move(range);
+        nvtx_markers_.erase(cached_start);
+        return true;  // The event is merged for further processing.
+      }
+
+      default:
+        // No merged event is generated, return false.
+        return false;
+    }
+  }
 
   // Set the all XLines of specified XPlane to starting walltime.
   // Events time in both host and device planes are CUTPI timestamps.
@@ -788,6 +893,8 @@ class CuptiTraceCollectorImpl : public CuptiTraceCollector {
   }
 
   absl::FixedArray<PerDeviceCollector> per_device_collector_;
+  absl::flat_hash_map<uint32_t, std::unique_ptr<CuptiTracerEvent>>
+      nvtx_markers_;
 
   CuptiTraceCollectorImpl(const CuptiTraceCollectorImpl&) = delete;
   void operator=(const CuptiTraceCollectorImpl&) = delete;

diff --git a/xla/backends/profiler/gpu/cupti_error_manager.cc b/xla/backends/profiler/gpu/cupti_error_manager.cc
@@ -279,6 +279,14 @@ CUptiResult CuptiErrorManager::GetGraphExecId(CUgraphExec graph_exec,
   return error;
 }
 
+// Forwards SetThreadIdType(type) to the wrapped interface_ and returns its
+// result.
+// NOTE(review): IGNORE_CALL_IF_DISABLED and LOG_AND_DISABLE_IF_ERROR are
+// defined outside this hunk; going by their names they skip the call while
+// the manager is disabled and log/disable on failure. Confirm against the
+// macro definitions.
+CUptiResult CuptiErrorManager::SetThreadIdType(
+    CUpti_ActivityThreadIdType type) {
+  IGNORE_CALL_IF_DISABLED;
+  CUptiResult error = interface_->SetThreadIdType(type);
+  LOG_AND_DISABLE_IF_ERROR(error);
+  return error;
+}
+
 void CuptiErrorManager::CleanUp() {
   if (undo_disabled_) {  // prevent deadlock
     return;

diff --git a/xla/backends/profiler/gpu/cupti_error_manager.h b/xla/backends/profiler/gpu/cupti_error_manager.h
@@ -117,6 +117,8 @@ class CuptiErrorManager : public xla::profiler::CuptiInterface {
   CUptiResult GetGraphExecId(CUgraphExec graph_exec,
                              uint32_t* graph_id) override;
 
+  CUptiResult SetThreadIdType(CUpti_ActivityThreadIdType type) override;
+
   // Clears Undo stack. We are maintaining undo stack for each profiling phase.
   // Once the profiling is done, we need to clear the undo stack.
   void CleanUp() override;

diff --git a/xla/backends/profiler/gpu/cupti_error_manager_test.cc b/xla/backends/profiler/gpu/cupti_error_manager_test.cc
@@ -124,6 +124,9 @@ TEST_F(CuptiErrorManagerTest, GpuTraceActivityEnableTest) {
       .InSequence(s1)
       .WillRepeatedly(
           Invoke(cupti_wrapper_.get(), &CuptiWrapper::EnableCallback));
+  EXPECT_CALL(*mock_, SetThreadIdType(_))
+      .InSequence(s1)
+      .WillOnce(Invoke(cupti_wrapper_.get(), &CuptiWrapper::SetThreadIdType));
   EXPECT_CALL(*mock_, ActivityUsePerThreadBuffer())
       .InSequence(s1)
       .WillOnce(Invoke(cupti_wrapper_.get(),
@@ -182,6 +185,9 @@ TEST_F(CuptiErrorManagerTest, GpuTraceAutoEnableTest) {
   EXPECT_CALL(*mock_, EnableDomain(1, _, _))
       .InSequence(s1)
       .WillOnce(Invoke(cupti_wrapper_.get(), &CuptiWrapper::EnableDomain));
+  EXPECT_CALL(*mock_, SetThreadIdType(_))
+      .InSequence(s1)
+      .WillOnce(Invoke(cupti_wrapper_.get(), &CuptiWrapper::SetThreadIdType));
   EXPECT_CALL(*mock_, ActivityUsePerThreadBuffer())
       .InSequence(s1)
       .WillOnce(Invoke(cupti_wrapper_.get(),