Skip to content

Commit

Permalink
Use CUPTI activity markers instead of nvtx driver callbacks for NVTX …
Browse files Browse the repository at this point in the history
…tracking.

Add support to parse nvtx range events from CUPTI activity markers, and merge them to form ThreadMarkerRange events for XPlane.

PiperOrigin-RevId: 708364236
  • Loading branch information
Google-ML-Automation committed Dec 20, 2024
1 parent f6233cf commit edcdfcf
Show file tree
Hide file tree
Showing 17 changed files with 500 additions and 38 deletions.
29 changes: 29 additions & 0 deletions xla/backends/profiler/gpu/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ load(
"//xla/tsl:tsl.bzl",
"if_google",
"if_nvcc",
"if_oss",
"internal_visibility",
"tsl_copts",
"tsl_gpu_library",
Expand Down Expand Up @@ -177,6 +178,7 @@ tsl_gpu_library(
"//xla/tsl/profiler/utils:per_thread",
"@com_google_absl//absl/cleanup",
"@com_google_absl//absl/container:flat_hash_set",
"@com_google_absl//absl/log",
"@com_google_absl//absl/status",
"@com_google_absl//absl/strings:string_view",
"@com_google_absl//absl/types:optional",
Expand Down Expand Up @@ -422,3 +424,30 @@ xla_test(
"@tsl//tsl/profiler/protobuf:xplane_proto_cc",
],
)

# Test-only CUDA library built from nvtx_with_cuda_kernels.cu.cc and its
# header. It is a dependency of :nvtx_with_cuda_kernels_test below.
cuda_library(
name = "nvtx_with_cuda_kernels",
testonly = 1,
srcs = ["nvtx_with_cuda_kernels.cu.cc"],
hdrs = ["nvtx_with_cuda_kernels.h"],
# These extra options are only added through if_nvcc (presumably when the
# build uses nvcc -- confirm against tsl.bzl).
copts = if_nvcc([
"-nvcc_options",
"ptxas-options=-v",
]),
# NVTX_VERSION_3_1 is only defined through if_oss (presumably for the
# open-source build -- confirm against tsl.bzl).
local_defines = if_oss(["NVTX_VERSION_3_1=1"]),
visibility = ["//visibility:public"],
)

# Small test for the :nvtx_with_cuda_kernels library; it is declared for the
# "gpu" backend only and tagged "no_mac".
xla_test(
name = "nvtx_with_cuda_kernels_test",
size = "small",
srcs = ["nvtx_with_cuda_kernels_test.cc"],
backends = ["gpu"],
copts = tf_profiler_copts() + tsl_copts(),
tags = ["no_mac"],
deps = [
":nvtx_with_cuda_kernels",
"@com_google_googletest//:gtest_main",
"@tsl//tsl/platform:test_main",
],
)
63 changes: 63 additions & 0 deletions xla/backends/profiler/gpu/cupti_buffer_events.cc
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ limitations under the License.
#include <cstdint>

#include "absl/strings/str_cat.h"
#include "absl/strings/string_view.h"
#include "third_party/gpus/cuda/extras/CUPTI/include/cupti_activity.h"
#include "third_party/gpus/cuda/include/cuda.h"
#include "xla/backends/profiler/gpu/cupti_interface.h"
#include "tsl/platform/errors.h"
Expand Down Expand Up @@ -99,6 +101,14 @@ using CuptiActivityMemsetTy = CUpti_ActivityMemset;
using CuptiActivityGraphTraceTy = CUpti_ActivityGraphTrace;
#endif // CUDA_VERSION >= 11070

// Selects the CUPTI marker activity record type used by this file, together
// with a matching version number that GetActivityMarkerDomain() is
// instantiated with.
#if CUDA_VERSION >= 8000
using CuptiActivityMarkerTy = CUpti_ActivityMarker2;
constexpr int kCuptiActivityMarkerVersion = 2;
#else
using CuptiActivityMarkerTy = CUpti_ActivityMarker;
constexpr int kCuptiActivityMarkerVersion = 1;
#endif // CUDA_VERSION >= 8000

// Maps an OverheadKind enum to a const string.
const char *getActivityOverheadKindString(CUpti_ActivityOverheadKind kind) {
switch (kind) {
Expand Down Expand Up @@ -208,6 +218,55 @@ void AddGraphTraceActivityEvent(CuptiEventCollectorDelegate &collector,
});
}

// Returns the NVTX domain name for `marker_trace` as a non-null C string.
//
// For marker version 1 this returns "" without reading the record. For other
// versions it returns the record's `domain` pointer, except that a null
// pointer is mapped to "": CUPTI documents `domain` as NULL for the default
// domain, and callers build a string / string_view from the result, which
// must not be constructed from a null pointer.
template <int CuptiActivityMarkerVersion>
const char *GetActivityMarkerDomain(const CuptiActivityMarkerTy *marker_trace) {
  if constexpr (CuptiActivityMarkerVersion == 1) {
    return "";
  } else {
    const char *domain = marker_trace->domain;
    return domain != nullptr ? domain : "";
  }
}

// Converts a CUPTI marker activity record into a ThreadMarkerStart or
// ThreadMarkerEnd tracer event and hands it to `collector`.
//
// Only thread markers (i.e., nvtx range push/pop) are supported: records whose
// object kind is not CUPTI_ACTIVITY_OBJECT_THREAD, or whose flags are neither
// exactly MARKER_START nor exactly MARKER_END, are ignored.
//
// Both event kinds use the record timestamp as start and end time, the
// record's thread id, and carry the marker id in `graph_id`. Only the start
// event carries a name and a domain (in `nvtx_range`); the end event leaves
// both empty.
void AddMarkerActivityEvent(CuptiEventCollectorDelegate &collector,
                            CuptiActivityMarkerTy *marker_trace) {
  if (marker_trace->objectKind != CUPTI_ACTIVITY_OBJECT_THREAD) return;

  const bool is_start =
      marker_trace->flags == CUPTI_ACTIVITY_FLAG_MARKER_START;
  const bool is_end = marker_trace->flags == CUPTI_ACTIVITY_FLAG_MARKER_END;
  if (!is_start && !is_end) return;

  const char *name = "";
  const char *domain = "";
  if (is_start) {
    // Guard against a null name pointer: building the event's name from a
    // null `const char*` would be undefined behavior.
    if (marker_trace->name != nullptr) name = marker_trace->name;
    domain = GetActivityMarkerDomain<kCuptiActivityMarkerVersion>(marker_trace);
  }

  collector.receive(CuptiTracerEvent{
      /* .type = */ is_start ? CuptiTracerEventType::ThreadMarkerStart
                             : CuptiTracerEventType::ThreadMarkerEnd,
      /* .source = */ CuptiTracerEventSource::Activity,
      /* .name = */ name,
      /* .annotation = */ "",
      /* .nvtx_range = */ domain,
      /* .start_time_ns = */ marker_trace->timestamp,
      /* .end_time_ns = */ marker_trace->timestamp,
      /* .device_id = */ 0,
      /* .correlation_id = */ 0,
      /* .thread_id = */ marker_trace->objectId.pt.threadId,
      /* .context_id = */ 0,
      /* .stream_id = */ 0,
      // The marker id travels in graph_id.
      /* .graph_id = */ marker_trace->id,
  });
}

void AddMemcpyActivityEvent(CuptiEventCollectorDelegate &collector,
const CuptiActivityMemcpyTy *memcpy) {
CuptiTracerEvent event{};
Expand Down Expand Up @@ -512,6 +571,10 @@ static absl::Status ConvertActivityBuffer(
collector, reinterpret_cast<CuptiActivityGraphTraceTy *>(record));
break;
#endif
case CUPTI_ACTIVITY_KIND_MARKER:
AddMarkerActivityEvent(
collector, reinterpret_cast<CuptiActivityMarkerTy *>(record));
break;
default:
VLOG(3) << "Activity type " << record->kind << " is not supported.";
break;
Expand Down
3 changes: 3 additions & 0 deletions xla/backends/profiler/gpu/cupti_buffer_events.h
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,9 @@ enum class CuptiTracerEventType {
HostRegister = 13,
HostUnregister = 14,
CudaGraph = 15,
ThreadMarkerRange = 16,
ThreadMarkerStart = 17,
ThreadMarkerEnd = 18,
Generic = 100,
};

Expand Down
113 changes: 110 additions & 3 deletions xla/backends/profiler/gpu/cupti_collector.cc
Original file line number Diff line number Diff line change
Expand Up @@ -62,11 +62,14 @@ namespace profiler {
namespace {

using tensorflow::profiler::XEventMetadata;
using tensorflow::profiler::XLine;
using tensorflow::profiler::XPlane;
using tensorflow::profiler::XSpace;
using tensorflow::profiler::XStatMetadata;
using tsl::mutex;
using tsl::mutex_lock;
using tsl::profiler::Annotation;
using tsl::profiler::FindMutablePlaneWithName;
using tsl::profiler::FindOrAddMutablePlaneWithName;
using tsl::profiler::GpuPlaneName;
using tsl::profiler::kCuptiDriverApiPlaneName;
Expand All @@ -79,13 +82,27 @@ using tsl::profiler::XEventBuilder;
using tsl::profiler::XLineBuilder;
using tsl::profiler::XPlaneBuilder;

// Line ids in the half-open range [kNvtxLineIdStart, kNvtxLineIdEnd) are
// treated as nvtx lines.
static constexpr int64_t kNvtxLineIdStart = int64_t{1} << 32;
static constexpr int64_t kNvtxLineIdEnd = int64_t{2} << 32;

// Returns true iff `line_id` lies in [kNvtxLineIdStart, kNvtxLineIdEnd).
bool IsNvtxLine(int64_t line_id) {
  if (line_id < kNvtxLineIdStart) return false;
  return line_id < kNvtxLineIdEnd;
}

bool IsHostEvent(const CuptiTracerEvent& event, int64_t* line_id) {
// DriverCallback(i.e. kernel launching) events are host events.
if (event.source == CuptiTracerEventSource::DriverCallback) {
*line_id = event.thread_id;
return true;
}
// Non-overhead activity events are device events.
// nvtx marker events from the activity source are host events. Those markers
// are put on a separate line whose id is kNvtxLineIdStart plus the thread id.
if (event.source == CuptiTracerEventSource::Activity &&
event.type == CuptiTracerEventType::ThreadMarkerRange) {
*line_id = kNvtxLineIdStart + event.thread_id;
return true;
}
// Other non-overhead activity events are device events.
if (event.type != CuptiTracerEventType::Overhead) {
*line_id = event.stream_id;
return false;
Expand All @@ -106,6 +123,37 @@ bool IsHostEvent(const CuptiTracerEvent& event, int64_t* line_id) {
}
}

// Finds the smallest id >= `next_line_id` that is not yet in
// `occupied_line_ids`, records it there as occupied, and returns it.
int64_t GetNextAvailableLineId(absl::flat_hash_set<int64_t>& occupied_line_ids,
                               int64_t next_line_id) {
  // insert() reports through `.second` whether the id was newly added; an id
  // that was already present is skipped and the next one is tried.
  while (!occupied_line_ids.insert(next_line_id).second) {
    ++next_line_id;
  }
  return next_line_id;
}

// Reassigns the id of every line whose id is >= kNvtxLineIdStart to the
// smallest id, searching upward from 0, that no other line of `plane` uses.
// This keeps the lower 32 bits of all line ids unique, avoiding conflicts in
// later analysis that only looks at the lower 32 bits of a line id.
//
// Does nothing when `plane` is null, so the result of a by-name plane lookup
// can be passed directly.
void AdjustHostPlaneNvtxLines(XPlane* plane) {
  if (plane == nullptr) return;

  // Collect every id already in use below kNvtxLineIdStart.
  absl::flat_hash_set<int64_t> occupied_line_ids;
  for (const XLine& line : plane->lines()) {
    if (line.id() < kNvtxLineIdStart) {
      occupied_line_ids.insert(line.id());
    }
  }

  // Move each line with id >= kNvtxLineIdStart to an unoccupied id. The
  // search resumes from the previously assigned id, so ids are handed out in
  // increasing order.
  int64_t next_line_id = 0;
  for (XLine& line : *plane->mutable_lines()) {
    if (line.id() < kNvtxLineIdStart) continue;
    next_line_id = GetNextAvailableLineId(occupied_line_ids, next_line_id);
    line.set_id(next_line_id);
  }
}

struct DeviceOccupancyParams {
cudaOccFuncAttributes attributes = {};
int block_size = 0;
Expand Down Expand Up @@ -165,7 +213,7 @@ class PerDeviceCollector {
return stats;
}

void CreateXEvent(const CuptiTracerEvent& event, XPlaneBuilder* plane,
void CreateXEvent(CuptiTracerEvent& event, XPlaneBuilder* plane,
uint64_t start_gpu_ns, uint64_t end_gpu_ns,
XLineBuilder* line) {
if (event.start_time_ns < start_gpu_ns || event.end_time_ns > end_gpu_ns ||
Expand All @@ -183,6 +231,12 @@ class PerDeviceCollector {
if (event.graph_id != 0 && event.type == CuptiTracerEventType::CudaGraph &&
event.source == CuptiTracerEventSource::DriverCallback) {
absl::StrAppend(&kernel_name, " (CudaGraph:", event.graph_id, ")");
} else if (event.type == CuptiTracerEventType::ThreadMarkerRange) {
kernel_name =
event.nvtx_range.empty()
? absl::StrCat("NVTX:", kernel_name)
: absl::StrCat("NVTX:", event.nvtx_range, ":", kernel_name);
event.nvtx_range = "";
}
XEventMetadata* event_metadata =
plane->GetOrCreateEventMetadata(std::move(kernel_name));
Expand Down Expand Up @@ -410,7 +464,15 @@ class PerDeviceCollector {
GetDeviceXLineName(line.Id(), events_types_per_line[line.Id()]));
});
host_plane->ForEachLine([&](XLineBuilder line) {
line.SetName(absl::StrCat("Host Threads/", line.Id()));
if (IsNvtxLine(line.Id())) {
// Lines are ordered by name. By appending a suffix to the normal cupti
// line name, the nvtx lines are placed right after their corresponding
// cupti lines.
line.SetName(absl::StrCat("Host Threads/",
static_cast<uint32_t>(line.Id()), "/NVTX"));
} else {
line.SetName(absl::StrCat("Host Threads/", line.Id()));
}
});
size_t num_events = events_.size();
events_.clear();
Expand Down Expand Up @@ -680,6 +742,7 @@ void CuptiTraceCollector::OnTracerCachedActivityBuffers(

// CuptiTraceCollectorImpl store the CuptiTracerEvents from CuptiTracer and
// eventually convert and filter them to XSpace.
// It also adds support for handling cupti activity events for nvtx thread
// markers.
class CuptiTraceCollectorImpl : public CuptiTraceCollector {
public:
CuptiTraceCollectorImpl(const CuptiTracerCollectorOptions& option,
Expand All @@ -699,6 +762,13 @@ class CuptiTraceCollectorImpl : public CuptiTraceCollector {
} else {
num_activity_events_++;
}
if (event.type == CuptiTracerEventType::ThreadMarkerStart ||
event.type == CuptiTracerEventType::ThreadMarkerEnd) {
// Process the nvtx marker, merge thread range start/end if appropriate.
// If merged, the event will contain the merged content and be used for
// the subsequent AddEvent() processing.
if (!AddNvtxMarker(event)) return;
}
per_device_collector_[event.device_id].AddEvent(std::move(event));
}
void OnEventsDropped(const std::string& reason,
Expand Down Expand Up @@ -745,6 +815,8 @@ class CuptiTraceCollectorImpl : public CuptiTraceCollector {
start_gpu_ns_, end_gpu_ns, &device_plane, &host_plane);
NormalizeTimeStamps(&device_plane, start_walltime_ns_);
}
AdjustHostPlaneNvtxLines(
FindMutablePlaneWithName(space, kCuptiDriverApiPlaneName));
NormalizeTimeStamps(&host_plane, start_walltime_ns_);
return num_events > 0;
}
Expand Down Expand Up @@ -775,6 +847,39 @@ class CuptiTraceCollectorImpl : public CuptiTraceCollector {
uint64_t start_walltime_ns_;
uint64_t start_gpu_ns_;
int num_gpus_;
uint32_t num_duplicate_nvtx_marker_start_ = 0;
uint32_t num_unmatched_nvtx_marker_end_ = 0;

// Processes one nvtx thread marker event, keyed by the marker id stored in
// event.graph_id:
//   a) a range start event is cached until its end event arrives;
//   b) a range end event is merged with its cached start event.
// Returns true only when a merge happened; `event` is then overwritten with
// the merged ThreadMarkerRange event (start event content, end time taken
// from the end event, graph_id reset to 0). Returns false in every other case:
// start cached, duplicate start, unmatched end, or any other event type.
bool AddNvtxMarker(CuptiTracerEvent& event) {
  const uint32_t marker_id = event.graph_id;
  auto cached = nvtx_markers_.find(marker_id);
  const bool has_cached_start = cached != nvtx_markers_.end();

  if (event.type == CuptiTracerEventType::ThreadMarkerStart) {
    if (has_cached_start) {
      LOG_IF(ERROR, ++num_duplicate_nvtx_marker_start_ < 100)
          << "Duplicate nvtx thread range start marker id: " << marker_id;
      return false;
    }
    // Keep the start event until the matching end marker shows up.
    nvtx_markers_[marker_id] =
        std::make_unique<CuptiTracerEvent>(std::move(event));
    return false;
  }

  if (event.type != CuptiTracerEventType::ThreadMarkerEnd) return false;

  if (!has_cached_start) {
    LOG_IF(ERROR, ++num_unmatched_nvtx_marker_end_ < 100)
        << "Unmatched nvtx thread range end marker id: " << marker_id;
    return false;
  }

  // Turn the cached start event into the full range and hand it back through
  // `event` for further processing by the caller.
  CuptiTracerEvent& range_event = *cached->second;
  range_event.type = CuptiTracerEventType::ThreadMarkerRange;
  range_event.end_time_ns = event.end_time_ns;
  range_event.graph_id = 0;
  event = std::move(range_event);
  nvtx_markers_.erase(cached);
  return true;
}

// Set all the XLines of the specified XPlane to the starting walltime.
// Event times in both host and device planes are CUPTI timestamps.
Expand All @@ -788,6 +893,8 @@ class CuptiTraceCollectorImpl : public CuptiTraceCollector {
}

absl::FixedArray<PerDeviceCollector> per_device_collector_;
absl::flat_hash_map<uint32_t, std::unique_ptr<CuptiTracerEvent>>
nvtx_markers_;

CuptiTraceCollectorImpl(const CuptiTraceCollectorImpl&) = delete;
void operator=(const CuptiTraceCollectorImpl&) = delete;
Expand Down
8 changes: 8 additions & 0 deletions xla/backends/profiler/gpu/cupti_error_manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,14 @@ CUptiResult CuptiErrorManager::GetGraphExecId(CUgraphExec graph_exec,
return error;
}

// Forwards SetThreadIdType(type) to the wrapped interface_ and returns the
// CUptiResult it produced.
// NOTE(review): IGNORE_CALL_IF_DISABLED and LOG_AND_DISABLE_IF_ERROR are
// macros defined outside this view; presumably the first returns early when
// the manager is disabled and the second logs and disables on a failing
// result -- confirm against their definitions.
CUptiResult CuptiErrorManager::SetThreadIdType(
CUpti_ActivityThreadIdType type) {
IGNORE_CALL_IF_DISABLED;
CUptiResult error = interface_->SetThreadIdType(type);
LOG_AND_DISABLE_IF_ERROR(error);
return error;
}

void CuptiErrorManager::CleanUp() {
if (undo_disabled_) { // prevent deadlock
return;
Expand Down
2 changes: 2 additions & 0 deletions xla/backends/profiler/gpu/cupti_error_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,8 @@ class CuptiErrorManager : public xla::profiler::CuptiInterface {
CUptiResult GetGraphExecId(CUgraphExec graph_exec,
uint32_t* graph_id) override;

CUptiResult SetThreadIdType(CUpti_ActivityThreadIdType type) override;

// Clears Undo stack. We are maintaining undo stack for each profiling phase.
// Once the profiling is done, we need to clear the undo stack.
void CleanUp() override;
Expand Down
6 changes: 6 additions & 0 deletions xla/backends/profiler/gpu/cupti_error_manager_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,9 @@ TEST_F(CuptiErrorManagerTest, GpuTraceActivityEnableTest) {
.InSequence(s1)
.WillRepeatedly(
Invoke(cupti_wrapper_.get(), &CuptiWrapper::EnableCallback));
EXPECT_CALL(*mock_, SetThreadIdType(_))
.InSequence(s1)
.WillOnce(Invoke(cupti_wrapper_.get(), &CuptiWrapper::SetThreadIdType));
EXPECT_CALL(*mock_, ActivityUsePerThreadBuffer())
.InSequence(s1)
.WillOnce(Invoke(cupti_wrapper_.get(),
Expand Down Expand Up @@ -182,6 +185,9 @@ TEST_F(CuptiErrorManagerTest, GpuTraceAutoEnableTest) {
EXPECT_CALL(*mock_, EnableDomain(1, _, _))
.InSequence(s1)
.WillOnce(Invoke(cupti_wrapper_.get(), &CuptiWrapper::EnableDomain));
EXPECT_CALL(*mock_, SetThreadIdType(_))
.InSequence(s1)
.WillOnce(Invoke(cupti_wrapper_.get(), &CuptiWrapper::SetThreadIdType));
EXPECT_CALL(*mock_, ActivityUsePerThreadBuffer())
.InSequence(s1)
.WillOnce(Invoke(cupti_wrapper_.get(),
Expand Down
Loading

0 comments on commit edcdfcf

Please sign in to comment.