From 1dfc926eb84d3175d783c1f25d054411b8660115 Mon Sep 17 00:00:00 2001 From: Marcos Slomp Date: Mon, 5 Feb 2024 09:08:43 -0800 Subject: [PATCH 01/27] Metal back-end WIP --- .../src/profiler/TracyTimelineItemGpu.cpp | 3 +- public/common/TracyQueue.hpp | 3 +- public/tracy/TracyMetal.hmm | 364 ++++++++++++++++++ 3 files changed, 368 insertions(+), 2 deletions(-) create mode 100644 public/tracy/TracyMetal.hmm diff --git a/profiler/src/profiler/TracyTimelineItemGpu.cpp b/profiler/src/profiler/TracyTimelineItemGpu.cpp index 4ae1d683f..b499c64f1 100644 --- a/profiler/src/profiler/TracyTimelineItemGpu.cpp +++ b/profiler/src/profiler/TracyTimelineItemGpu.cpp @@ -42,7 +42,8 @@ void TimelineItemGpu::HeaderTooltip( const char* label ) const const bool isMultithreaded = ( m_gpu->type == GpuContextType::Vulkan ) || ( m_gpu->type == GpuContextType::OpenCL ) || - ( m_gpu->type == GpuContextType::Direct3D12 ); + ( m_gpu->type == GpuContextType::Direct3D12 ) || + ( m_gpu->type == GpuContextType::Metal ); char buf[64]; sprintf( buf, "%s context %i", GpuContextNames[(int)m_gpu->type], m_idx ); diff --git a/public/common/TracyQueue.hpp b/public/common/TracyQueue.hpp index affbd67ab..ec448d724 100644 --- a/public/common/TracyQueue.hpp +++ b/public/common/TracyQueue.hpp @@ -401,7 +401,8 @@ enum class GpuContextType : uint8_t Vulkan, OpenCL, Direct3D12, - Direct3D11 + Direct3D11, + Metal }; enum GpuContextFlags : uint8_t diff --git a/public/tracy/TracyMetal.hmm b/public/tracy/TracyMetal.hmm new file mode 100644 index 000000000..c7cbc3e7e --- /dev/null +++ b/public/tracy/TracyMetal.hmm @@ -0,0 +1,364 @@ +#ifndef __TRACYMETAL_HMM__ +#define __TRACYMETAL_HMM__ + +#ifndef TRACY_ENABLE + +#define TracyMetalContext(device,queue) nullptr +#define TracyMetalDestroy(ctx) +#define TracyMetalContextName(ctx, name, size) + +#define TracyMetalZone(ctx, name) +#define TracyMetalZoneC(ctx, name, color) +#define TracyMetalNamedZone(ctx, varname, name, active) +#define TracyMetalNamedZoneC(ctx, varname, name, color, active) +#define TracyMetalZoneTransient(ctx, varname, name, active) + +#define TracyMetalZoneS(ctx, name, depth) +#define TracyMetalZoneCS(ctx, name, color, depth) +#define TracyMetalNamedZoneS(ctx, varname, name, depth, active) +#define TracyMetalNamedZoneCS(ctx, varname, name, color, depth, active) +#define TracyMetalZoneTransientS(ctx, varname, name, depth, active) + +#define TracyMetalCollect(ctx) + +namespace tracy +{ +class MetalZoneScope {}; +} + +using TracyMetalCtx = void*; + +#else + +#include +#include +#include + +#include "Tracy.hpp" +#include "../client/TracyProfiler.hpp" +#include "../client/TracyCallstack.hpp" +#include "../common/TracyAlign.hpp" +#include "../common/TracyAlloc.hpp" + +// ok to import if in obj-c code +#import + +#define TracyMetalPanic(msg, ...) do { assert(false && "TracyMetal: " msg); TracyMessageLC("TracyMetal: " msg, tracy::Color::Red4); fprintf(stderr, "TracyMetal: %s\n", msg); __VA_ARGS__; } while(false); + + +namespace tracy +{ + +class MetalCtx +{ + friend class MetalZoneScope; + + enum { MaxQueries = 4 * 1024 }; // Metal: between 8 and 32768 _BYTES_... + +public: + MetalCtx(id device) + : m_device(device) + { + if (m_device == nil) + { + TracyMetalPanic("device is nil.", return); + } + if (![m_device supportsCounterSampling:MTLCounterSamplingPointAtDispatchBoundary]) + { + TracyMetalPanic("timestamp sampling at compute dispatch boundary is not supported.", return); + } + if (![m_device supportsCounterSampling:MTLCounterSamplingPointAtDrawBoundary]) + { + TracyMetalPanic("timestamp sampling at draw boundary is not supported.", return); + } + id timestampCounterSet = nil; + for (id counterSet in m_device.counterSets) + { + if ([counterSet.name isEqualToString:MTLCommonCounterSetTimestamp]) + { + timestampCounterSet = counterSet; + break; + } + } + if (timestampCounterSet == nil) + { + TracyMetalPanic("timestamp counters are not supported on the platform.", return); + } + + MTLCounterSampleBufferDescriptor* sampleDescriptor = [[MTLCounterSampleBufferDescriptor alloc] init]; + sampleDescriptor.counterSet = timestampCounterSet; + sampleDescriptor.sampleCount = MaxQueries; + sampleDescriptor.storageMode = MTLStorageModeShared; + sampleDescriptor.label = @"TracyMetalTimestampPool"; + + NSError* error = nil; + id counterSampleBuffer = [m_device newCounterSampleBufferWithDescriptor:sampleDescriptor error:&error]; + if (error != nil) + { + NSLog(error.localizedDescription); + NSLog(error.localizedFailureReason); + TracyMetalPanic("unable to create sample buffer for timestamp counters.", return); + } + m_counterSampleBuffer = counterSampleBuffer; + + MTLTimestamp cpuTimestamp = 0; + MTLTimestamp gpuTimestamp = 0; + float period = 1.0f; + [m_device sampleTimestamps:&cpuTimestamp gpuTimestamp:&gpuTimestamp]; + + m_contextId = GetGpuCtxCounter().fetch_add(1); + + auto* item = Profiler::QueueSerial(); + MemWrite(&item->hdr.type, QueueType::GpuNewContext); + MemWrite(&item->gpuNewContext.cpuTime, int64_t(cpuTimestamp)); + MemWrite(&item->gpuNewContext.gpuTime, int64_t(gpuTimestamp)); + MemWrite(&item->gpuNewContext.thread, uint32_t(0)); // #TODO: why not GetThreadHandle()? + MemWrite(&item->gpuNewContext.period, period); + MemWrite(&item->gpuNewContext.context, m_contextId); + MemWrite(&item->gpuNewContext.flags, GpuContextCalibration); + MemWrite(&item->gpuNewContext.type, GpuContextType::Metal); + Profiler::QueueSerialFinish(); // TODO: DeferItem() for TRACY_ON_DEMAND + } + + ~MetalCtx() + { + } + + void Name( const char* name, uint16_t len ) + { + auto ptr = (char*)tracy_malloc( len ); + memcpy( ptr, name, len ); + + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::GpuContextName ); + MemWrite( &item->gpuContextNameFat.context, m_contextId ); + MemWrite( &item->gpuContextNameFat.ptr, (uint64_t)ptr ); + MemWrite( &item->gpuContextNameFat.size, len ); +#ifdef TRACY_ON_DEMAND + GetProfiler().DeferItem( *item ); +#endif + Profiler::QueueSerialFinish(); + } + + bool Collect() + { + ZoneScopedC(Color::Red4); + +#ifdef TRACY_ON_DEMAND + if (!GetProfiler().IsConnected()) + { + return true; + } +#endif + + // Only one thread is allowed to collect timestamps at any given time + // but there's no need to block contending threads + if (!m_collectionMutex.try_lock()) + { + return true; + } + + std::unique_lock lock (m_collectionMutex, std::adopt_lock); + + uintptr_t begin = m_previousCheckpoint.load(); + uintptr_t latestCheckpoint = m_queryCounter.load(); // TODO: MTLEvent? MTLFence?; + uint32_t count = RingCount(begin, latestCheckpoint); + + if (count == 0) // no pending timestamp queries + { + uintptr_t nextCheckpoint = m_queryCounter.load(); + if (nextCheckpoint != latestCheckpoint) + { + // TODO: signal event / fence now? + } + return true; + } + + if (count >= MaxQueries) + { + TracyMetalPanic("too many pending timestamp queries.", return false;); + } + + NSRange range = { }; + NSData* data = [m_counterSampleBuffer resolveCounterRange:range]; + NSUInteger numResolvedTimestamps = data.length / sizeof(MTLCounterResultTimestamp); + MTLCounterResultTimestamp* timestamps = (MTLCounterResultTimestamp *)(data.bytes); + if (timestamps == nil) + { + TracyMetalPanic("unable to resolve timestamps.", return false;); + } + + for (auto i = begin; i != latestCheckpoint; ++i) + { + uint32_t k = RingIndex(i); + MTLTimestamp& timestamp = timestamps[k].timestamp; + // TODO: check the value of timestamp: MTLCounterErrorValue, zero, or valid + if (timestamp == MTLCounterErrorValue) + { + break; + } + m_previousCheckpoint += 1; + auto* item = Profiler::QueueSerial(); + MemWrite(&item->hdr.type, QueueType::GpuTime); + MemWrite(&item->gpuTime.gpuTime, timestamp); + MemWrite(&item->gpuTime.queryId, static_cast(k)); + MemWrite(&item->gpuTime.context, m_contextId); + Profiler::QueueSerialFinish(); + timestamp = MTLCounterErrorValue; // "reset" timestamp + } + + //RecalibrateClocks(); // to account for drift + + return true; + } + +private: + tracy_force_inline uint32_t RingIndex(uintptr_t index) + { + index %= MaxQueries; + return static_cast(index); + } + + tracy_force_inline uint32_t RingCount(uintptr_t begin, uintptr_t end) + { + // wrap-around safe: all unsigned + uintptr_t count = end - begin; + return static_cast(count); + } + + tracy_force_inline unsigned int NextQueryId() + { + auto id = m_queryCounter.fetch_add(1); + if (RingCount(m_previousCheckpoint, id) >= MaxQueries) + { + TracyMetalPanic("too many pending timestamp queries."); + // #TODO: return some sentinel value; ideally a "hidden" query index + } + return RingIndex(id); + } + + tracy_force_inline uint8_t GetContextId() const + { + return m_contextId; + } + + uint8_t m_contextId = 255; + + id m_device = nil; + id m_counterSampleBuffer = nil; + + using atomic_counter = std::atomic; + static_assert(atomic_counter::is_always_lock_free); + atomic_counter m_queryCounter = 0; + + atomic_counter m_previousCheckpoint = 0; + atomic_counter::value_type m_nextCheckpoint = 0; + + std::mutex m_collectionMutex; +}; + +class MetalZoneScope +{ +public: + tracy_force_inline MetalZoneScope( MetalCtx* ctx, id cmdEncoder, const SourceLocationData* srcloc, bool is_active ) +#ifdef TRACY_ON_DEMAND + : m_active( is_active && GetProfiler().IsConnected() ) +#else + : m_active( is_active ) +#endif + { + if( !m_active ) return; + m_ctx = ctx; + m_cmdEncoder = cmdEncoder; + + const auto queryId = ctx->NextQueryId(); + [m_cmdEncoder sampleCountersInBuffer:m_ctx->m_counterSampleBuffer atSampleIndex:queryId withBarrier:YES]; + + auto* item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::GpuZoneBeginSerial ); + MemWrite( &item->gpuZoneBegin.cpuTime, Profiler::GetTime() ); + MemWrite( &item->gpuZoneBegin.srcloc, (uint64_t)srcloc ); + MemWrite( &item->gpuZoneBegin.thread, GetThreadHandle() ); + MemWrite( &item->gpuZoneBegin.queryId, uint16_t( queryId ) ); + MemWrite( &item->gpuZoneBegin.context, ctx->GetContextId() ); + + Profiler::QueueSerialFinish(); + } + + tracy_force_inline ~MetalZoneScope() + { + if( !m_active ) return; + + const auto queryId = m_ctx->NextQueryId(); + [m_cmdEncoder sampleCountersInBuffer:m_ctx->m_counterSampleBuffer atSampleIndex:queryId withBarrier:YES]; + + auto* item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::GpuZoneEndSerial ); + MemWrite( &item->gpuZoneEnd.cpuTime, Profiler::GetTime() ); + MemWrite( &item->gpuZoneEnd.thread, GetThreadHandle() ); + MemWrite( &item->gpuZoneEnd.queryId, uint16_t( queryId ) ); + MemWrite( &item->gpuZoneEnd.context, m_ctx->GetContextId() ); + + Profiler::QueueSerialFinish(); + } + +private: + const bool m_active; + + MetalCtx* m_ctx; + id m_cmdEncoder; +}; + +static inline MetalCtx* CreateMetalContext(id device) +{ + auto ctx = (MetalCtx*)tracy_malloc( sizeof( MetalCtx ) ); + new (ctx) MetalCtx( device ); + return ctx; +} + +static inline void DestroyMetalContext( MetalCtx* ctx ) +{ + ctx->~MetalCtx(); + tracy_free( ctx ); +} +} + +using TracyMetalCtx = tracy::MetalCtx*; + +#define TracyMetalContext(device) tracy::CreateMetalContext(device); +#define TracyMetalDestroy(ctx) tracy::DestroyMetalContext(ctx); +#define TracyMetalContextName(ctx, name, size) ctx->Name(name, size); + +#if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK +# define TracyMetalZone( ctx, name ) TracyMetalNamedZoneS( ctx, ___tracy_gpu_zone, name, TRACY_CALLSTACK, true ) +# define TracyMetalZoneC( ctx, name, color ) TracyMetalNamedZoneCS( ctx, ___tracy_gpu_zone, name, color, TRACY_CALLSTACK, true ) +# define TracyMetalNamedZone( ctx, varname, name, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::MetalZoneScope varname( ctx, &TracyConcat(__tracy_gpu_source_location,TracyLine), TRACY_CALLSTACK, active ); +# define TracyMetalNamedZoneC( ctx, varname, name, color, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::MetalZoneScope varname( ctx, &TracyConcat(__tracy_gpu_source_location,TracyLine), TRACY_CALLSTACK, active ); +# define TracyMetalZoneTransient(ctx, varname, name, active) TracyMetalZoneTransientS(ctx, varname, cmdList, name, TRACY_CALLSTACK, active) +#else +# define TracyMetalZone( ctx, cmdEnc, name ) TracyMetalNamedZone( ctx, ___tracy_gpu_zone, cmdEnc, name, true ) +# define TracyMetalZoneC( ctx, name, color ) TracyMetalNamedZoneC( ctx, ___tracy_gpu_zone, name, color, true ) +# define TracyMetalNamedZone( ctx, varname, cmdEnc, name, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::MetalZoneScope varname( ctx, cmdEnc, &TracyConcat(__tracy_gpu_source_location,TracyLine), active ); +# define TracyMetalNamedZoneC( ctx, varname, name, color, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::MetalZoneScope varname( ctx, &TracyConcat(__tracy_gpu_source_location,TracyLine), active ); +# define TracyMetalZoneTransient(ctx, varname, name, active) tracy::MetalZoneScope varname{ ctx, TracyLine, TracyFile, strlen(TracyFile), TracyFunction, strlen(TracyFunction), name, strlen(name), active }; +#endif + +#ifdef TRACY_HAS_CALLSTACK +# define TracyMetalZoneS( ctx, name, depth ) TracyMetalNamedZoneS( ctx, ___tracy_gpu_zone, name, depth, true ) +# define TracyMetalZoneCS( ctx, name, color, depth ) TracyMetalNamedZoneCS( ctx, ___tracy_gpu_zone, name, color, depth, true ) +# define TracyMetalNamedZoneS( ctx, varname, name, depth, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::MetalZoneScope varname( ctx, &TracyConcat(__tracy_gpu_source_location,TracyLine), depth, active ); +# define TracyMetalNamedZoneCS( ctx, varname, name, color, depth, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::MetalZoneScope varname( ctx, &TracyConcat(__tracy_gpu_source_location,TracyLine), depth, active ); +# define TracyMetalZoneTransientS(ctx, varname, name, depth, active) tracy::MetalZoneScope varname{ ctx, TracyLine, TracyFile, strlen(TracyFile), TracyFunction, strlen(TracyFunction), name, strlen(name), depth, active }; +#else +# define TracyMetalZoneS( ctx, name, depth, active ) TracyMetalZone( ctx, name ) +# define TracyMetalZoneCS( ctx, name, color, depth, active ) TracyMetalZoneC( name, color ) +# define TracyMetalNamedZoneS( ctx, varname, name, depth, active ) TracyMetalNamedZone( ctx, varname, name, active ) +# define TracyMetalNamedZoneCS( ctx, varname, name, color, depth, active ) TracyMetalNamedZoneC( ctx, varname, name, color, active ) +# define TracyMetalZoneTransientS(ctx, varname, name, depth, active) TracyMetalZoneTransient(ctx, varname, name, active) +#endif + +#define TracyMetalCollect( ctx ) ctx->Collect(); + +#endif + +#endif//__TRACYMETAL_HMM__ From 47180dbf7ff57c12fbe462c8eb37a0d2336f230f Mon Sep 17 00:00:00 2001 From: Marcos Slomp Date: Wed, 7 Feb 2024 08:51:22 -0800 Subject: [PATCH 02/27] basing metal zone scopes on MTLComputePassDescriptor --- public/tracy/TracyMetal.hmm | 101 ++++++++++++++++++++++++++---------- 1 file changed, 73 insertions(+), 28 deletions(-) diff --git a/public/tracy/TracyMetal.hmm b/public/tracy/TracyMetal.hmm index c7cbc3e7e..43e938897 100644 --- a/public/tracy/TracyMetal.hmm +++ b/public/tracy/TracyMetal.hmm @@ -3,7 +3,7 @@ #ifndef TRACY_ENABLE -#define TracyMetalContext(device,queue) nullptr +#define TracyMetalContext(device) nullptr #define TracyMetalDestroy(ctx) #define TracyMetalContextName(ctx, name, size) @@ -63,13 +63,25 @@ public: { TracyMetalPanic("device is nil.", return); } - if (![m_device supportsCounterSampling:MTLCounterSamplingPointAtDispatchBoundary]) + if (![m_device supportsCounterSampling:MTLCounterSamplingPointAtStageBoundary]) { - TracyMetalPanic("timestamp sampling at compute dispatch boundary is not supported.", return); + TracyMetalPanic("timestamp sampling at pipeline stage boundary is not supported.", return); } if (![m_device supportsCounterSampling:MTLCounterSamplingPointAtDrawBoundary]) { - TracyMetalPanic("timestamp sampling at draw boundary is not supported.", return); + TracyMetalPanic("timestamp sampling at draw call boundary is not supported.", /* return */); + } + if (![m_device supportsCounterSampling:MTLCounterSamplingPointAtBlitBoundary]) + { + TracyMetalPanic("timestamp sampling at blit boundary is not supported.", /* return */); + } + if (![m_device supportsCounterSampling:MTLCounterSamplingPointAtDispatchBoundary]) + { + TracyMetalPanic("timestamp sampling at compute dispatch boundary is not supported.", /* return */); + } + if (![m_device supportsCounterSampling:MTLCounterSamplingPointAtTileDispatchBoundary]) + { + TracyMetalPanic("timestamp sampling at tile dispatch boundary is not supported.", /* return */); } id timestampCounterSet = nil; for (id counterSet in m_device.counterSets) @@ -95,8 +107,8 @@ public: id counterSampleBuffer = [m_device newCounterSampleBufferWithDescriptor:sampleDescriptor error:&error]; if (error != nil) { - NSLog(error.localizedDescription); - NSLog(error.localizedFailureReason); + NSLog(@"%@", error.localizedDescription); + NSLog(@"%@", error.localizedFailureReason); TracyMetalPanic("unable to create sample buffer for timestamp counters.", return); } m_counterSampleBuffer = counterSampleBuffer; @@ -124,6 +136,23 @@ public: { } + static MetalCtx* Create(id device) + { + auto ctx = static_cast(tracy_malloc(sizeof(MetalCtx))); + new (ctx) MetalCtx(device); + if (ctx->m_contextId == 255) + { + TracyMetalPanic("error during context creation.", Destroy(ctx); return nullptr); + } + return ctx; + } + + static void Destroy(MetalCtx* ctx) + { + ctx->~MetalCtx(); + tracy_free(ctx); + } + void Name( const char* name, uint16_t len ) { auto ptr = (char*)tracy_malloc( len ); @@ -179,7 +208,7 @@ public: TracyMetalPanic("too many pending timestamp queries.", return false;); } - NSRange range = { }; + NSRange range = { begin, latestCheckpoint }; NSData* data = [m_counterSampleBuffer resolveCounterRange:range]; NSUInteger numResolvedTimestamps = data.length / sizeof(MTLCounterResultTimestamp); MTLCounterResultTimestamp* timestamps = (MTLCounterResultTimestamp *)(data.bytes); @@ -188,7 +217,7 @@ public: TracyMetalPanic("unable to resolve timestamps.", return false;); } - for (auto i = begin; i != latestCheckpoint; ++i) + for (auto i = 0; i < numResolvedTimestamps; ++i) { uint32_t k = RingIndex(i); MTLTimestamp& timestamp = timestamps[k].timestamp; @@ -226,9 +255,9 @@ private: return static_cast(count); } - tracy_force_inline unsigned int NextQueryId() + tracy_force_inline unsigned int NextQueryId(int n=1) { - auto id = m_queryCounter.fetch_add(1); + auto id = m_queryCounter.fetch_add(n); if (RingCount(m_previousCheckpoint, id) >= MaxQueries) { TracyMetalPanic("too many pending timestamp queries."); @@ -260,6 +289,33 @@ private: class MetalZoneScope { public: + tracy_force_inline MetalZoneScope( MetalCtx* ctx, MTLComputePassDescriptor* desc, const SourceLocationData* srcloc, bool is_active ) +#ifdef TRACY_ON_DEMAND + : m_active( is_active && GetProfiler().IsConnected() ) +#else + : m_active( is_active ) +#endif + { + if ( !m_active ) return; + if (desc == nil) TracyMetalPanic("pass descriptor is nil."); + m_ctx = ctx; + + auto queryId = m_queryId = ctx->NextQueryId(2); + desc.sampleBufferAttachments[0].sampleBuffer = ctx->m_counterSampleBuffer; + desc.sampleBufferAttachments[0].startOfEncoderSampleIndex = queryId; + desc.sampleBufferAttachments[0].endOfEncoderSampleIndex = queryId+1; + + auto* item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::GpuZoneBeginSerial ); + MemWrite( &item->gpuZoneBegin.cpuTime, Profiler::GetTime() ); + MemWrite( &item->gpuZoneBegin.srcloc, (uint64_t)srcloc ); + MemWrite( &item->gpuZoneBegin.thread, GetThreadHandle() ); + MemWrite( &item->gpuZoneBegin.queryId, uint16_t( queryId ) ); + MemWrite( &item->gpuZoneBegin.context, ctx->GetContextId() ); + Profiler::QueueSerialFinish(); + } + +#if 0 tracy_force_inline MetalZoneScope( MetalCtx* ctx, id cmdEncoder, const SourceLocationData* srcloc, bool is_active ) #ifdef TRACY_ON_DEMAND : m_active( is_active && GetProfiler().IsConnected() ) @@ -271,7 +327,7 @@ public: m_ctx = ctx; m_cmdEncoder = cmdEncoder; - const auto queryId = ctx->NextQueryId(); + auto queryId = m_queryId = ctx->NextQueryId(); [m_cmdEncoder sampleCountersInBuffer:m_ctx->m_counterSampleBuffer atSampleIndex:queryId withBarrier:YES]; auto* item = Profiler::QueueSerial(); @@ -284,13 +340,13 @@ public: Profiler::QueueSerialFinish(); } +#endif tracy_force_inline ~MetalZoneScope() { if( !m_active ) return; - const auto queryId = m_ctx->NextQueryId(); - [m_cmdEncoder sampleCountersInBuffer:m_ctx->m_counterSampleBuffer atSampleIndex:queryId withBarrier:YES]; + auto queryId = m_queryId + 1; auto* item = Profiler::QueueSerial(); MemWrite( &item->hdr.type, QueueType::GpuZoneEndSerial ); @@ -307,27 +363,16 @@ private: MetalCtx* m_ctx; id m_cmdEncoder; + uint32_t m_queryId = 0; }; -static inline MetalCtx* CreateMetalContext(id device) -{ - auto ctx = (MetalCtx*)tracy_malloc( sizeof( MetalCtx ) ); - new (ctx) MetalCtx( device ); - return ctx; -} - -static inline void DestroyMetalContext( MetalCtx* ctx ) -{ - ctx->~MetalCtx(); - tracy_free( ctx ); -} } using TracyMetalCtx = tracy::MetalCtx*; -#define TracyMetalContext(device) tracy::CreateMetalContext(device); -#define TracyMetalDestroy(ctx) tracy::DestroyMetalContext(ctx); -#define TracyMetalContextName(ctx, name, size) ctx->Name(name, size); +#define TracyMetalContext(device) tracy::MetalCtx::Create(device) +#define TracyMetalDestroy(ctx) tracy::MetalCtx::Destroy(ctx) +#define TracyMetalContextName(ctx, name, size) ctx->Name(name, size) #if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK # define TracyMetalZone( ctx, name ) TracyMetalNamedZoneS( ctx, ___tracy_gpu_zone, name, TRACY_CALLSTACK, true ) From 4b695cc3dd933679caca010c3fb373cef23f4afd Mon Sep 17 00:00:00 2001 From: Marcos Slomp Date: Wed, 7 Feb 2024 11:00:14 -0800 Subject: [PATCH 03/27] debugging timestamps... --- public/tracy/TracyMetal.hmm | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/public/tracy/TracyMetal.hmm b/public/tracy/TracyMetal.hmm index 43e938897..3bf129f24 100644 --- a/public/tracy/TracyMetal.hmm +++ b/public/tracy/TracyMetal.hmm @@ -117,6 +117,8 @@ public: MTLTimestamp gpuTimestamp = 0; float period = 1.0f; [m_device sampleTimestamps:&cpuTimestamp gpuTimestamp:&gpuTimestamp]; + fprintf(stdout, "TracyMetal: Calibration: CPU timestamp: %llu\n", cpuTimestamp); + fprintf(stdout, "TracyMetal: Calibration: GPU timestamp: %llu\n", gpuTimestamp); m_contextId = GetGpuCtxCounter().fetch_add(1); @@ -205,31 +207,38 @@ public: if (count >= MaxQueries) { - TracyMetalPanic("too many pending timestamp queries.", return false;); + TracyMetalPanic("Collect: too many pending timestamp queries.", return false;); } - NSRange range = { begin, latestCheckpoint }; + NSRange range = NSMakeRange(begin, count); NSData* data = [m_counterSampleBuffer resolveCounterRange:range]; NSUInteger numResolvedTimestamps = data.length / sizeof(MTLCounterResultTimestamp); MTLCounterResultTimestamp* timestamps = (MTLCounterResultTimestamp *)(data.bytes); if (timestamps == nil) { - TracyMetalPanic("unable to resolve timestamps.", return false;); + TracyMetalPanic("Collect: unable to resolve timestamps.", return false;); } for (auto i = 0; i < numResolvedTimestamps; ++i) { - uint32_t k = RingIndex(i); - MTLTimestamp& timestamp = timestamps[k].timestamp; + MTLTimestamp& timestamp = timestamps[i].timestamp; // TODO: check the value of timestamp: MTLCounterErrorValue, zero, or valid if (timestamp == MTLCounterErrorValue) { + TracyMetalPanic("Collect: invalid timestamp: MTLCounterErrorValue (0xFF..FF)."); + break; + } + if (timestamp == 0) + { + TracyMetalPanic("Collect: invalid timestamp: zero."); break; } m_previousCheckpoint += 1; + uint32_t k = RingIndex(begin + i); + fprintf(stdout, "TracyMetal: timestamp[%d]: %llu\n", k, timestamp); auto* item = Profiler::QueueSerial(); MemWrite(&item->hdr.type, QueueType::GpuTime); - MemWrite(&item->gpuTime.gpuTime, timestamp); + MemWrite(&item->gpuTime.gpuTime, static_cast(timestamp)); MemWrite(&item->gpuTime.queryId, static_cast(k)); MemWrite(&item->gpuTime.context, m_contextId); Profiler::QueueSerialFinish(); @@ -260,7 +269,7 @@ private: auto id = m_queryCounter.fetch_add(n); if (RingCount(m_previousCheckpoint, id) >= MaxQueries) { - TracyMetalPanic("too many pending timestamp queries."); + TracyMetalPanic("NextQueryId: too many pending timestamp queries."); // #TODO: return some sentinel value; ideally a "hidden" query index } return RingIndex(id); From fd1e60e2fd383faf20de98e388c2cb6507d848e1 Mon Sep 17 00:00:00 2001 From: Marcos Slomp Date: Wed, 7 Feb 2024 14:19:59 -0800 Subject: [PATCH 04/27] fixing cpu timestamp baseline --- public/tracy/TracyMetal.hmm | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/public/tracy/TracyMetal.hmm b/public/tracy/TracyMetal.hmm index 3bf129f24..871dce189 100644 --- a/public/tracy/TracyMetal.hmm +++ b/public/tracy/TracyMetal.hmm @@ -115,10 +115,12 @@ public: MTLTimestamp cpuTimestamp = 0; MTLTimestamp gpuTimestamp = 0; - float period = 1.0f; [m_device sampleTimestamps:&cpuTimestamp gpuTimestamp:&gpuTimestamp]; fprintf(stdout, "TracyMetal: Calibration: CPU timestamp: %llu\n", cpuTimestamp); fprintf(stdout, "TracyMetal: Calibration: GPU timestamp: %llu\n", gpuTimestamp); + cpuTimestamp = Profiler::GetTime(); + fprintf(stdout, "TracyMetal: Calibration: CPU timestamp (profiler): %llu\n", cpuTimestamp); + float period = 1.08f; m_contextId = GetGpuCtxCounter().fetch_add(1); From 94407dbd1f53897c362b9f4dd5eb0db7f0f98ef8 Mon Sep 17 00:00:00 2001 From: Marcos Slomp Date: Thu, 8 Feb 2024 09:45:53 -0800 Subject: [PATCH 05/27] giving up on calibration, for now --- public/tracy/TracyMetal.hmm | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/public/tracy/TracyMetal.hmm b/public/tracy/TracyMetal.hmm index 871dce189..9c89f8419 100644 --- a/public/tracy/TracyMetal.hmm +++ b/public/tracy/TracyMetal.hmm @@ -120,7 +120,7 @@ public: fprintf(stdout, "TracyMetal: Calibration: GPU timestamp: %llu\n", gpuTimestamp); cpuTimestamp = Profiler::GetTime(); fprintf(stdout, "TracyMetal: Calibration: CPU timestamp (profiler): %llu\n", cpuTimestamp); - float period = 1.08f; + float period = 1.0f; m_contextId = GetGpuCtxCounter().fetch_add(1); @@ -131,7 +131,8 @@ public: MemWrite(&item->gpuNewContext.thread, uint32_t(0)); // #TODO: why not GetThreadHandle()? MemWrite(&item->gpuNewContext.period, period); MemWrite(&item->gpuNewContext.context, m_contextId); - MemWrite(&item->gpuNewContext.flags, GpuContextCalibration); + //MemWrite(&item->gpuNewContext.flags, GpuContextCalibration); + MemWrite(&item->gpuNewContext.flags, GpuContextFlags(0)); MemWrite(&item->gpuNewContext.type, GpuContextType::Metal); Profiler::QueueSerialFinish(); // TODO: DeferItem() for TRACY_ON_DEMAND } From 3d69c485bea4a3a913c2efd993516cee2654d17c Mon Sep 17 00:00:00 2001 From: Marcos Slomp Date: Thu, 8 Feb 2024 09:46:17 -0800 Subject: [PATCH 06/27] fixing timestamp mapping range --- public/tracy/TracyMetal.hmm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/tracy/TracyMetal.hmm b/public/tracy/TracyMetal.hmm index 9c89f8419..c89c92250 100644 --- a/public/tracy/TracyMetal.hmm +++ b/public/tracy/TracyMetal.hmm @@ -213,7 +213,7 @@ public: TracyMetalPanic("Collect: too many pending timestamp queries.", return false;); } - NSRange range = NSMakeRange(begin, count); + NSRange range = NSMakeRange(RingIndex(begin), count); NSData* data = [m_counterSampleBuffer resolveCounterRange:range]; NSUInteger numResolvedTimestamps = data.length / sizeof(MTLCounterResultTimestamp); MTLCounterResultTimestamp* timestamps = (MTLCounterResultTimestamp *)(data.bytes); From 6d04913b4aa338f796420b42cd9ecf255e106e15 Mon Sep 17 00:00:00 2001 From: Marcos Slomp Date: Thu, 8 Feb 2024 09:46:31 -0800 Subject: [PATCH 07/27] stale comments --- public/tracy/TracyMetal.hmm | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/public/tracy/TracyMetal.hmm b/public/tracy/TracyMetal.hmm index c89c92250..bd9d4eb0c 100644 --- a/public/tracy/TracyMetal.hmm +++ b/public/tracy/TracyMetal.hmm @@ -225,13 +225,12 @@ public: for (auto i = 0; i < numResolvedTimestamps; ++i) { MTLTimestamp& timestamp = timestamps[i].timestamp; - // TODO: check the value of timestamp: MTLCounterErrorValue, zero, or valid if (timestamp == MTLCounterErrorValue) { TracyMetalPanic("Collect: invalid timestamp: MTLCounterErrorValue (0xFF..FF)."); break; } - if (timestamp == 0) + if (timestamp == 0) // zero is apparently also considered "invalid"... { TracyMetalPanic("Collect: invalid timestamp: zero."); break; From c6cc424991dd411560a940bf5204c355c268f43d Mon Sep 17 00:00:00 2001 From: Marcos Slomp Date: Thu, 8 Feb 2024 15:14:14 -0800 Subject: [PATCH 08/27] collecting/resolving timestamps in pairs --- public/tracy/TracyMetal.hmm | 48 ++++++++++++++++++++++++++++--------- 1 file changed, 37 insertions(+), 11 deletions(-) diff --git a/public/tracy/TracyMetal.hmm b/public/tracy/TracyMetal.hmm index bd9d4eb0c..a3c107cc4 100644 --- a/public/tracy/TracyMetal.hmm +++ b/public/tracy/TracyMetal.hmm @@ -210,6 +210,7 @@ public: if (count >= MaxQueries) { + fprintf(stdout, "TracyMetal: Collect: FULL [%llu, %llu] (%d)\n", begin, latestCheckpoint, count); TracyMetalPanic("Collect: too many pending timestamp queries.", return false;); } @@ -222,29 +223,54 @@ public: TracyMetalPanic("Collect: unable to resolve timestamps.", return false;); } - for (auto i = 0; i < numResolvedTimestamps; ++i) + if (numResolvedTimestamps != count) { - MTLTimestamp& timestamp = timestamps[i].timestamp; - if (timestamp == MTLCounterErrorValue) + fprintf(stdout, "TracyMetal: Collect: numResolvedTimestamps != count : %d != %d\n", numResolvedTimestamps, count); + } + + for (auto i = 0; i < numResolvedTimestamps; i += 2) + { + static MTLTimestamp lastValidTimestamp = 0; + MTLTimestamp& t_start = timestamps[i+0].timestamp; + MTLTimestamp& t_end = timestamps[i+1].timestamp; + uint32_t k = RingIndex(begin + i); + fprintf(stdout, "TracyMetal: Collect: timestamp[%d] = %llu | timestamp[%d] = %llu | diff = %llu\n", k, t_start, k+1, t_end, (t_end - t_start)); + if (t_start == MTLCounterErrorValue) { TracyMetalPanic("Collect: invalid timestamp: MTLCounterErrorValue (0xFF..FF)."); break; } - if (timestamp == 0) // zero is apparently also considered "invalid"... + if (t_start == 0) // zero is apparently also considered "invalid"... { - TracyMetalPanic("Collect: invalid timestamp: zero."); - break; + static int HACK_retries = 0; + if (++HACK_retries > 8) { + fprintf(stdout, "TracyMetal: Collect: giving up...\n", k, t_start, k+1, t_end); + t_start = t_end = lastValidTimestamp + 10; + HACK_retries = 0; + } else { + TracyMetalPanic("Collect: invalid timestamp: zero."); + break; + } } - m_previousCheckpoint += 1; - uint32_t k = RingIndex(begin + i); - fprintf(stdout, "TracyMetal: timestamp[%d]: %llu\n", k, timestamp); + m_previousCheckpoint += 2; + { auto* item = Profiler::QueueSerial(); MemWrite(&item->hdr.type, QueueType::GpuTime); - MemWrite(&item->gpuTime.gpuTime, static_cast(timestamp)); + MemWrite(&item->gpuTime.gpuTime, static_cast(t_start)); MemWrite(&item->gpuTime.queryId, static_cast(k)); MemWrite(&item->gpuTime.context, m_contextId); Profiler::QueueSerialFinish(); - timestamp = MTLCounterErrorValue; // "reset" timestamp + } + { + auto* item = Profiler::QueueSerial(); + MemWrite(&item->hdr.type, QueueType::GpuTime); + MemWrite(&item->gpuTime.gpuTime, static_cast(t_end)); + MemWrite(&item->gpuTime.queryId, static_cast(k+1)); + MemWrite(&item->gpuTime.context, m_contextId); + Profiler::QueueSerialFinish(); + } + lastValidTimestamp = t_end; + t_start = t_end = MTLCounterErrorValue; // "reset" timestamps } //RecalibrateClocks(); // to account for drift From c3744403791ee3fad0de23e270915bc4791d0f6f Mon Sep 17 00:00:00 2001 From: Marcos Slomp Date: Thu, 8 Feb 2024 15:14:41 -0800 Subject: [PATCH 09/27] adding blit pass and render pass interfaces --- public/tracy/TracyMetal.hmm | 54 +++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/public/tracy/TracyMetal.hmm b/public/tracy/TracyMetal.hmm index a3c107cc4..cbe12fe93 100644 --- a/public/tracy/TracyMetal.hmm +++ b/public/tracy/TracyMetal.hmm @@ -352,6 +352,60 @@ public: Profiler::QueueSerialFinish(); } + tracy_force_inline MetalZoneScope( MetalCtx* ctx, MTLBlitPassDescriptor* desc, const SourceLocationData* srcloc, bool is_active ) +#ifdef TRACY_ON_DEMAND + : m_active( is_active && GetProfiler().IsConnected() ) +#else + : m_active( is_active ) +#endif + { + if ( !m_active ) return; + if (desc == nil) TracyMetalPanic("pass descriptor is nil."); + m_ctx = ctx; + + auto queryId = m_queryId = ctx->NextQueryId(2); + desc.sampleBufferAttachments[0].sampleBuffer = ctx->m_counterSampleBuffer; + desc.sampleBufferAttachments[0].startOfEncoderSampleIndex = queryId; + desc.sampleBufferAttachments[0].endOfEncoderSampleIndex = queryId+1; + + auto* item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::GpuZoneBeginSerial ); + MemWrite( &item->gpuZoneBegin.cpuTime, Profiler::GetTime() ); + MemWrite( &item->gpuZoneBegin.srcloc, (uint64_t)srcloc ); + MemWrite( &item->gpuZoneBegin.thread, GetThreadHandle() ); + MemWrite( &item->gpuZoneBegin.queryId, uint16_t( queryId ) ); + MemWrite( &item->gpuZoneBegin.context, ctx->GetContextId() ); + Profiler::QueueSerialFinish(); + } + + tracy_force_inline MetalZoneScope( MetalCtx* ctx, MTLRenderPassDescriptor* desc, const SourceLocationData* srcloc, bool is_active ) +#ifdef TRACY_ON_DEMAND + : m_active( is_active && GetProfiler().IsConnected() ) +#else + : m_active( is_active ) +#endif + { + if ( !m_active ) return; + if (desc == nil) TracyMetalPanic("pass descriptor is nil."); + m_ctx = ctx; + + auto queryId = m_queryId = ctx->NextQueryId(2); + desc.sampleBufferAttachments[0].sampleBuffer = ctx->m_counterSampleBuffer; + desc.sampleBufferAttachments[0].startOfVertexSampleIndex = queryId; + desc.sampleBufferAttachments[0].endOfVertexSampleIndex = MTLCounterDontSample; + desc.sampleBufferAttachments[0].startOfFragmentSampleIndex = MTLCounterDontSample; + desc.sampleBufferAttachments[0].endOfFragmentSampleIndex = queryId+1; + + auto* item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::GpuZoneBeginSerial ); + MemWrite( &item->gpuZoneBegin.cpuTime, Profiler::GetTime() ); + MemWrite( &item->gpuZoneBegin.srcloc, (uint64_t)srcloc ); + MemWrite( &item->gpuZoneBegin.thread, GetThreadHandle() ); + MemWrite( &item->gpuZoneBegin.queryId, uint16_t( queryId ) ); + MemWrite( &item->gpuZoneBegin.context, ctx->GetContextId() ); + Profiler::QueueSerialFinish(); + } + #if 0 tracy_force_inline MetalZoneScope( MetalCtx* ctx, id cmdEncoder, const SourceLocationData* srcloc, bool is_active ) #ifdef TRACY_ON_DEMAND From a9538799dadbfe8341dc37e6202c769c79cc8900 Mon Sep 17 00:00:00 2001 From: Marcos Slomp Date: Thu, 8 Feb 2024 15:14:52 -0800 Subject: [PATCH 10/27] more debugging --- public/tracy/TracyMetal.hmm | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/public/tracy/TracyMetal.hmm b/public/tracy/TracyMetal.hmm index cbe12fe93..f529eeed6 100644 --- a/public/tracy/TracyMetal.hmm +++ b/public/tracy/TracyMetal.hmm @@ -295,8 +295,10 @@ private: tracy_force_inline unsigned int NextQueryId(int n=1) { auto id = m_queryCounter.fetch_add(n); - if (RingCount(m_previousCheckpoint, id) >= MaxQueries) + auto count = RingCount(m_previousCheckpoint, id); + if (count >= MaxQueries) { + fprintf(stdout, "TracyMetal: NextQueryId: FULL [%llu, %llu] (%d)\n", m_previousCheckpoint.load(), id, count); TracyMetalPanic("NextQueryId: too many pending timestamp queries."); // #TODO: return some sentinel value; ideally a "hidden" query index } From ad2c2efacd71f1b3089a7298373a892ed87e5104 Mon Sep 17 00:00:00 2001 From: Marcos Slomp Date: Fri, 9 Feb 2024 14:13:59 -0800 Subject: [PATCH 11/27] fixing collect wrap-around --- public/tracy/TracyMetal.hmm | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/public/tracy/TracyMetal.hmm b/public/tracy/TracyMetal.hmm index f529eeed6..e95cedb7f 100644 --- a/public/tracy/TracyMetal.hmm +++ b/public/tracy/TracyMetal.hmm @@ -208,6 +208,11 @@ public: return true; } + if (RingIndex(begin) + count > RingSize()) + { + count = RingSize() - RingIndex(begin); + } + if (count >= MaxQueries) { fprintf(stdout, "TracyMetal: Collect: FULL [%llu, %llu] (%d)\n", begin, latestCheckpoint, count); @@ -292,6 +297,11 @@ private: return static_cast(count); } + tracy_force_inline uint32_t RingSize() const + { + return MaxQueries; + } + tracy_force_inline unsigned int NextQueryId(int n=1) { auto id = m_queryCounter.fetch_add(n); From b11e66594dc68c1d5d2924b7cf1a848ed0501a2d Mon Sep 17 00:00:00 2001 From: Marcos Slomp Date: Fri, 9 Feb 2024 14:14:10 -0800 Subject: [PATCH 12/27] debugging --- public/tracy/TracyMetal.hmm | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/public/tracy/TracyMetal.hmm b/public/tracy/TracyMetal.hmm index e95cedb7f..0a2de4be2 100644 --- a/public/tracy/TracyMetal.hmm +++ b/public/tracy/TracyMetal.hmm @@ -176,7 +176,7 @@ public: bool Collect() { - ZoneScopedC(Color::Red4); + ZoneScopedNC("TracyMetal::Collect", Color::Red4); #ifdef TRACY_ON_DEMAND if (!GetProfiler().IsConnected()) @@ -197,14 +197,16 @@ public: uintptr_t begin = m_previousCheckpoint.load(); uintptr_t latestCheckpoint = m_queryCounter.load(); // TODO: MTLEvent? MTLFence?; uint32_t count = RingCount(begin, latestCheckpoint); + ZoneValue(begin); + ZoneValue(latestCheckpoint); if (count == 0) // no pending timestamp queries { - uintptr_t nextCheckpoint = m_queryCounter.load(); - if (nextCheckpoint != latestCheckpoint) - { - // TODO: signal event / fence now? - } + //uintptr_t nextCheckpoint = m_queryCounter.load(); + //if (nextCheckpoint != latestCheckpoint) + //{ + // // TODO: signal event / fence now? + //} return true; } @@ -212,10 +214,11 @@ public: { count = RingSize() - RingIndex(begin); } + ZoneValue(count); if (count >= MaxQueries) { - fprintf(stdout, "TracyMetal: Collect: FULL [%llu, %llu] (%d)\n", begin, latestCheckpoint, count); + fprintf(stdout, "TracyMetal: Collect: FULL [%llu, %llu] (%u)\n", begin, latestCheckpoint, count); TracyMetalPanic("Collect: too many pending timestamp queries.", return false;); } @@ -230,7 +233,7 @@ public: if (numResolvedTimestamps != count) { - fprintf(stdout, "TracyMetal: Collect: numResolvedTimestamps != count : %d != %d\n", numResolvedTimestamps, count); + fprintf(stdout, "TracyMetal: Collect: numResolvedTimestamps != count : %u != %u\n", numResolvedTimestamps, count); } for (auto i = 0; i < numResolvedTimestamps; i += 2) @@ -239,7 +242,7 @@ public: MTLTimestamp& t_start = timestamps[i+0].timestamp; MTLTimestamp& t_end = timestamps[i+1].timestamp; uint32_t k = RingIndex(begin + i); - fprintf(stdout, "TracyMetal: Collect: timestamp[%d] = %llu | timestamp[%d] = %llu | diff = %llu\n", k, t_start, k+1, t_end, (t_end - t_start)); + fprintf(stdout, "TracyMetal: Collect: timestamp[%u] = %llu | timestamp[%u] = %llu | diff = %llu\n", k, t_start, k+1, t_end, (t_end - t_start)); if (t_start == MTLCounterErrorValue) { TracyMetalPanic("Collect: invalid timestamp: MTLCounterErrorValue (0xFF..FF)."); @@ -250,7 +253,7 @@ public: static int HACK_retries = 0; if (++HACK_retries > 8) { fprintf(stdout, "TracyMetal: Collect: giving up...\n", k, t_start, k+1, t_end); - t_start = t_end = lastValidTimestamp + 10; + t_start = t_end = lastValidTimestamp + 100; HACK_retries = 0; } else { TracyMetalPanic("Collect: invalid timestamp: zero."); @@ -304,11 +307,13 @@ private: tracy_force_inline unsigned int NextQueryId(int n=1) { + ZoneScopedNC("TracyMetal::NextQueryId", tracy::Color::LightCoral); auto id = m_queryCounter.fetch_add(n); + ZoneValue(id); auto count = RingCount(m_previousCheckpoint, id); if (count >= MaxQueries) { - fprintf(stdout, "TracyMetal: NextQueryId: FULL [%llu, %llu] (%d)\n", m_previousCheckpoint.load(), id, count); + fprintf(stdout, "TracyMetal: NextQueryId: FULL [%llu, %llu] (%u)\n", m_previousCheckpoint.load(), id, count); TracyMetalPanic("NextQueryId: too many pending timestamp queries."); // #TODO: return some sentinel value; ideally a "hidden" query index } From 04cf494d9cb33fcea0bc01015a6b8ab92ee012f0 Mon Sep 17 00:00:00 2001 From: Marcos Slomp Date: Mon, 12 Feb 2024 13:55:21 -0800 Subject: [PATCH 13/27] bugfixes --- public/tracy/TracyMetal.hmm | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/public/tracy/TracyMetal.hmm b/public/tracy/TracyMetal.hmm index 0a2de4be2..4af7ad8d2 100644 --- a/public/tracy/TracyMetal.hmm +++ b/public/tracy/TracyMetal.hmm @@ -248,11 +248,11 @@ public: TracyMetalPanic("Collect: invalid timestamp: MTLCounterErrorValue (0xFF..FF)."); break; } - if (t_start == 0) // zero is apparently also considered "invalid"... + if (t_start == 0 || t_end == 0) // zero is apparently also considered "invalid"... { static int HACK_retries = 0; if (++HACK_retries > 8) { - fprintf(stdout, "TracyMetal: Collect: giving up...\n", k, t_start, k+1, t_end); + fprintf(stdout, "TracyMetal: Collect: giving up...\n"); t_start = t_end = lastValidTimestamp + 100; HACK_retries = 0; } else { @@ -280,6 +280,7 @@ public: lastValidTimestamp = t_end; t_start = t_end = MTLCounterErrorValue; // "reset" timestamps } + ZoneValue(RingCount(begin, m_previousCheckpoint.load())); //RecalibrateClocks(); // to account for drift From 91ca0f2f8962563a52380ae8007cf322ad70d4b1 Mon Sep 17 00:00:00 2001 From: Marcos Slomp Date: Fri, 10 May 2024 10:31:40 -0700 Subject: [PATCH 14/27] improved panic macro (supports print args) --- public/tracy/TracyMetal.hmm | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/public/tracy/TracyMetal.hmm b/public/tracy/TracyMetal.hmm index 4af7ad8d2..ebd8a0194 100644 --- a/public/tracy/TracyMetal.hmm +++ b/public/tracy/TracyMetal.hmm @@ -43,7 +43,16 @@ using TracyMetalCtx = void*; // ok to import if in obj-c code #import -#define TracyMetalPanic(msg, ...) do { assert(false && "TracyMetal: " msg); TracyMessageLC("TracyMetal: " msg, tracy::Color::Red4); fprintf(stderr, "TracyMetal: %s\n", msg); __VA_ARGS__; } while(false); +#define VA_ARGS(...) , ##__VA_ARGS__ + +#define TracyMetalPanic(ret, msg, ...) do { \ + char buffer [1024]; \ + snprintf(buffer, sizeof(buffer), "TracyMetal: " msg VA_ARGS(__VA_ARGS__)); \ + TracyMessageC(buffer, strlen(buffer), tracy::Color::OrangeRed); \ + fprintf(stderr, "%s\n", buffer); \ + assert(false && "TracyMetal: " msg); \ + ret; \ + } while(false); namespace tracy From 5f09d454be4c092ffab78362b5426f267e4f3fe6 Mon Sep 17 00:00:00 2001 From: Marcos Slomp Date: Fri, 10 May 2024 10:33:36 -0700 Subject: [PATCH 15/27] blargh --- public/tracy/TracyMetal.hmm | 82 ++++++++++++++++++++++--------------- 1 file changed, 50 insertions(+), 32 deletions(-) diff --git a/public/tracy/TracyMetal.hmm b/public/tracy/TracyMetal.hmm index ebd8a0194..b2974a6a0 100644 --- a/public/tracy/TracyMetal.hmm +++ b/public/tracy/TracyMetal.hmm @@ -68,29 +68,31 @@ public: MetalCtx(id device) : m_device(device) { + ZoneScopedNC("TracyMetalCtx", tracy::Color::Red4); + if (m_device == nil) { - TracyMetalPanic("device is nil.", return); + TracyMetalPanic(return, "device is nil."); } if (![m_device supportsCounterSampling:MTLCounterSamplingPointAtStageBoundary]) { - TracyMetalPanic("timestamp sampling at pipeline stage boundary is not supported.", return); + TracyMetalPanic(return, "ERROR: timestamp sampling at pipeline stage boundary is not supported."); } if (![m_device supportsCounterSampling:MTLCounterSamplingPointAtDrawBoundary]) { - TracyMetalPanic("timestamp sampling at draw call boundary is not supported.", /* return */); + TracyMetalPanic(, "WARNING: timestamp sampling at draw call boundary is not supported."); } if (![m_device supportsCounterSampling:MTLCounterSamplingPointAtBlitBoundary]) { - TracyMetalPanic("timestamp sampling at blit boundary is not supported.", /* return */); + TracyMetalPanic(, "WARNING: timestamp sampling at blit boundary is not supported."); } if (![m_device supportsCounterSampling:MTLCounterSamplingPointAtDispatchBoundary]) { - TracyMetalPanic("timestamp sampling at compute dispatch boundary is not supported.", /* return */); + TracyMetalPanic(, "WARNING: timestamp sampling at compute dispatch boundary is not supported."); } if (![m_device supportsCounterSampling:MTLCounterSamplingPointAtTileDispatchBoundary]) { - TracyMetalPanic("timestamp sampling at tile dispatch boundary is not supported.", /* return */); + TracyMetalPanic(, "WARNING: timestamp sampling at tile dispatch boundary is not supported."); } id timestampCounterSet = nil; for (id counterSet in m_device.counterSets) @@ -103,7 +105,7 @@ public: } if (timestampCounterSet == nil) { - TracyMetalPanic("timestamp counters are not supported on the platform.", return); + TracyMetalPanic(return, "ERROR: timestamp counters are not supported on the platform."); } MTLCounterSampleBufferDescriptor* sampleDescriptor = [[MTLCounterSampleBufferDescriptor alloc] init]; @@ -118,17 +120,19 @@ public: { NSLog(@"%@", error.localizedDescription); NSLog(@"%@", error.localizedFailureReason); - TracyMetalPanic("unable to create sample buffer for timestamp counters.", return); + TracyMetalPanic(return, "ERROR: unable to create sample buffer for timestamp counters."); } m_counterSampleBuffer = counterSampleBuffer; + + m_timestampRequestTime.resize(MaxQueries); MTLTimestamp cpuTimestamp = 0; MTLTimestamp gpuTimestamp = 0; [m_device sampleTimestamps:&cpuTimestamp gpuTimestamp:&gpuTimestamp]; - fprintf(stdout, "TracyMetal: Calibration: CPU timestamp: %llu\n", cpuTimestamp); - fprintf(stdout, "TracyMetal: Calibration: GPU timestamp: %llu\n", gpuTimestamp); + TracyMetalPanic(, "Calibration: CPU timestamp (Metal): %llu", cpuTimestamp); + TracyMetalPanic(, "Calibration: GPU timestamp (Metal): %llu", gpuTimestamp); cpuTimestamp = Profiler::GetTime(); - fprintf(stdout, "TracyMetal: Calibration: CPU timestamp (profiler): %llu\n", cpuTimestamp); + TracyMetalPanic(, "Calibration: CPU timestamp (Tracy): %llu", cpuTimestamp); float period = 1.0f; m_contextId = GetGpuCtxCounter().fetch_add(1); @@ -148,6 +152,7 @@ public: ~MetalCtx() { + ZoneScopedNC("~TracyMetalCtx", tracy::Color::Red4); } static MetalCtx* Create(id device) @@ -156,7 +161,8 @@ public: new (ctx) MetalCtx(device); if (ctx->m_contextId == 255) { - TracyMetalPanic("error during context creation.", Destroy(ctx); return nullptr); + Destroy(ctx); + TracyMetalPanic(return nullptr, "ERROR: unable to create context."); } return ctx; } @@ -227,8 +233,7 @@ public: if (count >= MaxQueries) { - fprintf(stdout, "TracyMetal: Collect: FULL [%llu, %llu] (%u)\n", begin, latestCheckpoint, count); - TracyMetalPanic("Collect: too many pending timestamp queries.", return false;); + //TracyMetalPanic(return false, "Collect: FULL! too many pending timestamp queries. [%llu, %llu] (%u)", begin, latestCheckpoint, count); } NSRange range = NSMakeRange(RingIndex(begin), count); @@ -237,12 +242,12 @@ public: MTLCounterResultTimestamp* timestamps = (MTLCounterResultTimestamp *)(data.bytes); if (timestamps == nil) { - TracyMetalPanic("Collect: unable to resolve timestamps.", return false;); + TracyMetalPanic(return false, "Collect: unable to resolve timestamps."); } if (numResolvedTimestamps != count) { - fprintf(stdout, "TracyMetal: Collect: numResolvedTimestamps != count : %u != %u\n", numResolvedTimestamps, count); + TracyMetalPanic(, "Collect: numResolvedTimestamps != count : %u != %u", (uint32_t)numResolvedTimestamps, count); } for (auto i = 0; i < numResolvedTimestamps; i += 2) @@ -251,23 +256,27 @@ public: MTLTimestamp& t_start = timestamps[i+0].timestamp; MTLTimestamp& t_end = timestamps[i+1].timestamp; uint32_t k = RingIndex(begin + i); - fprintf(stdout, "TracyMetal: Collect: timestamp[%u] = %llu | timestamp[%u] = %llu | diff = %llu\n", k, t_start, k+1, t_end, (t_end - t_start)); + //fprintf(stdout, "TracyMetal: Collect: timestamp[%u] = %llu | timestamp[%u] = %llu | diff = %llu\n", k, t_start, k+1, t_end, (t_end - t_start)); if (t_start == MTLCounterErrorValue) { - TracyMetalPanic("Collect: invalid timestamp: MTLCounterErrorValue (0xFF..FF)."); + TracyMetalPanic(, "Collect: invalid timestamp (MTLCounterErrorValue) at %u.", k); break; } if (t_start == 0 || t_end == 0) // zero is apparently also considered "invalid"... { - static int HACK_retries = 0; - if (++HACK_retries > 8) { - fprintf(stdout, "TracyMetal: Collect: giving up...\n"); - t_start = t_end = lastValidTimestamp + 100; - HACK_retries = 0; - } else { - TracyMetalPanic("Collect: invalid timestamp: zero."); + auto checkTime = std::chrono::high_resolution_clock::now(); + auto requestTime = m_timestampRequestTime[k]; + auto ms_in_flight = std::chrono::duration(checkTime-requestTime).count()*1000.0f; + //TracyMetalPanic(, "Collect: invalid timestamp (zero) at %u [%.0fms in flight].", k, ms_in_flight); + const float timeout_ms = 2000.0f; + if (ms_in_flight < timeout_ms) break; - } + static int HACK_retries = 0; + //if (++HACK_retries <= 1000000) + // break; + TracyMetalPanic(, "Collect: giving up on timestamp at %u [%.0fms in flight].", k, ms_in_flight); + t_start = t_end = lastValidTimestamp + 100; + HACK_retries = 0; } m_previousCheckpoint += 2; { @@ -288,6 +297,7 @@ public: } lastValidTimestamp = t_end; t_start = t_end = MTLCounterErrorValue; // "reset" timestamps + TracyFreeN((void*)(uintptr_t)k, "TracyMetalTimestampQueryId"); } ZoneValue(RingCount(begin, m_previousCheckpoint.load())); @@ -323,11 +333,15 @@ private: auto count = RingCount(m_previousCheckpoint, id); if (count >= MaxQueries) { - fprintf(stdout, "TracyMetal: NextQueryId: FULL [%llu, %llu] (%u)\n", m_previousCheckpoint.load(), id, count); - TracyMetalPanic("NextQueryId: too many pending timestamp queries."); + TracyMetalPanic(, "NextQueryId: FULL! too many pending timestamp queries. [%llu, %llu] (%u)", m_previousCheckpoint.load(), id, count); // #TODO: return some sentinel value; ideally a "hidden" query index + return (MaxQueries - n); } - return RingIndex(id); + TracyAllocN((void*)(uintptr_t)RingIndex(id), 2, "TracyMetalTimestampQueryId"); + uint32_t idx = RingIndex(id); + m_timestampRequestTime[idx] = std::chrono::high_resolution_clock::now(); + //TracyMetalPanic(, "NextQueryId: %u (%llu)", idx, id); + return idx; } tracy_force_inline uint8_t GetContextId() const @@ -347,6 +361,8 @@ private: atomic_counter m_previousCheckpoint = 0; atomic_counter::value_type m_nextCheckpoint = 0; + std::vector m_timestampRequestTime; + std::mutex m_collectionMutex; }; @@ -361,7 +377,7 @@ public: #endif { if ( !m_active ) return; - if (desc == nil) TracyMetalPanic("pass descriptor is nil."); + if (desc == nil) TracyMetalPanic(return, "pass descriptor is nil."); m_ctx = ctx; auto queryId = m_queryId = ctx->NextQueryId(2); @@ -387,7 +403,7 @@ public: #endif { if ( !m_active ) return; - if (desc == nil) TracyMetalPanic("pass descriptor is nil."); + if (desc == nil) TracyMetalPanic(return, "pass descriptor is nil."); m_ctx = ctx; auto queryId = m_queryId = ctx->NextQueryId(2); @@ -413,7 +429,7 @@ public: #endif { if ( !m_active ) return; - if (desc == nil) TracyMetalPanic("pass descriptor is nil."); + if (desc == nil) TracyMetalPanic(return, "pass descriptor is nil."); m_ctx = ctx; auto queryId = m_queryId = ctx->NextQueryId(2); @@ -481,6 +497,8 @@ private: MetalCtx* m_ctx; id m_cmdEncoder; + +public: uint32_t m_queryId = 0; }; From cef49c22699b1980fb506441b0b54b7122172fe8 Mon Sep 17 00:00:00 2001 From: Marcos Slomp Date: Tue, 14 May 2024 16:12:36 -0700 Subject: [PATCH 16/27] blarg again... --- public/tracy/TracyMetal.hmm | 155 ++++++++++++++++++++++++------------ 1 file changed, 105 insertions(+), 50 deletions(-) diff --git a/public/tracy/TracyMetal.hmm b/public/tracy/TracyMetal.hmm index b2974a6a0..2ffefd518 100644 --- a/public/tracy/TracyMetal.hmm +++ b/public/tracy/TracyMetal.hmm @@ -43,6 +43,8 @@ using TracyMetalCtx = void*; // ok to import if in obj-c code #import +#define TRACY_METAL_DEBUG_NO_WRAPAROUND (0) + #define VA_ARGS(...) , ##__VA_ARGS__ #define TracyMetalPanic(ret, msg, ...) do { \ @@ -70,6 +72,9 @@ public: { ZoneScopedNC("TracyMetalCtx", tracy::Color::Red4); + TracyMetalPanic(, "MTLCounterErrorValue = 0x%llx", MTLCounterErrorValue); + TracyMetalPanic(, "MTLCounterDontSample = 0x%llx", MTLCounterDontSample); + if (m_device == nil) { TracyMetalPanic(return, "device is nil."); @@ -125,6 +130,7 @@ public: m_counterSampleBuffer = counterSampleBuffer; m_timestampRequestTime.resize(MaxQueries); + go_horse.resize(MaxQueries); MTLTimestamp cpuTimestamp = 0; MTLTimestamp gpuTimestamp = 0; @@ -211,6 +217,10 @@ public: uintptr_t begin = m_previousCheckpoint.load(); uintptr_t latestCheckpoint = m_queryCounter.load(); // TODO: MTLEvent? MTLFence?; +#if TRACY_METAL_DEBUG_NO_WRAPAROUND + latestCheckpoint = (latestCheckpoint >= MaxQueries) ? MaxQueries : latestCheckpoint; + //if (latestCheckpoint >= MaxQueries) return true; +#endif uint32_t count = RingCount(begin, latestCheckpoint); ZoneValue(begin); ZoneValue(latestCheckpoint); @@ -233,9 +243,11 @@ public: if (count >= MaxQueries) { - //TracyMetalPanic(return false, "Collect: FULL! too many pending timestamp queries. [%llu, %llu] (%u)", begin, latestCheckpoint, count); + TracyMetalPanic(return false, "Collect: FULL! too many pending timestamp queries. [%llu, %llu] (%u)", begin, latestCheckpoint, count); } + //TracyMetalPanic(, "Collect: [%llu, %llu] :: (%u)", begin, latestCheckpoint, count); + NSRange range = NSMakeRange(RingIndex(begin), count); NSData* data = [m_counterSampleBuffer resolveCounterRange:range]; NSUInteger numResolvedTimestamps = data.length / sizeof(MTLCounterResultTimestamp); @@ -257,13 +269,25 @@ public: MTLTimestamp& t_end = timestamps[i+1].timestamp; uint32_t k = RingIndex(begin + i); //fprintf(stdout, "TracyMetal: Collect: timestamp[%u] = %llu | timestamp[%u] = %llu | diff = %llu\n", k, t_start, k+1, t_end, (t_end - t_start)); - if (t_start == MTLCounterErrorValue) + if ((t_start == MTLCounterErrorValue) || (t_end == MTLCounterErrorValue)) { TracyMetalPanic(, "Collect: invalid timestamp (MTLCounterErrorValue) at %u.", k); break; } - if (t_start == 0 || t_end == 0) // zero is apparently also considered "invalid"... + if (go_horse[k+0] == 0) { + TracyMetalPanic(, "Collect: go_horse not ready at %u (%llu).", k+0, begin+i+0); + break; + } + if (go_horse[k+1] == 0) + { + TracyMetalPanic(, "Collect: go_horse not ready at %u (%llu).", k+1, begin+i+1); + break; + } + if ((t_start == 0) || (t_end == 0)) // zero is apparently also considered "invalid"... + { + break; + auto checkTime = std::chrono::high_resolution_clock::now(); auto requestTime = m_timestampRequestTime[k]; auto ms_in_flight = std::chrono::duration(checkTime-requestTime).count()*1000.0f; @@ -278,11 +302,17 @@ public: t_start = t_end = lastValidTimestamp + 100; HACK_retries = 0; } - m_previousCheckpoint += 2; + auto t_start_copy = t_start; + auto t_end_copy = t_end; + t_start = t_end = MTLCounterErrorValue; // "reset" timestamps + t_start = t_end = 0; + m_timestampRequestTime[k+0] += std::chrono::minutes(60); + m_timestampRequestTime[k+1] += std::chrono::minutes(60); + go_horse[k+0] = go_horse[k+1] = 0; { auto* item = Profiler::QueueSerial(); MemWrite(&item->hdr.type, QueueType::GpuTime); - MemWrite(&item->gpuTime.gpuTime, static_cast(t_start)); + MemWrite(&item->gpuTime.gpuTime, static_cast(t_start_copy)); MemWrite(&item->gpuTime.queryId, static_cast(k)); MemWrite(&item->gpuTime.context, m_contextId); Profiler::QueueSerialFinish(); @@ -290,14 +320,16 @@ public: { auto* item = Profiler::QueueSerial(); MemWrite(&item->hdr.type, QueueType::GpuTime); - MemWrite(&item->gpuTime.gpuTime, static_cast(t_end)); + MemWrite(&item->gpuTime.gpuTime, static_cast(t_end_copy)); MemWrite(&item->gpuTime.queryId, static_cast(k+1)); MemWrite(&item->gpuTime.context, m_contextId); Profiler::QueueSerialFinish(); } - lastValidTimestamp = t_end; - t_start = t_end = MTLCounterErrorValue; // "reset" timestamps + TracyMetalPanic(, "zone %u ]", k); + TracyMetalPanic(, "zone %u ]", k+1); + lastValidTimestamp = t_end_copy; TracyFreeN((void*)(uintptr_t)k, "TracyMetalTimestampQueryId"); + m_previousCheckpoint += 2; } ZoneValue(RingCount(begin, m_previousCheckpoint.load())); @@ -329,6 +361,9 @@ private: { ZoneScopedNC("TracyMetal::NextQueryId", tracy::Color::LightCoral); auto id = m_queryCounter.fetch_add(n); +#if TRACY_METAL_DEBUG_NO_WRAPAROUND + if (id >= MaxQueries) return MaxQueries; +#endif ZoneValue(id); auto count = RingCount(m_previousCheckpoint, id); if (count >= MaxQueries) @@ -337,10 +372,11 @@ private: // #TODO: return some sentinel value; ideally a "hidden" query index return (MaxQueries - n); } - TracyAllocN((void*)(uintptr_t)RingIndex(id), 2, "TracyMetalTimestampQueryId"); uint32_t idx = RingIndex(id); + TracyAllocN((void*)(uintptr_t)idx, 2, "TracyMetalTimestampQueryId"); m_timestampRequestTime[idx] = std::chrono::high_resolution_clock::now(); - //TracyMetalPanic(, "NextQueryId: %u (%llu)", idx, id); + //if (id >= MaxQueries) + // TracyMetalPanic(, "NextQueryId: %u (%llu)", idx, id); return idx; } @@ -362,6 +398,7 @@ private: atomic_counter::value_type m_nextCheckpoint = 0; std::vector m_timestampRequestTime; + std::vector go_horse; std::mutex m_collectionMutex; }; @@ -381,18 +418,16 @@ public: m_ctx = ctx; auto queryId = m_queryId = ctx->NextQueryId(2); +#if TRACY_METAL_DEBUG_NO_WRAPAROUND + if (queryId >= MetalCtx::MaxQueries) return; +#endif + desc.sampleBufferAttachments[0].sampleBuffer = ctx->m_counterSampleBuffer; desc.sampleBufferAttachments[0].startOfEncoderSampleIndex = queryId; desc.sampleBufferAttachments[0].endOfEncoderSampleIndex = queryId+1; - auto* item = Profiler::QueueSerial(); - MemWrite( &item->hdr.type, QueueType::GpuZoneBeginSerial ); - MemWrite( &item->gpuZoneBegin.cpuTime, Profiler::GetTime() ); - MemWrite( &item->gpuZoneBegin.srcloc, (uint64_t)srcloc ); - MemWrite( &item->gpuZoneBegin.thread, GetThreadHandle() ); - MemWrite( &item->gpuZoneBegin.queryId, uint16_t( queryId ) ); - MemWrite( &item->gpuZoneBegin.context, ctx->GetContextId() ); - Profiler::QueueSerialFinish(); + SubmitZoneBeginGpu(ctx, queryId, srcloc); + //SubmitZoneEndGpu(ctx, queryId+1); } tracy_force_inline MetalZoneScope( MetalCtx* ctx, MTLBlitPassDescriptor* desc, const SourceLocationData* srcloc, bool is_active ) @@ -407,18 +442,16 @@ public: m_ctx = ctx; auto queryId = m_queryId = ctx->NextQueryId(2); +#if TRACY_METAL_DEBUG_NO_WRAPAROUND + if (queryId >= MetalCtx::MaxQueries) return; +#endif + desc.sampleBufferAttachments[0].sampleBuffer = ctx->m_counterSampleBuffer; desc.sampleBufferAttachments[0].startOfEncoderSampleIndex = queryId; desc.sampleBufferAttachments[0].endOfEncoderSampleIndex = queryId+1; - auto* item = Profiler::QueueSerial(); - MemWrite( &item->hdr.type, QueueType::GpuZoneBeginSerial ); - MemWrite( &item->gpuZoneBegin.cpuTime, Profiler::GetTime() ); - MemWrite( &item->gpuZoneBegin.srcloc, (uint64_t)srcloc ); - MemWrite( &item->gpuZoneBegin.thread, GetThreadHandle() ); - MemWrite( &item->gpuZoneBegin.queryId, uint16_t( queryId ) ); - MemWrite( &item->gpuZoneBegin.context, ctx->GetContextId() ); - Profiler::QueueSerialFinish(); + SubmitZoneBeginGpu(ctx, queryId, srcloc); + //SubmitZoneEndGpu(ctx, queryId+1); } tracy_force_inline MetalZoneScope( MetalCtx* ctx, MTLRenderPassDescriptor* desc, const SourceLocationData* srcloc, bool is_active ) @@ -433,20 +466,18 @@ public: m_ctx = ctx; auto queryId = m_queryId = ctx->NextQueryId(2); +#if TRACY_METAL_DEBUG_NO_WRAPAROUND + if (queryId >= MetalCtx::MaxQueries) return; +#endif + desc.sampleBufferAttachments[0].sampleBuffer = ctx->m_counterSampleBuffer; desc.sampleBufferAttachments[0].startOfVertexSampleIndex = queryId; desc.sampleBufferAttachments[0].endOfVertexSampleIndex = MTLCounterDontSample; desc.sampleBufferAttachments[0].startOfFragmentSampleIndex = MTLCounterDontSample; desc.sampleBufferAttachments[0].endOfFragmentSampleIndex = queryId+1; - auto* item = Profiler::QueueSerial(); - MemWrite( &item->hdr.type, QueueType::GpuZoneBeginSerial ); - MemWrite( &item->gpuZoneBegin.cpuTime, Profiler::GetTime() ); - MemWrite( &item->gpuZoneBegin.srcloc, (uint64_t)srcloc ); - MemWrite( &item->gpuZoneBegin.thread, GetThreadHandle() ); - MemWrite( &item->gpuZoneBegin.queryId, uint16_t( queryId ) ); - MemWrite( &item->gpuZoneBegin.context, ctx->GetContextId() ); - Profiler::QueueSerialFinish(); + SubmitZoneBeginGpu(ctx, queryId, srcloc); + //SubmitZoneEndGpu(ctx, queryId+1); } #if 0 @@ -462,8 +493,37 @@ public: m_cmdEncoder = cmdEncoder; auto queryId = m_queryId = ctx->NextQueryId(); +#if TRACY_METAL_DEBUG_NO_WRAPAROUND + if (queryId >= MetalCtx::MaxQueries) return; +#endif + [m_cmdEncoder sampleCountersInBuffer:m_ctx->m_counterSampleBuffer atSampleIndex:queryId withBarrier:YES]; + SubmitZoneBeginGpu(ctx, queryId, srcloc); + } +#endif + + tracy_force_inline ~MetalZoneScope() + { + if( !m_active ) return; + + auto queryId = m_queryId + 1; + +#if TRACY_METAL_DEBUG_NO_WRAPAROUND + if (queryId >= MetalCtx::MaxQueries) return; +#endif + + SubmitZoneEndGpu(m_ctx, queryId); + } + +private: + const bool m_active; + + MetalCtx* m_ctx; + id m_cmdEncoder; + + static void SubmitZoneBeginGpu(MetalCtx* ctx, uint32_t queryId, const SourceLocationData* srcloc) + { auto* item = Profiler::QueueSerial(); MemWrite( &item->hdr.type, QueueType::GpuZoneBeginSerial ); MemWrite( &item->gpuZoneBegin.cpuTime, Profiler::GetTime() ); @@ -471,33 +531,28 @@ public: MemWrite( &item->gpuZoneBegin.thread, GetThreadHandle() ); MemWrite( &item->gpuZoneBegin.queryId, uint16_t( queryId ) ); MemWrite( &item->gpuZoneBegin.context, ctx->GetContextId() ); - Profiler::QueueSerialFinish(); + + TracyMetalPanic(, "zone %u [", queryId); + + ctx->go_horse[queryId] = 1; } -#endif - - tracy_force_inline ~MetalZoneScope() + + static void SubmitZoneEndGpu(MetalCtx* ctx, uint32_t queryId) { - if( !m_active ) return; - - auto queryId = m_queryId + 1; - auto* item = Profiler::QueueSerial(); MemWrite( &item->hdr.type, QueueType::GpuZoneEndSerial ); MemWrite( &item->gpuZoneEnd.cpuTime, Profiler::GetTime() ); MemWrite( &item->gpuZoneEnd.thread, GetThreadHandle() ); MemWrite( &item->gpuZoneEnd.queryId, uint16_t( queryId ) ); - MemWrite( &item->gpuZoneEnd.context, m_ctx->GetContextId() ); - + MemWrite( &item->gpuZoneEnd.context, ctx->GetContextId() ); Profiler::QueueSerialFinish(); + + TracyMetalPanic(, "zone %u {]", queryId); + + ctx->go_horse[queryId] = 1; } -private: - const bool m_active; - - MetalCtx* m_ctx; - id m_cmdEncoder; - public: uint32_t m_queryId = 0; }; From 638fa1f06c98ec5b0debf02c7269df922e23b304 Mon Sep 17 00:00:00 2001 From: Marcos Slomp Date: Wed, 15 May 2024 13:14:09 -0700 Subject: [PATCH 17/27] blarg3 --- public/tracy/TracyMetal.hmm | 205 +++++++++++++++++++++--------------- 1 file changed, 119 insertions(+), 86 deletions(-) diff --git a/public/tracy/TracyMetal.hmm b/public/tracy/TracyMetal.hmm index 2ffefd518..e197bac9c 100644 --- a/public/tracy/TracyMetal.hmm +++ b/public/tracy/TracyMetal.hmm @@ -43,8 +43,6 @@ using TracyMetalCtx = void*; // ok to import if in obj-c code #import -#define TRACY_METAL_DEBUG_NO_WRAPAROUND (0) - #define VA_ARGS(...) , ##__VA_ARGS__ #define TracyMetalPanic(ret, msg, ...) do { \ @@ -99,35 +97,10 @@ public: { TracyMetalPanic(, "WARNING: timestamp sampling at tile dispatch boundary is not supported."); } - id timestampCounterSet = nil; - for (id counterSet in m_device.counterSets) - { - if ([counterSet.name isEqualToString:MTLCommonCounterSetTimestamp]) - { - timestampCounterSet = counterSet; - break; - } - } - if (timestampCounterSet == nil) - { - TracyMetalPanic(return, "ERROR: timestamp counters are not supported on the platform."); - } - - MTLCounterSampleBufferDescriptor* sampleDescriptor = [[MTLCounterSampleBufferDescriptor alloc] init]; - sampleDescriptor.counterSet = timestampCounterSet; - sampleDescriptor.sampleCount = MaxQueries; - sampleDescriptor.storageMode = MTLStorageModeShared; - sampleDescriptor.label = @"TracyMetalTimestampPool"; - NSError* error = nil; - id counterSampleBuffer = [m_device newCounterSampleBufferWithDescriptor:sampleDescriptor error:&error]; - if (error != nil) - { - NSLog(@"%@", error.localizedDescription); - NSLog(@"%@", error.localizedFailureReason); - TracyMetalPanic(return, "ERROR: unable to create sample buffer for timestamp counters."); - } - m_counterSampleBuffer = counterSampleBuffer; + m_counterSampleBuffers[0] = NewTimestampSampleBuffer(m_device, MaxQueries); + m_counterSampleBuffers[1] = NewTimestampSampleBuffer(m_device, MaxQueries); + //m_counterSampleBuffer = NewTimestampSampleBuffer(m_device, MaxQueries); m_timestampRequestTime.resize(MaxQueries); go_horse.resize(MaxQueries); @@ -217,14 +190,10 @@ public: uintptr_t begin = m_previousCheckpoint.load(); uintptr_t latestCheckpoint = m_queryCounter.load(); // TODO: MTLEvent? MTLFence?; -#if TRACY_METAL_DEBUG_NO_WRAPAROUND - latestCheckpoint = (latestCheckpoint >= MaxQueries) ? MaxQueries : latestCheckpoint; - //if (latestCheckpoint >= MaxQueries) return true; -#endif - uint32_t count = RingCount(begin, latestCheckpoint); ZoneValue(begin); ZoneValue(latestCheckpoint); + uint32_t count = RingCount(begin, latestCheckpoint); if (count == 0) // no pending timestamp queries { //uintptr_t nextCheckpoint = m_queryCounter.load(); @@ -235,13 +204,20 @@ public: return true; } - if (RingIndex(begin) + count > RingSize()) + // resolve up until the ring buffer boundary and let a subsequenty call + // to Collect handle the wrap-around + bool reallocateBuffer = false; + if (RingIndex(begin) + count >= RingSize()) { count = RingSize() - RingIndex(begin); + reallocateBuffer = true; } ZoneValue(count); + + auto buffer_idx = (begin / MaxQueries) % 2; + auto counterSampleBuffer = m_counterSampleBuffers[buffer_idx]; - if (count >= MaxQueries) + if (count >= RingSize()) { TracyMetalPanic(return false, "Collect: FULL! too many pending timestamp queries. [%llu, %llu] (%u)", begin, latestCheckpoint, count); } @@ -249,7 +225,7 @@ public: //TracyMetalPanic(, "Collect: [%llu, %llu] :: (%u)", begin, latestCheckpoint, count); NSRange range = NSMakeRange(RingIndex(begin), count); - NSData* data = [m_counterSampleBuffer resolveCounterRange:range]; + NSData* data = [counterSampleBuffer resolveCounterRange:range]; NSUInteger numResolvedTimestamps = data.length / sizeof(MTLCounterResultTimestamp); MTLCounterResultTimestamp* timestamps = (MTLCounterResultTimestamp *)(data.bytes); if (timestamps == nil) @@ -262,8 +238,10 @@ public: TracyMetalPanic(, "Collect: numResolvedTimestamps != count : %u != %u", (uint32_t)numResolvedTimestamps, count); } + int resolved = 0; for (auto i = 0; i < numResolvedTimestamps; i += 2) { + ZoneScopedN("TracyMetal::Collect::[i]"); static MTLTimestamp lastValidTimestamp = 0; MTLTimestamp& t_start = timestamps[i+0].timestamp; MTLTimestamp& t_end = timestamps[i+1].timestamp; @@ -295,21 +273,19 @@ public: const float timeout_ms = 2000.0f; if (ms_in_flight < timeout_ms) break; - static int HACK_retries = 0; - //if (++HACK_retries <= 1000000) - // break; TracyMetalPanic(, "Collect: giving up on timestamp at %u [%.0fms in flight].", k, ms_in_flight); t_start = t_end = lastValidTimestamp + 100; - HACK_retries = 0; } + TracyFreeN((void*)(uintptr_t)(k+0), "TracyMetalGpuZone"); + TracyFreeN((void*)(uintptr_t)(k+1), "TracyMetalGpuZone"); auto t_start_copy = t_start; auto t_end_copy = t_end; - t_start = t_end = MTLCounterErrorValue; // "reset" timestamps t_start = t_end = 0; m_timestampRequestTime[k+0] += std::chrono::minutes(60); m_timestampRequestTime[k+1] += std::chrono::minutes(60); go_horse[k+0] = go_horse[k+1] = 0; { + ZoneScopedN("TracyMetal::Collect::QueueSerial"); auto* item = Profiler::QueueSerial(); MemWrite(&item->hdr.type, QueueType::GpuTime); MemWrite(&item->gpuTime.gpuTime, static_cast(t_start_copy)); @@ -318,6 +294,7 @@ public: Profiler::QueueSerialFinish(); } { + ZoneScopedN("TracyMetal::Collect::QueueSerial"); auto* item = Profiler::QueueSerial(); MemWrite(&item->hdr.type, QueueType::GpuTime); MemWrite(&item->gpuTime.gpuTime, static_cast(t_end_copy)); @@ -325,13 +302,19 @@ public: MemWrite(&item->gpuTime.context, m_contextId); Profiler::QueueSerialFinish(); } - TracyMetalPanic(, "zone %u ]", k); - TracyMetalPanic(, "zone %u ]", k+1); + //TracyMetalPanic(, "zone %u ]", k); + //TracyMetalPanic(, "zone %u ]", k+1); lastValidTimestamp = t_end_copy; TracyFreeN((void*)(uintptr_t)k, "TracyMetalTimestampQueryId"); - m_previousCheckpoint += 2; + resolved += 2; } ZoneValue(RingCount(begin, m_previousCheckpoint.load())); + + m_previousCheckpoint += resolved; + + counterSampleBuffer = nil; + if ((resolved == count) && (m_previousCheckpoint.load() % MaxQueries) == 0) + m_counterSampleBuffers[buffer_idx] = NewTimestampSampleBuffer(m_device, MaxQueries); //RecalibrateClocks(); // to account for drift @@ -357,13 +340,38 @@ private: return MaxQueries; } + struct Query { id buffer; uint32_t idx; }; + + tracy_force_inline Query NextQuery() + { + ZoneScopedNC("TracyMetal::NextQuery", tracy::Color::LightCoral); + auto id = m_queryCounter.fetch_add(2); + ZoneValue(id); + auto count = RingCount(m_previousCheckpoint, id); + if (count >= MaxQueries) + { + TracyMetalPanic(, "NextQueryId: FULL! too many pending timestamp queries. [%llu, %llu] (%u)", m_previousCheckpoint.load(), id, count); + // #TODO: return some sentinel value; ideally a "hidden" query index + //return (MaxQueries - n); + } + uint32_t buffer_idx = (id / MaxQueries) % 2; + ZoneValue(buffer_idx); + auto buffer = m_counterSampleBuffers[buffer_idx]; + if (buffer == nil) + TracyMetalPanic(, "NextQueryId: sample buffer is nil! (id=%llu)", id); + uint32_t idx = RingIndex(id); + ZoneValue(idx); + TracyAllocN((void*)(uintptr_t)idx, 2, "TracyMetalTimestampQueryId"); + m_timestampRequestTime[idx] = std::chrono::high_resolution_clock::now(); + //if (id >= MaxQueries) + // TracyMetalPanic(, "NextQueryId: %u (%llu)", idx, id); + return Query{ buffer, idx }; + } + tracy_force_inline unsigned int NextQueryId(int n=1) { ZoneScopedNC("TracyMetal::NextQueryId", tracy::Color::LightCoral); auto id = m_queryCounter.fetch_add(n); -#if TRACY_METAL_DEBUG_NO_WRAPAROUND - if (id >= MaxQueries) return MaxQueries; -#endif ZoneValue(id); auto count = RingCount(m_previousCheckpoint, id); if (count >= MaxQueries) @@ -384,12 +392,51 @@ private: { return m_contextId; } + + static id NewTimestampSampleBuffer(id device, size_t count) + { + ZoneScopedN("TracyMetal::NewTimestampSampleBuffer"); + + id timestampCounterSet = nil; + for (id counterSet in device.counterSets) + { + if ([counterSet.name isEqualToString:MTLCommonCounterSetTimestamp]) + { + timestampCounterSet = counterSet; + break; + } + } + if (timestampCounterSet == nil) + { + TracyMetalPanic(return nil, "ERROR: timestamp counters are not supported on the platform."); + } + + MTLCounterSampleBufferDescriptor* sampleDescriptor = [[MTLCounterSampleBufferDescriptor alloc] init]; + sampleDescriptor.counterSet = timestampCounterSet; + sampleDescriptor.sampleCount = MaxQueries; + sampleDescriptor.storageMode = MTLStorageModeShared; + sampleDescriptor.label = @"TracyMetalTimestampPool"; + + NSError* error = nil; + id counterSampleBuffer = [device newCounterSampleBufferWithDescriptor:sampleDescriptor error:&error]; + if (error != nil) + { + //NSLog(@"%@", error.localizedDescription); + //NSLog(@"%@", error.localizedFailureReason); + TracyMetalPanic(return nil, + "ERROR: unable to create sample buffer for timestamp counters : %s | %s", + [error.localizedDescription cString], [error.localizedFailureReason cString]); + } + + return counterSampleBuffer; + } uint8_t m_contextId = 255; id m_device = nil; - id m_counterSampleBuffer = nil; - + id m_counterSampleBuffers [2] = {}; + //id m_counterSampleBuffer; + using atomic_counter = std::atomic; static_assert(atomic_counter::is_always_lock_free); atomic_counter m_queryCounter = 0; @@ -417,16 +464,13 @@ public: if (desc == nil) TracyMetalPanic(return, "pass descriptor is nil."); m_ctx = ctx; - auto queryId = m_queryId = ctx->NextQueryId(2); -#if TRACY_METAL_DEBUG_NO_WRAPAROUND - if (queryId >= MetalCtx::MaxQueries) return; -#endif + auto query = m_query = ctx->NextQuery(); - desc.sampleBufferAttachments[0].sampleBuffer = ctx->m_counterSampleBuffer; - desc.sampleBufferAttachments[0].startOfEncoderSampleIndex = queryId; - desc.sampleBufferAttachments[0].endOfEncoderSampleIndex = queryId+1; + desc.sampleBufferAttachments[0].sampleBuffer = query.buffer; + desc.sampleBufferAttachments[0].startOfEncoderSampleIndex = query.idx+0; + desc.sampleBufferAttachments[0].endOfEncoderSampleIndex = query.idx+1; - SubmitZoneBeginGpu(ctx, queryId, srcloc); + SubmitZoneBeginGpu(ctx, query.idx+0, srcloc); //SubmitZoneEndGpu(ctx, queryId+1); } @@ -441,16 +485,13 @@ public: if (desc == nil) TracyMetalPanic(return, "pass descriptor is nil."); m_ctx = ctx; - auto queryId = m_queryId = ctx->NextQueryId(2); -#if TRACY_METAL_DEBUG_NO_WRAPAROUND - if (queryId >= MetalCtx::MaxQueries) return; -#endif + auto query = m_query = ctx->NextQuery(); - desc.sampleBufferAttachments[0].sampleBuffer = ctx->m_counterSampleBuffer; - desc.sampleBufferAttachments[0].startOfEncoderSampleIndex = queryId; - desc.sampleBufferAttachments[0].endOfEncoderSampleIndex = queryId+1; + desc.sampleBufferAttachments[0].sampleBuffer = query.buffer; + desc.sampleBufferAttachments[0].startOfEncoderSampleIndex = query.idx+0; + desc.sampleBufferAttachments[0].endOfEncoderSampleIndex = query.idx+1; - SubmitZoneBeginGpu(ctx, queryId, srcloc); + SubmitZoneBeginGpu(ctx, query.idx+0, srcloc); //SubmitZoneEndGpu(ctx, queryId+1); } @@ -465,18 +506,15 @@ public: if (desc == nil) TracyMetalPanic(return, "pass descriptor is nil."); m_ctx = ctx; - auto queryId = m_queryId = ctx->NextQueryId(2); -#if TRACY_METAL_DEBUG_NO_WRAPAROUND - if (queryId >= MetalCtx::MaxQueries) return; -#endif + auto query = m_query = ctx->NextQuery(); - desc.sampleBufferAttachments[0].sampleBuffer = ctx->m_counterSampleBuffer; - desc.sampleBufferAttachments[0].startOfVertexSampleIndex = queryId; + desc.sampleBufferAttachments[0].sampleBuffer = query.buffer; + desc.sampleBufferAttachments[0].startOfVertexSampleIndex = query.idx+0; desc.sampleBufferAttachments[0].endOfVertexSampleIndex = MTLCounterDontSample; desc.sampleBufferAttachments[0].startOfFragmentSampleIndex = MTLCounterDontSample; - desc.sampleBufferAttachments[0].endOfFragmentSampleIndex = queryId+1; + desc.sampleBufferAttachments[0].endOfFragmentSampleIndex = query.idx+1; - SubmitZoneBeginGpu(ctx, queryId, srcloc); + SubmitZoneBeginGpu(ctx, query.idx+0, srcloc); //SubmitZoneEndGpu(ctx, queryId+1); } @@ -493,9 +531,6 @@ public: m_cmdEncoder = cmdEncoder; auto queryId = m_queryId = ctx->NextQueryId(); -#if TRACY_METAL_DEBUG_NO_WRAPAROUND - if (queryId >= MetalCtx::MaxQueries) return; -#endif [m_cmdEncoder sampleCountersInBuffer:m_ctx->m_counterSampleBuffer atSampleIndex:queryId withBarrier:YES]; @@ -507,11 +542,7 @@ public: { if( !m_active ) return; - auto queryId = m_queryId + 1; - -#if TRACY_METAL_DEBUG_NO_WRAPAROUND - if (queryId >= MetalCtx::MaxQueries) return; -#endif + auto queryId = m_query.idx + 1; SubmitZoneEndGpu(m_ctx, queryId); } @@ -533,7 +564,8 @@ private: MemWrite( &item->gpuZoneBegin.context, ctx->GetContextId() ); Profiler::QueueSerialFinish(); - TracyMetalPanic(, "zone %u [", queryId); + //TracyMetalPanic(, "zone %u [", queryId); + TracyAllocN((void*)(uintptr_t)queryId, 1, "TracyMetalGpuZone"); ctx->go_horse[queryId] = 1; } @@ -548,13 +580,14 @@ private: MemWrite( &item->gpuZoneEnd.context, ctx->GetContextId() ); Profiler::QueueSerialFinish(); - TracyMetalPanic(, "zone %u {]", queryId); + //TracyMetalPanic(, "zone %u {]", queryId); + TracyAllocN((void*)(uintptr_t)queryId, 1, "TracyMetalGpuZone"); ctx->go_horse[queryId] = 1; } public: - uint32_t m_queryId = 0; + MetalCtx::Query m_query = {}; }; } From 5ec7565852e6d65818a161f0bc1601ba30d9a87e Mon Sep 17 00:00:00 2001 From: Marcos Slomp Date: Wed, 15 May 2024 22:08:27 -0700 Subject: [PATCH 18/27] cleanup --- public/tracy/TracyMetal.hmm | 44 ++++++------------------------------- 1 file changed, 7 insertions(+), 37 deletions(-) diff --git a/public/tracy/TracyMetal.hmm b/public/tracy/TracyMetal.hmm index e197bac9c..8a573d6c6 100644 --- a/public/tracy/TracyMetal.hmm +++ b/public/tracy/TracyMetal.hmm @@ -100,10 +100,8 @@ public: m_counterSampleBuffers[0] = NewTimestampSampleBuffer(m_device, MaxQueries); m_counterSampleBuffers[1] = NewTimestampSampleBuffer(m_device, MaxQueries); - //m_counterSampleBuffer = NewTimestampSampleBuffer(m_device, MaxQueries); m_timestampRequestTime.resize(MaxQueries); - go_horse.resize(MaxQueries); MTLTimestamp cpuTimestamp = 0; MTLTimestamp gpuTimestamp = 0; @@ -252,59 +250,39 @@ public: TracyMetalPanic(, "Collect: invalid timestamp (MTLCounterErrorValue) at %u.", k); break; } - if (go_horse[k+0] == 0) - { - TracyMetalPanic(, "Collect: go_horse not ready at %u (%llu).", k+0, begin+i+0); - break; - } - if (go_horse[k+1] == 0) - { - TracyMetalPanic(, "Collect: go_horse not ready at %u (%llu).", k+1, begin+i+1); - break; - } if ((t_start == 0) || (t_end == 0)) // zero is apparently also considered "invalid"... { - break; - auto checkTime = std::chrono::high_resolution_clock::now(); auto requestTime = m_timestampRequestTime[k]; auto ms_in_flight = std::chrono::duration(checkTime-requestTime).count()*1000.0f; //TracyMetalPanic(, "Collect: invalid timestamp (zero) at %u [%.0fms in flight].", k, ms_in_flight); - const float timeout_ms = 2000.0f; + const float timeout_ms = 200.0f; if (ms_in_flight < timeout_ms) break; + ZoneScopedN("TracyMetal::Collect::Drop"); TracyMetalPanic(, "Collect: giving up on timestamp at %u [%.0fms in flight].", k, ms_in_flight); - t_start = t_end = lastValidTimestamp + 100; + t_start = lastValidTimestamp + 5; + t_end = t_start + 5; } TracyFreeN((void*)(uintptr_t)(k+0), "TracyMetalGpuZone"); TracyFreeN((void*)(uintptr_t)(k+1), "TracyMetalGpuZone"); - auto t_start_copy = t_start; - auto t_end_copy = t_end; - t_start = t_end = 0; - m_timestampRequestTime[k+0] += std::chrono::minutes(60); - m_timestampRequestTime[k+1] += std::chrono::minutes(60); - go_horse[k+0] = go_horse[k+1] = 0; { - ZoneScopedN("TracyMetal::Collect::QueueSerial"); auto* item = Profiler::QueueSerial(); MemWrite(&item->hdr.type, QueueType::GpuTime); - MemWrite(&item->gpuTime.gpuTime, static_cast(t_start_copy)); + MemWrite(&item->gpuTime.gpuTime, static_cast(t_start)); MemWrite(&item->gpuTime.queryId, static_cast(k)); MemWrite(&item->gpuTime.context, m_contextId); Profiler::QueueSerialFinish(); } { - ZoneScopedN("TracyMetal::Collect::QueueSerial"); auto* item = Profiler::QueueSerial(); MemWrite(&item->hdr.type, QueueType::GpuTime); - MemWrite(&item->gpuTime.gpuTime, static_cast(t_end_copy)); + MemWrite(&item->gpuTime.gpuTime, static_cast(t_end)); MemWrite(&item->gpuTime.queryId, static_cast(k+1)); MemWrite(&item->gpuTime.context, m_contextId); Profiler::QueueSerialFinish(); } - //TracyMetalPanic(, "zone %u ]", k); - //TracyMetalPanic(, "zone %u ]", k+1); - lastValidTimestamp = t_end_copy; + lastValidTimestamp = t_end; TracyFreeN((void*)(uintptr_t)k, "TracyMetalTimestampQueryId"); resolved += 2; } @@ -435,7 +413,6 @@ private: id m_device = nil; id m_counterSampleBuffers [2] = {}; - //id m_counterSampleBuffer; using atomic_counter = std::atomic; static_assert(atomic_counter::is_always_lock_free); @@ -445,7 +422,6 @@ private: atomic_counter::value_type m_nextCheckpoint = 0; std::vector m_timestampRequestTime; - std::vector go_horse; std::mutex m_collectionMutex; }; @@ -564,10 +540,7 @@ private: MemWrite( &item->gpuZoneBegin.context, ctx->GetContextId() ); Profiler::QueueSerialFinish(); - //TracyMetalPanic(, "zone %u [", queryId); TracyAllocN((void*)(uintptr_t)queryId, 1, "TracyMetalGpuZone"); - - ctx->go_horse[queryId] = 1; } static void SubmitZoneEndGpu(MetalCtx* ctx, uint32_t queryId) @@ -580,10 +553,7 @@ private: MemWrite( &item->gpuZoneEnd.context, ctx->GetContextId() ); Profiler::QueueSerialFinish(); - //TracyMetalPanic(, "zone %u {]", queryId); TracyAllocN((void*)(uintptr_t)queryId, 1, "TracyMetalGpuZone"); - - ctx->go_horse[queryId] = 1; } public: From 06b23cc2937519a7948e021e81e302bb628cb772 Mon Sep 17 00:00:00 2001 From: Marcos Slomp Date: Thu, 16 May 2024 11:07:56 -0700 Subject: [PATCH 19/27] more cleanup --- public/tracy/TracyMetal.hmm | 109 ++++++++++++++++-------------------- 1 file changed, 47 insertions(+), 62 deletions(-) diff --git a/public/tracy/TracyMetal.hmm b/public/tracy/TracyMetal.hmm index 8a573d6c6..ec8e210f5 100644 --- a/public/tracy/TracyMetal.hmm +++ b/public/tracy/TracyMetal.hmm @@ -55,6 +55,14 @@ using TracyMetalCtx = void*; } while(false); +#define TRACY_METAL_DEBUG_MASK (0) + +#if TRACY_METAL_DEBUG_MASK +#define TracyMetalDebug(mask, ...) if (mask & TRACY_METAL_DEBUG_MASK) { __VA_ARGS__; } +#else +#define TracyMetalDebug(mask, ...) +#endif + namespace tracy { @@ -66,12 +74,12 @@ class MetalCtx public: MetalCtx(id device) - : m_device(device) + : m_device(device) { ZoneScopedNC("TracyMetalCtx", tracy::Color::Red4); - TracyMetalPanic(, "MTLCounterErrorValue = 0x%llx", MTLCounterErrorValue); - TracyMetalPanic(, "MTLCounterDontSample = 0x%llx", MTLCounterDontSample); + TracyMetalDebug(1<<0, TracyMetalPanic(, "MTLCounterErrorValue = 0x%llx", MTLCounterErrorValue)); + TracyMetalDebug(1<<0, TracyMetalPanic(, "MTLCounterDontSample = 0x%llx", MTLCounterDontSample)); if (m_device == nil) { @@ -106,10 +114,12 @@ public: MTLTimestamp cpuTimestamp = 0; MTLTimestamp gpuTimestamp = 0; [m_device sampleTimestamps:&cpuTimestamp gpuTimestamp:&gpuTimestamp]; - TracyMetalPanic(, "Calibration: CPU timestamp (Metal): %llu", cpuTimestamp); - TracyMetalPanic(, "Calibration: GPU timestamp (Metal): %llu", gpuTimestamp); + TracyMetalDebug(1<<0, TracyMetalPanic(, "Calibration: CPU timestamp (Metal): %llu", cpuTimestamp)); + TracyMetalDebug(1<<0, TracyMetalPanic(, "Calibration: GPU timestamp (Metal): %llu", gpuTimestamp)); + cpuTimestamp = Profiler::GetTime(); - TracyMetalPanic(, "Calibration: CPU timestamp (Tracy): %llu", cpuTimestamp); + TracyMetalDebug(1<<0, TracyMetalPanic(, "Calibration: CPU timestamp (Tracy): %llu", cpuTimestamp)); + float period = 1.0f; m_contextId = GetGpuCtxCounter().fetch_add(1); @@ -220,7 +230,7 @@ public: TracyMetalPanic(return false, "Collect: FULL! too many pending timestamp queries. [%llu, %llu] (%u)", begin, latestCheckpoint, count); } - //TracyMetalPanic(, "Collect: [%llu, %llu] :: (%u)", begin, latestCheckpoint, count); + TracyMetalDebug(1<<3, TracyMetalPanic(, "Collect: [%llu, %llu] :: (%u)", begin, latestCheckpoint, count)); NSRange range = NSMakeRange(RingIndex(begin), count); NSData* data = [counterSampleBuffer resolveCounterRange:range]; @@ -244,7 +254,7 @@ public: MTLTimestamp& t_start = timestamps[i+0].timestamp; MTLTimestamp& t_end = timestamps[i+1].timestamp; uint32_t k = RingIndex(begin + i); - //fprintf(stdout, "TracyMetal: Collect: timestamp[%u] = %llu | timestamp[%u] = %llu | diff = %llu\n", k, t_start, k+1, t_end, (t_end - t_start)); + TracyMetalDebug(1<<4, TracyMetalPanic(, "Collect: timestamp[%u] = %llu | timestamp[%u] = %llu | diff = %llu\n", k, t_start, k+1, t_end, (t_end - t_start))); if ((t_start == MTLCounterErrorValue) || (t_end == MTLCounterErrorValue)) { TracyMetalPanic(, "Collect: invalid timestamp (MTLCounterErrorValue) at %u.", k); @@ -255,7 +265,7 @@ public: auto checkTime = std::chrono::high_resolution_clock::now(); auto requestTime = m_timestampRequestTime[k]; auto ms_in_flight = std::chrono::duration(checkTime-requestTime).count()*1000.0f; - //TracyMetalPanic(, "Collect: invalid timestamp (zero) at %u [%.0fms in flight].", k, ms_in_flight); + TracyMetalDebug(1<<4, TracyMetalPanic(, "Collect: invalid timestamp (zero) at %u [%.0fms in flight].", k, ms_in_flight)); const float timeout_ms = 200.0f; if (ms_in_flight < timeout_ms) break; @@ -264,8 +274,8 @@ public: t_start = lastValidTimestamp + 5; t_end = t_start + 5; } - TracyFreeN((void*)(uintptr_t)(k+0), "TracyMetalGpuZone"); - TracyFreeN((void*)(uintptr_t)(k+1), "TracyMetalGpuZone"); + TracyMetalDebug(1<<2, TracyFreeN((void*)(uintptr_t)(k+0), "TracyMetalGpuZone")); + TracyMetalDebug(1<<2, TracyFreeN((void*)(uintptr_t)(k+1), "TracyMetalGpuZone")); { auto* item = Profiler::QueueSerial(); MemWrite(&item->hdr.type, QueueType::GpuTime); @@ -283,7 +293,7 @@ public: Profiler::QueueSerialFinish(); } lastValidTimestamp = t_end; - TracyFreeN((void*)(uintptr_t)k, "TracyMetalTimestampQueryId"); + TracyMetalDebug(1<<1, TracyFreeN((void*)(uintptr_t)k, "TracyMetalTimestampQueryId")); resolved += 2; } ZoneValue(RingCount(begin, m_previousCheckpoint.load())); @@ -328,9 +338,13 @@ private: auto count = RingCount(m_previousCheckpoint, id); if (count >= MaxQueries) { - TracyMetalPanic(, "NextQueryId: FULL! too many pending timestamp queries. [%llu, %llu] (%u)", m_previousCheckpoint.load(), id, count); - // #TODO: return some sentinel value; ideally a "hidden" query index - //return (MaxQueries - n); + // TODO: return a proper (hidden) "sentinel" query + Query sentinel = Query{ m_counterSampleBuffers[1], MaxQueries-2 }; + TracyMetalPanic( + return sentinel, + "NextQueryId: FULL! too many pending timestamp queries. [%llu, %llu] (%u)", + m_previousCheckpoint.load(), id, count + ); } uint32_t buffer_idx = (id / MaxQueries) % 2; ZoneValue(buffer_idx); @@ -339,33 +353,11 @@ private: TracyMetalPanic(, "NextQueryId: sample buffer is nil! (id=%llu)", id); uint32_t idx = RingIndex(id); ZoneValue(idx); - TracyAllocN((void*)(uintptr_t)idx, 2, "TracyMetalTimestampQueryId"); + TracyMetalDebug(1<<1, TracyAllocN((void*)(uintptr_t)idx, 2, "TracyMetalTimestampQueryId")); m_timestampRequestTime[idx] = std::chrono::high_resolution_clock::now(); - //if (id >= MaxQueries) - // TracyMetalPanic(, "NextQueryId: %u (%llu)", idx, id); return Query{ buffer, idx }; } - tracy_force_inline unsigned int NextQueryId(int n=1) - { - ZoneScopedNC("TracyMetal::NextQueryId", tracy::Color::LightCoral); - auto id = m_queryCounter.fetch_add(n); - ZoneValue(id); - auto count = RingCount(m_previousCheckpoint, id); - if (count >= MaxQueries) - { - TracyMetalPanic(, "NextQueryId: FULL! too many pending timestamp queries. [%llu, %llu] (%u)", m_previousCheckpoint.load(), id, count); - // #TODO: return some sentinel value; ideally a "hidden" query index - return (MaxQueries - n); - } - uint32_t idx = RingIndex(id); - TracyAllocN((void*)(uintptr_t)idx, 2, "TracyMetalTimestampQueryId"); - m_timestampRequestTime[idx] = std::chrono::high_resolution_clock::now(); - //if (id >= MaxQueries) - // TracyMetalPanic(, "NextQueryId: %u (%llu)", idx, id); - return idx; - } - tracy_force_inline uint8_t GetContextId() const { return m_contextId; @@ -399,8 +391,7 @@ private: id counterSampleBuffer = [device newCounterSampleBufferWithDescriptor:sampleDescriptor error:&error]; if (error != nil) { - //NSLog(@"%@", error.localizedDescription); - //NSLog(@"%@", error.localizedFailureReason); + //NSLog(@"%@ | %@", error.localizedDescription, error.localizedFailureReason); TracyMetalPanic(return nil, "ERROR: unable to create sample buffer for timestamp counters : %s | %s", [error.localizedDescription cString], [error.localizedFailureReason cString]); @@ -437,17 +428,16 @@ public: #endif { if ( !m_active ) return; - if (desc == nil) TracyMetalPanic(return, "pass descriptor is nil."); + if (desc == nil) TracyMetalPanic(return, "compute pass descriptor is nil."); m_ctx = ctx; - auto query = m_query = ctx->NextQuery(); + auto& query = m_query = ctx->NextQuery(); desc.sampleBufferAttachments[0].sampleBuffer = query.buffer; desc.sampleBufferAttachments[0].startOfEncoderSampleIndex = query.idx+0; desc.sampleBufferAttachments[0].endOfEncoderSampleIndex = query.idx+1; - SubmitZoneBeginGpu(ctx, query.idx+0, srcloc); - //SubmitZoneEndGpu(ctx, queryId+1); + SubmitZoneBeginGpu(ctx, query.idx + 0, srcloc); } tracy_force_inline MetalZoneScope( MetalCtx* ctx, MTLBlitPassDescriptor* desc, const SourceLocationData* srcloc, bool is_active ) @@ -458,17 +448,16 @@ public: #endif { if ( !m_active ) return; - if (desc == nil) TracyMetalPanic(return, "pass descriptor is nil."); + if (desc == nil) TracyMetalPanic(return, "blit pass descriptor is nil."); m_ctx = ctx; - auto query = m_query = ctx->NextQuery(); + auto& query = m_query = ctx->NextQuery(); desc.sampleBufferAttachments[0].sampleBuffer = query.buffer; desc.sampleBufferAttachments[0].startOfEncoderSampleIndex = query.idx+0; desc.sampleBufferAttachments[0].endOfEncoderSampleIndex = query.idx+1; - SubmitZoneBeginGpu(ctx, query.idx+0, srcloc); - //SubmitZoneEndGpu(ctx, queryId+1); + SubmitZoneBeginGpu(ctx, query.idx + 0, srcloc); } tracy_force_inline MetalZoneScope( MetalCtx* ctx, MTLRenderPassDescriptor* desc, const SourceLocationData* srcloc, bool is_active ) @@ -479,10 +468,10 @@ public: #endif { if ( !m_active ) return; - if (desc == nil) TracyMetalPanic(return, "pass descriptor is nil."); + if (desc == nil) TracyMetalPanic(return, "render pass descriptor is nil."); m_ctx = ctx; - auto query = m_query = ctx->NextQuery(); + auto& query = m_query = ctx->NextQuery(); desc.sampleBufferAttachments[0].sampleBuffer = query.buffer; desc.sampleBufferAttachments[0].startOfVertexSampleIndex = query.idx+0; @@ -490,8 +479,7 @@ public: desc.sampleBufferAttachments[0].startOfFragmentSampleIndex = MTLCounterDontSample; desc.sampleBufferAttachments[0].endOfFragmentSampleIndex = query.idx+1; - SubmitZoneBeginGpu(ctx, query.idx+0, srcloc); - //SubmitZoneEndGpu(ctx, queryId+1); + SubmitZoneBeginGpu(ctx, query.idx + 0, srcloc); } #if 0 @@ -506,21 +494,19 @@ public: m_ctx = ctx; m_cmdEncoder = cmdEncoder; - auto queryId = m_queryId = ctx->NextQueryId(); + auto& query = m_query = ctx->NextQueryId(); - [m_cmdEncoder sampleCountersInBuffer:m_ctx->m_counterSampleBuffer atSampleIndex:queryId withBarrier:YES]; + [m_cmdEncoder sampleCountersInBuffer:m_ctx->m_counterSampleBuffer atSampleIndex:query.idx withBarrier:YES]; - SubmitZoneBeginGpu(ctx, queryId, srcloc); + SubmitZoneBeginGpu(ctx, query.idx, srcloc); } #endif tracy_force_inline ~MetalZoneScope() { if( !m_active ) return; - - auto queryId = m_query.idx + 1; - - SubmitZoneEndGpu(m_ctx, queryId); + + SubmitZoneEndGpu(m_ctx, m_query.idx + 1); } private: @@ -540,7 +526,7 @@ private: MemWrite( &item->gpuZoneBegin.context, ctx->GetContextId() ); Profiler::QueueSerialFinish(); - TracyAllocN((void*)(uintptr_t)queryId, 1, "TracyMetalGpuZone"); + TracyMetalDebug(1<<2, TracyAllocN((void*)(uintptr_t)queryId, 1, "TracyMetalGpuZone")); } static void SubmitZoneEndGpu(MetalCtx* ctx, uint32_t queryId) @@ -553,10 +539,9 @@ private: MemWrite( &item->gpuZoneEnd.context, ctx->GetContextId() ); Profiler::QueueSerialFinish(); - TracyAllocN((void*)(uintptr_t)queryId, 1, "TracyMetalGpuZone"); + TracyMetalDebug(1<<2, TracyAllocN((void*)(uintptr_t)queryId, 1, "TracyMetalGpuZone")); } -public: MetalCtx::Query m_query = {}; }; From ea65b145c654e3bb80c8400eb53291e594536956 Mon Sep 17 00:00:00 2001 From: Marcos Slomp Date: Thu, 16 May 2024 11:35:50 -0700 Subject: [PATCH 20/27] adding wiretap for debugging purposes --- public/tracy/TracyMetal.hmm | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/public/tracy/TracyMetal.hmm b/public/tracy/TracyMetal.hmm index ec8e210f5..b2956f28c 100644 --- a/public/tracy/TracyMetal.hmm +++ b/public/tracy/TracyMetal.hmm @@ -63,6 +63,10 @@ using TracyMetalCtx = void*; #define TracyMetalDebug(mask, ...) #endif +#ifndef TracyMetalZoneScopeWireTap +#define TracyMetalZoneScopeWireTap +#endif//TracyMetalZoneScopeWireTap + namespace tracy { @@ -508,6 +512,8 @@ public: SubmitZoneEndGpu(m_ctx, m_query.idx + 1); } + + TracyMetalZoneScopeWireTap; private: const bool m_active; From 799360dfb81952f0bd2657fad4d20b24c19e46a8 Mon Sep 17 00:00:00 2001 From: Marcos Slomp Date: Thu, 16 May 2024 22:24:35 -0700 Subject: [PATCH 21/27] Collect pending timestamps during shutdown --- public/tracy/TracyMetal.hmm | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/public/tracy/TracyMetal.hmm b/public/tracy/TracyMetal.hmm index b2956f28c..bd0293a97 100644 --- a/public/tracy/TracyMetal.hmm +++ b/public/tracy/TracyMetal.hmm @@ -144,6 +144,12 @@ public: ~MetalCtx() { ZoneScopedNC("~TracyMetalCtx", tracy::Color::Red4); + ZoneValue(m_previousCheckpoint.load()); + ZoneValue(m_queryCounter.load()); + // collect the last remnants of Metal GPU activity... + // TODO: add a timeout to this loop? + while (m_previousCheckpoint.load() != m_queryCounter.load()) + Collect(); } static MetalCtx* Create(id device) From aa85824455cfdbda4ea97072f77775bd1173381e Mon Sep 17 00:00:00 2001 From: Marcos Slomp Date: Thu, 16 May 2024 22:25:14 -0700 Subject: [PATCH 22/27] comments about the decisions and behavior of the Metal back-end --- public/tracy/TracyMetal.hmm | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/public/tracy/TracyMetal.hmm b/public/tracy/TracyMetal.hmm index bd0293a97..e2995a289 100644 --- a/public/tracy/TracyMetal.hmm +++ b/public/tracy/TracyMetal.hmm @@ -1,6 +1,34 @@ #ifndef __TRACYMETAL_HMM__ #define __TRACYMETAL_HMM__ +/* The Metal back-end in Tracy operates differently than other GPU back-ends like Vulkan, + Direct3D and OpenGL. Specifically, TracyMetalZone() must be placed around the site where + a command encoder is created. This is because not all hardware supports timestamps at + command granularity, and can only provide timestamps around an entire command encoder. + This accommodates for all tiers of hardware; in the future, variants of TracyMetalZone() + will be added to support the habitual command-level granularity of Tracy GPU back-ends. + Metal also imposes a few restrictions that make the process of requesting and collecting + queries more complicated in Tracy: + a) timestamp query buffers are limited to 4096 queries (32KB, where each query is 8 bytes) + b) when a timestamp query buffer is created, Metal initializes all timestamps with zeroes, + and there's no way to reset them back to zero after timestamps get resolved; the only + way to clear the timestamps is by allocating a new timestamp query buffer + c) if a command encoder records no commands and its corresponding command buffer ends up + committed to the command queue, Metal will "optimize-away" the encoder along with any + timestamp queries associated with it (the timestamp will remain as zero and will never + get resolved) + Because of the limitations above, two timestamp buffers are managed internally. Once one + of the buffers fills up with requests, the second buffer can start serving new requests. + Once all requests in a buffer get resolved and collected, the entire buffer is discarded + and a new one allocated for future requests. (Proper cycling through a ring buffer would + require bookkeeping and completion handlers to collect only the known complete queries.) + In the current implementation, there is potential for a race condition when the buffer is + discarded and reallocated. In practice, the race condition will never materialize so long + as TracyMetalCollect() is called frequently to keep the amount of unresolved queries low. + Finally, there's a timeout mechanism during timestamp collection to detect "empty" command + encoders and ensure progress. +*/ + #ifndef TRACY_ENABLE #define TracyMetalContext(device) nullptr From fe51f02a25113316dbf8c29bbdd9488dbca6aac0 Mon Sep 17 00:00:00 2001 From: Marcos Slomp Date: Fri, 17 May 2024 09:51:02 -0700 Subject: [PATCH 23/27] cleanup and comments --- public/tracy/TracyMetal.hmm | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/public/tracy/TracyMetal.hmm b/public/tracy/TracyMetal.hmm index e2995a289..e2e2043ba 100644 --- a/public/tracy/TracyMetal.hmm +++ b/public/tracy/TracyMetal.hmm @@ -146,6 +146,7 @@ public: MTLTimestamp cpuTimestamp = 0; MTLTimestamp gpuTimestamp = 0; [m_device sampleTimestamps:&cpuTimestamp gpuTimestamp:&gpuTimestamp]; + m_mostRecentTimestamp = gpuTimestamp; TracyMetalDebug(1<<0, TracyMetalPanic(, "Calibration: CPU timestamp (Metal): %llu", cpuTimestamp)); TracyMetalDebug(1<<0, TracyMetalPanic(, "Calibration: GPU timestamp (Metal): %llu", gpuTimestamp)); @@ -186,8 +187,8 @@ public: new (ctx) MetalCtx(device); if (ctx->m_contextId == 255) { - Destroy(ctx); TracyMetalPanic(return nullptr, "ERROR: unable to create context."); + Destroy(ctx); } return ctx; } @@ -288,9 +289,8 @@ public: for (auto i = 0; i < numResolvedTimestamps; i += 2) { ZoneScopedN("TracyMetal::Collect::[i]"); - static MTLTimestamp lastValidTimestamp = 0; - MTLTimestamp& t_start = timestamps[i+0].timestamp; - MTLTimestamp& t_end = timestamps[i+1].timestamp; + MTLTimestamp t_start = timestamps[i+0].timestamp; + MTLTimestamp t_end = timestamps[i+1].timestamp; uint32_t k = RingIndex(begin + i); TracyMetalDebug(1<<4, TracyMetalPanic(, "Collect: timestamp[%u] = %llu | timestamp[%u] = %llu | diff = %llu\n", k, t_start, k+1, t_end, (t_end - t_start))); if ((t_start == MTLCounterErrorValue) || (t_end == MTLCounterErrorValue)) @@ -298,7 +298,9 @@ public: TracyMetalPanic(, "Collect: invalid timestamp (MTLCounterErrorValue) at %u.", k); break; } - if ((t_start == 0) || (t_end == 0)) // zero is apparently also considered "invalid"... + // Metal will initialize timestamp buffer with zeroes; encountering a zero-value + // timestamp means that the timestamp has not been written and resolved yet + if ((t_start == 0) || (t_end == 0)) { auto checkTime = std::chrono::high_resolution_clock::now(); auto requestTime = m_timestampRequestTime[k]; @@ -309,7 +311,7 @@ public: break; ZoneScopedN("TracyMetal::Collect::Drop"); TracyMetalPanic(, "Collect: giving up on timestamp at %u [%.0fms in flight].", k, ms_in_flight); - t_start = lastValidTimestamp + 5; + t_start = m_mostRecentTimestamp + 5; t_end = t_start + 5; } TracyMetalDebug(1<<2, TracyFreeN((void*)(uintptr_t)(k+0), "TracyMetalGpuZone")); @@ -330,7 +332,7 @@ public: MemWrite(&item->gpuTime.context, m_contextId); Profiler::QueueSerialFinish(); } - lastValidTimestamp = t_end; + m_mostRecentTimestamp = (t_end > m_mostRecentTimestamp) ? t_end : m_mostRecentTimestamp; TracyMetalDebug(1<<1, TracyFreeN((void*)(uintptr_t)k, "TracyMetalTimestampQueryId")); resolved += 2; } @@ -338,7 +340,11 @@ public: m_previousCheckpoint += resolved; - counterSampleBuffer = nil; + // Check whether the timestamp buffer has been fully resolved/collected: + // WARN: there's technically a race condition here: NextQuery() may reference the + // buffer that is being released instead of the new one. In practice, this should + // never happen so long as Collect is called frequently enough to prevent pending + // timestamp query requests from piling up too quickly. if ((resolved == count) && (m_previousCheckpoint.load() % MaxQueries) == 0) m_counterSampleBuffers[buffer_idx] = NewTimestampSampleBuffer(m_device, MaxQueries); @@ -448,7 +454,7 @@ private: atomic_counter m_queryCounter = 0; atomic_counter m_previousCheckpoint = 0; - atomic_counter::value_type m_nextCheckpoint = 0; + MTLTimestamp m_mostRecentTimestamp = 0; std::vector m_timestampRequestTime; From 0ffa0be4fd3371c769068daa2dd7bc7d57352c15 Mon Sep 17 00:00:00 2001 From: Marcos Slomp Date: Tue, 3 Sep 2024 11:15:39 -0700 Subject: [PATCH 24/27] addressing code review comments --- public/tracy/TracyMetal.hmm | 81 +++++++++++++++++++++++-------------- 1 file changed, 51 insertions(+), 30 deletions(-) diff --git a/public/tracy/TracyMetal.hmm b/public/tracy/TracyMetal.hmm index e2e2043ba..5a6b9a7cf 100644 --- a/public/tracy/TracyMetal.hmm +++ b/public/tracy/TracyMetal.hmm @@ -1,7 +1,9 @@ #ifndef __TRACYMETAL_HMM__ #define __TRACYMETAL_HMM__ -/* The Metal back-end in Tracy operates differently than other GPU back-ends like Vulkan, +/* This file implements a Metal API back-end for Tracy (it has only been tested on Apple + Silicon devices, but it should also work on Intel-based Macs and older iOS devices). + The Metal back-end in Tracy operates differently than other GPU back-ends like Vulkan, Direct3D and OpenGL. Specifically, TracyMetalZone() must be placed around the site where a command encoder is created. This is because not all hardware supports timestamps at command granularity, and can only provide timestamps around an entire command encoder. @@ -58,6 +60,10 @@ using TracyMetalCtx = void*; #else +#if not __has_feature(objc_arc) +#error TracyMetal requires ARC to be enabled. +#endif + #include #include #include @@ -82,8 +88,13 @@ using TracyMetalCtx = void*; ret; \ } while(false); +#ifndef TRACY_METAL_TIMESTAMP_COLLECT_TIMEOUT +#define TRACY_METAL_TIMESTAMP_COLLECT_TIMEOUT 0.200f +#endif//TRACY_METAL_TIMESTAMP_COLLECT_TIMEOUT +#ifndef TRACY_METAL_DEBUG_MASK #define TRACY_METAL_DEBUG_MASK (0) +#endif//TRACY_METAL_DEBUG_MASK #if TRACY_METAL_DEBUG_MASK #define TracyMetalDebug(mask, ...) if (mask & TRACY_METAL_DEBUG_MASK) { __VA_ARGS__; } @@ -91,9 +102,9 @@ using TracyMetalCtx = void*; #define TracyMetalDebug(mask, ...) #endif -#ifndef TracyMetalZoneScopeWireTap -#define TracyMetalZoneScopeWireTap -#endif//TracyMetalZoneScopeWireTap +#ifndef TracyMetalDebugZoneScopeWireTap +#define TracyMetalDebugZoneScopeWireTap +#endif//TracyMetalDebugZoneScopeWireTap namespace tracy { @@ -109,7 +120,7 @@ public: : m_device(device) { ZoneScopedNC("TracyMetalCtx", tracy::Color::Red4); - + TracyMetalDebug(1<<0, TracyMetalPanic(, "MTLCounterErrorValue = 0x%llx", MTLCounterErrorValue)); TracyMetalDebug(1<<0, TracyMetalPanic(, "MTLCounterDontSample = 0x%llx", MTLCounterDontSample)); @@ -123,19 +134,19 @@ public: } if (![m_device supportsCounterSampling:MTLCounterSamplingPointAtDrawBoundary]) { - TracyMetalPanic(, "WARNING: timestamp sampling at draw call boundary is not supported."); + TracyMetalDebug(1<<0, fprintf(stderr, "WARNING: timestamp sampling at draw call boundary is not supported.\n")); } if (![m_device supportsCounterSampling:MTLCounterSamplingPointAtBlitBoundary]) { - TracyMetalPanic(, "WARNING: timestamp sampling at blit boundary is not supported."); + TracyMetalDebug(1<<0, fprintf(stderr, "WARNING: timestamp sampling at blit boundary is not supported.\n")); } if (![m_device supportsCounterSampling:MTLCounterSamplingPointAtDispatchBoundary]) { - TracyMetalPanic(, "WARNING: timestamp sampling at compute dispatch boundary is not supported."); + TracyMetalDebug(1<<0, fprintf(stderr, "WARNING: timestamp sampling at compute dispatch boundary is not supported.\n")); } if (![m_device supportsCounterSampling:MTLCounterSamplingPointAtTileDispatchBoundary]) { - TracyMetalPanic(, "WARNING: timestamp sampling at tile dispatch boundary is not supported."); + TracyMetalDebug(1<<0, fprintf(stderr, "WARNING: timestamp sampling at tile dispatch boundary is not supported.\n")); } m_counterSampleBuffers[0] = NewTimestampSampleBuffer(m_device, MaxQueries); @@ -161,20 +172,20 @@ public: MemWrite(&item->hdr.type, QueueType::GpuNewContext); MemWrite(&item->gpuNewContext.cpuTime, int64_t(cpuTimestamp)); MemWrite(&item->gpuNewContext.gpuTime, int64_t(gpuTimestamp)); - MemWrite(&item->gpuNewContext.thread, uint32_t(0)); // #TODO: why not GetThreadHandle()? + MemWrite(&item->gpuNewContext.thread, uint32_t(0)); // TODO: why not GetThreadHandle()? MemWrite(&item->gpuNewContext.period, period); MemWrite(&item->gpuNewContext.context, m_contextId); //MemWrite(&item->gpuNewContext.flags, GpuContextCalibration); MemWrite(&item->gpuNewContext.flags, GpuContextFlags(0)); MemWrite(&item->gpuNewContext.type, GpuContextType::Metal); - Profiler::QueueSerialFinish(); // TODO: DeferItem() for TRACY_ON_DEMAND + SubmitQueueItem(item); } ~MetalCtx() { ZoneScopedNC("~TracyMetalCtx", tracy::Color::Red4); - ZoneValue(m_previousCheckpoint.load()); - ZoneValue(m_queryCounter.load()); + TracyMetalDebug(1<<0, ZoneValue(m_previousCheckpoint.load())); + TracyMetalDebug(1<<0, ZoneValue(m_queryCounter.load())); // collect the last remnants of Metal GPU activity... // TODO: add a timeout to this loop? while (m_previousCheckpoint.load() != m_queryCounter.load()) @@ -204,15 +215,12 @@ public: auto ptr = (char*)tracy_malloc( len ); memcpy( ptr, name, len ); - auto item = Profiler::QueueSerial(); + auto* item = Profiler::QueueSerial(); MemWrite( &item->hdr.type, QueueType::GpuContextName ); MemWrite( &item->gpuContextNameFat.context, m_contextId ); MemWrite( &item->gpuContextNameFat.ptr, (uint64_t)ptr ); MemWrite( &item->gpuContextNameFat.size, len ); -#ifdef TRACY_ON_DEMAND - GetProfiler().DeferItem( *item ); -#endif - Profiler::QueueSerialFinish(); + SubmitQueueItem(item); } bool Collect() @@ -237,8 +245,8 @@ public: uintptr_t begin = m_previousCheckpoint.load(); uintptr_t latestCheckpoint = m_queryCounter.load(); // TODO: MTLEvent? MTLFence?; - ZoneValue(begin); - ZoneValue(latestCheckpoint); + TracyMetalDebug(1<<3, ZoneValue(begin)); + TracyMetalDebug(1<<3, ZoneValue(latestCheckpoint)); uint32_t count = RingCount(begin, latestCheckpoint); if (count == 0) // no pending timestamp queries @@ -259,7 +267,7 @@ public: count = RingSize() - RingIndex(begin); reallocateBuffer = true; } - ZoneValue(count); + TracyMetalDebug(1<<3, ZoneValue(count)); auto buffer_idx = (begin / MaxQueries) % 2; auto counterSampleBuffer = m_counterSampleBuffers[buffer_idx]; @@ -306,7 +314,7 @@ public: auto requestTime = m_timestampRequestTime[k]; auto ms_in_flight = std::chrono::duration(checkTime-requestTime).count()*1000.0f; TracyMetalDebug(1<<4, TracyMetalPanic(, "Collect: invalid timestamp (zero) at %u [%.0fms in flight].", k, ms_in_flight)); - const float timeout_ms = 200.0f; + const float timeout_ms = TRACY_METAL_TIMESTAMP_COLLECT_TIMEOUT * 1000.0f; if (ms_in_flight < timeout_ms) break; ZoneScopedN("TracyMetal::Collect::Drop"); @@ -336,7 +344,7 @@ public: TracyMetalDebug(1<<1, TracyFreeN((void*)(uintptr_t)k, "TracyMetalTimestampQueryId")); resolved += 2; } - ZoneValue(RingCount(begin, m_previousCheckpoint.load())); + TracyMetalDebug(1<<3, ZoneValue(RingCount(begin, m_previousCheckpoint.load()))); m_previousCheckpoint += resolved; @@ -346,7 +354,9 @@ public: // never happen so long as Collect is called frequently enough to prevent pending // timestamp query requests from piling up too quickly. if ((resolved == count) && (m_previousCheckpoint.load() % MaxQueries) == 0) + { m_counterSampleBuffers[buffer_idx] = NewTimestampSampleBuffer(m_device, MaxQueries); + } //RecalibrateClocks(); // to account for drift @@ -354,6 +364,14 @@ public: } private: + tracy_force_inline void SubmitQueueItem(QueueItem* item) + { +#ifdef TRACY_ON_DEMAND + GetProfiler().DeferItem(*item); +#endif + Profiler::QueueSerialFinish(); + } + tracy_force_inline uint32_t RingIndex(uintptr_t index) { index %= MaxQueries; @@ -378,7 +396,7 @@ private: { ZoneScopedNC("TracyMetal::NextQuery", tracy::Color::LightCoral); auto id = m_queryCounter.fetch_add(2); - ZoneValue(id); + TracyMetalDebug(1<<1, ZoneValue(id)); auto count = RingCount(m_previousCheckpoint, id); if (count >= MaxQueries) { @@ -386,17 +404,17 @@ private: Query sentinel = Query{ m_counterSampleBuffers[1], MaxQueries-2 }; TracyMetalPanic( return sentinel, - "NextQueryId: FULL! too many pending timestamp queries. [%llu, %llu] (%u)", + "NextQueryId: FULL! too many pending timestamp queries. Consider calling TracyMetalCollect() more frequently. [%llu, %llu] (%u)", m_previousCheckpoint.load(), id, count ); } uint32_t buffer_idx = (id / MaxQueries) % 2; - ZoneValue(buffer_idx); + TracyMetalDebug(1<<1, ZoneValue(buffer_idx)); auto buffer = m_counterSampleBuffers[buffer_idx]; if (buffer == nil) TracyMetalPanic(, "NextQueryId: sample buffer is nil! (id=%llu)", id); uint32_t idx = RingIndex(id); - ZoneValue(idx); + TracyMetalDebug(1<<1, ZoneValue(idx)); TracyMetalDebug(1<<1, TracyAllocN((void*)(uintptr_t)idx, 2, "TracyMetalTimestampQueryId")); m_timestampRequestTime[idx] = std::chrono::high_resolution_clock::now(); return Query{ buffer, idx }; @@ -526,7 +544,7 @@ public: SubmitZoneBeginGpu(ctx, query.idx + 0, srcloc); } -#if 0 + /* TODO: implement this constructor interfarce for "command-level" profiling, if the device supports it tracy_force_inline MetalZoneScope( MetalCtx* ctx, id cmdEncoder, const SourceLocationData* srcloc, bool is_active ) #ifdef TRACY_ON_DEMAND : m_active( is_active && GetProfiler().IsConnected() ) @@ -544,7 +562,7 @@ public: SubmitZoneBeginGpu(ctx, query.idx, srcloc); } -#endif + */ tracy_force_inline ~MetalZoneScope() { @@ -553,13 +571,16 @@ public: SubmitZoneEndGpu(m_ctx, m_query.idx + 1); } - TracyMetalZoneScopeWireTap; + TracyMetalDebugZoneScopeWireTap; private: const bool m_active; MetalCtx* m_ctx; + + /* TODO: declare it for "command-level" profiling id m_cmdEncoder; + */ static void SubmitZoneBeginGpu(MetalCtx* ctx, uint32_t queryId, const SourceLocationData* srcloc) { From 19f376a6c90b20dfb12320edbc9afc121201b8a5 Mon Sep 17 00:00:00 2001 From: Marcos Slomp Date: Mon, 9 Sep 2024 17:08:15 -0700 Subject: [PATCH 25/27] fixixng interface macros --- public/tracy/TracyMetal.hmm | 62 ++++++++++++++++++------------------- 1 file changed, 31 insertions(+), 31 deletions(-) diff --git a/public/tracy/TracyMetal.hmm b/public/tracy/TracyMetal.hmm index 5a6b9a7cf..357d48554 100644 --- a/public/tracy/TracyMetal.hmm +++ b/public/tracy/TracyMetal.hmm @@ -37,17 +37,17 @@ #define TracyMetalDestroy(ctx) #define TracyMetalContextName(ctx, name, size) -#define TracyMetalZone(ctx, name) -#define TracyMetalZoneC(ctx, name, color) -#define TracyMetalNamedZone(ctx, varname, name, active) -#define TracyMetalNamedZoneC(ctx, varname, name, color, active) -#define TracyMetalZoneTransient(ctx, varname, name, active) - -#define TracyMetalZoneS(ctx, name, depth) -#define TracyMetalZoneCS(ctx, name, color, depth) -#define TracyMetalNamedZoneS(ctx, varname, name, depth, active) -#define TracyMetalNamedZoneCS(ctx, varname, name, color, depth, active) -#define TracyMetalZoneTransientS(ctx, varname, name, depth, active) +#define TracyMetalZone(ctx, encoder, name) +#define TracyMetalZoneC(ctx, encoder, name, color) +#define TracyMetalNamedZone(ctx, varname, encoder, name, active) +#define TracyMetalNamedZoneC(ctx, varname, encoder, name, color, active) +#define TracyMetalZoneTransient(ctx, varname, encoder, name, active) + +#define TracyMetalZoneS(ctx, encoder, name, depth) +#define TracyMetalZoneCS(ctx,encoder, name, color, depth) +#define TracyMetalNamedZoneS(ctx, varname, nencoder, ame, depth, active) +#define TracyMetalNamedZoneCS(ctx, varname, encoder, name, color, depth, active) +#define TracyMetalZoneTransientS(ctx, varname, encoder, name, depth, active) #define TracyMetalCollect(ctx) @@ -621,31 +621,31 @@ using TracyMetalCtx = tracy::MetalCtx*; #define TracyMetalContextName(ctx, name, size) ctx->Name(name, size) #if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK -# define TracyMetalZone( ctx, name ) TracyMetalNamedZoneS( ctx, ___tracy_gpu_zone, name, TRACY_CALLSTACK, true ) -# define TracyMetalZoneC( ctx, name, color ) TracyMetalNamedZoneCS( ctx, ___tracy_gpu_zone, name, color, TRACY_CALLSTACK, true ) -# define TracyMetalNamedZone( ctx, varname, name, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::MetalZoneScope varname( ctx, &TracyConcat(__tracy_gpu_source_location,TracyLine), TRACY_CALLSTACK, active ); -# define TracyMetalNamedZoneC( ctx, varname, name, color, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::MetalZoneScope varname( ctx, &TracyConcat(__tracy_gpu_source_location,TracyLine), TRACY_CALLSTACK, active ); -# define TracyMetalZoneTransient(ctx, varname, name, active) TracyMetalZoneTransientS(ctx, varname, cmdList, name, TRACY_CALLSTACK, active) +# define TracyMetalZone( ctx, encoder, name ) TracyMetalNamedZoneS( ctx, ___tracy_gpu_zone, encoder, name, TRACY_CALLSTACK, true ) +# define TracyMetalZoneC( ctx, encoder, name, color ) TracyMetalNamedZoneCS( ctx, ___tracy_gpu_zone, encoder, name, color, TRACY_CALLSTACK, true ) +# define TracyMetalNamedZone( ctx, varname, encoder, name, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::MetalZoneScope varname( ctx, encoder, &TracyConcat(__tracy_gpu_source_location,TracyLine), TRACY_CALLSTACK, active ); +# define TracyMetalNamedZoneC( ctx, varname, encoder, name, color, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::MetalZoneScope varname( ctx, encoder, &TracyConcat(__tracy_gpu_source_location,TracyLine), TRACY_CALLSTACK, active ); +# define TracyMetalZoneTransient(ctx, varname, encoder, name, active) TracyMetalZoneTransientS(ctx, varname, encoder, name, TRACY_CALLSTACK, active) #else -# define TracyMetalZone( ctx, cmdEnc, name ) TracyMetalNamedZone( ctx, ___tracy_gpu_zone, cmdEnc, name, true ) -# define TracyMetalZoneC( ctx, name, color ) TracyMetalNamedZoneC( ctx, ___tracy_gpu_zone, name, color, true ) -# define TracyMetalNamedZone( ctx, varname, cmdEnc, name, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::MetalZoneScope varname( ctx, cmdEnc, &TracyConcat(__tracy_gpu_source_location,TracyLine), active ); -# define TracyMetalNamedZoneC( ctx, varname, name, color, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::MetalZoneScope varname( ctx, &TracyConcat(__tracy_gpu_source_location,TracyLine), active ); -# define TracyMetalZoneTransient(ctx, varname, name, active) tracy::MetalZoneScope varname{ ctx, TracyLine, TracyFile, strlen(TracyFile), TracyFunction, strlen(TracyFunction), name, strlen(name), active }; +# define TracyMetalZone( ctx, encoder, name ) TracyMetalNamedZone( ctx, ___tracy_gpu_zone, encoder, name, true ) +# define TracyMetalZoneC( ctx, encoder, name, color ) TracyMetalNamedZoneC( ctx, ___tracy_gpu_zone, encoder, name, color, true ) +# define TracyMetalNamedZone( ctx, varname, encoder, name, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::MetalZoneScope varname( ctx, encoder, &TracyConcat(__tracy_gpu_source_location,TracyLine), active ); +# define TracyMetalNamedZoneC( ctx, varname, encoder, name, color, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::MetalZoneScope varname( ctx, encoder, &TracyConcat(__tracy_gpu_source_location,TracyLine), active ); +# define TracyMetalZoneTransient(ctx, varname, encoder, name, active) tracy::MetalZoneScope varname{ ctx, encoder, TracyLine, TracyFile, strlen(TracyFile), TracyFunction, strlen(TracyFunction), name, strlen(name), active }; #endif #ifdef TRACY_HAS_CALLSTACK -# define TracyMetalZoneS( ctx, name, depth ) TracyMetalNamedZoneS( ctx, ___tracy_gpu_zone, name, depth, true ) -# define TracyMetalZoneCS( ctx, name, color, depth ) TracyMetalNamedZoneCS( ctx, ___tracy_gpu_zone, name, color, depth, true ) -# define TracyMetalNamedZoneS( ctx, varname, name, depth, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::MetalZoneScope varname( ctx, &TracyConcat(__tracy_gpu_source_location,TracyLine), depth, active ); -# define TracyMetalNamedZoneCS( ctx, varname, name, color, depth, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::MetalZoneScope varname( ctx, &TracyConcat(__tracy_gpu_source_location,TracyLine), depth, active ); -# define TracyMetalZoneTransientS(ctx, varname, name, depth, active) tracy::MetalZoneScope varname{ ctx, TracyLine, TracyFile, strlen(TracyFile), TracyFunction, strlen(TracyFunction), name, strlen(name), depth, active }; +# define TracyMetalZoneS( ctx, encoder, name, depth ) TracyMetalNamedZoneS( ctx, ___tracy_gpu_zone, encoder, name, depth, true ) +# define TracyMetalZoneCS( ctx, encoder, name, color, depth ) TracyMetalNamedZoneCS( ctx, ___tracy_gpu_zone, encoder, name, color, depth, true ) +# define TracyMetalNamedZoneS( ctx, varname, encoder, name, depth, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::MetalZoneScope varname( ctx, encoder, &TracyConcat(__tracy_gpu_source_location,TracyLine), depth, active ); +# define TracyMetalNamedZoneCS( ctx, varname, encoder, name, color, depth, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::MetalZoneScope varname( ctx, encoder, &TracyConcat(__tracy_gpu_source_location,TracyLine), depth, active ); +# define TracyMetalZoneTransientS(ctx, varname, encoder, name, depth, active) tracy::MetalZoneScope varname{ ctx, encoder, TracyLine, TracyFile, strlen(TracyFile), TracyFunction, strlen(TracyFunction), name, strlen(name), depth, active }; #else -# define TracyMetalZoneS( ctx, name, depth, active ) TracyMetalZone( ctx, name ) -# define TracyMetalZoneCS( ctx, name, color, depth, active ) TracyMetalZoneC( name, color ) -# define TracyMetalNamedZoneS( ctx, varname, name, depth, active ) TracyMetalNamedZone( ctx, varname, name, active ) -# define TracyMetalNamedZoneCS( ctx, varname, name, color, depth, active ) TracyMetalNamedZoneC( ctx, varname, name, color, active ) -# define TracyMetalZoneTransientS(ctx, varname, name, depth, active) TracyMetalZoneTransient(ctx, varname, name, active) +# define TracyMetalZoneS( ctx, encoder, name, depth, active ) TracyMetalZone( ctx, encoder, name ) +# define TracyMetalZoneCS( ctx, encoder, name, color, depth, active ) TracyMetalZoneC( ctx, encoder, name, color ) +# define TracyMetalNamedZoneS( ctx, varname, encoder, name, depth, active ) TracyMetalNamedZone( ctx, varname, encoder, name, active ) +# define TracyMetalNamedZoneCS( ctx, varname, encoder, name, color, depth, active ) TracyMetalNamedZoneC( ctx, varname, encoder, name, color, active ) +# define TracyMetalZoneTransientS(ctx, varname, encoder, name, depth, active) TracyMetalZoneTransient(ctx, varname, encoder, name, active) #endif #define TracyMetalCollect( ctx ) ctx->Collect(); From bd061d1d1b61617edd42a2e23d3d91fd8e9d4658 Mon Sep 17 00:00:00 2001 From: Marcos Slomp Date: Mon, 9 Sep 2024 17:20:19 -0700 Subject: [PATCH 26/27] removing unsupported macro interfaces --- public/tracy/TracyMetal.hmm | 46 +++++++------------------------------ 1 file changed, 8 insertions(+), 38 deletions(-) diff --git a/public/tracy/TracyMetal.hmm b/public/tracy/TracyMetal.hmm index 357d48554..d7aa4d2ec 100644 --- a/public/tracy/TracyMetal.hmm +++ b/public/tracy/TracyMetal.hmm @@ -37,17 +37,10 @@ #define TracyMetalDestroy(ctx) #define TracyMetalContextName(ctx, name, size) -#define TracyMetalZone(ctx, encoder, name) -#define TracyMetalZoneC(ctx, encoder, name, color) -#define TracyMetalNamedZone(ctx, varname, encoder, name, active) -#define TracyMetalNamedZoneC(ctx, varname, encoder, name, color, active) -#define TracyMetalZoneTransient(ctx, varname, encoder, name, active) - -#define TracyMetalZoneS(ctx, encoder, name, depth) -#define TracyMetalZoneCS(ctx,encoder, name, color, depth) -#define TracyMetalNamedZoneS(ctx, varname, nencoder, ame, depth, active) -#define TracyMetalNamedZoneCS(ctx, varname, encoder, name, color, depth, active) -#define TracyMetalZoneTransientS(ctx, varname, encoder, name, depth, active) +#define TracyMetalZone(ctx, encoderDesc, name) +#define TracyMetalZoneC(ctx, encoderDesc, name, color) +#define TracyMetalNamedZone(ctx, varname, encoderDesc, name, active) +#define TracyMetalNamedZoneC(ctx, varname, encoderDesc, name, color, active) #define TracyMetalCollect(ctx) @@ -620,33 +613,10 @@ using TracyMetalCtx = tracy::MetalCtx*; #define TracyMetalDestroy(ctx) tracy::MetalCtx::Destroy(ctx) #define TracyMetalContextName(ctx, name, size) ctx->Name(name, size) -#if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK -# define TracyMetalZone( ctx, encoder, name ) TracyMetalNamedZoneS( ctx, ___tracy_gpu_zone, encoder, name, TRACY_CALLSTACK, true ) -# define TracyMetalZoneC( ctx, encoder, name, color ) TracyMetalNamedZoneCS( ctx, ___tracy_gpu_zone, encoder, name, color, TRACY_CALLSTACK, true ) -# define TracyMetalNamedZone( ctx, varname, encoder, name, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::MetalZoneScope varname( ctx, encoder, &TracyConcat(__tracy_gpu_source_location,TracyLine), TRACY_CALLSTACK, active ); -# define TracyMetalNamedZoneC( ctx, varname, encoder, name, color, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::MetalZoneScope varname( ctx, encoder, &TracyConcat(__tracy_gpu_source_location,TracyLine), TRACY_CALLSTACK, active ); -# define TracyMetalZoneTransient(ctx, varname, encoder, name, active) TracyMetalZoneTransientS(ctx, varname, encoder, name, TRACY_CALLSTACK, active) -#else -# define TracyMetalZone( ctx, encoder, name ) TracyMetalNamedZone( ctx, ___tracy_gpu_zone, encoder, name, true ) -# define TracyMetalZoneC( ctx, encoder, name, color ) TracyMetalNamedZoneC( ctx, ___tracy_gpu_zone, encoder, name, color, true ) -# define TracyMetalNamedZone( ctx, varname, encoder, name, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::MetalZoneScope varname( ctx, encoder, &TracyConcat(__tracy_gpu_source_location,TracyLine), active ); -# define TracyMetalNamedZoneC( ctx, varname, encoder, name, color, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::MetalZoneScope varname( ctx, encoder, &TracyConcat(__tracy_gpu_source_location,TracyLine), active ); -# define TracyMetalZoneTransient(ctx, varname, encoder, name, active) tracy::MetalZoneScope varname{ ctx, encoder, TracyLine, TracyFile, strlen(TracyFile), TracyFunction, strlen(TracyFunction), name, strlen(name), active }; -#endif - -#ifdef TRACY_HAS_CALLSTACK -# define TracyMetalZoneS( ctx, encoder, name, depth ) TracyMetalNamedZoneS( ctx, ___tracy_gpu_zone, encoder, name, depth, true ) -# define TracyMetalZoneCS( ctx, encoder, name, color, depth ) TracyMetalNamedZoneCS( ctx, ___tracy_gpu_zone, encoder, name, color, depth, true ) -# define TracyMetalNamedZoneS( ctx, varname, encoder, name, depth, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::MetalZoneScope varname( ctx, encoder, &TracyConcat(__tracy_gpu_source_location,TracyLine), depth, active ); -# define TracyMetalNamedZoneCS( ctx, varname, encoder, name, color, depth, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::MetalZoneScope varname( ctx, encoder, &TracyConcat(__tracy_gpu_source_location,TracyLine), depth, active ); -# define TracyMetalZoneTransientS(ctx, varname, encoder, name, depth, active) tracy::MetalZoneScope varname{ ctx, encoder, TracyLine, TracyFile, strlen(TracyFile), TracyFunction, strlen(TracyFunction), name, strlen(name), depth, active }; -#else -# define TracyMetalZoneS( ctx, encoder, name, depth, active ) TracyMetalZone( ctx, encoder, name ) -# define TracyMetalZoneCS( ctx, encoder, name, color, depth, active ) TracyMetalZoneC( ctx, encoder, name, color ) -# define TracyMetalNamedZoneS( ctx, varname, encoder, name, depth, active ) TracyMetalNamedZone( ctx, varname, encoder, name, active ) -# define TracyMetalNamedZoneCS( ctx, varname, encoder, name, color, depth, active ) TracyMetalNamedZoneC( ctx, varname, encoder, name, color, active ) -# define TracyMetalZoneTransientS(ctx, varname, encoder, name, depth, active) TracyMetalZoneTransient(ctx, varname, encoder, name, active) -#endif +#define TracyMetalZone( ctx, encoderDesc, name ) TracyMetalNamedZone( ctx, ___tracy_gpu_zone, encoderDesc, name, true ) +#define TracyMetalZoneC( ctx, encoderDesc, name, color ) TracyMetalNamedZoneC( ctx, ___tracy_gpu_zone, encoderDesc, name, color, true ) +#define TracyMetalNamedZone( ctx, varname, encoderDesc, name, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::MetalZoneScope varname( ctx, encoderDesc, &TracyConcat(__tracy_gpu_source_location,TracyLine), active ); +#define TracyMetalNamedZoneC( ctx, varname, encoderDesc, name, color, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::MetalZoneScope varname( ctx, encoderDesc, &TracyConcat(__tracy_gpu_source_location,TracyLine), active ); #define TracyMetalCollect( ctx ) ctx->Collect(); From c168ff6c3c89a26fb237d1ba6ac1059f8a6fda64 Mon Sep 17 00:00:00 2001 From: Marcos Slomp Date: Mon, 9 Sep 2024 17:26:33 -0700 Subject: [PATCH 27/27] updating documentation --- manual/tracy.tex | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/manual/tracy.tex b/manual/tracy.tex index ef02d9044..b987a1335 100644 --- a/manual/tracy.tex +++ b/manual/tracy.tex @@ -139,7 +139,7 @@ \section*{Quick-start guide} \section{A quick look at Tracy Profiler} \label{quicklook} -Tracy is a real-time, nanosecond resolution \emph{hybrid frame and sampling profiler} that you can use for remote or embedded telemetry of games and other applications. It can profile CPU\footnote{Direct support is provided for C, C++, and Lua integration. At the same time, third-party bindings to many other languages exist on the internet, such as Rust, Zig, C\#, OCaml, Odin, etc.}, GPU\footnote{All major graphic APIs: OpenGL, Vulkan, Direct3D 11/12, OpenCL.}, memory allocations, locks, context switches, automatically attribute screenshots to captured frames, and much more. +Tracy is a real-time, nanosecond resolution \emph{hybrid frame and sampling profiler} that you can use for remote or embedded telemetry of games and other applications. It can profile CPU\footnote{Direct support is provided for C, C++, and Lua integration. At the same time, third-party bindings to many other languages exist on the internet, such as Rust, Zig, C\#, OCaml, Odin, etc.}, GPU\footnote{All major graphic APIs: OpenGL, Vulkan, Direct3D 11/12, Metal, OpenCL.}, memory allocations, locks, context switches, automatically attribute screenshots to captured frames, and much more. While Tracy can perform statistical analysis of sampled call stack data, just like other \emph{statistical profilers} (such as VTune, perf, or Very Sleepy), it mainly focuses on manual markup of the source code. Such markup allows frame-by-frame inspection of the program execution. For example, you will be able to see exactly which functions are called, how much time they require, and how they interact with each other in a multi-threaded environment. In contrast, the statistical analysis may show you the hot spots in your code, but it cannot accurately pinpoint the underlying cause for semi-random frame stutter that may occur every couple of seconds. @@ -976,6 +976,7 @@ \subsection{Feature support matrix} % GPU zones fields intentionally left blank for BSDs GPU zones (OpenGL) & \faCheck & \faCheck & \faCheck & \faPoo & \faPoo & & \faTimes \\ GPU zones (Vulkan) & \faCheck & \faCheck & \faCheck & \faCheck & \faCheck & & \faTimes \\ +GPU zones (Metal) & \faTimes & \faTimes & \faTimes & \faCheck\textsuperscript{\emph{b}} & \faCheck\textsuperscript{\emph{b}} & \faTimes & \faTimes \\ Call stacks & \faCheck & \faCheck & \faCheck & \faCheck & \faCheck & \faCheck & \faTimes \\ Symbol resolution & \faCheck & \faCheck & \faCheck & \faCheck & \faCheck & \faCheck & \faCheck \\ Crash handling & \faCheck & \faCheck & \faCheck & \faTimes & \faTimes & \faTimes & \faTimes \\ @@ -991,6 +992,7 @@ \subsection{Feature support matrix} \vspace{1em} \faPoo{} -- Not possible to support due to platform limitations. \\ \textsuperscript{\emph{a}}Possible through WSL2. +\textsuperscript{\emph{b}}Only tested on Apple Silicon M1 series \caption{Feature support matrix} \label{featuretable} \end{table} @@ -1559,7 +1561,7 @@ \subsubsection{Memory pools} \subsection{GPU profiling} \label{gpuprofiling} -Tracy provides bindings for profiling OpenGL, Vulkan, Direct3D 11, Direct3D 12, and OpenCL execution time on GPU. +Tracy provides bindings for profiling OpenGL, Vulkan, Direct3D 11, Direct3D 12, Metal and OpenCL execution time on GPU. Note that the CPU and GPU timers may be unsynchronized unless you create a calibrated context, but the availability of calibrated contexts is limited. You can try to correct the desynchronization of uncalibrated contexts in the profiler's options (section~\ref{options}). @@ -1665,6 +1667,16 @@ \subsubsection{Direct3D 12} Direct3D 12 contexts are always calibrated. +\subsubsection{Metal} + +To enable Metal support, include the \texttt{public/tracy/TracyMetal.hmm} header file, and create a \texttt{tracy::MetalCtx} object with the \texttt{TracyMetalContext(device)} macro. The object should later be cleaned up with the \texttt{TracyMetalDestroy(context)} macro. To set a custom name for the context, use the \texttt{TracyMetalContextName(name, namelen)} macro. The header file \texttt{TracyMetal.hmm} is intended to be included by \textbf{Objective-C} code, and Objective-C Automatic Reference Counting (ARC) support is required. + +At the moment, the Metal back-end in Tracy operates differently than other GPU back-ends like Vulkan, Direct3D and OpenGL. Specifically, \texttt{TracyMetalZone(name, encoderDescriptor)} must be placed before the site where a command encoder is about to be created. This is because not all Apple hardware supports timestamps at command granularity, and can only provide timestamps around an entire command encoder (this accommodates for all tiers of GPU hardware on Apple platforms). + +You may also use \texttt{TracyMetalZoneC(name, encoderDescriptor, color)} to specify a zone color. There is no interface for callstack or transient Metal zones at the moment. + +You are required to periodically collect the GPU events using the \texttt{TracyMetalCollect(ctx)} macro. Good places for collection are: after synchronous waits, after present drawable calls, and inside the completion handler of command buffers. + \subsubsection{OpenCL} OpenCL support is achieved by including the \texttt{public/tracy/TracyOpenCL.hpp} header file. Tracing OpenCL requires the creation of a Tracy OpenCL context using the macro \texttt{TracyCLContext(context, device)}, which will return an instance of \texttt{TracyCLCtx} object that must be used when creating zones. The specified \texttt{device} must be part of the \texttt{context}. Cleanup is performed using the \texttt{TracyCLDestroy(ctx)} macro. Although not common, it is possible to create multiple OpenCL contexts for the same application. To set a custom name for the context, use the \texttt{TracyCLContextName(ctx, name, size)} macro. @@ -1679,13 +1691,13 @@ \subsubsection{Multiple zones in one scope} Putting more than one GPU zone macro in a single scope features the same issue as with the \texttt{ZoneScoped} macros, described in section~\ref{multizone} (but this time the variable name is \texttt{\_\_\_tracy\_gpu\_zone}). -To solve this problem, in case of OpenGL use the \texttt{TracyGpuNamedZone} macro in place of \texttt{TracyGpuZone} (or the color variant). The same applies to Vulkan and Direct3D 11/12 -- replace \texttt{TracyVkZone} with \texttt{TracyVkNamedZone} and \texttt{TracyD3D11Zone}/\texttt{TracyD3D12Zone} with \texttt{TracyD3D11NamedZone}/\texttt{TracyD3D12NamedZone}. +To solve this problem, in case of OpenGL use the \texttt{TracyGpuNamedZone} macro in place of \texttt{TracyGpuZone} (or the color variant). The same applies to Vulkan, Direct3D 11/12 and Metal -- replace \texttt{TracyVkZone} with \texttt{TracyVkNamedZone}, \texttt{TracyD3D11Zone}/\texttt{TracyD3D12Zone} with \texttt{TracyD3D11NamedZone}/\texttt{TracyD3D12NamedZone}, and \texttt{TracyMetalZone} with \texttt{TracyMetalNamedZone}. Remember to provide your name for the created stack variable as the first parameter to the macros. \subsubsection{Transient GPU zones} -Transient zones (see section~\ref{transientzones} for details) are available in OpenGL, Vulkan, and Direct3D 11/12 macros. +Transient zones (see section~\ref{transientzones} for details) are available in OpenGL, Vulkan, and Direct3D 11/12 macros. Transient zones are not available for Metal at this moment. \subsection{Fibers} \label{fibers} @@ -3198,7 +3210,7 @@ \subsubsection{Timeline view} The left-hand side \emph{index area} of the timeline view displays various labels (threads, locks), which can be categorized in the following way: \begin{itemize} -\item \emph{Light blue label} -- GPU context. Multi-threaded Vulkan, OpenCL, and Direct3D 12 contexts are additionally split into separate threads. +\item \emph{Light blue label} -- GPU context. Multi-threaded Vulkan, OpenCL, Direct3D 12 and Metal contexts are additionally split into separate threads. \item \emph{Pink label} -- CPU data graph. \item \emph{White label} -- A CPU thread. It will be replaced by a bright red label in a thread that has crashed (section~\ref{crashhandling}). If automated sampling was performed, clicking the~\LMB{}~left mouse button on the \emph{\faGhost{}~ghost zones} button will switch zone display mode between 'instrumented' and 'ghost.' \item \emph{Green label} -- Fiber, coroutine, or any other sort of cooperative multitasking 'green thread.' @@ -3218,7 +3230,7 @@ \subsubsection{Timeline view} Meanwhile, the \emph{Streaming thread} is performing some \emph{Streaming jobs}. The first \emph{Streaming job} sent a message (section~\ref{messagelog}). In addition to being listed in the message log, it is indicated by a triangle over the thread separator. When multiple messages are in one place, the triangle outline shape changes to a filled triangle. -The GPU zones are displayed just like CPU zones, with an OpenGL/Vulkan/Direct3D/OpenCL context in place of a thread name. +The GPU zones are displayed just like CPU zones, with an OpenGL/Vulkan/Direct3D/Metal/OpenCL context in place of a thread name. Hovering the \faMousePointer{} mouse pointer over a zone will highlight all other zones that have the exact source location with a white outline. Clicking the \LMB{}~left mouse button on a zone will open the zone information window (section~\ref{zoneinfo}). Holding the \keys{\ctrl} key and clicking the \LMB{}~left mouse button on a zone will open the zone statistics window (section~\ref{findzone}). Clicking the \MMB{}~middle mouse button on a zone will zoom the view to the extent of the zone. @@ -3389,7 +3401,7 @@ \subsection{Options menu} \begin{itemize} \item \emph{\faSignature{} Draw CPU usage graph} -- You can disable drawing of the CPU usage graph here. \end{itemize} -\item \emph{\faEye{} Draw GPU zones} -- Allows disabling display of OpenGL/Vulkan/Direct3D/OpenCL zones. The \emph{GPU zones} drop-down allows disabling individual GPU contexts and setting CPU/GPU drift offsets of uncalibrated contexts (see section~\ref{gpuprofiling} for more information). The \emph{\faRobot~Auto} button automatically measures the GPU drift value\footnote{There is an assumption that drift is linear. Automated measurement calculates and removes change over time in delay-to-execution of GPU zones. Resulting value may still be incorrect.}. +\item \emph{\faEye{} Draw GPU zones} -- Allows disabling display of OpenGL/Vulkan/Metal/Direct3D/OpenCL zones. The \emph{GPU zones} drop-down allows disabling individual GPU contexts and setting CPU/GPU drift offsets of uncalibrated contexts (see section~\ref{gpuprofiling} for more information). The \emph{\faRobot~Auto} button automatically measures the GPU drift value\footnote{There is an assumption that drift is linear. Automated measurement calculates and removes change over time in delay-to-execution of GPU zones. Resulting value may still be incorrect.}. \item \emph{\faMicrochip{} Draw CPU zones} -- Determines whether CPU zones are displayed. \begin{itemize} \item \emph{\faGhost{} Draw ghost zones} -- Controls if ghost zones should be displayed in threads which don't have any instrumented zones available.