diff --git a/manual/tracy.tex b/manual/tracy.tex index ef02d90442..b987a1335d 100644 --- a/manual/tracy.tex +++ b/manual/tracy.tex @@ -139,7 +139,7 @@ \section*{Quick-start guide} \section{A quick look at Tracy Profiler} \label{quicklook} -Tracy is a real-time, nanosecond resolution \emph{hybrid frame and sampling profiler} that you can use for remote or embedded telemetry of games and other applications. It can profile CPU\footnote{Direct support is provided for C, C++, and Lua integration. At the same time, third-party bindings to many other languages exist on the internet, such as Rust, Zig, C\#, OCaml, Odin, etc.}, GPU\footnote{All major graphic APIs: OpenGL, Vulkan, Direct3D 11/12, OpenCL.}, memory allocations, locks, context switches, automatically attribute screenshots to captured frames, and much more. +Tracy is a real-time, nanosecond resolution \emph{hybrid frame and sampling profiler} that you can use for remote or embedded telemetry of games and other applications. It can profile CPU\footnote{Direct support is provided for C, C++, and Lua integration. At the same time, third-party bindings to many other languages exist on the internet, such as Rust, Zig, C\#, OCaml, Odin, etc.}, GPU\footnote{All major graphic APIs: OpenGL, Vulkan, Direct3D 11/12, Metal, OpenCL.}, memory allocations, locks, context switches, automatically attribute screenshots to captured frames, and much more. While Tracy can perform statistical analysis of sampled call stack data, just like other \emph{statistical profilers} (such as VTune, perf, or Very Sleepy), it mainly focuses on manual markup of the source code. Such markup allows frame-by-frame inspection of the program execution. For example, you will be able to see exactly which functions are called, how much time they require, and how they interact with each other in a multi-threaded environment. In contrast, the statistical analysis may show you the hot spots in your code, but it cannot accurately pinpoint the underlying cause for semi-random frame stutter that may occur every couple of seconds. @@ -976,6 +976,7 @@ \subsection{Feature support matrix} % GPU zones fields intentionally left blank for BSDs GPU zones (OpenGL) & \faCheck & \faCheck & \faCheck & \faPoo & \faPoo & & \faTimes \\ GPU zones (Vulkan) & \faCheck & \faCheck & \faCheck & \faCheck & \faCheck & & \faTimes \\ +GPU zones (Metal) & \faTimes & \faTimes & \faTimes & \faCheck\textsuperscript{\emph{b}} & \faCheck\textsuperscript{\emph{b}} & \faTimes & \faTimes \\ Call stacks & \faCheck & \faCheck & \faCheck & \faCheck & \faCheck & \faCheck & \faTimes \\ Symbol resolution & \faCheck & \faCheck & \faCheck & \faCheck & \faCheck & \faCheck & \faCheck \\ Crash handling & \faCheck & \faCheck & \faCheck & \faTimes & \faTimes & \faTimes & \faTimes \\ @@ -991,6 +992,7 @@ \subsection{Feature support matrix} \vspace{1em} \faPoo{} -- Not possible to support due to platform limitations. \\ \textsuperscript{\emph{a}}Possible through WSL2. +\textsuperscript{\emph{b}}Only tested on Apple Silicon M1 series. \caption{Feature support matrix} \label{featuretable} \end{table} @@ -1559,7 +1561,7 @@ \subsubsection{Memory pools} \subsection{GPU profiling} \label{gpuprofiling} -Tracy provides bindings for profiling OpenGL, Vulkan, Direct3D 11, Direct3D 12, and OpenCL execution time on GPU. +Tracy provides bindings for profiling OpenGL, Vulkan, Direct3D 11, Direct3D 12, Metal, and OpenCL execution time on GPU. 
Note that the CPU and GPU timers may be unsynchronized unless you create a calibrated context, but the availability of calibrated contexts is limited. You can try to correct the desynchronization of uncalibrated contexts in the profiler's options (section~\ref{options}). @@ -1665,6 +1667,16 @@ \subsubsection{Direct3D 12} Direct3D 12 contexts are always calibrated. +\subsubsection{Metal} + +To enable Metal support, include the \texttt{public/tracy/TracyMetal.hmm} header file, and create a \texttt{tracy::MetalCtx} object with the \texttt{TracyMetalContext(device)} macro. The object should later be cleaned up with the \texttt{TracyMetalDestroy(context)} macro. To set a custom name for the context, use the \texttt{TracyMetalContextName(ctx, name, size)} macro. The header file \texttt{TracyMetal.hmm} is intended to be included by \textbf{Objective-C} code and requires Objective-C Automatic Reference Counting (ARC) to be enabled. + +At the moment, the Metal back-end in Tracy operates differently from other GPU back-ends such as Vulkan, Direct3D, and OpenGL. Specifically, \texttt{TracyMetalZone(ctx, encoderDescriptor, name)} must be placed immediately before the site where a command encoder is created. This is because not all Apple hardware supports timestamps at command granularity; some devices can only provide timestamps around an entire command encoder. Working at encoder granularity accommodates all tiers of GPU hardware on Apple platforms. + +You may also use \texttt{TracyMetalZoneC(ctx, encoderDescriptor, name, color)} to specify a zone color. There is no interface for call stack or transient Metal zones at the moment. + +You must periodically collect the completed GPU events with the \texttt{TracyMetalCollect(ctx)} macro. Good places for collection are after synchronous waits, after presenting a drawable, and inside command buffer completion handlers. + \subsubsection{OpenCL} OpenCL support is achieved by including the \texttt{public/tracy/TracyOpenCL.hpp} header file. Tracing OpenCL requires the creation of a Tracy OpenCL context using the macro \texttt{TracyCLContext(context, device)}, which will return an instance of \texttt{TracyCLCtx} object that must be used when creating zones. The specified \texttt{device} must be part of the \texttt{context}. Cleanup is performed using the \texttt{TracyCLDestroy(ctx)} macro. Although not common, it is possible to create multiple OpenCL contexts for the same application. To set a custom name for the context, use the \texttt{TracyCLContextName(ctx, name, size)} macro. @@ -1679,13 +1691,13 @@ \subsubsection{Multiple zones in one scope} Putting more than one GPU zone macro in a single scope features the same issue as with the \texttt{ZoneScoped} macros, described in section~\ref{multizone} (but this time the variable name is \texttt{\_\_\_tracy\_gpu\_zone}). -To solve this problem, in case of OpenGL use the \texttt{TracyGpuNamedZone} macro in place of \texttt{TracyGpuZone} (or the color variant). The same applies to Vulkan and Direct3D 11/12 -- replace \texttt{TracyVkZone} with \texttt{TracyVkNamedZone} and \texttt{TracyD3D11Zone}/\texttt{TracyD3D12Zone} with \texttt{TracyD3D11NamedZone}/\texttt{TracyD3D12NamedZone}. +To solve this problem, in the case of OpenGL use the \texttt{TracyGpuNamedZone} macro in place of \texttt{TracyGpuZone} (or the color variant). 
The same applies to Vulkan, Direct3D 11/12, and Metal -- replace \texttt{TracyVkZone} with \texttt{TracyVkNamedZone}, \texttt{TracyD3D11Zone}/\texttt{TracyD3D12Zone} with \texttt{TracyD3D11NamedZone}/\texttt{TracyD3D12NamedZone}, and \texttt{TracyMetalZone} with \texttt{TracyMetalNamedZone}. Remember to provide your name for the created stack variable as the first parameter to the macros. \subsubsection{Transient GPU zones} -Transient zones (see section~\ref{transientzones} for details) are available in OpenGL, Vulkan, and Direct3D 11/12 macros. +Transient zones (see section~\ref{transientzones} for details) are available in OpenGL, Vulkan, and Direct3D 11/12 macros. They are not yet available for Metal. \subsection{Fibers} \label{fibers} @@ -3198,7 +3210,7 @@ \subsubsection{Timeline view} The left-hand side \emph{index area} of the timeline view displays various labels (threads, locks), which can be categorized in the following way: \begin{itemize} -\item \emph{Light blue label} -- GPU context. Multi-threaded Vulkan, OpenCL, and Direct3D 12 contexts are additionally split into separate threads. +\item \emph{Light blue label} -- GPU context. Multi-threaded Vulkan, OpenCL, Direct3D 12, and Metal contexts are additionally split into separate threads. \item \emph{Pink label} -- CPU data graph. \item \emph{White label} -- A CPU thread. It will be replaced by a bright red label in a thread that has crashed (section~\ref{crashhandling}). If automated sampling was performed, clicking the~\LMB{}~left mouse button on the \emph{\faGhost{}~ghost zones} button will switch zone display mode between 'instrumented' and 'ghost.' \item \emph{Green label} -- Fiber, coroutine, or any other sort of cooperative multitasking 'green thread.' @@ -3218,7 +3230,7 @@ \subsubsection{Timeline view} Meanwhile, the \emph{Streaming thread} is performing some \emph{Streaming jobs}. The first \emph{Streaming job} sent a message (section~\ref{messagelog}). In addition to being listed in the message log, it is indicated by a triangle over the thread separator. When multiple messages are in one place, the triangle outline shape changes to a filled triangle. -The GPU zones are displayed just like CPU zones, with an OpenGL/Vulkan/Direct3D/OpenCL context in place of a thread name. +The GPU zones are displayed just like CPU zones, with an OpenGL/Vulkan/Direct3D/Metal/OpenCL context in place of a thread name. Hovering the \faMousePointer{} mouse pointer over a zone will highlight all other zones that have the exact source location with a white outline. Clicking the \LMB{}~left mouse button on a zone will open the zone information window (section~\ref{zoneinfo}). Holding the \keys{\ctrl} key and clicking the \LMB{}~left mouse button on a zone will open the zone statistics window (section~\ref{findzone}). Clicking the \MMB{}~middle mouse button on a zone will zoom the view to the extent of the zone. @@ -3389,7 +3401,7 @@ \subsection{Options menu} \begin{itemize} \item \emph{\faSignature{} Draw CPU usage graph} -- You can disable drawing of the CPU usage graph here. \end{itemize} -\item \emph{\faEye{} Draw GPU zones} -- Allows disabling display of OpenGL/Vulkan/Direct3D/OpenCL zones. The \emph{GPU zones} drop-down allows disabling individual GPU contexts and setting CPU/GPU drift offsets of uncalibrated contexts (see section~\ref{gpuprofiling} for more information). The \emph{\faRobot~Auto} button automatically measures the GPU drift value\footnote{There is an assumption that drift is linear. 
Automated measurement calculates and removes change over time in delay-to-execution of GPU zones. Resulting value may still be incorrect.}. +\item \emph{\faEye{} Draw GPU zones} -- Allows disabling display of OpenGL/Vulkan/Metal/Direct3D/OpenCL zones. The \emph{GPU zones} drop-down allows disabling individual GPU contexts and setting CPU/GPU drift offsets of uncalibrated contexts (see section~\ref{gpuprofiling} for more information). The \emph{\faRobot~Auto} button automatically measures the GPU drift value\footnote{There is an assumption that drift is linear. The automated measurement calculates and removes the change over time in the delay-to-execution of GPU zones. The resulting value may still be incorrect.}. \item \emph{\faMicrochip{} Draw CPU zones} -- Determines whether CPU zones are displayed. \begin{itemize} \item \emph{\faGhost{} Draw ghost zones} -- Controls if ghost zones should be displayed in threads which don't have any instrumented zones available. diff --git a/profiler/src/profiler/TracyTimelineItemGpu.cpp b/profiler/src/profiler/TracyTimelineItemGpu.cpp index 4ae1d683fa..b499c64f17 100644 --- a/profiler/src/profiler/TracyTimelineItemGpu.cpp +++ b/profiler/src/profiler/TracyTimelineItemGpu.cpp @@ -42,7 +42,8 @@ void TimelineItemGpu::HeaderTooltip( const char* label ) const const bool isMultithreaded = ( m_gpu->type == GpuContextType::Vulkan ) || ( m_gpu->type == GpuContextType::OpenCL ) || - ( m_gpu->type == GpuContextType::Direct3D12 ); + ( m_gpu->type == GpuContextType::Direct3D12 ) || + ( m_gpu->type == GpuContextType::Metal ); char buf[64]; sprintf( buf, "%s context %i", GpuContextNames[(int)m_gpu->type], m_idx ); diff --git a/public/common/TracyQueue.hpp b/public/common/TracyQueue.hpp index affbd67ab2..ec448d724f 100644 --- a/public/common/TracyQueue.hpp +++ b/public/common/TracyQueue.hpp @@ -401,7 +401,8 @@ enum class GpuContextType : uint8_t Vulkan, OpenCL, Direct3D12, - Direct3D11 + Direct3D11, + Metal }; enum GpuContextFlags : uint8_t diff --git a/public/tracy/TracyMetal.hmm b/public/tracy/TracyMetal.hmm new file mode 100644 index 0000000000..d7aa4d2eca --- /dev/null +++ b/public/tracy/TracyMetal.hmm @@ -0,0 +1,625 @@ +#ifndef __TRACYMETAL_HMM__ +#define __TRACYMETAL_HMM__ + +/* This file implements a Metal API back-end for Tracy (it has only been tested on Apple + Silicon devices, but it should also work on Intel-based Macs and older iOS devices). + The Metal back-end in Tracy operates differently from other GPU back-ends like Vulkan, + Direct3D, and OpenGL. Specifically, TracyMetalZone() must be placed around the site where + a command encoder is created. This is because not all hardware supports timestamps at + command granularity; some can only provide timestamps around an entire command encoder. + This accommodates all tiers of hardware; in the future, variants of TracyMetalZone() + will be added to support the usual command-level granularity of Tracy GPU back-ends. 
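+ As an illustration (tracyCtx, renderPassDesc, and commandBuffer below are placeholder
+ names), a zone at encoder granularity is expected to look roughly like this:
+     TracyMetalZone(tracyCtx, renderPassDesc, "Render");
+     id<MTLRenderCommandEncoder> encoder =
+         [commandBuffer renderCommandEncoderWithDescriptor:renderPassDesc];
+     // ... encode commands ...
+     [encoder endEncoding];
+ so that the zone can attach its counter sample buffer to the pass descriptor before
+ the encoder is created from it.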
+ Metal also imposes a few restrictions that make the process of requesting and collecting + queries more complicated in Tracy: + a) timestamp query buffers are limited to 4096 queries (32KB, where each query is 8 bytes) + b) when a timestamp query buffer is created, Metal initializes all timestamps with zeroes, + and there's no way to reset them back to zero after timestamps get resolved; the only + way to clear the timestamps is by allocating a new timestamp query buffer + c) if a command encoder records no commands and its corresponding command buffer ends up + committed to the command queue, Metal will "optimize away" the encoder along with any + timestamp queries associated with it (the timestamp will remain as zero and will never + get resolved) + Because of the limitations above, two timestamp buffers are managed internally. Once one + of the buffers fills up with requests, the second buffer can start serving new requests. + Once all requests in a buffer get resolved and collected, the entire buffer is discarded + and a new one allocated for future requests. (Proper cycling through a ring buffer would + require bookkeeping and completion handlers to collect only the known complete queries.) + In the current implementation, there is potential for a race condition when the buffer is + discarded and reallocated. In practice, the race condition will never materialize so long + as TracyMetalCollect() is called frequently to keep the number of unresolved queries low. + Finally, there's a timeout mechanism during timestamp collection to detect "empty" command + encoders and ensure progress. +*/ + +#ifndef TRACY_ENABLE + +#define TracyMetalContext(device) nullptr +#define TracyMetalDestroy(ctx) +#define TracyMetalContextName(ctx, name, size) + +#define TracyMetalZone(ctx, encoderDesc, name) +#define TracyMetalZoneC(ctx, encoderDesc, name, color) +#define TracyMetalNamedZone(ctx, varname, encoderDesc, name, active) +#define TracyMetalNamedZoneC(ctx, varname, encoderDesc, name, color, active) + +#define TracyMetalCollect(ctx) + +namespace tracy +{ +class MetalZoneScope {}; +} + +using TracyMetalCtx = void*; + +#else + +#if not __has_feature(objc_arc) +#error TracyMetal requires ARC to be enabled. +#endif + +#include <assert.h> +#include <atomic> +#include <chrono> +#include <mutex> +#include <vector> + +#include "Tracy.hpp" +#include "../client/TracyProfiler.hpp" +#include "../client/TracyCallstack.hpp" +#include "../common/TracyAlign.hpp" +#include "../common/TracyAlloc.hpp" + +// ok to import if in obj-c code +#import <Metal/Metal.h> + +#define VA_ARGS(...) , ##__VA_ARGS__ + +#define TracyMetalPanic(ret, msg, ...) do { \ + char buffer [1024]; \ + snprintf(buffer, sizeof(buffer), "TracyMetal: " msg VA_ARGS(__VA_ARGS__)); \ + TracyMessageC(buffer, strlen(buffer), tracy::Color::OrangeRed); \ + fprintf(stderr, "%s\n", buffer); \ + assert(false && "TracyMetal: " msg); \ + ret; \ + } while(false); + +#ifndef TRACY_METAL_TIMESTAMP_COLLECT_TIMEOUT +#define TRACY_METAL_TIMESTAMP_COLLECT_TIMEOUT 0.200f +#endif//TRACY_METAL_TIMESTAMP_COLLECT_TIMEOUT + +#ifndef TRACY_METAL_DEBUG_MASK +#define TRACY_METAL_DEBUG_MASK (0) +#endif//TRACY_METAL_DEBUG_MASK + +#if TRACY_METAL_DEBUG_MASK +#define TracyMetalDebug(mask, ...) if (mask & TRACY_METAL_DEBUG_MASK) { __VA_ARGS__; } +#else +#define TracyMetalDebug(mask, ...) 
+#endif + +#ifndef TracyMetalDebugZoneScopeWireTap +#define TracyMetalDebugZoneScopeWireTap +#endif//TracyMetalDebugZoneScopeWireTap + +namespace tracy +{ + +class MetalCtx +{ + friend class MetalZoneScope; + + enum { MaxQueries = 4 * 1024 }; // Metal: between 8 and 32768 _BYTES_... + +public: + MetalCtx(id<MTLDevice> device) + : m_device(device) + { + ZoneScopedNC("TracyMetalCtx", tracy::Color::Red4); + + TracyMetalDebug(1<<0, TracyMetalPanic(, "MTLCounterErrorValue = 0x%llx", MTLCounterErrorValue)); + TracyMetalDebug(1<<0, TracyMetalPanic(, "MTLCounterDontSample = 0x%llx", MTLCounterDontSample)); + + if (m_device == nil) + { + TracyMetalPanic(return, "device is nil."); + } + if (![m_device supportsCounterSampling:MTLCounterSamplingPointAtStageBoundary]) + { + TracyMetalPanic(return, "ERROR: timestamp sampling at pipeline stage boundary is not supported."); + } + if (![m_device supportsCounterSampling:MTLCounterSamplingPointAtDrawBoundary]) + { + TracyMetalDebug(1<<0, fprintf(stderr, "WARNING: timestamp sampling at draw call boundary is not supported.\n")); + } + if (![m_device supportsCounterSampling:MTLCounterSamplingPointAtBlitBoundary]) + { + TracyMetalDebug(1<<0, fprintf(stderr, "WARNING: timestamp sampling at blit boundary is not supported.\n")); + } + if (![m_device supportsCounterSampling:MTLCounterSamplingPointAtDispatchBoundary]) + { + TracyMetalDebug(1<<0, fprintf(stderr, "WARNING: timestamp sampling at compute dispatch boundary is not supported.\n")); + } + if (![m_device supportsCounterSampling:MTLCounterSamplingPointAtTileDispatchBoundary]) + { + TracyMetalDebug(1<<0, fprintf(stderr, "WARNING: timestamp sampling at tile dispatch boundary is not supported.\n")); + } + + m_counterSampleBuffers[0] = NewTimestampSampleBuffer(m_device, MaxQueries); + m_counterSampleBuffers[1] = NewTimestampSampleBuffer(m_device, MaxQueries); + + m_timestampRequestTime.resize(MaxQueries); + + MTLTimestamp cpuTimestamp = 0; + MTLTimestamp gpuTimestamp = 0; + [m_device sampleTimestamps:&cpuTimestamp gpuTimestamp:&gpuTimestamp]; + m_mostRecentTimestamp = gpuTimestamp; + TracyMetalDebug(1<<0, TracyMetalPanic(, "Calibration: CPU timestamp (Metal): %llu", cpuTimestamp)); + TracyMetalDebug(1<<0, TracyMetalPanic(, "Calibration: GPU timestamp (Metal): %llu", gpuTimestamp)); + + cpuTimestamp = Profiler::GetTime(); + TracyMetalDebug(1<<0, TracyMetalPanic(, "Calibration: CPU timestamp (Tracy): %llu", cpuTimestamp)); + + float period = 1.0f; + + m_contextId = GetGpuCtxCounter().fetch_add(1); + + auto* item = Profiler::QueueSerial(); + MemWrite(&item->hdr.type, QueueType::GpuNewContext); + MemWrite(&item->gpuNewContext.cpuTime, int64_t(cpuTimestamp)); + MemWrite(&item->gpuNewContext.gpuTime, int64_t(gpuTimestamp)); + MemWrite(&item->gpuNewContext.thread, uint32_t(0)); // TODO: why not GetThreadHandle()? + MemWrite(&item->gpuNewContext.period, period); + MemWrite(&item->gpuNewContext.context, m_contextId); + //MemWrite(&item->gpuNewContext.flags, GpuContextCalibration); + MemWrite(&item->gpuNewContext.flags, GpuContextFlags(0)); + MemWrite(&item->gpuNewContext.type, GpuContextType::Metal); + SubmitQueueItem(item); + } + + ~MetalCtx() + { + ZoneScopedNC("~TracyMetalCtx", tracy::Color::Red4); + TracyMetalDebug(1<<0, ZoneValue(m_previousCheckpoint.load())); + TracyMetalDebug(1<<0, ZoneValue(m_queryCounter.load())); + // collect the last remnants of Metal GPU activity... + // TODO: add a timeout to this loop? 
+ while (m_previousCheckpoint.load() != m_queryCounter.load()) + Collect(); + } + + static MetalCtx* Create(id<MTLDevice> device) + { + auto ctx = static_cast<MetalCtx*>(tracy_malloc(sizeof(MetalCtx))); + new (ctx) MetalCtx(device); + if (ctx->m_contextId == 255) + { + TracyMetalPanic(return nullptr, "ERROR: unable to create context."); + Destroy(ctx); + } + return ctx; + } + + static void Destroy(MetalCtx* ctx) + { + ctx->~MetalCtx(); + tracy_free(ctx); + } + + void Name( const char* name, uint16_t len ) + { + auto ptr = (char*)tracy_malloc( len ); + memcpy( ptr, name, len ); + + auto* item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::GpuContextName ); + MemWrite( &item->gpuContextNameFat.context, m_contextId ); + MemWrite( &item->gpuContextNameFat.ptr, (uint64_t)ptr ); + MemWrite( &item->gpuContextNameFat.size, len ); + SubmitQueueItem(item); + } + + bool Collect() + { + ZoneScopedNC("TracyMetal::Collect", Color::Red4); + +#ifdef TRACY_ON_DEMAND + if (!GetProfiler().IsConnected()) + { + return true; + } +#endif + + // Only one thread is allowed to collect timestamps at any given time + // but there's no need to block contending threads + if (!m_collectionMutex.try_lock()) + { + return true; + } + + std::unique_lock<std::mutex> lock (m_collectionMutex, std::adopt_lock); + + uintptr_t begin = m_previousCheckpoint.load(); + uintptr_t latestCheckpoint = m_queryCounter.load(); // TODO: MTLEvent? MTLFence?; + TracyMetalDebug(1<<3, ZoneValue(begin)); + TracyMetalDebug(1<<3, ZoneValue(latestCheckpoint)); + + uint32_t count = RingCount(begin, latestCheckpoint); + if (count == 0) // no pending timestamp queries + { + //uintptr_t nextCheckpoint = m_queryCounter.load(); + //if (nextCheckpoint != latestCheckpoint) + //{ + // // TODO: signal event / fence now? + //} + return true; + } + + // resolve up until the ring buffer boundary and let a subsequent call + // to Collect handle the wrap-around + bool reallocateBuffer = false; + if (RingIndex(begin) + count >= RingSize()) + { + count = RingSize() - RingIndex(begin); + reallocateBuffer = true; + } + TracyMetalDebug(1<<3, ZoneValue(count)); + + auto buffer_idx = (begin / MaxQueries) % 2; + auto counterSampleBuffer = m_counterSampleBuffers[buffer_idx]; + + if (count >= RingSize()) + { + TracyMetalPanic(return false, "Collect: FULL! too many pending timestamp queries. 
[%llu, %llu] (%u)", begin, latestCheckpoint, count); + } + + TracyMetalDebug(1<<3, TracyMetalPanic(, "Collect: [%llu, %llu] :: (%u)", begin, latestCheckpoint, count)); + + NSRange range = NSMakeRange(RingIndex(begin), count); + NSData* data = [counterSampleBuffer resolveCounterRange:range]; + NSUInteger numResolvedTimestamps = data.length / sizeof(MTLCounterResultTimestamp); + MTLCounterResultTimestamp* timestamps = (MTLCounterResultTimestamp *)(data.bytes); + if (timestamps == nil) + { + TracyMetalPanic(return false, "Collect: unable to resolve timestamps."); + } + + if (numResolvedTimestamps != count) + { + TracyMetalPanic(, "Collect: numResolvedTimestamps != count : %u != %u", (uint32_t)numResolvedTimestamps, count); + } + + int resolved = 0; + for (auto i = 0; i < numResolvedTimestamps; i += 2) + { + ZoneScopedN("TracyMetal::Collect::[i]"); + MTLTimestamp t_start = timestamps[i+0].timestamp; + MTLTimestamp t_end = timestamps[i+1].timestamp; + uint32_t k = RingIndex(begin + i); + TracyMetalDebug(1<<4, TracyMetalPanic(, "Collect: timestamp[%u] = %llu | timestamp[%u] = %llu | diff = %llu\n", k, t_start, k+1, t_end, (t_end - t_start))); + if ((t_start == MTLCounterErrorValue) || (t_end == MTLCounterErrorValue)) + { + TracyMetalPanic(, "Collect: invalid timestamp (MTLCounterErrorValue) at %u.", k); + break; + } + // Metal will initialize timestamp buffer with zeroes; encountering a zero-value + // timestamp means that the timestamp has not been written and resolved yet + if ((t_start == 0) || (t_end == 0)) + { + auto checkTime = std::chrono::high_resolution_clock::now(); + auto requestTime = m_timestampRequestTime[k]; + auto ms_in_flight = std::chrono::duration<float>(checkTime-requestTime).count()*1000.0f; + TracyMetalDebug(1<<4, TracyMetalPanic(, "Collect: invalid timestamp (zero) at %u [%.0fms in flight].", k, ms_in_flight)); + const float timeout_ms = TRACY_METAL_TIMESTAMP_COLLECT_TIMEOUT * 1000.0f; + if (ms_in_flight < timeout_ms) + break; + ZoneScopedN("TracyMetal::Collect::Drop"); + TracyMetalPanic(, "Collect: giving up on timestamp at %u [%.0fms in flight].", k, ms_in_flight); + t_start = m_mostRecentTimestamp + 5; + t_end = t_start + 5; + } + TracyMetalDebug(1<<2, TracyFreeN((void*)(uintptr_t)(k+0), "TracyMetalGpuZone")); + TracyMetalDebug(1<<2, TracyFreeN((void*)(uintptr_t)(k+1), "TracyMetalGpuZone")); + { + auto* item = Profiler::QueueSerial(); + MemWrite(&item->hdr.type, QueueType::GpuTime); + MemWrite(&item->gpuTime.gpuTime, static_cast<int64_t>(t_start)); + MemWrite(&item->gpuTime.queryId, static_cast<uint16_t>(k)); + MemWrite(&item->gpuTime.context, m_contextId); + Profiler::QueueSerialFinish(); + } + { + auto* item = Profiler::QueueSerial(); + MemWrite(&item->hdr.type, QueueType::GpuTime); + MemWrite(&item->gpuTime.gpuTime, static_cast<int64_t>(t_end)); + MemWrite(&item->gpuTime.queryId, static_cast<uint16_t>(k+1)); + MemWrite(&item->gpuTime.context, m_contextId); + Profiler::QueueSerialFinish(); + } + m_mostRecentTimestamp = (t_end > m_mostRecentTimestamp) ? t_end : m_mostRecentTimestamp; + TracyMetalDebug(1<<1, TracyFreeN((void*)(uintptr_t)k, "TracyMetalTimestampQueryId")); + resolved += 2; + } + TracyMetalDebug(1<<3, ZoneValue(RingCount(begin, m_previousCheckpoint.load()))); + + m_previousCheckpoint += resolved; + + // Check whether the timestamp buffer has been fully resolved/collected: + // WARN: there's technically a race condition here: NextQuery() may reference the + // buffer that is being released instead of the new one. 
In practice, this should + never happen so long as Collect is called frequently enough to prevent pending + timestamp query requests from piling up too quickly. + if ((resolved == count) && (m_previousCheckpoint.load() % MaxQueries) == 0) + { + m_counterSampleBuffers[buffer_idx] = NewTimestampSampleBuffer(m_device, MaxQueries); + } + + //RecalibrateClocks(); // to account for drift + + return true; + } + +private: + tracy_force_inline void SubmitQueueItem(QueueItem* item) + { +#ifdef TRACY_ON_DEMAND + GetProfiler().DeferItem(*item); +#endif + Profiler::QueueSerialFinish(); + } + + tracy_force_inline uint32_t RingIndex(uintptr_t index) + { + index %= MaxQueries; + return static_cast<uint32_t>(index); + } + + tracy_force_inline uint32_t RingCount(uintptr_t begin, uintptr_t end) + { + // wrap-around safe: all unsigned + uintptr_t count = end - begin; + return static_cast<uint32_t>(count); + } + + tracy_force_inline uint32_t RingSize() const + { + return MaxQueries; + } + + struct Query { id<MTLCounterSampleBuffer> buffer; uint32_t idx; }; + + tracy_force_inline Query NextQuery() + { + ZoneScopedNC("TracyMetal::NextQuery", tracy::Color::LightCoral); + auto id = m_queryCounter.fetch_add(2); + TracyMetalDebug(1<<1, ZoneValue(id)); + auto count = RingCount(m_previousCheckpoint, id); + if (count >= MaxQueries) + { + // TODO: return a proper (hidden) "sentinel" query + Query sentinel = Query{ m_counterSampleBuffers[1], MaxQueries-2 }; + TracyMetalPanic( + return sentinel, + "NextQueryId: FULL! too many pending timestamp queries. Consider calling TracyMetalCollect() more frequently. [%llu, %llu] (%u)", + m_previousCheckpoint.load(), id, count + ); + } + uint32_t buffer_idx = (id / MaxQueries) % 2; + TracyMetalDebug(1<<1, ZoneValue(buffer_idx)); + auto buffer = m_counterSampleBuffers[buffer_idx]; + if (buffer == nil) + TracyMetalPanic(, "NextQueryId: sample buffer is nil! 
(id=%llu)", id); + uint32_t idx = RingIndex(id); + TracyMetalDebug(1<<1, ZoneValue(idx)); + TracyMetalDebug(1<<1, TracyAllocN((void*)(uintptr_t)idx, 2, "TracyMetalTimestampQueryId")); + m_timestampRequestTime[idx] = std::chrono::high_resolution_clock::now(); + return Query{ buffer, idx }; + } + + tracy_force_inline uint8_t GetContextId() const + { + return m_contextId; + } + + static id<MTLCounterSampleBuffer> NewTimestampSampleBuffer(id<MTLDevice> device, size_t count) + { + ZoneScopedN("TracyMetal::NewTimestampSampleBuffer"); + + id<MTLCounterSet> timestampCounterSet = nil; + for (id<MTLCounterSet> counterSet in device.counterSets) + { + if ([counterSet.name isEqualToString:MTLCommonCounterSetTimestamp]) + { + timestampCounterSet = counterSet; + break; + } + } + if (timestampCounterSet == nil) + { + TracyMetalPanic(return nil, "ERROR: timestamp counters are not supported on the platform."); + } + + MTLCounterSampleBufferDescriptor* sampleDescriptor = [[MTLCounterSampleBufferDescriptor alloc] init]; + sampleDescriptor.counterSet = timestampCounterSet; + sampleDescriptor.sampleCount = MaxQueries; + sampleDescriptor.storageMode = MTLStorageModeShared; + sampleDescriptor.label = @"TracyMetalTimestampPool"; + + NSError* error = nil; + id<MTLCounterSampleBuffer> counterSampleBuffer = [device newCounterSampleBufferWithDescriptor:sampleDescriptor error:&error]; + if (error != nil) + { + //NSLog(@"%@ | %@", error.localizedDescription, error.localizedFailureReason); + TracyMetalPanic(return nil, + "ERROR: unable to create sample buffer for timestamp counters : %s | %s", + [error.localizedDescription cString], [error.localizedFailureReason cString]); + } + + return counterSampleBuffer; + } + + uint8_t m_contextId = 255; + + id<MTLDevice> m_device = nil; + id<MTLCounterSampleBuffer> m_counterSampleBuffers [2] = {}; + + using atomic_counter = std::atomic<uintptr_t>; + static_assert(atomic_counter::is_always_lock_free); + atomic_counter m_queryCounter = 0; + + atomic_counter m_previousCheckpoint = 0; + MTLTimestamp m_mostRecentTimestamp = 0; + + std::vector<std::chrono::time_point<std::chrono::high_resolution_clock>> m_timestampRequestTime; + + std::mutex m_collectionMutex; +}; + +class MetalZoneScope +{ +public: + tracy_force_inline MetalZoneScope( MetalCtx* ctx, MTLComputePassDescriptor* desc, const SourceLocationData* srcloc, bool is_active ) +#ifdef TRACY_ON_DEMAND + : m_active( is_active && GetProfiler().IsConnected() ) +#else + : m_active( is_active ) +#endif + { + if ( !m_active ) return; + if (desc == nil) TracyMetalPanic(return, "compute pass descriptor is nil."); + m_ctx = ctx; + + auto& query = m_query = ctx->NextQuery(); + + desc.sampleBufferAttachments[0].sampleBuffer = query.buffer; + desc.sampleBufferAttachments[0].startOfEncoderSampleIndex = query.idx+0; + desc.sampleBufferAttachments[0].endOfEncoderSampleIndex = query.idx+1; + + SubmitZoneBeginGpu(ctx, query.idx + 0, srcloc); + } + + tracy_force_inline MetalZoneScope( MetalCtx* ctx, MTLBlitPassDescriptor* desc, const SourceLocationData* srcloc, bool is_active ) +#ifdef TRACY_ON_DEMAND + : m_active( is_active && GetProfiler().IsConnected() ) +#else + : m_active( is_active ) +#endif + { + if ( !m_active ) return; + if (desc == nil) TracyMetalPanic(return, "blit pass descriptor is nil."); + m_ctx = ctx; + + auto& query = m_query = ctx->NextQuery(); + + desc.sampleBufferAttachments[0].sampleBuffer = query.buffer; + desc.sampleBufferAttachments[0].startOfEncoderSampleIndex = query.idx+0; + desc.sampleBufferAttachments[0].endOfEncoderSampleIndex = query.idx+1; + + SubmitZoneBeginGpu(ctx, query.idx + 0, srcloc); + } + + tracy_force_inline MetalZoneScope( MetalCtx* ctx, MTLRenderPassDescriptor* desc, const SourceLocationData* srcloc, 
bool is_active ) +#ifdef TRACY_ON_DEMAND + : m_active( is_active && GetProfiler().IsConnected() ) +#else + : m_active( is_active ) +#endif + { + if ( !m_active ) return; + if (desc == nil) TracyMetalPanic(return, "render pass descriptor is nil."); + m_ctx = ctx; + + auto& query = m_query = ctx->NextQuery(); + + desc.sampleBufferAttachments[0].sampleBuffer = query.buffer; + desc.sampleBufferAttachments[0].startOfVertexSampleIndex = query.idx+0; + desc.sampleBufferAttachments[0].endOfVertexSampleIndex = MTLCounterDontSample; + desc.sampleBufferAttachments[0].startOfFragmentSampleIndex = MTLCounterDontSample; + desc.sampleBufferAttachments[0].endOfFragmentSampleIndex = query.idx+1; + + SubmitZoneBeginGpu(ctx, query.idx + 0, srcloc); + } + + /* TODO: implement this constructor interface for "command-level" profiling, if the device supports it + tracy_force_inline MetalZoneScope( MetalCtx* ctx, id cmdEncoder, const SourceLocationData* srcloc, bool is_active ) +#ifdef TRACY_ON_DEMAND + : m_active( is_active && GetProfiler().IsConnected() ) +#else + : m_active( is_active ) +#endif + { + if( !m_active ) return; + m_ctx = ctx; + m_cmdEncoder = cmdEncoder; + + auto& query = m_query = ctx->NextQueryId(); + + [m_cmdEncoder sampleCountersInBuffer:m_ctx->m_counterSampleBuffer atSampleIndex:query.idx withBarrier:YES]; + + SubmitZoneBeginGpu(ctx, query.idx, srcloc); + } + */ + + tracy_force_inline ~MetalZoneScope() + { + if( !m_active ) return; + + SubmitZoneEndGpu(m_ctx, m_query.idx + 1); + } + + TracyMetalDebugZoneScopeWireTap; + +private: + const bool m_active; + + MetalCtx* m_ctx; + + /* TODO: declare it for "command-level" profiling + id m_cmdEncoder; + */ + + static void SubmitZoneBeginGpu(MetalCtx* ctx, uint32_t queryId, const SourceLocationData* srcloc) + { + auto* item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::GpuZoneBeginSerial ); + MemWrite( &item->gpuZoneBegin.cpuTime, Profiler::GetTime() ); + MemWrite( &item->gpuZoneBegin.srcloc, (uint64_t)srcloc ); + MemWrite( &item->gpuZoneBegin.thread, GetThreadHandle() ); + MemWrite( &item->gpuZoneBegin.queryId, uint16_t( queryId ) ); + MemWrite( &item->gpuZoneBegin.context, ctx->GetContextId() ); + Profiler::QueueSerialFinish(); + + TracyMetalDebug(1<<2, TracyAllocN((void*)(uintptr_t)queryId, 1, "TracyMetalGpuZone")); + } + + static void SubmitZoneEndGpu(MetalCtx* ctx, uint32_t queryId) + { + auto* item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::GpuZoneEndSerial ); + MemWrite( &item->gpuZoneEnd.cpuTime, Profiler::GetTime() ); + MemWrite( &item->gpuZoneEnd.thread, GetThreadHandle() ); + MemWrite( &item->gpuZoneEnd.queryId, uint16_t( queryId ) ); + MemWrite( &item->gpuZoneEnd.context, ctx->GetContextId() ); + Profiler::QueueSerialFinish(); + + TracyMetalDebug(1<<2, TracyAllocN((void*)(uintptr_t)queryId, 1, "TracyMetalGpuZone")); + } + + MetalCtx::Query m_query = {}; +}; + +} + +using TracyMetalCtx = tracy::MetalCtx*; + +#define TracyMetalContext(device) tracy::MetalCtx::Create(device) +#define TracyMetalDestroy(ctx) tracy::MetalCtx::Destroy(ctx) +#define TracyMetalContextName(ctx, name, size) ctx->Name(name, size) + +#define TracyMetalZone( ctx, encoderDesc, name ) TracyMetalNamedZone( ctx, ___tracy_gpu_zone, encoderDesc, name, true ) +#define TracyMetalZoneC( ctx, encoderDesc, name, color ) TracyMetalNamedZoneC( ctx, ___tracy_gpu_zone, encoderDesc, name, color, true ) +#define TracyMetalNamedZone( ctx, varname, encoderDesc, name, active ) static constexpr tracy::SourceLocationData 
TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::MetalZoneScope varname( ctx, encoderDesc, &TracyConcat(__tracy_gpu_source_location,TracyLine), active ); +#define TracyMetalNamedZoneC( ctx, varname, encoderDesc, name, color, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::MetalZoneScope varname( ctx, encoderDesc, &TracyConcat(__tracy_gpu_source_location,TracyLine), active ); + +#define TracyMetalCollect( ctx ) ctx->Collect(); + +#endif + +#endif//__TRACYMETAL_HMM__
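
For reference, the following is a minimal integration sketch based on the macros defined in TracyMetal.hmm above. It is illustrative only: device, commandQueue, renderPassDesc, and drawable are assumed application-side objects, the include path depends on the build setup, TRACY_ENABLE must be defined, and the file using these macros must be compiled as Objective-C++ with ARC enabled.

    #import "tracy/TracyMetal.hmm"

    // One-time setup: create and (optionally) name the Tracy Metal context.
    TracyMetalCtx tracyCtx = TracyMetalContext(device);
    TracyMetalContextName(tracyCtx, "Main GPU", 8);

    // Per frame: place the zone before creating the encoder from its descriptor,
    // so the zone can attach its timestamp sample buffer to the descriptor.
    id<MTLCommandBuffer> commandBuffer = [commandQueue commandBuffer];
    {
        TracyMetalZone(tracyCtx, renderPassDesc, "Frame render");
        id<MTLRenderCommandEncoder> encoder =
            [commandBuffer renderCommandEncoderWithDescriptor:renderPassDesc];
        // ... encode draw calls ...
        [encoder endEncoding];
    }

    // Collect completed GPU timestamps; a command buffer completion handler is
    // one of the collection points recommended in the manual.
    [commandBuffer addCompletedHandler:^(id<MTLCommandBuffer> cb) {
        TracyMetalCollect(tracyCtx);
    }];
    [commandBuffer presentDrawable:drawable];
    [commandBuffer commit];

    // On shutdown, after the last collection:
    TracyMetalDestroy(tracyCtx);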