diff --git a/thirdparty/tracy/CMakeLists.txt b/thirdparty/tracy/CMakeLists.txt
index 3911836a3caf43e3ea314ad07451b23fda4516cd..3e9727abe3ec25d086331993fe115b65f07d69b3 100644
--- a/thirdparty/tracy/CMakeLists.txt
+++ b/thirdparty/tracy/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Tracy Profiler Client 0.9.1
+# Tracy Profiler Client 0.10.0
 # BSD 3-clause
 # Copyright (c) 2017-2023, Bartosz Taudul <wolf@nereid.pl>
 
diff --git a/thirdparty/tracy/include/tracy/TracyClient.cpp b/thirdparty/tracy/include/tracy/TracyClient.cpp
index 77f81a4a7cf39258e495545f994d804e83e173c0..26387b762ed88201eafd2045d4de0eff21de53aa 100644
--- a/thirdparty/tracy/include/tracy/TracyClient.cpp
+++ b/thirdparty/tracy/include/tracy/TracyClient.cpp
@@ -22,6 +22,7 @@
 #include "common/tracy_lz4.cpp"
 #include "client/TracyProfiler.cpp"
 #include "client/TracyCallstack.cpp"
+#include "client/TracySysPower.cpp"
 #include "client/TracySysTime.cpp"
 #include "client/TracySysTrace.cpp"
 #include "common/TracySocket.cpp"
diff --git a/thirdparty/tracy/include/tracy/client/TracyCallstack.cpp b/thirdparty/tracy/include/tracy/client/TracyCallstack.cpp
index a874446c2cf29db3a0bc223b395d07354af49584..0de7c9d2e9a6d8f2a6521fdfb6677c446ba099c1 100644
--- a/thirdparty/tracy/include/tracy/client/TracyCallstack.cpp
+++ b/thirdparty/tracy/include/tracy/client/TracyCallstack.cpp
@@ -686,7 +686,9 @@ void InitCallstackCritical()
 void InitCallstack()
 {
     cb_bts = backtrace_create_state( nullptr, 0, nullptr, nullptr );
+#ifndef TRACY_DEMANGLE
     ___tracy_init_demangle_buffer();
+#endif
 
 #ifdef __linux
     InitKernelSymbols();
@@ -761,7 +763,9 @@ debuginfod_client* GetDebuginfodClient()
 
 void EndCallstack()
 {
+#ifndef TRACY_DEMANGLE
     ___tracy_free_demangle_buffer();
+#endif
 #ifdef TRACY_DEBUGINFOD
     ClearDebugInfoVector( s_di_known );
     debuginfod_end( s_debuginfod );
diff --git a/thirdparty/tracy/include/tracy/client/TracyLock.hpp b/thirdparty/tracy/include/tracy/client/TracyLock.hpp
index 296a41ba1abf859ecf6d8f4e6603f69fb17e3df2..d12a3c16d6da3a784c3d1b31847507f75433bdfd 100644
--- a/thirdparty/tracy/include/tracy/client/TracyLock.hpp
+++ b/thirdparty/tracy/include/tracy/client/TracyLock.hpp
@@ -21,7 +21,7 @@ public:
         , m_active( false )
 #endif
     {
-        assert( m_id != std::numeric_limits<uint32_t>::max() );
+        assert( m_id != (std::numeric_limits<uint32_t>::max)() );
 
         auto item = Profiler::QueueSerial();
         MemWrite( &item->hdr.type, QueueType::LockAnnounce );
@@ -154,7 +154,7 @@ public:
 
     tracy_force_inline void CustomName( const char* name, size_t size )
     {
-        assert( size < std::numeric_limits<uint16_t>::max() );
+        assert( size < (std::numeric_limits<uint16_t>::max)() );
         auto ptr = (char*)tracy_malloc( size );
         memcpy( ptr, name, size );
         auto item = Profiler::QueueSerial();
@@ -235,7 +235,7 @@ public:
         , m_active( false )
 #endif
     {
-        assert( m_id != std::numeric_limits<uint32_t>::max() );
+        assert( m_id != (std::numeric_limits<uint32_t>::max)() );
 
         auto item = Profiler::QueueSerial();
         MemWrite( &item->hdr.type, QueueType::LockAnnounce );
@@ -450,7 +450,7 @@ public:
 
     tracy_force_inline void CustomName( const char* name, size_t size )
     {
-        assert( size < std::numeric_limits<uint16_t>::max() );
+        assert( size < (std::numeric_limits<uint16_t>::max)() );
         auto ptr = (char*)tracy_malloc( size );
         memcpy( ptr, name, size );
         auto item = Profiler::QueueSerial();
diff --git a/thirdparty/tracy/include/tracy/client/TracyProfiler.cpp b/thirdparty/tracy/include/tracy/client/TracyProfiler.cpp
index 6104a7edd68178c6e376f6aec28051ff581cc5fb..ed580123a7aeb3dd0dd7a5ce10fa8fc7d5074f69 100644
--- a/thirdparty/tracy/include/tracy/client/TracyProfiler.cpp
+++ b/thirdparty/tracy/include/tracy/client/TracyProfiler.cpp
@@ -83,7 +83,9 @@
 #endif
 
 #ifdef __APPLE__
-#  define TRACY_DELAYED_INIT
+#  ifndef TRACY_DELAYED_INIT
+#    define TRACY_DELAYED_INIT
+#  endif
 #else
 #  ifdef __GNUC__
 #    define init_order( val ) __attribute__ ((init_priority(val)))
@@ -1072,7 +1074,9 @@ static void CrashHandler( int signal, siginfo_t* info, void* /*ucontext*/ )
     }
     closedir( dp );
 
+#ifdef TRACY_HAS_CALLSTACK
     if( selfTid == s_symbolTid ) s_symbolThreadGone.store( true, std::memory_order_release );
+#endif
 
     TracyLfqPrepare( QueueType::Crash );
     TracyLfqCommit;
@@ -1353,6 +1357,7 @@ Profiler::Profiler()
     , m_queryImage( nullptr )
     , m_queryData( nullptr )
     , m_crashHandlerInstalled( false )
+    , m_programName( nullptr )
 {
     assert( !s_instance );
     s_instance = this;
@@ -1711,6 +1716,9 @@ void Profiler::Worker()
             if( m_sock ) break;
 #ifndef TRACY_ON_DEMAND
             ProcessSysTime();
+#  ifdef TRACY_HAS_SYSPOWER
+            m_sysPower.Tick();
+#  endif
 #endif
 
             if( m_broadcast )
@@ -1718,6 +1726,14 @@ void Profiler::Worker()
                 const auto t = std::chrono::high_resolution_clock::now().time_since_epoch().count();
                 if( t - lastBroadcast > 3000000000 )  // 3s
                 {
+                    m_programNameLock.lock();
+                    if( m_programName )
+                    {
+                        broadcastMsg = GetBroadcastMessage( m_programName, strlen( m_programName ), broadcastLen, dataPort );
+                        m_programName = nullptr;
+                    }
+                    m_programNameLock.unlock();
+
                     lastBroadcast = t;
                     const auto ts = std::chrono::duration_cast<std::chrono::seconds>( std::chrono::system_clock::now().time_since_epoch() ).count();
                     broadcastMsg.activeTime = int32_t( ts - m_epoch );
@@ -1828,6 +1844,9 @@ void Profiler::Worker()
         for(;;)
         {
             ProcessSysTime();
+#ifdef TRACY_HAS_SYSPOWER
+            m_sysPower.Tick();
+#endif
             const auto status = Dequeue( token );
             const auto serialStatus = DequeueSerial();
             if( status == DequeueStatus::ConnectionLost || serialStatus == DequeueStatus::ConnectionLost )
@@ -4149,6 +4168,7 @@ TRACY_API void ___tracy_emit_frame_image( const void* image, uint16_t w, uint16_
 TRACY_API void ___tracy_emit_plot( const char* name, double val ) { tracy::Profiler::PlotData( name, val ); }
 TRACY_API void ___tracy_emit_plot_float( const char* name, float val ) { tracy::Profiler::PlotData( name, val ); }
 TRACY_API void ___tracy_emit_plot_int( const char* name, int64_t val ) { tracy::Profiler::PlotData( name, val ); }
+TRACY_API void ___tracy_emit_plot_config( const char* name, int type, int step, int fill, uint32_t color ) { tracy::Profiler::ConfigurePlot( name, tracy::PlotFormatType(type), step, fill, color ); }
 TRACY_API void ___tracy_emit_message( const char* txt, size_t size, int callstack ) { tracy::Profiler::Message( txt, size, callstack ); }
 TRACY_API void ___tracy_emit_messageL( const char* txt, int callstack ) { tracy::Profiler::Message( txt, callstack ); }
 TRACY_API void ___tracy_emit_messageC( const char* txt, size_t size, uint32_t color, int callstack ) { tracy::Profiler::MessageColor( txt, size, color, callstack ); }
@@ -4167,7 +4187,7 @@ TRACY_API void ___tracy_emit_gpu_zone_begin( const struct ___tracy_gpu_zone_begi
 {
     TracyLfqPrepareC( tracy::QueueType::GpuZoneBegin );
     tracy::MemWrite( &item->gpuZoneBegin.cpuTime, tracy::Profiler::GetTime() );
-    tracy::MemWrite( &item->gpuNewContext.thread, tracy::GetThreadHandle() );
+    tracy::MemWrite( &item->gpuZoneBegin.thread, tracy::GetThreadHandle() );
     tracy::MemWrite( &item->gpuZoneBegin.srcloc, data.srcloc );
     tracy::MemWrite( &item->gpuZoneBegin.queryId, data.queryId );
     tracy::MemWrite( &item->gpuZoneBegin.context, data.context );
@@ -4190,7 +4210,7 @@ TRACY_API void ___tracy_emit_gpu_zone_begin_alloc( const struct ___tracy_gpu_zon
 {
     TracyLfqPrepareC( tracy::QueueType::GpuZoneBeginAllocSrcLoc  );
     tracy::MemWrite( &item->gpuZoneBegin.cpuTime, tracy::Profiler::GetTime() );
-    tracy::MemWrite( &item->gpuNewContext.thread, tracy::GetThreadHandle() );
+    tracy::MemWrite( &item->gpuZoneBegin.thread, tracy::GetThreadHandle() );
     tracy::MemWrite( &item->gpuZoneBegin.srcloc, data.srcloc );
     tracy::MemWrite( &item->gpuZoneBegin.queryId, data.queryId );
     tracy::MemWrite( &item->gpuZoneBegin.context, data.context );
@@ -4202,7 +4222,7 @@ TRACY_API void ___tracy_emit_gpu_zone_begin_alloc_callstack( const struct ___tra
     tracy::GetProfiler().SendCallstack( data.depth );
     TracyLfqPrepareC( tracy::QueueType::GpuZoneBeginAllocSrcLocCallstack  );
     tracy::MemWrite( &item->gpuZoneBegin.cpuTime, tracy::Profiler::GetTime() );
-    tracy::MemWrite( &item->gpuNewContext.thread, tracy::GetThreadHandle() );
+    tracy::MemWrite( &item->gpuZoneBegin.thread, tracy::GetThreadHandle() );
     tracy::MemWrite( &item->gpuZoneBegin.srcloc, data.srcloc );
     tracy::MemWrite( &item->gpuZoneBegin.queryId, data.queryId );
     tracy::MemWrite( &item->gpuZoneBegin.context, data.context );
@@ -4292,7 +4312,7 @@ TRACY_API void ___tracy_emit_gpu_zone_begin_alloc_serial( const struct ___tracy_
     auto item = tracy::Profiler::QueueSerial();
     tracy::MemWrite( &item->hdr.type, tracy::QueueType::GpuZoneBeginAllocSrcLocSerial );
     tracy::MemWrite( &item->gpuZoneBegin.cpuTime, tracy::Profiler::GetTime() );
-    tracy::MemWrite( &item->gpuNewContext.thread, tracy::GetThreadHandle() );
+    tracy::MemWrite( &item->gpuZoneBegin.thread, tracy::GetThreadHandle() );
     tracy::MemWrite( &item->gpuZoneBegin.srcloc, data.srcloc );
     tracy::MemWrite( &item->gpuZoneBegin.queryId, data.queryId );
     tracy::MemWrite( &item->gpuZoneBegin.context, data.context );
@@ -4304,7 +4324,7 @@ TRACY_API void ___tracy_emit_gpu_zone_begin_alloc_callstack_serial( const struct
     auto item = tracy::Profiler::QueueSerialCallstack( tracy::Callstack( data.depth ) );
     tracy::MemWrite( &item->hdr.type, tracy::QueueType::GpuZoneBeginAllocSrcLocCallstackSerial );
     tracy::MemWrite( &item->gpuZoneBegin.cpuTime, tracy::Profiler::GetTime() );
-    tracy::MemWrite( &item->gpuNewContext.thread, tracy::GetThreadHandle() );
+    tracy::MemWrite( &item->gpuZoneBegin.thread, tracy::GetThreadHandle() );
     tracy::MemWrite( &item->gpuZoneBegin.srcloc, data.srcloc );
     tracy::MemWrite( &item->gpuZoneBegin.queryId, data.queryId );
     tracy::MemWrite( &item->gpuZoneBegin.context, data.context );
diff --git a/thirdparty/tracy/include/tracy/client/TracyProfiler.hpp b/thirdparty/tracy/include/tracy/client/TracyProfiler.hpp
index 1ed66f66647260b3cc3c36f0d0a70dc081200ca6..e3b256dfa632237837f186f10a14d5db6660fdb2 100644
--- a/thirdparty/tracy/include/tracy/client/TracyProfiler.hpp
+++ b/thirdparty/tracy/include/tracy/client/TracyProfiler.hpp
@@ -10,6 +10,7 @@
 #include "tracy_concurrentqueue.h"
 #include "tracy_SPSCQueue.h"
 #include "TracyCallstack.hpp"
+#include "TracySysPower.hpp"
 #include "TracySysTime.hpp"
 #include "TracyFastVector.hpp"
 #include "../common/TracyQueue.hpp"
@@ -208,7 +209,22 @@ public:
         if( HardwareSupportsInvariantTSC() )
         {
             uint64_t rax, rdx;
+#ifdef TRACY_PATCHABLE_NOPSLEDS
+            // Some external tooling (such as rr) wants to patch our rdtsc and replace it by a
+            // branch to control the external input seen by a program. This kind of patching is
+            // not generally possible depending on the surrounding code and can lead to significant
+            // slowdowns if the compiler generated unlucky code and rr and tracy are used together.
+            // To avoid this, use the rr-safe `nopl 0(%rax, %rax, 1); rdtsc` instruction sequence,
+            // which rr promises will be patchable independent of the surrounding code.
+            asm volatile (
+                    // This is nopl 0(%rax, %rax, 1), but assemblers are inconsistent about whether
+                    // they emit that as a 4 or 5 byte sequence and we need to be guaranteed to use
+                    // the 5 byte one.
+                    ".byte 0x0f, 0x1f, 0x44, 0x00, 0x00\n\t"
+                    "rdtsc" : "=a" (rax), "=d" (rdx) );
+#else
             asm volatile ( "rdtsc" : "=a" (rax), "=d" (rdx) );
+#endif
             return (int64_t)(( rdx << 32 ) + rax);
         }
 #  else
@@ -288,7 +304,7 @@ public:
     {
 #ifndef TRACY_NO_FRAME_IMAGE
         auto& profiler = GetProfiler();
-        assert( profiler.m_frameCount.load( std::memory_order_relaxed ) < std::numeric_limits<uint32_t>::max() );
+        assert( profiler.m_frameCount.load( std::memory_order_relaxed ) < (std::numeric_limits<uint32_t>::max)() );
 #  ifdef TRACY_ON_DEMAND
         if( !profiler.IsConnected() ) return;
 #  endif
@@ -305,6 +321,12 @@ public:
         fi->flip = flip;
         profiler.m_fiQueue.commit_next();
         profiler.m_fiLock.unlock();
+#else
+        static_cast<void>(image); // unused
+        static_cast<void>(w); // unused
+        static_cast<void>(h); // unused
+        static_cast<void>(offset); // unused
+        static_cast<void>(flip); // unused
 #endif
     }
 
@@ -362,7 +384,7 @@ public:
 
     static tracy_force_inline void Message( const char* txt, size_t size, int callstack )
     {
-        assert( size < std::numeric_limits<uint16_t>::max() );
+        assert( size < (std::numeric_limits<uint16_t>::max)() );
 #ifdef TRACY_ON_DEMAND
         if( !GetProfiler().IsConnected() ) return;
 #endif
@@ -399,7 +421,7 @@ public:
 
     static tracy_force_inline void MessageColor( const char* txt, size_t size, uint32_t color, int callstack )
     {
-        assert( size < std::numeric_limits<uint16_t>::max() );
+        assert( size < (std::numeric_limits<uint16_t>::max)() );
 #ifdef TRACY_ON_DEMAND
         if( !GetProfiler().IsConnected() ) return;
 #endif
@@ -442,7 +464,7 @@ public:
 
     static tracy_force_inline void MessageAppInfo( const char* txt, size_t size )
     {
-        assert( size < std::numeric_limits<uint16_t>::max() );
+        assert( size < (std::numeric_limits<uint16_t>::max)() );
         auto ptr = (char*)tracy_malloc( size );
         memcpy( ptr, txt, size );
         TracyLfqPrepare( QueueType::MessageAppInfo );
@@ -676,6 +698,13 @@ public:
         return m_isConnected.load( std::memory_order_acquire );
     }
 
+    tracy_force_inline void SetProgramName( const char* name )    // Publishes a new program name for Profiler::Worker() to pick up
+    {
+        m_programNameLock.lock();    // Worker() reads and clears m_programName under this same lock
+        m_programName = name;    // pointer is stored, not copied -- presumably the caller must keep the string alive until Worker() consumes it; TODO confirm
+        m_programNameLock.unlock();
+    }
+
 #ifdef TRACY_ON_DEMAND
     tracy_force_inline uint64_t ConnectionId() const
     {
@@ -730,7 +759,7 @@ public:
     static tracy_force_inline uint64_t AllocSourceLocation( uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz )
     {
         const auto sz32 = uint32_t( 2 + 4 + 4 + functionSz + 1 + sourceSz + 1 + nameSz );
-        assert( sz32 <= std::numeric_limits<uint16_t>::max() );
+        assert( sz32 <= (std::numeric_limits<uint16_t>::max)() );
         const auto sz = uint16_t( sz32 );
         auto ptr = (char*)tracy_malloc( sz );
         memcpy( ptr, &sz, 2 );
@@ -941,6 +970,10 @@ private:
     void ProcessSysTime() {}
 #endif
 
+#ifdef TRACY_HAS_SYSPOWER
+    SysPower m_sysPower;
+#endif
+
     ParameterCallback m_paramCallback;
     void* m_paramCallbackData;
     SourceContentsCallback m_sourceCallback;
@@ -959,6 +992,9 @@ private:
     } m_prevSignal;
 #endif
     bool m_crashHandlerInstalled;
+
+    const char* m_programName;
+    TracyMutex m_programNameLock;
 };
 
 }
diff --git a/thirdparty/tracy/include/tracy/client/TracyScoped.hpp b/thirdparty/tracy/include/tracy/client/TracyScoped.hpp
index bc1307916ba71ba7f865a199952f4c63ed6d2b3c..d2274e40b0b284cd6eac111cd5a48b2c7f125253 100644
--- a/thirdparty/tracy/include/tracy/client/TracyScoped.hpp
+++ b/thirdparty/tracy/include/tracy/client/TracyScoped.hpp
@@ -108,7 +108,7 @@ public:
 
     tracy_force_inline void Text( const char* txt, size_t size )
     {
-        assert( size < std::numeric_limits<uint16_t>::max() );
+        assert( size < (std::numeric_limits<uint16_t>::max)() );
         if( !m_active ) return;
 #ifdef TRACY_ON_DEMAND
         if( GetProfiler().ConnectionId() != m_connectionId ) return;
@@ -123,7 +123,7 @@ public:
 
     tracy_force_inline void Name( const char* txt, size_t size )
     {
-        assert( size < std::numeric_limits<uint16_t>::max() );
+        assert( size < (std::numeric_limits<uint16_t>::max)() );
         if( !m_active ) return;
 #ifdef TRACY_ON_DEMAND
         if( GetProfiler().ConnectionId() != m_connectionId ) return;
diff --git a/thirdparty/tracy/include/tracy/client/TracySysPower.cpp b/thirdparty/tracy/include/tracy/client/TracySysPower.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..bd5939da2b4dda27cbff73511b756234e63a3fcd
--- /dev/null
+++ b/thirdparty/tracy/include/tracy/client/TracySysPower.cpp
@@ -0,0 +1,164 @@
+#include "TracySysPower.hpp"
+
+#ifdef TRACY_HAS_SYSPOWER
+
+#include <sys/types.h>
+#include <dirent.h>
+#include <chrono>
+#include <inttypes.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "TracyDebug.hpp"
+#include "TracyProfiler.hpp"
+#include "../common/TracyAlloc.hpp"
+
+namespace tracy
+{
+
+SysPower::SysPower()    // Discovers the available power domains once, at construction time
+    : m_domains( 4 )    // NOTE(review): presumably an initial capacity hint -- confirm against FastVector
+    , m_lastTime( 0 )
+{
+    ScanDirectory( "/sys/devices/virtual/powercap/intel-rapl", -1 );    // -1: the root of the scan has no parent domain
+}
+
+SysPower::~SysPower()    // Closes the energy_uj file of every registered domain
+{
+    for( auto& v : m_domains )
+    {
+        fclose( v.handle );    // opened with fopen() in ScanDirectory()
+        // Do not release v.name, as it may be still needed
+    }
+}
+
+void SysPower::Tick()    // Samples every domain's energy counter (rate-limited below) and queues the delta
+{
+    auto t = std::chrono::high_resolution_clock::now().time_since_epoch().count();
+    if( t - m_lastTime > 10000000 )    // 10 ms
+    {
+        m_lastTime = t;
+        for( auto& v : m_domains )
+        {
+            char tmp[32] = {};    // zero-filled and read short by one byte, so atoll() always sees a terminator
+            if( fread( tmp, 1, sizeof( tmp ) - 1, v.handle ) > 0 )
+            {
+                rewind( v.handle );    // the next Tick() re-reads the counter from the start of the file
+                auto p = (uint64_t)atoll( tmp );
+                uint64_t delta;
+                if( p >= v.value )
+                {
+                    delta = p - v.value;
+                }
+                else
+                {
+                    delta = v.overflow - v.value + p;    // reading went backwards: counter wrapped at v.overflow
+                }
+                v.value = p;
+
+                TracyLfqPrepare( QueueType::SysPowerReport );
+                MemWrite( &item->sysPower.time, Profiler::GetTime() );
+                MemWrite( &item->sysPower.delta, delta );
+                MemWrite( &item->sysPower.name, (uint64_t)v.name );    // sent as a pointer value, not as string data
+                TracyLfqCommit;
+            }
+        }
+    }
+}
+
+void SysPower::ScanDirectory( const char* path, int parent )    // Registers the domain described by `path`, then recurses into intel-rapl:* subdirectories
+{
+    DIR* dir = opendir( path );
+    if( !dir ) return;
+    struct dirent* ent;
+    uint64_t maxRange = 0;
+    char* name = nullptr;
+    FILE* handle = nullptr;
+    while( ( ent = readdir( dir ) ) )    // first pass: collect the three files that together describe a domain
+    {
+        if( ent->d_type == DT_REG )
+        {
+            if( strcmp( ent->d_name, "max_energy_range_uj" ) == 0 )
+            {
+                char tmp[PATH_MAX];
+                snprintf( tmp, PATH_MAX, "%s/max_energy_range_uj", path );
+                FILE* f = fopen( tmp, "r" );
+                if( f )
+                {
+                    if( fscanf( f, "%" PRIu64, &maxRange ) != 1 ) maxRange = 0;    // unparsable range: treat the domain as unusable
+                    fclose( f );
+                }
+            }
+            else if( strcmp( ent->d_name, "name" ) == 0 )
+            {
+                char tmp[PATH_MAX];
+                snprintf( tmp, PATH_MAX, "%s/name", path );
+                FILE* f = fopen( tmp, "r" );
+                if( f )
+                {
+                    char ntmp[128];
+                    if( fgets( ntmp, 128, f ) )
+                    {
+                        // Cut at the trailing newline, if present; a name without one is kept whole
+                        const auto sz = strcspn( ntmp, "\n" );
+                        if( parent < 0 )    // no enclosing domain: use the name as-is
+                        {
+                            name = (char*)tracy_malloc( sz + 1 );
+                            memcpy( name, ntmp, sz );
+                            name[sz] = '\0';
+                        }
+                        else    // nested domain: build "<parent name>:<name>"
+                        {
+                            const auto& p = m_domains[parent];
+                            const auto psz = strlen( p.name );
+                            name = (char*)tracy_malloc( psz + sz + 2 );
+                            memcpy( name, p.name, psz );
+                            name[psz] = ':';
+                            memcpy( name+psz+1, ntmp, sz );
+                            name[psz+sz+1] = '\0';
+                        }
+                    }
+                    fclose( f );
+                }
+            }
+            else if( strcmp( ent->d_name, "energy_uj" ) == 0 )
+            {
+                char tmp[PATH_MAX];
+                snprintf( tmp, PATH_MAX, "%s/energy_uj", path );
+                handle = fopen( tmp, "r" );    // kept open; Tick() reads it and ~SysPower() closes it
+            }
+        }
+        if( name && handle && maxRange > 0 ) break;
+    }
+    if( name && handle && maxRange > 0 )
+    {
+        parent = (int)m_domains.size();    // subdirectories scanned below are named relative to this new domain
+        Domain* domain = m_domains.push_next();
+        domain->value = 0;
+        domain->overflow = maxRange;
+        domain->handle = handle;
+        domain->name = name;
+        TracyDebug( "Power domain id %i, %s found at %s\n", parent, name, path );
+    }
+    else    // incomplete description: release whatever was acquired and register nothing
+    {
+        if( name ) tracy_free( name );
+        if( handle ) fclose( handle );
+    }
+
+    rewinddir( dir );    // second pass over the same directory: descend into sub-domains
+    while( ( ent = readdir( dir ) ) )
+    {
+        if( ent->d_type == DT_DIR && strncmp( ent->d_name, "intel-rapl:", 11 ) == 0 )
+        {
+            char tmp[PATH_MAX];
+            snprintf( tmp, PATH_MAX, "%s/%s", path, ent->d_name );
+            ScanDirectory( tmp, parent );
+        }
+    }
+    closedir( dir );
+}
+
+}
+
+#endif
diff --git a/thirdparty/tracy/include/tracy/client/TracySysPower.hpp b/thirdparty/tracy/include/tracy/client/TracySysPower.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..210123bce40682fce88d824b1e063622708e6a05
--- /dev/null
+++ b/thirdparty/tracy/include/tracy/client/TracySysPower.hpp
@@ -0,0 +1,44 @@
+#ifndef __TRACYSYSPOWER_HPP__
+#define __TRACYSYSPOWER_HPP__
+
+#if defined __linux__
+#  define TRACY_HAS_SYSPOWER
+#endif
+
+#ifdef TRACY_HAS_SYSPOWER
+
+#include <stdint.h>
+#include <stdio.h>
+
+#include "TracyFastVector.hpp"
+
+namespace tracy
+{
+
+class SysPower    // Reads the energy counters found under the intel-rapl powercap tree and reports deltas to the profiler queue
+{
+    struct Domain
+    {
+        uint64_t value;       // last counter reading parsed by Tick(); starts at 0
+        uint64_t overflow;    // contents of max_energy_range_uj, used by Tick() when the counter wraps
+        FILE* handle;         // open energy_uj file, closed in the destructor
+        const char* name;     // tracy_malloc'ed label; deliberately not released by the destructor
+    };
+
+public:
+    SysPower();     // scans /sys/devices/virtual/powercap/intel-rapl for domains
+    ~SysPower();
+
+    void Tick();    // called from Profiler::Worker(); samples all domains when enough time has passed
+
+private:
+    void ScanDirectory( const char* path, int parent );    // parent: index into m_domains, or -1 for none
+
+    FastVector<Domain> m_domains;
+    uint64_t m_lastTime;    // high_resolution_clock tick count recorded at the last sampling pass
+};
+
+}
+#endif
+
+#endif
diff --git a/thirdparty/tracy/include/tracy/client/TracySysTrace.cpp b/thirdparty/tracy/include/tracy/client/TracySysTrace.cpp
index 4a562eaae2e7935fa823cda84b1c690b22534a88..af0641fef1708fc000b4d312e932a46db49149c9 100644
--- a/thirdparty/tracy/include/tracy/client/TracySysTrace.cpp
+++ b/thirdparty/tracy/include/tracy/client/TracySysTrace.cpp
@@ -770,6 +770,13 @@ bool SysTraceStart( int64_t& samplingPeriod )
     TracyDebug( "sched_wakeup id: %i\n", wakeupId );
     TracyDebug( "drm_vblank_event id: %i\n", vsyncId );
 
+#ifdef TRACY_NO_SAMPLING
+    const bool noSoftwareSampling = true;
+#else
+    const char* noSoftwareSamplingEnv = GetEnvVar( "TRACY_NO_SAMPLING" );
+    const bool noSoftwareSampling = noSoftwareSamplingEnv && noSoftwareSamplingEnv[0] == '1';
+#endif
+
 #ifdef TRACY_NO_SAMPLE_RETIREMENT
     const bool noRetirement = true;
 #else
@@ -839,28 +846,31 @@ bool SysTraceStart( int64_t& samplingPeriod )
     pe.clockid = CLOCK_MONOTONIC_RAW;
 #endif
 
-    TracyDebug( "Setup software sampling\n" );
-    ProbePreciseIp( pe, currentPid );
-    for( int i=0; i<s_numCpus; i++ )
+    if( !noSoftwareSampling )
     {
-        int fd = perf_event_open( &pe, currentPid, i, -1, PERF_FLAG_FD_CLOEXEC );
-        if( fd == -1 )
+        TracyDebug( "Setup software sampling\n" );
+        ProbePreciseIp( pe, currentPid );
+        for( int i=0; i<s_numCpus; i++ )
         {
-            pe.exclude_kernel = 1;
-            ProbePreciseIp( pe, currentPid );
-            fd = perf_event_open( &pe, currentPid, i, -1, PERF_FLAG_FD_CLOEXEC );
+            int fd = perf_event_open( &pe, currentPid, i, -1, PERF_FLAG_FD_CLOEXEC );
             if( fd == -1 )
             {
-                TracyDebug( "  Failed to setup!\n");
-                break;
+                pe.exclude_kernel = 1;
+                ProbePreciseIp( pe, currentPid );
+                fd = perf_event_open( &pe, currentPid, i, -1, PERF_FLAG_FD_CLOEXEC );
+                if( fd == -1 )
+                {
+                    TracyDebug( "  Failed to setup!\n");
+                    break;
+                }
+                TracyDebug( "  No access to kernel samples\n" );
+            }
+            new( s_ring+s_numBuffers ) RingBuffer( 64*1024, fd, EventCallstack );
+            if( s_ring[s_numBuffers].IsValid() )
+            {
+                s_numBuffers++;
+                TracyDebug( "  Core %i ok\n", i );
             }
-            TracyDebug( "  No access to kernel samples\n" );
-        }
-        new( s_ring+s_numBuffers ) RingBuffer( 64*1024, fd, EventCallstack );
-        if( s_ring[s_numBuffers].IsValid() )
-        {
-            s_numBuffers++;
-            TracyDebug( "  Core %i ok\n", i );
         }
     }
 
diff --git a/thirdparty/tracy/include/tracy/client/tracy_rpmalloc.cpp b/thirdparty/tracy/include/tracy/client/tracy_rpmalloc.cpp
index 8efa626a9355029155ee9073ae1dbfeca1416af9..711505d21ac1cb3ddf6ea1408d633112c5a62ffd 100644
--- a/thirdparty/tracy/include/tracy/client/tracy_rpmalloc.cpp
+++ b/thirdparty/tracy/include/tracy/client/tracy_rpmalloc.cpp
@@ -147,7 +147,7 @@
 #  if defined(__APPLE__)
 #    include <TargetConditionals.h>
 #    if !TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR
-#    include <mach/mach_vm.h>
+#    include <mach/mach.h>
 #    include <mach/vm_statistics.h>
 #    endif
 #    include <pthread.h>
diff --git a/thirdparty/tracy/include/tracy/common/TracyProtocol.hpp b/thirdparty/tracy/include/tracy/common/TracyProtocol.hpp
index dd30e5391f5b55b6f024f8107ee61a35594bd598..5eb1639db3f10a5e6502aea3637a3a335d13985c 100644
--- a/thirdparty/tracy/include/tracy/common/TracyProtocol.hpp
+++ b/thirdparty/tracy/include/tracy/common/TracyProtocol.hpp
@@ -9,14 +9,14 @@ namespace tracy
 
 constexpr unsigned Lz4CompressBound( unsigned isize ) { return isize + ( isize / 255 ) + 16; }
 
-enum : uint32_t { ProtocolVersion = 63 };
+enum : uint32_t { ProtocolVersion = 64 };
 enum : uint16_t { BroadcastVersion = 3 };
 
 using lz4sz_t = uint32_t;
 
 enum { TargetFrameSize = 256 * 1024 };
 enum { LZ4Size = Lz4CompressBound( TargetFrameSize ) };
-static_assert( LZ4Size <= std::numeric_limits<lz4sz_t>::max(), "LZ4Size greater than lz4sz_t" );
+static_assert( LZ4Size <= (std::numeric_limits<lz4sz_t>::max)(), "LZ4Size greater than lz4sz_t" );
 static_assert( TargetFrameSize * 2 >= 64 * 1024, "Not enough space for LZ4 stream buffer" );
 
 enum { HandshakeShibbolethSize = 8 };
diff --git a/thirdparty/tracy/include/tracy/common/TracyQueue.hpp b/thirdparty/tracy/include/tracy/common/TracyQueue.hpp
index 8443193afbc0d40c3189a5439ce869f7b117df38..051d412abfbee4799dae034026101445721a2b97 100644
--- a/thirdparty/tracy/include/tracy/common/TracyQueue.hpp
+++ b/thirdparty/tracy/include/tracy/common/TracyQueue.hpp
@@ -90,6 +90,7 @@ enum class QueueType : uint8_t
     GpuNewContext,
     CallstackFrame,
     SysTimeReport,
+    SysPowerReport,
     TidToPid,
     HwSampleCpuCycle,
     HwSampleInstructionRetired,
@@ -563,6 +564,13 @@ struct QueueSysTime
     float sysTime;
 };
 
+struct QueueSysPower    // Payload of QueueType::SysPowerReport, filled in by SysPower::Tick()
+{
+    int64_t time;       // Profiler::GetTime() taken when the sample was queued
+    uint64_t delta;     // counter increase since the previous sample (raw energy_uj units, presumably microjoules)
+    uint64_t name;  // ptr to the domain's name string
+};
+
 struct QueueContextSwitch
 {
     int64_t time;
@@ -729,6 +737,7 @@ struct QueueItem
         QueueCrashReport crashReport;
         QueueCrashReportThread crashReportThread;
         QueueSysTime sysTime;
+        QueueSysPower sysPower;
         QueueContextSwitch contextSwitch;
         QueueThreadWakeup threadWakeup;
         QueueTidToPid tidToPid;
@@ -832,6 +841,7 @@ static constexpr size_t QueueDataSize[] = {
     sizeof( QueueHeader ) + sizeof( QueueGpuNewContext ),
     sizeof( QueueHeader ) + sizeof( QueueCallstackFrame ),
     sizeof( QueueHeader ) + sizeof( QueueSysTime ),
+    sizeof( QueueHeader ) + sizeof( QueueSysPower ),
     sizeof( QueueHeader ) + sizeof( QueueTidToPid ),
     sizeof( QueueHeader ) + sizeof( QueueHwSample ),        // cpu cycle
     sizeof( QueueHeader ) + sizeof( QueueHwSample ),        // instruction retired
diff --git a/thirdparty/tracy/include/tracy/common/TracySocket.cpp b/thirdparty/tracy/include/tracy/common/TracySocket.cpp
index 176bbc7aa1f173926226156b97687d30ec054d50..259678989e89d71104b152c3d72733d5116e4da6 100644
--- a/thirdparty/tracy/include/tracy/common/TracySocket.cpp
+++ b/thirdparty/tracy/include/tracy/common/TracySocket.cpp
@@ -353,7 +353,7 @@ int Socket::Recv( void* _buf, int len, int timeout )
     }
 }
 
-int Socket::ReadUpTo( void* _buf, int len, int timeout )
+int Socket::ReadUpTo( void* _buf, int len )
 {
     const auto sock = m_sock.load( std::memory_order_relaxed );
     auto buf = (char*)_buf;
@@ -678,10 +678,10 @@ bool UdpListen::Listen( uint16_t port )
 #endif
 #if defined _WIN32
     unsigned long reuse = 1;
-    setsockopt( m_sock, SOL_SOCKET, SO_REUSEADDR, (const char*)&reuse, sizeof( reuse ) );
+    setsockopt( sock, SOL_SOCKET, SO_REUSEADDR, (const char*)&reuse, sizeof( reuse ) );
 #else
     int reuse = 1;
-    setsockopt( m_sock, SOL_SOCKET, SO_REUSEADDR, &reuse, sizeof( reuse ) );
+    setsockopt( sock, SOL_SOCKET, SO_REUSEADDR, &reuse, sizeof( reuse ) );
 #endif
 #if defined _WIN32
     unsigned long broadcast = 1;
diff --git a/thirdparty/tracy/include/tracy/common/TracySocket.hpp b/thirdparty/tracy/include/tracy/common/TracySocket.hpp
index 4b3075e29d164deed141a3e44665dda422eb155f..f7713aac663797f5a837012b98705b1ebcf61885 100644
--- a/thirdparty/tracy/include/tracy/common/TracySocket.hpp
+++ b/thirdparty/tracy/include/tracy/common/TracySocket.hpp
@@ -29,7 +29,7 @@ public:
     int Send( const void* buf, int len );
     int GetSendBufSize();
 
-    int ReadUpTo( void* buf, int len, int timeout );
+    int ReadUpTo( void* buf, int len );
     bool Read( void* buf, int len, int timeout );
 
     template<typename ShouldExit>
diff --git a/thirdparty/tracy/include/tracy/common/TracySystem.cpp b/thirdparty/tracy/include/tracy/common/TracySystem.cpp
index 2a7d997e4ca17cb90ae39c5c3b3dd98f0f9b1608..9a477aa310c956f67d36a51775e2e6386fd0537e 100644
--- a/thirdparty/tracy/include/tracy/common/TracySystem.cpp
+++ b/thirdparty/tracy/include/tracy/common/TracySystem.cpp
@@ -213,21 +213,24 @@ TRACY_API const char* GetThreadName( uint32_t id )
 # else
    static auto _GetThreadDescription = (t_GetThreadDescription)GetProcAddress( GetModuleHandleA( "kernel32.dll" ), "GetThreadDescription" );
 # endif
-  if( _GetThreadDescription )
-  {
-      auto hnd = OpenThread( THREAD_QUERY_LIMITED_INFORMATION, FALSE, (DWORD)id );
-      if( hnd != 0 )
-      {
-          PWSTR tmp;
-          _GetThreadDescription( hnd, &tmp );
-          auto ret = wcstombs( buf, tmp, 256 );
-          CloseHandle( hnd );
-          if( ret != 0 )
-          {
-              return buf;
-          }
-      }
-  }
+    if( _GetThreadDescription )
+    {
+        auto hnd = OpenThread( THREAD_QUERY_LIMITED_INFORMATION, FALSE, (DWORD)id );
+        if( hnd != 0 )
+        {
+            PWSTR tmp;
+            // Release the thread handle on every path, not only when the query succeeds
+            const auto hr = _GetThreadDescription( hnd, &tmp );
+            CloseHandle( hnd );
+            if( SUCCEEDED( hr ) )
+            {
+                auto ret = wcstombs( buf, tmp, 256 );
+                LocalFree( tmp );    // the description buffer is owned by the caller once the call succeeds
+                // wcstombs() reports a conversion failure as (size_t)-1
+                if( ret != static_cast<size_t>( -1 ) ) return buf;
+            }
+        }
+    }
 #elif defined __linux__
   int cs, fd;
   char path[32];
diff --git a/thirdparty/tracy/include/tracy/common/TracyVersion.hpp b/thirdparty/tracy/include/tracy/common/TracyVersion.hpp
index c82edf93d224a7790935f4b1346a24bd65fb373f..2355279f7219cbb59276086cf03deddd56399de7 100644
--- a/thirdparty/tracy/include/tracy/common/TracyVersion.hpp
+++ b/thirdparty/tracy/include/tracy/common/TracyVersion.hpp
@@ -6,8 +6,8 @@ namespace tracy
 namespace Version
 {
 enum { Major = 0 };
-enum { Minor = 9 };
-enum { Patch = 1 };
+enum { Minor = 10 };
+enum { Patch = 0 };
 }
 }
 
diff --git a/thirdparty/tracy/include/tracy/libbacktrace/config.h b/thirdparty/tracy/include/tracy/libbacktrace/config.h
index aa3259d1198458b4d96719d02ae0cb56a0869d85..87e38a95b5881b4e627dd320160d2aa60636b6c7 100644
--- a/thirdparty/tracy/include/tracy/libbacktrace/config.h
+++ b/thirdparty/tracy/include/tracy/libbacktrace/config.h
@@ -1,4 +1,8 @@
 #include <limits.h>
+#if defined(__linux__) && !defined(__GLIBC__) && !defined(__WORDSIZE)
+// include __WORDSIZE headers for musl
+#  include <bits/reg.h>
+#endif
 #if __WORDSIZE == 64
 #  define BACKTRACE_ELF_SIZE 64
 #else
diff --git a/thirdparty/tracy/include/tracy/libbacktrace/dwarf.cpp b/thirdparty/tracy/include/tracy/libbacktrace/dwarf.cpp
index 246cb9f3696489233db67bdc2ac4f2bf920621aa..f3899cbce101b4de114729b48cac4459dad65f80 100644
--- a/thirdparty/tracy/include/tracy/libbacktrace/dwarf.cpp
+++ b/thirdparty/tracy/include/tracy/libbacktrace/dwarf.cpp
@@ -473,7 +473,7 @@ enum attr_val_encoding
   /* An address.  */
   ATTR_VAL_ADDRESS,
   /* An index into the .debug_addr section, whose value is relative to
-   * the DW_AT_addr_base attribute of the compilation unit.  */
+     the DW_AT_addr_base attribute of the compilation unit.  */
   ATTR_VAL_ADDRESS_INDEX,
   /* A unsigned integer.  */
   ATTR_VAL_UINT,
@@ -611,8 +611,8 @@ struct function
 struct function_addrs
 {
   /* Range is LOW <= PC < HIGH.  */
-  uint64_t low;
-  uint64_t high;
+  uintptr_t low;
+  uintptr_t high;
   /* Function for this address range.  */
   struct function *function;
 };
@@ -693,8 +693,8 @@ struct unit
 struct unit_addrs
 {
   /* Range is LOW <= PC < HIGH.  */
-  uint64_t low;
-  uint64_t high;
+  uintptr_t low;
+  uintptr_t high;
   /* Compilation unit for this address range.  */
   struct unit *u;
 };
@@ -1431,7 +1431,7 @@ resolve_addr_index (const struct dwarf_sections *dwarf_sections,
 		    uint64_t addr_base, int addrsize, int is_bigendian,
 		    uint64_t addr_index,
 		    backtrace_error_callback error_callback, void *data,
-		    uint64_t *address)
+		    uintptr_t *address)
 {
   uint64_t offset;
   struct dwarf_buf addr_buf;
@@ -1452,7 +1452,7 @@ resolve_addr_index (const struct dwarf_sections *dwarf_sections,
   addr_buf.data = data;
   addr_buf.reported_underflow = 0;
 
-  *address = read_address (&addr_buf, addrsize);
+  *address = (uintptr_t) read_address (&addr_buf, addrsize);
   return 1;
 }
 
@@ -1531,7 +1531,7 @@ function_addrs_search (const void *vkey, const void *ventry)
 
 static int
 add_unit_addr (struct backtrace_state *state, void *rdata,
-	       uint64_t lowpc, uint64_t highpc,
+	       uintptr_t lowpc, uintptr_t highpc,
 	       backtrace_error_callback error_callback, void *data,
 	       void *pvec)
 {
@@ -1867,10 +1867,10 @@ lookup_abbrev (struct abbrevs *abbrevs, uint64_t code,
    lowpc/highpc is set or ranges is set.  */
 
 struct pcrange {
-  uint64_t lowpc;		/* The low PC value.  */
+  uintptr_t lowpc;             /* The low PC value.  */
   int have_lowpc;		/* Whether a low PC value was found.  */
   int lowpc_is_addr_index;	/* Whether lowpc is in .debug_addr.  */
-  uint64_t highpc;		/* The high PC value.  */
+  uintptr_t highpc;            /* The high PC value.  */
   int have_highpc;		/* Whether a high PC value was found.  */
   int highpc_is_relative;	/* Whether highpc is relative to lowpc.  */
   int highpc_is_addr_index;	/* Whether highpc is in .debug_addr.  */
@@ -1890,12 +1890,12 @@ update_pcrange (const struct attr* attr, const struct attr_val* val,
     case DW_AT_low_pc:
       if (val->encoding == ATTR_VAL_ADDRESS)
 	{
-	  pcrange->lowpc = val->u.uint;
+	  pcrange->lowpc = (uintptr_t) val->u.uint;
 	  pcrange->have_lowpc = 1;
 	}
       else if (val->encoding == ATTR_VAL_ADDRESS_INDEX)
 	{
-	  pcrange->lowpc = val->u.uint;
+	  pcrange->lowpc = (uintptr_t) val->u.uint;
 	  pcrange->have_lowpc = 1;
 	  pcrange->lowpc_is_addr_index = 1;
 	}
@@ -1904,18 +1904,18 @@ update_pcrange (const struct attr* attr, const struct attr_val* val,
     case DW_AT_high_pc:
       if (val->encoding == ATTR_VAL_ADDRESS)
 	{
-	  pcrange->highpc = val->u.uint;
+	  pcrange->highpc = (uintptr_t) val->u.uint;
 	  pcrange->have_highpc = 1;
 	}
       else if (val->encoding == ATTR_VAL_UINT)
 	{
-	  pcrange->highpc = val->u.uint;
+	  pcrange->highpc = (uintptr_t) val->u.uint;
 	  pcrange->have_highpc = 1;
 	  pcrange->highpc_is_relative = 1;
 	}
       else if (val->encoding == ATTR_VAL_ADDRESS_INDEX)
 	{
-	  pcrange->highpc = val->u.uint;
+	  pcrange->highpc = (uintptr_t) val->u.uint;
 	  pcrange->have_highpc = 1;
 	  pcrange->highpc_is_addr_index = 1;
 	}
@@ -1950,16 +1950,16 @@ add_low_high_range (struct backtrace_state *state,
 		    uintptr_t base_address, int is_bigendian,
 		    struct unit *u, const struct pcrange *pcrange,
 		    int (*add_range) (struct backtrace_state *state,
-				      void *rdata, uint64_t lowpc,
-				      uint64_t highpc,
+				      void *rdata, uintptr_t lowpc,
+				      uintptr_t highpc,
 				      backtrace_error_callback error_callback,
 				      void *data, void *vec),
 		    void *rdata,
 		    backtrace_error_callback error_callback, void *data,
 		    void *vec)
 {
-  uint64_t lowpc;
-  uint64_t highpc;
+  uintptr_t lowpc;
+  uintptr_t highpc;
 
   lowpc = pcrange->lowpc;
   if (pcrange->lowpc_is_addr_index)
@@ -1997,10 +1997,10 @@ add_ranges_from_ranges (
     struct backtrace_state *state,
     const struct dwarf_sections *dwarf_sections,
     uintptr_t base_address, int is_bigendian,
-    struct unit *u, uint64_t base,
+    struct unit *u, uintptr_t base,
     const struct pcrange *pcrange,
     int (*add_range) (struct backtrace_state *state, void *rdata,
-		      uint64_t lowpc, uint64_t highpc,
+		      uintptr_t lowpc, uintptr_t highpc,
 		      backtrace_error_callback error_callback, void *data,
 		      void *vec),
     void *rdata,
@@ -2039,12 +2039,12 @@ add_ranges_from_ranges (
 	break;
 
       if (is_highest_address (low, u->addrsize))
-	base = high;
+	base = (uintptr_t) high;
       else
 	{
 	  if (!add_range (state, rdata, 
-			  low + base + base_address,
-			  high + base + base_address,
+			  (uintptr_t) low + base + base_address,
+			  (uintptr_t) high + base + base_address,
 			  error_callback, data, vec))
 	    return 0;
 	}
@@ -2064,10 +2064,10 @@ add_ranges_from_rnglists (
     struct backtrace_state *state,
     const struct dwarf_sections *dwarf_sections,
     uintptr_t base_address, int is_bigendian,
-    struct unit *u, uint64_t base,
+    struct unit *u, uintptr_t base,
     const struct pcrange *pcrange,
     int (*add_range) (struct backtrace_state *state, void *rdata,
-		      uint64_t lowpc, uint64_t highpc,
+		      uintptr_t lowpc, uintptr_t highpc,
 		      backtrace_error_callback error_callback, void *data,
 		      void *vec),
     void *rdata,
@@ -2133,8 +2133,8 @@ add_ranges_from_rnglists (
 	case DW_RLE_startx_endx:
 	  {
 	    uint64_t index;
-	    uint64_t low;
-	    uint64_t high;
+	    uintptr_t low;
+	    uintptr_t high;
 
 	    index = read_uleb128 (&rnglists_buf);
 	    if (!resolve_addr_index (dwarf_sections, u->addr_base,
@@ -2156,8 +2156,8 @@ add_ranges_from_rnglists (
 	case DW_RLE_startx_length:
 	  {
 	    uint64_t index;
-	    uint64_t low;
-	    uint64_t length;
+	    uintptr_t low;
+	    uintptr_t length;
 
 	    index = read_uleb128 (&rnglists_buf);
 	    if (!resolve_addr_index (dwarf_sections, u->addr_base,
@@ -2187,16 +2187,16 @@ add_ranges_from_rnglists (
 	  break;
 
 	case DW_RLE_base_address:
-	  base = read_address (&rnglists_buf, u->addrsize);
+	  base = (uintptr_t) read_address (&rnglists_buf, u->addrsize);
 	  break;
 
 	case DW_RLE_start_end:
 	  {
-	    uint64_t low;
-	    uint64_t high;
+	    uintptr_t low;
+	    uintptr_t high;
 
-	    low = read_address (&rnglists_buf, u->addrsize);
-	    high = read_address (&rnglists_buf, u->addrsize);
+	    low = (uintptr_t) read_address (&rnglists_buf, u->addrsize);
+	    high = (uintptr_t) read_address (&rnglists_buf, u->addrsize);
 	    if (!add_range (state, rdata, low + base_address,
 			    high + base_address, error_callback, data,
 			    vec))
@@ -2206,11 +2206,11 @@ add_ranges_from_rnglists (
 
 	case DW_RLE_start_length:
 	  {
-	    uint64_t low;
-	    uint64_t length;
+	    uintptr_t low;
+	    uintptr_t length;
 
-	    low = read_address (&rnglists_buf, u->addrsize);
-	    length = read_uleb128 (&rnglists_buf);
+	    low = (uintptr_t) read_address (&rnglists_buf, u->addrsize);
+	    length = (uintptr_t) read_uleb128 (&rnglists_buf);
 	    low += base_address;
 	    if (!add_range (state, rdata, low, low + length,
 			    error_callback, data, vec))
@@ -2240,9 +2240,9 @@ static int
 add_ranges (struct backtrace_state *state,
 	    const struct dwarf_sections *dwarf_sections,
 	    uintptr_t base_address, int is_bigendian,
-	    struct unit *u, uint64_t base, const struct pcrange *pcrange,
+	    struct unit *u, uintptr_t base, const struct pcrange *pcrange,
 	    int (*add_range) (struct backtrace_state *state, void *rdata, 
-			      uint64_t lowpc, uint64_t highpc,
+			      uintptr_t lowpc, uintptr_t highpc,
 			      backtrace_error_callback error_callback,
 			      void *data, void *vec),
 	    void *rdata,
@@ -3520,7 +3520,7 @@ read_referenced_name (struct dwarf_data *ddata, struct unit *u,
 
 static int
 add_function_range (struct backtrace_state *state, void *rdata,
-		    uint64_t lowpc, uint64_t highpc,
+		    uintptr_t lowpc, uintptr_t highpc,
 		    backtrace_error_callback error_callback, void *data,
 		    void *pvec)
 {
@@ -3560,7 +3560,7 @@ add_function_range (struct backtrace_state *state, void *rdata,
 
 static int
 read_function_entry (struct backtrace_state *state, struct dwarf_data *ddata,
-		     struct unit *u, uint64_t base, struct dwarf_buf *unit_buf,
+		     struct unit *u, uintptr_t base, struct dwarf_buf *unit_buf,
 		     const struct line_header *lhdr,
 		     backtrace_error_callback error_callback, void *data,
 		     struct function_vector *vec_function,
@@ -3624,7 +3624,7 @@ read_function_entry (struct backtrace_state *state, struct dwarf_data *ddata,
 	      && abbrev->attrs[i].name == DW_AT_low_pc)
 	    {
 	      if (val.encoding == ATTR_VAL_ADDRESS)
-		base = val.u.uint;
+		base = (uintptr_t) val.u.uint;
 	      else if (val.encoding == ATTR_VAL_ADDRESS_INDEX)
 		{
 		  if (!resolve_addr_index (&ddata->dwarf_sections,
diff --git a/thirdparty/tracy/include/tracy/libbacktrace/elf.cpp b/thirdparty/tracy/include/tracy/libbacktrace/elf.cpp
index e6d6c4aeeaff86c78eec0a54f08fe6447857d5dc..c65bc4e768af6f1290db00439a9750684a2837cb 100644
--- a/thirdparty/tracy/include/tracy/libbacktrace/elf.cpp
+++ b/thirdparty/tracy/include/tracy/libbacktrace/elf.cpp
@@ -2823,18 +2823,18 @@ elf_zstd_read_fse (const unsigned char **ppin, const unsigned char *pinend,
 	  while ((val & 0xfff) == 0xfff)
 	    {
 	      zidx += 3 * 6;
-	      if  (!elf_fetch_bits (&pin, pinend, &val, &bits))
-		return 0;
 	      val >>= 12;
 	      bits -= 12;
+	      if  (!elf_fetch_bits (&pin, pinend, &val, &bits))
+		return 0;
 	    }
 	  while ((val & 3) == 3)
 	    {
 	      zidx += 3;
-	      if (!elf_fetch_bits (&pin, pinend, &val, &bits))
-		return 0;
 	      val >>= 2;
 	      bits -= 2;
+	      if (!elf_fetch_bits (&pin, pinend, &val, &bits))
+		return 0;
 	    }
 	  /* We have at least 13 bits here, don't need to fetch.  */
 	  zidx += val & 3;
@@ -2964,7 +2964,7 @@ elf_zstd_build_fse (const int16_t *norm, int idx, uint16_t *next,
 	    pos = (pos + step) & mask;
 	}
     }
-  if (pos != 0)
+  if (unlikely (pos != 0))
     {
       elf_uncompress_failed ();
       return 0;
@@ -3440,17 +3440,17 @@ static const struct elf_zstd_fse_baseline_entry elf_zstd_match_table[64] =
 
 static const struct elf_zstd_fse_baseline_entry elf_zstd_offset_table[32] =
 {
-  { 1, 0, 5, 0 }, { 64, 6, 4, 0 }, { 512, 9, 5, 0 },
-  { 32768, 15, 5, 0 }, { 2097152, 21, 5, 0 }, { 8, 3, 5, 0 },
-  { 128, 7, 4, 0 }, { 4096, 12, 5, 0 }, { 262144, 18, 5, 0 },
-  { 8388608, 23, 5, 0 }, { 32, 5, 5, 0 }, { 256, 8, 4, 0 },
-  { 16384, 14, 5, 0 }, { 1048576, 20, 5, 0 }, { 4, 2, 5, 0 },
-  { 128, 7, 4, 16 }, { 2048, 11, 5, 0 }, { 131072, 17, 5, 0 },
-  { 4194304, 22, 5, 0 }, { 16, 4, 5, 0 }, { 256, 8, 4, 16 },
-  { 8192, 13, 5, 0 }, { 524288, 19, 5, 0 }, { 2, 1, 5, 0 },
-  { 64, 6, 4, 16 }, { 1024, 10, 5, 0 }, { 65536, 16, 5, 0 },
-  { 268435456, 28, 5, 0 }, { 134217728, 27, 5, 0 }, { 67108864, 26, 5, 0 },
-  { 33554432, 25, 5, 0 }, { 16777216, 24, 5, 0 },
+  { 1, 0, 5, 0 }, { 61, 6, 4, 0 }, { 509, 9, 5, 0 },
+  { 32765, 15, 5, 0 }, { 2097149, 21, 5, 0 }, { 5, 3, 5, 0 },
+  { 125, 7, 4, 0 }, { 4093, 12, 5, 0 }, { 262141, 18, 5, 0 },
+  { 8388605, 23, 5, 0 }, { 29, 5, 5, 0 }, { 253, 8, 4, 0 },
+  { 16381, 14, 5, 0 }, { 1048573, 20, 5, 0 }, { 1, 2, 5, 0 },
+  { 125, 7, 4, 16 }, { 2045, 11, 5, 0 }, { 131069, 17, 5, 0 },
+  { 4194301, 22, 5, 0 }, { 13, 4, 5, 0 }, { 253, 8, 4, 16 },
+  { 8189, 13, 5, 0 }, { 524285, 19, 5, 0 }, { 2, 1, 5, 0 },
+  { 61, 6, 4, 16 }, { 1021, 10, 5, 0 }, { 65533, 16, 5, 0 },
+  { 268435453, 28, 5, 0 }, { 134217725, 27, 5, 0 }, { 67108861, 26, 5, 0 },
+  { 33554429, 25, 5, 0 }, { 16777213, 24, 5, 0 },
 };
 
 /* Read a zstd Huffman table and build the decoding table in *TABLE, reading
@@ -3635,7 +3635,7 @@ elf_zstd_read_huff (const unsigned char **ppin, const unsigned char *pinend,
     }
 
   weight_mark = (uint32_t *) (weights + 256);
-  memset (weight_mark, 0, 12 * sizeof (uint32_t));
+  memset (weight_mark, 0, 13 * sizeof (uint32_t));
   weight_mask = 0;
   for (i = 0; i < count; ++i)
     {
@@ -3702,7 +3702,7 @@ elf_zstd_read_huff (const unsigned char **ppin, const unsigned char *pinend,
 
   /* Change WEIGHT_MARK from a count of weights to the index of the first
      symbol for that weight.  We shift the indexes to also store how many we
-     hae seen so far, below.  */
+     have seen so far, below.  */
   {
     uint32_t next;
 
@@ -3783,7 +3783,7 @@ elf_zstd_read_literals (const unsigned char **ppin,
     {
       int raw;
 
-      /* Raw_literals_Block or RLE_Literals_Block */
+      /* Raw_Literals_Block or RLE_Literals_Block */
 
       raw = (hdr & 3) == 0;
 
@@ -3965,7 +3965,7 @@ elf_zstd_read_literals (const unsigned char **ppin,
       unsigned int bits;
       uint32_t i;
 
-      pback = pin + compressed_size - 1;
+      pback = pin + total_streams_size - 1;
       pbackend = pin;
       if (!elf_fetch_backward_init (&pback, pbackend, &val, &bits))
 	return 0;
diff --git a/thirdparty/tracy/include/tracy/tracy/Tracy.hpp b/thirdparty/tracy/include/tracy/tracy/Tracy.hpp
index d42f4bf3b77801abd52369d7049c3aa430df9e8f..978eb5ef15c32765e2cf4d208ad9444be31a790f 100644
--- a/thirdparty/tracy/include/tracy/tracy/Tracy.hpp
+++ b/thirdparty/tracy/include/tracy/tracy/Tracy.hpp
@@ -109,6 +109,7 @@
 #define TracyParameterRegister(x,y)
 #define TracyParameterSetup(x,y,z,w)
 #define TracyIsConnected false
+#define TracySetProgramName(x)
 
 #define TracyFiberEnter(x)
 #define TracyFiberLeave
@@ -270,6 +271,7 @@
 #define TracyParameterRegister( cb, data ) tracy::Profiler::ParameterRegister( cb, data )
 #define TracyParameterSetup( idx, name, isBool, val ) tracy::Profiler::ParameterSetup( idx, name, isBool, val )
 #define TracyIsConnected tracy::GetProfiler().IsConnected()
+#define TracySetProgramName( name ) tracy::GetProfiler().SetProgramName( name );
 
 #ifdef TRACY_FIBERS
 #  define TracyFiberEnter( fiber ) tracy::Profiler::EnterFiber( fiber )
diff --git a/thirdparty/tracy/include/tracy/tracy/TracyC.h b/thirdparty/tracy/include/tracy/tracy/TracyC.h
index bedf5e162558c40fe926ee68efc88a46e397d91b..996889c40f815111ab559deec16a89c8d245cd1a 100644
--- a/thirdparty/tracy/include/tracy/tracy/TracyC.h
+++ b/thirdparty/tracy/include/tracy/tracy/TracyC.h
@@ -11,6 +11,14 @@
 extern "C" {
 #endif
 
+enum TracyPlotFormatEnum   /* plot format selectors; NOTE(review): presumably the `type` argument of TracyCPlotConfig - confirm */
+{
+    TracyPlotFormatNumber,       /* implicit value 0 */
+    TracyPlotFormatMemory,       /* implicit value 1 */
+    TracyPlotFormatPercentage,   /* implicit value 2 */
+    TracyPlotFormatWatt          /* implicit value 3 */
+};
+
 TRACY_API void ___tracy_set_thread_name( const char* name );
 
 #define TracyCSetThreadName( name ) ___tracy_set_thread_name( name );
@@ -60,6 +68,8 @@ typedef const void* TracyCZoneCtx;
 #define TracyCPlot(x,y)
 #define TracyCPlotF(x,y)
 #define TracyCPlotI(x,y)
+#define TracyCPlotConfig(x,y,z,w,a)
+
 #define TracyCMessage(x,y)
 #define TracyCMessageL(x)
 #define TracyCMessageC(x,y,z)
@@ -289,11 +299,13 @@ TRACY_API void ___tracy_emit_frame_image( const void* image, uint16_t w, uint16_
 TRACY_API void ___tracy_emit_plot( const char* name, double val );
 TRACY_API void ___tracy_emit_plot_float( const char* name, float val );
 TRACY_API void ___tracy_emit_plot_int( const char* name, int64_t val );
+TRACY_API void ___tracy_emit_plot_config( const char* name, int type, int step, int fill, uint32_t color );
 TRACY_API void ___tracy_emit_message_appinfo( const char* txt, size_t size );
 
 #define TracyCPlot( name, val ) ___tracy_emit_plot( name, val );
 #define TracyCPlotF( name, val ) ___tracy_emit_plot_float( name, val );
 #define TracyCPlotI( name, val ) ___tracy_emit_plot_int( name, val );
+#define TracyCPlotConfig( name, type, step, fill, color ) ___tracy_emit_plot_config( name, type, step, fill, color );
 #define TracyCAppInfo( txt, size ) ___tracy_emit_message_appinfo( txt, size );
 
 
diff --git a/thirdparty/tracy/include/tracy/tracy/TracyD3D11.hpp b/thirdparty/tracy/include/tracy/tracy/TracyD3D11.hpp
index 9f358c4a5b9e191318ed35fe575417936bbe9c9c..8aebdb2653262ba610ba2a8136f5b4b2b3863dca 100644
--- a/thirdparty/tracy/include/tracy/tracy/TracyD3D11.hpp
+++ b/thirdparty/tracy/include/tracy/tracy/TracyD3D11.hpp
@@ -13,13 +13,13 @@
 #define TracyD3D11ZoneC(ctx, name, color)
 #define TracyD3D11NamedZone(ctx, varname, name, active)
 #define TracyD3D11NamedZoneC(ctx, varname, name, color, active)
-#define TracyD3D12ZoneTransient(ctx, varname, name, active)
+#define TracyD3D11ZoneTransient(ctx, varname, name, active)
 
 #define TracyD3D11ZoneS(ctx, name, depth)
 #define TracyD3D11ZoneCS(ctx, name, color, depth)
 #define TracyD3D11NamedZoneS(ctx, varname, name, depth, active)
 #define TracyD3D11NamedZoneCS(ctx, varname, name, color, depth, active)
-#define TracyD3D12ZoneTransientS(ctx, varname, name, depth, active)
+#define TracyD3D11ZoneTransientS(ctx, varname, name, depth, active)
 
 #define TracyD3D11Collect(ctx)
 
@@ -39,11 +39,12 @@ using TracyD3D11Ctx = void*;
 #include "Tracy.hpp"
 #include "../client/TracyProfiler.hpp"
 #include "../client/TracyCallstack.hpp"
-#include "../common/TracyAlign.hpp"
-#include "../common/TracyAlloc.hpp"
+#include "../common/TracyYield.hpp"
 
 #include <d3d11.h>
 
+#define TracyD3D11Panic(msg, ...) do { assert(false && "TracyD3D11: " msg); TracyMessageLC("TracyD3D11: " msg, tracy::Color::Red4); __VA_ARGS__; } while(false);
+
 namespace tracy
 {
 
@@ -51,71 +52,83 @@ class D3D11Ctx
 {
     friend class D3D11ZoneScope;
 
-    enum { QueryCount = 64 * 1024 };
+    static constexpr uint32_t MaxQueries = 64 * 1024;
+
+    enum CollectMode { POLL, BLOCK };
 
 public:
     D3D11Ctx( ID3D11Device* device, ID3D11DeviceContext* devicectx )
-        : m_device( device )
-        , m_devicectx( devicectx )
-        , m_context( GetGpuCtxCounter().fetch_add( 1, std::memory_order_relaxed ) )
-        , m_head( 0 )
-        , m_tail( 0 )
     {
-        assert( m_context != 255 );
+        // TODO: consider calling ID3D11Device::GetImmediateContext() instead of passing it as an argument
+        m_device = device;
+        device->AddRef();
+        m_immediateDevCtx = devicectx;
+        devicectx->AddRef();
 
-        for (int i = 0; i < QueryCount; i++)
         {
-            HRESULT hr = S_OK;
-            D3D11_QUERY_DESC desc;
-            desc.MiscFlags = 0;
-
-            desc.Query = D3D11_QUERY_TIMESTAMP;
-            hr |= device->CreateQuery(&desc, &m_queries[i]);
-
+            D3D11_QUERY_DESC desc = { };
             desc.Query = D3D11_QUERY_TIMESTAMP_DISJOINT;
-            hr |= device->CreateQuery(&desc, &m_disjoints[i]);
-
-            m_disjointMap[i] = nullptr;
+            if (FAILED(m_device->CreateQuery(&desc, &m_disjointQuery)))
+            {
+                TracyD3D11Panic("unable to create disjoint timestamp query.", return);
+            }
+        }
 
-            assert(SUCCEEDED(hr));
+        for (ID3D11Query*& query : m_queries)
+        {
+            D3D11_QUERY_DESC desc = { };
+            desc.Query = D3D11_QUERY_TIMESTAMP;
+            if (FAILED(m_device->CreateQuery(&desc, &query)))
+            {
+                TracyD3D11Panic("unable to create timestamp query.", return);
+            }
         }
 
-        // Force query the initial GPU timestamp (pipeline stall)
-        D3D11_QUERY_DATA_TIMESTAMP_DISJOINT disjoint;
-        UINT64 timestamp;
+        // Calibrate CPU and GPU timestamps
+        int64_t tcpu = 0;
+        int64_t tgpu = 0;
         for (int attempts = 0; attempts < 50; attempts++)
         {
-            devicectx->Begin(m_disjoints[0]);
-            devicectx->End(m_queries[0]);
-            devicectx->End(m_disjoints[0]);
-            devicectx->Flush();
+            m_immediateDevCtx->Begin(m_disjointQuery);
+            m_immediateDevCtx->End(m_queries[0]);
+            m_immediateDevCtx->End(m_disjointQuery);
+
+            int64_t tcpu0 = Profiler::GetTime();
+            WaitForQuery(m_disjointQuery);
+            int64_t tcpu1 = Profiler::GetTime();
 
-            while (devicectx->GetData(m_disjoints[0], &disjoint, sizeof(disjoint), 0) == S_FALSE)
-                /* Nothing */;
+            D3D11_QUERY_DATA_TIMESTAMP_DISJOINT disjoint = { };
+            if (m_immediateDevCtx->GetData(m_disjointQuery, &disjoint, sizeof(disjoint), 0) != S_OK)
+            {
+                TracyMessageLC("TracyD3D11: unable to query GPU timestamp; retrying...", tracy::Color::Tomato);
+                continue;
+            }
 
             if (disjoint.Disjoint)
                 continue;
 
-            while (devicectx->GetData(m_queries[0], &timestamp, sizeof(timestamp), 0) == S_FALSE)
-                /* Nothing */;
+            UINT64 timestamp = 0;
+            if (m_immediateDevCtx->GetData(m_queries[0], &timestamp, sizeof(timestamp), 0) != S_OK)
+                continue;   // this should never happen, since the enclosing disjoint query succeeded
 
+            tcpu = tcpu0 + (tcpu1 - tcpu0) * 1 / 2;
+            tgpu = timestamp * (1000000000 / disjoint.Frequency);
             break;
         }
 
-        int64_t tgpu = timestamp * (1000000000ull / disjoint.Frequency);
-        int64_t tcpu = Profiler::GetTime();
-
-        uint8_t flags = 0;
+        // ready to roll
+        m_contextId = GetGpuCtxCounter().fetch_add(1);
+        m_immediateDevCtx->Begin(m_disjointQuery);
+        m_previousCheckpoint = m_nextCheckpoint = 0;
 
-        const float period = 1.f;
         auto* item = Profiler::QueueSerial();
         MemWrite( &item->hdr.type, QueueType::GpuNewContext );
         MemWrite( &item->gpuNewContext.cpuTime, tcpu );
         MemWrite( &item->gpuNewContext.gpuTime, tgpu );
-        memset(&item->gpuNewContext.thread, 0, sizeof(item->gpuNewContext.thread));
-        MemWrite( &item->gpuNewContext.period, period );
-        MemWrite( &item->gpuNewContext.context, m_context );
-        MemWrite( &item->gpuNewContext.flags, flags );
+        MemWrite( &item->gpuNewContext.thread, uint32_t(0) );   // #TODO: why not GetThreadHandle()?
+        MemWrite( &item->gpuNewContext.period, 1.0f );
+        MemWrite( &item->gpuNewContext.context, m_contextId);
+        MemWrite( &item->gpuNewContext.flags, uint8_t(0) );
         MemWrite( &item->gpuNewContext.type, GpuContextType::Direct3D11 );
 
 #ifdef TRACY_ON_DEMAND
@@ -127,12 +140,20 @@ public:
 
     ~D3D11Ctx()
     {
-        for (int i = 0; i < QueryCount; i++)
+        // collect all pending timestamps before destroying everything
+        do
         {
-            m_queries[i]->Release();
-            m_disjoints[i]->Release();
-            m_disjointMap[i] = nullptr;
+            Collect(BLOCK);
+        } while (m_previousCheckpoint != m_queryCounter);
+
+        for (ID3D11Query* query : m_queries)
+        {
+            query->Release();
         }
+        m_immediateDevCtx->End(m_disjointQuery);
+        m_disjointQuery->Release();
+        m_immediateDevCtx->Release();
+        m_device->Release();
     }
 
     void Name( const char* name, uint16_t len )
@@ -142,7 +163,7 @@ public:
 
         auto item = Profiler::QueueSerial();
         MemWrite( &item->hdr.type, QueueType::GpuContextName );
-        MemWrite( &item->gpuContextNameFat.context, m_context );
+        MemWrite( &item->gpuContextNameFat.context, m_contextId );
         MemWrite( &item->gpuContextNameFat.ptr, (uint64_t)ptr );
         MemWrite( &item->gpuContextNameFat.size, len );
 #ifdef TRACY_ON_DEMAND
@@ -151,217 +172,170 @@ public:
         Profiler::QueueSerialFinish();
     }
 
-    void Collect()
+    void Collect(CollectMode mode = POLL)
     {
         ZoneScopedC( Color::Red4 );
 
-        if( m_tail == m_head ) return;
-
 #ifdef TRACY_ON_DEMAND
         if( !GetProfiler().IsConnected() )
         {
-            m_head = m_tail = 0;
+            m_previousCheckpoint = m_nextCheckpoint = m_queryCounter;
             return;
         }
 #endif
 
-        auto start = m_tail;
-        auto end = m_head + QueryCount;
-        auto cnt = (end - start) % QueryCount;
-        while (cnt > 1)
+        if (m_previousCheckpoint == m_nextCheckpoint)
         {
-            auto mid = start + cnt / 2;
-
-            bool available =
-                m_devicectx->GetData(m_disjointMap[mid % QueryCount], nullptr, 0, D3D11_ASYNC_GETDATA_DONOTFLUSH) == S_OK &&
-                m_devicectx->GetData(m_queries[mid % QueryCount], nullptr, 0, D3D11_ASYNC_GETDATA_DONOTFLUSH) == S_OK;
-
-            if (available)
-            {
-                start = mid;
-            }
-            else
+            uintptr_t nextCheckpoint = m_queryCounter;
+            if (nextCheckpoint == m_nextCheckpoint)
             {
-                end = mid;
+                return;
             }
-            cnt = (end - start) % QueryCount;
+            m_nextCheckpoint = nextCheckpoint;
+            m_immediateDevCtx->End(m_disjointQuery);
         }
 
-        start %= QueryCount;
-
-        while (m_tail != start)
+        if (mode == CollectMode::BLOCK)
         {
-            D3D11_QUERY_DATA_TIMESTAMP_DISJOINT disjoint;
-            UINT64 time;
+            WaitForQuery(m_disjointQuery);
+        }
 
-            m_devicectx->GetData(m_disjointMap[m_tail], &disjoint, sizeof(disjoint), 0);
-            m_devicectx->GetData(m_queries[m_tail], &time, sizeof(time), 0);
+        D3D11_QUERY_DATA_TIMESTAMP_DISJOINT disjoint = { };
+        if (m_immediateDevCtx->GetData(m_disjointQuery, &disjoint, sizeof(disjoint), D3D11_ASYNC_GETDATA_DONOTFLUSH) != S_OK)
+        {
+            return;
+        }
 
-            time *= (1000000000ull / disjoint.Frequency);
+        if (disjoint.Disjoint == TRUE)
+        {
+            m_previousCheckpoint = m_nextCheckpoint;
+            TracyD3D11Panic("disjoint timestamps detected; dropping.");
+            return;
+        }
 
+        auto begin = m_previousCheckpoint;
+        auto end = m_nextCheckpoint;
+        for (auto i = begin; i != end; ++i)
+        {
+            uint32_t k = RingIndex(i);
+            UINT64 timestamp = 0;
+            if (m_immediateDevCtx->GetData(m_queries[k], &timestamp, sizeof(timestamp), 0) != S_OK)
+            {
+                TracyD3D11Panic("timestamp expected to be ready, but it was not!");
+                break;
+            }
+            timestamp *= (1000000000ull / disjoint.Frequency);
             auto* item = Profiler::QueueSerial();
             MemWrite(&item->hdr.type, QueueType::GpuTime);
-            MemWrite(&item->gpuTime.gpuTime, (int64_t)time);
-            MemWrite(&item->gpuTime.queryId, (uint16_t)m_tail);
-            MemWrite(&item->gpuTime.context, m_context);
+            MemWrite(&item->gpuTime.gpuTime, static_cast<int64_t>(timestamp));
+            MemWrite(&item->gpuTime.queryId, static_cast<uint16_t>(k));
+            MemWrite(&item->gpuTime.context, m_contextId);
             Profiler::QueueSerialFinish();
-
-            m_tail = (m_tail + 1) % QueryCount;
         }
+
+        // disjoint timestamp queries should only be invoked once per frame or less
+        // https://learn.microsoft.com/en-us/windows/win32/api/d3d11/ne-d3d11-d3d11_query
+        m_immediateDevCtx->Begin(m_disjointQuery);
+        m_previousCheckpoint = m_nextCheckpoint;
     }
 
 private:
-    tracy_force_inline unsigned int NextQueryId()
+    // Folds a running query counter into a slot index in [0, MaxQueries).
+    tracy_force_inline uint32_t RingIndex(uintptr_t index)
+    {
+        return static_cast<uint32_t>(index % MaxQueries);
+    }
+
+    // Distance from begin to end, narrowed to 32 bits. Both operands are unsigned,
+    // so the subtraction stays correct after the counters wrap around.
+    tracy_force_inline uint32_t RingCount(uintptr_t begin, uintptr_t end)
+    {
+        return static_cast<uint32_t>(end - begin);
+    }
+
+    tracy_force_inline uint32_t NextQueryId()
     {
-        const auto id = m_head;
-        m_head = ( m_head + 1 ) % QueryCount;
-        assert( m_head != m_tail );
-        return id;
+        auto id = m_queryCounter++;
+        if (RingCount(m_previousCheckpoint, id) >= MaxQueries)
+        {
+            TracyD3D11Panic("too many pending timestamp queries.");
+            // #TODO: return some sentinel value; ideally a "hidden" query index
+        }
+        return RingIndex(id);
     }
 
-    tracy_force_inline ID3D11Query* TranslateQueryId( unsigned int id )
+    tracy_force_inline ID3D11Query* GetQueryObjectFromId(uint32_t id)
     {
         return m_queries[id];
     }
 
-    tracy_force_inline ID3D11Query* MapDisjointQueryId( unsigned int id, unsigned int disjointId )
+    tracy_force_inline void WaitForQuery(ID3D11Query* query)
     {
-        m_disjointMap[id] = m_disjoints[disjointId];
-        return m_disjoints[disjointId];
+        m_immediateDevCtx->Flush();
+        while (m_immediateDevCtx->GetData(query, nullptr, 0, 0) != S_OK)
+            YieldThread();  // busy-wait :-( attempt to reduce power usage with _mm_pause() & friends...
     }
 
-    tracy_force_inline uint8_t GetId() const
+    tracy_force_inline uint8_t GetContextId() const
     {
-        return m_context;
+        return m_contextId;
     }
 
-    ID3D11Device* m_device;
-    ID3D11DeviceContext* m_devicectx;
+    ID3D11Device* m_device = nullptr;
+    ID3D11DeviceContext* m_immediateDevCtx = nullptr;
 
-    ID3D11Query* m_queries[QueryCount];
-    ID3D11Query* m_disjoints[QueryCount];
-    ID3D11Query* m_disjointMap[QueryCount]; // Multiple time queries can have one disjoint query
-    uint8_t m_context;
+    ID3D11Query* m_queries[MaxQueries];
+    ID3D11Query* m_disjointQuery = nullptr;
 
-    unsigned int m_head;
-    unsigned int m_tail;
+    uint8_t m_contextId = 255;  // NOTE: apparently, 255 means invalid id; is this documented anywhere?
+
+    uintptr_t m_queryCounter = 0;
+
+    uintptr_t m_previousCheckpoint = 0;
+    uintptr_t m_nextCheckpoint = 0;
 };
 
 class D3D11ZoneScope
 {
 public:
-    tracy_force_inline D3D11ZoneScope( D3D11Ctx* ctx, const SourceLocationData* srcloc, bool is_active )
-#ifdef TRACY_ON_DEMAND
-        : m_active( is_active && GetProfiler().IsConnected() )
-#else
-        : m_active( is_active )
-#endif
+    tracy_force_inline D3D11ZoneScope( D3D11Ctx* ctx, const SourceLocationData* srcloc, bool active )
+        : D3D11ZoneScope(ctx, active)
     {
         if( !m_active ) return;
-        m_ctx = ctx;
-
-        const auto queryId = ctx->NextQueryId();
-        ctx->m_devicectx->Begin(ctx->MapDisjointQueryId(queryId, queryId));
-        ctx->m_devicectx->End(ctx->TranslateQueryId(queryId));
-
-        m_disjointId = queryId;
 
         auto* item = Profiler::QueueSerial();
-        MemWrite( &item->hdr.type, QueueType::GpuZoneBeginSerial );
-        MemWrite( &item->gpuZoneBegin.cpuTime, Profiler::GetTime() );
-        MemWrite( &item->gpuZoneBegin.srcloc, (uint64_t)srcloc );
-        MemWrite( &item->gpuZoneBegin.thread, GetThreadHandle() );
-        MemWrite( &item->gpuZoneBegin.queryId, uint16_t( queryId ) );
-        MemWrite( &item->gpuZoneBegin.context, ctx->GetId() );
-
-        Profiler::QueueSerialFinish();
+        WriteQueueItem(item, QueueType::GpuZoneBeginSerial, reinterpret_cast<uint64_t>(srcloc));
     }
 
-    tracy_force_inline D3D11ZoneScope( D3D11Ctx* ctx, const SourceLocationData* srcloc, int depth, bool is_active )
-#ifdef TRACY_ON_DEMAND
-        : m_active( is_active && GetProfiler().IsConnected() )
-#else
-        : m_active( is_active )
-#endif
+    tracy_force_inline D3D11ZoneScope( D3D11Ctx* ctx, const SourceLocationData* srcloc, int depth, bool active )
+        : D3D11ZoneScope(ctx, active)
     {
         if( !m_active ) return;
-        m_ctx = ctx;
-
-        const auto queryId = ctx->NextQueryId();
-        ctx->m_devicectx->Begin(ctx->MapDisjointQueryId(queryId, queryId));
-        ctx->m_devicectx->End(ctx->TranslateQueryId(queryId));
-
-        m_disjointId = queryId;
-
-        auto* item = Profiler::QueueSerial();
-        MemWrite( &item->hdr.type, QueueType::GpuZoneBeginCallstackSerial );
-        MemWrite( &item->gpuZoneBegin.cpuTime, Profiler::GetTime() );
-        MemWrite( &item->gpuZoneBegin.srcloc, (uint64_t)srcloc );
-        MemWrite( &item->gpuZoneBegin.thread, GetThreadHandle() );
-        MemWrite( &item->gpuZoneBegin.queryId, uint16_t( queryId ) );
-        MemWrite( &item->gpuZoneBegin.context, ctx->GetId() );
-
-        Profiler::QueueSerialFinish();
 
-        GetProfiler().SendCallstack( depth );
+        auto* item = Profiler::QueueSerialCallstack(Callstack(depth));
+        WriteQueueItem(item, QueueType::GpuZoneBeginCallstackSerial, reinterpret_cast<uint64_t>(srcloc));
     }
 
     tracy_force_inline D3D11ZoneScope(D3D11Ctx* ctx, uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, bool active)
-#ifdef TRACY_ON_DEMAND
-        : m_active(active&& GetProfiler().IsConnected())
-#else
-        : m_active(active)
-#endif
+        : D3D11ZoneScope(ctx, active)
     {
         if( !m_active ) return;
-        m_ctx = ctx;
-
-        const auto queryId = ctx->NextQueryId();
-        ctx->m_devicectx->Begin(ctx->MapDisjointQueryId(queryId, queryId));
-        ctx->m_devicectx->End(ctx->TranslateQueryId(queryId));
-
-        m_disjointId = queryId;
 
         const auto sourceLocation = Profiler::AllocSourceLocation(line, source, sourceSz, function, functionSz, name, nameSz);
 
         auto* item = Profiler::QueueSerial();
-        MemWrite(&item->hdr.type, QueueType::GpuZoneBeginAllocSrcLocSerial);
-        MemWrite(&item->gpuZoneBegin.cpuTime, Profiler::GetTime());
-        MemWrite(&item->gpuZoneBegin.srcloc, sourceLocation);
-        MemWrite(&item->gpuZoneBegin.thread, GetThreadHandle());
-        MemWrite(&item->gpuZoneBegin.queryId, static_cast<uint16_t>(queryId));
-        MemWrite(&item->gpuZoneBegin.context, ctx->GetId());
-
-        Profiler::QueueSerialFinish();
+        WriteQueueItem(item, QueueType::GpuZoneBeginAllocSrcLocSerial, sourceLocation);
     }
 
     tracy_force_inline D3D11ZoneScope(D3D11Ctx* ctx, uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, int depth, bool active)
-#ifdef TRACY_ON_DEMAND
-        : m_active(active&& GetProfiler().IsConnected())
-#else
-        : m_active(active)
-#endif
+        : D3D11ZoneScope(ctx, active)
     {
         if( !m_active ) return;
-        m_ctx = ctx;
-
-        const auto queryId = ctx->NextQueryId();
-        ctx->m_devicectx->Begin(ctx->MapDisjointQueryId(queryId, queryId));
-        ctx->m_devicectx->End(ctx->TranslateQueryId(queryId));
-
-        m_disjointId = queryId;
 
         const auto sourceLocation = Profiler::AllocSourceLocation(line, source, sourceSz, function, functionSz, name, nameSz);
 
         auto* item = Profiler::QueueSerialCallstack(Callstack(depth));
-        MemWrite(&item->hdr.type, QueueType::GpuZoneBeginAllocSrcLocCallstackSerial);
-        MemWrite(&item->gpuZoneBegin.cpuTime, Profiler::GetTime());
-        MemWrite(&item->gpuZoneBegin.srcloc, sourceLocation);
-        MemWrite(&item->gpuZoneBegin.thread, GetThreadHandle());
-        MemWrite(&item->gpuZoneBegin.queryId, static_cast<uint16_t>(queryId));
-        MemWrite(&item->gpuZoneBegin.context, ctx->GetId());
-
-        Profiler::QueueSerialFinish();
+        WriteQueueItem(item, QueueType::GpuZoneBeginAllocSrcLocCallstackSerial, sourceLocation);
     }
 
     tracy_force_inline ~D3D11ZoneScope()
@@ -369,24 +343,46 @@ public:
         if( !m_active ) return;
 
         const auto queryId = m_ctx->NextQueryId();
-        m_ctx->m_devicectx->End(m_ctx->TranslateQueryId(queryId));
-        m_ctx->m_devicectx->End(m_ctx->MapDisjointQueryId(queryId, m_disjointId));
+        m_ctx->m_immediateDevCtx->End(m_ctx->GetQueryObjectFromId(queryId));
 
         auto* item = Profiler::QueueSerial();
         MemWrite( &item->hdr.type, QueueType::GpuZoneEndSerial );
         MemWrite( &item->gpuZoneEnd.cpuTime, Profiler::GetTime() );
         MemWrite( &item->gpuZoneEnd.thread, GetThreadHandle() );
         MemWrite( &item->gpuZoneEnd.queryId, uint16_t( queryId ) );
-        MemWrite( &item->gpuZoneEnd.context, m_ctx->GetId() );
-
+        MemWrite( &item->gpuZoneEnd.context, m_ctx->GetContextId() );
         Profiler::QueueSerialFinish();
     }
 
 private:
+    tracy_force_inline D3D11ZoneScope( D3D11Ctx* ctx, bool active )  // delegation target shared by the public constructors
+#ifdef TRACY_ON_DEMAND
+        : m_active( active && GetProfiler().IsConnected() )
+#else
+        : m_active( active )
+#endif
+    {
+        if( !m_active ) return;  // m_ctx stays unset for inactive zones; every user checks m_active first
+        m_ctx = ctx;
+    }
+
+    void WriteQueueItem(tracy::QueueItem* item, tracy::QueueType queueItemType, uint64_t sourceLocation)
+    {
+        const auto queryId = m_ctx->NextQueryId();
+        m_ctx->m_immediateDevCtx->End(m_ctx->GetQueryObjectFromId(queryId));
+
+        MemWrite( &item->hdr.type, queueItemType);
+        MemWrite( &item->gpuZoneBegin.cpuTime, Profiler::GetTime() );
+        MemWrite( &item->gpuZoneBegin.srcloc, sourceLocation );
+        MemWrite( &item->gpuZoneBegin.thread, GetThreadHandle() );
+        MemWrite( &item->gpuZoneBegin.queryId, uint16_t( queryId ) );
+        MemWrite( &item->gpuZoneBegin.context, m_ctx->GetContextId() );
+        Profiler::QueueSerialFinish();
+    }
+
     const bool m_active;
 
     D3D11Ctx* m_ctx;
-    unsigned int m_disjointId;
 };
 
 static inline D3D11Ctx* CreateD3D11Context( ID3D11Device* device, ID3D11DeviceContext* devicectx )
@@ -403,38 +399,44 @@ static inline void DestroyD3D11Context( D3D11Ctx* ctx )
 }
 }
 
+#undef TracyD3D11Panic
+
 using TracyD3D11Ctx = tracy::D3D11Ctx*;
 
 #define TracyD3D11Context( device, devicectx ) tracy::CreateD3D11Context( device, devicectx );
 #define TracyD3D11Destroy(ctx) tracy::DestroyD3D11Context(ctx);
 #define TracyD3D11ContextName(ctx, name, size) ctx->Name(name, size);
 
+#define TracyD3D11UnnamedZone ___tracy_gpu_d3d11_zone
+#define TracyD3D11SrcLocSymbol TracyConcat(__tracy_gpu_d3d11_source_location,TracyLine)
+#define TracyD3D11SrcLocObject(name, color) static constexpr tracy::SourceLocationData TracyD3D11SrcLocSymbol { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color };
+
 #if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK
-#  define TracyD3D11Zone( ctx, name ) TracyD3D11NamedZoneS( ctx, ___tracy_gpu_zone, name, TRACY_CALLSTACK, true )
-#  define TracyD3D11ZoneC( ctx, name, color ) TracyD3D11NamedZoneCS( ctx, ___tracy_gpu_zone, name, color, TRACY_CALLSTACK, true )
-#  define TracyD3D11NamedZone( ctx, varname, name, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction,  TracyFile, (uint32_t)TracyLine, 0 }; tracy::D3D11ZoneScope varname( ctx, &TracyConcat(__tracy_gpu_source_location,TracyLine), TRACY_CALLSTACK, active );
-#  define TracyD3D11NamedZoneC( ctx, varname, name, color, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction,  TracyFile, (uint32_t)TracyLine, color }; tracy::D3D11ZoneScope varname( ctx, &TracyConcat(__tracy_gpu_source_location,TracyLine), TRACY_CALLSTACK, active );
+#  define TracyD3D11Zone( ctx, name ) TracyD3D11NamedZoneS( ctx, TracyD3D11UnnamedZone, name, TRACY_CALLSTACK, true )
+#  define TracyD3D11ZoneC( ctx, name, color ) TracyD3D11NamedZoneCS( ctx, TracyD3D11UnnamedZone, name, color, TRACY_CALLSTACK, true )
+#  define TracyD3D11NamedZone( ctx, varname, name, active ) TracyD3D11SrcLocObject(name, 0); tracy::D3D11ZoneScope varname( ctx, &TracyD3D11SrcLocSymbol, TRACY_CALLSTACK, active );
+#  define TracyD3D11NamedZoneC( ctx, varname, name, color, active ) TracyD3D11SrcLocObject(name, color); tracy::D3D11ZoneScope varname( ctx, &TracyD3D11SrcLocSymbol, TRACY_CALLSTACK, active );
 #  define TracyD3D11ZoneTransient(ctx, varname, name, active) TracyD3D11ZoneTransientS(ctx, varname, cmdList, name, TRACY_CALLSTACK, active)
 #else
-#  define TracyD3D11Zone( ctx, name ) TracyD3D11NamedZone( ctx, ___tracy_gpu_zone, name, true )
-#  define TracyD3D11ZoneC( ctx, name, color ) TracyD3D11NamedZoneC( ctx, ___tracy_gpu_zone, name, color, true )
-#  define TracyD3D11NamedZone( ctx, varname, name, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction,  TracyFile, (uint32_t)TracyLine, 0 }; tracy::D3D11ZoneScope varname( ctx, &TracyConcat(__tracy_gpu_source_location,TracyLine), active );
-#  define TracyD3D11NamedZoneC( ctx, varname, name, color, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction,  TracyFile, (uint32_t)TracyLine, color }; tracy::D3D11ZoneScope varname( ctx, &TracyConcat(__tracy_gpu_source_location,TracyLine), active );
+#  define TracyD3D11Zone( ctx, name ) TracyD3D11NamedZone( ctx, TracyD3D11UnnamedZone, name, true )
+#  define TracyD3D11ZoneC( ctx, name, color ) TracyD3D11NamedZoneC( ctx, TracyD3D11UnnamedZone, name, color, true )
+#  define TracyD3D11NamedZone( ctx, varname, name, active ) TracyD3D11SrcLocObject(name, 0); tracy::D3D11ZoneScope varname( ctx, &TracyD3D11SrcLocSymbol, active );
+#  define TracyD3D11NamedZoneC( ctx, varname, name, color, active ) TracyD3D11SrcLocObject(name, color); tracy::D3D11ZoneScope varname( ctx, &TracyD3D11SrcLocSymbol, active );
 #  define TracyD3D11ZoneTransient(ctx, varname, name, active) tracy::D3D11ZoneScope varname{ ctx, TracyLine, TracyFile, strlen(TracyFile), TracyFunction, strlen(TracyFunction), name, strlen(name), active };
 #endif
 
 #ifdef TRACY_HAS_CALLSTACK
-#  define TracyD3D11ZoneS( ctx, name, depth ) TracyD3D11NamedZoneS( ctx, ___tracy_gpu_zone, name, depth, true )
-#  define TracyD3D11ZoneCS( ctx, name, color, depth ) TracyD3D11NamedZoneCS( ctx, ___tracy_gpu_zone, name, color, depth, true )
-#  define TracyD3D11NamedZoneS( ctx, varname, name, depth, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction,  TracyFile, (uint32_t)TracyLine, 0 }; tracy::D3D11ZoneScope varname( ctx, &TracyConcat(__tracy_gpu_source_location,TracyLine), depth, active );
-#  define TracyD3D11NamedZoneCS( ctx, varname, name, color, depth, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction,  TracyFile, (uint32_t)TracyLine, color }; tracy::D3D11ZoneScope varname( ctx, &TracyConcat(__tracy_gpu_source_location,TracyLine), depth, active );
+#  define TracyD3D11ZoneS( ctx, name, depth ) TracyD3D11NamedZoneS( ctx, TracyD3D11UnnamedZone, name, depth, true )
+#  define TracyD3D11ZoneCS( ctx, name, color, depth ) TracyD3D11NamedZoneCS( ctx, TracyD3D11UnnamedZone, name, color, depth, true )
+#  define TracyD3D11NamedZoneS( ctx, varname, name, depth, active ) TracyD3D11SrcLocObject(name, 0); tracy::D3D11ZoneScope varname( ctx, &TracyD3D11SrcLocSymbol, depth, active );
+#  define TracyD3D11NamedZoneCS( ctx, varname, name, color, depth, active ) TracyD3D11SrcLocObject(name, color); tracy::D3D11ZoneScope varname( ctx, &TracyD3D11SrcLocSymbol, depth, active );
 #  define TracyD3D11ZoneTransientS(ctx, varname, name, depth, active) tracy::D3D11ZoneScope varname{ ctx, TracyLine, TracyFile, strlen(TracyFile), TracyFunction, strlen(TracyFunction), name, strlen(name), depth, active };
 #else
 #  define TracyD3D11ZoneS( ctx, name, depth, active ) TracyD3D11Zone( ctx, name )
 #  define TracyD3D11ZoneCS( ctx, name, color, depth, active ) TracyD3D11ZoneC( name, color )
 #  define TracyD3D11NamedZoneS( ctx, varname, name, depth, active ) TracyD3D11NamedZone( ctx, varname, name, active )
 #  define TracyD3D11NamedZoneCS( ctx, varname, name, color, depth, active ) TracyD3D11NamedZoneC( ctx, varname, name, color, active )
-#  define TracyD3D11ZoneTransientS(ctx, varname, name, depth, active) TracyD3D12ZoneTransient(ctx, varname, name, active)
+#  define TracyD3D11ZoneTransientS(ctx, varname, name, depth, active) TracyD3D11ZoneTransient(ctx, varname, name, active)
 #endif
 
 #define TracyD3D11Collect( ctx ) ctx->Collect();
diff --git a/thirdparty/tracy/include/tracy/tracy/TracyD3D12.hpp b/thirdparty/tracy/include/tracy/tracy/TracyD3D12.hpp
index d7944cb8e5bfcac8d3fca7184e6287e53695ceed..41567937e833e8e7ea7583fed3bcbc91d97dff61 100644
--- a/thirdparty/tracy/include/tracy/tracy/TracyD3D12.hpp
+++ b/thirdparty/tracy/include/tracy/tracy/TracyD3D12.hpp
@@ -25,7 +25,7 @@
 
 namespace tracy
 {
-	class D3D12ZoneScope {};
+    class D3D12ZoneScope {};
 }
 
 using TracyD3D12Ctx = void*;
@@ -40,429 +40,419 @@ using TracyD3D12Ctx = void*;
 #include <cassert>
 #include <d3d12.h>
 #include <dxgi.h>
-#include <wrl/client.h>
 #include <queue>
 
+#define TracyD3D12Panic(msg, ...) do { assert(false && "TracyD3D12: " msg); TracyMessageLC("TracyD3D12: " msg, tracy::Color::Red4); __VA_ARGS__; } while(false);
+
 namespace tracy
 {
 
-	struct D3D12QueryPayload
-	{
-		uint32_t m_queryIdStart = 0;
-		uint32_t m_queryCount = 0;
-	};
-
-	// Command queue context.
-	class D3D12QueueCtx
-	{
-		friend class D3D12ZoneScope;
-
-		static constexpr uint32_t MaxQueries = 64 * 1024;  // Queries are begin and end markers, so we can store half as many total time durations. Must be even!
-
-		bool m_initialized = false;
-
-		ID3D12Device* m_device = nullptr;
-		ID3D12CommandQueue* m_queue = nullptr;
-		uint8_t m_context;
-		Microsoft::WRL::ComPtr<ID3D12QueryHeap> m_queryHeap;
-		Microsoft::WRL::ComPtr<ID3D12Resource> m_readbackBuffer;
-
-		// In-progress payload.
-		uint32_t m_queryLimit = MaxQueries;
-		std::atomic<uint32_t> m_queryCounter = 0;
-		uint32_t m_previousQueryCounter = 0;
-
-		uint32_t m_activePayload = 0;
-		Microsoft::WRL::ComPtr<ID3D12Fence> m_payloadFence;
-		std::queue<D3D12QueryPayload> m_payloadQueue;
-
-		int64_t m_prevCalibration = 0;
-		int64_t m_qpcToNs = int64_t{ 1000000000 / GetFrequencyQpc() };
-
-	public:
-		D3D12QueueCtx(ID3D12Device* device, ID3D12CommandQueue* queue)
-			: m_device(device)
-			, m_queue(queue)
-			, m_context(GetGpuCtxCounter().fetch_add(1, std::memory_order_relaxed))
-		{
-			// Verify we support timestamp queries on this queue.
-
-			if (queue->GetDesc().Type == D3D12_COMMAND_LIST_TYPE_COPY)
-			{
-				D3D12_FEATURE_DATA_D3D12_OPTIONS3 featureData{};
-
-				bool Success = SUCCEEDED(device->CheckFeatureSupport(D3D12_FEATURE_D3D12_OPTIONS3, &featureData, sizeof(featureData)));
-				assert(Success && featureData.CopyQueueTimestampQueriesSupported && "Platform does not support profiling of copy queues.");
-			}
-
-			uint64_t timestampFrequency;
-
-			if (FAILED(queue->GetTimestampFrequency(&timestampFrequency)))
-			{
-				assert(false && "Failed to get timestamp frequency.");
-			}
-
-			uint64_t cpuTimestamp;
-			uint64_t gpuTimestamp;
-
-			if (FAILED(queue->GetClockCalibration(&gpuTimestamp, &cpuTimestamp)))
-			{
-				assert(false && "Failed to get queue clock calibration.");
-			}
-
-			// Save the device cpu timestamp, not the profiler's timestamp.
-			m_prevCalibration = cpuTimestamp * m_qpcToNs;
-
-			cpuTimestamp = Profiler::GetTime();
-
-			D3D12_QUERY_HEAP_DESC heapDesc{};
-			heapDesc.Type = queue->GetDesc().Type == D3D12_COMMAND_LIST_TYPE_COPY ? D3D12_QUERY_HEAP_TYPE_COPY_QUEUE_TIMESTAMP : D3D12_QUERY_HEAP_TYPE_TIMESTAMP;
-			heapDesc.Count = m_queryLimit;
-			heapDesc.NodeMask = 0;  // #TODO: Support multiple adapters.
-
-			while (FAILED(device->CreateQueryHeap(&heapDesc, IID_PPV_ARGS(&m_queryHeap))))
-			{
-				m_queryLimit /= 2;
-				heapDesc.Count = m_queryLimit;
-			}
-
-			// Create a readback buffer, which will be used as a destination for the query data.
-
-			D3D12_RESOURCE_DESC readbackBufferDesc{};
-			readbackBufferDesc.Alignment = 0;
-			readbackBufferDesc.Dimension = D3D12_RESOURCE_DIMENSION_BUFFER;
-			readbackBufferDesc.Width = m_queryLimit * sizeof(uint64_t);
-			readbackBufferDesc.Height = 1;
-			readbackBufferDesc.DepthOrArraySize = 1;
-			readbackBufferDesc.Format = DXGI_FORMAT_UNKNOWN;
-			readbackBufferDesc.Layout = D3D12_TEXTURE_LAYOUT_ROW_MAJOR;  // Buffers are always row major.
-			readbackBufferDesc.MipLevels = 1;
-			readbackBufferDesc.SampleDesc.Count = 1;
-			readbackBufferDesc.SampleDesc.Quality = 0;
-			readbackBufferDesc.Flags = D3D12_RESOURCE_FLAG_NONE;
-
-			D3D12_HEAP_PROPERTIES readbackHeapProps{};
-			readbackHeapProps.Type = D3D12_HEAP_TYPE_READBACK;
-			readbackHeapProps.CPUPageProperty = D3D12_CPU_PAGE_PROPERTY_UNKNOWN;
-			readbackHeapProps.MemoryPoolPreference = D3D12_MEMORY_POOL_UNKNOWN;
-			readbackHeapProps.CreationNodeMask = 0;
-			readbackHeapProps.VisibleNodeMask = 0;  // #TODO: Support multiple adapters.
-
-			if (FAILED(device->CreateCommittedResource(&readbackHeapProps, D3D12_HEAP_FLAG_NONE, &readbackBufferDesc, D3D12_RESOURCE_STATE_COPY_DEST, nullptr, IID_PPV_ARGS(&m_readbackBuffer))))
-			{
-				assert(false && "Failed to create query readback buffer.");
-			}
-
-			if (FAILED(device->CreateFence(0, D3D12_FENCE_FLAG_NONE, IID_PPV_ARGS(&m_payloadFence))))
-			{
-				assert(false && "Failed to create payload fence.");
-			}
-
-			auto* item = Profiler::QueueSerial();
-			MemWrite(&item->hdr.type, QueueType::GpuNewContext);
-			MemWrite(&item->gpuNewContext.cpuTime, cpuTimestamp);
-			MemWrite(&item->gpuNewContext.gpuTime, gpuTimestamp);
-			memset(&item->gpuNewContext.thread, 0, sizeof(item->gpuNewContext.thread));
-			MemWrite(&item->gpuNewContext.period, 1E+09f / static_cast<float>(timestampFrequency));
-			MemWrite(&item->gpuNewContext.context, m_context);
-			MemWrite(&item->gpuNewContext.flags, GpuContextCalibration);
-			MemWrite(&item->gpuNewContext.type, GpuContextType::Direct3D12);
-
-#ifdef TRACY_ON_DEMAND
-			GetProfiler().DeferItem(*item);
-#endif
-
-			Profiler::QueueSerialFinish();
-
-			m_initialized = true;
-		}
-
-		void NewFrame()
-		{
-			uint32_t queryCounter = m_queryCounter.exchange(0);
-			m_payloadQueue.emplace(D3D12QueryPayload{ m_previousQueryCounter, queryCounter });
-			m_previousQueryCounter += queryCounter;
-
-			if (m_previousQueryCounter >= m_queryLimit)
-			{
-				m_previousQueryCounter -= m_queryLimit;
-			}
-
-			m_queue->Signal(m_payloadFence.Get(), ++m_activePayload);
-		}
-
-		void Name( const char* name, uint16_t len )
-		{
-			auto ptr = (char*)tracy_malloc( len );
-			memcpy( ptr, name, len );
-
-			auto item = Profiler::QueueSerial();
-			MemWrite( &item->hdr.type, QueueType::GpuContextName );
-			MemWrite( &item->gpuContextNameFat.context, m_context );
-			MemWrite( &item->gpuContextNameFat.ptr, (uint64_t)ptr );
-			MemWrite( &item->gpuContextNameFat.size, len );
-#ifdef TRACY_ON_DEMAND
-			GetProfiler().DeferItem( *item );
-#endif
-			Profiler::QueueSerialFinish();
-		}
-
-		void Collect()
-		{
-			ZoneScopedC(Color::Red4);
-
-#ifdef TRACY_ON_DEMAND
-			if (!GetProfiler().IsConnected())
-			{
-				m_queryCounter = 0;
-
-				return;
-			}
-#endif
-
-			// Find out what payloads are available.
-			const auto newestReadyPayload = m_payloadFence->GetCompletedValue();
-			const auto payloadCount = m_payloadQueue.size() - (m_activePayload - newestReadyPayload);
-
-			if (!payloadCount)
-			{
-				return;  // No payloads are available yet, exit out.
-			}
-
-			D3D12_RANGE mapRange{ 0, m_queryLimit * sizeof(uint64_t) };
-
-			// Map the readback buffer so we can fetch the query data from the GPU.
-			void* readbackBufferMapping = nullptr;
-
-			if (FAILED(m_readbackBuffer->Map(0, &mapRange, &readbackBufferMapping)))
-			{
-				assert(false && "Failed to map readback buffer.");
-			}
-
-			auto* timestampData = static_cast<uint64_t*>(readbackBufferMapping);
-
-			for (uint32_t i = 0; i < payloadCount; ++i)
-			{
-				const auto& payload = m_payloadQueue.front();
-
-				for (uint32_t j = 0; j < payload.m_queryCount; ++j)
-				{
-					const auto counter = (payload.m_queryIdStart + j) % m_queryLimit;
-					const auto timestamp = timestampData[counter];
-					const auto queryId = counter;
-
-					auto* item = Profiler::QueueSerial();
-					MemWrite(&item->hdr.type, QueueType::GpuTime);
-					MemWrite(&item->gpuTime.gpuTime, timestamp);
-					MemWrite(&item->gpuTime.queryId, static_cast<uint16_t>(queryId));
-					MemWrite(&item->gpuTime.context, m_context);
-
-					Profiler::QueueSerialFinish();
-				}
-
-				m_payloadQueue.pop();
-			}
-
-			m_readbackBuffer->Unmap(0, nullptr);
-
-			// Recalibrate to account for drift.
-
-			uint64_t cpuTimestamp;
-			uint64_t gpuTimestamp;
-
-			if (FAILED(m_queue->GetClockCalibration(&gpuTimestamp, &cpuTimestamp)))
-			{
-				assert(false && "Failed to get queue clock calibration.");
-			}
-
-			cpuTimestamp *= m_qpcToNs;
-
-			const auto cpuDelta = cpuTimestamp - m_prevCalibration;
-			if (cpuDelta > 0)
-			{
-				m_prevCalibration = cpuTimestamp;
-				cpuTimestamp = Profiler::GetTime();
-
-				auto* item = Profiler::QueueSerial();
-				MemWrite(&item->hdr.type, QueueType::GpuCalibration);
-				MemWrite(&item->gpuCalibration.gpuTime, gpuTimestamp);
-				MemWrite(&item->gpuCalibration.cpuTime, cpuTimestamp);
-				MemWrite(&item->gpuCalibration.cpuDelta, cpuDelta);
-				MemWrite(&item->gpuCalibration.context, m_context);
-
-				Profiler::QueueSerialFinish();
-			}
-		}
-
-	private:
-		tracy_force_inline uint32_t NextQueryId()
-		{
-			uint32_t queryCounter = m_queryCounter.fetch_add(2);
-			assert(queryCounter < m_queryLimit && "Submitted too many GPU queries! Consider increasing MaxQueries.");
-
-			const uint32_t id = (m_previousQueryCounter + queryCounter) % m_queryLimit;
-
-			return id;
-		}
-
-		tracy_force_inline uint8_t GetId() const
-		{
-			return m_context;
-		}
-	};
-
-	class D3D12ZoneScope
-	{
-		const bool m_active;
-		D3D12QueueCtx* m_ctx = nullptr;
-		ID3D12GraphicsCommandList* m_cmdList = nullptr;
-		uint32_t m_queryId = 0;  // Used for tracking in nested zones.
-
-	public:
-		tracy_force_inline D3D12ZoneScope(D3D12QueueCtx* ctx, ID3D12GraphicsCommandList* cmdList, const SourceLocationData* srcLocation, bool active)
+    struct D3D12QueryPayload
+    {
+        uint32_t m_queryIdStart = 0;
+        uint32_t m_queryCount = 0;
+    };
+
+    // Command queue context.
+    class D3D12QueueCtx
+    {
+        friend class D3D12ZoneScope;
+
+        ID3D12Device* m_device = nullptr;
+        ID3D12CommandQueue* m_queue = nullptr;
+        uint8_t m_contextId = 255;  // TODO: apparently, 255 means "invalid id"; is this documented somewhere?
+        ID3D12QueryHeap* m_queryHeap = nullptr;
+        ID3D12Resource* m_readbackBuffer = nullptr;
+
+        // In-progress payload.
+        uint32_t m_queryLimit = 0;
+        std::atomic<uint32_t> m_queryCounter = 0;
+        uint32_t m_previousQueryCounter = 0;
+
+        uint32_t m_activePayload = 0;
+        ID3D12Fence* m_payloadFence = nullptr;
+        std::queue<D3D12QueryPayload> m_payloadQueue;
+
+        UINT64 m_prevCalibrationTicksCPU = 0;
+
+        void RecalibrateClocks()
+        {
+            UINT64 cpuTimestamp;
+            UINT64 gpuTimestamp;
+            if (FAILED(m_queue->GetClockCalibration(&gpuTimestamp, &cpuTimestamp)))
+            {
+                TracyD3D12Panic("failed to obtain queue clock calibration counters.", return);
+            }
+
+            int64_t cpuDeltaTicks = cpuTimestamp - m_prevCalibrationTicksCPU;
+            if (cpuDeltaTicks > 0)
+            {
+                static const int64_t nanosecodsPerTick = int64_t(1000000000) / GetFrequencyQpc();
+                int64_t cpuDeltaNS = cpuDeltaTicks * nanosecodsPerTick;
+                // Save the device cpu timestamp, not the Tracy profiler timestamp:
+                m_prevCalibrationTicksCPU = cpuTimestamp;
+
+                cpuTimestamp = Profiler::GetTime();
+
+                auto* item = Profiler::QueueSerial();
+                MemWrite(&item->hdr.type, QueueType::GpuCalibration);
+                MemWrite(&item->gpuCalibration.gpuTime, gpuTimestamp);
+                MemWrite(&item->gpuCalibration.cpuTime, cpuTimestamp);
+                MemWrite(&item->gpuCalibration.cpuDelta, cpuDeltaNS);
+                MemWrite(&item->gpuCalibration.context, GetId());
+                SubmitQueueItem(item);
+            }
+        }
+
+        tracy_force_inline void SubmitQueueItem(tracy::QueueItem* item)
+        {
 #ifdef TRACY_ON_DEMAND
-			: m_active(active && GetProfiler().IsConnected())
-#else
-			: m_active(active)
+            GetProfiler().DeferItem(*item);
 #endif
-		{
-			if (!m_active) return;
-
-			m_ctx = ctx;
-			m_cmdList = cmdList;
-
-			m_queryId = ctx->NextQueryId();
-			cmdList->EndQuery(ctx->m_queryHeap.Get(), D3D12_QUERY_TYPE_TIMESTAMP, m_queryId);
-
-			auto* item = Profiler::QueueSerial();
-			MemWrite(&item->hdr.type, QueueType::GpuZoneBeginSerial);
-			MemWrite(&item->gpuZoneBegin.cpuTime, Profiler::GetTime());
-			MemWrite(&item->gpuZoneBegin.srcloc, reinterpret_cast<uint64_t>(srcLocation));
-			MemWrite(&item->gpuZoneBegin.thread, GetThreadHandle());
-			MemWrite(&item->gpuZoneBegin.queryId, static_cast<uint16_t>(m_queryId));
-			MemWrite(&item->gpuZoneBegin.context, ctx->GetId());
-
-			Profiler::QueueSerialFinish();
-		}
+            Profiler::QueueSerialFinish();
+        }
+
+    public:
+        D3D12QueueCtx(ID3D12Device* device, ID3D12CommandQueue* queue)
+            : m_device(device)
+            , m_queue(queue)
+        {
+            // Verify we support timestamp queries on this queue.
+
+            if (queue->GetDesc().Type == D3D12_COMMAND_LIST_TYPE_COPY)
+            {
+                D3D12_FEATURE_DATA_D3D12_OPTIONS3 featureData{};
+
+                HRESULT hr = device->CheckFeatureSupport(D3D12_FEATURE_D3D12_OPTIONS3, &featureData, sizeof(featureData));
+                if (FAILED(hr) || (featureData.CopyQueueTimestampQueriesSupported == FALSE))
+                {
+                    TracyD3D12Panic("Platform does not support profiling of copy queues.", return);
+                }
+            }
+
+            static constexpr uint32_t MaxQueries = 64 * 1024;  // Must be even, because queries are (begin, end) pairs
+            m_queryLimit = MaxQueries;
+
+            D3D12_QUERY_HEAP_DESC heapDesc{};
+            heapDesc.Type = queue->GetDesc().Type == D3D12_COMMAND_LIST_TYPE_COPY ? D3D12_QUERY_HEAP_TYPE_COPY_QUEUE_TIMESTAMP : D3D12_QUERY_HEAP_TYPE_TIMESTAMP;
+            heapDesc.Count = m_queryLimit;
+            heapDesc.NodeMask = 0;  // #TODO: Support multiple adapters.
+
+            while (FAILED(device->CreateQueryHeap(&heapDesc, IID_PPV_ARGS(&m_queryHeap))))
+            {
+                heapDesc.Count = (m_queryLimit /= 2);  // halve the request and retry with a smaller heap
+                if (m_queryLimit < 2) { TracyD3D12Panic("Failed to create query heap.", return); }  // a (begin, end) pair no longer fits: give up instead of retrying with Count == 0 forever
+            }
+
+            // Create a readback buffer, which will be used as a destination for the query data.
+
+            D3D12_RESOURCE_DESC readbackBufferDesc{};
+            readbackBufferDesc.Alignment = 0;
+            readbackBufferDesc.Dimension = D3D12_RESOURCE_DIMENSION_BUFFER;
+            readbackBufferDesc.Width = m_queryLimit * sizeof(uint64_t);
+            readbackBufferDesc.Height = 1;
+            readbackBufferDesc.DepthOrArraySize = 1;
+            readbackBufferDesc.Format = DXGI_FORMAT_UNKNOWN;
+            readbackBufferDesc.Layout = D3D12_TEXTURE_LAYOUT_ROW_MAJOR;  // Buffers are always row major.
+            readbackBufferDesc.MipLevels = 1;
+            readbackBufferDesc.SampleDesc.Count = 1;
+            readbackBufferDesc.SampleDesc.Quality = 0;
+            readbackBufferDesc.Flags = D3D12_RESOURCE_FLAG_NONE;
+
+            D3D12_HEAP_PROPERTIES readbackHeapProps{};
+            readbackHeapProps.Type = D3D12_HEAP_TYPE_READBACK;
+            readbackHeapProps.CPUPageProperty = D3D12_CPU_PAGE_PROPERTY_UNKNOWN;
+            readbackHeapProps.MemoryPoolPreference = D3D12_MEMORY_POOL_UNKNOWN;
+            readbackHeapProps.CreationNodeMask = 0;
+            readbackHeapProps.VisibleNodeMask = 0;  // #TODO: Support multiple adapters.
+
+            if (FAILED(device->CreateCommittedResource(&readbackHeapProps, D3D12_HEAP_FLAG_NONE, &readbackBufferDesc, D3D12_RESOURCE_STATE_COPY_DEST, nullptr, IID_PPV_ARGS(&m_readbackBuffer))))
+            {
+                TracyD3D12Panic("Failed to create query readback buffer.", return);
+            }
+
+            if (FAILED(device->CreateFence(0, D3D12_FENCE_FLAG_NONE, IID_PPV_ARGS(&m_payloadFence))))
+            {
+                TracyD3D12Panic("Failed to create payload fence.", return);
+            }
+
+            float period = [queue]()
+            {
+                uint64_t timestampFrequency;
+                if (FAILED(queue->GetTimestampFrequency(&timestampFrequency)))
+                {
+                    return 0.0f;
+                }
+                return static_cast<float>( 1E+09 / static_cast<double>(timestampFrequency) );
+            }();
+
+            if (period == 0.0f)
+            {
+                TracyD3D12Panic("Failed to get timestamp frequency.", return);
+            }
+
+            uint64_t cpuTimestamp;
+            uint64_t gpuTimestamp;
+            if (FAILED(queue->GetClockCalibration(&gpuTimestamp, &cpuTimestamp)))
+            {
+                TracyD3D12Panic("Failed to get queue clock calibration.", return);
+            }
+
+            // Save the device cpu timestamp, not the profiler's timestamp.
+            m_prevCalibrationTicksCPU = cpuTimestamp;
+
+            cpuTimestamp = Profiler::GetTime();
+
+            // all checked: ready to roll
+            m_contextId = GetGpuCtxCounter().fetch_add(1);
+
+            auto* item = Profiler::QueueSerial();
+            MemWrite(&item->hdr.type, QueueType::GpuNewContext);
+            MemWrite(&item->gpuNewContext.cpuTime, cpuTimestamp);
+            MemWrite(&item->gpuNewContext.gpuTime, gpuTimestamp);
+            MemWrite(&item->gpuNewContext.thread, decltype(item->gpuNewContext.thread)(0)); // #TODO: why 0 instead of GetThreadHandle()?
+            MemWrite(&item->gpuNewContext.period, period);
+            MemWrite(&item->gpuNewContext.context, GetId());
+            MemWrite(&item->gpuNewContext.flags, GpuContextCalibration);
+            MemWrite(&item->gpuNewContext.type, GpuContextType::Direct3D12);
+            SubmitQueueItem(item);
+        }
+
+        ~D3D12QueueCtx()
+        {
+            ZoneScopedC(Color::Red4);
+            // collect all pending timestamps (skipped when the constructor returned early, before the fence was created)
+            while (m_payloadFence && m_payloadFence->GetCompletedValue() != m_activePayload)
+                /* busy-wait ... */;
+            if (m_payloadFence) Collect();
+            if (m_payloadFence) m_payloadFence->Release();  // each COM pointer is still nullptr if its creation step was never reached
+            if (m_readbackBuffer) m_readbackBuffer->Release();
+            if (m_queryHeap) m_queryHeap->Release();
+        }
+
+
+        void NewFrame()  // Closes the current batch of timestamp queries and signals the payload fence for it.
+        {
+            uint32_t queryCounter = m_queryCounter.exchange(0);  // Slots handed out since the last reset; the counter restarts at 0.
+            m_payloadQueue.emplace(D3D12QueryPayload{ m_previousQueryCounter, queryCounter });  // presumably {m_queryIdStart, m_queryCount} as read by Collect() - TODO confirm field order
+            m_previousQueryCounter += queryCounter;
+
+            if (m_previousQueryCounter >= m_queryLimit)  // Wrap the running start offset around the query limit.
+            {
+                m_previousQueryCounter -= m_queryLimit;
+            }
+
+            m_queue->Signal(m_payloadFence, ++m_activePayload);  // Collect() and the destructor compare the fence's completed value against m_activePayload.
+        }
+
+        void Name( const char* name, uint16_t len )  // Queues a GpuContextName item carrying a copy of the first len bytes of name.
+        {
+            auto ptr = (char*)tracy_malloc( len );
+            memcpy( ptr, name, len );  // Exactly len bytes are copied; no terminator is appended.
+
+            auto item = Profiler::QueueSerial();
+            MemWrite( &item->hdr.type, QueueType::GpuContextName );
+            MemWrite( &item->gpuContextNameFat.context, GetId());
+            MemWrite( &item->gpuContextNameFat.ptr, (uint64_t)ptr );  // The copy is not freed here; presumably the queue consumer releases it - TODO confirm.
+            MemWrite( &item->gpuContextNameFat.size, len );
+            SubmitQueueItem(item);
+        }
+
+        void Collect()  // Reads back timestamps of payloads the fence has completed and queues one GpuTime item per query slot.
+        {
+            ZoneScopedC(Color::Red4);
 
-		tracy_force_inline D3D12ZoneScope(D3D12QueueCtx* ctx, ID3D12GraphicsCommandList* cmdList, const SourceLocationData* srcLocation, int depth, bool active)
 #ifdef TRACY_ON_DEMAND
-			: m_active(active&& GetProfiler().IsConnected())
-#else
-			: m_active(active)
-#endif
-		{
-			if (!m_active) return;
-
-			m_ctx = ctx;
-			m_cmdList = cmdList;
-
-			m_queryId = ctx->NextQueryId();
-			cmdList->EndQuery(ctx->m_queryHeap.Get(), D3D12_QUERY_TYPE_TIMESTAMP, m_queryId);
-
-			auto* item = Profiler::QueueSerialCallstack(Callstack(depth));
-			MemWrite(&item->hdr.type, QueueType::GpuZoneBeginCallstackSerial);
-			MemWrite(&item->gpuZoneBegin.cpuTime, Profiler::GetTime());
-			MemWrite(&item->gpuZoneBegin.srcloc, reinterpret_cast<uint64_t>(srcLocation));
-			MemWrite(&item->gpuZoneBegin.thread, GetThreadHandle());
-			MemWrite(&item->gpuZoneBegin.queryId, static_cast<uint16_t>(m_queryId));
-			MemWrite(&item->gpuZoneBegin.context, ctx->GetId());
+            if (!GetProfiler().IsConnected())
+            {
+                m_queryCounter = 0;  // Not connected: drop the slot count handed out so far and collect nothing.
 
-			Profiler::QueueSerialFinish();
-		}
-
-		tracy_force_inline D3D12ZoneScope(D3D12QueueCtx* ctx, uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, ID3D12GraphicsCommandList* cmdList, bool active)
-#ifdef TRACY_ON_DEMAND
-			: m_active(active&& GetProfiler().IsConnected())
-#else
-			: m_active(active)
+                return;
+            }
 #endif
-		{
-			if (!m_active) return;
-
-			m_ctx = ctx;
-			m_cmdList = cmdList;
 
-			m_queryId = ctx->NextQueryId();
-			cmdList->EndQuery(ctx->m_queryHeap.Get(), D3D12_QUERY_TYPE_TIMESTAMP, m_queryId);
-
-			const auto sourceLocation = Profiler::AllocSourceLocation(line, source, sourceSz, function, functionSz, name, nameSz);
-
-			auto* item = Profiler::QueueSerial();
-			MemWrite(&item->hdr.type, QueueType::GpuZoneBeginAllocSrcLocSerial);
-			MemWrite(&item->gpuZoneBegin.cpuTime, Profiler::GetTime());
-			MemWrite(&item->gpuZoneBegin.srcloc, sourceLocation);
-			MemWrite(&item->gpuZoneBegin.thread, GetThreadHandle());
-			MemWrite(&item->gpuZoneBegin.queryId, static_cast<uint16_t>(m_queryId));
-			MemWrite(&item->gpuZoneBegin.context, ctx->GetId());
-
-			Profiler::QueueSerialFinish();
-		}
-
-		tracy_force_inline D3D12ZoneScope(D3D12QueueCtx* ctx, uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, ID3D12GraphicsCommandList* cmdList, int depth, bool active)
+            // Find out what payloads are available: queued payloads minus those the fence has not completed yet.
+            const auto newestReadyPayload = m_payloadFence->GetCompletedValue();
+            const auto payloadCount = m_payloadQueue.size() - (m_activePayload - newestReadyPayload);
+
+            if (!payloadCount)
+            {
+                return;  // No payloads are available yet, exit out.
+            }
+
+            D3D12_RANGE mapRange{ 0, m_queryLimit * sizeof(uint64_t) };  // Byte range spanning m_queryLimit 64-bit timestamp slots.
+
+            // Map the readback buffer so we can fetch the query data from the GPU.
+            void* readbackBufferMapping = nullptr;
+
+            if (FAILED(m_readbackBuffer->Map(0, &mapRange, &readbackBufferMapping)))
+            {
+                TracyD3D12Panic("Failed to map readback buffer.", return);
+            }
+
+            auto* timestampData = static_cast<uint64_t*>(readbackBufferMapping);
+
+            for (uint32_t i = 0; i < payloadCount; ++i)  // Oldest payload first; each one is popped after its slots are reported.
+            {
+                const auto& payload = m_payloadQueue.front();
+
+                for (uint32_t j = 0; j < payload.m_queryCount; ++j)
+                {
+                    const auto counter = (payload.m_queryIdStart + j) % m_queryLimit;  // Slot index, wrapped around the query limit.
+                    const auto timestamp = timestampData[counter];
+                    const auto queryId = counter;  // The slot index doubles as the query id sent to the profiler.
+
+                    auto* item = Profiler::QueueSerial();
+                    MemWrite(&item->hdr.type, QueueType::GpuTime);
+                    MemWrite(&item->gpuTime.gpuTime, timestamp);
+                    MemWrite(&item->gpuTime.queryId, static_cast<uint16_t>(queryId));
+                    MemWrite(&item->gpuTime.context, GetId());
+
+                    Profiler::QueueSerialFinish();
+                }
+
+                m_payloadQueue.pop();
+            }
+
+            m_readbackBuffer->Unmap(0, nullptr);
+
+            // Recalibrate to account for drift.
+            RecalibrateClocks();
+        }
+
+    private:
+        tracy_force_inline uint32_t NextQueryId()  // Reserves two consecutive query slots and returns the index of the first.
+        {
+            uint32_t queryCounter = m_queryCounter.fetch_add(2);  // Value before the increment: this caller's offset within the current batch.
+            if (queryCounter >= m_queryLimit)
+            {
+                TracyD3D12Panic("Submitted too many GPU queries! Consider increasing MaxQueries.");  // NOTE(review): no `return` argument is passed here, so presumably execution continues - confirm against the macro.
+                // #TODO: consider returning an invalid id or sentinel value here
+            }
+
+            const uint32_t id = (m_previousQueryCounter + queryCounter) % m_queryLimit;  // Batch start plus offset, wrapped around the limit.
+
+            return id;
+        }
+
+        tracy_force_inline uint8_t GetId() const  // Returns the context id the constructor took from GetGpuCtxCounter().
+        {
+            return m_contextId;
+        }
+    };
+
+    class D3D12ZoneScope  // Scoped GPU zone: construction records a begin timestamp query, destruction records the end one and resolves both.
+    {
+        const bool m_active;  // When false, constructors and destructor return without touching the command list.
+        D3D12QueueCtx* m_ctx = nullptr;
+        ID3D12GraphicsCommandList* m_cmdList = nullptr;
+        uint32_t m_queryId = 0;  // Used for tracking in nested zones.
+
+        tracy_force_inline void WriteQueueItem(QueueItem* item, QueueType type, uint64_t srcLocation)  // Fills a zone-begin item of the given type and finishes the serial queue write.
+        {
+            MemWrite(&item->hdr.type, type);
+            MemWrite(&item->gpuZoneBegin.cpuTime, Profiler::GetTime());
+            MemWrite(&item->gpuZoneBegin.srcloc, srcLocation);
+            MemWrite(&item->gpuZoneBegin.thread, GetThreadHandle());
+            MemWrite(&item->gpuZoneBegin.queryId, static_cast<uint16_t>(m_queryId));
+            MemWrite(&item->gpuZoneBegin.context, m_ctx->GetId());
+            Profiler::QueueSerialFinish();
+        }
+
+        tracy_force_inline D3D12ZoneScope(D3D12QueueCtx* ctx, ID3D12GraphicsCommandList* cmdList, bool active)  // Private setup shared by the public constructors via delegation.
 #ifdef TRACY_ON_DEMAND
-			: m_active(active&& GetProfiler().IsConnected())
+            : m_active(active&& GetProfiler().IsConnected())
 #else
-			: m_active(active)
+            : m_active(active)
 #endif
-		{
-			if (!m_active) return;
-
-			m_ctx = ctx;
-			m_cmdList = cmdList;
-
-			m_queryId = ctx->NextQueryId();
-			cmdList->EndQuery(ctx->m_queryHeap.Get(), D3D12_QUERY_TYPE_TIMESTAMP, m_queryId);
-
-			const auto sourceLocation = Profiler::AllocSourceLocation(line, source, sourceSz, function, functionSz, name, nameSz);
-
-			auto* item = Profiler::QueueSerialCallstack(Callstack(depth));
-			MemWrite(&item->hdr.type, QueueType::GpuZoneBeginAllocSrcLocCallstackSerial);
-			MemWrite(&item->gpuZoneBegin.cpuTime, Profiler::GetTime());
-			MemWrite(&item->gpuZoneBegin.srcloc, sourceLocation);
-			MemWrite(&item->gpuZoneBegin.thread, GetThreadHandle());
-			MemWrite(&item->gpuZoneBegin.queryId, static_cast<uint16_t>(m_queryId));
-			MemWrite(&item->gpuZoneBegin.context, ctx->GetId());
-
-			Profiler::QueueSerialFinish();
-		}
-
-		tracy_force_inline ~D3D12ZoneScope()
-		{
-			if (!m_active) return;
-
-			const auto queryId = m_queryId + 1;  // Our end query slot is immediately after the begin slot.
-			m_cmdList->EndQuery(m_ctx->m_queryHeap.Get(), D3D12_QUERY_TYPE_TIMESTAMP, queryId);
-
-			auto* item = Profiler::QueueSerial();
-			MemWrite(&item->hdr.type, QueueType::GpuZoneEndSerial);
-			MemWrite(&item->gpuZoneEnd.cpuTime, Profiler::GetTime());
-			MemWrite(&item->gpuZoneEnd.thread, GetThreadHandle());
-			MemWrite(&item->gpuZoneEnd.queryId, static_cast<uint16_t>(queryId));
-			MemWrite(&item->gpuZoneEnd.context, m_ctx->GetId());
-
-			Profiler::QueueSerialFinish();
-
-			m_cmdList->ResolveQueryData(m_ctx->m_queryHeap.Get(), D3D12_QUERY_TYPE_TIMESTAMP, m_queryId, 2, m_ctx->m_readbackBuffer.Get(), m_queryId * sizeof(uint64_t));
-		}
-	};
-
-	static inline D3D12QueueCtx* CreateD3D12Context(ID3D12Device* device, ID3D12CommandQueue* queue)
-	{
-		auto* ctx = static_cast<D3D12QueueCtx*>(tracy_malloc(sizeof(D3D12QueueCtx)));
-		new (ctx) D3D12QueueCtx{ device, queue };
-
-		return ctx;
-	}
-
-	static inline void DestroyD3D12Context(D3D12QueueCtx* ctx)
-	{
-		ctx->~D3D12QueueCtx();
-		tracy_free(ctx);
-	}
+        {
+            if (!m_active) return;
+
+            m_ctx = ctx;
+            m_cmdList = cmdList;
+
+            m_queryId = m_ctx->NextQueryId();  // First of two reserved slots; the destructor uses m_queryId + 1.
+            m_cmdList->EndQuery(m_ctx->m_queryHeap, D3D12_QUERY_TYPE_TIMESTAMP, m_queryId);  // Begin timestamp.
+        }
+
+    public:
+        tracy_force_inline D3D12ZoneScope(D3D12QueueCtx* ctx, ID3D12GraphicsCommandList* cmdList, const SourceLocationData* srcLocation, bool active)  // Zone with a caller-provided source location, no callstack.
+            : D3D12ZoneScope(ctx, cmdList, active)
+        {
+            if (!m_active) return;
+
+            auto* item = Profiler::QueueSerial();
+            WriteQueueItem(item, QueueType::GpuZoneBeginSerial, reinterpret_cast<uint64_t>(srcLocation));
+        }
+
+        tracy_force_inline D3D12ZoneScope(D3D12QueueCtx* ctx, ID3D12GraphicsCommandList* cmdList, const SourceLocationData* srcLocation, int depth, bool active)  // Same, plus a callstack captured with the given depth.
+            : D3D12ZoneScope(ctx, cmdList, active)
+        {
+            if (!m_active) return;
+
+            auto* item = Profiler::QueueSerialCallstack(Callstack(depth));
+            WriteQueueItem(item, QueueType::GpuZoneBeginCallstackSerial, reinterpret_cast<uint64_t>(srcLocation));
+        }
+
+        tracy_force_inline D3D12ZoneScope(D3D12QueueCtx* ctx, uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, ID3D12GraphicsCommandList* cmdList, bool active)  // Transient zone: source location is allocated at run time.
+            : D3D12ZoneScope(ctx, cmdList, active)
+        {
+            if (!m_active) return;
+
+            const auto sourceLocation = Profiler::AllocSourceLocation(line, source, sourceSz, function, functionSz, name, nameSz);
+
+            auto* item = Profiler::QueueSerial();
+            WriteQueueItem(item, QueueType::GpuZoneBeginAllocSrcLocSerial, sourceLocation);
+        }
+
+        tracy_force_inline D3D12ZoneScope(D3D12QueueCtx* ctx, uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, ID3D12GraphicsCommandList* cmdList, int depth, bool active)  // Transient zone with a callstack.
+            : D3D12ZoneScope(ctx, cmdList, active)
+        {
+            if (!m_active) return;
+
+            const auto sourceLocation = Profiler::AllocSourceLocation(line, source, sourceSz, function, functionSz, name, nameSz);
+
+            auto* item = Profiler::QueueSerialCallstack(Callstack(depth));
+            WriteQueueItem(item, QueueType::GpuZoneBeginAllocSrcLocCallstackSerial, sourceLocation);
+        }
+
+        tracy_force_inline ~D3D12ZoneScope()  // Records the end timestamp, queues the zone-end item, then resolves both queries.
+        {
+            if (!m_active) return;
+
+            const auto queryId = m_queryId + 1;  // Our end query slot is immediately after the begin slot.
+            m_cmdList->EndQuery(m_ctx->m_queryHeap, D3D12_QUERY_TYPE_TIMESTAMP, queryId);
+
+            auto* item = Profiler::QueueSerial();
+            MemWrite(&item->hdr.type, QueueType::GpuZoneEndSerial);
+            MemWrite(&item->gpuZoneEnd.cpuTime, Profiler::GetTime());
+            MemWrite(&item->gpuZoneEnd.thread, GetThreadHandle());
+            MemWrite(&item->gpuZoneEnd.queryId, static_cast<uint16_t>(queryId));
+            MemWrite(&item->gpuZoneEnd.context, m_ctx->GetId());
+            Profiler::QueueSerialFinish();
+
+            m_cmdList->ResolveQueryData(m_ctx->m_queryHeap, D3D12_QUERY_TYPE_TIMESTAMP, m_queryId, 2, m_ctx->m_readbackBuffer, m_queryId * sizeof(uint64_t));  // Both slots (begin, end) go to the readback buffer at the matching byte offset.
+        }
+    };
+
+    static inline D3D12QueueCtx* CreateD3D12Context(ID3D12Device* device, ID3D12CommandQueue* queue)  // Allocates storage with tracy_malloc and constructs a context in it.
+    {
+        auto* ctx = static_cast<D3D12QueueCtx*>(tracy_malloc(sizeof(D3D12QueueCtx)));
+        new (ctx) D3D12QueueCtx{ device, queue };  // Placement new into tracy_malloc storage: release with DestroyD3D12Context(), not delete.
+
+        return ctx;
+    }
+
+    static inline void DestroyD3D12Context(D3D12QueueCtx* ctx)  // Destroys and frees a context obtained from CreateD3D12Context().
+    {
+        ctx->~D3D12QueueCtx();  // Explicit destructor call, matching the placement new used at creation.
+        tracy_free(ctx);
+    }
 
 }
 
+#undef TracyD3D12Panic
+
 using TracyD3D12Ctx = tracy::D3D12QueueCtx*;
 
 #define TracyD3D12Context(device, queue) tracy::CreateD3D12Context(device, queue);
@@ -471,25 +461,29 @@ using TracyD3D12Ctx = tracy::D3D12QueueCtx*;
 
 #define TracyD3D12NewFrame(ctx) ctx->NewFrame();
 
+#define TracyD3D12UnnamedZone ___tracy_gpu_d3d12_zone
+#define TracyD3D12SrcLocSymbol TracyConcat(__tracy_d3d12_source_location,TracyLine)
+#define TracyD3D12SrcLocObject(name, color) static constexpr tracy::SourceLocationData TracyD3D12SrcLocSymbol { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color };
+
 #if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK
-#  define TracyD3D12Zone(ctx, cmdList, name) TracyD3D12NamedZoneS(ctx, ___tracy_gpu_zone, cmdList, name, TRACY_CALLSTACK, true)
-#  define TracyD3D12ZoneC(ctx, cmdList, name, color) TracyD3D12NamedZoneCS(ctx, ___tracy_gpu_zone, cmdList, name, color, TRACY_CALLSTACK, true)
-#  define TracyD3D12NamedZone(ctx, varname, cmdList, name, active) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location, TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::D3D12ZoneScope varname{ ctx, cmdList, &TracyConcat(__tracy_gpu_source_location, TracyLine), TRACY_CALLSTACK, active };
-#  define TracyD3D12NamedZoneC(ctx, varname, cmdList, name, color, active) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location, TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::D3D12ZoneScope varname{ ctx, cmdList, &TracyConcat(__tracy_gpu_source_location, TracyLine), TRACY_CALLSTACK, active };
+#  define TracyD3D12Zone(ctx, cmdList, name) TracyD3D12NamedZoneS(ctx, TracyD3D12UnnamedZone, cmdList, name, TRACY_CALLSTACK, true)
+#  define TracyD3D12ZoneC(ctx, cmdList, name, color) TracyD3D12NamedZoneCS(ctx, TracyD3D12UnnamedZone, cmdList, name, color, TRACY_CALLSTACK, true)
+#  define TracyD3D12NamedZone(ctx, varname, cmdList, name, active) TracyD3D12SrcLocObject(name, 0); tracy::D3D12ZoneScope varname{ ctx, cmdList, &TracyD3D12SrcLocSymbol, TRACY_CALLSTACK, active };
+#  define TracyD3D12NamedZoneC(ctx, varname, cmdList, name, color, active) TracyD3D12SrcLocObject(name, color); tracy::D3D12ZoneScope varname{ ctx, cmdList, &TracyD3D12SrcLocSymbol, TRACY_CALLSTACK, active };
 #  define TracyD3D12ZoneTransient(ctx, varname, cmdList, name, active) TracyD3D12ZoneTransientS(ctx, varname, cmdList, name, TRACY_CALLSTACK, active)
 #else
-#  define TracyD3D12Zone(ctx, cmdList, name) TracyD3D12NamedZone(ctx, ___tracy_gpu_zone, cmdList, name, true)
-#  define TracyD3D12ZoneC(ctx, cmdList, name, color) TracyD3D12NamedZoneC(ctx, ___tracy_gpu_zone, cmdList, name, color, true)
-#  define TracyD3D12NamedZone(ctx, varname, cmdList, name, active) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location, TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::D3D12ZoneScope varname{ ctx, cmdList, &TracyConcat(__tracy_gpu_source_location, TracyLine), active };
-#  define TracyD3D12NamedZoneC(ctx, varname, cmdList, name, color, active) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location, TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::D3D12ZoneScope varname{ ctx, cmdList, &TracyConcat(__tracy_gpu_source_location, TracyLine), active };
+#  define TracyD3D12Zone(ctx, cmdList, name) TracyD3D12NamedZone(ctx, TracyD3D12UnnamedZone, cmdList, name, true)
+#  define TracyD3D12ZoneC(ctx, cmdList, name, color) TracyD3D12NamedZoneC(ctx, TracyD3D12UnnamedZone, cmdList, name, color, true)
+#  define TracyD3D12NamedZone(ctx, varname, cmdList, name, active) TracyD3D12SrcLocObject(name, 0); tracy::D3D12ZoneScope varname{ ctx, cmdList, &TracyD3D12SrcLocSymbol, active };
+#  define TracyD3D12NamedZoneC(ctx, varname, cmdList, name, color, active) TracyD3D12SrcLocObject(name, color); tracy::D3D12ZoneScope varname{ ctx, cmdList, &TracyD3D12SrcLocSymbol, active };
 #  define TracyD3D12ZoneTransient(ctx, varname, cmdList, name, active) tracy::D3D12ZoneScope varname{ ctx, TracyLine, TracyFile, strlen(TracyFile), TracyFunction, strlen(TracyFunction), name, strlen(name), cmdList, active };
 #endif
 
 #ifdef TRACY_HAS_CALLSTACK
-#  define TracyD3D12ZoneS(ctx, cmdList, name, depth) TracyD3D12NamedZoneS(ctx, ___tracy_gpu_zone, cmdList, name, depth, true)
-#  define TracyD3D12ZoneCS(ctx, cmdList, name, color, depth) TracyD3D12NamedZoneCS(ctx, ___tracy_gpu_zone, cmdList, name, color, depth, true)
-#  define TracyD3D12NamedZoneS(ctx, varname, cmdList, name, depth, active) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location, TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::D3D12ZoneScope varname{ ctx, cmdList, &TracyConcat(__tracy_gpu_source_location, TracyLine), depth, active };
-#  define TracyD3D12NamedZoneCS(ctx, varname, cmdList, name, color, depth, active) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location, TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::D3D12ZoneScope varname{ ctx, cmdList, &TracyConcat(__tracy_gpu_source_location, TracyLine), depth, active };
+#  define TracyD3D12ZoneS(ctx, cmdList, name, depth) TracyD3D12NamedZoneS(ctx, TracyD3D12UnnamedZone, cmdList, name, depth, true)
+#  define TracyD3D12ZoneCS(ctx, cmdList, name, color, depth) TracyD3D12NamedZoneCS(ctx, TracyD3D12UnnamedZone, cmdList, name, color, depth, true)
+#  define TracyD3D12NamedZoneS(ctx, varname, cmdList, name, depth, active) TracyD3D12SrcLocObject(name, 0); tracy::D3D12ZoneScope varname{ ctx, cmdList, &TracyD3D12SrcLocSymbol, depth, active };
+#  define TracyD3D12NamedZoneCS(ctx, varname, cmdList, name, color, depth, active) TracyD3D12SrcLocObject(name, color); tracy::D3D12ZoneScope varname{ ctx, cmdList, &TracyD3D12SrcLocSymbol, depth, active };
 #  define TracyD3D12ZoneTransientS(ctx, varname, cmdList, name, depth, active) tracy::D3D12ZoneScope varname{ ctx, TracyLine, TracyFile, strlen(TracyFile), TracyFunction, strlen(TracyFunction), name, strlen(name), cmdList, depth, active };
 #else
 #  define TracyD3D12ZoneS(ctx, cmdList, name, depth) TracyD3D12Zone(ctx, cmdList, name)
diff --git a/thirdparty/tracy/include/tracy/tracy/TracyLua.hpp b/thirdparty/tracy/include/tracy/tracy/TracyLua.hpp
index 6ee2e3087d73bf04b182079f67558c664014bb98..c972ffb26da6a555183a72768fab8161f7c407ba 100644
--- a/thirdparty/tracy/include/tracy/tracy/TracyLua.hpp
+++ b/thirdparty/tracy/include/tracy/tracy/TracyLua.hpp
@@ -173,10 +173,10 @@ static tracy_force_inline void SendLuaCallstack( lua_State* L, uint32_t depth )
     {
         const uint32_t line = dbg[i].currentline;
         memcpy( dst, &line, 4 ); dst += 4;
-        assert( fsz[i] <= std::numeric_limits<uint16_t>::max() );
+        assert( fsz[i] <= (std::numeric_limits<uint16_t>::max)() );
         memcpy( dst, fsz+i, 2 ); dst += 2;
         memcpy( dst, func[i], fsz[i] ); dst += fsz[i];
-        assert( ssz[i] <= std::numeric_limits<uint16_t>::max() );
+        assert( ssz[i] <= (std::numeric_limits<uint16_t>::max)() );
         memcpy( dst, ssz+i, 2 ); dst += 2;
         memcpy( dst, dbg[i].source, ssz[i] ), dst += ssz[i];
     }
@@ -333,7 +333,7 @@ static inline int LuaZoneText( lua_State* L )
 
     auto txt = lua_tostring( L, 1 );
     const auto size = strlen( txt );
-    assert( size < std::numeric_limits<uint16_t>::max() );
+    assert( size < (std::numeric_limits<uint16_t>::max)() );
 
     auto ptr = (char*)tracy_malloc( size );
     memcpy( ptr, txt, size );
@@ -358,7 +358,7 @@ static inline int LuaZoneName( lua_State* L )
 
     auto txt = lua_tostring( L, 1 );
     const auto size = strlen( txt );
-    assert( size < std::numeric_limits<uint16_t>::max() );
+    assert( size < (std::numeric_limits<uint16_t>::max)() );
 
     auto ptr = (char*)tracy_malloc( size );
     memcpy( ptr, txt, size );
@@ -378,7 +378,7 @@ static inline int LuaMessage( lua_State* L )
 
     auto txt = lua_tostring( L, 1 );
     const auto size = strlen( txt );
-    assert( size < std::numeric_limits<uint16_t>::max() );
+    assert( size < (std::numeric_limits<uint16_t>::max)() );
 
     auto ptr = (char*)tracy_malloc( size );
     memcpy( ptr, txt, size );
diff --git a/thirdparty/tracy/include/tracy/tracy/TracyVulkan.hpp b/thirdparty/tracy/include/tracy/tracy/TracyVulkan.hpp
index 3f4f6a31c1a85257aea208be7488d392393edc83..2d079f7b5af96271e6cee8238d6a52d173280cd5 100644
--- a/thirdparty/tracy/include/tracy/tracy/TracyVulkan.hpp
+++ b/thirdparty/tracy/include/tracy/tracy/TracyVulkan.hpp
@@ -5,6 +5,9 @@
 
 #define TracyVkContext(x,y,z,w) nullptr
 #define TracyVkContextCalibrated(x,y,z,w,a,b) nullptr
+#if defined VK_EXT_host_query_reset
+#define TracyVkContextHostCalibrated(x,y,z,w,a) nullptr
+#endif
 #define TracyVkDestroy(x)
 #define TracyVkContextName(c,x,y)
 #define TracyVkNamedZone(c,x,y,z,w)
@@ -39,9 +42,47 @@ using TracyVkCtx = void*;
 #include "../client/TracyProfiler.hpp"
 #include "../client/TracyCallstack.hpp"
 
+#include <atomic>
+
 namespace tracy
 {
 
+#if defined TRACY_VK_USE_SYMBOL_TABLE
+#define LoadVkDeviceCoreSymbols(Operation) \
+    Operation(vkBeginCommandBuffer) \
+    Operation(vkCmdResetQueryPool) \
+    Operation(vkCmdWriteTimestamp) \
+    Operation(vkCreateQueryPool) \
+    Operation(vkDestroyQueryPool) \
+    Operation(vkEndCommandBuffer) \
+    Operation(vkGetQueryPoolResults) \
+    Operation(vkQueueSubmit) \
+    Operation(vkQueueWaitIdle) \
+    Operation(vkResetQueryPool)
+
+#define LoadVkDeviceExtensionSymbols(Operation) \
+    Operation(vkGetCalibratedTimestampsEXT) \
+    Operation(vkGetPhysicalDeviceCalibrateableTimeDomainsEXT)
+
+#define LoadVkInstanceCoreSymbols(Operation) \
+    Operation(vkGetPhysicalDeviceProperties)
+
+struct VkSymbolTable  // Function-pointer table: one PFN_<name> member per entry point named in the Load* symbol lists.
+{
+#define MAKE_PFN(name) PFN_##name name;
+    LoadVkDeviceCoreSymbols(MAKE_PFN)  // Members for the core device-level functions.
+    LoadVkDeviceExtensionSymbols(MAKE_PFN)  // Members for the two calibrated-timestamp EXT functions.
+    LoadVkInstanceCoreSymbols(MAKE_PFN)  // Member for vkGetPhysicalDeviceProperties.
+#undef MAKE_PFN
+};
+
+#define VK_FUNCTION_WRAPPER(callSignature) m_symbols.callSignature
+#define CONTEXT_VK_FUNCTION_WRAPPER(callSignature) m_ctx->m_symbols.callSignature
+#else
+#define VK_FUNCTION_WRAPPER(callSignature) callSignature
+#define CONTEXT_VK_FUNCTION_WRAPPER(callSignature) callSignature
+#endif
+
 class VkCtx
 {
     friend class VkCtxScope;
@@ -49,7 +90,11 @@ class VkCtx
     enum { QueryCount = 64 * 1024 };
 
 public:
-    VkCtx( VkPhysicalDevice physdev, VkDevice device, VkQueue queue, VkCommandBuffer cmdbuf, PFN_vkGetPhysicalDeviceCalibrateableTimeDomainsEXT _vkGetPhysicalDeviceCalibrateableTimeDomainsEXT, PFN_vkGetCalibratedTimestampsEXT _vkGetCalibratedTimestampsEXT )
+#if defined TRACY_VK_USE_SYMBOL_TABLE
+    VkCtx( VkInstance instance, VkPhysicalDevice physdev, VkDevice device, VkQueue queue, VkCommandBuffer cmdbuf, PFN_vkGetInstanceProcAddr instanceProcAddr, PFN_vkGetDeviceProcAddr deviceProcAddr, bool calibrated )
+#else
+    VkCtx( VkPhysicalDevice physdev, VkDevice device, VkQueue queue, VkCommandBuffer cmdbuf, PFN_vkGetPhysicalDeviceCalibrateableTimeDomainsEXT vkGetPhysicalDeviceCalibrateableTimeDomainsEXT, PFN_vkGetCalibratedTimestampsEXT vkGetCalibratedTimestampsEXT)
+#endif
         : m_device( device )
         , m_timeDomain( VK_TIME_DOMAIN_DEVICE_EXT )
         , m_context( GetGpuCtxCounter().fetch_add( 1, std::memory_order_relaxed ) )
@@ -57,47 +102,28 @@ public:
         , m_tail( 0 )
         , m_oldCnt( 0 )
         , m_queryCount( QueryCount )
-        , m_vkGetCalibratedTimestampsEXT( _vkGetCalibratedTimestampsEXT )
+#if !defined TRACY_VK_USE_SYMBOL_TABLE
+        , m_vkGetCalibratedTimestampsEXT( vkGetCalibratedTimestampsEXT )
+#endif
     {
         assert( m_context != 255 );
 
-        if( _vkGetPhysicalDeviceCalibrateableTimeDomainsEXT && _vkGetCalibratedTimestampsEXT )
+#if defined TRACY_VK_USE_SYMBOL_TABLE
+        PopulateSymbolTable(instance, instanceProcAddr, deviceProcAddr);
+        if ( calibrated )
         {
-            uint32_t num;
-            _vkGetPhysicalDeviceCalibrateableTimeDomainsEXT( physdev, &num, nullptr );
-            if( num > 4 ) num = 4;
-            VkTimeDomainEXT data[4];
-            _vkGetPhysicalDeviceCalibrateableTimeDomainsEXT( physdev, &num, data );
-            VkTimeDomainEXT supportedDomain = (VkTimeDomainEXT)-1;
-#if defined _WIN32
-            supportedDomain = VK_TIME_DOMAIN_QUERY_PERFORMANCE_COUNTER_EXT;
-#elif defined __linux__ && defined CLOCK_MONOTONIC_RAW
-            supportedDomain = VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_EXT;
-#endif
-            for( uint32_t i=0; i<num; i++ )
-            {
-                if( data[i] == supportedDomain )
-                {
-                    m_timeDomain = data[i];
-                    break;
-                }
-            }
+            m_vkGetCalibratedTimestampsEXT = m_symbols.vkGetCalibratedTimestampsEXT;
         }
 
-        VkPhysicalDeviceProperties prop;
-        vkGetPhysicalDeviceProperties( physdev, &prop );
-        const float period = prop.limits.timestampPeriod;
+#endif
 
-        VkQueryPoolCreateInfo poolInfo = {};
-        poolInfo.sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO;
-        poolInfo.queryCount = m_queryCount;
-        poolInfo.queryType = VK_QUERY_TYPE_TIMESTAMP;
-        while( vkCreateQueryPool( device, &poolInfo, nullptr, &m_query ) != VK_SUCCESS )
+        if( VK_FUNCTION_WRAPPER( vkGetPhysicalDeviceCalibrateableTimeDomainsEXT ) && m_vkGetCalibratedTimestampsEXT )
         {
-            m_queryCount /= 2;
-            poolInfo.queryCount = m_queryCount;
+            FindAvailableTimeDomains( physdev, VK_FUNCTION_WRAPPER( vkGetPhysicalDeviceCalibrateableTimeDomainsEXT ) );
         }
 
+        CreateQueryPool();
+
         VkCommandBufferBeginInfo beginInfo = {};
         beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
         beginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
@@ -107,87 +133,96 @@ public:
         submitInfo.commandBufferCount = 1;
         submitInfo.pCommandBuffers = &cmdbuf;
 
-        vkBeginCommandBuffer( cmdbuf, &beginInfo );
-        vkCmdResetQueryPool( cmdbuf, m_query, 0, m_queryCount );
-        vkEndCommandBuffer( cmdbuf );
-        vkQueueSubmit( queue, 1, &submitInfo, VK_NULL_HANDLE );
-        vkQueueWaitIdle( queue );
+        VK_FUNCTION_WRAPPER( vkBeginCommandBuffer( cmdbuf, &beginInfo ) );
+        VK_FUNCTION_WRAPPER( vkCmdResetQueryPool( cmdbuf, m_query, 0, m_queryCount ) );
+        VK_FUNCTION_WRAPPER( vkEndCommandBuffer( cmdbuf ) );
+        VK_FUNCTION_WRAPPER( vkQueueSubmit( queue, 1, &submitInfo, VK_NULL_HANDLE ) );
+        VK_FUNCTION_WRAPPER( vkQueueWaitIdle( queue ) );
 
         int64_t tcpu, tgpu;
         if( m_timeDomain == VK_TIME_DOMAIN_DEVICE_EXT )
         {
-            vkBeginCommandBuffer( cmdbuf, &beginInfo );
-            vkCmdWriteTimestamp( cmdbuf, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, m_query, 0 );
-            vkEndCommandBuffer( cmdbuf );
-            vkQueueSubmit( queue, 1, &submitInfo, VK_NULL_HANDLE );
-            vkQueueWaitIdle( queue );
+            VK_FUNCTION_WRAPPER( vkBeginCommandBuffer( cmdbuf, &beginInfo ) );
+            VK_FUNCTION_WRAPPER( vkCmdWriteTimestamp( cmdbuf, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, m_query, 0 ) );
+            VK_FUNCTION_WRAPPER( vkEndCommandBuffer( cmdbuf ) );
+            VK_FUNCTION_WRAPPER( vkQueueSubmit( queue, 1, &submitInfo, VK_NULL_HANDLE ) );
+            VK_FUNCTION_WRAPPER( vkQueueWaitIdle( queue ) );
 
             tcpu = Profiler::GetTime();
-            vkGetQueryPoolResults( device, m_query, 0, 1, sizeof( tgpu ), &tgpu, sizeof( tgpu ), VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT );
+            VK_FUNCTION_WRAPPER( vkGetQueryPoolResults( device, m_query, 0, 1, sizeof( tgpu ), &tgpu, sizeof( tgpu ), VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT ) );
 
-            vkBeginCommandBuffer( cmdbuf, &beginInfo );
-            vkCmdResetQueryPool( cmdbuf, m_query, 0, 1 );
-            vkEndCommandBuffer( cmdbuf );
-            vkQueueSubmit( queue, 1, &submitInfo, VK_NULL_HANDLE );
-            vkQueueWaitIdle( queue );
+            VK_FUNCTION_WRAPPER( vkBeginCommandBuffer( cmdbuf, &beginInfo ) );
+            VK_FUNCTION_WRAPPER( vkCmdResetQueryPool( cmdbuf, m_query, 0, 1 ) );
+            VK_FUNCTION_WRAPPER( vkEndCommandBuffer( cmdbuf ) );
+            VK_FUNCTION_WRAPPER( vkQueueSubmit( queue, 1, &submitInfo, VK_NULL_HANDLE ) );
+            VK_FUNCTION_WRAPPER( vkQueueWaitIdle( queue ) );
         }
         else
         {
-            enum { NumProbes = 32 };
-
-            VkCalibratedTimestampInfoEXT spec[2] = {
-                { VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT, nullptr, VK_TIME_DOMAIN_DEVICE_EXT },
-                { VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT, nullptr, m_timeDomain },
-            };
-            uint64_t ts[2];
-            uint64_t deviation[NumProbes];
-            for( int i=0; i<NumProbes; i++ )
-            {
-                _vkGetCalibratedTimestampsEXT( device, 2, spec, ts, deviation+i );
-            }
-            uint64_t minDeviation = deviation[0];
-            for( int i=1; i<NumProbes; i++ )
-            {
-                if( minDeviation > deviation[i] )
-                {
-                    minDeviation = deviation[i];
-                }
-            }
-            m_deviation = minDeviation * 3 / 2;
-
-#if defined _WIN32
-            m_qpcToNs = int64_t( 1000000000. / GetFrequencyQpc() );
-#endif
-
+            FindCalibratedTimestampDeviation();
             Calibrate( device, m_prevCalibration, tgpu );
             tcpu = Profiler::GetTime();
         }
 
-        uint8_t flags = 0;
-        if( m_timeDomain != VK_TIME_DOMAIN_DEVICE_EXT ) flags |= GpuContextCalibration;
+        WriteInitialItem( physdev, tcpu, tgpu );
 
-        auto item = Profiler::QueueSerial();
-        MemWrite( &item->hdr.type, QueueType::GpuNewContext );
-        MemWrite( &item->gpuNewContext.cpuTime, tcpu );
-        MemWrite( &item->gpuNewContext.gpuTime, tgpu );
-        memset( &item->gpuNewContext.thread, 0, sizeof( item->gpuNewContext.thread ) );
-        MemWrite( &item->gpuNewContext.period, period );
-        MemWrite( &item->gpuNewContext.context, m_context );
-        MemWrite( &item->gpuNewContext.flags, flags );
-        MemWrite( &item->gpuNewContext.type, GpuContextType::Vulkan );
+        m_res = (int64_t*)tracy_malloc( sizeof( int64_t ) * m_queryCount );
+    }
 
-#ifdef TRACY_ON_DEMAND
-        GetProfiler().DeferItem( *item );
+#if defined VK_EXT_host_query_reset
+    /**
+     * Alternative constructor that records no command buffers: the query pool is reset from the host with
+     * vkResetQueryPool (VK_EXT_host_query_reset, core with 1.2) and clocks come from VK_EXT_calibrated_timestamps.
+     * Requires the physical device to expose a calibrateable time domain other than DEVICE.
+     */
+#if defined TRACY_VK_USE_SYMBOL_TABLE
+    VkCtx( VkInstance instance, VkPhysicalDevice physdev, VkDevice device, PFN_vkGetInstanceProcAddr instanceProcAddr, PFN_vkGetDeviceProcAddr deviceProcAddr )
+#else
+    VkCtx( VkPhysicalDevice physdev, VkDevice device, PFN_vkResetQueryPoolEXT vkResetQueryPool, PFN_vkGetPhysicalDeviceCalibrateableTimeDomainsEXT vkGetPhysicalDeviceCalibrateableTimeDomainsEXT, PFN_vkGetCalibratedTimestampsEXT vkGetCalibratedTimestampsEXT )
 #endif
-        Profiler::QueueSerialFinish();
+        : m_device( device )
+        , m_timeDomain( VK_TIME_DOMAIN_DEVICE_EXT )
+        , m_context( GetGpuCtxCounter().fetch_add(1, std::memory_order_relaxed) )
+        , m_head( 0 )
+        , m_tail( 0 )
+        , m_oldCnt( 0 )
+        , m_queryCount( QueryCount )
+#if !defined TRACY_VK_USE_SYMBOL_TABLE
+        , m_vkGetCalibratedTimestampsEXT( vkGetCalibratedTimestampsEXT )
+#endif
+    {
+        assert( m_context != 255);  // presumably 255 is reserved as an invalid context id - TODO confirm
+
+#if defined TRACY_VK_USE_SYMBOL_TABLE
+        PopulateSymbolTable(instance, instanceProcAddr, deviceProcAddr);
+        m_vkGetCalibratedTimestampsEXT = m_symbols.vkGetCalibratedTimestampsEXT;
+#endif
+
+        assert( VK_FUNCTION_WRAPPER( vkResetQueryPool ) != nullptr );  // All three entry points are mandatory for this constructor.
+        assert( VK_FUNCTION_WRAPPER( vkGetPhysicalDeviceCalibrateableTimeDomainsEXT ) != nullptr );
+        assert( VK_FUNCTION_WRAPPER( vkGetCalibratedTimestampsEXT ) != nullptr );
+
+        FindAvailableTimeDomains( physdev, VK_FUNCTION_WRAPPER( vkGetPhysicalDeviceCalibrateableTimeDomainsEXT ) );
+
+        // We require a host time domain to be available to properly calibrate. NOTE(review): nothing here checks that one was selected - confirm.
+        FindCalibratedTimestampDeviation();
+        int64_t tgpu;
+        Calibrate( device, m_prevCalibration, tgpu );
+        int64_t tcpu = Profiler::GetTime();
+
+        CreateQueryPool();
+        VK_FUNCTION_WRAPPER( vkResetQueryPool( device, m_query, 0, m_queryCount ) );  // Host-side reset of all m_queryCount queries; no command buffer is recorded.
+
+        WriteInitialItem( physdev, tcpu, tgpu );
 
         m_res = (int64_t*)tracy_malloc( sizeof( int64_t ) * m_queryCount );
     }
+#endif
 
     ~VkCtx()
     {
         tracy_free( m_res );
-        vkDestroyQueryPool( m_device, m_query, nullptr );
+        VK_FUNCTION_WRAPPER( vkDestroyQueryPool( m_device, m_query, nullptr ) );  // Expands to m_symbols.vkDestroyQueryPool(...) under TRACY_VK_USE_SYMBOL_TABLE, else the plain call.
     }
 
     void Name( const char* name, uint16_t len )
@@ -210,18 +245,23 @@ public:
     {
         ZoneScopedC( Color::Red4 );
 
-        if( m_tail == m_head ) return;
+        const uint64_t head = m_head.load(std::memory_order_relaxed);
+        if( m_tail == head ) return;
 
 #ifdef TRACY_ON_DEMAND
         if( !GetProfiler().IsConnected() )
         {
-            vkCmdResetQueryPool( cmdbuf, m_query, 0, m_queryCount );
-            m_head = m_tail = m_oldCnt = 0;
+            VK_FUNCTION_WRAPPER( vkCmdResetQueryPool( cmdbuf, m_query, 0, m_queryCount ) );
+            m_tail = head;
+            m_oldCnt = 0;
             int64_t tgpu;
             if( m_timeDomain != VK_TIME_DOMAIN_DEVICE_EXT ) Calibrate( m_device, m_prevCalibration, tgpu );
             return;
         }
 #endif
+        assert( head > m_tail );
+        
+        const unsigned int wrappedTail = (unsigned int)( m_tail % m_queryCount );
 
         unsigned int cnt;
         if( m_oldCnt != 0 )
@@ -231,10 +271,16 @@ public:
         }
         else
         {
-            cnt = m_head < m_tail ? m_queryCount - m_tail : m_head - m_tail;
+            cnt = (unsigned int)( head - m_tail );
+            assert( cnt <= m_queryCount );
+            if( wrappedTail + cnt > m_queryCount )
+            {
+                cnt = m_queryCount - wrappedTail;
+            }
         }
 
-        if( vkGetQueryPoolResults( m_device, m_query, m_tail, cnt, sizeof( int64_t ) * m_queryCount, m_res, sizeof( int64_t ), VK_QUERY_RESULT_64_BIT ) == VK_NOT_READY )
+
+        if( VK_FUNCTION_WRAPPER( vkGetQueryPoolResults( m_device, m_query, wrappedTail, cnt, sizeof( int64_t ) * m_queryCount, m_res, sizeof( int64_t ), VK_QUERY_RESULT_64_BIT ) == VK_NOT_READY ) )
         {
             m_oldCnt = cnt;
             return;
@@ -245,7 +291,7 @@ public:
             auto item = Profiler::QueueSerial();
             MemWrite( &item->hdr.type, QueueType::GpuTime );
             MemWrite( &item->gpuTime.gpuTime, m_res[idx] );
-            MemWrite( &item->gpuTime.queryId, uint16_t( m_tail + idx ) );
+            MemWrite( &item->gpuTime.queryId, uint16_t( wrappedTail + idx ) );
             MemWrite( &item->gpuTime.context, m_context );
             Profiler::QueueSerialFinish();
         }
@@ -269,19 +315,16 @@ public:
             }
         }
 
-        vkCmdResetQueryPool( cmdbuf, m_query, m_tail, cnt );
+        VK_FUNCTION_WRAPPER( vkCmdResetQueryPool( cmdbuf, m_query, wrappedTail, cnt ) );
 
         m_tail += cnt;
-        if( m_tail == m_queryCount ) m_tail = 0;
     }
 
 private:
     tracy_force_inline unsigned int NextQueryId()
     {
-        const auto id = m_head;
-        m_head = ( m_head + 1 ) % m_queryCount;
-        assert( m_head != m_tail );
-        return id;
+        const uint64_t id = m_head.fetch_add(1, std::memory_order_relaxed);  // m_head is no longer wrapped when stored; it only counts upwards
+        return (unsigned int)( id % m_queryCount );  // wrap on use; the result is < m_queryCount, so the explicit narrowing is lossless
     }
 
     tracy_force_inline uint8_t GetId() const
@@ -315,16 +358,126 @@ private:
 #endif
     }
 
+    tracy_force_inline void CreateQueryPool()  // Creates the timestamp query pool in m_query, halving m_queryCount after every failed attempt.
+    {
+        VkQueryPoolCreateInfo poolInfo = {};
+        poolInfo.sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO;
+        poolInfo.queryCount = m_queryCount;
+        poolInfo.queryType = VK_QUERY_TYPE_TIMESTAMP;
+        while ( VK_FUNCTION_WRAPPER( vkCreateQueryPool( m_device, &poolInfo, nullptr, &m_query ) != VK_SUCCESS ) ) {
+            assert( m_queryCount > 1 );  // halving 1 (or 0) yields 0: an empty pool request that would make this loop spin forever
+            m_queryCount /= 2;
+            poolInfo.queryCount = m_queryCount;
+        }
+    }
+
+    tracy_force_inline void FindAvailableTimeDomains( VkPhysicalDevice physicalDevice, PFN_vkGetPhysicalDeviceCalibrateableTimeDomainsEXT _vkGetPhysicalDeviceCalibrateableTimeDomainsEXT )
+    {  // Sets m_timeDomain to this platform's host time domain if the physical device reports it; otherwise m_timeDomain is left untouched.
+        uint32_t num = 0;  // stays 0 if the count query fails without writing it, so data[] is never scanned uninitialised
+        _vkGetPhysicalDeviceCalibrateableTimeDomainsEXT( physicalDevice, &num, nullptr );
+        if(num > 4) num = 4;  // clamp to the capacity of data[] below
+        VkTimeDomainEXT data[4];
+        _vkGetPhysicalDeviceCalibrateableTimeDomainsEXT( physicalDevice, &num, data );
+        VkTimeDomainEXT supportedDomain = (VkTimeDomainEXT)-1;  // sentinel: on platforms other than the two below nothing is assigned, so the loop is not expected to match
+#if defined _WIN32
+        supportedDomain = VK_TIME_DOMAIN_QUERY_PERFORMANCE_COUNTER_EXT;
+#elif defined __linux__ && defined CLOCK_MONOTONIC_RAW
+        supportedDomain = VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_EXT;
+#endif
+        for( uint32_t i=0; i<num; i++ ) {
+            if(data[i] == supportedDomain) {
+                m_timeDomain = data[i];
+                break;
+            }
+        }
+    }
+
+    tracy_force_inline void FindCalibratedTimestampDeviation()  // Probes the calibrated-timestamp call NumProbes times and derives m_deviation from the smallest deviation it reported.
+    {
+        assert( m_timeDomain != VK_TIME_DOMAIN_DEVICE_EXT );  // a host time domain must already have been selected
+        constexpr size_t NumProbes = 32;
+        VkCalibratedTimestampInfoEXT spec[2] = {
+            { VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT, nullptr, VK_TIME_DOMAIN_DEVICE_EXT },
+            { VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT, nullptr, m_timeDomain },
+        };
+        uint64_t ts[2];  // the timestamps themselves are discarded; only the deviation output is kept
+        uint64_t deviation[NumProbes];
+        for( size_t i=0; i<NumProbes; i++ ) {
+            m_vkGetCalibratedTimestampsEXT( m_device, 2, spec, ts, deviation + i );
+        }
+        uint64_t minDeviation = deviation[0];
+        for( size_t i=1; i<NumProbes; i++ ) {
+            if ( minDeviation > deviation[i] ) {
+                minDeviation = deviation[i];
+            }
+        }
+        m_deviation = minDeviation * 3 / 2;  // 1.5x the best observed deviation
+
+#if defined _WIN32
+        m_qpcToNs = int64_t( 1000000000. / GetFrequencyQpc() );  // truncated to whole units; presumably ns per QPC tick - confirm what GetFrequencyQpc() returns
+#endif
+    }
+
+    tracy_force_inline void WriteInitialItem( VkPhysicalDevice physdev, int64_t tcpu, int64_t tgpu )  // Queues the GpuNewContext item for this context, carrying the paired CPU/GPU timestamps passed in.
+    {
+        uint8_t flags = 0;
+        if( m_timeDomain != VK_TIME_DOMAIN_DEVICE_EXT ) flags |= GpuContextCalibration;  // flag is set only when a non-device time domain was selected
+
+        VkPhysicalDeviceProperties prop;
+        VK_FUNCTION_WRAPPER( vkGetPhysicalDeviceProperties( physdev, &prop ) );
+        const float period = prop.limits.timestampPeriod;  // timestamp period as reported in the device limits
+
+        auto item = Profiler::QueueSerial();
+        MemWrite( &item->hdr.type, QueueType::GpuNewContext );
+        MemWrite( &item->gpuNewContext.cpuTime, tcpu );
+        MemWrite( &item->gpuNewContext.gpuTime, tgpu );
+        memset( &item->gpuNewContext.thread, 0, sizeof( item->gpuNewContext.thread ) );  // thread field is written as all zeroes
+        MemWrite( &item->gpuNewContext.period, period );
+        MemWrite( &item->gpuNewContext.context, m_context );
+        MemWrite( &item->gpuNewContext.flags, flags );
+        MemWrite( &item->gpuNewContext.type, GpuContextType::Vulkan );
+
+#ifdef TRACY_ON_DEMAND
+        GetProfiler().DeferItem( *item );  // on-demand builds also hand the item to DeferItem before the queue entry is finished
+#endif
+        Profiler::QueueSerialFinish();
+    }
+
+#if defined TRACY_VK_USE_SYMBOL_TABLE
+    void PopulateSymbolTable( VkInstance instance, PFN_vkGetInstanceProcAddr instanceProcAddr, PFN_vkGetDeviceProcAddr deviceProcAddr )  // Fills m_symbols by looking each entry point up through the supplied loader functions.
+    {  // The helper macros below exist only for the duration of this function (see the #undef block at the end).
+#define VK_GET_DEVICE_SYMBOL( name ) \
+        (PFN_##name)deviceProcAddr( m_device, #name );
+#define VK_LOAD_DEVICE_SYMBOL( name ) \
+        m_symbols.name = VK_GET_DEVICE_SYMBOL( name );
+#define VK_GET_INSTANCE_SYMBOL( name ) \
+        (PFN_##name)instanceProcAddr( instance, #name );
+#define VK_LOAD_INSTANCE_SYMBOL( name ) \
+        m_symbols.name = VK_GET_INSTANCE_SYMBOL( name );
+
+        LoadVkDeviceCoreSymbols( VK_LOAD_DEVICE_SYMBOL )  // list macros defined outside this hunk; presumably each applies the given loader macro to every symbol name - confirm
+        LoadVkDeviceExtensionSymbols( VK_LOAD_DEVICE_SYMBOL )
+        LoadVkInstanceCoreSymbols( VK_LOAD_INSTANCE_SYMBOL )
+#undef VK_GET_DEVICE_SYMBOL
+#undef VK_LOAD_DEVICE_SYMBOL
+#undef VK_GET_INSTANCE_SYMBOL
+#undef VK_LOAD_INSTANCE_SYMBOL
+    }
+#endif  // TRACY_VK_USE_SYMBOL_TABLE
+
     VkDevice m_device;
     VkQueryPool m_query;
     VkTimeDomainEXT m_timeDomain;
+#if defined TRACY_VK_USE_SYMBOL_TABLE
+    VkSymbolTable m_symbols;
+#endif
     uint64_t m_deviation;
     int64_t m_qpcToNs;
     int64_t m_prevCalibration;
     uint8_t m_context;
 
-    unsigned int m_head;
-    unsigned int m_tail;
+    std::atomic<uint64_t> m_head;  // count of query ids handed out by NextQueryId(); never wrapped in storage, reduced modulo m_queryCount on use
+    uint64_t m_tail;  // count of queries already collected; advanced by Collect() and likewise wrapped only on use
     unsigned int m_oldCnt;
     unsigned int m_queryCount;
 
@@ -348,7 +501,7 @@ public:
         m_ctx = ctx;
 
         const auto queryId = ctx->NextQueryId();
-        vkCmdWriteTimestamp( cmdbuf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, ctx->m_query, queryId );
+        CONTEXT_VK_FUNCTION_WRAPPER( vkCmdWriteTimestamp( cmdbuf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, ctx->m_query, queryId ) );
 
         auto item = Profiler::QueueSerial();
         MemWrite( &item->hdr.type, QueueType::GpuZoneBeginSerial );
@@ -372,7 +525,7 @@ public:
         m_ctx = ctx;
 
         const auto queryId = ctx->NextQueryId();
-        vkCmdWriteTimestamp( cmdbuf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, ctx->m_query, queryId );
+        CONTEXT_VK_FUNCTION_WRAPPER( vkCmdWriteTimestamp( cmdbuf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, ctx->m_query, queryId ) );
 
         auto item = Profiler::QueueSerialCallstack( Callstack( depth ) );
         MemWrite( &item->hdr.type, QueueType::GpuZoneBeginCallstackSerial );
@@ -396,7 +549,7 @@ public:
         m_ctx = ctx;
 
         const auto queryId = ctx->NextQueryId();
-        vkCmdWriteTimestamp( cmdbuf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, ctx->m_query, queryId );
+        CONTEXT_VK_FUNCTION_WRAPPER( vkCmdWriteTimestamp( cmdbuf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, ctx->m_query, queryId ) );
 
         const auto srcloc = Profiler::AllocSourceLocation( line, source, sourceSz, function, functionSz, name, nameSz );
         auto item = Profiler::QueueSerial();
@@ -421,7 +574,7 @@ public:
         m_ctx = ctx;
 
         const auto queryId = ctx->NextQueryId();
-        vkCmdWriteTimestamp( cmdbuf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, ctx->m_query, queryId );
+        CONTEXT_VK_FUNCTION_WRAPPER( vkCmdWriteTimestamp( cmdbuf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, ctx->m_query, queryId ) );
 
         const auto srcloc = Profiler::AllocSourceLocation( line, source, sourceSz, function, functionSz, name, nameSz );
         auto item = Profiler::QueueSerialCallstack( Callstack( depth ) );
@@ -439,7 +592,7 @@ public:
         if( !m_active ) return;
 
         const auto queryId = m_ctx->NextQueryId();
-        vkCmdWriteTimestamp( m_cmdbuf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, m_ctx->m_query, queryId );
+        CONTEXT_VK_FUNCTION_WRAPPER( vkCmdWriteTimestamp( m_cmdbuf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, m_ctx->m_query, queryId ) );
 
         auto item = Profiler::QueueSerial();
         MemWrite( &item->hdr.type, QueueType::GpuZoneEndSerial );
@@ -457,13 +610,38 @@ private:
     VkCtx* m_ctx;
 };
 
+#if defined TRACY_VK_USE_SYMBOL_TABLE  // Queue/command-buffer overload: allocates raw storage with tracy_malloc and constructs the VkCtx in place.
+static inline VkCtx* CreateVkContext( VkInstance instance, VkPhysicalDevice physdev, VkDevice device, VkQueue queue, VkCommandBuffer cmdbuf, PFN_vkGetInstanceProcAddr instanceProcAddr, PFN_vkGetDeviceProcAddr getDeviceProcAddr, bool calibrated = false )
+#else  // !TRACY_VK_USE_SYMBOL_TABLE: the two calibration entry points are passed in directly
 static inline VkCtx* CreateVkContext( VkPhysicalDevice physdev, VkDevice device, VkQueue queue, VkCommandBuffer cmdbuf, PFN_vkGetPhysicalDeviceCalibrateableTimeDomainsEXT gpdctd, PFN_vkGetCalibratedTimestampsEXT gct )
+#endif  // TRACY_VK_USE_SYMBOL_TABLE
 {
     auto ctx = (VkCtx*)tracy_malloc( sizeof( VkCtx ) );
+#if defined TRACY_VK_USE_SYMBOL_TABLE
+    new(ctx) VkCtx( instance, physdev, device, queue, cmdbuf, instanceProcAddr, getDeviceProcAddr, calibrated );  // placement-new into the storage allocated above
+#else
     new(ctx) VkCtx( physdev, device, queue, cmdbuf, gpdctd, gct );
+#endif  // TRACY_VK_USE_SYMBOL_TABLE
     return ctx;
 }
 
+#if defined VK_EXT_host_query_reset  // Overload without queue/cmdbuf parameters; only compiled when the Vulkan headers define VK_EXT_host_query_reset.
+#if defined TRACY_VK_USE_SYMBOL_TABLE
+static inline VkCtx* CreateVkContext( VkInstance instance, VkPhysicalDevice physdev, VkDevice device, PFN_vkGetInstanceProcAddr instanceProcAddr, PFN_vkGetDeviceProcAddr getDeviceProcAddr )
+#else  // !TRACY_VK_USE_SYMBOL_TABLE: caller supplies the reset and calibration entry points
+static inline VkCtx* CreateVkContext( VkPhysicalDevice physdev, VkDevice device, PFN_vkResetQueryPoolEXT qpreset, PFN_vkGetPhysicalDeviceCalibrateableTimeDomainsEXT gpdctd, PFN_vkGetCalibratedTimestampsEXT gct )
+#endif  // TRACY_VK_USE_SYMBOL_TABLE
+{
+    auto ctx = (VkCtx*)tracy_malloc( sizeof( VkCtx ) );  // raw storage; the object is constructed in place just below
+#if defined TRACY_VK_USE_SYMBOL_TABLE
+    new(ctx) VkCtx( instance, physdev, device, instanceProcAddr, getDeviceProcAddr );
+#else
+    new(ctx) VkCtx( physdev, device, qpreset, gpdctd, gct );
+#endif  // TRACY_VK_USE_SYMBOL_TABLE
+    return ctx;
+}
+#endif  // VK_EXT_host_query_reset
+
 static inline void DestroyVkContext( VkCtx* ctx )
 {
     ctx->~VkCtx();
@@ -474,8 +652,23 @@ static inline void DestroyVkContext( VkCtx* ctx )
 
 using TracyVkCtx = tracy::VkCtx*;
 
+#if defined TRACY_VK_USE_SYMBOL_TABLE  // symbol-table builds: the macro additionally takes instance, instanceProcAddr and deviceProcAddr
+#define TracyVkContext( instance, physdev, device, queue, cmdbuf, instanceProcAddr, deviceProcAddr ) tracy::CreateVkContext( instance, physdev, device, queue, cmdbuf, instanceProcAddr, deviceProcAddr );
+#else  // !TRACY_VK_USE_SYMBOL_TABLE: both calibration function pointers are passed as nullptr
 #define TracyVkContext( physdev, device, queue, cmdbuf ) tracy::CreateVkContext( physdev, device, queue, cmdbuf, nullptr, nullptr );
+#endif  // TRACY_VK_USE_SYMBOL_TABLE
+#if defined TRACY_VK_USE_SYMBOL_TABLE  // symbol-table builds: same arguments as TracyVkContext, with calibrated = true appended
+#define TracyVkContextCalibrated( instance, physdev, device, queue, cmdbuf, instanceProcAddr, deviceProcAddr ) tracy::CreateVkContext( instance, physdev, device, queue, cmdbuf, instanceProcAddr, deviceProcAddr, true );
+#else  // !TRACY_VK_USE_SYMBOL_TABLE: caller passes the two calibration entry points (gpdctd, gct)
 #define TracyVkContextCalibrated( physdev, device, queue, cmdbuf, gpdctd, gct ) tracy::CreateVkContext( physdev, device, queue, cmdbuf, gpdctd, gct );
+#endif  // TRACY_VK_USE_SYMBOL_TABLE
+#if defined VK_EXT_host_query_reset  // variant that takes no queue or command buffer
+#if defined TRACY_VK_USE_SYMBOL_TABLE
+#define TracyVkContextHostCalibrated( instance, physdev, device, instanceProcAddr, deviceProcAddr ) tracy::CreateVkContext( instance, physdev, device, instanceProcAddr, deviceProcAddr );
+#else  // !TRACY_VK_USE_SYMBOL_TABLE: caller passes the reset entry point (qpreset) plus gpdctd and gct
+#define TracyVkContextHostCalibrated( physdev, device, qpreset, gpdctd, gct ) tracy::CreateVkContext( physdev, device, qpreset, gpdctd, gct );
+#endif  // TRACY_VK_USE_SYMBOL_TABLE
+#endif  // VK_EXT_host_query_reset
 #define TracyVkDestroy( ctx ) tracy::DestroyVkContext( ctx );
 #define TracyVkContextName( ctx, name, size ) ctx->Name( name, size );
 #if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK