diff --git a/thirdparty/tracy/CMakeLists.txt b/thirdparty/tracy/CMakeLists.txt index 3911836a3caf43e3ea314ad07451b23fda4516cd..3e9727abe3ec25d086331993fe115b65f07d69b3 100644 --- a/thirdparty/tracy/CMakeLists.txt +++ b/thirdparty/tracy/CMakeLists.txt @@ -1,4 +1,4 @@ -# Tracy Profiler Client 0.9.1 +# Tracy Profiler Client 0.10.0 # BSD 3-clause # Copyright (c) 2017-2023, Bartosz Taudul <wolf@nereid.pl> diff --git a/thirdparty/tracy/include/tracy/TracyClient.cpp b/thirdparty/tracy/include/tracy/TracyClient.cpp index 77f81a4a7cf39258e495545f994d804e83e173c0..26387b762ed88201eafd2045d4de0eff21de53aa 100644 --- a/thirdparty/tracy/include/tracy/TracyClient.cpp +++ b/thirdparty/tracy/include/tracy/TracyClient.cpp @@ -22,6 +22,7 @@ #include "common/tracy_lz4.cpp" #include "client/TracyProfiler.cpp" #include "client/TracyCallstack.cpp" +#include "client/TracySysPower.cpp" #include "client/TracySysTime.cpp" #include "client/TracySysTrace.cpp" #include "common/TracySocket.cpp" diff --git a/thirdparty/tracy/include/tracy/client/TracyCallstack.cpp b/thirdparty/tracy/include/tracy/client/TracyCallstack.cpp index a874446c2cf29db3a0bc223b395d07354af49584..0de7c9d2e9a6d8f2a6521fdfb6677c446ba099c1 100644 --- a/thirdparty/tracy/include/tracy/client/TracyCallstack.cpp +++ b/thirdparty/tracy/include/tracy/client/TracyCallstack.cpp @@ -686,7 +686,9 @@ void InitCallstackCritical() void InitCallstack() { cb_bts = backtrace_create_state( nullptr, 0, nullptr, nullptr ); +#ifndef TRACY_DEMANGLE ___tracy_init_demangle_buffer(); +#endif #ifdef __linux InitKernelSymbols(); @@ -761,7 +763,9 @@ debuginfod_client* GetDebuginfodClient() void EndCallstack() { +#ifndef TRACY_DEMANGLE ___tracy_free_demangle_buffer(); +#endif #ifdef TRACY_DEBUGINFOD ClearDebugInfoVector( s_di_known ); debuginfod_end( s_debuginfod ); diff --git a/thirdparty/tracy/include/tracy/client/TracyLock.hpp b/thirdparty/tracy/include/tracy/client/TracyLock.hpp index 
296a41ba1abf859ecf6d8f4e6603f69fb17e3df2..d12a3c16d6da3a784c3d1b31847507f75433bdfd 100644 --- a/thirdparty/tracy/include/tracy/client/TracyLock.hpp +++ b/thirdparty/tracy/include/tracy/client/TracyLock.hpp @@ -21,7 +21,7 @@ public: , m_active( false ) #endif { - assert( m_id != std::numeric_limits<uint32_t>::max() ); + assert( m_id != (std::numeric_limits<uint32_t>::max)() ); auto item = Profiler::QueueSerial(); MemWrite( &item->hdr.type, QueueType::LockAnnounce ); @@ -154,7 +154,7 @@ public: tracy_force_inline void CustomName( const char* name, size_t size ) { - assert( size < std::numeric_limits<uint16_t>::max() ); + assert( size < (std::numeric_limits<uint16_t>::max)() ); auto ptr = (char*)tracy_malloc( size ); memcpy( ptr, name, size ); auto item = Profiler::QueueSerial(); @@ -235,7 +235,7 @@ public: , m_active( false ) #endif { - assert( m_id != std::numeric_limits<uint32_t>::max() ); + assert( m_id != (std::numeric_limits<uint32_t>::max)() ); auto item = Profiler::QueueSerial(); MemWrite( &item->hdr.type, QueueType::LockAnnounce ); @@ -450,7 +450,7 @@ public: tracy_force_inline void CustomName( const char* name, size_t size ) { - assert( size < std::numeric_limits<uint16_t>::max() ); + assert( size < (std::numeric_limits<uint16_t>::max)() ); auto ptr = (char*)tracy_malloc( size ); memcpy( ptr, name, size ); auto item = Profiler::QueueSerial(); diff --git a/thirdparty/tracy/include/tracy/client/TracyProfiler.cpp b/thirdparty/tracy/include/tracy/client/TracyProfiler.cpp index 6104a7edd68178c6e376f6aec28051ff581cc5fb..ed580123a7aeb3dd0dd7a5ce10fa8fc7d5074f69 100644 --- a/thirdparty/tracy/include/tracy/client/TracyProfiler.cpp +++ b/thirdparty/tracy/include/tracy/client/TracyProfiler.cpp @@ -83,7 +83,9 @@ #endif #ifdef __APPLE__ -# define TRACY_DELAYED_INIT +# ifndef TRACY_DELAYED_INIT +# define TRACY_DELAYED_INIT +# endif #else # ifdef __GNUC__ # define init_order( val ) __attribute__ ((init_priority(val))) @@ -1072,7 +1074,9 @@ static void CrashHandler( int 
signal, siginfo_t* info, void* /*ucontext*/ ) } closedir( dp ); +#ifdef TRACY_HAS_CALLSTACK if( selfTid == s_symbolTid ) s_symbolThreadGone.store( true, std::memory_order_release ); +#endif TracyLfqPrepare( QueueType::Crash ); TracyLfqCommit; @@ -1353,6 +1357,7 @@ Profiler::Profiler() , m_queryImage( nullptr ) , m_queryData( nullptr ) , m_crashHandlerInstalled( false ) + , m_programName( nullptr ) { assert( !s_instance ); s_instance = this; @@ -1711,6 +1716,9 @@ void Profiler::Worker() if( m_sock ) break; #ifndef TRACY_ON_DEMAND ProcessSysTime(); +# ifdef TRACY_HAS_SYSPOWER + m_sysPower.Tick(); +# endif #endif if( m_broadcast ) @@ -1718,6 +1726,14 @@ void Profiler::Worker() const auto t = std::chrono::high_resolution_clock::now().time_since_epoch().count(); if( t - lastBroadcast > 3000000000 ) // 3s { + m_programNameLock.lock(); + if( m_programName ) + { + broadcastMsg = GetBroadcastMessage( m_programName, strlen( m_programName ), broadcastLen, dataPort ); + m_programName = nullptr; + } + m_programNameLock.unlock(); + lastBroadcast = t; const auto ts = std::chrono::duration_cast<std::chrono::seconds>( std::chrono::system_clock::now().time_since_epoch() ).count(); broadcastMsg.activeTime = int32_t( ts - m_epoch ); @@ -1828,6 +1844,9 @@ void Profiler::Worker() for(;;) { ProcessSysTime(); +#ifdef TRACY_HAS_SYSPOWER + m_sysPower.Tick(); +#endif const auto status = Dequeue( token ); const auto serialStatus = DequeueSerial(); if( status == DequeueStatus::ConnectionLost || serialStatus == DequeueStatus::ConnectionLost ) @@ -4149,6 +4168,7 @@ TRACY_API void ___tracy_emit_frame_image( const void* image, uint16_t w, uint16_ TRACY_API void ___tracy_emit_plot( const char* name, double val ) { tracy::Profiler::PlotData( name, val ); } TRACY_API void ___tracy_emit_plot_float( const char* name, float val ) { tracy::Profiler::PlotData( name, val ); } TRACY_API void ___tracy_emit_plot_int( const char* name, int64_t val ) { tracy::Profiler::PlotData( name, val ); } +TRACY_API void 
___tracy_emit_plot_config( const char* name, int type, int step, int fill, uint32_t color ) { tracy::Profiler::ConfigurePlot( name, tracy::PlotFormatType(type), step, fill, color ); } TRACY_API void ___tracy_emit_message( const char* txt, size_t size, int callstack ) { tracy::Profiler::Message( txt, size, callstack ); } TRACY_API void ___tracy_emit_messageL( const char* txt, int callstack ) { tracy::Profiler::Message( txt, callstack ); } TRACY_API void ___tracy_emit_messageC( const char* txt, size_t size, uint32_t color, int callstack ) { tracy::Profiler::MessageColor( txt, size, color, callstack ); } @@ -4167,7 +4187,7 @@ TRACY_API void ___tracy_emit_gpu_zone_begin( const struct ___tracy_gpu_zone_begi { TracyLfqPrepareC( tracy::QueueType::GpuZoneBegin ); tracy::MemWrite( &item->gpuZoneBegin.cpuTime, tracy::Profiler::GetTime() ); - tracy::MemWrite( &item->gpuNewContext.thread, tracy::GetThreadHandle() ); + tracy::MemWrite( &item->gpuZoneBegin.thread, tracy::GetThreadHandle() ); tracy::MemWrite( &item->gpuZoneBegin.srcloc, data.srcloc ); tracy::MemWrite( &item->gpuZoneBegin.queryId, data.queryId ); tracy::MemWrite( &item->gpuZoneBegin.context, data.context ); @@ -4190,7 +4210,7 @@ TRACY_API void ___tracy_emit_gpu_zone_begin_alloc( const struct ___tracy_gpu_zon { TracyLfqPrepareC( tracy::QueueType::GpuZoneBeginAllocSrcLoc ); tracy::MemWrite( &item->gpuZoneBegin.cpuTime, tracy::Profiler::GetTime() ); - tracy::MemWrite( &item->gpuNewContext.thread, tracy::GetThreadHandle() ); + tracy::MemWrite( &item->gpuZoneBegin.thread, tracy::GetThreadHandle() ); tracy::MemWrite( &item->gpuZoneBegin.srcloc, data.srcloc ); tracy::MemWrite( &item->gpuZoneBegin.queryId, data.queryId ); tracy::MemWrite( &item->gpuZoneBegin.context, data.context ); @@ -4202,7 +4222,7 @@ TRACY_API void ___tracy_emit_gpu_zone_begin_alloc_callstack( const struct ___tra tracy::GetProfiler().SendCallstack( data.depth ); TracyLfqPrepareC( tracy::QueueType::GpuZoneBeginAllocSrcLocCallstack ); tracy::MemWrite( 
&item->gpuZoneBegin.cpuTime, tracy::Profiler::GetTime() ); - tracy::MemWrite( &item->gpuNewContext.thread, tracy::GetThreadHandle() ); + tracy::MemWrite( &item->gpuZoneBegin.thread, tracy::GetThreadHandle() ); tracy::MemWrite( &item->gpuZoneBegin.srcloc, data.srcloc ); tracy::MemWrite( &item->gpuZoneBegin.queryId, data.queryId ); tracy::MemWrite( &item->gpuZoneBegin.context, data.context ); @@ -4292,7 +4312,7 @@ TRACY_API void ___tracy_emit_gpu_zone_begin_alloc_serial( const struct ___tracy_ auto item = tracy::Profiler::QueueSerial(); tracy::MemWrite( &item->hdr.type, tracy::QueueType::GpuZoneBeginAllocSrcLocSerial ); tracy::MemWrite( &item->gpuZoneBegin.cpuTime, tracy::Profiler::GetTime() ); - tracy::MemWrite( &item->gpuNewContext.thread, tracy::GetThreadHandle() ); + tracy::MemWrite( &item->gpuZoneBegin.thread, tracy::GetThreadHandle() ); tracy::MemWrite( &item->gpuZoneBegin.srcloc, data.srcloc ); tracy::MemWrite( &item->gpuZoneBegin.queryId, data.queryId ); tracy::MemWrite( &item->gpuZoneBegin.context, data.context ); @@ -4304,7 +4324,7 @@ TRACY_API void ___tracy_emit_gpu_zone_begin_alloc_callstack_serial( const struct auto item = tracy::Profiler::QueueSerialCallstack( tracy::Callstack( data.depth ) ); tracy::MemWrite( &item->hdr.type, tracy::QueueType::GpuZoneBeginAllocSrcLocCallstackSerial ); tracy::MemWrite( &item->gpuZoneBegin.cpuTime, tracy::Profiler::GetTime() ); - tracy::MemWrite( &item->gpuNewContext.thread, tracy::GetThreadHandle() ); + tracy::MemWrite( &item->gpuZoneBegin.thread, tracy::GetThreadHandle() ); tracy::MemWrite( &item->gpuZoneBegin.srcloc, data.srcloc ); tracy::MemWrite( &item->gpuZoneBegin.queryId, data.queryId ); tracy::MemWrite( &item->gpuZoneBegin.context, data.context ); diff --git a/thirdparty/tracy/include/tracy/client/TracyProfiler.hpp b/thirdparty/tracy/include/tracy/client/TracyProfiler.hpp index 1ed66f66647260b3cc3c36f0d0a70dc081200ca6..e3b256dfa632237837f186f10a14d5db6660fdb2 100644 --- 
a/thirdparty/tracy/include/tracy/client/TracyProfiler.hpp +++ b/thirdparty/tracy/include/tracy/client/TracyProfiler.hpp @@ -10,6 +10,7 @@ #include "tracy_concurrentqueue.h" #include "tracy_SPSCQueue.h" #include "TracyCallstack.hpp" +#include "TracySysPower.hpp" #include "TracySysTime.hpp" #include "TracyFastVector.hpp" #include "../common/TracyQueue.hpp" @@ -208,7 +209,22 @@ public: if( HardwareSupportsInvariantTSC() ) { uint64_t rax, rdx; +#ifdef TRACY_PATCHABLE_NOPSLEDS + // Some external tooling (such as rr) wants to patch our rdtsc and replace it by a + // branch to control the external input seen by a program. This kind of patching is + // not generally possible depending on the surrounding code and can lead to significant + // slowdowns if the compiler generated unlucky code and rr and tracy are used together. + // To avoid this, use the rr-safe `nopl 0(%rax, %rax, 1); rdtsc` instruction sequence, + // which rr promises will be patchable independent of the surrounding code. + asm volatile ( + // This is nopl 0(%rax, %rax, 1), but assemblers are inconsistent about whether + // they emit that as a 4 or 5 byte sequence and we need to be guaranteed to use + // the 5 byte one. 
+ ".byte 0x0f, 0x1f, 0x44, 0x00, 0x00\n\t" + "rdtsc" : "=a" (rax), "=d" (rdx) ); +#else asm volatile ( "rdtsc" : "=a" (rax), "=d" (rdx) ); +#endif return (int64_t)(( rdx << 32 ) + rax); } # else @@ -288,7 +304,7 @@ public: { #ifndef TRACY_NO_FRAME_IMAGE auto& profiler = GetProfiler(); - assert( profiler.m_frameCount.load( std::memory_order_relaxed ) < std::numeric_limits<uint32_t>::max() ); + assert( profiler.m_frameCount.load( std::memory_order_relaxed ) < (std::numeric_limits<uint32_t>::max)() ); # ifdef TRACY_ON_DEMAND if( !profiler.IsConnected() ) return; # endif @@ -305,6 +321,12 @@ public: fi->flip = flip; profiler.m_fiQueue.commit_next(); profiler.m_fiLock.unlock(); +#else + static_cast<void>(image); // unused + static_cast<void>(w); // unused + static_cast<void>(h); // unused + static_cast<void>(offset); // unused + static_cast<void>(flip); // unused #endif } @@ -362,7 +384,7 @@ public: static tracy_force_inline void Message( const char* txt, size_t size, int callstack ) { - assert( size < std::numeric_limits<uint16_t>::max() ); + assert( size < (std::numeric_limits<uint16_t>::max)() ); #ifdef TRACY_ON_DEMAND if( !GetProfiler().IsConnected() ) return; #endif @@ -399,7 +421,7 @@ public: static tracy_force_inline void MessageColor( const char* txt, size_t size, uint32_t color, int callstack ) { - assert( size < std::numeric_limits<uint16_t>::max() ); + assert( size < (std::numeric_limits<uint16_t>::max)() ); #ifdef TRACY_ON_DEMAND if( !GetProfiler().IsConnected() ) return; #endif @@ -442,7 +464,7 @@ public: static tracy_force_inline void MessageAppInfo( const char* txt, size_t size ) { - assert( size < std::numeric_limits<uint16_t>::max() ); + assert( size < (std::numeric_limits<uint16_t>::max)() ); auto ptr = (char*)tracy_malloc( size ); memcpy( ptr, txt, size ); TracyLfqPrepare( QueueType::MessageAppInfo ); @@ -676,6 +698,13 @@ public: return m_isConnected.load( std::memory_order_acquire ); } + tracy_force_inline void SetProgramName( const char* name ) + { 
+ m_programNameLock.lock(); + m_programName = name; + m_programNameLock.unlock(); + } + #ifdef TRACY_ON_DEMAND tracy_force_inline uint64_t ConnectionId() const { @@ -730,7 +759,7 @@ public: static tracy_force_inline uint64_t AllocSourceLocation( uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz ) { const auto sz32 = uint32_t( 2 + 4 + 4 + functionSz + 1 + sourceSz + 1 + nameSz ); - assert( sz32 <= std::numeric_limits<uint16_t>::max() ); + assert( sz32 <= (std::numeric_limits<uint16_t>::max)() ); const auto sz = uint16_t( sz32 ); auto ptr = (char*)tracy_malloc( sz ); memcpy( ptr, &sz, 2 ); @@ -941,6 +970,10 @@ private: void ProcessSysTime() {} #endif +#ifdef TRACY_HAS_SYSPOWER + SysPower m_sysPower; +#endif + ParameterCallback m_paramCallback; void* m_paramCallbackData; SourceContentsCallback m_sourceCallback; @@ -959,6 +992,9 @@ private: } m_prevSignal; #endif bool m_crashHandlerInstalled; + + const char* m_programName; + TracyMutex m_programNameLock; }; } diff --git a/thirdparty/tracy/include/tracy/client/TracyScoped.hpp b/thirdparty/tracy/include/tracy/client/TracyScoped.hpp index bc1307916ba71ba7f865a199952f4c63ed6d2b3c..d2274e40b0b284cd6eac111cd5a48b2c7f125253 100644 --- a/thirdparty/tracy/include/tracy/client/TracyScoped.hpp +++ b/thirdparty/tracy/include/tracy/client/TracyScoped.hpp @@ -108,7 +108,7 @@ public: tracy_force_inline void Text( const char* txt, size_t size ) { - assert( size < std::numeric_limits<uint16_t>::max() ); + assert( size < (std::numeric_limits<uint16_t>::max)() ); if( !m_active ) return; #ifdef TRACY_ON_DEMAND if( GetProfiler().ConnectionId() != m_connectionId ) return; @@ -123,7 +123,7 @@ public: tracy_force_inline void Name( const char* txt, size_t size ) { - assert( size < std::numeric_limits<uint16_t>::max() ); + assert( size < (std::numeric_limits<uint16_t>::max)() ); if( !m_active ) return; #ifdef TRACY_ON_DEMAND if( GetProfiler().ConnectionId() != 
m_connectionId ) return; diff --git a/thirdparty/tracy/include/tracy/client/TracySysPower.cpp b/thirdparty/tracy/include/tracy/client/TracySysPower.cpp new file mode 100644 index 0000000000000000000000000000000000000000..bd5939da2b4dda27cbff73511b756234e63a3fcd --- /dev/null +++ b/thirdparty/tracy/include/tracy/client/TracySysPower.cpp @@ -0,0 +1,164 @@ +#include "TracySysPower.hpp" + +#ifdef TRACY_HAS_SYSPOWER + +#include <sys/types.h> +#include <dirent.h> +#include <chrono> +#include <inttypes.h> +#include <stdio.h> +#include <string.h> + +#include "TracyDebug.hpp" +#include "TracyProfiler.hpp" +#include "../common/TracyAlloc.hpp" + +namespace tracy +{ + +SysPower::SysPower() + : m_domains( 4 ) + , m_lastTime( 0 ) +{ + ScanDirectory( "/sys/devices/virtual/powercap/intel-rapl", -1 ); +} + +SysPower::~SysPower() +{ + for( auto& v : m_domains ) + { + fclose( v.handle ); + // Do not release v.name, as it may be still needed + } +} + +void SysPower::Tick() +{ + auto t = std::chrono::high_resolution_clock::now().time_since_epoch().count(); + if( t - m_lastTime > 10000000 ) // 10 ms + { + m_lastTime = t; + for( auto& v : m_domains ) + { + char tmp[32]; + if( fread( tmp, 1, 32, v.handle ) > 0 ) + { + rewind( v.handle ); + auto p = (uint64_t)atoll( tmp ); + uint64_t delta; + if( p >= v.value ) + { + delta = p - v.value; + } + else + { + delta = v.overflow - v.value + p; + } + v.value = p; + + TracyLfqPrepare( QueueType::SysPowerReport ); + MemWrite( &item->sysPower.time, Profiler::GetTime() ); + MemWrite( &item->sysPower.delta, delta ); + MemWrite( &item->sysPower.name, (uint64_t)v.name ); + TracyLfqCommit; + } + } + } +} + +void SysPower::ScanDirectory( const char* path, int parent ) +{ + DIR* dir = opendir( path ); + if( !dir ) return; + struct dirent* ent; + uint64_t maxRange = 0; + char* name = nullptr; + FILE* handle = nullptr; + while( ( ent = readdir( dir ) ) ) + { + if( ent->d_type == DT_REG ) + { + if( strcmp( ent->d_name, "max_energy_range_uj" ) == 0 ) + { + 
char tmp[PATH_MAX]; + snprintf( tmp, PATH_MAX, "%s/max_energy_range_uj", path ); + FILE* f = fopen( tmp, "r" ); + if( f ) + { + fscanf( f, "%" PRIu64, &maxRange ); + fclose( f ); + } + } + else if( strcmp( ent->d_name, "name" ) == 0 ) + { + char tmp[PATH_MAX]; + snprintf( tmp, PATH_MAX, "%s/name", path ); + FILE* f = fopen( tmp, "r" ); + if( f ) + { + char ntmp[128]; + if( fgets( ntmp, 128, f ) ) + { + // Last character is newline, skip it + const auto sz = strlen( ntmp ) - 1; + if( parent < 0 ) + { + name = (char*)tracy_malloc( sz + 1 ); + memcpy( name, ntmp, sz ); + name[sz] = '\0'; + } + else + { + const auto p = m_domains[parent]; + const auto psz = strlen( p.name ); + name = (char*)tracy_malloc( psz + sz + 2 ); + memcpy( name, p.name, psz ); + name[psz] = ':'; + memcpy( name+psz+1, ntmp, sz ); + name[psz+sz+1] = '\0'; + } + } + fclose( f ); + } + } + else if( strcmp( ent->d_name, "energy_uj" ) == 0 ) + { + char tmp[PATH_MAX]; + snprintf( tmp, PATH_MAX, "%s/energy_uj", path ); + handle = fopen( tmp, "r" ); + } + } + if( name && handle && maxRange > 0 ) break; + } + if( name && handle && maxRange > 0 ) + { + parent = (int)m_domains.size(); + Domain* domain = m_domains.push_next(); + domain->value = 0; + domain->overflow = maxRange; + domain->handle = handle; + domain->name = name; + TracyDebug( "Power domain id %i, %s found at %s\n", parent, name, path ); + } + else + { + if( name ) tracy_free( name ); + if( handle ) fclose( handle ); + } + + rewinddir( dir ); + while( ( ent = readdir( dir ) ) ) + { + if( ent->d_type == DT_DIR && strncmp( ent->d_name, "intel-rapl:", 11 ) == 0 ) + { + char tmp[PATH_MAX]; + snprintf( tmp, PATH_MAX, "%s/%s", path, ent->d_name ); + ScanDirectory( tmp, parent ); + } + } + closedir( dir ); +} + +} + +#endif diff --git a/thirdparty/tracy/include/tracy/client/TracySysPower.hpp b/thirdparty/tracy/include/tracy/client/TracySysPower.hpp new file mode 100644 index 
0000000000000000000000000000000000000000..210123bce40682fce88d824b1e063622708e6a05 --- /dev/null +++ b/thirdparty/tracy/include/tracy/client/TracySysPower.hpp @@ -0,0 +1,44 @@ +#ifndef __TRACYSYSPOWER_HPP__ +#define __TRACYSYSPOWER_HPP__ + +#if defined __linux__ +# define TRACY_HAS_SYSPOWER +#endif + +#ifdef TRACY_HAS_SYSPOWER + +#include <stdint.h> +#include <stdio.h> + +#include "TracyFastVector.hpp" + +namespace tracy +{ + +class SysPower +{ + struct Domain + { + uint64_t value; + uint64_t overflow; + FILE* handle; + const char* name; + }; + +public: + SysPower(); + ~SysPower(); + + void Tick(); + +private: + void ScanDirectory( const char* path, int parent ); + + FastVector<Domain> m_domains; + uint64_t m_lastTime; +}; + +} +#endif + +#endif diff --git a/thirdparty/tracy/include/tracy/client/TracySysTrace.cpp b/thirdparty/tracy/include/tracy/client/TracySysTrace.cpp index 4a562eaae2e7935fa823cda84b1c690b22534a88..af0641fef1708fc000b4d312e932a46db49149c9 100644 --- a/thirdparty/tracy/include/tracy/client/TracySysTrace.cpp +++ b/thirdparty/tracy/include/tracy/client/TracySysTrace.cpp @@ -770,6 +770,13 @@ bool SysTraceStart( int64_t& samplingPeriod ) TracyDebug( "sched_wakeup id: %i\n", wakeupId ); TracyDebug( "drm_vblank_event id: %i\n", vsyncId ); +#ifdef TRACY_NO_SAMPLING + const bool noSoftwareSampling = true; +#else + const char* noSoftwareSamplingEnv = GetEnvVar( "TRACY_NO_SAMPLING" ); + const bool noSoftwareSampling = noSoftwareSamplingEnv && noSoftwareSamplingEnv[0] == '1'; +#endif + #ifdef TRACY_NO_SAMPLE_RETIREMENT const bool noRetirement = true; #else @@ -839,28 +846,31 @@ bool SysTraceStart( int64_t& samplingPeriod ) pe.clockid = CLOCK_MONOTONIC_RAW; #endif - TracyDebug( "Setup software sampling\n" ); - ProbePreciseIp( pe, currentPid ); - for( int i=0; i<s_numCpus; i++ ) + if( !noSoftwareSampling ) { - int fd = perf_event_open( &pe, currentPid, i, -1, PERF_FLAG_FD_CLOEXEC ); - if( fd == -1 ) + TracyDebug( "Setup software sampling\n" ); + 
ProbePreciseIp( pe, currentPid ); + for( int i=0; i<s_numCpus; i++ ) { - pe.exclude_kernel = 1; - ProbePreciseIp( pe, currentPid ); - fd = perf_event_open( &pe, currentPid, i, -1, PERF_FLAG_FD_CLOEXEC ); + int fd = perf_event_open( &pe, currentPid, i, -1, PERF_FLAG_FD_CLOEXEC ); if( fd == -1 ) { - TracyDebug( " Failed to setup!\n"); - break; + pe.exclude_kernel = 1; + ProbePreciseIp( pe, currentPid ); + fd = perf_event_open( &pe, currentPid, i, -1, PERF_FLAG_FD_CLOEXEC ); + if( fd == -1 ) + { + TracyDebug( " Failed to setup!\n"); + break; + } + TracyDebug( " No access to kernel samples\n" ); + } + new( s_ring+s_numBuffers ) RingBuffer( 64*1024, fd, EventCallstack ); + if( s_ring[s_numBuffers].IsValid() ) + { + s_numBuffers++; + TracyDebug( " Core %i ok\n", i ); } - TracyDebug( " No access to kernel samples\n" ); - } - new( s_ring+s_numBuffers ) RingBuffer( 64*1024, fd, EventCallstack ); - if( s_ring[s_numBuffers].IsValid() ) - { - s_numBuffers++; - TracyDebug( " Core %i ok\n", i ); } } diff --git a/thirdparty/tracy/include/tracy/client/tracy_rpmalloc.cpp b/thirdparty/tracy/include/tracy/client/tracy_rpmalloc.cpp index 8efa626a9355029155ee9073ae1dbfeca1416af9..711505d21ac1cb3ddf6ea1408d633112c5a62ffd 100644 --- a/thirdparty/tracy/include/tracy/client/tracy_rpmalloc.cpp +++ b/thirdparty/tracy/include/tracy/client/tracy_rpmalloc.cpp @@ -147,7 +147,7 @@ # if defined(__APPLE__) # include <TargetConditionals.h> # if !TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR -# include <mach/mach_vm.h> +# include <mach/mach.h> # include <mach/vm_statistics.h> # endif # include <pthread.h> diff --git a/thirdparty/tracy/include/tracy/common/TracyProtocol.hpp b/thirdparty/tracy/include/tracy/common/TracyProtocol.hpp index dd30e5391f5b55b6f024f8107ee61a35594bd598..5eb1639db3f10a5e6502aea3637a3a335d13985c 100644 --- a/thirdparty/tracy/include/tracy/common/TracyProtocol.hpp +++ b/thirdparty/tracy/include/tracy/common/TracyProtocol.hpp @@ -9,14 +9,14 @@ namespace tracy constexpr unsigned 
Lz4CompressBound( unsigned isize ) { return isize + ( isize / 255 ) + 16; } -enum : uint32_t { ProtocolVersion = 63 }; +enum : uint32_t { ProtocolVersion = 64 }; enum : uint16_t { BroadcastVersion = 3 }; using lz4sz_t = uint32_t; enum { TargetFrameSize = 256 * 1024 }; enum { LZ4Size = Lz4CompressBound( TargetFrameSize ) }; -static_assert( LZ4Size <= std::numeric_limits<lz4sz_t>::max(), "LZ4Size greater than lz4sz_t" ); +static_assert( LZ4Size <= (std::numeric_limits<lz4sz_t>::max)(), "LZ4Size greater than lz4sz_t" ); static_assert( TargetFrameSize * 2 >= 64 * 1024, "Not enough space for LZ4 stream buffer" ); enum { HandshakeShibbolethSize = 8 }; diff --git a/thirdparty/tracy/include/tracy/common/TracyQueue.hpp b/thirdparty/tracy/include/tracy/common/TracyQueue.hpp index 8443193afbc0d40c3189a5439ce869f7b117df38..051d412abfbee4799dae034026101445721a2b97 100644 --- a/thirdparty/tracy/include/tracy/common/TracyQueue.hpp +++ b/thirdparty/tracy/include/tracy/common/TracyQueue.hpp @@ -90,6 +90,7 @@ enum class QueueType : uint8_t GpuNewContext, CallstackFrame, SysTimeReport, + SysPowerReport, TidToPid, HwSampleCpuCycle, HwSampleInstructionRetired, @@ -563,6 +564,13 @@ struct QueueSysTime float sysTime; }; +struct QueueSysPower +{ + int64_t time; + uint64_t delta; + uint64_t name; // ptr +}; + struct QueueContextSwitch { int64_t time; @@ -729,6 +737,7 @@ struct QueueItem QueueCrashReport crashReport; QueueCrashReportThread crashReportThread; QueueSysTime sysTime; + QueueSysPower sysPower; QueueContextSwitch contextSwitch; QueueThreadWakeup threadWakeup; QueueTidToPid tidToPid; @@ -832,6 +841,7 @@ static constexpr size_t QueueDataSize[] = { sizeof( QueueHeader ) + sizeof( QueueGpuNewContext ), sizeof( QueueHeader ) + sizeof( QueueCallstackFrame ), sizeof( QueueHeader ) + sizeof( QueueSysTime ), + sizeof( QueueHeader ) + sizeof( QueueSysPower ), sizeof( QueueHeader ) + sizeof( QueueTidToPid ), sizeof( QueueHeader ) + sizeof( QueueHwSample ), // cpu cycle sizeof( QueueHeader ) 
+ sizeof( QueueHwSample ), // instruction retired diff --git a/thirdparty/tracy/include/tracy/common/TracySocket.cpp b/thirdparty/tracy/include/tracy/common/TracySocket.cpp index 176bbc7aa1f173926226156b97687d30ec054d50..259678989e89d71104b152c3d72733d5116e4da6 100644 --- a/thirdparty/tracy/include/tracy/common/TracySocket.cpp +++ b/thirdparty/tracy/include/tracy/common/TracySocket.cpp @@ -353,7 +353,7 @@ int Socket::Recv( void* _buf, int len, int timeout ) } } -int Socket::ReadUpTo( void* _buf, int len, int timeout ) +int Socket::ReadUpTo( void* _buf, int len ) { const auto sock = m_sock.load( std::memory_order_relaxed ); auto buf = (char*)_buf; @@ -678,10 +678,10 @@ bool UdpListen::Listen( uint16_t port ) #endif #if defined _WIN32 unsigned long reuse = 1; - setsockopt( m_sock, SOL_SOCKET, SO_REUSEADDR, (const char*)&reuse, sizeof( reuse ) ); + setsockopt( sock, SOL_SOCKET, SO_REUSEADDR, (const char*)&reuse, sizeof( reuse ) ); #else int reuse = 1; - setsockopt( m_sock, SOL_SOCKET, SO_REUSEADDR, &reuse, sizeof( reuse ) ); + setsockopt( sock, SOL_SOCKET, SO_REUSEADDR, &reuse, sizeof( reuse ) ); #endif #if defined _WIN32 unsigned long broadcast = 1; diff --git a/thirdparty/tracy/include/tracy/common/TracySocket.hpp b/thirdparty/tracy/include/tracy/common/TracySocket.hpp index 4b3075e29d164deed141a3e44665dda422eb155f..f7713aac663797f5a837012b98705b1ebcf61885 100644 --- a/thirdparty/tracy/include/tracy/common/TracySocket.hpp +++ b/thirdparty/tracy/include/tracy/common/TracySocket.hpp @@ -29,7 +29,7 @@ public: int Send( const void* buf, int len ); int GetSendBufSize(); - int ReadUpTo( void* buf, int len, int timeout ); + int ReadUpTo( void* buf, int len ); bool Read( void* buf, int len, int timeout ); template<typename ShouldExit> diff --git a/thirdparty/tracy/include/tracy/common/TracySystem.cpp b/thirdparty/tracy/include/tracy/common/TracySystem.cpp index 2a7d997e4ca17cb90ae39c5c3b3dd98f0f9b1608..9a477aa310c956f67d36a51775e2e6386fd0537e 100644 --- 
a/thirdparty/tracy/include/tracy/common/TracySystem.cpp +++ b/thirdparty/tracy/include/tracy/common/TracySystem.cpp @@ -213,21 +213,24 @@ TRACY_API const char* GetThreadName( uint32_t id ) # else static auto _GetThreadDescription = (t_GetThreadDescription)GetProcAddress( GetModuleHandleA( "kernel32.dll" ), "GetThreadDescription" ); # endif - if( _GetThreadDescription ) - { - auto hnd = OpenThread( THREAD_QUERY_LIMITED_INFORMATION, FALSE, (DWORD)id ); - if( hnd != 0 ) - { - PWSTR tmp; - _GetThreadDescription( hnd, &tmp ); - auto ret = wcstombs( buf, tmp, 256 ); - CloseHandle( hnd ); - if( ret != 0 ) - { - return buf; - } - } - } + if( _GetThreadDescription ) + { + auto hnd = OpenThread( THREAD_QUERY_LIMITED_INFORMATION, FALSE, (DWORD)id ); + if( hnd != 0 ) + { + PWSTR tmp; + if( SUCCEEDED( _GetThreadDescription( hnd, &tmp ) ) ) + { + auto ret = wcstombs( buf, tmp, 256 ); + CloseHandle( hnd ); + LocalFree( tmp ); + if( ret != static_cast<size_t>( -1 ) ) + { + return buf; + } + } + } + } #elif defined __linux__ int cs, fd; char path[32]; diff --git a/thirdparty/tracy/include/tracy/common/TracyVersion.hpp b/thirdparty/tracy/include/tracy/common/TracyVersion.hpp index c82edf93d224a7790935f4b1346a24bd65fb373f..2355279f7219cbb59276086cf03deddd56399de7 100644 --- a/thirdparty/tracy/include/tracy/common/TracyVersion.hpp +++ b/thirdparty/tracy/include/tracy/common/TracyVersion.hpp @@ -6,8 +6,8 @@ namespace tracy namespace Version { enum { Major = 0 }; -enum { Minor = 9 }; -enum { Patch = 1 }; +enum { Minor = 10 }; +enum { Patch = 0 }; } } diff --git a/thirdparty/tracy/include/tracy/libbacktrace/config.h b/thirdparty/tracy/include/tracy/libbacktrace/config.h index aa3259d1198458b4d96719d02ae0cb56a0869d85..87e38a95b5881b4e627dd320160d2aa60636b6c7 100644 --- a/thirdparty/tracy/include/tracy/libbacktrace/config.h +++ b/thirdparty/tracy/include/tracy/libbacktrace/config.h @@ -1,4 +1,8 @@ #include <limits.h> +#if defined(__linux__) && !defined(__GLIBC__) && !defined(__WORDSIZE) 
+// include __WORDSIZE headers for musl +# include <bits/reg.h> +#endif #if __WORDSIZE == 64 # define BACKTRACE_ELF_SIZE 64 #else diff --git a/thirdparty/tracy/include/tracy/libbacktrace/dwarf.cpp b/thirdparty/tracy/include/tracy/libbacktrace/dwarf.cpp index 246cb9f3696489233db67bdc2ac4f2bf920621aa..f3899cbce101b4de114729b48cac4459dad65f80 100644 --- a/thirdparty/tracy/include/tracy/libbacktrace/dwarf.cpp +++ b/thirdparty/tracy/include/tracy/libbacktrace/dwarf.cpp @@ -473,7 +473,7 @@ enum attr_val_encoding /* An address. */ ATTR_VAL_ADDRESS, /* An index into the .debug_addr section, whose value is relative to - * the DW_AT_addr_base attribute of the compilation unit. */ + the DW_AT_addr_base attribute of the compilation unit. */ ATTR_VAL_ADDRESS_INDEX, /* A unsigned integer. */ ATTR_VAL_UINT, @@ -611,8 +611,8 @@ struct function struct function_addrs { /* Range is LOW <= PC < HIGH. */ - uint64_t low; - uint64_t high; + uintptr_t low; + uintptr_t high; /* Function for this address range. */ struct function *function; }; @@ -693,8 +693,8 @@ struct unit struct unit_addrs { /* Range is LOW <= PC < HIGH. */ - uint64_t low; - uint64_t high; + uintptr_t low; + uintptr_t high; /* Compilation unit for this address range. 
*/ struct unit *u; }; @@ -1431,7 +1431,7 @@ resolve_addr_index (const struct dwarf_sections *dwarf_sections, uint64_t addr_base, int addrsize, int is_bigendian, uint64_t addr_index, backtrace_error_callback error_callback, void *data, - uint64_t *address) + uintptr_t *address) { uint64_t offset; struct dwarf_buf addr_buf; @@ -1452,7 +1452,7 @@ resolve_addr_index (const struct dwarf_sections *dwarf_sections, addr_buf.data = data; addr_buf.reported_underflow = 0; - *address = read_address (&addr_buf, addrsize); + *address = (uintptr_t) read_address (&addr_buf, addrsize); return 1; } @@ -1531,7 +1531,7 @@ function_addrs_search (const void *vkey, const void *ventry) static int add_unit_addr (struct backtrace_state *state, void *rdata, - uint64_t lowpc, uint64_t highpc, + uintptr_t lowpc, uintptr_t highpc, backtrace_error_callback error_callback, void *data, void *pvec) { @@ -1867,10 +1867,10 @@ lookup_abbrev (struct abbrevs *abbrevs, uint64_t code, lowpc/highpc is set or ranges is set. */ struct pcrange { - uint64_t lowpc; /* The low PC value. */ + uintptr_t lowpc; /* The low PC value. */ int have_lowpc; /* Whether a low PC value was found. */ int lowpc_is_addr_index; /* Whether lowpc is in .debug_addr. */ - uint64_t highpc; /* The high PC value. */ + uintptr_t highpc; /* The high PC value. */ int have_highpc; /* Whether a high PC value was found. */ int highpc_is_relative; /* Whether highpc is relative to lowpc. */ int highpc_is_addr_index; /* Whether highpc is in .debug_addr. 
*/ @@ -1890,12 +1890,12 @@ update_pcrange (const struct attr* attr, const struct attr_val* val, case DW_AT_low_pc: if (val->encoding == ATTR_VAL_ADDRESS) { - pcrange->lowpc = val->u.uint; + pcrange->lowpc = (uintptr_t) val->u.uint; pcrange->have_lowpc = 1; } else if (val->encoding == ATTR_VAL_ADDRESS_INDEX) { - pcrange->lowpc = val->u.uint; + pcrange->lowpc = (uintptr_t) val->u.uint; pcrange->have_lowpc = 1; pcrange->lowpc_is_addr_index = 1; } @@ -1904,18 +1904,18 @@ update_pcrange (const struct attr* attr, const struct attr_val* val, case DW_AT_high_pc: if (val->encoding == ATTR_VAL_ADDRESS) { - pcrange->highpc = val->u.uint; + pcrange->highpc = (uintptr_t) val->u.uint; pcrange->have_highpc = 1; } else if (val->encoding == ATTR_VAL_UINT) { - pcrange->highpc = val->u.uint; + pcrange->highpc = (uintptr_t) val->u.uint; pcrange->have_highpc = 1; pcrange->highpc_is_relative = 1; } else if (val->encoding == ATTR_VAL_ADDRESS_INDEX) { - pcrange->highpc = val->u.uint; + pcrange->highpc = (uintptr_t) val->u.uint; pcrange->have_highpc = 1; pcrange->highpc_is_addr_index = 1; } @@ -1950,16 +1950,16 @@ add_low_high_range (struct backtrace_state *state, uintptr_t base_address, int is_bigendian, struct unit *u, const struct pcrange *pcrange, int (*add_range) (struct backtrace_state *state, - void *rdata, uint64_t lowpc, - uint64_t highpc, + void *rdata, uintptr_t lowpc, + uintptr_t highpc, backtrace_error_callback error_callback, void *data, void *vec), void *rdata, backtrace_error_callback error_callback, void *data, void *vec) { - uint64_t lowpc; - uint64_t highpc; + uintptr_t lowpc; + uintptr_t highpc; lowpc = pcrange->lowpc; if (pcrange->lowpc_is_addr_index) @@ -1997,10 +1997,10 @@ add_ranges_from_ranges ( struct backtrace_state *state, const struct dwarf_sections *dwarf_sections, uintptr_t base_address, int is_bigendian, - struct unit *u, uint64_t base, + struct unit *u, uintptr_t base, const struct pcrange *pcrange, int (*add_range) (struct backtrace_state *state, void 
*rdata, - uint64_t lowpc, uint64_t highpc, + uintptr_t lowpc, uintptr_t highpc, backtrace_error_callback error_callback, void *data, void *vec), void *rdata, @@ -2039,12 +2039,12 @@ add_ranges_from_ranges ( break; if (is_highest_address (low, u->addrsize)) - base = high; + base = (uintptr_t) high; else { if (!add_range (state, rdata, - low + base + base_address, - high + base + base_address, + (uintptr_t) low + base + base_address, + (uintptr_t) high + base + base_address, error_callback, data, vec)) return 0; } @@ -2064,10 +2064,10 @@ add_ranges_from_rnglists ( struct backtrace_state *state, const struct dwarf_sections *dwarf_sections, uintptr_t base_address, int is_bigendian, - struct unit *u, uint64_t base, + struct unit *u, uintptr_t base, const struct pcrange *pcrange, int (*add_range) (struct backtrace_state *state, void *rdata, - uint64_t lowpc, uint64_t highpc, + uintptr_t lowpc, uintptr_t highpc, backtrace_error_callback error_callback, void *data, void *vec), void *rdata, @@ -2133,8 +2133,8 @@ add_ranges_from_rnglists ( case DW_RLE_startx_endx: { uint64_t index; - uint64_t low; - uint64_t high; + uintptr_t low; + uintptr_t high; index = read_uleb128 (&rnglists_buf); if (!resolve_addr_index (dwarf_sections, u->addr_base, @@ -2156,8 +2156,8 @@ add_ranges_from_rnglists ( case DW_RLE_startx_length: { uint64_t index; - uint64_t low; - uint64_t length; + uintptr_t low; + uintptr_t length; index = read_uleb128 (&rnglists_buf); if (!resolve_addr_index (dwarf_sections, u->addr_base, @@ -2187,16 +2187,16 @@ add_ranges_from_rnglists ( break; case DW_RLE_base_address: - base = read_address (&rnglists_buf, u->addrsize); + base = (uintptr_t) read_address (&rnglists_buf, u->addrsize); break; case DW_RLE_start_end: { - uint64_t low; - uint64_t high; + uintptr_t low; + uintptr_t high; - low = read_address (&rnglists_buf, u->addrsize); - high = read_address (&rnglists_buf, u->addrsize); + low = (uintptr_t) read_address (&rnglists_buf, u->addrsize); + high = (uintptr_t) 
read_address (&rnglists_buf, u->addrsize); if (!add_range (state, rdata, low + base_address, high + base_address, error_callback, data, vec)) @@ -2206,11 +2206,11 @@ add_ranges_from_rnglists ( case DW_RLE_start_length: { - uint64_t low; - uint64_t length; + uintptr_t low; + uintptr_t length; - low = read_address (&rnglists_buf, u->addrsize); - length = read_uleb128 (&rnglists_buf); + low = (uintptr_t) read_address (&rnglists_buf, u->addrsize); + length = (uintptr_t) read_uleb128 (&rnglists_buf); low += base_address; if (!add_range (state, rdata, low, low + length, error_callback, data, vec)) @@ -2240,9 +2240,9 @@ static int add_ranges (struct backtrace_state *state, const struct dwarf_sections *dwarf_sections, uintptr_t base_address, int is_bigendian, - struct unit *u, uint64_t base, const struct pcrange *pcrange, + struct unit *u, uintptr_t base, const struct pcrange *pcrange, int (*add_range) (struct backtrace_state *state, void *rdata, - uint64_t lowpc, uint64_t highpc, + uintptr_t lowpc, uintptr_t highpc, backtrace_error_callback error_callback, void *data, void *vec), void *rdata, @@ -3520,7 +3520,7 @@ read_referenced_name (struct dwarf_data *ddata, struct unit *u, static int add_function_range (struct backtrace_state *state, void *rdata, - uint64_t lowpc, uint64_t highpc, + uintptr_t lowpc, uintptr_t highpc, backtrace_error_callback error_callback, void *data, void *pvec) { @@ -3560,7 +3560,7 @@ add_function_range (struct backtrace_state *state, void *rdata, static int read_function_entry (struct backtrace_state *state, struct dwarf_data *ddata, - struct unit *u, uint64_t base, struct dwarf_buf *unit_buf, + struct unit *u, uintptr_t base, struct dwarf_buf *unit_buf, const struct line_header *lhdr, backtrace_error_callback error_callback, void *data, struct function_vector *vec_function, @@ -3624,7 +3624,7 @@ read_function_entry (struct backtrace_state *state, struct dwarf_data *ddata, && abbrev->attrs[i].name == DW_AT_low_pc) { if (val.encoding == 
ATTR_VAL_ADDRESS) - base = val.u.uint; + base = (uintptr_t) val.u.uint; else if (val.encoding == ATTR_VAL_ADDRESS_INDEX) { if (!resolve_addr_index (&ddata->dwarf_sections, diff --git a/thirdparty/tracy/include/tracy/libbacktrace/elf.cpp b/thirdparty/tracy/include/tracy/libbacktrace/elf.cpp index e6d6c4aeeaff86c78eec0a54f08fe6447857d5dc..c65bc4e768af6f1290db00439a9750684a2837cb 100644 --- a/thirdparty/tracy/include/tracy/libbacktrace/elf.cpp +++ b/thirdparty/tracy/include/tracy/libbacktrace/elf.cpp @@ -2823,18 +2823,18 @@ elf_zstd_read_fse (const unsigned char **ppin, const unsigned char *pinend, while ((val & 0xfff) == 0xfff) { zidx += 3 * 6; - if (!elf_fetch_bits (&pin, pinend, &val, &bits)) - return 0; val >>= 12; bits -= 12; + if (!elf_fetch_bits (&pin, pinend, &val, &bits)) + return 0; } while ((val & 3) == 3) { zidx += 3; - if (!elf_fetch_bits (&pin, pinend, &val, &bits)) - return 0; val >>= 2; bits -= 2; + if (!elf_fetch_bits (&pin, pinend, &val, &bits)) + return 0; } /* We have at least 13 bits here, don't need to fetch. 
*/ zidx += val & 3; @@ -2964,7 +2964,7 @@ elf_zstd_build_fse (const int16_t *norm, int idx, uint16_t *next, pos = (pos + step) & mask; } } - if (pos != 0) + if (unlikely (pos != 0)) { elf_uncompress_failed (); return 0; @@ -3440,17 +3440,17 @@ static const struct elf_zstd_fse_baseline_entry elf_zstd_match_table[64] = static const struct elf_zstd_fse_baseline_entry elf_zstd_offset_table[32] = { - { 1, 0, 5, 0 }, { 64, 6, 4, 0 }, { 512, 9, 5, 0 }, - { 32768, 15, 5, 0 }, { 2097152, 21, 5, 0 }, { 8, 3, 5, 0 }, - { 128, 7, 4, 0 }, { 4096, 12, 5, 0 }, { 262144, 18, 5, 0 }, - { 8388608, 23, 5, 0 }, { 32, 5, 5, 0 }, { 256, 8, 4, 0 }, - { 16384, 14, 5, 0 }, { 1048576, 20, 5, 0 }, { 4, 2, 5, 0 }, - { 128, 7, 4, 16 }, { 2048, 11, 5, 0 }, { 131072, 17, 5, 0 }, - { 4194304, 22, 5, 0 }, { 16, 4, 5, 0 }, { 256, 8, 4, 16 }, - { 8192, 13, 5, 0 }, { 524288, 19, 5, 0 }, { 2, 1, 5, 0 }, - { 64, 6, 4, 16 }, { 1024, 10, 5, 0 }, { 65536, 16, 5, 0 }, - { 268435456, 28, 5, 0 }, { 134217728, 27, 5, 0 }, { 67108864, 26, 5, 0 }, - { 33554432, 25, 5, 0 }, { 16777216, 24, 5, 0 }, + { 1, 0, 5, 0 }, { 61, 6, 4, 0 }, { 509, 9, 5, 0 }, + { 32765, 15, 5, 0 }, { 2097149, 21, 5, 0 }, { 5, 3, 5, 0 }, + { 125, 7, 4, 0 }, { 4093, 12, 5, 0 }, { 262141, 18, 5, 0 }, + { 8388605, 23, 5, 0 }, { 29, 5, 5, 0 }, { 253, 8, 4, 0 }, + { 16381, 14, 5, 0 }, { 1048573, 20, 5, 0 }, { 1, 2, 5, 0 }, + { 125, 7, 4, 16 }, { 2045, 11, 5, 0 }, { 131069, 17, 5, 0 }, + { 4194301, 22, 5, 0 }, { 13, 4, 5, 0 }, { 253, 8, 4, 16 }, + { 8189, 13, 5, 0 }, { 524285, 19, 5, 0 }, { 2, 1, 5, 0 }, + { 61, 6, 4, 16 }, { 1021, 10, 5, 0 }, { 65533, 16, 5, 0 }, + { 268435453, 28, 5, 0 }, { 134217725, 27, 5, 0 }, { 67108861, 26, 5, 0 }, + { 33554429, 25, 5, 0 }, { 16777213, 24, 5, 0 }, }; /* Read a zstd Huffman table and build the decoding table in *TABLE, reading @@ -3635,7 +3635,7 @@ elf_zstd_read_huff (const unsigned char **ppin, const unsigned char *pinend, } weight_mark = (uint32_t *) (weights + 256); - memset (weight_mark, 0, 12 * sizeof 
(uint32_t)); + memset (weight_mark, 0, 13 * sizeof (uint32_t)); weight_mask = 0; for (i = 0; i < count; ++i) { @@ -3702,7 +3702,7 @@ elf_zstd_read_huff (const unsigned char **ppin, const unsigned char *pinend, /* Change WEIGHT_MARK from a count of weights to the index of the first symbol for that weight. We shift the indexes to also store how many we - hae seen so far, below. */ + have seen so far, below. */ { uint32_t next; @@ -3783,7 +3783,7 @@ elf_zstd_read_literals (const unsigned char **ppin, { int raw; - /* Raw_literals_Block or RLE_Literals_Block */ + /* Raw_Literals_Block or RLE_Literals_Block */ raw = (hdr & 3) == 0; @@ -3965,7 +3965,7 @@ elf_zstd_read_literals (const unsigned char **ppin, unsigned int bits; uint32_t i; - pback = pin + compressed_size - 1; + pback = pin + total_streams_size - 1; pbackend = pin; if (!elf_fetch_backward_init (&pback, pbackend, &val, &bits)) return 0; diff --git a/thirdparty/tracy/include/tracy/tracy/Tracy.hpp b/thirdparty/tracy/include/tracy/tracy/Tracy.hpp index d42f4bf3b77801abd52369d7049c3aa430df9e8f..978eb5ef15c32765e2cf4d208ad9444be31a790f 100644 --- a/thirdparty/tracy/include/tracy/tracy/Tracy.hpp +++ b/thirdparty/tracy/include/tracy/tracy/Tracy.hpp @@ -109,6 +109,7 @@ #define TracyParameterRegister(x,y) #define TracyParameterSetup(x,y,z,w) #define TracyIsConnected false +#define TracySetProgramName(x) #define TracyFiberEnter(x) #define TracyFiberLeave @@ -270,6 +271,7 @@ #define TracyParameterRegister( cb, data ) tracy::Profiler::ParameterRegister( cb, data ) #define TracyParameterSetup( idx, name, isBool, val ) tracy::Profiler::ParameterSetup( idx, name, isBool, val ) #define TracyIsConnected tracy::GetProfiler().IsConnected() +#define TracySetProgramName( name ) tracy::GetProfiler().SetProgramName( name ); #ifdef TRACY_FIBERS # define TracyFiberEnter( fiber ) tracy::Profiler::EnterFiber( fiber ) diff --git a/thirdparty/tracy/include/tracy/tracy/TracyC.h b/thirdparty/tracy/include/tracy/tracy/TracyC.h index 
bedf5e162558c40fe926ee68efc88a46e397d91b..996889c40f815111ab559deec16a89c8d245cd1a 100644 --- a/thirdparty/tracy/include/tracy/tracy/TracyC.h +++ b/thirdparty/tracy/include/tracy/tracy/TracyC.h @@ -11,6 +11,14 @@ extern "C" { #endif +enum TracyPlotFormatEnum +{ + TracyPlotFormatNumber, + TracyPlotFormatMemory, + TracyPlotFormatPercentage, + TracyPlotFormatWatt +}; + TRACY_API void ___tracy_set_thread_name( const char* name ); #define TracyCSetThreadName( name ) ___tracy_set_thread_name( name ); @@ -60,6 +68,8 @@ typedef const void* TracyCZoneCtx; #define TracyCPlot(x,y) #define TracyCPlotF(x,y) #define TracyCPlotI(x,y) +#define TracyCPlotConfig(x,y,z,w,a) + #define TracyCMessage(x,y) #define TracyCMessageL(x) #define TracyCMessageC(x,y,z) @@ -289,11 +299,13 @@ TRACY_API void ___tracy_emit_frame_image( const void* image, uint16_t w, uint16_ TRACY_API void ___tracy_emit_plot( const char* name, double val ); TRACY_API void ___tracy_emit_plot_float( const char* name, float val ); TRACY_API void ___tracy_emit_plot_int( const char* name, int64_t val ); +TRACY_API void ___tracy_emit_plot_config( const char* name, int type, int step, int fill, uint32_t color ); TRACY_API void ___tracy_emit_message_appinfo( const char* txt, size_t size ); #define TracyCPlot( name, val ) ___tracy_emit_plot( name, val ); #define TracyCPlotF( name, val ) ___tracy_emit_plot_float( name, val ); #define TracyCPlotI( name, val ) ___tracy_emit_plot_int( name, val ); +#define TracyCPlotConfig( name, type, step, fill, color ) ___tracy_emit_plot_config( name, type, step, fill, color ); #define TracyCAppInfo( txt, size ) ___tracy_emit_message_appinfo( txt, size ); diff --git a/thirdparty/tracy/include/tracy/tracy/TracyD3D11.hpp b/thirdparty/tracy/include/tracy/tracy/TracyD3D11.hpp index 9f358c4a5b9e191318ed35fe575417936bbe9c9c..8aebdb2653262ba610ba2a8136f5b4b2b3863dca 100644 --- a/thirdparty/tracy/include/tracy/tracy/TracyD3D11.hpp +++ b/thirdparty/tracy/include/tracy/tracy/TracyD3D11.hpp @@ -13,13 
+13,13 @@ #define TracyD3D11ZoneC(ctx, name, color) #define TracyD3D11NamedZone(ctx, varname, name, active) #define TracyD3D11NamedZoneC(ctx, varname, name, color, active) -#define TracyD3D12ZoneTransient(ctx, varname, name, active) +#define TracyD3D11ZoneTransient(ctx, varname, name, active) #define TracyD3D11ZoneS(ctx, name, depth) #define TracyD3D11ZoneCS(ctx, name, color, depth) #define TracyD3D11NamedZoneS(ctx, varname, name, depth, active) #define TracyD3D11NamedZoneCS(ctx, varname, name, color, depth, active) -#define TracyD3D12ZoneTransientS(ctx, varname, name, depth, active) +#define TracyD3D11ZoneTransientS(ctx, varname, name, depth, active) #define TracyD3D11Collect(ctx) @@ -39,11 +39,12 @@ using TracyD3D11Ctx = void*; #include "Tracy.hpp" #include "../client/TracyProfiler.hpp" #include "../client/TracyCallstack.hpp" -#include "../common/TracyAlign.hpp" -#include "../common/TracyAlloc.hpp" +#include "../common/TracyYield.hpp" #include <d3d11.h> +#define TracyD3D11Panic(msg, ...) 
do { assert(false && "TracyD3D11: " msg); TracyMessageLC("TracyD3D11: " msg, tracy::Color::Red4); __VA_ARGS__; } while(false); + namespace tracy { @@ -51,71 +52,83 @@ class D3D11Ctx { friend class D3D11ZoneScope; - enum { QueryCount = 64 * 1024 }; + static constexpr uint32_t MaxQueries = 64 * 1024; + + enum CollectMode { POLL, BLOCK }; public: D3D11Ctx( ID3D11Device* device, ID3D11DeviceContext* devicectx ) - : m_device( device ) - , m_devicectx( devicectx ) - , m_context( GetGpuCtxCounter().fetch_add( 1, std::memory_order_relaxed ) ) - , m_head( 0 ) - , m_tail( 0 ) { - assert( m_context != 255 ); + // TODO: consider calling ID3D11Device::GetImmediateContext() instead of passing it as an argument + m_device = device; + device->AddRef(); + m_immediateDevCtx = devicectx; + devicectx->AddRef(); - for (int i = 0; i < QueryCount; i++) { - HRESULT hr = S_OK; - D3D11_QUERY_DESC desc; - desc.MiscFlags = 0; - - desc.Query = D3D11_QUERY_TIMESTAMP; - hr |= device->CreateQuery(&desc, &m_queries[i]); - + D3D11_QUERY_DESC desc = { }; desc.Query = D3D11_QUERY_TIMESTAMP_DISJOINT; - hr |= device->CreateQuery(&desc, &m_disjoints[i]); - - m_disjointMap[i] = nullptr; + if (FAILED(m_device->CreateQuery(&desc, &m_disjointQuery))) + { + TracyD3D11Panic("unable to create disjoint timestamp query.", return); + } + } - assert(SUCCEEDED(hr)); + for (ID3D11Query*& query : m_queries) + { + D3D11_QUERY_DESC desc = { }; + desc.Query = D3D11_QUERY_TIMESTAMP; + if (FAILED(m_device->CreateQuery(&desc, &query))) + { + TracyD3D11Panic("unable to create timestamp query.", return); + } } - // Force query the initial GPU timestamp (pipeline stall) - D3D11_QUERY_DATA_TIMESTAMP_DISJOINT disjoint; - UINT64 timestamp; + // Calibrate CPU and GPU timestamps + int64_t tcpu = 0; + int64_t tgpu = 0; for (int attempts = 0; attempts < 50; attempts++) { - devicectx->Begin(m_disjoints[0]); - devicectx->End(m_queries[0]); - devicectx->End(m_disjoints[0]); - devicectx->Flush(); + 
m_immediateDevCtx->Begin(m_disjointQuery); + m_immediateDevCtx->End(m_queries[0]); + m_immediateDevCtx->End(m_disjointQuery); + + int64_t tcpu0 = Profiler::GetTime(); + WaitForQuery(m_disjointQuery); + int64_t tcpu1 = Profiler::GetTime(); - while (devicectx->GetData(m_disjoints[0], &disjoint, sizeof(disjoint), 0) == S_FALSE) - /* Nothing */; + D3D11_QUERY_DATA_TIMESTAMP_DISJOINT disjoint = { }; + if (m_immediateDevCtx->GetData(m_disjointQuery, &disjoint, sizeof(disjoint), 0) != S_OK) + { + TracyMessageLC("TracyD3D11: unable to query GPU timestamp; retrying...", tracy::Color::Tomato); + continue; + } if (disjoint.Disjoint) continue; - while (devicectx->GetData(m_queries[0], &timestamp, sizeof(timestamp), 0) == S_FALSE) - /* Nothing */; + UINT64 timestamp = 0; + if (m_immediateDevCtx->GetData(m_queries[0], &timestamp, sizeof(timestamp), 0) != S_OK) + continue; // this should never happen, since the enclosing disjoint query succeeded + tcpu = tcpu0 + (tcpu1 - tcpu0) * 1 / 2; + tgpu = timestamp * (1000000000 / disjoint.Frequency); break; } - int64_t tgpu = timestamp * (1000000000ull / disjoint.Frequency); - int64_t tcpu = Profiler::GetTime(); - - uint8_t flags = 0; + // ready to roll + m_contextId = GetGpuCtxCounter().fetch_add(1); + m_immediateDevCtx->Begin(m_disjointQuery); + m_previousCheckpoint = m_nextCheckpoint = 0; - const float period = 1.f; auto* item = Profiler::QueueSerial(); MemWrite( &item->hdr.type, QueueType::GpuNewContext ); MemWrite( &item->gpuNewContext.cpuTime, tcpu ); MemWrite( &item->gpuNewContext.gpuTime, tgpu ); - memset(&item->gpuNewContext.thread, 0, sizeof(item->gpuNewContext.thread)); - MemWrite( &item->gpuNewContext.period, period ); - MemWrite( &item->gpuNewContext.context, m_context ); - MemWrite( &item->gpuNewContext.flags, flags ); + MemWrite( &item->gpuNewContext.thread, uint32_t(0) ); // #TODO: why not GetThreadHandle()? 
+ MemWrite( &item->gpuNewContext.period, 1.0f ); + MemWrite( &item->gpuNewContext.context, m_contextId); + MemWrite( &item->gpuNewContext.flags, uint8_t(0) ); MemWrite( &item->gpuNewContext.type, GpuContextType::Direct3D11 ); #ifdef TRACY_ON_DEMAND @@ -127,12 +140,20 @@ public: ~D3D11Ctx() { - for (int i = 0; i < QueryCount; i++) + // collect all pending timestamps before destroying everything + do { - m_queries[i]->Release(); - m_disjoints[i]->Release(); - m_disjointMap[i] = nullptr; + Collect(BLOCK); + } while (m_previousCheckpoint != m_queryCounter); + + for (ID3D11Query* query : m_queries) + { + query->Release(); } + m_immediateDevCtx->End(m_disjointQuery); + m_disjointQuery->Release(); + m_immediateDevCtx->Release(); + m_device->Release(); } void Name( const char* name, uint16_t len ) @@ -142,7 +163,7 @@ public: auto item = Profiler::QueueSerial(); MemWrite( &item->hdr.type, QueueType::GpuContextName ); - MemWrite( &item->gpuContextNameFat.context, m_context ); + MemWrite( &item->gpuContextNameFat.context, m_contextId ); MemWrite( &item->gpuContextNameFat.ptr, (uint64_t)ptr ); MemWrite( &item->gpuContextNameFat.size, len ); #ifdef TRACY_ON_DEMAND @@ -151,217 +172,170 @@ public: Profiler::QueueSerialFinish(); } - void Collect() + void Collect(CollectMode mode = POLL) { ZoneScopedC( Color::Red4 ); - if( m_tail == m_head ) return; - #ifdef TRACY_ON_DEMAND if( !GetProfiler().IsConnected() ) { - m_head = m_tail = 0; + m_previousCheckpoint = m_nextCheckpoint = m_queryCounter; return; } #endif - auto start = m_tail; - auto end = m_head + QueryCount; - auto cnt = (end - start) % QueryCount; - while (cnt > 1) + if (m_previousCheckpoint == m_nextCheckpoint) { - auto mid = start + cnt / 2; - - bool available = - m_devicectx->GetData(m_disjointMap[mid % QueryCount], nullptr, 0, D3D11_ASYNC_GETDATA_DONOTFLUSH) == S_OK && - m_devicectx->GetData(m_queries[mid % QueryCount], nullptr, 0, D3D11_ASYNC_GETDATA_DONOTFLUSH) == S_OK; - - if (available) - { - start = mid; - } - else 
+ uintptr_t nextCheckpoint = m_queryCounter; + if (nextCheckpoint == m_nextCheckpoint) { - end = mid; + return; } - cnt = (end - start) % QueryCount; + m_nextCheckpoint = nextCheckpoint; + m_immediateDevCtx->End(m_disjointQuery); } - start %= QueryCount; - - while (m_tail != start) + if (mode == CollectMode::BLOCK) { - D3D11_QUERY_DATA_TIMESTAMP_DISJOINT disjoint; - UINT64 time; + WaitForQuery(m_disjointQuery); + } - m_devicectx->GetData(m_disjointMap[m_tail], &disjoint, sizeof(disjoint), 0); - m_devicectx->GetData(m_queries[m_tail], &time, sizeof(time), 0); + D3D11_QUERY_DATA_TIMESTAMP_DISJOINT disjoint = { }; + if (m_immediateDevCtx->GetData(m_disjointQuery, &disjoint, sizeof(disjoint), D3D11_ASYNC_GETDATA_DONOTFLUSH) != S_OK) + { + return; + } - time *= (1000000000ull / disjoint.Frequency); + if (disjoint.Disjoint == TRUE) + { + m_previousCheckpoint = m_nextCheckpoint; + TracyD3D11Panic("disjoint timestamps detected; dropping."); + return; + } + auto begin = m_previousCheckpoint; + auto end = m_nextCheckpoint; + for (auto i = begin; i != end; ++i) + { + uint32_t k = RingIndex(i); + UINT64 timestamp = 0; + if (m_immediateDevCtx->GetData(m_queries[k], &timestamp, sizeof(timestamp), 0) != S_OK) + { + TracyD3D11Panic("timestamp expected to be ready, but it was not!"); + break; + } + timestamp *= (1000000000ull / disjoint.Frequency); auto* item = Profiler::QueueSerial(); MemWrite(&item->hdr.type, QueueType::GpuTime); - MemWrite(&item->gpuTime.gpuTime, (int64_t)time); - MemWrite(&item->gpuTime.queryId, (uint16_t)m_tail); - MemWrite(&item->gpuTime.context, m_context); + MemWrite(&item->gpuTime.gpuTime, static_cast<int64_t>(timestamp)); + MemWrite(&item->gpuTime.queryId, static_cast<uint16_t>(k)); + MemWrite(&item->gpuTime.context, m_contextId); Profiler::QueueSerialFinish(); - - m_tail = (m_tail + 1) % QueryCount; } + + // disjoint timestamp queries should only be invoked once per frame or less + // 
https://learn.microsoft.com/en-us/windows/win32/api/d3d11/ne-d3d11-d3d11_query + m_immediateDevCtx->Begin(m_disjointQuery); + m_previousCheckpoint = m_nextCheckpoint; } private: - tracy_force_inline unsigned int NextQueryId() + tracy_force_inline uint32_t RingIndex(uintptr_t index) + { + index %= MaxQueries; + return static_cast<uint32_t>(index); + } + + tracy_force_inline uint32_t RingCount(uintptr_t begin, uintptr_t end) + { + // wrap-around safe: all unsigned + uintptr_t count = end - begin; + return static_cast<uint32_t>(count); + } + + tracy_force_inline uint32_t NextQueryId() { - const auto id = m_head; - m_head = ( m_head + 1 ) % QueryCount; - assert( m_head != m_tail ); - return id; + auto id = m_queryCounter++; + if (RingCount(m_previousCheckpoint, id) >= MaxQueries) + { + TracyD3D11Panic("too many pending timestamp queries."); + // #TODO: return some sentinel value; ideally a "hidden" query index + } + return RingIndex(id); } - tracy_force_inline ID3D11Query* TranslateQueryId( unsigned int id ) + tracy_force_inline ID3D11Query* GetQueryObjectFromId(uint32_t id) { return m_queries[id]; } - tracy_force_inline ID3D11Query* MapDisjointQueryId( unsigned int id, unsigned int disjointId ) + tracy_force_inline void WaitForQuery(ID3D11Query* query) { - m_disjointMap[id] = m_disjoints[disjointId]; - return m_disjoints[disjointId]; + m_immediateDevCtx->Flush(); + while (m_immediateDevCtx->GetData(query, nullptr, 0, 0) != S_OK) + YieldThread(); // busy-wait :-( attempt to reduce power usage with _mm_pause() & friends... 
} - tracy_force_inline uint8_t GetId() const + tracy_force_inline uint8_t GetContextId() const { - return m_context; + return m_contextId; } - ID3D11Device* m_device; - ID3D11DeviceContext* m_devicectx; + ID3D11Device* m_device = nullptr; + ID3D11DeviceContext* m_immediateDevCtx = nullptr; - ID3D11Query* m_queries[QueryCount]; - ID3D11Query* m_disjoints[QueryCount]; - ID3D11Query* m_disjointMap[QueryCount]; // Multiple time queries can have one disjoint query - uint8_t m_context; + ID3D11Query* m_queries[MaxQueries]; + ID3D11Query* m_disjointQuery = nullptr; - unsigned int m_head; - unsigned int m_tail; + uint8_t m_contextId = 255; // NOTE: apparently, 255 means invalid id; is this documented anywhere? + + uintptr_t m_queryCounter = 0; + + uintptr_t m_previousCheckpoint = 0; + uintptr_t m_nextCheckpoint = 0; }; class D3D11ZoneScope { public: - tracy_force_inline D3D11ZoneScope( D3D11Ctx* ctx, const SourceLocationData* srcloc, bool is_active ) -#ifdef TRACY_ON_DEMAND - : m_active( is_active && GetProfiler().IsConnected() ) -#else - : m_active( is_active ) -#endif + tracy_force_inline D3D11ZoneScope( D3D11Ctx* ctx, const SourceLocationData* srcloc, bool active ) + : D3D11ZoneScope(ctx, active) { if( !m_active ) return; - m_ctx = ctx; - - const auto queryId = ctx->NextQueryId(); - ctx->m_devicectx->Begin(ctx->MapDisjointQueryId(queryId, queryId)); - ctx->m_devicectx->End(ctx->TranslateQueryId(queryId)); - - m_disjointId = queryId; auto* item = Profiler::QueueSerial(); - MemWrite( &item->hdr.type, QueueType::GpuZoneBeginSerial ); - MemWrite( &item->gpuZoneBegin.cpuTime, Profiler::GetTime() ); - MemWrite( &item->gpuZoneBegin.srcloc, (uint64_t)srcloc ); - MemWrite( &item->gpuZoneBegin.thread, GetThreadHandle() ); - MemWrite( &item->gpuZoneBegin.queryId, uint16_t( queryId ) ); - MemWrite( &item->gpuZoneBegin.context, ctx->GetId() ); - - Profiler::QueueSerialFinish(); + WriteQueueItem(item, QueueType::GpuZoneBeginSerial, reinterpret_cast<uint64_t>(srcloc)); } - 
tracy_force_inline D3D11ZoneScope( D3D11Ctx* ctx, const SourceLocationData* srcloc, int depth, bool is_active ) -#ifdef TRACY_ON_DEMAND - : m_active( is_active && GetProfiler().IsConnected() ) -#else - : m_active( is_active ) -#endif + tracy_force_inline D3D11ZoneScope( D3D11Ctx* ctx, const SourceLocationData* srcloc, int depth, bool active ) + : D3D11ZoneScope(ctx, active) { if( !m_active ) return; - m_ctx = ctx; - - const auto queryId = ctx->NextQueryId(); - ctx->m_devicectx->Begin(ctx->MapDisjointQueryId(queryId, queryId)); - ctx->m_devicectx->End(ctx->TranslateQueryId(queryId)); - - m_disjointId = queryId; - - auto* item = Profiler::QueueSerial(); - MemWrite( &item->hdr.type, QueueType::GpuZoneBeginCallstackSerial ); - MemWrite( &item->gpuZoneBegin.cpuTime, Profiler::GetTime() ); - MemWrite( &item->gpuZoneBegin.srcloc, (uint64_t)srcloc ); - MemWrite( &item->gpuZoneBegin.thread, GetThreadHandle() ); - MemWrite( &item->gpuZoneBegin.queryId, uint16_t( queryId ) ); - MemWrite( &item->gpuZoneBegin.context, ctx->GetId() ); - - Profiler::QueueSerialFinish(); - GetProfiler().SendCallstack( depth ); + auto* item = Profiler::QueueSerialCallstack(Callstack(depth)); + WriteQueueItem(item, QueueType::GpuZoneBeginCallstackSerial, reinterpret_cast<uint64_t>(srcloc)); } tracy_force_inline D3D11ZoneScope(D3D11Ctx* ctx, uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, bool active) -#ifdef TRACY_ON_DEMAND - : m_active(active&& GetProfiler().IsConnected()) -#else - : m_active(active) -#endif + : D3D11ZoneScope(ctx, active) { if( !m_active ) return; - m_ctx = ctx; - - const auto queryId = ctx->NextQueryId(); - ctx->m_devicectx->Begin(ctx->MapDisjointQueryId(queryId, queryId)); - ctx->m_devicectx->End(ctx->TranslateQueryId(queryId)); - - m_disjointId = queryId; const auto sourceLocation = Profiler::AllocSourceLocation(line, source, sourceSz, function, functionSz, name, nameSz); auto* item = 
Profiler::QueueSerial(); - MemWrite(&item->hdr.type, QueueType::GpuZoneBeginAllocSrcLocSerial); - MemWrite(&item->gpuZoneBegin.cpuTime, Profiler::GetTime()); - MemWrite(&item->gpuZoneBegin.srcloc, sourceLocation); - MemWrite(&item->gpuZoneBegin.thread, GetThreadHandle()); - MemWrite(&item->gpuZoneBegin.queryId, static_cast<uint16_t>(queryId)); - MemWrite(&item->gpuZoneBegin.context, ctx->GetId()); - - Profiler::QueueSerialFinish(); + WriteQueueItem(item, QueueType::GpuZoneBeginAllocSrcLocSerial, sourceLocation); } tracy_force_inline D3D11ZoneScope(D3D11Ctx* ctx, uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, int depth, bool active) -#ifdef TRACY_ON_DEMAND - : m_active(active&& GetProfiler().IsConnected()) -#else - : m_active(active) -#endif + : D3D11ZoneScope(ctx, active) { if( !m_active ) return; - m_ctx = ctx; - - const auto queryId = ctx->NextQueryId(); - ctx->m_devicectx->Begin(ctx->MapDisjointQueryId(queryId, queryId)); - ctx->m_devicectx->End(ctx->TranslateQueryId(queryId)); - - m_disjointId = queryId; const auto sourceLocation = Profiler::AllocSourceLocation(line, source, sourceSz, function, functionSz, name, nameSz); auto* item = Profiler::QueueSerialCallstack(Callstack(depth)); - MemWrite(&item->hdr.type, QueueType::GpuZoneBeginAllocSrcLocCallstackSerial); - MemWrite(&item->gpuZoneBegin.cpuTime, Profiler::GetTime()); - MemWrite(&item->gpuZoneBegin.srcloc, sourceLocation); - MemWrite(&item->gpuZoneBegin.thread, GetThreadHandle()); - MemWrite(&item->gpuZoneBegin.queryId, static_cast<uint16_t>(queryId)); - MemWrite(&item->gpuZoneBegin.context, ctx->GetId()); - - Profiler::QueueSerialFinish(); + WriteQueueItem(item, QueueType::GpuZoneBeginAllocSrcLocCallstackSerial, sourceLocation); } tracy_force_inline ~D3D11ZoneScope() @@ -369,24 +343,46 @@ public: if( !m_active ) return; const auto queryId = m_ctx->NextQueryId(); - m_ctx->m_devicectx->End(m_ctx->TranslateQueryId(queryId)); - 
m_ctx->m_devicectx->End(m_ctx->MapDisjointQueryId(queryId, m_disjointId)); + m_ctx->m_immediateDevCtx->End(m_ctx->GetQueryObjectFromId(queryId)); auto* item = Profiler::QueueSerial(); MemWrite( &item->hdr.type, QueueType::GpuZoneEndSerial ); MemWrite( &item->gpuZoneEnd.cpuTime, Profiler::GetTime() ); MemWrite( &item->gpuZoneEnd.thread, GetThreadHandle() ); MemWrite( &item->gpuZoneEnd.queryId, uint16_t( queryId ) ); - MemWrite( &item->gpuZoneEnd.context, m_ctx->GetId() ); - + MemWrite( &item->gpuZoneEnd.context, m_ctx->GetContextId() ); Profiler::QueueSerialFinish(); } private: + tracy_force_inline D3D11ZoneScope( D3D11Ctx* ctx, bool active ) +#ifdef TRACY_ON_DEMAND + : m_active( is_active && GetProfiler().IsConnected() ) +#else + : m_active( active ) +#endif + { + if( !m_active ) return; + m_ctx = ctx; + } + + void WriteQueueItem(tracy::QueueItem* item, tracy::QueueType queueItemType, uint64_t sourceLocation) + { + const auto queryId = m_ctx->NextQueryId(); + m_ctx->m_immediateDevCtx->End(m_ctx->GetQueryObjectFromId(queryId)); + + MemWrite( &item->hdr.type, queueItemType); + MemWrite( &item->gpuZoneBegin.cpuTime, Profiler::GetTime() ); + MemWrite( &item->gpuZoneBegin.srcloc, sourceLocation ); + MemWrite( &item->gpuZoneBegin.thread, GetThreadHandle() ); + MemWrite( &item->gpuZoneBegin.queryId, uint16_t( queryId ) ); + MemWrite( &item->gpuZoneBegin.context, m_ctx->GetContextId() ); + Profiler::QueueSerialFinish(); + } + const bool m_active; D3D11Ctx* m_ctx; - unsigned int m_disjointId; }; static inline D3D11Ctx* CreateD3D11Context( ID3D11Device* device, ID3D11DeviceContext* devicectx ) @@ -403,38 +399,44 @@ static inline void DestroyD3D11Context( D3D11Ctx* ctx ) } } +#undef TracyD3D11Panic + using TracyD3D11Ctx = tracy::D3D11Ctx*; #define TracyD3D11Context( device, devicectx ) tracy::CreateD3D11Context( device, devicectx ); #define TracyD3D11Destroy(ctx) tracy::DestroyD3D11Context(ctx); #define TracyD3D11ContextName(ctx, name, size) ctx->Name(name, size); +#define 
TracyD3D11UnnamedZone ___tracy_gpu_d3d11_zone +#define TracyD3D11SrcLocSymbol TracyConcat(__tracy_gpu_d3d11_source_location,TracyLine) +#define TracyD3D11SrcLocObject(name, color) static constexpr tracy::SourceLocationData TracyD3D11SrcLocSymbol { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; + #if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK -# define TracyD3D11Zone( ctx, name ) TracyD3D11NamedZoneS( ctx, ___tracy_gpu_zone, name, TRACY_CALLSTACK, true ) -# define TracyD3D11ZoneC( ctx, name, color ) TracyD3D11NamedZoneCS( ctx, ___tracy_gpu_zone, name, color, TRACY_CALLSTACK, true ) -# define TracyD3D11NamedZone( ctx, varname, name, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::D3D11ZoneScope varname( ctx, &TracyConcat(__tracy_gpu_source_location,TracyLine), TRACY_CALLSTACK, active ); -# define TracyD3D11NamedZoneC( ctx, varname, name, color, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::D3D11ZoneScope varname( ctx, &TracyConcat(__tracy_gpu_source_location,TracyLine), TRACY_CALLSTACK, active ); +# define TracyD3D11Zone( ctx, name ) TracyD3D11NamedZoneS( ctx, TracyD3D11UnnamedZone, name, TRACY_CALLSTACK, true ) +# define TracyD3D11ZoneC( ctx, name, color ) TracyD3D11NamedZoneCS( ctx, TracyD3D11UnnamedZone, name, color, TRACY_CALLSTACK, true ) +# define TracyD3D11NamedZone( ctx, varname, name, active ) TracyD3D11SrcLocObject(name, 0); tracy::D3D11ZoneScope varname( ctx, &TracyD3D11SrcLocSymbol, TRACY_CALLSTACK, active ); +# define TracyD3D11NamedZoneC( ctx, varname, name, color, active ) TracyD3D11SrcLocObject(name, color); tracy::D3D11ZoneScope varname( ctx, &TracyD3D11SrcLocSymbol, TRACY_CALLSTACK, active ); # define TracyD3D11ZoneTransient(ctx, varname, name, active) 
TracyD3D11ZoneTransientS(ctx, varname, cmdList, name, TRACY_CALLSTACK, active) #else -# define TracyD3D11Zone( ctx, name ) TracyD3D11NamedZone( ctx, ___tracy_gpu_zone, name, true ) -# define TracyD3D11ZoneC( ctx, name, color ) TracyD3D11NamedZoneC( ctx, ___tracy_gpu_zone, name, color, true ) -# define TracyD3D11NamedZone( ctx, varname, name, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::D3D11ZoneScope varname( ctx, &TracyConcat(__tracy_gpu_source_location,TracyLine), active ); -# define TracyD3D11NamedZoneC( ctx, varname, name, color, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::D3D11ZoneScope varname( ctx, &TracyConcat(__tracy_gpu_source_location,TracyLine), active ); +# define TracyD3D11Zone( ctx, name ) TracyD3D11NamedZone( ctx, TracyD3D11UnnamedZone, name, true ) +# define TracyD3D11ZoneC( ctx, name, color ) TracyD3D11NamedZoneC( ctx, TracyD3D11UnnamedZone, name, color, true ) +# define TracyD3D11NamedZone( ctx, varname, name, active ) TracyD3D11SrcLocObject(name, 0); tracy::D3D11ZoneScope varname( ctx, &TracyD3D11SrcLocSymbol, active ); +# define TracyD3D11NamedZoneC( ctx, varname, name, color, active ) TracyD3D11SrcLocObject(name, color); tracy::D3D11ZoneScope varname( ctx, &TracyD3D11SrcLocSymbol, active ); # define TracyD3D11ZoneTransient(ctx, varname, name, active) tracy::D3D11ZoneScope varname{ ctx, TracyLine, TracyFile, strlen(TracyFile), TracyFunction, strlen(TracyFunction), name, strlen(name), active }; #endif #ifdef TRACY_HAS_CALLSTACK -# define TracyD3D11ZoneS( ctx, name, depth ) TracyD3D11NamedZoneS( ctx, ___tracy_gpu_zone, name, depth, true ) -# define TracyD3D11ZoneCS( ctx, name, color, depth ) TracyD3D11NamedZoneCS( ctx, ___tracy_gpu_zone, name, color, depth, true ) -# define TracyD3D11NamedZoneS( 
ctx, varname, name, depth, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::D3D11ZoneScope varname( ctx, &TracyConcat(__tracy_gpu_source_location,TracyLine), depth, active ); -# define TracyD3D11NamedZoneCS( ctx, varname, name, color, depth, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::D3D11ZoneScope varname( ctx, &TracyConcat(__tracy_gpu_source_location,TracyLine), depth, active ); +# define TracyD3D11ZoneS( ctx, name, depth ) TracyD3D11NamedZoneS( ctx, TracyD3D11UnnamedZone, name, depth, true ) +# define TracyD3D11ZoneCS( ctx, name, color, depth ) TracyD3D11NamedZoneCS( ctx, TracyD3D11UnnamedZone, name, color, depth, true ) +# define TracyD3D11NamedZoneS( ctx, varname, name, depth, active ) TracyD3D11SrcLocObject(name, 0); tracy::D3D11ZoneScope varname( ctx, &TracyD3D11SrcLocSymbol, depth, active ); +# define TracyD3D11NamedZoneCS( ctx, varname, name, color, depth, active ) TracyD3D11SrcLocObject(name, color); tracy::D3D11ZoneScope varname( ctx, &TracyD3D11SrcLocSymbol, depth, active ); # define TracyD3D11ZoneTransientS(ctx, varname, name, depth, active) tracy::D3D11ZoneScope varname{ ctx, TracyLine, TracyFile, strlen(TracyFile), TracyFunction, strlen(TracyFunction), name, strlen(name), depth, active }; #else # define TracyD3D11ZoneS( ctx, name, depth, active ) TracyD3D11Zone( ctx, name ) # define TracyD3D11ZoneCS( ctx, name, color, depth, active ) TracyD3D11ZoneC( name, color ) # define TracyD3D11NamedZoneS( ctx, varname, name, depth, active ) TracyD3D11NamedZone( ctx, varname, name, active ) # define TracyD3D11NamedZoneCS( ctx, varname, name, color, depth, active ) TracyD3D11NamedZoneC( ctx, varname, name, color, active ) -# define TracyD3D11ZoneTransientS(ctx, varname, name, depth, active) 
TracyD3D12ZoneTransient(ctx, varname, name, active) +# define TracyD3D11ZoneTransientS(ctx, varname, name, depth, active) TracyD3D11ZoneTransient(ctx, varname, name, active) #endif #define TracyD3D11Collect( ctx ) ctx->Collect(); diff --git a/thirdparty/tracy/include/tracy/tracy/TracyD3D12.hpp b/thirdparty/tracy/include/tracy/tracy/TracyD3D12.hpp index d7944cb8e5bfcac8d3fca7184e6287e53695ceed..41567937e833e8e7ea7583fed3bcbc91d97dff61 100644 --- a/thirdparty/tracy/include/tracy/tracy/TracyD3D12.hpp +++ b/thirdparty/tracy/include/tracy/tracy/TracyD3D12.hpp @@ -25,7 +25,7 @@ namespace tracy { - class D3D12ZoneScope {}; + class D3D12ZoneScope {}; } using TracyD3D12Ctx = void*; @@ -40,429 +40,419 @@ using TracyD3D12Ctx = void*; #include <cassert> #include <d3d12.h> #include <dxgi.h> -#include <wrl/client.h> #include <queue> +#define TracyD3D12Panic(msg, ...) do { assert(false && "TracyD3D12: " msg); TracyMessageLC("TracyD3D12: " msg, tracy::Color::Red4); __VA_ARGS__; } while(false); + namespace tracy { - struct D3D12QueryPayload - { - uint32_t m_queryIdStart = 0; - uint32_t m_queryCount = 0; - }; - - // Command queue context. - class D3D12QueueCtx - { - friend class D3D12ZoneScope; - - static constexpr uint32_t MaxQueries = 64 * 1024; // Queries are begin and end markers, so we can store half as many total time durations. Must be even! - - bool m_initialized = false; - - ID3D12Device* m_device = nullptr; - ID3D12CommandQueue* m_queue = nullptr; - uint8_t m_context; - Microsoft::WRL::ComPtr<ID3D12QueryHeap> m_queryHeap; - Microsoft::WRL::ComPtr<ID3D12Resource> m_readbackBuffer; - - // In-progress payload. 
- uint32_t m_queryLimit = MaxQueries; - std::atomic<uint32_t> m_queryCounter = 0; - uint32_t m_previousQueryCounter = 0; - - uint32_t m_activePayload = 0; - Microsoft::WRL::ComPtr<ID3D12Fence> m_payloadFence; - std::queue<D3D12QueryPayload> m_payloadQueue; - - int64_t m_prevCalibration = 0; - int64_t m_qpcToNs = int64_t{ 1000000000 / GetFrequencyQpc() }; - - public: - D3D12QueueCtx(ID3D12Device* device, ID3D12CommandQueue* queue) - : m_device(device) - , m_queue(queue) - , m_context(GetGpuCtxCounter().fetch_add(1, std::memory_order_relaxed)) - { - // Verify we support timestamp queries on this queue. - - if (queue->GetDesc().Type == D3D12_COMMAND_LIST_TYPE_COPY) - { - D3D12_FEATURE_DATA_D3D12_OPTIONS3 featureData{}; - - bool Success = SUCCEEDED(device->CheckFeatureSupport(D3D12_FEATURE_D3D12_OPTIONS3, &featureData, sizeof(featureData))); - assert(Success && featureData.CopyQueueTimestampQueriesSupported && "Platform does not support profiling of copy queues."); - } - - uint64_t timestampFrequency; - - if (FAILED(queue->GetTimestampFrequency(&timestampFrequency))) - { - assert(false && "Failed to get timestamp frequency."); - } - - uint64_t cpuTimestamp; - uint64_t gpuTimestamp; - - if (FAILED(queue->GetClockCalibration(&gpuTimestamp, &cpuTimestamp))) - { - assert(false && "Failed to get queue clock calibration."); - } - - // Save the device cpu timestamp, not the profiler's timestamp. - m_prevCalibration = cpuTimestamp * m_qpcToNs; - - cpuTimestamp = Profiler::GetTime(); - - D3D12_QUERY_HEAP_DESC heapDesc{}; - heapDesc.Type = queue->GetDesc().Type == D3D12_COMMAND_LIST_TYPE_COPY ? D3D12_QUERY_HEAP_TYPE_COPY_QUEUE_TIMESTAMP : D3D12_QUERY_HEAP_TYPE_TIMESTAMP; - heapDesc.Count = m_queryLimit; - heapDesc.NodeMask = 0; // #TODO: Support multiple adapters. 
- - while (FAILED(device->CreateQueryHeap(&heapDesc, IID_PPV_ARGS(&m_queryHeap)))) - { - m_queryLimit /= 2; - heapDesc.Count = m_queryLimit; - } - - // Create a readback buffer, which will be used as a destination for the query data. - - D3D12_RESOURCE_DESC readbackBufferDesc{}; - readbackBufferDesc.Alignment = 0; - readbackBufferDesc.Dimension = D3D12_RESOURCE_DIMENSION_BUFFER; - readbackBufferDesc.Width = m_queryLimit * sizeof(uint64_t); - readbackBufferDesc.Height = 1; - readbackBufferDesc.DepthOrArraySize = 1; - readbackBufferDesc.Format = DXGI_FORMAT_UNKNOWN; - readbackBufferDesc.Layout = D3D12_TEXTURE_LAYOUT_ROW_MAJOR; // Buffers are always row major. - readbackBufferDesc.MipLevels = 1; - readbackBufferDesc.SampleDesc.Count = 1; - readbackBufferDesc.SampleDesc.Quality = 0; - readbackBufferDesc.Flags = D3D12_RESOURCE_FLAG_NONE; - - D3D12_HEAP_PROPERTIES readbackHeapProps{}; - readbackHeapProps.Type = D3D12_HEAP_TYPE_READBACK; - readbackHeapProps.CPUPageProperty = D3D12_CPU_PAGE_PROPERTY_UNKNOWN; - readbackHeapProps.MemoryPoolPreference = D3D12_MEMORY_POOL_UNKNOWN; - readbackHeapProps.CreationNodeMask = 0; - readbackHeapProps.VisibleNodeMask = 0; // #TODO: Support multiple adapters. 
- - if (FAILED(device->CreateCommittedResource(&readbackHeapProps, D3D12_HEAP_FLAG_NONE, &readbackBufferDesc, D3D12_RESOURCE_STATE_COPY_DEST, nullptr, IID_PPV_ARGS(&m_readbackBuffer)))) - { - assert(false && "Failed to create query readback buffer."); - } - - if (FAILED(device->CreateFence(0, D3D12_FENCE_FLAG_NONE, IID_PPV_ARGS(&m_payloadFence)))) - { - assert(false && "Failed to create payload fence."); - } - - auto* item = Profiler::QueueSerial(); - MemWrite(&item->hdr.type, QueueType::GpuNewContext); - MemWrite(&item->gpuNewContext.cpuTime, cpuTimestamp); - MemWrite(&item->gpuNewContext.gpuTime, gpuTimestamp); - memset(&item->gpuNewContext.thread, 0, sizeof(item->gpuNewContext.thread)); - MemWrite(&item->gpuNewContext.period, 1E+09f / static_cast<float>(timestampFrequency)); - MemWrite(&item->gpuNewContext.context, m_context); - MemWrite(&item->gpuNewContext.flags, GpuContextCalibration); - MemWrite(&item->gpuNewContext.type, GpuContextType::Direct3D12); - -#ifdef TRACY_ON_DEMAND - GetProfiler().DeferItem(*item); -#endif - - Profiler::QueueSerialFinish(); - - m_initialized = true; - } - - void NewFrame() - { - uint32_t queryCounter = m_queryCounter.exchange(0); - m_payloadQueue.emplace(D3D12QueryPayload{ m_previousQueryCounter, queryCounter }); - m_previousQueryCounter += queryCounter; - - if (m_previousQueryCounter >= m_queryLimit) - { - m_previousQueryCounter -= m_queryLimit; - } - - m_queue->Signal(m_payloadFence.Get(), ++m_activePayload); - } - - void Name( const char* name, uint16_t len ) - { - auto ptr = (char*)tracy_malloc( len ); - memcpy( ptr, name, len ); - - auto item = Profiler::QueueSerial(); - MemWrite( &item->hdr.type, QueueType::GpuContextName ); - MemWrite( &item->gpuContextNameFat.context, m_context ); - MemWrite( &item->gpuContextNameFat.ptr, (uint64_t)ptr ); - MemWrite( &item->gpuContextNameFat.size, len ); -#ifdef TRACY_ON_DEMAND - GetProfiler().DeferItem( *item ); -#endif - Profiler::QueueSerialFinish(); - } - - void Collect() - { - 
ZoneScopedC(Color::Red4); - -#ifdef TRACY_ON_DEMAND - if (!GetProfiler().IsConnected()) - { - m_queryCounter = 0; - - return; - } -#endif - - // Find out what payloads are available. - const auto newestReadyPayload = m_payloadFence->GetCompletedValue(); - const auto payloadCount = m_payloadQueue.size() - (m_activePayload - newestReadyPayload); - - if (!payloadCount) - { - return; // No payloads are available yet, exit out. - } - - D3D12_RANGE mapRange{ 0, m_queryLimit * sizeof(uint64_t) }; - - // Map the readback buffer so we can fetch the query data from the GPU. - void* readbackBufferMapping = nullptr; - - if (FAILED(m_readbackBuffer->Map(0, &mapRange, &readbackBufferMapping))) - { - assert(false && "Failed to map readback buffer."); - } - - auto* timestampData = static_cast<uint64_t*>(readbackBufferMapping); - - for (uint32_t i = 0; i < payloadCount; ++i) - { - const auto& payload = m_payloadQueue.front(); - - for (uint32_t j = 0; j < payload.m_queryCount; ++j) - { - const auto counter = (payload.m_queryIdStart + j) % m_queryLimit; - const auto timestamp = timestampData[counter]; - const auto queryId = counter; - - auto* item = Profiler::QueueSerial(); - MemWrite(&item->hdr.type, QueueType::GpuTime); - MemWrite(&item->gpuTime.gpuTime, timestamp); - MemWrite(&item->gpuTime.queryId, static_cast<uint16_t>(queryId)); - MemWrite(&item->gpuTime.context, m_context); - - Profiler::QueueSerialFinish(); - } - - m_payloadQueue.pop(); - } - - m_readbackBuffer->Unmap(0, nullptr); - - // Recalibrate to account for drift. 
- - uint64_t cpuTimestamp; - uint64_t gpuTimestamp; - - if (FAILED(m_queue->GetClockCalibration(&gpuTimestamp, &cpuTimestamp))) - { - assert(false && "Failed to get queue clock calibration."); - } - - cpuTimestamp *= m_qpcToNs; - - const auto cpuDelta = cpuTimestamp - m_prevCalibration; - if (cpuDelta > 0) - { - m_prevCalibration = cpuTimestamp; - cpuTimestamp = Profiler::GetTime(); - - auto* item = Profiler::QueueSerial(); - MemWrite(&item->hdr.type, QueueType::GpuCalibration); - MemWrite(&item->gpuCalibration.gpuTime, gpuTimestamp); - MemWrite(&item->gpuCalibration.cpuTime, cpuTimestamp); - MemWrite(&item->gpuCalibration.cpuDelta, cpuDelta); - MemWrite(&item->gpuCalibration.context, m_context); - - Profiler::QueueSerialFinish(); - } - } - - private: - tracy_force_inline uint32_t NextQueryId() - { - uint32_t queryCounter = m_queryCounter.fetch_add(2); - assert(queryCounter < m_queryLimit && "Submitted too many GPU queries! Consider increasing MaxQueries."); - - const uint32_t id = (m_previousQueryCounter + queryCounter) % m_queryLimit; - - return id; - } - - tracy_force_inline uint8_t GetId() const - { - return m_context; - } - }; - - class D3D12ZoneScope - { - const bool m_active; - D3D12QueueCtx* m_ctx = nullptr; - ID3D12GraphicsCommandList* m_cmdList = nullptr; - uint32_t m_queryId = 0; // Used for tracking in nested zones. - - public: - tracy_force_inline D3D12ZoneScope(D3D12QueueCtx* ctx, ID3D12GraphicsCommandList* cmdList, const SourceLocationData* srcLocation, bool active) + struct D3D12QueryPayload + { + uint32_t m_queryIdStart = 0; + uint32_t m_queryCount = 0; + }; + + // Command queue context. + class D3D12QueueCtx + { + friend class D3D12ZoneScope; + + ID3D12Device* m_device = nullptr; + ID3D12CommandQueue* m_queue = nullptr; + uint8_t m_contextId = 255; // TODO: apparently, 255 means "invalid id"; is this documented somewhere? + ID3D12QueryHeap* m_queryHeap = nullptr; + ID3D12Resource* m_readbackBuffer = nullptr; + + // In-progress payload. 
+ uint32_t m_queryLimit = 0; + std::atomic<uint32_t> m_queryCounter = 0; + uint32_t m_previousQueryCounter = 0; + + uint32_t m_activePayload = 0; + ID3D12Fence* m_payloadFence = nullptr; + std::queue<D3D12QueryPayload> m_payloadQueue; + + UINT64 m_prevCalibrationTicksCPU = 0; + + void RecalibrateClocks() + { + UINT64 cpuTimestamp; + UINT64 gpuTimestamp; + if (FAILED(m_queue->GetClockCalibration(&gpuTimestamp, &cpuTimestamp))) + { + TracyD3D12Panic("failed to obtain queue clock calibration counters.", return); + } + + int64_t cpuDeltaTicks = cpuTimestamp - m_prevCalibrationTicksCPU; + if (cpuDeltaTicks > 0) + { + static const int64_t nanosecodsPerTick = int64_t(1000000000) / GetFrequencyQpc(); + int64_t cpuDeltaNS = cpuDeltaTicks * nanosecodsPerTick; + // Save the device cpu timestamp, not the Tracy profiler timestamp: + m_prevCalibrationTicksCPU = cpuTimestamp; + + cpuTimestamp = Profiler::GetTime(); + + auto* item = Profiler::QueueSerial(); + MemWrite(&item->hdr.type, QueueType::GpuCalibration); + MemWrite(&item->gpuCalibration.gpuTime, gpuTimestamp); + MemWrite(&item->gpuCalibration.cpuTime, cpuTimestamp); + MemWrite(&item->gpuCalibration.cpuDelta, cpuDeltaNS); + MemWrite(&item->gpuCalibration.context, GetId()); + SubmitQueueItem(item); + } + } + + tracy_force_inline void SubmitQueueItem(tracy::QueueItem* item) + { #ifdef TRACY_ON_DEMAND - : m_active(active && GetProfiler().IsConnected()) -#else - : m_active(active) + GetProfiler().DeferItem(*item); #endif - { - if (!m_active) return; - - m_ctx = ctx; - m_cmdList = cmdList; - - m_queryId = ctx->NextQueryId(); - cmdList->EndQuery(ctx->m_queryHeap.Get(), D3D12_QUERY_TYPE_TIMESTAMP, m_queryId); - - auto* item = Profiler::QueueSerial(); - MemWrite(&item->hdr.type, QueueType::GpuZoneBeginSerial); - MemWrite(&item->gpuZoneBegin.cpuTime, Profiler::GetTime()); - MemWrite(&item->gpuZoneBegin.srcloc, reinterpret_cast<uint64_t>(srcLocation)); - MemWrite(&item->gpuZoneBegin.thread, GetThreadHandle()); - 
MemWrite(&item->gpuZoneBegin.queryId, static_cast<uint16_t>(m_queryId)); - MemWrite(&item->gpuZoneBegin.context, ctx->GetId()); - - Profiler::QueueSerialFinish(); - } + Profiler::QueueSerialFinish(); + } + + public: + D3D12QueueCtx(ID3D12Device* device, ID3D12CommandQueue* queue) + : m_device(device) + , m_queue(queue) + { + // Verify we support timestamp queries on this queue. + + if (queue->GetDesc().Type == D3D12_COMMAND_LIST_TYPE_COPY) + { + D3D12_FEATURE_DATA_D3D12_OPTIONS3 featureData{}; + + HRESULT hr = device->CheckFeatureSupport(D3D12_FEATURE_D3D12_OPTIONS3, &featureData, sizeof(featureData)); + if (FAILED(hr) || (featureData.CopyQueueTimestampQueriesSupported == FALSE)) + { + TracyD3D12Panic("Platform does not support profiling of copy queues.", return); + } + } + + static constexpr uint32_t MaxQueries = 64 * 1024; // Must be even, because queries are (begin, end) pairs + m_queryLimit = MaxQueries; + + D3D12_QUERY_HEAP_DESC heapDesc{}; + heapDesc.Type = queue->GetDesc().Type == D3D12_COMMAND_LIST_TYPE_COPY ? D3D12_QUERY_HEAP_TYPE_COPY_QUEUE_TIMESTAMP : D3D12_QUERY_HEAP_TYPE_TIMESTAMP; + heapDesc.Count = m_queryLimit; + heapDesc.NodeMask = 0; // #TODO: Support multiple adapters. + + while (FAILED(device->CreateQueryHeap(&heapDesc, IID_PPV_ARGS(&m_queryHeap)))) + { + m_queryLimit /= 2; + heapDesc.Count = m_queryLimit; + } + + // Create a readback buffer, which will be used as a destination for the query data. + + D3D12_RESOURCE_DESC readbackBufferDesc{}; + readbackBufferDesc.Alignment = 0; + readbackBufferDesc.Dimension = D3D12_RESOURCE_DIMENSION_BUFFER; + readbackBufferDesc.Width = m_queryLimit * sizeof(uint64_t); + readbackBufferDesc.Height = 1; + readbackBufferDesc.DepthOrArraySize = 1; + readbackBufferDesc.Format = DXGI_FORMAT_UNKNOWN; + readbackBufferDesc.Layout = D3D12_TEXTURE_LAYOUT_ROW_MAJOR; // Buffers are always row major. 
+ readbackBufferDesc.MipLevels = 1; + readbackBufferDesc.SampleDesc.Count = 1; + readbackBufferDesc.SampleDesc.Quality = 0; + readbackBufferDesc.Flags = D3D12_RESOURCE_FLAG_NONE; + + D3D12_HEAP_PROPERTIES readbackHeapProps{}; + readbackHeapProps.Type = D3D12_HEAP_TYPE_READBACK; + readbackHeapProps.CPUPageProperty = D3D12_CPU_PAGE_PROPERTY_UNKNOWN; + readbackHeapProps.MemoryPoolPreference = D3D12_MEMORY_POOL_UNKNOWN; + readbackHeapProps.CreationNodeMask = 0; + readbackHeapProps.VisibleNodeMask = 0; // #TODO: Support multiple adapters. + + if (FAILED(device->CreateCommittedResource(&readbackHeapProps, D3D12_HEAP_FLAG_NONE, &readbackBufferDesc, D3D12_RESOURCE_STATE_COPY_DEST, nullptr, IID_PPV_ARGS(&m_readbackBuffer)))) + { + TracyD3D12Panic("Failed to create query readback buffer.", return); + } + + if (FAILED(device->CreateFence(0, D3D12_FENCE_FLAG_NONE, IID_PPV_ARGS(&m_payloadFence)))) + { + TracyD3D12Panic("Failed to create payload fence.", return); + } + + float period = [queue]() + { + uint64_t timestampFrequency; + if (FAILED(queue->GetTimestampFrequency(&timestampFrequency))) + { + return 0.0f; + } + return static_cast<float>( 1E+09 / static_cast<double>(timestampFrequency) ); + }(); + + if (period == 0.0f) + { + TracyD3D12Panic("Failed to get timestamp frequency.", return); + } + + uint64_t cpuTimestamp; + uint64_t gpuTimestamp; + if (FAILED(queue->GetClockCalibration(&gpuTimestamp, &cpuTimestamp))) + { + TracyD3D12Panic("Failed to get queue clock calibration.", return); + } + + // Save the device cpu timestamp, not the profiler's timestamp. 
+ m_prevCalibrationTicksCPU = cpuTimestamp; + + cpuTimestamp = Profiler::GetTime(); + + // all checked: ready to roll + m_contextId = GetGpuCtxCounter().fetch_add(1); + + auto* item = Profiler::QueueSerial(); + MemWrite(&item->hdr.type, QueueType::GpuNewContext); + MemWrite(&item->gpuNewContext.cpuTime, cpuTimestamp); + MemWrite(&item->gpuNewContext.gpuTime, gpuTimestamp); + MemWrite(&item->gpuNewContext.thread, decltype(item->gpuNewContext.thread)(0)); // #TODO: why 0 instead of GetThreadHandle()? + MemWrite(&item->gpuNewContext.period, period); + MemWrite(&item->gpuNewContext.context, GetId()); + MemWrite(&item->gpuNewContext.flags, GpuContextCalibration); + MemWrite(&item->gpuNewContext.type, GpuContextType::Direct3D12); + SubmitQueueItem(item); + } + + ~D3D12QueueCtx() + { + ZoneScopedC(Color::Red4); + // collect all pending timestamps + while (m_payloadFence->GetCompletedValue() != m_activePayload) + /* busy-wait ... */; + Collect(); + m_payloadFence->Release(); + m_readbackBuffer->Release(); + m_queryHeap->Release(); + } + + + void NewFrame() + { + uint32_t queryCounter = m_queryCounter.exchange(0); + m_payloadQueue.emplace(D3D12QueryPayload{ m_previousQueryCounter, queryCounter }); + m_previousQueryCounter += queryCounter; + + if (m_previousQueryCounter >= m_queryLimit) + { + m_previousQueryCounter -= m_queryLimit; + } + + m_queue->Signal(m_payloadFence, ++m_activePayload); + } + + void Name( const char* name, uint16_t len ) + { + auto ptr = (char*)tracy_malloc( len ); + memcpy( ptr, name, len ); + + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::GpuContextName ); + MemWrite( &item->gpuContextNameFat.context, GetId()); + MemWrite( &item->gpuContextNameFat.ptr, (uint64_t)ptr ); + MemWrite( &item->gpuContextNameFat.size, len ); + SubmitQueueItem(item); + } + + void Collect() + { + ZoneScopedC(Color::Red4); - tracy_force_inline D3D12ZoneScope(D3D12QueueCtx* ctx, ID3D12GraphicsCommandList* cmdList, const SourceLocationData* 
srcLocation, int depth, bool active) #ifdef TRACY_ON_DEMAND - : m_active(active&& GetProfiler().IsConnected()) -#else - : m_active(active) -#endif - { - if (!m_active) return; - - m_ctx = ctx; - m_cmdList = cmdList; - - m_queryId = ctx->NextQueryId(); - cmdList->EndQuery(ctx->m_queryHeap.Get(), D3D12_QUERY_TYPE_TIMESTAMP, m_queryId); - - auto* item = Profiler::QueueSerialCallstack(Callstack(depth)); - MemWrite(&item->hdr.type, QueueType::GpuZoneBeginCallstackSerial); - MemWrite(&item->gpuZoneBegin.cpuTime, Profiler::GetTime()); - MemWrite(&item->gpuZoneBegin.srcloc, reinterpret_cast<uint64_t>(srcLocation)); - MemWrite(&item->gpuZoneBegin.thread, GetThreadHandle()); - MemWrite(&item->gpuZoneBegin.queryId, static_cast<uint16_t>(m_queryId)); - MemWrite(&item->gpuZoneBegin.context, ctx->GetId()); + if (!GetProfiler().IsConnected()) + { + m_queryCounter = 0; - Profiler::QueueSerialFinish(); - } - - tracy_force_inline D3D12ZoneScope(D3D12QueueCtx* ctx, uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, ID3D12GraphicsCommandList* cmdList, bool active) -#ifdef TRACY_ON_DEMAND - : m_active(active&& GetProfiler().IsConnected()) -#else - : m_active(active) + return; + } #endif - { - if (!m_active) return; - - m_ctx = ctx; - m_cmdList = cmdList; - m_queryId = ctx->NextQueryId(); - cmdList->EndQuery(ctx->m_queryHeap.Get(), D3D12_QUERY_TYPE_TIMESTAMP, m_queryId); - - const auto sourceLocation = Profiler::AllocSourceLocation(line, source, sourceSz, function, functionSz, name, nameSz); - - auto* item = Profiler::QueueSerial(); - MemWrite(&item->hdr.type, QueueType::GpuZoneBeginAllocSrcLocSerial); - MemWrite(&item->gpuZoneBegin.cpuTime, Profiler::GetTime()); - MemWrite(&item->gpuZoneBegin.srcloc, sourceLocation); - MemWrite(&item->gpuZoneBegin.thread, GetThreadHandle()); - MemWrite(&item->gpuZoneBegin.queryId, static_cast<uint16_t>(m_queryId)); - MemWrite(&item->gpuZoneBegin.context, ctx->GetId()); - - 
Profiler::QueueSerialFinish(); - } - - tracy_force_inline D3D12ZoneScope(D3D12QueueCtx* ctx, uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, ID3D12GraphicsCommandList* cmdList, int depth, bool active) + // Find out what payloads are available. + const auto newestReadyPayload = m_payloadFence->GetCompletedValue(); + const auto payloadCount = m_payloadQueue.size() - (m_activePayload - newestReadyPayload); + + if (!payloadCount) + { + return; // No payloads are available yet, exit out. + } + + D3D12_RANGE mapRange{ 0, m_queryLimit * sizeof(uint64_t) }; + + // Map the readback buffer so we can fetch the query data from the GPU. + void* readbackBufferMapping = nullptr; + + if (FAILED(m_readbackBuffer->Map(0, &mapRange, &readbackBufferMapping))) + { + TracyD3D12Panic("Failed to map readback buffer.", return); + } + + auto* timestampData = static_cast<uint64_t*>(readbackBufferMapping); + + for (uint32_t i = 0; i < payloadCount; ++i) + { + const auto& payload = m_payloadQueue.front(); + + for (uint32_t j = 0; j < payload.m_queryCount; ++j) + { + const auto counter = (payload.m_queryIdStart + j) % m_queryLimit; + const auto timestamp = timestampData[counter]; + const auto queryId = counter; + + auto* item = Profiler::QueueSerial(); + MemWrite(&item->hdr.type, QueueType::GpuTime); + MemWrite(&item->gpuTime.gpuTime, timestamp); + MemWrite(&item->gpuTime.queryId, static_cast<uint16_t>(queryId)); + MemWrite(&item->gpuTime.context, GetId()); + + Profiler::QueueSerialFinish(); + } + + m_payloadQueue.pop(); + } + + m_readbackBuffer->Unmap(0, nullptr); + + // Recalibrate to account for drift. + RecalibrateClocks(); + } + + private: + tracy_force_inline uint32_t NextQueryId() + { + uint32_t queryCounter = m_queryCounter.fetch_add(2); + if (queryCounter >= m_queryLimit) + { + TracyD3D12Panic("Submitted too many GPU queries! 
Consider increasing MaxQueries."); + // #TODO: consider returning an invalid id or sentinel value here + } + + const uint32_t id = (m_previousQueryCounter + queryCounter) % m_queryLimit; + + return id; + } + + tracy_force_inline uint8_t GetId() const + { + return m_contextId; + } + }; + + class D3D12ZoneScope + { + const bool m_active; + D3D12QueueCtx* m_ctx = nullptr; + ID3D12GraphicsCommandList* m_cmdList = nullptr; + uint32_t m_queryId = 0; // Used for tracking in nested zones. + + tracy_force_inline void WriteQueueItem(QueueItem* item, QueueType type, uint64_t srcLocation) + { + MemWrite(&item->hdr.type, type); + MemWrite(&item->gpuZoneBegin.cpuTime, Profiler::GetTime()); + MemWrite(&item->gpuZoneBegin.srcloc, srcLocation); + MemWrite(&item->gpuZoneBegin.thread, GetThreadHandle()); + MemWrite(&item->gpuZoneBegin.queryId, static_cast<uint16_t>(m_queryId)); + MemWrite(&item->gpuZoneBegin.context, m_ctx->GetId()); + Profiler::QueueSerialFinish(); + } + + tracy_force_inline D3D12ZoneScope(D3D12QueueCtx* ctx, ID3D12GraphicsCommandList* cmdList, bool active) #ifdef TRACY_ON_DEMAND - : m_active(active&& GetProfiler().IsConnected()) + : m_active(active&& GetProfiler().IsConnected()) #else - : m_active(active) + : m_active(active) #endif - { - if (!m_active) return; - - m_ctx = ctx; - m_cmdList = cmdList; - - m_queryId = ctx->NextQueryId(); - cmdList->EndQuery(ctx->m_queryHeap.Get(), D3D12_QUERY_TYPE_TIMESTAMP, m_queryId); - - const auto sourceLocation = Profiler::AllocSourceLocation(line, source, sourceSz, function, functionSz, name, nameSz); - - auto* item = Profiler::QueueSerialCallstack(Callstack(depth)); - MemWrite(&item->hdr.type, QueueType::GpuZoneBeginAllocSrcLocCallstackSerial); - MemWrite(&item->gpuZoneBegin.cpuTime, Profiler::GetTime()); - MemWrite(&item->gpuZoneBegin.srcloc, sourceLocation); - MemWrite(&item->gpuZoneBegin.thread, GetThreadHandle()); - MemWrite(&item->gpuZoneBegin.queryId, static_cast<uint16_t>(m_queryId)); - 
MemWrite(&item->gpuZoneBegin.context, ctx->GetId()); - - Profiler::QueueSerialFinish(); - } - - tracy_force_inline ~D3D12ZoneScope() - { - if (!m_active) return; - - const auto queryId = m_queryId + 1; // Our end query slot is immediately after the begin slot. - m_cmdList->EndQuery(m_ctx->m_queryHeap.Get(), D3D12_QUERY_TYPE_TIMESTAMP, queryId); - - auto* item = Profiler::QueueSerial(); - MemWrite(&item->hdr.type, QueueType::GpuZoneEndSerial); - MemWrite(&item->gpuZoneEnd.cpuTime, Profiler::GetTime()); - MemWrite(&item->gpuZoneEnd.thread, GetThreadHandle()); - MemWrite(&item->gpuZoneEnd.queryId, static_cast<uint16_t>(queryId)); - MemWrite(&item->gpuZoneEnd.context, m_ctx->GetId()); - - Profiler::QueueSerialFinish(); - - m_cmdList->ResolveQueryData(m_ctx->m_queryHeap.Get(), D3D12_QUERY_TYPE_TIMESTAMP, m_queryId, 2, m_ctx->m_readbackBuffer.Get(), m_queryId * sizeof(uint64_t)); - } - }; - - static inline D3D12QueueCtx* CreateD3D12Context(ID3D12Device* device, ID3D12CommandQueue* queue) - { - auto* ctx = static_cast<D3D12QueueCtx*>(tracy_malloc(sizeof(D3D12QueueCtx))); - new (ctx) D3D12QueueCtx{ device, queue }; - - return ctx; - } - - static inline void DestroyD3D12Context(D3D12QueueCtx* ctx) - { - ctx->~D3D12QueueCtx(); - tracy_free(ctx); - } + { + if (!m_active) return; + + m_ctx = ctx; + m_cmdList = cmdList; + + m_queryId = m_ctx->NextQueryId(); + m_cmdList->EndQuery(m_ctx->m_queryHeap, D3D12_QUERY_TYPE_TIMESTAMP, m_queryId); + } + + public: + tracy_force_inline D3D12ZoneScope(D3D12QueueCtx* ctx, ID3D12GraphicsCommandList* cmdList, const SourceLocationData* srcLocation, bool active) + : D3D12ZoneScope(ctx, cmdList, active) + { + if (!m_active) return; + + auto* item = Profiler::QueueSerial(); + WriteQueueItem(item, QueueType::GpuZoneBeginSerial, reinterpret_cast<uint64_t>(srcLocation)); + } + + tracy_force_inline D3D12ZoneScope(D3D12QueueCtx* ctx, ID3D12GraphicsCommandList* cmdList, const SourceLocationData* srcLocation, int depth, bool active) + : 
D3D12ZoneScope(ctx, cmdList, active) + { + if (!m_active) return; + + auto* item = Profiler::QueueSerialCallstack(Callstack(depth)); + WriteQueueItem(item, QueueType::GpuZoneBeginCallstackSerial, reinterpret_cast<uint64_t>(srcLocation)); + } + + tracy_force_inline D3D12ZoneScope(D3D12QueueCtx* ctx, uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, ID3D12GraphicsCommandList* cmdList, bool active) + : D3D12ZoneScope(ctx, cmdList, active) + { + if (!m_active) return; + + const auto sourceLocation = Profiler::AllocSourceLocation(line, source, sourceSz, function, functionSz, name, nameSz); + + auto* item = Profiler::QueueSerial(); + WriteQueueItem(item, QueueType::GpuZoneBeginAllocSrcLocSerial, sourceLocation); + } + + tracy_force_inline D3D12ZoneScope(D3D12QueueCtx* ctx, uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, ID3D12GraphicsCommandList* cmdList, int depth, bool active) + : D3D12ZoneScope(ctx, cmdList, active) + { + if (!m_active) return; + + const auto sourceLocation = Profiler::AllocSourceLocation(line, source, sourceSz, function, functionSz, name, nameSz); + + auto* item = Profiler::QueueSerialCallstack(Callstack(depth)); + WriteQueueItem(item, QueueType::GpuZoneBeginAllocSrcLocCallstackSerial, sourceLocation); + } + + tracy_force_inline ~D3D12ZoneScope() + { + if (!m_active) return; + + const auto queryId = m_queryId + 1; // Our end query slot is immediately after the begin slot. 
+ m_cmdList->EndQuery(m_ctx->m_queryHeap, D3D12_QUERY_TYPE_TIMESTAMP, queryId); + + auto* item = Profiler::QueueSerial(); + MemWrite(&item->hdr.type, QueueType::GpuZoneEndSerial); + MemWrite(&item->gpuZoneEnd.cpuTime, Profiler::GetTime()); + MemWrite(&item->gpuZoneEnd.thread, GetThreadHandle()); + MemWrite(&item->gpuZoneEnd.queryId, static_cast<uint16_t>(queryId)); + MemWrite(&item->gpuZoneEnd.context, m_ctx->GetId()); + Profiler::QueueSerialFinish(); + + m_cmdList->ResolveQueryData(m_ctx->m_queryHeap, D3D12_QUERY_TYPE_TIMESTAMP, m_queryId, 2, m_ctx->m_readbackBuffer, m_queryId * sizeof(uint64_t)); + } + }; + + static inline D3D12QueueCtx* CreateD3D12Context(ID3D12Device* device, ID3D12CommandQueue* queue) + { + auto* ctx = static_cast<D3D12QueueCtx*>(tracy_malloc(sizeof(D3D12QueueCtx))); + new (ctx) D3D12QueueCtx{ device, queue }; + + return ctx; + } + + static inline void DestroyD3D12Context(D3D12QueueCtx* ctx) + { + ctx->~D3D12QueueCtx(); + tracy_free(ctx); + } } +#undef TracyD3D12Panic + using TracyD3D12Ctx = tracy::D3D12QueueCtx*; #define TracyD3D12Context(device, queue) tracy::CreateD3D12Context(device, queue); @@ -471,25 +461,29 @@ using TracyD3D12Ctx = tracy::D3D12QueueCtx*; #define TracyD3D12NewFrame(ctx) ctx->NewFrame(); +#define TracyD3D12UnnamedZone ___tracy_gpu_d3d12_zone +#define TracyD3D12SrcLocSymbol TracyConcat(__tracy_d3d12_source_location,TracyLine) +#define TracyD3D12SrcLocObject(name, color) static constexpr tracy::SourceLocationData TracyD3D12SrcLocSymbol { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; + #if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK -# define TracyD3D12Zone(ctx, cmdList, name) TracyD3D12NamedZoneS(ctx, ___tracy_gpu_zone, cmdList, name, TRACY_CALLSTACK, true) -# define TracyD3D12ZoneC(ctx, cmdList, name, color) TracyD3D12NamedZoneCS(ctx, ___tracy_gpu_zone, cmdList, name, color, TRACY_CALLSTACK, true) -# define TracyD3D12NamedZone(ctx, varname, cmdList, name, active) static constexpr 
tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location, TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::D3D12ZoneScope varname{ ctx, cmdList, &TracyConcat(__tracy_gpu_source_location, TracyLine), TRACY_CALLSTACK, active }; -# define TracyD3D12NamedZoneC(ctx, varname, cmdList, name, color, active) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location, TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::D3D12ZoneScope varname{ ctx, cmdList, &TracyConcat(__tracy_gpu_source_location, TracyLine), TRACY_CALLSTACK, active }; +# define TracyD3D12Zone(ctx, cmdList, name) TracyD3D12NamedZoneS(ctx, TracyD3D12UnnamedZone, cmdList, name, TRACY_CALLSTACK, true) +# define TracyD3D12ZoneC(ctx, cmdList, name, color) TracyD3D12NamedZoneCS(ctx, TracyD3D12UnnamedZone, cmdList, name, color, TRACY_CALLSTACK, true) +# define TracyD3D12NamedZone(ctx, varname, cmdList, name, active) TracyD3D12SrcLocObject(name, 0); tracy::D3D12ZoneScope varname{ ctx, cmdList, &TracyD3D12SrcLocSymbol, TRACY_CALLSTACK, active }; +# define TracyD3D12NamedZoneC(ctx, varname, cmdList, name, color, active) TracyD3D12SrcLocObject(name, color); tracy::D3D12ZoneScope varname{ ctx, cmdList, &TracyD3D12SrcLocSymbol, TRACY_CALLSTACK, active }; # define TracyD3D12ZoneTransient(ctx, varname, cmdList, name, active) TracyD3D12ZoneTransientS(ctx, varname, cmdList, name, TRACY_CALLSTACK, active) #else -# define TracyD3D12Zone(ctx, cmdList, name) TracyD3D12NamedZone(ctx, ___tracy_gpu_zone, cmdList, name, true) -# define TracyD3D12ZoneC(ctx, cmdList, name, color) TracyD3D12NamedZoneC(ctx, ___tracy_gpu_zone, cmdList, name, color, true) -# define TracyD3D12NamedZone(ctx, varname, cmdList, name, active) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location, TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::D3D12ZoneScope varname{ ctx, cmdList, 
&TracyConcat(__tracy_gpu_source_location, TracyLine), active }; -# define TracyD3D12NamedZoneC(ctx, varname, cmdList, name, color, active) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location, TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::D3D12ZoneScope varname{ ctx, cmdList, &TracyConcat(__tracy_gpu_source_location, TracyLine), active }; +# define TracyD3D12Zone(ctx, cmdList, name) TracyD3D12NamedZone(ctx, TracyD3D12UnnamedZone, cmdList, name, true) +# define TracyD3D12ZoneC(ctx, cmdList, name, color) TracyD3D12NamedZoneC(ctx, TracyD3D12UnnamedZone, cmdList, name, color, true) +# define TracyD3D12NamedZone(ctx, varname, cmdList, name, active) TracyD3D12SrcLocObject(name, 0); tracy::D3D12ZoneScope varname{ ctx, cmdList, &TracyD3D12SrcLocSymbol, active }; +# define TracyD3D12NamedZoneC(ctx, varname, cmdList, name, color, active) TracyD3D12SrcLocObject(name, color); tracy::D3D12ZoneScope varname{ ctx, cmdList, &TracyD3D12SrcLocSymbol, active }; # define TracyD3D12ZoneTransient(ctx, varname, cmdList, name, active) tracy::D3D12ZoneScope varname{ ctx, TracyLine, TracyFile, strlen(TracyFile), TracyFunction, strlen(TracyFunction), name, strlen(name), cmdList, active }; #endif #ifdef TRACY_HAS_CALLSTACK -# define TracyD3D12ZoneS(ctx, cmdList, name, depth) TracyD3D12NamedZoneS(ctx, ___tracy_gpu_zone, cmdList, name, depth, true) -# define TracyD3D12ZoneCS(ctx, cmdList, name, color, depth) TracyD3D12NamedZoneCS(ctx, ___tracy_gpu_zone, cmdList, name, color, depth, true) -# define TracyD3D12NamedZoneS(ctx, varname, cmdList, name, depth, active) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location, TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::D3D12ZoneScope varname{ ctx, cmdList, &TracyConcat(__tracy_gpu_source_location, TracyLine), depth, active }; -# define TracyD3D12NamedZoneCS(ctx, varname, cmdList, name, color, depth, active) static constexpr 
tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location, TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::D3D12ZoneScope varname{ ctx, cmdList, &TracyConcat(__tracy_gpu_source_location, TracyLine), depth, active }; +# define TracyD3D12ZoneS(ctx, cmdList, name, depth) TracyD3D12NamedZoneS(ctx, TracyD3D12UnnamedZone, cmdList, name, depth, true) +# define TracyD3D12ZoneCS(ctx, cmdList, name, color, depth) TracyD3D12NamedZoneCS(ctx, TracyD3D12UnnamedZone, cmdList, name, color, depth, true) +# define TracyD3D12NamedZoneS(ctx, varname, cmdList, name, depth, active) TracyD3D12SrcLocObject(name, 0); tracy::D3D12ZoneScope varname{ ctx, cmdList, &TracyD3D12SrcLocSymbol, depth, active }; +# define TracyD3D12NamedZoneCS(ctx, varname, cmdList, name, color, depth, active) TracyD3D12SrcLocObject(name, color); tracy::D3D12ZoneScope varname{ ctx, cmdList, &TracyD3D12SrcLocSymbol, depth, active }; # define TracyD3D12ZoneTransientS(ctx, varname, cmdList, name, depth, active) tracy::D3D12ZoneScope varname{ ctx, TracyLine, TracyFile, strlen(TracyFile), TracyFunction, strlen(TracyFunction), name, strlen(name), cmdList, depth, active }; #else # define TracyD3D12ZoneS(ctx, cmdList, name, depth) TracyD3D12Zone(ctx, cmdList, name) diff --git a/thirdparty/tracy/include/tracy/tracy/TracyLua.hpp b/thirdparty/tracy/include/tracy/tracy/TracyLua.hpp index 6ee2e3087d73bf04b182079f67558c664014bb98..c972ffb26da6a555183a72768fab8161f7c407ba 100644 --- a/thirdparty/tracy/include/tracy/tracy/TracyLua.hpp +++ b/thirdparty/tracy/include/tracy/tracy/TracyLua.hpp @@ -173,10 +173,10 @@ static tracy_force_inline void SendLuaCallstack( lua_State* L, uint32_t depth ) { const uint32_t line = dbg[i].currentline; memcpy( dst, &line, 4 ); dst += 4; - assert( fsz[i] <= std::numeric_limits<uint16_t>::max() ); + assert( fsz[i] <= (std::numeric_limits<uint16_t>::max)() ); memcpy( dst, fsz+i, 2 ); dst += 2; memcpy( dst, func[i], fsz[i] ); dst += fsz[i]; - assert( ssz[i] <= 
std::numeric_limits<uint16_t>::max() ); + assert( ssz[i] <= (std::numeric_limits<uint16_t>::max)() ); memcpy( dst, ssz+i, 2 ); dst += 2; memcpy( dst, dbg[i].source, ssz[i] ), dst += ssz[i]; } @@ -333,7 +333,7 @@ static inline int LuaZoneText( lua_State* L ) auto txt = lua_tostring( L, 1 ); const auto size = strlen( txt ); - assert( size < std::numeric_limits<uint16_t>::max() ); + assert( size < (std::numeric_limits<uint16_t>::max)() ); auto ptr = (char*)tracy_malloc( size ); memcpy( ptr, txt, size ); @@ -358,7 +358,7 @@ static inline int LuaZoneName( lua_State* L ) auto txt = lua_tostring( L, 1 ); const auto size = strlen( txt ); - assert( size < std::numeric_limits<uint16_t>::max() ); + assert( size < (std::numeric_limits<uint16_t>::max)() ); auto ptr = (char*)tracy_malloc( size ); memcpy( ptr, txt, size ); @@ -378,7 +378,7 @@ static inline int LuaMessage( lua_State* L ) auto txt = lua_tostring( L, 1 ); const auto size = strlen( txt ); - assert( size < std::numeric_limits<uint16_t>::max() ); + assert( size < (std::numeric_limits<uint16_t>::max)() ); auto ptr = (char*)tracy_malloc( size ); memcpy( ptr, txt, size ); diff --git a/thirdparty/tracy/include/tracy/tracy/TracyVulkan.hpp b/thirdparty/tracy/include/tracy/tracy/TracyVulkan.hpp index 3f4f6a31c1a85257aea208be7488d392393edc83..2d079f7b5af96271e6cee8238d6a52d173280cd5 100644 --- a/thirdparty/tracy/include/tracy/tracy/TracyVulkan.hpp +++ b/thirdparty/tracy/include/tracy/tracy/TracyVulkan.hpp @@ -5,6 +5,9 @@ #define TracyVkContext(x,y,z,w) nullptr #define TracyVkContextCalibrated(x,y,z,w,a,b) nullptr +#if defined VK_EXT_host_query_reset +#define TracyVkContextHostCalibrated(x,y,z,w,a) nullptr +#endif #define TracyVkDestroy(x) #define TracyVkContextName(c,x,y) #define TracyVkNamedZone(c,x,y,z,w) @@ -39,9 +42,47 @@ using TracyVkCtx = void*; #include "../client/TracyProfiler.hpp" #include "../client/TracyCallstack.hpp" +#include <atomic> + namespace tracy { +#if defined TRACY_VK_USE_SYMBOL_TABLE +#define 
LoadVkDeviceCoreSymbols(Operation) \ + Operation(vkBeginCommandBuffer) \ + Operation(vkCmdResetQueryPool) \ + Operation(vkCmdWriteTimestamp) \ + Operation(vkCreateQueryPool) \ + Operation(vkDestroyQueryPool) \ + Operation(vkEndCommandBuffer) \ + Operation(vkGetQueryPoolResults) \ + Operation(vkQueueSubmit) \ + Operation(vkQueueWaitIdle) \ + Operation(vkResetQueryPool) + +#define LoadVkDeviceExtensionSymbols(Operation) \ + Operation(vkGetCalibratedTimestampsEXT) \ + Operation(vkGetPhysicalDeviceCalibrateableTimeDomainsEXT) + +#define LoadVkInstanceCoreSymbols(Operation) \ + Operation(vkGetPhysicalDeviceProperties) + +struct VkSymbolTable +{ +#define MAKE_PFN(name) PFN_##name name; + LoadVkDeviceCoreSymbols(MAKE_PFN) + LoadVkDeviceExtensionSymbols(MAKE_PFN) + LoadVkInstanceCoreSymbols(MAKE_PFN) +#undef MAKE_PFN +}; + +#define VK_FUNCTION_WRAPPER(callSignature) m_symbols.callSignature +#define CONTEXT_VK_FUNCTION_WRAPPER(callSignature) m_ctx->m_symbols.callSignature +#else +#define VK_FUNCTION_WRAPPER(callSignature) callSignature +#define CONTEXT_VK_FUNCTION_WRAPPER(callSignature) callSignature +#endif + class VkCtx { friend class VkCtxScope; @@ -49,7 +90,11 @@ class VkCtx enum { QueryCount = 64 * 1024 }; public: - VkCtx( VkPhysicalDevice physdev, VkDevice device, VkQueue queue, VkCommandBuffer cmdbuf, PFN_vkGetPhysicalDeviceCalibrateableTimeDomainsEXT _vkGetPhysicalDeviceCalibrateableTimeDomainsEXT, PFN_vkGetCalibratedTimestampsEXT _vkGetCalibratedTimestampsEXT ) +#if defined TRACY_VK_USE_SYMBOL_TABLE + VkCtx( VkInstance instance, VkPhysicalDevice physdev, VkDevice device, VkQueue queue, VkCommandBuffer cmdbuf, PFN_vkGetInstanceProcAddr instanceProcAddr, PFN_vkGetDeviceProcAddr deviceProcAddr, bool calibrated ) +#else + VkCtx( VkPhysicalDevice physdev, VkDevice device, VkQueue queue, VkCommandBuffer cmdbuf, PFN_vkGetPhysicalDeviceCalibrateableTimeDomainsEXT vkGetPhysicalDeviceCalibrateableTimeDomainsEXT, PFN_vkGetCalibratedTimestampsEXT vkGetCalibratedTimestampsEXT) 
+#endif : m_device( device ) , m_timeDomain( VK_TIME_DOMAIN_DEVICE_EXT ) , m_context( GetGpuCtxCounter().fetch_add( 1, std::memory_order_relaxed ) ) @@ -57,47 +102,28 @@ public: , m_tail( 0 ) , m_oldCnt( 0 ) , m_queryCount( QueryCount ) - , m_vkGetCalibratedTimestampsEXT( _vkGetCalibratedTimestampsEXT ) +#if !defined TRACY_VK_USE_SYMBOL_TABLE + , m_vkGetCalibratedTimestampsEXT( vkGetCalibratedTimestampsEXT ) +#endif { assert( m_context != 255 ); - if( _vkGetPhysicalDeviceCalibrateableTimeDomainsEXT && _vkGetCalibratedTimestampsEXT ) +#if defined TRACY_VK_USE_SYMBOL_TABLE + PopulateSymbolTable(instance, instanceProcAddr, deviceProcAddr); + if ( calibrated ) { - uint32_t num; - _vkGetPhysicalDeviceCalibrateableTimeDomainsEXT( physdev, &num, nullptr ); - if( num > 4 ) num = 4; - VkTimeDomainEXT data[4]; - _vkGetPhysicalDeviceCalibrateableTimeDomainsEXT( physdev, &num, data ); - VkTimeDomainEXT supportedDomain = (VkTimeDomainEXT)-1; -#if defined _WIN32 - supportedDomain = VK_TIME_DOMAIN_QUERY_PERFORMANCE_COUNTER_EXT; -#elif defined __linux__ && defined CLOCK_MONOTONIC_RAW - supportedDomain = VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_EXT; -#endif - for( uint32_t i=0; i<num; i++ ) - { - if( data[i] == supportedDomain ) - { - m_timeDomain = data[i]; - break; - } - } + m_vkGetCalibratedTimestampsEXT = m_symbols.vkGetCalibratedTimestampsEXT; } - VkPhysicalDeviceProperties prop; - vkGetPhysicalDeviceProperties( physdev, &prop ); - const float period = prop.limits.timestampPeriod; +#endif - VkQueryPoolCreateInfo poolInfo = {}; - poolInfo.sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO; - poolInfo.queryCount = m_queryCount; - poolInfo.queryType = VK_QUERY_TYPE_TIMESTAMP; - while( vkCreateQueryPool( device, &poolInfo, nullptr, &m_query ) != VK_SUCCESS ) + if( VK_FUNCTION_WRAPPER( vkGetPhysicalDeviceCalibrateableTimeDomainsEXT ) && m_vkGetCalibratedTimestampsEXT ) { - m_queryCount /= 2; - poolInfo.queryCount = m_queryCount; + FindAvailableTimeDomains( physdev, VK_FUNCTION_WRAPPER( 
vkGetPhysicalDeviceCalibrateableTimeDomainsEXT ) ); } + CreateQueryPool(); + VkCommandBufferBeginInfo beginInfo = {}; beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; beginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; @@ -107,87 +133,96 @@ public: submitInfo.commandBufferCount = 1; submitInfo.pCommandBuffers = &cmdbuf; - vkBeginCommandBuffer( cmdbuf, &beginInfo ); - vkCmdResetQueryPool( cmdbuf, m_query, 0, m_queryCount ); - vkEndCommandBuffer( cmdbuf ); - vkQueueSubmit( queue, 1, &submitInfo, VK_NULL_HANDLE ); - vkQueueWaitIdle( queue ); + VK_FUNCTION_WRAPPER( vkBeginCommandBuffer( cmdbuf, &beginInfo ) ); + VK_FUNCTION_WRAPPER( vkCmdResetQueryPool( cmdbuf, m_query, 0, m_queryCount ) ); + VK_FUNCTION_WRAPPER( vkEndCommandBuffer( cmdbuf ) ); + VK_FUNCTION_WRAPPER( vkQueueSubmit( queue, 1, &submitInfo, VK_NULL_HANDLE ) ); + VK_FUNCTION_WRAPPER( vkQueueWaitIdle( queue ) ); int64_t tcpu, tgpu; if( m_timeDomain == VK_TIME_DOMAIN_DEVICE_EXT ) { - vkBeginCommandBuffer( cmdbuf, &beginInfo ); - vkCmdWriteTimestamp( cmdbuf, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, m_query, 0 ); - vkEndCommandBuffer( cmdbuf ); - vkQueueSubmit( queue, 1, &submitInfo, VK_NULL_HANDLE ); - vkQueueWaitIdle( queue ); + VK_FUNCTION_WRAPPER( vkBeginCommandBuffer( cmdbuf, &beginInfo ) ); + VK_FUNCTION_WRAPPER( vkCmdWriteTimestamp( cmdbuf, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, m_query, 0 ) ); + VK_FUNCTION_WRAPPER( vkEndCommandBuffer( cmdbuf ) ); + VK_FUNCTION_WRAPPER( vkQueueSubmit( queue, 1, &submitInfo, VK_NULL_HANDLE ) ); + VK_FUNCTION_WRAPPER( vkQueueWaitIdle( queue ) ); tcpu = Profiler::GetTime(); - vkGetQueryPoolResults( device, m_query, 0, 1, sizeof( tgpu ), &tgpu, sizeof( tgpu ), VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT ); + VK_FUNCTION_WRAPPER( vkGetQueryPoolResults( device, m_query, 0, 1, sizeof( tgpu ), &tgpu, sizeof( tgpu ), VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT ) ); - vkBeginCommandBuffer( cmdbuf, &beginInfo ); - vkCmdResetQueryPool( cmdbuf, m_query, 
0, 1 ); - vkEndCommandBuffer( cmdbuf ); - vkQueueSubmit( queue, 1, &submitInfo, VK_NULL_HANDLE ); - vkQueueWaitIdle( queue ); + VK_FUNCTION_WRAPPER( vkBeginCommandBuffer( cmdbuf, &beginInfo ) ); + VK_FUNCTION_WRAPPER( vkCmdResetQueryPool( cmdbuf, m_query, 0, 1 ) ); + VK_FUNCTION_WRAPPER( vkEndCommandBuffer( cmdbuf ) ); + VK_FUNCTION_WRAPPER( vkQueueSubmit( queue, 1, &submitInfo, VK_NULL_HANDLE ) ); + VK_FUNCTION_WRAPPER( vkQueueWaitIdle( queue ) ); } else { - enum { NumProbes = 32 }; - - VkCalibratedTimestampInfoEXT spec[2] = { - { VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT, nullptr, VK_TIME_DOMAIN_DEVICE_EXT }, - { VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT, nullptr, m_timeDomain }, - }; - uint64_t ts[2]; - uint64_t deviation[NumProbes]; - for( int i=0; i<NumProbes; i++ ) - { - _vkGetCalibratedTimestampsEXT( device, 2, spec, ts, deviation+i ); - } - uint64_t minDeviation = deviation[0]; - for( int i=1; i<NumProbes; i++ ) - { - if( minDeviation > deviation[i] ) - { - minDeviation = deviation[i]; - } - } - m_deviation = minDeviation * 3 / 2; - -#if defined _WIN32 - m_qpcToNs = int64_t( 1000000000. 
/ GetFrequencyQpc() ); -#endif - + FindCalibratedTimestampDeviation(); Calibrate( device, m_prevCalibration, tgpu ); tcpu = Profiler::GetTime(); } - uint8_t flags = 0; - if( m_timeDomain != VK_TIME_DOMAIN_DEVICE_EXT ) flags |= GpuContextCalibration; + WriteInitialItem( physdev, tcpu, tgpu ); - auto item = Profiler::QueueSerial(); - MemWrite( &item->hdr.type, QueueType::GpuNewContext ); - MemWrite( &item->gpuNewContext.cpuTime, tcpu ); - MemWrite( &item->gpuNewContext.gpuTime, tgpu ); - memset( &item->gpuNewContext.thread, 0, sizeof( item->gpuNewContext.thread ) ); - MemWrite( &item->gpuNewContext.period, period ); - MemWrite( &item->gpuNewContext.context, m_context ); - MemWrite( &item->gpuNewContext.flags, flags ); - MemWrite( &item->gpuNewContext.type, GpuContextType::Vulkan ); + m_res = (int64_t*)tracy_malloc( sizeof( int64_t ) * m_queryCount ); + } -#ifdef TRACY_ON_DEMAND - GetProfiler().DeferItem( *item ); +#if defined VK_EXT_host_query_reset + /** + * This alternative constructor does not use command buffers and instead uses functionality from + * VK_EXT_host_query_reset (core with 1.2 and non-optional) and VK_EXT_calibrated_timestamps. This requires + * the physical device to have another time domain apart from DEVICE to be calibrateable. 
+ */ +#if defined TRACY_VK_USE_SYMBOL_TABLE + VkCtx( VkInstance instance, VkPhysicalDevice physdev, VkDevice device, PFN_vkGetInstanceProcAddr instanceProcAddr, PFN_vkGetDeviceProcAddr deviceProcAddr ) +#else + VkCtx( VkPhysicalDevice physdev, VkDevice device, PFN_vkResetQueryPoolEXT vkResetQueryPool, PFN_vkGetPhysicalDeviceCalibrateableTimeDomainsEXT vkGetPhysicalDeviceCalibrateableTimeDomainsEXT, PFN_vkGetCalibratedTimestampsEXT vkGetCalibratedTimestampsEXT ) #endif - Profiler::QueueSerialFinish(); + : m_device( device ) + , m_timeDomain( VK_TIME_DOMAIN_DEVICE_EXT ) + , m_context( GetGpuCtxCounter().fetch_add(1, std::memory_order_relaxed) ) + , m_head( 0 ) + , m_tail( 0 ) + , m_oldCnt( 0 ) + , m_queryCount( QueryCount ) +#if !defined TRACY_VK_USE_SYMBOL_TABLE + , m_vkGetCalibratedTimestampsEXT( vkGetCalibratedTimestampsEXT ) +#endif + { + assert( m_context != 255); + +#if defined TRACY_VK_USE_SYMBOL_TABLE + PopulateSymbolTable(instance, instanceProcAddr, deviceProcAddr); + m_vkGetCalibratedTimestampsEXT = m_symbols.vkGetCalibratedTimestampsEXT; +#endif + + assert( VK_FUNCTION_WRAPPER( vkResetQueryPool ) != nullptr ); + assert( VK_FUNCTION_WRAPPER( vkGetPhysicalDeviceCalibrateableTimeDomainsEXT ) != nullptr ); + assert( VK_FUNCTION_WRAPPER( vkGetCalibratedTimestampsEXT ) != nullptr ); + + FindAvailableTimeDomains( physdev, VK_FUNCTION_WRAPPER( vkGetPhysicalDeviceCalibrateableTimeDomainsEXT ) ); + + // We require a host time domain to be available to properly calibrate. 
+ FindCalibratedTimestampDeviation(); + int64_t tgpu; + Calibrate( device, m_prevCalibration, tgpu ); + int64_t tcpu = Profiler::GetTime(); + + CreateQueryPool(); + VK_FUNCTION_WRAPPER( vkResetQueryPool( device, m_query, 0, m_queryCount ) ); + + WriteInitialItem( physdev, tcpu, tgpu ); m_res = (int64_t*)tracy_malloc( sizeof( int64_t ) * m_queryCount ); } +#endif ~VkCtx() { tracy_free( m_res ); - vkDestroyQueryPool( m_device, m_query, nullptr ); + VK_FUNCTION_WRAPPER( vkDestroyQueryPool( m_device, m_query, nullptr ) ); } void Name( const char* name, uint16_t len ) @@ -210,18 +245,23 @@ public: { ZoneScopedC( Color::Red4 ); - if( m_tail == m_head ) return; + const uint64_t head = m_head.load(std::memory_order_relaxed); + if( m_tail == head ) return; #ifdef TRACY_ON_DEMAND if( !GetProfiler().IsConnected() ) { - vkCmdResetQueryPool( cmdbuf, m_query, 0, m_queryCount ); - m_head = m_tail = m_oldCnt = 0; + VK_FUNCTION_WRAPPER( vkCmdResetQueryPool( cmdbuf, m_query, 0, m_queryCount ) ); + m_tail = head; + m_oldCnt = 0; int64_t tgpu; if( m_timeDomain != VK_TIME_DOMAIN_DEVICE_EXT ) Calibrate( m_device, m_prevCalibration, tgpu ); return; } #endif + assert( head > m_tail ); + + const unsigned int wrappedTail = (unsigned int)( m_tail % m_queryCount ); unsigned int cnt; if( m_oldCnt != 0 ) @@ -231,10 +271,16 @@ public: } else { - cnt = m_head < m_tail ? 
m_queryCount - m_tail : m_head - m_tail; + cnt = (unsigned int)( head - m_tail ); + assert( cnt <= m_queryCount ); + if( wrappedTail + cnt > m_queryCount ) + { + cnt = m_queryCount - wrappedTail; + } } - if( vkGetQueryPoolResults( m_device, m_query, m_tail, cnt, sizeof( int64_t ) * m_queryCount, m_res, sizeof( int64_t ), VK_QUERY_RESULT_64_BIT ) == VK_NOT_READY ) + + if( VK_FUNCTION_WRAPPER( vkGetQueryPoolResults( m_device, m_query, wrappedTail, cnt, sizeof( int64_t ) * m_queryCount, m_res, sizeof( int64_t ), VK_QUERY_RESULT_64_BIT ) == VK_NOT_READY ) ) { m_oldCnt = cnt; return; @@ -245,7 +291,7 @@ public: auto item = Profiler::QueueSerial(); MemWrite( &item->hdr.type, QueueType::GpuTime ); MemWrite( &item->gpuTime.gpuTime, m_res[idx] ); - MemWrite( &item->gpuTime.queryId, uint16_t( m_tail + idx ) ); + MemWrite( &item->gpuTime.queryId, uint16_t( wrappedTail + idx ) ); MemWrite( &item->gpuTime.context, m_context ); Profiler::QueueSerialFinish(); } @@ -269,19 +315,16 @@ public: } } - vkCmdResetQueryPool( cmdbuf, m_query, m_tail, cnt ); + VK_FUNCTION_WRAPPER( vkCmdResetQueryPool( cmdbuf, m_query, wrappedTail, cnt ) ); m_tail += cnt; - if( m_tail == m_queryCount ) m_tail = 0; } private: tracy_force_inline unsigned int NextQueryId() { - const auto id = m_head; - m_head = ( m_head + 1 ) % m_queryCount; - assert( m_head != m_tail ); - return id; + const uint64_t id = m_head.fetch_add(1, std::memory_order_relaxed); + return id % m_queryCount; } tracy_force_inline uint8_t GetId() const @@ -315,16 +358,126 @@ private: #endif } + tracy_force_inline void CreateQueryPool() + { + VkQueryPoolCreateInfo poolInfo = {}; + poolInfo.sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO; + poolInfo.queryCount = m_queryCount; + poolInfo.queryType = VK_QUERY_TYPE_TIMESTAMP; + while ( VK_FUNCTION_WRAPPER( vkCreateQueryPool( m_device, &poolInfo, nullptr, &m_query ) != VK_SUCCESS ) ) + { + m_queryCount /= 2; + poolInfo.queryCount = m_queryCount; + } + } + + tracy_force_inline void 
FindAvailableTimeDomains( VkPhysicalDevice physicalDevice, PFN_vkGetPhysicalDeviceCalibrateableTimeDomainsEXT _vkGetPhysicalDeviceCalibrateableTimeDomainsEXT ) + { + uint32_t num; + _vkGetPhysicalDeviceCalibrateableTimeDomainsEXT( physicalDevice, &num, nullptr ); + if(num > 4) num = 4; + VkTimeDomainEXT data[4]; + _vkGetPhysicalDeviceCalibrateableTimeDomainsEXT( physicalDevice, &num, data ); + VkTimeDomainEXT supportedDomain = (VkTimeDomainEXT)-1; +#if defined _WIN32 + supportedDomain = VK_TIME_DOMAIN_QUERY_PERFORMANCE_COUNTER_EXT; +#elif defined __linux__ && defined CLOCK_MONOTONIC_RAW + supportedDomain = VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_EXT; +#endif + for( uint32_t i=0; i<num; i++ ) { + if(data[i] == supportedDomain) { + m_timeDomain = data[i]; + break; + } + } + } + + tracy_force_inline void FindCalibratedTimestampDeviation() + { + assert( m_timeDomain != VK_TIME_DOMAIN_DEVICE_EXT ); + constexpr size_t NumProbes = 32; + VkCalibratedTimestampInfoEXT spec[2] = { + { VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT, nullptr, VK_TIME_DOMAIN_DEVICE_EXT }, + { VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT, nullptr, m_timeDomain }, + }; + uint64_t ts[2]; + uint64_t deviation[NumProbes]; + for( int i=0; i<NumProbes; i++ ) { + m_vkGetCalibratedTimestampsEXT( m_device, 2, spec, ts, deviation + i ); + } + uint64_t minDeviation = deviation[0]; + for( int i=1; i<NumProbes; i++ ) { + if ( minDeviation > deviation[i] ) { + minDeviation = deviation[i]; + } + } + m_deviation = minDeviation * 3 / 2; + +#if defined _WIN32 + m_qpcToNs = int64_t( 1000000000. 
/ GetFrequencyQpc() ); +#endif + } + + tracy_force_inline void WriteInitialItem( VkPhysicalDevice physdev, int64_t tcpu, int64_t tgpu ) + { + uint8_t flags = 0; + if( m_timeDomain != VK_TIME_DOMAIN_DEVICE_EXT ) flags |= GpuContextCalibration; + + VkPhysicalDeviceProperties prop; + VK_FUNCTION_WRAPPER( vkGetPhysicalDeviceProperties( physdev, &prop ) ); + const float period = prop.limits.timestampPeriod; + + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::GpuNewContext ); + MemWrite( &item->gpuNewContext.cpuTime, tcpu ); + MemWrite( &item->gpuNewContext.gpuTime, tgpu ); + memset( &item->gpuNewContext.thread, 0, sizeof( item->gpuNewContext.thread ) ); + MemWrite( &item->gpuNewContext.period, period ); + MemWrite( &item->gpuNewContext.context, m_context ); + MemWrite( &item->gpuNewContext.flags, flags ); + MemWrite( &item->gpuNewContext.type, GpuContextType::Vulkan ); + +#ifdef TRACY_ON_DEMAND + GetProfiler().DeferItem( *item ); +#endif + Profiler::QueueSerialFinish(); + } + +#if defined TRACY_VK_USE_SYMBOL_TABLE + void PopulateSymbolTable( VkInstance instance, PFN_vkGetInstanceProcAddr instanceProcAddr, PFN_vkGetDeviceProcAddr deviceProcAddr ) + { +#define VK_GET_DEVICE_SYMBOL( name ) \ + (PFN_##name)deviceProcAddr( m_device, #name ); +#define VK_LOAD_DEVICE_SYMBOL( name ) \ + m_symbols.name = VK_GET_DEVICE_SYMBOL( name ); +#define VK_GET_INSTANCE_SYMBOL( name ) \ + (PFN_##name)instanceProcAddr( instance, #name ); +#define VK_LOAD_INSTANCE_SYMBOL( name ) \ + m_symbols.name = VK_GET_INSTANCE_SYMBOL( name ); + + LoadVkDeviceCoreSymbols( VK_LOAD_DEVICE_SYMBOL ) + LoadVkDeviceExtensionSymbols( VK_LOAD_DEVICE_SYMBOL ) + LoadVkInstanceCoreSymbols( VK_LOAD_INSTANCE_SYMBOL ) +#undef VK_GET_DEVICE_SYMBOL +#undef VK_LOAD_DEVICE_SYMBOL +#undef VK_GET_INSTANCE_SYMBOL +#undef VK_LOAD_INSTANCE_SYMBOL + } +#endif + VkDevice m_device; VkQueryPool m_query; VkTimeDomainEXT m_timeDomain; +#if defined TRACY_VK_USE_SYMBOL_TABLE + VkSymbolTable m_symbols; 
+#endif uint64_t m_deviation; int64_t m_qpcToNs; int64_t m_prevCalibration; uint8_t m_context; - unsigned int m_head; - unsigned int m_tail; + std::atomic<uint64_t> m_head; + uint64_t m_tail; unsigned int m_oldCnt; unsigned int m_queryCount; @@ -348,7 +501,7 @@ public: m_ctx = ctx; const auto queryId = ctx->NextQueryId(); - vkCmdWriteTimestamp( cmdbuf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, ctx->m_query, queryId ); + CONTEXT_VK_FUNCTION_WRAPPER( vkCmdWriteTimestamp( cmdbuf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, ctx->m_query, queryId ) ); auto item = Profiler::QueueSerial(); MemWrite( &item->hdr.type, QueueType::GpuZoneBeginSerial ); @@ -372,7 +525,7 @@ public: m_ctx = ctx; const auto queryId = ctx->NextQueryId(); - vkCmdWriteTimestamp( cmdbuf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, ctx->m_query, queryId ); + CONTEXT_VK_FUNCTION_WRAPPER( vkCmdWriteTimestamp( cmdbuf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, ctx->m_query, queryId ) ); auto item = Profiler::QueueSerialCallstack( Callstack( depth ) ); MemWrite( &item->hdr.type, QueueType::GpuZoneBeginCallstackSerial ); @@ -396,7 +549,7 @@ public: m_ctx = ctx; const auto queryId = ctx->NextQueryId(); - vkCmdWriteTimestamp( cmdbuf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, ctx->m_query, queryId ); + CONTEXT_VK_FUNCTION_WRAPPER( vkCmdWriteTimestamp( cmdbuf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, ctx->m_query, queryId ) ); const auto srcloc = Profiler::AllocSourceLocation( line, source, sourceSz, function, functionSz, name, nameSz ); auto item = Profiler::QueueSerial(); @@ -421,7 +574,7 @@ public: m_ctx = ctx; const auto queryId = ctx->NextQueryId(); - vkCmdWriteTimestamp( cmdbuf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, ctx->m_query, queryId ); + CONTEXT_VK_FUNCTION_WRAPPER( vkCmdWriteTimestamp( cmdbuf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, ctx->m_query, queryId ) ); const auto srcloc = Profiler::AllocSourceLocation( line, source, sourceSz, function, functionSz, name, nameSz ); auto item = Profiler::QueueSerialCallstack( Callstack( 
depth ) ); @@ -439,7 +592,7 @@ public: if( !m_active ) return; const auto queryId = m_ctx->NextQueryId(); - vkCmdWriteTimestamp( m_cmdbuf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, m_ctx->m_query, queryId ); + CONTEXT_VK_FUNCTION_WRAPPER( vkCmdWriteTimestamp( m_cmdbuf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, m_ctx->m_query, queryId ) ); auto item = Profiler::QueueSerial(); MemWrite( &item->hdr.type, QueueType::GpuZoneEndSerial ); @@ -457,13 +610,38 @@ private: VkCtx* m_ctx; }; +#if defined TRACY_VK_USE_SYMBOL_TABLE +static inline VkCtx* CreateVkContext( VkInstance instance, VkPhysicalDevice physdev, VkDevice device, VkQueue queue, VkCommandBuffer cmdbuf, PFN_vkGetInstanceProcAddr instanceProcAddr, PFN_vkGetDeviceProcAddr getDeviceProcAddr, bool calibrated = false ) +#else static inline VkCtx* CreateVkContext( VkPhysicalDevice physdev, VkDevice device, VkQueue queue, VkCommandBuffer cmdbuf, PFN_vkGetPhysicalDeviceCalibrateableTimeDomainsEXT gpdctd, PFN_vkGetCalibratedTimestampsEXT gct ) +#endif { auto ctx = (VkCtx*)tracy_malloc( sizeof( VkCtx ) ); +#if defined TRACY_VK_USE_SYMBOL_TABLE + new(ctx) VkCtx( instance, physdev, device, queue, cmdbuf, instanceProcAddr, getDeviceProcAddr, calibrated ); +#else new(ctx) VkCtx( physdev, device, queue, cmdbuf, gpdctd, gct ); +#endif return ctx; } +#if defined VK_EXT_host_query_reset +#if defined TRACY_VK_USE_SYMBOL_TABLE +static inline VkCtx* CreateVkContext( VkInstance instance, VkPhysicalDevice physdev, VkDevice device, PFN_vkGetInstanceProcAddr instanceProcAddr, PFN_vkGetDeviceProcAddr getDeviceProcAddr ) +#else +static inline VkCtx* CreateVkContext( VkPhysicalDevice physdev, VkDevice device, PFN_vkResetQueryPoolEXT qpreset, PFN_vkGetPhysicalDeviceCalibrateableTimeDomainsEXT gpdctd, PFN_vkGetCalibratedTimestampsEXT gct ) +#endif +{ + auto ctx = (VkCtx*)tracy_malloc( sizeof( VkCtx ) ); +#if defined TRACY_VK_USE_SYMBOL_TABLE + new(ctx) VkCtx( instance, physdev, device, instanceProcAddr, getDeviceProcAddr ); +#else + new(ctx) VkCtx( 
physdev, device, qpreset, gpdctd, gct ); +#endif + return ctx; +} +#endif + static inline void DestroyVkContext( VkCtx* ctx ) { ctx->~VkCtx(); @@ -474,8 +652,23 @@ static inline void DestroyVkContext( VkCtx* ctx ) using TracyVkCtx = tracy::VkCtx*; +#if defined TRACY_VK_USE_SYMBOL_TABLE +#define TracyVkContext( instance, physdev, device, queue, cmdbuf, instanceProcAddr, deviceProcAddr ) tracy::CreateVkContext( instance, physdev, device, queue, cmdbuf, instanceProcAddr, deviceProcAddr ); +#else #define TracyVkContext( physdev, device, queue, cmdbuf ) tracy::CreateVkContext( physdev, device, queue, cmdbuf, nullptr, nullptr ); +#endif +#if defined TRACY_VK_USE_SYMBOL_TABLE +#define TracyVkContextCalibrated( instance, physdev, device, queue, cmdbuf, instanceProcAddr, deviceProcAddr ) tracy::CreateVkContext( instance, physdev, device, queue, cmdbuf, instanceProcAddr, deviceProcAddr, true ); +#else #define TracyVkContextCalibrated( physdev, device, queue, cmdbuf, gpdctd, gct ) tracy::CreateVkContext( physdev, device, queue, cmdbuf, gpdctd, gct ); +#endif +#if defined VK_EXT_host_query_reset +#if defined TRACY_VK_USE_SYMBOL_TABLE +#define TracyVkContextHostCalibrated( instance, physdev, device, instanceProcAddr, deviceProcAddr ) tracy::CreateVkContext( instance, physdev, device, instanceProcAddr, deviceProcAddr ); +#else +#define TracyVkContextHostCalibrated( physdev, device, qpreset, gpdctd, gct ) tracy::CreateVkContext( physdev, device, qpreset, gpdctd, gct ); +#endif +#endif #define TracyVkDestroy( ctx ) tracy::DestroyVkContext( ctx ); #define TracyVkContextName( ctx, name, size ) ctx->Name( name, size ); #if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK