1482 lines
47 KiB
C++
1482 lines
47 KiB
C++
// Copyright Epic Games, Inc. All Rights Reserved.
|
|
|
|
#include "D3D12Submission.h"
|
|
#include "D3D12RHIPrivate.h"
|
|
#include "HAL/Runnable.h"
|
|
#include "HAL/RunnableThread.h"
|
|
#include "IRenderCaptureProvider.h"
|
|
#include "Stats/ThreadIdleStats.h"
|
|
#include "ProfilingDebugging/AssetMetadataTrace.h"
|
|
|
|
#ifndef D3D12_PLATFORM_SUPPORTS_BLOCKING_FENCES
|
|
#define D3D12_PLATFORM_SUPPORTS_BLOCKING_FENCES 1
|
|
#endif
|
|
|
|
// These defines control which threads are enabled in the GPU submission pipeline.
|
|
#define D3D12_USE_SUBMISSION_THREAD (1)
|
|
#define D3D12_USE_INTERRUPT_THREAD (1 && D3D12_PLATFORM_SUPPORTS_BLOCKING_FENCES)
|
|
|
|
static TAutoConsoleVariable<int32> CVarRHIUseSubmissionThread(
|
|
TEXT("rhi.UseSubmissionThread"),
|
|
2,
|
|
TEXT("Whether to enable the RHI submission thread.\n")
|
|
TEXT(" 0: No\n")
|
|
TEXT(" 1: Yes, but not when running with multi-gpu.\n")
|
|
TEXT(" 2: Yes, always\n"),
|
|
ECVF_ReadOnly);
|
|
|
|
DECLARE_CYCLE_STAT(TEXT("Submit"), STAT_D3D12Submit, STATGROUP_D3D12RHI);
|
|
|
|
DECLARE_CYCLE_STAT(TEXT("GPU Total Time [All Queues]"), STAT_RHI_GPUTotalTime, STATGROUP_D3D12RHI);
|
|
DECLARE_CYCLE_STAT(TEXT("GPU Total Time [Hardware Timer]"), STAT_RHI_GPUTotalTimeHW, STATGROUP_D3D12RHI);
|
|
DECLARE_CYCLE_STAT(TEXT("GPU Total Time [Graphics]"), STAT_RHI_GPUTotalTimeGraphics, STATGROUP_D3D12RHI);
|
|
DECLARE_CYCLE_STAT(TEXT("GPU Total Time [Async Compute]"), STAT_RHI_GPUTotalTimeAsyncCompute, STATGROUP_D3D12RHI);
|
|
DECLARE_CYCLE_STAT(TEXT("GPU Total Time [Copy]"), STAT_RHI_GPUTotalTimeCopy, STATGROUP_D3D12RHI);
|
|
|
|
DECLARE_STATS_GROUP(TEXT("D3D12RHIPipeline"), STATGROUP_D3D12RHIPipeline, STATCAT_Advanced);
|
|
|
|
DECLARE_DWORD_ACCUMULATOR_STAT(TEXT("GPU IA Vertices" ), STAT_D3D12RHI_IAVertices , STATGROUP_D3D12RHIPipeline);
|
|
DECLARE_DWORD_ACCUMULATOR_STAT(TEXT("GPU IA Primitives" ), STAT_D3D12RHI_IAPrimitives , STATGROUP_D3D12RHIPipeline);
|
|
DECLARE_DWORD_ACCUMULATOR_STAT(TEXT("GPU VS Invocations"), STAT_D3D12RHI_VSInvocations, STATGROUP_D3D12RHIPipeline);
|
|
DECLARE_DWORD_ACCUMULATOR_STAT(TEXT("GPU GS Invocations"), STAT_D3D12RHI_GSInvocations, STATGROUP_D3D12RHIPipeline);
|
|
DECLARE_DWORD_ACCUMULATOR_STAT(TEXT("GPU GS Primitives" ), STAT_D3D12RHI_GSPrimitives , STATGROUP_D3D12RHIPipeline);
|
|
DECLARE_DWORD_ACCUMULATOR_STAT(TEXT("GPU C Invocations" ), STAT_D3D12RHI_CInvocations , STATGROUP_D3D12RHIPipeline);
|
|
DECLARE_DWORD_ACCUMULATOR_STAT(TEXT("GPU C Primitives" ), STAT_D3D12RHI_CPrimitives , STATGROUP_D3D12RHIPipeline);
|
|
DECLARE_DWORD_ACCUMULATOR_STAT(TEXT("GPU PS Invocations"), STAT_D3D12RHI_PSInvocations, STATGROUP_D3D12RHIPipeline);
|
|
DECLARE_DWORD_ACCUMULATOR_STAT(TEXT("GPU HS Invocations"), STAT_D3D12RHI_HSInvocations, STATGROUP_D3D12RHIPipeline);
|
|
DECLARE_DWORD_ACCUMULATOR_STAT(TEXT("GPU DS Invocations"), STAT_D3D12RHI_DSInvocations, STATGROUP_D3D12RHIPipeline);
|
|
DECLARE_DWORD_ACCUMULATOR_STAT(TEXT("GPU CS Invocations"), STAT_D3D12RHI_CSInvocations, STATGROUP_D3D12RHIPipeline);
|
|
|
|
static float GD3D12SubmissionTimeout = 5.0;
|
|
static FAutoConsoleVariableRef CVarD3D12SubmissionTimeout(
|
|
TEXT("r.D3D12.SubmissionTimeout"),
|
|
GD3D12SubmissionTimeout,
|
|
TEXT("The maximum time, in seconds, that a submitted GPU command list is allowed to take before the RHI reports a GPU hang"),
|
|
ECVF_RenderThreadSafe);
|
|
|
|
static int32 GD3D12SubmissionMaxExecuteBatchSizeDirect = std::numeric_limits<int32>::max();
|
|
static FAutoConsoleVariableRef CVarD3D12SubmissionMaxExecuteBatchSizeDirect(
|
|
TEXT("r.D3D12.Submission.MaxExecuteBatchSize.Direct"),
|
|
GD3D12SubmissionMaxExecuteBatchSizeDirect,
|
|
TEXT("The maximum number of command lists to pass to a single ExecuteCommandLists invocation for direct queues\n")
|
|
TEXT("The valid range is 1 to INT_MAX inclusive. Values less than 1 will be clamped to 1"),
|
|
ECVF_RenderThreadSafe);
|
|
|
|
static int32 GD3D12SubmissionMaxExecuteBatchSizeCopy = std::numeric_limits<int32>::max();
|
|
static FAutoConsoleVariableRef CVarD3D12SubmissionMaxExecuteBatchSizeCopy(
|
|
TEXT("r.D3D12.Submission.MaxExecuteBatchSize.Copy"),
|
|
GD3D12SubmissionMaxExecuteBatchSizeCopy,
|
|
TEXT("The maximum number of command lists to pass to a single ExecuteCommandLists invocation for copy queues\n")
|
|
TEXT("The valid range is 1 to INT_MAX inclusive. Values less than 1 will be clamped to 1"),
|
|
ECVF_RenderThreadSafe);
|
|
|
|
static int32 GD3D12SubmissionMaxExecuteBatchSizeAsync = std::numeric_limits<int32>::max();
|
|
static FAutoConsoleVariableRef CVarD3D12SubmissionMaxExecuteBatchSizeAsync(
|
|
TEXT("r.D3D12.Submission.MaxExecuteBatchSize.Async"),
|
|
GD3D12SubmissionMaxExecuteBatchSizeAsync,
|
|
TEXT("The maximum number of command lists to pass to a single ExecuteCommandLists invocation for async queues\n")
|
|
TEXT("The valid range is 1 to INT_MAX inclusive. Values less than 1 will be clamped to 1"),
|
|
ECVF_RenderThreadSafe);
|
|
|
|
static std::atomic<int> GGPUCrashDetected = false;
|
|
|
|
class FD3D12Thread final : private FRunnable
|
|
{
|
|
public:
|
|
typedef FD3D12DynamicRHI::FProcessResult(FD3D12DynamicRHI::*FQueueFunc)();
|
|
|
|
FD3D12Thread(TCHAR const* Name, EThreadPriority Priority, FD3D12DynamicRHI* RHI, FQueueFunc QueueFunc)
|
|
: RHI(RHI)
|
|
, QueueFunc(QueueFunc)
|
|
, Event(CreateEvent(nullptr, false, false, nullptr))
|
|
, Thread(FRunnableThread::Create(this, Name, 0, Priority))
|
|
{}
|
|
|
|
virtual ~FD3D12Thread()
|
|
{
|
|
bExit = true;
|
|
SetEvent(Event);
|
|
|
|
Thread->WaitForCompletion();
|
|
delete Thread;
|
|
|
|
CloseHandle(Event);
|
|
}
|
|
|
|
void Kick() const
|
|
{
|
|
SetEvent(Event);
|
|
}
|
|
|
|
void Join() const
|
|
{
|
|
Thread->WaitForCompletion();
|
|
}
|
|
|
|
uint32 GetThreadID() const
|
|
{
|
|
return Thread->GetThreadID();
|
|
}
|
|
|
|
private:
|
|
virtual uint32 Run() override
|
|
{
|
|
while (!bExit)
|
|
{
|
|
// Process the queue until no more progress is made
|
|
FD3D12DynamicRHI::FProcessResult Result;
|
|
do { Result = (RHI->*QueueFunc)(); }
|
|
while (EnumHasAllFlags(Result.Status, FD3D12DynamicRHI::EQueueStatus::Processed));
|
|
|
|
WaitForSingleObject(Event, Result.WaitTimeout);
|
|
}
|
|
|
|
// Drain any remaining work in the queue
|
|
while (EnumHasAllFlags((RHI->*QueueFunc)().Status, FD3D12DynamicRHI::EQueueStatus::Pending)) {}
|
|
|
|
return 0;
|
|
}
|
|
|
|
FD3D12DynamicRHI* RHI;
|
|
FQueueFunc QueueFunc;
|
|
bool bExit = false;
|
|
|
|
public:
|
|
// Can't use FEvent here since we need to be able to get the underlying HANDLE
|
|
// for the ID3D12Fences to signal via ID3D12Fence::SetEventOnCompletion().
|
|
HANDLE const Event;
|
|
|
|
private:
|
|
FRunnableThread* Thread = nullptr;
|
|
};
|
|
|
|
void FD3D12DynamicRHI::InitializeSubmissionPipe()
|
|
{
|
|
if (FPlatformProcess::SupportsMultithreading())
|
|
{
|
|
#if D3D12_USE_INTERRUPT_THREAD
|
|
InterruptThread = new FD3D12Thread(TEXT("RHIInterruptThread"), TPri_Highest, this, &FD3D12DynamicRHI::ProcessInterruptQueue);
|
|
#endif
|
|
|
|
#if D3D12_USE_SUBMISSION_THREAD
|
|
bool bUseSubmissionThread = false;
|
|
switch (CVarRHIUseSubmissionThread.GetValueOnAnyThread())
|
|
{
|
|
case 1: bUseSubmissionThread = FRHIGPUMask::All().HasSingleIndex(); break;
|
|
case 2: bUseSubmissionThread = true; break;
|
|
}
|
|
|
|
// Currently RenderDoc can't make programmatic captures when we use a submission thread.
|
|
bUseSubmissionThread &= !IRenderCaptureProvider::IsAvailable() || IRenderCaptureProvider::Get().CanSupportSubmissionThread();
|
|
|
|
if (bUseSubmissionThread)
|
|
{
|
|
SubmissionThread = new FD3D12Thread(TEXT("RHISubmissionThread"), TPri_Highest, this, &FD3D12DynamicRHI::ProcessSubmissionQueue);
|
|
}
|
|
#endif
|
|
}
|
|
|
|
// Initialize the timing structs in each queue, and the engine GPU profilers
|
|
{
|
|
TArray<FD3D12Payload*> Payloads;
|
|
#if RHI_NEW_GPU_PROFILER
|
|
TArray<UE::RHI::GPUProfiler::FQueue> ProfilerQueues;
|
|
#endif
|
|
|
|
ForEachQueue([&](FD3D12Queue& Queue)
|
|
{
|
|
FD3D12Payload* Payload = Payloads.Emplace_GetRef(new FD3D12Payload(Queue));
|
|
Payload->Timing = CurrentTimingPerQueue.CreateNew(Queue);
|
|
|
|
#if RHI_NEW_GPU_PROFILER
|
|
ProfilerQueues.Add(Queue.GetProfilerQueue());
|
|
#endif
|
|
});
|
|
|
|
#if RHI_NEW_GPU_PROFILER
|
|
UE::RHI::GPUProfiler::InitializeQueues(ProfilerQueues);
|
|
#endif
|
|
SubmitPayloads(MoveTemp(Payloads));
|
|
}
|
|
}
|
|
|
|
void FD3D12DynamicRHI::ShutdownSubmissionPipe()
|
|
{
|
|
delete SubmissionThread;
|
|
SubmissionThread = nullptr;
|
|
|
|
delete InterruptThread;
|
|
InterruptThread = nullptr;
|
|
|
|
if (EopTask)
|
|
{
|
|
ProcessInterruptQueueUntil(EopTask);
|
|
EopTask = nullptr;
|
|
}
|
|
}
|
|
|
|
// A finalized set of command payloads. This type is used to implement the RHI command list submission API.
|
|
struct FD3D12FinalizedCommands : public IRHIPlatformCommandList, public TArray<FD3D12Payload*>
|
|
{};
|
|
|
|
void FD3D12DynamicRHI::RHIFinalizeContext(FRHIFinalizeContextArgs&& Args, TRHIPipelineArray<IRHIPlatformCommandList*>& Output)
|
|
{
|
|
auto FinalizeContext = [&](FD3D12CommandContext* CmdContext, FD3D12FinalizedCommands& Result)
|
|
{
|
|
CmdContext->Finalize(Result);
|
|
|
|
if (!CmdContext->IsDefaultContext())
|
|
{
|
|
CmdContext->ClearState();
|
|
CmdContext->GetParentDevice()->ReleaseContext(CmdContext);
|
|
}
|
|
};
|
|
|
|
for(IRHIComputeContext* Context : Args.Contexts)
|
|
{
|
|
FD3D12FinalizedCommands Result;
|
|
ERHIPipeline Pipeline = Context->GetPipeline();
|
|
|
|
FD3D12CommandContextBase* CmdContextBase = static_cast<FD3D12CommandContextBase*>(Context);
|
|
if (FD3D12CommandContextRedirector* Redirector = CmdContextBase->AsRedirector())
|
|
{
|
|
for (uint32 GPUIndex : Redirector->GetPhysicalGPUMask())
|
|
FinalizeContext(Redirector->GetSingleDeviceContext(GPUIndex), Result);
|
|
|
|
if (!Redirector->bIsDefaultContext)
|
|
{
|
|
delete Redirector;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
FD3D12CommandContext* CmdContext = static_cast<FD3D12CommandContext*>(CmdContextBase);
|
|
FinalizeContext(CmdContext, Result);
|
|
}
|
|
|
|
Output[Pipeline] = Result.Num() ? new FD3D12FinalizedCommands(MoveTemp(Result)) : nullptr;
|
|
}
|
|
}
|
|
|
|
void FD3D12DynamicRHI::RHISubmitCommandLists(FRHISubmitCommandListsArgs&& Args)
|
|
{
|
|
SubmitCommands(MakeArrayView(reinterpret_cast<FD3D12FinalizedCommands**>(Args.CommandLists.GetData()), Args.CommandLists.Num()));
|
|
}
|
|
|
|
void FD3D12DynamicRHI::SubmitCommands(TConstArrayView<FD3D12FinalizedCommands*> Commands)
|
|
{
|
|
SCOPED_NAMED_EVENT_TEXT("CommandList_Submit", FColor::Magenta);
|
|
|
|
#if RHI_NEW_GPU_PROFILER
|
|
|
|
TArray<FD3D12Payload*> AllPayloads;
|
|
for (FD3D12FinalizedCommands* Payloads : Commands)
|
|
{
|
|
#if WITH_RHI_BREADCRUMBS
|
|
TSharedPtr<FRHIBreadcrumbAllocatorArray> BreadcrumbAllocators {};
|
|
if (Payloads->BreadcrumbAllocators.Num())
|
|
{
|
|
BreadcrumbAllocators = MakeShared<FRHIBreadcrumbAllocatorArray>(MoveTemp(Payloads->BreadcrumbAllocators));
|
|
}
|
|
|
|
for (FD3D12Payload* Payload : *Payloads)
|
|
{
|
|
Payload->BreadcrumbRange = Payloads->BreadcrumbRange;
|
|
if (BreadcrumbAllocators.IsValid())
|
|
{
|
|
check(!Payload->BreadcrumbAllocators.IsValid());
|
|
Payload->BreadcrumbAllocators = BreadcrumbAllocators;
|
|
}
|
|
}
|
|
#endif
|
|
|
|
AllPayloads.Append(MoveTemp(*Payloads));
|
|
delete Payloads;
|
|
}
|
|
|
|
SubmitPayloads(MoveTemp(AllPayloads));
|
|
|
|
#else
|
|
|
|
TArray<FD3D12Payload*> AllPayloads;
|
|
#if WITH_RHI_BREADCRUMBS
|
|
TArray<TSharedPtr<FRHIBreadcrumbAllocator>> BreadcrumbAllocators;
|
|
#endif
|
|
|
|
for (FD3D12FinalizedCommands* Payloads : Commands)
|
|
{
|
|
#if WITH_RHI_BREADCRUMBS
|
|
for (FD3D12Payload* Payload : *Payloads)
|
|
{
|
|
Payload->BreadcrumbRange = Payloads->BreadcrumbRange;
|
|
}
|
|
#endif
|
|
|
|
AllPayloads.Append(MoveTemp(static_cast<TArray<FD3D12Payload*>&>(*Payloads)));
|
|
#if WITH_RHI_BREADCRUMBS
|
|
BreadcrumbAllocators.Append(MoveTemp(Payloads->BreadcrumbAllocators));
|
|
#endif
|
|
delete Payloads;
|
|
}
|
|
|
|
SubmitPayloads(MoveTemp(AllPayloads));
|
|
|
|
#if WITH_RHI_BREADCRUMBS
|
|
// Enqueue the breadcrumb allocator references for cleanup once all prior payloads have completed on the GPU.
|
|
DeferredDelete([Array = MoveTemp(BreadcrumbAllocators)]() {});
|
|
#endif
|
|
|
|
#endif
|
|
}
|
|
|
|
void FD3D12DynamicRHI::SubmitPayloads(TArray<FD3D12Payload*>&& Payloads)
|
|
{
|
|
if (Payloads.Num())
|
|
{
|
|
PendingPayloadsForSubmission.Enqueue(new TArray<FD3D12Payload*>(MoveTemp(Payloads)));
|
|
}
|
|
|
|
if (SubmissionThread)
|
|
{
|
|
SubmissionThread->Kick();
|
|
}
|
|
else
|
|
{
|
|
// Since we're processing directly on the calling thread, we need to take a scope lock.
|
|
// Multiple engine threads might be calling Submit().
|
|
{
|
|
FScopeLock Lock(&SubmissionCS);
|
|
|
|
// Process the submission queue until no further progress is being made.
|
|
while (EnumHasAnyFlags(ProcessSubmissionQueue().Status, EQueueStatus::Processed)) {}
|
|
}
|
|
}
|
|
|
|
// Use this opportunity to pump the interrupt queue
|
|
ProcessInterruptQueueUntil(nullptr);
|
|
}
|
|
|
|
static int32 GetMaxExecuteBatchSize(ED3D12QueueType QueueType)
|
|
{
|
|
switch (QueueType)
|
|
{
|
|
case ED3D12QueueType::Direct:
|
|
return std::max(1, GD3D12SubmissionMaxExecuteBatchSizeDirect);
|
|
case ED3D12QueueType::Copy:
|
|
return std::max(1, GD3D12SubmissionMaxExecuteBatchSizeCopy);
|
|
case ED3D12QueueType::Async:
|
|
return std::max(1, GD3D12SubmissionMaxExecuteBatchSizeAsync);
|
|
default:
|
|
// Need to add new queue type and CVar
|
|
checkNoEntry();
|
|
return std::numeric_limits<int32>::max();
|
|
}
|
|
}
|
|
|
|
FD3D12DynamicRHI::FProcessResult FD3D12DynamicRHI::ProcessSubmissionQueue()
|
|
{
|
|
SCOPED_NAMED_EVENT_TEXT("SubmissionQueue_Process", FColor::Turquoise);
|
|
SCOPE_CYCLE_COUNTER(STAT_D3D12Submit);
|
|
LLM_SCOPE_BYNAME(TEXT("RHIMisc/ProcessSubmissionQueue"));
|
|
|
|
FD3D12Queue::FPayloadArray PayloadsToHandDown;
|
|
|
|
FProcessResult Result;
|
|
|
|
auto FlushPayloads = [&PayloadsToHandDown, &Result, DynamicRHI = this](int32 MinPayloadsToFlush = 1)
|
|
{
|
|
if (PayloadsToHandDown.Num() >= MinPayloadsToFlush)
|
|
{
|
|
Result.Status |= EQueueStatus::Processed;
|
|
DynamicRHI->FlushBatchedPayloads(PayloadsToHandDown);
|
|
}
|
|
};
|
|
|
|
bool bProgress;
|
|
bool bKickInterruptThread = false;
|
|
|
|
do
|
|
{
|
|
bProgress = false;
|
|
Result.Status = EQueueStatus::None;
|
|
|
|
// Push all pending payloads into the ordered per-device, per-pipe pending queues
|
|
{
|
|
TArray<FD3D12Payload*>* Array;
|
|
while (PendingPayloadsForSubmission.Dequeue(Array))
|
|
{
|
|
for (FD3D12Payload* Payload : *Array)
|
|
{
|
|
Payload->Queue.PendingSubmission.Enqueue(Payload);
|
|
}
|
|
delete Array;
|
|
}
|
|
}
|
|
|
|
//
|
|
// Fence values for FD3D12SyncPoint are determined on the submission thread,
|
|
// where each queue has a monotonically incrementing fence value.
|
|
//
|
|
// We might receive work that waits on a sync point which has not yet been submitted
|
|
// to the queue that will signal it, so we need to delay processing of those
|
|
// payloads until the fence value is known.
|
|
//
|
|
|
|
// Process all queues (across all devices and adapters) to flush work.
|
|
// Any sync point waits where the fence value is unknown will be left in the
|
|
// appropriate queue, to be processed the next time commands are submitted.
|
|
ForEachQueue([&](FD3D12Queue& CurrentQueue)
|
|
{
|
|
while (true)
|
|
{
|
|
{
|
|
FD3D12Payload** PayloadPtr = CurrentQueue.PendingSubmission.Peek();
|
|
if (!PayloadPtr)
|
|
return;
|
|
|
|
FD3D12Payload* Payload = *PayloadPtr;
|
|
|
|
// Accumulate the list of fences to await, and their maximum values
|
|
while (Payload->SyncPointsToWait.Index < Payload->SyncPointsToWait.Num())
|
|
{
|
|
FD3D12SyncPointRef& SyncPoint = Payload->SyncPointsToWait[Payload->SyncPointsToWait.Index];
|
|
if (!SyncPoint->ResolvedFence.IsSet())
|
|
{
|
|
// Need to wait on a sync point, but the fence value has not been resolved yet
|
|
// (no other payloads have signaled the sync point yet).
|
|
|
|
// Skip processing this queue, and move on to the next. We will retry later when
|
|
// further work is submitted, which may contain the sync point we need.
|
|
Result.Status |= EQueueStatus::Pending;
|
|
return;
|
|
}
|
|
|
|
Payload->AddQueueFenceWait(
|
|
SyncPoint->ResolvedFence->Fence,
|
|
SyncPoint->ResolvedFence->Value
|
|
);
|
|
|
|
Payload->SyncPointsToWait.Index++;
|
|
bProgress = true;
|
|
}
|
|
|
|
// All necessary sync points have been resolved.
|
|
Payload->SyncPointsToWait = {};
|
|
CurrentQueue.PendingSubmission.Dequeue();
|
|
bProgress = true;
|
|
|
|
check(!CurrentQueue.PayloadToSubmit);
|
|
CurrentQueue.PayloadToSubmit = Payload;
|
|
Result.Status |= EQueueStatus::Processed;
|
|
bKickInterruptThread = true;
|
|
|
|
//
|
|
// Now we generate any required barrier command lists. These may require
|
|
// executing on a different queue (e.g. graphics-only transitions required
|
|
// before async compute work), so we gather potential work across all
|
|
// queues for this device.
|
|
//
|
|
auto AccumulateQueries = [&](FD3D12CommandList* CommandList)
|
|
{
|
|
FD3D12Queue& TargetQueue = CommandList->Device->GetQueue(CommandList->QueueType);
|
|
const uint32 MaxBatchSize = GetMaxExecuteBatchSize(TargetQueue.QueueType);
|
|
|
|
// Occlusion + Pipeline Stats Queries
|
|
TargetQueue.BatchedObjects.OcclusionQueries.Append(MoveTemp(CommandList->State.OcclusionQueries));
|
|
TargetQueue.BatchedObjects.PipelineStatsQueries.Append(MoveTemp(CommandList->State.PipelineStatsQueries));
|
|
|
|
#if RHI_NEW_GPU_PROFILER
|
|
TargetQueue.BatchedObjects.TimestampQueries.Append(MoveTemp(CommandList->State.TimestampQueries));
|
|
#else
|
|
// Timestamp Queries
|
|
if (CommandList->State.BeginTimestamp)
|
|
{
|
|
// Keep only the first Begin() in the batch
|
|
if (TargetQueue.NumCommandListsInBatch++ == 0)
|
|
{
|
|
TargetQueue.BatchedObjects.TimestampQueries.Emplace(MoveTemp(CommandList->State.BeginTimestamp));
|
|
}
|
|
else
|
|
{
|
|
// Remove the previous End() timestamp, to join the range together.
|
|
check(TargetQueue.BatchedObjects.TimestampQueries.Last().Type == ED3D12QueryType::CommandListEnd);
|
|
TargetQueue.BatchedObjects.TimestampQueries.RemoveAt(TargetQueue.BatchedObjects.TimestampQueries.Num() - 1);
|
|
}
|
|
|
|
TargetQueue.BatchedObjects.TimestampQueries.Append(MoveTemp(CommandList->State.TimestampQueries));
|
|
TargetQueue.BatchedObjects.TimestampQueries.Emplace(MoveTemp(CommandList->State.EndTimestamp));
|
|
|
|
if (TargetQueue.NumCommandListsInBatch >= MaxBatchSize)
|
|
{
|
|
// Start a new batch
|
|
TargetQueue.NumCommandListsInBatch = 0;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
// No begin timestamp means timestamps aren't supported on this queue
|
|
check(CommandList->State.TimestampQueries.IsEmpty());
|
|
check(!CommandList->State.EndTimestamp);
|
|
}
|
|
#endif
|
|
};
|
|
|
|
for (int32 Index = 0; Index < Payload->CommandListsToExecute.Num(); Index++)
|
|
{
|
|
FD3D12CommandList* CurrentCommandList = Payload->CommandListsToExecute[Index];
|
|
AccumulateQueries(CurrentCommandList);
|
|
}
|
|
}
|
|
|
|
FlushPayloads(FD3D12Queue::MaxBatchedPayloads);
|
|
|
|
// Now submit the original payload
|
|
CurrentQueue.FinalizePayload(false, PayloadsToHandDown);
|
|
FlushPayloads(FD3D12Queue::MaxBatchedPayloads);
|
|
}
|
|
});
|
|
} while (bProgress);
|
|
|
|
FlushPayloads();
|
|
|
|
if (InterruptThread && bKickInterruptThread)
|
|
{
|
|
InterruptThread->Kick();
|
|
}
|
|
|
|
return Result;
|
|
}
|
|
|
|
uint64 FD3D12Queue::FinalizePayload(bool bRequiresSignal, FPayloadArray& PayloadsToHandDown)
|
|
{
|
|
TRACE_CPUPROFILER_EVENT_SCOPE(ExecuteCommandList);
|
|
LLM_SCOPE_BYNAME(TEXT("RHIMisc/ExecuteCommandLists"));
|
|
|
|
check(PayloadToSubmit && this == &PayloadToSubmit->Queue);
|
|
check(PayloadToSubmit->SyncPointsToWait.Num() == 0);
|
|
|
|
NumCommandListsInBatch = 0;
|
|
|
|
BarrierTimestamps.CloseAndReset(PayloadToSubmit->BatchedObjects.QueryRanges);
|
|
|
|
// Gather query ranges from this payload, grouping by heap pointer
|
|
if (BatchedObjects.QueryRanges.Num())
|
|
{
|
|
for (auto& [Heap, Ranges] : PayloadToSubmit->BatchedObjects.QueryRanges)
|
|
{
|
|
BatchedObjects.QueryRanges.FindOrAdd(Heap).Append(MoveTemp(Ranges));
|
|
}
|
|
PayloadToSubmit->BatchedObjects.QueryRanges.Reset();
|
|
}
|
|
else
|
|
{
|
|
BatchedObjects.QueryRanges = MoveTemp(PayloadToSubmit->BatchedObjects.QueryRanges);
|
|
}
|
|
|
|
check(PayloadToSubmit->BatchedObjects.IsEmpty());
|
|
|
|
if (!BatchedObjects.IsEmpty())
|
|
{
|
|
// Always resolve queries if we're switching the Timing struct,
|
|
// since we need to gather the timestamps for that frame.
|
|
bool bResolveQueries = PayloadToSubmit->Timing.IsSet();
|
|
|
|
if (!bResolveQueries)
|
|
{
|
|
// If this payload will signal a CPU-visible sync point, we need to resolve queries.
|
|
// This makes sure that the query data has reached the CPU before the sync point the CPU is waiting on is signaled.
|
|
for (FD3D12SyncPoint* SyncPoint : PayloadToSubmit->SyncPointsToSignal)
|
|
{
|
|
if (SyncPoint->GetType() == ED3D12SyncPointType::GPUAndCPU)
|
|
{
|
|
bResolveQueries = true;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (bResolveQueries)
|
|
{
|
|
{
|
|
FD3D12CommandList* ResolveCommandList = nullptr;
|
|
|
|
// We've got queries to resolve. Allocate a command list.
|
|
auto GetResolveCommandList = [&]() -> FD3D12CommandList*
|
|
{
|
|
if (ResolveCommandList)
|
|
return ResolveCommandList;
|
|
|
|
if (!BarrierAllocator)
|
|
BarrierAllocator = Device->ObtainCommandAllocator(QueueType);
|
|
|
|
return ResolveCommandList = Device->ObtainCommandList(BarrierAllocator, nullptr, nullptr);
|
|
};
|
|
|
|
// Ranges are grouped by heap pointer.
|
|
for (auto& [Heap, Ranges] : BatchedObjects.QueryRanges)
|
|
{
|
|
{
|
|
#if ENABLE_RESIDENCY_MANAGEMENT
|
|
TArray<FD3D12ResidencyHandle*, TInlineAllocator<2>> ResidencyHandles;
|
|
ResidencyHandles.Add(&Heap->GetHeapResidencyHandle());
|
|
ResidencyHandles.Append(Heap->GetResultBuffer()->GetResidencyHandles());
|
|
GetResolveCommandList()->AddToResidencySet(ResidencyHandles);
|
|
#endif // ENABLE_RESIDENCY_MANAGEMENT
|
|
}
|
|
|
|
if (Heap->GetD3DQueryHeap())
|
|
{
|
|
// Sort the ranges into ascending order so we can merge adjacent ones,
|
|
// to reduce the number of ResolveQueryData calls we need to make.
|
|
Ranges.Sort();
|
|
|
|
for (int32 Index = 0; Index < Ranges.Num(); )
|
|
{
|
|
FD3D12QueryRange Range = Ranges[Index++];
|
|
|
|
while (Index < Ranges.Num() && Ranges[Index].Start == Range.End)
|
|
{
|
|
// Ranges are contiguous. Extend.
|
|
Range.End = Ranges[Index++].End;
|
|
}
|
|
|
|
GetResolveCommandList()->GraphicsCommandList()->ResolveQueryData(
|
|
Heap->GetD3DQueryHeap(),
|
|
Heap->QueryType,
|
|
Range.Start,
|
|
Range.End - Range.Start,
|
|
Heap->GetResultBuffer()->GetResource(),
|
|
Range.Start * Heap->GetResultSize()
|
|
);
|
|
}
|
|
}
|
|
}
|
|
|
|
if (ResolveCommandList)
|
|
{
|
|
ResolveCommandList->Close();
|
|
PayloadToSubmit->CommandListsToExecute.Add(ResolveCommandList);
|
|
}
|
|
}
|
|
|
|
// Move all the batched objects in this queue into the payload, so they get passed down the pipe.
|
|
PayloadToSubmit->BatchedObjects = MoveTemp(BatchedObjects);
|
|
}
|
|
}
|
|
|
|
if (BarrierAllocator)
|
|
{
|
|
PayloadToSubmit->AllocatorsToRelease.Add(BarrierAllocator);
|
|
BarrierAllocator = nullptr;
|
|
}
|
|
|
|
// Keep the latest fence value in the submitted payload.
|
|
// The interrupt thread uses this to determine when work has completed.
|
|
uint64 NextCompletionValue = Fence.NextCompletionValue;
|
|
|
|
// Set the fence/value pair into any sync points we need to signal.
|
|
for (FD3D12SyncPointRef& SyncPoint : PayloadToSubmit->SyncPointsToSignal)
|
|
{
|
|
check(!SyncPoint->ResolvedFence.IsSet());
|
|
SyncPoint->ResolvedFence.Emplace(Fence, NextCompletionValue);
|
|
}
|
|
|
|
PayloadToSubmit->CompletionFenceValue = NextCompletionValue;
|
|
PayloadToSubmit->bAlwaysSignal |= bRequiresSignal;
|
|
|
|
if (PayloadToSubmit->RequiresQueueFenceSignal())
|
|
{
|
|
++Fence.NextCompletionValue;
|
|
}
|
|
|
|
PayloadsToHandDown.Add(PayloadToSubmit);
|
|
PayloadToSubmit = nullptr;
|
|
|
|
return NextCompletionValue;
|
|
}
|
|
|
|
void FD3D12DynamicRHI::UpdateReservedResources(FD3D12Payload* Payload)
|
|
{
|
|
FD3D12Queue& Queue = Payload->Queue;
|
|
|
|
// On some devices, some queues cannot perform tile remapping operations.
|
|
// We can work around this limitation by running the remapping in lockstep on another queue:
|
|
// - tile mapping queue waits for commands on this queue to finish
|
|
// - tile mapping queue performs the commit/decommit operations
|
|
// - this queue waits for tile mapping queue to finish
|
|
// The extra sync is not required when the current queue is capable of the remapping operations.
|
|
|
|
ID3D12CommandQueue* TileMappingQueue = (Queue.bSupportsTileMapping ? Queue.D3DCommandQueue : Queue.Device->TileMappingQueue).GetReference();
|
|
FD3D12Fence& TileMappingFence = Queue.Device->TileMappingFence;
|
|
|
|
const bool bCrossQueueSyncRequired = TileMappingQueue != Queue.D3DCommandQueue.GetReference();
|
|
|
|
if (bCrossQueueSyncRequired)
|
|
{
|
|
// tile mapping queue waits for commands on this queue to finish
|
|
Queue.D3DCommandQueue->Signal(TileMappingFence.D3DFence, ++TileMappingFence.LastSignaledValue);
|
|
TileMappingQueue->Wait(TileMappingFence.D3DFence, TileMappingFence.LastSignaledValue);
|
|
}
|
|
|
|
for (const FD3D12CommitReservedResourceDesc& CommitDesc : Payload->ReservedResourcesToCommit)
|
|
{
|
|
checkf(CommitDesc.Resource, TEXT("FD3D12CommitReservedResourceDesc::Resource must be set"));
|
|
CommitDesc.Resource->CommitReservedResource(TileMappingQueue, CommitDesc.CommitSizeInBytes);
|
|
}
|
|
|
|
if (bCrossQueueSyncRequired)
|
|
{
|
|
// this queue waits for tile mapping operations to finish
|
|
TileMappingQueue->Signal(TileMappingFence.D3DFence, ++TileMappingFence.LastSignaledValue);
|
|
Queue.D3DCommandQueue->Wait(TileMappingFence.D3DFence, TileMappingFence.LastSignaledValue);
|
|
}
|
|
}
|
|
|
|
void FD3D12DynamicRHI::FlushBatchedPayloads(FD3D12Queue::FPayloadArray& PayloadsToSubmit)
|
|
{
|
|
uint32 FirstPayload = 0, LastPayload = 0;
|
|
|
|
auto Wait = [this](FD3D12Payload* Payload)
|
|
{
|
|
FD3D12Queue& Queue = Payload->Queue;
|
|
|
|
// Wait for queue fences
|
|
for (auto& [LocalFence, Value] : Payload->QueueFencesToWait)
|
|
{
|
|
#if RHI_NEW_GPU_PROFILER
|
|
Payload->EventStream.Emplace<UE::RHI::GPUProfiler::FEvent::FWaitFence>(
|
|
FPlatformTime::Cycles64()
|
|
, Value
|
|
, LocalFence.OwnerQueue->GetProfilerQueue()
|
|
);
|
|
#endif
|
|
|
|
VERIFYD3D12RESULT(Queue.D3DCommandQueue->Wait(LocalFence.D3DFence, Value));
|
|
}
|
|
|
|
// Wait for manual fences
|
|
for (auto& [LocalFence, Value] : Payload->ManualFencesToWait)
|
|
{
|
|
VERIFYD3D12RESULT(Queue.D3DCommandQueue->Wait(LocalFence, Value));
|
|
}
|
|
};
|
|
|
|
auto Flush = [&]()
|
|
{
|
|
if (FirstPayload == LastPayload)
|
|
return;
|
|
|
|
FD3D12Queue& Queue = PayloadsToSubmit[FirstPayload]->Queue;
|
|
|
|
// Build SOA layout needed to call ExecuteCommandLists().
|
|
TArray<FD3D12CommandList*, TInlineAllocator<128>> CommandLists;
|
|
TArray<ID3D12CommandList*, TInlineAllocator<128>> D3DCommandLists;
|
|
#if ENABLE_RESIDENCY_MANAGEMENT
|
|
TArray<FD3D12ResidencySet*, TInlineAllocator<128>> ResidencySets;
|
|
#endif
|
|
|
|
uint64 Time = FPlatformTime::Cycles64();
|
|
|
|
// Accumulate the command lists from the payload
|
|
for (uint32 Index = FirstPayload; Index < LastPayload; ++Index)
|
|
{
|
|
FD3D12Payload* Payload = PayloadsToSubmit[Index];
|
|
check(&Payload->Queue == &Queue);
|
|
|
|
for (FD3D12CommandList* CommandList : Payload->CommandListsToExecute)
|
|
{
|
|
check(CommandList->IsClosed());
|
|
|
|
#if RHI_NEW_GPU_PROFILER
|
|
CommandList->FlushProfilerEvents(Payload->EventStream, Time);
|
|
#endif // RHI_NEW_GPU_PROFILER
|
|
|
|
D3DCommandLists.Add(CommandList->Interfaces.CommandList);
|
|
|
|
#if ENABLE_RESIDENCY_MANAGEMENT
|
|
ResidencySets.Add(CommandList->CloseResidencySet());
|
|
#endif
|
|
}
|
|
CommandLists.Append(MoveTemp(Payload->CommandListsToExecute));
|
|
}
|
|
|
|
const int32 MaxBatchSize = GetMaxExecuteBatchSize(Queue.QueueType);
|
|
const int32 NumCommandLists = D3DCommandLists.Num();
|
|
|
|
for (int32 DispatchNum, Offset = 0; Offset < NumCommandLists; Offset += DispatchNum)
|
|
{
|
|
DispatchNum = FMath::Min(NumCommandLists - Offset, MaxBatchSize);
|
|
|
|
extern int32 GD3D12MaxCommandsPerCommandList;
|
|
if (GD3D12MaxCommandsPerCommandList > 0)
|
|
{
|
|
// Limit the dispatch group based on the total number of commands each command list contains, so that we
|
|
// don't submit more than approx "GD3D12MaxCommandsPerCommandList" commands per call to ExecuteCommandLists().
|
|
int32 Index = 0;
|
|
for (int32 NumCommands = 0; Index < DispatchNum && NumCommands < GD3D12MaxCommandsPerCommandList; ++Index)
|
|
{
|
|
NumCommands += CommandLists[Offset + Index]->State.NumCommands;
|
|
}
|
|
|
|
DispatchNum = Index;
|
|
}
|
|
|
|
INC_DWORD_STAT(STAT_D3D12ExecutedCommandListBatches);
|
|
INC_DWORD_STAT_BY(STAT_D3D12ExecutedCommandLists, DispatchNum);
|
|
|
|
Queue.ExecuteCommandLists(
|
|
MakeArrayView<ID3D12CommandList*>(&D3DCommandLists[Offset], DispatchNum)
|
|
#if ENABLE_RESIDENCY_MANAGEMENT
|
|
, MakeArrayView<FD3D12ResidencySet*>(&ResidencySets[Offset], DispatchNum)
|
|
#endif
|
|
);
|
|
|
|
#if LOG_EXECUTE_COMMAND_LISTS
|
|
LogExecuteCommandLists(DispatchNum, &D3DCommandLists[Offset]);
|
|
#endif
|
|
}
|
|
|
|
// Release the FD3D12CommandList instances back to the parent device object pool.
|
|
for (FD3D12CommandList* CommandList : CommandLists)
|
|
{
|
|
CommandList->Device->ReleaseCommandList(CommandList);
|
|
}
|
|
|
|
FirstPayload = LastPayload;
|
|
};
|
|
|
|
auto Signal = [this](FD3D12Payload* Payload)
|
|
{
|
|
FD3D12Queue& Queue = Payload->Queue;
|
|
|
|
// Signal any manual fences
|
|
for (auto& [ManualFence, Value] : Payload->ManualFencesToSignal)
|
|
{
|
|
VERIFYD3D12RESULT(Queue.D3DCommandQueue->Signal(ManualFence, Value));
|
|
}
|
|
|
|
// Signal the queue fence
|
|
if (Payload->RequiresQueueFenceSignal())
|
|
{
|
|
check(Queue.Fence.LastSignaledValue < Payload->CompletionFenceValue);
|
|
|
|
#if RHI_NEW_GPU_PROFILER
|
|
Payload->EventStream.Emplace<UE::RHI::GPUProfiler::FEvent::FSignalFence>(
|
|
FPlatformTime::Cycles64()
|
|
, Payload->CompletionFenceValue
|
|
);
|
|
#endif
|
|
|
|
VERIFYD3D12RESULT(Queue.D3DCommandQueue->Signal(Queue.Fence.D3DFence, Payload->CompletionFenceValue));
|
|
Queue.Fence.LastSignaledValue.store(Payload->CompletionFenceValue, std::memory_order_release);
|
|
}
|
|
|
|
#if RHI_NEW_GPU_PROFILER
|
|
if (Payload->EndFrameEvent.IsSet())
|
|
{
|
|
Payload->EndFrameEvent->CPUTimestamp = FPlatformTime::Cycles64();
|
|
Payload->EventStream.Emplace<UE::RHI::GPUProfiler::FEvent::FFrameBoundary>(*Payload->EndFrameEvent);
|
|
}
|
|
#endif
|
|
|
|
// Submission of this payload is completed. Signal the submission event if one was provided.
|
|
if (Payload->SubmissionEvent)
|
|
{
|
|
Payload->SubmissionEvent->DispatchSubsequents();
|
|
}
|
|
};
|
|
|
|
FD3D12Queue* PrevQueue = nullptr;
|
|
for (FD3D12Payload* Payload : PayloadsToSubmit)
|
|
{
|
|
if (PrevQueue != &Payload->Queue)
|
|
{
|
|
Flush();
|
|
PrevQueue = &Payload->Queue;
|
|
}
|
|
|
|
Payload->Queue.PendingInterrupt.Enqueue(Payload);
|
|
|
|
#if RHI_NEW_GPU_PROFILER
|
|
if (Payload->Timing.IsSet())
|
|
{
|
|
Flush();
|
|
|
|
if (FD3D12Timing* LocalTiming = *Payload->Timing)
|
|
{
|
|
// Calibrate the GPU timestamp / clock, if the queue type supports calibration.
|
|
if (Payload->Queue.QueueType != ED3D12QueueType::Copy || Payload->Queue.Device->GetParentAdapter()->AreCopyQueueTimestampQueriesSupported())
|
|
{
|
|
SCOPED_NAMED_EVENT(CalibrateClocks, FColor::Red);
|
|
VERIFYD3D12RESULT(Payload->Queue.D3DCommandQueue->GetClockCalibration(&LocalTiming->GPUTimestamp, &LocalTiming->CPUTimestamp));
|
|
VERIFYD3D12RESULT(Payload->Queue.D3DCommandQueue->GetTimestampFrequency(&LocalTiming->GPUFrequency));
|
|
QueryPerformanceFrequency(reinterpret_cast<LARGE_INTEGER*>(&LocalTiming->CPUFrequency));
|
|
}
|
|
}
|
|
}
|
|
#endif // RHI_NEW_GPU_PROFILER
|
|
|
|
if (Payload->HasWaitWork())
|
|
{
|
|
Flush();
|
|
Wait(Payload);
|
|
}
|
|
|
|
if (Payload->HasUpdateReservedResourcesWork())
|
|
{
|
|
Flush();
|
|
UpdateReservedResources(Payload);
|
|
}
|
|
|
|
if (Payload->HasPreExecuteWork())
|
|
{
|
|
Flush();
|
|
Payload->PreExecute();
|
|
}
|
|
|
|
LastPayload++;
|
|
|
|
if (Payload->HasSignalWork())
|
|
{
|
|
Flush();
|
|
Signal(Payload);
|
|
}
|
|
}
|
|
|
|
Flush();
|
|
|
|
for (FD3D12Payload* Payload : PayloadsToSubmit)
|
|
{
|
|
// Only set this bool to true once we'll never touch the payload again on this thread.
|
|
// This is because the bool hands ownership to the interrupt thread, which might delete the payload.
|
|
Payload->bSubmitted = true;
|
|
}
|
|
|
|
PayloadsToSubmit.Reset();
|
|
}
|
|
|
|
void FD3D12PayloadBase::AddQueueFenceWait(FD3D12Fence& InFence, uint64 InValue)
|
|
{
|
|
for (auto& [Fence, Value] : QueueFencesToWait)
|
|
{
|
|
if (&Fence == &InFence)
|
|
{
|
|
Value = FMath::Max(Value, InValue);
|
|
return;
|
|
}
|
|
}
|
|
|
|
QueueFencesToWait.Add({ InFence, InValue });
|
|
}
|
|
|
|
UE_TRACE_EVENT_BEGIN(Cpu, SyncPoint_Wait, NoSync)
|
|
UE_TRACE_EVENT_FIELD(UE::Trace::WideString, WaitingForName )
|
|
UE_TRACE_EVENT_END()
|
|
|
|
void FD3D12SyncPoint::Wait() const
|
|
{
|
|
checkf(GraphEvent, TEXT("This sync point was not created with a CPU event. Cannot wait for completion on the CPU."));
|
|
|
|
if (!GraphEvent->IsComplete())
|
|
{
|
|
// Block the calling thread until the graph event is signaled by the interrupt thread.
|
|
#if CPUPROFILERTRACE_ENABLED
|
|
UE_TRACE_LOG_SCOPED_T(Cpu, SyncPoint_Wait, CpuChannel)
|
|
<< SyncPoint_Wait.WaitingForName(GetDebugName());
|
|
#endif // CPUPROFILERTRACE_ENABLED
|
|
|
|
FD3D12DynamicRHI::GetD3DRHI()->ProcessInterruptQueueUntil(GraphEvent);
|
|
}
|
|
|
|
check(GraphEvent->IsComplete());
|
|
}
|
|
|
|
void FD3D12DynamicRHI::ProcessInterruptQueueUntil(FGraphEvent* GraphEvent)
|
|
{
|
|
if (InterruptThread)
|
|
{
|
|
if (GraphEvent && !GraphEvent->IsComplete())
|
|
{
|
|
GraphEvent->Wait();
|
|
}
|
|
}
|
|
else
|
|
{
|
|
// Use the current thread to process the interrupt queue until the sync point we're waiting for is signaled.
|
|
// If GraphEvent is nullptr, process the queue until no further progress is made (assuming we can acquire the lock), then return.
|
|
if (!GraphEvent || !GraphEvent->IsComplete())
|
|
{
|
|
// If we're waiting for a sync point, accumulate the idle time
|
|
UE::Stats::FThreadIdleStats::FScopeIdle IdleScope(/* bIgnore = */GraphEvent == nullptr);
|
|
|
|
Retry:
|
|
if (InterruptCS.TryLock())
|
|
{
|
|
TGuardValue<uint32> Guard(InterruptThreadID, FPlatformTLS::GetCurrentThreadId());
|
|
|
|
FProcessResult Result;
|
|
do { Result = ProcessInterruptQueue(); }
|
|
// If we have a sync point, keep processing until the sync point is signaled.
|
|
// Otherwise, process until no more progress is being made.
|
|
while (GraphEvent
|
|
? !GraphEvent->IsComplete()
|
|
: EnumHasAllFlags(Result.Status, EQueueStatus::Processed)
|
|
);
|
|
|
|
InterruptCS.Unlock();
|
|
}
|
|
else if (GraphEvent && !GraphEvent->IsComplete())
|
|
{
|
|
// Failed to get the lock. Another thread is processing the interrupt queue. Try again...
|
|
FPlatformProcess::SleepNoStats(0);
|
|
goto Retry;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
void FD3D12DynamicRHI::ProcessInterruptQueueOnGPUCrash()
|
|
{
|
|
// This function will not return.
|
|
|
|
// We know this function was called due to a GPU crash, so let the thread know.
|
|
GGPUCrashDetected.store(true, std::memory_order_release);
|
|
|
|
if (InterruptThread)
|
|
{
|
|
// Since we have an interrupt thread, allow it to process the GPU crash.
|
|
// This is necessary so it can retrieve all the active payloads for resolving breadcrumbs.
|
|
InterruptThread->Kick();
|
|
|
|
// Wait for the interrupt thread to exit (which will never happen).
|
|
InterruptThread->Join();
|
|
}
|
|
else
|
|
{
|
|
// If we have no interrupt thread, assume ownership on the current thread
|
|
// (or block forever on the scope lock if multiple threads enter this function).
|
|
FScopeLock Lock(&InterruptCS);
|
|
TGuardValue<uint32> Guard(InterruptThreadID, FPlatformTLS::GetCurrentThreadId());
|
|
|
|
while (true)
|
|
{
|
|
ProcessInterruptQueue();
|
|
}
|
|
}
|
|
}
|
|
|
|
bool FD3D12DynamicRHI::IsInInterruptThread() const
|
|
{
|
|
uint32 ThisThreadID = FPlatformTLS::GetCurrentThreadId();
|
|
|
|
// If we don't have a dedicated interrupt thread, the thread currently acting
|
|
// as the interrupt thread is tracked via the InterruptThreadID field.
|
|
|
|
if (InterruptThread)
|
|
{
|
|
return ThisThreadID == InterruptThread->GetThreadID();
|
|
}
|
|
else
|
|
{
|
|
return ThisThreadID == InterruptThreadID;
|
|
}
|
|
}
|
|
|
|
D3D12_QUERY_DATA_PIPELINE_STATISTICS& operator += (D3D12_QUERY_DATA_PIPELINE_STATISTICS& LHS, D3D12_QUERY_DATA_PIPELINE_STATISTICS const& RHS)
|
|
{
|
|
LHS.IAVertices += RHS.IAVertices;
|
|
LHS.IAPrimitives += RHS.IAPrimitives;
|
|
LHS.VSInvocations += RHS.VSInvocations;
|
|
LHS.GSInvocations += RHS.GSInvocations;
|
|
LHS.GSPrimitives += RHS.GSPrimitives;
|
|
LHS.CInvocations += RHS.CInvocations;
|
|
LHS.CPrimitives += RHS.CPrimitives;
|
|
LHS.PSInvocations += RHS.PSInvocations;
|
|
LHS.HSInvocations += RHS.HSInvocations;
|
|
LHS.DSInvocations += RHS.DSInvocations;
|
|
LHS.CSInvocations += RHS.CSInvocations;
|
|
return LHS;
|
|
}
|
|
|
|
FD3D12DynamicRHI::FProcessResult FD3D12DynamicRHI::ProcessInterruptQueue()
|
|
{
|
|
SCOPED_NAMED_EVENT_TEXT("InterruptQueue_Process", FColor::Yellow);
|
|
LLM_SCOPE_BYNAME(TEXT("RHIMisc/ProcessInterruptQueue"));
|
|
|
|
// Timer that clamps each tick to prevent false positive GPU timeouts
|
|
// when a debugger is attached and the process is broken.
|
|
struct FTimer
|
|
{
|
|
uint64 Elapsed;
|
|
uint64 Last;
|
|
|
|
FTimer()
|
|
: Elapsed(0)
|
|
, Last(FPlatformTime::Cycles64())
|
|
{}
|
|
|
|
void Tick()
|
|
{
|
|
static const uint64 MaxDeltaCycles = uint64(1.0 / FPlatformTime::GetSecondsPerCycle64()); // 1 second
|
|
uint64 Current = FPlatformTime::Cycles64();
|
|
Elapsed += FMath::Min(MaxDeltaCycles, Current - Last);
|
|
Last = Current;
|
|
}
|
|
} static Timer;
|
|
|
|
Timer.Tick();
|
|
|
|
auto CheckForDeviceRemoved = [this](FD3D12Queue& Queue)
|
|
{
|
|
// If we get an error code here, we can't pass it directly to VERIFYD3D12RESULT, because that expects DXGI_ERROR_DEVICE_REMOVED,
|
|
// DXGI_ERROR_DEVICE_RESET etc. and wants to obtain the reason code itself by calling GetDeviceRemovedReason (again).
|
|
HRESULT DeviceRemovedReason = Queue.Device->GetDevice()->GetDeviceRemovedReason();
|
|
if (DeviceRemovedReason != S_OK)
|
|
{
|
|
TerminateOnGPUCrash();
|
|
}
|
|
};
|
|
|
|
FProcessResult Result;
|
|
ForEachQueue([&](FD3D12Queue& CurrentQueue)
|
|
{
|
|
while (FD3D12Payload** PayloadPtr = CurrentQueue.PendingInterrupt.Peek())
|
|
{
|
|
FD3D12Payload* Payload = *PayloadPtr;
|
|
|
|
if (!Payload->bSubmitted)
|
|
break;
|
|
|
|
// Check for GPU completion
|
|
uint64 CompletedFenceValue = CurrentQueue.Fence.D3DFence->GetCompletedValue();
|
|
uint64 LastSignaledFenceValue = CurrentQueue.Fence.LastSignaledValue.load(std::memory_order_acquire);
|
|
|
|
// If the GPU crashes or hangs, the driver will signal all fences to UINT64_MAX.
|
|
if (CompletedFenceValue == UINT64_MAX)
|
|
{
|
|
CheckForDeviceRemoved(CurrentQueue);
|
|
}
|
|
|
|
if (CompletedFenceValue < Payload->CompletionFenceValue)
|
|
{
|
|
// Command list batch has not yet completed on this queue.
|
|
// Ask the driver to wake this thread again when the required value is reached.
|
|
if (InterruptThread && !CurrentQueue.Fence.bInterruptAwaited)
|
|
{
|
|
SCOPED_NAMED_EVENT_TEXT("SetEventOnCompletion", FColor::Red);
|
|
VERIFYD3D12RESULT(CurrentQueue.Fence.D3DFence->SetEventOnCompletion(Payload->CompletionFenceValue, InterruptThread->Event));
|
|
CurrentQueue.Fence.bInterruptAwaited = true;
|
|
}
|
|
|
|
// Skip processing this queue and move on to the next.
|
|
Result.Status |= EQueueStatus::Pending;
|
|
|
|
// Detect a hung GPU
|
|
if (!Payload->SubmissionTime.IsSet() && LastSignaledFenceValue >= Payload->CompletionFenceValue)
|
|
{
|
|
//
|
|
// Keep track of the first time we've checked for completion on the interrupt thread.
|
|
// We set this here to avoid false positives when a debugger is attached. If we'd set this on the submission thread, it
|
|
// is possible for the title to be paused by the debugger after the time is set but before the payload has reached the GPU.
|
|
//
|
|
Payload->SubmissionTime = Timer.Elapsed;
|
|
}
|
|
|
|
if (Payload->SubmissionTime.IsSet() && Payload->SubmissionTime != TNumericLimits<uint64>::Max())
|
|
{
|
|
static const double CyclesPerSecond = 1.0 / FPlatformTime::GetSecondsPerCycle64();
|
|
const uint64 TimeoutCycles = FMath::TruncToInt64(GD3D12SubmissionTimeout * CyclesPerSecond);
|
|
|
|
uint64 ElapsedCycles = Timer.Elapsed - Payload->SubmissionTime.GetValue();
|
|
|
|
if (ElapsedCycles > TimeoutCycles)
|
|
{
|
|
// The last submission on this pipe did not complete within the timeout period. Assume the GPU has hung.
|
|
HandleGpuTimeout(Payload, ElapsedCycles * FPlatformTime::GetSecondsPerCycle64());
|
|
|
|
// Set to int max to indicate we've already reported the timeout for this payload.
|
|
Payload->SubmissionTime = TNumericLimits<uint64>::Max();
|
|
}
|
|
else
|
|
{
|
|
// Adjust the event wait timeout to cause the interrupt thread to wake automatically when
|
|
// the timeout for this payload is reached, assuming it hasn't been woken by the GPU already.
|
|
uint64 RemainingCycles = TimeoutCycles - ElapsedCycles;
|
|
uint32 RemainingMilliseconds = FMath::TruncToInt(RemainingCycles * FPlatformTime::GetSecondsPerCycle64() * 1000.0);
|
|
Result.WaitTimeout = FMath::Min(Result.WaitTimeout, RemainingMilliseconds);
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
|
|
// At this point, the current command list has completed on the GPU.
|
|
CurrentQueue.Fence.bInterruptAwaited = false;
|
|
CurrentQueue.PendingInterrupt.Dequeue();
|
|
Result.Status |= EQueueStatus::Processed;
|
|
|
|
// Resolve query results
|
|
{
|
|
for (FD3D12QueryLocation& Query : Payload->BatchedObjects.OcclusionQueries)
|
|
{
|
|
check(Query.Target);
|
|
Query.CopyResultTo(Query.Target);
|
|
}
|
|
|
|
for (FD3D12QueryLocation& Query : Payload->BatchedObjects.PipelineStatsQueries)
|
|
{
|
|
if (Query.Target)
|
|
{
|
|
Query.CopyResultTo(Query.Target);
|
|
}
|
|
else
|
|
{
|
|
// Pipeline stats queries without targets are the ones that surround whole command lists.
|
|
CurrentQueue.Timing->PipelineStats += Query.GetResult<D3D12_QUERY_DATA_PIPELINE_STATISTICS>();
|
|
}
|
|
}
|
|
|
|
if (Payload->BatchedObjects.TimestampQueries.Num())
|
|
{
|
|
// Some timestamp queries report in microseconds
|
|
const double MicrosecondsScale = 1000000.0 / CurrentQueue.Device->GetTimestampFrequency(CurrentQueue.QueueType);
|
|
|
|
for (FD3D12QueryLocation& Query : Payload->BatchedObjects.TimestampQueries)
|
|
{
|
|
if (Query.Target)
|
|
{
|
|
Query.CopyResultTo(Query.Target);
|
|
}
|
|
|
|
switch (Query.Type)
|
|
{
|
|
case ED3D12QueryType::TimestampMicroseconds:
|
|
case ED3D12QueryType::TimestampRaw:
|
|
check(Query.Target);
|
|
if (Query.Type == ED3D12QueryType::TimestampMicroseconds)
|
|
{
|
|
// Convert to microseconds
|
|
*static_cast<uint64*>(Query.Target) = FPlatformMath::TruncToInt(double(*static_cast<uint64*>(Query.Target)) * MicrosecondsScale);
|
|
}
|
|
break;
|
|
|
|
#if RHI_NEW_GPU_PROFILER
|
|
case ED3D12QueryType::ProfilerTimestampTOP:
|
|
case ED3D12QueryType::ProfilerTimestampBOP:
|
|
{
|
|
// Convert from GPU timestamp to CPU timestamp (relative to FPlatformTime::Cycles64())
|
|
uint64& Target = *static_cast<uint64*>(Query.Target);
|
|
|
|
uint64 GPUDelta = Target - CurrentQueue.Timing->GPUTimestamp;
|
|
uint64 CPUDelta = (GPUDelta * CurrentQueue.Timing->CPUFrequency) / CurrentQueue.Timing->GPUFrequency;
|
|
|
|
Target = CPUDelta + CurrentQueue.Timing->CPUTimestamp;
|
|
}
|
|
break;
|
|
#else
|
|
case ED3D12QueryType::CommandListBegin:
|
|
case ED3D12QueryType::CommandListEnd:
|
|
case ED3D12QueryType::IdleBegin:
|
|
case ED3D12QueryType::IdleEnd:
|
|
check(CurrentQueue.Timing);
|
|
CurrentQueue.Timing->Timestamps.Add(Query.GetResult<uint64>());
|
|
break;
|
|
#endif
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
#if RHI_NEW_GPU_PROFILER
|
|
if (!Payload->EventStream.IsEmpty())
|
|
{
|
|
check(CurrentQueue.Timing);
|
|
CurrentQueue.Timing->EventStream.Append(MoveTemp(Payload->EventStream));
|
|
}
|
|
#endif
|
|
|
|
if (Payload->Timing.IsSet())
|
|
{
|
|
// Switch the new timing struct into the queue. This redirects timestamp results to separate each frame's work.
|
|
CurrentQueue.Timing = Payload->Timing.GetValue();
|
|
}
|
|
|
|
// Signal the CPU events of all sync points associated with this batch.
|
|
for (FD3D12SyncPointRef& SyncPoint : Payload->SyncPointsToSignal)
|
|
{
|
|
if (SyncPoint->GraphEvent)
|
|
{
|
|
SyncPoint->GraphEvent->DispatchSubsequents();
|
|
}
|
|
}
|
|
|
|
// We're done with this payload now.
|
|
|
|
// GPU resources the payload is holding a reference to will be cleaned up here.
|
|
// E.g. command list allocators, which get recycled on the parent device.
|
|
delete Payload;
|
|
}
|
|
|
|
CheckForDeviceRemoved(CurrentQueue);
|
|
});
|
|
|
|
if (GGPUCrashDetected.load(std::memory_order_relaxed))
|
|
{
|
|
// If this was set by ProcessInterruptQueueOnGPUCrash, we know a crash was detected, so process it immediately. We can't always rely on
|
|
// queue processing to catch it, as GetDeviceRemovedReason sometimes returns S_OK despite an earlier API call having reported a lost device.
|
|
TerminateOnGPUCrash();
|
|
}
|
|
|
|
return Result;
|
|
}
|
|
|
|
FD3D12PayloadBase::FD3D12PayloadBase(FD3D12Queue& Queue)
|
|
: Queue(Queue)
|
|
#if RHI_NEW_GPU_PROFILER
|
|
, EventStream(Queue.GetProfilerQueue())
|
|
#endif
|
|
{}
|
|
|
|
FD3D12PayloadBase::~FD3D12PayloadBase()
|
|
{
|
|
for (FD3D12CommandAllocator* Allocator : AllocatorsToRelease)
|
|
{
|
|
Queue.Device->ReleaseCommandAllocator(Allocator);
|
|
}
|
|
}
|
|
|
|
void FD3D12PayloadBase::PreExecute()
|
|
{
|
|
if (PreExecuteCallback)
|
|
{
|
|
PreExecuteCallback(Queue.D3DCommandQueue);
|
|
}
|
|
}
|
|
|
|
#ifndef D3D12_PREFER_QUERIES_FOR_GPU_TIME
|
|
#define D3D12_PREFER_QUERIES_FOR_GPU_TIME 0
|
|
#endif
|
|
|
|
static TAutoConsoleVariable<int32> CVarGPUTimeFromTimestamps(
|
|
TEXT("r.D3D12.GPUTimeFromTimestamps"),
|
|
D3D12_PREFER_QUERIES_FOR_GPU_TIME,
|
|
TEXT("Prefer timestamps instead of GetHardwareGPUFrameTime to compute GPU frame time"),
|
|
ECVF_RenderThreadSafe);
|
|
|
|
void FD3D12DynamicRHI::ProcessTimestamps(FD3D12TimingArray const& TimingPerQueue)
|
|
{
|
|
#if RHI_NEW_GPU_PROFILER
|
|
{
|
|
TArray<UE::RHI::GPUProfiler::FEventStream, TInlineAllocator<GD3D12MaxNumQueues>> Streams;
|
|
for (auto const& Timing : TimingPerQueue)
|
|
{
|
|
Streams.Add(MoveTemp(Timing->EventStream));
|
|
}
|
|
UE::RHI::GPUProfiler::ProcessEvents(Streams);
|
|
}
|
|
#else
|
|
|
|
// The total number of cycles where at least one GPU pipe was busy during the frame.
|
|
uint64 UnionBusyCycles = 0;
|
|
int32 BusyPipes = 0;
|
|
|
|
uint64 LastMinCycles = 0;
|
|
bool bFirst = true;
|
|
|
|
// Process the time ranges from each pipe.
|
|
while (true)
|
|
{
|
|
// Find the next minimum timestamp
|
|
FD3D12Timing* NextMin = nullptr;
|
|
for (auto const& Current : TimingPerQueue)
|
|
{
|
|
if (Current->HasMoreTimestamps() && (!NextMin || Current->GetCurrentTimestamp() < NextMin->GetCurrentTimestamp()))
|
|
{
|
|
NextMin = Current.Get();
|
|
}
|
|
}
|
|
|
|
if (!NextMin)
|
|
break; // No more timestamps to process
|
|
|
|
if (!bFirst)
|
|
{
|
|
if (BusyPipes > 0 && NextMin->GetCurrentTimestamp() > LastMinCycles)
|
|
{
|
|
// Accumulate the union busy time across all pipes
|
|
UnionBusyCycles += NextMin->GetCurrentTimestamp() - LastMinCycles;
|
|
}
|
|
|
|
if (!NextMin->IsStartingWork())
|
|
{
|
|
// Accumulate the busy time for this pipe specifically.
|
|
NextMin->BusyCycles += NextMin->GetCurrentTimestamp() - NextMin->GetPreviousTimestamp();
|
|
}
|
|
}
|
|
|
|
LastMinCycles = NextMin->GetCurrentTimestamp();
|
|
|
|
BusyPipes += NextMin->IsStartingWork() ? 1 : -1;
|
|
check(BusyPipes >= 0);
|
|
|
|
NextMin->AdvanceTimestamp();
|
|
bFirst = false;
|
|
}
|
|
|
|
check(BusyPipes == 0);
|
|
|
|
#endif
|
|
|
|
D3D12_QUERY_DATA_PIPELINE_STATISTICS PipelineStats{};
|
|
for (auto const& Current : TimingPerQueue)
|
|
{
|
|
PipelineStats += Current->PipelineStats;
|
|
}
|
|
|
|
SET_DWORD_STAT(STAT_D3D12RHI_IAVertices , PipelineStats.IAVertices );
|
|
SET_DWORD_STAT(STAT_D3D12RHI_IAPrimitives , PipelineStats.IAPrimitives );
|
|
SET_DWORD_STAT(STAT_D3D12RHI_VSInvocations, PipelineStats.VSInvocations);
|
|
SET_DWORD_STAT(STAT_D3D12RHI_GSInvocations, PipelineStats.GSInvocations);
|
|
SET_DWORD_STAT(STAT_D3D12RHI_GSPrimitives , PipelineStats.GSPrimitives );
|
|
SET_DWORD_STAT(STAT_D3D12RHI_CInvocations , PipelineStats.CInvocations );
|
|
SET_DWORD_STAT(STAT_D3D12RHI_CPrimitives , PipelineStats.CPrimitives );
|
|
SET_DWORD_STAT(STAT_D3D12RHI_PSInvocations, PipelineStats.PSInvocations);
|
|
SET_DWORD_STAT(STAT_D3D12RHI_HSInvocations, PipelineStats.HSInvocations);
|
|
SET_DWORD_STAT(STAT_D3D12RHI_DSInvocations, PipelineStats.DSInvocations);
|
|
SET_DWORD_STAT(STAT_D3D12RHI_CSInvocations, PipelineStats.CSInvocations);
|
|
|
|
#if RHI_NEW_GPU_PROFILER == 0
|
|
|
|
// @todo mgpu - how to handle multiple devices / queues with potentially different timestamp frequencies?
|
|
FD3D12Device* Device = GetAdapter().GetDevice(0);
|
|
double Frequency = Device->GetTimestampFrequency(ED3D12QueueType::Direct);
|
|
|
|
const double Scale64 = 1.0 / (Frequency * FPlatformTime::GetSecondsPerCycle64());
|
|
|
|
// Update the global GPU frame time stats
|
|
SET_CYCLE_COUNTER(STAT_RHI_GPUTotalTime, FPlatformMath::TruncToInt(double(UnionBusyCycles) * Scale64));
|
|
|
|
double HardwareGPUTime = 0.0;
|
|
if (GetHardwareGPUFrameTime(HardwareGPUTime) && CVarGPUTimeFromTimestamps.GetValueOnAnyThread() == 0)
|
|
{
|
|
SET_CYCLE_COUNTER(STAT_RHI_GPUTotalTimeHW, HardwareGPUTime);
|
|
GRHIGPUFrameTimeHistory.PushFrameCycles(1.0 / FPlatformTime::GetSecondsPerCycle64(), HardwareGPUTime);
|
|
}
|
|
else
|
|
{
|
|
SET_CYCLE_COUNTER(STAT_RHI_GPUTotalTimeHW, 0);
|
|
GRHIGPUFrameTimeHistory.PushFrameCycles(Frequency, UnionBusyCycles);
|
|
}
|
|
|
|
for (auto const& Current : TimingPerQueue)
|
|
{
|
|
switch (Current->Queue.QueueType)
|
|
{
|
|
case ED3D12QueueType::Direct: SET_CYCLE_COUNTER(STAT_RHI_GPUTotalTimeGraphics , FPlatformMath::TruncToInt(double(Current->BusyCycles) * Scale64)); break;
|
|
case ED3D12QueueType::Async : SET_CYCLE_COUNTER(STAT_RHI_GPUTotalTimeAsyncCompute, FPlatformMath::TruncToInt(double(Current->BusyCycles) * Scale64)); break;
|
|
case ED3D12QueueType::Copy : SET_CYCLE_COUNTER(STAT_RHI_GPUTotalTimeCopy , FPlatformMath::TruncToInt(double(Current->BusyCycles) * Scale64)); break;
|
|
}
|
|
}
|
|
|
|
#endif
|
|
}
|