// Copyright Epic Games, Inc. All Rights Reserved. #include "RayTracingInstanceBufferUtil.h" #include "Lumen/Lumen.h" #include "RayTracingDefinitions.h" #include "GPUScene.h" #include "RenderGraphBuilder.h" #include "ShaderParameterUtils.h" #include "RendererInterface.h" #include "RenderCore.h" #include "ShaderParameterStruct.h" #include "GlobalShader.h" #include "PipelineStateCache.h" #include "DataDrivenShaderPlatformInfo.h" #include "ShaderCompilerCore.h" #include "SceneRendering.h" #include "Async/ParallelFor.h" #include "Experimental/Containers/SherwoodHashTable.h" #if RHI_RAYTRACING PRAGMA_DISABLE_DEPRECATION_WARNINGS /* * * Each FRayTracingGeometryInstance can translate to multiple native TLAS instances (see FRayTracingGeometryInstance::NumTransforms). * * The FRayTracingGeometryInstance array (ie: FRayTracingScene::Instances) used to create FRayTracingSceneRHI * can have mix of instances using GPUScene or CPU transforms. * In order to reduce the number of dispatches to build the native RayTracing Instance Buffer, * the upload buffer containing FRayTracingInstanceDescriptor is split in 2 sections, [GPUSceneInstances] [CPUInstances]. * This way native GPUScene and CPU instance descriptors can be built in a single dispatch per type. * * If the ray tracing scene contains multiple layers, the instance buffer is divided into multiple subranges as expected by the RHI. * */ static TAutoConsoleVariable CVarRayTracingInstanceBufferRLE( TEXT("r.RayTracing.InstanceBuffer.RLE"), true, TEXT("Whether to use RLE to build ray tracing instance buffer."), ECVF_RenderThreadSafe | ECVF_Scalability ); static TAutoConsoleVariable CVarRayTracingInstanceBufferParallelFill( TEXT("r.RayTracing.InstanceBuffer.ParallelFill"), true, TEXT("Whether to fill the ray tracing instance upload buffer using parallel loops. 0=disabled, 1=enabled (default)"), ECVF_RenderThreadSafe); static TAutoConsoleVariable CVarRayTracingInstanceBufferLateGetGPUSceneParameters( TEXT("r.RayTracing.InstanceBuffer.LateGetGPUSceneParameters"), false, TEXT(""), ECVF_RenderThreadSafe); struct FRayTracingInstanceGroup { uint32 BaseInstanceIndex : 30; uint32 bIncrementUserDataPerInstance : 1; uint32 bReuseInstance : 1; }; static_assert(sizeof(FRayTracingInstanceGroup) == sizeof(uint32), "FRayTracingInstanceGroup is expected be same size as uint32."); static const uint32 GRayTracingInstanceGroupSize = 64; // Helper structure to assign instances to FRayTracingInstanceGroup depending on whether the primitive is compatible with RLE // TODO: Investigate better schemes to maximize RLE usage // The current implementation fills incomplete "head" group (before generating RLE groups) and might also generate a "tail" group (neither of which can use RLE since they contain instances from different primitives) // which means in practice only ISMs with >128 instances benefit from RLE unless they happen to end up at group boundaries. // An alternative approach is to allow incomplete groups instead of packing so aggressively to maximize the number of groups using RLE, // although that can lead to a lot of inactive threads depending on specific heuristics. // Primitives could also be sorted by number of instances to reduce fragmentation, etc. struct FGroupHelper { uint32 CurrentGroupIndex = 0; uint32 CurrentIndexInGroup = 0; uint32 NumInstanceDescriptors = 0; uint32 OptimalNumInstanceDescriptors = 0; void AddInstances(uint32 NumInstances, bool bRLECompatible) { if (bRLECompatible) { uint32 NumInstancesRemaining = NumInstances; if (CurrentIndexInGroup != 0) { // first N instances are used to fill the current (partial) group const uint32 N = FMath::Min(GRayTracingInstanceGroupSize - CurrentIndexInGroup, NumInstancesRemaining); NumInstancesRemaining -= N; CurrentIndexInGroup += N; CurrentGroupIndex += CurrentIndexInGroup / GRayTracingInstanceGroupSize; CurrentIndexInGroup %= GRayTracingInstanceGroupSize; NumInstanceDescriptors += N; } if (NumInstancesRemaining > 0) { check(CurrentIndexInGroup == 0); // remaining instances go into packed groups + tail group CurrentIndexInGroup += NumInstancesRemaining; CurrentGroupIndex += CurrentIndexInGroup / GRayTracingInstanceGroupSize; CurrentIndexInGroup %= GRayTracingInstanceGroupSize; const uint32 NumPackedGroups = NumInstancesRemaining / GRayTracingInstanceGroupSize; NumInstanceDescriptors += NumPackedGroups; NumInstanceDescriptors += CurrentIndexInGroup; } OptimalNumInstanceDescriptors += FMath::DivideAndRoundUp(NumInstances, GRayTracingInstanceGroupSize); } else { NumInstanceDescriptors += NumInstances; OptimalNumInstanceDescriptors += NumInstances; CurrentIndexInGroup += NumInstances; CurrentGroupIndex += CurrentIndexInGroup / GRayTracingInstanceGroupSize; CurrentIndexInGroup %= GRayTracingInstanceGroupSize; } } }; FRayTracingSceneInitializationData BuildRayTracingSceneInitializationData(TConstArrayView Instances, const TBitArray<>& VisibleInstances) { const bool bRLEAllowed = CVarRayTracingInstanceBufferRLE.GetValueOnRenderThread(); const uint32 NumSceneInstances = Instances.Num(); FRayTracingSceneInitializationData Output; Output.NumNativeGPUSceneInstances = 0; Output.NumNativeCPUInstances = 0; Output.InstanceGeometryIndices.SetNumUninitialized(NumSceneInstances); Output.BaseUploadBufferOffsets.SetNumUninitialized(NumSceneInstances); Output.BaseInstancePrefixSum.SetNumUninitialized(NumSceneInstances); Output.InstanceGroupEntryRefs.SetNumUninitialized(NumSceneInstances); TArray InstanceGroups; Experimental::TSherwoodMap UniqueGeometries; uint32 NumNativeInstances = 0; FGroupHelper GPUGroupHelper; FGroupHelper CPUGroupHelper; for (TConstSetBitIterator BitIt(VisibleInstances); BitIt; ++BitIt) { const uint32 InstanceIndex = BitIt.GetIndex(); const FRayTracingGeometryInstance& InstanceDesc = Instances[InstanceIndex]; const bool bGpuSceneInstance = InstanceDesc.BaseInstanceSceneDataOffset != -1 || !InstanceDesc.InstanceSceneDataOffsets.IsEmpty(); const bool bCpuInstance = !bGpuSceneInstance; checkf(!bGpuSceneInstance || InstanceDesc.BaseInstanceSceneDataOffset != -1 || InstanceDesc.NumTransforms <= uint32(InstanceDesc.InstanceSceneDataOffsets.Num()), TEXT("Expected at least %d ray tracing geometry instance scene data offsets, but got %d."), InstanceDesc.NumTransforms, InstanceDesc.InstanceSceneDataOffsets.Num()); checkf(!bCpuInstance || InstanceDesc.NumTransforms <= uint32(InstanceDesc.Transforms.Num()), TEXT("Expected at least %d ray tracing geometry instance transforms, but got %d."), InstanceDesc.NumTransforms, InstanceDesc.Transforms.Num()); checkf(InstanceDesc.GeometryRHI, TEXT("Ray tracing instance must have a valid geometry.")); uint32 GeometryIndex = UniqueGeometries.FindOrAdd(InstanceDesc.GeometryRHI, Output.ReferencedGeometries.Num()); Output.InstanceGeometryIndices[InstanceIndex] = GeometryIndex; if (GeometryIndex == Output.ReferencedGeometries.Num()) { Output.ReferencedGeometries.Add(InstanceDesc.GeometryRHI); } if (bGpuSceneInstance) { check(InstanceDesc.Transforms.IsEmpty()); Output.BaseUploadBufferOffsets[InstanceIndex] = GPUGroupHelper.NumInstanceDescriptors; Output.NumNativeGPUSceneInstances += InstanceDesc.NumTransforms; } else if (bCpuInstance) { Output.BaseUploadBufferOffsets[InstanceIndex] = CPUGroupHelper.NumInstanceDescriptors; Output.NumNativeCPUInstances += InstanceDesc.NumTransforms; } else { checkNoEntry(); } Output.BaseInstancePrefixSum[InstanceIndex] = NumNativeInstances; NumNativeInstances += InstanceDesc.NumTransforms; const bool bUseUniqueUserData = InstanceDesc.UserData.Num() != 0; if (bGpuSceneInstance) { Output.InstanceGroupEntryRefs[InstanceIndex].GroupIndex = GPUGroupHelper.CurrentGroupIndex; Output.InstanceGroupEntryRefs[InstanceIndex].BaseIndexInGroup = GPUGroupHelper.CurrentIndexInGroup; const bool bRLECompatible = bRLEAllowed && (InstanceDesc.BaseInstanceSceneDataOffset != -1) && !bUseUniqueUserData; GPUGroupHelper.AddInstances(InstanceDesc.NumTransforms, bRLECompatible); } else { Output.InstanceGroupEntryRefs[InstanceIndex].GroupIndex = CPUGroupHelper.CurrentGroupIndex; Output.InstanceGroupEntryRefs[InstanceIndex].BaseIndexInGroup = CPUGroupHelper.CurrentIndexInGroup; const bool bRLECompatible = bRLEAllowed && !bUseUniqueUserData; CPUGroupHelper.AddInstances(InstanceDesc.NumTransforms, bRLECompatible); } } Output.NumGPUInstanceGroups = GPUGroupHelper.CurrentGroupIndex + (GPUGroupHelper.CurrentIndexInGroup > 0 ? 1 : 0); Output.NumCPUInstanceGroups = CPUGroupHelper.CurrentGroupIndex + (CPUGroupHelper.CurrentIndexInGroup > 0 ? 1 : 0); Output.NumGPUInstanceDescriptors = GPUGroupHelper.NumInstanceDescriptors; Output.NumCPUInstanceDescriptors = CPUGroupHelper.NumInstanceDescriptors; return MoveTemp(Output); } FRayTracingSceneInitializationData BuildRayTracingSceneInitializationData(TConstArrayView Instances) { // Deprecated code path assumes all provided instances are visible. TBitArray<> VisibleInstances; VisibleInstances.SetNum(Instances.Num(), true); return BuildRayTracingSceneInitializationData(Instances, VisibleInstances); } void WriteInstanceDescriptor( const FRayTracingGeometryInstance& SceneInstance, uint32 SceneInstanceIndex, uint32 TransformIndex, uint32 AccelerationStructureIndex, bool bGpuSceneInstance, bool bUseUniqueUserData, bool bUseLightingChannels, bool bForceOpaque, bool bDisableTriangleCull, uint32 BaseInstanceIndex, uint32 BaseTransformIndex, FRayTracingInstanceDescriptor& OutInstanceDescriptor) { FRayTracingInstanceDescriptor InstanceDesc; if (bGpuSceneInstance) { if (SceneInstance.BaseInstanceSceneDataOffset != -1) { InstanceDesc.GPUSceneInstanceOrTransformIndex = SceneInstance.BaseInstanceSceneDataOffset + TransformIndex; } else { InstanceDesc.GPUSceneInstanceOrTransformIndex = SceneInstance.InstanceSceneDataOffsets[TransformIndex]; } } else { InstanceDesc.GPUSceneInstanceOrTransformIndex = BaseTransformIndex + TransformIndex; } uint32 UserData; if (bUseUniqueUserData) { UserData = SceneInstance.UserData[TransformIndex]; } else { UserData = SceneInstance.DefaultUserData; if (SceneInstance.bIncrementUserDataPerInstance) { UserData += TransformIndex; } } ERayTracingInstanceFlags InstanceFlags = SceneInstance.Flags; if (bUseLightingChannels && SceneInstance.bUsesLightingChannels) { // AHS is needed for lighting channels since the Raytracing API's work off an "inclusion" mask that's not compatible with lighting channels InstanceFlags &= ~ERayTracingInstanceFlags::ForceOpaque; } if (bForceOpaque) { InstanceFlags |= ERayTracingInstanceFlags::ForceOpaque; } if (bDisableTriangleCull) { InstanceFlags |= ERayTracingInstanceFlags::TriangleCullDisable; } InstanceDesc.OutputDescriptorIndex = BaseInstanceIndex + TransformIndex; InstanceDesc.AccelerationStructureIndex = AccelerationStructureIndex; InstanceDesc.InstanceId = UserData; InstanceDesc.InstanceMaskAndFlags = SceneInstance.Mask | ((uint32)InstanceFlags << 8); InstanceDesc.InstanceContributionToHitGroupIndex = SceneInstance.InstanceContributionToHitGroupIndex; InstanceDesc.SceneInstanceIndexAndApplyLocalBoundsTransform = (SceneInstance.bApplyLocalBoundsTransform ? 0x80000000 : 0) | SceneInstanceIndex; ensureMsgf(InstanceDesc.InstanceId <= 0xFFFFFF, TEXT("InstanceId must fit in 24 bits.")); ensureMsgf(InstanceDesc.InstanceContributionToHitGroupIndex <= 0xFFFFFF, TEXT("InstanceContributionToHitGroupIndex must fit in 24 bits.")); // copy at the end to avoid reading from OutInstanceDescriptor in the checks above OutInstanceDescriptor = InstanceDesc; } // Helper function to fill upload buffers required by BuildRayTracingInstanceBuffer with instance descriptors // Transforms of CPU instances are copied to OutTransformData void FillRayTracingInstanceUploadBuffer( FVector PreViewTranslation, TConstArrayView Instances, const TBitArray<>& VisibleInstances, bool bUseLightingChannels, bool bForceOpaque, bool bDisableTriangleCull, TConstArrayView InstanceGeometryIndices, TConstArrayView BaseUploadBufferOffsets, TConstArrayView BaseInstancePrefixSum, TConstArrayView InstanceGroupEntryRefs, uint32 NumGPUInstanceGroups, uint32 NumCPUInstanceGroups, uint32 NumGPUInstanceDescriptors, uint32 NumCPUInstanceDescriptors, TArrayView OutInstanceGroupUploadData, TArrayView OutInstanceUploadData, TArrayView OutTransformData) { TRACE_CPUPROFILER_EVENT_SCOPE(FillRayTracingInstanceUploadBuffer); const bool bRLEAllowed = CVarRayTracingInstanceBufferRLE.GetValueOnRenderThread(); const int32 NumSceneInstances = VisibleInstances.Num(); const int32 MinBatchSize = 128; ParallelFor( TEXT("FillRayTracingInstanceUploadBuffer_Parallel"), NumSceneInstances, MinBatchSize, [ OutInstanceGroupUploadData, OutInstanceUploadData, OutTransformData, NumGPUInstanceGroups, NumCPUInstanceGroups, NumGPUInstanceDescriptors, NumCPUInstanceDescriptors, Instances, &VisibleInstances, bUseLightingChannels, bForceOpaque, bDisableTriangleCull, InstanceGeometryIndices, BaseUploadBufferOffsets, BaseInstancePrefixSum, InstanceGroupEntryRefs, PreViewTranslation, bRLEAllowed ](int32 SceneInstanceIndex) { if(!VisibleInstances[SceneInstanceIndex]) { return; } const FRayTracingGeometryInstance& SceneInstance = Instances[SceneInstanceIndex]; const uint32 NumTransforms = SceneInstance.NumTransforms; checkf(SceneInstance.UserData.Num() == 0 || SceneInstance.UserData.Num() >= int32(NumTransforms), TEXT("User data array must be either be empty (Instance.DefaultUserData is used), or contain one entry per entry in Transforms array.")); const bool bUseUniqueUserData = SceneInstance.UserData.Num() != 0; const bool bGpuSceneInstance = SceneInstance.BaseInstanceSceneDataOffset != -1 || !SceneInstance.InstanceSceneDataOffsets.IsEmpty(); const bool bCpuInstance = !bGpuSceneInstance; checkf(bGpuSceneInstance + bCpuInstance == 1, TEXT("Instance can only get transforms from one of GPUScene, or Transforms array.")); const uint32 AccelerationStructureIndex = InstanceGeometryIndices[SceneInstanceIndex]; const uint32 BaseInstanceIndex = BaseInstancePrefixSum[SceneInstanceIndex]; const uint32 BaseTransformIndex = bCpuInstance ? BaseUploadBufferOffsets[SceneInstanceIndex] : 0; uint32 BaseDescriptorIndex = BaseUploadBufferOffsets[SceneInstanceIndex]; uint32 BaseDescriptorOffset = 0; // Upload buffer is split into 2 sections [GPUSceneInstances][CPUInstances] if (!bGpuSceneInstance) { BaseDescriptorOffset += NumGPUInstanceDescriptors; } const bool bRLECompatible = bRLEAllowed && (!bGpuSceneInstance || SceneInstance.BaseInstanceSceneDataOffset != -1) && !bUseUniqueUserData; const FRayTracingInstanceGroupEntryRef& GroupEntryRef = InstanceGroupEntryRefs[SceneInstanceIndex]; uint32 GroupIndex = GroupEntryRef.GroupIndex; uint32 BaseIndexInGroup = GroupEntryRef.BaseIndexInGroup; if (bCpuInstance) { GroupIndex += NumGPUInstanceGroups; } uint32 TransformIndex = 0; if (BaseIndexInGroup > 0) { // write N instances to fill (partial) head group const uint32 N = FMath::Min(GRayTracingInstanceGroupSize - BaseIndexInGroup, NumTransforms); for (; TransformIndex < N; ++TransformIndex) { WriteInstanceDescriptor( SceneInstance, SceneInstanceIndex, TransformIndex, AccelerationStructureIndex, bGpuSceneInstance, bUseUniqueUserData, bUseLightingChannels, bForceOpaque, bDisableTriangleCull, BaseInstanceIndex, BaseTransformIndex, OutInstanceUploadData[BaseDescriptorOffset + BaseDescriptorIndex]); ++BaseDescriptorIndex; } ++GroupIndex; } if (bRLECompatible) { const uint32 NumPackedGroups = (NumTransforms - TransformIndex) / GRayTracingInstanceGroupSize; // write packed groups for (uint32 PackedGroupIndex = 0; PackedGroupIndex < NumPackedGroups; ++PackedGroupIndex) { // write packed group FRayTracingInstanceGroup Group; Group.BaseInstanceIndex = BaseDescriptorIndex; Group.bIncrementUserDataPerInstance = SceneInstance.bIncrementUserDataPerInstance ? 1 : 0; Group.bReuseInstance = 1; OutInstanceGroupUploadData[GroupIndex] = Group; ++GroupIndex; // and corresponding instance WriteInstanceDescriptor( SceneInstance, SceneInstanceIndex, TransformIndex, AccelerationStructureIndex, bGpuSceneInstance, bUseUniqueUserData, bUseLightingChannels, bForceOpaque, bDisableTriangleCull, BaseInstanceIndex, BaseTransformIndex, OutInstanceUploadData[BaseDescriptorOffset + BaseDescriptorIndex]); ++BaseDescriptorIndex; TransformIndex += GRayTracingInstanceGroupSize; } } if (TransformIndex < NumTransforms) { // write tail groups (not packed) const uint32 NumTailGroups = FMath::DivideAndRoundUp(NumTransforms - TransformIndex, GRayTracingInstanceGroupSize); for (uint32 TailGroupIndex = 0; TailGroupIndex < NumTailGroups; ++TailGroupIndex) { FRayTracingInstanceGroup Group; Group.BaseInstanceIndex = BaseDescriptorIndex + TailGroupIndex * GRayTracingInstanceGroupSize; Group.bIncrementUserDataPerInstance = 0; Group.bReuseInstance = 0; OutInstanceGroupUploadData[GroupIndex] = Group; ++GroupIndex; } // and instances for (; TransformIndex < NumTransforms; ++TransformIndex) { WriteInstanceDescriptor( SceneInstance, SceneInstanceIndex, TransformIndex, AccelerationStructureIndex, bGpuSceneInstance, bUseUniqueUserData, bUseLightingChannels, bForceOpaque, bDisableTriangleCull, BaseInstanceIndex, BaseTransformIndex, OutInstanceUploadData[BaseDescriptorOffset + BaseDescriptorIndex]); ++BaseDescriptorIndex; } } if (bCpuInstance) { for (uint32 TransformIndex2 = 0; TransformIndex2 < NumTransforms; ++TransformIndex2) { const uint32 TransformDataOffset = (BaseTransformIndex + TransformIndex2) * 3; FMatrix LocalToTranslatedWorld = SceneInstance.Transforms[TransformIndex2].ConcatTranslation(PreViewTranslation); const FMatrix44f LocalToTranslatedWorldF = FMatrix44f(LocalToTranslatedWorld.GetTransposed()); OutTransformData[TransformDataOffset + 0] = *(FVector4f*)&LocalToTranslatedWorldF.M[0]; OutTransformData[TransformDataOffset + 1] = *(FVector4f*)&LocalToTranslatedWorldF.M[1]; OutTransformData[TransformDataOffset + 2] = *(FVector4f*)&LocalToTranslatedWorldF.M[2]; } } }, CVarRayTracingInstanceBufferParallelFill.GetValueOnRenderThread() ? EParallelForFlags::None : EParallelForFlags::ForceSingleThread); } void FillRayTracingInstanceUploadBuffer( FRayTracingSceneRHIRef RayTracingSceneRHI, FVector PreViewTranslation, TConstArrayView Instances, TConstArrayView InstanceGeometryIndices, TConstArrayView BaseUploadBufferOffsets, TConstArrayView BaseInstancePrefixSum, uint32 NumNativeGPUSceneInstances, uint32 NumNativeCPUInstances, TArrayView OutInstanceUploadData, TArrayView OutTransformData) { // Deprecated code path assumes all provided instances are visible. TBitArray<> VisibleInstances; VisibleInstances.SetNum(Instances.Num(), true); FillRayTracingInstanceUploadBuffer( PreViewTranslation, Instances, VisibleInstances, false, false, false, InstanceGeometryIndices, BaseUploadBufferOffsets, BaseInstancePrefixSum, {}, 0, 0, NumNativeGPUSceneInstances, NumNativeCPUInstances, {}, OutInstanceUploadData, OutTransformData); } struct FRayTracingBuildInstanceBufferCS : public FGlobalShader { DECLARE_GLOBAL_SHADER(FRayTracingBuildInstanceBufferCS); SHADER_USE_PARAMETER_STRUCT(FRayTracingBuildInstanceBufferCS, FGlobalShader); BEGIN_SHADER_PARAMETER_STRUCT(FParameters, ) SHADER_PARAMETER_STRUCT_INCLUDE(FGPUSceneResourceParametersRHI, GPUSceneParameters) SHADER_PARAMETER_UAV(RWStructuredBuffer, OutPlatformInstanceDescriptors) SHADER_PARAMETER_UAV(RWStructuredBuffer, OutHitGroupContributions) SHADER_PARAMETER_SRV(StructuredBuffer, InstanceGroupDescriptors) SHADER_PARAMETER_SRV(StructuredBuffer, InstanceDescriptors) SHADER_PARAMETER_SRV(ByteAddressBuffer, AccelerationStructureAddresses) SHADER_PARAMETER_SRV(StructuredBuffer, InstanceTransforms) SHADER_PARAMETER(uint32, MaxNumInstances) SHADER_PARAMETER(uint32, NumGroups) SHADER_PARAMETER(uint32, NumInstanceDescriptors) SHADER_PARAMETER(uint32, BaseGroupDescriptorIndex) SHADER_PARAMETER(uint32, BaseInstanceDescriptorIndex) SHADER_PARAMETER(FVector3f, PreViewTranslationHigh) SHADER_PARAMETER(FVector3f, PreViewTranslationLow) // Instance culling params SHADER_PARAMETER(float, CullingRadius) SHADER_PARAMETER(float, FarFieldCullingRadius) SHADER_PARAMETER(float, AngleThresholdRatioSq) SHADER_PARAMETER(FVector3f, ViewOrigin) SHADER_PARAMETER(uint32, CullingMode) SHADER_PARAMETER(uint32, CullUsingGroups) SHADER_PARAMETER_UAV(RWStructuredBuffer, RWOutputStats) SHADER_PARAMETER(uint32, OutputStatsOffset) // Debug parameters SHADER_PARAMETER_UAV(RWStructuredBuffer, RWInstanceExtraData) END_SHADER_PARAMETER_STRUCT() class FSupportInstanceGroupsDim : SHADER_PERMUTATION_BOOL("SUPPORT_INSTANCE_GROUPS"); class FUseGPUSceneDim : SHADER_PERMUTATION_BOOL("USE_GPUSCENE"); class FOutputInstanceExtraDataDim : SHADER_PERMUTATION_BOOL("OUTPUT_INSTANCE_EXTRA_DATA"); class FGpuCullingDim : SHADER_PERMUTATION_BOOL("GPU_CULLING"); class FOutputStatsDim : SHADER_PERMUTATION_BOOL("OUTPUT_STATS"); class FCompactOutputDim : SHADER_PERMUTATION_BOOL("COMPACT_OUTPUT"); class FUseWaveOpsDim : SHADER_PERMUTATION_BOOL("USE_WAVE_OPS"); class FUseSeparateHitContributionBuffer : SHADER_PERMUTATION_BOOL("USE_SEPARATE_HIT_CONTRIBUTION_BUFFER"); using FPermutationDomain = TShaderPermutationDomain; static constexpr uint32 ThreadGroupSize = GRayTracingInstanceGroupSize; static inline void ModifyCompilationEnvironment(const FGlobalShaderPermutationParameters& Parameters, FShaderCompilerEnvironment& OutEnvironment) { FGlobalShader::ModifyCompilationEnvironment(Parameters, OutEnvironment); OutEnvironment.SetDefine(TEXT("THREADGROUP_SIZE"), ThreadGroupSize); OutEnvironment.SetDefine(TEXT("VF_SUPPORTS_PRIMITIVE_SCENE_DATA"), 1); OutEnvironment.SetDefine(TEXT("USE_GLOBAL_GPU_SCENE_DATA"), 1); // Force DXC to avoid shader reflection issues. OutEnvironment.CompilerFlags.Add(CFLAG_ForceDXC); } static bool ShouldCompilePermutation(const FGlobalShaderPermutationParameters& Parameters) { FPermutationDomain PermutationVector(Parameters.PermutationId); if (PermutationVector.Get() && !RHISupportsWaveOperations(Parameters.Platform)) { return false; } return IsRayTracingEnabledForProject(Parameters.Platform); } static EShaderPermutationPrecacheRequest ShouldPrecachePermutation(const FGlobalShaderPermutationParameters& Parameters) { FPermutationDomain PermutationVector(Parameters.PermutationId); if (PermutationVector.Get() != GRHISupportsWaveOperations) { return EShaderPermutationPrecacheRequest::NotPrecached; } if (PermutationVector.Get() != GRHIGlobals.RayTracing.RequiresSeparateHitGroupContributionsBuffer) { return EShaderPermutationPrecacheRequest::NotPrecached; } return FGlobalShader::ShouldPrecachePermutation(Parameters); } }; IMPLEMENT_GLOBAL_SHADER(FRayTracingBuildInstanceBufferCS, "/Engine/Private/Raytracing/RayTracingInstanceBufferUtil.usf", "RayTracingBuildInstanceBufferCS", SF_Compute); void BuildRayTracingInstanceBuffer( FRHICommandList& RHICmdList, const FGPUSceneResourceParametersRHI* GPUSceneParameters, const FDFVector3& PreViewTranslation, uint32 MaxNumInstances, uint32 NumGroups, uint32 NumInstanceDescriptors, FRHIUnorderedAccessView* InstancesUAV, FRHIUnorderedAccessView* HitGroupContributionsUAV, FRHIShaderResourceView* InstanceGroupUploadSRV, uint32 InstanceGroupUploadOffset, FRHIShaderResourceView* InstanceUploadSRV, uint32 InstanceUploadOffset, FRHIShaderResourceView* AccelerationStructureAddressesSRV, FRHIShaderResourceView* InstanceTransformSRV, const FRayTracingCullingParameters* CullingParameters, bool bCompactOutput, FRHIUnorderedAccessView* OutputStatsUAV, uint32 OutputStatsOffset, FRHIUnorderedAccessView* InstanceExtraDataUAV) { FRayTracingBuildInstanceBufferCS::FParameters PassParams; PassParams.OutPlatformInstanceDescriptors = InstancesUAV; PassParams.OutHitGroupContributions = HitGroupContributionsUAV; PassParams.InstanceGroupDescriptors = InstanceGroupUploadSRV; PassParams.InstanceDescriptors = InstanceUploadSRV; PassParams.AccelerationStructureAddresses = AccelerationStructureAddressesSRV; PassParams.InstanceTransforms = InstanceTransformSRV; PassParams.MaxNumInstances = MaxNumInstances; PassParams.NumGroups = NumGroups; PassParams.NumInstanceDescriptors = NumInstanceDescriptors; PassParams.BaseGroupDescriptorIndex = InstanceGroupUploadOffset; PassParams.BaseInstanceDescriptorIndex = InstanceUploadOffset; PassParams.PreViewTranslationHigh = PreViewTranslation.High; PassParams.PreViewTranslationLow = PreViewTranslation.Low; if (GPUSceneParameters) { PassParams.GPUSceneParameters = *GPUSceneParameters; } if (CullingParameters) { PassParams.CullingRadius = CullingParameters->CullingRadius; PassParams.FarFieldCullingRadius = CullingParameters->FarFieldCullingRadius; PassParams.AngleThresholdRatioSq = CullingParameters->AngleThresholdRatioSq; PassParams.ViewOrigin = CullingParameters->TranslatedViewOrigin; PassParams.CullingMode = uint32(CullingParameters->CullingMode); PassParams.CullUsingGroups = uint32(CullingParameters->bCullUsingGroupIds); } PassParams.RWOutputStats = OutputStatsUAV; PassParams.OutputStatsOffset = OutputStatsOffset; PassParams.RWInstanceExtraData = InstanceExtraDataUAV; FRayTracingBuildInstanceBufferCS::FPermutationDomain PermutationVector; PermutationVector.Set(InstanceGroupUploadSRV != nullptr); PermutationVector.Set(InstanceTransformSRV == nullptr); PermutationVector.Set(InstanceExtraDataUAV != nullptr); PermutationVector.Set(OutputStatsUAV != nullptr); PermutationVector.Set(bCompactOutput); PermutationVector.Set(GRHISupportsWaveOperations); PermutationVector.Set(CullingParameters != nullptr); PermutationVector.Set(GRHIGlobals.RayTracing.RequiresSeparateHitGroupContributionsBuffer); auto ComputeShader = GetGlobalShaderMap(GMaxRHIFeatureLevel)->GetShader(PermutationVector); const FIntVector GroupCount = FComputeShaderUtils::GetGroupCountWrapped(PassParams.NumGroups); //ClearUnusedGraphResources(ComputeShader, &PassParams); SetComputePipelineState(RHICmdList, ComputeShader.GetComputeShader()); SetShaderParameters(RHICmdList, ComputeShader, ComputeShader.GetComputeShader(), PassParams); DispatchComputeShader(RHICmdList, ComputeShader.GetShader(), GroupCount.X, GroupCount.Y, GroupCount.Z); UnsetShaderUAVs(RHICmdList, ComputeShader, ComputeShader.GetComputeShader()); } void BuildRayTracingInstanceBuffer( FRHICommandList& RHICmdList, const FGPUSceneResourceParametersRHI* GPUSceneParameters, const FDFVector3& PreViewTranslation, FRHIUnorderedAccessView* InstancesUAV, FRHIUnorderedAccessView* HitGroupContributionsUAV, FRHIShaderResourceView* InstanceGroupUploadSRV, FRHIShaderResourceView* InstanceUploadSRV, FRHIShaderResourceView* AccelerationStructureAddressesSRV, FRHIShaderResourceView* CPUInstanceTransformSRV, uint32 MaxNumInstances, uint32 NumGPUGroups, uint32 NumCPUGroups, uint32 NumGPUInstanceDescriptors, uint32 NumCPUInstanceDescriptors, const FRayTracingCullingParameters* CullingParameters, bool bCompactOutput, FRHIUnorderedAccessView* OutputStatsUAV, uint32 OutputStatsOffset, FRHIUnorderedAccessView* InstanceExtraDataUAV) { if (NumGPUInstanceDescriptors > 0) { BuildRayTracingInstanceBuffer( RHICmdList, GPUSceneParameters, PreViewTranslation, MaxNumInstances, NumGPUGroups, NumGPUInstanceDescriptors, InstancesUAV, HitGroupContributionsUAV, InstanceGroupUploadSRV, 0, InstanceUploadSRV, 0, AccelerationStructureAddressesSRV, nullptr, CullingParameters, bCompactOutput, OutputStatsUAV, OutputStatsOffset, InstanceExtraDataUAV); } if (NumCPUInstanceDescriptors > 0) { BuildRayTracingInstanceBuffer( RHICmdList, GPUSceneParameters, PreViewTranslation, MaxNumInstances, NumCPUGroups, NumCPUInstanceDescriptors, InstancesUAV, HitGroupContributionsUAV, InstanceGroupUploadSRV, NumGPUGroups, // CPU instance group descriptors are stored after GPU Scene instance groups InstanceUploadSRV, NumGPUInstanceDescriptors, // CPU input instance descriptors are stored after GPU Scene instances AccelerationStructureAddressesSRV, CPUInstanceTransformSRV, nullptr, bCompactOutput, OutputStatsUAV, OutputStatsOffset, InstanceExtraDataUAV); } } void BuildRayTracingInstanceBuffer( FRHICommandList& RHICmdList, const FGPUScene* GPUScene, const FDFVector3& PreViewTranslation, FRHIUnorderedAccessView* InstancesUAV, FRHIUnorderedAccessView* HitGroupContributionsUAV, FRHIShaderResourceView* InstanceUploadSRV, FRHIShaderResourceView* AccelerationStructureAddressesSRV, FRHIShaderResourceView* CPUInstanceTransformSRV, uint32 NumNativeGPUSceneInstances, uint32 NumNativeCPUInstances, const FRayTracingCullingParameters* CullingParameters, FRHIUnorderedAccessView* OutputStatsUAV, FRHIUnorderedAccessView* InstanceExtraDataUAV) { FGPUSceneResourceParametersRHI GPUSceneParameters; if (GPUScene) { GPUSceneParameters = GPUScene->GetShaderParametersRHI(); } BuildRayTracingInstanceBuffer( RHICmdList, GPUScene ? &GPUSceneParameters : nullptr, PreViewTranslation, InstancesUAV, HitGroupContributionsUAV, nullptr, InstanceUploadSRV, AccelerationStructureAddressesSRV, CPUInstanceTransformSRV, NumNativeGPUSceneInstances + NumNativeCPUInstances, 0, 0, NumNativeGPUSceneInstances, NumNativeCPUInstances, CullingParameters, /*bCompactOutput*/ false, OutputStatsUAV, 0, InstanceExtraDataUAV); } void FRayTracingInstanceBufferBuilder::Init(TConstArrayView InInstances, FVector InPreViewTranslation) { checkf(Instances.IsEmpty(), TEXT("RayTracingInstanceBufferBuilder was already initialized.")); Instances = InInstances; PreViewTranslation = InPreViewTranslation; // Deprecated code path assumes all provided instances are visible. VisibleInstances.SetNum(Instances.Num(), true); Data = BuildRayTracingSceneInitializationData(Instances, VisibleInstances); } void FRayTracingInstanceBufferBuilder::Init(FRayTracingInstanceBufferBuilderInitializer Initializer) { Instances = Initializer.Instances; VisibleInstances = MoveTemp(Initializer.VisibleInstances); PreViewTranslation = Initializer.PreViewTranslation; bUseLightingChannels = Initializer.bUseLightingChannels; bForceOpaque = Initializer.bForceOpaque; bDisableTriangleCull = Initializer.bDisableTriangleCull; Data = BuildRayTracingSceneInitializationData(Instances, VisibleInstances); } void FRayTracingInstanceBufferBuilder::FillRayTracingInstanceUploadBuffer(FRHICommandList& RHICmdList) { // Round up buffer sizes to some multiple to avoid pathological growth reallocations. static constexpr uint32 AllocationGranularity = 8 * 1024; static constexpr uint64 BufferAllocationGranularity = 16 * 1024 * 1024; const uint32 NumInstanceGroups = Data.NumGPUInstanceGroups + Data.NumCPUInstanceGroups; const uint32 NumInstanceGroupsAligned = FMath::DivideAndRoundUp(FMath::Max(NumInstanceGroups, 1U), AllocationGranularity) * AllocationGranularity; const uint32 NumInstanceDescriptors = Data.NumGPUInstanceDescriptors + Data.NumCPUInstanceDescriptors; const uint32 NumInstanceDescriptorsAligned = FMath::DivideAndRoundUp(FMath::Max(NumInstanceDescriptors, 1U), AllocationGranularity) * AllocationGranularity; const uint32 NumTransformsAligned = FMath::DivideAndRoundUp(FMath::Max(Data.NumNativeCPUInstances, 1U), AllocationGranularity) * AllocationGranularity; { // Create/resize instance group upload buffer (if necessary) const uint32 UploadBufferSize = NumInstanceGroupsAligned * sizeof(FRayTracingInstanceGroup); if (!InstanceGroupUploadBuffer.IsValid() || UploadBufferSize > InstanceGroupUploadBuffer->GetSize() || UploadBufferSize < InstanceGroupUploadBuffer->GetSize() / 2) { const FRHIBufferCreateDesc CreateDesc = FRHIBufferCreateDesc::CreateStructured(TEXT("FRayTracingScene::InstanceGroupUploadBuffer"), UploadBufferSize, sizeof(FRayTracingInstanceGroup)) .AddUsage(EBufferUsageFlags::ShaderResource | EBufferUsageFlags::Volatile) .DetermineInitialState(); InstanceGroupUploadBuffer = RHICmdList.CreateBuffer(CreateDesc); InstanceGroupUploadSRV = RHICmdList.CreateShaderResourceView(InstanceGroupUploadBuffer, FRHIViewDesc::CreateBufferSRV().SetTypeFromBuffer(InstanceGroupUploadBuffer)); } } { // Create/resize instance upload buffer (if necessary) const uint32 UploadBufferSize = NumInstanceDescriptorsAligned * sizeof(FRayTracingInstanceDescriptor); if (!InstanceUploadBuffer.IsValid() || UploadBufferSize > InstanceUploadBuffer->GetSize() || UploadBufferSize < InstanceUploadBuffer->GetSize() / 2) { const FRHIBufferCreateDesc CreateDesc = FRHIBufferCreateDesc::CreateStructured(TEXT("FRayTracingScene::InstanceUploadBuffer"), UploadBufferSize, sizeof(FRayTracingInstanceDescriptor)) .AddUsage(EBufferUsageFlags::ShaderResource | EBufferUsageFlags::Volatile) .DetermineInitialState(); InstanceUploadBuffer = RHICmdList.CreateBuffer(CreateDesc); InstanceUploadSRV = RHICmdList.CreateShaderResourceView(InstanceUploadBuffer, FRHIViewDesc::CreateBufferSRV().SetTypeFromBuffer(InstanceUploadBuffer)); } } { const uint32 UploadBufferSize = NumTransformsAligned * sizeof(FVector4f) * 3; // Create/resize transform upload buffer (if necessary) if (!TransformUploadBuffer.IsValid() || UploadBufferSize > TransformUploadBuffer->GetSize() || UploadBufferSize < TransformUploadBuffer->GetSize() / 2) { const FRHIBufferCreateDesc CreateDesc = FRHIBufferCreateDesc::CreateStructured(TEXT("FRayTracingScene::TransformUploadBuffer"), UploadBufferSize, sizeof(FVector4f)) .AddUsage(EBufferUsageFlags::ShaderResource | EBufferUsageFlags::Volatile) .DetermineInitialState(); TransformUploadBuffer = RHICmdList.CreateBuffer(CreateDesc); TransformUploadSRV = RHICmdList.CreateShaderResourceView(TransformUploadBuffer, FRHIViewDesc::CreateBufferSRV().SetTypeFromBuffer(TransformUploadBuffer)); } } const uint32 InstanceGroupUploadBytes = NumInstanceGroups * sizeof(FRayTracingInstanceGroup); const uint32 InstanceUploadBytes = NumInstanceDescriptors * sizeof(FRayTracingInstanceDescriptor); const uint32 TransformUploadBytes = Data.NumNativeCPUInstances * 3 * sizeof(FVector4f); FRayTracingInstanceGroup* InstanceGroupUploadData = (FRayTracingInstanceGroup*)RHICmdList.LockBuffer(InstanceGroupUploadBuffer, 0, InstanceGroupUploadBytes, RLM_WriteOnly); FRayTracingInstanceDescriptor* InstanceUploadData = (FRayTracingInstanceDescriptor*)RHICmdList.LockBuffer(InstanceUploadBuffer, 0, InstanceUploadBytes, RLM_WriteOnly); FVector4f* TransformUploadData = (Data.NumNativeCPUInstances > 0) ? (FVector4f*)RHICmdList.LockBuffer(TransformUploadBuffer, 0, TransformUploadBytes, RLM_WriteOnly) : nullptr; ::FillRayTracingInstanceUploadBuffer( PreViewTranslation, Instances, VisibleInstances, bUseLightingChannels, bForceOpaque, bDisableTriangleCull, Data.InstanceGeometryIndices, Data.BaseUploadBufferOffsets, Data.BaseInstancePrefixSum, Data.InstanceGroupEntryRefs, Data.NumGPUInstanceGroups, Data.NumCPUInstanceGroups, Data.NumGPUInstanceDescriptors, Data.NumCPUInstanceDescriptors, MakeArrayView(InstanceGroupUploadData, NumInstanceGroups), MakeArrayView(InstanceUploadData, NumInstanceDescriptors), MakeArrayView(TransformUploadData, Data.NumNativeCPUInstances * 3)); RHICmdList.UnlockBuffer(InstanceGroupUploadBuffer); RHICmdList.UnlockBuffer(InstanceUploadBuffer); if (Data.NumNativeCPUInstances > 0) { RHICmdList.UnlockBuffer(TransformUploadBuffer); } } void FRayTracingInstanceBufferBuilder::FillAccelerationStructureAddressesBuffer(FRHICommandList& RHICmdList) { const uint32 NumGeometries = FMath::RoundUpToPowerOfTwo(Data.ReferencedGeometries.Num()); { // Round to PoT to avoid resizing too often const uint32 NumGeometriesAligned = FMath::RoundUpToPowerOfTwo(NumGeometries); const uint32 AccelerationStructureAddressesBufferSize = NumGeometriesAligned * sizeof(FRayTracingAccelerationStructureAddress); if (AccelerationStructureAddressesBuffer.NumBytes < AccelerationStructureAddressesBufferSize) { // Need to pass "BUF_MultiGPUAllocate", as virtual addresses are different per GPU AccelerationStructureAddressesBuffer.Initialize(RHICmdList, TEXT("FRayTracingScene::AccelerationStructureAddressesBuffer"), AccelerationStructureAddressesBufferSize, BUF_Volatile | BUF_MultiGPUAllocate); } } for (uint32 GPUIndex : RHICmdList.GetGPUMask()) { FRayTracingAccelerationStructureAddress* AddressesPtr = (FRayTracingAccelerationStructureAddress*)RHICmdList.LockBufferMGPU( AccelerationStructureAddressesBuffer.Buffer, GPUIndex, 0, NumGeometries * sizeof(FRayTracingAccelerationStructureAddress), RLM_WriteOnly); const TArrayView ReferencedGeometries = RHICmdList.AllocArray(MakeConstArrayView(Data.ReferencedGeometries)); RHICmdList.EnqueueLambda([AddressesPtr, ReferencedGeometries, GPUIndex](FRHICommandListBase&) { TRACE_CPUPROFILER_EVENT_SCOPE(GetAccelerationStructuresAddresses); for (int32 GeometryIndex = 0; GeometryIndex < ReferencedGeometries.Num(); ++GeometryIndex) { AddressesPtr[GeometryIndex] = ReferencedGeometries[GeometryIndex]->GetAccelerationStructureAddress(GPUIndex); } }); RHICmdList.UnlockBufferMGPU(AccelerationStructureAddressesBuffer.Buffer, GPUIndex); } } void FRayTracingInstanceBufferBuilder::BuildRayTracingInstanceBuffer( FRHICommandList& RHICmdList, const FGPUScene* GPUScene, const FRayTracingCullingParameters* CullingParameters, FRHIUnorderedAccessView* InstancesUAV, FRHIUnorderedAccessView* HitGroupContributionsUAV, uint32 MaxNumInstances, bool bCompactOutput, FRHIUnorderedAccessView* OutputStatsUAV, uint32 OutputStatsOffset, FRHIUnorderedAccessView* InstanceExtraDataUAV) { FGPUSceneResourceParametersRHI GPUSceneParameters; if (GPUScene) { GPUSceneParameters = GPUScene->GetShaderParametersRHI(); } ::BuildRayTracingInstanceBuffer( RHICmdList, GPUScene ? &GPUSceneParameters : nullptr, FDFVector3(PreViewTranslation), InstancesUAV, HitGroupContributionsUAV, InstanceGroupUploadSRV, InstanceUploadSRV, AccelerationStructureAddressesBuffer.SRV, TransformUploadSRV, MaxNumInstances, Data.NumGPUInstanceGroups, Data.NumCPUInstanceGroups, Data.NumGPUInstanceDescriptors, Data.NumCPUInstanceDescriptors, CullingParameters, bCompactOutput, OutputStatsUAV, OutputStatsOffset, InstanceExtraDataUAV); } void FRayTracingInstanceBufferBuilder::BuildRayTracingInstanceBuffer( FRHICommandList& RHICmdList, const FGPUScene* GPUScene, const FGPUSceneResourceParametersRHI* GPUSceneParameters, const FRayTracingCullingParameters* CullingParameters, FRHIUnorderedAccessView* InstancesUAV, FRHIUnorderedAccessView* HitGroupContributionsUAV, uint32 MaxNumInstances, bool bCompactOutput, FRHIUnorderedAccessView* OutputStatsUAV, uint32 OutputStatsOffset, FRHIUnorderedAccessView* InstanceExtraDataUAV) { if (CVarRayTracingInstanceBufferLateGetGPUSceneParameters.GetValueOnAnyThread()) { BuildRayTracingInstanceBuffer( RHICmdList, GPUScene, CullingParameters, InstancesUAV, HitGroupContributionsUAV, MaxNumInstances, bCompactOutput, OutputStatsUAV, OutputStatsOffset, InstanceExtraDataUAV); } else { ::BuildRayTracingInstanceBuffer( RHICmdList, GPUSceneParameters, FDFVector3(PreViewTranslation), InstancesUAV, HitGroupContributionsUAV, InstanceGroupUploadSRV, InstanceUploadSRV, AccelerationStructureAddressesBuffer.SRV, TransformUploadSRV, MaxNumInstances, Data.NumGPUInstanceGroups, Data.NumCPUInstanceGroups, Data.NumGPUInstanceDescriptors, Data.NumCPUInstanceDescriptors, CullingParameters, bCompactOutput, OutputStatsUAV, OutputStatsOffset, InstanceExtraDataUAV); } } PRAGMA_ENABLE_DEPRECATION_WARNINGS #endif //RHI_RAYTRACING