// Copyright Epic Games, Inc. All Rights Reserved.

// Nanite visibility culling
// In Nanite scene traversal, visibility determination and LOD selection all happens on the GPU. At the highest level the goal is to calculate a set of triangle clusters
// that needs to be rasterized based on the Scene and the set of active views.
// (Scene, Views) -> Clusters for rasterization

// Compile-time configuration defaults (normally provided by the shader permutation environment).
#ifndef CULLING_PASS
#define CULLING_PASS 0
#endif

#ifndef VIRTUAL_TEXTURE_TARGET
#define VIRTUAL_TEXTURE_TARGET 0
#endif

#ifndef NANITE_HIERARCHY_TRAVERSAL
#define NANITE_HIERARCHY_TRAVERSAL 0
#endif

#include "NaniteCulling.ush"

// Candidate node size differs between main and post occlusion passes (post-pass nodes carry extra data).
#define GROUP_NODE_SIZE NANITE_CANDIDATE_NODE_SIZE_DWORDS(CULLING_PASS == CULLING_PASS_OCCLUSION_POST)

// Main and Post pass candidates are allocated from opposite ends of the buffer
// Trim count so we don't have to worry about main and post stomping each other
#define CHECK_AND_TRIM_CLUSTER_COUNT (CULLING_PASS == CULLING_PASS_OCCLUSION_MAIN || CULLING_PASS == CULLING_PASS_OCCLUSION_POST)

#if NANITE_HIERARCHY_TRAVERSAL
# define NANITE_HIERARCHY_TRAVERSAL_TYPE (CULLING_TYPE)
# include "NaniteHierarchyTraversal.ush"
#endif

#if MATERIAL_CACHE
#include "../MaterialCache/MaterialCacheCommon.ush"
#endif // MATERIAL_CACHE

// Do not use shared samplers as it requires the View uniform buffer, which is not bound for this shader.
#define USE_HZB_SHARED_SAMPLERS 0
#include "../Common.ush"
#include "../SceneData.ush"
#include "../ViewData.ush"
#include "../WaveOpUtil.ush"
#include "../ComputeShaderUtils.ush"
#if VIRTUAL_TEXTURE_TARGET
#include "../VirtualShadowMaps/VirtualShadowMapPageOverlap.ush"
#include "../VirtualShadowMaps/VirtualShadowMapPageCacheCommon.ush"
#endif
#include "NaniteCullingCommon.ush"
#include "NaniteDataDecode.ush"
#include "NaniteAttributeDecode.ush"
#include "NaniteVertexDeformation.ush"
#include "NaniteHZBCull.ush"
#include "NaniteStreaming.ush"
#include "../GPUMessaging.ush"
#if USE_SPLINEDEFORM
#include "../SplineMeshCommon.ush"
#endif

// Depth bucketing is only available with the extended visible-cluster encoding (it carries the bucket field).
#define NANITE_DEPTH_BUCKETING NANITE_EXTENDED_VISIBLE_CLUSTERS

// The post pass uses the second queue state and allocates candidates from the opposite end of the buffers.
#if CULLING_PASS == CULLING_PASS_OCCLUSION_POST
static const bool bIsPostPass = true;
static const uint QueueStateIndex = 1;
#else
static const bool bIsPostPass = false;
static const uint QueueStateIndex = 0;
#endif

// Per-group bitmask of which children of each node were occluded this pass (re-tested in the post pass).
groupshared uint GroupOccludedBitmask[NANITE_MAX_BVH_NODES_PER_GROUP];

#if CULLING_TYPE == NANITE_CULLING_TYPE_PERSISTENT_NODES_AND_CLUSTERS
// Persistent traversal has concurrent producers/consumers, so accesses must be globally coherent.
#define OPTIONAL_COHERENT(Name) Name##Coherent
RWCoherentByteAddressBuffer CandidateNodes;
RWCoherentByteAddressBuffer CandidateClusters;
RWCoherentByteAddressBuffer ClusterBatches;
RWCoherentByteAddressBuffer InOutAssemblyTransforms;
#else
#define OPTIONAL_COHERENT(Name) Name
RWByteAddressBuffer CandidateNodes;
RWByteAddressBuffer CandidateClusters;
RWByteAddressBuffer InOutAssemblyTransforms;
#endif

// NOTE(review): the element types of the typed buffers below appear to have been stripped by extraction
// (e.g. Buffer<uint>, StructuredBuffer<uint2>) — confirm against the original file.
Buffer OffsetClustersArgsSWHW;
StructuredBuffer InTotalPrevDrawClusters;

RWStructuredBuffer OutStreamingRequests; // First entry holds count

RWByteAddressBuffer OutVisibleClustersSWHW;
RWBuffer VisibleClustersArgsSWHW;

#if DEBUG_FLAGS
RWStructuredBuffer OutStatsBuffer;
RWByteAddressBuffer OutDebugBuffer;
#endif

uint MaxAssemblyTransforms;
uint LargePageRectThreshold;
float DepthBucketsMinZ;
float DepthBucketsMaxZ;

// Transforms and conservatively expands node/cluster culling bounds to account for deformation:
// spline meshes, skinning, world position offset (WPO) and material displacement.
void TransformNodeCullingBounds(
	FNaniteView NaniteView,
	FPrimitiveSceneData PrimitiveData,
	FInstanceSceneData
	InstanceData,
	FCluster Cluster,
	bool bCompileTimeCluster,
	bool bIsAssemblyPart,
	uint CullingFlags,
	bool bEnableWPOBoundsExpansion,
	inout FNodeCullingBounds Bounds
)
{
	// TODO: Nanite-Skinning

#if USE_SPLINEDEFORM
	// To reduce the cost of register pressure from loading the spline mesh parameters, we loop once for each spline
	// mesh instance in the wave so the compiler can treat the parameters as uniform across the entire wave as an
	// optimization
	bool bLoop = (PrimitiveData.Flags & PRIMITIVE_SCENE_DATA_FLAG_SPLINE_MESH) != 0 &&
		(InstanceData.Flags & INSTANCE_SCENE_DATA_FLAG_HAS_PAYLOAD_EXTENSION) != 0;

	LOOP
	while (WaveActiveAnyTrue(bLoop))
	{
		if (bLoop)
		{
			// Scalarize: only lanes whose payload offset matches the wave-uniform one process this iteration.
			uint UniformPayloadOffset = WaveReadLaneFirst(InstanceData.PayloadExtensionOffset);
			if (InstanceData.PayloadExtensionOffset == UniformPayloadOffset)
			{
				// Calculate the approximate post-deformed cluster bounds and LOD bounds
				FSplineMeshShaderParams SplineMeshParams = SplineMeshLoadParamsFromInstancePayload(UniformPayloadOffset);
				FSplineMeshDeformedLocalBounds NewBounds = SplineMeshDeformLocalBounds(SplineMeshParams, Bounds.BoxCenter, Bounds.BoxExtent);
				Bounds.BoxCenter = NewBounds.BoundsCenter;
				Bounds.BoxExtent = NewBounds.BoundsExtent;

				// Also modify the sphere used to select the cut of the DAG for final LOD selection.
				// NOTE: This solution currently does nothing to maintain the inherent monotonicity of bounds between levels of
				// the DAG and as a result, it is possible this could result in clusters from different LODs overlapping, or
				// in clusters dropping out entirely.
				Bounds.Sphere = SplineMeshDeformLODSphereBounds(SplineMeshParams, Bounds.Sphere);
				Bounds.MeshMinDeformScale = SplineMeshParams.MeshDeformScaleMinMax.x;
				Bounds.NodeMaxDeformScale = NewBounds.MaxDeformScale;

				bLoop = false;
			}
		}
	}
#endif

	// Extend the bounds for WPO or displacement
	// NOTE: always extend the bounds if any material ignores the Enable WPO flag
	const bool bFallbackRaster = (CullingFlags & NANITE_CULLING_FLAG_FALLBACK_RASTER);
#if VIRTUAL_TEXTURE_TARGET
	const bool bIsShadowPass = true; // We know at compile time that this permutation is always for shadow
#else
	const bool bIsShadowPass = (RenderFlags & NANITE_RENDER_FLAG_IS_SHADOW_PASS) != 0;
#endif
	const float3 LocalWPOExtent = GetLocalMaxWPOExtent(PrimitiveData, InstanceData, bEnableWPOBoundsExpansion);
	Bounds.BoxExtent += LocalWPOExtent + GetMaxMaterialDisplacementExtent(PrimitiveData, bFallbackRaster, bIsShadowPass);

	if ((PrimitiveData.Flags & PRIMITIVE_SCENE_DATA_FLAG_SKINNED_MESH) != 0 &&
		!bIsAssemblyPart && // TODO: Nanite-Assemblies: Remove this when part bone transforms are accessible
		GetInstanceViewData(InstanceData.InstanceId, NaniteView.SceneRendererPrimaryViewId).bIsDeforming)
	{
		BRANCH
		if (bCompileTimeCluster)
		{
			const FSkinningHeader SkinningHeader = LoadSkinningHeader(InstanceData.PrimitiveId);

			BRANCH
			if (Cluster.bVoxel)
			{
				// Voxel clusters: conservatively transform the AABB by the per-cluster skinning transform.
				const float4x3 SkinningTransform4x3 = SampleVoxelPerClusterSkinningTransform(InstanceData, Cluster, SkinningHeader);
				Bounds.BoxExtent = mul(Bounds.BoxExtent, abs((float3x3)SkinningTransform4x3));
				Bounds.BoxCenter = mul(float4(Bounds.BoxCenter, 1.0f), SkinningTransform4x3);
			}
			else
			{
				BRANCH
				if (Cluster.NumClusterBoneInfluences > 0)
				{
					SkinClusterBounds(Cluster, InstanceData, SkinningHeader, Bounds.BoxCenter, Bounds.BoxExtent);
				}
				else
				{
					// No per-cluster bone influences: fall back to the instance's local bounds.
					Bounds.BoxExtent = InstanceData.LocalBoundsExtent;
					Bounds.BoxCenter = InstanceData.LocalBoundsCenter;
				}
			}
		}
		else
		{
			// TODO: Nanite-Skinning - Fun hack to temporarily "fix" broken cluster culling and VSM
			// Set the cluster bounds for skinned meshes equal to the skinned instance local bounds
			// for clusters and also node hierarchy slices. This satisfies the constraint that all
			// clusters in a node hierarchy have bounds fully enclosed in the parent bounds (monotonic).
			// Note: We do not touch the bounding sphere in Bounds because that would break actual
			// LOD decimation of the Nanite mesh. Instead we leave these in the offline computed ref-pose
			// so that we get reasonable "small enough to draw" calculations driving the actual LOD.
			// This is not a proper solution, as it hurts culling rate, and also causes VSM to touch far
			// more pages than necessary. But it's decent in the short term during R&D on a proper calculation.
			Bounds.BoxExtent = InstanceData.LocalBoundsExtent;
			Bounds.BoxCenter = InstanceData.LocalBoundsCenter;
		}
	}

#if SUPPORT_FIRST_PERSON_RENDERING
	if ((PrimitiveData.Flags & PRIMITIVE_SCENE_DATA_FLAG_IS_FIRST_PERSON) != 0)
	{
		ApplyFirstPersonTransformToBounds(NaniteView, InstanceData, Bounds.BoxCenter, Bounds.BoxExtent);
	}
#endif // SUPPORT_FIRST_PERSON_RENDERING
}

// Get the area of an "inclusive" rect (which means that the max is inside the rect), also guards against negative area (where min > max)
uint GetInclusiveRectArea(uint4 Rect)
{
	if (all(Rect.zw >= Rect.xy))
	{
		uint2 Size = Rect.zw - Rect.xy;
		return (Size.x + 1) * (Size.y + 1);
	}
	return 0;
}

// Projects the translated-world bounding sphere against the view and returns float2(min, max) projected
// edge scale, used for perspective-correct LOD error tests. Orthographic views return (1, 1).
float2 GetProjectedEdgeScales(FNaniteView NaniteView, FInstanceSceneData InstanceData, FInstanceDynamicData DynamicData, float4 Bounds)	// float2(min, max)
{
	if( NaniteView.ViewToClip[ 3 ][ 3 ] >= 1.0f )
	{
		// Ortho
		return float2( 1, 1 );
	}

	float3 Center = mul( float4( Bounds.xyz, 1.0f ), DynamicData.LocalToTranslatedWorld ).xyz;
	float Radius = Bounds.w * InstanceData.NonUniformScale.w;

	float ZNear = NaniteView.NearPlane;
	float DistToClusterSq = length2( Center );	// camera origin in (0,0,0)

	float Z = dot(NaniteView.ViewForward.xyz, Center);
	float XSq = DistToClusterSq - Z * Z;
	float X = sqrt( max(0.0f, XSq) );
	float DistToTSq = DistToClusterSq -
		Radius * Radius;
	float DistToT = sqrt( max(0.0f, DistToTSq) );

	float ScaledCosTheta = DistToT;
	float ScaledSinTheta = Radius;
	float ScaleToUnit = rcp( DistToClusterSq );

	// Cosine of the angles to the near (B) and far (T) sphere tangent points, rescaled to unit length.
	float By = ( ScaledSinTheta * X + ScaledCosTheta * Z ) * ScaleToUnit;
	float Ty = ( -ScaledSinTheta * X + ScaledCosTheta * Z ) * ScaleToUnit;

	float H = ZNear - Z;

	// If the camera is inside the sphere (DistToTSq < 0) or a tangent point falls behind the near plane,
	// recompute the cosine from the sphere/near-plane intersection instead.
	if( DistToTSq < 0.0f || By * DistToT < ZNear )
	{
		float Bx = max( X - sqrt( Radius * Radius - H * H ), 0.0f );
		By = ZNear * rsqrt( Bx * Bx + ZNear * ZNear );
	}

	if( DistToTSq < 0.0f || Ty * DistToT < ZNear )
	{
		float Tx = X + sqrt( Radius * Radius - H * H );
		Ty = ZNear * rsqrt( Tx * Tx + ZNear * ZNear );
	}

	float MinZ = max( Z - Radius, ZNear );
	float MaxZ = max( Z + Radius, ZNear );

	float MinCosAngle = Ty;
	float MaxCosAngle = By;

	if(Z + Radius > ZNear)
		return float2( MinZ * MinCosAngle, MaxZ * MaxCosAngle );
	else
		return float2( 0.0f, 0.0f );	// Sphere entirely behind the near plane
}

// Hierarchy traversal test: returns true when a node's children must be visited (its max parent LOD error
// still projects too large to stop here). Outputs a traversal Priority and applies min-LOD culling on leaves.
bool ShouldVisitChildInternal( FNaniteView NaniteView, FInstanceSceneData InstanceData, FInstanceDynamicData DynamicData, FNodeCullingBounds Bounds, FHierarchyNodeSlice HierarchyNodeSlice, inout float Priority )
{
	float2 ProjectedEdgeScales = GetProjectedEdgeScales(NaniteView, InstanceData, DynamicData, Bounds.Sphere);
	float UniformScale = Bounds.MeshMinDeformScale * min3( InstanceData.NonUniformScale.x, InstanceData.NonUniformScale.y, InstanceData.NonUniformScale.z );
	float Threshold = NaniteView.LODScale * UniformScale * HierarchyNodeSlice.MaxParentLODError;
	if( ProjectedEdgeScales.x <= Threshold )
	{
		Priority = Threshold / ProjectedEdgeScales.x;	// TODO: Experiment with better priority

		bool bSkipMinLODCulling = false;
#if DEBUG_FLAGS
		bSkipMinLODCulling |= (DebugFlags & (NANITE_DEBUG_FLAG_DISABLE_CULL_MIN_LOD | NANITE_DEBUG_FLAG_DRAW_ONLY_ROOT_DATA)) != 0u;
#endif

		return	bSkipMinLODCulling ||
				!HierarchyNodeSlice.bLeaf ||
				(ProjectedEdgeScales.y >= NaniteView.LODScale * UniformScale * HierarchyNodeSlice.MinLODError);
	}
	else
	{
		return false;
	}
}

// LOD decision: returns true when the cluster's projected error is small enough to draw at this LOD.
// Also selects SW vs HW rasterization via the inout bUseHWRaster flag.
bool SmallEnoughToDraw(
	FNaniteView NaniteView,
	FInstanceSceneData InstanceData,
	FInstanceDynamicData DynamicData,
	FNodeCullingBounds Bounds,
	float LODError,
	float EdgeLength,
	inout bool bUseHWRaster
)
{
	float ProjectedEdgeScale = GetProjectedEdgeScales( NaniteView, InstanceData, DynamicData, Bounds.Sphere ).x;
	float UniformScale = Bounds.MeshMinDeformScale * min3( InstanceData.NonUniformScale.x, InstanceData.NonUniformScale.y, InstanceData.NonUniformScale.z );
	bool bVisible = ProjectedEdgeScale > UniformScale * LODError * NaniteView.LODScale;

	if (RenderFlags & NANITE_RENDER_FLAG_FORCE_HW_RASTER)
	{
		bUseHWRaster = true;
	}
	else
	{
		// Clusters whose edges project large on screen go down the HW rasterization path.
		float HWEdgeScale = InstanceData.NonUniformScale.w * Bounds.NodeMaxDeformScale;
		bUseHWRaster |= ProjectedEdgeScale < HWEdgeScale * abs( EdgeLength ) * NaniteView.LODScaleHW;	// TODO: EdgeLength shouldn't have sign
	}

	return bVisible;
}

// Maps a view-space depth to one of NANITE_NUM_DEPTH_BUCKETS_PER_BLOCK buckets, log2-distributed
// between DepthBucketsMinZ and DepthBucketsMaxZ (input is clamped to that range).
uint DepthToBucket(float Z)
{
	const float ClampedZ = clamp(Z, DepthBucketsMinZ, DepthBucketsMaxZ);
#if 1
	// Logarithmic bucket distribution.
	// TODO: Consider a bucket distribution with infinite max z?
	const float A = 1.0f / (log2(DepthBucketsMaxZ) - log2(DepthBucketsMinZ));
	const float B = -log2(DepthBucketsMinZ) * A;
	const float T = log2(ClampedZ) * A + B;
#else
	// Linear bucket distribution (disabled).
	const float T = (ClampedZ - DepthBucketsMinZ) / (DepthBucketsMaxZ - DepthBucketsMinZ);
#endif

	int DepthBucket = floor(T * NANITE_NUM_DEPTH_BUCKETS_PER_BLOCK);
	return (uint)clamp(DepthBucket, 0, (int)NANITE_NUM_DEPTH_BUCKETS_PER_BLOCK - 1);
}

// Loads a 4x4 assembly transform stored as a transposed float3x4 at TransformIndex.
// Returns identity when assembly data is compiled out.
float4x4 LoadAssemblyTransform(uint TransformIndex)
{
#if NANITE_ASSEMBLY_DATA
	const uint BufferAddress = TransformIndex * (uint)sizeof(float3x4);
	// NOTE(review): the templated Load<float3x4> type argument appears stripped by extraction — confirm against the original file.
	const float3x4 TransposedTransform = InOutAssemblyTransforms.Load(BufferAddress);
	return transpose(float4x4(
		TransposedTransform[0],
		TransposedTransform[1],
		TransposedTransform[2],
		float4(0, 0, 0, 1)
	));
#else
	return float4x4(
		float4(1, 0, 0, 0),
		float4(0, 1, 0, 0),
		float4(0, 0, 1, 0),
		float4(0, 0, 0, 1)
	);
#endif
}

#if NANITE_HIERARCHY_TRAVERSAL

MAX_OCCUPANCY
DISABLE_TARGET_OCCUPANCY_WARNING

// Callback object implementing per-node / per-cluster culling for the shared hierarchy traversal
// framework (see NaniteHierarchyTraversal.ush).
struct FNaniteTraversalClusterCullCallback
{
	uint
	ChildIndex;				// Child slot of the node this lane is culling (bit index into the enabled/occluded masks)
	uint LocalNodeIndex;	// Node index within this thread group (indexes GroupOccludedBitmask)
	FCandidateNode CandidateNode;
	FNaniteView NaniteView;
	FInstanceSceneData InstanceData;
	uint AssemblyTransformIndex;

	bool bVisible;
	float StreamingPriority;

	// Unpacks the candidate node from group shared memory and caches the view/instance data for it.
	void Init(uint InChildIndex, uint InLocalNodeIndex, uint GroupNodeFetchIndex)
	{
		ChildIndex = InChildIndex;
		LocalNodeIndex = InLocalNodeIndex;

		const uint4 NodeData = GetGroupNodeData(GroupNodeFetchIndex);

		CandidateNode = UnpackCandidateNode(NodeData, bIsPostPass);

		NaniteView = GetNaniteView(CandidateNode.ViewId);
		InstanceData = GetInstanceSceneDataUnchecked(CandidateNode.InstanceId);
		AssemblyTransformIndex = CandidateNode.AssemblyTransformIndex;
	}

	uint GetHierarchyNodeOffset()
	{
		return ::GetHierarchyNodeOffset(InstanceData.NaniteHierarchyOffset, CandidateNode.NodeIndex);
	}

	// Full cull test for one child of a candidate node: distance/clip-plane/frustum/HZB culling, the LOD
	// traversal test, and (for assemblies) assembly transform loading/writing. Returns true when the child
	// should be traversed further; occluded-but-loaded children are recorded for the post pass.
	bool ShouldVisitChild(FHierarchyNodeSlice HierarchyNodeSlice, bool bInVisible)
	{
		bVisible = bInVisible;
#if CULLING_PASS == CULLING_PASS_OCCLUSION_POST
		if ((CandidateNode.EnabledBitmask & (1u << ChildIndex)) == 0u)	// Need to check bEnabled because instance cull always writes full mask
		{
			bVisible = false;
		}
#endif

		StreamingPriority = 0.0f;
		bool bOccluded = false;
		float4x4 AssemblyTransform = (float4x4)0;
		float4x4 UnskinnedAssemblyTransform = (float4x4)0; // Used for LOD bounds

		FPrimitiveSceneData PrimitiveData = GetPrimitiveData(InstanceData.PrimitiveId);
		bool bWriteAssemblyTransform = false;
		bool bHasAssemblyTransform = IsValidAssemblyTransformIndex(AssemblyTransformIndex);
		const FInstanceViewData InstanceViewData = GetInstanceViewData(InstanceData.InstanceId, NaniteView.SceneRendererPrimaryViewId);
		const bool bSkinned = (PrimitiveData.Flags & PRIMITIVE_SCENE_DATA_FLAG_SKINNED_MESH) != 0;
		const bool bActiveSkinning = bSkinned && InstanceViewData.bIsDeforming;

		BRANCH
		if (bHasAssemblyTransform)
		{
			AssemblyTransform = LoadAssemblyTransform(AssemblyTransformIndex);
			BRANCH
			if (bActiveSkinning)
			{
				// The unskinned transform is stored two entries after the skinned one (see the write path below).
				UnskinnedAssemblyTransform = LoadAssemblyTransform(AssemblyTransformIndex + 2);
			}
			else
			{
				UnskinnedAssemblyTransform = AssemblyTransform;
			}
		}

		BRANCH
		if (bVisible)
		{
			FInstanceDynamicData DynamicData = CalculateInstanceDynamicData(NaniteView, InstanceData, true);
			FNodeCullingBounds NodeBounds = InitNodeCullingBounds(HierarchyNodeSlice);

			BRANCH
			if (IsValidHierarchyAssemblyTransformIndex(HierarchyNodeSlice.AssemblyTransformIndex))
			{
				// TODO: Combine the matrices for recursive assemblies
				AssemblyTransform = LoadNaniteHierarchyAssemblyTransform(PrimitiveData.NaniteAssemblyTransformOffset, HierarchyNodeSlice.AssemblyTransformIndex);
				UnskinnedAssemblyTransform = AssemblyTransform;

				BRANCH
				if (bActiveSkinning)
				{
					AssemblyTransform = SkinNaniteHierarchyAssemblyTransform(PrimitiveData, InstanceData, HierarchyNodeSlice.AssemblyTransformIndex, AssemblyTransform, false);
				}

				bHasAssemblyTransform = true;
				bWriteAssemblyTransform = true;
			}

			BRANCH
			if (bHasAssemblyTransform)
			{
				NodeBounds = TransformNodeCullingBounds(NodeBounds, AssemblyTransform, UnskinnedAssemblyTransform);
			}

			const bool bEnableWPO = (CandidateNode.Flags & NANITE_CULLING_FLAG_ENABLE_WPO) != 0u;
			bool bExpandWPOBounds = bEnableWPO;
#if VIRTUAL_TEXTURE_TARGET
			{
				// We always need to expand the bounds even if WPO is distance disabled because we still need to
				// invalidate the whole region in case it starts animating next frame/in the future.
				const bool bWPOAllowed = VirtualShadowMapIsWPOAllowed(PrimitiveData, NaniteView.TargetLayerIndex);
				bExpandWPOBounds = ShouldMaterialInvalidateShadowCache(PrimitiveData, bWPOAllowed);
			}
#endif
			TransformNodeCullingBounds(NaniteView, PrimitiveData, InstanceData, (FCluster)0, false, bHasAssemblyTransform, CandidateNode.Flags, bExpandWPOBounds, NodeBounds);

			FBoxCull Cull;
			Cull.Init( NaniteView, NodeBounds.BoxCenter, NodeBounds.BoxExtent, InstanceData.NonUniformScale, DynamicData.LocalToTranslatedWorld, DynamicData.PrevLocalToTranslatedWorld );
			Cull.Distance();
			Cull.GlobalClipPlane();

#if CULLING_PASS == CULLING_PASS_OCCLUSION_POST
			// In the post pass the LOD test only runs when it was deferred from the main pass.
			if (Cull.bIsVisible && CandidateNode.Flags & NANITE_CULLING_FLAG_TEST_LOD)
#endif
			{
				Cull.bIsVisible = ShouldVisitChildInternal(NaniteView, InstanceData, DynamicData, NodeBounds, HierarchyNodeSlice, StreamingPriority);
			}

			BRANCH
			if (Cull.bIsVisible)
			{
#if VIRTUAL_TEXTURE_TARGET
				const bool bCacheAsStatic = (CandidateNode.Flags & NANITE_CULLING_FLAG_CACHE_AS_STATIC) != 0u;
				// If we're rendering into the static cache, it's not safe to use the receiver mask as we may cache that (full) page
				Cull.bUseReceiverMask = Cull.bUseReceiverMask && !bCacheAsStatic;
				Cull.PageFlagMask = GetPageFlagMaskForRendering(bCacheAsStatic, CandidateNode.InstanceId, NaniteView.SceneRendererPrimaryViewId);
				Cull.bIsStaticGeometry = bCacheAsStatic;
#endif
				Cull.FrustumHZB( false );
			}

#if CULLING_PASS == CULLING_PASS_OCCLUSION_MAIN
			// Occluded but already-loaded children are remembered so the post pass can re-test them.
			BRANCH
			if (Cull.bIsVisible && Cull.bWasOccluded && HierarchyNodeSlice.bLoaded)
			{
				bOccluded = true;
			}
#endif
			bVisible = Cull.bIsVisible && !Cull.bWasOccluded;
		}

#if DEBUG_FLAGS
		if ((DebugFlags & NANITE_DEBUG_FLAG_HIDE_ASSEMBLY_PARTS) != 0 && bHasAssemblyTransform)
		{
			bVisible = false;
			bOccluded = false;
		}
#endif

		BRANCH
		if (bVisible && bWriteAssemblyTransform)
		{
			// Actively skinned assemblies store 3 transforms: skinned, prev-skinned (+1) and unskinned (+2).
			const uint NumAssemblyTransformsToWrite = bActiveSkinning ? 3 : 1;
			uint TempAssemblyTransformIndex = 0; // writing to a temp before assignment to member fixes an error with VK
			WaveInterlockedAdd_(QueueState[0].AssemblyTransformsWriteOffset, NumAssemblyTransformsToWrite, TempAssemblyTransformIndex);
			AssemblyTransformIndex = TempAssemblyTransformIndex;

			BRANCH
			if (AssemblyTransformIndex + NumAssemblyTransformsToWrite <= MaxAssemblyTransforms)
			{
				InOutAssemblyTransforms.Store(AssemblyTransformIndex * (uint)sizeof(float3x4), (float3x4)transpose(AssemblyTransform));
				if (bActiveSkinning)
				{
					// Explicitly reload assembly transform to break dependency and shorten lifetime.
					// TODO: This will need to be fixed up for recursive assemblies
					const float4x4 HierarchyAssemblyTransform = LoadNaniteHierarchyAssemblyTransform(PrimitiveData.NaniteAssemblyTransformOffset, HierarchyNodeSlice.AssemblyTransformIndex);
					const float4x4 PrevSkinnedAssemblyTransform = SkinNaniteHierarchyAssemblyTransform(PrimitiveData, InstanceData, HierarchyNodeSlice.AssemblyTransformIndex, HierarchyAssemblyTransform, true);
					InOutAssemblyTransforms.Store((AssemblyTransformIndex + 1) * (uint)sizeof(float3x4), (float3x4)transpose(PrevSkinnedAssemblyTransform));
					InOutAssemblyTransforms.Store((AssemblyTransformIndex + 2) * (uint)sizeof(float3x4), (float3x4)transpose(HierarchyAssemblyTransform));
				}
#if DEBUG_FLAGS
				if ((DebugFlags & NANITE_DEBUG_FLAG_WRITE_ASSEMBLY_META) != 0)
				{
					// write the local assembly transform index for the sake of retrieving that for debug visualizers
					OutDebugBuffer.Store(AssemblyTransformIndex * 4u, HierarchyNodeSlice.AssemblyTransformIndex);
				}
#endif
			}
			else
			{
				// Out of assembly transform buffer space: drop the child entirely.
				bVisible = false;
				bOccluded = false;
			}
		}

		if (bOccluded)
		{
			InterlockedOr(GroupOccludedBitmask[LocalNodeIndex], 1u << ChildIndex);
		}

		return bVisible;
	}

	// Clears the per-node occlusion bitmask before a node batch is processed (main pass only).
	void OnPreProcessNodeBatch(uint GroupIndex)
	{
#if CULLING_PASS == CULLING_PASS_OCCLUSION_MAIN
		if (GroupIndex < NANITE_MAX_BVH_NODES_PER_GROUP)
		{
			GroupOccludedBitmask[GroupIndex] = 0u;
		}
#endif
	}

	void OnPostNodeVisit(FHierarchyNodeSlice
	HierarchyNodeSlice)
	{
		// Visible leaves feed the streaming system so missing page ranges get requested.
		if (bVisible && HierarchyNodeSlice.bLeaf)
		{
			RequestPageRange(OutStreamingRequests, InstanceData.NaniteRuntimeResourceID, HierarchyNodeSlice.ResourcePageRangeKey, NaniteView.StreamingPriorityCategory, StreamingPriority);
		}

#if CULLING_PASS == CULLING_PASS_OCCLUSION_MAIN
		// The lane handling child 0 re-enqueues the node (with only the occluded children enabled) for the post pass.
		if (ChildIndex == 0 && GroupOccludedBitmask[LocalNodeIndex])
		{
			uint OccludedNodesOffset;
			WaveInterlockedAddScalar_(QueueState[0].PassState[1].NodeWriteOffset, 1, OccludedNodesOffset);
			WaveInterlockedAddScalar(QueueState[0].PassState[1].NodeCount, 1);

			if (OccludedNodesOffset < MaxNodes)
			{
				FCandidateNode Node;
				Node.Flags = CandidateNode.Flags & ~NANITE_CULLING_FLAG_TEST_LOD;	// LOD was already decided in the main pass
				Node.ViewId = CandidateNode.ViewId;
				Node.InstanceId = CandidateNode.InstanceId;
				Node.NodeIndex = CandidateNode.NodeIndex;
				Node.EnabledBitmask = GroupOccludedBitmask[LocalNodeIndex];
				Node.AssemblyTransformIndex = AssemblyTransformIndex;
				OPTIONAL_COHERENT(StoreCandidateNodeData)(CandidateNodes, OccludedNodesOffset, PackCandidateNode(Node), true);
			}
		}
#endif
	}

	// Writes a child node as a new traversal candidate.
	void StoreChildNode(uint StoreIndex, FHierarchyNodeSlice HierarchyNodeSlice)
	{
		FCandidateNode Node;
		Node.Flags = CandidateNode.Flags | NANITE_CULLING_FLAG_TEST_LOD;
		Node.ViewId = CandidateNode.ViewId;
		Node.InstanceId = CandidateNode.InstanceId;
		Node.NodeIndex = HierarchyNodeSlice.ChildStartReference;
		Node.EnabledBitmask = NANITE_BVH_NODE_ENABLE_MASK;
		Node.AssemblyTransformIndex = AssemblyTransformIndex;
		OPTIONAL_COHERENT(StoreCandidateNodeData)(CandidateNodes, StoreIndex, PackCandidateNode(Node), bIsPostPass);
	}

	// Writes a leaf's cluster as a candidate cluster for the cluster culling stage.
	void StoreCluster(uint StoreIndex, FHierarchyNodeSlice HierarchyNodeSlice, uint ClusterIndex)
	{
		// Post-pass candidates are allocated from the top of the buffer; main pass from the bottom.
		StoreIndex = bIsPostPass ? (MaxCandidateClusters - 1 - StoreIndex) : StoreIndex;

		FVisibleCluster CandidateCluster;
		CandidateCluster.Flags = CandidateNode.Flags | NANITE_CULLING_FLAG_TEST_LOD;
		CandidateCluster.ViewId = CandidateNode.ViewId;
		CandidateCluster.InstanceId = CandidateNode.InstanceId;
		CandidateCluster.PageIndex = HierarchyNodeSlice.ChildStartReference >> NANITE_MAX_CLUSTERS_PER_PAGE_BITS;
		CandidateCluster.ClusterIndex = ClusterIndex;
		CandidateCluster.AssemblyTransformIndex = AssemblyTransformIndex;
		CandidateCluster.DepthBucket = 0;
		OPTIONAL_COHERENT(StoreVisibleCluster)(CandidateClusters, StoreIndex, CandidateCluster, false);
	}

	uint4 LoadPackedCluster(uint CandidateIndex)
	{
		// Mirror of StoreCluster's dual-ended indexing.
		const uint LoadIndex = bIsPostPass ? (MaxCandidateClusters - 1 - CandidateIndex) : CandidateIndex;
		return OPTIONAL_COHERENT(LoadVisibleClusterData)(CandidateClusters, LoadIndex, false);
	}

	bool IsNodeDataReady(uint4 RawData)
	{
		return all(RawData != 0xFFFFFFFFu);	// Unused parts are 0, which is ignored
	}

	// Copies a candidate node into group shared memory; optionally only when its data is fully written.
	bool LoadCandidateNodeDataToGroup(uint NodeIndex, uint GroupIndex, bool bCheckIfReady = true)
	{
		uint4 NodeData = OPTIONAL_COHERENT(LoadCandidateNodeData)(CandidateNodes, NodeIndex, bIsPostPass);
		bool bNodeReady = IsNodeDataReady(NodeData);
		if (!bCheckIfReady || bNodeReady)
		{
			SetGroupNodeData(GroupIndex, NodeData);
		}
		return bNodeReady;
	}

	// Resets a node slot to the "not ready" sentinel (all 0xFFFFFFFF).
	void ClearCandidateNodeData(uint NodeIndex)
	{
		::OPTIONAL_COHERENT(StoreCandidateNodeData)(CandidateNodes, NodeIndex, 0xFFFFFFFFu, bIsPostPass);
	}

#if CULLING_TYPE == NANITE_CULLING_TYPE_PERSISTENT_NODES_AND_CLUSTERS
	// Cluster batch counters used by the persistent (producer/consumer) traversal.
	void AddToClusterBatch(uint BatchIndex, uint Num)
	{
		checkSlow(BatchIndex < GetMaxClusterBatches());
		ClusterBatches.InterlockedAdd(GetClusterBatchesOffset(bIsPostPass) + BatchIndex * 4, Num);
	}

	void ClearClusterBatch(uint BatchIndex)
	{
		checkSlow(BatchIndex < GetMaxClusterBatches());
		ClusterBatches.Store(GetClusterBatchesOffset(bIsPostPass) + BatchIndex * 4, 0);
	}

	uint LoadClusterBatch(uint BatchIndex)
	{
		checkSlow(BatchIndex < GetMaxClusterBatches());
		return
		ClusterBatches.Load(GetClusterBatchesOffset(bIsPostPass) + BatchIndex * 4);
	}
#endif

	// Appends a visible cluster to the SW or HW output range. SW clusters are written from the bottom of
	// the buffer and HW clusters from the top, so both ranges share one allocation without overlapping.
	void EmitVisibleCluster(bool bUseHWRaster, uint2 TotalPrevDrawClusters, uint HWClusterCounterIndex, FVisibleCluster VisibleCluster)
	{
		if (bUseHWRaster)
		{
			uint ClusterOffsetHW = 0;
			WaveInterlockedAddScalar_(VisibleClustersArgsSWHW[HWClusterCounterIndex], 1, ClusterOffsetHW);

			uint VisibleClusterOffsetHW = ClusterOffsetHW;
			VisibleClusterOffsetHW += TotalPrevDrawClusters.y;
#if CULLING_PASS == CULLING_PASS_OCCLUSION_POST
			VisibleClusterOffsetHW += OffsetClustersArgsSWHW[HWClusterCounterIndex];	// Start after the main pass' clusters
#endif
			if (VisibleClusterOffsetHW < MaxVisibleClusters)
			{
				StoreVisibleCluster(OutVisibleClustersSWHW, (MaxVisibleClusters - 1) - VisibleClusterOffsetHW, VisibleCluster, VIRTUAL_TEXTURE_TARGET);	// HW clusters written from the top
			}
		}
		else
		{
			uint ClusterOffsetSW = 0;
			WaveInterlockedAddScalar_(VisibleClustersArgsSWHW[0], 1, ClusterOffsetSW);

			uint VisibleClusterOffsetSW = ClusterOffsetSW;
			VisibleClusterOffsetSW += TotalPrevDrawClusters.x;
#if CULLING_PASS == CULLING_PASS_OCCLUSION_POST
			VisibleClusterOffsetSW += OffsetClustersArgsSWHW[0];
#endif
			if (VisibleClusterOffsetSW < MaxVisibleClusters)
			{
				StoreVisibleCluster(OutVisibleClustersSWHW, VisibleClusterOffsetSW, VisibleCluster, VIRTUAL_TEXTURE_TARGET);	// SW clusters written from the bottom
			}
		}
	}

	// Culls one candidate cluster: LOD test, box culling (distance/clip-plane/frustum/HZB), SW/HW raster
	// selection, then emission of the surviving cluster (per VSM page window when targeting a virtual shadow map).
	void ProcessCluster(uint4 PackedCluster)
	{
		FVisibleCluster VisibleCluster = UnpackVisibleCluster(PackedCluster, false);

		FInstanceSceneData InstanceData = GetInstanceSceneDataUnchecked(VisibleCluster.InstanceId);

		FNaniteView NaniteView = GetNaniteView(VisibleCluster.ViewId);
		FCluster Cluster = GetCluster(VisibleCluster.PageIndex, VisibleCluster.ClusterIndex);

		FInstanceDynamicData DynamicData = CalculateInstanceDynamicData(NaniteView, InstanceData, true);
		FInstanceViewData InstanceViewData = GetInstanceViewData(InstanceData.InstanceId, NaniteView.SceneRendererPrimaryViewId);

		const uint HWClusterCounterIndex = GetHWClusterCounterIndex(RenderFlags);
		FPrimitiveSceneData PrimitiveData = GetPrimitiveData(InstanceData.PrimitiveId);

		FNodeCullingBounds ClusterBounds = InitNodeCullingBounds(InstanceData, Cluster);
		const bool bIsAssemblyPartCluster = IsAssemblyPartCluster(VisibleCluster);
		const bool bActiveSkinning = Cluster.bSkinning && InstanceViewData.bIsDeforming;
		if (bIsAssemblyPartCluster)
		{
			const float4x4 AssemblyTransform = LoadAssemblyTransform(VisibleCluster.AssemblyTransformIndex);
			BRANCH
			if (bActiveSkinning)
			{
				// The unskinned transform (stored two entries later) drives the LOD bounds.
				const float4x4 LODAssemblyTransform = LoadAssemblyTransform(VisibleCluster.AssemblyTransformIndex + 2);
				ClusterBounds = TransformNodeCullingBounds(ClusterBounds, AssemblyTransform, LODAssemblyTransform);
			}
			else
			{
				ClusterBounds = TransformNodeCullingBounds(ClusterBounds, AssemblyTransform, AssemblyTransform);
			}
		}

		const bool bEnableWPO = (VisibleCluster.Flags & NANITE_CULLING_FLAG_ENABLE_WPO) != 0u;
		bool bExpandWPOBounds = bEnableWPO;
#if VIRTUAL_TEXTURE_TARGET
		bool bInvalidatePages = ShouldMaterialInvalidateShadowCache(PrimitiveData, bEnableWPO) || GetInstanceViewData(VisibleCluster.InstanceId, NaniteView.SceneRendererPrimaryViewId).bIsDeforming;
		const bool bWPOAllowed = ShouldMaterialInvalidateShadowCache(PrimitiveData, VirtualShadowMapIsWPOAllowed(PrimitiveData, NaniteView.TargetLayerIndex));

		// We always need to expand the bounds even if WPO is distance disabled because we still need to
		// mark the whole region in case it starts animating next frame/in the future.
		bExpandWPOBounds = bWPOAllowed;
#endif
		TransformNodeCullingBounds(NaniteView, PrimitiveData, InstanceData, Cluster, true, bIsAssemblyPartCluster, VisibleCluster.Flags, bExpandWPOBounds, ClusterBounds);

		bool bUseHWRaster = false;

		FBoxCull Cull;
		Cull.Init( NaniteView, ClusterBounds.BoxCenter, ClusterBounds.BoxExtent, InstanceData.NonUniformScale, DynamicData.LocalToTranslatedWorld, DynamicData.PrevLocalToTranslatedWorld );

		// If the cluster isn't already sorted into the fallback bin, and the primitive enables per-cluster displacement fallback
		// rasterization, we can check to disable displacement at a cluster level.
		// NOTE: We never do WPO or pixel programmable distance at the cluster level, so skip those;
		Cull.bSkipDisplacementFadeOutDistance |=
			(VisibleCluster.Flags & NANITE_CULLING_FLAG_FALLBACK_RASTER) != 0 ||
			(PrimitiveData.Flags & PRIMITIVE_SCENE_DATA_FLAG_PER_CLUSTER_DISPLACEMENT_FALLBACK_RASTER) == 0;
		Cull.bSkipWPODisableDistance = true;
		Cull.bSkipPixelProgrammableDistance = true;

		Cull.Distance();
		Cull.GlobalClipPlane();
		Cull.ProgrammableRasterDistance(PrimitiveData);
		if (Cull.bFallbackRaster)
		{
			VisibleCluster.Flags |= NANITE_CULLING_FLAG_FALLBACK_RASTER;
		}

		BRANCH
		if( Cull.bIsVisible )
		{
			BRANCH
			if( CULLING_PASS != CULLING_PASS_OCCLUSION_POST || (VisibleCluster.Flags & NANITE_CULLING_FLAG_TEST_LOD) != 0 )
			{
				const bool bSmallEnoughToDraw = SmallEnoughToDraw(NaniteView, InstanceData, DynamicData, ClusterBounds, Cluster.LODError, Cluster.EdgeLength, bUseHWRaster);

#if MATERIAL_CACHE
				const uint TexCoordIndex = min(NANITE_MAX_UVS - 1, GetMaterialCacheUVCoordinateIndex(PrimitiveData));
				const FUVHeader UVHeader = GetUVHeader(ClusterPageData, Cluster.PageBaseAddress + Cluster.DecodeInfoOffset, TexCoordIndex);

				// Get the cluster domain range
				float4 ClusterCacheUVMinMax = float4(
					DecodeUVFloat(UVHeader.Min.x, UVHeader.NumMantissaBits),
					DecodeUVFloat(UVHeader.Min.y, UVHeader.NumMantissaBits),
					DecodeUVFloat(UVHeader.Min.x + (1u << UVHeader.NumBits.x) - 1, UVHeader.NumMantissaBits),
					DecodeUVFloat(UVHeader.Min.y + (1u << UVHeader.NumBits.y) - 1, UVHeader.NumMantissaBits)
				);

				// Cull clusters whose UV range does not overlap the material cache unwrap window.
				const float2 Min = NaniteView.MaterialCacheUnwrapMinAndInvSize.xy;
				const float2 Max = Min + rcp(NaniteView.MaterialCacheUnwrapMinAndInvSize.zw);
				const bool bIsInUVDomain = any(ClusterCacheUVMinMax.zw >= Min) && any(ClusterCacheUVMinMax.xy <= Max);
				Cull.bIsVisible = (bSmallEnoughToDraw || (Cluster.Flags & NANITE_CLUSTER_FLAG_STREAMING_LEAF)) && bIsInUVDomain;
#else // MATERIAL_CACHE

#if VIRTUAL_TEXTURE_TARGET
				// If there was a large delta between bSmallEnoughToDraw
				const bool bInvalidateFromSteamingLODDelta = (RenderFlags & NANITE_RENDER_FLAG_INVALIDATE_VSM_ON_LOD_DELTA) != 0 &&
					!bSmallEnoughToDraw &&
					(Cluster.Flags & NANITE_CLUSTER_FLAG_FULL_LEAF) == 0;

				bInvalidatePages = bInvalidatePages || bInvalidateFromSteamingLODDelta;
#endif

#if DEBUG_FLAGS
				if ((DebugFlags & NANITE_DEBUG_FLAG_DRAW_ONLY_ROOT_DATA) != 0u)
				{
					Cull.bIsVisible = (Cluster.Flags & NANITE_CLUSTER_FLAG_ROOT_GROUP) &&
						(bSmallEnoughToDraw || (Cluster.Flags & NANITE_CLUSTER_FLAG_ROOT_LEAF));
				}
				else
#endif
				{
					// Draw if the LOD error is small enough, or we can't refine further (streaming leaf).
					Cull.bIsVisible = bSmallEnoughToDraw || (Cluster.Flags & NANITE_CLUSTER_FLAG_STREAMING_LEAF);
				}
#endif // MATERIAL_CACHE
			}
			else
			{
				// LOD was already decided in the main pass; reuse its SW/HW decision.
				bUseHWRaster |= (VisibleCluster.Flags & NANITE_CULLING_FLAG_USE_HW) != 0;
			}
		}

#if VIRTUAL_TEXTURE_TARGET
		const bool bCacheAsStatic = (VisibleCluster.Flags & NANITE_CULLING_FLAG_CACHE_AS_STATIC) != 0u;
		// If we're rendering into the static cache, it's not safe to use the receiver mask as we may cache that (full) page
		Cull.bUseReceiverMask = Cull.bUseReceiverMask && !bCacheAsStatic;
		Cull.PageFlagMask = GetPageFlagMaskForRendering(bCacheAsStatic, InstanceData.InstanceId, NaniteView.SceneRendererPrimaryViewId);
		Cull.bIsStaticGeometry = bCacheAsStatic;
#endif

		Cull.FrustumHZB( true );

		// Clusters needing near-plane clipping must take the HW path.
		bUseHWRaster |= Cull.bNeedsClipping;

		if( CULLING_PASS != CULLING_PASS_OCCLUSION_MAIN )
			Cull.bIsVisible &= !Cull.bWasOccluded;

		if (Cull.bIsVisible)
		{
			if (!Cull.bWasOccluded)
			{
				const
				uint2 TotalPrevDrawClusters = (RenderFlags & NANITE_RENDER_FLAG_HAS_PREV_DRAW_DATA) ? InTotalPrevDrawClusters[0] : 0;

#if NANITE_DEPTH_BUCKETING
				// Bucket the cluster by the view depth of its box center.
				const float3 CenterTranslatedWorld = mul(float4(Cluster.BoxBoundsCenter, 1.0f), DynamicData.LocalToTranslatedWorld).xyz;
				VisibleCluster.DepthBucket = DepthToBucket(dot(NaniteView.ViewForward.xyz, CenterTranslatedWorld));
#endif

#if VIRTUAL_TEXTURE_TARGET
				uint4 RectPages = Cull.RectPages;

#if DEBUG_FLAGS
				uint PageRectArea = GetInclusiveRectArea(RectPages);
				if (PageRectArea >= LargePageRectThreshold)
				{
					WaveInterlockedAddScalar(OutStatsBuffer[0].NumLargePageRectClusters, 1);
				}
#endif
				FVirtualSMLevelOffset PageTableLevelOffset = CalcPageTableLevelOffset(NaniteView.TargetLayerIndex, NaniteView.TargetMipLevel);
				const uint MarkPageDirtyFlags = VirtualShadowMapGetMarkPageDirtyFlags(bCacheAsStatic, bInvalidatePages, bWPOAllowed);

				// Walk the cluster's page rect in WindowSize x WindowSize windows, emitting one cluster copy per
				// window that overlaps at least one mapped, renderable page.
				uint WindowSize = bUseHWRaster ? VSM_RASTER_WINDOW_PAGES : (NANITE_LATE_VSM_PAGE_TRANSLATION ? NANITE_VSM_PAGE_TABLE_CACHE_DIM : 1);
				for (uint WY = RectPages.y; WY <= RectPages.w; WY += WindowSize)
				{
					for (uint WX = RectPages.x; WX <= RectPages.z; WX += WindowSize)
					{
						uint2 WindowEnd = min(uint2(WX, WY) + WindowSize - 1u, RectPages.zw);
						bool bEmitForWindow = false;

						// Clip window rect to the mapped pages.
						// Starts inverted (min=WindowEnd, max=WindowStart) so the first mapped page initializes it.
						uint4 ClippedWindowRect = uint4(WindowEnd, uint2(WX, WY));
						for (uint Y = WY; Y <= WindowEnd.y; ++Y)
						{
							for (uint X = WX; X <= WindowEnd.x; ++X)
							{
								uint2 vPage = uint2(X, Y);
								FVSMPageOffset PageFlagOffset = CalcPageOffset(PageTableLevelOffset, NaniteView.TargetMipLevel, vPage);
								uint PageFlag = VirtualShadowMapGetPageFlag(PageFlagOffset);

								if ((PageFlag & Cull.PageFlagMask) != 0)
								{
									if (MarkPageDirtyFlags)
									{
										VirtualShadowMapMarkPageDirty(PageFlagOffset, MarkPageDirtyFlags);
									}

									FShadowPhysicalPage PhysicalPageEntry = ShadowGetPhysicalPage(PageFlagOffset);
									if (!PhysicalPageEntry.bThisLODValidForRendering)
									{
										// Skip this page
										continue;
									}

									ClippedWindowRect.xy = min(ClippedWindowRect.xy, vPage);
									ClippedWindowRect.zw = max(ClippedWindowRect.zw, vPage);
									bEmitForWindow = true;
								}
							}
						}

						if (bEmitForWindow)
						{
							// if bEmitForWindow is true we're guaranteed to have set this to a valid rect.
							VisibleCluster.vPage = ClippedWindowRect.xy;
							VisibleCluster.vPageEnd = ClippedWindowRect.zw;
							EmitVisibleCluster(bUseHWRaster, TotalPrevDrawClusters, HWClusterCounterIndex, VisibleCluster);
						}
					}
				}
#else
				EmitVisibleCluster(bUseHWRaster, TotalPrevDrawClusters, HWClusterCounterIndex, VisibleCluster);
#endif
			}
#if CULLING_PASS == CULLING_PASS_OCCLUSION_MAIN
			else
			{
				// Visible but HZB-occluded this frame: stash the cluster for re-testing in the post pass.
				uint ClusterIndex = 0;
				WaveInterlockedAddScalar_(QueueState[0].TotalClusters, 1, ClusterIndex);
				if (ClusterIndex < MaxCandidateClusters)
				{
					uint OccludedClusterOffset = 0;
					WaveInterlockedAddScalar_(QueueState[0].PassState[1].ClusterWriteOffset, 1, OccludedClusterOffset);
					VisibleCluster.Flags |= (bUseHWRaster ? NANITE_CULLING_FLAG_USE_HW : 0u);
					OPTIONAL_COHERENT(StoreVisibleCluster)(CandidateClusters, (MaxCandidateClusters - 1) - OccludedClusterOffset, VisibleCluster, false);

#if CULLING_TYPE == NANITE_CULLING_TYPE_PERSISTENT_NODES_AND_CLUSTERS
					// Make the cluster write visible to other groups before publishing it via the batch counter.
					DeviceMemoryBarrier();

					const uint BatchIndex = OccludedClusterOffset / NANITE_PERSISTENT_CLUSTER_CULLING_GROUP_SIZE;
					checkSlow(BatchIndex < GetMaxClusterBatches());
					ClusterBatches.InterlockedAdd(GetClusterBatchesOffset(true) + BatchIndex * 4, 1);
#endif
				}
			}
#endif
		}
	}
};

// Entry point: dispatches to the traversal variant selected by CULLING_TYPE.
[numthreads(NANITE_PERSISTENT_CLUSTER_CULLING_GROUP_SIZE, 1, 1)]
void NodeAndClusterCull(uint GroupID : SV_GroupID, uint GroupIndex : SV_GroupIndex)
{
#if CULLING_TYPE == NANITE_CULLING_TYPE_NODES
	NodeCull(GroupID, GroupIndex, QueueStateIndex);
#elif CULLING_TYPE == NANITE_CULLING_TYPE_CLUSTERS
	ClusterCull(GroupID, GroupIndex, QueueStateIndex);
#elif CULLING_TYPE == NANITE_CULLING_TYPE_PERSISTENT_NODES_AND_CLUSTERS
	PersistentNodeAndClusterCull(GroupIndex, QueueStateIndex);
#endif
}

#endif // NANITE_HIERARCHY_TRAVERSAL

// Make sure the indirect args we give to the rasterizer are not out of bounds and that the SW/HW ranges are not overlapping.
// Combined SW/HW rasterizer args produced by culling (element 0 = SW count; the HW count
// lives at GetHWClusterCounterIndex(RenderFlags) — the index varies with RenderFlags).
Buffer InRasterizerArgsSWHW;
RWBuffer OutSafeRasterizerArgsSWHW;		// Clamped indirect args handed to the rasterizer.
RWStructuredBuffer OutClusterCountSWHW;	// Element 0: final (trimmed) SW/HW visible-cluster counts.
RWBuffer OutClusterClassifyArgs;		// Indirect dispatch args for the cluster-classify pass.

// Clamps the SW/HW visible-cluster ranges so their combined total never exceeds
// MaxVisibleClusters, then writes the sanitized indirect args, the final SW/HW counts,
// and the cluster-classify dispatch args. Single-threaded fix-up dispatch.
[numthreads(1, 1, 1)]
void CalculateSafeRasterizerArgs()
{
	int ClusterOffsetSW = 0;
	int ClusterOffsetHW = 0;

	BRANCH
	if ((RenderFlags & NANITE_RENDER_FLAG_HAS_PREV_DRAW_DATA) != 0u)
	{
		// Start after the clusters already emitted by a previous draw batch.
		const uint2 TotalPrevDrawClusters = InTotalPrevDrawClusters[0];
		ClusterOffsetSW = TotalPrevDrawClusters.x;
		ClusterOffsetHW = TotalPrevDrawClusters.y;
	}

	const uint HWClusterCounterIndex = GetHWClusterCounterIndex(RenderFlags);

#if IS_POST_PASS
	// Post pass appends after the counts in OffsetClustersArgsSWHW
	// (presumably the main pass's emitted counts — confirm against the C++ binding).
	ClusterOffsetSW += OffsetClustersArgsSWHW[0];
	ClusterOffsetHW += OffsetClustersArgsSWHW[HWClusterCounterIndex];
#endif

	int NumClustersSW = InRasterizerArgsSWHW[0];
	int NumClustersHW = InRasterizerArgsSWHW[HWClusterCounterIndex];

	const int TotalClustersSW = ClusterOffsetSW + NumClustersSW;
	const int TotalClustersHW = ClusterOffsetHW + NumClustersHW;

	if (TotalClustersSW + TotalClustersHW > (int)MaxVisibleClusters)
	{
		// Total number of visible clusters don't fit.
		// Trim away the overlapping range from the SW/HW ranges.
		// Each range is clamped to whatever space remains after the *other* range's total,
		// so the two trimmed ranges can no longer overlap.
		// TODO: Write status back to CPU so we can warn the user when this happens and r.NaniteRaster.MaxVisibleClusters needs to be adjusted higher.
		const int MaxClustersSW = max((int)MaxVisibleClusters - ClusterOffsetSW - TotalClustersHW, 0);
		const int MaxClustersHW = max((int)MaxVisibleClusters - ClusterOffsetHW - TotalClustersSW, 0);

		NumClustersSW = min(NumClustersSW, MaxClustersSW);
		NumClustersHW = min(NumClustersHW, MaxClustersHW);
	}

	const uint ArgsOffset = 0u;
	WriteDispatchArgsSWHW(OutSafeRasterizerArgsSWHW, ArgsOffset, NumClustersSW, NumClustersHW);
	OutClusterCountSWHW[0] = uint2(NumClustersSW, NumClustersHW);
	// Classify pass runs one thread per visible cluster (SW + HW) in groups of 64.
	OutClusterClassifyArgs[0] = ((NumClustersSW + NumClustersHW) + 63u) / 64u;
	OutClusterClassifyArgs[1] = 1;
	OutClusterClassifyArgs[2] = 1;
}

RWBuffer< uint > OutOccludedInstancesArgs;		// Indirect args (4 dwords) for the occluded-instance pass.
RWStructuredBuffer OutQueueState;				// Persistent traversal queue state; PassState[0]=main, PassState[1]=post.
RWStructuredBuffer< uint2 > InOutTotalPrevDrawClusters;	// Running SW/HW cluster totals across draw passes.
RWBuffer< uint > InOutMainPassRasterizeArgsSWHW;
RWBuffer< uint > InOutPostPassRasterizeArgsSWHW;

// Resets the traversal queue state and the main/post rasterizer args for a new frame,
// first accumulating last use's drawn-cluster counts into InOutTotalPrevDrawClusters
// (set on pass 1, added on pass 2, per DRAW_PASS_INDEX). Single-threaded dispatch.
[numthreads(1, 1, 1)]
void InitArgs()
{
	const uint HWClusterCounterIndex = GetHWClusterCounterIndex(RenderFlags);
	uint2 DrawnClusterCounts = 0;

	OutQueueState[0].TotalClusters = 0;
	OutQueueState[0].AssemblyTransformsWriteOffset = 0;
	for (uint i = 0; i < 2; i++)
	{
		// Clear both pass states (main and post).
		OutQueueState[0].PassState[i].ClusterBatchReadOffset = 0;
		OutQueueState[0].PassState[i].ClusterWriteOffset = 0;
		OutQueueState[0].PassState[i].NodeReadOffset = 0;
		OutQueueState[0].PassState[i].NodeWriteOffset = 0;
		OutQueueState[0].PassState[i].NodeCount = 0;
	}

	// Capture the previous counts before zeroing the args below.
	DrawnClusterCounts += uint2(InOutMainPassRasterizeArgsSWHW[0], InOutMainPassRasterizeArgsSWHW[HWClusterCounterIndex]);

	const uint ArgsOffset = 0u;
	WriteRasterizerArgsSWHW(InOutMainPassRasterizeArgsSWHW, ArgsOffset, 0, 0);

#if OCCLUSION_CULLING
	// X=0 instances, Y=Z=1 groups, W=0: written as a zeroed dispatch-args quad
	// that the occluded-instance pass increments.
	OutOccludedInstancesArgs[0] = 0;
	OutOccludedInstancesArgs[1] = 1;
	OutOccludedInstancesArgs[2] = 1;
	OutOccludedInstancesArgs[3] = 0;

	DrawnClusterCounts += uint2(InOutPostPassRasterizeArgsSWHW[0], InOutPostPassRasterizeArgsSWHW[HWClusterCounterIndex]);
	WriteRasterizerArgsSWHW(InOutPostPassRasterizeArgsSWHW, ArgsOffset, 0, 0);
#endif

#if DRAW_PASS_INDEX == 1
	InOutTotalPrevDrawClusters[0] = DrawnClusterCounts;
#elif DRAW_PASS_INDEX == 2
	InOutTotalPrevDrawClusters[0] += DrawnClusterCounts;
#endif
}

uint InitIsPostPass;					// 0 = main pass state, 1 = post pass state.
RWBuffer< uint > OutClusterCullArgs;	// Indirect dispatch args for the cluster-cull pass.

// Builds the cluster-cull dispatch args from the number of candidate clusters written
// by the selected pass, clamped to MaxCandidateClusters. Single-threaded dispatch.
[numthreads(1, 1, 1)]
void InitClusterCullArgs()
{
	const uint NumCandidateClusters = min(OutQueueState[0].PassState[InitIsPostPass].ClusterWriteOffset, MaxCandidateClusters);
	// One group per NANITE_PERSISTENT_CLUSTER_CULLING_GROUP_SIZE candidates (ceil div).
	OutClusterCullArgs[0] = (NumCandidateClusters + NANITE_PERSISTENT_CLUSTER_CULLING_GROUP_SIZE - 1) / NANITE_PERSISTENT_CLUSTER_CULLING_GROUP_SIZE;
	OutClusterCullArgs[1] = 1;
	OutClusterCullArgs[2] = 1;
}

RWBuffer< uint > OutNodeCullArgs0;	// Per-level node-cull args, written by group 0.
RWBuffer< uint > OutNodeCullArgs1;	// Per-level node-cull args, zero-initialized by the other group(s).

// Initializes per-hierarchy-level node-cull indirect args (NANITE_NODE_CULLING_ARG_COUNT
// dwords per level, one level per thread). Group 0 seeds level 0 of OutNodeCullArgs0 from
// the queued node count; all other levels, and all of OutNodeCullArgs1, start zeroed.
[numthreads(NANITE_MAX_CLUSTER_HIERARCHY_DEPTH + 1, 1, 1)]
void InitNodeCullArgs(uint GroupID : SV_GroupID, uint GroupIndex : SV_GroupIndex)
{
	const uint Offset = GroupIndex * NANITE_NODE_CULLING_ARG_COUNT;
	if (GroupID == 0)
	{
		uint NumNodes = 0;
		uint NumGroups = 0;
		if (GroupIndex == 0)
		{
			// Only level 0 starts with work: the nodes queued for the selected pass.
			const uint NodeWriteOffset = OutQueueState[0].PassState[InitIsPostPass].NodeWriteOffset;
			NumNodes = min(NodeWriteOffset, MaxNodes);
			NumGroups = (NumNodes + NANITE_MAX_BVH_NODES_PER_GROUP - 1) / NANITE_MAX_BVH_NODES_PER_GROUP;
		}
		OutNodeCullArgs0[Offset + 0] = NumGroups;	// ThreadGroupCountX
		OutNodeCullArgs0[Offset + 1] = 1;			// ThreadGroupCountY
		OutNodeCullArgs0[Offset + 2] = 1;			// ThreadGroupCountZ
		OutNodeCullArgs0[Offset + 3] = NumNodes;	// NumNodes
		OutNodeCullArgs0[Offset + 4] = 0;			// LevelStartIndex
	}
	else
	{
		OutNodeCullArgs1[Offset + 0] = 0;	// ThreadGroupCountX
		OutNodeCullArgs1[Offset + 1] = 1;	// ThreadGroupCountY
		OutNodeCullArgs1[Offset + 2] = 1;	// ThreadGroupCountZ
		OutNodeCullArgs1[Offset + 3] = 0;	// NumNodes
		OutNodeCullArgs1[Offset + 4] = 0;	// LevelStartIndex
	}
}

Buffer InMainRasterizerArgsSWHW;
Buffer InPostRasterizerArgsSWHW;
uint StatusMessageId;	// GPU->CPU message channel id for the stats feedback below.

// Reports peak node, candidate-cluster, and visible-cluster (SW+HW) usage across the
// main and post passes back to the CPU as a 3-item GPU message, so buffer sizing
// (e.g. r.NaniteRaster.MaxVisibleClusters) can be monitored. Single-threaded dispatch.
[numthreads(1, 1, 1)]
void FeedbackStatus()
{
	const uint HWClusterCounterIndex = GetHWClusterCounterIndex(RenderFlags);
	const uint PeakNodes = max(OutQueueState[0].PassState[0].NodeWriteOffset, OutQueueState[0].PassState[1].NodeWriteOffset);
	const uint PeakCandidateClusters = max(OutQueueState[0].PassState[0].ClusterWriteOffset, OutQueueState[0].PassState[1].ClusterWriteOffset);
	const uint PeakVisibleClusters = max(InMainRasterizerArgsSWHW[0] + InMainRasterizerArgsSWHW[HWClusterCounterIndex], InPostRasterizerArgsSWHW[0] + InPostRasterizerArgsSWHW[HWClusterCounterIndex]);

	FGPUMessageWriter Mw = GPUMessageBegin(StatusMessageId, 3U);
	GPUMessageWriteItem(Mw, PeakNodes);
	GPUMessageWriteItem(Mw, PeakCandidateClusters);
	GPUMessageWriteItem(Mw, PeakVisibleClusters);
}