// Copyright Epic Games, Inc. All Rights Reserved.

/*=============================================================================
NiagaraSimulationShader.usf: 
=============================================================================*/
#pragma warning(disable:4008)
#include "/Engine/Public/Platform.ush"

#include "NiagaraShaderVersion.ush"

#ifndef SMALL_NUMBER
	#define SMALL_NUMBER 1e-8
#endif
#ifndef MAX_DISTANCE
	#define MAX_DISTANCE 1e+38
#endif

#if GPU_SIMULATION
	#include "/Engine/Private/Common.ush"
	#include "/Engine/Private/DistanceField/GlobalDistanceFieldShared.ush"

	#define NEEDS_SCENE_TEXTURES 1
	#define SCENE_TEXTURES_DISABLED 0
	#include "/Engine/Generated/UniformBuffers/SubstratePublic.ush"
#else
	const static float PI = 3.1415926535897932f;
#endif
	const static float TWO_PI = 3.1415926535897932f*2.0f;

#include "/Engine/Private/Definitions.usf"

// Most of the vector implementations work this way. This helps us keep proper precision.
float4 ModuloPrecise(float4 x, float4 y){ return x - y * trunc(x/y); }
float3 ModuloPrecise(float3 x, float3 y){ return x - y * trunc(x/y); }
float2 ModuloPrecise(float2 x, float2 y){ return x - y * trunc(x/y);}
float ModuloPrecise(float x, float y){ return x - y * trunc(x/y); }
int ModuloPrecise(int x, int y){ return x - y * (x/y); }
int Modulo(int x, int y){  return x - y * (x/y);  }

	
// using rcp is only 12 bits of precision, we should usually pay for perf
float4 Reciprocal(float4 x){ return 1.0f/x;}
float3 Reciprocal(float3 x){ return 1.0f/x; }
float2 Reciprocal(float2 x){ return 1.0f/x;}
float Reciprocal(float x){ return 1.0f/x; }

// Annoyingly, all(bool) and any(bool) don't exist, so we'll make Niagara versions which
// work with both scalars and vectors.
bool NiagaraAll(bool b) { return b; }
bool NiagaraAny(bool b) { return b; }
#if GPU_SIMULATION
bool NiagaraAll(bool2 b) { return all(b); }
bool NiagaraAll(bool3 b) { return all(b); }
bool NiagaraAll(bool4 b) { return all(b); }
bool NiagaraAny(bool2 b) { return any(b); }
bool NiagaraAny(bool3 b) { return any(b); }
bool NiagaraAny(bool4 b) { return any(b); }
#else
// No all() and any() opcodes in our VM, emulate them.
bool NiagaraAll(bool2 b) { return b.x && b.y; }
bool NiagaraAll(bool3 b) { return b.x && b.y && b.z; }
bool NiagaraAll(bool4 b) { return b.x && b.y && b.z && b.w; }
bool NiagaraAny(bool2 b) { return b.x || b.y; }
bool NiagaraAny(bool3 b) { return b.x || b.y || b.z; }
bool NiagaraAny(bool4 b) { return b.x || b.y || b.z || b.w; }
#endif

#define GetEngineOwnerLWCTile() Engine_Owner_LWCTile.xyz

/* -----------------------------------------------------------------
 * GPU simulation utility functions
 * -----------------------------------------------------------------
 */
#if GPU_SIMULATION
	#ifndef NIAGARA_PARTICLE_PARTIAL_ENABLED
		#define NIAGARA_PARTICLE_PARTIAL_ENABLED 0
	#endif

	// Type of dispatch mode we are in this allows us to customize some HLSL code to avoid / % operations
	#define NIAGARA_DISPATCH_TYPE_ONE_D		0
	#define NIAGARA_DISPATCH_TYPE_TWO_D		1
	#define NIAGARA_DISPATCH_TYPE_THREE_D	2
	#define NIAGARA_DISPATCH_TYPE_CUSTOM	3
	#ifndef NIAGARA_DISPATCH_TYPE
		#define NIAGARA_DISPATCH_TYPE NIAGARA_DISPATCH_TYPE_ONE_D
	#endif

	#if NIAGARA_DISPATCH_INDIRECT
		static uint3	DispatchThreadIdBounds;		// Used to track thread bounds
		Buffer<uint4>	IndirectDispatchArgs;		// Indirect count buffer we lookup into to get actual thread count
		uint			IndirectDispatchArgsOffset;	// Offset into the indirect count buffer
	#else
		uint3 DispatchThreadIdBounds;			// Used to track thread bounds
		uint3 DispatchThreadIdToLinear;			// Used to convert from DispatchThreadId to a linear thread id
	#endif

	static uint3 GDispatchThreadId;		// SV_DispatchThreadId
	static uint3 GGroupId;				// SV_GroupdId
	static uint3 GGroupThreadId;		// SV_GroupThreadId
	static uint  GGroupIndex;			// SV_GroupIndex
	static uint GLinearThreadId;
	static uint GEmitterTickCounter;
	static uint GRandomSeedOffset = 0;

	// To be removed as well but don't know who is using it
	#include "/Engine/Private/SceneTexturesCommon.ush"

	// Physics common 
	//-TOFIX: This should not be included here
	#include "NiagaraPhysicsCommon.ush"

	uint EmitterTickCounter;

	float4 Modulo(float4 x, float4 y){ return fmod(x,y); }
	float3 Modulo(float3 x, float3 y){ return fmod(x,y); }
	float2 Modulo(float2 x, float2 y){ return fmod(x,y); }
	float Modulo(float x, float y){ return fmod(x,y); }

	// utility function used for scene depth calculations
	FLWCVector3 WorldPositionFromSceneDepth(float2 ScreenPosition, float SceneDepth)
	{
		FLWCVector4 HomogeneousWorldPosition = LWCMultiply(float4(ScreenPosition * SceneDepth, SceneDepth, 1), DFFastToTileOffset(PrimaryView.ScreenToWorld)); //DF_TODO
		FLWCVector3 WorldPosition = MakeLWCVector3(LWCGetTile(HomogeneousWorldPosition).xyz, HomogeneousWorldPosition.Offset.xyz);
		FLWCScalar Scale = MakeLWCScalar(LWCGetTile(HomogeneousWorldPosition).w, HomogeneousWorldPosition.Offset.w);
		return LWCDivide(WorldPosition, LWCToFloat(Scale));
	}

	// MASSIVE HACK - Tracked in JIRA UE-69298
	// Hardcoded random function accessible from inner part of node implementation.
	// It works for now at least and avoids exposing every random needed in the UI. 
	// Temporary solution, it will be replaced when a design is validated.
	float NiagaraInternalNoise(uint u, uint v, uint s)
	{
		uint Seed = (u * 1664525u + v) + s + GRandomSeedOffset;
		GRandomSeedOffset += Seed;
		return float(Rand3DPCG32(int3(u,v,Seed)).x) / 4294967296.0f;
	}

	// NIAGARA_MAX_GPU_SPAWN_INFOS is set from the shader compiler
	#define NIAGARA_MAX_GPU_SPAWN_INFOS_V4	((NIAGARA_MAX_GPU_SPAWN_INFOS + 3) / 4)

	int4	EmitterSpawnInfoOffsets[NIAGARA_MAX_GPU_SPAWN_INFOS_V4];
	float4	EmitterSpawnInfoParams[NIAGARA_MAX_GPU_SPAWN_INFOS];			// Packed data where x = IntervalDt, y = InterpStartDt, z = Group, w = Start Particle Index

	static int GInterpSpawnIndex;
	static float Emitter_SpawnInterval;
	static float Emitter_InterpSpawnStartDt;
	static int Emitter_SpawnGroup;

	static int Engine_ExecutionCount;
	static int GGPUExecIndex;

	static uint GSpawnStartInstance;
	uint NumSpawnedInstances;

	void SetupExecIndexForGPU()
	{
		GGPUExecIndex = GLinearThreadId;
		Engine_ExecutionCount = GSpawnStartInstance;
	}

	void SetupExecIndexAndSpawnInfoForGPU()
	{
		GGPUExecIndex = GLinearThreadId - GSpawnStartInstance;

		int SpawnInfoIndex = 0;
		UNROLL
		for (int i = 0; i < NIAGARA_MAX_GPU_SPAWN_INFOS_V4; ++i)
		{
			// This returns 0xffffffff for each component when the comparison is true, so we'll do a
			// bitwise and with 1 to get increment amounts for each spawn info.
			int4 CompareResults = GGPUExecIndex >= EmitterSpawnInfoOffsets[i];
			CompareResults = CompareResults & int4(1, 1, 1, 1);
			SpawnInfoIndex += CompareResults.x + CompareResults.y + CompareResults.z + CompareResults.w;
		}

		Emitter_SpawnInterval		= EmitterSpawnInfoParams[SpawnInfoIndex].x;
		Emitter_InterpSpawnStartDt	= EmitterSpawnInfoParams[SpawnInfoIndex].y;
		Emitter_SpawnGroup			= asint(EmitterSpawnInfoParams[SpawnInfoIndex].z);

		int GroupSpawnStartIndex    = asint(EmitterSpawnInfoParams[SpawnInfoIndex].w);
		GGPUExecIndex		        = GGPUExecIndex - GroupSpawnStartIndex;
		if ( SpawnInfoIndex == (NIAGARA_MAX_GPU_SPAWN_INFOS - 1) )
		{
			Engine_ExecutionCount   = int(NumSpawnedInstances) - GroupSpawnStartIndex;
		}
		else
		{
			int NextGroupSpawnStartIndex = asint(EmitterSpawnInfoParams[SpawnInfoIndex + 1].w);
			Engine_ExecutionCount       = NextGroupSpawnStartIndex - GroupSpawnStartIndex;
		}
	}

	/* Returns the current instance index relative to the operation (spawn/update)
	 */
	int ExecIndex()
	{
		return GGPUExecIndex;
	}

	float4 NiagaraGPU_QuatMul(float4 Q1, float4 Q2)
	{
		float4 QOut;
		QOut.x = Q1.w*Q2.x + Q1.x*Q2.w + Q1.y*Q2.z - Q1.z*Q2.y;
		QOut.y = Q1.w*Q2.y - Q1.x*Q2.z + Q1.y*Q2.w + Q1.z*Q2.x;
		QOut.z = Q1.w*Q2.z + Q1.x*Q2.y - Q1.y*Q2.x + Q1.z*Q2.w;
		QOut.w = Q1.w*Q2.w - Q1.x*Q2.x - Q1.y*Q2.y - Q1.z*Q2.z;
		return QOut;
	}
#endif

/* ----------------------------------------------------------------------------
 * Seeded/Deterministic random number generation functions
 *
 * This is a variant of NiagaraRand4DPCG32 from Random.ush. 
 *
 * uint is not fully supported in the VM so we simply use ints and drop the 
 * top and bottom bit swap. This should be fine since signed overflow should 
 * produce the same results as unsigned overflow when comparing bit-by-bit on 
 * all relevant architectures.
 *
 * Warning: Only contains 24 bits of randomness, since we produce values in 
 *          the unit interval. Uses the upper 24 bits, as they have the best
 *          quality.
 *
 *          By removing the bit swaps in NiagaraRand4DPCG32 we save a few 
 *          operations, but lose a bit of statistical (but not visual) quality, 
 *          and for our use case this is an acceptable compromise.
 * ----------------------------------------------------------------------------
 */

// Returns 4 random normalized floats based on 4 explicit integer seeds
float4 rand4(int Seed1, int Seed2, int Seed3, int Seed4)
{
	int4 v = int4(Seed4, Seed1, Seed2, Seed3) * 1664525 + 1013904223;

	v.x += v.y*v.w;
	v.y += v.z*v.x;
	v.z += v.x*v.y;
	v.w += v.y*v.z;
	v.x += v.y*v.w;
	v.y += v.z*v.x;
	v.z += v.x*v.y;
	v.w += v.y*v.z;

	// We can use 24 bits of randomness, as all integers in [0, 2^24] 
	// are exactly representable in single precision floats.
	// We use the upper 24 bits as they tend to be higher quality.

	// The divide is often folded with the range scale in the rand functions
	return float4((v >> 8) & 0x00ffffff) / 16777216.0; // 0x01000000 == 16777216
	// return float4((v >> 8) & 0x00ffffff) * (1.0/16777216.0); // bugged, see UE-67738
}

// float3 specialization of the above:
// 
// Returns 3 random normalized floats based on 4 explicit integer seeds.
// 
// All bits of the first and second seeds are used, while only 
// the lower 16 bits of the third and fourth seeds are used.
float3 rand3(int Seed1, int Seed2, int Seed3, int Seed4)
{
	int3 v = int3(Seed1, Seed2, Seed4 | (Seed3 << 16)) * 1664525 + 1013904223;

	v.x += v.y*v.z;
	v.y += v.z*v.x;
	v.z += v.x*v.y;
	v.x += v.y*v.z;
	v.y += v.z*v.x;
	v.z += v.x*v.y;

	return float3((v >> 8) & 0x00ffffff) / 16777216.0; // 0x01000000 == 16777216
}

// Internal counter used to generate a different sequence of random numbers for each call
static int RandomCounterDeterministic = 0;

// Cost using rand4: 6 imad, 1 itof, 1 ishr, 1 add, 2 mul
float rand(float x, int Seed1, int Seed2, int Seed3)
{
	RandomCounterDeterministic += 1;
	return rand3(Seed1, Seed2, Seed3, RandomCounterDeterministic).x * x;
}

// Cost using rand4: 7 imad, 1 itof, 1 ishr, 1 add, 2 mul
float2 rand(float2 x, int Seed1, int Seed2, int Seed3)
{
	RandomCounterDeterministic += 1;
	return rand3(Seed1, Seed2, Seed3, RandomCounterDeterministic).xy * x;
}

// Cost using rand4: 8 imad, 1 itof, 1 ishr, 1 add, 2 mul
float3 rand(float3 x, int Seed1, int Seed2, int Seed3)
{
	RandomCounterDeterministic += 1;
	return rand3(Seed1, Seed2, Seed3, RandomCounterDeterministic).xyz * x;
}

// Cost using rand4: 9 imad, 1 itof, 1 ishr, 1 and, 2 mul
float4 rand(float4 x, int Seed1, int Seed2, int Seed3) 
{
	RandomCounterDeterministic += 1;
	return rand4(Seed1, Seed2, Seed3, RandomCounterDeterministic).xyzw * x;
}

// Cost using rand4: 6 imad, 2 itof, 1 ishr, 1 add, 2 mul, 1 ftoi
int rand(int x, int Seed1, int Seed2, int Seed3)
{
	// Scaling a uniform float range provides better distribution of numbers than using %.
	// Inclusive! So [0, x] instead of [0, x)
	RandomCounterDeterministic += 1;
	return int(rand3(Seed1, Seed2, Seed3, RandomCounterDeterministic).x * (x+1));
}

/* -----------------------------------------------------------------
 * Un-seeded/Non-deterministic random number generation functions
 * -----------------------------------------------------------------
 */
#if GPU_SIMULATION
	// This simply calls the deterministic random number functions from the Seeded RNG section, 
	// but uses non-deterministic seeds as input. 

	// This could perhaps be optimized by using slightly cheaper functions, but the difference is likely negligible. 
	
	// Internal counter used to generate a different sequence of random numbers for each call
	// We need to keep this separate from the Deterministic version so that non-deterministic 
	// calls do not interfere with the deterministic ones. 
	static int RandomCounterNonDeterministic = -1;

	float rand(float x)
	{
		RandomCounterNonDeterministic -= 1;
		return rand4(GLinearThreadId, EmitterTickCounter, GLinearThreadId, RandomCounterNonDeterministic).x * x;
	}

	float2 rand(float2 x)
	{
		RandomCounterNonDeterministic -= 1;
		return rand4(GLinearThreadId, EmitterTickCounter, GLinearThreadId, RandomCounterNonDeterministic).xy * x;
	}

	float3 rand(float3 x)
	{
		RandomCounterNonDeterministic -= 1;
		return rand4(GLinearThreadId, EmitterTickCounter, GLinearThreadId, RandomCounterNonDeterministic).xyz * x;
	}

	float4 rand(float4 x) 
	{
		RandomCounterNonDeterministic -= 1;
		return rand4(GLinearThreadId, EmitterTickCounter, GLinearThreadId, RandomCounterNonDeterministic).xyzw * x;
	}

	// Integer randoms are INCLUSIVE, i.e. includes both the upper and lower limits
	int rand(int x)
	{
		RandomCounterNonDeterministic -= 1;
		return int(rand4(GLinearThreadId, EmitterTickCounter, GLinearThreadId, RandomCounterNonDeterministic).x * (x+1));
	}
#else
	// Old unseeded, passthrough to FRandomStream

	float rand(float x); // Invokes EVectorVMOp::random

	float2 rand(float2 x)
	{
		return float2(rand(x.x), rand(x.y));
	}

	float3 rand(float3 x)
	{
		return float3(rand(x.x), rand(x.y), rand(x.z));
	}

	float4 rand(float4 x) 
	{
		return float4(rand(x.x), rand(x.y), rand(x.z), rand(x.w));
	}
	
	int rand(int x); // Invokes EVectorVMOp::randomi and is semi-open. This is inconsistent with the rest of the functions above. As a result this function and the ones above should be deprecated in favor of the functions below
#endif

// Small changes in the input bits should propagate to a lot of output bits, so the resulting hash is not periodic.
// This is important because the hash inputs are often things like particle ID, but the output should be pseudo-random. 
int hash_single(int a)
{
    int x = (a ^ 61) ^ (a >> 16);
	x += x << 3;
	x ^= x >> 4;
	x *= 0x27d4eb2d;
	x ^= x >> 15;
	return x;
}

int hash(int a, int b)
{
	return hash_single(a) ^ hash_single(b * 31);
}

float hash_float(int a, int b)
{
	return (hash(a, b) & 0x00ffffff) / 16777216.0;
}

// this is used when chaining calls from variable number of inputs, e.g. hash_float(hash_float(a, b), c)
float hash_float(float a, int b)
{
	return (hash(a * 16777216.0, b) & 0x00ffffff) / 16777216.0;
}

// Explicit non-deterministic random overrides used by Random Float/Integer and Seeded Random Float/Integer op nodes
float rand_float(float x)
{
	return rand(x.x);
}

float2 rand_float(float2 x)
{
	return float2(rand_float(x.x), rand_float(x.y));
}

float3 rand_float(float3 x)
{
	return float3(rand_float(x.x), rand_float(x.y), rand_float(x.z));
}

float4 rand_float(float4 x) 
{
	return float4(rand_float(x.x), rand_float(x.y), rand_float(x.z), rand_float(x.w));
}

int rand_int(int x)
{
	// Going through the float function also give us a better distribution than using modulo 
	// to get an integer range.
	// This will not include the upper range as rand_float returns [0, max), not [0, max].
	return (int) rand_float(x.x);
}

// Explicit deterministic random overrides used by Random Float/Integer and Seeded Random Float/Integer op nodes
float rand_float(float x, int Seed1, int Seed2, int Seed3)
{
	return rand(x.x, Seed1, Seed2, Seed3);
}

float2 rand_float(float2 x, int Seed1, int Seed2, int Seed3)
{
	return rand(x, Seed1, Seed2, Seed3);
}

float3 rand_float(float3 x, int Seed1, int Seed2, int Seed3)
{
	return rand(x, Seed1, Seed2, Seed3);
}

float4 rand_float(float4 x, int Seed1, int Seed2, int Seed3) 
{
	return rand(x, Seed1, Seed2, Seed3);
}

int rand_int(int x, int Seed1, int Seed2, int Seed3)
{
	// This will not include the upper range as rand_float returns [0, max), not [0, max]
	// The deterministic rand call will include the upper range, so we subtract a one to compensate
	return rand(x.x-1, Seed1, Seed2, Seed3);
}

// used to interpolate rotations in interpolated spawn scripts
float4 NiagaraQuatSLerp(float4 Quat1, float4 Quat2, float Slerp)
{
	const float RawCosom = dot(Quat1, Quat2);
	const float Cosom = abs(RawCosom);
	
	float Scale0, Scale1;
	if (Cosom < 0.9999f)
	{
		const float Omega = acos(Cosom);
		const float InvSin = 1.f / sin(Omega);
		Scale0 = sin((1.f - Slerp) * Omega) * InvSin;
		Scale1 = sin(Slerp * Omega) * InvSin;
	}
	else
	{
		Scale0 = 1.0f - Slerp;
		Scale1 = Slerp;
	}

	Scale1 = RawCosom >= 0.0f ? Scale1 : -Scale1;

	return (Scale0 * Quat1) + (Scale1 * Quat2);
}

/* -----------------------------------------------------------------
 * VM simulation function declarations
 * -----------------------------------------------------------------
 */
#if VM_SIMULATION
	float noise(float x);
	float noise(float2 x);
	float noise(float3 x);

	//Total hack to get around the cross compiler converting fmod() to "X - (Y * trunc(X/Y))";
	//On gpu just define these as fmod(x,y)
	float4 Modulo(float4 x, float4 y);
	float3 Modulo(float3 x, float3 y);
	float2 Modulo(float2 x, float2 y);
	float Modulo(float x, float y);

	/** Returns the index for this particle in the current execution context. On gpu this'll likely be derived from DispatchThreadId */
	int ExecIndex();

	//Some functions that we use to map to special VM operations for reading in data.
	//TODO: replace with proper buffer reading capability and use standard hlsl.
	int AcquireIndex(int DataSetID, bool DoAcquire);

	void AcquireID(int DataSetID, out int IDIndex, out int IDAcquireTag);
	void UpdateID(int DataSetID, int IDIndex, int InstanceIndex);

	float InputDataFloat(int DataSetIndex, int RegisterIdx);  //DataSetIndex is 0 for main dataset
	int InputDataInt(int DataSetIndex, int RegisterIdx);
	bool InputDataBool(int DataSetIndex, int RegisterIdx);
	float InputDataHalf(int DataSetIndex, int RegisterIdx);

	float InputDataNoadvanceFloat(int DataSetIndex, int RegisterIdx);  //DataSetIndex is 0 for main dataset
	int InputDataNoadvanceInt(int DataSetIndex, int RegisterIdx);
	bool InputDataNoadvanceBool(int DataSetIndex, int RegisterIdx);

	void OutputDataFloat(int DataSetIndex, int RegisterIndex, int InstanceIndex, float Value);
	void OutputDataInt(int DataSetIndex, int RegisterIndex, int InstanceIndex, int Value);
	void OutputDataBool(int DataSetIndex, int RegisterIndex, int InstanceIndex, bool Value);
	void OutputDataHalf(int DataSetIndex, int RegisterIndex, int InstanceIndex, float Value);
#endif


/* -----------------------------------------------------------------
 * GPU simulation code
 * -----------------------------------------------------------------
 */
#if GPU_SIMULATION
	uint ComponentBufferSizeRead;
	uint ComponentBufferSizeWrite;
	uint SimStart;

	/* Buffers for particle data and DrawIndirect calls
	 */
	#if NIAGARA_PARTICLE_PARTIAL_ENABLED == 0
		Buffer<float> InputFloat;
		Buffer<int> InputInt;
		Buffer<half> InputHalf;
	#endif
	RWBuffer<int> RWOutputInt;
	RWBuffer<float> RWOutputFloat;
	RWBuffer<half> RWOutputHalf;

	Buffer<float> StaticInputFloat;

	RWBuffer<uint> RWInstanceCounts;
	uint ReadInstanceCountOffset;
	uint WriteInstanceCountOffset;

	Buffer<int> FreeIDList;
	RWBuffer<int> RWIDToIndexTable;
	
	// X = Count Buffer Instance Count Offset (INDEX_NONE == Use Instance Count)
	// Y = Instance Count
	// Z = Iteration Num | Index
	// W = Loop Num | Index
	uint4 SimulationStageIterationInfo;

	// Note: These are referenced from an asset that passes back the data to the user (see SimulationStageIterationInfo)
	#if NIAGARA_DISPATCH_INDIRECT
		int SimulationStage_GetInstanceCount() { return DispatchThreadIdBounds.x * DispatchThreadIdBounds.y * DispatchThreadIdBounds.z; }
	#else
		int SimulationStage_GetInstanceCount() { return SimulationStageIterationInfo.x == uint(-1) ? int(SimulationStageIterationInfo.y) : int(RWInstanceCounts[SimulationStageIterationInfo.x]); }
	#endif
	int SimulationStage_GetNumIterations()  { return int((SimulationStageIterationInfo.z >> 16) & 0xffff); }
	int SimulationStage_GetIterationIndex() { return int((SimulationStageIterationInfo.z >> 0 ) & 0xffff); }
	int SimulationStage_GetNumLoops()  { return int((SimulationStageIterationInfo.w >> 16) & 0xffff); }
	int SimulationStage_GetLoopIndex() { return int((SimulationStageIterationInfo.w >> 0 ) & 0xffff); }

	float SimulationStage_GetNormalizedIterationIndex() { return float(SimulationStage_GetIterationIndex()) / float(max(SimulationStage_GetNumIterations(), 1) - 1); }
	float SimulationStage_GetNormalizedLoopIndex() { return float(SimulationStage_GetLoopIndex()) / float(max(SimulationStage_GetNumLoops(), 1) - 1); }

	// Where X = Parameter Binding, YZ = Inclusive Range
	uint3 ParticleIterationStateInfo;

	void AcquireID(int DataSetID, out int IDIndex, out int IDAcquireTag)
	{
		// Begin static assert : GPU particles only support DataSetID 0
		int MustBe0[1];
		MustBe0[DataSetID] = 0;
		// End static assert

		// This is the same as ExecIndex() right now, but that function may change in the future to accommodate multiple
		// spawn infos. Revisit this computation if the change affects the meaning of GSpawnStartInstance.
		int SpawnIndex = GLinearThreadId - GSpawnStartInstance;
		IDIndex = FreeIDList[SpawnIndex];
		IDAcquireTag = EmitterTickCounter;
	}

 	void UpdateID(int DataSetID, int IDIndex, int InstanceIndex)
	{
		// Begin static assert : GPU particles only support DataSetID 0
		int MustBe0[1];
		MustBe0[DataSetID] = 0;
		// End static assert

		RWIDToIndexTable[IDIndex] = InstanceIndex;
	}

#define USE_GROUP_SHARED ((THREADGROUP_SIZE == 64 || THREADGROUP_SIZE == 32) && !USE_WAVE_INTRINSICS && !VULKAN_PROFILE_SM5 && !VULKAN_PROFILE_SM6 && !VULKAN_PROFILE)

#if USE_GROUP_SHARED
	#if THREADGROUP_SIZE == 64
		groupshared uint GroupSharedIndex[64];
		groupshared uint GroupSharedIndex4[16];
		groupshared uint GroupSharedIndex16[4];
		groupshared uint GroupSharedIndex64;
	#elif THREADGROUP_SIZE == 32
		groupshared uint GroupSharedIndex[32];
		groupshared uint GroupSharedIndex4[8];
		groupshared uint GroupSharedIndex16[2];
		groupshared uint GroupSharedIndex64;
	#endif
#endif // USE_GROUP_SHARED

	/* Acquire an output index - the default index is the scratch instance; one additional instance is allocated 
	 *	at the end of the buffer, so no branching on -1 is necessary during OutputData operations
	 */
	int AcquireIndex(uniform int DataSetID, bool bDoAcquire)
	{
		// Begin static assert : GPU particles only support DataSetID 0
		int MustBe0[1];
		MustBe0[DataSetID] = 0;
		// End static assert

		int PrevIdx = GSpawnStartInstance + NumSpawnedInstances;	// scratch instance as default; write to that for dead particles

		#if USE_WAVE_INTRINSICS

			uint NumCounters = WaveActiveCountBits(bDoAcquire);
			uint PrefixCounts = WavePrefixCountBits(bDoAcquire);
			if (NumCounters > 0)
			{
				if (WaveIsFirstLane())
				{
					uint RetPrevIdx;
					InterlockedAdd(RWInstanceCounts[WriteInstanceCountOffset], NumCounters, RetPrevIdx);
					PrevIdx = (int)RetPrevIdx;
				}

				if (bDoAcquire)
				{
					PrevIdx = WaveReadLaneFirst(PrevIdx);
					PrevIdx += PrefixCounts;
				}
			}	

		#elif USE_GROUP_SHARED

			GroupSharedIndex[GGroupThreadId.x] = bDoAcquire ? 1 : 0;
			GroupMemoryBarrierWithGroupSync(); 

			// Group by 4
			if ((GGroupThreadId.x & 0x3) == 0)
			{
				const uint Index = GGroupThreadId.x;

				const uint ActiveCount1 = GroupSharedIndex[Index];
				const uint ActiveCount2 = ActiveCount1 + GroupSharedIndex[Index + 1];
				const uint ActiveCount3 = ActiveCount2 + GroupSharedIndex[Index + 2];
				const uint ActiveCount4 = ActiveCount3 + GroupSharedIndex[Index + 3];
				
				GroupSharedIndex[Index] = 0;
				GroupSharedIndex[Index + 1] = ActiveCount1;
				GroupSharedIndex[Index + 2] = ActiveCount2;
				GroupSharedIndex[Index + 3] = ActiveCount3;
				GroupSharedIndex4[Index / 4] = ActiveCount4;
			}
			GroupMemoryBarrierWithGroupSync(); 

			// Group by 16
			if ((GGroupThreadId.x & 0xF) == 0)
			{
				const uint Index = GGroupThreadId.x / 4;

				const uint ActiveCount1 = GroupSharedIndex4[Index];
				const uint ActiveCount2 = ActiveCount1 + GroupSharedIndex4[Index + 1];
				const uint ActiveCount3 = ActiveCount2 + GroupSharedIndex4[Index + 2];
				const uint ActiveCount4 = ActiveCount3 + GroupSharedIndex4[Index + 3];

				GroupSharedIndex4[Index] = 0;
				GroupSharedIndex4[Index + 1] = ActiveCount1;
				GroupSharedIndex4[Index + 2] = ActiveCount2;
				GroupSharedIndex4[Index + 3] = ActiveCount3;
				GroupSharedIndex16[Index / 4] = ActiveCount4;
			}
			GroupMemoryBarrierWithGroupSync(); 

			// Group by 64
			if ((GGroupThreadId.x & 0x3F) == 0)
			{
				const uint Index = GGroupThreadId.x / 16;
				uint RetPrevIdx = 0;
		
				const uint ActiveCount1 = GroupSharedIndex16[Index];
				const uint ActiveCount2 = ActiveCount1 + GroupSharedIndex16[Index + 1];
		#if THREADGROUP_SIZE == 64
				const uint ActiveCount3 = ActiveCount2 + GroupSharedIndex16[Index + 2];
				const uint ActiveCount4 = ActiveCount3 + GroupSharedIndex16[Index + 3];
		#endif

				GroupSharedIndex16[Index] = 0;
				GroupSharedIndex16[Index + 1] = ActiveCount1;
		#if THREADGROUP_SIZE == 64
				GroupSharedIndex16[Index + 2] = ActiveCount2;
				GroupSharedIndex16[Index + 3] = ActiveCount3;

				InterlockedAdd(RWInstanceCounts[WriteInstanceCountOffset], ActiveCount4, RetPrevIdx);
		#elif THREADGROUP_SIZE == 32
				InterlockedAdd(RWInstanceCounts[WriteInstanceCountOffset], ActiveCount2, RetPrevIdx);
		#endif
				GroupSharedIndex64 = RetPrevIdx;
			}
			GroupMemoryBarrierWithGroupSync(); 

			PrevIdx = GroupSharedIndex64 + GroupSharedIndex16[GGroupThreadId.x / 16] + GroupSharedIndex4[GGroupThreadId.x / 4] + GroupSharedIndex[GGroupThreadId.x];

		#else // !USE_WAVE_INTRINSICS && !USE_GROUP_SHARED

			if(bDoAcquire == true)
			{
				// Have to use uint's here to avoid PS4 compiler warnings about InterlockedAdd, cannot propagate uint due to CPU VM limitations...
				uint RetPrevIdx;
				// @TODO : add some TLS logic to avoid thread group for doing atomic for each thread. (gathering the actual required count)
				InterlockedAdd(RWInstanceCounts[WriteInstanceCountOffset], (uint)1U, RetPrevIdx);
				PrevIdx = (int)RetPrevIdx;
			}

		#endif // USE_WAVE_INTRISICS || USE_GROUP_SHARED

		return PrevIdx;
	}

	/* ---------------------------------------------------------------------
	 * InputData operations 
	 * ---------------------------------------------------------------------
	 */
	float InputDataFloat(int DataSetIndex, int RegisterIdx, int InstanceIdx)
	{
		#if NIAGARA_PARTICLE_PARTIAL_ENABLED
			return RWOutputFloat[RegisterIdx*ComponentBufferSizeWrite + InstanceIdx];
		#else
			return InputFloat[RegisterIdx*ComponentBufferSizeRead + InstanceIdx];
		#endif
	}

	int InputDataInt(int DataSetIndex, int RegisterIdx, int InstanceIdx)
	{
		#if NIAGARA_PARTICLE_PARTIAL_ENABLED
			return RWOutputInt[RegisterIdx*ComponentBufferSizeWrite + InstanceIdx];
		#else
			return InputInt[RegisterIdx*ComponentBufferSizeRead + InstanceIdx];
		#endif
	}
	
	bool InputDataBool(int DataSetIndex, int RegisterIdx, int InstanceIdx)
	{
		#if NIAGARA_PARTICLE_PARTIAL_ENABLED
			return RWOutputInt[RegisterIdx*ComponentBufferSizeWrite + InstanceIdx] == -1;
		#else
			return InputInt[RegisterIdx*ComponentBufferSizeRead + InstanceIdx] == -1;
		#endif
	}

	float InputDataHalf(int DataSetIndex, int RegisterIdx, int InstanceIdx)
	{
		#if NIAGARA_PARTICLE_PARTIAL_ENABLED
			return RWOutputHalf[RegisterIdx*ComponentBufferSizeWrite + InstanceIdx];
		#else
			return InputHalf[RegisterIdx*ComponentBufferSizeRead + InstanceIdx];
		#endif
	}

	/* ---------------------------------------------------------------------
	 * OutputData operations 
	 * ---------------------------------------------------------------------
	 */
	void OutputDataFloat(int DataSetIndex, int RegisterIndex, int InstanceIndex, float Value)
	{
		RWOutputFloat[RegisterIndex*ComponentBufferSizeWrite + InstanceIndex] = Value;
	}

	void OutputDataInt(int DataSetIndex, int RegisterIndex, int InstanceIndex, int Value)
	{
		RWOutputInt[RegisterIndex*ComponentBufferSizeWrite + InstanceIndex] = Value;
	}

	void OutputDataBool(int DataSetIndex, int RegisterIndex, int InstanceIndex, bool Value)
	{
		RWOutputInt[RegisterIndex*ComponentBufferSizeWrite + InstanceIndex] = Value ? -1 : 0;
	}

	void OutputDataHalf(int DataSetIndex, int RegisterIndex, int InstanceIndex, float Value)
	{
		RWOutputHalf[RegisterIndex*ComponentBufferSizeWrite + InstanceIndex] = Value;
	}

	void EnterStatScope(int ID)	{}
	void ExitStatScope()	{}
#endif // GPU_SIMULATION

	/* 
	 * Get the index to write onto the output buffer
	 */
	int OutputIndex(const int DataSetID, const bool bStageKillsParticles, const bool bIsValid)
	{
	#if GPU_SIMULATION
		// If this stage cannot kill particles, we can just write them out in the same order as they
		// appear in the input. We must use an if here (as opposed to a ternary operator, or some
		// other branchless construct), because we don't want to call AcquireIndex() at all, since
		// that manipulates the RWInstanceCounts UAV. The generated code will copy the source count
		// at the end of the shader.
		if (!bStageKillsParticles)
		{
			return GLinearThreadId;
		}
	#endif

		return AcquireIndex(DataSetID, bIsValid);
	}

////////////////////////////////////////////////////////////////////////////////////
// Random number functions
struct NiagaraRandInfo
{
	int Seed1;
	int Seed2;
	int Seed3;
};

#if GPU_SIMULATION
NiagaraRandInfo MakeRandInfo()
{
	GRandomSeedOffset += 1664525u;

	NiagaraRandInfo RandInfo;
	RandInfo.Seed1 = GLinearThreadId;
	RandInfo.Seed2 = GRandomSeedOffset;
	RandInfo.Seed3 = GEmitterTickCounter;
	return RandInfo;
}

float NiagaraRandomFloat(NiagaraRandInfo RandInfo)
{
	return uint(RandInfo.Seed3) == 0xffffffff ? NiagaraInternalNoise(uint(RandInfo.Seed1), uint(RandInfo.Seed2), uint(RandInfo.Seed3)) : rand(1.0f, uint(RandInfo.Seed1), uint(RandInfo.Seed2), uint(RandInfo.Seed3));
}

int NiagaraRandomInt(NiagaraRandInfo RandInfo, int Range)
{
	float T = NiagaraRandomFloat(RandInfo);
	return int(floor(float(Range) * T));
}

float3 NiagaraRandomBaryCoord(NiagaraRandInfo RandInfo)
{
	float2 r = float2(NiagaraRandomFloat(RandInfo), NiagaraRandomFloat(RandInfo));
	float sqrt0 = sqrt(r.x);
	float sqrt1 = sqrt(r.y);
	return float3(1.0f - sqrt0, sqrt0 * (1.0 - r.y), r.y * sqrt0);
}
#endif

////////////////////////////////////////////////////////////////////////////////////
//Include the simulation shader code generated by the node graph.
#include "/Engine/Generated/NiagaraEmitterInstance.ush"