300 lines
10 KiB
HLSL
300 lines
10 KiB
HLSL
// Copyright Epic Games, Inc. All Rights Reserved.
|
|
|
|
/*=============================================================================
|
|
NiagaraDirectSolver.ush
|
|
=============================================================================*/
|
|
|
|
#if GPU_SIMULATION
|
|
|
|
/* -----------------------------------------------------------------
|
|
* Shared memory for datas that will be accessed within the direct solver
|
|
* -----------------------------------------------------------------
|
|
*/
|
|
|
|
groupshared float4 SharedTridiagValues[THREADGROUP_SIZE];
|
|
groupshared float4 SharedTridiagResult[THREADGROUP_SIZE];
|
|
|
|
void SolveTridiagSerial2()
|
|
{
|
|
if(GGroupThreadId.x == 0)
|
|
{
|
|
{
|
|
SharedTridiagValues[0].z = SharedTridiagValues[0].z / SharedTridiagValues[0].y;
|
|
GroupMemoryBarrier();
|
|
SharedTridiagValues[0].w = SharedTridiagValues[0].w / SharedTridiagValues[0].y;
|
|
GroupMemoryBarrier();
|
|
|
|
for(int i = 1; i < THREADGROUP_SIZE; ++i)
|
|
{
|
|
if(i < (THREADGROUP_SIZE-1))
|
|
{
|
|
SharedTridiagValues[i].z = SharedTridiagValues[i].z / ( SharedTridiagValues[i].y - SharedTridiagValues[i-1].z * SharedTridiagValues[i-1].x);
|
|
GroupMemoryBarrier();
|
|
}
|
|
|
|
SharedTridiagValues[i].w = (SharedTridiagValues[i].w -SharedTridiagValues[i-1].w * SharedTridiagValues[i].x) / ( SharedTridiagValues[i].y - SharedTridiagValues[i-1].z * SharedTridiagValues[i-1].x);
|
|
GroupMemoryBarrier();
|
|
}
|
|
}
|
|
{
|
|
for(int i = THREADGROUP_SIZE-2; i >= 0; --i)
|
|
{
|
|
SharedTridiagValues[i].w = SharedTridiagValues[i].w - SharedTridiagValues[i].z * SharedTridiagValues[i+1].w;
|
|
GroupMemoryBarrier();
|
|
}
|
|
}
|
|
}
|
|
GroupMemoryBarrier();
|
|
}
|
|
|
|
void SolveTridiagSerial()
|
|
{
|
|
if(GGroupThreadId.x == 0)
|
|
{
|
|
{
|
|
for(int i = 1; i < THREADGROUP_SIZE; ++i)
|
|
{
|
|
const float W = SharedTridiagValues[i].x / SharedTridiagValues[i-1].y;
|
|
SharedTridiagValues[i].y -= W * SharedTridiagValues[i-1].z;
|
|
GroupMemoryBarrier();
|
|
|
|
SharedTridiagValues[i].w -= W * SharedTridiagValues[i-1].w;
|
|
GroupMemoryBarrier();
|
|
}
|
|
}
|
|
SharedTridiagValues[THREADGROUP_SIZE-1].w = SharedTridiagValues[THREADGROUP_SIZE-1].w / SharedTridiagValues[THREADGROUP_SIZE-1].y;
|
|
GroupMemoryBarrier();
|
|
{
|
|
for(int i = THREADGROUP_SIZE-2; i >= 0; --i)
|
|
{
|
|
SharedTridiagValues[i].w = (SharedTridiagValues[i].w - SharedTridiagValues[i].z * SharedTridiagValues[i+1].w) / SharedTridiagValues[i].y;
|
|
GroupMemoryBarrier();
|
|
}
|
|
}
|
|
}
|
|
GroupMemoryBarrier();
|
|
}
|
|
|
|
void SolveConjugateResidual()
|
|
{
|
|
float X = 0.0;
|
|
|
|
SharedTridiagResult[GGroupThreadId.x] = float4(0.0,0.0,0.0,0.0);
|
|
GroupMemoryBarrier();
|
|
|
|
const int PrevIdx = (GGroupThreadId.x == 0) ? GGroupThreadId.x : GGroupThreadId.x-1;
|
|
const int NextIdx = (GGroupThreadId.x == THREADGROUP_SIZE-1) ? GGroupThreadId.x : GGroupThreadId.x+1;
|
|
|
|
SharedTridiagResult[GGroupThreadId.x].y = SharedTridiagValues[GGroupThreadId.x].w;
|
|
GroupMemoryBarrier();
|
|
|
|
SharedTridiagResult[GGroupThreadId.x].z = SharedTridiagResult[GGroupThreadId.x].y / SharedTridiagValues[GGroupThreadId.x].y;
|
|
SharedTridiagResult[GGroupThreadId.x].w = SharedTridiagResult[GGroupThreadId.x].z;
|
|
GroupMemoryBarrier();
|
|
{
|
|
|
|
for(int ii = 0; ii < 10; ++ii)
|
|
{
|
|
SharedTridiagResult[GGroupThreadId.x].x = SharedTridiagValues[GGroupThreadId.x].x * SharedTridiagResult[PrevIdx].w+
|
|
SharedTridiagValues[GGroupThreadId.x].y * SharedTridiagResult[GGroupThreadId.x].w+
|
|
SharedTridiagValues[GGroupThreadId.x].z * SharedTridiagResult[NextIdx].w;
|
|
GroupMemoryBarrier();
|
|
|
|
SharedTridiagResult[GGroupThreadId.x].y = SharedTridiagValues[GGroupThreadId.x].x * SharedTridiagResult[PrevIdx].z+
|
|
SharedTridiagValues[GGroupThreadId.x].y * SharedTridiagResult[GGroupThreadId.x].z+
|
|
SharedTridiagValues[GGroupThreadId.x].z * SharedTridiagResult[NextIdx].z;
|
|
GroupMemoryBarrier();
|
|
|
|
float rkArk = 0.0;
|
|
float ApMAp = 0.0;
|
|
|
|
[unroll]
|
|
for(int kk = 0; kk < THREADGROUP_SIZE; ++kk)
|
|
{
|
|
rkArk += SharedTridiagResult[kk].z * SharedTridiagResult[kk].y;
|
|
ApMAp += SharedTridiagResult[kk].x * SharedTridiagResult[kk].x / SharedTridiagValues[kk].y;
|
|
}
|
|
|
|
float Alpha = (ApMAp != 0.0) ? rkArk / ApMAp : 0.0;
|
|
X += Alpha * SharedTridiagResult[GGroupThreadId.x].w;
|
|
|
|
SharedTridiagResult[GGroupThreadId.x].z -= Alpha * SharedTridiagResult[GGroupThreadId.x].x / SharedTridiagValues[GGroupThreadId.x].y;
|
|
GroupMemoryBarrier();
|
|
|
|
SharedTridiagResult[GGroupThreadId.x].y = SharedTridiagValues[GGroupThreadId.x].x * SharedTridiagResult[PrevIdx].z+
|
|
SharedTridiagValues[GGroupThreadId.x].y * SharedTridiagResult[GGroupThreadId.x].z+
|
|
SharedTridiagValues[GGroupThreadId.x].z * SharedTridiagResult[NextIdx].z;
|
|
GroupMemoryBarrier();
|
|
|
|
float rkArkn = 0.0;
|
|
|
|
[unroll]
|
|
for(int ll = 0; ll < THREADGROUP_SIZE; ++ll)
|
|
{
|
|
rkArkn += SharedTridiagResult[ll].z * SharedTridiagResult[ll].y;
|
|
}
|
|
float Beta = (rkArk != 0.0) ? rkArkn / rkArk : 0.0;
|
|
SharedTridiagResult[GGroupThreadId.x].w = SharedTridiagResult[GGroupThreadId.x].z + Beta * SharedTridiagResult[GGroupThreadId.x].w;
|
|
//SharedTridiagResult[GGroupThreadId.x].x = SharedTridiagResult[GGroupThreadId.x].y + Beta * SharedTridiagResult[GGroupThreadId.x].x;
|
|
GroupMemoryBarrier();
|
|
}
|
|
}
|
|
|
|
SharedTridiagValues[GGroupThreadId.x].w = X;
|
|
}
|
|
|
|
|
|
void SolveConjugateGradient()
|
|
{
|
|
float X = 0.0;
|
|
|
|
SharedTridiagResult[GGroupThreadId.x] = float4(0.0,0.0,0.0,0.0);
|
|
GroupMemoryBarrier();
|
|
|
|
const int PrevIdx = (GGroupThreadId.x == 0) ? GGroupThreadId.x : GGroupThreadId.x-1;
|
|
const int NextIdx = (GGroupThreadId.x == THREADGROUP_SIZE-1) ? GGroupThreadId.x : GGroupThreadId.x+1;
|
|
|
|
SharedTridiagResult[GGroupThreadId.x].y = SharedTridiagValues[GGroupThreadId.x].w;
|
|
GroupMemoryBarrier();
|
|
|
|
SharedTridiagResult[GGroupThreadId.x].z = SharedTridiagResult[GGroupThreadId.x].y / SharedTridiagValues[GGroupThreadId.x].y;
|
|
SharedTridiagResult[GGroupThreadId.x].w = SharedTridiagResult[GGroupThreadId.x].z;
|
|
GroupMemoryBarrier();
|
|
{
|
|
|
|
for(int ii = 0; ii < 10; ++ii)
|
|
{
|
|
SharedTridiagResult[GGroupThreadId.x].x = SharedTridiagValues[GGroupThreadId.x].x * SharedTridiagResult[PrevIdx].w+
|
|
SharedTridiagValues[GGroupThreadId.x].y * SharedTridiagResult[GGroupThreadId.x].w+
|
|
SharedTridiagValues[GGroupThreadId.x].z * SharedTridiagResult[NextIdx].w;
|
|
GroupMemoryBarrier();
|
|
|
|
float rkzko = 0.0;
|
|
float pkApk = 0.0;
|
|
|
|
[unroll]
|
|
for(int kk = 0; kk < THREADGROUP_SIZE; ++kk)
|
|
{
|
|
rkzko += SharedTridiagResult[kk].y * SharedTridiagResult[kk].z;
|
|
pkApk += SharedTridiagResult[kk].x * SharedTridiagResult[kk].w;
|
|
}
|
|
|
|
float Alpha = (pkApk != 0.0) ? rkzko / pkApk : 0.0;
|
|
X += Alpha * SharedTridiagResult[GGroupThreadId.x].w;
|
|
|
|
SharedTridiagResult[GGroupThreadId.x].y -= Alpha * SharedTridiagResult[GGroupThreadId.x].x;
|
|
GroupMemoryBarrier();
|
|
SharedTridiagResult[GGroupThreadId.x].z = SharedTridiagResult[GGroupThreadId.x].y / SharedTridiagValues[GGroupThreadId.x].y;
|
|
GroupMemoryBarrier();
|
|
|
|
float rkzkn = 0.0;
|
|
|
|
[unroll]
|
|
for(int ll = 0; ll < THREADGROUP_SIZE; ++ll)
|
|
{
|
|
rkzkn += SharedTridiagResult[ll].y * SharedTridiagResult[ll].z;
|
|
}
|
|
float Beta = (rkzko != 0.0) ? rkzkn / rkzko : 0.0;
|
|
SharedTridiagResult[GGroupThreadId.x].w = SharedTridiagResult[GGroupThreadId.x].z + Beta * SharedTridiagResult[GGroupThreadId.x].w;
|
|
GroupMemoryBarrier();
|
|
}
|
|
}
|
|
|
|
SharedTridiagValues[GGroupThreadId.x].w = X;
|
|
}
|
|
|
|
void SolveTridiagSystem()
|
|
{
|
|
float lTemp = 0.0, uTemp = 0.0, hTemp = 0.0;
|
|
|
|
[unroll]
|
|
for(int span = 1; span < THREADGROUP_SIZE; span *= 2)
|
|
{
|
|
if( (GGroupThreadId.x - span) >= 0)
|
|
{
|
|
lTemp = (SharedTridiagValues[GGroupThreadId.x - span].y != 0.0) ? -SharedTridiagValues[GGroupThreadId.x].x / SharedTridiagValues[GGroupThreadId.x - span].y : 0.0;
|
|
}
|
|
else
|
|
{
|
|
lTemp = 0.0;
|
|
}
|
|
if( (GGroupThreadId.x + span) < THREADGROUP_SIZE)
|
|
{
|
|
uTemp = (SharedTridiagValues[GGroupThreadId.x + span].y != 0.0) ? -SharedTridiagValues[GGroupThreadId.x].z / SharedTridiagValues[GGroupThreadId.x + span].y : 0.0;
|
|
}
|
|
else
|
|
{
|
|
uTemp = 0.0;
|
|
}
|
|
GroupMemoryBarrier();
|
|
if( (GGroupThreadId.x - span) >= 0)
|
|
{
|
|
SharedTridiagValues[GGroupThreadId.x].y += lTemp * SharedTridiagValues[GGroupThreadId.x - span].z;
|
|
hTemp += lTemp * SharedTridiagValues[GGroupThreadId.x - span].w;
|
|
lTemp *= SharedTridiagValues[GGroupThreadId.x - span].x;
|
|
}
|
|
if( (GGroupThreadId.x + span) < THREADGROUP_SIZE)
|
|
{
|
|
SharedTridiagValues[GGroupThreadId.x].y += uTemp * SharedTridiagValues[GGroupThreadId.x + span].x;
|
|
hTemp += uTemp * SharedTridiagValues[GGroupThreadId.x + span].w;
|
|
uTemp *= SharedTridiagValues[GGroupThreadId.x + span].z;
|
|
}
|
|
GroupMemoryBarrier();
|
|
SharedTridiagValues[GGroupThreadId.x].x = hTemp;
|
|
SharedTridiagValues[GGroupThreadId.x].z = uTemp;
|
|
SharedTridiagValues[GGroupThreadId.x].w = hTemp;
|
|
GroupMemoryBarrier();
|
|
}
|
|
SharedTridiagValues[GGroupThreadId.x].w /= SharedTridiagValues[GGroupThreadId.x].y;
|
|
GroupMemoryBarrier();
|
|
}
|
|
|
|
float SolveTridiag()
|
|
{
|
|
const int NumIteration = log2(THREADGROUP_SIZE/2);
|
|
|
|
uint DataStride = 1;
|
|
|
|
float4 DataNew = {0,0,0,0};
|
|
|
|
[unroll]
|
|
for(int i = 0; i < NumIteration; ++i)
|
|
{
|
|
int RightIdx = GGroupThreadId.x + DataStride;
|
|
RightIdx = RightIdx & (THREADGROUP_SIZE-1);
|
|
|
|
int LeftIdx = GGroupThreadId.x - DataStride;
|
|
LeftIdx = LeftIdx & (THREADGROUP_SIZE-1);
|
|
|
|
const float DataLeft = SharedTridiagValues[GGroupThreadId.x].x / SharedTridiagValues[LeftIdx].y;
|
|
const float DataRight = SharedTridiagValues[GGroupThreadId.x].z / SharedTridiagValues[RightIdx].y;
|
|
|
|
DataNew.y = SharedTridiagValues[GGroupThreadId.x].y - SharedTridiagValues[LeftIdx].z * DataLeft -
|
|
SharedTridiagValues[RightIdx].x * DataRight;
|
|
DataNew.w = SharedTridiagValues[GGroupThreadId.x].w - SharedTridiagValues[LeftIdx].w * DataLeft -
|
|
SharedTridiagValues[RightIdx].w * DataRight;
|
|
DataNew.x = -SharedTridiagValues[LeftIdx].x * DataLeft;
|
|
DataNew.z = -SharedTridiagValues[RightIdx].z * DataRight;
|
|
|
|
GroupMemoryBarrier();
|
|
|
|
SharedTridiagValues[GGroupThreadId.x] = DataNew;
|
|
DataStride *= 2;
|
|
|
|
GroupMemoryBarrier();
|
|
}
|
|
|
|
const int LeftIdx = GGroupThreadId.x & (DataStride-1);
|
|
const int RightIdx = LeftIdx + DataStride;
|
|
|
|
const float DataValue = SharedTridiagValues[RightIdx].y * SharedTridiagValues[LeftIdx].y -
|
|
SharedTridiagValues[LeftIdx].z * SharedTridiagValues[RightIdx].x;
|
|
|
|
return (GGroupThreadId.x < DataStride) ?
|
|
(SharedTridiagValues[RightIdx].y * SharedTridiagValues[LeftIdx].w -
|
|
SharedTridiagValues[LeftIdx].z * SharedTridiagValues[RightIdx].w) / DataValue :
|
|
(SharedTridiagValues[RightIdx].w * SharedTridiagValues[LeftIdx].y -
|
|
SharedTridiagValues[LeftIdx].w * SharedTridiagValues[RightIdx].x) / DataValue;
|
|
}
|
|
|
|
#endif //GPU_SIMULATION |