Diffstat (limited to 'thirdparty/amd-fsr2/shaders/ffx_spd.h')
-rw-r--r--  thirdparty/amd-fsr2/shaders/ffx_spd.h  936
1 file changed, 936 insertions, 0 deletions
diff --git a/thirdparty/amd-fsr2/shaders/ffx_spd.h b/thirdparty/amd-fsr2/shaders/ffx_spd.h
new file mode 100644
index 0000000000..5ce24ec87c
--- /dev/null
+++ b/thirdparty/amd-fsr2/shaders/ffx_spd.h
@@ -0,0 +1,936 @@
+// This file is part of the FidelityFX SDK.
+//
+// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifdef FFX_CPU
+FFX_STATIC void SpdSetup(FfxUInt32x2 dispatchThreadGroupCountXY, // CPU side: dispatch thread group count xy
+ FfxUInt32x2 workGroupOffset, // GPU side: pass in as constant
+ FfxUInt32x2 numWorkGroupsAndMips, // GPU side: pass in as constant
+ FfxUInt32x4 rectInfo, // left, top, width, height
+ FfxInt32 mips) // optional: if -1, calculate based on rect width and height
+{
+ workGroupOffset[0] = rectInfo[0] / 64; // rectInfo[0] = left
+ workGroupOffset[1] = rectInfo[1] / 64; // rectInfo[1] = top
+
+ FfxUInt32 endIndexX = (rectInfo[0] + rectInfo[2] - 1) / 64; // rectInfo[0] = left, rectInfo[2] = width
+ FfxUInt32 endIndexY = (rectInfo[1] + rectInfo[3] - 1) / 64; // rectInfo[1] = top, rectInfo[3] = height
+
+ dispatchThreadGroupCountXY[0] = endIndexX + 1 - workGroupOffset[0];
+ dispatchThreadGroupCountXY[1] = endIndexY + 1 - workGroupOffset[1];
+
+ numWorkGroupsAndMips[0] = (dispatchThreadGroupCountXY[0]) * (dispatchThreadGroupCountXY[1]);
+
+ if (mips >= 0)
+ {
+ numWorkGroupsAndMips[1] = FfxUInt32(mips);
+ }
+ else
+ {
+ // calculate based on rect width and height
+ FfxUInt32 resolution = ffxMax(rectInfo[2], rectInfo[3]);
+        numWorkGroupsAndMips[1] = FfxUInt32(ffxMin(floor(log2(FfxFloat32(resolution))), FfxFloat32(12)));
+ }
+}
+
+FFX_STATIC void SpdSetup(FfxUInt32x2 dispatchThreadGroupCountXY, // CPU side: dispatch thread group count xy
+ FfxUInt32x2 workGroupOffset, // GPU side: pass in as constant
+ FfxUInt32x2 numWorkGroupsAndMips, // GPU side: pass in as constant
+ FfxUInt32x4 rectInfo) // left, top, width, height
+{
+ SpdSetup(dispatchThreadGroupCountXY, workGroupOffset, numWorkGroupsAndMips, rectInfo, -1);
+}
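+
+// Illustrative host-side usage (a sketch, not part of the SDK; resolution and
+// the constant-upload/dispatch mechanism are app-specific assumptions):
+//
+//   FfxUInt32x2 dispatchThreadGroupCountXY;
+//   FfxUInt32x2 workGroupOffset;       // upload to the shader as a constant
+//   FfxUInt32x2 numWorkGroupsAndMips;  // upload to the shader as a constant
+//   FfxUInt32x4 rectInfo = {0, 0, 2880, 1620}; // left, top, width, height
+//   SpdSetup(dispatchThreadGroupCountXY, workGroupOffset, numWorkGroupsAndMips, rectInfo);
+//   // dispatch dispatchThreadGroupCountXY[0] x dispatchThreadGroupCountXY[1] x sliceCount workgroups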
+#endif // #ifdef FFX_CPU
+
+
+//==============================================================================================================================
+// NON-PACKED VERSION
+//==============================================================================================================================
+#ifdef FFX_GPU
+#ifdef SPD_PACKED_ONLY
+// Avoid compiler errors: with SPD_PACKED_ONLY, only the packed (FP16) path is
+// used, so the non-packed callbacks referenced below get no-op stubs.
+FfxFloat32x4 SpdLoadSourceImage(FfxInt32x2 p, FfxUInt32 slice)
+{
+ return FfxFloat32x4(0.0, 0.0, 0.0, 0.0);
+}
+
+FfxFloat32x4 SpdLoad(FfxInt32x2 p, FfxUInt32 slice)
+{
+ return FfxFloat32x4(0.0, 0.0, 0.0, 0.0);
+}
+void SpdStore(FfxInt32x2 p, FfxFloat32x4 value, FfxUInt32 mip, FfxUInt32 slice)
+{
+}
+FfxFloat32x4 SpdLoadIntermediate(FfxUInt32 x, FfxUInt32 y)
+{
+ return FfxFloat32x4(0.0, 0.0, 0.0, 0.0);
+}
+void SpdStoreIntermediate(FfxUInt32 x, FfxUInt32 y, FfxFloat32x4 value)
+{
+}
+FfxFloat32x4 SpdReduce4(FfxFloat32x4 v0, FfxFloat32x4 v1, FfxFloat32x4 v2, FfxFloat32x4 v3)
+{
+ return FfxFloat32x4(0.0, 0.0, 0.0, 0.0);
+}
+#endif // #ifdef SPD_PACKED_ONLY
+
+//_____________________________________________________________/\_______________________________________________________________
+#if defined(FFX_GLSL) && !defined(SPD_NO_WAVE_OPERATIONS)
+#extension GL_KHR_shader_subgroup_quad:require
+#endif
+
+void SpdWorkgroupShuffleBarrier()
+{
+#ifdef FFX_GLSL
+ barrier();
+#endif
+#ifdef FFX_HLSL
+ GroupMemoryBarrierWithGroupSync();
+#endif
+}
+
+// Only the last active workgroup should proceed
+bool SpdExitWorkgroup(FfxUInt32 numWorkGroups, FfxUInt32 localInvocationIndex, FfxUInt32 slice)
+{
+ // global atomic counter
+ if (localInvocationIndex == 0)
+ {
+ SpdIncreaseAtomicCounter(slice);
+ }
+
+ SpdWorkgroupShuffleBarrier();
+ return (SpdGetAtomicCounter() != (numWorkGroups - 1));
+}
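+
+// SpdIncreaseAtomicCounter, SpdGetAtomicCounter and SpdResetAtomicCounter are
+// user defined. A typical GLSL implementation (illustrative sketch; buffer and
+// variable names are assumptions) backs them with one global counter per slice
+// and caches the pre-increment value in LDS so every thread can read it:
+//
+//   layout(std430, binding = 2) coherent buffer SpdGlobalAtomicBuffer { FfxUInt32 counter[]; } spdGlobal;
+//   shared FfxUInt32 spdCounter;
+//   void SpdIncreaseAtomicCounter(FfxUInt32 slice) { spdCounter = atomicAdd(spdGlobal.counter[slice], 1); }
+//   FfxUInt32 SpdGetAtomicCounter() { return spdCounter; }
+//   void SpdResetAtomicCounter(FfxUInt32 slice) { spdGlobal.counter[slice] = 0; }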
+
+// User defined: FfxFloat32x4 SpdReduce4(FfxFloat32x4 v0, FfxFloat32x4 v1, FfxFloat32x4 v2, FfxFloat32x4 v3);
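+// A common choice (illustrative sketch, not mandated by this header) is a
+// simple average, which produces a standard 2x2 box-filtered mip chain:
+//
+//   FfxFloat32x4 SpdReduce4(FfxFloat32x4 v0, FfxFloat32x4 v1, FfxFloat32x4 v2, FfxFloat32x4 v3)
+//   {
+//       return (v0 + v1 + v2 + v3) * 0.25;
+//   }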
+FfxFloat32x4 SpdReduceQuad(FfxFloat32x4 v)
+{
+#if defined(FFX_GLSL) && !defined(SPD_NO_WAVE_OPERATIONS)
+
+ FfxFloat32x4 v0 = v;
+ FfxFloat32x4 v1 = subgroupQuadSwapHorizontal(v);
+ FfxFloat32x4 v2 = subgroupQuadSwapVertical(v);
+ FfxFloat32x4 v3 = subgroupQuadSwapDiagonal(v);
+ return SpdReduce4(v0, v1, v2, v3);
+
+#elif defined(FFX_HLSL) && !defined(SPD_NO_WAVE_OPERATIONS)
+
+ // requires SM6.0
+ FfxUInt32 quad = WaveGetLaneIndex() & (~0x3);
+ FfxFloat32x4 v0 = v;
+ FfxFloat32x4 v1 = WaveReadLaneAt(v, quad | 1);
+ FfxFloat32x4 v2 = WaveReadLaneAt(v, quad | 2);
+ FfxFloat32x4 v3 = WaveReadLaneAt(v, quad | 3);
+ return SpdReduce4(v0, v1, v2, v3);
+/*
+ // if SM6.0 is not available, you can use the AMD shader intrinsics
+ // the AMD shader intrinsics are available in AMD GPU Services (AGS) library:
+ // https://gpuopen.com/amd-gpu-services-ags-library/
+ // works for DX11
+ FfxFloat32x4 v0 = v;
+ FfxFloat32x4 v1;
+ v1.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
+ v1.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
+ v1.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
+ v1.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
+ FfxFloat32x4 v2;
+ v2.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
+ v2.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
+ v2.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
+ v2.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
+ FfxFloat32x4 v3;
+ v3.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
+ v3.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
+ v3.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
+ v3.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
+ return SpdReduce4(v0, v1, v2, v3);
+ */
+#endif
+ return v;
+}
+
+FfxFloat32x4 SpdReduceIntermediate(FfxUInt32x2 i0, FfxUInt32x2 i1, FfxUInt32x2 i2, FfxUInt32x2 i3)
+{
+ FfxFloat32x4 v0 = SpdLoadIntermediate(i0.x, i0.y);
+ FfxFloat32x4 v1 = SpdLoadIntermediate(i1.x, i1.y);
+ FfxFloat32x4 v2 = SpdLoadIntermediate(i2.x, i2.y);
+ FfxFloat32x4 v3 = SpdLoadIntermediate(i3.x, i3.y);
+ return SpdReduce4(v0, v1, v2, v3);
+}
+
+FfxFloat32x4 SpdReduceLoad4(FfxUInt32x2 i0, FfxUInt32x2 i1, FfxUInt32x2 i2, FfxUInt32x2 i3, FfxUInt32 slice)
+{
+ FfxFloat32x4 v0 = SpdLoad(FfxInt32x2(i0), slice);
+ FfxFloat32x4 v1 = SpdLoad(FfxInt32x2(i1), slice);
+ FfxFloat32x4 v2 = SpdLoad(FfxInt32x2(i2), slice);
+ FfxFloat32x4 v3 = SpdLoad(FfxInt32x2(i3), slice);
+ return SpdReduce4(v0, v1, v2, v3);
+}
+
+FfxFloat32x4 SpdReduceLoad4(FfxUInt32x2 base, FfxUInt32 slice)
+{
+ return SpdReduceLoad4(FfxUInt32x2(base + FfxUInt32x2(0, 0)), FfxUInt32x2(base + FfxUInt32x2(0, 1)), FfxUInt32x2(base + FfxUInt32x2(1, 0)), FfxUInt32x2(base + FfxUInt32x2(1, 1)), slice);
+}
+
+FfxFloat32x4 SpdReduceLoadSourceImage4(FfxUInt32x2 i0, FfxUInt32x2 i1, FfxUInt32x2 i2, FfxUInt32x2 i3, FfxUInt32 slice)
+{
+ FfxFloat32x4 v0 = SpdLoadSourceImage(FfxInt32x2(i0), slice);
+ FfxFloat32x4 v1 = SpdLoadSourceImage(FfxInt32x2(i1), slice);
+ FfxFloat32x4 v2 = SpdLoadSourceImage(FfxInt32x2(i2), slice);
+ FfxFloat32x4 v3 = SpdLoadSourceImage(FfxInt32x2(i3), slice);
+ return SpdReduce4(v0, v1, v2, v3);
+}
+
+FfxFloat32x4 SpdReduceLoadSourceImage(FfxUInt32x2 base, FfxUInt32 slice)
+{
+#ifdef SPD_LINEAR_SAMPLER
+ return SpdLoadSourceImage(FfxInt32x2(base), slice);
+#else
+ return SpdReduceLoadSourceImage4(FfxUInt32x2(base + FfxUInt32x2(0, 0)), FfxUInt32x2(base + FfxUInt32x2(0, 1)), FfxUInt32x2(base + FfxUInt32x2(1, 0)), FfxUInt32x2(base + FfxUInt32x2(1, 1)), slice);
+#endif
+}
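+
+// With SPD_LINEAR_SAMPLER defined, the four point loads above collapse into a
+// single bilinear fetch at the centre of the 2x2 quad. Illustrative GLSL
+// sketch (imgSrc, linearClamp and invInputSize are assumed user bindings):
+//
+//   FfxFloat32x4 SpdLoadSourceImage(FfxInt32x2 p, FfxUInt32 slice)
+//   {
+//       FfxFloat32x2 uv = (FfxFloat32x2(p) + 1.0) * invInputSize; // centre of the 2x2 quad
+//       return textureLod(sampler2DArray(imgSrc, linearClamp), FfxFloat32x3(uv, slice), 0.0);
+//   }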
+
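+// Mips 0 and 1: a 256-thread workgroup covers a 64x64 source tile. Each thread
+// reduces one 2x2 quad in each of the four 32x32 sub-tiles, producing four
+// mip 0 texels; quad-wide wave reductions then produce mip 1.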
+void SpdDownsampleMips_0_1_Intrinsics(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
+{
+ FfxFloat32x4 v[4];
+
+ FfxInt32x2 tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2, y * 2);
+ FfxInt32x2 pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x, y);
+ v[0] = SpdReduceLoadSourceImage(tex, slice);
+ SpdStore(pix, v[0], 0, slice);
+
+ tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2 + 32, y * 2);
+ pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x + 16, y);
+ v[1] = SpdReduceLoadSourceImage(tex, slice);
+ SpdStore(pix, v[1], 0, slice);
+
+ tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2, y * 2 + 32);
+ pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x, y + 16);
+ v[2] = SpdReduceLoadSourceImage(tex, slice);
+ SpdStore(pix, v[2], 0, slice);
+
+ tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2 + 32, y * 2 + 32);
+ pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x + 16, y + 16);
+ v[3] = SpdReduceLoadSourceImage(tex, slice);
+ SpdStore(pix, v[3], 0, slice);
+
+ if (mip <= 1)
+ return;
+
+ v[0] = SpdReduceQuad(v[0]);
+ v[1] = SpdReduceQuad(v[1]);
+ v[2] = SpdReduceQuad(v[2]);
+ v[3] = SpdReduceQuad(v[3]);
+
+ if ((localInvocationIndex % 4) == 0)
+ {
+ SpdStore(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x / 2, y / 2), v[0], 1, slice);
+ SpdStoreIntermediate(x / 2, y / 2, v[0]);
+
+ SpdStore(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x / 2 + 8, y / 2), v[1], 1, slice);
+ SpdStoreIntermediate(x / 2 + 8, y / 2, v[1]);
+
+ SpdStore(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x / 2, y / 2 + 8), v[2], 1, slice);
+ SpdStoreIntermediate(x / 2, y / 2 + 8, v[2]);
+
+ SpdStore(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x / 2 + 8, y / 2 + 8), v[3], 1, slice);
+ SpdStoreIntermediate(x / 2 + 8, y / 2 + 8, v[3]);
+ }
+}
+
+void SpdDownsampleMips_0_1_LDS(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
+{
+ FfxFloat32x4 v[4];
+
+ FfxInt32x2 tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2, y * 2);
+ FfxInt32x2 pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x, y);
+ v[0] = SpdReduceLoadSourceImage(tex, slice);
+ SpdStore(pix, v[0], 0, slice);
+
+ tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2 + 32, y * 2);
+ pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x + 16, y);
+ v[1] = SpdReduceLoadSourceImage(tex, slice);
+ SpdStore(pix, v[1], 0, slice);
+
+ tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2, y * 2 + 32);
+ pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x, y + 16);
+ v[2] = SpdReduceLoadSourceImage(tex, slice);
+ SpdStore(pix, v[2], 0, slice);
+
+ tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2 + 32, y * 2 + 32);
+ pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x + 16, y + 16);
+ v[3] = SpdReduceLoadSourceImage(tex, slice);
+ SpdStore(pix, v[3], 0, slice);
+
+ if (mip <= 1)
+ return;
+
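+    // Without wave ops, reduce through LDS instead: stage each quadrant value
+    // in shared memory, let the first 64 threads reduce 2x2 LDS entries and
+    // write mip 1; barriers separate the write and read phases per iteration.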
+ for (FfxUInt32 i = 0; i < 4; i++)
+ {
+ SpdStoreIntermediate(x, y, v[i]);
+ SpdWorkgroupShuffleBarrier();
+ if (localInvocationIndex < 64)
+ {
+ v[i] = SpdReduceIntermediate(FfxUInt32x2(x * 2 + 0, y * 2 + 0), FfxUInt32x2(x * 2 + 1, y * 2 + 0), FfxUInt32x2(x * 2 + 0, y * 2 + 1), FfxUInt32x2(x * 2 + 1, y * 2 + 1));
+ SpdStore(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x + (i % 2) * 8, y + (i / 2) * 8), v[i], 1, slice);
+ }
+ SpdWorkgroupShuffleBarrier();
+ }
+
+ if (localInvocationIndex < 64)
+ {
+ SpdStoreIntermediate(x + 0, y + 0, v[0]);
+ SpdStoreIntermediate(x + 8, y + 0, v[1]);
+ SpdStoreIntermediate(x + 0, y + 8, v[2]);
+ SpdStoreIntermediate(x + 8, y + 8, v[3]);
+ }
+}
+
+void SpdDownsampleMips_0_1(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
+{
+#ifdef SPD_NO_WAVE_OPERATIONS
+ SpdDownsampleMips_0_1_LDS(x, y, workGroupID, localInvocationIndex, mip, slice);
+#else
+ SpdDownsampleMips_0_1_Intrinsics(x, y, workGroupID, localInvocationIndex, mip, slice);
+#endif
+}
+
+
+void SpdDownsampleMip_2(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
+{
+#ifdef SPD_NO_WAVE_OPERATIONS
+ if (localInvocationIndex < 64)
+ {
+ FfxFloat32x4 v = SpdReduceIntermediate(FfxUInt32x2(x * 2 + 0, y * 2 + 0), FfxUInt32x2(x * 2 + 1, y * 2 + 0), FfxUInt32x2(x * 2 + 0, y * 2 + 1), FfxUInt32x2(x * 2 + 1, y * 2 + 1));
+ SpdStore(FfxInt32x2(workGroupID.xy * 8) + FfxInt32x2(x, y), v, mip, slice);
+ // store to LDS, try to reduce bank conflicts
+ // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0
+ // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+ // 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0 x
+ // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+ // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0
+ // ...
+ // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0
+ SpdStoreIntermediate(x * 2 + y % 2, y * 2, v);
+ }
+#else
+ FfxFloat32x4 v = SpdLoadIntermediate(x, y);
+ v = SpdReduceQuad(v);
+ // quad index 0 stores result
+ if (localInvocationIndex % 4 == 0)
+ {
+ SpdStore(FfxInt32x2(workGroupID.xy * 8) + FfxInt32x2(x / 2, y / 2), v, mip, slice);
+ SpdStoreIntermediate(x + (y / 2) % 2, y, v);
+ }
+#endif
+}
+
+void SpdDownsampleMip_3(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
+{
+#ifdef SPD_NO_WAVE_OPERATIONS
+ if (localInvocationIndex < 16)
+ {
+ // x 0 x 0
+ // 0 0 0 0
+ // 0 x 0 x
+ // 0 0 0 0
+ FfxFloat32x4 v =
+ SpdReduceIntermediate(FfxUInt32x2(x * 4 + 0 + 0, y * 4 + 0), FfxUInt32x2(x * 4 + 2 + 0, y * 4 + 0), FfxUInt32x2(x * 4 + 0 + 1, y * 4 + 2), FfxUInt32x2(x * 4 + 2 + 1, y * 4 + 2));
+ SpdStore(FfxInt32x2(workGroupID.xy * 4) + FfxInt32x2(x, y), v, mip, slice);
+ // store to LDS
+ // x 0 0 0 x 0 0 0 x 0 0 0 x 0 0 0
+ // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+ // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+ // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+ // 0 x 0 0 0 x 0 0 0 x 0 0 0 x 0 0
+ // ...
+ // 0 0 x 0 0 0 x 0 0 0 x 0 0 0 x 0
+ // ...
+ // 0 0 0 x 0 0 0 x 0 0 0 x 0 0 0 x
+ // ...
+ SpdStoreIntermediate(x * 4 + y, y * 4, v);
+ }
+#else
+ if (localInvocationIndex < 64)
+ {
+ FfxFloat32x4 v = SpdLoadIntermediate(x * 2 + y % 2, y * 2);
+ v = SpdReduceQuad(v);
+ // quad index 0 stores result
+ if (localInvocationIndex % 4 == 0)
+ {
+ SpdStore(FfxInt32x2(workGroupID.xy * 4) + FfxInt32x2(x / 2, y / 2), v, mip, slice);
+ SpdStoreIntermediate(x * 2 + y / 2, y * 2, v);
+ }
+ }
+#endif
+}
+
+void SpdDownsampleMip_4(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
+{
+#ifdef SPD_NO_WAVE_OPERATIONS
+ if (localInvocationIndex < 4)
+ {
+ // x 0 0 0 x 0 0 0
+ // ...
+ // 0 x 0 0 0 x 0 0
+ FfxFloat32x4 v = SpdReduceIntermediate(FfxUInt32x2(x * 8 + 0 + 0 + y * 2, y * 8 + 0),
+ FfxUInt32x2(x * 8 + 4 + 0 + y * 2, y * 8 + 0),
+ FfxUInt32x2(x * 8 + 0 + 1 + y * 2, y * 8 + 4),
+ FfxUInt32x2(x * 8 + 4 + 1 + y * 2, y * 8 + 4));
+ SpdStore(FfxInt32x2(workGroupID.xy * 2) + FfxInt32x2(x, y), v, mip, slice);
+ // store to LDS
+ // x x x x 0 ...
+ // 0 ...
+ SpdStoreIntermediate(x + y * 2, 0, v);
+ }
+#else
+ if (localInvocationIndex < 16)
+ {
+ FfxFloat32x4 v = SpdLoadIntermediate(x * 4 + y, y * 4);
+ v = SpdReduceQuad(v);
+ // quad index 0 stores result
+ if (localInvocationIndex % 4 == 0)
+ {
+ SpdStore(FfxInt32x2(workGroupID.xy * 2) + FfxInt32x2(x / 2, y / 2), v, mip, slice);
+ SpdStoreIntermediate(x / 2 + y, 0, v);
+ }
+ }
+#endif
+}
+
+void SpdDownsampleMip_5(FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
+{
+#ifdef SPD_NO_WAVE_OPERATIONS
+ if (localInvocationIndex < 1)
+ {
+ // x x x x 0 ...
+ // 0 ...
+ FfxFloat32x4 v = SpdReduceIntermediate(FfxUInt32x2(0, 0), FfxUInt32x2(1, 0), FfxUInt32x2(2, 0), FfxUInt32x2(3, 0));
+ SpdStore(FfxInt32x2(workGroupID.xy), v, mip, slice);
+ }
+#else
+ if (localInvocationIndex < 4)
+ {
+ FfxFloat32x4 v = SpdLoadIntermediate(localInvocationIndex, 0);
+ v = SpdReduceQuad(v);
+ // quad index 0 stores result
+ if (localInvocationIndex % 4 == 0)
+ {
+ SpdStore(FfxInt32x2(workGroupID.xy), v, mip, slice);
+ }
+ }
+#endif
+}
+
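+// Only the last surviving workgroup runs mips 6 and 7: SpdLoad reads back the
+// mip 5 texels written by all workgroups, so the region left to reduce is at
+// most 64x64 texels.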
+void SpdDownsampleMips_6_7(FfxUInt32 x, FfxUInt32 y, FfxUInt32 mips, FfxUInt32 slice)
+{
+ FfxInt32x2 tex = FfxInt32x2(x * 4 + 0, y * 4 + 0);
+ FfxInt32x2 pix = FfxInt32x2(x * 2 + 0, y * 2 + 0);
+ FfxFloat32x4 v0 = SpdReduceLoad4(tex, slice);
+ SpdStore(pix, v0, 6, slice);
+
+ tex = FfxInt32x2(x * 4 + 2, y * 4 + 0);
+ pix = FfxInt32x2(x * 2 + 1, y * 2 + 0);
+ FfxFloat32x4 v1 = SpdReduceLoad4(tex, slice);
+ SpdStore(pix, v1, 6, slice);
+
+ tex = FfxInt32x2(x * 4 + 0, y * 4 + 2);
+ pix = FfxInt32x2(x * 2 + 0, y * 2 + 1);
+ FfxFloat32x4 v2 = SpdReduceLoad4(tex, slice);
+ SpdStore(pix, v2, 6, slice);
+
+ tex = FfxInt32x2(x * 4 + 2, y * 4 + 2);
+ pix = FfxInt32x2(x * 2 + 1, y * 2 + 1);
+ FfxFloat32x4 v3 = SpdReduceLoad4(tex, slice);
+ SpdStore(pix, v3, 6, slice);
+
+ if (mips <= 7)
+ return;
+    // no barrier needed: each thread consumes only values it produced itself
+
+ FfxFloat32x4 v = SpdReduce4(v0, v1, v2, v3);
+ SpdStore(FfxInt32x2(x, y), v, 7, slice);
+ SpdStoreIntermediate(x, y, v);
+}
+
+void SpdDownsampleNextFour(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 baseMip, FfxUInt32 mips, FfxUInt32 slice)
+{
+ if (mips <= baseMip)
+ return;
+ SpdWorkgroupShuffleBarrier();
+ SpdDownsampleMip_2(x, y, workGroupID, localInvocationIndex, baseMip, slice);
+
+ if (mips <= baseMip + 1)
+ return;
+ SpdWorkgroupShuffleBarrier();
+ SpdDownsampleMip_3(x, y, workGroupID, localInvocationIndex, baseMip + 1, slice);
+
+ if (mips <= baseMip + 2)
+ return;
+ SpdWorkgroupShuffleBarrier();
+ SpdDownsampleMip_4(x, y, workGroupID, localInvocationIndex, baseMip + 2, slice);
+
+ if (mips <= baseMip + 3)
+ return;
+ SpdWorkgroupShuffleBarrier();
+ SpdDownsampleMip_5(workGroupID, localInvocationIndex, baseMip + 3, slice);
+}
+
+void SpdDownsample(FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mips, FfxUInt32 numWorkGroups, FfxUInt32 slice)
+{
+ FfxUInt32x2 sub_xy = ffxRemapForWaveReduction(localInvocationIndex % 64);
+ FfxUInt32 x = sub_xy.x + 8 * ((localInvocationIndex >> 6) % 2);
+ FfxUInt32 y = sub_xy.y + 8 * ((localInvocationIndex >> 7));
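+    // ffxRemapForWaveReduction swizzles lane index 0..63 into an 8x8 coordinate
+    // so that each quad of consecutive lanes maps to a 2x2 texel block; bits 6
+    // and 7 of the invocation index select one of four 8x8 sub-tiles, yielding
+    // the 16x16 thread layout used below.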
+ SpdDownsampleMips_0_1(x, y, workGroupID, localInvocationIndex, mips, slice);
+
+ SpdDownsampleNextFour(x, y, workGroupID, localInvocationIndex, 2, mips, slice);
+
+ if (mips <= 6)
+ return;
+
+ if (SpdExitWorkgroup(numWorkGroups, localInvocationIndex, slice))
+ return;
+
+ SpdResetAtomicCounter(slice);
+
+    // After mip 6, only a single workgroup is left; it downsamples the remaining (up to 64x64) texels.
+ SpdDownsampleMips_6_7(x, y, mips, slice);
+
+ SpdDownsampleNextFour(x, y, FfxUInt32x2(0, 0), localInvocationIndex, 8, mips, slice);
+}
+
+void SpdDownsample(FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mips, FfxUInt32 numWorkGroups, FfxUInt32 slice, FfxUInt32x2 workGroupOffset)
+{
+ SpdDownsample(workGroupID + workGroupOffset, localInvocationIndex, mips, numWorkGroups, slice);
+}
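+
+// Illustrative GLSL entry point (a sketch; the constants and callbacks are
+// user defined and the names below are assumptions):
+//
+//   layout(local_size_x = 256) in;
+//   void main()
+//   {
+//       SpdDownsample(FfxUInt32x2(gl_WorkGroupID.xy), gl_LocalInvocationIndex,
+//                     mips, numWorkGroups, gl_WorkGroupID.z, workGroupOffset);
+//   }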
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+//==============================================================================================================================
+// PACKED VERSION
+//==============================================================================================================================
+
+#if FFX_HALF
+
+#ifdef FFX_GLSL
+#extension GL_EXT_shader_subgroup_extended_types_float16:require
+#endif
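+
+// The packed path mirrors the FP32 path with FfxFloat16x4 user callbacks
+// (SpdLoadSourceImageH, SpdLoadH, SpdStoreH, SpdLoadIntermediateH,
+// SpdStoreIntermediateH, SpdReduce4H). Illustrative averaging reduction
+// (a sketch, not mandated by this header):
+//
+//   FfxFloat16x4 SpdReduce4H(FfxFloat16x4 v0, FfxFloat16x4 v1, FfxFloat16x4 v2, FfxFloat16x4 v3)
+//   {
+//       return (v0 + v1 + v2 + v3) * FfxFloat16(0.25);
+//   }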
+
+FfxFloat16x4 SpdReduceQuadH(FfxFloat16x4 v)
+{
+#if defined(FFX_GLSL) && !defined(SPD_NO_WAVE_OPERATIONS)
+ FfxFloat16x4 v0 = v;
+ FfxFloat16x4 v1 = subgroupQuadSwapHorizontal(v);
+ FfxFloat16x4 v2 = subgroupQuadSwapVertical(v);
+ FfxFloat16x4 v3 = subgroupQuadSwapDiagonal(v);
+ return SpdReduce4H(v0, v1, v2, v3);
+#elif defined(FFX_HLSL) && !defined(SPD_NO_WAVE_OPERATIONS)
+ // requires SM6.0
+ FfxUInt32 quad = WaveGetLaneIndex() & (~0x3);
+ FfxFloat16x4 v0 = v;
+ FfxFloat16x4 v1 = WaveReadLaneAt(v, quad | 1);
+ FfxFloat16x4 v2 = WaveReadLaneAt(v, quad | 2);
+ FfxFloat16x4 v3 = WaveReadLaneAt(v, quad | 3);
+ return SpdReduce4H(v0, v1, v2, v3);
+/*
+ // if SM6.0 is not available, you can use the AMD shader intrinsics
+ // the AMD shader intrinsics are available in AMD GPU Services (AGS) library:
+ // https://gpuopen.com/amd-gpu-services-ags-library/
+ // works for DX11
+ FfxFloat16x4 v0 = v;
+ FfxFloat16x4 v1;
+ v1.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
+ v1.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
+ v1.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
+ v1.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
+ FfxFloat16x4 v2;
+ v2.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
+ v2.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
+ v2.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
+ v2.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
+ FfxFloat16x4 v3;
+ v3.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
+ v3.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
+ v3.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
+ v3.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
+ return SpdReduce4H(v0, v1, v2, v3);
+ */
+#endif
+ return FfxFloat16x4(0.0, 0.0, 0.0, 0.0);
+}
+
+FfxFloat16x4 SpdReduceIntermediateH(FfxUInt32x2 i0, FfxUInt32x2 i1, FfxUInt32x2 i2, FfxUInt32x2 i3)
+{
+ FfxFloat16x4 v0 = SpdLoadIntermediateH(i0.x, i0.y);
+ FfxFloat16x4 v1 = SpdLoadIntermediateH(i1.x, i1.y);
+ FfxFloat16x4 v2 = SpdLoadIntermediateH(i2.x, i2.y);
+ FfxFloat16x4 v3 = SpdLoadIntermediateH(i3.x, i3.y);
+ return SpdReduce4H(v0, v1, v2, v3);
+}
+
+FfxFloat16x4 SpdReduceLoad4H(FfxUInt32x2 i0, FfxUInt32x2 i1, FfxUInt32x2 i2, FfxUInt32x2 i3, FfxUInt32 slice)
+{
+ FfxFloat16x4 v0 = SpdLoadH(FfxInt32x2(i0), slice);
+ FfxFloat16x4 v1 = SpdLoadH(FfxInt32x2(i1), slice);
+ FfxFloat16x4 v2 = SpdLoadH(FfxInt32x2(i2), slice);
+ FfxFloat16x4 v3 = SpdLoadH(FfxInt32x2(i3), slice);
+ return SpdReduce4H(v0, v1, v2, v3);
+}
+
+FfxFloat16x4 SpdReduceLoad4H(FfxUInt32x2 base, FfxUInt32 slice)
+{
+ return SpdReduceLoad4H(FfxUInt32x2(base + FfxUInt32x2(0, 0)), FfxUInt32x2(base + FfxUInt32x2(0, 1)), FfxUInt32x2(base + FfxUInt32x2(1, 0)), FfxUInt32x2(base + FfxUInt32x2(1, 1)), slice);
+}
+
+FfxFloat16x4 SpdReduceLoadSourceImage4H(FfxUInt32x2 i0, FfxUInt32x2 i1, FfxUInt32x2 i2, FfxUInt32x2 i3, FfxUInt32 slice)
+{
+ FfxFloat16x4 v0 = SpdLoadSourceImageH(FfxInt32x2(i0), slice);
+ FfxFloat16x4 v1 = SpdLoadSourceImageH(FfxInt32x2(i1), slice);
+ FfxFloat16x4 v2 = SpdLoadSourceImageH(FfxInt32x2(i2), slice);
+ FfxFloat16x4 v3 = SpdLoadSourceImageH(FfxInt32x2(i3), slice);
+ return SpdReduce4H(v0, v1, v2, v3);
+}
+
+FfxFloat16x4 SpdReduceLoadSourceImageH(FfxUInt32x2 base, FfxUInt32 slice)
+{
+#ifdef SPD_LINEAR_SAMPLER
+ return SpdLoadSourceImageH(FfxInt32x2(base), slice);
+#else
+ return SpdReduceLoadSourceImage4H(FfxUInt32x2(base + FfxUInt32x2(0, 0)), FfxUInt32x2(base + FfxUInt32x2(0, 1)), FfxUInt32x2(base + FfxUInt32x2(1, 0)), FfxUInt32x2(base + FfxUInt32x2(1, 1)), slice);
+#endif
+}
+
+void SpdDownsampleMips_0_1_IntrinsicsH(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mips, FfxUInt32 slice)
+{
+ FfxFloat16x4 v[4];
+
+ FfxInt32x2 tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2, y * 2);
+ FfxInt32x2 pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x, y);
+ v[0] = SpdReduceLoadSourceImageH(tex, slice);
+ SpdStoreH(pix, v[0], 0, slice);
+
+ tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2 + 32, y * 2);
+ pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x + 16, y);
+ v[1] = SpdReduceLoadSourceImageH(tex, slice);
+ SpdStoreH(pix, v[1], 0, slice);
+
+ tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2, y * 2 + 32);
+ pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x, y + 16);
+ v[2] = SpdReduceLoadSourceImageH(tex, slice);
+ SpdStoreH(pix, v[2], 0, slice);
+
+ tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2 + 32, y * 2 + 32);
+ pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x + 16, y + 16);
+ v[3] = SpdReduceLoadSourceImageH(tex, slice);
+ SpdStoreH(pix, v[3], 0, slice);
+
+ if (mips <= 1)
+ return;
+
+ v[0] = SpdReduceQuadH(v[0]);
+ v[1] = SpdReduceQuadH(v[1]);
+ v[2] = SpdReduceQuadH(v[2]);
+ v[3] = SpdReduceQuadH(v[3]);
+
+ if ((localInvocationIndex % 4) == 0)
+ {
+ SpdStoreH(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x / 2, y / 2), v[0], 1, slice);
+ SpdStoreIntermediateH(x / 2, y / 2, v[0]);
+
+ SpdStoreH(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x / 2 + 8, y / 2), v[1], 1, slice);
+ SpdStoreIntermediateH(x / 2 + 8, y / 2, v[1]);
+
+ SpdStoreH(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x / 2, y / 2 + 8), v[2], 1, slice);
+ SpdStoreIntermediateH(x / 2, y / 2 + 8, v[2]);
+
+ SpdStoreH(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x / 2 + 8, y / 2 + 8), v[3], 1, slice);
+ SpdStoreIntermediateH(x / 2 + 8, y / 2 + 8, v[3]);
+ }
+}
+
+void SpdDownsampleMips_0_1_LDSH(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mips, FfxUInt32 slice)
+{
+ FfxFloat16x4 v[4];
+
+ FfxInt32x2 tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2, y * 2);
+ FfxInt32x2 pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x, y);
+ v[0] = SpdReduceLoadSourceImageH(tex, slice);
+ SpdStoreH(pix, v[0], 0, slice);
+
+ tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2 + 32, y * 2);
+ pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x + 16, y);
+ v[1] = SpdReduceLoadSourceImageH(tex, slice);
+ SpdStoreH(pix, v[1], 0, slice);
+
+ tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2, y * 2 + 32);
+ pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x, y + 16);
+ v[2] = SpdReduceLoadSourceImageH(tex, slice);
+ SpdStoreH(pix, v[2], 0, slice);
+
+ tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2 + 32, y * 2 + 32);
+ pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x + 16, y + 16);
+ v[3] = SpdReduceLoadSourceImageH(tex, slice);
+ SpdStoreH(pix, v[3], 0, slice);
+
+ if (mips <= 1)
+ return;
+
+ for (FfxInt32 i = 0; i < 4; i++)
+ {
+ SpdStoreIntermediateH(x, y, v[i]);
+ SpdWorkgroupShuffleBarrier();
+ if (localInvocationIndex < 64)
+ {
+ v[i] = SpdReduceIntermediateH(FfxUInt32x2(x * 2 + 0, y * 2 + 0), FfxUInt32x2(x * 2 + 1, y * 2 + 0), FfxUInt32x2(x * 2 + 0, y * 2 + 1), FfxUInt32x2(x * 2 + 1, y * 2 + 1));
+ SpdStoreH(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x + (i % 2) * 8, y + (i / 2) * 8), v[i], 1, slice);
+ }
+ SpdWorkgroupShuffleBarrier();
+ }
+
+ if (localInvocationIndex < 64)
+ {
+ SpdStoreIntermediateH(x + 0, y + 0, v[0]);
+ SpdStoreIntermediateH(x + 8, y + 0, v[1]);
+ SpdStoreIntermediateH(x + 0, y + 8, v[2]);
+ SpdStoreIntermediateH(x + 8, y + 8, v[3]);
+ }
+}
+
+void SpdDownsampleMips_0_1H(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mips, FfxUInt32 slice)
+{
+#ifdef SPD_NO_WAVE_OPERATIONS
+ SpdDownsampleMips_0_1_LDSH(x, y, workGroupID, localInvocationIndex, mips, slice);
+#else
+ SpdDownsampleMips_0_1_IntrinsicsH(x, y, workGroupID, localInvocationIndex, mips, slice);
+#endif
+}
+
+
+void SpdDownsampleMip_2H(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
+{
+#ifdef SPD_NO_WAVE_OPERATIONS
+ if (localInvocationIndex < 64)
+ {
+ FfxFloat16x4 v = SpdReduceIntermediateH(FfxUInt32x2(x * 2 + 0, y * 2 + 0), FfxUInt32x2(x * 2 + 1, y * 2 + 0), FfxUInt32x2(x * 2 + 0, y * 2 + 1), FfxUInt32x2(x * 2 + 1, y * 2 + 1));
+ SpdStoreH(FfxInt32x2(workGroupID.xy * 8) + FfxInt32x2(x, y), v, mip, slice);
+ // store to LDS, try to reduce bank conflicts
+ // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0
+ // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+ // 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0 x
+ // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+ // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0
+ // ...
+ // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0
+ SpdStoreIntermediateH(x * 2 + y % 2, y * 2, v);
+ }
+#else
+ FfxFloat16x4 v = SpdLoadIntermediateH(x, y);
+ v = SpdReduceQuadH(v);
+ // quad index 0 stores result
+ if (localInvocationIndex % 4 == 0)
+ {
+ SpdStoreH(FfxInt32x2(workGroupID.xy * 8) + FfxInt32x2(x / 2, y / 2), v, mip, slice);
+ SpdStoreIntermediateH(x + (y / 2) % 2, y, v);
+ }
+#endif
+}
+
+void SpdDownsampleMip_3H(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
+{
+#ifdef SPD_NO_WAVE_OPERATIONS
+ if (localInvocationIndex < 16)
+ {
+ // x 0 x 0
+ // 0 0 0 0
+ // 0 x 0 x
+ // 0 0 0 0
+ FfxFloat16x4 v =
+ SpdReduceIntermediateH(FfxUInt32x2(x * 4 + 0 + 0, y * 4 + 0), FfxUInt32x2(x * 4 + 2 + 0, y * 4 + 0), FfxUInt32x2(x * 4 + 0 + 1, y * 4 + 2), FfxUInt32x2(x * 4 + 2 + 1, y * 4 + 2));
+ SpdStoreH(FfxInt32x2(workGroupID.xy * 4) + FfxInt32x2(x, y), v, mip, slice);
+ // store to LDS
+ // x 0 0 0 x 0 0 0 x 0 0 0 x 0 0 0
+ // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+ // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+ // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+ // 0 x 0 0 0 x 0 0 0 x 0 0 0 x 0 0
+ // ...
+ // 0 0 x 0 0 0 x 0 0 0 x 0 0 0 x 0
+ // ...
+ // 0 0 0 x 0 0 0 x 0 0 0 x 0 0 0 x
+ // ...
+ SpdStoreIntermediateH(x * 4 + y, y * 4, v);
+ }
+#else
+ if (localInvocationIndex < 64)
+ {
+ FfxFloat16x4 v = SpdLoadIntermediateH(x * 2 + y % 2, y * 2);
+ v = SpdReduceQuadH(v);
+ // quad index 0 stores result
+ if (localInvocationIndex % 4 == 0)
+ {
+ SpdStoreH(FfxInt32x2(workGroupID.xy * 4) + FfxInt32x2(x / 2, y / 2), v, mip, slice);
+ SpdStoreIntermediateH(x * 2 + y / 2, y * 2, v);
+ }
+ }
+#endif
+}
+
+void SpdDownsampleMip_4H(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
+{
+#ifdef SPD_NO_WAVE_OPERATIONS
+ if (localInvocationIndex < 4)
+ {
+ // x 0 0 0 x 0 0 0
+ // ...
+ // 0 x 0 0 0 x 0 0
+ FfxFloat16x4 v = SpdReduceIntermediateH(FfxUInt32x2(x * 8 + 0 + 0 + y * 2, y * 8 + 0),
+ FfxUInt32x2(x * 8 + 4 + 0 + y * 2, y * 8 + 0),
+ FfxUInt32x2(x * 8 + 0 + 1 + y * 2, y * 8 + 4),
+ FfxUInt32x2(x * 8 + 4 + 1 + y * 2, y * 8 + 4));
+ SpdStoreH(FfxInt32x2(workGroupID.xy * 2) + FfxInt32x2(x, y), v, mip, slice);
+ // store to LDS
+ // x x x x 0 ...
+ // 0 ...
+ SpdStoreIntermediateH(x + y * 2, 0, v);
+ }
+#else
+ if (localInvocationIndex < 16)
+ {
+ FfxFloat16x4 v = SpdLoadIntermediateH(x * 4 + y, y * 4);
+ v = SpdReduceQuadH(v);
+ // quad index 0 stores result
+ if (localInvocationIndex % 4 == 0)
+ {
+ SpdStoreH(FfxInt32x2(workGroupID.xy * 2) + FfxInt32x2(x / 2, y / 2), v, mip, slice);
+ SpdStoreIntermediateH(x / 2 + y, 0, v);
+ }
+ }
+#endif
+}
+
+void SpdDownsampleMip_5H(FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
+{
+#ifdef SPD_NO_WAVE_OPERATIONS
+ if (localInvocationIndex < 1)
+ {
+ // x x x x 0 ...
+ // 0 ...
+ FfxFloat16x4 v = SpdReduceIntermediateH(FfxUInt32x2(0, 0), FfxUInt32x2(1, 0), FfxUInt32x2(2, 0), FfxUInt32x2(3, 0));
+ SpdStoreH(FfxInt32x2(workGroupID.xy), v, mip, slice);
+ }
+#else
+ if (localInvocationIndex < 4)
+ {
+ FfxFloat16x4 v = SpdLoadIntermediateH(localInvocationIndex, 0);
+ v = SpdReduceQuadH(v);
+ // quad index 0 stores result
+ if (localInvocationIndex % 4 == 0)
+ {
+ SpdStoreH(FfxInt32x2(workGroupID.xy), v, mip, slice);
+ }
+ }
+#endif
+}
+
+void SpdDownsampleMips_6_7H(FfxUInt32 x, FfxUInt32 y, FfxUInt32 mips, FfxUInt32 slice)
+{
+ FfxInt32x2 tex = FfxInt32x2(x * 4 + 0, y * 4 + 0);
+ FfxInt32x2 pix = FfxInt32x2(x * 2 + 0, y * 2 + 0);
+ FfxFloat16x4 v0 = SpdReduceLoad4H(tex, slice);
+ SpdStoreH(pix, v0, 6, slice);
+
+ tex = FfxInt32x2(x * 4 + 2, y * 4 + 0);
+ pix = FfxInt32x2(x * 2 + 1, y * 2 + 0);
+ FfxFloat16x4 v1 = SpdReduceLoad4H(tex, slice);
+ SpdStoreH(pix, v1, 6, slice);
+
+ tex = FfxInt32x2(x * 4 + 0, y * 4 + 2);
+ pix = FfxInt32x2(x * 2 + 0, y * 2 + 1);
+ FfxFloat16x4 v2 = SpdReduceLoad4H(tex, slice);
+ SpdStoreH(pix, v2, 6, slice);
+
+ tex = FfxInt32x2(x * 4 + 2, y * 4 + 2);
+ pix = FfxInt32x2(x * 2 + 1, y * 2 + 1);
+ FfxFloat16x4 v3 = SpdReduceLoad4H(tex, slice);
+ SpdStoreH(pix, v3, 6, slice);
+
+ if (mips < 8)
+ return;
+    // no barrier needed: each thread consumes only values it produced itself
+
+ FfxFloat16x4 v = SpdReduce4H(v0, v1, v2, v3);
+ SpdStoreH(FfxInt32x2(x, y), v, 7, slice);
+ SpdStoreIntermediateH(x, y, v);
+}
+
+void SpdDownsampleNextFourH(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 baseMip, FfxUInt32 mips, FfxUInt32 slice)
+{
+ if (mips <= baseMip)
+ return;
+ SpdWorkgroupShuffleBarrier();
+ SpdDownsampleMip_2H(x, y, workGroupID, localInvocationIndex, baseMip, slice);
+
+ if (mips <= baseMip + 1)
+ return;
+ SpdWorkgroupShuffleBarrier();
+ SpdDownsampleMip_3H(x, y, workGroupID, localInvocationIndex, baseMip + 1, slice);
+
+ if (mips <= baseMip + 2)
+ return;
+ SpdWorkgroupShuffleBarrier();
+ SpdDownsampleMip_4H(x, y, workGroupID, localInvocationIndex, baseMip + 2, slice);
+
+ if (mips <= baseMip + 3)
+ return;
+ SpdWorkgroupShuffleBarrier();
+ SpdDownsampleMip_5H(workGroupID, localInvocationIndex, baseMip + 3, slice);
+}
+
+void SpdDownsampleH(FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mips, FfxUInt32 numWorkGroups, FfxUInt32 slice)
+{
+ FfxUInt32x2 sub_xy = ffxRemapForWaveReduction(localInvocationIndex % 64);
+ FfxUInt32 x = sub_xy.x + 8 * ((localInvocationIndex >> 6) % 2);
+ FfxUInt32 y = sub_xy.y + 8 * ((localInvocationIndex >> 7));
+
+ SpdDownsampleMips_0_1H(x, y, workGroupID, localInvocationIndex, mips, slice);
+
+ SpdDownsampleNextFourH(x, y, workGroupID, localInvocationIndex, 2, mips, slice);
+
+ if (mips < 7)
+ return;
+
+ if (SpdExitWorkgroup(numWorkGroups, localInvocationIndex, slice))
+ return;
+
+ SpdResetAtomicCounter(slice);
+
+    // After mip 6, only a single workgroup is left; it downsamples the remaining (up to 64x64) texels.
+ SpdDownsampleMips_6_7H(x, y, mips, slice);
+
+ SpdDownsampleNextFourH(x, y, FfxUInt32x2(0, 0), localInvocationIndex, 8, mips, slice);
+}
+
+void SpdDownsampleH(FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mips, FfxUInt32 numWorkGroups, FfxUInt32 slice, FfxUInt32x2 workGroupOffset)
+{
+ SpdDownsampleH(workGroupID + workGroupOffset, localInvocationIndex, mips, numWorkGroups, slice);
+}
+
+#endif // #if FFX_HALF
+#endif // #ifdef FFX_GPU