1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
|
#[versions]
unsigned = "";
signed = "#define SNORM";
#[compute]
#version 450
#include "CrossPlatformSettings_piece_all.glsl"
#include "UavCrossPlatform_piece_all.glsl"
#VERSION_DEFINES
// NOTE(review): the "signed" version above defines SNORM, which main() uses to choose between
// packSnorm4x8 and packUnorm4x8 for the endpoints; presumably the define is injected at
// #VERSION_DEFINES -- confirm against the shader build tooling.

// Per-invocation (min, max) pairs: one float2 for each of the 4 * 4 * 4 invocations of a
// work group. main() stores its own pair here and, after a barrier, reads the 4 pairs that
// share its gl_LocalInvocationID.yz.
shared float2 g_minMaxValues[4u * 4u * 4u];
// Two 32-bit words per gl_LocalInvocationID.yz pair (4 * 4 per work group). main() clears the
// entry and then merges index bits into it with atomicOr.
shared uint2 g_mask[4u * 4u];
// Source texture; main() fetches texels from it through OGRE_Load2D with a last argument of 0.
layout(binding = 0) uniform sampler2D srcTex;
// Destination image, write-only. main() stores one uint4 (x, y used; z, w zero) per
// gl_GlobalInvocationID.yz coordinate.
layout(binding = 1, rg32ui) uniform restrict writeonly uimage2D dstTexture;
layout(push_constant, std430) uniform Params {
// Channel of the fetched texel to encode: 0 selects .x, 1 selects .y, any other value selects .w.
uint p_channelIdx;
// Not read by this shader; presumably pads the push-constant block to 16 bytes -- TODO confirm.
uint p_padding[3];
}
params;
// 4 x 4 x 4 invocations per work group. In main(), x is the thread index inside a block and
// y/z select which block the thread belongs to.
layout(local_size_x = 4, //
local_size_y = 4, //
local_size_z = 4) in;
/// Encodes one channel of srcTex into 64-bit blocks and stores one rg32ui texel of dstTexture
/// per block. Bits 0-7 hold the max endpoint, bits 8-15 the min endpoint, and bits 16-63 hold
/// sixteen 3-bit indices.
///
/// Work distribution:
///  - A block is 4x4 = 16 pixels and is handled by 4 threads (gl_LocalInvocationID.x).
///  - Each thread works on one row of 4 pixels and produces 2 partial masks.
///  - The 8 partial masks of a block are merged into 2 words in shared memory; the thread with
///    gl_LocalInvocationID.x == 0 then writes the result to the output.
///
/// **Q: Why 4 pixels per thread? Why not 1, 2 or 16?**
///
/// A: It's a sweet spot.
///  - Very short threads cannot fill expensive GPUs with enough work (dispatch bound).
///  - Lots of threads mean lots of synchronization overhead (e.g. evaluating min/max, merging
///    masks) and more LDS usage, which reduces occupancy.
///  - Long threads (e.g. 1 thread per block) miss parallelism opportunities.
void main() {
	const uint rowInBlock = gl_LocalInvocationID.x;
	const uint2 blockOrigin = gl_GlobalInvocationID.yz << 2u;

	// Fetch this thread's row of 4 texels, keep the selected channel and multiply it by 255.
	float4 rowValues;
	for (uint col = 0u; col < 4u; ++col) {
		const uint2 texel = blockOrigin + uint2(col, rowInBlock);
		const float4 rgba = OGRE_Load2D(srcTex, int2(texel), 0).xyzw;

		float channel;
		if (params.p_channelIdx == 0) {
			channel = rgba.x;
		} else if (params.p_channelIdx == 1) {
			channel = rgba.y;
		} else {
			channel = rgba.w;
		}
		rowValues[col] = channel * 255.0f;
	}

	// Range of this thread's own row.
	float lowest = min(min3(rowValues.x, rowValues.y, rowValues.z), rowValues.w);
	float highest = max(max3(rowValues.x, rowValues.y, rowValues.z), rowValues.w);

	// One g_mask entry per block of the work group, four g_minMaxValues entries per block.
	const uint blockSlot = (gl_LocalInvocationID.z << 2u) + gl_LocalInvocationID.y;
	const uint rangeSlot = blockSlot << 2u;

	g_minMaxValues[rangeSlot + rowInBlock] = float2(lowest, highest);
	g_mask[blockSlot] = uint2(0u, 0u);

	memoryBarrierShared();
	barrier();

	// Every thread of the block derives the block-wide range from what all 4 threads published.
	for (uint t = 0u; t < 4u; ++t) {
		const float2 rowRange = g_minMaxValues[rangeSlot + t];
		lowest = min(rowRange.x, lowest);
		highest = max(rowRange.y, highest);
	}

	// Determine the bias and emit the indices. Given the choice of highest/lowest, these
	// indices are optimal:
	// http://fgiesen.wordpress.com/2009/12/15/dxt5-alpha-block-index-determination/
	const float span = highest - lowest;
	const float span2 = span * 2.0f;
	const float span4 = span * 4.0f;

	float bias;
	if (span < 8.0f) {
		bias = span - 1.0f;
	} else {
		bias = trunc(span * 0.5f) + 2.0f;
	}
	bias -= lowest * 7.0f;

	uint maskLo = 0u;
	uint maskHi = 0u;

	for (uint col = 0u; col < 4u; ++col) {
		float scaled = rowValues[col] * 7.0f + bias;

		// Build a "linear scale" position between 0 (value == lowest) and 7 (value == highest),
		// one binary digit at a time.
		int idx = 0;
		if (scaled >= span4) {
			idx = 4;
			scaled -= span4;
		}
		if (scaled >= span2) {
			idx += 2;
			scaled -= span2;
		}
		if (scaled >= span) {
			idx += 1;
		}

		// Turn the linear position into the DXT index order (0 and 1 are the extremal points).
		idx = (-idx) & 7;
		if (idx < 2) {
			idx ^= 1;
		}

		// Indices start at bit 16 (after the two endpoint bytes), 3 bits per pixel.
		const uint bitPos = 16u + ((rowInBlock << 2u) + col) * 3u;
		const uint code = uint(idx);
		if (bitPos >= 32u) {
			maskHi |= code << (bitPos - 32u);
		} else {
			maskLo |= code << bitPos;
			// An index starting above bit 29 spills its upper bits into the second word.
			if (bitPos > 29u) {
				maskHi |= code >> (32u - bitPos);
			}
		}
	}

	// Merge this thread's partial masks into the block's shared pair.
	if (maskLo != 0u) {
		atomicOr(g_mask[blockSlot].x, maskLo);
	}
	if (maskHi != 0u) {
		atomicOr(g_mask[blockSlot].y, maskHi);
	}

	memoryBarrierShared();
	barrier();

	// Only one thread per block writes the result. No barrier follows, so returning is safe.
	if (rowInBlock != 0u) {
		return;
	}

	// Endpoints go into the two lowest bytes: highest first, then lowest.
	uint2 blockWords;
#ifdef SNORM
	blockWords.x = packSnorm4x8(float4(highest * (1.0f / 255.0f) * 2.0f - 1.0f,
			lowest * (1.0f / 255.0f) * 2.0f - 1.0f, 0.0f, 0.0f));
#else
	blockWords.x = packUnorm4x8(
			float4(highest * (1.0f / 255.0f), lowest * (1.0f / 255.0f), 0.0f, 0.0f));
#endif
	blockWords.x |= g_mask[blockSlot].x;
	blockWords.y = g_mask[blockSlot].y;

	const uint2 blockCoord = gl_GlobalInvocationID.yz;
	imageStore(dstTexture, int2(blockCoord), uint4(blockWords.xy, 0u, 0u));
}
|