1 /// BC7 encoding image loading.
2 /// D translation of bc7enc16 d3b037f33b8c6df184177a0ae6a0f4cfec1434ad
3 module gamut.codecs.bc7enc16;
4 
5 import core.stdc.string: memset, memcpy;
6 import std.math: abs, sqrt, floor;
7 import gamut.internals.mutex;
8 
9 // File: bc7enc16.h - Richard Geldreich, Jr. - MIT license or public domain (see end of bc7enc16.c)
10 
11 enum BC7ENC16_BLOCK_SIZE = 16;
12 enum BC7ENC16_MAX_PARTITIONS1 = 64;
13 enum BC7ENC16_MAX_UBER_LEVEL = 4;
14 
15 alias bc7enc16_bool = ubyte;
16 enum BC7ENC16_TRUE = 1;
17 enum BC7ENC16_FALSE = 0;
18 
19 nothrow @nogc @safe:
20 
/// Settings that control how a block is compressed: mode 1 partition search
/// breadth, per-channel error weights, effort level and error metric.
struct bc7enc16_compress_block_params
{
    // m_max_partitions_mode1 may range from 0 (disables mode 1) to BC7ENC16_MAX_PARTITIONS1. The higher this value, the slower the compressor, but the higher the quality.
    uint m_max_partitions_mode1;
    
    // Relative RGBA or YCbCrA weights (which of the two depends on m_perceptual).
    uint[4] m_weights;
    
    // m_uber_level may range from 0 to BC7ENC16_MAX_UBER_LEVEL. The higher this value, the slower the compressor, but the higher the quality.
    uint m_uber_level;

    // If m_perceptual is true, colorspace error is computed in YCbCr space, otherwise RGB.
    bc7enc16_bool m_perceptual;

    // Set m_try_least_squares to false for slightly faster/lower quality compression.
    bc7enc16_bool m_try_least_squares;
    
    // When m_mode1_partition_estimation_filterbank is true, the mode 1 partition estimator skips lesser used partition patterns unless they are strongly predicted to be potentially useful.
    // There's a slight loss in quality with this enabled (around .08 dB RGB PSNR or .05 dB Y PSNR), but up to an 11% gain in speed depending on the other settings.
    bc7enc16_bool m_mode1_partition_estimation_filterbank;
}
42 
/// Switches `p` to the plain RGB error metric: clears m_perceptual and gives
/// each of the four channels a weight of 1. Other fields are left untouched.
void bc7enc16_compress_block_params_init_linear_weights(bc7enc16_compress_block_params *p) pure
{
    foreach (ref weight; p.m_weights)
        weight = 1;
    p.m_perceptual = BC7ENC16_FALSE;
}
51 
/// Switches `p` to the perceptual error metric: sets m_perceptual and loads
/// the channel weights 128, 64, 16, 32. Other fields are left untouched.
void bc7enc16_compress_block_params_init_perceptual_weights(bc7enc16_compress_block_params *p) pure
{
    static immutable uint[4] perceptualWeights = [ 128, 64, 16, 32 ];
    p.m_weights = perceptualWeights;
    p.m_perceptual = BC7ENC16_TRUE;
}
60 
/// Fills `p` with the default settings: perceptual weights, all mode 1
/// partitions allowed, least squares and the partition filterbank enabled,
/// uber level 0.
void bc7enc16_compress_block_params_init(bc7enc16_compress_block_params *p) pure
{
    // Weights and the perceptual flag come from the perceptual preset.
    bc7enc16_compress_block_params_init_perceptual_weights(p);

    p.m_uber_level = 0;
    p.m_mode1_partition_estimation_filterbank = BC7ENC16_TRUE;
    p.m_try_least_squares = BC7ENC16_TRUE;
    p.m_max_partitions_mode1 = BC7ENC16_MAX_PARTITIONS1;
}
69 
70 
71 // File: bc7enc16.c - Richard Geldreich, Jr. 4/2018 - MIT license or public domain (see end of file)
72 
73 // Helpers
/// Limits `value` to the range [low, high]. The lower bound is tested first,
/// so `low` wins whenever `value < low`, even if `low > high`.
int clampi(int value, int low, int high) pure
{
    if (value < low)
        return low;
    if (value > high)
        return high;
    return value;
}
82 
/// Limits `value` to the range [low, high]. The lower bound is tested first.
/// A NaN input fails both comparisons and is returned unchanged.
float clampf(float value, float low, float high) pure
{
    return (value < low) ? low : ((value > high) ? high : value);
}
91 
/// Limits `value` to the unit interval [0, 1].
float saturate(float value) pure
{
    // Same comparisons, in the same order, as clampf(value, 0, 1.0f).
    if (value < 0)
        return 0;
    if (value > 1.0f)
        return 1.0f;
    return value;
}
96 
/// Returns the smaller of two unsigned bytes.
ubyte minimumub(ubyte a, ubyte b) pure
{
    if (a < b)
        return a;
    return b;
}
101 
/// Returns the smaller of two unsigned integers.
uint minimumu(uint a, uint b) pure
{
    if (a < b)
        return a;
    return b;
}
106 
/// Returns `a` when `a < b`, otherwise `b` (so `b` is returned if either is NaN).
float minimumf(float a, float b) pure
{
    if (a < b)
        return a;
    return b;
}
111 
/// Returns the larger of two unsigned bytes.
ubyte maximumub(ubyte a, ubyte b) pure
{
    if (a > b)
        return a;
    return b;
}
116 
/// Returns the larger of two unsigned integers.
uint maximumu(uint a, uint b) pure
{
    if (a > b)
        return a;
    return b;
}
121 
/// Returns `a` when `a > b`, otherwise `b` (so `b` is returned if either is NaN).
float maximumf(float a, float b) pure
{
    if (a > b)
        return a;
    return b;
}
126 
/// Returns `i` multiplied by itself (plain int arithmetic, no overflow check).
int squarei(int i) pure
{
    const int squared = i * i;
    return squared;
}
131 
/// Returns `i` multiplied by itself.
float squaref(float i) pure
{
    const float squared = i * i;
    return squared;
}
136 
/// A color made of four 8-bit components. The setters in this file store
/// them in the order r, g, b, a (indices 0..3).
struct color_quad_u8 
{ 
    ubyte[4] m_c; 
}
141 
/// A vector of four single-precision floats.
/// Note: D default-initializes float elements to NaN, so a fresh vec4F must
/// be filled before it is read.
struct vec4F 
{ 
    float[4] m_c; 
}
146 
/// Stores r, g, b, a into `*pRes`, clamping each to [0, 255] first.
/// Returns `pRes` so calls can be chained.
color_quad_u8 *color_quad_u8_set_clamped(color_quad_u8 *pRes, int r, int g, int b, int a) pure @system
{
    const int[4] channels = [ r, g, b, a ];
    foreach (idx, channel; channels)
        pRes.m_c[idx] = cast(ubyte)clampi(channel, 0, 255);
    return pRes;
}
155 
/// Stores r, g, b, a into `*pRes` without clamping; each value must already
/// be in [0, 255] (asserted). Returns `pRes` so calls can be chained.
color_quad_u8 *color_quad_u8_set(color_quad_u8 *pRes, int r, int g, int b, int a) pure @system
{
    // OR-ing the inputs together exceeds 255 as soon as any one of them has a
    // bit above bit 7 set, which also covers negative values.
    assert(cast(uint)(r | g | b | a) <= 255);

    const int[4] channels = [ r, g, b, a ];
    foreach (idx, channel; channels)
        pRes.m_c[idx] = cast(ubyte)channel;
    return pRes;
}
165 
/// Returns BC7ENC16_TRUE (1) if the two colors differ in any of their four
/// components, BC7ENC16_FALSE (0) if they are identical.
bc7enc16_bool color_quad_u8_notequals(ref const(color_quad_u8) pLHS, ref const(color_quad_u8) pRHS) pure
{
    foreach (i; 0 .. 4)
    {
        if (pLHS.m_c[i] != pRHS.m_c[i])
            return BC7ENC16_TRUE;
    }
    return BC7ENC16_FALSE;
}
173 
/// Sets all four components of `*pV` to `x` and returns `pV`.
vec4F* vec4F_set_scalar(vec4F *pV, float x) pure
{
    pV.m_c[] = x;
    return pV;
}
182 
/// Sets the components of `*pV` to (x, y, z, w) and returns `pV`.
vec4F* vec4F_set(vec4F *pV, float x, float y, float z, float w) pure
{
    const float[4] components = [ x, y, z, w ];
    pV.m_c = components;
    return pV;
}
191 
/// Clamps every component of `pV` to [0, 1], modifying it in place.
void vec4F_saturate_in_place(ref vec4F pV) pure
{
    foreach (ref component; pV.m_c)
        component = saturate(component);
}
199 
/// Returns a copy of `*pV` with every component clamped to [0, 1].
vec4F vec4F_saturate(const(vec4F)* pV) pure
{
    vec4F res;
    foreach (idx, component; pV.m_c)
        res.m_c[idx] = saturate(component);
    return res;
}
209 
/// Converts an 8-bit color to a float vector, component by component
/// (values stay in the 0..255 range; no normalization is applied).
vec4F vec4F_from_color(const(color_quad_u8)* pC) pure @trusted
{
    vec4F res;
    foreach (idx, channel; pC.m_c)
        res.m_c[idx] = channel;
    return res;
}
216 
/// Returns the component-wise sum `*pLHS + *pRHS`.
vec4F vec4F_add(const(vec4F)* pLHS, const(vec4F)* pRHS) pure @trusted
{
    vec4F res;
    foreach (i; 0 .. 4)
        res.m_c[i] = pLHS.m_c[i] + pRHS.m_c[i];
    return res;
}
224 
/// Returns the component-wise difference `*pLHS - *pRHS`.
vec4F vec4F_sub(const(vec4F)* pLHS, const(vec4F)* pRHS) pure @trusted
{
    vec4F res;
    foreach (i; 0 .. 4)
        res.m_c[i] = pLHS.m_c[i] - pRHS.m_c[i];
    return res;
}
232 
/// Returns the 4-component dot product of `*pLHS` and `*pRHS`.
float vec4F_dot(const(vec4F)* pLHS, const(vec4F)* pRHS) pure
{
    const(float)[] lhs = pLHS.m_c[];
    const(float)[] rhs = pRHS.m_c[];

    // Kept as one left-to-right expression so the floating-point summation
    // order stays x, y, z, w.
    return lhs[0] * rhs[0] + lhs[1] * rhs[1] + lhs[2] * rhs[2] + lhs[3] * rhs[3];
}
238 
/// Returns `*pLHS` with every component multiplied by the scalar `s`.
vec4F vec4F_mul(const(vec4F)* pLHS, float s) pure @trusted
{
    vec4F res;
    foreach (idx, component; pLHS.m_c)
        res.m_c[idx] = component * s;
    return res;
}
245 
/// Scales `*pV` to unit length in place and returns `pV`.
/// A vector whose squared length is exactly zero is left unchanged.
vec4F* vec4F_normalize_in_place(vec4F *pV) pure
{
    const float lengthSquared = pV.m_c[0] * pV.m_c[0] + pV.m_c[1] * pV.m_c[1] + pV.m_c[2] * pV.m_c[2] + pV.m_c[3] * pV.m_c[3];
    if (lengthSquared == 0.0f)
        return pV;

    const float invLength = 1.0f / sqrt(lengthSquared);
    foreach (ref component; pV.m_c)
        component *= invLength;
    return pV;
}
259 
// Various BC7 tables

// Interpolation weights (out of 64) for 3-bit and 4-bit selectors: selector s
// blends the endpoints as (low * (64 - w[s]) + high * w[s] + 32) >> 6.
static immutable uint[8] g_bc7_weights3 = [ 0, 9, 18, 27, 37, 46, 55, 64 ];
static immutable uint[16] g_bc7_weights4 = [ 0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64 ];
// Precomputed weight constants used during least squares endpoint fitting. For each entry of the matching g_bc7_weights table, with w normalized to [0,1] (weight / 64):
// w * w, (1.0f - w) * w, (1.0f - w) * (1.0f - w), w
static immutable float[8 * 4] g_bc7_weights3x = 
[ 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.019775f, 0.120850f, 0.738525f, 0.140625f, 
  0.079102f, 0.202148f, 0.516602f, 0.281250f, 0.177979f, 0.243896f, 0.334229f, 0.421875f, 
  0.334229f, 0.243896f, 0.177979f, 0.578125f, 0.516602f, 0.202148f, 0.079102f, 0.718750f, 
  0.738525f, 0.120850f, 0.019775f, 0.859375f, 1.000000f, 0.000000f, 0.000000f, 1.000000f ];

static immutable float[16 * 4] g_bc7_weights4x = 
[ 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.003906f, 0.058594f, 0.878906f, 0.062500f, 
  0.019775f, 0.120850f, 0.738525f, 0.140625f, 0.041260f, 0.161865f, 0.635010f, 0.203125f, 
  0.070557f, 0.195068f, 0.539307f, 0.265625f, 0.107666f, 0.220459f, 0.451416f, 0.328125f, 
  0.165039f, 0.241211f, 0.352539f, 0.406250f, 0.219727f, 0.249023f, 0.282227f, 0.468750f, 
  0.282227f, 0.249023f, 0.219727f, 0.531250f, 0.352539f, 0.241211f, 0.165039f, 0.593750f, 
  0.451416f, 0.220459f, 0.107666f, 0.671875f, 0.539307f, 0.195068f, 0.070557f, 0.734375f,
  0.635010f, 0.161865f, 0.041260f, 0.796875f, 0.738525f, 0.120850f, 0.019775f, 0.859375f, 
  0.878906f, 0.058594f, 0.003906f, 0.937500f, 1.000000f, 0.000000f, 0.000000f, 1.000000f ];

// All-zero table, presumably the subset index per texel for single-subset modes.
// NOTE(review): declared with 64 elements but only 16 initializers are listed;
// presumably the intended length is 16 (one entry per texel of a 4x4 block) — confirm
// against the upstream C source before changing the type.
static immutable ubyte[64] g_bc7_partition1 = [ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 ];
// 64 groups of 16 entries, each entry 0 or 1. Presumably one two-subset
// partition pattern per group, giving the subset index of each texel of a
// 4x4 block — confirm against the BC7 specification.
static immutable ubyte[64*16] g_bc7_partition2 =
[
    0,0,1,1,0,0,1,1,0,0,1,1,0,0,1,1,        0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,        0,1,1,1,0,1,1,1,0,1,1,1,0,1,1,1,        0,0,0,1,0,0,1,1,0,0,1,1,0,1,1,1,        0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,        0,0,1,1,0,1,1,1,0,1,1,1,1,1,1,1,        0,0,0,1,0,0,1,1,0,1,1,1,1,1,1,1,        0,0,0,0,0,0,0,1,0,0,1,1,0,1,1,1,
    0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,        0,0,1,1,0,1,1,1,1,1,1,1,1,1,1,1,        0,0,0,0,0,0,0,1,0,1,1,1,1,1,1,1,        0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,1,        0,0,0,1,0,1,1,1,1,1,1,1,1,1,1,1,        0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,        0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,        0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,
    0,0,0,0,1,0,0,0,1,1,1,0,1,1,1,1,        0,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,        0,0,0,0,0,0,0,0,1,0,0,0,1,1,1,0,        0,1,1,1,0,0,1,1,0,0,0,1,0,0,0,0,        0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,        0,0,0,0,1,0,0,0,1,1,0,0,1,1,1,0,        0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,        0,1,1,1,0,0,1,1,0,0,1,1,0,0,0,1,
    0,0,1,1,0,0,0,1,0,0,0,1,0,0,0,0,        0,0,0,0,1,0,0,0,1,0,0,0,1,1,0,0,        0,1,1,0,0,1,1,0,0,1,1,0,0,1,1,0,        0,0,1,1,0,1,1,0,0,1,1,0,1,1,0,0,        0,0,0,1,0,1,1,1,1,1,1,0,1,0,0,0,        0,0,0,0,1,1,1,1,1,1,1,1,0,0,0,0,        0,1,1,1,0,0,0,1,1,0,0,0,1,1,1,0,        0,0,1,1,1,0,0,1,1,0,0,1,1,1,0,0,
    0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,        0,0,0,0,1,1,1,1,0,0,0,0,1,1,1,1,        0,1,0,1,1,0,1,0,0,1,0,1,1,0,1,0,        0,0,1,1,0,0,1,1,1,1,0,0,1,1,0,0,        0,0,1,1,1,1,0,0,0,0,1,1,1,1,0,0,        0,1,0,1,0,1,0,1,1,0,1,0,1,0,1,0,        0,1,1,0,1,0,0,1,0,1,1,0,1,0,0,1,        0,1,0,1,1,0,1,0,1,0,1,0,0,1,0,1,
    0,1,1,1,0,0,1,1,1,1,0,0,1,1,1,0,        0,0,0,1,0,0,1,1,1,1,0,0,1,0,0,0,        0,0,1,1,0,0,1,0,0,1,0,0,1,1,0,0,        0,0,1,1,1,0,1,1,1,1,0,1,1,1,0,0,        0,1,1,0,1,0,0,1,1,0,0,1,0,1,1,0,        0,0,1,1,1,1,0,0,1,1,0,0,0,0,1,1,        0,1,1,0,0,1,1,0,1,0,0,1,1,0,0,1,        0,0,0,0,0,1,1,0,0,1,1,0,0,0,0,0,
    0,1,0,0,1,1,1,0,0,1,0,0,0,0,0,0,        0,0,1,0,0,1,1,1,0,0,1,0,0,0,0,0,        0,0,0,0,0,0,1,0,0,1,1,1,0,0,1,0,        0,0,0,0,0,1,0,0,1,1,1,0,0,1,0,0,        0,1,1,0,1,1,0,0,1,0,0,1,0,0,1,1,        0,0,1,1,0,1,1,0,1,1,0,0,1,0,0,1,        0,1,1,0,0,0,1,1,1,0,0,1,1,1,0,0,        0,0,1,1,1,0,0,1,1,1,0,0,0,1,1,0,
    0,1,1,0,1,1,0,0,1,1,0,0,1,0,0,1,        0,1,1,0,0,0,1,1,0,0,1,1,1,0,0,1,        0,1,1,1,1,1,1,0,1,0,0,0,0,0,0,1,        0,0,0,1,1,0,0,0,1,1,1,0,0,1,1,1,        0,0,0,0,1,1,1,1,0,0,1,1,0,0,1,1,        0,0,1,1,0,0,1,1,1,1,1,1,0,0,0,0,        0,0,1,0,0,0,1,0,1,1,1,0,1,1,1,0,        0,1,0,0,0,1,0,0,0,1,1,1,0,1,1,1
];
292 
// 64 entries with values in 0..15. Presumably, for each two-subset partition
// pattern, the texel index that acts as the anchor of the second subset —
// confirm against the BC7 specification.
static immutable ubyte[64] g_bc7_table_anchor_index_second_subset = 
    [ 15,15,15,15,15,15,15,15,        15,15,15,15,15,15,15,15,
      15, 2, 8, 2, 2, 8, 8,15,        2, 8, 2, 2, 8, 8, 2, 2,
      15,15, 6, 8, 2, 8,15,15,        2, 8, 2, 2, 2,15,15, 6,
       6, 2, 6, 8,15,15, 2, 2,        15,15,15,15,15, 2, 2,15 ];

// Eight-entry tables; g_bc7_color_index_bitcount is indexed by BC7 mode in
// get_bc7_color_index_size, and the other two presumably are as well.
static immutable ubyte[8] g_bc7_num_subsets = [ 3, 2, 3, 2, 1, 1, 1, 2 ];
static immutable ubyte[8] g_bc7_partition_bits = [ 4, 6, 6, 6, 0, 0, 0, 6 ];
static immutable ubyte[8] g_bc7_color_index_bitcount = [ 3, 3, 2, 2, 2, 2, 4, 2 ];
302 
/// Returns the color index bit count for `mode` (from
/// g_bc7_color_index_bitcount) plus `index_selection_bit`.
/// `mode` must be a valid index into the 8-entry table.
int get_bc7_color_index_size(int mode, int index_selection_bit) pure
{
    const int base_bits = g_bc7_color_index_bitcount[mode];
    return base_bits + index_selection_bit;
}
307 
// Eight-entry tables, presumably indexed by BC7 mode (0..7): whether the mode
// has p-bits, whether those p-bits are shared, and the endpoint precision in
// bits for color and alpha.
static immutable ubyte[8] g_bc7_mode_has_p_bits        = [ 1, 1, 0, 1, 0, 0, 1, 1 ];
static immutable ubyte[8] g_bc7_mode_has_shared_p_bits = [ 0, 1, 0, 0, 0, 0, 0, 0 ];
static immutable ubyte[8] g_bc7_color_precision_table  = [ 4, 6, 5, 7, 5, 7, 7, 5 ];
static immutable byte[8] g_bc7_alpha_precision_table   = [ 0, 0, 0, 0, 6, 8, 7, 5 ];
312 
/// One entry of the single-color lookup table: the best pair of 6-bit
/// endpoint values found for a target component value, with its squared error.
struct endpoint_err 
{ 
    ushort m_error; // squared difference between the reconstructed and target value
    ubyte m_lo;     // low endpoint, before the p-bit is appended
    ubyte m_hi;     // high endpoint, before the p-bit is appended
}
319 
// Lookup table filled by bc7enc16_compress_block_init(): for every 8-bit
// component value c and p-bit, the best mode 1 endpoint pair.
__gshared endpoint_err[2][256] g_bc7_mode_1_optimal_endpoints; // [c][pbit]
// Mutex taken by bc7enc16_compress_block_init() around the table fill.
__gshared Mutex g_tableProtect;
// Set once the table fill has started; only read/written under g_tableProtect in the code visible here.
__gshared bool g_tableInitialized = false;

// Selector index (into g_bc7_weights3) assumed when building and using the single-color table.
enum uint BC7ENC16_MODE_1_OPTIMAL_INDEX = 2;
325 
// Builds the lookup table used for optimal single color compression in mode 1.
// Warning: bc7enc16_compress_block_init() MUST be called before calling bc7enc16_compress_block() (or you'll get artifacts).
// Note: an unguarded fill would be racy, so the whole function runs under a self-initializing mutex.
// Calls after the first one return without recomputing the table.
void bc7enc16_compress_block_init() @trusted
{
    g_tableProtect.lockLazy();
    scope(exit) g_tableProtect.unlock();

    if (!g_tableInitialized)
    {
        g_tableInitialized = true;

        // Interpolation weights of the fixed selector used for single-color blocks.
        const uint hiWeight = g_bc7_weights3[BC7ENC16_MODE_1_OPTIMAL_INDEX];
        const uint loWeight = 64 - hiWeight;

        foreach (int c; 0 .. 256)
        {
            foreach (uint lp; 0 .. 2)
            {
                endpoint_err best;
                best.m_error = ushort.max;

                // Exhaustively try every (low, high) pair of 6-bit endpoints.
                foreach (uint l; 0 .. 64)
                {
                    // Append the p-bit, then expand the 7-bit value to 8 bits.
                    uint low = ((l << 1) | lp) << 1;
                    low |= (low >> 7);
                    const uint lowTerm = low * loWeight;

                    foreach (uint h; 0 .. 64)
                    {
                        uint high = ((h << 1) | lp) << 1;
                        high |= (high >> 7);

                        const int k = (lowTerm + high * hiWeight + 32) >> 6;
                        const int err = (k - c) * (k - c);

                        // Strict comparison: the first pair reaching a given error is kept.
                        if (err < best.m_error)
                        {
                            best.m_error = cast(ushort)err;
                            best.m_lo = cast(ubyte)l;
                            best.m_hi = cast(ubyte)h;
                        }
                    }
                }

                g_bc7_mode_1_optimal_endpoints[c][lp] = best;
            }
        }
    }
}
367 
/// Least squares fit of the low/high RGBA endpoints for a fixed selector
/// assignment, using the normal equations
/// (http://www.cs.cornell.edu/~bindel/class/cs3220-s12/notes/lec10.pdf).
///
/// Params:
///   N                 = number of pixels
///   pSelectors        = N selector indices, one per pixel
///   pSelector_weights = per-selector constants; components [0..2] feed the 2x2
///                       matrix and component [3] is the interpolation weight
///   pXl, pXh          = receive the fitted low and high endpoints
///   pColors           = the N source pixels
///
/// If the matrix is singular (determinant exactly 0) both outputs are all zero.
void compute_least_squares_endpoints_rgba(uint N, 
                                          const(ubyte)* pSelectors, 
                                          const(vec4F)* pSelector_weights, 
                                          vec4F *pXl, 
                                          vec4F *pXh, 
                                          const(color_quad_u8)* pColors) @system
{
    // Entries of the symmetric 2x2 matrix (z01 equals z10, so only one is kept).
    float z00 = 0.0f, z10 = 0.0f, z11 = 0.0f;

    // Per channel: weighted color sum (q00_*) and plain color sum (t_*).
    float q00_r = 0.0f, q00_g = 0.0f, q00_b = 0.0f, q00_a = 0.0f;
    float t_r = 0.0f, t_g = 0.0f, t_b = 0.0f, t_a = 0.0f;

    foreach (i; 0 .. N)
    {
        const(vec4F)* wts = &pSelector_weights[pSelectors[i]];
        const(color_quad_u8)* px = &pColors[i];

        z00 += wts.m_c[0];
        z10 += wts.m_c[1];
        z11 += wts.m_c[2];

        const float w = wts.m_c[3];
        q00_r += w * px.m_c[0];
        t_r += px.m_c[0];
        q00_g += w * px.m_c[1];
        t_g += px.m_c[1];
        q00_b += w * px.m_c[2];
        t_b += px.m_c[2];
        q00_a += w * px.m_c[3];
        t_a += px.m_c[3];
    }

    const float q10_r = t_r - q00_r;
    const float q10_g = t_g - q00_g;
    const float q10_b = t_b - q00_b;
    const float q10_a = t_a - q00_a;

    // Invert the 2x2 matrix; a zero determinant is left at zero.
    float det = z00 * z11 - z10 * z10;
    if (det != 0.0f)
        det = 1.0f / det;

    const float iz00 = z11 * det;
    const float izOff = -z10 * det; // both off-diagonal entries of the inverse
    const float iz11 = z00 * det;

    pXl.m_c[0] = iz00 * q00_r + izOff * q10_r;
    pXl.m_c[1] = iz00 * q00_g + izOff * q10_g;
    pXl.m_c[2] = iz00 * q00_b + izOff * q10_b;
    pXl.m_c[3] = iz00 * q00_a + izOff * q10_a;

    pXh.m_c[0] = izOff * q00_r + iz11 * q10_r;
    pXh.m_c[1] = izOff * q00_g + iz11 * q10_g;
    pXh.m_c[2] = izOff * q00_b + iz11 * q10_b;
    pXh.m_c[3] = izOff * q00_a + iz11 * q10_a;
}
417 
/// RGB-only variant of the least squares endpoint fit: same normal-equation
/// solve as the RGBA version for channels 0..2, with the alpha component of
/// both outputs forced to 255.
///
/// Params:
///   N                 = number of pixels
///   pSelectors        = N selector indices, one per pixel
///   pSelector_weights = per-selector constants; components [0..2] feed the 2x2
///                       matrix and component [3] is the interpolation weight
///   pXl, pXh          = receive the fitted low and high endpoints
///   pColors           = the N source pixels (alpha is ignored)
void compute_least_squares_endpoints_rgb(uint N, const ubyte *pSelectors, 
                                         const(vec4F)* pSelector_weights, 
                                         vec4F *pXl, vec4F *pXh, const(color_quad_u8)*pColors) @system
{
    // Entries of the symmetric 2x2 matrix (z01 equals z10, so only one is kept).
    float z00 = 0.0f, z10 = 0.0f, z11 = 0.0f;

    // Per channel: weighted color sum (q00_*) and plain color sum (t_*).
    float q00_r = 0.0f, q00_g = 0.0f, q00_b = 0.0f;
    float t_r = 0.0f, t_g = 0.0f, t_b = 0.0f;

    foreach (i; 0 .. N)
    {
        const(vec4F)* wts = &pSelector_weights[pSelectors[i]];
        const(color_quad_u8)* px = &pColors[i];

        z00 += wts.m_c[0];
        z10 += wts.m_c[1];
        z11 += wts.m_c[2];

        const float w = wts.m_c[3];
        q00_r += w * px.m_c[0];
        t_r += px.m_c[0];
        q00_g += w * px.m_c[1];
        t_g += px.m_c[1];
        q00_b += w * px.m_c[2];
        t_b += px.m_c[2];
    }

    const float q10_r = t_r - q00_r;
    const float q10_g = t_g - q00_g;
    const float q10_b = t_b - q00_b;

    // Invert the 2x2 matrix; a zero determinant is left at zero.
    float det = z00 * z11 - z10 * z10;
    if (det != 0.0f)
        det = 1.0f / det;

    const float iz00 = z11 * det;
    const float izOff = -z10 * det; // both off-diagonal entries of the inverse
    const float iz11 = z00 * det;

    pXl.m_c[0] = iz00 * q00_r + izOff * q10_r;
    pXl.m_c[1] = iz00 * q00_g + izOff * q10_g;
    pXl.m_c[2] = iz00 * q00_b + izOff * q10_b;
    pXl.m_c[3] = 255.0f;

    pXh.m_c[0] = izOff * q00_r + iz11 * q10_r;
    pXh.m_c[1] = izOff * q00_g + iz11 * q10_g;
    pXh.m_c[2] = izOff * q00_b + iz11 * q10_b;
    pXh.m_c[3] = 255.0f;
}
459 
/// Read-only inputs for compressing one group of pixels (a "color cell").
struct color_cell_compressor_params
{
    uint m_num_pixels;                   // number of entries in m_pPixels
    const(color_quad_u8)* m_pPixels;     // source pixels
    uint m_num_selector_weights;         // number of selector levels (palette size)
    const(uint)* m_pSelector_weights;    // integer interpolation weights (out of 64), one per selector
    const(vec4F)* m_pSelector_weightsx;  // precomputed float constants per selector (least squares fit)
    uint m_comp_bits;                    // endpoint component precision in bits, not counting a p-bit
    uint[4] m_weights;                   // per-channel error weights
    bc7enc16_bool m_has_alpha;           // include the alpha channel in interpolation and error
    bc7enc16_bool m_has_pbits;           // endpoints carry an extra p-bit (least significant bit)
    bc7enc16_bool m_endpoints_share_pbit; // both endpoints use pbits[0] instead of one p-bit each
    bc7enc16_bool m_perceptual;          // use the perceptual (luma/chroma) error metric
}
474 
/// Best solution found so far for a color cell, updated by evaluate_solution()
/// whenever a trial beats m_best_overall_err.
struct color_cell_compressor_results
{
    ulong m_best_overall_err;      // total error of the best solution so far
    color_quad_u8 m_low_endpoint;  // quantized low endpoint (without p-bit)
    color_quad_u8 m_high_endpoint; // quantized high endpoint (without p-bit)
    uint[2] m_pbits;               // p-bits belonging to the stored endpoints
    ubyte *m_pSelectors;           // selectors of the best solution, one per pixel
    ubyte *m_pSelectors_temp;      // scratch buffer written by every trial, one per pixel
}
484 
/// Expands an endpoint stored with n significant bits per component (n is
/// m_comp_bits, plus one if the mode has p-bits) to full 8-bit components by
/// shifting left and replicating the top bits into the vacated low bits.
color_quad_u8 scale_color(ref const(color_quad_u8) pC, const(color_cell_compressor_params) *pParams) pure
{
    const uint n = pParams.m_has_pbits ? pParams.m_comp_bits + 1 : pParams.m_comp_bits;
    assert((n >= 4) && (n <= 8));

    const uint upShift = 8 - n;

    color_quad_u8 results;
    foreach (idx, comp; pC.m_c)
    {
        uint v = comp << upShift;
        v |= (v >> n);
        assert(v <= 255);
        results.m_c[idx] = cast(ubyte)(v);
    }
    return results;
}
502 
/// Weighted squared distance between the RGB parts of two colors.
///
/// With `perceptual` set, the difference is taken between a fixed-point luma
/// (109*r + 366*g + 37*b) and two chroma terms ((r << 9) - luma and
/// (b << 9) - luma), each scaled down by 256; otherwise plain per-channel
/// differences are used. `weights` must point to at least three values.
ulong compute_color_distance_rgb(const(color_quad_u8)* pE1, 
                                 const(color_quad_u8)* pE2, 
                                 bc7enc16_bool perceptual, 
                                 const(uint)* weights) pure @system
{
    int d0, d1, d2;

    if (!perceptual)
    {
        d0 = cast(int)pE1.m_c[0] - cast(int)pE2.m_c[0];
        d1 = cast(int)pE1.m_c[1] - cast(int)pE2.m_c[1];
        d2 = cast(int)pE1.m_c[2] - cast(int)pE2.m_c[2];
    }
    else
    {
        const int luma1 = pE1.m_c[0] * 109 + pE1.m_c[1] * 366 + pE1.m_c[2] * 37;
        const int luma2 = pE2.m_c[0] * 109 + pE2.m_c[1] * 366 + pE2.m_c[2] * 37;

        const int cr1 = (cast(int)pE1.m_c[0] << 9) - luma1;
        const int cr2 = (cast(int)pE2.m_c[0] << 9) - luma2;

        const int cb1 = (cast(int)pE1.m_c[2] << 9) - luma1;
        const int cb2 = (cast(int)pE2.m_c[2] << 9) - luma2;

        d0 = (luma1 - luma2) >> 8;
        d1 = (cr1 - cr2) >> 8;
        d2 = (cb1 - cb2) >> 8;
    }

    // The three terms are summed in 32-bit unsigned arithmetic before widening.
    const uint e0 = weights[0] * cast(uint)(d0 * d0);
    const uint e1 = weights[1] * cast(uint)(d1 * d1);
    const uint e2 = weights[2] * cast(uint)(d2 * d2);
    return e0 + e1 + e2;
}
531 
/// Weighted squared distance between two RGBA colors: the RGB distance (see
/// compute_color_distance_rgb) plus weights[3] times the squared alpha
/// difference. `weights` must point to four values.
ulong compute_color_distance_rgba(const(color_quad_u8)* pE1, const(color_quad_u8)* pE2, bc7enc16_bool perceptual, const(uint)* weights /* [4] */) @system
{
    const ulong rgb_err = compute_color_distance_rgb(pE1, pE2, perceptual, weights);

    const int alpha_delta = cast(int)pE1.m_c[3] - cast(int)pE2.m_c[3];
    return rgb_err + (weights[3] * cast(uint)(alpha_delta * alpha_delta));
}
537 
/// Encodes a solid color (r, g, b) as a mode 1 endpoint pair using the
/// precomputed g_bc7_mode_1_optimal_endpoints table.
///
/// Writes the endpoints and p-bits into `pResults`, fills `pSelectors` with
/// BC7ENC16_MODE_1_OPTIMAL_INDEX for every pixel, and stores and returns the
/// total error of the reconstructed color against all pixels in `pParams`.
/// r, g and b must be in 0..255; the table must have been initialized.
ulong pack_mode1_to_one_color(const(color_cell_compressor_params)* pParams, 
                              color_cell_compressor_results *pResults, 
                              uint r, uint g, uint b, ubyte *pSelectors) @system
{
    // Choose the shared p-bit whose table entries give the lowest summed error.
    uint best_p = 0;
    uint best_err = uint.max;
    foreach (uint candidate; 0 .. 2)
    {
        const uint err = g_bc7_mode_1_optimal_endpoints[r][candidate].m_error + g_bc7_mode_1_optimal_endpoints[g][candidate].m_error + g_bc7_mode_1_optimal_endpoints[b][candidate].m_error;
        if (err < best_err)
        {
            best_err = err;
            best_p = candidate;
        }
    }

    const(endpoint_err)* pEr = &g_bc7_mode_1_optimal_endpoints[r][best_p];
    const(endpoint_err)* pEg = &g_bc7_mode_1_optimal_endpoints[g][best_p];
    const(endpoint_err)* pEb = &g_bc7_mode_1_optimal_endpoints[b][best_p];

    color_quad_u8_set(&pResults.m_low_endpoint, pEr.m_lo, pEg.m_lo, pEb.m_lo, 0);
    color_quad_u8_set(&pResults.m_high_endpoint, pEr.m_hi, pEg.m_hi, pEb.m_hi, 0);
    pResults.m_pbits[0] = best_p;
    pResults.m_pbits[1] = 0;

    memset(pSelectors, BC7ENC16_MODE_1_OPTIMAL_INDEX, pParams.m_num_pixels);

    // Reconstruct the color a decoder would produce for the fixed selector.
    const uint hiWeight = g_bc7_weights3[BC7ENC16_MODE_1_OPTIMAL_INDEX];
    const uint loWeight = 64 - hiWeight;

    color_quad_u8 decoded;
    decoded.m_c[3] = 255;
    foreach (ch; 0 .. 3)
    {
        uint low = ((pResults.m_low_endpoint.m_c[ch] << 1) | best_p) << 1;
        low |= (low >> 7);

        uint high = ((pResults.m_high_endpoint.m_c[ch] << 1) | best_p) << 1;
        high |= (high >> 7);

        decoded.m_c[ch] = cast(ubyte)((low * loWeight + high * hiWeight + 32) >> 6);
    }

    ulong total_err = 0;
    foreach (i; 0 .. pParams.m_num_pixels)
        total_err += compute_color_distance_rgb(&decoded, &pParams.m_pPixels[i], pParams.m_perceptual, pParams.m_weights.ptr);

    pResults.m_best_overall_err = total_err;
    return total_err;
}
587 
/// Scores one candidate endpoint pair against the pixels in `pParams`.
///
/// The endpoints are expanded to 8 bits (appending p-bits when the mode has
/// them), the interpolated palette is built, and a selector is chosen for
/// every pixel and written to `pResults.m_pSelectors_temp`. If the summed
/// error beats `pResults.m_best_overall_err`, the endpoints, p-bits and
/// selectors are copied into `pResults` as the new best solution.
///
/// Params:
///   pLow, pHigh = quantized endpoints, without p-bits
///   pbits       = two p-bits; only pbits[0] is applied when the endpoints share one
///   pParams     = pixels, selector weights and mode flags
///   pResults    = running best solution plus scratch selector buffer
/// Returns: the total error of this candidate (whether or not it was kept).
ulong evaluate_solution(const(color_quad_u8)* pLow, const(color_quad_u8)* pHigh, 
                        const(uint)* pbits /*[2]*/, const(color_cell_compressor_params)* pParams, 
                        color_cell_compressor_results *pResults) @system
{
    color_quad_u8 quantMinColor = *pLow;
    color_quad_u8 quantMaxColor = *pHigh;

    if (pParams.m_has_pbits)
    {
        uint minPBit, maxPBit;

        if (pParams.m_endpoints_share_pbit)
            maxPBit = minPBit = pbits[0];
        else
        {
            minPBit = pbits[0];
            maxPBit = pbits[1];
        }

        // Append the p-bit as the least significant bit of every component.
        quantMinColor.m_c[0] = cast(ubyte)((pLow.m_c[0] << 1) | minPBit);
        quantMinColor.m_c[1] = cast(ubyte)((pLow.m_c[1] << 1) | minPBit);
        quantMinColor.m_c[2] = cast(ubyte)((pLow.m_c[2] << 1) | minPBit);
        quantMinColor.m_c[3] = cast(ubyte)((pLow.m_c[3] << 1) | minPBit);

        quantMaxColor.m_c[0] = cast(ubyte)((pHigh.m_c[0] << 1) | maxPBit);
        quantMaxColor.m_c[1] = cast(ubyte)((pHigh.m_c[1] << 1) | maxPBit);
        quantMaxColor.m_c[2] = cast(ubyte)((pHigh.m_c[2] << 1) | maxPBit);
        quantMaxColor.m_c[3] = cast(ubyte)((pHigh.m_c[3] << 1) | maxPBit);
    }

    // Expand the endpoints to full 8-bit components.
    color_quad_u8 actualMinColor = scale_color(quantMinColor, pParams);
    color_quad_u8 actualMaxColor = scale_color(quantMaxColor, pParams);

    const uint N = pParams.m_num_selector_weights;

    // Palette: entry 0 is the low endpoint, entry N-1 the high endpoint, and
    // the entries in between are interpolated with the selector weights.
    color_quad_u8[16] weightedColors;
    weightedColors[0] = actualMinColor;
    weightedColors[N - 1] = actualMaxColor;

    const uint nc = pParams.m_has_alpha ? 4 : 3;
    for (uint i = 1; i < (N - 1); i++)
        for (uint j = 0; j < nc; j++)
            weightedColors[i].m_c[j] = cast(ubyte)((actualMinColor.m_c[j] * (64 - pParams.m_pSelector_weights[i]) + actualMaxColor.m_c[j] * pParams.m_pSelector_weights[i] + 32) >> 6);

    // Low endpoint and endpoint delta, used to project pixels onto the
    // low->high axis in the non-perceptual paths.
    const int lr = actualMinColor.m_c[0];
    const int lg = actualMinColor.m_c[1];
    const int lb = actualMinColor.m_c[2];
    const int dr = actualMaxColor.m_c[0] - lr;
    const int dg = actualMaxColor.m_c[1] - lg;
    const int db = actualMaxColor.m_c[2] - lb;

    ulong total_err = 0;

    if (!pParams.m_perceptual)
    {
        if (pParams.m_has_alpha)
        {
            const int la = actualMinColor.m_c[3];
            const int da = actualMaxColor.m_c[3] - la;

            // Scale from dot product to selector index; the tiny constant
            // keeps the divisor non-zero when both endpoints are equal.
            const float f = N / cast(float)(squarei(dr) + squarei(dg) + squarei(db) + squarei(da) + .00000125f);

            for (uint i = 0; i < pParams.m_num_pixels; i++)
            {
                const(color_quad_u8)* pC = &pParams.m_pPixels[i];
                int r = pC.m_c[0];
                int g = pC.m_c[1];
                int b = pC.m_c[2];
                int a = pC.m_c[3];

                // Estimate the selector by projection, clamp it to [1, N-1],
                // then keep whichever of (estimate - 1, estimate) has less error.
                int best_sel = cast(int)(cast(float)((r - lr) * dr + (g - lg) * dg + (b - lb) * db + (a - la) * da) * f + .5f);
                best_sel = clampi(best_sel, 1, N - 1);

                ulong err0 = compute_color_distance_rgba(&weightedColors[best_sel - 1], pC, BC7ENC16_FALSE, pParams.m_weights.ptr);
                ulong err1 = compute_color_distance_rgba(&weightedColors[best_sel], pC, BC7ENC16_FALSE, pParams.m_weights.ptr);

                if (err1 > err0)
                {
                    err1 = err0;
                    --best_sel;
                }
                total_err += err1;

                pResults.m_pSelectors_temp[i] = cast(ubyte)best_sel;
            }
        }
        else
        {
            // Same projection as above, restricted to the RGB channels.
            const float f = N / cast(float)(squarei(dr) + squarei(dg) + squarei(db) + .00000125f);

            for (uint i = 0; i < pParams.m_num_pixels; i++)
            {
                const color_quad_u8 *pC = &pParams.m_pPixels[i];
                int r = pC.m_c[0];
                int g = pC.m_c[1];
                int b = pC.m_c[2];

                int sel = cast(int)(cast(float)((r - lr) * dr + (g - lg) * dg + (b - lb) * db) * f + .5f);
                sel = clampi(sel, 1, N - 1);

                ulong err0 = compute_color_distance_rgb(&weightedColors[sel - 1], pC, BC7ENC16_FALSE, pParams.m_weights.ptr);
                ulong err1 = compute_color_distance_rgb(&weightedColors[sel], pC, BC7ENC16_FALSE, pParams.m_weights.ptr);

                int best_sel = sel;
                ulong best_err = err1;
                if (err0 < best_err)
                {
                    best_err = err0;
                    best_sel = sel - 1;
                }

                total_err += best_err;

                pResults.m_pSelectors_temp[i] = cast(ubyte)best_sel;
            }
        }
    }
    else
    {
        // Perceptual metric: test every palette entry for every pixel and
        // keep the first one with the lowest error.
        for (uint i = 0; i < pParams.m_num_pixels; i++)
        {
            ulong best_err = ulong.max;
            uint best_sel = 0;

            if (pParams.m_has_alpha)
            {
                for (uint j = 0; j < N; j++)
                {
                    ulong err = compute_color_distance_rgba(&weightedColors[j], &pParams.m_pPixels[i], BC7ENC16_TRUE, pParams.m_weights.ptr);
                    if (err < best_err)
                    {
                        best_err = err;
                        best_sel = j;
                    }
                }
            }
            else
            {
                for (uint j = 0; j < N; j++)
                {
                    ulong err = compute_color_distance_rgb(&weightedColors[j], &pParams.m_pPixels[i], BC7ENC16_TRUE, pParams.m_weights.ptr);
                    if (err < best_err)
                    {
                        best_err = err;
                        best_sel = j;
                    }
                }
            }

            total_err += best_err;

            pResults.m_pSelectors_temp[i] = cast(ubyte)best_sel;
        }
    }

    // Keep this candidate only if it is strictly better than the best so far.
    if (total_err < pResults.m_best_overall_err)
    {
        pResults.m_best_overall_err = total_err;

        pResults.m_low_endpoint = *pLow;
        pResults.m_high_endpoint = *pHigh;

        pResults.m_pbits[0] = pbits[0];
        pResults.m_pbits[1] = pbits[1];

        memcpy(pResults.m_pSelectors, pResults.m_pSelectors_temp, (pResults.m_pSelectors[0]).sizeof * pParams.m_num_pixels);
    }

    return total_err;
}
758 
/// Mode 1 only: fixes the degenerate case where the input collapses to a
/// single colorspace voxel and we lose all freedom (test with grayscale ramps).
///
/// For each RGB channel where the quantized endpoints are equal although the
/// unquantized endpoints (pXl, pXh) differ, one endpoint is moved by a single
/// step: values above the midpoint of [0, iscale] prefer lowering the minimum,
/// the others prefer raising the maximum, with the other direction as fallback.
/// Does nothing for any other mode.
void fixDegenerateEndpoints(uint mode, 
                            ref color_quad_u8 pTrialMinColor, 
                            ref color_quad_u8 pTrialMaxColor, 
                            ref const(vec4F) pXl, ref const(vec4F) pXh, uint iscale)
{
    if (mode != 1)
        return;

    foreach (i; 0 .. 3)
    {
        // Only channels whose quantized endpoints coincide need fixing.
        if (pTrialMinColor.m_c[i] != pTrialMaxColor.m_c[i])
            continue;

        // Leave the channel alone if the unquantized endpoints coincide as well.
        if (!(abs(pXl.m_c[i] - pXh.m_c[i]) > 0.0f))
            continue;

        const bool canLower = pTrialMinColor.m_c[i] > 0;
        const bool canRaise = pTrialMaxColor.m_c[i] < iscale;

        if (pTrialMinColor.m_c[i] > (iscale >> 1))
        {
            if (canLower)
                pTrialMinColor.m_c[i]--;
            else if (canRaise)
                pTrialMaxColor.m_c[i]++;
        }
        else
        {
            if (canRaise)
                pTrialMaxColor.m_c[i]++;
            else if (canLower)
                pTrialMinColor.m_c[i]--;
        }
    }
}
793 
// Quantizes a pair of floating point endpoints (each component expected in [0,1] after saturation)
// to the precision described by pParams, and scores the result with evaluate_solution unless
// the quantized endpoints (and p-bits) are identical to the best solution already stored in pResults.
//
// Params:
//   mode     = BC7 mode, forwarded to fixDegenerateEndpoints.
//   xl, xh   = candidate low/high endpoints (taken by value, saturated locally).
//   pParams  = component precision, p-bit configuration and alpha flag are read here.
//   pResults = current best solution; updated by evaluate_solution when the candidate is better.
// Returns: pResults.m_best_overall_err after the candidate has been considered.
static ulong find_optimal_solution(uint mode, vec4F xl, vec4F xh, const color_cell_compressor_params *pParams, color_cell_compressor_results *pResults) @system
{
    vec4F_saturate_in_place(xl);
    vec4F_saturate_in_place(xh);

    if (!pParams.m_has_pbits)
    {
        // No p-bits: plain rounding of every component to m_comp_bits of precision.
        const int iscale = (1 << pParams.m_comp_bits) - 1;
        const float scale = cast(float)iscale;

        color_quad_u8 quantLow, quantHigh;
        color_quad_u8_set_clamped(&quantLow, cast(int)(xl.m_c[0] * scale + .5f), cast(int)(xl.m_c[1] * scale + .5f), cast(int)(xl.m_c[2] * scale + .5f), cast(int)(xl.m_c[3] * scale + .5f));
        color_quad_u8_set_clamped(&quantHigh, cast(int)(xh.m_c[0] * scale + .5f), cast(int)(xh.m_c[1] * scale + .5f), cast(int)(xh.m_c[2] * scale + .5f), cast(int)(xh.m_c[3] * scale + .5f));

        fixDegenerateEndpoints(mode, quantLow, quantHigh, xl, xh, iscale);

        // Skip the (expensive) evaluation when this exact endpoint pair is already the stored best.
        const bool isNewCandidate = (pResults.m_best_overall_err == ulong.max)
            || color_quad_u8_notequals(quantLow, pResults.m_low_endpoint)
            || color_quad_u8_notequals(quantHigh, pResults.m_high_endpoint);
        if (isNewCandidate)
            evaluate_solution(&quantLow, &quantHigh, pResults.m_pbits.ptr, pParams, pResults);

        return pResults.m_best_overall_err;
    }

    // With p-bits the effective precision is m_comp_bits + 1; the p-bit is the lowest bit.
    const int iscalep = (1 << (pParams.m_comp_bits + 1)) - 1;
    const float scalep = cast(float)iscalep;
    const int totalComps = pParams.m_has_alpha ? 4 : 3;

    uint[2] chosenPbits;
    color_quad_u8 chosenLow, chosenHigh;

    if (pParams.m_endpoints_share_pbit)
    {
        // One p-bit for both endpoints: pick the p-bit minimizing the summed error of both.
        float lowestErr = 1e+9;

        for (int p = 0; p < 2; p++)
        {
            color_quad_u8 candLow, candHigh;
            for (uint c = 0; c < 4; c++)
            {
                candLow.m_c[c] = cast(ubyte)(clampi((cast(int)((xl.m_c[c] * scalep - p) / 2.0f + .5f)) * 2 + p, p, iscalep - 1 + p));
                candHigh.m_c[c] = cast(ubyte)(clampi((cast(int)((xh.m_c[c] * scalep - p) / 2.0f + .5f)) * 2 + p, p, iscalep - 1 + p));
            }

            color_quad_u8 scaledLow = scale_color(candLow, pParams);
            color_quad_u8 scaledHigh = scale_color(candHigh, pParams);

            float err = 0;
            for (int i = 0; i < totalComps; i++)
                err += squaref((scaledLow.m_c[i] / 255.0f) - xl.m_c[i]) + squaref((scaledHigh.m_c[i] / 255.0f) - xh.m_c[i]);

            if (err < lowestErr)
            {
                lowestErr = err;
                chosenPbits[0] = p;
                chosenPbits[1] = p;
                // Store the endpoints without their p-bit.
                for (uint j = 0; j < 4; j++)
                {
                    chosenLow.m_c[j] = candLow.m_c[j] >> 1;
                    chosenHigh.m_c[j] = candHigh.m_c[j] >> 1;
                }
            }
        }
    }
    else
    {
        // Independent p-bit per endpoint: the low and high endpoints are optimized separately.
        float lowestErrLow = 1e+9;
        float lowestErrHigh = 1e+9;

        for (int p = 0; p < 2; p++)
        {
            color_quad_u8 candLow, candHigh;

            // Notes: The pbit controls which quantization intervals are selected.
            // total_levels=2^(comp_bits+1), where comp_bits=4 for mode 0, etc.
            // pbit 0: v=(b*2)/(total_levels-1), pbit 1: v=(b*2+1)/(total_levels-1) where b is the component bin from [0,total_levels/2-1] and v is the [0,1] component value
            // rearranging you get for pbit 0: b=floor(v*(total_levels-1)/2+.5)
            // rearranging you get for pbit 1: b=floor((v*(total_levels-1)-1)/2+.5)
            for (uint c = 0; c < 4; c++)
            {
                candLow.m_c[c] = cast(ubyte)(clampi((cast(int)((xl.m_c[c] * scalep - p) / 2.0f + .5f)) * 2 + p, p, iscalep - 1 + p));
                candHigh.m_c[c] = cast(ubyte)(clampi((cast(int)((xh.m_c[c] * scalep - p) / 2.0f + .5f)) * 2 + p, p, iscalep - 1 + p));
            }

            color_quad_u8 scaledLow = scale_color(candLow, pParams);
            color_quad_u8 scaledHigh = scale_color(candHigh, pParams);

            float errLow = 0, errHigh = 0;
            for (int i = 0; i < totalComps; i++)
            {
                errLow += squaref(scaledLow.m_c[i] - xl.m_c[i] * 255.0f);
                errHigh += squaref(scaledHigh.m_c[i] - xh.m_c[i] * 255.0f);
            }

            if (errLow < lowestErrLow)
            {
                lowestErrLow = errLow;
                chosenPbits[0] = p;
                for (uint j = 0; j < 4; j++)
                    chosenLow.m_c[j] = candLow.m_c[j] >> 1;
            }

            if (errHigh < lowestErrHigh)
            {
                lowestErrHigh = errHigh;
                chosenPbits[1] = p;
                for (uint j = 0; j < 4; j++)
                    chosenHigh.m_c[j] = candHigh.m_c[j] >> 1;
            }
        }
    }

    fixDegenerateEndpoints(mode, chosenLow, chosenHigh, xl, xh, iscalep >> 1);

    // Skip the evaluation when endpoints and p-bits all match the stored best solution.
    const bool isNewCandidate = (pResults.m_best_overall_err == ulong.max)
        || color_quad_u8_notequals(chosenLow, pResults.m_low_endpoint)
        || color_quad_u8_notequals(chosenHigh, pResults.m_high_endpoint)
        || (chosenPbits[0] != pResults.m_pbits[0])
        || (chosenPbits[1] != pResults.m_pbits[1]);
    if (isNewCandidate)
        evaluate_solution(&chosenLow, &chosenHigh, chosenPbits.ptr, pParams, pResults);

    return pResults.m_best_overall_err;
}
925 
// Finds endpoints and per-pixel selectors for the pixels described by pParams.
//
// Candidates are generated in this order and each one is submitted to find_optimal_solution:
//   1. (mode 1 only) if every pixel has the same RGB, pack as a single color and return immediately;
//   2. endpoints obtained by projecting the pixels on a principal axis through the mean color;
//   3. if pComp_params.m_try_least_squares: least squares endpoints computed from the current selectors;
//   4. if pComp_params.m_uber_level > 0: least squares endpoints computed from perturbed selectors
//      (and, at level >= 2 when the error is still above a threshold, from rescaled selectors);
//   5. (mode 1 only) the mean color encoded as a single color.
// Whenever find_optimal_solution returns 0 the function returns 0 right away.
//
// Params:
//   mode         = BC7 mode; asserted to be 6 whenever pParams.m_has_alpha is set.
//   pParams      = pixels, pixel count, selector weights and error metric settings.
//   pResults     = output; m_best_overall_err is reset to ulong.max on entry.
//   pComp_params = only m_try_least_squares and m_uber_level are read here.
// Returns: pResults.m_best_overall_err, 0 on an early exit, or the value returned by
//          pack_mode1_to_one_color on the all-same-color path.
ulong color_cell_compression(uint mode, 
                             const(color_cell_compressor_params)* pParams, 
                             color_cell_compressor_results *pResults, 
                             const(bc7enc16_compress_block_params)* pComp_params) @system
{
    assert((mode == 6) || (!pParams.m_has_alpha));

    // Marks "no solution evaluated yet"; find_optimal_solution tests for this value.
    pResults.m_best_overall_err = ulong.max;

    // If the partition's colors are all the same in mode 1, then just pack them as a single color.
    if (mode == 1)
    {
        const uint cr = pParams.m_pPixels[0].m_c[0], cg = pParams.m_pPixels[0].m_c[1], cb = pParams.m_pPixels[0].m_c[2];

        bc7enc16_bool allSame = BC7ENC16_TRUE;
        for (uint i = 1; i < pParams.m_num_pixels; i++)
        {
            if ((cr != pParams.m_pPixels[i].m_c[0]) || (cg != pParams.m_pPixels[i].m_c[1]) || (cb != pParams.m_pPixels[i].m_c[2]))
            {
                allSame = BC7ENC16_FALSE;
                break;
            }
        }

        if (allSame)
            return pack_mode1_to_one_color(pParams, pResults, cr, cg, cb, pResults.m_pSelectors);
    }

    // Compute partition's mean color and principal axis.
    vec4F meanColor, axis;
    vec4F_set_scalar(&meanColor, 0.0f);

    for (uint i = 0; i < pParams.m_num_pixels; i++)
    {
        vec4F color = vec4F_from_color(&pParams.m_pPixels[i]);
        meanColor = vec4F_add(&meanColor, &color);
    }

    // Mean in the same units as the values produced by vec4F_from_color; used to center the pixels below.
    vec4F meanColorScaled = vec4F_mul(&meanColor, 1.0f / cast(float)(pParams.m_num_pixels));

    // Same mean additionally divided by 255, then saturated; used as the origin of the endpoints.
    meanColor = vec4F_mul(&meanColor, 1.0f / cast(float)(pParams.m_num_pixels * 255.0f));
    vec4F_saturate_in_place(meanColor);

    if (pParams.m_has_alpha)
    {
        // Use incremental PCA for RGBA PCA, because it's simple.
        vec4F_set_scalar(&axis, 0.0f);
        for (uint i = 0; i < pParams.m_num_pixels; i++)
        {
            vec4F color = vec4F_from_color(&pParams.m_pPixels[i]);
            color = vec4F_sub(&color, &meanColorScaled);
            vec4F a = vec4F_mul(&color, color.m_c[0]);
            vec4F b = vec4F_mul(&color, color.m_c[1]);
            vec4F c = vec4F_mul(&color, color.m_c[2]);
            vec4F d = vec4F_mul(&color, color.m_c[3]);
            // For the first pixel the axis is still zero, so the centered color itself is used as direction.
            vec4F n = i ? axis : color;
            vec4F_normalize_in_place(&n);
            axis.m_c[0] += vec4F_dot(&a, &n);
            axis.m_c[1] += vec4F_dot(&b, &n);
            axis.m_c[2] += vec4F_dot(&c, &n);
            axis.m_c[3] += vec4F_dot(&d, &n);
        }
        vec4F_normalize_in_place(&axis);
    }
    else
    {
        // Use covar technique for RGB PCA, because it doesn't require per-pixel normalization.
        // cov holds the upper triangle of the symmetric 3x3 matrix: rr, rg, rb, gg, gb, bb.
        float[6] cov = [ 0, 0, 0, 0, 0, 0 ];

        for (uint i = 0; i < pParams.m_num_pixels; i++)
        {
            const color_quad_u8 *pV = &pParams.m_pPixels[i];
            float r = pV.m_c[0] - meanColorScaled.m_c[0];
            float g = pV.m_c[1] - meanColorScaled.m_c[1];
            float b = pV.m_c[2] - meanColorScaled.m_c[2];
            cov[0] += r*r; cov[1] += r*g; cov[2] += r*b; cov[3] += g*g; cov[4] += g*b; cov[5] += b*b;
        }

        // Three rounds of multiplying a start vector by the covariance matrix, rescaling by the
        // largest absolute component after each round (skipped when that component is tiny).
        float vfr = .9f, vfg = 1.0f, vfb = .7f;
        for (uint iter = 0; iter < 3; iter++)
        {
            float r = vfr*cov[0] + vfg*cov[1] + vfb*cov[2];
            float g = vfr*cov[1] + vfg*cov[3] + vfb*cov[4];
            float b = vfr*cov[2] + vfg*cov[4] + vfb*cov[5];

            float m = maximumf(maximumf(abs(r), abs(g)), abs(b));
            if (m > 1e-10f)
            {
                m = 1.0f / m;
                r *= m; g *= m; b *= m;
            }

            vfr = r; vfg = g; vfb = b;
        }

        // Normalize to unit length, or give up (zero axis) when the vector is degenerate.
        float len = vfr*vfr + vfg*vfg + vfb*vfb;
        if (len < 1e-10f)
            vec4F_set_scalar(&axis, 0.0f);
        else
        {
            len = 1.0f / sqrt(len);
            vfr *= len; vfg *= len; vfb *= len;
            vec4F_set(&axis, vfr, vfg, vfb, 0);
        }
    }

    // Fallback axis when no usable axis was found (squared length below .5):
    // fixed per-channel weights in perceptual mode, the gray diagonal otherwise.
    // NOTE(review): .213/.715/.072 look like luma weights -- confirm.
    if (vec4F_dot(&axis, &axis) < .5f)
    {
        if (pParams.m_perceptual)
            vec4F_set(&axis, .213f, .715f, .072f, pParams.m_has_alpha ? .715f : 0);
        else
            vec4F_set(&axis, 1.0f, 1.0f, 1.0f, pParams.m_has_alpha ? 1.0f : 0);
        vec4F_normalize_in_place(&axis);
    }

    // Project every centered pixel on the axis and keep the smallest (l) and largest (h) projection.
    float l = 1e+9f, h = -1e+9f;

    for (uint i = 0; i < pParams.m_num_pixels; i++)
    {
        vec4F color = vec4F_from_color(&pParams.m_pPixels[i]);

        vec4F q = vec4F_sub(&color, &meanColorScaled);
        float d = vec4F_dot(&q, &axis);

        l = minimumf(l, d);
        h = maximumf(h, d);
    }

    // Bring the extents to the same scale as meanColor (divided by 255).
    l *= (1.0f / 255.0f);
    h *= (1.0f / 255.0f);

    // Initial endpoints: mean + axis * extent, saturated.
    vec4F b0 = vec4F_mul(&axis, l);
    vec4F b1 = vec4F_mul(&axis, h);
    vec4F c0 = vec4F_add(&meanColor, &b0);
    vec4F c1 = vec4F_add(&meanColor, &b1);
    vec4F minColor = vec4F_saturate(&c0);
    vec4F maxColor = vec4F_saturate(&c1);

    // Order the endpoints so that minColor has the smaller component sum.
    vec4F whiteVec;
    vec4F_set_scalar(&whiteVec, 1.0f);
    if (vec4F_dot(&minColor, &whiteVec) > vec4F_dot(&maxColor, &whiteVec))
    {
        vec4F temp = minColor;
        minColor = maxColor;
        maxColor = temp;
    }
    // First find a solution using the block's PCA.
    if (!find_optimal_solution(mode, minColor, maxColor, pParams, pResults))
        return 0;

    if (pComp_params.m_try_least_squares)
    {
        // Now try to refine the solution using least squares by computing the optimal endpoints from the current selectors.
        vec4F xl, xh;
        vec4F_set_scalar(&xl, 0.0f);
        vec4F_set_scalar(&xh, 0.0f);
        if (pParams.m_has_alpha)
            compute_least_squares_endpoints_rgba(pParams.m_num_pixels, pResults.m_pSelectors, pParams.m_pSelector_weightsx, &xl, &xh, pParams.m_pPixels);
        else
            compute_least_squares_endpoints_rgb(pParams.m_num_pixels, pResults.m_pSelectors, pParams.m_pSelector_weightsx, &xl, &xh, pParams.m_pPixels);

        xl = vec4F_mul(&xl, (1.0f / 255.0f));
        xh = vec4F_mul(&xh, (1.0f / 255.0f));

        if (!find_optimal_solution(mode, xl, xh, pParams, pResults))
            return 0;
    }

    if (pComp_params.m_uber_level > 0)
    {
        // In uber level 1, try varying the selectors a little, somewhat like cluster fit would. First try incrementing the minimum selectors,
        // then try decrementing the selectors, then try both.
        // selectors_temp is a snapshot of the best selectors so far; selectors_temp1 receives each perturbed variant.
        ubyte[16] selectors_temp, selectors_temp1;
        memcpy(selectors_temp.ptr, pResults.m_pSelectors, pParams.m_num_pixels);

        const int max_selector = pParams.m_num_selector_weights - 1;

        // Smallest and largest selector actually used in the snapshot.
        // NOTE(review): the initial value 16 assumes every selector is below 16 -- confirm.
        uint min_sel = 16;
        uint max_sel = 0;
        for (uint i = 0; i < pParams.m_num_pixels; i++)
        {
            uint sel = selectors_temp[i];
            min_sel = minimumu(min_sel, sel);
            max_sel = maximumu(max_sel, sel);
        }

        // Variant 1: pixels using the smallest selector move one step up.
        for (uint i = 0; i < pParams.m_num_pixels; i++)
        {
            uint sel = selectors_temp[i];
            if ((sel == min_sel) && (sel < (pParams.m_num_selector_weights - 1)))
                sel++;
            selectors_temp1[i] = cast(ubyte)sel;
        }

        vec4F xl, xh;
        vec4F_set_scalar(&xl, 0.0f);
        vec4F_set_scalar(&xh, 0.0f);
        if (pParams.m_has_alpha)
            compute_least_squares_endpoints_rgba(pParams.m_num_pixels, selectors_temp1.ptr, 
                                                 pParams.m_pSelector_weightsx, &xl, &xh, pParams.m_pPixels);
        else
            compute_least_squares_endpoints_rgb(pParams.m_num_pixels, selectors_temp1.ptr, 
                                                pParams.m_pSelector_weightsx, &xl, &xh, pParams.m_pPixels);

        xl = vec4F_mul(&xl, (1.0f / 255.0f));
        xh = vec4F_mul(&xh, (1.0f / 255.0f));

        if (!find_optimal_solution(mode, xl, xh, pParams, pResults))
            return 0;

        // Variant 2: pixels using the largest selector move one step down.
        // Note that xl/xh are not reset to zero before this and the next fit.
        for (uint i = 0; i < pParams.m_num_pixels; i++)
        {
            uint sel = selectors_temp[i];
            if ((sel == max_sel) && (sel > 0))
                sel--;
            selectors_temp1[i] = cast(ubyte)sel;
        }

        if (pParams.m_has_alpha)
            compute_least_squares_endpoints_rgba(pParams.m_num_pixels, selectors_temp1.ptr, 
                                                 pParams.m_pSelector_weightsx, &xl, &xh, pParams.m_pPixels);
        else
            compute_least_squares_endpoints_rgb(pParams.m_num_pixels, selectors_temp1.ptr, 
                                                pParams.m_pSelector_weightsx, &xl, &xh, pParams.m_pPixels);

        xl = vec4F_mul(&xl, (1.0f / 255.0f));
        xh = vec4F_mul(&xh, (1.0f / 255.0f));

        if (!find_optimal_solution(mode, xl, xh, pParams, pResults))
            return 0;

        // Variant 3: both adjustments at once (the increment takes precedence when min_sel == max_sel).
        for (uint i = 0; i < pParams.m_num_pixels; i++)
        {
            uint sel = selectors_temp[i];
            if ((sel == min_sel) && (sel < (pParams.m_num_selector_weights - 1)))
                sel++;
            else if ((sel == max_sel) && (sel > 0))
                sel--;
            selectors_temp1[i] = cast(ubyte)sel;
        }

        if (pParams.m_has_alpha)
            compute_least_squares_endpoints_rgba(pParams.m_num_pixels, selectors_temp1.ptr, 
                                                 pParams.m_pSelector_weightsx, &xl, &xh, pParams.m_pPixels);
        else
            compute_least_squares_endpoints_rgb(pParams.m_num_pixels, selectors_temp1.ptr, 
                                                pParams.m_pSelector_weightsx, &xl, &xh, pParams.m_pPixels);

        xl = vec4F_mul(&xl, (1.0f / 255.0f));
        xh = vec4F_mul(&xh, (1.0f / 255.0f));

        if (!find_optimal_solution(mode, xl, xh, pParams, pResults))
            return 0;

        // In uber levels 2+, try taking more advantage of endpoint extrapolation by scaling the selectors in one direction or another.
        // The extra search only runs when the best error exceeds 3.5 per pixel ((n * 56) >> 4).
        const uint uber_err_thresh = (pParams.m_num_pixels * 56) >> 4;
        if ((pComp_params.m_uber_level >= 2) && (pResults.m_best_overall_err > uber_err_thresh))
        {
            // Search radius: 1 for levels 2 and 3, (level - 2) from level 4 on.
            const int Q = (pComp_params.m_uber_level >= 4) ? (pComp_params.m_uber_level - 2) : 1;
            for (int ly = -Q; ly <= 1; ly++)
            {
                for (int hy = max_selector - 1; hy <= (max_selector + Q); hy++)
                {
                    // ly == 0 and hy == max_selector is the identity remapping; it would only reproduce selectors_temp.
                    if ((ly == 0) && (hy == max_selector))
                        continue;

                    // Linearly remap the selector range [ly, hy] onto [0, max_selector], rounding and clamping.
                    for (uint i = 0; i < pParams.m_num_pixels; i++)
                        selectors_temp1[i] = cast(ubyte)clampf(floor(cast(float)max_selector * (cast(float)selectors_temp[i] - cast(float)ly) / (cast(float)hy - cast(float)ly) + .5f), 0, cast(float)max_selector);

                    //vec4F xl, xh;
                    vec4F_set_scalar(&xl, 0.0f);
                    vec4F_set_scalar(&xh, 0.0f);
                    if (pParams.m_has_alpha)
                        compute_least_squares_endpoints_rgba(pParams.m_num_pixels, selectors_temp1.ptr, pParams.m_pSelector_weightsx, &xl, &xh, pParams.m_pPixels);
                    else
                        compute_least_squares_endpoints_rgb(pParams.m_num_pixels, selectors_temp1.ptr, pParams.m_pSelector_weightsx, &xl, &xh, pParams.m_pPixels);

                    xl = vec4F_mul(&xl, (1.0f / 255.0f));
                    xh = vec4F_mul(&xh, (1.0f / 255.0f));

                    if (!find_optimal_solution(mode, xl, xh, pParams, pResults))
                        return 0;
                }
            }
        }
    }

    if (mode == 1)
    {
        // Try encoding the partition as a single color by using the optimal single color tables to encode the block to its mean.
        // The attempt works on a copy of the results and writes its selectors to m_pSelectors_temp,
        // so the current best solution is only overwritten when the single color is strictly better.
        color_cell_compressor_results avg_results = *pResults;
        const uint r = cast(int)(.5f + meanColor.m_c[0] * 255.0f), 
                   g = cast(int)(.5f + meanColor.m_c[1] * 255.0f), 
                   b = cast(int)(.5f + meanColor.m_c[2] * 255.0f);
        ulong avg_err = pack_mode1_to_one_color(pParams, &avg_results, r, g, b, pResults.m_pSelectors_temp);
        if (avg_err < pResults.m_best_overall_err)
        {
            *pResults = avg_results;
            memcpy(pResults.m_pSelectors, pResults.m_pSelectors_temp, (pResults.m_pSelectors[0]).sizeof * pParams.m_num_pixels);
            pResults.m_best_overall_err = avg_err;
        }
    }

    return pResults.m_best_overall_err;
}
1231 
// Cheap error estimate for encoding a set of pixels with 8 interpolated colors (RGB only).
//
// The endpoints are the corners of the pixels' RGB bounding box; each pixel is assigned an
// approximate selector by thresholding its projection on the box diagonal, and the weighted
// squared error to the corresponding interpolated color is accumulated.
//
// Params:
//   num_pixels       = number of entries readable through pPixels.
//   pPixels          = pixels to estimate; only components 0..2 are read.
//   perceptual       = when non-zero the error is measured on luma/chroma differences, otherwise on RGB.
//   pweights         = three error weights are read (indices 0..2).
//   best_err_so_far  = accumulation stops as soon as the running total exceeds this value.
// Returns: the accumulated error (possibly partial when the early-out triggered).
ulong color_cell_compression_est(uint num_pixels, const color_quad_u8 *pPixels, bc7enc16_bool perceptual, uint* pweights/*[4]*/, ulong best_err_so_far) @system
{
    // Find RGB bounds as an approximation of the block's principal axis
    uint lr = 255, lg = 255, lb = 255;
    uint hr = 0, hg = 0, hb = 0;
    for (uint p = 0; p < num_pixels; p++)
    {
        const uint r = pPixels[p].m_c[0];
        const uint g = pPixels[p].m_c[1];
        const uint b = pPixels[p].m_c[2];
        if (r < lr) lr = r;
        if (g < lg) lg = g;
        if (b < lb) lb = b;
        if (r > hr) hr = r;
        if (g > hg) hg = g;
        if (b > hb) hb = b;
    }

    color_quad_u8 lowColor, highColor;
    color_quad_u8_set(&lowColor, lr, lg, lb, 0);
    color_quad_u8_set(&highColor, hr, hg, hb, 0);

    // Place endpoints at bbox diagonals and compute interpolated colors
    const uint N = 8;
    color_quad_u8[8] weightedColors;

    weightedColors[0] = lowColor;
    weightedColors[N - 1] = highColor;
    for (uint i = 1; i < (N - 1); i++)
    {
        for (uint c = 0; c < 3; c++)
            weightedColors[i].m_c[c] = cast(ubyte)((lowColor.m_c[c] * (64 - g_bc7_weights3[i]) + highColor.m_c[c] * g_bc7_weights3[i] + 32) >> 6);
    }

    // Compute dots and thresholds: each interpolated color is projected on the diagonal,
    // and the midpoint between neighbouring projections separates two selectors.
    const int ar = highColor.m_c[0] - lowColor.m_c[0];
    const int ag = highColor.m_c[1] - lowColor.m_c[1];
    const int ab = highColor.m_c[2] - lowColor.m_c[2];

    int[8] dots;
    for (uint i = 0; i < N; i++)
        dots[i] = weightedColors[i].m_c[0] * ar + weightedColors[i].m_c[1] * ag + weightedColors[i].m_c[2] * ab;

    int[8 - 1] thresh;
    for (uint i = 0; i < (N - 1); i++)
        thresh[i] = (dots[i] + dots[i + 1] + 1) >> 1;

    // Luma/chroma form of the interpolated colors; only filled and read in perceptual mode.
    int[8] l1, cr1, cb1;
    if (perceptual)
    {
        for (int j = 0; j < 8; j++)
        {
            const color_quad_u8 *pE1 = &weightedColors[j];
            l1[j] = pE1.m_c[0] * 109 + pE1.m_c[1] * 366 + pE1.m_c[2] * 37;
            cr1[j] = (cast(int)pE1.m_c[0] << 9) - l1[j];
            cb1[j] = (cast(int)pE1.m_c[2] << 9) - l1[j];
        }
    }

    ulong total_err = 0;
    for (uint i = 0; i < num_pixels; i++)
    {
        const color_quad_u8 *pC = &pPixels[i];

        const int d = ar * pC.m_c[0] + ag * pC.m_c[1] + ab * pC.m_c[2];

        // Find approximate selector: the highest threshold the projection reaches, tested from the top down.
        uint s = 0;
        for (int t = 6; t >= 0; t--)
        {
            if (d >= thresh[t])
            {
                s = cast(uint)(t + 1);
                break;
            }
        }

        // Compute error
        if (perceptual)
        {
            const int l2 = pC.m_c[0] * 109 + pC.m_c[1] * 366 + pC.m_c[2] * 37;
            const int cr2 = (cast(int)pC.m_c[0] << 9) - l2;
            const int cb2 = (cast(int)pC.m_c[2] << 9) - l2;

            const int dl = (l1[s] - l2) >> 8;
            const int dcr = (cr1[s] - cr2) >> 8;
            const int dcb = (cb1[s] - cb2) >> 8;

            // Kept as a signed intermediate, exactly like the accumulation it replaces.
            int ie = (pweights[0] * dl * dl) + (pweights[1] * dcr * dcr) + (pweights[2] * dcb * dcb);
            total_err += ie;
        }
        else
        {
            const color_quad_u8 *pE1 = &weightedColors[s];

            int dr = cast(int)pE1.m_c[0] - cast(int)pC.m_c[0];
            int dg = cast(int)pE1.m_c[1] - cast(int)pC.m_c[1];
            int db = cast(int)pE1.m_c[2] - cast(int)pC.m_c[2];

            total_err += pweights[0] * (dr * dr) + pweights[1] * (dg * dg) + pweights[2] * (db * db);
        }

        // Early out: this candidate can no longer beat the best one.
        if (total_err > best_err_so_far)
            break;
    }

    return total_err;
}
1369 
// This table contains bitmasks indicating which "key" partitions must be best ranked before this partition is worth evaluating.
// We first rank the best/most used 14 partitions (sorted by usefulness), record the best one found as the key partition, then use
// that to control the other partitions to evaluate. The quality loss is ~.08 dB RGB PSNR, the perf gain is up to ~11% (at uber level 0).
//
// The table is indexed by partition pattern number (0..34). estimate_partition evaluates a filtered pattern only when
// bit (key partition + 1) of its entry is set, i.e. the bit positions written below are 1-based pattern numbers.
// An entry of uint.max has every bit set, so that pattern is evaluated whatever the key partition is.
static immutable uint[35] g_partition_predictors =
[
    uint.max,
    uint.max,
    uint.max,
    uint.max,
    uint.max,
    (1 << 1) | (1 << 2) | (1 << 8),
    (1 << 1) | (1 << 3) | (1 << 7),
    uint.max,
    uint.max,
    (1 << 2) | (1 << 8) | (1 << 16),
    (1 << 7) | (1 << 3) | (1 << 15),
    uint.max,
    (1 << 8) | (1 << 14) | (1 << 16),
    (1 << 7) | (1 << 14) | (1 << 15),
    uint.max,
    uint.max,
    uint.max,
    uint.max,
    (1 << 14) | (1 << 15),
    (1 << 16) | (1 << 22) | (1 << 14),
    (1 << 17) | (1 << 24) | (1 << 14),
    (1 << 2) | (1 << 14) | (1 << 15) | (1 << 1),
    uint.max,
    (1 << 1) | (1 << 3) | (1 << 14) | (1 << 16) | (1 << 22),
    uint.max,
    (1 << 1) | (1 << 2) | (1 << 15) | (1 << 17) | (1 << 24),
    (1 << 1) | (1 << 3) | (1 << 22),
    uint.max,
    uint.max,
    uint.max,
    (1 << 14) | (1 << 15) | (1 << 16) | (1 << 17),
    uint.max,
    uint.max,
    (1 << 1) | (1 << 2) | (1 << 3) | (1 << 27) | (1 << 4) | (1 << 24),
    (1 << 14) | (1 << 15) | (1 << 16) | (1 << 11) | (1 << 17) | (1 << 27)
];
1411 
1412 // Estimate the partition used by mode 1. This scans through each partition and computes an approximate error for each.
// Picks the two-subset partition pattern to use for mode 1 by computing an approximate error
// (color_cell_compression_est) for each candidate pattern and keeping the lowest.
//
// Params:
//   pPixels      = the 16 pixels of the block.
//   pComp_params = m_max_partitions_mode1 limits how many patterns are scanned, m_perceptual selects the
//                  error metric, m_mode1_partition_estimation_filterbank enables the predictor filter.
//   pweights     = error weights forwarded to color_cell_compression_est.
// Returns: the index of the best pattern found, or 0 when at most one pattern may be scanned.
uint estimate_partition(const(color_quad_u8)* pPixels, 
                        const(bc7enc16_compress_block_params)* pComp_params, 
                        uint* pweights/*[4]*/) @system
{
    const uint total_partitions = minimumu(pComp_params.m_max_partitions_mode1, BC7ENC16_MAX_PARTITIONS1);
    if (total_partitions <= 1)
        return 0;

    ulong lowest_err = ulong.max;
    uint winner = 0;

    // Partition order sorted by usage frequency across a large test corpus. Pattern 34 (checkerboard) must appear in slot 34.
    // Using a sorted order allows the user to decrease the # of partitions to scan with minimal loss in quality.
    static immutable ubyte[64] s_sorted_partition_order =
    [
        1 - 1, 14 - 1, 2 - 1, 3 - 1, 16 - 1, 15 - 1, 11 - 1, 17 - 1,
        4 - 1, 24 - 1, 27 - 1, 7 - 1, 8 - 1, 22 - 1, 20 - 1, 30 - 1,
        9 - 1, 5 - 1, 10 - 1, 21 - 1, 6 - 1, 32 - 1, 23 - 1, 18 - 1,
        19 - 1, 12 - 1, 13 - 1, 31 - 1, 25 - 1, 26 - 1, 29 - 1, 28 - 1,
        33 - 1, 34 - 1, 35 - 1, 46 - 1, 47 - 1, 52 - 1, 50 - 1, 51 - 1,
        49 - 1, 39 - 1, 40 - 1, 38 - 1, 54 - 1, 53 - 1, 55 - 1, 37 - 1,
        58 - 1, 59 - 1, 56 - 1, 42 - 1, 41 - 1, 43 - 1, 44 - 1, 60 - 1,
        45 - 1, 57 - 1, 48 - 1, 36 - 1, 61 - 1, 64 - 1, 63 - 1, 62 - 1
    ];

    assert(s_sorted_partition_order[34] == 34);

    // Best pattern among the first 14 slots; drives the predictor filter for slots 14..34.
    int key_partition = 0;

    for (uint slot = 0; slot < total_partitions; slot++)
    {
        // A zero error cannot be improved on.
        if (lowest_err == 0)
            break;

        const uint pattern = s_sorted_partition_order[slot];

        // Check to see if we should bother evaluating this partition at all, depending on the best partition found from the first 14.
        const bool slot_is_filtered = (slot >= 14) && (slot <= 34);
        if (pComp_params.m_mode1_partition_estimation_filterbank && slot_is_filtered)
        {
            const uint key_bit = 1 << (key_partition + 1);
            if (!(g_partition_predictors[pattern] & key_bit))
            {
                // Skipping slot 34 ends the scan altogether; any other slot is just skipped.
                if (slot == 34)
                    break;

                continue;
            }
        }

        const ubyte *pPartition = &g_bc7_partition2[pattern * 16];

        // Distribute the 16 pixels over the two subsets of this pattern.
        color_quad_u8[16][2] subset_colors;
        uint[2] subset_total_colors = [ 0, 0 ];
        for (uint px = 0; px < 16; px++)
        {
            const uint which = pPartition[px];
            subset_colors[which][subset_total_colors[which]] = pPixels[px];
            subset_total_colors[which]++;
        }

        // Sum the estimated error of both subsets, giving up once the pattern cannot win anymore.
        ulong pattern_err = 0;
        uint subset = 0;
        while ((subset < 2) && (pattern_err < lowest_err))
        {
            pattern_err += color_cell_compression_est(subset_total_colors[subset], &subset_colors[subset][0], pComp_params.m_perceptual, pweights, lowest_err);
            subset++;
        }

        if (pattern_err < lowest_err)
        {
            lowest_err = pattern_err;
            winner = pattern;
        }

        // If the checkerboard pattern doesn't get the highest ranking vs. the previous (lower frequency) patterns, then just stop now because statistically the subsequent patterns won't do well either.
        if ((pattern == 34) && (winner != 34))
            break;

        // The first 14 slots have now been ranked: remember their winner as the key partition.
        if (slot == 13)
            key_partition = winner;
    }

    return winner;
}
1490 
// Appends the low num_bits bits of val to a bit stream, least significant bit first.
//
// Params:
//   pBytes   = destination bytes; bits are OR-ed in, so the target bits must already be zero.
//   val      = value to write; asserted to fit in num_bits bits.
//   num_bits = number of bits to write, at most 32.
//   pCur_ofs = current bit offset into pBytes; advanced by num_bits. Asserted to end at 128 or less.
void set_block_bits(ubyte *pBytes, uint val, uint num_bits, uint *pCur_ofs) @system
{
    assert((num_bits <= 32) && (val < (1UL << num_bits)));

    for (; num_bits != 0; )
    {
        // Position inside the current byte, and how many bits still fit in it.
        const uint bit_in_byte = *pCur_ofs & 7;
        const uint chunk = minimumu(8 - bit_in_byte, num_bits);

        // The cast drops whatever does not fit in this byte; those bits are written on the next round.
        pBytes[*pCur_ofs >> 3] |= cast(ubyte)(val << bit_in_byte);

        val >>= chunk;
        num_bits -= chunk;
        *pCur_ofs += chunk;
    }

    assert(*pCur_ofs <= 128);
}
1504 
// Everything encode_bc7_block needs in order to emit one block.
struct bc7_optimization_results
{
    // BC7 mode to emit, and the two-subset partition pattern number
    // (only consulted for modes that have two subsets).
    uint m_mode, m_partition;

    // One selector per pixel of the block.
    ubyte[16] m_selectors;

    // Low and high endpoints, indexed by subset.
    color_quad_u8[2] m_low, m_high;

    // P-bits, indexed as [subset][endpoint]. Only entry [subset][0] is written
    // out for modes whose endpoints share a p-bit.
    uint[2][2] m_pbits;
}
1514 
// Serializes a finished set of optimization results into the BC7 block at pBlock.
//
// pBlock   - destination; BC7ENC16_BLOCK_SIZE bytes are zeroed and then filled in.
// pResults - mode, partition, selectors, endpoints and p-bits to pack. It is not
//            modified: the anchor fix-ups below are applied to local copies.
//
// Fields are written in this order: mode marker, partition number (only when the
// mode has partition bits), endpoints component by component, p-bits, selectors.
static void encode_bc7_block(void *pBlock, const(bc7_optimization_results)* pResults) @system
{
    const uint mode = pResults.m_mode;
    const uint subset_count = g_bc7_num_subsets[mode];
    const uint partition_count = 1 << g_bc7_partition_bits[mode];

    // Per-pixel subset assignment for the chosen partition.
    const(ubyte)* pPartition;
    if (subset_count == 2)
        pPartition = &g_bc7_partition2[pResults.m_partition * 16];
    else
        pPartition = &g_bc7_partition1[0];

    // Work on private copies so the caller's results stay untouched.
    ubyte[16] selectors;
    memcpy(selectors.ptr, pResults.m_selectors.ptr, 16);

    color_quad_u8[2] lo, hi;
    memcpy(lo.ptr, pResults.m_low.ptr, lo.sizeof);
    memcpy(hi.ptr, pResults.m_high.ptr, hi.sizeof);

    uint[2][2] pbits;
    static assert(pbits.sizeof == 16);
    memcpy(pbits.ptr, pResults.m_pbits.ptr, pbits.sizeof);

    // Pixel index of each subset's anchor; -1 marks an unused subset.
    int[2] anchor = [ -1, -1 ];

    for (uint k = 0; k < subset_count; k++)
    {
        uint anchor_index = 0;
        if (k)
            anchor_index = g_bc7_table_anchor_index_second_subset[pResults.m_partition];
        anchor[k] = anchor_index;

        const uint index_bits = get_bc7_color_index_size(mode, 0);
        const uint index_count = 1 << index_bits;

        // The anchor selector is stored with one bit less (see the selector loop
        // below), so its top bit has to be clear. Nothing to do if it already is.
        if ((selectors[anchor_index] & (index_count >> 1)) == 0)
            continue;

        // Otherwise mirror this subset: invert its selectors and exchange the endpoints.
        for (uint px = 0; px < 16; px++)
        {
            if (pPartition[px] == k)
                selectors[px] = cast(ubyte)((index_count - 1) - selectors[px]);
        }

        color_quad_u8 saved_lo = lo[k];
        lo[k] = hi[k];
        hi[k] = saved_lo;

        // Per-endpoint p-bits follow their endpoints; a shared p-bit stays where it is.
        if (!g_bc7_mode_has_shared_p_bits[mode])
        {
            uint saved_pbit = pbits[k][0];
            pbits[k][0] = pbits[k][1];
            pbits[k][1] = saved_pbit;
        }
    }

    ubyte *pOut = cast(ubyte *)(pBlock);
    memset(pOut, 0, BC7ENC16_BLOCK_SIZE);

    uint bit_pos = 0;

    // Mode marker: a single set bit at position `mode`, written in mode + 1 bits.
    set_block_bits(pOut, 1 << mode, mode + 1, &bit_pos);

    if (partition_count > 1)
        set_block_bits(pOut, pResults.m_partition, 6, &bit_pos);

    // Endpoints, grouped by component: for each component, low then high of every subset.
    const uint comp_count = (mode >= 4) ? 4 : 3;
    for (uint comp = 0; comp < comp_count; comp++)
    {
        for (uint subset = 0; subset < subset_count; subset++)
        {
            const uint precision = (comp == 3) ? g_bc7_alpha_precision_table[mode] : g_bc7_color_precision_table[mode];
            set_block_bits(pOut, lo[subset].m_c[comp], precision, &bit_pos);
            set_block_bits(pOut, hi[subset].m_c[comp], precision, &bit_pos);
        }
    }

    // P-bits: one per subset when shared, otherwise two.
    for (uint subset = 0; subset < subset_count; subset++)
    {
        set_block_bits(pOut, pbits[subset][0], 1, &bit_pos);
        if (!g_bc7_mode_has_shared_p_bits[mode])
            set_block_bits(pOut, pbits[subset][1], 1, &bit_pos);
    }

    // Selectors; anchor pixels are written with one bit less.
    for (int idx = 0; idx < 16; idx++)
    {
        uint bits = get_bc7_color_index_size(mode, 0);
        const bool is_anchor = (idx == anchor[0]) || (idx == anchor[1]);
        if (is_anchor)
            bits--;
        set_block_bits(pOut, selectors[idx], bits, &bit_pos);
    }

    // Every bit of the block must have been produced.
    assert(bit_pos == 128);
}
1598 
// Encodes a block that contains alpha: runs the color cell compressor once
// for mode 6 over all 16 pixels and packs the result into pBlock.
//
// pBlock       - destination block, written by encode_bc7_block.
// pPixels      - the 16 source pixels.
// pComp_params - user compression settings; m_perceptual is copied into pParams.
// pParams      - compressor parameters; the mode 6 specific fields are overwritten here.
//
// NOTE(review): unlike handle_opaque_block, m_endpoints_share_pbit is not assigned
// here, so whatever value the caller left in pParams is used - confirm that is intended.
void handle_alpha_block(void *pBlock, const(color_quad_u8)* pPixels, 
                        const(bc7enc16_compress_block_params)* pComp_params, 
                        color_cell_compressor_params *pParams) @system
{
    bc7_optimization_results opt_results;
    color_cell_compressor_results results6;
    ubyte[16] selectors_temp;

    // Input pixels and error metric.
    pParams.m_pPixels = pPixels;
    pParams.m_num_pixels = 16;
    pParams.m_perceptual = pComp_params.m_perceptual;
    pParams.m_has_alpha = BC7ENC16_TRUE;

    // Mode 6 settings: 16 selector weights, 7 bit components, p-bits enabled.
    pParams.m_num_selector_weights = 16;
    pParams.m_pSelector_weights = g_bc7_weights4.ptr;
    pParams.m_pSelector_weightsx = cast(const(vec4F)*) g_bc7_weights4x.ptr;
    pParams.m_comp_bits = 7;
    pParams.m_has_pbits = BC7ENC16_TRUE;

    // Selectors are written straight into opt_results.
    results6.m_pSelectors = opt_results.m_selectors.ptr;
    results6.m_pSelectors_temp = selectors_temp.ptr;

    // The returned error is not needed: there is nothing to compare it against.
    color_cell_compression(6, pParams, &results6, pComp_params);

    opt_results.m_mode = 6;
    opt_results.m_partition = 0;
    opt_results.m_low[0] = results6.m_low_endpoint;
    opt_results.m_high[0] = results6.m_high_endpoint;
    foreach (e; 0 .. 2)
        opt_results.m_pbits[0][e] = results6.m_pbits[e];

    encode_bc7_block(pBlock, &opt_results);
}
1632 
// Encodes a fully opaque block. Mode 6 is always tried first; if it is not
// already error free and mode 1 is enabled, a single estimated partition is
// tried in mode 1 and kept only when its total error is strictly lower.
//
// pBlock       - destination block, written by encode_bc7_block.
// pPixels      - the 16 source pixels.
// pComp_params - user compression settings (m_perceptual, m_max_partitions_mode1, ...).
// pParams      - compressor parameters; m_weights is read, the mode specific fields are overwritten.
static void handle_opaque_block(void *pBlock, 
                                const(color_quad_u8)* pPixels, 
                                const(bc7enc16_compress_block_params)* pComp_params, 
                                color_cell_compressor_params *pParams) @system
{
    ubyte[16] scratch_selectors;
    bc7_optimization_results best;

    // ---- Mode 6: one subset, 16 selector weights, 7 bit components, separate p-bits.
    pParams.m_pPixels = pPixels;
    pParams.m_num_pixels = 16;
    pParams.m_perceptual = pComp_params.m_perceptual;
    pParams.m_has_alpha = BC7ENC16_FALSE;
    pParams.m_num_selector_weights = 16;
    pParams.m_pSelector_weights = g_bc7_weights4.ptr;
    pParams.m_pSelector_weightsx = cast(const vec4F *)g_bc7_weights4x;
    pParams.m_comp_bits = 7;
    pParams.m_has_pbits = BC7ENC16_TRUE;
    pParams.m_endpoints_share_pbit = BC7ENC16_FALSE;

    color_cell_compressor_results mode6;
    mode6.m_pSelectors = best.m_selectors.ptr; // selectors land directly in the result
    mode6.m_pSelectors_temp = scratch_selectors.ptr;

    ulong best_err = color_cell_compression(6, pParams, &mode6, pComp_params);

    best.m_mode = 6;
    best.m_partition = 0;
    best.m_low[0] = mode6.m_low_endpoint;
    best.m_high[0] = mode6.m_high_endpoint;
    best.m_pbits[0][0] = mode6.m_pbits[0];
    best.m_pbits[0][1] = mode6.m_pbits[1];

    // ---- Mode 1: two subsets, 8 selector weights, 6 bit components, shared p-bit.
    const bool try_mode1 = (best_err != 0) && (pComp_params.m_max_partitions_mode1 != 0);
    if (try_mode1)
    {
        const uint trial_partition = estimate_partition(pPixels, pComp_params, pParams.m_weights.ptr);

        pParams.m_num_selector_weights = 8;
        pParams.m_pSelector_weights = g_bc7_weights3.ptr;
        pParams.m_pSelector_weightsx = cast(const vec4F *)g_bc7_weights3x;
        pParams.m_comp_bits = 6;
        pParams.m_has_pbits = BC7ENC16_TRUE;
        pParams.m_endpoints_share_pbit = BC7ENC16_TRUE;

        const ubyte *pPartition = &g_bc7_partition2[trial_partition * 16];

        color_quad_u8[16][2] subset_colors;            // pixels of each subset, packed
        ubyte[16][2] subset_pixel_index1;              // original pixel index of each packed pixel
        ubyte[16][2] subset_selectors1;                // selectors in packed order
        uint[2] subset_total_colors1 = [ 0, 0 ];       // number of pixels in each subset
        color_cell_compressor_results[2] subset_results1;

        // Distribute the pixels over the two subsets, remembering where each came from.
        for (uint px = 0; px < 16; px++)
        {
            const uint s = pPartition[px];
            const uint slot = subset_total_colors1[s]++;
            subset_colors[s][slot] = pPixels[px];
            subset_pixel_index1[s][slot] = cast(ubyte)px;
        }

        // Compress each subset on its own; give up as soon as mode 6 is beaten.
        ulong trial_err = 0;
        for (uint s = 0; s < 2; s++)
        {
            color_cell_compressor_results *pSubset = &subset_results1[s];
            pSubset.m_pSelectors = subset_selectors1[s].ptr;
            pSubset.m_pSelectors_temp = scratch_selectors.ptr;

            pParams.m_pPixels = subset_colors[s].ptr;
            pParams.m_num_pixels = subset_total_colors1[s];

            trial_err += color_cell_compression(1, pParams, pSubset, pComp_params);
            if (trial_err > best_err)
                break;
        }

        if (trial_err < best_err)
        {
            best_err = trial_err;
            best.m_mode = 1;
            best.m_partition = trial_partition;

            for (uint s = 0; s < 2; s++)
            {
                // Put the packed selectors back at their original pixel positions.
                const uint count = subset_total_colors1[s];
                for (uint j = 0; j < count; j++)
                {
                    const uint px = subset_pixel_index1[s][j];
                    best.m_selectors[px] = subset_selectors1[s][j];
                }

                best.m_low[s] = subset_results1[s].m_low_endpoint;
                best.m_high[s] = subset_results1[s].m_high_endpoint;
                // Only one p-bit per subset is taken over for mode 1.
                best.m_pbits[s][0] = subset_results1[s].m_pbits[0];
            }
        }
    }

    encode_bc7_block(pBlock, &best);
}
1730 
// Packs a single block of 16 RGBA pixels (a 4x4 block, R first in memory) into the 128-bit BC7 block pBlock, using mode 1 and/or mode 6.
// Blocks containing alpha always use mode 6; opaque blocks use whichever of modes 1 and 6 the opaque path selects.
// Returns BC7ENC16_TRUE if any pixel of the block has alpha < 255, otherwise BC7ENC16_FALSE. (This is not an error code - a block is always encoded.)
//
// pBlock       - destination for the encoded block.
// pPixelsRGBA  - the 16 source pixels, 4 bytes each.
// pComp_params - compression settings; m_weights and m_perceptual are read here.
bc7enc16_bool bc7enc16_compress_block(void *pBlock, 
                                      const(void)* pPixelsRGBA, 
                                      const(bc7enc16_compress_block_params)* pComp_params) @system
{
    // Checks a lookup table entry that is expected to be non-zero.
    // NOTE(review): presumably guards against the table setup not having run - confirm.
    assert(g_bc7_mode_1_optimal_endpoints[255][0].m_hi != 0);

    const color_quad_u8 *pPixels = cast(const color_quad_u8 *)(pPixelsRGBA);

    color_cell_compressor_params params;
    if (!pComp_params.m_perceptual)
    {
        // Plain RGB(A) error: take the user weights unchanged.
        memcpy(params.m_weights.ptr, pComp_params.m_weights.ptr, params.m_weights.sizeof);
    }
    else
    {
        // Perceptual error: all weights are scaled by 4, and the second and third
        // weights additionally by the factors below.
        // https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.709_conversion
        const float pr_weight = (.5f / (1.0f - .2126f)) * (.5f / (1.0f - .2126f));
        const float pb_weight = (.5f / (1.0f - .0722f)) * (.5f / (1.0f - .0722f));
        params.m_weights[0] = cast(int)(pComp_params.m_weights[0] * 4.0f);
        params.m_weights[1] = cast(int)(pComp_params.m_weights[1] * 4.0f * pr_weight);
        params.m_weights[2] = cast(int)(pComp_params.m_weights[2] * 4.0f * pb_weight);
        params.m_weights[3] = pComp_params.m_weights[3] * 4;
    }

    // A single non-opaque pixel is enough to send the whole block down the alpha path.
    bool has_alpha = false;
    for (uint px = 0; px < 16; px++)
    {
        if (pPixels[px].m_c[3] < 255)
        {
            has_alpha = true;
            break;
        }
    }

    if (has_alpha)
    {
        handle_alpha_block(pBlock, pPixels, pComp_params, &params);
        return BC7ENC16_TRUE;
    }

    handle_opaque_block(pBlock, pPixels, pComp_params, &params);
    return BC7ENC16_FALSE;
}
1767 
1768 /*
1769 ------------------------------------------------------------------------------
1770 This software is available under 2 licenses -- choose whichever you prefer.
1771 ------------------------------------------------------------------------------
1772 ALTERNATIVE A - MIT License
1773 Copyright(c) 2018 Richard Geldreich, Jr.
1774 Permission is hereby granted, free of charge, to any person obtaining a copy of
1775 this software and associated documentation files(the "Software"), to deal in
1776 the Software without restriction, including without limitation the rights to
1777 use, copy, modify, merge, publish, distribute, sublicense, and / or sell copies
1778 of the Software, and to permit persons to whom the Software is furnished to do
1779 so, subject to the following conditions :
1780 The above copyright notice and this permission notice shall be included in all
1781 copies or substantial portions of the Software.
1782 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
1783 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
1784 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE
1785 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
1786 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
1787 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
1788 SOFTWARE.
1789 ------------------------------------------------------------------------------
1790 ALTERNATIVE B - Public Domain(www.unlicense.org)
1791 This is free and unencumbered software released into the public domain.
1792 Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
1793 software, either in source code form or as a compiled binary, for any purpose,
1794 commercial or non - commercial, and by any means.
1795 In jurisdictions that recognize copyright laws, the author or authors of this
1796 software dedicate any and all copyright interest in the software to the public
1797 domain.We make this dedication for the benefit of the public at large and to
1798 the detriment of our heirs and successors.We intend this dedication to be an
1799 overt act of relinquishment in perpetuity of all present and future rights to
1800 this software under copyright law.
1801 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
1802 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
1803 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE
1804 AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
1805 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
1806 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
1807 ------------------------------------------------------------------------------
1808 */