/// BC7 encoding image loading.
/// D translation of bc7enc16 d3b037f33b8c6df184177a0ae6a0f4cfec1434ad
module gamut.codecs.bc7enc16;

import core.stdc.string: memset, memcpy;
import std.math: abs, sqrt, floor;
import gamut.internals.mutex;

// File: bc7enc16.h - Richard Geldreich, Jr. - MIT license or public domain (see end of bc7enc16.c)

enum BC7ENC16_BLOCK_SIZE = 16;
enum BC7ENC16_MAX_PARTITIONS1 = 64;
enum BC7ENC16_MAX_UBER_LEVEL = 4;

alias bc7enc16_bool = ubyte;
enum BC7ENC16_TRUE = 1;
enum BC7ENC16_FALSE = 0;

nothrow @nogc @safe:

// User-facing knobs for the block compressor.
struct bc7enc16_compress_block_params
{
    // Number of mode 1 partitions to consider: 0 disables mode 1, BC7ENC16_MAX_PARTITIONS1 is the maximum.
    // Larger values are slower but give higher quality.
    uint m_max_partitions_mode1;

    // Relative per-channel weights, interpreted as RGBA or YCbCrA.
    uint[4] m_weights;

    // Search effort, from 0 to BC7ENC16_MAX_UBER_LEVEL. Larger values are slower but give higher quality.
    uint m_uber_level;

    // When true, colorspace error is measured in YCbCr space; otherwise in RGB.
    bc7enc16_bool m_perceptual;

    // Set to false for slightly faster, slightly lower quality compression.
    bc7enc16_bool m_try_least_squares;

    // When set, the mode 1 partition estimator skips lesser used partition patterns unless they are
    // strongly predicted to be potentially useful. This costs a little quality (around .08 dB RGB PSNR
    // or .05 dB Y PSNR) for up to an 11% gain in speed, depending on the other settings.
    bc7enc16_bool m_mode1_partition_estimation_filterbank;
}

// Selects plain RGB error with every channel weighted equally.
void bc7enc16_compress_block_params_init_linear_weights(bc7enc16_compress_block_params *p) pure
{
    p.m_perceptual = BC7ENC16_FALSE;
    p.m_weights[] = 1;
}

// Selects YCbCr error with the perceptual channel weights 128, 64, 16, 32.
void bc7enc16_compress_block_params_init_perceptual_weights(bc7enc16_compress_block_params *p) pure
{
    p.m_perceptual = BC7ENC16_TRUE;
    p.m_weights[0] = 128;
    p.m_weights[1] = 64;
    p.m_weights[2] = 16;
    p.m_weights[3] = 32;
}

// Fills *p with the defaults: all mode 1 partitions, least squares on, filterbank on,
// uber level 0 and perceptual weights.
void bc7enc16_compress_block_params_init(bc7enc16_compress_block_params *p) pure
{
    p.m_uber_level = 0;
    p.m_max_partitions_mode1 = BC7ENC16_MAX_PARTITIONS1;
    p.m_mode1_partition_estimation_filterbank = BC7ENC16_TRUE;
    p.m_try_least_squares = BC7ENC16_TRUE;
    bc7enc16_compress_block_params_init_perceptual_weights(p);
}


// File: bc7enc16.c - Richard Geldreich, Jr. 4/2018 - MIT license or public domain (see end of file)

// Helpers

// Limits value to [low, high].
int clampi(int value, int low, int high) pure
{
    return (value < low) ? low : ((value > high) ? high : value);
}

// Limits value to [low, high].
float clampf(float value, float low, float high) pure
{
    return (value < low) ? low : ((value > high) ? high : value);
}

// Limits value to [0, 1].
float saturate(float value) pure
{
    return clampf(value, 0.0f, 1.0f);
}

ubyte minimumub(ubyte a, ubyte b) pure
{
    if (a < b)
        return a;
    return b;
}

uint minimumu(uint a, uint b) pure
{
    if (a < b)
        return a;
    return b;
}

float minimumf(float a, float b) pure
{
    if (a < b)
        return a;
    return b;
}

ubyte maximumub(ubyte a, ubyte b) pure
{
    if (a > b)
        return a;
    return b;
}

uint maximumu(uint a, uint b) pure
{
    if (a > b)
        return a;
    return b;
}

float maximumf(float a, float b) pure
{
    if (a > b)
        return a;
    return b;
}

int squarei(int i) pure
{
    return i * i;
}

float squaref(float i) pure
{
    return i * i;
}

// Four 8-bit components; index 0..3 is R, G, B, A.
struct color_quad_u8
{
    ubyte[4] m_c;
}

// Four float components.
struct vec4F
{
    float[4] m_c;
}

// Stores r, g, b, a into *pRes, each clamped to [0, 255]. Returns pRes.
color_quad_u8 *color_quad_u8_set_clamped(color_quad_u8 *pRes, int r, int g, int b, int a) pure @system
{
    const ubyte red   = cast(ubyte)clampi(r, 0, 255);
    const ubyte green = cast(ubyte)clampi(g, 0, 255);
    const ubyte blue  = cast(ubyte)clampi(b, 0, 255);
    const ubyte alpha = cast(ubyte)clampi(a, 0, 255);

    pRes.m_c[0] = red;
    pRes.m_c[1] = green;
    pRes.m_c[2] = blue;
    pRes.m_c[3] = alpha;
    return pRes;
}

// Stores r, g, b, a into *pRes without clamping; the inputs must already fit in 8 bits. Returns pRes.
color_quad_u8 *color_quad_u8_set(color_quad_u8 *pRes, int r, int g, int b, int a) pure @system
{
    // OR-ing the inputs catches any negative or > 255 component with a single compare.
    assert(cast(uint)(r | g | b | a) <= 255);

    pRes.m_c[0] = cast(ubyte)r;
    pRes.m_c[1] = cast(ubyte)g;
    pRes.m_c[2] = cast(ubyte)b;
    pRes.m_c[3] = cast(ubyte)a;
    return pRes;
}

// Non-zero when the two colors differ in any component.
bc7enc16_bool color_quad_u8_notequals(ref const(color_quad_u8) pLHS, ref const(color_quad_u8) pRHS) pure
{
    return pLHS.m_c != pRHS.m_c;
}

// Sets every component of *pV to x. Returns pV.
vec4F* vec4F_set_scalar(vec4F *pV, float x) pure
{
    pV.m_c[] = x;
    return pV;
}

// Sets *pV to (x, y, z, w). Returns pV.
vec4F* vec4F_set(vec4F *pV, float x, float y, float z, float w) pure
{
    pV.m_c[0] = x;
    pV.m_c[1] = y;
    pV.m_c[2] = z;
    pV.m_c[3] = w;
    return pV;
}

// Clamps every component of pV to [0, 1].
void vec4F_saturate_in_place(ref vec4F pV) pure
{
    foreach (ref comp; pV.m_c)
        comp = saturate(comp);
}

// Returns a copy of *pV with every component clamped to [0, 1].
vec4F vec4F_saturate(const(vec4F)* pV) pure
{
    vec4F res;
    foreach (i, ref dst; res.m_c)
        dst = saturate(pV.m_c[i]);
    return res;
}

// Converts an 8-bit color to floats; values stay in the 0..255 range.
vec4F vec4F_from_color(const(color_quad_u8)* pC) pure @trusted
{
    vec4F res;
    foreach (i, ref dst; res.m_c)
        dst = pC.m_c[i];
    return res;
}

// Component-wise sum.
vec4F vec4F_add(const(vec4F)* pLHS, const(vec4F)* pRHS) pure @trusted
{
    vec4F res;
    foreach (i, ref dst; res.m_c)
        dst = pLHS.m_c[i] + pRHS.m_c[i];
    return res;
}

// Component-wise difference.
vec4F vec4F_sub(const(vec4F)* pLHS, const(vec4F)* pRHS) pure @trusted
{
    vec4F res;
    foreach (i, ref dst; res.m_c)
        dst = pLHS.m_c[i] - pRHS.m_c[i];
    return res;
}

// Four-component dot product.
float vec4F_dot(const(vec4F)* pLHS, const(vec4F)* pRHS) pure
{
    return pLHS.m_c[0] * pRHS.m_c[0] + pLHS.m_c[1] * pRHS.m_c[1]
         + pLHS.m_c[2] * pRHS.m_c[2] + pLHS.m_c[3] * pRHS.m_c[3];
}

// Scales every component by s.
vec4F vec4F_mul(const(vec4F)* pLHS, float s) pure @trusted
{
    vec4F res;
    foreach (i, ref dst; res.m_c)
        dst = pLHS.m_c[i] * s;
    return res;
}

// Scales *pV to unit length. A vector whose squared length is exactly zero is left untouched. Returns pV.
vec4F* vec4F_normalize_in_place(vec4F *pV) pure
{
    const float lenSq = pV.m_c[0] * pV.m_c[0] + pV.m_c[1] * pV.m_c[1] + pV.m_c[2] * pV.m_c[2] + pV.m_c[3] * pV.m_c[3];
    if (lenSq == 0.0f)
        return pV;

    const float invLen = 1.0f / sqrt(lenSq);
    foreach (ref comp; pV.m_c)
        comp *= invLen;
    return pV;
}

// Various BC7 tables
static immutable uint[8] g_bc7_weights3 = [ 0, 9, 18, 27, 37, 46, 55, 64 ];
static immutable uint[16] g_bc7_weights4 = [ 0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64 ];

// Precomputed weight constants used during least fit determination.
// For each entry in g_bc7_weights[]: w * w, (1.0f - w) * w, (1.0f - w) * (1.0f - w), w
static immutable float[8 * 4] g_bc7_weights3x =
[
    0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.019775f, 0.120850f, 0.738525f, 0.140625f,
    0.079102f, 0.202148f, 0.516602f, 0.281250f, 0.177979f, 0.243896f, 0.334229f, 0.421875f,
    0.334229f, 0.243896f, 0.177979f, 0.578125f, 0.516602f, 0.202148f, 0.079102f, 0.718750f,
    0.738525f, 0.120850f, 0.019775f, 0.859375f, 1.000000f, 0.000000f, 0.000000f, 1.000000f
];

static immutable float[16 * 4] g_bc7_weights4x =
[
    0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.003906f, 0.058594f, 0.878906f, 0.062500f,
    0.019775f, 0.120850f, 0.738525f, 0.140625f, 0.041260f, 0.161865f, 0.635010f, 0.203125f,
    0.070557f, 0.195068f, 0.539307f, 0.265625f, 0.107666f, 0.220459f, 0.451416f, 0.328125f,
    0.165039f, 0.241211f, 0.352539f, 0.406250f, 0.219727f, 0.249023f, 0.282227f, 0.468750f,
    0.282227f, 0.249023f, 0.219727f, 0.531250f, 0.352539f, 0.241211f, 0.165039f, 0.593750f,
    0.451416f, 0.220459f, 0.107666f, 0.671875f, 0.539307f, 0.195068f, 0.070557f, 0.734375f,
    0.635010f, 0.161865f, 0.041260f, 0.796875f, 0.738525f, 0.120850f, 0.019775f, 0.859375f,
    0.878906f, 0.058594f, 0.003906f, 0.937500f, 1.000000f, 0.000000f, 0.000000f, 1.000000f
];

// Single-subset pattern: every listed entry is subset 0.
static immutable ubyte[64] g_bc7_partition1 = [ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 ];

// Two-subset patterns: 64 partitions, 16 subset ids (one per pixel) each.
static immutable ubyte[64*16] g_bc7_partition2 =
[
    0,0,1,1,0,0,1,1,0,0,1,1,0,0,1,1,        0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,        0,1,1,1,0,1,1,1,0,1,1,1,0,1,1,1,        0,0,0,1,0,0,1,1,0,0,1,1,0,1,1,1,        0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,        0,0,1,1,0,1,1,1,0,1,1,1,1,1,1,1,        0,0,0,1,0,0,1,1,0,1,1,1,1,1,1,1,        0,0,0,0,0,0,0,1,0,0,1,1,0,1,1,1,
    0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,        0,0,1,1,0,1,1,1,1,1,1,1,1,1,1,1,        0,0,0,0,0,0,0,1,0,1,1,1,1,1,1,1,        0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,1,        0,0,0,1,0,1,1,1,1,1,1,1,1,1,1,1,        0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,        0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,        0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,
    0,0,0,0,1,0,0,0,1,1,1,0,1,1,1,1,        0,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,        0,0,0,0,0,0,0,0,1,0,0,0,1,1,1,0,        0,1,1,1,0,0,1,1,0,0,0,1,0,0,0,0,        0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,        0,0,0,0,1,0,0,0,1,1,0,0,1,1,1,0,        0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,        0,1,1,1,0,0,1,1,0,0,1,1,0,0,0,1,
    0,0,1,1,0,0,0,1,0,0,0,1,0,0,0,0,        0,0,0,0,1,0,0,0,1,0,0,0,1,1,0,0,        0,1,1,0,0,1,1,0,0,1,1,0,0,1,1,0,        0,0,1,1,0,1,1,0,0,1,1,0,1,1,0,0,        0,0,0,1,0,1,1,1,1,1,1,0,1,0,0,0,        0,0,0,0,1,1,1,1,1,1,1,1,0,0,0,0,        0,1,1,1,0,0,0,1,1,0,0,0,1,1,1,0,        0,0,1,1,1,0,0,1,1,0,0,1,1,1,0,0,
    0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,        0,0,0,0,1,1,1,1,0,0,0,0,1,1,1,1,        0,1,0,1,1,0,1,0,0,1,0,1,1,0,1,0,        0,0,1,1,0,0,1,1,1,1,0,0,1,1,0,0,        0,0,1,1,1,1,0,0,0,0,1,1,1,1,0,0,        0,1,0,1,0,1,0,1,1,0,1,0,1,0,1,0,        0,1,1,0,1,0,0,1,0,1,1,0,1,0,0,1,        0,1,0,1,1,0,1,0,1,0,1,0,0,1,0,1,
    0,1,1,1,0,0,1,1,1,1,0,0,1,1,1,0,        0,0,0,1,0,0,1,1,1,1,0,0,1,0,0,0,        0,0,1,1,0,0,1,0,0,1,0,0,1,1,0,0,        0,0,1,1,1,0,1,1,1,1,0,1,1,1,0,0,        0,1,1,0,1,0,0,1,1,0,0,1,0,1,1,0,        0,0,1,1,1,1,0,0,1,1,0,0,0,0,1,1,        0,1,1,0,0,1,1,0,1,0,0,1,1,0,0,1,        0,0,0,0,0,1,1,0,0,1,1,0,0,0,0,0,
    0,1,0,0,1,1,1,0,0,1,0,0,0,0,0,0,        0,0,1,0,0,1,1,1,0,0,1,0,0,0,0,0,        0,0,0,0,0,0,1,0,0,1,1,1,0,0,1,0,        0,0,0,0,0,1,0,0,1,1,1,0,0,1,0,0,        0,1,1,0,1,1,0,0,1,0,0,1,0,0,1,1,        0,0,1,1,0,1,1,0,1,1,0,0,1,0,0,1,        0,1,1,0,0,0,1,1,1,0,0,1,1,1,0,0,        0,0,1,1,1,0,0,1,1,1,0,0,0,1,1,0,
    0,1,1,0,1,1,0,0,1,1,0,0,1,0,0,1,        0,1,1,0,0,0,1,1,0,0,1,1,1,0,0,1,        0,1,1,1,1,1,1,0,1,0,0,0,0,0,0,1,        0,0,0,1,1,0,0,0,1,1,1,0,0,1,1,1,        0,0,0,0,1,1,1,1,0,0,1,1,0,0,1,1,        0,0,1,1,0,0,1,1,1,1,1,1,0,0,0,0,        0,0,1,0,0,0,1,0,1,1,1,0,1,1,1,0,        0,1,0,0,0,1,0,0,0,1,1,1,0,1,1,1
];

static immutable ubyte[64] g_bc7_table_anchor_index_second_subset =
[
    15,15,15,15,15,15,15,15,        15,15,15,15,15,15,15,15,
    15, 2, 8, 2, 2, 8, 8,15,         2, 8, 2, 2, 8, 8, 2, 2,
    15,15, 6, 8, 2, 8,15,15,         2, 8, 2, 2, 2,15,15, 6,
     6, 2, 6, 8,15,15, 2, 2,        15,15,15,15,15, 2, 2,15
];

// Per-mode properties, indexed by BC7 mode number 0..7.
static immutable ubyte[8] g_bc7_num_subsets = [ 3, 2, 3, 2, 1, 1, 1, 2 ];
static immutable ubyte[8] g_bc7_partition_bits = [ 4, 6, 6, 6, 0, 0, 0, 6 ];
static immutable ubyte[8] g_bc7_color_index_bitcount = [ 3, 3, 2, 2, 2, 2, 4, 2 ];

// Color index width for `mode`: the table entry plus the index selection bit.
int get_bc7_color_index_size(int mode, int index_selection_bit) pure
{
    const int baseBits = g_bc7_color_index_bitcount[mode];
    return baseBits + index_selection_bit;
}

static immutable ubyte[8] g_bc7_mode_has_p_bits = [ 1, 1, 0, 1, 0, 0, 1, 1 ];
static immutable ubyte[8] g_bc7_mode_has_shared_p_bits = [ 0, 1, 0, 0, 0, 0, 0, 0 ];
static immutable ubyte[8] g_bc7_color_precision_table = [ 4, 6, 5, 7, 5, 7, 7, 5 ];
static immutable byte[8] g_bc7_alpha_precision_table = [ 0, 0, 0, 0, 6, 8, 7, 5 ];

// One entry of the single-color lookup table: the best low/high endpoint pair and its squared error.
struct endpoint_err
{
    ushort m_error;
    ubyte m_lo;
    ubyte m_hi;
}

__gshared endpoint_err[2][256] g_bc7_mode_1_optimal_endpoints; // [c][pbit]
__gshared Mutex g_tableProtect;
__gshared bool g_tableInitialized = false;

enum uint BC7ENC16_MODE_1_OPTIMAL_INDEX = 2;

// Initialize the lookup table used for optimal single color compression in mode 1
// Warning: bc7enc16_compress_block_init() MUST be called before calling bc7enc16_compress_block() (or you'll get artifacts).
// Note: this is racy, so we use a self-init mutex.
// Fills g_bc7_mode_1_optimal_endpoints: for every 8-bit target value c and every p-bit,
// the 6-bit low/high endpoint pair whose interpolation at selector
// BC7ENC16_MODE_1_OPTIMAL_INDEX lands closest to c.
// Guarded by g_tableProtect; calls after the first one return immediately.
void bc7enc16_compress_block_init() @trusted
{
    g_tableProtect.lockLazy();
    scope(exit) g_tableProtect.unlock();

    if (g_tableInitialized)
        return;

    for (int c = 0; c < 256; c++)
    {
        for (uint lp = 0; lp < 2; lp++)
        {
            endpoint_err best;
            best.m_error = ushort.max;
            for (uint l = 0; l < 64; l++)
            {
                // 6 endpoint bits + p-bit = 7 bits, expanded to 8 by replicating the top bit.
                uint low = ((l << 1) | lp) << 1;
                low |= (low >> 7);
                for (uint h = 0; h < 64; h++)
                {
                    uint high = ((h << 1) | lp) << 1;
                    high |= (high >> 7);
                    const int k = (low * (64 - g_bc7_weights3[BC7ENC16_MODE_1_OPTIMAL_INDEX]) + high * g_bc7_weights3[BC7ENC16_MODE_1_OPTIMAL_INDEX] + 32) >> 6;
                    const int err = (k - c) * (k - c);
                    if (err < best.m_error)
                    {
                        best.m_error = cast(ushort)err;
                        best.m_lo = cast(ubyte)l;
                        best.m_hi = cast(ubyte)h;
                    }
                }
            }
            g_bc7_mode_1_optimal_endpoints[c][lp] = best;
        }
    }

    // Publish the flag only once the table is complete, so the flag never claims more than is true.
    g_tableInitialized = true;
}

// Least-squares fit of the low/high endpoints (all four channels) for a fixed selector assignment.
//   N                  number of pixels; pSelectors and pColors must hold N entries
//   pSelectors         selector chosen for each pixel
//   pSelector_weights  per-selector constants, read as (m_c[0], m_c[1], m_c[2], m_c[3]) and
//                      accumulated into z00, z10, z11 and the q00 sums respectively
//   pXl, pXh           receive the fitted endpoints, in the same scale as pColors
// If the 2x2 system is singular (det == 0) both outputs are all zeros.
void compute_least_squares_endpoints_rgba(uint N,
                                          const(ubyte)* pSelectors,
                                          const(vec4F)* pSelector_weights,
                                          vec4F *pXl,
                                          vec4F *pXh,
                                          const(color_quad_u8)* pColors) @system
{
    // Least squares using normal equations: http://www.cs.cornell.edu/~bindel/class/cs3220-s12/notes/lec10.pdf
    // I did this in matrix form first, expanded out all the ops, then optimized it a bit.
    float z00 = 0.0f, z01 = 0.0f, z10 = 0.0f, z11 = 0.0f;
    float q00_r = 0.0f, q10_r = 0.0f, t_r = 0.0f;
    float q00_g = 0.0f, q10_g = 0.0f, t_g = 0.0f;
    float q00_b = 0.0f, q10_b = 0.0f, t_b = 0.0f;
    float q00_a = 0.0f, q10_a = 0.0f, t_a = 0.0f;
    for (uint i = 0; i < N; i++)
    {
        const uint sel = pSelectors[i];
        z00 += pSelector_weights[sel].m_c[0];
        z10 += pSelector_weights[sel].m_c[1];
        z11 += pSelector_weights[sel].m_c[2];
        float w = pSelector_weights[sel].m_c[3];
        q00_r += w * pColors[i].m_c[0]; t_r += pColors[i].m_c[0];
        q00_g += w * pColors[i].m_c[1]; t_g += pColors[i].m_c[1];
        q00_b += w * pColors[i].m_c[2]; t_b += pColors[i].m_c[2];
        q00_a += w * pColors[i].m_c[3]; t_a += pColors[i].m_c[3];
    }

    // Sum of (1 - w) * color, obtained from the plain sum minus the w-weighted sum.
    q10_r = t_r - q00_r;
    q10_g = t_g - q00_g;
    q10_b = t_b - q00_b;
    q10_a = t_a - q00_a;

    // The matrix is symmetric.
    z01 = z10;

    float det = z00 * z11 - z01 * z10;
    if (det != 0.0f)
        det = 1.0f / det;

    // Inverse of the 2x2 matrix.
    float iz00, iz01, iz10, iz11;
    iz00 = z11 * det;
    iz01 = -z01 * det;
    iz10 = -z10 * det;
    iz11 = z00 * det;

    pXl.m_c[0] = cast(float)(iz00 * q00_r + iz01 * q10_r); pXh.m_c[0] = cast(float)(iz10 * q00_r + iz11 * q10_r);
    pXl.m_c[1] = cast(float)(iz00 * q00_g + iz01 * q10_g); pXh.m_c[1] = cast(float)(iz10 * q00_g + iz11 * q10_g);
    pXl.m_c[2] = cast(float)(iz00 * q00_b + iz01 * q10_b); pXh.m_c[2] = cast(float)(iz10 * q00_b + iz11 * q10_b);
    pXl.m_c[3] = cast(float)(iz00 * q00_a + iz01 * q10_a); pXh.m_c[3] = cast(float)(iz10 * q00_a + iz11 * q10_a);
}

// RGB-only variant of compute_least_squares_endpoints_rgba: the alpha of the input colors is
// ignored and the alpha of both fitted endpoints is set to 255.
void compute_least_squares_endpoints_rgb(uint N, const ubyte *pSelectors,
                                         const(vec4F)* pSelector_weights,
                                         vec4F *pXl, vec4F *pXh, const(color_quad_u8)*pColors) @system
{
    float z00 = 0.0f, z01 = 0.0f, z10 = 0.0f, z11 = 0.0f;
    float q00_r = 0.0f, q10_r = 0.0f, t_r = 0.0f;
    float q00_g = 0.0f, q10_g = 0.0f, t_g = 0.0f;
    float q00_b = 0.0f, q10_b = 0.0f, t_b = 0.0f;
    for (uint i = 0; i < N; i++)
    {
        const uint sel = pSelectors[i];
        z00 += pSelector_weights[sel].m_c[0];
        z10 += pSelector_weights[sel].m_c[1];
        z11 += pSelector_weights[sel].m_c[2];
        float w = pSelector_weights[sel].m_c[3];
        q00_r += w * pColors[i].m_c[0]; t_r += pColors[i].m_c[0];
        q00_g += w * pColors[i].m_c[1]; t_g += pColors[i].m_c[1];
        q00_b += w * pColors[i].m_c[2]; t_b += pColors[i].m_c[2];
    }

    q10_r = t_r - q00_r;
    q10_g = t_g - q00_g;
    q10_b = t_b - q00_b;

    z01 = z10;

    float det = z00 * z11 - z01 * z10;
    if (det != 0.0f)
        det = 1.0f / det;

    float iz00, iz01, iz10, iz11;
    iz00 = z11 * det;
    iz01 = -z01 * det;
    iz10 = -z10 * det;
    iz11 = z00 * det;

    pXl.m_c[0] = cast(float)(iz00 * q00_r + iz01 * q10_r); pXh.m_c[0] = cast(float)(iz10 * q00_r + iz11 * q10_r);
    pXl.m_c[1] = cast(float)(iz00 * q00_g + iz01 * q10_g); pXh.m_c[1] = cast(float)(iz10 * q00_g + iz11 * q10_g);
    pXl.m_c[2] = cast(float)(iz00 * q00_b + iz01 * q10_b); pXh.m_c[2] = cast(float)(iz10 * q00_b + iz11 * q10_b);
    pXl.m_c[3] = 255.0f; pXh.m_c[3] = 255.0f;
}

// Input description for compressing one cell (a set of pixels sharing an endpoint pair).
struct color_cell_compressor_params
{
    uint m_num_pixels;                      // number of entries readable through m_pPixels
    const(color_quad_u8)* m_pPixels;        // the pixels to fit
    uint m_num_selector_weights;            // number of selector levels
    const(uint)* m_pSelector_weights;       // integer interpolation weight per selector, out of 64
    const(vec4F)* m_pSelector_weightsx;     // per-selector float constants (presumably the table fed to the least-squares fit - confirm at call sites)
    uint m_comp_bits;                       // endpoint bits per component, not counting the p-bit
    uint[4] m_weights;                      // per-channel error weights
    bc7enc16_bool m_has_alpha;              // alpha takes part in the fit and in the error
    bc7enc16_bool m_has_pbits;              // endpoints carry an extra p-bit
    bc7enc16_bool m_endpoints_share_pbit;   // both endpoints use the same p-bit
    bc7enc16_bool m_perceptual;             // measure error in YCbCr instead of RGB
}

// Best solution found so far for one cell.
struct color_cell_compressor_results
{
    ulong m_best_overall_err;       // total error of the stored solution; ulong.max means "none yet"
    color_quad_u8 m_low_endpoint;   // quantized endpoints, without the p-bit
    color_quad_u8 m_high_endpoint;
    uint[2] m_pbits;                // p-bit of the low / high endpoint
    ubyte *m_pSelectors;            // selectors of the stored solution, m_num_pixels entries
    ubyte *m_pSelectors_temp;       // scratch selectors for the solution being evaluated, m_num_pixels entries
}

// Expands a quantized color of n = m_comp_bits (+1 with p-bits) bits per component to 8 bits
// by shifting left and replicating the high bits into the low ones.
color_quad_u8 scale_color(ref const(color_quad_u8) pC, const(color_cell_compressor_params) *pParams) pure
{
    color_quad_u8 results;

    const uint n = pParams.m_comp_bits + (pParams.m_has_pbits ? 1 : 0);
    assert((n >= 4) && (n <= 8));

    for (uint i = 0; i < 4; i++)
    {
        uint v = pC.m_c[i] << (8 - n);
        v |= (v >> n);
        assert(v <= 255);
        results.m_c[i] = cast(ubyte)(v);
    }

    return results;
}

// Weighted squared distance between two colors over R, G, B (alpha ignored).
// With `perceptual` set, the deltas are taken in a luma / Cr / Cb space and weights[0..3]
// apply to those three; otherwise they apply to R, G, B directly.
// `weights` must point to at least 3 values.
ulong compute_color_distance_rgb(const(color_quad_u8)* pE1,
                                 const(color_quad_u8)* pE2,
                                 bc7enc16_bool perceptual,
                                 const(uint)* weights) pure @system
{
    int dr, dg, db;

    if (perceptual)
    {
        // Fixed-point luma (coefficients sum to 512) and the two chroma differences.
        const int l1 = pE1.m_c[0] * 109 + pE1.m_c[1] * 366 + pE1.m_c[2] * 37;
        const int cr1 = (cast(int)pE1.m_c[0] << 9) - l1;
        const int cb1 = (cast(int)pE1.m_c[2] << 9) - l1;
        const int l2 = pE2.m_c[0] * 109 + pE2.m_c[1] * 366 + pE2.m_c[2] * 37;
        const int cr2 = (cast(int)pE2.m_c[0] << 9) - l2;
        const int cb2 = (cast(int)pE2.m_c[2] << 9) - l2;
        dr = (l1 - l2) >> 8;
        dg = (cr1 - cr2) >> 8;
        db = (cb1 - cb2) >> 8;
    }
    else
    {
        dr = cast(int)pE1.m_c[0] - cast(int)pE2.m_c[0];
        dg = cast(int)pE1.m_c[1] - cast(int)pE2.m_c[1];
        db = cast(int)pE1.m_c[2] - cast(int)pE2.m_c[2];
    }

    // Widen before multiplying: a uint * uint product wraps at 32 bits for large caller-supplied
    // weights, which would silently corrupt the error metric. Results are unchanged whenever the
    // 32-bit arithmetic did not overflow.
    return cast(ulong)weights[0] * cast(uint)(dr * dr)
         + cast(ulong)weights[1] * cast(uint)(dg * dg)
         + cast(ulong)weights[2] * cast(uint)(db * db);
}

// Same as compute_color_distance_rgb plus the weighted squared alpha difference.
// `weights` must point to 4 values.
ulong compute_color_distance_rgba(const(color_quad_u8)* pE1, const(color_quad_u8)* pE2, bc7enc16_bool perceptual, const(uint)* weights /* [4] */) @system
{
    int da = cast(int)pE1.m_c[3] - cast(int)pE2.m_c[3];
    // 64-bit product for the same reason as in compute_color_distance_rgb.
    return compute_color_distance_rgb(pE1, pE2, perceptual, weights) + (cast(ulong)weights[3] * cast(uint)(da * da));
}

// Encodes a cell whose pixels all share the color (r, g, b) using the precomputed
// g_bc7_mode_1_optimal_endpoints table (bc7enc16_compress_block_init() must have run).
// Writes the endpoints, p-bits and total error into *pResults, sets the first m_num_pixels
// entries of pSelectors to BC7ENC16_MODE_1_OPTIMAL_INDEX and returns the total error.
ulong pack_mode1_to_one_color(const(color_cell_compressor_params)* pParams,
                              color_cell_compressor_results *pResults,
                              uint r, uint g, uint b, ubyte *pSelectors) @system
{
    uint best_err = uint.max;
    uint best_p = 0;

    // Mode 1 shares one p-bit between both endpoints: pick the p-bit with the lowest summed table error.
    for (uint p = 0; p < 2; p++)
    {
        uint err = g_bc7_mode_1_optimal_endpoints[r][p].m_error + g_bc7_mode_1_optimal_endpoints[g][p].m_error + g_bc7_mode_1_optimal_endpoints[b][p].m_error;
        if (err < best_err)
        {
            best_err = err;
            best_p = p;
        }
    }

    const endpoint_err *pEr = &g_bc7_mode_1_optimal_endpoints[r][best_p];
    const endpoint_err *pEg = &g_bc7_mode_1_optimal_endpoints[g][best_p];
    const endpoint_err *pEb = &g_bc7_mode_1_optimal_endpoints[b][best_p];

    color_quad_u8_set(&pResults.m_low_endpoint, pEr.m_lo, pEg.m_lo, pEb.m_lo, 0);
    color_quad_u8_set(&pResults.m_high_endpoint, pEr.m_hi, pEg.m_hi, pEb.m_hi, 0);
    pResults.m_pbits[0] = best_p;
    pResults.m_pbits[1] = 0;

    memset(pSelectors, BC7ENC16_MODE_1_OPTIMAL_INDEX, pParams.m_num_pixels);

    // Reconstruct the color the decoder will produce, to measure the real error.
    color_quad_u8 p;
    for (uint i = 0; i < 3; i++)
    {
        uint low = ((pResults.m_low_endpoint.m_c[i] << 1) | pResults.m_pbits[0]) << 1;
        low |= (low >> 7);

        uint high = ((pResults.m_high_endpoint.m_c[i] << 1) | pResults.m_pbits[0]) << 1;
        high |= (high >> 7);

        p.m_c[i] = cast(ubyte)((low * (64 - g_bc7_weights3[BC7ENC16_MODE_1_OPTIMAL_INDEX]) + high * g_bc7_weights3[BC7ENC16_MODE_1_OPTIMAL_INDEX] + 32) >> 6);
    }
    p.m_c[3] = 255;

    ulong total_err = 0;
    for (uint i = 0; i < pParams.m_num_pixels; i++)
        total_err += compute_color_distance_rgb(&p, &pParams.m_pPixels[i], pParams.m_perceptual, pParams.m_weights.ptr);

    pResults.m_best_overall_err = total_err;

    return total_err;
}

// Measures the total error of the endpoint pair (*pLow, *pHigh, pbits) over all pixels of the cell.
// Selectors are written to pResults.m_pSelectors_temp; when the total beats
// pResults.m_best_overall_err the endpoints, p-bits and selectors are stored as the new best.
// Returns the total error of this trial, whether or not it was kept.
ulong evaluate_solution(const(color_quad_u8)* pLow, const(color_quad_u8)* pHigh,
                        const(uint)* pbits /*[2]*/, const(color_cell_compressor_params)* pParams,
                        color_cell_compressor_results *pResults) @system
{
    color_quad_u8 quantMinColor = *pLow;
    color_quad_u8 quantMaxColor = *pHigh;

    if (pParams.m_has_pbits)
    {
        uint minPBit, maxPBit;

        if (pParams.m_endpoints_share_pbit)
            maxPBit = minPBit = pbits[0];
        else
        {
            minPBit = pbits[0];
            maxPBit = pbits[1];
        }

        // Append the p-bit as the least significant bit of every component.
        quantMinColor.m_c[0] = cast(ubyte)((pLow.m_c[0] << 1) | minPBit);
        quantMinColor.m_c[1] = cast(ubyte)((pLow.m_c[1] << 1) | minPBit);
        quantMinColor.m_c[2] = cast(ubyte)((pLow.m_c[2] << 1) | minPBit);
        quantMinColor.m_c[3] = cast(ubyte)((pLow.m_c[3] << 1) | minPBit);

        quantMaxColor.m_c[0] = cast(ubyte)((pHigh.m_c[0] << 1) | maxPBit);
        quantMaxColor.m_c[1] = cast(ubyte)((pHigh.m_c[1] << 1) | maxPBit);
        quantMaxColor.m_c[2] = cast(ubyte)((pHigh.m_c[2] << 1) | maxPBit);
        quantMaxColor.m_c[3] = cast(ubyte)((pHigh.m_c[3] << 1) | maxPBit);
    }

    color_quad_u8 actualMinColor = scale_color(quantMinColor, pParams);
    color_quad_u8 actualMaxColor = scale_color(quantMaxColor, pParams);

    const uint N = pParams.m_num_selector_weights;

    // Palette of the N colors the decoder can produce from these endpoints.
    color_quad_u8[16] weightedColors;
    // N indexes weightedColors below; fewer than 2 levels would wrap N - 1.
    assert((N >= 2) && (N <= weightedColors.length));
    weightedColors[0] = actualMinColor;
    weightedColors[N - 1] = actualMaxColor;

    const uint nc = pParams.m_has_alpha ? 4 : 3;
    for (uint i = 1; i < (N - 1); i++)
        for (uint j = 0; j < nc; j++)
            weightedColors[i].m_c[j] = cast(ubyte)((actualMinColor.m_c[j] * (64 - pParams.m_pSelector_weights[i]) + actualMaxColor.m_c[j] * pParams.m_pSelector_weights[i] + 32) >> 6);

    const int lr = actualMinColor.m_c[0];
    const int lg = actualMinColor.m_c[1];
    const int lb = actualMinColor.m_c[2];
    const int dr = actualMaxColor.m_c[0] - lr;
    const int dg = actualMaxColor.m_c[1] - lg;
    const int db = actualMaxColor.m_c[2] - lb;

    ulong total_err = 0;

    if (!pParams.m_perceptual)
    {
        // Linear metric: project each pixel onto the low->high axis to estimate its selector,
        // then compare only that palette entry and the one just below it.
        if (pParams.m_has_alpha)
        {
            const int la = actualMinColor.m_c[3];
            const int da = actualMaxColor.m_c[3] - la;

            // The small constant keeps the division finite when both endpoints coincide.
            const float f = N / cast(float)(squarei(dr) + squarei(dg) + squarei(db) + squarei(da) + .00000125f);

            for (uint i = 0; i < pParams.m_num_pixels; i++)
            {
                const(color_quad_u8)* pC = &pParams.m_pPixels[i];
                int r = pC.m_c[0];
                int g = pC.m_c[1];
                int b = pC.m_c[2];
                int a = pC.m_c[3];

                int best_sel = cast(int)(cast(float)((r - lr) * dr + (g - lg) * dg + (b - lb) * db + (a - la) * da) * f + .5f);
                best_sel = clampi(best_sel, 1, N - 1);

                ulong err0 = compute_color_distance_rgba(&weightedColors[best_sel - 1], pC, BC7ENC16_FALSE, pParams.m_weights.ptr);
                ulong err1 = compute_color_distance_rgba(&weightedColors[best_sel], pC, BC7ENC16_FALSE, pParams.m_weights.ptr);

                if (err1 > err0)
                {
                    err1 = err0;
                    --best_sel;
                }
                total_err += err1;

                pResults.m_pSelectors_temp[i] = cast(ubyte)best_sel;
            }
        }
        else
        {
            const float f = N / cast(float)(squarei(dr) + squarei(dg) + squarei(db) + .00000125f);

            for (uint i = 0; i < pParams.m_num_pixels; i++)
            {
                const color_quad_u8 *pC = &pParams.m_pPixels[i];
                int r = pC.m_c[0];
                int g = pC.m_c[1];
                int b = pC.m_c[2];

                int sel = cast(int)(cast(float)((r - lr) * dr + (g - lg) * dg + (b - lb) * db) * f + .5f);
                sel = clampi(sel, 1, N - 1);

                ulong err0 = compute_color_distance_rgb(&weightedColors[sel - 1], pC, BC7ENC16_FALSE, pParams.m_weights.ptr);
                ulong err1 = compute_color_distance_rgb(&weightedColors[sel], pC, BC7ENC16_FALSE, pParams.m_weights.ptr);

                int best_sel = sel;
                ulong best_err = err1;
                if (err0 < best_err)
                {
                    best_err = err0;
                    best_sel = sel - 1;
                }

                total_err += best_err;

                pResults.m_pSelectors_temp[i] = cast(ubyte)best_sel;
            }
        }
    }
    else
    {
        // Perceptual metric: try every palette entry for every pixel.
        for (uint i = 0; i < pParams.m_num_pixels; i++)
        {
            ulong best_err = ulong.max;
            uint best_sel = 0;

            if (pParams.m_has_alpha)
            {
                for (uint j = 0; j < N; j++)
                {
                    ulong err = compute_color_distance_rgba(&weightedColors[j], &pParams.m_pPixels[i], BC7ENC16_TRUE, pParams.m_weights.ptr);
                    if (err < best_err)
                    {
                        best_err = err;
                        best_sel = j;
                    }
                }
            }
            else
            {
                for (uint j = 0; j < N; j++)
                {
                    ulong err = compute_color_distance_rgb(&weightedColors[j], &pParams.m_pPixels[i], BC7ENC16_TRUE, pParams.m_weights.ptr);
                    if (err < best_err)
                    {
                        best_err = err;
                        best_sel = j;
                    }
                }
            }

            total_err += best_err;

            pResults.m_pSelectors_temp[i] = cast(ubyte)best_sel;
        }
    }

    if (total_err < pResults.m_best_overall_err)
    {
        pResults.m_best_overall_err = total_err;

        pResults.m_low_endpoint = *pLow;
        pResults.m_high_endpoint = *pHigh;

        pResults.m_pbits[0] = pbits[0];
        pResults.m_pbits[1] = pbits[1];

        memcpy(pResults.m_pSelectors, pResults.m_pSelectors_temp, (pResults.m_pSelectors[0]).sizeof * pParams.m_num_pixels);
    }

    return total_err;
}

// Mode 1 only: when a channel of both quantized endpoints collapsed to the same value although
// the unquantized endpoints (pXl, pXh) differ, nudge one endpoint by one step so the channel
// keeps some range. `iscale` is the largest quantized value. Other modes are left untouched.
void fixDegenerateEndpoints(uint mode,
                            ref color_quad_u8 pTrialMinColor,
                            ref color_quad_u8 pTrialMaxColor,
                            ref const(vec4F) pXl, ref const(vec4F) pXh, uint iscale)
{
    if (mode == 1)
    {
        // fix degenerate case where the input collapses to a single colorspace voxel, and we lose all freedom (test with grayscale ramps)
        for (uint i = 0; i < 3; i++)
        {
            if (pTrialMinColor.m_c[i] == pTrialMaxColor.m_c[i])
            {
                if (abs(pXl.m_c[i] - pXh.m_c[i]) > 0.0f)
                {
                    if (pTrialMinColor.m_c[i] > (iscale >> 1))
                    {
                        // Upper half of the range: prefer lowering the min endpoint.
                        if (pTrialMinColor.m_c[i] > 0)
                            pTrialMinColor.m_c[i]--;
                        else
                            if (pTrialMaxColor.m_c[i] < iscale)
                                pTrialMaxColor.m_c[i]++;
                    }
                    else
                    {
                        // Lower half of the range: prefer raising the max endpoint.
                        if (pTrialMaxColor.m_c[i] < iscale)
                            pTrialMaxColor.m_c[i]++;
                        else if (pTrialMinColor.m_c[i] > 0)
                            pTrialMinColor.m_c[i]--;
                    }
                }
            }
        }
    }
}

// Quantizes the float endpoints xl / xh (clamped to [0, 1] on entry) to the precision described
// by pParams, choosing p-bits where the mode has them, and evaluates the result with
// evaluate_solution() unless it equals the solution already stored in *pResults.
// Returns pResults.m_best_overall_err after the attempt.
static ulong find_optimal_solution(uint mode, vec4F xl, vec4F xh, const color_cell_compressor_params *pParams, color_cell_compressor_results *pResults) @system
{
    vec4F_saturate_in_place(xl);
    vec4F_saturate_in_place(xh);

    if (pParams.m_has_pbits)
    {
        const int iscalep = (1 << (pParams.m_comp_bits + 1)) - 1;
        const float scalep = cast(float)iscalep;

        const int totalComps = pParams.m_has_alpha ? 4 : 3;

        uint[2] best_pbits;
        color_quad_u8 bestMinColor, bestMaxColor;

        if (!pParams.m_endpoints_share_pbit)
        {
            // Each endpoint picks its own p-bit independently.
            float best_err0 = 1e+9;
            float best_err1 = 1e+9;

            for (int p = 0; p < 2; p++)
            {
                color_quad_u8 xMinColor, xMaxColor;

                // Notes: The pbit controls which quantization intervals are selected.
                // total_levels=2^(comp_bits+1), where comp_bits=4 for mode 0, etc.
                // pbit 0: v=(b*2)/(total_levels-1), pbit 1: v=(b*2+1)/(total_levels-1) where b is the component bin from [0,total_levels/2-1] and v is the [0,1] component value
                // rearranging you get for pbit 0: b=floor(v*(total_levels-1)/2+.5)
                // rearranging you get for pbit 1: b=floor((v*(total_levels-1)-1)/2+.5)
                for (uint c = 0; c < 4; c++)
                {
                    xMinColor.m_c[c] = cast(ubyte)(clampi((cast(int)((xl.m_c[c] * scalep - p) / 2.0f + .5f)) * 2 + p, p, iscalep - 1 + p));
                    xMaxColor.m_c[c] = cast(ubyte)(clampi((cast(int)((xh.m_c[c] * scalep - p) / 2.0f + .5f)) * 2 + p, p, iscalep - 1 + p));
                }

                color_quad_u8 scaledLow = scale_color(xMinColor, pParams);
                color_quad_u8 scaledHigh = scale_color(xMaxColor, pParams);

                // Quantization error of each endpoint, measured in 0..255 units.
                float err0 = 0, err1 = 0;
                for (int i = 0; i < totalComps; i++)
                {
                    err0 += squaref(scaledLow.m_c[i] - xl.m_c[i] * 255.0f);
                    err1 += squaref(scaledHigh.m_c[i] - xh.m_c[i] * 255.0f);
                }

                if (err0 < best_err0)
                {
                    best_err0 = err0;
                    best_pbits[0] = p;

                    // Stored endpoints exclude the p-bit.
                    bestMinColor.m_c[0] = xMinColor.m_c[0] >> 1;
                    bestMinColor.m_c[1] = xMinColor.m_c[1] >> 1;
                    bestMinColor.m_c[2] = xMinColor.m_c[2] >> 1;
                    bestMinColor.m_c[3] = xMinColor.m_c[3] >> 1;
                }

                if (err1 < best_err1)
                {
                    best_err1 = err1;
                    best_pbits[1] = p;

                    bestMaxColor.m_c[0] = xMaxColor.m_c[0] >> 1;
                    bestMaxColor.m_c[1] = xMaxColor.m_c[1] >> 1;
                    bestMaxColor.m_c[2] = xMaxColor.m_c[2] >> 1;
                    bestMaxColor.m_c[3] = xMaxColor.m_c[3] >> 1;
                }
            }
        }
        else
        {
            // Endpoints share pbits
            float best_err = 1e+9;

            for (int p = 0; p < 2; p++)
            {
                color_quad_u8 xMinColor, xMaxColor;
                for (uint c = 0; c < 4; c++)
                {
                    xMinColor.m_c[c] = cast(ubyte)(clampi((cast(int)((xl.m_c[c] * scalep - p) / 2.0f + .5f)) * 2 + p, p, iscalep - 1 + p));
                    xMaxColor.m_c[c] = cast(ubyte)(clampi((cast(int)((xh.m_c[c] * scalep - p) / 2.0f + .5f)) * 2 + p, p, iscalep - 1 + p));
                }

                color_quad_u8 scaledLow = scale_color(xMinColor, pParams);
                color_quad_u8 scaledHigh = scale_color(xMaxColor, pParams);

                // Combined quantization error of both endpoints, measured in 0..1 units.
                float err = 0;
                for (int i = 0; i < totalComps; i++)
                    err += squaref((scaledLow.m_c[i] / 255.0f) - xl.m_c[i]) + squaref((scaledHigh.m_c[i] / 255.0f) - xh.m_c[i]);

                if (err < best_err)
                {
                    best_err = err;
                    best_pbits[0] = p;
                    best_pbits[1] = p;
                    for (uint j = 0; j < 4; j++)
                    {
                        bestMinColor.m_c[j] = xMinColor.m_c[j] >> 1;
                        bestMaxColor.m_c[j] = xMaxColor.m_c[j] >> 1;
                    }
                }
            }
        }

        fixDegenerateEndpoints(mode, bestMinColor, bestMaxColor, xl, xh, iscalep >> 1);

        // Skip the evaluation when this exact solution is already the stored one.
        if (   (pResults.m_best_overall_err == ulong.max)
            || color_quad_u8_notequals(bestMinColor, pResults.m_low_endpoint)
            || color_quad_u8_notequals(bestMaxColor, pResults.m_high_endpoint)
            || (best_pbits[0] != pResults.m_pbits[0])
            || (best_pbits[1] != pResults.m_pbits[1]) )
            evaluate_solution(&bestMinColor, &bestMaxColor, best_pbits.ptr, pParams, pResults);
    }
    else
    {
        // No p-bits: plain round-to-nearest quantization.
        const int iscale = (1 << pParams.m_comp_bits) - 1;
        const float scale = cast(float)iscale;

        color_quad_u8 trialMinColor, trialMaxColor;
        color_quad_u8_set_clamped(&trialMinColor, cast(int)(xl.m_c[0] * scale + .5f), cast(int)(xl.m_c[1] * scale + .5f), cast(int)(xl.m_c[2] * scale + .5f), cast(int)(xl.m_c[3] * scale + .5f));
        color_quad_u8_set_clamped(&trialMaxColor, cast(int)(xh.m_c[0] * scale + .5f), cast(int)(xh.m_c[1] * scale + .5f), cast(int)(xh.m_c[2] * scale + .5f), cast(int)(xh.m_c[3] * scale + .5f));

        fixDegenerateEndpoints(mode, trialMinColor, trialMaxColor, xl, xh, iscale);

        if (   (pResults.m_best_overall_err == ulong.max)
            || color_quad_u8_notequals(trialMinColor, pResults.m_low_endpoint)
            || color_quad_u8_notequals(trialMaxColor, pResults.m_high_endpoint) )
            evaluate_solution(&trialMinColor, &trialMaxColor, pResults.m_pbits.ptr, pParams, pResults);
    }

    return pResults.m_best_overall_err;
}

ulong color_cell_compression(uint mode,
                             const(color_cell_compressor_params)* pParams,
                             color_cell_compressor_results *pResults,
                             const(bc7enc16_compress_block_params)* pComp_params) @system
{
    assert((mode == 6) || (!pParams.m_has_alpha));

    pResults.m_best_overall_err = ulong.max;

    // If the partition's colors are all the same in mode 1, then just pack them as a single color.
    if (mode == 1)
    {
        const uint cr = pParams.m_pPixels[0].m_c[0], cg = pParams.m_pPixels[0].m_c[1], cb = pParams.m_pPixels[0].m_c[2];

        bc7enc16_bool allSame = BC7ENC16_TRUE;
        for (uint i = 1; i < pParams.m_num_pixels; i++)
        {
            if ((cr != pParams.m_pPixels[i].m_c[0]) || (cg != pParams.m_pPixels[i].m_c[1]) || (cb != pParams.m_pPixels[i].m_c[2]))
            {
                allSame = BC7ENC16_FALSE;
                break;
            }
        }

        if (allSame)
            return pack_mode1_to_one_color(pParams, pResults, cr, cg, cb, pResults.m_pSelectors);
    }

    // Compute partition's mean color and principle axis.
955 vec4F meanColor, axis; 956 vec4F_set_scalar(&meanColor, 0.0f); 957 958 for (uint i = 0; i < pParams.m_num_pixels; i++) 959 { 960 vec4F color = vec4F_from_color(&pParams.m_pPixels[i]); 961 meanColor = vec4F_add(&meanColor, &color); 962 } 963 964 vec4F meanColorScaled = vec4F_mul(&meanColor, 1.0f / cast(float)(pParams.m_num_pixels)); 965 966 meanColor = vec4F_mul(&meanColor, 1.0f / cast(float)(pParams.m_num_pixels * 255.0f)); 967 vec4F_saturate_in_place(meanColor); 968 969 if (pParams.m_has_alpha) 970 { 971 // Use incremental PCA for RGBA PCA, because it's simple. 972 vec4F_set_scalar(&axis, 0.0f); 973 for (uint i = 0; i < pParams.m_num_pixels; i++) 974 { 975 vec4F color = vec4F_from_color(&pParams.m_pPixels[i]); 976 color = vec4F_sub(&color, &meanColorScaled); 977 vec4F a = vec4F_mul(&color, color.m_c[0]); 978 vec4F b = vec4F_mul(&color, color.m_c[1]); 979 vec4F c = vec4F_mul(&color, color.m_c[2]); 980 vec4F d = vec4F_mul(&color, color.m_c[3]); 981 vec4F n = i ? axis : color; 982 vec4F_normalize_in_place(&n); 983 axis.m_c[0] += vec4F_dot(&a, &n); 984 axis.m_c[1] += vec4F_dot(&b, &n); 985 axis.m_c[2] += vec4F_dot(&c, &n); 986 axis.m_c[3] += vec4F_dot(&d, &n); 987 } 988 vec4F_normalize_in_place(&axis); 989 } 990 else 991 { 992 // Use covar technique for RGB PCA, because it doesn't require per-pixel normalization. 
993 float[6] cov = [ 0, 0, 0, 0, 0, 0 ]; 994 995 for (uint i = 0; i < pParams.m_num_pixels; i++) 996 { 997 const color_quad_u8 *pV = &pParams.m_pPixels[i]; 998 float r = pV.m_c[0] - meanColorScaled.m_c[0]; 999 float g = pV.m_c[1] - meanColorScaled.m_c[1]; 1000 float b = pV.m_c[2] - meanColorScaled.m_c[2]; 1001 cov[0] += r*r; cov[1] += r*g; cov[2] += r*b; cov[3] += g*g; cov[4] += g*b; cov[5] += b*b; 1002 } 1003 1004 float vfr = .9f, vfg = 1.0f, vfb = .7f; 1005 for (uint iter = 0; iter < 3; iter++) 1006 { 1007 float r = vfr*cov[0] + vfg*cov[1] + vfb*cov[2]; 1008 float g = vfr*cov[1] + vfg*cov[3] + vfb*cov[4]; 1009 float b = vfr*cov[2] + vfg*cov[4] + vfb*cov[5]; 1010 1011 float m = maximumf(maximumf(abs(r), abs(g)), abs(b)); 1012 if (m > 1e-10f) 1013 { 1014 m = 1.0f / m; 1015 r *= m; g *= m; b *= m; 1016 } 1017 1018 vfr = r; vfg = g; vfb = b; 1019 } 1020 1021 float len = vfr*vfr + vfg*vfg + vfb*vfb; 1022 if (len < 1e-10f) 1023 vec4F_set_scalar(&axis, 0.0f); 1024 else 1025 { 1026 len = 1.0f / sqrt(len); 1027 vfr *= len; vfg *= len; vfb *= len; 1028 vec4F_set(&axis, vfr, vfg, vfb, 0); 1029 } 1030 } 1031 1032 if (vec4F_dot(&axis, &axis) < .5f) 1033 { 1034 if (pParams.m_perceptual) 1035 vec4F_set(&axis, .213f, .715f, .072f, pParams.m_has_alpha ? .715f : 0); 1036 else 1037 vec4F_set(&axis, 1.0f, 1.0f, 1.0f, pParams.m_has_alpha ? 
1.0f : 0); 1038 vec4F_normalize_in_place(&axis); 1039 } 1040 1041 float l = 1e+9f, h = -1e+9f; 1042 1043 for (uint i = 0; i < pParams.m_num_pixels; i++) 1044 { 1045 vec4F color = vec4F_from_color(&pParams.m_pPixels[i]); 1046 1047 vec4F q = vec4F_sub(&color, &meanColorScaled); 1048 float d = vec4F_dot(&q, &axis); 1049 1050 l = minimumf(l, d); 1051 h = maximumf(h, d); 1052 } 1053 1054 l *= (1.0f / 255.0f); 1055 h *= (1.0f / 255.0f); 1056 1057 vec4F b0 = vec4F_mul(&axis, l); 1058 vec4F b1 = vec4F_mul(&axis, h); 1059 vec4F c0 = vec4F_add(&meanColor, &b0); 1060 vec4F c1 = vec4F_add(&meanColor, &b1); 1061 vec4F minColor = vec4F_saturate(&c0); 1062 vec4F maxColor = vec4F_saturate(&c1); 1063 1064 vec4F whiteVec; 1065 vec4F_set_scalar(&whiteVec, 1.0f); 1066 if (vec4F_dot(&minColor, &whiteVec) > vec4F_dot(&maxColor, &whiteVec)) 1067 { 1068 vec4F temp = minColor; 1069 minColor = maxColor; 1070 maxColor = temp; 1071 } 1072 // First find a solution using the block's PCA. 1073 if (!find_optimal_solution(mode, minColor, maxColor, pParams, pResults)) 1074 return 0; 1075 1076 if (pComp_params.m_try_least_squares) 1077 { 1078 // Now try to refine the solution using least squares by computing the optimal endpoints from the current selectors. 1079 vec4F xl, xh; 1080 vec4F_set_scalar(&xl, 0.0f); 1081 vec4F_set_scalar(&xh, 0.0f); 1082 if (pParams.m_has_alpha) 1083 compute_least_squares_endpoints_rgba(pParams.m_num_pixels, pResults.m_pSelectors, pParams.m_pSelector_weightsx, &xl, &xh, pParams.m_pPixels); 1084 else 1085 compute_least_squares_endpoints_rgb(pParams.m_num_pixels, pResults.m_pSelectors, pParams.m_pSelector_weightsx, &xl, &xh, pParams.m_pPixels); 1086 1087 xl = vec4F_mul(&xl, (1.0f / 255.0f)); 1088 xh = vec4F_mul(&xh, (1.0f / 255.0f)); 1089 1090 if (!find_optimal_solution(mode, xl, xh, pParams, pResults)) 1091 return 0; 1092 } 1093 1094 if (pComp_params.m_uber_level > 0) 1095 { 1096 // In uber level 1, try varying the selectors a little, somewhat like cluster fit would. 
First try incrementing the minimum selectors, 1097 // then try decrementing the selectrors, then try both. 1098 ubyte[16] selectors_temp, selectors_temp1; 1099 memcpy(selectors_temp.ptr, pResults.m_pSelectors, pParams.m_num_pixels); 1100 1101 const int max_selector = pParams.m_num_selector_weights - 1; 1102 1103 uint min_sel = 16; 1104 uint max_sel = 0; 1105 for (uint i = 0; i < pParams.m_num_pixels; i++) 1106 { 1107 uint sel = selectors_temp[i]; 1108 min_sel = minimumu(min_sel, sel); 1109 max_sel = maximumu(max_sel, sel); 1110 } 1111 1112 for (uint i = 0; i < pParams.m_num_pixels; i++) 1113 { 1114 uint sel = selectors_temp[i]; 1115 if ((sel == min_sel) && (sel < (pParams.m_num_selector_weights - 1))) 1116 sel++; 1117 selectors_temp1[i] = cast(ubyte)sel; 1118 } 1119 1120 vec4F xl, xh; 1121 vec4F_set_scalar(&xl, 0.0f); 1122 vec4F_set_scalar(&xh, 0.0f); 1123 if (pParams.m_has_alpha) 1124 compute_least_squares_endpoints_rgba(pParams.m_num_pixels, selectors_temp1.ptr, 1125 pParams.m_pSelector_weightsx, &xl, &xh, pParams.m_pPixels); 1126 else 1127 compute_least_squares_endpoints_rgb(pParams.m_num_pixels, selectors_temp1.ptr, 1128 pParams.m_pSelector_weightsx, &xl, &xh, pParams.m_pPixels); 1129 1130 xl = vec4F_mul(&xl, (1.0f / 255.0f)); 1131 xh = vec4F_mul(&xh, (1.0f / 255.0f)); 1132 1133 if (!find_optimal_solution(mode, xl, xh, pParams, pResults)) 1134 return 0; 1135 1136 for (uint i = 0; i < pParams.m_num_pixels; i++) 1137 { 1138 uint sel = selectors_temp[i]; 1139 if ((sel == max_sel) && (sel > 0)) 1140 sel--; 1141 selectors_temp1[i] = cast(ubyte)sel; 1142 } 1143 1144 if (pParams.m_has_alpha) 1145 compute_least_squares_endpoints_rgba(pParams.m_num_pixels, selectors_temp1.ptr, 1146 pParams.m_pSelector_weightsx, &xl, &xh, pParams.m_pPixels); 1147 else 1148 compute_least_squares_endpoints_rgb(pParams.m_num_pixels, selectors_temp1.ptr, 1149 pParams.m_pSelector_weightsx, &xl, &xh, pParams.m_pPixels); 1150 1151 xl = vec4F_mul(&xl, (1.0f / 255.0f)); 1152 xh = vec4F_mul(&xh, 
(1.0f / 255.0f)); 1153 1154 if (!find_optimal_solution(mode, xl, xh, pParams, pResults)) 1155 return 0; 1156 1157 for (uint i = 0; i < pParams.m_num_pixels; i++) 1158 { 1159 uint sel = selectors_temp[i]; 1160 if ((sel == min_sel) && (sel < (pParams.m_num_selector_weights - 1))) 1161 sel++; 1162 else if ((sel == max_sel) && (sel > 0)) 1163 sel--; 1164 selectors_temp1[i] = cast(ubyte)sel; 1165 } 1166 1167 if (pParams.m_has_alpha) 1168 compute_least_squares_endpoints_rgba(pParams.m_num_pixels, selectors_temp1.ptr, 1169 pParams.m_pSelector_weightsx, &xl, &xh, pParams.m_pPixels); 1170 else 1171 compute_least_squares_endpoints_rgb(pParams.m_num_pixels, selectors_temp1.ptr, 1172 pParams.m_pSelector_weightsx, &xl, &xh, pParams.m_pPixels); 1173 1174 xl = vec4F_mul(&xl, (1.0f / 255.0f)); 1175 xh = vec4F_mul(&xh, (1.0f / 255.0f)); 1176 1177 if (!find_optimal_solution(mode, xl, xh, pParams, pResults)) 1178 return 0; 1179 1180 // In uber levels 2+, try taking more advantage of endpoint extrapolation by scaling the selectors in one direction or another. 1181 const uint uber_err_thresh = (pParams.m_num_pixels * 56) >> 4; 1182 if ((pComp_params.m_uber_level >= 2) && (pResults.m_best_overall_err > uber_err_thresh)) 1183 { 1184 const int Q = (pComp_params.m_uber_level >= 4) ? 
(pComp_params.m_uber_level - 2) : 1; 1185 for (int ly = -Q; ly <= 1; ly++) 1186 { 1187 for (int hy = max_selector - 1; hy <= (max_selector + Q); hy++) 1188 { 1189 if ((ly == 0) && (hy == max_selector)) 1190 continue; 1191 1192 for (uint i = 0; i < pParams.m_num_pixels; i++) 1193 selectors_temp1[i] = cast(ubyte)clampf(floor(cast(float)max_selector * (cast(float)selectors_temp[i] - cast(float)ly) / (cast(float)hy - cast(float)ly) + .5f), 0, cast(float)max_selector); 1194 1195 //vec4F xl, xh; 1196 vec4F_set_scalar(&xl, 0.0f); 1197 vec4F_set_scalar(&xh, 0.0f); 1198 if (pParams.m_has_alpha) 1199 compute_least_squares_endpoints_rgba(pParams.m_num_pixels, selectors_temp1.ptr, pParams.m_pSelector_weightsx, &xl, &xh, pParams.m_pPixels); 1200 else 1201 compute_least_squares_endpoints_rgb(pParams.m_num_pixels, selectors_temp1.ptr, pParams.m_pSelector_weightsx, &xl, &xh, pParams.m_pPixels); 1202 1203 xl = vec4F_mul(&xl, (1.0f / 255.0f)); 1204 xh = vec4F_mul(&xh, (1.0f / 255.0f)); 1205 1206 if (!find_optimal_solution(mode, xl, xh, pParams, pResults)) 1207 return 0; 1208 } 1209 } 1210 } 1211 } 1212 1213 if (mode == 1) 1214 { 1215 // Try encoding the partition as a single color by using the optimal singe colors tables to encode the block to its mean. 
1216 color_cell_compressor_results avg_results = *pResults; 1217 const uint r = cast(int)(.5f + meanColor.m_c[0] * 255.0f), 1218 g = cast(int)(.5f + meanColor.m_c[1] * 255.0f), 1219 b = cast(int)(.5f + meanColor.m_c[2] * 255.0f); 1220 ulong avg_err = pack_mode1_to_one_color(pParams, &avg_results, r, g, b, pResults.m_pSelectors_temp); 1221 if (avg_err < pResults.m_best_overall_err) 1222 { 1223 *pResults = avg_results; 1224 memcpy(pResults.m_pSelectors, pResults.m_pSelectors_temp, (pResults.m_pSelectors[0]).sizeof * pParams.m_num_pixels); 1225 pResults.m_best_overall_err = avg_err; 1226 } 1227 } 1228 1229 return pResults.m_best_overall_err; 1230 } 1231 1232 ulong color_cell_compression_est(uint num_pixels, const color_quad_u8 *pPixels, bc7enc16_bool perceptual, uint* pweights/*[4]*/, ulong best_err_so_far) @system 1233 { 1234 // Find RGB bounds as an approximation of the block's principle axis 1235 uint lr = 255, lg = 255, lb = 255; 1236 uint hr = 0, hg = 0, hb = 0; 1237 for (uint i = 0; i < num_pixels; i++) 1238 { 1239 const color_quad_u8 *pC = &pPixels[i]; 1240 if (pC.m_c[0] < lr) lr = pC.m_c[0]; 1241 if (pC.m_c[1] < lg) lg = pC.m_c[1]; 1242 if (pC.m_c[2] < lb) lb = pC.m_c[2]; 1243 if (pC.m_c[0] > hr) hr = pC.m_c[0]; 1244 if (pC.m_c[1] > hg) hg = pC.m_c[1]; 1245 if (pC.m_c[2] > hb) hb = pC.m_c[2]; 1246 } 1247 1248 color_quad_u8 lowColor; color_quad_u8_set(&lowColor, lr, lg, lb, 0); 1249 color_quad_u8 highColor; color_quad_u8_set(&highColor, hr, hg, hb, 0); 1250 1251 // Place endpoints at bbox diagonals and compute interpolated colors 1252 const uint N = 8; 1253 color_quad_u8[8] weightedColors; 1254 1255 weightedColors[0] = lowColor; 1256 weightedColors[N - 1] = highColor; 1257 for (uint i = 1; i < (N - 1); i++) 1258 { 1259 weightedColors[i].m_c[0] = cast(ubyte)((lowColor.m_c[0] * (64 - g_bc7_weights3[i]) + highColor.m_c[0] * g_bc7_weights3[i] + 32) >> 6); 1260 weightedColors[i].m_c[1] = cast(ubyte)((lowColor.m_c[1] * (64 - g_bc7_weights3[i]) + highColor.m_c[1] * 
g_bc7_weights3[i] + 32) >> 6); 1261 weightedColors[i].m_c[2] = cast(ubyte)((lowColor.m_c[2] * (64 - g_bc7_weights3[i]) + highColor.m_c[2] * g_bc7_weights3[i] + 32) >> 6); 1262 } 1263 1264 // Compute dots and thresholds 1265 const int ar = highColor.m_c[0] - lowColor.m_c[0]; 1266 const int ag = highColor.m_c[1] - lowColor.m_c[1]; 1267 const int ab = highColor.m_c[2] - lowColor.m_c[2]; 1268 1269 int[8] dots; 1270 for (uint i = 0; i < N; i++) 1271 dots[i] = weightedColors[i].m_c[0] * ar + weightedColors[i].m_c[1] * ag + weightedColors[i].m_c[2] * ab; 1272 1273 int[8 - 1] thresh; 1274 for (uint i = 0; i < (N - 1); i++) 1275 thresh[i] = (dots[i] + dots[i + 1] + 1) >> 1; 1276 1277 ulong total_err = 0; 1278 if (perceptual) 1279 { 1280 // Transform block's interpolated colors to YCbCr 1281 int[8] l1, cr1, cb1; 1282 for (int j = 0; j < 8; j++) 1283 { 1284 const color_quad_u8 *pE1 = &weightedColors[j]; 1285 l1[j] = pE1.m_c[0] * 109 + pE1.m_c[1] * 366 + pE1.m_c[2] * 37; 1286 cr1[j] = (cast(int)pE1.m_c[0] << 9) - l1[j]; 1287 cb1[j] = (cast(int)pE1.m_c[2] << 9) - l1[j]; 1288 } 1289 1290 for (uint i = 0; i < num_pixels; i++) 1291 { 1292 const color_quad_u8 *pC = &pPixels[i]; 1293 1294 int d = ar * pC.m_c[0] + ag * pC.m_c[1] + ab * pC.m_c[2]; 1295 1296 // Find approximate selector 1297 uint s = 0; 1298 if (d >= thresh[6]) 1299 s = 7; 1300 else if (d >= thresh[5]) 1301 s = 6; 1302 else if (d >= thresh[4]) 1303 s = 5; 1304 else if (d >= thresh[3]) 1305 s = 4; 1306 else if (d >= thresh[2]) 1307 s = 3; 1308 else if (d >= thresh[1]) 1309 s = 2; 1310 else if (d >= thresh[0]) 1311 s = 1; 1312 1313 // Compute error 1314 const int l2 = pC.m_c[0] * 109 + pC.m_c[1] * 366 + pC.m_c[2] * 37; 1315 const int cr2 = (cast(int)pC.m_c[0] << 9) - l2; 1316 const int cb2 = (cast(int)pC.m_c[2] << 9) - l2; 1317 1318 const int dl = (l1[s] - l2) >> 8; 1319 const int dcr = (cr1[s] - cr2) >> 8; 1320 const int dcb = (cb1[s] - cb2) >> 8; 1321 1322 int ie = (pweights[0] * dl * dl) + (pweights[1] * dcr * dcr) + 
(pweights[2] * dcb * dcb); 1323 1324 total_err += ie; 1325 if (total_err > best_err_so_far) 1326 break; 1327 } 1328 } 1329 else 1330 { 1331 for (uint i = 0; i < num_pixels; i++) 1332 { 1333 const color_quad_u8 *pC = &pPixels[i]; 1334 1335 int d = ar * pC.m_c[0] + ag * pC.m_c[1] + ab * pC.m_c[2]; 1336 1337 // Find approximate selector 1338 uint s = 0; 1339 if (d >= thresh[6]) 1340 s = 7; 1341 else if (d >= thresh[5]) 1342 s = 6; 1343 else if (d >= thresh[4]) 1344 s = 5; 1345 else if (d >= thresh[3]) 1346 s = 4; 1347 else if (d >= thresh[2]) 1348 s = 3; 1349 else if (d >= thresh[1]) 1350 s = 2; 1351 else if (d >= thresh[0]) 1352 s = 1; 1353 1354 // Compute error 1355 const color_quad_u8 *pE1 = &weightedColors[s]; 1356 1357 int dr = cast(int)pE1.m_c[0] - cast(int)pC.m_c[0]; 1358 int dg = cast(int)pE1.m_c[1] - cast(int)pC.m_c[1]; 1359 int db = cast(int)pE1.m_c[2] - cast(int)pC.m_c[2]; 1360 1361 total_err += pweights[0] * (dr * dr) + pweights[1] * (dg * dg) + pweights[2] * (db * db); 1362 if (total_err > best_err_so_far) 1363 break; 1364 } 1365 } 1366 1367 return total_err; 1368 } 1369 1370 // This table contains bitmasks indicating which "key" partitions must be best ranked before this partition is worth evaluating. 1371 // We first rank the best/most used 14 partitions (sorted by usefulness), record the best one found as the key partition, then use 1372 // that to control the other partitions to evaluate. The quality loss is ~.08 dB RGB PSNR, the perf gain is up to ~11% (at uber level 0). 
static immutable uint[35] g_partition_predictors =
[
    uint.max,
    uint.max,
    uint.max,
    uint.max,
    uint.max,
    (1 << 1) | (1 << 2) | (1 << 8),
    (1 << 1) | (1 << 3) | (1 << 7),
    uint.max,
    uint.max,
    (1 << 2) | (1 << 8) | (1 << 16),
    (1 << 7) | (1 << 3) | (1 << 15),
    uint.max,
    (1 << 8) | (1 << 14) | (1 << 16),
    (1 << 7) | (1 << 14) | (1 << 15),
    uint.max,
    uint.max,
    uint.max,
    uint.max,
    (1 << 14) | (1 << 15),
    (1 << 16) | (1 << 22) | (1 << 14),
    (1 << 17) | (1 << 24) | (1 << 14),
    (1 << 2) | (1 << 14) | (1 << 15) | (1 << 1),
    uint.max,
    (1 << 1) | (1 << 3) | (1 << 14) | (1 << 16) | (1 << 22),
    uint.max,
    (1 << 1) | (1 << 2) | (1 << 15) | (1 << 17) | (1 << 24),
    (1 << 1) | (1 << 3) | (1 << 22),
    uint.max,
    uint.max,
    uint.max,
    (1 << 14) | (1 << 15) | (1 << 16) | (1 << 17),
    uint.max,
    uint.max,
    (1 << 1) | (1 << 2) | (1 << 3) | (1 << 27) | (1 << 4) | (1 << 24),
    (1 << 14) | (1 << 15) | (1 << 16) | (1 << 11) | (1 << 17) | (1 << 27)
];

// Estimate the partition used by mode 1. This scans through each partition and computes an approximate error for each.
//   pPixels      - the 16 pixels of the block.
//   pComp_params - supplies the partition budget, the filterbank switch and the perceptual flag.
//   pweights     - per-channel error weights forwarded to color_cell_compression_est.
// Returns the index of the partition with the lowest estimated error (0 when at most one partition is allowed).
uint estimate_partition(const(color_quad_u8)* pPixels,
                        const(bc7enc16_compress_block_params)* pComp_params,
                        uint* pweights/*[4]*/) @system
{
    const uint total_partitions = minimumu(pComp_params.m_max_partitions_mode1, BC7ENC16_MAX_PARTITIONS1);
    if (total_partitions <= 1)
        return 0;

    ulong best_err = ulong.max;
    uint best_partition = 0;

    // Partition order sorted by usage frequency across a large test corpus. Pattern 34 (checkerboard) must appear in slot 34.
    // Using a sorted order allows the user to decrease the # of partitions to scan with minimal loss in quality.
    static immutable ubyte[64] s_sorted_partition_order =
    [
        1 - 1, 14 - 1, 2 - 1, 3 - 1, 16 - 1, 15 - 1, 11 - 1, 17 - 1,
        4 - 1, 24 - 1, 27 - 1, 7 - 1, 8 - 1, 22 - 1, 20 - 1, 30 - 1,
        9 - 1, 5 - 1, 10 - 1, 21 - 1, 6 - 1, 32 - 1, 23 - 1, 18 - 1,
        19 - 1, 12 - 1, 13 - 1, 31 - 1, 25 - 1, 26 - 1, 29 - 1, 28 - 1,
        33 - 1, 34 - 1, 35 - 1, 46 - 1, 47 - 1, 52 - 1, 50 - 1, 51 - 1,
        49 - 1, 39 - 1, 40 - 1, 38 - 1, 54 - 1, 53 - 1, 55 - 1, 37 - 1,
        58 - 1, 59 - 1, 56 - 1, 42 - 1, 41 - 1, 43 - 1, 44 - 1, 60 - 1,
        45 - 1, 57 - 1, 48 - 1, 36 - 1, 61 - 1, 64 - 1, 63 - 1, 62 - 1
    ];

    assert(s_sorted_partition_order[34] == 34);

    // Best partition among the first 14 scanned; selects which of slots 14..34 are worth trying.
    int best_key_partition = 0;

    // Scanning also stops as soon as a partition with zero estimated error is found.
    for (uint partition_iter = 0; (partition_iter < total_partitions) && (best_err > 0); partition_iter++)
    {
        const uint partition = s_sorted_partition_order[partition_iter];

        // Check to see if we should bother evaluating this partition at all, depending on the best partition found from the first 14.
        if (pComp_params.m_mode1_partition_estimation_filterbank)
        {
            if ((partition_iter >= 14) && (partition_iter <= 34))
            {
                // The predictor masks use 1-based partition numbers, hence the +1.
                const uint best_key_partition_bitmask = 1 << (best_key_partition + 1);
                if ((g_partition_predictors[partition] & best_key_partition_bitmask) == 0)
                {
                    // Skipping the checkerboard slot ends the scan, like a losing checkerboard does below.
                    if (partition_iter == 34)
                        break;

                    continue;
                }
            }
        }

        const ubyte *pPartition = &g_bc7_partition2[partition * 16];

        // Split the 16 pixels into the two subsets defined by this partition pattern.
        color_quad_u8[16][2] subset_colors;
        uint[2] subset_total_colors = [ 0, 0 ];
        for (uint index = 0; index < 16; index++)
            subset_colors[pPartition[index]][subset_total_colors[pPartition[index]]++] = pPixels[index];

        // Sum the per-subset estimates, giving up once the running total reaches the best error so far.
        ulong total_subset_err = 0;
        for (uint subset = 0; (subset < 2) && (total_subset_err < best_err); subset++)
            total_subset_err += color_cell_compression_est(subset_total_colors[subset], &subset_colors[subset][0], pComp_params.m_perceptual, pweights, best_err);

        if (total_subset_err < best_err)
        {
            best_err = total_subset_err;
            best_partition = partition;
        }

        // If the checkerboard pattern doesn't get the highest ranking vs. the previous (lower frequency) patterns, then just stop now because statistically the subsequent patterns won't do well either.
        if ((partition == 34) && (best_partition != 34))
            break;

        if (partition_iter == 13)
            best_key_partition = best_partition;

    } // partition

    return best_partition;
}

// Appends the low num_bits bits of val to the bit stream in pBytes, least significant bit first,
// starting at bit offset *pCur_ofs, and advances *pCur_ofs by num_bits.
// Bits are OR-ed in, so the destination bytes must already be zero where new bits land.
// val must fit in num_bits bits and the stream must not grow past 128 bits (both asserted).
void set_block_bits(ubyte *pBytes, uint val, uint num_bits, uint *pCur_ofs) @system
{
    assert((num_bits <= 32) && (val < (1UL << num_bits)));
    while (num_bits)
    {
        // Number of bits that still fit in the current byte.
        const uint n = minimumu(8 - (*pCur_ofs & 7), num_bits);
        pBytes[*pCur_ofs >> 3] |= cast(ubyte)(val << (*pCur_ofs & 7));
        val >>= n;
        num_bits -= n;
        *pCur_ofs += n;
    }
    assert(*pCur_ofs <= 128);
}

// Everything encode_bc7_block needs to serialize one block.
struct bc7_optimization_results
{
    uint m_mode;                // BC7 mode (1 or 6 in this encoder)
    uint m_partition;           // partition pattern index (written only for modes with partition bits)
    ubyte[16] m_selectors;      // one selector per pixel
    color_quad_u8[2] m_low;     // low endpoint per subset
    color_quad_u8[2] m_high;    // high endpoint per subset
    uint[2][2] m_pbits;         // [subset][endpoint] p-bits; only [subset][0] is written for shared-p-bit modes
}

// Serializes pResults into the 16-byte BC7 block at pBlock.
// Works on local copies of selectors/endpoints/p-bits: when a subset's anchor selector has its
// top bit set, that subset's selectors are inverted and its endpoints (plus per-endpoint p-bits
// when they are not shared) are swapped, so the anchor's top bit is zero and can be omitted.
static void encode_bc7_block(void *pBlock, const(bc7_optimization_results)* pResults) @system
{
    const uint best_mode = pResults.m_mode;
    const uint total_subsets = g_bc7_num_subsets[best_mode];
    const uint total_partitions = 1 << g_bc7_partition_bits[best_mode];
    const ubyte *pPartition = (total_subsets == 2) ? &g_bc7_partition2[pResults.m_partition * 16] : &g_bc7_partition1[0];

    ubyte[16] color_selectors;
    memcpy(color_selectors.ptr, pResults.m_selectors.ptr, 16);

    color_quad_u8[2] low, high;
    memcpy(low.ptr, pResults.m_low.ptr, low.sizeof);
    memcpy(high.ptr, pResults.m_high.ptr, high.sizeof);

    uint[2][2] pbits;
    static assert(pbits.sizeof == 16);
    memcpy(pbits.ptr, pResults.m_pbits.ptr, pbits.sizeof);

    // -1 never matches a pixel index, so unused subsets have no anchor.
    int[2] anchor = [ -1, -1 ];

    for (uint k = 0; k < total_subsets; k++)
    {
        const uint anchor_index = k ? g_bc7_table_anchor_index_second_subset[pResults.m_partition] : 0;
        anchor[k] = anchor_index;

        const uint color_index_bits = get_bc7_color_index_size(best_mode, 0);
        const uint num_color_indices = 1 << color_index_bits;

        // Top selector bit set on the anchor: flip this subset.
        if (color_selectors[anchor_index] & (num_color_indices >> 1))
        {
            for (uint i = 0; i < 16; i++)
                if (pPartition[i] == k)
                    color_selectors[i] = cast(ubyte)((num_color_indices - 1) - color_selectors[i]);

            color_quad_u8 tmp = low[k];
            low[k] = high[k];
            high[k] = tmp;

            if (!g_bc7_mode_has_shared_p_bits[best_mode])
            {
                uint t = pbits[k][0];
                pbits[k][0] = pbits[k][1];
                pbits[k][1] = t;
            }
        }
    }

    ubyte *pBlock_bytes = cast(ubyte *)(pBlock);
    memset(pBlock_bytes, 0, BC7ENC16_BLOCK_SIZE);

    uint cur_bit_ofs = 0;
    // Mode prefix: best_mode zero bits followed by a single one bit.
    set_block_bits(pBlock_bytes, 1 << best_mode, best_mode + 1, &cur_bit_ofs);

    if (total_partitions > 1)
        set_block_bits(pBlock_bytes, pResults.m_partition, 6, &cur_bit_ofs);

    // Endpoints are stored component-major: every subset's low/high for R, then G, and so on.
    const uint total_comps = (best_mode >= 4) ? 4 : 3;
    for (uint comp = 0; comp < total_comps; comp++)
    {
        for (uint subset = 0; subset < total_subsets; subset++)
        {
            set_block_bits(pBlock_bytes, low[subset].m_c[comp], (comp == 3) ? g_bc7_alpha_precision_table[best_mode] : g_bc7_color_precision_table[best_mode], &cur_bit_ofs);
            set_block_bits(pBlock_bytes, high[subset].m_c[comp], (comp == 3) ? g_bc7_alpha_precision_table[best_mode] : g_bc7_color_precision_table[best_mode], &cur_bit_ofs);
        }
    }

    for (uint subset = 0; subset < total_subsets; subset++)
    {
        set_block_bits(pBlock_bytes, pbits[subset][0], 1, &cur_bit_ofs);
        if (!g_bc7_mode_has_shared_p_bits[best_mode])
            set_block_bits(pBlock_bytes, pbits[subset][1], 1, &cur_bit_ofs);
    }

    for (int idx = 0; idx < 16; idx++)
    {
        uint n = get_bc7_color_index_size(best_mode, 0);
        // Anchor selectors are stored with one bit less.
        if ((idx == anchor[0]) || (idx == anchor[1]))
            n--;
        set_block_bits(pBlock_bytes, color_selectors[idx], n, &cur_bit_ofs);
    }

    assert(cur_bit_ofs == 128);
}

// Encodes a block that contains alpha. Only mode 6 is tried: pParams is configured for
// 16 selector weights, 7-bit components, one p-bit per endpoint and RGBA error, the whole
// 16-pixel block is compressed as one cell, and the result is written to pBlock.
void handle_alpha_block(void *pBlock, const(color_quad_u8)* pPixels,
                        const(bc7enc16_compress_block_params)* pComp_params,
                        color_cell_compressor_params *pParams) @system
{
    color_cell_compressor_results results6;

    pParams.m_pSelector_weights = g_bc7_weights4.ptr;
    pParams.m_pSelector_weightsx = cast(const(vec4F)*) g_bc7_weights4x.ptr;
    pParams.m_num_selector_weights = 16;
    pParams.m_comp_bits = 7;
    pParams.m_has_pbits = BC7ENC16_TRUE;
    // Set explicitly, exactly as handle_opaque_block does for mode 6, instead of relying on
    // whatever value the caller left in the struct.
    pParams.m_endpoints_share_pbit = BC7ENC16_FALSE;
    pParams.m_has_alpha = BC7ENC16_TRUE;
    pParams.m_perceptual = pComp_params.m_perceptual;
    pParams.m_num_pixels = 16;
    pParams.m_pPixels = pPixels;

    // Selectors are written straight into the structure handed to the encoder.
    bc7_optimization_results opt_results;
    results6.m_pSelectors = opt_results.m_selectors.ptr;

    ubyte[16] selectors_temp;
    results6.m_pSelectors_temp = selectors_temp.ptr;

    color_cell_compression(6, pParams, &results6, pComp_params);

    opt_results.m_mode = 6;
    opt_results.m_partition = 0;
    opt_results.m_low[0] = results6.m_low_endpoint;
    opt_results.m_high[0] = results6.m_high_endpoint;
    opt_results.m_pbits[0][0] = results6.m_pbits[0];
    opt_results.m_pbits[0][1] = results6.m_pbits[1];

    encode_bc7_block(pBlock, &opt_results);
}

// Encodes a fully opaque block. Mode 6 is always evaluated first; unless it is already
// error free (or mode 1 is disabled through m_max_partitions_mode1 == 0), mode 1 is tried
// with the partition chosen by estimate_partition and kept only if its total error is lower.
static void handle_opaque_block(void *pBlock,
                                const(color_quad_u8)* pPixels,
                                const(bc7enc16_compress_block_params)* pComp_params,
                                color_cell_compressor_params *pParams) @system
{
    ubyte[16] selectors_temp;

    // Mode 6
    bc7_optimization_results opt_results;

    pParams.m_pSelector_weights = g_bc7_weights4.ptr;
    pParams.m_pSelector_weightsx = cast(const(vec4F)*) g_bc7_weights4x.ptr;
    pParams.m_num_selector_weights = 16;
    pParams.m_comp_bits = 7;
    pParams.m_has_pbits = BC7ENC16_TRUE;
    pParams.m_endpoints_share_pbit = BC7ENC16_FALSE;
    pParams.m_perceptual = pComp_params.m_perceptual;
    pParams.m_num_pixels = 16;
    pParams.m_pPixels = pPixels;
    pParams.m_has_alpha = BC7ENC16_FALSE;

    color_cell_compressor_results results6;
    results6.m_pSelectors = opt_results.m_selectors.ptr;
    results6.m_pSelectors_temp = selectors_temp.ptr;

    ulong best_err = color_cell_compression(6, pParams, &results6, pComp_params);

    opt_results.m_mode = 6;
    opt_results.m_partition = 0;
    opt_results.m_low[0] = results6.m_low_endpoint;
    opt_results.m_high[0] = results6.m_high_endpoint;
    opt_results.m_pbits[0][0] = results6.m_pbits[0];
    opt_results.m_pbits[0][1] = results6.m_pbits[1];

    // Mode 1
    if ((best_err > 0) && (pComp_params.m_max_partitions_mode1 > 0))
    {
        const uint trial_partition = estimate_partition(pPixels, pComp_params, pParams.m_weights.ptr);
        pParams.m_pSelector_weights = g_bc7_weights3.ptr;
        pParams.m_pSelector_weightsx = cast(const(vec4F)*) g_bc7_weights3x.ptr;
        pParams.m_num_selector_weights = 8;
        pParams.m_comp_bits = 6;
        pParams.m_has_pbits = BC7ENC16_TRUE;
        pParams.m_endpoints_share_pbit = BC7ENC16_TRUE;

        const ubyte *pPartition = &g_bc7_partition2[trial_partition * 16];

        color_quad_u8[16][2] subset_colors;

        uint[2] subset_total_colors1 = [ 0, 0 ];

        ubyte[16][2] subset_pixel_index1;
        ubyte[16][2] subset_selectors1;
        color_cell_compressor_results[2] subset_results1;

        // Gather each subset's pixels and remember which block position each one came from.
        for (uint idx = 0; idx < 16; idx++)
        {
            const uint p = pPartition[idx];
            subset_colors[p][subset_total_colors1[p]] = pPixels[idx];
            subset_pixel_index1[p][subset_total_colors1[p]] = cast(ubyte)idx;
            subset_total_colors1[p]++;
        }

        ulong trial_err = 0;
        for (uint subset = 0; subset < 2; subset++)
        {
            pParams.m_num_pixels = subset_total_colors1[subset];
            pParams.m_pPixels = &subset_colors[subset][0];

            color_cell_compressor_results *pResults = &subset_results1[subset];
            pResults.m_pSelectors = &subset_selectors1[subset][0];
            pResults.m_pSelectors_temp = selectors_temp.ptr;
            ulong err = color_cell_compression(1, pParams, pResults, pComp_params);
            trial_err += err;
            // Already worse than mode 6: no need to compress the other subset.
            if (trial_err > best_err)
                break;

        } // subset

        if (trial_err < best_err)
        {
            best_err = trial_err;
            opt_results.m_mode = 1;
            opt_results.m_partition = trial_partition;
            for (uint subset = 0; subset < 2; subset++)
            {
                // Scatter the subset's selectors back to their block positions.
                for (uint i = 0; i < subset_total_colors1[subset]; i++)
                    opt_results.m_selectors[subset_pixel_index1[subset][i]] = subset_selectors1[subset][i];
                opt_results.m_low[subset] = subset_results1[subset].m_low_endpoint;
                opt_results.m_high[subset] = subset_results1[subset].m_high_endpoint;
                // Only one p-bit per subset is recorded for mode 1.
                opt_results.m_pbits[subset][0] = subset_results1[subset].m_pbits[0];
            }
        }
    }

    encode_bc7_block(pBlock, &opt_results);
}

// Packs a single 4x4 block of RGBA pixels (16 pixels, R first in memory) to 128-bit BC7 block pBlock, using either mode 1 and/or 6.
// Alpha blocks will always use mode 6, and by default opaque blocks will use either modes 1 or 6.
// Returns BC7ENC16_TRUE if the block had any pixels with alpha < 255, otherwise it returns BC7ENC16_FALSE. (This is not an error code - a block is always encoded.)
bc7enc16_bool bc7enc16_compress_block(void *pBlock,
                                      const(void)* pPixelsRGBA,
                                      const(bc7enc16_compress_block_params)* pComp_params) @system
{
    // Entry point for one block: reads 16 RGBA pixels from pPixelsRGBA, writes the encoded
    // block to pBlock and returns BC7ENC16_TRUE when any pixel has alpha < 255
    // (BC7ENC16_FALSE otherwise).

    // NOTE(review): presumably guards against being called before the mode 1 lookup tables
    // were initialized - confirm against the table setup code.
    assert(g_bc7_mode_1_optimal_endpoints[255][0].m_hi != 0);

    const color_quad_u8 *pPixels = cast(const color_quad_u8 *)(pPixelsRGBA);

    color_cell_compressor_params params;
    if (pComp_params.m_perceptual)
    {
        // Perceptual mode: scale the user weights by 4 and additionally scale the two chroma
        // weights by the squared Pb/Pr factors.
        // https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.709_conversion
        const float pr_weight = (.5f / (1.0f - .2126f)) * (.5f / (1.0f - .2126f));
        const float pb_weight = (.5f / (1.0f - .0722f)) * (.5f / (1.0f - .0722f));
        params.m_weights[0] = cast(int)(pComp_params.m_weights[0] * 4.0f);
        params.m_weights[1] = cast(int)(pComp_params.m_weights[1] * 4.0f * pr_weight);
        params.m_weights[2] = cast(int)(pComp_params.m_weights[2] * 4.0f * pb_weight);
        params.m_weights[3] = pComp_params.m_weights[3] * 4;
    }
    else
    {
        // Linear mode: the user weights are used unchanged.
        memcpy(params.m_weights.ptr, pComp_params.m_weights.ptr, (params.m_weights).sizeof);
    }

    // The first pixel that is not fully opaque sends the whole block down the alpha path.
    for (uint i = 0; i < 16; i++)
    {
        if (pPixels[i].m_c[3] < 255)
        {
            handle_alpha_block(pBlock, pPixels, pComp_params, &params);
            return BC7ENC16_TRUE;
        }
    }

    handle_opaque_block(pBlock, pPixels, pComp_params, &params);
    return BC7ENC16_FALSE;
}

/*
------------------------------------------------------------------------------
This software is available under 2 licenses -- choose whichever you prefer.
------------------------------------------------------------------------------
ALTERNATIVE A - MIT License
Copyright(c) 2018 Richard Geldreich, Jr.
Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files(the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and / or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions :
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
------------------------------------------------------------------------------
ALTERNATIVE B - Public Domain(www.unlicense.org)
This is free and unencumbered software released into the public domain.
Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
software, either in source code form or as a compiled binary, for any purpose,
commercial or non - commercial, and by any means.
In jurisdictions that recognize copyright laws, the author or authors of this
software dedicate any and all copyright interest in the software to the public
domain.We make this dedication for the benefit of the public at large and to
the detriment of our heirs and successors.We intend this dedication to be an
overt act of relinquishment in perpetuity of all present and future rights to
this software under copyright law.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE
AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
------------------------------------------------------------------------------
*/