1 module gamut.codecs.qoi10b; 2 3 nothrow @nogc: 4 5 import core.stdc.stdlib: realloc, malloc, free; 6 import core.stdc.string: memset; 7 8 import gamut.codecs.qoi2avg; 9 import inteli.emmintrin; 10 11 /// A QOI-inspired codec for 10-bit images, called "QOI-10b". 12 /// Input image is 16-bit ushort, but only 10-bits gets encoded making it lossy. 13 /// 14 /// 15 /// Incompatible adaptation of QOI format - https://phoboslab.org 16 /// 17 /// -- LICENSE: The MIT License(MIT) 18 /// Copyright(c) 2021 Dominic Szablewski (original QOI format) 19 /// Copyright(c) 2022 Guillaume Piolat (QOI-10b variant for 10b images, 1/2/3/4 channels). 20 /// Permission is hereby granted, free of charge, to any person obtaining a copy of 21 /// this software and associated documentation files(the "Software"), to deal in 22 /// the Software without restriction, including without limitation the rights to 23 /// use, copy, modify, merge, publish, distribute, sublicense, and / or sell copies 24 /// of the Software, and to permit persons to whom the Software is furnished to do 25 /// so, subject to the following conditions : 26 /// The above copyright notice and this permission notice shall be included in all 27 /// copies or substantial portions of the Software. 28 /// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 29 /// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 30 /// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE 31 /// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 32 /// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 33 /// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 34 /// SOFTWARE. 35 36 /// -- Documentation 37 38 /// This library provides the following functions; 39 /// - qoi10b_decode -- decode the raw bytes of a QOI-10b image from memory 40 /// - qoi10b_encode -- encode an rgba buffer into a QOI-10b image in memory 41 /// 42 /// 43 /// A QOI-10b file has a 25 byte header, compatible with Gamut QOIX. 44 /// 45 /// struct qoix_header_t { 46 /// char magic[4]; // magic bytes "qoix" 47 /// uint32_t width; // image width in pixels (BE) 48 /// uint32_t height; // image height in pixels (BE) 49 /// uint8_t version_; // Major version of QOIX format. 50 /// uint8_t channels; // 1, 2, 3 or 4 51 /// uint8_t bitdepth; // 10 = this QOI-10b codec is always 10-bit (8 would indicate QOI2AVG or QOI-plane) 52 /// uint8_t colorspace; // 0 = sRGB with linear alpha, 1 = all channels linear 53 /// uint8_t compression; // 0 = none, 1 = LZ4 54 /// float pixelAspectRatio; // -1 = unknown, else Pixel Aspect Ratio 55 /// float resolutionX; // -1 = unknown, else physical resolution in DPI 56 /// }; 57 /// 58 /// The decoder and encoder start with {r: 0; g: 0; b: 0; a: 0} as the previous 59 /// pixel value. Pixels are either encoded as 60 /// - a run of the previous pixel 61 /// - a difference to the previous pixel value 62 /// - full luminance value 63 /// 64 /// The byte stream's end is marked with 5 0xff bytes. 65 /// 66 /// This codec is simply like QOI2AVG but with extra 2 bits for each components, and it breaks byte alignment. 67 /// 68 /// Optimized? Opcode Bits(RGB) Bits(grey) Meaning 69 /// [ ] QOI_OP_LUMA 14 6 0ggggg[rrrrbbbb] (less g or more g => doesn't work) 70 /// [ ] QOI_OP_LUMA0 12 6 10gggg[rrrbbb] 71 /// [ ] QOI_OP_LUMA2 22 10 110ggggggg[rrrrrrbbbbbb] 72 /// [ ] QOI_OP_LUMA3 30 14 11100ggggggggg[rrrrrrrrbbbbbbbb] 73 /// [ ] QOI_OP_ADIFF 10 10 11101xxxxx 74 /// [x] QOI_OP_RUN 8 8 11110xxx 75 /// 16 16 11110111xxxxxxxx 76 /// [ ] QOI_OP_ADIFF2 16 16 111110xxxxxxxx 77 /// [x] QOI_OP_GRAY 18 18 11111100gggggggggg 78 /// [ ] QOI_OP_RGB 38 18 11111101rrrrrrrrrr[ggggggggggbbbbbbbbbb] 79 /// [ ] QOI_OP_RGBA 48 28 11111110rrrrrrrrrr[ggggggggggbbbbbbbbbb]aaaaaaaaaa 80 /// [ ] QOI_OP_END 8 8 11111111 81 82 // Note: LOCO-I predictor doesn't really work here, it slowdown quite a bit for 0.5% bitrate improvement. 83 // Not sure what happens here. 84 85 enum ubyte QOI_OP_ADIFF2 = 0xf8; 86 87 enum int WORST_OPCODE_BITS = 48; 88 89 enum enableAveragePrediction = true; 90 91 enum bool newpred = false;//true; 92 93 static immutable ubyte[5] qoi10b_padding = [255,255,255,255,255]; 94 95 enum qoi10_rgba_t initialPredictor = { r:0, g:0, b:0, a:1023 }; 96 97 struct qoi10_rgba_t 98 { 99 // 0 to 1023 values 100 ushort r; 101 ushort g; 102 ushort b; 103 ushort a; 104 } 105 106 uint QOI10b_COLOR_HASH(qoi10_rgba_t C) 107 { 108 long colorAsLong = *(cast(long*)&C); 109 return (((colorAsLong * 2654435769) >> 22) & 1023); 110 } 111 112 /* Encode raw RGBA16 pixels into a QOI-10b image in memory. 113 This immediately loosen precision from 16-bit t 10-bit, so this is lossy. 114 The function either returns null on failure (invalid parameters or malloc 115 failed) or a pointer to the encoded data on success. On success the out_len 116 is set to the size in bytes of the encoded data. 117 The returned qoi data should be free()d after use. */ 118 ubyte* qoi10b_encode(const(ubyte)* data, const(qoi_desc)* desc, int *out_len) 119 { 120 if ( (desc.channels != 1 && desc.channels != 2 && desc.channels != 3 && desc.channels != 4) || 121 desc.height >= QOIX_PIXELS_MAX / desc.width || desc.compression != QOIX_COMPRESSION_NONE 122 ) { 123 return null; 124 } 125 126 if (desc.bitdepth != 10) 127 return null; 128 129 int channels = desc.channels; 130 131 // At worst, each pixel take 38 bit to be encoded. 132 int num_pixels = desc.width * desc.height; 133 134 int scanline_size = cast(int)(qoi10_rgba_t.sizeof) * desc.width; 135 136 int max_size = cast(int) (cast(long)num_pixels * WORST_OPCODE_BITS + 7) / 8 + QOIX_HEADER_SIZE + cast(int)(qoi10b_padding.sizeof); 137 138 ubyte* stream; 139 140 int p = 0; // write index into output stream 141 ubyte* bytes = cast(ubyte*) QOI_MALLOC(max_size + 2 * scanline_size); 142 if (!bytes) 143 { 144 return null; 145 } 146 147 qoi_write_32(bytes, &p, QOIX_MAGIC); 148 qoi_write_32(bytes, &p, desc.width); 149 qoi_write_32(bytes, &p, desc.height); 150 bytes[p++] = 1; // Put a version number :) 151 bytes[p++] = desc.channels; // 1, 2, 3 or 4 152 bytes[p++] = desc.bitdepth; // 10 153 bytes[p++] = desc.colorspace; 154 bytes[p++] = QOIX_COMPRESSION_NONE; 155 qoi_write_32f(bytes, &p, desc.pixelAspectRatio); 156 qoi_write_32f(bytes, &p, desc.resolutionY); 157 158 int currentBit = 7; // beginning of a byte 159 bytes[p] = 0; 160 161 // write the nbits last bits of x, starting from the highest one 162 void outputBits(uint x, int nbits) nothrow @nogc 163 { 164 assert(nbits >= 2 && nbits <= 16); 165 assert( (nbits % 2) == 0); 166 167 for (int b = nbits - 2; b >= 0; b -= 2) 168 { 169 // which bit to write 170 ubyte pairOfBits = (x >>> b) & 3; 171 bytes[p] |= (pairOfBits << (currentBit - 1)); 172 173 currentBit -= 2; 174 175 if (currentBit == -1) 176 { 177 p++; 178 bytes[p] = 0; 179 currentBit = 7; 180 } 181 } 182 } 183 184 void outputByte(ubyte b) 185 { 186 outputBits(b, 8); 187 } 188 189 qoi10_rgba_t px = initialPredictor; 190 qoi10_rgba_t px_ref = initialPredictor; 191 192 // To serve as predictor 193 qoi10_rgba_t* scanlineConverted = cast(qoi10_rgba_t*)(&bytes[max_size]); 194 qoi10_rgba_t* lastScanlineConverted = cast(qoi10_rgba_t*)(&bytes[max_size + scanline_size]); 195 196 ubyte[1024] index_lookup; 197 uint index_pos = 0; 198 199 200 index_lookup[] = 0; 201 202 bool streamIsGrey = (channels == 1 || channels == 2); 203 int run = 0; 204 int encoded_pixels = 0; 205 206 void encodeRun() 207 { 208 assert(run > 0 && run <= 256); 209 run--; 210 if (run < 7) 211 { 212 outputByte( cast(ubyte)(QOI_OP_RUN | run) ); // run 1 to 7 213 } 214 else 215 { 216 outputByte( cast(ubyte)(QOI_OP_RUN | 7) ); // QOI_OP_RUN2 is inside the QOI_OP_RUN 217 outputBits(run - 7, 8); 218 } 219 run = 0; 220 } 221 222 for (int posy = 0; posy < desc.height; ++posy) 223 { 224 { 225 const(ushort)* line = cast(const(ushort*))(data + desc.pitchBytes * posy); 226 227 // 1. First convert the scanline to full qoi10_rgba_t to serve as predictor. 228 229 for (int posx = 0; posx < desc.width; ++posx) 230 { 231 qoi10_rgba_t pixel; 232 233 // Note that we drop six lower bits here. This codec is lossy 234 // if you really have more than 10-bits of precision. 235 // The use case is PBR knob in Dplug, who needs 10-bit (presumably) for the elevation map. 236 switch(channels) 237 { 238 default: 239 case 4: 240 pixel.r = line[posx * channels + 0]; 241 pixel.g = line[posx * channels + 1]; 242 pixel.b = line[posx * channels + 2]; 243 pixel.a = line[posx * channels + 3]; 244 break; 245 case 3: 246 pixel.r = line[posx * channels + 0]; 247 pixel.g = line[posx * channels + 1]; 248 pixel.b = line[posx * channels + 2]; 249 pixel.a = 65535; 250 break; 251 case 2: 252 pixel.r = line[posx * channels + 0]; 253 pixel.g = pixel.r; 254 pixel.b = pixel.r; 255 pixel.a = line[posx * channels + 1]; 256 break; 257 case 1: 258 pixel.r = line[posx * channels + 0]; 259 pixel.g = pixel.r; 260 pixel.b = pixel.r; 261 pixel.a = 65535; 262 break; 263 } 264 265 pixel.r = pixel.r >>> 6; 266 pixel.g = pixel.g >>> 6; 267 pixel.b = pixel.b >>> 6; 268 pixel.a = pixel.a >>> 6; 269 270 assert(pixel.r <= 1023); 271 assert(pixel.g <= 1023); 272 assert(pixel.b <= 1023); 273 assert(pixel.a <= 1023); 274 275 scanlineConverted[posx] = pixel; // Note: if we'd like lossy, this would be the reconstructed buffer. 276 } 277 } 278 279 for (int posx = 0; posx < desc.width; ++posx) 280 { 281 px_ref = px; 282 283 px = scanlineConverted[posx]; 284 285 if (px == px_ref) 286 { 287 run++; 288 if (run == 256 || encoded_pixels + 1 == num_pixels) 289 encodeRun(); 290 } 291 else 292 { 293 if (run > 0) 294 encodeRun(); 295 296 { 297 int va = (px.a - px_ref.a) & 1023; 298 if (va) 299 { 300 if (va < 16 || va >= (1024 - 16)) // does it fit on 5 bits? 301 { 302 // it fits on 5 bits 303 outputBits((0x1d << 5) | (va & 0x1f), 10); // QOI_OP_ADIFF 304 } 305 else if (va < 128 || va >= (1024 - 128)) // does it fit on 8 bits? 306 { 307 outputBits( (QOI_OP_ADIFF2 >>> 2), 6); 308 outputBits(va, 8); 309 } 310 else 311 { 312 outputByte(QOI_OP_RGBA); 313 outputBits(px.r, 10); 314 if (!streamIsGrey) 315 { 316 outputBits(px.g, 10); 317 outputBits(px.b, 10); 318 } 319 outputBits(px.a, 10); 320 goto pixel_is_encoded; 321 } 322 } 323 324 if (posy > 0 && enableAveragePrediction) 325 { 326 static if (newpred) 327 { 328 if (posx == 0) 329 { 330 // first pixel in the row, take above pixel 331 px_ref.r = lastScanlineConverted[posx].r; 332 px_ref.g = lastScanlineConverted[posx].g; 333 px_ref.b = lastScanlineConverted[posx].b; 334 } 335 else 336 { 337 qoi10_rgba_t pred = locoIntraPredictionSIMD(px_ref, lastScanlineConverted[posx], lastScanlineConverted[posx-1]); 338 px_ref.r = pred.r; 339 px_ref.g = pred.g; 340 px_ref.b = pred.b; 341 } 342 } 343 else 344 { 345 px_ref.r = (px_ref.r + lastScanlineConverted[posx].r + 1) >> 1; 346 px_ref.g = (px_ref.g + lastScanlineConverted[posx].g + 1) >> 1; 347 px_ref.b = (px_ref.b + lastScanlineConverted[posx].b + 1) >> 1; 348 } 349 } 350 351 int vg = (px.g - px_ref.g) & 1023; 352 int vg_r = (px.r - px_ref.r - vg) & 1023; 353 int vg_b = (px.b - px_ref.b - vg) & 1023; 354 355 if (streamIsGrey) 356 { 357 assert(vg_r == 0); 358 assert(vg_b == 0); 359 } 360 361 362 if ( ( (vg_r >= (1024- 4)) || (vg_r < 4) ) && // fits in 3 bits? 363 ( (vg >= (1024-8)) || (vg < 8) ) && // fits in 4 bits? 364 ( (vg_b >= (1024- 4)) || (vg_b < 4) ) ) // fits in 3 bits? 365 { 366 outputBits(0x20 | (vg & 0x0f), 6); // QOI_OP_LUMA0 367 if (!streamIsGrey) 368 { 369 outputBits( (vg_r << 3) | (vg_b & 7), 6); 370 } 371 } 372 else if ( ( (vg_r >= (1024- 8)) || (vg_r < 8) ) && // fits in 4 bits? 373 ( (vg >= (1024-16)) || (vg < 16) ) && // fits in 5 bits? 374 ( (vg_b >= (1024- 8)) || (vg_b < 8) ) ) // fits in 4 bits? 375 { 376 outputBits(vg & 0x1f, 6); // QOI_OP_LUMA 377 if (!streamIsGrey) 378 { 379 outputBits(vg_r, 4); 380 outputBits(vg_b, 4); 381 } 382 } 383 else if (!streamIsGrey && px.g == px.r && px.g == px.b) 384 { 385 // Note: in greyscale, more expensive than QOI_OP_LUMA2 386 // This opcode should not be used if input is grey. 387 outputByte(QOI_OP_GRAY); 388 outputBits(px.g, 10); 389 } 390 else 391 if ( ( (vg_r >= (1024-32)) || (vg_r < 32) ) && // fits in 6 bits? 392 ( (vg >= (1024-64)) || (vg < 64) ) && // fits in 7 bits? 393 ( (vg_b >= (1024-32)) || (vg_b < 32) ) ) // fits in 6 bits? 394 { 395 outputBits((0x6 << 7) | (vg & 0x7f), 10); // QOI_OP_LUMA2 396 if (!streamIsGrey) 397 { 398 outputBits(vg_r, 6); 399 outputBits(vg_b, 6); 400 } 401 } 402 else 403 if ( ( (vg_r >= (1024-128)) || (vg_r < 128) ) && // fits in 8 bits? 404 ( (vg >= (1024-256)) || (vg < 256) ) && // fits in 9 bits? 405 ( (vg_b >= (1024-128)) || (vg_b < 128) ) ) // fits in 8 bits? 406 { 407 outputBits((0x1c << 9) | (vg & 0x1ff), 14); // QOI_OP_LUMA3 408 if (!streamIsGrey) 409 { 410 outputBits(vg_r, 8); 411 outputBits(vg_b, 8); 412 } 413 } 414 else 415 { 416 outputByte(QOI_OP_RGB); 417 outputBits(px.r, 10); 418 if (!streamIsGrey) 419 { 420 outputBits(px.g, 10); 421 outputBits(px.b, 10); 422 } 423 } 424 } 425 } 426 427 pixel_is_encoded: 428 429 encoded_pixels++; 430 } 431 432 // Exchange scanline pointers 433 { 434 qoi10_rgba_t* temp = scanlineConverted; 435 scanlineConverted = lastScanlineConverted; 436 lastScanlineConverted = temp; 437 } 438 } 439 440 for (int i = 0; i < cast(int)(qoi10b_padding.sizeof); i++) 441 { 442 outputByte(qoi10b_padding[i]); 443 } 444 445 // finish the last byte 446 if (currentBit != 7) 447 outputBits(0xff, currentBit + 1); 448 assert(currentBit == 7); // full byte 449 450 *out_len = p; 451 return bytes; 452 } 453 454 /* Decode a QOI-10b image from memory. 455 456 The function either returns null on failure (invalid parameters or malloc 457 failed) or a pointer to the decoded 16-bit pixels. On success, the qoi_desc struct 458 is filled with the description from the file header. 459 460 The returned pixel data should be free()d after use. */ 461 ubyte* qoi10b_decode(const(void)* data, int size, qoi_desc *desc, int channels) 462 { 463 const(ubyte)* bytes; 464 uint header_magic; 465 466 int p = 0, run = 0; 467 468 if (data == null || desc == null || 469 (channels < 0 || channels > 4) || 470 size < QOIX_HEADER_SIZE + cast(int)(qoi10b_padding.sizeof) 471 ) 472 { 473 return null; 474 } 475 476 bytes = cast(const(ubyte)*)data; 477 478 header_magic = qoi_read_32(bytes, &p); 479 desc.width = qoi_read_32(bytes, &p); 480 desc.height = qoi_read_32(bytes, &p); 481 int qoix_version = bytes[p++]; 482 desc.channels = bytes[p++]; 483 desc.bitdepth = bytes[p++]; 484 desc.colorspace = bytes[p++]; 485 desc.compression = bytes[p++]; 486 desc.pixelAspectRatio = qoi_read_32f(bytes, &p); 487 desc.resolutionY = qoi_read_32f(bytes, &p); 488 489 if (desc.width == 0 || desc.height == 0 || 490 desc.channels < 1 || desc.channels > 4 || 491 desc.colorspace > 1 || 492 desc.bitdepth != 10 || 493 qoix_version > 1 || 494 desc.compression != QOIX_COMPRESSION_NONE || 495 header_magic != QOIX_MAGIC || 496 desc.height >= QOIX_PIXELS_MAX / desc.width 497 ) 498 { 499 return null; 500 } 501 502 int streamChannels = desc.channels; 503 504 if (channels == 0) 505 channels = streamChannels; 506 507 int stride = desc.width * channels * 2; 508 desc.pitchBytes = stride; 509 510 int pixel_data_size = stride * desc.height; 511 int decoded_scanline_size = desc.width * cast(int)qoi10_rgba_t.sizeof; 512 513 ubyte* pixels = cast(ubyte*) QOI_MALLOC(pixel_data_size + 2 * decoded_scanline_size); 514 if (!pixels) { 515 return null; 516 } 517 518 // double-buffered scanline, for correct average predictors 519 // (else we can't decode 4/3 channels to 1/2 with average prediction, the predictors would be wrong 520 // if taken from the decoded output) 521 qoi10_rgba_t* decodedScanline = cast(qoi10_rgba_t*)(&pixels[pixel_data_size]); 522 qoi10_rgba_t* lastDecodedScanline = cast(qoi10_rgba_t*)(&pixels[pixel_data_size + decoded_scanline_size]); 523 524 assert(channels >= 1 && channels <= 4); 525 526 int currentBit = 7; 527 528 void rewindInputBit() nothrow @nogc 529 { 530 if (currentBit == 7) 531 { 532 p--; 533 currentBit = -1; 534 } 535 currentBit++; 536 } 537 538 int read2Bits() nothrow @nogc 539 { 540 ubyte bb = bytes[p]; 541 542 int bit = (bytes[p] >>> (currentBit - 1)) & 3; 543 544 currentBit -= 2; 545 if (currentBit == -1) 546 { 547 currentBit = 7; 548 p++; 549 } 550 return bit; 551 } 552 553 uint readBits(int nbits) nothrow @nogc 554 { 555 assert(nbits % 2 == 0); 556 uint r = 0; 557 for (int b = 0; b < nbits; b += 2) 558 { 559 r = (r << 2) | read2Bits(); 560 } 561 return r; 562 } 563 564 ubyte readByte() nothrow @nogc 565 { 566 return cast(ubyte) readBits(8); 567 } 568 569 bool streamIsGrey = (streamChannels == 1 || streamChannels == 2); 570 571 qoi10_rgba_t px = initialPredictor; 572 qoi10_rgba_t px_ref = initialPredictor; 573 574 int decoded_pixels = 0; 575 int num_pixels = desc.width * desc.height; 576 577 for (int posy = 0; posy < desc.height; ++posy) 578 { 579 for (int posx = 0; posx < desc.width; ++posx) 580 { 581 px_ref = px; 582 583 if (run > 0) 584 { 585 run--; 586 } 587 else if (decoded_pixels < num_pixels) 588 { 589 // Compute averaged predictors then 590 591 if (posy > 0 && enableAveragePrediction) 592 { 593 static if (newpred) 594 { 595 if (posx == 0) 596 { 597 // first pixel in the row, take above pixel 598 px_ref.r = lastDecodedScanline[posx].r; 599 px_ref.g = lastDecodedScanline[posx].g; 600 px_ref.b = lastDecodedScanline[posx].b; 601 } 602 else 603 { 604 qoi10_rgba_t pred = locoIntraPredictionSIMD(px_ref, lastDecodedScanline[posx], lastDecodedScanline[posx-1]); 605 px_ref.r = pred.r; 606 px_ref.g = pred.g; 607 px_ref.b = pred.b; 608 } 609 } 610 else 611 { 612 px_ref.r = (px_ref.r + lastDecodedScanline[posx].r + 1) >> 1; 613 px_ref.g = (px_ref.g + lastDecodedScanline[posx].g + 1) >> 1; 614 px_ref.b = (px_ref.b + lastDecodedScanline[posx].b + 1) >> 1; 615 } 616 } 617 618 decode_next_op: 619 620 ubyte op = readByte(); 621 622 if (op < 0x80) // QOI_OP_LUMA 623 { 624 int vg = (op >> 2) & 31; // vg is a signed 5-bit number 625 vg = (vg << 27) >> 27; // extends sign 626 px.g = (px_ref.g + vg ) & 1023; 627 628 if (!streamIsGrey) 629 { 630 int vg_r = ((op & 3) << 2) | readBits(2); // vg_r and vg_b are signed 4-bit number in the stream 631 vg_r = (vg_r << 28) >> 28; // extends sign 632 int vg_b = cast(int)(readBits(4) << 28) >> 28; 633 px.r = (px_ref.r + vg + vg_r) & 1023; 634 px.b = (px_ref.b + vg + vg_b) & 1023; 635 } 636 else 637 { 638 // Rewind two bits, this is always possible 639 rewindInputBit(); 640 rewindInputBit(); 641 px.r = px.g; 642 px.b = px.g; 643 } 644 } 645 else if (op < 0xc0) // QOI_OP_LUMA0 10xxxx 646 { 647 // 10gggg[rrrggg] 648 649 int vg = (op >> 2) & 15; // vg is a signed 4-bit number 650 vg = (vg << 28) >> 28; // extends sign 651 px.g = (px_ref.g + vg ) & 1023; 652 if (!streamIsGrey) 653 { 654 uint remain = readBits(4); 655 int vg_r = ((op & 3) << 1) | (remain >> 3); 656 int vg_b = remain & 7; 657 vg_r = (vg_r << 29) >> 29; 658 vg_b = (vg_b << 29) >> 29; 659 px.r = (px_ref.r + vg + vg_r) & 1023; 660 px.b = (px_ref.b + vg + vg_b) & 1023; 661 } 662 else 663 { 664 // Rewind two bits, this is always possible 665 rewindInputBit(); 666 rewindInputBit(); 667 px.r = px.g; 668 px.b = px.g; 669 } 670 } 671 else if (op < 0xe0) // QOI_OP_LUMA2 672 { 673 int vg = ((op & 31) << 2) | readBits(2); // vg is a signed 7-bit number 674 vg = (vg << 25) >> 25; // extends sign 675 px.g = (px_ref.g + vg ) & 1023; 676 if (!streamIsGrey) 677 { 678 int vg_r = cast(int)(readBits(6) << 26) >> 26; // vg_r and vg_b are signed 6-bit number in the stream 679 int vg_b = cast(int)(readBits(6) << 26) >> 26; 680 px.r = (px_ref.r + vg + vg_r) & 1023; 681 px.b = (px_ref.b + vg + vg_b) & 1023; 682 } 683 else 684 { 685 px.r = px.g; 686 px.b = px.g; 687 } 688 } 689 else if (op < 0xe8) // QOI_OP_LUMA3 690 { 691 int vg = ((op & 7) << 6) | readBits(6); // vg is a signed 9-bit number 692 vg = (vg << 23) >> 23; // extends sign 693 px.g = (px_ref.g + vg ) & 1023; 694 if (!streamIsGrey) 695 { 696 int vg_r = cast(int)(readBits(8) << 24) >> 24; // vg_r and vg_b are signed 8-bit number in the stream 697 int vg_b = cast(int)(readBits(8) << 24) >> 24; 698 px.r = (px_ref.r + vg + vg_r) & 1023; 699 px.b = (px_ref.b + vg + vg_b) & 1023; 700 } 701 else 702 { 703 px.r = px.g; 704 px.b = px.g; 705 } 706 } 707 else if (op < 0xf0) // QOI_OP_ADIFF 708 { 709 int adiff = ((op & 7) << 2) | readBits(2); 710 adiff = adiff << 27; // Need sign extension, else the negatives aren't negatives. 711 adiff = adiff >> 27; 712 px.a = cast(ushort)((px.a + adiff) & 1023); 713 goto decode_next_op; 714 } 715 else if ((op & 0xfc) == QOI_OP_ADIFF2) 716 { 717 int adiff = ((op & 3) << 6) | readBits(6); 718 adiff = (adiff << 24) >> 24; // sign-extend 719 px.a = cast(ushort)((px.a + adiff) & 1023); 720 goto decode_next_op; 721 } 722 else if (op < 0xf8) // QOI_OP_RUN 723 { 724 run = op & 7; 725 if (run == 7) 726 { 727 run = readBits(8) + 7; 728 } 729 } 730 else if (op == QOI_OP_RGB) 731 { 732 px.r = cast(ushort) readBits(10); 733 if (!streamIsGrey) 734 { 735 px.g = cast(ushort) readBits(10); 736 px.b = cast(ushort) readBits(10); 737 } 738 else 739 { 740 px.g = px.r; 741 px.b = px.r; 742 } 743 } 744 else if (op == QOI_OP_RGBA) 745 { 746 px.r = cast(ushort) readBits(10); 747 if (!streamIsGrey) 748 { 749 px.g = cast(ushort) readBits(10); 750 px.b = cast(ushort) readBits(10); 751 } 752 else 753 { 754 px.g = px.r; 755 px.b = px.r; 756 } 757 px.a = cast(ushort) readBits(10); 758 } 759 else if (op == QOI_OP_GRAY) 760 { 761 px.r = cast(ushort) readBits(10); 762 px.g = px.r; 763 px.b = px.r; 764 } 765 else if (op == QOI_OP_END) 766 { 767 goto finished; 768 } 769 else 770 { 771 assert(false); 772 } 773 } 774 775 decodedScanline[posx] = px; 776 decoded_pixels++; 777 } 778 779 // convert just-decoded scanline into output type 780 ushort* line = cast(ushort*)(pixels + desc.pitchBytes * posy); 781 782 // PERF: can be 4 different loops here? 783 for (int posx = 0; posx < desc.width; ++posx) 784 { 785 qoi10_rgba_t px16b = decodedScanline[posx]; // 0..1023 components 786 px16b.r = cast(ushort)(px16b.r << 6 | (px16b.r >> 4)); 787 px16b.g = cast(ushort)(px16b.g << 6 | (px16b.g >> 4)); 788 px16b.b = cast(ushort)(px16b.b << 6 | (px16b.b >> 4)); 789 px16b.a = cast(ushort)(px16b.a << 6 | (px16b.a >> 4)); 790 791 // Expand 10 bit to 16-bits 792 switch(channels) 793 { 794 default: 795 case 4: 796 line[posx * channels + 0] = px16b.r; 797 line[posx * channels + 1] = px16b.g; 798 line[posx * channels + 2] = px16b.b; 799 line[posx * channels + 3] = px16b.a; 800 break; 801 case 3: 802 line[posx * channels + 0] = px16b.r; 803 line[posx * channels + 1] = px16b.g; 804 line[posx * channels + 2] = px16b.b; 805 break; 806 case 2: 807 line[posx * channels + 0] = px16b.r; 808 line[posx * channels + 1] = px16b.a; 809 break; 810 case 1: 811 line[posx * channels + 0] = px16b.r; 812 break; 813 } 814 } 815 816 // swap decoded scanline buffers 817 { 818 qoi10_rgba_t* temp = decodedScanline; 819 decodedScanline = lastDecodedScanline; 820 lastDecodedScanline = temp; 821 } 822 } 823 824 finished: 825 return pixels; 826 } 827 828 static qoi10_rgba_t locoIntraPredictionSIMD(qoi10_rgba_t a, qoi10_rgba_t b, qoi10_rgba_t c) 829 { 830 // load RGBA16 pixels 831 __m128i A = _mm_loadu_si64(&a); 832 __m128i B = _mm_loadu_si64(&b); 833 __m128i C = _mm_loadu_si64(&c); 834 835 // Max predictor (A + B - C) 836 __m128i P = _mm_sub_epi16(_mm_add_epi16(A, B), C); 837 __m128i maxAB = _mm_max_epi16(A, B); 838 __m128i minAB = _mm_min_epi16(A, B); 839 840 // 1111 where we should use max(A, B) 841 __m128i maxMask = _mm_cmple_epi16(C, minAB); 842 843 // 1111 where we should use min(A, B) 844 __m128i minMask = _mm_cmpge_epi16(C, maxAB); 845 846 P = (P & (~minMask)) | (minAB & minMask); 847 P = (P & (~maxMask)) | (maxAB & maxMask); 848 849 // Get back to 10-bit, clip 850 851 __m128i Z = _mm_setzero_si128(); 852 __m128i m1023 = _mm_set1_epi16(1023); 853 P = _mm_max_epi16(P, Z); 854 P = _mm_min_epi16(P, m1023); 855 856 qoi10_rgba_t r; 857 _mm_storel_epi64(cast(__m128i*)&r, P); 858 859 return r; 860 } 861 862 /+ 863 static short locoIntraPrediction(short a, short b, short c) 864 { 865 short max_ab = a > b ? a : b; 866 short min_ab = a < b ? a : b; 867 if (c >= max_ab) 868 return cast(short)min_ab; 869 else if (c <= min_ab) 870 return cast(short)max_ab; 871 else 872 { 873 int d = a + b - c; 874 if (d < 0) 875 d = 0; 876 if (d > 1023) 877 d = 1023; 878 return cast(short)d; 879 } 880 } 881 +/ 882 883 private __m128i _mm_cmple_epi16(__m128i a, __m128i b) pure @safe 884 { 885 return _mm_or_si128(_mm_cmplt_epi16(a, b), _mm_cmpeq_epi16(a, b)); 886 } 887 888 private __m128i _mm_cmpge_epi16(__m128i a, __m128i b) 889 { 890 return _mm_or_si128(_mm_cmpgt_epi16(a, b), _mm_cmpeq_epi16(a, b)); 891 }