1 module gamut.codecs.qoi10b;
2 
3 nothrow @nogc:
4 
5 import core.stdc.stdlib: realloc, malloc, free;
6 import core.stdc.string: memset;
7 
8 import gamut.codecs.qoi2avg;
9 import inteli.emmintrin;
10 
11 /// A QOI-inspired codec for 10-bit images, called "QOI-10b". 
12 /// Input image is 16-bit ushort, but only 10-bits gets encoded making it lossy.
13 ///
14 ///
15 /// Incompatible adaptation of QOI format - https://phoboslab.org
16 ///
17 /// -- LICENSE: The MIT License(MIT)
18 /// Copyright(c) 2021 Dominic Szablewski (original QOI format)
19 /// Copyright(c) 2022 Guillaume Piolat (QOI-10b variant for 10b images, 1/2/3/4 channels).
20 /// Permission is hereby granted, free of charge, to any person obtaining a copy of
21 /// this software and associated documentation files(the "Software"), to deal in
22 /// the Software without restriction, including without limitation the rights to
23 /// use, copy, modify, merge, publish, distribute, sublicense, and / or sell copies
24 /// of the Software, and to permit persons to whom the Software is furnished to do
25 /// so, subject to the following conditions :
26 /// The above copyright notice and this permission notice shall be included in all
27 /// copies or substantial portions of the Software.
28 /// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
29 /// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
30 /// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE
31 /// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
32 /// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
33 /// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
34 /// SOFTWARE.
35 
36 /// -- Documentation
37 
38 /// This library provides the following functions;
39 /// - qoi10b_decode  -- decode the raw bytes of a QOI-10b image from memory
40 /// - qoi10b_encode  -- encode an rgba buffer into a QOI-10b image in memory
41 /// 
42 ///
43 /// A QOI-10b file has a 25 byte header, compatible with Gamut QOIX.
44 ///
45 /// struct qoix_header_t {
46 ///     char     magic[4];         // magic bytes "qoix"
47 ///     uint32_t width;            // image width in pixels (BE)
48 ///     uint32_t height;           // image height in pixels (BE)
49 ///     uint8_t  version_;         // Major version of QOIX format.
50 ///     uint8_t  channels;         // 1, 2, 3 or 4
51 ///     uint8_t  bitdepth;         // 10 = this QOI-10b codec is always 10-bit (8 would indicate QOI2AVG or QOI-plane)
52 ///     uint8_t  colorspace;       // 0 = sRGB with linear alpha, 1 = all channels linear
53 ///     uint8_t  compression;      // 0 = none, 1 = LZ4
54 ///     float    pixelAspectRatio; // -1 = unknown, else Pixel Aspect Ratio
55 ///     float    resolutionX;      // -1 = unknown, else physical resolution in DPI
56 /// };
57 ///
58 /// The decoder and encoder start with {r: 0; g: 0; b: 0; a: 0} as the previous
59 /// pixel value. Pixels are either encoded as
60 /// - a run of the previous pixel
61 /// - a difference to the previous pixel value
62 /// - full luminance value
63 ///
64 /// The byte stream's end is marked with 5 0xff bytes.
65 ///
66 /// This codec is simply like QOI2AVG but with extra 2 bits for each components, and it breaks byte alignment.
67 ///
68 /// Optimized?    Opcode       Bits(RGB)   Bits(grey)  Meaning
69 /// [ ]           QOI_OP_LUMA      14           6     0ggggg[rrrrbbbb]  (less g or more g => doesn't work)
70 /// [ ]           QOI_OP_LUMA0     12           6     10gggg[rrrbbb]
71 /// [ ]           QOI_OP_LUMA2     22          10     110ggggggg[rrrrrrbbbbbb]
72 /// [ ]           QOI_OP_LUMA3     30          14     11100ggggggggg[rrrrrrrrbbbbbbbb]
73 /// [ ]           QOI_OP_ADIFF     10          10     11101xxxxx
74 /// [x]           QOI_OP_RUN        8           8     11110xxx
75 ///                                16          16     11110111xxxxxxxx
76 /// [ ]           QOI_OP_ADIFF2    16          16     111110xxxxxxxx 
77 /// [x]           QOI_OP_GRAY      18          18     11111100gggggggggg
78 /// [ ]           QOI_OP_RGB       38          18     11111101rrrrrrrrrr[ggggggggggbbbbbbbbbb]
79 /// [ ]           QOI_OP_RGBA      48          28     11111110rrrrrrrrrr[ggggggggggbbbbbbbbbb]aaaaaaaaaa
80 /// [ ]           QOI_OP_END        8           8     11111111
81 
82 // Note: LOCO-I predictor doesn't really work here, it slowdown quite a bit for 0.5% bitrate improvement.
83 //       Not sure what happens here.
84 
85 enum ubyte QOI_OP_ADIFF2 = 0xf8;
86 
87 enum int WORST_OPCODE_BITS = 48;
88 
89 enum enableAveragePrediction = true; 
90 
91 enum bool newpred = false;//true;
92 
93 static immutable ubyte[5] qoi10b_padding = [255,255,255,255,255];
94 
95 enum qoi10_rgba_t initialPredictor = { r:0, g:0, b:0, a:1023 };
96 
97 struct qoi10_rgba_t 
98 {   
99     // 0 to 1023 values
100     ushort r;
101     ushort g;
102     ushort b;
103     ushort a;
104 }
105 
106 uint QOI10b_COLOR_HASH(qoi10_rgba_t C)
107 {
108     long colorAsLong = *(cast(long*)&C);
109     return (((colorAsLong * 2654435769) >> 22) & 1023);
110 }
111 
112 /* Encode raw RGBA16 pixels into a QOI-10b image in memory.
113 This immediately loosen precision from 16-bit t 10-bit, so this is lossy.
114 The function either returns null on failure (invalid parameters or malloc 
115 failed) or a pointer to the encoded data on success. On success the out_len 
116 is set to the size in bytes of the encoded data.
117 The returned qoi data should be free()d after use. */
118 ubyte* qoi10b_encode(const(ubyte)* data, const(qoi_desc)* desc, int *out_len) 
119 {
120     if ( (desc.channels != 1 && desc.channels != 2 && desc.channels != 3 && desc.channels != 4) ||
121         desc.height >= QOIX_PIXELS_MAX / desc.width || desc.compression != QOIX_COMPRESSION_NONE
122     ) {
123         return null;
124     }
125 
126     if (desc.bitdepth != 10)
127         return null;
128 
129     int channels = desc.channels;
130 
131     // At worst, each pixel take 38 bit to be encoded.
132     int num_pixels = desc.width * desc.height;
133 
134    int scanline_size = cast(int)(qoi10_rgba_t.sizeof) * desc.width;
135 
136     int max_size = cast(int) (cast(long)num_pixels * WORST_OPCODE_BITS + 7) / 8 + QOIX_HEADER_SIZE + cast(int)(qoi10b_padding.sizeof);
137 
138     ubyte* stream;
139 
140     int p = 0; // write index into output stream
141     ubyte* bytes = cast(ubyte*) QOI_MALLOC(max_size + 2 * scanline_size);
142     if (!bytes) 
143     {
144         return null;
145     }
146 
147     qoi_write_32(bytes, &p, QOIX_MAGIC);
148     qoi_write_32(bytes, &p, desc.width);
149     qoi_write_32(bytes, &p, desc.height);
150     bytes[p++] = 1; // Put a version number :)
151     bytes[p++] = desc.channels; // 1, 2, 3 or 4
152     bytes[p++] = desc.bitdepth; // 10
153     bytes[p++] = desc.colorspace;
154     bytes[p++] = QOIX_COMPRESSION_NONE;
155     qoi_write_32f(bytes, &p, desc.pixelAspectRatio);
156     qoi_write_32f(bytes, &p, desc.resolutionY);
157 
158     int currentBit = 7; // beginning of a byte
159     bytes[p] = 0;
160 
161     // write the nbits last bits of x, starting from the highest one
162     void outputBits(uint x, int nbits) nothrow @nogc
163     {
164         assert(nbits >= 2 && nbits <= 16);
165         assert( (nbits % 2) == 0);
166 
167         for (int b = nbits - 2; b >= 0; b -= 2)
168         {
169             // which bit to write
170             ubyte pairOfBits = (x >>> b) & 3;
171             bytes[p] |= (pairOfBits << (currentBit - 1));
172 
173             currentBit -= 2;
174 
175             if (currentBit == -1)
176             {
177                 p++;
178                 bytes[p] = 0;
179                 currentBit = 7;
180             }
181         }
182     }
183 
184     void outputByte(ubyte b)
185     {
186         outputBits(b, 8);
187     }
188 
189     qoi10_rgba_t px = initialPredictor;
190     qoi10_rgba_t px_ref = initialPredictor;
191 
192     // To serve as predictor
193     qoi10_rgba_t* scanlineConverted     = cast(qoi10_rgba_t*)(&bytes[max_size]);
194     qoi10_rgba_t* lastScanlineConverted = cast(qoi10_rgba_t*)(&bytes[max_size + scanline_size]);
195 
196     ubyte[1024] index_lookup;
197     uint index_pos = 0;
198     
199     
200     index_lookup[] = 0;
201 
202     bool streamIsGrey = (channels == 1 || channels == 2);
203     int run = 0;
204     int encoded_pixels = 0;
205 
206     void encodeRun()
207     {
208         assert(run > 0 && run <= 256);
209         run--;
210         if (run < 7) 
211         {
212             outputByte( cast(ubyte)(QOI_OP_RUN | run) ); // run 1 to 7
213         }
214         else 
215         {
216             outputByte( cast(ubyte)(QOI_OP_RUN | 7) ); // QOI_OP_RUN2 is inside the QOI_OP_RUN
217             outputBits(run - 7, 8);
218         }
219         run = 0;
220     }
221     
222     for (int posy = 0; posy < desc.height; ++posy)
223     {
224         {
225             const(ushort)* line = cast(const(ushort*))(data + desc.pitchBytes * posy);
226    
227             // 1. First convert the scanline to full qoi10_rgba_t to serve as predictor.
228 
229             for (int posx = 0; posx < desc.width; ++posx)
230             {
231                 qoi10_rgba_t pixel;
232 
233                 // Note that we drop six lower bits here. This codec is lossy 
234                 // if you really have more than 10-bits of precision.
235                 // The use case is PBR knob in Dplug, who needs 10-bit (presumably) for the elevation map.
236                 switch(channels)
237                 {
238                     default:
239                     case 4:
240                         pixel.r = line[posx * channels + 0];
241                         pixel.g = line[posx * channels + 1];
242                         pixel.b = line[posx * channels + 2];
243                         pixel.a = line[posx * channels + 3];
244                         break;
245                     case 3:
246                         pixel.r = line[posx * channels + 0];
247                         pixel.g = line[posx * channels + 1];
248                         pixel.b = line[posx * channels + 2];
249                         pixel.a = 65535;
250                         break;
251                     case 2:
252                         pixel.r = line[posx * channels + 0];
253                         pixel.g = pixel.r;
254                         pixel.b = pixel.r;
255                         pixel.a = line[posx * channels + 1];
256                         break;
257                     case 1:
258                         pixel.r = line[posx * channels + 0];
259                         pixel.g = pixel.r;
260                         pixel.b = pixel.r;
261                         pixel.a = 65535;
262                         break;
263                 }
264 
265                 pixel.r = pixel.r >>> 6;
266                 pixel.g = pixel.g >>> 6;
267                 pixel.b = pixel.b >>> 6;
268                 pixel.a = pixel.a >>> 6;
269 
270                 assert(pixel.r <= 1023);
271                 assert(pixel.g <= 1023);
272                 assert(pixel.b <= 1023);
273                 assert(pixel.a <= 1023);
274 
275                 scanlineConverted[posx] = pixel; // Note: if we'd like lossy, this would be the reconstructed buffer.
276             }
277         }
278 
279         for (int posx = 0; posx < desc.width; ++posx)
280         {
281             px_ref = px;
282 
283             px = scanlineConverted[posx];
284 
285             if (px == px_ref) 
286             {
287                 run++;
288                 if (run == 256 || encoded_pixels + 1 == num_pixels)
289                     encodeRun();
290             }
291             else 
292             {
293                 if (run > 0) 
294                     encodeRun();
295 
296                 {
297                     int va = (px.a - px_ref.a) & 1023;
298                     if (va) 
299                     {
300                         if (va < 16 || va >= (1024 - 16)) // does it fit on 5 bits?
301                         {
302                             // it fits on 5 bits
303                             outputBits((0x1d << 5) | (va & 0x1f), 10); // QOI_OP_ADIFF
304                         }
305                         else if (va < 128 || va >= (1024 - 128)) // does it fit on 8 bits?
306                         {
307                             outputBits( (QOI_OP_ADIFF2 >>> 2), 6);
308                             outputBits(va, 8);  
309                         }
310                         else
311                         {
312                             outputByte(QOI_OP_RGBA);
313                             outputBits(px.r, 10);
314                             if (!streamIsGrey)
315                             {
316                                 outputBits(px.g, 10);
317                                 outputBits(px.b, 10);
318                             }
319                             outputBits(px.a, 10);
320                             goto pixel_is_encoded;
321                         }
322                     }
323 
324                     if (posy > 0 && enableAveragePrediction)
325                     {
326                         static if (newpred)
327                         {
328                             if (posx == 0)
329                             {
330                                 // first pixel in the row, take above pixel
331                                 px_ref.r = lastScanlineConverted[posx].r;
332                                 px_ref.g = lastScanlineConverted[posx].g;
333                                 px_ref.b = lastScanlineConverted[posx].b;
334                             }
335                             else 
336                             {
337                                 qoi10_rgba_t pred = locoIntraPredictionSIMD(px_ref, lastScanlineConverted[posx], lastScanlineConverted[posx-1]);
338                                 px_ref.r = pred.r;
339                                 px_ref.g = pred.g;
340                                 px_ref.b = pred.b;
341                             }
342                         }
343                         else
344                         {
345                             px_ref.r = (px_ref.r + lastScanlineConverted[posx].r + 1) >> 1;
346                             px_ref.g = (px_ref.g + lastScanlineConverted[posx].g + 1) >> 1;
347                             px_ref.b = (px_ref.b + lastScanlineConverted[posx].b + 1) >> 1;
348                         }
349                     }
350 
351                     int vg   = (px.g - px_ref.g) & 1023;
352                     int vg_r = (px.r - px_ref.r - vg) & 1023;
353                     int vg_b = (px.b - px_ref.b - vg) & 1023;
354 
355                     if (streamIsGrey)
356                     {
357                         assert(vg_r == 0);
358                         assert(vg_b == 0);
359                     }
360 
361 
362                     if ( ( (vg_r >= (1024- 4)) || (vg_r <  4) ) &&  // fits in 3 bits?
363                          ( (vg   >= (1024-8)) || (vg   < 8) ) &&    // fits in 4 bits?
364                         ( (vg_b >= (1024- 4)) || (vg_b <  4) ) )   // fits in 3 bits?
365                     {
366                         outputBits(0x20 | (vg & 0x0f), 6); // QOI_OP_LUMA0
367                         if (!streamIsGrey)
368                         {
369                             outputBits( (vg_r << 3) | (vg_b & 7), 6);
370                         }
371                     }
372                     else if ( ( (vg_r >= (1024- 8)) || (vg_r <  8) ) &&  // fits in 4 bits?
373                          ( (vg   >= (1024-16)) || (vg   < 16) ) &&  // fits in 5 bits?
374                          ( (vg_b >= (1024- 8)) || (vg_b <  8) ) )   // fits in 4 bits?
375                     {
376                         outputBits(vg & 0x1f, 6); // QOI_OP_LUMA
377                         if (!streamIsGrey)
378                         {
379                             outputBits(vg_r, 4);
380                             outputBits(vg_b, 4);
381                         }
382                     }
383                     else if (!streamIsGrey && px.g == px.r && px.g == px.b) 
384                     {  
385                         // Note: in greyscale, more expensive than QOI_OP_LUMA2
386                         // This opcode should not be used if input is grey.
387                         outputByte(QOI_OP_GRAY);
388                         outputBits(px.g, 10);
389                     }
390                     else
391                     if ( ( (vg_r >= (1024-32)) || (vg_r < 32) ) &&  // fits in 6 bits?
392                          ( (vg   >= (1024-64)) || (vg   < 64) ) &&  // fits in 7 bits?
393                          ( (vg_b >= (1024-32)) || (vg_b < 32) ) )   // fits in 6 bits?
394                     {
395                         outputBits((0x6 << 7) | (vg & 0x7f), 10); // QOI_OP_LUMA2
396                         if (!streamIsGrey)
397                         {
398                             outputBits(vg_r, 6);
399                             outputBits(vg_b, 6);
400                         }
401                     } 
402                     else
403                     if ( ( (vg_r >= (1024-128)) || (vg_r < 128) ) && // fits in 8 bits?
404                          ( (vg   >= (1024-256)) || (vg   < 256) ) && // fits in 9 bits?
405                          ( (vg_b >= (1024-128)) || (vg_b < 128) ) )   // fits in 8 bits?
406                     {
407                         outputBits((0x1c << 9) | (vg & 0x1ff), 14); // QOI_OP_LUMA3
408                         if (!streamIsGrey)
409                         {
410                             outputBits(vg_r, 8);
411                             outputBits(vg_b, 8);
412                         }
413                     } 
414                     else
415                     {
416                         outputByte(QOI_OP_RGB);
417                         outputBits(px.r, 10);
418                         if (!streamIsGrey)
419                         {
420                             outputBits(px.g, 10);
421                             outputBits(px.b, 10);
422                         }
423                     }
424                 }
425             }
426 
427             pixel_is_encoded:
428 
429             encoded_pixels++;
430         }
431 
432         // Exchange scanline pointers
433         {
434             qoi10_rgba_t* temp = scanlineConverted;
435             scanlineConverted = lastScanlineConverted;
436             lastScanlineConverted = temp;
437         }
438     }
439 
440     for (int i = 0; i < cast(int)(qoi10b_padding.sizeof); i++) 
441     {
442         outputByte(qoi10b_padding[i]);
443     }
444 
445     // finish the last byte
446     if (currentBit != 7)
447         outputBits(0xff, currentBit + 1);
448     assert(currentBit == 7); // full byte
449 
450     *out_len = p;
451     return bytes;
452 }
453 
454 /* Decode a QOI-10b image from memory.
455 
456 The function either returns null on failure (invalid parameters or malloc 
457 failed) or a pointer to the decoded 16-bit pixels. On success, the qoi_desc struct 
458 is filled with the description from the file header.
459 
460 The returned pixel data should be free()d after use. */
461 ubyte* qoi10b_decode(const(void)* data, int size, qoi_desc *desc, int channels) 
462 {
463     const(ubyte)* bytes;
464     uint header_magic;
465 
466     int p = 0, run = 0;
467 
468     if (data == null || desc == null ||
469         (channels < 0 || channels > 4) ||
470             size < QOIX_HEADER_SIZE + cast(int)(qoi10b_padding.sizeof)
471                 )
472     {
473         return null;
474     }
475 
476     bytes = cast(const(ubyte)*)data;
477 
478     header_magic = qoi_read_32(bytes, &p);
479     desc.width = qoi_read_32(bytes, &p);
480     desc.height = qoi_read_32(bytes, &p);
481     int qoix_version = bytes[p++];
482     desc.channels = bytes[p++];
483     desc.bitdepth = bytes[p++];
484     desc.colorspace = bytes[p++];
485     desc.compression = bytes[p++];
486     desc.pixelAspectRatio = qoi_read_32f(bytes, &p);
487     desc.resolutionY = qoi_read_32f(bytes, &p);
488 
489     if (desc.width == 0 || desc.height == 0 || 
490         desc.channels < 1 || desc.channels > 4 ||
491         desc.colorspace > 1 ||
492         desc.bitdepth != 10 ||
493         qoix_version > 1 ||
494         desc.compression != QOIX_COMPRESSION_NONE ||
495         header_magic != QOIX_MAGIC ||
496         desc.height >= QOIX_PIXELS_MAX / desc.width
497         ) 
498     {
499         return null;
500     }
501 
502     int streamChannels = desc.channels;
503 
504     if (channels == 0)
505         channels = streamChannels;
506 
507     int stride = desc.width * channels * 2;
508     desc.pitchBytes = stride;         
509 
510     int pixel_data_size = stride * desc.height;
511     int decoded_scanline_size = desc.width * cast(int)qoi10_rgba_t.sizeof;
512 
513     ubyte* pixels = cast(ubyte*) QOI_MALLOC(pixel_data_size + 2 * decoded_scanline_size);
514     if (!pixels) {
515         return null;
516     }
517 
518     // double-buffered scanline, for correct average predictors
519     // (else we can't decode 4/3 channels to 1/2 with average prediction, the predictors would be wrong
520     //  if taken from the decoded output)
521     qoi10_rgba_t* decodedScanline = cast(qoi10_rgba_t*)(&pixels[pixel_data_size]);
522     qoi10_rgba_t* lastDecodedScanline = cast(qoi10_rgba_t*)(&pixels[pixel_data_size + decoded_scanline_size]);
523 
524     assert(channels >= 1 && channels <= 4);
525 
526     int currentBit = 7;
527 
528     void rewindInputBit() nothrow @nogc
529     {
530         if (currentBit == 7)
531         {
532             p--;
533             currentBit = -1;
534         }
535         currentBit++;
536     }
537 
538     int read2Bits() nothrow @nogc
539     {
540         ubyte bb = bytes[p];
541 
542         int bit = (bytes[p] >>> (currentBit - 1)) & 3;
543 
544         currentBit -= 2;
545         if (currentBit == -1)
546         {
547             currentBit = 7;
548             p++;
549         }
550         return bit;
551     }
552 
553     uint readBits(int nbits) nothrow @nogc
554     {
555         assert(nbits % 2 == 0);
556         uint r = 0;
557         for (int b = 0; b < nbits; b += 2)
558         {
559             r = (r << 2) | read2Bits();
560         }
561         return r;
562     }
563 
564     ubyte readByte() nothrow @nogc
565     {
566         return cast(ubyte) readBits(8);
567     }
568 
569     bool streamIsGrey = (streamChannels == 1 || streamChannels == 2);
570 
571     qoi10_rgba_t px = initialPredictor;
572     qoi10_rgba_t px_ref = initialPredictor;
573 
574     int decoded_pixels = 0;
575     int num_pixels = desc.width * desc.height;
576 
577     for (int posy = 0; posy < desc.height; ++posy)
578     {
579         for (int posx = 0; posx < desc.width; ++posx)
580         {
581             px_ref = px;
582 
583             if (run > 0) 
584             {
585                 run--;
586             }
587             else if (decoded_pixels < num_pixels)
588             {
589                 // Compute averaged predictors then
590 
591                 if (posy > 0 && enableAveragePrediction)
592                 {
593                     static if (newpred)
594                     {
595                         if (posx == 0)
596                         {
597                             // first pixel in the row, take above pixel
598                             px_ref.r = lastDecodedScanline[posx].r;
599                             px_ref.g = lastDecodedScanline[posx].g;
600                             px_ref.b = lastDecodedScanline[posx].b;
601                         }
602                         else 
603                         {
604                             qoi10_rgba_t pred = locoIntraPredictionSIMD(px_ref, lastDecodedScanline[posx], lastDecodedScanline[posx-1]);
605                             px_ref.r = pred.r;
606                             px_ref.g = pred.g;
607                             px_ref.b = pred.b;
608                         }
609                     }
610                     else
611                     {
612                         px_ref.r = (px_ref.r + lastDecodedScanline[posx].r + 1) >> 1;
613                         px_ref.g = (px_ref.g + lastDecodedScanline[posx].g + 1) >> 1;
614                         px_ref.b = (px_ref.b + lastDecodedScanline[posx].b + 1) >> 1;
615                     }
616                 }
617 
618                 decode_next_op:
619 
620                 ubyte op = readByte();
621 
622                 if (op < 0x80)              // QOI_OP_LUMA
623                 {
624                     int vg = (op >> 2) & 31;  // vg is a signed 5-bit number
625                     vg = (vg << 27) >> 27;   // extends sign
626                     px.g = (px_ref.g + vg       ) & 1023;
627 
628                     if (!streamIsGrey)
629                     {
630                         int vg_r = ((op & 3) << 2) | readBits(2); // vg_r and vg_b are signed 4-bit number in the stream
631                         vg_r = (vg_r << 28) >> 28;   // extends sign    
632                         int vg_b = cast(int)(readBits(4) << 28) >> 28;
633                         px.r = (px_ref.r + vg + vg_r) & 1023;
634                         px.b = (px_ref.b + vg + vg_b) & 1023;
635                     }
636                     else
637                     {
638                         // Rewind two bits, this is always possible
639                         rewindInputBit();
640                         rewindInputBit();
641                         px.r = px.g;
642                         px.b = px.g;
643                     }
644                 }
645                 else if (op < 0xc0)    // QOI_OP_LUMA0 10xxxx
646                 {
647                     // 10gggg[rrrggg]
648 
649                     int vg = (op >> 2) & 15;  // vg is a signed 4-bit number
650                     vg = (vg << 28) >> 28;   // extends sign
651                     px.g = (px_ref.g + vg       ) & 1023;
652                     if (!streamIsGrey)
653                     {
654                         uint remain = readBits(4);
655                         int vg_r = ((op & 3) << 1) | (remain >> 3);
656                         int vg_b = remain & 7;
657                         vg_r = (vg_r << 29) >> 29;
658                         vg_b = (vg_b << 29) >> 29;
659                         px.r = (px_ref.r + vg + vg_r) & 1023;
660                         px.b = (px_ref.b + vg + vg_b) & 1023;
661                     }
662                     else
663                     {
664                         // Rewind two bits, this is always possible
665                         rewindInputBit();
666                         rewindInputBit();
667                         px.r = px.g;
668                         px.b = px.g;
669                     }
670                 }
671                 else if (op < 0xe0)    // QOI_OP_LUMA2
672                 {
673                     int vg   = ((op & 31) << 2) | readBits(2); // vg is a signed 7-bit number
674                     vg = (vg << 25) >> 25;                     // extends sign
675                     px.g = (px_ref.g + vg       ) & 1023;
676                     if (!streamIsGrey)
677                     {
678                         int vg_r = cast(int)(readBits(6) << 26) >> 26; // vg_r and vg_b are signed 6-bit number in the stream
679                         int vg_b = cast(int)(readBits(6) << 26) >> 26;
680                         px.r = (px_ref.r + vg + vg_r) & 1023;
681                         px.b = (px_ref.b + vg + vg_b) & 1023;
682                     }
683                     else
684                     {
685                         px.r = px.g;
686                         px.b = px.g;
687                     }
688                 }
689                 else if (op < 0xe8)    // QOI_OP_LUMA3
690                 {
691                     int vg   = ((op & 7) << 6) | readBits(6); // vg is a signed 9-bit number
692                     vg = (vg << 23) >> 23;                    // extends sign
693                     px.g = (px_ref.g + vg       ) & 1023;
694                     if (!streamIsGrey)
695                     {
696                         int vg_r = cast(int)(readBits(8) << 24) >> 24; // vg_r and vg_b are signed 8-bit number in the stream
697                         int vg_b = cast(int)(readBits(8) << 24) >> 24;
698                         px.r = (px_ref.r + vg + vg_r) & 1023;
699                         px.b = (px_ref.b + vg + vg_b) & 1023;
700                     }
701                     else
702                     {
703                         px.r = px.g;
704                         px.b = px.g;
705                     }
706                 }  
707                 else if (op < 0xf0)    // QOI_OP_ADIFF
708                 {
709                     int adiff = ((op & 7) << 2) | readBits(2);
710                     adiff = adiff << 27; // Need sign extension, else the negatives aren't negatives.
711                     adiff = adiff >> 27;
712                     px.a = cast(ushort)((px.a + adiff) & 1023);
713                     goto decode_next_op;
714                 }
715                 else if ((op & 0xfc) == QOI_OP_ADIFF2)
716                 {
717                     int adiff = ((op & 3) << 6) | readBits(6);
718                     adiff = (adiff << 24) >> 24; // sign-extend
719                     px.a = cast(ushort)((px.a + adiff) & 1023);
720                     goto decode_next_op;
721                 }
722                 else if (op < 0xf8) // QOI_OP_RUN
723                 {       
724                     run = op & 7;
725                     if (run == 7)
726                     {
727                         run = readBits(8) + 7;
728                     }
729                 }               
730                 else if (op == QOI_OP_RGB)
731                 {
732                     px.r = cast(ushort) readBits(10);
733                     if (!streamIsGrey)
734                     {
735                         px.g = cast(ushort) readBits(10);
736                         px.b = cast(ushort) readBits(10);
737                     }
738                     else
739                     {
740                         px.g = px.r;
741                         px.b = px.r;
742                     }
743                }
744                 else if (op == QOI_OP_RGBA)
745                 {
746                     px.r = cast(ushort) readBits(10);
747                     if (!streamIsGrey)
748                     {
749                         px.g = cast(ushort) readBits(10);
750                         px.b = cast(ushort) readBits(10);
751                     }
752                     else
753                     {
754                         px.g = px.r;
755                         px.b = px.r;
756                     }
757                     px.a = cast(ushort) readBits(10);
758                 }
759                 else if (op == QOI_OP_GRAY)
760                 {
761                     px.r = cast(ushort) readBits(10);
762                     px.g = px.r;
763                     px.b = px.r;
764                 }                    
765                 else if (op == QOI_OP_END)
766                 {
767                     goto finished;
768                 }
769                 else
770                 {
771                     assert(false);
772                 }
773             }
774 
775             decodedScanline[posx] = px;
776             decoded_pixels++;
777         }
778 
779         // convert just-decoded scanline into output type
780         ushort* line      = cast(ushort*)(pixels + desc.pitchBytes * posy);
781 
782         // PERF: can be 4 different loops here?
783         for (int posx = 0; posx < desc.width; ++posx)
784         {
785             qoi10_rgba_t px16b = decodedScanline[posx]; // 0..1023 components
786             px16b.r = cast(ushort)(px16b.r << 6 | (px16b.r >> 4));
787             px16b.g = cast(ushort)(px16b.g << 6 | (px16b.g >> 4));
788             px16b.b = cast(ushort)(px16b.b << 6 | (px16b.b >> 4));
789             px16b.a = cast(ushort)(px16b.a << 6 | (px16b.a >> 4));
790 
791             // Expand 10 bit to 16-bits
792             switch(channels)
793             {
794                 default:
795                 case 4:
796                     line[posx * channels + 0] = px16b.r;
797                     line[posx * channels + 1] = px16b.g;
798                     line[posx * channels + 2] = px16b.b;
799                     line[posx * channels + 3] = px16b.a;
800                     break;
801                 case 3:
802                     line[posx * channels + 0] = px16b.r;
803                     line[posx * channels + 1] = px16b.g;
804                     line[posx * channels + 2] = px16b.b;
805                     break;
806                 case 2:
807                     line[posx * channels + 0] = px16b.r;
808                     line[posx * channels + 1] = px16b.a;
809                     break;
810                 case 1:
811                     line[posx * channels + 0] = px16b.r;
812                     break;
813             }
814         }
815 
816         // swap decoded scanline buffers
817         {
818             qoi10_rgba_t* temp = decodedScanline;
819             decodedScanline = lastDecodedScanline;
820             lastDecodedScanline = temp;
821         }
822     }
823 
824     finished:
825     return pixels;
826 }
827 
828 static qoi10_rgba_t locoIntraPredictionSIMD(qoi10_rgba_t a, qoi10_rgba_t b, qoi10_rgba_t c)
829 {
830     // load RGBA16 pixels
831     __m128i A = _mm_loadu_si64(&a); 
832     __m128i B = _mm_loadu_si64(&b);
833     __m128i C = _mm_loadu_si64(&c);
834 
835     // Max predictor (A + B - C)
836     __m128i P = _mm_sub_epi16(_mm_add_epi16(A, B), C);
837     __m128i maxAB = _mm_max_epi16(A, B);
838     __m128i minAB = _mm_min_epi16(A, B);
839 
840     // 1111 where we should use max(A, B)
841     __m128i maxMask = _mm_cmple_epi16(C, minAB);
842 
843     // 1111 where we should use min(A, B)
844     __m128i minMask = _mm_cmpge_epi16(C, maxAB);
845 
846     P = (P & (~minMask)) | (minAB & minMask);
847     P = (P & (~maxMask)) | (maxAB & maxMask);
848 
849     // Get back to 10-bit, clip
850 
851     __m128i Z = _mm_setzero_si128();
852     __m128i m1023 = _mm_set1_epi16(1023);
853     P = _mm_max_epi16(P, Z);
854     P = _mm_min_epi16(P, m1023);
855 
856     qoi10_rgba_t r;
857     _mm_storel_epi64(cast(__m128i*)&r, P);
858 
859     return r;
860 }
861 
862 /+
863 static short locoIntraPrediction(short a, short b, short c)
864 {
865     short max_ab = a > b ? a : b;
866     short min_ab = a < b ? a : b;
867     if (c >= max_ab)
868         return cast(short)min_ab;
869     else if (c <= min_ab)
870         return cast(short)max_ab;
871     else
872     {
873         int d = a + b - c;
874         if (d < 0)
875             d = 0;
876         if (d > 1023)
877             d = 1023;
878         return cast(short)d;
879     }
880 }
881 +/
882 
883 private __m128i _mm_cmple_epi16(__m128i a, __m128i b) pure @safe
884 {
885     return _mm_or_si128(_mm_cmplt_epi16(a, b), _mm_cmpeq_epi16(a, b));
886 }
887 
888 private __m128i _mm_cmpge_epi16(__m128i a, __m128i b)
889 {
890     return _mm_or_si128(_mm_cmpgt_epi16(a, b), _mm_cmpeq_epi16(a, b));
891 }