gamut.codecs.qoi2avg source code

1 module gamut.codecs.qoi2avg;
2 
3 nothrow @nogc:
4 
5 import core.stdc.stdlib: realloc, malloc, free;
6 import core.stdc.string: memset, memcpy;
7 
8 import inteli.emmintrin;
9 
10 /// Note: this is a translation of "QOI2" mods by @wbd73
11 /// revealed in https://github.com/nigeltao/qoi2-bikeshed/issues/34
12 /// Called "QOIX" in Gamut, since it has a few extensions again, such as LZ4.
13 
14 /* 
15 
16 QOI2 - Lossless image format inspired by QOI “Quite OK Image” format
17 
18 Incompatible adaptation of QOI format - https://phoboslab.org
19 
20 -- LICENSE: The MIT License(MIT)
21 Copyright(c) 2021 Dominic Szablewski (original QOI format)
22 Copyright(c) 2021 wbd73 @ GitHub (compression improvements)
23 Copyright(c) 2022 Guillaume Piolat (D translation, add pitch support)
24 
25 Permission is hereby granted, free of charge, to any person obtaining a copy of
26 this software and associated documentation files(the "Software"), to deal in
27 the Software without restriction, including without limitation the rights to
28 use, copy, modify, merge, publish, distribute, sublicense, and / or sell copies
29 of the Software, and to permit persons to whom the Software is furnished to do
30 so, subject to the following conditions :
31 The above copyright notice and this permission notice shall be included in all
32 copies or substantial portions of the Software.
33 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
34 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
35 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE
36 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
37 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
38 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
39 SOFTWARE.
40 
41 
42 
43 -- Documentation
44 
This library provides the following functions:
- qoix_decode  -- decode the raw bytes of a QOIX image from memory
- qoix_encode  -- encode an rgb or rgba buffer into a QOIX image in memory
48 
49 See the function declaration below for the signature and more information.
50 
51 
52 -- Data Format
53 
54 A QOI2AVG file has a 25 byte header, compatible with Gamut QOIX.
Followed by any number of data "chunks" and a 4-byte end marker.
56 
57 struct qoix_header_t {
58     char     magic[4];         // magic bytes "qoix"
59     uint32_t width;            // image width in pixels (BE)
60     uint32_t height;           // image height in pixels (BE)
61     uint8_t  version_;         // Major version of QOIX format.
62     uint8_t  channels;         // 3 = RGB, 4 = RGBA (1 and 2 indicate QOI-plane codec, see qoiplane.d)
63     uint8_t  bitdepth;         // 8 = this qoi2avg codec is always 8-bit (10 indicates QOI-10 codec, see qoi10b.d)
64     uint8_t  colorspace;       // 0 = sRGB with linear alpha, 1 = all channels linear
65     uint8_t  compression;      // 0 = none, 1 = LZ4
66     float    pixelAspectRatio; // -1 = unknown, else Pixel Aspect Ratio
    float    resolutionY;      // -1 = unknown, else vertical physical resolution in DPI
68 };
69 */
70 
/// Byte offsets, from the start of the file, of some one-byte header fields.
enum
{
    QOIX_HEADER_OFFSET_CHANNELS    = 13, // after magic (4), width (4), height (4), version (1)
    QOIX_HEADER_OFFSET_BITDEPTH    = 14,
    QOIX_HEADER_OFFSET_COMPRESSION = 16, // offset 15 holds the colorspace byte
}
74 
75 
76 /*
77 
78 The decoder and encoder start with {r: 0, g: 0, b: 0, a: 255} as the previous
79 pixel value. Pixels are either encoded as
80  - a run of the previous pixel
81  - an index into an array of previously seen pixels
82  - a difference to the previous pixel value in r,g,b
83  - full r,g,b or a or gray values
84 
85 The color channels are assumed to not be premultiplied with the alpha channel 
86 ("un-premultiplied alpha").
87 
88 Each chunk starts with a tag, followed by a number of data bits. The bit length
89 of chunks is divisible by 8 - i.e. all chunks are byte aligned. All values
90 encoded in these data bits have the most significant bit on the left.
91 
92 The byte stream's end is marked with 4 0xff bytes.
93 
94 A running FIFO array[64] (zero-initialized) of pixel values is maintained by the
95 encoder and decoder. Every pixel en-/decoded by the QOI_OP_LUMA (and variants),
96 QOI_OP_GRAY and QOI_OP_RGB chunks is written to this array. The write position
97 starts at 0 and is incremented with each pixel written. The position wraps back
98 to 0 when it reaches 64. I.e:
99     index[index_pos % 64] = current_pixel;
100     index_pos = index_pos + 1;
101 
102 An encoder can search this array for the current pixel value and, if a match is
103 found, emit a QOI_OP_INDEX with the position within the array.
104 
105 
106 The possible chunks are:
107 
108 
109 .- QOI_OP_INDEX ----------.
110 |         Byte[0]         |
111 |  7  6  5  4  3  2  1  0 |
112 |-------+-----------------|
113 |  1  0 |     index       |
114 `-------------------------`
115 2-bit tag b10
116 6-bit index into the color index array: 0..63
117 
118 
119 .- QOI_OP_LUMA -----(232)-. 
120 |         Byte[0]         |
121 |  7  6  5  4  3  2  1  0 |
122 |----+--------+-----+-----|
123 |  0 | g diff | drg | dbg |
124 `-------------------------`
125 1-bit tag b0
126 3-bit green channel difference from the reference -4..3
127 2-bit   red channel difference minus green channel difference -1..2 or -2..1
128 2-bit  blue channel difference minus green channel difference -1..2 or -2..1
129 
For the first line of pixels the reference is the previous pixel.
For the next lines of pixels the reference r,g,b is the pixel above for the
first pixel of a line, and otherwise the LOCO-I prediction computed from the
previous pixel, the pixel above and the pixel above-left of the current pixel.
The green channel is used to indicate the general direction of change and is 
encoded in 3 bits. The red and blue channels (dr and db) base their diffs off
of the green channel difference and are encoded in 2 bits. I.e.:
    dr_dg = (cur_px.r - ref.r) - (cur_px.g - ref.g)
    db_dg = (cur_px.b - ref.b) - (cur_px.g - ref.g)
138 
139 The difference to the current channel values are using a wraparound operation, 
140 so "1 - 2" will result in 255, while "255 + 1" will result in 0.
141 
142 Values are stored as unsigned integers with a bias of 4 for the green channel 
143 and a bias of 1 or 2 for the red and blue channel depending on the direction
144 (sign bit) of the green channel.
145 
146 
147 .- QOI_OP_LUMA2 ------------------------------(454)-. 
148 |         Byte[0]         |         Byte[1]         |
149 |  7  6  5  4  3  2  1  0 |  7  6  5  4  3  2  1  0 |
150 |----------+--------------+-------------+-----------|
151 |  1  1  0 |  green diff  |   dr - dg   |  db - dg  |
152 `---------------------------------------------------`
153 3-bit tag b110
154 5-bit green channel difference from the reference -16..15
155 4-bit   red channel difference minus green channel difference -8..7
156 4-bit  blue channel difference minus green channel difference -8..7
157 
The green channel is used to indicate the general direction of change and is 
encoded in 5 bits. The red and blue channels (dr and db) base their diffs off
of the green channel difference and are encoded in 4 bits.
161 
162 Values are stored as unsigned integers with a bias of 16 for the green channel 
163 and a bias of 8 for the red and blue channel.
164 
165 
166 .- QOI_OP_LUMA3 ------------------------------------.-------------------(676)-. 
167 |         Byte[0]         |         Byte[1]         |         Byte[2]         |
168 |  7  6  5  4  3  2  1  0 |  7  6  5  4  3  2  1  0 |  7  6  5  4  3  2  1  0 |
169 |----------------+----------------------+-------------------+-----------------|
170 |  1  1  1  0  0 |     green diff       |      dr - dg      |     db - dg     |
171 `-----------------------------------------------------------------------------`
5-bit tag b11100
173 7-bit green channel difference from the reference -64..63
174 6-bit   red channel difference minus green channel difference -32..31
175 6-bit  blue channel difference minus green channel difference -32..31
176 
The green channel is used to indicate the general direction of change and is 
encoded in 7 bits. The red and blue channels (dr and db) base their diffs off
of the green channel difference and are encoded in 6 bits.
180 
181 Values are stored as unsigned integers with a bias of 64 for the green channel 
182 and a bias of 32 for the red and blue channel.
183 
184 
185 .- QOI_OP_RUN ------------.
186 |         Byte[0]         |
187 |  7  6  5  4  3  2  1  0 |
188 |----------------+--------|
189 |  1  1  1  1  0 |  run   |
190 `-------------------------`
191 5-bit tag b11110
192 3-bit run-length repeating the previous pixel: 1..8
193 
194 The run-length is stored with a bias of 1.
195 
196 
197 .- QOI_OP_RUN2 ---------------------.
198 |         Byte[0]         | Byte[1] |
199 |  7  6  5  4  3  2  1  0 | 7 .. 0  |
200 |-------------------+-----+---------|
201 |  1  1  1  1  1  0 |      run      |
202 `-----------------------------------`
203 6-bit tag b111110
204 10-bit run-length repeating the previous pixel: 1..1024
205 
206 The run-length is stored with a bias of 1.
207 
208 
209 .- QOI_OP_GRAY ---------------------.
210 |         Byte[0]         | Byte[1] |
211 |  7  6  5  4  3  2  1  0 | 7 .. 0  |
212 |-------------------------+---------|
213 |  1  1  1  1  1  1  0  0 |  gray   |
214 `-----------------------------------`
215 8-bit tag b11111100
216 8-bit gray channel value
217 
218 
219 .- QOI_OP_RGB ------------------------------------------.
220 |         Byte[0]         | Byte[1] | Byte[2] | Byte[3] |
221 |  7  6  5  4  3  2  1  0 | 7 .. 0  | 7 .. 0  | 7 .. 0  |
222 |-------------------------+---------+---------+---------|
223 |  1  1  1  1  1  1  0  1 |   red   |  green  |  blue   |
224 `-------------------------------------------------------`
225 8-bit tag b11111101
226 8-bit   red channel value
227 8-bit green channel value
228 8-bit  blue channel value
229 
230 
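.- QOI_OP_RGBA ---------------------------------------------------.
|         Byte[0]         | Byte[1] | Byte[2] | Byte[3] | Byte[4] |
|  7  6  5  4  3  2  1  0 | 7 .. 0  | 7 .. 0  | 7 .. 0  | 7 .. 0  |
|-------------------------+---------+---------+---------+---------|
|  1  1  1  1  1  1  1  0 |   red   |  green  |  blue   |  alpha  |
`-----------------------------------------------------------------`
8-bit tag b11111110
8-bit   red channel value
8-bit green channel value
8-bit  blue channel value
8-bit alpha channel value


.- QOI_OP_ADIFF ----------.
|         Byte[0]         |
|  7  6  5  4  3  2  1  0 |
|----------------+--------|
|  1  1  1  0  1 | a diff |
`-------------------------`
5-bit tag b11101
3-bit alpha channel difference from the previous pixel -4..3

The alpha difference is stored with a bias of 4. This chunk only updates the
alpha channel; it is followed by another chunk that encodes the color of the
same pixel.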
239 
240 
241 .- QOI_OP_END ------------.
242 |         Byte[0]         |
243 |  7  6  5  4  3  2  1  0 |
244 |-------------------------|
245 |  1  1  1  1  1  1  1  1 |
246 `-------------------------`
247 8-bit tag b11111111
248 
249 
The byte stream is padded at the end with four 0xff bytes. Since the longest 
legal chunk is 5 bytes (QOI_OP_RGBA: one tag byte followed by four data bytes),
with this padding it is possible to check for an overrun only once per decoded
chunk. These 0xff bytes also mark 
253 the end of the data stream, as an encoder should never produce four consecutive
254 0xff bytes within the stream.
255 
256 */
257 
/* A pointer to a qoi_desc struct is passed to both entry points of this module.
It describes the input image for qoix_encode, and is filled with the description
read from the file header by qoix_decode.

The colorspace field of qoi_desc takes one of two values:
    0 = sRGB, i.e. gamma scaled RGB channels and a linear alpha channel
    1 = all channels are linear
The constants QOI_SRGB and QOI_LINEAR name these values. The colorspace is purely
informative: it is saved to the file header, but does not affect en-/decoding
in any way. */
enum
{
    QOI_SRGB   = 0,
    QOI_LINEAR = 1,
}
272 
/// Image description: the input format for qoix_encode, or the header contents
/// reported by qoix_decode.
struct qoi_desc
{
    uint width, height;     // image size in pixels
    int pitchBytes;         // number of bytes between the starts of two consecutive lines

    // One byte each, stored in the file header in this order.
    ubyte channels, bitdepth, colorspace, compression;

    float pixelAspectRatio; // PAR, in Gamut format
    float resolutionY;      // vertical DPI, in Gamut format
}
285 
// Allocation hooks used by the encoder and the decoder.
alias QOI_MALLOC = malloc;
alias QOI_FREE = free;


// Chunk tags. The comment next to each tag shows which bits of the first chunk
// byte are fixed (0/1) and which carry data (x).
enum : int
{
    QOI_OP_LUMA  = 0x00, /* 0xxxxxxx */
    QOI_OP_INDEX = 0x80, /* 10xxxxxx */
    QOI_OP_LUMA2 = 0xc0, /* 110xxxxx */
    QOI_OP_LUMA3 = 0xe0, /* 11100xxx */
    QOI_OP_ADIFF = 0xe8, /* 11101xxx */
    QOI_OP_RUN   = 0xf0, /* 11110xxx */
    QOI_OP_RUN2  = 0xf8, /* 111110xx */
    QOI_OP_GRAY  = 0xfc, /* 11111100 */
    QOI_OP_RGB   = 0xfd, /* 11111101 */
    QOI_OP_RGBA  = 0xfe, /* 11111110 */
    QOI_OP_END   = 0xff, /* 11111111 */
}

enum uint QOIX_MAGIC = 0x716F6978; // "qoix"

// Size in bytes of the file header, field by field.
enum QOIX_HEADER_SIZE =
      4 /* magic */
    + 4 /* width */
    + 4 /* height */
    + 1 /* version */
    + 1 /* channels */
    + 1 /* bitdepth */
    + 1 /* colorspace */
    + 1 /* compression */
    + 4 /* PAR */
    + 4 /* DPI */;

enum ubyte QOIX_COMPRESSION_NONE = 0;
enum ubyte QOIX_COMPRESSION_LZ4  = 1;
306 
/* The encoder does not search the color index array linearly. Instead, a hash
of the packed color value selects a slot (0..1023) in a lookup table that
remembers an index position. */
uint QOI_COLOR_HASH(qoi_rgba_t C)
{
    // Multiplicative hash: the 32-bit product wraps around, and its top ten
    // bits (bits 22..31) are the slot number.
    const uint mixed = C.v * 2654435769u;
    return mixed >> 22;
}
313 
/* This implementation can safely handle files of up to 2GB. Anything larger is
rejected up front: assuming the worst case of 5 bytes per pixel and rounding
down to a nice clean value gives a limit of 400 million pixels, which ought to
be enough for anybody. */
enum uint QOIX_PIXELS_MAX = 400_000_000;
319 
/// One pixel with 8 bits per channel, stored in memory as r, g, b, a.
struct RGBA
{
    ubyte r;
    ubyte g;
    ubyte b;
    ubyte a;
}

// The codec relies on a pixel being exactly 4 bytes.
static assert(RGBA.sizeof == 4);
325 
/// A pixel that can be accessed per channel (rgba) or as a single 32-bit
/// value (v). Both members share the same storage.
struct qoi_rgba_t
{
    union
    {
        RGBA rgba; // individual channels
        uint v;    // the same four bytes as one integer, used for comparing and hashing
    }
}
334 
/// End-of-stream marker appended after the last chunk: four 0xff bytes.
static immutable ubyte[4] qoi_padding = [0xff, 0xff, 0xff, 0xff];
336 
/// Stores `v` as four big-endian bytes at `bytes[*p .. *p + 4]` and advances
/// `*p` by 4.
void qoi_write_32(ubyte* bytes, int *p, uint v) 
{
    ubyte* dst = bytes + *p;
    dst[0] = cast(ubyte)(v >> 24);
    dst[1] = cast(ubyte)(v >> 16);
    dst[2] = cast(ubyte)(v >> 8);
    dst[3] = cast(ubyte) v;
    *p += 4;
}
344 
/// Reads four bytes at `bytes[*p .. *p + 4]` as a big-endian 32-bit value and
/// advances `*p` by 4.
uint qoi_read_32(const(ubyte)* bytes, int *p) 
{
    const(ubyte)* src = bytes + *p;
    *p += 4;

    uint value = src[0];
    value = (value << 8) | src[1];
    value = (value << 8) | src[2];
    value = (value << 8) | src[3];
    return value;
}
353 
/// Stores the bit pattern of `f` as four big-endian bytes at `bytes[*p ..]`
/// and advances `*p` by 4.
void qoi_write_32f(ubyte* bytes, int *p, float f) 
{
    uint bits;
    memcpy(&bits, &f, uint.sizeof); // reinterpret the float's bits as an integer
    qoi_write_32(bytes, p, bits);
}
358 
/// Reads four big-endian bytes at `bytes[*p ..]`, returns them reinterpreted as
/// a float, and advances `*p` by 4.
float qoi_read_32f(const(ubyte)* bytes, int *p) 
{
    const uint bits = qoi_read_32(bytes, p);
    float value;
    memcpy(&value, &bits, float.sizeof); // reinterpret the integer's bits as a float
    return value;
}
364 
/* Encode raw RGB or RGBA pixels into a QOI2AVG ("QOIX") image in memory.

Params:
    data    = 8-bit input pixels, desc.channels samples per pixel. Line `y`
              starts at `data + desc.pitchBytes * y`.
    desc    = description of the input: width, height, pitchBytes, channels
              (3 or 4), bitdepth (must be 8), colorspace (0 or 1), compression
              (must be QOIX_COMPRESSION_NONE), plus pixelAspectRatio and
              resolutionY which are stored in the header as-is.
    out_len = receives the size in bytes of the encoded data on success.

The function either returns null on failure (invalid parameters, image too
large, or malloc failed) or a pointer to the encoded data on success.

The returned qoi data should be free()d after use. */
ubyte* qoix_encode(const(ubyte)* data, const(qoi_desc)* desc, int *out_len) 
{
    if (
        data == null || out_len == null || desc == null ||
        desc.width == 0 || desc.height == 0 ||
        desc.channels < 3 || desc.channels > 4 ||
        desc.colorspace > 1 ||
        desc.bitdepth != 8 ||
        desc.compression != QOIX_COMPRESSION_NONE ||
        desc.height >= QOIX_PIXELS_MAX / desc.width
    ) {
        return null;
    }

    // The pixel-count check above guarantees that both dimensions fit in an int.
    const int width    = cast(int) desc.width;
    const int height   = cast(int) desc.height;
    const int channels = desc.channels;

    // Before encoding a scanline, it is converted to RGBA8.
    // This is double buffered, to help with prediction.
    const ulong converted_scanline_size = cast(ulong) width * 4;

    // Worst case output size: every pixel takes (channels + 1) bytes.
    const ulong max_size = cast(ulong) width * height * (channels + 1)
                         + QOIX_HEADER_SIZE + qoi_padding.sizeof;

    // The 2 rgba8 scanlines needed for encoding live after the output bytes, in
    // the same allocation. Sizes are computed in 64-bit and rejected if they do
    // not fit the int offsets used below (very wide images would otherwise wrap).
    const ulong alloc_size = max_size + 2 * converted_scanline_size;
    if (alloc_size > int.max)
    {
        return null;
    }

    ubyte* bytes = cast(ubyte*) QOI_MALLOC(cast(size_t) alloc_size);
    if (!bytes) 
    {
        return null;
    }

    // double-buffered scanline: the previous line is kept around for prediction
    qoi_rgba_t* inputScanline     = cast(qoi_rgba_t*)(bytes + cast(size_t) max_size);
    qoi_rgba_t* lastInputScanline = cast(qoi_rgba_t*)(bytes + cast(size_t)(max_size + converted_scanline_size));

    int p = 0; // write position in `bytes`

    qoi_write_32(bytes, &p, QOIX_MAGIC);
    qoi_write_32(bytes, &p, desc.width);
    qoi_write_32(bytes, &p, desc.height);
    bytes[p++] = 1; // Put a version number :)
    bytes[p++] = desc.channels; // 3, or 4
    bytes[p++] = desc.bitdepth; // always 8 here, checked above
    bytes[p++] = desc.colorspace;
    bytes[p++] = QOIX_COMPRESSION_NONE;
    qoi_write_32f(bytes, &p, desc.pixelAspectRatio);
    qoi_write_32f(bytes, &p, desc.resolutionY);

    // Color index array, and the hash table that maps a color hash to a
    // position in that array.
    qoi_rgba_t[64] index;
    ubyte[1024] index_lookup;
    uint index_pos = 0;
    memset(index.ptr, 0, 64 * qoi_rgba_t.sizeof);
    index_lookup[] = 0;

    // The pixel before the first one is defined as opaque black.
    qoi_rgba_t px, px_ref;
    px.rgba.r = 0;
    px.rgba.g = 0;
    px.rgba.b = 0;
    px.rgba.a = 255;

    int run = 0; // number of pending repetitions of the previous pixel

    for (int posy = 0; posy < height; ++posy)
    {
        const(ubyte)* line = data + cast(ptrdiff_t) desc.pitchBytes * posy;

        // Convert one input scanline at once to rgba8
        if (channels == 4)
        {
            // Copy exactly one row of pixels. The staging buffer holds
            // width * 4 bytes, whereas pitchBytes may be larger (row padding)
            // or negative (bottom-up images).
            // PERF: replace by pointer swap
            memcpy(inputScanline, line, cast(size_t) converted_scanline_size);
        }
        else
        {
            for (int posx = 0; posx < width; ++posx)
            {
                inputScanline[posx].rgba = RGBA(line[posx * 3 + 0], line[posx * 3 + 1], line[posx * 3 + 2], 255);
            }
        }

        for (int posx = 0; posx < width; ++posx)
        {
            px_ref.v = px.v;
            px = inputScanline[posx];

            if (px.v == px_ref.v) 
            {
                run++;

                // A run is flushed when it reaches the longest encodable
                // length, or at the very last pixel of the image.
                if (run == 1024 || (posy == height - 1 && posx == width - 1)) 
                {
                    run--;
                    bytes[p++] = QOI_OP_RUN2 | ((run >> 8) & 3);
                    bytes[p++] = run & 0xff;
                    run = 0;
                }
                continue;
            }

            int hash = QOI_COLOR_HASH(px);

            // The pixel differs from the previous one: flush any pending run.
            if (run > 0) 
            {
                run--; // run lengths are stored with a bias of 1
                if (run < 8) 
                {
                    bytes[p++] = cast(ubyte)(QOI_OP_RUN | run);
                }
                else 
                {
                    bytes[p++] = QOI_OP_RUN2 | ((run >> 8) & 3);
                    bytes[p++] = run & 0xff;
                }
                run = 0;
            }

            if (index[index_lookup[hash]].v == px.v) 
            {
                bytes[p++] = QOI_OP_INDEX | index_lookup[hash];
                continue;
            }

            index_lookup[hash] = cast(ubyte) index_pos;
            index[index_pos] = px;
            index_pos = (index_pos + 1) & 63;

            // Alpha is handled first: a small change is a one-byte prefix
            // followed by a regular color chunk, a large change is a full
            // RGBA chunk.
            bool colorPending = true;
            byte va = cast(byte)(px.rgba.a - px_ref.rgba.a);
            if (va) 
            {
                if (va >= -4 && va <= 3)
                {
                    bytes[p++] = cast(ubyte)(QOI_OP_ADIFF | (va + 4));
                } 
                else 
                { 
                    bytes[p++] = QOI_OP_RGBA; // make a grey + alpha opcode?
                    bytes[p++] = px.rgba.r;
                    bytes[p++] = px.rgba.g;
                    bytes[p++] = px.rgba.b;
                    bytes[p++] = px.rgba.a;
                    colorPending = false;
                }
            }

            if (!colorPending)
            {
                continue;
            }

            // Note: computing this predictor for the whole scanline in advance, even with 2x pixels at once, was slower.
            // because in normal times, you don't compute this predictor all the time.
            if (posy > 0)
            {
                RGBA pred;
                if (posx == 0)
                {
                    // first pixel in the row, take above pixel
                    pred = lastInputScanline[posx].rgba;
                }
                else 
                {
                    pred = locoIntraPredictionSIMD(px_ref.rgba, lastInputScanline[posx].rgba, lastInputScanline[posx-1].rgba);
                }
                px_ref.rgba.r = pred.r;
                px_ref.rgba.g = pred.g;
                px_ref.rgba.b = pred.b;
            }

            // Differences to the reference, with 8-bit wraparound.
            byte vg   = cast(byte)(px.rgba.g - px_ref.rgba.g);
            byte vg_r = cast(byte)(px.rgba.r - px_ref.rgba.r - vg);
            byte vg_b = cast(byte)(px.rgba.b - px_ref.rgba.b - vg);

            if (
                vg   >= -4 && vg   <  0 && 
                vg_r >= -1 && vg_r <= 2 &&
                vg_b >= -1 && vg_b <= 2
            ) {
                bytes[p++] = cast(ubyte)( QOI_OP_LUMA | (vg + 4) << 4 | (vg_r + 1) << 2 | (vg_b + 1) );
            }
            else if (
                vg   >=  0 && vg   <= 3 && 
                vg_r >= -2 && vg_r <= 1 &&
                vg_b >= -2 && vg_b <= 1
            ) {
                bytes[p++] = cast(ubyte)( QOI_OP_LUMA | (vg + 4) << 4 | (vg_r + 2) << 2 | (vg_b + 2) );
            }
            else if (
                px.rgba.g == px.rgba.r &&
                px.rgba.g == px.rgba.b
            ) {
                bytes[p++] = QOI_OP_GRAY;
                bytes[p++] = px.rgba.g;
            }
            else if (
                vg_r >=  -8 && vg_r <=  7 && 
                vg   >= -16 && vg   <= 15 && 
                vg_b >=  -8 && vg_b <=  7
            ) {
                bytes[p++] = cast(ubyte)( QOI_OP_LUMA2    | (vg   + 16) );
                bytes[p++] = cast(ubyte)( (vg_r + 8) << 4 | (vg_b +  8) );
            }
            else if (
                vg_r >= -32 && vg_r <= 31 && 
                vg   >= -64 && vg   <= 63 && 
                vg_b >= -32 && vg_b <= 31
            ) {
                int dv = ((vg + 64) << 12) | ((vg_r + 32) << 6) | (vg_b + 32);
                bytes[p++] = QOI_OP_LUMA3 | ((dv >> 16) & 31);
                bytes[p++] = (dv >> 8) & 255;
                bytes[p++] = dv & 255;
            } 
            else 
            {
                bytes[p++] = QOI_OP_RGB;
                bytes[p++] = px.rgba.r;
                bytes[p++] = px.rgba.g;
                bytes[p++] = px.rgba.b;
            }
        }

        // swap input scanline buffers
        {
            qoi_rgba_t* temp = inputScanline;
            inputScanline = lastInputScanline;
            lastInputScanline = temp;
        }
    }

    // End-of-stream marker.
    foreach (ubyte padByte; qoi_padding)
    {
        bytes[p++] = padByte;
    }

    *out_len = p;
    return bytes;
}
612 
/* Decode a QOI2AVG ("QOIX") image from memory.

Params:
    data     = the encoded file contents.
    size     = number of bytes available at `data`.
    desc     = receives the description read from the file header. Its
               pitchBytes is set to width * channels of the returned buffer.
               Note that desc is written before the header is validated, so it
               may be modified even when the function fails.
    channels = number of channels wanted in the output: 3, 4, or 0 to use the
               channel count stored in the file.

The function either returns null on failure (invalid parameters, invalid or
unsupported header, image too large, or malloc failed) or a pointer to the
decoded pixels.

If the chunk data ends early (end marker met or input exhausted), the
remaining pixels repeat the last decoded pixel.

The returned pixel data should be free()d after use. */
ubyte* qoix_decode(const(void)* data, int size, qoi_desc *desc, int channels) {
    if (
        data == null || desc == null ||
        (channels != 0 && channels !=  3 && channels !=  4) ||
        size < QOIX_HEADER_SIZE + cast(int)(qoi_padding.sizeof)
    ) {
        return null;
    }

    const(ubyte)* bytes = cast(const(ubyte)*)data;
    int p = 0; // read position in `bytes`

    const uint header_magic = qoi_read_32(bytes, &p);
    desc.width = qoi_read_32(bytes, &p);
    desc.height = qoi_read_32(bytes, &p);
    const int qoix_version = bytes[p++];
    desc.channels = bytes[p++];
    desc.bitdepth = bytes[p++];
    desc.colorspace = bytes[p++];
    desc.compression = bytes[p++];
    desc.pixelAspectRatio = qoi_read_32f(bytes, &p);
    desc.resolutionY = qoi_read_32f(bytes, &p);

    if (
        desc.width == 0 || desc.height == 0 || 
        desc.channels < 3 || desc.channels > 4 ||
        desc.colorspace > 1 ||
        desc.bitdepth != 8 ||
        qoix_version > 1 ||
        desc.compression != QOIX_COMPRESSION_NONE ||
        header_magic != QOIX_MAGIC ||
        desc.height >= QOIX_PIXELS_MAX / desc.width
    ) {
        return null;
    }

    if (channels == 0) {
        channels = desc.channels;
    }

    // The pixel-count check above guarantees that both dimensions fit in an int.
    const int width  = cast(int) desc.width;
    const int height = cast(int) desc.height;

    // The output image is followed, in the same allocation, by two RGBA8
    // scanlines used while decoding. The header is untrusted, so sizes are
    // computed in 64-bit and rejected if they do not fit the int offsets used
    // below (very wide images would otherwise wrap to a too-small allocation).
    const ulong pixel_data_size       = cast(ulong) width * height * channels;
    const ulong decoded_scanline_size = cast(ulong) width * 4;
    const ulong alloc_size            = pixel_data_size + 2 * decoded_scanline_size;
    if (alloc_size > int.max) {
        return null;
    }

    desc.pitchBytes = width * channels;

    ubyte* pixels = cast(ubyte *) QOI_MALLOC(cast(size_t) alloc_size);
    if (!pixels) {
        return null;
    }

    // double-buffered scanline, this is intended to speed up decoding
    qoi_rgba_t* decodedScanline     = cast(qoi_rgba_t*)(pixels + cast(size_t) pixel_data_size);
    qoi_rgba_t* lastDecodedScanline = cast(qoi_rgba_t*)(pixels + cast(size_t)(pixel_data_size + decoded_scanline_size));

    assert(channels != 1 && channels != 2);

    qoi_rgba_t[64] index;
    int index_pos = 0;
    memset(index.ptr, 0, 64 * qoi_rgba_t.sizeof);

    // The pixel before the first one is defined as opaque black.
    qoi_rgba_t px, px_ref;
    px.rgba.r = 0;
    px.rgba.g = 0;
    px.rgba.b = 0;
    px.rgba.a = 255;

    // Chunks may only start before the 4 padding bytes. The longest chunk has 4
    // bytes after its tag, so a chunk starting before chunks_len never reads
    // past `size`.
    const int chunks_len = size - cast(int)(qoi_padding.sizeof);

    int run = 0; // remaining repetitions of the current pixel

    for (int posy = 0; posy < height; ++posy)
    {
        for (int posx = 0; posx < width; ++posx)
        {
            if (run > 0) 
            {
                run--;
            }
            else if (p < chunks_len) 
            {
                px_ref.v = px.v;

                if (posy > 0)
                {
                    if (posx == 0)
                    {
                        // first pixel in the row, take above pixel
                        px_ref.rgba.r = lastDecodedScanline[posx].rgba.r;
                        px_ref.rgba.g = lastDecodedScanline[posx].rgba.g;
                        px_ref.rgba.b = lastDecodedScanline[posx].rgba.b;
                    }
                    else 
                    {
                        // Called I-LOCO intra prediction
                        RGBA pred = locoIntraPredictionSIMD(px.rgba, lastDecodedScanline[posx].rgba, lastDecodedScanline[posx-1].rgba);
                        px_ref.rgba.r = pred.r;
                        px_ref.rgba.g = pred.g;
                        px_ref.rgba.b = pred.b;
                    }
                }

                int b1 = bytes[p++];

                // QOI_OP_ADIFF only adjusts alpha; the chunk that follows it
                // belongs to the same pixel. The next tag must also lie before
                // the padding, otherwise a stream made of ADIFF bytes could
                // make the decoder read past the end of the input.
                while (b1 >= QOI_OP_ADIFF && b1 < QOI_OP_RUN)
                {
                    px.rgba.a = cast(ubyte)(px.rgba.a + (b1 & 7) - 4);
                    if (p >= chunks_len)
                    {
                        b1 = QOI_OP_END; // truncated stream
                        break;
                    }
                    b1 = bytes[p++];
                }

                if (b1 < 0x80) {        /* QOI_OP_LUMA */
                    int vg = ((b1 >> 4) & 7) - 4;
                    px.rgba.g = cast(ubyte)(px_ref.rgba.g + vg);
                    // The bias of the red/blue fields depends on the sign of vg.
                    if (vg < 0) {
                        px.rgba.r = cast(ubyte)( px_ref.rgba.r + vg - 1 + ((b1 >> 2) & 3) );
                        px.rgba.b = cast(ubyte)( px_ref.rgba.b + vg - 1 +  (b1 &  3) );
                    }
                    else {
                        px.rgba.r = cast(ubyte)( px_ref.rgba.r + vg - 2 + ((b1 >> 2) & 3) );
                        px.rgba.b = cast(ubyte)( px_ref.rgba.b + vg - 2 +  (b1 &  3) );
                    }
                    index[index_pos++ & 63] = px;
                }
                else if (b1 < 0xc0) {       /* QOI_OP_INDEX */
                    px = index[b1 & 63];
                }
                else if (b1 < 0xe0) {       /* QOI_OP_LUMA2 */
                    int b2 = bytes[p++];
                    int vg = (b1 & 0x1f) - 16;
                    px.rgba.r = cast(ubyte)( px_ref.rgba.r + vg - 8 + ((b2 >> 4) & 0x0f) );
                    px.rgba.g = cast(ubyte)( px_ref.rgba.g + vg );
                    px.rgba.b = cast(ubyte)( px_ref.rgba.b + vg - 8 +  (b2       & 0x0f) );
                    index[index_pos++ & 63] = px;
                }
                else if (b1 < 0xe8) {       /* QOI_OP_LUMA3 */
                    int dv = (b1 << 8) | bytes[p++];
                    dv = (dv << 8) | bytes[p++];
                    int vg = ((dv >> 12) & 0x7f) - 64;
                    px.rgba.r = cast(ubyte)( px_ref.rgba.r + vg + ((dv >> 6) & 0x3f) - 32 );
                    px.rgba.g = cast(ubyte)( px_ref.rgba.g + vg );
                    px.rgba.b = cast(ubyte)( px_ref.rgba.b + vg + (dv & 0x3f) - 32 );
                    index[index_pos++ & 63] = px;
                }
                else if (b1 < 0xf8) {       /* QOI_OP_RUN (QOI_OP_ADIFF was consumed above) */
                    run = b1 & 7;
                }
                else if (b1 < 0xfc) {       /* QOI_OP_RUN2 */
                    run = ((b1 & 3) << 8) | bytes[p++];
                }
                else if (b1 == QOI_OP_GRAY) {
                    ubyte vg = bytes[p++];
                    px.rgba.r = vg;
                    px.rgba.g = vg;
                    px.rgba.b = vg;
                    index[index_pos++ & 63] = px;
                }
                else if (b1 == QOI_OP_RGB) {
                    px.rgba.r = bytes[p++];
                    px.rgba.g = bytes[p++];
                    px.rgba.b = bytes[p++];
                    index[index_pos++ & 63] = px;
                }
                else if (b1 == QOI_OP_RGBA) {
                    px.rgba.r = bytes[p++];
                    px.rgba.g = bytes[p++];
                    px.rgba.b = bytes[p++];
                    px.rgba.a = bytes[p++];
                    index[index_pos++ & 63] = px;
                }
                else {              /* QOI_OP_END */
                    // Stop reading chunks. This pixel and all remaining ones
                    // repeat the last decoded pixel, so no stale or
                    // uninitialized scanline data reaches the output.
                    p = chunks_len;
                }
            }

            decodedScanline[posx] = px;
        }

        // convert just-decoded scanline into output type
        ubyte* line = pixels + cast(size_t) desc.pitchBytes * posy;

        switch(channels)
        {
            case 4:
                // No particular conversion to do
                memcpy(line, &decodedScanline[0], cast(size_t) decoded_scanline_size);
                break;

            case 3:
                // Drop the alpha channel.
                for (int posx = 0; posx < width; ++posx)
                {
                    qoi_rgba_t decodedPx = decodedScanline[posx];
                    line[posx * 3 + 0] = decodedPx.rgba.r;
                    line[posx * 3 + 1] = decodedPx.rgba.g;
                    line[posx * 3 + 2] = decodedPx.rgba.b;
                }
                break;
            default:
                assert(false);
        }

        // swap decoded scanline buffers
        {
            qoi_rgba_t* temp = decodedScanline;
            decodedScanline = lastDecodedScanline;
            lastDecodedScanline = temp;
        }
    }

    return pixels;
}
835 
836 private:
837 
838 /* Perform LOCO-I prediction independently over the 4 channels.
839 
840 
841     int max_ab = a > b ? a : b;
842     int min_ab = a < b ? a : b;
843     if (c >= max_ab)
844         return cast(ubyte)min_ab;
845     else if (c <= min_ab)
846         return cast(ubyte)max_ab;
847     else
848     {
849         int d = a + b - c;
850         if (d < 0)
851             d = 0;
        if (d > 255)
            d = 255;
854         return cast(ubyte)d;
855     }
856 */
857 
/// LOCO-I prediction computed for the four channels of RGBA8 pixels at once.
/// Per channel: min(a, b) if c >= max(a, b), max(a, b) if c <= min(a, b),
/// else a + b - c. At the call sites in this file, `a` is the previous pixel,
/// `b` the pixel above and `c` the pixel above-left.
static RGBA locoIntraPredictionSIMD(RGBA a, RGBA b, RGBA c)
{
    __m128i zero = _mm_setzero_si128();

    // Load each RGBA8 pixel and widen its channels to 16-bit lanes.
    __m128i wideA = _mm_unpacklo_epi8(_mm_loadu_si32(&a), zero);
    __m128i wideB = _mm_unpacklo_epi8(_mm_loadu_si32(&b), zero);
    __m128i wideC = _mm_unpacklo_epi8(_mm_loadu_si32(&c), zero);

    __m128i hi = _mm_max_epi16(wideA, wideB);
    __m128i lo = _mm_min_epi16(wideA, wideB);

    // Lanes are all ones where min(A, B), respectively max(A, B), must be used.
    __m128i useMin = _mm_cmpge_epi16(wideC, hi);
    __m128i useMax = _mm_cmple_epi16(wideC, lo);

    // Start from the gradient predictor A + B - C, then override the selected
    // lanes: first with the minimum, then with the maximum.
    __m128i pred = _mm_sub_epi16(_mm_add_epi16(wideA, wideB), wideC);
    pred = _mm_or_si128(_mm_andnot_si128(useMin, pred), _mm_and_si128(useMin, lo));
    pred = _mm_or_si128(_mm_andnot_si128(useMax, pred), _mm_and_si128(useMax, hi));

    // Narrow back to 8 bits per channel, with unsigned saturation.
    pred = _mm_packus_epi16(pred, zero);

    RGBA result;
    _mm_storeu_si32(&result, pred);
    return result;
}
893 
/// Per 16-bit signed lane: all ones where a <= b, else zero.
private __m128i _mm_cmple_epi16(__m128i a, __m128i b) pure @safe
{
    __m128i below = _mm_cmplt_epi16(a, b);
    __m128i same  = _mm_cmpeq_epi16(a, b);
    return _mm_or_si128(below, same);
}
898 
/// Per 16-bit signed lane: all ones where a >= b, else zero.
/// Carries the same `pure @safe` attributes as its counterpart _mm_cmple_epi16.
private __m128i _mm_cmpge_epi16(__m128i a, __m128i b) pure @safe
{
    return _mm_or_si128(_mm_cmpgt_epi16(a, b), _mm_cmpeq_epi16(a, b));
}