1 module gamut.codecs.qoi2avg;
2
3 nothrow @nogc:
4
5 import core.stdc.stdlib: realloc, malloc, free;
6 import core.stdc.string: memset, memcpy;
7
8 import inteli.emmintrin;
9
10 /// Note: this is a translation of "QOI2" mods by @wbd73
11 /// revealed in https://github.com/nigeltao/qoi2-bikeshed/issues/34
12 /// Called "QOIX" in Gamut, since it has a few extensions again, such as LZ4.
13
14 /*
15
16 QOI2 - Lossless image format inspired by QOI “Quite OK Image” format
17
18 Incompatible adaptation of QOI format - https://phoboslab.org
19
20 -- LICENSE: The MIT License(MIT)
21 Copyright(c) 2021 Dominic Szablewski (original QOI format)
22 Copyright(c) 2021 wbd73 @ GitHub (compression improvements)
23 Copyright(c) 2022 Guillaume Piolat (D translation, add pitch support)
24
25 Permission is hereby granted, free of charge, to any person obtaining a copy of
26 this software and associated documentation files(the "Software"), to deal in
27 the Software without restriction, including without limitation the rights to
28 use, copy, modify, merge, publish, distribute, sublicense, and / or sell copies
29 of the Software, and to permit persons to whom the Software is furnished to do
30 so, subject to the following conditions :
31 The above copyright notice and this permission notice shall be included in all
32 copies or substantial portions of the Software.
33 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
34 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
35 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE
36 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
37 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
38 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
39 SOFTWARE.
40
41
42
43 -- Documentation
44
45 This library provides the following functions;
- qoix_decode -- decode the raw bytes of a QOIX image from memory
- qoix_encode -- encode an rgb or rgba buffer into a QOIX image in memory
48
49 See the function declaration below for the signature and more information.
50
51
52 -- Data Format
53
54 A QOI2AVG file has a 25 byte header, compatible with Gamut QOIX.
Followed by any number of data "chunks" and a 4-byte end marker.
56
57 struct qoix_header_t {
58 char magic[4]; // magic bytes "qoix"
59 uint32_t width; // image width in pixels (BE)
60 uint32_t height; // image height in pixels (BE)
61 uint8_t version_; // Major version of QOIX format.
62 uint8_t channels; // 3 = RGB, 4 = RGBA (1 and 2 indicate QOI-plane codec, see qoiplane.d)
63 uint8_t bitdepth; // 8 = this qoi2avg codec is always 8-bit (10 indicates QOI-10 codec, see qoi10b.d)
64 uint8_t colorspace; // 0 = sRGB with linear alpha, 1 = all channels linear
65 uint8_t compression; // 0 = none, 1 = LZ4
66 float pixelAspectRatio; // -1 = unknown, else Pixel Aspect Ratio
    float    resolutionY;      // -1 = unknown, else vertical physical resolution in DPI
68 };
69 */
70
/// Byte offsets of selected one-byte fields inside the QOIX header.
/// The header starts with magic (4 bytes), width (4), height (4) and version (1).
enum : int
{
    QOIX_HEADER_OFFSET_CHANNELS    = 13,
    QOIX_HEADER_OFFSET_BITDEPTH    = 14,
    QOIX_HEADER_OFFSET_COMPRESSION = 16, // offset 15 is the colorspace byte
}
74
75
76 /*
77
78 The decoder and encoder start with {r: 0, g: 0, b: 0, a: 255} as the previous
79 pixel value. Pixels are either encoded as
80 - a run of the previous pixel
81 - an index into an array of previously seen pixels
82 - a difference to the previous pixel value in r,g,b
83 - full r,g,b or a or gray values
84
85 The color channels are assumed to not be premultiplied with the alpha channel
86 ("un-premultiplied alpha").
87
88 Each chunk starts with a tag, followed by a number of data bits. The bit length
89 of chunks is divisible by 8 - i.e. all chunks are byte aligned. All values
90 encoded in these data bits have the most significant bit on the left.
91
92 The byte stream's end is marked with 4 0xff bytes.
93
94 A running FIFO array[64] (zero-initialized) of pixel values is maintained by the
encoder and decoder. Every pixel en-/decoded by the QOI_OP_LUMA (and variants),
QOI_OP_GRAY, QOI_OP_RGB and QOI_OP_RGBA chunks is written to this array. The write position
97 starts at 0 and is incremented with each pixel written. The position wraps back
98 to 0 when it reaches 64. I.e:
99 index[index_pos % 64] = current_pixel;
100 index_pos = index_pos + 1;
101
102 An encoder can search this array for the current pixel value and, if a match is
103 found, emit a QOI_OP_INDEX with the position within the array.
104
105
106 The possible chunks are:
107
108
109 .- QOI_OP_INDEX ----------.
110 | Byte[0] |
111 | 7 6 5 4 3 2 1 0 |
112 |-------+-----------------|
113 | 1 0 | index |
114 `-------------------------`
115 2-bit tag b10
116 6-bit index into the color index array: 0..63
117
118
119 .- QOI_OP_LUMA -----(232)-.
120 | Byte[0] |
121 | 7 6 5 4 3 2 1 0 |
122 |----+--------+-----+-----|
123 | 0 | g diff | drg | dbg |
124 `-------------------------`
125 1-bit tag b0
126 3-bit green channel difference from the reference -4..3
127 2-bit red channel difference minus green channel difference -1..2 or -2..1
128 2-bit blue channel difference minus green channel difference -1..2 or -2..1
129
For the first line of pixels the reference is the previous pixel.
For the next lines of pixels the reference is the pixel above for the first
pixel of the line, and otherwise a LOCO-I style prediction computed from the
previous pixel, the pixel above and the pixel above-left of the current pixel.
The green channel is used to indicate the general direction of change and is
encoded in 3 bits. The red and blue channels (dr and db) base their diffs off
of the green channel difference and are encoded in 2 bits. I.e.:
    dr_dg = (cur_px.r - ref.r) - (cur_px.g - ref.g)
    db_dg = (cur_px.b - ref.b) - (cur_px.g - ref.g)
138
139 The difference to the current channel values are using a wraparound operation,
140 so "1 - 2" will result in 255, while "255 + 1" will result in 0.
141
142 Values are stored as unsigned integers with a bias of 4 for the green channel
143 and a bias of 1 or 2 for the red and blue channel depending on the direction
144 (sign bit) of the green channel.
145
146
147 .- QOI_OP_LUMA2 ------------------------------(454)-.
148 | Byte[0] | Byte[1] |
149 | 7 6 5 4 3 2 1 0 | 7 6 5 4 3 2 1 0 |
150 |----------+--------------+-------------+-----------|
151 | 1 1 0 | green diff | dr - dg | db - dg |
152 `---------------------------------------------------`
153 3-bit tag b110
154 5-bit green channel difference from the reference -16..15
155 4-bit red channel difference minus green channel difference -8..7
156 4-bit blue channel difference minus green channel difference -8..7
157
158 The green channel is used to indicate the general direction of change and is
encoded in 5 bits. The red and blue channels (dr and db) base their diffs off
160 of the green channel difference and are encoded in 4 bits.
161
162 Values are stored as unsigned integers with a bias of 16 for the green channel
163 and a bias of 8 for the red and blue channel.
164
165
166 .- QOI_OP_LUMA3 ------------------------------------.-------------------(676)-.
167 | Byte[0] | Byte[1] | Byte[2] |
168 | 7 6 5 4 3 2 1 0 | 7 6 5 4 3 2 1 0 | 7 6 5 4 3 2 1 0 |
169 |----------------+----------------------+-------------------+-----------------|
170 | 1 1 1 0 0 | green diff | dr - dg | db - dg |
171 `-----------------------------------------------------------------------------`
5-bit tag b11100
173 7-bit green channel difference from the reference -64..63
174 6-bit red channel difference minus green channel difference -32..31
175 6-bit blue channel difference minus green channel difference -32..31
176
177 The green channel is used to indicate the general direction of change and is
encoded in 7 bits. The red and blue channels (dr and db) base their diffs off
179 of the green channel difference and are encoded in 6 bits.
180
181 Values are stored as unsigned integers with a bias of 64 for the green channel
182 and a bias of 32 for the red and blue channel.
183
184
185 .- QOI_OP_RUN ------------.
186 | Byte[0] |
187 | 7 6 5 4 3 2 1 0 |
188 |----------------+--------|
189 | 1 1 1 1 0 | run |
190 `-------------------------`
191 5-bit tag b11110
192 3-bit run-length repeating the previous pixel: 1..8
193
194 The run-length is stored with a bias of 1.
195
196
197 .- QOI_OP_RUN2 ---------------------.
198 | Byte[0] | Byte[1] |
199 | 7 6 5 4 3 2 1 0 | 7 .. 0 |
200 |-------------------+-----+---------|
201 | 1 1 1 1 1 0 | run |
202 `-----------------------------------`
203 6-bit tag b111110
204 10-bit run-length repeating the previous pixel: 1..1024
205
206 The run-length is stored with a bias of 1.
207
208
209 .- QOI_OP_GRAY ---------------------.
210 | Byte[0] | Byte[1] |
211 | 7 6 5 4 3 2 1 0 | 7 .. 0 |
212 |-------------------------+---------|
213 | 1 1 1 1 1 1 0 0 | gray |
214 `-----------------------------------`
215 8-bit tag b11111100
216 8-bit gray channel value
217
218
219 .- QOI_OP_RGB ------------------------------------------.
220 | Byte[0] | Byte[1] | Byte[2] | Byte[3] |
221 | 7 6 5 4 3 2 1 0 | 7 .. 0 | 7 .. 0 | 7 .. 0 |
222 |-------------------------+---------+---------+---------|
223 | 1 1 1 1 1 1 0 1 | red | green | blue |
224 `-------------------------------------------------------`
225 8-bit tag b11111101
226 8-bit red channel value
227 8-bit green channel value
228 8-bit blue channel value
229
230
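.- QOI_OP_RGBA ---------------------------------------------------.
|         Byte[0]         | Byte[1] | Byte[2] | Byte[3] | Byte[4] |
|  7  6  5  4  3  2  1  0 | 7 .. 0  | 7 .. 0  | 7 .. 0  | 7 .. 0  |
|-------------------------+---------+---------+---------+---------|
|  1  1  1  1  1  1  1  0 |   red   |  green  |  blue   |  alpha  |
`-----------------------------------------------------------------`
8-bit tag b11111110
8-bit red channel value
8-bit green channel value
8-bit blue channel value
8-bit alpha channel value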
239
240
241 .- QOI_OP_END ------------.
242 | Byte[0] |
243 | 7 6 5 4 3 2 1 0 |
244 |-------------------------|
245 | 1 1 1 1 1 1 1 1 |
246 `-------------------------`
247 8-bit tag b11111111
248
249
The byte stream is padded at the end with four 0xff bytes. Since a chunk is a
tag byte followed by at most 4 data bytes (QOI_OP_RGBA), with this padding it is
possible to check for an overrun only once per decode loop iteration. These 0xff
bytes also mark the end of the data stream, as an encoder should never produce
four consecutive 0xff bytes within the stream.
255
256 */
257
258 /* A pointer to a qoi_desc struct has to be supplied to all of qoi's functions.
259 It describes either the input format (for qoi_write and qoi_encode), or is
260 filled with the description read from the file header (for qoi_read and
261 qoi_decode).
262
263 The colorspace in this qoi_desc is an enum where
264 0 = sRGB, i.e. gamma scaled RGB channels and a linear alpha channel
265 1 = all channels are linear
266 You may use the constants QOI_SRGB or QOI_LINEAR. The colorspace is purely
267 informative. It will be saved to the file header, but does not affect
268 en-/decoding in any way. */
269
/// Values for the `colorspace` field of `qoi_desc`.
enum : int
{
    QOI_SRGB   = 0, // sRGB color channels, linear alpha
    QOI_LINEAR = 1, // all channels linear
}
272
/// Image description shared by the encoder (input) and the decoder (output).
struct qoi_desc
{
    /// Image dimensions in pixels.
    uint width, height;

    /// Number of bytes between the start of two consecutive lines.
    int pitchBytes;

    /// Per-image format bytes, stored in the file header.
    ubyte channels, bitdepth, colorspace, compression;

    /// PAR, in Gamut format.
    float pixelAspectRatio;

    /// Vertical DPI, in Gamut format.
    float resolutionY;
}
285
/// Allocator used for every buffer returned by the encoder and the decoder.
alias QOI_MALLOC = malloc;

/// Matching deallocator for buffers obtained through QOI_MALLOC.
alias QOI_FREE = free;
288
289
/// Chunk tags. The trailing comment shows the bit pattern of the first byte
/// ('x' bits carry payload).
enum : int
{
    QOI_OP_LUMA  = 0x00, /* 0xxxxxxx */
    QOI_OP_INDEX = 0x80, /* 10xxxxxx */
    QOI_OP_LUMA2 = 0xc0, /* 110xxxxx */
    QOI_OP_LUMA3 = 0xe0, /* 11100xxx */
    QOI_OP_ADIFF = 0xe8, /* 11101xxx */
    QOI_OP_RUN   = 0xf0, /* 11110xxx */
    QOI_OP_RUN2  = 0xf8, /* 111110xx */
    QOI_OP_GRAY  = 0xfc, /* 11111100 */
    QOI_OP_RGB   = 0xfd, /* 11111101 */
    QOI_OP_RGBA  = 0xfe, /* 11111110 */
    QOI_OP_END   = 0xff, /* 11111111 */
}
301
/// File signature: the ASCII characters "qoix" read as a big-endian 32-bit value.
enum uint QOIX_MAGIC = ('q' << 24) | ('o' << 16) | ('i' << 8) | 'x';

/// Header size in bytes: magic, width, height (4 each), then version, channels,
/// bitdepth, colorspace, compression (1 each), then PAR and DPI (4 each).
enum QOIX_HEADER_SIZE = 4 + 4 + 4 + 1 + 1 + 1 + 1 + 1 + 4 + 4;

/// Values of the header `compression` byte.
enum ubyte QOIX_COMPRESSION_NONE = 0;
enum ubyte QOIX_COMPRESSION_LZ4  = 1;
306
/* Hash of a pixel value, in 0..1023. The encoder uses it as a slot number in
   its lookup table, so that it does not have to search the color index array
   linearly. */
uint QOI_COLOR_HASH(qoi_rgba_t C)
{
    // Multiplicative hash: the product is formed in 64 bits, and bits 22..31
    // of it are kept.
    enum ulong multiplier = 2654435769;
    const ulong product = C.v * multiplier;
    return cast(uint)((product >> 22) & 1023);
}
313
/* Upper bound on the number of pixels in an image. This implementation can
   safely handle files up to 2GB; assuming the worst case of 5 bytes per pixel
   and rounding down to a clean value gives 400 million pixels. */
enum uint QOIX_PIXELS_MAX = 400_000_000;
319
/// One 8-bit-per-channel pixel, stored in memory as r, g, b, a.
struct RGBA
{
    ubyte r;
    ubyte g;
    ubyte b;
    ubyte a;
}
static assert(RGBA.sizeof == 4);
325
/// A pixel that can be accessed per channel (`rgba`) or as a single 32-bit
/// value (`v`). Both members share the same 4 bytes.
struct qoi_rgba_t
{
    union
    {
        RGBA rgba; // per-channel view
        uint v;    // whole-pixel view, used for comparisons and hashing
    }
}
334
/// End-of-stream marker appended after the last chunk: four 0xff bytes.
static immutable ubyte[4] qoi_padding = [0xff, 0xff, 0xff, 0xff];
336
/// Stores `v` as 4 big-endian bytes at `bytes[*p .. *p + 4]` and advances `*p` by 4.
void qoi_write_32(ubyte* bytes, int *p, uint v)
{
    ubyte* dst = bytes + *p;
    dst[0] = cast(ubyte)(v >> 24);
    dst[1] = cast(ubyte)(v >> 16);
    dst[2] = cast(ubyte)(v >> 8);
    dst[3] = cast(ubyte)(v);
    *p += 4;
}
344
/// Reads 4 big-endian bytes starting at `bytes[*p]`, advances `*p` by 4 and
/// returns the assembled 32-bit value.
uint qoi_read_32(const(ubyte)* bytes, int *p)
{
    uint value = 0;
    foreach (k; 0 .. 4)
    {
        // Most significant byte comes first.
        value = (value << 8) | bytes[*p];
        *p += 1;
    }
    return value;
}
353
/// Writes the 32-bit pattern of `f` through qoi_write_32 (4 bytes, `*p` advances by 4).
void qoi_write_32f(ubyte* bytes, int *p, float f)
{
    // Reinterpret the float's storage as an unsigned integer; no numeric conversion.
    const uint bits = *cast(uint*)&f;
    qoi_write_32(bytes, p, bits);
}
358
/// Reads 32 bits through qoi_read_32 (`*p` advances by 4) and returns them
/// reinterpreted as a float.
float qoi_read_32f(const(ubyte)* bytes, int *p)
{
    uint bits = qoi_read_32(bytes, p);
    float* asFloat = cast(float*)&bits; // same storage viewed as a float, no numeric conversion
    return *asFloat;
}
364
/* Encode raw RGB or RGBA pixels into a QOI2AVG image in memory.

   Params:
     data    = first byte of the first line of 8-bit RGB or RGBA pixels.
     desc    = image description. width, height, channels (3 or 4), bitdepth (8),
               colorspace (0 or 1), compression (QOIX_COMPRESSION_NONE) and
               pitchBytes are read. Only `width * channels` bytes are read from
               each line, so pitchBytes may be larger than a row.
     out_len = receives the size in bytes of the encoded data on success.

   The function either returns null on failure (invalid parameters or malloc
   failed) or a pointer to the encoded data on success.

   The returned qoi data should be free()d after use. */
ubyte* qoix_encode(const(ubyte)* data, const(qoi_desc)* desc, int *out_len)
{
    int p, run;
    int px_len, px_end, px_pos, channels;
    ubyte* bytes;
    ubyte[1024] index_lookup;
    uint index_pos = 0;
    qoi_rgba_t[64] index;
    qoi_rgba_t px, px_ref;

    if (
        data == null || out_len == null || desc == null ||
        desc.width == 0 || desc.height == 0 ||
        desc.channels < 3 || desc.channels > 4 ||
        desc.colorspace > 1 ||
        desc.bitdepth != 8 ||
        desc.compression != QOIX_COMPRESSION_NONE ||
        desc.height >= QOIX_PIXELS_MAX / desc.width
    ) {
        return null;
    }

    channels = desc.channels;

    // Before encoding a scanline, it is converted to RGBA8.
    // This is double buffered, to help with prediction.
    // Sizes are computed in size_t so that the sum passed to malloc cannot
    // wrap around an int for very wide images.
    const size_t converted_scanline_size = cast(size_t)desc.width * 4;

    // Allocate 2 rgba8 scanlines for the need of encoding.
    const size_t extraAllocSize = converted_scanline_size * 2;

    // Overallocate to make room for everything: at most (channels + 1) bytes per pixel.
    const size_t max_size = cast(size_t)desc.width * desc.height * (channels + 1)
                          + QOIX_HEADER_SIZE + qoi_padding.sizeof;

    p = 0;
    bytes = cast(ubyte*) QOI_MALLOC(max_size + extraAllocSize);
    if (!bytes)
    {
        return null;
    }

    // double-buffered scanline, this is intended to speed up encoding
    qoi_rgba_t* inputScanline     = cast(qoi_rgba_t*)(bytes + max_size);
    qoi_rgba_t* lastInputScanline = cast(qoi_rgba_t*)(bytes + max_size + converted_scanline_size);

    qoi_write_32(bytes, &p, QOIX_MAGIC);
    qoi_write_32(bytes, &p, desc.width);
    qoi_write_32(bytes, &p, desc.height);
    bytes[p++] = 1; // Put a version number :)
    bytes[p++] = desc.channels; // 3, or 4
    bytes[p++] = desc.bitdepth; // 8, or 10
    bytes[p++] = desc.colorspace;
    bytes[p++] = QOIX_COMPRESSION_NONE;
    qoi_write_32f(bytes, &p, desc.pixelAspectRatio);
    qoi_write_32f(bytes, &p, desc.resolutionY);

    memset(index.ptr, 0, 64 * qoi_rgba_t.sizeof);
    index_lookup[] = 0;

    // The encoder starts with opaque black as the previous pixel.
    run = 0;
    px.rgba.r = 0;
    px.rgba.g = 0;
    px.rgba.b = 0;
    px.rgba.a = 255;

    px_len = desc.width * desc.height * channels;
    px_end = px_len - channels; // px_pos value of the very last pixel
    px_pos = 0;

    assert (channels != 1 && channels != 2);

    for (int posy = 0; posy < desc.height; ++posy)
    {
        // Line offset computed in pointer-sized arithmetic.
        const(ubyte)* line = data + cast(ptrdiff_t)desc.pitchBytes * posy;

        // Convert one input scanline at once to rgba8
        if (desc.channels == 4)
        {
            // PERF: replace by pointer swap
            // Copy exactly one row of pixels: the scratch line holds width * 4
            // bytes, which may be less than (or unrelated in sign to) the pitch.
            memcpy(inputScanline, line, converted_scanline_size);
        }
        else
        {
            assert(desc.channels == 3);
            for (int posx = 0; posx < desc.width; ++posx)
            {
                inputScanline[posx].rgba = RGBA(line[posx * 3 + 0], line[posx * 3 + 1], line[posx * 3 + 2], 255);
            }
        }

        for (int posx = 0; posx < desc.width; ++posx)
        {
            px_ref.v = px.v;
            px = inputScanline[posx];

            if (px.v == px_ref.v) {
                run++;
                // Flush when the run reaches the RUN2 maximum or the image ends.
                if (run == 1024 || px_pos == px_end) {
                    run--;
                    bytes[p++] = QOI_OP_RUN2 | ((run >> 8) & 3);
                    bytes[p++] = run & 0xff;
                    run = 0;
                }
            }
            else {
                int hash = QOI_COLOR_HASH(px);

                // A pending run ends here: emit it before the new pixel.
                if (run > 0) {
                    run--;
                    if (run < 8) {
                        bytes[p++] = cast(ubyte)(QOI_OP_RUN | run);
                    }
                    else {
                        bytes[p++] = QOI_OP_RUN2 | ((run >> 8) & 3);
                        bytes[p++] = run & 0xff;
                    }
                    run = 0;
                }

                if (index[index_lookup[hash]].v == px.v) {
                    bytes[p++] = QOI_OP_INDEX | index_lookup[hash];
                }
                else {
                    index_lookup[hash] = cast(ubyte) index_pos;
                    index[index_pos] = px;
                    index_pos = (index_pos + 1) & 63;

                    byte va = cast(byte)(px.rgba.a - px_ref.rgba.a);

                    if (va) {
                        if (va >= -4 && va <= 3){
                            // Small alpha change: prefix opcode, then a color opcode follows.
                            bytes[p++] = cast(ubyte)(QOI_OP_ADIFF | (va + 4));
                        } else {
                            bytes[p++] = QOI_OP_RGBA; // make a grey + alpha opcode?
                            bytes[p++] = px.rgba.r;
                            bytes[p++] = px.rgba.g;
                            bytes[p++] = px.rgba.b;
                            bytes[p++] = px.rgba.a;
                            goto pixel_encoded;
                        }
                    }

                    // Note: computing this predictor for the whole scanline in advance, even with 2x pixels at once, was slower.
                    // because in normal times, you don't compute this predictor all the time.
                    if (posy > 0)
                    {
                        if (posx == 0)
                        {
                            // first pixel in the row, take above pixel
                            RGBA pred = lastInputScanline[posx].rgba;
                            px_ref.rgba.r = pred.r;
                            px_ref.rgba.g = pred.g;
                            px_ref.rgba.b = pred.b;
                        }
                        else
                        {
                            RGBA pred = locoIntraPredictionSIMD(px_ref.rgba, lastInputScanline[posx].rgba, lastInputScanline[posx-1].rgba);
                            px_ref.rgba.r = pred.r;
                            px_ref.rgba.g = pred.g;
                            px_ref.rgba.b = pred.b;
                        }
                    }

                    // Differences to the reference, with 8-bit wraparound.
                    byte vg   = cast(byte)(px.rgba.g - px_ref.rgba.g);
                    byte vg_r = cast(byte)(px.rgba.r - px_ref.rgba.r - vg);
                    byte vg_b = cast(byte)(px.rgba.b - px_ref.rgba.b - vg);

                    if (
                        vg   >= -4 && vg   <  0 &&
                        vg_r >= -1 && vg_r <= 2 &&
                        vg_b >= -1 && vg_b <= 2
                    ) {
                        bytes[p++] = cast(ubyte)( QOI_OP_LUMA | (vg + 4) << 4 | (vg_r + 1) << 2 | (vg_b + 1) );
                    }
                    else if (
                        vg   >=  0 && vg   <= 3 &&
                        vg_r >= -2 && vg_r <= 1 &&
                        vg_b >= -2 && vg_b <= 1
                    ) {
                        bytes[p++] = cast(ubyte)( QOI_OP_LUMA | (vg + 4) << 4 | (vg_r + 2) << 2 | (vg_b + 2) );
                    }
                    else if (
                        px.rgba.g == px.rgba.r &&
                        px.rgba.g == px.rgba.b
                    ) {
                        bytes[p++] = QOI_OP_GRAY;
                        bytes[p++] = px.rgba.g;
                    }
                    else if (
                        vg_r >=  -8 && vg_r <=  7 &&
                        vg   >= -16 && vg   <= 15 &&
                        vg_b >=  -8 && vg_b <=  7
                    ) {
                        bytes[p++] = cast(ubyte)( QOI_OP_LUMA2 | (vg + 16) );
                        bytes[p++] = cast(ubyte)( (vg_r + 8) << 4 | (vg_b +  8) );
                    }
                    else if (
                        vg_r >= -32 && vg_r <= 31 &&
                        vg   >= -64 && vg   <= 63 &&
                        vg_b >= -32 && vg_b <= 31
                    ) {
                        int dv = ((vg + 64) << 12) | ((vg_r + 32) << 6) | (vg_b + 32);
                        bytes[p++] = QOI_OP_LUMA3 | ((dv >> 16) & 31);
                        bytes[p++] = (dv >> 8) & 255;
                        bytes[p++] = dv & 255;
                    } else {
                        bytes[p++] = QOI_OP_RGB;
                        bytes[p++] = px.rgba.r;
                        bytes[p++] = px.rgba.g;
                        bytes[p++] = px.rgba.b;
                    }
                }
            }

            pixel_encoded:

            px_pos += channels;
        }

        // swap input scanline buffers
        {
            qoi_rgba_t* temp = inputScanline;
            inputScanline = lastInputScanline;
            lastInputScanline = temp;
        }
    }

    // End-of-stream marker.
    for (int i = 0; i < cast(int)(qoi_padding.sizeof); i++)
    {
        bytes[p++] = qoi_padding[i];
    }

    *out_len = p;
    return bytes;
}
612
/* Decode a QOI2AVG image from memory.

   Params:
     data     = encoded file bytes.
     size     = number of bytes available at `data`.
     desc     = filled from the file header; pitchBytes is set to
                `width * channels` of the returned buffer.
     channels = 0 to output the channel count stored in the file, or 3 / 4 to
                force RGB / RGBA output.

   The function either returns null on failure (invalid parameters, invalid
   header or malloc failed) or a pointer to the decoded pixels.

   The returned pixel data should be free()d after use. */
ubyte* qoix_decode(const(void)* data, int size, qoi_desc *desc, int channels) {
    const(ubyte)* bytes;
    uint header_magic;
    qoi_rgba_t[64] index;
    qoi_rgba_t px, px_ref;
    int chunks_len;
    int p = 0, run = 0;
    int index_pos = 0;

    if (
        data == null || desc == null ||
        (channels != 0 && channels != 3 && channels != 4) ||
        size < QOIX_HEADER_SIZE + cast(int)(qoi_padding.sizeof)
    ) {
        return null;
    }

    bytes = cast(const(ubyte)*)data;

    header_magic = qoi_read_32(bytes, &p);
    desc.width = qoi_read_32(bytes, &p);
    desc.height = qoi_read_32(bytes, &p);
    int qoix_version = bytes[p++];
    desc.channels = bytes[p++];
    desc.bitdepth = bytes[p++];
    desc.colorspace = bytes[p++];
    desc.compression = bytes[p++];
    desc.pixelAspectRatio = qoi_read_32f(bytes, &p);
    desc.resolutionY = qoi_read_32f(bytes, &p);

    if (
        desc.width == 0 || desc.height == 0 ||
        desc.channels < 3 || desc.channels > 4 ||
        desc.colorspace > 1 ||
        desc.bitdepth != 8 ||
        qoix_version > 1 ||
        desc.compression != QOIX_COMPRESSION_NONE ||
        header_magic != QOIX_MAGIC ||
        desc.height >= QOIX_PIXELS_MAX / desc.width
    ) {
        return null;
    }

    if (channels == 0) {
        channels = desc.channels;
    }

    int samplesPerRow = desc.width * channels;

    desc.pitchBytes = samplesPerRow;

    // Sizes are computed in size_t so that the sum passed to malloc cannot
    // wrap around an int for very wide images.
    const size_t pixel_data_size = cast(size_t)desc.width * desc.height * channels;
    const size_t decoded_scanline_size = cast(size_t)desc.width * 4;

    ubyte* pixels = cast(ubyte *) QOI_MALLOC(pixel_data_size + 2 * decoded_scanline_size);
    if (!pixels) {
        return null;
    }

    // double-buffered scanline, this is intended to speed up decoding
    qoi_rgba_t* decodedScanline     = cast(qoi_rgba_t*)(pixels + pixel_data_size);
    qoi_rgba_t* lastDecodedScanline = cast(qoi_rgba_t*)(pixels + pixel_data_size + decoded_scanline_size);

    // Both scratch lines are contiguous. Clear them so that a malformed stream
    // (QOI_OP_END in the middle of a row) can never copy uninitialized heap
    // memory into the output.
    memset(decodedScanline, 0, 2 * decoded_scanline_size);

    assert(channels != 1 && channels != 2);

    // The decoder starts with opaque black as the previous pixel.
    memset(index.ptr, 0, 64 * qoi_rgba_t.sizeof);
    px.rgba.r = 0;
    px.rgba.g = 0;
    px.rgba.b = 0;
    px.rgba.a = 255;

    // A chunk may only start before the end marker; its data bytes may extend
    // into the 4 padding bytes.
    chunks_len = size - cast(int)(qoi_padding.sizeof);

    for (int posy = 0; posy < desc.height; ++posy)
    {
        for (int posx = 0; posx < desc.width; ++posx)
        {
            if (run > 0)
            {
                run--;
            }
            else if (p < chunks_len)
            {
                px_ref.v = px.v;

                if (posy > 0)
                {
                    if (posx == 0)
                    {
                        // first pixel in the row, take above pixel
                        px_ref.rgba.r = lastDecodedScanline[posx].rgba.r;
                        px_ref.rgba.g = lastDecodedScanline[posx].rgba.g;
                        px_ref.rgba.b = lastDecodedScanline[posx].rgba.b;
                    }
                    else
                    {
                        // Called I-LOCO intra prediction
                        RGBA pred = locoIntraPredictionSIMD(px.rgba, lastDecodedScanline[posx].rgba, lastDecodedScanline[posx-1].rgba);
                        px_ref.rgba.r = pred.r;
                        px_ref.rgba.g = pred.g;
                        px_ref.rgba.b = pred.b;
                    }
                }

            decode_op:

                int b1 = bytes[p++];
                if (b1 < 0x80) {                /* QOI_OP_LUMA */
                    int vg = ((b1 >> 4) & 7) - 4;
                    px.rgba.g = cast(ubyte)(px_ref.rgba.g + vg);
                    // The bias of the red/blue fields depends on the sign of the green diff.
                    if (vg < 0) {
                        px.rgba.r = cast(ubyte)( px_ref.rgba.r + vg - 1 + ((b1 >> 2) & 3) );
                        px.rgba.b = cast(ubyte)( px_ref.rgba.b + vg - 1 +  (b1 &  3) );
                    }
                    else {
                        px.rgba.r = cast(ubyte)( px_ref.rgba.r + vg - 2 + ((b1 >> 2) & 3) );
                        px.rgba.b = cast(ubyte)( px_ref.rgba.b + vg - 2 +  (b1 &  3) );
                    }
                    index[index_pos++ & 63] = px;
                }
                else if (b1 < 0xc0) {           /* QOI_OP_INDEX */
                    px = index[b1 & 63];
                }
                else if (b1 < 0xe0) {           /* QOI_OP_LUMA2 */
                    int b2 = bytes[p++];
                    int vg = (b1 & 0x1f) - 16;
                    px.rgba.r = cast(ubyte)( px_ref.rgba.r + vg - 8 + ((b2 >> 4) & 0x0f) );
                    px.rgba.g = cast(ubyte)( px_ref.rgba.g + vg );
                    px.rgba.b = cast(ubyte)( px_ref.rgba.b + vg - 8 +  (b2       & 0x0f) );
                    index[index_pos++ & 63] = px;
                }
                else if (b1 < 0xe8) {           /* QOI_OP_LUMA3 */
                    int dv = (b1 << 8) | bytes[p++];
                    dv = (dv << 8) | bytes[p++];
                    int vg = ((dv >> 12) & 0x7f) - 64;
                    px.rgba.r = cast(ubyte)( px_ref.rgba.r + vg + ((dv >> 6) & 0x3f) - 32 );
                    px.rgba.g = cast(ubyte)( px_ref.rgba.g + vg );
                    px.rgba.b = cast(ubyte)( px_ref.rgba.b + vg + (dv & 0x3f) - 32 );
                    index[index_pos++ & 63] = px;
                }
                else if (b1 < 0xf0) {           /* QOI_OP_ADIFF */
                    px.rgba.a += (b1 & 7) - 4;
                    // ADIFF is a prefix: a color opcode follows. Re-check the
                    // bound before reading it, otherwise a stream made of
                    // ADIFF bytes would be read past its end.
                    if (p < chunks_len)
                        goto decode_op;
                }
                else if (b1 < 0xf8) {           /* QOI_OP_RUN */
                    run = b1 & 7;
                }
                else if (b1 < 0xfc) {           /* QOI_OP_RUN2 */
                    run = ((b1 & 3) << 8) | bytes[p++];
                }
                else if (b1 == QOI_OP_GRAY) {
                    ubyte vg = bytes[p++];
                    px.rgba.r = vg;
                    px.rgba.g = vg;
                    px.rgba.b = vg;
                    index[index_pos++ & 63] = px;
                }
                else if (b1 == QOI_OP_RGB) {
                    px.rgba.r = bytes[p++];
                    px.rgba.g = bytes[p++];
                    px.rgba.b = bytes[p++];
                    index[index_pos++ & 63] = px;
                }
                else if (b1 == QOI_OP_RGBA) {
                    px.rgba.r = bytes[p++];
                    px.rgba.g = bytes[p++];
                    px.rgba.b = bytes[p++];
                    px.rgba.a = bytes[p++];
                    index[index_pos++ & 63] = px;
                }
                else {                          /* QOI_OP_END */
                    // Stops decoding the current row; the rest of the scratch
                    // line keeps its previous (initialized) content.
                    break;
                }
            }

            decodedScanline[posx] = px;
        }

        // convert just-decoded scanline into output type
        ubyte* line = pixels + cast(ptrdiff_t)desc.pitchBytes * posy;

        switch(channels)
        {
            case 4:
                // No particular conversion to do
                memcpy(line, &decodedScanline[0], decoded_scanline_size);
                break;

            case 3:
                // Drop the alpha channel.
                for (int posx = 0; posx < desc.width; ++posx)
                {
                    qoi_rgba_t decodedPx = decodedScanline[posx];
                    line[posx * 3 + 0] = decodedPx.rgba.r;
                    line[posx * 3 + 1] = decodedPx.rgba.g;
                    line[posx * 3 + 2] = decodedPx.rgba.b;
                }
                break;
            default:
                assert(false);
        }

        // swap decoded scanline buffers
        {
            qoi_rgba_t* temp = decodedScanline;
            decodedScanline = lastDecodedScanline;
            lastDecodedScanline = temp;
        }
    }

    return pixels;
}
835
836 private:
837
/* Perform LOCO-I prediction independently over the 4 channels.

   Scalar reference, for one channel (a, b, c being that channel of the three
   arguments):

    int max_ab = a > b ? a : b;
    int min_ab = a < b ? a : b;
    if (c >= max_ab)
        return cast(ubyte)min_ab;
    else if (c <= min_ab)
        return cast(ubyte)max_ab;
    else
    {
        // Here min_ab < c < max_ab, hence a + b - c is already within 0..255;
        // the clamp only mirrors the saturating pack of the SIMD version.
        int d = a + b - c;
        if (d < 0)
            d = 0;
        if (d > 255)
            d = 255;
        return cast(ubyte)d;
    }
*/
static RGBA locoIntraPredictionSIMD(RGBA a, RGBA b, RGBA c)
{
    __m128i zero = _mm_setzero_si128();

    // Load each RGBA8 pixel into the low 32 bits and widen its channels to 16-bit lanes.
    __m128i wideA = _mm_unpacklo_epi8(_mm_loadu_si32(&a), zero);
    __m128i wideB = _mm_unpacklo_epi8(_mm_loadu_si32(&b), zero);
    __m128i wideC = _mm_unpacklo_epi8(_mm_loadu_si32(&c), zero);

    // Gradient predictor (A + B - C) and the two clamping candidates.
    __m128i pred   = _mm_sub_epi16(_mm_add_epi16(wideA, wideB), wideC);
    __m128i hiAB   = _mm_max_epi16(wideA, wideB);
    __m128i loAB   = _mm_min_epi16(wideA, wideB);

    // All-ones lanes where min(A, B) must be used (C >= max), resp. max(A, B) (C <= min).
    __m128i useMin = _mm_cmpge_epi16(wideC, hiAB);
    __m128i useMax = _mm_cmple_epi16(wideC, loAB);

    // Blend: first the min candidate, then the max candidate (which wins if both masks are set).
    pred = _mm_or_si128(_mm_andnot_si128(useMin, pred), _mm_and_si128(loAB, useMin));
    pred = _mm_or_si128(_mm_andnot_si128(useMax, pred), _mm_and_si128(hiAB, useMax));

    // Narrow back to unsigned 8-bit with saturation and store the low 4 bytes.
    RGBA result;
    _mm_storeu_si32(&result, _mm_packus_epi16(pred, zero));
    return result;
}
893
/// Per 16-bit signed lane: all ones where `a <= b`, all zeros elsewhere.
private __m128i _mm_cmple_epi16(__m128i a, __m128i b) pure @safe
{
    __m128i below = _mm_cmplt_epi16(a, b);
    __m128i same  = _mm_cmpeq_epi16(a, b);
    return _mm_or_si128(below, same);
}
898
/// Per 16-bit signed lane: all ones where `a >= b`, all zeros elsewhere.
private __m128i _mm_cmpge_epi16(__m128i a, __m128i b)
{
    __m128i above = _mm_cmpgt_epi16(a, b);
    __m128i same  = _mm_cmpeq_epi16(a, b);
    return _mm_or_si128(above, same);
}