1 module gamut.codecs.qoi2avg; 2 3 nothrow @nogc: 4 5 import core.stdc.stdlib: realloc, malloc, free; 6 import core.stdc.string: memset, memcpy; 7 8 import inteli.emmintrin; 9 10 /// Note: this is a translation of "QOI2" mods by @wbd73 11 /// revealed in https://github.com/nigeltao/qoi2-bikeshed/issues/34 12 /// Called "QOIX" in Gamut, since it has a few extensions again, such as LZ4. 13 14 /* 15 16 QOI2 - Lossless image format inspired by QOI “Quite OK Image” format 17 18 Incompatible adaptation of QOI format - https://phoboslab.org 19 20 -- LICENSE: The MIT License(MIT) 21 Copyright(c) 2021 Dominic Szablewski (original QOI format) 22 Copyright(c) 2021 wbd73 @ GitHub (compression improvements) 23 Copyright(c) 2022 Guillaume Piolat (D translation, add pitch support) 24 25 Permission is hereby granted, free of charge, to any person obtaining a copy of 26 this software and associated documentation files(the "Software"), to deal in 27 the Software without restriction, including without limitation the rights to 28 use, copy, modify, merge, publish, distribute, sublicense, and / or sell copies 29 of the Software, and to permit persons to whom the Software is furnished to do 30 so, subject to the following conditions : 31 The above copyright notice and this permission notice shall be included in all 32 copies or substantial portions of the Software. 33 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 34 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 35 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE 36 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 37 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 38 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 39 SOFTWARE. 



-- Documentation

This library provides the following functions;
- qoix_decode  -- decode the raw bytes of a QOIX image from memory
- qoix_encode  -- encode an rgb or rgba buffer into a QOIX image in memory

See the function declaration below for the signature and more information.


-- Data Format

A QOI2AVG file has a 25 byte header, compatible with Gamut QOIX.
Followed by any number of data "chunks" and a 4-byte end marker.

struct qoix_header_t {
    char     magic[4];         // magic bytes "qoix"
    uint32_t width;            // image width in pixels (BE)
    uint32_t height;           // image height in pixels (BE)
    uint8_t  version_;         // Major version of QOIX format.
    uint8_t  channels;         // 3 = RGB, 4 = RGBA (1 and 2 indicate QOI-plane codec, see qoiplane.d)
    uint8_t  bitdepth;         // 8 = this qoi2avg codec is always 8-bit (10 indicates QOI-10 codec, see qoi10b.d)
    uint8_t  colorspace;       // 0 = sRGB with linear alpha, 1 = all channels linear
    uint8_t  compression;      // 0 = none, 1 = LZ4
    float    pixelAspectRatio; // -1 = unknown, else Pixel Aspect Ratio
    float    resolutionY;      // -1 = unknown, else physical vertical resolution in DPI
};
*/

enum QOIX_HEADER_OFFSET_CHANNELS = 13;
enum QOIX_HEADER_OFFSET_BITDEPTH = 14;
enum QOIX_HEADER_OFFSET_COMPRESSION = 16;


/*

The decoder and encoder start with {r: 0, g: 0, b: 0, a: 255} as the previous
pixel value. Pixels are either encoded as
 - a run of the previous pixel
 - an index into an array of previously seen pixels
 - a difference to a reference (predicted) pixel value in r,g,b
 - full r,g,b or r,g,b,a or gray values
A small change of alpha is encoded as a one-byte prefix placed in front of one
of the r,g,b encodings above.

The color channels are assumed to not be premultiplied with the alpha channel
("un-premultiplied alpha").

Each chunk starts with a tag, followed by a number of data bits. The bit length
of chunks is divisible by 8 - i.e. all chunks are byte aligned. All values
encoded in these data bits have the most significant bit on the left.

The byte stream's end is marked with 4 0xff bytes.

A running FIFO array[64] (zero-initialized) of pixel values is maintained by the
encoder and decoder. Every pixel en-/decoded by the QOI_OP_LUMA (and variants),
QOI_OP_GRAY, QOI_OP_RGB and QOI_OP_RGBA chunks is written to this array. The
write position starts at 0 and is incremented with each pixel written. The
position wraps back to 0 when it reaches 64. I.e:
    index[index_pos % 64] = current_pixel;
    index_pos = index_pos + 1;

An encoder can search this array for the current pixel value and, if a match is
found, emit a QOI_OP_INDEX with the position within the array.


The possible chunks are:


.- QOI_OP_INDEX ----------.
|         Byte[0]         |
|  7  6  5  4  3  2  1  0 |
|-------+-----------------|
|  1  0 |     index       |
`-------------------------`
2-bit tag b10
6-bit index into the color index array: 0..63


.- QOI_OP_LUMA -----(232)-.
|         Byte[0]         |
|  7  6  5  4  3  2  1  0 |
|----+--------+-----+-----|
|  0 | g diff | drg | dbg |
`-------------------------`
1-bit tag b0
3-bit green channel difference from the reference -4..3
2-bit   red channel difference minus green channel difference -1..2 or -2..1
2-bit  blue channel difference minus green channel difference -1..2 or -2..1

For the first line of pixels the reference is the previous pixel.
For the next lines of pixels, the reference r,g,b of the first pixel of a line
is the pixel above it. For every other pixel it is the LOCO-I prediction computed
per channel from the previous pixel (a), the pixel above (b) and the pixel
above-left (c): min(a, b) if c >= max(a, b), max(a, b) if c <= min(a, b),
else a + b - c.
The green channel is used to indicate the general direction of change and is
encoded in 3 bits. The red and blue channels (dr and db) base their diffs off
of the green channel difference and are encoded in 2 bits. I.e.:
    dr_dg = (cur_px.r - ref.r) - (cur_px.g - ref.g)
    db_dg = (cur_px.b - ref.b) - (cur_px.g - ref.g)

The difference to the current channel values are using a wraparound operation,
so "1 - 2" will result in 255, while "255 + 1" will result in 0.

Values are stored as unsigned integers with a bias of 4 for the green channel
and a bias of 1 or 2 for the red and blue channel depending on the direction
(sign bit) of the green channel.


.- QOI_OP_LUMA2 ------------------------------(454)-.
|         Byte[0]         |         Byte[1]         |
|  7  6  5  4  3  2  1  0 |  7  6  5  4  3  2  1  0 |
|----------+--------------+-------------+-----------|
|  1  1  0 |  green diff  |   dr - dg   |  db - dg  |
`---------------------------------------------------`
3-bit tag b110
5-bit green channel difference from the reference -16..15
4-bit   red channel difference minus green channel difference -8..7
4-bit  blue channel difference minus green channel difference -8..7

The green channel is used to indicate the general direction of change and is
encoded in 5 bits. The red and blue channels (dr and db) base their diffs off
of the green channel difference and are encoded in 4 bits.

Values are stored as unsigned integers with a bias of 16 for the green channel
and a bias of 8 for the red and blue channel.


.- QOI_OP_LUMA3 ------------------------------------.-------------------(676)-.
|         Byte[0]         |         Byte[1]         |         Byte[2]         |
|  7  6  5  4  3  2  1  0 |  7  6  5  4  3  2  1  0 |  7  6  5  4  3  2  1  0 |
|----------------+----------------------+-------------------+-----------------|
|  1  1  1  0  0 |      green diff      |      dr - dg      |     db - dg     |
`-----------------------------------------------------------------------------`
5-bit tag b11100
7-bit green channel difference from the reference -64..63
6-bit   red channel difference minus green channel difference -32..31
6-bit  blue channel difference minus green channel difference -32..31

The green channel is used to indicate the general direction of change and is
encoded in 7 bits. The red and blue channels (dr and db) base their diffs off
of the green channel difference and are encoded in 6 bits.

Values are stored as unsigned integers with a bias of 64 for the green channel
and a bias of 32 for the red and blue channel.


.- QOI_OP_ADIFF ----------.
|         Byte[0]         |
|  7  6  5  4  3  2  1  0 |
|----------------+--------|
|  1  1  1  0  1 | a diff |
`-------------------------`
5-bit tag b11101
3-bit alpha channel difference from the previous pixel -4..3

The difference is stored with a bias of 4. This chunk only updates alpha; the
encoder always follows it with a QOI_OP_LUMA (or variant), QOI_OP_GRAY or
QOI_OP_RGB chunk that carries r,g,b for the same pixel.


.- QOI_OP_RUN ------------.
|         Byte[0]         |
|  7  6  5  4  3  2  1  0 |
|----------------+--------|
|  1  1  1  1  0 |  run   |
`-------------------------`
5-bit tag b11110
3-bit run-length repeating the previous pixel: 1..8

The run-length is stored with a bias of 1.


.- QOI_OP_RUN2 ---------------------.
|         Byte[0]         | Byte[1] |
|  7  6  5  4  3  2  1  0 | 7 .. 0  |
|-------------------+-----+---------|
|  1  1  1  1  1  0 |      run      |
`-----------------------------------`
6-bit tag b111110
10-bit run-length repeating the previous pixel: 1..1024

The run-length is stored with a bias of 1.


.- QOI_OP_GRAY ---------------------.
|         Byte[0]         | Byte[1] |
|  7  6  5  4  3  2  1  0 | 7 .. 0  |
|-------------------------+---------|
|  1  1  1  1  1  1  0  0 |  gray   |
`-----------------------------------`
8-bit tag b11111100
8-bit gray channel value


.- QOI_OP_RGB ------------------------------------------.
|         Byte[0]         | Byte[1] | Byte[2] | Byte[3] |
|  7  6  5  4  3  2  1  0 | 7 .. 0  | 7 .. 0  | 7 .. 0  |
|-------------------------+---------+---------+---------|
|  1  1  1  1  1  1  0  1 |   red   |  green  |  blue   |
`-------------------------------------------------------`
8-bit tag b11111101
8-bit   red channel value
8-bit green channel value
8-bit  blue channel value


.- QOI_OP_RGBA ---------------------------------------------------.
|         Byte[0]         | Byte[1] | Byte[2] | Byte[3] | Byte[4] |
|  7  6  5  4  3  2  1  0 | 7 .. 0  | 7 .. 0  | 7 .. 0  | 7 .. 0  |
|-------------------------+---------+---------+---------+---------|
|  1  1  1  1  1  1  1  0 |   red   |  green  |  blue   |  alpha  |
`-----------------------------------------------------------------`
8-bit tag b11111110
8-bit   red channel value
8-bit green channel value
8-bit  blue channel value
8-bit alpha channel value


.- QOI_OP_END ------------.
|         Byte[0]         |
|  7  6  5  4  3  2  1  0 |
|-------------------------|
|  1  1  1  1  1  1  1  1 |
`-------------------------`
8-bit tag b11111111


The byte stream is padded at the end with four 0xff bytes. Since at most 4
bytes follow the tag byte of any single chunk (QOI_OP_RGBA), with this padding
a chunk whose tag lies before the padding never reads past the end of the
stream. These 0xff bytes also mark the end of the data stream.
NOTE(review): a QOI_OP_RGBA chunk for an opaque white pixel also contains four
consecutive 0xff bytes, so the end marker cannot be found by scanning alone.

*/

/* A pointer to a qoi_desc struct has to be supplied to all of this module's
   functions. It describes either the input format (for qoix_encode), or is
   filled with the description read from the file header (for qoix_decode).

   The colorspace in this qoi_desc is an enum where
   0 = sRGB, i.e. gamma scaled RGB channels and a linear alpha channel
   1 = all channels are linear
   You may use the constants QOI_SRGB or QOI_LINEAR. The colorspace is purely
   informative. It will be saved to the file header, but does not affect
   en-/decoding in any way. */

enum QOI_SRGB = 0;
enum QOI_LINEAR = 1;

struct qoi_desc
{
    uint width;
    uint height;
    int pitchBytes; // number of bytes between start of lines.
    ubyte channels;
    ubyte bitdepth;
    ubyte colorspace;
    ubyte compression;
    float pixelAspectRatio; // PAR, in Gamut format
    float resolutionY;      // Vertical DPI, in Gamut format
}

alias QOI_MALLOC = malloc;
alias QOI_FREE = free;


enum int QOI_OP_LUMA  = 0x00; /* 0xxxxxxx */
enum int QOI_OP_INDEX = 0x80; /* 10xxxxxx */
enum int QOI_OP_LUMA2 = 0xc0; /* 110xxxxx */
enum int QOI_OP_LUMA3 = 0xe0; /* 11100xxx */
enum int QOI_OP_ADIFF = 0xe8; /* 11101xxx */
enum int QOI_OP_RUN   = 0xf0; /* 11110xxx */
enum int QOI_OP_RUN2  = 0xf8; /* 111110xx */
enum int QOI_OP_GRAY  = 0xfc; /* 11111100 */
enum int QOI_OP_RGB   = 0xfd; /* 11111101 */
enum int QOI_OP_RGBA  = 0xfe; /* 11111110 */
enum int QOI_OP_END   = 0xff; /* 11111111 */

enum uint QOIX_MAGIC = 0x716F6978; // "qoix"
enum QOIX_HEADER_SIZE = 15 + 1 /* version */ + 4 /* PAR */ + 4 /* DPI */ + 1 /* compression */;
enum ubyte QOIX_COMPRESSION_NONE = 0;
enum ubyte QOIX_COMPRESSION_LZ4 = 1;

/* Multiplicative hash of a packed pixel value, yielding a slot in 0..1023.
   It lets the encoder find a pixel's position in the color index array through
   a lookup table instead of a linear search. */
uint QOI_COLOR_HASH(qoi_rgba_t C)
{
    enum uint multiplier = 2654435769u;

    // The product wraps to 32 bits; its top 10 bits are the hash.
    uint mixed = C.v * multiplier;
    return mixed >> 22;
}

/* 2GB is the max file size that this implementation can safely handle. We guard
   against anything larger than that, assuming the worst case with 5 bytes per
   pixel, rounded down to a nice clean value. 400 million pixels ought to be
   enough for anybody. */
enum uint QOIX_PIXELS_MAX = 400000000;

struct RGBA
{
    ubyte r, g, b, a;
}
static assert(RGBA.sizeof == 4);

/* One pixel, accessible either per channel (rgba) or as a single 32-bit value
   (v) for fast comparison and hashing. */
struct qoi_rgba_t
{
    union
    {
        RGBA rgba;
        uint v;
    }
}

static immutable ubyte[4] qoi_padding = [255,255,255,255];

/* Store v as 4 big-endian bytes at bytes[*p] and advance *p by 4. */
void qoi_write_32(ubyte* bytes, int *p, uint v)
{
    int at = *p;
    bytes[at + 0] = cast(ubyte)(v >> 24);
    bytes[at + 1] = cast(ubyte)(v >> 16);
    bytes[at + 2] = cast(ubyte)(v >> 8);
    bytes[at + 3] = cast(ubyte)(v);
    *p = at + 4;
}

/* Load a big-endian 32-bit value from bytes[*p] and advance *p by 4. */
uint qoi_read_32(const(ubyte)* bytes, int *p)
{
    const(ubyte)* src = bytes + *p;
    *p += 4;

    uint hi    = src[0];
    uint midHi = src[1];
    uint midLo = src[2];
    uint lo    = src[3];
    return (hi << 24) | (midHi << 16) | (midLo << 8) | lo;
}

/* Store the bit pattern of f as 4 big-endian bytes and advance *p by 4. */
void qoi_write_32f(ubyte* bytes, int *p, float f)
{
    union Pun
    {
        float asFloat;
        uint asBits;
    }
    Pun pun;
    pun.asFloat = f;
    qoi_write_32(bytes, p, pun.asBits);
}

/* Load 4 big-endian bytes, reinterpret them as a float, advance *p by 4. */
float qoi_read_32f(const(ubyte)* bytes, int *p)
{
    union Pun
    {
        uint asBits;
        float asFloat;
    }
    Pun pun;
    pun.asBits = qoi_read_32(bytes, p);
    return pun.asFloat;
}

/* Encode raw RGB or RGBA pixels into a QOI2AVG image in memory.

   The function either returns null on failure (invalid parameters or malloc
   failed) or a pointer to the encoded data on success. On success the out_len
   is set to the size in bytes of the encoded data.

   The returned qoi data should be free()d after use.
*/
/* Params:
     data    = top-left pixel of the source image, 3 or 4 bytes per pixel;
               row `y` starts at `data + desc.pitchBytes * y`.
     desc    = image description; width, height, channels (3 or 4), bitdepth (8),
               colorspace (0 or 1), compression (none) and pitchBytes are read.
     out_len = receives the number of encoded bytes.

   The output buffer is allocated with QOI_MALLOC. Its total size (worst-case
   stream plus two scratch scanlines) must fit in an int, else null is returned. */
ubyte* qoix_encode(const(ubyte)* data, const(qoi_desc)* desc, int *out_len)
{
    int p, run;
    int px_len, px_end, px_pos, channels;
    ubyte* bytes;
    ubyte[1024] index_lookup;
    uint index_pos = 0;
    qoi_rgba_t[64] index;
    qoi_rgba_t px, px_ref;

    if (
        data == null || out_len == null || desc == null ||
        desc.width == 0 || desc.height == 0 ||
        desc.channels < 3 || desc.channels > 4 ||
        desc.colorspace > 1 ||
        desc.bitdepth != 8 ||
        desc.compression != QOIX_COMPRESSION_NONE ||
        desc.height >= QOIX_PIXELS_MAX / desc.width
    ) {
        return null;
    }

    // Before encoding a scanline, it is converted to RGBA8.
    // This is double buffered, to help with prediction.
    const ulong converted_scanline_size = cast(ulong) desc.width * 4;

    // Two rgba8 scanlines (current and previous) live after the output stream.
    const ulong extraAllocSize = converted_scanline_size * 2;

    // Overallocate to make room for everything: worst case is channels + 1 bytes per pixel.
    const ulong max_size = cast(ulong) desc.width * desc.height * (desc.channels + 1)
                         + QOIX_HEADER_SIZE + qoi_padding.sizeof;

    // Sizes are computed in 64 bits; very wide images would overflow 32-bit arithmetic.
    // `p` and `*out_len` are int, so everything must stay addressable with an int.
    if (max_size + extraAllocSize > int.max)
    {
        return null;
    }

    p = 0;
    bytes = cast(ubyte*) QOI_MALLOC(cast(size_t)(max_size + extraAllocSize));
    if (!bytes)
    {
        return null;
    }

    // double-buffered scanline, this is intended to speed up encoding
    qoi_rgba_t* inputScanline     = cast(qoi_rgba_t*)(bytes + cast(size_t) max_size);
    qoi_rgba_t* lastInputScanline = cast(qoi_rgba_t*)(bytes + cast(size_t)(max_size + converted_scanline_size));

    qoi_write_32(bytes, &p, QOIX_MAGIC);
    qoi_write_32(bytes, &p, desc.width);
    qoi_write_32(bytes, &p, desc.height);
    bytes[p++] = 1; // Put a version number :)
    bytes[p++] = desc.channels; // 3, or 4
    bytes[p++] = desc.bitdepth; // 8, or 10
    bytes[p++] = desc.colorspace;
    bytes[p++] = QOIX_COMPRESSION_NONE;
    qoi_write_32f(bytes, &p, desc.pixelAspectRatio);
    qoi_write_32f(bytes, &p, desc.resolutionY);

    memset(index.ptr, 0, 64 * qoi_rgba_t.sizeof);
    index_lookup[] = 0;

    run = 0;
    px.rgba.r = 0;
    px.rgba.g = 0;
    px.rgba.b = 0;
    px.rgba.a = 255;

    channels = desc.channels;
    px_len = desc.width * desc.height * channels;
    px_end = px_len - channels; // value of px_pos at the very last pixel
    px_pos = 0;

    assert (channels != 1 && channels != 2);

    for (int posy = 0; posy < desc.height; ++posy)
    {
        // pitchBytes is signed and may exceed the packed row size; multiply in pointer width.
        const(ubyte)* line = data + cast(ptrdiff_t) desc.pitchBytes * posy;

        // Convert one input scanline at once to rgba8
        if (desc.channels == 4)
        {
            // PERF: replace by pointer swap
            // Copy exactly one row of pixels: the scratch scanline only holds width * 4 bytes,
            // while pitchBytes may be larger (row padding) or negative.
            memcpy(inputScanline, line, cast(size_t) desc.width * 4);
        }
        else
        {
            assert(desc.channels == 3);
            for (int posx = 0; posx < desc.width; ++posx)
            {
                inputScanline[posx].rgba = RGBA(line[posx * 3 + 0], line[posx * 3 + 1], line[posx * 3 + 2], 255);
            }
        }

        for (int posx = 0; posx < desc.width; ++posx)
        {
            px_ref.v = px.v;
            px = inputScanline[posx];

            if (px.v == px_ref.v) {
                run++;
                // Flush when the 10-bit run is full, or at the last pixel of the image.
                if (run == 1024 || px_pos == px_end) {
                    run--;
                    bytes[p++] = QOI_OP_RUN2 | ((run >> 8) & 3);
                    bytes[p++] = run & 0xff;
                    run = 0;
                }
            }
            else {
                int hash = QOI_COLOR_HASH(px);

                // A pending run ends here; runs are stored with a bias of 1.
                if (run > 0) {
                    run--;
                    if (run < 8) {
                        bytes[p++] = cast(ubyte)(QOI_OP_RUN | run);
                    }
                    else {
                        bytes[p++] = QOI_OP_RUN2 | ((run >> 8) & 3);
                        bytes[p++] = run & 0xff;
                    }
                    run = 0;
                }

                if (index[index_lookup[hash]].v == px.v) {
                    bytes[p++] = QOI_OP_INDEX | index_lookup[hash];
                }
                else {
                    index_lookup[hash] = cast(ubyte) index_pos;
                    index[index_pos] = px;
                    index_pos = (index_pos + 1) & 63;

                    byte va = cast(byte)(px.rgba.a - px_ref.rgba.a);

                    if (va) {
                        if (va >= -4 && va <= 3){
                            // Small alpha change: one-byte prefix, r,g,b follow in another chunk.
                            bytes[p++] = cast(ubyte)(QOI_OP_ADIFF | (va + 4));
                        } else {
                            bytes[p++] = QOI_OP_RGBA; // make a grey + alpha opcode?
                            bytes[p++] = px.rgba.r;
                            bytes[p++] = px.rgba.g;
                            bytes[p++] = px.rgba.b;
                            bytes[p++] = px.rgba.a;
                            goto pixel_encoded;
                        }
                    }

                    // Note: computing this predictor for the whole scanline in advance, even with 2x pixels at once, was slower.
                    // because in normal times, you don't compute this predictor all the time.
                    if (posy > 0)
                    {
                        if (posx == 0)
                        {
                            // first pixel in the row, take above pixel
                            RGBA pred = lastInputScanline[posx].rgba;
                            px_ref.rgba.r = pred.r;
                            px_ref.rgba.g = pred.g;
                            px_ref.rgba.b = pred.b;
                        }
                        else
                        {
                            RGBA pred = locoIntraPredictionSIMD(px_ref.rgba, lastInputScanline[posx].rgba, lastInputScanline[posx-1].rgba);
                            px_ref.rgba.r = pred.r;
                            px_ref.rgba.g = pred.g;
                            px_ref.rgba.b = pred.b;
                        }
                    }

                    // Wrap-around differences to the reference; red and blue are relative to green.
                    byte vg   = cast(byte)(px.rgba.g - px_ref.rgba.g);
                    byte vg_r = cast(byte)(px.rgba.r - px_ref.rgba.r - vg);
                    byte vg_b = cast(byte)(px.rgba.b - px_ref.rgba.b - vg);

                    if (
                        vg   >= -4 && vg   <  0 &&
                        vg_r >= -1 && vg_r <= 2 &&
                        vg_b >= -1 && vg_b <= 2
                    ) {
                        bytes[p++] = cast(ubyte)( QOI_OP_LUMA | (vg + 4) << 4 | (vg_r + 1) << 2 | (vg_b + 1) );
                    }
                    else if (
                        vg   >=  0 && vg   <= 3 &&
                        vg_r >= -2 && vg_r <= 1 &&
                        vg_b >= -2 && vg_b <= 1
                    ) {
                        bytes[p++] = cast(ubyte)( QOI_OP_LUMA | (vg + 4) << 4 | (vg_r + 2) << 2 | (vg_b + 2) );
                    }
                    else if (
                        px.rgba.g == px.rgba.r &&
                        px.rgba.g == px.rgba.b
                    ) {
                        bytes[p++] = QOI_OP_GRAY;
                        bytes[p++] = px.rgba.g;
                    }
                    else if (
                        vg_r >=  -8 && vg_r <=  7 &&
                        vg   >= -16 && vg   <= 15 &&
                        vg_b >=  -8 && vg_b <=  7
                    ) {
                        bytes[p++] = cast(ubyte)( QOI_OP_LUMA2 | (vg + 16) );
                        bytes[p++] = cast(ubyte)( (vg_r + 8) << 4 | (vg_b +  8) );
                    }
                    else if (
                        vg_r >= -32 && vg_r <= 31 &&
                        vg   >= -64 && vg   <= 63 &&
                        vg_b >= -32 && vg_b <= 31
                    ) {
                        int dv = ((vg + 64) << 12) | ((vg_r + 32) << 6) | (vg_b + 32);
                        bytes[p++] = QOI_OP_LUMA3 | ((dv >> 16) & 31);
                        bytes[p++] = (dv >> 8) & 255;
                        bytes[p++] = dv & 255;
                    } else {
                        bytes[p++] = QOI_OP_RGB;
                        bytes[p++] = px.rgba.r;
                        bytes[p++] = px.rgba.g;
                        bytes[p++] = px.rgba.b;
                    }
                }
            }

            pixel_encoded:

            px_pos += channels;
        }

        // swap input scanline buffers
        {
            qoi_rgba_t* temp = inputScanline;
            inputScanline = lastInputScanline;
            lastInputScanline = temp;
        }
    }

    for (int i = 0; i < cast(int)(qoi_padding.sizeof); i++)
    {
        bytes[p++] = qoi_padding[i];
    }

    *out_len = p;
    return bytes;
}

/* Decode a QOI2AVG image from memory.

   The function either returns null on failure (invalid parameters or malloc
   failed) or a pointer to the decoded pixels. On success, the qoi_desc struct
   is filled with the description from the file header.

   Params:
     data     = encoded stream (header, chunks, 4 bytes of padding).
     size     = number of bytes available at `data`.
     desc     = receives the header fields; pitchBytes is set to width * channels.
                It is written to even when the header is later rejected.
     channels = 3 or 4 to force the output layout, 0 to use the header's value.

   If the stream ends early, or a QOI_OP_END tag is met inside the pixel data,
   decoding stops consuming input and the remaining pixels repeat the last
   decoded pixel.

   The returned pixel data should be free()d after use. */
ubyte* qoix_decode(const(void)* data, int size, qoi_desc *desc, int channels) {
    const(ubyte)* bytes;
    uint header_magic;
    qoi_rgba_t[64] index;
    qoi_rgba_t px, px_ref;
    int chunks_len;
    int p = 0, run = 0;
    int index_pos = 0;

    if (
        data == null || desc == null ||
        (channels != 0 && channels != 3 && channels != 4) ||
        size < QOIX_HEADER_SIZE + cast(int)(qoi_padding.sizeof)
    ) {
        return null;
    }

    bytes = cast(const(ubyte)*)data;

    header_magic = qoi_read_32(bytes, &p);
    desc.width = qoi_read_32(bytes, &p);
    desc.height = qoi_read_32(bytes, &p);
    int qoix_version = bytes[p++];
    desc.channels = bytes[p++];
    desc.bitdepth = bytes[p++];
    desc.colorspace = bytes[p++];
    desc.compression = bytes[p++];
    desc.pixelAspectRatio = qoi_read_32f(bytes, &p);
    desc.resolutionY = qoi_read_32f(bytes, &p);

    if (
        desc.width == 0 || desc.height == 0 ||
        desc.channels < 3 || desc.channels > 4 ||
        desc.colorspace > 1 ||
        desc.bitdepth != 8 ||
        qoix_version > 1 ||
        desc.compression != QOIX_COMPRESSION_NONE ||
        header_magic != QOIX_MAGIC ||
        desc.height >= QOIX_PIXELS_MAX / desc.width
    ) {
        return null;
    }

    if (channels == 0) {
        channels = desc.channels;
    }

    int samplesPerRow = desc.width * channels;

    desc.pitchBytes = samplesPerRow;

    // Sizes are computed in 64 bits; very wide images would overflow 32-bit arithmetic.
    const ulong pixel_bytes    = cast(ulong) desc.width * desc.height * channels;
    const ulong scanline_bytes = cast(ulong) desc.width * 4;
    if (pixel_bytes + 2 * scanline_bytes > int.max) {
        return null;
    }
    const size_t pixel_data_size       = cast(size_t) pixel_bytes;
    const size_t decoded_scanline_size = cast(size_t) scanline_bytes;

    // The output image is followed by two scratch rgba8 scanlines.
    ubyte* pixels = cast(ubyte *) QOI_MALLOC(pixel_data_size + 2 * decoded_scanline_size);
    if (!pixels) {
        return null;
    }

    // double-buffered scanline, this is intended to speed up decoding
    qoi_rgba_t* decodedScanline     = cast(qoi_rgba_t*)(&pixels[pixel_data_size]);
    qoi_rgba_t* lastDecodedScanline = cast(qoi_rgba_t*)(&pixels[pixel_data_size + decoded_scanline_size]);

    assert(channels != 1 && channels != 2);

    memset(index.ptr, 0, 64 * qoi_rgba_t.sizeof);
    px.rgba.r = 0;
    px.rgba.g = 0;
    px.rgba.b = 0;
    px.rgba.a = 255;

    // A chunk is only started while its tag byte lies before the padding; a tag
    // is followed by at most 4 bytes, which then stay inside the buffer.
    chunks_len = size - cast(int)(qoi_padding.sizeof);

    for (int posy = 0; posy < desc.height; ++posy)
    {
        for (int posx = 0; posx < desc.width; ++posx)
        {
            if (run > 0)
            {
                run--;
            }
            else if (p < chunks_len)
            {
                px_ref.v = px.v;

                if (posy > 0)
                {
                    if (posx == 0)
                    {
                        // first pixel in the row, take above pixel
                        px_ref.rgba.r = lastDecodedScanline[posx].rgba.r;
                        px_ref.rgba.g = lastDecodedScanline[posx].rgba.g;
                        px_ref.rgba.b = lastDecodedScanline[posx].rgba.b;
                    }
                    else
                    {
                        // Called I-LOCO intra prediction
                        RGBA pred = locoIntraPredictionSIMD(px.rgba, lastDecodedScanline[posx].rgba, lastDecodedScanline[posx-1].rgba);
                        px_ref.rgba.r = pred.r;
                        px_ref.rgba.g = pred.g;
                        px_ref.rgba.b = pred.b;
                    }
                }

            decode_op:

                int b1 = bytes[p++];
                if (b1 < 0x80) {        /* QOI_OP_LUMA */
                    int vg = ((b1 >> 4) & 7) - 4;
                    px.rgba.g = cast(ubyte)(px_ref.rgba.g + vg);
                    if (vg < 0) {
                        px.rgba.r = cast(ubyte)( px_ref.rgba.r + vg - 1 + ((b1 >> 2) & 3) );
                        px.rgba.b = cast(ubyte)( px_ref.rgba.b + vg - 1 +  (b1 & 3) );
                    }
                    else {
                        px.rgba.r = cast(ubyte)( px_ref.rgba.r + vg - 2 + ((b1 >> 2) & 3) );
                        px.rgba.b = cast(ubyte)( px_ref.rgba.b + vg - 2 +  (b1 & 3) );
                    }
                    index[index_pos++ & 63] = px;
                }
                else if (b1 < 0xc0) {    /* QOI_OP_INDEX */
                    px = index[b1 & 63];
                }
                else if (b1 < 0xe0) {    /* QOI_OP_LUMA2 */
                    int b2 = bytes[p++];
                    int vg = (b1 & 0x1f) - 16;
                    px.rgba.r = cast(ubyte)( px_ref.rgba.r + vg - 8 + ((b2 >> 4) & 0x0f) );
                    px.rgba.g = cast(ubyte)( px_ref.rgba.g + vg );
                    px.rgba.b = cast(ubyte)( px_ref.rgba.b + vg - 8 +  (b2       & 0x0f) );
                    index[index_pos++ & 63] = px;
                }
                else if (b1 < 0xe8) {    /* QOI_OP_LUMA3 */
                    int dv = (b1 << 8) | bytes[p++];
                    dv = (dv << 8) | bytes[p++];
                    int vg = ((dv >> 12) & 0x7f) - 64;
                    px.rgba.r = cast(ubyte)( px_ref.rgba.r + vg + ((dv >> 6) & 0x3f) - 32 );
                    px.rgba.g = cast(ubyte)( px_ref.rgba.g + vg );
                    px.rgba.b = cast(ubyte)( px_ref.rgba.b + vg + (dv & 0x3f) - 32 );
                    index[index_pos++ & 63] = px;
                }
                else if (b1 < 0xf0) {    /* QOI_OP_ADIFF */
                    px.rgba.a += (b1 & 7) - 4;
                    // The r,g,b chunk of this pixel follows. Re-check the bound first:
                    // a chain of ADIFF bytes must not walk past the end of the input.
                    if (p < chunks_len) {
                        goto decode_op;
                    }
                }
                else if (b1 < 0xf8) {    /* QOI_OP_RUN */
                    run = b1 & 7;
                }
                else if (b1 < 0xfc) {    /* QOI_OP_RUN2 */
                    run = ((b1 & 3) << 8) | bytes[p++];
                }
                else if (b1 == QOI_OP_GRAY) {
                    ubyte vg = bytes[p++];
                    px.rgba.r = vg;
                    px.rgba.g = vg;
                    px.rgba.b = vg;
                    index[index_pos++ & 63] = px;
                }
                else if (b1 == QOI_OP_RGB) {
                    px.rgba.r = bytes[p++];
                    px.rgba.g = bytes[p++];
                    px.rgba.b = bytes[p++];
                    index[index_pos++ & 63] = px;
                }
                else if (b1 == QOI_OP_RGBA) {
                    px.rgba.r = bytes[p++];
                    px.rgba.g = bytes[p++];
                    px.rgba.b = bytes[p++];
                    px.rgba.a = bytes[p++];
                    index[index_pos++ & 63] = px;
                }
                else {                   /* QOI_OP_END */
                    // End marker inside the pixel data: stop consuming input. The pixels
                    // still missing repeat `px`, so no uninitialized scanline memory is
                    // ever copied to the output.
                    p = chunks_len;
                }
            }

            decodedScanline[posx] = px;
        }

        // convert just-decoded scanline into output type
        ubyte* line = pixels + cast(ptrdiff_t) desc.pitchBytes * posy;

        switch(channels)
        {
            case 4:
                // No particular conversion to do
                memcpy(line, &decodedScanline[0], cast(size_t) desc.width * 4);
                break;

            case 3:
                // Drop alpha
                for (int posx = 0; posx < desc.width; ++posx)
                {
                    qoi_rgba_t decodedPx = decodedScanline[posx];
                    line[posx * 3 + 0] = decodedPx.rgba.r;
                    line[posx * 3 + 1] = decodedPx.rgba.g;
                    line[posx * 3 + 2] = decodedPx.rgba.b;
                }
                break;
            default:
                assert(false);
        }

        // swap decoded scanline buffers
        {
            qoi_rgba_t* temp = decodedScanline;
            decodedScanline = lastDecodedScanline;
            lastDecodedScanline = temp;
        }
    }

    return pixels;
}

private:

/* Perform LOCO-I prediction independently over the 4 channels.
   a, b and c are the values passed as first, second and third argument
   (callers pass the previous pixel, the pixel above, and the pixel above-left).
   Scalar equivalent, for one channel:

    int max_ab = a > b ? a : b;
    int min_ab = a < b ? a : b;
    if (c >= max_ab)
        return cast(ubyte)min_ab;
    else if (c <= min_ab)
        return cast(ubyte)max_ab;
    else
    {
        int d = a + b - c;
        if (d < 0)
            d = 0;
        if (d > 255)
            d = 255;
        return cast(ubyte)d;
    }
*/

static RGBA locoIntraPredictionSIMD(RGBA a, RGBA b, RGBA c)
{
    // load RGBA8 pixels
    __m128i A = _mm_loadu_si32(&a);
    __m128i B = _mm_loadu_si32(&b);
    __m128i C = _mm_loadu_si32(&c);

    // extend to 16-bits
    __m128i Z = _mm_setzero_si128();
    A = _mm_unpacklo_epi8(A, Z);
    B = _mm_unpacklo_epi8(B, Z);
    C = _mm_unpacklo_epi8(C, Z);

    // Gradient predictor (A + B - C), used where C lies strictly between A and B
    __m128i P = _mm_sub_epi16(_mm_add_epi16(A, B), C);
    __m128i maxAB = _mm_max_epi16(A, B);
    __m128i minAB = _mm_min_epi16(A, B);

    // 1111 where we should use max(A, B)
    __m128i maxMask = _mm_cmple_epi16(C, minAB);

    // 1111 where we should use min(A, B)
    __m128i minMask = _mm_cmpge_epi16(C, maxAB);

    P = (P & (~minMask)) | (minAB & minMask);
    P = (P & (~maxMask)) | (maxAB & maxMask);

    // Get back to u8, with unsigned saturation
    P = _mm_packus_epi16(P, Z);

    RGBA r;
    _mm_storeu_si32(&r, P);

    return r;
}

/* Per 16-bit lane: all ones where a <= b (signed), else zero. */
private __m128i _mm_cmple_epi16(__m128i a, __m128i b) pure @safe
{
    return _mm_or_si128(_mm_cmplt_epi16(a, b), _mm_cmpeq_epi16(a, b));
}

/* Per 16-bit lane: all ones where a >= b (signed), else zero. */
private __m128i _mm_cmpge_epi16(__m128i a, __m128i b) pure @safe
{
    return _mm_or_si128(_mm_cmpgt_epi16(a, b), _mm_cmpeq_epi16(a, b));
}