src/mesa/main/texcompress_astc.cpp

   1 /*
   2  * Copyright 2015 Philip Taylor <philip@zaynar.co.uk>
   3  * Copyright 2018 Advanced Micro Devices, Inc.
   4  *
   5  * Permission is hereby granted, free of charge, to any person obtaining a
   6  * copy of this software and associated documentation files (the "Software"),
   7  * to deal in the Software without restriction, including without limitation
   8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   9  * and/or sell copies of the Software, and to permit persons to whom the
  10  * Software is furnished to do so, subject to the following conditions:
  11  *
  12  * The above copyright notice and this permission notice (including the next
  13  * paragraph) shall be included in all copies or substantial portions of the
  14  * Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  21  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  22  * DEALINGS IN THE SOFTWARE.
  23  */
  24
/**
 * \file texcompress_astc.cpp
 *
 * Decompression code for GL_KHR_texture_compression_astc_ldr, which is just
 * ASTC 2D LDR.
 *
 * The ASTC 2D LDR decoder (without the sRGB part) was copied from the OASTC
 * library written by Philip Taylor. I added sRGB support and adjusted it for
 * Mesa. - Marek
 */
  35
  36 #include "texcompress_astc.h"
  37 #include "macros.h"
  38 #include "util/half_float.h"
  39 #include <stdio.h>
  40
/* Debug switches. VERBOSE_DECODE makes the block decoder print the bit fields
 * it reads (via InputBitVector::printf_bits); VERBOSE_WRITE makes
 * OutputBitVector print every append/skip. Both are off by default.
 */
static bool VERBOSE_DECODE = false;
static bool VERBOSE_WRITE = false;
  43
/**
 * Chain the two conversion helpers: the result of
 * _mesa_uint16_div_64k_to_half(v) is fed straight into _mesa_half_to_unorm8().
 *
 * NOTE(review): going by the helper names this maps v / 65536 to an 8-bit
 * UNORM through a half-float intermediate - confirm against the helpers'
 * definitions.
 */
static inline uint8_t
uint16_div_64k_to_half_to_unorm8(uint16_t v)
{
   return _mesa_half_to_unorm8(_mesa_uint16_div_64k_to_half(v));
}
  49
  50 class decode_error
  51 {
  52 public:
  53    enum type {
  54       ok,
  55       unsupported_hdr_void_extent,
  56       reserved_block_mode_1,
  57       reserved_block_mode_2,
  58       dual_plane_and_too_many_partitions,
  59       invalid_range_in_void_extent,
  60       weight_grid_exceeds_block_size,
  61       invalid_colour_endpoints_size,
  62       invalid_colour_endpoints_count,
  63       invalid_weight_bits,
  64       invalid_num_weights,
  65    };
  66 };
  67
  68
/* One row of the cem_ranges table: a quantisation range for colour endpoint
 * values and how it is encoded.
 */
struct cem_range {
   /* Largest value of the range; in every cem_ranges row this equals
    * 3^t * 5^q * 2^b - 1.
    */
   uint8_t max;
   /* Trit flag, quint flag (each 0 or 1 in cem_ranges) and number of plain bits. */
   uint8_t t, q, b;
};
  73
/* Based on the Color Unquantization Parameters table,
 * plus the bit-only representations, sorted by increasing size
 *
 * Columns: { max, t, q, b } - see struct cem_range. Each row has at most one
 * of t/q set, and max grows strictly from row to row.
 */
static cem_range cem_ranges[] = {
   { 5, 1, 0, 1 },
   { 7, 0, 0, 3 },
   { 9, 0, 1, 1 },
   { 11, 1, 0, 2 },
   { 15, 0, 0, 4 },
   { 19, 0, 1, 2 },
   { 23, 1, 0, 3 },
   { 31, 0, 0, 5 },
   { 39, 0, 1, 3 },
   { 47, 1, 0, 4 },
   { 63, 0, 0, 6 },
   { 79, 0, 1, 4 },
   { 95, 1, 0, 5 },
   { 127, 0, 0, 7 },
   { 159, 0, 1, 5 },
   { 191, 1, 0, 6 },
   { 255, 0, 0, 8 },
};
  96
  97 #define CAT_BITS_2(a, b)          ( ((a) << 1) | (b) )
  98 #define CAT_BITS_3(a, b, c)       ( ((a) << 2) | ((b) << 1) | (c) )
  99 #define CAT_BITS_4(a, b, c, d)    ( ((a) << 3) | ((b) << 2) | ((c) << 1) | (d) )
 100 #define CAT_BITS_5(a, b, c, d, e) ( ((a) << 4) | ((b) << 3) | ((c) << 2) | ((d) << 1) | (e) )
 101
 102 /**
 103  * Unpack 5n+8 bits from 'in' into 5 output values.
 104  * If n <= 4 then T should be uint32_t, else it must be uint64_t.
 105  */
 106 template <typename T>
 107 static void unpack_trit_block(int n, T in, uint8_t *out)
 108 {
 109    assert(n <= 6); /* else output will overflow uint8_t */
 110
 111    uint8_t T0 = (in >> (n)) & 0x1;
 112    uint8_t T1 = (in >> (n+1)) & 0x1;
 113    uint8_t T2 = (in >> (2*n+2)) & 0x1;
 114    uint8_t T3 = (in >> (2*n+3)) & 0x1;
 115    uint8_t T4 = (in >> (3*n+4)) & 0x1;
 116    uint8_t T5 = (in >> (4*n+5)) & 0x1;
 117    uint8_t T6 = (in >> (4*n+6)) & 0x1;
 118    uint8_t T7 = (in >> (5*n+7)) & 0x1;
 119    uint8_t mmask = (1 << n) - 1;
 120    uint8_t m0 = (in >> (0)) & mmask;
 121    uint8_t m1 = (in >> (n+2)) & mmask;
 122    uint8_t m2 = (in >> (2*n+4)) & mmask;
 123    uint8_t m3 = (in >> (3*n+5)) & mmask;
 124    uint8_t m4 = (in >> (4*n+7)) & mmask;
 125
 126    uint8_t C;
 127    uint8_t t4, t3, t2, t1, t0;
 128    if (CAT_BITS_3(T4, T3, T2) == 0x7) {
 129       C = CAT_BITS_5(T7, T6, T5, T1, T0);
 130       t4 = t3 = 2;
 131    } else {
 132       C = CAT_BITS_5(T4, T3, T2, T1, T0);
 133       if (CAT_BITS_2(T6, T5) == 0x3) {
 134          t4 = 2;
 135          t3 = T7;
 136       } else {
 137          t4 = T7;
 138          t3 = CAT_BITS_2(T6, T5);
 139       }
 140    }
 141
 142    if ((C & 0x3) == 0x3) {
 143       t2 = 2;
 144       t1 = (C >> 4) & 0x1;
 145       uint8_t C3 = (C >> 3) & 0x1;
 146       uint8_t C2 = (C >> 2) & 0x1;
 147       t0 = (C3 << 1) | (C2 & ~C3);
 148    } else if (((C >> 2) & 0x3) == 0x3) {
 149       t2 = 2;
 150       t1 = 2;
 151       t0 = C & 0x3;
 152    } else {
 153       t2 = (C >> 4) & 0x1;
 154       t1 = (C >> 2) & 0x3;
 155       uint8_t C1 = (C >> 1) & 0x1;
 156       uint8_t C0 = (C >> 0) & 0x1;
 157       t0 = (C1 << 1) | (C0 & ~C1);
 158    }
 159
 160    out[0] = (t0 << n) | m0;
 161    out[1] = (t1 << n) | m1;
 162    out[2] = (t2 << n) | m2;
 163    out[3] = (t3 << n) | m3;
 164    out[4] = (t4 << n) | m4;
 165 }
 166
 167 /**
 168  * Unpack 3n+7 bits from 'in' into 3 output values
 169  */
 170 static void unpack_quint_block(int n, uint32_t in, uint8_t *out)
 171 {
 172    assert(n <= 5); /* else output will overflow uint8_t */
 173
 174    uint8_t Q0 = (in >> (n)) & 0x1;
 175    uint8_t Q1 = (in >> (n+1)) & 0x1;
 176    uint8_t Q2 = (in >> (n+2)) & 0x1;
 177    uint8_t Q3 = (in >> (2*n+3)) & 0x1;
 178    uint8_t Q4 = (in >> (2*n+4)) & 0x1;
 179    uint8_t Q5 = (in >> (3*n+5)) & 0x1;
 180    uint8_t Q6 = (in >> (3*n+6)) & 0x1;
 181    uint8_t mmask = (1 << n) - 1;
 182    uint8_t m0 = (in >> (0)) & mmask;
 183    uint8_t m1 = (in >> (n+3)) & mmask;
 184    uint8_t m2 = (in >> (2*n+5)) & mmask;
 185
 186    uint8_t C;
 187    uint8_t q2, q1, q0;
 188    if (CAT_BITS_4(Q6, Q5, Q2, Q1) == 0x3) {
 189       q2 = CAT_BITS_3(Q0, Q4 & ~Q0, Q3 & ~Q0);
 190       q1 = 4;
 191       q0 = 4;
 192    } else {
 193       if (CAT_BITS_2(Q2, Q1) == 0x3) {
 194          q2 = 4;
 195          C = CAT_BITS_5(Q4, Q3, 0x1 & ~Q6, 0x1 & ~Q5, Q0);
 196       } else {
 197          q2 = CAT_BITS_2(Q6, Q5);
 198          C = CAT_BITS_5(Q4, Q3, Q2, Q1, Q0);
 199       }
 200       if ((C & 0x7) == 0x5) {
 201          q1 = 4;
 202          q0 = (C >> 3) & 0x3;
 203       } else {
 204          q1 = (C >> 3) & 0x3;
 205          q0 = C & 0x7;
 206       }
 207    }
 208    out[0] = (q0 << n) | m0;
 209    out[1] = (q1 << n) | m1;
 210    out[2] = (q2 << n) | m2;
 211 }
 212
 213
 214 struct uint8x4_t
 215 {
 216    uint8_t v[4];
 217
 218    uint8x4_t() { }
 219
 220    uint8x4_t(int a, int b, int c, int d)
 221    {
 222       assert(0 <= a && a <= 255);
 223       assert(0 <= b && b <= 255);
 224       assert(0 <= c && c <= 255);
 225       assert(0 <= d && d <= 255);
 226       v[0] = a;
 227       v[1] = b;
 228       v[2] = c;
 229       v[3] = d;
 230    }
 231
 232    static uint8x4_t clamped(int a, int b, int c, int d)
 233    {
 234       uint8x4_t r;
 235       r.v[0] = MAX2(0, MIN2(255, a));
 236       r.v[1] = MAX2(0, MIN2(255, b));
 237       r.v[2] = MAX2(0, MIN2(255, c));
 238       r.v[3] = MAX2(0, MIN2(255, d));
 239       return r;
 240    }
 241 };
 242
 243 static uint8x4_t blue_contract(int r, int g, int b, int a)
 244 {
 245    return uint8x4_t((r+b) >> 1, (g+b) >> 1, b, a);
 246 }
 247
 248 static uint8x4_t blue_contract_clamped(int r, int g, int b, int a)
 249 {
 250    return uint8x4_t::clamped((r+b) >> 1, (g+b) >> 1, b, a);
 251 }
 252
 253 static void bit_transfer_signed(int &a, int &b)
 254 {
 255    b >>= 1;
 256    b |= a & 0x80;
 257    a >>= 1;
 258    a &= 0x3f;
 259    if (a & 0x20)
 260       a -= 0x40;
 261 }
 262
 263 static uint32_t hash52(uint32_t p)
 264 {
 265    p ^= p >> 15;
 266    p -= p << 17;
 267    p += p << 7;
 268    p += p << 4;
 269    p ^= p >> 5;
 270    p += p << 16;
 271    p ^= p >> 7;
 272    p ^= p >> 3;
 273    p ^= p << 6;
 274    p ^= p >> 17;
 275    return p;
 276 }
 277
 278 static int select_partition(int seed, int x, int y, int z, int partitioncount,
 279                             int small_block)
 280 {
 281    if (small_block) {
 282       x <<= 1;
 283       y <<= 1;
 284       z <<= 1;
 285    }
 286    seed += (partitioncount - 1) * 1024;
 287    uint32_t rnum = hash52(seed);
 288    uint8_t seed1 = rnum & 0xF;
 289    uint8_t seed2 = (rnum >> 4) & 0xF;
 290    uint8_t seed3 = (rnum >> 8) & 0xF;
 291    uint8_t seed4 = (rnum >> 12) & 0xF;
 292    uint8_t seed5 = (rnum >> 16) & 0xF;
 293    uint8_t seed6 = (rnum >> 20) & 0xF;
 294    uint8_t seed7 = (rnum >> 24) & 0xF;
 295    uint8_t seed8 = (rnum >> 28) & 0xF;
 296    uint8_t seed9 = (rnum >> 18) & 0xF;
 297    uint8_t seed10 = (rnum >> 22) & 0xF;
 298    uint8_t seed11 = (rnum >> 26) & 0xF;
 299    uint8_t seed12 = ((rnum >> 30) | (rnum << 2)) & 0xF;
 300
 301    seed1 *= seed1;
 302    seed2 *= seed2;
 303    seed3 *= seed3;
 304    seed4 *= seed4;
 305    seed5 *= seed5;
 306    seed6 *= seed6;
 307    seed7 *= seed7;
 308    seed8 *= seed8;
 309    seed9 *= seed9;
 310    seed10 *= seed10;
 311    seed11 *= seed11;
 312    seed12 *= seed12;
 313
 314    int sh1, sh2, sh3;
 315    if (seed & 1) {
 316       sh1 = (seed & 2 ? 4 : 5);
 317       sh2 = (partitioncount == 3 ? 6 : 5);
 318    } else {
 319       sh1 = (partitioncount == 3 ? 6 : 5);
 320       sh2 = (seed & 2 ? 4 : 5);
 321    }
 322    sh3 = (seed & 0x10) ? sh1 : sh2;
 323
 324    seed1 >>= sh1;
 325    seed2 >>= sh2;
 326    seed3 >>= sh1;
 327    seed4 >>= sh2;
 328    seed5 >>= sh1;
 329    seed6 >>= sh2;
 330    seed7 >>= sh1;
 331    seed8 >>= sh2;
 332    seed9 >>= sh3;
 333    seed10 >>= sh3;
 334    seed11 >>= sh3;
 335    seed12 >>= sh3;
 336
 337    int a = seed1 * x + seed2 * y + seed11 * z + (rnum >> 14);
 338    int b = seed3 * x + seed4 * y + seed12 * z + (rnum >> 10);
 339    int c = seed5 * x + seed6 * y + seed9 * z + (rnum >> 6);
 340    int d = seed7 * x + seed8 * y + seed10 * z + (rnum >> 2);
 341
 342    a &= 0x3F;
 343    b &= 0x3F;
 344    c &= 0x3F;
 345    d &= 0x3F;
 346
 347    if (partitioncount < 4)
 348       d = 0;
 349    if (partitioncount < 3)
 350       c = 0;
 351
 352    if (a >= b && a >= c && a >= d)
 353       return 0;
 354    else if (b >= c && b >= d)
 355       return 1;
 356    else if (c >= d)
 357       return 2;
 358    else
 359       return 3;
 360 }
 361
 362
 363 struct InputBitVector
 364 {
 365    uint32_t data[4];
 366
 367    void printf_bits(int offset, int count, const char *fmt = "", ...)
 368    {
 369       char out[129];
 370       memset(out, '.', 128);
 371       out[128] = '\0';
 372       int idx = offset;
 373       for (int i = 0; i < count; ++i) {
 374          out[127 - idx] = ((data[idx >> 5] >> (idx & 31)) & 1) ? '1' : '0';
 375          ++idx;
 376       }
 377       printf("%s ", out);
 378       va_list ap;
 379       va_start(ap, fmt);
 380       vprintf(fmt, ap);
 381       va_end(ap);
 382       printf("\n");
 383    }
 384
 385    uint32_t get_bits(int offset, int count)
 386    {
 387       assert(count >= 0 && count < 32);
 388
 389       uint32_t out = 0;
 390       if (offset < 32)
 391          out |= data[0] >> offset;
 392
 393       if (0 < offset && offset <= 32)
 394          out |= data[1] << (32 - offset);
 395       if (32 < offset && offset < 64)
 396          out |= data[1] >> (offset - 32);
 397
 398       if (32 < offset && offset <= 64)
 399          out |= data[2] << (64 - offset);
 400       if (64 < offset && offset < 96)
 401          out |= data[2] >> (offset - 64);
 402
 403       if (64 < offset && offset <= 96)
 404          out |= data[3] << (96 - offset);
 405       if (96 < offset && offset < 128)
 406          out |= data[3] >> (offset - 96);
 407
 408       out &= (1 << count) - 1;
 409       return out;
 410    }
 411
 412    uint64_t get_bits64(int offset, int count)
 413    {
 414       assert(count >= 0 && count < 64);
 415
 416       uint64_t out = 0;
 417       if (offset < 32)
 418          out |= data[0] >> offset;
 419
 420       if (offset <= 32)
 421          out |= (uint64_t)data[1] << (32 - offset);
 422       if (32 < offset && offset < 64)
 423          out |= data[1] >> (offset - 32);
 424
 425       if (0 < offset && offset <= 64)
 426          out |= (uint64_t)data[2] << (64 - offset);
 427       if (64 < offset && offset < 96)
 428          out |= data[2] >> (offset - 64);
 429
 430       if (32 < offset && offset <= 96)
 431          out |= (uint64_t)data[3] << (96 - offset);
 432       if (96 < offset && offset < 128)
 433          out |= data[3] >> (offset - 96);
 434
 435       out &= ((uint64_t)1 << count) - 1;
 436       return out;
 437    }
 438
 439    uint32_t get_bits_rev(int offset, int count)
 440    {
 441       assert(offset >= count);
 442       uint32_t tmp = get_bits(offset - count, count);
 443       uint32_t out = 0;
 444       for (int i = 0; i < count; ++i)
 445          out |= ((tmp >> i) & 1) << (count - 1 - i);
 446       return out;
 447    }
 448 };
 449
 450 struct OutputBitVector
 451 {
 452    uint32_t data[4];
 453    int offset;
 454
 455    OutputBitVector()
 456       : offset(0)
 457    {
 458       memset(data, 0, sizeof(data));
 459    }
 460
 461    void append(uint32_t value, int size)
 462    {
 463       if (VERBOSE_WRITE)
 464          printf("append offset=%d size=%d values=0x%x\n", offset, size, value);
 465
 466       assert(offset + size <= 128);
 467
 468       assert(size <= 32);
 469       if (size < 32)
 470          assert((value >> size) == 0);
 471
 472       while (size) {
 473          int c = MIN2(size, 32 - (offset & 31));
 474          data[offset >> 5] |= (value << (offset & 31));
 475          offset += c;
 476          size -= c;
 477          value >>= c;
 478       }
 479    }
 480
 481    void append64(uint64_t value, int size)
 482    {
 483       if (VERBOSE_WRITE)
 484          printf("append offset=%d size=%d values=0x%llx\n", offset, size, (unsigned long long)value);
 485
 486       assert(offset + size <= 128);
 487
 488       assert(size <= 64);
 489       if (size < 64)
 490          assert((value >> size) == 0);
 491
 492       while (size) {
 493          int c = MIN2(size, 32 - (offset & 31));
 494          data[offset >> 5] |= (value << (offset & 31));
 495          offset += c;
 496          size -= c;
 497          value >>= c;
 498       }
 499    }
 500
 501    void append(OutputBitVector &v, int size)
 502    {
 503       if (VERBOSE_WRITE)
 504          printf("append vector offset=%d size=%d\n", offset, size);
 505
 506       assert(offset + size <= 128);
 507       int i = 0;
 508       while (size >= 32) {
 509          append(v.data[i++], 32);
 510          size -= 32;
 511       }
 512       if (size > 0)
 513          append(v.data[i] & ((1 << size) - 1), size);
 514    }
 515
 516    void append_end(OutputBitVector &v, int size)
 517    {
 518       for (int i = 0; i < size; ++i)
 519          data[(127 - i) >> 5] |= ((v.data[i >> 5] >> (i & 31)) & 1) << ((127 - i) & 31);
 520    }
 521
 522    /* Insert the given number of '1' bits. (We could use 0s instead, but 1s are
 523     * more likely to flush out bugs where we accidentally read undefined bits.)
 524     */
 525    void skip(int size)
 526    {
 527       if (VERBOSE_WRITE)
 528          printf("skip offset=%d size=%d\n", offset, size);
 529
 530       assert(offset + size <= 128);
 531       while (size >= 32) {
 532          append(0xffffffff, 32);
 533          size -= 32;
 534       }
 535       if (size > 0)
 536          append(0xffffffff >> (32 - size), size);
 537    }
 538 };
 539
 540
 541 class Decoder
 542 {
 543 public:
 544    Decoder(int block_w, int block_h, int block_d, bool srgb, bool output_unorm8)
 545       : block_w(block_w), block_h(block_h), block_d(block_d), srgb(srgb),
 546         output_unorm8(output_unorm8) {}
 547
 548    decode_error::type decode(const uint8_t *in, uint16_t *output) const;
 549
 550    int block_w, block_h, block_d;
 551    bool srgb, output_unorm8;
 552 };
 553
/**
 * Working state for decoding a single compressed block. The decode_*(),
 * unpack_*(), calculate_*() and unquantise_*() methods fill the fields in
 * stages; the comments below name the method that produces each group where
 * that is known.
 */
struct Block
{
   bool is_error;
   bool bogus_colour_endpoints;
   bool bogus_weights;

   /* Set by decode_block_mode(): high_prec and dual_plane come from bits 9
    * and 10 of the block, wt_range and the weight grid size from the
    * remaining block mode bits.
    */
   int high_prec;
   int dual_plane;
   int colour_component_selector;
   int wt_range;
   int wt_w, wt_h, wt_d;
   int num_parts;
   /* Set by decode_cem(): bits 13..22 when num_parts > 1, otherwise -1. */
   int partition_index;

   /* Set by decode_void_extent(): */
   bool is_void_extent;
   int void_extent_d;
   int void_extent_min_s;
   int void_extent_max_s;
   int void_extent_min_t;
   int void_extent_max_t;
   uint16_t void_extent_colour_r;
   uint16_t void_extent_colour_g;
   uint16_t void_extent_colour_b;
   uint16_t void_extent_colour_a;

   /* Set by decode_cem(): */
   bool is_multi_cem;
   int num_extra_cem_bits;
   /* Bit position where the colour endpoint data starts: 17 for a single
    * partition, 29 otherwise.
    */
   int colour_endpoint_data_offset;
   int extra_cem_bits;
   int cem_base_class;
   /* Colour endpoint mode per partition; -1 for unused partitions. */
   int cems[4];

   int num_cem_values;

   /* Calculated by unpack_weights(): */
   uint8_t weights_quant[64 + 4]; /* max 64 values, plus padding for overflows in trit parsing */

   /* Calculated by unquantise_weights(): */
   uint8_t weights[64 + 18]; /* max 64 values, plus padding for the infill interpolation */

   /* Calculated by unpack_colour_endpoints(): */
   uint8_t colour_endpoints_quant[18 + 4]; /* max 18 values, plus padding for overflows in trit parsing */

   /* Calculated by unquantise_colour_endpoints(): */
   uint8_t colour_endpoints[18];

   /* Calculated by calculate_from_weights(): */
   int wt_trits;
   int wt_quints;
   int wt_bits;
   int wt_max;
   int num_weights;
   int weight_bits;

   /* Calculated by calculate_remaining_bits(): */
   int remaining_bits;

   /* Calculated by calculate_colour_endpoints_size(): */
   int colour_endpoint_bits;
   int ce_max;
   int ce_trits;
   int ce_quints;
   int ce_bits;

   /* Calculated by compute_infill_weights(); */
   uint8_t infill_weights[2][216]; /* large enough for 6x6x6 */

   /* Calculated by decode_colour_endpoints(); */
   uint8x4_t endpoints_decoded[2][4];

   void calculate_from_weights();
   void calculate_remaining_bits();
   decode_error::type calculate_colour_endpoints_size();

   void unquantise_weights();
   void unquantise_colour_endpoints();

   /* Entry point: decode the whole block described by 'in'. */
   decode_error::type decode(const Decoder &decoder, InputBitVector in);

   decode_error::type decode_block_mode(InputBitVector in);
   decode_error::type decode_void_extent(InputBitVector in);
   void decode_cem(InputBitVector in);
   void unpack_colour_endpoints(InputBitVector in);
   void decode_colour_endpoints();
   void unpack_weights(InputBitVector in);
   void compute_infill_weights(int block_w, int block_h, int block_d);

   /* Write the decoded texels to 'output'. */
   void write_decoded(const Decoder &decoder, uint16_t *output);
};
 643
 644
 645 decode_error::type Decoder::decode(const uint8_t *in, uint16_t *output) const
 646 {
 647    Block blk;
 648    InputBitVector in_vec;
 649    memcpy(&in_vec.data, in, 16);
 650    decode_error::type err = blk.decode(*this, in_vec);
 651    if (err == decode_error::ok) {
 652       blk.write_decoded(*this, output);
 653    } else {
 654       /* Fill output with the error colour */
 655       for (int i = 0; i < block_w * block_h * block_d; ++i) {
 656          if (output_unorm8) {
 657             output[i*4+0] = 0xff;
 658             output[i*4+1] = 0;
 659             output[i*4+2] = 0xff;
 660             output[i*4+3] = 0xff;
 661          } else {
 662             assert(!srgb); /* srgb must use unorm8 */
 663
 664             output[i*4+0] = FP16_ONE;
 665             output[i*4+1] = FP16_ZERO;
 666             output[i*4+2] = FP16_ONE;
 667             output[i*4+3] = FP16_ONE;
 668          }
 669       }
 670    }
 671    return err;
 672 }
 673
 674
 675 decode_error::type Block::decode_void_extent(InputBitVector block)
 676 {
 677    /* TODO: 3D */
 678
 679    is_void_extent = true;
 680    void_extent_d = block.get_bits(9, 1);
 681    void_extent_min_s = block.get_bits(12, 13);
 682    void_extent_max_s = block.get_bits(25, 13);
 683    void_extent_min_t = block.get_bits(38, 13);
 684    void_extent_max_t = block.get_bits(51, 13);
 685    void_extent_colour_r = block.get_bits(64, 16);
 686    void_extent_colour_g = block.get_bits(80, 16);
 687    void_extent_colour_b = block.get_bits(96, 16);
 688    void_extent_colour_a = block.get_bits(112, 16);
 689
 690    /* TODO: maybe we should do something useful with the extent coordinates? */
 691
 692    if (void_extent_d) {
 693       return decode_error::unsupported_hdr_void_extent;
 694    }
 695
 696    if (void_extent_min_s == 0x1fff && void_extent_max_s == 0x1fff
 697        && void_extent_min_t == 0x1fff && void_extent_max_t == 0x1fff) {
 698
 699       /* No extents */
 700
 701    } else {
 702
 703       /* Check for illegal encoding */
 704       if (void_extent_min_s >= void_extent_max_s || void_extent_min_t >= void_extent_max_t) {
 705          return decode_error::invalid_range_in_void_extent;
 706       }
 707    }
 708
 709    return decode_error::ok;
 710 }
 711
 712 decode_error::type Block::decode_block_mode(InputBitVector in)
 713 {
 714    dual_plane = in.get_bits(10, 1);
 715    high_prec = in.get_bits(9, 1);
 716
 717    if (in.get_bits(0, 2) != 0x0) {
 718       wt_range = (in.get_bits(0, 2) << 1) | in.get_bits(4, 1);
 719       int a = in.get_bits(5, 2);
 720       int b = in.get_bits(7, 2);
 721       switch (in.get_bits(2, 2)) {
 722       case 0x0:
 723          if (VERBOSE_DECODE)
 724             in.printf_bits(0, 11, "DHBBAAR00RR");
 725          wt_w = b + 4;
 726          wt_h = a + 2;
 727          break;
 728       case 0x1:
 729          if (VERBOSE_DECODE)
 730             in.printf_bits(0, 11, "DHBBAAR01RR");
 731          wt_w = b + 8;
 732          wt_h = a + 2;
 733          break;
 734       case 0x2:
 735          if (VERBOSE_DECODE)
 736             in.printf_bits(0, 11, "DHBBAAR10RR");
 737          wt_w = a + 2;
 738          wt_h = b + 8;
 739          break;
 740       case 0x3:
 741          if ((b & 0x2) == 0) {
 742             if (VERBOSE_DECODE)
 743                in.printf_bits(0, 11, "DH0BAAR11RR");
 744             wt_w = a + 2;
 745             wt_h = b + 6;
 746          } else {
 747             if (VERBOSE_DECODE)
 748                in.printf_bits(0, 11, "DH1BAAR11RR");
 749             wt_w = (b & 0x1) + 2;
 750             wt_h = a + 2;
 751          }
 752          break;
 753       }
 754    } else {
 755       if (in.get_bits(6, 3) == 0x7) {
 756          if (in.get_bits(0, 9) == 0x1fc) {
 757             if (VERBOSE_DECODE)
 758                in.printf_bits(0, 11, "xx111111100 (void extent)");
 759             return decode_void_extent(in);
 760          } else {
 761             if (VERBOSE_DECODE)
 762                in.printf_bits(0, 11, "xx111xxxx00");
 763             return decode_error::reserved_block_mode_1;
 764          }
 765       }
 766       if (in.get_bits(0, 4) == 0x0) {
 767          if (VERBOSE_DECODE)
 768             in.printf_bits(0, 11, "xxxxxxx0000");
 769          return decode_error::reserved_block_mode_2;
 770       }
 771
 772       wt_range = in.get_bits(1, 3) | in.get_bits(4, 1);
 773       int a = in.get_bits(5, 2);
 774       int b;
 775
 776       switch (in.get_bits(7, 2)) {
 777       case 0x0:
 778          if (VERBOSE_DECODE)
 779             in.printf_bits(0, 11, "DH00AARRR00");
 780          wt_w = 12;
 781          wt_h = a + 2;
 782          break;
 783       case 0x1:
 784          if (VERBOSE_DECODE)
 785             in.printf_bits(0, 11, "DH01AARRR00");
 786          wt_w = a + 2;
 787          wt_h = 12;
 788          break;
 789       case 0x3:
 790          if (in.get_bits(5, 1) == 0) {
 791             if (VERBOSE_DECODE)
 792                in.printf_bits(0, 11, "DH1100RRR00");
 793             wt_w = 6;
 794             wt_h = 10;
 795          } else {
 796             if (VERBOSE_DECODE)
 797                in.printf_bits(0, 11, "DH1101RRR00");
 798             wt_w = 10;
 799             wt_h = 6;
 800          }
 801          break;
 802       case 0x2:
 803          if (VERBOSE_DECODE)
 804             in.printf_bits(0, 11, "BB10AARRR00");
 805          b = in.get_bits(9, 2);
 806          wt_w = a + 6;
 807          wt_h = b + 6;
 808          dual_plane = 0;
 809          high_prec = 0;
 810          break;
 811       }
 812    }
 813    return decode_error::ok;
 814 }
 815
 816 void Block::decode_cem(InputBitVector in)
 817 {
 818    cems[0] = cems[1] = cems[2] = cems[3] = -1;
 819
 820    num_extra_cem_bits = 0;
 821    extra_cem_bits = 0;
 822
 823    if (num_parts > 1) {
 824
 825       partition_index = in.get_bits(13, 10);
 826       if (VERBOSE_DECODE)
 827          in.printf_bits(13, 10, "partition ID (%d)", partition_index);
 828
 829       uint32_t cem = in.get_bits(23, 6);
 830
 831       if ((cem & 0x3) == 0x0) {
 832          cem >>= 2;
 833          cem_base_class = cem >> 2;
 834          is_multi_cem = false;
 835
 836          for (int i = 0; i < num_parts; ++i)
 837             cems[i] = cem;
 838
 839          if (VERBOSE_DECODE)
 840             in.printf_bits(23, 6, "CEM (single, %d)", cem);
 841       } else {
 842
 843          cem_base_class = (cem & 0x3) - 1;
 844          is_multi_cem = true;
 845
 846          if (VERBOSE_DECODE)
 847             in.printf_bits(23, 6, "CEM (multi, base class %d)", cem_base_class);
 848
 849          int offset = 128 - weight_bits;
 850
 851          if (num_parts == 2) {
 852             if (VERBOSE_DECODE) {
 853                in.printf_bits(25, 4, "M0M0 C1 C0");
 854                in.printf_bits(offset - 2, 2, "M1M1");
 855             }
 856
 857             uint32_t c0 = in.get_bits(25, 1);
 858             uint32_t c1 = in.get_bits(26, 1);
 859
 860             extra_cem_bits = c0 + c1;
 861
 862             num_extra_cem_bits = 2;
 863
 864             uint32_t m0 = in.get_bits(27, 2);
 865             uint32_t m1 = in.get_bits(offset - 2, 2);
 866
 867             cems[0] = ((cem_base_class + c0) << 2) | m0;
 868             cems[1] = ((cem_base_class + c1) << 2) | m1;
 869
 870          } else if (num_parts == 3) {
 871             if (VERBOSE_DECODE) {
 872                in.printf_bits(25, 4, "M0 C2 C1 C0");
 873                in.printf_bits(offset - 5, 5, "M2M2 M1M1 M0");
 874             }
 875
 876             uint32_t c0 = in.get_bits(25, 1);
 877             uint32_t c1 = in.get_bits(26, 1);
 878             uint32_t c2 = in.get_bits(27, 1);
 879
 880             extra_cem_bits = c0 + c1 + c2;
 881
 882             num_extra_cem_bits = 5;
 883
 884             uint32_t m0 = in.get_bits(28, 1) | (in.get_bits(128 - weight_bits - 5, 1) << 1);
 885             uint32_t m1 = in.get_bits(offset - 4, 2);
 886             uint32_t m2 = in.get_bits(offset - 2, 2);
 887
 888             cems[0] = ((cem_base_class + c0) << 2) | m0;
 889             cems[1] = ((cem_base_class + c1) << 2) | m1;
 890             cems[2] = ((cem_base_class + c2) << 2) | m2;
 891
 892          } else if (num_parts == 4) {
 893             if (VERBOSE_DECODE) {
 894                in.printf_bits(25, 4, "C3 C2 C1 C0");
 895                in.printf_bits(offset - 8, 8, "M3M3 M2M2 M1M1 M0M0");
 896             }
 897
 898             uint32_t c0 = in.get_bits(25, 1);
 899             uint32_t c1 = in.get_bits(26, 1);
 900             uint32_t c2 = in.get_bits(27, 1);
 901             uint32_t c3 = in.get_bits(28, 1);
 902
 903             extra_cem_bits = c0 + c1 + c2 + c3;
 904
 905             num_extra_cem_bits = 8;
 906
 907             uint32_t m0 = in.get_bits(offset - 8, 2);
 908             uint32_t m1 = in.get_bits(offset - 6, 2);
 909             uint32_t m2 = in.get_bits(offset - 4, 2);
 910             uint32_t m3 = in.get_bits(offset - 2, 2);
 911
 912             cems[0] = ((cem_base_class + c0) << 2) | m0;
 913             cems[1] = ((cem_base_class + c1) << 2) | m1;
 914             cems[2] = ((cem_base_class + c2) << 2) | m2;
 915             cems[3] = ((cem_base_class + c3) << 2) | m3;
 916          } else {
 917             unreachable("");
 918          }
 919       }
 920
 921       colour_endpoint_data_offset = 29;
 922
 923    } else {
 924       uint32_t cem = in.get_bits(13, 4);
 925
 926       cem_base_class = cem >> 2;
 927       is_multi_cem = false;
 928
 929       cems[0] = cem;
 930
 931       partition_index = -1;
 932
 933       if (VERBOSE_DECODE)
 934          in.printf_bits(13, 4, "CEM = %d (class %d)", cem, cem_base_class);
 935
 936       colour_endpoint_data_offset = 17;
 937    }
 938 }
 939
 940 void Block::unpack_colour_endpoints(InputBitVector in)
 941 {
 942    if (ce_trits) {
 943       int offset = colour_endpoint_data_offset;
 944       int bits_left = colour_endpoint_bits;
 945       for (int i = 0; i < num_cem_values; i += 5) {
 946          int bits_to_read = MIN2(bits_left, 8 + ce_bits * 5);
 947          /* If ce_trits then ce_bits <= 6, so bits_to_read <= 38 and we have to use uint64_t */
 948          uint64_t raw = in.get_bits64(offset, bits_to_read);
 949          unpack_trit_block(ce_bits, raw, &colour_endpoints_quant[i]);
 950
 951          if (VERBOSE_DECODE)
 952             in.printf_bits(offset, bits_to_read,
 953                            "trits [%d,%d,%d,%d,%d]",
 954                            colour_endpoints_quant[i+0], colour_endpoints_quant[i+1],
 955                   colour_endpoints_quant[i+2], colour_endpoints_quant[i+3],
 956                   colour_endpoints_quant[i+4]);
 957
 958          offset += 8 + ce_bits * 5;
 959          bits_left -= 8 + ce_bits * 5;
 960       }
 961    } else if (ce_quints) {
 962       int offset = colour_endpoint_data_offset;
 963       int bits_left = colour_endpoint_bits;
 964       for (int i = 0; i < num_cem_values; i += 3) {
 965          int bits_to_read = MIN2(bits_left, 7 + ce_bits * 3);
 966          /* If ce_quints then ce_bits <= 5, so bits_to_read <= 22 and we can use uint32_t */
 967          uint32_t raw = in.get_bits(offset, bits_to_read);
 968          unpack_quint_block(ce_bits, raw, &colour_endpoints_quant[i]);
 969
 970          if (VERBOSE_DECODE)
 971             in.printf_bits(offset, bits_to_read,
 972                            "quints [%d,%d,%d]",
 973                            colour_endpoints_quant[i], colour_endpoints_quant[i+1], colour_endpoints_quant[i+2]);
 974
 975          offset += 7 + ce_bits * 3;
 976          bits_left -= 7 + ce_bits * 3;
 977       }
 978    } else {
 979       assert((colour_endpoint_bits % ce_bits) == 0);
 980       int offset = colour_endpoint_data_offset;
 981       for (int i = 0; i < num_cem_values; i++) {
 982          colour_endpoints_quant[i] = in.get_bits(offset, ce_bits);
 983
 984          if (VERBOSE_DECODE)
 985             in.printf_bits(offset, ce_bits, "bits [%d]", colour_endpoints_quant[i]);
 986
 987          offset += ce_bits;
 988       }
 989    }
 990 }
 991
 992 void Block::decode_colour_endpoints()
 993 {
 994    int cem_values_idx = 0;
 995    for (int part = 0; part < num_parts; ++part) {
 996       uint8_t *v = &colour_endpoints[cem_values_idx];
 997       int v0 = v[0];
 998       int v1 = v[1];
 999       int v2 = v[2];
1000       int v3 = v[3];
1001       int v4 = v[4];
1002       int v5 = v[5];
1003       int v6 = v[6];
1004       int v7 = v[7];
1005       cem_values_idx += ((cems[part] >> 2) + 1) * 2;
1006
1007       uint8x4_t e0, e1;
1008       int s0, s1, L0, L1;
1009
1010       switch (cems[part])
1011       {
1012       case 0:
1013          e0 = uint8x4_t(v0, v0, v0, 0xff);
1014          e1 = uint8x4_t(v1, v1, v1, 0xff);
1015          break;
1016       case 1:
1017          L0 = (v0 >> 2) | (v1 & 0xc0);
1018          L1 = L0 + (v1 & 0x3f);
1019          if (L1 > 0xff)
1020             L1 = 0xff;
1021          e0 = uint8x4_t(L0, L0, L0, 0xff);
1022          e1 = uint8x4_t(L1, L1, L1, 0xff);
1023          break;
1024       case 4:
1025          e0 = uint8x4_t(v0, v0, v0, v2);
1026          e1 = uint8x4_t(v1, v1, v1, v3);
1027          break;
1028       case 5:
1029          bit_transfer_signed(v1, v0);
1030          bit_transfer_signed(v3, v2);
1031          e0 = uint8x4_t(v0, v0, v0, v2);
1032          e1 = uint8x4_t::clamped(v0+v1, v0+v1, v0+v1, v2+v3);
1033          break;
1034       case 6:
1035          e0 = uint8x4_t(v0*v3 >> 8, v1*v3 >> 8, v2*v3 >> 8, 0xff);
1036          e1 = uint8x4_t(v0, v1, v2, 0xff);
1037          break;
1038       case 8:
1039          s0 = v0 + v2 + v4;
1040          s1 = v1 + v3 + v5;
1041          if (s1 >= s0) {
1042             e0 = uint8x4_t(v0, v2, v4, 0xff);
1043             e1 = uint8x4_t(v1, v3, v5, 0xff);
1044          } else {
1045             e0 = blue_contract(v1, v3, v5, 0xff);
1046             e1 = blue_contract(v0, v2, v4, 0xff);
1047          }
1048          break;
1049       case 9:
1050          bit_transfer_signed(v1, v0);
1051          bit_transfer_signed(v3, v2);
1052          bit_transfer_signed(v5, v4);
1053          if (v1 + v3 + v5 >= 0) {
1054             e0 = uint8x4_t(v0, v2, v4, 0xff);
1055             e1 = uint8x4_t::clamped(v0+v1, v2+v3, v4+v5, 0xff);
1056          } else {
1057             e0 = blue_contract_clamped(v0+v1, v2+v3, v4+v5, 0xff);
1058             e1 = blue_contract(v0, v2, v4, 0xff);
1059          }
1060          break;
1061       case 10:
1062          e0 = uint8x4_t(v0*v3 >> 8, v1*v3 >> 8, v2*v3 >> 8, v4);
1063          e1 = uint8x4_t(v0, v1, v2, v5);
1064          break;
1065       case 12:
1066          s0 = v0 + v2 + v4;
1067          s1 = v1 + v3 + v5;
1068          if (s1 >= s0) {
1069             e0 = uint8x4_t(v0, v2, v4, v6);
1070             e1 = uint8x4_t(v1, v3, v5, v7);
1071          } else {
1072             e0 = blue_contract(v1, v3, v5, v7);
1073             e1 = blue_contract(v0, v2, v4, v6);
1074          }
1075          break;
1076       case 13:
1077          bit_transfer_signed(v1, v0);
1078          bit_transfer_signed(v3, v2);
1079          bit_transfer_signed(v5, v4);
1080          bit_transfer_signed(v7, v6);
1081          if (v1 + v3 + v5 >= 0) {
1082             e0 = uint8x4_t(v0, v2, v4, v6);
1083             e1 = uint8x4_t::clamped(v0+v1, v2+v3, v4+v5, v6+v7);
1084          } else {
1085             e0 = blue_contract_clamped(v0+v1, v2+v3, v4+v5, v6+v7);
1086             e1 = blue_contract(v0, v2, v4, v6);
1087          }
1088          break;
1089       default:
1090          /* HDR endpoints not supported; return error colour */
1091          e0 = uint8x4_t(255, 0, 255, 255);
1092          e1 = uint8x4_t(255, 0, 255, 255);
1093          break;
1094       }
1095
1096       endpoints_decoded[0][part] = e0;
1097       endpoints_decoded[1][part] = e1;
1098
1099       if (VERBOSE_DECODE) {
1100          printf("cems[%d]=%d v=[", part, cems[part]);
1101          for (int i = 0; i < (cems[part] >> 2) + 1; ++i) {
1102             if (i)
1103                printf(", ");
1104             printf("%3d", v[i]);
1105          }
1106          printf("] e0=[%3d,%4d,%4d,%4d] e1=[%3d,%4d,%4d,%4d]\n",
1107                 e0.v[0], e0.v[1], e0.v[2], e0.v[3],
1108                e1.v[0], e1.v[1], e1.v[2], e1.v[3]);
1109       }
1110    }
1111 }
1112
1113 void Block::unpack_weights(InputBitVector in)
1114 {
1115    if (wt_trits) {
1116       int offset = 128;
1117       int bits_left = weight_bits;
1118       for (int i = 0; i < num_weights; i += 5) {
1119          int bits_to_read = MIN2(bits_left, 8 + 5*wt_bits);
1120          /* If wt_trits then wt_bits <= 3, so bits_to_read <= 23 and we can use uint32_t */
1121          uint32_t raw = in.get_bits_rev(offset, bits_to_read);
1122          unpack_trit_block(wt_bits, raw, &weights_quant[i]);
1123
1124          if (VERBOSE_DECODE)
1125             in.printf_bits(offset - bits_to_read, bits_to_read, "weight trits [%d,%d,%d,%d,%d]",
1126                            weights_quant[i+0], weights_quant[i+1],
1127                   weights_quant[i+2], weights_quant[i+3],
1128                   weights_quant[i+4]);
1129
1130          offset -= 8 + wt_bits * 5;
1131          bits_left -= 8 + wt_bits * 5;
1132       }
1133
1134    } else if (wt_quints) {
1135
1136       int offset = 128;
1137       int bits_left = weight_bits;
1138       for (int i = 0; i < num_weights; i += 3) {
1139          int bits_to_read = MIN2(bits_left, 7 + 3*wt_bits);
1140          /* If wt_quints then wt_bits <= 2, so bits_to_read <= 13 and we can use uint32_t */
1141          uint32_t raw = in.get_bits_rev(offset, bits_to_read);
1142          unpack_quint_block(wt_bits, raw, &weights_quant[i]);
1143
1144          if (VERBOSE_DECODE)
1145             in.printf_bits(offset - bits_to_read, bits_to_read, "weight quints [%d,%d,%d]",
1146                            weights_quant[i], weights_quant[i+1], weights_quant[i+2]);
1147
1148          offset -= 7 + wt_bits * 3;
1149          bits_left -= 7 + wt_bits * 3;
1150       }
1151
1152    } else {
1153       int offset = 128;
1154       assert((weight_bits % wt_bits) == 0);
1155       for (int i = 0; i < num_weights; ++i) {
1156          weights_quant[i] = in.get_bits_rev(offset, wt_bits);
1157
1158          if (VERBOSE_DECODE)
1159             in.printf_bits(offset - wt_bits, wt_bits, "weight bits [%d]", weights_quant[i]);
1160
1161          offset -= wt_bits;
1162       }
1163    }
1164 }
1165
1166 void Block::unquantise_weights()
1167 {
1168    assert(num_weights <= (int)ARRAY_SIZE(weights_quant));
1169    assert(num_weights <= (int)ARRAY_SIZE(weights));
1170
1171    memset(weights, 0, sizeof(weights));
1172
1173    for (int i = 0; i < num_weights; ++i) {
1174
1175       uint8_t v = weights_quant[i];
1176       uint8_t w;
1177
1178       if (wt_trits) {
1179
1180          if (wt_bits == 0) {
1181             w = v * 32;
1182          } else {
1183             uint8_t A, B, C, D;
1184             A = (v & 0x1) ? 0x7F : 0x00;
1185             switch (wt_bits) {
1186             case 1:
1187                B = 0;
1188                C = 50;
1189                D = v >> 1;
1190                break;
1191             case 2:
1192                B = (v & 0x2) ? 0x45 : 0x00;
1193                C = 23;
1194                D = v >> 2;
1195                break;
1196             case 3:
1197                B = ((v & 0x6) >> 1) | ((v & 0x6) << 4);
1198                C = 11;
1199                D = v >> 3;
1200                break;
1201             default:
1202                unreachable("");
1203             }
1204             uint16_t T = D * C + B;
1205             T = T ^ A;
1206             T = (A & 0x20) | (T >> 2);
1207             assert(T < 64);
1208             if (T > 32)
1209                T++;
1210             w = T;
1211          }
1212
1213       } else if (wt_quints) {
1214
1215          if (wt_bits == 0) {
1216             w = v * 16;
1217          } else {
1218             uint8_t A, B, C, D;
1219             A = (v & 0x1) ? 0x7F : 0x00;
1220             switch (wt_bits) {
1221             case 1:
1222                B = 0;
1223                C = 28;
1224                D = v >> 1;
1225                break;
1226             case 2:
1227                B = (v & 0x2) ? 0x42 : 0x00;
1228                C = 13;
1229                D = v >> 2;
1230                break;
1231             default:
1232                unreachable("");
1233             }
1234             uint16_t T = D * C + B;
1235             T = T ^ A;
1236             T = (A & 0x20) | (T >> 2);
1237             assert(T < 64);
1238             if (T > 32)
1239                T++;
1240             w = T;
1241          }
1242          weights[i] = w;
1243
1244       } else {
1245
1246          switch (wt_bits) {
1247          case 1: w = v ? 0x3F : 0x00; break;
1248          case 2: w = v | (v << 2) | (v << 4); break;
1249          case 3: w = v | (v << 3); break;
1250          case 4: w = (v >> 2) | (v << 2); break;
1251          case 5: w = (v >> 4) | (v << 1); break;
1252          default: unreachable("");
1253          }
1254          assert(w < 64);
1255          if (w > 32)
1256             w++;
1257       }
1258       weights[i] = w;
1259    }
1260 }
1261
1262 void Block::compute_infill_weights(int block_w, int block_h, int block_d)
1263 {
1264    int Ds = block_w <= 1 ? 0 : (1024 + block_w / 2) / (block_w - 1);
1265    int Dt = block_h <= 1 ? 0 : (1024 + block_h / 2) / (block_h - 1);
1266    int Dr = block_d <= 1 ? 0 : (1024 + block_d / 2) / (block_d - 1);
1267    for (int r = 0; r < block_d; ++r) {
1268       for (int t = 0; t < block_h; ++t) {
1269          for (int s = 0; s < block_w; ++s) {
1270             int cs = Ds * s;
1271             int ct = Dt * t;
1272             int cr = Dr * r;
1273             int gs = (cs * (wt_w - 1) + 32) >> 6;
1274             int gt = (ct * (wt_h - 1) + 32) >> 6;
1275             int gr = (cr * (wt_d - 1) + 32) >> 6;
1276             assert(gs >= 0 && gs <= 176);
1277             assert(gt >= 0 && gt <= 176);
1278             assert(gr >= 0 && gr <= 176);
1279             int js = gs >> 4;
1280             int fs = gs & 0xf;
1281             int jt = gt >> 4;
1282             int ft = gt & 0xf;
1283             int jr = gr >> 4;
1284             int fr = gr & 0xf;
1285
1286             /* TODO: 3D */
1287             (void)jr;
1288             (void)fr;
1289
1290             int w11 = (fs * ft + 8) >> 4;
1291             int w10 = ft - w11;
1292             int w01 = fs - w11;
1293             int w00 = 16 - fs - ft + w11;
1294
1295             if (dual_plane) {
1296                int p00, p01, p10, p11, i0, i1;
1297                int v0 = js + jt * wt_w;
1298                p00 = weights[(v0) * 2];
1299                p01 = weights[(v0 + 1) * 2];
1300                p10 = weights[(v0 + wt_w) * 2];
1301                p11 = weights[(v0 + wt_w + 1) * 2];
1302                i0 = (p00*w00 + p01*w01 + p10*w10 + p11*w11 + 8) >> 4;
1303                p00 = weights[(v0) * 2 + 1];
1304                p01 = weights[(v0 + 1) * 2 + 1];
1305                p10 = weights[(v0 + wt_w) * 2 + 1];
1306                p11 = weights[(v0 + wt_w + 1) * 2 + 1];
1307                assert((v0 + wt_w + 1) * 2 + 1 < (int)ARRAY_SIZE(weights));
1308                i1 = (p00*w00 + p01*w01 + p10*w10 + p11*w11 + 8) >> 4;
1309                assert(0 <= i0 && i0 <= 64);
1310                infill_weights[0][s + t*block_w + r*block_w*block_h] = i0;
1311                infill_weights[1][s + t*block_w + r*block_w*block_h] = i1;
1312             } else {
1313                int p00, p01, p10, p11, i;
1314                int v0 = js + jt * wt_w;
1315                p00 = weights[v0];
1316                p01 = weights[v0 + 1];
1317                p10 = weights[v0 + wt_w];
1318                p11 = weights[v0 + wt_w + 1];
1319                assert(v0 + wt_w + 1 < (int)ARRAY_SIZE(weights));
1320                i = (p00*w00 + p01*w01 + p10*w10 + p11*w11 + 8) >> 4;
1321                assert(0 <= i && i <= 64);
1322                infill_weights[0][s + t*block_w + r*block_w*block_h] = i;
1323             }
1324          }
1325       }
1326    }
1327 }
1328
1329 void Block::unquantise_colour_endpoints()
1330 {
1331    assert(num_cem_values <= (int)ARRAY_SIZE(colour_endpoints_quant));
1332    assert(num_cem_values <= (int)ARRAY_SIZE(colour_endpoints));
1333
1334    for (int i = 0; i < num_cem_values; ++i) {
1335       uint8_t v = colour_endpoints_quant[i];
1336
1337       if (ce_trits) {
1338          uint16_t A, B, C, D;
1339          uint16_t t;
1340          A = (v & 0x1) ? 0x1FF : 0x000;
1341          switch (ce_bits) {
1342          case 1:
1343             B = 0;
1344             C = 204;
1345             D = v >> 1;
1346             break;
1347          case 2:
1348             B = (v & 0x2) ? 0x116 : 0x000;
1349             C = 93;
1350             D = v >> 2;
1351             break;
1352          case 3:
1353             t = ((v >> 1) & 0x3);
1354             B = t | (t << 2) | (t << 7);
1355             C = 44;
1356             D = v >> 3;
1357             break;
1358          case 4:
1359             t = ((v >> 1) & 0x7);
1360             B = t | (t << 6);
1361             C = 22;
1362             D = v >> 4;
1363             break;
1364          case 5:
1365             t = ((v >> 1) & 0xF);
1366             B = (t >> 2) | (t << 5);
1367             C = 11;
1368             D = v >> 5;
1369             break;
1370          case 6:
1371             B = ((v & 0x3E) << 3) | ((v >> 5) & 0x1);
1372             C = 5;
1373             D = v >> 6;
1374             break;
1375          default:
1376             unreachable("");
1377          }
1378          uint16_t T = D * C + B;
1379          T = T ^ A;
1380          T = (A & 0x80) | (T >> 2);
1381          assert(T < 256);
1382          colour_endpoints[i] = T;
1383       } else if (ce_quints) {
1384          uint16_t A, B, C, D;
1385          uint16_t t;
1386          A = (v & 0x1) ? 0x1FF : 0x000;
1387          switch (ce_bits) {
1388          case 1:
1389             B = 0;
1390             C = 113;
1391             D = v >> 1;
1392             break;
1393          case 2:
1394             B = (v & 0x2) ? 0x10C : 0x000;
1395             C = 54;
1396             D = v >> 2;
1397             break;
1398          case 3:
1399             t = ((v >> 1) & 0x3);
1400             B = (t >> 1) | (t << 1) | (t << 7);
1401             C = 26;
1402             D = v >> 3;
1403             break;
1404          case 4:
1405             t = ((v >> 1) & 0x7);
1406             B = (t >> 1) | (t << 6);
1407             C = 13;
1408             D = v >> 4;
1409             break;
1410          case 5:
1411             t = ((v >> 1) & 0xF);
1412             B = (t >> 4) | (t << 5);
1413             C = 6;
1414             D = v >> 5;
1415             break;
1416          default:
1417             unreachable("");
1418          }
1419          uint16_t T = D * C + B;
1420          T = T ^ A;
1421          T = (A & 0x80) | (T >> 2);
1422          assert(T < 256);
1423          colour_endpoints[i] = T;
1424       } else {
1425          switch (ce_bits) {
1426          case 1: v = v ? 0xFF : 0x00; break;
1427          case 2: v = (v << 6) | (v << 4) | (v << 2) | v; break;
1428          case 3: v = (v << 5) | (v << 2) | (v >> 1); break;
1429          case 4: v = (v << 4) | v; break;
1430          case 5: v = (v << 3) | (v >> 2); break;
1431          case 6: v = (v << 2) | (v >> 4); break;
1432          case 7: v = (v << 1) | (v >> 6); break;
1433          case 8: break;
1434          default: unreachable("");
1435          }
1436          colour_endpoints[i] = v;
1437       }
1438    }
1439 }
1440
1441 decode_error::type Block::decode(const Decoder &decoder, InputBitVector in)
1442 {
1443    decode_error::type err;
1444
1445    is_error = false;
1446    bogus_colour_endpoints = false;
1447    bogus_weights = false;
1448    is_void_extent = false;
1449
1450    wt_d = 1;
1451    /* TODO: 3D */
1452
1453    /* TODO: test for all the illegal encodings */
1454
1455    if (VERBOSE_DECODE)
1456       in.printf_bits(0, 128);
1457
1458    err = decode_block_mode(in);
1459    if (err != decode_error::ok)
1460       return err;
1461
1462    if (is_void_extent)
1463       return decode_error::ok;
1464
1465    /* TODO: 3D */
1466
1467    calculate_from_weights();
1468
1469    if (VERBOSE_DECODE)
1470       printf("weights_grid=%dx%dx%d dual_plane=%d num_weights=%d high_prec=%d r=%d range=0..%d (%dt %dq %db) weight_bits=%d\n",
1471              wt_w, wt_h, wt_d, dual_plane, num_weights, high_prec, wt_range, wt_max, wt_trits, wt_quints, wt_bits, weight_bits);
1472
1473    if (wt_w > decoder.block_w || wt_h > decoder.block_h || wt_d > decoder.block_d)
1474       return decode_error::weight_grid_exceeds_block_size;
1475
1476    num_parts = in.get_bits(11, 2) + 1;
1477
1478    if (VERBOSE_DECODE)
1479       in.printf_bits(11, 2, "partitions = %d", num_parts);
1480
1481    if (dual_plane && num_parts > 3)
1482       return decode_error::dual_plane_and_too_many_partitions;
1483
1484    decode_cem(in);
1485
1486    if (VERBOSE_DECODE)
1487       printf("cem=[%d,%d,%d,%d] base_cem_class=%d\n", cems[0], cems[1], cems[2], cems[3], cem_base_class);
1488
1489    int num_cem_pairs = (cem_base_class + 1) * num_parts + extra_cem_bits;
1490    num_cem_values = num_cem_pairs * 2;
1491
1492    calculate_remaining_bits();
1493    err = calculate_colour_endpoints_size();
1494    if (err != decode_error::ok)
1495       return err;
1496
1497    if (VERBOSE_DECODE)
1498       in.printf_bits(colour_endpoint_data_offset, colour_endpoint_bits,
1499                      "endpoint data (%d bits, %d vals, %dt %dq %db)",
1500                      colour_endpoint_bits, num_cem_values, ce_trits, ce_quints, ce_bits);
1501
1502    unpack_colour_endpoints(in);
1503
1504    if (VERBOSE_DECODE) {
1505       printf("cem values raw =[");
1506       for (int i = 0; i < num_cem_values; i++) {
1507          if (i)
1508             printf(", ");
1509          printf("%3d", colour_endpoints_quant[i]);
1510       }
1511       printf("]\n");
1512    }
1513
1514    if (num_cem_values > 18)
1515       return decode_error::invalid_colour_endpoints_count;
1516
1517    unquantise_colour_endpoints();
1518
1519    if (VERBOSE_DECODE) {
1520       printf("cem values norm=[");
1521       for (int i = 0; i < num_cem_values; i++) {
1522          if (i)
1523             printf(", ");
1524          printf("%3d", colour_endpoints[i]);
1525       }
1526       printf("]\n");
1527    }
1528
1529    decode_colour_endpoints();
1530
1531    if (dual_plane) {
1532       int ccs_offset = 128 - weight_bits - num_extra_cem_bits - 2;
1533       colour_component_selector = in.get_bits(ccs_offset, 2);
1534
1535       if (VERBOSE_DECODE)
1536          in.printf_bits(ccs_offset, 2, "colour component selector = %d", colour_component_selector);
1537    } else {
1538       colour_component_selector = 0;
1539    }
1540
1541
1542    if (VERBOSE_DECODE)
1543       in.printf_bits(128 - weight_bits, weight_bits, "weights (%d bits)", weight_bits);
1544
1545    if (num_weights > 64)
1546       return decode_error::invalid_num_weights;
1547
1548    if (weight_bits < 24 || weight_bits > 96)
1549       return decode_error::invalid_weight_bits;
1550
1551    unpack_weights(in);
1552
1553    unquantise_weights();
1554
1555    if (VERBOSE_DECODE) {
1556       printf("weights=[");
1557       for (int i = 0; i < num_weights; ++i) {
1558          if (i)
1559             printf(", ");
1560          printf("%d", weights[i]);
1561       }
1562       printf("]\n");
1563
1564       for (int plane = 0; plane <= dual_plane; ++plane) {
1565          printf("weights (plane %d):\n", plane);
1566          int i = 0;
1567          (void)i;
1568
1569          for (int r = 0; r < wt_d; ++r) {
1570             for (int t = 0; t < wt_h; ++t) {
1571                for (int s = 0; s < wt_w; ++s) {
1572                   printf("%3d", weights[i++ * (1 + dual_plane) + plane]);
1573                }
1574                printf("\n");
1575             }
1576             if (r < wt_d - 1)
1577                printf("\n");
1578          }
1579       }
1580    }
1581
1582    compute_infill_weights(decoder.block_w, decoder.block_h, decoder.block_d);
1583
1584    if (VERBOSE_DECODE) {
1585       for (int plane = 0; plane <= dual_plane; ++plane) {
1586          printf("infilled weights (plane %d):\n", plane);
1587          int i = 0;
1588          (void)i;
1589
1590          for (int r = 0; r < decoder.block_d; ++r) {
1591             for (int t = 0; t < decoder.block_h; ++t) {
1592                for (int s = 0; s < decoder.block_w; ++s) {
1593                   printf("%3d", infill_weights[plane][i++]);
1594                }
1595                printf("\n");
1596             }
1597             if (r < decoder.block_d - 1)
1598                printf("\n");
1599          }
1600       }
1601    }
1602    if (VERBOSE_DECODE)
1603       printf("\n");
1604
1605    return decode_error::ok;
1606 }
1607
1608 void Block::write_decoded(const Decoder &decoder, uint16_t *output)
1609 {
1610    /* sRGB can only be stored as unorm8. */
1611    assert(!decoder.srgb || decoder.output_unorm8);
1612
1613    if (is_void_extent) {
1614       for (int idx = 0; idx < decoder.block_w*decoder.block_h*decoder.block_d; ++idx) {
1615          if (decoder.output_unorm8) {
1616             if (decoder.srgb) {
1617                output[idx*4+0] = void_extent_colour_r >> 8;
1618                output[idx*4+1] = void_extent_colour_g >> 8;
1619                output[idx*4+2] = void_extent_colour_b >> 8;
1620             } else {
1621                output[idx*4+0] = uint16_div_64k_to_half_to_unorm8(void_extent_colour_r);
1622                output[idx*4+1] = uint16_div_64k_to_half_to_unorm8(void_extent_colour_g);
1623                output[idx*4+2] = uint16_div_64k_to_half_to_unorm8(void_extent_colour_b);
1624             }
1625             output[idx*4+3] = uint16_div_64k_to_half_to_unorm8(void_extent_colour_a);
1626          } else {
1627             /* Store the color as FP16. */
1628             output[idx*4+0] = _mesa_uint16_div_64k_to_half(void_extent_colour_r);
1629             output[idx*4+1] = _mesa_uint16_div_64k_to_half(void_extent_colour_g);
1630             output[idx*4+2] = _mesa_uint16_div_64k_to_half(void_extent_colour_b);
1631             output[idx*4+3] = _mesa_uint16_div_64k_to_half(void_extent_colour_a);
1632          }
1633       }
1634       return;
1635    }
1636
1637    int small_block = (decoder.block_w * decoder.block_h * decoder.block_d) < 31;
1638
1639    int idx = 0;
1640    for (int z = 0; z < decoder.block_d; ++z) {
1641       for (int y = 0; y < decoder.block_h; ++y) {
1642          for (int x = 0; x < decoder.block_w; ++x) {
1643
1644             int partition;
1645             if (num_parts > 1) {
1646                partition = select_partition(partition_index, x, y, z, num_parts, small_block);
1647                assert(partition < num_parts);
1648             } else {
1649                partition = 0;
1650             }
1651
1652             /* TODO: HDR */
1653
1654             uint8x4_t e0 = endpoints_decoded[0][partition];
1655             uint8x4_t e1 = endpoints_decoded[1][partition];
1656             uint16_t c0[4], c1[4];
1657
1658             /* Expand to 16 bits. */
1659             if (decoder.srgb) {
1660                c0[0] = (uint16_t)((e0.v[0] << 8) | 0x80);
1661                c0[1] = (uint16_t)((e0.v[1] << 8) | 0x80);
1662                c0[2] = (uint16_t)((e0.v[2] << 8) | 0x80);
1663                c0[3] = (uint16_t)((e0.v[3] << 8) | 0x80);
1664
1665                c1[0] = (uint16_t)((e1.v[0] << 8) | 0x80);
1666                c1[1] = (uint16_t)((e1.v[1] << 8) | 0x80);
1667                c1[2] = (uint16_t)((e1.v[2] << 8) | 0x80);
1668                c1[3] = (uint16_t)((e1.v[3] << 8) | 0x80);
1669             } else {
1670                c0[0] = (uint16_t)((e0.v[0] << 8) | e0.v[0]);
1671                c0[1] = (uint16_t)((e0.v[1] << 8) | e0.v[1]);
1672                c0[2] = (uint16_t)((e0.v[2] << 8) | e0.v[2]);
1673                c0[3] = (uint16_t)((e0.v[3] << 8) | e0.v[3]);
1674
1675                c1[0] = (uint16_t)((e1.v[0] << 8) | e1.v[0]);
1676                c1[1] = (uint16_t)((e1.v[1] << 8) | e1.v[1]);
1677                c1[2] = (uint16_t)((e1.v[2] << 8) | e1.v[2]);
1678                c1[3] = (uint16_t)((e1.v[3] << 8) | e1.v[3]);
1679             }
1680
1681             int w[4];
1682             if (dual_plane) {
1683                int w0 = infill_weights[0][idx];
1684                int w1 = infill_weights[1][idx];
1685                w[0] = w[1] = w[2] = w[3] = w0;
1686                w[colour_component_selector] = w1;
1687             } else {
1688                int w0 = infill_weights[0][idx];
1689                w[0] = w[1] = w[2] = w[3] = w0;
1690             }
1691
1692             /* Interpolate to produce UNORM16, applying weights. */
1693             uint16_t c[4] = {
1694                (uint16_t)((c0[0] * (64 - w[0]) + c1[0] * w[0] + 32) >> 6),
1695                (uint16_t)((c0[1] * (64 - w[1]) + c1[1] * w[1] + 32) >> 6),
1696                (uint16_t)((c0[2] * (64 - w[2]) + c1[2] * w[2] + 32) >> 6),
1697                (uint16_t)((c0[3] * (64 - w[3]) + c1[3] * w[3] + 32) >> 6),
1698             };
1699
1700             if (decoder.output_unorm8) {
1701                if (decoder.srgb) {
1702                   output[idx*4+0] = c[0] >> 8;
1703                   output[idx*4+1] = c[1] >> 8;
1704                   output[idx*4+2] = c[2] >> 8;
1705                } else {
1706                   output[idx*4+0] = c[0] == 65535 ? 0xff : uint16_div_64k_to_half_to_unorm8(c[0]);
1707                   output[idx*4+1] = c[1] == 65535 ? 0xff : uint16_div_64k_to_half_to_unorm8(c[1]);
1708                   output[idx*4+2] = c[2] == 65535 ? 0xff : uint16_div_64k_to_half_to_unorm8(c[2]);
1709                }
1710                output[idx*4+3] = c[3] == 65535 ? 0xff : uint16_div_64k_to_half_to_unorm8(c[3]);
1711             } else {
1712                /* Store the color as FP16. */
1713                output[idx*4+0] = c[0] == 65535 ? FP16_ONE : _mesa_uint16_div_64k_to_half(c[0]);
1714                output[idx*4+1] = c[1] == 65535 ? FP16_ONE : _mesa_uint16_div_64k_to_half(c[1]);
1715                output[idx*4+2] = c[2] == 65535 ? FP16_ONE : _mesa_uint16_div_64k_to_half(c[2]);
1716                output[idx*4+3] = c[3] == 65535 ? FP16_ONE : _mesa_uint16_div_64k_to_half(c[3]);
1717             }
1718
1719             idx++;
1720          }
1721       }
1722    }
1723 }
1724
1725 void Block::calculate_from_weights()
1726 {
1727    wt_trits = 0;
1728    wt_quints = 0;
1729    wt_bits = 0;
1730    switch (high_prec) {
1731    case 0:
1732       switch (wt_range) {
1733       case 0x2: wt_max = 1; wt_bits = 1; break;
1734       case 0x3: wt_max = 2; wt_trits = 1; break;
1735       case 0x4: wt_max = 3; wt_bits = 2; break;
1736       case 0x5: wt_max = 4; wt_quints = 1; break;
1737       case 0x6: wt_max = 5; wt_trits = 1; wt_bits = 1; break;
1738       case 0x7: wt_max = 7; wt_bits = 3; break;
1739       default: abort();
1740       }
1741       break;
1742    case 1:
1743       switch (wt_range) {
1744       case 0x2: wt_max = 9; wt_quints = 1; wt_bits = 1; break;
1745       case 0x3: wt_max = 11; wt_trits = 1; wt_bits = 2; break;
1746       case 0x4: wt_max = 15; wt_bits = 4; break;
1747       case 0x5: wt_max = 19; wt_quints = 1; wt_bits = 2; break;
1748       case 0x6: wt_max = 23; wt_trits = 1; wt_bits = 3; break;
1749       case 0x7: wt_max = 31; wt_bits = 5; break;
1750       default: abort();
1751       }
1752       break;
1753    }
1754
1755    assert(wt_trits || wt_quints || wt_bits);
1756
1757    num_weights = wt_w * wt_h * wt_d;
1758
1759    if (dual_plane)
1760       num_weights *= 2;
1761
1762    weight_bits =
1763          (num_weights * 8 * wt_trits + 4) / 5
1764          + (num_weights * 7 * wt_quints + 2) / 3
1765          +  num_weights * wt_bits;
1766 }
1767
1768 void Block::calculate_remaining_bits()
1769 {
1770    int config_bits;
1771    if (num_parts > 1) {
1772       if (!is_multi_cem)
1773          config_bits = 29;
1774       else
1775          config_bits = 25 + 3 * num_parts;
1776    } else {
1777       config_bits = 17;
1778    }
1779
1780    if (dual_plane)
1781       config_bits += 2;
1782
1783    remaining_bits = 128 - config_bits - weight_bits;
1784 }
1785
1786 decode_error::type Block::calculate_colour_endpoints_size()
1787 {
1788    /* Specified as illegal */
1789    if (remaining_bits < (13 * num_cem_values + 4) / 5) {
1790       colour_endpoint_bits = ce_max = ce_trits = ce_quints = ce_bits = 0;
1791       return decode_error::invalid_colour_endpoints_size;
1792    }
1793
1794    /* Find the largest cem_ranges that fits within remaining_bits */
1795    for (int i = ARRAY_SIZE(cem_ranges)-1; i >= 0; --i) {
1796       int cem_bits;
1797       cem_bits = (num_cem_values * 8 * cem_ranges[i].t + 4) / 5
1798                  + (num_cem_values * 7 * cem_ranges[i].q + 2) / 3
1799                  +  num_cem_values * cem_ranges[i].b;
1800
1801       if (cem_bits <= remaining_bits)
1802       {
1803          colour_endpoint_bits = cem_bits;
1804          ce_max = cem_ranges[i].max;
1805          ce_trits = cem_ranges[i].t;
1806          ce_quints = cem_ranges[i].q;
1807          ce_bits = cem_ranges[i].b;
1808          return decode_error::ok;
1809       }
1810    }
1811
1812    assert(0);
1813    return decode_error::invalid_colour_endpoints_size;
1814 }
1815
1816 /**
1817  * Decode ASTC 2D LDR texture data.
1818  *
1819  * \param src_width in pixels
1820  * \param src_height in pixels
1821  * \param dst_stride in bytes
1822  */
1823 extern "C" void
1824 _mesa_unpack_astc_2d_ldr(uint8_t *dst_row,
1825                          unsigned dst_stride,
1826                          const uint8_t *src_row,
1827                          unsigned src_stride,
1828                          unsigned src_width,
1829                          unsigned src_height,
1830                          mesa_format format)
1831 {
1832    assert(_mesa_is_format_astc_2d(format));
1833    bool srgb = _mesa_is_format_srgb(format);
1834
1835    unsigned blk_w, blk_h;
1836    _mesa_get_format_block_size(format, &blk_w, &blk_h);
1837
1838    const unsigned block_size = 16;
1839    unsigned x_blocks = (src_width + blk_w - 1) / blk_w;
1840    unsigned y_blocks = (src_height + blk_h - 1) / blk_h;
1841
1842    Decoder dec(blk_w, blk_h, 1, srgb, true);
1843
1844    for (unsigned y = 0; y < y_blocks; ++y) {
1845       for (unsigned x = 0; x < x_blocks; ++x) {
1846          /* Same size as the largest block. */
1847          uint16_t block_out[12 * 12 * 4];
1848
1849          dec.decode(src_row + x * block_size, block_out);
1850
1851          /* This can be smaller with NPOT dimensions. */
1852          unsigned dst_blk_w = MIN2(blk_w, src_width  - x*blk_w);
1853          unsigned dst_blk_h = MIN2(blk_h, src_height - y*blk_h);
1854
1855          for (unsigned sub_y = 0; sub_y < dst_blk_h; ++sub_y) {
1856             for (unsigned sub_x = 0; sub_x < dst_blk_w; ++sub_x) {
1857                uint8_t *dst = dst_row + sub_y * dst_stride +
1858                               (x * blk_w + sub_x) * 4;
1859                const uint16_t *src = &block_out[(sub_y * blk_w + sub_x) * 4];
1860
1861                dst[0] = src[0];
1862                dst[1] = src[1];
1863                dst[2] = src[2];
1864                dst[3] = src[3];
1865             }
1866          }
1867       }
1868       src_row += src_stride;
1869       dst_row += dst_stride * blk_h;
1870    }
1871 }