src/mesa/main/texcompress_bptc_tmp.h

   1 /*
   2  * Copyright (C) 2014 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  21  * DEALINGS IN THE SOFTWARE.
  22  */
  23
  24 /*
  25  * Included by texcompress_bptc and gallium to define BPTC decoding routines.
  26  */
  27
  28 #ifndef TEXCOMPRESS_BPTC_TMP_H
  29 #define TEXCOMPRESS_BPTC_TMP_H
  30
  31 #include "util/format_srgb.h"
  32 #include "util/half_float.h"
  33 #include "macros.h"
  34
  35 #define BLOCK_SIZE 4
  36 #define N_PARTITIONS 64
  37 #define BLOCK_BYTES 16
  38
     /* Static layout description of one BPTC unorm block mode. One entry per
      * mode is stored in bptc_unorm_modes[]; the fields are consumed by
      * extract_unorm_endpoints() and the unorm decoders.
      */
  39 struct bptc_unorm_mode {
        /* Number of subsets in the block; the decoders handle 1, 2 or 3. */
  40    int n_subsets;
        /* Width of the partition number field (0 when there is none). */
  41    int n_partition_bits;
        /* True when a 2-bit rotation field follows the partition number. */
  42    bool has_rotation_bits;
        /* True when a 1-bit index selection field is present. */
  43    bool has_index_selection_bit;
        /* Stored bits per colour component of each endpoint (p-bit excluded). */
  44    int n_color_bits;
        /* Stored bits per alpha endpoint; 0 means alpha is forced to 255. */
  45    int n_alpha_bits;
        /* True when every endpoint carries its own p-bit. */
  46    bool has_endpoint_pbits;
        /* True when the two endpoints of a subset share one p-bit. */
  47    bool has_shared_pbits;
        /* Bits per primary index (anchor texels store one bit less). */
  48    int n_index_bits;
        /* Bits per secondary index; 0 when the mode has no secondary indices. */
  49    int n_secondary_index_bits;
  50 };
  51
     /* One run of bits in a BPTC float block, as listed in
      * bptc_float_mode::bitfields. The consumer of most fields lies outside
      * this chunk, so the per-field notes below are assumptions to confirm.
      */
  52 struct bptc_float_bitfield {
        /* Endpoint number the bits belong to; -1 terminates a bitfield list. */
  53    int8_t endpoint;
        /* Presumably the colour component (0..2) within the endpoint. */
  54    uint8_t component;
        /* Presumably the bit position within the endpoint value -- confirm. */
  55    uint8_t offset;
        /* Presumably the number of bits in this run -- confirm. */
  56    uint8_t n_bits;
        /* Presumably set when the bits are stored in reverse order -- confirm. */
  57    bool reverse;
  58 };
  59
     /* Static layout description of one BPTC float block mode; entries live
      * in bptc_float_modes[].
      */
  60 struct bptc_float_mode {
        /* True for the reserved table slots, which set no other member. */
  61    bool reserved;
        /* NOTE(review): presumably marks endpoints stored as deltas -- confirm. */
  62    bool transformed_endpoints;
        /* Non-zero means the block has four endpoints, zero means two
         * (see extract_float_endpoints()).
         */
  63    int n_partition_bits;
        /* Presumably the stored precision of the base endpoint -- confirm. */
  64    int n_endpoint_bits;
        /* Presumably the width of each texel index -- confirm. */
  65    int n_index_bits;
        /* One value per colour component; presumably delta precision. */
  66    int n_delta_bits[3];
        /* Bitfield list terminated by an entry whose endpoint is -1. */
  67    struct bptc_float_bitfield bitfields[24];
  68 };
  69
     /* State for writing a bit stream. Nothing in this part of the file uses
      * it, so the field notes are assumptions -- TODO confirm against the
      * code that uses this struct.
      */
  70 struct bit_writer {
        /* Presumably the partially filled byte being assembled. */
  71    uint8_t buf;
        /* Presumably the number of bits already placed in buf. */
  72    int pos;
        /* Presumably where the next completed byte is stored. */
  73    uint8_t *dst;
  74 };
  75
     /* Layout of the eight BPTC unorm modes. The decoders select an entry with
      * ffs(block[0]) - 1. Initialiser columns follow the member order of
      * struct bptc_unorm_mode:
      *   n_subsets, n_partition_bits, has_rotation_bits,
      *   has_index_selection_bit, n_color_bits, n_alpha_bits,
      *   has_endpoint_pbits, has_shared_pbits, n_index_bits,
      *   n_secondary_index_bits
      */
  76 static const struct bptc_unorm_mode
  77 bptc_unorm_modes[] = {
  78    /* 0 */ { 3, 4, false, false, 4, 0, true,  false, 3, 0 },
  79    /* 1 */ { 2, 6, false, false, 6, 0, false, true,  3, 0 },
  80    /* 2 */ { 3, 6, false, false, 5, 0, false, false, 2, 0 },
  81    /* 3 */ { 2, 6, false, false, 7, 0, true,  false, 2, 0 },
  82    /* 4 */ { 1, 0, true,  true,  5, 6, false, false, 2, 3 },
  83    /* 5 */ { 1, 0, true,  false, 7, 8, false, false, 2, 2 },
  84    /* 6 */ { 1, 0, false, false, 7, 7, true,  false, 4, 0 },
  85    /* 7 */ { 2, 6, false, false, 5, 5, true,  false, 2, 0 }
  86 };
  87
     /* Bit layouts of the BPTC float block modes. Each entry fills struct
      * bptc_float_mode in member order: reserved, transformed_endpoints,
      * n_partition_bits, n_endpoint_bits, n_index_bits, n_delta_bits[3], then
      * a list of bitfields { endpoint, component, offset, n_bits, reverse }
      * terminated by an entry whose endpoint is -1. Reserved slots set only
      * the first member.
      *
      * NOTE(review): the comment above each entry looks like the mode's bit
      * pattern in the block header; the code that indexes this table is
      * outside this chunk -- confirm.
      */
  88 static const struct bptc_float_mode
  89 bptc_float_modes[] = {
  90    /* 00 */
  91    { false, true, 5, 10, 3, { 5, 5, 5 },
  92      { { 2, 1, 4, 1, false }, { 2, 2, 4, 1, false }, { 3, 2, 4, 1, false },
  93        { 0, 0, 0, 10, false }, { 0, 1, 0, 10, false }, { 0, 2, 0, 10, false },
  94        { 1, 0, 0, 5, false }, { 3, 1, 4, 1, false }, { 2, 1, 0, 4, false },
  95        { 1, 1, 0, 5, false }, { 3, 2, 0, 1, false }, { 3, 1, 0, 4, false },
  96        { 1, 2, 0, 5, false }, { 3, 2, 1, 1, false }, { 2, 2, 0, 4, false },
  97        { 2, 0, 0, 5, false }, { 3, 2, 2, 1, false }, { 3, 0, 0, 5, false },
  98        { 3, 2, 3, 1, false },
  99        { -1 } }
 100    },
 101    /* 01 */
 102    { false, true, 5, 7, 3, { 6, 6, 6 },
 103      { { 2, 1, 5, 1, false }, { 3, 1, 4, 1, false }, { 3, 1, 5, 1, false },
 104        { 0, 0, 0, 7, false }, { 3, 2, 0, 1, false }, { 3, 2, 1, 1, false },
 105        { 2, 2, 4, 1, false }, { 0, 1, 0, 7, false }, { 2, 2, 5, 1, false },
 106        { 3, 2, 2, 1, false }, { 2, 1, 4, 1, false }, { 0, 2, 0, 7, false },
 107        { 3, 2, 3, 1, false }, { 3, 2, 5, 1, false }, { 3, 2, 4, 1, false },
 108        { 1, 0, 0, 6, false }, { 2, 1, 0, 4, false }, { 1, 1, 0, 6, false },
 109        { 3, 1, 0, 4, false }, { 1, 2, 0, 6, false }, { 2, 2, 0, 4, false },
 110        { 2, 0, 0, 6, false },
 111        { 3, 0, 0, 6, false },
 112        { -1 } }
 113    },
 114    /* 00010 */
 115    { false, true, 5, 11, 3, { 5, 4, 4 },
 116      { { 0, 0, 0, 10, false }, { 0, 1, 0, 10, false }, { 0, 2, 0, 10, false },
 117        { 1, 0, 0, 5, false }, { 0, 0, 10, 1, false }, { 2, 1, 0, 4, false },
 118        { 1, 1, 0, 4, false }, { 0, 1, 10, 1, false }, { 3, 2, 0, 1, false },
 119        { 3, 1, 0, 4, false }, { 1, 2, 0, 4, false }, { 0, 2, 10, 1, false },
 120        { 3, 2, 1, 1, false }, { 2, 2, 0, 4, false }, { 2, 0, 0, 5, false },
 121        { 3, 2, 2, 1, false }, { 3, 0, 0, 5, false }, { 3, 2, 3, 1, false },
 122        { -1 } }
 123    },
 124    /* 00011 */
 125    { false, false, 0, 10, 4, { 10, 10, 10 },
 126      { { 0, 0, 0, 10, false }, { 0, 1, 0, 10, false }, { 0, 2, 0, 10, false },
 127        { 1, 0, 0, 10, false }, { 1, 1, 0, 10, false }, { 1, 2, 0, 10, false },
 128        { -1 } }
 129    },
 130    /* 00110 */
 131    { false, true, 5, 11, 3, { 4, 5, 4 },
 132      { { 0, 0, 0, 10, false }, { 0, 1, 0, 10, false }, { 0, 2, 0, 10, false },
 133        { 1, 0, 0, 4, false }, { 0, 0, 10, 1, false }, { 3, 1, 4, 1, false },
 134        { 2, 1, 0, 4, false }, { 1, 1, 0, 5, false }, { 0, 1, 10, 1, false },
 135        { 3, 1, 0, 4, false }, { 1, 2, 0, 4, false }, { 0, 2, 10, 1, false },
 136        { 3, 2, 1, 1, false }, { 2, 2, 0, 4, false }, { 2, 0, 0, 4, false },
 137        { 3, 2, 0, 1, false }, { 3, 2, 2, 1, false }, { 3, 0, 0, 4, false },
 138        { 2, 1, 4, 1, false }, { 3, 2, 3, 1, false },
 139        { -1 } }
 140    },
 141    /* 00111 */
 142    { false, true, 0, 11, 4, { 9, 9, 9 },
 143      { { 0, 0, 0, 10, false }, { 0, 1, 0, 10, false }, { 0, 2, 0, 10, false },
 144        { 1, 0, 0, 9, false }, { 0, 0, 10, 1, false }, { 1, 1, 0, 9, false },
 145        { 0, 1, 10, 1, false }, { 1, 2, 0, 9, false }, { 0, 2, 10, 1, false },
 146        { -1 } }
 147    },
 148    /* 01010 */
 149    { false, true, 5, 11, 3, { 4, 4, 5 },
 150      { { 0, 0, 0, 10, false }, { 0, 1, 0, 10, false }, { 0, 2, 0, 10, false },
 151        { 1, 0, 0, 4, false }, { 0, 0, 10, 1, false }, { 2, 2, 4, 1, false },
 152        { 2, 1, 0, 4, false }, { 1, 1, 0, 4, false }, { 0, 1, 10, 1, false },
 153        { 3, 2, 0, 1, false }, { 3, 1, 0, 4, false }, { 1, 2, 0, 5, false },
 154        { 0, 2, 10, 1, false }, { 2, 2, 0, 4, false }, { 2, 0, 0, 4, false },
 155        { 3, 2, 1, 1, false }, { 3, 2, 2, 1, false }, { 3, 0, 0, 4, false },
 156        { 3, 2, 4, 1, false }, { 3, 2, 3, 1, false },
 157        { -1 } }
 158    },
 159    /* 01011 */
 160    { false, true, 0, 12, 4, { 8, 8, 8 },
 161      { { 0, 0, 0, 10, false }, { 0, 1, 0, 10, false }, { 0, 2, 0, 10, false },
 162        { 1, 0, 0, 8, false }, { 0, 0, 10, 2, true }, { 1, 1, 0, 8, false },
 163        { 0, 1, 10, 2, true }, { 1, 2, 0, 8, false }, { 0, 2, 10, 2, true },
 164        { -1 } }
 165    },
 166    /* 01110 */
 167    { false, true, 5, 9, 3, { 5, 5, 5 },
 168      { { 0, 0, 0, 9, false }, { 2, 2, 4, 1, false }, { 0, 1, 0, 9, false },
 169        { 2, 1, 4, 1, false }, { 0, 2, 0, 9, false }, { 3, 2, 4, 1, false },
 170        { 1, 0, 0, 5, false }, { 3, 1, 4, 1, false }, { 2, 1, 0, 4, false },
 171        { 1, 1, 0, 5, false }, { 3, 2, 0, 1, false }, { 3, 1, 0, 4, false },
 172        { 1, 2, 0, 5, false }, { 3, 2, 1, 1, false }, { 2, 2, 0, 4, false },
 173        { 2, 0, 0, 5, false }, { 3, 2, 2, 1, false }, { 3, 0, 0, 5, false },
 174        { 3, 2, 3, 1, false },
 175        { -1 } }
 176    },
 177    /* 01111 */
 178    { false, true, 0, 16, 4, { 4, 4, 4 },
 179      { { 0, 0, 0, 10, false }, { 0, 1, 0, 10, false }, { 0, 2, 0, 10, false },
 180        { 1, 0, 0, 4, false }, { 0, 0, 10, 6, true }, { 1, 1, 0, 4, false },
 181        { 0, 1, 10, 6, true }, { 1, 2, 0, 4, false }, { 0, 2, 10, 6, true },
 182        { -1 } }
 183    },
 184    /* 10010 */
 185    { false, true, 5, 8, 3, { 6, 5, 5 },
 186      { { 0, 0, 0, 8, false }, { 3, 1, 4, 1, false }, { 2, 2, 4, 1, false },
 187        { 0, 1, 0, 8, false }, { 3, 2, 2, 1, false }, { 2, 1, 4, 1, false },
 188        { 0, 2, 0, 8, false }, { 3, 2, 3, 1, false }, { 3, 2, 4, 1, false },
 189        { 1, 0, 0, 6, false }, { 2, 1, 0, 4, false }, { 1, 1, 0, 5, false },
 190        { 3, 2, 0, 1, false }, { 3, 1, 0, 4, false }, { 1, 2, 0, 5, false },
 191        { 3, 2, 1, 1, false }, { 2, 2, 0, 4, false }, { 2, 0, 0, 6, false },
 192        { 3, 0, 0, 6, false },
 193        { -1 } }
 194    },
 195    /* 10011 */
 196    { true /* reserved */ },
 197    /* 10110 */
 198    { false, true, 5, 8, 3, { 5, 6, 5 },
 199      { { 0, 0, 0, 8, false }, { 3, 2, 0, 1, false }, { 2, 2, 4, 1, false },
 200        { 0, 1, 0, 8, false }, { 2, 1, 5, 1, false }, { 2, 1, 4, 1, false },
 201        { 0, 2, 0, 8, false }, { 3, 1, 5, 1, false }, { 3, 2, 4, 1, false },
 202        { 1, 0, 0, 5, false }, { 3, 1, 4, 1, false }, { 2, 1, 0, 4, false },
 203        { 1, 1, 0, 6, false }, { 3, 1, 0, 4, false }, { 1, 2, 0, 5, false },
 204        { 3, 2, 1, 1, false }, { 2, 2, 0, 4, false }, { 2, 0, 0, 5, false },
 205        { 3, 2, 2, 1, false }, { 3, 0, 0, 5, false }, { 3, 2, 3, 1, false },
 206        { -1 } }
 207    },
 208    /* 10111 */
 209    { true /* reserved */ },
 210    /* 11010 */
 211    { false, true, 5, 8, 3, { 5, 5, 6 },
 212      { { 0, 0, 0, 8, false }, { 3, 2, 1, 1, false }, { 2, 2, 4, 1, false },
 213        { 0, 1, 0, 8, false }, { 2, 2, 5, 1, false }, { 2, 1, 4, 1, false },
 214        { 0, 2, 0, 8, false }, { 3, 2, 5, 1, false }, { 3, 2, 4, 1, false },
 215        { 1, 0, 0, 5, false }, { 3, 1, 4, 1, false }, { 2, 1, 0, 4, false },
 216        { 1, 1, 0, 5, false }, { 3, 2, 0, 1, false }, { 3, 1, 0, 4, false },
 217        { 1, 2, 0, 6, false }, { 2, 2, 0, 4, false }, { 2, 0, 0, 5, false },
 218        { 3, 2, 2, 1, false }, { 3, 0, 0, 5, false }, { 3, 2, 3, 1, false },
 219        { -1 } }
 220    },
 221    /* 11011 */
 222    { true /* reserved */ },
 223    /* 11110 */
 224    { false, false, 5, 6, 3, { 6, 6, 6 },
 225      { { 0, 0, 0, 6, false }, { 3, 1, 4, 1, false }, { 3, 2, 0, 1, false },
 226        { 3, 2, 1, 1, false }, { 2, 2, 4, 1, false }, { 0, 1, 0, 6, false },
 227        { 2, 1, 5, 1, false }, { 2, 2, 5, 1, false }, { 3, 2, 2, 1, false },
 228        { 2, 1, 4, 1, false }, { 0, 2, 0, 6, false }, { 3, 1, 5, 1, false },
 229        { 3, 2, 3, 1, false }, { 3, 2, 5, 1, false }, { 3, 2, 4, 1, false },
 230        { 1, 0, 0, 6, false }, { 2, 1, 0, 4, false }, { 1, 1, 0, 6, false },
 231        { 3, 1, 0, 4, false }, { 1, 2, 0, 6, false }, { 2, 2, 0, 4, false },
 232        { 2, 0, 0, 6, false }, { 3, 0, 0, 6, false },
 233        { -1 } }
 234    },
 235    /* 11111 */
 236    { true /* reserved */ },
 237 };
 238
 239 /* This partition table is used when the mode has two subsets. Each
 240  * partition is represented by a 32-bit value which gives 2 bits per texel
 241  * within the block. The value of the two bits represents which subset to use
 242  * (0 or 1). The table is indexed by the partition number read from the
 243  * block, and the decoders read the subset of texel t with
 244  * (value >> (t * 2)) & 3, so texel 0 sits in the two lowest bits.
 245  */
 244 static const uint32_t
 245 partition_table1[N_PARTITIONS] = {
 246    0x50505050U, 0x40404040U, 0x54545454U, 0x54505040U,
 247    0x50404000U, 0x55545450U, 0x55545040U, 0x54504000U,
 248    0x50400000U, 0x55555450U, 0x55544000U, 0x54400000U,
 249    0x55555440U, 0x55550000U, 0x55555500U, 0x55000000U,
 250    0x55150100U, 0x00004054U, 0x15010000U, 0x00405054U,
 251    0x00004050U, 0x15050100U, 0x05010000U, 0x40505054U,
 252    0x00404050U, 0x05010100U, 0x14141414U, 0x05141450U,
 253    0x01155440U, 0x00555500U, 0x15014054U, 0x05414150U,
 254    0x44444444U, 0x55005500U, 0x11441144U, 0x05055050U,
 255    0x05500550U, 0x11114444U, 0x41144114U, 0x44111144U,
 256    0x15055054U, 0x01055040U, 0x05041050U, 0x05455150U,
 257    0x14414114U, 0x50050550U, 0x41411414U, 0x00141400U,
 258    0x00041504U, 0x00105410U, 0x10541000U, 0x04150400U,
 259    0x50410514U, 0x41051450U, 0x05415014U, 0x14054150U,
 260    0x41050514U, 0x41505014U, 0x40011554U, 0x54150140U,
 261    0x50505500U, 0x00555050U, 0x15151010U, 0x54540404U,
 262 };
 263
 264 /* This partition table is used when the mode has three subsets. In this case
 265  * the values can be 0, 1 or 2. The encoding matches partition_table1: one
 266  * 32-bit value per partition number, two bits per texel, texel 0 in the two
 267  * lowest bits.
 268  */
 267 static const uint32_t
 268 partition_table2[N_PARTITIONS] = {
 269    0xaa685050U, 0x6a5a5040U, 0x5a5a4200U, 0x5450a0a8U,
 270    0xa5a50000U, 0xa0a05050U, 0x5555a0a0U, 0x5a5a5050U,
 271    0xaa550000U, 0xaa555500U, 0xaaaa5500U, 0x90909090U,
 272    0x94949494U, 0xa4a4a4a4U, 0xa9a59450U, 0x2a0a4250U,
 273    0xa5945040U, 0x0a425054U, 0xa5a5a500U, 0x55a0a0a0U,
 274    0xa8a85454U, 0x6a6a4040U, 0xa4a45000U, 0x1a1a0500U,
 275    0x0050a4a4U, 0xaaa59090U, 0x14696914U, 0x69691400U,
 276    0xa08585a0U, 0xaa821414U, 0x50a4a450U, 0x6a5a0200U,
 277    0xa9a58000U, 0x5090a0a8U, 0xa8a09050U, 0x24242424U,
 278    0x00aa5500U, 0x24924924U, 0x24499224U, 0x50a50a50U,
 279    0x500aa550U, 0xaaaa4444U, 0x66660000U, 0xa5a0a5a0U,
 280    0x50a050a0U, 0x69286928U, 0x44aaaa44U, 0x66666600U,
 281    0xaa444444U, 0x54a854a8U, 0x95809580U, 0x96969600U,
 282    0xa85454a8U, 0x80959580U, 0xaa141414U, 0x96960000U,
 283    0xaaaa1414U, 0xa05050a0U, 0xa0a5a5a0U, 0x96000000U,
 284    0x40804080U, 0xa9a8a9a8U, 0xaaaaaa44U, 0x2a4a5254U
 285 };
 286
     /* Texel numbers of the anchor texels of the non-first subsets, indexed as
      * anchor_indices[row][partition_num]. Texel 0 is always treated as an
      * anchor and is therefore not stored. Rows:
      *   [0] second subset, two-subset modes
      *   [1] second subset, three-subset modes
      *   [2] third subset, three-subset modes
      * Read by is_anchor() and count_anchors_before_texel().
      */
 287 static const uint8_t
 288 anchor_indices[][N_PARTITIONS] = {
 289    /* Anchor index values for the second subset of two-subset partitioning */
 290    {
 291       0xf,0xf,0xf,0xf,0xf,0xf,0xf,0xf,0xf,0xf,0xf,0xf,0xf,0xf,0xf,0xf,
 292       0xf,0x2,0x8,0x2,0x2,0x8,0x8,0xf,0x2,0x8,0x2,0x2,0x8,0x8,0x2,0x2,
 293       0xf,0xf,0x6,0x8,0x2,0x8,0xf,0xf,0x2,0x8,0x2,0x2,0x2,0xf,0xf,0x6,
 294       0x6,0x2,0x6,0x8,0xf,0xf,0x2,0x2,0xf,0xf,0xf,0xf,0xf,0x2,0x2,0xf
 295    },
 296
 297    /* Anchor index values for the second subset of three-subset partitioning */
 298    {
 299       0x3,0x3,0xf,0xf,0x8,0x3,0xf,0xf,0x8,0x8,0x6,0x6,0x6,0x5,0x3,0x3,
 300       0x3,0x3,0x8,0xf,0x3,0x3,0x6,0xa,0x5,0x8,0x8,0x6,0x8,0x5,0xf,0xf,
 301       0x8,0xf,0x3,0x5,0x6,0xa,0x8,0xf,0xf,0x3,0xf,0x5,0xf,0xf,0xf,0xf,
 302       0x3,0xf,0x5,0x5,0x5,0x8,0x5,0xa,0x5,0xa,0x8,0xd,0xf,0xc,0x3,0x3
 303    },
 304
 305    /* Anchor index values for the third subset of three-subset
 306     * partitioning
 307     */
 308    {
 309       0xf,0x8,0x8,0x3,0xf,0xf,0x3,0x8,0xf,0xf,0xf,0xf,0xf,0xf,0xf,0x8,
 310       0xf,0x8,0xf,0x3,0xf,0x8,0xf,0x8,0x3,0xf,0x6,0xa,0xf,0xf,0xa,0x8,
 311       0xf,0x3,0xf,0xa,0xa,0x8,0x9,0xa,0x6,0xf,0x8,0xf,0x3,0x6,0x6,0x8,
 312       0xf,0x3,0xf,0xf,0xf,0xf,0xf,0xf,0xf,0xf,0xf,0xf,0x3,0xf,0xf,0x8
 313    }
 314 };
 315
 316 static int
 317 extract_bits(const uint8_t *block,
 318              int offset,
 319              int n_bits)
 320 {
 321    int byte_index = offset / 8;
 322    int bit_index = offset % 8;
 323    int n_bits_in_byte = MIN2(n_bits, 8 - bit_index);
 324    int result = 0;
 325    int bit = 0;
 326
 327    while (true) {
 328       result |= ((block[byte_index] >> bit_index) &
 329                  ((1 << n_bits_in_byte) - 1)) << bit;
 330
 331       n_bits -= n_bits_in_byte;
 332
 333       if (n_bits <= 0)
 334          return result;
 335
 336       bit += n_bits_in_byte;
 337       byte_index++;
 338       bit_index = 0;
 339       n_bits_in_byte = MIN2(n_bits, 8);
 340    }
 341 }
 342
 343 static uint8_t
 344 expand_component(uint8_t byte,
 345                  int n_bits)
 346 {
 347    /* Expands a n-bit quantity into a byte by copying the most-significant
 348     * bits into the unused least-significant bits.
 349     */
 350    return byte << (8 - n_bits) | (byte >> (2 * n_bits - 8));
 351 }
 352
 353 static int
 354 extract_unorm_endpoints(const struct bptc_unorm_mode *mode,
 355                         const uint8_t *block,
 356                         int bit_offset,
 357                         uint8_t endpoints[][4])
 358 {
 359    int component;
 360    int subset;
 361    int endpoint;
 362    int pbit;
 363    int n_components;
 364
 365    /* Extract each color component */
 366    for (component = 0; component < 3; component++) {
 367       for (subset = 0; subset < mode->n_subsets; subset++) {
 368          for (endpoint = 0; endpoint < 2; endpoint++) {
 369             endpoints[subset * 2 + endpoint][component] =
 370                extract_bits(block, bit_offset, mode->n_color_bits);
 371             bit_offset += mode->n_color_bits;
 372          }
 373       }
 374    }
 375
 376    /* Extract the alpha values */
 377    if (mode->n_alpha_bits > 0) {
 378       for (subset = 0; subset < mode->n_subsets; subset++) {
 379          for (endpoint = 0; endpoint < 2; endpoint++) {
 380             endpoints[subset * 2 + endpoint][3] =
 381                extract_bits(block, bit_offset, mode->n_alpha_bits);
 382             bit_offset += mode->n_alpha_bits;
 383          }
 384       }
 385
 386       n_components = 4;
 387    } else {
 388       for (subset = 0; subset < mode->n_subsets; subset++)
 389          for (endpoint = 0; endpoint < 2; endpoint++)
 390             endpoints[subset * 2 + endpoint][3] = 255;
 391
 392       n_components = 3;
 393    }
 394
 395    /* Add in the p-bits */
 396    if (mode->has_endpoint_pbits) {
 397       for (subset = 0; subset < mode->n_subsets; subset++) {
 398          for (endpoint = 0; endpoint < 2; endpoint++) {
 399             pbit = extract_bits(block, bit_offset, 1);
 400             bit_offset += 1;
 401
 402             for (component = 0; component < n_components; component++) {
 403                endpoints[subset * 2 + endpoint][component] <<= 1;
 404                endpoints[subset * 2 + endpoint][component] |= pbit;
 405             }
 406          }
 407       }
 408    } else if (mode->has_shared_pbits) {
 409       for (subset = 0; subset < mode->n_subsets; subset++) {
 410          pbit = extract_bits(block, bit_offset, 1);
 411          bit_offset += 1;
 412
 413          for (endpoint = 0; endpoint < 2; endpoint++) {
 414             for (component = 0; component < n_components; component++) {
 415                endpoints[subset * 2 + endpoint][component] <<= 1;
 416                endpoints[subset * 2 + endpoint][component] |= pbit;
 417             }
 418          }
 419       }
 420    }
 421
 422    /* Expand the n-bit values to a byte */
 423    for (subset = 0; subset < mode->n_subsets; subset++) {
 424       for (endpoint = 0; endpoint < 2; endpoint++) {
 425          for (component = 0; component < 3; component++) {
 426             endpoints[subset * 2 + endpoint][component] =
 427                expand_component(endpoints[subset * 2 + endpoint][component],
 428                                 mode->n_color_bits +
 429                                 mode->has_endpoint_pbits +
 430                                 mode->has_shared_pbits);
 431          }
 432
 433          if (mode->n_alpha_bits > 0) {
 434             endpoints[subset * 2 + endpoint][3] =
 435                expand_component(endpoints[subset * 2 + endpoint][3],
 436                                 mode->n_alpha_bits +
 437                                 mode->has_endpoint_pbits +
 438                                 mode->has_shared_pbits);
 439          }
 440       }
 441    }
 442
 443    return bit_offset;
 444 }
 445
 446 static bool
 447 is_anchor(int n_subsets,
 448           int partition_num,
 449           int texel)
 450 {
 451    if (texel == 0)
 452       return true;
 453
 454    switch (n_subsets) {
 455    case 1:
 456       return false;
 457    case 2:
 458       return anchor_indices[0][partition_num] == texel;
 459    case 3:
 460       return (anchor_indices[1][partition_num] == texel ||
 461               anchor_indices[2][partition_num] == texel);
 462    default:
 463       assert(false);
 464       return false;
 465    }
 466 }
 467
 468 static int
 469 count_anchors_before_texel(int n_subsets,
 470                            int partition_num,
 471                            int texel)
 472 {
 473    int count = 1;
 474
 475    if (texel == 0)
 476       return 0;
 477
 478    switch (n_subsets) {
 479    case 1:
 480       break;
 481    case 2:
 482       if (texel > anchor_indices[0][partition_num])
 483          count++;
 484       break;
 485    case 3:
 486       if (texel > anchor_indices[1][partition_num])
 487          count++;
 488       if (texel > anchor_indices[2][partition_num])
 489          count++;
 490       break;
 491    default:
 492       assert(false);
 493       return 0;
 494    }
 495
 496    return count;
 497 }
 498
 499 static int32_t
 500 interpolate(int32_t a, int32_t b,
 501             int index,
 502             int index_bits)
 503 {
 504    static const uint8_t weights2[] = { 0, 21, 43, 64 };
 505    static const uint8_t weights3[] = { 0, 9, 18, 27, 37, 46, 55, 64 };
 506    static const uint8_t weights4[] =
 507       { 0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64 };
 508    static const uint8_t *weights[] = {
 509       NULL, NULL, weights2, weights3, weights4
 510    };
 511    int weight;
 512
 513    weight = weights[index_bits][index];
 514
 515    return ((64 - weight) * a + weight * b + 32) >> 6;
 516 }
 517
 518 static void
 519 apply_rotation(int rotation,
 520                uint8_t *result)
 521 {
 522    uint8_t t;
 523
 524    if (rotation == 0)
 525       return;
 526
 527    rotation--;
 528
 529    t = result[rotation];
 530    result[rotation] = result[3];
 531    result[3] = t;
 532 }
 533
 534 static void
 535 fetch_rgba_unorm_from_block(const uint8_t *block,
 536                             uint8_t *result,
 537                             int texel)
 538 {
 539    int mode_num = ffs(block[0]);
 540    const struct bptc_unorm_mode *mode;
 541    int bit_offset, secondary_bit_offset;
 542    int partition_num;
 543    int subset_num;
 544    int rotation;
 545    int index_selection;
 546    int index_bits;
 547    int indices[2];
 548    int index;
 549    int anchors_before_texel;
 550    bool anchor;
 551    uint8_t endpoints[3 * 2][4];
 552    uint32_t subsets;
 553    int component;
 554
 555    if (mode_num == 0) {
 556       /* According to the spec this mode is reserved and shouldn't be used. */
 557       memset(result, 0, 3);
 558       result[3] = 0xff;
 559       return;
 560    }
 561
 562    mode = bptc_unorm_modes + mode_num - 1;
 563    bit_offset = mode_num;
 564
 565    partition_num = extract_bits(block, bit_offset, mode->n_partition_bits);
 566    bit_offset += mode->n_partition_bits;
 567
 568    switch (mode->n_subsets) {
 569    case 1:
 570       subsets = 0;
 571       break;
 572    case 2:
 573       subsets = partition_table1[partition_num];
 574       break;
 575    case 3:
 576       subsets = partition_table2[partition_num];
 577       break;
 578    default:
 579       assert(false);
 580       return;
 581    }
 582
 583    if (mode->has_rotation_bits) {
 584       rotation = extract_bits(block, bit_offset, 2);
 585       bit_offset += 2;
 586    } else {
 587       rotation = 0;
 588    }
 589
 590    if (mode->has_index_selection_bit) {
 591       index_selection = extract_bits(block, bit_offset, 1);
 592       bit_offset++;
 593    } else {
 594       index_selection = 0;
 595    }
 596
 597    bit_offset = extract_unorm_endpoints(mode, block, bit_offset, endpoints);
 598
 599    anchors_before_texel = count_anchors_before_texel(mode->n_subsets,
 600                                                      partition_num, texel);
 601
 602    /* Calculate the offset to the secondary index */
 603    secondary_bit_offset = (bit_offset +
 604                            BLOCK_SIZE * BLOCK_SIZE * mode->n_index_bits -
 605                            mode->n_subsets +
 606                            mode->n_secondary_index_bits * texel -
 607                            anchors_before_texel);
 608
 609    /* Calculate the offset to the primary index for this texel */
 610    bit_offset += mode->n_index_bits * texel - anchors_before_texel;
 611
 612    subset_num = (subsets >> (texel * 2)) & 3;
 613
 614    anchor = is_anchor(mode->n_subsets, partition_num, texel);
 615
 616    index_bits = mode->n_index_bits;
 617    if (anchor)
 618       index_bits--;
 619    indices[0] = extract_bits(block, bit_offset, index_bits);
 620
 621    if (mode->n_secondary_index_bits) {
 622       index_bits = mode->n_secondary_index_bits;
 623       if (anchor)
 624          index_bits--;
 625       indices[1] = extract_bits(block, secondary_bit_offset, index_bits);
 626    }
 627
 628    index = indices[index_selection];
 629    index_bits = (index_selection ?
 630                  mode->n_secondary_index_bits :
 631                  mode->n_index_bits);
 632
 633    for (component = 0; component < 3; component++)
 634       result[component] = interpolate(endpoints[subset_num * 2][component],
 635                                       endpoints[subset_num * 2 + 1][component],
 636                                       index,
 637                                       index_bits);
 638
 639    /* Alpha uses the opposite index from the color components */
 640    if (mode->n_secondary_index_bits && !index_selection) {
 641       index = indices[1];
 642       index_bits = mode->n_secondary_index_bits;
 643    } else {
 644       index = indices[0];
 645       index_bits = mode->n_index_bits;
 646    }
 647
 648    result[3] = interpolate(endpoints[subset_num * 2][3],
 649                            endpoints[subset_num * 2 + 1][3],
 650                            index,
 651                            index_bits);
 652
 653    apply_rotation(rotation, result);
 654 }
 655
 656 #ifdef BPTC_BLOCK_DECODE
 657 static void
 658 decompress_rgba_unorm_block(int src_width, int src_height,
 659                             const uint8_t *block,
 660                             uint8_t *dst_row, int dst_rowstride)
 661 {
 662    int mode_num = ffs(block[0]);
 663    const struct bptc_unorm_mode *mode;
 664    int bit_offset, secondary_bit_offset;
 665    int partition_num;
 666    int subset_num;
 667    int rotation;
 668    int index_selection;
 669    int index_bits;
 670    int indices[2];
 671    int index;
 672    int anchors_before_texel;
 673    bool anchor;
 674    uint8_t endpoints[3 * 2][4];
 675    uint32_t subsets;
 676    int component;
 677    unsigned x, y;
 678
 679    if (mode_num == 0) {
 680       /* According to the spec this mode is reserved and shouldn't be used. */
 681       for(y = 0; y < src_height; y += 1) {
 682          uint8_t *result = dst_row;
 683          memset(result, 0, 4 * src_width);
 684          for(x = 0; x < src_width; x += 1) {
 685             result[3] = 0xff;
 686             result += 4;
 687          }
 688          dst_row += dst_rowstride;
 689       }
 690       return;
 691    }
 692
 693    mode = bptc_unorm_modes + mode_num - 1;
 694    bit_offset = mode_num;
 695
 696    partition_num = extract_bits(block, bit_offset, mode->n_partition_bits);
 697    bit_offset += mode->n_partition_bits;
 698
 699    switch (mode->n_subsets) {
 700    case 1:
 701       subsets = 0;
 702       break;
 703    case 2:
 704       subsets = partition_table1[partition_num];
 705       break;
 706    case 3:
 707       subsets = partition_table2[partition_num];
 708       break;
 709    default:
 710       assert(false);
 711       return;
 712    }
 713
 714    if (mode->has_rotation_bits) {
 715       rotation = extract_bits(block, bit_offset, 2);
 716       bit_offset += 2;
 717    } else {
 718       rotation = 0;
 719    }
 720
 721    if (mode->has_index_selection_bit) {
 722       index_selection = extract_bits(block, bit_offset, 1);
 723       bit_offset++;
 724    } else {
 725       index_selection = 0;
 726    }
 727
 728    bit_offset = extract_unorm_endpoints(mode, block, bit_offset, endpoints);
 729
 730    for(y = 0; y < src_height; y += 1) {
 731       uint8_t *result = dst_row;
 732       for(x = 0; x < src_width; x += 1) {
 733          int texel;
 734          texel = x + y * 4;
 735
 736          anchors_before_texel = count_anchors_before_texel(mode->n_subsets,
 737                                                            partition_num,
 738                                                            texel);
 739
 740          /* Calculate the offset to the secondary index */
 741          secondary_bit_offset = (bit_offset +
 742                                  BLOCK_SIZE * BLOCK_SIZE * mode->n_index_bits -
 743                                  mode->n_subsets +
 744                                  mode->n_secondary_index_bits * texel -
 745                                  anchors_before_texel);
 746
 747          /* Calculate the offset to the primary index for this texel */
 748          bit_offset += mode->n_index_bits * texel - anchors_before_texel;
 749
 750          subset_num = (subsets >> (texel * 2)) & 3;
 751
 752          anchor = is_anchor(mode->n_subsets, partition_num, texel);
 753
 754          index_bits = mode->n_index_bits;
 755          if (anchor)
 756             index_bits--;
 757          indices[0] = extract_bits(block, bit_offset, index_bits);
 758
 759          if (mode->n_secondary_index_bits) {
 760             index_bits = mode->n_secondary_index_bits;
 761             if (anchor)
 762                index_bits--;
 763             indices[1] = extract_bits(block, secondary_bit_offset, index_bits);
 764          }
 765
 766          index = indices[index_selection];
 767          index_bits = (index_selection ?
 768                        mode->n_secondary_index_bits :
 769                        mode->n_index_bits);
 770
 771          for (component = 0; component < 3; component++)
 772             result[component] = interpolate(endpoints[subset_num * 2][component],
 773                                             endpoints[subset_num * 2 + 1][component],
 774                                             index,
 775                                             index_bits);
 776
 777          /* Alpha uses the opposite index from the color components */
 778          if (mode->n_secondary_index_bits && !index_selection) {
 779             index = indices[1];
 780             index_bits = mode->n_secondary_index_bits;
 781          } else {
 782             index = indices[0];
 783             index_bits = mode->n_index_bits;
 784          }
 785
 786          result[3] = interpolate(endpoints[subset_num * 2][3],
 787                                  endpoints[subset_num * 2 + 1][3],
 788                                  index,
 789                                  index_bits);
 790
 791          apply_rotation(rotation, result);
 792          result += 4;
 793       }
 794       dst_row += dst_rowstride;
 795    }
 796 }
 797
 798 static void
 799 decompress_rgba_unorm(int width, int height,
 800                       const uint8_t *src, int src_rowstride,
 801                       uint8_t *dst, int dst_rowstride)
 802 {
 803    int src_row_diff;
 804    int y, x;
 805
 806    if (src_rowstride >= width * 4)
 807       src_row_diff = src_rowstride - ((width + 3) & ~3) * 4;
 808    else
 809       src_row_diff = 0;
 810
 811    for (y = 0; y < height; y += BLOCK_SIZE) {
 812       for (x = 0; x < width; x += BLOCK_SIZE) {
 813          decompress_rgba_unorm_block(MIN2(width - x, BLOCK_SIZE),
 814                                      MIN2(height - y, BLOCK_SIZE),
 815                                      src,
 816                                      dst + x * 4 + y * dst_rowstride,
 817                                      dst_rowstride);
 818          src += BLOCK_BYTES;
 819       }
 820       src += src_row_diff;
 821    }
 822 }
 823 #endif // BPTC_BLOCK_DECODE
 824
 825 static int32_t
 826 sign_extend(int32_t value,
 827             int n_bits)
 828 {
 829    assert(n_bits > 0 && n_bits < 32);
 830
 831    const unsigned n = 32 - n_bits;
 832    return (int32_t)((uint32_t)value << n) >> n;
 833 }
 834
 835 static int
 836 signed_unquantize(int value, int n_endpoint_bits)
 837 {
 838    bool sign;
 839
 840    if (n_endpoint_bits >= 16)
 841       return value;
 842
 843    if (value == 0)
 844       return 0;
 845
 846    sign = false;
 847
 848    if (value < 0) {
 849       sign = true;
 850       value = -value;
 851    }
 852
 853    if (value >= (1 << (n_endpoint_bits - 1)) - 1)
 854       value = 0x7fff;
 855    else
 856       value = ((value << 15) + 0x4000) >> (n_endpoint_bits - 1);
 857
 858    if (sign)
 859       value = -value;
 860
 861    return value;
 862 }
 863
 864 static int
 865 unsigned_unquantize(int value, int n_endpoint_bits)
 866 {
 867    if (n_endpoint_bits >= 15)
 868       return value;
 869
 870    if (value == 0)
 871       return 0;
 872
 873    if (value == (1 << n_endpoint_bits) - 1)
 874       return 0xffff;
 875
 876    return ((value << 15) + 0x4000) >> (n_endpoint_bits - 1);
 877 }
 878
 879 static int
 880 extract_float_endpoints(const struct bptc_float_mode *mode,
 881                         const uint8_t *block,
 882                         int bit_offset,
 883                         int32_t endpoints[][3],
 884                         bool is_signed)
 885 {
 886    const struct bptc_float_bitfield *bitfield;
 887    int endpoint, component;
 888    int n_endpoints;
 889    int value;
 890    int i;
 891
 892    if (mode->n_partition_bits)
 893       n_endpoints = 4;
 894    else
 895       n_endpoints = 2;
 896
 897    memset(endpoints, 0, sizeof endpoints[0][0] * n_endpoints * 3);
 898
 899    for (bitfield = mode->bitfields; bitfield->endpoint != -1; bitfield++) {
 900       value = extract_bits(block, bit_offset, bitfield->n_bits);
 901       bit_offset += bitfield->n_bits;
 902
 903       if (bitfield->reverse) {
 904          for (i = 0; i < bitfield->n_bits; i++) {
 905             if (value & (1 << i))
 906                endpoints[bitfield->endpoint][bitfield->component] |=
 907                   1 << ((bitfield->n_bits - 1 - i) + bitfield->offset);
 908          }
 909       } else {
 910          endpoints[bitfield->endpoint][bitfield->component] |=
 911             value << bitfield->offset;
 912       }
 913    }
 914
 915    if (mode->transformed_endpoints) {
 916       /* The endpoints are specified as signed offsets from e0 */
 917       for (endpoint = 1; endpoint < n_endpoints; endpoint++) {
 918          for (component = 0; component < 3; component++) {
 919             value = sign_extend(endpoints[endpoint][component],
 920                                 mode->n_delta_bits[component]);
 921             endpoints[endpoint][component] =
 922                ((endpoints[0][component] + value) &
 923                 ((1 << mode->n_endpoint_bits) - 1));
 924          }
 925       }
 926    }
 927
 928    if (is_signed) {
 929       for (endpoint = 0; endpoint < n_endpoints; endpoint++) {
 930          for (component = 0; component < 3; component++) {
 931             value = sign_extend(endpoints[endpoint][component],
 932                                 mode->n_endpoint_bits);
 933             endpoints[endpoint][component] =
 934                signed_unquantize(value, mode->n_endpoint_bits);
 935          }
 936       }
 937    } else {
 938       for (endpoint = 0; endpoint < n_endpoints; endpoint++) {
 939          for (component = 0; component < 3; component++) {
 940             endpoints[endpoint][component] =
 941                unsigned_unquantize(endpoints[endpoint][component],
 942                                    mode->n_endpoint_bits);
 943          }
 944       }
 945    }
 946
 947    return bit_offset;
 948 }
 949
 950 static int32_t
 951 finish_unsigned_unquantize(int32_t value)
 952 {
 953    return value * 31 / 64;
 954 }
 955
 956 static int32_t
 957 finish_signed_unquantize(int32_t value)
 958 {
 959    if (value < 0)
 960       return (-value * 31 / 32) | 0x8000;
 961    else
 962       return value * 31 / 32;
 963 }
 964
 965 static void
 966 fetch_rgb_float_from_block(const uint8_t *block,
 967                            float *result,
 968                            int texel,
 969                            bool is_signed)
 970 {
 971    int mode_num;
 972    const struct bptc_float_mode *mode;
 973    int bit_offset;
 974    int partition_num;
 975    int subset_num;
 976    int index_bits;
 977    int index;
 978    int anchors_before_texel;
 979    int32_t endpoints[2 * 2][3];
 980    uint32_t subsets;
 981    int n_subsets;
 982    int component;
 983    int32_t value;
 984
 985    if (block[0] & 0x2) {
 986       mode_num = (((block[0] >> 1) & 0xe) | (block[0] & 1)) + 2;
 987       bit_offset = 5;
 988    } else {
 989       mode_num = block[0] & 3;
 990       bit_offset = 2;
 991    }
 992
 993    mode = bptc_float_modes + mode_num;
 994
 995    if (mode->reserved) {
 996       memset(result, 0, sizeof result[0] * 3);
 997       result[3] = 1.0f;
 998       return;
 999    }
1000
1001    bit_offset = extract_float_endpoints(mode, block, bit_offset,
1002                                         endpoints, is_signed);
1003
1004    if (mode->n_partition_bits) {
1005       partition_num = extract_bits(block, bit_offset, mode->n_partition_bits);
1006       bit_offset += mode->n_partition_bits;
1007
1008       subsets = partition_table1[partition_num];
1009       n_subsets = 2;
1010    } else {
1011       partition_num = 0;
1012       subsets = 0;
1013       n_subsets = 1;
1014    }
1015
1016    anchors_before_texel =
1017       count_anchors_before_texel(n_subsets, partition_num, texel);
1018
1019    /* Calculate the offset to the primary index for this texel */
1020    bit_offset += mode->n_index_bits * texel - anchors_before_texel;
1021
1022    subset_num = (subsets >> (texel * 2)) & 3;
1023
1024    index_bits = mode->n_index_bits;
1025    if (is_anchor(n_subsets, partition_num, texel))
1026       index_bits--;
1027    index = extract_bits(block, bit_offset, index_bits);
1028
1029    for (component = 0; component < 3; component++) {
1030       value = interpolate(endpoints[subset_num * 2][component],
1031                           endpoints[subset_num * 2 + 1][component],
1032                           index,
1033                           mode->n_index_bits);
1034
1035       if (is_signed)
1036          value = finish_signed_unquantize(value);
1037       else
1038          value = finish_unsigned_unquantize(value);
1039
1040       result[component] = _mesa_half_to_float(value);
1041    }
1042
1043    result[3] = 1.0f;
1044 }
1045
1046 #ifdef BPTC_BLOCK_DECODE
1047 static void
1048 decompress_rgb_float_block(unsigned src_width, unsigned src_height,
1049                            const uint8_t *block,
1050                            float *dst_row, unsigned dst_rowstride,
1051                            bool is_signed)
1052 {
1053    int mode_num;
1054    const struct bptc_float_mode *mode;
1055    int bit_offset;
1056    int partition_num;
1057    int subset_num;
1058    int index_bits;
1059    int index;
1060    int anchors_before_texel;
1061    int32_t endpoints[2 * 2][3];
1062    uint32_t subsets;
1063    int n_subsets;
1064    int component;
1065    int32_t value;
1066    unsigned x, y;
1067
1068    if (block[0] & 0x2) {
1069       mode_num = (((block[0] >> 1) & 0xe) | (block[0] & 1)) + 2;
1070       bit_offset = 5;
1071    } else {
1072       mode_num = block[0] & 3;
1073       bit_offset = 2;
1074    }
1075
1076    mode = bptc_float_modes + mode_num;
1077
1078    if (mode->reserved) {
1079       for(y = 0; y < src_height; y += 1) {
1080          float *result = dst_row;
1081          memset(result, 0, sizeof result[0] * 4 * src_width);
1082          for(x = 0; x < src_width; x += 1) {
1083             result[3] = 1.0f;
1084             result += 4;
1085          }
1086          dst_row += dst_rowstride / sizeof dst_row[0];
1087       }
1088       return;
1089    }
1090
1091    bit_offset = extract_float_endpoints(mode, block, bit_offset,
1092                                         endpoints, is_signed);
1093
1094    if (mode->n_partition_bits) {
1095       partition_num = extract_bits(block, bit_offset, mode->n_partition_bits);
1096       bit_offset += mode->n_partition_bits;
1097
1098       subsets = partition_table1[partition_num];
1099       n_subsets = 2;
1100    } else {
1101       partition_num = 0;
1102       subsets = 0;
1103       n_subsets = 1;
1104    }
1105
1106    for(y = 0; y < src_height; y += 1) {
1107       float *result = dst_row;
1108       for(x = 0; x < src_width; x += 1) {
1109          int texel;
1110
1111          texel = x + y * 4;
1112
1113          anchors_before_texel =
1114             count_anchors_before_texel(n_subsets, partition_num, texel);
1115
1116          /* Calculate the offset to the primary index for this texel */
1117          bit_offset += mode->n_index_bits * texel - anchors_before_texel;
1118
1119          subset_num = (subsets >> (texel * 2)) & 3;
1120
1121          index_bits = mode->n_index_bits;
1122          if (is_anchor(n_subsets, partition_num, texel))
1123             index_bits--;
1124          index = extract_bits(block, bit_offset, index_bits);
1125
1126          for (component = 0; component < 3; component++) {
1127             value = interpolate(endpoints[subset_num * 2][component],
1128                                 endpoints[subset_num * 2 + 1][component],
1129                                 index,
1130                                 mode->n_index_bits);
1131
1132             if (is_signed)
1133                value = finish_signed_unquantize(value);
1134             else
1135                value = finish_unsigned_unquantize(value);
1136
1137             result[component] = _mesa_half_to_float(value);
1138          }
1139
1140          result[3] = 1.0f;
1141          result += 4;
1142       }
1143       dst_row += dst_rowstride / sizeof dst_row[0];
1144    }
1145 }
1146
1147 static void
1148 decompress_rgb_float(int width, int height,
1149                       const uint8_t *src, int src_rowstride,
1150                       float *dst, int dst_rowstride, bool is_signed)
1151 {
1152    int src_row_diff;
1153    int y, x;
1154
1155    if (src_rowstride >= width * 4)
1156       src_row_diff = src_rowstride - ((width + 3) & ~3) * 4;
1157    else
1158       src_row_diff = 0;
1159
1160    for (y = 0; y < height; y += BLOCK_SIZE) {
1161       for (x = 0; x < width; x += BLOCK_SIZE) {
1162          decompress_rgb_float_block(MIN2(width - x, BLOCK_SIZE),
1163                                     MIN2(height - y, BLOCK_SIZE),
1164                                     src,
1165                                     (dst + x * 4 +
1166                                      (y * dst_rowstride / sizeof dst[0])),
1167                                     dst_rowstride, is_signed);
1168          src += BLOCK_BYTES;
1169       }
1170       src += src_row_diff;
1171    }
1172 }
1173 #endif // BPTC_BLOCK_DECODE
1174
1175 static void
1176 write_bits(struct bit_writer *writer, int n_bits, int value)
1177 {
1178    do {
1179       if (n_bits + writer->pos >= 8) {
1180          *(writer->dst++) = writer->buf | (value << writer->pos);
1181          writer->buf = 0;
1182          value >>= (8 - writer->pos);
1183          n_bits -= (8 - writer->pos);
1184          writer->pos = 0;
1185       } else {
1186          writer->buf |= value << writer->pos;
1187          writer->pos += n_bits;
1188          break;
1189       }
1190    } while (n_bits > 0);
1191 }
1192
/**
 * Compute the average luminance and average alpha of a region of RGBA8
 * texels.
 *
 * Luminance here is simply the sum R + G + B of a texel.  Both averages use
 * truncating integer division by width * height.
 *
 * \param src_rowstride  bytes from one texel row to the next
 */
static void
get_average_luminance_alpha_unorm(int width, int height,
                                  const uint8_t *src, int src_rowstride,
                                  int *average_luminance, int *average_alpha)
{
   const int n_texels = width * height;
   int luminance_total = 0;
   int alpha_total = 0;
   int row, col;

   for (row = 0; row < height; row++) {
      const uint8_t *texel = src + row * src_rowstride;

      for (col = 0; col < width; col++, texel += 4) {
         luminance_total += texel[0] + texel[1] + texel[2];
         alpha_total += texel[3];
      }
   }

   *average_luminance = luminance_total / n_texels;
   *average_alpha = alpha_total / n_texels;
}
1213
/**
 * Choose two RGBA endpoints for a region of RGBA8 texels.
 *
 * The texels are split into two groups by comparing their luminance
 * (R + G + B) with \p average_luminance; the colour endpoints are the
 * per-component averages of each group.  The alpha endpoints are derived the
 * same way, independently, by comparing each texel's alpha with
 * \p average_alpha.  If one of the groups is empty both endpoints are set to
 * the average over the whole region.
 *
 * Afterwards the endpoints are swapped where necessary so that the first
 * texel lies on the endpoint-0 side of the midpoint, which guarantees that
 * the most-significant bit of the first index is zero.
 *
 * \param src_rowstride  bytes from one texel row to the next
 * \param endpoints      receives endpoints[0] and endpoints[1] as RGBA
 */
static void
get_rgba_endpoints_unorm(int width, int height,
                         const uint8_t *src, int src_rowstride,
                         int average_luminance, int average_alpha,
                         uint8_t endpoints[][4])
{
   int endpoint_luminances[2];
   int midpoint;
   int sums[2][4];
   int endpoint;
   int luminance;
   uint8_t temp[3];
   const uint8_t *p = src;
   int rgb_left_endpoint_count = 0;
   int alpha_left_endpoint_count = 0;
   int y, x, i;

   memset(sums, 0, sizeof sums);

   for (y = 0; y < height; y++) {
      for (x = 0; x < width; x++) {
         luminance = p[0] + p[1] + p[2];
         if (luminance < average_luminance) {
            endpoint = 0;
            rgb_left_endpoint_count++;
         } else {
            endpoint = 1;
         }
         for (i = 0; i < 3; i++)
            sums[endpoint][i] += p[i];

         /* Partition on the alpha component itself (p[3]); comparing any
          * other component against the average alpha would group the texels
          * by unrelated data. */
         if (p[3] < average_alpha) {
            endpoint = 0;
            alpha_left_endpoint_count++;
         } else {
            endpoint = 1;
         }
         sums[endpoint][3] += p[3];

         p += 4;
      }

      p += src_rowstride - width * 4;
   }

   if (rgb_left_endpoint_count == 0 ||
       rgb_left_endpoint_count == width * height) {
      for (i = 0; i < 3; i++)
         endpoints[0][i] = endpoints[1][i] =
            (sums[0][i] + sums[1][i]) / (width * height);
   } else {
      for (i = 0; i < 3; i++) {
         endpoints[0][i] = sums[0][i] / rgb_left_endpoint_count;
         endpoints[1][i] = (sums[1][i] /
                            (width * height - rgb_left_endpoint_count));
      }
   }

   if (alpha_left_endpoint_count == 0 ||
       alpha_left_endpoint_count == width * height) {
      endpoints[0][3] = endpoints[1][3] =
         (sums[0][3] + sums[1][3]) / (width * height);
   } else {
      endpoints[0][3] = sums[0][3] / alpha_left_endpoint_count;
      endpoints[1][3] = (sums[1][3] /
                         (width * height - alpha_left_endpoint_count));
   }

   /* We may need to swap the endpoints to ensure the most-significant bit of
    * the first index is zero */

   for (endpoint = 0; endpoint < 2; endpoint++) {
      endpoint_luminances[endpoint] =
         endpoints[endpoint][0] +
         endpoints[endpoint][1] +
         endpoints[endpoint][2];
   }
   midpoint = (endpoint_luminances[0] + endpoint_luminances[1]) / 2;

   if ((src[0] + src[1] + src[2] <= midpoint) !=
       (endpoint_luminances[0] <= midpoint)) {
      memcpy(temp, endpoints[0], 3);
      memcpy(endpoints[0], endpoints[1], 3);
      memcpy(endpoints[1], temp, 3);
   }

   /* Same for the alpha endpoints */

   midpoint = (endpoints[0][3] + endpoints[1][3]) / 2;

   if ((src[3] <= midpoint) != (endpoints[0][3] <= midpoint)) {
      temp[0] = endpoints[0][3];
      endpoints[0][3] = endpoints[1][3];
      endpoints[1][3] = temp[0];
   }
}
1310
1311 static void
1312 write_rgb_indices_unorm(struct bit_writer *writer,
1313                         int src_width, int src_height,
1314                         const uint8_t *src, int src_rowstride,
1315                         uint8_t endpoints[][4])
1316 {
1317    int luminance;
1318    int endpoint_luminances[2];
1319    int endpoint;
1320    int index;
1321    int y, x;
1322
1323    for (endpoint = 0; endpoint < 2; endpoint++) {
1324       endpoint_luminances[endpoint] =
1325          endpoints[endpoint][0] +
1326          endpoints[endpoint][1] +
1327          endpoints[endpoint][2];
1328    }
1329
1330    /* If the endpoints have the same luminance then we'll just use index 0 for
1331     * all of the texels */
1332    if (endpoint_luminances[0] == endpoint_luminances[1]) {
1333       write_bits(writer, BLOCK_SIZE * BLOCK_SIZE * 2 - 1, 0);
1334       return;
1335    }
1336
1337    for (y = 0; y < src_height; y++) {
1338       for (x = 0; x < src_width; x++) {
1339          luminance = src[0] + src[1] + src[2];
1340
1341          index = ((luminance - endpoint_luminances[0]) * 3 /
1342                   (endpoint_luminances[1] - endpoint_luminances[0]));
1343          if (index < 0)
1344             index = 0;
1345          else if (index > 3)
1346             index = 3;
1347
1348          assert(x != 0 || y != 0 || index < 2);
1349
1350          write_bits(writer, (x == 0 && y == 0) ? 1 : 2, index);
1351
1352          src += 4;
1353       }
1354
1355       /* Pad the indices out to the block size */
1356       if (src_width < BLOCK_SIZE)
1357          write_bits(writer, 2 * (BLOCK_SIZE - src_width), 0);
1358
1359       src += src_rowstride - src_width * 4;
1360    }
1361
1362    /* Pad the indices out to the block size */
1363    if (src_height < BLOCK_SIZE)
1364       write_bits(writer, 2 * BLOCK_SIZE * (BLOCK_SIZE - src_height), 0);
1365 }
1366
1367 static void
1368 write_alpha_indices_unorm(struct bit_writer *writer,
1369                           int src_width, int src_height,
1370                           const uint8_t *src, int src_rowstride,
1371                           uint8_t endpoints[][4])
1372 {
1373    int index;
1374    int y, x;
1375
1376    /* If the endpoints have the same alpha then we'll just use index 0 for
1377     * all of the texels */
1378    if (endpoints[0][3] == endpoints[1][3]) {
1379       write_bits(writer, BLOCK_SIZE * BLOCK_SIZE * 3 - 1, 0);
1380       return;
1381    }
1382
1383    for (y = 0; y < src_height; y++) {
1384       for (x = 0; x < src_width; x++) {
1385          index = (((int) src[3] - (int) endpoints[0][3]) * 7 /
1386                   ((int) endpoints[1][3] - endpoints[0][3]));
1387          if (index < 0)
1388             index = 0;
1389          else if (index > 7)
1390             index = 7;
1391
1392          assert(x != 0 || y != 0 || index < 4);
1393
1394          /* The first index has one less bit */
1395          write_bits(writer, (x == 0 && y == 0) ? 2 : 3, index);
1396
1397          src += 4;
1398       }
1399
1400       /* Pad the indices out to the block size */
1401       if (src_width < BLOCK_SIZE)
1402          write_bits(writer, 3 * (BLOCK_SIZE - src_width), 0);
1403
1404       src += src_rowstride - src_width * 4;
1405    }
1406
1407    /* Pad the indices out to the block size */
1408    if (src_height < BLOCK_SIZE)
1409       write_bits(writer, 3 * BLOCK_SIZE * (BLOCK_SIZE - src_height), 0);
1410 }
1411
1412 static void
1413 compress_rgba_unorm_block(int src_width, int src_height,
1414                           const uint8_t *src, int src_rowstride,
1415                           uint8_t *dst)
1416 {
1417    int average_luminance, average_alpha;
1418    uint8_t endpoints[2][4];
1419    struct bit_writer writer;
1420    int component, endpoint;
1421
1422    get_average_luminance_alpha_unorm(src_width, src_height, src, src_rowstride,
1423                                      &average_luminance, &average_alpha);
1424    get_rgba_endpoints_unorm(src_width, src_height, src, src_rowstride,
1425                             average_luminance, average_alpha,
1426                             endpoints);
1427
1428    writer.dst = dst;
1429    writer.pos = 0;
1430    writer.buf = 0;
1431
1432    write_bits(&writer, 5, 0x10); /* mode 4 */
1433    write_bits(&writer, 2, 0); /* rotation 0 */
1434    write_bits(&writer, 1, 0); /* index selection bit */
1435
1436    /* Write the color endpoints */
1437    for (component = 0; component < 3; component++)
1438       for (endpoint = 0; endpoint < 2; endpoint++)
1439          write_bits(&writer, 5, endpoints[endpoint][component] >> 3);
1440
1441    /* Write the alpha endpoints */
1442    for (endpoint = 0; endpoint < 2; endpoint++)
1443       write_bits(&writer, 6, endpoints[endpoint][3] >> 2);
1444
1445    write_rgb_indices_unorm(&writer,
1446                            src_width, src_height,
1447                            src, src_rowstride,
1448                            endpoints);
1449    write_alpha_indices_unorm(&writer,
1450                              src_width, src_height,
1451                              src, src_rowstride,
1452                              endpoints);
1453 }
1454
1455 static void
1456 compress_rgba_unorm(int width, int height,
1457                     const uint8_t *src, int src_rowstride,
1458                     uint8_t *dst, int dst_rowstride)
1459 {
1460    int dst_row_diff;
1461    int y, x;
1462
1463    if (dst_rowstride >= width * 4)
1464       dst_row_diff = dst_rowstride - ((width + 3) & ~3) * 4;
1465    else
1466       dst_row_diff = 0;
1467
1468    for (y = 0; y < height; y += BLOCK_SIZE) {
1469       for (x = 0; x < width; x += BLOCK_SIZE) {
1470          compress_rgba_unorm_block(MIN2(width - x, BLOCK_SIZE),
1471                                    MIN2(height - y, BLOCK_SIZE),
1472                                    src + x * 4 + y * src_rowstride,
1473                                    src_rowstride,
1474                                    dst);
1475          dst += BLOCK_BYTES;
1476       }
1477       dst += dst_row_diff;
1478    }
1479 }
1480
/**
 * Return the average luminance (R + G + B) of a region of RGB float texels.
 *
 * \param src            three floats per texel
 * \param src_rowstride  bytes from one texel row to the next
 */
static float
get_average_luminance_float(int width, int height,
                            const float *src, int src_rowstride)
{
   const float *texel = src;
   float total = 0;
   int row, col;

   for (row = 0; row < height; row++) {
      for (col = 0; col < width; col++, texel += 3)
         total += texel[0] + texel[1] + texel[2];

      /* Skip the padding at the end of the row, converted from bytes to
       * floats */
      texel += (src_rowstride - width * 3 * sizeof (float)) / sizeof (float);
   }

   return total / (width * height);
}
1498
/**
 * Clamp \p value to [-65504, 65504] when \p is_signed is set and to
 * [0, 65504] otherwise.
 */
static float
clamp_value(float value, bool is_signed)
{
   const float upper = 65504.0f;
   const float lower = is_signed ? -65504.0f : 0.0f;

   if (value > upper)
      return upper;

   if (value < lower)
      return lower;

   return value;
}
1517
/**
 * Choose two RGB endpoints for a region of RGB float texels.
 *
 * The texels are split into two groups by comparing their luminance
 * (R + G + B) with \p average_luminance, and each endpoint is the
 * per-component average of one group.  If one of the groups is empty both
 * endpoints are set to the average over the whole region.  The endpoints are
 * then clamped with clamp_value() and, if necessary, swapped so that the
 * first texel lies on the endpoint-0 side of the midpoint.
 *
 * \param src            three floats per texel
 * \param src_rowstride  bytes from one texel row to the next
 */
static void
get_endpoints_float(int width, int height,
                    const float *src, int src_rowstride,
                    float average_luminance, float endpoints[][3],
                    bool is_signed)
{
   const int n_texels = width * height;
   float totals[2][3] = { { 0.0f } };
   float group_luminance[2];
   float midpoint;
   float saved[3];
   const float *texel = src;
   int n_dark = 0;
   int row, col, side, c;

   for (row = 0; row < height; row++) {
      for (col = 0; col < width; col++, texel += 3) {
         const float luminance = texel[0] + texel[1] + texel[2];

         if (luminance < average_luminance) {
            side = 0;
            n_dark++;
         } else {
            side = 1;
         }

         for (c = 0; c < 3; c++)
            totals[side][c] += texel[c];
      }

      texel += (src_rowstride - width * 3 * sizeof (float)) / sizeof (float);
   }

   for (c = 0; c < 3; c++) {
      if (n_dark == 0 || n_dark == n_texels) {
         endpoints[0][c] = endpoints[1][c] =
            (totals[0][c] + totals[1][c]) / n_texels;
      } else {
         endpoints[0][c] = totals[0][c] / n_dark;
         endpoints[1][c] = totals[1][c] / (n_texels - n_dark);
      }
   }

   /* Clamp the endpoints to the range of a half float and strip out
    * infinities */
   for (side = 0; side < 2; side++) {
      for (c = 0; c < 3; c++)
         endpoints[side][c] = clamp_value(endpoints[side][c], is_signed);
   }

   /* We may need to swap the endpoints to ensure the most-significant bit of
    * the first index is zero */
   for (side = 0; side < 2; side++) {
      group_luminance[side] =
         endpoints[side][0] +
         endpoints[side][1] +
         endpoints[side][2];
   }
   midpoint = (group_luminance[0] + group_luminance[1]) / 2.0f;

   if ((src[0] + src[1] + src[2] <= midpoint) !=
       (group_luminance[0] <= midpoint)) {
      memcpy(saved, endpoints[0], sizeof saved);
      memcpy(endpoints[0], endpoints[1], sizeof saved);
      memcpy(endpoints[1], saved, sizeof saved);
   }
}
1593
1594 static void
1595 write_rgb_indices_float(struct bit_writer *writer,
1596                         int src_width, int src_height,
1597                         const float *src, int src_rowstride,
1598                         float endpoints[][3])
1599 {
1600    float luminance;
1601    float endpoint_luminances[2];
1602    int endpoint;
1603    int index;
1604    int y, x;
1605
1606    for (endpoint = 0; endpoint < 2; endpoint++) {
1607       endpoint_luminances[endpoint] =
1608          endpoints[endpoint][0] +
1609          endpoints[endpoint][1] +
1610          endpoints[endpoint][2];
1611    }
1612
1613    /* If the endpoints have the same luminance then we'll just use index 0 for
1614     * all of the texels */
1615    if (endpoint_luminances[0] == endpoint_luminances[1]) {
1616       write_bits(writer, BLOCK_SIZE * BLOCK_SIZE * 4 - 1, 0);
1617       return;
1618    }
1619
1620    for (y = 0; y < src_height; y++) {
1621       for (x = 0; x < src_width; x++) {
1622          luminance = src[0] + src[1] + src[2];
1623
1624          index = ((luminance - endpoint_luminances[0]) * 15 /
1625                   (endpoint_luminances[1] - endpoint_luminances[0]));
1626          if (index < 0)
1627             index = 0;
1628          else if (index > 15)
1629             index = 15;
1630
1631          assert(x != 0 || y != 0 || index < 8);
1632
1633          write_bits(writer, (x == 0 && y == 0) ? 3 : 4, index);
1634
1635          src += 3;
1636       }
1637
1638       /* Pad the indices out to the block size */
1639       if (src_width < BLOCK_SIZE)
1640          write_bits(writer, 4 * (BLOCK_SIZE - src_width), 0);
1641
1642       src += (src_rowstride - src_width * 3 * sizeof (float)) / sizeof (float);
1643    }
1644
1645    /* Pad the indices out to the block size */
1646    if (src_height < BLOCK_SIZE)
1647       write_bits(writer, 4 * BLOCK_SIZE * (BLOCK_SIZE - src_height), 0);
1648 }
1649
/**
 * Convert a float endpoint component into the 10-bit value stored in the
 * block.
 *
 * The value is converted to a half float, rescaled (by 32/31 for signed
 * formats, 64/31 for unsigned ones) and reduced by dropping the low six
 * bits.  For signed formats a negative input yields the two's complement of
 * the magnitude masked to 10 bits.  For unsigned formats any input that is
 * not greater than zero yields 0.
 */
static int
get_endpoint_value(float value, bool is_signed)
{
   int half;

   if (!is_signed) {
      if (value <= 0.0f)
         return 0;

      half = _mesa_float_to_half(value);

      return (64 * half / 31) >> 6;
   }

   half = _mesa_float_to_half(value);

   if (half & 0x8000) {
      /* Negative: quantize the magnitude, then negate within 10 bits */
      half = (32 * (half & 0x7fff) / 31) >> 6;
      return -half & ((1 << 10) - 1);
   }

   return (32 * half / 31) >> 6;
}
1679
1680 static void
1681 compress_rgb_float_block(int src_width, int src_height,
1682                          const float *src, int src_rowstride,
1683                          uint8_t *dst,
1684                          bool is_signed)
1685 {
1686    float average_luminance;
1687    float endpoints[2][3];
1688    struct bit_writer writer;
1689    int component, endpoint;
1690    int endpoint_value;
1691
1692    average_luminance =
1693       get_average_luminance_float(src_width, src_height, src, src_rowstride);
1694    get_endpoints_float(src_width, src_height, src, src_rowstride,
1695                        average_luminance, endpoints, is_signed);
1696
1697    writer.dst = dst;
1698    writer.pos = 0;
1699    writer.buf = 0;
1700
1701    write_bits(&writer, 5, 3); /* mode 3 */
1702
1703    /* Write the endpoints */
1704    for (endpoint = 0; endpoint < 2; endpoint++) {
1705       for (component = 0; component < 3; component++) {
1706          endpoint_value =
1707             get_endpoint_value(endpoints[endpoint][component], is_signed);
1708          write_bits(&writer, 10, endpoint_value);
1709       }
1710    }
1711
1712    write_rgb_indices_float(&writer,
1713                            src_width, src_height,
1714                            src, src_rowstride,
1715                            endpoints);
1716 }
1717
1718 static void
1719 compress_rgb_float(int width, int height,
1720                    const float *src, int src_rowstride,
1721                    uint8_t *dst, int dst_rowstride,
1722                    bool is_signed)
1723 {
1724    int dst_row_diff;
1725    int y, x;
1726
1727    if (dst_rowstride >= width * 4)
1728       dst_row_diff = dst_rowstride - ((width + 3) & ~3) * 4;
1729    else
1730       dst_row_diff = 0;
1731
1732    for (y = 0; y < height; y += BLOCK_SIZE) {
1733       for (x = 0; x < width; x += BLOCK_SIZE) {
1734          compress_rgb_float_block(MIN2(width - x, BLOCK_SIZE),
1735                                   MIN2(height - y, BLOCK_SIZE),
1736                                   src + x * 3 +
1737                                   y * src_rowstride / sizeof (float),
1738                                   src_rowstride,
1739                                   dst,
1740                                   is_signed);
1741          dst += BLOCK_BYTES;
1742       }
1743       dst += dst_row_diff;
1744    }
1745 }
1746
1747 #endif