src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c

   1 /**************************************************************************
   2  *
   3  * Copyright 2010-2018 VMware, Inc.
   4  * All Rights Reserved.
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a
   7  * copy of this software and associated documentation files (the
   8  * "Software"), to deal in the Software without restriction, including
   9  * without limitation the rights to use, copy, modify, merge, publish,
  10  * distribute, sub license, and/or sell copies of the Software, and to
  11  * permit persons to whom the Software is furnished to do so, subject to
  12  * the following conditions:
  13  *
  14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
  17  * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
  18  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  19  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  20  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  21  *
  22  * The above copyright notice and this permission notice (including the
  23  * next paragraph) shall be included in all copies or substantial portions
  24  * of the Software.
  25  *
  26  **************************************************************************/
  27
  28
  29 /**
  30  * @file
  31  * s3tc pixel format manipulation.
  32  *
  33  * @author Roland Scheidegger <sroland@vmware.com>
  34  */
  35
  36
  37 #include "util/u_format.h"
  38 #include "util/u_math.h"
  39 #include "util/u_string.h"
  40 #include "util/u_cpu_detect.h"
  41 #include "util/u_debug.h"
  42
  43 #include "lp_bld_arit.h"
  44 #include "lp_bld_type.h"
  45 #include "lp_bld_const.h"
  46 #include "lp_bld_conv.h"
  47 #include "lp_bld_gather.h"
  48 #include "lp_bld_format.h"
  49 #include "lp_bld_logic.h"
  50 #include "lp_bld_pack.h"
  51 #include "lp_bld_flow.h"
  52 #include "lp_bld_printf.h"
  53 #include "lp_bld_struct.h"
  54 #include "lp_bld_swizzle.h"
  55 #include "lp_bld_init.h"
  56 #include "lp_bld_debug.h"
  57 #include "lp_bld_intr.h"
  58
  59
  60 /**
  61  * Reverse an interleave2_half
  62  * (ie. pick every second element, independent lower/upper halfs)
  63  * sse2 can only do that with 32bit (shufps) or larger elements
  64  * natively. (Otherwise, and/pack (even) or shift/pack (odd)
  65  * could be used, ideally llvm would do that for us.)
  66  * XXX: Unfortunately, this does NOT translate to a shufps if those
  67  * are int vectors (and casting will not help, llvm needs to recognize it
  68  * as "real" float). Instead, llvm will use a pshufd/pshufd/punpcklqdq
  69  * sequence which I'm pretty sure is a lot worse despite domain transition
  70  * penalties with shufps (except maybe on Nehalem).
  71  */
  72 static LLVMValueRef
  73 lp_build_uninterleave2_half(struct gallivm_state *gallivm,
  74                             struct lp_type type,
  75                             LLVMValueRef a,
  76                             LLVMValueRef b,
  77                             unsigned lo_hi)
  78 {
  79    LLVMValueRef shuffle, elems[LP_MAX_VECTOR_LENGTH];
  80    unsigned i;
  81
  82    assert(type.length <= LP_MAX_VECTOR_LENGTH);
  83    assert(lo_hi < 2);
  84
  85    if (type.length * type.width == 256) {
  86       assert(type.length == 8);
  87       assert(type.width == 32);
  88       static const unsigned shufvals[8] = {0, 2, 8, 10, 4, 6, 12, 14};
  89       for (i = 0; i < type.length; ++i) {
  90          elems[i] = lp_build_const_int32(gallivm, shufvals[i] + lo_hi);
  91       }
  92    } else {
  93       for (i = 0; i < type.length; ++i) {
  94          elems[i] = lp_build_const_int32(gallivm, 2*i + lo_hi);
  95       }
  96    }
  97
  98    shuffle = LLVMConstVector(elems, type.length);
  99
 100    return LLVMBuildShuffleVector(gallivm->builder, a, b, shuffle, "");
 101
 102 }
 103
 104
 105 /**
 106  * Build shuffle for extending vectors.
 107  */
 108 static LLVMValueRef
 109 lp_build_const_extend_shuffle(struct gallivm_state *gallivm,
 110                               unsigned n, unsigned length)
 111 {
 112    LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
 113    unsigned i;
 114
 115    assert(n <= length);
 116    assert(length <= LP_MAX_VECTOR_LENGTH);
 117
 118    /* TODO: cache results in a static table */
 119
 120    for(i = 0; i < n; i++) {
 121       elems[i] = lp_build_const_int32(gallivm, i);
 122    }
 123    for (i = n; i < length; i++) {
 124       elems[i] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
 125    }
 126
 127    return LLVMConstVector(elems, length);
 128 }
 129
 130 static LLVMValueRef
 131 lp_build_const_unpackx2_shuffle(struct gallivm_state *gallivm, unsigned n)
 132 {
 133    LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
 134    unsigned i, j;
 135
 136    assert(n <= LP_MAX_VECTOR_LENGTH);
 137
 138    /* TODO: cache results in a static table */
 139
 140    for(i = 0, j = 0; i < n; i += 2, ++j) {
 141       elems[i + 0] = lp_build_const_int32(gallivm, 0 + j);
 142       elems[i + 1] = lp_build_const_int32(gallivm, n + j);
 143       elems[n + i + 0] = lp_build_const_int32(gallivm, 0 + n/2 + j);
 144       elems[n + i + 1] = lp_build_const_int32(gallivm, n + n/2 + j);
 145    }
 146
 147    return LLVMConstVector(elems, n * 2);
 148 }
 149
 150 /*
 151  * broadcast 1 element to all elements
 152  */
 153 static LLVMValueRef
 154 lp_build_const_shuffle1(struct gallivm_state *gallivm,
 155                         unsigned index, unsigned n)
 156 {
 157    LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
 158    unsigned i;
 159
 160    assert(n <= LP_MAX_VECTOR_LENGTH);
 161
 162    /* TODO: cache results in a static table */
 163
 164    for (i = 0; i < n; i++) {
 165       elems[i] = lp_build_const_int32(gallivm, index);
 166    }
 167
 168    return LLVMConstVector(elems, n);
 169 }
 170
 171 /*
 172  * move 1 element to pos 0, rest undef
 173  */
 174 static LLVMValueRef
 175 lp_build_shuffle1undef(struct gallivm_state *gallivm,
 176                        LLVMValueRef a, unsigned index, unsigned n)
 177 {
 178    LLVMValueRef elems[LP_MAX_VECTOR_LENGTH], shuf;
 179    unsigned i;
 180
 181    assert(n <= LP_MAX_VECTOR_LENGTH);
 182
 183    elems[0] = lp_build_const_int32(gallivm, index);
 184
 185    for (i = 1; i < n; i++) {
 186       elems[i] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
 187    }
 188    shuf = LLVMConstVector(elems, n);
 189
 190    return LLVMBuildShuffleVector(gallivm->builder, a, a, shuf, "");
 191 }
 192
 193 static boolean
 194 format_dxt1_variant(enum pipe_format format)
 195 {
 196   return format == PIPE_FORMAT_DXT1_RGB ||
 197          format == PIPE_FORMAT_DXT1_RGBA ||
 198          format == PIPE_FORMAT_DXT1_SRGB ||
 199          format == PIPE_FORMAT_DXT1_SRGBA;
 200
 201 }
 202
 203 /**
 204  * Gather elements from scatter positions in memory into vectors.
 205  * This is customised for fetching texels from s3tc textures.
 206  * For SSE, typical value is length=4.
 207  *
 208  * @param length length of the offsets
 209  * @param colors the stored colors of the blocks will be extracted into this.
 210  * @param codewords the codewords of the blocks will be extracted into this.
 211  * @param alpha_lo used for storing lower 32bit of alpha components for dxt3/5
 212  * @param alpha_hi used for storing higher 32bit of alpha components for dxt3/5
 213  * @param base_ptr base pointer, should be a i8 pointer type.
 214  * @param offsets vector with offsets
 215  */
 216 static void
 217 lp_build_gather_s3tc(struct gallivm_state *gallivm,
 218                      unsigned length,
 219                      const struct util_format_description *format_desc,
 220                      LLVMValueRef *colors,
 221                      LLVMValueRef *codewords,
 222                      LLVMValueRef *alpha_lo,
 223                      LLVMValueRef *alpha_hi,
 224                      LLVMValueRef base_ptr,
 225                      LLVMValueRef offsets)
 226 {
 227    LLVMBuilderRef builder = gallivm->builder;
 228    unsigned block_bits = format_desc->block.bits;
 229    unsigned i;
 230    LLVMValueRef elems[8];
 231    LLVMTypeRef type32 = LLVMInt32TypeInContext(gallivm->context);
 232    LLVMTypeRef type64 = LLVMInt64TypeInContext(gallivm->context);
 233    LLVMTypeRef type32dxt;
 234    struct lp_type lp_type32dxt;
 235
 236    memset(&lp_type32dxt, 0, sizeof lp_type32dxt);
 237    lp_type32dxt.width = 32;
 238    lp_type32dxt.length = block_bits / 32;
 239    type32dxt = lp_build_vec_type(gallivm, lp_type32dxt);
 240
 241    assert(block_bits == 64 || block_bits == 128);
 242    assert(length == 1 || length == 4 || length == 8);
 243
 244    for (i = 0; i < length; ++i) {
 245       elems[i] = lp_build_gather_elem(gallivm, length,
 246                                       block_bits, block_bits, TRUE,
 247                                       base_ptr, offsets, i, FALSE);
 248       elems[i] = LLVMBuildBitCast(builder, elems[i], type32dxt, "");
 249    }
 250    if (length == 1) {
 251       LLVMValueRef elem = elems[0];
 252       if (block_bits == 128) {
 253          *alpha_lo = LLVMBuildExtractElement(builder, elem,
 254                                              lp_build_const_int32(gallivm, 0), "");
 255          *alpha_hi = LLVMBuildExtractElement(builder, elem,
 256                                              lp_build_const_int32(gallivm, 1), "");
 257          *colors = LLVMBuildExtractElement(builder, elem,
 258                                            lp_build_const_int32(gallivm, 2), "");
 259          *codewords = LLVMBuildExtractElement(builder, elem,
 260                                               lp_build_const_int32(gallivm, 3), "");
 261       }
 262       else {
 263          *alpha_lo = LLVMGetUndef(type32);
 264          *alpha_hi = LLVMGetUndef(type32);
 265          *colors = LLVMBuildExtractElement(builder, elem,
 266                                            lp_build_const_int32(gallivm, 0), "");
 267          *codewords = LLVMBuildExtractElement(builder, elem,
 268                                               lp_build_const_int32(gallivm, 1), "");
 269       }
 270    }
 271    else {
 272       LLVMValueRef tmp[4], cc01, cc23;
 273       struct lp_type lp_type32, lp_type64;
 274       memset(&lp_type32, 0, sizeof lp_type32);
 275       lp_type32.width = 32;
 276       lp_type32.length = length;
 277       memset(&lp_type64, 0, sizeof lp_type64);
 278       lp_type64.width = 64;
 279       lp_type64.length = length/2;
 280
 281       if (block_bits == 128) {
 282          if (length == 8) {
 283             for (i = 0; i < 4; ++i) {
 284                tmp[0] = elems[i];
 285                tmp[1] = elems[i+4];
 286                elems[i] = lp_build_concat(gallivm, tmp, lp_type32dxt, 2);
 287             }
 288          }
 289          lp_build_transpose_aos(gallivm, lp_type32, elems, tmp);
 290          *colors = tmp[2];
 291          *codewords = tmp[3];
 292          *alpha_lo = tmp[0];
 293          *alpha_hi = tmp[1];
 294       } else {
 295          LLVMTypeRef type64_vec = LLVMVectorType(type64, length/2);
 296          LLVMTypeRef type32_vec = LLVMVectorType(type32, length);
 297
 298          for (i = 0; i < length; ++i) {
 299             /* no-op shuffle */
 300             elems[i] = LLVMBuildShuffleVector(builder, elems[i],
 301                                               LLVMGetUndef(type32dxt),
 302                                               lp_build_const_extend_shuffle(gallivm, 2, 4), "");
 303          }
 304          if (length == 8) {
 305             struct lp_type lp_type32_4 = {0};
 306             lp_type32_4.width = 32;
 307             lp_type32_4.length = 4;
 308             for (i = 0; i < 4; ++i) {
 309                tmp[0] = elems[i];
 310                tmp[1] = elems[i+4];
 311                elems[i] = lp_build_concat(gallivm, tmp, lp_type32_4, 2);
 312             }
 313          }
 314          cc01 = lp_build_interleave2_half(gallivm, lp_type32, elems[0], elems[1], 0);
 315          cc23 = lp_build_interleave2_half(gallivm, lp_type32, elems[2], elems[3], 0);
 316          cc01 = LLVMBuildBitCast(builder, cc01, type64_vec, "");
 317          cc23 = LLVMBuildBitCast(builder, cc23, type64_vec, "");
 318          *colors = lp_build_interleave2_half(gallivm, lp_type64, cc01, cc23, 0);
 319          *codewords = lp_build_interleave2_half(gallivm, lp_type64, cc01, cc23, 1);
 320          *colors = LLVMBuildBitCast(builder, *colors, type32_vec, "");
 321          *codewords = LLVMBuildBitCast(builder, *codewords, type32_vec, "");
 322       }
 323    }
 324 }
 325
 326 /** Convert from <n x i32> containing 2 x n rgb565 colors
 327  * to 2 <n x i32> rgba8888 colors
 328  * This is the most optimized version I can think of
 329  * should be nearly as fast as decoding only one color
 330  * NOTE: alpha channel will be set to 0
 331  * @param colors  is a <n x i32> vector containing the rgb565 colors
 332  */
 333 static void
 334 color_expand2_565_to_8888(struct gallivm_state *gallivm,
 335                           unsigned n,
 336                           LLVMValueRef colors,
 337                           LLVMValueRef *color0,
 338                           LLVMValueRef *color1)
 339 {
 340    LLVMBuilderRef builder = gallivm->builder;
 341    LLVMValueRef r, g, b, rblo, glo;
 342    LLVMValueRef rgblomask, rb, rgb0, rgb1;
 343    struct lp_type type, type16, type8;
 344
 345    assert(n > 1);
 346
 347    memset(&type, 0, sizeof type);
 348    type.width = 32;
 349    type.length = n;
 350
 351    memset(&type16, 0, sizeof type16);
 352    type16.width = 16;
 353    type16.length = 2 * n;
 354
 355    memset(&type8, 0, sizeof type8);
 356    type8.width = 8;
 357    type8.length = 4 * n;
 358
 359    rgblomask = lp_build_const_int_vec(gallivm, type16, 0x0707);
 360    colors = LLVMBuildBitCast(builder, colors,
 361                              lp_build_vec_type(gallivm, type16), "");
 362    /* move r into low 8 bits, b into high 8 bits, g into another reg (low bits)
 363     * make sure low bits of r are zero - could use AND but requires constant */
 364    r = LLVMBuildLShr(builder, colors, lp_build_const_int_vec(gallivm, type16, 11), "");
 365    r = LLVMBuildShl(builder, r, lp_build_const_int_vec(gallivm, type16, 3), "");
 366    b = LLVMBuildShl(builder, colors, lp_build_const_int_vec(gallivm, type16, 11), "");
 367    rb = LLVMBuildOr(builder, r, b, "");
 368    rblo = LLVMBuildLShr(builder, rb, lp_build_const_int_vec(gallivm, type16, 5), "");
 369    /* don't have byte shift hence need mask */
 370    rblo = LLVMBuildAnd(builder, rblo, rgblomask, "");
 371    rb = LLVMBuildOr(builder, rb, rblo, "");
 372
 373    /* make sure low bits of g are zero */
 374    g = LLVMBuildAnd(builder, colors, lp_build_const_int_vec(gallivm, type16, 0x07e0), "");
 375    g = LLVMBuildLShr(builder, g, lp_build_const_int_vec(gallivm, type16, 3), "");
 376    glo = LLVMBuildLShr(builder, g, lp_build_const_int_vec(gallivm, type16, 6), "");
 377    g = LLVMBuildOr(builder, g, glo, "");
 378
 379    rb = LLVMBuildBitCast(builder, rb, lp_build_vec_type(gallivm, type8), "");
 380    g = LLVMBuildBitCast(builder, g, lp_build_vec_type(gallivm, type8), "");
 381    rgb0 = lp_build_interleave2_half(gallivm, type8, rb, g, 0);
 382    rgb1 = lp_build_interleave2_half(gallivm, type8, rb, g, 1);
 383
 384    rgb0 = LLVMBuildBitCast(builder, rgb0, lp_build_vec_type(gallivm, type), "");
 385    rgb1 = LLVMBuildBitCast(builder, rgb1, lp_build_vec_type(gallivm, type), "");
 386
 387    /* rgb0 is rgb00, rgb01, rgb10, rgb11
 388     * instead of rgb00, rgb10, rgb20, rgb30 hence need reshuffle
 389     * on x86 this _should_ just generate one shufps...
 390     */
 391    *color0 = lp_build_uninterleave2_half(gallivm, type, rgb0, rgb1, 0);
 392    *color1 = lp_build_uninterleave2_half(gallivm, type, rgb0, rgb1, 1);
 393 }
 394
 395
 396 /** Convert from <n x i32> containing rgb565 colors
 397  * (in first 16 bits) to <n x i32> rgba8888 colors
 398  * bits 16-31 MBZ
 399  * NOTE: alpha channel will be set to 0
 400  * @param colors  is a <n x i32> vector containing the rgb565 colors
 401  */
 402 static LLVMValueRef
 403 color_expand_565_to_8888(struct gallivm_state *gallivm,
 404                          unsigned n,
 405                          LLVMValueRef colors)
 406 {
 407    LLVMBuilderRef builder = gallivm->builder;
 408    LLVMValueRef rgba, r, g, b, rgblo, glo;
 409    LLVMValueRef rbhimask, g6mask, rgblomask;
 410    struct lp_type type;
 411    memset(&type, 0, sizeof type);
 412    type.width = 32;
 413    type.length = n;
 414
 415    /* color expansion:
 416     * first extract and shift colors into their final locations
 417     * (high bits - low bits zero at this point)
 418     * then replicate highest bits to the lowest bits
 419     * note rb replication can be done in parallel but not g
 420     * (different shift)
 421     * r5mask = 0xf800, g6mask = 0x07e0, b5mask = 0x001f
 422     * rhigh = 8, ghigh = 5, bhigh = 19
 423     * rblow = 5, glow = 6
 424     * rgblowmask = 0x00070307
 425     * r = colors >> rhigh
 426     * b = colors << bhigh
 427     * g = (colors & g6mask) << ghigh
 428     * rb = (r | b) rbhimask
 429     * rbtmp = rb >> rblow
 430     * gtmp = rb >> glow
 431     * rbtmp = rbtmp | gtmp
 432     * rbtmp = rbtmp & rgblowmask
 433     * rgb = rb | g | rbtmp
 434     */
 435    g6mask = lp_build_const_int_vec(gallivm, type, 0x07e0);
 436    rbhimask = lp_build_const_int_vec(gallivm, type, 0x00f800f8);
 437    rgblomask = lp_build_const_int_vec(gallivm, type, 0x00070307);
 438
 439    r = LLVMBuildLShr(builder, colors, lp_build_const_int_vec(gallivm, type, 8), "");
 440    b = LLVMBuildShl(builder, colors, lp_build_const_int_vec(gallivm, type, 19), "");
 441    g = LLVMBuildAnd(builder, colors, g6mask, "");
 442    g = LLVMBuildShl(builder, g, lp_build_const_int_vec(gallivm, type, 5), "");
 443    rgba = LLVMBuildOr(builder, r, b, "");
 444    rgba = LLVMBuildAnd(builder, rgba, rbhimask, "");
 445    rgblo = LLVMBuildLShr(builder, rgba, lp_build_const_int_vec(gallivm, type, 5), "");
 446    glo = LLVMBuildLShr(builder, g, lp_build_const_int_vec(gallivm, type, 6), "");
 447    rgblo = LLVMBuildOr(builder, rgblo, glo, "");
 448    rgblo = LLVMBuildAnd(builder, rgblo, rgblomask, "");
 449    rgba = LLVMBuildOr(builder, rgba, g, "");
 450    rgba = LLVMBuildOr(builder, rgba, rgblo, "");
 451
 452    return rgba;
 453 }
 454
 455
 456 /*
 457  * Average two byte vectors. (Will always round up.)
 458  */
 459 static LLVMValueRef
 460 lp_build_pavgb(struct lp_build_context *bld8,
 461                LLVMValueRef v0,
 462                LLVMValueRef v1)
 463 {
 464    struct gallivm_state *gallivm = bld8->gallivm;
 465    LLVMBuilderRef builder = gallivm->builder;
 466    assert(bld8->type.width == 8);
 467    assert(bld8->type.length == 16 || bld8->type.length == 32);
 468    if (HAVE_LLVM < 0x0600) {
 469       LLVMValueRef intrargs[2];
 470       char *intr_name = bld8->type.length == 32 ? "llvm.x86.avx2.pavg.b" :
 471                                                   "llvm.x86.sse2.pavg.b";
 472       intrargs[0] = v0;
 473       intrargs[1] = v1;
 474       return lp_build_intrinsic(builder, intr_name,
 475                                 bld8->vec_type, intrargs, 2, 0);
 476    } else {
 477       /*
 478        * Must match llvm's autoupgrade of pavg.b intrinsic to be useful.
 479        * You better hope the backend code manages to detect the pattern, and
 480        * the pattern doesn't change there...
 481        */
 482       struct lp_type type_ext = bld8->type;
 483       LLVMTypeRef vec_type_ext;
 484       LLVMValueRef res;
 485       LLVMValueRef ext_one;
 486       type_ext.width = 16;
 487       vec_type_ext = lp_build_vec_type(gallivm, type_ext);
 488       ext_one = lp_build_const_vec(gallivm, type_ext, 1);
 489
 490       v0 = LLVMBuildZExt(builder, v0, vec_type_ext, "");
 491       v1 = LLVMBuildZExt(builder, v1, vec_type_ext, "");
 492       res = LLVMBuildAdd(builder, v0, v1, "");
 493       res = LLVMBuildAdd(builder, res, ext_one, "");
 494       res = LLVMBuildLShr(builder, res, ext_one, "");
 495       res = LLVMBuildTrunc(builder, res, bld8->vec_type, "");
 496       return res;
 497    }
 498 }
 499
 500 /**
 501  * Calculate 1/3(v1-v0) + v0
 502  * and 2*1/3(v1-v0) + v0
 503  */
 504 static void
 505 lp_build_lerp23(struct lp_build_context *bld,
 506                 LLVMValueRef v0,
 507                 LLVMValueRef v1,
 508                 LLVMValueRef *res0,
 509                 LLVMValueRef *res1)
 510 {
 511    struct gallivm_state *gallivm = bld->gallivm;
 512    LLVMValueRef x, x_lo, x_hi, delta_lo, delta_hi;
 513    LLVMValueRef mul_lo, mul_hi, v0_lo, v0_hi, v1_lo, v1_hi, tmp;
 514    const struct lp_type type = bld->type;
 515    LLVMBuilderRef builder = bld->gallivm->builder;
 516    struct lp_type i16_type = lp_wider_type(type);
 517    struct lp_build_context bld2;
 518
 519    assert(lp_check_value(type, v0));
 520    assert(lp_check_value(type, v1));
 521    assert(!type.floating && !type.fixed && !type.norm && type.width == 8);
 522
 523    lp_build_context_init(&bld2, gallivm, i16_type);
 524    bld2.type.sign = TRUE;
 525    x = lp_build_const_int_vec(gallivm, bld->type, 255*1/3);
 526
 527    /* FIXME: use native avx256 unpack/pack */
 528    lp_build_unpack2(gallivm, type, i16_type, x, &x_lo, &x_hi);
 529    lp_build_unpack2(gallivm, type, i16_type, v0, &v0_lo, &v0_hi);
 530    lp_build_unpack2(gallivm, type, i16_type, v1, &v1_lo, &v1_hi);
 531    delta_lo = lp_build_sub(&bld2, v1_lo, v0_lo);
 532    delta_hi = lp_build_sub(&bld2, v1_hi, v0_hi);
 533
 534    mul_lo = LLVMBuildMul(builder, x_lo, delta_lo, "");
 535    mul_hi = LLVMBuildMul(builder, x_hi, delta_hi, "");
 536
 537    x_lo = LLVMBuildLShr(builder, mul_lo, lp_build_const_int_vec(gallivm, i16_type, 8), "");
 538    x_hi = LLVMBuildLShr(builder, mul_hi, lp_build_const_int_vec(gallivm, i16_type, 8), "");
 539    /* lerp optimization: pack now, do add afterwards */
 540    tmp = lp_build_pack2(gallivm, i16_type, type, x_lo, x_hi);
 541    *res0 = lp_build_add(bld, tmp, v0);
 542
 543    x_lo = LLVMBuildLShr(builder, mul_lo, lp_build_const_int_vec(gallivm, i16_type, 7), "");
 544    x_hi = LLVMBuildLShr(builder, mul_hi, lp_build_const_int_vec(gallivm, i16_type, 7), "");
 545    /* unlike above still need mask (but add still afterwards). */
 546    x_lo = LLVMBuildAnd(builder, x_lo, lp_build_const_int_vec(gallivm, i16_type, 0xff), "");
 547    x_hi = LLVMBuildAnd(builder, x_hi, lp_build_const_int_vec(gallivm, i16_type, 0xff), "");
 548    tmp = lp_build_pack2(gallivm, i16_type, type, x_lo, x_hi);
 549    *res1 = lp_build_add(bld, tmp, v0);
 550 }
 551
 552 /**
 553  * Convert from <n x i64> s3tc dxt1 to <4n x i8> RGBA AoS
 554  * @param colors  is a <n x i32> vector with n x 2x16bit colors
 555  * @param codewords  is a <n x i32> vector containing the codewords
 556  * @param i  is a <n x i32> vector with the x pixel coordinate (0 to 3)
 557  * @param j  is a <n x i32> vector with the y pixel coordinate (0 to 3)
 558  */
 559 static LLVMValueRef
 560 s3tc_dxt1_full_to_rgba_aos(struct gallivm_state *gallivm,
 561                            unsigned n,
 562                            enum pipe_format format,
 563                            LLVMValueRef colors,
 564                            LLVMValueRef codewords,
 565                            LLVMValueRef i,
 566                            LLVMValueRef j)
 567 {
 568    LLVMBuilderRef builder = gallivm->builder;
 569    LLVMValueRef color0, color1, color2, color3, color2_2, color3_2;
 570    LLVMValueRef rgba, a, colors0, colors1, col0, col1, const2;
 571    LLVMValueRef bit_pos, sel_mask, sel_lo, sel_hi, indices;
 572    struct lp_type type, type8;
 573    struct lp_build_context bld8, bld32;
 574    boolean is_dxt1_variant = format_dxt1_variant(format);
 575
 576    memset(&type, 0, sizeof type);
 577    type.width = 32;
 578    type.length = n;
 579
 580    memset(&type8, 0, sizeof type8);
 581    type8.width = 8;
 582    type8.length = 4*n;
 583
 584    assert(lp_check_value(type, i));
 585    assert(lp_check_value(type, j));
 586
 587    a = lp_build_const_int_vec(gallivm, type, 0xff000000);
 588
 589    lp_build_context_init(&bld32, gallivm, type);
 590    lp_build_context_init(&bld8, gallivm, type8);
 591
 592    /*
 593     * works as follows:
 594     * - expand color0/color1 to rgba8888
 595     * - calculate color2/3 (interpolation) according to color0 < color1 rules
 596     * - calculate color2/3 according to color0 >= color1 rules
 597     * - do selection of color2/3 according to comparison of color0/1
 598     * - extract indices (vector shift).
 599     * - use compare/select to select the correct color. Since we have 2bit
 600     *   indices (and 4 colors), needs at least three compare/selects.
 601     */
 602    /*
 603     * expand the two colors
 604     */
 605    col0 = LLVMBuildAnd(builder, colors, lp_build_const_int_vec(gallivm, type, 0x0000ffff), "");
 606    col1 = LLVMBuildLShr(builder, colors, lp_build_const_int_vec(gallivm, type, 16), "");
 607    if (n > 1) {
 608       color_expand2_565_to_8888(gallivm, n, colors, &color0, &color1);
 609    }
 610    else {
 611       color0 = color_expand_565_to_8888(gallivm, n, col0);
 612       color1 = color_expand_565_to_8888(gallivm, n, col1);
 613    }
 614
 615    /*
 616     * interpolate colors
 617     * color2_1 is 2/3 color0 + 1/3 color1
 618     * color3_1 is 1/3 color0 + 2/3 color1
 619     * color2_2 is 1/2 color0 + 1/2 color1
 620     * color3_2 is 0
 621     */
 622
 623    colors0 = LLVMBuildBitCast(builder, color0, bld8.vec_type, "");
 624    colors1 = LLVMBuildBitCast(builder, color1, bld8.vec_type, "");
 625    /* can combine 2 lerps into one mostly - still looks expensive enough. */
 626    lp_build_lerp23(&bld8, colors0, colors1, &color2, &color3);
 627    color2 = LLVMBuildBitCast(builder, color2, bld32.vec_type, "");
 628    color3 = LLVMBuildBitCast(builder, color3, bld32.vec_type, "");
 629
 630    /* dxt3/5 always use 4-color encoding */
 631    if (is_dxt1_variant) {
 632       /* fix up alpha */
 633       if (format == PIPE_FORMAT_DXT1_RGBA ||
 634           format == PIPE_FORMAT_DXT1_SRGBA) {
 635          color0 = LLVMBuildOr(builder, color0, a, "");
 636          color1 = LLVMBuildOr(builder, color1, a, "");
 637          color3 = LLVMBuildOr(builder, color3, a, "");
 638       }
 639       /*
 640        * XXX with sse2 and 16x8 vectors, should use pavgb even when n == 1.
 641        * Much cheaper (but we don't care that much if n == 1).
 642        */
 643       if ((util_cpu_caps.has_sse2 && n == 4) ||
 644           (util_cpu_caps.has_avx2 && n == 8)) {
 645          color2_2 = lp_build_pavgb(&bld8, colors0, colors1);
 646          color2_2 = LLVMBuildBitCast(builder, color2_2, bld32.vec_type, "");
 647       }
 648       else {
 649          struct lp_type i16_type = lp_wider_type(type8);
 650          struct lp_build_context bld2;
 651          LLVMValueRef v0_lo, v0_hi, v1_lo, v1_hi, addlo, addhi;
 652
 653          lp_build_context_init(&bld2, gallivm, i16_type);
 654          bld2.type.sign = TRUE;
 655
 656          /*
 657           * This isn't as expensive as it looks (the unpack is the same as
 658           * for lerp23), with correct rounding.
 659           * (Note that while rounding is correct, this will always round down,
 660           * whereas pavgb will always round up.)
 661           */
 662          /* FIXME: use native avx256 unpack/pack */
 663          lp_build_unpack2(gallivm, type8, i16_type, colors0, &v0_lo, &v0_hi);
 664          lp_build_unpack2(gallivm, type8, i16_type, colors1, &v1_lo, &v1_hi);
 665
 666          addlo = lp_build_add(&bld2, v0_lo, v1_lo);
 667          addhi = lp_build_add(&bld2, v0_hi, v1_hi);
 668          addlo = LLVMBuildLShr(builder, addlo,
 669                                lp_build_const_int_vec(gallivm, i16_type, 1), "");
 670          addhi = LLVMBuildLShr(builder, addhi,
 671                                lp_build_const_int_vec(gallivm, i16_type, 1), "");
 672          color2_2 = lp_build_pack2(gallivm, i16_type, type8, addlo, addhi);
 673          color2_2 = LLVMBuildBitCast(builder, color2_2, bld32.vec_type, "");
 674       }
 675       color3_2 = lp_build_const_int_vec(gallivm, type, 0);
 676
 677       /* select between colors2/3 */
 678       /* signed compare is faster saves some xors */
 679       type.sign = TRUE;
 680       sel_mask = lp_build_compare(gallivm, type, PIPE_FUNC_GREATER, col0, col1);
 681       color2 = lp_build_select(&bld32, sel_mask, color2, color2_2);
 682       color3 = lp_build_select(&bld32, sel_mask, color3, color3_2);
 683       type.sign = FALSE;
 684
 685       if (format == PIPE_FORMAT_DXT1_RGBA ||
 686           format == PIPE_FORMAT_DXT1_SRGBA) {
 687          color2 = LLVMBuildOr(builder, color2, a, "");
 688       }
 689    }
 690
 691    const2 = lp_build_const_int_vec(gallivm, type, 2);
 692    /* extract 2-bit index values */
 693    bit_pos = LLVMBuildShl(builder, j, const2, "");
 694    bit_pos = LLVMBuildAdd(builder, bit_pos, i, "");
 695    bit_pos = LLVMBuildAdd(builder, bit_pos, bit_pos, "");
 696    /*
 697     * NOTE: This innocent looking shift is very expensive with x86/ssex.
 698     * Shifts with per-elemnent shift count get roughly translated to
 699     * extract (count), extract (value), shift, move (back to xmm), unpack
 700     * per element!
 701     * So about 20 instructions here for 4xi32.
 702     * Newer llvm versions (3.7+) will not do extract/insert but use a
 703     * a couple constant count vector shifts plus shuffles. About same
 704     * amount of instructions unfortunately...
 705     * Would get much worse with 8xi16 even...
 706     * We could actually do better here:
 707     * - subtract bit_pos from 128+30, shl 23, convert float to int...
 708     * - now do mul with codewords followed by shr 30...
 709     * But requires 32bit->32bit mul, sse41 only (well that's emulatable
 710     * with 2 32bit->64bit muls...) and not exactly cheap
 711     * AVX2, of course, fixes this nonsense.
 712     */
 713    indices = LLVMBuildLShr(builder, codewords, bit_pos, "");
 714
 715    /* finally select the colors */
 716    sel_lo = LLVMBuildAnd(builder, indices, bld32.one, "");
 717    sel_lo = lp_build_compare(gallivm, type, PIPE_FUNC_EQUAL, sel_lo, bld32.one);
 718    color0 = lp_build_select(&bld32, sel_lo, color1, color0);
 719    color2 = lp_build_select(&bld32, sel_lo, color3, color2);
 720    sel_hi = LLVMBuildAnd(builder, indices, const2, "");
 721    sel_hi = lp_build_compare(gallivm, type, PIPE_FUNC_EQUAL, sel_hi, const2);
 722    rgba = lp_build_select(&bld32, sel_hi, color2, color0);
 723
 724    /* fix up alpha */
 725    if (format == PIPE_FORMAT_DXT1_RGB ||
 726        format == PIPE_FORMAT_DXT1_SRGB) {
 727       rgba = LLVMBuildOr(builder, rgba, a, "");
 728    }
 729    return LLVMBuildBitCast(builder, rgba, bld8.vec_type, "");
 730 }
 731
 732
 733 static LLVMValueRef
 734 s3tc_dxt1_to_rgba_aos(struct gallivm_state *gallivm,
 735                       unsigned n,
 736                       enum pipe_format format,
 737                       LLVMValueRef colors,
 738                       LLVMValueRef codewords,
 739                       LLVMValueRef i,
 740                       LLVMValueRef j)
 741 {
 742    return s3tc_dxt1_full_to_rgba_aos(gallivm, n, format,
 743                                      colors, codewords, i, j);
 744 }
 745
 746
 747 /**
 748  * Convert from <n x i128> s3tc dxt3 to <4n x i8> RGBA AoS
 749  * @param colors  is a <n x i32> vector with n x 2x16bit colors
 750  * @param codewords  is a <n x i32> vector containing the codewords
 751  * @param alphas  is a <n x i64> vector containing the alpha values
 752  * @param i  is a <n x i32> vector with the x pixel coordinate (0 to 3)
 753  * @param j  is a <n x i32> vector with the y pixel coordinate (0 to 3)
 754  */
 755 static LLVMValueRef
 756 s3tc_dxt3_to_rgba_aos(struct gallivm_state *gallivm,
 757                       unsigned n,
 758                       enum pipe_format format,
 759                       LLVMValueRef colors,
 760                       LLVMValueRef codewords,
 761                       LLVMValueRef alpha_low,
 762                       LLVMValueRef alpha_hi,
 763                       LLVMValueRef i,
 764                       LLVMValueRef j)
 765 {
 766    LLVMBuilderRef builder = gallivm->builder;
 767    LLVMValueRef rgba, tmp, tmp2;
 768    LLVMValueRef bit_pos, sel_mask;
 769    struct lp_type type, type8;
 770    struct lp_build_context bld;
 771
 772    memset(&type, 0, sizeof type);
 773    type.width = 32;
 774    type.length = n;
 775
 776    memset(&type8, 0, sizeof type8);
 777    type8.width = 8;
 778    type8.length = n*4;
 779
 780    assert(lp_check_value(type, i));
 781    assert(lp_check_value(type, j));
 782
 783    lp_build_context_init(&bld, gallivm, type);
 784
 785    rgba = s3tc_dxt1_to_rgba_aos(gallivm, n, format,
 786                                 colors, codewords, i, j);
 787
 788    rgba = LLVMBuildBitCast(builder, rgba, bld.vec_type, "");
 789
 790    /*
 791     * Extract alpha values. Since we now need to select from
 792     * which 32bit vector values are fetched, construct selection
 793     * mask from highest bit of bit_pos, and use select, then shift
 794     * according to the bit_pos (without the highest bit).
 795     * Note this is pointless for n == 1 case. Could just
 796     * directly use 64bit arithmetic if we'd extract 64bit
 797     * alpha value instead of 2x32...
 798     */
 799    /* pos = 4*(4j+i) */
 800    bit_pos = LLVMBuildShl(builder, j, lp_build_const_int_vec(gallivm, type, 2), "");
 801    bit_pos = LLVMBuildAdd(builder, bit_pos, i, "");
 802    bit_pos = LLVMBuildShl(builder, bit_pos,
 803                           lp_build_const_int_vec(gallivm, type, 2), "");
 804    sel_mask = LLVMBuildLShr(builder, bit_pos,
 805                             lp_build_const_int_vec(gallivm, type, 5), "");
 806    sel_mask = LLVMBuildSub(builder, sel_mask, bld.one, "");
 807    tmp = lp_build_select(&bld, sel_mask, alpha_low, alpha_hi);
 808    bit_pos = LLVMBuildAnd(builder, bit_pos,
 809                           lp_build_const_int_vec(gallivm, type, 0xffffffdf), "");
 810    /* Warning: slow shift with per element count (without avx2) */
 811    /*
 812     * Could do pshufb here as well - just use appropriate 2 bits in bit_pos
 813     * to select the right byte with pshufb. Then for the remaining one bit
 814     * just do shift/select.
 815     */
 816    tmp = LLVMBuildLShr(builder, tmp, bit_pos, "");
 817
 818    /* combined expand from a4 to a8 and shift into position */
 819    tmp = LLVMBuildShl(builder, tmp, lp_build_const_int_vec(gallivm, type, 28), "");
 820    tmp2 = LLVMBuildLShr(builder, tmp, lp_build_const_int_vec(gallivm, type, 4), "");
 821    tmp = LLVMBuildOr(builder, tmp, tmp2, "");
 822
 823    rgba = LLVMBuildOr(builder, tmp, rgba, "");
 824
 825    return LLVMBuildBitCast(builder, rgba, lp_build_vec_type(gallivm, type8), "");
 826 }
 827
 828 static LLVMValueRef
 829 lp_build_lerpdxta(struct gallivm_state *gallivm,
 830                   LLVMValueRef alpha0,
 831                   LLVMValueRef alpha1,
 832                   LLVMValueRef code,
 833                   LLVMValueRef sel_mask,
 834                   unsigned n)
 835 {
 836    /*
 837     * note we're doing lerp in 16bit since 32bit pmulld is only available in sse41
 838     * (plus pmullw is actually faster...)
 839     * we just pretend our 32bit values (which are really only 8bit) are 16bits.
 840     * Note that this is obviously a disaster for the scalar case.
 841     */
 842    LLVMBuilderRef builder = gallivm->builder;
 843    LLVMValueRef delta, ainterp;
 844    LLVMValueRef weight5, weight7, weight;
 845    struct lp_type type32, type16, type8;
 846    struct lp_build_context bld16;
 847
 848    memset(&type32, 0, sizeof type32);
 849    type32.width = 32;
 850    type32.length = n;
 851    memset(&type16, 0, sizeof type16);
 852    type16.width = 16;
 853    type16.length = 2*n;
 854    type16.sign = TRUE;
 855    memset(&type8, 0, sizeof type8);
 856    type8.width = 8;
 857    type8.length = 4*n;
 858
 859    lp_build_context_init(&bld16, gallivm, type16);
 860    /* 255/7 is a bit off - increase accuracy at the expense of shift later */
 861    sel_mask = LLVMBuildBitCast(builder, sel_mask, bld16.vec_type, "");
 862    weight5 = lp_build_const_int_vec(gallivm, type16, 255*64/5);
 863    weight7 = lp_build_const_int_vec(gallivm, type16, 255*64/7);
 864    weight = lp_build_select(&bld16, sel_mask, weight7, weight5);
 865
 866    alpha0 = LLVMBuildBitCast(builder, alpha0, bld16.vec_type, "");
 867    alpha1 = LLVMBuildBitCast(builder, alpha1, bld16.vec_type, "");
 868    code = LLVMBuildBitCast(builder, code, bld16.vec_type, "");
 869    /* we'll get garbage in the elements which had code 0 (or larger than 5 or 7)
 870       but we don't care */
 871    code = LLVMBuildSub(builder, code, bld16.one, "");
 872
 873    weight = LLVMBuildMul(builder, weight, code, "");
 874    weight = LLVMBuildLShr(builder, weight,
 875                           lp_build_const_int_vec(gallivm, type16, 6), "");
 876
 877    delta = LLVMBuildSub(builder, alpha1, alpha0, "");
 878
 879    ainterp = LLVMBuildMul(builder, delta, weight, "");
 880    ainterp = LLVMBuildLShr(builder, ainterp,
 881                            lp_build_const_int_vec(gallivm, type16, 8), "");
 882
 883    ainterp = LLVMBuildBitCast(builder, ainterp, lp_build_vec_type(gallivm, type8), "");
 884    alpha0 = LLVMBuildBitCast(builder, alpha0, lp_build_vec_type(gallivm, type8), "");
 885    ainterp = LLVMBuildAdd(builder, alpha0, ainterp, "");
 886    ainterp = LLVMBuildBitCast(builder, ainterp, lp_build_vec_type(gallivm, type32), "");
 887
 888    return ainterp;
 889 }
 890
 891 /**
 892  * Convert from <n x i128> s3tc dxt5 to <4n x i8> RGBA AoS
 893  * @param colors  is a <n x i32> vector with n x 2x16bit colors
 894  * @param codewords  is a <n x i32> vector containing the codewords
 895  * @param alphas  is a <n x i64> vector containing the alpha values
 896  * @param i  is a <n x i32> vector with the x pixel coordinate (0 to 3)
 897  * @param j  is a <n x i32> vector with the y pixel coordinate (0 to 3)
 898  */
 899 static LLVMValueRef
 900 s3tc_dxt5_full_to_rgba_aos(struct gallivm_state *gallivm,
 901                            unsigned n,
 902                            enum pipe_format format,
 903                            LLVMValueRef colors,
 904                            LLVMValueRef codewords,
 905                            LLVMValueRef alpha_lo,
 906                            LLVMValueRef alpha_hi,
 907                            LLVMValueRef i,
 908                            LLVMValueRef j)
 909 {
 910    LLVMBuilderRef builder = gallivm->builder;
 911    LLVMValueRef rgba, tmp, alpha0, alpha1, alphac, alphac0, bit_pos, shift;
 912    LLVMValueRef sel_mask, tmp_mask, alpha, alpha64, code_s;
 913    LLVMValueRef mask6, mask7, ainterp;
 914    LLVMTypeRef i64t = LLVMInt64TypeInContext(gallivm->context);
 915    LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
 916    struct lp_type type, type8;
 917    struct lp_build_context bld32;
 918
 919    memset(&type, 0, sizeof type);
 920    type.width = 32;
 921    type.length = n;
 922
 923    memset(&type8, 0, sizeof type8);
 924    type8.width = 8;
 925    type8.length = n*4;
 926
 927    assert(lp_check_value(type, i));
 928    assert(lp_check_value(type, j));
 929
 930    lp_build_context_init(&bld32, gallivm, type);
 931
 932    assert(lp_check_value(type, i));
 933    assert(lp_check_value(type, j));
 934
 935    rgba = s3tc_dxt1_to_rgba_aos(gallivm, n, format,
 936                                 colors, codewords, i, j);
 937
 938    rgba = LLVMBuildBitCast(builder, rgba, bld32.vec_type, "");
 939
 940    /* this looks pretty complex for vectorization:
 941     * extract a0/a1 values
 942     * extract code
 943     * select weights for interpolation depending on a0 > a1
 944     * mul weights by code - 1
 945     * lerp a0/a1/weights
 946     * use selects for getting either a0, a1, interp a, interp a/0.0, interp a/1.0
 947     */
 948
 949    alpha0 = LLVMBuildAnd(builder, alpha_lo,
 950                          lp_build_const_int_vec(gallivm, type, 0xff), "");
 951    alpha1 = LLVMBuildLShr(builder, alpha_lo,
 952                           lp_build_const_int_vec(gallivm, type, 8), "");
 953    alpha1 = LLVMBuildAnd(builder, alpha1,
 954                          lp_build_const_int_vec(gallivm, type, 0xff), "");
 955
 956    /* pos = 3*(4j+i) */
 957    bit_pos = LLVMBuildShl(builder, j, lp_build_const_int_vec(gallivm, type, 2), "");
 958    bit_pos = LLVMBuildAdd(builder, bit_pos, i, "");
 959    tmp = LLVMBuildAdd(builder, bit_pos, bit_pos, "");
 960    bit_pos = LLVMBuildAdd(builder, bit_pos, tmp, "");
 961    /* get rid of first 2 bytes - saves shifts of alpha_lo/hi */
 962    bit_pos = LLVMBuildAdd(builder, bit_pos,
 963                           lp_build_const_int_vec(gallivm, type, 16), "");
 964
 965    if (n == 1) {
 966       struct lp_type type64;
 967       memset(&type64, 0, sizeof type64);
 968       type64.width = 64;
 969       type64.length = 1;
 970       /* This is pretty pointless could avoid by just directly extracting
 971          64bit in the first place but makes it more complicated elsewhere */
 972       alpha_lo = LLVMBuildZExt(builder, alpha_lo, i64t, "");
 973       alpha_hi = LLVMBuildZExt(builder, alpha_hi, i64t, "");
 974       alphac0 = LLVMBuildShl(builder, alpha_hi,
 975                              lp_build_const_int_vec(gallivm, type64, 32), "");
 976       alphac0 = LLVMBuildOr(builder, alpha_lo, alphac0, "");
 977
 978       shift = LLVMBuildZExt(builder, bit_pos, i64t, "");
 979       alphac0 = LLVMBuildLShr(builder, alphac0, shift, "");
 980       alphac0 = LLVMBuildTrunc(builder, alphac0, i32t, "");
 981       alphac = LLVMBuildAnd(builder, alphac0,
 982                             lp_build_const_int_vec(gallivm, type, 0x7), "");
 983    }
 984    else {
 985       /*
 986        * Using non-native vector length here (actually, with avx2 and
 987        * n == 4 llvm will indeed expand to ymm regs...)
 988        * At least newer llvm versions handle that ok.
 989        * llvm 3.7+ will even handle the emulated 64bit shift with variable
 990        * shift count without extraction (and it's actually easier to
 991        * emulate than the 32bit one).
 992        */
 993       alpha64 = LLVMBuildShuffleVector(builder, alpha_lo, alpha_hi,
 994                                        lp_build_const_unpackx2_shuffle(gallivm, n), "");
 995
 996       alpha64 = LLVMBuildBitCast(builder, alpha64, LLVMVectorType(i64t, n), "");
 997       shift = LLVMBuildZExt(builder, bit_pos, LLVMVectorType(i64t, n), "");
 998       alphac = LLVMBuildLShr(builder, alpha64, shift, "");
 999       alphac = LLVMBuildTrunc(builder, alphac, bld32.vec_type, "");
1000
1001       alphac = LLVMBuildAnd(builder, alphac,
1002                             lp_build_const_int_vec(gallivm, type, 0x7), "");
1003    }
1004
1005    /* signed compare is faster saves some xors */
1006    type.sign = TRUE;
1007    /* alpha0 > alpha1 selection */
1008    sel_mask = lp_build_compare(gallivm, type, PIPE_FUNC_GREATER,
1009                                alpha0, alpha1);
1010    ainterp = lp_build_lerpdxta(gallivm, alpha0, alpha1, alphac, sel_mask, n);
1011
1012    /*
1013     * if a0 > a1 then we select a0 for case 0, a1 for case 1, interp otherwise.
1014     * else we select a0 for case 0, a1 for case 1,
1015     * interp for case 2-5, 00 for 6 and 0xff(ffffff) for 7
1016     * a = (c == 0) ? a0 : a1
1017     * a = (c > 1) ? ainterp : a
1018     * Finally handle case 6/7 for !(a0 > a1)
1019     * a = (!(a0 > a1) && c == 6) ? 0 : a (andnot with mask)
1020     * a = (!(a0 > a1) && c == 7) ? 0xffffffff : a (or with mask)
1021     */
1022    tmp_mask = lp_build_compare(gallivm, type, PIPE_FUNC_EQUAL,
1023                                alphac, bld32.zero);
1024    alpha = lp_build_select(&bld32, tmp_mask, alpha0, alpha1);
1025    tmp_mask = lp_build_compare(gallivm, type, PIPE_FUNC_GREATER,
1026                                alphac, bld32.one);
1027    alpha = lp_build_select(&bld32, tmp_mask, ainterp, alpha);
1028
1029    code_s = LLVMBuildAnd(builder, alphac,
1030                          LLVMBuildNot(builder, sel_mask, ""), "");
1031    mask6 = lp_build_compare(gallivm, type, PIPE_FUNC_EQUAL,
1032                             code_s, lp_build_const_int_vec(gallivm, type, 6));
1033    mask7 = lp_build_compare(gallivm, type, PIPE_FUNC_EQUAL,
1034                             code_s, lp_build_const_int_vec(gallivm, type, 7));
1035    alpha = LLVMBuildAnd(builder, alpha, LLVMBuildNot(builder, mask6, ""), "");
1036    alpha = LLVMBuildOr(builder, alpha, mask7, "");
1037
1038    alpha = LLVMBuildShl(builder, alpha, lp_build_const_int_vec(gallivm, type, 24), "");
1039    rgba = LLVMBuildOr(builder, alpha, rgba, "");
1040
1041    return LLVMBuildBitCast(builder, rgba, lp_build_vec_type(gallivm, type8), "");
1042 }
1043
1044
1045 static void
1046 lp_build_gather_s3tc_simple_scalar(struct gallivm_state *gallivm,
1047                                    const struct util_format_description *format_desc,
1048                                    LLVMValueRef *dxt_block,
1049                                    LLVMValueRef ptr)
1050 {
1051    LLVMBuilderRef builder = gallivm->builder;
1052    unsigned block_bits = format_desc->block.bits;
1053    LLVMValueRef elem, shuf;
1054    LLVMTypeRef type32 = LLVMIntTypeInContext(gallivm->context, 32);
1055    LLVMTypeRef src_type = LLVMIntTypeInContext(gallivm->context, block_bits);
1056    LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0);
1057    LLVMTypeRef type32_4 = LLVMVectorType(type32, 4);
1058
1059    assert(block_bits == 64 || block_bits == 128);
1060
1061    ptr = LLVMBuildBitCast(builder, ptr, src_ptr_type, "");
1062    elem = LLVMBuildLoad(builder, ptr, "");
1063
1064    if (block_bits == 128) {
1065       /* just return block as is */
1066       *dxt_block = LLVMBuildBitCast(builder, elem, type32_4, "");
1067    }
1068    else {
1069       LLVMTypeRef type32_2 = LLVMVectorType(type32, 2);
1070       shuf = lp_build_const_extend_shuffle(gallivm, 2, 4);
1071       elem = LLVMBuildBitCast(builder, elem, type32_2, "");
1072       *dxt_block = LLVMBuildShuffleVector(builder, elem,
1073                                           LLVMGetUndef(type32_2), shuf, "");
1074    }
1075 }
1076
1077
1078 static void
1079 s3tc_store_cached_block(struct gallivm_state *gallivm,
1080                         LLVMValueRef *col,
1081                         LLVMValueRef tag_value,
1082                         LLVMValueRef hash_index,
1083                         LLVMValueRef cache)
1084 {
1085    LLVMBuilderRef builder = gallivm->builder;
1086    LLVMValueRef ptr, indices[3];
1087    LLVMTypeRef type_ptr4x32;
1088    unsigned count;
1089
1090    type_ptr4x32 = LLVMPointerType(LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4), 0);
1091    indices[0] = lp_build_const_int32(gallivm, 0);
1092    indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_TAGS);
1093    indices[2] = hash_index;
1094    ptr = LLVMBuildGEP(builder, cache, indices, ARRAY_SIZE(indices), "");
1095    LLVMBuildStore(builder, tag_value, ptr);
1096
1097    indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_DATA);
1098    hash_index = LLVMBuildMul(builder, hash_index,
1099                              lp_build_const_int32(gallivm, 16), "");
1100    for (count = 0; count < 4; count++) {
1101       indices[2] = hash_index;
1102       ptr = LLVMBuildGEP(builder, cache, indices, ARRAY_SIZE(indices), "");
1103       ptr = LLVMBuildBitCast(builder, ptr, type_ptr4x32, "");
1104       LLVMBuildStore(builder, col[count], ptr);
1105       hash_index = LLVMBuildAdd(builder, hash_index,
1106                                 lp_build_const_int32(gallivm, 4), "");
1107    }
1108 }
1109
1110 static LLVMValueRef
1111 s3tc_lookup_cached_pixel(struct gallivm_state *gallivm,
1112                          LLVMValueRef ptr,
1113                          LLVMValueRef index)
1114 {
1115    LLVMBuilderRef builder = gallivm->builder;
1116    LLVMValueRef member_ptr, indices[3];
1117
1118    indices[0] = lp_build_const_int32(gallivm, 0);
1119    indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_DATA);
1120    indices[2] = index;
1121    member_ptr = LLVMBuildGEP(builder, ptr, indices, ARRAY_SIZE(indices), "");
1122    return LLVMBuildLoad(builder, member_ptr, "cache_data");
1123 }
1124
1125 static LLVMValueRef
1126 s3tc_lookup_tag_data(struct gallivm_state *gallivm,
1127                      LLVMValueRef ptr,
1128                      LLVMValueRef index)
1129 {
1130    LLVMBuilderRef builder = gallivm->builder;
1131    LLVMValueRef member_ptr, indices[3];
1132
1133    indices[0] = lp_build_const_int32(gallivm, 0);
1134    indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_TAGS);
1135    indices[2] = index;
1136    member_ptr = LLVMBuildGEP(builder, ptr, indices, ARRAY_SIZE(indices), "");
1137    return LLVMBuildLoad(builder, member_ptr, "tag_data");
1138 }
1139
1140 #if LP_BUILD_FORMAT_CACHE_DEBUG
1141 static void
1142 s3tc_update_cache_access(struct gallivm_state *gallivm,
1143                          LLVMValueRef ptr,
1144                          unsigned count,
1145                          unsigned index)
1146 {
1147    LLVMBuilderRef builder = gallivm->builder;
1148    LLVMValueRef member_ptr, cache_access;
1149
1150    assert(index == LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_TOTAL ||
1151           index == LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS);
1152
1153    member_ptr = lp_build_struct_get_ptr(gallivm, ptr, index, "");
1154    cache_access = LLVMBuildLoad(builder, member_ptr, "cache_access");
1155    cache_access = LLVMBuildAdd(builder, cache_access,
1156                                LLVMConstInt(LLVMInt64TypeInContext(gallivm->context),
1157                                                                    count, 0), "");
1158    LLVMBuildStore(builder, cache_access, member_ptr);
1159 }
1160 #endif
1161
1162 /**
1163  * Calculate 1/3(v1-v0) + v0 and 2*1/3(v1-v0) + v0.
1164  * The lerp is performed between the first 2 32bit colors
1165  * in the source vector, both results are returned packed in result vector.
1166  */
1167 static LLVMValueRef
1168 lp_build_lerp23_single(struct lp_build_context *bld,
1169                        LLVMValueRef v01)
1170 {
1171    struct gallivm_state *gallivm = bld->gallivm;
1172    LLVMValueRef x, mul, delta, res, v0, v1, elems[8];
1173    const struct lp_type type = bld->type;
1174    LLVMBuilderRef builder = bld->gallivm->builder;
1175    struct lp_type i16_type = lp_wider_type(type);
1176    struct lp_type i32_type = lp_wider_type(i16_type);
1177    struct lp_build_context bld2;
1178
1179    assert(!type.floating && !type.fixed && !type.norm && type.width == 8);
1180
1181    lp_build_context_init(&bld2, gallivm, i16_type);
1182    bld2.type.sign = TRUE;
1183
1184    /* weights 256/3, 256*2/3, with correct rounding */
1185    elems[0] = elems[1] = elems[2] = elems[3] =
1186       lp_build_const_elem(gallivm, i16_type, 255*1/3);
1187    elems[4] = elems[5] = elems[6] = elems[7] =
1188       lp_build_const_elem(gallivm, i16_type, 171);
1189    x = LLVMConstVector(elems, 8);
1190
1191    /*
1192     * v01 has col0 in 32bit elem 0, col1 in elem 1.
1193     * Interleave/unpack will give us separate v0/v1 vectors.
1194     */
1195    v01 = lp_build_interleave2(gallivm, i32_type, v01, v01, 0);
1196    v01 = LLVMBuildBitCast(builder, v01, bld->vec_type, "");
1197
1198    lp_build_unpack2(gallivm, type, i16_type, v01, &v0, &v1);
1199    delta = lp_build_sub(&bld2, v1, v0);
1200
1201    mul = LLVMBuildMul(builder, x, delta, "");
1202
1203    mul = LLVMBuildLShr(builder, mul, lp_build_const_int_vec(gallivm, i16_type, 8), "");
1204    /* lerp optimization: pack now, do add afterwards */
1205    res = lp_build_pack2(gallivm, i16_type, type, mul, bld2.undef);
1206    /* only lower 2 elems are valid - for these v0 is really v0 */
1207    return lp_build_add(bld, res, v01);
1208 }
1209
/*
 * decode one dxt1 block.
 *
 * Despite the name this also handles the color part of non-dxt1 formats:
 * when format_dxt1_variant(format) is false, the colors and codes are
 * taken from 32bit elements 2 and 3 of dxt_block instead of 0 and 1, and
 * the 3-color mode handling is skipped.
 *
 * dxt_block is the compressed block as a 4 x 32bit vector.
 * col receives four 4 x 32bit vectors (col[0] to col[3]), one per
 * iteration of the final loop; each iteration consumes the next 2 bits of
 * every code byte.
 */
static void
s3tc_decode_block_dxt1(struct gallivm_state *gallivm,
                       enum pipe_format format,
                       LLVMValueRef dxt_block,
                       LLVMValueRef *col)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef color01, color23, color01_16, color0123;
   LLVMValueRef rgba, tmp, a, sel_mask, indices, code, const2;
   struct lp_type type8, type32, type16, type64;
   struct lp_build_context bld8, bld32, bld16, bld64;
   unsigned i;
   boolean is_dxt1_variant = format_dxt1_variant(format);

   memset(&type32, 0, sizeof type32);
   type32.width = 32;
   type32.length = 4;
   type32.sign = TRUE;

   memset(&type8, 0, sizeof type8);
   type8.width = 8;
   type8.length = 16;

   memset(&type16, 0, sizeof type16);
   type16.width = 16;
   type16.length = 8;

   memset(&type64, 0, sizeof type64);
   type64.width = 64;
   type64.length = 2;

   /* a: opaque alpha in the top byte of every 32bit element */
   a = lp_build_const_int_vec(gallivm, type32, 0xff000000);
   const2 = lp_build_const_int_vec(gallivm, type32, 2);

   lp_build_context_init(&bld32, gallivm, type32);
   lp_build_context_init(&bld16, gallivm, type16);
   lp_build_context_init(&bld8, gallivm, type8);
   lp_build_context_init(&bld64, gallivm, type64);

   /* pick the 32bit elements holding the two colors and the codes */
   if (is_dxt1_variant) {
      color01 = lp_build_shuffle1undef(gallivm, dxt_block, 0, 4);
      code = lp_build_shuffle1undef(gallivm, dxt_block, 1, 4);
   } else {
      color01 = lp_build_shuffle1undef(gallivm, dxt_block, 2, 4);
      code = lp_build_shuffle1undef(gallivm, dxt_block, 3, 4);
   }
   code = LLVMBuildBitCast(builder, code, bld8.vec_type, "");
   /* expand bytes to dwords */
   code = lp_build_interleave2(gallivm, type8, code, code, 0);
   code = lp_build_interleave2(gallivm, type8, code, code, 0);


   /*
    * works as follows:
    * - expand color0/color1 to rgba8888
    * - calculate color2/3 (interpolation) according to color0 < color1 rules
    * - calculate color2/3 according to color0 >= color1 rules
    * - do selection of color2/3 according to comparison of color0/1
    * - extract indices.
    * - use compare/select to select the correct color. Since we have 2bit
    *   indices (and 4 colors), needs at least three compare/selects.
    */

   /*
    * expand the two colors
    */
   color01 = LLVMBuildBitCast(builder, color01, bld16.vec_type, "");
   color01 = lp_build_interleave2(gallivm, type16, color01,
                                  bld16.zero, 0);
   /* color01_16 keeps the raw 16bit colors (zero extended) for the compare */
   color01_16 = LLVMBuildBitCast(builder, color01, bld32.vec_type, "");
   color01 = color_expand_565_to_8888(gallivm, 4, color01_16);

   /*
    * interpolate colors
    * color2_1 is 2/3 color0 + 1/3 color1
    * color3_1 is 1/3 color0 + 2/3 color1
    * color2_2 is 1/2 color0 + 1/2 color1
    * color3_2 is 0
    */

   /* TODO: since this is now always scalar, should
    * probably just use control flow here instead of calculating
    * both cases and then selection
    */
   if (format == PIPE_FORMAT_DXT1_RGBA ||
       format == PIPE_FORMAT_DXT1_SRGBA) {
      /* set alpha before interpolating, color 3 of 3-color mode stays 0 */
      color01 = LLVMBuildOr(builder, color01, a, "");
   }
   /* can combine 2 lerps into one mostly */
   color23 = lp_build_lerp23_single(&bld8, color01);
   color23 = LLVMBuildBitCast(builder, color23, bld32.vec_type, "");

   /* dxt3/5 always use 4-color encoding */
   if (is_dxt1_variant) {
      LLVMValueRef color23_2, color2_2;

      if (util_cpu_caps.has_sse2) {
         LLVMValueRef intrargs[2];
         intrargs[0] = LLVMBuildBitCast(builder, color01, bld8.vec_type, "");
         /* same interleave as for lerp23 - correct result in 2nd element */
         intrargs[1] = lp_build_interleave2(gallivm, type32, color01, color01, 0);
         intrargs[1] = LLVMBuildBitCast(builder, intrargs[1], bld8.vec_type, "");
         color2_2 = lp_build_pavgb(&bld8, intrargs[0], intrargs[1]);
      }
      else {
         LLVMValueRef v01, v0, v1, vhalf;
         /*
          * This isn't as expensive as it looks (the unpack is the same as
          * for lerp23, which is the reason why we do the pointless
          * interleave2 too), with correct rounding (the two lower elements
          * will be the same).
          */
         v01 = lp_build_interleave2(gallivm, type32, color01, color01, 0);
         v01 = LLVMBuildBitCast(builder, v01, bld8.vec_type, "");
         lp_build_unpack2(gallivm, type8, type16, v01, &v0, &v1);
         vhalf = lp_build_add(&bld16, v0, v1);
         vhalf = LLVMBuildLShr(builder, vhalf, bld16.one, "");
         color2_2 = lp_build_pack2(gallivm, type16, type8, vhalf, bld16.undef);
      }
      /* shuffle in color 3 as elem 2 zero, color 2 elem 1 */
      /* (64bit shift right by 32 moves elem 1 to elem 0 and zeroes elem 1) */
      color23_2 = LLVMBuildBitCast(builder, color2_2, bld64.vec_type, "");
      color23_2 = LLVMBuildLShr(builder, color23_2,
                                lp_build_const_int_vec(gallivm, type64, 32), "");
      color23_2 = LLVMBuildBitCast(builder, color23_2, bld32.vec_type, "");

      /* same 64bit shift brings color1 into elem 0 for the color0 > color1 test */
      tmp = LLVMBuildBitCast(builder, color01_16, bld64.vec_type, "");
      tmp = LLVMBuildLShr(builder, tmp,
                          lp_build_const_int_vec(gallivm, type64, 32), "");
      tmp = LLVMBuildBitCast(builder, tmp, bld32.vec_type, "");
      sel_mask = lp_build_compare(gallivm, type32, PIPE_FUNC_GREATER,
                                  color01_16, tmp);
      /* replicate the elem 0 result so it covers both color2 and color3 */
      sel_mask = lp_build_interleave2(gallivm, type32, sel_mask, sel_mask, 0);
      color23 = lp_build_select(&bld32, sel_mask, color23, color23_2);
   }

   if (util_cpu_caps.has_ssse3) {
      /*
       * Use pshufb as mini-lut. (Only doable with intrinsics as the
       * final shuffles are non-constant. pshufb is awesome!)
       */
      LLVMValueRef shuf[16], low2mask;
      LLVMValueRef intrargs[2], lut_ind, lut_adj;

      /* gather color0..color3 into a single vector */
      color01 = LLVMBuildBitCast(builder, color01, bld64.vec_type, "");
      color23 = LLVMBuildBitCast(builder, color23, bld64.vec_type, "");
      color0123 = lp_build_interleave2(gallivm, type64, color01, color23, 0);
      color0123 = LLVMBuildBitCast(builder, color0123, bld32.vec_type, "");

      if (format == PIPE_FORMAT_DXT1_RGB ||
          format == PIPE_FORMAT_DXT1_SRGB) {
         color0123 = LLVMBuildOr(builder, color0123, a, "");
      }

      /* shuffle as r0r1r2r3g0g1... */
      for (i = 0; i < 4; i++) {
         shuf[4*i] = lp_build_const_int32(gallivm, 0 + i);
         shuf[4*i+1] = lp_build_const_int32(gallivm, 4 + i);
         shuf[4*i+2] = lp_build_const_int32(gallivm, 8 + i);
         shuf[4*i+3] = lp_build_const_int32(gallivm, 12 + i);
      }
      color0123 = LLVMBuildBitCast(builder, color0123, bld8.vec_type, "");
      color0123 = LLVMBuildShuffleVector(builder, color0123, bld8.undef,
                                         LLVMConstVector(shuf, 16), "");

      /* lowest 2 bits of each 8 bit value contain index into "LUT" */
      low2mask = lp_build_const_int_vec(gallivm, type8, 3);
      /* add 0/4/8/12 for r/g/b/a */
      lut_adj = lp_build_const_int_vec(gallivm, type32, 0x0c080400);
      lut_adj = LLVMBuildBitCast(builder, lut_adj, bld8.vec_type, "");
      intrargs[0] = color0123;
      for (i = 0; i < 4; i++) {
         lut_ind = LLVMBuildAnd(builder, code, low2mask, "");
         lut_ind = LLVMBuildOr(builder, lut_ind, lut_adj, "");
         intrargs[1] = lut_ind;
         col[i] = lp_build_intrinsic(builder, "llvm.x86.ssse3.pshuf.b.128",
                                     bld8.vec_type, intrargs, 2, 0);
         col[i] = LLVMBuildBitCast(builder, col[i], bld32.vec_type, "");
         /* advance to the next 2bit index (shift done on 32bit elements) */
         code = LLVMBuildBitCast(builder, code, bld32.vec_type, "");
         code = LLVMBuildLShr(builder, code, const2, "");
         code = LLVMBuildBitCast(builder, code, bld8.vec_type, "");
      }
   }
   else {
      /* Thanks to vectorization can do 4 texels in parallel */
      LLVMValueRef color0, color1, color2, color3;
      if (format == PIPE_FORMAT_DXT1_RGB ||
          format == PIPE_FORMAT_DXT1_SRGB) {
         color01 = LLVMBuildOr(builder, color01, a, "");
         color23 = LLVMBuildOr(builder, color23, a, "");
      }
      /* broadcast each of the four colors to its own vector */
      color0 = LLVMBuildShuffleVector(builder, color01, bld32.undef,
                                      lp_build_const_shuffle1(gallivm, 0, 4), "");
      color1 = LLVMBuildShuffleVector(builder, color01, bld32.undef,
                                      lp_build_const_shuffle1(gallivm, 1, 4), "");
      color2 = LLVMBuildShuffleVector(builder, color23, bld32.undef,
                                      lp_build_const_shuffle1(gallivm, 0, 4), "");
      color3 = LLVMBuildShuffleVector(builder, color23, bld32.undef,
                                      lp_build_const_shuffle1(gallivm, 1, 4), "");
      code = LLVMBuildBitCast(builder, code, bld32.vec_type, "");

      for (i = 0; i < 4; i++) {
         /* select the colors */
         LLVMValueRef selmasklo, rgba01, rgba23, bitlo;
         bitlo = bld32.one;
         /* low index bit chooses within the pairs (0,1) and (2,3) */
         indices = LLVMBuildAnd(builder, code, bitlo, "");
         selmasklo = lp_build_compare(gallivm, type32, PIPE_FUNC_EQUAL,
                                      indices, bitlo);
         rgba01 = lp_build_select(&bld32, selmasklo, color1, color0);

         /* high index bit chooses between the two pairs */
         LLVMValueRef selmaskhi;
         indices = LLVMBuildAnd(builder, code, const2, "");
         selmaskhi = lp_build_compare(gallivm, type32, PIPE_FUNC_EQUAL,
                                      indices, const2);
         rgba23 = lp_build_select(&bld32, selmasklo, color3, color2);
         rgba = lp_build_select(&bld32, selmaskhi, rgba23, rgba01);

         /*
          * Note that this will give "wrong" order.
          * col0 will be rgba0, rgba4, rgba8, rgba12, col1 rgba1, rgba5, ...
          * This would be easily fixable by using different shuffle, bitlo/hi
          * vectors above (and different shift), but seems slightly easier to
          * deal with for dxt3/dxt5 alpha too. So instead change lookup.
          */
         col[i] = rgba;
         code = LLVMBuildLShr(builder, code, const2, "");
      }
   }
}
1441
1442 /*
1443  * decode one dxt3 block.
1444  */
1445 static void
1446 s3tc_decode_block_dxt3(struct gallivm_state *gallivm,
1447                        enum pipe_format format,
1448                        LLVMValueRef dxt_block,
1449                        LLVMValueRef *col)
1450 {
1451    LLVMBuilderRef builder = gallivm->builder;
1452    LLVMValueRef alpha, alphas0, alphas1, shift4_16, a[4], mask8hi;
1453    struct lp_type type32, type8, type16;
1454    unsigned i;
1455
1456    memset(&type32, 0, sizeof type32);
1457    type32.width = 32;
1458    type32.length = 4;
1459
1460    memset(&type8, 0, sizeof type8);
1461    type8.width = 8;
1462    type8.length = 16;
1463
1464    memset(&type16, 0, sizeof type16);
1465    type16.width = 16;
1466    type16.length = 8;
1467
1468    s3tc_decode_block_dxt1(gallivm, format, dxt_block, col);
1469
1470    shift4_16 = lp_build_const_int_vec(gallivm, type16, 4);
1471    mask8hi = lp_build_const_int_vec(gallivm, type32, 0xff000000);
1472
1473    alpha = LLVMBuildBitCast(builder, dxt_block,
1474                             lp_build_vec_type(gallivm, type8), "");
1475    alpha = lp_build_interleave2(gallivm, type8, alpha, alpha, 0);
1476    alpha = LLVMBuildBitCast(builder, alpha,
1477                             lp_build_vec_type(gallivm, type16), "");
1478    alpha = LLVMBuildAnd(builder, alpha,
1479                         lp_build_const_int_vec(gallivm, type16, 0xf00f), "");
1480    alphas0 = LLVMBuildLShr(builder, alpha, shift4_16, "");
1481    alphas1 = LLVMBuildShl(builder, alpha, shift4_16, "");
1482    alpha = LLVMBuildOr(builder, alphas0, alpha, "");
1483    alpha = LLVMBuildOr(builder, alphas1, alpha, "");
1484    alpha = LLVMBuildBitCast(builder, alpha,
1485                             lp_build_vec_type(gallivm, type32), "");
1486    /*
1487     * alpha now contains elems 0,1,2,3,... (ubytes)
1488     * we need 0,4,8,12, 1,5,9,13 etc. in dwords to match color (which
1489     * is just as easy as "natural" order - 3 shift/and instead of 6 unpack).
1490     */
1491    a[0] = LLVMBuildShl(builder, alpha,
1492                        lp_build_const_int_vec(gallivm, type32, 24), "");
1493    a[1] = LLVMBuildShl(builder, alpha,
1494                        lp_build_const_int_vec(gallivm, type32, 16), "");
1495    a[1] = LLVMBuildAnd(builder, a[1], mask8hi, "");
1496    a[2] = LLVMBuildShl(builder, alpha,
1497                        lp_build_const_int_vec(gallivm, type32, 8), "");
1498    a[2] = LLVMBuildAnd(builder, a[2], mask8hi, "");
1499    a[3] = LLVMBuildAnd(builder, alpha, mask8hi, "");
1500
1501    for (i = 0; i < 4; i++) {
1502       col[i] = LLVMBuildOr(builder, col[i], a[i], "");
1503    }
1504 }
1505
1506
1507 static LLVMValueRef
1508 lp_build_lerpdxta_block(struct gallivm_state *gallivm,
1509                         LLVMValueRef alpha0,
1510                         LLVMValueRef alpha1,
1511                         LLVMValueRef code,
1512                         LLVMValueRef sel_mask)
1513 {
1514    LLVMBuilderRef builder = gallivm->builder;
1515    LLVMValueRef delta, ainterp;
1516    LLVMValueRef weight5, weight7, weight;
1517    struct lp_type type16;
1518    struct lp_build_context bld;
1519
1520    memset(&type16, 0, sizeof type16);
1521    type16.width = 16;
1522    type16.length = 8;
1523    type16.sign = TRUE;
1524
1525    lp_build_context_init(&bld, gallivm, type16);
1526    /*
1527     * 256/7 is only 36.57 so we'd lose quite some precision. Since it would
1528     * actually be desirable to do this here with even higher accuracy than
1529     * even 8 bit (more or less required for rgtc, albeit that's not handled
1530     * here right now), shift the weights after multiplication by code.
1531     */
1532    weight5 = lp_build_const_int_vec(gallivm, type16, 256*64/5);
1533    weight7 = lp_build_const_int_vec(gallivm, type16, 256*64/7);
1534    weight = lp_build_select(&bld, sel_mask, weight7, weight5);
1535
1536    /*
1537     * we'll get garbage in the elements which had code 0 (or larger than
1538     * 5 or 7) but we don't care (or rather, need to fix up anyway).
1539     */
1540    code = LLVMBuildSub(builder, code, bld.one, "");
1541
1542    weight = LLVMBuildMul(builder, weight, code, "");
1543    weight = LLVMBuildLShr(builder, weight,
1544                           lp_build_const_int_vec(gallivm, type16, 6), "");
1545
1546    delta = LLVMBuildSub(builder, alpha1, alpha0, "");
1547
1548    ainterp = LLVMBuildMul(builder, delta, weight, "");
1549    ainterp = LLVMBuildLShr(builder, ainterp,
1550                            lp_build_const_int_vec(gallivm, type16, 8), "");
1551
1552    /* lerp is done later (with packed values) */
1553
1554    return ainterp;
1555 }
1556
1557
1558 /*
1559  * decode one dxt5 block.
1560  */
1561 static void
1562 s3tc_decode_block_dxt5(struct gallivm_state *gallivm,
1563                        enum pipe_format format,
1564                        LLVMValueRef dxt_block,
1565                        LLVMValueRef *col)
1566 {
1567    LLVMBuilderRef builder = gallivm->builder;
1568    LLVMValueRef alpha, alpha0, alpha1, ares;
1569    LLVMValueRef ainterp, ainterp0, ainterp1, shuffle1, sel_mask, sel_mask2;
1570    LLVMValueRef a[4], acode, tmp0, tmp1;
1571    LLVMTypeRef i64t, i32t;
1572    struct lp_type type32, type64, type8, type16;
1573    struct lp_build_context bld16, bld8;
1574    unsigned i;
1575
1576    memset(&type32, 0, sizeof type32);
1577    type32.width = 32;
1578    type32.length = 4;
1579
1580    memset(&type64, 0, sizeof type64);
1581    type64.width = 64;
1582    type64.length = 2;
1583
1584    memset(&type8, 0, sizeof type8);
1585    type8.width = 8;
1586    type8.length = 16;
1587
1588    memset(&type16, 0, sizeof type16);
1589    type16.width = 16;
1590    type16.length = 8;
1591
1592    lp_build_context_init(&bld16, gallivm, type16);
1593    lp_build_context_init(&bld8, gallivm, type8);
1594
1595    i64t = lp_build_vec_type(gallivm, type64);
1596    i32t = lp_build_vec_type(gallivm, type32);
1597
1598    s3tc_decode_block_dxt1(gallivm, format, dxt_block, col);
1599
1600    /*
1601     * three possible strategies for vectorizing alpha:
1602     * 1) compute all 8 values then use scalar extraction
1603     *    (i.e. have all 8 alpha values packed in one 64bit scalar
1604     *    and do something like ax = vals >> (codex * 8) followed
1605     *    by inserting these values back into color)
1606     * 2) same as 8 but just use pshufb as a mini-LUT for selection.
1607     *    (without pshufb would need boatloads of cmp/selects trying to
1608     *    keep things vectorized for essentially scalar selection).
1609     * 3) do something similar to the uncached case
1610     *    needs more calculations (need to calc 16 values instead of 8 though
1611     *    that's only an issue for the lerp which we need to do twice otherwise
1612     *    everything still fits into 128bit) but keeps things vectorized mostly.
1613     * Trying 3) here though not sure it's really faster...
1614     * With pshufb, we try 2) (cheaper and more accurate)
1615     */
1616
1617    /*
1618     * Ideally, we'd use 2 variable 16bit shifts here (byte shifts wouldn't
1619     * help since code crosses 8bit boundaries). But variable shifts are
1620     * AVX2 only, and even then only dword/quadword (intel _really_ hates
1621     * shifts!). Instead, emulate by 16bit muls.
1622     * Also, the required byte shuffles are essentially non-emulatable, so
1623     * require ssse3 (albeit other archs might do them fine).
1624     * This is not directly tied to ssse3 - just need sane byte shuffles.
1625     * But ordering is going to be different below so use same condition.
1626     */
1627
1628
1629    /* vectorize alpha */
1630    alpha = LLVMBuildBitCast(builder, dxt_block, i64t, "");
1631    alpha0 = LLVMBuildAnd(builder, alpha,
1632                          lp_build_const_int_vec(gallivm, type64, 0xff), "");
1633    alpha0 = LLVMBuildBitCast(builder, alpha0, bld16.vec_type, "");
1634    alpha = LLVMBuildBitCast(builder, alpha, bld16.vec_type, "");
1635    alpha1 = LLVMBuildLShr(builder, alpha,
1636                           lp_build_const_int_vec(gallivm, type16, 8), "");
1637    alpha = LLVMBuildBitCast(builder, alpha,  i64t, "");
1638    shuffle1 = lp_build_const_shuffle1(gallivm, 0, 8);
1639    alpha0 = LLVMBuildShuffleVector(builder, alpha0, alpha0, shuffle1, "");
1640    alpha1 = LLVMBuildShuffleVector(builder, alpha1, alpha1, shuffle1, "");
1641
1642    type16.sign = TRUE;
1643    sel_mask = lp_build_compare(gallivm, type16, PIPE_FUNC_GREATER,
1644                                alpha0, alpha1);
1645    type16.sign = FALSE;
1646    sel_mask = LLVMBuildBitCast(builder, sel_mask, bld8.vec_type, "");
1647
1648    if (!util_cpu_caps.has_ssse3) {
1649       LLVMValueRef acodeg, mask1, acode0, acode1;
1650
1651       /* extraction of the 3 bit values into something more useful is HARD */
1652       /* first steps are actually scalar */
1653       acode = LLVMBuildLShr(builder, alpha,
1654                             lp_build_const_int_vec(gallivm, type64, 16), "");
1655       tmp0 = LLVMBuildAnd(builder, acode,
1656                           lp_build_const_int_vec(gallivm, type64, 0xffffff), "");
1657       tmp1 =  LLVMBuildLShr(builder, acode,
1658                             lp_build_const_int_vec(gallivm, type64, 24), "");
1659       tmp0 = LLVMBuildBitCast(builder, tmp0, i32t, "");
1660       tmp1 = LLVMBuildBitCast(builder, tmp1, i32t, "");
1661       acode = lp_build_interleave2(gallivm, type32, tmp0, tmp1, 0);
1662       /* now have 2x24bit in 4x32bit, order 01234567, 89..., undef, undef */
1663       tmp0 = LLVMBuildAnd(builder, acode,
1664                           lp_build_const_int_vec(gallivm, type32, 0xfff), "");
1665       tmp1 =  LLVMBuildLShr(builder, acode,
1666                             lp_build_const_int_vec(gallivm, type32, 12), "");
1667       acode = lp_build_interleave2(gallivm, type32, tmp0, tmp1, 0);
1668       /* now have 4x12bit in 4x32bit, order 0123, 4567, ,,, */
1669       tmp0 = LLVMBuildAnd(builder, acode,
1670                           lp_build_const_int_vec(gallivm, type32, 0x3f), "");
1671       tmp1 =  LLVMBuildLShr(builder, acode,
1672                             lp_build_const_int_vec(gallivm, type32, 6), "");
1673       /* use signed pack doesn't matter and otherwise need sse41 */
1674       type32.sign = type16.sign = TRUE;
1675       acode = lp_build_pack2(gallivm, type32, type16, tmp0, tmp1);
1676       type32.sign = type16.sign = FALSE;
1677       /* now have 8x6bit in 8x16bit, 01, 45, 89, ..., 23, 67, ... */
1678       acode0 = LLVMBuildAnd(builder, acode,
1679                             lp_build_const_int_vec(gallivm, type16, 0x7), "");
1680       acode1 =  LLVMBuildLShr(builder, acode,
1681                               lp_build_const_int_vec(gallivm, type16, 3), "");
1682       acode = lp_build_pack2(gallivm, type16, type8, acode0, acode1);
1683       /* acode0 contains elems 0,4,8,12,2,6,10,14, acode1 1,5,9,... */
1684
1685       acodeg = LLVMBuildAnd(builder, acode,
1686                             LLVMBuildNot(builder, sel_mask, ""), "");
1687       mask1 = lp_build_compare(gallivm, type8, PIPE_FUNC_EQUAL,
1688                                acode, bld8.one);
1689
1690       sel_mask = LLVMBuildBitCast(builder, sel_mask, bld16.vec_type, "");
1691       ainterp0 = lp_build_lerpdxta_block(gallivm, alpha0, alpha1, acode0, sel_mask);
1692       ainterp1 = lp_build_lerpdxta_block(gallivm, alpha0, alpha1, acode1, sel_mask);
1693       sel_mask = LLVMBuildBitCast(builder, sel_mask, bld8.vec_type, "");
1694       ainterp = lp_build_pack2(gallivm, type16, type8, ainterp0, ainterp1);
1695       alpha0 = lp_build_pack2(gallivm, type16, type8, alpha0, alpha0);
1696       alpha1 = lp_build_pack2(gallivm, type16, type8, alpha1, alpha1);
1697       ainterp = LLVMBuildAdd(builder, ainterp, alpha0, "");
1698       /* Fix up val01 */
1699       sel_mask2 = lp_build_compare(gallivm, type8, PIPE_FUNC_EQUAL,
1700                                    acode, bld8.zero);
1701       ainterp = lp_build_select(&bld8, sel_mask2, alpha0, ainterp);
1702       ainterp = lp_build_select(&bld8, mask1, alpha1, ainterp);
1703
1704       /* fix up val67 if a0 <= a1 */
1705       sel_mask2 = lp_build_compare(gallivm, type8, PIPE_FUNC_EQUAL,
1706                                    acodeg, lp_build_const_int_vec(gallivm, type8, 6));
1707       ares = LLVMBuildAnd(builder, ainterp, LLVMBuildNot(builder, sel_mask2, ""), "");
1708       sel_mask2 = lp_build_compare(gallivm, type8, PIPE_FUNC_EQUAL,
1709                                    acodeg, lp_build_const_int_vec(gallivm, type8, 7));
1710       ares = LLVMBuildOr(builder, ares, sel_mask2, "");
1711
1712       /* unpack in right order (0,4,8,12,1,5,..) */
1713       /* this gives us zero, a0, zero, a4, zero, a8, ... for tmp0 */
1714       tmp0 = lp_build_interleave2(gallivm, type8, bld8.zero, ares, 0);
1715       tmp1 = lp_build_interleave2(gallivm, type8, bld8.zero, ares, 1);
1716       tmp0 = LLVMBuildBitCast(builder, tmp0, bld16.vec_type, "");
1717       tmp1 = LLVMBuildBitCast(builder, tmp1, bld16.vec_type, "");
1718
1719       a[0] = lp_build_interleave2(gallivm, type16, bld16.zero, tmp0, 0);
1720       a[1] = lp_build_interleave2(gallivm, type16, bld16.zero, tmp1, 0);
1721       a[2] = lp_build_interleave2(gallivm, type16, bld16.zero, tmp0, 1);
1722       a[3] = lp_build_interleave2(gallivm, type16, bld16.zero, tmp1, 1);
1723    }
1724    else {
1725       LLVMValueRef elems[16], intrargs[2], shufa, mulclo, mulchi, mask8hi;
1726       LLVMTypeRef type16s = LLVMInt16TypeInContext(gallivm->context);
1727       LLVMTypeRef type8s = LLVMInt8TypeInContext(gallivm->context);
1728       unsigned i, j;
1729       /*
1730        * Ideally, we'd use 2 variable 16bit shifts here (byte shifts wouldn't
1731        * help since code crosses 8bit boundaries). But variable shifts are
1732        * AVX2 only, and even then only dword/quadword (intel _really_ hates
1733        * shifts!). Instead, emulate by 16bit muls.
1734        * Also, the required byte shuffles are essentially non-emulatable, so
1735        * require ssse3 (albeit other archs might do them fine, but the
1736        * complete path is ssse3 only for now).
1737        */
1738       for (i = 0, j = 0; i < 16; i += 8, j += 3) {
1739          elems[i+0] = elems[i+1] = elems[i+2] = lp_build_const_int32(gallivm, j+2);
1740          elems[i+3] = elems[i+4] = lp_build_const_int32(gallivm, j+3);
1741          elems[i+5] = elems[i+6] = elems[i+7] = lp_build_const_int32(gallivm, j+4);
1742       }
1743       shufa = LLVMConstVector(elems, 16);
1744       alpha = LLVMBuildBitCast(builder, alpha, bld8.vec_type, "");
1745       acode = LLVMBuildShuffleVector(builder, alpha, bld8.undef, shufa, "");
1746       acode = LLVMBuildBitCast(builder, acode, bld16.vec_type, "");
1747       /*
1748        * Put 0/2/4/6 into high 3 bits of 16 bits (save AND mask)
1749        * Do the same for 1/3/5/7 (albeit still need mask there - ideally
1750        * we'd place them into bits 4-7 so could save shift but impossible.)
1751        */
1752       for (i = 0; i < 8; i += 4) {
1753          elems[i+0] = LLVMConstInt(type16s, 1 << (13-0), 0);
1754          elems[i+1] = LLVMConstInt(type16s, 1 << (13-6), 0);
1755          elems[i+2] = LLVMConstInt(type16s, 1 << (13-4), 0);
1756          elems[i+3] = LLVMConstInt(type16s, 1 << (13-2), 0);
1757       }
1758       mulclo = LLVMConstVector(elems, 8);
1759       for (i = 0; i < 8; i += 4) {
1760          elems[i+0] = LLVMConstInt(type16s, 1 << (13-3), 0);
1761          elems[i+1] = LLVMConstInt(type16s, 1 << (13-9), 0);
1762          elems[i+2] = LLVMConstInt(type16s, 1 << (13-7), 0);
1763          elems[i+3] = LLVMConstInt(type16s, 1 << (13-5), 0);
1764       }
1765       mulchi = LLVMConstVector(elems, 8);
1766
1767       tmp0 = LLVMBuildMul(builder, acode, mulclo, "");
1768       tmp1 = LLVMBuildMul(builder, acode, mulchi, "");
1769       tmp0 = LLVMBuildLShr(builder, tmp0,
1770                            lp_build_const_int_vec(gallivm, type16, 13), "");
1771       tmp1 = LLVMBuildLShr(builder, tmp1,
1772                            lp_build_const_int_vec(gallivm, type16, 5), "");
1773       tmp1 = LLVMBuildAnd(builder, tmp1,
1774                           lp_build_const_int_vec(gallivm, type16, 0x700), "");
1775       acode = LLVMBuildOr(builder, tmp0, tmp1, "");
1776       acode = LLVMBuildBitCast(builder, acode, bld8.vec_type, "");
1777
1778       /*
1779        * Note that ordering is different here to non-ssse3 path:
1780        * 0/1/2/3/4/5...
1781        */
1782
1783       LLVMValueRef weight0, weight1, weight, delta;
1784       LLVMValueRef constff_elem7, const0_elem6;
1785       /* weights, correctly rounded (round(256*x/7)) */
1786       elems[0] = LLVMConstInt(type16s, 256, 0);
1787       elems[1] = LLVMConstInt(type16s, 0, 0);
1788       elems[2] = LLVMConstInt(type16s, 219, 0);
1789       elems[3] =  LLVMConstInt(type16s, 183, 0);
1790       elems[4] =  LLVMConstInt(type16s, 146, 0);
1791       elems[5] =  LLVMConstInt(type16s, 110, 0);
1792       elems[6] =  LLVMConstInt(type16s, 73, 0);
1793       elems[7] =  LLVMConstInt(type16s, 37, 0);
1794       weight0 = LLVMConstVector(elems, 8);
1795
1796       elems[0] = LLVMConstInt(type16s, 256, 0);
1797       elems[1] = LLVMConstInt(type16s, 0, 0);
1798       elems[2] = LLVMConstInt(type16s, 205, 0);
1799       elems[3] =  LLVMConstInt(type16s, 154, 0);
1800       elems[4] =  LLVMConstInt(type16s, 102, 0);
1801       elems[5] =  LLVMConstInt(type16s, 51, 0);
1802       elems[6] =  LLVMConstInt(type16s, 0, 0);
1803       elems[7] =  LLVMConstInt(type16s, 0, 0);
1804       weight1 = LLVMConstVector(elems, 8);
1805
1806       weight0 = LLVMBuildBitCast(builder, weight0, bld8.vec_type, "");
1807       weight1 = LLVMBuildBitCast(builder, weight1, bld8.vec_type, "");
1808       weight = lp_build_select(&bld8, sel_mask, weight0, weight1);
1809       weight = LLVMBuildBitCast(builder, weight, bld16.vec_type, "");
1810
1811       for (i = 0; i < 16; i++) {
1812          elems[i] = LLVMConstNull(type8s);
1813       }
1814       elems[7] = LLVMConstInt(type8s, 255, 0);
1815       constff_elem7 = LLVMConstVector(elems, 16);
1816
1817       for (i = 0; i < 16; i++) {
1818          elems[i] = LLVMConstInt(type8s, 255, 0);
1819       }
1820       elems[6] = LLVMConstInt(type8s, 0, 0);
1821       const0_elem6 = LLVMConstVector(elems, 16);
1822
1823       /* standard simple lerp - but the version we need isn't available */
1824       delta = LLVMBuildSub(builder, alpha0, alpha1, "");
1825       ainterp = LLVMBuildMul(builder, delta, weight, "");
1826       ainterp = LLVMBuildLShr(builder, ainterp,
1827                               lp_build_const_int_vec(gallivm, type16, 8), "");
1828       ainterp = LLVMBuildBitCast(builder, ainterp, bld8.vec_type, "");
1829       alpha1 = LLVMBuildBitCast(builder, alpha1, bld8.vec_type, "");
1830       ainterp = LLVMBuildAdd(builder, ainterp, alpha1, "");
1831       ainterp = LLVMBuildBitCast(builder, ainterp, bld16.vec_type, "");
1832       ainterp = lp_build_pack2(gallivm, type16, type8, ainterp, bld16.undef);
1833
1834       /* fixing 0/0xff case is slightly more complex */
1835       constff_elem7 = LLVMBuildAnd(builder, constff_elem7,
1836                                    LLVMBuildNot(builder, sel_mask, ""), "");
1837       const0_elem6 = LLVMBuildOr(builder, const0_elem6, sel_mask, "");
1838       ainterp = LLVMBuildOr(builder, ainterp, constff_elem7, "");
1839       ainterp = LLVMBuildAnd(builder, ainterp, const0_elem6, "");
1840
1841       /* now pick all 16 elements at once! */
1842       intrargs[0] = ainterp;
1843       intrargs[1] = acode;
1844       ares = lp_build_intrinsic(builder, "llvm.x86.ssse3.pshuf.b.128",
1845                                 bld8.vec_type, intrargs, 2, 0);
1846
1847       ares = LLVMBuildBitCast(builder, ares, i32t, "");
1848       mask8hi = lp_build_const_int_vec(gallivm, type32, 0xff000000);
1849       a[0] = LLVMBuildShl(builder, ares,
1850                           lp_build_const_int_vec(gallivm, type32, 24), "");
1851       a[1] = LLVMBuildShl(builder, ares,
1852                           lp_build_const_int_vec(gallivm, type32, 16), "");
1853       a[1] = LLVMBuildAnd(builder, a[1], mask8hi, "");
1854       a[2] = LLVMBuildShl(builder, ares,
1855                           lp_build_const_int_vec(gallivm, type32, 8), "");
1856       a[2] = LLVMBuildAnd(builder, a[2], mask8hi, "");
1857       a[3] = LLVMBuildAnd(builder, ares, mask8hi, "");
1858    }
1859
1860    for (i = 0; i < 4; i++) {
1861       a[i] = LLVMBuildBitCast(builder, a[i], i32t, "");
1862       col[i] = LLVMBuildOr(builder, col[i], a[i], "");
1863    }
1864 }
1865
1866
1867 static void
1868 generate_update_cache_one_block(struct gallivm_state *gallivm,
1869                                 LLVMValueRef function,
1870                                 const struct util_format_description *format_desc)
1871 {
1872    LLVMBasicBlockRef block;
1873    LLVMBuilderRef old_builder;
1874    LLVMValueRef ptr_addr;
1875    LLVMValueRef hash_index;
1876    LLVMValueRef cache;
1877    LLVMValueRef dxt_block, tag_value;
1878    LLVMValueRef col[LP_MAX_VECTOR_LENGTH];
1879
1880    ptr_addr     = LLVMGetParam(function, 0);
1881    hash_index   = LLVMGetParam(function, 1);
1882    cache        = LLVMGetParam(function, 2);
1883
1884    lp_build_name(ptr_addr,   "ptr_addr"  );
1885    lp_build_name(hash_index, "hash_index");
1886    lp_build_name(cache,      "cache_addr");
1887
1888    /*
1889     * Function body
1890     */
1891
1892    old_builder = gallivm->builder;
1893    block = LLVMAppendBasicBlockInContext(gallivm->context, function, "entry");
1894    gallivm->builder = LLVMCreateBuilderInContext(gallivm->context);
1895    LLVMPositionBuilderAtEnd(gallivm->builder, block);
1896
1897    lp_build_gather_s3tc_simple_scalar(gallivm, format_desc, &dxt_block,
1898                                       ptr_addr);
1899
1900    switch (format_desc->format) {
1901    case PIPE_FORMAT_DXT1_RGB:
1902    case PIPE_FORMAT_DXT1_RGBA:
1903    case PIPE_FORMAT_DXT1_SRGB:
1904    case PIPE_FORMAT_DXT1_SRGBA:
1905       s3tc_decode_block_dxt1(gallivm, format_desc->format, dxt_block, col);
1906       break;
1907    case PIPE_FORMAT_DXT3_RGBA:
1908    case PIPE_FORMAT_DXT3_SRGBA:
1909       s3tc_decode_block_dxt3(gallivm, format_desc->format, dxt_block, col);
1910       break;
1911    case PIPE_FORMAT_DXT5_RGBA:
1912    case PIPE_FORMAT_DXT5_SRGBA:
1913       s3tc_decode_block_dxt5(gallivm, format_desc->format, dxt_block, col);
1914       break;
1915    default:
1916       assert(0);
1917       s3tc_decode_block_dxt1(gallivm, format_desc->format, dxt_block, col);
1918       break;
1919    }
1920
1921    tag_value = LLVMBuildPtrToInt(gallivm->builder, ptr_addr,
1922                                  LLVMInt64TypeInContext(gallivm->context), "");
1923    s3tc_store_cached_block(gallivm, col, tag_value, hash_index, cache);
1924
1925    LLVMBuildRetVoid(gallivm->builder);
1926
1927    LLVMDisposeBuilder(gallivm->builder);
1928    gallivm->builder = old_builder;
1929
1930    gallivm_verify_function(gallivm, function);
1931 }
1932
1933
1934 static void
1935 update_cached_block(struct gallivm_state *gallivm,
1936                     const struct util_format_description *format_desc,
1937                     LLVMValueRef ptr_addr,
1938                     LLVMValueRef hash_index,
1939                     LLVMValueRef cache)
1940
1941 {
1942    LLVMBuilderRef builder = gallivm->builder;
1943    LLVMModuleRef module = gallivm->module;
1944    char name[256];
1945    LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context);
1946    LLVMTypeRef pi8t = LLVMPointerType(i8t, 0);
1947    LLVMValueRef function, inst;
1948    LLVMBasicBlockRef bb;
1949    LLVMValueRef args[3];
1950
1951    util_snprintf(name, sizeof name, "%s_update_cache_one_block",
1952                  format_desc->short_name);
1953    function = LLVMGetNamedFunction(module, name);
1954
1955    if (!function) {
1956       LLVMTypeRef ret_type;
1957       LLVMTypeRef arg_types[3];
1958       LLVMTypeRef function_type;
1959       unsigned arg;
1960
1961       /*
1962        * Generate the function prototype.
1963        */
1964
1965       ret_type = LLVMVoidTypeInContext(gallivm->context);
1966       arg_types[0] = pi8t;
1967       arg_types[1] = LLVMInt32TypeInContext(gallivm->context);
1968       arg_types[2] = LLVMTypeOf(cache); // XXX: put right type here
1969       function_type = LLVMFunctionType(ret_type, arg_types, ARRAY_SIZE(arg_types), 0);
1970       function = LLVMAddFunction(module, name, function_type);
1971
1972       for (arg = 0; arg < ARRAY_SIZE(arg_types); ++arg)
1973          if (LLVMGetTypeKind(arg_types[arg]) == LLVMPointerTypeKind)
1974             lp_add_function_attr(function, arg + 1, LP_FUNC_ATTR_NOALIAS);
1975
1976       LLVMSetFunctionCallConv(function, LLVMFastCallConv);
1977       LLVMSetVisibility(function, LLVMHiddenVisibility);
1978       generate_update_cache_one_block(gallivm, function, format_desc);
1979    }
1980
1981    args[0] = ptr_addr;
1982    args[1] = hash_index;
1983    args[2] = cache;
1984
1985    LLVMBuildCall(builder, function, args, ARRAY_SIZE(args), "");
1986    bb = LLVMGetInsertBlock(builder);
1987    inst = LLVMGetLastInstruction(bb);
1988    LLVMSetInstructionCallConv(inst, LLVMFastCallConv);
1989 }
1990
1991 /*
1992  * cached lookup
1993  */
1994 static LLVMValueRef
1995 compressed_fetch_cached(struct gallivm_state *gallivm,
1996                         const struct util_format_description *format_desc,
1997                         unsigned n,
1998                         LLVMValueRef base_ptr,
1999                         LLVMValueRef offset,
2000                         LLVMValueRef i,
2001                         LLVMValueRef j,
2002                         LLVMValueRef cache)
2003
2004 {
2005    LLVMBuilderRef builder = gallivm->builder;
2006    unsigned count, low_bit, log2size;
2007    LLVMValueRef color, offset_stored, addr, ptr_addrtrunc, tmp;
2008    LLVMValueRef ij_index, hash_index, hash_mask, block_index;
2009    LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context);
2010    LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
2011    LLVMTypeRef i64t = LLVMInt64TypeInContext(gallivm->context);
2012    struct lp_type type;
2013    struct lp_build_context bld32;
2014    memset(&type, 0, sizeof type);
2015    type.width = 32;
2016    type.length = n;
2017
2018    lp_build_context_init(&bld32, gallivm, type);
2019
2020    /*
2021     * compute hash - we use direct mapped cache, the hash function could
2022     *                be better but it needs to be simple
2023     * per-element:
2024     *    compare offset with offset stored at tag (hash)
2025     *    if not equal extract block, store block, update tag
2026     *    extract color from cache
2027     *    assemble colors
2028     */
2029
2030    low_bit = util_logbase2(format_desc->block.bits / 8);
2031    log2size = util_logbase2(LP_BUILD_FORMAT_CACHE_SIZE);
2032    addr = LLVMBuildPtrToInt(builder, base_ptr, i64t, "");
2033    ptr_addrtrunc = LLVMBuildPtrToInt(builder, base_ptr, i32t, "");
2034    ptr_addrtrunc = lp_build_broadcast_scalar(&bld32, ptr_addrtrunc);
2035    /* For the hash function, first mask off the unused lowest bits. Then just
2036       do some xor with address bits - only use lower 32bits */
2037    ptr_addrtrunc = LLVMBuildAdd(builder, offset, ptr_addrtrunc, "");
2038    ptr_addrtrunc = LLVMBuildLShr(builder, ptr_addrtrunc,
2039                                  lp_build_const_int_vec(gallivm, type, low_bit), "");
2040    /* This only really makes sense for size 64,128,256 */
2041    hash_index = ptr_addrtrunc;
2042    ptr_addrtrunc = LLVMBuildLShr(builder, ptr_addrtrunc,
2043                                  lp_build_const_int_vec(gallivm, type, 2*log2size), "");
2044    hash_index = LLVMBuildXor(builder, ptr_addrtrunc, hash_index, "");
2045    tmp = LLVMBuildLShr(builder, hash_index,
2046                        lp_build_const_int_vec(gallivm, type, log2size), "");
2047    hash_index = LLVMBuildXor(builder, hash_index, tmp, "");
2048
2049    hash_mask = lp_build_const_int_vec(gallivm, type, LP_BUILD_FORMAT_CACHE_SIZE - 1);
2050    hash_index = LLVMBuildAnd(builder, hash_index, hash_mask, "");
2051    ij_index = LLVMBuildShl(builder, i, lp_build_const_int_vec(gallivm, type, 2), "");
2052    ij_index = LLVMBuildAdd(builder, ij_index, j, "");
2053    block_index = LLVMBuildShl(builder, hash_index,
2054                               lp_build_const_int_vec(gallivm, type, 4), "");
2055    block_index = LLVMBuildAdd(builder, ij_index, block_index, "");
2056
2057    if (n > 1) {
2058       color = bld32.undef;
2059       for (count = 0; count < n; count++) {
2060          LLVMValueRef index, cond, colorx;
2061          LLVMValueRef block_indexx, hash_indexx, addrx, offsetx, ptr_addrx;
2062          struct lp_build_if_state if_ctx;
2063
2064          index = lp_build_const_int32(gallivm, count);
2065          offsetx = LLVMBuildExtractElement(builder, offset, index, "");
2066          addrx = LLVMBuildZExt(builder, offsetx, i64t, "");
2067          addrx = LLVMBuildAdd(builder, addrx, addr, "");
2068          block_indexx = LLVMBuildExtractElement(builder, block_index, index, "");
2069          hash_indexx = LLVMBuildLShr(builder, block_indexx,
2070                                      lp_build_const_int32(gallivm, 4), "");
2071          offset_stored = s3tc_lookup_tag_data(gallivm, cache, hash_indexx);
2072          cond = LLVMBuildICmp(builder, LLVMIntNE, offset_stored, addrx, "");
2073
2074          lp_build_if(&if_ctx, gallivm, cond);
2075          {
2076             ptr_addrx = LLVMBuildIntToPtr(builder, addrx,
2077                                           LLVMPointerType(i8t, 0), "");
2078             update_cached_block(gallivm, format_desc, ptr_addrx, hash_indexx, cache);
2079 #if LP_BUILD_FORMAT_CACHE_DEBUG
2080             s3tc_update_cache_access(gallivm, cache, 1,
2081                                      LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS);
2082 #endif
2083          }
2084          lp_build_endif(&if_ctx);
2085
2086          colorx = s3tc_lookup_cached_pixel(gallivm, cache, block_indexx);
2087
2088          color = LLVMBuildInsertElement(builder, color, colorx,
2089                                         lp_build_const_int32(gallivm, count), "");
2090       }
2091    }
2092    else {
2093       LLVMValueRef cond;
2094       struct lp_build_if_state if_ctx;
2095
2096       tmp = LLVMBuildZExt(builder, offset, i64t, "");
2097       addr = LLVMBuildAdd(builder, tmp, addr, "");
2098       offset_stored = s3tc_lookup_tag_data(gallivm, cache, hash_index);
2099       cond = LLVMBuildICmp(builder, LLVMIntNE, offset_stored, addr, "");
2100
2101       lp_build_if(&if_ctx, gallivm, cond);
2102       {
2103          tmp = LLVMBuildIntToPtr(builder, addr, LLVMPointerType(i8t, 0), "");
2104          update_cached_block(gallivm, format_desc, tmp, hash_index, cache);
2105 #if LP_BUILD_FORMAT_CACHE_DEBUG
2106          s3tc_update_cache_access(gallivm, cache, 1,
2107                                   LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS);
2108 #endif
2109       }
2110       lp_build_endif(&if_ctx);
2111
2112       color = s3tc_lookup_cached_pixel(gallivm, cache, block_index);
2113    }
2114 #if LP_BUILD_FORMAT_CACHE_DEBUG
2115    s3tc_update_cache_access(gallivm, cache, n,
2116                             LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_TOTAL);
2117 #endif
2118    return LLVMBuildBitCast(builder, color, LLVMVectorType(i8t, n * 4), "");
2119 }
2120
2121
2122 static LLVMValueRef
2123 s3tc_dxt5_to_rgba_aos(struct gallivm_state *gallivm,
2124                       unsigned n,
2125                       enum pipe_format format,
2126                       LLVMValueRef colors,
2127                       LLVMValueRef codewords,
2128                       LLVMValueRef alpha_lo,
2129                       LLVMValueRef alpha_hi,
2130                       LLVMValueRef i,
2131                       LLVMValueRef j)
2132 {
2133    return s3tc_dxt5_full_to_rgba_aos(gallivm, n, format, colors,
2134                                      codewords, alpha_lo, alpha_hi, i, j);
2135 }
2136
2137
2138 /**
2139  * @param n  number of pixels processed (usually n=4, but it should also work with n=1
2140  *           and multiples of 4)
2141  * @param base_ptr  base pointer (32bit or 64bit pointer depending on the architecture)
2142  * @param offset <n x i32> vector with the relative offsets of the S3TC blocks
2143  * @param i  is a <n x i32> vector with the x subpixel coordinate (0..3)
2144  * @param j  is a <n x i32> vector with the y subpixel coordinate (0..3)
2145  * @return  a <4*n x i8> vector with the pixel RGBA values in AoS
2146  */
2147 LLVMValueRef
2148 lp_build_fetch_s3tc_rgba_aos(struct gallivm_state *gallivm,
2149                              const struct util_format_description *format_desc,
2150                              unsigned n,
2151                              LLVMValueRef base_ptr,
2152                              LLVMValueRef offset,
2153                              LLVMValueRef i,
2154                              LLVMValueRef j,
2155                              LLVMValueRef cache)
2156 {
2157    LLVMValueRef rgba;
2158    LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context);
2159    LLVMBuilderRef builder = gallivm->builder;
2160
2161    assert(format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC);
2162    assert(format_desc->block.width == 4);
2163    assert(format_desc->block.height == 4);
2164
2165    assert((n == 1) || (n % 4 == 0));
2166
2167 /*   debug_printf("format = %d\n", format_desc->format);*/
2168    if (cache) {
2169       rgba = compressed_fetch_cached(gallivm, format_desc, n,
2170                                      base_ptr, offset, i, j, cache);
2171       return rgba;
2172    }
2173
2174    /*
2175     * Could use n > 8 here with avx2, but doesn't seem faster.
2176     */
2177    if (n > 4) {
2178       unsigned count;
2179       LLVMTypeRef i8_vectype = LLVMVectorType(i8t, 4 * n);
2180       LLVMTypeRef i128_type = LLVMIntTypeInContext(gallivm->context, 128);
2181       LLVMTypeRef i128_vectype =  LLVMVectorType(i128_type, n / 4);
2182       LLVMTypeRef i324_vectype = LLVMVectorType(LLVMInt32TypeInContext(
2183                                                 gallivm->context), 4);
2184       LLVMValueRef offset4, i4, j4, rgba4[LP_MAX_VECTOR_LENGTH/16];
2185       struct lp_type lp_324_vectype = lp_type_uint_vec(32, 128);
2186
2187       assert(n / 4 <= ARRAY_SIZE(rgba4));
2188
2189       rgba = LLVMGetUndef(i128_vectype);
2190
2191       for (count = 0; count < n / 4; count++) {
2192          LLVMValueRef colors, codewords, alpha_lo = NULL, alpha_hi = NULL;
2193
2194          i4 = lp_build_extract_range(gallivm, i, count * 4, 4);
2195          j4 = lp_build_extract_range(gallivm, j, count * 4, 4);
2196          offset4 = lp_build_extract_range(gallivm, offset, count * 4, 4);
2197
2198          lp_build_gather_s3tc(gallivm, 4, format_desc, &colors, &codewords,
2199                               &alpha_lo, &alpha_hi, base_ptr, offset4);
2200
2201          switch (format_desc->format) {
2202          case PIPE_FORMAT_DXT1_RGB:
2203          case PIPE_FORMAT_DXT1_RGBA:
2204          case PIPE_FORMAT_DXT1_SRGB:
2205          case PIPE_FORMAT_DXT1_SRGBA:
2206             rgba4[count] = s3tc_dxt1_to_rgba_aos(gallivm, 4, format_desc->format,
2207                                                  colors, codewords, i4, j4);
2208             break;
2209          case PIPE_FORMAT_DXT3_RGBA:
2210          case PIPE_FORMAT_DXT3_SRGBA:
2211             rgba4[count] = s3tc_dxt3_to_rgba_aos(gallivm, 4, format_desc->format, colors,
2212                                                  codewords, alpha_lo, alpha_hi, i4, j4);
2213             break;
2214          case PIPE_FORMAT_DXT5_RGBA:
2215          case PIPE_FORMAT_DXT5_SRGBA:
2216             rgba4[count] = s3tc_dxt5_to_rgba_aos(gallivm, 4, format_desc->format, colors,
2217                                                  codewords, alpha_lo, alpha_hi, i4, j4);
2218             break;
2219          default:
2220             assert(0);
2221             rgba4[count] = LLVMGetUndef(LLVMVectorType(i8t, 4));
2222             break;
2223          }
2224          /* shuffles typically give best results with dword elements...*/
2225          rgba4[count] = LLVMBuildBitCast(builder, rgba4[count], i324_vectype, "");
2226       }
2227       rgba = lp_build_concat(gallivm, rgba4, lp_324_vectype, n / 4);
2228       rgba = LLVMBuildBitCast(builder, rgba, i8_vectype, "");
2229    }
2230    else {
2231       LLVMValueRef colors, codewords, alpha_lo = NULL, alpha_hi = NULL;
2232
2233       lp_build_gather_s3tc(gallivm, n, format_desc, &colors, &codewords,
2234                            &alpha_lo, &alpha_hi, base_ptr, offset);
2235
2236       switch (format_desc->format) {
2237       case PIPE_FORMAT_DXT1_RGB:
2238       case PIPE_FORMAT_DXT1_RGBA:
2239       case PIPE_FORMAT_DXT1_SRGB:
2240       case PIPE_FORMAT_DXT1_SRGBA:
2241          rgba = s3tc_dxt1_to_rgba_aos(gallivm, n, format_desc->format,
2242                                       colors, codewords, i, j);
2243          break;
2244       case PIPE_FORMAT_DXT3_RGBA:
2245       case PIPE_FORMAT_DXT3_SRGBA:
2246          rgba = s3tc_dxt3_to_rgba_aos(gallivm, n, format_desc->format, colors,
2247                                       codewords, alpha_lo, alpha_hi, i, j);
2248          break;
2249       case PIPE_FORMAT_DXT5_RGBA:
2250       case PIPE_FORMAT_DXT5_SRGBA:
2251          rgba = s3tc_dxt5_to_rgba_aos(gallivm, n, format_desc->format, colors,
2252                                       codewords, alpha_lo, alpha_hi, i, j);
2253          break;
2254       default:
2255          assert(0);
2256          rgba = LLVMGetUndef(LLVMVectorType(i8t, 4*n));
2257          break;
2258       }
2259    }
2260
2261    /* always return just decompressed values - srgb conversion is done later */
2262
2263    return rgba;
2264 }