gallivm: fix rgtc2 format
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_format_s3tc.c
1 /**************************************************************************
2 *
3 * Copyright 2010-2018 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
17 * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
18 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
19 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
20 * USE OR OTHER DEALINGS IN THE SOFTWARE.
21 *
22 * The above copyright notice and this permission notice (including the
23 * next paragraph) shall be included in all copies or substantial portions
24 * of the Software.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * s3tc pixel format manipulation.
32 *
33 * @author Roland Scheidegger <sroland@vmware.com>
34 */
35
36
37 #include <llvm/Config/llvm-config.h>
38
39 #include "util/format/u_format.h"
40 #include "util/u_math.h"
41 #include "util/u_string.h"
42 #include "util/u_cpu_detect.h"
43 #include "util/u_debug.h"
44
45 #include "lp_bld_arit.h"
46 #include "lp_bld_type.h"
47 #include "lp_bld_const.h"
48 #include "lp_bld_conv.h"
49 #include "lp_bld_gather.h"
50 #include "lp_bld_format.h"
51 #include "lp_bld_logic.h"
52 #include "lp_bld_pack.h"
53 #include "lp_bld_flow.h"
54 #include "lp_bld_printf.h"
55 #include "lp_bld_struct.h"
56 #include "lp_bld_swizzle.h"
57 #include "lp_bld_init.h"
58 #include "lp_bld_debug.h"
59 #include "lp_bld_intr.h"
60
61
62 /**
63 * Reverse an interleave2_half
64 * (i.e. pick every second element, independently for the lower/upper halves)
65 * sse2 can only do that with 32bit (shufps) or larger elements
66 * natively. (Otherwise, and/pack (even) or shift/pack (odd)
67 * could be used, ideally llvm would do that for us.)
68 * XXX: Unfortunately, this does NOT translate to a shufps if those
69 * are int vectors (and casting will not help, llvm needs to recognize it
70 * as "real" float). Instead, llvm will use a pshufd/pshufd/punpcklqdq
71 * sequence which I'm pretty sure is a lot worse, even accounting for the
72 * domain transition penalties shufps would incur (except maybe on Nehalem).
73 */
74 static LLVMValueRef
75 lp_build_uninterleave2_half(struct gallivm_state *gallivm,
76 struct lp_type type,
77 LLVMValueRef a,
78 LLVMValueRef b,
79 unsigned lo_hi)
80 {
81 LLVMValueRef shuffle, elems[LP_MAX_VECTOR_LENGTH];
82 unsigned i;
83
84 assert(type.length <= LP_MAX_VECTOR_LENGTH);
85 assert(lo_hi < 2);
86
87 if (type.length * type.width == 256) {
88 assert(type.length == 8);
89 assert(type.width == 32);
90 static const unsigned shufvals[8] = {0, 2, 8, 10, 4, 6, 12, 14};
91 for (i = 0; i < type.length; ++i) {
92 elems[i] = lp_build_const_int32(gallivm, shufvals[i] + lo_hi);
93 }
94 } else {
95 for (i = 0; i < type.length; ++i) {
96 elems[i] = lp_build_const_int32(gallivm, 2*i + lo_hi);
97 }
98 }
99
100 shuffle = LLVMConstVector(elems, type.length);
101
102 return LLVMBuildShuffleVector(gallivm->builder, a, b, shuffle, "");
103
104 }
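
/*
 * Illustrative scalar sketch of the shuffle built above for the simple
 * (non-256-bit) case, e.g. a 4x32 vector; the ref_* name is hypothetical
 * and not part of the build.  "lo_hi" picks the even (0) or odd (1)
 * elements of the two concatenated sources; the 8x32 path uses a
 * different index table so the two 128-bit halves stay independent.
 *
 *    static void
 *    ref_uninterleave2_half(const uint32_t a[4], const uint32_t b[4],
 *                           unsigned lo_hi, uint32_t out[4])
 *    {
 *       out[0] = a[0 + lo_hi];
 *       out[1] = a[2 + lo_hi];
 *       out[2] = b[0 + lo_hi];
 *       out[3] = b[2 + lo_hi];
 *    }
 */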
105
106
107 /**
108 * Build shuffle for extending vectors.
109 */
110 static LLVMValueRef
111 lp_build_const_extend_shuffle(struct gallivm_state *gallivm,
112 unsigned n, unsigned length)
113 {
114 LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
115 unsigned i;
116
117 assert(n <= length);
118 assert(length <= LP_MAX_VECTOR_LENGTH);
119
120 /* TODO: cache results in a static table */
121
122 for(i = 0; i < n; i++) {
123 elems[i] = lp_build_const_int32(gallivm, i);
124 }
125 for (i = n; i < length; i++) {
126 elems[i] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
127 }
128
129 return LLVMConstVector(elems, length);
130 }
131
132 static LLVMValueRef
133 lp_build_const_unpackx2_shuffle(struct gallivm_state *gallivm, unsigned n)
134 {
135 LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
136 unsigned i, j;
137
138 assert(n <= LP_MAX_VECTOR_LENGTH);
139
140 /* TODO: cache results in a static table */
141
142 for(i = 0, j = 0; i < n; i += 2, ++j) {
143 elems[i + 0] = lp_build_const_int32(gallivm, 0 + j);
144 elems[i + 1] = lp_build_const_int32(gallivm, n + j);
145 elems[n + i + 0] = lp_build_const_int32(gallivm, 0 + n/2 + j);
146 elems[n + i + 1] = lp_build_const_int32(gallivm, n + n/2 + j);
147 }
148
149 return LLVMConstVector(elems, n * 2);
150 }
151
152 /*
153 * broadcast 1 element to all elements
154 */
155 static LLVMValueRef
156 lp_build_const_shuffle1(struct gallivm_state *gallivm,
157 unsigned index, unsigned n)
158 {
159 LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
160 unsigned i;
161
162 assert(n <= LP_MAX_VECTOR_LENGTH);
163
164 /* TODO: cache results in a static table */
165
166 for (i = 0; i < n; i++) {
167 elems[i] = lp_build_const_int32(gallivm, index);
168 }
169
170 return LLVMConstVector(elems, n);
171 }
172
173 /*
174 * move 1 element to pos 0, rest undef
175 */
176 static LLVMValueRef
177 lp_build_shuffle1undef(struct gallivm_state *gallivm,
178 LLVMValueRef a, unsigned index, unsigned n)
179 {
180 LLVMValueRef elems[LP_MAX_VECTOR_LENGTH], shuf;
181 unsigned i;
182
183 assert(n <= LP_MAX_VECTOR_LENGTH);
184
185 elems[0] = lp_build_const_int32(gallivm, index);
186
187 for (i = 1; i < n; i++) {
188 elems[i] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
189 }
190 shuf = LLVMConstVector(elems, n);
191
192 return LLVMBuildShuffleVector(gallivm->builder, a, a, shuf, "");
193 }
194
195 static boolean
196 format_dxt1_variant(enum pipe_format format)
197 {
198 return format == PIPE_FORMAT_DXT1_RGB ||
199 format == PIPE_FORMAT_DXT1_RGBA ||
200 format == PIPE_FORMAT_DXT1_SRGB ||
201 format == PIPE_FORMAT_DXT1_SRGBA;
202
203 }
204
205 /**
206 * Gather elements from scatter positions in memory into vectors.
207 * This is customised for fetching texels from s3tc textures.
208 * For SSE, typical value is length=4.
209 *
210 * @param length length of the offsets
211 * @param colors the stored colors of the blocks will be extracted into this.
212 * @param codewords the codewords of the blocks will be extracted into this.
213 * @param alpha_lo used for storing lower 32bit of alpha components for dxt3/5
214 * @param alpha_hi used for storing higher 32bit of alpha components for dxt3/5
215 * @param base_ptr base pointer, should be a i8 pointer type.
216 * @param offsets vector with offsets
217 */
218 static void
219 lp_build_gather_s3tc(struct gallivm_state *gallivm,
220 unsigned length,
221 const struct util_format_description *format_desc,
222 LLVMValueRef *colors,
223 LLVMValueRef *codewords,
224 LLVMValueRef *alpha_lo,
225 LLVMValueRef *alpha_hi,
226 LLVMValueRef base_ptr,
227 LLVMValueRef offsets)
228 {
229 LLVMBuilderRef builder = gallivm->builder;
230 unsigned block_bits = format_desc->block.bits;
231 unsigned i;
232 LLVMValueRef elems[8];
233 LLVMTypeRef type32 = LLVMInt32TypeInContext(gallivm->context);
234 LLVMTypeRef type64 = LLVMInt64TypeInContext(gallivm->context);
235 LLVMTypeRef type32dxt;
236 struct lp_type lp_type32dxt;
237
238 memset(&lp_type32dxt, 0, sizeof lp_type32dxt);
239 lp_type32dxt.width = 32;
240 lp_type32dxt.length = block_bits / 32;
241 type32dxt = lp_build_vec_type(gallivm, lp_type32dxt);
242
243 assert(block_bits == 64 || block_bits == 128);
244 assert(length == 1 || length == 4 || length == 8);
245
246 for (i = 0; i < length; ++i) {
247 elems[i] = lp_build_gather_elem(gallivm, length,
248 block_bits, block_bits, TRUE,
249 base_ptr, offsets, i, FALSE);
250 elems[i] = LLVMBuildBitCast(builder, elems[i], type32dxt, "");
251 }
252 if (length == 1) {
253 LLVMValueRef elem = elems[0];
254 if (block_bits == 128) {
255 *alpha_lo = LLVMBuildExtractElement(builder, elem,
256 lp_build_const_int32(gallivm, 0), "");
257 *alpha_hi = LLVMBuildExtractElement(builder, elem,
258 lp_build_const_int32(gallivm, 1), "");
259 *colors = LLVMBuildExtractElement(builder, elem,
260 lp_build_const_int32(gallivm, 2), "");
261 *codewords = LLVMBuildExtractElement(builder, elem,
262 lp_build_const_int32(gallivm, 3), "");
263 }
264 else {
265 *alpha_lo = LLVMGetUndef(type32);
266 *alpha_hi = LLVMGetUndef(type32);
267 *colors = LLVMBuildExtractElement(builder, elem,
268 lp_build_const_int32(gallivm, 0), "");
269 *codewords = LLVMBuildExtractElement(builder, elem,
270 lp_build_const_int32(gallivm, 1), "");
271 }
272 }
273 else {
274 LLVMValueRef tmp[4], cc01, cc23;
275 struct lp_type lp_type32, lp_type64;
276 memset(&lp_type32, 0, sizeof lp_type32);
277 lp_type32.width = 32;
278 lp_type32.length = length;
279 memset(&lp_type64, 0, sizeof lp_type64);
280 lp_type64.width = 64;
281 lp_type64.length = length/2;
282
283 if (block_bits == 128) {
284 if (length == 8) {
285 for (i = 0; i < 4; ++i) {
286 tmp[0] = elems[i];
287 tmp[1] = elems[i+4];
288 elems[i] = lp_build_concat(gallivm, tmp, lp_type32dxt, 2);
289 }
290 }
291 lp_build_transpose_aos(gallivm, lp_type32, elems, tmp);
292 *colors = tmp[2];
293 *codewords = tmp[3];
294 *alpha_lo = tmp[0];
295 *alpha_hi = tmp[1];
296 } else {
297 LLVMTypeRef type64_vec = LLVMVectorType(type64, length/2);
298 LLVMTypeRef type32_vec = LLVMVectorType(type32, length);
299
300 for (i = 0; i < length; ++i) {
301 /* no-op shuffle */
302 elems[i] = LLVMBuildShuffleVector(builder, elems[i],
303 LLVMGetUndef(type32dxt),
304 lp_build_const_extend_shuffle(gallivm, 2, 4), "");
305 }
306 if (length == 8) {
307 struct lp_type lp_type32_4 = {0};
308 lp_type32_4.width = 32;
309 lp_type32_4.length = 4;
310 for (i = 0; i < 4; ++i) {
311 tmp[0] = elems[i];
312 tmp[1] = elems[i+4];
313 elems[i] = lp_build_concat(gallivm, tmp, lp_type32_4, 2);
314 }
315 }
316 cc01 = lp_build_interleave2_half(gallivm, lp_type32, elems[0], elems[1], 0);
317 cc23 = lp_build_interleave2_half(gallivm, lp_type32, elems[2], elems[3], 0);
318 cc01 = LLVMBuildBitCast(builder, cc01, type64_vec, "");
319 cc23 = LLVMBuildBitCast(builder, cc23, type64_vec, "");
320 *colors = lp_build_interleave2_half(gallivm, lp_type64, cc01, cc23, 0);
321 *codewords = lp_build_interleave2_half(gallivm, lp_type64, cc01, cc23, 1);
322 *colors = LLVMBuildBitCast(builder, *colors, type32_vec, "");
323 *codewords = LLVMBuildBitCast(builder, *codewords, type32_vec, "");
324 }
325 }
326 }
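
/*
 * For reference (illustrative only, nothing below uses these structs):
 * the block layouts the gather above slices into dwords, assuming
 * little-endian storage just like the gather itself does.
 *
 *    struct dxt1_block {          // 64 bits
 *       uint16_t color0, color1;  //  -> *colors    (two rgb565 endpoints)
 *       uint32_t codewords;       //  -> *codewords (16 x 2-bit indices)
 *    };
 *    struct dxt35_block {         // 128 bits (dxt3/dxt5)
 *       uint32_t alpha_lo;        //  -> *alpha_lo (low half of the alpha block)
 *       uint32_t alpha_hi;        //  -> *alpha_hi (high half of the alpha block)
 *       struct dxt1_block color;  //  -> *colors / *codewords
 *    };
 */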
327
328 /** Convert from <n x i32> containing 2 x n rgb565 colors
329 * to 2 <n x i32> rgba8888 colors
330 * This is the most optimized version I can think of;
331 * it should be nearly as fast as decoding only one color
332 * NOTE: alpha channel will be set to 0
333 * @param colors is a <n x i32> vector containing the rgb565 colors
334 */
335 static void
336 color_expand2_565_to_8888(struct gallivm_state *gallivm,
337 unsigned n,
338 LLVMValueRef colors,
339 LLVMValueRef *color0,
340 LLVMValueRef *color1)
341 {
342 LLVMBuilderRef builder = gallivm->builder;
343 LLVMValueRef r, g, b, rblo, glo;
344 LLVMValueRef rgblomask, rb, rgb0, rgb1;
345 struct lp_type type, type16, type8;
346
347 assert(n > 1);
348
349 memset(&type, 0, sizeof type);
350 type.width = 32;
351 type.length = n;
352
353 memset(&type16, 0, sizeof type16);
354 type16.width = 16;
355 type16.length = 2 * n;
356
357 memset(&type8, 0, sizeof type8);
358 type8.width = 8;
359 type8.length = 4 * n;
360
361 rgblomask = lp_build_const_int_vec(gallivm, type16, 0x0707);
362 colors = LLVMBuildBitCast(builder, colors,
363 lp_build_vec_type(gallivm, type16), "");
364 /* move r into low 8 bits, b into high 8 bits, g into another reg (low bits)
365 * make sure low bits of r are zero - could use AND but requires constant */
366 r = LLVMBuildLShr(builder, colors, lp_build_const_int_vec(gallivm, type16, 11), "");
367 r = LLVMBuildShl(builder, r, lp_build_const_int_vec(gallivm, type16, 3), "");
368 b = LLVMBuildShl(builder, colors, lp_build_const_int_vec(gallivm, type16, 11), "");
369 rb = LLVMBuildOr(builder, r, b, "");
370 rblo = LLVMBuildLShr(builder, rb, lp_build_const_int_vec(gallivm, type16, 5), "");
371 /* don't have byte shift hence need mask */
372 rblo = LLVMBuildAnd(builder, rblo, rgblomask, "");
373 rb = LLVMBuildOr(builder, rb, rblo, "");
374
375 /* make sure low bits of g are zero */
376 g = LLVMBuildAnd(builder, colors, lp_build_const_int_vec(gallivm, type16, 0x07e0), "");
377 g = LLVMBuildLShr(builder, g, lp_build_const_int_vec(gallivm, type16, 3), "");
378 glo = LLVMBuildLShr(builder, g, lp_build_const_int_vec(gallivm, type16, 6), "");
379 g = LLVMBuildOr(builder, g, glo, "");
380
381 rb = LLVMBuildBitCast(builder, rb, lp_build_vec_type(gallivm, type8), "");
382 g = LLVMBuildBitCast(builder, g, lp_build_vec_type(gallivm, type8), "");
383 rgb0 = lp_build_interleave2_half(gallivm, type8, rb, g, 0);
384 rgb1 = lp_build_interleave2_half(gallivm, type8, rb, g, 1);
385
386 rgb0 = LLVMBuildBitCast(builder, rgb0, lp_build_vec_type(gallivm, type), "");
387 rgb1 = LLVMBuildBitCast(builder, rgb1, lp_build_vec_type(gallivm, type), "");
388
389 /* rgb0 is rgb00, rgb01, rgb10, rgb11
390 * instead of rgb00, rgb10, rgb20, rgb30 hence need reshuffle
391 * on x86 this _should_ just generate one shufps...
392 */
393 *color0 = lp_build_uninterleave2_half(gallivm, type, rgb0, rgb1, 0);
394 *color1 = lp_build_uninterleave2_half(gallivm, type, rgb0, rgb1, 1);
395 }
396
397
398 /** Convert from <n x i32> containing rgb565 colors
399 * (in first 16 bits) to <n x i32> rgba8888 colors
400 * bits 16-31 MBZ
401 * NOTE: alpha channel will be set to 0
402 * @param colors is a <n x i32> vector containing the rgb565 colors
403 */
404 static LLVMValueRef
405 color_expand_565_to_8888(struct gallivm_state *gallivm,
406 unsigned n,
407 LLVMValueRef colors)
408 {
409 LLVMBuilderRef builder = gallivm->builder;
410 LLVMValueRef rgba, r, g, b, rgblo, glo;
411 LLVMValueRef rbhimask, g6mask, rgblomask;
412 struct lp_type type;
413 memset(&type, 0, sizeof type);
414 type.width = 32;
415 type.length = n;
416
417 /* color expansion:
418 * first extract and shift colors into their final locations
419 * (high bits - low bits zero at this point)
420 * then replicate highest bits to the lowest bits
421 * note rb replication can be done in parallel but not g
422 * (different shift)
423 * r5mask = 0xf800, g6mask = 0x07e0, b5mask = 0x001f
424 * rhigh = 8, ghigh = 5, bhigh = 19
425 * rblow = 5, glow = 6
426 * rgblowmask = 0x00070307
427 * r = colors >> rhigh
428 * b = colors << bhigh
429 * g = (colors & g6mask) << ghigh
430 * rb = (r | b) & rbhimask
431 * rbtmp = rb >> rblow
432 * gtmp = g >> glow
433 * rbtmp = rbtmp | gtmp
434 * rbtmp = rbtmp & rgblowmask
435 * rgb = rb | g | rbtmp
436 */
437 g6mask = lp_build_const_int_vec(gallivm, type, 0x07e0);
438 rbhimask = lp_build_const_int_vec(gallivm, type, 0x00f800f8);
439 rgblomask = lp_build_const_int_vec(gallivm, type, 0x00070307);
440
441 r = LLVMBuildLShr(builder, colors, lp_build_const_int_vec(gallivm, type, 8), "");
442 b = LLVMBuildShl(builder, colors, lp_build_const_int_vec(gallivm, type, 19), "");
443 g = LLVMBuildAnd(builder, colors, g6mask, "");
444 g = LLVMBuildShl(builder, g, lp_build_const_int_vec(gallivm, type, 5), "");
445 rgba = LLVMBuildOr(builder, r, b, "");
446 rgba = LLVMBuildAnd(builder, rgba, rbhimask, "");
447 rgblo = LLVMBuildLShr(builder, rgba, lp_build_const_int_vec(gallivm, type, 5), "");
448 glo = LLVMBuildLShr(builder, g, lp_build_const_int_vec(gallivm, type, 6), "");
449 rgblo = LLVMBuildOr(builder, rgblo, glo, "");
450 rgblo = LLVMBuildAnd(builder, rgblo, rgblomask, "");
451 rgba = LLVMBuildOr(builder, rgba, g, "");
452 rgba = LLVMBuildOr(builder, rgba, rgblo, "");
453
454 return rgba;
455 }
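
/*
 * Scalar reference of the expansion above (a sketch for documentation
 * only, ref_expand_565 is not part of the build).  The JIT code computes
 * the same value without ever separating the three channels completely.
 *
 *    static uint32_t
 *    ref_expand_565(uint16_t c)
 *    {
 *       uint32_t r = (c >> 11) & 0x1f, g = (c >> 5) & 0x3f, b = c & 0x1f;
 *       r = (r << 3) | (r >> 2);   // replicate high bits into the low bits
 *       g = (g << 2) | (g >> 4);
 *       b = (b << 3) | (b >> 2);
 *       return r | (g << 8) | (b << 16);   // alpha byte left at 0
 *    }
 */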
456
457
458 /*
459 * Average two byte vectors. (Will always round up.)
460 */
461 static LLVMValueRef
462 lp_build_pavgb(struct lp_build_context *bld8,
463 LLVMValueRef v0,
464 LLVMValueRef v1)
465 {
466 struct gallivm_state *gallivm = bld8->gallivm;
467 LLVMBuilderRef builder = gallivm->builder;
468 assert(bld8->type.width == 8);
469 assert(bld8->type.length == 16 || bld8->type.length == 32);
470 if (LLVM_VERSION_MAJOR < 6) {
471 LLVMValueRef intrargs[2];
472 char *intr_name = bld8->type.length == 32 ? "llvm.x86.avx2.pavg.b" :
473 "llvm.x86.sse2.pavg.b";
474 intrargs[0] = v0;
475 intrargs[1] = v1;
476 return lp_build_intrinsic(builder, intr_name,
477 bld8->vec_type, intrargs, 2, 0);
478 } else {
479 /*
480 * Must match llvm's autoupgrade of pavg.b intrinsic to be useful.
481 * You better hope the backend code manages to detect the pattern, and
482 * the pattern doesn't change there...
483 */
484 struct lp_type type_ext = bld8->type;
485 LLVMTypeRef vec_type_ext;
486 LLVMValueRef res;
487 LLVMValueRef ext_one;
488 type_ext.width = 16;
489 vec_type_ext = lp_build_vec_type(gallivm, type_ext);
490 ext_one = lp_build_const_vec(gallivm, type_ext, 1);
491
492 v0 = LLVMBuildZExt(builder, v0, vec_type_ext, "");
493 v1 = LLVMBuildZExt(builder, v1, vec_type_ext, "");
494 res = LLVMBuildAdd(builder, v0, v1, "");
495 res = LLVMBuildAdd(builder, res, ext_one, "");
496 res = LLVMBuildLShr(builder, res, ext_one, "");
497 res = LLVMBuildTrunc(builder, res, bld8->vec_type, "");
498 return res;
499 }
500 }
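
/*
 * Scalar equivalent of both paths above (sketch only): a rounding-up
 * byte average, computed in a wider type so the +1 cannot overflow.
 *
 *    uint8_t avg = (uint8_t)(((uint16_t)v0 + (uint16_t)v1 + 1) >> 1);
 */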
501
502 /**
503 * Calculate 1/3(v1-v0) + v0
504 * and 2*1/3(v1-v0) + v0
505 */
506 static void
507 lp_build_lerp23(struct lp_build_context *bld,
508 LLVMValueRef v0,
509 LLVMValueRef v1,
510 LLVMValueRef *res0,
511 LLVMValueRef *res1)
512 {
513 struct gallivm_state *gallivm = bld->gallivm;
514 LLVMValueRef x, x_lo, x_hi, delta_lo, delta_hi;
515 LLVMValueRef mul_lo, mul_hi, v0_lo, v0_hi, v1_lo, v1_hi, tmp;
516 const struct lp_type type = bld->type;
517 LLVMBuilderRef builder = bld->gallivm->builder;
518 struct lp_type i16_type = lp_wider_type(type);
519 struct lp_build_context bld2;
520
521 assert(lp_check_value(type, v0));
522 assert(lp_check_value(type, v1));
523 assert(!type.floating && !type.fixed && !type.norm && type.width == 8);
524
525 lp_build_context_init(&bld2, gallivm, i16_type);
526 bld2.type.sign = TRUE;
527 x = lp_build_const_int_vec(gallivm, bld->type, 255*1/3);
528
529 /* FIXME: use native avx256 unpack/pack */
530 lp_build_unpack2(gallivm, type, i16_type, x, &x_lo, &x_hi);
531 lp_build_unpack2(gallivm, type, i16_type, v0, &v0_lo, &v0_hi);
532 lp_build_unpack2(gallivm, type, i16_type, v1, &v1_lo, &v1_hi);
533 delta_lo = lp_build_sub(&bld2, v1_lo, v0_lo);
534 delta_hi = lp_build_sub(&bld2, v1_hi, v0_hi);
535
536 mul_lo = LLVMBuildMul(builder, x_lo, delta_lo, "");
537 mul_hi = LLVMBuildMul(builder, x_hi, delta_hi, "");
538
539 x_lo = LLVMBuildLShr(builder, mul_lo, lp_build_const_int_vec(gallivm, i16_type, 8), "");
540 x_hi = LLVMBuildLShr(builder, mul_hi, lp_build_const_int_vec(gallivm, i16_type, 8), "");
541 /* lerp optimization: pack now, do add afterwards */
542 tmp = lp_build_pack2(gallivm, i16_type, type, x_lo, x_hi);
543 *res0 = lp_build_add(bld, tmp, v0);
544
545 x_lo = LLVMBuildLShr(builder, mul_lo, lp_build_const_int_vec(gallivm, i16_type, 7), "");
546 x_hi = LLVMBuildLShr(builder, mul_hi, lp_build_const_int_vec(gallivm, i16_type, 7), "");
547 /* unlike above, we still need a mask here (but the add still happens afterwards). */
548 x_lo = LLVMBuildAnd(builder, x_lo, lp_build_const_int_vec(gallivm, i16_type, 0xff), "");
549 x_hi = LLVMBuildAnd(builder, x_hi, lp_build_const_int_vec(gallivm, i16_type, 0xff), "");
550 tmp = lp_build_pack2(gallivm, i16_type, type, x_lo, x_hi);
551 *res1 = lp_build_add(bld, tmp, v0);
552 }
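
/*
 * Per-byte scalar equivalent of the fixed-point math above (illustrative
 * sketch, the ref_* names are hypothetical).  Note it mirrors the
 * approximation used here, not the exact (2*c0 + c1) / 3 of the spec.
 *
 *    static uint8_t ref_lerp13(uint8_t v0, uint8_t v1)   // ~1/3 towards v1
 *    {
 *       uint16_t p = (uint16_t)(85 * ((int)v1 - (int)v0));  // 85 ~= 255/3
 *       return (uint8_t)(v0 + (uint8_t)(p >> 8));
 *    }
 *    static uint8_t ref_lerp23(uint8_t v0, uint8_t v1)   // ~2/3 towards v1
 *    {
 *       uint16_t p = (uint16_t)(85 * ((int)v1 - (int)v0));
 *       return (uint8_t)(v0 + (uint8_t)(p >> 7));   // same product, shift by 7
 *    }
 */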
553
554 /**
555 * Convert from <n x i64> s3tc dxt1 to <4n x i8> RGBA AoS
556 * @param colors is a <n x i32> vector with n x 2x16bit colors
557 * @param codewords is a <n x i32> vector containing the codewords
558 * @param i is a <n x i32> vector with the x pixel coordinate (0 to 3)
559 * @param j is a <n x i32> vector with the y pixel coordinate (0 to 3)
560 */
561 static LLVMValueRef
562 s3tc_dxt1_full_to_rgba_aos(struct gallivm_state *gallivm,
563 unsigned n,
564 enum pipe_format format,
565 LLVMValueRef colors,
566 LLVMValueRef codewords,
567 LLVMValueRef i,
568 LLVMValueRef j)
569 {
570 LLVMBuilderRef builder = gallivm->builder;
571 LLVMValueRef color0, color1, color2, color3, color2_2, color3_2;
572 LLVMValueRef rgba, a, colors0, colors1, col0, col1, const2;
573 LLVMValueRef bit_pos, sel_mask, sel_lo, sel_hi, indices;
574 struct lp_type type, type8;
575 struct lp_build_context bld8, bld32;
576 boolean is_dxt1_variant = format_dxt1_variant(format);
577
578 memset(&type, 0, sizeof type);
579 type.width = 32;
580 type.length = n;
581
582 memset(&type8, 0, sizeof type8);
583 type8.width = 8;
584 type8.length = 4*n;
585
586 assert(lp_check_value(type, i));
587 assert(lp_check_value(type, j));
588
589 a = lp_build_const_int_vec(gallivm, type, 0xff000000);
590
591 lp_build_context_init(&bld32, gallivm, type);
592 lp_build_context_init(&bld8, gallivm, type8);
593
594 /*
595 * works as follows:
596 * - expand color0/color1 to rgba8888
597 * - calculate color2/3 (interpolation) according to color0 < color1 rules
598 * - calculate color2/3 according to color0 >= color1 rules
599 * - do selection of color2/3 according to comparison of color0/1
600 * - extract indices (vector shift).
601 * - use compare/select to select the correct color. Since we have 2bit
602 * indices (and 4 colors), needs at least three compare/selects.
603 */
604 /*
605 * expand the two colors
606 */
607 col0 = LLVMBuildAnd(builder, colors, lp_build_const_int_vec(gallivm, type, 0x0000ffff), "");
608 col1 = LLVMBuildLShr(builder, colors, lp_build_const_int_vec(gallivm, type, 16), "");
609 if (n > 1) {
610 color_expand2_565_to_8888(gallivm, n, colors, &color0, &color1);
611 }
612 else {
613 color0 = color_expand_565_to_8888(gallivm, n, col0);
614 color1 = color_expand_565_to_8888(gallivm, n, col1);
615 }
616
617 /*
618 * interpolate colors
619 * color2_1 is 2/3 color0 + 1/3 color1
620 * color3_1 is 1/3 color0 + 2/3 color1
621 * color2_2 is 1/2 color0 + 1/2 color1
622 * color3_2 is 0
623 */
624
625 colors0 = LLVMBuildBitCast(builder, color0, bld8.vec_type, "");
626 colors1 = LLVMBuildBitCast(builder, color1, bld8.vec_type, "");
627 /* can combine 2 lerps into one mostly - still looks expensive enough. */
628 lp_build_lerp23(&bld8, colors0, colors1, &color2, &color3);
629 color2 = LLVMBuildBitCast(builder, color2, bld32.vec_type, "");
630 color3 = LLVMBuildBitCast(builder, color3, bld32.vec_type, "");
631
632 /* dxt3/5 always use 4-color encoding */
633 if (is_dxt1_variant) {
634 /* fix up alpha */
635 if (format == PIPE_FORMAT_DXT1_RGBA ||
636 format == PIPE_FORMAT_DXT1_SRGBA) {
637 color0 = LLVMBuildOr(builder, color0, a, "");
638 color1 = LLVMBuildOr(builder, color1, a, "");
639 color3 = LLVMBuildOr(builder, color3, a, "");
640 }
641 /*
642 * XXX with sse2 and 16x8 vectors, should use pavgb even when n == 1.
643 * Much cheaper (but we don't care that much if n == 1).
644 */
645 if ((util_cpu_caps.has_sse2 && n == 4) ||
646 (util_cpu_caps.has_avx2 && n == 8)) {
647 color2_2 = lp_build_pavgb(&bld8, colors0, colors1);
648 color2_2 = LLVMBuildBitCast(builder, color2_2, bld32.vec_type, "");
649 }
650 else {
651 struct lp_type i16_type = lp_wider_type(type8);
652 struct lp_build_context bld2;
653 LLVMValueRef v0_lo, v0_hi, v1_lo, v1_hi, addlo, addhi;
654
655 lp_build_context_init(&bld2, gallivm, i16_type);
656 bld2.type.sign = TRUE;
657
658 /*
659 * This isn't as expensive as it looks (the unpack is the same as
660 * for lerp23), with correct rounding.
661 * (Note that while rounding is correct, this will always round down,
662 * whereas pavgb will always round up.)
663 */
664 /* FIXME: use native avx256 unpack/pack */
665 lp_build_unpack2(gallivm, type8, i16_type, colors0, &v0_lo, &v0_hi);
666 lp_build_unpack2(gallivm, type8, i16_type, colors1, &v1_lo, &v1_hi);
667
668 addlo = lp_build_add(&bld2, v0_lo, v1_lo);
669 addhi = lp_build_add(&bld2, v0_hi, v1_hi);
670 addlo = LLVMBuildLShr(builder, addlo,
671 lp_build_const_int_vec(gallivm, i16_type, 1), "");
672 addhi = LLVMBuildLShr(builder, addhi,
673 lp_build_const_int_vec(gallivm, i16_type, 1), "");
674 color2_2 = lp_build_pack2(gallivm, i16_type, type8, addlo, addhi);
675 color2_2 = LLVMBuildBitCast(builder, color2_2, bld32.vec_type, "");
676 }
677 color3_2 = lp_build_const_int_vec(gallivm, type, 0);
678
679 /* select between colors2/3 */
680 /* signed compare is faster, saves some xors */
681 type.sign = TRUE;
682 sel_mask = lp_build_compare(gallivm, type, PIPE_FUNC_GREATER, col0, col1);
683 color2 = lp_build_select(&bld32, sel_mask, color2, color2_2);
684 color3 = lp_build_select(&bld32, sel_mask, color3, color3_2);
685 type.sign = FALSE;
686
687 if (format == PIPE_FORMAT_DXT1_RGBA ||
688 format == PIPE_FORMAT_DXT1_SRGBA) {
689 color2 = LLVMBuildOr(builder, color2, a, "");
690 }
691 }
692
693 const2 = lp_build_const_int_vec(gallivm, type, 2);
694 /* extract 2-bit index values */
695 bit_pos = LLVMBuildShl(builder, j, const2, "");
696 bit_pos = LLVMBuildAdd(builder, bit_pos, i, "");
697 bit_pos = LLVMBuildAdd(builder, bit_pos, bit_pos, "");
698 /*
699 * NOTE: This innocent looking shift is very expensive with x86/ssex.
700 * Shifts with per-element shift count get roughly translated to
701 * extract (count), extract (value), shift, move (back to xmm), unpack
702 * per element!
703 * So about 20 instructions here for 4xi32.
704 * Newer llvm versions (3.7+) will not do extract/insert but use a
705 * couple of constant-count vector shifts plus shuffles. About the same
706 * number of instructions, unfortunately...
707 * Would get much worse with 8xi16 even...
708 * We could actually do better here:
709 * - subtract bit_pos from 128+30, shl 23, convert float to int...
710 * - now do mul with codewords followed by shr 30...
711 * But requires 32bit->32bit mul, sse41 only (well that's emulatable
712 * with 2 32bit->64bit muls...) and not exactly cheap
713 * AVX2, of course, fixes this nonsense.
714 */
715 indices = LLVMBuildLShr(builder, codewords, bit_pos, "");
716
717 /* finally select the colors */
718 sel_lo = LLVMBuildAnd(builder, indices, bld32.one, "");
719 sel_lo = lp_build_compare(gallivm, type, PIPE_FUNC_EQUAL, sel_lo, bld32.one);
720 color0 = lp_build_select(&bld32, sel_lo, color1, color0);
721 color2 = lp_build_select(&bld32, sel_lo, color3, color2);
722 sel_hi = LLVMBuildAnd(builder, indices, const2, "");
723 sel_hi = lp_build_compare(gallivm, type, PIPE_FUNC_EQUAL, sel_hi, const2);
724 rgba = lp_build_select(&bld32, sel_hi, color2, color0);
725
726 /* fix up alpha */
727 if (format == PIPE_FORMAT_DXT1_RGB ||
728 format == PIPE_FORMAT_DXT1_SRGB) {
729 rgba = LLVMBuildOr(builder, rgba, a, "");
730 }
731 return LLVMBuildBitCast(builder, rgba, bld8.vec_type, "");
732 }
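
/*
 * Scalar reference of what the vectorized path above computes for a
 * single texel (illustrative sketch; ref_expand_565/ref_lerp13/ref_lerp23
 * are the hypothetical helpers sketched earlier in this file):
 *
 *    c[0] = ref_expand_565(color0_565);
 *    c[1] = ref_expand_565(color1_565);
 *    if (color0_565 > color1_565) {               // 4-color mode
 *       c[2] = ref_lerp13 per channel;
 *       c[3] = ref_lerp23 per channel;
 *    } else {                                     // 3-color mode
 *       c[2] = per-channel average (rounding differs between the
 *              pavgb and the fallback path above);
 *       c[3] = 0;
 *    }
 *    idx   = (codewords >> (2 * (4 * j + i))) & 3;
 *    texel = c[idx];
 *
 * plus the alpha fixup: for the DXT1_RGBA variants index 3 in 3-color
 * mode stays transparent black, for DXT1_RGB alpha is forced to 0xff.
 */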
733
734
735 static LLVMValueRef
736 s3tc_dxt1_to_rgba_aos(struct gallivm_state *gallivm,
737 unsigned n,
738 enum pipe_format format,
739 LLVMValueRef colors,
740 LLVMValueRef codewords,
741 LLVMValueRef i,
742 LLVMValueRef j)
743 {
744 return s3tc_dxt1_full_to_rgba_aos(gallivm, n, format,
745 colors, codewords, i, j);
746 }
747
748
749 /**
750 * Convert from <n x i128> s3tc dxt3 to <4n x i8> RGBA AoS
751 * @param colors is a <n x i32> vector with n x 2x16bit colors
752 * @param codewords is a <n x i32> vector containing the codewords
753 * @param alphas is a <n x i64> vector containing the alpha values
754 * @param i is a <n x i32> vector with the x pixel coordinate (0 to 3)
755 * @param j is a <n x i32> vector with the y pixel coordinate (0 to 3)
756 */
757 static LLVMValueRef
758 s3tc_dxt3_to_rgba_aos(struct gallivm_state *gallivm,
759 unsigned n,
760 enum pipe_format format,
761 LLVMValueRef colors,
762 LLVMValueRef codewords,
763 LLVMValueRef alpha_low,
764 LLVMValueRef alpha_hi,
765 LLVMValueRef i,
766 LLVMValueRef j)
767 {
768 LLVMBuilderRef builder = gallivm->builder;
769 LLVMValueRef rgba, tmp, tmp2;
770 LLVMValueRef bit_pos, sel_mask;
771 struct lp_type type, type8;
772 struct lp_build_context bld;
773
774 memset(&type, 0, sizeof type);
775 type.width = 32;
776 type.length = n;
777
778 memset(&type8, 0, sizeof type8);
779 type8.width = 8;
780 type8.length = n*4;
781
782 assert(lp_check_value(type, i));
783 assert(lp_check_value(type, j));
784
785 lp_build_context_init(&bld, gallivm, type);
786
787 rgba = s3tc_dxt1_to_rgba_aos(gallivm, n, format,
788 colors, codewords, i, j);
789
790 rgba = LLVMBuildBitCast(builder, rgba, bld.vec_type, "");
791
792 /*
793 * Extract alpha values. Since we now need to select from
794 * which 32bit vector values are fetched, construct selection
795 * mask from highest bit of bit_pos, and use select, then shift
796 * according to the bit_pos (without the highest bit).
797 * Note this is pointless for n == 1 case. Could just
798 * directly use 64bit arithmetic if we'd extract 64bit
799 * alpha value instead of 2x32...
800 */
801 /* pos = 4*(4j+i) */
802 bit_pos = LLVMBuildShl(builder, j, lp_build_const_int_vec(gallivm, type, 2), "");
803 bit_pos = LLVMBuildAdd(builder, bit_pos, i, "");
804 bit_pos = LLVMBuildShl(builder, bit_pos,
805 lp_build_const_int_vec(gallivm, type, 2), "");
806 sel_mask = LLVMBuildLShr(builder, bit_pos,
807 lp_build_const_int_vec(gallivm, type, 5), "");
808 sel_mask = LLVMBuildSub(builder, sel_mask, bld.one, "");
809 tmp = lp_build_select(&bld, sel_mask, alpha_low, alpha_hi);
810 bit_pos = LLVMBuildAnd(builder, bit_pos,
811 lp_build_const_int_vec(gallivm, type, 0xffffffdf), "");
812 /* Warning: slow shift with per element count (without avx2) */
813 /*
814 * Could do pshufb here as well - just use appropriate 2 bits in bit_pos
815 * to select the right byte with pshufb. Then for the remaining one bit
816 * just do shift/select.
817 */
818 tmp = LLVMBuildLShr(builder, tmp, bit_pos, "");
819
820 /* combined expand from a4 to a8 and shift into position */
821 tmp = LLVMBuildShl(builder, tmp, lp_build_const_int_vec(gallivm, type, 28), "");
822 tmp2 = LLVMBuildLShr(builder, tmp, lp_build_const_int_vec(gallivm, type, 4), "");
823 tmp = LLVMBuildOr(builder, tmp, tmp2, "");
824
825 rgba = LLVMBuildOr(builder, tmp, rgba, "");
826
827 return LLVMBuildBitCast(builder, rgba, lp_build_vec_type(gallivm, type8), "");
828 }
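
/*
 * Scalar reference for the alpha path above (illustrative only): each
 * texel stores a 4-bit alpha in the 64-bit alpha word at bit 4*(4*j+i),
 * and it is expanded to 8 bits by nibble replication.
 *
 *    unsigned pos = 4 * (4 * j + i);
 *    uint32_t a4  = ((pos < 32 ? alpha_lo : alpha_hi) >> (pos & 31)) & 0xf;
 *    uint32_t a8  = (a4 << 4) | a4;
 *    texel       |= a8 << 24;
 */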
829
830 static LLVMValueRef
831 lp_build_lerpdxta(struct gallivm_state *gallivm,
832 LLVMValueRef alpha0,
833 LLVMValueRef alpha1,
834 LLVMValueRef code,
835 LLVMValueRef sel_mask,
836 unsigned n)
837 {
838 /*
839 * note we're doing lerp in 16bit since 32bit pmulld is only available in sse41
840 * (plus pmullw is actually faster...)
841 * we just pretend our 32bit values (which are really only 8bit) are 16bits.
842 * Note that this is obviously a disaster for the scalar case.
843 */
844 LLVMBuilderRef builder = gallivm->builder;
845 LLVMValueRef delta, ainterp;
846 LLVMValueRef weight5, weight7, weight;
847 struct lp_type type32, type16, type8;
848 struct lp_build_context bld16;
849
850 memset(&type32, 0, sizeof type32);
851 type32.width = 32;
852 type32.length = n;
853 memset(&type16, 0, sizeof type16);
854 type16.width = 16;
855 type16.length = 2*n;
856 type16.sign = TRUE;
857 memset(&type8, 0, sizeof type8);
858 type8.width = 8;
859 type8.length = 4*n;
860
861 lp_build_context_init(&bld16, gallivm, type16);
862 /* 255/7 alone would be a bit off - scale by 64 for accuracy at the expense of a shift later */
863 sel_mask = LLVMBuildBitCast(builder, sel_mask, bld16.vec_type, "");
864 weight5 = lp_build_const_int_vec(gallivm, type16, 255*64/5);
865 weight7 = lp_build_const_int_vec(gallivm, type16, 255*64/7);
866 weight = lp_build_select(&bld16, sel_mask, weight7, weight5);
867
868 alpha0 = LLVMBuildBitCast(builder, alpha0, bld16.vec_type, "");
869 alpha1 = LLVMBuildBitCast(builder, alpha1, bld16.vec_type, "");
870 code = LLVMBuildBitCast(builder, code, bld16.vec_type, "");
871 /* we'll get garbage in the elements which had code 0 (or larger than 5 or 7)
872 but we don't care */
873 code = LLVMBuildSub(builder, code, bld16.one, "");
874
875 weight = LLVMBuildMul(builder, weight, code, "");
876 weight = LLVMBuildLShr(builder, weight,
877 lp_build_const_int_vec(gallivm, type16, 6), "");
878
879 delta = LLVMBuildSub(builder, alpha1, alpha0, "");
880
881 ainterp = LLVMBuildMul(builder, delta, weight, "");
882 ainterp = LLVMBuildLShr(builder, ainterp,
883 lp_build_const_int_vec(gallivm, type16, 8), "");
884
885 ainterp = LLVMBuildBitCast(builder, ainterp, lp_build_vec_type(gallivm, type8), "");
886 alpha0 = LLVMBuildBitCast(builder, alpha0, lp_build_vec_type(gallivm, type8), "");
887 ainterp = LLVMBuildAdd(builder, alpha0, ainterp, "");
888 ainterp = LLVMBuildBitCast(builder, ainterp, lp_build_vec_type(gallivm, type32), "");
889
890 return ainterp;
891 }
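
/*
 * Scalar sketch of the weight math above (illustrative only): for a
 * codeword c in 2..7 (8-alpha mode) or 2..5 (6-alpha mode) this
 * approximates the spec interpolation a0 + (c-1)/7 * (a1-a0) resp.
 * a0 + (c-1)/5 * (a1-a0); other codes produce garbage that the caller
 * overrides with selects.
 *
 *    uint32_t w  = a0_gt_a1 ? 255 * 64 / 7 : 255 * 64 / 5;
 *    uint32_t wc = (w * (c - 1)) >> 6;              // ~ 255*(c-1)/7 or /5
 *    int32_t  ai = a0 + ((((int)a1 - (int)a0) * (int)wc) >> 8);
 */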
892
893 static LLVMValueRef
894 s3tc_dxt5_alpha_channel(struct gallivm_state *gallivm,
895 bool is_signed,
896 unsigned n,
897 LLVMValueRef alpha_hi, LLVMValueRef alpha_lo,
898 LLVMValueRef i, LLVMValueRef j)
899 {
900 LLVMBuilderRef builder = gallivm->builder;
901 struct lp_type type;
902 LLVMValueRef tmp, alpha0, alpha1, alphac, alphac0, bit_pos, shift;
903 LLVMValueRef sel_mask, tmp_mask, alpha, alpha64, code_s;
904 LLVMValueRef mask6, mask7, ainterp;
905 LLVMTypeRef i64t = LLVMInt64TypeInContext(gallivm->context);
906 LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
907 LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context);
908 struct lp_build_context bld32;
909
910 memset(&type, 0, sizeof type);
911 type.width = 32;
912 type.length = n;
913
914 lp_build_context_init(&bld32, gallivm, type);
915 /* this looks pretty complex for vectorization:
916 * extract a0/a1 values
917 * extract code
918 * select weights for interpolation depending on a0 > a1
919 * mul weights by code - 1
920 * lerp a0/a1/weights
921 * use selects for getting either a0, a1, interp a, interp a/0.0, interp a/1.0
922 */
923
924 alpha0 = LLVMBuildAnd(builder, alpha_lo,
925 lp_build_const_int_vec(gallivm, type, 0xff), "");
926 if (is_signed) {
927 alpha0 = LLVMBuildTrunc(builder, alpha0, i8t, "");
928 alpha0 = LLVMBuildSExt(builder, alpha0, i32t, "");
929 }
930
931 alpha1 = LLVMBuildLShr(builder, alpha_lo,
932 lp_build_const_int_vec(gallivm, type, 8), "");
933 alpha1 = LLVMBuildAnd(builder, alpha1,
934 lp_build_const_int_vec(gallivm, type, 0xff), "");
935 if (is_signed) {
936 alpha1 = LLVMBuildTrunc(builder, alpha1, i8t, "");
937 alpha1 = LLVMBuildSExt(builder, alpha1, i32t, "");
938 }
939
940 /* pos = 3*(4j+i) */
941 bit_pos = LLVMBuildShl(builder, j, lp_build_const_int_vec(gallivm, type, 2), "");
942 bit_pos = LLVMBuildAdd(builder, bit_pos, i, "");
943 tmp = LLVMBuildAdd(builder, bit_pos, bit_pos, "");
944 bit_pos = LLVMBuildAdd(builder, bit_pos, tmp, "");
945 /* get rid of first 2 bytes - saves shifts of alpha_lo/hi */
946 bit_pos = LLVMBuildAdd(builder, bit_pos,
947 lp_build_const_int_vec(gallivm, type, 16), "");
948
949 if (n == 1) {
950 struct lp_type type64;
951 memset(&type64, 0, sizeof type64);
952 type64.width = 64;
953 type64.length = 1;
954 /* This is pretty pointless; we could avoid it by directly extracting
955 64bit in the first place, but that would complicate things elsewhere */
956 alpha_lo = LLVMBuildZExt(builder, alpha_lo, i64t, "");
957 alpha_hi = LLVMBuildZExt(builder, alpha_hi, i64t, "");
958 alphac0 = LLVMBuildShl(builder, alpha_hi,
959 lp_build_const_int_vec(gallivm, type64, 32), "");
960 alphac0 = LLVMBuildOr(builder, alpha_lo, alphac0, "");
961
962 shift = LLVMBuildZExt(builder, bit_pos, i64t, "");
963 alphac0 = LLVMBuildLShr(builder, alphac0, shift, "");
964 alphac0 = LLVMBuildTrunc(builder, alphac0, i32t, "");
965 alphac = LLVMBuildAnd(builder, alphac0,
966 lp_build_const_int_vec(gallivm, type, 0x7), "");
967 }
968 else {
969 /*
970 * Using non-native vector length here (actually, with avx2 and
971 * n == 4 llvm will indeed expand to ymm regs...)
972 * At least newer llvm versions handle that ok.
973 * llvm 3.7+ will even handle the emulated 64bit shift with variable
974 * shift count without extraction (and it's actually easier to
975 * emulate than the 32bit one).
976 */
977 alpha64 = LLVMBuildShuffleVector(builder, alpha_lo, alpha_hi,
978 lp_build_const_unpackx2_shuffle(gallivm, n), "");
979
980 alpha64 = LLVMBuildBitCast(builder, alpha64, LLVMVectorType(i64t, n), "");
981 shift = LLVMBuildZExt(builder, bit_pos, LLVMVectorType(i64t, n), "");
982 alphac = LLVMBuildLShr(builder, alpha64, shift, "");
983 alphac = LLVMBuildTrunc(builder, alphac, bld32.vec_type, "");
984
985 alphac = LLVMBuildAnd(builder, alphac,
986 lp_build_const_int_vec(gallivm, type, 0x7), "");
987 }
988
989 /* signed compare is faster, saves some xors */
990 type.sign = TRUE;
991 /* alpha0 > alpha1 selection */
992 sel_mask = lp_build_compare(gallivm, type, PIPE_FUNC_GREATER,
993 alpha0, alpha1);
994 ainterp = lp_build_lerpdxta(gallivm, alpha0, alpha1, alphac, sel_mask, n);
995
996 /*
997 * if a0 > a1 then we select a0 for case 0, a1 for case 1, interp otherwise.
998 * else we select a0 for case 0, a1 for case 1,
999 * interp for case 2-5, 00 for 6 and 0xff(ffffff) for 7
1000 * a = (c == 0) ? a0 : a1
1001 * a = (c > 1) ? ainterp : a
1002 * Finally handle case 6/7 for !(a0 > a1)
1003 * a = (!(a0 > a1) && c == 6) ? 0 : a (andnot with mask)
1004 * a = (!(a0 > a1) && c == 7) ? 0xffffffff : a (or with mask)
1005 */
1006 tmp_mask = lp_build_compare(gallivm, type, PIPE_FUNC_EQUAL,
1007 alphac, bld32.zero);
1008 alpha = lp_build_select(&bld32, tmp_mask, alpha0, alpha1);
1009 tmp_mask = lp_build_compare(gallivm, type, PIPE_FUNC_GREATER,
1010 alphac, bld32.one);
1011 alpha = lp_build_select(&bld32, tmp_mask, ainterp, alpha);
1012
1013 code_s = LLVMBuildAnd(builder, alphac,
1014 LLVMBuildNot(builder, sel_mask, ""), "");
1015 mask6 = lp_build_compare(gallivm, type, PIPE_FUNC_EQUAL,
1016 code_s, lp_build_const_int_vec(gallivm, type, 6));
1017 mask7 = lp_build_compare(gallivm, type, PIPE_FUNC_EQUAL,
1018 code_s, lp_build_const_int_vec(gallivm, type, 7));
1019 if (is_signed) {
1020 alpha = lp_build_select(&bld32, mask6, lp_build_const_int_vec(gallivm, type, 127), alpha);
1021 alpha = lp_build_select(&bld32, mask7, lp_build_const_int_vec(gallivm, type, -127), alpha);
1022 } else {
1023 alpha = LLVMBuildAnd(builder, alpha, LLVMBuildNot(builder, mask6, ""), "");
1024 alpha = LLVMBuildOr(builder, alpha, mask7, "");
1025 }
1026 /* There can be garbage in upper bits, mask them off for rgtc formats */
1027 alpha = LLVMBuildAnd(builder, alpha, lp_build_const_int_vec(gallivm, type, 0xff), "");
1028
1029 return alpha;
1030 }
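
/*
 * The mask/select sequence above is the branchless form of the standard
 * DXT5/RGTC alpha code table; as a scalar sketch (unsigned case):
 *
 *    if      (c == 0)  a = a0;
 *    else if (c == 1)  a = a1;
 *    else if (a0 > a1) a = ((8 - c) * a0 + (c - 1) * a1) / 7;
 *    else if (c == 6)  a = 0;
 *    else if (c == 7)  a = 255;
 *    else              a = ((6 - c) * a0 + (c - 1) * a1) / 5;
 *
 * (The signed rgtc variants substitute the snorm extremes -127/127 for
 * the two fixed codes instead.)
 */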
1031
1032 /**
1033 * Convert from <n x i128> s3tc dxt5 to <4n x i8> RGBA AoS
1034 * @param colors is a <n x i32> vector with n x 2x16bit colors
1035 * @param codewords is a <n x i32> vector containing the codewords
1036 * @param alphas is a <n x i64> vector containing the alpha values
1037 * @param i is a <n x i32> vector with the x pixel coordinate (0 to 3)
1038 * @param j is a <n x i32> vector with the y pixel coordinate (0 to 3)
1039 */
1040 static LLVMValueRef
1041 s3tc_dxt5_full_to_rgba_aos(struct gallivm_state *gallivm,
1042 unsigned n,
1043 enum pipe_format format,
1044 LLVMValueRef colors,
1045 LLVMValueRef codewords,
1046 LLVMValueRef alpha_lo,
1047 LLVMValueRef alpha_hi,
1048 LLVMValueRef i,
1049 LLVMValueRef j)
1050 {
1051 LLVMBuilderRef builder = gallivm->builder;
1052 LLVMValueRef rgba, alpha;
1053 struct lp_type type, type8;
1054 struct lp_build_context bld32;
1055
1056 memset(&type, 0, sizeof type);
1057 type.width = 32;
1058 type.length = n;
1059
1060 memset(&type8, 0, sizeof type8);
1061 type8.width = 8;
1062 type8.length = n*4;
1063
1064 assert(lp_check_value(type, i));
1065 assert(lp_check_value(type, j));
1066
1067 lp_build_context_init(&bld32, gallivm, type);
1068
1072 rgba = s3tc_dxt1_to_rgba_aos(gallivm, n, format,
1073 colors, codewords, i, j);
1074
1075 rgba = LLVMBuildBitCast(builder, rgba, bld32.vec_type, "");
1076
1077 alpha = s3tc_dxt5_alpha_channel(gallivm, false, n, alpha_hi, alpha_lo, i, j);
1078 alpha = LLVMBuildShl(builder, alpha, lp_build_const_int_vec(gallivm, type, 24), "");
1079 rgba = LLVMBuildOr(builder, alpha, rgba, "");
1080
1081 return LLVMBuildBitCast(builder, rgba, lp_build_vec_type(gallivm, type8), "");
1082 }
1083
1084
1085 static void
1086 lp_build_gather_s3tc_simple_scalar(struct gallivm_state *gallivm,
1087 const struct util_format_description *format_desc,
1088 LLVMValueRef *dxt_block,
1089 LLVMValueRef ptr)
1090 {
1091 LLVMBuilderRef builder = gallivm->builder;
1092 unsigned block_bits = format_desc->block.bits;
1093 LLVMValueRef elem, shuf;
1094 LLVMTypeRef type32 = LLVMIntTypeInContext(gallivm->context, 32);
1095 LLVMTypeRef src_type = LLVMIntTypeInContext(gallivm->context, block_bits);
1096 LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0);
1097 LLVMTypeRef type32_4 = LLVMVectorType(type32, 4);
1098
1099 assert(block_bits == 64 || block_bits == 128);
1100
1101 ptr = LLVMBuildBitCast(builder, ptr, src_ptr_type, "");
1102 elem = LLVMBuildLoad(builder, ptr, "");
1103
1104 if (block_bits == 128) {
1105 /* just return block as is */
1106 *dxt_block = LLVMBuildBitCast(builder, elem, type32_4, "");
1107 }
1108 else {
1109 LLVMTypeRef type32_2 = LLVMVectorType(type32, 2);
1110 shuf = lp_build_const_extend_shuffle(gallivm, 2, 4);
1111 elem = LLVMBuildBitCast(builder, elem, type32_2, "");
1112 *dxt_block = LLVMBuildShuffleVector(builder, elem,
1113 LLVMGetUndef(type32_2), shuf, "");
1114 }
1115 }
1116
1117
1118 static void
1119 s3tc_store_cached_block(struct gallivm_state *gallivm,
1120 LLVMValueRef *col,
1121 LLVMValueRef tag_value,
1122 LLVMValueRef hash_index,
1123 LLVMValueRef cache)
1124 {
1125 LLVMBuilderRef builder = gallivm->builder;
1126 LLVMValueRef ptr, indices[3];
1127 LLVMTypeRef type_ptr4x32;
1128 unsigned count;
1129
1130 type_ptr4x32 = LLVMPointerType(LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4), 0);
1131 indices[0] = lp_build_const_int32(gallivm, 0);
1132 indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_TAGS);
1133 indices[2] = hash_index;
1134 ptr = LLVMBuildGEP(builder, cache, indices, ARRAY_SIZE(indices), "");
1135 LLVMBuildStore(builder, tag_value, ptr);
1136
1137 indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_DATA);
1138 hash_index = LLVMBuildMul(builder, hash_index,
1139 lp_build_const_int32(gallivm, 16), "");
1140 for (count = 0; count < 4; count++) {
1141 indices[2] = hash_index;
1142 ptr = LLVMBuildGEP(builder, cache, indices, ARRAY_SIZE(indices), "");
1143 ptr = LLVMBuildBitCast(builder, ptr, type_ptr4x32, "");
1144 LLVMBuildStore(builder, col[count], ptr);
1145 hash_index = LLVMBuildAdd(builder, hash_index,
1146 lp_build_const_int32(gallivm, 4), "");
1147 }
1148 }
1149
1150 static LLVMValueRef
1151 s3tc_lookup_cached_pixel(struct gallivm_state *gallivm,
1152 LLVMValueRef ptr,
1153 LLVMValueRef index)
1154 {
1155 LLVMBuilderRef builder = gallivm->builder;
1156 LLVMValueRef member_ptr, indices[3];
1157
1158 indices[0] = lp_build_const_int32(gallivm, 0);
1159 indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_DATA);
1160 indices[2] = index;
1161 member_ptr = LLVMBuildGEP(builder, ptr, indices, ARRAY_SIZE(indices), "");
1162 return LLVMBuildLoad(builder, member_ptr, "cache_data");
1163 }
1164
1165 static LLVMValueRef
1166 s3tc_lookup_tag_data(struct gallivm_state *gallivm,
1167 LLVMValueRef ptr,
1168 LLVMValueRef index)
1169 {
1170 LLVMBuilderRef builder = gallivm->builder;
1171 LLVMValueRef member_ptr, indices[3];
1172
1173 indices[0] = lp_build_const_int32(gallivm, 0);
1174 indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_TAGS);
1175 indices[2] = index;
1176 member_ptr = LLVMBuildGEP(builder, ptr, indices, ARRAY_SIZE(indices), "");
1177 return LLVMBuildLoad(builder, member_ptr, "tag_data");
1178 }
1179
1180 #if LP_BUILD_FORMAT_CACHE_DEBUG
1181 static void
1182 s3tc_update_cache_access(struct gallivm_state *gallivm,
1183 LLVMValueRef ptr,
1184 unsigned count,
1185 unsigned index)
1186 {
1187 LLVMBuilderRef builder = gallivm->builder;
1188 LLVMValueRef member_ptr, cache_access;
1189
1190 assert(index == LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_TOTAL ||
1191 index == LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS);
1192
1193 member_ptr = lp_build_struct_get_ptr(gallivm, ptr, index, "");
1194 cache_access = LLVMBuildLoad(builder, member_ptr, "cache_access");
1195 cache_access = LLVMBuildAdd(builder, cache_access,
1196 LLVMConstInt(LLVMInt64TypeInContext(gallivm->context),
1197 count, 0), "");
1198 LLVMBuildStore(builder, cache_access, member_ptr);
1199 }
1200 #endif
1201
1202 /**
1203 * Calculate 1/3(v1-v0) + v0 and 2*1/3(v1-v0) + v0.
1204 * The lerp is performed between the first 2 32bit colors
1205 * in the source vector, both results are returned packed in result vector.
1206 */
1207 static LLVMValueRef
1208 lp_build_lerp23_single(struct lp_build_context *bld,
1209 LLVMValueRef v01)
1210 {
1211 struct gallivm_state *gallivm = bld->gallivm;
1212 LLVMValueRef x, mul, delta, res, v0, v1, elems[8];
1213 const struct lp_type type = bld->type;
1214 LLVMBuilderRef builder = bld->gallivm->builder;
1215 struct lp_type i16_type = lp_wider_type(type);
1216 struct lp_type i32_type = lp_wider_type(i16_type);
1217 struct lp_build_context bld2;
1218
1219 assert(!type.floating && !type.fixed && !type.norm && type.width == 8);
1220
1221 lp_build_context_init(&bld2, gallivm, i16_type);
1222 bld2.type.sign = TRUE;
1223
1224 /* weights 256/3, 256*2/3, with correct rounding */
1225 elems[0] = elems[1] = elems[2] = elems[3] =
1226 lp_build_const_elem(gallivm, i16_type, 255*1/3);
1227 elems[4] = elems[5] = elems[6] = elems[7] =
1228 lp_build_const_elem(gallivm, i16_type, 171);
1229 x = LLVMConstVector(elems, 8);
1230
1231 /*
1232 * v01 has col0 in 32bit elem 0, col1 in elem 1.
1233 * Interleave/unpack will give us separate v0/v1 vectors.
1234 */
1235 v01 = lp_build_interleave2(gallivm, i32_type, v01, v01, 0);
1236 v01 = LLVMBuildBitCast(builder, v01, bld->vec_type, "");
1237
1238 lp_build_unpack2(gallivm, type, i16_type, v01, &v0, &v1);
1239 delta = lp_build_sub(&bld2, v1, v0);
1240
1241 mul = LLVMBuildMul(builder, x, delta, "");
1242
1243 mul = LLVMBuildLShr(builder, mul, lp_build_const_int_vec(gallivm, i16_type, 8), "");
1244 /* lerp optimization: pack now, do add afterwards */
1245 res = lp_build_pack2(gallivm, i16_type, type, mul, bld2.undef);
1246 /* only the lower 2 dwords of the result are valid - for those, v01 really holds v0 */
1247 return lp_build_add(bld, res, v01);
1248 }
1249
1250 /*
1251 * decode one dxt1 block.
1252 */
1253 static void
1254 s3tc_decode_block_dxt1(struct gallivm_state *gallivm,
1255 enum pipe_format format,
1256 LLVMValueRef dxt_block,
1257 LLVMValueRef *col)
1258 {
1259 LLVMBuilderRef builder = gallivm->builder;
1260 LLVMValueRef color01, color23, color01_16, color0123;
1261 LLVMValueRef rgba, tmp, a, sel_mask, indices, code, const2;
1262 struct lp_type type8, type32, type16, type64;
1263 struct lp_build_context bld8, bld32, bld16, bld64;
1264 unsigned i;
1265 boolean is_dxt1_variant = format_dxt1_variant(format);
1266
1267 memset(&type32, 0, sizeof type32);
1268 type32.width = 32;
1269 type32.length = 4;
1270 type32.sign = TRUE;
1271
1272 memset(&type8, 0, sizeof type8);
1273 type8.width = 8;
1274 type8.length = 16;
1275
1276 memset(&type16, 0, sizeof type16);
1277 type16.width = 16;
1278 type16.length = 8;
1279
1280 memset(&type64, 0, sizeof type64);
1281 type64.width = 64;
1282 type64.length = 2;
1283
1284 a = lp_build_const_int_vec(gallivm, type32, 0xff000000);
1285 const2 = lp_build_const_int_vec(gallivm, type32, 2);
1286
1287 lp_build_context_init(&bld32, gallivm, type32);
1288 lp_build_context_init(&bld16, gallivm, type16);
1289 lp_build_context_init(&bld8, gallivm, type8);
1290 lp_build_context_init(&bld64, gallivm, type64);
1291
1292 if (is_dxt1_variant) {
1293 color01 = lp_build_shuffle1undef(gallivm, dxt_block, 0, 4);
1294 code = lp_build_shuffle1undef(gallivm, dxt_block, 1, 4);
1295 } else {
1296 color01 = lp_build_shuffle1undef(gallivm, dxt_block, 2, 4);
1297 code = lp_build_shuffle1undef(gallivm, dxt_block, 3, 4);
1298 }
1299 code = LLVMBuildBitCast(builder, code, bld8.vec_type, "");
1300 /* expand bytes to dwords */
1301 code = lp_build_interleave2(gallivm, type8, code, code, 0);
1302 code = lp_build_interleave2(gallivm, type8, code, code, 0);
1303
1304
1305 /*
1306 * works as follows:
1307 * - expand color0/color1 to rgba8888
1308 * - calculate color2/3 (interpolation) according to color0 < color1 rules
1309 * - calculate color2/3 according to color0 >= color1 rules
1310 * - do selection of color2/3 according to comparison of color0/1
1311 * - extract indices.
1312 * - use compare/select to select the correct color. Since we have 2bit
1313 * indices (and 4 colors), needs at least three compare/selects.
1314 */
1315
1316 /*
1317 * expand the two colors
1318 */
1319 color01 = LLVMBuildBitCast(builder, color01, bld16.vec_type, "");
1320 color01 = lp_build_interleave2(gallivm, type16, color01,
1321 bld16.zero, 0);
1322 color01_16 = LLVMBuildBitCast(builder, color01, bld32.vec_type, "");
1323 color01 = color_expand_565_to_8888(gallivm, 4, color01_16);
1324
1325 /*
1326 * interpolate colors
1327 * color2_1 is 2/3 color0 + 1/3 color1
1328 * color3_1 is 1/3 color0 + 2/3 color1
1329 * color2_2 is 1/2 color0 + 1/2 color1
1330 * color3_2 is 0
1331 */
1332
1333 /* TODO: since this is now always scalar, should
1334 * probably just use control flow here instead of calculating
1335 * both cases and then selection
1336 */
1337 if (format == PIPE_FORMAT_DXT1_RGBA ||
1338 format == PIPE_FORMAT_DXT1_SRGBA) {
1339 color01 = LLVMBuildOr(builder, color01, a, "");
1340 }
1341 /* can combine 2 lerps into one mostly */
1342 color23 = lp_build_lerp23_single(&bld8, color01);
1343 color23 = LLVMBuildBitCast(builder, color23, bld32.vec_type, "");
1344
1345 /* dxt3/5 always use 4-color encoding */
1346 if (is_dxt1_variant) {
1347 LLVMValueRef color23_2, color2_2;
1348
1349 if (util_cpu_caps.has_sse2) {
1350 LLVMValueRef intrargs[2];
1351 intrargs[0] = LLVMBuildBitCast(builder, color01, bld8.vec_type, "");
1352 /* same interleave as for lerp23 - correct result in 2nd element */
1353 intrargs[1] = lp_build_interleave2(gallivm, type32, color01, color01, 0);
1354 intrargs[1] = LLVMBuildBitCast(builder, intrargs[1], bld8.vec_type, "");
1355 color2_2 = lp_build_pavgb(&bld8, intrargs[0], intrargs[1]);
1356 }
1357 else {
1358 LLVMValueRef v01, v0, v1, vhalf;
1359 /*
1360 * This isn't as expensive as it looks (the unpack is the same as
1361 * for lerp23, which is the reason why we do the pointless
1362 * interleave2 too), with correct rounding (the two lower elements
1363 * will be the same).
1364 */
1365 v01 = lp_build_interleave2(gallivm, type32, color01, color01, 0);
1366 v01 = LLVMBuildBitCast(builder, v01, bld8.vec_type, "");
1367 lp_build_unpack2(gallivm, type8, type16, v01, &v0, &v1);
1368 vhalf = lp_build_add(&bld16, v0, v1);
1369 vhalf = LLVMBuildLShr(builder, vhalf, bld16.one, "");
1370 color2_2 = lp_build_pack2(gallivm, type16, type8, vhalf, bld16.undef);
1371 }
1372 /* move color2 (elem 1 of color2_2) down to elem 0; the shift leaves zero (color3) in elem 1 */
1373 color23_2 = LLVMBuildBitCast(builder, color2_2, bld64.vec_type, "");
1374 color23_2 = LLVMBuildLShr(builder, color23_2,
1375 lp_build_const_int_vec(gallivm, type64, 32), "");
1376 color23_2 = LLVMBuildBitCast(builder, color23_2, bld32.vec_type, "");
1377
1378 tmp = LLVMBuildBitCast(builder, color01_16, bld64.vec_type, "");
1379 tmp = LLVMBuildLShr(builder, tmp,
1380 lp_build_const_int_vec(gallivm, type64, 32), "");
1381 tmp = LLVMBuildBitCast(builder, tmp, bld32.vec_type, "");
1382 sel_mask = lp_build_compare(gallivm, type32, PIPE_FUNC_GREATER,
1383 color01_16, tmp);
1384 sel_mask = lp_build_interleave2(gallivm, type32, sel_mask, sel_mask, 0);
1385 color23 = lp_build_select(&bld32, sel_mask, color23, color23_2);
1386 }
1387
1388 if (util_cpu_caps.has_ssse3) {
1389 /*
1390 * Use pshufb as mini-lut. (Only doable with intrinsics as the
1391 * final shuffles are non-constant. pshufb is awesome!)
1392 */
1393 LLVMValueRef shuf[16], low2mask;
1394 LLVMValueRef intrargs[2], lut_ind, lut_adj;
1395
1396 color01 = LLVMBuildBitCast(builder, color01, bld64.vec_type, "");
1397 color23 = LLVMBuildBitCast(builder, color23, bld64.vec_type, "");
1398 color0123 = lp_build_interleave2(gallivm, type64, color01, color23, 0);
1399 color0123 = LLVMBuildBitCast(builder, color0123, bld32.vec_type, "");
1400
1401 if (format == PIPE_FORMAT_DXT1_RGB ||
1402 format == PIPE_FORMAT_DXT1_SRGB) {
1403 color0123 = LLVMBuildOr(builder, color0123, a, "");
1404 }
1405
1406 /* shuffle as r0r1r2r3g0g1... */
1407 for (i = 0; i < 4; i++) {
1408 shuf[4*i] = lp_build_const_int32(gallivm, 0 + i);
1409 shuf[4*i+1] = lp_build_const_int32(gallivm, 4 + i);
1410 shuf[4*i+2] = lp_build_const_int32(gallivm, 8 + i);
1411 shuf[4*i+3] = lp_build_const_int32(gallivm, 12 + i);
1412 }
1413 color0123 = LLVMBuildBitCast(builder, color0123, bld8.vec_type, "");
1414 color0123 = LLVMBuildShuffleVector(builder, color0123, bld8.undef,
1415 LLVMConstVector(shuf, 16), "");
1416
1417 /* lowest 2 bits of each 8 bit value contain index into "LUT" */
1418 low2mask = lp_build_const_int_vec(gallivm, type8, 3);
1419 /* add 0/4/8/12 for r/g/b/a */
1420 lut_adj = lp_build_const_int_vec(gallivm, type32, 0x0c080400);
1421 lut_adj = LLVMBuildBitCast(builder, lut_adj, bld8.vec_type, "");
1422 intrargs[0] = color0123;
1423 for (i = 0; i < 4; i++) {
1424 lut_ind = LLVMBuildAnd(builder, code, low2mask, "");
1425 lut_ind = LLVMBuildOr(builder, lut_ind, lut_adj, "");
1426 intrargs[1] = lut_ind;
1427 col[i] = lp_build_intrinsic(builder, "llvm.x86.ssse3.pshuf.b.128",
1428 bld8.vec_type, intrargs, 2, 0);
1429 col[i] = LLVMBuildBitCast(builder, col[i], bld32.vec_type, "");
1430 code = LLVMBuildBitCast(builder, code, bld32.vec_type, "");
1431 code = LLVMBuildLShr(builder, code, const2, "");
1432 code = LLVMBuildBitCast(builder, code, bld8.vec_type, "");
1433 }
1434 }
1435 else {
1436 /* Thanks to vectorization can do 4 texels in parallel */
1437 LLVMValueRef color0, color1, color2, color3;
1438 if (format == PIPE_FORMAT_DXT1_RGB ||
1439 format == PIPE_FORMAT_DXT1_SRGB) {
1440 color01 = LLVMBuildOr(builder, color01, a, "");
1441 color23 = LLVMBuildOr(builder, color23, a, "");
1442 }
1443 color0 = LLVMBuildShuffleVector(builder, color01, bld32.undef,
1444 lp_build_const_shuffle1(gallivm, 0, 4), "");
1445 color1 = LLVMBuildShuffleVector(builder, color01, bld32.undef,
1446 lp_build_const_shuffle1(gallivm, 1, 4), "");
1447 color2 = LLVMBuildShuffleVector(builder, color23, bld32.undef,
1448 lp_build_const_shuffle1(gallivm, 0, 4), "");
1449 color3 = LLVMBuildShuffleVector(builder, color23, bld32.undef,
1450 lp_build_const_shuffle1(gallivm, 1, 4), "");
1451 code = LLVMBuildBitCast(builder, code, bld32.vec_type, "");
1452
1453 for (i = 0; i < 4; i++) {
1454 /* select the colors */
1455 LLVMValueRef selmasklo, rgba01, rgba23, bitlo;
1456 bitlo = bld32.one;
1457 indices = LLVMBuildAnd(builder, code, bitlo, "");
1458 selmasklo = lp_build_compare(gallivm, type32, PIPE_FUNC_EQUAL,
1459 indices, bitlo);
1460 rgba01 = lp_build_select(&bld32, selmasklo, color1, color0);
1461
1462 LLVMValueRef selmaskhi;
1463 indices = LLVMBuildAnd(builder, code, const2, "");
1464 selmaskhi = lp_build_compare(gallivm, type32, PIPE_FUNC_EQUAL,
1465 indices, const2);
1466 rgba23 = lp_build_select(&bld32, selmasklo, color3, color2);
1467 rgba = lp_build_select(&bld32, selmaskhi, rgba23, rgba01);
1468
1469 /*
1470 * Note that this will give "wrong" order.
1471 * col0 will be rgba0, rgba4, rgba8, rgba12, col1 rgba1, rgba5, ...
1472 * This would be easily fixable by using different shuffle, bitlo/hi
1473 * vectors above (and different shift), but seems slightly easier to
1474 * deal with for dxt3/dxt5 alpha too. So instead change lookup.
1475 */
1476 col[i] = rgba;
1477 code = LLVMBuildLShr(builder, code, const2, "");
1478 }
1479 }
1480 }
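
/*
 * How the pshufb "mini-LUT" above is indexed (illustrative): after the
 * r0r1r2r3g0g1... shuffle, byte 4*chan + n of color0123 holds channel
 * chan of color n.  So for a texel whose 2-bit index is idx, the byte
 * the table lookup fetches for channel chan is simply
 *
 *    lut_byte = (chan << 2) | idx;      // the 0x0c080400 adjustment
 *    value    = color0123_shuffled[lut_byte];
 *
 * which is what feeding (code & 3) | lut_adj into pshufb computes for
 * all 16 bytes at once.
 */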
1481
1482 /*
1483 * decode one dxt3 block.
1484 */
1485 static void
1486 s3tc_decode_block_dxt3(struct gallivm_state *gallivm,
1487 enum pipe_format format,
1488 LLVMValueRef dxt_block,
1489 LLVMValueRef *col)
1490 {
1491 LLVMBuilderRef builder = gallivm->builder;
1492 LLVMValueRef alpha, alphas0, alphas1, shift4_16, a[4], mask8hi;
1493 struct lp_type type32, type8, type16;
1494 unsigned i;
1495
1496 memset(&type32, 0, sizeof type32);
1497 type32.width = 32;
1498 type32.length = 4;
1499
1500 memset(&type8, 0, sizeof type8);
1501 type8.width = 8;
1502 type8.length = 16;
1503
1504 memset(&type16, 0, sizeof type16);
1505 type16.width = 16;
1506 type16.length = 8;
1507
1508 s3tc_decode_block_dxt1(gallivm, format, dxt_block, col);
1509
1510 shift4_16 = lp_build_const_int_vec(gallivm, type16, 4);
1511 mask8hi = lp_build_const_int_vec(gallivm, type32, 0xff000000);
1512
1513 alpha = LLVMBuildBitCast(builder, dxt_block,
1514 lp_build_vec_type(gallivm, type8), "");
1515 alpha = lp_build_interleave2(gallivm, type8, alpha, alpha, 0);
1516 alpha = LLVMBuildBitCast(builder, alpha,
1517 lp_build_vec_type(gallivm, type16), "");
1518 alpha = LLVMBuildAnd(builder, alpha,
1519 lp_build_const_int_vec(gallivm, type16, 0xf00f), "");
1520 alphas0 = LLVMBuildLShr(builder, alpha, shift4_16, "");
1521 alphas1 = LLVMBuildShl(builder, alpha, shift4_16, "");
1522 alpha = LLVMBuildOr(builder, alphas0, alpha, "");
1523 alpha = LLVMBuildOr(builder, alphas1, alpha, "");
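   /*
    * The 0xf00f mask left each 16 bit lane with two 4 bit alphas, a at bits
    * 0-3 and b at bits 12-15; or'ing in the >>4 and <<4 copies above expands
    * both to 8 bits (e.g. 0xb00a -> 0xbbaa), i.e. the usual
    * a8 = (a4 << 4) | a4 replication.
    */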
1524 alpha = LLVMBuildBitCast(builder, alpha,
1525 lp_build_vec_type(gallivm, type32), "");
1526 /*
1527 * alpha now contains elems 0,1,2,3,... (ubytes)
1528 * we need 0,4,8,12, 1,5,9,13 etc. in dwords to match color (which
1529 * is just as easy as "natural" order - 3 shift/and instead of 6 unpack).
1530 */
1531 a[0] = LLVMBuildShl(builder, alpha,
1532 lp_build_const_int_vec(gallivm, type32, 24), "");
1533 a[1] = LLVMBuildShl(builder, alpha,
1534 lp_build_const_int_vec(gallivm, type32, 16), "");
1535 a[1] = LLVMBuildAnd(builder, a[1], mask8hi, "");
1536 a[2] = LLVMBuildShl(builder, alpha,
1537 lp_build_const_int_vec(gallivm, type32, 8), "");
1538 a[2] = LLVMBuildAnd(builder, a[2], mask8hi, "");
1539 a[3] = LLVMBuildAnd(builder, alpha, mask8hi, "");
1540
1541 for (i = 0; i < 4; i++) {
1542 col[i] = LLVMBuildOr(builder, col[i], a[i], "");
1543 }
1544 }
1545
1546
1547 static LLVMValueRef
1548 lp_build_lerpdxta_block(struct gallivm_state *gallivm,
1549 LLVMValueRef alpha0,
1550 LLVMValueRef alpha1,
1551 LLVMValueRef code,
1552 LLVMValueRef sel_mask)
1553 {
1554 LLVMBuilderRef builder = gallivm->builder;
1555 LLVMValueRef delta, ainterp;
1556 LLVMValueRef weight5, weight7, weight;
1557 struct lp_type type16;
1558 struct lp_build_context bld;
1559
1560 memset(&type16, 0, sizeof type16);
1561 type16.width = 16;
1562 type16.length = 8;
1563 type16.sign = TRUE;
1564
1565 lp_build_context_init(&bld, gallivm, type16);
1566    /*
1567     * 256/7 is only 36.57 so we'd lose quite some precision. Since it would
1568     * actually be desirable to do this with even higher accuracy than 8 bit
1569     * (more or less required for rgtc, albeit that's not handled here right
1570     * now), shift the weights only after the multiplication by code.
1571     */
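   /*
    * E.g. weight7 = 256*64/7 = 2340, so for code 7 (which becomes 6 after the
    * sub below) the effective weight is (2340 * 6) >> 6 = 219, i.e. roughly
    * 256 * 6 / 7.
    */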
1572 weight5 = lp_build_const_int_vec(gallivm, type16, 256*64/5);
1573 weight7 = lp_build_const_int_vec(gallivm, type16, 256*64/7);
1574 weight = lp_build_select(&bld, sel_mask, weight7, weight5);
1575
1576    /*
1577     * we'll get garbage in the elements which had code 0 (or, depending on the weight
1578     * used, codes larger than 5 or 7), but we don't care - those get fixed up anyway.
1579     */
1580 code = LLVMBuildSub(builder, code, bld.one, "");
1581
1582 weight = LLVMBuildMul(builder, weight, code, "");
1583 weight = LLVMBuildLShr(builder, weight,
1584 lp_build_const_int_vec(gallivm, type16, 6), "");
1585
1586 delta = LLVMBuildSub(builder, alpha1, alpha0, "");
1587
1588 ainterp = LLVMBuildMul(builder, delta, weight, "");
1589 ainterp = LLVMBuildLShr(builder, ainterp,
1590 lp_build_const_int_vec(gallivm, type16, 8), "");
1591
1592 /* lerp is done later (with packed values) */
1593
1594 return ainterp;
1595 }
1596
1597
1598 /*
1599 * decode one dxt5 block.
1600 */
1601 static void
1602 s3tc_decode_block_dxt5(struct gallivm_state *gallivm,
1603 enum pipe_format format,
1604 LLVMValueRef dxt_block,
1605 LLVMValueRef *col)
1606 {
1607 LLVMBuilderRef builder = gallivm->builder;
1608 LLVMValueRef alpha, alpha0, alpha1, ares;
1609 LLVMValueRef ainterp, ainterp0, ainterp1, shuffle1, sel_mask, sel_mask2;
1610 LLVMValueRef a[4], acode, tmp0, tmp1;
1611 LLVMTypeRef i64t, i32t;
1612 struct lp_type type32, type64, type8, type16;
1613 struct lp_build_context bld16, bld8;
1614 unsigned i;
1615
1616 memset(&type32, 0, sizeof type32);
1617 type32.width = 32;
1618 type32.length = 4;
1619
1620 memset(&type64, 0, sizeof type64);
1621 type64.width = 64;
1622 type64.length = 2;
1623
1624 memset(&type8, 0, sizeof type8);
1625 type8.width = 8;
1626 type8.length = 16;
1627
1628 memset(&type16, 0, sizeof type16);
1629 type16.width = 16;
1630 type16.length = 8;
1631
1632 lp_build_context_init(&bld16, gallivm, type16);
1633 lp_build_context_init(&bld8, gallivm, type8);
1634
1635 i64t = lp_build_vec_type(gallivm, type64);
1636 i32t = lp_build_vec_type(gallivm, type32);
1637
1638 s3tc_decode_block_dxt1(gallivm, format, dxt_block, col);
1639
1640 /*
1641 * three possible strategies for vectorizing alpha:
1642 * 1) compute all 8 values then use scalar extraction
1643 * (i.e. have all 8 alpha values packed in one 64bit scalar
1644 * and do something like ax = vals >> (codex * 8) followed
1645 * by inserting these values back into color)
1646     * 2) compute all 8 values as in 1) but use pshufb as a mini-LUT for selection.
1647 * (without pshufb would need boatloads of cmp/selects trying to
1648 * keep things vectorized for essentially scalar selection).
1649     * 3) do something similar to the uncached case.
1650     *    Needs more calculations (16 values instead of 8, though that's only
1651     *    an issue for the lerp, which then has to be done twice; otherwise
1652     *    everything still fits into 128bit) but keeps things mostly vectorized.
1653     * Without pshufb we try 3) here, though not sure it's really faster...
1654     * With pshufb, we use 2) (cheaper and more accurate).
1655 */
1656
1657 /*
1658 * Ideally, we'd use 2 variable 16bit shifts here (byte shifts wouldn't
1659 * help since code crosses 8bit boundaries). But variable shifts are
1660 * AVX2 only, and even then only dword/quadword (intel _really_ hates
1661 * shifts!). Instead, emulate by 16bit muls.
1662 * Also, the required byte shuffles are essentially non-emulatable, so
1663 * require ssse3 (albeit other archs might do them fine).
1664 * This is not directly tied to ssse3 - just need sane byte shuffles.
1665 * But ordering is going to be different below so use same condition.
1666 */
1667
1668
1669 /* vectorize alpha */
1670 alpha = LLVMBuildBitCast(builder, dxt_block, i64t, "");
1671 alpha0 = LLVMBuildAnd(builder, alpha,
1672 lp_build_const_int_vec(gallivm, type64, 0xff), "");
1673 alpha0 = LLVMBuildBitCast(builder, alpha0, bld16.vec_type, "");
1674 alpha = LLVMBuildBitCast(builder, alpha, bld16.vec_type, "");
1675 alpha1 = LLVMBuildLShr(builder, alpha,
1676 lp_build_const_int_vec(gallivm, type16, 8), "");
1677 alpha = LLVMBuildBitCast(builder, alpha, i64t, "");
1678 shuffle1 = lp_build_const_shuffle1(gallivm, 0, 8);
1679 alpha0 = LLVMBuildShuffleVector(builder, alpha0, alpha0, shuffle1, "");
1680 alpha1 = LLVMBuildShuffleVector(builder, alpha1, alpha1, shuffle1, "");
1681
1682 type16.sign = TRUE;
1683 sel_mask = lp_build_compare(gallivm, type16, PIPE_FUNC_GREATER,
1684 alpha0, alpha1);
1685 type16.sign = FALSE;
1686 sel_mask = LLVMBuildBitCast(builder, sel_mask, bld8.vec_type, "");
1687
1688 if (!util_cpu_caps.has_ssse3) {
1689 LLVMValueRef acodeg, mask1, acode0, acode1;
1690
1691 /* extraction of the 3 bit values into something more useful is HARD */
1692 /* first steps are actually scalar */
1693 acode = LLVMBuildLShr(builder, alpha,
1694 lp_build_const_int_vec(gallivm, type64, 16), "");
1695 tmp0 = LLVMBuildAnd(builder, acode,
1696 lp_build_const_int_vec(gallivm, type64, 0xffffff), "");
1697 tmp1 = LLVMBuildLShr(builder, acode,
1698 lp_build_const_int_vec(gallivm, type64, 24), "");
1699 tmp0 = LLVMBuildBitCast(builder, tmp0, i32t, "");
1700 tmp1 = LLVMBuildBitCast(builder, tmp1, i32t, "");
1701 acode = lp_build_interleave2(gallivm, type32, tmp0, tmp1, 0);
1702 /* now have 2x24bit in 4x32bit, order 01234567, 89..., undef, undef */
1703 tmp0 = LLVMBuildAnd(builder, acode,
1704 lp_build_const_int_vec(gallivm, type32, 0xfff), "");
1705 tmp1 = LLVMBuildLShr(builder, acode,
1706 lp_build_const_int_vec(gallivm, type32, 12), "");
1707 acode = lp_build_interleave2(gallivm, type32, tmp0, tmp1, 0);
1708 /* now have 4x12bit in 4x32bit, order 0123, 4567, ,,, */
1709 tmp0 = LLVMBuildAnd(builder, acode,
1710 lp_build_const_int_vec(gallivm, type32, 0x3f), "");
1711 tmp1 = LLVMBuildLShr(builder, acode,
1712 lp_build_const_int_vec(gallivm, type32, 6), "");
1713          /* signed pack doesn't matter here, and unsigned pack would need sse41 */
1714 type32.sign = type16.sign = TRUE;
1715 acode = lp_build_pack2(gallivm, type32, type16, tmp0, tmp1);
1716 type32.sign = type16.sign = FALSE;
1717 /* now have 8x6bit in 8x16bit, 01, 45, 89, ..., 23, 67, ... */
1718 acode0 = LLVMBuildAnd(builder, acode,
1719 lp_build_const_int_vec(gallivm, type16, 0x7), "");
1720 acode1 = LLVMBuildLShr(builder, acode,
1721 lp_build_const_int_vec(gallivm, type16, 3), "");
1722 acode = lp_build_pack2(gallivm, type16, type8, acode0, acode1);
1723 /* acode0 contains elems 0,4,8,12,2,6,10,14, acode1 1,5,9,... */
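      /*
       * In short: each step above halves the field width and doubles the field
       * count, going 2x24bit -> 4x12bit -> 8x6bit -> 16x3bit, so the sixteen
       * 3 bit codes end up one per byte (in the shuffled order noted above).
       */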
1724
1725 acodeg = LLVMBuildAnd(builder, acode,
1726 LLVMBuildNot(builder, sel_mask, ""), "");
1727 mask1 = lp_build_compare(gallivm, type8, PIPE_FUNC_EQUAL,
1728 acode, bld8.one);
1729
1730 sel_mask = LLVMBuildBitCast(builder, sel_mask, bld16.vec_type, "");
1731 ainterp0 = lp_build_lerpdxta_block(gallivm, alpha0, alpha1, acode0, sel_mask);
1732 ainterp1 = lp_build_lerpdxta_block(gallivm, alpha0, alpha1, acode1, sel_mask);
1733 sel_mask = LLVMBuildBitCast(builder, sel_mask, bld8.vec_type, "");
1734 ainterp = lp_build_pack2(gallivm, type16, type8, ainterp0, ainterp1);
1735 alpha0 = lp_build_pack2(gallivm, type16, type8, alpha0, alpha0);
1736 alpha1 = lp_build_pack2(gallivm, type16, type8, alpha1, alpha1);
1737 ainterp = LLVMBuildAdd(builder, ainterp, alpha0, "");
1738 /* Fix up val01 */
1739 sel_mask2 = lp_build_compare(gallivm, type8, PIPE_FUNC_EQUAL,
1740 acode, bld8.zero);
1741 ainterp = lp_build_select(&bld8, sel_mask2, alpha0, ainterp);
1742 ainterp = lp_build_select(&bld8, mask1, alpha1, ainterp);
1743
1744 /* fix up val67 if a0 <= a1 */
1745 sel_mask2 = lp_build_compare(gallivm, type8, PIPE_FUNC_EQUAL,
1746 acodeg, lp_build_const_int_vec(gallivm, type8, 6));
1747 ares = LLVMBuildAnd(builder, ainterp, LLVMBuildNot(builder, sel_mask2, ""), "");
1748 sel_mask2 = lp_build_compare(gallivm, type8, PIPE_FUNC_EQUAL,
1749 acodeg, lp_build_const_int_vec(gallivm, type8, 7));
1750 ares = LLVMBuildOr(builder, ares, sel_mask2, "");
1751
1752 /* unpack in right order (0,4,8,12,1,5,..) */
1753 /* this gives us zero, a0, zero, a4, zero, a8, ... for tmp0 */
1754 tmp0 = lp_build_interleave2(gallivm, type8, bld8.zero, ares, 0);
1755 tmp1 = lp_build_interleave2(gallivm, type8, bld8.zero, ares, 1);
1756 tmp0 = LLVMBuildBitCast(builder, tmp0, bld16.vec_type, "");
1757 tmp1 = LLVMBuildBitCast(builder, tmp1, bld16.vec_type, "");
1758
1759 a[0] = lp_build_interleave2(gallivm, type16, bld16.zero, tmp0, 0);
1760 a[1] = lp_build_interleave2(gallivm, type16, bld16.zero, tmp1, 0);
1761 a[2] = lp_build_interleave2(gallivm, type16, bld16.zero, tmp0, 1);
1762 a[3] = lp_build_interleave2(gallivm, type16, bld16.zero, tmp1, 1);
1763 }
1764 else {
1765 LLVMValueRef elems[16], intrargs[2], shufa, mulclo, mulchi, mask8hi;
1766 LLVMTypeRef type16s = LLVMInt16TypeInContext(gallivm->context);
1767 LLVMTypeRef type8s = LLVMInt8TypeInContext(gallivm->context);
1768 unsigned i, j;
1769 /*
1770 * Ideally, we'd use 2 variable 16bit shifts here (byte shifts wouldn't
1771 * help since code crosses 8bit boundaries). But variable shifts are
1772 * AVX2 only, and even then only dword/quadword (intel _really_ hates
1773 * shifts!). Instead, emulate by 16bit muls.
1774 * Also, the required byte shuffles are essentially non-emulatable, so
1775 * require ssse3 (albeit other archs might do them fine, but the
1776 * complete path is ssse3 only for now).
1777 */
1778 for (i = 0, j = 0; i < 16; i += 8, j += 3) {
1779 elems[i+0] = elems[i+1] = elems[i+2] = lp_build_const_int32(gallivm, j+2);
1780 elems[i+3] = elems[i+4] = lp_build_const_int32(gallivm, j+3);
1781 elems[i+5] = elems[i+6] = elems[i+7] = lp_build_const_int32(gallivm, j+4);
1782 }
1783 shufa = LLVMConstVector(elems, 16);
1784 alpha = LLVMBuildBitCast(builder, alpha, bld8.vec_type, "");
1785 acode = LLVMBuildShuffleVector(builder, alpha, bld8.undef, shufa, "");
1786 acode = LLVMBuildBitCast(builder, acode, bld16.vec_type, "");
1787       /*
1788        * Put codes 0/2/4/6 into the high 3 bits of the 16 bit lanes (saves the
1789        * AND mask). Do the same for 1/3/5/7 (albeit a mask is still needed there -
1790        * ideally we'd place them into bits 4-7 so the shift could be saved, but that's impossible).
1791        */
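      /*
       * The trick: multiplying a 16 bit lane by 1 << (13 - k) shifts the 3 bit
       * code sitting at bit offset k up into bits 13-15 (anything above bit 15
       * falls out of the lane), so a plain >> 13 (even codes) resp. >> 5 plus
       * the 0x700 mask (odd codes, landing in the high byte) extracts it
       * without needing per-lane variable shifts.
       */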
1792 for (i = 0; i < 8; i += 4) {
1793 elems[i+0] = LLVMConstInt(type16s, 1 << (13-0), 0);
1794 elems[i+1] = LLVMConstInt(type16s, 1 << (13-6), 0);
1795 elems[i+2] = LLVMConstInt(type16s, 1 << (13-4), 0);
1796 elems[i+3] = LLVMConstInt(type16s, 1 << (13-2), 0);
1797 }
1798 mulclo = LLVMConstVector(elems, 8);
1799 for (i = 0; i < 8; i += 4) {
1800 elems[i+0] = LLVMConstInt(type16s, 1 << (13-3), 0);
1801 elems[i+1] = LLVMConstInt(type16s, 1 << (13-9), 0);
1802 elems[i+2] = LLVMConstInt(type16s, 1 << (13-7), 0);
1803 elems[i+3] = LLVMConstInt(type16s, 1 << (13-5), 0);
1804 }
1805 mulchi = LLVMConstVector(elems, 8);
1806
1807 tmp0 = LLVMBuildMul(builder, acode, mulclo, "");
1808 tmp1 = LLVMBuildMul(builder, acode, mulchi, "");
1809 tmp0 = LLVMBuildLShr(builder, tmp0,
1810 lp_build_const_int_vec(gallivm, type16, 13), "");
1811 tmp1 = LLVMBuildLShr(builder, tmp1,
1812 lp_build_const_int_vec(gallivm, type16, 5), "");
1813 tmp1 = LLVMBuildAnd(builder, tmp1,
1814 lp_build_const_int_vec(gallivm, type16, 0x700), "");
1815 acode = LLVMBuildOr(builder, tmp0, tmp1, "");
1816 acode = LLVMBuildBitCast(builder, acode, bld8.vec_type, "");
1817
1818 /*
1819 * Note that ordering is different here to non-ssse3 path:
1820 * 0/1/2/3/4/5...
1821 */
1822
1823 LLVMValueRef weight0, weight1, weight, delta;
1824 LLVMValueRef constff_elem7, const0_elem6;
1825 /* weights, correctly rounded (round(256*x/7)) */
1826 elems[0] = LLVMConstInt(type16s, 256, 0);
1827 elems[1] = LLVMConstInt(type16s, 0, 0);
1828 elems[2] = LLVMConstInt(type16s, 219, 0);
1829 elems[3] = LLVMConstInt(type16s, 183, 0);
1830 elems[4] = LLVMConstInt(type16s, 146, 0);
1831 elems[5] = LLVMConstInt(type16s, 110, 0);
1832 elems[6] = LLVMConstInt(type16s, 73, 0);
1833 elems[7] = LLVMConstInt(type16s, 37, 0);
1834 weight0 = LLVMConstVector(elems, 8);
1835
1836 elems[0] = LLVMConstInt(type16s, 256, 0);
1837 elems[1] = LLVMConstInt(type16s, 0, 0);
1838 elems[2] = LLVMConstInt(type16s, 205, 0);
1839 elems[3] = LLVMConstInt(type16s, 154, 0);
1840 elems[4] = LLVMConstInt(type16s, 102, 0);
1841 elems[5] = LLVMConstInt(type16s, 51, 0);
1842 elems[6] = LLVMConstInt(type16s, 0, 0);
1843 elems[7] = LLVMConstInt(type16s, 0, 0);
1844 weight1 = LLVMConstVector(elems, 8);
1845
1846 weight0 = LLVMBuildBitCast(builder, weight0, bld8.vec_type, "");
1847 weight1 = LLVMBuildBitCast(builder, weight1, bld8.vec_type, "");
1848 weight = lp_build_select(&bld8, sel_mask, weight0, weight1);
1849 weight = LLVMBuildBitCast(builder, weight, bld16.vec_type, "");
1850
1851 for (i = 0; i < 16; i++) {
1852 elems[i] = LLVMConstNull(type8s);
1853 }
1854 elems[7] = LLVMConstInt(type8s, 255, 0);
1855 constff_elem7 = LLVMConstVector(elems, 16);
1856
1857 for (i = 0; i < 16; i++) {
1858 elems[i] = LLVMConstInt(type8s, 255, 0);
1859 }
1860 elems[6] = LLVMConstInt(type8s, 0, 0);
1861 const0_elem6 = LLVMConstVector(elems, 16);
1862
1863 /* standard simple lerp - but the version we need isn't available */
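      /*
       * With these tables the lerp below computes
       *    a = alpha1 + (((alpha0 - alpha1) * w) >> 8)
       * so table entry 0 (w == 256) yields alpha0 and entry 1 (w == 0) yields
       * alpha1, matching codes 0 and 1.
       */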
1864 delta = LLVMBuildSub(builder, alpha0, alpha1, "");
1865 ainterp = LLVMBuildMul(builder, delta, weight, "");
1866 ainterp = LLVMBuildLShr(builder, ainterp,
1867 lp_build_const_int_vec(gallivm, type16, 8), "");
1868 ainterp = LLVMBuildBitCast(builder, ainterp, bld8.vec_type, "");
1869 alpha1 = LLVMBuildBitCast(builder, alpha1, bld8.vec_type, "");
1870 ainterp = LLVMBuildAdd(builder, ainterp, alpha1, "");
1871 ainterp = LLVMBuildBitCast(builder, ainterp, bld16.vec_type, "");
1872 ainterp = lp_build_pack2(gallivm, type16, type8, ainterp, bld16.undef);
1873
1874 /* fixing 0/0xff case is slightly more complex */
1875 constff_elem7 = LLVMBuildAnd(builder, constff_elem7,
1876 LLVMBuildNot(builder, sel_mask, ""), "");
1877 const0_elem6 = LLVMBuildOr(builder, const0_elem6, sel_mask, "");
1878 ainterp = LLVMBuildOr(builder, ainterp, constff_elem7, "");
1879 ainterp = LLVMBuildAnd(builder, ainterp, const0_elem6, "");
1880
1881 /* now pick all 16 elements at once! */
1882 intrargs[0] = ainterp;
1883 intrargs[1] = acode;
1884 ares = lp_build_intrinsic(builder, "llvm.x86.ssse3.pshuf.b.128",
1885 bld8.vec_type, intrargs, 2, 0);
1886
1887 ares = LLVMBuildBitCast(builder, ares, i32t, "");
1888 mask8hi = lp_build_const_int_vec(gallivm, type32, 0xff000000);
1889 a[0] = LLVMBuildShl(builder, ares,
1890 lp_build_const_int_vec(gallivm, type32, 24), "");
1891 a[1] = LLVMBuildShl(builder, ares,
1892 lp_build_const_int_vec(gallivm, type32, 16), "");
1893 a[1] = LLVMBuildAnd(builder, a[1], mask8hi, "");
1894 a[2] = LLVMBuildShl(builder, ares,
1895 lp_build_const_int_vec(gallivm, type32, 8), "");
1896 a[2] = LLVMBuildAnd(builder, a[2], mask8hi, "");
1897 a[3] = LLVMBuildAnd(builder, ares, mask8hi, "");
1898 }
1899
1900 for (i = 0; i < 4; i++) {
1901 a[i] = LLVMBuildBitCast(builder, a[i], i32t, "");
1902 col[i] = LLVMBuildOr(builder, col[i], a[i], "");
1903 }
1904 }
1905
1906
1907 static void
1908 generate_update_cache_one_block(struct gallivm_state *gallivm,
1909 LLVMValueRef function,
1910 const struct util_format_description *format_desc)
1911 {
1912 LLVMBasicBlockRef block;
1913 LLVMBuilderRef old_builder;
1914 LLVMValueRef ptr_addr;
1915 LLVMValueRef hash_index;
1916 LLVMValueRef cache;
1917 LLVMValueRef dxt_block, tag_value;
1918 LLVMValueRef col[LP_MAX_VECTOR_LENGTH];
1919
1920 ptr_addr = LLVMGetParam(function, 0);
1921 hash_index = LLVMGetParam(function, 1);
1922 cache = LLVMGetParam(function, 2);
1923
1924 lp_build_name(ptr_addr, "ptr_addr" );
1925 lp_build_name(hash_index, "hash_index");
1926 lp_build_name(cache, "cache_addr");
1927
1928 /*
1929 * Function body
1930 */
1931
1932 old_builder = gallivm->builder;
1933 block = LLVMAppendBasicBlockInContext(gallivm->context, function, "entry");
1934 gallivm->builder = LLVMCreateBuilderInContext(gallivm->context);
1935 LLVMPositionBuilderAtEnd(gallivm->builder, block);
1936
1937 lp_build_gather_s3tc_simple_scalar(gallivm, format_desc, &dxt_block,
1938 ptr_addr);
1939
1940 switch (format_desc->format) {
1941 case PIPE_FORMAT_DXT1_RGB:
1942 case PIPE_FORMAT_DXT1_RGBA:
1943 case PIPE_FORMAT_DXT1_SRGB:
1944 case PIPE_FORMAT_DXT1_SRGBA:
1945 s3tc_decode_block_dxt1(gallivm, format_desc->format, dxt_block, col);
1946 break;
1947 case PIPE_FORMAT_DXT3_RGBA:
1948 case PIPE_FORMAT_DXT3_SRGBA:
1949 s3tc_decode_block_dxt3(gallivm, format_desc->format, dxt_block, col);
1950 break;
1951 case PIPE_FORMAT_DXT5_RGBA:
1952 case PIPE_FORMAT_DXT5_SRGBA:
1953 s3tc_decode_block_dxt5(gallivm, format_desc->format, dxt_block, col);
1954 break;
1955 default:
1956 assert(0);
1957 s3tc_decode_block_dxt1(gallivm, format_desc->format, dxt_block, col);
1958 break;
1959 }
1960
1961 tag_value = LLVMBuildPtrToInt(gallivm->builder, ptr_addr,
1962 LLVMInt64TypeInContext(gallivm->context), "");
1963 s3tc_store_cached_block(gallivm, col, tag_value, hash_index, cache);
1964
1965 LLVMBuildRetVoid(gallivm->builder);
1966
1967 LLVMDisposeBuilder(gallivm->builder);
1968 gallivm->builder = old_builder;
1969
1970 gallivm_verify_function(gallivm, function);
1971 }
1972
1973
1974 static void
1975 update_cached_block(struct gallivm_state *gallivm,
1976 const struct util_format_description *format_desc,
1977 LLVMValueRef ptr_addr,
1978 LLVMValueRef hash_index,
1979 LLVMValueRef cache)
1980
1981 {
1982 LLVMBuilderRef builder = gallivm->builder;
1983 LLVMModuleRef module = gallivm->module;
1984 char name[256];
1985 LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context);
1986 LLVMTypeRef pi8t = LLVMPointerType(i8t, 0);
1987 LLVMValueRef function, inst;
1988 LLVMBasicBlockRef bb;
1989 LLVMValueRef args[3];
1990
1991 snprintf(name, sizeof name, "%s_update_cache_one_block",
1992 format_desc->short_name);
1993 function = LLVMGetNamedFunction(module, name);
1994
1995 if (!function) {
1996 LLVMTypeRef ret_type;
1997 LLVMTypeRef arg_types[3];
1998 LLVMTypeRef function_type;
1999 unsigned arg;
2000
2001 /*
2002 * Generate the function prototype.
2003 */
2004
2005 ret_type = LLVMVoidTypeInContext(gallivm->context);
2006 arg_types[0] = pi8t;
2007 arg_types[1] = LLVMInt32TypeInContext(gallivm->context);
2008 arg_types[2] = LLVMTypeOf(cache); // XXX: put right type here
2009 function_type = LLVMFunctionType(ret_type, arg_types, ARRAY_SIZE(arg_types), 0);
2010 function = LLVMAddFunction(module, name, function_type);
2011
2012 for (arg = 0; arg < ARRAY_SIZE(arg_types); ++arg)
2013 if (LLVMGetTypeKind(arg_types[arg]) == LLVMPointerTypeKind)
2014 lp_add_function_attr(function, arg + 1, LP_FUNC_ATTR_NOALIAS);
2015
2016 LLVMSetFunctionCallConv(function, LLVMFastCallConv);
2017 LLVMSetVisibility(function, LLVMHiddenVisibility);
2018 generate_update_cache_one_block(gallivm, function, format_desc);
2019 }
2020
2021 args[0] = ptr_addr;
2022 args[1] = hash_index;
2023 args[2] = cache;
2024
2025 LLVMBuildCall(builder, function, args, ARRAY_SIZE(args), "");
2026 bb = LLVMGetInsertBlock(builder);
2027 inst = LLVMGetLastInstruction(bb);
2028 LLVMSetInstructionCallConv(inst, LLVMFastCallConv);
2029 }
2030
2031 /*
2032 * cached lookup
2033 */
2034 static LLVMValueRef
2035 compressed_fetch_cached(struct gallivm_state *gallivm,
2036 const struct util_format_description *format_desc,
2037 unsigned n,
2038 LLVMValueRef base_ptr,
2039 LLVMValueRef offset,
2040 LLVMValueRef i,
2041 LLVMValueRef j,
2042 LLVMValueRef cache)
2043
2044 {
2045 LLVMBuilderRef builder = gallivm->builder;
2046 unsigned count, low_bit, log2size;
2047 LLVMValueRef color, offset_stored, addr, ptr_addrtrunc, tmp;
2048 LLVMValueRef ij_index, hash_index, hash_mask, block_index;
2049 LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context);
2050 LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
2051 LLVMTypeRef i64t = LLVMInt64TypeInContext(gallivm->context);
2052 struct lp_type type;
2053 struct lp_build_context bld32;
2054 memset(&type, 0, sizeof type);
2055 type.width = 32;
2056 type.length = n;
2057
2058 lp_build_context_init(&bld32, gallivm, type);
2059
2060    /*
2061     * compute hash - we use a direct mapped cache, the hash function could
2062     * be better but it needs to be simple.
2063     * per-element:
2064     *    compare offset with the offset stored at the tag (hash)
2065     *    if not equal, extract the block, store it and update the tag
2066     *    extract the color from the cache
2067     *    assemble the colors
2068     */
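   /*
    * Per element this boils down to (scalar sketch, with base_lo32 being the
    * low 32 bits of base_ptr):
    *    h  = (base_lo32 + offset) >> low_bit;
    *    h ^= h >> (2 * log2size);
    *    h ^= h >> log2size;
    *    hash_index  = h & (LP_BUILD_FORMAT_CACHE_SIZE - 1);
    *    block_index = (hash_index << 4) + (i << 2) + j;
    */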
2069
2070 low_bit = util_logbase2(format_desc->block.bits / 8);
2071 log2size = util_logbase2(LP_BUILD_FORMAT_CACHE_SIZE);
2072 addr = LLVMBuildPtrToInt(builder, base_ptr, i64t, "");
2073 ptr_addrtrunc = LLVMBuildPtrToInt(builder, base_ptr, i32t, "");
2074 ptr_addrtrunc = lp_build_broadcast_scalar(&bld32, ptr_addrtrunc);
2075    /* For the hash function, first shift out the unused lowest bits. Then just
2076       do some xor with address bits - only the lower 32bits are used */
2077 ptr_addrtrunc = LLVMBuildAdd(builder, offset, ptr_addrtrunc, "");
2078 ptr_addrtrunc = LLVMBuildLShr(builder, ptr_addrtrunc,
2079 lp_build_const_int_vec(gallivm, type, low_bit), "");
2080 /* This only really makes sense for size 64,128,256 */
2081 hash_index = ptr_addrtrunc;
2082 ptr_addrtrunc = LLVMBuildLShr(builder, ptr_addrtrunc,
2083 lp_build_const_int_vec(gallivm, type, 2*log2size), "");
2084 hash_index = LLVMBuildXor(builder, ptr_addrtrunc, hash_index, "");
2085 tmp = LLVMBuildLShr(builder, hash_index,
2086 lp_build_const_int_vec(gallivm, type, log2size), "");
2087 hash_index = LLVMBuildXor(builder, hash_index, tmp, "");
2088
2089 hash_mask = lp_build_const_int_vec(gallivm, type, LP_BUILD_FORMAT_CACHE_SIZE - 1);
2090 hash_index = LLVMBuildAnd(builder, hash_index, hash_mask, "");
2091 ij_index = LLVMBuildShl(builder, i, lp_build_const_int_vec(gallivm, type, 2), "");
2092 ij_index = LLVMBuildAdd(builder, ij_index, j, "");
2093 block_index = LLVMBuildShl(builder, hash_index,
2094 lp_build_const_int_vec(gallivm, type, 4), "");
2095 block_index = LLVMBuildAdd(builder, ij_index, block_index, "");
2096
2097 if (n > 1) {
2098 color = bld32.undef;
2099 for (count = 0; count < n; count++) {
2100 LLVMValueRef index, cond, colorx;
2101 LLVMValueRef block_indexx, hash_indexx, addrx, offsetx, ptr_addrx;
2102 struct lp_build_if_state if_ctx;
2103
2104 index = lp_build_const_int32(gallivm, count);
2105 offsetx = LLVMBuildExtractElement(builder, offset, index, "");
2106 addrx = LLVMBuildZExt(builder, offsetx, i64t, "");
2107 addrx = LLVMBuildAdd(builder, addrx, addr, "");
2108 block_indexx = LLVMBuildExtractElement(builder, block_index, index, "");
2109 hash_indexx = LLVMBuildLShr(builder, block_indexx,
2110 lp_build_const_int32(gallivm, 4), "");
2111 offset_stored = s3tc_lookup_tag_data(gallivm, cache, hash_indexx);
2112 cond = LLVMBuildICmp(builder, LLVMIntNE, offset_stored, addrx, "");
2113
2114 lp_build_if(&if_ctx, gallivm, cond);
2115 {
2116 ptr_addrx = LLVMBuildIntToPtr(builder, addrx,
2117 LLVMPointerType(i8t, 0), "");
2118 update_cached_block(gallivm, format_desc, ptr_addrx, hash_indexx, cache);
2119 #if LP_BUILD_FORMAT_CACHE_DEBUG
2120 s3tc_update_cache_access(gallivm, cache, 1,
2121 LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS);
2122 #endif
2123 }
2124 lp_build_endif(&if_ctx);
2125
2126 colorx = s3tc_lookup_cached_pixel(gallivm, cache, block_indexx);
2127
2128 color = LLVMBuildInsertElement(builder, color, colorx,
2129 lp_build_const_int32(gallivm, count), "");
2130 }
2131 }
2132 else {
2133 LLVMValueRef cond;
2134 struct lp_build_if_state if_ctx;
2135
2136 tmp = LLVMBuildZExt(builder, offset, i64t, "");
2137 addr = LLVMBuildAdd(builder, tmp, addr, "");
2138 offset_stored = s3tc_lookup_tag_data(gallivm, cache, hash_index);
2139 cond = LLVMBuildICmp(builder, LLVMIntNE, offset_stored, addr, "");
2140
2141 lp_build_if(&if_ctx, gallivm, cond);
2142 {
2143 tmp = LLVMBuildIntToPtr(builder, addr, LLVMPointerType(i8t, 0), "");
2144 update_cached_block(gallivm, format_desc, tmp, hash_index, cache);
2145 #if LP_BUILD_FORMAT_CACHE_DEBUG
2146 s3tc_update_cache_access(gallivm, cache, 1,
2147 LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS);
2148 #endif
2149 }
2150 lp_build_endif(&if_ctx);
2151
2152 color = s3tc_lookup_cached_pixel(gallivm, cache, block_index);
2153 }
2154 #if LP_BUILD_FORMAT_CACHE_DEBUG
2155 s3tc_update_cache_access(gallivm, cache, n,
2156 LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_TOTAL);
2157 #endif
2158 return LLVMBuildBitCast(builder, color, LLVMVectorType(i8t, n * 4), "");
2159 }
2160
2161
2162 static LLVMValueRef
2163 s3tc_dxt5_to_rgba_aos(struct gallivm_state *gallivm,
2164 unsigned n,
2165 enum pipe_format format,
2166 LLVMValueRef colors,
2167 LLVMValueRef codewords,
2168 LLVMValueRef alpha_lo,
2169 LLVMValueRef alpha_hi,
2170 LLVMValueRef i,
2171 LLVMValueRef j)
2172 {
2173 return s3tc_dxt5_full_to_rgba_aos(gallivm, n, format, colors,
2174 codewords, alpha_lo, alpha_hi, i, j);
2175 }
2176
2177
2178 /**
2179 * @param n number of pixels processed (usually n=4, but it should also work with n=1
2180 * and multiples of 4)
2181 * @param base_ptr base pointer (32bit or 64bit pointer depending on the architecture)
2182 * @param offset <n x i32> vector with the relative offsets of the S3TC blocks
2183 * @param i is a <n x i32> vector with the x subpixel coordinate (0..3)
2184 * @param j is a <n x i32> vector with the y subpixel coordinate (0..3)
2185 * @return a <4*n x i8> vector with the pixel RGBA values in AoS
2186 */
2187 LLVMValueRef
2188 lp_build_fetch_s3tc_rgba_aos(struct gallivm_state *gallivm,
2189 const struct util_format_description *format_desc,
2190 unsigned n,
2191 LLVMValueRef base_ptr,
2192 LLVMValueRef offset,
2193 LLVMValueRef i,
2194 LLVMValueRef j,
2195 LLVMValueRef cache)
2196 {
2197 LLVMValueRef rgba;
2198 LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context);
2199 LLVMBuilderRef builder = gallivm->builder;
2200
2201 assert(format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC);
2202 assert(format_desc->block.width == 4);
2203 assert(format_desc->block.height == 4);
2204
2205 assert((n == 1) || (n % 4 == 0));
2206
2207 /* debug_printf("format = %d\n", format_desc->format);*/
2208 if (cache) {
2209 rgba = compressed_fetch_cached(gallivm, format_desc, n,
2210 base_ptr, offset, i, j, cache);
2211 return rgba;
2212 }
2213
2214 /*
2215 * Could use n > 8 here with avx2, but doesn't seem faster.
2216 */
2217 if (n > 4) {
2218 unsigned count;
2219 LLVMTypeRef i8_vectype = LLVMVectorType(i8t, 4 * n);
2220 LLVMTypeRef i128_type = LLVMIntTypeInContext(gallivm->context, 128);
2221 LLVMTypeRef i128_vectype = LLVMVectorType(i128_type, n / 4);
2222 LLVMTypeRef i324_vectype = LLVMVectorType(LLVMInt32TypeInContext(
2223 gallivm->context), 4);
2224 LLVMValueRef offset4, i4, j4, rgba4[LP_MAX_VECTOR_LENGTH/16];
2225 struct lp_type lp_324_vectype = lp_type_uint_vec(32, 128);
2226
2227 assert(n / 4 <= ARRAY_SIZE(rgba4));
2228
2229 rgba = LLVMGetUndef(i128_vectype);
2230
2231 for (count = 0; count < n / 4; count++) {
2232 LLVMValueRef colors, codewords, alpha_lo = NULL, alpha_hi = NULL;
2233
2234 i4 = lp_build_extract_range(gallivm, i, count * 4, 4);
2235 j4 = lp_build_extract_range(gallivm, j, count * 4, 4);
2236 offset4 = lp_build_extract_range(gallivm, offset, count * 4, 4);
2237
2238 lp_build_gather_s3tc(gallivm, 4, format_desc, &colors, &codewords,
2239 &alpha_lo, &alpha_hi, base_ptr, offset4);
2240
2241 switch (format_desc->format) {
2242 case PIPE_FORMAT_DXT1_RGB:
2243 case PIPE_FORMAT_DXT1_RGBA:
2244 case PIPE_FORMAT_DXT1_SRGB:
2245 case PIPE_FORMAT_DXT1_SRGBA:
2246 rgba4[count] = s3tc_dxt1_to_rgba_aos(gallivm, 4, format_desc->format,
2247 colors, codewords, i4, j4);
2248 break;
2249 case PIPE_FORMAT_DXT3_RGBA:
2250 case PIPE_FORMAT_DXT3_SRGBA:
2251 rgba4[count] = s3tc_dxt3_to_rgba_aos(gallivm, 4, format_desc->format, colors,
2252 codewords, alpha_lo, alpha_hi, i4, j4);
2253 break;
2254 case PIPE_FORMAT_DXT5_RGBA:
2255 case PIPE_FORMAT_DXT5_SRGBA:
2256 rgba4[count] = s3tc_dxt5_to_rgba_aos(gallivm, 4, format_desc->format, colors,
2257 codewords, alpha_lo, alpha_hi, i4, j4);
2258 break;
2259 default:
2260 assert(0);
2261 rgba4[count] = LLVMGetUndef(LLVMVectorType(i8t, 4));
2262 break;
2263 }
2264 /* shuffles typically give best results with dword elements...*/
2265 rgba4[count] = LLVMBuildBitCast(builder, rgba4[count], i324_vectype, "");
2266 }
2267 rgba = lp_build_concat(gallivm, rgba4, lp_324_vectype, n / 4);
2268 rgba = LLVMBuildBitCast(builder, rgba, i8_vectype, "");
2269 }
2270 else {
2271 LLVMValueRef colors, codewords, alpha_lo = NULL, alpha_hi = NULL;
2272
2273 lp_build_gather_s3tc(gallivm, n, format_desc, &colors, &codewords,
2274 &alpha_lo, &alpha_hi, base_ptr, offset);
2275
2276 switch (format_desc->format) {
2277 case PIPE_FORMAT_DXT1_RGB:
2278 case PIPE_FORMAT_DXT1_RGBA:
2279 case PIPE_FORMAT_DXT1_SRGB:
2280 case PIPE_FORMAT_DXT1_SRGBA:
2281 rgba = s3tc_dxt1_to_rgba_aos(gallivm, n, format_desc->format,
2282 colors, codewords, i, j);
2283 break;
2284 case PIPE_FORMAT_DXT3_RGBA:
2285 case PIPE_FORMAT_DXT3_SRGBA:
2286 rgba = s3tc_dxt3_to_rgba_aos(gallivm, n, format_desc->format, colors,
2287 codewords, alpha_lo, alpha_hi, i, j);
2288 break;
2289 case PIPE_FORMAT_DXT5_RGBA:
2290 case PIPE_FORMAT_DXT5_SRGBA:
2291 rgba = s3tc_dxt5_to_rgba_aos(gallivm, n, format_desc->format, colors,
2292 codewords, alpha_lo, alpha_hi, i, j);
2293 break;
2294 default:
2295 assert(0);
2296 rgba = LLVMGetUndef(LLVMVectorType(i8t, 4*n));
2297 break;
2298 }
2299 }
2300
2301 /* always return just decompressed values - srgb conversion is done later */
2302
2303 return rgba;
2304 }
2305
2306 /**
2307  * Gather elements from scatter positions in memory into vectors.
2308  * This is customised for fetching texels from rgtc textures.
2309  * For SSE, typical value is length=4.
2310  *
2311  * @param length length of the offsets
2312  * @param red_lo used for storing lower 32bit of the red channel block data
2313  * @param red_hi used for storing higher 32bit of the red channel block data
2314  * @param green_lo used for storing lower 32bit of the green channel block data (rgtc2/latc2 only)
2315  * @param green_hi used for storing higher 32bit of the green channel block data (rgtc2/latc2 only)
2316  * @param base_ptr base pointer, should be a i8 pointer type.
2317  * @param offsets vector with offsets
2318  */
2319 static void
2320 lp_build_gather_rgtc(struct gallivm_state *gallivm,
2321 unsigned length,
2322 const struct util_format_description *format_desc,
2323 LLVMValueRef *red_lo, LLVMValueRef *red_hi,
2324 LLVMValueRef *green_lo, LLVMValueRef *green_hi,
2325 LLVMValueRef base_ptr,
2326 LLVMValueRef offsets)
2327 {
2328 LLVMBuilderRef builder = gallivm->builder;
2329 unsigned block_bits = format_desc->block.bits;
2330 unsigned i;
2331 LLVMValueRef elems[8];
2332 LLVMTypeRef type32 = LLVMInt32TypeInContext(gallivm->context);
2333 LLVMTypeRef type64 = LLVMInt64TypeInContext(gallivm->context);
2334 LLVMTypeRef type32dxt;
2335 struct lp_type lp_type32dxt;
2336
2337 memset(&lp_type32dxt, 0, sizeof lp_type32dxt);
2338 lp_type32dxt.width = 32;
2339 lp_type32dxt.length = block_bits / 32;
2340 type32dxt = lp_build_vec_type(gallivm, lp_type32dxt);
2341
2342 assert(block_bits == 64 || block_bits == 128);
2343 assert(length == 1 || length == 4 || length == 8);
2344
2345 for (i = 0; i < length; ++i) {
2346 elems[i] = lp_build_gather_elem(gallivm, length,
2347 block_bits, block_bits, TRUE,
2348 base_ptr, offsets, i, FALSE);
2349 elems[i] = LLVMBuildBitCast(builder, elems[i], type32dxt, "");
2350 }
2351 if (length == 1) {
2352 LLVMValueRef elem = elems[0];
2353
2354 *red_lo = LLVMBuildExtractElement(builder, elem,
2355 lp_build_const_int32(gallivm, 0), "");
2356 *red_hi = LLVMBuildExtractElement(builder, elem,
2357 lp_build_const_int32(gallivm, 1), "");
2358
2359 if (block_bits == 128) {
2360 *green_lo = LLVMBuildExtractElement(builder, elem,
2361 lp_build_const_int32(gallivm, 2), "");
2362 *green_hi = LLVMBuildExtractElement(builder, elem,
2363 lp_build_const_int32(gallivm, 3), "");
2364 }
2365 } else {
2366 LLVMValueRef tmp[4];
2367 struct lp_type lp_type32, lp_type64;
2368 memset(&lp_type32, 0, sizeof lp_type32);
2369 lp_type32.width = 32;
2370 lp_type32.length = length;
2371 lp_type32.sign = lp_type32dxt.sign;
2372 memset(&lp_type64, 0, sizeof lp_type64);
2373 lp_type64.width = 64;
2374 lp_type64.length = length/2;
2375 if (block_bits == 128) {
2376 if (length == 8) {
2377 for (i = 0; i < 4; ++i) {
2378 tmp[0] = elems[i];
2379 tmp[1] = elems[i+4];
2380 elems[i] = lp_build_concat(gallivm, tmp, lp_type32dxt, 2);
2381 }
2382 }
2383 lp_build_transpose_aos(gallivm, lp_type32, elems, tmp);
2384 *green_lo = tmp[2];
2385 *green_hi = tmp[3];
2386 *red_lo = tmp[0];
2387 *red_hi = tmp[1];
2388 } else {
2389 LLVMValueRef red01, red23;
2390 LLVMTypeRef type64_vec = LLVMVectorType(type64, length/2);
2391 LLVMTypeRef type32_vec = LLVMVectorType(type32, length);
2392
2393 for (i = 0; i < length; ++i) {
2394 /* no-op shuffle */
2395 elems[i] = LLVMBuildShuffleVector(builder, elems[i],
2396 LLVMGetUndef(type32dxt),
2397 lp_build_const_extend_shuffle(gallivm, 2, 4), "");
2398 }
2399 if (length == 8) {
2400 struct lp_type lp_type32_4 = {0};
2401 lp_type32_4.width = 32;
2402 lp_type32_4.length = 4;
2403 for (i = 0; i < 4; ++i) {
2404 tmp[0] = elems[i];
2405 tmp[1] = elems[i+4];
2406 elems[i] = lp_build_concat(gallivm, tmp, lp_type32_4, 2);
2407 }
2408 }
2409 red01 = lp_build_interleave2_half(gallivm, lp_type32, elems[0], elems[1], 0);
2410 red23 = lp_build_interleave2_half(gallivm, lp_type32, elems[2], elems[3], 0);
2411 red01 = LLVMBuildBitCast(builder, red01, type64_vec, "");
2412 red23 = LLVMBuildBitCast(builder, red23, type64_vec, "");
2413 *red_lo = lp_build_interleave2_half(gallivm, lp_type64, red01, red23, 0);
2414 *red_hi = lp_build_interleave2_half(gallivm, lp_type64, red01, red23, 1);
2415 *red_lo = LLVMBuildBitCast(builder, *red_lo, type32_vec, "");
2416 *red_hi = LLVMBuildBitCast(builder, *red_hi, type32_vec, "");
2417 *green_lo = NULL;
2418 *green_hi = NULL;
2419 }
2420 }
2421 }
2422
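/*
 * decode rgtc1 (bc4): red is decoded like dxt5 alpha, green/blue are 0,
 * alpha is set to max (0x7f for the snorm case).
 */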
2423 static LLVMValueRef
2424 rgtc1_to_rgba_aos(struct gallivm_state *gallivm,
2425 unsigned n,
2426 enum pipe_format format,
2427 LLVMValueRef red_lo,
2428 LLVMValueRef red_hi,
2429 LLVMValueRef i,
2430 LLVMValueRef j)
2431 {
2432 LLVMBuilderRef builder = gallivm->builder;
2433 bool is_signed = (format == PIPE_FORMAT_RGTC1_SNORM);
2434 LLVMValueRef red = s3tc_dxt5_alpha_channel(gallivm, is_signed, n, red_hi, red_lo, i, j);
2435 LLVMValueRef rgba;
2436 struct lp_type type, type8;
2437 memset(&type, 0, sizeof type);
2438 type.width = 32;
2439 type.length = n;
2440 memset(&type8, 0, sizeof type8);
2441 type8.width = 8;
2442 type8.length = n*4;
2443 rgba = lp_build_const_int_vec(gallivm, type, is_signed ? (0x7f << 24) : (0xff << 24));
2444 rgba = LLVMBuildOr(builder, rgba, red, "");
2445 return LLVMBuildBitCast(builder, rgba, lp_build_vec_type(gallivm, type8), "");
2446 }
2447
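/*
 * decode rgtc2 (bc5): red and green are each decoded like dxt5 alpha,
 * blue is 0, alpha is set to max (0x7f for the snorm case).
 */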
2448 static LLVMValueRef
2449 rgtc2_to_rgba_aos(struct gallivm_state *gallivm,
2450 unsigned n,
2451 enum pipe_format format,
2452 LLVMValueRef red_lo,
2453 LLVMValueRef red_hi,
2454 LLVMValueRef green_lo,
2455 LLVMValueRef green_hi,
2456 LLVMValueRef i,
2457 LLVMValueRef j)
2458 {
2459 LLVMBuilderRef builder = gallivm->builder;
2460 bool is_signed = (format == PIPE_FORMAT_RGTC2_SNORM);
2461 LLVMValueRef red = s3tc_dxt5_alpha_channel(gallivm, is_signed, n, red_hi, red_lo, i, j);
2462 LLVMValueRef green = s3tc_dxt5_alpha_channel(gallivm, is_signed, n, green_hi, green_lo, i, j);
2463 LLVMValueRef rgba;
2464 struct lp_type type, type8;
2465 memset(&type, 0, sizeof type);
2466 type.width = 32;
2467 type.length = n;
2468 memset(&type8, 0, sizeof type8);
2469 type8.width = 8;
2470 type8.length = n*4;
2471 rgba = lp_build_const_int_vec(gallivm, type, is_signed ? (0x7f << 24) : (0xff << 24));
2472 rgba = LLVMBuildOr(builder, rgba, red, "");
2473 green = LLVMBuildShl(builder, green, lp_build_const_int_vec(gallivm, type, 8), "");
2474 rgba = LLVMBuildOr(builder, rgba, green, "");
2475 return LLVMBuildBitCast(builder, rgba, lp_build_vec_type(gallivm, type8), "");
2476 }
2477
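/*
 * decode latc1: the luminance channel is decoded like dxt5 alpha and
 * replicated to r/g/b, alpha is set to max (0x7f for the snorm case).
 */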
2478 static LLVMValueRef
2479 latc1_to_rgba_aos(struct gallivm_state *gallivm,
2480 unsigned n,
2481 enum pipe_format format,
2482 LLVMValueRef red_lo,
2483 LLVMValueRef red_hi,
2484 LLVMValueRef i,
2485 LLVMValueRef j)
2486 {
2487 LLVMBuilderRef builder = gallivm->builder;
2488 bool is_signed = (format == PIPE_FORMAT_LATC1_SNORM);
2489 LLVMValueRef red = s3tc_dxt5_alpha_channel(gallivm, is_signed, n, red_hi, red_lo, i, j);
2490 LLVMValueRef rgba, temp;
2491 struct lp_type type, type8;
2492 memset(&type, 0, sizeof type);
2493 type.width = 32;
2494 type.length = n;
2495 memset(&type8, 0, sizeof type8);
2496 type8.width = 8;
2497 type8.length = n*4;
2498 rgba = lp_build_const_int_vec(gallivm, type, is_signed ? (0x7f << 24) : (0xff << 24));
2499 rgba = LLVMBuildOr(builder, rgba, red, "");
2500 temp = LLVMBuildShl(builder, red, lp_build_const_int_vec(gallivm, type, 8), "");
2501 rgba = LLVMBuildOr(builder, rgba, temp, "");
2502 temp = LLVMBuildShl(builder, red, lp_build_const_int_vec(gallivm, type, 16), "");
2503 rgba = LLVMBuildOr(builder, rgba, temp, "");
2504 return LLVMBuildBitCast(builder, rgba, lp_build_vec_type(gallivm, type8), "");
2505 }
2506
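/*
 * decode latc2: luminance is decoded like dxt5 alpha and replicated to
 * r/g/b, the second channel provides alpha.
 */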
2507 static LLVMValueRef
2508 latc2_to_rgba_aos(struct gallivm_state *gallivm,
2509 unsigned n,
2510 enum pipe_format format,
2511 LLVMValueRef red_lo,
2512 LLVMValueRef red_hi,
2513 LLVMValueRef green_lo,
2514 LLVMValueRef green_hi,
2515 LLVMValueRef i,
2516 LLVMValueRef j)
2517 {
2518 LLVMBuilderRef builder = gallivm->builder;
2519 bool is_signed = (format == PIPE_FORMAT_LATC2_SNORM);
2520 LLVMValueRef red = s3tc_dxt5_alpha_channel(gallivm, is_signed, n, red_hi, red_lo, i, j);
2521 LLVMValueRef green = s3tc_dxt5_alpha_channel(gallivm, is_signed, n, green_hi, green_lo, i, j);
2522 LLVMValueRef rgba, temp;
2523 struct lp_type type, type8;
2524 memset(&type, 0, sizeof type);
2525 type.width = 32;
2526 type.length = n;
2527 memset(&type8, 0, sizeof type8);
2528 type8.width = 8;
2529 type8.length = n*4;
2530
2531 temp = LLVMBuildShl(builder, red, lp_build_const_int_vec(gallivm, type, 8), "");
2532 rgba = LLVMBuildOr(builder, red, temp, "");
2533 temp = LLVMBuildShl(builder, red, lp_build_const_int_vec(gallivm, type, 16), "");
2534 rgba = LLVMBuildOr(builder, rgba, temp, "");
2535 temp = LLVMBuildShl(builder, green, lp_build_const_int_vec(gallivm, type, 24), "");
2536 rgba = LLVMBuildOr(builder, rgba, temp, "");
2537 return LLVMBuildBitCast(builder, rgba, lp_build_vec_type(gallivm, type8), "");
2538 }
2539
2540 /**
2541 * @param n number of pixels processed (usually n=4, but it should also work with n=1
2542 * and multiples of 4)
2543 * @param base_ptr base pointer (32bit or 64bit pointer depending on the architecture)
2544  * @param offset <n x i32> vector with the relative offsets of the rgtc blocks
2545 * @param i is a <n x i32> vector with the x subpixel coordinate (0..3)
2546 * @param j is a <n x i32> vector with the y subpixel coordinate (0..3)
2547 * @return a <4*n x i8> vector with the pixel RGBA values in AoS
2548 */
2549 LLVMValueRef
2550 lp_build_fetch_rgtc_rgba_aos(struct gallivm_state *gallivm,
2551 const struct util_format_description *format_desc,
2552 unsigned n,
2553 LLVMValueRef base_ptr,
2554 LLVMValueRef offset,
2555 LLVMValueRef i,
2556 LLVMValueRef j,
2557 LLVMValueRef cache)
2558 {
2559 LLVMValueRef rgba;
2560 LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context);
2561 LLVMBuilderRef builder = gallivm->builder;
2562 LLVMValueRef red_lo, red_hi, green_lo, green_hi;
2563 assert(format_desc->layout == UTIL_FORMAT_LAYOUT_RGTC);
2564 assert(format_desc->block.width == 4);
2565 assert(format_desc->block.height == 4);
2566
2567 assert((n == 1) || (n % 4 == 0));
2568
2569 if (n > 4) {
2570 unsigned count;
2571 LLVMTypeRef i128_type = LLVMIntTypeInContext(gallivm->context, 128);
2572 LLVMTypeRef i128_vectype = LLVMVectorType(i128_type, n / 4);
2573 LLVMTypeRef i8_vectype = LLVMVectorType(i8t, 4 * n);
2574 LLVMTypeRef i324_vectype = LLVMVectorType(LLVMInt32TypeInContext(
2575 gallivm->context), 4);
2576 LLVMValueRef offset4, i4, j4, rgba4[LP_MAX_VECTOR_LENGTH/16];
2577 struct lp_type lp_324_vectype = lp_type_uint_vec(32, 128);
2578
2579 rgba = LLVMGetUndef(i128_vectype);
2580
2581 for (count = 0; count < n / 4; count++) {
2582
2583 i4 = lp_build_extract_range(gallivm, i, count * 4, 4);
2584 j4 = lp_build_extract_range(gallivm, j, count * 4, 4);
2585 offset4 = lp_build_extract_range(gallivm, offset, count * 4, 4);
2586
2587 lp_build_gather_rgtc(gallivm, 4, format_desc, &red_lo, &red_hi,
2588 &green_lo, &green_hi, base_ptr, offset4);
2589
2590 switch (format_desc->format) {
2591 case PIPE_FORMAT_RGTC1_UNORM:
2592 case PIPE_FORMAT_RGTC1_SNORM:
2593 rgba4[count] = rgtc1_to_rgba_aos(gallivm, 4, format_desc->format,
2594 red_lo, red_hi, i4, j4);
2595 break;
2596 case PIPE_FORMAT_RGTC2_UNORM:
2597 case PIPE_FORMAT_RGTC2_SNORM:
2598 rgba4[count] = rgtc2_to_rgba_aos(gallivm, 4, format_desc->format,
2599 red_lo, red_hi, green_lo, green_hi, i4, j4);
2600 break;
2601 case PIPE_FORMAT_LATC1_UNORM:
2602 case PIPE_FORMAT_LATC1_SNORM:
2603 rgba4[count] = latc1_to_rgba_aos(gallivm, 4, format_desc->format,
2604 red_lo, red_hi, i4, j4);
2605 break;
2606 case PIPE_FORMAT_LATC2_UNORM:
2607 case PIPE_FORMAT_LATC2_SNORM:
2608 rgba4[count] = latc2_to_rgba_aos(gallivm, 4, format_desc->format,
2609 red_lo, red_hi, green_lo, green_hi, i4, j4);
2610 break;
2611 default:
2612 assert(0);
2613 rgba4[count] = LLVMGetUndef(LLVMVectorType(i8t, 4));
2614 break;
2615 }
2616 /* shuffles typically give best results with dword elements...*/
2617 rgba4[count] = LLVMBuildBitCast(builder, rgba4[count], i324_vectype, "");
2618 }
2619 rgba = lp_build_concat(gallivm, rgba4, lp_324_vectype, n / 4);
2620 rgba = LLVMBuildBitCast(builder, rgba, i8_vectype, "");
2621 } else {
2622 LLVMValueRef red_lo, red_hi, green_lo, green_hi;
2623
2624 lp_build_gather_rgtc(gallivm, n, format_desc, &red_lo, &red_hi,
2625 &green_lo, &green_hi, base_ptr, offset);
2626
2627 switch (format_desc->format) {
2628 case PIPE_FORMAT_RGTC1_UNORM:
2629 case PIPE_FORMAT_RGTC1_SNORM:
2630 rgba = rgtc1_to_rgba_aos(gallivm, n, format_desc->format,
2631 red_lo, red_hi, i, j);
2632 break;
2633 case PIPE_FORMAT_RGTC2_UNORM:
2634 case PIPE_FORMAT_RGTC2_SNORM:
2635 rgba = rgtc2_to_rgba_aos(gallivm, n, format_desc->format,
2636 red_lo, red_hi, green_lo, green_hi, i, j);
2637 break;
2638 case PIPE_FORMAT_LATC1_UNORM:
2639 case PIPE_FORMAT_LATC1_SNORM:
2640 rgba = latc1_to_rgba_aos(gallivm, n, format_desc->format,
2641 red_lo, red_hi, i, j);
2642 break;
2643 case PIPE_FORMAT_LATC2_UNORM:
2644 case PIPE_FORMAT_LATC2_SNORM:
2645 rgba = latc2_to_rgba_aos(gallivm, n, format_desc->format,
2646 red_lo, red_hi, green_lo, green_hi, i, j);
2647 break;
2648 default:
2649 assert(0);
2650 rgba = LLVMGetUndef(LLVMVectorType(i8t, 4*n));
2651 break;
2652 }
2653 }
2654 return rgba;
2655 }