gallivm: use llvm jit code for decoding s3tc
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_format_s3tc.c
1 /**************************************************************************
2 *
3 * Copyright 2010-2018 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
17 * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
18 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
19 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
20 * USE OR OTHER DEALINGS IN THE SOFTWARE.
21 *
22 * The above copyright notice and this permission notice (including the
23 * next paragraph) shall be included in all copies or substantial portions
24 * of the Software.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * s3tc pixel format manipulation.
32 *
33 * @author Roland Scheidegger <sroland@vmware.com>
34 */
35
36
37 #include "util/u_format.h"
38 #include "util/u_math.h"
39 #include "util/u_string.h"
40 #include "util/u_cpu_detect.h"
41 #include "util/u_debug.h"
42
43 #include "lp_bld_arit.h"
44 #include "lp_bld_type.h"
45 #include "lp_bld_const.h"
46 #include "lp_bld_conv.h"
47 #include "lp_bld_gather.h"
48 #include "lp_bld_format.h"
49 #include "lp_bld_logic.h"
50 #include "lp_bld_pack.h"
51 #include "lp_bld_flow.h"
52 #include "lp_bld_printf.h"
53 #include "lp_bld_struct.h"
54 #include "lp_bld_swizzle.h"
55 #include "lp_bld_init.h"
56 #include "lp_bld_debug.h"
57 #include "lp_bld_intr.h"
58
59
60 /**
61 * Reverse an interleave2_half
62 * (i.e. pick every second element, independent lower/upper halves)
63 * sse2 can only do that with 32bit (shufps) or larger elements
64 * natively. (Otherwise, and/pack (even) or shift/pack (odd)
65 * could be used; ideally llvm would do that for us.)
66 * XXX: Unfortunately, this does NOT translate to a shufps if those
67 * are int vectors (and casting will not help, llvm needs to recognize it
68 * as "real" float). Instead, llvm will use a pshufd/pshufd/punpcklqdq
69 * sequence which I'm pretty sure is a lot worse despite domain transition
70 * penalties with shufps (except maybe on Nehalem).
71 */
72 static LLVMValueRef
73 lp_build_uninterleave2_half(struct gallivm_state *gallivm,
74 struct lp_type type,
75 LLVMValueRef a,
76 LLVMValueRef b,
77 unsigned lo_hi)
78 {
79 LLVMValueRef shuffle, elems[LP_MAX_VECTOR_LENGTH];
80 unsigned i, j;
81
82 assert(type.length <= LP_MAX_VECTOR_LENGTH);
83 assert(lo_hi < 2);
84
85 if (type.length * type.width == 256) {
86 assert(type.length >= 4);
87 for (i = 0, j = 0; i < type.length; ++i) {
88 if (i == type.length / 4) {
89 j = type.length;
90 } else if (i == type.length / 2) {
91 j = type.length / 2;
92 } else if (i == 3 * type.length / 4) {
93 j = 3 * type.length / 4;
94 } else {
95 j += 2;
96 }
97 elems[i] = lp_build_const_int32(gallivm, j + lo_hi);
98 }
99 } else {
100 for (i = 0; i < type.length; ++i) {
101 elems[i] = lp_build_const_int32(gallivm, 2*i + lo_hi);
102 }
103 }
104
105 shuffle = LLVMConstVector(elems, type.length);
106
107 return LLVMBuildShuffleVector(gallivm->builder, a, b, shuffle, "");
108
109 }
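
/*
 * Illustration (my sketch, not from the original comments): for a 4-wide
 * 32bit type the shuffle masks built above are
 *
 *    lo_hi == 0:  <0, 2, 4, 6>  ->  { a0, a2, b0, b2 }
 *    lo_hi == 1:  <1, 3, 5, 7>  ->  { a1, a3, b1, b3 }
 *
 * i.e. feeding it the two results of lp_build_interleave2_half gives back
 * the original de-interleaved vectors, as the comment above describes.
 */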
110
111
112 /**
113 * Build shuffle for extending vectors.
114 */
115 static LLVMValueRef
116 lp_build_const_extend_shuffle(struct gallivm_state *gallivm,
117 unsigned n, unsigned length)
118 {
119 LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
120 unsigned i;
121
122 assert(n <= length);
123 assert(length <= LP_MAX_VECTOR_LENGTH);
124
125 /* TODO: cache results in a static table */
126
127 for(i = 0; i < n; i++) {
128 elems[i] = lp_build_const_int32(gallivm, i);
129 }
130 for (i = n; i < length; i++) {
131 elems[i] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
132 }
133
134 return LLVMConstVector(elems, length);
135 }
136
137 static LLVMValueRef
138 lp_build_const_unpackx2_shuffle(struct gallivm_state *gallivm, unsigned n)
139 {
140 LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
141 unsigned i, j;
142
143 assert(n <= LP_MAX_VECTOR_LENGTH);
144
145 /* TODO: cache results in a static table */
146
147 for(i = 0, j = 0; i < n; i += 2, ++j) {
148 elems[i + 0] = lp_build_const_int32(gallivm, 0 + j);
149 elems[i + 1] = lp_build_const_int32(gallivm, n + j);
150 elems[n + i + 0] = lp_build_const_int32(gallivm, 0 + n/2 + j);
151 elems[n + i + 1] = lp_build_const_int32(gallivm, n + n/2 + j);
152 }
153
154 return LLVMConstVector(elems, n * 2);
155 }
156
157 /*
158 * broadcast 1 element to all elements
159 */
160 static LLVMValueRef
161 lp_build_const_shuffle1(struct gallivm_state *gallivm,
162 unsigned index, unsigned n)
163 {
164 LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
165 unsigned i;
166
167 assert(n <= LP_MAX_VECTOR_LENGTH);
168
169 /* TODO: cache results in a static table */
170
171 for (i = 0; i < n; i++) {
172 elems[i] = lp_build_const_int32(gallivm, index);
173 }
174
175 return LLVMConstVector(elems, n);
176 }
177
178 /*
179 * move 1 element to pos 0, rest undef
180 */
181 static LLVMValueRef
182 lp_build_shuffle1undef(struct gallivm_state *gallivm,
183 LLVMValueRef a, unsigned index, unsigned n)
184 {
185 LLVMValueRef elems[LP_MAX_VECTOR_LENGTH], shuf;
186 unsigned i;
187
188 assert(n <= LP_MAX_VECTOR_LENGTH);
189
190 elems[0] = lp_build_const_int32(gallivm, index);
191
192 for (i = 1; i < n; i++) {
193 elems[i] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
194 }
195 shuf = LLVMConstVector(elems, n);
196
197 return LLVMBuildShuffleVector(gallivm->builder, a, a, shuf, "");
198 }
199
200 static boolean
201 format_dxt1_variant(enum pipe_format format)
202 {
203 return format == PIPE_FORMAT_DXT1_RGB ||
204 format == PIPE_FORMAT_DXT1_RGBA ||
205 format == PIPE_FORMAT_DXT1_SRGB ||
206 format == PIPE_FORMAT_DXT1_SRGBA;
207
208 }
209
210 /**
211 * Gather elements from scatter positions in memory into vectors.
212 * This is customised for fetching texels from s3tc textures.
213 * For SSE, typical value is length=4.
214 *
215 * @param length number of offsets
216 * @param colors the stored colors of the blocks will be extracted into this.
217 * @param codewords the codewords of the blocks will be extracted into this.
218 * @param alpha_lo used for storing the lower 32 bits of the alpha components for dxt3/5
219 * @param alpha_hi used for storing the upper 32 bits of the alpha components for dxt3/5
220 * @param base_ptr base pointer, should be an i8 pointer type.
221 * @param offsets vector with offsets
222 */
223 static void
224 lp_build_gather_s3tc(struct gallivm_state *gallivm,
225 unsigned length,
226 const struct util_format_description *format_desc,
227 LLVMValueRef *colors,
228 LLVMValueRef *codewords,
229 LLVMValueRef *alpha_lo,
230 LLVMValueRef *alpha_hi,
231 LLVMValueRef base_ptr,
232 LLVMValueRef offsets)
233 {
234 LLVMBuilderRef builder = gallivm->builder;
235 unsigned block_bits = format_desc->block.bits;
236 unsigned i;
237 LLVMValueRef elems[8];
238 LLVMTypeRef type32 = LLVMInt32TypeInContext(gallivm->context);
239 LLVMTypeRef type64 = LLVMInt64TypeInContext(gallivm->context);
240 LLVMTypeRef type32dxt;
241 struct lp_type lp_type32dxt;
242
243 memset(&lp_type32dxt, 0, sizeof lp_type32dxt);
244 lp_type32dxt.width = 32;
245 lp_type32dxt.length = block_bits / 32;
246 type32dxt = lp_build_vec_type(gallivm, lp_type32dxt);
247
248 assert(block_bits == 64 || block_bits == 128);
249 assert(length == 1 || length == 4 || length == 8);
250
251 for (i = 0; i < length; ++i) {
252 elems[i] = lp_build_gather_elem(gallivm, length,
253 block_bits, block_bits, TRUE,
254 base_ptr, offsets, i, FALSE);
255 elems[i] = LLVMBuildBitCast(builder, elems[i], type32dxt, "");
256 }
257 if (length == 1) {
258 LLVMValueRef elem = elems[0];
259 if (block_bits == 128) {
260 *alpha_lo = LLVMBuildExtractElement(builder, elem,
261 lp_build_const_int32(gallivm, 0), "");
262 *alpha_hi = LLVMBuildExtractElement(builder, elem,
263 lp_build_const_int32(gallivm, 1), "");
264 *colors = LLVMBuildExtractElement(builder, elem,
265 lp_build_const_int32(gallivm, 2), "");
266 *codewords = LLVMBuildExtractElement(builder, elem,
267 lp_build_const_int32(gallivm, 3), "");
268 }
269 else {
270 *alpha_lo = LLVMGetUndef(type32);
271 *alpha_hi = LLVMGetUndef(type32);
272 *colors = LLVMBuildExtractElement(builder, elem,
273 lp_build_const_int32(gallivm, 0), "");
274 *codewords = LLVMBuildExtractElement(builder, elem,
275 lp_build_const_int32(gallivm, 1), "");
276 }
277 }
278 else {
279 LLVMValueRef tmp[4], cc01, cc23;
280 struct lp_type lp_type32, lp_type64;
281 memset(&lp_type32, 0, sizeof lp_type32);
282 lp_type32.width = 32;
283 lp_type32.length = length;
284 memset(&lp_type64, 0, sizeof lp_type64);
285 lp_type64.width = 64;
286 lp_type64.length = length/2;
287
288 if (block_bits == 128) {
289 if (length == 8) {
290 for (i = 0; i < 4; ++i) {
291 tmp[0] = elems[i];
292 tmp[1] = elems[i+4];
293 elems[i] = lp_build_concat(gallivm, tmp, lp_type32dxt, 2);
294 }
295 }
296 lp_build_transpose_aos(gallivm, lp_type32, elems, tmp);
297 *colors = tmp[2];
298 *codewords = tmp[3];
299 *alpha_lo = tmp[0];
300 *alpha_hi = tmp[1];
301 } else {
302 LLVMTypeRef type64_vec = LLVMVectorType(type64, length/2);
303 LLVMTypeRef type32_vec = LLVMVectorType(type32, length);
304
305 for (i = 0; i < length; ++i) {
306 /* no-op shuffle */
307 elems[i] = LLVMBuildShuffleVector(builder, elems[i],
308 LLVMGetUndef(type32dxt),
309 lp_build_const_extend_shuffle(gallivm, 2, 4), "");
310 }
311 if (length == 8) {
312 for (i = 0; i < 4; ++i) {
313 tmp[0] = elems[i];
314 tmp[1] = elems[i+4];
315 elems[i] = lp_build_concat(gallivm, tmp, lp_type32, 2);
316 }
317 }
318 cc01 = lp_build_interleave2_half(gallivm, lp_type32, elems[0], elems[1], 0);
319 cc23 = lp_build_interleave2_half(gallivm, lp_type32, elems[2], elems[3], 0);
320 cc01 = LLVMBuildBitCast(builder, cc01, type64_vec, "");
321 cc23 = LLVMBuildBitCast(builder, cc23, type64_vec, "");
322 *colors = lp_build_interleave2_half(gallivm, lp_type64, cc01, cc23, 0);
323 *codewords = lp_build_interleave2_half(gallivm, lp_type64, cc01, cc23, 1);
324 *colors = LLVMBuildBitCast(builder, *colors, type32_vec, "");
325 *codewords = LLVMBuildBitCast(builder, *codewords, type32_vec, "");
326 }
327 }
328 }
329
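/*
 * For reference (a sketch of the block layout, not from the original
 * comments): a dxt1 block is 64 bits,
 *
 *    bits  0..15   color0 (rgb565)
 *    bits 16..31   color1 (rgb565)
 *    bits 32..63   16 x 2-bit color indices (the "codewords")
 *
 * dxt3/dxt5 blocks prepend another 64 bits of alpha data, which is why the
 * 128bit path above returns that half as alpha_lo/alpha_hi.
 */
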
330 /** Convert from <n x i32> containing 2 x n rgb565 colors
331 * to 2 <n x i32> rgba8888 colors
332 * This is the most optimized version I can think of;
333 * it should be nearly as fast as decoding only one color.
334 * NOTE: alpha channel will be set to 0
335 * @param colors is a <n x i32> vector containing the rgb565 colors
336 */
337 static void
338 color_expand2_565_to_8888(struct gallivm_state *gallivm,
339 unsigned n,
340 LLVMValueRef colors,
341 LLVMValueRef *color0,
342 LLVMValueRef *color1)
343 {
344 LLVMBuilderRef builder = gallivm->builder;
345 LLVMValueRef r, g, b, rblo, glo;
346 LLVMValueRef rgblomask, rb, rgb0, rgb1;
347 struct lp_type type, type16, type8;
348
349 assert(n > 1);
350
351 memset(&type, 0, sizeof type);
352 type.width = 32;
353 type.length = n;
354
355 memset(&type16, 0, sizeof type16);
356 type16.width = 16;
357 type16.length = 2 * n;
358
359 memset(&type8, 0, sizeof type8);
360 type8.width = 8;
361 type8.length = 4 * n;
362
363 rgblomask = lp_build_const_int_vec(gallivm, type16, 0x0707);
364 colors = LLVMBuildBitCast(builder, colors,
365 lp_build_vec_type(gallivm, type16), "");
366 /* move r into low 8 bits, b into high 8 bits, g into another reg (low bits)
367 * make sure low bits of r are zero - could use AND but requires constant */
368 r = LLVMBuildLShr(builder, colors, lp_build_const_int_vec(gallivm, type16, 11), "");
369 r = LLVMBuildShl(builder, r, lp_build_const_int_vec(gallivm, type16, 3), "");
370 b = LLVMBuildShl(builder, colors, lp_build_const_int_vec(gallivm, type16, 11), "");
371 rb = LLVMBuildOr(builder, r, b, "");
372 rblo = LLVMBuildLShr(builder, rb, lp_build_const_int_vec(gallivm, type16, 5), "");
373 /* don't have byte shift hence need mask */
374 rblo = LLVMBuildAnd(builder, rblo, rgblomask, "");
375 rb = LLVMBuildOr(builder, rb, rblo, "");
376
377 /* make sure low bits of g are zero */
378 g = LLVMBuildAnd(builder, colors, lp_build_const_int_vec(gallivm, type16, 0x07e0), "");
379 g = LLVMBuildLShr(builder, g, lp_build_const_int_vec(gallivm, type16, 3), "");
380 glo = LLVMBuildLShr(builder, g, lp_build_const_int_vec(gallivm, type16, 6), "");
381 g = LLVMBuildOr(builder, g, glo, "");
382
383 rb = LLVMBuildBitCast(builder, rb, lp_build_vec_type(gallivm, type8), "");
384 g = LLVMBuildBitCast(builder, g, lp_build_vec_type(gallivm, type8), "");
385 rgb0 = lp_build_interleave2_half(gallivm, type8, rb, g, 0);
386 rgb1 = lp_build_interleave2_half(gallivm, type8, rb, g, 1);
387
388 rgb0 = LLVMBuildBitCast(builder, rgb0, lp_build_vec_type(gallivm, type), "");
389 rgb1 = LLVMBuildBitCast(builder, rgb1, lp_build_vec_type(gallivm, type), "");
390
391 /* rgb0 is rgb00, rgb01, rgb10, rgb11
392 * instead of rgb00, rgb10, rgb20, rgb30, hence the reshuffle;
393 * on x86 this _should_ just generate one shufps...
394 */
395 *color0 = lp_build_uninterleave2_half(gallivm, type, rgb0, rgb1, 0);
396 *color1 = lp_build_uninterleave2_half(gallivm, type, rgb0, rgb1, 1);
397 }
398
399
400 /** Convert from <n x i32> containing rgb565 colors
401 * (in first 16 bits) to <n x i32> rgba8888 colors
402 * bits 16-31 MBZ
403 * NOTE: alpha channel will be set to 0
404 * @param colors is a <n x i32> vector containing the rgb565 colors
405 */
406 static LLVMValueRef
407 color_expand_565_to_8888(struct gallivm_state *gallivm,
408 unsigned n,
409 LLVMValueRef colors)
410 {
411 LLVMBuilderRef builder = gallivm->builder;
412 LLVMValueRef rgba, r, g, b, rgblo, glo;
413 LLVMValueRef rbhimask, g6mask, rgblomask;
414 struct lp_type type;
415 memset(&type, 0, sizeof type);
416 type.width = 32;
417 type.length = n;
418
419 /* color expansion:
420 * first extract and shift colors into their final locations
421 * (high bits - low bits zero at this point)
422 * then replicate highest bits to the lowest bits
423 * note rb replication can be done in parallel but not g
424 * (different shift)
425 * r5mask = 0xf800, g6mask = 0x07e0, b5mask = 0x001f
426 * rhigh = 8, ghigh = 5, bhigh = 19
427 * rblow = 5, glow = 6
428 * rgblowmask = 0x00070307
429 * r = colors >> rhigh
430 * b = colors << bhigh
431 * g = (colors & g6mask) << ghigh
432 * rb = (r | b) & rbhimask
433 * rbtmp = rb >> rblow
434 * gtmp = rb >> glow
435 * rbtmp = rbtmp | gtmp
436 * rbtmp = rbtmp & rgblowmask
437 * rgb = rb | g | rbtmp
438 */
439 g6mask = lp_build_const_int_vec(gallivm, type, 0x07e0);
440 rbhimask = lp_build_const_int_vec(gallivm, type, 0x00f800f8);
441 rgblomask = lp_build_const_int_vec(gallivm, type, 0x00070307);
442
443 r = LLVMBuildLShr(builder, colors, lp_build_const_int_vec(gallivm, type, 8), "");
444 b = LLVMBuildShl(builder, colors, lp_build_const_int_vec(gallivm, type, 19), "");
445 g = LLVMBuildAnd(builder, colors, g6mask, "");
446 g = LLVMBuildShl(builder, g, lp_build_const_int_vec(gallivm, type, 5), "");
447 rgba = LLVMBuildOr(builder, r, b, "");
448 rgba = LLVMBuildAnd(builder, rgba, rbhimask, "");
449 rgblo = LLVMBuildLShr(builder, rgba, lp_build_const_int_vec(gallivm, type, 5), "");
450 glo = LLVMBuildLShr(builder, g, lp_build_const_int_vec(gallivm, type, 6), "");
451 rgblo = LLVMBuildOr(builder, rgblo, glo, "");
452 rgblo = LLVMBuildAnd(builder, rgblo, rgblomask, "");
453 rgba = LLVMBuildOr(builder, rgba, g, "");
454 rgba = LLVMBuildOr(builder, rgba, rgblo, "");
455
456 return rgba;
457 }
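
/*
 * Illustrative scalar reference of the expansion above (my sketch, not part
 * of the JIT path; the helper name is made up). It replicates the top bits
 * of each component into the low bits, matching the masks/shifts used by
 * the vectorized version (r in byte 0, g in byte 1, b in byte 2, a = 0).
 */
#if 0
static inline uint32_t
rgb565_to_rgba8888_ref(uint32_t rgb565)
{
   uint32_t r = (rgb565 >> 11) & 0x1f;
   uint32_t g = (rgb565 >> 5) & 0x3f;
   uint32_t b = rgb565 & 0x1f;
   r = (r << 3) | (r >> 2);
   g = (g << 2) | (g >> 4);
   b = (b << 3) | (b >> 2);
   return (b << 16) | (g << 8) | r;
}
#endif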
458
459
460 /**
461 * Calculate 1/3(v1-v0) + v0
462 * and 2*1/3(v1-v0) + v0
463 */
464 static void
465 lp_build_lerp23(struct lp_build_context *bld,
466 LLVMValueRef v0,
467 LLVMValueRef v1,
468 LLVMValueRef *res0,
469 LLVMValueRef *res1)
470 {
471 struct gallivm_state *gallivm = bld->gallivm;
472 LLVMValueRef x, x_lo, x_hi, delta_lo, delta_hi;
473 LLVMValueRef mul_lo, mul_hi, v0_lo, v0_hi, v1_lo, v1_hi, tmp;
474 const struct lp_type type = bld->type;
475 LLVMBuilderRef builder = bld->gallivm->builder;
476 struct lp_type i16_type = lp_wider_type(type);
477 struct lp_build_context bld2;
478
479 assert(lp_check_value(type, v0));
480 assert(lp_check_value(type, v1));
481 assert(!type.floating && !type.fixed && !type.norm && type.width == 8);
482
483 lp_build_context_init(&bld2, gallivm, i16_type);
484 bld2.type.sign = TRUE;
485 x = lp_build_const_int_vec(gallivm, bld->type, 255*1/3);
486
487 /* FIXME: use native avx256 unpack/pack */
488 lp_build_unpack2(gallivm, type, i16_type, x, &x_lo, &x_hi);
489 lp_build_unpack2(gallivm, type, i16_type, v0, &v0_lo, &v0_hi);
490 lp_build_unpack2(gallivm, type, i16_type, v1, &v1_lo, &v1_hi);
491 delta_lo = lp_build_sub(&bld2, v1_lo, v0_lo);
492 delta_hi = lp_build_sub(&bld2, v1_hi, v0_hi);
493
494 mul_lo = LLVMBuildMul(builder, x_lo, delta_lo, "");
495 mul_hi = LLVMBuildMul(builder, x_hi, delta_hi, "");
496
497 x_lo = LLVMBuildLShr(builder, mul_lo, lp_build_const_int_vec(gallivm, i16_type, 8), "");
498 x_hi = LLVMBuildLShr(builder, mul_hi, lp_build_const_int_vec(gallivm, i16_type, 8), "");
499 /* lerp optimization: pack now, do add afterwards */
500 tmp = lp_build_pack2(gallivm, i16_type, type, x_lo, x_hi);
501 *res0 = lp_build_add(bld, tmp, v0);
502
503 x_lo = LLVMBuildLShr(builder, mul_lo, lp_build_const_int_vec(gallivm, i16_type, 7), "");
504 x_hi = LLVMBuildLShr(builder, mul_hi, lp_build_const_int_vec(gallivm, i16_type, 7), "");
505 /* unlike above we still need the mask (but the add is still done afterwards). */
506 x_lo = LLVMBuildAnd(builder, x_lo, lp_build_const_int_vec(gallivm, i16_type, 0xff), "");
507 x_hi = LLVMBuildAnd(builder, x_hi, lp_build_const_int_vec(gallivm, i16_type, 0xff), "");
508 tmp = lp_build_pack2(gallivm, i16_type, type, x_lo, x_hi);
509 *res1 = lp_build_add(bld, tmp, v0);
510 }
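
/*
 * Worked example of the fixed-point math above (my sketch): with v0 = 0 and
 * v1 = 255 the weight is 255*1/3 = 85, so
 *
 *    res0 = ((85 * 255) >> 8) + v0 = 84              (exact 1/3 point: 85)
 *    res1 = (((85 * 255) >> 7) & 0xff) + v0 = 169    (exact 2/3 point: 170)
 *
 * i.e. 1/3 is approximated by 85/256, which can be off by an lsb or two
 * compared to exact rounding.
 */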
511
512 /**
513 * Convert from <n x i64> s3tc dxt1 to <4n x i8> RGBA AoS
514 * @param colors is a <n x i32> vector with n x 2x16bit colors
515 * @param codewords is a <n x i32> vector containing the codewords
516 * @param i is a <n x i32> vector with the x pixel coordinate (0 to 3)
517 * @param j is a <n x i32> vector with the y pixel coordinate (0 to 3)
518 */
519 static LLVMValueRef
520 s3tc_dxt1_full_to_rgba_aos(struct gallivm_state *gallivm,
521 unsigned n,
522 enum pipe_format format,
523 LLVMValueRef colors,
524 LLVMValueRef codewords,
525 LLVMValueRef i,
526 LLVMValueRef j)
527 {
528 LLVMBuilderRef builder = gallivm->builder;
529 LLVMValueRef color0, color1, color2, color3, color2_2, color3_2;
530 LLVMValueRef rgba, a, colors0, colors1, col0, col1, const2;
531 LLVMValueRef bit_pos, sel_mask, sel_lo, sel_hi, indices;
532 struct lp_type type, type8;
533 struct lp_build_context bld8, bld32;
534 boolean is_dxt1_variant = format_dxt1_variant(format);
535
536 memset(&type, 0, sizeof type);
537 type.width = 32;
538 type.length = n;
539
540 memset(&type8, 0, sizeof type8);
541 type8.width = 8;
542 type8.length = 4*n;
543
544 assert(lp_check_value(type, i));
545 assert(lp_check_value(type, j));
546
547 a = lp_build_const_int_vec(gallivm, type, 0xff000000);
548
549 lp_build_context_init(&bld32, gallivm, type);
550 lp_build_context_init(&bld8, gallivm, type8);
551
552 /*
553 * works as follows:
554 * - expand color0/color1 to rgba8888
555 * - calculate color2/3 (interpolation) according to color0 < color1 rules
556 * - calculate color2/3 according to color0 >= color1 rules
557 * - do selection of color2/3 according to comparison of color0/1
558 * - extract indices (vector shift).
559 * - use compare/select to select the correct color. Since we have 2bit
560 * indices (and 4 colors), needs at least three compare/selects.
561 */
562 /*
563 * expand the two colors
564 */
565 col0 = LLVMBuildAnd(builder, colors, lp_build_const_int_vec(gallivm, type, 0x0000ffff), "");
566 col1 = LLVMBuildLShr(builder, colors, lp_build_const_int_vec(gallivm, type, 16), "");
567 if (n > 1) {
568 color_expand2_565_to_8888(gallivm, n, colors, &color0, &color1);
569 }
570 else {
571 color0 = color_expand_565_to_8888(gallivm, n, col0);
572 color1 = color_expand_565_to_8888(gallivm, n, col1);
573 }
574
575 /*
576 * interpolate colors
577 * color2_1 is 2/3 color0 + 1/3 color1
578 * color3_1 is 1/3 color0 + 2/3 color1
579 * color2_2 is 1/2 color0 + 1/2 color1
580 * color3_2 is 0
581 */
582
583 colors0 = LLVMBuildBitCast(builder, color0, bld8.vec_type, "");
584 colors1 = LLVMBuildBitCast(builder, color1, bld8.vec_type, "");
585 /* can combine 2 lerps into one mostly - still looks expensive enough. */
586 lp_build_lerp23(&bld8, colors0, colors1, &color2, &color3);
587 color2 = LLVMBuildBitCast(builder, color2, bld32.vec_type, "");
588 color3 = LLVMBuildBitCast(builder, color3, bld32.vec_type, "");
589
590 /* dxt3/5 always use 4-color encoding */
591 if (is_dxt1_variant) {
592 /* fix up alpha */
593 if (format == PIPE_FORMAT_DXT1_RGBA ||
594 format == PIPE_FORMAT_DXT1_SRGBA) {
595 color0 = LLVMBuildOr(builder, color0, a, "");
596 color1 = LLVMBuildOr(builder, color1, a, "");
597 color3 = LLVMBuildOr(builder, color3, a, "");
598 }
599 /*
600 * XXX with sse2 and 16x8 vectors, should use pavgb even when n == 1.
601 * Much cheaper (but we don't care that much if n == 1).
602 */
603 if ((util_cpu_caps.has_sse2 && n == 4) ||
604 (util_cpu_caps.has_avx2 && n == 8)) {
605 LLVMValueRef intrargs[2];
606 char *intr_name = n == 8 ? "llvm.x86.avx2.pavg.b" :
607 "llvm.x86.sse2.pavg.b";
608 intrargs[0] = colors0;
609 intrargs[1] = colors1;
610 color2_2 = lp_build_intrinsic(builder, intr_name,
611 bld8.vec_type, intrargs, 2, 0);
612 color2_2 = LLVMBuildBitCast(builder, color2_2, bld32.vec_type, "");
613 }
614 else {
615 struct lp_type i16_type = lp_wider_type(type8);
616 struct lp_build_context bld2;
617 LLVMValueRef v0_lo, v0_hi, v1_lo, v1_hi, addlo, addhi;
618
619 lp_build_context_init(&bld2, gallivm, i16_type);
620 bld2.type.sign = TRUE;
621
622 /*
623 * This isn't as expensive as it looks (the unpack is the same as
624 * for lerp23), with correct rounding.
625 * (Note that while rounding is correct, this will always round down,
626 * whereas pavgb will always round up.)
627 */
628 /* FIXME: use native avx256 unpack/pack */
629 lp_build_unpack2(gallivm, type8, i16_type, colors0, &v0_lo, &v0_hi);
630 lp_build_unpack2(gallivm, type8, i16_type, colors1, &v1_lo, &v1_hi);
631
632 addlo = lp_build_add(&bld2, v0_lo, v1_lo);
633 addhi = lp_build_add(&bld2, v0_hi, v1_hi);
634 addlo = LLVMBuildLShr(builder, addlo,
635 lp_build_const_int_vec(gallivm, i16_type, 1), "");
636 addhi = LLVMBuildLShr(builder, addhi,
637 lp_build_const_int_vec(gallivm, i16_type, 1), "");
638 color2_2 = lp_build_pack2(gallivm, i16_type, type8, addlo, addhi);
639 color2_2 = LLVMBuildBitCast(builder, color2_2, bld32.vec_type, "");
640 }
641 color3_2 = lp_build_const_int_vec(gallivm, type, 0);
642
643 /* select between colors2/3 */
644 /* signed compare is faster, saves some xors */
645 type.sign = TRUE;
646 sel_mask = lp_build_compare(gallivm, type, PIPE_FUNC_GREATER, col0, col1);
647 color2 = lp_build_select(&bld32, sel_mask, color2, color2_2);
648 color3 = lp_build_select(&bld32, sel_mask, color3, color3_2);
649 type.sign = FALSE;
650
651 if (format == PIPE_FORMAT_DXT1_RGBA ||
652 format == PIPE_FORMAT_DXT1_SRGBA) {
653 color2 = LLVMBuildOr(builder, color2, a, "");
654 }
655 }
656
657 const2 = lp_build_const_int_vec(gallivm, type, 2);
658 /* extract 2-bit index values */
659 bit_pos = LLVMBuildShl(builder, j, const2, "");
660 bit_pos = LLVMBuildAdd(builder, bit_pos, i, "");
661 bit_pos = LLVMBuildAdd(builder, bit_pos, bit_pos, "");
662 /*
663 * NOTE: This innocent looking shift is very expensive with x86/ssex.
664 * Shifts with per-elemnent shift count get roughly translated to
665 * extract (count), extract (value), shift, move (back to xmm), unpack
666 * per element!
667 * So about 20 instructions here for 4xi32.
668 * Newer llvm versions (3.7+) will not do extract/insert but use a
669 * a couple constant count vector shifts plus shuffles. About same
670 * amount of instructions unfortunately...
671 * Would get much worse with 8xi16 even...
672 * We could actually do better here:
673 * - subtract bit_pos from 128+30, shl 23, convert float to int...
674 * - now do mul with codewords followed by shr 30...
675 * But requires 32bit->32bit mul, sse41 only (well that's emulatable
676 * with 2 32bit->64bit muls...) and not exactly cheap
677 * AVX2, of course, fixes this nonsense.
678 */
679 indices = LLVMBuildLShr(builder, codewords, bit_pos, "");
680
681 /* finally select the colors */
682 sel_lo = LLVMBuildAnd(builder, indices, bld32.one, "");
683 sel_lo = lp_build_compare(gallivm, type, PIPE_FUNC_EQUAL, sel_lo, bld32.one);
684 color0 = lp_build_select(&bld32, sel_lo, color1, color0);
685 color2 = lp_build_select(&bld32, sel_lo, color3, color2);
686 sel_hi = LLVMBuildAnd(builder, indices, const2, "");
687 sel_hi = lp_build_compare(gallivm, type, PIPE_FUNC_EQUAL, sel_hi, const2);
688 rgba = lp_build_select(&bld32, sel_hi, color2, color0);
689
690 /* fix up alpha */
691 if (format == PIPE_FORMAT_DXT1_RGB ||
692 format == PIPE_FORMAT_DXT1_SRGB) {
693 rgba = LLVMBuildOr(builder, rgba, a, "");
694 }
695 return LLVMBuildBitCast(builder, rgba, bld8.vec_type, "");
696 }
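
/*
 * Scalar reference of the index/color selection done above (illustrative
 * sketch only, kept out of the build; the helper name is made up).
 * color[0..3] are the four expanded colors, in the same order the selects
 * above use (0/1 from the block, 2/3 interpolated).
 */
#if 0
static inline uint32_t
dxt1_select_texel_ref(const uint32_t color[4], uint32_t codewords,
                      unsigned i, unsigned j)
{
   unsigned idx = (codewords >> (2 * (4 * j + i))) & 3;
   return color[idx];
}
#endif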
697
698
699 static LLVMValueRef
700 s3tc_dxt1_to_rgba_aos(struct gallivm_state *gallivm,
701 unsigned n,
702 enum pipe_format format,
703 LLVMValueRef colors,
704 LLVMValueRef codewords,
705 LLVMValueRef i,
706 LLVMValueRef j)
707 {
708 return s3tc_dxt1_full_to_rgba_aos(gallivm, n, format,
709 colors, codewords, i, j);
710 }
711
712
713 /**
714 * Convert from <n x i128> s3tc dxt3 to <4n x i8> RGBA AoS
715 * @param colors is a <n x i32> vector with n x 2x16bit colors
716 * @param codewords is a <n x i32> vector containing the codewords
717 * @param alphas is a <n x i64> vector containing the alpha values
718 * @param i is a <n x i32> vector with the x pixel coordinate (0 to 3)
719 * @param j is a <n x i32> vector with the y pixel coordinate (0 to 3)
720 */
721 static LLVMValueRef
722 s3tc_dxt3_to_rgba_aos(struct gallivm_state *gallivm,
723 unsigned n,
724 enum pipe_format format,
725 LLVMValueRef colors,
726 LLVMValueRef codewords,
727 LLVMValueRef alpha_low,
728 LLVMValueRef alpha_hi,
729 LLVMValueRef i,
730 LLVMValueRef j)
731 {
732 LLVMBuilderRef builder = gallivm->builder;
733 LLVMValueRef rgba, tmp, tmp2;
734 LLVMValueRef bit_pos, sel_mask;
735 struct lp_type type, type8;
736 struct lp_build_context bld;
737
738 memset(&type, 0, sizeof type);
739 type.width = 32;
740 type.length = n;
741
742 memset(&type8, 0, sizeof type8);
743 type8.width = 8;
744 type8.length = n*4;
745
746 assert(lp_check_value(type, i));
747 assert(lp_check_value(type, j));
748
749 lp_build_context_init(&bld, gallivm, type);
750
751 rgba = s3tc_dxt1_to_rgba_aos(gallivm, n, format,
752 colors, codewords, i, j);
753
754 rgba = LLVMBuildBitCast(builder, rgba, bld.vec_type, "");
755
756 /*
757 * Extract alpha values. Since we now need to select which 32bit
758 * vector the values are fetched from, construct a selection
759 * mask from highest bit of bit_pos, and use select, then shift
760 * according to the bit_pos (without the highest bit).
761 * Note this is pointless for n == 1 case. Could just
762 * directly use 64bit arithmetic if we'd extract 64bit
763 * alpha value instead of 2x32...
764 */
765 /* pos = 4*(4j+i) */
766 bit_pos = LLVMBuildShl(builder, j, lp_build_const_int_vec(gallivm, type, 2), "");
767 bit_pos = LLVMBuildAdd(builder, bit_pos, i, "");
768 bit_pos = LLVMBuildShl(builder, bit_pos,
769 lp_build_const_int_vec(gallivm, type, 2), "");
770 sel_mask = LLVMBuildLShr(builder, bit_pos,
771 lp_build_const_int_vec(gallivm, type, 5), "");
772 sel_mask = LLVMBuildSub(builder, sel_mask, bld.one, "");
773 tmp = lp_build_select(&bld, sel_mask, alpha_low, alpha_hi);
774 bit_pos = LLVMBuildAnd(builder, bit_pos,
775 lp_build_const_int_vec(gallivm, type, 0xffffffdf), "");
776 /* Warning: slow shift with per element count */
777 /*
778 * Could do pshufb here as well - just use appropriate 2 bits in bit_pos
779 * to select the right byte with pshufb. Then for the remaining one bit
780 * just do shift/select.
781 */
782 tmp = LLVMBuildLShr(builder, tmp, bit_pos, "");
783
784 /* combined expand from a4 to a8 and shift into position */
785 tmp = LLVMBuildShl(builder, tmp, lp_build_const_int_vec(gallivm, type, 28), "");
786 tmp2 = LLVMBuildLShr(builder, tmp, lp_build_const_int_vec(gallivm, type, 4), "");
787 tmp = LLVMBuildOr(builder, tmp, tmp2, "");
788
789 rgba = LLVMBuildOr(builder, tmp, rgba, "");
790
791 return LLVMBuildBitCast(builder, rgba, lp_build_vec_type(gallivm, type8), "");
792 }
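
/*
 * The a4 -> a8 step above is plain bit replication; as a scalar sketch
 * (mine, for illustration):
 *
 *    a8 = (a4 << 4) | a4;     e.g. 0x7 -> 0x77, 0xf -> 0xff
 *
 * done here as (tmp << 28) | ((tmp << 28) >> 4) so the replicated value
 * lands directly in bits 24-31 of the rgba dword.
 */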
793
794 static LLVMValueRef
795 lp_build_lerpdxta(struct gallivm_state *gallivm,
796 LLVMValueRef alpha0,
797 LLVMValueRef alpha1,
798 LLVMValueRef code,
799 LLVMValueRef sel_mask,
800 unsigned n)
801 {
802 /*
803 * note we're doing lerp in 16bit since 32bit pmulld is only available in sse41
804 * (plus pmullw is actually faster...)
805 * we just pretend our 32bit values (which are really only 8bit) are 16bits.
806 * Note that this is obviously a disaster for the scalar case.
807 */
808 LLVMBuilderRef builder = gallivm->builder;
809 LLVMValueRef delta, ainterp;
810 LLVMValueRef weight5, weight7, weight;
811 struct lp_type type32, type16, type8;
812 struct lp_build_context bld16;
813
814 memset(&type32, 0, sizeof type32);
815 type32.width = 32;
816 type32.length = n;
817 memset(&type16, 0, sizeof type16);
818 type16.width = 16;
819 type16.length = 2*n;
820 type16.sign = TRUE;
821 memset(&type8, 0, sizeof type8);
822 type8.width = 8;
823 type8.length = 4*n;
824
825 lp_build_context_init(&bld16, gallivm, type16);
826 /* 255/7 is a bit off - increase accuracy at the expense of shift later */
827 sel_mask = LLVMBuildBitCast(builder, sel_mask, bld16.vec_type, "");
828 weight5 = lp_build_const_int_vec(gallivm, type16, 255*64/5);
829 weight7 = lp_build_const_int_vec(gallivm, type16, 255*64/7);
830 weight = lp_build_select(&bld16, sel_mask, weight7, weight5);
831
832 alpha0 = LLVMBuildBitCast(builder, alpha0, bld16.vec_type, "");
833 alpha1 = LLVMBuildBitCast(builder, alpha1, bld16.vec_type, "");
834 code = LLVMBuildBitCast(builder, code, bld16.vec_type, "");
835 /* we'll get garbage in the elements which had code 0 (or larger than 5 or 7)
836 but we don't care */
837 code = LLVMBuildSub(builder, code, bld16.one, "");
838
839 weight = LLVMBuildMul(builder, weight, code, "");
840 weight = LLVMBuildLShr(builder, weight,
841 lp_build_const_int_vec(gallivm, type16, 6), "");
842
843 delta = LLVMBuildSub(builder, alpha1, alpha0, "");
844
845 ainterp = LLVMBuildMul(builder, delta, weight, "");
846 ainterp = LLVMBuildLShr(builder, ainterp,
847 lp_build_const_int_vec(gallivm, type16, 8), "");
848
849 ainterp = LLVMBuildBitCast(builder, ainterp, lp_build_vec_type(gallivm, type8), "");
850 alpha0 = LLVMBuildBitCast(builder, alpha0, lp_build_vec_type(gallivm, type8), "");
851 ainterp = LLVMBuildAdd(builder, alpha0, ainterp, "");
852 ainterp = LLVMBuildBitCast(builder, ainterp, lp_build_vec_type(gallivm, type32), "");
853
854 return ainterp;
855 }
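
/*
 * Worked example of the weight scaling above (my sketch): for the 7-step
 * case the exact per-step weight is 255/7 ~= 36.43. Using 255*64/7 = 2331
 * and shifting right by 6 after the multiply keeps ~6 extra bits, e.g. for
 * code 4 (3 steps away from alpha0):
 *
 *    weight = (2331 * 3) >> 6 = 109     (exact: 3 * 255 / 7 ~= 109.3)
 *
 * whereas a plain 8-bit weight of 36 would give 108 and accumulate more
 * error for the larger codes.
 */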
856
857 /**
858 * Convert from <n x i128> s3tc dxt5 to <4n x i8> RGBA AoS
859 * @param colors is a <n x i32> vector with n x 2x16bit colors
860 * @param codewords is a <n x i32> vector containing the codewords
861 * @param alphas is a <n x i64> vector containing the alpha values
862 * @param i is a <n x i32> vector with the x pixel coordinate (0 to 3)
863 * @param j is a <n x i32> vector with the y pixel coordinate (0 to 3)
864 */
865 static LLVMValueRef
866 s3tc_dxt5_full_to_rgba_aos(struct gallivm_state *gallivm,
867 unsigned n,
868 enum pipe_format format,
869 LLVMValueRef colors,
870 LLVMValueRef codewords,
871 LLVMValueRef alpha_lo,
872 LLVMValueRef alpha_hi,
873 LLVMValueRef i,
874 LLVMValueRef j)
875 {
876 LLVMBuilderRef builder = gallivm->builder;
877 LLVMValueRef rgba, tmp, alpha0, alpha1, alphac, alphac0, bit_pos, shift;
878 LLVMValueRef sel_mask, tmp_mask, alpha, alpha64, code_s;
879 LLVMValueRef mask6, mask7, ainterp;
880 LLVMTypeRef i64t = LLVMInt64TypeInContext(gallivm->context);
881 LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
882 struct lp_type type, type8;
883 struct lp_build_context bld32;
884
885 memset(&type, 0, sizeof type);
886 type.width = 32;
887 type.length = n;
888
889 memset(&type8, 0, sizeof type8);
890 type8.width = 8;
891 type8.length = n*4;
892
893 assert(lp_check_value(type, i));
894 assert(lp_check_value(type, j));
895
896 lp_build_context_init(&bld32, gallivm, type);
897
898 assert(lp_check_value(type, i));
899 assert(lp_check_value(type, j));
900
901 rgba = s3tc_dxt1_to_rgba_aos(gallivm, n, format,
902 colors, codewords, i, j);
903
904 rgba = LLVMBuildBitCast(builder, rgba, bld32.vec_type, "");
905
906 /* this looks pretty complex for vectorization:
907 * extract a0/a1 values
908 * extract code
909 * select weights for interpolation depending on a0 > a1
910 * mul weights by code - 1
911 * lerp a0/a1/weights
912 * use selects for getting either a0, a1, interp a, interp a/0.0, interp a/1.0
913 */
914
915 alpha0 = LLVMBuildAnd(builder, alpha_lo,
916 lp_build_const_int_vec(gallivm, type, 0xff), "");
917 alpha1 = LLVMBuildLShr(builder, alpha_lo,
918 lp_build_const_int_vec(gallivm, type, 8), "");
919 alpha1 = LLVMBuildAnd(builder, alpha1,
920 lp_build_const_int_vec(gallivm, type, 0xff), "");
921
922 /* pos = 3*(4j+i) */
923 bit_pos = LLVMBuildShl(builder, j, lp_build_const_int_vec(gallivm, type, 2), "");
924 bit_pos = LLVMBuildAdd(builder, bit_pos, i, "");
925 tmp = LLVMBuildAdd(builder, bit_pos, bit_pos, "");
926 bit_pos = LLVMBuildAdd(builder, bit_pos, tmp, "");
927 /* get rid of first 2 bytes - saves shifts of alpha_lo/hi */
928 bit_pos = LLVMBuildAdd(builder, bit_pos,
929 lp_build_const_int_vec(gallivm, type, 16), "");
930
931 if (n == 1) {
932 struct lp_type type64;
933 memset(&type64, 0, sizeof type64);
934 type64.width = 64;
935 type64.length = 1;
936 /* This is pretty pointless; we could avoid it by directly extracting
937 64bit in the first place, but that makes things more complicated elsewhere */
938 alpha_lo = LLVMBuildZExt(builder, alpha_lo, i64t, "");
939 alpha_hi = LLVMBuildZExt(builder, alpha_hi, i64t, "");
940 alphac0 = LLVMBuildShl(builder, alpha_hi,
941 lp_build_const_int_vec(gallivm, type64, 32), "");
942 alphac0 = LLVMBuildOr(builder, alpha_lo, alphac0, "");
943
944 shift = LLVMBuildZExt(builder, bit_pos, i64t, "");
945 alphac0 = LLVMBuildLShr(builder, alphac0, shift, "");
946 alphac0 = LLVMBuildTrunc(builder, alphac0, i32t, "");
947 alphac = LLVMBuildAnd(builder, alphac0,
948 lp_build_const_int_vec(gallivm, type, 0x7), "");
949 }
950 else {
951 /*
952 * Using non-native vector length here (actually, with avx2 and
953 * n == 4 llvm will indeed expand to ymm regs...)
954 * At least newer llvm versions handle that ok.
955 * llvm 3.7+ will even handle the emulated 64bit shift with variable
956 * shift count without extraction (and it's actually easier to
957 * emulate than the 32bit one).
958 */
959 alpha64 = LLVMBuildShuffleVector(builder, alpha_lo, alpha_hi,
960 lp_build_const_unpackx2_shuffle(gallivm, n), "");
961
962 alpha64 = LLVMBuildBitCast(builder, alpha64, LLVMVectorType(i64t, n), "");
963 shift = LLVMBuildZExt(builder, bit_pos, LLVMVectorType(i64t, n), "");
964 alphac = LLVMBuildLShr(builder, alpha64, shift, "");
965 alphac = LLVMBuildTrunc(builder, alphac, bld32.vec_type, "");
966
967 alphac = LLVMBuildAnd(builder, alphac,
968 lp_build_const_int_vec(gallivm, type, 0x7), "");
969 }
970
971 /* signed compare is faster, saves some xors */
972 type.sign = TRUE;
973 /* alpha0 > alpha1 selection */
974 sel_mask = lp_build_compare(gallivm, type, PIPE_FUNC_GREATER,
975 alpha0, alpha1);
976 ainterp = lp_build_lerpdxta(gallivm, alpha0, alpha1, alphac, sel_mask, n);
977
978 /*
979 * if a0 > a1 then we select a0 for case 0, a1 for case 1, interp otherwise.
980 * else we select a0 for case 0, a1 for case 1,
981 * interp for case 2-5, 00 for 6 and 0xff(ffffff) for 7
982 * a = (c == 0) ? a0 : a1
983 * a = (c > 1) ? ainterp : a
984 * Finally handle case 6/7 for !(a0 > a1)
985 * a = (!(a0 > a1) && c == 6) ? 0 : a (andnot with mask)
986 * a = (!(a0 > a1) && c == 7) ? 0xffffffff : a (or with mask)
987 */
988 tmp_mask = lp_build_compare(gallivm, type, PIPE_FUNC_EQUAL,
989 alphac, bld32.zero);
990 alpha = lp_build_select(&bld32, tmp_mask, alpha0, alpha1);
991 tmp_mask = lp_build_compare(gallivm, type, PIPE_FUNC_GREATER,
992 alphac, bld32.one);
993 alpha = lp_build_select(&bld32, tmp_mask, ainterp, alpha);
994
995 code_s = LLVMBuildAnd(builder, alphac,
996 LLVMBuildNot(builder, sel_mask, ""), "");
997 mask6 = lp_build_compare(gallivm, type, PIPE_FUNC_EQUAL,
998 code_s, lp_build_const_int_vec(gallivm, type, 6));
999 mask7 = lp_build_compare(gallivm, type, PIPE_FUNC_EQUAL,
1000 code_s, lp_build_const_int_vec(gallivm, type, 7));
1001 alpha = LLVMBuildAnd(builder, alpha, LLVMBuildNot(builder, mask6, ""), "");
1002 alpha = LLVMBuildOr(builder, alpha, mask7, "");
1003
1004 alpha = LLVMBuildShl(builder, alpha, lp_build_const_int_vec(gallivm, type, 24), "");
1005 rgba = LLVMBuildOr(builder, alpha, rgba, "");
1006
1007 return LLVMBuildBitCast(builder, rgba, lp_build_vec_type(gallivm, type8), "");
1008 }
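
/*
 * Scalar reference of the dxt5 alpha decode above (illustrative sketch
 * only, kept out of the build; the helper name is made up). The jit code
 * approximates the /7 and /5 divisions with the fixed-point weights in
 * lp_build_lerpdxta.
 */
#if 0
static inline uint32_t
dxt5_alpha_ref(uint32_t alpha0, uint32_t alpha1, unsigned code)
{
   if (code == 0)
      return alpha0;
   if (code == 1)
      return alpha1;
   if (alpha0 > alpha1)
      return ((8 - code) * alpha0 + (code - 1) * alpha1) / 7;
   if (code == 6)
      return 0;
   if (code == 7)
      return 0xff;
   return ((6 - code) * alpha0 + (code - 1) * alpha1) / 5;
}
#endif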
1009
1010
1011 static void
1012 lp_build_gather_s3tc_simple_scalar(struct gallivm_state *gallivm,
1013 const struct util_format_description *format_desc,
1014 LLVMValueRef *dxt_block,
1015 LLVMValueRef ptr)
1016 {
1017 LLVMBuilderRef builder = gallivm->builder;
1018 unsigned block_bits = format_desc->block.bits;
1019 LLVMValueRef elem, shuf;
1020 LLVMTypeRef type32 = LLVMIntTypeInContext(gallivm->context, 32);
1021 LLVMTypeRef src_type = LLVMIntTypeInContext(gallivm->context, block_bits);
1022 LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0);
1023 LLVMTypeRef type32_4 = LLVMVectorType(type32, 4);
1024
1025 assert(block_bits == 64 || block_bits == 128);
1026
1027 ptr = LLVMBuildBitCast(builder, ptr, src_ptr_type, "");
1028 elem = LLVMBuildLoad(builder, ptr, "");
1029
1030 if (block_bits == 128) {
1031 /* just return block as is */
1032 *dxt_block = LLVMBuildBitCast(builder, elem, type32_4, "");
1033 }
1034 else {
1035 LLVMTypeRef type32_2 = LLVMVectorType(type32, 2);
1036 shuf = lp_build_const_extend_shuffle(gallivm, 2, 4);
1037 elem = LLVMBuildBitCast(builder, elem, type32_2, "");
1038 *dxt_block = LLVMBuildShuffleVector(builder, elem,
1039 LLVMGetUndef(type32_2), shuf, "");
1040 }
1041 }
1042
1043
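/*
 * Store the decoded 4x4 block (col[0..3], i.e. 16 rgba8888 texels) into the
 * format cache at hash_index, along with the tag identifying the source
 * block.
 */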
1044 static void
1045 s3tc_store_cached_block(struct gallivm_state *gallivm,
1046 LLVMValueRef *col,
1047 LLVMValueRef tag_value,
1048 LLVMValueRef hash_index,
1049 LLVMValueRef cache)
1050 {
1051 LLVMBuilderRef builder = gallivm->builder;
1052 LLVMValueRef ptr, indices[3];
1053 LLVMTypeRef type_ptr4x32;
1054 unsigned count;
1055
1056 type_ptr4x32 = LLVMPointerType(LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4), 0);
1057 indices[0] = lp_build_const_int32(gallivm, 0);
1058 indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_TAGS);
1059 indices[2] = hash_index;
1060 ptr = LLVMBuildGEP(builder, cache, indices, ARRAY_SIZE(indices), "");
1061 LLVMBuildStore(builder, tag_value, ptr);
1062
1063 indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_DATA);
1064 hash_index = LLVMBuildMul(builder, hash_index,
1065 lp_build_const_int32(gallivm, 16), "");
1066 for (count = 0; count < 4; count++) {
1067 indices[2] = hash_index;
1068 ptr = LLVMBuildGEP(builder, cache, indices, ARRAY_SIZE(indices), "");
1069 ptr = LLVMBuildBitCast(builder, ptr, type_ptr4x32, "");
1070 LLVMBuildStore(builder, col[count], ptr);
1071 hash_index = LLVMBuildAdd(builder, hash_index,
1072 lp_build_const_int32(gallivm, 4), "");
1073 }
1074 }
1075
1076 static LLVMValueRef
1077 s3tc_lookup_cached_pixel(struct gallivm_state *gallivm,
1078 LLVMValueRef ptr,
1079 LLVMValueRef index)
1080 {
1081 LLVMBuilderRef builder = gallivm->builder;
1082 LLVMValueRef member_ptr, indices[3];
1083
1084 indices[0] = lp_build_const_int32(gallivm, 0);
1085 indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_DATA);
1086 indices[2] = index;
1087 member_ptr = LLVMBuildGEP(builder, ptr, indices, ARRAY_SIZE(indices), "");
1088 return LLVMBuildLoad(builder, member_ptr, "cache_data");
1089 }
1090
1091 static LLVMValueRef
1092 s3tc_lookup_tag_data(struct gallivm_state *gallivm,
1093 LLVMValueRef ptr,
1094 LLVMValueRef index)
1095 {
1096 LLVMBuilderRef builder = gallivm->builder;
1097 LLVMValueRef member_ptr, indices[3];
1098
1099 indices[0] = lp_build_const_int32(gallivm, 0);
1100 indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_TAGS);
1101 indices[2] = index;
1102 member_ptr = LLVMBuildGEP(builder, ptr, indices, ARRAY_SIZE(indices), "");
1103 return LLVMBuildLoad(builder, member_ptr, "tag_data");
1104 }
1105
1106 #if LP_BUILD_FORMAT_CACHE_DEBUG
1107 static void
1108 s3tc_update_cache_access(struct gallivm_state *gallivm,
1109 LLVMValueRef ptr,
1110 unsigned count,
1111 unsigned index)
1112 {
1113 LLVMBuilderRef builder = gallivm->builder;
1114 LLVMValueRef member_ptr, cache_access;
1115
1116 assert(index == LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_TOTAL ||
1117 index == LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS);
1118
1119 member_ptr = lp_build_struct_get_ptr(gallivm, ptr, index, "");
1120 cache_access = LLVMBuildLoad(builder, member_ptr, "cache_access");
1121 cache_access = LLVMBuildAdd(builder, cache_access,
1122 LLVMConstInt(LLVMInt64TypeInContext(gallivm->context),
1123 count, 0), "");
1124 LLVMBuildStore(builder, cache_access, member_ptr);
1125 }
1126 #endif
1127
1128 /**
1129 * Calculate 1/3(v1-v0) + v0 and 2*1/3(v1-v0) + v0.
1130 * The lerp is performed between the first 2 32bit colors
1131 * in the source vector, both results are returned packed in result vector.
1132 */
1133 static LLVMValueRef
1134 lp_build_lerp23_single(struct lp_build_context *bld,
1135 LLVMValueRef v01)
1136 {
1137 struct gallivm_state *gallivm = bld->gallivm;
1138 LLVMValueRef x, mul, delta, res, v0, v1, elems[8];
1139 const struct lp_type type = bld->type;
1140 LLVMBuilderRef builder = bld->gallivm->builder;
1141 struct lp_type i16_type = lp_wider_type(type);
1142 struct lp_type i32_type = lp_wider_type(i16_type);
1143 struct lp_build_context bld2;
1144
1145 assert(!type.floating && !type.fixed && !type.norm && type.width == 8);
1146
1147 lp_build_context_init(&bld2, gallivm, i16_type);
1148 bld2.type.sign = TRUE;
1149
1150 /* weights 256/3, 256*2/3, with correct rounding */
1151 elems[0] = elems[1] = elems[2] = elems[3] =
1152 lp_build_const_elem(gallivm, i16_type, 255*1/3);
1153 elems[4] = elems[5] = elems[6] = elems[7] =
1154 lp_build_const_elem(gallivm, i16_type, 171);
1155 x = LLVMConstVector(elems, 8);
1156
1157 /*
1158 * v01 has col0 in 32bit elem 0, col1 in elem 1.
1159 * Interleave/unpack will give us separate v0/v1 vectors.
1160 */
1161 v01 = lp_build_interleave2(gallivm, i32_type, v01, v01, 0);
1162 v01 = LLVMBuildBitCast(builder, v01, bld->vec_type, "");
1163
1164 lp_build_unpack2(gallivm, type, i16_type, v01, &v0, &v1);
1165 delta = lp_build_sub(&bld2, v1, v0);
1166
1167 mul = LLVMBuildMul(builder, x, delta, "");
1168
1169 mul = LLVMBuildLShr(builder, mul, lp_build_const_int_vec(gallivm, i16_type, 8), "");
1170 /* lerp optimization: pack now, do add afterwards */
1171 res = lp_build_pack2(gallivm, i16_type, type, mul, bld2.undef);
1172 /* only lower 2 elems are valid - for these v01 is really v0 */
1173 return lp_build_add(bld, res, v01);
1174 }
1175
1176 /*
1177 * decode one dxt1 block.
1178 */
1179 static void
1180 s3tc_decode_block_dxt1(struct gallivm_state *gallivm,
1181 enum pipe_format format,
1182 LLVMValueRef dxt_block,
1183 LLVMValueRef *col)
1184 {
1185 LLVMBuilderRef builder = gallivm->builder;
1186 LLVMValueRef color01, color23, color01_16, color0123;
1187 LLVMValueRef rgba, tmp, a, sel_mask, indices, code, const2;
1188 struct lp_type type8, type32, type16, type64;
1189 struct lp_build_context bld8, bld32, bld16, bld64;
1190 unsigned i;
1191 boolean is_dxt1_variant = format_dxt1_variant(format);
1192
1193 memset(&type32, 0, sizeof type32);
1194 type32.width = 32;
1195 type32.length = 4;
1196 type32.sign = TRUE;
1197
1198 memset(&type8, 0, sizeof type8);
1199 type8.width = 8;
1200 type8.length = 16;
1201
1202 memset(&type16, 0, sizeof type16);
1203 type16.width = 16;
1204 type16.length = 8;
1205
1206 memset(&type64, 0, sizeof type64);
1207 type64.width = 64;
1208 type64.length = 2;
1209
1210 a = lp_build_const_int_vec(gallivm, type32, 0xff000000);
1211 const2 = lp_build_const_int_vec(gallivm, type32, 2);
1212
1213 lp_build_context_init(&bld32, gallivm, type32);
1214 lp_build_context_init(&bld16, gallivm, type16);
1215 lp_build_context_init(&bld8, gallivm, type8);
1216 lp_build_context_init(&bld64, gallivm, type64);
1217
1218 if (is_dxt1_variant) {
1219 color01 = lp_build_shuffle1undef(gallivm, dxt_block, 0, 4);
1220 code = lp_build_shuffle1undef(gallivm, dxt_block, 1, 4);
1221 } else {
1222 color01 = lp_build_shuffle1undef(gallivm, dxt_block, 2, 4);
1223 code = lp_build_shuffle1undef(gallivm, dxt_block, 3, 4);
1224 }
1225 code = LLVMBuildBitCast(builder, code, bld8.vec_type, "");
1226 /* expand bytes to dwords */
1227 code = lp_build_interleave2(gallivm, type8, code, code, 0);
1228 code = lp_build_interleave2(gallivm, type8, code, code, 0);
1229
1230
1231 /*
1232 * works as follows:
1233 * - expand color0/color1 to rgba8888
1234 * - calculate color2/3 (interpolation) according to color0 < color1 rules
1235 * - calculate color2/3 according to color0 >= color1 rules
1236 * - do selection of color2/3 according to comparison of color0/1
1237 * - extract indices.
1238 * - use compare/select to select the correct color. Since we have 2bit
1239 * indices (and 4 colors), needs at least three compare/selects.
1240 */
1241
1242 /*
1243 * expand the two colors
1244 */
1245 color01 = LLVMBuildBitCast(builder, color01, bld16.vec_type, "");
1246 color01 = lp_build_interleave2(gallivm, type16, color01,
1247 bld16.zero, 0);
1248 color01_16 = LLVMBuildBitCast(builder, color01, bld32.vec_type, "");
1249 color01 = color_expand_565_to_8888(gallivm, 4, color01_16);
1250
1251 /*
1252 * interpolate colors
1253 * color2_1 is 2/3 color0 + 1/3 color1
1254 * color3_1 is 1/3 color0 + 2/3 color1
1255 * color2_2 is 1/2 color0 + 1/2 color1
1256 * color3_2 is 0
1257 */
1258
1259 /* TODO: since this is now always scalar, should
1260 * probably just use control flow here instead of calculating
1261 * both cases and then selecting
1262 */
1263 if (format == PIPE_FORMAT_DXT1_RGBA ||
1264 format == PIPE_FORMAT_DXT1_SRGBA) {
1265 color01 = LLVMBuildOr(builder, color01, a, "");
1266 }
1267 /* can combine 2 lerps into one mostly */
1268 color23 = lp_build_lerp23_single(&bld8, color01);
1269 color23 = LLVMBuildBitCast(builder, color23, bld32.vec_type, "");
1270
1271 /* dxt3/5 always use 4-color encoding */
1272 if (is_dxt1_variant) {
1273 LLVMValueRef color23_2, color2_2;
1274
1275 if (util_cpu_caps.has_sse2) {
1276 LLVMValueRef intrargs[2];
1277 intrargs[0] = LLVMBuildBitCast(builder, color01, bld8.vec_type, "");
1278 /* same interleave as for lerp23 - correct result in 2nd element */
1279 intrargs[1] = lp_build_interleave2(gallivm, type32, color01, color01, 0);
1280 intrargs[1] = LLVMBuildBitCast(builder, intrargs[1], bld8.vec_type, "");
1281 color2_2 = lp_build_intrinsic(builder, "llvm.x86.sse2.pavg.b",
1282 bld8.vec_type, intrargs, 2, 0);
1283 }
1284 else {
1285 LLVMValueRef v01, v0, v1, vhalf;
1286 /*
1287 * This isn't as expensive as it looks (the unpack is the same as
1288 * for lerp23, which is the reason why we do the pointless
1289 * interleave2 too), with correct rounding (the two lower elements
1290 * will be the same).
1291 */
1292 v01 = lp_build_interleave2(gallivm, type32, color01, color01, 0);
1293 v01 = LLVMBuildBitCast(builder, v01, bld8.vec_type, "");
1294 lp_build_unpack2(gallivm, type8, type16, v01, &v0, &v1);
1295 vhalf = lp_build_add(&bld16, v0, v1);
1296 vhalf = LLVMBuildLShr(builder, vhalf, bld16.one, "");
1297 color2_2 = lp_build_pack2(gallivm, type16, type8, vhalf, bld16.undef);
1298 }
1299 /* shuffle in color 3 as elem 2 zero, color 2 elem 1 */
1300 color23_2 = LLVMBuildBitCast(builder, color2_2, bld64.vec_type, "");
1301 color23_2 = LLVMBuildLShr(builder, color23_2,
1302 lp_build_const_int_vec(gallivm, type64, 32), "");
1303 color23_2 = LLVMBuildBitCast(builder, color23_2, bld32.vec_type, "");
1304
1305 tmp = LLVMBuildBitCast(builder, color01_16, bld64.vec_type, "");
1306 tmp = LLVMBuildLShr(builder, tmp,
1307 lp_build_const_int_vec(gallivm, type64, 32), "");
1308 tmp = LLVMBuildBitCast(builder, tmp, bld32.vec_type, "");
1309 sel_mask = lp_build_compare(gallivm, type32, PIPE_FUNC_GREATER,
1310 color01_16, tmp);
1311 sel_mask = lp_build_interleave2(gallivm, type32, sel_mask, sel_mask, 0);
1312 color23 = lp_build_select(&bld32, sel_mask, color23, color23_2);
1313 }
1314
1315 if (util_cpu_caps.has_ssse3) {
1316 /*
1317 * Use pshufb as mini-lut. (Only doable with intrinsics as the
1318 * final shuffles are non-constant. pshufb is awesome!)
1319 */
1320 LLVMValueRef shuf[16], low2mask;
1321 LLVMValueRef intrargs[2], lut_ind, lut_adj;
1322
1323 color01 = LLVMBuildBitCast(builder, color01, bld64.vec_type, "");
1324 color23 = LLVMBuildBitCast(builder, color23, bld64.vec_type, "");
1325 color0123 = lp_build_interleave2(gallivm, type64, color01, color23, 0);
1326 color0123 = LLVMBuildBitCast(builder, color0123, bld32.vec_type, "");
1327
1328 if (format == PIPE_FORMAT_DXT1_RGB ||
1329 format == PIPE_FORMAT_DXT1_SRGB) {
1330 color0123 = LLVMBuildOr(builder, color0123, a, "");
1331 }
1332
1333 /* shuffle as r0r1r2r3g0g1... */
1334 for (i = 0; i < 4; i++) {
1335 shuf[4*i] = lp_build_const_int32(gallivm, 0 + i);
1336 shuf[4*i+1] = lp_build_const_int32(gallivm, 4 + i);
1337 shuf[4*i+2] = lp_build_const_int32(gallivm, 8 + i);
1338 shuf[4*i+3] = lp_build_const_int32(gallivm, 12 + i);
1339 }
1340 color0123 = LLVMBuildBitCast(builder, color0123, bld8.vec_type, "");
1341 color0123 = LLVMBuildShuffleVector(builder, color0123, bld8.undef,
1342 LLVMConstVector(shuf, 16), "");
1343
1344 /* lowest 2 bits of each 8 bit value contain index into "LUT" */
1345 low2mask = lp_build_const_int_vec(gallivm, type8, 3);
1346 /* add 0/4/8/12 for r/g/b/a */
1347 lut_adj = lp_build_const_int_vec(gallivm, type32, 0x0c080400);
1348 lut_adj = LLVMBuildBitCast(builder, lut_adj, bld8.vec_type, "");
1349 intrargs[0] = color0123;
1350 for (i = 0; i < 4; i++) {
1351 lut_ind = LLVMBuildAnd(builder, code, low2mask, "");
1352 lut_ind = LLVMBuildOr(builder, lut_ind, lut_adj, "");
1353 intrargs[1] = lut_ind;
1354 col[i] = lp_build_intrinsic(builder, "llvm.x86.ssse3.pshuf.b.128",
1355 bld8.vec_type, intrargs, 2, 0);
1356 col[i] = LLVMBuildBitCast(builder, col[i], bld32.vec_type, "");
1357 code = LLVMBuildBitCast(builder, code, bld32.vec_type, "");
1358 code = LLVMBuildLShr(builder, code, const2, "");
1359 code = LLVMBuildBitCast(builder, code, bld8.vec_type, "");
1360 }
1361 }
1362 else {
1363 /* Thanks to vectorization can do 4 texels in parallel */
1364 LLVMValueRef color0, color1, color2, color3;
1365 if (format == PIPE_FORMAT_DXT1_RGB ||
1366 format == PIPE_FORMAT_DXT1_SRGB) {
1367 color01 = LLVMBuildOr(builder, color01, a, "");
1368 color23 = LLVMBuildOr(builder, color23, a, "");
1369 }
1370 color0 = LLVMBuildShuffleVector(builder, color01, bld32.undef,
1371 lp_build_const_shuffle1(gallivm, 0, 4), "");
1372 color1 = LLVMBuildShuffleVector(builder, color01, bld32.undef,
1373 lp_build_const_shuffle1(gallivm, 1, 4), "");
1374 color2 = LLVMBuildShuffleVector(builder, color23, bld32.undef,
1375 lp_build_const_shuffle1(gallivm, 0, 4), "");
1376 color3 = LLVMBuildShuffleVector(builder, color23, bld32.undef,
1377 lp_build_const_shuffle1(gallivm, 1, 4), "");
1378 code = LLVMBuildBitCast(builder, code, bld32.vec_type, "");
1379
1380 for (i = 0; i < 4; i++) {
1381 /* select the colors */
1382 LLVMValueRef selmasklo, rgba01, rgba23, bitlo;
1383 bitlo = bld32.one;
1384 indices = LLVMBuildAnd(builder, code, bitlo, "");
1385 selmasklo = lp_build_compare(gallivm, type32, PIPE_FUNC_EQUAL,
1386 indices, bitlo);
1387 rgba01 = lp_build_select(&bld32, selmasklo, color1, color0);
1388
1389 LLVMValueRef selmaskhi;
1390 indices = LLVMBuildAnd(builder, code, const2, "");
1391 selmaskhi = lp_build_compare(gallivm, type32, PIPE_FUNC_EQUAL,
1392 indices, const2);
1393 rgba23 = lp_build_select(&bld32, selmasklo, color3, color2);
1394 rgba = lp_build_select(&bld32, selmaskhi, rgba23, rgba01);
1395
1396 /*
1397 * Note that this will give "wrong" order.
1398 * col0 will be rgba0, rgba4, rgba8, rgba12, col1 rgba1, rgba5, ...
1399 * This would be easily fixable by using different shuffle, bitlo/hi
1400 * vectors above (and a different shift), but this order is also slightly
1401 * easier to deal with for dxt3/dxt5 alpha, so the lookup is changed instead.
1402 */
1403 col[i] = rgba;
1404 code = LLVMBuildLShr(builder, code, const2, "");
1405 }
1406 }
1407 }
1408
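/*
 * Resulting layout (sketch, spelling out the ordering comment above): after
 * the decode, col[n] holds every fourth texel of the 4x4 block,
 *
 *    col[0] = { rgba0, rgba4, rgba8,  rgba12 }
 *    col[1] = { rgba1, rgba5, rgba9,  rgba13 }
 *    col[2] = { rgba2, rgba6, rgba10, rgba14 }
 *    col[3] = { rgba3, rgba7, rgba11, rgba15 }
 *
 * and the cached-block lookup compensates for this order.
 */
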
1409 /*
1410 * decode one dxt3 block.
1411 */
1412 static void
1413 s3tc_decode_block_dxt3(struct gallivm_state *gallivm,
1414 enum pipe_format format,
1415 LLVMValueRef dxt_block,
1416 LLVMValueRef *col)
1417 {
1418 LLVMBuilderRef builder = gallivm->builder;
1419 LLVMValueRef alpha, alphas0, alphas1, shift4_16, a[4], mask8hi;
1420 struct lp_type type32, type8, type16;
1421 unsigned i;
1422
1423 memset(&type32, 0, sizeof type32);
1424 type32.width = 32;
1425 type32.length = 4;
1426
1427 memset(&type8, 0, sizeof type8);
1428 type8.width = 8;
1429 type8.length = 16;
1430
1431 memset(&type16, 0, sizeof type16);
1432 type16.width = 16;
1433 type16.length = 8;
1434
1435 s3tc_decode_block_dxt1(gallivm, format, dxt_block, col);
1436
1437 shift4_16 = lp_build_const_int_vec(gallivm, type16, 4);
1438 mask8hi = lp_build_const_int_vec(gallivm, type32, 0xff000000);
1439
1440 alpha = LLVMBuildBitCast(builder, dxt_block,
1441 lp_build_vec_type(gallivm, type8), "");
1442 alpha = lp_build_interleave2(gallivm, type8, alpha, alpha, 0);
1443 alpha = LLVMBuildBitCast(builder, alpha,
1444 lp_build_vec_type(gallivm, type16), "");
1445 alpha = LLVMBuildAnd(builder, alpha,
1446 lp_build_const_int_vec(gallivm, type16, 0xf00f), "");
1447 alphas0 = LLVMBuildLShr(builder, alpha, shift4_16, "");
1448 alphas1 = LLVMBuildShl(builder, alpha, shift4_16, "");
1449 alpha = LLVMBuildOr(builder, alphas0, alpha, "");
1450 alpha = LLVMBuildOr(builder, alphas1, alpha, "");
1451 alpha = LLVMBuildBitCast(builder, alpha,
1452 lp_build_vec_type(gallivm, type32), "");
1453 /*
1454 * alpha now contains elems 0,1,2,3,... (ubytes)
1455 * we need 0,4,8,12, 1,5,9,13 etc. in dwords to match color (which
1456 * is just as easy as "natural" order - 3 shift/and instead of 6 unpack).
1457 */
1458 a[0] = LLVMBuildShl(builder, alpha,
1459 lp_build_const_int_vec(gallivm, type32, 24), "");
1460 a[1] = LLVMBuildShl(builder, alpha,
1461 lp_build_const_int_vec(gallivm, type32, 16), "");
1462 a[1] = LLVMBuildAnd(builder, a[1], mask8hi, "");
1463 a[2] = LLVMBuildShl(builder, alpha,
1464 lp_build_const_int_vec(gallivm, type32, 8), "");
1465 a[2] = LLVMBuildAnd(builder, a[2], mask8hi, "");
1466 a[3] = LLVMBuildAnd(builder, alpha, mask8hi, "");
1467
1468 for (i = 0; i < 4; i++) {
1469 col[i] = LLVMBuildOr(builder, col[i], a[i], "");
1470 }
1471 }
1472
1473
1474 static LLVMValueRef
1475 lp_build_lerpdxta_block(struct gallivm_state *gallivm,
1476 LLVMValueRef alpha0,
1477 LLVMValueRef alpha1,
1478 LLVMValueRef code,
1479 LLVMValueRef sel_mask)
1480 {
1481 LLVMBuilderRef builder = gallivm->builder;
1482 LLVMValueRef delta, ainterp;
1483 LLVMValueRef weight5, weight7, weight;
1484 struct lp_type type16;
1485 struct lp_build_context bld;
1486
1487 memset(&type16, 0, sizeof type16);
1488 type16.width = 16;
1489 type16.length = 8;
1490 type16.sign = TRUE;
1491
1492 lp_build_context_init(&bld, gallivm, type16);
1493 /*
1494 * 256/7 is only 36.57 so we'd lose quite some precision. Since it would
1495 * actually be desirable to do this here with even higher accuracy
1496 * than 8 bit (more or less required for rgtc, albeit that's not handled
1497 * here right now), shift the weights after multiplication by code.
1498 */
1499 weight5 = lp_build_const_int_vec(gallivm, type16, 256*64/5);
1500 weight7 = lp_build_const_int_vec(gallivm, type16, 256*64/7);
1501 weight = lp_build_select(&bld, sel_mask, weight7, weight5);
1502
1503 /*
1504 * we'll get garbage in the elements which had code 0 (or larger than
1505 * 5 or 7), but that's fine since those values get fixed up later anyway.
1506 */
1507 code = LLVMBuildSub(builder, code, bld.one, "");
1508
1509 weight = LLVMBuildMul(builder, weight, code, "");
1510 weight = LLVMBuildLShr(builder, weight,
1511 lp_build_const_int_vec(gallivm, type16, 6), "");
1512
1513 delta = LLVMBuildSub(builder, alpha1, alpha0, "");
1514
1515 ainterp = LLVMBuildMul(builder, delta, weight, "");
1516 ainterp = LLVMBuildLShr(builder, ainterp,
1517 lp_build_const_int_vec(gallivm, type16, 8), "");
1518
1519 /* lerp is done later (with packed values) */
1520
1521 return ainterp;
1522 }
1523
1524
1525 /*
1526 * decode one dxt5 block.
1527 */
1528 static void
1529 s3tc_decode_block_dxt5(struct gallivm_state *gallivm,
1530 enum pipe_format format,
1531 LLVMValueRef dxt_block,
1532 LLVMValueRef *col)
1533 {
1534 LLVMBuilderRef builder = gallivm->builder;
1535 LLVMValueRef alpha, alpha0, alpha1, ares;
1536 LLVMValueRef ainterp, ainterp0, ainterp1, shuffle1, sel_mask, sel_mask2;
1537 LLVMValueRef a[4], acode, tmp0, tmp1;
1538 LLVMTypeRef i64t, i32t;
1539 struct lp_type type32, type64, type8, type16;
1540 struct lp_build_context bld16, bld8;
1541 unsigned i;
1542
1543 memset(&type32, 0, sizeof type32);
1544 type32.width = 32;
1545 type32.length = 4;
1546
1547 memset(&type64, 0, sizeof type64);
1548 type64.width = 64;
1549 type64.length = 2;
1550
1551 memset(&type8, 0, sizeof type8);
1552 type8.width = 8;
1553 type8.length = 16;
1554
1555 memset(&type16, 0, sizeof type16);
1556 type16.width = 16;
1557 type16.length = 8;
1558
1559 lp_build_context_init(&bld16, gallivm, type16);
1560 lp_build_context_init(&bld8, gallivm, type8);
1561
1562 i64t = lp_build_vec_type(gallivm, type64);
1563 i32t = lp_build_vec_type(gallivm, type32);
1564
1565 s3tc_decode_block_dxt1(gallivm, format, dxt_block, col);
1566
1567 /*
1568 * three possible strategies for vectorizing alpha:
1569 * 1) compute all 8 values then use scalar extraction
1570 * (i.e. have all 8 alpha values packed in one 64bit scalar
1571 * and do something like ax = vals >> (codex * 8) followed
1572 * by inserting these values back into color)
1573 * 2) same as 1) but use pshufb as a mini-LUT for the selection.
1574 * (without pshufb this would need boatloads of cmp/selects trying to
1575 * keep things vectorized for what is essentially scalar selection).
1576 * 3) do something similar to the uncached case
1577 * needs more calculations (we need to calc 16 values instead of 8, though
1578 * that's only an issue for the lerp, which then has to be done twice; otherwise
1579 * everything still fits into 128bit) but keeps things mostly vectorized.
1580 * Without pshufb we use 3) here, though it's not clear it's really faster...
1581 * With pshufb we use 2) instead (cheaper and more accurate).
1582 */
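   /*
    * Whatever the strategy, the result to produce is standard dxt5 alpha;
    * as a scalar reference sketch (a0/a1 = endpoint bytes, acodebits = the
    * 48 bits of 3-bit codes, t = texel index; the jit paths approximate the
    * divisions):
    *
    *    unsigned c = (acodebits >> (3 * t)) & 7;
    *    if (c == 0)       alpha = a0;
    *    else if (c == 1)  alpha = a1;
    *    else if (a0 > a1) alpha = ((8 - c) * a0 + (c - 1) * a1) / 7;
    *    else if (c == 6)  alpha = 0;
    *    else if (c == 7)  alpha = 255;
    *    else              alpha = ((6 - c) * a0 + (c - 1) * a1) / 5;
    */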
1583
1584 /*
1585 * Ideally, we'd use 2 variable 16bit shifts here (byte shifts wouldn't
1586 * help since code crosses 8bit boundaries). But variable shifts are
1587 * AVX2 only, and even then only dword/quadword (intel _really_ hates
1588 * shifts!). Instead, emulate by 16bit muls.
1589 * Also, the required byte shuffles are essentially non-emulatable, so
1590 * require ssse3 (albeit other archs might do them fine).
1591 * This is not directly tied to ssse3 - we just need sane byte shuffles.
1592 * But the element ordering below differs between the two paths, so use the same condition.
1593 */
1594
1595
1596 /* vectorize alpha */
1597 alpha = LLVMBuildBitCast(builder, dxt_block, i64t, "");
1598 alpha0 = LLVMBuildAnd(builder, alpha,
1599 lp_build_const_int_vec(gallivm, type64, 0xff), "");
1600 alpha0 = LLVMBuildBitCast(builder, alpha0, bld16.vec_type, "");
1601 alpha = LLVMBuildBitCast(builder, alpha, bld16.vec_type, "");
1602 alpha1 = LLVMBuildLShr(builder, alpha,
1603 lp_build_const_int_vec(gallivm, type16, 8), "");
1604 alpha = LLVMBuildBitCast(builder, alpha, i64t, "");
1605 shuffle1 = lp_build_const_shuffle1(gallivm, 0, 8);
1606 /* XXX this shuffle broken with LLVM 2.8 */
1607 alpha0 = LLVMBuildShuffleVector(builder, alpha0, alpha0, shuffle1, "");
1608 alpha1 = LLVMBuildShuffleVector(builder, alpha1, alpha1, shuffle1, "");
1609
1610 type16.sign = TRUE;
1611 sel_mask = lp_build_compare(gallivm, type16, PIPE_FUNC_GREATER,
1612 alpha0, alpha1);
1613 type16.sign = FALSE;
1614 sel_mask = LLVMBuildBitCast(builder, sel_mask, bld8.vec_type, "");
1615
1616 if (!util_cpu_caps.has_ssse3) {
1617 LLVMValueRef acodeg, mask1, acode0, acode1;
1618
1619 /* extraction of the 3 bit values into something more useful is HARD */
1620 /* first steps are actually scalar */
1621 acode = LLVMBuildLShr(builder, alpha,
1622 lp_build_const_int_vec(gallivm, type64, 16), "");
1623 tmp0 = LLVMBuildAnd(builder, acode,
1624 lp_build_const_int_vec(gallivm, type64, 0xffffff), "");
1625 tmp1 = LLVMBuildLShr(builder, acode,
1626 lp_build_const_int_vec(gallivm, type64, 24), "");
1627 tmp0 = LLVMBuildBitCast(builder, tmp0, i32t, "");
1628 tmp1 = LLVMBuildBitCast(builder, tmp1, i32t, "");
1629 acode = lp_build_interleave2(gallivm, type32, tmp0, tmp1, 0);
1630 /* now have 2x24bit in 4x32bit, order 01234567, 89..., undef, undef */
1631 tmp0 = LLVMBuildAnd(builder, acode,
1632 lp_build_const_int_vec(gallivm, type32, 0xfff), "");
1633 tmp1 = LLVMBuildLShr(builder, acode,
1634 lp_build_const_int_vec(gallivm, type32, 12), "");
1635 acode = lp_build_interleave2(gallivm, type32, tmp0, tmp1, 0);
1636 /* now have 4x12bit in 4x32bit, order 0123, 4567, ... */
1637 tmp0 = LLVMBuildAnd(builder, acode,
1638 lp_build_const_int_vec(gallivm, type32, 0x3f), "");
1639 tmp1 = LLVMBuildLShr(builder, acode,
1640 lp_build_const_int_vec(gallivm, type32, 6), "");
1641 /* use signed pack (sign doesn't matter here, and unsigned pack would need sse41) */
1642 type32.sign = type16.sign = TRUE;
1643 acode = lp_build_pack2(gallivm, type32, type16, tmp0, tmp1);
1644 type32.sign = type16.sign = FALSE;
1645 /* now have 8x6bit in 8x16bit, 01, 45, 89, ..., 23, 67, ... */
1646 acode0 = LLVMBuildAnd(builder, acode,
1647 lp_build_const_int_vec(gallivm, type16, 0x7), "");
1648 acode1 = LLVMBuildLShr(builder, acode,
1649 lp_build_const_int_vec(gallivm, type16, 3), "");
1650 acode = lp_build_pack2(gallivm, type16, type8, acode0, acode1);
1651 /* acode0 contains elems 0,4,8,12,2,6,10,14, acode1 1,5,9,... */
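      /*
       * Net effect of the staged splitting above (sketch): for each texel t
       * this is equivalent to the scalar extraction
       *
       *    code_t = (acodebits >> (3 * t)) & 7;
       *
       * it just ends up in the interleaved element order noted above instead
       * of 0,1,2,...
       */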
1652
1653 acodeg = LLVMBuildAnd(builder, acode,
1654 LLVMBuildNot(builder, sel_mask, ""), "");
1655 mask1 = lp_build_compare(gallivm, type8, PIPE_FUNC_EQUAL,
1656 acode, bld8.one);
1657
1658 sel_mask = LLVMBuildBitCast(builder, sel_mask, bld16.vec_type, "");
1659 ainterp0 = lp_build_lerpdxta_block(gallivm, alpha0, alpha1, acode0, sel_mask);
1660 ainterp1 = lp_build_lerpdxta_block(gallivm, alpha0, alpha1, acode1, sel_mask);
1661 sel_mask = LLVMBuildBitCast(builder, sel_mask, bld8.vec_type, "");
1662 ainterp = lp_build_pack2(gallivm, type16, type8, ainterp0, ainterp1);
1663 alpha0 = lp_build_pack2(gallivm, type16, type8, alpha0, alpha0);
1664 alpha1 = lp_build_pack2(gallivm, type16, type8, alpha1, alpha1);
1665 ainterp = LLVMBuildAdd(builder, ainterp, alpha0, "");
1666 /* Fix up the code 0 / code 1 cases (select a0 / a1 directly) */
1667 sel_mask2 = lp_build_compare(gallivm, type8, PIPE_FUNC_EQUAL,
1668 acode, bld8.zero);
1669 ainterp = lp_build_select(&bld8, sel_mask2, alpha0, ainterp);
1670 ainterp = lp_build_select(&bld8, mask1, alpha1, ainterp);
1671
1672 /* fix up the code 6 / code 7 cases (0 / 0xff) if a0 <= a1 */
1673 sel_mask2 = lp_build_compare(gallivm, type8, PIPE_FUNC_EQUAL,
1674 acodeg, lp_build_const_int_vec(gallivm, type8, 6));
1675 ares = LLVMBuildAnd(builder, ainterp, LLVMBuildNot(builder, sel_mask2, ""), "");
1676 sel_mask2 = lp_build_compare(gallivm, type8, PIPE_FUNC_EQUAL,
1677 acodeg, lp_build_const_int_vec(gallivm, type8, 7));
1678 ares = LLVMBuildOr(builder, ares, sel_mask2, "");
1679
1680 /* unpack in right order (0,4,8,12,1,5,..) */
1681 /* this gives us zero, a0, zero, a4, zero, a8, ... for tmp0 */
1682 tmp0 = lp_build_interleave2(gallivm, type8, bld8.zero, ares, 0);
1683 tmp1 = lp_build_interleave2(gallivm, type8, bld8.zero, ares, 1);
1684 tmp0 = LLVMBuildBitCast(builder, tmp0, bld16.vec_type, "");
1685 tmp1 = LLVMBuildBitCast(builder, tmp1, bld16.vec_type, "");
1686
1687 a[0] = lp_build_interleave2(gallivm, type16, bld16.zero, tmp0, 0);
1688 a[1] = lp_build_interleave2(gallivm, type16, bld16.zero, tmp1, 0);
1689 a[2] = lp_build_interleave2(gallivm, type16, bld16.zero, tmp0, 1);
1690 a[3] = lp_build_interleave2(gallivm, type16, bld16.zero, tmp1, 1);
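      /*
       * Sketch of the widening trick above: interleaving the byte vector with
       * zero twice zero-extends each byte to 32 bits and leaves it in the top
       * byte, i.e. each resulting dword is roughly
       *
       *    (uint32_t)ares_byte << 24
       *
       * which is exactly where the alpha channel needs to end up.
       */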
1691 }
1692 else {
1693 LLVMValueRef elems[16], intrargs[2], shufa, mulclo, mulchi, mask8hi;
1694 LLVMTypeRef type16s = LLVMInt16TypeInContext(gallivm->context);
1695 LLVMTypeRef type8s = LLVMInt8TypeInContext(gallivm->context);
1696 unsigned i, j;
1697 /*
1698 * Ideally, we'd use 2 variable 16bit shifts here (byte shifts wouldn't
1699 * help since code crosses 8bit boundaries). But variable shifts are
1700 * AVX2 only, and even then only dword/quadword (intel _really_ hates
1701 * shifts!). Instead, emulate by 16bit muls.
1702 * Also, the required byte shuffles are essentially non-emulatable, so
1703 * require ssse3 (albeit other archs might do them fine, but the
1704 * complete path is ssse3 only for now).
1705 */
1706 for (i = 0, j = 0; i < 16; i += 8, j += 3) {
1707 elems[i+0] = elems[i+1] = elems[i+2] = lp_build_const_int32(gallivm, j+2);
1708 elems[i+3] = elems[i+4] = lp_build_const_int32(gallivm, j+3);
1709 elems[i+5] = elems[i+6] = elems[i+7] = lp_build_const_int32(gallivm, j+4);
1710 }
1711 shufa = LLVMConstVector(elems, 16);
1712 alpha = LLVMBuildBitCast(builder, alpha, bld8.vec_type, "");
1713 acode = LLVMBuildShuffleVector(builder, alpha, bld8.undef, shufa, "");
1714 acode = LLVMBuildBitCast(builder, acode, bld16.vec_type, "");
1715 /*
1716 * Put 0/2/4/6 into high 3 bits of 16 bits (save AND mask)
1717 * Do the same for 1/3/5/7 (albeit still need mask there - ideally
1718 * we'd place them into bits 4-7 so could save shift but impossible.)
1719 */
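      /*
       * Sketch of the multiply trick used below: a 16bit multiply by
       * 1 << (13 - pos) acts as a variable left shift, moving the 3-bit code
       * sitting at bit 'pos' into bits 13..15 (anything above bit 15 simply
       * drops out of the 16bit product), so per 16bit element
       *
       *    (uint16_t)(v * (1 << (13 - pos))) >> 13  ==  (v >> pos) & 7
       *
       * which is what the shifts/masks after the muls rely on.
       */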
1720 for (i = 0; i < 8; i += 4) {
1721 elems[i+0] = LLVMConstInt(type16s, 1 << (13-0), 0);
1722 elems[i+1] = LLVMConstInt(type16s, 1 << (13-6), 0);
1723 elems[i+2] = LLVMConstInt(type16s, 1 << (13-4), 0);
1724 elems[i+3] = LLVMConstInt(type16s, 1 << (13-2), 0);
1725 }
1726 mulclo = LLVMConstVector(elems, 8);
1727 for (i = 0; i < 8; i += 4) {
1728 elems[i+0] = LLVMConstInt(type16s, 1 << (13-3), 0);
1729 elems[i+1] = LLVMConstInt(type16s, 1 << (13-9), 0);
1730 elems[i+2] = LLVMConstInt(type16s, 1 << (13-7), 0);
1731 elems[i+3] = LLVMConstInt(type16s, 1 << (13-5), 0);
1732 }
1733 mulchi = LLVMConstVector(elems, 8);
1734
1735 tmp0 = LLVMBuildMul(builder, acode, mulclo, "");
1736 tmp1 = LLVMBuildMul(builder, acode, mulchi, "");
1737 tmp0 = LLVMBuildLShr(builder, tmp0,
1738 lp_build_const_int_vec(gallivm, type16, 13), "");
1739 tmp1 = LLVMBuildLShr(builder, tmp1,
1740 lp_build_const_int_vec(gallivm, type16, 5), "");
1741 tmp1 = LLVMBuildAnd(builder, tmp1,
1742 lp_build_const_int_vec(gallivm, type16, 0x700), "");
1743 acode = LLVMBuildOr(builder, tmp0, tmp1, "");
1744 acode = LLVMBuildBitCast(builder, acode, bld8.vec_type, "");
1745
1746 /*
1747 * Note that the element ordering here differs from the non-ssse3 path:
1748 * 0/1/2/3/4/5...
1749 */
1750
1751 LLVMValueRef weight0, weight1, weight, delta;
1752 LLVMValueRef constff_elem7, const0_elem6;
1753 /* weights, correctly rounded (round(256*x/7)) */
1754 elems[0] = LLVMConstInt(type16s, 256, 0);
1755 elems[1] = LLVMConstInt(type16s, 0, 0);
1756 elems[2] = LLVMConstInt(type16s, 219, 0);
1757 elems[3] = LLVMConstInt(type16s, 183, 0);
1758 elems[4] = LLVMConstInt(type16s, 146, 0);
1759 elems[5] = LLVMConstInt(type16s, 110, 0);
1760 elems[6] = LLVMConstInt(type16s, 73, 0);
1761 elems[7] = LLVMConstInt(type16s, 37, 0);
1762 weight0 = LLVMConstVector(elems, 8);
1763
1764 elems[0] = LLVMConstInt(type16s, 256, 0);
1765 elems[1] = LLVMConstInt(type16s, 0, 0);
1766 elems[2] = LLVMConstInt(type16s, 205, 0);
1767 elems[3] = LLVMConstInt(type16s, 154, 0);
1768 elems[4] = LLVMConstInt(type16s, 102, 0);
1769 elems[5] = LLVMConstInt(type16s, 51, 0);
1770 elems[6] = LLVMConstInt(type16s, 0, 0);
1771 elems[7] = LLVMConstInt(type16s, 0, 0);
1772 weight1 = LLVMConstVector(elems, 8);
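      /*
       * For reference, with c = the 3-bit code these tables are
       *
       *    weight0[c] = round(256 * (8 - c) / 7)   for c = 2..7   (a0 > a1)
       *    weight1[c] = round(256 * (6 - c) / 5)   for c = 2..5   (a0 <= a1)
       *
       * with weight[0] = 256 (pure a0), weight[1] = 0 (pure a1); the 0/0xff
       * entries of the a0 <= a1 case are fixed up further below.
       */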
1773
1774 weight0 = LLVMBuildBitCast(builder, weight0, bld8.vec_type, "");
1775 weight1 = LLVMBuildBitCast(builder, weight1, bld8.vec_type, "");
1776 weight = lp_build_select(&bld8, sel_mask, weight0, weight1);
1777 weight = LLVMBuildBitCast(builder, weight, bld16.vec_type, "");
1778
1779 for (i = 0; i < 16; i++) {
1780 elems[i] = LLVMConstNull(type8s);
1781 }
1782 elems[7] = LLVMConstInt(type8s, 255, 0);
1783 constff_elem7 = LLVMConstVector(elems, 16);
1784
1785 for (i = 0; i < 16; i++) {
1786 elems[i] = LLVMConstInt(type8s, 255, 0);
1787 }
1788 elems[6] = LLVMConstInt(type8s, 0, 0);
1789 const0_elem6 = LLVMConstVector(elems, 16);
1790
1791 /* standard simple lerp - but the version we need isn't available */
1792 delta = LLVMBuildSub(builder, alpha0, alpha1, "");
1793 ainterp = LLVMBuildMul(builder, delta, weight, "");
1794 ainterp = LLVMBuildLShr(builder, ainterp,
1795 lp_build_const_int_vec(gallivm, type16, 8), "");
1796 ainterp = LLVMBuildBitCast(builder, ainterp, bld8.vec_type, "");
1797 alpha1 = LLVMBuildBitCast(builder, alpha1, bld8.vec_type, "");
1798 ainterp = LLVMBuildAdd(builder, ainterp, alpha1, "");
1799 ainterp = LLVMBuildBitCast(builder, ainterp, bld16.vec_type, "");
1800 ainterp = lp_build_pack2(gallivm, type16, type8, ainterp, bld16.undef);
1801
1802 /* fixing 0/0xff case is slightly more complex */
1803 constff_elem7 = LLVMBuildAnd(builder, constff_elem7,
1804 LLVMBuildNot(builder, sel_mask, ""), "");
1805 const0_elem6 = LLVMBuildOr(builder, const0_elem6, sel_mask, "");
1806 ainterp = LLVMBuildOr(builder, ainterp, constff_elem7, "");
1807 ainterp = LLVMBuildAnd(builder, ainterp, const0_elem6, "");
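      /*
       * In scalar terms the fixup above amounts to (sketch, per block):
       *
       *    if (a0 <= a1) {
       *       table[6] = 0;
       *       table[7] = 0xff;
       *    }
       *
       * applied to the 8-entry alpha table before the pshufb lookup.
       */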
1808
1809 /* now pick all 16 elements at once! */
1810 intrargs[0] = ainterp;
1811 intrargs[1] = acode;
1812 ares = lp_build_intrinsic(builder, "llvm.x86.ssse3.pshuf.b.128",
1813 bld8.vec_type, intrargs, 2, 0);
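      /*
       * pshufb as mini-LUT: per byte lane the intrinsic computes
       *
       *    ares[x] = (acode[x] & 0x80) ? 0 : ainterp[acode[x] & 0x0f];
       *
       * and since acode only holds values 0..7 here, this is a straight
       * 16-wide lookup of the 8 alpha values.
       */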
1814
1815 ares = LLVMBuildBitCast(builder, ares, i32t, "");
1816 mask8hi = lp_build_const_int_vec(gallivm, type32, 0xff000000);
1817 a[0] = LLVMBuildShl(builder, ares,
1818 lp_build_const_int_vec(gallivm, type32, 24), "");
1819 a[1] = LLVMBuildShl(builder, ares,
1820 lp_build_const_int_vec(gallivm, type32, 16), "");
1821 a[1] = LLVMBuildAnd(builder, a[1], mask8hi, "");
1822 a[2] = LLVMBuildShl(builder, ares,
1823 lp_build_const_int_vec(gallivm, type32, 8), "");
1824 a[2] = LLVMBuildAnd(builder, a[2], mask8hi, "");
1825 a[3] = LLVMBuildAnd(builder, ares, mask8hi, "");
1826 }
1827
1828 for (i = 0; i < 4; i++) {
1829 a[i] = LLVMBuildBitCast(builder, a[i], i32t, "");
1830 col[i] = LLVMBuildOr(builder, col[i], a[i], "");
1831 }
1832 }
1833
1834
1835 static void
1836 generate_update_cache_one_block(struct gallivm_state *gallivm,
1837 LLVMValueRef function,
1838 const struct util_format_description *format_desc)
1839 {
1840 LLVMBasicBlockRef block;
1841 LLVMBuilderRef old_builder;
1842 LLVMValueRef ptr_addr;
1843 LLVMValueRef hash_index;
1844 LLVMValueRef cache;
1845 LLVMValueRef dxt_block, tag_value;
1846 LLVMValueRef col[LP_MAX_VECTOR_LENGTH];
1847
1848 ptr_addr = LLVMGetParam(function, 0);
1849 hash_index = LLVMGetParam(function, 1);
1850 cache = LLVMGetParam(function, 2);
1851
1852 lp_build_name(ptr_addr, "ptr_addr" );
1853 lp_build_name(hash_index, "hash_index");
1854 lp_build_name(cache, "cache_addr");
1855
1856 /*
1857 * Function body
1858 */
1859
1860 old_builder = gallivm->builder;
1861 block = LLVMAppendBasicBlockInContext(gallivm->context, function, "entry");
1862 gallivm->builder = LLVMCreateBuilderInContext(gallivm->context);
1863 LLVMPositionBuilderAtEnd(gallivm->builder, block);
1864
1865 lp_build_gather_s3tc_simple_scalar(gallivm, format_desc, &dxt_block,
1866 ptr_addr);
1867
1868 switch (format_desc->format) {
1869 case PIPE_FORMAT_DXT1_RGB:
1870 case PIPE_FORMAT_DXT1_RGBA:
1871 case PIPE_FORMAT_DXT1_SRGB:
1872 case PIPE_FORMAT_DXT1_SRGBA:
1873 s3tc_decode_block_dxt1(gallivm, format_desc->format, dxt_block, col);
1874 break;
1875 case PIPE_FORMAT_DXT3_RGBA:
1876 case PIPE_FORMAT_DXT3_SRGBA:
1877 s3tc_decode_block_dxt3(gallivm, format_desc->format, dxt_block, col);
1878 break;
1879 case PIPE_FORMAT_DXT5_RGBA:
1880 case PIPE_FORMAT_DXT5_SRGBA:
1881 s3tc_decode_block_dxt5(gallivm, format_desc->format, dxt_block, col);
1882 break;
1883 default:
1884 assert(0);
1885 s3tc_decode_block_dxt1(gallivm, format_desc->format, dxt_block, col);
1886 break;
1887 }
1888
1889 tag_value = LLVMBuildPtrToInt(gallivm->builder, ptr_addr,
1890 LLVMInt64TypeInContext(gallivm->context), "");
1891 s3tc_store_cached_block(gallivm, col, tag_value, hash_index, cache);
1892
1893 LLVMBuildRetVoid(gallivm->builder);
1894
1895 LLVMDisposeBuilder(gallivm->builder);
1896 gallivm->builder = old_builder;
1897
1898 gallivm_verify_function(gallivm, function);
1899 }
1900
1901
1902 static void
1903 update_cached_block(struct gallivm_state *gallivm,
1904 const struct util_format_description *format_desc,
1905 LLVMValueRef ptr_addr,
1906 LLVMValueRef hash_index,
1907 LLVMValueRef cache)
1908
1909 {
1910 LLVMBuilderRef builder = gallivm->builder;
1911 LLVMModuleRef module = gallivm->module;
1912 char name[256];
1913 LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context);
1914 LLVMTypeRef pi8t = LLVMPointerType(i8t, 0);
1915 LLVMValueRef function, inst;
1916 LLVMBasicBlockRef bb;
1917 LLVMValueRef args[3];
1918
1919 util_snprintf(name, sizeof name, "%s_update_cache_one_block",
1920 format_desc->short_name);
1921 function = LLVMGetNamedFunction(module, name);
1922
1923 if (!function) {
1924 LLVMTypeRef ret_type;
1925 LLVMTypeRef arg_types[3];
1926 LLVMTypeRef function_type;
1927 unsigned arg;
1928
1929 /*
1930 * Generate the function prototype.
1931 */
1932
1933 ret_type = LLVMVoidTypeInContext(gallivm->context);
1934 arg_types[0] = pi8t;
1935 arg_types[1] = LLVMInt32TypeInContext(gallivm->context);
1936 arg_types[2] = LLVMTypeOf(cache); // XXX: put right type here
1937 function_type = LLVMFunctionType(ret_type, arg_types, ARRAY_SIZE(arg_types), 0);
1938 function = LLVMAddFunction(module, name, function_type);
1939
1940 for (arg = 0; arg < ARRAY_SIZE(arg_types); ++arg)
1941 if (LLVMGetTypeKind(arg_types[arg]) == LLVMPointerTypeKind)
1942 lp_add_function_attr(function, arg + 1, LP_FUNC_ATTR_NOALIAS);
1943
1944 LLVMSetFunctionCallConv(function, LLVMFastCallConv);
1945 LLVMSetVisibility(function, LLVMHiddenVisibility);
1946 generate_update_cache_one_block(gallivm, function, format_desc);
1947 }
1948
1949 args[0] = ptr_addr;
1950 args[1] = hash_index;
1951 args[2] = cache;
1952
1953 LLVMBuildCall(builder, function, args, ARRAY_SIZE(args), "");
1954 bb = LLVMGetInsertBlock(builder);
1955 inst = LLVMGetLastInstruction(bb);
1956 LLVMSetInstructionCallConv(inst, LLVMFastCallConv);
1957 }
1958
1959 /*
1960 * cached lookup
1961 */
1962 static LLVMValueRef
1963 compressed_fetch_cached(struct gallivm_state *gallivm,
1964 const struct util_format_description *format_desc,
1965 unsigned n,
1966 LLVMValueRef base_ptr,
1967 LLVMValueRef offset,
1968 LLVMValueRef i,
1969 LLVMValueRef j,
1970 LLVMValueRef cache)
1971
1972 {
1973 LLVMBuilderRef builder = gallivm->builder;
1974 unsigned count, low_bit, log2size;
1975 LLVMValueRef color, offset_stored, addr, ptr_addrtrunc, tmp;
1976 LLVMValueRef ij_index, hash_index, hash_mask, block_index;
1977 LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context);
1978 LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
1979 LLVMTypeRef i64t = LLVMInt64TypeInContext(gallivm->context);
1980 struct lp_type type;
1981 struct lp_build_context bld32;
1982 memset(&type, 0, sizeof type);
1983 type.width = 32;
1984 type.length = n;
1985
1986 lp_build_context_init(&bld32, gallivm, type);
1987
1988 /*
1989 * compute hash - we use a direct-mapped cache; the hash function could
1990 * be better, but it needs to be simple
1991 * per-element:
1992 * compare offset with offset stored at tag (hash)
1993 * if not equal extract block, store block, update tag
1994 * extract color from cache
1995 * assemble colors
1996 */
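   /*
    * As a scalar sketch (with made-up names tag[]/data[] for the cache
    * members; the hash only uses the low 32 bits of the address):
    *
    *    uint32_t h = (uint32_t)(base + offset) >> log2(block_bytes);
    *    h ^= h >> (2 * log2size);
    *    h ^= h >> log2size;
    *    h &= LP_BUILD_FORMAT_CACHE_SIZE - 1;
    *    if (tag[h] != base + offset) {
    *       decode block at base + offset into data[h*16 .. h*16 + 15];
    *       tag[h] = base + offset;
    *    }
    *    color = data[h * 16 + i * 4 + j];
    */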
1997
1998 low_bit = util_logbase2(format_desc->block.bits / 8);
1999 log2size = util_logbase2(LP_BUILD_FORMAT_CACHE_SIZE);
2000 addr = LLVMBuildPtrToInt(builder, base_ptr, i64t, "");
2001 ptr_addrtrunc = LLVMBuildPtrToInt(builder, base_ptr, i32t, "");
2002 ptr_addrtrunc = lp_build_broadcast_scalar(&bld32, ptr_addrtrunc);
2003 /* For the hash function, first mask off the unused lowest bits. Then just
2004 do some xor with address bits - only use lower 32bits */
2005 ptr_addrtrunc = LLVMBuildAdd(builder, offset, ptr_addrtrunc, "");
2006 ptr_addrtrunc = LLVMBuildLShr(builder, ptr_addrtrunc,
2007 lp_build_const_int_vec(gallivm, type, low_bit), "");
2008 /* This only really makes sense for size 64,128,256 */
2009 hash_index = ptr_addrtrunc;
2010 ptr_addrtrunc = LLVMBuildLShr(builder, ptr_addrtrunc,
2011 lp_build_const_int_vec(gallivm, type, 2*log2size), "");
2012 hash_index = LLVMBuildXor(builder, ptr_addrtrunc, hash_index, "");
2013 tmp = LLVMBuildLShr(builder, hash_index,
2014 lp_build_const_int_vec(gallivm, type, log2size), "");
2015 hash_index = LLVMBuildXor(builder, hash_index, tmp, "");
2016
2017 hash_mask = lp_build_const_int_vec(gallivm, type, LP_BUILD_FORMAT_CACHE_SIZE - 1);
2018 hash_index = LLVMBuildAnd(builder, hash_index, hash_mask, "");
2019 ij_index = LLVMBuildShl(builder, i, lp_build_const_int_vec(gallivm, type, 2), "");
2020 ij_index = LLVMBuildAdd(builder, ij_index, j, "");
2021 block_index = LLVMBuildShl(builder, hash_index,
2022 lp_build_const_int_vec(gallivm, type, 4), "");
2023 block_index = LLVMBuildAdd(builder, ij_index, block_index, "");
2024
2025 if (n > 1) {
2026 color = bld32.undef;
2027 for (count = 0; count < n; count++) {
2028 LLVMValueRef index, cond, colorx;
2029 LLVMValueRef block_indexx, hash_indexx, addrx, offsetx, ptr_addrx;
2030 struct lp_build_if_state if_ctx;
2031
2032 index = lp_build_const_int32(gallivm, count);
2033 offsetx = LLVMBuildExtractElement(builder, offset, index, "");
2034 addrx = LLVMBuildZExt(builder, offsetx, i64t, "");
2035 addrx = LLVMBuildAdd(builder, addrx, addr, "");
2036 block_indexx = LLVMBuildExtractElement(builder, block_index, index, "");
2037 hash_indexx = LLVMBuildLShr(builder, block_indexx,
2038 lp_build_const_int32(gallivm, 4), "");
2039 offset_stored = s3tc_lookup_tag_data(gallivm, cache, hash_indexx);
2040 cond = LLVMBuildICmp(builder, LLVMIntNE, offset_stored, addrx, "");
2041
2042 lp_build_if(&if_ctx, gallivm, cond);
2043 {
2044 ptr_addrx = LLVMBuildIntToPtr(builder, addrx,
2045 LLVMPointerType(i8t, 0), "");
2046 update_cached_block(gallivm, format_desc, ptr_addrx, hash_indexx, cache);
2047 #if LP_BUILD_FORMAT_CACHE_DEBUG
2048 s3tc_update_cache_access(gallivm, cache, 1,
2049 LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS);
2050 #endif
2051 }
2052 lp_build_endif(&if_ctx);
2053
2054 colorx = s3tc_lookup_cached_pixel(gallivm, cache, block_indexx);
2055
2056 color = LLVMBuildInsertElement(builder, color, colorx,
2057 lp_build_const_int32(gallivm, count), "");
2058 }
2059 }
2060 else {
2061 LLVMValueRef cond;
2062 struct lp_build_if_state if_ctx;
2063
2064 tmp = LLVMBuildZExt(builder, offset, i64t, "");
2065 addr = LLVMBuildAdd(builder, tmp, addr, "");
2066 offset_stored = s3tc_lookup_tag_data(gallivm, cache, hash_index);
2067 cond = LLVMBuildICmp(builder, LLVMIntNE, offset_stored, addr, "");
2068
2069 lp_build_if(&if_ctx, gallivm, cond);
2070 {
2071 tmp = LLVMBuildIntToPtr(builder, addr, LLVMPointerType(i8t, 0), "");
2072 update_cached_block(gallivm, format_desc, tmp, hash_index, cache);
2073 #if LP_BUILD_FORMAT_CACHE_DEBUG
2074 s3tc_update_cache_access(gallivm, cache, 1,
2075 LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS);
2076 #endif
2077 }
2078 lp_build_endif(&if_ctx);
2079
2080 color = s3tc_lookup_cached_pixel(gallivm, cache, block_index);
2081 }
2082 #if LP_BUILD_FORMAT_CACHE_DEBUG
2083 s3tc_update_cache_access(gallivm, cache, n,
2084 LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_TOTAL);
2085 #endif
2086 return LLVMBuildBitCast(builder, color, LLVMVectorType(i8t, n * 4), "");
2087 }
2088
2089
2090 static LLVMValueRef
2091 s3tc_dxt5_to_rgba_aos(struct gallivm_state *gallivm,
2092 unsigned n,
2093 enum pipe_format format,
2094 LLVMValueRef colors,
2095 LLVMValueRef codewords,
2096 LLVMValueRef alpha_lo,
2097 LLVMValueRef alpha_hi,
2098 LLVMValueRef i,
2099 LLVMValueRef j)
2100 {
2101 return s3tc_dxt5_full_to_rgba_aos(gallivm, n, format, colors,
2102 codewords, alpha_lo, alpha_hi, i, j);
2103 }
2104
2105
2106 /**
2107 * @param n number of pixels processed (usually n=4, but it should also work with n=1
2108 * and multiples of 4)
2109 * @param base_ptr base pointer (32bit or 64bit pointer depending on the architecture)
2110 * @param offset <n x i32> vector with the relative offsets of the S3TC blocks
2111 * @param i is a <n x i32> vector with the x subpixel coordinate (0..3)
2112 * @param j is a <n x i32> vector with the y subpixel coordinate (0..3)
2113 * @return a <4*n x i8> vector with the pixel RGBA values in AoS
2114 */
2115 LLVMValueRef
2116 lp_build_fetch_s3tc_rgba_aos(struct gallivm_state *gallivm,
2117 const struct util_format_description *format_desc,
2118 unsigned n,
2119 LLVMValueRef base_ptr,
2120 LLVMValueRef offset,
2121 LLVMValueRef i,
2122 LLVMValueRef j,
2123 LLVMValueRef cache)
2124 {
2125 LLVMValueRef rgba;
2126 LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context);
2127 LLVMBuilderRef builder = gallivm->builder;
2128
2129 assert(format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC);
2130 assert(format_desc->block.width == 4);
2131 assert(format_desc->block.height == 4);
2132
2133 assert((n == 1) || (n % 4 == 0));
2134
2135 /* debug_printf("format = %d\n", format_desc->format);*/
2136 if (cache) {
2137 rgba = compressed_fetch_cached(gallivm, format_desc, n,
2138 base_ptr, offset, i, j, cache);
2139 return rgba;
2140 }
2141
2142 if (n > 4) {
2143 unsigned count;
2144 LLVMTypeRef i8_vectype = LLVMVectorType(i8t, 4 * n);
2145 LLVMTypeRef i128_type = LLVMIntTypeInContext(gallivm->context, 128);
2146 LLVMTypeRef i128_vectype = LLVMVectorType(i128_type, n / 4);
2147 LLVMTypeRef i324_vectype = LLVMVectorType(LLVMInt32TypeInContext(
2148 gallivm->context), 4);
2149 LLVMValueRef offset4, i4, j4, rgba4[LP_MAX_VECTOR_LENGTH/16];
2150 struct lp_type lp_324_vectype = lp_type_uint_vec(32, 128);
2151
2152 assert(n / 4 <= ARRAY_SIZE(rgba4));
2153
2154 rgba = LLVMGetUndef(i128_vectype);
2155
2156 for (count = 0; count < n / 4; count++) {
2157 LLVMValueRef colors, codewords, alpha_lo, alpha_hi;
2158
2159 i4 = lp_build_extract_range(gallivm, i, count * 4, 4);
2160 j4 = lp_build_extract_range(gallivm, j, count * 4, 4);
2161 offset4 = lp_build_extract_range(gallivm, offset, count * 4, 4);
2162
2163 lp_build_gather_s3tc(gallivm, 4, format_desc, &colors, &codewords,
2164 &alpha_lo, &alpha_hi, base_ptr, offset4);
2165
2166 switch (format_desc->format) {
2167 case PIPE_FORMAT_DXT1_RGB:
2168 case PIPE_FORMAT_DXT1_RGBA:
2169 case PIPE_FORMAT_DXT1_SRGB:
2170 case PIPE_FORMAT_DXT1_SRGBA:
2171 rgba4[count] = s3tc_dxt1_to_rgba_aos(gallivm, 4, format_desc->format,
2172 colors, codewords, i4, j4);
2173 break;
2174 case PIPE_FORMAT_DXT3_RGBA:
2175 case PIPE_FORMAT_DXT3_SRGBA:
2176 rgba4[count] = s3tc_dxt3_to_rgba_aos(gallivm, 4, format_desc->format, colors,
2177 codewords, alpha_lo, alpha_hi, i4, j4);
2178 break;
2179 case PIPE_FORMAT_DXT5_RGBA:
2180 case PIPE_FORMAT_DXT5_SRGBA:
2181 rgba4[count] = s3tc_dxt5_to_rgba_aos(gallivm, 4, format_desc->format, colors,
2182 codewords, alpha_lo, alpha_hi, i4, j4);
2183 break;
2184 default:
2185 assert(0);
2186 rgba4[count] = LLVMGetUndef(LLVMVectorType(i8t, 4));
2187 break;
2188 }
2189 /* shuffles typically give best results with dword elements...*/
2190 rgba4[count] = LLVMBuildBitCast(builder, rgba4[count], i324_vectype, "");
2191 }
2192 rgba = lp_build_concat(gallivm, rgba4, lp_324_vectype, n / 4);
2193 rgba = LLVMBuildBitCast(builder, rgba, i8_vectype, "");
2194 }
2195 else {
2196 LLVMValueRef colors, codewords, alpha_lo, alpha_hi;
2197
2198 lp_build_gather_s3tc(gallivm, n, format_desc, &colors, &codewords,
2199 &alpha_lo, &alpha_hi, base_ptr, offset);
2200
2201 switch (format_desc->format) {
2202 case PIPE_FORMAT_DXT1_RGB:
2203 case PIPE_FORMAT_DXT1_RGBA:
2204 case PIPE_FORMAT_DXT1_SRGB:
2205 case PIPE_FORMAT_DXT1_SRGBA:
2206 rgba = s3tc_dxt1_to_rgba_aos(gallivm, n, format_desc->format,
2207 colors, codewords, i, j);
2208 break;
2209 case PIPE_FORMAT_DXT3_RGBA:
2210 case PIPE_FORMAT_DXT3_SRGBA:
2211 rgba = s3tc_dxt3_to_rgba_aos(gallivm, n, format_desc->format, colors,
2212 codewords, alpha_lo, alpha_hi, i, j);
2213 break;
2214 case PIPE_FORMAT_DXT5_RGBA:
2215 case PIPE_FORMAT_DXT5_SRGBA:
2216 rgba = s3tc_dxt5_to_rgba_aos(gallivm, n, format_desc->format, colors,
2217 codewords, alpha_lo, alpha_hi, i, j);
2218 break;
2219 default:
2220 assert(0);
2221 rgba = LLVMGetUndef(LLVMVectorType(i8t, 4*n));
2222 break;
2223 }
2224 }
2225
2226 /* always return just decompressed values - srgb conversion is done later */
2227
2228 return rgba;
2229 }
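
/*
 * Hypothetical caller sketch (names made up): from a texel fetch path one
 * would build the per-pixel offset/i/j vectors and call
 *
 *    rgba = lp_build_fetch_s3tc_rgba_aos(gallivm, format_desc, 4,
 *                                        base_ptr, offset, i, j, cache);
 *
 * where offset/i/j are <4 x i32> values and cache may be NULL to take the
 * non-cached path.
 */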