src/gallium/auxiliary/gallivm/lp_bld_gather.c

   1 /**************************************************************************
   2  *
   3  * Copyright 2010 VMware, Inc.
   4  * All Rights Reserved.
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a
   7  * copy of this software and associated documentation files (the
   8  * "Software"), to deal in the Software without restriction, including
   9  * without limitation the rights to use, copy, modify, merge, publish,
  10  * distribute, sub license, and/or sell copies of the Software, and to
  11  * permit persons to whom the Software is furnished to do so, subject to
  12  * the following conditions:
  13  *
  14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
  17  * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
  18  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  19  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  20  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  21  *
  22  * The above copyright notice and this permission notice (including the
  23  * next paragraph) shall be included in all copies or substantial portions
  24  * of the Software.
  25  *
  26  **************************************************************************/
  27
  28
  29 #include "util/u_debug.h"
  30 #include "util/u_cpu_detect.h"
  31 #include "util/u_math.h"
  32 #include "lp_bld_debug.h"
  33 #include "lp_bld_const.h"
  34 #include "lp_bld_format.h"
  35 #include "lp_bld_gather.h"
  36 #include "lp_bld_swizzle.h"
  37 #include "lp_bld_type.h"
  38 #include "lp_bld_init.h"
  39 #include "lp_bld_intr.h"
  40 #include "lp_bld_pack.h"
  41
  42
  43 /**
  44  * Get the pointer to one element from scatter positions in memory.
  45  *
  46  * @sa lp_build_gather()
  47  */
  48 LLVMValueRef
  49 lp_build_gather_elem_ptr(struct gallivm_state *gallivm,
  50                          unsigned length,
  51                          LLVMValueRef base_ptr,
  52                          LLVMValueRef offsets,
  53                          unsigned i)
  54 {
  55    LLVMValueRef offset;
  56    LLVMValueRef ptr;
  57
  58    assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0));
  59
  60    if (length == 1) {
  61       assert(i == 0);
  62       offset = offsets;
  63    } else {
  64       LLVMValueRef index = lp_build_const_int32(gallivm, i);
  65       offset = LLVMBuildExtractElement(gallivm->builder, offsets, index, "");
  66    }
  67
  68    ptr = LLVMBuildGEP(gallivm->builder, base_ptr, &offset, 1, "");
  69
  70    return ptr;
  71 }
  72
  73
  74 /**
  75  * Gather one element from scatter positions in memory.
  76  *
  77  * @sa lp_build_gather()
  78  */
  79 LLVMValueRef
  80 lp_build_gather_elem(struct gallivm_state *gallivm,
  81                      unsigned length,
  82                      unsigned src_width,
  83                      unsigned dst_width,
  84                      boolean aligned,
  85                      LLVMValueRef base_ptr,
  86                      LLVMValueRef offsets,
  87                      unsigned i,
  88                      boolean vector_justify)
  89 {
  90    LLVMTypeRef src_type = LLVMIntTypeInContext(gallivm->context, src_width);
  91    LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0);
  92    LLVMTypeRef dst_elem_type = LLVMIntTypeInContext(gallivm->context, dst_width);
  93    LLVMValueRef ptr;
  94    LLVMValueRef res;
  95
  96    assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0));
  97
  98    ptr = lp_build_gather_elem_ptr(gallivm, length, base_ptr, offsets, i);
  99    ptr = LLVMBuildBitCast(gallivm->builder, ptr, src_ptr_type, "");
 100    res = LLVMBuildLoad(gallivm->builder, ptr, "");
 101
 102    /* XXX
 103     * On some archs we probably really want to avoid having to deal
 104     * with alignments lower than 4 bytes (if fetch size is a power of
 105     * two >= 32). On x86 it doesn't matter, however.
 106     * We should be able to guarantee full alignment for any kind of texture
 107     * fetch (except ARB_texture_buffer_range, oops), but not vertex fetch
 108     * (there's PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY and friends
 109     * but I don't think that's quite what we wanted).
 110     * For ARB_texture_buffer_range, PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT
 111     * looks like a good fit, but it seems this cap bit (and OpenGL) aren't
 112     * enforcing what we want (which is what d3d10 does, the offset needs to
 113     * be aligned to element size, but GL has bytes regardless of element
 114     * size which would only leave us with minimum alignment restriction of 16
 115     * which doesn't make much sense if the type isn't 4x32bit). Due to
 116     * translation of offsets to first_elem in sampler_views it actually seems
 117     * gallium could not do anything else except 16 no matter what...
 118     */
 119    if (!aligned) {
 120       LLVMSetAlignment(res, 1);
 121    } else if (!util_is_power_of_two_or_zero(src_width)) {
 122       /*
 123        * Full alignment is impossible, assume the caller really meant
 124        * the individual elements were aligned (e.g. 3x32bit format).
 125        * And yes the generated code may otherwise crash, llvm will
 126        * really assume 128bit alignment with a 96bit fetch (I suppose
 127        * that makes sense as it can just assume the upper 32bit to be
 128        * whatever).
 129        * Maybe the caller should be able to explicitly set this, but
 130        * this should cover all the 3-channel formats.
 131        */
 132       if (((src_width / 24) * 24 == src_width) &&
 133            util_is_power_of_two_or_zero(src_width / 24)) {
 134           LLVMSetAlignment(res, src_width / 24);
 135       } else {
 136          LLVMSetAlignment(res, 1);
 137       }
 138    }
 139
 140    assert(src_width <= dst_width);
 141    if (src_width < dst_width) {
 142       res = LLVMBuildZExt(gallivm->builder, res, dst_elem_type, "");
 143       if (vector_justify) {
 144 #ifdef PIPE_ARCH_BIG_ENDIAN
 145          res = LLVMBuildShl(gallivm->builder, res,
 146                             LLVMConstInt(dst_elem_type, dst_width - src_width, 0), "");
 147 #endif
 148       }
 149    }
 150
 151    return res;
 152 }
 153
 154
 155 /**
 156  * Gather one element from scatter positions in memory.
 157  * Nearly the same as above, however the individual elements
 158  * may be vectors themselves, and fetches may be float type.
 159  * Can also do pad vector instead of ZExt.
 160  *
 161  * @sa lp_build_gather()
 162  */
 163 static LLVMValueRef
 164 lp_build_gather_elem_vec(struct gallivm_state *gallivm,
 165                          unsigned length,
 166                          unsigned src_width,
 167                          LLVMTypeRef src_type,
 168                          struct lp_type dst_type,
 169                          boolean aligned,
 170                          LLVMValueRef base_ptr,
 171                          LLVMValueRef offsets,
 172                          unsigned i,
 173                          boolean vector_justify)
 174 {
 175    LLVMValueRef ptr, res;
 176    LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0);
 177    assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0));
 178
 179    ptr = lp_build_gather_elem_ptr(gallivm, length, base_ptr, offsets, i);
 180    ptr = LLVMBuildBitCast(gallivm->builder, ptr, src_ptr_type, "");
 181    res = LLVMBuildLoad(gallivm->builder, ptr, "");
 182
 183    /* XXX
 184     * On some archs we probably really want to avoid having to deal
 185     * with alignments lower than 4 bytes (if fetch size is a power of
 186     * two >= 32). On x86 it doesn't matter, however.
 187     * We should be able to guarantee full alignment for any kind of texture
 188     * fetch (except ARB_texture_buffer_range, oops), but not vertex fetch
 189     * (there's PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY and friends
 190     * but I don't think that's quite what we wanted).
 191     * For ARB_texture_buffer_range, PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT
 192     * looks like a good fit, but it seems this cap bit (and OpenGL) aren't
 193     * enforcing what we want (which is what d3d10 does, the offset needs to
 194     * be aligned to element size, but GL has bytes regardless of element
 195     * size which would only leave us with minimum alignment restriction of 16
 196     * which doesn't make much sense if the type isn't 4x32bit). Due to
 197     * translation of offsets to first_elem in sampler_views it actually seems
 198     * gallium could not do anything else except 16 no matter what...
 199     */
 200    if (!aligned) {
 201       LLVMSetAlignment(res, 1);
 202    } else if (!util_is_power_of_two_or_zero(src_width)) {
 203       /*
 204        * Full alignment is impossible, assume the caller really meant
 205        * the individual elements were aligned (e.g. 3x32bit format).
 206        * And yes the generated code may otherwise crash, llvm will
 207        * really assume 128bit alignment with a 96bit fetch (I suppose
 208        * that makes sense as it can just assume the upper 32bit to be
 209        * whatever).
 210        * Maybe the caller should be able to explicitly set this, but
 211        * this should cover all the 3-channel formats.
 212        */
 213       if (((src_width / 24) * 24 == src_width) &&
 214            util_is_power_of_two_or_zero(src_width / 24)) {
 215           LLVMSetAlignment(res, src_width / 24);
 216       } else {
 217          LLVMSetAlignment(res, 1);
 218       }
 219    }
 220
 221    assert(src_width <= dst_type.width * dst_type.length);
 222    if (src_width < dst_type.width * dst_type.length) {
 223       if (dst_type.length > 1) {
 224          res = lp_build_pad_vector(gallivm, res, dst_type.length);
 225          /*
 226           * vector_justify hopefully a non-issue since we only deal
 227           * with src_width >= 32 here?
 228           */
 229       } else {
 230          LLVMTypeRef dst_elem_type = lp_build_vec_type(gallivm, dst_type);
 231
 232          /*
 233           * Only valid if src_ptr_type is int type...
 234           */
 235          res = LLVMBuildZExt(gallivm->builder, res, dst_elem_type, "");
 236
 237 #ifdef PIPE_ARCH_BIG_ENDIAN
 238          if (vector_justify) {
 239          res = LLVMBuildShl(gallivm->builder, res,
 240                             LLVMConstInt(dst_elem_type,
 241                                          dst_type.width - src_width, 0), "");
 242          }
 243          if (src_width == 48) {
 244             /* Load 3x16 bit vector.
 245              * The sequence of loads on big-endian hardware proceeds as follows.
 246              * 16-bit fields are denoted by X, Y, Z, and 0.  In memory, the sequence
 247              * of three fields appears in the order X, Y, Z.
 248              *
 249              * Load 32-bit word: 0.0.X.Y
 250              * Load 16-bit halfword: 0.0.0.Z
 251              * Rotate left: 0.X.Y.0
 252              * Bitwise OR: 0.X.Y.Z
 253              *
 254              * The order in which we need the fields in the result is 0.Z.Y.X,
 255              * the same as on little-endian; permute 16-bit fields accordingly
 256              * within 64-bit register:
 257              */
 258             LLVMValueRef shuffles[4] = {
 259                lp_build_const_int32(gallivm, 2),
 260                lp_build_const_int32(gallivm, 1),
 261                lp_build_const_int32(gallivm, 0),
 262                lp_build_const_int32(gallivm, 3),
 263             };
 264             res = LLVMBuildBitCast(gallivm->builder, res,
 265                                    lp_build_vec_type(gallivm, lp_type_uint_vec(16, 4*16)), "");
 266             res = LLVMBuildShuffleVector(gallivm->builder, res, res, LLVMConstVector(shuffles, 4), "");
 267             res = LLVMBuildBitCast(gallivm->builder, res, dst_elem_type, "");
 268          }
 269 #endif
 270       }
 271    }
 272    return res;
 273 }
 274
 275
 276
 277
 278 static LLVMValueRef
 279 lp_build_gather_avx2(struct gallivm_state *gallivm,
 280                      unsigned length,
 281                      unsigned src_width,
 282                      struct lp_type dst_type,
 283                      LLVMValueRef base_ptr,
 284                      LLVMValueRef offsets)
 285 {
 286    LLVMBuilderRef builder = gallivm->builder;
 287    LLVMTypeRef src_type, src_vec_type;
 288    LLVMValueRef res;
 289    struct lp_type res_type = dst_type;
 290    res_type.length *= length;
 291
 292    if (dst_type.floating) {
 293       src_type = src_width == 64 ? LLVMDoubleTypeInContext(gallivm->context) :
 294                                    LLVMFloatTypeInContext(gallivm->context);
 295    } else {
 296       src_type = LLVMIntTypeInContext(gallivm->context, src_width);
 297    }
 298    src_vec_type = LLVMVectorType(src_type, length);
 299
 300    /* XXX should allow hw scaling (can handle i8, i16, i32, i64 for x86) */
 301    assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0));
 302
 303    if (0) {
 304       /*
 305        * XXX: This will cause LLVM pre 3.7 to hang; it works on LLVM 3.8 but
 306        * will not use the AVX2 gather instrinsics (even with llvm 4.0), at
 307        * least with Haswell. See
 308        * http://lists.llvm.org/pipermail/llvm-dev/2016-January/094448.html
 309        * And the generated code doing the emulation is quite a bit worse
 310        * than what we get by doing it ourselves too.
 311        */
 312       LLVMTypeRef i32_type = LLVMIntTypeInContext(gallivm->context, 32);
 313       LLVMTypeRef i32_vec_type = LLVMVectorType(i32_type, length);
 314       LLVMTypeRef i1_type = LLVMIntTypeInContext(gallivm->context, 1);
 315       LLVMTypeRef i1_vec_type = LLVMVectorType(i1_type, length);
 316       LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0);
 317       LLVMValueRef src_ptr;
 318
 319       base_ptr = LLVMBuildBitCast(builder, base_ptr, src_ptr_type, "");
 320
 321       /* Rescale offsets from bytes to elements */
 322       LLVMValueRef scale = LLVMConstInt(i32_type, src_width/8, 0);
 323       scale = lp_build_broadcast(gallivm, i32_vec_type, scale);
 324       assert(LLVMTypeOf(offsets) == i32_vec_type);
 325       offsets = LLVMBuildSDiv(builder, offsets, scale, "");
 326
 327       src_ptr = LLVMBuildGEP(builder, base_ptr, &offsets, 1, "vector-gep");
 328
 329       char intrinsic[64];
 330       util_snprintf(intrinsic, sizeof intrinsic, "llvm.masked.gather.v%u%s%u",
 331                     length, dst_type.floating ? "f" : "i", src_width);
 332       LLVMValueRef alignment = LLVMConstInt(i32_type, src_width/8, 0);
 333       LLVMValueRef mask = LLVMConstAllOnes(i1_vec_type);
 334       LLVMValueRef passthru = LLVMGetUndef(src_vec_type);
 335
 336       LLVMValueRef args[] = { src_ptr, alignment, mask, passthru };
 337
 338       res = lp_build_intrinsic(builder, intrinsic, src_vec_type, args, 4, 0);
 339    } else {
 340       LLVMTypeRef i8_type = LLVMIntTypeInContext(gallivm->context, 8);
 341       const char *intrinsic = NULL;
 342       unsigned l_idx = 0;
 343
 344       assert(src_width == 32 || src_width == 64);
 345       if (src_width == 32) {
 346          assert(length == 4 || length == 8);
 347       } else {
 348          assert(length == 2 || length == 4);
 349       }
 350
 351       static const char *intrinsics[2][2][2] = {
 352
 353          {{"llvm.x86.avx2.gather.d.d",
 354            "llvm.x86.avx2.gather.d.d.256"},
 355           {"llvm.x86.avx2.gather.d.q",
 356            "llvm.x86.avx2.gather.d.q.256"}},
 357
 358          {{"llvm.x86.avx2.gather.d.ps",
 359            "llvm.x86.avx2.gather.d.ps.256"},
 360           {"llvm.x86.avx2.gather.d.pd",
 361            "llvm.x86.avx2.gather.d.pd.256"}},
 362       };
 363
 364       if ((src_width == 32 && length == 8) ||
 365           (src_width == 64 && length == 4)) {
 366          l_idx = 1;
 367       }
 368       intrinsic = intrinsics[dst_type.floating][src_width == 64][l_idx];
 369
 370       LLVMValueRef passthru = LLVMGetUndef(src_vec_type);
 371       LLVMValueRef mask = LLVMConstAllOnes(src_vec_type);
 372       mask = LLVMConstBitCast(mask, src_vec_type);
 373       LLVMValueRef scale = LLVMConstInt(i8_type, 1, 0);
 374
 375       LLVMValueRef args[] = { passthru, base_ptr, offsets, mask, scale };
 376
 377       res = lp_build_intrinsic(builder, intrinsic, src_vec_type, args, 5, 0);
 378    }
 379    res = LLVMBuildBitCast(builder, res, lp_build_vec_type(gallivm, res_type), "");
 380
 381    return res;
 382 }
 383
 384
 385 /**
 386  * Gather elements from scatter positions in memory into a single vector.
 387  * Use for fetching texels from a texture.
 388  * For SSE, typical values are length=4, src_width=32, dst_width=32.
 389  *
 390  * When src_width < dst_width, the return value can be justified in
 391  * one of two ways:
 392  * "integer justification" is used when the caller treats the destination
 393  * as a packed integer bitmask, as described by the channels' "shift" and
 394  * "width" fields;
 395  * "vector justification" is used when the caller casts the destination
 396  * to a vector and needs channel X to be in vector element 0.
 397  *
 398  * @param length length of the offsets
 399  * @param src_width src element width in bits
 400  * @param dst_type result element type (src will be expanded to fit,
 401  *        but truncation is not allowed)
 402  *        (this may be a vector, must be pot sized)
 403  * @param aligned whether the data is guaranteed to be aligned (to src_width)
 404  * @param base_ptr base pointer, needs to be a i8 pointer type.
 405  * @param offsets vector with offsets
 406  * @param vector_justify select vector rather than integer justification
 407  */
 408 LLVMValueRef
 409 lp_build_gather(struct gallivm_state *gallivm,
 410                 unsigned length,
 411                 unsigned src_width,
 412                 struct lp_type dst_type,
 413                 boolean aligned,
 414                 LLVMValueRef base_ptr,
 415                 LLVMValueRef offsets,
 416                 boolean vector_justify)
 417 {
 418    LLVMValueRef res;
 419    boolean need_expansion = src_width < dst_type.width * dst_type.length;
 420    boolean vec_fetch;
 421    struct lp_type fetch_type, fetch_dst_type;
 422    LLVMTypeRef src_type;
 423
 424    assert(src_width <= dst_type.width * dst_type.length);
 425
 426    /*
 427     * This is quite a mess...
 428     * Figure out if the fetch should be done as:
 429     * a) scalar or vector
 430     * b) float or int
 431     *
 432     * As an example, for a 96bit fetch expanded into 4x32bit, it is better
 433     * to use (3x32bit) vector type (then pad the vector). Otherwise, the
 434     * zext will cause extra instructions.
 435     * However, the same isn't true for 3x16bit (the codegen for that is
 436     * completely worthless on x86 simd, and for 3x8bit is is way worse
 437     * still, don't try that... (To get really good code out of llvm for
 438     * these cases, the only way is to decompose the fetches manually
 439     * into 1x32bit/1x16bit, or 1x16/1x8bit respectively, although the latter
 440     * case requires sse41, otherwise simple scalar zext is way better.
 441     * But probably not important enough, so don't bother.)
 442     * Also, we try to honor the floating bit of destination (but isn't
 443     * possible if caller asks for instance for 2x32bit dst_type with
 444     * 48bit fetch - the idea would be to use 3x16bit fetch, pad and
 445     * cast to 2x32f type, so the fetch is always int and on top of that
 446     * we avoid the vec pad and use scalar zext due the above mentioned
 447     * issue).
 448     * Note this is optimized for x86 sse2 and up backend. Could be tweaked
 449     * for other archs if necessary...
 450     */
 451    if (((src_width % 32) == 0) && ((src_width % dst_type.width) == 0) &&
 452        (dst_type.length > 1)) {
 453       /* use vector fetch (if dst_type is vector) */
 454       vec_fetch = TRUE;
 455       if (dst_type.floating) {
 456          fetch_type = lp_type_float_vec(dst_type.width, src_width);
 457       } else {
 458          fetch_type = lp_type_int_vec(dst_type.width, src_width);
 459       }
 460       /* intentionally not using lp_build_vec_type here */
 461       src_type = LLVMVectorType(lp_build_elem_type(gallivm, fetch_type),
 462                                 fetch_type.length);
 463       fetch_dst_type = fetch_type;
 464       fetch_dst_type.length = dst_type.length;
 465     } else {
 466       /* use scalar fetch */
 467       vec_fetch = FALSE;
 468       if (dst_type.floating && ((src_width == 32) || (src_width == 64))) {
 469          fetch_type = lp_type_float(src_width);
 470       } else {
 471          fetch_type = lp_type_int(src_width);
 472       }
 473       src_type = lp_build_vec_type(gallivm, fetch_type);
 474       fetch_dst_type = fetch_type;
 475       fetch_dst_type.width = dst_type.width * dst_type.length;
 476    }
 477
 478    if (length == 1) {
 479       /* Scalar */
 480       res = lp_build_gather_elem_vec(gallivm, length,
 481                                      src_width, src_type, fetch_dst_type,
 482                                      aligned, base_ptr, offsets, 0,
 483                                      vector_justify);
 484       return LLVMBuildBitCast(gallivm->builder, res,
 485                               lp_build_vec_type(gallivm, dst_type), "");
 486       /*
 487        * Excluding expansion from these paths because if you need it for
 488        * 32bit/64bit fetches you're doing it wrong (this is gather, not
 489        * conversion) and it would be awkward for floats.
 490        */
 491    } else if (util_cpu_caps.has_avx2 && !need_expansion &&
 492               src_width == 32 && (length == 4 || length == 8)) {
 493       return lp_build_gather_avx2(gallivm, length, src_width, dst_type,
 494                                   base_ptr, offsets);
 495    /*
 496     * This looks bad on paper wrt throughtput/latency on Haswell.
 497     * Even on Broadwell it doesn't look stellar.
 498     * Albeit no measurements were done (but tested to work).
 499     * Should definitely enable on Skylake.
 500     * (In general, should be more of a win if the fetch is 256bit wide -
 501     * this is true for the 32bit case above too.)
 502     */
 503    } else if (0 && util_cpu_caps.has_avx2 && !need_expansion &&
 504               src_width == 64 && (length == 2 || length == 4)) {
 505       return lp_build_gather_avx2(gallivm, length, src_width, dst_type,
 506                                   base_ptr, offsets);
 507    } else {
 508       /* Vector */
 509
 510       LLVMValueRef elems[LP_MAX_VECTOR_WIDTH / 8];
 511       unsigned i;
 512       boolean vec_zext = FALSE;
 513       struct lp_type res_type, gather_res_type;
 514       LLVMTypeRef res_t, gather_res_t;
 515
 516       res_type = fetch_dst_type;
 517       res_type.length *= length;
 518       gather_res_type = res_type;
 519
 520       if (src_width == 16 && dst_type.width == 32 && dst_type.length == 1) {
 521          /*
 522           * Note that llvm is never able to optimize zext/insert combos
 523           * directly (i.e. zero the simd reg, then place the elements into
 524           * the appropriate place directly). (I think this has to do with
 525           * scalar/vector transition.) And scalar 16->32bit zext simd loads
 526           * aren't possible (instead loading to scalar reg first).
 527           * No idea about other archs...
 528           * We could do this manually, but instead we just use a vector
 529           * zext, which is simple enough (and, in fact, llvm might optimize
 530           * this away).
 531           * (We're not trying that with other bit widths as that might not be
 532           * easier, in particular with 8 bit values at least with only sse2.)
 533           */
 534          assert(vec_fetch == FALSE);
 535          gather_res_type.width /= 2;
 536          fetch_dst_type = fetch_type;
 537          src_type = lp_build_vec_type(gallivm, fetch_type);
 538          vec_zext = TRUE;
 539       }
 540       res_t = lp_build_vec_type(gallivm, res_type);
 541       gather_res_t = lp_build_vec_type(gallivm, gather_res_type);
 542       res = LLVMGetUndef(gather_res_t);
 543       for (i = 0; i < length; ++i) {
 544          LLVMValueRef index = lp_build_const_int32(gallivm, i);
 545          elems[i] = lp_build_gather_elem_vec(gallivm, length,
 546                                              src_width, src_type, fetch_dst_type,
 547                                              aligned, base_ptr, offsets, i,
 548                                              vector_justify);
 549          if (!vec_fetch) {
 550             res = LLVMBuildInsertElement(gallivm->builder, res, elems[i], index, "");
 551          }
 552       }
 553       if (vec_zext) {
 554          res = LLVMBuildZExt(gallivm->builder, res, res_t, "");
 555          if (vector_justify) {
 556 #ifdef PIPE_ARCH_BIG_ENDIAN
 557             unsigned sv = dst_type.width - src_width;
 558             res = LLVMBuildShl(gallivm->builder, res,
 559                                lp_build_const_int_vec(gallivm, res_type, sv), "");
 560 #endif
 561          }
 562       }
 563       if (vec_fetch) {
 564          /*
 565           * Do bitcast now otherwise llvm might get some funny ideas wrt
 566           * float/int types...
 567           */
 568          for (i = 0; i < length; i++) {
 569             elems[i] = LLVMBuildBitCast(gallivm->builder, elems[i],
 570                                         lp_build_vec_type(gallivm, dst_type), "");
 571          }
 572          res = lp_build_concat(gallivm, elems, dst_type, length);
 573       } else {
 574          struct lp_type really_final_type = dst_type;
 575          assert(res_type.length * res_type.width ==
 576                 dst_type.length * dst_type.width * length);
 577          really_final_type.length *= length;
 578          res = LLVMBuildBitCast(gallivm->builder, res,
 579                                 lp_build_vec_type(gallivm, really_final_type), "");
 580       }
 581    }
 582
 583    return res;
 584 }
 585
 586 LLVMValueRef
 587 lp_build_gather_values(struct gallivm_state * gallivm,
 588                        LLVMValueRef * values,
 589                        unsigned value_count)
 590 {
 591    LLVMTypeRef vec_type = LLVMVectorType(LLVMTypeOf(values[0]), value_count);
 592    LLVMBuilderRef builder = gallivm->builder;
 593    LLVMValueRef vec = LLVMGetUndef(vec_type);
 594    unsigned i;
 595
 596    for (i = 0; i < value_count; i++) {
 597       LLVMValueRef index = lp_build_const_int32(gallivm, i);
 598       vec = LLVMBuildInsertElement(builder, vec, values[i], index, "");
 599    }
 600    return vec;
 601 }