src/gallium/auxiliary/gallivm/lp_bld_pack.c

   1 /**************************************************************************
   2  *
   3  * Copyright 2009 VMware, Inc.
   4  * All Rights Reserved.
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a
   7  * copy of this software and associated documentation files (the
   8  * "Software"), to deal in the Software without restriction, including
   9  * without limitation the rights to use, copy, modify, merge, publish,
  10  * distribute, sub license, and/or sell copies of the Software, and to
  11  * permit persons to whom the Software is furnished to do so, subject to
  12  * the following conditions:
  13  *
  14  * The above copyright notice and this permission notice (including the
  15  * next paragraph) shall be included in all copies or substantial portions
  16  * of the Software.
  17  *
  18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  19  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
  21  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
  22  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  23  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  24  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25  *
  26  **************************************************************************/
  27
  28
  29 /**
  30  * @file
  31  * Helper functions for packing/unpacking.
  32  *
  33  * Pack/unpacking is necessary for conversion between types of different
  34  * bit width.
  35  *
  36  * They are also commonly used when a computation needs higher
  37  * precision for the intermediate values. For example, if one needs the
  38  * function:
  39  *
  40  *   c = compute(a, b);
  41  *
  42  * to use more precision for intermediate results then one should implement it
  43  * as:
  44  *
  45  *   LLVMValueRef
  46  *   compute(LLVMBuilderRef builder, struct lp_type type, LLVMValueRef a, LLVMValueRef b)
  47  *   {
  48  *      struct lp_type wide_type = lp_wider_type(type);
  49  *      LLVMValueRef al, ah, bl, bh, cl, ch, c;
  50  *
  51  *      lp_build_unpack2(builder, type, wide_type, a, &al, &ah);
  52  *      lp_build_unpack2(builder, type, wide_type, b, &bl, &bh);
  53  *
  54  *      cl = compute_half(al, bl);
  55  *      ch = compute_half(ah, bh);
  56  *
  57  *      c = lp_build_pack2(builder, wide_type, type, cl, ch);
  58  *
  59  *      return c;
  60  *   }
  61  *
  62  * where compute_half() would do the computation for half the elements with
  63  * twice the precision.
  64  *
  65  * @author Jose Fonseca <jfonseca@vmware.com>
  66  */
  67
  68
  69 #include "util/u_debug.h"
  70 #include "util/u_math.h"
  71 #include "util/u_cpu_detect.h"
  72 #include "util/u_memory.h"
  73
  74 #include "lp_bld_type.h"
  75 #include "lp_bld_const.h"
  76 #include "lp_bld_init.h"
  77 #include "lp_bld_intr.h"
  78 #include "lp_bld_arit.h"
  79 #include "lp_bld_pack.h"
  80 #include "lp_bld_swizzle.h"
  81
  82
  83 /**
  84  * Build shuffle vectors that match PUNPCKLxx and PUNPCKHxx instructions.
  85  */
  86 static LLVMValueRef
  87 lp_build_const_unpack_shuffle(struct gallivm_state *gallivm,
  88                               unsigned n, unsigned lo_hi)
  89 {
  90    LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
  91    unsigned i, j;
  92
  93    assert(n <= LP_MAX_VECTOR_LENGTH);
  94    assert(lo_hi < 2);
  95
  96    /* TODO: cache results in a static table */
  97
  98    for(i = 0, j = lo_hi*n/2; i < n; i += 2, ++j) {
  99       elems[i + 0] = lp_build_const_int32(gallivm, 0 + j);
 100       elems[i + 1] = lp_build_const_int32(gallivm, n + j);
 101    }
 102
 103    return LLVMConstVector(elems, n);
 104 }
 105
 106 /**
 107  * Similar to lp_build_const_unpack_shuffle but for special AVX 256bit unpack.
 108  * See comment above lp_build_interleave2_half for more details.
 109  */
 110 static LLVMValueRef
 111 lp_build_const_unpack_shuffle_half(struct gallivm_state *gallivm,
 112                                    unsigned n, unsigned lo_hi)
 113 {
 114    LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
 115    unsigned i, j;
 116
 117    assert(n <= LP_MAX_VECTOR_LENGTH);
 118    assert(lo_hi < 2);
 119
 120    for (i = 0, j = lo_hi*(n/4); i < n; i += 2, ++j) {
 121       if (i == (n / 2))
 122          j += n / 4;
 123
 124       elems[i + 0] = lp_build_const_int32(gallivm, 0 + j);
 125       elems[i + 1] = lp_build_const_int32(gallivm, n + j);
 126    }
 127
 128    return LLVMConstVector(elems, n);
 129 }
 130
 131 /**
 132  * Similar to lp_build_const_unpack_shuffle_half, but for AVX512
 133  * See comment above lp_build_interleave2_half for more details.
 134  */
 135 static LLVMValueRef
 136 lp_build_const_unpack_shuffle_16wide(struct gallivm_state *gallivm,
 137                                      unsigned lo_hi)
 138 {
 139    LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
 140    unsigned i, j;
 141
 142    assert(lo_hi < 2);
 143
 144    // for the following lo_hi setting, convert 0 -> f to:
 145    // 0: 0 16 4 20  8 24 12 28 1 17 5 21  9 25 13 29
 146    // 1: 2 18 6 22 10 26 14 30 3 19 7 23 11 27 15 31
 147    for (i = 0; i < 16; i++) {
 148       j = ((i&0x06)<<1) + ((i&1)<<4) + (i>>3) + (lo_hi<<1);
 149
 150       elems[i] = lp_build_const_int32(gallivm, j);
 151    }
 152
 153    return LLVMConstVector(elems, 16);
 154 }
 155
 156 /**
 157  * Build shuffle vectors that match PACKxx (SSE) instructions or
 158  * VPERM (Altivec).
 159  */
 160 static LLVMValueRef
 161 lp_build_const_pack_shuffle(struct gallivm_state *gallivm, unsigned n)
 162 {
 163    LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
 164    unsigned i;
 165
 166    assert(n <= LP_MAX_VECTOR_LENGTH);
 167
 168    for(i = 0; i < n; ++i)
 169 #ifdef PIPE_ARCH_LITTLE_ENDIAN
 170       elems[i] = lp_build_const_int32(gallivm, 2*i);
 171 #else
 172       elems[i] = lp_build_const_int32(gallivm, 2*i+1);
 173 #endif
 174
 175    return LLVMConstVector(elems, n);
 176 }
 177
 178 /**
 179  * Return a vector with elements src[start:start+size]
 180  * Most useful for getting half the values out of a 256bit sized vector,
 181  * otherwise may cause data rearrangement to happen.
 182  */
 183 LLVMValueRef
 184 lp_build_extract_range(struct gallivm_state *gallivm,
 185                        LLVMValueRef src,
 186                        unsigned start,
 187                        unsigned size)
 188 {
 189    LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
 190    unsigned i;
 191
 192    assert(size <= ARRAY_SIZE(elems));
 193
 194    for (i = 0; i < size; ++i)
 195       elems[i] = lp_build_const_int32(gallivm, i + start);
 196
 197    if (size == 1) {
 198       return LLVMBuildExtractElement(gallivm->builder, src, elems[0], "");
 199    }
 200    else {
 201       return LLVMBuildShuffleVector(gallivm->builder, src, src,
 202                                     LLVMConstVector(elems, size), "");
 203    }
 204 }
 205
 206 /**
 207  * Concatenates several (must be a power of 2) vectors (of same type)
 208  * into a larger one.
 209  * Most useful for building up a 256bit sized vector out of two 128bit ones.
 210  */
 211 LLVMValueRef
 212 lp_build_concat(struct gallivm_state *gallivm,
 213                 LLVMValueRef src[],
 214                 struct lp_type src_type,
 215                 unsigned num_vectors)
 216 {
 217    unsigned new_length, i;
 218    LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH/2];
 219    LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
 220
 221    assert(src_type.length * num_vectors <= ARRAY_SIZE(shuffles));
 222    assert(util_is_power_of_two(num_vectors));
 223
 224    new_length = src_type.length;
 225
 226    for (i = 0; i < num_vectors; i++)
 227       tmp[i] = src[i];
 228
 229    while (num_vectors > 1) {
 230       num_vectors >>= 1;
 231       new_length <<= 1;
 232       for (i = 0; i < new_length; i++) {
 233          shuffles[i] = lp_build_const_int32(gallivm, i);
 234       }
 235       for (i = 0; i < num_vectors; i++) {
 236          tmp[i] = LLVMBuildShuffleVector(gallivm->builder, tmp[i*2], tmp[i*2 + 1],
 237                                          LLVMConstVector(shuffles, new_length), "");
 238       }
 239    }
 240
 241    return tmp[0];
 242 }
 243
 244
 245 /**
 246  * Combines vectors to reduce from num_srcs to num_dsts.
 247  * Returns the number of src vectors concatenated in a single dst.
 248  *
 249  * num_srcs must be exactly divisible by num_dsts.
 250  *
 251  * e.g. For num_srcs = 4 and src = [x, y, z, w]
 252  *          num_dsts = 1  dst = [xyzw]    return = 4
 253  *          num_dsts = 2  dst = [xy, zw]  return = 2
 254  */
 255 int
 256 lp_build_concat_n(struct gallivm_state *gallivm,
 257                   struct lp_type src_type,
 258                   LLVMValueRef *src,
 259                   unsigned num_srcs,
 260                   LLVMValueRef *dst,
 261                   unsigned num_dsts)
 262 {
 263    int size = num_srcs / num_dsts;
 264    unsigned i;
 265
 266    assert(num_srcs >= num_dsts);
 267    assert((num_srcs % size) == 0);
 268
 269    if (num_srcs == num_dsts) {
 270       for (i = 0; i < num_dsts; ++i) {
 271          dst[i] = src[i];
 272       }
 273       return 1;
 274    }
 275
 276    for (i = 0; i < num_dsts; ++i) {
 277       dst[i] = lp_build_concat(gallivm, &src[i * size], src_type, size);
 278    }
 279
 280    return size;
 281 }
 282
 283
 284 /**
 285  * Un-interleave vector.
 286  * This will return a vector consisting of every second element
 287  * (depending on lo_hi, beginning at 0 or 1).
 288  * The returned vector size (elems and width) will only be half
 289  * that of the source vector.
 290  */
 291 LLVMValueRef
 292 lp_build_uninterleave1(struct gallivm_state *gallivm,
 293                        unsigned num_elems,
 294                        LLVMValueRef a,
 295                        unsigned lo_hi)
 296 {
 297    LLVMValueRef shuffle, elems[LP_MAX_VECTOR_LENGTH];
 298    unsigned i;
 299    assert(num_elems <= LP_MAX_VECTOR_LENGTH);
 300
 301    for (i = 0; i < num_elems / 2; ++i)
 302       elems[i] = lp_build_const_int32(gallivm, 2*i + lo_hi);
 303
 304    shuffle = LLVMConstVector(elems, num_elems / 2);
 305
 306    return LLVMBuildShuffleVector(gallivm->builder, a, a, shuffle, "");
 307 }
 308
 309
 310 /**
 311  * Interleave vector elements.
 312  *
 313  * Matches the PUNPCKLxx and PUNPCKHxx SSE instructions
 314  * (but not for 256bit AVX vectors).
 315  */
 316 LLVMValueRef
 317 lp_build_interleave2(struct gallivm_state *gallivm,
 318                      struct lp_type type,
 319                      LLVMValueRef a,
 320                      LLVMValueRef b,
 321                      unsigned lo_hi)
 322 {
 323    LLVMValueRef shuffle;
 324
 325    if (type.length == 2 && type.width == 128 && util_cpu_caps.has_avx) {
 326       /*
 327        * XXX: This is a workaround for llvm code generation deficiency. Strangely
 328        * enough, while this needs vinsertf128/vextractf128 instructions (hence
 329        * a natural match when using 2x128bit vectors) the "normal" unpack shuffle
 330        * generates code ranging from atrocious (llvm 3.1) to terrible (llvm 3.2, 3.3).
 331        * So use some different shuffles instead (the exact shuffles don't seem to
 332        * matter, as long as not using 128bit wide vectors, works with 8x32 or 4x64).
 333        */
 334       struct lp_type tmp_type = type;
 335       LLVMValueRef srchalf[2], tmpdst;
 336       tmp_type.length = 4;
 337       tmp_type.width = 64;
 338       a = LLVMBuildBitCast(gallivm->builder, a, lp_build_vec_type(gallivm, tmp_type), "");
 339       b = LLVMBuildBitCast(gallivm->builder, b, lp_build_vec_type(gallivm, tmp_type), "");
 340       srchalf[0] = lp_build_extract_range(gallivm, a, lo_hi * 2, 2);
 341       srchalf[1] = lp_build_extract_range(gallivm, b, lo_hi * 2, 2);
 342       tmp_type.length = 2;
 343       tmpdst = lp_build_concat(gallivm, srchalf, tmp_type, 2);
 344       return LLVMBuildBitCast(gallivm->builder, tmpdst, lp_build_vec_type(gallivm, type), "");
 345    }
 346
 347    shuffle = lp_build_const_unpack_shuffle(gallivm, type.length, lo_hi);
 348
 349    return LLVMBuildShuffleVector(gallivm->builder, a, b, shuffle, "");
 350 }
 351
 352 /**
 353  * Interleave vector elements but with 256 (or 512) bit,
 354  * treats it as interleave with 2 concatenated 128 (or 256) bit vectors.
 355  *
 356  * This differs to lp_build_interleave2 as that function would do the following (for lo):
 357  * a0 b0 a1 b1 a2 b2 a3 b3, and this does not compile into an AVX unpack instruction.
 358  *
 359  *
 360  * An example interleave 8x float with 8x float on AVX 256bit unpack:
 361  *   a0 a1 a2 a3 a4 a5 a6 a7 <-> b0 b1 b2 b3 b4 b5 b6 b7
 362  *
 363  * Equivalent to interleaving 2x 128 bit vectors
 364  *   a0 a1 a2 a3 <-> b0 b1 b2 b3 concatenated with a4 a5 a6 a7 <-> b4 b5 b6 b7
 365  *
 366  * So interleave-lo would result in:
 367  *   a0 b0 a1 b1 a4 b4 a5 b5
 368  *
 369  * And interleave-hi would result in:
 370  *   a2 b2 a3 b3 a6 b6 a7 b7
 371  *
 372  * For 512 bits, the following are true:
 373  *
 374  * Interleave-lo would result in (capital letters denote hex indices):
 375  *   a0 b0 a1 b1 a4 b4 a5 b5 a8 b8 a9 b9 aC bC aD bD
 376  *
 377  * Interleave-hi would result in:
 378  *   a2 b2 a3 b3 a6 b6 a7 b7 aA bA aB bB aE bE aF bF
 379  */
 380 LLVMValueRef
 381 lp_build_interleave2_half(struct gallivm_state *gallivm,
 382                           struct lp_type type,
 383                           LLVMValueRef a,
 384                           LLVMValueRef b,
 385                           unsigned lo_hi)
 386 {
 387    if (type.length * type.width == 256) {
 388       LLVMValueRef shuffle = lp_build_const_unpack_shuffle_half(gallivm, type.length, lo_hi);
 389       return LLVMBuildShuffleVector(gallivm->builder, a, b, shuffle, "");
 390    } else if ((type.length == 16) && (type.width == 32)) {
 391       LLVMValueRef shuffle = lp_build_const_unpack_shuffle_16wide(gallivm, lo_hi);
 392       return LLVMBuildShuffleVector(gallivm->builder, a, b, shuffle, "");
 393    } else {
 394       return lp_build_interleave2(gallivm, type, a, b, lo_hi);
 395    }
 396 }
 397
 398
 399 /**
 400  * Double the bit width.
 401  *
 402  * This will only change the number of bits the values are represented, not the
 403  * values themselves.
 404  *
 405  */
 406 void
 407 lp_build_unpack2(struct gallivm_state *gallivm,
 408                  struct lp_type src_type,
 409                  struct lp_type dst_type,
 410                  LLVMValueRef src,
 411                  LLVMValueRef *dst_lo,
 412                  LLVMValueRef *dst_hi)
 413 {
 414    LLVMBuilderRef builder = gallivm->builder;
 415    LLVMValueRef msb;
 416    LLVMTypeRef dst_vec_type;
 417
 418    assert(!src_type.floating);
 419    assert(!dst_type.floating);
 420    assert(dst_type.width == src_type.width * 2);
 421    assert(dst_type.length * 2 == src_type.length);
 422
 423    if(dst_type.sign && src_type.sign) {
 424       /* Replicate the sign bit in the most significant bits */
 425       msb = LLVMBuildAShr(builder, src, lp_build_const_int_vec(gallivm, src_type, src_type.width - 1), "");
 426    }
 427    else
 428       /* Most significant bits always zero */
 429       msb = lp_build_zero(gallivm, src_type);
 430
 431    /* Interleave bits */
 432 #ifdef PIPE_ARCH_LITTLE_ENDIAN
 433    *dst_lo = lp_build_interleave2(gallivm, src_type, src, msb, 0);
 434    *dst_hi = lp_build_interleave2(gallivm, src_type, src, msb, 1);
 435
 436 #else
 437    *dst_lo = lp_build_interleave2(gallivm, src_type, msb, src, 0);
 438    *dst_hi = lp_build_interleave2(gallivm, src_type, msb, src, 1);
 439 #endif
 440
 441    /* Cast the result into the new type (twice as wide) */
 442
 443    dst_vec_type = lp_build_vec_type(gallivm, dst_type);
 444
 445    *dst_lo = LLVMBuildBitCast(builder, *dst_lo, dst_vec_type, "");
 446    *dst_hi = LLVMBuildBitCast(builder, *dst_hi, dst_vec_type, "");
 447 }
 448
 449
 450 /**
 451  * Double the bit width, with an order which fits the cpu nicely.
 452  *
 453  * This will only change the number of bits the values are represented, not the
 454  * values themselves.
 455  *
 456  * The order of the results is not guaranteed, other than it will match
 457  * the corresponding lp_build_pack2_native call.
 458  */
 459 void
 460 lp_build_unpack2_native(struct gallivm_state *gallivm,
 461                         struct lp_type src_type,
 462                         struct lp_type dst_type,
 463                         LLVMValueRef src,
 464                         LLVMValueRef *dst_lo,
 465                         LLVMValueRef *dst_hi)
 466 {
 467    LLVMBuilderRef builder = gallivm->builder;
 468    LLVMValueRef msb;
 469    LLVMTypeRef dst_vec_type;
 470
 471    assert(!src_type.floating);
 472    assert(!dst_type.floating);
 473    assert(dst_type.width == src_type.width * 2);
 474    assert(dst_type.length * 2 == src_type.length);
 475
 476    if(dst_type.sign && src_type.sign) {
 477       /* Replicate the sign bit in the most significant bits */
 478       msb = LLVMBuildAShr(builder, src,
 479                lp_build_const_int_vec(gallivm, src_type, src_type.width - 1), "");
 480    }
 481    else
 482       /* Most significant bits always zero */
 483       msb = lp_build_zero(gallivm, src_type);
 484
 485    /* Interleave bits */
 486 #ifdef PIPE_ARCH_LITTLE_ENDIAN
 487    if (src_type.length * src_type.width == 256 && util_cpu_caps.has_avx2) {
 488       *dst_lo = lp_build_interleave2_half(gallivm, src_type, src, msb, 0);
 489       *dst_hi = lp_build_interleave2_half(gallivm, src_type, src, msb, 1);
 490    } else {
 491       *dst_lo = lp_build_interleave2(gallivm, src_type, src, msb, 0);
 492       *dst_hi = lp_build_interleave2(gallivm, src_type, src, msb, 1);
 493    }
 494 #else
 495    *dst_lo = lp_build_interleave2(gallivm, src_type, msb, src, 0);
 496    *dst_hi = lp_build_interleave2(gallivm, src_type, msb, src, 1);
 497 #endif
 498
 499    /* Cast the result into the new type (twice as wide) */
 500
 501    dst_vec_type = lp_build_vec_type(gallivm, dst_type);
 502
 503    *dst_lo = LLVMBuildBitCast(builder, *dst_lo, dst_vec_type, "");
 504    *dst_hi = LLVMBuildBitCast(builder, *dst_hi, dst_vec_type, "");
 505 }
 506
 507
 508 /**
 509  * Expand the bit width.
 510  *
 511  * This will only change the number of bits the values are represented, not the
 512  * values themselves.
 513  */
 514 void
 515 lp_build_unpack(struct gallivm_state *gallivm,
 516                 struct lp_type src_type,
 517                 struct lp_type dst_type,
 518                 LLVMValueRef src,
 519                 LLVMValueRef *dst, unsigned num_dsts)
 520 {
 521    unsigned num_tmps;
 522    unsigned i;
 523
 524    /* Register width must remain constant */
 525    assert(src_type.width * src_type.length == dst_type.width * dst_type.length);
 526
 527    /* We must not loose or gain channels. Only precision */
 528    assert(src_type.length == dst_type.length * num_dsts);
 529
 530    num_tmps = 1;
 531    dst[0] = src;
 532
 533    while(src_type.width < dst_type.width) {
 534       struct lp_type tmp_type = src_type;
 535
 536       tmp_type.width *= 2;
 537       tmp_type.length /= 2;
 538
 539       for(i = num_tmps; i--; ) {
 540          lp_build_unpack2(gallivm, src_type, tmp_type, dst[i], &dst[2*i + 0],
 541                           &dst[2*i + 1]);
 542       }
 543
 544       src_type = tmp_type;
 545
 546       num_tmps *= 2;
 547    }
 548
 549    assert(num_tmps == num_dsts);
 550 }
 551
 552
 553 /**
 554  * Non-interleaved pack.
 555  *
 556  * This will move values as
 557  *         (LSB)                     (MSB)
 558  *   lo =   l0 __ l1 __ l2 __..  __ ln __
 559  *   hi =   h0 __ h1 __ h2 __..  __ hn __
 560  *   res =  l0 l1 l2 .. ln h0 h1 h2 .. hn
 561  *
 562  * This will only change the number of bits the values are represented, not the
 563  * values themselves.
 564  *
 565  * It is assumed the values are already clamped into the destination type range.
 566  * Values outside that range will produce undefined results. Use
 567  * lp_build_packs2 instead.
 568  */
 569 LLVMValueRef
 570 lp_build_pack2(struct gallivm_state *gallivm,
 571                struct lp_type src_type,
 572                struct lp_type dst_type,
 573                LLVMValueRef lo,
 574                LLVMValueRef hi)
 575 {
 576    LLVMBuilderRef builder = gallivm->builder;
 577    LLVMTypeRef dst_vec_type = lp_build_vec_type(gallivm, dst_type);
 578    LLVMValueRef shuffle;
 579    LLVMValueRef res = NULL;
 580    struct lp_type intr_type = dst_type;
 581
 582    assert(!src_type.floating);
 583    assert(!dst_type.floating);
 584    assert(src_type.width == dst_type.width * 2);
 585    assert(src_type.length * 2 == dst_type.length);
 586
 587    /* Check for special cases first */
 588    if ((util_cpu_caps.has_sse2 || util_cpu_caps.has_altivec) &&
 589         src_type.width * src_type.length >= 128) {
 590       const char *intrinsic = NULL;
 591       boolean swap_intrinsic_operands = FALSE;
 592
 593       switch(src_type.width) {
 594       case 32:
 595          if (util_cpu_caps.has_sse2) {
 596            if (dst_type.sign) {
 597               intrinsic = "llvm.x86.sse2.packssdw.128";
 598            } else {
 599               if (util_cpu_caps.has_sse4_1) {
 600                  intrinsic = "llvm.x86.sse41.packusdw";
 601               }
 602            }
 603          } else if (util_cpu_caps.has_altivec) {
 604             if (dst_type.sign) {
 605                intrinsic = "llvm.ppc.altivec.vpkswss";
 606             } else {
 607                intrinsic = "llvm.ppc.altivec.vpkuwus";
 608             }
 609 #ifdef PIPE_ARCH_LITTLE_ENDIAN
 610             swap_intrinsic_operands = TRUE;
 611 #endif
 612          }
 613          break;
 614       case 16:
 615          if (dst_type.sign) {
 616             if (util_cpu_caps.has_sse2) {
 617                intrinsic = "llvm.x86.sse2.packsswb.128";
 618             } else if (util_cpu_caps.has_altivec) {
 619                intrinsic = "llvm.ppc.altivec.vpkshss";
 620 #ifdef PIPE_ARCH_LITTLE_ENDIAN
 621                swap_intrinsic_operands = TRUE;
 622 #endif
 623             }
 624          } else {
 625             if (util_cpu_caps.has_sse2) {
 626                intrinsic = "llvm.x86.sse2.packuswb.128";
 627             } else if (util_cpu_caps.has_altivec) {
 628                intrinsic = "llvm.ppc.altivec.vpkshus";
 629 #ifdef PIPE_ARCH_LITTLE_ENDIAN
 630                swap_intrinsic_operands = TRUE;
 631 #endif
 632             }
 633          }
 634          break;
 635       /* default uses generic shuffle below */
 636       }
 637       if (intrinsic) {
 638          if (src_type.width * src_type.length == 128) {
 639             LLVMTypeRef intr_vec_type = lp_build_vec_type(gallivm, intr_type);
 640             if (swap_intrinsic_operands) {
 641                res = lp_build_intrinsic_binary(builder, intrinsic, intr_vec_type, hi, lo);
 642             } else {
 643                res = lp_build_intrinsic_binary(builder, intrinsic, intr_vec_type, lo, hi);
 644             }
 645             if (dst_vec_type != intr_vec_type) {
 646                res = LLVMBuildBitCast(builder, res, dst_vec_type, "");
 647             }
 648          }
 649          else {
 650             int num_split = src_type.width * src_type.length / 128;
 651             int i;
 652             int nlen = 128 / src_type.width;
 653             int lo_off = swap_intrinsic_operands ? nlen : 0;
 654             int hi_off = swap_intrinsic_operands ? 0 : nlen;
 655             struct lp_type ndst_type = lp_type_unorm(dst_type.width, 128);
 656             struct lp_type nintr_type = lp_type_unorm(intr_type.width, 128);
 657             LLVMValueRef tmpres[LP_MAX_VECTOR_WIDTH / 128];
 658             LLVMValueRef tmplo, tmphi;
 659             LLVMTypeRef ndst_vec_type = lp_build_vec_type(gallivm, ndst_type);
 660             LLVMTypeRef nintr_vec_type = lp_build_vec_type(gallivm, nintr_type);
 661
 662             assert(num_split <= LP_MAX_VECTOR_WIDTH / 128);
 663
 664             for (i = 0; i < num_split / 2; i++) {
 665                tmplo = lp_build_extract_range(gallivm,
 666                                               lo, i*nlen*2 + lo_off, nlen);
 667                tmphi = lp_build_extract_range(gallivm,
 668                                               lo, i*nlen*2 + hi_off, nlen);
 669                tmpres[i] = lp_build_intrinsic_binary(builder, intrinsic,
 670                                                      nintr_vec_type, tmplo, tmphi);
 671                if (ndst_vec_type != nintr_vec_type) {
 672                   tmpres[i] = LLVMBuildBitCast(builder, tmpres[i], ndst_vec_type, "");
 673                }
 674             }
 675             for (i = 0; i < num_split / 2; i++) {
 676                tmplo = lp_build_extract_range(gallivm,
 677                                               hi, i*nlen*2 + lo_off, nlen);
 678                tmphi = lp_build_extract_range(gallivm,
 679                                               hi, i*nlen*2 + hi_off, nlen);
 680                tmpres[i+num_split/2] = lp_build_intrinsic_binary(builder, intrinsic,
 681                                                                  nintr_vec_type,
 682                                                                  tmplo, tmphi);
 683                if (ndst_vec_type != nintr_vec_type) {
 684                   tmpres[i+num_split/2] = LLVMBuildBitCast(builder, tmpres[i+num_split/2],
 685                                                            ndst_vec_type, "");
 686                }
 687             }
 688             res = lp_build_concat(gallivm, tmpres, ndst_type, num_split);
 689          }
 690          return res;
 691       }
 692    }
 693
 694    /* generic shuffle */
 695    lo = LLVMBuildBitCast(builder, lo, dst_vec_type, "");
 696    hi = LLVMBuildBitCast(builder, hi, dst_vec_type, "");
 697
 698    shuffle = lp_build_const_pack_shuffle(gallivm, dst_type.length);
 699
 700    res = LLVMBuildShuffleVector(builder, lo, hi, shuffle, "");
 701
 702    return res;
 703 }
 704
 705
 706 /**
 707  * Non-interleaved native pack.
 708  *
 709  * Similar to lp_build_pack2, but the ordering of values is not
 710  * guaranteed, other than it will match lp_build_unpack2_native.
 711  *
 712  * In particular, with avx2, the lower and upper 128bits of the vectors will
 713  * be packed independently, so that (with 32bit->16bit values)
 714  *         (LSB)                                       (MSB)
 715  *   lo =   l0 __ l1 __ l2 __ l3 __ l4 __ l5 __ l6 __ l7 __
 716  *   hi =   h0 __ h1 __ h2 __ h3 __ h4 __ h5 __ h6 __ h7 __
 717  *   res =  l0 l1 l2 l3 h0 h1 h2 h3 l4 l5 l6 l7 h4 h5 h6 h7
 718  *
 719  * This will only change the number of bits the values are represented, not the
 720  * values themselves.
 721  *
 722  * It is assumed the values are already clamped into the destination type range.
 723  * Values outside that range will produce undefined results.
 724  */
 725 LLVMValueRef
 726 lp_build_pack2_native(struct gallivm_state *gallivm,
 727                       struct lp_type src_type,
 728                       struct lp_type dst_type,
 729                       LLVMValueRef lo,
 730                       LLVMValueRef hi)
 731 {
 732    LLVMBuilderRef builder = gallivm->builder;
 733    struct lp_type intr_type = dst_type;
 734    const char *intrinsic = NULL;
 735
 736    assert(!src_type.floating);
 737    assert(!dst_type.floating);
 738    assert(src_type.width == dst_type.width * 2);
 739    assert(src_type.length * 2 == dst_type.length);
 740
 741    /* At this point only have special case for avx2 */
 742    if (src_type.length * src_type.width == 256 &&
 743        util_cpu_caps.has_avx2) {
 744       switch(src_type.width) {
 745       case 32:
 746          if (dst_type.sign) {
 747             intrinsic = "llvm.x86.avx2.packssdw";
 748          } else {
 749             intrinsic = "llvm.x86.avx2.packusdw";
 750          }
 751          break;
 752       case 16:
 753          if (dst_type.sign) {
 754             intrinsic = "llvm.x86.avx2.packsswb";
 755          } else {
 756             intrinsic = "llvm.x86.avx2.packuswb";
 757          }
 758          break;
 759       }
 760    }
 761    if (intrinsic) {
 762       LLVMTypeRef intr_vec_type = lp_build_vec_type(gallivm, intr_type);
 763       return lp_build_intrinsic_binary(builder, intrinsic, intr_vec_type,
 764                                        lo, hi);
 765    }
 766    else {
 767       return lp_build_pack2(gallivm, src_type, dst_type, lo, hi);
 768    }
 769 }
 770
 771 /**
 772  * Non-interleaved pack and saturate.
 773  *
 774  * Same as lp_build_pack2 but will saturate values so that they fit into the
 775  * destination type.
 776  */
 777 LLVMValueRef
 778 lp_build_packs2(struct gallivm_state *gallivm,
 779                 struct lp_type src_type,
 780                 struct lp_type dst_type,
 781                 LLVMValueRef lo,
 782                 LLVMValueRef hi)
 783 {
 784    boolean clamp;
 785
 786    assert(!src_type.floating);
 787    assert(!dst_type.floating);
 788    assert(src_type.sign == dst_type.sign);
 789    assert(src_type.width == dst_type.width * 2);
 790    assert(src_type.length * 2 == dst_type.length);
 791
 792    clamp = TRUE;
 793
 794    /* All X86 SSE non-interleaved pack instructions take signed inputs and
 795     * saturate them, so no need to clamp for those cases. */
 796    if(util_cpu_caps.has_sse2 &&
 797       src_type.width * src_type.length >= 128 &&
 798       src_type.sign &&
 799       (src_type.width == 32 || src_type.width == 16))
 800       clamp = FALSE;
 801
 802    if(clamp) {
 803       struct lp_build_context bld;
 804       unsigned dst_bits = dst_type.sign ? dst_type.width - 1 : dst_type.width;
 805       LLVMValueRef dst_max = lp_build_const_int_vec(gallivm, src_type,
 806                                 ((unsigned long long)1 << dst_bits) - 1);
 807       lp_build_context_init(&bld, gallivm, src_type);
 808       lo = lp_build_min(&bld, lo, dst_max);
 809       hi = lp_build_min(&bld, hi, dst_max);
 810       /* FIXME: What about lower bound? */
 811    }
 812
 813    return lp_build_pack2(gallivm, src_type, dst_type, lo, hi);
 814 }
 815
 816
 817 /**
 818  * Truncate the bit width.
 819  *
 820  * TODO: Handle saturation consistently.
 821  */
 822 LLVMValueRef
 823 lp_build_pack(struct gallivm_state *gallivm,
 824               struct lp_type src_type,
 825               struct lp_type dst_type,
 826               boolean clamped,
 827               const LLVMValueRef *src, unsigned num_srcs)
 828 {
 829    LLVMValueRef (*pack2)(struct gallivm_state *gallivm,
 830                          struct lp_type src_type,
 831                          struct lp_type dst_type,
 832                          LLVMValueRef lo,
 833                          LLVMValueRef hi);
 834    LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
 835    unsigned i;
 836
 837    /* Register width must remain constant */
 838    assert(src_type.width * src_type.length == dst_type.width * dst_type.length);
 839
 840    /* We must not loose or gain channels. Only precision */
 841    assert(src_type.length * num_srcs == dst_type.length);
 842
 843    if(clamped)
 844       pack2 = &lp_build_pack2;
 845    else
 846       pack2 = &lp_build_packs2;
 847
 848    for(i = 0; i < num_srcs; ++i)
 849       tmp[i] = src[i];
 850
 851    while(src_type.width > dst_type.width) {
 852       struct lp_type tmp_type = src_type;
 853
 854       tmp_type.width /= 2;
 855       tmp_type.length *= 2;
 856
 857       /* Take in consideration the sign changes only in the last step */
 858       if(tmp_type.width == dst_type.width)
 859          tmp_type.sign = dst_type.sign;
 860
 861       num_srcs /= 2;
 862
 863       for(i = 0; i < num_srcs; ++i)
 864          tmp[i] = pack2(gallivm, src_type, tmp_type,
 865                         tmp[2*i + 0], tmp[2*i + 1]);
 866
 867       src_type = tmp_type;
 868    }
 869
 870    assert(num_srcs == 1);
 871
 872    return tmp[0];
 873 }
 874
 875
 876 /**
 877  * Truncate or expand the bitwidth.
 878  *
 879  * NOTE: Getting the right sign flags is crucial here, as we employ some
 880  * intrinsics that do saturation.
 881  */
 882 void
 883 lp_build_resize(struct gallivm_state *gallivm,
 884                 struct lp_type src_type,
 885                 struct lp_type dst_type,
 886                 const LLVMValueRef *src, unsigned num_srcs,
 887                 LLVMValueRef *dst, unsigned num_dsts)
 888 {
 889    LLVMBuilderRef builder = gallivm->builder;
 890    LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
 891    unsigned i;
 892
 893    /*
 894     * We don't support float <-> int conversion here. That must be done
 895     * before/after calling this function.
 896     */
 897    assert(src_type.floating == dst_type.floating);
 898
 899    /*
 900     * We don't support double <-> float conversion yet, although it could be
 901     * added with little effort.
 902     */
 903    assert((!src_type.floating && !dst_type.floating) ||
 904           src_type.width == dst_type.width);
 905
 906    /* We must not loose or gain channels. Only precision */
 907    assert(src_type.length * num_srcs == dst_type.length * num_dsts);
 908
 909    assert(src_type.length <= LP_MAX_VECTOR_LENGTH);
 910    assert(dst_type.length <= LP_MAX_VECTOR_LENGTH);
 911    assert(num_srcs <= LP_MAX_VECTOR_LENGTH);
 912    assert(num_dsts <= LP_MAX_VECTOR_LENGTH);
 913
 914    if (src_type.width > dst_type.width) {
 915       /*
 916        * Truncate bit width.
 917        */
 918
 919       /* Conversion must be M:1 */
 920       assert(num_dsts == 1);
 921
 922       if (src_type.width * src_type.length == dst_type.width * dst_type.length) {
 923         /*
 924          * Register width remains constant -- use vector packing intrinsics
 925          */
 926          tmp[0] = lp_build_pack(gallivm, src_type, dst_type, TRUE, src, num_srcs);
 927       }
 928       else {
 929          if (src_type.width / dst_type.width > num_srcs) {
 930             /*
 931             * First change src vectors size (with shuffle) so they have the
 932             * same size as the destination vector, then pack normally.
 933             * Note: cannot use cast/extract because llvm generates atrocious code.
 934             */
 935             unsigned size_ratio = (src_type.width * src_type.length) /
 936                                   (dst_type.length * dst_type.width);
 937             unsigned new_length = src_type.length / size_ratio;
 938
 939             for (i = 0; i < size_ratio * num_srcs; i++) {
 940                unsigned start_index = (i % size_ratio) * new_length;
 941                tmp[i] = lp_build_extract_range(gallivm, src[i / size_ratio],
 942                                                start_index, new_length);
 943             }
 944             num_srcs *= size_ratio;
 945             src_type.length = new_length;
 946             tmp[0] = lp_build_pack(gallivm, src_type, dst_type, TRUE, tmp, num_srcs);
 947          }
 948          else {
 949             /*
 950              * Truncate bit width but expand vector size - first pack
 951              * then expand simply because this should be more AVX-friendly
 952              * for the cases we probably hit.
 953              */
 954             unsigned size_ratio = (dst_type.width * dst_type.length) /
 955                                   (src_type.length * src_type.width);
 956             unsigned num_pack_srcs = num_srcs / size_ratio;
 957             dst_type.length = dst_type.length / size_ratio;
 958
 959             for (i = 0; i < size_ratio; i++) {
 960                tmp[i] = lp_build_pack(gallivm, src_type, dst_type, TRUE,
 961                                       &src[i*num_pack_srcs], num_pack_srcs);
 962             }
 963             tmp[0] = lp_build_concat(gallivm, tmp, dst_type, size_ratio);
 964          }
 965       }
 966    }
 967    else if (src_type.width < dst_type.width) {
 968       /*
 969        * Expand bit width.
 970        */
 971
 972       /* Conversion must be 1:N */
 973       assert(num_srcs == 1);
 974
 975       if (src_type.width * src_type.length == dst_type.width * dst_type.length) {
 976          /*
 977           * Register width remains constant -- use vector unpack intrinsics
 978           */
 979          lp_build_unpack(gallivm, src_type, dst_type, src[0], tmp, num_dsts);
 980       }
 981       else {
 982          /*
 983           * Do it element-wise.
 984           */
 985          assert(src_type.length * num_srcs == dst_type.length * num_dsts);
 986
 987          for (i = 0; i < num_dsts; i++) {
 988             tmp[i] = lp_build_undef(gallivm, dst_type);
 989          }
 990
 991          for (i = 0; i < src_type.length; ++i) {
 992             unsigned j = i / dst_type.length;
 993             LLVMValueRef srcindex = lp_build_const_int32(gallivm, i);
 994             LLVMValueRef dstindex = lp_build_const_int32(gallivm, i % dst_type.length);
 995             LLVMValueRef val = LLVMBuildExtractElement(builder, src[0], srcindex, "");
 996
 997             if (src_type.sign && dst_type.sign) {
 998                val = LLVMBuildSExt(builder, val, lp_build_elem_type(gallivm, dst_type), "");
 999             } else {
1000                val = LLVMBuildZExt(builder, val, lp_build_elem_type(gallivm, dst_type), "");
1001             }
1002             tmp[j] = LLVMBuildInsertElement(builder, tmp[j], val, dstindex, "");
1003          }
1004       }
1005    }
1006    else {
1007       /*
1008        * No-op
1009        */
1010
1011       /* "Conversion" must be N:N */
1012       assert(num_srcs == num_dsts);
1013
1014       for(i = 0; i < num_dsts; ++i)
1015          tmp[i] = src[i];
1016    }
1017
1018    for(i = 0; i < num_dsts; ++i)
1019       dst[i] = tmp[i];
1020 }
1021
1022
1023 /**
1024  * Expands src vector from src.length to dst_length
1025  */
1026 LLVMValueRef
1027 lp_build_pad_vector(struct gallivm_state *gallivm,
1028                     LLVMValueRef src,
1029                     unsigned dst_length)
1030 {
1031    LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
1032    LLVMValueRef undef;
1033    LLVMTypeRef type;
1034    unsigned i, src_length;
1035
1036    type = LLVMTypeOf(src);
1037
1038    if (LLVMGetTypeKind(type) != LLVMVectorTypeKind) {
1039       /* Can't use ShuffleVector on non-vector type */
1040       undef = LLVMGetUndef(LLVMVectorType(type, dst_length));
1041       return LLVMBuildInsertElement(gallivm->builder, undef, src, lp_build_const_int32(gallivm, 0), "");
1042    }
1043
1044    undef      = LLVMGetUndef(type);
1045    src_length = LLVMGetVectorSize(type);
1046
1047    assert(dst_length <= ARRAY_SIZE(elems));
1048    assert(dst_length >= src_length);
1049
1050    if (src_length == dst_length)
1051       return src;
1052
1053    /* All elements from src vector */
1054    for (i = 0; i < src_length; ++i)
1055       elems[i] = lp_build_const_int32(gallivm, i);
1056
1057    /* Undef fill remaining space */
1058    for (i = src_length; i < dst_length; ++i)
1059       elems[i] = lp_build_const_int32(gallivm, src_length);
1060
1061    /* Combine the two vectors */
1062    return LLVMBuildShuffleVector(gallivm->builder, src, undef, LLVMConstVector(elems, dst_length), "");
1063 }