1f21fa030077d9400de2daa097c626459ef83977
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_format_soa.c
1 /**************************************************************************
2 *
3 * Copyright 2009 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 #include "pipe/p_defines.h"
30
31 #include "util/u_format.h"
32 #include "util/u_memory.h"
33 #include "util/u_string.h"
34 #include "util/u_math.h"
35
36 #include "lp_bld_type.h"
37 #include "lp_bld_const.h"
38 #include "lp_bld_conv.h"
39 #include "lp_bld_swizzle.h"
40 #include "lp_bld_gather.h"
41 #include "lp_bld_debug.h"
42 #include "lp_bld_format.h"
43 #include "lp_bld_arit.h"
44 #include "lp_bld_pack.h"
45 #include "lp_bld_flow.h"
46 #include "lp_bld_printf.h"
47 #include "lp_bld_intr.h"
48
49 static void
50 convert_to_soa(struct gallivm_state *gallivm,
51 LLVMValueRef src_aos[LP_MAX_VECTOR_WIDTH / 32],
52 LLVMValueRef dst_soa[4],
53 const struct lp_type soa_type)
54 {
55 unsigned j, k;
56 struct lp_type aos_channel_type = soa_type;
57
58 LLVMValueRef aos_channels[4];
59 unsigned pixels_per_channel = soa_type.length / 4;
60
61 debug_assert((soa_type.length % 4) == 0);
62
63 aos_channel_type.length >>= 1;
64
65 for (j = 0; j < 4; ++j) {
66 LLVMValueRef channel[LP_MAX_VECTOR_LENGTH] = { 0 };
67
68 assert(pixels_per_channel <= LP_MAX_VECTOR_LENGTH);
69
70 for (k = 0; k < pixels_per_channel; ++k) {
71 channel[k] = src_aos[j + 4 * k];
72 }
73
74 aos_channels[j] = lp_build_concat(gallivm, channel, aos_channel_type, pixels_per_channel);
75 }
76
77 lp_build_transpose_aos(gallivm, soa_type, aos_channels, dst_soa);
78 }
79
80
81 void
82 lp_build_format_swizzle_soa(const struct util_format_description *format_desc,
83 struct lp_build_context *bld,
84 const LLVMValueRef *unswizzled,
85 LLVMValueRef swizzled_out[4])
86 {
87 if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
88 enum pipe_swizzle swizzle;
89 LLVMValueRef depth_or_stencil;
90
91 if (util_format_has_stencil(format_desc) &&
92 !util_format_has_depth(format_desc)) {
93 assert(!bld->type.floating);
94 swizzle = format_desc->swizzle[1];
95 }
96 else {
97 assert(bld->type.floating);
98 swizzle = format_desc->swizzle[0];
99 }
100 /*
101 * Return zzz1 or sss1 for depth-stencil formats here.
102 * Correct swizzling will be handled by apply_sampler_swizzle() later.
103 */
104 depth_or_stencil = lp_build_swizzle_soa_channel(bld, unswizzled, swizzle);
105
106 swizzled_out[2] = swizzled_out[1] = swizzled_out[0] = depth_or_stencil;
107 swizzled_out[3] = bld->one;
108 }
109 else {
110 unsigned chan;
111 for (chan = 0; chan < 4; ++chan) {
112 enum pipe_swizzle swizzle = format_desc->swizzle[chan];
113 swizzled_out[chan] = lp_build_swizzle_soa_channel(bld, unswizzled, swizzle);
114 }
115 }
116 }
117
118
119
/**
 * Extract a single channel from a vector of packed pixel blocks and convert
 * it to the element type of @bld (one value per SoA lane).
 *
 * \param bld        context describing the destination SoA type
 * \param blockbits  total bit width of the packed block (used to skip the
 *                   masking step when the channel already occupies the MSBs)
 * \param srgb_chan  TRUE to apply sRGB->linear conversion (unsigned only)
 * \param chan_desc  channel description: type, size, shift, normalized, ...
 * \param packed     vector of packed blocks, one per SoA lane
 * \return           the decoded channel values (bld->undef for VOID channels)
 */
static LLVMValueRef
lp_build_extract_soa_chan(struct lp_build_context *bld,
                          unsigned blockbits,
                          boolean srgb_chan,
                          struct util_format_channel_description chan_desc,
                          LLVMValueRef packed)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type type = bld->type;
   LLVMValueRef input = packed;
   const unsigned width = chan_desc.size;
   const unsigned start = chan_desc.shift;
   const unsigned stop = start + width;

   /* Decode the input vector component */

   switch(chan_desc.type) {
   case UTIL_FORMAT_TYPE_VOID:
      /* Padding channel (e.g. X in RGBX): no defined value. */
      input = bld->undef;
      break;

   case UTIL_FORMAT_TYPE_UNSIGNED:
      /*
       * Align the LSB
       */
      if (start) {
         input = LLVMBuildLShr(builder, input,
                               lp_build_const_int_vec(gallivm, type, start), "");
      }

      /*
       * Zero the MSBs
       */
      if (stop < blockbits) {
         unsigned mask = ((unsigned long long)1 << width) - 1;
         input = LLVMBuildAnd(builder, input,
                              lp_build_const_int_vec(gallivm, type, mask), "");
      }

      /*
       * Type conversion
       */
      if (type.floating) {
         if (srgb_chan) {
            /* sRGB channels are always normalized; decode via LUT/poly. */
            struct lp_type conv_type = lp_uint_type(type);
            input = lp_build_srgb_to_linear(gallivm, conv_type, width, input);
         }
         else {
            if(chan_desc.normalized)
               input = lp_build_unsigned_norm_to_float(gallivm, width, type, input);
            else
               /*
                * NOTE(review): SIToFP on unsigned data — correct while the
                * masked value fits in 31 bits; a full 32-bit unsigned scaled
                * channel would convert wrongly. Presumably such formats
                * don't reach this path — confirm with callers.
                */
               input = LLVMBuildSIToFP(builder, input, bld->vec_type, "");
         }
      }
      else if (chan_desc.pure_integer) {
         /* Nothing to do */
      } else {
         /* FIXME */
         assert(0);
      }
      break;

   case UTIL_FORMAT_TYPE_SIGNED:
      /*
       * Align the sign bit first.
       */
      if (stop < type.width) {
         unsigned bits = type.width - stop;
         LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits);
         input = LLVMBuildShl(builder, input, bits_val, "");
      }

      /*
       * Align the LSB (with an arithmetic shift to preserve the sign)
       */
      if (chan_desc.size < type.width) {
         unsigned bits = type.width - chan_desc.size;
         LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits);
         input = LLVMBuildAShr(builder, input, bits_val, "");
      }

      /*
       * Type conversion
       */
      if (type.floating) {
         input = LLVMBuildSIToFP(builder, input, bld->vec_type, "");
         if (chan_desc.normalized) {
            /* Scale so the largest positive value maps to exactly 1.0. */
            double scale = 1.0 / ((1 << (chan_desc.size - 1)) - 1);
            LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale);
            input = LLVMBuildFMul(builder, input, scale_val, "");
            /*
             * The formula above will produce value below -1.0 for most negative
             * value but everything seems happy with that hence disable for now.
             */
            if (0)
               input = lp_build_max(bld, input,
                                    lp_build_const_vec(gallivm, type, -1.0f));
         }
      }
      else if (chan_desc.pure_integer) {
         /* Nothing to do */
      } else {
         /* FIXME */
         assert(0);
      }
      break;

   case UTIL_FORMAT_TYPE_FLOAT:
      if (type.floating) {
         if (chan_desc.size == 16) {
            /* Isolate the 16 channel bits, then expand half -> float. */
            struct lp_type f16i_type = type;
            f16i_type.width /= 2;
            f16i_type.floating = 0;
            if (start) {
               input = LLVMBuildLShr(builder, input,
                                     lp_build_const_int_vec(gallivm, type, start), "");
            }
            input = LLVMBuildTrunc(builder, input,
                                   lp_build_vec_type(gallivm, f16i_type), "");
            input = lp_build_half_to_float(gallivm, input);
         } else {
            assert(start == 0);
            assert(stop == 32);
            assert(type.width == 32);
         }
         /* Reinterpret the (integer-typed) bits as float. */
         input = LLVMBuildBitCast(builder, input, bld->vec_type, "");
      }
      else {
         /* FIXME */
         assert(0);
         input = bld->undef;
      }
      break;

   case UTIL_FORMAT_TYPE_FIXED:
      if (type.floating) {
         /*
          * NOTE(review): the scale uses (1 << size/2) - 1; for classic
          * 16.16 fixed point one would expect a plain 1/(1 << size/2)
          * divisor — confirm against the formats that hit this path.
          */
         double scale = 1.0 / ((1 << (chan_desc.size/2)) - 1);
         LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale);
         input = LLVMBuildSIToFP(builder, input, bld->vec_type, "");
         input = LLVMBuildFMul(builder, input, scale_val, "");
      }
      else {
         /* FIXME */
         assert(0);
         input = bld->undef;
      }
      break;

   default:
      assert(0);
      input = bld->undef;
      break;
   }

   return input;
}
277
278
279 /**
280 * Unpack several pixels in SoA.
281 *
282 * It takes a vector of packed pixels:
283 *
284 * packed = {P0, P1, P2, P3, ..., Pn}
285 *
286 * And will produce four vectors:
287 *
288 * red = {R0, R1, R2, R3, ..., Rn}
289 * green = {G0, G1, G2, G3, ..., Gn}
290 * blue = {B0, B1, B2, B3, ..., Bn}
291 * alpha = {A0, A1, A2, A3, ..., An}
292 *
293 * It requires that a packed pixel fits into an element of the output
294 * channels. The common case is when converting pixel with a depth of 32 bit or
295 * less into floats.
296 *
297 * \param format_desc the format of the 'packed' incoming pixel vector
298 * \param type the desired type for rgba_out (type.length = n, above)
299 * \param packed the incoming vector of packed pixels
300 * \param rgba_out returns the SoA R,G,B,A vectors
301 */
302 void
303 lp_build_unpack_rgba_soa(struct gallivm_state *gallivm,
304 const struct util_format_description *format_desc,
305 struct lp_type type,
306 LLVMValueRef packed,
307 LLVMValueRef rgba_out[4])
308 {
309 struct lp_build_context bld;
310 LLVMValueRef inputs[4];
311 unsigned chan;
312
313 assert(format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
314 assert(format_desc->block.width == 1);
315 assert(format_desc->block.height == 1);
316 assert(format_desc->block.bits <= type.width);
317 /* FIXME: Support more output types */
318 assert(type.width == 32);
319
320 lp_build_context_init(&bld, gallivm, type);
321
322 /* Decode the input vector components */
323 for (chan = 0; chan < format_desc->nr_channels; ++chan) {
324 struct util_format_channel_description chan_desc = format_desc->channel[chan];
325 boolean srgb_chan = FALSE;
326
327 if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB &&
328 format_desc->swizzle[3] != chan) {
329 srgb_chan = TRUE;
330 }
331
332 inputs[chan] = lp_build_extract_soa_chan(&bld,
333 format_desc->block.bits,
334 srgb_chan,
335 chan_desc,
336 packed);
337 }
338
339 lp_build_format_swizzle_soa(format_desc, &bld, inputs, rgba_out);
340 }
341
342
343 /**
344 * Convert a vector of rgba8 values into 32bit wide SoA vectors.
345 *
346 * \param dst_type The desired return type. For pure integer formats
347 * this should be a 32bit wide int or uint vector type,
348 * otherwise a float vector type.
349 *
350 * \param packed The rgba8 values to pack.
351 *
352 * \param rgba The 4 SoA return vectors.
353 */
354 void
355 lp_build_rgba8_to_fi32_soa(struct gallivm_state *gallivm,
356 struct lp_type dst_type,
357 LLVMValueRef packed,
358 LLVMValueRef *rgba)
359 {
360 LLVMBuilderRef builder = gallivm->builder;
361 LLVMValueRef mask = lp_build_const_int_vec(gallivm, dst_type, 0xff);
362 unsigned chan;
363
364 /* XXX technically shouldn't use that for uint dst_type */
365 packed = LLVMBuildBitCast(builder, packed,
366 lp_build_int_vec_type(gallivm, dst_type), "");
367
368 /* Decode the input vector components */
369 for (chan = 0; chan < 4; ++chan) {
370 #ifdef PIPE_ARCH_LITTLE_ENDIAN
371 unsigned start = chan*8;
372 #else
373 unsigned start = (3-chan)*8;
374 #endif
375 unsigned stop = start + 8;
376 LLVMValueRef input;
377
378 input = packed;
379
380 if (start)
381 input = LLVMBuildLShr(builder, input,
382 lp_build_const_int_vec(gallivm, dst_type, start), "");
383
384 if (stop < 32)
385 input = LLVMBuildAnd(builder, input, mask, "");
386
387 if (dst_type.floating)
388 input = lp_build_unsigned_norm_to_float(gallivm, 8, dst_type, input);
389
390 rgba[chan] = input;
391 }
392 }
393
394
395
/**
 * Fetch texels from a texture, returning them in SoA layout.
 *
 * Tries a series of fast paths (single-word gather, multi-word gather with
 * SoA shuffles, special packed-float formats, 64bit Z/S) before falling
 * back to per-pixel AoS fetches.
 *
 * \param type the desired return type for 'rgba'. The vector length
 *             is the number of texels to fetch
 * \param aligned if the offset is guaranteed to be aligned to element width
 *
 * \param base_ptr points to the base of the texture mip tree.
 * \param offset offset to start of the texture image block. For non-
 *               compressed formats, this simply is an offset to the texel.
 *               For compressed formats, it is an offset to the start of the
 *               compressed data block.
 *
 * \param i, j the sub-block pixel coordinates. For non-compressed formats
 *             these will always be (0,0). For compressed formats, i will
 *             be in [0, block_width-1] and j will be in [0, block_height-1].
 * \param cache optional value pointing to a lp_build_format_cache structure
 */
void
lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
                        const struct util_format_description *format_desc,
                        struct lp_type type,
                        boolean aligned,
                        LLVMValueRef base_ptr,
                        LLVMValueRef offset,
                        LLVMValueRef i,
                        LLVMValueRef j,
                        LLVMValueRef cache,
                        LLVMValueRef rgba_out[4])
{
   LLVMBuilderRef builder = gallivm->builder;
   enum pipe_format format = format_desc->format;
   struct lp_type fetch_type;

   /* Fast path 1: whole packed pixel fits in one destination element. */
   if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
       (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB ||
        format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB ||
        format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) &&
       format_desc->block.width == 1 &&
       format_desc->block.height == 1 &&
       format_desc->block.bits <= type.width &&
       (format_desc->channel[0].type != UTIL_FORMAT_TYPE_FLOAT ||
        format_desc->channel[0].size == 32 ||
        format_desc->channel[0].size == 16))
   {
      /*
       * The packed pixel fits into an element of the destination format. Put
       * the packed pixels into a vector and extract each component for all
       * vector elements in parallel.
       */

      LLVMValueRef packed;

      /*
       * gather the texels from the texture
       * Ex: packed = {XYZW, XYZW, XYZW, XYZW}
       */
      assert(format_desc->block.bits <= type.width);
      fetch_type = lp_type_uint(type.width);
      packed = lp_build_gather(gallivm,
                               type.length,
                               format_desc->block.bits,
                               fetch_type,
                               aligned,
                               base_ptr, offset, FALSE);

      /*
       * convert texels to float rgba
       */
      lp_build_unpack_rgba_soa(gallivm,
                               format_desc,
                               type,
                               packed, rgba_out);
      return;
   }


   /* Fast path 2: packed pixel is wider than one destination element. */
   if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
       (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB) &&
       format_desc->block.width == 1 &&
       format_desc->block.height == 1 &&
       format_desc->block.bits > type.width &&
       ((format_desc->block.bits <= type.width * type.length &&
         format_desc->channel[0].size <= type.width) ||
        (format_desc->channel[0].size == 64 &&
         format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT &&
         type.floating)))
   {
      /*
       * Similar to above, but the packed pixel is larger than what fits
       * into an element of the destination format. The packed pixels will be
       * shuffled into SoA vectors appropriately, and then the extraction will
       * be done in parallel as much as possible.
       * Good for 16xn (n > 2) and 32xn (n > 1) formats, care is taken so
       * the gathered vectors can be shuffled easily (even with avx).
       * 64xn float -> 32xn float is handled too but it's a bit special as
       * it does the conversion pre-shuffle.
       */

      LLVMValueRef packed[4], dst[4], output[4], shuffles[LP_MAX_VECTOR_WIDTH/32];
      struct lp_type fetch_type, gather_type = type;
      /* NB: i, j here shadow the (unused in this path) parameters. */
      unsigned num_gather, fetch_width, i, j;
      struct lp_build_context bld;
      boolean fp64 = format_desc->channel[0].size == 64;

      lp_build_context_init(&bld, gallivm, type);

      assert(type.width == 32);
      assert(format_desc->block.bits > type.width);

      /*
       * First, figure out fetch order.
       */
      fetch_width = util_next_power_of_two(format_desc->block.bits);
      /*
       * fp64 are treated like fp32 except we fetch twice wide values
       * (as we shuffle after trunc). The shuffles for that work out
       * mostly fine (slightly suboptimal for 4-wide, perfect for AVX)
       * albeit we miss the potential opportunity for hw gather (as it
       * only handles native size).
       */
      num_gather = fetch_width / type.width;
      gather_type.width *= num_gather;
      if (fp64) {
         num_gather /= 2;
      }
      gather_type.length /= num_gather;

      for (i = 0; i < num_gather; i++) {
         LLVMValueRef offsetr, shuf_vec;
         /* Pick the offsets for this gather pass out of the offset vector. */
         if(num_gather == 4) {
            for (j = 0; j < gather_type.length; j++) {
               unsigned idx = i + 4*j;
               shuffles[j] = lp_build_const_int32(gallivm, idx);
            }
            shuf_vec = LLVMConstVector(shuffles, gather_type.length);
            offsetr = LLVMBuildShuffleVector(builder, offset, offset, shuf_vec, "");

         }
         else if (num_gather == 2) {
            assert(num_gather == 2);
            for (j = 0; j < gather_type.length; j++) {
               unsigned idx = i*2 + (j%2) + (j/2)*4;
               shuffles[j] = lp_build_const_int32(gallivm, idx);
            }
            shuf_vec = LLVMConstVector(shuffles, gather_type.length);
            offsetr = LLVMBuildShuffleVector(builder, offset, offset, shuf_vec, "");
         }
         else {
            assert(num_gather == 1);
            offsetr = offset;
         }
         if (gather_type.length == 1) {
            LLVMValueRef zero = lp_build_const_int32(gallivm, 0);
            offsetr = LLVMBuildExtractElement(builder, offsetr, zero, "");
         }

         /*
          * Determine whether to use float or int loads. This is mostly
          * to outsmart the (stupid) llvm int/float shuffle logic, we
          * don't really care much if the data is floats or ints...
          * But llvm will refuse to use single float shuffle with int data
          * and instead use 3 int shuffles instead, the code looks atrocious.
          * (Note bitcasts often won't help, as llvm is too smart to be
          * fooled by that.)
          * Nobody cares about simd float<->int domain transition penalties,
          * which usually don't even exist for shuffles anyway.
          * With 4x32bit (and 3x32bit) fetch, we use float vec (the data is
          * going into transpose, which is unpacks, so doesn't really matter
          * much).
          * With 2x32bit or 4x16bit fetch, we use float vec, since those
          * go into the weird channel separation shuffle. With floats,
          * this is (with 128bit vectors):
          * - 2 movq, 2 movhpd, 2 shufps
          * With ints it would be:
          * - 4 movq, 2 punpcklqdq, 4 pshufd, 2 blendw
          * I've seen texture functions increase in code size by 15% just due
          * to that (there's lots of such fetches in them...)
          * (We could chose a different gather order to improve this somewhat
          * for the int path, but it would basically just drop the blends,
          * so the float path with this order really is optimal.)
          * Albeit it is tricky sometimes llvm doesn't ignore the float->int
          * casts so must avoid them until we're done with the float shuffle...
          * 3x16bit formats (the same is also true for 3x8) are pretty bad but
          * there's nothing we can do about them (we could overallocate by
          * those couple bytes and use unaligned but pot sized load).
          * Note that this is very much x86 specific. I don't know if this
          * affect other archs at all.
          */
         if (num_gather > 1) {
            /*
             * We always want some float type here (with x86)
             * due to shuffles being float ones afterwards (albeit for
             * the num_gather == 4 case int should work fine too
             * (unless there's some problems with avx but not avx2).
             */
            if (format_desc->channel[0].size == 64) {
               fetch_type = lp_type_float_vec(64, gather_type.width);
            } else {
               fetch_type = lp_type_int_vec(32, gather_type.width);
            }
         }
         else {
            /* type doesn't matter much */
            if (format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT &&
                (format_desc->channel[0].size == 32 ||
                 format_desc->channel[0].size == 64)) {
               fetch_type = lp_type_float(gather_type.width);
            } else {
               fetch_type = lp_type_uint(gather_type.width);
            }
         }

         /* Now finally gather the values */
         packed[i] = lp_build_gather(gallivm, gather_type.length,
                                     format_desc->block.bits,
                                     fetch_type, aligned,
                                     base_ptr, offsetr, FALSE);
         if (fp64) {
            /* Convert double -> float before the SoA shuffles. */
            struct lp_type conv_type = type;
            conv_type.width *= 2;
            packed[i] = LLVMBuildBitCast(builder, packed[i],
                                         lp_build_vec_type(gallivm, conv_type), "");
            packed[i] = LLVMBuildFPTrunc(builder, packed[i], bld.vec_type, "");
         }
      }

      /* shuffle the gathered values to SoA */
      if (num_gather == 2) {
         for (i = 0; i < num_gather; i++) {
            for (j = 0; j < type.length; j++) {
               unsigned idx = (j%2)*2 + (j/4)*4 + i;
               if ((j/2)%2)
                  idx += type.length;
               shuffles[j] = lp_build_const_int32(gallivm, idx);
            }
            dst[i] = LLVMBuildShuffleVector(builder, packed[0], packed[1],
                                            LLVMConstVector(shuffles, type.length), "");
         }
      }
      else if (num_gather == 4) {
         lp_build_transpose_aos(gallivm, lp_int_type(type), packed, dst);
      }
      else {
         assert(num_gather == 1);
         dst[0] = packed[0];
      }

      /*
       * And finally unpack exactly as above, except that
       * chan shift is adjusted and the right vector selected.
       */
      if (!fp64) {
         for (i = 0; i < num_gather; i++) {
            dst[i] = LLVMBuildBitCast(builder, dst[i], bld.int_vec_type, "");
         }
         for (i = 0; i < format_desc->nr_channels; i++) {
            struct util_format_channel_description chan_desc = format_desc->channel[i];
            unsigned blockbits = type.width;
            unsigned vec_nr;

            /* Which gathered 32bit word holds this channel. */
#ifdef PIPE_ARCH_BIG_ENDIAN
            vec_nr = (format_desc->block.bits - (chan_desc.shift + chan_desc.size)) / type.width;
#else
            vec_nr = chan_desc.shift / type.width;
#endif
            chan_desc.shift %= type.width;

            output[i] = lp_build_extract_soa_chan(&bld,
                                                  blockbits,
                                                  FALSE,
                                                  chan_desc,
                                                  dst[vec_nr]);
         }
      }
      else {
         /* fp64 was already converted to float pre-shuffle. */
         for (i = 0; i < format_desc->nr_channels; i++) {
            output[i] = dst[i];
         }
      }

      lp_build_format_swizzle_soa(format_desc, &bld, output, rgba_out);
      return;
   }

   /* Special packed-float formats needing dedicated unpack code. */
   if (format == PIPE_FORMAT_R11G11B10_FLOAT ||
       format == PIPE_FORMAT_R9G9B9E5_FLOAT) {
      /*
       * similar conceptually to above but requiring special
       * AoS packed -> SoA float conversion code.
       */
      LLVMValueRef packed;
      struct lp_type fetch_type = lp_type_uint(type.width);

      assert(type.floating);
      assert(type.width == 32);

      packed = lp_build_gather(gallivm, type.length,
                               format_desc->block.bits,
                               fetch_type, aligned,
                               base_ptr, offset, FALSE);
      if (format == PIPE_FORMAT_R11G11B10_FLOAT) {
         lp_build_r11g11b10_to_float(gallivm, packed, rgba_out);
      }
      else {
         lp_build_rgb9e5_to_float(gallivm, packed, rgba_out);
      }
      return;
   }

   if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS &&
       format_desc->block.bits == 64) {
      /*
       * special case the format is 64 bits but we only require
       * 32bit (or 8bit) from each block.
       */
      LLVMValueRef packed;
      struct lp_type fetch_type = lp_type_uint(type.width);

      if (format == PIPE_FORMAT_X32_S8X24_UINT) {
         /*
          * for stencil simply fix up offsets - could in fact change
          * base_ptr instead even outside the shader.
          */
         unsigned mask = (1 << 8) - 1;
         LLVMValueRef s_offset = lp_build_const_int_vec(gallivm, type, 4);
         offset = LLVMBuildAdd(builder, offset, s_offset, "");
         packed = lp_build_gather(gallivm, type.length, 32, fetch_type,
                                  aligned, base_ptr, offset, FALSE);
         packed = LLVMBuildAnd(builder, packed,
                               lp_build_const_int_vec(gallivm, type, mask), "");
      }
      else {
         assert (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT);
         packed = lp_build_gather(gallivm, type.length, 32, fetch_type,
                                  aligned, base_ptr, offset, TRUE);
         packed = LLVMBuildBitCast(builder, packed,
                                   lp_build_vec_type(gallivm, type), "");
      }
      /* for consistency with lp_build_unpack_rgba_soa() return sss1 or zzz1 */
      rgba_out[0] = rgba_out[1] = rgba_out[2] = packed;
      rgba_out[3] = lp_build_const_vec(gallivm, type, 1.0f);
      return;
   }

   /*
    * Try calling lp_build_fetch_rgba_aos for all pixels.
    * Should only really hit subsampled, compressed
    * (for s3tc srgb too, for rgtc the unorm ones only) by now.
    * (This is invalid for plain 8unorm formats because we're lazy with
    * the swizzle since some results would arrive swizzled, some not.)
    */

   if ((format_desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) &&
       (util_format_fits_8unorm(format_desc) ||
        format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) &&
       type.floating && type.width == 32 &&
       (type.length == 1 || (type.length % 4 == 0))) {
      struct lp_type tmp_type;
      struct lp_build_context bld;
      LLVMValueRef packed, rgba[4];
      const struct util_format_description *flinear_desc;
      const struct util_format_description *frgba8_desc;
      unsigned chan;

      lp_build_context_init(&bld, gallivm, type);

      /*
       * Make sure the conversion in aos really only does convert to rgba8
       * and not anything more (so use linear format, adjust type).
       */
      flinear_desc = util_format_description(util_format_linear(format));
      memset(&tmp_type, 0, sizeof tmp_type);
      tmp_type.width = 8;
      tmp_type.length = type.length * 4;
      tmp_type.norm = TRUE;

      packed = lp_build_fetch_rgba_aos(gallivm, flinear_desc, tmp_type,
                                       aligned, base_ptr, offset, i, j, cache);
      packed = LLVMBuildBitCast(builder, packed, bld.int_vec_type, "");

      /*
       * The values are now packed so they match ordinary (srgb) RGBA8 format,
       * hence need to use matching format for unpack.
       */
      frgba8_desc = util_format_description(PIPE_FORMAT_R8G8B8A8_UNORM);
      if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) {
         assert(format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC);
         frgba8_desc = util_format_description(PIPE_FORMAT_R8G8B8A8_SRGB);
      }
      lp_build_unpack_rgba_soa(gallivm,
                               frgba8_desc,
                               type,
                               packed, rgba);

      /*
       * We converted 4 channels. Make sure llvm can drop unneeded ones
       * (luckily the rgba order is fixed, only LA needs special case).
       */
      for (chan = 0; chan < 4; chan++) {
         enum pipe_swizzle swizzle = format_desc->swizzle[chan];
         if (chan == 3 && util_format_is_luminance_alpha(format)) {
            swizzle = PIPE_SWIZZLE_W;
         }
         rgba_out[chan] = lp_build_swizzle_soa_channel(&bld, rgba, swizzle);
      }
      return;
   }


   /*
    * Fallback to calling lp_build_fetch_rgba_aos for each pixel.
    *
    * This is not the most efficient way of fetching pixels, as we
    * miss some opportunities to do vectorization, but this is
    * convenient for formats or scenarios for which there was no
    * opportunity or incentive to optimize.
    *
    * We do NOT want to end up here, this typically is quite terrible,
    * in particular if the formats have less than 4 channels.
    *
    * Right now, this should only be hit for:
    * - RGTC snorm formats
    *   (those miss fast fetch functions hence they are terrible anyway)
    */

   {
      unsigned k;
      struct lp_type tmp_type;
      LLVMValueRef aos_fetch[LP_MAX_VECTOR_WIDTH / 32];

      if (gallivm_debug & GALLIVM_DEBUG_PERF) {
         debug_printf("%s: AoS fetch fallback for %s\n",
                      __FUNCTION__, format_desc->short_name);
      }

      tmp_type = type;
      tmp_type.length = 4;

      /*
       * Note that vector transpose can be worse compared to insert/extract
       * for aos->soa conversion (for formats with 1 or 2 channels). However,
       * we should try to avoid getting here for just about all formats, so
       * don't bother.
       */

      /* loop over number of pixels */
      for(k = 0; k < type.length; ++k) {
         LLVMValueRef index = lp_build_const_int32(gallivm, k);
         LLVMValueRef offset_elem;
         LLVMValueRef i_elem, j_elem;

         offset_elem = LLVMBuildExtractElement(builder, offset,
                                               index, "");

         i_elem = LLVMBuildExtractElement(builder, i, index, "");
         j_elem = LLVMBuildExtractElement(builder, j, index, "");

         /* Get a single float[4]={R,G,B,A} pixel */
         aos_fetch[k] = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type,
                                                aligned, base_ptr, offset_elem,
                                                i_elem, j_elem, cache);

      }
      convert_to_soa(gallivm, aos_fetch, rgba_out, type);
   }
}
863
864 static void
865 lp_build_insert_soa_chan(struct lp_build_context *bld,
866 unsigned blockbits,
867 struct util_format_channel_description chan_desc,
868 LLVMValueRef *output,
869 LLVMValueRef rgba)
870 {
871 struct gallivm_state *gallivm = bld->gallivm;
872 LLVMBuilderRef builder = gallivm->builder;
873 struct lp_type type = bld->type;
874 const unsigned width = chan_desc.size;
875 const unsigned start = chan_desc.shift;
876 const unsigned stop = start + width;
877 LLVMValueRef chan;
878 switch(chan_desc.type) {
879 case UTIL_FORMAT_TYPE_UNSIGNED:
880
881 if (chan_desc.pure_integer)
882 chan = LLVMBuildBitCast(builder, rgba, bld->int_vec_type, "");
883 else if (type.floating) {
884 if (chan_desc.normalized)
885 chan = lp_build_clamped_float_to_unsigned_norm(gallivm, type, width, rgba);
886 else
887 chan = LLVMBuildFPToSI(builder, rgba, bld->vec_type, "");
888 }
889 if (start)
890 chan = LLVMBuildShl(builder, chan,
891 lp_build_const_int_vec(gallivm, type, start), "");
892 if (!*output)
893 *output = chan;
894 else
895 *output = LLVMBuildOr(builder, *output, chan, "");
896 break;
897 case UTIL_FORMAT_TYPE_SIGNED:
898 if (chan_desc.pure_integer)
899 chan = LLVMBuildBitCast(builder, rgba, bld->int_vec_type, "");
900 else if (type.floating) {
901 uint32_t mask_val = (1UL << chan_desc.size) - 1;
902 if (chan_desc.normalized) {
903 char intrin[32];
904 double scale = ((1 << (chan_desc.size - 1)) - 1);
905 LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale);
906 rgba = lp_build_clamp(bld, rgba, lp_build_negate(bld, bld->one), bld->one);
907 rgba = LLVMBuildFMul(builder, rgba, scale_val, "");
908 lp_format_intrinsic(intrin, sizeof intrin, "llvm.rint", bld->vec_type);
909 rgba = lp_build_intrinsic_unary(builder, intrin, bld->vec_type, rgba);
910 }
911 chan = LLVMBuildFPToSI(builder, rgba, bld->int_vec_type, "");
912 chan = LLVMBuildAnd(builder, chan, lp_build_const_int_vec(gallivm, type, mask_val), "");
913 }
914 if (start)
915 chan = LLVMBuildShl(builder, chan,
916 lp_build_const_int_vec(gallivm, type, start), "");
917 if (!*output)
918 *output = chan;
919 else
920 *output = LLVMBuildOr(builder, *output, chan, "");
921 break;
922 case UTIL_FORMAT_TYPE_FLOAT:
923 if (type.floating) {
924 if (chan_desc.size == 16) {
925 chan = lp_build_float_to_half(gallivm, rgba);
926 chan = LLVMBuildZExt(builder, chan, bld->int_vec_type, "");
927 if (start)
928 chan = LLVMBuildShl(builder, chan,
929 lp_build_const_int_vec(gallivm, type, start), "");
930 if (!*output)
931 *output = chan;
932 else
933 *output = LLVMBuildOr(builder, *output, chan, "");
934 } else {
935 assert(start == 0);
936 assert(stop == 32);
937 assert(type.width == 32);
938 *output = LLVMBuildBitCast(builder, rgba, bld->int_vec_type, "");
939 }
940 } else
941 assert(0);
942 break;
943 default:
944 assert(0);
945 *output = bld->undef;
946 }
947 }
948
949 static void
950 lp_build_pack_rgba_soa(struct gallivm_state *gallivm,
951 const struct util_format_description *format_desc,
952 struct lp_type type,
953 const LLVMValueRef rgba_in[4],
954 LLVMValueRef *packed)
955 {
956 unsigned chan;
957 struct lp_build_context bld;
958 assert(format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
959 assert(format_desc->block.width == 1);
960 assert(format_desc->block.height == 1);
961 assert(format_desc->block.bits <= type.width);
962 /* FIXME: Support more output types */
963 assert(type.width == 32);
964
965 lp_build_context_init(&bld, gallivm, type);
966 for (chan = 0; chan < format_desc->nr_channels; ++chan) {
967 struct util_format_channel_description chan_desc = format_desc->channel[chan];
968
969 lp_build_insert_soa_chan(&bld, format_desc->block.bits,
970 chan_desc,
971 packed,
972 rgba_in[chan]);
973 }
974 }
975
976 void
977 lp_build_store_rgba_soa(struct gallivm_state *gallivm,
978 const struct util_format_description *format_desc,
979 struct lp_type type,
980 LLVMValueRef exec_mask,
981 LLVMValueRef base_ptr,
982 LLVMValueRef offset,
983 LLVMValueRef out_of_bounds,
984 const LLVMValueRef rgba_in[4])
985 {
986 enum pipe_format format = format_desc->format;
987 LLVMValueRef packed[4];
988 unsigned num_stores;
989
990 memset(packed, 0, sizeof(LLVMValueRef) * 4);
991 if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
992 format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB &&
993 format_desc->block.width == 1 &&
994 format_desc->block.height == 1 &&
995 format_desc->block.bits <= type.width &&
996 (format_desc->channel[0].type != UTIL_FORMAT_TYPE_FLOAT ||
997 format_desc->channel[0].size == 32 ||
998 format_desc->channel[0].size == 16))
999 {
1000 lp_build_pack_rgba_soa(gallivm, format_desc, type, rgba_in, &packed[0]);
1001
1002 num_stores = 1;
1003 } else if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
1004 (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB) &&
1005 format_desc->block.width == 1 &&
1006 format_desc->block.height == 1 &&
1007 format_desc->block.bits > type.width &&
1008 ((format_desc->block.bits <= type.width * type.length &&
1009 format_desc->channel[0].size <= type.width) ||
1010 (format_desc->channel[0].size == 64 &&
1011 format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT &&
1012 type.floating)))
1013 {
1014 /*
1015 * Similar to above, but the packed pixel is larger than what fits
1016 * into an element of the destination format. The packed pixels will be
1017 * shuffled into SoA vectors appropriately, and then the extraction will
1018 * be done in parallel as much as possible.
1019 * Good for 16xn (n > 2) and 32xn (n > 1) formats, care is taken so
1020 * the gathered vectors can be shuffled easily (even with avx).
1021 * 64xn float -> 32xn float is handled too but it's a bit special as
1022 * it does the conversion pre-shuffle.
1023 */
1024 struct lp_build_context bld;
1025
1026 lp_build_context_init(&bld, gallivm, type);
1027 assert(type.width == 32);
1028 assert(format_desc->block.bits > type.width);
1029
1030 unsigned store_width = util_next_power_of_two(format_desc->block.bits);
1031 num_stores = store_width / type.width;
1032 for (unsigned i = 0; i < format_desc->nr_channels; i++) {
1033 struct util_format_channel_description chan_desc = format_desc->channel[i];
1034 unsigned blockbits = type.width;
1035 unsigned vec_nr;
1036
1037 vec_nr = chan_desc.shift / type.width;
1038 chan_desc.shift %= type.width;
1039
1040 lp_build_insert_soa_chan(&bld, blockbits,
1041 chan_desc,
1042 &packed[vec_nr],
1043 rgba_in[i]);
1044 }
1045
1046 assert(num_stores == 4 || num_stores == 2);
1047 /* we can transpose and store at the same time */
1048 } else if (format == PIPE_FORMAT_R11G11B10_FLOAT) {
1049 packed[0] = lp_build_float_to_r11g11b10(gallivm, rgba_in);
1050 num_stores = 1;
1051 } else
1052 assert(0);
1053
1054 assert(exec_mask);
1055
1056 LLVMTypeRef int32_ptr_type = LLVMPointerType(LLVMInt32TypeInContext(gallivm->context), 0);
1057 LLVMTypeRef int16_ptr_type = LLVMPointerType(LLVMInt16TypeInContext(gallivm->context), 0);
1058 LLVMTypeRef int8_ptr_type = LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0);
1059
1060 LLVMValueRef should_store_mask = LLVMBuildAnd(gallivm->builder, exec_mask, LLVMBuildNot(gallivm->builder, out_of_bounds, ""), "store_mask");
1061 should_store_mask = LLVMBuildICmp(gallivm->builder, LLVMIntNE, should_store_mask, lp_build_const_int_vec(gallivm, type, 0), "");
1062 for (unsigned i = 0; i < num_stores; i++) {
1063 struct lp_build_loop_state loop_state;
1064
1065 LLVMValueRef store_offset = LLVMBuildAdd(gallivm->builder, offset, lp_build_const_int_vec(gallivm, type, i * 4), "");
1066 store_offset = LLVMBuildGEP(gallivm->builder, base_ptr, &store_offset, 1, "");
1067
1068 lp_build_loop_begin(&loop_state, gallivm, lp_build_const_int32(gallivm, 0));
1069
1070 struct lp_build_if_state ifthen;
1071 LLVMValueRef cond = LLVMBuildExtractElement(gallivm->builder, should_store_mask, loop_state.counter, "");
1072 lp_build_if(&ifthen, gallivm, cond);
1073
1074 LLVMValueRef data = LLVMBuildExtractElement(gallivm->builder, packed[i], loop_state.counter, "");
1075 LLVMValueRef this_offset = LLVMBuildExtractElement(gallivm->builder, store_offset, loop_state.counter, "");
1076
1077 if (format_desc->block.bits == 8) {
1078 this_offset = LLVMBuildBitCast(gallivm->builder, this_offset, int8_ptr_type, "");
1079 data = LLVMBuildTrunc(gallivm->builder, data, LLVMInt8TypeInContext(gallivm->context), "");
1080 } else if (format_desc->block.bits == 16) {
1081 this_offset = LLVMBuildBitCast(gallivm->builder, this_offset, int16_ptr_type, "");
1082 data = LLVMBuildTrunc(gallivm->builder, data, LLVMInt16TypeInContext(gallivm->context), "");
1083 } else
1084 this_offset = LLVMBuildBitCast(gallivm->builder, this_offset, int32_ptr_type, "");
1085 LLVMBuildStore(gallivm->builder, data, this_offset);
1086 lp_build_endif(&ifthen);
1087 lp_build_loop_end_cond(&loop_state, lp_build_const_int32(gallivm, type.length),
1088 NULL, LLVMIntUGE);
1089 }
1090 }