iris: Don't enable smooth points when point sprites are enabled
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_format_soa.c
1 /**************************************************************************
2 *
3 * Copyright 2009 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 #include "pipe/p_defines.h"
30
31 #include "util/u_format.h"
32 #include "util/u_memory.h"
33 #include "util/u_string.h"
34 #include "util/u_math.h"
35
36 #include "lp_bld_type.h"
37 #include "lp_bld_const.h"
38 #include "lp_bld_conv.h"
39 #include "lp_bld_swizzle.h"
40 #include "lp_bld_gather.h"
41 #include "lp_bld_debug.h"
42 #include "lp_bld_format.h"
43 #include "lp_bld_arit.h"
44 #include "lp_bld_pack.h"
45
46
47 static void
48 convert_to_soa(struct gallivm_state *gallivm,
49 LLVMValueRef src_aos[LP_MAX_VECTOR_WIDTH / 32],
50 LLVMValueRef dst_soa[4],
51 const struct lp_type soa_type)
52 {
53 unsigned j, k;
54 struct lp_type aos_channel_type = soa_type;
55
56 LLVMValueRef aos_channels[4];
57 unsigned pixels_per_channel = soa_type.length / 4;
58
59 debug_assert((soa_type.length % 4) == 0);
60
61 aos_channel_type.length >>= 1;
62
63 for (j = 0; j < 4; ++j) {
64 LLVMValueRef channel[LP_MAX_VECTOR_LENGTH] = { 0 };
65
66 assert(pixels_per_channel <= LP_MAX_VECTOR_LENGTH);
67
68 for (k = 0; k < pixels_per_channel; ++k) {
69 channel[k] = src_aos[j + 4 * k];
70 }
71
72 aos_channels[j] = lp_build_concat(gallivm, channel, aos_channel_type, pixels_per_channel);
73 }
74
75 lp_build_transpose_aos(gallivm, soa_type, aos_channels, dst_soa);
76 }
77
78
79 void
80 lp_build_format_swizzle_soa(const struct util_format_description *format_desc,
81 struct lp_build_context *bld,
82 const LLVMValueRef *unswizzled,
83 LLVMValueRef swizzled_out[4])
84 {
85 if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
86 enum pipe_swizzle swizzle;
87 LLVMValueRef depth_or_stencil;
88
89 if (util_format_has_stencil(format_desc) &&
90 !util_format_has_depth(format_desc)) {
91 assert(!bld->type.floating);
92 swizzle = format_desc->swizzle[1];
93 }
94 else {
95 assert(bld->type.floating);
96 swizzle = format_desc->swizzle[0];
97 }
98 /*
99 * Return zzz1 or sss1 for depth-stencil formats here.
100 * Correct swizzling will be handled by apply_sampler_swizzle() later.
101 */
102 depth_or_stencil = lp_build_swizzle_soa_channel(bld, unswizzled, swizzle);
103
104 swizzled_out[2] = swizzled_out[1] = swizzled_out[0] = depth_or_stencil;
105 swizzled_out[3] = bld->one;
106 }
107 else {
108 unsigned chan;
109 for (chan = 0; chan < 4; ++chan) {
110 enum pipe_swizzle swizzle = format_desc->swizzle[chan];
111 swizzled_out[chan] = lp_build_swizzle_soa_channel(bld, unswizzled, swizzle);
112 }
113 }
114 }
115
116
117
118 static LLVMValueRef
119 lp_build_extract_soa_chan(struct lp_build_context *bld,
120 unsigned blockbits,
121 boolean srgb_chan,
122 struct util_format_channel_description chan_desc,
123 LLVMValueRef packed)
124 {
125 struct gallivm_state *gallivm = bld->gallivm;
126 LLVMBuilderRef builder = gallivm->builder;
127 struct lp_type type = bld->type;
128 LLVMValueRef input = packed;
129 const unsigned width = chan_desc.size;
130 const unsigned start = chan_desc.shift;
131 const unsigned stop = start + width;
132
133 /* Decode the input vector component */
134
135 switch(chan_desc.type) {
136 case UTIL_FORMAT_TYPE_VOID:
137 input = bld->undef;
138 break;
139
140 case UTIL_FORMAT_TYPE_UNSIGNED:
141 /*
142 * Align the LSB
143 */
144 if (start) {
145 input = LLVMBuildLShr(builder, input,
146 lp_build_const_int_vec(gallivm, type, start), "");
147 }
148
149 /*
150 * Zero the MSBs
151 */
152 if (stop < blockbits) {
153 unsigned mask = ((unsigned long long)1 << width) - 1;
154 input = LLVMBuildAnd(builder, input,
155 lp_build_const_int_vec(gallivm, type, mask), "");
156 }
157
158 /*
159 * Type conversion
160 */
161 if (type.floating) {
162 if (srgb_chan) {
163 struct lp_type conv_type = lp_uint_type(type);
164 input = lp_build_srgb_to_linear(gallivm, conv_type, width, input);
165 }
166 else {
167 if(chan_desc.normalized)
168 input = lp_build_unsigned_norm_to_float(gallivm, width, type, input);
169 else
170 input = LLVMBuildSIToFP(builder, input, bld->vec_type, "");
171 }
172 }
173 else if (chan_desc.pure_integer) {
174 /* Nothing to do */
175 } else {
176 /* FIXME */
177 assert(0);
178 }
179 break;
180
181 case UTIL_FORMAT_TYPE_SIGNED:
182 /*
183 * Align the sign bit first.
184 */
185 if (stop < type.width) {
186 unsigned bits = type.width - stop;
187 LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits);
188 input = LLVMBuildShl(builder, input, bits_val, "");
189 }
190
191 /*
192 * Align the LSB (with an arithmetic shift to preserve the sign)
193 */
194 if (chan_desc.size < type.width) {
195 unsigned bits = type.width - chan_desc.size;
196 LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits);
197 input = LLVMBuildAShr(builder, input, bits_val, "");
198 }
199
200 /*
201 * Type conversion
202 */
203 if (type.floating) {
204 input = LLVMBuildSIToFP(builder, input, bld->vec_type, "");
205 if (chan_desc.normalized) {
206 double scale = 1.0 / ((1 << (chan_desc.size - 1)) - 1);
207 LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale);
208 input = LLVMBuildFMul(builder, input, scale_val, "");
209 /*
210 * The formula above will produce value below -1.0 for most negative
211 * value but everything seems happy with that hence disable for now.
212 */
213 if (0)
214 input = lp_build_max(bld, input,
215 lp_build_const_vec(gallivm, type, -1.0f));
216 }
217 }
218 else if (chan_desc.pure_integer) {
219 /* Nothing to do */
220 } else {
221 /* FIXME */
222 assert(0);
223 }
224 break;
225
226 case UTIL_FORMAT_TYPE_FLOAT:
227 if (type.floating) {
228 if (chan_desc.size == 16) {
229 struct lp_type f16i_type = type;
230 f16i_type.width /= 2;
231 f16i_type.floating = 0;
232 if (start) {
233 input = LLVMBuildLShr(builder, input,
234 lp_build_const_int_vec(gallivm, type, start), "");
235 }
236 input = LLVMBuildTrunc(builder, input,
237 lp_build_vec_type(gallivm, f16i_type), "");
238 input = lp_build_half_to_float(gallivm, input);
239 } else {
240 assert(start == 0);
241 assert(stop == 32);
242 assert(type.width == 32);
243 }
244 input = LLVMBuildBitCast(builder, input, bld->vec_type, "");
245 }
246 else {
247 /* FIXME */
248 assert(0);
249 input = bld->undef;
250 }
251 break;
252
253 case UTIL_FORMAT_TYPE_FIXED:
254 if (type.floating) {
255 double scale = 1.0 / ((1 << (chan_desc.size/2)) - 1);
256 LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale);
257 input = LLVMBuildSIToFP(builder, input, bld->vec_type, "");
258 input = LLVMBuildFMul(builder, input, scale_val, "");
259 }
260 else {
261 /* FIXME */
262 assert(0);
263 input = bld->undef;
264 }
265 break;
266
267 default:
268 assert(0);
269 input = bld->undef;
270 break;
271 }
272
273 return input;
274 }
275
276
277 /**
278 * Unpack several pixels in SoA.
279 *
280 * It takes a vector of packed pixels:
281 *
282 * packed = {P0, P1, P2, P3, ..., Pn}
283 *
284 * And will produce four vectors:
285 *
286 * red = {R0, R1, R2, R3, ..., Rn}
287 * green = {G0, G1, G2, G3, ..., Gn}
288 * blue = {B0, B1, B2, B3, ..., Bn}
289 * alpha = {A0, A1, A2, A3, ..., An}
290 *
291 * It requires that a packed pixel fits into an element of the output
292 * channels. The common case is when converting pixel with a depth of 32 bit or
293 * less into floats.
294 *
295 * \param format_desc the format of the 'packed' incoming pixel vector
296 * \param type the desired type for rgba_out (type.length = n, above)
297 * \param packed the incoming vector of packed pixels
298 * \param rgba_out returns the SoA R,G,B,A vectors
299 */
300 void
301 lp_build_unpack_rgba_soa(struct gallivm_state *gallivm,
302 const struct util_format_description *format_desc,
303 struct lp_type type,
304 LLVMValueRef packed,
305 LLVMValueRef rgba_out[4])
306 {
307 struct lp_build_context bld;
308 LLVMValueRef inputs[4];
309 unsigned chan;
310
311 assert(format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
312 assert(format_desc->block.width == 1);
313 assert(format_desc->block.height == 1);
314 assert(format_desc->block.bits <= type.width);
315 /* FIXME: Support more output types */
316 assert(type.width == 32);
317
318 lp_build_context_init(&bld, gallivm, type);
319
320 /* Decode the input vector components */
321 for (chan = 0; chan < format_desc->nr_channels; ++chan) {
322 struct util_format_channel_description chan_desc = format_desc->channel[chan];
323 boolean srgb_chan = FALSE;
324
325 if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB &&
326 format_desc->swizzle[3] != chan) {
327 srgb_chan = TRUE;
328 }
329
330 inputs[chan] = lp_build_extract_soa_chan(&bld,
331 format_desc->block.bits,
332 srgb_chan,
333 chan_desc,
334 packed);
335 }
336
337 lp_build_format_swizzle_soa(format_desc, &bld, inputs, rgba_out);
338 }
339
340
341 /**
342 * Convert a vector of rgba8 values into 32bit wide SoA vectors.
343 *
344 * \param dst_type The desired return type. For pure integer formats
345 * this should be a 32bit wide int or uint vector type,
346 * otherwise a float vector type.
347 *
348 * \param packed The rgba8 values to pack.
349 *
350 * \param rgba The 4 SoA return vectors.
351 */
352 void
353 lp_build_rgba8_to_fi32_soa(struct gallivm_state *gallivm,
354 struct lp_type dst_type,
355 LLVMValueRef packed,
356 LLVMValueRef *rgba)
357 {
358 LLVMBuilderRef builder = gallivm->builder;
359 LLVMValueRef mask = lp_build_const_int_vec(gallivm, dst_type, 0xff);
360 unsigned chan;
361
362 /* XXX technically shouldn't use that for uint dst_type */
363 packed = LLVMBuildBitCast(builder, packed,
364 lp_build_int_vec_type(gallivm, dst_type), "");
365
366 /* Decode the input vector components */
367 for (chan = 0; chan < 4; ++chan) {
368 #ifdef PIPE_ARCH_LITTLE_ENDIAN
369 unsigned start = chan*8;
370 #else
371 unsigned start = (3-chan)*8;
372 #endif
373 unsigned stop = start + 8;
374 LLVMValueRef input;
375
376 input = packed;
377
378 if (start)
379 input = LLVMBuildLShr(builder, input,
380 lp_build_const_int_vec(gallivm, dst_type, start), "");
381
382 if (stop < 32)
383 input = LLVMBuildAnd(builder, input, mask, "");
384
385 if (dst_type.floating)
386 input = lp_build_unsigned_norm_to_float(gallivm, 8, dst_type, input);
387
388 rgba[chan] = input;
389 }
390 }
391
392
393
394 /**
 396  * Fetch texels from a texture, returning them in SoA layout.
396 *
397 * \param type the desired return type for 'rgba'. The vector length
398 * is the number of texels to fetch
399 * \param aligned if the offset is guaranteed to be aligned to element width
400 *
401 * \param base_ptr points to the base of the texture mip tree.
402 * \param offset offset to start of the texture image block. For non-
403 * compressed formats, this simply is an offset to the texel.
404 * For compressed formats, it is an offset to the start of the
405 * compressed data block.
406 *
407 * \param i, j the sub-block pixel coordinates. For non-compressed formats
408 * these will always be (0,0). For compressed formats, i will
409 * be in [0, block_width-1] and j will be in [0, block_height-1].
410 * \param cache optional value pointing to a lp_build_format_cache structure
411 */
void
lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
                        const struct util_format_description *format_desc,
                        struct lp_type type,
                        boolean aligned,
                        LLVMValueRef base_ptr,
                        LLVMValueRef offset,
                        LLVMValueRef i,
                        LLVMValueRef j,
                        LLVMValueRef cache,
                        LLVMValueRef rgba_out[4])
{
   LLVMBuilderRef builder = gallivm->builder;
   enum pipe_format format = format_desc->format;
   struct lp_type fetch_type;

   /*
    * Path 1: small plain formats. The whole packed pixel block fits in one
    * element of the destination vector, so gather one block per lane and
    * unpack all lanes in parallel.
    */
   if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
       (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB ||
        format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB ||
        format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) &&
       format_desc->block.width == 1 &&
       format_desc->block.height == 1 &&
       format_desc->block.bits <= type.width &&
       (format_desc->channel[0].type != UTIL_FORMAT_TYPE_FLOAT ||
        format_desc->channel[0].size == 32 ||
        format_desc->channel[0].size == 16))
   {
      /*
       * The packed pixel fits into an element of the destination format. Put
       * the packed pixels into a vector and extract each component for all
       * vector elements in parallel.
       */

      LLVMValueRef packed;

      /*
       * gather the texels from the texture
       * Ex: packed = {XYZW, XYZW, XYZW, XYZW}
       */
      assert(format_desc->block.bits <= type.width);
      fetch_type = lp_type_uint(type.width);
      packed = lp_build_gather(gallivm,
                               type.length,
                               format_desc->block.bits,
                               fetch_type,
                               aligned,
                               base_ptr, offset, FALSE);

      /*
       * convert texels to float rgba
       */
      lp_build_unpack_rgba_soa(gallivm,
                               format_desc,
                               type,
                               packed, rgba_out);
      return;
   }


   /*
    * Path 2: wide plain RGB formats whose block is larger than one
    * destination element (e.g. 16xn, 32xn and fp64 formats).
    */
   if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
       (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB) &&
       format_desc->block.width == 1 &&
       format_desc->block.height == 1 &&
       format_desc->block.bits > type.width &&
       ((format_desc->block.bits <= type.width * type.length &&
         format_desc->channel[0].size <= type.width) ||
        (format_desc->channel[0].size == 64 &&
         format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT &&
         type.floating)))
   {
      /*
       * Similar to above, but the packed pixel is larger than what fits
       * into an element of the destination format. The packed pixels will be
       * shuffled into SoA vectors appropriately, and then the extraction will
       * be done in parallel as much as possible.
       * Good for 16xn (n > 2) and 32xn (n > 1) formats, care is taken so
       * the gathered vectors can be shuffled easily (even with avx).
       * 64xn float -> 32xn float is handled too but it's a bit special as
       * it does the conversion pre-shuffle.
       */

      LLVMValueRef packed[4], dst[4], output[4], shuffles[LP_MAX_VECTOR_WIDTH/32];
      /* NOTE: these locals deliberately shadow the outer fetch_type and the
       * i/j parameters (i/j are only needed for compressed formats, which
       * never reach this path). */
      struct lp_type fetch_type, gather_type = type;
      unsigned num_gather, fetch_width, i, j;
      struct lp_build_context bld;
      boolean fp64 = format_desc->channel[0].size == 64;

      lp_build_context_init(&bld, gallivm, type);

      assert(type.width == 32);
      assert(format_desc->block.bits > type.width);

      /*
       * First, figure out fetch order.
       */
      fetch_width = util_next_power_of_two(format_desc->block.bits);
      /*
       * fp64 are treated like fp32 except we fetch twice wide values
       * (as we shuffle after trunc). The shuffles for that work out
       * mostly fine (slightly suboptimal for 4-wide, perfect for AVX)
       * albeit we miss the potential opportunity for hw gather (as it
       * only handles native size).
       */
      num_gather = fetch_width / type.width;
      gather_type.width *= num_gather;
      if (fp64) {
         num_gather /= 2;
      }
      gather_type.length /= num_gather;

      for (i = 0; i < num_gather; i++) {
         LLVMValueRef offsetr, shuf_vec;
         if(num_gather == 4) {
            /* 128bit blocks: the i-th gather reads the i-th dword of each
             * pixel, so pick every 4th offset starting at i. */
            for (j = 0; j < gather_type.length; j++) {
               unsigned idx = i + 4*j;
               shuffles[j] = lp_build_const_int32(gallivm, idx);
            }
            shuf_vec = LLVMConstVector(shuffles, gather_type.length);
            offsetr = LLVMBuildShuffleVector(builder, offset, offset, shuf_vec, "");

         }
         else if (num_gather == 2) {
            assert(num_gather == 2);
            /* 64bit blocks: pairs of offsets interleaved so the later
             * channel-separation shuffle stays cheap. */
            for (j = 0; j < gather_type.length; j++) {
               unsigned idx = i*2 + (j%2) + (j/2)*4;
               shuffles[j] = lp_build_const_int32(gallivm, idx);
            }
            shuf_vec = LLVMConstVector(shuffles, gather_type.length);
            offsetr = LLVMBuildShuffleVector(builder, offset, offset, shuf_vec, "");
         }
         else {
            assert(num_gather == 1);
            offsetr = offset;
         }
         if (gather_type.length == 1) {
            /* Scalar gather wants a scalar offset, not a 1-wide vector. */
            LLVMValueRef zero = lp_build_const_int32(gallivm, 0);
            offsetr = LLVMBuildExtractElement(builder, offsetr, zero, "");
         }

         /*
          * Determine whether to use float or int loads. This is mostly
          * to outsmart the (stupid) llvm int/float shuffle logic, we
          * don't really care much if the data is floats or ints...
          * But llvm will refuse to use single float shuffle with int data
          * and instead use 3 int shuffles instead, the code looks atrocious.
          * (Note bitcasts often won't help, as llvm is too smart to be
          * fooled by that.)
          * Nobody cares about simd float<->int domain transition penalties,
          * which usually don't even exist for shuffles anyway.
          * With 4x32bit (and 3x32bit) fetch, we use float vec (the data is
          * going into transpose, which is unpacks, so doesn't really matter
          * much).
          * With 2x32bit or 4x16bit fetch, we use float vec, since those
          * go into the weird channel separation shuffle. With floats,
          * this is (with 128bit vectors):
          * - 2 movq, 2 movhpd, 2 shufps
          * With ints it would be:
          * - 4 movq, 2 punpcklqdq, 4 pshufd, 2 blendw
          * I've seen texture functions increase in code size by 15% just due
          * to that (there's lots of such fetches in them...)
          * (We could chose a different gather order to improve this somewhat
          * for the int path, but it would basically just drop the blends,
          * so the float path with this order really is optimal.)
          * Albeit it is tricky sometimes llvm doesn't ignore the float->int
          * casts so must avoid them until we're done with the float shuffle...
          * 3x16bit formats (the same is also true for 3x8) are pretty bad but
          * there's nothing we can do about them (we could overallocate by
          * those couple bytes and use unaligned but pot sized load).
          * Note that this is very much x86 specific. I don't know if this
          * affect other archs at all.
          */
         if (num_gather > 1) {
            /*
             * We always want some float type here (with x86)
             * due to shuffles being float ones afterwards (albeit for
             * the num_gather == 4 case int should work fine too
             * (unless there's some problems with avx but not avx2).
             */
            if (format_desc->channel[0].size == 64) {
               fetch_type = lp_type_float_vec(64, gather_type.width);
            } else {
               fetch_type = lp_type_int_vec(32, gather_type.width);
            }
         }
         else {
            /* type doesn't matter much */
            if (format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT &&
                (format_desc->channel[0].size == 32 ||
                 format_desc->channel[0].size == 64)) {
               fetch_type = lp_type_float(gather_type.width);
            } else {
               fetch_type = lp_type_uint(gather_type.width);
            }
         }

         /* Now finally gather the values */
         packed[i] = lp_build_gather(gallivm, gather_type.length,
                                     format_desc->block.bits,
                                     fetch_type, aligned,
                                     base_ptr, offsetr, FALSE);
         if (fp64) {
            /* fp64: convert doubles to floats right away (pre-shuffle). */
            struct lp_type conv_type = type;
            conv_type.width *= 2;
            packed[i] = LLVMBuildBitCast(builder, packed[i],
                                         lp_build_vec_type(gallivm, conv_type), "");
            packed[i] = LLVMBuildFPTrunc(builder, packed[i], bld.vec_type, "");
         }
      }

      /* shuffle the gathered values to SoA */
      if (num_gather == 2) {
         for (i = 0; i < num_gather; i++) {
            for (j = 0; j < type.length; j++) {
               unsigned idx = (j%2)*2 + (j/4)*4 + i;
               if ((j/2)%2)
                  idx += type.length;
               shuffles[j] = lp_build_const_int32(gallivm, idx);
            }
            dst[i] = LLVMBuildShuffleVector(builder, packed[0], packed[1],
                                            LLVMConstVector(shuffles, type.length), "");
         }
      }
      else if (num_gather == 4) {
         lp_build_transpose_aos(gallivm, lp_int_type(type), packed, dst);
      }
      else {
         assert(num_gather == 1);
         dst[0] = packed[0];
      }

      /*
       * And finally unpack exactly as above, except that
       * chan shift is adjusted and the right vector selected.
       */
      if (!fp64) {
         for (i = 0; i < num_gather; i++) {
            dst[i] = LLVMBuildBitCast(builder, dst[i], bld.int_vec_type, "");
         }
         for (i = 0; i < format_desc->nr_channels; i++) {
            struct util_format_channel_description chan_desc = format_desc->channel[i];
            unsigned blockbits = type.width;
            unsigned vec_nr;

            /* Select which gathered vector holds this channel, then reduce
             * the shift to be relative to that vector. */
#ifdef PIPE_ARCH_BIG_ENDIAN
            vec_nr = (format_desc->block.bits - (chan_desc.shift + chan_desc.size)) / type.width;
#else
            vec_nr = chan_desc.shift / type.width;
#endif
            chan_desc.shift %= type.width;

            output[i] = lp_build_extract_soa_chan(&bld,
                                                  blockbits,
                                                  FALSE,
                                                  chan_desc,
                                                  dst[vec_nr]);
         }
      }
      else {
         /* fp64 channels were already converted to float pre-shuffle. */
         for (i = 0; i < format_desc->nr_channels; i++) {
            output[i] = dst[i];
         }
      }

      lp_build_format_swizzle_soa(format_desc, &bld, output, rgba_out);
      return;
   }

   /* Path 3: packed-float formats with dedicated unpack helpers. */
   if (format == PIPE_FORMAT_R11G11B10_FLOAT ||
       format == PIPE_FORMAT_R9G9B9E5_FLOAT) {
      /*
       * similar conceptually to above but requiring special
       * AoS packed -> SoA float conversion code.
       */
      LLVMValueRef packed;
      struct lp_type fetch_type = lp_type_uint(type.width);

      assert(type.floating);
      assert(type.width == 32);

      packed = lp_build_gather(gallivm, type.length,
                               format_desc->block.bits,
                               fetch_type, aligned,
                               base_ptr, offset, FALSE);
      if (format == PIPE_FORMAT_R11G11B10_FLOAT) {
         lp_build_r11g11b10_to_float(gallivm, packed, rgba_out);
      }
      else {
         lp_build_rgb9e5_to_float(gallivm, packed, rgba_out);
      }
      return;
   }

   /* Path 4: 64bit Z/S blocks where only 32 (or 8) bits are wanted. */
   if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS &&
       format_desc->block.bits == 64) {
      /*
       * special case the format is 64 bits but we only require
       * 32bit (or 8bit) from each block.
       */
      LLVMValueRef packed;
      struct lp_type fetch_type = lp_type_uint(type.width);

      if (format == PIPE_FORMAT_X32_S8X24_UINT) {
         /*
          * for stencil simply fix up offsets - could in fact change
          * base_ptr instead even outside the shader.
          */
         unsigned mask = (1 << 8) - 1;
         LLVMValueRef s_offset = lp_build_const_int_vec(gallivm, type, 4);
         offset = LLVMBuildAdd(builder, offset, s_offset, "");
         packed = lp_build_gather(gallivm, type.length, 32, fetch_type,
                                  aligned, base_ptr, offset, FALSE);
         packed = LLVMBuildAnd(builder, packed,
                               lp_build_const_int_vec(gallivm, type, mask), "");
      }
      else {
         assert (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT);
         packed = lp_build_gather(gallivm, type.length, 32, fetch_type,
                                  aligned, base_ptr, offset, TRUE);
         packed = LLVMBuildBitCast(builder, packed,
                                   lp_build_vec_type(gallivm, type), "");
      }
      /* for consistency with lp_build_unpack_rgba_soa() return sss1 or zzz1 */
      rgba_out[0] = rgba_out[1] = rgba_out[2] = packed;
      rgba_out[3] = lp_build_const_vec(gallivm, type, 1.0f);
      return;
   }

   /*
    * Try calling lp_build_fetch_rgba_aos for all pixels.
    * Should only really hit subsampled, compressed
    * (for s3tc srgb too, for rgtc the unorm ones only) by now.
    * (This is invalid for plain 8unorm formats because we're lazy with
    * the swizzle since some results would arrive swizzled, some not.)
    */

   if ((format_desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) &&
       (util_format_fits_8unorm(format_desc) ||
        format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) &&
       type.floating && type.width == 32 &&
       (type.length == 1 || (type.length % 4 == 0))) {
      struct lp_type tmp_type;
      struct lp_build_context bld;
      LLVMValueRef packed, rgba[4];
      const struct util_format_description *flinear_desc;
      const struct util_format_description *frgba8_desc;
      unsigned chan;

      lp_build_context_init(&bld, gallivm, type);

      /*
       * Make sure the conversion in aos really only does convert to rgba8
       * and not anything more (so use linear format, adjust type).
       */
      flinear_desc = util_format_description(util_format_linear(format));
      memset(&tmp_type, 0, sizeof tmp_type);
      tmp_type.width = 8;
      tmp_type.length = type.length * 4;
      tmp_type.norm = TRUE;

      packed = lp_build_fetch_rgba_aos(gallivm, flinear_desc, tmp_type,
                                       aligned, base_ptr, offset, i, j, cache);
      packed = LLVMBuildBitCast(builder, packed, bld.int_vec_type, "");

      /*
       * The values are now packed so they match ordinary (srgb) RGBA8 format,
       * hence need to use matching format for unpack.
       */
      frgba8_desc = util_format_description(PIPE_FORMAT_R8G8B8A8_UNORM);
      if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) {
         assert(format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC);
         frgba8_desc = util_format_description(PIPE_FORMAT_R8G8B8A8_SRGB);
      }
      lp_build_unpack_rgba_soa(gallivm,
                               frgba8_desc,
                               type,
                               packed, rgba);

      /*
       * We converted 4 channels. Make sure llvm can drop unneeded ones
       * (luckily the rgba order is fixed, only LA needs special case).
       */
      for (chan = 0; chan < 4; chan++) {
         enum pipe_swizzle swizzle = format_desc->swizzle[chan];
         if (chan == 3 && util_format_is_luminance_alpha(format)) {
            swizzle = PIPE_SWIZZLE_W;
         }
         rgba_out[chan] = lp_build_swizzle_soa_channel(&bld, rgba, swizzle);
      }
      return;
   }


   /*
    * Fallback to calling lp_build_fetch_rgba_aos for each pixel.
    *
    * This is not the most efficient way of fetching pixels, as we
    * miss some opportunities to do vectorization, but this is
    * convenient for formats or scenarios for which there was no
    * opportunity or incentive to optimize.
    *
    * We do NOT want to end up here, this typically is quite terrible,
    * in particular if the formats have less than 4 channels.
    *
    * Right now, this should only be hit for:
    * - RGTC snorm formats
    *   (those miss fast fetch functions hence they are terrible anyway)
    */

   {
      unsigned k;
      struct lp_type tmp_type;
      LLVMValueRef aos_fetch[LP_MAX_VECTOR_WIDTH / 32];

      if (gallivm_debug & GALLIVM_DEBUG_PERF) {
         debug_printf("%s: AoS fetch fallback for %s\n",
                      __FUNCTION__, format_desc->short_name);
      }

      tmp_type = type;
      tmp_type.length = 4;

      /*
       * Note that vector transpose can be worse compared to insert/extract
       * for aos->soa conversion (for formats with 1 or 2 channels). However,
       * we should try to avoid getting here for just about all formats, so
       * don't bother.
       */

      /* loop over number of pixels */
      for(k = 0; k < type.length; ++k) {
         LLVMValueRef index = lp_build_const_int32(gallivm, k);
         LLVMValueRef offset_elem;
         LLVMValueRef i_elem, j_elem;

         offset_elem = LLVMBuildExtractElement(builder, offset,
                                               index, "");

         i_elem = LLVMBuildExtractElement(builder, i, index, "");
         j_elem = LLVMBuildExtractElement(builder, j, index, "");

         /* Get a single float[4]={R,G,B,A} pixel */
         aos_fetch[k] = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type,
                                                aligned, base_ptr, offset_elem,
                                                i_elem, j_elem, cache);

      }
      /* Transpose the per-pixel AoS results into the SoA outputs. */
      convert_to_soa(gallivm, aos_fetch, rgba_out, type);
   }
}