src/gallium/auxiliary/gallivm/lp_bld_format_aos.c

   1 /**************************************************************************
   2  *
   3  * Copyright 2009 VMware, Inc.
   4  * All Rights Reserved.
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a
   7  * copy of this software and associated documentation files (the
   8  * "Software"), to deal in the Software without restriction, including
   9  * without limitation the rights to use, copy, modify, merge, publish,
  10  * distribute, sub license, and/or sell copies of the Software, and to
  11  * permit persons to whom the Software is furnished to do so, subject to
  12  * the following conditions:
  13  *
  14  * The above copyright notice and this permission notice (including the
  15  * next paragraph) shall be included in all copies or substantial portions
  16  * of the Software.
  17  *
  18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  19  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
  21  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
  22  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  23  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  24  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25  *
  26  **************************************************************************/
  27
  28 /**
  29  * @file
  30  * AoS pixel format manipulation.
  31  *
  32  * @author Jose Fonseca <jfonseca@vmware.com>
  33  */
  34
  35
  36 #include "util/format/u_format.h"
  37 #include "util/u_memory.h"
  38 #include "util/u_math.h"
  39 #include "util/u_pointer.h"
  40 #include "util/u_string.h"
  41 #include "util/u_cpu_detect.h"
  42
  43 #include "lp_bld_arit.h"
  44 #include "lp_bld_init.h"
  45 #include "lp_bld_type.h"
  46 #include "lp_bld_flow.h"
  47 #include "lp_bld_const.h"
  48 #include "lp_bld_conv.h"
  49 #include "lp_bld_swizzle.h"
  50 #include "lp_bld_gather.h"
  51 #include "lp_bld_debug.h"
  52 #include "lp_bld_format.h"
  53 #include "lp_bld_pack.h"
  54 #include "lp_bld_intr.h"
  55 #include "lp_bld_logic.h"
  56 #include "lp_bld_bitarit.h"
  57 #include "lp_bld_misc.h"
  58
  59 /**
  60  * Basic swizzling.  Rearrange the order of the unswizzled array elements
  61  * according to the format description.  PIPE_SWIZZLE_0/ONE are supported
  62  * too.
  63  * Ex: if unswizzled[4] = {B, G, R, x}, then swizzled_out[4] = {R, G, B, 1}.
  64  */
  65 LLVMValueRef
  66 lp_build_format_swizzle_aos(const struct util_format_description *desc,
  67                             struct lp_build_context *bld,
  68                             LLVMValueRef unswizzled)
  69 {
  70    unsigned char swizzles[4];
  71    unsigned chan;
  72
  73    assert(bld->type.length % 4 == 0);
  74
  75    for (chan = 0; chan < 4; ++chan) {
  76       enum pipe_swizzle swizzle;
  77
  78       if (desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
  79          /*
  80           * For ZS formats do RGBA = ZZZ1
  81           */
  82          if (chan == 3) {
  83             swizzle = PIPE_SWIZZLE_1;
  84          } else if (desc->swizzle[0] == PIPE_SWIZZLE_NONE) {
  85             swizzle = PIPE_SWIZZLE_0;
  86          } else {
  87             swizzle = desc->swizzle[0];
  88          }
  89       } else {
  90          swizzle = desc->swizzle[chan];
  91       }
  92       swizzles[chan] = swizzle;
  93    }
  94
  95    return lp_build_swizzle_aos(bld, unswizzled, swizzles);
  96 }
  97
  98
  99 /**
 100  * Whether the format matches the vector type, apart of swizzles.
 101  */
 102 static inline boolean
 103 format_matches_type(const struct util_format_description *desc,
 104                     struct lp_type type)
 105 {
 106    enum util_format_type chan_type;
 107    unsigned chan;
 108
 109    assert(type.length % 4 == 0);
 110
 111    if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN ||
 112        desc->colorspace != UTIL_FORMAT_COLORSPACE_RGB ||
 113        desc->block.width != 1 ||
 114        desc->block.height != 1) {
 115       return FALSE;
 116    }
 117
 118    if (type.floating) {
 119       chan_type = UTIL_FORMAT_TYPE_FLOAT;
 120    } else if (type.fixed) {
 121       chan_type = UTIL_FORMAT_TYPE_FIXED;
 122    } else if (type.sign) {
 123       chan_type = UTIL_FORMAT_TYPE_SIGNED;
 124    } else {
 125       chan_type = UTIL_FORMAT_TYPE_UNSIGNED;
 126    }
 127
 128    for (chan = 0; chan < desc->nr_channels; ++chan) {
 129       if (desc->channel[chan].size != type.width) {
 130          return FALSE;
 131       }
 132
 133       if (desc->channel[chan].type != UTIL_FORMAT_TYPE_VOID) {
 134          if (desc->channel[chan].type != chan_type ||
 135              desc->channel[chan].normalized != type.norm) {
 136             return FALSE;
 137          }
 138       }
 139    }
 140
 141    return TRUE;
 142 }
 143
 144 /*
 145  * Do rounding when converting small unorm values to larger ones.
 146  * Not quite 100% accurate, as it's done by appending MSBs, but
 147  * should be good enough.
 148  */
 149
 150 static inline LLVMValueRef
 151 scale_bits_up(struct gallivm_state *gallivm,
 152               int src_bits,
 153               int dst_bits,
 154               LLVMValueRef src,
 155               struct lp_type src_type)
 156 {
 157    LLVMBuilderRef builder = gallivm->builder;
 158    LLVMValueRef result = src;
 159
 160    if (src_bits == 1 && dst_bits > 1) {
 161       /*
 162        * Useful for a1 - we'd need quite some repeated copies otherwise.
 163        */
 164       struct lp_build_context bld;
 165       LLVMValueRef dst_mask;
 166       lp_build_context_init(&bld, gallivm, src_type);
 167       dst_mask = lp_build_const_int_vec(gallivm, src_type,
 168                                         (1 << dst_bits) - 1),
 169       result = lp_build_cmp(&bld, PIPE_FUNC_EQUAL, src,
 170                             lp_build_const_int_vec(gallivm, src_type, 0));
 171       result = lp_build_andnot(&bld, dst_mask, result);
 172    }
 173    else if (dst_bits > src_bits) {
 174       /* Scale up bits */
 175       int db = dst_bits - src_bits;
 176
 177       /* Shift left by difference in bits */
 178       result = LLVMBuildShl(builder,
 179                             src,
 180                             lp_build_const_int_vec(gallivm, src_type, db),
 181                             "");
 182
 183       if (db <= src_bits) {
 184          /* Enough bits in src to fill the remainder */
 185          LLVMValueRef lower = LLVMBuildLShr(builder,
 186                                             src,
 187                                             lp_build_const_int_vec(gallivm, src_type,
 188                                                                    src_bits - db),
 189                                             "");
 190
 191          result = LLVMBuildOr(builder, result, lower, "");
 192       } else if (db > src_bits) {
 193          /* Need to repeatedly copy src bits to fill remainder in dst */
 194          unsigned n;
 195
 196          for (n = src_bits; n < dst_bits; n *= 2) {
 197             LLVMValueRef shuv = lp_build_const_int_vec(gallivm, src_type, n);
 198
 199             result = LLVMBuildOr(builder,
 200                                  result,
 201                                  LLVMBuildLShr(builder, result, shuv, ""),
 202                                  "");
 203          }
 204       }
 205    } else {
 206       assert (dst_bits == src_bits);
 207    }
 208
 209    return result;
 210 }
 211
 212 /**
 213  * Unpack a single pixel into its XYZW components.
 214  *
 215  * @param desc  the pixel format for the packed pixel value
 216  * @param packed integer pixel in a format such as PIPE_FORMAT_B8G8R8A8_UNORM
 217  *
 218  * @return XYZW in a float[4] or ubyte[4] or ushort[4] vector.
 219  */
 220 static inline LLVMValueRef
 221 lp_build_unpack_arith_rgba_aos(struct gallivm_state *gallivm,
 222                                const struct util_format_description *desc,
 223                                LLVMValueRef packed)
 224 {
 225    LLVMBuilderRef builder = gallivm->builder;
 226    LLVMValueRef shifted, casted, scaled, masked;
 227    LLVMValueRef shifts[4];
 228    LLVMValueRef masks[4];
 229    LLVMValueRef scales[4];
 230    LLVMTypeRef vec32_type;
 231
 232    boolean normalized;
 233    boolean needs_uitofp;
 234    unsigned i;
 235
 236    /* TODO: Support more formats */
 237    assert(desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
 238    assert(desc->block.width == 1);
 239    assert(desc->block.height == 1);
 240    assert(desc->block.bits <= 32);
 241
 242    /* Do the intermediate integer computations with 32bit integers since it
 243     * matches floating point size */
 244    assert (LLVMTypeOf(packed) == LLVMInt32TypeInContext(gallivm->context));
 245
 246    vec32_type = LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4);
 247
 248    /* Broadcast the packed value to all four channels
 249     * before: packed = BGRA
 250     * after: packed = {BGRA, BGRA, BGRA, BGRA}
 251     */
 252    packed = LLVMBuildInsertElement(builder, LLVMGetUndef(vec32_type), packed,
 253                                    LLVMConstNull(LLVMInt32TypeInContext(gallivm->context)),
 254                                    "");
 255    packed = LLVMBuildShuffleVector(builder, packed, LLVMGetUndef(vec32_type),
 256                                    LLVMConstNull(vec32_type),
 257                                    "");
 258
 259    /* Initialize vector constants */
 260    normalized = FALSE;
 261    needs_uitofp = FALSE;
 262
 263    /* Loop over 4 color components */
 264    for (i = 0; i < 4; ++i) {
 265       unsigned bits = desc->channel[i].size;
 266       unsigned shift = desc->channel[i].shift;
 267
 268       if (desc->channel[i].type == UTIL_FORMAT_TYPE_VOID) {
 269          shifts[i] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
 270          masks[i] = LLVMConstNull(LLVMInt32TypeInContext(gallivm->context));
 271          scales[i] =  LLVMConstNull(LLVMFloatTypeInContext(gallivm->context));
 272       }
 273       else {
 274          unsigned long long mask = (1ULL << bits) - 1;
 275
 276          assert(desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED);
 277
 278          if (bits == 32) {
 279             needs_uitofp = TRUE;
 280          }
 281
 282          shifts[i] = lp_build_const_int32(gallivm, shift);
 283          masks[i] = lp_build_const_int32(gallivm, mask);
 284
 285          if (desc->channel[i].normalized) {
 286             scales[i] = lp_build_const_float(gallivm, 1.0 / mask);
 287             normalized = TRUE;
 288          }
 289          else
 290             scales[i] =  lp_build_const_float(gallivm, 1.0);
 291       }
 292    }
 293
 294    /* Ex: convert packed = {XYZW, XYZW, XYZW, XYZW}
 295     * into masked = {X, Y, Z, W}
 296     */
 297    if (desc->block.bits < 32 && normalized) {
 298       /*
 299        * Note: we cannot do the shift below on x86 natively until AVX2.
 300        *
 301        * Old llvm versions will resort to scalar extract/shift insert,
 302        * which is definitely terrible, new versions will just do
 303        * several vector shifts and shuffle/blend results together.
 304        * We could turn this into a variable left shift plus a constant
 305        * right shift, and llvm would then turn the variable left shift
 306        * into a mul for us (albeit without sse41 the mul needs emulation
 307        * too...). However, since we're going to do a float mul
 308        * anyway, we just adjust that mul instead (plus the mask), skipping
 309        * the shift completely.
 310        * We could also use a extra mul when the format isn't normalized and
 311        * we don't have AVX2 support, but don't bother for now. Unfortunately,
 312        * this strategy doesn't work for 32bit formats (such as rgb10a2 or even
 313        * rgba8 if it ends up here), as that would require UIToFP, albeit that
 314        * would be fixable with easy 16bit shuffle (unless there's channels
 315        * crossing 16bit boundaries).
 316        */
 317       for (i = 0; i < 4; ++i) {
 318          if (desc->channel[i].type != UTIL_FORMAT_TYPE_VOID) {
 319             unsigned bits = desc->channel[i].size;
 320             unsigned shift = desc->channel[i].shift;
 321             unsigned long long mask = ((1ULL << bits) - 1) << shift;
 322             scales[i] = lp_build_const_float(gallivm, 1.0 / mask);
 323             masks[i] = lp_build_const_int32(gallivm, mask);
 324          }
 325       }
 326       masked = LLVMBuildAnd(builder, packed, LLVMConstVector(masks, 4), "");
 327    } else {
 328       shifted = LLVMBuildLShr(builder, packed, LLVMConstVector(shifts, 4), "");
 329       masked = LLVMBuildAnd(builder, shifted, LLVMConstVector(masks, 4), "");
 330    }
 331
 332    if (!needs_uitofp) {
 333       /* UIToFP can't be expressed in SSE2 */
 334       casted = LLVMBuildSIToFP(builder, masked, LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), 4), "");
 335    } else {
 336       casted = LLVMBuildUIToFP(builder, masked, LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), 4), "");
 337    }
 338
 339    /*
 340     * At this point 'casted' may be a vector of floats such as
 341     * {255.0, 255.0, 255.0, 255.0}. (Normalized values may be multiplied
 342     * by powers of two). Next, if the pixel values are normalized
 343     * we'll scale this to {1.0, 1.0, 1.0, 1.0}.
 344     */
 345
 346    if (normalized)
 347       scaled = LLVMBuildFMul(builder, casted, LLVMConstVector(scales, 4), "");
 348    else
 349       scaled = casted;
 350
 351    return scaled;
 352 }
 353
 354
 355 /**
 356  * Pack a single pixel.
 357  *
 358  * @param rgba 4 float vector with the unpacked components.
 359  *
 360  * XXX: This is mostly for reference and testing -- operating a single pixel at
 361  * a time is rarely if ever needed.
 362  */
 363 LLVMValueRef
 364 lp_build_pack_rgba_aos(struct gallivm_state *gallivm,
 365                        const struct util_format_description *desc,
 366                        LLVMValueRef rgba)
 367 {
 368    LLVMBuilderRef builder = gallivm->builder;
 369    LLVMTypeRef type;
 370    LLVMValueRef packed = NULL;
 371    LLVMValueRef swizzles[4];
 372    LLVMValueRef shifted, casted, scaled, unswizzled;
 373    LLVMValueRef shifts[4];
 374    LLVMValueRef scales[4];
 375    boolean normalized;
 376    unsigned i, j;
 377
 378    assert(desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
 379    assert(desc->block.width == 1);
 380    assert(desc->block.height == 1);
 381
 382    type = LLVMIntTypeInContext(gallivm->context, desc->block.bits);
 383
 384    /* Unswizzle the color components into the source vector. */
 385    for (i = 0; i < 4; ++i) {
 386       for (j = 0; j < 4; ++j) {
 387          if (desc->swizzle[j] == i)
 388             break;
 389       }
 390       if (j < 4)
 391          swizzles[i] = lp_build_const_int32(gallivm, j);
 392       else
 393          swizzles[i] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
 394    }
 395
 396    unswizzled = LLVMBuildShuffleVector(builder, rgba,
 397                                        LLVMGetUndef(LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), 4)),
 398                                        LLVMConstVector(swizzles, 4), "");
 399
 400    normalized = FALSE;
 401    for (i = 0; i < 4; ++i) {
 402       unsigned bits = desc->channel[i].size;
 403       unsigned shift = desc->channel[i].shift;
 404
 405       if (desc->channel[i].type == UTIL_FORMAT_TYPE_VOID) {
 406          shifts[i] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
 407          scales[i] =  LLVMGetUndef(LLVMFloatTypeInContext(gallivm->context));
 408       }
 409       else {
 410          unsigned mask = (1 << bits) - 1;
 411
 412          assert(desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED);
 413          assert(bits < 32);
 414
 415          shifts[i] = lp_build_const_int32(gallivm, shift);
 416
 417          if (desc->channel[i].normalized) {
 418             scales[i] = lp_build_const_float(gallivm, mask);
 419             normalized = TRUE;
 420          }
 421          else
 422             scales[i] = lp_build_const_float(gallivm, 1.0);
 423       }
 424    }
 425
 426    if (normalized)
 427       scaled = LLVMBuildFMul(builder, unswizzled, LLVMConstVector(scales, 4), "");
 428    else
 429       scaled = unswizzled;
 430
 431    casted = LLVMBuildFPToSI(builder, scaled, LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4), "");
 432
 433    shifted = LLVMBuildShl(builder, casted, LLVMConstVector(shifts, 4), "");
 434
 435    /* Bitwise or all components */
 436    for (i = 0; i < 4; ++i) {
 437       if (desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED) {
 438          LLVMValueRef component = LLVMBuildExtractElement(builder, shifted,
 439                                                lp_build_const_int32(gallivm, i), "");
 440          if (packed)
 441             packed = LLVMBuildOr(builder, packed, component, "");
 442          else
 443             packed = component;
 444       }
 445    }
 446
 447    if (!packed)
 448       packed = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
 449
 450    if (desc->block.bits < 32)
 451       packed = LLVMBuildTrunc(builder, packed, type, "");
 452
 453    return packed;
 454 }
 455
 456
 457
 458
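/**
 * Fetch pixels into an AoS vector of the given type (4 elements, i.e. RGBA,
 * per pixel; type.length / 4 pixels in total).
 *
 * \param format_desc  describes format of the image we're fetching from
 * \param type  the type of the vector to return; its length must be a
 *              multiple of 4
 * \param aligned  whether the data is guaranteed to be aligned
 * \param base_ptr  base address; together with \p offset it gives the
 *                  address of the pixel block (or the texel if uncompressed)
 * \param offset  offset of the pixel block (or texel) relative to
 *                \p base_ptr
 * \param i, j  the sub-block pixel coordinates.  For non-compressed formats
 *              these will always be (0, 0).
 * \param cache  optional value pointing to a lp_build_format_cache structure
 * \return  a vector of the given type with the pixels' RGBA values.
 */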
 470 LLVMValueRef
 471 lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
 472                         const struct util_format_description *format_desc,
 473                         struct lp_type type,
 474                         boolean aligned,
 475                         LLVMValueRef base_ptr,
 476                         LLVMValueRef offset,
 477                         LLVMValueRef i,
 478                         LLVMValueRef j,
 479                         LLVMValueRef cache)
 480 {
 481    LLVMBuilderRef builder = gallivm->builder;
 482    unsigned num_pixels = type.length / 4;
 483    struct lp_build_context bld;
 484
 485    assert(type.length <= LP_MAX_VECTOR_LENGTH);
 486    assert(type.length % 4 == 0);
 487
 488    lp_build_context_init(&bld, gallivm, type);
 489
 490    /*
 491     * Trivial case
 492     *
 493     * The format matches the type (apart of a swizzle) so no need for
 494     * scaling or converting.
 495     */
 496
 497    if (format_matches_type(format_desc, type) &&
 498        format_desc->block.bits <= type.width * 4 &&
 499        /* XXX this shouldn't be needed */
 500        util_is_power_of_two_or_zero(format_desc->block.bits)) {
 501       LLVMValueRef packed;
 502       LLVMTypeRef dst_vec_type = lp_build_vec_type(gallivm, type);
 503       struct lp_type fetch_type;
 504       unsigned vec_len = type.width * type.length;
 505
 506       /*
 507        * The format matches the type (apart of a swizzle) so no need for
 508        * scaling or converting.
 509        */
 510
 511       fetch_type = lp_type_uint(type.width*4);
 512       packed = lp_build_gather(gallivm, type.length/4,
 513                                format_desc->block.bits, fetch_type,
 514                                aligned, base_ptr, offset, TRUE);
 515
 516       assert(format_desc->block.bits <= vec_len);
 517       (void) vec_len; /* silence unused var warning for non-debug build */
 518
 519       packed = LLVMBuildBitCast(gallivm->builder, packed, dst_vec_type, "");
 520       return lp_build_format_swizzle_aos(format_desc, &bld, packed);
 521    }
 522
 523    /*
 524     * Bit arithmetic for converting small_unorm to unorm8.
 525     *
 526     * This misses some opportunities for optimizations (like skipping mask
 527     * for the highest channel for instance, or doing bit scaling in parallel
 528     * for channels with the same bit width) but it should be passable for
 529     * all arithmetic formats.
 530     */
 531    if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
 532        format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB &&
 533        util_format_fits_8unorm(format_desc) &&
 534        type.width == 8 && type.norm == 1 && type.sign == 0 &&
 535        type.fixed == 0 && type.floating == 0) {
 536       LLVMValueRef packed, res = NULL, chans[4], rgba[4];
 537       LLVMTypeRef dst_vec_type, conv_vec_type;
 538       struct lp_type fetch_type, conv_type;
 539       struct lp_build_context bld_conv;
 540       unsigned j;
 541
 542       fetch_type = lp_type_uint(type.width*4);
 543       conv_type = lp_type_int_vec(type.width*4, type.width * type.length);
 544       dst_vec_type = lp_build_vec_type(gallivm, type);
 545       conv_vec_type = lp_build_vec_type(gallivm, conv_type);
 546       lp_build_context_init(&bld_conv, gallivm, conv_type);
 547
 548       packed = lp_build_gather(gallivm, type.length/4,
 549                                format_desc->block.bits, fetch_type,
 550                                aligned, base_ptr, offset, TRUE);
 551
 552       assert(format_desc->block.bits * type.length / 4 <=
 553              type.width * type.length);
 554
 555       packed = LLVMBuildBitCast(gallivm->builder, packed, conv_vec_type, "");
 556
 557       for (j = 0; j < format_desc->nr_channels; ++j) {
 558          unsigned mask = 0;
 559          unsigned sa = format_desc->channel[j].shift;
 560
 561          mask = (1 << format_desc->channel[j].size) - 1;
 562
 563          /* Extract bits from source */
 564          chans[j] = LLVMBuildLShr(builder, packed,
 565                                   lp_build_const_int_vec(gallivm, conv_type, sa),
 566                                   "");
 567
 568          chans[j] = LLVMBuildAnd(builder, chans[j],
 569                                  lp_build_const_int_vec(gallivm, conv_type, mask),
 570                                  "");
 571
 572          /* Scale bits */
 573          if (type.norm) {
 574             chans[j] = scale_bits_up(gallivm, format_desc->channel[j].size,
 575                                      type.width, chans[j], conv_type);
 576          }
 577       }
 578       /*
 579        * This is a hacked lp_build_format_swizzle_soa() since we need a
 580        * normalized 1 but only 8 bits in a 32bit vector...
 581        */
 582       for (j = 0; j < 4; ++j) {
 583          enum pipe_swizzle swizzle = format_desc->swizzle[j];
 584          if (swizzle == PIPE_SWIZZLE_1) {
 585             rgba[j] = lp_build_const_int_vec(gallivm, conv_type, (1 << type.width) - 1);
 586          } else {
 587             rgba[j] = lp_build_swizzle_soa_channel(&bld_conv, chans, swizzle);
 588          }
 589          if (j == 0) {
 590             res = rgba[j];
 591          } else {
 592             rgba[j] = LLVMBuildShl(builder, rgba[j],
 593                                    lp_build_const_int_vec(gallivm, conv_type,
 594                                                           j * type.width), "");
 595             res = LLVMBuildOr(builder, res, rgba[j], "");
 596          }
 597       }
 598       res = LLVMBuildBitCast(gallivm->builder, res, dst_vec_type, "");
 599
 600       return res;
 601    }
 602
 603    /*
 604     * Bit arithmetic
 605     */
 606
 607    if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
 608        (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB ||
 609         format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) &&
 610        format_desc->block.width == 1 &&
 611        format_desc->block.height == 1 &&
 612        /* XXX this shouldn't be needed */
 613        util_is_power_of_two_or_zero(format_desc->block.bits) &&
 614        format_desc->block.bits <= 32 &&
 615        format_desc->is_bitmask &&
 616        !format_desc->is_mixed &&
 617        (format_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED ||
 618         format_desc->channel[1].type == UTIL_FORMAT_TYPE_UNSIGNED) &&
 619        !format_desc->channel[0].pure_integer) {
 620
 621       LLVMValueRef tmps[LP_MAX_VECTOR_LENGTH/4];
 622       LLVMValueRef res[LP_MAX_VECTOR_WIDTH / 128];
 623       struct lp_type conv_type;
 624       unsigned k, num_conv_src, num_conv_dst;
 625
 626       /*
 627        * Note this path is generally terrible for fetching multiple pixels.
 628        * We should make sure we cannot hit this code path for anything but
 629        * single pixels.
 630        */
 631
 632       /*
 633        * Unpack a pixel at a time into a <4 x float> RGBA vector
 634        */
 635
 636       for (k = 0; k < num_pixels; ++k) {
 637          LLVMValueRef packed;
 638
 639          packed = lp_build_gather_elem(gallivm, num_pixels,
 640                                        format_desc->block.bits, 32, aligned,
 641                                        base_ptr, offset, k, FALSE);
 642
 643          tmps[k] = lp_build_unpack_arith_rgba_aos(gallivm,
 644                                                   format_desc,
 645                                                   packed);
 646       }
 647
 648       /*
 649        * Type conversion.
 650        *
 651        * TODO: We could avoid floating conversion for integer to
 652        * integer conversions.
 653        */
 654
 655       if (gallivm_debug & GALLIVM_DEBUG_PERF && !type.floating) {
 656          debug_printf("%s: unpacking %s with floating point\n",
 657                       __FUNCTION__, format_desc->short_name);
 658       }
 659
 660       conv_type = lp_float32_vec4_type();
 661       num_conv_src = num_pixels;
 662       num_conv_dst = 1;
 663
 664       if (num_pixels % 8 == 0) {
 665          lp_build_concat_n(gallivm, lp_float32_vec4_type(),
 666                            tmps, num_pixels, tmps, num_pixels / 2);
 667          conv_type.length *= num_pixels / 4;
 668          num_conv_src = 4 * num_pixels / 8;
 669          if (type.width == 8 && type.floating == 0 && type.fixed == 0) {
 670             /*
 671              * FIXME: The fast float->unorm path (which is basically
 672              * skipping the MIN/MAX which are extremely pointless in any
 673              * case) requires that there's 2 destinations...
 674              * In any case, we really should make sure we don't hit this
 675              * code with multiple pixels for unorm8 dst types, it's
 676              * completely hopeless even if we do hit the right conversion.
 677              */
 678             type.length /= num_pixels / 4;
 679             num_conv_dst = num_pixels / 4;
 680          }
 681       }
 682
 683       lp_build_conv(gallivm, conv_type, type,
 684                     tmps, num_conv_src, res, num_conv_dst);
 685
 686       if (num_pixels % 8 == 0 &&
 687           (type.width == 8 && type.floating == 0 && type.fixed == 0)) {
 688          lp_build_concat_n(gallivm, type, res, num_conv_dst, res, 1);
 689       }
 690
 691       return lp_build_format_swizzle_aos(format_desc, &bld, res[0]);
 692    }
 693
 694    /* If all channels are of same type and we are not using half-floats */
 695    if (format_desc->is_array &&
 696        format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB) {
 697       assert(!format_desc->is_mixed);
 698       return lp_build_fetch_rgba_aos_array(gallivm, format_desc, type, base_ptr, offset);
 699    }
 700
 701    /*
 702     * YUV / subsampled formats
 703     */
 704
 705    if (format_desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED) {
 706       struct lp_type tmp_type;
 707       LLVMValueRef tmp;
 708
 709       memset(&tmp_type, 0, sizeof tmp_type);
 710       tmp_type.width = 8;
 711       tmp_type.length = num_pixels * 4;
 712       tmp_type.norm = TRUE;
 713
 714       tmp = lp_build_fetch_subsampled_rgba_aos(gallivm,
 715                                                format_desc,
 716                                                num_pixels,
 717                                                base_ptr,
 718                                                offset,
 719                                                i, j);
 720
 721       lp_build_conv(gallivm,
 722                     tmp_type, type,
 723                     &tmp, 1, &tmp, 1);
 724
 725       return tmp;
 726    }
 727
 728    /*
 729     * s3tc rgb formats
 730     */
 731
 732    if (format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
 733       struct lp_type tmp_type;
 734       LLVMValueRef tmp;
 735
 736       memset(&tmp_type, 0, sizeof tmp_type);
 737       tmp_type.width = 8;
 738       tmp_type.length = num_pixels * 4;
 739       tmp_type.norm = TRUE;
 740
 741       tmp = lp_build_fetch_s3tc_rgba_aos(gallivm,
 742                                          format_desc,
 743                                          num_pixels,
 744                                          base_ptr,
 745                                          offset,
 746                                          i, j,
 747                                          cache);
 748
 749       lp_build_conv(gallivm,
 750                     tmp_type, type,
 751                     &tmp, 1, &tmp, 1);
 752
 753        return tmp;
 754    }
 755
 756    /*
 757     * rgtc rgb formats
 758     */
 759
 760    if (format_desc->layout == UTIL_FORMAT_LAYOUT_RGTC) {
 761       struct lp_type tmp_type;
 762       LLVMValueRef tmp;
 763
 764       memset(&tmp_type, 0, sizeof tmp_type);
 765       tmp_type.width = 8;
 766       tmp_type.length = num_pixels * 4;
 767       tmp_type.norm = TRUE;
 768       tmp_type.sign = (format_desc->format == PIPE_FORMAT_RGTC1_SNORM ||
 769                        format_desc->format == PIPE_FORMAT_RGTC2_SNORM ||
 770                        format_desc->format == PIPE_FORMAT_LATC1_SNORM ||
 771                        format_desc->format == PIPE_FORMAT_LATC2_SNORM);
 772
 773       tmp = lp_build_fetch_rgtc_rgba_aos(gallivm,
 774                                          format_desc,
 775                                          num_pixels,
 776                                          base_ptr,
 777                                          offset,
 778                                          i, j,
 779                                          cache);
 780
 781       lp_build_conv(gallivm,
 782                     tmp_type, type,
 783                     &tmp, 1, &tmp, 1);
 784
 785        return tmp;
 786    }
 787
 788    /*
 789     * Fallback to util_format_description::fetch_rgba_8unorm().
 790     */
 791
 792    if (format_desc->fetch_rgba_8unorm &&
 793        !type.floating && type.width == 8 && !type.sign && type.norm) {
 794       /*
 795        * Fallback to calling util_format_description::fetch_rgba_8unorm.
 796        *
 797        * This is definitely not the most efficient way of fetching pixels, as
 798        * we miss the opportunity to do vectorization, but this it is a
 799        * convenient for formats or scenarios for which there was no opportunity
 800        * or incentive to optimize.
 801        */
 802
 803       LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context);
 804       LLVMTypeRef pi8t = LLVMPointerType(i8t, 0);
 805       LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
 806       LLVMValueRef function;
 807       LLVMValueRef tmp_ptr;
 808       LLVMValueRef tmp;
 809       LLVMValueRef res;
 810       unsigned k;
 811
 812       if (gallivm_debug & GALLIVM_DEBUG_PERF) {
 813          debug_printf("%s: falling back to util_format_%s_fetch_rgba_8unorm\n",
 814                       __FUNCTION__, format_desc->short_name);
 815       }
 816
 817       /*
 818        * Declare and bind format_desc->fetch_rgba_8unorm().
 819        */
 820
 821       {
 822          /*
 823           * Function to call looks like:
 824           *   fetch(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j)
 825           */
 826          LLVMTypeRef ret_type;
 827          LLVMTypeRef arg_types[4];
 828          LLVMTypeRef function_type;
 829
 830          ret_type = LLVMVoidTypeInContext(gallivm->context);
 831          arg_types[0] = pi8t;
 832          arg_types[1] = pi8t;
 833          arg_types[2] = i32t;
 834          arg_types[3] = i32t;
 835          function_type = LLVMFunctionType(ret_type, arg_types,
 836                                           ARRAY_SIZE(arg_types), 0);
 837
 838          if (gallivm->cache)
 839             gallivm->cache->dont_cache = true;
 840          /* make const pointer for the C fetch_rgba_8unorm function */
 841          function = lp_build_const_int_pointer(gallivm,
 842             func_to_pointer((func_pointer) format_desc->fetch_rgba_8unorm));
 843
 844          /* cast the callee pointer to the function's type */
 845          function = LLVMBuildBitCast(builder, function,
 846                                      LLVMPointerType(function_type, 0),
 847                                      "cast callee");
 848       }
 849
 850       tmp_ptr = lp_build_alloca(gallivm, i32t, "");
 851
 852       res = LLVMGetUndef(LLVMVectorType(i32t, num_pixels));
 853
 854       /*
 855        * Invoke format_desc->fetch_rgba_8unorm() for each pixel and insert the result
 856        * in the SoA vectors.
 857        */
 858
 859       for (k = 0; k < num_pixels; ++k) {
 860          LLVMValueRef index = lp_build_const_int32(gallivm, k);
 861          LLVMValueRef args[4];
 862
 863          args[0] = LLVMBuildBitCast(builder, tmp_ptr, pi8t, "");
 864          args[1] = lp_build_gather_elem_ptr(gallivm, num_pixels,
 865                                             base_ptr, offset, k);
 866
 867          if (num_pixels == 1) {
 868             args[2] = i;
 869             args[3] = j;
 870          }
 871          else {
 872             args[2] = LLVMBuildExtractElement(builder, i, index, "");
 873             args[3] = LLVMBuildExtractElement(builder, j, index, "");
 874          }
 875
 876          LLVMBuildCall(builder, function, args, ARRAY_SIZE(args), "");
 877
 878          tmp = LLVMBuildLoad(builder, tmp_ptr, "");
 879
 880          if (num_pixels == 1) {
 881             res = tmp;
 882          }
 883          else {
 884             res = LLVMBuildInsertElement(builder, res, tmp, index, "");
 885          }
 886       }
 887
 888       /* Bitcast from <n x i32> to <4n x i8> */
 889       res = LLVMBuildBitCast(builder, res, bld.vec_type, "");
 890
 891       return res;
 892    }
 893
 894    /*
 895     * Fallback to util_format_description::fetch_rgba_float().
 896     */
 897
 898    if (format_desc->fetch_rgba_float) {
 899       /*
 900        * Fallback to calling util_format_description::fetch_rgba_float.
 901        *
 902        * This is definitely not the most efficient way of fetching pixels, as
 903        * we miss the opportunity to do vectorization, but it is
 904        * convenient for formats or scenarios for which there was no opportunity
 905        * or incentive to optimize.
 906        */
 907
 908       LLVMTypeRef f32t = LLVMFloatTypeInContext(gallivm->context);
 909       LLVMTypeRef f32x4t = LLVMVectorType(f32t, 4);
 910       LLVMTypeRef pf32t = LLVMPointerType(f32t, 0);
 911       LLVMTypeRef pi8t = LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0);
 912       LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
 913       LLVMValueRef function;
 914       LLVMValueRef tmp_ptr;
 915       LLVMValueRef tmps[LP_MAX_VECTOR_LENGTH/4];
 916       LLVMValueRef res;
 917       unsigned k;
 918
 919       if (gallivm_debug & GALLIVM_DEBUG_PERF) {
 920          debug_printf("%s: falling back to util_format_%s_fetch_rgba_float\n",
 921                       __FUNCTION__, format_desc->short_name);
 922       }
 923
 924       /*
 925        * Declare and bind format_desc->fetch_rgba_float().
 926        */
 927
 928       {
 929          /*
 930           * Function to call looks like:
 931           *   fetch(float *dst, const uint8_t *src, unsigned i, unsigned j)
 932           */
 933          LLVMTypeRef ret_type;
 934          LLVMTypeRef arg_types[4];
 935
 936          ret_type = LLVMVoidTypeInContext(gallivm->context);
 937          arg_types[0] = pf32t;
 938          arg_types[1] = pi8t;
 939          arg_types[2] = i32t;
 940          arg_types[3] = i32t;
 941
 942          if (gallivm->cache)
 943             gallivm->cache->dont_cache = true;
 944          function = lp_build_const_func_pointer(gallivm,
 945                                                 func_to_pointer((func_pointer) format_desc->fetch_rgba_float),
 946                                                 ret_type,
 947                                                 arg_types, ARRAY_SIZE(arg_types),
 948                                                 format_desc->short_name);
 949       }
 950
 951       tmp_ptr = lp_build_alloca(gallivm, f32x4t, "");
 952
 953       /*
 954        * Invoke format_desc->fetch_rgba_float() for each pixel and insert the result
 955        * in the SoA vectors.
 956        */
 957
 958       for (k = 0; k < num_pixels; ++k) {
 959          LLVMValueRef args[4];
 960
 961          args[0] = LLVMBuildBitCast(builder, tmp_ptr, pf32t, "");
 962          args[1] = lp_build_gather_elem_ptr(gallivm, num_pixels,
 963                                             base_ptr, offset, k);
 964
 965          if (num_pixels == 1) {
 966             args[2] = i;
 967             args[3] = j;
 968          }
 969          else {
 970             LLVMValueRef index = lp_build_const_int32(gallivm, k);
 971             args[2] = LLVMBuildExtractElement(builder, i, index, "");
 972             args[3] = LLVMBuildExtractElement(builder, j, index, "");
 973          }
 974
 975          LLVMBuildCall(builder, function, args, ARRAY_SIZE(args), "");
 976
 977          tmps[k] = LLVMBuildLoad(builder, tmp_ptr, "");
 978       }
 979
 980       lp_build_conv(gallivm,
 981                     lp_float32_vec4_type(),
 982                     type,
 983                     tmps, num_pixels, &res, 1);
 984
 985       return res;
 986    }
 987
 988    assert(!util_format_is_pure_integer(format_desc->format));
 989
 990    assert(0);
 991    return lp_build_undef(gallivm, type);
 992 }