/**************************************************************************
 *
 * Copyright 2009 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/

/**
 * @file
 * AoS pixel format manipulation.
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 */


#include "util/format/u_format.h"
#include "util/u_memory.h"
#include "util/u_math.h"
#include "util/u_pointer.h"
#include "util/u_string.h"
#include "util/u_cpu_detect.h"

#include "lp_bld_arit.h"
#include "lp_bld_init.h"
#include "lp_bld_type.h"
#include "lp_bld_flow.h"
#include "lp_bld_const.h"
#include "lp_bld_conv.h"
#include "lp_bld_swizzle.h"
#include "lp_bld_gather.h"
#include "lp_bld_debug.h"
#include "lp_bld_format.h"
#include "lp_bld_pack.h"
#include "lp_bld_intr.h"
#include "lp_bld_logic.h"
#include "lp_bld_bitarit.h"
#include "lp_bld_misc.h"

/**
 * Basic swizzling. Rearrange the order of the unswizzled array elements
 * according to the format description. PIPE_SWIZZLE_0/ONE are supported
 * too.
 * Ex: if unswizzled[4] = {B, G, R, x}, then swizzled_out[4] = {R, G, B, 1}.
 */
LLVMValueRef
lp_build_format_swizzle_aos(const struct util_format_description *desc,
                            struct lp_build_context *bld,
                            LLVMValueRef unswizzled)
{
   unsigned char swizzles[4];
   unsigned chan;

   assert(bld->type.length % 4 == 0);

   for (chan = 0; chan < 4; ++chan) {
      enum pipe_swizzle swizzle;

      if (desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
         /*
          * For ZS formats do RGBA = ZZZ1
          */
         if (chan == 3) {
            swizzle = PIPE_SWIZZLE_1;
         } else if (desc->swizzle[0] == PIPE_SWIZZLE_NONE) {
            swizzle = PIPE_SWIZZLE_0;
         } else {
            swizzle = desc->swizzle[0];
         }
      } else {
         swizzle = desc->swizzle[chan];
      }
      swizzles[chan] = swizzle;
   }

   return lp_build_swizzle_aos(bld, unswizzled, swizzles);
}

/**
 * Whether the format matches the vector type, apart from swizzles.
 */
static inline boolean
format_matches_type(const struct util_format_description *desc,
                    struct lp_type type)
{
   enum util_format_type chan_type;
   unsigned chan;

   assert(type.length % 4 == 0);

   if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN ||
       desc->colorspace != UTIL_FORMAT_COLORSPACE_RGB ||
       desc->block.width != 1 ||
       desc->block.height != 1) {
      return FALSE;
   }

   if (type.floating) {
      chan_type = UTIL_FORMAT_TYPE_FLOAT;
   } else if (type.fixed) {
      chan_type = UTIL_FORMAT_TYPE_FIXED;
   } else if (type.sign) {
      chan_type = UTIL_FORMAT_TYPE_SIGNED;
   } else {
      chan_type = UTIL_FORMAT_TYPE_UNSIGNED;
   }

   for (chan = 0; chan < desc->nr_channels; ++chan) {
      if (desc->channel[chan].size != type.width) {
         return FALSE;
      }

      if (desc->channel[chan].type != UTIL_FORMAT_TYPE_VOID) {
         if (desc->channel[chan].type != chan_type ||
             desc->channel[chan].normalized != type.norm) {
            return FALSE;
         }
      }
   }

   return TRUE;
}

/*
 * Do rounding when converting small unorm values to larger ones.
 * Not quite 100% accurate, as it's done by appending MSBs, but
 * should be good enough.
 */
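/*
 * Illustrative example (not part of the original code): replicating the
 * source's MSBs into the new low bits approximates the exact rescale
 * x_dst = round(x_src * ((1 << dst_bits) - 1) / ((1 << src_bits) - 1)).
 * E.g. widening the 5-bit unorm value 0b10110 (22) to 8 bits gives
 *
 *    (22 << 3) | (22 >> 2) = 0b10110101 = 181
 *
 * which matches the exact answer round(22 * 255 / 31) = 181.
 */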

static inline LLVMValueRef
scale_bits_up(struct gallivm_state *gallivm,
              int src_bits,
              int dst_bits,
              LLVMValueRef src,
              struct lp_type src_type)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef result = src;

   if (src_bits == 1 && dst_bits > 1) {
      /*
       * Useful for a1 - we'd need quite some repeated copies otherwise.
       */
      struct lp_build_context bld;
      LLVMValueRef dst_mask;
      lp_build_context_init(&bld, gallivm, src_type);
      dst_mask = lp_build_const_int_vec(gallivm, src_type,
                                        (1 << dst_bits) - 1);
      result = lp_build_cmp(&bld, PIPE_FUNC_EQUAL, src,
                            lp_build_const_int_vec(gallivm, src_type, 0));
      result = lp_build_andnot(&bld, dst_mask, result);
   }
   else if (dst_bits > src_bits) {
      /* Scale up bits */
      int db = dst_bits - src_bits;

      /* Shift left by difference in bits */
      result = LLVMBuildShl(builder,
                            src,
                            lp_build_const_int_vec(gallivm, src_type, db),
                            "");

      if (db <= src_bits) {
         /* Enough bits in src to fill the remainder */
         LLVMValueRef lower = LLVMBuildLShr(builder,
                                            src,
                                            lp_build_const_int_vec(gallivm, src_type,
                                                                   src_bits - db),
                                            "");

         result = LLVMBuildOr(builder, result, lower, "");
      } else if (db > src_bits) {
         /* Need to repeatedly copy src bits to fill remainder in dst */
         unsigned n;

         for (n = src_bits; n < dst_bits; n *= 2) {
            LLVMValueRef shuv = lp_build_const_int_vec(gallivm, src_type, n);

            result = LLVMBuildOr(builder,
                                 result,
                                 LLVMBuildLShr(builder, result, shuv, ""),
                                 "");
         }
      }
   } else {
      assert(dst_bits == src_bits);
   }

   return result;
}

/**
 * Unpack a single pixel into its XYZW components.
 *
 * @param desc the pixel format for the packed pixel value
 * @param packed integer pixel in a format such as PIPE_FORMAT_B8G8R8A8_UNORM
 *
 * @return XYZW in a float[4] vector.
 */
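/*
 * Example (illustrative, not part of the original code): for
 * PIPE_FORMAT_B8G8R8A8_UNORM the packed dword 0xffff8040 holds
 * B = 0x40, G = 0x80, R = 0xff, A = 0xff (in channel order, before any
 * swizzling), so the result is approximately
 * {0x40/255.0, 0x80/255.0, 1.0, 1.0} = {0.251, 0.502, 1.0, 1.0}.
 */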
static inline LLVMValueRef
lp_build_unpack_arith_rgba_aos(struct gallivm_state *gallivm,
                               const struct util_format_description *desc,
                               LLVMValueRef packed)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef shifted, casted, scaled, masked;
   LLVMValueRef shifts[4];
   LLVMValueRef masks[4];
   LLVMValueRef scales[4];
   LLVMTypeRef vec32_type;

   boolean normalized;
   boolean needs_uitofp;
   unsigned i;

   /* TODO: Support more formats */
   assert(desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
   assert(desc->block.width == 1);
   assert(desc->block.height == 1);
   assert(desc->block.bits <= 32);

   /* Do the intermediate integer computations with 32bit integers since it
    * matches floating point size */
   assert(LLVMTypeOf(packed) == LLVMInt32TypeInContext(gallivm->context));

   vec32_type = LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4);

   /* Broadcast the packed value to all four channels
    * before: packed = BGRA
    * after: packed = {BGRA, BGRA, BGRA, BGRA}
    */
   packed = LLVMBuildInsertElement(builder, LLVMGetUndef(vec32_type), packed,
                                   LLVMConstNull(LLVMInt32TypeInContext(gallivm->context)),
                                   "");
   packed = LLVMBuildShuffleVector(builder, packed, LLVMGetUndef(vec32_type),
                                   LLVMConstNull(vec32_type),
                                   "");

   /* Initialize vector constants */
   normalized = FALSE;
   needs_uitofp = FALSE;

   /* Loop over 4 color components */
   for (i = 0; i < 4; ++i) {
      unsigned bits = desc->channel[i].size;
      unsigned shift = desc->channel[i].shift;

      if (desc->channel[i].type == UTIL_FORMAT_TYPE_VOID) {
         shifts[i] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
         masks[i] = LLVMConstNull(LLVMInt32TypeInContext(gallivm->context));
         scales[i] = LLVMConstNull(LLVMFloatTypeInContext(gallivm->context));
      }
      else {
         unsigned long long mask = (1ULL << bits) - 1;

         assert(desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED);

         if (bits == 32) {
            needs_uitofp = TRUE;
         }

         shifts[i] = lp_build_const_int32(gallivm, shift);
         masks[i] = lp_build_const_int32(gallivm, mask);

         if (desc->channel[i].normalized) {
            scales[i] = lp_build_const_float(gallivm, 1.0 / mask);
            normalized = TRUE;
         }
         else
            scales[i] = lp_build_const_float(gallivm, 1.0);
      }
   }

   /* Ex: convert packed = {XYZW, XYZW, XYZW, XYZW}
    * into masked = {X, Y, Z, W}
    */
   if (desc->block.bits < 32 && normalized) {
      /*
       * Note: we cannot do the shift below on x86 natively until AVX2.
       *
       * Old llvm versions will resort to scalar extract/shift insert,
       * which is definitely terrible, new versions will just do
       * several vector shifts and shuffle/blend results together.
       * We could turn this into a variable left shift plus a constant
       * right shift, and llvm would then turn the variable left shift
       * into a mul for us (albeit without sse41 the mul needs emulation
       * too...). However, since we're going to do a float mul
       * anyway, we just adjust that mul instead (plus the mask), skipping
       * the shift completely.
       * We could also use an extra mul when the format isn't normalized and
       * we don't have AVX2 support, but don't bother for now. Unfortunately,
       * this strategy doesn't work for 32bit formats (such as rgb10a2 or even
       * rgba8 if it ends up here), as that would require UIToFP, albeit that
       * would be fixable with an easy 16bit shuffle (unless there are
       * channels crossing 16bit boundaries).
       */
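      /*
       * Illustrative example (not part of the original code): for the green
       * channel of PIPE_FORMAT_B5G6R5_UNORM (6 bits at shift 5), instead of
       *
       *    ((packed >> 5) & 0x3f) * (1.0f / 0x3f)
       *
       * the loop below arranges for
       *
       *    (packed & 0x7e0) * (1.0f / 0x7e0)
       *
       * which yields the same normalized value with no per-channel shift.
       */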
      for (i = 0; i < 4; ++i) {
         if (desc->channel[i].type != UTIL_FORMAT_TYPE_VOID) {
            unsigned bits = desc->channel[i].size;
            unsigned shift = desc->channel[i].shift;
            unsigned long long mask = ((1ULL << bits) - 1) << shift;
            scales[i] = lp_build_const_float(gallivm, 1.0 / mask);
            masks[i] = lp_build_const_int32(gallivm, mask);
         }
      }
      masked = LLVMBuildAnd(builder, packed, LLVMConstVector(masks, 4), "");
   } else {
      shifted = LLVMBuildLShr(builder, packed, LLVMConstVector(shifts, 4), "");
      masked = LLVMBuildAnd(builder, shifted, LLVMConstVector(masks, 4), "");
   }

   if (!needs_uitofp) {
      /* UIToFP can't be expressed in SSE2 */
      casted = LLVMBuildSIToFP(builder, masked,
                               LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), 4), "");
   } else {
      casted = LLVMBuildUIToFP(builder, masked,
                               LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), 4), "");
   }

   /*
    * At this point 'casted' may be a vector of floats such as
    * {255.0, 255.0, 255.0, 255.0}. (Normalized values may be multiplied
    * by powers of two). Next, if the pixel values are normalized
    * we'll scale this to {1.0, 1.0, 1.0, 1.0}.
    */

   if (normalized)
      scaled = LLVMBuildFMul(builder, casted, LLVMConstVector(scales, 4), "");
   else
      scaled = casted;

   return scaled;
}

/**
 * Pack a single pixel.
 *
 * @param rgba 4 float vector with the unpacked components.
 *
 * XXX: This is mostly for reference and testing -- operating on a single
 * pixel at a time is rarely if ever needed.
 */
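/*
 * Example (illustrative, not part of the original code): packing
 * rgba = {1.0, 0.0, 0.0, 1.0} as PIPE_FORMAT_R8G8B8A8_UNORM scales the
 * components to {255, 0, 0, 255}, shifts each to its channel position
 * (R at bit 0, A at bit 24) and ORs them together, yielding 0xff0000ff.
 */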
LLVMValueRef
lp_build_pack_rgba_aos(struct gallivm_state *gallivm,
                       const struct util_format_description *desc,
                       LLVMValueRef rgba)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef type;
   LLVMValueRef packed = NULL;
   LLVMValueRef swizzles[4];
   LLVMValueRef shifted, casted, scaled, unswizzled;
   LLVMValueRef shifts[4];
   LLVMValueRef scales[4];
   boolean normalized;
   unsigned i, j;

   assert(desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
   assert(desc->block.width == 1);
   assert(desc->block.height == 1);

   type = LLVMIntTypeInContext(gallivm->context, desc->block.bits);

   /* Unswizzle the color components into the source vector. */
   for (i = 0; i < 4; ++i) {
      for (j = 0; j < 4; ++j) {
         if (desc->swizzle[j] == i)
            break;
      }
      if (j < 4)
         swizzles[i] = lp_build_const_int32(gallivm, j);
      else
         swizzles[i] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
   }

   unswizzled = LLVMBuildShuffleVector(builder, rgba,
                                       LLVMGetUndef(LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), 4)),
                                       LLVMConstVector(swizzles, 4), "");

   normalized = FALSE;
   for (i = 0; i < 4; ++i) {
      unsigned bits = desc->channel[i].size;
      unsigned shift = desc->channel[i].shift;

      if (desc->channel[i].type == UTIL_FORMAT_TYPE_VOID) {
         shifts[i] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
         scales[i] = LLVMGetUndef(LLVMFloatTypeInContext(gallivm->context));
      }
      else {
         unsigned mask = (1 << bits) - 1;

         assert(desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED);
         assert(bits < 32);

         shifts[i] = lp_build_const_int32(gallivm, shift);

         if (desc->channel[i].normalized) {
            scales[i] = lp_build_const_float(gallivm, mask);
            normalized = TRUE;
         }
         else
            scales[i] = lp_build_const_float(gallivm, 1.0);
      }
   }

   if (normalized)
      scaled = LLVMBuildFMul(builder, unswizzled, LLVMConstVector(scales, 4), "");
   else
      scaled = unswizzled;

   casted = LLVMBuildFPToSI(builder, scaled,
                            LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4), "");

   shifted = LLVMBuildShl(builder, casted, LLVMConstVector(shifts, 4), "");

   /* Bitwise or all components */
   for (i = 0; i < 4; ++i) {
      if (desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED) {
         LLVMValueRef component = LLVMBuildExtractElement(builder, shifted,
                                                          lp_build_const_int32(gallivm, i), "");
         if (packed)
            packed = LLVMBuildOr(builder, packed, component, "");
         else
            packed = component;
      }
   }

   if (!packed)
      packed = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));

   if (desc->block.bits < 32)
      packed = LLVMBuildTrunc(builder, packed, type, "");

   return packed;
}



/**
 * Fetch a pixel into a 4 float AoS.
 *
 * \param format_desc describes format of the image we're fetching from
 * \param aligned whether the data is guaranteed to be aligned
 * \param base_ptr, offset address of the pixel block (or the texel if
 *                         uncompressed)
 * \param i, j the sub-block pixel coordinates.  For non-compressed formats
 *             these will always be (0, 0).
 * \param cache optional value pointing to a lp_build_format_cache structure
 * \return a 4 element vector with the pixel's RGBA values.
 */
LLVMValueRef
lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
                        const struct util_format_description *format_desc,
                        struct lp_type type,
                        boolean aligned,
                        LLVMValueRef base_ptr,
                        LLVMValueRef offset,
                        LLVMValueRef i,
                        LLVMValueRef j,
                        LLVMValueRef cache)
{
   const struct util_format_unpack_description *unpack =
      util_format_unpack_description(format_desc->format);
   LLVMBuilderRef builder = gallivm->builder;
   unsigned num_pixels = type.length / 4;
   struct lp_build_context bld;

   assert(type.length <= LP_MAX_VECTOR_LENGTH);
   assert(type.length % 4 == 0);

   lp_build_context_init(&bld, gallivm, type);

   /*
    * Trivial case
    *
    * The format matches the type (apart from a swizzle) so no need for
    * scaling or converting.
    */

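   /*
    * Example (illustrative, not part of the original code): fetching
    * PIPE_FORMAT_R8G8B8A8_UNORM into an 8-bit unorm type of length 16
    * takes this path: four packed pixels are gathered into a single
    * 128-bit vector, bitcast to <16 x i8>, and only the (here no-op)
    * swizzle remains.
    */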
   if (format_matches_type(format_desc, type) &&
       format_desc->block.bits <= type.width * 4 &&
       /* XXX this shouldn't be needed */
       util_is_power_of_two_or_zero(format_desc->block.bits)) {
      LLVMValueRef packed;
      LLVMTypeRef dst_vec_type = lp_build_vec_type(gallivm, type);
      struct lp_type fetch_type;
      unsigned vec_len = type.width * type.length;

      fetch_type = lp_type_uint(type.width*4);
      packed = lp_build_gather(gallivm, type.length/4,
                               format_desc->block.bits, fetch_type,
                               aligned, base_ptr, offset, TRUE);

      assert(format_desc->block.bits <= vec_len);
      (void) vec_len; /* silence unused var warning for non-debug build */

      packed = LLVMBuildBitCast(gallivm->builder, packed, dst_vec_type, "");
      return lp_build_format_swizzle_aos(format_desc, &bld, packed);
   }

   /*
    * Bit arithmetic for converting small_unorm to unorm8.
    *
    * This misses some opportunities for optimizations (like skipping mask
    * for the highest channel for instance, or doing bit scaling in parallel
    * for channels with the same bit width) but it should be passable for
    * all arithmetic formats.
    */
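   /*
    * Illustrative example (not part of the original code): for
    * PIPE_FORMAT_B4G4R4A4_UNORM each 4-bit channel c is extracted with a
    * shift and mask, then widened by scale_bits_up() to (c << 4) | c,
    * so 0xf becomes 0xff and 0x7 becomes 0x77.
    */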
   if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
       format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB &&
       util_format_fits_8unorm(format_desc) &&
       type.width == 8 && type.norm == 1 && type.sign == 0 &&
       type.fixed == 0 && type.floating == 0) {
      LLVMValueRef packed, res = NULL, chans[4], rgba[4];
      LLVMTypeRef dst_vec_type, conv_vec_type;
      struct lp_type fetch_type, conv_type;
      struct lp_build_context bld_conv;
      unsigned j;

      fetch_type = lp_type_uint(type.width*4);
      conv_type = lp_type_int_vec(type.width*4, type.width * type.length);
      dst_vec_type = lp_build_vec_type(gallivm, type);
      conv_vec_type = lp_build_vec_type(gallivm, conv_type);
      lp_build_context_init(&bld_conv, gallivm, conv_type);

      packed = lp_build_gather(gallivm, type.length/4,
                               format_desc->block.bits, fetch_type,
                               aligned, base_ptr, offset, TRUE);

      assert(format_desc->block.bits * type.length / 4 <=
             type.width * type.length);

      packed = LLVMBuildBitCast(gallivm->builder, packed, conv_vec_type, "");

      for (j = 0; j < format_desc->nr_channels; ++j) {
         unsigned mask = 0;
         unsigned sa = format_desc->channel[j].shift;

         mask = (1 << format_desc->channel[j].size) - 1;

         /* Extract bits from source */
         chans[j] = LLVMBuildLShr(builder, packed,
                                  lp_build_const_int_vec(gallivm, conv_type, sa),
                                  "");

         chans[j] = LLVMBuildAnd(builder, chans[j],
                                 lp_build_const_int_vec(gallivm, conv_type, mask),
                                 "");

         /* Scale bits */
         if (type.norm) {
            chans[j] = scale_bits_up(gallivm, format_desc->channel[j].size,
                                     type.width, chans[j], conv_type);
         }
      }
      /*
       * This is a hacked lp_build_format_swizzle_soa() since we need a
       * normalized 1 but only 8 bits in a 32bit vector...
       */
      for (j = 0; j < 4; ++j) {
         enum pipe_swizzle swizzle = format_desc->swizzle[j];
         if (swizzle == PIPE_SWIZZLE_1) {
            rgba[j] = lp_build_const_int_vec(gallivm, conv_type,
                                             (1 << type.width) - 1);
         } else {
            rgba[j] = lp_build_swizzle_soa_channel(&bld_conv, chans, swizzle);
         }
         if (j == 0) {
            res = rgba[j];
         } else {
            rgba[j] = LLVMBuildShl(builder, rgba[j],
                                   lp_build_const_int_vec(gallivm, conv_type,
                                                          j * type.width), "");
            res = LLVMBuildOr(builder, res, rgba[j], "");
         }
      }
      res = LLVMBuildBitCast(gallivm->builder, res, dst_vec_type, "");

      return res;
   }

   /*
    * Bit arithmetic
    */

   if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
       (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB ||
        format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) &&
       format_desc->block.width == 1 &&
       format_desc->block.height == 1 &&
       /* XXX this shouldn't be needed */
       util_is_power_of_two_or_zero(format_desc->block.bits) &&
       format_desc->block.bits <= 32 &&
       format_desc->is_bitmask &&
       !format_desc->is_mixed &&
       (format_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED ||
        format_desc->channel[1].type == UTIL_FORMAT_TYPE_UNSIGNED) &&
       !format_desc->channel[0].pure_integer) {

      LLVMValueRef tmps[LP_MAX_VECTOR_LENGTH/4];
      LLVMValueRef res[LP_MAX_VECTOR_WIDTH / 128];
      struct lp_type conv_type;
      unsigned k, num_conv_src, num_conv_dst;

      /*
       * Note this path is generally terrible for fetching multiple pixels.
       * We should make sure we cannot hit this code path for anything but
       * single pixels.
       */

      /*
       * Unpack a pixel at a time into a <4 x float> RGBA vector
       */

      for (k = 0; k < num_pixels; ++k) {
         LLVMValueRef packed;

         packed = lp_build_gather_elem(gallivm, num_pixels,
                                       format_desc->block.bits, 32, aligned,
                                       base_ptr, offset, k, FALSE);

         tmps[k] = lp_build_unpack_arith_rgba_aos(gallivm,
                                                  format_desc,
                                                  packed);
      }

      /*
       * Type conversion.
       *
       * TODO: We could avoid floating conversion for integer to
       * integer conversions.
       */

      if ((gallivm_debug & GALLIVM_DEBUG_PERF) && !type.floating) {
         debug_printf("%s: unpacking %s with floating point\n",
                      __FUNCTION__, format_desc->short_name);
      }

      conv_type = lp_float32_vec4_type();
      num_conv_src = num_pixels;
      num_conv_dst = 1;

      if (num_pixels % 8 == 0) {
         lp_build_concat_n(gallivm, lp_float32_vec4_type(),
                           tmps, num_pixels, tmps, num_pixels / 2);
         conv_type.length *= num_pixels / 4;
         num_conv_src = 4 * num_pixels / 8;
         if (type.width == 8 && type.floating == 0 && type.fixed == 0) {
            /*
             * FIXME: The fast float->unorm path (which is basically
             * skipping the MIN/MAX which are extremely pointless in any
             * case) requires that there are 2 destinations...
             * In any case, we really should make sure we don't hit this
             * code with multiple pixels for unorm8 dst types, it's
             * completely hopeless even if we do hit the right conversion.
             */
            type.length /= num_pixels / 4;
            num_conv_dst = num_pixels / 4;
         }
      }

      lp_build_conv(gallivm, conv_type, type,
                    tmps, num_conv_src, res, num_conv_dst);

      if (num_pixels % 8 == 0 &&
          (type.width == 8 && type.floating == 0 && type.fixed == 0)) {
         lp_build_concat_n(gallivm, type, res, num_conv_dst, res, 1);
      }

      return lp_build_format_swizzle_aos(format_desc, &bld, res[0]);
   }

   /* If all channels are of the same type and we are not using half-floats */
   if (format_desc->is_array &&
       format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB) {
      assert(!format_desc->is_mixed);
      return lp_build_fetch_rgba_aos_array(gallivm, format_desc, type,
                                           base_ptr, offset);
   }

   /*
    * YUV / subsampled formats
    */

   if (format_desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED) {
      struct lp_type tmp_type;
      LLVMValueRef tmp;

      memset(&tmp_type, 0, sizeof tmp_type);
      tmp_type.width = 8;
      tmp_type.length = num_pixels * 4;
      tmp_type.norm = TRUE;

      tmp = lp_build_fetch_subsampled_rgba_aos(gallivm,
                                               format_desc,
                                               num_pixels,
                                               base_ptr,
                                               offset,
                                               i, j);

      lp_build_conv(gallivm,
                    tmp_type, type,
                    &tmp, 1, &tmp, 1);

      return tmp;
   }

   /*
    * s3tc rgb formats
    */

   if (format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
      struct lp_type tmp_type;
      LLVMValueRef tmp;

      memset(&tmp_type, 0, sizeof tmp_type);
      tmp_type.width = 8;
      tmp_type.length = num_pixels * 4;
      tmp_type.norm = TRUE;

      tmp = lp_build_fetch_s3tc_rgba_aos(gallivm,
                                         format_desc,
                                         num_pixels,
                                         base_ptr,
                                         offset,
                                         i, j,
                                         cache);

      lp_build_conv(gallivm,
                    tmp_type, type,
                    &tmp, 1, &tmp, 1);

      return tmp;
   }

   /*
    * rgtc rgb formats
    */

   if (format_desc->layout == UTIL_FORMAT_LAYOUT_RGTC) {
      struct lp_type tmp_type;
      LLVMValueRef tmp;

      memset(&tmp_type, 0, sizeof tmp_type);
      tmp_type.width = 8;
      tmp_type.length = num_pixels * 4;
      tmp_type.norm = TRUE;
      tmp_type.sign = (format_desc->format == PIPE_FORMAT_RGTC1_SNORM ||
                       format_desc->format == PIPE_FORMAT_RGTC2_SNORM ||
                       format_desc->format == PIPE_FORMAT_LATC1_SNORM ||
                       format_desc->format == PIPE_FORMAT_LATC2_SNORM);

      tmp = lp_build_fetch_rgtc_rgba_aos(gallivm,
                                         format_desc,
                                         num_pixels,
                                         base_ptr,
                                         offset,
                                         i, j,
                                         cache);

      lp_build_conv(gallivm,
                    tmp_type, type,
                    &tmp, 1, &tmp, 1);

      return tmp;
   }

   /*
    * Fallback to util_format_unpack_description::fetch_rgba_8unorm().
    */

   if (unpack->fetch_rgba_8unorm &&
       !type.floating && type.width == 8 && !type.sign && type.norm) {
      /*
       * Fallback to calling util_format_unpack_description::fetch_rgba_8unorm.
       *
       * This is definitely not the most efficient way of fetching pixels, as
       * we miss the opportunity to do vectorization, but it is convenient
       * for formats or scenarios for which there was no opportunity or
       * incentive to optimize.
       */

      LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context);
      LLVMTypeRef pi8t = LLVMPointerType(i8t, 0);
      LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
      LLVMValueRef function;
      LLVMValueRef tmp_ptr;
      LLVMValueRef tmp;
      LLVMValueRef res;
      unsigned k;

      if (gallivm_debug & GALLIVM_DEBUG_PERF) {
         debug_printf("%s: falling back to util_format_%s_fetch_rgba_8unorm\n",
                      __FUNCTION__, format_desc->short_name);
      }

      /*
       * Declare and bind unpack->fetch_rgba_8unorm().
       */

      {
         /*
          * Function to call looks like:
          *   fetch(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j)
          */
         LLVMTypeRef ret_type;
         LLVMTypeRef arg_types[4];
         LLVMTypeRef function_type;

         ret_type = LLVMVoidTypeInContext(gallivm->context);
         arg_types[0] = pi8t;
         arg_types[1] = pi8t;
         arg_types[2] = i32t;
         arg_types[3] = i32t;
         function_type = LLVMFunctionType(ret_type, arg_types,
                                          ARRAY_SIZE(arg_types), 0);

         if (gallivm->cache)
            gallivm->cache->dont_cache = true;
         /* make const pointer for the C fetch_rgba_8unorm function */
         function = lp_build_const_int_pointer(gallivm,
            func_to_pointer((func_pointer) unpack->fetch_rgba_8unorm));

         /* cast the callee pointer to the function's type */
         function = LLVMBuildBitCast(builder, function,
                                     LLVMPointerType(function_type, 0),
                                     "cast callee");
      }

      tmp_ptr = lp_build_alloca(gallivm, i32t, "");

      res = LLVMGetUndef(LLVMVectorType(i32t, num_pixels));

      /*
       * Invoke unpack->fetch_rgba_8unorm() for each pixel and insert the
       * result into the packed result vector.
       */

      for (k = 0; k < num_pixels; ++k) {
         LLVMValueRef index = lp_build_const_int32(gallivm, k);
         LLVMValueRef args[4];

         args[0] = LLVMBuildBitCast(builder, tmp_ptr, pi8t, "");
         args[1] = lp_build_gather_elem_ptr(gallivm, num_pixels,
                                            base_ptr, offset, k);

         if (num_pixels == 1) {
            args[2] = i;
            args[3] = j;
         }
         else {
            args[2] = LLVMBuildExtractElement(builder, i, index, "");
            args[3] = LLVMBuildExtractElement(builder, j, index, "");
         }

         LLVMBuildCall(builder, function, args, ARRAY_SIZE(args), "");

         tmp = LLVMBuildLoad(builder, tmp_ptr, "");

         if (num_pixels == 1) {
            res = tmp;
         }
         else {
            res = LLVMBuildInsertElement(builder, res, tmp, index, "");
         }
      }

      /* Bitcast from <n x i32> to <4n x i8> */
      res = LLVMBuildBitCast(builder, res, bld.vec_type, "");

      return res;
   }

   /*
    * Fallback to util_format_unpack_description::fetch_rgba().
    */

   if (unpack->fetch_rgba) {
      /*
       * Fallback to calling util_format_unpack_description::fetch_rgba.
       *
       * This is definitely not the most efficient way of fetching pixels, as
       * we miss the opportunity to do vectorization, but it is convenient
       * for formats or scenarios for which there was no opportunity or
       * incentive to optimize.
       */

      LLVMTypeRef f32t = LLVMFloatTypeInContext(gallivm->context);
      LLVMTypeRef f32x4t = LLVMVectorType(f32t, 4);
      LLVMTypeRef pf32t = LLVMPointerType(f32t, 0);
      LLVMTypeRef pi8t = LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0);
      LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
      LLVMValueRef function;
      LLVMValueRef tmp_ptr;
      LLVMValueRef tmps[LP_MAX_VECTOR_LENGTH/4];
      LLVMValueRef res;
      unsigned k;

      if (gallivm_debug & GALLIVM_DEBUG_PERF) {
         debug_printf("%s: falling back to util_format_%s_fetch_rgba_float\n",
                      __FUNCTION__, format_desc->short_name);
      }

      /*
       * Declare and bind unpack->fetch_rgba().
       */

      {
         /*
          * Function to call looks like:
          *   fetch(float *dst, const uint8_t *src, unsigned i, unsigned j)
          */
         LLVMTypeRef ret_type;
         LLVMTypeRef arg_types[4];

         ret_type = LLVMVoidTypeInContext(gallivm->context);
         arg_types[0] = pf32t;
         arg_types[1] = pi8t;
         arg_types[2] = i32t;
         arg_types[3] = i32t;

         if (gallivm->cache)
            gallivm->cache->dont_cache = true;
         function = lp_build_const_func_pointer(gallivm,
            func_to_pointer((func_pointer) unpack->fetch_rgba),
            ret_type,
            arg_types, ARRAY_SIZE(arg_types),
            format_desc->short_name);
      }

      tmp_ptr = lp_build_alloca(gallivm, f32x4t, "");

      /*
       * Invoke unpack->fetch_rgba() for each pixel and collect the results
       * in the per-pixel float vectors.
       */

      for (k = 0; k < num_pixels; ++k) {
         LLVMValueRef args[4];

         args[0] = LLVMBuildBitCast(builder, tmp_ptr, pf32t, "");
         args[1] = lp_build_gather_elem_ptr(gallivm, num_pixels,
                                            base_ptr, offset, k);

         if (num_pixels == 1) {
            args[2] = i;
            args[3] = j;
         }
         else {
            LLVMValueRef index = lp_build_const_int32(gallivm, k);
            args[2] = LLVMBuildExtractElement(builder, i, index, "");
            args[3] = LLVMBuildExtractElement(builder, j, index, "");
         }

         LLVMBuildCall(builder, function, args, ARRAY_SIZE(args), "");

         tmps[k] = LLVMBuildLoad(builder, tmp_ptr, "");
      }

      lp_build_conv(gallivm,
                    lp_float32_vec4_type(),
                    type,
                    tmps, num_pixels, &res, 1);

      return res;
   }

   assert(!util_format_is_pure_integer(format_desc->format));

   assert(0);
   return lp_build_undef(gallivm, type);
}