/**************************************************************************
 *
 * Copyright 2010 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/

/**
 * @file
 * Texture sampling -- AoS.
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 * @author Brian Paul <brianp@vmware.com>
 */

#include "pipe/p_defines.h"
#include "pipe/p_state.h"
#include "util/u_debug.h"
#include "util/u_dump.h"
#include "util/u_memory.h"
#include "util/u_math.h"
#include "util/u_format.h"
#include "util/u_cpu_detect.h"
#include "lp_bld_debug.h"
#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_conv.h"
#include "lp_bld_arit.h"
#include "lp_bld_bitarit.h"
#include "lp_bld_logic.h"
#include "lp_bld_swizzle.h"
#include "lp_bld_pack.h"
#include "lp_bld_flow.h"
#include "lp_bld_gather.h"
#include "lp_bld_format.h"
#include "lp_bld_init.h"
#include "lp_bld_sample.h"
#include "lp_bld_sample_aos.h"
#include "lp_bld_quad.h"


/**
 * Build LLVM code for texture coord wrapping, for nearest filtering,
 * for scaled integer texcoords.
 * \param block_length  is the length of the pixel block along the
 *                      coordinate axis
 * \param coord  the incoming texcoord (s,t or r) scaled to the texture size
 * \param coord_f  the incoming texcoord (s,t or r) as float vec
 * \param length  the texture size along one dimension
 * \param stride  pixel stride along the coordinate axis (in bytes)
 * \param offset  the texel offset along the coord axis
 * \param is_pot  if TRUE, length is a power of two
 * \param wrap_mode  one of PIPE_TEX_WRAP_x
 * \param out_offset  byte offset for the wrapped coordinate
 * \param out_i  resulting sub-block pixel coordinate for coord0
 */
static void
lp_build_sample_wrap_nearest_int(struct lp_build_sample_context *bld,
                                 unsigned block_length,
                                 LLVMValueRef coord,
                                 LLVMValueRef coord_f,
                                 LLVMValueRef length,
                                 LLVMValueRef stride,
                                 LLVMValueRef offset,
                                 boolean is_pot,
                                 unsigned wrap_mode,
                                 LLVMValueRef *out_offset,
                                 LLVMValueRef *out_i)
{
   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef length_minus_one;

   length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);

   switch(wrap_mode) {
   case PIPE_TEX_WRAP_REPEAT:
      if(is_pot)
         coord = LLVMBuildAnd(builder, coord, length_minus_one, "");
      else {
         struct lp_build_context *coord_bld = &bld->coord_bld;
         LLVMValueRef length_f = lp_build_int_to_float(coord_bld, length);
         if (offset) {
            offset = lp_build_int_to_float(coord_bld, offset);
            offset = lp_build_div(coord_bld, offset, length_f);
            coord_f = lp_build_add(coord_bld, coord_f, offset);
         }
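         /*
          * fract_safe rather than plain fract: it guarantees a result
          * strictly below 1.0, so a coord of exactly 1.0 cannot turn
          * into an out-of-range texel index of 'length' after the
          * multiply below.
          */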
         coord = lp_build_fract_safe(coord_bld, coord_f);
         coord = lp_build_mul(coord_bld, coord, length_f);
         coord = lp_build_itrunc(coord_bld, coord);
      }
      break;

   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
      coord = lp_build_max(int_coord_bld, coord, int_coord_bld->zero);
      coord = lp_build_min(int_coord_bld, coord, length_minus_one);
      break;

   case PIPE_TEX_WRAP_CLAMP:
   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
   case PIPE_TEX_WRAP_MIRROR_REPEAT:
   case PIPE_TEX_WRAP_MIRROR_CLAMP:
   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
   default:
      assert(0);
   }

   lp_build_sample_partial_offset(int_coord_bld, block_length, coord, stride,
                                  out_offset, out_i);
}


/**
 * Build LLVM code for texture coord wrapping, for nearest filtering,
 * for float texcoords.
 * \param coord  the incoming texcoord (s,t or r)
 * \param length  the texture size along one dimension
 * \param offset  the texel offset along the coord axis
 * \param is_pot  if TRUE, length is a power of two
 * \param wrap_mode  one of PIPE_TEX_WRAP_x
 * \param icoord  the texcoord after wrapping, as int
 */
static void
lp_build_sample_wrap_nearest_float(struct lp_build_sample_context *bld,
                                   LLVMValueRef coord,
                                   LLVMValueRef length,
                                   LLVMValueRef offset,
                                   boolean is_pot,
                                   unsigned wrap_mode,
                                   LLVMValueRef *icoord)
{
   struct lp_build_context *coord_bld = &bld->coord_bld;
   LLVMValueRef length_minus_one;

   switch(wrap_mode) {
   case PIPE_TEX_WRAP_REPEAT:
      if (offset) {
         /* this is definitely not ideal for the POT case */
         offset = lp_build_int_to_float(coord_bld, offset);
         offset = lp_build_div(coord_bld, offset, length);
         coord = lp_build_add(coord_bld, coord, offset);
      }
      /* take fraction, unnormalize */
      coord = lp_build_fract_safe(coord_bld, coord);
      coord = lp_build_mul(coord_bld, coord, length);
      *icoord = lp_build_itrunc(coord_bld, coord);
      break;
   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
      length_minus_one = lp_build_sub(coord_bld, length, coord_bld->one);
      if (bld->static_sampler_state->normalized_coords) {
         /* scale coord to length */
         coord = lp_build_mul(coord_bld, coord, length);
      }
      if (offset) {
         offset = lp_build_int_to_float(coord_bld, offset);
         coord = lp_build_add(coord_bld, coord, offset);
      }
      coord = lp_build_clamp(coord_bld, coord, coord_bld->zero,
                             length_minus_one);
      *icoord = lp_build_itrunc(coord_bld, coord);
      break;

   case PIPE_TEX_WRAP_CLAMP:
   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
   case PIPE_TEX_WRAP_MIRROR_REPEAT:
   case PIPE_TEX_WRAP_MIRROR_CLAMP:
   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
   default:
      assert(0);
   }
}


/**
 * Helper to compute the first coord and the weight for
 * linear wrap repeat npot textures
 */
static void
lp_build_coord_repeat_npot_linear_int(struct lp_build_sample_context *bld,
                                      LLVMValueRef coord_f,
                                      LLVMValueRef length_i,
                                      LLVMValueRef length_f,
                                      LLVMValueRef *coord0_i,
                                      LLVMValueRef *weight_i)
{
   struct lp_build_context *coord_bld = &bld->coord_bld;
   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
   struct lp_build_context abs_coord_bld;
   struct lp_type abs_type;
   LLVMValueRef length_minus_one = lp_build_sub(int_coord_bld, length_i,
                                                int_coord_bld->one);
   LLVMValueRef mask, i32_c8, i32_c128, i32_c255;

   /* wrap with normalized floats is just fract */
   coord_f = lp_build_fract(coord_bld, coord_f);
   /* mul by size */
   coord_f = lp_build_mul(coord_bld, coord_f, length_f);
   /* convert to int, compute lerp weight */
   coord_f = lp_build_mul_imm(&bld->coord_bld, coord_f, 256);

   /* At this point we don't have any negative numbers so use non-signed
    * build context which might help on some archs.
    */
   abs_type = coord_bld->type;
   abs_type.sign = 0;
   lp_build_context_init(&abs_coord_bld, bld->gallivm, abs_type);
   *coord0_i = lp_build_iround(&abs_coord_bld, coord_f);

   /* subtract 0.5 (add -128) */
   i32_c128 = lp_build_const_int_vec(bld->gallivm, bld->int_coord_type, -128);
   *coord0_i = LLVMBuildAdd(bld->gallivm->builder, *coord0_i, i32_c128, "");

   /* compute fractional part (AND with 0xff) */
   i32_c255 = lp_build_const_int_vec(bld->gallivm, bld->int_coord_type, 255);
   *weight_i = LLVMBuildAnd(bld->gallivm->builder, *coord0_i, i32_c255, "");

   /* compute floor (shift right 8) */
   i32_c8 = lp_build_const_int_vec(bld->gallivm, bld->int_coord_type, 8);
   *coord0_i = LLVMBuildAShr(bld->gallivm->builder, *coord0_i, i32_c8, "");
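   /*
    * Worked example of the 8.8 fixed-point scheme above (illustrative
    * values): coord_f = 0.3 with length = 8 gives 0.3 * 8 * 256 = 614
    * after rounding; adding -128 yields 486, so weight = 486 & 0xff = 230
    * (~0.9 in 8-bit fixed point) and coord0 = 486 >> 8 = 1, matching
    * floor(2.4 - 0.5) = 1 with fraction 0.9.
    */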
   /*
    * we avoided the 0.5/length division before the repeat wrap,
    * now need to fix up edge cases with selects
    */
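   /*
    * The only negative value coord0 can take here is -1 (a sample point
    * just below the first texel center); repeat wrapping maps that to
    * the last texel, hence the select against length_minus_one below.
    */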
   mask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type,
                           PIPE_FUNC_LESS, *coord0_i, int_coord_bld->zero);
   *coord0_i = lp_build_select(int_coord_bld, mask, length_minus_one, *coord0_i);
}


/**
 * Build LLVM code for texture coord wrapping, for linear filtering,
 * for scaled integer texcoords.
 * \param block_length  is the length of the pixel block along the
 *                      coordinate axis
 * \param coord0  the incoming texcoord (s,t or r) scaled to the texture size
 * \param coord_f  the incoming texcoord (s,t or r) as float vec
 * \param length  the texture size along one dimension
 * \param stride  pixel stride along the coordinate axis (in bytes)
 * \param offset  the texel offset along the coord axis
 * \param is_pot  if TRUE, length is a power of two
 * \param wrap_mode  one of PIPE_TEX_WRAP_x
 * \param offset0  resulting relative offset for coord0
 * \param offset1  resulting relative offset for coord0 + 1
 * \param i0  resulting sub-block pixel coordinate for coord0
 * \param i1  resulting sub-block pixel coordinate for coord0 + 1
 */
static void
lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld,
                                unsigned block_length,
                                LLVMValueRef coord0,
                                LLVMValueRef *weight_i,
                                LLVMValueRef coord_f,
                                LLVMValueRef length,
                                LLVMValueRef stride,
                                LLVMValueRef offset,
                                boolean is_pot,
                                unsigned wrap_mode,
                                LLVMValueRef *offset0,
                                LLVMValueRef *offset1,
                                LLVMValueRef *i0,
                                LLVMValueRef *i1)
{
   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef length_minus_one;
   LLVMValueRef lmask, umask, mask;

   /*
    * If the pixel block covers more than one pixel then there is no easy
    * way to calculate offset1 relative to offset0. Instead, compute them
    * independently. Otherwise, try to compute offset0 and offset1 with
    * a single stride multiplication.
    */

   length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);

   if (block_length != 1) {
      LLVMValueRef coord1;
      switch(wrap_mode) {
      case PIPE_TEX_WRAP_REPEAT:
         if (is_pot) {
            coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
            coord0 = LLVMBuildAnd(builder, coord0, length_minus_one, "");
            coord1 = LLVMBuildAnd(builder, coord1, length_minus_one, "");
         }
         else {
            LLVMValueRef mask;
            LLVMValueRef length_f = lp_build_int_to_float(&bld->coord_bld, length);
            if (offset) {
               offset = lp_build_int_to_float(&bld->coord_bld, offset);
               offset = lp_build_div(&bld->coord_bld, offset, length_f);
               coord_f = lp_build_add(&bld->coord_bld, coord_f, offset);
            }
            lp_build_coord_repeat_npot_linear_int(bld, coord_f,
                                                  length, length_f,
                                                  &coord0, weight_i);
            mask = lp_build_compare(bld->gallivm, int_coord_bld->type,
                                    PIPE_FUNC_NOTEQUAL, coord0, length_minus_one);
            coord1 = LLVMBuildAnd(builder,
                                  lp_build_add(int_coord_bld, coord0,
                                               int_coord_bld->one),
                                  mask, "");
         }
         break;

      case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
         coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
         coord0 = lp_build_clamp(int_coord_bld, coord0, int_coord_bld->zero,
                                 length_minus_one);
         coord1 = lp_build_clamp(int_coord_bld, coord1, int_coord_bld->zero,
                                 length_minus_one);
         break;

      case PIPE_TEX_WRAP_CLAMP:
      case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
      case PIPE_TEX_WRAP_MIRROR_REPEAT:
      case PIPE_TEX_WRAP_MIRROR_CLAMP:
      case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
      case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
      default:
         assert(0);
         coord0 = int_coord_bld->zero;
         coord1 = int_coord_bld->zero;
         break;
      }
      lp_build_sample_partial_offset(int_coord_bld, block_length, coord0, stride,
                                     offset0, i0);
      lp_build_sample_partial_offset(int_coord_bld, block_length, coord1, stride,
                                     offset1, i1);
      return;
   }

   *i0 = int_coord_bld->zero;
   *i1 = int_coord_bld->zero;

   switch(wrap_mode) {
   case PIPE_TEX_WRAP_REPEAT:
      if (is_pot) {
         coord0 = LLVMBuildAnd(builder, coord0, length_minus_one, "");
      }
      else {
         LLVMValueRef length_f = lp_build_int_to_float(&bld->coord_bld, length);
         if (offset) {
            offset = lp_build_int_to_float(&bld->coord_bld, offset);
            offset = lp_build_div(&bld->coord_bld, offset, length_f);
            coord_f = lp_build_add(&bld->coord_bld, coord_f, offset);
         }
         lp_build_coord_repeat_npot_linear_int(bld, coord_f,
                                               length, length_f,
                                               &coord0, weight_i);
      }

      mask = lp_build_compare(bld->gallivm, int_coord_bld->type,
                              PIPE_FUNC_NOTEQUAL, coord0, length_minus_one);

      *offset0 = lp_build_mul(int_coord_bld, coord0, stride);
      *offset1 = LLVMBuildAnd(builder,
                              lp_build_add(int_coord_bld, *offset0, stride),
                              mask, "");
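      /*
       * offset1 = (offset0 + stride) & mask: the all-ones/all-zeros mask
       * zeroes offset1 back to texel 0 whenever coord0 is the last texel,
       * implementing the repeat wrap of coord0 + 1 without computing a
       * second wrapped coordinate.
       */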
      break;

   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
      /* XXX this might be slower than the separate path
       * on some newer cpus. With sse41 this is 8 instructions vs. 7
       * - at least on SNB this is almost certainly slower since
       * min/max are cheaper than selects, and the muls aren't bad.
       */
      lmask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type,
                               PIPE_FUNC_GEQUAL, coord0, int_coord_bld->zero);
      umask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type,
                               PIPE_FUNC_LESS, coord0, length_minus_one);

      coord0 = lp_build_select(int_coord_bld, lmask, coord0, int_coord_bld->zero);
      coord0 = lp_build_select(int_coord_bld, umask, coord0, length_minus_one);

      mask = LLVMBuildAnd(builder, lmask, umask, "");

      *offset0 = lp_build_mul(int_coord_bld, coord0, stride);
      *offset1 = lp_build_add(int_coord_bld,
                              *offset0,
                              LLVMBuildAnd(builder, stride, mask, ""));
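      /*
       * stride is added only when 0 <= coord0 < length - 1 (the unclamped
       * coord0 is strictly inside the texture), so at either edge
       * coord0 + 1 clamps to the same texel as coord0.
       */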
      break;

   case PIPE_TEX_WRAP_CLAMP:
   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
   case PIPE_TEX_WRAP_MIRROR_REPEAT:
   case PIPE_TEX_WRAP_MIRROR_CLAMP:
   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
   default:
      assert(0);
      *offset0 = int_coord_bld->zero;
      *offset1 = int_coord_bld->zero;
      break;
   }
}


/**
 * Build LLVM code for texture coord wrapping, for linear filtering,
 * for float texcoords.
 * \param block_length  is the length of the pixel block along the
 *                      coordinate axis
 * \param coord  the incoming texcoord (s,t or r)
 * \param length  the texture size along one dimension
 * \param offset  the texel offset along the coord axis
 * \param is_pot  if TRUE, length is a power of two
 * \param wrap_mode  one of PIPE_TEX_WRAP_x
 * \param coord0  the first texcoord after wrapping, as int
 * \param coord1  the second texcoord after wrapping, as int
 * \param weight  the filter weight as int (0-255)
 * \param force_nearest  if this coord actually uses nearest filtering
 */
static void
lp_build_sample_wrap_linear_float(struct lp_build_sample_context *bld,
                                  unsigned block_length,
                                  LLVMValueRef coord,
                                  LLVMValueRef length,
                                  LLVMValueRef offset,
                                  boolean is_pot,
                                  unsigned wrap_mode,
                                  LLVMValueRef *coord0,
                                  LLVMValueRef *coord1,
                                  LLVMValueRef *weight,
                                  unsigned force_nearest)
{
   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
   struct lp_build_context *coord_bld = &bld->coord_bld;
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5);
   LLVMValueRef length_minus_one = lp_build_sub(coord_bld, length, coord_bld->one);

   switch(wrap_mode) {
   case PIPE_TEX_WRAP_REPEAT:
      if (is_pot) {
         /* mul by size and subtract 0.5 */
         coord = lp_build_mul(coord_bld, coord, length);
         if (offset) {
            offset = lp_build_int_to_float(coord_bld, offset);
            coord = lp_build_add(coord_bld, coord, offset);
         }
         if (!force_nearest)
            coord = lp_build_sub(coord_bld, coord, half);
         *coord1 = lp_build_add(coord_bld, coord, coord_bld->one);
         /* convert to int, compute lerp weight */
         lp_build_ifloor_fract(coord_bld, coord, coord0, weight);
         *coord1 = lp_build_ifloor(coord_bld, *coord1);
         /* repeat wrap */
         length_minus_one = lp_build_itrunc(coord_bld, length_minus_one);
         *coord0 = LLVMBuildAnd(builder, *coord0, length_minus_one, "");
         *coord1 = LLVMBuildAnd(builder, *coord1, length_minus_one, "");
      }
      else {
         LLVMValueRef mask;
         if (offset) {
            offset = lp_build_int_to_float(coord_bld, offset);
            offset = lp_build_div(coord_bld, offset, length);
            coord = lp_build_add(coord_bld, coord, offset);
         }
         /* wrap with normalized floats is just fract */
         coord = lp_build_fract(coord_bld, coord);
         /* unnormalize */
         coord = lp_build_mul(coord_bld, coord, length);
         /*
          * we avoided the 0.5/length division, have to fix up wrong
          * edge cases with selects
          */
         *coord1 = lp_build_add(coord_bld, coord, half);
         coord = lp_build_sub(coord_bld, coord, half);
         *weight = lp_build_fract(coord_bld, coord);
         mask = lp_build_compare(coord_bld->gallivm, coord_bld->type,
                                 PIPE_FUNC_LESS, coord, coord_bld->zero);
         *coord0 = lp_build_select(coord_bld, mask, length_minus_one, coord);
         *coord0 = lp_build_itrunc(coord_bld, *coord0);
         mask = lp_build_compare(coord_bld->gallivm, coord_bld->type,
                                 PIPE_FUNC_LESS, *coord1, length);
         *coord1 = lp_build_select(coord_bld, mask, *coord1, coord_bld->zero);
         *coord1 = lp_build_itrunc(coord_bld, *coord1);
      }
      break;
   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
      if (bld->static_sampler_state->normalized_coords) {
         /* mul by tex size */
         coord = lp_build_mul(coord_bld, coord, length);
      }
      if (offset) {
         offset = lp_build_int_to_float(coord_bld, offset);
         coord = lp_build_add(coord_bld, coord, offset);
      }
      /* subtract 0.5 */
      if (!force_nearest) {
         coord = lp_build_sub(coord_bld, coord, half);
      }
      /* clamp to [0, length - 1] */
      coord = lp_build_min(coord_bld, coord, length_minus_one);
      coord = lp_build_max(coord_bld, coord, coord_bld->zero);
      *coord1 = lp_build_add(coord_bld, coord, coord_bld->one);
      /* convert to int, compute lerp weight */
      lp_build_ifloor_fract(coord_bld, coord, coord0, weight);
      /* coord1 = min(coord1, length-1) */
      *coord1 = lp_build_min(coord_bld, *coord1, length_minus_one);
      *coord1 = lp_build_itrunc(coord_bld, *coord1);
      break;
   default:
      assert(0);
      *coord0 = int_coord_bld->zero;
      *coord1 = int_coord_bld->zero;
      *weight = coord_bld->zero;
      break;
   }
   *weight = lp_build_mul_imm(coord_bld, *weight, 256);
   *weight = lp_build_itrunc(coord_bld, *weight);
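   /* weight is now an integer in [0,255]: the 8-bit prescaled lerp weight
    * expected by the fixed-point filtering code below. */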
   return;
}


/**
 * Fetch texels for image with nearest sampling.
 * Return filtered color as two vectors of 16-bit fixed point values.
 */
static void
lp_build_sample_fetch_image_nearest(struct lp_build_sample_context *bld,
                                    LLVMValueRef data_ptr,
                                    LLVMValueRef offset,
                                    LLVMValueRef x_subcoord,
                                    LLVMValueRef y_subcoord,
                                    LLVMValueRef *colors)
{
   /*
    * Fetch the pixels as 4 x 32bit (rgba order might differ):
    *
    *   rgba0 rgba1 rgba2 rgba3
    *
    * bit cast them into 16 x u8
    *
    *   r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3
    *
    * unpack them into two 8 x i16:
    *
    *   r0 g0 b0 a0 r1 g1 b1 a1
    *   r2 g2 b2 a2 r3 g3 b3 a3
    *
    * The higher 8 bits of the resulting elements will be zero.
    */
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef rgba8;
   struct lp_build_context u8n;
   LLVMTypeRef u8n_vec_type;

   lp_build_context_init(&u8n, bld->gallivm, lp_type_unorm(8, bld->vector_width));
   u8n_vec_type = lp_build_vec_type(bld->gallivm, u8n.type);

   if (util_format_is_rgba8_variant(bld->format_desc)) {
      /*
       * Given the format is a rgba8, just read the pixels as is,
       * without any swizzling. Swizzling will be done later.
       */
      rgba8 = lp_build_gather(bld->gallivm,
                              bld->texel_type.length,
                              bld->format_desc->block.bits,
                              bld->texel_type.width,
                              data_ptr, offset, TRUE);

      rgba8 = LLVMBuildBitCast(builder, rgba8, u8n_vec_type, "");
   }
   else {
      rgba8 = lp_build_fetch_rgba_aos(bld->gallivm,
                                      bld->format_desc,
                                      u8n.type,
                                      data_ptr, offset,
                                      x_subcoord,
                                      y_subcoord);
   }

   *colors = rgba8;
}


/**
 * Sample a single texture image with nearest sampling.
 * If sampling a cube texture, r = cube face in [0,5].
 * Return filtered color as two vectors of 16-bit fixed point values.
 */
static void
lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
                              LLVMValueRef int_size,
                              LLVMValueRef row_stride_vec,
                              LLVMValueRef img_stride_vec,
                              LLVMValueRef data_ptr,
                              LLVMValueRef mipoffsets,
                              LLVMValueRef s,
                              LLVMValueRef t,
                              LLVMValueRef r,
                              const LLVMValueRef *offsets,
                              LLVMValueRef *colors)
{
   const unsigned dims = bld->dims;
   struct lp_build_context i32;
   LLVMValueRef width_vec, height_vec, depth_vec;
   LLVMValueRef s_ipart, t_ipart = NULL, r_ipart = NULL;
   LLVMValueRef s_float, t_float = NULL, r_float = NULL;
   LLVMValueRef x_stride;
   LLVMValueRef x_offset, offset;
   LLVMValueRef x_subcoord, y_subcoord, z_subcoord;

   lp_build_context_init(&i32, bld->gallivm, lp_type_int_vec(32, bld->vector_width));

   lp_build_extract_image_sizes(bld,
                                &bld->int_size_bld,
                                bld->int_coord_type,
                                int_size,
                                &width_vec,
                                &height_vec,
                                &depth_vec);

   s_float = s; t_float = t; r_float = r;

   if (bld->static_sampler_state->normalized_coords) {
      LLVMValueRef flt_size;

      flt_size = lp_build_int_to_float(&bld->float_size_bld, int_size);

      lp_build_unnormalized_coords(bld, flt_size, &s, &t, &r);
   }

   /* convert float to int */
   /* For correct rounding, need floor, not truncation here.
    * Note that in some cases (clamp to edge, no texel offsets) we
    * could use a non-signed build context, which would greatly help
    * archs that don't have a native floor rounding instruction.
    */
   s_ipart = lp_build_ifloor(&bld->coord_bld, s);
   if (dims >= 2)
      t_ipart = lp_build_ifloor(&bld->coord_bld, t);
   if (dims >= 3)
      r_ipart = lp_build_ifloor(&bld->coord_bld, r);

   /* add texel offsets */
   if (offsets[0]) {
      s_ipart = lp_build_add(&i32, s_ipart, offsets[0]);
      if (dims >= 2) {
         t_ipart = lp_build_add(&i32, t_ipart, offsets[1]);
         if (dims >= 3) {
            r_ipart = lp_build_add(&i32, r_ipart, offsets[2]);
         }
      }
   }

   /* get pixel, row, image strides */
   x_stride = lp_build_const_vec(bld->gallivm,
                                 bld->int_coord_bld.type,
                                 bld->format_desc->block.bits/8);

   /* Do texcoord wrapping, compute texel offset */
   lp_build_sample_wrap_nearest_int(bld,
                                    bld->format_desc->block.width,
                                    s_ipart, s_float,
                                    width_vec, x_stride, offsets[0],
                                    bld->static_texture_state->pot_width,
                                    bld->static_sampler_state->wrap_s,
                                    &x_offset, &x_subcoord);
   offset = x_offset;
   if (dims >= 2) {
      LLVMValueRef y_offset;
      lp_build_sample_wrap_nearest_int(bld,
                                       bld->format_desc->block.height,
                                       t_ipart, t_float,
                                       height_vec, row_stride_vec, offsets[1],
                                       bld->static_texture_state->pot_height,
                                       bld->static_sampler_state->wrap_t,
                                       &y_offset, &y_subcoord);
      offset = lp_build_add(&bld->int_coord_bld, offset, y_offset);
      if (dims >= 3) {
         LLVMValueRef z_offset;
         lp_build_sample_wrap_nearest_int(bld,
                                          1, /* block length (depth) */
                                          r_ipart, r_float,
                                          depth_vec, img_stride_vec, offsets[2],
                                          bld->static_texture_state->pot_depth,
                                          bld->static_sampler_state->wrap_r,
                                          &z_offset, &z_subcoord);
         offset = lp_build_add(&bld->int_coord_bld, offset, z_offset);
      }
   }
   if (bld->static_texture_state->target == PIPE_TEXTURE_CUBE ||
       bld->static_texture_state->target == PIPE_TEXTURE_1D_ARRAY ||
       bld->static_texture_state->target == PIPE_TEXTURE_2D_ARRAY) {
      LLVMValueRef z_offset;
      /* The r coord is the cube face in [0,5] or array layer */
      z_offset = lp_build_mul(&bld->int_coord_bld, r, img_stride_vec);
      offset = lp_build_add(&bld->int_coord_bld, offset, z_offset);
   }
   if (mipoffsets) {
      offset = lp_build_add(&bld->int_coord_bld, offset, mipoffsets);
   }

   lp_build_sample_fetch_image_nearest(bld, data_ptr, offset,
                                       x_subcoord, y_subcoord,
                                       colors);
}


/**
 * Sample a single texture image with nearest sampling.
 * If sampling a cube texture, r = cube face in [0,5].
 * Return filtered color as two vectors of 16-bit fixed point values.
 * Does address calcs (except offsets) with floats.
 * Useful for AVX which has support for 8x32 floats but not 8x32 ints.
 */
static void
lp_build_sample_image_nearest_afloat(struct lp_build_sample_context *bld,
                                     LLVMValueRef int_size,
                                     LLVMValueRef row_stride_vec,
                                     LLVMValueRef img_stride_vec,
                                     LLVMValueRef data_ptr,
                                     LLVMValueRef mipoffsets,
                                     LLVMValueRef s,
                                     LLVMValueRef t,
                                     LLVMValueRef r,
                                     const LLVMValueRef *offsets,
                                     LLVMValueRef *colors)
{
   const unsigned dims = bld->dims;
   LLVMValueRef width_vec, height_vec, depth_vec;
   LLVMValueRef offset;
   LLVMValueRef x_subcoord, y_subcoord;
   LLVMValueRef x_icoord = NULL, y_icoord = NULL, z_icoord = NULL;
   LLVMValueRef flt_size;

   flt_size = lp_build_int_to_float(&bld->float_size_bld, int_size);

   lp_build_extract_image_sizes(bld,
                                &bld->float_size_bld,
                                bld->coord_type,
                                flt_size,
                                &width_vec,
                                &height_vec,
                                &depth_vec);

   /* Do texcoord wrapping */
   lp_build_sample_wrap_nearest_float(bld,
                                      s, width_vec, offsets[0],
                                      bld->static_texture_state->pot_width,
                                      bld->static_sampler_state->wrap_s,
                                      &x_icoord);

   if (dims >= 2) {
      lp_build_sample_wrap_nearest_float(bld,
                                         t, height_vec, offsets[1],
                                         bld->static_texture_state->pot_height,
                                         bld->static_sampler_state->wrap_t,
                                         &y_icoord);

      if (dims >= 3) {
         lp_build_sample_wrap_nearest_float(bld,
                                            r, depth_vec, offsets[2],
                                            bld->static_texture_state->pot_depth,
                                            bld->static_sampler_state->wrap_r,
                                            &z_icoord);
      }
   }
   if (bld->static_texture_state->target == PIPE_TEXTURE_CUBE ||
       bld->static_texture_state->target == PIPE_TEXTURE_1D_ARRAY ||
       bld->static_texture_state->target == PIPE_TEXTURE_2D_ARRAY) {
      z_icoord = r;
   }

   /*
    * From here on we deal with ints, and we should split up the 256bit
    * vectors manually for better generated code.
    */

   /*
    * compute texel offsets -
    * cannot do offset calc with floats, difficult for block-based formats,
    * and not enough precision anyway.
    */
   lp_build_sample_offset(&bld->int_coord_bld,
                          bld->format_desc,
                          x_icoord, y_icoord,
                          z_icoord,
                          row_stride_vec, img_stride_vec,
                          &offset,
                          &x_subcoord, &y_subcoord);
   if (mipoffsets) {
      offset = lp_build_add(&bld->int_coord_bld, offset, mipoffsets);
   }

   lp_build_sample_fetch_image_nearest(bld, data_ptr, offset,
                                       x_subcoord, y_subcoord,
                                       colors);
}


/**
 * Fetch texels for image with linear sampling.
 * Return filtered color as two vectors of 16-bit fixed point values.
 */
static void
lp_build_sample_fetch_image_linear(struct lp_build_sample_context *bld,
                                   LLVMValueRef data_ptr,
                                   LLVMValueRef offset[2][2][2],
                                   LLVMValueRef x_subcoord[2],
                                   LLVMValueRef y_subcoord[2],
                                   LLVMValueRef s_fpart,
                                   LLVMValueRef t_fpart,
                                   LLVMValueRef r_fpart,
                                   LLVMValueRef *colors)
{
   const unsigned dims = bld->dims;
   LLVMBuilderRef builder = bld->gallivm->builder;
   struct lp_build_context u8n;
   LLVMTypeRef u8n_vec_type;
   LLVMTypeRef elem_type = LLVMInt32TypeInContext(bld->gallivm->context);
   LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
   LLVMValueRef shuffle;
   LLVMValueRef neighbors[2][2][2]; /* [z][y][x] */
   LLVMValueRef packed;
   unsigned i, j, k;
   unsigned numj, numk;

   lp_build_context_init(&u8n, bld->gallivm, lp_type_unorm(8, bld->vector_width));
   u8n_vec_type = lp_build_vec_type(bld->gallivm, u8n.type);

   /*
    * Transform 4 x i32 in
    *
    *   s_fpart = {s0, s1, s2, s3}
    *
    * where each value is between 0 and 0xff,
    *
    * into one 16 x u8
    *
    *   s_fpart = {s0, s0, s0, s0, s1, s1, s1, s1, s2, s2, s2, s2, s3, s3, s3, s3}
    *
    * and likewise for t_fpart. There is no risk of losing precision here
    * since the fractional parts only use the lower 8bits.
    */
   s_fpart = LLVMBuildBitCast(builder, s_fpart, u8n_vec_type, "");
   if (dims >= 2)
      t_fpart = LLVMBuildBitCast(builder, t_fpart, u8n_vec_type, "");
   if (dims >= 3)
      r_fpart = LLVMBuildBitCast(builder, r_fpart, u8n_vec_type, "");

   for (j = 0; j < u8n.type.length; j += 4) {
#ifdef PIPE_ARCH_LITTLE_ENDIAN
      unsigned subindex = 0;
#else
      unsigned subindex = 3;
#endif
      LLVMValueRef index;

      index = LLVMConstInt(elem_type, j + subindex, 0);
      for (i = 0; i < 4; ++i)
         shuffles[j + i] = index;
   }
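   /*
    * Each 32-bit weight keeps its payload in the low byte, so the shuffle
    * below replicates byte 0 (little endian) or byte 3 (big endian) of
    * every weight across the 4 channel bytes of its pixel.
    */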

   shuffle = LLVMConstVector(shuffles, u8n.type.length);

   s_fpart = LLVMBuildShuffleVector(builder, s_fpart, u8n.undef,
                                    shuffle, "");
   if (dims >= 2) {
      t_fpart = LLVMBuildShuffleVector(builder, t_fpart, u8n.undef,
                                       shuffle, "");
   }
   if (dims >= 3) {
      r_fpart = LLVMBuildShuffleVector(builder, r_fpart, u8n.undef,
                                       shuffle, "");
   }

   /*
    * Fetch the pixels as 4 x 32bit (rgba order might differ):
    *
    *   rgba0 rgba1 rgba2 rgba3
    *
    * bit cast them into 16 x u8
    *
    *   r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3
    *
    * unpack them into two 8 x i16:
    *
    *   r0 g0 b0 a0 r1 g1 b1 a1
    *   r2 g2 b2 a2 r3 g3 b3 a3
    *
    * The higher 8 bits of the resulting elements will be zero.
    */
   numj = 1 + (dims >= 2);
   numk = 1 + (dims >= 3);

   for (k = 0; k < numk; k++) {
      for (j = 0; j < numj; j++) {
         for (i = 0; i < 2; i++) {
            LLVMValueRef rgba8;

            if (util_format_is_rgba8_variant(bld->format_desc)) {
               /*
                * Given the format is a rgba8, just read the pixels as is,
                * without any swizzling. Swizzling will be done later.
                */
               rgba8 = lp_build_gather(bld->gallivm,
                                       bld->texel_type.length,
                                       bld->format_desc->block.bits,
                                       bld->texel_type.width,
                                       data_ptr, offset[k][j][i], TRUE);

               rgba8 = LLVMBuildBitCast(builder, rgba8, u8n_vec_type, "");
            }
            else {
               rgba8 = lp_build_fetch_rgba_aos(bld->gallivm,
                                               bld->format_desc,
                                               u8n.type,
                                               data_ptr, offset[k][j][i],
                                               x_subcoord[i],
                                               y_subcoord[j]);
            }

            neighbors[k][j][i] = rgba8;
         }
      }
   }

   /*
    * Linear interpolation with 8.8 fixed point.
    */
   if (bld->static_sampler_state->force_nearest_s) {
      /* special case 1-D lerp */
      packed = lp_build_lerp(&u8n,
                             t_fpart,
                             neighbors[0][0][0],
                             neighbors[0][0][1],
                             LP_BLD_LERP_PRESCALED_WEIGHTS);
   }
   else if (bld->static_sampler_state->force_nearest_t) {
      /* special case 1-D lerp */
      packed = lp_build_lerp(&u8n,
                             s_fpart,
                             neighbors[0][0][0],
                             neighbors[0][0][1],
                             LP_BLD_LERP_PRESCALED_WEIGHTS);
   }
   else {
      /* general 1/2/3-D lerping */
      if (dims == 1) {
         packed = lp_build_lerp(&u8n,
                                s_fpart,
                                neighbors[0][0][0],
                                neighbors[0][0][1],
                                LP_BLD_LERP_PRESCALED_WEIGHTS);
      } else if (dims == 2) {
         /* 2-D lerp */
         packed = lp_build_lerp_2d(&u8n,
                                   s_fpart, t_fpart,
                                   neighbors[0][0][0],
                                   neighbors[0][0][1],
                                   neighbors[0][1][0],
                                   neighbors[0][1][1],
                                   LP_BLD_LERP_PRESCALED_WEIGHTS);
      } else {
         /* 3-D lerp */
         assert(dims == 3);
         packed = lp_build_lerp_3d(&u8n,
                                   s_fpart, t_fpart, r_fpart,
                                   neighbors[0][0][0],
                                   neighbors[0][0][1],
                                   neighbors[0][1][0],
                                   neighbors[0][1][1],
                                   neighbors[1][0][0],
                                   neighbors[1][0][1],
                                   neighbors[1][1][0],
                                   neighbors[1][1][1],
                                   LP_BLD_LERP_PRESCALED_WEIGHTS);
      }
   }

   *colors = packed;
}

/**
 * Sample a single texture image with (bi-)(tri-)linear sampling.
 * Return filtered color as two vectors of 16-bit fixed point values.
 */
static void
lp_build_sample_image_linear(struct lp_build_sample_context *bld,
                             LLVMValueRef int_size,
                             LLVMValueRef row_stride_vec,
                             LLVMValueRef img_stride_vec,
                             LLVMValueRef data_ptr,
                             LLVMValueRef mipoffsets,
                             LLVMValueRef s,
                             LLVMValueRef t,
                             LLVMValueRef r,
                             const LLVMValueRef *offsets,
                             LLVMValueRef *colors)
{
   const unsigned dims = bld->dims;
   LLVMBuilderRef builder = bld->gallivm->builder;
   struct lp_build_context i32;
   LLVMValueRef i32_c8, i32_c128, i32_c255;
   LLVMValueRef width_vec, height_vec, depth_vec;
   LLVMValueRef s_ipart, s_fpart, s_float;
   LLVMValueRef t_ipart = NULL, t_fpart = NULL, t_float = NULL;
   LLVMValueRef r_ipart = NULL, r_fpart = NULL, r_float = NULL;
   LLVMValueRef x_stride, y_stride, z_stride;
   LLVMValueRef x_offset0, x_offset1;
   LLVMValueRef y_offset0, y_offset1;
   LLVMValueRef z_offset0, z_offset1;
   LLVMValueRef offset[2][2][2]; /* [z][y][x] */
   LLVMValueRef x_subcoord[2], y_subcoord[2], z_subcoord[2];
   unsigned x, y, z;

   lp_build_context_init(&i32, bld->gallivm, lp_type_int_vec(32, bld->vector_width));

   lp_build_extract_image_sizes(bld,
                                &bld->int_size_bld,
                                bld->int_coord_type,
                                int_size,
                                &width_vec,
                                &height_vec,
                                &depth_vec);

   s_float = s; t_float = t; r_float = r;

   if (bld->static_sampler_state->normalized_coords) {
      LLVMValueRef scaled_size;
      LLVMValueRef flt_size;

      /* scale size by 256 (8 fractional bits) */
      scaled_size = lp_build_shl_imm(&bld->int_size_bld, int_size, 8);

      flt_size = lp_build_int_to_float(&bld->float_size_bld, scaled_size);

      lp_build_unnormalized_coords(bld, flt_size, &s, &t, &r);
   }
   else {
      /* scale coords by 256 (8 fractional bits) */
      s = lp_build_mul_imm(&bld->coord_bld, s, 256);
      if (dims >= 2)
         t = lp_build_mul_imm(&bld->coord_bld, t, 256);
      if (dims >= 3)
         r = lp_build_mul_imm(&bld->coord_bld, r, 256);
   }

   /* convert float to int */
   /* For correct rounding, need round to nearest, not truncation here.
    * Note that in some cases (clamp to edge, no texel offsets) we
    * could use a non-signed build context, which would help archs that
    * don't have an fptosi intrinsic with nearest rounding implemented.
    */
   s = lp_build_iround(&bld->coord_bld, s);
   if (dims >= 2)
      t = lp_build_iround(&bld->coord_bld, t);
   if (dims >= 3)
      r = lp_build_iround(&bld->coord_bld, r);

   /* subtract 0.5 (add -128) */
   i32_c128 = lp_build_const_int_vec(bld->gallivm, i32.type, -128);
   if (!bld->static_sampler_state->force_nearest_s) {
      s = LLVMBuildAdd(builder, s, i32_c128, "");
   }
   if (dims >= 2 && !bld->static_sampler_state->force_nearest_t) {
      t = LLVMBuildAdd(builder, t, i32_c128, "");
   }
   if (dims >= 3) {
      r = LLVMBuildAdd(builder, r, i32_c128, "");
   }
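   /*
    * Worked example of the fixed-point flow (illustrative values): an
    * unnormalized coord of 20.25 texels becomes 5184 in 24.8 fixed point;
    * after the -128 bias and the shift/mask below, ipart = 19 and
    * fpart = 192 (0.75 * 256), i.e. floor(20.25 - 0.5) and its fraction.
    */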

   /* compute floor (shift right 8) */
   i32_c8 = lp_build_const_int_vec(bld->gallivm, i32.type, 8);
   s_ipart = LLVMBuildAShr(builder, s, i32_c8, "");
   if (dims >= 2)
      t_ipart = LLVMBuildAShr(builder, t, i32_c8, "");
   if (dims >= 3)
      r_ipart = LLVMBuildAShr(builder, r, i32_c8, "");

   /* add texel offsets */
   if (offsets[0]) {
      s_ipart = lp_build_add(&i32, s_ipart, offsets[0]);
      if (dims >= 2) {
         t_ipart = lp_build_add(&i32, t_ipart, offsets[1]);
         if (dims >= 3) {
            r_ipart = lp_build_add(&i32, r_ipart, offsets[2]);
         }
      }
   }

   /* compute fractional part (AND with 0xff) */
   i32_c255 = lp_build_const_int_vec(bld->gallivm, i32.type, 255);
   s_fpart = LLVMBuildAnd(builder, s, i32_c255, "");
   if (dims >= 2)
      t_fpart = LLVMBuildAnd(builder, t, i32_c255, "");
   if (dims >= 3)
      r_fpart = LLVMBuildAnd(builder, r, i32_c255, "");

   /* get pixel, row and image strides */
   x_stride = lp_build_const_vec(bld->gallivm, bld->int_coord_bld.type,
                                 bld->format_desc->block.bits/8);
   y_stride = row_stride_vec;
   z_stride = img_stride_vec;

   /* do texcoord wrapping and compute texel offsets */
   lp_build_sample_wrap_linear_int(bld,
                                   bld->format_desc->block.width,
                                   s_ipart, &s_fpart, s_float,
                                   width_vec, x_stride, offsets[0],
                                   bld->static_texture_state->pot_width,
                                   bld->static_sampler_state->wrap_s,
                                   &x_offset0, &x_offset1,
                                   &x_subcoord[0], &x_subcoord[1]);

   /* add potential cube/array/mip offsets now as they are constant per pixel */
   if (bld->static_texture_state->target == PIPE_TEXTURE_CUBE ||
       bld->static_texture_state->target == PIPE_TEXTURE_1D_ARRAY ||
       bld->static_texture_state->target == PIPE_TEXTURE_2D_ARRAY) {
      LLVMValueRef z_offset;
      z_offset = lp_build_mul(&bld->int_coord_bld, r, img_stride_vec);
      /* The r coord is the cube face in [0,5] or array layer */
      x_offset0 = lp_build_add(&bld->int_coord_bld, x_offset0, z_offset);
      x_offset1 = lp_build_add(&bld->int_coord_bld, x_offset1, z_offset);
   }
   if (mipoffsets) {
      x_offset0 = lp_build_add(&bld->int_coord_bld, x_offset0, mipoffsets);
      x_offset1 = lp_build_add(&bld->int_coord_bld, x_offset1, mipoffsets);
   }

   for (z = 0; z < 2; z++) {
      for (y = 0; y < 2; y++) {
         offset[z][y][0] = x_offset0;
         offset[z][y][1] = x_offset1;
      }
   }
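   /*
    * offset[z][y][x] starts out holding just the two x offsets; the y and
    * z offsets are accumulated into it below, so each entry ends up as the
    * full byte offset of one of the (up to) 8 neighbor texels.
    */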

   if (dims >= 2) {
      lp_build_sample_wrap_linear_int(bld,
                                      bld->format_desc->block.height,
                                      t_ipart, &t_fpart, t_float,
                                      height_vec, y_stride, offsets[1],
                                      bld->static_texture_state->pot_height,
                                      bld->static_sampler_state->wrap_t,
                                      &y_offset0, &y_offset1,
                                      &y_subcoord[0], &y_subcoord[1]);

      for (z = 0; z < 2; z++) {
         for (x = 0; x < 2; x++) {
            offset[z][0][x] = lp_build_add(&bld->int_coord_bld,
                                           offset[z][0][x], y_offset0);
            offset[z][1][x] = lp_build_add(&bld->int_coord_bld,
                                           offset[z][1][x], y_offset1);
         }
      }
   }

   if (dims >= 3) {
      lp_build_sample_wrap_linear_int(bld,
                                      1, /* block length (depth) */
                                      r_ipart, &r_fpart, r_float,
                                      depth_vec, z_stride, offsets[2],
                                      bld->static_texture_state->pot_depth,
                                      bld->static_sampler_state->wrap_r,
                                      &z_offset0, &z_offset1,
                                      &z_subcoord[0], &z_subcoord[1]);
      for (y = 0; y < 2; y++) {
         for (x = 0; x < 2; x++) {
            offset[0][y][x] = lp_build_add(&bld->int_coord_bld,
                                           offset[0][y][x], z_offset0);
            offset[1][y][x] = lp_build_add(&bld->int_coord_bld,
                                           offset[1][y][x], z_offset1);
         }
      }
   }

   lp_build_sample_fetch_image_linear(bld, data_ptr, offset,
                                      x_subcoord, y_subcoord,
                                      s_fpart, t_fpart, r_fpart,
                                      colors);
}


/**
 * Sample a single texture image with (bi-)(tri-)linear sampling.
 * Return filtered color as two vectors of 16-bit fixed point values.
 * Does address calcs (except offsets) with floats.
 * Useful for AVX which has support for 8x32 floats but not 8x32 ints.
 */
static void
lp_build_sample_image_linear_afloat(struct lp_build_sample_context *bld,
                                    LLVMValueRef int_size,
                                    LLVMValueRef row_stride_vec,
                                    LLVMValueRef img_stride_vec,
                                    LLVMValueRef data_ptr,
                                    LLVMValueRef mipoffsets,
                                    LLVMValueRef s,
                                    LLVMValueRef t,
                                    LLVMValueRef r,
                                    const LLVMValueRef *offsets,
                                    LLVMValueRef *colors)
{
   const unsigned dims = bld->dims;
   LLVMValueRef width_vec, height_vec, depth_vec;
   LLVMValueRef s_fpart;
   LLVMValueRef t_fpart = NULL;
   LLVMValueRef r_fpart = NULL;
   LLVMValueRef x_stride, y_stride, z_stride;
   LLVMValueRef x_offset0, x_offset1;
   LLVMValueRef y_offset0, y_offset1;
   LLVMValueRef z_offset0, z_offset1;
   LLVMValueRef offset[2][2][2]; /* [z][y][x] */
   LLVMValueRef x_subcoord[2], y_subcoord[2];
   LLVMValueRef flt_size;
   LLVMValueRef x_icoord0, x_icoord1;
   LLVMValueRef y_icoord0, y_icoord1;
   LLVMValueRef z_icoord0, z_icoord1;
   unsigned x, y, z;

   flt_size = lp_build_int_to_float(&bld->float_size_bld, int_size);

   lp_build_extract_image_sizes(bld,
                                &bld->float_size_bld,
                                bld->coord_type,
                                flt_size,
                                &width_vec,
                                &height_vec,
                                &depth_vec);

   /* do texcoord wrapping and compute texel offsets */
   lp_build_sample_wrap_linear_float(bld,
                                     bld->format_desc->block.width,
                                     s, width_vec, offsets[0],
                                     bld->static_texture_state->pot_width,
                                     bld->static_sampler_state->wrap_s,
                                     &x_icoord0, &x_icoord1,
                                     &s_fpart,
                                     bld->static_sampler_state->force_nearest_s);

   if (dims >= 2) {
      lp_build_sample_wrap_linear_float(bld,
                                        bld->format_desc->block.height,
                                        t, height_vec, offsets[1],
                                        bld->static_texture_state->pot_height,
                                        bld->static_sampler_state->wrap_t,
                                        &y_icoord0, &y_icoord1,
                                        &t_fpart,
                                        bld->static_sampler_state->force_nearest_t);

      if (dims >= 3) {
         lp_build_sample_wrap_linear_float(bld,
                                           1, /* block length (depth) */
                                           r, depth_vec, offsets[2],
                                           bld->static_texture_state->pot_depth,
                                           bld->static_sampler_state->wrap_r,
                                           &z_icoord0, &z_icoord1,
                                           &r_fpart, 0);
      }
   }

   /*
    * From here on we deal with ints, and we should split up the 256bit
    * vectors manually for better generated code.
    */

   /* get pixel, row and image strides */
   x_stride = lp_build_const_vec(bld->gallivm,
                                 bld->int_coord_bld.type,
                                 bld->format_desc->block.bits/8);
   y_stride = row_stride_vec;
   z_stride = img_stride_vec;

   /*
    * compute texel offset -
    * cannot do offset calc with floats, difficult for block-based formats,
    * and not enough precision anyway.
    */
   lp_build_sample_partial_offset(&bld->int_coord_bld,
                                  bld->format_desc->block.width,
                                  x_icoord0, x_stride,
                                  &x_offset0, &x_subcoord[0]);
   lp_build_sample_partial_offset(&bld->int_coord_bld,
                                  bld->format_desc->block.width,
                                  x_icoord1, x_stride,
                                  &x_offset1, &x_subcoord[1]);

   /* add potential cube/array/mip offsets now as they are constant per pixel */
   if (bld->static_texture_state->target == PIPE_TEXTURE_CUBE ||
       bld->static_texture_state->target == PIPE_TEXTURE_1D_ARRAY ||
       bld->static_texture_state->target == PIPE_TEXTURE_2D_ARRAY) {
      LLVMValueRef z_offset;
      z_offset = lp_build_mul(&bld->int_coord_bld, r, img_stride_vec);
      /* The r coord is the cube face in [0,5] or array layer */
      x_offset0 = lp_build_add(&bld->int_coord_bld, x_offset0, z_offset);
      x_offset1 = lp_build_add(&bld->int_coord_bld, x_offset1, z_offset);
   }
   if (mipoffsets) {
      x_offset0 = lp_build_add(&bld->int_coord_bld, x_offset0, mipoffsets);
      x_offset1 = lp_build_add(&bld->int_coord_bld, x_offset1, mipoffsets);
   }

   for (z = 0; z < 2; z++) {
      for (y = 0; y < 2; y++) {
         offset[z][y][0] = x_offset0;
         offset[z][y][1] = x_offset1;
      }
   }

   if (dims >= 2) {
      lp_build_sample_partial_offset(&bld->int_coord_bld,
                                     bld->format_desc->block.height,
                                     y_icoord0, y_stride,
                                     &y_offset0, &y_subcoord[0]);
      lp_build_sample_partial_offset(&bld->int_coord_bld,
                                     bld->format_desc->block.height,
                                     y_icoord1, y_stride,
                                     &y_offset1, &y_subcoord[1]);
      for (z = 0; z < 2; z++) {
         for (x = 0; x < 2; x++) {
            offset[z][0][x] = lp_build_add(&bld->int_coord_bld,
                                           offset[z][0][x], y_offset0);
            offset[z][1][x] = lp_build_add(&bld->int_coord_bld,
                                           offset[z][1][x], y_offset1);
         }
      }
   }

   if (dims >= 3) {
      LLVMValueRef z_subcoord[2];
      lp_build_sample_partial_offset(&bld->int_coord_bld,
                                     1,
                                     z_icoord0, z_stride,
                                     &z_offset0, &z_subcoord[0]);
      lp_build_sample_partial_offset(&bld->int_coord_bld,
                                     1,
                                     z_icoord1, z_stride,
                                     &z_offset1, &z_subcoord[1]);
      for (y = 0; y < 2; y++) {
         for (x = 0; x < 2; x++) {
            offset[0][y][x] = lp_build_add(&bld->int_coord_bld,
                                           offset[0][y][x], z_offset0);
            offset[1][y][x] = lp_build_add(&bld->int_coord_bld,
                                           offset[1][y][x], z_offset1);
         }
      }
   }

   lp_build_sample_fetch_image_linear(bld, data_ptr, offset,
                                      x_subcoord, y_subcoord,
                                      s_fpart, t_fpart, r_fpart,
                                      colors);
}


/**
 * Sample the texture/mipmap using the given image filter and mip filter.
 * ilevel0 and ilevel1 select the two mipmap levels to sample from.
 * If we're using nearest miplevel sampling the '1' values will be
 * null/unused.
 */
static void
lp_build_sample_mipmap(struct lp_build_sample_context *bld,
                       unsigned img_filter,
                       unsigned mip_filter,
                       LLVMValueRef s,
                       LLVMValueRef t,
                       LLVMValueRef r,
                       const LLVMValueRef *offsets,
                       LLVMValueRef ilevel0,
                       LLVMValueRef ilevel1,
                       LLVMValueRef lod_fpart,
                       LLVMValueRef colors_var)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef size0;
   LLVMValueRef size1;
   LLVMValueRef row_stride0_vec = NULL;
   LLVMValueRef row_stride1_vec = NULL;
   LLVMValueRef img_stride0_vec = NULL;
   LLVMValueRef img_stride1_vec = NULL;
   LLVMValueRef data_ptr0;
   LLVMValueRef data_ptr1;
   LLVMValueRef mipoff0 = NULL;
   LLVMValueRef mipoff1 = NULL;
   LLVMValueRef colors0;
   LLVMValueRef colors1;

   /* sample the first mipmap level */
   lp_build_mipmap_level_sizes(bld, ilevel0,
                               &size0,
                               &row_stride0_vec, &img_stride0_vec);
   if (bld->num_mips == 1) {
      data_ptr0 = lp_build_get_mipmap_level(bld, ilevel0);
   }
   else {
      /* This path should work for num_lods 1 too but is slightly less efficient */
      data_ptr0 = bld->base_ptr;
      mipoff0 = lp_build_get_mip_offsets(bld, ilevel0);
   }

   if (util_cpu_caps.has_avx && bld->coord_type.length > 4) {
      if (img_filter == PIPE_TEX_FILTER_NEAREST) {
         lp_build_sample_image_nearest_afloat(bld,
                                              size0,
                                              row_stride0_vec, img_stride0_vec,
                                              data_ptr0, mipoff0, s, t, r, offsets,
                                              &colors0);
      }
      else {
         assert(img_filter == PIPE_TEX_FILTER_LINEAR);
         lp_build_sample_image_linear_afloat(bld,
                                             size0,
                                             row_stride0_vec, img_stride0_vec,
                                             data_ptr0, mipoff0, s, t, r, offsets,
                                             &colors0);
      }
   }
   else {
      if (img_filter == PIPE_TEX_FILTER_NEAREST) {
         lp_build_sample_image_nearest(bld,
                                       size0,
                                       row_stride0_vec, img_stride0_vec,
                                       data_ptr0, mipoff0, s, t, r, offsets,
                                       &colors0);
      }
      else {
         assert(img_filter == PIPE_TEX_FILTER_LINEAR);
         lp_build_sample_image_linear(bld,
                                      size0,
                                      row_stride0_vec, img_stride0_vec,
                                      data_ptr0, mipoff0, s, t, r, offsets,
                                      &colors0);
      }
   }

   /* Store the first level's colors in the output variables */
   LLVMBuildStore(builder, colors0, colors_var);

   if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
      LLVMValueRef h16vec_scale = lp_build_const_vec(bld->gallivm,
                                                     bld->lodf_bld.type, 256.0);
      LLVMTypeRef i32vec_type = bld->lodi_bld.vec_type;
      struct lp_build_if_state if_ctx;
      LLVMValueRef need_lerp;
      unsigned num_quads = bld->coord_bld.type.length / 4;
      unsigned i;

      lod_fpart = LLVMBuildFMul(builder, lod_fpart, h16vec_scale, "");
      lod_fpart = LLVMBuildFPToSI(builder, lod_fpart, i32vec_type, "lod_fpart.fixed16");
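      /*
       * lod_fpart is now the fractional lod scaled to 0..255, i.e. the
       * same 8-bit fixed-point weight format the packed u8 lerp below
       * expects.
       */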

      /* need_lerp = lod_fpart > 0 */
      if (bld->num_lods == 1) {
         need_lerp = LLVMBuildICmp(builder, LLVMIntSGT,
                                   lod_fpart, bld->lodi_bld.zero,
                                   "need_lerp");
      }
      else {
         /*
          * We'll do mip filtering if any of the quads need it.
          * It might be better to split the vectors here and only fetch/filter
          * quads which need it.
          */
         /*
          * We need to clamp lod_fpart here since we can get negative
          * values which would screw up filtering if not all
          * lod_fpart values have the same sign.
          * We can, however, then skip the greater-than comparison.
          */
         lod_fpart = lp_build_max(&bld->lodi_bld, lod_fpart,
                                  bld->lodi_bld.zero);
         need_lerp = lp_build_any_true_range(&bld->lodi_bld, bld->num_lods, lod_fpart);
      }

      lp_build_if(&if_ctx, bld->gallivm, need_lerp);
      {
         struct lp_build_context u8n_bld;

         lp_build_context_init(&u8n_bld, bld->gallivm, lp_type_unorm(8, bld->vector_width));

         /* sample the second mipmap level */
         lp_build_mipmap_level_sizes(bld, ilevel1,
                                     &size1,
                                     &row_stride1_vec, &img_stride1_vec);
         if (bld->num_mips == 1) {
            data_ptr1 = lp_build_get_mipmap_level(bld, ilevel1);
         }
         else {
            data_ptr1 = bld->base_ptr;
            mipoff1 = lp_build_get_mip_offsets(bld, ilevel1);
         }

         if (util_cpu_caps.has_avx && bld->coord_type.length > 4) {
            if (img_filter == PIPE_TEX_FILTER_NEAREST) {
               lp_build_sample_image_nearest_afloat(bld,
                                                    size1,
                                                    row_stride1_vec, img_stride1_vec,
                                                    data_ptr1, mipoff1, s, t, r, offsets,
                                                    &colors1);
            }
            else {
               lp_build_sample_image_linear_afloat(bld,
                                                   size1,
                                                   row_stride1_vec, img_stride1_vec,
                                                   data_ptr1, mipoff1, s, t, r, offsets,
                                                   &colors1);
            }
         }
         else {
            if (img_filter == PIPE_TEX_FILTER_NEAREST) {
               lp_build_sample_image_nearest(bld,
                                             size1,
                                             row_stride1_vec, img_stride1_vec,
                                             data_ptr1, mipoff1, s, t, r, offsets,
                                             &colors1);
            }
            else {
               lp_build_sample_image_linear(bld,
                                            size1,
                                            row_stride1_vec, img_stride1_vec,
                                            data_ptr1, mipoff1, s, t, r, offsets,
                                            &colors1);
            }
         }

         /* interpolate samples from the two mipmap levels */

         if (num_quads == 1 && bld->num_lods == 1) {
            lod_fpart = LLVMBuildTrunc(builder, lod_fpart, u8n_bld.elem_type, "");
            lod_fpart = lp_build_broadcast_scalar(&u8n_bld, lod_fpart);
         }
         else {
            unsigned num_chans_per_lod = 4 * bld->coord_type.length / bld->num_lods;
            LLVMTypeRef tmp_vec_type = LLVMVectorType(u8n_bld.elem_type, bld->lodi_bld.type.length);
            LLVMValueRef shuffle[LP_MAX_VECTOR_LENGTH];

            /* Take the LSB of lod_fpart */
            lod_fpart = LLVMBuildTrunc(builder, lod_fpart, tmp_vec_type, "");

            /* Broadcast each lod weight into their respective channels */
            for (i = 0; i < u8n_bld.type.length; ++i) {
               shuffle[i] = lp_build_const_int32(bld->gallivm, i / num_chans_per_lod);
            }
            lod_fpart = LLVMBuildShuffleVector(builder, lod_fpart, LLVMGetUndef(tmp_vec_type),
                                               LLVMConstVector(shuffle, u8n_bld.type.length), "");
         }
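         /*
          * After the broadcast every byte belonging to a given lod carries
          * that lod's 8-bit weight (num_chans_per_lod bytes per lod), so a
          * single packed u8 lerp can blend all lods at once.
          */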

         colors0 = lp_build_lerp(&u8n_bld, lod_fpart,
                                 colors0, colors1,
                                 LP_BLD_LERP_PRESCALED_WEIGHTS);

         LLVMBuildStore(builder, colors0, colors_var);
      }
      lp_build_endif(&if_ctx);
   }
}


/**
 * Texture sampling in AoS format. Used when sampling common 32-bit/texel
 * formats. 1D/2D/3D/cube textures are supported, as are all mipmap
 * sampling modes, but only a limited set of texture coord wrap modes.
 */
void
lp_build_sample_aos(struct lp_build_sample_context *bld,
                    unsigned sampler_unit,
                    LLVMValueRef s,
                    LLVMValueRef t,
                    LLVMValueRef r,
                    const LLVMValueRef *offsets,
                    LLVMValueRef lod_positive,
                    LLVMValueRef lod_fpart,
                    LLVMValueRef ilevel0,
                    LLVMValueRef ilevel1,
                    LLVMValueRef texel_out[4])
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const unsigned mip_filter = bld->static_sampler_state->min_mip_filter;
   const unsigned min_filter = bld->static_sampler_state->min_img_filter;
   const unsigned mag_filter = bld->static_sampler_state->mag_img_filter;
   const unsigned dims = bld->dims;
   LLVMValueRef packed_var, packed;
   LLVMValueRef unswizzled[4];
   struct lp_build_context u8n_bld;

   /* we only support the common/simple wrap modes at this time */
   assert(lp_is_simple_wrap_mode(bld->static_sampler_state->wrap_s));
   if (dims >= 2)
      assert(lp_is_simple_wrap_mode(bld->static_sampler_state->wrap_t));
   if (dims >= 3)
      assert(lp_is_simple_wrap_mode(bld->static_sampler_state->wrap_r));


   /* make 8-bit unorm builder context */
   lp_build_context_init(&u8n_bld, bld->gallivm, lp_type_unorm(8, bld->vector_width));

   /*
    * Get/interpolate texture colors.
    */

   packed_var = lp_build_alloca(bld->gallivm, u8n_bld.vec_type, "packed_var");
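   /*
    * The filtered colors are written through this stack variable because
    * they are produced inside the if/else control flow below; LLVM's
    * mem2reg pass can later promote it back to SSA values.
    */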

   if (min_filter == mag_filter) {
      /* no need to distinguish between minification and magnification */
      lp_build_sample_mipmap(bld,
                             min_filter, mip_filter,
                             s, t, r, offsets,
                             ilevel0, ilevel1, lod_fpart,
                             packed_var);
   }
   else {
      /* Emit conditional to choose min image filter or mag image filter
       * depending on the lod being > 0 or <= 0, respectively.
       */
      struct lp_build_if_state if_ctx;

      /*
       * FIXME: this should take all lods into account; if some are min
       * and some are max, we could probably hack up the weights in the
       * linear path with selects to make it work for nearest too.
       */
      if (bld->num_lods > 1)
         lod_positive = LLVMBuildExtractElement(builder, lod_positive,
                                                lp_build_const_int32(bld->gallivm, 0), "");

      lod_positive = LLVMBuildTrunc(builder, lod_positive,
                                    LLVMInt1TypeInContext(bld->gallivm->context), "");

      lp_build_if(&if_ctx, bld->gallivm, lod_positive);
      {
         /* Use the minification filter */
         lp_build_sample_mipmap(bld,
                                min_filter, mip_filter,
                                s, t, r, offsets,
                                ilevel0, ilevel1, lod_fpart,
                                packed_var);
      }
      lp_build_else(&if_ctx);
      {
         /* Use the magnification filter */
         lp_build_sample_mipmap(bld,
                                mag_filter, PIPE_TEX_MIPFILTER_NONE,
                                s, t, r, offsets,
                                ilevel0, NULL, NULL,
                                packed_var);
      }
      lp_build_endif(&if_ctx);
   }

   packed = LLVMBuildLoad(builder, packed_var, "");

   /*
    * Convert to SoA and swizzle.
    */
   lp_build_rgba8_to_fi32_soa(bld->gallivm,
                              bld->texel_type,
                              packed, unswizzled);

   if (util_format_is_rgba8_variant(bld->format_desc)) {
      lp_build_format_swizzle_soa(bld->format_desc,
                                  &bld->texel_bld,
                                  unswizzled, texel_out);
   }
   else {
      texel_out[0] = unswizzled[0];
      texel_out[1] = unswizzled[1];
      texel_out[2] = unswizzled[2];
      texel_out[3] = unswizzled[3];
   }
}