src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c

   1 /**************************************************************************
   2  *
   3  * Copyright 2009 VMware, Inc.
   4  * All Rights Reserved.
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a
   7  * copy of this software and associated documentation files (the
   8  * "Software"), to deal in the Software without restriction, including
   9  * without limitation the rights to use, copy, modify, merge, publish,
  10  * distribute, sub license, and/or sell copies of the Software, and to
  11  * permit persons to whom the Software is furnished to do so, subject to
  12  * the following conditions:
  13  *
  14  * The above copyright notice and this permission notice (including the
  15  * next paragraph) shall be included in all copies or substantial portions
  16  * of the Software.
  17  *
  18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  19  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
  21  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
  22  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  23  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  24  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25  *
  26  **************************************************************************/
  27
  28 /**
  29  * @file
  30  * Texture sampling -- SoA.
  31  *
  32  * @author Jose Fonseca <jfonseca@vmware.com>
  33  * @author Brian Paul <brianp@vmware.com>
  34  */
  35
  36 #include "pipe/p_defines.h"
  37 #include "pipe/p_state.h"
  38 #include "pipe/p_shader_tokens.h"
  39 #include "util/u_debug.h"
  40 #include "util/u_dump.h"
  41 #include "util/u_memory.h"
  42 #include "util/u_math.h"
  43 #include "util/format/u_format.h"
  44 #include "util/u_cpu_detect.h"
  45 #include "util/format_rgb9e5.h"
  46 #include "lp_bld_debug.h"
  47 #include "lp_bld_type.h"
  48 #include "lp_bld_const.h"
  49 #include "lp_bld_conv.h"
  50 #include "lp_bld_arit.h"
  51 #include "lp_bld_bitarit.h"
  52 #include "lp_bld_logic.h"
  53 #include "lp_bld_printf.h"
  54 #include "lp_bld_swizzle.h"
  55 #include "lp_bld_flow.h"
  56 #include "lp_bld_gather.h"
  57 #include "lp_bld_format.h"
  58 #include "lp_bld_sample.h"
  59 #include "lp_bld_sample_aos.h"
  60 #include "lp_bld_struct.h"
  61 #include "lp_bld_quad.h"
  62 #include "lp_bld_pack.h"
  63 #include "lp_bld_intr.h"
  64 #include "lp_bld_misc.h"
  65
  66
  67 /**
  68  * Generate code to fetch a texel from a texture at int coords (x, y, z).
  69  * The computation depends on whether the texture is 1D, 2D or 3D.
  70  * The result, texel, will be float vectors:
  71  *   texel[0] = red values
  72  *   texel[1] = green values
  73  *   texel[2] = blue values
  74  *   texel[3] = alpha values
  75  */
  76 static void
  77 lp_build_sample_texel_soa(struct lp_build_sample_context *bld,
  78                           LLVMValueRef width,
  79                           LLVMValueRef height,
  80                           LLVMValueRef depth,
  81                           LLVMValueRef x,
  82                           LLVMValueRef y,
  83                           LLVMValueRef z,
  84                           LLVMValueRef y_stride,
  85                           LLVMValueRef z_stride,
  86                           LLVMValueRef data_ptr,
  87                           LLVMValueRef mipoffsets,
  88                           LLVMValueRef texel_out[4])
  89 {
  90    const struct lp_static_sampler_state *static_state = bld->static_sampler_state;
  91    const unsigned dims = bld->dims;
  92    struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
  93    LLVMBuilderRef builder = bld->gallivm->builder;
  94    LLVMValueRef offset;
  95    LLVMValueRef i, j;
  96    LLVMValueRef use_border = NULL;
  97
  98    /* use_border = x < 0 || x >= width || y < 0 || y >= height */
  99    if (lp_sampler_wrap_mode_uses_border_color(static_state->wrap_s,
 100                                               static_state->min_img_filter,
 101                                               static_state->mag_img_filter)) {
 102       LLVMValueRef b1, b2;
 103       b1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, x, int_coord_bld->zero);
 104       b2 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, x, width);
 105       use_border = LLVMBuildOr(builder, b1, b2, "b1_or_b2");
 106    }
 107
 108    if (dims >= 2 &&
 109        lp_sampler_wrap_mode_uses_border_color(static_state->wrap_t,
 110                                               static_state->min_img_filter,
 111                                               static_state->mag_img_filter)) {
 112       LLVMValueRef b1, b2;
 113       b1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, y, int_coord_bld->zero);
 114       b2 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, y, height);
 115       if (use_border) {
 116          use_border = LLVMBuildOr(builder, use_border, b1, "ub_or_b1");
 117          use_border = LLVMBuildOr(builder, use_border, b2, "ub_or_b2");
 118       }
 119       else {
 120          use_border = LLVMBuildOr(builder, b1, b2, "b1_or_b2");
 121       }
 122    }
 123
 124    if (dims == 3 &&
 125        lp_sampler_wrap_mode_uses_border_color(static_state->wrap_r,
 126                                               static_state->min_img_filter,
 127                                               static_state->mag_img_filter)) {
 128       LLVMValueRef b1, b2;
 129       b1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, z, int_coord_bld->zero);
 130       b2 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, z, depth);
 131       if (use_border) {
 132          use_border = LLVMBuildOr(builder, use_border, b1, "ub_or_b1");
 133          use_border = LLVMBuildOr(builder, use_border, b2, "ub_or_b2");
 134       }
 135       else {
 136          use_border = LLVMBuildOr(builder, b1, b2, "b1_or_b2");
 137       }
 138    }
 139
 140    /* convert x,y,z coords to linear offset from start of texture, in bytes */
 141    lp_build_sample_offset(&bld->int_coord_bld,
 142                           bld->format_desc,
 143                           x, y, z, y_stride, z_stride,
 144                           &offset, &i, &j);
 145    if (mipoffsets) {
 146       offset = lp_build_add(&bld->int_coord_bld, offset, mipoffsets);
 147    }
 148
 149    if (use_border) {
 150       /* If we can sample the border color, it means that texcoords may
 151        * lie outside the bounds of the texture image.  We need to do
 152        * something to prevent reading out of bounds and causing a segfault.
 153        *
 154        * Simply AND the texture coords with !use_border.  This will cause
 155        * coords which are out of bounds to become zero.  Zero's guaranteed
 156        * to be inside the texture image.
 157        */
 158       offset = lp_build_andnot(&bld->int_coord_bld, offset, use_border);
 159    }
 160
 161    lp_build_fetch_rgba_soa(bld->gallivm,
 162                            bld->format_desc,
 163                            bld->texel_type, TRUE,
 164                            data_ptr, offset,
 165                            i, j,
 166                            bld->cache,
 167                            texel_out);
 168
 169    /*
 170     * Note: if we find an app which frequently samples the texture border
 171     * we might want to implement a true conditional here to avoid sampling
 172     * the texture whenever possible (since that's quite a bit of code).
 173     * Ex:
 174     *   if (use_border) {
 175     *      texel = border_color;
 176     *   }
 177     *   else {
 178     *      texel = sample_texture(coord);
 179     *   }
 180     * As it is now, we always sample the texture, then selectively replace
 181     * the texel color results with the border color.
 182     */
 183
 184    if (use_border) {
 185       /* select texel color or border color depending on use_border. */
 186       const struct util_format_description *format_desc = bld->format_desc;
 187       int chan;
 188       struct lp_type border_type = bld->texel_type;
 189       border_type.length = 4;
 190       /*
 191        * Only replace channels which are actually present. The others should
 192        * get optimized away eventually by sampler_view swizzle anyway but it's
 193        * easier too.
 194        */
 195       for (chan = 0; chan < 4; chan++) {
 196          unsigned chan_s;
 197          /* reverse-map channel... */
 198          if (util_format_has_stencil(format_desc)) {
 199             if (chan == 0)
 200                chan_s = 0;
 201             else
 202                break;
 203          }
 204          else {
 205             for (chan_s = 0; chan_s < 4; chan_s++) {
 206                if (chan_s == format_desc->swizzle[chan]) {
 207                   break;
 208                }
 209             }
 210          }
 211          if (chan_s <= 3) {
 212             /* use the already clamped color */
 213             LLVMValueRef idx = lp_build_const_int32(bld->gallivm, chan);
 214             LLVMValueRef border_chan;
 215
 216             border_chan = lp_build_extract_broadcast(bld->gallivm,
 217                                                      border_type,
 218                                                      bld->texel_type,
 219                                                      bld->border_color_clamped,
 220                                                      idx);
 221             texel_out[chan] = lp_build_select(&bld->texel_bld, use_border,
 222                                               border_chan, texel_out[chan]);
 223          }
 224       }
 225    }
 226 }
 227
 228
 229 /**
 230  * Helper to compute the mirror function for the PIPE_WRAP_MIRROR_REPEAT mode.
 231  * (Note that with pot sizes could do this much more easily post-scale
 232  * with some bit arithmetic.)
 233  */
 234 static LLVMValueRef
 235 lp_build_coord_mirror(struct lp_build_sample_context *bld,
 236                       LLVMValueRef coord, boolean posOnly)
 237 {
 238    struct lp_build_context *coord_bld = &bld->coord_bld;
 239    LLVMValueRef fract;
 240    LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5);
 241
 242    /*
 243     * We can just use 2*(x - round(0.5*x)) to do all the mirroring,
 244     * it all works out. (The result is in range [-1, 1.0], negative if
 245     * the coord is in the "odd" section, otherwise positive.)
 246     */
 247
 248    coord = lp_build_mul(coord_bld, coord, half);
 249    fract = lp_build_round(coord_bld, coord);
 250    fract = lp_build_sub(coord_bld, coord, fract);
 251    coord = lp_build_add(coord_bld, fract, fract);
 252
 253    if (posOnly) {
 254       /*
 255        * Theoretically it's not quite 100% accurate because the spec says
 256        * that ultimately a scaled coord of -x.0 should map to int coord
 257        * -x + 1 with mirroring, not -x (this does not matter for bilinear
 258        * filtering).
 259        */
 260       coord = lp_build_abs(coord_bld, coord);
 261       /* kill off NaNs */
 262       /* XXX: not safe without arch rounding, fract can be anything. */
 263       coord = lp_build_max_ext(coord_bld, coord, coord_bld->zero,
 264                                GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
 265    }
 266
 267    return coord;
 268 }
 269
 270
 271 /**
 272  * Helper to compute the first coord and the weight for
 273  * linear wrap repeat npot textures
 274  */
 275 void
 276 lp_build_coord_repeat_npot_linear(struct lp_build_sample_context *bld,
 277                                   LLVMValueRef coord_f,
 278                                   LLVMValueRef length_i,
 279                                   LLVMValueRef length_f,
 280                                   LLVMValueRef *coord0_i,
 281                                   LLVMValueRef *weight_f)
 282 {
 283    struct lp_build_context *coord_bld = &bld->coord_bld;
 284    struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
 285    LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5);
 286    LLVMValueRef length_minus_one = lp_build_sub(int_coord_bld, length_i,
 287                                                 int_coord_bld->one);
 288    LLVMValueRef mask;
 289    /* wrap with normalized floats is just fract */
 290    coord_f = lp_build_fract(coord_bld, coord_f);
 291    /* mul by size and subtract 0.5 */
 292    coord_f = lp_build_mul(coord_bld, coord_f, length_f);
 293    coord_f = lp_build_sub(coord_bld, coord_f, half);
 294    /*
 295     * we avoided the 0.5/length division before the repeat wrap,
 296     * now need to fix up edge cases with selects
 297     */
 298    /*
 299     * Note we do a float (unordered) compare so we can eliminate NaNs.
 300     * (Otherwise would need fract_safe above).
 301     */
 302    mask = lp_build_compare(coord_bld->gallivm, coord_bld->type,
 303                            PIPE_FUNC_LESS, coord_f, coord_bld->zero);
 304
 305    /* convert to int, compute lerp weight */
 306    lp_build_ifloor_fract(coord_bld, coord_f, coord0_i, weight_f);
 307    *coord0_i = lp_build_select(int_coord_bld, mask, length_minus_one, *coord0_i);
 308 }
 309
 310
 311 /**
 312  * Build LLVM code for texture wrap mode for linear filtering.
 313  * \param x0_out  returns first integer texcoord
 314  * \param x1_out  returns second integer texcoord
 315  * \param weight_out  returns linear interpolation weight
 316  */
 317 static void
 318 lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
 319                             boolean is_gather,
 320                             LLVMValueRef coord,
 321                             LLVMValueRef length,
 322                             LLVMValueRef length_f,
 323                             LLVMValueRef offset,
 324                             boolean is_pot,
 325                             unsigned wrap_mode,
 326                             LLVMValueRef *x0_out,
 327                             LLVMValueRef *x1_out,
 328                             LLVMValueRef *weight_out)
 329 {
 330    struct lp_build_context *coord_bld = &bld->coord_bld;
 331    struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
 332    LLVMBuilderRef builder = bld->gallivm->builder;
 333    LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5);
 334    LLVMValueRef length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);
 335    LLVMValueRef coord0, coord1, weight;
 336
 337    switch(wrap_mode) {
 338    case PIPE_TEX_WRAP_REPEAT:
 339       if (is_pot) {
 340          /* mul by size and subtract 0.5 */
 341          coord = lp_build_mul(coord_bld, coord, length_f);
 342          coord = lp_build_sub(coord_bld, coord, half);
 343          if (offset) {
 344             offset = lp_build_int_to_float(coord_bld, offset);
 345             coord = lp_build_add(coord_bld, coord, offset);
 346          }
 347          /* convert to int, compute lerp weight */
 348          lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
 349          coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
 350          /* repeat wrap */
 351          coord0 = LLVMBuildAnd(builder, coord0, length_minus_one, "");
 352          coord1 = LLVMBuildAnd(builder, coord1, length_minus_one, "");
 353       }
 354       else {
 355          LLVMValueRef mask;
 356          if (offset) {
 357             offset = lp_build_int_to_float(coord_bld, offset);
 358             offset = lp_build_div(coord_bld, offset, length_f);
 359             coord = lp_build_add(coord_bld, coord, offset);
 360          }
 361          lp_build_coord_repeat_npot_linear(bld, coord,
 362                                            length, length_f,
 363                                            &coord0, &weight);
 364          mask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type,
 365                                  PIPE_FUNC_NOTEQUAL, coord0, length_minus_one);
 366          coord1 = LLVMBuildAnd(builder,
 367                                lp_build_add(int_coord_bld, coord0, int_coord_bld->one),
 368                                mask, "");
 369       }
 370       break;
 371
 372    case PIPE_TEX_WRAP_CLAMP:
 373       if (bld->static_sampler_state->normalized_coords) {
 374          /* scale coord to length */
 375          coord = lp_build_mul(coord_bld, coord, length_f);
 376       }
 377       if (offset) {
 378          offset = lp_build_int_to_float(coord_bld, offset);
 379          coord = lp_build_add(coord_bld, coord, offset);
 380       }
 381
 382       /*
 383        * clamp to [0, length]
 384        *
 385        * Unlike some other wrap modes, this should be correct for gather
 386        * too. GL_CLAMP explicitly does this clamp on the coord prior to
 387        * actual wrapping (which is per sample).
 388        */
 389       coord = lp_build_clamp(coord_bld, coord, coord_bld->zero, length_f);
 390
 391       coord = lp_build_sub(coord_bld, coord, half);
 392
 393       /* convert to int, compute lerp weight */
 394       lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
 395       coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
 396       break;
 397
 398    case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
 399       {
 400          struct lp_build_context abs_coord_bld = bld->coord_bld;
 401          abs_coord_bld.type.sign = FALSE;
 402
 403          if (bld->static_sampler_state->normalized_coords) {
 404             /* mul by tex size */
 405             coord = lp_build_mul(coord_bld, coord, length_f);
 406          }
 407          if (offset) {
 408             offset = lp_build_int_to_float(coord_bld, offset);
 409             coord = lp_build_add(coord_bld, coord, offset);
 410          }
 411
 412          /* clamp to length max */
 413          coord = lp_build_min_ext(coord_bld, coord, length_f,
 414                                   GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
 415          if (!is_gather) {
 416             /* subtract 0.5 */
 417             coord = lp_build_sub(coord_bld, coord, half);
 418             /* clamp to [0, length - 0.5] */
 419             coord = lp_build_max(coord_bld, coord, coord_bld->zero);
 420             /* convert to int, compute lerp weight */
 421             lp_build_ifloor_fract(&abs_coord_bld, coord, &coord0, &weight);
 422             coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
 423          } else {
 424             /*
 425              * The non-gather path will end up with coords 0, 1 if coord was
 426              * smaller than 0.5 (with corresponding weight 0.0 so it doesn't
 427              * really matter what the second coord is). But for gather, we
 428              * really need to end up with coords 0, 0.
 429              */
 430             coord = lp_build_max(coord_bld, coord, coord_bld->zero);
 431             coord0 = lp_build_sub(coord_bld, coord, half);
 432             coord1 = lp_build_add(coord_bld, coord, half);
 433             /* Values range ([-0.5, length_f - 0.5], [0.5, length_f + 0.5] */
 434             coord0 = lp_build_itrunc(coord_bld, coord0);
 435             coord1 = lp_build_itrunc(coord_bld, coord1);
 436             weight = coord_bld->undef;
 437          }
 438          /* coord1 = min(coord1, length-1) */
 439          coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
 440          break;
 441       }
 442
 443    case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
 444       if (bld->static_sampler_state->normalized_coords) {
 445          /* scale coord to length */
 446          coord = lp_build_mul(coord_bld, coord, length_f);
 447       }
 448       if (offset) {
 449          offset = lp_build_int_to_float(coord_bld, offset);
 450          coord = lp_build_add(coord_bld, coord, offset);
 451       }
 452       /*
 453        * We don't need any clamp. Technically, for very large (pos or neg)
 454        * (or infinite) values, clamp against [-length, length] would be
 455        * correct, but we don't need to guarantee any specific
 456        * result for such coords (the ifloor will be undefined, but for modes
 457        * requiring border all resulting coords are safe).
 458        */
 459       coord = lp_build_sub(coord_bld, coord, half);
 460       /* convert to int, compute lerp weight */
 461       lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
 462       coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
 463       break;
 464
 465    case PIPE_TEX_WRAP_MIRROR_REPEAT:
 466       if (offset) {
 467          offset = lp_build_int_to_float(coord_bld, offset);
 468          offset = lp_build_div(coord_bld, offset, length_f);
 469          coord = lp_build_add(coord_bld, coord, offset);
 470       }
 471       if (!is_gather) {
 472          /* compute mirror function */
 473          coord = lp_build_coord_mirror(bld, coord, TRUE);
 474
 475          /* scale coord to length */
 476          coord = lp_build_mul(coord_bld, coord, length_f);
 477          coord = lp_build_sub(coord_bld, coord, half);
 478
 479          /* convert to int, compute lerp weight */
 480          lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
 481          coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
 482
 483          /* coord0 = max(coord0, 0) */
 484          coord0 = lp_build_max(int_coord_bld, coord0, int_coord_bld->zero);
 485          /* coord1 = min(coord1, length-1) */
 486          coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
 487       } else {
 488          /*
 489           * This is pretty reasonable in the end,  all what the tests care
 490           * about is nasty edge cases (scaled coords x.5, so the individual
 491           * coords are actually integers, which is REALLY tricky to get right
 492           * due to this working differently both for negative numbers as well
 493           * as for even/odd cases). But with enough magic it's not too complex
 494           * after all.
 495           * Maybe should try a bit arithmetic one though for POT textures...
 496           */
 497          LLVMValueRef isNeg;
 498          /*
 499           * Wrapping just once still works, even though it means we can
 500           * get "wrong" sign due to performing mirror in the middle of the
 501           * two coords (because this can only happen very near the odd/even
 502           * edges, so both coords will actually end up as 0 or length - 1
 503           * in the end).
 504           * For GL4 gather with per-sample offsets we'd need to the mirroring
 505           * per coord too.
 506           */
 507          coord = lp_build_coord_mirror(bld, coord, FALSE);
 508          coord = lp_build_mul(coord_bld, coord, length_f);
 509
 510          /*
 511           * NaNs should be safe here, we'll do away with them with
 512           * the ones' complement plus min.
 513           */
 514          coord0 = lp_build_sub(coord_bld, coord, half);
 515          coord0 = lp_build_ifloor(coord_bld, coord0);
 516          coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
 517          /* ones complement for neg numbers (mirror(negX) = X - 1)  */
 518          isNeg = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS,
 519                               coord0, int_coord_bld->zero);
 520          coord0 = lp_build_xor(int_coord_bld, coord0, isNeg);
 521          isNeg = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS,
 522                               coord1, int_coord_bld->zero);
 523          coord1 = lp_build_xor(int_coord_bld, coord1, isNeg);
 524          coord0 = lp_build_min(int_coord_bld, coord0, length_minus_one);
 525          coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
 526
 527          weight = coord_bld->undef;
 528       }
 529       break;
 530
 531    case PIPE_TEX_WRAP_MIRROR_CLAMP:
 532       if (bld->static_sampler_state->normalized_coords) {
 533          /* scale coord to length */
 534          coord = lp_build_mul(coord_bld, coord, length_f);
 535       }
 536       if (offset) {
 537          offset = lp_build_int_to_float(coord_bld, offset);
 538          coord = lp_build_add(coord_bld, coord, offset);
 539       }
 540       /*
 541        * XXX: probably not correct for gather, albeit I'm not
 542        * entirely sure as it's poorly specified. The wrapping looks
 543        * correct according to the spec which is against gl 1.2.1,
 544        * however negative values will be swapped - gl re-specified
 545        * wrapping with newer versions (no more pre-clamp except with
 546        * GL_CLAMP).
 547        */
 548       coord = lp_build_abs(coord_bld, coord);
 549
 550       /* clamp to [0, length] */
 551       coord = lp_build_min_ext(coord_bld, coord, length_f,
 552                                GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
 553
 554       coord = lp_build_sub(coord_bld, coord, half);
 555
 556       /* convert to int, compute lerp weight */
 557       lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
 558       coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
 559       break;
 560
 561    case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
 562       {
 563          struct lp_build_context abs_coord_bld = bld->coord_bld;
 564          abs_coord_bld.type.sign = FALSE;
 565
 566          if (bld->static_sampler_state->normalized_coords) {
 567             /* scale coord to length */
 568             coord = lp_build_mul(coord_bld, coord, length_f);
 569          }
 570          if (offset) {
 571             offset = lp_build_int_to_float(coord_bld, offset);
 572             coord = lp_build_add(coord_bld, coord, offset);
 573          }
 574          if (!is_gather) {
 575             coord = lp_build_abs(coord_bld, coord);
 576
 577             /* clamp to length max */
 578             coord = lp_build_min_ext(coord_bld, coord, length_f,
 579                                      GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
 580             /* subtract 0.5 */
 581             coord = lp_build_sub(coord_bld, coord, half);
 582             /* clamp to [0, length - 0.5] */
 583             coord = lp_build_max(coord_bld, coord, coord_bld->zero);
 584
 585             /* convert to int, compute lerp weight */
 586             lp_build_ifloor_fract(&abs_coord_bld, coord, &coord0, &weight);
 587             coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
 588             /* coord1 = min(coord1, length-1) */
 589             coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
 590          } else {
 591             /*
 592              * The non-gather path will swap coord0/1 if coord was negative,
 593              * which is ok for filtering since the filter weight matches
 594              * accordingly. Also, if coord is close to zero, coord0/1 will
 595              * be 0 and 1, instead of 0 and 0 (again ok due to filter
 596              * weight being 0.0). Both issues need to be fixed for gather.
 597              */
 598             LLVMValueRef isNeg;
 599
 600             /*
 601              * Actually wanted to cheat here and use:
 602              * coord1 = lp_build_iround(coord_bld, coord);
 603              * but it's not good enough for some tests (even piglit
 604              * textureGather is set up in a way so the coords area always
 605              * .5, that is right at the crossover points).
 606              * So do ordinary sub/floor, then do ones' complement
 607              * for negative numbers.
 608              * (Note can't just do sub|add/abs/itrunc per coord neither -
 609              * because the spec demands that mirror(3.0) = 3 but
 610              * mirror(-3.0) = 2.)
 611              */
 612             coord = lp_build_sub(coord_bld, coord, half);
 613             coord0 = lp_build_ifloor(coord_bld, coord);
 614             coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
 615             isNeg = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, coord0,
 616                                  int_coord_bld->zero);
 617             coord0 = lp_build_xor(int_coord_bld, isNeg, coord0);
 618             coord0 = lp_build_min(int_coord_bld, coord0, length_minus_one);
 619
 620             isNeg = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, coord1,
 621                                  int_coord_bld->zero);
 622             coord1 = lp_build_xor(int_coord_bld, isNeg, coord1);
 623             coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
 624
 625             weight = coord_bld->undef;
 626          }
 627       }
 628       break;
 629
 630    case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
 631       {
 632          if (bld->static_sampler_state->normalized_coords) {
 633             /* scale coord to length */
 634             coord = lp_build_mul(coord_bld, coord, length_f);
 635          }
 636          if (offset) {
 637             offset = lp_build_int_to_float(coord_bld, offset);
 638             coord = lp_build_add(coord_bld, coord, offset);
 639          }
 640          /*
 641           * XXX: probably not correct for gather due to swapped
 642           * order if coord is negative (same rationale as for
 643           * MIRROR_CLAMP).
 644           */
 645          coord = lp_build_abs(coord_bld, coord);
 646
 647          /*
 648           * We don't need any clamp. Technically, for very large
 649           * (or infinite) values, clamp against length would be
 650           * correct, but we don't need to guarantee any specific
 651           * result for such coords (the ifloor will be undefined, but
 652           * for modes requiring border all resulting coords are safe).
 653           */
 654          coord = lp_build_sub(coord_bld, coord, half);
 655
 656          /* convert to int, compute lerp weight */
 657          lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
 658          coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
 659       }
 660       break;
 661
 662    default:
 663       assert(0);
 664       coord0 = NULL;
 665       coord1 = NULL;
 666       weight = NULL;
 667    }
 668
 669    *x0_out = coord0;
 670    *x1_out = coord1;
 671    *weight_out = weight;
 672 }
 673
 674
 675 /**
 676  * Build LLVM code for texture wrap mode for nearest filtering.
 677  * \param coord  the incoming texcoord (nominally in [0,1])
 678  * \param length  the texture size along one dimension, as int vector
 679  * \param length_f  the texture size along one dimension, as float vector
 680  * \param offset  texel offset along one dimension (as int vector)
 681  * \param is_pot  if TRUE, length is a power of two
 682  * \param wrap_mode  one of PIPE_TEX_WRAP_x
 683  */
 684 static LLVMValueRef
 685 lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld,
 686                              LLVMValueRef coord,
 687                              LLVMValueRef length,
 688                              LLVMValueRef length_f,
 689                              LLVMValueRef offset,
 690                              boolean is_pot,
 691                              unsigned wrap_mode)
 692 {
 693    struct lp_build_context *coord_bld = &bld->coord_bld;
 694    struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
 695    LLVMBuilderRef builder = bld->gallivm->builder;
 696    LLVMValueRef length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);
 697    LLVMValueRef icoord;
 698
 699    switch(wrap_mode) {
 700    case PIPE_TEX_WRAP_REPEAT:
 701       if (is_pot) {
 702          coord = lp_build_mul(coord_bld, coord, length_f);
 703          icoord = lp_build_ifloor(coord_bld, coord);
 704          if (offset) {
 705             icoord = lp_build_add(int_coord_bld, icoord, offset);
 706          }
 707          icoord = LLVMBuildAnd(builder, icoord, length_minus_one, "");
 708       }
 709       else {
 710           if (offset) {
 711              offset = lp_build_int_to_float(coord_bld, offset);
 712              offset = lp_build_div(coord_bld, offset, length_f);
 713              coord = lp_build_add(coord_bld, coord, offset);
 714           }
 715           /* take fraction, unnormalize */
 716           coord = lp_build_fract_safe(coord_bld, coord);
 717           coord = lp_build_mul(coord_bld, coord, length_f);
 718           icoord = lp_build_itrunc(coord_bld, coord);
 719       }
 720       break;
 721
 722    case PIPE_TEX_WRAP_CLAMP:
 723    case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
 724       if (bld->static_sampler_state->normalized_coords) {
 725          /* scale coord to length */
 726          coord = lp_build_mul(coord_bld, coord, length_f);
 727       }
 728
 729       if (offset) {
 730          offset = lp_build_int_to_float(coord_bld, offset);
 731          coord = lp_build_add(coord_bld, coord, offset);
 732       }
 733       /* floor */
 734       /* use itrunc instead since we clamp to 0 anyway */
 735       icoord = lp_build_itrunc(coord_bld, coord);
 736
 737       /* clamp to [0, length - 1]. */
 738       icoord = lp_build_clamp(int_coord_bld, icoord, int_coord_bld->zero,
 739                               length_minus_one);
 740       break;
 741
 742    case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
 743       if (bld->static_sampler_state->normalized_coords) {
 744          /* scale coord to length */
 745          coord = lp_build_mul(coord_bld, coord, length_f);
 746       }
 747       /* no clamp necessary, border masking will handle this */
 748       icoord = lp_build_ifloor(coord_bld, coord);
 749       if (offset) {
 750          icoord = lp_build_add(int_coord_bld, icoord, offset);
 751       }
 752       break;
 753
 754    case PIPE_TEX_WRAP_MIRROR_REPEAT:
 755       if (offset) {
 756          offset = lp_build_int_to_float(coord_bld, offset);
 757          offset = lp_build_div(coord_bld, offset, length_f);
 758          coord = lp_build_add(coord_bld, coord, offset);
 759       }
 760       /* compute mirror function */
 761       coord = lp_build_coord_mirror(bld, coord, TRUE);
 762
 763       /* scale coord to length */
 764       assert(bld->static_sampler_state->normalized_coords);
 765       coord = lp_build_mul(coord_bld, coord, length_f);
 766
 767       /* itrunc == ifloor here */
 768       icoord = lp_build_itrunc(coord_bld, coord);
 769
 770       /* clamp to [0, length - 1] */
 771       icoord = lp_build_min(int_coord_bld, icoord, length_minus_one);
 772       break;
 773
 774    case PIPE_TEX_WRAP_MIRROR_CLAMP:
 775    case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
 776       if (bld->static_sampler_state->normalized_coords) {
 777          /* scale coord to length */
 778          coord = lp_build_mul(coord_bld, coord, length_f);
 779       }
 780       if (offset) {
 781          offset = lp_build_int_to_float(coord_bld, offset);
 782          coord = lp_build_add(coord_bld, coord, offset);
 783       }
 784       coord = lp_build_abs(coord_bld, coord);
 785
 786       /* itrunc == ifloor here */
 787       icoord = lp_build_itrunc(coord_bld, coord);
 788       /*
 789        * Use unsigned min due to possible undef values (NaNs, overflow)
 790        */
 791       {
 792          struct lp_build_context abs_coord_bld = *int_coord_bld;
 793          abs_coord_bld.type.sign = FALSE;
 794          /* clamp to [0, length - 1] */
 795          icoord = lp_build_min(&abs_coord_bld, icoord, length_minus_one);
 796       }
 797       break;
 798
 799    case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
 800       if (bld->static_sampler_state->normalized_coords) {
 801          /* scale coord to length */
 802          coord = lp_build_mul(coord_bld, coord, length_f);
 803       }
 804       if (offset) {
 805          offset = lp_build_int_to_float(coord_bld, offset);
 806          coord = lp_build_add(coord_bld, coord, offset);
 807       }
 808       coord = lp_build_abs(coord_bld, coord);
 809
 810       /* itrunc == ifloor here */
 811       icoord = lp_build_itrunc(coord_bld, coord);
 812       break;
 813
 814    default:
 815       assert(0);
 816       icoord = NULL;
 817    }
 818
 819    return icoord;
 820 }
 821
 822
 823 /**
 824  * Do shadow test/comparison.
 825  * \param p shadow ref value
 826  * \param texel  the texel to compare against
 827  */
 828 static LLVMValueRef
 829 lp_build_sample_comparefunc(struct lp_build_sample_context *bld,
 830                             LLVMValueRef p,
 831                             LLVMValueRef texel)
 832 {
 833    struct lp_build_context *texel_bld = &bld->texel_bld;
 834    LLVMValueRef res;
 835
 836    if (0) {
 837       //lp_build_print_value(bld->gallivm, "shadow cmp coord", p);
 838       lp_build_print_value(bld->gallivm, "shadow cmp texel", texel);
 839    }
 840
 841    /* result = (p FUNC texel) ? 1 : 0 */
 842    /*
 843     * honor d3d10 floating point rules here, which state that comparisons
 844     * are ordered except NOT_EQUAL which is unordered.
 845     */
 846    if (bld->static_sampler_state->compare_func != PIPE_FUNC_NOTEQUAL) {
 847       res = lp_build_cmp_ordered(texel_bld, bld->static_sampler_state->compare_func,
 848                                  p, texel);
 849    }
 850    else {
 851       res = lp_build_cmp(texel_bld, bld->static_sampler_state->compare_func,
 852                          p, texel);
 853    }
 854    return res;
 855 }
 856
 857
 858 /**
 859  * Generate code to sample a mipmap level with nearest filtering.
 860  * If sampling a cube texture, r = cube face in [0,5].
 861  */
 862 static void
 863 lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
 864                               LLVMValueRef size,
 865                               LLVMValueRef row_stride_vec,
 866                               LLVMValueRef img_stride_vec,
 867                               LLVMValueRef data_ptr,
 868                               LLVMValueRef mipoffsets,
 869                               const LLVMValueRef *coords,
 870                               const LLVMValueRef *offsets,
 871                               LLVMValueRef colors_out[4])
 872 {
 873    const unsigned dims = bld->dims;
 874    LLVMValueRef width_vec;
 875    LLVMValueRef height_vec;
 876    LLVMValueRef depth_vec;
 877    LLVMValueRef flt_size;
 878    LLVMValueRef flt_width_vec;
 879    LLVMValueRef flt_height_vec;
 880    LLVMValueRef flt_depth_vec;
 881    LLVMValueRef x, y = NULL, z = NULL;
 882
 883    lp_build_extract_image_sizes(bld,
 884                                 &bld->int_size_bld,
 885                                 bld->int_coord_type,
 886                                 size,
 887                                 &width_vec, &height_vec, &depth_vec);
 888
 889    flt_size = lp_build_int_to_float(&bld->float_size_bld, size);
 890
 891    lp_build_extract_image_sizes(bld,
 892                                 &bld->float_size_bld,
 893                                 bld->coord_type,
 894                                 flt_size,
 895                                 &flt_width_vec, &flt_height_vec, &flt_depth_vec);
 896
 897    /*
 898     * Compute integer texcoords.
 899     */
 900    x = lp_build_sample_wrap_nearest(bld, coords[0], width_vec,
 901                                     flt_width_vec, offsets[0],
 902                                     bld->static_texture_state->pot_width,
 903                                     bld->static_sampler_state->wrap_s);
 904    lp_build_name(x, "tex.x.wrapped");
 905
 906    if (dims >= 2) {
 907       y = lp_build_sample_wrap_nearest(bld, coords[1], height_vec,
 908                                        flt_height_vec, offsets[1],
 909                                        bld->static_texture_state->pot_height,
 910                                        bld->static_sampler_state->wrap_t);
 911       lp_build_name(y, "tex.y.wrapped");
 912
 913       if (dims == 3) {
 914          z = lp_build_sample_wrap_nearest(bld, coords[2], depth_vec,
 915                                           flt_depth_vec, offsets[2],
 916                                           bld->static_texture_state->pot_depth,
 917                                           bld->static_sampler_state->wrap_r);
 918          lp_build_name(z, "tex.z.wrapped");
 919       }
 920    }
 921    if (has_layer_coord(bld->static_texture_state->target)) {
 922       if (bld->static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) {
 923          /* add cube layer to face */
 924          z = lp_build_add(&bld->int_coord_bld, coords[2], coords[3]);
 925       }
 926       else {
 927          z = coords[2];
 928       }
 929       lp_build_name(z, "tex.z.layer");
 930    }
 931
 932    /*
 933     * Get texture colors.
 934     */
 935    lp_build_sample_texel_soa(bld,
 936                              width_vec, height_vec, depth_vec,
 937                              x, y, z,
 938                              row_stride_vec, img_stride_vec,
 939                              data_ptr, mipoffsets, colors_out);
 940
 941    if (bld->static_sampler_state->compare_mode != PIPE_TEX_COMPARE_NONE) {
 942       LLVMValueRef cmpval;
 943       cmpval = lp_build_sample_comparefunc(bld, coords[4], colors_out[0]);
 944       /* this is really just a AND 1.0, cmpval but llvm is clever enough */
 945       colors_out[0] = lp_build_select(&bld->texel_bld, cmpval,
 946                                       bld->texel_bld.one, bld->texel_bld.zero);
 947       colors_out[1] = colors_out[2] = colors_out[3] = colors_out[0];
 948    }
 949
 950 }
 951
 952
 953 /**
 954  * Like a lerp, but inputs are 0/~0 masks, so can simplify slightly.
 955  */
 956 static LLVMValueRef
 957 lp_build_masklerp(struct lp_build_context *bld,
 958                  LLVMValueRef weight,
 959                  LLVMValueRef mask0,
 960                  LLVMValueRef mask1)
 961 {
 962    struct gallivm_state *gallivm = bld->gallivm;
 963    LLVMBuilderRef builder = gallivm->builder;
 964    LLVMValueRef weight2;
 965
 966    weight2 = lp_build_sub(bld, bld->one, weight);
 967    weight = LLVMBuildBitCast(builder, weight,
 968                               lp_build_int_vec_type(gallivm, bld->type), "");
 969    weight2 = LLVMBuildBitCast(builder, weight2,
 970                               lp_build_int_vec_type(gallivm, bld->type), "");
 971    weight = LLVMBuildAnd(builder, weight, mask1, "");
 972    weight2 = LLVMBuildAnd(builder, weight2, mask0, "");
 973    weight = LLVMBuildBitCast(builder, weight, bld->vec_type, "");
 974    weight2 = LLVMBuildBitCast(builder, weight2, bld->vec_type, "");
 975    return lp_build_add(bld, weight, weight2);
 976 }
 977
 978 /**
 979  * Like a 2d lerp, but inputs are 0/~0 masks, so can simplify slightly.
 980  */
 981 static LLVMValueRef
 982 lp_build_masklerp2d(struct lp_build_context *bld,
 983                     LLVMValueRef weight0,
 984                     LLVMValueRef weight1,
 985                     LLVMValueRef mask00,
 986                     LLVMValueRef mask01,
 987                     LLVMValueRef mask10,
 988                     LLVMValueRef mask11)
 989 {
 990    LLVMValueRef val0 = lp_build_masklerp(bld, weight0, mask00, mask01);
 991    LLVMValueRef val1 = lp_build_masklerp(bld, weight0, mask10, mask11);
 992    return lp_build_lerp(bld, weight1, val0, val1, 0);
 993 }
 994
/*
 * Accurate cube corner filtering is a fair amount of extra code for
 * something OpenGL merely recommends but does not require.
 * Set this to 0 to disable it.
 */
#define ACCURATE_CUBE_CORNERS 1
1000
1001 /**
1002  * Generate code to sample a mipmap level with linear filtering.
1003  * If sampling a cube texture, r = cube face in [0,5].
1004  * If linear_mask is present, only pixels having their mask set
1005  * will receive linear filtering, the rest will use nearest.
1006  */
1007 static void
1008 lp_build_sample_image_linear(struct lp_build_sample_context *bld,
1009                              boolean is_gather,
1010                              LLVMValueRef size,
1011                              LLVMValueRef linear_mask,
1012                              LLVMValueRef row_stride_vec,
1013                              LLVMValueRef img_stride_vec,
1014                              LLVMValueRef data_ptr,
1015                              LLVMValueRef mipoffsets,
1016                              const LLVMValueRef *coords,
1017                              const LLVMValueRef *offsets,
1018                              LLVMValueRef colors_out[4])
1019 {
1020    LLVMBuilderRef builder = bld->gallivm->builder;
1021    struct lp_build_context *ivec_bld = &bld->int_coord_bld;
1022    struct lp_build_context *coord_bld = &bld->coord_bld;
1023    struct lp_build_context *texel_bld = &bld->texel_bld;
1024    const unsigned dims = bld->dims;
1025    LLVMValueRef width_vec;
1026    LLVMValueRef height_vec;
1027    LLVMValueRef depth_vec;
1028    LLVMValueRef flt_size;
1029    LLVMValueRef flt_width_vec;
1030    LLVMValueRef flt_height_vec;
1031    LLVMValueRef flt_depth_vec;
1032    LLVMValueRef fall_off[4], have_corners;
1033    LLVMValueRef z1 = NULL;
1034    LLVMValueRef z00 = NULL, z01 = NULL, z10 = NULL, z11 = NULL;
1035    LLVMValueRef x00 = NULL, x01 = NULL, x10 = NULL, x11 = NULL;
1036    LLVMValueRef y00 = NULL, y01 = NULL, y10 = NULL, y11 = NULL;
1037    LLVMValueRef s_fpart, t_fpart = NULL, r_fpart = NULL;
1038    LLVMValueRef xs[4], ys[4], zs[4];
1039    LLVMValueRef neighbors[2][2][4];
1040    int chan, texel_index;
1041    boolean seamless_cube_filter, accurate_cube_corners;
1042    unsigned chan_swiz = bld->static_texture_state->swizzle_r;
1043
1044    if (is_gather) {
1045       switch (bld->gather_comp) {
1046       case 0: chan_swiz = bld->static_texture_state->swizzle_r; break;
1047       case 1: chan_swiz = bld->static_texture_state->swizzle_g; break;
1048       case 2: chan_swiz = bld->static_texture_state->swizzle_b; break;
1049       case 3: chan_swiz = bld->static_texture_state->swizzle_a; break;
1050       default:
1051          break;
1052       }
1053    }
1054
1055    seamless_cube_filter = (bld->static_texture_state->target == PIPE_TEXTURE_CUBE ||
1056                            bld->static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) &&
1057                           bld->static_sampler_state->seamless_cube_map;
1058
1059    /*
1060     * Disable accurate cube corners for integer textures, which should only
1061     * get here in the gather path.
1062     */
1063    accurate_cube_corners = ACCURATE_CUBE_CORNERS && seamless_cube_filter &&
1064      !util_format_is_pure_integer(bld->static_texture_state->format);
1065
1066    lp_build_extract_image_sizes(bld,
1067                                 &bld->int_size_bld,
1068                                 bld->int_coord_type,
1069                                 size,
1070                                 &width_vec, &height_vec, &depth_vec);
1071
1072    flt_size = lp_build_int_to_float(&bld->float_size_bld, size);
1073
1074    lp_build_extract_image_sizes(bld,
1075                                 &bld->float_size_bld,
1076                                 bld->coord_type,
1077                                 flt_size,
1078                                 &flt_width_vec, &flt_height_vec, &flt_depth_vec);
1079
1080    /*
1081     * Compute integer texcoords.
1082     */
1083
1084    if (!seamless_cube_filter) {
1085       lp_build_sample_wrap_linear(bld, is_gather, coords[0], width_vec,
1086                                   flt_width_vec, offsets[0],
1087                                   bld->static_texture_state->pot_width,
1088                                   bld->static_sampler_state->wrap_s,
1089                                   &x00, &x01, &s_fpart);
1090       lp_build_name(x00, "tex.x0.wrapped");
1091       lp_build_name(x01, "tex.x1.wrapped");
1092       x10 = x00;
1093       x11 = x01;
1094
1095       if (dims >= 2) {
1096          lp_build_sample_wrap_linear(bld, is_gather, coords[1], height_vec,
1097                                      flt_height_vec, offsets[1],
1098                                      bld->static_texture_state->pot_height,
1099                                      bld->static_sampler_state->wrap_t,
1100                                      &y00, &y10, &t_fpart);
1101          lp_build_name(y00, "tex.y0.wrapped");
1102          lp_build_name(y10, "tex.y1.wrapped");
1103          y01 = y00;
1104          y11 = y10;
1105
1106          if (dims == 3) {
1107             lp_build_sample_wrap_linear(bld, is_gather, coords[2], depth_vec,
1108                                         flt_depth_vec, offsets[2],
1109                                         bld->static_texture_state->pot_depth,
1110                                         bld->static_sampler_state->wrap_r,
1111                                         &z00, &z1, &r_fpart);
1112             z01 = z10 = z11 = z00;
1113             lp_build_name(z00, "tex.z0.wrapped");
1114             lp_build_name(z1, "tex.z1.wrapped");
1115          }
1116       }
1117       if (has_layer_coord(bld->static_texture_state->target)) {
1118          if (bld->static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) {
1119             /* add cube layer to face */
1120             z00 = z01 = z10 = z11 = z1 =
1121                lp_build_add(&bld->int_coord_bld, coords[2], coords[3]);
1122          }
1123          else {
1124             z00 = z01 = z10 = z11 = z1 = coords[2];  /* cube face or layer */
1125          }
1126          lp_build_name(z00, "tex.z0.layer");
1127          lp_build_name(z1, "tex.z1.layer");
1128       }
1129    }
1130    else {
1131       struct lp_build_if_state edge_if;
1132       LLVMTypeRef int1t;
1133       LLVMValueRef new_faces[4], new_xcoords[4][2], new_ycoords[4][2];
1134       LLVMValueRef coord0, coord1, have_edge, have_corner;
1135       LLVMValueRef fall_off_ym_notxm, fall_off_ym_notxp, fall_off_x, fall_off_y;
1136       LLVMValueRef fall_off_yp_notxm, fall_off_yp_notxp;
1137       LLVMValueRef x0, x1, y0, y1, y0_clamped, y1_clamped;
1138       LLVMValueRef face = coords[2];
1139       LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5f);
1140       LLVMValueRef length_minus_one = lp_build_sub(ivec_bld, width_vec, ivec_bld->one);
1141       /* XXX drop height calcs. Could (should) do this without seamless filtering too */
1142       height_vec = width_vec;
1143       flt_height_vec = flt_width_vec;
1144
1145       /* XXX the overflow logic is actually sort of duplicated with trilinear,
1146        * since an overflow in one mip should also have a corresponding overflow
1147        * in another.
1148        */
1149       /* should always have normalized coords, and offsets are undefined */
1150       assert(bld->static_sampler_state->normalized_coords);
1151       /*
1152        * The coords should all be between [0,1] however we can have NaNs,
1153        * which will wreak havoc. In particular the y1_clamped value below
1154        * can be -INT_MAX (on x86) and be propagated right through (probably
1155        * other values might be bogus in the end too).
1156        * So kill off the NaNs here.
1157        */
1158       coord0 = lp_build_max_ext(coord_bld, coords[0], coord_bld->zero,
1159                                 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1160       coord0 = lp_build_mul(coord_bld, coord0, flt_width_vec);
1161       /* instead of clamp, build mask if overflowed */
1162       coord0 = lp_build_sub(coord_bld, coord0, half);
1163       /* convert to int, compute lerp weight */
1164       /* not ideal with AVX (and no AVX2) */
1165       lp_build_ifloor_fract(coord_bld, coord0, &x0, &s_fpart);
1166       x1 = lp_build_add(ivec_bld, x0, ivec_bld->one);
1167       coord1 = lp_build_max_ext(coord_bld, coords[1], coord_bld->zero,
1168                                 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1169       coord1 = lp_build_mul(coord_bld, coord1, flt_height_vec);
1170       coord1 = lp_build_sub(coord_bld, coord1, half);
1171       lp_build_ifloor_fract(coord_bld, coord1, &y0, &t_fpart);
1172       y1 = lp_build_add(ivec_bld, y0, ivec_bld->one);
1173
1174       fall_off[0] = lp_build_cmp(ivec_bld, PIPE_FUNC_LESS, x0, ivec_bld->zero);
1175       fall_off[1] = lp_build_cmp(ivec_bld, PIPE_FUNC_GREATER, x1, length_minus_one);
1176       fall_off[2] = lp_build_cmp(ivec_bld, PIPE_FUNC_LESS, y0, ivec_bld->zero);
1177       fall_off[3] = lp_build_cmp(ivec_bld, PIPE_FUNC_GREATER, y1, length_minus_one);
1178
1179       fall_off_x = lp_build_or(ivec_bld, fall_off[0], fall_off[1]);
1180       fall_off_y = lp_build_or(ivec_bld, fall_off[2], fall_off[3]);
1181       have_edge = lp_build_or(ivec_bld, fall_off_x, fall_off_y);
1182       have_edge = lp_build_any_true_range(ivec_bld, ivec_bld->type.length, have_edge);
1183
1184       /* needed for accurate corner filtering branch later, rely on 0 init */
1185       int1t = LLVMInt1TypeInContext(bld->gallivm->context);
1186       have_corners = lp_build_alloca(bld->gallivm, int1t, "have_corner");
1187
1188       for (texel_index = 0; texel_index < 4; texel_index++) {
1189          xs[texel_index] = lp_build_alloca(bld->gallivm, ivec_bld->vec_type, "xs");
1190          ys[texel_index] = lp_build_alloca(bld->gallivm, ivec_bld->vec_type, "ys");
1191          zs[texel_index] = lp_build_alloca(bld->gallivm, ivec_bld->vec_type, "zs");
1192       }
1193
1194       lp_build_if(&edge_if, bld->gallivm, have_edge);
1195
1196       have_corner = lp_build_and(ivec_bld, fall_off_x, fall_off_y);
1197       have_corner = lp_build_any_true_range(ivec_bld, ivec_bld->type.length, have_corner);
1198       LLVMBuildStore(builder, have_corner, have_corners);
1199
1200       /*
1201        * Need to feed clamped values here for cheap corner handling,
1202        * but only for y coord (as when falling off both edges we only
1203        * fall off the x one) - this should be sufficient.
1204        */
1205       y0_clamped = lp_build_max(ivec_bld, y0, ivec_bld->zero);
1206       y1_clamped = lp_build_min(ivec_bld, y1, length_minus_one);
1207
1208       /*
1209        * Get all possible new coords.
1210        */
1211       lp_build_cube_new_coords(ivec_bld, face,
1212                                x0, x1, y0_clamped, y1_clamped,
1213                                length_minus_one,
1214                                new_faces, new_xcoords, new_ycoords);
1215
1216       /* handle fall off x-, x+ direction */
1217       /* determine new coords, face (not both fall_off vars can be true at same time) */
1218       x00 = lp_build_select(ivec_bld, fall_off[0], new_xcoords[0][0], x0);
1219       y00 = lp_build_select(ivec_bld, fall_off[0], new_ycoords[0][0], y0_clamped);
1220       x10 = lp_build_select(ivec_bld, fall_off[0], new_xcoords[0][1], x0);
1221       y10 = lp_build_select(ivec_bld, fall_off[0], new_ycoords[0][1], y1_clamped);
1222       x01 = lp_build_select(ivec_bld, fall_off[1], new_xcoords[1][0], x1);
1223       y01 = lp_build_select(ivec_bld, fall_off[1], new_ycoords[1][0], y0_clamped);
1224       x11 = lp_build_select(ivec_bld, fall_off[1], new_xcoords[1][1], x1);
1225       y11 = lp_build_select(ivec_bld, fall_off[1], new_ycoords[1][1], y1_clamped);
1226
1227       z00 = z10 = lp_build_select(ivec_bld, fall_off[0], new_faces[0], face);
1228       z01 = z11 = lp_build_select(ivec_bld, fall_off[1], new_faces[1], face);
1229
1230       /* handle fall off y-, y+ direction */
1231       /*
1232        * Cheap corner logic: just hack up things so a texel doesn't fall
1233        * off both sides (which means filter weights will be wrong but we'll only
1234        * use valid texels in the filter).
1235        * This means however (y) coords must additionally be clamped (see above).
1236        * This corner handling should be fully OpenGL (but not d3d10) compliant.
1237        */
1238       fall_off_ym_notxm = lp_build_andnot(ivec_bld, fall_off[2], fall_off[0]);
1239       fall_off_ym_notxp = lp_build_andnot(ivec_bld, fall_off[2], fall_off[1]);
1240       fall_off_yp_notxm = lp_build_andnot(ivec_bld, fall_off[3], fall_off[0]);
1241       fall_off_yp_notxp = lp_build_andnot(ivec_bld, fall_off[3], fall_off[1]);
1242
1243       x00 = lp_build_select(ivec_bld, fall_off_ym_notxm, new_xcoords[2][0], x00);
1244       y00 = lp_build_select(ivec_bld, fall_off_ym_notxm, new_ycoords[2][0], y00);
1245       x01 = lp_build_select(ivec_bld, fall_off_ym_notxp, new_xcoords[2][1], x01);
1246       y01 = lp_build_select(ivec_bld, fall_off_ym_notxp, new_ycoords[2][1], y01);
1247       x10 = lp_build_select(ivec_bld, fall_off_yp_notxm, new_xcoords[3][0], x10);
1248       y10 = lp_build_select(ivec_bld, fall_off_yp_notxm, new_ycoords[3][0], y10);
1249       x11 = lp_build_select(ivec_bld, fall_off_yp_notxp, new_xcoords[3][1], x11);
1250       y11 = lp_build_select(ivec_bld, fall_off_yp_notxp, new_ycoords[3][1], y11);
1251
1252       z00 = lp_build_select(ivec_bld, fall_off_ym_notxm, new_faces[2], z00);
1253       z01 = lp_build_select(ivec_bld, fall_off_ym_notxp, new_faces[2], z01);
1254       z10 = lp_build_select(ivec_bld, fall_off_yp_notxm, new_faces[3], z10);
1255       z11 = lp_build_select(ivec_bld, fall_off_yp_notxp, new_faces[3], z11);
1256
1257       if (bld->static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) {
1258          /* now can add cube layer to face (per sample) */
1259          z00 = lp_build_add(ivec_bld, z00, coords[3]);
1260          z01 = lp_build_add(ivec_bld, z01, coords[3]);
1261          z10 = lp_build_add(ivec_bld, z10, coords[3]);
1262          z11 = lp_build_add(ivec_bld, z11, coords[3]);
1263       }
1264
1265       LLVMBuildStore(builder, x00, xs[0]);
1266       LLVMBuildStore(builder, x01, xs[1]);
1267       LLVMBuildStore(builder, x10, xs[2]);
1268       LLVMBuildStore(builder, x11, xs[3]);
1269       LLVMBuildStore(builder, y00, ys[0]);
1270       LLVMBuildStore(builder, y01, ys[1]);
1271       LLVMBuildStore(builder, y10, ys[2]);
1272       LLVMBuildStore(builder, y11, ys[3]);
1273       LLVMBuildStore(builder, z00, zs[0]);
1274       LLVMBuildStore(builder, z01, zs[1]);
1275       LLVMBuildStore(builder, z10, zs[2]);
1276       LLVMBuildStore(builder, z11, zs[3]);
1277
1278       lp_build_else(&edge_if);
1279
1280       LLVMBuildStore(builder, x0, xs[0]);
1281       LLVMBuildStore(builder, x1, xs[1]);
1282       LLVMBuildStore(builder, x0, xs[2]);
1283       LLVMBuildStore(builder, x1, xs[3]);
1284       LLVMBuildStore(builder, y0, ys[0]);
1285       LLVMBuildStore(builder, y0, ys[1]);
1286       LLVMBuildStore(builder, y1, ys[2]);
1287       LLVMBuildStore(builder, y1, ys[3]);
1288       if (bld->static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) {
1289          LLVMValueRef cube_layer = lp_build_add(ivec_bld, face, coords[3]);
1290          LLVMBuildStore(builder, cube_layer, zs[0]);
1291          LLVMBuildStore(builder, cube_layer, zs[1]);
1292          LLVMBuildStore(builder, cube_layer, zs[2]);
1293          LLVMBuildStore(builder, cube_layer, zs[3]);
1294       }
1295       else {
1296          LLVMBuildStore(builder, face, zs[0]);
1297          LLVMBuildStore(builder, face, zs[1]);
1298          LLVMBuildStore(builder, face, zs[2]);
1299          LLVMBuildStore(builder, face, zs[3]);
1300       }
1301
1302       lp_build_endif(&edge_if);
1303
1304       x00 = LLVMBuildLoad(builder, xs[0], "");
1305       x01 = LLVMBuildLoad(builder, xs[1], "");
1306       x10 = LLVMBuildLoad(builder, xs[2], "");
1307       x11 = LLVMBuildLoad(builder, xs[3], "");
1308       y00 = LLVMBuildLoad(builder, ys[0], "");
1309       y01 = LLVMBuildLoad(builder, ys[1], "");
1310       y10 = LLVMBuildLoad(builder, ys[2], "");
1311       y11 = LLVMBuildLoad(builder, ys[3], "");
1312       z00 = LLVMBuildLoad(builder, zs[0], "");
1313       z01 = LLVMBuildLoad(builder, zs[1], "");
1314       z10 = LLVMBuildLoad(builder, zs[2], "");
1315       z11 = LLVMBuildLoad(builder, zs[3], "");
1316    }
1317
1318    if (linear_mask) {
1319       /*
1320        * Whack filter weights into place. Whatever texel had more weight is
1321        * the one which should have been selected by nearest filtering hence
1322        * just use 100% weight for it.
1323        */
1324       struct lp_build_context *c_bld = &bld->coord_bld;
1325       LLVMValueRef w1_mask, w1_weight;
1326       LLVMValueRef half = lp_build_const_vec(bld->gallivm, c_bld->type, 0.5f);
1327
1328       w1_mask = lp_build_cmp(c_bld, PIPE_FUNC_GREATER, s_fpart, half);
1329       /* this select is really just a "and" */
1330       w1_weight = lp_build_select(c_bld, w1_mask, c_bld->one, c_bld->zero);
1331       s_fpart = lp_build_select(c_bld, linear_mask, s_fpart, w1_weight);
1332       if (dims >= 2) {
1333          w1_mask = lp_build_cmp(c_bld, PIPE_FUNC_GREATER, t_fpart, half);
1334          w1_weight = lp_build_select(c_bld, w1_mask, c_bld->one, c_bld->zero);
1335          t_fpart = lp_build_select(c_bld, linear_mask, t_fpart, w1_weight);
1336          if (dims == 3) {
1337             w1_mask = lp_build_cmp(c_bld, PIPE_FUNC_GREATER, r_fpart, half);
1338             w1_weight = lp_build_select(c_bld, w1_mask, c_bld->one, c_bld->zero);
1339             r_fpart = lp_build_select(c_bld, linear_mask, r_fpart, w1_weight);
1340          }
1341       }
1342    }
1343
1344    /*
1345     * Get texture colors.
1346     */
1347    /* get x0/x1 texels */
1348    lp_build_sample_texel_soa(bld,
1349                              width_vec, height_vec, depth_vec,
1350                              x00, y00, z00,
1351                              row_stride_vec, img_stride_vec,
1352                              data_ptr, mipoffsets, neighbors[0][0]);
1353    lp_build_sample_texel_soa(bld,
1354                              width_vec, height_vec, depth_vec,
1355                              x01, y01, z01,
1356                              row_stride_vec, img_stride_vec,
1357                              data_ptr, mipoffsets, neighbors[0][1]);
1358
1359    if (dims == 1) {
1360       assert(!is_gather);
1361       if (bld->static_sampler_state->compare_mode == PIPE_TEX_COMPARE_NONE) {
1362          /* Interpolate two samples from 1D image to produce one color */
1363          for (chan = 0; chan < 4; chan++) {
1364             colors_out[chan] = lp_build_lerp(texel_bld, s_fpart,
1365                                              neighbors[0][0][chan],
1366                                              neighbors[0][1][chan],
1367                                              0);
1368          }
1369       }
1370       else {
1371          LLVMValueRef cmpval0, cmpval1;
1372          cmpval0 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][0][0]);
1373          cmpval1 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][1][0]);
1374          /* simplified lerp, AND mask with weight and add */
1375          colors_out[0] = lp_build_masklerp(texel_bld, s_fpart,
1376                                            cmpval0, cmpval1);
1377          colors_out[1] = colors_out[2] = colors_out[3] = colors_out[0];
1378       }
1379    }
1380    else {
1381       /* 2D/3D texture */
1382       struct lp_build_if_state corner_if;
1383       LLVMValueRef colors0[4], colorss[4];
1384
1385       /* get x0/x1 texels at y1 */
1386       lp_build_sample_texel_soa(bld,
1387                                 width_vec, height_vec, depth_vec,
1388                                 x10, y10, z10,
1389                                 row_stride_vec, img_stride_vec,
1390                                 data_ptr, mipoffsets, neighbors[1][0]);
1391       lp_build_sample_texel_soa(bld,
1392                                 width_vec, height_vec, depth_vec,
1393                                 x11, y11, z11,
1394                                 row_stride_vec, img_stride_vec,
1395                                 data_ptr, mipoffsets, neighbors[1][1]);
1396
1397       /*
1398        * To avoid having to duplicate linear_mask / fetch code use
1399        * another branch (with corner condition though edge would work
1400        * as well) here.
1401        */
1402       if (accurate_cube_corners) {
1403          LLVMValueRef c00, c01, c10, c11, c00f, c01f, c10f, c11f;
1404          LLVMValueRef have_corner, one_third;
1405
1406          colorss[0] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs0");
1407          colorss[1] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs1");
1408          colorss[2] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs2");
1409          colorss[3] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs3");
1410
1411          have_corner = LLVMBuildLoad(builder, have_corners, "");
1412
1413          lp_build_if(&corner_if, bld->gallivm, have_corner);
1414
1415          one_third = lp_build_const_vec(bld->gallivm, coord_bld->type,
1416                                         1.0f/3.0f);
1417
1418          /* find corner */
1419          c00 = lp_build_and(ivec_bld, fall_off[0], fall_off[2]);
1420          c00f = LLVMBuildBitCast(builder, c00, coord_bld->vec_type, "");
1421          c01 = lp_build_and(ivec_bld, fall_off[1], fall_off[2]);
1422          c01f = LLVMBuildBitCast(builder, c01, coord_bld->vec_type, "");
1423          c10 = lp_build_and(ivec_bld, fall_off[0], fall_off[3]);
1424          c10f = LLVMBuildBitCast(builder, c10, coord_bld->vec_type, "");
1425          c11 = lp_build_and(ivec_bld, fall_off[1], fall_off[3]);
1426          c11f = LLVMBuildBitCast(builder, c11, coord_bld->vec_type, "");
1427
1428          if (!is_gather) {
1429             /*
1430              * we can't use standard 2d lerp as we need per-element weight
1431              * in case of corners, so just calculate bilinear result as
1432              * w00*s00 + w01*s01 + w10*s10 + w11*s11.
1433              * (This is actually less work than using 2d lerp, 7 vs. 9
1434              * instructions, however calculating the weights needs another 6,
1435              * so actually probably not slower than 2d lerp only for 4 channels
1436              * as weights only need to be calculated once - of course fixing
1437              * the weights has additional cost.)
1438              */
1439             LLVMValueRef w00, w01, w10, w11, wx0, wy0, c_weight, tmp;
1440             wx0 = lp_build_sub(coord_bld, coord_bld->one, s_fpart);
1441             wy0 = lp_build_sub(coord_bld, coord_bld->one, t_fpart);
1442             w00 = lp_build_mul(coord_bld, wx0, wy0);
1443             w01 = lp_build_mul(coord_bld, s_fpart, wy0);
1444             w10 = lp_build_mul(coord_bld, wx0, t_fpart);
1445             w11 = lp_build_mul(coord_bld, s_fpart, t_fpart);
1446
1447             /* find corner weight */
1448             c_weight = lp_build_select(coord_bld, c00, w00, coord_bld->zero);
1449             c_weight = lp_build_select(coord_bld, c01, w01, c_weight);
1450             c_weight = lp_build_select(coord_bld, c10, w10, c_weight);
1451             c_weight = lp_build_select(coord_bld, c11, w11, c_weight);
1452
1453             /*
1454              * add 1/3 of the corner weight to the weight of the 3 other
1455              * samples and null out corner weight.
1456              */
1457             c_weight = lp_build_mul(coord_bld, c_weight, one_third);
1458             w00 = lp_build_add(coord_bld, w00, c_weight);
1459             w00 = lp_build_andnot(coord_bld, w00, c00f);
1460             w01 = lp_build_add(coord_bld, w01, c_weight);
1461             w01 = lp_build_andnot(coord_bld, w01, c01f);
1462             w10 = lp_build_add(coord_bld, w10, c_weight);
1463             w10 = lp_build_andnot(coord_bld, w10, c10f);
1464             w11 = lp_build_add(coord_bld, w11, c_weight);
1465             w11 = lp_build_andnot(coord_bld, w11, c11f);
1466
1467             if (bld->static_sampler_state->compare_mode ==
1468                 PIPE_TEX_COMPARE_NONE) {
1469                for (chan = 0; chan < 4; chan++) {
1470                   colors0[chan] = lp_build_mul(coord_bld, w00,
1471                                                neighbors[0][0][chan]);
1472                   tmp = lp_build_mul(coord_bld, w01, neighbors[0][1][chan]);
1473                   colors0[chan] = lp_build_add(coord_bld, tmp, colors0[chan]);
1474                   tmp = lp_build_mul(coord_bld, w10, neighbors[1][0][chan]);
1475                   colors0[chan] = lp_build_add(coord_bld, tmp, colors0[chan]);
1476                   tmp = lp_build_mul(coord_bld, w11, neighbors[1][1][chan]);
1477                   colors0[chan] = lp_build_add(coord_bld, tmp, colors0[chan]);
1478                }
1479             }
1480             else {
1481                LLVMValueRef cmpval00, cmpval01, cmpval10, cmpval11;
1482                cmpval00 = lp_build_sample_comparefunc(bld, coords[4],
1483                                                       neighbors[0][0][0]);
1484                cmpval01 = lp_build_sample_comparefunc(bld, coords[4],
1485                                                       neighbors[0][1][0]);
1486                cmpval10 = lp_build_sample_comparefunc(bld, coords[4],
1487                                                       neighbors[1][0][0]);
1488                cmpval11 = lp_build_sample_comparefunc(bld, coords[4],
1489                                                       neighbors[1][1][0]);
1490                /*
1491                 * inputs to interpolation are just masks so just add
1492                 * masked weights together
1493                 */
1494                cmpval00 = LLVMBuildBitCast(builder, cmpval00,
1495                                            coord_bld->vec_type, "");
1496                cmpval01 = LLVMBuildBitCast(builder, cmpval01,
1497                                            coord_bld->vec_type, "");
1498                cmpval10 = LLVMBuildBitCast(builder, cmpval10,
1499                                            coord_bld->vec_type, "");
1500                cmpval11 = LLVMBuildBitCast(builder, cmpval11,
1501                                            coord_bld->vec_type, "");
1502                colors0[0] = lp_build_and(coord_bld, w00, cmpval00);
1503                tmp = lp_build_and(coord_bld, w01, cmpval01);
1504                colors0[0] = lp_build_add(coord_bld, tmp, colors0[0]);
1505                tmp = lp_build_and(coord_bld, w10, cmpval10);
1506                colors0[0] = lp_build_add(coord_bld, tmp, colors0[0]);
1507                tmp = lp_build_and(coord_bld, w11, cmpval11);
1508                colors0[0] = lp_build_add(coord_bld, tmp, colors0[0]);
1509                colors0[1] = colors0[2] = colors0[3] = colors0[0];
1510             }
1511          }
1512          else {
1513             /*
1514              * We don't have any weights to adjust, so instead calculate
1515              * the fourth texel as simply the average of the other 3.
1516              * (This would work for non-gather too, however we'd have
1517              * a boatload more of the select stuff due to there being
1518              * 4 times as many colors as weights.)
1519              */
1520             LLVMValueRef col00, col01, col10, col11;
1521             LLVMValueRef colc, colc0, colc1;
1522             col10 = lp_build_swizzle_soa_channel(texel_bld,
1523                                                  neighbors[1][0], chan_swiz);
1524             col11 = lp_build_swizzle_soa_channel(texel_bld,
1525                                                  neighbors[1][1], chan_swiz);
1526             col01 = lp_build_swizzle_soa_channel(texel_bld,
1527                                                  neighbors[0][1], chan_swiz);
1528             col00 = lp_build_swizzle_soa_channel(texel_bld,
1529                                                  neighbors[0][0], chan_swiz);
1530
1531             /*
1532              * The spec says for comparison filtering, the comparison
1533              * must happen before synthesizing the new value.
1534              * This means all gathered values are always 0 or 1,
1535              * except for the non-existing texel, which can be 0,1/3,2/3,1...
1536              * Seems like we'd be allowed to just return 0 or 1 too, so we
1537              * could simplify and pass down the compare mask values to the
1538              * end (using int arithmetic/compare on the mask values to
1539              * construct the fourth texel) and only there convert to floats
1540              * but it's probably not worth it (it might be easier for the cpu
1541              * but not for the code)...
1542              */
1543             if (bld->static_sampler_state->compare_mode !=
1544                 PIPE_TEX_COMPARE_NONE) {
1545                LLVMValueRef cmpval00, cmpval01, cmpval10, cmpval11;
1546                cmpval00 = lp_build_sample_comparefunc(bld, coords[4], col00);
1547                cmpval01 = lp_build_sample_comparefunc(bld, coords[4], col01);
1548                cmpval10 = lp_build_sample_comparefunc(bld, coords[4], col10);
1549                cmpval11 = lp_build_sample_comparefunc(bld, coords[4], col11);
1550                col00 = lp_build_select(texel_bld, cmpval00,
1551                                        texel_bld->one, texel_bld->zero);
1552                col01 = lp_build_select(texel_bld, cmpval01,
1553                                        texel_bld->one, texel_bld->zero);
1554                col10 = lp_build_select(texel_bld, cmpval10,
1555                                        texel_bld->one, texel_bld->zero);
1556                col11 = lp_build_select(texel_bld, cmpval11,
1557                                        texel_bld->one, texel_bld->zero);
1558             }
1559
1560             /*
1561              * Null out corner color.
1562              */
1563             col00 = lp_build_andnot(coord_bld, col00, c00f);
1564             col01 = lp_build_andnot(coord_bld, col01, c01f);
1565             col10 = lp_build_andnot(coord_bld, col10, c10f);
1566             col11 = lp_build_andnot(coord_bld, col11, c11f);
1567
1568             /*
1569              * New corner texel color is all colors added / 3.
1570              */
1571             colc0 = lp_build_add(coord_bld, col00, col01);
1572             colc1 = lp_build_add(coord_bld, col10, col11);
1573             colc = lp_build_add(coord_bld, colc0, colc1);
1574             colc = lp_build_mul(coord_bld, one_third, colc);
1575
1576             /*
1577              * Replace the corner texel color with the new value.
1578              */
1579             col00 = lp_build_select(coord_bld, c00, colc, col00);
1580             col01 = lp_build_select(coord_bld, c01, colc, col01);
1581             col10 = lp_build_select(coord_bld, c10, colc, col10);
1582             col11 = lp_build_select(coord_bld, c11, colc, col11);
1583
1584             colors0[0] = col10;
1585             colors0[1] = col11;
1586             colors0[2] = col01;
1587             colors0[3] = col00;
1588          }
1589
1590          LLVMBuildStore(builder, colors0[0], colorss[0]);
1591          LLVMBuildStore(builder, colors0[1], colorss[1]);
1592          LLVMBuildStore(builder, colors0[2], colorss[2]);
1593          LLVMBuildStore(builder, colors0[3], colorss[3]);
1594
1595          lp_build_else(&corner_if);
1596       }
1597
1598       if (bld->static_sampler_state->compare_mode == PIPE_TEX_COMPARE_NONE) {
1599          if (is_gather) {
1600             /*
1601              * Just assign the red channel (no component selection yet).
1602              * This is a bit hackish, we usually do the swizzle at the
1603              * end of sampling (much less values to swizzle), but this
1604              * obviously cannot work when using gather.
1605              */
1606             colors0[0] = lp_build_swizzle_soa_channel(texel_bld,
1607                                                       neighbors[1][0],
1608                                                       chan_swiz);
1609             colors0[1] = lp_build_swizzle_soa_channel(texel_bld,
1610                                                       neighbors[1][1],
1611                                                       chan_swiz);
1612             colors0[2] = lp_build_swizzle_soa_channel(texel_bld,
1613                                                       neighbors[0][1],
1614                                                       chan_swiz);
1615             colors0[3] = lp_build_swizzle_soa_channel(texel_bld,
1616                                                       neighbors[0][0],
1617                                                       chan_swiz);
1618          }
1619          else {
1620             /* Bilinear interpolate the four samples from the 2D image / 3D slice */
1621             for (chan = 0; chan < 4; chan++) {
1622                colors0[chan] = lp_build_lerp_2d(texel_bld,
1623                                                 s_fpart, t_fpart,
1624                                                 neighbors[0][0][chan],
1625                                                 neighbors[0][1][chan],
1626                                                 neighbors[1][0][chan],
1627                                                 neighbors[1][1][chan],
1628                                                 0);
1629             }
1630          }
1631       }
1632       else {
1633          LLVMValueRef cmpval00, cmpval01, cmpval10, cmpval11;
1634          cmpval00 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][0][0]);
1635          cmpval01 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][1][0]);
1636          cmpval10 = lp_build_sample_comparefunc(bld, coords[4], neighbors[1][0][0]);
1637          cmpval11 = lp_build_sample_comparefunc(bld, coords[4], neighbors[1][1][0]);
1638
1639          if (is_gather) {
1640             /* more hacks for swizzling, should be X, ONE or ZERO... */
1641             colors0[0] = lp_build_select(texel_bld, cmpval10,
1642                                          texel_bld->one, texel_bld->zero);
1643             colors0[1] = lp_build_select(texel_bld, cmpval11,
1644                                          texel_bld->one, texel_bld->zero);
1645             colors0[2] = lp_build_select(texel_bld, cmpval01,
1646                                          texel_bld->one, texel_bld->zero);
1647             colors0[3] = lp_build_select(texel_bld, cmpval00,
1648                                          texel_bld->one, texel_bld->zero);
1649          }
1650          else {
1651             colors0[0] = lp_build_masklerp2d(texel_bld, s_fpart, t_fpart,
1652                                              cmpval00, cmpval01, cmpval10, cmpval11);
1653             colors0[1] = colors0[2] = colors0[3] = colors0[0];
1654          }
1655       }
1656
1657       if (accurate_cube_corners) {
1658          LLVMBuildStore(builder, colors0[0], colorss[0]);
1659          LLVMBuildStore(builder, colors0[1], colorss[1]);
1660          LLVMBuildStore(builder, colors0[2], colorss[2]);
1661          LLVMBuildStore(builder, colors0[3], colorss[3]);
1662
1663          lp_build_endif(&corner_if);
1664
1665          colors0[0] = LLVMBuildLoad(builder, colorss[0], "");
1666          colors0[1] = LLVMBuildLoad(builder, colorss[1], "");
1667          colors0[2] = LLVMBuildLoad(builder, colorss[2], "");
1668          colors0[3] = LLVMBuildLoad(builder, colorss[3], "");
1669       }
1670
1671       if (dims == 3) {
1672          LLVMValueRef neighbors1[2][2][4];
1673          LLVMValueRef colors1[4];
1674
1675          assert(!is_gather);
1676
1677          /* get x0/x1/y0/y1 texels at z1 */
1678          lp_build_sample_texel_soa(bld,
1679                                    width_vec, height_vec, depth_vec,
1680                                    x00, y00, z1,
1681                                    row_stride_vec, img_stride_vec,
1682                                    data_ptr, mipoffsets, neighbors1[0][0]);
1683          lp_build_sample_texel_soa(bld,
1684                                    width_vec, height_vec, depth_vec,
1685                                    x01, y01, z1,
1686                                    row_stride_vec, img_stride_vec,
1687                                    data_ptr, mipoffsets, neighbors1[0][1]);
1688          lp_build_sample_texel_soa(bld,
1689                                    width_vec, height_vec, depth_vec,
1690                                    x10, y10, z1,
1691                                    row_stride_vec, img_stride_vec,
1692                                    data_ptr, mipoffsets, neighbors1[1][0]);
1693          lp_build_sample_texel_soa(bld,
1694                                    width_vec, height_vec, depth_vec,
1695                                    x11, y11, z1,
1696                                    row_stride_vec, img_stride_vec,
1697                                    data_ptr, mipoffsets, neighbors1[1][1]);
1698
1699          if (bld->static_sampler_state->compare_mode == PIPE_TEX_COMPARE_NONE) {
1700             /* Bilinear interpolate the four samples from the second Z slice */
1701             for (chan = 0; chan < 4; chan++) {
1702                colors1[chan] = lp_build_lerp_2d(texel_bld,
1703                                                 s_fpart, t_fpart,
1704                                                 neighbors1[0][0][chan],
1705                                                 neighbors1[0][1][chan],
1706                                                 neighbors1[1][0][chan],
1707                                                 neighbors1[1][1][chan],
1708                                                 0);
1709             }
1710             /* Linearly interpolate the two samples from the two 3D slices */
1711             for (chan = 0; chan < 4; chan++) {
1712                colors_out[chan] = lp_build_lerp(texel_bld,
1713                                                 r_fpart,
1714                                                 colors0[chan], colors1[chan],
1715                                                 0);
1716             }
1717          }
1718          else {
1719             LLVMValueRef cmpval00, cmpval01, cmpval10, cmpval11;
1720             cmpval00 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][0][0]);
1721             cmpval01 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][1][0]);
1722             cmpval10 = lp_build_sample_comparefunc(bld, coords[4], neighbors[1][0][0]);
1723             cmpval11 = lp_build_sample_comparefunc(bld, coords[4], neighbors[1][1][0]);
1724             colors1[0] = lp_build_masklerp2d(texel_bld, s_fpart, t_fpart,
1725                                              cmpval00, cmpval01, cmpval10, cmpval11);
1726             /* Linearly interpolate the two samples from the two 3D slices */
1727             colors_out[0] = lp_build_lerp(texel_bld,
1728                                           r_fpart,
1729                                           colors0[0], colors1[0],
1730                                           0);
1731             colors_out[1] = colors_out[2] = colors_out[3] = colors_out[0];
1732          }
1733       }
1734       else {
1735          /* 2D tex */
1736          for (chan = 0; chan < 4; chan++) {
1737             colors_out[chan] = colors0[chan];
1738          }
1739       }
1740    }
1741    if (is_gather) {
1742       /*
1743        * For gather, we can't do our usual channel swizzling done later,
1744        * so do it here. It only really matters for 0/1 swizzles in case
1745        * of comparison filtering, since in this case the results would be
1746        * wrong, without comparison it should all work out alright but it
1747        * can't hurt to do that here, since it will instantly drop all
1748        * calculations above, though it's a rather stupid idea to do
1749        * gather on a channel which will always return 0 or 1 in any case...
1750        */
1751       if (chan_swiz == PIPE_SWIZZLE_1) {
1752          for (chan = 0; chan < 4; chan++) {
1753             colors_out[chan] = texel_bld->one;
1754          }
1755       } else if (chan_swiz == PIPE_SWIZZLE_0) {
1756          for (chan = 0; chan < 4; chan++) {
1757             colors_out[chan] = texel_bld->zero;
1758          }
1759       }
1760    }
1761 }
1762
1763
1764 /**
1765  * Sample the texture/mipmap using given image filter and mip filter.
1766  * ilevel0 and ilevel1 indicate the two mipmap levels to sample
1767  * from (vectors or scalars).
1768  * If we're using nearest miplevel sampling the '1' values will be null/unused.
1769  */
1770 static void
1771 lp_build_sample_mipmap(struct lp_build_sample_context *bld,
1772                        unsigned img_filter,
1773                        unsigned mip_filter,
1774                        boolean is_gather,
1775                        const LLVMValueRef *coords,
1776                        const LLVMValueRef *offsets,
1777                        LLVMValueRef ilevel0,
1778                        LLVMValueRef ilevel1,
1779                        LLVMValueRef lod_fpart,
1780                        LLVMValueRef *colors_out)
1781 {
1782    LLVMBuilderRef builder = bld->gallivm->builder;
1783    LLVMValueRef size0 = NULL;
1784    LLVMValueRef size1 = NULL;
1785    LLVMValueRef row_stride0_vec = NULL;
1786    LLVMValueRef row_stride1_vec = NULL;
1787    LLVMValueRef img_stride0_vec = NULL;
1788    LLVMValueRef img_stride1_vec = NULL;
1789    LLVMValueRef data_ptr0 = NULL;
1790    LLVMValueRef data_ptr1 = NULL;
1791    LLVMValueRef mipoff0 = NULL;
1792    LLVMValueRef mipoff1 = NULL;
1793    LLVMValueRef colors0[4], colors1[4];
1794    unsigned chan;
1795
1796    /* sample the first mipmap level */
1797    lp_build_mipmap_level_sizes(bld, ilevel0,
1798                                &size0,
1799                                &row_stride0_vec, &img_stride0_vec);
1800    if (bld->num_mips == 1) {
1801       data_ptr0 = lp_build_get_mipmap_level(bld, ilevel0);
1802    }
1803    else {
1804       /* This path should work for num_lods 1 too but slightly less efficient */
1805       data_ptr0 = bld->base_ptr;
1806       mipoff0 = lp_build_get_mip_offsets(bld, ilevel0);
1807    }
1808    if (img_filter == PIPE_TEX_FILTER_NEAREST) {
1809       lp_build_sample_image_nearest(bld, size0,
1810                                     row_stride0_vec, img_stride0_vec,
1811                                     data_ptr0, mipoff0, coords, offsets,
1812                                     colors0);
1813    }
1814    else {
1815       assert(img_filter == PIPE_TEX_FILTER_LINEAR);
1816       lp_build_sample_image_linear(bld, is_gather, size0, NULL,
1817                                    row_stride0_vec, img_stride0_vec,
1818                                    data_ptr0, mipoff0, coords, offsets,
1819                                    colors0);
1820    }
1821
1822    /* Store the first level's colors in the output variables */
1823    for (chan = 0; chan < 4; chan++) {
1824        LLVMBuildStore(builder, colors0[chan], colors_out[chan]);
1825    }
1826
1827    if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
1828       struct lp_build_if_state if_ctx;
1829       LLVMValueRef need_lerp;
1830
1831       /* need_lerp = lod_fpart > 0 */
1832       if (bld->num_lods == 1) {
1833          need_lerp = LLVMBuildFCmp(builder, LLVMRealUGT,
1834                                    lod_fpart, bld->lodf_bld.zero,
1835                                    "need_lerp");
1836       }
1837       else {
1838          /*
1839           * We'll do mip filtering if any of the quads (or individual
1840           * pixel in case of per-pixel lod) need it.
1841           * It might be better to split the vectors here and only fetch/filter
1842           * quads which need it (if there's one lod per quad).
1843           */
1844          need_lerp = lp_build_compare(bld->gallivm, bld->lodf_bld.type,
1845                                       PIPE_FUNC_GREATER,
1846                                       lod_fpart, bld->lodf_bld.zero);
1847          need_lerp = lp_build_any_true_range(&bld->lodi_bld, bld->num_lods, need_lerp);
1848          lp_build_name(need_lerp, "need_lerp");
1849       }
1850
1851       lp_build_if(&if_ctx, bld->gallivm, need_lerp);
1852       {
1853          /*
1854           * We unfortunately need to clamp lod_fpart here since we can get
1855           * negative values which would screw up filtering if not all
1856           * lod_fpart values have same sign.
1857           */
1858          lod_fpart = lp_build_max(&bld->lodf_bld, lod_fpart,
1859                                   bld->lodf_bld.zero);
1860          /* sample the second mipmap level */
1861          lp_build_mipmap_level_sizes(bld, ilevel1,
1862                                      &size1,
1863                                      &row_stride1_vec, &img_stride1_vec);
1864          if (bld->num_mips == 1) {
1865             data_ptr1 = lp_build_get_mipmap_level(bld, ilevel1);
1866          }
1867          else {
1868             data_ptr1 = bld->base_ptr;
1869             mipoff1 = lp_build_get_mip_offsets(bld, ilevel1);
1870          }
1871          if (img_filter == PIPE_TEX_FILTER_NEAREST) {
1872             lp_build_sample_image_nearest(bld, size1,
1873                                           row_stride1_vec, img_stride1_vec,
1874                                           data_ptr1, mipoff1, coords, offsets,
1875                                           colors1);
1876          }
1877          else {
1878             lp_build_sample_image_linear(bld, FALSE, size1, NULL,
1879                                          row_stride1_vec, img_stride1_vec,
1880                                          data_ptr1, mipoff1, coords, offsets,
1881                                          colors1);
1882          }
1883
1884          /* interpolate samples from the two mipmap levels */
1885
1886          if (bld->num_lods != bld->coord_type.length)
1887             lod_fpart = lp_build_unpack_broadcast_aos_scalars(bld->gallivm,
1888                                                               bld->lodf_bld.type,
1889                                                               bld->texel_bld.type,
1890                                                               lod_fpart);
1891
1892          for (chan = 0; chan < 4; chan++) {
1893             colors0[chan] = lp_build_lerp(&bld->texel_bld, lod_fpart,
1894                                           colors0[chan], colors1[chan],
1895                                           0);
1896             LLVMBuildStore(builder, colors0[chan], colors_out[chan]);
1897          }
1898       }
1899       lp_build_endif(&if_ctx);
1900    }
1901 }
1902
1903
1904 /**
1905  * Sample the texture/mipmap using given mip filter, and using
1906  * both nearest and linear filtering at the same time depending
1907  * on linear_mask.
1908  * lod can be per quad but linear_mask is always per pixel.
1909  * ilevel0 and ilevel1 indicate the two mipmap levels to sample
1910  * from (vectors or scalars).
1911  * If we're using nearest miplevel sampling the '1' values will be null/unused.
1912  */
1913 static void
1914 lp_build_sample_mipmap_both(struct lp_build_sample_context *bld,
1915                             LLVMValueRef linear_mask,
1916                             unsigned mip_filter,
1917                             const LLVMValueRef *coords,
1918                             const LLVMValueRef *offsets,
1919                             LLVMValueRef ilevel0,
1920                             LLVMValueRef ilevel1,
1921                             LLVMValueRef lod_fpart,
1922                             LLVMValueRef lod_positive,
1923                             LLVMValueRef *colors_out)
1924 {
1925    LLVMBuilderRef builder = bld->gallivm->builder;
1926    LLVMValueRef size0 = NULL;
1927    LLVMValueRef size1 = NULL;
1928    LLVMValueRef row_stride0_vec = NULL;
1929    LLVMValueRef row_stride1_vec = NULL;
1930    LLVMValueRef img_stride0_vec = NULL;
1931    LLVMValueRef img_stride1_vec = NULL;
1932    LLVMValueRef data_ptr0 = NULL;
1933    LLVMValueRef data_ptr1 = NULL;
1934    LLVMValueRef mipoff0 = NULL;
1935    LLVMValueRef mipoff1 = NULL;
1936    LLVMValueRef colors0[4], colors1[4];
1937    unsigned chan;
1938
1939    /* sample the first mipmap level */
1940    lp_build_mipmap_level_sizes(bld, ilevel0,
1941                                &size0,
1942                                &row_stride0_vec, &img_stride0_vec);
1943    if (bld->num_mips == 1) {
1944       data_ptr0 = lp_build_get_mipmap_level(bld, ilevel0);
1945    }
1946    else {
1947       /* This path should work for num_lods 1 too but slightly less efficient */
1948       data_ptr0 = bld->base_ptr;
1949       mipoff0 = lp_build_get_mip_offsets(bld, ilevel0);
1950    }
1951
1952    lp_build_sample_image_linear(bld, FALSE, size0, linear_mask,
1953                                 row_stride0_vec, img_stride0_vec,
1954                                 data_ptr0, mipoff0, coords, offsets,
1955                                 colors0);
1956
1957    /* Store the first level's colors in the output variables */
1958    for (chan = 0; chan < 4; chan++) {
1959        LLVMBuildStore(builder, colors0[chan], colors_out[chan]);
1960    }
1961
1962    if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
1963       struct lp_build_if_state if_ctx;
1964       LLVMValueRef need_lerp;
1965
1966       /*
1967        * We'll do mip filtering if any of the quads (or individual
1968        * pixel in case of per-pixel lod) need it.
1969        * Note using lod_positive here not lod_fpart since it may be the same
1970        * condition as that used in the outer "if" in the caller hence llvm
1971        * should be able to merge the branches in this case.
1972        */
1973       need_lerp = lp_build_any_true_range(&bld->lodi_bld, bld->num_lods, lod_positive);
1974       lp_build_name(need_lerp, "need_lerp");
1975
1976       lp_build_if(&if_ctx, bld->gallivm, need_lerp);
1977       {
1978          /*
1979           * We unfortunately need to clamp lod_fpart here since we can get
1980           * negative values which would screw up filtering if not all
1981           * lod_fpart values have same sign.
1982           */
1983          lod_fpart = lp_build_max(&bld->lodf_bld, lod_fpart,
1984                                   bld->lodf_bld.zero);
1985          /* sample the second mipmap level */
1986          lp_build_mipmap_level_sizes(bld, ilevel1,
1987                                      &size1,
1988                                      &row_stride1_vec, &img_stride1_vec);
1989          if (bld->num_mips == 1) {
1990             data_ptr1 = lp_build_get_mipmap_level(bld, ilevel1);
1991          }
1992          else {
1993             data_ptr1 = bld->base_ptr;
1994             mipoff1 = lp_build_get_mip_offsets(bld, ilevel1);
1995          }
1996
1997          lp_build_sample_image_linear(bld, FALSE, size1, linear_mask,
1998                                       row_stride1_vec, img_stride1_vec,
1999                                       data_ptr1, mipoff1, coords, offsets,
2000                                       colors1);
2001
2002          /* interpolate samples from the two mipmap levels */
2003
2004          if (bld->num_lods != bld->coord_type.length)
2005             lod_fpart = lp_build_unpack_broadcast_aos_scalars(bld->gallivm,
2006                                                               bld->lodf_bld.type,
2007                                                               bld->texel_bld.type,
2008                                                               lod_fpart);
2009
2010          for (chan = 0; chan < 4; chan++) {
2011             colors0[chan] = lp_build_lerp(&bld->texel_bld, lod_fpart,
2012                                           colors0[chan], colors1[chan],
2013                                           0);
2014             LLVMBuildStore(builder, colors0[chan], colors_out[chan]);
2015          }
2016       }
2017       lp_build_endif(&if_ctx);
2018    }
2019 }
2020
2021
2022 /**
2023  * Build (per-coord) layer value.
2024  * Either clamp layer to valid values or fill in optional out_of_bounds
2025  * value and just return value unclamped.
2026  */
2027 static LLVMValueRef
2028 lp_build_layer_coord(struct lp_build_sample_context *bld,
2029                      unsigned texture_unit,
2030                      boolean is_cube_array,
2031                      LLVMValueRef layer,
2032                      LLVMValueRef *out_of_bounds)
2033 {
2034    LLVMValueRef num_layers;
2035    struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
2036
2037    num_layers = bld->dynamic_state->depth(bld->dynamic_state, bld->gallivm,
2038                                           bld->context_ptr, texture_unit, NULL);
2039
2040    if (out_of_bounds) {
2041       LLVMValueRef out1, out;
2042       assert(!is_cube_array);
2043       num_layers = lp_build_broadcast_scalar(int_coord_bld, num_layers);
2044       out = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, layer, int_coord_bld->zero);
2045       out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, layer, num_layers);
2046       *out_of_bounds = lp_build_or(int_coord_bld, out, out1);
2047       return layer;
2048    }
2049    else {
2050       LLVMValueRef maxlayer;
2051       LLVMValueRef s = is_cube_array ? lp_build_const_int32(bld->gallivm, 6) :
2052                                        bld->int_bld.one;
2053       maxlayer = lp_build_sub(&bld->int_bld, num_layers, s);
2054       maxlayer = lp_build_broadcast_scalar(int_coord_bld, maxlayer);
2055       return lp_build_clamp(int_coord_bld, layer, int_coord_bld->zero, maxlayer);
2056    }
2057 }
2058
2059
2060 /**
2061  * Calculate cube face, lod, mip levels.
2062  */
2063 static void
2064 lp_build_sample_common(struct lp_build_sample_context *bld,
2065                        boolean is_lodq,
2066                        unsigned texture_index,
2067                        unsigned sampler_index,
2068                        LLVMValueRef *coords,
2069                        const struct lp_derivatives *derivs, /* optional */
2070                        LLVMValueRef lod_bias, /* optional */
2071                        LLVMValueRef explicit_lod, /* optional */
2072                        LLVMValueRef *lod_pos_or_zero,
2073                        LLVMValueRef *lod,
2074                        LLVMValueRef *lod_fpart,
2075                        LLVMValueRef *ilevel0,
2076                        LLVMValueRef *ilevel1)
2077 {
2078    const unsigned mip_filter = bld->static_sampler_state->min_mip_filter;
2079    const unsigned min_filter = bld->static_sampler_state->min_img_filter;
2080    const unsigned mag_filter = bld->static_sampler_state->mag_img_filter;
2081    const unsigned target = bld->static_texture_state->target;
2082    LLVMValueRef first_level, cube_rho = NULL;
2083    LLVMValueRef lod_ipart = NULL;
2084    struct lp_derivatives cube_derivs;
2085
2086    /*
2087    printf("%s mip %d  min %d  mag %d\n", __FUNCTION__,
2088           mip_filter, min_filter, mag_filter);
2089    */
2090
2091    /*
2092     * Choose cube face, recompute texcoords for the chosen face and
2093     * compute rho here too (as it requires transform of derivatives).
2094     */
2095    if (target == PIPE_TEXTURE_CUBE || target == PIPE_TEXTURE_CUBE_ARRAY) {
2096       boolean need_derivs;
2097       need_derivs = ((min_filter != mag_filter ||
2098                       mip_filter != PIPE_TEX_MIPFILTER_NONE) &&
2099                       !bld->static_sampler_state->min_max_lod_equal &&
2100                       !explicit_lod);
2101       lp_build_cube_lookup(bld, coords, derivs, &cube_rho, &cube_derivs, need_derivs);
2102       derivs = &cube_derivs;
2103       if (target == PIPE_TEXTURE_CUBE_ARRAY && !is_lodq) {
2104          /* calculate cube layer coord now */
2105          LLVMValueRef layer = lp_build_iround(&bld->coord_bld, coords[3]);
2106          LLVMValueRef six = lp_build_const_int_vec(bld->gallivm, bld->int_coord_type, 6);
2107          layer = lp_build_mul(&bld->int_coord_bld, layer, six);
2108          coords[3] = lp_build_layer_coord(bld, texture_index, TRUE, layer, NULL);
2109          /* because of seamless filtering can't add it to face (coords[2]) here. */
2110       }
2111    }
2112    else if ((target == PIPE_TEXTURE_1D_ARRAY ||
2113              target == PIPE_TEXTURE_2D_ARRAY) && !is_lodq) {
2114       coords[2] = lp_build_iround(&bld->coord_bld, coords[2]);
2115       coords[2] = lp_build_layer_coord(bld, texture_index, FALSE, coords[2], NULL);
2116    }
2117
2118    if (bld->static_sampler_state->compare_mode != PIPE_TEX_COMPARE_NONE) {
2119       /*
2120        * Clamp p coords to [0,1] for fixed function depth texture format here.
2121        * Technically this is not entirely correct for unorm depth as the ref value
2122        * should be converted to the depth format (quantization!) and comparison
2123        * then done in texture format. This would actually help performance (since
2124        * only need to do it once and could save the per-sample conversion of texels
2125        * to floats instead), but it would need more messy code (would need to push
2126        * at least some bits down to actual fetch so conversion could be skipped,
2127        * and would have ugly interaction with border color, would need to convert
2128        * border color to that format too or do some other tricks to make it work).
2129        */
2130       const struct util_format_description *format_desc = bld->format_desc;
2131       unsigned chan_type;
2132       /* not entirely sure we couldn't end up with non-valid swizzle here */
2133       chan_type = format_desc->swizzle[0] <= PIPE_SWIZZLE_W ?
2134                      format_desc->channel[format_desc->swizzle[0]].type :
2135                      UTIL_FORMAT_TYPE_FLOAT;
2136       if (chan_type != UTIL_FORMAT_TYPE_FLOAT) {
2137          coords[4] = lp_build_clamp(&bld->coord_bld, coords[4],
2138                                     bld->coord_bld.zero, bld->coord_bld.one);
2139       }
2140    }
2141
2142    /*
2143     * Compute the level of detail (float).
2144     */
2145    if (min_filter != mag_filter ||
2146        mip_filter != PIPE_TEX_MIPFILTER_NONE || is_lodq) {
2147       /* Need to compute lod either to choose mipmap levels or to
2148        * distinguish between minification/magnification with one mipmap level.
2149        */
2150       lp_build_lod_selector(bld, is_lodq, texture_index, sampler_index,
2151                             coords[0], coords[1], coords[2], cube_rho,
2152                             derivs, lod_bias, explicit_lod,
2153                             mip_filter, lod,
2154                             &lod_ipart, lod_fpart, lod_pos_or_zero);
2155       if (is_lodq) {
2156          LLVMValueRef last_level;
2157          last_level = bld->dynamic_state->last_level(bld->dynamic_state,
2158                                                      bld->gallivm,
2159                                                      bld->context_ptr,
2160                                                      texture_index, NULL);
2161          first_level = bld->dynamic_state->first_level(bld->dynamic_state,
2162                                                        bld->gallivm,
2163                                                        bld->context_ptr,
2164                                                        texture_index, NULL);
2165          last_level = lp_build_sub(&bld->int_bld, last_level, first_level);
2166          last_level = lp_build_int_to_float(&bld->float_bld, last_level);
2167          last_level = lp_build_broadcast_scalar(&bld->lodf_bld, last_level);
2168
2169          switch (mip_filter) {
2170          case PIPE_TEX_MIPFILTER_NONE:
2171             *lod_fpart = bld->lodf_bld.zero;
2172             break;
2173          case PIPE_TEX_MIPFILTER_NEAREST:
2174              *lod_fpart = lp_build_round(&bld->lodf_bld, *lod_fpart);
2175              /* fallthrough */
2176          case PIPE_TEX_MIPFILTER_LINEAR:
2177             *lod_fpart = lp_build_clamp(&bld->lodf_bld, *lod_fpart,
2178                                         bld->lodf_bld.zero, last_level);
2179             break;
2180          }
2181          return;
2182       }
2183
2184    } else {
2185       lod_ipart = bld->lodi_bld.zero;
2186       *lod_pos_or_zero = bld->lodi_bld.zero;
2187    }
2188
2189    if (bld->num_lods != bld->num_mips) {
2190       /* only makes sense if there's just a single mip level */
2191       assert(bld->num_mips == 1);
2192       lod_ipart = lp_build_extract_range(bld->gallivm, lod_ipart, 0, 1);
2193    }
2194
2195    /*
2196     * Compute integer mipmap level(s) to fetch texels from: ilevel0, ilevel1
2197     */
2198    switch (mip_filter) {
2199    default:
2200       assert(0 && "bad mip_filter value in lp_build_sample_soa()");
2201       /* fall-through */
2202    case PIPE_TEX_MIPFILTER_NONE:
2203       /* always use mip level 0 */
2204       first_level = bld->dynamic_state->first_level(bld->dynamic_state,
2205                                                     bld->gallivm, bld->context_ptr,
2206                                                     texture_index, NULL);
2207       first_level = lp_build_broadcast_scalar(&bld->leveli_bld, first_level);
2208       *ilevel0 = first_level;
2209       break;
2210    case PIPE_TEX_MIPFILTER_NEAREST:
2211       assert(lod_ipart);
2212       lp_build_nearest_mip_level(bld, texture_index, lod_ipart, ilevel0, NULL);
2213       break;
2214    case PIPE_TEX_MIPFILTER_LINEAR:
2215       assert(lod_ipart);
2216       assert(*lod_fpart);
2217       lp_build_linear_mip_levels(bld, texture_index,
2218                                  lod_ipart, lod_fpart,
2219                                  ilevel0, ilevel1);
2220       break;
2221    }
2222 }
2223
2224 static void
2225 lp_build_clamp_border_color(struct lp_build_sample_context *bld,
2226                             unsigned sampler_unit)
2227 {
2228    struct gallivm_state *gallivm = bld->gallivm;
2229    LLVMBuilderRef builder = gallivm->builder;
2230    LLVMValueRef border_color_ptr =
2231       bld->dynamic_state->border_color(bld->dynamic_state, gallivm,
2232                                        bld->context_ptr, sampler_unit);
2233    LLVMValueRef border_color;
2234    const struct util_format_description *format_desc = bld->format_desc;
2235    struct lp_type vec4_type = bld->texel_type;
2236    struct lp_build_context vec4_bld;
2237    LLVMValueRef min_clamp = NULL;
2238    LLVMValueRef max_clamp = NULL;
2239
2240    /*
2241     * For normalized format need to clamp border color (technically
2242     * probably should also quantize the data). Really sucks doing this
2243     * here but can't avoid at least for now since this is part of
2244     * sampler state and texture format is part of sampler_view state.
2245     * GL expects also expects clamping for uint/sint formats too so
2246     * do that as well (d3d10 can't end up here with uint/sint since it
2247     * only supports them with ld).
2248     */
2249    vec4_type.length = 4;
2250    lp_build_context_init(&vec4_bld, gallivm, vec4_type);
2251
2252    /*
2253     * Vectorized clamping of border color. Loading is a bit of a hack since
2254     * we just cast the pointer to float array to pointer to vec4
2255     * (int or float).
2256     */
2257    border_color_ptr = lp_build_array_get_ptr(gallivm, border_color_ptr,
2258                                              lp_build_const_int32(gallivm, 0));
2259    border_color_ptr = LLVMBuildBitCast(builder, border_color_ptr,
2260                                        LLVMPointerType(vec4_bld.vec_type, 0), "");
2261    border_color = LLVMBuildLoad(builder, border_color_ptr, "");
2262    /* we don't have aligned type in the dynamic state unfortunately */
2263    LLVMSetAlignment(border_color, 4);
2264
2265    /*
2266     * Instead of having some incredibly complex logic which will try to figure out
2267     * clamping necessary for each channel, simply use the first channel, and treat
2268     * mixed signed/unsigned normalized formats specially.
2269     * (Mixed non-normalized, which wouldn't work at all here, do not exist for a
2270     * good reason.)
2271     */
2272    if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN) {
2273       int chan;
2274       /* d/s needs special handling because both present means just sampling depth */
2275       if (util_format_is_depth_and_stencil(format_desc->format)) {
2276          chan = format_desc->swizzle[0];
2277       }
2278       else {
2279          chan = util_format_get_first_non_void_channel(format_desc->format);
2280       }
2281       if (chan >= 0 && chan <= PIPE_SWIZZLE_W) {
2282          unsigned chan_type = format_desc->channel[chan].type;
2283          unsigned chan_norm = format_desc->channel[chan].normalized;
2284          unsigned chan_pure = format_desc->channel[chan].pure_integer;
2285          if (chan_type == UTIL_FORMAT_TYPE_SIGNED) {
2286             if (chan_norm) {
2287                min_clamp = lp_build_const_vec(gallivm, vec4_type, -1.0F);
2288                max_clamp = vec4_bld.one;
2289             }
2290             else if (chan_pure) {
2291                /*
2292                 * Border color was stored as int, hence need min/max clamp
2293                 * only if chan has less than 32 bits..
2294                 */
2295                unsigned chan_size = format_desc->channel[chan].size;
2296                if (chan_size < 32) {
2297                   min_clamp = lp_build_const_int_vec(gallivm, vec4_type,
2298                                                      0 - (1 << (chan_size - 1)));
2299                   max_clamp = lp_build_const_int_vec(gallivm, vec4_type,
2300                                                      (1 << (chan_size - 1)) - 1);
2301                }
2302             }
2303             /* TODO: no idea about non-pure, non-normalized! */
2304          }
2305          else if (chan_type == UTIL_FORMAT_TYPE_UNSIGNED) {
2306             if (chan_norm) {
2307                min_clamp = vec4_bld.zero;
2308                max_clamp = vec4_bld.one;
2309             }
2310             /*
2311              * Need a ugly hack here, because we don't have Z32_FLOAT_X8X24
2312              * we use Z32_FLOAT_S8X24 to imply sampling depth component
2313              * and ignoring stencil, which will blow up here if we try to
2314              * do a uint clamp in a float texel build...
2315              * And even if we had that format, mesa st also thinks using z24s8
2316              * means depth sampling ignoring stencil.
2317              */
2318             else if (chan_pure) {
2319                /*
2320                 * Border color was stored as uint, hence never need min
2321                 * clamp, and only need max clamp if chan has less than 32 bits.
2322                 */
2323                unsigned chan_size = format_desc->channel[chan].size;
2324                if (chan_size < 32) {
2325                   max_clamp = lp_build_const_int_vec(gallivm, vec4_type,
2326                                                      (1 << chan_size) - 1);
2327                }
2328                /* TODO: no idea about non-pure, non-normalized! */
2329             }
2330          }
2331          else if (chan_type == UTIL_FORMAT_TYPE_FIXED) {
2332             /* TODO: I have no idea what clamp this would need if any! */
2333          }
2334       }
2335       /* mixed plain formats (or different pure size) */
2336       switch (format_desc->format) {
2337       case PIPE_FORMAT_B10G10R10A2_UINT:
2338       case PIPE_FORMAT_R10G10B10A2_UINT:
2339       {
2340          unsigned max10 = (1 << 10) - 1;
2341          max_clamp = lp_build_const_aos(gallivm, vec4_type, max10, max10,
2342                                         max10, (1 << 2) - 1, NULL);
2343       }
2344          break;
2345       case PIPE_FORMAT_R10SG10SB10SA2U_NORM:
2346          min_clamp = lp_build_const_aos(gallivm, vec4_type, -1.0F, -1.0F,
2347                                         -1.0F, 0.0F, NULL);
2348          max_clamp = vec4_bld.one;
2349          break;
2350       case PIPE_FORMAT_R8SG8SB8UX8U_NORM:
2351       case PIPE_FORMAT_R5SG5SB6U_NORM:
2352          min_clamp = lp_build_const_aos(gallivm, vec4_type, -1.0F, -1.0F,
2353                                         0.0F, 0.0F, NULL);
2354          max_clamp = vec4_bld.one;
2355          break;
2356       default:
2357          break;
2358       }
2359    }
2360    else {
2361       /* cannot figure this out from format description */
2362       if (format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
2363          /* s3tc formats are always unorm */
2364          min_clamp = vec4_bld.zero;
2365          max_clamp = vec4_bld.one;
2366       }
2367       else if (format_desc->layout == UTIL_FORMAT_LAYOUT_RGTC ||
2368                format_desc->layout == UTIL_FORMAT_LAYOUT_ETC ||
2369                format_desc->layout == UTIL_FORMAT_LAYOUT_BPTC) {
2370          switch (format_desc->format) {
2371          case PIPE_FORMAT_RGTC1_UNORM:
2372          case PIPE_FORMAT_RGTC2_UNORM:
2373          case PIPE_FORMAT_LATC1_UNORM:
2374          case PIPE_FORMAT_LATC2_UNORM:
2375          case PIPE_FORMAT_ETC1_RGB8:
2376          case PIPE_FORMAT_BPTC_RGBA_UNORM:
2377          case PIPE_FORMAT_BPTC_SRGBA:
2378             min_clamp = vec4_bld.zero;
2379             max_clamp = vec4_bld.one;
2380             break;
2381          case PIPE_FORMAT_RGTC1_SNORM:
2382          case PIPE_FORMAT_RGTC2_SNORM:
2383          case PIPE_FORMAT_LATC1_SNORM:
2384          case PIPE_FORMAT_LATC2_SNORM:
2385             min_clamp = lp_build_const_vec(gallivm, vec4_type, -1.0F);
2386             max_clamp = vec4_bld.one;
2387             break;
2388          case PIPE_FORMAT_BPTC_RGB_FLOAT:
2389             /* not sure if we should clamp to max half float? */
2390             break;
2391          case PIPE_FORMAT_BPTC_RGB_UFLOAT:
2392             min_clamp = vec4_bld.zero;
2393             break;
2394          default:
2395             assert(0);
2396             break;
2397          }
2398       }
2399       /*
2400        * all others from subsampled/other group, though we don't care
2401        * about yuv (and should not have any from zs here)
2402        */
2403       else if (format_desc->colorspace != UTIL_FORMAT_COLORSPACE_YUV){
2404          switch (format_desc->format) {
2405          case PIPE_FORMAT_R8G8_B8G8_UNORM:
2406          case PIPE_FORMAT_G8R8_G8B8_UNORM:
2407          case PIPE_FORMAT_G8R8_B8R8_UNORM:
2408          case PIPE_FORMAT_R8G8_R8B8_UNORM:
2409          case PIPE_FORMAT_R1_UNORM: /* doesn't make sense but ah well */
2410             min_clamp = vec4_bld.zero;
2411             max_clamp = vec4_bld.one;
2412             break;
2413          case PIPE_FORMAT_R8G8Bx_SNORM:
2414             min_clamp = lp_build_const_vec(gallivm, vec4_type, -1.0F);
2415             max_clamp = vec4_bld.one;
2416             break;
2417             /*
2418              * Note smallfloat formats usually don't need clamping
2419              * (they still have infinite range) however this is not
2420              * true for r11g11b10 and r9g9b9e5, which can't represent
2421              * negative numbers (and additionally r9g9b9e5 can't represent
2422              * very large numbers). d3d10 seems happy without clamping in
2423              * this case, but gl spec is pretty clear: "for floating
2424              * point and integer formats, border values are clamped to
2425              * the representable range of the format" so do that here.
2426              */
2427          case PIPE_FORMAT_R11G11B10_FLOAT:
2428             min_clamp = vec4_bld.zero;
2429             break;
2430          case PIPE_FORMAT_R9G9B9E5_FLOAT:
2431             min_clamp = vec4_bld.zero;
2432             max_clamp = lp_build_const_vec(gallivm, vec4_type, MAX_RGB9E5);
2433             break;
2434          default:
2435             assert(0);
2436             break;
2437          }
2438       }
2439    }
2440
2441    if (min_clamp) {
2442       border_color = lp_build_max(&vec4_bld, border_color, min_clamp);
2443    }
2444    if (max_clamp) {
2445       border_color = lp_build_min(&vec4_bld, border_color, max_clamp);
2446    }
2447
2448    bld->border_color_clamped = border_color;
2449 }
2450
2451
2452 /**
2453  * General texture sampling codegen.
2454  * This function handles texture sampling for all texture targets (1D,
2455  * 2D, 3D, cube) and all filtering modes.
2456  */
2457 static void
2458 lp_build_sample_general(struct lp_build_sample_context *bld,
2459                         unsigned sampler_unit,
2460                         boolean is_gather,
2461                         const LLVMValueRef *coords,
2462                         const LLVMValueRef *offsets,
2463                         LLVMValueRef lod_positive,
2464                         LLVMValueRef lod_fpart,
2465                         LLVMValueRef ilevel0,
2466                         LLVMValueRef ilevel1,
2467                         LLVMValueRef *colors_out)
2468 {
2469    LLVMBuilderRef builder = bld->gallivm->builder;
2470    const struct lp_static_sampler_state *sampler_state = bld->static_sampler_state;
2471    const unsigned mip_filter = sampler_state->min_mip_filter;
2472    const unsigned min_filter = sampler_state->min_img_filter;
2473    const unsigned mag_filter = sampler_state->mag_img_filter;
2474    LLVMValueRef texels[4];
2475    unsigned chan;
2476
2477    /* if we need border color, (potentially) clamp it now */
2478    if (lp_sampler_wrap_mode_uses_border_color(sampler_state->wrap_s,
2479                                               min_filter,
2480                                               mag_filter) ||
2481        (bld->dims > 1 &&
2482            lp_sampler_wrap_mode_uses_border_color(sampler_state->wrap_t,
2483                                                   min_filter,
2484                                                   mag_filter)) ||
2485        (bld->dims > 2 &&
2486            lp_sampler_wrap_mode_uses_border_color(sampler_state->wrap_r,
2487                                                   min_filter,
2488                                                   mag_filter))) {
2489       lp_build_clamp_border_color(bld, sampler_unit);
2490    }
2491
2492
2493    /*
2494     * Get/interpolate texture colors.
2495     */
2496
2497    for (chan = 0; chan < 4; ++chan) {
2498      texels[chan] = lp_build_alloca(bld->gallivm, bld->texel_bld.vec_type, "");
2499      lp_build_name(texels[chan], "sampler%u_texel_%c_var", sampler_unit, "xyzw"[chan]);
2500    }
2501
2502    if (min_filter == mag_filter) {
2503       /* no need to distinguish between minification and magnification */
2504       lp_build_sample_mipmap(bld, min_filter, mip_filter,
2505                              is_gather,
2506                              coords, offsets,
2507                              ilevel0, ilevel1, lod_fpart,
2508                              texels);
2509    }
2510    else {
2511       /*
2512        * Could also get rid of the if-logic and always use mipmap_both, both
2513        * for the single lod and multi-lod case if nothing really uses this.
2514        */
2515       if (bld->num_lods == 1) {
2516          /* Emit conditional to choose min image filter or mag image filter
2517           * depending on the lod being > 0 or <= 0, respectively.
2518           */
2519          struct lp_build_if_state if_ctx;
2520
2521          lod_positive = LLVMBuildTrunc(builder, lod_positive,
2522                                        LLVMInt1TypeInContext(bld->gallivm->context),
2523                                        "lod_pos");
2524
2525          lp_build_if(&if_ctx, bld->gallivm, lod_positive);
2526          {
2527             /* Use the minification filter */
2528             lp_build_sample_mipmap(bld, min_filter, mip_filter, FALSE,
2529                                    coords, offsets,
2530                                    ilevel0, ilevel1, lod_fpart,
2531                                    texels);
2532          }
2533          lp_build_else(&if_ctx);
2534          {
2535             /* Use the magnification filter */
2536             lp_build_sample_mipmap(bld, mag_filter, PIPE_TEX_MIPFILTER_NONE,
2537                                    FALSE,
2538                                    coords, offsets,
2539                                    ilevel0, NULL, NULL,
2540                                    texels);
2541          }
2542          lp_build_endif(&if_ctx);
2543       }
2544       else {
2545          LLVMValueRef need_linear, linear_mask;
2546          unsigned mip_filter_for_nearest;
2547          struct lp_build_if_state if_ctx;
2548
2549          if (min_filter == PIPE_TEX_FILTER_LINEAR) {
2550             linear_mask = lod_positive;
2551             mip_filter_for_nearest = PIPE_TEX_MIPFILTER_NONE;
2552          }
2553          else {
2554             linear_mask = lp_build_not(&bld->lodi_bld, lod_positive);
2555             mip_filter_for_nearest = mip_filter;
2556          }
2557          need_linear = lp_build_any_true_range(&bld->lodi_bld, bld->num_lods,
2558                                                linear_mask);
2559          lp_build_name(need_linear, "need_linear");
2560
2561          if (bld->num_lods != bld->coord_type.length) {
2562             linear_mask = lp_build_unpack_broadcast_aos_scalars(bld->gallivm,
2563                                                                 bld->lodi_type,
2564                                                                 bld->int_coord_type,
2565                                                                 linear_mask);
2566          }
2567
2568          lp_build_if(&if_ctx, bld->gallivm, need_linear);
2569          {
2570             /*
2571              * Do sampling with both filters simultaneously. This means using
2572              * a linear filter and doing some tricks (with weights) for the pixels
2573              * which need nearest filter.
2574              * Note that it's probably rare some pixels need nearest and some
2575              * linear filter but the fixups required for the nearest pixels
2576              * aren't all that complicated so just always run a combined path
2577              * if at least some pixels require linear.
2578              */
2579             lp_build_sample_mipmap_both(bld, linear_mask, mip_filter,
2580                                         coords, offsets,
2581                                         ilevel0, ilevel1,
2582                                         lod_fpart, lod_positive,
2583                                         texels);
2584          }
2585          lp_build_else(&if_ctx);
2586          {
2587             /*
2588              * All pixels require just nearest filtering, which is way
2589              * cheaper than linear, hence do a separate path for that.
2590              */
2591             lp_build_sample_mipmap(bld, PIPE_TEX_FILTER_NEAREST,
2592                                    mip_filter_for_nearest, FALSE,
2593                                    coords, offsets,
2594                                    ilevel0, ilevel1, lod_fpart,
2595                                    texels);
2596          }
2597          lp_build_endif(&if_ctx);
2598       }
2599    }
2600
2601    for (chan = 0; chan < 4; ++chan) {
2602      colors_out[chan] = LLVMBuildLoad(builder, texels[chan], "");
2603      lp_build_name(colors_out[chan], "sampler%u_texel_%c", sampler_unit, "xyzw"[chan]);
2604    }
2605 }
2606
2607
2608 /**
2609  * Texel fetch function.
2610  * In contrast to general sampling there is no filtering, no coord minification,
2611  * lod (if any) is always explicit uint, coords are uints (in terms of texel units)
2612  * directly to be applied to the selected mip level (after adding texel offsets).
2613  * This function handles texel fetch for all targets where texel fetch is supported
2614  * (no cube maps, but 1d, 2d, 3d are supported, arrays and buffers should be too).
2615  */
2616 static void
2617 lp_build_fetch_texel(struct lp_build_sample_context *bld,
2618                      unsigned texture_unit,
2619                      LLVMValueRef ms_index,
2620                      const LLVMValueRef *coords,
2621                      LLVMValueRef explicit_lod,
2622                      const LLVMValueRef *offsets,
2623                      LLVMValueRef *colors_out)
2624 {
2625    struct lp_build_context *perquadi_bld = &bld->lodi_bld;
2626    struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
2627    unsigned dims = bld->dims, chan;
2628    unsigned target = bld->static_texture_state->target;
2629    boolean out_of_bound_ret_zero = TRUE;
2630    LLVMValueRef size, ilevel;
2631    LLVMValueRef row_stride_vec = NULL, img_stride_vec = NULL;
2632    LLVMValueRef x = coords[0], y = coords[1], z = coords[2];
2633    LLVMValueRef width, height, depth, i, j;
2634    LLVMValueRef offset, out_of_bounds, out1;
2635
2636    out_of_bounds = int_coord_bld->zero;
2637
2638    if (explicit_lod && bld->static_texture_state->target != PIPE_BUFFER) {
2639       if (bld->num_mips != int_coord_bld->type.length) {
2640          ilevel = lp_build_pack_aos_scalars(bld->gallivm, int_coord_bld->type,
2641                                             perquadi_bld->type, explicit_lod, 0);
2642       }
2643       else {
2644          ilevel = explicit_lod;
2645       }
2646       lp_build_nearest_mip_level(bld, texture_unit, ilevel, &ilevel,
2647                                  out_of_bound_ret_zero ? &out_of_bounds : NULL);
2648    }
2649    else {
2650       assert(bld->num_mips == 1);
2651       if (bld->static_texture_state->target != PIPE_BUFFER) {
2652          ilevel = bld->dynamic_state->first_level(bld->dynamic_state, bld->gallivm,
2653                                                   bld->context_ptr, texture_unit, NULL);
2654       }
2655       else {
2656          ilevel = lp_build_const_int32(bld->gallivm, 0);
2657       }
2658    }
2659    lp_build_mipmap_level_sizes(bld, ilevel,
2660                                &size,
2661                                &row_stride_vec, &img_stride_vec);
2662    lp_build_extract_image_sizes(bld, &bld->int_size_bld, int_coord_bld->type,
2663                                 size, &width, &height, &depth);
2664
2665    if (target == PIPE_TEXTURE_1D_ARRAY ||
2666        target == PIPE_TEXTURE_2D_ARRAY) {
2667       if (out_of_bound_ret_zero) {
2668          z = lp_build_layer_coord(bld, texture_unit, FALSE, z, &out1);
2669          out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
2670       }
2671       else {
2672          z = lp_build_layer_coord(bld, texture_unit, FALSE, z, NULL);
2673       }
2674    }
2675
2676    /* This is a lot like border sampling */
2677    if (offsets[0]) {
2678       /*
2679        * coords are really unsigned, offsets are signed, but I don't think
2680        * exceeding 31 bits is possible
2681        */
2682       x = lp_build_add(int_coord_bld, x, offsets[0]);
2683    }
2684    out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, x, int_coord_bld->zero);
2685    out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
2686    out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, x, width);
2687    out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
2688
2689    if (dims >= 2) {
2690       if (offsets[1]) {
2691          y = lp_build_add(int_coord_bld, y, offsets[1]);
2692       }
2693       out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, y, int_coord_bld->zero);
2694       out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
2695       out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, y, height);
2696       out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
2697
2698       if (dims >= 3) {
2699          if (offsets[2]) {
2700             z = lp_build_add(int_coord_bld, z, offsets[2]);
2701          }
2702          out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, z, int_coord_bld->zero);
2703          out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
2704          out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, z, depth);
2705          out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
2706       }
2707    }
2708
2709    lp_build_sample_offset(int_coord_bld,
2710                           bld->format_desc,
2711                           x, y, z, row_stride_vec, img_stride_vec,
2712                           &offset, &i, &j);
2713
2714    if (bld->static_texture_state->target != PIPE_BUFFER) {
2715       offset = lp_build_add(int_coord_bld, offset,
2716                             lp_build_get_mip_offsets(bld, ilevel));
2717    }
2718
2719    if (bld->fetch_ms) {
2720       LLVMValueRef num_samples;
2721       num_samples = bld->dynamic_state->num_samples(bld->dynamic_state, bld->gallivm,
2722                                                     bld->context_ptr, texture_unit, NULL);
2723       out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, ms_index, int_coord_bld->zero);
2724       out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
2725       out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, ms_index, lp_build_broadcast_scalar(int_coord_bld, num_samples));
2726       out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
2727       offset = lp_build_add(int_coord_bld, offset,
2728                             lp_build_mul(int_coord_bld, bld->sample_stride, ms_index));
2729    }
2730
2731    offset = lp_build_andnot(int_coord_bld, offset, out_of_bounds);
2732
2733    lp_build_fetch_rgba_soa(bld->gallivm,
2734                            bld->format_desc,
2735                            bld->texel_type, TRUE,
2736                            bld->base_ptr, offset,
2737                            i, j,
2738                            bld->cache,
2739                            colors_out);
2740
2741    if (out_of_bound_ret_zero) {
2742       /*
2743        * Only needed for ARB_robust_buffer_access_behavior and d3d10.
2744        * Could use min/max above instead of out-of-bounds comparisons
2745        * if we don't care about the result returned for out-of-bounds.
2746        */
2747       for (chan = 0; chan < 4; chan++) {
2748          colors_out[chan] = lp_build_select(&bld->texel_bld, out_of_bounds,
2749                                             bld->texel_bld.zero, colors_out[chan]);
2750       }
2751    }
2752 }
2753
2754
2755 /**
2756  * Just set texels to white instead of actually sampling the texture.
2757  * For debugging.
2758  */
2759 void
2760 lp_build_sample_nop(struct gallivm_state *gallivm,
2761                     struct lp_type type,
2762                     const LLVMValueRef *coords,
2763                     LLVMValueRef texel_out[4])
2764 {
2765    LLVMValueRef one = lp_build_one(gallivm, type);
2766    unsigned chan;
2767
2768    for (chan = 0; chan < 4; chan++) {
2769       texel_out[chan] = one;
2770    }
2771 }
2772
2773
2774 /**
2775  * Build the actual texture sampling code.
2776  * 'texel_out' will return a vector of four LLVMValueRefs corresponding to
2777  * R, G, B, A.
2778  * \param type  vector float type to use for coords, etc.
2779  * \param sample_key
2780  * \param derivs  partial derivatives of (s,t,r,q) with respect to x and y
2781  */
2782 static void
2783 lp_build_sample_soa_code(struct gallivm_state *gallivm,
2784                          const struct lp_static_texture_state *static_texture_state,
2785                          const struct lp_static_sampler_state *static_sampler_state,
2786                          struct lp_sampler_dynamic_state *dynamic_state,
2787                          struct lp_type type,
2788                          unsigned sample_key,
2789                          unsigned texture_index,
2790                          unsigned sampler_index,
2791                          LLVMValueRef context_ptr,
2792                          LLVMValueRef thread_data_ptr,
2793                          const LLVMValueRef *coords,
2794                          const LLVMValueRef *offsets,
2795                          const struct lp_derivatives *derivs, /* optional */
2796                          LLVMValueRef lod, /* optional */
2797                          LLVMValueRef ms_index, /* optional */
2798                          LLVMValueRef texel_out[4])
2799 {
2800    unsigned target = static_texture_state->target;
2801    unsigned dims = texture_dims(target);
2802    unsigned num_quads = type.length / 4;
2803    unsigned mip_filter, min_img_filter, mag_img_filter, i;
2804    struct lp_build_sample_context bld;
2805    struct lp_static_sampler_state derived_sampler_state = *static_sampler_state;
2806    LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
2807    LLVMBuilderRef builder = gallivm->builder;
2808    LLVMValueRef tex_width, newcoords[5];
2809    enum lp_sampler_lod_property lod_property;
2810    enum lp_sampler_lod_control lod_control;
2811    enum lp_sampler_op_type op_type;
2812    LLVMValueRef lod_bias = NULL;
2813    LLVMValueRef explicit_lod = NULL;
2814    boolean op_is_tex, op_is_lodq, op_is_gather, fetch_ms;
2815
2816    if (0) {
2817       enum pipe_format fmt = static_texture_state->format;
2818       debug_printf("Sample from %s\n", util_format_name(fmt));
2819    }
2820
2821    lod_property = (sample_key & LP_SAMPLER_LOD_PROPERTY_MASK) >>
2822                      LP_SAMPLER_LOD_PROPERTY_SHIFT;
2823    lod_control = (sample_key & LP_SAMPLER_LOD_CONTROL_MASK) >>
2824                     LP_SAMPLER_LOD_CONTROL_SHIFT;
2825    op_type = (sample_key & LP_SAMPLER_OP_TYPE_MASK) >>
2826                  LP_SAMPLER_OP_TYPE_SHIFT;
2827    fetch_ms = !!(sample_key & LP_SAMPLER_FETCH_MS);
2828
2829    op_is_tex = op_type == LP_SAMPLER_OP_TEXTURE;
2830    op_is_lodq = op_type == LP_SAMPLER_OP_LODQ;
2831    op_is_gather = op_type == LP_SAMPLER_OP_GATHER;
2832
2833    if (lod_control == LP_SAMPLER_LOD_BIAS) {
2834       lod_bias = lod;
2835       assert(lod);
2836       assert(derivs == NULL);
2837    }
2838    else if (lod_control == LP_SAMPLER_LOD_EXPLICIT) {
2839       explicit_lod = lod;
2840       assert(lod);
2841       assert(derivs == NULL);
2842    }
2843    else if (lod_control == LP_SAMPLER_LOD_DERIVATIVES) {
2844       assert(derivs);
2845       assert(lod == NULL);
2846    }
2847    else {
2848       assert(derivs == NULL);
2849       assert(lod == NULL);
2850    }
2851
2852    if (static_texture_state->format == PIPE_FORMAT_NONE) {
2853       /*
2854        * If there's nothing bound, format is NONE, and we must return
2855        * all zero as mandated by d3d10 in this case.
2856        */
2857       unsigned chan;
2858       LLVMValueRef zero = lp_build_zero(gallivm, type);
2859       for (chan = 0; chan < 4; chan++) {
2860          texel_out[chan] = zero;
2861       }
2862       return;
2863    }
2864
2865    assert(type.floating);
2866
2867    /* Setup our build context */
2868    memset(&bld, 0, sizeof bld);
2869    bld.gallivm = gallivm;
2870    bld.context_ptr = context_ptr;
2871    bld.static_sampler_state = &derived_sampler_state;
2872    bld.static_texture_state = static_texture_state;
2873    bld.dynamic_state = dynamic_state;
2874    bld.format_desc = util_format_description(static_texture_state->format);
2875    bld.dims = dims;
2876
2877    if (gallivm_perf & GALLIVM_PERF_NO_QUAD_LOD || op_is_lodq) {
2878       bld.no_quad_lod = TRUE;
2879    }
2880    if (gallivm_perf & GALLIVM_PERF_NO_RHO_APPROX || op_is_lodq) {
2881       bld.no_rho_approx = TRUE;
2882    }
2883    if (gallivm_perf & GALLIVM_PERF_NO_BRILINEAR || op_is_lodq) {
2884       bld.no_brilinear = TRUE;
2885    }
2886
2887    bld.vector_width = lp_type_width(type);
2888
2889    bld.float_type = lp_type_float(32);
2890    bld.int_type = lp_type_int(32);
2891    bld.coord_type = type;
2892    bld.int_coord_type = lp_int_type(type);
2893    bld.float_size_in_type = lp_type_float(32);
2894    bld.float_size_in_type.length = dims > 1 ? 4 : 1;
2895    bld.int_size_in_type = lp_int_type(bld.float_size_in_type);
2896    bld.texel_type = type;
2897
2898    /* always using the first channel hopefully should be safe,
2899     * if not things WILL break in other places anyway.
2900     */
2901    if (bld.format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB &&
2902        bld.format_desc->channel[0].pure_integer) {
2903       if (bld.format_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) {
2904          bld.texel_type = lp_type_int_vec(type.width, type.width * type.length);
2905       }
2906       else if (bld.format_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED) {
2907          bld.texel_type = lp_type_uint_vec(type.width, type.width * type.length);
2908       }
2909    }
2910    else if (util_format_has_stencil(bld.format_desc) &&
2911        !util_format_has_depth(bld.format_desc)) {
2912       /* for stencil only formats, sample stencil (uint) */
2913       bld.texel_type = lp_type_uint_vec(type.width, type.width * type.length);
2914    }
2915
2916    if (!static_texture_state->level_zero_only ||
2917        !static_sampler_state->max_lod_pos || op_is_lodq) {
2918       derived_sampler_state.min_mip_filter = static_sampler_state->min_mip_filter;
2919    } else {
2920       derived_sampler_state.min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
2921    }
2922    if (op_is_gather) {
2923       /*
2924        * gather4 is exactly like GL_LINEAR filtering but in the end skipping
2925        * the actual filtering. Using mostly the same paths, so cube face
2926        * selection, coord wrapping etc. all naturally uses the same code.
2927        */
2928       derived_sampler_state.min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
2929       derived_sampler_state.min_img_filter = PIPE_TEX_FILTER_LINEAR;
2930       derived_sampler_state.mag_img_filter = PIPE_TEX_FILTER_LINEAR;
2931    }
2932    mip_filter = derived_sampler_state.min_mip_filter;
2933
2934    if (0) {
2935       debug_printf("  .min_mip_filter = %u\n", derived_sampler_state.min_mip_filter);
2936    }
2937
2938    if (static_texture_state->target == PIPE_TEXTURE_CUBE ||
2939        static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY)
2940    {
2941       /*
2942        * Seamless filtering ignores wrap modes.
2943        * Setting to CLAMP_TO_EDGE is correct for nearest filtering, for
2944        * bilinear it's not correct but way better than using for instance repeat.
2945        * Note we even set this for non-seamless. Technically GL allows any wrap
2946        * mode, which made sense when supporting true borders (can get seamless
2947        * effect with border and CLAMP_TO_BORDER), but gallium doesn't support
2948        * borders and d3d9 requires wrap modes to be ignored and it's a pain to fix
2949        * up the sampler state (as it makes it texture dependent).
2950        */
2951       derived_sampler_state.wrap_s = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
2952       derived_sampler_state.wrap_t = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
2953    }
2954    /*
2955     * We could force CLAMP to CLAMP_TO_EDGE here if min/mag filter is nearest,
2956     * so AoS path could be used. Not sure it's worth the trouble...
2957     */
2958
2959    min_img_filter = derived_sampler_state.min_img_filter;
2960    mag_img_filter = derived_sampler_state.mag_img_filter;
2961
2962
2963    /*
2964     * This is all a bit complicated: different paths are chosen for performance
2965     * reasons.
2966     * Essentially, there can be 1 lod per element, 1 lod per quad or 1 lod for
2967     * everything (the last two options are equivalent for 4-wide case).
2968     * If there's per-quad lod but we split to 4-wide so we can use AoS, per-quad
2969     * lod is calculated then the lod value extracted afterwards so making this
2970     * case basically the same as far as lod handling is concerned for the
2971     * further sample/filter code as the 1 lod for everything case.
2972     * Different lod handling mostly shows up when building mipmap sizes
2973     * (lp_build_mipmap_level_sizes() and friends) and also in filtering
2974     * (getting the fractional part of the lod to the right texels).
2975     */
2976
2977    /*
2978     * There are other situations where at least the multiple int lods could be
2979     * avoided like min and max lod being equal.
2980     */
2981    bld.num_mips = bld.num_lods = 1;
2982
2983    if (bld.no_quad_lod && bld.no_rho_approx &&
2984        ((mip_filter != PIPE_TEX_MIPFILTER_NONE && op_is_tex &&
2985          (static_texture_state->target == PIPE_TEXTURE_CUBE ||
2986           static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY)) ||
2987         op_is_lodq)) {
2988       /*
2989        * special case for using per-pixel lod even for implicit lod,
2990        * which is generally never required (ok by APIs) except to please
2991        * some (somewhat broken imho) tests (because per-pixel face selection
2992        * can cause derivatives to be different for pixels outside the primitive
2993        * due to the major axis division even if pre-project derivatives are
2994        * looking normal).
2995        * For lodq, we do it to simply avoid scalar pack / unpack (albeit for
2996        * cube maps we do indeed get per-pixel lod values).
2997        */
2998       bld.num_mips = type.length;
2999       bld.num_lods = type.length;
3000    }
3001    else if (lod_property == LP_SAMPLER_LOD_PER_ELEMENT ||
3002        (explicit_lod || lod_bias || derivs)) {
3003       if ((!op_is_tex && target != PIPE_BUFFER) ||
3004           (op_is_tex && mip_filter != PIPE_TEX_MIPFILTER_NONE)) {
3005          bld.num_mips = type.length;
3006          bld.num_lods = type.length;
3007       }
3008       else if (op_is_tex && min_img_filter != mag_img_filter) {
3009          bld.num_mips = 1;
3010          bld.num_lods = type.length;
3011       }
3012    }
3013    /* TODO: for true scalar_lod should only use 1 lod value */
3014    else if ((!op_is_tex && explicit_lod && target != PIPE_BUFFER) ||
3015             (op_is_tex && mip_filter != PIPE_TEX_MIPFILTER_NONE)) {
3016       bld.num_mips = num_quads;
3017       bld.num_lods = num_quads;
3018    }
3019    else if (op_is_tex && min_img_filter != mag_img_filter) {
3020       bld.num_mips = 1;
3021       bld.num_lods = num_quads;
3022    }
3023
3024    bld.fetch_ms = fetch_ms;
3025    if (op_is_gather)
3026       bld.gather_comp = (sample_key & LP_SAMPLER_GATHER_COMP_MASK) >> LP_SAMPLER_GATHER_COMP_SHIFT;
3027    bld.lodf_type = type;
3028    /* we want native vector size to be able to use our intrinsics */
3029    if (bld.num_lods != type.length) {
3030       /* TODO: this currently always has to be per-quad or per-element */
3031       bld.lodf_type.length = type.length > 4 ? ((type.length + 15) / 16) * 4 : 1;
3032    }
3033    bld.lodi_type = lp_int_type(bld.lodf_type);
3034    bld.levelf_type = bld.lodf_type;
3035    if (bld.num_mips == 1) {
3036       bld.levelf_type.length = 1;
3037    }
3038    bld.leveli_type = lp_int_type(bld.levelf_type);
3039    bld.float_size_type = bld.float_size_in_type;
3040    /* Note: size vectors may not be native. They contain minified w/h/d/_ values,
3041     * with per-element lod that is w0/h0/d0/_/w1/h1/d1/_/... so up to 8x4f32 */
3042    if (bld.num_mips > 1) {
3043       bld.float_size_type.length = bld.num_mips == type.length ?
3044                                       bld.num_mips * bld.float_size_in_type.length :
3045                                       type.length;
3046    }
3047    bld.int_size_type = lp_int_type(bld.float_size_type);
3048
3049    lp_build_context_init(&bld.float_bld, gallivm, bld.float_type);
3050    lp_build_context_init(&bld.float_vec_bld, gallivm, type);
3051    lp_build_context_init(&bld.int_bld, gallivm, bld.int_type);
3052    lp_build_context_init(&bld.coord_bld, gallivm, bld.coord_type);
3053    lp_build_context_init(&bld.int_coord_bld, gallivm, bld.int_coord_type);
3054    lp_build_context_init(&bld.int_size_in_bld, gallivm, bld.int_size_in_type);
3055    lp_build_context_init(&bld.float_size_in_bld, gallivm, bld.float_size_in_type);
3056    lp_build_context_init(&bld.int_size_bld, gallivm, bld.int_size_type);
3057    lp_build_context_init(&bld.float_size_bld, gallivm, bld.float_size_type);
3058    lp_build_context_init(&bld.texel_bld, gallivm, bld.texel_type);
3059    lp_build_context_init(&bld.levelf_bld, gallivm, bld.levelf_type);
3060    lp_build_context_init(&bld.leveli_bld, gallivm, bld.leveli_type);
3061    lp_build_context_init(&bld.lodf_bld, gallivm, bld.lodf_type);
3062    lp_build_context_init(&bld.lodi_bld, gallivm, bld.lodi_type);
3063
3064    /* Get the dynamic state */
3065    tex_width = dynamic_state->width(dynamic_state, gallivm,
3066                                     context_ptr, texture_index, NULL);
3067    bld.row_stride_array = dynamic_state->row_stride(dynamic_state, gallivm,
3068                                                     context_ptr, texture_index, NULL);
3069    bld.img_stride_array = dynamic_state->img_stride(dynamic_state, gallivm,
3070                                                     context_ptr, texture_index, NULL);
3071    bld.base_ptr = dynamic_state->base_ptr(dynamic_state, gallivm,
3072                                           context_ptr, texture_index, NULL);
3073    bld.mip_offsets = dynamic_state->mip_offsets(dynamic_state, gallivm,
3074                                                 context_ptr, texture_index, NULL);
3075
3076    if (fetch_ms)
3077       bld.sample_stride = lp_build_broadcast_scalar(&bld.int_coord_bld, dynamic_state->sample_stride(dynamic_state, gallivm,
3078                                                                                                      context_ptr, texture_index, NULL));
3079    /* Note that mip_offsets is an array[level] of offsets to texture images */
3080
3081    if (dynamic_state->cache_ptr && thread_data_ptr) {
3082       bld.cache = dynamic_state->cache_ptr(dynamic_state, gallivm,
3083                                            thread_data_ptr, texture_index);
3084    }
3085
3086    /* width, height, depth as single int vector */
3087    if (dims <= 1) {
3088       bld.int_size = tex_width;
3089    }
3090    else {
3091       bld.int_size = LLVMBuildInsertElement(builder, bld.int_size_in_bld.undef,
3092                                             tex_width,
3093                                             LLVMConstInt(i32t, 0, 0), "");
3094       if (dims >= 2) {
3095          LLVMValueRef tex_height =
3096             dynamic_state->height(dynamic_state, gallivm,
3097                                   context_ptr, texture_index, NULL);
3098          bld.int_size = LLVMBuildInsertElement(builder, bld.int_size,
3099                                                tex_height,
3100                                                LLVMConstInt(i32t, 1, 0), "");
3101          if (dims >= 3) {
3102             LLVMValueRef tex_depth =
3103                dynamic_state->depth(dynamic_state, gallivm, context_ptr,
3104                                     texture_index, NULL);
3105             bld.int_size = LLVMBuildInsertElement(builder, bld.int_size,
3106                                                   tex_depth,
3107                                                   LLVMConstInt(i32t, 2, 0), "");
3108          }
3109       }
3110    }
3111
3112    for (i = 0; i < 5; i++) {
3113       newcoords[i] = coords[i];
3114    }
3115
3116    if (util_format_is_pure_integer(static_texture_state->format) &&
3117        !util_format_has_depth(bld.format_desc) && op_is_tex &&
3118        (static_sampler_state->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR ||
3119         static_sampler_state->min_img_filter == PIPE_TEX_FILTER_LINEAR ||
3120         static_sampler_state->mag_img_filter == PIPE_TEX_FILTER_LINEAR)) {
3121       /*
3122        * Bail if impossible filtering is specified (the awkward additional
3123        * depth check is because it is legal in gallium to have things like S8Z24
3124        * here which would say it's pure int even though such formats should
3125        * sample the depth component).
3126        * In GL such filters make the texture incomplete, this makes it robust
3127        * against gallium frontends which set this up regardless (we'd crash in the
3128        * lerp later otherwise).
3129        * At least in some apis it may be legal to use such filters with lod
3130        * queries and/or gather (at least for gather d3d10 says only the wrap
3131        * bits are really used hence filter bits are likely simply ignored).
3132        * For fetch, we don't get valid samplers either way here.
3133        */
3134       unsigned chan;
3135       LLVMValueRef zero = lp_build_zero(gallivm, type);
3136       for (chan = 0; chan < 4; chan++) {
3137          texel_out[chan] = zero;
3138       }
3139       return;
3140    }
3141
3142    if (0) {
3143       /* For debug: no-op texture sampling */
3144       lp_build_sample_nop(gallivm,
3145                           bld.texel_type,
3146                           newcoords,
3147                           texel_out);
3148    }
3149
3150    else if (op_type == LP_SAMPLER_OP_FETCH) {
3151       lp_build_fetch_texel(&bld, texture_index, ms_index, newcoords,
3152                            lod, offsets,
3153                            texel_out);
3154    }
3155
3156    else {
3157       LLVMValueRef lod_fpart = NULL, lod_positive = NULL;
3158       LLVMValueRef ilevel0 = NULL, ilevel1 = NULL, lod = NULL;
3159       boolean use_aos;
3160
3161       use_aos = util_format_fits_8unorm(bld.format_desc) &&
3162                 op_is_tex &&
3163                 /* not sure this is strictly needed or simply impossible */
3164                 derived_sampler_state.compare_mode == PIPE_TEX_COMPARE_NONE &&
3165                 lp_is_simple_wrap_mode(derived_sampler_state.wrap_s);
3166
3167       use_aos &= bld.num_lods <= num_quads ||
3168                  derived_sampler_state.min_img_filter ==
3169                     derived_sampler_state.mag_img_filter;
3170
3171       if(gallivm_perf & GALLIVM_PERF_NO_AOS_SAMPLING) {
3172          use_aos = 0;
3173       }
3174
3175       if (dims > 1) {
3176          use_aos &= lp_is_simple_wrap_mode(derived_sampler_state.wrap_t);
3177          if (dims > 2) {
3178             use_aos &= lp_is_simple_wrap_mode(derived_sampler_state.wrap_r);
3179          }
3180       }
3181       if ((static_texture_state->target == PIPE_TEXTURE_CUBE ||
3182            static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) &&
3183           derived_sampler_state.seamless_cube_map &&
3184           (derived_sampler_state.min_img_filter == PIPE_TEX_FILTER_LINEAR ||
3185            derived_sampler_state.mag_img_filter == PIPE_TEX_FILTER_LINEAR)) {
3186          /* theoretically possible with AoS filtering but not implemented (complex!) */
3187          use_aos = 0;
3188       }
3189
3190       if ((gallivm_debug & GALLIVM_DEBUG_PERF) &&
3191           !use_aos && util_format_fits_8unorm(bld.format_desc)) {
3192          debug_printf("%s: using floating point linear filtering for %s\n",
3193                       __FUNCTION__, bld.format_desc->short_name);
3194          debug_printf("  min_img %d  mag_img %d  mip %d  target %d  seamless %d"
3195                       "  wraps %d  wrapt %d  wrapr %d\n",
3196                       derived_sampler_state.min_img_filter,
3197                       derived_sampler_state.mag_img_filter,
3198                       derived_sampler_state.min_mip_filter,
3199                       static_texture_state->target,
3200                       derived_sampler_state.seamless_cube_map,
3201                       derived_sampler_state.wrap_s,
3202                       derived_sampler_state.wrap_t,
3203                       derived_sampler_state.wrap_r);
3204       }
3205
3206       lp_build_sample_common(&bld, op_is_lodq, texture_index, sampler_index,
3207                              newcoords,
3208                              derivs, lod_bias, explicit_lod,
3209                              &lod_positive, &lod, &lod_fpart,
3210                              &ilevel0, &ilevel1);
3211
3212       if (op_is_lodq) {
3213          texel_out[0] = lod_fpart;
3214          texel_out[1] = lod;
3215          texel_out[2] = texel_out[3] = bld.coord_bld.zero;
3216          return;
3217       }
3218
3219       if (use_aos && static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) {
3220          /* The aos path doesn't do seamless filtering so simply add cube layer
3221           * to face now.
3222           */
3223          newcoords[2] = lp_build_add(&bld.int_coord_bld, newcoords[2], newcoords[3]);
3224       }
3225
3226       /*
3227        * we only try 8-wide sampling with soa or if we have AVX2
3228        * (as it appears to be a loss with just AVX)
3229        */
3230       if (num_quads == 1 || !use_aos ||
3231           (util_cpu_caps.has_avx2 &&
3232            (bld.num_lods == 1 ||
3233             derived_sampler_state.min_img_filter == derived_sampler_state.mag_img_filter))) {
3234          if (use_aos) {
3235             /* do sampling/filtering with fixed pt arithmetic */
3236             lp_build_sample_aos(&bld, sampler_index,
3237                                 newcoords[0], newcoords[1],
3238                                 newcoords[2],
3239                                 offsets, lod_positive, lod_fpart,
3240                                 ilevel0, ilevel1,
3241                                 texel_out);
3242          }
3243
3244          else {
3245             lp_build_sample_general(&bld, sampler_index,
3246                                     op_type == LP_SAMPLER_OP_GATHER,
3247                                     newcoords, offsets,
3248                                     lod_positive, lod_fpart,
3249                                     ilevel0, ilevel1,
3250                                     texel_out);
3251          }
3252       }
3253       else {
3254          unsigned j;
3255          struct lp_build_sample_context bld4;
3256          struct lp_type type4 = type;
3257          unsigned i;
3258          LLVMValueRef texelout4[4];
3259          LLVMValueRef texelouttmp[4][LP_MAX_VECTOR_LENGTH/16];
3260
3261          type4.length = 4;
3262
3263          /* Setup our build context */
3264          memset(&bld4, 0, sizeof bld4);
3265          bld4.no_quad_lod = bld.no_quad_lod;
3266          bld4.no_rho_approx = bld.no_rho_approx;
3267          bld4.no_brilinear = bld.no_brilinear;
3268          bld4.gallivm = bld.gallivm;
3269          bld4.context_ptr = bld.context_ptr;
3270          bld4.static_texture_state = bld.static_texture_state;
3271          bld4.static_sampler_state = bld.static_sampler_state;
3272          bld4.dynamic_state = bld.dynamic_state;
3273          bld4.format_desc = bld.format_desc;
3274          bld4.dims = bld.dims;
3275          bld4.row_stride_array = bld.row_stride_array;
3276          bld4.img_stride_array = bld.img_stride_array;
3277          bld4.base_ptr = bld.base_ptr;
3278          bld4.mip_offsets = bld.mip_offsets;
3279          bld4.int_size = bld.int_size;
3280          bld4.cache = bld.cache;
3281
3282          bld4.vector_width = lp_type_width(type4);
3283
3284          bld4.float_type = lp_type_float(32);
3285          bld4.int_type = lp_type_int(32);
3286          bld4.coord_type = type4;
3287          bld4.int_coord_type = lp_int_type(type4);
3288          bld4.float_size_in_type = lp_type_float(32);
3289          bld4.float_size_in_type.length = dims > 1 ? 4 : 1;
3290          bld4.int_size_in_type = lp_int_type(bld4.float_size_in_type);
3291          bld4.texel_type = bld.texel_type;
3292          bld4.texel_type.length = 4;
3293
3294          bld4.num_mips = bld4.num_lods = 1;
3295          if (bld4.no_quad_lod && bld4.no_rho_approx &&
3296              (static_texture_state->target == PIPE_TEXTURE_CUBE ||
3297               static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) &&
3298              (op_is_tex && mip_filter != PIPE_TEX_MIPFILTER_NONE)) {
3299             bld4.num_mips = type4.length;
3300             bld4.num_lods = type4.length;
3301          }
3302          if (lod_property == LP_SAMPLER_LOD_PER_ELEMENT &&
3303              (explicit_lod || lod_bias || derivs)) {
3304             if ((!op_is_tex && target != PIPE_BUFFER) ||
3305                 (op_is_tex && mip_filter != PIPE_TEX_MIPFILTER_NONE)) {
3306                bld4.num_mips = type4.length;
3307                bld4.num_lods = type4.length;
3308             }
3309             else if (op_is_tex && min_img_filter != mag_img_filter) {
3310                bld4.num_mips = 1;
3311                bld4.num_lods = type4.length;
3312             }
3313          }
3314
3315          /* we want native vector size to be able to use our intrinsics */
3316          bld4.lodf_type = type4;
3317          if (bld4.num_lods != type4.length) {
3318             bld4.lodf_type.length = 1;
3319          }
3320          bld4.lodi_type = lp_int_type(bld4.lodf_type);
3321          bld4.levelf_type = type4;
3322          if (bld4.num_mips != type4.length) {
3323             bld4.levelf_type.length = 1;
3324          }
3325          bld4.leveli_type = lp_int_type(bld4.levelf_type);
3326          bld4.float_size_type = bld4.float_size_in_type;
3327          if (bld4.num_mips > 1) {
3328             bld4.float_size_type.length = bld4.num_mips == type4.length ?
3329                                             bld4.num_mips * bld4.float_size_in_type.length :
3330                                             type4.length;
3331          }
3332          bld4.int_size_type = lp_int_type(bld4.float_size_type);
3333
3334          lp_build_context_init(&bld4.float_bld, gallivm, bld4.float_type);
3335          lp_build_context_init(&bld4.float_vec_bld, gallivm, type4);
3336          lp_build_context_init(&bld4.int_bld, gallivm, bld4.int_type);
3337          lp_build_context_init(&bld4.coord_bld, gallivm, bld4.coord_type);
3338          lp_build_context_init(&bld4.int_coord_bld, gallivm, bld4.int_coord_type);
3339          lp_build_context_init(&bld4.int_size_in_bld, gallivm, bld4.int_size_in_type);
3340          lp_build_context_init(&bld4.float_size_in_bld, gallivm, bld4.float_size_in_type);
3341          lp_build_context_init(&bld4.int_size_bld, gallivm, bld4.int_size_type);
3342          lp_build_context_init(&bld4.float_size_bld, gallivm, bld4.float_size_type);
3343          lp_build_context_init(&bld4.texel_bld, gallivm, bld4.texel_type);
3344          lp_build_context_init(&bld4.levelf_bld, gallivm, bld4.levelf_type);
3345          lp_build_context_init(&bld4.leveli_bld, gallivm, bld4.leveli_type);
3346          lp_build_context_init(&bld4.lodf_bld, gallivm, bld4.lodf_type);
3347          lp_build_context_init(&bld4.lodi_bld, gallivm, bld4.lodi_type);
3348
3349          for (i = 0; i < num_quads; i++) {
3350             LLVMValueRef s4, t4, r4;
3351             LLVMValueRef lod_positive4, lod_fpart4 = NULL;
3352             LLVMValueRef ilevel04, ilevel14 = NULL;
3353             LLVMValueRef offsets4[4] = { NULL };
3354             unsigned num_lods = bld4.num_lods;
3355
3356             s4 = lp_build_extract_range(gallivm, newcoords[0], 4*i, 4);
3357             t4 = lp_build_extract_range(gallivm, newcoords[1], 4*i, 4);
3358             r4 = lp_build_extract_range(gallivm, newcoords[2], 4*i, 4);
3359
3360             if (offsets[0]) {
3361                offsets4[0] = lp_build_extract_range(gallivm, offsets[0], 4*i, 4);
3362                if (dims > 1) {
3363                   offsets4[1] = lp_build_extract_range(gallivm, offsets[1], 4*i, 4);
3364                   if (dims > 2) {
3365                      offsets4[2] = lp_build_extract_range(gallivm, offsets[2], 4*i, 4);
3366                   }
3367                }
3368             }
3369             lod_positive4 = lp_build_extract_range(gallivm, lod_positive, num_lods * i, num_lods);
3370             ilevel04 = bld.num_mips == 1 ? ilevel0 :
3371                           lp_build_extract_range(gallivm, ilevel0, num_lods * i, num_lods);
3372             if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
3373                ilevel14 = lp_build_extract_range(gallivm, ilevel1, num_lods * i, num_lods);
3374                lod_fpart4 = lp_build_extract_range(gallivm, lod_fpart, num_lods * i, num_lods);
3375             }
3376
3377             if (use_aos) {
3378                /* do sampling/filtering with fixed pt arithmetic */
3379                lp_build_sample_aos(&bld4, sampler_index,
3380                                    s4, t4, r4, offsets4,
3381                                    lod_positive4, lod_fpart4,
3382                                    ilevel04, ilevel14,
3383                                    texelout4);
3384             }
3385
3386             else {
3387                /* this path is currently unreachable and hence might break easily... */
3388                LLVMValueRef newcoords4[5];
3389                newcoords4[0] = s4;
3390                newcoords4[1] = t4;
3391                newcoords4[2] = r4;
3392                newcoords4[3] = lp_build_extract_range(gallivm, newcoords[3], 4*i, 4);
3393                newcoords4[4] = lp_build_extract_range(gallivm, newcoords[4], 4*i, 4);
3394
3395                lp_build_sample_general(&bld4, sampler_index,
3396                                        op_type == LP_SAMPLER_OP_GATHER,
3397                                        newcoords4, offsets4,
3398                                        lod_positive4, lod_fpart4,
3399                                        ilevel04, ilevel14,
3400                                        texelout4);
3401             }
3402             for (j = 0; j < 4; j++) {
3403                texelouttmp[j][i] = texelout4[j];
3404             }
3405          }
3406
3407          for (j = 0; j < 4; j++) {
3408             texel_out[j] = lp_build_concat(gallivm, texelouttmp[j], type4, num_quads);
3409          }
3410       }
3411    }
3412
3413    if (target != PIPE_BUFFER && op_type != LP_SAMPLER_OP_GATHER) {
3414       apply_sampler_swizzle(&bld, texel_out);
3415    }
3416
3417    /*
3418     * texel type can be a (32bit) int/uint (for pure int formats only),
3419     * however we are expected to always return floats (storage is untyped).
3420     */
3421    if (!bld.texel_type.floating) {
3422       unsigned chan;
3423       for (chan = 0; chan < 4; chan++) {
3424          texel_out[chan] = LLVMBuildBitCast(builder, texel_out[chan],
3425                                             lp_build_vec_type(gallivm, type), "");
3426       }
3427    }
3428 }
3429
3430
3431 #define USE_TEX_FUNC_CALL 1
3432
3433 #define LP_MAX_TEX_FUNC_ARGS 32
3434
3435 static inline void
3436 get_target_info(enum pipe_texture_target target,
3437                 unsigned *num_coords, unsigned *num_derivs,
3438                 unsigned *num_offsets, unsigned *layer)
3439 {
3440    unsigned dims = texture_dims(target);
3441    *num_coords = dims;
3442    *num_offsets = dims;
3443    *num_derivs = (target == PIPE_TEXTURE_CUBE ||
3444                   target == PIPE_TEXTURE_CUBE_ARRAY) ? 3 : dims;
3445    *layer = has_layer_coord(target) ? 2: 0;
3446    if (target == PIPE_TEXTURE_CUBE_ARRAY) {
3447       /*
3448        * dims doesn't include r coord for cubes - this is handled
3449        * by layer instead, but need to fix up for cube arrays...
3450        */
3451       *layer = 3;
3452       *num_coords = 3;
3453    }
3454 }
3455
3456
3457 /**
3458  * Generate the function body for a texture sampling function.
3459  */
3460 static void
3461 lp_build_sample_gen_func(struct gallivm_state *gallivm,
3462                          const struct lp_static_texture_state *static_texture_state,
3463                          const struct lp_static_sampler_state *static_sampler_state,
3464                          struct lp_sampler_dynamic_state *dynamic_state,
3465                          struct lp_type type,
3466                          unsigned texture_index,
3467                          unsigned sampler_index,
3468                          LLVMValueRef function,
3469                          unsigned num_args,
3470                          unsigned sample_key)
3471 {
3472    LLVMBuilderRef old_builder;
3473    LLVMBasicBlockRef block;
3474    LLVMValueRef coords[5];
3475    LLVMValueRef offsets[3] = { NULL };
3476    LLVMValueRef lod = NULL;
3477    LLVMValueRef ms_index = NULL;
3478    LLVMValueRef context_ptr;
3479    LLVMValueRef thread_data_ptr = NULL;
3480    LLVMValueRef texel_out[4];
3481    struct lp_derivatives derivs;
3482    struct lp_derivatives *deriv_ptr = NULL;
3483    unsigned num_param = 0;
3484    unsigned i, num_coords, num_derivs, num_offsets, layer;
3485    enum lp_sampler_lod_control lod_control;
3486    enum lp_sampler_op_type op_type;
3487    boolean need_cache = FALSE;
3488
3489    lod_control = (sample_key & LP_SAMPLER_LOD_CONTROL_MASK) >>
3490                     LP_SAMPLER_LOD_CONTROL_SHIFT;
3491
3492    op_type = (sample_key & LP_SAMPLER_OP_TYPE_MASK) >>
3493                     LP_SAMPLER_OP_TYPE_SHIFT;
3494
3495    get_target_info(static_texture_state->target,
3496                    &num_coords, &num_derivs, &num_offsets, &layer);
3497
3498    /* lod query doesn't take a layer */
3499    if (layer && op_type == LP_SAMPLER_OP_LODQ)
3500       layer = 0;
3501
3502    if (dynamic_state->cache_ptr) {
3503       const struct util_format_description *format_desc;
3504       format_desc = util_format_description(static_texture_state->format);
3505       if (format_desc && format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
3506          need_cache = TRUE;
3507       }
3508    }
3509
3510    /* "unpack" arguments */
3511    context_ptr = LLVMGetParam(function, num_param++);
3512    if (need_cache) {
3513       thread_data_ptr = LLVMGetParam(function, num_param++);
3514    }
3515    for (i = 0; i < num_coords; i++) {
3516       coords[i] = LLVMGetParam(function, num_param++);
3517    }
3518    for (i = num_coords; i < 5; i++) {
3519       /* This is rather unfortunate... */
3520       coords[i] = lp_build_undef(gallivm, type);
3521    }
3522    if (layer) {
3523       coords[layer] = LLVMGetParam(function, num_param++);
3524    }
3525    if (sample_key & LP_SAMPLER_SHADOW) {
3526       coords[4] = LLVMGetParam(function, num_param++);
3527    }
3528    if (sample_key & LP_SAMPLER_FETCH_MS) {
3529       ms_index = LLVMGetParam(function, num_param++);
3530    }
3531    if (sample_key & LP_SAMPLER_OFFSETS) {
3532       for (i = 0; i < num_offsets; i++) {
3533          offsets[i] = LLVMGetParam(function, num_param++);
3534       }
3535    }
3536    if (lod_control == LP_SAMPLER_LOD_BIAS ||
3537        lod_control == LP_SAMPLER_LOD_EXPLICIT) {
3538       lod = LLVMGetParam(function, num_param++);
3539    }
3540    else if (lod_control == LP_SAMPLER_LOD_DERIVATIVES) {
3541       for (i = 0; i < num_derivs; i++) {
3542          derivs.ddx[i] = LLVMGetParam(function, num_param++);
3543          derivs.ddy[i] = LLVMGetParam(function, num_param++);
3544       }
3545       deriv_ptr = &derivs;
3546    }
3547
3548    assert(num_args == num_param);
3549
3550    /*
3551     * Function body
3552     */
3553
3554    old_builder = gallivm->builder;
3555    block = LLVMAppendBasicBlockInContext(gallivm->context, function, "entry");
3556    gallivm->builder = LLVMCreateBuilderInContext(gallivm->context);
3557    LLVMPositionBuilderAtEnd(gallivm->builder, block);
3558
3559    lp_build_sample_soa_code(gallivm,
3560                             static_texture_state,
3561                             static_sampler_state,
3562                             dynamic_state,
3563                             type,
3564                             sample_key,
3565                             texture_index,
3566                             sampler_index,
3567                             context_ptr,
3568                             thread_data_ptr,
3569                             coords,
3570                             offsets,
3571                             deriv_ptr,
3572                             lod,
3573                             ms_index,
3574                             texel_out);
3575
3576    LLVMBuildAggregateRet(gallivm->builder, texel_out, 4);
3577
3578    LLVMDisposeBuilder(gallivm->builder);
3579    gallivm->builder = old_builder;
3580
3581    gallivm_verify_function(gallivm, function);
3582 }
3583
3584
3585 /**
3586  * Call the matching function for texture sampling.
3587  * If there's no match, generate a new one.
3588  */
3589 static void
3590 lp_build_sample_soa_func(struct gallivm_state *gallivm,
3591                          const struct lp_static_texture_state *static_texture_state,
3592                          const struct lp_static_sampler_state *static_sampler_state,
3593                          struct lp_sampler_dynamic_state *dynamic_state,
3594                          const struct lp_sampler_params *params,
3595                          int texture_index, int sampler_index,
3596                          LLVMValueRef *tex_ret)
3597 {
3598    LLVMBuilderRef builder = gallivm->builder;
3599    LLVMModuleRef module = LLVMGetGlobalParent(LLVMGetBasicBlockParent(
3600                              LLVMGetInsertBlock(builder)));
3601    LLVMValueRef function, inst;
3602    LLVMValueRef args[LP_MAX_TEX_FUNC_ARGS];
3603    LLVMBasicBlockRef bb;
3604    unsigned num_args = 0;
3605    char func_name[64];
3606    unsigned i, num_coords, num_derivs, num_offsets, layer;
3607    unsigned sample_key = params->sample_key;
3608    const LLVMValueRef *coords = params->coords;
3609    const LLVMValueRef *offsets = params->offsets;
3610    const struct lp_derivatives *derivs = params->derivs;
3611    enum lp_sampler_lod_control lod_control;
3612    enum lp_sampler_op_type op_type;
3613    boolean need_cache = FALSE;
3614
3615    lod_control = (sample_key & LP_SAMPLER_LOD_CONTROL_MASK) >>
3616                     LP_SAMPLER_LOD_CONTROL_SHIFT;
3617
3618    op_type = (sample_key & LP_SAMPLER_OP_TYPE_MASK) >>
3619                     LP_SAMPLER_OP_TYPE_SHIFT;
3620
3621    get_target_info(static_texture_state->target,
3622                    &num_coords, &num_derivs, &num_offsets, &layer);
3623
3624    /* lod query doesn't take a layer */
3625    if (layer && op_type == LP_SAMPLER_OP_LODQ)
3626       layer = 0;
3627
3628    if (dynamic_state->cache_ptr) {
3629       const struct util_format_description *format_desc;
3630       format_desc = util_format_description(static_texture_state->format);
3631       if (format_desc && format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
3632          need_cache = TRUE;
3633       }
3634    }
3635    /*
3636     * texture function matches are found by name.
3637     * Thus the name has to include both the texture and sampler unit
3638     * (which covers all static state) plus the actual texture function
3639     * (including things like offsets, shadow coord, lod control).
3640     * Additionally lod_property has to be included too.
3641     */
3642
3643    snprintf(func_name, sizeof(func_name), "texfunc_res_%d_sam_%d_%x",
3644             texture_index, sampler_index, sample_key);
3645
3646    function = LLVMGetNamedFunction(module, func_name);
3647
3648    if(!function) {
3649       LLVMTypeRef arg_types[LP_MAX_TEX_FUNC_ARGS];
3650       LLVMTypeRef ret_type;
3651       LLVMTypeRef function_type;
3652       LLVMTypeRef val_type[4];
3653       unsigned num_param = 0;
3654
3655       /*
3656        * Generate the function prototype.
3657        */
3658
3659       arg_types[num_param++] = LLVMTypeOf(params->context_ptr);
3660       if (need_cache) {
3661          arg_types[num_param++] = LLVMTypeOf(params->thread_data_ptr);
3662       }
3663       for (i = 0; i < num_coords; i++) {
3664          arg_types[num_param++] = LLVMTypeOf(coords[0]);
3665          assert(LLVMTypeOf(coords[0]) == LLVMTypeOf(coords[i]));
3666       }
3667       if (layer) {
3668          arg_types[num_param++] = LLVMTypeOf(coords[layer]);
3669          assert(LLVMTypeOf(coords[0]) == LLVMTypeOf(coords[layer]));
3670       }
3671       if (sample_key & LP_SAMPLER_SHADOW) {
3672          arg_types[num_param++] = LLVMTypeOf(coords[0]);
3673       }
3674       if (sample_key & LP_SAMPLER_FETCH_MS) {
3675          arg_types[num_param++] = LLVMTypeOf(params->ms_index);
3676       }
3677       if (sample_key & LP_SAMPLER_OFFSETS) {
3678          for (i = 0; i < num_offsets; i++) {
3679             arg_types[num_param++] = LLVMTypeOf(offsets[0]);
3680             assert(LLVMTypeOf(offsets[0]) == LLVMTypeOf(offsets[i]));
3681          }
3682       }
3683       if (lod_control == LP_SAMPLER_LOD_BIAS ||
3684           lod_control == LP_SAMPLER_LOD_EXPLICIT) {
3685          arg_types[num_param++] = LLVMTypeOf(params->lod);
3686       }
3687       else if (lod_control == LP_SAMPLER_LOD_DERIVATIVES) {
3688          for (i = 0; i < num_derivs; i++) {
3689             arg_types[num_param++] = LLVMTypeOf(derivs->ddx[i]);
3690             arg_types[num_param++] = LLVMTypeOf(derivs->ddy[i]);
3691             assert(LLVMTypeOf(derivs->ddx[0]) == LLVMTypeOf(derivs->ddx[i]));
3692             assert(LLVMTypeOf(derivs->ddy[0]) == LLVMTypeOf(derivs->ddy[i]));
3693          }
3694       }
3695
3696       val_type[0] = val_type[1] = val_type[2] = val_type[3] =
3697          lp_build_vec_type(gallivm, params->type);
3698       ret_type = LLVMStructTypeInContext(gallivm->context, val_type, 4, 0);
3699       function_type = LLVMFunctionType(ret_type, arg_types, num_param, 0);
3700       function = LLVMAddFunction(module, func_name, function_type);
3701
3702       for (i = 0; i < num_param; ++i) {
3703          if(LLVMGetTypeKind(arg_types[i]) == LLVMPointerTypeKind) {
3704
3705             lp_add_function_attr(function, i + 1, LP_FUNC_ATTR_NOALIAS);
3706          }
3707       }
3708
3709       LLVMSetFunctionCallConv(function, LLVMFastCallConv);
3710       LLVMSetLinkage(function, LLVMInternalLinkage);
3711
3712       lp_build_sample_gen_func(gallivm,
3713                                static_texture_state,
3714                                static_sampler_state,
3715                                dynamic_state,
3716                                params->type,
3717                                texture_index,
3718                                sampler_index,
3719                                function,
3720                                num_param,
3721                                sample_key);
3722    }
3723
3724    num_args = 0;
3725    args[num_args++] = params->context_ptr;
3726    if (need_cache) {
3727       args[num_args++] = params->thread_data_ptr;
3728    }
3729    for (i = 0; i < num_coords; i++) {
3730       args[num_args++] = coords[i];
3731    }
3732    if (layer) {
3733       args[num_args++] = coords[layer];
3734    }
3735    if (sample_key & LP_SAMPLER_SHADOW) {
3736       args[num_args++] = coords[4];
3737    }
3738    if (sample_key & LP_SAMPLER_FETCH_MS) {
3739       args[num_args++] = params->ms_index;
3740    }
3741    if (sample_key & LP_SAMPLER_OFFSETS) {
3742       for (i = 0; i < num_offsets; i++) {
3743          args[num_args++] = offsets[i];
3744       }
3745    }
3746    if (lod_control == LP_SAMPLER_LOD_BIAS ||
3747        lod_control == LP_SAMPLER_LOD_EXPLICIT) {
3748       args[num_args++] = params->lod;
3749    }
3750    else if (lod_control == LP_SAMPLER_LOD_DERIVATIVES) {
3751       for (i = 0; i < num_derivs; i++) {
3752          args[num_args++] = derivs->ddx[i];
3753          args[num_args++] = derivs->ddy[i];
3754       }
3755    }
3756
3757    assert(num_args <= LP_MAX_TEX_FUNC_ARGS);
3758
3759    *tex_ret = LLVMBuildCall(builder, function, args, num_args, "");
3760    bb = LLVMGetInsertBlock(builder);
3761    inst = LLVMGetLastInstruction(bb);
3762    LLVMSetInstructionCallConv(inst, LLVMFastCallConv);
3763
3764 }
3765
3766
3767 /**
3768  * Build texture sampling code.
3769  * Either via a function call or inline it directly.
3770  */
3771 void
3772 lp_build_sample_soa(const struct lp_static_texture_state *static_texture_state,
3773                     const struct lp_static_sampler_state *static_sampler_state,
3774                     struct lp_sampler_dynamic_state *dynamic_state,
3775                     struct gallivm_state *gallivm,
3776                     const struct lp_sampler_params *params)
3777 {
3778    boolean use_tex_func = FALSE;
3779
3780    /*
3781     * Do not use a function call if the sampling is "simple enough".
3782     * We define this by
3783     * a) format
3784     * b) no mips (either one level only or no mip filter)
3785     * No mips will definitely make the code smaller, though
3786     * the format requirement is a bit iffy - there's some (SoA) formats
3787     * which definitely generate less code. This does happen to catch
3788     * some important cases though which are hurt quite a bit by using
3789     * a call (though not really because of the call overhead but because
3790     * they are reusing the same texture unit with some of the same
3791     * parameters).
3792     * Ideally we'd let llvm recognize this stuff by doing IPO passes.
3793     */
3794
3795    if (USE_TEX_FUNC_CALL) {
3796       const struct util_format_description *format_desc;
3797       boolean simple_format;
3798       boolean simple_tex;
3799       enum lp_sampler_op_type op_type;
3800       format_desc = util_format_description(static_texture_state->format);
3801       simple_format = !format_desc ||
3802                          (util_format_is_rgba8_variant(format_desc) &&
3803                           format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB);
3804
3805       op_type = (params->sample_key & LP_SAMPLER_OP_TYPE_MASK) >>
3806                     LP_SAMPLER_OP_TYPE_SHIFT;
3807       simple_tex =
3808          op_type != LP_SAMPLER_OP_TEXTURE ||
3809            ((static_sampler_state->min_mip_filter == PIPE_TEX_MIPFILTER_NONE ||
3810              static_texture_state->level_zero_only == TRUE) &&
3811             static_sampler_state->min_img_filter == static_sampler_state->mag_img_filter);
3812
3813       use_tex_func = format_desc && !(simple_format && simple_tex);
3814    }
3815
3816    if (use_tex_func) {
3817       LLVMValueRef tex_ret;
3818       lp_build_sample_soa_func(gallivm,
3819                                static_texture_state,
3820                                static_sampler_state,
3821                                dynamic_state,
3822                                params, params->texture_index, params->sampler_index, &tex_ret);
3823
3824       for (unsigned i = 0; i < 4; i++) {
3825          params->texel[i] = LLVMBuildExtractValue(gallivm->builder, tex_ret, i, "");
3826       }
3827    }
3828    else {
3829       lp_build_sample_soa_code(gallivm,
3830                                static_texture_state,
3831                                static_sampler_state,
3832                                dynamic_state,
3833                                params->type,
3834                                params->sample_key,
3835                                params->texture_index,
3836                                params->sampler_index,
3837                                params->context_ptr,
3838                                params->thread_data_ptr,
3839                                params->coords,
3840                                params->offsets,
3841                                params->derivs,
3842                                params->lod,
3843                                params->ms_index,
3844                                params->texel);
3845    }
3846 }
3847
3848
3849 void
3850 lp_build_size_query_soa(struct gallivm_state *gallivm,
3851                         const struct lp_static_texture_state *static_state,
3852                         struct lp_sampler_dynamic_state *dynamic_state,
3853                         const struct lp_sampler_size_query_params *params)
3854 {
3855    LLVMValueRef lod, level = 0, size;
3856    LLVMValueRef first_level = NULL;
3857    int dims, i;
3858    boolean has_array;
3859    unsigned num_lods = 1;
3860    struct lp_build_context bld_int_vec4;
3861    LLVMValueRef context_ptr = params->context_ptr;
3862    unsigned texture_unit = params->texture_unit;
3863    unsigned target = params->target;
3864    LLVMValueRef texture_unit_offset = params->texture_unit_offset;
3865
3866    if (static_state->format == PIPE_FORMAT_NONE) {
3867       /*
3868        * If there's nothing bound, format is NONE, and we must return
3869        * all zero as mandated by d3d10 in this case.
3870        */
3871       unsigned chan;
3872       LLVMValueRef zero = lp_build_const_vec(gallivm, params->int_type, 0.0F);
3873       for (chan = 0; chan < 4; chan++) {
3874          params->sizes_out[chan] = zero;
3875       }
3876       return;
3877    }
3878
3879    /*
3880     * Do some sanity verification about bound texture and shader dcl target.
3881     * Not entirely sure what's possible but assume array/non-array
3882     * always compatible (probably not ok for OpenGL but d3d10 has no
3883     * distinction of arrays at the resource level).
3884     * Everything else looks bogus (though not entirely sure about rect/2d).
3885     * Currently disabled because it causes assertion failures if there's
3886     * nothing bound (or rather a dummy texture, not that this case would
3887     * return the right values).
3888     */
3889    if (0 && static_state->target != target) {
3890       if (static_state->target == PIPE_TEXTURE_1D)
3891          assert(target == PIPE_TEXTURE_1D_ARRAY);
3892       else if (static_state->target == PIPE_TEXTURE_1D_ARRAY)
3893          assert(target == PIPE_TEXTURE_1D);
3894       else if (static_state->target == PIPE_TEXTURE_2D)
3895          assert(target == PIPE_TEXTURE_2D_ARRAY);
3896       else if (static_state->target == PIPE_TEXTURE_2D_ARRAY)
3897          assert(target == PIPE_TEXTURE_2D);
3898       else if (static_state->target == PIPE_TEXTURE_CUBE)
3899          assert(target == PIPE_TEXTURE_CUBE_ARRAY);
3900       else if (static_state->target == PIPE_TEXTURE_CUBE_ARRAY)
3901          assert(target == PIPE_TEXTURE_CUBE);
3902       else
3903          assert(0);
3904    }
3905
3906    dims = texture_dims(target);
3907
3908    switch (target) {
3909    case PIPE_TEXTURE_1D_ARRAY:
3910    case PIPE_TEXTURE_2D_ARRAY:
3911    case PIPE_TEXTURE_CUBE_ARRAY:
3912       has_array = TRUE;
3913       break;
3914    default:
3915       has_array = FALSE;
3916       break;
3917    }
3918
3919    assert(!params->int_type.floating);
3920
3921    lp_build_context_init(&bld_int_vec4, gallivm, lp_type_int_vec(32, 128));
3922
3923    if (params->samples_only) {
3924       params->sizes_out[0] = lp_build_broadcast(gallivm, lp_build_vec_type(gallivm, params->int_type),
3925                                                 dynamic_state->num_samples(dynamic_state, gallivm,
3926                                                                            context_ptr, texture_unit,
3927                                                                            texture_unit_offset));
3928       return;
3929    }
3930    if (params->explicit_lod) {
3931       /* FIXME: this needs to honor per-element lod */
3932       lod = LLVMBuildExtractElement(gallivm->builder, params->explicit_lod,
3933                                     lp_build_const_int32(gallivm, 0), "");
3934       first_level = dynamic_state->first_level(dynamic_state, gallivm,
3935                                                context_ptr, texture_unit, texture_unit_offset);
3936       level = LLVMBuildAdd(gallivm->builder, lod, first_level, "level");
3937       lod = lp_build_broadcast_scalar(&bld_int_vec4, level);
3938    } else {
3939       lod = bld_int_vec4.zero;
3940    }
3941
3942    size = bld_int_vec4.undef;
3943
3944    size = LLVMBuildInsertElement(gallivm->builder, size,
3945                                  dynamic_state->width(dynamic_state, gallivm,
3946                                                       context_ptr, texture_unit, texture_unit_offset),
3947                                  lp_build_const_int32(gallivm, 0), "");
3948
3949    if (dims >= 2) {
3950       size = LLVMBuildInsertElement(gallivm->builder, size,
3951                                     dynamic_state->height(dynamic_state, gallivm,
3952                                                           context_ptr, texture_unit, texture_unit_offset),
3953                                     lp_build_const_int32(gallivm, 1), "");
3954    }
3955
3956    if (dims >= 3) {
3957       size = LLVMBuildInsertElement(gallivm->builder, size,
3958                                     dynamic_state->depth(dynamic_state, gallivm,
3959                                                          context_ptr, texture_unit, texture_unit_offset),
3960                                     lp_build_const_int32(gallivm, 2), "");
3961    }
3962
3963    size = lp_build_minify(&bld_int_vec4, size, lod, TRUE);
3964
3965    if (has_array) {
3966       LLVMValueRef layers = dynamic_state->depth(dynamic_state, gallivm,
3967                                                  context_ptr, texture_unit, texture_unit_offset);
3968       if (target == PIPE_TEXTURE_CUBE_ARRAY) {
3969          /*
3970           * It looks like GL wants number of cubes, d3d10.1 has it undefined?
3971           * Could avoid this by passing in number of cubes instead of total
3972           * number of layers (might make things easier elsewhere too).
3973           */
3974          LLVMValueRef six = lp_build_const_int32(gallivm, 6);
3975          layers = LLVMBuildSDiv(gallivm->builder, layers, six, "");
3976       }
3977       size = LLVMBuildInsertElement(gallivm->builder, size, layers,
3978                                     lp_build_const_int32(gallivm, dims), "");
3979    }
3980
3981    /*
3982     * d3d10 requires zero for x/y/z values (but not w, i.e. mip levels)
3983     * if level is out of bounds (note this can't cover unbound texture
3984     * here, which also requires returning zero).
3985     */
3986    if (params->explicit_lod && params->is_sviewinfo) {
3987       LLVMValueRef last_level, out, out1;
3988       struct lp_build_context leveli_bld;
3989
3990       /* everything is scalar for now */
3991       lp_build_context_init(&leveli_bld, gallivm, lp_type_int_vec(32, 32));
3992       last_level = dynamic_state->last_level(dynamic_state, gallivm,
3993                                              context_ptr, texture_unit, texture_unit_offset);
3994
3995       out = lp_build_cmp(&leveli_bld, PIPE_FUNC_LESS, level, first_level);
3996       out1 = lp_build_cmp(&leveli_bld, PIPE_FUNC_GREATER, level, last_level);
3997       out = lp_build_or(&leveli_bld, out, out1);
3998       if (num_lods == 1) {
3999          out = lp_build_broadcast_scalar(&bld_int_vec4, out);
4000       }
4001       else {
4002          /* TODO */
4003          assert(0);
4004       }
4005       size = lp_build_andnot(&bld_int_vec4, size, out);
4006    }
4007    for (i = 0; i < dims + (has_array ? 1 : 0); i++) {
4008       params->sizes_out[i] = lp_build_extract_broadcast(gallivm, bld_int_vec4.type, params->int_type,
4009                                                 size,
4010                                                 lp_build_const_int32(gallivm, i));
4011    }
4012    if (params->is_sviewinfo) {
4013       for (; i < 4; i++) {
4014          params->sizes_out[i] = lp_build_const_vec(gallivm, params->int_type, 0.0);
4015       }
4016    }
4017
4018    /*
4019     * if there's no explicit_lod (buffers, rects) queries requiring nr of
4020     * mips would be illegal.
4021     */
4022    if (params->is_sviewinfo && params->explicit_lod) {
4023       struct lp_build_context bld_int_scalar;
4024       LLVMValueRef num_levels;
4025       lp_build_context_init(&bld_int_scalar, gallivm, lp_type_int(32));
4026
4027       if (static_state->level_zero_only) {
4028          num_levels = bld_int_scalar.one;
4029       }
4030       else {
4031          LLVMValueRef last_level;
4032
4033          last_level = dynamic_state->last_level(dynamic_state, gallivm,
4034                                                 context_ptr, texture_unit, texture_unit_offset);
4035          num_levels = lp_build_sub(&bld_int_scalar, last_level, first_level);
4036          num_levels = lp_build_add(&bld_int_scalar, num_levels, bld_int_scalar.one);
4037       }
4038       params->sizes_out[3] = lp_build_broadcast(gallivm, lp_build_vec_type(gallivm, params->int_type),
4039                                         num_levels);
4040    }
4041 }
4042
4043 static void
4044 lp_build_do_atomic_soa(struct gallivm_state *gallivm,
4045                        const struct util_format_description *format_desc,
4046                        struct lp_type type,
4047                        LLVMValueRef exec_mask,
4048                        LLVMValueRef base_ptr,
4049                        LLVMValueRef offset,
4050                        LLVMValueRef out_of_bounds,
4051                        unsigned img_op,
4052                        LLVMAtomicRMWBinOp op,
4053                        const LLVMValueRef rgba_in[4],
4054                        const LLVMValueRef rgba2_in[4],
4055                        LLVMValueRef atomic_result[4])
4056 {
4057    enum pipe_format format = format_desc->format;
4058
4059    if (format != PIPE_FORMAT_R32_UINT && format != PIPE_FORMAT_R32_SINT && format != PIPE_FORMAT_R32_FLOAT) {
4060       atomic_result[0] = lp_build_zero(gallivm, type);
4061       return;
4062    }
4063
4064    LLVMValueRef atom_res = lp_build_alloca(gallivm,
4065                                            LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), type.length), "");
4066
4067    offset = LLVMBuildGEP(gallivm->builder, base_ptr, &offset, 1, "");
4068    struct lp_build_loop_state loop_state;
4069    lp_build_loop_begin(&loop_state, gallivm, lp_build_const_int32(gallivm, 0));
4070    struct lp_build_if_state ifthen;
4071    LLVMValueRef cond;
4072    LLVMValueRef packed = rgba_in[0], packed2 = rgba2_in[0];
4073
4074    LLVMValueRef should_store_mask = LLVMBuildAnd(gallivm->builder, exec_mask, LLVMBuildNot(gallivm->builder, out_of_bounds, ""), "store_mask");
4075    assert(exec_mask);
4076
4077    cond = LLVMBuildICmp(gallivm->builder, LLVMIntNE, should_store_mask, lp_build_const_int_vec(gallivm, type, 0), "");
4078    cond = LLVMBuildExtractElement(gallivm->builder, cond, loop_state.counter, "");
4079    lp_build_if(&ifthen, gallivm, cond);
4080
4081    LLVMValueRef data = LLVMBuildExtractElement(gallivm->builder, packed, loop_state.counter, "");
4082    LLVMValueRef cast_base_ptr = LLVMBuildExtractElement(gallivm->builder, offset, loop_state.counter, "");
4083    cast_base_ptr = LLVMBuildBitCast(gallivm->builder, cast_base_ptr, LLVMPointerType(LLVMInt32TypeInContext(gallivm->context), 0), "");
4084    data = LLVMBuildBitCast(gallivm->builder, data, LLVMInt32TypeInContext(gallivm->context), "");
4085
4086    if (img_op == LP_IMG_ATOMIC_CAS) {
4087       LLVMValueRef cas_src_ptr = LLVMBuildExtractElement(gallivm->builder, packed2, loop_state.counter, "");
4088       LLVMValueRef cas_src = LLVMBuildBitCast(gallivm->builder, cas_src_ptr, LLVMInt32TypeInContext(gallivm->context), "");
4089       data = LLVMBuildAtomicCmpXchg(gallivm->builder, cast_base_ptr, data,
4090                                     cas_src,
4091                                     LLVMAtomicOrderingSequentiallyConsistent,
4092                                     LLVMAtomicOrderingSequentiallyConsistent,
4093                                     false);
4094       data = LLVMBuildExtractValue(gallivm->builder, data, 0, "");
4095    } else {
4096       data = LLVMBuildAtomicRMW(gallivm->builder, op,
4097                                 cast_base_ptr, data,
4098                                 LLVMAtomicOrderingSequentiallyConsistent,
4099                                 false);
4100    }
4101
4102    LLVMValueRef temp_res = LLVMBuildLoad(gallivm->builder, atom_res, "");
4103    temp_res = LLVMBuildInsertElement(gallivm->builder, temp_res, data, loop_state.counter, "");
4104    LLVMBuildStore(gallivm->builder, temp_res, atom_res);
4105
4106    lp_build_endif(&ifthen);
4107    lp_build_loop_end_cond(&loop_state, lp_build_const_int32(gallivm, type.length),
4108                           NULL, LLVMIntUGE);
4109    atomic_result[0] = LLVMBuildLoad(gallivm->builder, atom_res, "");
4110 }
4111
4112 static void
4113 lp_build_img_op_no_format(struct gallivm_state *gallivm,
4114                           const struct lp_img_params *params,
4115                           LLVMValueRef outdata[4])
4116 {
4117    /*
4118     * If there's nothing bound, format is NONE, and we must return
4119     * all zero as mandated by d3d10 in this case.
4120        */
4121    if (params->img_op != LP_IMG_STORE) {
4122       LLVMValueRef zero = lp_build_zero(gallivm, params->type);
4123       for (unsigned chan = 0; chan < (params->img_op == LP_IMG_LOAD ? 4 : 1); chan++) {
4124          outdata[chan] = zero;
4125       }
4126    }
4127 }
4128
4129 static struct lp_type
4130 lp_build_img_texel_type(struct gallivm_state *gallivm,
4131                         struct lp_type texel_type,
4132                         const struct util_format_description *format_desc)
4133 {
4134    if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB &&
4135        format_desc->channel[0].pure_integer) {
4136       if (format_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) {
4137          texel_type = lp_type_int_vec(texel_type.width, texel_type.width * texel_type.length);
4138       } else if (format_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED) {
4139          texel_type = lp_type_uint_vec(texel_type.width, texel_type.width * texel_type.length);
4140       }
4141    }
4142    return texel_type;
4143 }
4144
/**
 * Emit the LLVM IR for one image operation (load, store or atomic), SoA style.
 *
 * @param static_texture_state  static image state; only its format is read here
 * @param dynamic_state         callbacks used to fetch the image's row/image
 *                              strides, base pointer, width/height/depth and,
 *                              when a sample index is given, the sample count
 *                              and sample stride
 * @param gallivm               code generation state
 * @param params                description of the operation; this function
 *                              reads target, type, coords[0..2], ms_index,
 *                              img_op, op, exec_mask, indata, indata2,
 *                              context_ptr and image_index
 * @param outdata               result values: four channels for LP_IMG_LOAD,
 *                              outdata[0] for atomics; not written for stores
 *
 * When the format is PIPE_FORMAT_NONE the work is delegated to
 * lp_build_img_op_no_format() and nothing else is emitted.
 *
 * Otherwise a per-lane out-of-bounds mask is built from the coordinates (and
 * the sample index, if any).  Loads read from offset 0 for out-of-bounds
 * lanes and then force those lanes to zero; stores and atomics hand the mask
 * to their helper together with the execution mask.
 */
void
lp_build_img_op_soa(const struct lp_static_texture_state *static_texture_state,
                    struct lp_sampler_dynamic_state *dynamic_state,
                    struct gallivm_state *gallivm,
                    const struct lp_img_params *params,
                    LLVMValueRef outdata[4])
{
   unsigned target = params->target;
   unsigned dims = texture_dims(target);
   /*
    * int_type is a 32-bit int type, int_coord_type is the integer type
    * derived from params->type that all coordinate math below is done in.
    * Note that int_bld is only initialised here and not used afterwards.
    */
   struct lp_type int_type, int_coord_type;
   struct lp_build_context int_bld, int_coord_bld;
   const struct util_format_description *format_desc = util_format_description(static_texture_state->format);
   LLVMValueRef x = params->coords[0], y = params->coords[1], z = params->coords[2];
   LLVMValueRef ms_index = params->ms_index;
   LLVMValueRef row_stride_vec = NULL, img_stride_vec = NULL;
   int_type = lp_type_int(32);
   int_coord_type = lp_int_type(params->type);
   lp_build_context_init(&int_bld, gallivm, int_type);
   lp_build_context_init(&int_coord_bld, gallivm, int_coord_type);

   /* Nothing bound: produce the mandated default result and stop. */
   if (static_texture_state->format == PIPE_FORMAT_NONE) {
      lp_build_img_op_no_format(gallivm, params, outdata);
      return;
   }
   LLVMValueRef offset, i, j;

   /* Fetch the image layout for this image index via the dynamic state. */
   LLVMValueRef row_stride = dynamic_state->row_stride(dynamic_state, gallivm,
                                                       params->context_ptr, params->image_index, NULL);
   LLVMValueRef img_stride = dynamic_state->img_stride(dynamic_state, gallivm,
                                                       params->context_ptr, params->image_index, NULL);
   LLVMValueRef base_ptr = dynamic_state->base_ptr(dynamic_state, gallivm,
                                                   params->context_ptr, params->image_index, NULL);
   LLVMValueRef width = dynamic_state->width(dynamic_state, gallivm,
                                                params->context_ptr, params->image_index, NULL);
   LLVMValueRef height = dynamic_state->height(dynamic_state, gallivm,
                                               params->context_ptr, params->image_index, NULL);
   LLVMValueRef depth = dynamic_state->depth(dynamic_state, gallivm,
                                              params->context_ptr, params->image_index, NULL);
   /* Sample count and stride are only needed when a sample index is given. */
   LLVMValueRef num_samples = NULL, sample_stride = NULL;
   if (ms_index) {
      num_samples = dynamic_state->num_samples(dynamic_state, gallivm,
                                               params->context_ptr, params->image_index, NULL);
      sample_stride = dynamic_state->sample_stride(dynamic_state, gallivm,
                                                   params->context_ptr, params->image_index, NULL);
   }

   boolean layer_coord = has_layer_coord(target);

   /*
    * Broadcast the scalar sizes/strides to coordinate vectors.  The stride
    * vectors stay NULL for dimensions the target does not have.
    */
   width = lp_build_broadcast_scalar(&int_coord_bld, width);
   if (dims >= 2) {
      height = lp_build_broadcast_scalar(&int_coord_bld, height);
      row_stride_vec = lp_build_broadcast_scalar(&int_coord_bld, row_stride);
   }
   if (dims >= 3 || layer_coord) {
      depth = lp_build_broadcast_scalar(&int_coord_bld, depth);
      img_stride_vec = lp_build_broadcast_scalar(&int_coord_bld, img_stride);
   }

   /*
    * Accumulate the out-of-bounds mask: a lane is flagged when any of its
    * coordinates is >= the corresponding size.
    *
    * NOTE(review): the comparisons are done in int_coord_bld, which is built
    * from lp_int_type(); if that is a signed type, negative coordinates would
    * not be flagged here -- confirm against the callers.
    * NOTE(review): z is only compared with depth when dims >= 3, so the layer
    * of a layered target (layer_coord set with dims < 3) does not appear to
    * be range checked here -- TODO confirm whether it is clamped elsewhere.
    */
   LLVMValueRef out_of_bounds = int_coord_bld.zero;
   LLVMValueRef out1;
   out1 = lp_build_cmp(&int_coord_bld, PIPE_FUNC_GEQUAL, x, width);
   out_of_bounds = lp_build_or(&int_coord_bld, out_of_bounds, out1);

   if (dims >= 2) {
      out1 = lp_build_cmp(&int_coord_bld, PIPE_FUNC_GEQUAL, y, height);
      out_of_bounds = lp_build_or(&int_coord_bld, out_of_bounds, out1);
   }
   if (dims >= 3) {
      out1 = lp_build_cmp(&int_coord_bld, PIPE_FUNC_GEQUAL, z, depth);
      out_of_bounds = lp_build_or(&int_coord_bld, out_of_bounds, out1);
   }
   /*
    * Turn the coordinates into a per-lane offset; i and j are additional
    * outputs of the helper (presumably sub-block coordinates -- verify
    * against lp_build_sample_offset) and are only consumed by the load path.
    */
   lp_build_sample_offset(&int_coord_bld,
                          format_desc,
                          x, y, z, row_stride_vec, img_stride_vec,
                          &offset, &i, &j);

   if (ms_index) {
      /* The sample index must be below the sample count ... */
      out1 = lp_build_cmp(&int_coord_bld, PIPE_FUNC_GEQUAL, ms_index, lp_build_broadcast_scalar(&int_coord_bld, num_samples));
      out_of_bounds = lp_build_or(&int_coord_bld, out_of_bounds, out1);

      /* ... and selects the sample by adding sample_stride * ms_index. */
      offset = lp_build_add(&int_coord_bld, offset,
                            lp_build_mul(&int_coord_bld, lp_build_broadcast_scalar(&int_coord_bld, sample_stride),
                                         ms_index));
   }
   if (params->img_op == LP_IMG_LOAD) {
      struct lp_type texel_type = lp_build_img_texel_type(gallivm, params->type, format_desc);

      /* Out-of-bounds lanes fetch from offset 0 instead of their real offset. */
      offset = lp_build_andnot(&int_coord_bld, offset, out_of_bounds);
      struct lp_build_context texel_bld;
      lp_build_context_init(&texel_bld, gallivm, texel_type);
      lp_build_fetch_rgba_soa(gallivm,
                              format_desc,
                              texel_type, TRUE,
                              base_ptr, offset,
                              i, j,
                              NULL,
                              outdata);

      /* Whatever was fetched for out-of-bounds lanes is replaced by zero. */
      for (unsigned chan = 0; chan < 4; chan++) {
         outdata[chan] = lp_build_select(&texel_bld, out_of_bounds,
                                         texel_bld.zero, outdata[chan]);
      }
   } else if (params->img_op == LP_IMG_STORE) {
      /* The store helper receives both masks; outdata is not involved. */
      lp_build_store_rgba_soa(gallivm, format_desc, params->type, params->exec_mask, base_ptr, offset, out_of_bounds,
                              params->indata);
   } else {
      /* Every remaining img_op is treated as an atomic. */
      lp_build_do_atomic_soa(gallivm, format_desc, params->type, params->exec_mask, base_ptr, offset, out_of_bounds,
                             params->img_op, params->op, params->indata, params->indata2, outdata);
   }
}
4256
4257 /*
 * These functions are for indirect texture access support.
4259  *
4260  * Indirect textures are implemented using a switch statement, that
4261  * takes the texture index and jumps to the sampler functions for
4262  * that texture unit.
4263  */
4264
4265 /*
4266  * Initialise an indexed sampler switch block.
4267  *
4268  * This sets up the switch_info state and adds the LLVM flow control pieces.
4269  */
4270 void
4271 lp_build_sample_array_init_soa(struct lp_build_sample_array_switch *switch_info,
4272                            struct gallivm_state *gallivm,
4273                            const struct lp_sampler_params *params,
4274                            LLVMValueRef idx,
4275                            unsigned base, unsigned range)
4276 {
4277    switch_info->gallivm = gallivm;
4278    switch_info->params = *params;
4279    switch_info->base = base;
4280    switch_info->range = range;
4281
4282    /* for generating the switch functions we don't want the texture index offset */
4283    switch_info->params.texture_index_offset = 0;
4284
4285    LLVMBasicBlockRef initial_block = LLVMGetInsertBlock(gallivm->builder);
4286    switch_info->merge_ref = lp_build_insert_new_block(gallivm, "texmerge");
4287
4288    switch_info->switch_ref = LLVMBuildSwitch(gallivm->builder, idx,
4289                                              switch_info->merge_ref, range - base);
4290
4291    LLVMTypeRef val_type[4];
4292    val_type[0] = val_type[1] = val_type[2] = val_type[3] =
4293       lp_build_vec_type(gallivm, params->type);
4294    LLVMTypeRef ret_type = LLVMStructTypeInContext(gallivm->context, val_type, 4, 0);
4295
4296    LLVMValueRef undef_val = LLVMGetUndef(ret_type);
4297
4298    LLVMPositionBuilderAtEnd(gallivm->builder, switch_info->merge_ref);
4299
4300    switch_info->phi = LLVMBuildPhi(gallivm->builder, ret_type, "");
4301    LLVMAddIncoming(switch_info->phi, &undef_val, &initial_block, 1);
4302 }
4303
4304 /*
4305  * Add an individual entry to the indirect texture switch.
4306  *
4307  * This builds the sample function and links a case for it into the switch statement.
4308  */
4309 void
4310 lp_build_sample_array_case_soa(struct lp_build_sample_array_switch *switch_info,
4311                            int idx,
4312                            const struct lp_static_texture_state *static_texture_state,
4313                            const struct lp_static_sampler_state *static_sampler_state,
4314                            struct lp_sampler_dynamic_state *dynamic_texture_state)
4315 {
4316    struct gallivm_state *gallivm = switch_info->gallivm;
4317    LLVMBasicBlockRef this_block = lp_build_insert_new_block(gallivm, "texblock");
4318    LLVMValueRef tex_ret;
4319
4320    LLVMAddCase(switch_info->switch_ref, LLVMConstInt(LLVMInt32TypeInContext(gallivm->context), idx, 0), this_block);
4321    LLVMPositionBuilderAtEnd(gallivm->builder, this_block);
4322
4323    lp_build_sample_soa_func(gallivm, static_texture_state,
4324                             static_sampler_state, dynamic_texture_state, &switch_info->params, idx, idx,
4325                             &tex_ret);
4326
4327    LLVMAddIncoming(switch_info->phi, &tex_ret, &this_block, 1);
4328    LLVMBuildBr(gallivm->builder, switch_info->merge_ref);
4329 }
4330
4331 /*
4332  * Finish a switch statement.
4333  *
4334  * This handles extract the results from the switch.
4335  */
4336 void lp_build_sample_array_fini_soa(struct lp_build_sample_array_switch *switch_info)
4337 {
4338    struct gallivm_state *gallivm = switch_info->gallivm;
4339
4340    LLVMPositionBuilderAtEnd(gallivm->builder, switch_info->merge_ref);
4341    for (unsigned i = 0; i < 4; i++)
4342       switch_info->params.texel[i] = LLVMBuildExtractValue(gallivm->builder, switch_info->phi, i, "");
4343 }
4344
4345 void
4346 lp_build_image_op_switch_soa(struct lp_build_img_op_array_switch *switch_info,
4347                              struct gallivm_state *gallivm,
4348                              const struct lp_img_params *params,
4349                              LLVMValueRef idx,
4350                              unsigned base, unsigned range)
4351 {
4352    switch_info->gallivm = gallivm;
4353    switch_info->params = *params;
4354    switch_info->base = base;
4355    switch_info->range = range;
4356
4357    /* for generating the switch functions we don't want the texture index offset */
4358    switch_info->params.image_index_offset = 0;
4359
4360    LLVMBasicBlockRef initial_block = LLVMGetInsertBlock(gallivm->builder);
4361    switch_info->merge_ref = lp_build_insert_new_block(gallivm, "imgmerge");
4362
4363    switch_info->switch_ref = LLVMBuildSwitch(gallivm->builder, idx,
4364                                              switch_info->merge_ref, range - base);
4365
4366    if (params->img_op != LP_IMG_STORE) {
4367       LLVMTypeRef ret_type = lp_build_vec_type(gallivm, params->type);
4368       LLVMValueRef undef_val = LLVMGetUndef(ret_type);
4369
4370       LLVMPositionBuilderAtEnd(gallivm->builder, switch_info->merge_ref);
4371
4372       for (unsigned i = 0; i < ((params->img_op == LP_IMG_LOAD) ? 4 : 1); i++) {
4373          switch_info->phi[i] = LLVMBuildPhi(gallivm->builder, ret_type, "");
4374          LLVMAddIncoming(switch_info->phi[i], &undef_val, &initial_block, 1);
4375       }
4376    }
4377 }
4378
4379 void
4380 lp_build_image_op_array_case(struct lp_build_img_op_array_switch *switch_info,
4381                             int idx,
4382                             const struct lp_static_texture_state *static_texture_state,
4383                             struct lp_sampler_dynamic_state *dynamic_state)
4384 {
4385    struct gallivm_state *gallivm = switch_info->gallivm;
4386    LLVMBasicBlockRef this_block = lp_build_insert_new_block(gallivm, "img");
4387    LLVMValueRef tex_ret[4];
4388
4389    LLVMAddCase(switch_info->switch_ref, lp_build_const_int32(gallivm, idx), this_block);
4390    LLVMPositionBuilderAtEnd(gallivm->builder, this_block);
4391
4392    switch_info->params.image_index = idx;
4393
4394    lp_build_img_op_soa(static_texture_state, dynamic_state, switch_info->gallivm, &switch_info->params, tex_ret);
4395    if (switch_info->params.img_op != LP_IMG_STORE) {
4396       for (unsigned i = 0; i < ((switch_info->params.img_op == LP_IMG_LOAD) ? 4 : 1); i++)
4397          tex_ret[i] = LLVMBuildBitCast(gallivm->builder, tex_ret[i], lp_build_vec_type(gallivm, switch_info->params.type), "");
4398
4399       this_block = LLVMGetInsertBlock(gallivm->builder);
4400       for (unsigned i = 0; i < ((switch_info->params.img_op == LP_IMG_LOAD) ? 4 : 1); i++) {
4401          LLVMAddIncoming(switch_info->phi[i], &tex_ret[i], &this_block, 1);
4402       }
4403    }
4404    LLVMBuildBr(gallivm->builder, switch_info->merge_ref);
4405 }
4406
4407 void lp_build_image_op_array_fini_soa(struct lp_build_img_op_array_switch *switch_info)
4408 {
4409    struct gallivm_state *gallivm = switch_info->gallivm;
4410
4411    LLVMPositionBuilderAtEnd(gallivm->builder, switch_info->merge_ref);
4412
4413    if (switch_info->params.img_op != LP_IMG_STORE) {
4414       for (unsigned i = 0; i < ((switch_info->params.img_op == LP_IMG_LOAD) ? 4 : 1); i++) {
4415          switch_info->params.outdata[i] = switch_info->phi[i];
4416       }
4417    }
4418 }