[mesa.git] src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
1 /**************************************************************************
2 *
3 * Copyright 2009 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28 /**
29 * @file
30 * Texture sampling -- SoA.
31 *
32 * @author Jose Fonseca <jfonseca@vmware.com>
33 * @author Brian Paul <brianp@vmware.com>
34 */
35
36 #include "pipe/p_defines.h"
37 #include "pipe/p_state.h"
38 #include "pipe/p_shader_tokens.h"
39 #include "util/u_debug.h"
40 #include "util/u_dump.h"
41 #include "util/u_memory.h"
42 #include "util/u_math.h"
43 #include "util/format/u_format.h"
44 #include "util/u_cpu_detect.h"
45 #include "util/format_rgb9e5.h"
46 #include "lp_bld_debug.h"
47 #include "lp_bld_type.h"
48 #include "lp_bld_const.h"
49 #include "lp_bld_conv.h"
50 #include "lp_bld_arit.h"
51 #include "lp_bld_bitarit.h"
52 #include "lp_bld_logic.h"
53 #include "lp_bld_printf.h"
54 #include "lp_bld_swizzle.h"
55 #include "lp_bld_flow.h"
56 #include "lp_bld_gather.h"
57 #include "lp_bld_format.h"
58 #include "lp_bld_sample.h"
59 #include "lp_bld_sample_aos.h"
60 #include "lp_bld_struct.h"
61 #include "lp_bld_quad.h"
62 #include "lp_bld_pack.h"
63 #include "lp_bld_intr.h"
64 #include "lp_bld_misc.h"
65
66
67 /**
68 * Generate code to fetch a texel from a texture at int coords (x, y, z).
69 * The computation depends on whether the texture is 1D, 2D or 3D.
70 * The result, texel, will be float vectors:
71 * texel[0] = red values
72 * texel[1] = green values
73 * texel[2] = blue values
74 * texel[3] = alpha values
75 */
76 static void
77 lp_build_sample_texel_soa(struct lp_build_sample_context *bld,
78 LLVMValueRef width,
79 LLVMValueRef height,
80 LLVMValueRef depth,
81 LLVMValueRef x,
82 LLVMValueRef y,
83 LLVMValueRef z,
84 LLVMValueRef y_stride,
85 LLVMValueRef z_stride,
86 LLVMValueRef data_ptr,
87 LLVMValueRef mipoffsets,
88 LLVMValueRef texel_out[4])
89 {
90 const struct lp_static_sampler_state *static_state = bld->static_sampler_state;
91 const unsigned dims = bld->dims;
92 struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
93 LLVMBuilderRef builder = bld->gallivm->builder;
94 LLVMValueRef offset;
95 LLVMValueRef i, j;
96 LLVMValueRef use_border = NULL;
97
98 /* use_border = x < 0 || x >= width || y < 0 || y >= height */
99 if (lp_sampler_wrap_mode_uses_border_color(static_state->wrap_s,
100 static_state->min_img_filter,
101 static_state->mag_img_filter)) {
102 LLVMValueRef b1, b2;
103 b1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, x, int_coord_bld->zero);
104 b2 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, x, width);
105 use_border = LLVMBuildOr(builder, b1, b2, "b1_or_b2");
106 }
107
108 if (dims >= 2 &&
109 lp_sampler_wrap_mode_uses_border_color(static_state->wrap_t,
110 static_state->min_img_filter,
111 static_state->mag_img_filter)) {
112 LLVMValueRef b1, b2;
113 b1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, y, int_coord_bld->zero);
114 b2 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, y, height);
115 if (use_border) {
116 use_border = LLVMBuildOr(builder, use_border, b1, "ub_or_b1");
117 use_border = LLVMBuildOr(builder, use_border, b2, "ub_or_b2");
118 }
119 else {
120 use_border = LLVMBuildOr(builder, b1, b2, "b1_or_b2");
121 }
122 }
123
124 if (dims == 3 &&
125 lp_sampler_wrap_mode_uses_border_color(static_state->wrap_r,
126 static_state->min_img_filter,
127 static_state->mag_img_filter)) {
128 LLVMValueRef b1, b2;
129 b1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, z, int_coord_bld->zero);
130 b2 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, z, depth);
131 if (use_border) {
132 use_border = LLVMBuildOr(builder, use_border, b1, "ub_or_b1");
133 use_border = LLVMBuildOr(builder, use_border, b2, "ub_or_b2");
134 }
135 else {
136 use_border = LLVMBuildOr(builder, b1, b2, "b1_or_b2");
137 }
138 }
139
140 /* convert x,y,z coords to linear offset from start of texture, in bytes */
141 lp_build_sample_offset(&bld->int_coord_bld,
142 bld->format_desc,
143 x, y, z, y_stride, z_stride,
144 &offset, &i, &j);
145 if (mipoffsets) {
146 offset = lp_build_add(&bld->int_coord_bld, offset, mipoffsets);
147 }
148
149 if (use_border) {
150 /* If we can sample the border color, it means that texcoords may
151 * lie outside the bounds of the texture image. We need to do
152 * something to prevent reading out of bounds and causing a segfault.
153 *
154 * Simply AND the computed texel offset with !use_border. This causes
155 * offsets for out-of-bounds coords to become zero. Offset zero is
156 * guaranteed to be inside the texture image.
157 */
158 offset = lp_build_andnot(&bld->int_coord_bld, offset, use_border);
159 }
160
161 lp_build_fetch_rgba_soa(bld->gallivm,
162 bld->format_desc,
163 bld->texel_type, TRUE,
164 data_ptr, offset,
165 i, j,
166 bld->cache,
167 texel_out);
168
169 /*
170 * Note: if we find an app which frequently samples the texture border
171 * we might want to implement a true conditional here to avoid sampling
172 * the texture whenever possible (since that's quite a bit of code).
173 * Ex:
174 * if (use_border) {
175 * texel = border_color;
176 * }
177 * else {
178 * texel = sample_texture(coord);
179 * }
180 * As it is now, we always sample the texture, then selectively replace
181 * the texel color results with the border color.
182 */
183
184 if (use_border) {
185 /* select texel color or border color depending on use_border. */
186 const struct util_format_description *format_desc = bld->format_desc;
187 int chan;
188 struct lp_type border_type = bld->texel_type;
189 border_type.length = 4;
190 /*
191 * Only replace channels which are actually present. The others would
192 * get optimized away by the sampler_view swizzle eventually anyway, and
193 * skipping them here is easier too.
194 */
195 for (chan = 0; chan < 4; chan++) {
196 unsigned chan_s;
197 /* reverse-map channel... */
198 for (chan_s = 0; chan_s < 4; chan_s++) {
199 if (chan_s == format_desc->swizzle[chan]) {
200 break;
201 }
202 }
203 if (chan_s <= 3) {
204 /* use the already clamped color */
205 LLVMValueRef idx = lp_build_const_int32(bld->gallivm, chan);
206 LLVMValueRef border_chan;
207
208 border_chan = lp_build_extract_broadcast(bld->gallivm,
209 border_type,
210 bld->texel_type,
211 bld->border_color_clamped,
212 idx);
213 texel_out[chan] = lp_build_select(&bld->texel_bld, use_border,
214 border_chan, texel_out[chan]);
215 }
216 }
217 }
218 }
219
220
221 /**
222 * Helper to compute the mirror function for the PIPE_WRAP_MIRROR_REPEAT mode.
223 * (Note that with pot sizes we could do this much more easily post-scale
224 * with some bit arithmetic; see the sketch after this function.)
225 */
226 static LLVMValueRef
227 lp_build_coord_mirror(struct lp_build_sample_context *bld,
228 LLVMValueRef coord, boolean posOnly)
229 {
230 struct lp_build_context *coord_bld = &bld->coord_bld;
231 LLVMValueRef fract;
232 LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5);
233
234 /*
235 * We can just use 2*(x - round(0.5*x)) to do all the mirroring;
236 * it all works out. (The result is in range [-1.0, 1.0], negative if
237 * the coord is in the "odd" section, otherwise positive.)
238 */
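/*
 * Worked example (for illustration): coord = 1.3 gives
 * 0.5*1.3 = 0.65, round(0.65) = 1, 2*(0.65 - 1) = -0.7 (odd section,
 * mirrored), while coord = 2.3 gives 0.5*2.3 = 1.15, round(1.15) = 1,
 * 2*(1.15 - 1) = 0.3 (even section, kept as-is).
 */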
239
240 coord = lp_build_mul(coord_bld, coord, half);
241 fract = lp_build_round(coord_bld, coord);
242 fract = lp_build_sub(coord_bld, coord, fract);
243 coord = lp_build_add(coord_bld, fract, fract);
244
245 if (posOnly) {
246 /*
247 * Theoretically it's not quite 100% accurate because the spec says
248 * that ultimately a scaled coord of -x.0 should map to int coord
249 * -x + 1 with mirroring, not -x (this does not matter for bilinear
250 * filtering).
251 */
252 coord = lp_build_abs(coord_bld, coord);
253 /* kill off NaNs */
254 /* XXX: not safe without arch rounding, fract can be anything. */
255 coord = lp_build_max_ext(coord_bld, coord, coord_bld->zero,
256 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
257 }
258
259 return coord;
260 }
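/*
 * Untested sketch (not part of the original code) of the bit-arithmetic
 * variant mentioned in the comment above, operating post-scale on an
 * integer texel coord i with power-of-two size length:
 *
 *    mirrored = i & (length - 1);
 *    if (i & length)
 *       mirrored = (length - 1) - mirrored;   (odd section: reflect)
 *
 * E.g. length = 4, i = 5: 5 & 3 = 1, 5 & 4 != 0, so 3 - 1 = 2;
 * and i = -1: -1 & 3 = 3, -1 & 4 != 0, so 3 - 3 = 0.
 */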
261
262
263 /**
264 * Helper to compute the first coord and the weight for
265 * linear wrap repeat npot textures
266 */
267 void
268 lp_build_coord_repeat_npot_linear(struct lp_build_sample_context *bld,
269 LLVMValueRef coord_f,
270 LLVMValueRef length_i,
271 LLVMValueRef length_f,
272 LLVMValueRef *coord0_i,
273 LLVMValueRef *weight_f)
274 {
275 struct lp_build_context *coord_bld = &bld->coord_bld;
276 struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
277 LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5);
278 LLVMValueRef length_minus_one = lp_build_sub(int_coord_bld, length_i,
279 int_coord_bld->one);
280 LLVMValueRef mask;
281 /* wrap with normalized floats is just fract */
282 coord_f = lp_build_fract(coord_bld, coord_f);
283 /* mul by size and subtract 0.5 */
284 coord_f = lp_build_mul(coord_bld, coord_f, length_f);
285 coord_f = lp_build_sub(coord_bld, coord_f, half);
286 /*
287 * we avoided the 0.5/length division before the repeat wrap,
288 * so now we need to fix up edge cases with selects
289 */
290 /*
291 * Note we do a float (unordered) compare so we can eliminate NaNs.
292 * (Otherwise would need fract_safe above).
293 */
294 mask = lp_build_compare(coord_bld->gallivm, coord_bld->type,
295 PIPE_FUNC_LESS, coord_f, coord_bld->zero);
296
297 /* convert to int, compute lerp weight */
298 lp_build_ifloor_fract(coord_bld, coord_f, coord0_i, weight_f);
299 *coord0_i = lp_build_select(int_coord_bld, mask, length_minus_one, *coord0_i);
300 }
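/*
 * Worked example (for illustration): length = 3, coord_f = 0.1:
 * fract = 0.1, * 3 = 0.3, - 0.5 = -0.2, so ifloor/fract give
 * coord0 = -1 with weight 0.8; the < 0 mask then selects
 * coord0 = length - 1 = 2, i.e. the wrapped-around texel 2 is blended
 * with texel 0 at weight 0.8.
 */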
301
302
303 /**
304 * Build LLVM code for texture wrap mode for linear filtering.
305 * \param x0_out returns first integer texcoord
306 * \param x1_out returns second integer texcoord
307 * \param weight_out returns linear interpolation weight
308 */
309 static void
310 lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
311 boolean is_gather,
312 LLVMValueRef coord,
313 LLVMValueRef length,
314 LLVMValueRef length_f,
315 LLVMValueRef offset,
316 boolean is_pot,
317 unsigned wrap_mode,
318 LLVMValueRef *x0_out,
319 LLVMValueRef *x1_out,
320 LLVMValueRef *weight_out)
321 {
322 struct lp_build_context *coord_bld = &bld->coord_bld;
323 struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
324 LLVMBuilderRef builder = bld->gallivm->builder;
325 LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5);
326 LLVMValueRef length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);
327 LLVMValueRef coord0, coord1, weight;
328
329 switch(wrap_mode) {
330 case PIPE_TEX_WRAP_REPEAT:
331 if (is_pot) {
332 /* mul by size and subtract 0.5 */
333 coord = lp_build_mul(coord_bld, coord, length_f);
334 coord = lp_build_sub(coord_bld, coord, half);
335 if (offset) {
336 offset = lp_build_int_to_float(coord_bld, offset);
337 coord = lp_build_add(coord_bld, coord, offset);
338 }
339 /* convert to int, compute lerp weight */
340 lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
341 coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
342 /* repeat wrap */
343 coord0 = LLVMBuildAnd(builder, coord0, length_minus_one, "");
344 coord1 = LLVMBuildAnd(builder, coord1, length_minus_one, "");
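/*
 * E.g. (for illustration) length = 8, coord = 0.02: scaled and shifted
 * to 0.16 - 0.5 = -0.34, so coord0 = -1, weight = 0.66, coord1 = 0;
 * the AND with 7 then wraps coord0 around to 7 (the last texel).
 */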
345 }
346 else {
347 LLVMValueRef mask;
348 if (offset) {
349 offset = lp_build_int_to_float(coord_bld, offset);
350 offset = lp_build_div(coord_bld, offset, length_f);
351 coord = lp_build_add(coord_bld, coord, offset);
352 }
353 lp_build_coord_repeat_npot_linear(bld, coord,
354 length, length_f,
355 &coord0, &weight);
356 mask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type,
357 PIPE_FUNC_NOTEQUAL, coord0, length_minus_one);
358 coord1 = LLVMBuildAnd(builder,
359 lp_build_add(int_coord_bld, coord0, int_coord_bld->one),
360 mask, "");
361 }
362 break;
363
364 case PIPE_TEX_WRAP_CLAMP:
365 if (bld->static_sampler_state->normalized_coords) {
366 /* scale coord to length */
367 coord = lp_build_mul(coord_bld, coord, length_f);
368 }
369 if (offset) {
370 offset = lp_build_int_to_float(coord_bld, offset);
371 coord = lp_build_add(coord_bld, coord, offset);
372 }
373
374 /*
375 * clamp to [0, length]
376 *
377 * Unlike some other wrap modes, this should be correct for gather
378 * too. GL_CLAMP explicitly does this clamp on the coord prior to
379 * actual wrapping (which is per sample).
380 */
381 coord = lp_build_clamp(coord_bld, coord, coord_bld->zero, length_f);
382
383 coord = lp_build_sub(coord_bld, coord, half);
384
385 /* convert to int, compute lerp weight */
386 lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
387 coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
388 break;
389
390 case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
391 {
392 struct lp_build_context abs_coord_bld = bld->coord_bld;
393 abs_coord_bld.type.sign = FALSE;
394
395 if (bld->static_sampler_state->normalized_coords) {
396 /* mul by tex size */
397 coord = lp_build_mul(coord_bld, coord, length_f);
398 }
399 if (offset) {
400 offset = lp_build_int_to_float(coord_bld, offset);
401 coord = lp_build_add(coord_bld, coord, offset);
402 }
403
404 /* clamp to length max */
405 coord = lp_build_min_ext(coord_bld, coord, length_f,
406 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
407 if (!is_gather) {
408 /* subtract 0.5 */
409 coord = lp_build_sub(coord_bld, coord, half);
410 /* clamp to [0, length - 0.5] */
411 coord = lp_build_max(coord_bld, coord, coord_bld->zero);
412 /* convert to int, compute lerp weight */
413 lp_build_ifloor_fract(&abs_coord_bld, coord, &coord0, &weight);
414 coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
415 } else {
416 /*
417 * The non-gather path will end up with coords 0, 1 if coord was
418 * smaller than 0.5 (with corresponding weight 0.0 so it doesn't
419 * really matter what the second coord is). But for gather, we
420 * really need to end up with coords 0, 0.
421 */
422 coord = lp_build_max(coord_bld, coord, coord_bld->zero);
423 coord0 = lp_build_sub(coord_bld, coord, half);
424 coord1 = lp_build_add(coord_bld, coord, half);
425 /* Value ranges: [-0.5, length_f - 0.5] and [0.5, length_f + 0.5] */
426 coord0 = lp_build_itrunc(coord_bld, coord0);
427 coord1 = lp_build_itrunc(coord_bld, coord1);
428 weight = coord_bld->undef;
429 }
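/*
 * E.g. (for illustration) a scaled coord of 0.3 in the gather path:
 * coord0 = itrunc(-0.2) = 0 and coord1 = itrunc(0.8) = 0, i.e. both
 * coords end up as 0, as required (itrunc truncates towards zero,
 * unlike ifloor).
 */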
430 /* coord1 = min(coord1, length-1) */
431 coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
432 break;
433 }
434
435 case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
436 if (bld->static_sampler_state->normalized_coords) {
437 /* scale coord to length */
438 coord = lp_build_mul(coord_bld, coord, length_f);
439 }
440 if (offset) {
441 offset = lp_build_int_to_float(coord_bld, offset);
442 coord = lp_build_add(coord_bld, coord, offset);
443 }
444 /*
445 * We don't need any clamp. Technically, for very large (pos or neg)
446 * (or infinite) values, clamp against [-length, length] would be
447 * correct, but we don't need to guarantee any specific
448 * result for such coords (the ifloor will be undefined, but for modes
449 * requiring border all resulting coords are safe).
450 */
451 coord = lp_build_sub(coord_bld, coord, half);
452 /* convert to int, compute lerp weight */
453 lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
454 coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
455 break;
456
457 case PIPE_TEX_WRAP_MIRROR_REPEAT:
458 if (offset) {
459 offset = lp_build_int_to_float(coord_bld, offset);
460 offset = lp_build_div(coord_bld, offset, length_f);
461 coord = lp_build_add(coord_bld, coord, offset);
462 }
463 if (!is_gather) {
464 /* compute mirror function */
465 coord = lp_build_coord_mirror(bld, coord, TRUE);
466
467 /* scale coord to length */
468 coord = lp_build_mul(coord_bld, coord, length_f);
469 coord = lp_build_sub(coord_bld, coord, half);
470
471 /* convert to int, compute lerp weight */
472 lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
473 coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
474
475 /* coord0 = max(coord0, 0) */
476 coord0 = lp_build_max(int_coord_bld, coord0, int_coord_bld->zero);
477 /* coord1 = min(coord1, length-1) */
478 coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
479 } else {
480 /*
481 * This is pretty reasonable in the end; all the tests care
482 * about is nasty edge cases (scaled coords x.5, so the individual
483 * coords are actually integers, which is REALLY tricky to get right
484 * since it works differently both for negative numbers and for
485 * even/odd cases). But with enough magic it's not too complex
486 * after all.
487 * Maybe should try a bit arithmetic one though for POT textures...
488 */
489 LLVMValueRef isNeg;
490 /*
491 * Wrapping just once still works, even though it means we can
492 * get "wrong" sign due to performing mirror in the middle of the
493 * two coords (because this can only happen very near the odd/even
494 * edges, so both coords will actually end up as 0 or length - 1
495 * in the end).
496 * For GL4 gather with per-sample offsets we'd need to do the mirroring
497 * per coord too.
498 */
499 coord = lp_build_coord_mirror(bld, coord, FALSE);
500 coord = lp_build_mul(coord_bld, coord, length_f);
501
502 /*
503 * NaNs should be safe here, we'll do away with them with
504 * the ones' complement plus min.
505 */
506 coord0 = lp_build_sub(coord_bld, coord, half);
507 coord0 = lp_build_ifloor(coord_bld, coord0);
508 coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
509 /* ones' complement for neg numbers (mirror(negX) = X - 1) */
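/*
 * E.g. (for illustration) coord0 = -1 becomes ~(-1) = 0 and
 * coord0 = -2 becomes ~(-2) = 1, which is exactly the mirrored
 * coord for the negative side.
 */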
510 isNeg = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS,
511 coord0, int_coord_bld->zero);
512 coord0 = lp_build_xor(int_coord_bld, coord0, isNeg);
513 isNeg = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS,
514 coord1, int_coord_bld->zero);
515 coord1 = lp_build_xor(int_coord_bld, coord1, isNeg);
516 coord0 = lp_build_min(int_coord_bld, coord0, length_minus_one);
517 coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
518
519 weight = coord_bld->undef;
520 }
521 break;
522
523 case PIPE_TEX_WRAP_MIRROR_CLAMP:
524 if (bld->static_sampler_state->normalized_coords) {
525 /* scale coord to length */
526 coord = lp_build_mul(coord_bld, coord, length_f);
527 }
528 if (offset) {
529 offset = lp_build_int_to_float(coord_bld, offset);
530 coord = lp_build_add(coord_bld, coord, offset);
531 }
532 /*
533 * XXX: probably not correct for gather, albeit I'm not
534 * entirely sure as it's poorly specified. The wrapping looks
535 * correct according to the spec, which is written against gl 1.2.1;
536 * however, negative values will be swapped - gl re-specified
537 * wrapping in newer versions (no more pre-clamp except with
538 * GL_CLAMP).
539 */
540 coord = lp_build_abs(coord_bld, coord);
541
542 /* clamp to [0, length] */
543 coord = lp_build_min_ext(coord_bld, coord, length_f,
544 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
545
546 coord = lp_build_sub(coord_bld, coord, half);
547
548 /* convert to int, compute lerp weight */
549 lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
550 coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
551 break;
552
553 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
554 {
555 struct lp_build_context abs_coord_bld = bld->coord_bld;
556 abs_coord_bld.type.sign = FALSE;
557
558 if (bld->static_sampler_state->normalized_coords) {
559 /* scale coord to length */
560 coord = lp_build_mul(coord_bld, coord, length_f);
561 }
562 if (offset) {
563 offset = lp_build_int_to_float(coord_bld, offset);
564 coord = lp_build_add(coord_bld, coord, offset);
565 }
566 if (!is_gather) {
567 coord = lp_build_abs(coord_bld, coord);
568
569 /* clamp to length max */
570 coord = lp_build_min_ext(coord_bld, coord, length_f,
571 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
572 /* subtract 0.5 */
573 coord = lp_build_sub(coord_bld, coord, half);
574 /* clamp to [0, length - 0.5] */
575 coord = lp_build_max(coord_bld, coord, coord_bld->zero);
576
577 /* convert to int, compute lerp weight */
578 lp_build_ifloor_fract(&abs_coord_bld, coord, &coord0, &weight);
579 coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
580 /* coord1 = min(coord1, length-1) */
581 coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
582 } else {
583 /*
584 * The non-gather path will swap coord0/1 if coord was negative,
585 * which is ok for filtering since the filter weight matches
586 * accordingly. Also, if coord is close to zero, coord0/1 will
587 * be 0 and 1, instead of 0 and 0 (again ok due to filter
588 * weight being 0.0). Both issues need to be fixed for gather.
589 */
590 LLVMValueRef isNeg;
591
592 /*
593 * Actually wanted to cheat here and use:
594 * coord1 = lp_build_iround(coord_bld, coord);
595 * but it's not good enough for some tests (even piglit
596 * textureGather is set up in a way so the coords are always
597 * .5, that is right at the crossover points).
598 * So do ordinary sub/floor, then do ones' complement
599 * for negative numbers.
600 * (Note we can't just do sub|add/abs/itrunc per coord either -
601 * because the spec demands that mirror(3.0) = 3 but
602 * mirror(-3.0) = 2.)
603 */
604 coord = lp_build_sub(coord_bld, coord, half);
605 coord0 = lp_build_ifloor(coord_bld, coord);
606 coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
607 isNeg = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, coord0,
608 int_coord_bld->zero);
609 coord0 = lp_build_xor(int_coord_bld, isNeg, coord0);
610 coord0 = lp_build_min(int_coord_bld, coord0, length_minus_one);
611
612 isNeg = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, coord1,
613 int_coord_bld->zero);
614 coord1 = lp_build_xor(int_coord_bld, isNeg, coord1);
615 coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
616
617 weight = coord_bld->undef;
618 }
619 }
620 break;
621
622 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
623 {
624 if (bld->static_sampler_state->normalized_coords) {
625 /* scale coord to length */
626 coord = lp_build_mul(coord_bld, coord, length_f);
627 }
628 if (offset) {
629 offset = lp_build_int_to_float(coord_bld, offset);
630 coord = lp_build_add(coord_bld, coord, offset);
631 }
632 /*
633 * XXX: probably not correct for gather due to swapped
634 * order if coord is negative (same rationale as for
635 * MIRROR_CLAMP).
636 */
637 coord = lp_build_abs(coord_bld, coord);
638
639 /*
640 * We don't need any clamp. Technically, for very large
641 * (or infinite) values, clamp against length would be
642 * correct, but we don't need to guarantee any specific
643 * result for such coords (the ifloor will be undefined, but
644 * for modes requiring border all resulting coords are safe).
645 */
646 coord = lp_build_sub(coord_bld, coord, half);
647
648 /* convert to int, compute lerp weight */
649 lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
650 coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
651 }
652 break;
653
654 default:
655 assert(0);
656 coord0 = NULL;
657 coord1 = NULL;
658 weight = NULL;
659 }
660
661 *x0_out = coord0;
662 *x1_out = coord1;
663 *weight_out = weight;
664 }
665
666
667 /**
668 * Build LLVM code for texture wrap mode for nearest filtering.
669 * \param coord the incoming texcoord (nominally in [0,1])
670 * \param length the texture size along one dimension, as int vector
671 * \param length_f the texture size along one dimension, as float vector
672 * \param offset texel offset along one dimension (as int vector)
673 * \param is_pot if TRUE, length is a power of two
674 * \param wrap_mode one of PIPE_TEX_WRAP_x
675 */
676 static LLVMValueRef
677 lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld,
678 LLVMValueRef coord,
679 LLVMValueRef length,
680 LLVMValueRef length_f,
681 LLVMValueRef offset,
682 boolean is_pot,
683 unsigned wrap_mode)
684 {
685 struct lp_build_context *coord_bld = &bld->coord_bld;
686 struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
687 LLVMBuilderRef builder = bld->gallivm->builder;
688 LLVMValueRef length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);
689 LLVMValueRef icoord;
690
691 switch(wrap_mode) {
692 case PIPE_TEX_WRAP_REPEAT:
693 if (is_pot) {
694 coord = lp_build_mul(coord_bld, coord, length_f);
695 icoord = lp_build_ifloor(coord_bld, coord);
696 if (offset) {
697 icoord = lp_build_add(int_coord_bld, icoord, offset);
698 }
699 icoord = LLVMBuildAnd(builder, icoord, length_minus_one, "");
700 }
701 else {
702 if (offset) {
703 offset = lp_build_int_to_float(coord_bld, offset);
704 offset = lp_build_div(coord_bld, offset, length_f);
705 coord = lp_build_add(coord_bld, coord, offset);
706 }
707 /* take fraction, unnormalize */
708 coord = lp_build_fract_safe(coord_bld, coord);
709 coord = lp_build_mul(coord_bld, coord, length_f);
710 icoord = lp_build_itrunc(coord_bld, coord);
711 }
712 break;
713
714 case PIPE_TEX_WRAP_CLAMP:
715 case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
716 if (bld->static_sampler_state->normalized_coords) {
717 /* scale coord to length */
718 coord = lp_build_mul(coord_bld, coord, length_f);
719 }
720
721 if (offset) {
722 offset = lp_build_int_to_float(coord_bld, offset);
723 coord = lp_build_add(coord_bld, coord, offset);
724 }
725 /* floor */
726 /* use itrunc instead since we clamp to 0 anyway */
727 icoord = lp_build_itrunc(coord_bld, coord);
728
729 /* clamp to [0, length - 1]. */
730 icoord = lp_build_clamp(int_coord_bld, icoord, int_coord_bld->zero,
731 length_minus_one);
732 break;
733
734 case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
735 if (bld->static_sampler_state->normalized_coords) {
736 /* scale coord to length */
737 coord = lp_build_mul(coord_bld, coord, length_f);
738 }
739 /* no clamp necessary, border masking will handle this */
740 icoord = lp_build_ifloor(coord_bld, coord);
741 if (offset) {
742 icoord = lp_build_add(int_coord_bld, icoord, offset);
743 }
744 break;
745
746 case PIPE_TEX_WRAP_MIRROR_REPEAT:
747 if (offset) {
748 offset = lp_build_int_to_float(coord_bld, offset);
749 offset = lp_build_div(coord_bld, offset, length_f);
750 coord = lp_build_add(coord_bld, coord, offset);
751 }
752 /* compute mirror function */
753 coord = lp_build_coord_mirror(bld, coord, TRUE);
754
755 /* scale coord to length */
756 assert(bld->static_sampler_state->normalized_coords);
757 coord = lp_build_mul(coord_bld, coord, length_f);
758
759 /* itrunc == ifloor here */
760 icoord = lp_build_itrunc(coord_bld, coord);
761
762 /* clamp to [0, length - 1] */
763 icoord = lp_build_min(int_coord_bld, icoord, length_minus_one);
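/*
 * E.g. (for illustration) length = 4, coord = 1.3: the mirror yields
 * |2*(0.65 - round(0.65))| = 0.7, scaled to 2.8, so icoord = 2.
 */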
764 break;
765
766 case PIPE_TEX_WRAP_MIRROR_CLAMP:
767 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
768 if (bld->static_sampler_state->normalized_coords) {
769 /* scale coord to length */
770 coord = lp_build_mul(coord_bld, coord, length_f);
771 }
772 if (offset) {
773 offset = lp_build_int_to_float(coord_bld, offset);
774 coord = lp_build_add(coord_bld, coord, offset);
775 }
776 coord = lp_build_abs(coord_bld, coord);
777
778 /* itrunc == ifloor here */
779 icoord = lp_build_itrunc(coord_bld, coord);
780 /*
781 * Use unsigned min due to possible undef values (NaNs, overflow)
782 */
783 {
784 struct lp_build_context abs_coord_bld = *int_coord_bld;
785 abs_coord_bld.type.sign = FALSE;
786 /* clamp to [0, length - 1] */
787 icoord = lp_build_min(&abs_coord_bld, icoord, length_minus_one);
788 }
789 break;
790
791 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
792 if (bld->static_sampler_state->normalized_coords) {
793 /* scale coord to length */
794 coord = lp_build_mul(coord_bld, coord, length_f);
795 }
796 if (offset) {
797 offset = lp_build_int_to_float(coord_bld, offset);
798 coord = lp_build_add(coord_bld, coord, offset);
799 }
800 coord = lp_build_abs(coord_bld, coord);
801
802 /* itrunc == ifloor here */
803 icoord = lp_build_itrunc(coord_bld, coord);
804 break;
805
806 default:
807 assert(0);
808 icoord = NULL;
809 }
810
811 return icoord;
812 }
813
814
815 /**
816 * Do shadow test/comparison.
817 * \param p shadow ref value
818 * \param texel the texel to compare against
819 */
820 static LLVMValueRef
821 lp_build_sample_comparefunc(struct lp_build_sample_context *bld,
822 LLVMValueRef p,
823 LLVMValueRef texel)
824 {
825 struct lp_build_context *texel_bld = &bld->texel_bld;
826 LLVMValueRef res;
827
828 if (0) {
829 //lp_build_print_value(bld->gallivm, "shadow cmp coord", p);
830 lp_build_print_value(bld->gallivm, "shadow cmp texel", texel);
831 }
832
833 /* result = (p FUNC texel) ? 1 : 0 */
834 /*
835 * honor d3d10 floating point rules here, which state that comparisons
836 * are ordered except NOT_EQUAL which is unordered.
837 */
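/*
 * I.e. if either operand is NaN, the comparison yields false, except
 * for NOT_EQUAL which yields true.
 */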
838 if (bld->static_sampler_state->compare_func != PIPE_FUNC_NOTEQUAL) {
839 res = lp_build_cmp_ordered(texel_bld, bld->static_sampler_state->compare_func,
840 p, texel);
841 }
842 else {
843 res = lp_build_cmp(texel_bld, bld->static_sampler_state->compare_func,
844 p, texel);
845 }
846 return res;
847 }
848
849
850 /**
851 * Generate code to sample a mipmap level with nearest filtering.
852 * If sampling a cube texture, r = cube face in [0,5].
853 */
854 static void
855 lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
856 LLVMValueRef size,
857 LLVMValueRef row_stride_vec,
858 LLVMValueRef img_stride_vec,
859 LLVMValueRef data_ptr,
860 LLVMValueRef mipoffsets,
861 const LLVMValueRef *coords,
862 const LLVMValueRef *offsets,
863 LLVMValueRef colors_out[4])
864 {
865 const unsigned dims = bld->dims;
866 LLVMValueRef width_vec;
867 LLVMValueRef height_vec;
868 LLVMValueRef depth_vec;
869 LLVMValueRef flt_size;
870 LLVMValueRef flt_width_vec;
871 LLVMValueRef flt_height_vec;
872 LLVMValueRef flt_depth_vec;
873 LLVMValueRef x, y = NULL, z = NULL;
874
875 lp_build_extract_image_sizes(bld,
876 &bld->int_size_bld,
877 bld->int_coord_type,
878 size,
879 &width_vec, &height_vec, &depth_vec);
880
881 flt_size = lp_build_int_to_float(&bld->float_size_bld, size);
882
883 lp_build_extract_image_sizes(bld,
884 &bld->float_size_bld,
885 bld->coord_type,
886 flt_size,
887 &flt_width_vec, &flt_height_vec, &flt_depth_vec);
888
889 /*
890 * Compute integer texcoords.
891 */
892 x = lp_build_sample_wrap_nearest(bld, coords[0], width_vec,
893 flt_width_vec, offsets[0],
894 bld->static_texture_state->pot_width,
895 bld->static_sampler_state->wrap_s);
896 lp_build_name(x, "tex.x.wrapped");
897
898 if (dims >= 2) {
899 y = lp_build_sample_wrap_nearest(bld, coords[1], height_vec,
900 flt_height_vec, offsets[1],
901 bld->static_texture_state->pot_height,
902 bld->static_sampler_state->wrap_t);
903 lp_build_name(y, "tex.y.wrapped");
904
905 if (dims == 3) {
906 z = lp_build_sample_wrap_nearest(bld, coords[2], depth_vec,
907 flt_depth_vec, offsets[2],
908 bld->static_texture_state->pot_depth,
909 bld->static_sampler_state->wrap_r);
910 lp_build_name(z, "tex.z.wrapped");
911 }
912 }
913 if (has_layer_coord(bld->static_texture_state->target)) {
914 if (bld->static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) {
915 /* add cube layer to face */
916 z = lp_build_add(&bld->int_coord_bld, coords[2], coords[3]);
917 }
918 else {
919 z = coords[2];
920 }
921 lp_build_name(z, "tex.z.layer");
922 }
923
924 /*
925 * Get texture colors.
926 */
927 lp_build_sample_texel_soa(bld,
928 width_vec, height_vec, depth_vec,
929 x, y, z,
930 row_stride_vec, img_stride_vec,
931 data_ptr, mipoffsets, colors_out);
932
933 if (bld->static_sampler_state->compare_mode != PIPE_TEX_COMPARE_NONE) {
934 LLVMValueRef cmpval;
935 cmpval = lp_build_sample_comparefunc(bld, coords[4], colors_out[0]);
936 /* this is really just an AND of 1.0 with cmpval, but llvm is clever enough */
937 colors_out[0] = lp_build_select(&bld->texel_bld, cmpval,
938 bld->texel_bld.one, bld->texel_bld.zero);
939 colors_out[1] = colors_out[2] = colors_out[3] = colors_out[0];
940 }
941
942 }
943
944
945 /**
946 * Like a lerp, but inputs are 0/~0 masks, so can simplify slightly.
947 */
948 static LLVMValueRef
949 lp_build_masklerp(struct lp_build_context *bld,
950 LLVMValueRef weight,
951 LLVMValueRef mask0,
952 LLVMValueRef mask1)
953 {
954 struct gallivm_state *gallivm = bld->gallivm;
955 LLVMBuilderRef builder = gallivm->builder;
956 LLVMValueRef weight2;
957
958 weight2 = lp_build_sub(bld, bld->one, weight);
959 weight = LLVMBuildBitCast(builder, weight,
960 lp_build_int_vec_type(gallivm, bld->type), "");
961 weight2 = LLVMBuildBitCast(builder, weight2,
962 lp_build_int_vec_type(gallivm, bld->type), "");
963 weight = LLVMBuildAnd(builder, weight, mask1, "");
964 weight2 = LLVMBuildAnd(builder, weight2, mask0, "");
965 weight = LLVMBuildBitCast(builder, weight, bld->vec_type, "");
966 weight2 = LLVMBuildBitCast(builder, weight2, bld->vec_type, "");
967 return lp_build_add(bld, weight, weight2);
968 }
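/*
 * In effect (for illustration) this computes
 * (mask1 ? weight : 0) + (mask0 ? 1 - weight : 0) - the bitwise AND of
 * a float with an all-ones mask just passes the float through, AND with
 * zero gives 0.0.
 */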
969
970 /**
971 * Like a 2d lerp, but inputs are 0/~0 masks, so can simplify slightly.
972 */
973 static LLVMValueRef
974 lp_build_masklerp2d(struct lp_build_context *bld,
975 LLVMValueRef weight0,
976 LLVMValueRef weight1,
977 LLVMValueRef mask00,
978 LLVMValueRef mask01,
979 LLVMValueRef mask10,
980 LLVMValueRef mask11)
981 {
982 LLVMValueRef val0 = lp_build_masklerp(bld, weight0, mask00, mask01);
983 LLVMValueRef val1 = lp_build_masklerp(bld, weight0, mask10, mask11);
984 return lp_build_lerp(bld, weight1, val0, val1, 0);
985 }
986
987 /*
988 * this is somewhat excessive code for something OpenGL just recommends
989 * but does not require.
990 */
991 #define ACCURATE_CUBE_CORNERS 1
992
993 /**
994 * Generate code to sample a mipmap level with linear filtering.
995 * If sampling a cube texture, r = cube face in [0,5].
996 * If linear_mask is present, only pixels having their mask set
997 * will receive linear filtering; the rest will use nearest.
998 */
999 static void
1000 lp_build_sample_image_linear(struct lp_build_sample_context *bld,
1001 boolean is_gather,
1002 LLVMValueRef size,
1003 LLVMValueRef linear_mask,
1004 LLVMValueRef row_stride_vec,
1005 LLVMValueRef img_stride_vec,
1006 LLVMValueRef data_ptr,
1007 LLVMValueRef mipoffsets,
1008 const LLVMValueRef *coords,
1009 const LLVMValueRef *offsets,
1010 LLVMValueRef colors_out[4])
1011 {
1012 LLVMBuilderRef builder = bld->gallivm->builder;
1013 struct lp_build_context *ivec_bld = &bld->int_coord_bld;
1014 struct lp_build_context *coord_bld = &bld->coord_bld;
1015 struct lp_build_context *texel_bld = &bld->texel_bld;
1016 const unsigned dims = bld->dims;
1017 LLVMValueRef width_vec;
1018 LLVMValueRef height_vec;
1019 LLVMValueRef depth_vec;
1020 LLVMValueRef flt_size;
1021 LLVMValueRef flt_width_vec;
1022 LLVMValueRef flt_height_vec;
1023 LLVMValueRef flt_depth_vec;
1024 LLVMValueRef fall_off[4], have_corners;
1025 LLVMValueRef z1 = NULL;
1026 LLVMValueRef z00 = NULL, z01 = NULL, z10 = NULL, z11 = NULL;
1027 LLVMValueRef x00 = NULL, x01 = NULL, x10 = NULL, x11 = NULL;
1028 LLVMValueRef y00 = NULL, y01 = NULL, y10 = NULL, y11 = NULL;
1029 LLVMValueRef s_fpart, t_fpart = NULL, r_fpart = NULL;
1030 LLVMValueRef xs[4], ys[4], zs[4];
1031 LLVMValueRef neighbors[2][2][4];
1032 int chan, texel_index;
1033 boolean seamless_cube_filter, accurate_cube_corners;
1034 unsigned chan_swiz = bld->static_texture_state->swizzle_r;
1035
1036 if (is_gather) {
1037 switch (bld->gather_comp) {
1038 case 0: chan_swiz = bld->static_texture_state->swizzle_r; break;
1039 case 1: chan_swiz = bld->static_texture_state->swizzle_g; break;
1040 case 2: chan_swiz = bld->static_texture_state->swizzle_b; break;
1041 case 3: chan_swiz = bld->static_texture_state->swizzle_a; break;
1042 default:
1043 break;
1044 }
1045 }
1046
1047 seamless_cube_filter = (bld->static_texture_state->target == PIPE_TEXTURE_CUBE ||
1048 bld->static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) &&
1049 bld->static_sampler_state->seamless_cube_map;
1050
1051 /*
1052 * Disable accurate cube corners for integer textures, which should only
1053 * get here in the gather path.
1054 */
1055 accurate_cube_corners = ACCURATE_CUBE_CORNERS && seamless_cube_filter &&
1056 !util_format_is_pure_integer(bld->static_texture_state->format);
1057
1058 lp_build_extract_image_sizes(bld,
1059 &bld->int_size_bld,
1060 bld->int_coord_type,
1061 size,
1062 &width_vec, &height_vec, &depth_vec);
1063
1064 flt_size = lp_build_int_to_float(&bld->float_size_bld, size);
1065
1066 lp_build_extract_image_sizes(bld,
1067 &bld->float_size_bld,
1068 bld->coord_type,
1069 flt_size,
1070 &flt_width_vec, &flt_height_vec, &flt_depth_vec);
1071
1072 /*
1073 * Compute integer texcoords.
1074 */
1075
1076 if (!seamless_cube_filter) {
1077 lp_build_sample_wrap_linear(bld, is_gather, coords[0], width_vec,
1078 flt_width_vec, offsets[0],
1079 bld->static_texture_state->pot_width,
1080 bld->static_sampler_state->wrap_s,
1081 &x00, &x01, &s_fpart);
1082 lp_build_name(x00, "tex.x0.wrapped");
1083 lp_build_name(x01, "tex.x1.wrapped");
1084 x10 = x00;
1085 x11 = x01;
1086
1087 if (dims >= 2) {
1088 lp_build_sample_wrap_linear(bld, is_gather, coords[1], height_vec,
1089 flt_height_vec, offsets[1],
1090 bld->static_texture_state->pot_height,
1091 bld->static_sampler_state->wrap_t,
1092 &y00, &y10, &t_fpart);
1093 lp_build_name(y00, "tex.y0.wrapped");
1094 lp_build_name(y10, "tex.y1.wrapped");
1095 y01 = y00;
1096 y11 = y10;
1097
1098 if (dims == 3) {
1099 lp_build_sample_wrap_linear(bld, is_gather, coords[2], depth_vec,
1100 flt_depth_vec, offsets[2],
1101 bld->static_texture_state->pot_depth,
1102 bld->static_sampler_state->wrap_r,
1103 &z00, &z1, &r_fpart);
1104 z01 = z10 = z11 = z00;
1105 lp_build_name(z00, "tex.z0.wrapped");
1106 lp_build_name(z1, "tex.z1.wrapped");
1107 }
1108 }
1109 if (has_layer_coord(bld->static_texture_state->target)) {
1110 if (bld->static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) {
1111 /* add cube layer to face */
1112 z00 = z01 = z10 = z11 = z1 =
1113 lp_build_add(&bld->int_coord_bld, coords[2], coords[3]);
1114 }
1115 else {
1116 z00 = z01 = z10 = z11 = z1 = coords[2]; /* cube face or layer */
1117 }
1118 lp_build_name(z00, "tex.z0.layer");
1119 lp_build_name(z1, "tex.z1.layer");
1120 }
1121 }
1122 else {
1123 struct lp_build_if_state edge_if;
1124 LLVMTypeRef int1t;
1125 LLVMValueRef new_faces[4], new_xcoords[4][2], new_ycoords[4][2];
1126 LLVMValueRef coord0, coord1, have_edge, have_corner;
1127 LLVMValueRef fall_off_ym_notxm, fall_off_ym_notxp, fall_off_x, fall_off_y;
1128 LLVMValueRef fall_off_yp_notxm, fall_off_yp_notxp;
1129 LLVMValueRef x0, x1, y0, y1, y0_clamped, y1_clamped;
1130 LLVMValueRef face = coords[2];
1131 LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5f);
1132 LLVMValueRef length_minus_one = lp_build_sub(ivec_bld, width_vec, ivec_bld->one);
1133 /* XXX drop height calcs. Could (should) do this without seamless filtering too */
1134 height_vec = width_vec;
1135 flt_height_vec = flt_width_vec;
1136
1137 /* XXX the overflow logic is actually sort of duplicated with trilinear,
1138 * since an overflow in one mip should also have a corresponding overflow
1139 * in another.
1140 */
1141 /* should always have normalized coords, and offsets are undefined */
1142 assert(bld->static_sampler_state->normalized_coords);
1143 /*
1144 * The coords should all be in [0,1]; however, we can have NaNs,
1145 * which will wreak havoc. In particular the y1_clamped value below
1146 * can be -INT_MAX (on x86) and be propagated right through (probably
1147 * other values might be bogus in the end too).
1148 * So kill off the NaNs here.
1149 */
1150 coord0 = lp_build_max_ext(coord_bld, coords[0], coord_bld->zero,
1151 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1152 coord0 = lp_build_mul(coord_bld, coord0, flt_width_vec);
1153 /* instead of clamp, build mask if overflowed */
1154 coord0 = lp_build_sub(coord_bld, coord0, half);
1155 /* convert to int, compute lerp weight */
1156 /* not ideal with AVX (and no AVX2) */
1157 lp_build_ifloor_fract(coord_bld, coord0, &x0, &s_fpart);
1158 x1 = lp_build_add(ivec_bld, x0, ivec_bld->one);
1159 coord1 = lp_build_max_ext(coord_bld, coords[1], coord_bld->zero,
1160 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1161 coord1 = lp_build_mul(coord_bld, coord1, flt_height_vec);
1162 coord1 = lp_build_sub(coord_bld, coord1, half);
1163 lp_build_ifloor_fract(coord_bld, coord1, &y0, &t_fpart);
1164 y1 = lp_build_add(ivec_bld, y0, ivec_bld->one);
1165
1166 fall_off[0] = lp_build_cmp(ivec_bld, PIPE_FUNC_LESS, x0, ivec_bld->zero);
1167 fall_off[1] = lp_build_cmp(ivec_bld, PIPE_FUNC_GREATER, x1, length_minus_one);
1168 fall_off[2] = lp_build_cmp(ivec_bld, PIPE_FUNC_LESS, y0, ivec_bld->zero);
1169 fall_off[3] = lp_build_cmp(ivec_bld, PIPE_FUNC_GREATER, y1, length_minus_one);
1170
1171 fall_off_x = lp_build_or(ivec_bld, fall_off[0], fall_off[1]);
1172 fall_off_y = lp_build_or(ivec_bld, fall_off[2], fall_off[3]);
1173 have_edge = lp_build_or(ivec_bld, fall_off_x, fall_off_y);
1174 have_edge = lp_build_any_true_range(ivec_bld, ivec_bld->type.length, have_edge);
1175
1176 /* needed for accurate corner filtering branch later, rely on 0 init */
1177 int1t = LLVMInt1TypeInContext(bld->gallivm->context);
1178 have_corners = lp_build_alloca(bld->gallivm, int1t, "have_corner");
1179
1180 for (texel_index = 0; texel_index < 4; texel_index++) {
1181 xs[texel_index] = lp_build_alloca(bld->gallivm, ivec_bld->vec_type, "xs");
1182 ys[texel_index] = lp_build_alloca(bld->gallivm, ivec_bld->vec_type, "ys");
1183 zs[texel_index] = lp_build_alloca(bld->gallivm, ivec_bld->vec_type, "zs");
1184 }
1185
1186 lp_build_if(&edge_if, bld->gallivm, have_edge);
1187
1188 have_corner = lp_build_and(ivec_bld, fall_off_x, fall_off_y);
1189 have_corner = lp_build_any_true_range(ivec_bld, ivec_bld->type.length, have_corner);
1190 LLVMBuildStore(builder, have_corner, have_corners);
1191
1192 /*
1193 * Need to feed clamped values here for cheap corner handling,
1194 * but only for the y coord (when falling off both edges we only
1195 * follow the x fall-off path) - this should be sufficient.
1196 */
1197 y0_clamped = lp_build_max(ivec_bld, y0, ivec_bld->zero);
1198 y1_clamped = lp_build_min(ivec_bld, y1, length_minus_one);
1199
1200 /*
1201 * Get all possible new coords.
1202 */
1203 lp_build_cube_new_coords(ivec_bld, face,
1204 x0, x1, y0_clamped, y1_clamped,
1205 length_minus_one,
1206 new_faces, new_xcoords, new_ycoords);
1207
1208 /* handle fall off x-, x+ direction */
1209 /* determine new coords, face (the two fall_off vars cannot both be true at the same time) */
1210 x00 = lp_build_select(ivec_bld, fall_off[0], new_xcoords[0][0], x0);
1211 y00 = lp_build_select(ivec_bld, fall_off[0], new_ycoords[0][0], y0_clamped);
1212 x10 = lp_build_select(ivec_bld, fall_off[0], new_xcoords[0][1], x0);
1213 y10 = lp_build_select(ivec_bld, fall_off[0], new_ycoords[0][1], y1_clamped);
1214 x01 = lp_build_select(ivec_bld, fall_off[1], new_xcoords[1][0], x1);
1215 y01 = lp_build_select(ivec_bld, fall_off[1], new_ycoords[1][0], y0_clamped);
1216 x11 = lp_build_select(ivec_bld, fall_off[1], new_xcoords[1][1], x1);
1217 y11 = lp_build_select(ivec_bld, fall_off[1], new_ycoords[1][1], y1_clamped);
1218
1219 z00 = z10 = lp_build_select(ivec_bld, fall_off[0], new_faces[0], face);
1220 z01 = z11 = lp_build_select(ivec_bld, fall_off[1], new_faces[1], face);
1221
1222 /* handle fall off y-, y+ direction */
1223 /*
1224 * Cheap corner logic: just hack up things so a texel doesn't fall
1225 * off both sides (which means filter weights will be wrong but we'll only
1226 * use valid texels in the filter).
1227 * This means however (y) coords must additionally be clamped (see above).
1228 * This corner handling should be fully OpenGL (but not d3d10) compliant.
1229 */
1230 fall_off_ym_notxm = lp_build_andnot(ivec_bld, fall_off[2], fall_off[0]);
1231 fall_off_ym_notxp = lp_build_andnot(ivec_bld, fall_off[2], fall_off[1]);
1232 fall_off_yp_notxm = lp_build_andnot(ivec_bld, fall_off[3], fall_off[0]);
1233 fall_off_yp_notxp = lp_build_andnot(ivec_bld, fall_off[3], fall_off[1]);
1234
1235 x00 = lp_build_select(ivec_bld, fall_off_ym_notxm, new_xcoords[2][0], x00);
1236 y00 = lp_build_select(ivec_bld, fall_off_ym_notxm, new_ycoords[2][0], y00);
1237 x01 = lp_build_select(ivec_bld, fall_off_ym_notxp, new_xcoords[2][1], x01);
1238 y01 = lp_build_select(ivec_bld, fall_off_ym_notxp, new_ycoords[2][1], y01);
1239 x10 = lp_build_select(ivec_bld, fall_off_yp_notxm, new_xcoords[3][0], x10);
1240 y10 = lp_build_select(ivec_bld, fall_off_yp_notxm, new_ycoords[3][0], y10);
1241 x11 = lp_build_select(ivec_bld, fall_off_yp_notxp, new_xcoords[3][1], x11);
1242 y11 = lp_build_select(ivec_bld, fall_off_yp_notxp, new_ycoords[3][1], y11);
1243
1244 z00 = lp_build_select(ivec_bld, fall_off_ym_notxm, new_faces[2], z00);
1245 z01 = lp_build_select(ivec_bld, fall_off_ym_notxp, new_faces[2], z01);
1246 z10 = lp_build_select(ivec_bld, fall_off_yp_notxm, new_faces[3], z10);
1247 z11 = lp_build_select(ivec_bld, fall_off_yp_notxp, new_faces[3], z11);
1248
1249 if (bld->static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) {
1250 /* now can add cube layer to face (per sample) */
1251 z00 = lp_build_add(ivec_bld, z00, coords[3]);
1252 z01 = lp_build_add(ivec_bld, z01, coords[3]);
1253 z10 = lp_build_add(ivec_bld, z10, coords[3]);
1254 z11 = lp_build_add(ivec_bld, z11, coords[3]);
1255 }
1256
1257 LLVMBuildStore(builder, x00, xs[0]);
1258 LLVMBuildStore(builder, x01, xs[1]);
1259 LLVMBuildStore(builder, x10, xs[2]);
1260 LLVMBuildStore(builder, x11, xs[3]);
1261 LLVMBuildStore(builder, y00, ys[0]);
1262 LLVMBuildStore(builder, y01, ys[1]);
1263 LLVMBuildStore(builder, y10, ys[2]);
1264 LLVMBuildStore(builder, y11, ys[3]);
1265 LLVMBuildStore(builder, z00, zs[0]);
1266 LLVMBuildStore(builder, z01, zs[1]);
1267 LLVMBuildStore(builder, z10, zs[2]);
1268 LLVMBuildStore(builder, z11, zs[3]);
1269
1270 lp_build_else(&edge_if);
1271
1272 LLVMBuildStore(builder, x0, xs[0]);
1273 LLVMBuildStore(builder, x1, xs[1]);
1274 LLVMBuildStore(builder, x0, xs[2]);
1275 LLVMBuildStore(builder, x1, xs[3]);
1276 LLVMBuildStore(builder, y0, ys[0]);
1277 LLVMBuildStore(builder, y0, ys[1]);
1278 LLVMBuildStore(builder, y1, ys[2]);
1279 LLVMBuildStore(builder, y1, ys[3]);
1280 if (bld->static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) {
1281 LLVMValueRef cube_layer = lp_build_add(ivec_bld, face, coords[3]);
1282 LLVMBuildStore(builder, cube_layer, zs[0]);
1283 LLVMBuildStore(builder, cube_layer, zs[1]);
1284 LLVMBuildStore(builder, cube_layer, zs[2]);
1285 LLVMBuildStore(builder, cube_layer, zs[3]);
1286 }
1287 else {
1288 LLVMBuildStore(builder, face, zs[0]);
1289 LLVMBuildStore(builder, face, zs[1]);
1290 LLVMBuildStore(builder, face, zs[2]);
1291 LLVMBuildStore(builder, face, zs[3]);
1292 }
1293
1294 lp_build_endif(&edge_if);
1295
1296 x00 = LLVMBuildLoad(builder, xs[0], "");
1297 x01 = LLVMBuildLoad(builder, xs[1], "");
1298 x10 = LLVMBuildLoad(builder, xs[2], "");
1299 x11 = LLVMBuildLoad(builder, xs[3], "");
1300 y00 = LLVMBuildLoad(builder, ys[0], "");
1301 y01 = LLVMBuildLoad(builder, ys[1], "");
1302 y10 = LLVMBuildLoad(builder, ys[2], "");
1303 y11 = LLVMBuildLoad(builder, ys[3], "");
1304 z00 = LLVMBuildLoad(builder, zs[0], "");
1305 z01 = LLVMBuildLoad(builder, zs[1], "");
1306 z10 = LLVMBuildLoad(builder, zs[2], "");
1307 z11 = LLVMBuildLoad(builder, zs[3], "");
1308 }
1309
1310 if (linear_mask) {
1311 /*
1312 * Whack filter weights into place. Whatever texel had more weight is
1313 * the one which should have been selected by nearest filtering, hence
1314 * just use 100% weight for it.
1315 */
1316 struct lp_build_context *c_bld = &bld->coord_bld;
1317 LLVMValueRef w1_mask, w1_weight;
1318 LLVMValueRef half = lp_build_const_vec(bld->gallivm, c_bld->type, 0.5f);
1319
1320 w1_mask = lp_build_cmp(c_bld, PIPE_FUNC_GREATER, s_fpart, half);
1321 /* this select is really just an "and" */
1322 w1_weight = lp_build_select(c_bld, w1_mask, c_bld->one, c_bld->zero);
1323 s_fpart = lp_build_select(c_bld, linear_mask, s_fpart, w1_weight);
1324 if (dims >= 2) {
1325 w1_mask = lp_build_cmp(c_bld, PIPE_FUNC_GREATER, t_fpart, half);
1326 w1_weight = lp_build_select(c_bld, w1_mask, c_bld->one, c_bld->zero);
1327 t_fpart = lp_build_select(c_bld, linear_mask, t_fpart, w1_weight);
1328 if (dims == 3) {
1329 w1_mask = lp_build_cmp(c_bld, PIPE_FUNC_GREATER, r_fpart, half);
1330 w1_weight = lp_build_select(c_bld, w1_mask, c_bld->one, c_bld->zero);
1331 r_fpart = lp_build_select(c_bld, linear_mask, r_fpart, w1_weight);
1332 }
1333 }
1334 }
1335
1336 /*
1337 * Get texture colors.
1338 */
1339 /* get x0/x1 texels */
1340 lp_build_sample_texel_soa(bld,
1341 width_vec, height_vec, depth_vec,
1342 x00, y00, z00,
1343 row_stride_vec, img_stride_vec,
1344 data_ptr, mipoffsets, neighbors[0][0]);
1345 lp_build_sample_texel_soa(bld,
1346 width_vec, height_vec, depth_vec,
1347 x01, y01, z01,
1348 row_stride_vec, img_stride_vec,
1349 data_ptr, mipoffsets, neighbors[0][1]);
1350
1351 if (dims == 1) {
1352 assert(!is_gather);
1353 if (bld->static_sampler_state->compare_mode == PIPE_TEX_COMPARE_NONE) {
1354 /* Interpolate two samples from 1D image to produce one color */
1355 for (chan = 0; chan < 4; chan++) {
1356 colors_out[chan] = lp_build_lerp(texel_bld, s_fpart,
1357 neighbors[0][0][chan],
1358 neighbors[0][1][chan],
1359 0);
1360 }
1361 }
1362 else {
1363 LLVMValueRef cmpval0, cmpval1;
1364 cmpval0 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][0][0]);
1365 cmpval1 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][1][0]);
1366 /* simplified lerp, AND mask with weight and add */
1367 colors_out[0] = lp_build_masklerp(texel_bld, s_fpart,
1368 cmpval0, cmpval1);
1369 colors_out[1] = colors_out[2] = colors_out[3] = colors_out[0];
1370 }
1371 }
1372 else {
1373 /* 2D/3D texture */
1374 struct lp_build_if_state corner_if;
1375 LLVMValueRef colors0[4], colorss[4];
1376
1377 /* get x0/x1 texels at y1 */
1378 lp_build_sample_texel_soa(bld,
1379 width_vec, height_vec, depth_vec,
1380 x10, y10, z10,
1381 row_stride_vec, img_stride_vec,
1382 data_ptr, mipoffsets, neighbors[1][0]);
1383 lp_build_sample_texel_soa(bld,
1384 width_vec, height_vec, depth_vec,
1385 x11, y11, z11,
1386 row_stride_vec, img_stride_vec,
1387 data_ptr, mipoffsets, neighbors[1][1]);
1388
1389 /*
1390 * To avoid having to duplicate the linear_mask / fetch code, use
1391 * another branch here (conditioned on corners, though the edge
1392 * condition would work as well).
1393 */
1394 if (accurate_cube_corners) {
1395 LLVMValueRef c00, c01, c10, c11, c00f, c01f, c10f, c11f;
1396 LLVMValueRef have_corner, one_third;
1397
1398 colorss[0] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs0");
1399 colorss[1] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs1");
1400 colorss[2] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs2");
1401 colorss[3] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs3");
1402
1403 have_corner = LLVMBuildLoad(builder, have_corners, "");
1404
1405 lp_build_if(&corner_if, bld->gallivm, have_corner);
1406
1407 one_third = lp_build_const_vec(bld->gallivm, coord_bld->type,
1408 1.0f/3.0f);
1409
1410 /* find corner */
1411 c00 = lp_build_and(ivec_bld, fall_off[0], fall_off[2]);
1412 c00f = LLVMBuildBitCast(builder, c00, coord_bld->vec_type, "");
1413 c01 = lp_build_and(ivec_bld, fall_off[1], fall_off[2]);
1414 c01f = LLVMBuildBitCast(builder, c01, coord_bld->vec_type, "");
1415 c10 = lp_build_and(ivec_bld, fall_off[0], fall_off[3]);
1416 c10f = LLVMBuildBitCast(builder, c10, coord_bld->vec_type, "");
1417 c11 = lp_build_and(ivec_bld, fall_off[1], fall_off[3]);
1418 c11f = LLVMBuildBitCast(builder, c11, coord_bld->vec_type, "");
1419
1420 if (!is_gather) {
1421 /*
1422 * we can't use standard 2d lerp as we need per-element weight
1423 * in case of corners, so just calculate bilinear result as
1424 * w00*s00 + w01*s01 + w10*s10 + w11*s11.
1425 * (This is actually less work than using 2d lerp, 7 vs. 9
1426 * instructions; however, calculating the weights needs another 6,
1427 * so it's probably only not slower than 2d lerp when doing all
1428 * 4 channels, as the weights only need to be calculated once - of
1429 * course fixing up the weights has additional cost.)
1430 */
1431 LLVMValueRef w00, w01, w10, w11, wx0, wy0, c_weight, tmp;
1432 wx0 = lp_build_sub(coord_bld, coord_bld->one, s_fpart);
1433 wy0 = lp_build_sub(coord_bld, coord_bld->one, t_fpart);
1434 w00 = lp_build_mul(coord_bld, wx0, wy0);
1435 w01 = lp_build_mul(coord_bld, s_fpart, wy0);
1436 w10 = lp_build_mul(coord_bld, wx0, t_fpart);
1437 w11 = lp_build_mul(coord_bld, s_fpart, t_fpart);
1438
1439 /* find corner weight */
1440 c_weight = lp_build_select(coord_bld, c00, w00, coord_bld->zero);
1441 c_weight = lp_build_select(coord_bld, c01, w01, c_weight);
1442 c_weight = lp_build_select(coord_bld, c10, w10, c_weight);
1443 c_weight = lp_build_select(coord_bld, c11, w11, c_weight);
1444
1445 /*
1446 * add 1/3 of the corner weight to the weight of the 3 other
1447 * samples and null out corner weight.
1448 */
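/*
 * E.g. if c00 is the corner: w01 += w00/3, w10 += w00/3, w11 += w00/3,
 * then w00 = 0, keeping the total weight w00 + w01 + w10 + w11
 * unchanged at 1.0.
 */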
1449 c_weight = lp_build_mul(coord_bld, c_weight, one_third);
1450 w00 = lp_build_add(coord_bld, w00, c_weight);
1451 w00 = lp_build_andnot(coord_bld, w00, c00f);
1452 w01 = lp_build_add(coord_bld, w01, c_weight);
1453 w01 = lp_build_andnot(coord_bld, w01, c01f);
1454 w10 = lp_build_add(coord_bld, w10, c_weight);
1455 w10 = lp_build_andnot(coord_bld, w10, c10f);
1456 w11 = lp_build_add(coord_bld, w11, c_weight);
1457 w11 = lp_build_andnot(coord_bld, w11, c11f);
1458
1459 if (bld->static_sampler_state->compare_mode ==
1460 PIPE_TEX_COMPARE_NONE) {
1461 for (chan = 0; chan < 4; chan++) {
1462 colors0[chan] = lp_build_mul(coord_bld, w00,
1463 neighbors[0][0][chan]);
1464 tmp = lp_build_mul(coord_bld, w01, neighbors[0][1][chan]);
1465 colors0[chan] = lp_build_add(coord_bld, tmp, colors0[chan]);
1466 tmp = lp_build_mul(coord_bld, w10, neighbors[1][0][chan]);
1467 colors0[chan] = lp_build_add(coord_bld, tmp, colors0[chan]);
1468 tmp = lp_build_mul(coord_bld, w11, neighbors[1][1][chan]);
1469 colors0[chan] = lp_build_add(coord_bld, tmp, colors0[chan]);
1470 }
1471 }
1472 else {
1473 LLVMValueRef cmpval00, cmpval01, cmpval10, cmpval11;
1474 cmpval00 = lp_build_sample_comparefunc(bld, coords[4],
1475 neighbors[0][0][0]);
1476 cmpval01 = lp_build_sample_comparefunc(bld, coords[4],
1477 neighbors[0][1][0]);
1478 cmpval10 = lp_build_sample_comparefunc(bld, coords[4],
1479 neighbors[1][0][0]);
1480 cmpval11 = lp_build_sample_comparefunc(bld, coords[4],
1481 neighbors[1][1][0]);
1482 /*
1483 * inputs to interpolation are just masks so just add
1484 * masked weights together
1485 */
1486 cmpval00 = LLVMBuildBitCast(builder, cmpval00,
1487 coord_bld->vec_type, "");
1488 cmpval01 = LLVMBuildBitCast(builder, cmpval01,
1489 coord_bld->vec_type, "");
1490 cmpval10 = LLVMBuildBitCast(builder, cmpval10,
1491 coord_bld->vec_type, "");
1492 cmpval11 = LLVMBuildBitCast(builder, cmpval11,
1493 coord_bld->vec_type, "");
1494 colors0[0] = lp_build_and(coord_bld, w00, cmpval00);
1495 tmp = lp_build_and(coord_bld, w01, cmpval01);
1496 colors0[0] = lp_build_add(coord_bld, tmp, colors0[0]);
1497 tmp = lp_build_and(coord_bld, w10, cmpval10);
1498 colors0[0] = lp_build_add(coord_bld, tmp, colors0[0]);
1499 tmp = lp_build_and(coord_bld, w11, cmpval11);
1500 colors0[0] = lp_build_add(coord_bld, tmp, colors0[0]);
1501 colors0[1] = colors0[2] = colors0[3] = colors0[0];
1502 }
1503 }
1504 else {
1505 /*
1506 * We don't have any weights to adjust, so instead calculate
1507 * the fourth texel as simply the average of the other 3.
1508 * (This would work for non-gather too, however we'd have
1509 * a boatload more of the select stuff due to there being
1510 * 4 times as many colors as weights.)
1511 */
1512 LLVMValueRef col00, col01, col10, col11;
1513 LLVMValueRef colc, colc0, colc1;
1514 col10 = lp_build_swizzle_soa_channel(texel_bld,
1515 neighbors[1][0], chan_swiz);
1516 col11 = lp_build_swizzle_soa_channel(texel_bld,
1517 neighbors[1][1], chan_swiz);
1518 col01 = lp_build_swizzle_soa_channel(texel_bld,
1519 neighbors[0][1], chan_swiz);
1520 col00 = lp_build_swizzle_soa_channel(texel_bld,
1521 neighbors[0][0], chan_swiz);
1522
1523 /*
1524 * The spec says that for comparison filtering the comparison
1525 * must happen before synthesizing the new value.
1526 * This means all gathered values are always 0 or 1, except
1527 * for the non-existing texel, which can be 0, 1/3, 2/3 or 1.
1528 * It seems we'd be allowed to just return 0 or 1 there too, so
1529 * we could simplify by passing the compare mask values down to
1530 * the end (using int arithmetic/compares on the mask values to
1531 * construct the fourth texel) and only converting to floats
1532 * there, but it's probably not worth it (it might be easier on
1533 * the cpu but not on the code)...
1534 */
1535 if (bld->static_sampler_state->compare_mode !=
1536 PIPE_TEX_COMPARE_NONE) {
1537 LLVMValueRef cmpval00, cmpval01, cmpval10, cmpval11;
1538 cmpval00 = lp_build_sample_comparefunc(bld, coords[4], col00);
1539 cmpval01 = lp_build_sample_comparefunc(bld, coords[4], col01);
1540 cmpval10 = lp_build_sample_comparefunc(bld, coords[4], col10);
1541 cmpval11 = lp_build_sample_comparefunc(bld, coords[4], col11);
1542 col00 = lp_build_select(texel_bld, cmpval00,
1543 texel_bld->one, texel_bld->zero);
1544 col01 = lp_build_select(texel_bld, cmpval01,
1545 texel_bld->one, texel_bld->zero);
1546 col10 = lp_build_select(texel_bld, cmpval10,
1547 texel_bld->one, texel_bld->zero);
1548 col11 = lp_build_select(texel_bld, cmpval11,
1549 texel_bld->one, texel_bld->zero);
1550 }
1551
1552 /*
1553 * Null out corner color.
1554 */
1555 col00 = lp_build_andnot(coord_bld, col00, c00f);
1556 col01 = lp_build_andnot(coord_bld, col01, c01f);
1557 col10 = lp_build_andnot(coord_bld, col10, c10f);
1558 col11 = lp_build_andnot(coord_bld, col11, c11f);
1559
1560 /*
1561 * The new corner texel color is the sum of all colors divided by 3.
1562 */
1563 colc0 = lp_build_add(coord_bld, col00, col01);
1564 colc1 = lp_build_add(coord_bld, col10, col11);
1565 colc = lp_build_add(coord_bld, colc0, colc1);
1566 colc = lp_build_mul(coord_bld, one_third, colc);
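/*
 * Note this relies on the corner color having been nulled above:
 * summing all four colors and multiplying by 1/3 then yields exactly
 * the average of the three valid texels.
 */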
1567
1568 /*
1569 * Replace the corner texel color with the new value.
1570 */
1571 col00 = lp_build_select(coord_bld, c00, colc, col00);
1572 col01 = lp_build_select(coord_bld, c01, colc, col01);
1573 col10 = lp_build_select(coord_bld, c10, colc, col10);
1574 col11 = lp_build_select(coord_bld, c11, colc, col11);
1575
1576 colors0[0] = col10;
1577 colors0[1] = col11;
1578 colors0[2] = col01;
1579 colors0[3] = col00;
1580 }
1581
1582 LLVMBuildStore(builder, colors0[0], colorss[0]);
1583 LLVMBuildStore(builder, colors0[1], colorss[1]);
1584 LLVMBuildStore(builder, colors0[2], colorss[2]);
1585 LLVMBuildStore(builder, colors0[3], colorss[3]);
1586
1587 lp_build_else(&corner_if);
1588 }
1589
1590 if (bld->static_sampler_state->compare_mode == PIPE_TEX_COMPARE_NONE) {
1591 if (is_gather) {
1592 /*
1593 * Just assign the red channel (no component selection yet).
1594 * This is a bit hackish; we usually do the swizzle at the
1595 * end of sampling (far fewer values to swizzle), but that
1596 * obviously cannot work when using gather.
1597 */
1598 colors0[0] = lp_build_swizzle_soa_channel(texel_bld,
1599 neighbors[1][0],
1600 chan_swiz);
1601 colors0[1] = lp_build_swizzle_soa_channel(texel_bld,
1602 neighbors[1][1],
1603 chan_swiz);
1604 colors0[2] = lp_build_swizzle_soa_channel(texel_bld,
1605 neighbors[0][1],
1606 chan_swiz);
1607 colors0[3] = lp_build_swizzle_soa_channel(texel_bld,
1608 neighbors[0][0],
1609 chan_swiz);
1610 }
1611 else {
1612 /* Bilinearly interpolate the four samples from the 2D image / 3D slice */
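/*
 * Per channel, lp_build_lerp_2d computes roughly (scalar sketch,
 * illustrative only):
 *
 *    top    = c00 + s_fpart * (c01 - c00);
 *    bot    = c10 + s_fpart * (c11 - c10);
 *    result = top + t_fpart * (bot - top);
 */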
1613 for (chan = 0; chan < 4; chan++) {
1614 colors0[chan] = lp_build_lerp_2d(texel_bld,
1615 s_fpart, t_fpart,
1616 neighbors[0][0][chan],
1617 neighbors[0][1][chan],
1618 neighbors[1][0][chan],
1619 neighbors[1][1][chan],
1620 0);
1621 }
1622 }
1623 }
1624 else {
1625 LLVMValueRef cmpval00, cmpval01, cmpval10, cmpval11;
1626 cmpval00 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][0][0]);
1627 cmpval01 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][1][0]);
1628 cmpval10 = lp_build_sample_comparefunc(bld, coords[4], neighbors[1][0][0]);
1629 cmpval11 = lp_build_sample_comparefunc(bld, coords[4], neighbors[1][1][0]);
1630
1631 if (is_gather) {
1632 /* more hacks for swizzling, should be X, ONE or ZERO... */
1633 colors0[0] = lp_build_select(texel_bld, cmpval10,
1634 texel_bld->one, texel_bld->zero);
1635 colors0[1] = lp_build_select(texel_bld, cmpval11,
1636 texel_bld->one, texel_bld->zero);
1637 colors0[2] = lp_build_select(texel_bld, cmpval01,
1638 texel_bld->one, texel_bld->zero);
1639 colors0[3] = lp_build_select(texel_bld, cmpval00,
1640 texel_bld->one, texel_bld->zero);
1641 }
1642 else {
1643 colors0[0] = lp_build_masklerp2d(texel_bld, s_fpart, t_fpart,
1644 cmpval00, cmpval01, cmpval10, cmpval11);
1645 colors0[1] = colors0[2] = colors0[3] = colors0[0];
1646 }
1647 }
1648
1649 if (accurate_cube_corners) {
1650 LLVMBuildStore(builder, colors0[0], colorss[0]);
1651 LLVMBuildStore(builder, colors0[1], colorss[1]);
1652 LLVMBuildStore(builder, colors0[2], colorss[2]);
1653 LLVMBuildStore(builder, colors0[3], colorss[3]);
1654
1655 lp_build_endif(&corner_if);
1656
1657 colors0[0] = LLVMBuildLoad(builder, colorss[0], "");
1658 colors0[1] = LLVMBuildLoad(builder, colorss[1], "");
1659 colors0[2] = LLVMBuildLoad(builder, colorss[2], "");
1660 colors0[3] = LLVMBuildLoad(builder, colorss[3], "");
1661 }
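/*
 * Note the store/load pairs around the corner_if above emulate phi
 * nodes: lp_build_if()/lp_build_else() emit plain branches, so values
 * computed in either arm are passed through stack allocas (which
 * llvm's mem2reg pass can promote back to SSA values).
 */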
1662
1663 if (dims == 3) {
1664 LLVMValueRef neighbors1[2][2][4];
1665 LLVMValueRef colors1[4];
1666
1667 assert(!is_gather);
1668
1669 /* get x0/x1/y0/y1 texels at z1 */
1670 lp_build_sample_texel_soa(bld,
1671 width_vec, height_vec, depth_vec,
1672 x00, y00, z1,
1673 row_stride_vec, img_stride_vec,
1674 data_ptr, mipoffsets, neighbors1[0][0]);
1675 lp_build_sample_texel_soa(bld,
1676 width_vec, height_vec, depth_vec,
1677 x01, y01, z1,
1678 row_stride_vec, img_stride_vec,
1679 data_ptr, mipoffsets, neighbors1[0][1]);
1680 lp_build_sample_texel_soa(bld,
1681 width_vec, height_vec, depth_vec,
1682 x10, y10, z1,
1683 row_stride_vec, img_stride_vec,
1684 data_ptr, mipoffsets, neighbors1[1][0]);
1685 lp_build_sample_texel_soa(bld,
1686 width_vec, height_vec, depth_vec,
1687 x11, y11, z1,
1688 row_stride_vec, img_stride_vec,
1689 data_ptr, mipoffsets, neighbors1[1][1]);
1690
1691 if (bld->static_sampler_state->compare_mode == PIPE_TEX_COMPARE_NONE) {
1692 /* Bilinearly interpolate the four samples from the second Z slice */
1693 for (chan = 0; chan < 4; chan++) {
1694 colors1[chan] = lp_build_lerp_2d(texel_bld,
1695 s_fpart, t_fpart,
1696 neighbors1[0][0][chan],
1697 neighbors1[0][1][chan],
1698 neighbors1[1][0][chan],
1699 neighbors1[1][1][chan],
1700 0);
1701 }
1702 /* Linearly interpolate the two samples from the two 3D slices */
1703 for (chan = 0; chan < 4; chan++) {
1704 colors_out[chan] = lp_build_lerp(texel_bld,
1705 r_fpart,
1706 colors0[chan], colors1[chan],
1707 0);
1708 }
1709 }
1710 else {
1711 LLVMValueRef cmpval00, cmpval01, cmpval10, cmpval11;
1712 cmpval00 = lp_build_sample_comparefunc(bld, coords[4], neighbors1[0][0][0]);
1713 cmpval01 = lp_build_sample_comparefunc(bld, coords[4], neighbors1[0][1][0]);
1714 cmpval10 = lp_build_sample_comparefunc(bld, coords[4], neighbors1[1][0][0]);
1715 cmpval11 = lp_build_sample_comparefunc(bld, coords[4], neighbors1[1][1][0]);
1716 colors1[0] = lp_build_masklerp2d(texel_bld, s_fpart, t_fpart,
1717 cmpval00, cmpval01, cmpval10, cmpval11);
1718 /* Linearly interpolate the two samples from the two 3D slices */
1719 colors_out[0] = lp_build_lerp(texel_bld,
1720 r_fpart,
1721 colors0[0], colors1[0],
1722 0);
1723 colors_out[1] = colors_out[2] = colors_out[3] = colors_out[0];
1724 }
1725 }
1726 else {
1727 /* 2D tex */
1728 for (chan = 0; chan < 4; chan++) {
1729 colors_out[chan] = colors0[chan];
1730 }
1731 }
1732 }
1733 if (is_gather) {
1734 /*
1735 * For gather, we can't do our usual channel swizzling later,
1736 * so do it here. It only really matters for 0/1 swizzles with
1737 * comparison filtering, since there the results would be wrong;
1738 * without comparison it should all work out anyway, but it can't
1739 * hurt to do it here since it instantly drops all the calculations
1740 * above (even though doing a gather on a channel which will always
1741 * return 0 or 1 is a rather questionable idea in any case)...
1742 */
1743 if (chan_swiz == PIPE_SWIZZLE_1) {
1744 for (chan = 0; chan < 4; chan++) {
1745 colors_out[chan] = texel_bld->one;
1746 }
1747 } else if (chan_swiz == PIPE_SWIZZLE_0) {
1748 for (chan = 0; chan < 4; chan++) {
1749 colors_out[chan] = texel_bld->zero;
1750 }
1751 }
1752 }
1753 }
1754
1755
1756 /**
1757 * Sample the texture/mipmap using given image filter and mip filter.
1758 * ilevel0 and ilevel1 indicate the two mipmap levels to sample
1759 * from (vectors or scalars).
1760 * If we're using nearest miplevel sampling the '1' values will be null/unused.
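 *
 * Roughly, per channel (scalar sketch, illustrative only):
 *
 *    c = sample_image(img_filter, ilevel0, coords);
 *    if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR && lod_fpart > 0.0f)
 *       c = lerp(lod_fpart, c, sample_image(img_filter, ilevel1, coords));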
1761 */
1762 static void
1763 lp_build_sample_mipmap(struct lp_build_sample_context *bld,
1764 unsigned img_filter,
1765 unsigned mip_filter,
1766 boolean is_gather,
1767 const LLVMValueRef *coords,
1768 const LLVMValueRef *offsets,
1769 LLVMValueRef ilevel0,
1770 LLVMValueRef ilevel1,
1771 LLVMValueRef lod_fpart,
1772 LLVMValueRef *colors_out)
1773 {
1774 LLVMBuilderRef builder = bld->gallivm->builder;
1775 LLVMValueRef size0 = NULL;
1776 LLVMValueRef size1 = NULL;
1777 LLVMValueRef row_stride0_vec = NULL;
1778 LLVMValueRef row_stride1_vec = NULL;
1779 LLVMValueRef img_stride0_vec = NULL;
1780 LLVMValueRef img_stride1_vec = NULL;
1781 LLVMValueRef data_ptr0 = NULL;
1782 LLVMValueRef data_ptr1 = NULL;
1783 LLVMValueRef mipoff0 = NULL;
1784 LLVMValueRef mipoff1 = NULL;
1785 LLVMValueRef colors0[4], colors1[4];
1786 unsigned chan;
1787
1788 /* sample the first mipmap level */
1789 lp_build_mipmap_level_sizes(bld, ilevel0,
1790 &size0,
1791 &row_stride0_vec, &img_stride0_vec);
1792 if (bld->num_mips == 1) {
1793 data_ptr0 = lp_build_get_mipmap_level(bld, ilevel0);
1794 }
1795 else {
1796 /* This path should work for num_lods 1 too, but is slightly less efficient */
1797 data_ptr0 = bld->base_ptr;
1798 mipoff0 = lp_build_get_mip_offsets(bld, ilevel0);
1799 }
1800 if (img_filter == PIPE_TEX_FILTER_NEAREST) {
1801 lp_build_sample_image_nearest(bld, size0,
1802 row_stride0_vec, img_stride0_vec,
1803 data_ptr0, mipoff0, coords, offsets,
1804 colors0);
1805 }
1806 else {
1807 assert(img_filter == PIPE_TEX_FILTER_LINEAR);
1808 lp_build_sample_image_linear(bld, is_gather, size0, NULL,
1809 row_stride0_vec, img_stride0_vec,
1810 data_ptr0, mipoff0, coords, offsets,
1811 colors0);
1812 }
1813
1814 /* Store the first level's colors in the output variables */
1815 for (chan = 0; chan < 4; chan++) {
1816 LLVMBuildStore(builder, colors0[chan], colors_out[chan]);
1817 }
1818
1819 if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
1820 struct lp_build_if_state if_ctx;
1821 LLVMValueRef need_lerp;
1822
1823 /* need_lerp = lod_fpart > 0 */
1824 if (bld->num_lods == 1) {
1825 need_lerp = LLVMBuildFCmp(builder, LLVMRealUGT,
1826 lod_fpart, bld->lodf_bld.zero,
1827 "need_lerp");
1828 }
1829 else {
1830 /*
1831 * We'll do mip filtering if any of the quads (or individual
1832 * pixels in case of per-pixel lod) need it.
1833 * It might be better to split the vectors here and only fetch/filter
1834 * quads which need it (if there's one lod per quad).
1835 */
1836 need_lerp = lp_build_compare(bld->gallivm, bld->lodf_bld.type,
1837 PIPE_FUNC_GREATER,
1838 lod_fpart, bld->lodf_bld.zero);
1839 need_lerp = lp_build_any_true_range(&bld->lodi_bld, bld->num_lods, need_lerp);
1840 lp_build_name(need_lerp, "need_lerp");
1841 }
1842
1843 lp_build_if(&if_ctx, bld->gallivm, need_lerp);
1844 {
1845 /*
1846 * We unfortunately need to clamp lod_fpart here since we can get
1847 * negative values which would screw up filtering if not all
1848 * lod_fpart values have the same sign.
1849 */
1850 lod_fpart = lp_build_max(&bld->lodf_bld, lod_fpart,
1851 bld->lodf_bld.zero);
1852 /* sample the second mipmap level */
1853 lp_build_mipmap_level_sizes(bld, ilevel1,
1854 &size1,
1855 &row_stride1_vec, &img_stride1_vec);
1856 if (bld->num_mips == 1) {
1857 data_ptr1 = lp_build_get_mipmap_level(bld, ilevel1);
1858 }
1859 else {
1860 data_ptr1 = bld->base_ptr;
1861 mipoff1 = lp_build_get_mip_offsets(bld, ilevel1);
1862 }
1863 if (img_filter == PIPE_TEX_FILTER_NEAREST) {
1864 lp_build_sample_image_nearest(bld, size1,
1865 row_stride1_vec, img_stride1_vec,
1866 data_ptr1, mipoff1, coords, offsets,
1867 colors1);
1868 }
1869 else {
1870 lp_build_sample_image_linear(bld, FALSE, size1, NULL,
1871 row_stride1_vec, img_stride1_vec,
1872 data_ptr1, mipoff1, coords, offsets,
1873 colors1);
1874 }
1875
1876 /* interpolate samples from the two mipmap levels */
1877
1878 if (bld->num_lods != bld->coord_type.length)
1879 lod_fpart = lp_build_unpack_broadcast_aos_scalars(bld->gallivm,
1880 bld->lodf_bld.type,
1881 bld->texel_bld.type,
1882 lod_fpart);
1883
1884 for (chan = 0; chan < 4; chan++) {
1885 colors0[chan] = lp_build_lerp(&bld->texel_bld, lod_fpart,
1886 colors0[chan], colors1[chan],
1887 0);
1888 LLVMBuildStore(builder, colors0[chan], colors_out[chan]);
1889 }
1890 }
1891 lp_build_endif(&if_ctx);
1892 }
1893 }
1894
1895
1896 /**
1897 * Sample the texture/mipmap using given mip filter, and using
1898 * both nearest and linear filtering at the same time depending
1899 * on linear_mask.
1900 * lod can be per quad but linear_mask is always per pixel.
1901 * ilevel0 and ilevel1 indicate the two mipmap levels to sample
1902 * from (vectors or scalars).
1903 * If we're using nearest miplevel sampling the '1' values will be null/unused.
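 *
 * Roughly (illustrative): the linear sampling path is used for all
 * pixels, with linear_mask passed down to lp_build_sample_image_linear
 * so it can adjust the filter weights for lanes where the mask is
 * false, making those lanes effectively get nearest filtering.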
1904 */
1905 static void
1906 lp_build_sample_mipmap_both(struct lp_build_sample_context *bld,
1907 LLVMValueRef linear_mask,
1908 unsigned mip_filter,
1909 const LLVMValueRef *coords,
1910 const LLVMValueRef *offsets,
1911 LLVMValueRef ilevel0,
1912 LLVMValueRef ilevel1,
1913 LLVMValueRef lod_fpart,
1914 LLVMValueRef lod_positive,
1915 LLVMValueRef *colors_out)
1916 {
1917 LLVMBuilderRef builder = bld->gallivm->builder;
1918 LLVMValueRef size0 = NULL;
1919 LLVMValueRef size1 = NULL;
1920 LLVMValueRef row_stride0_vec = NULL;
1921 LLVMValueRef row_stride1_vec = NULL;
1922 LLVMValueRef img_stride0_vec = NULL;
1923 LLVMValueRef img_stride1_vec = NULL;
1924 LLVMValueRef data_ptr0 = NULL;
1925 LLVMValueRef data_ptr1 = NULL;
1926 LLVMValueRef mipoff0 = NULL;
1927 LLVMValueRef mipoff1 = NULL;
1928 LLVMValueRef colors0[4], colors1[4];
1929 unsigned chan;
1930
1931 /* sample the first mipmap level */
1932 lp_build_mipmap_level_sizes(bld, ilevel0,
1933 &size0,
1934 &row_stride0_vec, &img_stride0_vec);
1935 if (bld->num_mips == 1) {
1936 data_ptr0 = lp_build_get_mipmap_level(bld, ilevel0);
1937 }
1938 else {
1939 /* This path should work for num_lods 1 too, but is slightly less efficient */
1940 data_ptr0 = bld->base_ptr;
1941 mipoff0 = lp_build_get_mip_offsets(bld, ilevel0);
1942 }
1943
1944 lp_build_sample_image_linear(bld, FALSE, size0, linear_mask,
1945 row_stride0_vec, img_stride0_vec,
1946 data_ptr0, mipoff0, coords, offsets,
1947 colors0);
1948
1949 /* Store the first level's colors in the output variables */
1950 for (chan = 0; chan < 4; chan++) {
1951 LLVMBuildStore(builder, colors0[chan], colors_out[chan]);
1952 }
1953
1954 if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
1955 struct lp_build_if_state if_ctx;
1956 LLVMValueRef need_lerp;
1957
1958 /*
1959 * We'll do mip filtering if any of the quads (or individual
1960 * pixels in case of per-pixel lod) need it.
1961 * Note we use lod_positive here, not lod_fpart, since it may be the
1962 * same condition as the one used in the outer "if" in the caller,
1963 * hence llvm should be able to merge the branches in this case.
1964 */
1965 need_lerp = lp_build_any_true_range(&bld->lodi_bld, bld->num_lods, lod_positive);
1966 lp_build_name(need_lerp, "need_lerp");
1967
1968 lp_build_if(&if_ctx, bld->gallivm, need_lerp);
1969 {
1970 /*
1971 * We unfortunately need to clamp lod_fpart here since we can get
1972 * negative values which would screw up filtering if not all
1973 * lod_fpart values have the same sign.
1974 */
1975 lod_fpart = lp_build_max(&bld->lodf_bld, lod_fpart,
1976 bld->lodf_bld.zero);
1977 /* sample the second mipmap level */
1978 lp_build_mipmap_level_sizes(bld, ilevel1,
1979 &size1,
1980 &row_stride1_vec, &img_stride1_vec);
1981 if (bld->num_mips == 1) {
1982 data_ptr1 = lp_build_get_mipmap_level(bld, ilevel1);
1983 }
1984 else {
1985 data_ptr1 = bld->base_ptr;
1986 mipoff1 = lp_build_get_mip_offsets(bld, ilevel1);
1987 }
1988
1989 lp_build_sample_image_linear(bld, FALSE, size1, linear_mask,
1990 row_stride1_vec, img_stride1_vec,
1991 data_ptr1, mipoff1, coords, offsets,
1992 colors1);
1993
1994 /* interpolate samples from the two mipmap levels */
1995
1996 if (bld->num_lods != bld->coord_type.length)
1997 lod_fpart = lp_build_unpack_broadcast_aos_scalars(bld->gallivm,
1998 bld->lodf_bld.type,
1999 bld->texel_bld.type,
2000 lod_fpart);
2001
2002 for (chan = 0; chan < 4; chan++) {
2003 colors0[chan] = lp_build_lerp(&bld->texel_bld, lod_fpart,
2004 colors0[chan], colors1[chan],
2005 0);
2006 LLVMBuildStore(builder, colors0[chan], colors_out[chan]);
2007 }
2008 }
2009 lp_build_endif(&if_ctx);
2010 }
2011 }
2012
2013
2014 /**
2015 * Build (per-coord) layer value.
2016 * Either clamp the layer to valid values, or fill in the optional
2017 * out_of_bounds mask and return the value unclamped.
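 *
 * Scalar sketch (illustrative):
 *
 *    if (out_of_bounds)      // just flag, return layer unclamped
 *       *out_of_bounds = layer < 0 || layer >= num_layers;
 *    else                    // clamp to the last accessible layer
 *       layer = CLAMP(layer, 0, num_layers - (is_cube_array ? 6 : 1));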
2018 */
2019 static LLVMValueRef
2020 lp_build_layer_coord(struct lp_build_sample_context *bld,
2021 unsigned texture_unit,
2022 boolean is_cube_array,
2023 LLVMValueRef layer,
2024 LLVMValueRef *out_of_bounds)
2025 {
2026 LLVMValueRef num_layers;
2027 struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
2028
2029 num_layers = bld->dynamic_state->depth(bld->dynamic_state, bld->gallivm,
2030 bld->context_ptr, texture_unit);
2031
2032 if (out_of_bounds) {
2033 LLVMValueRef out1, out;
2034 assert(!is_cube_array);
2035 num_layers = lp_build_broadcast_scalar(int_coord_bld, num_layers);
2036 out = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, layer, int_coord_bld->zero);
2037 out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, layer, num_layers);
2038 *out_of_bounds = lp_build_or(int_coord_bld, out, out1);
2039 return layer;
2040 }
2041 else {
2042 LLVMValueRef maxlayer;
2043 LLVMValueRef s = is_cube_array ? lp_build_const_int32(bld->gallivm, 6) :
2044 bld->int_bld.one;
2045 maxlayer = lp_build_sub(&bld->int_bld, num_layers, s);
2046 maxlayer = lp_build_broadcast_scalar(int_coord_bld, maxlayer);
2047 return lp_build_clamp(int_coord_bld, layer, int_coord_bld->zero, maxlayer);
2048 }
2049 }
2050
2051
2052 /**
2053 * Calculate cube face, lod, mip levels.
2054 */
2055 static void
2056 lp_build_sample_common(struct lp_build_sample_context *bld,
2057 boolean is_lodq,
2058 unsigned texture_index,
2059 unsigned sampler_index,
2060 LLVMValueRef *coords,
2061 const struct lp_derivatives *derivs, /* optional */
2062 LLVMValueRef lod_bias, /* optional */
2063 LLVMValueRef explicit_lod, /* optional */
2064 LLVMValueRef *lod_pos_or_zero,
2065 LLVMValueRef *lod,
2066 LLVMValueRef *lod_fpart,
2067 LLVMValueRef *ilevel0,
2068 LLVMValueRef *ilevel1)
2069 {
2070 const unsigned mip_filter = bld->static_sampler_state->min_mip_filter;
2071 const unsigned min_filter = bld->static_sampler_state->min_img_filter;
2072 const unsigned mag_filter = bld->static_sampler_state->mag_img_filter;
2073 const unsigned target = bld->static_texture_state->target;
2074 LLVMValueRef first_level, cube_rho = NULL;
2075 LLVMValueRef lod_ipart = NULL;
2076 struct lp_derivatives cube_derivs;
2077
2078 /*
2079 printf("%s mip %d min %d mag %d\n", __FUNCTION__,
2080 mip_filter, min_filter, mag_filter);
2081 */
2082
2083 /*
2084 * Choose cube face, recompute texcoords for the chosen face and
2085 * compute rho here too (as it requires transform of derivatives).
2086 */
2087 if (target == PIPE_TEXTURE_CUBE || target == PIPE_TEXTURE_CUBE_ARRAY) {
2088 boolean need_derivs;
2089 need_derivs = ((min_filter != mag_filter ||
2090 mip_filter != PIPE_TEX_MIPFILTER_NONE) &&
2091 !bld->static_sampler_state->min_max_lod_equal &&
2092 !explicit_lod);
2093 lp_build_cube_lookup(bld, coords, derivs, &cube_rho, &cube_derivs, need_derivs);
2094 derivs = &cube_derivs;
2095 if (target == PIPE_TEXTURE_CUBE_ARRAY) {
2096 /* calculate cube layer coord now */
2097 LLVMValueRef layer = lp_build_iround(&bld->coord_bld, coords[3]);
2098 LLVMValueRef six = lp_build_const_int_vec(bld->gallivm, bld->int_coord_type, 6);
2099 layer = lp_build_mul(&bld->int_coord_bld, layer, six);
2100 coords[3] = lp_build_layer_coord(bld, texture_index, TRUE, layer, NULL);
2101 /* because of seamless filtering we can't add it to the face (coords[2]) here. */
2102 }
2103 }
2104 else if (target == PIPE_TEXTURE_1D_ARRAY ||
2105 target == PIPE_TEXTURE_2D_ARRAY) {
2106 coords[2] = lp_build_iround(&bld->coord_bld, coords[2]);
2107 coords[2] = lp_build_layer_coord(bld, texture_index, FALSE, coords[2], NULL);
2108 }
2109
2110 if (bld->static_sampler_state->compare_mode != PIPE_TEX_COMPARE_NONE) {
2111 /*
2112 * Clamp p coords to [0,1] for fixed-function depth texture formats here.
2113 * Technically this is not entirely correct for unorm depth, as the ref value
2114 * should be converted to the depth format (quantization!) and the comparison
2115 * then done in texture format. That would actually help performance (it only
2116 * needs to be done once, saving the per-sample conversion of texels to
2117 * floats), but it would need messier code (at least some bits would have to
2118 * be pushed down to the actual fetch so the conversion could be skipped,
2119 * and it would interact badly with border color, which would need to be
2120 * converted to that format too, or other tricks to make it work).
2121 */
2122 const struct util_format_description *format_desc = bld->format_desc;
2123 unsigned chan_type;
2124 /* not entirely sure we couldn't end up with an invalid swizzle here */
2125 chan_type = format_desc->swizzle[0] <= PIPE_SWIZZLE_W ?
2126 format_desc->channel[format_desc->swizzle[0]].type :
2127 UTIL_FORMAT_TYPE_FLOAT;
2128 if (chan_type != UTIL_FORMAT_TYPE_FLOAT) {
2129 coords[4] = lp_build_clamp(&bld->coord_bld, coords[4],
2130 bld->coord_bld.zero, bld->coord_bld.one);
2131 }
2132 }
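   /*
    * For example (illustrative): with Z16_UNORM the texels decode to
    * [0,1], so a shadow ref of e.g. 1.5 must be clamped to 1.0, otherwise
    * the result would differ from comparing in the texture format.
    */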
2133
2134 /*
2135 * Compute the level of detail (float).
2136 */
2137 if (min_filter != mag_filter ||
2138 mip_filter != PIPE_TEX_MIPFILTER_NONE || is_lodq) {
2139 /* Need to compute lod either to choose mipmap levels or to
2140 * distinguish between minification/magnification with one mipmap level.
2141 */
2142 lp_build_lod_selector(bld, is_lodq, texture_index, sampler_index,
2143 coords[0], coords[1], coords[2], cube_rho,
2144 derivs, lod_bias, explicit_lod,
2145 mip_filter, lod,
2146 &lod_ipart, lod_fpart, lod_pos_or_zero);
2147 if (is_lodq) {
2148 LLVMValueRef last_level;
2149 last_level = bld->dynamic_state->last_level(bld->dynamic_state,
2150 bld->gallivm,
2151 bld->context_ptr,
2152 texture_index);
2153 first_level = bld->dynamic_state->first_level(bld->dynamic_state,
2154 bld->gallivm,
2155 bld->context_ptr,
2156 texture_index);
2157 last_level = lp_build_sub(&bld->int_bld, last_level, first_level);
2158 last_level = lp_build_int_to_float(&bld->float_bld, last_level);
2159 last_level = lp_build_broadcast_scalar(&bld->lodf_bld, last_level);
2160
2161 switch (mip_filter) {
2162 case PIPE_TEX_MIPFILTER_NONE:
2163 *lod_fpart = bld->lodf_bld.zero;
2164 break;
2165 case PIPE_TEX_MIPFILTER_NEAREST:
2166 *lod_fpart = lp_build_round(&bld->lodf_bld, *lod_fpart);
2167 /* fallthrough */
2168 case PIPE_TEX_MIPFILTER_LINEAR:
2169 *lod_fpart = lp_build_clamp(&bld->lodf_bld, *lod_fpart,
2170 bld->lodf_bld.zero, last_level);
2171 break;
2172 }
2173 return;
2174 }
2175
2176 } else {
2177 lod_ipart = bld->lodi_bld.zero;
2178 *lod_pos_or_zero = bld->lodi_bld.zero;
2179 }
2180
2181 if (bld->num_lods != bld->num_mips) {
2182 /* only makes sense if there's just a single mip level */
2183 assert(bld->num_mips == 1);
2184 lod_ipart = lp_build_extract_range(bld->gallivm, lod_ipart, 0, 1);
2185 }
2186
2187 /*
2188 * Compute integer mipmap level(s) to fetch texels from: ilevel0, ilevel1
2189 */
2190 switch (mip_filter) {
2191 default:
2192 assert(0 && "bad mip_filter value in lp_build_sample_soa()");
2193 /* fall-through */
2194 case PIPE_TEX_MIPFILTER_NONE:
2195 /* always use mip level 0 */
2196 first_level = bld->dynamic_state->first_level(bld->dynamic_state,
2197 bld->gallivm, bld->context_ptr,
2198 texture_index);
2199 first_level = lp_build_broadcast_scalar(&bld->leveli_bld, first_level);
2200 *ilevel0 = first_level;
2201 break;
2202 case PIPE_TEX_MIPFILTER_NEAREST:
2203 assert(lod_ipart);
2204 lp_build_nearest_mip_level(bld, texture_index, lod_ipart, ilevel0, NULL);
2205 break;
2206 case PIPE_TEX_MIPFILTER_LINEAR:
2207 assert(lod_ipart);
2208 assert(*lod_fpart);
2209 lp_build_linear_mip_levels(bld, texture_index,
2210 lod_ipart, lod_fpart,
2211 ilevel0, ilevel1);
2212 break;
2213 }
2214 }
2215
2216 static void
2217 lp_build_clamp_border_color(struct lp_build_sample_context *bld,
2218 unsigned sampler_unit)
2219 {
2220 struct gallivm_state *gallivm = bld->gallivm;
2221 LLVMBuilderRef builder = gallivm->builder;
2222 LLVMValueRef border_color_ptr =
2223 bld->dynamic_state->border_color(bld->dynamic_state, gallivm,
2224 bld->context_ptr, sampler_unit);
2225 LLVMValueRef border_color;
2226 const struct util_format_description *format_desc = bld->format_desc;
2227 struct lp_type vec4_type = bld->texel_type;
2228 struct lp_build_context vec4_bld;
2229 LLVMValueRef min_clamp = NULL;
2230 LLVMValueRef max_clamp = NULL;
2231
2232 /*
2233 * For normalized formats we need to clamp the border color (technically
2234 * we probably should also quantize the data). It really sucks doing this
2235 * here but it can't be avoided, at least for now, since this is part of
2236 * sampler state while texture format is part of sampler_view state.
2237 * GL also expects clamping for uint/sint formats, so do that as well
2238 * (d3d10 can't end up here with uint/sint since it only supports them
2239 * with ld).
2240 */
2241 vec4_type.length = 4;
2242 lp_build_context_init(&vec4_bld, gallivm, vec4_type);
2243
2244 /*
2245 * Vectorized clamping of border color. Loading is a bit of a hack since
2246 * we just cast the pointer to float array to pointer to vec4
2247 * (int or float).
2248 */
2249 border_color_ptr = lp_build_array_get_ptr(gallivm, border_color_ptr,
2250 lp_build_const_int32(gallivm, 0));
2251 border_color_ptr = LLVMBuildBitCast(builder, border_color_ptr,
2252 LLVMPointerType(vec4_bld.vec_type, 0), "");
2253 border_color = LLVMBuildLoad(builder, border_color_ptr, "");
2254 /* we don't have aligned type in the dynamic state unfortunately */
2255 LLVMSetAlignment(border_color, 4);
2256
2257 /*
2258 * Instead of having some incredibly complex logic which will try to figure out
2259 * clamping necessary for each channel, simply use the first channel, and treat
2260 * mixed signed/unsigned normalized formats specially.
2261 * (Mixed non-normalized, which wouldn't work at all here, do not exist for a
2262 * good reason.)
2263 */
2264 if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN) {
2265 int chan;
2266 /* d/s needs special handling because both present means just sampling depth */
2267 if (util_format_is_depth_and_stencil(format_desc->format)) {
2268 chan = format_desc->swizzle[0];
2269 }
2270 else {
2271 chan = util_format_get_first_non_void_channel(format_desc->format);
2272 }
2273 if (chan >= 0 && chan <= PIPE_SWIZZLE_W) {
2274 unsigned chan_type = format_desc->channel[chan].type;
2275 unsigned chan_norm = format_desc->channel[chan].normalized;
2276 unsigned chan_pure = format_desc->channel[chan].pure_integer;
2277 if (chan_type == UTIL_FORMAT_TYPE_SIGNED) {
2278 if (chan_norm) {
2279 min_clamp = lp_build_const_vec(gallivm, vec4_type, -1.0F);
2280 max_clamp = vec4_bld.one;
2281 }
2282 else if (chan_pure) {
2283 /*
2284 * Border color was stored as int, hence we need min/max clamps
2285 * only if the channel has less than 32 bits.
2286 */
2287 unsigned chan_size = format_desc->channel[chan].size;
2288 if (chan_size < 32) {
2289 min_clamp = lp_build_const_int_vec(gallivm, vec4_type,
2290 0 - (1 << (chan_size - 1)));
2291 max_clamp = lp_build_const_int_vec(gallivm, vec4_type,
2292 (1 << (chan_size - 1)) - 1);
2293 }
2294 }
2295 /* TODO: no idea about non-pure, non-normalized! */
2296 }
2297 else if (chan_type == UTIL_FORMAT_TYPE_UNSIGNED) {
2298 if (chan_norm) {
2299 min_clamp = vec4_bld.zero;
2300 max_clamp = vec4_bld.one;
2301 }
2302 /*
2303 * Need an ugly hack here: because we don't have Z32_FLOAT_X8X24,
2304 * we use Z32_FLOAT_S8X24 to imply sampling the depth component
2305 * and ignoring stencil, which will blow up here if we try to
2306 * do a uint clamp in a float texel build...
2307 * And even if we had that format, mesa st also thinks using z24s8
2308 * means depth sampling ignoring stencil.
2309 */
2310 else if (chan_pure) {
2311 /*
2312 * Border color was stored as uint, hence we never need a min
2313 * clamp, and only need a max clamp if the channel has less than 32 bits.
2314 */
2315 unsigned chan_size = format_desc->channel[chan].size;
2316 if (chan_size < 32) {
2317 max_clamp = lp_build_const_int_vec(gallivm, vec4_type,
2318 (1 << chan_size) - 1);
2319 }
2320 /* TODO: no idea about non-pure, non-normalized! */
2321 }
2322 }
2323 else if (chan_type == UTIL_FORMAT_TYPE_FIXED) {
2324 /* TODO: I have no idea what clamp this would need if any! */
2325 }
2326 }
2327 /* mixed plain formats (or different pure size) */
2328 switch (format_desc->format) {
2329 case PIPE_FORMAT_B10G10R10A2_UINT:
2330 case PIPE_FORMAT_R10G10B10A2_UINT:
2331 {
2332 unsigned max10 = (1 << 10) - 1;
2333 max_clamp = lp_build_const_aos(gallivm, vec4_type, max10, max10,
2334 max10, (1 << 2) - 1, NULL);
2335 }
2336 break;
2337 case PIPE_FORMAT_R10SG10SB10SA2U_NORM:
2338 min_clamp = lp_build_const_aos(gallivm, vec4_type, -1.0F, -1.0F,
2339 -1.0F, 0.0F, NULL);
2340 max_clamp = vec4_bld.one;
2341 break;
2342 case PIPE_FORMAT_R8SG8SB8UX8U_NORM:
2343 case PIPE_FORMAT_R5SG5SB6U_NORM:
2344 min_clamp = lp_build_const_aos(gallivm, vec4_type, -1.0F, -1.0F,
2345 0.0F, 0.0F, NULL);
2346 max_clamp = vec4_bld.one;
2347 break;
2348 default:
2349 break;
2350 }
2351 }
2352 else {
2353 /* cannot figure this out from format description */
2354 if (format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
2355 /* s3tc formats are always unorm */
2356 min_clamp = vec4_bld.zero;
2357 max_clamp = vec4_bld.one;
2358 }
2359 else if (format_desc->layout == UTIL_FORMAT_LAYOUT_RGTC ||
2360 format_desc->layout == UTIL_FORMAT_LAYOUT_ETC) {
2361 switch (format_desc->format) {
2362 case PIPE_FORMAT_RGTC1_UNORM:
2363 case PIPE_FORMAT_RGTC2_UNORM:
2364 case PIPE_FORMAT_LATC1_UNORM:
2365 case PIPE_FORMAT_LATC2_UNORM:
2366 case PIPE_FORMAT_ETC1_RGB8:
2367 min_clamp = vec4_bld.zero;
2368 max_clamp = vec4_bld.one;
2369 break;
2370 case PIPE_FORMAT_RGTC1_SNORM:
2371 case PIPE_FORMAT_RGTC2_SNORM:
2372 case PIPE_FORMAT_LATC1_SNORM:
2373 case PIPE_FORMAT_LATC2_SNORM:
2374 min_clamp = lp_build_const_vec(gallivm, vec4_type, -1.0F);
2375 max_clamp = vec4_bld.one;
2376 break;
2377 default:
2378 assert(0);
2379 break;
2380 }
2381 }
2382 /*
2383 * all others from subsampled/other group, though we don't care
2384 * about yuv (and should not have any from zs here)
2385 */
2386 else if (format_desc->colorspace != UTIL_FORMAT_COLORSPACE_YUV){
2387 switch (format_desc->format) {
2388 case PIPE_FORMAT_R8G8_B8G8_UNORM:
2389 case PIPE_FORMAT_G8R8_G8B8_UNORM:
2390 case PIPE_FORMAT_G8R8_B8R8_UNORM:
2391 case PIPE_FORMAT_R8G8_R8B8_UNORM:
2392 case PIPE_FORMAT_R1_UNORM: /* doesn't make sense but ah well */
2393 min_clamp = vec4_bld.zero;
2394 max_clamp = vec4_bld.one;
2395 break;
2396 case PIPE_FORMAT_R8G8Bx_SNORM:
2397 min_clamp = lp_build_const_vec(gallivm, vec4_type, -1.0F);
2398 max_clamp = vec4_bld.one;
2399 break;
2400 /*
2401 * Note smallfloat formats usually don't need clamping
2402 * (they still have infinite range) however this is not
2403 * true for r11g11b10 and r9g9b9e5, which can't represent
2404 * negative numbers (and additionally r9g9b9e5 can't represent
2405 * very large numbers). d3d10 seems happy without clamping in
2406 * this case, but gl spec is pretty clear: "for floating
2407 * point and integer formats, border values are clamped to
2408 * the representable range of the format" so do that here.
2409 */
2410 case PIPE_FORMAT_R11G11B10_FLOAT:
2411 min_clamp = vec4_bld.zero;
2412 break;
2413 case PIPE_FORMAT_R9G9B9E5_FLOAT:
2414 min_clamp = vec4_bld.zero;
2415 max_clamp = lp_build_const_vec(gallivm, vec4_type, MAX_RGB9E5);
2416 break;
2417 default:
2418 assert(0);
2419 break;
2420 }
2421 }
2422 }
2423
2424 if (min_clamp) {
2425 border_color = lp_build_max(&vec4_bld, border_color, min_clamp);
2426 }
2427 if (max_clamp) {
2428 border_color = lp_build_min(&vec4_bld, border_color, max_clamp);
2429 }
2430
2431 bld->border_color_clamped = border_color;
2432 }
2433
2434
2435 /**
2436 * General texture sampling codegen.
2437 * This function handles texture sampling for all texture targets (1D,
2438 * 2D, 3D, cube) and all filtering modes.
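 *
 * Filter selection sketch (illustrative only):
 *
 *    if (min_filter == mag_filter)
 *       sample_mipmap(min_filter, mip_filter);
 *    else if (num_lods == 1)
 *       lod_positive ? sample_mipmap(min_filter, mip_filter)
 *                    : sample_mipmap(mag_filter, MIPFILTER_NONE);
 *    else if (any lane needs linear)
 *       sample_mipmap_both(linear_mask, mip_filter);
 *    else
 *       sample_mipmap(NEAREST, mip_filter_for_nearest);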
2439 */
2440 static void
2441 lp_build_sample_general(struct lp_build_sample_context *bld,
2442 unsigned sampler_unit,
2443 boolean is_gather,
2444 const LLVMValueRef *coords,
2445 const LLVMValueRef *offsets,
2446 LLVMValueRef lod_positive,
2447 LLVMValueRef lod_fpart,
2448 LLVMValueRef ilevel0,
2449 LLVMValueRef ilevel1,
2450 LLVMValueRef *colors_out)
2451 {
2452 LLVMBuilderRef builder = bld->gallivm->builder;
2453 const struct lp_static_sampler_state *sampler_state = bld->static_sampler_state;
2454 const unsigned mip_filter = sampler_state->min_mip_filter;
2455 const unsigned min_filter = sampler_state->min_img_filter;
2456 const unsigned mag_filter = sampler_state->mag_img_filter;
2457 LLVMValueRef texels[4];
2458 unsigned chan;
2459
2460 /* if we need border color, (potentially) clamp it now */
2461 if (lp_sampler_wrap_mode_uses_border_color(sampler_state->wrap_s,
2462 min_filter,
2463 mag_filter) ||
2464 (bld->dims > 1 &&
2465 lp_sampler_wrap_mode_uses_border_color(sampler_state->wrap_t,
2466 min_filter,
2467 mag_filter)) ||
2468 (bld->dims > 2 &&
2469 lp_sampler_wrap_mode_uses_border_color(sampler_state->wrap_r,
2470 min_filter,
2471 mag_filter))) {
2472 lp_build_clamp_border_color(bld, sampler_unit);
2473 }
2474
2475
2476 /*
2477 * Get/interpolate texture colors.
2478 */
2479
2480 for (chan = 0; chan < 4; ++chan) {
2481 texels[chan] = lp_build_alloca(bld->gallivm, bld->texel_bld.vec_type, "");
2482 lp_build_name(texels[chan], "sampler%u_texel_%c_var", sampler_unit, "xyzw"[chan]);
2483 }
2484
2485 if (min_filter == mag_filter) {
2486 /* no need to distinguish between minification and magnification */
2487 lp_build_sample_mipmap(bld, min_filter, mip_filter,
2488 is_gather,
2489 coords, offsets,
2490 ilevel0, ilevel1, lod_fpart,
2491 texels);
2492 }
2493 else {
2494 /*
2495 * Could also get rid of the if-logic and always use mipmap_both, for
2496 * both the single-lod and multi-lod case, if nothing really uses this.
2497 */
2498 if (bld->num_lods == 1) {
2499 /* Emit conditional to choose min image filter or mag image filter
2500 * depending on the lod being > 0 or <= 0, respectively.
2501 */
2502 struct lp_build_if_state if_ctx;
2503
2504 lod_positive = LLVMBuildTrunc(builder, lod_positive,
2505 LLVMInt1TypeInContext(bld->gallivm->context),
2506 "lod_pos");
2507
2508 lp_build_if(&if_ctx, bld->gallivm, lod_positive);
2509 {
2510 /* Use the minification filter */
2511 lp_build_sample_mipmap(bld, min_filter, mip_filter, FALSE,
2512 coords, offsets,
2513 ilevel0, ilevel1, lod_fpart,
2514 texels);
2515 }
2516 lp_build_else(&if_ctx);
2517 {
2518 /* Use the magnification filter */
2519 lp_build_sample_mipmap(bld, mag_filter, PIPE_TEX_MIPFILTER_NONE,
2520 FALSE,
2521 coords, offsets,
2522 ilevel0, NULL, NULL,
2523 texels);
2524 }
2525 lp_build_endif(&if_ctx);
2526 }
2527 else {
2528 LLVMValueRef need_linear, linear_mask;
2529 unsigned mip_filter_for_nearest;
2530 struct lp_build_if_state if_ctx;
2531
2532 if (min_filter == PIPE_TEX_FILTER_LINEAR) {
2533 linear_mask = lod_positive;
2534 mip_filter_for_nearest = PIPE_TEX_MIPFILTER_NONE;
2535 }
2536 else {
2537 linear_mask = lp_build_not(&bld->lodi_bld, lod_positive);
2538 mip_filter_for_nearest = mip_filter;
2539 }
2540 need_linear = lp_build_any_true_range(&bld->lodi_bld, bld->num_lods,
2541 linear_mask);
2542 lp_build_name(need_linear, "need_linear");
2543
2544 if (bld->num_lods != bld->coord_type.length) {
2545 linear_mask = lp_build_unpack_broadcast_aos_scalars(bld->gallivm,
2546 bld->lodi_type,
2547 bld->int_coord_type,
2548 linear_mask);
2549 }
2550
2551 lp_build_if(&if_ctx, bld->gallivm, need_linear);
2552 {
2553 /*
2554 * Do sampling with both filters simultaneously. This means using
2555 * a linear filter and doing some tricks (with weights) for the pixels
2556 * which need nearest filter.
2557 * Note that it's probably rare that some pixels need nearest and some
2558 * linear filtering, but the fixups required for the nearest pixels
2559 * aren't all that complicated, so just always run the combined path
2560 * if at least some pixels require linear.
2561 */
2562 lp_build_sample_mipmap_both(bld, linear_mask, mip_filter,
2563 coords, offsets,
2564 ilevel0, ilevel1,
2565 lod_fpart, lod_positive,
2566 texels);
2567 }
2568 lp_build_else(&if_ctx);
2569 {
2570 /*
2571 * All pixels require just nearest filtering, which is way
2572 * cheaper than linear, hence do a separate path for that.
2573 */
2574 lp_build_sample_mipmap(bld, PIPE_TEX_FILTER_NEAREST,
2575 mip_filter_for_nearest, FALSE,
2576 coords, offsets,
2577 ilevel0, ilevel1, lod_fpart,
2578 texels);
2579 }
2580 lp_build_endif(&if_ctx);
2581 }
2582 }
2583
2584 for (chan = 0; chan < 4; ++chan) {
2585 colors_out[chan] = LLVMBuildLoad(builder, texels[chan], "");
2586 lp_build_name(colors_out[chan], "sampler%u_texel_%c", sampler_unit, "xyzw"[chan]);
2587 }
2588 }
2589
2590
2591 /**
2592 * Texel fetch function.
2593 * In contrast to general sampling there is no filtering or coord minification;
2594 * lod (if any) is always an explicit uint, and coords are uints (in texel
2595 * units) applied directly to the selected mip level (after adding texel offsets).
2596 * This function handles texel fetch for all targets where texel fetch is supported
2597 * (no cube maps, but 1d, 2d, 3d are supported, arrays and buffers should be too).
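 *
 * Out-of-bounds handling sketch (illustrative): the fetch itself is
 * unconditional; offsets of out-of-bounds lanes are zeroed first (a
 * dummy read of texel 0) and the result is zeroed afterwards:
 *
 *    offset &= ~oob_mask;
 *    texel = fetch(base_ptr, offset);
 *    texel = oob_mask ? 0 : texel;   // d3d10/robustness behavior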
2598 */
2599 static void
2600 lp_build_fetch_texel(struct lp_build_sample_context *bld,
2601 unsigned texture_unit,
2602 const LLVMValueRef *coords,
2603 LLVMValueRef explicit_lod,
2604 const LLVMValueRef *offsets,
2605 LLVMValueRef *colors_out)
2606 {
2607 struct lp_build_context *perquadi_bld = &bld->lodi_bld;
2608 struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
2609 unsigned dims = bld->dims, chan;
2610 unsigned target = bld->static_texture_state->target;
2611 boolean out_of_bound_ret_zero = TRUE;
2612 LLVMValueRef size, ilevel;
2613 LLVMValueRef row_stride_vec = NULL, img_stride_vec = NULL;
2614 LLVMValueRef x = coords[0], y = coords[1], z = coords[2];
2615 LLVMValueRef width, height, depth, i, j;
2616 LLVMValueRef offset, out_of_bounds, out1;
2617
2618 out_of_bounds = int_coord_bld->zero;
2619
2620 if (explicit_lod && bld->static_texture_state->target != PIPE_BUFFER) {
2621 if (bld->num_mips != int_coord_bld->type.length) {
2622 ilevel = lp_build_pack_aos_scalars(bld->gallivm, int_coord_bld->type,
2623 perquadi_bld->type, explicit_lod, 0);
2624 }
2625 else {
2626 ilevel = explicit_lod;
2627 }
2628 lp_build_nearest_mip_level(bld, texture_unit, ilevel, &ilevel,
2629 out_of_bound_ret_zero ? &out_of_bounds : NULL);
2630 }
2631 else {
2632 assert(bld->num_mips == 1);
2633 if (bld->static_texture_state->target != PIPE_BUFFER) {
2634 ilevel = bld->dynamic_state->first_level(bld->dynamic_state, bld->gallivm,
2635 bld->context_ptr, texture_unit);
2636 }
2637 else {
2638 ilevel = lp_build_const_int32(bld->gallivm, 0);
2639 }
2640 }
2641 lp_build_mipmap_level_sizes(bld, ilevel,
2642 &size,
2643 &row_stride_vec, &img_stride_vec);
2644 lp_build_extract_image_sizes(bld, &bld->int_size_bld, int_coord_bld->type,
2645 size, &width, &height, &depth);
2646
2647 if (target == PIPE_TEXTURE_1D_ARRAY ||
2648 target == PIPE_TEXTURE_2D_ARRAY) {
2649 if (out_of_bound_ret_zero) {
2650 z = lp_build_layer_coord(bld, texture_unit, FALSE, z, &out1);
2651 out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
2652 }
2653 else {
2654 z = lp_build_layer_coord(bld, texture_unit, FALSE, z, NULL);
2655 }
2656 }
2657
2658 /* This is a lot like border sampling */
2659 if (offsets[0]) {
2660 /*
2661 * coords are really unsigned, offsets are signed, but I don't think
2662 * exceeding 31 bits is possible
2663 */
2664 x = lp_build_add(int_coord_bld, x, offsets[0]);
2665 }
2666 out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, x, int_coord_bld->zero);
2667 out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
2668 out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, x, width);
2669 out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
2670
2671 if (dims >= 2) {
2672 if (offsets[1]) {
2673 y = lp_build_add(int_coord_bld, y, offsets[1]);
2674 }
2675 out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, y, int_coord_bld->zero);
2676 out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
2677 out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, y, height);
2678 out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
2679
2680 if (dims >= 3) {
2681 if (offsets[2]) {
2682 z = lp_build_add(int_coord_bld, z, offsets[2]);
2683 }
2684 out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, z, int_coord_bld->zero);
2685 out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
2686 out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, z, depth);
2687 out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
2688 }
2689 }
2690
2691 lp_build_sample_offset(int_coord_bld,
2692 bld->format_desc,
2693 x, y, z, row_stride_vec, img_stride_vec,
2694 &offset, &i, &j);
2695
2696 if (bld->static_texture_state->target != PIPE_BUFFER) {
2697 offset = lp_build_add(int_coord_bld, offset,
2698 lp_build_get_mip_offsets(bld, ilevel));
2699 }
2700
2701 offset = lp_build_andnot(int_coord_bld, offset, out_of_bounds);
2702
2703 lp_build_fetch_rgba_soa(bld->gallivm,
2704 bld->format_desc,
2705 bld->texel_type, TRUE,
2706 bld->base_ptr, offset,
2707 i, j,
2708 bld->cache,
2709 colors_out);
2710
2711 if (out_of_bound_ret_zero) {
2712 /*
2713 * Only needed for ARB_robust_buffer_access_behavior and d3d10.
2714 * Could use min/max above instead of out-of-bounds comparisons
2715 * if we don't care about the result returned for out-of-bounds.
2716 */
2717 for (chan = 0; chan < 4; chan++) {
2718 colors_out[chan] = lp_build_select(&bld->texel_bld, out_of_bounds,
2719 bld->texel_bld.zero, colors_out[chan]);
2720 }
2721 }
2722 }
2723
2724
2725 /**
2726 * Just set texels to white instead of actually sampling the texture.
2727 * For debugging.
2728 */
2729 void
2730 lp_build_sample_nop(struct gallivm_state *gallivm,
2731 struct lp_type type,
2732 const LLVMValueRef *coords,
2733 LLVMValueRef texel_out[4])
2734 {
2735 LLVMValueRef one = lp_build_one(gallivm, type);
2736 unsigned chan;
2737
2738 for (chan = 0; chan < 4; chan++) {
2739 texel_out[chan] = one;
2740 }
2741 }
2742
2743
2744 /**
2745 * Build the actual texture sampling code.
2746 * 'texel' will return a vector of four LLVMValueRefs corresponding to
2747 * R, G, B, A.
2748 * \param type vector float type to use for coords, etc.
2749 * \param sample_key
2750 * \param derivs partial derivatives of (s,t,r,q) with respect to x and y
2751 */
2752 static void
2753 lp_build_sample_soa_code(struct gallivm_state *gallivm,
2754 const struct lp_static_texture_state *static_texture_state,
2755 const struct lp_static_sampler_state *static_sampler_state,
2756 struct lp_sampler_dynamic_state *dynamic_state,
2757 struct lp_type type,
2758 unsigned sample_key,
2759 unsigned texture_index,
2760 unsigned sampler_index,
2761 LLVMValueRef context_ptr,
2762 LLVMValueRef thread_data_ptr,
2763 const LLVMValueRef *coords,
2764 const LLVMValueRef *offsets,
2765 const struct lp_derivatives *derivs, /* optional */
2766 LLVMValueRef lod, /* optional */
2767 LLVMValueRef texel_out[4])
2768 {
2769 unsigned target = static_texture_state->target;
2770 unsigned dims = texture_dims(target);
2771 unsigned num_quads = type.length / 4;
2772 unsigned mip_filter, min_img_filter, mag_img_filter, i;
2773 struct lp_build_sample_context bld;
2774 struct lp_static_sampler_state derived_sampler_state = *static_sampler_state;
2775 LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
2776 LLVMBuilderRef builder = gallivm->builder;
2777 LLVMValueRef tex_width, newcoords[5];
2778 enum lp_sampler_lod_property lod_property;
2779 enum lp_sampler_lod_control lod_control;
2780 enum lp_sampler_op_type op_type;
2781 LLVMValueRef lod_bias = NULL;
2782 LLVMValueRef explicit_lod = NULL;
2783 boolean op_is_tex, op_is_lodq, op_is_gather;
2784
2785 if (0) {
2786 enum pipe_format fmt = static_texture_state->format;
2787 debug_printf("Sample from %s\n", util_format_name(fmt));
2788 }
2789
2790 lod_property = (sample_key & LP_SAMPLER_LOD_PROPERTY_MASK) >>
2791 LP_SAMPLER_LOD_PROPERTY_SHIFT;
2792 lod_control = (sample_key & LP_SAMPLER_LOD_CONTROL_MASK) >>
2793 LP_SAMPLER_LOD_CONTROL_SHIFT;
2794 op_type = (sample_key & LP_SAMPLER_OP_TYPE_MASK) >>
2795 LP_SAMPLER_OP_TYPE_SHIFT;
2796
2797 op_is_tex = op_type == LP_SAMPLER_OP_TEXTURE;
2798 op_is_lodq = op_type == LP_SAMPLER_OP_LODQ;
2799 op_is_gather = op_type == LP_SAMPLER_OP_GATHER;
2800
2801 if (lod_control == LP_SAMPLER_LOD_BIAS) {
2802 lod_bias = lod;
2803 assert(lod);
2804 assert(derivs == NULL);
2805 }
2806 else if (lod_control == LP_SAMPLER_LOD_EXPLICIT) {
2807 explicit_lod = lod;
2808 assert(lod);
2809 assert(derivs == NULL);
2810 }
2811 else if (lod_control == LP_SAMPLER_LOD_DERIVATIVES) {
2812 assert(derivs);
2813 assert(lod == NULL);
2814 }
2815 else {
2816 assert(derivs == NULL);
2817 assert(lod == NULL);
2818 }
2819
2820 if (static_texture_state->format == PIPE_FORMAT_NONE) {
2821 /*
2822 * If there's nothing bound, format is NONE, and we must return
2823 * all zero as mandated by d3d10 in this case.
2824 */
2825 unsigned chan;
2826 LLVMValueRef zero = lp_build_zero(gallivm, type);
2827 for (chan = 0; chan < 4; chan++) {
2828 texel_out[chan] = zero;
2829 }
2830 return;
2831 }
2832
2833 assert(type.floating);
2834
2835 /* Setup our build context */
2836 memset(&bld, 0, sizeof bld);
2837 bld.gallivm = gallivm;
2838 bld.context_ptr = context_ptr;
2839 bld.static_sampler_state = &derived_sampler_state;
2840 bld.static_texture_state = static_texture_state;
2841 bld.dynamic_state = dynamic_state;
2842 bld.format_desc = util_format_description(static_texture_state->format);
2843 bld.dims = dims;
2844
2845 if (gallivm_perf & GALLIVM_PERF_NO_QUAD_LOD || op_is_lodq) {
2846 bld.no_quad_lod = TRUE;
2847 }
2848 if (gallivm_perf & GALLIVM_PERF_NO_RHO_APPROX || op_is_lodq) {
2849 bld.no_rho_approx = TRUE;
2850 }
2851 if (gallivm_perf & GALLIVM_PERF_NO_BRILINEAR || op_is_lodq) {
2852 bld.no_brilinear = TRUE;
2853 }
2854
2855 bld.vector_width = lp_type_width(type);
2856
2857 bld.float_type = lp_type_float(32);
2858 bld.int_type = lp_type_int(32);
2859 bld.coord_type = type;
2860 bld.int_coord_type = lp_int_type(type);
2861 bld.float_size_in_type = lp_type_float(32);
2862 bld.float_size_in_type.length = dims > 1 ? 4 : 1;
2863 bld.int_size_in_type = lp_int_type(bld.float_size_in_type);
2864 bld.texel_type = type;
2865
2866 /* Always using the first channel should hopefully be safe;
2867 * if not, things WILL break in other places anyway.
2868 */
2869 if (bld.format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB &&
2870 bld.format_desc->channel[0].pure_integer) {
2871 if (bld.format_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) {
2872 bld.texel_type = lp_type_int_vec(type.width, type.width * type.length);
2873 }
2874 else if (bld.format_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED) {
2875 bld.texel_type = lp_type_uint_vec(type.width, type.width * type.length);
2876 }
2877 }
2878 else if (util_format_has_stencil(bld.format_desc) &&
2879 !util_format_has_depth(bld.format_desc)) {
2880 /* for stencil only formats, sample stencil (uint) */
2881 bld.texel_type = lp_type_int_vec(type.width, type.width * type.length);
2882 }
2883
2884 if (!static_texture_state->level_zero_only ||
2885 !static_sampler_state->max_lod_pos || op_is_lodq) {
2886 derived_sampler_state.min_mip_filter = static_sampler_state->min_mip_filter;
2887 } else {
2888 derived_sampler_state.min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
2889 }
2890 if (op_is_gather) {
2891 /*
2892 * gather4 is exactly like GL_LINEAR filtering but in the end skipping
2893 * the actual filtering. Using mostly the same paths, so cube face
2894 * selection, coord wrapping etc. all naturally uses the same code.
2895 */
2896 derived_sampler_state.min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
2897 derived_sampler_state.min_img_filter = PIPE_TEX_FILTER_LINEAR;
2898 derived_sampler_state.mag_img_filter = PIPE_TEX_FILTER_LINEAR;
2899 }
2900 mip_filter = derived_sampler_state.min_mip_filter;
2901
2902 if (0) {
2903 debug_printf(" .min_mip_filter = %u\n", derived_sampler_state.min_mip_filter);
2904 }
2905
2906 if (static_texture_state->target == PIPE_TEXTURE_CUBE ||
2907 static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY)
2908 {
2909 /*
2910 * Seamless filtering ignores wrap modes.
2911 * Setting to CLAMP_TO_EDGE is correct for nearest filtering; for
2912 * bilinear it's not correct, but way better than using e.g. repeat.
2913 * Note we even set this for non-seamless. Technically GL allows any wrap
2914 * mode, which made sense when true borders were supported (a seamless
2915 * effect can be had with border and CLAMP_TO_BORDER), but gallium doesn't
2916 * support borders, d3d9 requires wrap modes to be ignored, and it's a
2917 * pain to fix up the sampler state (as it makes it texture dependent).
2918 */
2919 derived_sampler_state.wrap_s = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
2920 derived_sampler_state.wrap_t = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
2921 }
2922 /*
2923 * We could force CLAMP to CLAMP_TO_EDGE here if min/mag filter is nearest,
2924 * so AoS path could be used. Not sure it's worth the trouble...
2925 */
2926
2927 min_img_filter = derived_sampler_state.min_img_filter;
2928 mag_img_filter = derived_sampler_state.mag_img_filter;
2929
2930
2931 /*
2932 * This is all a bit complicated; different paths are chosen for
2933 * performance reasons.
2934 * Essentially, there can be 1 lod per element, 1 lod per quad, or 1 lod
2935 * for everything (the last two options are equivalent in the 4-wide case).
2936 * If there's per-quad lod but we split to 4-wide so we can use AoS, the
2937 * per-quad lod is calculated and the lod value extracted afterwards, making
2938 * this case, as far as lod handling in the further sample/filter code is
2939 * concerned, basically the same as the 1-lod-for-everything case.
2940 * Different lod handling mostly shows up when building mipmap sizes
2941 * (lp_build_mipmap_level_sizes() and friends) and also in filtering
2942 * (getting the fractional part of the lod to the right texels).
2943 */
2944
2945 /*
2946 * There are other situations where at least the multiple int lods could be
2947 * avoided, like min and max lod being equal.
2948 */
2949 bld.num_mips = bld.num_lods = 1;
2950
2951 if (bld.no_quad_lod && bld.no_rho_approx &&
2952 ((mip_filter != PIPE_TEX_MIPFILTER_NONE && op_is_tex &&
2953 (static_texture_state->target == PIPE_TEXTURE_CUBE ||
2954 static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY)) ||
2955 op_is_lodq)) {
2956 /*
2957 * special case for using per-pixel lod even for implicit lod,
2958 * which is generally never required (ok by APIs) except to please
2959 * some (somewhat broken imho) tests (because per-pixel face selection
2960 * can cause derivatives to be different for pixels outside the primitive
2961 * due to the major axis division even if pre-project derivatives are
2962 * looking normal).
2963 * For lodq, we do it simply to avoid scalar pack / unpack (albeit for
2964 * cube maps we do indeed get per-pixel lod values).
2965 */
2966 bld.num_mips = type.length;
2967 bld.num_lods = type.length;
2968 }
2969 else if (lod_property == LP_SAMPLER_LOD_PER_ELEMENT ||
2970 (explicit_lod || lod_bias || derivs)) {
2971 if ((!op_is_tex && target != PIPE_BUFFER) ||
2972 (op_is_tex && mip_filter != PIPE_TEX_MIPFILTER_NONE)) {
2973 bld.num_mips = type.length;
2974 bld.num_lods = type.length;
2975 }
2976 else if (op_is_tex && min_img_filter != mag_img_filter) {
2977 bld.num_mips = 1;
2978 bld.num_lods = type.length;
2979 }
2980 }
2981 /* TODO: for true scalar_lod should only use 1 lod value */
2982 else if ((!op_is_tex && explicit_lod && target != PIPE_BUFFER) ||
2983 (op_is_tex && mip_filter != PIPE_TEX_MIPFILTER_NONE)) {
2984 bld.num_mips = num_quads;
2985 bld.num_lods = num_quads;
2986 }
2987 else if (op_is_tex && min_img_filter != mag_img_filter) {
2988 bld.num_mips = 1;
2989 bld.num_lods = num_quads;
2990 }
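   /*
    * Summary of the choices above (illustrative):
    *    per-element lod needed:     num_mips = num_lods = type.length
    *    per-quad lod needed:        num_mips = num_lods = num_quads
    *    only min/mag filter differ: num_mips = 1, num_lods per element/quad
    *    otherwise:                  num_mips = num_lods = 1
    */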
2991
2992 if (op_is_gather)
2993 bld.gather_comp = (sample_key & LP_SAMPLER_GATHER_COMP_MASK) >> LP_SAMPLER_GATHER_COMP_SHIFT;
2994 bld.lodf_type = type;
2995 /* we want native vector size to be able to use our intrinsics */
2996 if (bld.num_lods != type.length) {
2997 /* TODO: this currently always has to be per-quad or per-element */
2998 bld.lodf_type.length = type.length > 4 ? ((type.length + 15) / 16) * 4 : 1;
2999 }
3000 bld.lodi_type = lp_int_type(bld.lodf_type);
3001 bld.levelf_type = bld.lodf_type;
3002 if (bld.num_mips == 1) {
3003 bld.levelf_type.length = 1;
3004 }
3005 bld.leveli_type = lp_int_type(bld.levelf_type);
3006 bld.float_size_type = bld.float_size_in_type;
3007 /* Note: size vectors may not be native. They contain minified w/h/d/_ values,
3008 * with per-element lod that is w0/h0/d0/_/w1/h1/d1/_/... so up to 8x4f32 */
3009 if (bld.num_mips > 1) {
3010 bld.float_size_type.length = bld.num_mips == type.length ?
3011 bld.num_mips * bld.float_size_in_type.length :
3012 type.length;
3013 }
3014 bld.int_size_type = lp_int_type(bld.float_size_type);
3015
3016 lp_build_context_init(&bld.float_bld, gallivm, bld.float_type);
3017 lp_build_context_init(&bld.float_vec_bld, gallivm, type);
3018 lp_build_context_init(&bld.int_bld, gallivm, bld.int_type);
3019 lp_build_context_init(&bld.coord_bld, gallivm, bld.coord_type);
3020 lp_build_context_init(&bld.int_coord_bld, gallivm, bld.int_coord_type);
3021 lp_build_context_init(&bld.int_size_in_bld, gallivm, bld.int_size_in_type);
3022 lp_build_context_init(&bld.float_size_in_bld, gallivm, bld.float_size_in_type);
3023 lp_build_context_init(&bld.int_size_bld, gallivm, bld.int_size_type);
3024 lp_build_context_init(&bld.float_size_bld, gallivm, bld.float_size_type);
3025 lp_build_context_init(&bld.texel_bld, gallivm, bld.texel_type);
3026 lp_build_context_init(&bld.levelf_bld, gallivm, bld.levelf_type);
3027 lp_build_context_init(&bld.leveli_bld, gallivm, bld.leveli_type);
3028 lp_build_context_init(&bld.lodf_bld, gallivm, bld.lodf_type);
3029 lp_build_context_init(&bld.lodi_bld, gallivm, bld.lodi_type);
3030
3031 /* Get the dynamic state */
3032 tex_width = dynamic_state->width(dynamic_state, gallivm,
3033 context_ptr, texture_index);
3034 bld.row_stride_array = dynamic_state->row_stride(dynamic_state, gallivm,
3035 context_ptr, texture_index);
3036 bld.img_stride_array = dynamic_state->img_stride(dynamic_state, gallivm,
3037 context_ptr, texture_index);
3038 bld.base_ptr = dynamic_state->base_ptr(dynamic_state, gallivm,
3039 context_ptr, texture_index);
3040 bld.mip_offsets = dynamic_state->mip_offsets(dynamic_state, gallivm,
3041 context_ptr, texture_index);
3042 /* Note that mip_offsets is an array[level] of offsets to texture images */
3043
3044 if (dynamic_state->cache_ptr && thread_data_ptr) {
3045 bld.cache = dynamic_state->cache_ptr(dynamic_state, gallivm,
3046 thread_data_ptr, texture_index);
3047 }
3048
3049 /* width, height, depth as single int vector */
3050 if (dims <= 1) {
3051 bld.int_size = tex_width;
3052 }
3053 else {
3054 bld.int_size = LLVMBuildInsertElement(builder, bld.int_size_in_bld.undef,
3055 tex_width,
3056 LLVMConstInt(i32t, 0, 0), "");
3057 if (dims >= 2) {
3058 LLVMValueRef tex_height =
3059 dynamic_state->height(dynamic_state, gallivm,
3060 context_ptr, texture_index);
3061 bld.int_size = LLVMBuildInsertElement(builder, bld.int_size,
3062 tex_height,
3063 LLVMConstInt(i32t, 1, 0), "");
3064 if (dims >= 3) {
3065 LLVMValueRef tex_depth =
3066 dynamic_state->depth(dynamic_state, gallivm, context_ptr,
3067 texture_index);
3068 bld.int_size = LLVMBuildInsertElement(builder, bld.int_size,
3069 tex_depth,
3070 LLVMConstInt(i32t, 2, 0), "");
3071 }
3072 }
3073 }
3074
3075 for (i = 0; i < 5; i++) {
3076 newcoords[i] = coords[i];
3077 }
3078
3079 if (util_format_is_pure_integer(static_texture_state->format) &&
3080 !util_format_has_depth(bld.format_desc) && op_is_tex &&
3081 (static_sampler_state->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR ||
3082 static_sampler_state->min_img_filter == PIPE_TEX_FILTER_LINEAR ||
3083 static_sampler_state->mag_img_filter == PIPE_TEX_FILTER_LINEAR)) {
3084 /*
3085 * Bail if impossible filtering is specified (the awkward additional
3086 * depth check is there because it is legal in gallium to have formats
3087 * like S8Z24 here, which claim to be pure int even though such formats
3088 * should sample the depth component).
3089 * In GL such filters make the texture incomplete; this makes us robust
3090 * against state trackers which set this up regardless (we'd crash in the
3091 * lerp later otherwise).
3092 * At least in some APIs it may be legal to use such filters with lod
3093 * queries and/or gather (at least for gather, d3d10 says only the wrap
3094 * bits are really used, hence the filter bits are likely simply ignored).
3095 * For fetch, we don't get valid samplers either way here.
3096 */
3097 unsigned chan;
3098 LLVMValueRef zero = lp_build_zero(gallivm, type);
3099 for (chan = 0; chan < 4; chan++) {
3100 texel_out[chan] = zero;
3101 }
3102 return;
3103 }
3104
3105 if (0) {
3106 /* For debug: no-op texture sampling */
3107 lp_build_sample_nop(gallivm,
3108 bld.texel_type,
3109 newcoords,
3110 texel_out);
3111 }
3112
3113 else if (op_type == LP_SAMPLER_OP_FETCH) {
3114 lp_build_fetch_texel(&bld, texture_index, newcoords,
3115 lod, offsets,
3116 texel_out);
3117 }
3118
3119 else {
3120 LLVMValueRef lod_fpart = NULL, lod_positive = NULL;
3121 LLVMValueRef ilevel0 = NULL, ilevel1 = NULL, lod = NULL;
3122 boolean use_aos;
3123
3124 use_aos = util_format_fits_8unorm(bld.format_desc) &&
3125 op_is_tex &&
3126 /* not sure this is strictly needed or simply impossible */
3127 derived_sampler_state.compare_mode == PIPE_TEX_COMPARE_NONE &&
3128 lp_is_simple_wrap_mode(derived_sampler_state.wrap_s);
3129
3130 use_aos &= bld.num_lods <= num_quads ||
3131 derived_sampler_state.min_img_filter ==
3132 derived_sampler_state.mag_img_filter;
3133
3134 if (gallivm_perf & GALLIVM_PERF_NO_AOS_SAMPLING) {
3135 use_aos = 0;
3136 }
3137
3138 if (dims > 1) {
3139 use_aos &= lp_is_simple_wrap_mode(derived_sampler_state.wrap_t);
3140 if (dims > 2) {
3141 use_aos &= lp_is_simple_wrap_mode(derived_sampler_state.wrap_r);
3142 }
3143 }
3144 if ((static_texture_state->target == PIPE_TEXTURE_CUBE ||
3145 static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) &&
3146 derived_sampler_state.seamless_cube_map &&
3147 (derived_sampler_state.min_img_filter == PIPE_TEX_FILTER_LINEAR ||
3148 derived_sampler_state.mag_img_filter == PIPE_TEX_FILTER_LINEAR)) {
3149 /* theoretically possible with AoS filtering but not implemented (complex!) */
3150 use_aos = 0;
3151 }
3152
3153 if ((gallivm_debug & GALLIVM_DEBUG_PERF) &&
3154 !use_aos && util_format_fits_8unorm(bld.format_desc)) {
3155 debug_printf("%s: using floating point linear filtering for %s\n",
3156 __FUNCTION__, bld.format_desc->short_name);
3157 debug_printf(" min_img %d mag_img %d mip %d target %d seamless %d"
3158 " wraps %d wrapt %d wrapr %d\n",
3159 derived_sampler_state.min_img_filter,
3160 derived_sampler_state.mag_img_filter,
3161 derived_sampler_state.min_mip_filter,
3162 static_texture_state->target,
3163 derived_sampler_state.seamless_cube_map,
3164 derived_sampler_state.wrap_s,
3165 derived_sampler_state.wrap_t,
3166 derived_sampler_state.wrap_r);
3167 }
3168
3169 lp_build_sample_common(&bld, op_is_lodq, texture_index, sampler_index,
3170 newcoords,
3171 derivs, lod_bias, explicit_lod,
3172 &lod_positive, &lod, &lod_fpart,
3173 &ilevel0, &ilevel1);
3174
3175 if (op_is_lodq) {
3176 texel_out[0] = lod_fpart;
3177 texel_out[1] = lod;
3178 texel_out[2] = texel_out[3] = bld.coord_bld.zero;
3179 return;
3180 }
3181
3182 if (use_aos && static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) {
3183 /* The aos path doesn't do seamless filtering so simply add cube layer
3184 * to face now.
3185 */
3186 newcoords[2] = lp_build_add(&bld.int_coord_bld, newcoords[2], newcoords[3]);
3187 }
3188
3189 /*
3190 * We only try 8-wide sampling with soa, or with aos if we have AVX2
3191 * (8-wide aos sampling appears to be a loss with just AVX).
3192 */
3193 if (num_quads == 1 || !use_aos ||
3194 (util_cpu_caps.has_avx2 &&
3195 (bld.num_lods == 1 ||
3196 derived_sampler_state.min_img_filter == derived_sampler_state.mag_img_filter))) {
3197 if (use_aos) {
3198 /* do sampling/filtering with fixed pt arithmetic */
3199 lp_build_sample_aos(&bld, sampler_index,
3200 newcoords[0], newcoords[1],
3201 newcoords[2],
3202 offsets, lod_positive, lod_fpart,
3203 ilevel0, ilevel1,
3204 texel_out);
3205 }
3206
3207 else {
3208 lp_build_sample_general(&bld, sampler_index,
3209 op_type == LP_SAMPLER_OP_GATHER,
3210 newcoords, offsets,
3211 lod_positive, lod_fpart,
3212 ilevel0, ilevel1,
3213 texel_out);
3214 }
3215 }
3216 else {
3217 unsigned j;
3218 struct lp_build_sample_context bld4;
3219 struct lp_type type4 = type;
3220 unsigned i;
3221 LLVMValueRef texelout4[4];
3222 LLVMValueRef texelouttmp[4][LP_MAX_VECTOR_LENGTH/16];
3223
3224 type4.length = 4;
3225
3226 /* Setup our build context */
3227 memset(&bld4, 0, sizeof bld4);
3228 bld4.no_quad_lod = bld.no_quad_lod;
3229 bld4.no_rho_approx = bld.no_rho_approx;
3230 bld4.no_brilinear = bld.no_brilinear;
3231 bld4.gallivm = bld.gallivm;
3232 bld4.context_ptr = bld.context_ptr;
3233 bld4.static_texture_state = bld.static_texture_state;
3234 bld4.static_sampler_state = bld.static_sampler_state;
3235 bld4.dynamic_state = bld.dynamic_state;
3236 bld4.format_desc = bld.format_desc;
3237 bld4.dims = bld.dims;
3238 bld4.row_stride_array = bld.row_stride_array;
3239 bld4.img_stride_array = bld.img_stride_array;
3240 bld4.base_ptr = bld.base_ptr;
3241 bld4.mip_offsets = bld.mip_offsets;
3242 bld4.int_size = bld.int_size;
3243 bld4.cache = bld.cache;
3244
3245 bld4.vector_width = lp_type_width(type4);
3246
3247 bld4.float_type = lp_type_float(32);
3248 bld4.int_type = lp_type_int(32);
3249 bld4.coord_type = type4;
3250 bld4.int_coord_type = lp_int_type(type4);
3251 bld4.float_size_in_type = lp_type_float(32);
3252 bld4.float_size_in_type.length = dims > 1 ? 4 : 1;
3253 bld4.int_size_in_type = lp_int_type(bld4.float_size_in_type);
3254 bld4.texel_type = bld.texel_type;
3255 bld4.texel_type.length = 4;
3256
3257 bld4.num_mips = bld4.num_lods = 1;
3258 if (bld4.no_quad_lod && bld4.no_rho_approx &&
3259 (static_texture_state->target == PIPE_TEXTURE_CUBE ||
3260 static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) &&
3261 (op_is_tex && mip_filter != PIPE_TEX_MIPFILTER_NONE)) {
3262 bld4.num_mips = type4.length;
3263 bld4.num_lods = type4.length;
3264 }
3265 if (lod_property == LP_SAMPLER_LOD_PER_ELEMENT &&
3266 (explicit_lod || lod_bias || derivs)) {
3267 if ((!op_is_tex && target != PIPE_BUFFER) ||
3268 (op_is_tex && mip_filter != PIPE_TEX_MIPFILTER_NONE)) {
3269 bld4.num_mips = type4.length;
3270 bld4.num_lods = type4.length;
3271 }
3272 else if (op_is_tex && min_img_filter != mag_img_filter) {
3273 bld4.num_mips = 1;
3274 bld4.num_lods = type4.length;
3275 }
3276 }
3277
3278 /* we want native vector size to be able to use our intrinsics */
3279 bld4.lodf_type = type4;
3280 if (bld4.num_lods != type4.length) {
3281 bld4.lodf_type.length = 1;
3282 }
3283 bld4.lodi_type = lp_int_type(bld4.lodf_type);
3284 bld4.levelf_type = type4;
3285 if (bld4.num_mips != type4.length) {
3286 bld4.levelf_type.length = 1;
3287 }
3288 bld4.leveli_type = lp_int_type(bld4.levelf_type);
3289 bld4.float_size_type = bld4.float_size_in_type;
3290 if (bld4.num_mips > 1) {
3291 bld4.float_size_type.length = bld4.num_mips == type4.length ?
3292 bld4.num_mips * bld4.float_size_in_type.length :
3293 type4.length;
3294 }
3295 bld4.int_size_type = lp_int_type(bld4.float_size_type);
3296
3297 lp_build_context_init(&bld4.float_bld, gallivm, bld4.float_type);
3298 lp_build_context_init(&bld4.float_vec_bld, gallivm, type4);
3299 lp_build_context_init(&bld4.int_bld, gallivm, bld4.int_type);
3300 lp_build_context_init(&bld4.coord_bld, gallivm, bld4.coord_type);
3301 lp_build_context_init(&bld4.int_coord_bld, gallivm, bld4.int_coord_type);
3302 lp_build_context_init(&bld4.int_size_in_bld, gallivm, bld4.int_size_in_type);
3303 lp_build_context_init(&bld4.float_size_in_bld, gallivm, bld4.float_size_in_type);
3304 lp_build_context_init(&bld4.int_size_bld, gallivm, bld4.int_size_type);
3305 lp_build_context_init(&bld4.float_size_bld, gallivm, bld4.float_size_type);
3306 lp_build_context_init(&bld4.texel_bld, gallivm, bld4.texel_type);
3307 lp_build_context_init(&bld4.levelf_bld, gallivm, bld4.levelf_type);
3308 lp_build_context_init(&bld4.leveli_bld, gallivm, bld4.leveli_type);
3309 lp_build_context_init(&bld4.lodf_bld, gallivm, bld4.lodf_type);
3310 lp_build_context_init(&bld4.lodi_bld, gallivm, bld4.lodi_type);
3311
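/*
 * E.g. with an 8-wide shader vector num_quads is 2: the loop below runs
 * twice, extracting lanes 0..3 and 4..7 with lp_build_extract_range(),
 * sampling each quad with the 4-wide context, and stitching the
 * per-channel results back together with lp_build_concat() afterwards.
 */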
3312 for (i = 0; i < num_quads; i++) {
3313 LLVMValueRef s4, t4, r4;
3314 LLVMValueRef lod_positive4, lod_fpart4 = NULL;
3315 LLVMValueRef ilevel04, ilevel14 = NULL;
3316 LLVMValueRef offsets4[4] = { NULL };
3317 unsigned num_lods = bld4.num_lods;
3318
3319 s4 = lp_build_extract_range(gallivm, newcoords[0], 4*i, 4);
3320 t4 = lp_build_extract_range(gallivm, newcoords[1], 4*i, 4);
3321 r4 = lp_build_extract_range(gallivm, newcoords[2], 4*i, 4);
3322
3323 if (offsets[0]) {
3324 offsets4[0] = lp_build_extract_range(gallivm, offsets[0], 4*i, 4);
3325 if (dims > 1) {
3326 offsets4[1] = lp_build_extract_range(gallivm, offsets[1], 4*i, 4);
3327 if (dims > 2) {
3328 offsets4[2] = lp_build_extract_range(gallivm, offsets[2], 4*i, 4);
3329 }
3330 }
3331 }
3332 lod_positive4 = lp_build_extract_range(gallivm, lod_positive, num_lods * i, num_lods);
3333 ilevel04 = bld.num_mips == 1 ? ilevel0 :
3334 lp_build_extract_range(gallivm, ilevel0, num_lods * i, num_lods);
3335 if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
3336 ilevel14 = lp_build_extract_range(gallivm, ilevel1, num_lods * i, num_lods);
3337 lod_fpart4 = lp_build_extract_range(gallivm, lod_fpart, num_lods * i, num_lods);
3338 }
3339
3340 if (use_aos) {
3341 /* do sampling/filtering with fixed pt arithmetic */
3342 lp_build_sample_aos(&bld4, sampler_index,
3343 s4, t4, r4, offsets4,
3344 lod_positive4, lod_fpart4,
3345 ilevel04, ilevel14,
3346 texelout4);
3347 }
3348
3349 else {
3350 /* this path is currently unreachable and hence might break easily... */
3351 LLVMValueRef newcoords4[5];
3352 newcoords4[0] = s4;
3353 newcoords4[1] = t4;
3354 newcoords4[2] = r4;
3355 newcoords4[3] = lp_build_extract_range(gallivm, newcoords[3], 4*i, 4);
3356 newcoords4[4] = lp_build_extract_range(gallivm, newcoords[4], 4*i, 4);
3357
3358 lp_build_sample_general(&bld4, sampler_index,
3359 op_type == LP_SAMPLER_OP_GATHER,
3360 newcoords4, offsets4,
3361 lod_positive4, lod_fpart4,
3362 ilevel04, ilevel14,
3363 texelout4);
3364 }
3365 for (j = 0; j < 4; j++) {
3366 texelouttmp[j][i] = texelout4[j];
3367 }
3368 }
3369
3370 for (j = 0; j < 4; j++) {
3371 texel_out[j] = lp_build_concat(gallivm, texelouttmp[j], type4, num_quads);
3372 }
3373 }
3374 }
3375
3376 if (target != PIPE_BUFFER && op_type != LP_SAMPLER_OP_GATHER) {
3377 apply_sampler_swizzle(&bld, texel_out);
3378 }
3379
3380 /*
3381 * texel type can be a (32bit) int/uint (for pure int formats only),
3382 * however we are expected to always return floats (storage is untyped).
3383 */
3384 if (!bld.texel_type.floating) {
3385 unsigned chan;
3386 for (chan = 0; chan < 4; chan++) {
3387 texel_out[chan] = LLVMBuildBitCast(builder, texel_out[chan],
3388 lp_build_vec_type(gallivm, type), "");
3389 }
3390 }
3391 }
3392
3393
3394 #define USE_TEX_FUNC_CALL 1
3395
3396 #define LP_MAX_TEX_FUNC_ARGS 32
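/*
 * USE_TEX_FUNC_CALL selects whether "non-simple" sampling is emitted as
 * a separate, name-cached function (see lp_build_sample_soa() below);
 * LP_MAX_TEX_FUNC_ARGS bounds the argument arrays of such functions.
 */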
3397
3398 static inline void
3399 get_target_info(enum pipe_texture_target target,
3400 unsigned *num_coords, unsigned *num_derivs,
3401 unsigned *num_offsets, unsigned *layer)
3402 {
3403 unsigned dims = texture_dims(target);
3404 *num_coords = dims;
3405 *num_offsets = dims;
3406 *num_derivs = (target == PIPE_TEXTURE_CUBE ||
3407 target == PIPE_TEXTURE_CUBE_ARRAY) ? 3 : dims;
3408 *layer = has_layer_coord(target) ? 2 : 0;
3409 if (target == PIPE_TEXTURE_CUBE_ARRAY) {
3410 /*
3411 * dims doesn't include r coord for cubes - this is handled
3412 * by layer instead, but we need to fix it up for cube arrays...
3413 */
3414 *layer = 3;
3415 *num_coords = 3;
3416 }
3417 }
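/*
 * For example (assuming the usual texture_dims() results):
 *   PIPE_TEXTURE_2D:         num_coords 2, num_offsets 2, num_derivs 2, layer 0
 *   PIPE_TEXTURE_2D_ARRAY:   num_coords 2, num_offsets 2, num_derivs 2, layer 2
 *   PIPE_TEXTURE_CUBE_ARRAY: num_coords 3, num_offsets 2, num_derivs 3, layer 3
 */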
3418
3419
3420 /**
3421 * Generate the function body for a texture sampling function.
3422 */
3423 static void
3424 lp_build_sample_gen_func(struct gallivm_state *gallivm,
3425 const struct lp_static_texture_state *static_texture_state,
3426 const struct lp_static_sampler_state *static_sampler_state,
3427 struct lp_sampler_dynamic_state *dynamic_state,
3428 struct lp_type type,
3429 unsigned texture_index,
3430 unsigned sampler_index,
3431 LLVMValueRef function,
3432 unsigned num_args,
3433 unsigned sample_key)
3434 {
3435 LLVMBuilderRef old_builder;
3436 LLVMBasicBlockRef block;
3437 LLVMValueRef coords[5];
3438 LLVMValueRef offsets[3] = { NULL };
3439 LLVMValueRef lod = NULL;
3440 LLVMValueRef context_ptr;
3441 LLVMValueRef thread_data_ptr = NULL;
3442 LLVMValueRef texel_out[4];
3443 struct lp_derivatives derivs;
3444 struct lp_derivatives *deriv_ptr = NULL;
3445 unsigned num_param = 0;
3446 unsigned i, num_coords, num_derivs, num_offsets, layer;
3447 enum lp_sampler_lod_control lod_control;
3448 boolean need_cache = FALSE;
3449
3450 lod_control = (sample_key & LP_SAMPLER_LOD_CONTROL_MASK) >>
3451 LP_SAMPLER_LOD_CONTROL_SHIFT;
3452
3453 get_target_info(static_texture_state->target,
3454 &num_coords, &num_derivs, &num_offsets, &layer);
3455
3456 if (dynamic_state->cache_ptr) {
3457 const struct util_format_description *format_desc;
3458 format_desc = util_format_description(static_texture_state->format);
3459 if (format_desc && format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
3460 need_cache = TRUE;
3461 }
3462 }
3463
3464 /* "unpack" arguments */
3465 context_ptr = LLVMGetParam(function, num_param++);
3466 if (need_cache) {
3467 thread_data_ptr = LLVMGetParam(function, num_param++);
3468 }
3469 for (i = 0; i < num_coords; i++) {
3470 coords[i] = LLVMGetParam(function, num_param++);
3471 }
3472 for (i = num_coords; i < 5; i++) {
3473 /* This is rather unfortunate... */
3474 coords[i] = lp_build_undef(gallivm, type);
3475 }
3476 if (layer) {
3477 coords[layer] = LLVMGetParam(function, num_param++);
3478 }
3479 if (sample_key & LP_SAMPLER_SHADOW) {
3480 coords[4] = LLVMGetParam(function, num_param++);
3481 }
3482 if (sample_key & LP_SAMPLER_OFFSETS) {
3483 for (i = 0; i < num_offsets; i++) {
3484 offsets[i] = LLVMGetParam(function, num_param++);
3485 }
3486 }
3487 if (lod_control == LP_SAMPLER_LOD_BIAS ||
3488 lod_control == LP_SAMPLER_LOD_EXPLICIT) {
3489 lod = LLVMGetParam(function, num_param++);
3490 }
3491 else if (lod_control == LP_SAMPLER_LOD_DERIVATIVES) {
3492 for (i = 0; i < num_derivs; i++) {
3493 derivs.ddx[i] = LLVMGetParam(function, num_param++);
3494 derivs.ddy[i] = LLVMGetParam(function, num_param++);
3495 }
3496 deriv_ptr = &derivs;
3497 }
3498
3499 assert(num_args == num_param);
3500
3501 /*
3502 * Function body
3503 */
3504
3505 old_builder = gallivm->builder;
3506 block = LLVMAppendBasicBlockInContext(gallivm->context, function, "entry");
3507 gallivm->builder = LLVMCreateBuilderInContext(gallivm->context);
3508 LLVMPositionBuilderAtEnd(gallivm->builder, block);
3509
3510 lp_build_sample_soa_code(gallivm,
3511 static_texture_state,
3512 static_sampler_state,
3513 dynamic_state,
3514 type,
3515 sample_key,
3516 texture_index,
3517 sampler_index,
3518 context_ptr,
3519 thread_data_ptr,
3520 coords,
3521 offsets,
3522 deriv_ptr,
3523 lod,
3524 texel_out);
3525
3526 LLVMBuildAggregateRet(gallivm->builder, texel_out, 4);
3527
3528 LLVMDisposeBuilder(gallivm->builder);
3529 gallivm->builder = old_builder;
3530
3531 gallivm_verify_function(gallivm, function);
3532 }
3533
3534
3535 /**
3536 * Call the matching function for texture sampling.
3537 * If there's no match, generate a new one.
3538 */
3539 static void
3540 lp_build_sample_soa_func(struct gallivm_state *gallivm,
3541 const struct lp_static_texture_state *static_texture_state,
3542 const struct lp_static_sampler_state *static_sampler_state,
3543 struct lp_sampler_dynamic_state *dynamic_state,
3544 const struct lp_sampler_params *params)
3545 {
3546 LLVMBuilderRef builder = gallivm->builder;
3547 LLVMModuleRef module = LLVMGetGlobalParent(LLVMGetBasicBlockParent(
3548 LLVMGetInsertBlock(builder)));
3549 LLVMValueRef function, inst;
3550 LLVMValueRef args[LP_MAX_TEX_FUNC_ARGS];
3551 LLVMBasicBlockRef bb;
3552 LLVMValueRef tex_ret;
3553 unsigned num_args = 0;
3554 char func_name[64];
3555 unsigned i, num_coords, num_derivs, num_offsets, layer;
3556 unsigned texture_index = params->texture_index;
3557 unsigned sampler_index = params->sampler_index;
3558 unsigned sample_key = params->sample_key;
3559 const LLVMValueRef *coords = params->coords;
3560 const LLVMValueRef *offsets = params->offsets;
3561 const struct lp_derivatives *derivs = params->derivs;
3562 enum lp_sampler_lod_control lod_control;
3563 boolean need_cache = FALSE;
3564
3565 lod_control = (sample_key & LP_SAMPLER_LOD_CONTROL_MASK) >>
3566 LP_SAMPLER_LOD_CONTROL_SHIFT;
3567
3568 get_target_info(static_texture_state->target,
3569 &num_coords, &num_derivs, &num_offsets, &layer);
3570
3571 if (dynamic_state->cache_ptr) {
3572 const struct util_format_description *format_desc;
3573 format_desc = util_format_description(static_texture_state->format);
3574 if (format_desc && format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
3575 need_cache = TRUE;
3576 }
3577 }
3578 /*
3579 * texture function matches are found by name.
3580 * Thus the name has to include both the texture and sampler unit
3581 * (which covers all static state) plus the actual texture function
3582 * (including things like offsets, shadow coord, lod control).
3583 * Additionally lod_property has to be included too.
3584 */
3585
3586 snprintf(func_name, sizeof(func_name), "texfunc_res_%d_sam_%d_%x",
3587 texture_index, sampler_index, sample_key);
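/* e.g. texture unit 2, sampler unit 1, sample_key 0x53 gives "texfunc_res_2_sam_1_53" */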
3588
3589 function = LLVMGetNamedFunction(module, func_name);
3590
3591 if (!function) {
3592 LLVMTypeRef arg_types[LP_MAX_TEX_FUNC_ARGS];
3593 LLVMTypeRef ret_type;
3594 LLVMTypeRef function_type;
3595 LLVMTypeRef val_type[4];
3596 unsigned num_param = 0;
3597
3598 /*
3599 * Generate the function prototype.
3600 */
3601
3602 arg_types[num_param++] = LLVMTypeOf(params->context_ptr);
3603 if (need_cache) {
3604 arg_types[num_param++] = LLVMTypeOf(params->thread_data_ptr);
3605 }
3606 for (i = 0; i < num_coords; i++) {
3607 arg_types[num_param++] = LLVMTypeOf(coords[0]);
3608 assert(LLVMTypeOf(coords[0]) == LLVMTypeOf(coords[i]));
3609 }
3610 if (layer) {
3611 arg_types[num_param++] = LLVMTypeOf(coords[layer]);
3612 assert(LLVMTypeOf(coords[0]) == LLVMTypeOf(coords[layer]));
3613 }
3614 if (sample_key & LP_SAMPLER_SHADOW) {
3615 arg_types[num_param++] = LLVMTypeOf(coords[0]);
3616 }
3617 if (sample_key & LP_SAMPLER_OFFSETS) {
3618 for (i = 0; i < num_offsets; i++) {
3619 arg_types[num_param++] = LLVMTypeOf(offsets[0]);
3620 assert(LLVMTypeOf(offsets[0]) == LLVMTypeOf(offsets[i]));
3621 }
3622 }
3623 if (lod_control == LP_SAMPLER_LOD_BIAS ||
3624 lod_control == LP_SAMPLER_LOD_EXPLICIT) {
3625 arg_types[num_param++] = LLVMTypeOf(params->lod);
3626 }
3627 else if (lod_control == LP_SAMPLER_LOD_DERIVATIVES) {
3628 for (i = 0; i < num_derivs; i++) {
3629 arg_types[num_param++] = LLVMTypeOf(derivs->ddx[i]);
3630 arg_types[num_param++] = LLVMTypeOf(derivs->ddy[i]);
3631 assert(LLVMTypeOf(derivs->ddx[0]) == LLVMTypeOf(derivs->ddx[i]));
3632 assert(LLVMTypeOf(derivs->ddy[0]) == LLVMTypeOf(derivs->ddy[i]));
3633 }
3634 }
3635
3636 val_type[0] = val_type[1] = val_type[2] = val_type[3] =
3637 lp_build_vec_type(gallivm, params->type);
3638 ret_type = LLVMStructTypeInContext(gallivm->context, val_type, 4, 0);
3639 function_type = LLVMFunctionType(ret_type, arg_types, num_param, 0);
3640 function = LLVMAddFunction(module, func_name, function_type);
3641
3642 for (i = 0; i < num_param; ++i) {
3643 if (LLVMGetTypeKind(arg_types[i]) == LLVMPointerTypeKind) {
3644
3645 lp_add_function_attr(function, i + 1, LP_FUNC_ATTR_NOALIAS);
3646 }
3647 }
3648
3649 LLVMSetFunctionCallConv(function, LLVMFastCallConv);
3650 LLVMSetLinkage(function, LLVMInternalLinkage);
3651
3652 lp_build_sample_gen_func(gallivm,
3653 static_texture_state,
3654 static_sampler_state,
3655 dynamic_state,
3656 params->type,
3657 texture_index,
3658 sampler_index,
3659 function,
3660 num_param,
3661 sample_key);
3662 }
3663
3664 num_args = 0;
3665 args[num_args++] = params->context_ptr;
3666 if (need_cache) {
3667 args[num_args++] = params->thread_data_ptr;
3668 }
3669 for (i = 0; i < num_coords; i++) {
3670 args[num_args++] = coords[i];
3671 }
3672 if (layer) {
3673 args[num_args++] = coords[layer];
3674 }
3675 if (sample_key & LP_SAMPLER_SHADOW) {
3676 args[num_args++] = coords[4];
3677 }
3678 if (sample_key & LP_SAMPLER_OFFSETS) {
3679 for (i = 0; i < num_offsets; i++) {
3680 args[num_args++] = offsets[i];
3681 }
3682 }
3683 if (lod_control == LP_SAMPLER_LOD_BIAS ||
3684 lod_control == LP_SAMPLER_LOD_EXPLICIT) {
3685 args[num_args++] = params->lod;
3686 }
3687 else if (lod_control == LP_SAMPLER_LOD_DERIVATIVES) {
3688 for (i = 0; i < num_derivs; i++) {
3689 args[num_args++] = derivs->ddx[i];
3690 args[num_args++] = derivs->ddy[i];
3691 }
3692 }
3693
3694 assert(num_args <= LP_MAX_TEX_FUNC_ARGS);
3695
3696 tex_ret = LLVMBuildCall(builder, function, args, num_args, "");
3697 bb = LLVMGetInsertBlock(builder);
3698 inst = LLVMGetLastInstruction(bb);
3699 LLVMSetInstructionCallConv(inst, LLVMFastCallConv);
3700
3701 for (i = 0; i < 4; i++) {
3702 params->texel[i] = LLVMBuildExtractValue(gallivm->builder, tex_ret, i, "");
3703 }
3704 }
3705
3706
3707 /**
3708 * Build texture sampling code.
3709 * Either via a function call or inline it directly.
3710 */
3711 void
3712 lp_build_sample_soa(const struct lp_static_texture_state *static_texture_state,
3713 const struct lp_static_sampler_state *static_sampler_state,
3714 struct lp_sampler_dynamic_state *dynamic_state,
3715 struct gallivm_state *gallivm,
3716 const struct lp_sampler_params *params)
3717 {
3718 boolean use_tex_func = FALSE;
3719
3720 /*
3721 * Do not use a function call if the sampling is "simple enough".
3722 * We define this by
3723 * a) format
3724 * b) no mips (either one level only or no mip filter)
3725 * No mips will definitely make the code smaller, though
3726 * the format requirement is a bit iffy - there are some (SoA) formats
3727 * which definitely generate less code. This does happen to catch
3728 * some important cases, though, which are hurt quite a bit by using
3729 * a call (not really because of the call overhead, but because
3730 * they reuse the same texture unit with some of the same
3731 * parameters).
3732 * Ideally we'd let llvm recognize this stuff by doing IPO passes.
3733 */
3734
3735 if (USE_TEX_FUNC_CALL) {
3736 const struct util_format_description *format_desc;
3737 boolean simple_format;
3738 boolean simple_tex;
3739 enum lp_sampler_op_type op_type;
3740 format_desc = util_format_description(static_texture_state->format);
3741 simple_format = !format_desc ||
3742 (util_format_is_rgba8_variant(format_desc) &&
3743 format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB);
3744
3745 op_type = (params->sample_key & LP_SAMPLER_OP_TYPE_MASK) >>
3746 LP_SAMPLER_OP_TYPE_SHIFT;
3747 simple_tex =
3748 op_type != LP_SAMPLER_OP_TEXTURE ||
3749 ((static_sampler_state->min_mip_filter == PIPE_TEX_MIPFILTER_NONE ||
3750 static_texture_state->level_zero_only) &&
3751 static_sampler_state->min_img_filter == static_sampler_state->mag_img_filter);
3752
3753 use_tex_func = format_desc && !(simple_format && simple_tex);
3754 }
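/*
 * E.g. a mip-mapped B8G8R8A8_UNORM sample with a real mip filter fails
 * the simple_tex test and goes through the cached function, while the
 * same format with level_zero_only and matching min/mag filters is
 * inlined directly.
 */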
3755
3756 if (use_tex_func) {
3757 lp_build_sample_soa_func(gallivm,
3758 static_texture_state,
3759 static_sampler_state,
3760 dynamic_state,
3761 params);
3762 }
3763 else {
3764 lp_build_sample_soa_code(gallivm,
3765 static_texture_state,
3766 static_sampler_state,
3767 dynamic_state,
3768 params->type,
3769 params->sample_key,
3770 params->texture_index,
3771 params->sampler_index,
3772 params->context_ptr,
3773 params->thread_data_ptr,
3774 params->coords,
3775 params->offsets,
3776 params->derivs,
3777 params->lod,
3778 params->texel);
3779 }
3780 }
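/*
 * A minimal caller sketch (illustrative only -- assumes the static/dynamic
 * state and the coordinate/texel arrays were set up elsewhere):
 *
 *    struct lp_sampler_params params;
 *    memset(&params, 0, sizeof params);
 *    params.type = type;                 // e.g. 8-wide float soa vectors
 *    params.sample_key = sample_key;     // op type, lod control, etc.
 *    params.texture_index = 0;
 *    params.sampler_index = 0;
 *    params.context_ptr = context_ptr;
 *    params.coords = coords;             // s/t/r (+ layer/shadow) vectors
 *    params.texel = texel_out;           // receives the 4 result channels
 *    lp_build_sample_soa(static_texture_state, static_sampler_state,
 *                        dynamic_state, gallivm, &params);
 */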
3781
3782
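/**
 * Build code for a texture size query (e.g. resinfo/sviewinfo).
 * Writes the per-level minified width/height/depth (and, for arrays, the
 * layer count) to params->sizes_out; for sviewinfo queries with an
 * explicit_lod, component 3 additionally receives the number of mip levels.
 */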
3783 void
3784 lp_build_size_query_soa(struct gallivm_state *gallivm,
3785 const struct lp_static_texture_state *static_state,
3786 struct lp_sampler_dynamic_state *dynamic_state,
3787 const struct lp_sampler_size_query_params *params)
3788 {
3789 LLVMValueRef lod, level = 0, size;
3790 LLVMValueRef first_level = NULL;
3791 int dims, i;
3792 boolean has_array;
3793 unsigned num_lods = 1;
3794 struct lp_build_context bld_int_vec4;
3795 LLVMValueRef context_ptr = params->context_ptr;
3796 unsigned texture_unit = params->texture_unit;
3797 unsigned target = params->target;
3798
3799 if (static_state->format == PIPE_FORMAT_NONE) {
3800 /*
3801 * If there's nothing bound, format is NONE, and we must return
3802 * all zero as mandated by d3d10 in this case.
3803 */
3804 unsigned chan;
3805 LLVMValueRef zero = lp_build_const_vec(gallivm, params->int_type, 0.0F);
3806 for (chan = 0; chan < 4; chan++) {
3807 params->sizes_out[chan] = zero;
3808 }
3809 return;
3810 }
3811
3812 /*
3813 * Do some sanity verification about bound texture and shader dcl target.
3814 * Not entirely sure what's possible but assume array/non-array
3815 * always compatible (probably not ok for OpenGL but d3d10 has no
3816 * distinction of arrays at the resource level).
3817 * Everything else looks bogus (though not entirely sure about rect/2d).
3818 * Currently disabled because it causes assertion failures if there's
3819 * nothing bound (or rather a dummy texture, not that this case would
3820 * return the right values).
3821 */
3822 if (0 && static_state->target != target) {
3823 if (static_state->target == PIPE_TEXTURE_1D)
3824 assert(target == PIPE_TEXTURE_1D_ARRAY);
3825 else if (static_state->target == PIPE_TEXTURE_1D_ARRAY)
3826 assert(target == PIPE_TEXTURE_1D);
3827 else if (static_state->target == PIPE_TEXTURE_2D)
3828 assert(target == PIPE_TEXTURE_2D_ARRAY);
3829 else if (static_state->target == PIPE_TEXTURE_2D_ARRAY)
3830 assert(target == PIPE_TEXTURE_2D);
3831 else if (static_state->target == PIPE_TEXTURE_CUBE)
3832 assert(target == PIPE_TEXTURE_CUBE_ARRAY);
3833 else if (static_state->target == PIPE_TEXTURE_CUBE_ARRAY)
3834 assert(target == PIPE_TEXTURE_CUBE);
3835 else
3836 assert(0);
3837 }
3838
3839 dims = texture_dims(target);
3840
3841 switch (target) {
3842 case PIPE_TEXTURE_1D_ARRAY:
3843 case PIPE_TEXTURE_2D_ARRAY:
3844 case PIPE_TEXTURE_CUBE_ARRAY:
3845 has_array = TRUE;
3846 break;
3847 default:
3848 has_array = FALSE;
3849 break;
3850 }
3851
3852 assert(!params->int_type.floating);
3853
3854 lp_build_context_init(&bld_int_vec4, gallivm, lp_type_int_vec(32, 128));
3855
3856 if (params->explicit_lod) {
3857 /* FIXME: this needs to honor per-element lod */
3858 lod = LLVMBuildExtractElement(gallivm->builder, params->explicit_lod,
3859 lp_build_const_int32(gallivm, 0), "");
3860 first_level = dynamic_state->first_level(dynamic_state, gallivm,
3861 context_ptr, texture_unit);
3862 level = LLVMBuildAdd(gallivm->builder, lod, first_level, "level");
3863 lod = lp_build_broadcast_scalar(&bld_int_vec4, level);
3864 } else {
3865 lod = bld_int_vec4.zero;
3866 }
3867
3868 size = bld_int_vec4.undef;
3869
3870 size = LLVMBuildInsertElement(gallivm->builder, size,
3871 dynamic_state->width(dynamic_state, gallivm,
3872 context_ptr, texture_unit),
3873 lp_build_const_int32(gallivm, 0), "");
3874
3875 if (dims >= 2) {
3876 size = LLVMBuildInsertElement(gallivm->builder, size,
3877 dynamic_state->height(dynamic_state, gallivm,
3878 context_ptr, texture_unit),
3879 lp_build_const_int32(gallivm, 1), "");
3880 }
3881
3882 if (dims >= 3) {
3883 size = LLVMBuildInsertElement(gallivm->builder, size,
3884 dynamic_state->depth(dynamic_state, gallivm,
3885 context_ptr, texture_unit),
3886 lp_build_const_int32(gallivm, 2), "");
3887 }
3888
3889 size = lp_build_minify(&bld_int_vec4, size, lod, TRUE);
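/* e.g. a 64x16 texture at lod 2 minifies to 16x4 (each dimension clamps to at least 1) */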
3890
3891 if (has_array) {
3892 LLVMValueRef layers = dynamic_state->depth(dynamic_state, gallivm,
3893 context_ptr, texture_unit);
3894 if (target == PIPE_TEXTURE_CUBE_ARRAY) {
3895 /*
3896 * It looks like GL wants the number of cubes; d3d10.1 seems to leave it undefined.
3897 * We could avoid this by passing in the number of cubes instead of the total
3898 * number of layers (might make things easier elsewhere too).
3899 */
3900 LLVMValueRef six = lp_build_const_int32(gallivm, 6);
3901 layers = LLVMBuildSDiv(gallivm->builder, layers, six, "");
3902 }
3903 size = LLVMBuildInsertElement(gallivm->builder, size, layers,
3904 lp_build_const_int32(gallivm, dims), "");
3905 }
3906
3907 /*
3908 * d3d10 requires zero for x/y/z values (but not w, i.e. mip levels)
3909 * if level is out of bounds (note this can't cover unbound texture
3910 * here, which also requires returning zero).
3911 */
3912 if (params->explicit_lod && params->is_sviewinfo) {
3913 LLVMValueRef last_level, out, out1;
3914 struct lp_build_context leveli_bld;
3915
3916 /* everything is scalar for now */
3917 lp_build_context_init(&leveli_bld, gallivm, lp_type_int_vec(32, 32));
3918 last_level = dynamic_state->last_level(dynamic_state, gallivm,
3919 context_ptr, texture_unit);
3920
3921 out = lp_build_cmp(&leveli_bld, PIPE_FUNC_LESS, level, first_level);
3922 out1 = lp_build_cmp(&leveli_bld, PIPE_FUNC_GREATER, level, last_level);
3923 out = lp_build_or(&leveli_bld, out, out1);
3924 if (num_lods == 1) {
3925 out = lp_build_broadcast_scalar(&bld_int_vec4, out);
3926 }
3927 else {
3928 /* TODO */
3929 assert(0);
3930 }
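/* out is ~0 where the level is out of bounds: size &= ~out zeroes those components */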
3931 size = lp_build_andnot(&bld_int_vec4, size, out);
3932 }
3933 for (i = 0; i < dims + (has_array ? 1 : 0); i++) {
3934 params->sizes_out[i] = lp_build_extract_broadcast(gallivm, bld_int_vec4.type, params->int_type,
3935 size,
3936 lp_build_const_int32(gallivm, i));
3937 }
3938 if (params->is_sviewinfo) {
3939 for (; i < 4; i++) {
3940 params->sizes_out[i] = lp_build_const_vec(gallivm, params->int_type, 0.0);
3941 }
3942 }
3943
3944 /*
3945 * If there's no explicit_lod (buffers, rects), queries requiring the
3946 * number of mips would be illegal.
3947 */
3948 if (params->is_sviewinfo && params->explicit_lod) {
3949 struct lp_build_context bld_int_scalar;
3950 LLVMValueRef num_levels;
3951 lp_build_context_init(&bld_int_scalar, gallivm, lp_type_int(32));
3952
3953 if (static_state->level_zero_only) {
3954 num_levels = bld_int_scalar.one;
3955 }
3956 else {
3957 LLVMValueRef last_level;
3958
3959 last_level = dynamic_state->last_level(dynamic_state, gallivm,
3960 context_ptr, texture_unit);
3961 num_levels = lp_build_sub(&bld_int_scalar, last_level, first_level);
3962 num_levels = lp_build_add(&bld_int_scalar, num_levels, bld_int_scalar.one);
3963 }
3964 params->sizes_out[3] = lp_build_broadcast(gallivm, lp_build_vec_type(gallivm, params->int_type),
3965 num_levels);
3966 }
3967 }
3968
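/**
 * Generate an image atomic op (or cmpxchg) on 32-bit texels.
 * Only R32_UINT/R32_SINT/R32_FLOAT are handled; the vector is processed
 * lane by lane in a loop since LLVM atomics operate on scalars, and lanes
 * that are inactive or out of bounds skip the atomic entirely.
 */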
3969 static void
3970 lp_build_do_atomic_soa(struct gallivm_state *gallivm,
3971 const struct util_format_description *format_desc,
3972 struct lp_type type,
3973 LLVMValueRef exec_mask,
3974 LLVMValueRef base_ptr,
3975 LLVMValueRef offset,
3976 LLVMValueRef out_of_bounds,
3977 unsigned img_op,
3978 LLVMAtomicRMWBinOp op,
3979 const LLVMValueRef rgba_in[4],
3980 const LLVMValueRef rgba2_in[4],
3981 LLVMValueRef atomic_result[4])
3982 {
3983 enum pipe_format format = format_desc->format;
3984
3985 if (format != PIPE_FORMAT_R32_UINT && format != PIPE_FORMAT_R32_SINT && format != PIPE_FORMAT_R32_FLOAT)
3986 return;
3987
3988 LLVMValueRef atom_res = lp_build_alloca(gallivm,
3989 LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), type.length), "");
3990
3991 offset = LLVMBuildGEP(gallivm->builder, base_ptr, &offset, 1, "");
3992 struct lp_build_loop_state loop_state;
3993 lp_build_loop_begin(&loop_state, gallivm, lp_build_const_int32(gallivm, 0));
3994 struct lp_build_if_state ifthen;
3995 LLVMValueRef cond;
3996 LLVMValueRef packed = rgba_in[0], packed2 = rgba2_in[0];
3997
3998 LLVMValueRef should_store_mask = LLVMBuildAnd(gallivm->builder, exec_mask, LLVMBuildNot(gallivm->builder, out_of_bounds, ""), "store_mask");
3999 assert(exec_mask);
4000
4001 cond = LLVMBuildICmp(gallivm->builder, LLVMIntNE, should_store_mask, lp_build_const_int_vec(gallivm, type, 0), "");
4002 cond = LLVMBuildExtractElement(gallivm->builder, cond, loop_state.counter, "");
4003 lp_build_if(&ifthen, gallivm, cond);
4004
4005 LLVMValueRef data = LLVMBuildExtractElement(gallivm->builder, packed, loop_state.counter, "");
4006 LLVMValueRef cast_base_ptr = LLVMBuildExtractElement(gallivm->builder, offset, loop_state.counter, "");
4007 cast_base_ptr = LLVMBuildBitCast(gallivm->builder, cast_base_ptr, LLVMPointerType(LLVMInt32TypeInContext(gallivm->context), 0), "");
4008 data = LLVMBuildBitCast(gallivm->builder, data, LLVMInt32TypeInContext(gallivm->context), "");
4009
4010 if (img_op == LP_IMG_ATOMIC_CAS) {
4011 LLVMValueRef cas_src_ptr = LLVMBuildExtractElement(gallivm->builder, packed2, loop_state.counter, "");
4012 LLVMValueRef cas_src = LLVMBuildBitCast(gallivm->builder, cas_src_ptr, LLVMInt32TypeInContext(gallivm->context), "");
4013 data = LLVMBuildAtomicCmpXchg(gallivm->builder, cast_base_ptr, data,
4014 cas_src,
4015 LLVMAtomicOrderingSequentiallyConsistent,
4016 LLVMAtomicOrderingSequentiallyConsistent,
4017 false);
4018 data = LLVMBuildExtractValue(gallivm->builder, data, 0, "");
4019 } else {
4020 data = LLVMBuildAtomicRMW(gallivm->builder, op,
4021 cast_base_ptr, data,
4022 LLVMAtomicOrderingSequentiallyConsistent,
4023 false);
4024 }
4025
4026 LLVMValueRef temp_res = LLVMBuildLoad(gallivm->builder, atom_res, "");
4027 temp_res = LLVMBuildInsertElement(gallivm->builder, temp_res, data, loop_state.counter, "");
4028 LLVMBuildStore(gallivm->builder, temp_res, atom_res);
4029
4030 lp_build_endif(&ifthen);
4031 lp_build_loop_end_cond(&loop_state, lp_build_const_int32(gallivm, type.length),
4032 NULL, LLVMIntUGE);
4033 atomic_result[0] = LLVMBuildLoad(gallivm->builder, atom_res, "");
4034 }
4035
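/**
 * Build code for an image load/store/atomic op.
 * Computes the texel offset from the integer coords, accumulates an
 * out-of-bounds mask, and dispatches on params->img_op; ops on an
 * unbound (PIPE_FORMAT_NONE) image return all zero per d3d10.
 */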
4036 void
4037 lp_build_img_op_soa(const struct lp_static_texture_state *static_texture_state,
4038 struct lp_sampler_dynamic_state *dynamic_state,
4039 struct gallivm_state *gallivm,
4040 const struct lp_img_params *params)
4041 {
4042 unsigned target = params->target;
4043 unsigned dims = texture_dims(target);
4044 /** regular scalar int type */
4045 struct lp_type int_type, int_coord_type;
4046 struct lp_build_context int_bld, int_coord_bld;
4047 const struct util_format_description *format_desc = util_format_description(static_texture_state->format);
4048 LLVMValueRef x = params->coords[0], y = params->coords[1], z = params->coords[2];
4049 LLVMValueRef row_stride_vec = NULL, img_stride_vec = NULL;
4050 int_type = lp_type_int(32);
4051 int_coord_type = lp_int_type(params->type);
4052 lp_build_context_init(&int_bld, gallivm, int_type);
4053 lp_build_context_init(&int_coord_bld, gallivm, int_coord_type);
4054
4055 LLVMValueRef offset, i, j;
4056
4057 LLVMValueRef row_stride = dynamic_state->row_stride(dynamic_state, gallivm,
4058 params->context_ptr, params->image_index);
4059 LLVMValueRef img_stride = dynamic_state->img_stride(dynamic_state, gallivm,
4060 params->context_ptr, params->image_index);
4061 LLVMValueRef base_ptr = dynamic_state->base_ptr(dynamic_state, gallivm,
4062 params->context_ptr, params->image_index);
4063 LLVMValueRef width = dynamic_state->width(dynamic_state, gallivm,
4064 params->context_ptr, params->image_index);
4065 LLVMValueRef height = dynamic_state->height(dynamic_state, gallivm,
4066 params->context_ptr, params->image_index);
4067 LLVMValueRef depth = dynamic_state->depth(dynamic_state, gallivm,
4068 params->context_ptr, params->image_index);
4069 boolean layer_coord = has_layer_coord(target);
4070
4071 width = lp_build_broadcast_scalar(&int_coord_bld, width);
4072 if (dims >= 2) {
4073 height = lp_build_broadcast_scalar(&int_coord_bld, height);
4074 row_stride_vec = lp_build_broadcast_scalar(&int_coord_bld, row_stride);
4075 }
4076 if (dims >= 3 || layer_coord) {
4077 depth = lp_build_broadcast_scalar(&int_coord_bld, depth);
4078 img_stride_vec = lp_build_broadcast_scalar(&int_coord_bld, img_stride);
4079 }
4080
4081 LLVMValueRef out_of_bounds = int_coord_bld.zero;
4082 LLVMValueRef out1;
4083 out1 = lp_build_cmp(&int_coord_bld, PIPE_FUNC_GEQUAL, x, width);
4084 out_of_bounds = lp_build_or(&int_coord_bld, out_of_bounds, out1);
4085
4086 if (dims >= 2) {
4087 out1 = lp_build_cmp(&int_coord_bld, PIPE_FUNC_GEQUAL, y, height);
4088 out_of_bounds = lp_build_or(&int_coord_bld, out_of_bounds, out1);
4089 }
4090 if (dims >= 3) {
4091 out1 = lp_build_cmp(&int_coord_bld, PIPE_FUNC_GEQUAL, z, depth);
4092 out_of_bounds = lp_build_or(&int_coord_bld, out_of_bounds, out1);
4093 }
4094 lp_build_sample_offset(&int_coord_bld,
4095 format_desc,
4096 x, y, z, row_stride_vec, img_stride_vec,
4097 &offset, &i, &j);
4098
4099 if (params->img_op == LP_IMG_LOAD) {
4100 struct lp_type texel_type = params->type;
4101 if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB &&
4102 format_desc->channel[0].pure_integer) {
4103 if (format_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) {
4104 texel_type = lp_type_int_vec(params->type.width, params->type.width * params->type.length);
4105 } else if (format_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED) {
4106 texel_type = lp_type_uint_vec(params->type.width, params->type.width * params->type.length);
4107 }
4108 }
4109
4110 if (static_texture_state->format == PIPE_FORMAT_NONE) {
4111 /*
4112 * If there's nothing bound, format is NONE, and we must return
4113 * all zero as mandated by d3d10 in this case.
4114 */
4115 unsigned chan;
4116 LLVMValueRef zero = lp_build_zero(gallivm, params->type);
4117 for (chan = 0; chan < 4; chan++) {
4118 params->outdata[chan] = zero;
4119 }
4120 return;
4121 }
4122
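/* zero the offsets of out-of-bounds lanes so the fetch below stays in
 * bounds; their results are replaced with zero via the selects afterwards.
 */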
4123 offset = lp_build_andnot(&int_coord_bld, offset, out_of_bounds);
4124 struct lp_build_context texel_bld;
4125 lp_build_context_init(&texel_bld, gallivm, texel_type);
4126 lp_build_fetch_rgba_soa(gallivm,
4127 format_desc,
4128 texel_type, TRUE,
4129 base_ptr, offset,
4130 i, j,
4131 NULL,
4132 params->outdata);
4133
4134 for (unsigned chan = 0; chan < 4; chan++) {
4135 params->outdata[chan] = lp_build_select(&texel_bld, out_of_bounds,
4136 texel_bld.zero, params->outdata[chan]);
4137 }
4138 } else if (params->img_op == LP_IMG_STORE) {
4139 if (static_texture_state->format == PIPE_FORMAT_NONE)
4140 return;
4141 lp_build_store_rgba_soa(gallivm, format_desc, params->type, params->exec_mask, base_ptr, offset, out_of_bounds,
4142 params->indata);
4143 } else {
4144 if (static_texture_state->format == PIPE_FORMAT_NONE)
4145 return;
4146 lp_build_do_atomic_soa(gallivm, format_desc, params->type, params->exec_mask, base_ptr, offset, out_of_bounds,
4147 params->img_op, params->op, params->indata, params->indata2, params->outdata);
4148 }
4149 }