gallivm: fix crash with seamless cube filtering with different min/mag filter
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_sample_soa.c
1 /**************************************************************************
2 *
3 * Copyright 2009 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28 /**
29 * @file
30 * Texture sampling -- SoA.
31 *
32 * @author Jose Fonseca <jfonseca@vmware.com>
33 * @author Brian Paul <brianp@vmware.com>
34 */
35
36 #include "pipe/p_defines.h"
37 #include "pipe/p_state.h"
38 #include "pipe/p_shader_tokens.h"
39 #include "util/u_debug.h"
40 #include "util/u_dump.h"
41 #include "util/u_memory.h"
42 #include "util/u_math.h"
43 #include "util/u_format.h"
44 #include "util/u_cpu_detect.h"
45 #include "util/format_rgb9e5.h"
46 #include "lp_bld_debug.h"
47 #include "lp_bld_type.h"
48 #include "lp_bld_const.h"
49 #include "lp_bld_conv.h"
50 #include "lp_bld_arit.h"
51 #include "lp_bld_bitarit.h"
52 #include "lp_bld_logic.h"
53 #include "lp_bld_printf.h"
54 #include "lp_bld_swizzle.h"
55 #include "lp_bld_flow.h"
56 #include "lp_bld_gather.h"
57 #include "lp_bld_format.h"
58 #include "lp_bld_sample.h"
59 #include "lp_bld_sample_aos.h"
60 #include "lp_bld_struct.h"
61 #include "lp_bld_quad.h"
62 #include "lp_bld_pack.h"
63 #include "lp_bld_intr.h"
64
65
66 /**
67 * Generate code to fetch a texel from a texture at int coords (x, y, z).
68 * The computation depends on whether the texture is 1D, 2D or 3D.
69 * The result, texel, will be float vectors:
70 * texel[0] = red values
71 * texel[1] = green values
72 * texel[2] = blue values
73 * texel[3] = alpha values
74 */
static void
lp_build_sample_texel_soa(struct lp_build_sample_context *bld,
                          LLVMValueRef width,
                          LLVMValueRef height,
                          LLVMValueRef depth,
                          LLVMValueRef x,
                          LLVMValueRef y,
                          LLVMValueRef z,
                          LLVMValueRef y_stride,
                          LLVMValueRef z_stride,
                          LLVMValueRef data_ptr,
                          LLVMValueRef mipoffsets,
                          LLVMValueRef texel_out[4])
{
   const struct lp_static_sampler_state *static_state = bld->static_sampler_state;
   const unsigned dims = bld->dims;
   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef offset;
   LLVMValueRef i, j;
   LLVMValueRef use_border = NULL;

   /*
    * Build a per-lane mask of coords falling outside the texture image,
    * but only for axes whose wrap mode (given the min/mag filters) can
    * actually produce border texels.
    */
   /* use_border = x < 0 || x >= width || y < 0 || y >= height */
   if (lp_sampler_wrap_mode_uses_border_color(static_state->wrap_s,
                                              static_state->min_img_filter,
                                              static_state->mag_img_filter)) {
      LLVMValueRef b1, b2;
      b1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, x, int_coord_bld->zero);
      b2 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, x, width);
      use_border = LLVMBuildOr(builder, b1, b2, "b1_or_b2");
   }

   if (dims >= 2 &&
       lp_sampler_wrap_mode_uses_border_color(static_state->wrap_t,
                                              static_state->min_img_filter,
                                              static_state->mag_img_filter)) {
      LLVMValueRef b1, b2;
      b1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, y, int_coord_bld->zero);
      b2 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, y, height);
      if (use_border) {
         use_border = LLVMBuildOr(builder, use_border, b1, "ub_or_b1");
         use_border = LLVMBuildOr(builder, use_border, b2, "ub_or_b2");
      }
      else {
         use_border = LLVMBuildOr(builder, b1, b2, "b1_or_b2");
      }
   }

   if (dims == 3 &&
       lp_sampler_wrap_mode_uses_border_color(static_state->wrap_r,
                                              static_state->min_img_filter,
                                              static_state->mag_img_filter)) {
      LLVMValueRef b1, b2;
      b1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, z, int_coord_bld->zero);
      b2 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, z, depth);
      if (use_border) {
         use_border = LLVMBuildOr(builder, use_border, b1, "ub_or_b1");
         use_border = LLVMBuildOr(builder, use_border, b2, "ub_or_b2");
      }
      else {
         use_border = LLVMBuildOr(builder, b1, b2, "b1_or_b2");
      }
   }

   /* convert x,y,z coords to linear offset from start of texture, in bytes */
   lp_build_sample_offset(&bld->int_coord_bld,
                          bld->format_desc,
                          x, y, z, y_stride, z_stride,
                          &offset, &i, &j);
   if (mipoffsets) {
      /* add the starting offset of the selected mip level */
      offset = lp_build_add(&bld->int_coord_bld, offset, mipoffsets);
   }

   if (use_border) {
      /* If we can sample the border color, it means that texcoords may
       * lie outside the bounds of the texture image.  We need to do
       * something to prevent reading out of bounds and causing a segfault.
       *
       * Simply AND the texture coords with !use_border.  This will cause
       * coords which are out of bounds to become zero.  Zero's guaranteed
       * to be inside the texture image.
       */
      offset = lp_build_andnot(&bld->int_coord_bld, offset, use_border);
   }

   lp_build_fetch_rgba_soa(bld->gallivm,
                           bld->format_desc,
                           bld->texel_type, TRUE,
                           data_ptr, offset,
                           i, j,
                           bld->cache,
                           texel_out);

   /*
    * Note: if we find an app which frequently samples the texture border
    * we might want to implement a true conditional here to avoid sampling
    * the texture whenever possible (since that's quite a bit of code).
    * Ex:
    *   if (use_border) {
    *      texel = border_color;
    *   }
    *   else {
    *      texel = sample_texture(coord);
    *   }
    * As it is now, we always sample the texture, then selectively replace
    * the texel color results with the border color.
    */

   if (use_border) {
      /* select texel color or border color depending on use_border. */
      const struct util_format_description *format_desc = bld->format_desc;
      int chan;
      /* border color is an aos vector of 4 channels, hence length 4 */
      struct lp_type border_type = bld->texel_type;
      border_type.length = 4;
      /*
       * Only replace channels which are actually present. The others should
       * get optimized away eventually by sampler_view swizzle anyway but it's
       * easier too.
       */
      for (chan = 0; chan < 4; chan++) {
         unsigned chan_s;
         /* reverse-map channel... */
         for (chan_s = 0; chan_s < 4; chan_s++) {
            if (chan_s == format_desc->swizzle[chan]) {
               break;
            }
         }
         /* chan_s == 4 means no physical channel maps here (presumably a
          * constant 0/1 swizzle) - leave the fetched value untouched then. */
         if (chan_s <= 3) {
            /* use the already clamped color */
            LLVMValueRef idx = lp_build_const_int32(bld->gallivm, chan);
            LLVMValueRef border_chan;

            /* broadcast one border channel across all lanes */
            border_chan = lp_build_extract_broadcast(bld->gallivm,
                                                     border_type,
                                                     bld->texel_type,
                                                     bld->border_color_clamped,
                                                     idx);
            texel_out[chan] = lp_build_select(&bld->texel_bld, use_border,
                                              border_chan, texel_out[chan]);
         }
      }
   }
}
218
219
220 /**
221 * Helper to compute the mirror function for the PIPE_WRAP_MIRROR_REPEAT mode.
222 * (Note that with pot sizes could do this much more easily post-scale
223 * with some bit arithmetic.)
224 */
225 static LLVMValueRef
226 lp_build_coord_mirror(struct lp_build_sample_context *bld,
227 LLVMValueRef coord, boolean posOnly)
228 {
229 struct lp_build_context *coord_bld = &bld->coord_bld;
230 LLVMValueRef fract;
231 LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5);
232
233 /*
234 * We can just use 2*(x - round(0.5*x)) to do all the mirroring,
235 * it all works out. (The result is in range [-1, 1.0], negative if
236 * the coord is in the "odd" section, otherwise positive.)
237 */
238
239 coord = lp_build_mul(coord_bld, coord, half);
240 fract = lp_build_round(coord_bld, coord);
241 fract = lp_build_sub(coord_bld, coord, fract);
242 coord = lp_build_add(coord_bld, fract, fract);
243
244 if (posOnly) {
245 /*
246 * Theoretically it's not quite 100% accurate because the spec says
247 * that ultimately a scaled coord of -x.0 should map to int coord
248 * -x + 1 with mirroring, not -x (this does not matter for bilinear
249 * filtering).
250 */
251 coord = lp_build_abs(coord_bld, coord);
252 /* kill off NaNs */
253 /* XXX: not safe without arch rounding, fract can be anything. */
254 coord = lp_build_max_ext(coord_bld, coord, coord_bld->zero,
255 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
256 }
257
258 return coord;
259 }
260
261
262 /**
263 * Helper to compute the first coord and the weight for
264 * linear wrap repeat npot textures
265 */
266 void
267 lp_build_coord_repeat_npot_linear(struct lp_build_sample_context *bld,
268 LLVMValueRef coord_f,
269 LLVMValueRef length_i,
270 LLVMValueRef length_f,
271 LLVMValueRef *coord0_i,
272 LLVMValueRef *weight_f)
273 {
274 struct lp_build_context *coord_bld = &bld->coord_bld;
275 struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
276 LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5);
277 LLVMValueRef length_minus_one = lp_build_sub(int_coord_bld, length_i,
278 int_coord_bld->one);
279 LLVMValueRef mask;
280 /* wrap with normalized floats is just fract */
281 coord_f = lp_build_fract(coord_bld, coord_f);
282 /* mul by size and subtract 0.5 */
283 coord_f = lp_build_mul(coord_bld, coord_f, length_f);
284 coord_f = lp_build_sub(coord_bld, coord_f, half);
285 /*
286 * we avoided the 0.5/length division before the repeat wrap,
287 * now need to fix up edge cases with selects
288 */
289 /*
290 * Note we do a float (unordered) compare so we can eliminate NaNs.
291 * (Otherwise would need fract_safe above).
292 */
293 mask = lp_build_compare(coord_bld->gallivm, coord_bld->type,
294 PIPE_FUNC_LESS, coord_f, coord_bld->zero);
295
296 /* convert to int, compute lerp weight */
297 lp_build_ifloor_fract(coord_bld, coord_f, coord0_i, weight_f);
298 *coord0_i = lp_build_select(int_coord_bld, mask, length_minus_one, *coord0_i);
299 }
300
301
302 /**
303 * Build LLVM code for texture wrap mode for linear filtering.
304 * \param x0_out returns first integer texcoord
305 * \param x1_out returns second integer texcoord
306 * \param weight_out returns linear interpolation weight
307 */
static void
lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
                            boolean is_gather,
                            LLVMValueRef coord,
                            LLVMValueRef length,
                            LLVMValueRef length_f,
                            LLVMValueRef offset,
                            boolean is_pot,
                            unsigned wrap_mode,
                            LLVMValueRef *x0_out,
                            LLVMValueRef *x1_out,
                            LLVMValueRef *weight_out)
{
   struct lp_build_context *coord_bld = &bld->coord_bld;
   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5);
   LLVMValueRef length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);
   LLVMValueRef coord0, coord1, weight;

   switch(wrap_mode) {
   case PIPE_TEX_WRAP_REPEAT:
      if (is_pot) {
         /* mul by size and subtract 0.5 */
         coord = lp_build_mul(coord_bld, coord, length_f);
         coord = lp_build_sub(coord_bld, coord, half);
         if (offset) {
            /* pot path: texel offset applies in (unnormalized) texel space */
            offset = lp_build_int_to_float(coord_bld, offset);
            coord = lp_build_add(coord_bld, coord, offset);
         }
         /* convert to int, compute lerp weight */
         lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
         coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
         /* repeat wrap (pot size: AND with size - 1 wraps both coords) */
         coord0 = LLVMBuildAnd(builder, coord0, length_minus_one, "");
         coord1 = LLVMBuildAnd(builder, coord1, length_minus_one, "");
      }
      else {
         LLVMValueRef mask;
         if (offset) {
            /* npot path: offset must be applied in normalized coords */
            offset = lp_build_int_to_float(coord_bld, offset);
            offset = lp_build_div(coord_bld, offset, length_f);
            coord = lp_build_add(coord_bld, coord, offset);
         }
         lp_build_coord_repeat_npot_linear(bld, coord,
                                           length, length_f,
                                           &coord0, &weight);
         /* coord1 wraps to 0 when coord0 is the last texel: the mask is
          * all-zero exactly then, zeroing the incremented coord. */
         mask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type,
                                 PIPE_FUNC_NOTEQUAL, coord0, length_minus_one);
         coord1 = LLVMBuildAnd(builder,
                               lp_build_add(int_coord_bld, coord0, int_coord_bld->one),
                               mask, "");
      }
      break;

   case PIPE_TEX_WRAP_CLAMP:
      if (bld->static_sampler_state->normalized_coords) {
         /* scale coord to length */
         coord = lp_build_mul(coord_bld, coord, length_f);
      }
      if (offset) {
         offset = lp_build_int_to_float(coord_bld, offset);
         coord = lp_build_add(coord_bld, coord, offset);
      }

      /*
       * clamp to [0, length]
       *
       * Unlike some other wrap modes, this should be correct for gather
       * too. GL_CLAMP explicitly does this clamp on the coord prior to
       * actual wrapping (which is per sample).
       */
      coord = lp_build_clamp(coord_bld, coord, coord_bld->zero, length_f);

      coord = lp_build_sub(coord_bld, coord, half);

      /* convert to int, compute lerp weight */
      lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
      coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
      break;

   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
      {
         /* unsigned float context: coord is non-negative after the clamp */
         struct lp_build_context abs_coord_bld = bld->coord_bld;
         abs_coord_bld.type.sign = FALSE;

         if (bld->static_sampler_state->normalized_coords) {
            /* mul by tex size */
            coord = lp_build_mul(coord_bld, coord, length_f);
         }
         if (offset) {
            offset = lp_build_int_to_float(coord_bld, offset);
            coord = lp_build_add(coord_bld, coord, offset);
         }

         /* clamp to length max */
         coord = lp_build_min_ext(coord_bld, coord, length_f,
                                  GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
         if (!is_gather) {
            /* subtract 0.5 */
            coord = lp_build_sub(coord_bld, coord, half);
            /* clamp to [0, length - 0.5] */
            coord = lp_build_max(coord_bld, coord, coord_bld->zero);
            /* convert to int, compute lerp weight */
            lp_build_ifloor_fract(&abs_coord_bld, coord, &coord0, &weight);
            coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
         } else {
            /*
             * The non-gather path will end up with coords 0, 1 if coord was
             * smaller than 0.5 (with corresponding weight 0.0 so it doesn't
             * really matter what the second coord is). But for gather, we
             * really need to end up with coords 0, 0.
             */
            coord = lp_build_max(coord_bld, coord, coord_bld->zero);
            coord0 = lp_build_sub(coord_bld, coord, half);
            coord1 = lp_build_add(coord_bld, coord, half);
            /* Values range ([-0.5, length_f - 0.5], [0.5, length_f + 0.5] */
            coord0 = lp_build_itrunc(coord_bld, coord0);
            coord1 = lp_build_itrunc(coord_bld, coord1);
            /* gather path never uses the lerp weight */
            weight = coord_bld->undef;
         }
         /* coord1 = min(coord1, length-1) */
         coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
         break;
      }

   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
      if (bld->static_sampler_state->normalized_coords) {
         /* scale coord to length */
         coord = lp_build_mul(coord_bld, coord, length_f);
      }
      if (offset) {
         offset = lp_build_int_to_float(coord_bld, offset);
         coord = lp_build_add(coord_bld, coord, offset);
      }
      /*
       * We don't need any clamp. Technically, for very large (pos or neg)
       * (or infinite) values, clamp against [-length, length] would be
       * correct, but we don't need to guarantee any specific
       * result for such coords (the ifloor will be undefined, but for modes
       * requiring border all resulting coords are safe).
       */
      coord = lp_build_sub(coord_bld, coord, half);
      /* convert to int, compute lerp weight */
      lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
      coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
      break;

   case PIPE_TEX_WRAP_MIRROR_REPEAT:
      if (offset) {
         /* offset is applied pre-mirror, in normalized coords */
         offset = lp_build_int_to_float(coord_bld, offset);
         offset = lp_build_div(coord_bld, offset, length_f);
         coord = lp_build_add(coord_bld, coord, offset);
      }
      if (!is_gather) {
         /* compute mirror function */
         coord = lp_build_coord_mirror(bld, coord, TRUE);

         /* scale coord to length */
         coord = lp_build_mul(coord_bld, coord, length_f);
         coord = lp_build_sub(coord_bld, coord, half);

         /* convert to int, compute lerp weight */
         lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
         coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);

         /* coord0 = max(coord0, 0) */
         coord0 = lp_build_max(int_coord_bld, coord0, int_coord_bld->zero);
         /* coord1 = min(coord1, length-1) */
         coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
      } else {
         /*
          * This is pretty reasonable in the end, all what the tests care
          * about is nasty edge cases (scaled coords x.5, so the individual
          * coords are actually integers, which is REALLY tricky to get right
          * due to this working differently both for negative numbers as well
          * as for even/odd cases). But with enough magic it's not too complex
          * after all.
          * Maybe should try a bit arithmetic one though for POT textures...
          */
         LLVMValueRef isNeg;
         /*
          * Wrapping just once still works, even though it means we can
          * get "wrong" sign due to performing mirror in the middle of the
          * two coords (because this can only happen very near the odd/even
          * edges, so both coords will actually end up as 0 or length - 1
          * in the end).
          * For GL4 gather with per-sample offsets we'd need to the mirroring
          * per coord too.
          */
         coord = lp_build_coord_mirror(bld, coord, FALSE);
         coord = lp_build_mul(coord_bld, coord, length_f);

         /*
          * NaNs should be safe here, we'll do away with them with
          * the ones' complement plus min.
          */
         coord0 = lp_build_sub(coord_bld, coord, half);
         coord0 = lp_build_ifloor(coord_bld, coord0);
         coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
         /* ones complement for neg numbers (mirror(negX) = X - 1) */
         isNeg = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS,
                              coord0, int_coord_bld->zero);
         coord0 = lp_build_xor(int_coord_bld, coord0, isNeg);
         isNeg = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS,
                              coord1, int_coord_bld->zero);
         coord1 = lp_build_xor(int_coord_bld, coord1, isNeg);
         coord0 = lp_build_min(int_coord_bld, coord0, length_minus_one);
         coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);

         /* gather path never uses the lerp weight */
         weight = coord_bld->undef;
      }
      break;

   case PIPE_TEX_WRAP_MIRROR_CLAMP:
      if (bld->static_sampler_state->normalized_coords) {
         /* scale coord to length */
         coord = lp_build_mul(coord_bld, coord, length_f);
      }
      if (offset) {
         offset = lp_build_int_to_float(coord_bld, offset);
         coord = lp_build_add(coord_bld, coord, offset);
      }
      /*
       * XXX: probably not correct for gather, albeit I'm not
       * entirely sure as it's poorly specified. The wrapping looks
       * correct according to the spec which is against gl 1.2.1,
       * however negative values will be swapped - gl re-specified
       * wrapping with newer versions (no more pre-clamp except with
       * GL_CLAMP).
       */
      coord = lp_build_abs(coord_bld, coord);

      /* clamp to [0, length] */
      coord = lp_build_min_ext(coord_bld, coord, length_f,
                               GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);

      coord = lp_build_sub(coord_bld, coord, half);

      /* convert to int, compute lerp weight */
      lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
      coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
      break;

   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
      {
         /* unsigned float context: coord is non-negative after abs+clamp */
         struct lp_build_context abs_coord_bld = bld->coord_bld;
         abs_coord_bld.type.sign = FALSE;

         if (bld->static_sampler_state->normalized_coords) {
            /* scale coord to length */
            coord = lp_build_mul(coord_bld, coord, length_f);
         }
         if (offset) {
            offset = lp_build_int_to_float(coord_bld, offset);
            coord = lp_build_add(coord_bld, coord, offset);
         }
         if (!is_gather) {
            coord = lp_build_abs(coord_bld, coord);

            /* clamp to length max */
            coord = lp_build_min_ext(coord_bld, coord, length_f,
                                     GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
            /* subtract 0.5 */
            coord = lp_build_sub(coord_bld, coord, half);
            /* clamp to [0, length - 0.5] */
            coord = lp_build_max(coord_bld, coord, coord_bld->zero);

            /* convert to int, compute lerp weight */
            lp_build_ifloor_fract(&abs_coord_bld, coord, &coord0, &weight);
            coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
            /* coord1 = min(coord1, length-1) */
            coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
         } else {
            /*
             * The non-gather path will swap coord0/1 if coord was negative,
             * which is ok for filtering since the filter weight matches
             * accordingly. Also, if coord is close to zero, coord0/1 will
             * be 0 and 1, instead of 0 and 0 (again ok due to filter
             * weight being 0.0). Both issues need to be fixed for gather.
             */
            LLVMValueRef isNeg;

            /*
             * Actually wanted to cheat here and use:
             * coord1 = lp_build_iround(coord_bld, coord);
             * but it's not good enough for some tests (even piglit
             * textureGather is set up in a way so the coords area always
             * .5, that is right at the crossover points).
             * So do ordinary sub/floor, then do ones' complement
             * for negative numbers.
             * (Note can't just do sub|add/abs/itrunc per coord neither -
             * because the spec demands that mirror(3.0) = 3 but
             * mirror(-3.0) = 2.)
             */
            coord = lp_build_sub(coord_bld, coord, half);
            coord0 = lp_build_ifloor(coord_bld, coord);
            coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
            isNeg = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, coord0,
                                 int_coord_bld->zero);
            coord0 = lp_build_xor(int_coord_bld, isNeg, coord0);
            coord0 = lp_build_min(int_coord_bld, coord0, length_minus_one);

            isNeg = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, coord1,
                                 int_coord_bld->zero);
            coord1 = lp_build_xor(int_coord_bld, isNeg, coord1);
            coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);

            /* gather path never uses the lerp weight */
            weight = coord_bld->undef;
         }
      }
      break;

   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
      {
         if (bld->static_sampler_state->normalized_coords) {
            /* scale coord to length */
            coord = lp_build_mul(coord_bld, coord, length_f);
         }
         if (offset) {
            offset = lp_build_int_to_float(coord_bld, offset);
            coord = lp_build_add(coord_bld, coord, offset);
         }
         /*
          * XXX: probably not correct for gather due to swapped
          * order if coord is negative (same rationale as for
          * MIRROR_CLAMP).
          */
         coord = lp_build_abs(coord_bld, coord);

         /*
          * We don't need any clamp. Technically, for very large
          * (or infinite) values, clamp against length would be
          * correct, but we don't need to guarantee any specific
          * result for such coords (the ifloor will be undefined, but
          * for modes requiring border all resulting coords are safe).
          */
         coord = lp_build_sub(coord_bld, coord, half);

         /* convert to int, compute lerp weight */
         lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
         coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
      }
      break;

   default:
      /* unknown wrap mode - outputs are NULL, callers must not reach this */
      assert(0);
      coord0 = NULL;
      coord1 = NULL;
      weight = NULL;
   }

   *x0_out = coord0;
   *x1_out = coord1;
   *weight_out = weight;
}
664
665
666 /**
667 * Build LLVM code for texture wrap mode for nearest filtering.
668 * \param coord the incoming texcoord (nominally in [0,1])
669 * \param length the texture size along one dimension, as int vector
670 * \param length_f the texture size along one dimension, as float vector
671 * \param offset texel offset along one dimension (as int vector)
672 * \param is_pot if TRUE, length is a power of two
673 * \param wrap_mode one of PIPE_TEX_WRAP_x
674 */
static LLVMValueRef
lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld,
                             LLVMValueRef coord,
                             LLVMValueRef length,
                             LLVMValueRef length_f,
                             LLVMValueRef offset,
                             boolean is_pot,
                             unsigned wrap_mode)
{
   struct lp_build_context *coord_bld = &bld->coord_bld;
   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);
   LLVMValueRef icoord;

   switch(wrap_mode) {
   case PIPE_TEX_WRAP_REPEAT:
      if (is_pot) {
         /* pot size: repeat wrap is a bitwise AND with size - 1 */
         coord = lp_build_mul(coord_bld, coord, length_f);
         icoord = lp_build_ifloor(coord_bld, coord);
         if (offset) {
            icoord = lp_build_add(int_coord_bld, icoord, offset);
         }
         icoord = LLVMBuildAnd(builder, icoord, length_minus_one, "");
      }
      else {
         if (offset) {
            /* npot: texel offset must be applied in normalized coords */
            offset = lp_build_int_to_float(coord_bld, offset);
            offset = lp_build_div(coord_bld, offset, length_f);
            coord = lp_build_add(coord_bld, coord, offset);
         }
         /* take fraction, unnormalize */
         coord = lp_build_fract_safe(coord_bld, coord);
         coord = lp_build_mul(coord_bld, coord, length_f);
         icoord = lp_build_itrunc(coord_bld, coord);
      }
      break;

   case PIPE_TEX_WRAP_CLAMP:
   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
      if (bld->static_sampler_state->normalized_coords) {
         /* scale coord to length */
         coord = lp_build_mul(coord_bld, coord, length_f);
      }

      if (offset) {
         offset = lp_build_int_to_float(coord_bld, offset);
         coord = lp_build_add(coord_bld, coord, offset);
      }
      /* floor */
      /* use itrunc instead since we clamp to 0 anyway */
      icoord = lp_build_itrunc(coord_bld, coord);

      /* clamp to [0, length - 1]. */
      icoord = lp_build_clamp(int_coord_bld, icoord, int_coord_bld->zero,
                              length_minus_one);
      break;

   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
      if (bld->static_sampler_state->normalized_coords) {
         /* scale coord to length */
         coord = lp_build_mul(coord_bld, coord, length_f);
      }
      /* no clamp necessary, border masking will handle this */
      icoord = lp_build_ifloor(coord_bld, coord);
      if (offset) {
         icoord = lp_build_add(int_coord_bld, icoord, offset);
      }
      break;

   case PIPE_TEX_WRAP_MIRROR_REPEAT:
      if (offset) {
         /* offset is applied pre-mirror, in normalized coords */
         offset = lp_build_int_to_float(coord_bld, offset);
         offset = lp_build_div(coord_bld, offset, length_f);
         coord = lp_build_add(coord_bld, coord, offset);
      }
      /* compute mirror function (result is non-negative) */
      coord = lp_build_coord_mirror(bld, coord, TRUE);

      /* scale coord to length */
      assert(bld->static_sampler_state->normalized_coords);
      coord = lp_build_mul(coord_bld, coord, length_f);

      /* itrunc == ifloor here */
      icoord = lp_build_itrunc(coord_bld, coord);

      /* clamp to [0, length - 1] */
      icoord = lp_build_min(int_coord_bld, icoord, length_minus_one);
      break;

   case PIPE_TEX_WRAP_MIRROR_CLAMP:
   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
      if (bld->static_sampler_state->normalized_coords) {
         /* scale coord to length */
         coord = lp_build_mul(coord_bld, coord, length_f);
      }
      if (offset) {
         offset = lp_build_int_to_float(coord_bld, offset);
         coord = lp_build_add(coord_bld, coord, offset);
      }
      coord = lp_build_abs(coord_bld, coord);

      /* itrunc == ifloor here */
      icoord = lp_build_itrunc(coord_bld, coord);
      /*
       * Use unsigned min due to possible undef values (NaNs, overflow)
       */
      {
         struct lp_build_context abs_coord_bld = *int_coord_bld;
         abs_coord_bld.type.sign = FALSE;
         /* clamp to [0, length - 1] */
         icoord = lp_build_min(&abs_coord_bld, icoord, length_minus_one);
      }
      break;

   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
      if (bld->static_sampler_state->normalized_coords) {
         /* scale coord to length */
         coord = lp_build_mul(coord_bld, coord, length_f);
      }
      if (offset) {
         offset = lp_build_int_to_float(coord_bld, offset);
         coord = lp_build_add(coord_bld, coord, offset);
      }
      coord = lp_build_abs(coord_bld, coord);

      /* itrunc == ifloor here; border masking handles out-of-range */
      icoord = lp_build_itrunc(coord_bld, coord);
      break;

   default:
      /* unknown wrap mode - returns NULL, callers must not reach this */
      assert(0);
      icoord = NULL;
   }

   return icoord;
}
812
813
814 /**
815 * Do shadow test/comparison.
816 * \param p shadow ref value
817 * \param texel the texel to compare against
818 */
819 static LLVMValueRef
820 lp_build_sample_comparefunc(struct lp_build_sample_context *bld,
821 LLVMValueRef p,
822 LLVMValueRef texel)
823 {
824 struct lp_build_context *texel_bld = &bld->texel_bld;
825 LLVMValueRef res;
826
827 if (0) {
828 //lp_build_print_value(bld->gallivm, "shadow cmp coord", p);
829 lp_build_print_value(bld->gallivm, "shadow cmp texel", texel);
830 }
831
832 /* result = (p FUNC texel) ? 1 : 0 */
833 /*
834 * honor d3d10 floating point rules here, which state that comparisons
835 * are ordered except NOT_EQUAL which is unordered.
836 */
837 if (bld->static_sampler_state->compare_func != PIPE_FUNC_NOTEQUAL) {
838 res = lp_build_cmp_ordered(texel_bld, bld->static_sampler_state->compare_func,
839 p, texel);
840 }
841 else {
842 res = lp_build_cmp(texel_bld, bld->static_sampler_state->compare_func,
843 p, texel);
844 }
845 return res;
846 }
847
848
849 /**
850 * Generate code to sample a mipmap level with nearest filtering.
851 * If sampling a cube texture, r = cube face in [0,5].
852 */
static void
lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
                              LLVMValueRef size,
                              LLVMValueRef row_stride_vec,
                              LLVMValueRef img_stride_vec,
                              LLVMValueRef data_ptr,
                              LLVMValueRef mipoffsets,
                              const LLVMValueRef *coords,
                              const LLVMValueRef *offsets,
                              LLVMValueRef colors_out[4])
{
   const unsigned dims = bld->dims;
   LLVMValueRef width_vec;
   LLVMValueRef height_vec;
   LLVMValueRef depth_vec;
   LLVMValueRef flt_size;
   LLVMValueRef flt_width_vec;
   LLVMValueRef flt_height_vec;
   LLVMValueRef flt_depth_vec;
   LLVMValueRef x, y = NULL, z = NULL;

   /* integer level size, broadcast to per-element int coord vectors */
   lp_build_extract_image_sizes(bld,
                                &bld->int_size_bld,
                                bld->int_coord_type,
                                size,
                                &width_vec, &height_vec, &depth_vec);

   /* float copy of the sizes, used by the wrap code for normalized coords */
   flt_size = lp_build_int_to_float(&bld->float_size_bld, size);

   lp_build_extract_image_sizes(bld,
                                &bld->float_size_bld,
                                bld->coord_type,
                                flt_size,
                                &flt_width_vec, &flt_height_vec, &flt_depth_vec);

   /*
    * Compute integer texcoords.
    */
   /* apply wrap mode (and texel offset) per dimension; each wrap call
    * yields the single nearest texel coordinate for that axis */
   x = lp_build_sample_wrap_nearest(bld, coords[0], width_vec,
                                    flt_width_vec, offsets[0],
                                    bld->static_texture_state->pot_width,
                                    bld->static_sampler_state->wrap_s);
   lp_build_name(x, "tex.x.wrapped");

   if (dims >= 2) {
      y = lp_build_sample_wrap_nearest(bld, coords[1], height_vec,
                                       flt_height_vec, offsets[1],
                                       bld->static_texture_state->pot_height,
                                       bld->static_sampler_state->wrap_t);
      lp_build_name(y, "tex.y.wrapped");

      if (dims == 3) {
         z = lp_build_sample_wrap_nearest(bld, coords[2], depth_vec,
                                          flt_depth_vec, offsets[2],
                                          bld->static_texture_state->pot_depth,
                                          bld->static_sampler_state->wrap_r);
         lp_build_name(z, "tex.z.wrapped");
      }
   }
   /* array/cube targets carry the layer (or face) in the z slot; it is
    * not wrapped, just used as-is for addressing */
   if (has_layer_coord(bld->static_texture_state->target)) {
      if (bld->static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) {
         /* add cube layer to face */
         z = lp_build_add(&bld->int_coord_bld, coords[2], coords[3]);
      }
      else {
         z = coords[2];
      }
      lp_build_name(z, "tex.z.layer");
   }

   /*
    * Get texture colors.
    */
   lp_build_sample_texel_soa(bld,
                             width_vec, height_vec, depth_vec,
                             x, y, z,
                             row_stride_vec, img_stride_vec,
                             data_ptr, mipoffsets, colors_out);

   /* depth compare: replace the fetched value with 0.0/1.0 per the
    * compare func against the shadow reference coord (coords[4]) */
   if (bld->static_sampler_state->compare_mode != PIPE_TEX_COMPARE_NONE) {
      LLVMValueRef cmpval;
      cmpval = lp_build_sample_comparefunc(bld, coords[4], colors_out[0]);
      /* this is really just a AND 1.0, cmpval but llvm is clever enough */
      colors_out[0] = lp_build_select(&bld->texel_bld, cmpval,
                                      bld->texel_bld.one, bld->texel_bld.zero);
      colors_out[1] = colors_out[2] = colors_out[3] = colors_out[0];
   }

}
942
943
944 /**
945 * Like a lerp, but inputs are 0/~0 masks, so can simplify slightly.
946 */
947 static LLVMValueRef
948 lp_build_masklerp(struct lp_build_context *bld,
949 LLVMValueRef weight,
950 LLVMValueRef mask0,
951 LLVMValueRef mask1)
952 {
953 struct gallivm_state *gallivm = bld->gallivm;
954 LLVMBuilderRef builder = gallivm->builder;
955 LLVMValueRef weight2;
956
957 weight2 = lp_build_sub(bld, bld->one, weight);
958 weight = LLVMBuildBitCast(builder, weight,
959 lp_build_int_vec_type(gallivm, bld->type), "");
960 weight2 = LLVMBuildBitCast(builder, weight2,
961 lp_build_int_vec_type(gallivm, bld->type), "");
962 weight = LLVMBuildAnd(builder, weight, mask1, "");
963 weight2 = LLVMBuildAnd(builder, weight2, mask0, "");
964 weight = LLVMBuildBitCast(builder, weight, bld->vec_type, "");
965 weight2 = LLVMBuildBitCast(builder, weight2, bld->vec_type, "");
966 return lp_build_add(bld, weight, weight2);
967 }
968
969 /**
970 * Like a 2d lerp, but inputs are 0/~0 masks, so can simplify slightly.
971 */
972 static LLVMValueRef
973 lp_build_masklerp2d(struct lp_build_context *bld,
974 LLVMValueRef weight0,
975 LLVMValueRef weight1,
976 LLVMValueRef mask00,
977 LLVMValueRef mask01,
978 LLVMValueRef mask10,
979 LLVMValueRef mask11)
980 {
981 LLVMValueRef val0 = lp_build_masklerp(bld, weight0, mask00, mask01);
982 LLVMValueRef val1 = lp_build_masklerp(bld, weight0, mask10, mask11);
983 return lp_build_lerp(bld, weight1, val0, val1, 0);
984 }
985
/*
 * This is rather a lot of code for something OpenGL merely recommends
 * but does not require.
 */
990 #define ACCURATE_CUBE_CORNERS 1
991
992 /**
993 * Generate code to sample a mipmap level with linear filtering.
994 * If sampling a cube texture, r = cube face in [0,5].
995 * If linear_mask is present, only pixels having their mask set
996 * will receive linear filtering, the rest will use nearest.
997 */
998 static void
999 lp_build_sample_image_linear(struct lp_build_sample_context *bld,
1000 boolean is_gather,
1001 LLVMValueRef size,
1002 LLVMValueRef linear_mask,
1003 LLVMValueRef row_stride_vec,
1004 LLVMValueRef img_stride_vec,
1005 LLVMValueRef data_ptr,
1006 LLVMValueRef mipoffsets,
1007 const LLVMValueRef *coords,
1008 const LLVMValueRef *offsets,
1009 LLVMValueRef colors_out[4])
1010 {
1011 LLVMBuilderRef builder = bld->gallivm->builder;
1012 struct lp_build_context *ivec_bld = &bld->int_coord_bld;
1013 struct lp_build_context *coord_bld = &bld->coord_bld;
1014 struct lp_build_context *texel_bld = &bld->texel_bld;
1015 const unsigned dims = bld->dims;
1016 LLVMValueRef width_vec;
1017 LLVMValueRef height_vec;
1018 LLVMValueRef depth_vec;
1019 LLVMValueRef flt_size;
1020 LLVMValueRef flt_width_vec;
1021 LLVMValueRef flt_height_vec;
1022 LLVMValueRef flt_depth_vec;
1023 LLVMValueRef fall_off[4], have_corners;
1024 LLVMValueRef z1 = NULL;
1025 LLVMValueRef z00 = NULL, z01 = NULL, z10 = NULL, z11 = NULL;
1026 LLVMValueRef x00 = NULL, x01 = NULL, x10 = NULL, x11 = NULL;
1027 LLVMValueRef y00 = NULL, y01 = NULL, y10 = NULL, y11 = NULL;
1028 LLVMValueRef s_fpart, t_fpart = NULL, r_fpart = NULL;
1029 LLVMValueRef xs[4], ys[4], zs[4];
1030 LLVMValueRef neighbors[2][2][4];
1031 int chan, texel_index;
1032 boolean seamless_cube_filter, accurate_cube_corners;
1033 unsigned chan_swiz = bld->static_texture_state->swizzle_r;
1034
1035 seamless_cube_filter = (bld->static_texture_state->target == PIPE_TEXTURE_CUBE ||
1036 bld->static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) &&
1037 bld->static_sampler_state->seamless_cube_map;
1038
1039 accurate_cube_corners = ACCURATE_CUBE_CORNERS && seamless_cube_filter;
1040
1041 lp_build_extract_image_sizes(bld,
1042 &bld->int_size_bld,
1043 bld->int_coord_type,
1044 size,
1045 &width_vec, &height_vec, &depth_vec);
1046
1047 flt_size = lp_build_int_to_float(&bld->float_size_bld, size);
1048
1049 lp_build_extract_image_sizes(bld,
1050 &bld->float_size_bld,
1051 bld->coord_type,
1052 flt_size,
1053 &flt_width_vec, &flt_height_vec, &flt_depth_vec);
1054
1055 /*
1056 * Compute integer texcoords.
1057 */
1058
1059 if (!seamless_cube_filter) {
1060 lp_build_sample_wrap_linear(bld, is_gather, coords[0], width_vec,
1061 flt_width_vec, offsets[0],
1062 bld->static_texture_state->pot_width,
1063 bld->static_sampler_state->wrap_s,
1064 &x00, &x01, &s_fpart);
1065 lp_build_name(x00, "tex.x0.wrapped");
1066 lp_build_name(x01, "tex.x1.wrapped");
1067 x10 = x00;
1068 x11 = x01;
1069
1070 if (dims >= 2) {
1071 lp_build_sample_wrap_linear(bld, is_gather, coords[1], height_vec,
1072 flt_height_vec, offsets[1],
1073 bld->static_texture_state->pot_height,
1074 bld->static_sampler_state->wrap_t,
1075 &y00, &y10, &t_fpart);
1076 lp_build_name(y00, "tex.y0.wrapped");
1077 lp_build_name(y10, "tex.y1.wrapped");
1078 y01 = y00;
1079 y11 = y10;
1080
1081 if (dims == 3) {
1082 lp_build_sample_wrap_linear(bld, is_gather, coords[2], depth_vec,
1083 flt_depth_vec, offsets[2],
1084 bld->static_texture_state->pot_depth,
1085 bld->static_sampler_state->wrap_r,
1086 &z00, &z1, &r_fpart);
1087 z01 = z10 = z11 = z00;
1088 lp_build_name(z00, "tex.z0.wrapped");
1089 lp_build_name(z1, "tex.z1.wrapped");
1090 }
1091 }
1092 if (has_layer_coord(bld->static_texture_state->target)) {
1093 if (bld->static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) {
1094 /* add cube layer to face */
1095 z00 = z01 = z10 = z11 = z1 =
1096 lp_build_add(&bld->int_coord_bld, coords[2], coords[3]);
1097 }
1098 else {
1099 z00 = z01 = z10 = z11 = z1 = coords[2]; /* cube face or layer */
1100 }
1101 lp_build_name(z00, "tex.z0.layer");
1102 lp_build_name(z1, "tex.z1.layer");
1103 }
1104 }
1105 else {
1106 struct lp_build_if_state edge_if;
1107 LLVMTypeRef int1t;
1108 LLVMValueRef new_faces[4], new_xcoords[4][2], new_ycoords[4][2];
1109 LLVMValueRef coord0, coord1, have_edge, have_corner;
1110 LLVMValueRef fall_off_ym_notxm, fall_off_ym_notxp, fall_off_x, fall_off_y;
1111 LLVMValueRef fall_off_yp_notxm, fall_off_yp_notxp;
1112 LLVMValueRef x0, x1, y0, y1, y0_clamped, y1_clamped;
1113 LLVMValueRef face = coords[2];
1114 LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5f);
1115 LLVMValueRef length_minus_one = lp_build_sub(ivec_bld, width_vec, ivec_bld->one);
1116 /* XXX drop height calcs. Could (should) do this without seamless filtering too */
1117 height_vec = width_vec;
1118 flt_height_vec = flt_width_vec;
1119
1120 /* XXX the overflow logic is actually sort of duplicated with trilinear,
1121 * since an overflow in one mip should also have a corresponding overflow
1122 * in another.
1123 */
1124 /* should always have normalized coords, and offsets are undefined */
1125 assert(bld->static_sampler_state->normalized_coords);
1126 /*
1127 * The coords should all be between [0,1] however we can have NaNs,
1128 * which will wreak havoc. In particular the y1_clamped value below
1129 * can be -INT_MAX (on x86) and be propagated right through (probably
1130 * other values might be bogus in the end too).
1131 * So kill off the NaNs here.
1132 */
1133 coord0 = lp_build_max_ext(coord_bld, coords[0], coord_bld->zero,
1134 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1135 coord0 = lp_build_mul(coord_bld, coord0, flt_width_vec);
1136 /* instead of clamp, build mask if overflowed */
1137 coord0 = lp_build_sub(coord_bld, coord0, half);
1138 /* convert to int, compute lerp weight */
1139 /* not ideal with AVX (and no AVX2) */
1140 lp_build_ifloor_fract(coord_bld, coord0, &x0, &s_fpart);
1141 x1 = lp_build_add(ivec_bld, x0, ivec_bld->one);
1142 coord1 = lp_build_max_ext(coord_bld, coords[1], coord_bld->zero,
1143 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1144 coord1 = lp_build_mul(coord_bld, coord1, flt_height_vec);
1145 coord1 = lp_build_sub(coord_bld, coord1, half);
1146 lp_build_ifloor_fract(coord_bld, coord1, &y0, &t_fpart);
1147 y1 = lp_build_add(ivec_bld, y0, ivec_bld->one);
1148
1149 fall_off[0] = lp_build_cmp(ivec_bld, PIPE_FUNC_LESS, x0, ivec_bld->zero);
1150 fall_off[1] = lp_build_cmp(ivec_bld, PIPE_FUNC_GREATER, x1, length_minus_one);
1151 fall_off[2] = lp_build_cmp(ivec_bld, PIPE_FUNC_LESS, y0, ivec_bld->zero);
1152 fall_off[3] = lp_build_cmp(ivec_bld, PIPE_FUNC_GREATER, y1, length_minus_one);
1153
1154 fall_off_x = lp_build_or(ivec_bld, fall_off[0], fall_off[1]);
1155 fall_off_y = lp_build_or(ivec_bld, fall_off[2], fall_off[3]);
1156 have_edge = lp_build_or(ivec_bld, fall_off_x, fall_off_y);
1157 have_edge = lp_build_any_true_range(ivec_bld, ivec_bld->type.length, have_edge);
1158
1159 /* needed for accurate corner filtering branch later, rely on 0 init */
1160 int1t = LLVMInt1TypeInContext(bld->gallivm->context);
1161 have_corners = lp_build_alloca(bld->gallivm, int1t, "have_corner");
1162
1163 for (texel_index = 0; texel_index < 4; texel_index++) {
1164 xs[texel_index] = lp_build_alloca(bld->gallivm, ivec_bld->vec_type, "xs");
1165 ys[texel_index] = lp_build_alloca(bld->gallivm, ivec_bld->vec_type, "ys");
1166 zs[texel_index] = lp_build_alloca(bld->gallivm, ivec_bld->vec_type, "zs");
1167 }
1168
1169 lp_build_if(&edge_if, bld->gallivm, have_edge);
1170
1171 have_corner = lp_build_and(ivec_bld, fall_off_x, fall_off_y);
1172 have_corner = lp_build_any_true_range(ivec_bld, ivec_bld->type.length, have_corner);
1173 LLVMBuildStore(builder, have_corner, have_corners);
1174
1175 /*
1176 * Need to feed clamped values here for cheap corner handling,
1177 * but only for y coord (as when falling off both edges we only
1178 * fall off the x one) - this should be sufficient.
1179 */
1180 y0_clamped = lp_build_max(ivec_bld, y0, ivec_bld->zero);
1181 y1_clamped = lp_build_min(ivec_bld, y1, length_minus_one);
1182
1183 /*
1184 * Get all possible new coords.
1185 */
1186 lp_build_cube_new_coords(ivec_bld, face,
1187 x0, x1, y0_clamped, y1_clamped,
1188 length_minus_one,
1189 new_faces, new_xcoords, new_ycoords);
1190
1191 /* handle fall off x-, x+ direction */
1192 /* determine new coords, face (not both fall_off vars can be true at same time) */
1193 x00 = lp_build_select(ivec_bld, fall_off[0], new_xcoords[0][0], x0);
1194 y00 = lp_build_select(ivec_bld, fall_off[0], new_ycoords[0][0], y0_clamped);
1195 x10 = lp_build_select(ivec_bld, fall_off[0], new_xcoords[0][1], x0);
1196 y10 = lp_build_select(ivec_bld, fall_off[0], new_ycoords[0][1], y1_clamped);
1197 x01 = lp_build_select(ivec_bld, fall_off[1], new_xcoords[1][0], x1);
1198 y01 = lp_build_select(ivec_bld, fall_off[1], new_ycoords[1][0], y0_clamped);
1199 x11 = lp_build_select(ivec_bld, fall_off[1], new_xcoords[1][1], x1);
1200 y11 = lp_build_select(ivec_bld, fall_off[1], new_ycoords[1][1], y1_clamped);
1201
1202 z00 = z10 = lp_build_select(ivec_bld, fall_off[0], new_faces[0], face);
1203 z01 = z11 = lp_build_select(ivec_bld, fall_off[1], new_faces[1], face);
1204
1205 /* handle fall off y-, y+ direction */
1206 /*
1207 * Cheap corner logic: just hack up things so a texel doesn't fall
1208 * off both sides (which means filter weights will be wrong but we'll only
1209 * use valid texels in the filter).
1210 * This means however (y) coords must additionally be clamped (see above).
1211 * This corner handling should be fully OpenGL (but not d3d10) compliant.
1212 */
1213 fall_off_ym_notxm = lp_build_andnot(ivec_bld, fall_off[2], fall_off[0]);
1214 fall_off_ym_notxp = lp_build_andnot(ivec_bld, fall_off[2], fall_off[1]);
1215 fall_off_yp_notxm = lp_build_andnot(ivec_bld, fall_off[3], fall_off[0]);
1216 fall_off_yp_notxp = lp_build_andnot(ivec_bld, fall_off[3], fall_off[1]);
1217
1218 x00 = lp_build_select(ivec_bld, fall_off_ym_notxm, new_xcoords[2][0], x00);
1219 y00 = lp_build_select(ivec_bld, fall_off_ym_notxm, new_ycoords[2][0], y00);
1220 x01 = lp_build_select(ivec_bld, fall_off_ym_notxp, new_xcoords[2][1], x01);
1221 y01 = lp_build_select(ivec_bld, fall_off_ym_notxp, new_ycoords[2][1], y01);
1222 x10 = lp_build_select(ivec_bld, fall_off_yp_notxm, new_xcoords[3][0], x10);
1223 y10 = lp_build_select(ivec_bld, fall_off_yp_notxm, new_ycoords[3][0], y10);
1224 x11 = lp_build_select(ivec_bld, fall_off_yp_notxp, new_xcoords[3][1], x11);
1225 y11 = lp_build_select(ivec_bld, fall_off_yp_notxp, new_ycoords[3][1], y11);
1226
1227 z00 = lp_build_select(ivec_bld, fall_off_ym_notxm, new_faces[2], z00);
1228 z01 = lp_build_select(ivec_bld, fall_off_ym_notxp, new_faces[2], z01);
1229 z10 = lp_build_select(ivec_bld, fall_off_yp_notxm, new_faces[3], z10);
1230 z11 = lp_build_select(ivec_bld, fall_off_yp_notxp, new_faces[3], z11);
1231
1232 if (bld->static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) {
1233 /* now can add cube layer to face (per sample) */
1234 z00 = lp_build_add(ivec_bld, z00, coords[3]);
1235 z01 = lp_build_add(ivec_bld, z01, coords[3]);
1236 z10 = lp_build_add(ivec_bld, z10, coords[3]);
1237 z11 = lp_build_add(ivec_bld, z11, coords[3]);
1238 }
1239
1240 LLVMBuildStore(builder, x00, xs[0]);
1241 LLVMBuildStore(builder, x01, xs[1]);
1242 LLVMBuildStore(builder, x10, xs[2]);
1243 LLVMBuildStore(builder, x11, xs[3]);
1244 LLVMBuildStore(builder, y00, ys[0]);
1245 LLVMBuildStore(builder, y01, ys[1]);
1246 LLVMBuildStore(builder, y10, ys[2]);
1247 LLVMBuildStore(builder, y11, ys[3]);
1248 LLVMBuildStore(builder, z00, zs[0]);
1249 LLVMBuildStore(builder, z01, zs[1]);
1250 LLVMBuildStore(builder, z10, zs[2]);
1251 LLVMBuildStore(builder, z11, zs[3]);
1252
1253 lp_build_else(&edge_if);
1254
1255 LLVMBuildStore(builder, x0, xs[0]);
1256 LLVMBuildStore(builder, x1, xs[1]);
1257 LLVMBuildStore(builder, x0, xs[2]);
1258 LLVMBuildStore(builder, x1, xs[3]);
1259 LLVMBuildStore(builder, y0, ys[0]);
1260 LLVMBuildStore(builder, y0, ys[1]);
1261 LLVMBuildStore(builder, y1, ys[2]);
1262 LLVMBuildStore(builder, y1, ys[3]);
1263 if (bld->static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) {
1264 LLVMValueRef cube_layer = lp_build_add(ivec_bld, face, coords[3]);
1265 LLVMBuildStore(builder, cube_layer, zs[0]);
1266 LLVMBuildStore(builder, cube_layer, zs[1]);
1267 LLVMBuildStore(builder, cube_layer, zs[2]);
1268 LLVMBuildStore(builder, cube_layer, zs[3]);
1269 }
1270 else {
1271 LLVMBuildStore(builder, face, zs[0]);
1272 LLVMBuildStore(builder, face, zs[1]);
1273 LLVMBuildStore(builder, face, zs[2]);
1274 LLVMBuildStore(builder, face, zs[3]);
1275 }
1276
1277 lp_build_endif(&edge_if);
1278
1279 x00 = LLVMBuildLoad(builder, xs[0], "");
1280 x01 = LLVMBuildLoad(builder, xs[1], "");
1281 x10 = LLVMBuildLoad(builder, xs[2], "");
1282 x11 = LLVMBuildLoad(builder, xs[3], "");
1283 y00 = LLVMBuildLoad(builder, ys[0], "");
1284 y01 = LLVMBuildLoad(builder, ys[1], "");
1285 y10 = LLVMBuildLoad(builder, ys[2], "");
1286 y11 = LLVMBuildLoad(builder, ys[3], "");
1287 z00 = LLVMBuildLoad(builder, zs[0], "");
1288 z01 = LLVMBuildLoad(builder, zs[1], "");
1289 z10 = LLVMBuildLoad(builder, zs[2], "");
1290 z11 = LLVMBuildLoad(builder, zs[3], "");
1291 }
1292
1293 if (linear_mask) {
1294 /*
1295 * Whack filter weights into place. Whatever texel had more weight is
1296 * the one which should have been selected by nearest filtering hence
1297 * just use 100% weight for it.
1298 */
1299 struct lp_build_context *c_bld = &bld->coord_bld;
1300 LLVMValueRef w1_mask, w1_weight;
1301 LLVMValueRef half = lp_build_const_vec(bld->gallivm, c_bld->type, 0.5f);
1302
1303 w1_mask = lp_build_cmp(c_bld, PIPE_FUNC_GREATER, s_fpart, half);
1304 /* this select is really just a "and" */
1305 w1_weight = lp_build_select(c_bld, w1_mask, c_bld->one, c_bld->zero);
1306 s_fpart = lp_build_select(c_bld, linear_mask, s_fpart, w1_weight);
1307 if (dims >= 2) {
1308 w1_mask = lp_build_cmp(c_bld, PIPE_FUNC_GREATER, t_fpart, half);
1309 w1_weight = lp_build_select(c_bld, w1_mask, c_bld->one, c_bld->zero);
1310 t_fpart = lp_build_select(c_bld, linear_mask, t_fpart, w1_weight);
1311 if (dims == 3) {
1312 w1_mask = lp_build_cmp(c_bld, PIPE_FUNC_GREATER, r_fpart, half);
1313 w1_weight = lp_build_select(c_bld, w1_mask, c_bld->one, c_bld->zero);
1314 r_fpart = lp_build_select(c_bld, linear_mask, r_fpart, w1_weight);
1315 }
1316 }
1317 }
1318
1319 /*
1320 * Get texture colors.
1321 */
1322 /* get x0/x1 texels */
1323 lp_build_sample_texel_soa(bld,
1324 width_vec, height_vec, depth_vec,
1325 x00, y00, z00,
1326 row_stride_vec, img_stride_vec,
1327 data_ptr, mipoffsets, neighbors[0][0]);
1328 lp_build_sample_texel_soa(bld,
1329 width_vec, height_vec, depth_vec,
1330 x01, y01, z01,
1331 row_stride_vec, img_stride_vec,
1332 data_ptr, mipoffsets, neighbors[0][1]);
1333
1334 if (dims == 1) {
1335 assert(!is_gather);
1336 if (bld->static_sampler_state->compare_mode == PIPE_TEX_COMPARE_NONE) {
1337 /* Interpolate two samples from 1D image to produce one color */
1338 for (chan = 0; chan < 4; chan++) {
1339 colors_out[chan] = lp_build_lerp(texel_bld, s_fpart,
1340 neighbors[0][0][chan],
1341 neighbors[0][1][chan],
1342 0);
1343 }
1344 }
1345 else {
1346 LLVMValueRef cmpval0, cmpval1;
1347 cmpval0 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][0][0]);
1348 cmpval1 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][1][0]);
1349 /* simplified lerp, AND mask with weight and add */
1350 colors_out[0] = lp_build_masklerp(texel_bld, s_fpart,
1351 cmpval0, cmpval1);
1352 colors_out[1] = colors_out[2] = colors_out[3] = colors_out[0];
1353 }
1354 }
1355 else {
1356 /* 2D/3D texture */
1357 struct lp_build_if_state corner_if;
1358 LLVMValueRef colors0[4], colorss[4];
1359
1360 /* get x0/x1 texels at y1 */
1361 lp_build_sample_texel_soa(bld,
1362 width_vec, height_vec, depth_vec,
1363 x10, y10, z10,
1364 row_stride_vec, img_stride_vec,
1365 data_ptr, mipoffsets, neighbors[1][0]);
1366 lp_build_sample_texel_soa(bld,
1367 width_vec, height_vec, depth_vec,
1368 x11, y11, z11,
1369 row_stride_vec, img_stride_vec,
1370 data_ptr, mipoffsets, neighbors[1][1]);
1371
1372 /*
1373 * To avoid having to duplicate linear_mask / fetch code use
1374 * another branch (with corner condition though edge would work
1375 * as well) here.
1376 */
1377 if (accurate_cube_corners) {
1378 LLVMValueRef c00, c01, c10, c11, c00f, c01f, c10f, c11f;
1379 LLVMValueRef have_corner, one_third;
1380
1381 colorss[0] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs0");
1382 colorss[1] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs1");
1383 colorss[2] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs2");
1384 colorss[3] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs3");
1385
1386 have_corner = LLVMBuildLoad(builder, have_corners, "");
1387
1388 lp_build_if(&corner_if, bld->gallivm, have_corner);
1389
1390 one_third = lp_build_const_vec(bld->gallivm, coord_bld->type,
1391 1.0f/3.0f);
1392
1393 /* find corner */
1394 c00 = lp_build_and(ivec_bld, fall_off[0], fall_off[2]);
1395 c00f = LLVMBuildBitCast(builder, c00, coord_bld->vec_type, "");
1396 c01 = lp_build_and(ivec_bld, fall_off[1], fall_off[2]);
1397 c01f = LLVMBuildBitCast(builder, c01, coord_bld->vec_type, "");
1398 c10 = lp_build_and(ivec_bld, fall_off[0], fall_off[3]);
1399 c10f = LLVMBuildBitCast(builder, c10, coord_bld->vec_type, "");
1400 c11 = lp_build_and(ivec_bld, fall_off[1], fall_off[3]);
1401 c11f = LLVMBuildBitCast(builder, c11, coord_bld->vec_type, "");
1402
1403 if (!is_gather) {
1404 /*
1405 * we can't use standard 2d lerp as we need per-element weight
1406 * in case of corners, so just calculate bilinear result as
1407 * w00*s00 + w01*s01 + w10*s10 + w11*s11.
1408 * (This is actually less work than using 2d lerp, 7 vs. 9
1409 * instructions, however calculating the weights needs another 6,
1410 * so actually probably not slower than 2d lerp only for 4 channels
1411 * as weights only need to be calculated once - of course fixing
1412 * the weights has additional cost.)
1413 */
1414 LLVMValueRef w00, w01, w10, w11, wx0, wy0, c_weight, tmp;
1415 wx0 = lp_build_sub(coord_bld, coord_bld->one, s_fpart);
1416 wy0 = lp_build_sub(coord_bld, coord_bld->one, t_fpart);
1417 w00 = lp_build_mul(coord_bld, wx0, wy0);
1418 w01 = lp_build_mul(coord_bld, s_fpart, wy0);
1419 w10 = lp_build_mul(coord_bld, wx0, t_fpart);
1420 w11 = lp_build_mul(coord_bld, s_fpart, t_fpart);
1421
1422 /* find corner weight */
1423 c_weight = lp_build_select(coord_bld, c00, w00, coord_bld->zero);
1424 c_weight = lp_build_select(coord_bld, c01, w01, c_weight);
1425 c_weight = lp_build_select(coord_bld, c10, w10, c_weight);
1426 c_weight = lp_build_select(coord_bld, c11, w11, c_weight);
1427
1428 /*
1429 * add 1/3 of the corner weight to the weight of the 3 other
1430 * samples and null out corner weight.
1431 */
1432 c_weight = lp_build_mul(coord_bld, c_weight, one_third);
1433 w00 = lp_build_add(coord_bld, w00, c_weight);
1434 w00 = lp_build_andnot(coord_bld, w00, c00f);
1435 w01 = lp_build_add(coord_bld, w01, c_weight);
1436 w01 = lp_build_andnot(coord_bld, w01, c01f);
1437 w10 = lp_build_add(coord_bld, w10, c_weight);
1438 w10 = lp_build_andnot(coord_bld, w10, c10f);
1439 w11 = lp_build_add(coord_bld, w11, c_weight);
1440 w11 = lp_build_andnot(coord_bld, w11, c11f);
1441
1442 if (bld->static_sampler_state->compare_mode ==
1443 PIPE_TEX_COMPARE_NONE) {
1444 for (chan = 0; chan < 4; chan++) {
1445 colors0[chan] = lp_build_mul(coord_bld, w00,
1446 neighbors[0][0][chan]);
1447 tmp = lp_build_mul(coord_bld, w01, neighbors[0][1][chan]);
1448 colors0[chan] = lp_build_add(coord_bld, tmp, colors0[chan]);
1449 tmp = lp_build_mul(coord_bld, w10, neighbors[1][0][chan]);
1450 colors0[chan] = lp_build_add(coord_bld, tmp, colors0[chan]);
1451 tmp = lp_build_mul(coord_bld, w11, neighbors[1][1][chan]);
1452 colors0[chan] = lp_build_add(coord_bld, tmp, colors0[chan]);
1453 }
1454 }
1455 else {
1456 LLVMValueRef cmpval00, cmpval01, cmpval10, cmpval11;
1457 cmpval00 = lp_build_sample_comparefunc(bld, coords[4],
1458 neighbors[0][0][0]);
1459 cmpval01 = lp_build_sample_comparefunc(bld, coords[4],
1460 neighbors[0][1][0]);
1461 cmpval10 = lp_build_sample_comparefunc(bld, coords[4],
1462 neighbors[1][0][0]);
1463 cmpval11 = lp_build_sample_comparefunc(bld, coords[4],
1464 neighbors[1][1][0]);
1465 /*
1466 * inputs to interpolation are just masks so just add
1467 * masked weights together
1468 */
1469 cmpval00 = LLVMBuildBitCast(builder, cmpval00,
1470 coord_bld->vec_type, "");
1471 cmpval01 = LLVMBuildBitCast(builder, cmpval01,
1472 coord_bld->vec_type, "");
1473 cmpval10 = LLVMBuildBitCast(builder, cmpval10,
1474 coord_bld->vec_type, "");
1475 cmpval11 = LLVMBuildBitCast(builder, cmpval11,
1476 coord_bld->vec_type, "");
1477 colors0[0] = lp_build_and(coord_bld, w00, cmpval00);
1478 tmp = lp_build_and(coord_bld, w01, cmpval01);
1479 colors0[0] = lp_build_add(coord_bld, tmp, colors0[0]);
1480 tmp = lp_build_and(coord_bld, w10, cmpval10);
1481 colors0[0] = lp_build_add(coord_bld, tmp, colors0[0]);
1482 tmp = lp_build_and(coord_bld, w11, cmpval11);
1483 colors0[0] = lp_build_add(coord_bld, tmp, colors0[0]);
1484 colors0[1] = colors0[2] = colors0[3] = colors0[0];
1485 }
1486 }
1487 else {
1488 /*
1489 * We don't have any weights to adjust, so instead calculate
1490 * the fourth texel as simply the average of the other 3.
1491 * (This would work for non-gather too, however we'd have
1492 * a boatload more of the select stuff due to there being
1493 * 4 times as many colors as weights.)
1494 */
1495 LLVMValueRef col00, col01, col10, col11;
1496 LLVMValueRef colc, colc0, colc1;
1497 col10 = lp_build_swizzle_soa_channel(texel_bld,
1498 neighbors[1][0], chan_swiz);
1499 col11 = lp_build_swizzle_soa_channel(texel_bld,
1500 neighbors[1][1], chan_swiz);
1501 col01 = lp_build_swizzle_soa_channel(texel_bld,
1502 neighbors[0][1], chan_swiz);
1503 col00 = lp_build_swizzle_soa_channel(texel_bld,
1504 neighbors[0][0], chan_swiz);
1505
1506 /*
1507 * The spec says for comparison filtering, the comparison
1508 * must happen before synthesizing the new value.
1509 * This means all gathered values are always 0 or 1,
1510 * except for the non-existing texel, which can be 0,1/3,2/3,1...
1511 * Seems like we'd be allowed to just return 0 or 1 too, so we
1512 * could simplify and pass down the compare mask values to the
1513 * end (using int arithmetic/compare on the mask values to
1514 * construct the fourth texel) and only there convert to floats
1515 * but it's probably not worth it (it might be easier for the cpu
1516 * but not for the code)...
1517 */
1518 if (bld->static_sampler_state->compare_mode !=
1519 PIPE_TEX_COMPARE_NONE) {
1520 LLVMValueRef cmpval00, cmpval01, cmpval10, cmpval11;
1521 cmpval00 = lp_build_sample_comparefunc(bld, coords[4], col00);
1522 cmpval01 = lp_build_sample_comparefunc(bld, coords[4], col01);
1523 cmpval10 = lp_build_sample_comparefunc(bld, coords[4], col10);
1524 cmpval11 = lp_build_sample_comparefunc(bld, coords[4], col11);
1525 col00 = lp_build_select(texel_bld, cmpval00,
1526 texel_bld->one, texel_bld->zero);
1527 col01 = lp_build_select(texel_bld, cmpval01,
1528 texel_bld->one, texel_bld->zero);
1529 col10 = lp_build_select(texel_bld, cmpval10,
1530 texel_bld->one, texel_bld->zero);
1531 col11 = lp_build_select(texel_bld, cmpval11,
1532 texel_bld->one, texel_bld->zero);
1533 }
1534
1535 /*
1536 * Null out corner color.
1537 */
1538 col00 = lp_build_andnot(coord_bld, col00, c00f);
1539 col01 = lp_build_andnot(coord_bld, col01, c01f);
1540 col10 = lp_build_andnot(coord_bld, col10, c10f);
1541 col11 = lp_build_andnot(coord_bld, col11, c11f);
1542
1543 /*
1544 * New corner texel color is all colors added / 3.
1545 */
1546 colc0 = lp_build_add(coord_bld, col00, col01);
1547 colc1 = lp_build_add(coord_bld, col10, col11);
1548 colc = lp_build_add(coord_bld, colc0, colc1);
1549 colc = lp_build_mul(coord_bld, one_third, colc);
1550
1551 /*
1552 * Replace the corner texel color with the new value.
1553 */
1554 col00 = lp_build_select(coord_bld, c00, colc, col00);
1555 col01 = lp_build_select(coord_bld, c01, colc, col01);
1556 col10 = lp_build_select(coord_bld, c10, colc, col10);
1557 col11 = lp_build_select(coord_bld, c11, colc, col11);
1558
1559 colors0[0] = col10;
1560 colors0[1] = col11;
1561 colors0[2] = col01;
1562 colors0[3] = col00;
1563 }
1564
1565 LLVMBuildStore(builder, colors0[0], colorss[0]);
1566 LLVMBuildStore(builder, colors0[1], colorss[1]);
1567 LLVMBuildStore(builder, colors0[2], colorss[2]);
1568 LLVMBuildStore(builder, colors0[3], colorss[3]);
1569
1570 lp_build_else(&corner_if);
1571 }
1572
1573 if (bld->static_sampler_state->compare_mode == PIPE_TEX_COMPARE_NONE) {
1574 if (is_gather) {
1575 /*
1576 * Just assign the red channel (no component selection yet).
1577 * This is a bit hackish, we usually do the swizzle at the
1578 * end of sampling (much less values to swizzle), but this
1579 * obviously cannot work when using gather.
1580 */
1581 colors0[0] = lp_build_swizzle_soa_channel(texel_bld,
1582 neighbors[1][0],
1583 chan_swiz);
1584 colors0[1] = lp_build_swizzle_soa_channel(texel_bld,
1585 neighbors[1][1],
1586 chan_swiz);
1587 colors0[2] = lp_build_swizzle_soa_channel(texel_bld,
1588 neighbors[0][1],
1589 chan_swiz);
1590 colors0[3] = lp_build_swizzle_soa_channel(texel_bld,
1591 neighbors[0][0],
1592 chan_swiz);
1593 }
1594 else {
1595 /* Bilinear interpolate the four samples from the 2D image / 3D slice */
1596 for (chan = 0; chan < 4; chan++) {
1597 colors0[chan] = lp_build_lerp_2d(texel_bld,
1598 s_fpart, t_fpart,
1599 neighbors[0][0][chan],
1600 neighbors[0][1][chan],
1601 neighbors[1][0][chan],
1602 neighbors[1][1][chan],
1603 0);
1604 }
1605 }
1606 }
1607 else {
1608 LLVMValueRef cmpval00, cmpval01, cmpval10, cmpval11;
1609 cmpval00 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][0][0]);
1610 cmpval01 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][1][0]);
1611 cmpval10 = lp_build_sample_comparefunc(bld, coords[4], neighbors[1][0][0]);
1612 cmpval11 = lp_build_sample_comparefunc(bld, coords[4], neighbors[1][1][0]);
1613
1614 if (is_gather) {
1615 /* more hacks for swizzling, should be X, ONE or ZERO... */
1616 colors0[0] = lp_build_select(texel_bld, cmpval10,
1617 texel_bld->one, texel_bld->zero);
1618 colors0[1] = lp_build_select(texel_bld, cmpval11,
1619 texel_bld->one, texel_bld->zero);
1620 colors0[2] = lp_build_select(texel_bld, cmpval01,
1621 texel_bld->one, texel_bld->zero);
1622 colors0[3] = lp_build_select(texel_bld, cmpval00,
1623 texel_bld->one, texel_bld->zero);
1624 }
1625 else {
1626 colors0[0] = lp_build_masklerp2d(texel_bld, s_fpart, t_fpart,
1627 cmpval00, cmpval01, cmpval10, cmpval11);
1628 colors0[1] = colors0[2] = colors0[3] = colors0[0];
1629 }
1630 }
1631
1632 if (accurate_cube_corners) {
1633 LLVMBuildStore(builder, colors0[0], colorss[0]);
1634 LLVMBuildStore(builder, colors0[1], colorss[1]);
1635 LLVMBuildStore(builder, colors0[2], colorss[2]);
1636 LLVMBuildStore(builder, colors0[3], colorss[3]);
1637
1638 lp_build_endif(&corner_if);
1639
1640 colors0[0] = LLVMBuildLoad(builder, colorss[0], "");
1641 colors0[1] = LLVMBuildLoad(builder, colorss[1], "");
1642 colors0[2] = LLVMBuildLoad(builder, colorss[2], "");
1643 colors0[3] = LLVMBuildLoad(builder, colorss[3], "");
1644 }
1645
1646 if (dims == 3) {
1647 LLVMValueRef neighbors1[2][2][4];
1648 LLVMValueRef colors1[4];
1649
1650 assert(!is_gather);
1651
1652 /* get x0/x1/y0/y1 texels at z1 */
1653 lp_build_sample_texel_soa(bld,
1654 width_vec, height_vec, depth_vec,
1655 x00, y00, z1,
1656 row_stride_vec, img_stride_vec,
1657 data_ptr, mipoffsets, neighbors1[0][0]);
1658 lp_build_sample_texel_soa(bld,
1659 width_vec, height_vec, depth_vec,
1660 x01, y01, z1,
1661 row_stride_vec, img_stride_vec,
1662 data_ptr, mipoffsets, neighbors1[0][1]);
1663 lp_build_sample_texel_soa(bld,
1664 width_vec, height_vec, depth_vec,
1665 x10, y10, z1,
1666 row_stride_vec, img_stride_vec,
1667 data_ptr, mipoffsets, neighbors1[1][0]);
1668 lp_build_sample_texel_soa(bld,
1669 width_vec, height_vec, depth_vec,
1670 x11, y11, z1,
1671 row_stride_vec, img_stride_vec,
1672 data_ptr, mipoffsets, neighbors1[1][1]);
1673
1674 if (bld->static_sampler_state->compare_mode == PIPE_TEX_COMPARE_NONE) {
1675 /* Bilinear interpolate the four samples from the second Z slice */
1676 for (chan = 0; chan < 4; chan++) {
1677 colors1[chan] = lp_build_lerp_2d(texel_bld,
1678 s_fpart, t_fpart,
1679 neighbors1[0][0][chan],
1680 neighbors1[0][1][chan],
1681 neighbors1[1][0][chan],
1682 neighbors1[1][1][chan],
1683 0);
1684 }
1685 /* Linearly interpolate the two samples from the two 3D slices */
1686 for (chan = 0; chan < 4; chan++) {
1687 colors_out[chan] = lp_build_lerp(texel_bld,
1688 r_fpart,
1689 colors0[chan], colors1[chan],
1690 0);
1691 }
1692 }
1693 else {
1694 LLVMValueRef cmpval00, cmpval01, cmpval10, cmpval11;
1695 cmpval00 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][0][0]);
1696 cmpval01 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][1][0]);
1697 cmpval10 = lp_build_sample_comparefunc(bld, coords[4], neighbors[1][0][0]);
1698 cmpval11 = lp_build_sample_comparefunc(bld, coords[4], neighbors[1][1][0]);
1699 colors1[0] = lp_build_masklerp2d(texel_bld, s_fpart, t_fpart,
1700 cmpval00, cmpval01, cmpval10, cmpval11);
1701 /* Linearly interpolate the two samples from the two 3D slices */
1702 colors_out[0] = lp_build_lerp(texel_bld,
1703 r_fpart,
1704 colors0[0], colors1[0],
1705 0);
1706 colors_out[1] = colors_out[2] = colors_out[3] = colors_out[0];
1707 }
1708 }
1709 else {
1710 /* 2D tex */
1711 for (chan = 0; chan < 4; chan++) {
1712 colors_out[chan] = colors0[chan];
1713 }
1714 }
1715 }
1716 if (is_gather) {
1717 /*
1718 * For gather, we can't do our usual channel swizzling done later,
1719 * so do it here. It only really matters for 0/1 swizzles in case
1720 * of comparison filtering, since in this case the results would be
1721 * wrong, without comparison it should all work out alright but it
1722 * can't hurt to do that here, since it will instantly drop all
1723 * calculations above, though it's a rather stupid idea to do
1724 * gather on a channel which will always return 0 or 1 in any case...
1725 */
1726 if (chan_swiz == PIPE_SWIZZLE_1) {
1727 for (chan = 0; chan < 4; chan++) {
1728 colors_out[chan] = texel_bld->one;
1729 }
1730 } else if (chan_swiz == PIPE_SWIZZLE_0) {
1731 for (chan = 0; chan < 4; chan++) {
1732 colors_out[chan] = texel_bld->zero;
1733 }
1734 }
1735 }
1736 }
1737
1738
/**
 * Sample the texture/mipmap using given image filter and mip filter.
 * ilevel0 and ilevel1 indicate the two mipmap levels to sample
 * from (vectors or scalars).
 * If we're using nearest miplevel sampling the '1' values will be null/unused.
 *
 * \param img_filter  PIPE_TEX_FILTER_NEAREST or PIPE_TEX_FILTER_LINEAR;
 *                    the same image filter is used for both mip levels.
 * \param mip_filter  PIPE_TEX_MIPFILTER_x; only for LINEAR is the second
 *                    level fetched and blended in.
 * \param is_gather   forwarded to the level-0 linear fetch only; the
 *                    second-level fetch below always passes FALSE.
 * \param lod_fpart   fractional lod weight(s); only read when mip_filter
 *                    is LINEAR (callers pass NULL otherwise).
 * \param colors_out  four (rgba) result variables (allocas) which the
 *                    filtered texels are stored into.
 */
static void
lp_build_sample_mipmap(struct lp_build_sample_context *bld,
                       unsigned img_filter,
                       unsigned mip_filter,
                       boolean is_gather,
                       const LLVMValueRef *coords,
                       const LLVMValueRef *offsets,
                       LLVMValueRef ilevel0,
                       LLVMValueRef ilevel1,
                       LLVMValueRef lod_fpart,
                       LLVMValueRef *colors_out)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef size0 = NULL;
   LLVMValueRef size1 = NULL;
   LLVMValueRef row_stride0_vec = NULL;
   LLVMValueRef row_stride1_vec = NULL;
   LLVMValueRef img_stride0_vec = NULL;
   LLVMValueRef img_stride1_vec = NULL;
   LLVMValueRef data_ptr0 = NULL;
   LLVMValueRef data_ptr1 = NULL;
   LLVMValueRef mipoff0 = NULL;
   LLVMValueRef mipoff1 = NULL;
   LLVMValueRef colors0[4], colors1[4];
   unsigned chan;

   /* sample the first mipmap level */
   lp_build_mipmap_level_sizes(bld, ilevel0,
                               &size0,
                               &row_stride0_vec, &img_stride0_vec);
   if (bld->num_mips == 1) {
      /* single mip level for all pixels: can address the level directly */
      data_ptr0 = lp_build_get_mipmap_level(bld, ilevel0);
   }
   else {
      /* This path should work for num_lods 1 too but slightly less efficient */
      data_ptr0 = bld->base_ptr;
      mipoff0 = lp_build_get_mip_offsets(bld, ilevel0);
   }
   if (img_filter == PIPE_TEX_FILTER_NEAREST) {
      lp_build_sample_image_nearest(bld, size0,
                                    row_stride0_vec, img_stride0_vec,
                                    data_ptr0, mipoff0, coords, offsets,
                                    colors0);
   }
   else {
      assert(img_filter == PIPE_TEX_FILTER_LINEAR);
      lp_build_sample_image_linear(bld, is_gather, size0, NULL,
                                   row_stride0_vec, img_stride0_vec,
                                   data_ptr0, mipoff0, coords, offsets,
                                   colors0);
   }

   /* Store the first level's colors in the output variables */
   for (chan = 0; chan < 4; chan++) {
       LLVMBuildStore(builder, colors0[chan], colors_out[chan]);
   }

   if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
      /* Conditionally fetch the second level and lerp it in; the stores
       * above already hold the correct result when no lerp is needed.
       */
      struct lp_build_if_state if_ctx;
      LLVMValueRef need_lerp;

      /* need_lerp = lod_fpart > 0 */
      if (bld->num_lods == 1) {
         need_lerp = LLVMBuildFCmp(builder, LLVMRealUGT,
                                   lod_fpart, bld->lodf_bld.zero,
                                   "need_lerp");
      }
      else {
         /*
          * We'll do mip filtering if any of the quads (or individual
          * pixel in case of per-pixel lod) need it.
          * It might be better to split the vectors here and only fetch/filter
          * quads which need it (if there's one lod per quad).
          */
         need_lerp = lp_build_compare(bld->gallivm, bld->lodf_bld.type,
                                      PIPE_FUNC_GREATER,
                                      lod_fpart, bld->lodf_bld.zero);
         need_lerp = lp_build_any_true_range(&bld->lodi_bld, bld->num_lods, need_lerp);
         lp_build_name(need_lerp, "need_lerp");
      }

      lp_build_if(&if_ctx, bld->gallivm, need_lerp);
      {
         /*
          * We unfortunately need to clamp lod_fpart here since we can get
          * negative values which would screw up filtering if not all
          * lod_fpart values have same sign.
          */
         lod_fpart = lp_build_max(&bld->lodf_bld, lod_fpart,
                                  bld->lodf_bld.zero);
         /* sample the second mipmap level */
         lp_build_mipmap_level_sizes(bld, ilevel1,
                                     &size1,
                                     &row_stride1_vec, &img_stride1_vec);
         if (bld->num_mips == 1) {
            data_ptr1 = lp_build_get_mipmap_level(bld, ilevel1);
         }
         else {
            data_ptr1 = bld->base_ptr;
            mipoff1 = lp_build_get_mip_offsets(bld, ilevel1);
         }
         if (img_filter == PIPE_TEX_FILTER_NEAREST) {
            lp_build_sample_image_nearest(bld, size1,
                                          row_stride1_vec, img_stride1_vec,
                                          data_ptr1, mipoff1, coords, offsets,
                                          colors1);
         }
         else {
            /* note: is_gather is not propagated to the second level */
            lp_build_sample_image_linear(bld, FALSE, size1, NULL,
                                         row_stride1_vec, img_stride1_vec,
                                         data_ptr1, mipoff1, coords, offsets,
                                         colors1);
         }

         /* interpolate samples from the two mipmap levels */

         /* expand per-quad/per-lod weights to per-texel width if needed */
         if (bld->num_lods != bld->coord_type.length)
            lod_fpart = lp_build_unpack_broadcast_aos_scalars(bld->gallivm,
                                                              bld->lodf_bld.type,
                                                              bld->texel_bld.type,
                                                              lod_fpart);

         for (chan = 0; chan < 4; chan++) {
            colors0[chan] = lp_build_lerp(&bld->texel_bld, lod_fpart,
                                          colors0[chan], colors1[chan],
                                          0);
            LLVMBuildStore(builder, colors0[chan], colors_out[chan]);
         }
      }
      lp_build_endif(&if_ctx);
   }
}
1877
1878
/**
 * Sample the texture/mipmap using given mip filter, and using
 * both nearest and linear filtering at the same time depending
 * on linear_mask.
 * lod can be per quad but linear_mask is always per pixel.
 * ilevel0 and ilevel1 indicate the two mipmap levels to sample
 * from (vectors or scalars).
 * If we're using nearest miplevel sampling the '1' values will be null/unused.
 *
 * Unlike lp_build_sample_mipmap() this always goes through the linear
 * image path, handing linear_mask down so pixels which need nearest
 * filtering get their weights fixed up inside
 * lp_build_sample_image_linear().
 *
 * \param linear_mask   per-pixel mask, true where linear filtering applies.
 * \param lod_positive  per-lod minification flag, used only for the
 *                      need_lerp decision (see comment below).
 * \param colors_out    four (rgba) result variables (allocas) stored into.
 */
static void
lp_build_sample_mipmap_both(struct lp_build_sample_context *bld,
                            LLVMValueRef linear_mask,
                            unsigned mip_filter,
                            const LLVMValueRef *coords,
                            const LLVMValueRef *offsets,
                            LLVMValueRef ilevel0,
                            LLVMValueRef ilevel1,
                            LLVMValueRef lod_fpart,
                            LLVMValueRef lod_positive,
                            LLVMValueRef *colors_out)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef size0 = NULL;
   LLVMValueRef size1 = NULL;
   LLVMValueRef row_stride0_vec = NULL;
   LLVMValueRef row_stride1_vec = NULL;
   LLVMValueRef img_stride0_vec = NULL;
   LLVMValueRef img_stride1_vec = NULL;
   LLVMValueRef data_ptr0 = NULL;
   LLVMValueRef data_ptr1 = NULL;
   LLVMValueRef mipoff0 = NULL;
   LLVMValueRef mipoff1 = NULL;
   LLVMValueRef colors0[4], colors1[4];
   unsigned chan;

   /* sample the first mipmap level */
   lp_build_mipmap_level_sizes(bld, ilevel0,
                               &size0,
                               &row_stride0_vec, &img_stride0_vec);
   if (bld->num_mips == 1) {
      data_ptr0 = lp_build_get_mipmap_level(bld, ilevel0);
   }
   else {
      /* This path should work for num_lods 1 too but slightly less efficient */
      data_ptr0 = bld->base_ptr;
      mipoff0 = lp_build_get_mip_offsets(bld, ilevel0);
   }

   lp_build_sample_image_linear(bld, FALSE, size0, linear_mask,
                                row_stride0_vec, img_stride0_vec,
                                data_ptr0, mipoff0, coords, offsets,
                                colors0);

   /* Store the first level's colors in the output variables */
   for (chan = 0; chan < 4; chan++) {
      LLVMBuildStore(builder, colors0[chan], colors_out[chan]);
   }

   if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
      struct lp_build_if_state if_ctx;
      LLVMValueRef need_lerp;

      /*
       * We'll do mip filtering if any of the quads (or individual
       * pixel in case of per-pixel lod) need it.
       * Note using lod_positive here not lod_fpart since it may be the same
       * condition as that used in the outer "if" in the caller hence llvm
       * should be able to merge the branches in this case.
       */
      need_lerp = lp_build_any_true_range(&bld->lodi_bld, bld->num_lods, lod_positive);
      lp_build_name(need_lerp, "need_lerp");

      lp_build_if(&if_ctx, bld->gallivm, need_lerp);
      {
         /*
          * We unfortunately need to clamp lod_fpart here since we can get
          * negative values which would screw up filtering if not all
          * lod_fpart values have same sign.
          */
         lod_fpart = lp_build_max(&bld->lodf_bld, lod_fpart,
                                  bld->lodf_bld.zero);
         /* sample the second mipmap level */
         lp_build_mipmap_level_sizes(bld, ilevel1,
                                     &size1,
                                     &row_stride1_vec, &img_stride1_vec);
         if (bld->num_mips == 1) {
            data_ptr1 = lp_build_get_mipmap_level(bld, ilevel1);
         }
         else {
            data_ptr1 = bld->base_ptr;
            mipoff1 = lp_build_get_mip_offsets(bld, ilevel1);
         }

         lp_build_sample_image_linear(bld, FALSE, size1, linear_mask,
                                      row_stride1_vec, img_stride1_vec,
                                      data_ptr1, mipoff1, coords, offsets,
                                      colors1);

         /* interpolate samples from the two mipmap levels */

         /* expand per-quad/per-lod weights to per-texel width if needed */
         if (bld->num_lods != bld->coord_type.length)
            lod_fpart = lp_build_unpack_broadcast_aos_scalars(bld->gallivm,
                                                              bld->lodf_bld.type,
                                                              bld->texel_bld.type,
                                                              lod_fpart);

         for (chan = 0; chan < 4; chan++) {
            colors0[chan] = lp_build_lerp(&bld->texel_bld, lod_fpart,
                                          colors0[chan], colors1[chan],
                                          0);
            LLVMBuildStore(builder, colors0[chan], colors_out[chan]);
         }
      }
      lp_build_endif(&if_ctx);
   }
}
1995
1996
1997 /**
1998 * Build (per-coord) layer value.
1999 * Either clamp layer to valid values or fill in optional out_of_bounds
2000 * value and just return value unclamped.
2001 */
2002 static LLVMValueRef
2003 lp_build_layer_coord(struct lp_build_sample_context *bld,
2004 unsigned texture_unit,
2005 boolean is_cube_array,
2006 LLVMValueRef layer,
2007 LLVMValueRef *out_of_bounds)
2008 {
2009 LLVMValueRef num_layers;
2010 struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
2011
2012 num_layers = bld->dynamic_state->depth(bld->dynamic_state, bld->gallivm,
2013 bld->context_ptr, texture_unit);
2014
2015 if (out_of_bounds) {
2016 LLVMValueRef out1, out;
2017 assert(!is_cube_array);
2018 num_layers = lp_build_broadcast_scalar(int_coord_bld, num_layers);
2019 out = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, layer, int_coord_bld->zero);
2020 out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, layer, num_layers);
2021 *out_of_bounds = lp_build_or(int_coord_bld, out, out1);
2022 return layer;
2023 }
2024 else {
2025 LLVMValueRef maxlayer;
2026 LLVMValueRef s = is_cube_array ? lp_build_const_int32(bld->gallivm, 6) :
2027 bld->int_bld.one;
2028 maxlayer = lp_build_sub(&bld->int_bld, num_layers, s);
2029 maxlayer = lp_build_broadcast_scalar(int_coord_bld, maxlayer);
2030 return lp_build_clamp(int_coord_bld, layer, int_coord_bld->zero, maxlayer);
2031 }
2032 }
2033
2034
/**
 * Calculate cube face, lod, mip levels.
 *
 * Shared setup for sampling: rewrites coords in place (cube face
 * selection, array layer rounding/clamping, shadow ref clamping),
 * computes the float lod, and selects the integer mip level(s).
 *
 * \param is_lodq         true for the lod query instruction; returns early
 *                        after computing the (clamped) lod pair.
 * \param derivs          optional explicit derivatives.
 * \param lod_bias        optional per-shader lod bias.
 * \param explicit_lod    optional explicitly-given lod.
 * \param lod_pos_or_zero receives per-lod "minifying" flag (or zero).
 * \param lod             receives the raw float lod (lodq only).
 * \param lod_fpart       receives the fractional lod part.
 * \param ilevel0/ilevel1 receive the mip level(s) to fetch from; ilevel1
 *                        only written for PIPE_TEX_MIPFILTER_LINEAR.
 */
static void
lp_build_sample_common(struct lp_build_sample_context *bld,
                       boolean is_lodq,
                       unsigned texture_index,
                       unsigned sampler_index,
                       LLVMValueRef *coords,
                       const struct lp_derivatives *derivs, /* optional */
                       LLVMValueRef lod_bias, /* optional */
                       LLVMValueRef explicit_lod, /* optional */
                       LLVMValueRef *lod_pos_or_zero,
                       LLVMValueRef *lod,
                       LLVMValueRef *lod_fpart,
                       LLVMValueRef *ilevel0,
                       LLVMValueRef *ilevel1)
{
   const unsigned mip_filter = bld->static_sampler_state->min_mip_filter;
   const unsigned min_filter = bld->static_sampler_state->min_img_filter;
   const unsigned mag_filter = bld->static_sampler_state->mag_img_filter;
   const unsigned target = bld->static_texture_state->target;
   LLVMValueRef first_level, cube_rho = NULL;
   LLVMValueRef lod_ipart = NULL;
   struct lp_derivatives cube_derivs;

   /*
   printf("%s mip %d min %d mag %d\n", __FUNCTION__,
          mip_filter, min_filter, mag_filter);
   */

   /*
    * Choose cube face, recompute texcoords for the chosen face and
    * compute rho here too (as it requires transform of derivatives).
    */
   if (target == PIPE_TEXTURE_CUBE || target == PIPE_TEXTURE_CUBE_ARRAY) {
      boolean need_derivs;
      /* derivatives are only needed for implicit lod with non-trivial
       * filtering and when min/max lod don't pin the result anyway */
      need_derivs = ((min_filter != mag_filter ||
                      mip_filter != PIPE_TEX_MIPFILTER_NONE) &&
                      !bld->static_sampler_state->min_max_lod_equal &&
                      !explicit_lod);
      lp_build_cube_lookup(bld, coords, derivs, &cube_rho, &cube_derivs, need_derivs);
      derivs = &cube_derivs;
      if (target == PIPE_TEXTURE_CUBE_ARRAY) {
         /* calculate cube layer coord now */
         LLVMValueRef layer = lp_build_iround(&bld->coord_bld, coords[3]);
         LLVMValueRef six = lp_build_const_int_vec(bld->gallivm, bld->int_coord_type, 6);
         /* each cube in the array occupies 6 consecutive layers */
         layer = lp_build_mul(&bld->int_coord_bld, layer, six);
         coords[3] = lp_build_layer_coord(bld, texture_index, TRUE, layer, NULL);
         /* because of seamless filtering can't add it to face (coords[2]) here. */
      }
   }
   else if (target == PIPE_TEXTURE_1D_ARRAY ||
            target == PIPE_TEXTURE_2D_ARRAY) {
      coords[2] = lp_build_iround(&bld->coord_bld, coords[2]);
      coords[2] = lp_build_layer_coord(bld, texture_index, FALSE, coords[2], NULL);
   }

   if (bld->static_sampler_state->compare_mode != PIPE_TEX_COMPARE_NONE) {
      /*
       * Clamp p coords to [0,1] for fixed function depth texture format here.
       * Technically this is not entirely correct for unorm depth as the ref value
       * should be converted to the depth format (quantization!) and comparison
       * then done in texture format. This would actually help performance (since
       * only need to do it once and could save the per-sample conversion of texels
       * to floats instead), but it would need more messy code (would need to push
       * at least some bits down to actual fetch so conversion could be skipped,
       * and would have ugly interaction with border color, would need to convert
       * border color to that format too or do some other tricks to make it work).
       */
      const struct util_format_description *format_desc = bld->format_desc;
      unsigned chan_type;
      /* not entirely sure we couldn't end up with non-valid swizzle here */
      chan_type = format_desc->swizzle[0] <= PIPE_SWIZZLE_W ?
                     format_desc->channel[format_desc->swizzle[0]].type :
                     UTIL_FORMAT_TYPE_FLOAT;
      if (chan_type != UTIL_FORMAT_TYPE_FLOAT) {
         coords[4] = lp_build_clamp(&bld->coord_bld, coords[4],
                                    bld->coord_bld.zero, bld->coord_bld.one);
      }
   }

   /*
    * Compute the level of detail (float).
    */
   if (min_filter != mag_filter ||
       mip_filter != PIPE_TEX_MIPFILTER_NONE || is_lodq) {
      /* Need to compute lod either to choose mipmap levels or to
       * distinguish between minification/magnification with one mipmap level.
       */
      lp_build_lod_selector(bld, is_lodq, texture_index, sampler_index,
                            coords[0], coords[1], coords[2], cube_rho,
                            derivs, lod_bias, explicit_lod,
                            mip_filter, lod,
                            &lod_ipart, lod_fpart, lod_pos_or_zero);
      if (is_lodq) {
         /* lod query: clamp the lod to [0, last - first] per mip filter
          * and return without selecting actual mip levels */
         LLVMValueRef last_level;
         last_level = bld->dynamic_state->last_level(bld->dynamic_state,
                                                     bld->gallivm,
                                                     bld->context_ptr,
                                                     texture_index);
         first_level = bld->dynamic_state->first_level(bld->dynamic_state,
                                                       bld->gallivm,
                                                       bld->context_ptr,
                                                       texture_index);
         last_level = lp_build_sub(&bld->int_bld, last_level, first_level);
         last_level = lp_build_int_to_float(&bld->float_bld, last_level);
         last_level = lp_build_broadcast_scalar(&bld->lodf_bld, last_level);

         switch (mip_filter) {
         case PIPE_TEX_MIPFILTER_NONE:
            *lod_fpart = bld->lodf_bld.zero;
            break;
         case PIPE_TEX_MIPFILTER_NEAREST:
            *lod_fpart = lp_build_round(&bld->lodf_bld, *lod_fpart);
            /* fallthrough */
         case PIPE_TEX_MIPFILTER_LINEAR:
            *lod_fpart = lp_build_clamp(&bld->lodf_bld, *lod_fpart,
                                        bld->lodf_bld.zero, last_level);
            break;
         }
         return;
      }

   } else {
      lod_ipart = bld->lodi_bld.zero;
      *lod_pos_or_zero = bld->lodi_bld.zero;
   }

   if (bld->num_lods != bld->num_mips) {
      /* only makes sense if there's just a single mip level */
      assert(bld->num_mips == 1);
      lod_ipart = lp_build_extract_range(bld->gallivm, lod_ipart, 0, 1);
   }

   /*
    * Compute integer mipmap level(s) to fetch texels from: ilevel0, ilevel1
    */
   switch (mip_filter) {
   default:
      assert(0 && "bad mip_filter value in lp_build_sample_soa()");
      /* fall-through */
   case PIPE_TEX_MIPFILTER_NONE:
      /* always use mip level 0 */
      first_level = bld->dynamic_state->first_level(bld->dynamic_state,
                                                    bld->gallivm, bld->context_ptr,
                                                    texture_index);
      first_level = lp_build_broadcast_scalar(&bld->leveli_bld, first_level);
      *ilevel0 = first_level;
      break;
   case PIPE_TEX_MIPFILTER_NEAREST:
      assert(lod_ipart);
      lp_build_nearest_mip_level(bld, texture_index, lod_ipart, ilevel0, NULL);
      break;
   case PIPE_TEX_MIPFILTER_LINEAR:
      assert(lod_ipart);
      assert(*lod_fpart);
      lp_build_linear_mip_levels(bld, texture_index,
                                 lod_ipart, lod_fpart,
                                 ilevel0, ilevel1);
      break;
   }
}
2198
/**
 * Clamp the (dynamic) border color to the representable range of the
 * texture format and stash the result in bld->border_color_clamped.
 * The clamp bounds are derived from the first relevant channel of the
 * format description (with special cases for mixed and compressed
 * formats below).
 */
static void
lp_build_clamp_border_color(struct lp_build_sample_context *bld,
                            unsigned sampler_unit)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef border_color_ptr =
      bld->dynamic_state->border_color(bld->dynamic_state, gallivm,
                                       bld->context_ptr, sampler_unit);
   LLVMValueRef border_color;
   const struct util_format_description *format_desc = bld->format_desc;
   struct lp_type vec4_type = bld->texel_type;
   struct lp_build_context vec4_bld;
   LLVMValueRef min_clamp = NULL;
   LLVMValueRef max_clamp = NULL;

   /*
    * For normalized format need to clamp border color (technically
    * probably should also quantize the data). Really sucks doing this
    * here but can't avoid at least for now since this is part of
    * sampler state and texture format is part of sampler_view state.
    * GL also expects clamping for uint/sint formats so
    * do that as well (d3d10 can't end up here with uint/sint since it
    * only supports them with ld).
    */
   vec4_type.length = 4;
   lp_build_context_init(&vec4_bld, gallivm, vec4_type);

   /*
    * Vectorized clamping of border color. Loading is a bit of a hack since
    * we just cast the pointer to float array to pointer to vec4
    * (int or float).
    */
   border_color_ptr = lp_build_array_get_ptr(gallivm, border_color_ptr,
                                             lp_build_const_int32(gallivm, 0));
   border_color_ptr = LLVMBuildBitCast(builder, border_color_ptr,
                                       LLVMPointerType(vec4_bld.vec_type, 0), "");
   border_color = LLVMBuildLoad(builder, border_color_ptr, "");
   /* we don't have aligned type in the dynamic state unfortunately */
   LLVMSetAlignment(border_color, 4);

   /*
    * Instead of having some incredibly complex logic which will try to figure out
    * clamping necessary for each channel, simply use the first channel, and treat
    * mixed signed/unsigned normalized formats specially.
    * (Mixed non-normalized, which wouldn't work at all here, do not exist for a
    * good reason.)
    */
   if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN) {
      int chan;
      /* d/s needs special handling because both present means just sampling depth */
      if (util_format_is_depth_and_stencil(format_desc->format)) {
         chan = format_desc->swizzle[0];
      }
      else {
         chan = util_format_get_first_non_void_channel(format_desc->format);
      }
      if (chan >= 0 && chan <= PIPE_SWIZZLE_W) {
         unsigned chan_type = format_desc->channel[chan].type;
         unsigned chan_norm = format_desc->channel[chan].normalized;
         unsigned chan_pure = format_desc->channel[chan].pure_integer;
         if (chan_type == UTIL_FORMAT_TYPE_SIGNED) {
            if (chan_norm) {
               /* snorm: [-1, 1] */
               min_clamp = lp_build_const_vec(gallivm, vec4_type, -1.0F);
               max_clamp = vec4_bld.one;
            }
            else if (chan_pure) {
               /*
                * Border color was stored as int, hence need min/max clamp
                * only if chan has less than 32 bits..
                */
               unsigned chan_size = format_desc->channel[chan].size;
               if (chan_size < 32) {
                  min_clamp = lp_build_const_int_vec(gallivm, vec4_type,
                                                     0 - (1 << (chan_size - 1)));
                  max_clamp = lp_build_const_int_vec(gallivm, vec4_type,
                                                     (1 << (chan_size - 1)) - 1);
               }
            }
            /* TODO: no idea about non-pure, non-normalized! */
         }
         else if (chan_type == UTIL_FORMAT_TYPE_UNSIGNED) {
            if (chan_norm) {
               /* unorm: [0, 1] */
               min_clamp = vec4_bld.zero;
               max_clamp = vec4_bld.one;
            }
            /*
             * Need a ugly hack here, because we don't have Z32_FLOAT_X8X24
             * we use Z32_FLOAT_S8X24 to imply sampling depth component
             * and ignoring stencil, which will blow up here if we try to
             * do a uint clamp in a float texel build...
             * And even if we had that format, mesa st also thinks using z24s8
             * means depth sampling ignoring stencil.
             */
            else if (chan_pure) {
               /*
                * Border color was stored as uint, hence never need min
                * clamp, and only need max clamp if chan has less than 32 bits.
                */
               unsigned chan_size = format_desc->channel[chan].size;
               if (chan_size < 32) {
                  max_clamp = lp_build_const_int_vec(gallivm, vec4_type,
                                                     (1 << chan_size) - 1);
               }
               /* TODO: no idea about non-pure, non-normalized! */
            }
         }
         else if (chan_type == UTIL_FORMAT_TYPE_FIXED) {
            /* TODO: I have no idea what clamp this would need if any! */
         }
      }
      /* mixed plain formats (or different pure size) */
      switch (format_desc->format) {
      case PIPE_FORMAT_B10G10R10A2_UINT:
      case PIPE_FORMAT_R10G10B10A2_UINT:
      {
         unsigned max10 = (1 << 10) - 1;
         max_clamp = lp_build_const_aos(gallivm, vec4_type, max10, max10,
                                        max10, (1 << 2) - 1, NULL);
      }
         break;
      case PIPE_FORMAT_R10SG10SB10SA2U_NORM:
         min_clamp = lp_build_const_aos(gallivm, vec4_type, -1.0F, -1.0F,
                                        -1.0F, 0.0F, NULL);
         max_clamp = vec4_bld.one;
         break;
      case PIPE_FORMAT_R8SG8SB8UX8U_NORM:
      case PIPE_FORMAT_R5SG5SB6U_NORM:
         min_clamp = lp_build_const_aos(gallivm, vec4_type, -1.0F, -1.0F,
                                        0.0F, 0.0F, NULL);
         max_clamp = vec4_bld.one;
         break;
      default:
         break;
      }
   }
   else {
      /* cannot figure this out from format description */
      if (format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
         /* s3tc formats are always unorm */
         min_clamp = vec4_bld.zero;
         max_clamp = vec4_bld.one;
      }
      else if (format_desc->layout == UTIL_FORMAT_LAYOUT_RGTC ||
               format_desc->layout == UTIL_FORMAT_LAYOUT_ETC) {
         switch (format_desc->format) {
         case PIPE_FORMAT_RGTC1_UNORM:
         case PIPE_FORMAT_RGTC2_UNORM:
         case PIPE_FORMAT_LATC1_UNORM:
         case PIPE_FORMAT_LATC2_UNORM:
         case PIPE_FORMAT_ETC1_RGB8:
            min_clamp = vec4_bld.zero;
            max_clamp = vec4_bld.one;
            break;
         case PIPE_FORMAT_RGTC1_SNORM:
         case PIPE_FORMAT_RGTC2_SNORM:
         case PIPE_FORMAT_LATC1_SNORM:
         case PIPE_FORMAT_LATC2_SNORM:
            min_clamp = lp_build_const_vec(gallivm, vec4_type, -1.0F);
            max_clamp = vec4_bld.one;
            break;
         default:
            assert(0);
            break;
         }
      }
      /*
       * all others from subsampled/other group, though we don't care
       * about yuv (and should not have any from zs here)
       */
      else if (format_desc->colorspace != UTIL_FORMAT_COLORSPACE_YUV){
         switch (format_desc->format) {
         case PIPE_FORMAT_R8G8_B8G8_UNORM:
         case PIPE_FORMAT_G8R8_G8B8_UNORM:
         case PIPE_FORMAT_G8R8_B8R8_UNORM:
         case PIPE_FORMAT_R8G8_R8B8_UNORM:
         case PIPE_FORMAT_R1_UNORM: /* doesn't make sense but ah well */
            min_clamp = vec4_bld.zero;
            max_clamp = vec4_bld.one;
            break;
         case PIPE_FORMAT_R8G8Bx_SNORM:
            min_clamp = lp_build_const_vec(gallivm, vec4_type, -1.0F);
            max_clamp = vec4_bld.one;
            break;
         /*
          * Note smallfloat formats usually don't need clamping
          * (they still have infinite range) however this is not
          * true for r11g11b10 and r9g9b9e5, which can't represent
          * negative numbers (and additionally r9g9b9e5 can't represent
          * very large numbers). d3d10 seems happy without clamping in
          * this case, but gl spec is pretty clear: "for floating
          * point and integer formats, border values are clamped to
          * the representable range of the format" so do that here.
          */
         case PIPE_FORMAT_R11G11B10_FLOAT:
            min_clamp = vec4_bld.zero;
            break;
         case PIPE_FORMAT_R9G9B9E5_FLOAT:
            min_clamp = vec4_bld.zero;
            max_clamp = lp_build_const_vec(gallivm, vec4_type, MAX_RGB9E5);
            break;
         default:
            assert(0);
            break;
         }
      }
   }

   /* apply whichever bounds were determined above (either may be absent) */
   if (min_clamp) {
      border_color = lp_build_max(&vec4_bld, border_color, min_clamp);
   }
   if (max_clamp) {
      border_color = lp_build_min(&vec4_bld, border_color, max_clamp);
   }

   bld->border_color_clamped = border_color;
}
2416
2417
/**
 * General texture sampling codegen.
 * This function handles texture sampling for all texture targets (1D,
 * 2D, 3D, cube) and all filtering modes.
 *
 * Dispatches to lp_build_sample_mipmap() / lp_build_sample_mipmap_both()
 * depending on whether min and mag filter differ and whether lod is
 * scalar (num_lods == 1) or per-quad/per-pixel.
 *
 * \param lod_positive  per-lod "is minifying" flag used to pick
 *                      min vs. mag filter.
 * \param lod_fpart     fractional mip lod (for mip linear filtering).
 * \param ilevel0/ilevel1  mip level(s) to sample from.
 * \param colors_out    receives the four rgba texel channel values.
 */
static void
lp_build_sample_general(struct lp_build_sample_context *bld,
                        unsigned sampler_unit,
                        boolean is_gather,
                        const LLVMValueRef *coords,
                        const LLVMValueRef *offsets,
                        LLVMValueRef lod_positive,
                        LLVMValueRef lod_fpart,
                        LLVMValueRef ilevel0,
                        LLVMValueRef ilevel1,
                        LLVMValueRef *colors_out)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_static_sampler_state *sampler_state = bld->static_sampler_state;
   const unsigned mip_filter = sampler_state->min_mip_filter;
   const unsigned min_filter = sampler_state->min_img_filter;
   const unsigned mag_filter = sampler_state->mag_img_filter;
   LLVMValueRef texels[4];
   unsigned chan;

   /* if we need border color, (potentially) clamp it now */
   if (lp_sampler_wrap_mode_uses_border_color(sampler_state->wrap_s,
                                              min_filter,
                                              mag_filter) ||
       (bld->dims > 1 &&
        lp_sampler_wrap_mode_uses_border_color(sampler_state->wrap_t,
                                               min_filter,
                                               mag_filter)) ||
       (bld->dims > 2 &&
        lp_sampler_wrap_mode_uses_border_color(sampler_state->wrap_r,
                                               min_filter,
                                               mag_filter))) {
      lp_build_clamp_border_color(bld, sampler_unit);
   }


   /*
    * Get/interpolate texture colors.
    */

   /* result variables the filtering paths store into */
   for (chan = 0; chan < 4; ++chan) {
      texels[chan] = lp_build_alloca(bld->gallivm, bld->texel_bld.vec_type, "");
      lp_build_name(texels[chan], "sampler%u_texel_%c_var", sampler_unit, "xyzw"[chan]);
   }

   if (min_filter == mag_filter) {
      /* no need to distinguish between minification and magnification */
      lp_build_sample_mipmap(bld, min_filter, mip_filter,
                             is_gather,
                             coords, offsets,
                             ilevel0, ilevel1, lod_fpart,
                             texels);
   }
   else {
      /*
       * Could also get rid of the if-logic and always use mipmap_both, both
       * for the single lod and multi-lod case if nothing really uses this.
       */
      if (bld->num_lods == 1) {
         /* Emit conditional to choose min image filter or mag image filter
          * depending on the lod being > 0 or <= 0, respectively.
          */
         struct lp_build_if_state if_ctx;

         lod_positive = LLVMBuildTrunc(builder, lod_positive,
                                       LLVMInt1TypeInContext(bld->gallivm->context),
                                       "lod_pos");

         lp_build_if(&if_ctx, bld->gallivm, lod_positive);
         {
            /* Use the minification filter */
            lp_build_sample_mipmap(bld, min_filter, mip_filter, FALSE,
                                   coords, offsets,
                                   ilevel0, ilevel1, lod_fpart,
                                   texels);
         }
         lp_build_else(&if_ctx);
         {
            /* Use the magnification filter */
            lp_build_sample_mipmap(bld, mag_filter, PIPE_TEX_MIPFILTER_NONE,
                                   FALSE,
                                   coords, offsets,
                                   ilevel0, NULL, NULL,
                                   texels);
         }
         lp_build_endif(&if_ctx);
      }
      else {
         LLVMValueRef need_linear, linear_mask;
         unsigned mip_filter_for_nearest;
         struct lp_build_if_state if_ctx;

         /* build per-pixel mask of which elements use the linear filter */
         if (min_filter == PIPE_TEX_FILTER_LINEAR) {
            linear_mask = lod_positive;
            mip_filter_for_nearest = PIPE_TEX_MIPFILTER_NONE;
         }
         else {
            linear_mask = lp_build_not(&bld->lodi_bld, lod_positive);
            mip_filter_for_nearest = mip_filter;
         }
         need_linear = lp_build_any_true_range(&bld->lodi_bld, bld->num_lods,
                                               linear_mask);
         lp_build_name(need_linear, "need_linear");

         /* widen the per-lod mask to per-texel width if needed */
         if (bld->num_lods != bld->coord_type.length) {
            linear_mask = lp_build_unpack_broadcast_aos_scalars(bld->gallivm,
                                                                bld->lodi_type,
                                                                bld->int_coord_type,
                                                                linear_mask);
         }

         lp_build_if(&if_ctx, bld->gallivm, need_linear);
         {
            /*
             * Do sampling with both filters simultaneously. This means using
             * a linear filter and doing some tricks (with weights) for the pixels
             * which need nearest filter.
             * Note that it's probably rare some pixels need nearest and some
             * linear filter but the fixups required for the nearest pixels
             * aren't all that complicated so just always run a combined path
             * if at least some pixels require linear.
             */
            lp_build_sample_mipmap_both(bld, linear_mask, mip_filter,
                                        coords, offsets,
                                        ilevel0, ilevel1,
                                        lod_fpart, lod_positive,
                                        texels);
         }
         lp_build_else(&if_ctx);
         {
            /*
             * All pixels require just nearest filtering, which is way
             * cheaper than linear, hence do a separate path for that.
             */
            lp_build_sample_mipmap(bld, PIPE_TEX_FILTER_NEAREST,
                                   mip_filter_for_nearest, FALSE,
                                   coords, offsets,
                                   ilevel0, ilevel1, lod_fpart,
                                   texels);
         }
         lp_build_endif(&if_ctx);
      }
   }

   for (chan = 0; chan < 4; ++chan) {
      colors_out[chan] = LLVMBuildLoad(builder, texels[chan], "");
      lp_build_name(colors_out[chan], "sampler%u_texel_%c", sampler_unit, "xyzw"[chan]);
   }
}
2572
2573
/**
 * Texel fetch function.
 * In contrast to general sampling there is no filtering, no coord minification,
 * lod (if any) is always explicit uint, coords are uints (in terms of texel units)
 * directly to be applied to the selected mip level (after adding texel offsets).
 * This function handles texel fetch for all targets where texel fetch is supported
 * (no cube maps, but 1d, 2d, 3d are supported, arrays and buffers should be too).
 *
 * \param bld           sampling build context (supplies builders, format, state)
 * \param texture_unit  index of the texture being fetched from
 * \param coords        integer texel coordinates (x, y, z as applicable)
 * \param explicit_lod  explicit integer mip level, or NULL
 * \param offsets       optional per-axis texel offsets (offsets[0] NULL if none)
 * \param colors_out    receives the 4 fetched channel vectors (rgba)
 */
static void
lp_build_fetch_texel(struct lp_build_sample_context *bld,
                     unsigned texture_unit,
                     const LLVMValueRef *coords,
                     LLVMValueRef explicit_lod,
                     const LLVMValueRef *offsets,
                     LLVMValueRef *colors_out)
{
   struct lp_build_context *perquadi_bld = &bld->lodi_bld;
   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
   unsigned dims = bld->dims, chan;
   unsigned target = bld->static_texture_state->target;
   /* Compile-time policy: return zero for out-of-bounds fetches (d3d10 /
    * robust buffer access behavior) rather than returning arbitrary data. */
   boolean out_of_bound_ret_zero = TRUE;
   LLVMValueRef size, ilevel;
   LLVMValueRef row_stride_vec = NULL, img_stride_vec = NULL;
   LLVMValueRef x = coords[0], y = coords[1], z = coords[2];
   LLVMValueRef width, height, depth, i, j;
   LLVMValueRef offset, out_of_bounds, out1;

   /* Accumulated out-of-bounds mask; starts all-false and is OR'd with each
    * bounds/level check below. */
   out_of_bounds = int_coord_bld->zero;

   /* Select the mip level: explicit lod (clamped to valid range, possibly
    * flagging out-of-bounds), or the first level / level 0 for buffers. */
   if (explicit_lod && bld->static_texture_state->target != PIPE_BUFFER) {
      if (bld->num_mips != int_coord_bld->type.length) {
         /* per-quad (or scalar) lod: pack the per-element values down */
         ilevel = lp_build_pack_aos_scalars(bld->gallivm, int_coord_bld->type,
                                            perquadi_bld->type, explicit_lod, 0);
      }
      else {
         ilevel = explicit_lod;
      }
      lp_build_nearest_mip_level(bld, texture_unit, ilevel, &ilevel,
                                 out_of_bound_ret_zero ? &out_of_bounds : NULL);
   }
   else {
      assert(bld->num_mips == 1);
      if (bld->static_texture_state->target != PIPE_BUFFER) {
         ilevel = bld->dynamic_state->first_level(bld->dynamic_state, bld->gallivm,
                                                  bld->context_ptr, texture_unit);
      }
      else {
         ilevel = lp_build_const_int32(bld->gallivm, 0);
      }
   }
   /* Fetch the selected level's dimensions and strides. */
   lp_build_mipmap_level_sizes(bld, ilevel,
                               &size,
                               &row_stride_vec, &img_stride_vec);
   lp_build_extract_image_sizes(bld, &bld->int_size_bld, int_coord_bld->type,
                                size, &width, &height, &depth);

   /* For array textures the layer coord is clamped (or flagged) separately. */
   if (target == PIPE_TEXTURE_1D_ARRAY ||
       target == PIPE_TEXTURE_2D_ARRAY) {
      if (out_of_bound_ret_zero) {
         z = lp_build_layer_coord(bld, texture_unit, FALSE, z, &out1);
         out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
      }
      else {
         z = lp_build_layer_coord(bld, texture_unit, FALSE, z, NULL);
      }
   }

   /* This is a lot like border sampling */
   if (offsets[0]) {
      /*
       * coords are really unsigned, offsets are signed, but I don't think
       * exceeding 31 bits is possible
       */
      x = lp_build_add(int_coord_bld, x, offsets[0]);
   }
   out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, x, int_coord_bld->zero);
   out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
   out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, x, width);
   out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);

   if (dims >= 2) {
      if (offsets[1]) {
         y = lp_build_add(int_coord_bld, y, offsets[1]);
      }
      out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, y, int_coord_bld->zero);
      out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
      out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, y, height);
      out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);

      if (dims >= 3) {
         if (offsets[2]) {
            z = lp_build_add(int_coord_bld, z, offsets[2]);
         }
         out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, z, int_coord_bld->zero);
         out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
         out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, z, depth);
         out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
      }
   }

   /* Compute the byte offset of the texel (plus subtexel indices i/j for
    * compressed formats). */
   lp_build_sample_offset(int_coord_bld,
                          bld->format_desc,
                          x, y, z, row_stride_vec, img_stride_vec,
                          &offset, &i, &j);

   if (bld->static_texture_state->target != PIPE_BUFFER) {
      offset = lp_build_add(int_coord_bld, offset,
                            lp_build_get_mip_offsets(bld, ilevel));
   }

   /* Zero the offset for out-of-bounds lanes so the actual load below stays
    * within the resource; the loaded garbage is replaced with zero after. */
   offset = lp_build_andnot(int_coord_bld, offset, out_of_bounds);

   lp_build_fetch_rgba_soa(bld->gallivm,
                           bld->format_desc,
                           bld->texel_type, TRUE,
                           bld->base_ptr, offset,
                           i, j,
                           bld->cache,
                           colors_out);

   if (out_of_bound_ret_zero) {
      /*
       * Only needed for ARB_robust_buffer_access_behavior and d3d10.
       * Could use min/max above instead of out-of-bounds comparisons
       * if we don't care about the result returned for out-of-bounds.
       */
      for (chan = 0; chan < 4; chan++) {
         colors_out[chan] = lp_build_select(&bld->texel_bld, out_of_bounds,
                                            bld->texel_bld.zero, colors_out[chan]);
      }
   }
}
2706
2707
2708 /**
2709 * Just set texels to white instead of actually sampling the texture.
2710 * For debugging.
2711 */
2712 void
2713 lp_build_sample_nop(struct gallivm_state *gallivm,
2714 struct lp_type type,
2715 const LLVMValueRef *coords,
2716 LLVMValueRef texel_out[4])
2717 {
2718 LLVMValueRef one = lp_build_one(gallivm, type);
2719 unsigned chan;
2720
2721 for (chan = 0; chan < 4; chan++) {
2722 texel_out[chan] = one;
2723 }
2724 }
2725
2726
/**
 * Build the actual texture sampling code.
 * 'texel' will return a vector of four LLVMValueRefs corresponding to
 * R, G, B, A.
 * \param type  vector float type to use for coords, etc.
 * \param sample_key  encodes op type (tex/fetch/gather/lodq), lod control
 *                    and lod property, shadow/offset presence (see the
 *                    LP_SAMPLER_* masks used below)
 * \param derivs  partial derivatives of (s,t,r,q) with respect to x and y
 * \param lod  explicit lod or lod bias value, depending on lod_control
 *             (mutually exclusive with derivs)
 * \param texel_out  receives the four result channel vectors
 */
static void
lp_build_sample_soa_code(struct gallivm_state *gallivm,
                         const struct lp_static_texture_state *static_texture_state,
                         const struct lp_static_sampler_state *static_sampler_state,
                         struct lp_sampler_dynamic_state *dynamic_state,
                         struct lp_type type,
                         unsigned sample_key,
                         unsigned texture_index,
                         unsigned sampler_index,
                         LLVMValueRef context_ptr,
                         LLVMValueRef thread_data_ptr,
                         const LLVMValueRef *coords,
                         const LLVMValueRef *offsets,
                         const struct lp_derivatives *derivs, /* optional */
                         LLVMValueRef lod, /* optional */
                         LLVMValueRef texel_out[4])
{
   unsigned target = static_texture_state->target;
   unsigned dims = texture_dims(target);
   unsigned num_quads = type.length / 4;
   unsigned mip_filter, min_img_filter, mag_img_filter, i;
   struct lp_build_sample_context bld;
   /* Local, possibly modified copy of the sampler state; several cases below
    * (gather, cube wrap override, level_zero_only) patch it up. */
   struct lp_static_sampler_state derived_sampler_state = *static_sampler_state;
   LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef tex_width, newcoords[5];
   enum lp_sampler_lod_property lod_property;
   enum lp_sampler_lod_control lod_control;
   enum lp_sampler_op_type op_type;
   LLVMValueRef lod_bias = NULL;
   LLVMValueRef explicit_lod = NULL;
   boolean op_is_tex, op_is_lodq, op_is_gather;

   if (0) {
      enum pipe_format fmt = static_texture_state->format;
      debug_printf("Sample from %s\n", util_format_name(fmt));
   }

   /* Decode the sample_key bitfield. */
   lod_property = (sample_key & LP_SAMPLER_LOD_PROPERTY_MASK) >>
                     LP_SAMPLER_LOD_PROPERTY_SHIFT;
   lod_control = (sample_key & LP_SAMPLER_LOD_CONTROL_MASK) >>
                    LP_SAMPLER_LOD_CONTROL_SHIFT;
   op_type = (sample_key & LP_SAMPLER_OP_TYPE_MASK) >>
                 LP_SAMPLER_OP_TYPE_SHIFT;

   op_is_tex = op_type == LP_SAMPLER_OP_TEXTURE;
   op_is_lodq = op_type == LP_SAMPLER_OP_LODQ;
   op_is_gather = op_type == LP_SAMPLER_OP_GATHER;

   /* The 'lod' argument is overloaded: bias, explicit lod, or unused
    * (derivatives carry the information instead). */
   if (lod_control == LP_SAMPLER_LOD_BIAS) {
      lod_bias = lod;
      assert(lod);
      assert(derivs == NULL);
   }
   else if (lod_control == LP_SAMPLER_LOD_EXPLICIT) {
      explicit_lod = lod;
      assert(lod);
      assert(derivs == NULL);
   }
   else if (lod_control == LP_SAMPLER_LOD_DERIVATIVES) {
      assert(derivs);
      assert(lod == NULL);
   }
   else {
      assert(derivs == NULL);
      assert(lod == NULL);
   }

   if (static_texture_state->format == PIPE_FORMAT_NONE) {
      /*
       * If there's nothing bound, format is NONE, and we must return
       * all zero as mandated by d3d10 in this case.
       */
      unsigned chan;
      LLVMValueRef zero = lp_build_zero(gallivm, type);
      for (chan = 0; chan < 4; chan++) {
         texel_out[chan] = zero;
      }
      return;
   }

   assert(type.floating);

   /* Setup our build context */
   memset(&bld, 0, sizeof bld);
   bld.gallivm = gallivm;
   bld.context_ptr = context_ptr;
   bld.static_sampler_state = &derived_sampler_state;
   bld.static_texture_state = static_texture_state;
   bld.dynamic_state = dynamic_state;
   bld.format_desc = util_format_description(static_texture_state->format);
   bld.dims = dims;

   /* lodq always uses the accurate per-pixel paths (no quad/rho/brilinear
    * shortcuts), as do the corresponding debug flags. */
   if (gallivm_debug & GALLIVM_DEBUG_NO_QUAD_LOD || op_is_lodq) {
      bld.no_quad_lod = TRUE;
   }
   if (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX || op_is_lodq) {
      bld.no_rho_approx = TRUE;
   }
   if (gallivm_debug & GALLIVM_DEBUG_NO_BRILINEAR || op_is_lodq) {
      bld.no_brilinear = TRUE;
   }

   bld.vector_width = lp_type_width(type);

   bld.float_type = lp_type_float(32);
   bld.int_type = lp_type_int(32);
   bld.coord_type = type;
   bld.int_coord_type = lp_int_type(type);
   bld.float_size_in_type = lp_type_float(32);
   bld.float_size_in_type.length = dims > 1 ? 4 : 1;
   bld.int_size_in_type = lp_int_type(bld.float_size_in_type);
   bld.texel_type = type;

   /* always using the first channel hopefully should be safe,
    * if not things WILL break in other places anyway.
    */
   if (bld.format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB &&
       bld.format_desc->channel[0].pure_integer) {
      if (bld.format_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) {
         bld.texel_type = lp_type_int_vec(type.width, type.width * type.length);
      }
      else if (bld.format_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED) {
         bld.texel_type = lp_type_uint_vec(type.width, type.width * type.length);
      }
   }
   else if (util_format_has_stencil(bld.format_desc) &&
       !util_format_has_depth(bld.format_desc)) {
      /* for stencil only formats, sample stencil (uint) */
      bld.texel_type = lp_type_int_vec(type.width, type.width * type.length);
   }

   /* If only level zero can ever be sampled, mipmapping can be dropped
    * entirely (but not for lodq, which must report the real lod). */
   if (!static_texture_state->level_zero_only ||
       !static_sampler_state->max_lod_pos || op_is_lodq) {
      derived_sampler_state.min_mip_filter = static_sampler_state->min_mip_filter;
   } else {
      derived_sampler_state.min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
   }
   if (op_is_gather) {
      /*
       * gather4 is exactly like GL_LINEAR filtering but in the end skipping
       * the actual filtering. Using mostly the same paths, so cube face
       * selection, coord wrapping etc. all naturally uses the same code.
       */
      derived_sampler_state.min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
      derived_sampler_state.min_img_filter = PIPE_TEX_FILTER_LINEAR;
      derived_sampler_state.mag_img_filter = PIPE_TEX_FILTER_LINEAR;
   }
   mip_filter = derived_sampler_state.min_mip_filter;

   if (0) {
      debug_printf("  .min_mip_filter = %u\n", derived_sampler_state.min_mip_filter);
   }

   if (static_texture_state->target == PIPE_TEXTURE_CUBE ||
       static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY)
   {
      /*
       * Seamless filtering ignores wrap modes.
       * Setting to CLAMP_TO_EDGE is correct for nearest filtering, for
       * bilinear it's not correct but way better than using for instance repeat.
       * Note we even set this for non-seamless. Technically GL allows any wrap
       * mode, which made sense when supporting true borders (can get seamless
       * effect with border and CLAMP_TO_BORDER), but gallium doesn't support
       * borders and d3d9 requires wrap modes to be ignored and it's a pain to fix
       * up the sampler state (as it makes it texture dependent).
       */
      derived_sampler_state.wrap_s = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
      derived_sampler_state.wrap_t = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
   }
   /*
    * We could force CLAMP to CLAMP_TO_EDGE here if min/mag filter is nearest,
    * so AoS path could be used. Not sure it's worth the trouble...
    */

   min_img_filter = derived_sampler_state.min_img_filter;
   mag_img_filter = derived_sampler_state.mag_img_filter;


   /*
    * This is all a bit complicated different paths are chosen for performance
    * reasons.
    * Essentially, there can be 1 lod per element, 1 lod per quad or 1 lod for
    * everything (the last two options are equivalent for 4-wide case).
    * If there's per-quad lod but we split to 4-wide so we can use AoS, per-quad
    * lod is calculated then the lod value extracted afterwards so making this
    * case basically the same as far as lod handling is concerned for the
    * further sample/filter code as the 1 lod for everything case.
    * Different lod handling mostly shows up when building mipmap sizes
    * (lp_build_mipmap_level_sizes() and friends) and also in filtering
    * (getting the fractional part of the lod to the right texels).
    */

   /*
    * There are other situations where at least the multiple int lods could be
    * avoided like min and max lod being equal.
    */
   bld.num_mips = bld.num_lods = 1;

   if (bld.no_quad_lod && bld.no_rho_approx &&
       ((mip_filter != PIPE_TEX_MIPFILTER_NONE && op_is_tex &&
         (static_texture_state->target == PIPE_TEXTURE_CUBE ||
          static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY)) ||
        op_is_lodq)) {
      /*
       * special case for using per-pixel lod even for implicit lod,
       * which is generally never required (ok by APIs) except to please
       * some (somewhat broken imho) tests (because per-pixel face selection
       * can cause derivatives to be different for pixels outside the primitive
       * due to the major axis division even if pre-project derivatives are
       * looking normal).
       * For lodq, we do it to simply avoid scalar pack / unpack (albeit for
       * cube maps we do indeed get per-pixel lod values).
       */
      bld.num_mips = type.length;
      bld.num_lods = type.length;
   }
   else if (lod_property == LP_SAMPLER_LOD_PER_ELEMENT ||
       (explicit_lod || lod_bias || derivs)) {
      if ((!op_is_tex && target != PIPE_BUFFER) ||
          (op_is_tex && mip_filter != PIPE_TEX_MIPFILTER_NONE)) {
         bld.num_mips = type.length;
         bld.num_lods = type.length;
      }
      else if (op_is_tex && min_img_filter != mag_img_filter) {
         /* per-element min/mag decision even though the mip level is shared */
         bld.num_mips = 1;
         bld.num_lods = type.length;
      }
   }
   /* TODO: for true scalar_lod should only use 1 lod value */
   else if ((!op_is_tex && explicit_lod && target != PIPE_BUFFER) ||
            (op_is_tex && mip_filter != PIPE_TEX_MIPFILTER_NONE)) {
      bld.num_mips = num_quads;
      bld.num_lods = num_quads;
   }
   else if (op_is_tex && min_img_filter != mag_img_filter) {
      bld.num_mips = 1;
      bld.num_lods = num_quads;
   }


   bld.lodf_type = type;
   /* we want native vector size to be able to use our intrinsics */
   if (bld.num_lods != type.length) {
      /* TODO: this currently always has to be per-quad or per-element */
      bld.lodf_type.length = type.length > 4 ? ((type.length + 15) / 16) * 4 : 1;
   }
   bld.lodi_type = lp_int_type(bld.lodf_type);
   bld.levelf_type = bld.lodf_type;
   if (bld.num_mips == 1) {
      bld.levelf_type.length = 1;
   }
   bld.leveli_type = lp_int_type(bld.levelf_type);
   bld.float_size_type = bld.float_size_in_type;
   /* Note: size vectors may not be native. They contain minified w/h/d/_ values,
    * with per-element lod that is w0/h0/d0/_/w1/h1/d1_/... so up to 8x4f32 */
   if (bld.num_mips > 1) {
      bld.float_size_type.length = bld.num_mips == type.length ?
                                      bld.num_mips * bld.float_size_in_type.length :
                                      type.length;
   }
   bld.int_size_type = lp_int_type(bld.float_size_type);

   lp_build_context_init(&bld.float_bld, gallivm, bld.float_type);
   lp_build_context_init(&bld.float_vec_bld, gallivm, type);
   lp_build_context_init(&bld.int_bld, gallivm, bld.int_type);
   lp_build_context_init(&bld.coord_bld, gallivm, bld.coord_type);
   lp_build_context_init(&bld.int_coord_bld, gallivm, bld.int_coord_type);
   lp_build_context_init(&bld.int_size_in_bld, gallivm, bld.int_size_in_type);
   lp_build_context_init(&bld.float_size_in_bld, gallivm, bld.float_size_in_type);
   lp_build_context_init(&bld.int_size_bld, gallivm, bld.int_size_type);
   lp_build_context_init(&bld.float_size_bld, gallivm, bld.float_size_type);
   lp_build_context_init(&bld.texel_bld, gallivm, bld.texel_type);
   lp_build_context_init(&bld.levelf_bld, gallivm, bld.levelf_type);
   lp_build_context_init(&bld.leveli_bld, gallivm, bld.leveli_type);
   lp_build_context_init(&bld.lodf_bld, gallivm, bld.lodf_type);
   lp_build_context_init(&bld.lodi_bld, gallivm, bld.lodi_type);

   /* Get the dynamic state */
   tex_width = dynamic_state->width(dynamic_state, gallivm,
                                    context_ptr, texture_index);
   bld.row_stride_array = dynamic_state->row_stride(dynamic_state, gallivm,
                                                    context_ptr, texture_index);
   bld.img_stride_array = dynamic_state->img_stride(dynamic_state, gallivm,
                                                    context_ptr, texture_index);
   bld.base_ptr = dynamic_state->base_ptr(dynamic_state, gallivm,
                                          context_ptr, texture_index);
   bld.mip_offsets = dynamic_state->mip_offsets(dynamic_state, gallivm,
                                                context_ptr, texture_index);
   /* Note that mip_offsets is an array[level] of offsets to texture images */

   if (dynamic_state->cache_ptr && thread_data_ptr) {
      bld.cache = dynamic_state->cache_ptr(dynamic_state, gallivm,
                                           thread_data_ptr, texture_index);
   }

   /* width, height, depth as single int vector */
   if (dims <= 1) {
      bld.int_size = tex_width;
   }
   else {
      bld.int_size = LLVMBuildInsertElement(builder, bld.int_size_in_bld.undef,
                                            tex_width,
                                            LLVMConstInt(i32t, 0, 0), "");
      if (dims >= 2) {
         LLVMValueRef tex_height =
            dynamic_state->height(dynamic_state, gallivm,
                                  context_ptr, texture_index);
         bld.int_size = LLVMBuildInsertElement(builder, bld.int_size,
                                               tex_height,
                                               LLVMConstInt(i32t, 1, 0), "");
         if (dims >= 3) {
            LLVMValueRef tex_depth =
               dynamic_state->depth(dynamic_state, gallivm, context_ptr,
                                    texture_index);
            bld.int_size = LLVMBuildInsertElement(builder, bld.int_size,
                                                  tex_depth,
                                                  LLVMConstInt(i32t, 2, 0), "");
         }
      }
   }

   /* Work on a local copy of the coords; some paths below rewrite them. */
   for (i = 0; i < 5; i++) {
      newcoords[i] = coords[i];
   }

   if (util_format_is_pure_integer(static_texture_state->format) &&
       !util_format_has_depth(bld.format_desc) && op_is_tex &&
       (static_sampler_state->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR ||
        static_sampler_state->min_img_filter == PIPE_TEX_FILTER_LINEAR ||
        static_sampler_state->mag_img_filter == PIPE_TEX_FILTER_LINEAR)) {
      /*
       * Bail if impossible filtering is specified (the awkard additional
       * depth check is because it is legal in gallium to have things like S8Z24
       * here which would say it's pure int despite such formats should sample
       * the depth component).
       * In GL such filters make the texture incomplete, this makes it robust
       * against state trackers which set this up regardless (we'd crash in the
       * lerp later otherwise).
       * At least in some apis it may be legal to use such filters with lod
       * queries and/or gather (at least for gather d3d10 says only the wrap
       * bits are really used hence filter bits are likely simply ignored).
       * For fetch, we don't get valid samplers either way here.
       */
      unsigned chan;
      LLVMValueRef zero = lp_build_zero(gallivm, type);
      for (chan = 0; chan < 4; chan++) {
         texel_out[chan] = zero;
      }
      return;
   }

   if (0) {
      /* For debug: no-op texture sampling */
      lp_build_sample_nop(gallivm,
                          bld.texel_type,
                          newcoords,
                          texel_out);
   }

   else if (op_type == LP_SAMPLER_OP_FETCH) {
      lp_build_fetch_texel(&bld, texture_index, newcoords,
                           lod, offsets,
                           texel_out);
   }

   else {
      LLVMValueRef lod_fpart = NULL, lod_positive = NULL;
      /* NOTE: this 'lod' intentionally shadows the parameter (which has
       * already been consumed into lod_bias/explicit_lod above). */
      LLVMValueRef ilevel0 = NULL, ilevel1 = NULL, lod = NULL;
      boolean use_aos;

      /* Decide between the fast fixed-point AoS path and the general SoA
       * path; AoS requires an 8unorm-compatible format, plain texturing,
       * no shadow compare, and simple wrap modes. */
      use_aos = util_format_fits_8unorm(bld.format_desc) &&
                op_is_tex &&
                /* not sure this is strictly needed or simply impossible */
                derived_sampler_state.compare_mode == PIPE_TEX_COMPARE_NONE &&
                lp_is_simple_wrap_mode(derived_sampler_state.wrap_s);

      /* AoS can't handle more lods than quads unless min/mag filters match
       * (per-element min/mag selection isn't representable there). */
      use_aos &= bld.num_lods <= num_quads ||
                 derived_sampler_state.min_img_filter ==
                    derived_sampler_state.mag_img_filter;
      if (dims > 1) {
         use_aos &= lp_is_simple_wrap_mode(derived_sampler_state.wrap_t);
         if (dims > 2) {
            use_aos &= lp_is_simple_wrap_mode(derived_sampler_state.wrap_r);
         }
      }
      if ((static_texture_state->target == PIPE_TEXTURE_CUBE ||
           static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) &&
          derived_sampler_state.seamless_cube_map &&
          (derived_sampler_state.min_img_filter == PIPE_TEX_FILTER_LINEAR ||
           derived_sampler_state.mag_img_filter == PIPE_TEX_FILTER_LINEAR)) {
         /* theoretically possible with AoS filtering but not implemented (complex!) */
         use_aos = 0;
      }

      if ((gallivm_debug & GALLIVM_DEBUG_PERF) &&
          !use_aos && util_format_fits_8unorm(bld.format_desc)) {
         debug_printf("%s: using floating point linear filtering for %s\n",
                      __FUNCTION__, bld.format_desc->short_name);
         debug_printf(" min_img %d mag_img %d mip %d target %d seamless %d"
                      " wraps %d wrapt %d wrapr %d\n",
                      derived_sampler_state.min_img_filter,
                      derived_sampler_state.mag_img_filter,
                      derived_sampler_state.min_mip_filter,
                      static_texture_state->target,
                      derived_sampler_state.seamless_cube_map,
                      derived_sampler_state.wrap_s,
                      derived_sampler_state.wrap_t,
                      derived_sampler_state.wrap_r);
      }

      /* Compute lod, mip level indices and fractional mip weight (and do
       * cube face selection / coord projection as needed). */
      lp_build_sample_common(&bld, op_is_lodq, texture_index, sampler_index,
                             newcoords,
                             derivs, lod_bias, explicit_lod,
                             &lod_positive, &lod, &lod_fpart,
                             &ilevel0, &ilevel1);

      if (op_is_lodq) {
         /* lod query: return (fractional, clamped lod, 0, 0), no filtering. */
         texel_out[0] = lod_fpart;
         texel_out[1] = lod;
         texel_out[2] = texel_out[3] = bld.coord_bld.zero;
         return;
      }

      if (use_aos && static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) {
         /* The aos path doesn't do seamless filtering so simply add cube layer
          * to face now.
          */
         newcoords[2] = lp_build_add(&bld.int_coord_bld, newcoords[2], newcoords[3]);
      }

      /*
       * we only try 8-wide sampling with soa or if we have AVX2
       * as it appears to be a loss with just AVX)
       */
      if (num_quads == 1 || !use_aos ||
          (util_cpu_caps.has_avx2 &&
           (bld.num_lods == 1 ||
            derived_sampler_state.min_img_filter == derived_sampler_state.mag_img_filter))) {
         if (use_aos) {
            /* do sampling/filtering with fixed pt arithmetic */
            lp_build_sample_aos(&bld, sampler_index,
                                newcoords[0], newcoords[1],
                                newcoords[2],
                                offsets, lod_positive, lod_fpart,
                                ilevel0, ilevel1,
                                texel_out);
         }

         else {
            lp_build_sample_general(&bld, sampler_index,
                                    op_type == LP_SAMPLER_OP_GATHER,
                                    newcoords, offsets,
                                    lod_positive, lod_fpart,
                                    ilevel0, ilevel1,
                                    texel_out);
         }
      }
      else {
         /* Split the wide vector into 4-wide quads and sample each with a
          * separate (mostly duplicated) 4-wide build context. */
         unsigned j;
         struct lp_build_sample_context bld4;
         struct lp_type type4 = type;
         unsigned i;
         LLVMValueRef texelout4[4];
         LLVMValueRef texelouttmp[4][LP_MAX_VECTOR_LENGTH/16];

         type4.length = 4;

         /* Setup our build context */
         memset(&bld4, 0, sizeof bld4);
         bld4.no_quad_lod = bld.no_quad_lod;
         bld4.no_rho_approx = bld.no_rho_approx;
         bld4.no_brilinear = bld.no_brilinear;
         bld4.gallivm = bld.gallivm;
         bld4.context_ptr = bld.context_ptr;
         bld4.static_texture_state = bld.static_texture_state;
         bld4.static_sampler_state = bld.static_sampler_state;
         bld4.dynamic_state = bld.dynamic_state;
         bld4.format_desc = bld.format_desc;
         bld4.dims = bld.dims;
         bld4.row_stride_array = bld.row_stride_array;
         bld4.img_stride_array = bld.img_stride_array;
         bld4.base_ptr = bld.base_ptr;
         bld4.mip_offsets = bld.mip_offsets;
         bld4.int_size = bld.int_size;
         bld4.cache = bld.cache;

         bld4.vector_width = lp_type_width(type4);

         bld4.float_type = lp_type_float(32);
         bld4.int_type = lp_type_int(32);
         bld4.coord_type = type4;
         bld4.int_coord_type = lp_int_type(type4);
         bld4.float_size_in_type = lp_type_float(32);
         bld4.float_size_in_type.length = dims > 1 ? 4 : 1;
         bld4.int_size_in_type = lp_int_type(bld4.float_size_in_type);
         bld4.texel_type = bld.texel_type;
         bld4.texel_type.length = 4;

         /* Mirror of the num_mips/num_lods derivation above, but relative to
          * the 4-wide type. */
         bld4.num_mips = bld4.num_lods = 1;
         if (bld4.no_quad_lod && bld4.no_rho_approx &&
             (static_texture_state->target == PIPE_TEXTURE_CUBE ||
              static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) &&
             (op_is_tex && mip_filter != PIPE_TEX_MIPFILTER_NONE)) {
            bld4.num_mips = type4.length;
            bld4.num_lods = type4.length;
         }
         if (lod_property == LP_SAMPLER_LOD_PER_ELEMENT &&
             (explicit_lod || lod_bias || derivs)) {
            if ((!op_is_tex && target != PIPE_BUFFER) ||
                (op_is_tex && mip_filter != PIPE_TEX_MIPFILTER_NONE)) {
               bld4.num_mips = type4.length;
               bld4.num_lods = type4.length;
            }
            else if (op_is_tex && min_img_filter != mag_img_filter) {
               bld4.num_mips = 1;
               bld4.num_lods = type4.length;
            }
         }

         /* we want native vector size to be able to use our intrinsics */
         bld4.lodf_type = type4;
         if (bld4.num_lods != type4.length) {
            bld4.lodf_type.length = 1;
         }
         bld4.lodi_type = lp_int_type(bld4.lodf_type);
         bld4.levelf_type = type4;
         if (bld4.num_mips != type4.length) {
            bld4.levelf_type.length = 1;
         }
         bld4.leveli_type = lp_int_type(bld4.levelf_type);
         bld4.float_size_type = bld4.float_size_in_type;
         if (bld4.num_mips > 1) {
            bld4.float_size_type.length = bld4.num_mips == type4.length ?
                                             bld4.num_mips * bld4.float_size_in_type.length :
                                             type4.length;
         }
         bld4.int_size_type = lp_int_type(bld4.float_size_type);

         lp_build_context_init(&bld4.float_bld, gallivm, bld4.float_type);
         lp_build_context_init(&bld4.float_vec_bld, gallivm, type4);
         lp_build_context_init(&bld4.int_bld, gallivm, bld4.int_type);
         lp_build_context_init(&bld4.coord_bld, gallivm, bld4.coord_type);
         lp_build_context_init(&bld4.int_coord_bld, gallivm, bld4.int_coord_type);
         lp_build_context_init(&bld4.int_size_in_bld, gallivm, bld4.int_size_in_type);
         lp_build_context_init(&bld4.float_size_in_bld, gallivm, bld4.float_size_in_type);
         lp_build_context_init(&bld4.int_size_bld, gallivm, bld4.int_size_type);
         lp_build_context_init(&bld4.float_size_bld, gallivm, bld4.float_size_type);
         lp_build_context_init(&bld4.texel_bld, gallivm, bld4.texel_type);
         lp_build_context_init(&bld4.levelf_bld, gallivm, bld4.levelf_type);
         lp_build_context_init(&bld4.leveli_bld, gallivm, bld4.leveli_type);
         lp_build_context_init(&bld4.lodf_bld, gallivm, bld4.lodf_type);
         lp_build_context_init(&bld4.lodi_bld, gallivm, bld4.lodi_type);

         for (i = 0; i < num_quads; i++) {
            LLVMValueRef s4, t4, r4;
            LLVMValueRef lod_positive4, lod_fpart4 = NULL;
            LLVMValueRef ilevel04, ilevel14 = NULL;
            LLVMValueRef offsets4[4] = { NULL };
            unsigned num_lods = bld4.num_lods;

            /* Slice out this quad's coords, offsets and lod data. */
            s4 = lp_build_extract_range(gallivm, newcoords[0], 4*i, 4);
            t4 = lp_build_extract_range(gallivm, newcoords[1], 4*i, 4);
            r4 = lp_build_extract_range(gallivm, newcoords[2], 4*i, 4);

            if (offsets[0]) {
               offsets4[0] = lp_build_extract_range(gallivm, offsets[0], 4*i, 4);
               if (dims > 1) {
                  offsets4[1] = lp_build_extract_range(gallivm, offsets[1], 4*i, 4);
                  if (dims > 2) {
                     offsets4[2] = lp_build_extract_range(gallivm, offsets[2], 4*i, 4);
                  }
               }
            }
            lod_positive4 = lp_build_extract_range(gallivm, lod_positive, num_lods * i, num_lods);
            ilevel04 = bld.num_mips == 1 ? ilevel0 :
                          lp_build_extract_range(gallivm, ilevel0, num_lods * i, num_lods);
            if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
               ilevel14 = lp_build_extract_range(gallivm, ilevel1, num_lods * i, num_lods);
               lod_fpart4 = lp_build_extract_range(gallivm, lod_fpart, num_lods * i, num_lods);
            }

            if (use_aos) {
               /* do sampling/filtering with fixed pt arithmetic */
               lp_build_sample_aos(&bld4, sampler_index,
                                   s4, t4, r4, offsets4,
                                   lod_positive4, lod_fpart4,
                                   ilevel04, ilevel14,
                                   texelout4);
            }

            else {
               /* this path is currently unreachable and hence might break easily... */
               LLVMValueRef newcoords4[5];
               newcoords4[0] = s4;
               newcoords4[1] = t4;
               newcoords4[2] = r4;
               newcoords4[3] = lp_build_extract_range(gallivm, newcoords[3], 4*i, 4);
               newcoords4[4] = lp_build_extract_range(gallivm, newcoords[4], 4*i, 4);

               lp_build_sample_general(&bld4, sampler_index,
                                       op_type == LP_SAMPLER_OP_GATHER,
                                       newcoords4, offsets4,
                                       lod_positive4, lod_fpart4,
                                       ilevel04, ilevel14,
                                       texelout4);
            }
            for (j = 0; j < 4; j++) {
               texelouttmp[j][i] = texelout4[j];
            }
         }

         /* Re-assemble the per-quad results into full-width vectors. */
         for (j = 0; j < 4; j++) {
            texel_out[j] = lp_build_concat(gallivm, texelouttmp[j], type4, num_quads);
         }
      }
   }

   if (target != PIPE_BUFFER && op_type != LP_SAMPLER_OP_GATHER) {
      apply_sampler_swizzle(&bld, texel_out);
   }

   /*
    * texel type can be a (32bit) int/uint (for pure int formats only),
    * however we are expected to always return floats (storage is untyped).
    */
   if (!bld.texel_type.floating) {
      unsigned chan;
      for (chan = 0; chan < 4; chan++) {
         texel_out[chan] = LLVMBuildBitCast(builder, texel_out[chan],
                                            lp_build_vec_type(gallivm, type), "");
      }
   }
}
3369
3370
/* NOTE(review): presumably selects between emitting sampling code inline vs.
 * calling a shared generated function (see lp_build_sample_gen_func below) --
 * the consuming code is outside this chunk, confirm there. */
#define USE_TEX_FUNC_CALL 1

/* Upper bound on the number of LLVM arguments a generated texture sampling
 * function can take (sized for the args[] array in the caller below). */
#define LP_MAX_TEX_FUNC_ARGS 32
3374
3375 static inline void
3376 get_target_info(enum pipe_texture_target target,
3377 unsigned *num_coords, unsigned *num_derivs,
3378 unsigned *num_offsets, unsigned *layer)
3379 {
3380 unsigned dims = texture_dims(target);
3381 *num_coords = dims;
3382 *num_offsets = dims;
3383 *num_derivs = (target == PIPE_TEXTURE_CUBE ||
3384 target == PIPE_TEXTURE_CUBE_ARRAY) ? 3 : dims;
3385 *layer = has_layer_coord(target) ? 2: 0;
3386 if (target == PIPE_TEXTURE_CUBE_ARRAY) {
3387 /*
3388 * dims doesn't include r coord for cubes - this is handled
3389 * by layer instead, but need to fix up for cube arrays...
3390 */
3391 *layer = 3;
3392 *num_coords = 3;
3393 }
3394 }
3395
3396
/**
 * Generate the function body for a texture sampling function.
 *
 * Unpacks the LLVM function's parameters (whose order is dictated by
 * sample_key / target and must match what the caller packed in), then emits
 * the sampling code into a fresh "entry" block and returns the four texel
 * channels as an aggregate.
 *
 * \param function  the (already declared) LLVM function to fill in
 * \param num_args  number of parameters the caller packed; asserted to match
 *                  what gets unpacked here
 * \param sample_key  encodes which optional parameters (shadow coord,
 *                    offsets, lod/bias, derivatives) are present
 */
static void
lp_build_sample_gen_func(struct gallivm_state *gallivm,
                         const struct lp_static_texture_state *static_texture_state,
                         const struct lp_static_sampler_state *static_sampler_state,
                         struct lp_sampler_dynamic_state *dynamic_state,
                         struct lp_type type,
                         unsigned texture_index,
                         unsigned sampler_index,
                         LLVMValueRef function,
                         unsigned num_args,
                         unsigned sample_key)
{
   LLVMBuilderRef old_builder;
   LLVMBasicBlockRef block;
   LLVMValueRef coords[5];
   LLVMValueRef offsets[3] = { NULL };
   LLVMValueRef lod = NULL;
   LLVMValueRef context_ptr;
   LLVMValueRef thread_data_ptr = NULL;
   LLVMValueRef texel_out[4];
   struct lp_derivatives derivs;
   struct lp_derivatives *deriv_ptr = NULL;
   unsigned num_param = 0;
   unsigned i, num_coords, num_derivs, num_offsets, layer;
   enum lp_sampler_lod_control lod_control;
   boolean need_cache = FALSE;

   lod_control = (sample_key & LP_SAMPLER_LOD_CONTROL_MASK) >>
                    LP_SAMPLER_LOD_CONTROL_SHIFT;

   get_target_info(static_texture_state->target,
                   &num_coords, &num_derivs, &num_offsets, &layer);

   /* A thread-local cache pointer is only passed for S3TC textures. */
   if (dynamic_state->cache_ptr) {
      const struct util_format_description *format_desc;
      format_desc = util_format_description(static_texture_state->format);
      if (format_desc && format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
         need_cache = TRUE;
      }
   }

   /* "unpack" arguments -- the num_param++ order here is the ABI between
    * this function and the packing code in the caller; keep them in sync. */
   context_ptr = LLVMGetParam(function, num_param++);
   if (need_cache) {
      thread_data_ptr = LLVMGetParam(function, num_param++);
   }
   for (i = 0; i < num_coords; i++) {
      coords[i] = LLVMGetParam(function, num_param++);
   }
   for (i = num_coords; i < 5; i++) {
      /* This is rather unfortunate... */
      coords[i] = lp_build_undef(gallivm, type);
   }
   if (layer) {
      coords[layer] = LLVMGetParam(function, num_param++);
   }
   if (sample_key & LP_SAMPLER_SHADOW) {
      coords[4] = LLVMGetParam(function, num_param++);
   }
   if (sample_key & LP_SAMPLER_OFFSETS) {
      for (i = 0; i < num_offsets; i++) {
         offsets[i] = LLVMGetParam(function, num_param++);
      }
   }
   if (lod_control == LP_SAMPLER_LOD_BIAS ||
       lod_control == LP_SAMPLER_LOD_EXPLICIT) {
      lod = LLVMGetParam(function, num_param++);
   }
   else if (lod_control == LP_SAMPLER_LOD_DERIVATIVES) {
      for (i = 0; i < num_derivs; i++) {
         derivs.ddx[i] = LLVMGetParam(function, num_param++);
         derivs.ddy[i] = LLVMGetParam(function, num_param++);
      }
      deriv_ptr = &derivs;
   }

   assert(num_args == num_param);

   /*
    * Function body
    */

   /* Emit into a fresh builder so the caller's insertion point is untouched. */
   old_builder = gallivm->builder;
   block = LLVMAppendBasicBlockInContext(gallivm->context, function, "entry");
   gallivm->builder = LLVMCreateBuilderInContext(gallivm->context);
   LLVMPositionBuilderAtEnd(gallivm->builder, block);

   lp_build_sample_soa_code(gallivm,
                            static_texture_state,
                            static_sampler_state,
                            dynamic_state,
                            type,
                            sample_key,
                            texture_index,
                            sampler_index,
                            context_ptr,
                            thread_data_ptr,
                            coords,
                            offsets,
                            deriv_ptr,
                            lod,
                            texel_out);

   LLVMBuildAggregateRet(gallivm->builder, texel_out, 4);

   LLVMDisposeBuilder(gallivm->builder);
   gallivm->builder = old_builder;

   gallivm_verify_function(gallivm, function);
}
3510
3511
3512 /**
3513 * Call the matching function for texture sampling.
3514 * If there's no match, generate a new one.
3515 */
3516 static void
3517 lp_build_sample_soa_func(struct gallivm_state *gallivm,
3518 const struct lp_static_texture_state *static_texture_state,
3519 const struct lp_static_sampler_state *static_sampler_state,
3520 struct lp_sampler_dynamic_state *dynamic_state,
3521 const struct lp_sampler_params *params)
3522 {
3523 LLVMBuilderRef builder = gallivm->builder;
3524 LLVMModuleRef module = LLVMGetGlobalParent(LLVMGetBasicBlockParent(
3525 LLVMGetInsertBlock(builder)));
3526 LLVMValueRef function, inst;
3527 LLVMValueRef args[LP_MAX_TEX_FUNC_ARGS];
3528 LLVMBasicBlockRef bb;
3529 LLVMValueRef tex_ret;
3530 unsigned num_args = 0;
3531 char func_name[64];
3532 unsigned i, num_coords, num_derivs, num_offsets, layer;
3533 unsigned texture_index = params->texture_index;
3534 unsigned sampler_index = params->sampler_index;
3535 unsigned sample_key = params->sample_key;
3536 const LLVMValueRef *coords = params->coords;
3537 const LLVMValueRef *offsets = params->offsets;
3538 const struct lp_derivatives *derivs = params->derivs;
3539 enum lp_sampler_lod_control lod_control;
3540 boolean need_cache = FALSE;
3541
3542 lod_control = (sample_key & LP_SAMPLER_LOD_CONTROL_MASK) >>
3543 LP_SAMPLER_LOD_CONTROL_SHIFT;
3544
3545 get_target_info(static_texture_state->target,
3546 &num_coords, &num_derivs, &num_offsets, &layer);
3547
3548 if (dynamic_state->cache_ptr) {
3549 const struct util_format_description *format_desc;
3550 format_desc = util_format_description(static_texture_state->format);
3551 if (format_desc && format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
3552 /*
3553 * This is not 100% correct, if we have cache but the
3554 * util_format_s3tc_prefer is true the cache won't get used
3555 * regardless (could hook up the block decode there...) */
3556 need_cache = TRUE;
3557 }
3558 }
3559 /*
3560 * texture function matches are found by name.
3561 * Thus the name has to include both the texture and sampler unit
3562 * (which covers all static state) plus the actual texture function
3563 * (including things like offsets, shadow coord, lod control).
3564 * Additionally lod_property has to be included too.
3565 */
3566
3567 util_snprintf(func_name, sizeof(func_name), "texfunc_res_%d_sam_%d_%x",
3568 texture_index, sampler_index, sample_key);
3569
3570 function = LLVMGetNamedFunction(module, func_name);
3571
3572 if(!function) {
3573 LLVMTypeRef arg_types[LP_MAX_TEX_FUNC_ARGS];
3574 LLVMTypeRef ret_type;
3575 LLVMTypeRef function_type;
3576 LLVMTypeRef val_type[4];
3577 unsigned num_param = 0;
3578
3579 /*
3580 * Generate the function prototype.
3581 */
3582
3583 arg_types[num_param++] = LLVMTypeOf(params->context_ptr);
3584 if (need_cache) {
3585 arg_types[num_param++] = LLVMTypeOf(params->thread_data_ptr);
3586 }
3587 for (i = 0; i < num_coords; i++) {
3588 arg_types[num_param++] = LLVMTypeOf(coords[0]);
3589 assert(LLVMTypeOf(coords[0]) == LLVMTypeOf(coords[i]));
3590 }
3591 if (layer) {
3592 arg_types[num_param++] = LLVMTypeOf(coords[layer]);
3593 assert(LLVMTypeOf(coords[0]) == LLVMTypeOf(coords[layer]));
3594 }
3595 if (sample_key & LP_SAMPLER_SHADOW) {
3596 arg_types[num_param++] = LLVMTypeOf(coords[0]);
3597 }
3598 if (sample_key & LP_SAMPLER_OFFSETS) {
3599 for (i = 0; i < num_offsets; i++) {
3600 arg_types[num_param++] = LLVMTypeOf(offsets[0]);
3601 assert(LLVMTypeOf(offsets[0]) == LLVMTypeOf(offsets[i]));
3602 }
3603 }
3604 if (lod_control == LP_SAMPLER_LOD_BIAS ||
3605 lod_control == LP_SAMPLER_LOD_EXPLICIT) {
3606 arg_types[num_param++] = LLVMTypeOf(params->lod);
3607 }
3608 else if (lod_control == LP_SAMPLER_LOD_DERIVATIVES) {
3609 for (i = 0; i < num_derivs; i++) {
3610 arg_types[num_param++] = LLVMTypeOf(derivs->ddx[i]);
3611 arg_types[num_param++] = LLVMTypeOf(derivs->ddy[i]);
3612 assert(LLVMTypeOf(derivs->ddx[0]) == LLVMTypeOf(derivs->ddx[i]));
3613 assert(LLVMTypeOf(derivs->ddy[0]) == LLVMTypeOf(derivs->ddy[i]));
3614 }
3615 }
3616
3617 val_type[0] = val_type[1] = val_type[2] = val_type[3] =
3618 lp_build_vec_type(gallivm, params->type);
3619 ret_type = LLVMStructTypeInContext(gallivm->context, val_type, 4, 0);
3620 function_type = LLVMFunctionType(ret_type, arg_types, num_param, 0);
3621 function = LLVMAddFunction(module, func_name, function_type);
3622
3623 for (i = 0; i < num_param; ++i) {
3624 if(LLVMGetTypeKind(arg_types[i]) == LLVMPointerTypeKind) {
3625
3626 lp_add_function_attr(function, i + 1, LP_FUNC_ATTR_NOALIAS);
3627 }
3628 }
3629
3630 LLVMSetFunctionCallConv(function, LLVMFastCallConv);
3631 LLVMSetLinkage(function, LLVMInternalLinkage);
3632
3633 lp_build_sample_gen_func(gallivm,
3634 static_texture_state,
3635 static_sampler_state,
3636 dynamic_state,
3637 params->type,
3638 texture_index,
3639 sampler_index,
3640 function,
3641 num_param,
3642 sample_key);
3643 }
3644
3645 num_args = 0;
3646 args[num_args++] = params->context_ptr;
3647 if (need_cache) {
3648 args[num_args++] = params->thread_data_ptr;
3649 }
3650 for (i = 0; i < num_coords; i++) {
3651 args[num_args++] = coords[i];
3652 }
3653 if (layer) {
3654 args[num_args++] = coords[layer];
3655 }
3656 if (sample_key & LP_SAMPLER_SHADOW) {
3657 args[num_args++] = coords[4];
3658 }
3659 if (sample_key & LP_SAMPLER_OFFSETS) {
3660 for (i = 0; i < num_offsets; i++) {
3661 args[num_args++] = offsets[i];
3662 }
3663 }
3664 if (lod_control == LP_SAMPLER_LOD_BIAS ||
3665 lod_control == LP_SAMPLER_LOD_EXPLICIT) {
3666 args[num_args++] = params->lod;
3667 }
3668 else if (lod_control == LP_SAMPLER_LOD_DERIVATIVES) {
3669 for (i = 0; i < num_derivs; i++) {
3670 args[num_args++] = derivs->ddx[i];
3671 args[num_args++] = derivs->ddy[i];
3672 }
3673 }
3674
3675 assert(num_args <= LP_MAX_TEX_FUNC_ARGS);
3676
3677 tex_ret = LLVMBuildCall(builder, function, args, num_args, "");
3678 bb = LLVMGetInsertBlock(builder);
3679 inst = LLVMGetLastInstruction(bb);
3680 LLVMSetInstructionCallConv(inst, LLVMFastCallConv);
3681
3682 for (i = 0; i < 4; i++) {
3683 params->texel[i] = LLVMBuildExtractValue(gallivm->builder, tex_ret, i, "");
3684 }
3685 }
3686
3687
3688 /**
3689 * Build texture sampling code.
3690 * Either via a function call or inline it directly.
3691 */
3692 void
3693 lp_build_sample_soa(const struct lp_static_texture_state *static_texture_state,
3694 const struct lp_static_sampler_state *static_sampler_state,
3695 struct lp_sampler_dynamic_state *dynamic_state,
3696 struct gallivm_state *gallivm,
3697 const struct lp_sampler_params *params)
3698 {
3699 boolean use_tex_func = FALSE;
3700
3701 /*
3702 * Do not use a function call if the sampling is "simple enough".
3703 * We define this by
3704 * a) format
3705 * b) no mips (either one level only or no mip filter)
3706 * No mips will definitely make the code smaller, though
3707 * the format requirement is a bit iffy - there's some (SoA) formats
3708 * which definitely generate less code. This does happen to catch
3709 * some important cases though which are hurt quite a bit by using
3710 * a call (though not really because of the call overhead but because
3711 * they are reusing the same texture unit with some of the same
3712 * parameters).
3713 * Ideally we'd let llvm recognize this stuff by doing IPO passes.
3714 */
3715
3716 if (USE_TEX_FUNC_CALL) {
3717 const struct util_format_description *format_desc;
3718 boolean simple_format;
3719 boolean simple_tex;
3720 enum lp_sampler_op_type op_type;
3721 format_desc = util_format_description(static_texture_state->format);
3722 simple_format = !format_desc ||
3723 (util_format_is_rgba8_variant(format_desc) &&
3724 format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB);
3725
3726 op_type = (params->sample_key & LP_SAMPLER_OP_TYPE_MASK) >>
3727 LP_SAMPLER_OP_TYPE_SHIFT;
3728 simple_tex =
3729 op_type != LP_SAMPLER_OP_TEXTURE ||
3730 ((static_sampler_state->min_mip_filter == PIPE_TEX_MIPFILTER_NONE ||
3731 static_texture_state->level_zero_only == TRUE) &&
3732 static_sampler_state->min_img_filter == static_sampler_state->mag_img_filter);
3733
3734 use_tex_func = format_desc && !(simple_format && simple_tex);
3735 }
3736
3737 if (use_tex_func) {
3738 lp_build_sample_soa_func(gallivm,
3739 static_texture_state,
3740 static_sampler_state,
3741 dynamic_state,
3742 params);
3743 }
3744 else {
3745 lp_build_sample_soa_code(gallivm,
3746 static_texture_state,
3747 static_sampler_state,
3748 dynamic_state,
3749 params->type,
3750 params->sample_key,
3751 params->texture_index,
3752 params->sampler_index,
3753 params->context_ptr,
3754 params->thread_data_ptr,
3755 params->coords,
3756 params->offsets,
3757 params->derivs,
3758 params->lod,
3759 params->texel);
3760 }
3761 }
3762
3763
/**
 * Generate code for a texture size / mip level count query.
 * Fills params->sizes_out with (per-channel, broadcast to the shader's
 * int vector type): width, then height and/or depth as the target
 * requires, the layer count for array targets in the channel right
 * after the dimensions, and (for sviewinfo queries with an explicit
 * lod) the number of mip levels in channel 3.
 */
void
lp_build_size_query_soa(struct gallivm_state *gallivm,
                        const struct lp_static_texture_state *static_state,
                        struct lp_sampler_dynamic_state *dynamic_state,
                        const struct lp_sampler_size_query_params *params)
{
   LLVMValueRef lod, level = 0, size;
   LLVMValueRef first_level = NULL;
   int dims, i;
   boolean has_array;
   /* only scalar lods handled so far, see TODO below */
   unsigned num_lods = 1;
   struct lp_build_context bld_int_vec4;
   LLVMValueRef context_ptr = params->context_ptr;
   unsigned texture_unit = params->texture_unit;
   unsigned target = params->target;

   if (static_state->format == PIPE_FORMAT_NONE) {
      /*
       * If there's nothing bound, format is NONE, and we must return
       * all zero as mandated by d3d10 in this case.
       */
      unsigned chan;
      LLVMValueRef zero = lp_build_const_vec(gallivm, params->int_type, 0.0F);
      for (chan = 0; chan < 4; chan++) {
         params->sizes_out[chan] = zero;
      }
      return;
   }

   /*
    * Do some sanity verification about bound texture and shader dcl target.
    * Not entirely sure what's possible but assume array/non-array
    * always compatible (probably not ok for OpenGL but d3d10 has no
    * distinction of arrays at the resource level).
    * Everything else looks bogus (though not entirely sure about rect/2d).
    * Currently disabled because it causes assertion failures if there's
    * nothing bound (or rather a dummy texture, not that this case would
    * return the right values).
    */
   if (0 && static_state->target != target) {
      if (static_state->target == PIPE_TEXTURE_1D)
         assert(target == PIPE_TEXTURE_1D_ARRAY);
      else if (static_state->target == PIPE_TEXTURE_1D_ARRAY)
         assert(target == PIPE_TEXTURE_1D);
      else if (static_state->target == PIPE_TEXTURE_2D)
         assert(target == PIPE_TEXTURE_2D_ARRAY);
      else if (static_state->target == PIPE_TEXTURE_2D_ARRAY)
         assert(target == PIPE_TEXTURE_2D);
      else if (static_state->target == PIPE_TEXTURE_CUBE)
         assert(target == PIPE_TEXTURE_CUBE_ARRAY);
      else if (static_state->target == PIPE_TEXTURE_CUBE_ARRAY)
         assert(target == PIPE_TEXTURE_CUBE);
      else
         assert(0);
   }

   dims = texture_dims(target);

   /* Array targets additionally report their layer count. */
   switch (target) {
   case PIPE_TEXTURE_1D_ARRAY:
   case PIPE_TEXTURE_2D_ARRAY:
   case PIPE_TEXTURE_CUBE_ARRAY:
      has_array = TRUE;
      break;
   default:
      has_array = FALSE;
      break;
   }

   assert(!params->int_type.floating);

   /* All size math is done on a 4x32bit int vector. */
   lp_build_context_init(&bld_int_vec4, gallivm, lp_type_int_vec(32, 128));

   if (params->explicit_lod) {
      /* FIXME: this needs to honor per-element lod */
      lod = LLVMBuildExtractElement(gallivm->builder, params->explicit_lod,
                                    lp_build_const_int32(gallivm, 0), "");
      first_level = dynamic_state->first_level(dynamic_state, gallivm,
                                               context_ptr, texture_unit);
      /* the queried lod is relative to the view's first (base) level */
      level = LLVMBuildAdd(gallivm->builder, lod, first_level, "level");
      lod = lp_build_broadcast_scalar(&bld_int_vec4, level);
   } else {
      lod = bld_int_vec4.zero;
   }

   size = bld_int_vec4.undef;

   /* Gather the base level dimensions into the first channels. */
   size = LLVMBuildInsertElement(gallivm->builder, size,
                                 dynamic_state->width(dynamic_state, gallivm,
                                                      context_ptr, texture_unit),
                                 lp_build_const_int32(gallivm, 0), "");

   if (dims >= 2) {
      size = LLVMBuildInsertElement(gallivm->builder, size,
                                    dynamic_state->height(dynamic_state, gallivm,
                                                          context_ptr, texture_unit),
                                    lp_build_const_int32(gallivm, 1), "");
   }

   if (dims >= 3) {
      size = LLVMBuildInsertElement(gallivm->builder, size,
                                    dynamic_state->depth(dynamic_state, gallivm,
                                                         context_ptr, texture_unit),
                                    lp_build_const_int32(gallivm, 2), "");
   }

   /* Compute the size of the selected mip level from the base size. */
   size = lp_build_minify(&bld_int_vec4, size, lod, TRUE);

   if (has_array) {
      /* layer count comes from the depth callback for array targets */
      LLVMValueRef layers = dynamic_state->depth(dynamic_state, gallivm,
                                                 context_ptr, texture_unit);
      if (target == PIPE_TEXTURE_CUBE_ARRAY) {
         /*
          * It looks like GL wants number of cubes, d3d10.1 has it undefined?
          * Could avoid this by passing in number of cubes instead of total
          * number of layers (might make things easier elsewhere too).
          */
         LLVMValueRef six = lp_build_const_int32(gallivm, 6);
         layers = LLVMBuildSDiv(gallivm->builder, layers, six, "");
      }
      /* layers are reported in the channel right after the dimensions */
      size = LLVMBuildInsertElement(gallivm->builder, size, layers,
                                    lp_build_const_int32(gallivm, dims), "");
   }

   /*
    * d3d10 requires zero for x/y/z values (but not w, i.e. mip levels)
    * if level is out of bounds (note this can't cover unbound texture
    * here, which also requires returning zero).
    */
   if (params->explicit_lod && params->is_sviewinfo) {
      LLVMValueRef last_level, out, out1;
      struct lp_build_context leveli_bld;

      /* everything is scalar for now */
      lp_build_context_init(&leveli_bld, gallivm, lp_type_int_vec(32, 32));
      last_level = dynamic_state->last_level(dynamic_state, gallivm,
                                             context_ptr, texture_unit);

      /* out-of-range = level < first_level || level > last_level */
      out = lp_build_cmp(&leveli_bld, PIPE_FUNC_LESS, level, first_level);
      out1 = lp_build_cmp(&leveli_bld, PIPE_FUNC_GREATER, level, last_level);
      out = lp_build_or(&leveli_bld, out, out1);
      if (num_lods == 1) {
         out = lp_build_broadcast_scalar(&bld_int_vec4, out);
      }
      else {
         /* TODO */
         assert(0);
      }
      /* zero the size channels where the level was out of range */
      size = lp_build_andnot(&bld_int_vec4, size, out);
   }
   /* Broadcast each size channel out to the shader's int vector type. */
   for (i = 0; i < dims + (has_array ? 1 : 0); i++) {
      params->sizes_out[i] = lp_build_extract_broadcast(gallivm, bld_int_vec4.type, params->int_type,
                                                        size,
                                                        lp_build_const_int32(gallivm, i));
   }
   if (params->is_sviewinfo) {
      /* remaining channels (up to and including w) default to zero */
      for (; i < 4; i++) {
         params->sizes_out[i] = lp_build_const_vec(gallivm, params->int_type, 0.0);
      }
   }

   /*
    * if there's no explicit_lod (buffers, rects) queries requiring nr of
    * mips would be illegal.
    */
   if (params->is_sviewinfo && params->explicit_lod) {
      struct lp_build_context bld_int_scalar;
      LLVMValueRef num_levels;
      lp_build_context_init(&bld_int_scalar, gallivm, lp_type_int(32));

      if (static_state->level_zero_only) {
         num_levels = bld_int_scalar.one;
      }
      else {
         LLVMValueRef last_level;

         last_level = dynamic_state->last_level(dynamic_state, gallivm,
                                                context_ptr, texture_unit);
         /* number of levels = last_level - first_level + 1 */
         num_levels = lp_build_sub(&bld_int_scalar, last_level, first_level);
         num_levels = lp_build_add(&bld_int_scalar, num_levels, bld_int_scalar.one);
      }
      params->sizes_out[3] = lp_build_broadcast(gallivm, lp_build_vec_type(gallivm, params->int_type),
                                                num_levels);
   }
}