src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c

   1 /**************************************************************************
   2  *
   3  * Copyright 2009 VMware, Inc.
   4  * All Rights Reserved.
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a
   7  * copy of this software and associated documentation files (the
   8  * "Software"), to deal in the Software without restriction, including
   9  * without limitation the rights to use, copy, modify, merge, publish,
  10  * distribute, sub license, and/or sell copies of the Software, and to
  11  * permit persons to whom the Software is furnished to do so, subject to
  12  * the following conditions:
  13  *
  14  * The above copyright notice and this permission notice (including the
  15  * next paragraph) shall be included in all copies or substantial portions
  16  * of the Software.
  17  *
  18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  19  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
  21  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
  22  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  23  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  24  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25  *
  26  **************************************************************************/
  27
  28 /**
  29  * @file
  30  * Texture sampling -- SoA.
  31  *
  32  * @author Jose Fonseca <jfonseca@vmware.com>
  33  * @author Brian Paul <brianp@vmware.com>
  34  */
  35
  36 #include "pipe/p_defines.h"
  37 #include "pipe/p_state.h"
  38 #include "pipe/p_shader_tokens.h"
  39 #include "util/u_debug.h"
  40 #include "util/u_dump.h"
  41 #include "util/u_memory.h"
  42 #include "util/u_math.h"
  43 #include "util/u_format.h"
  44 #include "util/u_cpu_detect.h"
  45 #include "util/format_rgb9e5.h"
  46 #include "lp_bld_debug.h"
  47 #include "lp_bld_type.h"
  48 #include "lp_bld_const.h"
  49 #include "lp_bld_conv.h"
  50 #include "lp_bld_arit.h"
  51 #include "lp_bld_bitarit.h"
  52 #include "lp_bld_logic.h"
  53 #include "lp_bld_printf.h"
  54 #include "lp_bld_swizzle.h"
  55 #include "lp_bld_flow.h"
  56 #include "lp_bld_gather.h"
  57 #include "lp_bld_format.h"
  58 #include "lp_bld_sample.h"
  59 #include "lp_bld_sample_aos.h"
  60 #include "lp_bld_struct.h"
  61 #include "lp_bld_quad.h"
  62 #include "lp_bld_pack.h"
  63 #include "lp_bld_intr.h"
  64
  65
  66 /**
  67  * Generate code to fetch a texel from a texture at int coords (x, y, z).
  68  * The computation depends on whether the texture is 1D, 2D or 3D.
  69  * The result, texel, will be float vectors:
  70  *   texel[0] = red values
  71  *   texel[1] = green values
  72  *   texel[2] = blue values
  73  *   texel[3] = alpha values
  74  */
  75 static void
  76 lp_build_sample_texel_soa(struct lp_build_sample_context *bld,
  77                           LLVMValueRef width,
  78                           LLVMValueRef height,
  79                           LLVMValueRef depth,
  80                           LLVMValueRef x,
  81                           LLVMValueRef y,
  82                           LLVMValueRef z,
  83                           LLVMValueRef y_stride,
  84                           LLVMValueRef z_stride,
  85                           LLVMValueRef data_ptr,
  86                           LLVMValueRef mipoffsets,
  87                           LLVMValueRef texel_out[4])
  88 {
  89    const struct lp_static_sampler_state *static_state = bld->static_sampler_state;
  90    const unsigned dims = bld->dims;
  91    struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
  92    LLVMBuilderRef builder = bld->gallivm->builder;
  93    LLVMValueRef offset;
  94    LLVMValueRef i, j;
  95    LLVMValueRef use_border = NULL;
  96
  97    /* use_border = x < 0 || x >= width || y < 0 || y >= height */
  98    if (lp_sampler_wrap_mode_uses_border_color(static_state->wrap_s,
  99                                               static_state->min_img_filter,
 100                                               static_state->mag_img_filter)) {
 101       LLVMValueRef b1, b2;
 102       b1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, x, int_coord_bld->zero);
 103       b2 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, x, width);
 104       use_border = LLVMBuildOr(builder, b1, b2, "b1_or_b2");
 105    }
 106
 107    if (dims >= 2 &&
 108        lp_sampler_wrap_mode_uses_border_color(static_state->wrap_t,
 109                                               static_state->min_img_filter,
 110                                               static_state->mag_img_filter)) {
 111       LLVMValueRef b1, b2;
 112       b1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, y, int_coord_bld->zero);
 113       b2 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, y, height);
 114       if (use_border) {
 115          use_border = LLVMBuildOr(builder, use_border, b1, "ub_or_b1");
 116          use_border = LLVMBuildOr(builder, use_border, b2, "ub_or_b2");
 117       }
 118       else {
 119          use_border = LLVMBuildOr(builder, b1, b2, "b1_or_b2");
 120       }
 121    }
 122
 123    if (dims == 3 &&
 124        lp_sampler_wrap_mode_uses_border_color(static_state->wrap_r,
 125                                               static_state->min_img_filter,
 126                                               static_state->mag_img_filter)) {
 127       LLVMValueRef b1, b2;
 128       b1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, z, int_coord_bld->zero);
 129       b2 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, z, depth);
 130       if (use_border) {
 131          use_border = LLVMBuildOr(builder, use_border, b1, "ub_or_b1");
 132          use_border = LLVMBuildOr(builder, use_border, b2, "ub_or_b2");
 133       }
 134       else {
 135          use_border = LLVMBuildOr(builder, b1, b2, "b1_or_b2");
 136       }
 137    }
 138
 139    /* convert x,y,z coords to linear offset from start of texture, in bytes */
 140    lp_build_sample_offset(&bld->int_coord_bld,
 141                           bld->format_desc,
 142                           x, y, z, y_stride, z_stride,
 143                           &offset, &i, &j);
 144    if (mipoffsets) {
 145       offset = lp_build_add(&bld->int_coord_bld, offset, mipoffsets);
 146    }
 147
 148    if (use_border) {
 149       /* If we can sample the border color, it means that texcoords may
 150        * lie outside the bounds of the texture image.  We need to do
 151        * something to prevent reading out of bounds and causing a segfault.
 152        *
 153        * Simply AND the texture coords with !use_border.  This will cause
 154        * coords which are out of bounds to become zero.  Zero's guaranteed
 155        * to be inside the texture image.
 156        */
 157       offset = lp_build_andnot(&bld->int_coord_bld, offset, use_border);
 158    }
 159
 160    lp_build_fetch_rgba_soa(bld->gallivm,
 161                            bld->format_desc,
 162                            bld->texel_type, TRUE,
 163                            data_ptr, offset,
 164                            i, j,
 165                            bld->cache,
 166                            texel_out);
 167
 168    /*
 169     * Note: if we find an app which frequently samples the texture border
 170     * we might want to implement a true conditional here to avoid sampling
 171     * the texture whenever possible (since that's quite a bit of code).
 172     * Ex:
 173     *   if (use_border) {
 174     *      texel = border_color;
 175     *   }
 176     *   else {
 177     *      texel = sample_texture(coord);
 178     *   }
 179     * As it is now, we always sample the texture, then selectively replace
 180     * the texel color results with the border color.
 181     */
 182
 183    if (use_border) {
 184       /* select texel color or border color depending on use_border. */
 185       const struct util_format_description *format_desc = bld->format_desc;
 186       int chan;
 187       struct lp_type border_type = bld->texel_type;
 188       border_type.length = 4;
 189       /*
 190        * Only replace channels which are actually present. The others should
 191        * get optimized away eventually by sampler_view swizzle anyway but it's
 192        * easier too.
 193        */
 194       for (chan = 0; chan < 4; chan++) {
 195          unsigned chan_s;
 196          /* reverse-map channel... */
 197          for (chan_s = 0; chan_s < 4; chan_s++) {
 198             if (chan_s == format_desc->swizzle[chan]) {
 199                break;
 200             }
 201          }
 202          if (chan_s <= 3) {
 203             /* use the already clamped color */
 204             LLVMValueRef idx = lp_build_const_int32(bld->gallivm, chan);
 205             LLVMValueRef border_chan;
 206
 207             border_chan = lp_build_extract_broadcast(bld->gallivm,
 208                                                      border_type,
 209                                                      bld->texel_type,
 210                                                      bld->border_color_clamped,
 211                                                      idx);
 212             texel_out[chan] = lp_build_select(&bld->texel_bld, use_border,
 213                                               border_chan, texel_out[chan]);
 214          }
 215       }
 216    }
 217 }
 218
 219
 220 /**
 221  * Helper to compute the mirror function for the PIPE_WRAP_MIRROR_REPEAT mode.
 222  * (Note that with pot sizes could do this much more easily post-scale
 223  * with some bit arithmetic.)
 224  */
 225 static LLVMValueRef
 226 lp_build_coord_mirror(struct lp_build_sample_context *bld,
 227                       LLVMValueRef coord, boolean posOnly)
 228 {
 229    struct lp_build_context *coord_bld = &bld->coord_bld;
 230    LLVMValueRef fract;
 231    LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5);
 232
 233    /*
 234     * We can just use 2*(x - round(0.5*x)) to do all the mirroring,
 235     * it all works out. (The result is in range [-1, 1.0], negative if
 236     * the coord is in the "odd" section, otherwise positive.)
 237     */
 238
 239    coord = lp_build_mul(coord_bld, coord, half);
 240    fract = lp_build_round(coord_bld, coord);
 241    fract = lp_build_sub(coord_bld, coord, fract);
 242    coord = lp_build_add(coord_bld, fract, fract);
 243
 244    if (posOnly) {
 245       /*
 246        * Theoretically it's not quite 100% accurate because the spec says
 247        * that ultimately a scaled coord of -x.0 should map to int coord
 248        * -x + 1 with mirroring, not -x (this does not matter for bilinear
 249        * filtering).
 250        */
 251       coord = lp_build_abs(coord_bld, coord);
 252       /* kill off NaNs */
 253       /* XXX: not safe without arch rounding, fract can be anything. */
 254       coord = lp_build_max_ext(coord_bld, coord, coord_bld->zero,
 255                                GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
 256    }
 257
 258    return coord;
 259 }
 260
 261
 262 /**
 263  * Helper to compute the first coord and the weight for
 264  * linear wrap repeat npot textures
 265  */
 266 void
 267 lp_build_coord_repeat_npot_linear(struct lp_build_sample_context *bld,
 268                                   LLVMValueRef coord_f,
 269                                   LLVMValueRef length_i,
 270                                   LLVMValueRef length_f,
 271                                   LLVMValueRef *coord0_i,
 272                                   LLVMValueRef *weight_f)
 273 {
 274    struct lp_build_context *coord_bld = &bld->coord_bld;
 275    struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
 276    LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5);
 277    LLVMValueRef length_minus_one = lp_build_sub(int_coord_bld, length_i,
 278                                                 int_coord_bld->one);
 279    LLVMValueRef mask;
 280    /* wrap with normalized floats is just fract */
 281    coord_f = lp_build_fract(coord_bld, coord_f);
 282    /* mul by size and subtract 0.5 */
 283    coord_f = lp_build_mul(coord_bld, coord_f, length_f);
 284    coord_f = lp_build_sub(coord_bld, coord_f, half);
 285    /*
 286     * we avoided the 0.5/length division before the repeat wrap,
 287     * now need to fix up edge cases with selects
 288     */
 289    /*
 290     * Note we do a float (unordered) compare so we can eliminate NaNs.
 291     * (Otherwise would need fract_safe above).
 292     */
 293    mask = lp_build_compare(coord_bld->gallivm, coord_bld->type,
 294                            PIPE_FUNC_LESS, coord_f, coord_bld->zero);
 295
 296    /* convert to int, compute lerp weight */
 297    lp_build_ifloor_fract(coord_bld, coord_f, coord0_i, weight_f);
 298    *coord0_i = lp_build_select(int_coord_bld, mask, length_minus_one, *coord0_i);
 299 }
 300
 301
 302 /**
 303  * Build LLVM code for texture wrap mode for linear filtering.
      *
      * For the requested wrap mode this emits the code that turns one texture
      * coordinate into the two neighbouring integer texel coordinates and the
      * interpolation weight between them.
      *
      * \param is_gather  if TRUE, take the gather-specific paths that exist for
      *                   CLAMP_TO_EDGE, MIRROR_REPEAT and MIRROR_CLAMP_TO_EDGE
      * \param coord  the incoming texcoord (float vector); REPEAT and
      *               MIRROR_REPEAT always scale it by length_f, the other modes
      *               only when the sampler state says coords are normalized
      * \param length  the texture size along one dimension, as int vector
      * \param length_f  the texture size along one dimension, as float vector
      * \param offset  texel offset along one dimension (as int vector), or NULL
      * \param is_pot  if TRUE, length is a power of two (only consulted for
      *                PIPE_TEX_WRAP_REPEAT)
      * \param wrap_mode  one of PIPE_TEX_WRAP_x; any other value hits assert(0)
      *                   and NULL is stored to all three outputs
 304  * \param x0_out  returns first integer texcoord
 305  * \param x1_out  returns second integer texcoord
 306  * \param weight_out  returns linear interpolation weight (coord_bld->undef
      *                    on the gather-specific paths)
 307  */
 308 static void
 309 lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
 310                             boolean is_gather,
 311                             LLVMValueRef coord,
 312                             LLVMValueRef length,
 313                             LLVMValueRef length_f,
 314                             LLVMValueRef offset,
 315                             boolean is_pot,
 316                             unsigned wrap_mode,
 317                             LLVMValueRef *x0_out,
 318                             LLVMValueRef *x1_out,
 319                             LLVMValueRef *weight_out)
 320 {
 321    struct lp_build_context *coord_bld = &bld->coord_bld;
 322    struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
 323    LLVMBuilderRef builder = bld->gallivm->builder;
 324    LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5);
        /* computed up front for every mode, although not all of them use it */
 325    LLVMValueRef length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);
 326    LLVMValueRef coord0, coord1, weight;
 327
 328    switch(wrap_mode) {
 329    case PIPE_TEX_WRAP_REPEAT:
 330       if (is_pot) {
 331          /* mul by size and subtract 0.5 */
 332          coord = lp_build_mul(coord_bld, coord, length_f);
 333          coord = lp_build_sub(coord_bld, coord, half);
 334          if (offset) {
                 /* coord is already scaled, so the texel offset is added as is */
 335             offset = lp_build_int_to_float(coord_bld, offset);
 336             coord = lp_build_add(coord_bld, coord, offset);
 337          }
 338          /* convert to int, compute lerp weight */
 339          lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
 340          coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
 341          /* repeat wrap: with a pot length, AND with length - 1 wraps */
 342          coord0 = LLVMBuildAnd(builder, coord0, length_minus_one, "");
 343          coord1 = LLVMBuildAnd(builder, coord1, length_minus_one, "");
 344       }
 345       else {
 346          LLVMValueRef mask;
 347          if (offset) {
                 /*
                  * coord is still normalized here, so divide the texel offset
                  * by the length before adding it.
                  */
 348             offset = lp_build_int_to_float(coord_bld, offset);
 349             offset = lp_build_div(coord_bld, offset, length_f);
 350             coord = lp_build_add(coord_bld, coord, offset);
 351          }
 352          lp_build_coord_repeat_npot_linear(bld, coord,
 353                                            length, length_f,
 354                                            &coord0, &weight);
              /*
               * coord1 = coord0 + 1, except that it becomes 0 where coord0 is
               * the last texel: the mask is AND'ed in, presumably all-zeros
               * for lanes where the NOTEQUAL compare fails -- confirm against
               * lp_build_compare.
               */
 355          mask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type,
 356                                  PIPE_FUNC_NOTEQUAL, coord0, length_minus_one);
 357          coord1 = LLVMBuildAnd(builder,
 358                                lp_build_add(int_coord_bld, coord0, int_coord_bld->one),
 359                                mask, "");
 360       }
 361       break;
 362
 363    case PIPE_TEX_WRAP_CLAMP:
 364       if (bld->static_sampler_state->normalized_coords) {
 365          /* scale coord to length */
 366          coord = lp_build_mul(coord_bld, coord, length_f);
 367       }
 368       if (offset) {
 369          offset = lp_build_int_to_float(coord_bld, offset);
 370          coord = lp_build_add(coord_bld, coord, offset);
 371       }
 372
 373       /*
 374        * clamp to [0, length]
 375        *
 376        * Unlike some other wrap modes, this should be correct for gather
 377        * too. GL_CLAMP explicitly does this clamp on the coord prior to
 378        * actual wrapping (which is per sample).
 379        */
 380       coord = lp_build_clamp(coord_bld, coord, coord_bld->zero, length_f);
 381
 382       coord = lp_build_sub(coord_bld, coord, half);
 383
 384       /* convert to int, compute lerp weight */
 385       lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
 386       coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
 387       break;
 388
 389    case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
 390       {
              /*
               * Copy of coord_bld with the sign flag cleared.  It is only
               * handed to lp_build_ifloor_fract() below, after coord has been
               * clamped to >= 0.
               * NOTE(review): presumably this allows a cheaper floor for
               * non-negative input -- confirm in lp_bld_arit.
               */
 391          struct lp_build_context abs_coord_bld = bld->coord_bld;
 392          abs_coord_bld.type.sign = FALSE;
 393
 394          if (bld->static_sampler_state->normalized_coords) {
 395             /* mul by tex size */
 396             coord = lp_build_mul(coord_bld, coord, length_f);
 397          }
 398          if (offset) {
 399             offset = lp_build_int_to_float(coord_bld, offset);
 400             coord = lp_build_add(coord_bld, coord, offset);
 401          }
 402
 403          /* clamp to length max */
 404          coord = lp_build_min_ext(coord_bld, coord, length_f,
 405                                   GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
 406          if (!is_gather) {
 407             /* subtract 0.5 */
 408             coord = lp_build_sub(coord_bld, coord, half);
 409             /* clamp to [0, length - 0.5] */
 410             coord = lp_build_max(coord_bld, coord, coord_bld->zero);
 411             /* convert to int, compute lerp weight */
 412             lp_build_ifloor_fract(&abs_coord_bld, coord, &coord0, &weight);
 413             coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
 414          } else {
 415             /*
 416              * The non-gather path will end up with coords 0, 1 if coord was
 417              * smaller than 0.5 (with corresponding weight 0.0 so it doesn't
 418              * really matter what the second coord is). But for gather, we
 419              * really need to end up with coords 0, 0.
 420              */
 421             coord = lp_build_max(coord_bld, coord, coord_bld->zero);
 422             coord0 = lp_build_sub(coord_bld, coord, half);
 423             coord1 = lp_build_add(coord_bld, coord, half);
 424             /* Value ranges: [-0.5, length_f - 0.5] and [0.5, length_f + 0.5] */
 425             coord0 = lp_build_itrunc(coord_bld, coord0);
 426             coord1 = lp_build_itrunc(coord_bld, coord1);
                 /* gather does no interpolation, so no weight is produced */
 427             weight = coord_bld->undef;
 428          }
 429          /* coord1 = min(coord1, length-1) */
 430          coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
 431          break;
 432       }
 433
 434    case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
 435       if (bld->static_sampler_state->normalized_coords) {
 436          /* scale coord to length */
 437          coord = lp_build_mul(coord_bld, coord, length_f);
 438       }
 439       if (offset) {
 440          offset = lp_build_int_to_float(coord_bld, offset);
 441          coord = lp_build_add(coord_bld, coord, offset);
 442       }
 443       /*
 444        * We don't need any clamp. Technically, for very large (pos or neg)
 445        * (or infinite) values, clamp against [-length, length] would be
 446        * correct, but we don't need to guarantee any specific
 447        * result for such coords (the ifloor will be undefined, but for modes
 448        * requiring border all resulting coords are safe).
 449        */
 450       coord = lp_build_sub(coord_bld, coord, half);
 451       /* convert to int, compute lerp weight */
 452       lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
 453       coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
 454       break;
 455
 456    case PIPE_TEX_WRAP_MIRROR_REPEAT:
 457       if (offset) {
              /* coord is still normalized: bring the texel offset to that scale */
 458          offset = lp_build_int_to_float(coord_bld, offset);
 459          offset = lp_build_div(coord_bld, offset, length_f);
 460          coord = lp_build_add(coord_bld, coord, offset);
 461       }
 462       if (!is_gather) {
 463          /* compute mirror function */
 464          coord = lp_build_coord_mirror(bld, coord, TRUE);
 465
 466          /* scale coord to length */
 467          coord = lp_build_mul(coord_bld, coord, length_f);
 468          coord = lp_build_sub(coord_bld, coord, half);
 469
 470          /* convert to int, compute lerp weight */
 471          lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
 472          coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
 473
 474          /* coord0 = max(coord0, 0) */
 475          coord0 = lp_build_max(int_coord_bld, coord0, int_coord_bld->zero);
 476          /* coord1 = min(coord1, length-1) */
 477          coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
 478       } else {
 479          /*
 480           * This is pretty reasonable in the end; all the tests care
 481           * about is nasty edge cases (scaled coords x.5, so the individual
 482           * coords are actually integers, which is REALLY tricky to get right
 483           * due to this working differently both for negative numbers as well
 484           * as for even/odd cases). But with enough magic it's not too complex
 485           * after all.
 486           * Maybe should try a bit arithmetic one though for POT textures...
 487           */
 488          LLVMValueRef isNeg;
 489          /*
 490           * Wrapping just once still works, even though it means we can
 491           * get "wrong" sign due to performing mirror in the middle of the
 492           * two coords (because this can only happen very near the odd/even
 493           * edges, so both coords will actually end up as 0 or length - 1
 494           * in the end).
 495           * For GL4 gather with per-sample offsets we'd need to do the
 496           * mirroring per coord too.
 497           */
 498          coord = lp_build_coord_mirror(bld, coord, FALSE);
 499          coord = lp_build_mul(coord_bld, coord, length_f);
 500
 501          /*
 502           * NaNs should be safe here, we'll do away with them with
 503           * the ones' complement plus min.
 504           */
 505          coord0 = lp_build_sub(coord_bld, coord, half);
 506          coord0 = lp_build_ifloor(coord_bld, coord0);
 507          coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
 508          /* ones complement for neg numbers (mirror(negX) = X - 1)  */
 509          isNeg = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS,
 510                               coord0, int_coord_bld->zero);
 511          coord0 = lp_build_xor(int_coord_bld, coord0, isNeg);
 512          isNeg = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS,
 513                               coord1, int_coord_bld->zero);
 514          coord1 = lp_build_xor(int_coord_bld, coord1, isNeg);
 515          coord0 = lp_build_min(int_coord_bld, coord0, length_minus_one);
 516          coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
 517
              /* gather does no interpolation, so no weight is produced */
 518          weight = coord_bld->undef;
 519       }
 520       break;
 521
 522    case PIPE_TEX_WRAP_MIRROR_CLAMP:
 523       if (bld->static_sampler_state->normalized_coords) {
 524          /* scale coord to length */
 525          coord = lp_build_mul(coord_bld, coord, length_f);
 526       }
 527       if (offset) {
 528          offset = lp_build_int_to_float(coord_bld, offset);
 529          coord = lp_build_add(coord_bld, coord, offset);
 530       }
 531       /*
 532        * XXX: probably not correct for gather, albeit I'm not
 533        * entirely sure as it's poorly specified. The wrapping looks
 534        * correct according to the spec which is against gl 1.2.1,
 535        * however negative values will be swapped - gl re-specified
 536        * wrapping with newer versions (no more pre-clamp except with
 537        * GL_CLAMP).
 538        */
 539       coord = lp_build_abs(coord_bld, coord);
 540
 541       /* clamp to [0, length] */
 542       coord = lp_build_min_ext(coord_bld, coord, length_f,
 543                                GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
 544
 545       coord = lp_build_sub(coord_bld, coord, half);
 546
 547       /* convert to int, compute lerp weight */
 548       lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
 549       coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
 550       break;
 551
 552    case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
 553       {
              /*
               * Unsigned copy of coord_bld, used the same way as in the
               * CLAMP_TO_EDGE case: only for ifloor_fract on a coord that was
               * already clamped to >= 0.
               */
 554          struct lp_build_context abs_coord_bld = bld->coord_bld;
 555          abs_coord_bld.type.sign = FALSE;
 556
 557          if (bld->static_sampler_state->normalized_coords) {
 558             /* scale coord to length */
 559             coord = lp_build_mul(coord_bld, coord, length_f);
 560          }
 561          if (offset) {
 562             offset = lp_build_int_to_float(coord_bld, offset);
 563             coord = lp_build_add(coord_bld, coord, offset);
 564          }
 565          if (!is_gather) {
 566             coord = lp_build_abs(coord_bld, coord);
 567
 568             /* clamp to length max */
 569             coord = lp_build_min_ext(coord_bld, coord, length_f,
 570                                      GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
 571             /* subtract 0.5 */
 572             coord = lp_build_sub(coord_bld, coord, half);
 573             /* clamp to [0, length - 0.5] */
 574             coord = lp_build_max(coord_bld, coord, coord_bld->zero);
 575
 576             /* convert to int, compute lerp weight */
 577             lp_build_ifloor_fract(&abs_coord_bld, coord, &coord0, &weight);
 578             coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
 579             /* coord1 = min(coord1, length-1) */
 580             coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
 581          } else {
 582             /*
 583              * The non-gather path will swap coord0/1 if coord was negative,
 584              * which is ok for filtering since the filter weight matches
 585              * accordingly. Also, if coord is close to zero, coord0/1 will
 586              * be 0 and 1, instead of 0 and 0 (again ok due to filter
 587              * weight being 0.0). Both issues need to be fixed for gather.
 588              */
 589             LLVMValueRef isNeg;
 590
 591             /*
 592              * Actually wanted to cheat here and use:
 593              * coord1 = lp_build_iround(coord_bld, coord);
 594              * but it's not good enough for some tests (even piglit
 595              * textureGather is set up in a way so the coords are always
 596              * .5, that is right at the crossover points).
 597              * So do ordinary sub/floor, then do ones' complement
 598              * for negative numbers.
 599              * (Note can't just do sub|add/abs/itrunc per coord either -
 600              * because the spec demands that mirror(3.0) = 3 but
 601              * mirror(-3.0) = 2.)
 602              */
 603             coord = lp_build_sub(coord_bld, coord, half);
 604             coord0 = lp_build_ifloor(coord_bld, coord);
 605             coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
 606             isNeg = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, coord0,
 607                                  int_coord_bld->zero);
 608             coord0 = lp_build_xor(int_coord_bld, isNeg, coord0);
 609             coord0 = lp_build_min(int_coord_bld, coord0, length_minus_one);
 610
 611             isNeg = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, coord1,
 612                                  int_coord_bld->zero);
 613             coord1 = lp_build_xor(int_coord_bld, isNeg, coord1);
 614             coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
 615
                 /* gather does no interpolation, so no weight is produced */
 616             weight = coord_bld->undef;
 617          }
 618       }
 619       break;
 620
 621    case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
 622       {
 623          if (bld->static_sampler_state->normalized_coords) {
 624             /* scale coord to length */
 625             coord = lp_build_mul(coord_bld, coord, length_f);
 626          }
 627          if (offset) {
 628             offset = lp_build_int_to_float(coord_bld, offset);
 629             coord = lp_build_add(coord_bld, coord, offset);
 630          }
 631          /*
 632           * XXX: probably not correct for gather due to swapped
 633           * order if coord is negative (same rationale as for
 634           * MIRROR_CLAMP).
 635           */
 636          coord = lp_build_abs(coord_bld, coord);
 637
 638          /*
 639           * We don't need any clamp. Technically, for very large
 640           * (or infinite) values, clamp against length would be
 641           * correct, but we don't need to guarantee any specific
 642           * result for such coords (the ifloor will be undefined, but
 643           * for modes requiring border all resulting coords are safe).
 644           */
 645          coord = lp_build_sub(coord_bld, coord, half);
 646
 647          /* convert to int, compute lerp weight */
 648          lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
 649          coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
 650       }
 651       break;
 652
 653    default:
           /* unknown wrap mode: assert, and hand back NULLs if that is compiled out */
 654       assert(0);
 655       coord0 = NULL;
 656       coord1 = NULL;
 657       weight = NULL;
 658    }
 659
 660    *x0_out = coord0;
 661    *x1_out = coord1;
 662    *weight_out = weight;
 663 }
 664
 665
 666 /**
 667  * Build LLVM code for texture wrap mode for nearest filtering.
 668  * \param coord  the incoming texcoord (nominally in [0,1])
 669  * \param length  the texture size along one dimension, as int vector
 670  * \param length_f  the texture size along one dimension, as float vector
 671  * \param offset  texel offset along one dimension (as int vector)
 672  * \param is_pot  if TRUE, length is a power of two
 673  * \param wrap_mode  one of PIPE_TEX_WRAP_x
 674  */
 675 static LLVMValueRef
 676 lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld,
 677                              LLVMValueRef coord,
 678                              LLVMValueRef length,
 679                              LLVMValueRef length_f,
 680                              LLVMValueRef offset,
 681                              boolean is_pot,
 682                              unsigned wrap_mode)
 683 {
 684    struct lp_build_context *coord_bld = &bld->coord_bld;
 685    struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
 686    LLVMBuilderRef builder = bld->gallivm->builder;
 687    LLVMValueRef length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);
 688    LLVMValueRef icoord;
 689
 690    switch(wrap_mode) {
 691    case PIPE_TEX_WRAP_REPEAT:
 692       if (is_pot) {
 693          coord = lp_build_mul(coord_bld, coord, length_f);
 694          icoord = lp_build_ifloor(coord_bld, coord);
 695          if (offset) {
 696             icoord = lp_build_add(int_coord_bld, icoord, offset);
 697          }
 698          icoord = LLVMBuildAnd(builder, icoord, length_minus_one, "");
 699       }
 700       else {
 701           if (offset) {
 702              offset = lp_build_int_to_float(coord_bld, offset);
 703              offset = lp_build_div(coord_bld, offset, length_f);
 704              coord = lp_build_add(coord_bld, coord, offset);
 705           }
 706           /* take fraction, unnormalize */
 707           coord = lp_build_fract_safe(coord_bld, coord);
 708           coord = lp_build_mul(coord_bld, coord, length_f);
 709           icoord = lp_build_itrunc(coord_bld, coord);
 710       }
 711       break;
 712
 713    case PIPE_TEX_WRAP_CLAMP:
 714    case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
 715       if (bld->static_sampler_state->normalized_coords) {
 716          /* scale coord to length */
 717          coord = lp_build_mul(coord_bld, coord, length_f);
 718       }
 719
 720       if (offset) {
 721          offset = lp_build_int_to_float(coord_bld, offset);
 722          coord = lp_build_add(coord_bld, coord, offset);
 723       }
 724       /* floor */
 725       /* use itrunc instead since we clamp to 0 anyway */
 726       icoord = lp_build_itrunc(coord_bld, coord);
 727
 728       /* clamp to [0, length - 1]. */
 729       icoord = lp_build_clamp(int_coord_bld, icoord, int_coord_bld->zero,
 730                               length_minus_one);
 731       break;
 732
 733    case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
 734       if (bld->static_sampler_state->normalized_coords) {
 735          /* scale coord to length */
 736          coord = lp_build_mul(coord_bld, coord, length_f);
 737       }
 738       /* no clamp necessary, border masking will handle this */
 739       icoord = lp_build_ifloor(coord_bld, coord);
 740       if (offset) {
 741          icoord = lp_build_add(int_coord_bld, icoord, offset);
 742       }
 743       break;
 744
 745    case PIPE_TEX_WRAP_MIRROR_REPEAT:
 746       if (offset) {
 747          offset = lp_build_int_to_float(coord_bld, offset);
 748          offset = lp_build_div(coord_bld, offset, length_f);
 749          coord = lp_build_add(coord_bld, coord, offset);
 750       }
 751       /* compute mirror function */
 752       coord = lp_build_coord_mirror(bld, coord, TRUE);
 753
 754       /* scale coord to length */
 755       assert(bld->static_sampler_state->normalized_coords);
 756       coord = lp_build_mul(coord_bld, coord, length_f);
 757
 758       /* itrunc == ifloor here */
 759       icoord = lp_build_itrunc(coord_bld, coord);
 760
 761       /* clamp to [0, length - 1] */
 762       icoord = lp_build_min(int_coord_bld, icoord, length_minus_one);
 763       break;
 764
 765    case PIPE_TEX_WRAP_MIRROR_CLAMP:
 766    case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
 767       if (bld->static_sampler_state->normalized_coords) {
 768          /* scale coord to length */
 769          coord = lp_build_mul(coord_bld, coord, length_f);
 770       }
 771       if (offset) {
 772          offset = lp_build_int_to_float(coord_bld, offset);
 773          coord = lp_build_add(coord_bld, coord, offset);
 774       }
 775       coord = lp_build_abs(coord_bld, coord);
 776
 777       /* itrunc == ifloor here */
 778       icoord = lp_build_itrunc(coord_bld, coord);
 779       /*
 780        * Use unsigned min due to possible undef values (NaNs, overflow)
 781        */
 782       {
 783          struct lp_build_context abs_coord_bld = *int_coord_bld;
 784          abs_coord_bld.type.sign = FALSE;
 785          /* clamp to [0, length - 1] */
 786          icoord = lp_build_min(&abs_coord_bld, icoord, length_minus_one);
 787       }
 788       break;
 789
 790    case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
 791       if (bld->static_sampler_state->normalized_coords) {
 792          /* scale coord to length */
 793          coord = lp_build_mul(coord_bld, coord, length_f);
 794       }
 795       if (offset) {
 796          offset = lp_build_int_to_float(coord_bld, offset);
 797          coord = lp_build_add(coord_bld, coord, offset);
 798       }
 799       coord = lp_build_abs(coord_bld, coord);
 800
 801       /* itrunc == ifloor here */
 802       icoord = lp_build_itrunc(coord_bld, coord);
 803       break;
 804
 805    default:
 806       assert(0);
 807       icoord = NULL;
 808    }
 809
 810    return icoord;
 811 }
 812
 813
 814 /**
 815  * Do shadow test/comparison.
 816  * \param p shadow ref value
 817  * \param texel  the texel to compare against
 818  */
 819 static LLVMValueRef
 820 lp_build_sample_comparefunc(struct lp_build_sample_context *bld,
 821                             LLVMValueRef p,
 822                             LLVMValueRef texel)
 823 {
 824    struct lp_build_context *texel_bld = &bld->texel_bld;
 825    LLVMValueRef res;
 826
 827    if (0) {
 828       //lp_build_print_value(bld->gallivm, "shadow cmp coord", p);
 829       lp_build_print_value(bld->gallivm, "shadow cmp texel", texel);
 830    }
 831
 832    /* result = (p FUNC texel) ? 1 : 0 */
 833    /*
 834     * honor d3d10 floating point rules here, which state that comparisons
 835     * are ordered except NOT_EQUAL which is unordered.
 836     */
 837    if (bld->static_sampler_state->compare_func != PIPE_FUNC_NOTEQUAL) {
 838       res = lp_build_cmp_ordered(texel_bld, bld->static_sampler_state->compare_func,
 839                                  p, texel);
 840    }
 841    else {
 842       res = lp_build_cmp(texel_bld, bld->static_sampler_state->compare_func,
 843                          p, texel);
 844    }
 845    return res;
 846 }
 847
 848
 849 /**
 850  * Generate code to sample a mipmap level with nearest filtering.
 851  * If sampling a cube texture, r = cube face in [0,5].
 852  */
 853 static void
 854 lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
 855                               LLVMValueRef size,
 856                               LLVMValueRef row_stride_vec,
 857                               LLVMValueRef img_stride_vec,
 858                               LLVMValueRef data_ptr,
 859                               LLVMValueRef mipoffsets,
 860                               LLVMValueRef *coords,
 861                               const LLVMValueRef *offsets,
 862                               LLVMValueRef colors_out[4])
 863 {
 864    const unsigned dims = bld->dims;
 865    LLVMValueRef width_vec;
 866    LLVMValueRef height_vec;
 867    LLVMValueRef depth_vec;
 868    LLVMValueRef flt_size;
 869    LLVMValueRef flt_width_vec;
 870    LLVMValueRef flt_height_vec;
 871    LLVMValueRef flt_depth_vec;
 872    LLVMValueRef x, y = NULL, z = NULL;
 873
 874    lp_build_extract_image_sizes(bld,
 875                                 &bld->int_size_bld,
 876                                 bld->int_coord_type,
 877                                 size,
 878                                 &width_vec, &height_vec, &depth_vec);
 879
 880    flt_size = lp_build_int_to_float(&bld->float_size_bld, size);
 881
 882    lp_build_extract_image_sizes(bld,
 883                                 &bld->float_size_bld,
 884                                 bld->coord_type,
 885                                 flt_size,
 886                                 &flt_width_vec, &flt_height_vec, &flt_depth_vec);
 887
 888    /*
 889     * Compute integer texcoords.
 890     */
 891    x = lp_build_sample_wrap_nearest(bld, coords[0], width_vec,
 892                                     flt_width_vec, offsets[0],
 893                                     bld->static_texture_state->pot_width,
 894                                     bld->static_sampler_state->wrap_s);
 895    lp_build_name(x, "tex.x.wrapped");
 896
 897    if (dims >= 2) {
 898       y = lp_build_sample_wrap_nearest(bld, coords[1], height_vec,
 899                                        flt_height_vec, offsets[1],
 900                                        bld->static_texture_state->pot_height,
 901                                        bld->static_sampler_state->wrap_t);
 902       lp_build_name(y, "tex.y.wrapped");
 903
 904       if (dims == 3) {
 905          z = lp_build_sample_wrap_nearest(bld, coords[2], depth_vec,
 906                                           flt_depth_vec, offsets[2],
 907                                           bld->static_texture_state->pot_depth,
 908                                           bld->static_sampler_state->wrap_r);
 909          lp_build_name(z, "tex.z.wrapped");
 910       }
 911    }
 912    if (has_layer_coord(bld->static_texture_state->target)) {
 913       if (bld->static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) {
 914          /* add cube layer to face */
 915          z = lp_build_add(&bld->int_coord_bld, coords[2], coords[3]);
 916       }
 917       else {
 918          z = coords[2];
 919       }
 920       lp_build_name(z, "tex.z.layer");
 921    }
 922
 923    /*
 924     * Get texture colors.
 925     */
 926    lp_build_sample_texel_soa(bld,
 927                              width_vec, height_vec, depth_vec,
 928                              x, y, z,
 929                              row_stride_vec, img_stride_vec,
 930                              data_ptr, mipoffsets, colors_out);
 931
 932    if (bld->static_sampler_state->compare_mode != PIPE_TEX_COMPARE_NONE) {
 933       LLVMValueRef cmpval;
 934       cmpval = lp_build_sample_comparefunc(bld, coords[4], colors_out[0]);
 935       /* this is really just a AND 1.0, cmpval but llvm is clever enough */
 936       colors_out[0] = lp_build_select(&bld->texel_bld, cmpval,
 937                                       bld->texel_bld.one, bld->texel_bld.zero);
 938       colors_out[1] = colors_out[2] = colors_out[3] = colors_out[0];
 939    }
 940
 941 }
 942
 943
 944 /**
 945  * Like a lerp, but inputs are 0/~0 masks, so can simplify slightly.
 946  */
 947 static LLVMValueRef
 948 lp_build_masklerp(struct lp_build_context *bld,
 949                  LLVMValueRef weight,
 950                  LLVMValueRef mask0,
 951                  LLVMValueRef mask1)
 952 {
 953    struct gallivm_state *gallivm = bld->gallivm;
 954    LLVMBuilderRef builder = gallivm->builder;
 955    LLVMValueRef weight2;
 956
 957    weight2 = lp_build_sub(bld, bld->one, weight);
 958    weight = LLVMBuildBitCast(builder, weight,
 959                               lp_build_int_vec_type(gallivm, bld->type), "");
 960    weight2 = LLVMBuildBitCast(builder, weight2,
 961                               lp_build_int_vec_type(gallivm, bld->type), "");
 962    weight = LLVMBuildAnd(builder, weight, mask1, "");
 963    weight2 = LLVMBuildAnd(builder, weight2, mask0, "");
 964    weight = LLVMBuildBitCast(builder, weight, bld->vec_type, "");
 965    weight2 = LLVMBuildBitCast(builder, weight2, bld->vec_type, "");
 966    return lp_build_add(bld, weight, weight2);
 967 }
 968
 969 /**
 970  * Like a 2d lerp, but inputs are 0/~0 masks, so can simplify slightly.
 971  */
 972 static LLVMValueRef
 973 lp_build_masklerp2d(struct lp_build_context *bld,
 974                     LLVMValueRef weight0,
 975                     LLVMValueRef weight1,
 976                     LLVMValueRef mask00,
 977                     LLVMValueRef mask01,
 978                     LLVMValueRef mask10,
 979                     LLVMValueRef mask11)
 980 {
 981    LLVMValueRef val0 = lp_build_masklerp(bld, weight0, mask00, mask01);
 982    LLVMValueRef val1 = lp_build_masklerp(bld, weight0, mask10, mask11);
 983    return lp_build_lerp(bld, weight1, val0, val1, 0);
 984 }
 985
/*
 * Compile-time switch for the accurate cube corner handling in
 * lp_build_sample_image_linear(); it only takes effect when seamless
 * cube map filtering is active there. Set to 0 to disable.
 * This is a bit excessive code for something OpenGL just recommends
 * but does not require.
 */
#define ACCURATE_CUBE_CORNERS 1
 991
 992 /**
 993  * Generate code to sample a mipmap level with linear filtering.
 994  * If sampling a cube texture, r = cube face in [0,5].
 995  * If linear_mask is present, only pixels having their mask set
 996  * will receive linear filtering, the rest will use nearest.
 997  */
 998 static void
 999 lp_build_sample_image_linear(struct lp_build_sample_context *bld,
1000                              boolean is_gather,
1001                              LLVMValueRef size,
1002                              LLVMValueRef linear_mask,
1003                              LLVMValueRef row_stride_vec,
1004                              LLVMValueRef img_stride_vec,
1005                              LLVMValueRef data_ptr,
1006                              LLVMValueRef mipoffsets,
1007                              LLVMValueRef *coords,
1008                              const LLVMValueRef *offsets,
1009                              LLVMValueRef colors_out[4])
1010 {
1011    LLVMBuilderRef builder = bld->gallivm->builder;
1012    struct lp_build_context *ivec_bld = &bld->int_coord_bld;
1013    struct lp_build_context *coord_bld = &bld->coord_bld;
1014    struct lp_build_context *texel_bld = &bld->texel_bld;
1015    const unsigned dims = bld->dims;
1016    LLVMValueRef width_vec;
1017    LLVMValueRef height_vec;
1018    LLVMValueRef depth_vec;
1019    LLVMValueRef flt_size;
1020    LLVMValueRef flt_width_vec;
1021    LLVMValueRef flt_height_vec;
1022    LLVMValueRef flt_depth_vec;
1023    LLVMValueRef fall_off[4], have_corners;
1024    LLVMValueRef z1 = NULL;
1025    LLVMValueRef z00 = NULL, z01 = NULL, z10 = NULL, z11 = NULL;
1026    LLVMValueRef x00 = NULL, x01 = NULL, x10 = NULL, x11 = NULL;
1027    LLVMValueRef y00 = NULL, y01 = NULL, y10 = NULL, y11 = NULL;
1028    LLVMValueRef s_fpart, t_fpart = NULL, r_fpart = NULL;
1029    LLVMValueRef xs[4], ys[4], zs[4];
1030    LLVMValueRef neighbors[2][2][4];
1031    int chan, texel_index;
1032    boolean seamless_cube_filter, accurate_cube_corners;
1033    unsigned chan_swiz = bld->static_texture_state->swizzle_r;
1034
1035    seamless_cube_filter = (bld->static_texture_state->target == PIPE_TEXTURE_CUBE ||
1036                            bld->static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) &&
1037                           bld->static_sampler_state->seamless_cube_map;
1038
1039    accurate_cube_corners = ACCURATE_CUBE_CORNERS && seamless_cube_filter;
1040
1041    lp_build_extract_image_sizes(bld,
1042                                 &bld->int_size_bld,
1043                                 bld->int_coord_type,
1044                                 size,
1045                                 &width_vec, &height_vec, &depth_vec);
1046
1047    flt_size = lp_build_int_to_float(&bld->float_size_bld, size);
1048
1049    lp_build_extract_image_sizes(bld,
1050                                 &bld->float_size_bld,
1051                                 bld->coord_type,
1052                                 flt_size,
1053                                 &flt_width_vec, &flt_height_vec, &flt_depth_vec);
1054
1055    /*
1056     * Compute integer texcoords.
1057     */
1058
1059    if (!seamless_cube_filter) {
1060       lp_build_sample_wrap_linear(bld, is_gather, coords[0], width_vec,
1061                                   flt_width_vec, offsets[0],
1062                                   bld->static_texture_state->pot_width,
1063                                   bld->static_sampler_state->wrap_s,
1064                                   &x00, &x01, &s_fpart);
1065       lp_build_name(x00, "tex.x0.wrapped");
1066       lp_build_name(x01, "tex.x1.wrapped");
1067       x10 = x00;
1068       x11 = x01;
1069
1070       if (dims >= 2) {
1071          lp_build_sample_wrap_linear(bld, is_gather, coords[1], height_vec,
1072                                      flt_height_vec, offsets[1],
1073                                      bld->static_texture_state->pot_height,
1074                                      bld->static_sampler_state->wrap_t,
1075                                      &y00, &y10, &t_fpart);
1076          lp_build_name(y00, "tex.y0.wrapped");
1077          lp_build_name(y10, "tex.y1.wrapped");
1078          y01 = y00;
1079          y11 = y10;
1080
1081          if (dims == 3) {
1082             lp_build_sample_wrap_linear(bld, is_gather, coords[2], depth_vec,
1083                                         flt_depth_vec, offsets[2],
1084                                         bld->static_texture_state->pot_depth,
1085                                         bld->static_sampler_state->wrap_r,
1086                                         &z00, &z1, &r_fpart);
1087             z01 = z10 = z11 = z00;
1088             lp_build_name(z00, "tex.z0.wrapped");
1089             lp_build_name(z1, "tex.z1.wrapped");
1090          }
1091       }
1092       if (has_layer_coord(bld->static_texture_state->target)) {
1093          if (bld->static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) {
1094             /* add cube layer to face */
1095             z00 = z01 = z10 = z11 = z1 =
1096                lp_build_add(&bld->int_coord_bld, coords[2], coords[3]);
1097          }
1098          else {
1099             z00 = z01 = z10 = z11 = z1 = coords[2];  /* cube face or layer */
1100          }
1101          lp_build_name(z00, "tex.z0.layer");
1102          lp_build_name(z1, "tex.z1.layer");
1103       }
1104    }
1105    else {
1106       struct lp_build_if_state edge_if;
1107       LLVMTypeRef int1t;
1108       LLVMValueRef new_faces[4], new_xcoords[4][2], new_ycoords[4][2];
1109       LLVMValueRef coord, have_edge, have_corner;
1110       LLVMValueRef fall_off_ym_notxm, fall_off_ym_notxp, fall_off_x, fall_off_y;
1111       LLVMValueRef fall_off_yp_notxm, fall_off_yp_notxp;
1112       LLVMValueRef x0, x1, y0, y1, y0_clamped, y1_clamped;
1113       LLVMValueRef face = coords[2];
1114       LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5f);
1115       LLVMValueRef length_minus_one = lp_build_sub(ivec_bld, width_vec, ivec_bld->one);
1116       /* XXX drop height calcs. Could (should) do this without seamless filtering too */
1117       height_vec = width_vec;
1118       flt_height_vec = flt_width_vec;
1119
1120       /* XXX the overflow logic is actually sort of duplicated with trilinear,
1121        * since an overflow in one mip should also have a corresponding overflow
1122        * in another.
1123        */
1124       /* should always have normalized coords, and offsets are undefined */
1125       assert(bld->static_sampler_state->normalized_coords);
1126       /*
1127        * The coords should all be between [0,1] however we can have NaNs,
1128        * which will wreak havoc. In particular the y1_clamped value below
1129        * can be -INT_MAX (on x86) and be propagated right through (probably
1130        * other values might be bogus in the end too).
1131        * So kill off the NaNs here.
1132        */
1133       coords[0] = lp_build_max_ext(coord_bld, coords[0], coord_bld->zero,
1134                                    GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1135       coords[1] = lp_build_max_ext(coord_bld, coords[1], coord_bld->zero,
1136                                    GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1137       coord = lp_build_mul(coord_bld, coords[0], flt_width_vec);
1138       /* instead of clamp, build mask if overflowed */
1139       coord = lp_build_sub(coord_bld, coord, half);
1140       /* convert to int, compute lerp weight */
1141       /* not ideal with AVX (and no AVX2) */
1142       lp_build_ifloor_fract(coord_bld, coord, &x0, &s_fpart);
1143       x1 = lp_build_add(ivec_bld, x0, ivec_bld->one);
1144       coord = lp_build_mul(coord_bld, coords[1], flt_height_vec);
1145       coord = lp_build_sub(coord_bld, coord, half);
1146       lp_build_ifloor_fract(coord_bld, coord, &y0, &t_fpart);
1147       y1 = lp_build_add(ivec_bld, y0, ivec_bld->one);
1148
1149       fall_off[0] = lp_build_cmp(ivec_bld, PIPE_FUNC_LESS, x0, ivec_bld->zero);
1150       fall_off[1] = lp_build_cmp(ivec_bld, PIPE_FUNC_GREATER, x1, length_minus_one);
1151       fall_off[2] = lp_build_cmp(ivec_bld, PIPE_FUNC_LESS, y0, ivec_bld->zero);
1152       fall_off[3] = lp_build_cmp(ivec_bld, PIPE_FUNC_GREATER, y1, length_minus_one);
1153
1154       fall_off_x = lp_build_or(ivec_bld, fall_off[0], fall_off[1]);
1155       fall_off_y = lp_build_or(ivec_bld, fall_off[2], fall_off[3]);
1156       have_edge = lp_build_or(ivec_bld, fall_off_x, fall_off_y);
1157       have_edge = lp_build_any_true_range(ivec_bld, ivec_bld->type.length, have_edge);
1158
1159       /* needed for accurate corner filtering branch later, rely on 0 init */
1160       int1t = LLVMInt1TypeInContext(bld->gallivm->context);
1161       have_corners = lp_build_alloca(bld->gallivm, int1t, "have_corner");
1162
1163       for (texel_index = 0; texel_index < 4; texel_index++) {
1164          xs[texel_index] = lp_build_alloca(bld->gallivm, ivec_bld->vec_type, "xs");
1165          ys[texel_index] = lp_build_alloca(bld->gallivm, ivec_bld->vec_type, "ys");
1166          zs[texel_index] = lp_build_alloca(bld->gallivm, ivec_bld->vec_type, "zs");
1167       }
1168
1169       lp_build_if(&edge_if, bld->gallivm, have_edge);
1170
1171       have_corner = lp_build_and(ivec_bld, fall_off_x, fall_off_y);
1172       have_corner = lp_build_any_true_range(ivec_bld, ivec_bld->type.length, have_corner);
1173       LLVMBuildStore(builder, have_corner, have_corners);
1174
1175       /*
1176        * Need to feed clamped values here for cheap corner handling,
1177        * but only for y coord (as when falling off both edges we only
1178        * fall off the x one) - this should be sufficient.
1179        */
1180       y0_clamped = lp_build_max(ivec_bld, y0, ivec_bld->zero);
1181       y1_clamped = lp_build_min(ivec_bld, y1, length_minus_one);
1182
1183       /*
1184        * Get all possible new coords.
1185        */
1186       lp_build_cube_new_coords(ivec_bld, face,
1187                                x0, x1, y0_clamped, y1_clamped,
1188                                length_minus_one,
1189                                new_faces, new_xcoords, new_ycoords);
1190
1191       /* handle fall off x-, x+ direction */
1192       /* determine new coords, face (not both fall_off vars can be true at same time) */
1193       x00 = lp_build_select(ivec_bld, fall_off[0], new_xcoords[0][0], x0);
1194       y00 = lp_build_select(ivec_bld, fall_off[0], new_ycoords[0][0], y0_clamped);
1195       x10 = lp_build_select(ivec_bld, fall_off[0], new_xcoords[0][1], x0);
1196       y10 = lp_build_select(ivec_bld, fall_off[0], new_ycoords[0][1], y1_clamped);
1197       x01 = lp_build_select(ivec_bld, fall_off[1], new_xcoords[1][0], x1);
1198       y01 = lp_build_select(ivec_bld, fall_off[1], new_ycoords[1][0], y0_clamped);
1199       x11 = lp_build_select(ivec_bld, fall_off[1], new_xcoords[1][1], x1);
1200       y11 = lp_build_select(ivec_bld, fall_off[1], new_ycoords[1][1], y1_clamped);
1201
1202       z00 = z10 = lp_build_select(ivec_bld, fall_off[0], new_faces[0], face);
1203       z01 = z11 = lp_build_select(ivec_bld, fall_off[1], new_faces[1], face);
1204
1205       /* handle fall off y-, y+ direction */
1206       /*
1207        * Cheap corner logic: just hack up things so a texel doesn't fall
1208        * off both sides (which means filter weights will be wrong but we'll only
1209        * use valid texels in the filter).
1210        * This means however (y) coords must additionally be clamped (see above).
1211        * This corner handling should be fully OpenGL (but not d3d10) compliant.
1212        */
1213       fall_off_ym_notxm = lp_build_andnot(ivec_bld, fall_off[2], fall_off[0]);
1214       fall_off_ym_notxp = lp_build_andnot(ivec_bld, fall_off[2], fall_off[1]);
1215       fall_off_yp_notxm = lp_build_andnot(ivec_bld, fall_off[3], fall_off[0]);
1216       fall_off_yp_notxp = lp_build_andnot(ivec_bld, fall_off[3], fall_off[1]);
1217
1218       x00 = lp_build_select(ivec_bld, fall_off_ym_notxm, new_xcoords[2][0], x00);
1219       y00 = lp_build_select(ivec_bld, fall_off_ym_notxm, new_ycoords[2][0], y00);
1220       x01 = lp_build_select(ivec_bld, fall_off_ym_notxp, new_xcoords[2][1], x01);
1221       y01 = lp_build_select(ivec_bld, fall_off_ym_notxp, new_ycoords[2][1], y01);
1222       x10 = lp_build_select(ivec_bld, fall_off_yp_notxm, new_xcoords[3][0], x10);
1223       y10 = lp_build_select(ivec_bld, fall_off_yp_notxm, new_ycoords[3][0], y10);
1224       x11 = lp_build_select(ivec_bld, fall_off_yp_notxp, new_xcoords[3][1], x11);
1225       y11 = lp_build_select(ivec_bld, fall_off_yp_notxp, new_ycoords[3][1], y11);
1226
1227       z00 = lp_build_select(ivec_bld, fall_off_ym_notxm, new_faces[2], z00);
1228       z01 = lp_build_select(ivec_bld, fall_off_ym_notxp, new_faces[2], z01);
1229       z10 = lp_build_select(ivec_bld, fall_off_yp_notxm, new_faces[3], z10);
1230       z11 = lp_build_select(ivec_bld, fall_off_yp_notxp, new_faces[3], z11);
1231
1232       if (bld->static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) {
1233          /* now can add cube layer to face (per sample) */
1234          z00 = lp_build_add(ivec_bld, z00, coords[3]);
1235          z01 = lp_build_add(ivec_bld, z01, coords[3]);
1236          z10 = lp_build_add(ivec_bld, z10, coords[3]);
1237          z11 = lp_build_add(ivec_bld, z11, coords[3]);
1238       }
1239
1240       LLVMBuildStore(builder, x00, xs[0]);
1241       LLVMBuildStore(builder, x01, xs[1]);
1242       LLVMBuildStore(builder, x10, xs[2]);
1243       LLVMBuildStore(builder, x11, xs[3]);
1244       LLVMBuildStore(builder, y00, ys[0]);
1245       LLVMBuildStore(builder, y01, ys[1]);
1246       LLVMBuildStore(builder, y10, ys[2]);
1247       LLVMBuildStore(builder, y11, ys[3]);
1248       LLVMBuildStore(builder, z00, zs[0]);
1249       LLVMBuildStore(builder, z01, zs[1]);
1250       LLVMBuildStore(builder, z10, zs[2]);
1251       LLVMBuildStore(builder, z11, zs[3]);
1252
1253       lp_build_else(&edge_if);
1254
1255       LLVMBuildStore(builder, x0, xs[0]);
1256       LLVMBuildStore(builder, x1, xs[1]);
1257       LLVMBuildStore(builder, x0, xs[2]);
1258       LLVMBuildStore(builder, x1, xs[3]);
1259       LLVMBuildStore(builder, y0, ys[0]);
1260       LLVMBuildStore(builder, y0, ys[1]);
1261       LLVMBuildStore(builder, y1, ys[2]);
1262       LLVMBuildStore(builder, y1, ys[3]);
1263       if (bld->static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) {
1264          LLVMValueRef cube_layer = lp_build_add(ivec_bld, face, coords[3]);
1265          LLVMBuildStore(builder, cube_layer, zs[0]);
1266          LLVMBuildStore(builder, cube_layer, zs[1]);
1267          LLVMBuildStore(builder, cube_layer, zs[2]);
1268          LLVMBuildStore(builder, cube_layer, zs[3]);
1269       }
1270       else {
1271          LLVMBuildStore(builder, face, zs[0]);
1272          LLVMBuildStore(builder, face, zs[1]);
1273          LLVMBuildStore(builder, face, zs[2]);
1274          LLVMBuildStore(builder, face, zs[3]);
1275       }
1276
1277       lp_build_endif(&edge_if);
1278
1279       x00 = LLVMBuildLoad(builder, xs[0], "");
1280       x01 = LLVMBuildLoad(builder, xs[1], "");
1281       x10 = LLVMBuildLoad(builder, xs[2], "");
1282       x11 = LLVMBuildLoad(builder, xs[3], "");
1283       y00 = LLVMBuildLoad(builder, ys[0], "");
1284       y01 = LLVMBuildLoad(builder, ys[1], "");
1285       y10 = LLVMBuildLoad(builder, ys[2], "");
1286       y11 = LLVMBuildLoad(builder, ys[3], "");
1287       z00 = LLVMBuildLoad(builder, zs[0], "");
1288       z01 = LLVMBuildLoad(builder, zs[1], "");
1289       z10 = LLVMBuildLoad(builder, zs[2], "");
1290       z11 = LLVMBuildLoad(builder, zs[3], "");
1291    }
1292
1293    if (linear_mask) {
1294       /*
1295        * Whack filter weights into place. Whatever texel had more weight is
1296        * the one which should have been selected by nearest filtering hence
1297        * just use 100% weight for it.
1298        */
1299       struct lp_build_context *c_bld = &bld->coord_bld;
1300       LLVMValueRef w1_mask, w1_weight;
1301       LLVMValueRef half = lp_build_const_vec(bld->gallivm, c_bld->type, 0.5f);
1302
1303       w1_mask = lp_build_cmp(c_bld, PIPE_FUNC_GREATER, s_fpart, half);
1304       /* this select is really just a "and" */
1305       w1_weight = lp_build_select(c_bld, w1_mask, c_bld->one, c_bld->zero);
1306       s_fpart = lp_build_select(c_bld, linear_mask, s_fpart, w1_weight);
1307       if (dims >= 2) {
1308          w1_mask = lp_build_cmp(c_bld, PIPE_FUNC_GREATER, t_fpart, half);
1309          w1_weight = lp_build_select(c_bld, w1_mask, c_bld->one, c_bld->zero);
1310          t_fpart = lp_build_select(c_bld, linear_mask, t_fpart, w1_weight);
1311          if (dims == 3) {
1312             w1_mask = lp_build_cmp(c_bld, PIPE_FUNC_GREATER, r_fpart, half);
1313             w1_weight = lp_build_select(c_bld, w1_mask, c_bld->one, c_bld->zero);
1314             r_fpart = lp_build_select(c_bld, linear_mask, r_fpart, w1_weight);
1315          }
1316       }
1317    }
1318
1319    /*
1320     * Get texture colors.
1321     */
1322    /* get x0/x1 texels */
1323    lp_build_sample_texel_soa(bld,
1324                              width_vec, height_vec, depth_vec,
1325                              x00, y00, z00,
1326                              row_stride_vec, img_stride_vec,
1327                              data_ptr, mipoffsets, neighbors[0][0]);
1328    lp_build_sample_texel_soa(bld,
1329                              width_vec, height_vec, depth_vec,
1330                              x01, y01, z01,
1331                              row_stride_vec, img_stride_vec,
1332                              data_ptr, mipoffsets, neighbors[0][1]);
1333
1334    if (dims == 1) {
1335       assert(!is_gather);
1336       if (bld->static_sampler_state->compare_mode == PIPE_TEX_COMPARE_NONE) {
1337          /* Interpolate two samples from 1D image to produce one color */
1338          for (chan = 0; chan < 4; chan++) {
1339             colors_out[chan] = lp_build_lerp(texel_bld, s_fpart,
1340                                              neighbors[0][0][chan],
1341                                              neighbors[0][1][chan],
1342                                              0);
1343          }
1344       }
1345       else {
1346          LLVMValueRef cmpval0, cmpval1;
1347          cmpval0 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][0][0]);
1348          cmpval1 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][1][0]);
1349          /* simplified lerp, AND mask with weight and add */
1350          colors_out[0] = lp_build_masklerp(texel_bld, s_fpart,
1351                                            cmpval0, cmpval1);
1352          colors_out[1] = colors_out[2] = colors_out[3] = colors_out[0];
1353       }
1354    }
1355    else {
1356       /* 2D/3D texture */
1357       struct lp_build_if_state corner_if;
1358       LLVMValueRef colors0[4], colorss[4];
1359
1360       /* get x0/x1 texels at y1 */
1361       lp_build_sample_texel_soa(bld,
1362                                 width_vec, height_vec, depth_vec,
1363                                 x10, y10, z10,
1364                                 row_stride_vec, img_stride_vec,
1365                                 data_ptr, mipoffsets, neighbors[1][0]);
1366       lp_build_sample_texel_soa(bld,
1367                                 width_vec, height_vec, depth_vec,
1368                                 x11, y11, z11,
1369                                 row_stride_vec, img_stride_vec,
1370                                 data_ptr, mipoffsets, neighbors[1][1]);
1371
1372       /*
1373        * To avoid having to duplicate linear_mask / fetch code use
1374        * another branch (with corner condition though edge would work
1375        * as well) here.
1376        */
1377       if (accurate_cube_corners) {
1378          LLVMValueRef c00, c01, c10, c11, c00f, c01f, c10f, c11f;
1379          LLVMValueRef have_corner, one_third;
1380
1381          colorss[0] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs0");
1382          colorss[1] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs1");
1383          colorss[2] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs2");
1384          colorss[3] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs3");
1385
1386          have_corner = LLVMBuildLoad(builder, have_corners, "");
1387
1388          lp_build_if(&corner_if, bld->gallivm, have_corner);
1389
1390          one_third = lp_build_const_vec(bld->gallivm, coord_bld->type,
1391                                         1.0f/3.0f);
1392
1393          /* find corner */
1394          c00 = lp_build_and(ivec_bld, fall_off[0], fall_off[2]);
1395          c00f = LLVMBuildBitCast(builder, c00, coord_bld->vec_type, "");
1396          c01 = lp_build_and(ivec_bld, fall_off[1], fall_off[2]);
1397          c01f = LLVMBuildBitCast(builder, c01, coord_bld->vec_type, "");
1398          c10 = lp_build_and(ivec_bld, fall_off[0], fall_off[3]);
1399          c10f = LLVMBuildBitCast(builder, c10, coord_bld->vec_type, "");
1400          c11 = lp_build_and(ivec_bld, fall_off[1], fall_off[3]);
1401          c11f = LLVMBuildBitCast(builder, c11, coord_bld->vec_type, "");
1402
1403          if (!is_gather) {
1404             /*
1405              * we can't use standard 2d lerp as we need per-element weight
1406              * in case of corners, so just calculate bilinear result as
1407              * w00*s00 + w01*s01 + w10*s10 + w11*s11.
1408              * (This is actually less work than using 2d lerp, 7 vs. 9
1409              * instructions, however calculating the weights needs another 6,
1410              * so actually probably not slower than 2d lerp only for 4 channels
1411              * as weights only need to be calculated once - of course fixing
1412              * the weights has additional cost.)
1413              */
1414             LLVMValueRef w00, w01, w10, w11, wx0, wy0, c_weight, tmp;
1415             wx0 = lp_build_sub(coord_bld, coord_bld->one, s_fpart);
1416             wy0 = lp_build_sub(coord_bld, coord_bld->one, t_fpart);
1417             w00 = lp_build_mul(coord_bld, wx0, wy0);
1418             w01 = lp_build_mul(coord_bld, s_fpart, wy0);
1419             w10 = lp_build_mul(coord_bld, wx0, t_fpart);
1420             w11 = lp_build_mul(coord_bld, s_fpart, t_fpart);
1421
1422             /* find corner weight */
1423             c_weight = lp_build_select(coord_bld, c00, w00, coord_bld->zero);
1424             c_weight = lp_build_select(coord_bld, c01, w01, c_weight);
1425             c_weight = lp_build_select(coord_bld, c10, w10, c_weight);
1426             c_weight = lp_build_select(coord_bld, c11, w11, c_weight);
1427
1428             /*
1429              * add 1/3 of the corner weight to the weight of the 3 other
1430              * samples and null out corner weight.
1431              */
1432             c_weight = lp_build_mul(coord_bld, c_weight, one_third);
1433             w00 = lp_build_add(coord_bld, w00, c_weight);
1434             w00 = lp_build_andnot(coord_bld, w00, c00f);
1435             w01 = lp_build_add(coord_bld, w01, c_weight);
1436             w01 = lp_build_andnot(coord_bld, w01, c01f);
1437             w10 = lp_build_add(coord_bld, w10, c_weight);
1438             w10 = lp_build_andnot(coord_bld, w10, c10f);
1439             w11 = lp_build_add(coord_bld, w11, c_weight);
1440             w11 = lp_build_andnot(coord_bld, w11, c11f);
1441
1442             if (bld->static_sampler_state->compare_mode ==
1443                 PIPE_TEX_COMPARE_NONE) {
1444                for (chan = 0; chan < 4; chan++) {
1445                   colors0[chan] = lp_build_mul(coord_bld, w00,
1446                                                neighbors[0][0][chan]);
1447                   tmp = lp_build_mul(coord_bld, w01, neighbors[0][1][chan]);
1448                   colors0[chan] = lp_build_add(coord_bld, tmp, colors0[chan]);
1449                   tmp = lp_build_mul(coord_bld, w10, neighbors[1][0][chan]);
1450                   colors0[chan] = lp_build_add(coord_bld, tmp, colors0[chan]);
1451                   tmp = lp_build_mul(coord_bld, w11, neighbors[1][1][chan]);
1452                   colors0[chan] = lp_build_add(coord_bld, tmp, colors0[chan]);
1453                }
1454             }
1455             else {
1456                LLVMValueRef cmpval00, cmpval01, cmpval10, cmpval11;
1457                cmpval00 = lp_build_sample_comparefunc(bld, coords[4],
1458                                                       neighbors[0][0][0]);
1459                cmpval01 = lp_build_sample_comparefunc(bld, coords[4],
1460                                                       neighbors[0][1][0]);
1461                cmpval10 = lp_build_sample_comparefunc(bld, coords[4],
1462                                                       neighbors[1][0][0]);
1463                cmpval11 = lp_build_sample_comparefunc(bld, coords[4],
1464                                                       neighbors[1][1][0]);
1465                /*
1466                 * inputs to interpolation are just masks so just add
1467                 * masked weights together
1468                 */
1469                cmpval00 = LLVMBuildBitCast(builder, cmpval00,
1470                                            coord_bld->vec_type, "");
1471                cmpval01 = LLVMBuildBitCast(builder, cmpval01,
1472                                            coord_bld->vec_type, "");
1473                cmpval10 = LLVMBuildBitCast(builder, cmpval10,
1474                                            coord_bld->vec_type, "");
1475                cmpval11 = LLVMBuildBitCast(builder, cmpval11,
1476                                            coord_bld->vec_type, "");
1477                colors0[0] = lp_build_and(coord_bld, w00, cmpval00);
1478                tmp = lp_build_and(coord_bld, w01, cmpval01);
1479                colors0[0] = lp_build_add(coord_bld, tmp, colors0[0]);
1480                tmp = lp_build_and(coord_bld, w10, cmpval10);
1481                colors0[0] = lp_build_add(coord_bld, tmp, colors0[0]);
1482                tmp = lp_build_and(coord_bld, w11, cmpval11);
1483                colors0[0] = lp_build_add(coord_bld, tmp, colors0[0]);
1484                colors0[1] = colors0[2] = colors0[3] = colors0[0];
1485             }
1486          }
1487          else {
1488             /*
1489              * We don't have any weights to adjust, so instead calculate
1490              * the fourth texel as simply the average of the other 3.
1491              * (This would work for non-gather too, however we'd have
1492              * a boatload more of the select stuff due to there being
1493              * 4 times as many colors as weights.)
1494              */
1495             LLVMValueRef col00, col01, col10, col11;
1496             LLVMValueRef colc, colc0, colc1;
1497             col10 = lp_build_swizzle_soa_channel(texel_bld,
1498                                                  neighbors[1][0], chan_swiz);
1499             col11 = lp_build_swizzle_soa_channel(texel_bld,
1500                                                  neighbors[1][1], chan_swiz);
1501             col01 = lp_build_swizzle_soa_channel(texel_bld,
1502                                                  neighbors[0][1], chan_swiz);
1503             col00 = lp_build_swizzle_soa_channel(texel_bld,
1504                                                  neighbors[0][0], chan_swiz);
1505
1506             /*
1507              * The spec says for comparison filtering, the comparison
1508              * must happen before synthesizing the new value.
1509              * This means all gathered values are always 0 or 1,
1510              * except for the non-existing texel, which can be 0,1/3,2/3,1...
1511              * Seems like we'd be allowed to just return 0 or 1 too, so we
1512              * could simplify and pass down the compare mask values to the
1513              * end (using int arithmetic/compare on the mask values to
1514              * construct the fourth texel) and only there convert to floats
1515              * but it's probably not worth it (it might be easier for the cpu
1516              * but not for the code)...
1517              */
1518             if (bld->static_sampler_state->compare_mode !=
1519                 PIPE_TEX_COMPARE_NONE) {
1520                LLVMValueRef cmpval00, cmpval01, cmpval10, cmpval11;
1521                cmpval00 = lp_build_sample_comparefunc(bld, coords[4], col00);
1522                cmpval01 = lp_build_sample_comparefunc(bld, coords[4], col01);
1523                cmpval10 = lp_build_sample_comparefunc(bld, coords[4], col10);
1524                cmpval11 = lp_build_sample_comparefunc(bld, coords[4], col11);
1525                col00 = lp_build_select(texel_bld, cmpval00,
1526                                        texel_bld->one, texel_bld->zero);
1527                col01 = lp_build_select(texel_bld, cmpval01,
1528                                        texel_bld->one, texel_bld->zero);
1529                col10 = lp_build_select(texel_bld, cmpval10,
1530                                        texel_bld->one, texel_bld->zero);
1531                col11 = lp_build_select(texel_bld, cmpval11,
1532                                        texel_bld->one, texel_bld->zero);
1533             }
1534
1535             /*
1536              * Null out corner color.
1537              */
1538             col00 = lp_build_andnot(coord_bld, col00, c00f);
1539             col01 = lp_build_andnot(coord_bld, col01, c01f);
1540             col10 = lp_build_andnot(coord_bld, col10, c10f);
1541             col11 = lp_build_andnot(coord_bld, col11, c11f);
1542
1543             /*
1544              * New corner texel color is all colors added / 3.
1545              */
1546             colc0 = lp_build_add(coord_bld, col00, col01);
1547             colc1 = lp_build_add(coord_bld, col10, col11);
1548             colc = lp_build_add(coord_bld, colc0, colc1);
1549             colc = lp_build_mul(coord_bld, one_third, colc);
1550
1551             /*
1552              * Replace the corner texel color with the new value.
1553              */
1554             col00 = lp_build_select(coord_bld, c00, colc, col00);
1555             col01 = lp_build_select(coord_bld, c01, colc, col01);
1556             col10 = lp_build_select(coord_bld, c10, colc, col10);
1557             col11 = lp_build_select(coord_bld, c11, colc, col11);
1558
1559             colors0[0] = col10;
1560             colors0[1] = col11;
1561             colors0[2] = col01;
1562             colors0[3] = col00;
1563          }
1564
1565          LLVMBuildStore(builder, colors0[0], colorss[0]);
1566          LLVMBuildStore(builder, colors0[1], colorss[1]);
1567          LLVMBuildStore(builder, colors0[2], colorss[2]);
1568          LLVMBuildStore(builder, colors0[3], colorss[3]);
1569
1570          lp_build_else(&corner_if);
1571       }
1572
1573       if (bld->static_sampler_state->compare_mode == PIPE_TEX_COMPARE_NONE) {
1574          if (is_gather) {
1575             /*
1576              * Just assign the red channel (no component selection yet).
1577              * This is a bit hackish, we usually do the swizzle at the
1578              * end of sampling (much less values to swizzle), but this
1579              * obviously cannot work when using gather.
1580              */
1581             colors0[0] = lp_build_swizzle_soa_channel(texel_bld,
1582                                                       neighbors[1][0],
1583                                                       chan_swiz);
1584             colors0[1] = lp_build_swizzle_soa_channel(texel_bld,
1585                                                       neighbors[1][1],
1586                                                       chan_swiz);
1587             colors0[2] = lp_build_swizzle_soa_channel(texel_bld,
1588                                                       neighbors[0][1],
1589                                                       chan_swiz);
1590             colors0[3] = lp_build_swizzle_soa_channel(texel_bld,
1591                                                       neighbors[0][0],
1592                                                       chan_swiz);
1593          }
1594          else {
1595             /* Bilinear interpolate the four samples from the 2D image / 3D slice */
1596             for (chan = 0; chan < 4; chan++) {
1597                colors0[chan] = lp_build_lerp_2d(texel_bld,
1598                                                 s_fpart, t_fpart,
1599                                                 neighbors[0][0][chan],
1600                                                 neighbors[0][1][chan],
1601                                                 neighbors[1][0][chan],
1602                                                 neighbors[1][1][chan],
1603                                                 0);
1604             }
1605          }
1606       }
1607       else {
1608          LLVMValueRef cmpval00, cmpval01, cmpval10, cmpval11;
1609          cmpval00 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][0][0]);
1610          cmpval01 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][1][0]);
1611          cmpval10 = lp_build_sample_comparefunc(bld, coords[4], neighbors[1][0][0]);
1612          cmpval11 = lp_build_sample_comparefunc(bld, coords[4], neighbors[1][1][0]);
1613
1614          if (is_gather) {
1615             /* more hacks for swizzling, should be X, ONE or ZERO... */
1616             colors0[0] = lp_build_select(texel_bld, cmpval10,
1617                                          texel_bld->one, texel_bld->zero);
1618             colors0[1] = lp_build_select(texel_bld, cmpval11,
1619                                          texel_bld->one, texel_bld->zero);
1620             colors0[2] = lp_build_select(texel_bld, cmpval01,
1621                                          texel_bld->one, texel_bld->zero);
1622             colors0[3] = lp_build_select(texel_bld, cmpval00,
1623                                          texel_bld->one, texel_bld->zero);
1624          }
1625          else {
1626             colors0[0] = lp_build_masklerp2d(texel_bld, s_fpart, t_fpart,
1627                                              cmpval00, cmpval01, cmpval10, cmpval11);
1628             colors0[1] = colors0[2] = colors0[3] = colors0[0];
1629          }
1630       }
1631
1632       if (accurate_cube_corners) {
1633          LLVMBuildStore(builder, colors0[0], colorss[0]);
1634          LLVMBuildStore(builder, colors0[1], colorss[1]);
1635          LLVMBuildStore(builder, colors0[2], colorss[2]);
1636          LLVMBuildStore(builder, colors0[3], colorss[3]);
1637
1638          lp_build_endif(&corner_if);
1639
1640          colors0[0] = LLVMBuildLoad(builder, colorss[0], "");
1641          colors0[1] = LLVMBuildLoad(builder, colorss[1], "");
1642          colors0[2] = LLVMBuildLoad(builder, colorss[2], "");
1643          colors0[3] = LLVMBuildLoad(builder, colorss[3], "");
1644       }
1645
1646       if (dims == 3) {
1647          LLVMValueRef neighbors1[2][2][4];
1648          LLVMValueRef colors1[4];
1649
1650          assert(!is_gather);
1651
1652          /* get x0/x1/y0/y1 texels at z1 */
1653          lp_build_sample_texel_soa(bld,
1654                                    width_vec, height_vec, depth_vec,
1655                                    x00, y00, z1,
1656                                    row_stride_vec, img_stride_vec,
1657                                    data_ptr, mipoffsets, neighbors1[0][0]);
1658          lp_build_sample_texel_soa(bld,
1659                                    width_vec, height_vec, depth_vec,
1660                                    x01, y01, z1,
1661                                    row_stride_vec, img_stride_vec,
1662                                    data_ptr, mipoffsets, neighbors1[0][1]);
1663          lp_build_sample_texel_soa(bld,
1664                                    width_vec, height_vec, depth_vec,
1665                                    x10, y10, z1,
1666                                    row_stride_vec, img_stride_vec,
1667                                    data_ptr, mipoffsets, neighbors1[1][0]);
1668          lp_build_sample_texel_soa(bld,
1669                                    width_vec, height_vec, depth_vec,
1670                                    x11, y11, z1,
1671                                    row_stride_vec, img_stride_vec,
1672                                    data_ptr, mipoffsets, neighbors1[1][1]);
1673
1674          if (bld->static_sampler_state->compare_mode == PIPE_TEX_COMPARE_NONE) {
1675             /* Bilinear interpolate the four samples from the second Z slice */
1676             for (chan = 0; chan < 4; chan++) {
1677                colors1[chan] = lp_build_lerp_2d(texel_bld,
1678                                                 s_fpart, t_fpart,
1679                                                 neighbors1[0][0][chan],
1680                                                 neighbors1[0][1][chan],
1681                                                 neighbors1[1][0][chan],
1682                                                 neighbors1[1][1][chan],
1683                                                 0);
1684             }
1685             /* Linearly interpolate the two samples from the two 3D slices */
1686             for (chan = 0; chan < 4; chan++) {
1687                colors_out[chan] = lp_build_lerp(texel_bld,
1688                                                 r_fpart,
1689                                                 colors0[chan], colors1[chan],
1690                                                 0);
1691             }
1692          }
1693          else {
1694             LLVMValueRef cmpval00, cmpval01, cmpval10, cmpval11;
1695             cmpval00 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][0][0]);
1696             cmpval01 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][1][0]);
1697             cmpval10 = lp_build_sample_comparefunc(bld, coords[4], neighbors[1][0][0]);
1698             cmpval11 = lp_build_sample_comparefunc(bld, coords[4], neighbors[1][1][0]);
1699             colors1[0] = lp_build_masklerp2d(texel_bld, s_fpart, t_fpart,
1700                                              cmpval00, cmpval01, cmpval10, cmpval11);
1701             /* Linearly interpolate the two samples from the two 3D slices */
1702             colors_out[0] = lp_build_lerp(texel_bld,
1703                                           r_fpart,
1704                                           colors0[0], colors1[0],
1705                                           0);
1706             colors_out[1] = colors_out[2] = colors_out[3] = colors_out[0];
1707          }
1708       }
1709       else {
1710          /* 2D tex */
1711          for (chan = 0; chan < 4; chan++) {
1712             colors_out[chan] = colors0[chan];
1713          }
1714       }
1715    }
1716    if (is_gather) {
1717       /*
1718        * For gather, we can't do our usual channel swizzling done later,
1719        * so do it here. It only really matters for 0/1 swizzles in case
1720        * of comparison filtering, since in this case the results would be
1721        * wrong, without comparison it should all work out alright but it
1722        * can't hurt to do that here, since it will instantly drop all
1723        * calculations above, though it's a rather stupid idea to do
1724        * gather on a channel which will always return 0 or 1 in any case...
1725        */
1726       if (chan_swiz == PIPE_SWIZZLE_1) {
1727          for (chan = 0; chan < 4; chan++) {
1728             colors_out[chan] = texel_bld->one;
1729          }
1730       } else if (chan_swiz == PIPE_SWIZZLE_0) {
1731          for (chan = 0; chan < 4; chan++) {
1732             colors_out[chan] = texel_bld->zero;
1733          }
1734       }
1735    }
1736 }
1737
1738
1739 /**
1740  * Sample the texture/mipmap using given image filter and mip filter.
1741  * ilevel0 and ilevel1 indicate the two mipmap levels to sample
1742  * from (vectors or scalars).
1743  * If we're using nearest miplevel sampling the '1' values will be null/unused.
      *
      * The first level is always sampled and its four channels are stored
      * through colors_out.  Code for the second level is only emitted when
      * mip_filter is PIPE_TEX_MIPFILTER_LINEAR, and it sits inside a runtime
      * branch: in that branch both levels are lerped by lod_fpart and the
      * blended channels are stored through the same colors_out pointers again.
      *
      * @param bld         sample build context (builder, type contexts,
      *                    num_mips / num_lods, base_ptr are read from it)
      * @param img_filter  PIPE_TEX_FILTER_NEAREST selects nearest sampling,
      *                    anything else is asserted to be PIPE_TEX_FILTER_LINEAR
      *                    (the assert is only on the first level)
      * @param mip_filter  second level is sampled and blended only for
      *                    PIPE_TEX_MIPFILTER_LINEAR
      * @param is_gather   forwarded to lp_build_sample_image_linear for the
      *                    first level only
      * @param coords      forwarded unchanged to the image sampling helpers
      * @param offsets     forwarded unchanged to the image sampling helpers
      * @param ilevel0     first mipmap level
      * @param ilevel1     second mipmap level, only used in the lerp branch
      * @param lod_fpart   blend weight between the two levels; clamped to be
      *                    non-negative inside the lerp branch
      * @param colors_out  four pointers the resulting channels are stored to
1744  */
1745 static void
1746 lp_build_sample_mipmap(struct lp_build_sample_context *bld,
1747                        unsigned img_filter,
1748                        unsigned mip_filter,
1749                        boolean is_gather,
1750                        LLVMValueRef *coords,
1751                        const LLVMValueRef *offsets,
1752                        LLVMValueRef ilevel0,
1753                        LLVMValueRef ilevel1,
1754                        LLVMValueRef lod_fpart,
1755                        LLVMValueRef *colors_out)
1756 {
1757    LLVMBuilderRef builder = bld->gallivm->builder;
1758    LLVMValueRef size0 = NULL;
1759    LLVMValueRef size1 = NULL;
1760    LLVMValueRef row_stride0_vec = NULL;
1761    LLVMValueRef row_stride1_vec = NULL;
1762    LLVMValueRef img_stride0_vec = NULL;
1763    LLVMValueRef img_stride1_vec = NULL;
1764    LLVMValueRef data_ptr0 = NULL;
1765    LLVMValueRef data_ptr1 = NULL;
1766    LLVMValueRef mipoff0 = NULL;
1767    LLVMValueRef mipoff1 = NULL;
1768    LLVMValueRef colors0[4], colors1[4];
1769    unsigned chan;
1770
1771    /* sample the first mipmap level */
1772    lp_build_mipmap_level_sizes(bld, ilevel0,
1773                                &size0,
1774                                &row_stride0_vec, &img_stride0_vec);
        /*
         * With a single mip the level's data pointer is fetched directly and
         * mipoff0 stays NULL; otherwise base_ptr is passed as the data pointer
         * together with the per-level offsets in mipoff0.
         */
1775    if (bld->num_mips == 1) {
1776       data_ptr0 = lp_build_get_mipmap_level(bld, ilevel0);
1777    }
1778    else {
1779       /* This path should work for num_lods 1 too but slightly less efficient */
1780       data_ptr0 = bld->base_ptr;
1781       mipoff0 = lp_build_get_mip_offsets(bld, ilevel0);
1782    }
1783    if (img_filter == PIPE_TEX_FILTER_NEAREST) {
1784       lp_build_sample_image_nearest(bld, size0,
1785                                     row_stride0_vec, img_stride0_vec,
1786                                     data_ptr0, mipoff0, coords, offsets,
1787                                     colors0);
1788    }
1789    else {
1790       assert(img_filter == PIPE_TEX_FILTER_LINEAR);
           /* NULL is passed where lp_build_sample_mipmap_both passes linear_mask */
1791       lp_build_sample_image_linear(bld, is_gather, size0, NULL,
1792                                    row_stride0_vec, img_stride0_vec,
1793                                    data_ptr0, mipoff0, coords, offsets,
1794                                    colors0);
1795    }
1796
1797    /* Store the first level's colors in the output variables */
        /*
         * These stores are the final result whenever the lerp branch below is
         * not emitted or not taken at runtime.
         */
1798    for (chan = 0; chan < 4; chan++) {
1799        LLVMBuildStore(builder, colors0[chan], colors_out[chan]);
1800    }
1801
1802    if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
1803       struct lp_build_if_state if_ctx;
1804       LLVMValueRef need_lerp;
1805
1806       /* need_lerp = lod_fpart > 0 */
1807       if (bld->num_lods == 1) {
              /*
               * Single lod: a plain scalar float compare is the branch condition.
               * NOTE(review): LLVMRealUGT is the unordered variant, so a NaN
               * lod_fpart would presumably also take the branch - confirm this
               * is intended.
               */
1808          need_lerp = LLVMBuildFCmp(builder, LLVMRealUGT,
1809                                    lod_fpart, bld->lodf_bld.zero,
1810                                    "need_lerp");
1811       }
1812       else {
1813          /*
1814           * We'll do mip filtering if any of the quads (or individual
1815           * pixel in case of per-pixel lod) need it.
1816           * It might be better to split the vectors here and only fetch/filter
1817           * quads which need it (if there's one lod per quad).
1818           */
1819          need_lerp = lp_build_compare(bld->gallivm, bld->lodf_bld.type,
1820                                       PIPE_FUNC_GREATER,
1821                                       lod_fpart, bld->lodf_bld.zero);
              /* collapse the per-lod compare result into one branch condition */
1822          need_lerp = lp_build_any_true_range(&bld->lodi_bld, bld->num_lods, need_lerp);
1823       }
1824
1825       lp_build_if(&if_ctx, bld->gallivm, need_lerp);
1826       {
1827          /*
1828           * We unfortunately need to clamp lod_fpart here since we can get
1829           * negative values which would screw up filtering if not all
1830           * lod_fpart values have same sign.
1831           */
1832          lod_fpart = lp_build_max(&bld->lodf_bld, lod_fpart,
1833                                   bld->lodf_bld.zero);
1834          /* sample the second mipmap level */
1835          lp_build_mipmap_level_sizes(bld, ilevel1,
1836                                      &size1,
1837                                      &row_stride1_vec, &img_stride1_vec);
1838          if (bld->num_mips == 1) {
1839             data_ptr1 = lp_build_get_mipmap_level(bld, ilevel1);
1840          }
1841          else {
1842             data_ptr1 = bld->base_ptr;
1843             mipoff1 = lp_build_get_mip_offsets(bld, ilevel1);
1844          }
1845          if (img_filter == PIPE_TEX_FILTER_NEAREST) {
1846             lp_build_sample_image_nearest(bld, size1,
1847                                           row_stride1_vec, img_stride1_vec,
1848                                           data_ptr1, mipoff1, coords, offsets,
1849                                           colors1);
1850          }
1851          else {
                 /*
                  * NOTE(review): the second level is always sampled with
                  * is_gather = FALSE, unlike the first level; presumably gather
                  * never reaches this branch with a linear mip filter - confirm
                  * against callers.
                  */
1852             lp_build_sample_image_linear(bld, FALSE, size1, NULL,
1853                                          row_stride1_vec, img_stride1_vec,
1854                                          data_ptr1, mipoff1, coords, offsets,
1855                                          colors1);
1856          }
1857
1858          /* interpolate samples from the two mipmap levels */
1859
              /*
               * When there are fewer lods than coord vector elements, lod_fpart
               * is converted from the lodf_bld type to the texel_bld type first
               * so it can be used as the lerp weight (presumably replicating
               * each lod value over the elements it covers - see the helper).
               */
1860          if (bld->num_lods != bld->coord_type.length)
1861             lod_fpart = lp_build_unpack_broadcast_aos_scalars(bld->gallivm,
1862                                                               bld->lodf_bld.type,
1863                                                               bld->texel_bld.type,
1864                                                               lod_fpart);
1865
              /* blend per channel and replace the first level's stored result */
1866          for (chan = 0; chan < 4; chan++) {
1867             colors0[chan] = lp_build_lerp(&bld->texel_bld, lod_fpart,
1868                                           colors0[chan], colors1[chan],
1869                                           0);
1870             LLVMBuildStore(builder, colors0[chan], colors_out[chan]);
1871          }
1872       }
1873       lp_build_endif(&if_ctx);
1874    }
1875 }
1876
1877
1878 /**
1879  * Sample the texture/mipmap using given mip filter, and using
1880  * both nearest and linear filtering at the same time depending
1881  * on linear_mask.
1882  * lod can be per quad but linear_mask is always per pixel.
1883  * ilevel0 and ilevel1 indicate the two mipmap levels to sample
1884  * from (vectors or scalars).
1885  * If we're using nearest miplevel sampling the '1' values will be null/unused.
      *
      * Both levels are sampled through lp_build_sample_image_linear with
      * linear_mask and is_gather = FALSE.  The first level's channels are
      * stored through colors_out unconditionally; for
      * PIPE_TEX_MIPFILTER_LINEAR a runtime branch samples the second level,
      * lerps by lod_fpart and stores through colors_out again.
      *
      * @param bld           sample build context
      * @param linear_mask   forwarded to lp_build_sample_image_linear for
      *                      both levels
      * @param mip_filter    second level is sampled and blended only for
      *                      PIPE_TEX_MIPFILTER_LINEAR
      * @param coords        forwarded unchanged to the image sampling helper
      * @param offsets       forwarded unchanged to the image sampling helper
      * @param ilevel0       first mipmap level
      * @param ilevel1       second mipmap level, only used in the lerp branch
      * @param lod_fpart     blend weight between the two levels; clamped to be
      *                      non-negative inside the lerp branch
      * @param lod_positive  mask reduced with lp_build_any_true_range to form
      *                      the branch condition for the second level
      * @param colors_out    four pointers the resulting channels are stored to
1886  */
1887 static void
1888 lp_build_sample_mipmap_both(struct lp_build_sample_context *bld,
1889                             LLVMValueRef linear_mask,
1890                             unsigned mip_filter,
1891                             LLVMValueRef *coords,
1892                             const LLVMValueRef *offsets,
1893                             LLVMValueRef ilevel0,
1894                             LLVMValueRef ilevel1,
1895                             LLVMValueRef lod_fpart,
1896                             LLVMValueRef lod_positive,
1897                             LLVMValueRef *colors_out)
1898 {
1899    LLVMBuilderRef builder = bld->gallivm->builder;
1900    LLVMValueRef size0 = NULL;
1901    LLVMValueRef size1 = NULL;
1902    LLVMValueRef row_stride0_vec = NULL;
1903    LLVMValueRef row_stride1_vec = NULL;
1904    LLVMValueRef img_stride0_vec = NULL;
1905    LLVMValueRef img_stride1_vec = NULL;
1906    LLVMValueRef data_ptr0 = NULL;
1907    LLVMValueRef data_ptr1 = NULL;
1908    LLVMValueRef mipoff0 = NULL;
1909    LLVMValueRef mipoff1 = NULL;
1910    LLVMValueRef colors0[4], colors1[4];
1911    unsigned chan;
1912
1913    /* sample the first mipmap level */
1914    lp_build_mipmap_level_sizes(bld, ilevel0,
1915                                &size0,
1916                                &row_stride0_vec, &img_stride0_vec);
        /*
         * With a single mip the level's data pointer is fetched directly and
         * mipoff0 stays NULL; otherwise base_ptr is passed as the data pointer
         * together with the per-level offsets in mipoff0.
         */
1917    if (bld->num_mips == 1) {
1918       data_ptr0 = lp_build_get_mipmap_level(bld, ilevel0);
1919    }
1920    else {
1921       /* This path should work for num_lods 1 too but slightly less efficient */
1922       data_ptr0 = bld->base_ptr;
1923       mipoff0 = lp_build_get_mip_offsets(bld, ilevel0);
1924    }
1925
        /* no img_filter switch here: the per-pixel linear_mask is handed down instead */
1926    lp_build_sample_image_linear(bld, FALSE, size0, linear_mask,
1927                                 row_stride0_vec, img_stride0_vec,
1928                                 data_ptr0, mipoff0, coords, offsets,
1929                                 colors0);
1930
1931    /* Store the first level's colors in the output variables */
        /*
         * These stores are the final result whenever the lerp branch below is
         * not emitted or not taken at runtime.
         */
1932    for (chan = 0; chan < 4; chan++) {
1933        LLVMBuildStore(builder, colors0[chan], colors_out[chan]);
1934    }
1935
1936    if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
1937       struct lp_build_if_state if_ctx;
1938       LLVMValueRef need_lerp;
1939
1940       /*
1941        * We'll do mip filtering if any of the quads (or individual
1942        * pixel in case of per-pixel lod) need it.
1943        * Note using lod_positive here not lod_fpart since it may be the same
1944        * condition as that used in the outer "if" in the caller hence llvm
1945        * should be able to merge the branches in this case.
1946        */
1947       need_lerp = lp_build_any_true_range(&bld->lodi_bld, bld->num_lods, lod_positive);
1948
1949       lp_build_if(&if_ctx, bld->gallivm, need_lerp);
1950       {
1951          /*
1952           * We unfortunately need to clamp lod_fpart here since we can get
1953           * negative values which would screw up filtering if not all
1954           * lod_fpart values have same sign.
1955           */
1956          lod_fpart = lp_build_max(&bld->lodf_bld, lod_fpart,
1957                                   bld->lodf_bld.zero);
1958          /* sample the second mipmap level */
1959          lp_build_mipmap_level_sizes(bld, ilevel1,
1960                                      &size1,
1961                                      &row_stride1_vec, &img_stride1_vec);
1962          if (bld->num_mips == 1) {
1963             data_ptr1 = lp_build_get_mipmap_level(bld, ilevel1);
1964          }
1965          else {
1966             data_ptr1 = bld->base_ptr;
1967             mipoff1 = lp_build_get_mip_offsets(bld, ilevel1);
1968          }
1969
              /* same linear_mask as for the first level */
1970          lp_build_sample_image_linear(bld, FALSE, size1, linear_mask,
1971                                       row_stride1_vec, img_stride1_vec,
1972                                       data_ptr1, mipoff1, coords, offsets,
1973                                       colors1);
1974
1975          /* interpolate samples from the two mipmap levels */
1976
              /*
               * When there are fewer lods than coord vector elements, lod_fpart
               * is converted from the lodf_bld type to the texel_bld type first
               * so it can be used as the lerp weight (presumably replicating
               * each lod value over the elements it covers - see the helper).
               */
1977          if (bld->num_lods != bld->coord_type.length)
1978             lod_fpart = lp_build_unpack_broadcast_aos_scalars(bld->gallivm,
1979                                                               bld->lodf_bld.type,
1980                                                               bld->texel_bld.type,
1981                                                               lod_fpart);
1982
              /* blend per channel and replace the first level's stored result */
1983          for (chan = 0; chan < 4; chan++) {
1984             colors0[chan] = lp_build_lerp(&bld->texel_bld, lod_fpart,
1985                                           colors0[chan], colors1[chan],
1986                                           0);
1987             LLVMBuildStore(builder, colors0[chan], colors_out[chan]);
1988          }
1989       }
1990       lp_build_endif(&if_ctx);
1991    }
1992 }
1993
1994
1995 /**
1996  * Build (per-coord) layer value.
1997  * Either clamp layer to valid values or fill in optional out_of_bounds
1998  * value and just return value unclamped.
      *
      * The layer count is read through dynamic_state->depth() for the given
      * texture unit.
      *
      * @param bld            sample build context
      * @param texture_unit   texture unit whose layer count is queried
      * @param is_cube_array  selects 6 instead of 1 as the amount subtracted
      *                       from the layer count for the upper clamp bound;
      *                       asserted to be false when out_of_bounds is given
      * @param layer          layer value, handled with int_coord_bld
      * @param out_of_bounds  optional; when non-NULL it receives the mask
      *                       (layer < 0) | (layer >= num_layers) and no
      *                       clamping is done
      * @return layer unchanged when out_of_bounds is non-NULL, otherwise layer
      *         clamped to [0, num_layers - (is_cube_array ? 6 : 1)]
1999  */
2000 static LLVMValueRef
2001 lp_build_layer_coord(struct lp_build_sample_context *bld,
2002                      unsigned texture_unit,
2003                      boolean is_cube_array,
2004                      LLVMValueRef layer,
2005                      LLVMValueRef *out_of_bounds)
2006 {
2007    LLVMValueRef num_layers;
2008    struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
2009
2010    num_layers = bld->dynamic_state->depth(bld->dynamic_state, bld->gallivm,
2011                                           bld->context_ptr, texture_unit);
2012
2013    if (out_of_bounds) {
           /* report-only path: compute the mask, leave layer untouched */
2014       LLVMValueRef out1, out;
2015       assert(!is_cube_array);
           /* num_layers is a scalar; broadcast it to compare against layer */
2016       num_layers = lp_build_broadcast_scalar(int_coord_bld, num_layers);
2017       out = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, layer, int_coord_bld->zero);
2018       out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, layer, num_layers);
2019       *out_of_bounds = lp_build_or(int_coord_bld, out, out1);
2020       return layer;
2021    }
2022    else {
2023       LLVMValueRef maxlayer;
           /*
            * NOTE(review): 6 is presumably the number of faces per cube, making
            * num_layers - 6 the last valid first-face layer of a cube array -
            * confirm against how callers add the face index.
            */
2024       LLVMValueRef s = is_cube_array ? lp_build_const_int32(bld->gallivm, 6) :
2025                                        bld->int_bld.one;
           /* subtract in scalar form first, then broadcast for the clamp */
2026       maxlayer = lp_build_sub(&bld->int_bld, num_layers, s);
2027       maxlayer = lp_build_broadcast_scalar(int_coord_bld, maxlayer);
2028       return lp_build_clamp(int_coord_bld, layer, int_coord_bld->zero, maxlayer);
2029    }
2030 }
2031
2032
2033 /**
2034  * Calculate cube face, lod, mip levels.
2035  */
2036 static void
2037 lp_build_sample_common(struct lp_build_sample_context *bld,
2038                        boolean is_lodq,
2039                        unsigned texture_index,
2040                        unsigned sampler_index,
2041                        LLVMValueRef *coords,
2042                        const struct lp_derivatives *derivs, /* optional */
2043                        LLVMValueRef lod_bias, /* optional */
2044                        LLVMValueRef explicit_lod, /* optional */
2045                        LLVMValueRef *lod_pos_or_zero,
2046                        LLVMValueRef *lod,
2047                        LLVMValueRef *lod_fpart,
2048                        LLVMValueRef *ilevel0,
2049                        LLVMValueRef *ilevel1)
2050 {
2051    const unsigned mip_filter = bld->static_sampler_state->min_mip_filter;
2052    const unsigned min_filter = bld->static_sampler_state->min_img_filter;
2053    const unsigned mag_filter = bld->static_sampler_state->mag_img_filter;
2054    const unsigned target = bld->static_texture_state->target;
2055    LLVMValueRef first_level, cube_rho = NULL;
2056    LLVMValueRef lod_ipart = NULL;
2057    struct lp_derivatives cube_derivs;
2058
2059    /*
2060    printf("%s mip %d  min %d  mag %d\n", __FUNCTION__,
2061           mip_filter, min_filter, mag_filter);
2062    */
2063
2064    /*
2065     * Choose cube face, recompute texcoords for the chosen face and
2066     * compute rho here too (as it requires transform of derivatives).
2067     */
2068    if (target == PIPE_TEXTURE_CUBE || target == PIPE_TEXTURE_CUBE_ARRAY) {
2069       boolean need_derivs;
2070       need_derivs = ((min_filter != mag_filter ||
2071                       mip_filter != PIPE_TEX_MIPFILTER_NONE) &&
2072                       !bld->static_sampler_state->min_max_lod_equal &&
2073                       !explicit_lod);
2074       lp_build_cube_lookup(bld, coords, derivs, &cube_rho, &cube_derivs, need_derivs);
2075       derivs = &cube_derivs;
2076       if (target == PIPE_TEXTURE_CUBE_ARRAY) {
2077          /* calculate cube layer coord now */
2078          LLVMValueRef layer = lp_build_iround(&bld->coord_bld, coords[3]);
2079          LLVMValueRef six = lp_build_const_int_vec(bld->gallivm, bld->int_coord_type, 6);
2080          layer = lp_build_mul(&bld->int_coord_bld, layer, six);
2081          coords[3] = lp_build_layer_coord(bld, texture_index, TRUE, layer, NULL);
2082          /* because of seamless filtering can't add it to face (coords[2]) here. */
2083       }
2084    }
2085    else if (target == PIPE_TEXTURE_1D_ARRAY ||
2086             target == PIPE_TEXTURE_2D_ARRAY) {
2087       coords[2] = lp_build_iround(&bld->coord_bld, coords[2]);
2088       coords[2] = lp_build_layer_coord(bld, texture_index, FALSE, coords[2], NULL);
2089    }
2090
2091    if (bld->static_sampler_state->compare_mode != PIPE_TEX_COMPARE_NONE) {
2092       /*
2093        * Clamp p coords to [0,1] for fixed function depth texture format here.
2094        * Technically this is not entirely correct for unorm depth as the ref value
2095        * should be converted to the depth format (quantization!) and comparison
2096        * then done in texture format. This would actually help performance (since
2097        * only need to do it once and could save the per-sample conversion of texels
2098        * to floats instead), but it would need more messy code (would need to push
2099        * at least some bits down to actual fetch so conversion could be skipped,
2100        * and would have ugly interaction with border color, would need to convert
2101        * border color to that format too or do some other tricks to make it work).
2102        */
2103       const struct util_format_description *format_desc = bld->format_desc;
2104       unsigned chan_type;
2105       /* not entirely sure we couldn't end up with non-valid swizzle here */
2106       chan_type = format_desc->swizzle[0] <= PIPE_SWIZZLE_W ?
2107                      format_desc->channel[format_desc->swizzle[0]].type :
2108                      UTIL_FORMAT_TYPE_FLOAT;
2109       if (chan_type != UTIL_FORMAT_TYPE_FLOAT) {
2110          coords[4] = lp_build_clamp(&bld->coord_bld, coords[4],
2111                                     bld->coord_bld.zero, bld->coord_bld.one);
2112       }
2113    }
2114
2115    /*
2116     * Compute the level of detail (float).
2117     */
2118    if (min_filter != mag_filter ||
2119        mip_filter != PIPE_TEX_MIPFILTER_NONE || is_lodq) {
2120       /* Need to compute lod either to choose mipmap levels or to
2121        * distinguish between minification/magnification with one mipmap level.
2122        */
2123       lp_build_lod_selector(bld, is_lodq, texture_index, sampler_index,
2124                             coords[0], coords[1], coords[2], cube_rho,
2125                             derivs, lod_bias, explicit_lod,
2126                             mip_filter, lod,
2127                             &lod_ipart, lod_fpart, lod_pos_or_zero);
2128       if (is_lodq) {
2129          LLVMValueRef last_level;
2130          last_level = bld->dynamic_state->last_level(bld->dynamic_state,
2131                                                      bld->gallivm,
2132                                                      bld->context_ptr,
2133                                                      texture_index);
2134          first_level = bld->dynamic_state->first_level(bld->dynamic_state,
2135                                                        bld->gallivm,
2136                                                        bld->context_ptr,
2137                                                        texture_index);
2138          last_level = lp_build_sub(&bld->int_bld, last_level, first_level);
2139          last_level = lp_build_int_to_float(&bld->float_bld, last_level);
2140          last_level = lp_build_broadcast_scalar(&bld->lodf_bld, last_level);
2141
2142          switch (mip_filter) {
2143          case PIPE_TEX_MIPFILTER_NONE:
2144             *lod_fpart = bld->lodf_bld.zero;
2145             break;
2146          case PIPE_TEX_MIPFILTER_NEAREST:
2147              *lod_fpart = lp_build_round(&bld->lodf_bld, *lod_fpart);
2148              /* fallthrough */
2149          case PIPE_TEX_MIPFILTER_LINEAR:
2150             *lod_fpart = lp_build_clamp(&bld->lodf_bld, *lod_fpart,
2151                                         bld->lodf_bld.zero, last_level);
2152             break;
2153          }
2154          return;
2155       }
2156
2157    } else {
2158       lod_ipart = bld->lodi_bld.zero;
2159       *lod_pos_or_zero = bld->lodi_bld.zero;
2160    }
2161
2162    if (bld->num_lods != bld->num_mips) {
2163       /* only makes sense if there's just a single mip level */
2164       assert(bld->num_mips == 1);
2165       lod_ipart = lp_build_extract_range(bld->gallivm, lod_ipart, 0, 1);
2166    }
2167
2168    /*
2169     * Compute integer mipmap level(s) to fetch texels from: ilevel0, ilevel1
2170     */
2171    switch (mip_filter) {
2172    default:
2173       assert(0 && "bad mip_filter value in lp_build_sample_soa()");
2174       /* fall-through */
2175    case PIPE_TEX_MIPFILTER_NONE:
2176       /* always use mip level 0 */
2177       first_level = bld->dynamic_state->first_level(bld->dynamic_state,
2178                                                     bld->gallivm, bld->context_ptr,
2179                                                     texture_index);
2180       first_level = lp_build_broadcast_scalar(&bld->leveli_bld, first_level);
2181       *ilevel0 = first_level;
2182       break;
2183    case PIPE_TEX_MIPFILTER_NEAREST:
2184       assert(lod_ipart);
2185       lp_build_nearest_mip_level(bld, texture_index, lod_ipart, ilevel0, NULL);
2186       break;
2187    case PIPE_TEX_MIPFILTER_LINEAR:
2188       assert(lod_ipart);
2189       assert(*lod_fpart);
2190       lp_build_linear_mip_levels(bld, texture_index,
2191                                  lod_ipart, lod_fpart,
2192                                  ilevel0, ilevel1);
2193       break;
2194    }
2195 }
2196
2197 static void
2198 lp_build_clamp_border_color(struct lp_build_sample_context *bld,
2199                             unsigned sampler_unit)
2200 {
2201    struct gallivm_state *gallivm = bld->gallivm;
2202    LLVMBuilderRef builder = gallivm->builder;
2203    LLVMValueRef border_color_ptr =
2204       bld->dynamic_state->border_color(bld->dynamic_state, gallivm,
2205                                        bld->context_ptr, sampler_unit);
2206    LLVMValueRef border_color;
2207    const struct util_format_description *format_desc = bld->format_desc;
2208    struct lp_type vec4_type = bld->texel_type;
2209    struct lp_build_context vec4_bld;
2210    LLVMValueRef min_clamp = NULL;
2211    LLVMValueRef max_clamp = NULL;
2212
2213    /*
2214     * For normalized format need to clamp border color (technically
2215     * probably should also quantize the data). Really sucks doing this
2216     * here but can't avoid at least for now since this is part of
2217     * sampler state and texture format is part of sampler_view state.
2218     * GL expects also expects clamping for uint/sint formats too so
2219     * do that as well (d3d10 can't end up here with uint/sint since it
2220     * only supports them with ld).
2221     */
2222    vec4_type.length = 4;
2223    lp_build_context_init(&vec4_bld, gallivm, vec4_type);
2224
2225    /*
2226     * Vectorized clamping of border color. Loading is a bit of a hack since
2227     * we just cast the pointer to float array to pointer to vec4
2228     * (int or float).
2229     */
2230    border_color_ptr = lp_build_array_get_ptr(gallivm, border_color_ptr,
2231                                              lp_build_const_int32(gallivm, 0));
2232    border_color_ptr = LLVMBuildBitCast(builder, border_color_ptr,
2233                                        LLVMPointerType(vec4_bld.vec_type, 0), "");
2234    border_color = LLVMBuildLoad(builder, border_color_ptr, "");
2235    /* we don't have aligned type in the dynamic state unfortunately */
2236    LLVMSetAlignment(border_color, 4);
2237
2238    /*
2239     * Instead of having some incredibly complex logic which will try to figure out
2240     * clamping necessary for each channel, simply use the first channel, and treat
2241     * mixed signed/unsigned normalized formats specially.
2242     * (Mixed non-normalized, which wouldn't work at all here, do not exist for a
2243     * good reason.)
2244     */
2245    if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN) {
2246       int chan;
2247       /* d/s needs special handling because both present means just sampling depth */
2248       if (util_format_is_depth_and_stencil(format_desc->format)) {
2249          chan = format_desc->swizzle[0];
2250       }
2251       else {
2252          chan = util_format_get_first_non_void_channel(format_desc->format);
2253       }
2254       if (chan >= 0 && chan <= PIPE_SWIZZLE_W) {
2255          unsigned chan_type = format_desc->channel[chan].type;
2256          unsigned chan_norm = format_desc->channel[chan].normalized;
2257          unsigned chan_pure = format_desc->channel[chan].pure_integer;
2258          if (chan_type == UTIL_FORMAT_TYPE_SIGNED) {
2259             if (chan_norm) {
2260                min_clamp = lp_build_const_vec(gallivm, vec4_type, -1.0F);
2261                max_clamp = vec4_bld.one;
2262             }
2263             else if (chan_pure) {
2264                /*
2265                 * Border color was stored as int, hence need min/max clamp
2266                 * only if chan has less than 32 bits..
2267                 */
2268                unsigned chan_size = format_desc->channel[chan].size;
2269                if (chan_size < 32) {
2270                   min_clamp = lp_build_const_int_vec(gallivm, vec4_type,
2271                                                      0 - (1 << (chan_size - 1)));
2272                   max_clamp = lp_build_const_int_vec(gallivm, vec4_type,
2273                                                      (1 << (chan_size - 1)) - 1);
2274                }
2275             }
2276             /* TODO: no idea about non-pure, non-normalized! */
2277          }
2278          else if (chan_type == UTIL_FORMAT_TYPE_UNSIGNED) {
2279             if (chan_norm) {
2280                min_clamp = vec4_bld.zero;
2281                max_clamp = vec4_bld.one;
2282             }
2283             /*
2284              * Need a ugly hack here, because we don't have Z32_FLOAT_X8X24
2285              * we use Z32_FLOAT_S8X24 to imply sampling depth component
2286              * and ignoring stencil, which will blow up here if we try to
2287              * do a uint clamp in a float texel build...
2288              * And even if we had that format, mesa st also thinks using z24s8
2289              * means depth sampling ignoring stencil.
2290              */
2291             else if (chan_pure) {
2292                /*
2293                 * Border color was stored as uint, hence never need min
2294                 * clamp, and only need max clamp if chan has less than 32 bits.
2295                 */
2296                unsigned chan_size = format_desc->channel[chan].size;
2297                if (chan_size < 32) {
2298                   max_clamp = lp_build_const_int_vec(gallivm, vec4_type,
2299                                                      (1 << chan_size) - 1);
2300                }
2301                /* TODO: no idea about non-pure, non-normalized! */
2302             }
2303          }
2304          else if (chan_type == UTIL_FORMAT_TYPE_FIXED) {
2305             /* TODO: I have no idea what clamp this would need if any! */
2306          }
2307       }
2308       /* mixed plain formats (or different pure size) */
2309       switch (format_desc->format) {
2310       case PIPE_FORMAT_B10G10R10A2_UINT:
2311       case PIPE_FORMAT_R10G10B10A2_UINT:
2312       {
2313          unsigned max10 = (1 << 10) - 1;
2314          max_clamp = lp_build_const_aos(gallivm, vec4_type, max10, max10,
2315                                         max10, (1 << 2) - 1, NULL);
2316       }
2317          break;
2318       case PIPE_FORMAT_R10SG10SB10SA2U_NORM:
2319          min_clamp = lp_build_const_aos(gallivm, vec4_type, -1.0F, -1.0F,
2320                                         -1.0F, 0.0F, NULL);
2321          max_clamp = vec4_bld.one;
2322          break;
2323       case PIPE_FORMAT_R8SG8SB8UX8U_NORM:
2324       case PIPE_FORMAT_R5SG5SB6U_NORM:
2325          min_clamp = lp_build_const_aos(gallivm, vec4_type, -1.0F, -1.0F,
2326                                         0.0F, 0.0F, NULL);
2327          max_clamp = vec4_bld.one;
2328          break;
2329       default:
2330          break;
2331       }
2332    }
2333    else {
2334       /* cannot figure this out from format description */
2335       if (format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
2336          /* s3tc formats are always unorm */
2337          min_clamp = vec4_bld.zero;
2338          max_clamp = vec4_bld.one;
2339       }
2340       else if (format_desc->layout == UTIL_FORMAT_LAYOUT_RGTC ||
2341                format_desc->layout == UTIL_FORMAT_LAYOUT_ETC) {
2342          switch (format_desc->format) {
2343          case PIPE_FORMAT_RGTC1_UNORM:
2344          case PIPE_FORMAT_RGTC2_UNORM:
2345          case PIPE_FORMAT_LATC1_UNORM:
2346          case PIPE_FORMAT_LATC2_UNORM:
2347          case PIPE_FORMAT_ETC1_RGB8:
2348             min_clamp = vec4_bld.zero;
2349             max_clamp = vec4_bld.one;
2350             break;
2351          case PIPE_FORMAT_RGTC1_SNORM:
2352          case PIPE_FORMAT_RGTC2_SNORM:
2353          case PIPE_FORMAT_LATC1_SNORM:
2354          case PIPE_FORMAT_LATC2_SNORM:
2355             min_clamp = lp_build_const_vec(gallivm, vec4_type, -1.0F);
2356             max_clamp = vec4_bld.one;
2357             break;
2358          default:
2359             assert(0);
2360             break;
2361          }
2362       }
2363       /*
2364        * all others from subsampled/other group, though we don't care
2365        * about yuv (and should not have any from zs here)
2366        */
2367       else if (format_desc->colorspace != UTIL_FORMAT_COLORSPACE_YUV){
2368          switch (format_desc->format) {
2369          case PIPE_FORMAT_R8G8_B8G8_UNORM:
2370          case PIPE_FORMAT_G8R8_G8B8_UNORM:
2371          case PIPE_FORMAT_G8R8_B8R8_UNORM:
2372          case PIPE_FORMAT_R8G8_R8B8_UNORM:
2373          case PIPE_FORMAT_R1_UNORM: /* doesn't make sense but ah well */
2374             min_clamp = vec4_bld.zero;
2375             max_clamp = vec4_bld.one;
2376             break;
2377          case PIPE_FORMAT_R8G8Bx_SNORM:
2378             min_clamp = lp_build_const_vec(gallivm, vec4_type, -1.0F);
2379             max_clamp = vec4_bld.one;
2380             break;
2381             /*
2382              * Note smallfloat formats usually don't need clamping
2383              * (they still have infinite range) however this is not
2384              * true for r11g11b10 and r9g9b9e5, which can't represent
2385              * negative numbers (and additionally r9g9b9e5 can't represent
2386              * very large numbers). d3d10 seems happy without clamping in
2387              * this case, but gl spec is pretty clear: "for floating
2388              * point and integer formats, border values are clamped to
2389              * the representable range of the format" so do that here.
2390              */
2391          case PIPE_FORMAT_R11G11B10_FLOAT:
2392             min_clamp = vec4_bld.zero;
2393             break;
2394          case PIPE_FORMAT_R9G9B9E5_FLOAT:
2395             min_clamp = vec4_bld.zero;
2396             max_clamp = lp_build_const_vec(gallivm, vec4_type, MAX_RGB9E5);
2397             break;
2398          default:
2399             assert(0);
2400             break;
2401          }
2402       }
2403    }
2404
2405    if (min_clamp) {
2406       border_color = lp_build_max(&vec4_bld, border_color, min_clamp);
2407    }
2408    if (max_clamp) {
2409       border_color = lp_build_min(&vec4_bld, border_color, max_clamp);
2410    }
2411
2412    bld->border_color_clamped = border_color;
2413 }
2414
2415
2416 /**
2417  * General texture sampling codegen.
2418  * This function handles texture sampling for all texture targets (1D,
2419  * 2D, 3D, cube) and all filtering modes.
2420  */
2421 static void
2422 lp_build_sample_general(struct lp_build_sample_context *bld,
2423                         unsigned sampler_unit,
2424                         boolean is_gather,
2425                         LLVMValueRef *coords,
2426                         const LLVMValueRef *offsets,
2427                         LLVMValueRef lod_positive,
2428                         LLVMValueRef lod_fpart,
2429                         LLVMValueRef ilevel0,
2430                         LLVMValueRef ilevel1,
2431                         LLVMValueRef *colors_out)
2432 {
2433    LLVMBuilderRef builder = bld->gallivm->builder;
2434    const struct lp_static_sampler_state *sampler_state = bld->static_sampler_state;
2435    const unsigned mip_filter = sampler_state->min_mip_filter;
2436    const unsigned min_filter = sampler_state->min_img_filter;
2437    const unsigned mag_filter = sampler_state->mag_img_filter;
2438    LLVMValueRef texels[4];
2439    unsigned chan;
2440
2441    /* if we need border color, (potentially) clamp it now */
2442    if (lp_sampler_wrap_mode_uses_border_color(sampler_state->wrap_s,
2443                                               min_filter,
2444                                               mag_filter) ||
2445        (bld->dims > 1 &&
2446            lp_sampler_wrap_mode_uses_border_color(sampler_state->wrap_t,
2447                                                   min_filter,
2448                                                   mag_filter)) ||
2449        (bld->dims > 2 &&
2450            lp_sampler_wrap_mode_uses_border_color(sampler_state->wrap_r,
2451                                                   min_filter,
2452                                                   mag_filter))) {
2453       lp_build_clamp_border_color(bld, sampler_unit);
2454    }
2455
2456
2457    /*
2458     * Get/interpolate texture colors.
2459     */
2460
2461    for (chan = 0; chan < 4; ++chan) {
2462      texels[chan] = lp_build_alloca(bld->gallivm, bld->texel_bld.vec_type, "");
2463      lp_build_name(texels[chan], "sampler%u_texel_%c_var", sampler_unit, "xyzw"[chan]);
2464    }
2465
2466    if (min_filter == mag_filter) {
2467       /* no need to distinguish between minification and magnification */
2468       lp_build_sample_mipmap(bld, min_filter, mip_filter,
2469                              is_gather,
2470                              coords, offsets,
2471                              ilevel0, ilevel1, lod_fpart,
2472                              texels);
2473    }
2474    else {
2475       /*
2476        * Could also get rid of the if-logic and always use mipmap_both, both
2477        * for the single lod and multi-lod case if nothing really uses this.
2478        */
2479       if (bld->num_lods == 1) {
2480          /* Emit conditional to choose min image filter or mag image filter
2481           * depending on the lod being > 0 or <= 0, respectively.
2482           */
2483          struct lp_build_if_state if_ctx;
2484
2485          lod_positive = LLVMBuildTrunc(builder, lod_positive,
2486                                        LLVMInt1TypeInContext(bld->gallivm->context), "");
2487
2488          lp_build_if(&if_ctx, bld->gallivm, lod_positive);
2489          {
2490             /* Use the minification filter */
2491             lp_build_sample_mipmap(bld, min_filter, mip_filter, FALSE,
2492                                    coords, offsets,
2493                                    ilevel0, ilevel1, lod_fpart,
2494                                    texels);
2495          }
2496          lp_build_else(&if_ctx);
2497          {
2498             /* Use the magnification filter */
2499             lp_build_sample_mipmap(bld, mag_filter, PIPE_TEX_MIPFILTER_NONE,
2500                                    FALSE,
2501                                    coords, offsets,
2502                                    ilevel0, NULL, NULL,
2503                                    texels);
2504          }
2505          lp_build_endif(&if_ctx);
2506       }
2507       else {
2508          LLVMValueRef need_linear, linear_mask;
2509          unsigned mip_filter_for_nearest;
2510          struct lp_build_if_state if_ctx;
2511
2512          if (min_filter == PIPE_TEX_FILTER_LINEAR) {
2513             linear_mask = lod_positive;
2514             mip_filter_for_nearest = PIPE_TEX_MIPFILTER_NONE;
2515          }
2516          else {
2517             linear_mask = lp_build_not(&bld->lodi_bld, lod_positive);
2518             mip_filter_for_nearest = mip_filter;
2519          }
2520          need_linear = lp_build_any_true_range(&bld->lodi_bld, bld->num_lods,
2521                                                linear_mask);
2522
2523          if (bld->num_lods != bld->coord_type.length) {
2524             linear_mask = lp_build_unpack_broadcast_aos_scalars(bld->gallivm,
2525                                                                 bld->lodi_type,
2526                                                                 bld->int_coord_type,
2527                                                                 linear_mask);
2528          }
2529
2530          lp_build_if(&if_ctx, bld->gallivm, need_linear);
2531          {
2532             /*
2533              * Do sampling with both filters simultaneously. This means using
2534              * a linear filter and doing some tricks (with weights) for the pixels
2535              * which need nearest filter.
2536              * Note that it's probably rare some pixels need nearest and some
2537              * linear filter but the fixups required for the nearest pixels
2538              * aren't all that complicated so just always run a combined path
2539              * if at least some pixels require linear.
2540              */
2541             lp_build_sample_mipmap_both(bld, linear_mask, mip_filter,
2542                                         coords, offsets,
2543                                         ilevel0, ilevel1,
2544                                         lod_fpart, lod_positive,
2545                                         texels);
2546          }
2547          lp_build_else(&if_ctx);
2548          {
2549             /*
2550              * All pixels require just nearest filtering, which is way
2551              * cheaper than linear, hence do a separate path for that.
2552              */
2553             lp_build_sample_mipmap(bld, PIPE_TEX_FILTER_NEAREST,
2554                                    mip_filter_for_nearest, FALSE,
2555                                    coords, offsets,
2556                                    ilevel0, ilevel1, lod_fpart,
2557                                    texels);
2558          }
2559          lp_build_endif(&if_ctx);
2560       }
2561    }
2562
2563    for (chan = 0; chan < 4; ++chan) {
2564      colors_out[chan] = LLVMBuildLoad(builder, texels[chan], "");
2565      lp_build_name(colors_out[chan], "sampler%u_texel_%c", sampler_unit, "xyzw"[chan]);
2566    }
2567 }
2568
2569
2570 /**
2571  * Texel fetch function.
2572  * In contrast to general sampling there is no filtering, no coord minification,
2573  * lod (if any) is always explicit uint, coords are uints (in terms of texel units)
2574  * directly to be applied to the selected mip level (after adding texel offsets).
2575  * This function handles texel fetch for all targets where texel fetch is supported
2576  * (no cube maps, but 1d, 2d, 3d are supported, arrays and buffers should be too).
2577  */
2578 static void
2579 lp_build_fetch_texel(struct lp_build_sample_context *bld,
2580                      unsigned texture_unit,
2581                      const LLVMValueRef *coords,
2582                      LLVMValueRef explicit_lod,
2583                      const LLVMValueRef *offsets,
2584                      LLVMValueRef *colors_out)
2585 {
2586    struct lp_build_context *perquadi_bld = &bld->lodi_bld;
2587    struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
2588    unsigned dims = bld->dims, chan;
2589    unsigned target = bld->static_texture_state->target;
2590    boolean out_of_bound_ret_zero = TRUE;
2591    LLVMValueRef size, ilevel;
2592    LLVMValueRef row_stride_vec = NULL, img_stride_vec = NULL;
2593    LLVMValueRef x = coords[0], y = coords[1], z = coords[2];
2594    LLVMValueRef width, height, depth, i, j;
2595    LLVMValueRef offset, out_of_bounds, out1;
2596
2597    out_of_bounds = int_coord_bld->zero;
2598
2599    if (explicit_lod && bld->static_texture_state->target != PIPE_BUFFER) {
2600       if (bld->num_mips != int_coord_bld->type.length) {
2601          ilevel = lp_build_pack_aos_scalars(bld->gallivm, int_coord_bld->type,
2602                                             perquadi_bld->type, explicit_lod, 0);
2603       }
2604       else {
2605          ilevel = explicit_lod;
2606       }
2607       lp_build_nearest_mip_level(bld, texture_unit, ilevel, &ilevel,
2608                                  out_of_bound_ret_zero ? &out_of_bounds : NULL);
2609    }
2610    else {
2611       assert(bld->num_mips == 1);
2612       if (bld->static_texture_state->target != PIPE_BUFFER) {
2613          ilevel = bld->dynamic_state->first_level(bld->dynamic_state, bld->gallivm,
2614                                                   bld->context_ptr, texture_unit);
2615       }
2616       else {
2617          ilevel = lp_build_const_int32(bld->gallivm, 0);
2618       }
2619    }
2620    lp_build_mipmap_level_sizes(bld, ilevel,
2621                                &size,
2622                                &row_stride_vec, &img_stride_vec);
2623    lp_build_extract_image_sizes(bld, &bld->int_size_bld, int_coord_bld->type,
2624                                 size, &width, &height, &depth);
2625
2626    if (target == PIPE_TEXTURE_1D_ARRAY ||
2627        target == PIPE_TEXTURE_2D_ARRAY) {
2628       if (out_of_bound_ret_zero) {
2629          z = lp_build_layer_coord(bld, texture_unit, FALSE, z, &out1);
2630          out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
2631       }
2632       else {
2633          z = lp_build_layer_coord(bld, texture_unit, FALSE, z, NULL);
2634       }
2635    }
2636
2637    /* This is a lot like border sampling */
2638    if (offsets[0]) {
2639       /*
2640        * coords are really unsigned, offsets are signed, but I don't think
2641        * exceeding 31 bits is possible
2642        */
2643       x = lp_build_add(int_coord_bld, x, offsets[0]);
2644    }
2645    out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, x, int_coord_bld->zero);
2646    out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
2647    out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, x, width);
2648    out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
2649
2650    if (dims >= 2) {
2651       if (offsets[1]) {
2652          y = lp_build_add(int_coord_bld, y, offsets[1]);
2653       }
2654       out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, y, int_coord_bld->zero);
2655       out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
2656       out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, y, height);
2657       out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
2658
2659       if (dims >= 3) {
2660          if (offsets[2]) {
2661             z = lp_build_add(int_coord_bld, z, offsets[2]);
2662          }
2663          out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, z, int_coord_bld->zero);
2664          out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
2665          out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, z, depth);
2666          out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
2667       }
2668    }
2669
2670    lp_build_sample_offset(int_coord_bld,
2671                           bld->format_desc,
2672                           x, y, z, row_stride_vec, img_stride_vec,
2673                           &offset, &i, &j);
2674
2675    if (bld->static_texture_state->target != PIPE_BUFFER) {
2676       offset = lp_build_add(int_coord_bld, offset,
2677                             lp_build_get_mip_offsets(bld, ilevel));
2678    }
2679
2680    offset = lp_build_andnot(int_coord_bld, offset, out_of_bounds);
2681
2682    lp_build_fetch_rgba_soa(bld->gallivm,
2683                            bld->format_desc,
2684                            bld->texel_type, TRUE,
2685                            bld->base_ptr, offset,
2686                            i, j,
2687                            bld->cache,
2688                            colors_out);
2689
2690    if (out_of_bound_ret_zero) {
2691       /*
2692        * Only needed for ARB_robust_buffer_access_behavior and d3d10.
2693        * Could use min/max above instead of out-of-bounds comparisons
2694        * if we don't care about the result returned for out-of-bounds.
2695        */
2696       for (chan = 0; chan < 4; chan++) {
2697          colors_out[chan] = lp_build_select(&bld->texel_bld, out_of_bounds,
2698                                             bld->texel_bld.zero, colors_out[chan]);
2699       }
2700    }
2701 }
2702
2703
2704 /**
2705  * Debugging aid: bypass texture sampling and hand back "white", i.e.
2706  * the value lp_build_one() produces for 'type', in all four channels.
2707  */
2708 void
2709 lp_build_sample_nop(struct gallivm_state *gallivm,
2710                     struct lp_type type,
2711                     const LLVMValueRef *coords,
2712                     LLVMValueRef texel_out[4])
2713 {
2714    const LLVMValueRef white = lp_build_one(gallivm, type);
2715
2716    texel_out[0] = white;
2717    texel_out[1] = white;
2718    texel_out[2] = white;
2719    texel_out[3] = white;
2720 }
2721
2722
2723 /**
2724  * Build the actual texture sampling code.
2725  * 'texel_out' will return a vector of four LLVMValueRefs corresponding to
2726  * R, G, B, A.
      *
      * The routine decodes sample_key, sets up a lp_build_sample_context,
      * decides how many lod / mip values are needed, and then dispatches to
      * the texel fetch path, the AoS (fixed point) path or the general SoA
      * path.  With more than one quad the AoS path may be run once per
      * 4-wide piece and the pieces concatenated afterwards.
      *
      * Early outs:
      *  - format PIPE_FORMAT_NONE: all four channels are zero.
      *  - pure integer format (without depth) combined with a linear filter
      *    on a plain texture op: all four channels are zero.
      *  - lod query: texel_out[0] = lod_fpart, texel_out[1] = lod, the other
      *    two channels are zero; sampler swizzle and the final bitcast are
      *    skipped.
      *
2727  * \param type  vector float type to use for coords, etc.
2728  * \param sample_key  packed lod property, lod control and op type
      *                    (see the LP_SAMPLER_*_MASK / _SHIFT pairs below)
      * \param static_sampler_state  not modified; a local copy is adjusted
      *                    (mip filter, gather filters, cube wrap modes)
      * \param dynamic_state  callbacks used to fetch width/height/depth,
      *                    strides, base pointer, mip offsets and (optionally)
      *                    the cache pointer
      * \param thread_data_ptr  only used for the cache lookup, may be NULL
      * \param coords  five entries are read (coords[0] .. coords[4])
      * \param offsets  texel offsets; individual entries may be NULL
2729  * \param derivs  partial derivatives of (s,t,r,q) with respect to x and y
      *                (only with LP_SAMPLER_LOD_DERIVATIVES)
      * \param lod  lod bias or explicit lod, depending on the lod control
      *             encoded in sample_key; also handed to the fetch path
      * \param texel_out  receives the four result channels
2730  */
2731 static void
2732 lp_build_sample_soa_code(struct gallivm_state *gallivm,
2733                          const struct lp_static_texture_state *static_texture_state,
2734                          const struct lp_static_sampler_state *static_sampler_state,
2735                          struct lp_sampler_dynamic_state *dynamic_state,
2736                          struct lp_type type,
2737                          unsigned sample_key,
2738                          unsigned texture_index,
2739                          unsigned sampler_index,
2740                          LLVMValueRef context_ptr,
2741                          LLVMValueRef thread_data_ptr,
2742                          const LLVMValueRef *coords,
2743                          const LLVMValueRef *offsets,
2744                          const struct lp_derivatives *derivs, /* optional */
2745                          LLVMValueRef lod, /* optional */
2746                          LLVMValueRef texel_out[4])
2747 {
2748    unsigned target = static_texture_state->target;
2749    unsigned dims = texture_dims(target);
2750    unsigned num_quads = type.length / 4;
2751    unsigned mip_filter, min_img_filter, mag_img_filter, i;
2752    struct lp_build_sample_context bld;
        /* local, modifiable copy of the sampler state; bld points at this copy */
2753    struct lp_static_sampler_state derived_sampler_state = *static_sampler_state;
2754    LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
2755    LLVMBuilderRef builder = gallivm->builder;
2756    LLVMValueRef tex_width, newcoords[5];
2757    enum lp_sampler_lod_property lod_property;
2758    enum lp_sampler_lod_control lod_control;
2759    enum lp_sampler_op_type op_type;
2760    LLVMValueRef lod_bias = NULL;
2761    LLVMValueRef explicit_lod = NULL;
2762    boolean op_is_tex, op_is_lodq, op_is_gather;
2763
2764    if (0) {
2765       enum pipe_format fmt = static_texture_state->format;
2766       debug_printf("Sample from %s\n", util_format_name(fmt));
2767    }
2768
        /* Unpack the individual fields encoded in sample_key. */
2769    lod_property = (sample_key & LP_SAMPLER_LOD_PROPERTY_MASK) >>
2770                      LP_SAMPLER_LOD_PROPERTY_SHIFT;
2771    lod_control = (sample_key & LP_SAMPLER_LOD_CONTROL_MASK) >>
2772                     LP_SAMPLER_LOD_CONTROL_SHIFT;
2773    op_type = (sample_key & LP_SAMPLER_OP_TYPE_MASK) >>
2774                  LP_SAMPLER_OP_TYPE_SHIFT;
2775
2776    op_is_tex = op_type == LP_SAMPLER_OP_TEXTURE;
2777    op_is_lodq = op_type == LP_SAMPLER_OP_LODQ;
2778    op_is_gather = op_type == LP_SAMPLER_OP_GATHER;
2779
        /*
         * 'lod' is interpreted according to the lod control: it is either a
         * bias or an explicit lod.  lod and derivs are mutually exclusive,
         * and neither may be passed with any other lod control.
         */
2780    if (lod_control == LP_SAMPLER_LOD_BIAS) {
2781       lod_bias = lod;
2782       assert(lod);
2783       assert(derivs == NULL);
2784    }
2785    else if (lod_control == LP_SAMPLER_LOD_EXPLICIT) {
2786       explicit_lod = lod;
2787       assert(lod);
2788       assert(derivs == NULL);
2789    }
2790    else if (lod_control == LP_SAMPLER_LOD_DERIVATIVES) {
2791       assert(derivs);
2792       assert(lod == NULL);
2793    }
2794    else {
2795       assert(derivs == NULL);
2796       assert(lod == NULL);
2797    }
2798
2799    if (static_texture_state->format == PIPE_FORMAT_NONE) {
2800       /*
2801        * If there's nothing bound, format is NONE, and we must return
2802        * all zero as mandated by d3d10 in this case.
2803        */
2804       unsigned chan;
2805       LLVMValueRef zero = lp_build_zero(gallivm, type);
2806       for (chan = 0; chan < 4; chan++) {
2807          texel_out[chan] = zero;
2808       }
2809       return;
2810    }
2811
2812    assert(type.floating);
2813
2814    /* Setup our build context */
2815    memset(&bld, 0, sizeof bld);
2816    bld.gallivm = gallivm;
2817    bld.context_ptr = context_ptr;
2818    bld.static_sampler_state = &derived_sampler_state;
2819    bld.static_texture_state = static_texture_state;
2820    bld.dynamic_state = dynamic_state;
2821    bld.format_desc = util_format_description(static_texture_state->format);
2822    bld.dims = dims;
2823
        /*
         * Each no_* flag is enabled by its gallivm_debug bit; a lod query
         * enables all three.
         */
2824    if (gallivm_debug & GALLIVM_DEBUG_NO_QUAD_LOD || op_is_lodq) {
2825       bld.no_quad_lod = TRUE;
2826    }
2827    if (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX || op_is_lodq) {
2828       bld.no_rho_approx = TRUE;
2829    }
2830    if (gallivm_debug & GALLIVM_DEBUG_NO_BRILINEAR || op_is_lodq) {
2831       bld.no_brilinear = TRUE;
2832    }
2833
2834    bld.vector_width = lp_type_width(type);
2835
2836    bld.float_type = lp_type_float(32);
2837    bld.int_type = lp_type_int(32);
2838    bld.coord_type = type;
2839    bld.int_coord_type = lp_int_type(type);
        /* texture size input: 4 elements with more than one dim, else 1 */
2840    bld.float_size_in_type = lp_type_float(32);
2841    bld.float_size_in_type.length = dims > 1 ? 4 : 1;
2842    bld.int_size_in_type = lp_int_type(bld.float_size_in_type);
2843    bld.texel_type = type;
2844
2845    /* always using the first channel hopefully should be safe,
2846     * if not things WILL break in other places anyway.
2847     */
2848    if (bld.format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB &&
2849        bld.format_desc->channel[0].pure_integer) {
2850       if (bld.format_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) {
2851          bld.texel_type = lp_type_int_vec(type.width, type.width * type.length);
2852       }
2853       else if (bld.format_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED) {
2854          bld.texel_type = lp_type_uint_vec(type.width, type.width * type.length);
2855       }
2856    }
2857    else if (util_format_has_stencil(bld.format_desc) &&
2858        !util_format_has_depth(bld.format_desc)) {
2859       /* for stencil only formats, sample stencil (uint) */
2860       bld.texel_type = lp_type_int_vec(type.width, type.width * type.length);
2861    }
2862
        /*
         * The mip filter is forced to NONE only if level_zero_only and
         * max_lod_pos are both set and this is not a lod query; otherwise
         * the filter from the sampler state is kept.
         */
2863    if (!static_texture_state->level_zero_only ||
2864        !static_sampler_state->max_lod_pos || op_is_lodq) {
2865       derived_sampler_state.min_mip_filter = static_sampler_state->min_mip_filter;
2866    } else {
2867       derived_sampler_state.min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
2868    }
2869    if (op_is_gather) {
2870       /*
2871        * gather4 is exactly like GL_LINEAR filtering but in the end skipping
2872        * the actual filtering. Using mostly the same paths, so cube face
2873        * selection, coord wrapping etc. all naturally uses the same code.
2874        */
2875       derived_sampler_state.min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
2876       derived_sampler_state.min_img_filter = PIPE_TEX_FILTER_LINEAR;
2877       derived_sampler_state.mag_img_filter = PIPE_TEX_FILTER_LINEAR;
2878    }
2879    mip_filter = derived_sampler_state.min_mip_filter;
2880
2881    if (0) {
2882       debug_printf("  .min_mip_filter = %u\n", derived_sampler_state.min_mip_filter);
2883    }
2884
2885    if (static_texture_state->target == PIPE_TEXTURE_CUBE ||
2886        static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY)
2887    {
2888       /*
2889        * Seamless filtering ignores wrap modes.
2890        * Setting to CLAMP_TO_EDGE is correct for nearest filtering, for
2891        * bilinear it's not correct but way better than using for instance repeat.
2892        * Note we even set this for non-seamless. Technically GL allows any wrap
2893        * mode, which made sense when supporting true borders (can get seamless
2894        * effect with border and CLAMP_TO_BORDER), but gallium doesn't support
2895        * borders and d3d9 requires wrap modes to be ignored and it's a pain to fix
2896        * up the sampler state (as it makes it texture dependent).
2897        */
2898       derived_sampler_state.wrap_s = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
2899       derived_sampler_state.wrap_t = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
2900    }
2901    /*
2902     * We could force CLAMP to CLAMP_TO_EDGE here if min/mag filter is nearest,
2903     * so AoS path could be used. Not sure it's worth the trouble...
2904     */
2905
        /* read back after the gather override above */
2906    min_img_filter = derived_sampler_state.min_img_filter;
2907    mag_img_filter = derived_sampler_state.mag_img_filter;
2908
2909
2910    /*
2911     * This is all a bit complicated different paths are chosen for performance
2912     * reasons.
2913     * Essentially, there can be 1 lod per element, 1 lod per quad or 1 lod for
2914     * everything (the last two options are equivalent for 4-wide case).
2915     * If there's per-quad lod but we split to 4-wide so we can use AoS, per-quad
2916     * lod is calculated then the lod value extracted afterwards so making this
2917     * case basically the same as far as lod handling is concerned for the
2918     * further sample/filter code as the 1 lod for everything case.
2919     * Different lod handling mostly shows up when building mipmap sizes
2920     * (lp_build_mipmap_level_sizes() and friends) and also in filtering
2921     * (getting the fractional part of the lod to the right texels).
2922     */
2923
2924    /*
2925     * There are other situations where at least the multiple int lods could be
2926     * avoided like min and max lod being equal.
2927     */
2928    bld.num_mips = bld.num_lods = 1;
2929
2930    if (bld.no_quad_lod && bld.no_rho_approx &&
2931        ((mip_filter != PIPE_TEX_MIPFILTER_NONE && op_is_tex &&
2932          (static_texture_state->target == PIPE_TEXTURE_CUBE ||
2933           static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY)) ||
2934         op_is_lodq)) {
2935       /*
2936        * special case for using per-pixel lod even for implicit lod,
2937        * which is generally never required (ok by APIs) except to please
2938        * some (somewhat broken imho) tests (because per-pixel face selection
2939        * can cause derivatives to be different for pixels outside the primitive
2940        * due to the major axis division even if pre-project derivatives are
2941        * looking normal).
2942        * For lodq, we do it to simply avoid scalar pack / unpack (albeit for
2943        * cube maps we do indeed get per-pixel lod values).
2944        */
2945       bld.num_mips = type.length;
2946       bld.num_lods = type.length;
2947    }
2948    else if (lod_property == LP_SAMPLER_LOD_PER_ELEMENT ||
2949        (explicit_lod || lod_bias || derivs)) {
           /*
            * one lod (and mip level) per element; or, without mip filtering
            * but with differing min/mag filters, one lod per element and a
            * single mip level
            */
2950       if ((!op_is_tex && target != PIPE_BUFFER) ||
2951           (op_is_tex && mip_filter != PIPE_TEX_MIPFILTER_NONE)) {
2952          bld.num_mips = type.length;
2953          bld.num_lods = type.length;
2954       }
2955       else if (op_is_tex && min_img_filter != mag_img_filter) {
2956          bld.num_mips = 1;
2957          bld.num_lods = type.length;
2958       }
2959    }
2960    /* TODO: for true scalar_lod should only use 1 lod value */
2961    else if ((!op_is_tex && explicit_lod && target != PIPE_BUFFER) ||
2962             (op_is_tex && mip_filter != PIPE_TEX_MIPFILTER_NONE)) {
2963       bld.num_mips = num_quads;
2964       bld.num_lods = num_quads;
2965    }
2966    else if (op_is_tex && min_img_filter != mag_img_filter) {
2967       bld.num_mips = 1;
2968       bld.num_lods = num_quads;
2969    }
2970
2971
2972    bld.lodf_type = type;
2973    /* we want native vector size to be able to use our intrinsics */
2974    if (bld.num_lods != type.length) {
2975       /* TODO: this currently always has to be per-quad or per-element */
2976       bld.lodf_type.length = type.length > 4 ? ((type.length + 15) / 16) * 4 : 1;
2977    }
2978    bld.lodi_type = lp_int_type(bld.lodf_type);
2979    bld.levelf_type = bld.lodf_type;
2980    if (bld.num_mips == 1) {
2981       bld.levelf_type.length = 1;
2982    }
2983    bld.leveli_type = lp_int_type(bld.levelf_type);
2984    bld.float_size_type = bld.float_size_in_type;
2985    /* Note: size vectors may not be native. They contain minified w/h/d/_ values,
2986     * with per-element lod that is w0/h0/d0/_/w1/h1/d1_/... so up to 8x4f32 */
2987    if (bld.num_mips > 1) {
2988       bld.float_size_type.length = bld.num_mips == type.length ?
2989                                       bld.num_mips * bld.float_size_in_type.length :
2990                                       type.length;
2991    }
2992    bld.int_size_type = lp_int_type(bld.float_size_type);
2993
        /* One lp_build_context per vector type chosen above. */
2994    lp_build_context_init(&bld.float_bld, gallivm, bld.float_type);
2995    lp_build_context_init(&bld.float_vec_bld, gallivm, type);
2996    lp_build_context_init(&bld.int_bld, gallivm, bld.int_type);
2997    lp_build_context_init(&bld.coord_bld, gallivm, bld.coord_type);
2998    lp_build_context_init(&bld.int_coord_bld, gallivm, bld.int_coord_type);
2999    lp_build_context_init(&bld.int_size_in_bld, gallivm, bld.int_size_in_type);
3000    lp_build_context_init(&bld.float_size_in_bld, gallivm, bld.float_size_in_type);
3001    lp_build_context_init(&bld.int_size_bld, gallivm, bld.int_size_type);
3002    lp_build_context_init(&bld.float_size_bld, gallivm, bld.float_size_type);
3003    lp_build_context_init(&bld.texel_bld, gallivm, bld.texel_type);
3004    lp_build_context_init(&bld.levelf_bld, gallivm, bld.levelf_type);
3005    lp_build_context_init(&bld.leveli_bld, gallivm, bld.leveli_type);
3006    lp_build_context_init(&bld.lodf_bld, gallivm, bld.lodf_type);
3007    lp_build_context_init(&bld.lodi_bld, gallivm, bld.lodi_type);
3008
3009    /* Get the dynamic state */
3010    tex_width = dynamic_state->width(dynamic_state, gallivm,
3011                                     context_ptr, texture_index);
3012    bld.row_stride_array = dynamic_state->row_stride(dynamic_state, gallivm,
3013                                                     context_ptr, texture_index);
3014    bld.img_stride_array = dynamic_state->img_stride(dynamic_state, gallivm,
3015                                                     context_ptr, texture_index);
3016    bld.base_ptr = dynamic_state->base_ptr(dynamic_state, gallivm,
3017                                           context_ptr, texture_index);
3018    bld.mip_offsets = dynamic_state->mip_offsets(dynamic_state, gallivm,
3019                                                 context_ptr, texture_index);
3020    /* Note that mip_offsets is an array[level] of offsets to texture images */
3021
        /*
         * The cache is optional: it needs both the callback and the thread
         * data.  Otherwise bld.cache stays zeroed from the memset above.
         */
3022    if (dynamic_state->cache_ptr && thread_data_ptr) {
3023       bld.cache = dynamic_state->cache_ptr(dynamic_state, gallivm,
3024                                            thread_data_ptr, texture_index);
3025    }
3026
3027    /* width, height, depth as single int vector */
3028    if (dims <= 1) {
3029       bld.int_size = tex_width;
3030    }
3031    else {
           /* width goes to element 0, height to 1 and depth (3 dims) to 2 */
3032       bld.int_size = LLVMBuildInsertElement(builder, bld.int_size_in_bld.undef,
3033                                             tex_width,
3034                                             LLVMConstInt(i32t, 0, 0), "");
3035       if (dims >= 2) {
3036          LLVMValueRef tex_height =
3037             dynamic_state->height(dynamic_state, gallivm,
3038                                   context_ptr, texture_index);
3039          bld.int_size = LLVMBuildInsertElement(builder, bld.int_size,
3040                                                tex_height,
3041                                                LLVMConstInt(i32t, 1, 0), "");
3042          if (dims >= 3) {
3043             LLVMValueRef tex_depth =
3044                dynamic_state->depth(dynamic_state, gallivm, context_ptr,
3045                                     texture_index);
3046             bld.int_size = LLVMBuildInsertElement(builder, bld.int_size,
3047                                                   tex_depth,
3048                                                   LLVMConstInt(i32t, 2, 0), "");
3049          }
3050       }
3051    }
3052
        /*
         * Work on a local copy of the coords: the caller's array is const,
         * and newcoords[2] is rewritten below for AoS cube arrays.
         */
3053    for (i = 0; i < 5; i++) {
3054       newcoords[i] = coords[i];
3055    }
3056
3057    if (util_format_is_pure_integer(static_texture_state->format) &&
3058        !util_format_has_depth(bld.format_desc) && op_is_tex &&
3059        (static_sampler_state->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR ||
3060         static_sampler_state->min_img_filter == PIPE_TEX_FILTER_LINEAR ||
3061         static_sampler_state->mag_img_filter == PIPE_TEX_FILTER_LINEAR)) {
3062       /*
3063        * Bail if impossible filtering is specified (the awkward additional
3064        * depth check is because it is legal in gallium to have things like S8Z24
3065        * here which would say it's pure int despite such formats should sample
3066        * the depth component).
3067        * In GL such filters make the texture incomplete, this makes it robust
3068        * against state trackers which set this up regardless (we'd crash in the
3069        * lerp later otherwise).
3070        * At least in some apis it may be legal to use such filters with lod
3071        * queries and/or gather (at least for gather d3d10 says only the wrap
3072        * bits are really used hence filter bits are likely simply ignored).
3073        * For fetch, we don't get valid samplers either way here.
3074        */
3075       unsigned chan;
3076       LLVMValueRef zero = lp_build_zero(gallivm, type);
3077       for (chan = 0; chan < 4; chan++) {
3078          texel_out[chan] = zero;
3079       }
3080       return;
3081    }
3082
3083    if (0) {
3084       /* For debug: no-op texture sampling */
3085       lp_build_sample_nop(gallivm,
3086                           bld.texel_type,
3087                           newcoords,
3088                           texel_out);
3089    }
3090
3091    else if (op_type == LP_SAMPLER_OP_FETCH) {
           /* texel fetch: gets the raw 'lod' parameter and the offsets */
3092       lp_build_fetch_texel(&bld, texture_index, newcoords,
3093                            lod, offsets,
3094                            texel_out);
3095    }
3096
3097    else {
3098       LLVMValueRef lod_fpart = NULL, lod_positive = NULL;
           /* NOTE: this 'lod' shadows the function parameter of the same name */
3099       LLVMValueRef ilevel0 = NULL, ilevel1 = NULL, lod = NULL;
3100       boolean use_aos;
3101
           /*
            * AoS path requirements: format fits 8-bit unorm, plain texture
            * op, no compare mode and simple wrap modes in every dimension
            * in use (wrap_t / wrap_r are checked further down).
            */
3102       use_aos = util_format_fits_8unorm(bld.format_desc) &&
3103                 op_is_tex &&
3104                 /* not sure this is strictly needed or simply impossible */
3105                 derived_sampler_state.compare_mode == PIPE_TEX_COMPARE_NONE &&
3106                 lp_is_simple_wrap_mode(derived_sampler_state.wrap_s);
3107
3108       use_aos &= bld.num_lods <= num_quads ||
3109                  derived_sampler_state.min_img_filter ==
3110                     derived_sampler_state.mag_img_filter;
3111       if (dims > 1) {
3112          use_aos &= lp_is_simple_wrap_mode(derived_sampler_state.wrap_t);
3113          if (dims > 2) {
3114             use_aos &= lp_is_simple_wrap_mode(derived_sampler_state.wrap_r);
3115          }
3116       }
3117       if ((static_texture_state->target == PIPE_TEXTURE_CUBE ||
3118            static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) &&
3119           derived_sampler_state.seamless_cube_map &&
3120           (derived_sampler_state.min_img_filter == PIPE_TEX_FILTER_LINEAR ||
3121            derived_sampler_state.mag_img_filter == PIPE_TEX_FILTER_LINEAR)) {
3122          /* theoretically possible with AoS filtering but not implemented (complex!) */
3123          use_aos = 0;
3124       }
3125
           /* perf debugging: report 8-bit unorm formats that miss the AoS path */
3126       if ((gallivm_debug & GALLIVM_DEBUG_PERF) &&
3127           !use_aos && util_format_fits_8unorm(bld.format_desc)) {
3128          debug_printf("%s: using floating point linear filtering for %s\n",
3129                       __FUNCTION__, bld.format_desc->short_name);
3130          debug_printf("  min_img %d  mag_img %d  mip %d  target %d  seamless %d"
3131                       "  wraps %d  wrapt %d  wrapr %d\n",
3132                       derived_sampler_state.min_img_filter,
3133                       derived_sampler_state.mag_img_filter,
3134                       derived_sampler_state.min_mip_filter,
3135                       static_texture_state->target,
3136                       derived_sampler_state.seamless_cube_map,
3137                       derived_sampler_state.wrap_s,
3138                       derived_sampler_state.wrap_t,
3139                       derived_sampler_state.wrap_r);
3140       }
3141
           /*
            * Shared setup for all remaining ops; the lod and mip level
            * values are returned through the pointer arguments.
            */
3142       lp_build_sample_common(&bld, op_is_lodq, texture_index, sampler_index,
3143                              newcoords,
3144                              derivs, lod_bias, explicit_lod,
3145                              &lod_positive, &lod, &lod_fpart,
3146                              &ilevel0, &ilevel1);
3147
3148       if (op_is_lodq) {
              /*
               * lod query: hand back the lod values directly.  Returning
               * here skips the sampler swizzle and the bitcast at the end.
               */
3149          texel_out[0] = lod_fpart;
3150          texel_out[1] = lod;
3151          texel_out[2] = texel_out[3] = bld.coord_bld.zero;
3152          return;
3153       }
3154
3155       if (use_aos && static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) {
3156          /* The aos path doesn't do seamless filtering so simply add cube layer
3157           * to face now.
3158           */
3159          newcoords[2] = lp_build_add(&bld.int_coord_bld, newcoords[2], newcoords[3]);
3160       }
3161
3162       /*
3163        * we only try 8-wide sampling with soa or if we have AVX2
3164        * (as it appears to be a loss with just AVX)
3165        */
3166       if (num_quads == 1 || !use_aos ||
3167           (util_cpu_caps.has_avx2 &&
3168            (bld.num_lods == 1 ||
3169             derived_sampler_state.min_img_filter == derived_sampler_state.mag_img_filter))) {
3170          if (use_aos) {
3171             /* do sampling/filtering with fixed pt arithmetic */
3172             lp_build_sample_aos(&bld, sampler_index,
3173                                 newcoords[0], newcoords[1],
3174                                 newcoords[2],
3175                                 offsets, lod_positive, lod_fpart,
3176                                 ilevel0, ilevel1,
3177                                 texel_out);
3178          }
3179
3180          else {
3181             lp_build_sample_general(&bld, sampler_index,
3182                                     op_type == LP_SAMPLER_OP_GATHER,
3183                                     newcoords, offsets,
3184                                     lod_positive, lod_fpart,
3185                                     ilevel0, ilevel1,
3186                                     texel_out);
3187          }
3188       }
3189       else {
              /*
               * Split path: set up a second, 4-wide build context (bld4),
               * sample each group of four elements separately and
               * concatenate the per-group results at the end.
               */
3190          unsigned j;
3191          struct lp_build_sample_context bld4;
3192          struct lp_type type4 = type;
              /* NOTE: this 'i' shadows the function-scope 'i' */
3193          unsigned i;
3194          LLVMValueRef texelout4[4];
3195          LLVMValueRef texelouttmp[4][LP_MAX_VECTOR_LENGTH/16];
3196
3197          type4.length = 4;
3198
3199          /* Setup our build context */
3200          memset(&bld4, 0, sizeof bld4);
3201          bld4.no_quad_lod = bld.no_quad_lod;
3202          bld4.no_rho_approx = bld.no_rho_approx;
3203          bld4.no_brilinear = bld.no_brilinear;
3204          bld4.gallivm = bld.gallivm;
3205          bld4.context_ptr = bld.context_ptr;
3206          bld4.static_texture_state = bld.static_texture_state;
3207          bld4.static_sampler_state = bld.static_sampler_state;
3208          bld4.dynamic_state = bld.dynamic_state;
3209          bld4.format_desc = bld.format_desc;
3210          bld4.dims = bld.dims;
3211          bld4.row_stride_array = bld.row_stride_array;
3212          bld4.img_stride_array = bld.img_stride_array;
3213          bld4.base_ptr = bld.base_ptr;
3214          bld4.mip_offsets = bld.mip_offsets;
3215          bld4.int_size = bld.int_size;
3216          bld4.cache = bld.cache;
3217
3218          bld4.vector_width = lp_type_width(type4);
3219
3220          bld4.float_type = lp_type_float(32);
3221          bld4.int_type = lp_type_int(32);
3222          bld4.coord_type = type4;
3223          bld4.int_coord_type = lp_int_type(type4);
3224          bld4.float_size_in_type = lp_type_float(32);
3225          bld4.float_size_in_type.length = dims > 1 ? 4 : 1;
3226          bld4.int_size_in_type = lp_int_type(bld4.float_size_in_type);
3227          bld4.texel_type = bld.texel_type;
3228          bld4.texel_type.length = 4;
3229
              /*
               * lod / mip counts for one 4-wide piece: a single value unless
               * one of the per-element cases below applies.
               */
3230          bld4.num_mips = bld4.num_lods = 1;
3231          if (bld4.no_quad_lod && bld4.no_rho_approx &&
3232              (static_texture_state->target == PIPE_TEXTURE_CUBE ||
3233               static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) &&
3234              (op_is_tex && mip_filter != PIPE_TEX_MIPFILTER_NONE)) {
3235             bld4.num_mips = type4.length;
3236             bld4.num_lods = type4.length;
3237          }
3238          if (lod_property == LP_SAMPLER_LOD_PER_ELEMENT &&
3239              (explicit_lod || lod_bias || derivs)) {
3240             if ((!op_is_tex && target != PIPE_BUFFER) ||
3241                 (op_is_tex && mip_filter != PIPE_TEX_MIPFILTER_NONE)) {
3242                bld4.num_mips = type4.length;
3243                bld4.num_lods = type4.length;
3244             }
3245             else if (op_is_tex && min_img_filter != mag_img_filter) {
3246                bld4.num_mips = 1;
3247                bld4.num_lods = type4.length;
3248             }
3249          }
3250
3251          /* we want native vector size to be able to use our intrinsics */
3252          bld4.lodf_type = type4;
3253          if (bld4.num_lods != type4.length) {
3254             bld4.lodf_type.length = 1;
3255          }
3256          bld4.lodi_type = lp_int_type(bld4.lodf_type);
3257          bld4.levelf_type = type4;
3258          if (bld4.num_mips != type4.length) {
3259             bld4.levelf_type.length = 1;
3260          }
3261          bld4.leveli_type = lp_int_type(bld4.levelf_type);
3262          bld4.float_size_type = bld4.float_size_in_type;
3263          if (bld4.num_mips > 1) {
3264             bld4.float_size_type.length = bld4.num_mips == type4.length ?
3265                                             bld4.num_mips * bld4.float_size_in_type.length :
3266                                             type4.length;
3267          }
3268          bld4.int_size_type = lp_int_type(bld4.float_size_type);
3269
3270          lp_build_context_init(&bld4.float_bld, gallivm, bld4.float_type);
3271          lp_build_context_init(&bld4.float_vec_bld, gallivm, type4);
3272          lp_build_context_init(&bld4.int_bld, gallivm, bld4.int_type);
3273          lp_build_context_init(&bld4.coord_bld, gallivm, bld4.coord_type);
3274          lp_build_context_init(&bld4.int_coord_bld, gallivm, bld4.int_coord_type);
3275          lp_build_context_init(&bld4.int_size_in_bld, gallivm, bld4.int_size_in_type);
3276          lp_build_context_init(&bld4.float_size_in_bld, gallivm, bld4.float_size_in_type);
3277          lp_build_context_init(&bld4.int_size_bld, gallivm, bld4.int_size_type);
3278          lp_build_context_init(&bld4.float_size_bld, gallivm, bld4.float_size_type);
3279          lp_build_context_init(&bld4.texel_bld, gallivm, bld4.texel_type);
3280          lp_build_context_init(&bld4.levelf_bld, gallivm, bld4.levelf_type);
3281          lp_build_context_init(&bld4.leveli_bld, gallivm, bld4.leveli_type);
3282          lp_build_context_init(&bld4.lodf_bld, gallivm, bld4.lodf_type);
3283          lp_build_context_init(&bld4.lodi_bld, gallivm, bld4.lodi_type);
3284
3285          for (i = 0; i < num_quads; i++) {
3286             LLVMValueRef s4, t4, r4;
3287             LLVMValueRef lod_positive4, lod_fpart4 = NULL;
3288             LLVMValueRef ilevel04, ilevel14 = NULL;
3289             LLVMValueRef offsets4[4] = { NULL };
3290             unsigned num_lods = bld4.num_lods;
3291
                 /* elements 4*i .. 4*i+3 of each coordinate vector */
3292             s4 = lp_build_extract_range(gallivm, newcoords[0], 4*i, 4);
3293             t4 = lp_build_extract_range(gallivm, newcoords[1], 4*i, 4);
3294             r4 = lp_build_extract_range(gallivm, newcoords[2], 4*i, 4);
3295
                 /* offsets[0] decides whether any offsets are extracted at all */
3296             if (offsets[0]) {
3297                offsets4[0] = lp_build_extract_range(gallivm, offsets[0], 4*i, 4);
3298                if (dims > 1) {
3299                   offsets4[1] = lp_build_extract_range(gallivm, offsets[1], 4*i, 4);
3300                   if (dims > 2) {
3301                      offsets4[2] = lp_build_extract_range(gallivm, offsets[2], 4*i, 4);
3302                   }
3303                }
3304             }
                 /*
                  * lod-related values: num_lods entries starting at
                  * num_lods * i.  ilevel1 / lod_fpart are only extracted
                  * for linear mip filtering.
                  */
3305             lod_positive4 = lp_build_extract_range(gallivm, lod_positive, num_lods * i, num_lods);
3306             ilevel04 = bld.num_mips == 1 ? ilevel0 :
3307                           lp_build_extract_range(gallivm, ilevel0, num_lods * i, num_lods);
3308             if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
3309                ilevel14 = lp_build_extract_range(gallivm, ilevel1, num_lods * i, num_lods);
3310                lod_fpart4 = lp_build_extract_range(gallivm, lod_fpart, num_lods * i, num_lods);
3311             }
3312
3313             if (use_aos) {
3314                /* do sampling/filtering with fixed pt arithmetic */
3315                lp_build_sample_aos(&bld4, sampler_index,
3316                                    s4, t4, r4, offsets4,
3317                                    lod_positive4, lod_fpart4,
3318                                    ilevel04, ilevel14,
3319                                    texelout4);
3320             }
3321
3322             else {
3323                /* this path is currently unreachable and hence might break easily... */
3324                LLVMValueRef newcoords4[5];
3325                newcoords4[0] = s4;
3326                newcoords4[1] = t4;
3327                newcoords4[2] = r4;
3328                newcoords4[3] = lp_build_extract_range(gallivm, newcoords[3], 4*i, 4);
3329                newcoords4[4] = lp_build_extract_range(gallivm, newcoords[4], 4*i, 4);
3330
3331                lp_build_sample_general(&bld4, sampler_index,
3332                                        op_type == LP_SAMPLER_OP_GATHER,
3333                                        newcoords4, offsets4,
3334                                        lod_positive4, lod_fpart4,
3335                                        ilevel04, ilevel14,
3336                                        texelout4);
3337             }
                 /* stash this piece's channels, indexed [channel][piece] */
3338             for (j = 0; j < 4; j++) {
3339                texelouttmp[j][i] = texelout4[j];
3340             }
3341          }
3342
              /* join the num_quads 4-wide pieces of each channel */
3343          for (j = 0; j < 4; j++) {
3344             texel_out[j] = lp_build_concat(gallivm, texelouttmp[j], type4, num_quads);
3345          }
3346       }
3347    }
3348
        /* sampler swizzle is not applied to buffers or to gather results */
3349    if (target != PIPE_BUFFER && op_type != LP_SAMPLER_OP_GATHER) {
3350       apply_sampler_swizzle(&bld, texel_out);
3351    }
3352
3353    /*
3354     * texel type can be a (32bit) int/uint (for pure int formats only),
3355     * however we are expected to always return floats (storage is untyped).
3356     */
3357    if (!bld.texel_type.floating) {
3358       unsigned chan;
3359       for (chan = 0; chan < 4; chan++) {
3360          texel_out[chan] = LLVMBuildBitCast(builder, texel_out[chan],
3361                                             lp_build_vec_type(gallivm, type), "");
3362       }
3363    }
3364 }
3365
3366
3367 #define USE_TEX_FUNC_CALL 1
3368
3369 #define LP_MAX_TEX_FUNC_ARGS 32
3370
3371 static inline void
3372 get_target_info(enum pipe_texture_target target,
3373                 unsigned *num_coords, unsigned *num_derivs,
3374                 unsigned *num_offsets, unsigned *layer)
3375 {
3376    unsigned dims = texture_dims(target);
3377    *num_coords = dims;
3378    *num_offsets = dims;
3379    *num_derivs = (target == PIPE_TEXTURE_CUBE ||
3380                   target == PIPE_TEXTURE_CUBE_ARRAY) ? 3 : dims;
3381    *layer = has_layer_coord(target) ? 2: 0;
3382    if (target == PIPE_TEXTURE_CUBE_ARRAY) {
3383       /*
3384        * dims doesn't include r coord for cubes - this is handled
3385        * by layer instead, but need to fix up for cube arrays...
3386        */
3387       *layer = 3;
3388       *num_coords = 3;
3389    }
3390 }
3391
3392
3393 /**
3394  * Generate the function body for a texture sampling function.
3395  */
3396 static void
3397 lp_build_sample_gen_func(struct gallivm_state *gallivm,
3398                          const struct lp_static_texture_state *static_texture_state,
3399                          const struct lp_static_sampler_state *static_sampler_state,
3400                          struct lp_sampler_dynamic_state *dynamic_state,
3401                          struct lp_type type,
3402                          unsigned texture_index,
3403                          unsigned sampler_index,
3404                          LLVMValueRef function,
3405                          unsigned num_args,
3406                          unsigned sample_key)
3407 {
3408    LLVMBuilderRef old_builder;
3409    LLVMBasicBlockRef block;
3410    LLVMValueRef coords[5];
3411    LLVMValueRef offsets[3] = { NULL };
3412    LLVMValueRef lod = NULL;
3413    LLVMValueRef context_ptr;
3414    LLVMValueRef thread_data_ptr = NULL;
3415    LLVMValueRef texel_out[4];
3416    struct lp_derivatives derivs;
3417    struct lp_derivatives *deriv_ptr = NULL;
3418    unsigned num_param = 0;
3419    unsigned i, num_coords, num_derivs, num_offsets, layer;
3420    enum lp_sampler_lod_control lod_control;
3421    boolean need_cache = FALSE;
3422
3423    lod_control = (sample_key & LP_SAMPLER_LOD_CONTROL_MASK) >>
3424                     LP_SAMPLER_LOD_CONTROL_SHIFT;
3425
3426    get_target_info(static_texture_state->target,
3427                    &num_coords, &num_derivs, &num_offsets, &layer);
3428
3429    if (dynamic_state->cache_ptr) {
3430       const struct util_format_description *format_desc;
3431       format_desc = util_format_description(static_texture_state->format);
3432       if (format_desc && format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
3433          need_cache = TRUE;
3434       }
3435    }
3436
3437    /* "unpack" arguments */
3438    context_ptr = LLVMGetParam(function, num_param++);
3439    if (need_cache) {
3440       thread_data_ptr = LLVMGetParam(function, num_param++);
3441    }
3442    for (i = 0; i < num_coords; i++) {
3443       coords[i] = LLVMGetParam(function, num_param++);
3444    }
3445    for (i = num_coords; i < 5; i++) {
3446       /* This is rather unfortunate... */
3447       coords[i] = lp_build_undef(gallivm, type);
3448    }
3449    if (layer) {
3450       coords[layer] = LLVMGetParam(function, num_param++);
3451    }
3452    if (sample_key & LP_SAMPLER_SHADOW) {
3453       coords[4] = LLVMGetParam(function, num_param++);
3454    }
3455    if (sample_key & LP_SAMPLER_OFFSETS) {
3456       for (i = 0; i < num_offsets; i++) {
3457          offsets[i] = LLVMGetParam(function, num_param++);
3458       }
3459    }
3460    if (lod_control == LP_SAMPLER_LOD_BIAS ||
3461        lod_control == LP_SAMPLER_LOD_EXPLICIT) {
3462       lod = LLVMGetParam(function, num_param++);
3463    }
3464    else if (lod_control == LP_SAMPLER_LOD_DERIVATIVES) {
3465       for (i = 0; i < num_derivs; i++) {
3466          derivs.ddx[i] = LLVMGetParam(function, num_param++);
3467          derivs.ddy[i] = LLVMGetParam(function, num_param++);
3468       }
3469       deriv_ptr = &derivs;
3470    }
3471
3472    assert(num_args == num_param);
3473
3474    /*
3475     * Function body
3476     */
3477
3478    old_builder = gallivm->builder;
3479    block = LLVMAppendBasicBlockInContext(gallivm->context, function, "entry");
3480    gallivm->builder = LLVMCreateBuilderInContext(gallivm->context);
3481    LLVMPositionBuilderAtEnd(gallivm->builder, block);
3482
3483    lp_build_sample_soa_code(gallivm,
3484                             static_texture_state,
3485                             static_sampler_state,
3486                             dynamic_state,
3487                             type,
3488                             sample_key,
3489                             texture_index,
3490                             sampler_index,
3491                             context_ptr,
3492                             thread_data_ptr,
3493                             coords,
3494                             offsets,
3495                             deriv_ptr,
3496                             lod,
3497                             texel_out);
3498
3499    LLVMBuildAggregateRet(gallivm->builder, texel_out, 4);
3500
3501    LLVMDisposeBuilder(gallivm->builder);
3502    gallivm->builder = old_builder;
3503
3504    gallivm_verify_function(gallivm, function);
3505 }
3506
3507
3508 /**
3509  * Call the matching function for texture sampling.
3510  * If there's no match, generate a new one.
3511  */
3512 static void
3513 lp_build_sample_soa_func(struct gallivm_state *gallivm,
3514                          const struct lp_static_texture_state *static_texture_state,
3515                          const struct lp_static_sampler_state *static_sampler_state,
3516                          struct lp_sampler_dynamic_state *dynamic_state,
3517                          const struct lp_sampler_params *params)
3518 {
3519    LLVMBuilderRef builder = gallivm->builder;
3520    LLVMModuleRef module = LLVMGetGlobalParent(LLVMGetBasicBlockParent(
3521                              LLVMGetInsertBlock(builder)));
3522    LLVMValueRef function, inst;
3523    LLVMValueRef args[LP_MAX_TEX_FUNC_ARGS];
3524    LLVMBasicBlockRef bb;
3525    LLVMValueRef tex_ret;
3526    unsigned num_args = 0;
3527    char func_name[64];
3528    unsigned i, num_coords, num_derivs, num_offsets, layer;
3529    unsigned texture_index = params->texture_index;
3530    unsigned sampler_index = params->sampler_index;
3531    unsigned sample_key = params->sample_key;
3532    const LLVMValueRef *coords = params->coords;
3533    const LLVMValueRef *offsets = params->offsets;
3534    const struct lp_derivatives *derivs = params->derivs;
3535    enum lp_sampler_lod_control lod_control;
3536    boolean need_cache = FALSE;
3537
3538    lod_control = (sample_key & LP_SAMPLER_LOD_CONTROL_MASK) >>
3539                     LP_SAMPLER_LOD_CONTROL_SHIFT;
3540
3541    get_target_info(static_texture_state->target,
3542                    &num_coords, &num_derivs, &num_offsets, &layer);
3543
3544    if (dynamic_state->cache_ptr) {
3545       const struct util_format_description *format_desc;
3546       format_desc = util_format_description(static_texture_state->format);
3547       if (format_desc && format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
3548          /*
3549           * This is not 100% correct, if we have cache but the
3550           * util_format_s3tc_prefer is true the cache won't get used
3551           * regardless (could hook up the block decode there...) */
3552          need_cache = TRUE;
3553       }
3554    }
3555    /*
3556     * texture function matches are found by name.
3557     * Thus the name has to include both the texture and sampler unit
3558     * (which covers all static state) plus the actual texture function
3559     * (including things like offsets, shadow coord, lod control).
3560     * Additionally lod_property has to be included too.
3561     */
3562
3563    util_snprintf(func_name, sizeof(func_name), "texfunc_res_%d_sam_%d_%x",
3564                  texture_index, sampler_index, sample_key);
3565
3566    function = LLVMGetNamedFunction(module, func_name);
3567
3568    if(!function) {
3569       LLVMTypeRef arg_types[LP_MAX_TEX_FUNC_ARGS];
3570       LLVMTypeRef ret_type;
3571       LLVMTypeRef function_type;
3572       LLVMTypeRef val_type[4];
3573       unsigned num_param = 0;
3574
3575       /*
3576        * Generate the function prototype.
3577        */
3578
3579       arg_types[num_param++] = LLVMTypeOf(params->context_ptr);
3580       if (need_cache) {
3581          arg_types[num_param++] = LLVMTypeOf(params->thread_data_ptr);
3582       }
3583       for (i = 0; i < num_coords; i++) {
3584          arg_types[num_param++] = LLVMTypeOf(coords[0]);
3585          assert(LLVMTypeOf(coords[0]) == LLVMTypeOf(coords[i]));
3586       }
3587       if (layer) {
3588          arg_types[num_param++] = LLVMTypeOf(coords[layer]);
3589          assert(LLVMTypeOf(coords[0]) == LLVMTypeOf(coords[layer]));
3590       }
3591       if (sample_key & LP_SAMPLER_SHADOW) {
3592          arg_types[num_param++] = LLVMTypeOf(coords[0]);
3593       }
3594       if (sample_key & LP_SAMPLER_OFFSETS) {
3595          for (i = 0; i < num_offsets; i++) {
3596             arg_types[num_param++] = LLVMTypeOf(offsets[0]);
3597             assert(LLVMTypeOf(offsets[0]) == LLVMTypeOf(offsets[i]));
3598          }
3599       }
3600       if (lod_control == LP_SAMPLER_LOD_BIAS ||
3601           lod_control == LP_SAMPLER_LOD_EXPLICIT) {
3602          arg_types[num_param++] = LLVMTypeOf(params->lod);
3603       }
3604       else if (lod_control == LP_SAMPLER_LOD_DERIVATIVES) {
3605          for (i = 0; i < num_derivs; i++) {
3606             arg_types[num_param++] = LLVMTypeOf(derivs->ddx[i]);
3607             arg_types[num_param++] = LLVMTypeOf(derivs->ddy[i]);
3608             assert(LLVMTypeOf(derivs->ddx[0]) == LLVMTypeOf(derivs->ddx[i]));
3609             assert(LLVMTypeOf(derivs->ddy[0]) == LLVMTypeOf(derivs->ddy[i]));
3610          }
3611       }
3612
3613       val_type[0] = val_type[1] = val_type[2] = val_type[3] =
3614          lp_build_vec_type(gallivm, params->type);
3615       ret_type = LLVMStructTypeInContext(gallivm->context, val_type, 4, 0);
3616       function_type = LLVMFunctionType(ret_type, arg_types, num_param, 0);
3617       function = LLVMAddFunction(module, func_name, function_type);
3618
3619       for (i = 0; i < num_param; ++i) {
3620          if(LLVMGetTypeKind(arg_types[i]) == LLVMPointerTypeKind) {
3621
3622             lp_add_function_attr(function, i + 1, LP_FUNC_ATTR_NOALIAS);
3623          }
3624       }
3625
3626       LLVMSetFunctionCallConv(function, LLVMFastCallConv);
3627       LLVMSetLinkage(function, LLVMInternalLinkage);
3628
3629       lp_build_sample_gen_func(gallivm,
3630                                static_texture_state,
3631                                static_sampler_state,
3632                                dynamic_state,
3633                                params->type,
3634                                texture_index,
3635                                sampler_index,
3636                                function,
3637                                num_param,
3638                                sample_key);
3639    }
3640
3641    num_args = 0;
3642    args[num_args++] = params->context_ptr;
3643    if (need_cache) {
3644       args[num_args++] = params->thread_data_ptr;
3645    }
3646    for (i = 0; i < num_coords; i++) {
3647       args[num_args++] = coords[i];
3648    }
3649    if (layer) {
3650       args[num_args++] = coords[layer];
3651    }
3652    if (sample_key & LP_SAMPLER_SHADOW) {
3653       args[num_args++] = coords[4];
3654    }
3655    if (sample_key & LP_SAMPLER_OFFSETS) {
3656       for (i = 0; i < num_offsets; i++) {
3657          args[num_args++] = offsets[i];
3658       }
3659    }
3660    if (lod_control == LP_SAMPLER_LOD_BIAS ||
3661        lod_control == LP_SAMPLER_LOD_EXPLICIT) {
3662       args[num_args++] = params->lod;
3663    }
3664    else if (lod_control == LP_SAMPLER_LOD_DERIVATIVES) {
3665       for (i = 0; i < num_derivs; i++) {
3666          args[num_args++] = derivs->ddx[i];
3667          args[num_args++] = derivs->ddy[i];
3668       }
3669    }
3670
3671    assert(num_args <= LP_MAX_TEX_FUNC_ARGS);
3672
3673    tex_ret = LLVMBuildCall(builder, function, args, num_args, "");
3674    bb = LLVMGetInsertBlock(builder);
3675    inst = LLVMGetLastInstruction(bb);
3676    LLVMSetInstructionCallConv(inst, LLVMFastCallConv);
3677
3678    for (i = 0; i < 4; i++) {
3679       params->texel[i] = LLVMBuildExtractValue(gallivm->builder, tex_ret, i, "");
3680    }
3681 }
3682
3683
3684 /**
3685  * Build texture sampling code.
3686  * Either via a function call or inline it directly.
3687  */
3688 void
3689 lp_build_sample_soa(const struct lp_static_texture_state *static_texture_state,
3690                     const struct lp_static_sampler_state *static_sampler_state,
3691                     struct lp_sampler_dynamic_state *dynamic_state,
3692                     struct gallivm_state *gallivm,
3693                     const struct lp_sampler_params *params)
3694 {
3695    boolean use_tex_func = FALSE;
3696
3697    /*
3698     * Do not use a function call if the sampling is "simple enough".
3699     * We define this by
3700     * a) format
3701     * b) no mips (either one level only or no mip filter)
3702     * No mips will definitely make the code smaller, though
3703     * the format requirement is a bit iffy - there's some (SoA) formats
3704     * which definitely generate less code. This does happen to catch
3705     * some important cases though which are hurt quite a bit by using
3706     * a call (though not really because of the call overhead but because
3707     * they are reusing the same texture unit with some of the same
3708     * parameters).
3709     * Ideally we'd let llvm recognize this stuff by doing IPO passes.
3710     */
3711
3712    if (USE_TEX_FUNC_CALL) {
3713       const struct util_format_description *format_desc;
3714       boolean simple_format;
3715       boolean simple_tex;
3716       enum lp_sampler_op_type op_type;
3717       format_desc = util_format_description(static_texture_state->format);
3718       simple_format = !format_desc ||
3719                          (util_format_is_rgba8_variant(format_desc) &&
3720                           format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB);
3721
3722       op_type = (params->sample_key & LP_SAMPLER_OP_TYPE_MASK) >>
3723                     LP_SAMPLER_OP_TYPE_SHIFT;
3724       simple_tex =
3725          op_type != LP_SAMPLER_OP_TEXTURE ||
3726            ((static_sampler_state->min_mip_filter == PIPE_TEX_MIPFILTER_NONE ||
3727              static_texture_state->level_zero_only == TRUE) &&
3728             static_sampler_state->min_img_filter == static_sampler_state->mag_img_filter);
3729
3730       use_tex_func = format_desc && !(simple_format && simple_tex);
3731    }
3732
3733    if (use_tex_func) {
3734       lp_build_sample_soa_func(gallivm,
3735                                static_texture_state,
3736                                static_sampler_state,
3737                                dynamic_state,
3738                                params);
3739    }
3740    else {
3741       lp_build_sample_soa_code(gallivm,
3742                                static_texture_state,
3743                                static_sampler_state,
3744                                dynamic_state,
3745                                params->type,
3746                                params->sample_key,
3747                                params->texture_index,
3748                                params->sampler_index,
3749                                params->context_ptr,
3750                                params->thread_data_ptr,
3751                                params->coords,
3752                                params->offsets,
3753                                params->derivs,
3754                                params->lod,
3755                                params->texel);
3756    }
3757 }
3758
3759
3760 void
3761 lp_build_size_query_soa(struct gallivm_state *gallivm,
3762                         const struct lp_static_texture_state *static_state,
3763                         struct lp_sampler_dynamic_state *dynamic_state,
3764                         const struct lp_sampler_size_query_params *params)
3765 {
3766    LLVMValueRef lod, level = 0, size;
3767    LLVMValueRef first_level = NULL;
3768    int dims, i;
3769    boolean has_array;
3770    unsigned num_lods = 1;
3771    struct lp_build_context bld_int_vec4;
3772    LLVMValueRef context_ptr = params->context_ptr;
3773    unsigned texture_unit = params->texture_unit;
3774    unsigned target = params->target;
3775
3776    if (static_state->format == PIPE_FORMAT_NONE) {
3777       /*
3778        * If there's nothing bound, format is NONE, and we must return
3779        * all zero as mandated by d3d10 in this case.
3780        */
3781       unsigned chan;
3782       LLVMValueRef zero = lp_build_const_vec(gallivm, params->int_type, 0.0F);
3783       for (chan = 0; chan < 4; chan++) {
3784          params->sizes_out[chan] = zero;
3785       }
3786       return;
3787    }
3788
3789    /*
3790     * Do some sanity verification about bound texture and shader dcl target.
3791     * Not entirely sure what's possible but assume array/non-array
3792     * always compatible (probably not ok for OpenGL but d3d10 has no
3793     * distinction of arrays at the resource level).
3794     * Everything else looks bogus (though not entirely sure about rect/2d).
3795     * Currently disabled because it causes assertion failures if there's
3796     * nothing bound (or rather a dummy texture, not that this case would
3797     * return the right values).
3798     */
3799    if (0 && static_state->target != target) {
3800       if (static_state->target == PIPE_TEXTURE_1D)
3801          assert(target == PIPE_TEXTURE_1D_ARRAY);
3802       else if (static_state->target == PIPE_TEXTURE_1D_ARRAY)
3803          assert(target == PIPE_TEXTURE_1D);
3804       else if (static_state->target == PIPE_TEXTURE_2D)
3805          assert(target == PIPE_TEXTURE_2D_ARRAY);
3806       else if (static_state->target == PIPE_TEXTURE_2D_ARRAY)
3807          assert(target == PIPE_TEXTURE_2D);
3808       else if (static_state->target == PIPE_TEXTURE_CUBE)
3809          assert(target == PIPE_TEXTURE_CUBE_ARRAY);
3810       else if (static_state->target == PIPE_TEXTURE_CUBE_ARRAY)
3811          assert(target == PIPE_TEXTURE_CUBE);
3812       else
3813          assert(0);
3814    }
3815
3816    dims = texture_dims(target);
3817
3818    switch (target) {
3819    case PIPE_TEXTURE_1D_ARRAY:
3820    case PIPE_TEXTURE_2D_ARRAY:
3821    case PIPE_TEXTURE_CUBE_ARRAY:
3822       has_array = TRUE;
3823       break;
3824    default:
3825       has_array = FALSE;
3826       break;
3827    }
3828
3829    assert(!params->int_type.floating);
3830
3831    lp_build_context_init(&bld_int_vec4, gallivm, lp_type_int_vec(32, 128));
3832
3833    if (params->explicit_lod) {
3834       /* FIXME: this needs to honor per-element lod */
3835       lod = LLVMBuildExtractElement(gallivm->builder, params->explicit_lod,
3836                                     lp_build_const_int32(gallivm, 0), "");
3837       first_level = dynamic_state->first_level(dynamic_state, gallivm,
3838                                                context_ptr, texture_unit);
3839       level = LLVMBuildAdd(gallivm->builder, lod, first_level, "level");
3840       lod = lp_build_broadcast_scalar(&bld_int_vec4, level);
3841    } else {
3842       lod = bld_int_vec4.zero;
3843    }
3844
3845    size = bld_int_vec4.undef;
3846
3847    size = LLVMBuildInsertElement(gallivm->builder, size,
3848                                  dynamic_state->width(dynamic_state, gallivm,
3849                                                       context_ptr, texture_unit),
3850                                  lp_build_const_int32(gallivm, 0), "");
3851
3852    if (dims >= 2) {
3853       size = LLVMBuildInsertElement(gallivm->builder, size,
3854                                     dynamic_state->height(dynamic_state, gallivm,
3855                                                           context_ptr, texture_unit),
3856                                     lp_build_const_int32(gallivm, 1), "");
3857    }
3858
3859    if (dims >= 3) {
3860       size = LLVMBuildInsertElement(gallivm->builder, size,
3861                                     dynamic_state->depth(dynamic_state, gallivm,
3862                                                          context_ptr, texture_unit),
3863                                     lp_build_const_int32(gallivm, 2), "");
3864    }
3865
3866    size = lp_build_minify(&bld_int_vec4, size, lod, TRUE);
3867
3868    if (has_array) {
3869       LLVMValueRef layers = dynamic_state->depth(dynamic_state, gallivm,
3870                                                  context_ptr, texture_unit);
3871       if (target == PIPE_TEXTURE_CUBE_ARRAY) {
3872          /*
3873           * It looks like GL wants number of cubes, d3d10.1 has it undefined?
3874           * Could avoid this by passing in number of cubes instead of total
3875           * number of layers (might make things easier elsewhere too).
3876           */
3877          LLVMValueRef six = lp_build_const_int32(gallivm, 6);
3878          layers = LLVMBuildSDiv(gallivm->builder, layers, six, "");
3879       }
3880       size = LLVMBuildInsertElement(gallivm->builder, size, layers,
3881                                     lp_build_const_int32(gallivm, dims), "");
3882    }
3883
3884    /*
3885     * d3d10 requires zero for x/y/z values (but not w, i.e. mip levels)
3886     * if level is out of bounds (note this can't cover unbound texture
3887     * here, which also requires returning zero).
3888     */
3889    if (params->explicit_lod && params->is_sviewinfo) {
3890       LLVMValueRef last_level, out, out1;
3891       struct lp_build_context leveli_bld;
3892
3893       /* everything is scalar for now */
3894       lp_build_context_init(&leveli_bld, gallivm, lp_type_int_vec(32, 32));
3895       last_level = dynamic_state->last_level(dynamic_state, gallivm,
3896                                              context_ptr, texture_unit);
3897
3898       out = lp_build_cmp(&leveli_bld, PIPE_FUNC_LESS, level, first_level);
3899       out1 = lp_build_cmp(&leveli_bld, PIPE_FUNC_GREATER, level, last_level);
3900       out = lp_build_or(&leveli_bld, out, out1);
3901       if (num_lods == 1) {
3902          out = lp_build_broadcast_scalar(&bld_int_vec4, out);
3903       }
3904       else {
3905          /* TODO */
3906          assert(0);
3907       }
3908       size = lp_build_andnot(&bld_int_vec4, size, out);
3909    }
3910    for (i = 0; i < dims + (has_array ? 1 : 0); i++) {
3911       params->sizes_out[i] = lp_build_extract_broadcast(gallivm, bld_int_vec4.type, params->int_type,
3912                                                 size,
3913                                                 lp_build_const_int32(gallivm, i));
3914    }
3915    if (params->is_sviewinfo) {
3916       for (; i < 4; i++) {
3917          params->sizes_out[i] = lp_build_const_vec(gallivm, params->int_type, 0.0);
3918       }
3919    }
3920
3921    /*
3922     * if there's no explicit_lod (buffers, rects) queries requiring nr of
3923     * mips would be illegal.
3924     */
3925    if (params->is_sviewinfo && params->explicit_lod) {
3926       struct lp_build_context bld_int_scalar;
3927       LLVMValueRef num_levels;
3928       lp_build_context_init(&bld_int_scalar, gallivm, lp_type_int(32));
3929
3930       if (static_state->level_zero_only) {
3931          num_levels = bld_int_scalar.one;
3932       }
3933       else {
3934          LLVMValueRef last_level;
3935
3936          last_level = dynamic_state->last_level(dynamic_state, gallivm,
3937                                                 context_ptr, texture_unit);
3938          num_levels = lp_build_sub(&bld_int_scalar, last_level, first_level);
3939          num_levels = lp_build_add(&bld_int_scalar, num_levels, bld_int_scalar.one);
3940       }
3941       params->sizes_out[3] = lp_build_broadcast(gallivm, lp_build_vec_type(gallivm, params->int_type),
3942                                         num_levels);
3943    }
3944 }