1 /**************************************************************************
2 *
3 * Copyright 2009 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28 /**
29 * @file
30 * Texture sampling -- common code.
31 *
32 * @author Jose Fonseca <jfonseca@vmware.com>
33 */
34
35 #include "pipe/p_defines.h"
36 #include "pipe/p_state.h"
37 #include "util/u_format.h"
38 #include "util/u_math.h"
39 #include "lp_bld_arit.h"
40 #include "lp_bld_const.h"
41 #include "lp_bld_debug.h"
42 #include "lp_bld_printf.h"
43 #include "lp_bld_flow.h"
44 #include "lp_bld_sample.h"
45 #include "lp_bld_swizzle.h"
46 #include "lp_bld_type.h"
47 #include "lp_bld_logic.h"
48 #include "lp_bld_pack.h"
49 #include "lp_bld_quad.h"
50
51
52 /*
53 * Bri-linear factor. Should be greater than one.
54 */
55 #define BRILINEAR_FACTOR 2
56
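/*
 * Worked example (illustrative): with BRILINEAR_FACTOR == 2,
 * lp_build_brilinear_lod() below uses pre_offset = (2 - 0.5)/2 - 0.5 = 0.25
 * and post_offset = 1 - 2 = -1.  For lod = n + f this gives
 * lod_fpart = 2*f - 0.5 (treated as 0 when negative), so f <= 0.25 snaps
 * to level n, f >= 0.75 snaps to level n + 1, and in between the blend
 * weight ramps linearly with slope 2, passing exactly through 0.5 at
 * f = 0.5.
 */
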
57 /**
58 * Does the given texture wrap mode allow sampling the texture border color?
59 * XXX maybe move this into gallium util code.
60 */
61 boolean
62 lp_sampler_wrap_mode_uses_border_color(unsigned mode,
63 unsigned min_img_filter,
64 unsigned mag_img_filter)
65 {
66 switch (mode) {
67 case PIPE_TEX_WRAP_REPEAT:
68 case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
69 case PIPE_TEX_WRAP_MIRROR_REPEAT:
70 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
71 return FALSE;
72 case PIPE_TEX_WRAP_CLAMP:
73 case PIPE_TEX_WRAP_MIRROR_CLAMP:
74 if (min_img_filter == PIPE_TEX_FILTER_NEAREST &&
75 mag_img_filter == PIPE_TEX_FILTER_NEAREST) {
76 return FALSE;
77 } else {
78 return TRUE;
79 }
80 case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
81 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
82 return TRUE;
83 default:
84 assert(0 && "unexpected wrap mode");
85 return FALSE;
86 }
87 }
88
89
/**
 * Initialize the lp_static_texture_state object with the gallium
 * texture/sampler_view state (this contains the parts which are
 * considered static).
 */
95 void
96 lp_sampler_static_texture_state(struct lp_static_texture_state *state,
97 const struct pipe_sampler_view *view)
98 {
99 const struct pipe_resource *texture;
100
101 memset(state, 0, sizeof *state);
102
103 if (!view || !view->texture)
104 return;
105
106 texture = view->texture;
107
108 state->format = view->format;
109 state->swizzle_r = view->swizzle_r;
110 state->swizzle_g = view->swizzle_g;
111 state->swizzle_b = view->swizzle_b;
112 state->swizzle_a = view->swizzle_a;
113
114 state->target = texture->target;
115 state->pot_width = util_is_power_of_two(texture->width0);
116 state->pot_height = util_is_power_of_two(texture->height0);
117 state->pot_depth = util_is_power_of_two(texture->depth0);
118 state->level_zero_only = !view->u.tex.last_level;
119
120 /*
121 * the layer / element / level parameters are all either dynamic
122 * state or handled transparently wrt execution.
123 */
124 }
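
/*
 * Typical usage sketch (illustrative; the exact key layout is
 * driver-specific, e.g. llvmpipe's shader variant key):
 *
 *    struct lp_static_texture_state *s = &key_tex_state[i];
 *    lp_sampler_static_texture_state(s, views[i]);
 *
 * Any change in these static bits then forces a new shader variant,
 * which is why only truly static state is copied here.
 */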
125
126
/**
 * Initialize the lp_static_sampler_state object with the gallium sampler
 * state (this contains the parts which are considered static).
 */
131 void
132 lp_sampler_static_sampler_state(struct lp_static_sampler_state *state,
133 const struct pipe_sampler_state *sampler)
134 {
135 memset(state, 0, sizeof *state);
136
137 if (!sampler)
138 return;
139
/*
 * We don't copy sampler state over unless it is actually enabled, to avoid
 * spurious recompiles, as the sampler static state is part of the shader
 * key.
 *
 * Ideally the state tracker or cso_cache module would make all state
 * canonical, but until that happens it's better to be safe than sorry here.
 *
 * XXX: Actually there's much more that could be done here, especially
 * regarding 1D/2D/3D/CUBE textures, wrap modes, etc.
 */
151
152 state->wrap_s = sampler->wrap_s;
153 state->wrap_t = sampler->wrap_t;
154 state->wrap_r = sampler->wrap_r;
155 state->min_img_filter = sampler->min_img_filter;
156 state->mag_img_filter = sampler->mag_img_filter;
157
158 if (sampler->max_lod > 0.0f) {
159 state->min_mip_filter = sampler->min_mip_filter;
160 } else {
161 state->min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
162 }
163
164 if (state->min_mip_filter != PIPE_TEX_MIPFILTER_NONE) {
165 if (sampler->lod_bias != 0.0f) {
166 state->lod_bias_non_zero = 1;
167 }
168
169 /* If min_lod == max_lod we can greatly simplify mipmap selection.
170 * This is a case that occurs during automatic mipmap generation.
171 */
172 if (sampler->min_lod == sampler->max_lod) {
173 state->min_max_lod_equal = 1;
174 } else {
175 if (sampler->min_lod > 0.0f) {
176 state->apply_min_lod = 1;
177 }
178
/*
 * XXX this won't do anything with the mesa state tracker, which always
 * sets max_lod to no more than the number of mip levels actually
 * present...
 */
183 if (sampler->max_lod < (PIPE_MAX_TEXTURE_LEVELS - 1)) {
184 state->apply_max_lod = 1;
185 }
186 }
187 }
188
189 state->compare_mode = sampler->compare_mode;
190 if (sampler->compare_mode != PIPE_TEX_COMPARE_NONE) {
191 state->compare_func = sampler->compare_func;
192 }
193
194 state->normalized_coords = sampler->normalized_coords;
195 }
196
197
198 /**
199 * Generate code to compute coordinate gradient (rho).
200 * \param derivs partial derivatives of (s, t, r, q) with respect to X and Y
201 *
202 * The resulting rho is scalar per quad.
203 */
204 static LLVMValueRef
205 lp_build_rho(struct lp_build_sample_context *bld,
206 unsigned texture_unit,
207 LLVMValueRef s,
208 LLVMValueRef t,
209 LLVMValueRef r,
210 LLVMValueRef cube_rho,
211 const struct lp_derivatives *derivs)
212 {
213 struct gallivm_state *gallivm = bld->gallivm;
214 struct lp_build_context *int_size_bld = &bld->int_size_in_bld;
215 struct lp_build_context *float_size_bld = &bld->float_size_in_bld;
216 struct lp_build_context *float_bld = &bld->float_bld;
217 struct lp_build_context *coord_bld = &bld->coord_bld;
218 struct lp_build_context *perquadf_bld = &bld->perquadf_bld;
219 const unsigned dims = bld->dims;
220 LLVMValueRef ddx_ddy[2];
221 LLVMBuilderRef builder = bld->gallivm->builder;
222 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
223 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
224 LLVMValueRef index1 = LLVMConstInt(i32t, 1, 0);
225 LLVMValueRef index2 = LLVMConstInt(i32t, 2, 0);
226 LLVMValueRef rho_vec;
227 LLVMValueRef int_size, float_size;
228 LLVMValueRef rho;
229 LLVMValueRef first_level, first_level_vec;
230 unsigned length = coord_bld->type.length;
231 unsigned num_quads = length / 4;
232 unsigned i;
233 LLVMValueRef i32undef = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
234 LLVMValueRef rho_xvec, rho_yvec;
235
236 /* Note that all simplified calculations will only work for isotropic filtering */
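   /*
    * In scalar terms, what the vector code below computes is (sketch):
    * with the default approximation
    *    rho = max_i(size_i * max(|d coord_i / dx|, |d coord_i / dy|))
    * and with GALLIVM_DEBUG_NO_RHO_APPROX
    *    rho = max(sqrt(sum_i (size_i * d coord_i / dx)^2),
    *              sqrt(sum_i (size_i * d coord_i / dy)^2)),
    * evaluated once per quad.
    */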
237
238 first_level = bld->dynamic_state->first_level(bld->dynamic_state,
239 bld->gallivm, texture_unit);
240 first_level_vec = lp_build_broadcast_scalar(int_size_bld, first_level);
241 int_size = lp_build_minify(int_size_bld, bld->int_size, first_level_vec);
242 float_size = lp_build_int_to_float(float_size_bld, int_size);
243
244 if (cube_rho) {
245 LLVMValueRef cubesize;
246 LLVMValueRef index0 = lp_build_const_int32(gallivm, 0);
/*
 * The cube map code already did everything except the size mul and the
 * per-quad extraction.
 */
250 rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
251 perquadf_bld->type, cube_rho, 0);
252 if (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) {
253 rho = lp_build_sqrt(perquadf_bld, rho);
254 }
/* Could optimize this for the single-quad case by just skipping the broadcast */
256 cubesize = lp_build_extract_broadcast(gallivm, bld->float_size_in_type,
257 perquadf_bld->type, float_size, index0);
258 rho = lp_build_mul(perquadf_bld, cubesize, rho);
259 }
260 else if (derivs && !(bld->static_texture_state->target == PIPE_TEXTURE_CUBE)) {
261 LLVMValueRef ddmax[3], ddx[3], ddy[3];
262 for (i = 0; i < dims; i++) {
263 LLVMValueRef floatdim;
264 LLVMValueRef indexi = lp_build_const_int32(gallivm, i);
265
266 floatdim = lp_build_extract_broadcast(gallivm, bld->float_size_in_type,
267 coord_bld->type, float_size, indexi);
268
269 if ((gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) && (dims > 1)) {
270 ddx[i] = lp_build_mul(coord_bld, floatdim, derivs->ddx[i]);
271 ddy[i] = lp_build_mul(coord_bld, floatdim, derivs->ddy[i]);
272 ddx[i] = lp_build_mul(coord_bld, ddx[i], ddx[i]);
273 ddy[i] = lp_build_mul(coord_bld, ddy[i], ddy[i]);
274 }
275 else {
276 LLVMValueRef tmpx, tmpy;
277 tmpx = lp_build_abs(coord_bld, derivs->ddx[i]);
278 tmpy = lp_build_abs(coord_bld, derivs->ddy[i]);
279 ddmax[i] = lp_build_max(coord_bld, tmpx, tmpy);
280 ddmax[i] = lp_build_mul(coord_bld, floatdim, ddmax[i]);
281 }
282 }
283 if ((gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) && (dims > 1)) {
284 rho_xvec = lp_build_add(coord_bld, ddx[0], ddx[1]);
285 rho_yvec = lp_build_add(coord_bld, ddy[0], ddy[1]);
286 if (dims > 2) {
287 rho_xvec = lp_build_add(coord_bld, rho_xvec, ddx[2]);
288 rho_yvec = lp_build_add(coord_bld, rho_yvec, ddy[2]);
289 }
290 rho_vec = lp_build_max(coord_bld, rho_xvec, rho_yvec);
291 rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
292 perquadf_bld->type, rho_vec, 0);
/*
 * Note that as long as we don't care about per-pixel lod we could reduce
 * the math further (at some shuffle cost), but for now only do the sqrt
 * after packing.
 */
297 rho = lp_build_sqrt(perquadf_bld, rho);
298 }
299 else {
300 rho_vec = ddmax[0];
301 if (dims > 1) {
302 rho_vec = lp_build_max(coord_bld, rho_vec, ddmax[1]);
303 if (dims > 2) {
304 rho_vec = lp_build_max(coord_bld, rho_vec, ddmax[2]);
305 }
306 }
307 /*
308 * rho_vec now still contains per-pixel rho, convert to scalar per quad
309 * since we can't handle per-pixel rho/lod from now on (TODO).
310 */
311 rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
312 perquadf_bld->type, rho_vec, 0);
313 }
314 }
315 else {
/*
 * This all looks a bit complex, but it's not that bad
 * (the shuffle code makes it look worse than it is).
 * Still, it might not be ideal for all cases.
 */
321 static const unsigned char swizzle0[] = { /* no-op swizzle */
322 0, LP_BLD_SWIZZLE_DONTCARE,
323 LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
324 };
325 static const unsigned char swizzle1[] = {
326 1, LP_BLD_SWIZZLE_DONTCARE,
327 LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
328 };
329 static const unsigned char swizzle2[] = {
330 2, LP_BLD_SWIZZLE_DONTCARE,
331 LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
332 };
333
334 if (dims < 2) {
335 ddx_ddy[0] = lp_build_packed_ddx_ddy_onecoord(coord_bld, s);
336 }
else {
338 ddx_ddy[0] = lp_build_packed_ddx_ddy_twocoord(coord_bld, s, t);
339 if (dims > 2) {
340 ddx_ddy[1] = lp_build_packed_ddx_ddy_onecoord(coord_bld, r);
341 }
342 }
343
344 if ((gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) && (dims > 1)) {
345 static const unsigned char swizzle01[] = { /* no-op swizzle */
346 0, 1,
347 LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
348 };
349 static const unsigned char swizzle23[] = {
350 2, 3,
351 LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
352 };
353 LLVMValueRef ddx_ddys, ddx_ddyt, floatdim, shuffles[LP_MAX_VECTOR_LENGTH / 4];
354
355 for (i = 0; i < num_quads; i++) {
356 shuffles[i*4+0] = shuffles[i*4+1] = index0;
357 shuffles[i*4+2] = shuffles[i*4+3] = index1;
358 }
359 floatdim = LLVMBuildShuffleVector(builder, float_size, float_size,
360 LLVMConstVector(shuffles, length), "");
361 ddx_ddy[0] = lp_build_mul(coord_bld, ddx_ddy[0], floatdim);
362 ddx_ddy[0] = lp_build_mul(coord_bld, ddx_ddy[0], ddx_ddy[0]);
363 ddx_ddys = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle01);
364 ddx_ddyt = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle23);
365 rho_vec = lp_build_add(coord_bld, ddx_ddys, ddx_ddyt);
366
367 if (dims > 2) {
368 static const unsigned char swizzle02[] = {
369 0, 2,
370 LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
371 };
372 floatdim = lp_build_extract_broadcast(gallivm, bld->float_size_in_type,
373 coord_bld->type, float_size, index2);
374 ddx_ddy[1] = lp_build_mul(coord_bld, ddx_ddy[1], floatdim);
375 ddx_ddy[1] = lp_build_mul(coord_bld, ddx_ddy[1], ddx_ddy[1]);
376 ddx_ddy[1] = lp_build_swizzle_aos(coord_bld, ddx_ddy[1], swizzle02);
377 rho_vec = lp_build_add(coord_bld, rho_vec, ddx_ddy[1]);
378 }
379 rho_xvec = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle0);
380 rho_yvec = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle1);
381 rho_vec = lp_build_max(coord_bld, rho_xvec, rho_yvec);
382
383 rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
384 perquadf_bld->type, rho_vec, 0);
385 rho = lp_build_sqrt(perquadf_bld, rho);
386 }
387 else {
388 ddx_ddy[0] = lp_build_abs(coord_bld, ddx_ddy[0]);
389 if (dims > 2) {
390 ddx_ddy[1] = lp_build_abs(coord_bld, ddx_ddy[1]);
391 }
392
393 if (dims < 2) {
394 rho_xvec = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle0);
395 rho_yvec = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle1);
396 }
397 else if (dims == 2) {
398 static const unsigned char swizzle02[] = {
399 0, 2,
400 LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
401 };
402 static const unsigned char swizzle13[] = {
403 1, 3,
404 LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
405 };
406 rho_xvec = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle02);
407 rho_yvec = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle13);
408 }
409 else {
410 LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH];
411 LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH];
412 assert(dims == 3);
413 for (i = 0; i < num_quads; i++) {
414 shuffles1[4*i + 0] = lp_build_const_int32(gallivm, 4*i);
415 shuffles1[4*i + 1] = lp_build_const_int32(gallivm, 4*i + 2);
416 shuffles1[4*i + 2] = lp_build_const_int32(gallivm, length + 4*i);
417 shuffles1[4*i + 3] = i32undef;
418 shuffles2[4*i + 0] = lp_build_const_int32(gallivm, 4*i + 1);
419 shuffles2[4*i + 1] = lp_build_const_int32(gallivm, 4*i + 3);
420 shuffles2[4*i + 2] = lp_build_const_int32(gallivm, length + 4*i + 2);
421 shuffles2[4*i + 3] = i32undef;
422 }
423 rho_xvec = LLVMBuildShuffleVector(builder, ddx_ddy[0], ddx_ddy[1],
424 LLVMConstVector(shuffles1, length), "");
425 rho_yvec = LLVMBuildShuffleVector(builder, ddx_ddy[0], ddx_ddy[1],
426 LLVMConstVector(shuffles2, length), "");
427 }
428
429 rho_vec = lp_build_max(coord_bld, rho_xvec, rho_yvec);
430
431 if (bld->coord_type.length > 4) {
432 /* expand size to each quad */
433 if (dims > 1) {
434 /* could use some broadcast_vector helper for this? */
435 LLVMValueRef src[LP_MAX_VECTOR_LENGTH/4];
436 for (i = 0; i < num_quads; i++) {
437 src[i] = float_size;
438 }
439 float_size = lp_build_concat(bld->gallivm, src, float_size_bld->type, num_quads);
440 }
441 else {
442 float_size = lp_build_broadcast_scalar(coord_bld, float_size);
443 }
444 rho_vec = lp_build_mul(coord_bld, rho_vec, float_size);
445
446 if (dims <= 1) {
447 rho = rho_vec;
448 }
449 else {
450 if (dims >= 2) {
451 LLVMValueRef rho_s, rho_t, rho_r;
452
453 rho_s = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle0);
454 rho_t = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle1);
455
456 rho = lp_build_max(coord_bld, rho_s, rho_t);
457
458 if (dims >= 3) {
459 rho_r = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle2);
460 rho = lp_build_max(coord_bld, rho, rho_r);
461 }
462 }
463 }
464 rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
465 perquadf_bld->type, rho, 0);
466 }
467 else {
468 if (dims <= 1) {
469 rho_vec = LLVMBuildExtractElement(builder, rho_vec, index0, "");
470 }
471 rho_vec = lp_build_mul(float_size_bld, rho_vec, float_size);
472
473 if (dims <= 1) {
474 rho = rho_vec;
475 }
476 else {
477 if (dims >= 2) {
478 LLVMValueRef rho_s, rho_t, rho_r;
479
480 rho_s = LLVMBuildExtractElement(builder, rho_vec, index0, "");
481 rho_t = LLVMBuildExtractElement(builder, rho_vec, index1, "");
482
483 rho = lp_build_max(float_bld, rho_s, rho_t);
484
485 if (dims >= 3) {
486 rho_r = LLVMBuildExtractElement(builder, rho_vec, index2, "");
487 rho = lp_build_max(float_bld, rho, rho_r);
488 }
489 }
490 }
491 }
492 }
493 }
494
495 return rho;
496 }
497
498
499 /*
500 * Bri-linear lod computation
501 *
502 * Use a piece-wise linear approximation of log2 such that:
503 * - round to nearest, for values in the neighborhood of -1, 0, 1, 2, etc.
 * - linear approximation for values in the neighborhood of 0.5, 1.5, etc.,
505 * with the steepness specified in 'factor'
506 * - exact result for 0.5, 1.5, etc.
507 *
508 *
509 * 1.0 - /----*
510 * /
511 * /
512 * /
513 * 0.5 - *
514 * /
515 * /
516 * /
517 * 0.0 - *----/
518 *
519 * | |
520 * 2^0 2^1
521 *
522 * This is a technique also commonly used in hardware:
523 * - http://ixbtlabs.com/articles2/gffx/nv40-rx800-3.html
524 *
525 * TODO: For correctness, this should only be applied when texture is known to
526 * have regular mipmaps, i.e., mipmaps derived from the base level.
527 *
528 * TODO: This could be done in fixed point, where applicable.
529 */
530 static void
531 lp_build_brilinear_lod(struct lp_build_context *bld,
532 LLVMValueRef lod,
533 double factor,
534 LLVMValueRef *out_lod_ipart,
535 LLVMValueRef *out_lod_fpart)
536 {
537 LLVMValueRef lod_fpart;
538 double pre_offset = (factor - 0.5)/factor - 0.5;
539 double post_offset = 1 - factor;
540
541 if (0) {
542 lp_build_printf(bld->gallivm, "lod = %f\n", lod);
543 }
544
545 lod = lp_build_add(bld, lod,
546 lp_build_const_vec(bld->gallivm, bld->type, pre_offset));
547
548 lp_build_ifloor_fract(bld, lod, out_lod_ipart, &lod_fpart);
549
550 lod_fpart = lp_build_mul(bld, lod_fpart,
551 lp_build_const_vec(bld->gallivm, bld->type, factor));
552
553 lod_fpart = lp_build_add(bld, lod_fpart,
554 lp_build_const_vec(bld->gallivm, bld->type, post_offset));
555
556 /*
557 * It's not necessary to clamp lod_fpart since:
558 * - the above expression will never produce numbers greater than one.
559 * - the mip filtering branch is only taken if lod_fpart is positive
560 */
561
562 *out_lod_fpart = lod_fpart;
563
564 if (0) {
565 lp_build_printf(bld->gallivm, "lod_ipart = %i\n", *out_lod_ipart);
566 lp_build_printf(bld->gallivm, "lod_fpart = %f\n\n", *out_lod_fpart);
567 }
568 }
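
#if 0
/*
 * Scalar reference of the transform above, for illustration only (not
 * compiled; mirrors lp_build_brilinear_lod() with plain floats and
 * assumes <math.h> for floorf()):
 */
static void
brilinear_lod_ref(float lod, float factor, int *out_ipart, float *out_fpart)
{
   const float pre_offset = (factor - 0.5f)/factor - 0.5f;
   const float post_offset = 1.0f - factor;
   float fpart;
   lod += pre_offset;
   *out_ipart = (int)floorf(lod);
   fpart = lod - floorf(lod);
   fpart = fpart * factor + post_offset;
   /* a negative fpart means the mip filtering branch is skipped, i.e.
    * the result is just level ipart */
   *out_fpart = fpart;
}
#endif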
569
570
/*
 * Combined log2 and brilinear lod computation.
 *
 * It is essentially identical to calling lp_build_fast_log2() and
 * lp_build_brilinear_lod() above, but by combining them we can compute the
 * integer and fractional parts independently.
 */
578 static void
579 lp_build_brilinear_rho(struct lp_build_context *bld,
580 LLVMValueRef rho,
581 double factor,
582 LLVMValueRef *out_lod_ipart,
583 LLVMValueRef *out_lod_fpart)
584 {
585 LLVMValueRef lod_ipart;
586 LLVMValueRef lod_fpart;
587
588 const double pre_factor = (2*factor - 0.5)/(M_SQRT2*factor);
589 const double post_offset = 1 - 2*factor;
590
591 assert(bld->type.floating);
592
593 assert(lp_check_value(bld->type, rho));
594
/*
 * The pre factor will make the intersections with the exact powers of two
 * happen precisely where we want them to be, which means that the integer
 * part will not need any post adjustments.
 */
600 rho = lp_build_mul(bld, rho,
601 lp_build_const_vec(bld->gallivm, bld->type, pre_factor));
602
603 /* ipart = ifloor(log2(rho)) */
604 lod_ipart = lp_build_extract_exponent(bld, rho, 0);
605
606 /* fpart = rho / 2**ipart */
607 lod_fpart = lp_build_extract_mantissa(bld, rho);
608
609 lod_fpart = lp_build_mul(bld, lod_fpart,
610 lp_build_const_vec(bld->gallivm, bld->type, factor));
611
612 lod_fpart = lp_build_add(bld, lod_fpart,
613 lp_build_const_vec(bld->gallivm, bld->type, post_offset));
614
615 /*
616 * Like lp_build_brilinear_lod, it's not necessary to clamp lod_fpart since:
617 * - the above expression will never produce numbers greater than one.
618 * - the mip filtering branch is only taken if lod_fpart is positive
619 */
620
621 *out_lod_ipart = lod_ipart;
622 *out_lod_fpart = lod_fpart;
623 }
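
/*
 * Numeric sanity check (illustrative), for factor == 2:
 * pre_factor = 3.5/(2*M_SQRT2) ~= 1.2374 and post_offset = -3.
 * For rho = 2^(n + 0.5) (i.e. lod = n + 0.5) the scaled rho has exponent n
 * and mantissa 1.75, so lod_fpart = 1.75*2 - 3 = 0.5 with lod_ipart = n,
 * exactly matching lp_build_brilinear_lod(log2(rho)).
 * For rho = 2^n the fpart comes out slightly negative and is treated as
 * zero by the mip filtering branch, i.e. a pure level-n lookup.
 */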
624
625
626 /**
627 * Generate code to compute texture level of detail (lambda).
628 * \param derivs partial derivatives of (s, t, r, q) with respect to X and Y
629 * \param lod_bias optional float vector with the shader lod bias
630 * \param explicit_lod optional float vector with the explicit lod
 *
 * The resulting lod is scalar per quad, so only the first value per quad
 * passed in from lod_bias or explicit_lod is used.
637 */
638 void
639 lp_build_lod_selector(struct lp_build_sample_context *bld,
640 unsigned texture_unit,
641 unsigned sampler_unit,
642 LLVMValueRef s,
643 LLVMValueRef t,
644 LLVMValueRef r,
645 LLVMValueRef cube_rho,
646 const struct lp_derivatives *derivs,
647 LLVMValueRef lod_bias, /* optional */
648 LLVMValueRef explicit_lod, /* optional */
649 unsigned mip_filter,
650 LLVMValueRef *out_lod_ipart,
651 LLVMValueRef *out_lod_fpart)
652
653 {
654 LLVMBuilderRef builder = bld->gallivm->builder;
655 struct lp_build_context *perquadf_bld = &bld->perquadf_bld;
656 LLVMValueRef lod;
657
658 *out_lod_ipart = bld->perquadi_bld.zero;
659 *out_lod_fpart = perquadf_bld->zero;
660
661 if (bld->static_sampler_state->min_max_lod_equal) {
662 /* User is forcing sampling from a particular mipmap level.
663 * This is hit during mipmap generation.
664 */
665 LLVMValueRef min_lod =
666 bld->dynamic_state->min_lod(bld->dynamic_state,
667 bld->gallivm, sampler_unit);
668
669 lod = lp_build_broadcast_scalar(perquadf_bld, min_lod);
670 }
671 else {
672 if (explicit_lod) {
673 lod = lp_build_pack_aos_scalars(bld->gallivm, bld->coord_bld.type,
674 perquadf_bld->type, explicit_lod, 0);
675 }
676 else {
677 LLVMValueRef rho;
678
679 rho = lp_build_rho(bld, texture_unit, s, t, r, cube_rho, derivs);
680
681 /*
682 * Compute lod = log2(rho)
683 */
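
         /*
          * Overall (sketch of the code below): lod = log2(rho), plus the
          * shader and/or sampler lod bias if present, then clamped to
          * [min_lod, max_lod]; the special cases right below skip straight
          * to the ipart/fpart results when no such adjustment applies.
          */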
684
685 if (!lod_bias &&
686 !bld->static_sampler_state->lod_bias_non_zero &&
687 !bld->static_sampler_state->apply_max_lod &&
688 !bld->static_sampler_state->apply_min_lod) {
/*
 * Special case when there are no post-log2 adjustments, which
 * saves instructions by keeping the integer and fractional lod
 * computations separate from the start.
 */
694
695 if (mip_filter == PIPE_TEX_MIPFILTER_NONE ||
696 mip_filter == PIPE_TEX_MIPFILTER_NEAREST) {
697 *out_lod_ipart = lp_build_ilog2(perquadf_bld, rho);
698 *out_lod_fpart = perquadf_bld->zero;
699 return;
700 }
701 if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR &&
702 !(gallivm_debug & GALLIVM_DEBUG_NO_BRILINEAR)) {
703 lp_build_brilinear_rho(perquadf_bld, rho, BRILINEAR_FACTOR,
704 out_lod_ipart, out_lod_fpart);
705 return;
706 }
707 }
708
709 if (0) {
710 lod = lp_build_log2(perquadf_bld, rho);
711 }
712 else {
713 lod = lp_build_fast_log2(perquadf_bld, rho);
714 }
715
716 /* add shader lod bias */
717 if (lod_bias) {
718 lod_bias = lp_build_pack_aos_scalars(bld->gallivm, bld->coord_bld.type,
719 perquadf_bld->type, lod_bias, 0);
720 lod = LLVMBuildFAdd(builder, lod, lod_bias, "shader_lod_bias");
721 }
722 }
723
724 /* add sampler lod bias */
725 if (bld->static_sampler_state->lod_bias_non_zero) {
726 LLVMValueRef sampler_lod_bias =
727 bld->dynamic_state->lod_bias(bld->dynamic_state,
728 bld->gallivm, sampler_unit);
729 sampler_lod_bias = lp_build_broadcast_scalar(perquadf_bld,
730 sampler_lod_bias);
731 lod = LLVMBuildFAdd(builder, lod, sampler_lod_bias, "sampler_lod_bias");
732 }
733
734 /* clamp lod */
735 if (bld->static_sampler_state->apply_max_lod) {
736 LLVMValueRef max_lod =
737 bld->dynamic_state->max_lod(bld->dynamic_state,
738 bld->gallivm, sampler_unit);
739 max_lod = lp_build_broadcast_scalar(perquadf_bld, max_lod);
740
741 lod = lp_build_min(perquadf_bld, lod, max_lod);
742 }
743 if (bld->static_sampler_state->apply_min_lod) {
744 LLVMValueRef min_lod =
745 bld->dynamic_state->min_lod(bld->dynamic_state,
746 bld->gallivm, sampler_unit);
747 min_lod = lp_build_broadcast_scalar(perquadf_bld, min_lod);
748
749 lod = lp_build_max(perquadf_bld, lod, min_lod);
750 }
751 }
752
753 if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
754 if (!(gallivm_debug & GALLIVM_DEBUG_NO_BRILINEAR)) {
755 lp_build_brilinear_lod(perquadf_bld, lod, BRILINEAR_FACTOR,
756 out_lod_ipart, out_lod_fpart);
757 }
758 else {
759 lp_build_ifloor_fract(perquadf_bld, lod, out_lod_ipart, out_lod_fpart);
760 }
761
762 lp_build_name(*out_lod_fpart, "lod_fpart");
763 }
764 else {
765 *out_lod_ipart = lp_build_iround(perquadf_bld, lod);
766 }
767
768 lp_build_name(*out_lod_ipart, "lod_ipart");
769
770 return;
771 }
772
773
774 /**
775 * For PIPE_TEX_MIPFILTER_NEAREST, convert float LOD to integer
776 * mipmap level index.
777 * Note: this is all scalar per quad code.
778 * \param lod_ipart int texture level of detail
 * \param level_out returns the (clamped) integer mipmap level
780 */
781 void
782 lp_build_nearest_mip_level(struct lp_build_sample_context *bld,
783 unsigned texture_unit,
784 LLVMValueRef lod_ipart,
785 LLVMValueRef *level_out)
786 {
787 struct lp_build_context *perquadi_bld = &bld->perquadi_bld;
788 LLVMValueRef first_level, last_level, level;
789
790 first_level = bld->dynamic_state->first_level(bld->dynamic_state,
791 bld->gallivm, texture_unit);
792 last_level = bld->dynamic_state->last_level(bld->dynamic_state,
793 bld->gallivm, texture_unit);
794 first_level = lp_build_broadcast_scalar(perquadi_bld, first_level);
795 last_level = lp_build_broadcast_scalar(perquadi_bld, last_level);
796
797 level = lp_build_add(perquadi_bld, lod_ipart, first_level);
798
799 /* clamp level to legal range of levels */
800 *level_out = lp_build_clamp(perquadi_bld, level, first_level, last_level);
801 }
802
803
804 /**
805 * For PIPE_TEX_MIPFILTER_LINEAR, convert per-quad int LOD(s) to two (per-quad)
806 * (adjacent) mipmap level indexes, and fix up float lod part accordingly.
807 * Later, we'll sample from those two mipmap levels and interpolate between them.
808 */
809 void
810 lp_build_linear_mip_levels(struct lp_build_sample_context *bld,
811 unsigned texture_unit,
812 LLVMValueRef lod_ipart,
813 LLVMValueRef *lod_fpart_inout,
814 LLVMValueRef *level0_out,
815 LLVMValueRef *level1_out)
816 {
817 LLVMBuilderRef builder = bld->gallivm->builder;
818 struct lp_build_context *perquadi_bld = &bld->perquadi_bld;
819 struct lp_build_context *perquadf_bld = &bld->perquadf_bld;
820 LLVMValueRef first_level, last_level;
821 LLVMValueRef clamp_min;
822 LLVMValueRef clamp_max;
823
824 first_level = bld->dynamic_state->first_level(bld->dynamic_state,
825 bld->gallivm, texture_unit);
826 last_level = bld->dynamic_state->last_level(bld->dynamic_state,
827 bld->gallivm, texture_unit);
828 first_level = lp_build_broadcast_scalar(perquadi_bld, first_level);
829 last_level = lp_build_broadcast_scalar(perquadi_bld, last_level);
830
831 *level0_out = lp_build_add(perquadi_bld, lod_ipart, first_level);
832 *level1_out = lp_build_add(perquadi_bld, *level0_out, perquadi_bld->one);
833
834 /*
835 * Clamp both *level0_out and *level1_out to [first_level, last_level], with
836 * the minimum number of comparisons, and zeroing lod_fpart in the extreme
837 * ends in the process.
838 */
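
   /*
    * Concrete example (illustrative): with first_level = 0 and
    * lod_ipart = -1, clamp_min fires, forcing level0 = level1 = 0 and
    * lod_fpart = 0, i.e. a pure level-0 lookup; likewise an lod at or
    * past last_level collapses to a pure last_level lookup via clamp_max.
    */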
839
840 /*
841 * This code (vector select in particular) only works with llvm 3.1
842 * (if there's more than one quad, with x86 backend). Might consider
843 * converting to our lp_bld_logic helpers.
844 */
845 #if HAVE_LLVM < 0x0301
846 assert(perquadi_bld->type.length == 1);
847 #endif
848
849 /* *level0_out < first_level */
850 clamp_min = LLVMBuildICmp(builder, LLVMIntSLT,
851 *level0_out, first_level,
852 "clamp_lod_to_first");
853
854 *level0_out = LLVMBuildSelect(builder, clamp_min,
855 first_level, *level0_out, "");
856
857 *level1_out = LLVMBuildSelect(builder, clamp_min,
858 first_level, *level1_out, "");
859
860 *lod_fpart_inout = LLVMBuildSelect(builder, clamp_min,
861 perquadf_bld->zero, *lod_fpart_inout, "");
862
863 /* *level0_out >= last_level */
864 clamp_max = LLVMBuildICmp(builder, LLVMIntSGE,
865 *level0_out, last_level,
866 "clamp_lod_to_last");
867
868 *level0_out = LLVMBuildSelect(builder, clamp_max,
869 last_level, *level0_out, "");
870
871 *level1_out = LLVMBuildSelect(builder, clamp_max,
872 last_level, *level1_out, "");
873
874 *lod_fpart_inout = LLVMBuildSelect(builder, clamp_max,
875 perquadf_bld->zero, *lod_fpart_inout, "");
876
877 lp_build_name(*level0_out, "texture%u_miplevel0", texture_unit);
878 lp_build_name(*level1_out, "texture%u_miplevel1", texture_unit);
879 lp_build_name(*lod_fpart_inout, "texture%u_mipweight", texture_unit);
880 }
881
882
883 /**
884 * Return pointer to a single mipmap level.
885 * \param level integer mipmap level
886 */
887 LLVMValueRef
888 lp_build_get_mipmap_level(struct lp_build_sample_context *bld,
889 LLVMValueRef level)
890 {
891 LLVMBuilderRef builder = bld->gallivm->builder;
892 LLVMValueRef indexes[2], data_ptr, mip_offset;
893
894 indexes[0] = lp_build_const_int32(bld->gallivm, 0);
895 indexes[1] = level;
896 mip_offset = LLVMBuildGEP(builder, bld->mip_offsets, indexes, 2, "");
897 mip_offset = LLVMBuildLoad(builder, mip_offset, "");
898 data_ptr = LLVMBuildGEP(builder, bld->base_ptr, &mip_offset, 1, "");
899 return data_ptr;
900 }
901
902 /**
903 * Return (per-pixel) offsets to mip levels.
904 * \param level integer mipmap level
905 */
906 LLVMValueRef
907 lp_build_get_mip_offsets(struct lp_build_sample_context *bld,
908 LLVMValueRef level)
909 {
910 LLVMBuilderRef builder = bld->gallivm->builder;
911 LLVMValueRef indexes[2], offsets, offset1;
912
913 indexes[0] = lp_build_const_int32(bld->gallivm, 0);
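
   /*
    * Three layouts are handled below (summary): a single lod for all
    * pixels (broadcast one offset), one lod per quad (gather one offset
    * per quad, then splat it across the quad's four lanes), or one lod
    * per pixel (gather one offset per lane).
    */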
914 if (bld->num_lods == 1) {
915 indexes[1] = level;
916 offset1 = LLVMBuildGEP(builder, bld->mip_offsets, indexes, 2, "");
917 offset1 = LLVMBuildLoad(builder, offset1, "");
918 offsets = lp_build_broadcast_scalar(&bld->int_coord_bld, offset1);
919 }
920 else if (bld->num_lods == bld->coord_bld.type.length / 4) {
921 unsigned i;
922
923 offsets = bld->int_coord_bld.undef;
924 for (i = 0; i < bld->num_lods; i++) {
925 LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
926 LLVMValueRef indexo = lp_build_const_int32(bld->gallivm, 4 * i);
927 indexes[1] = LLVMBuildExtractElement(builder, level, indexi, "");
928 offset1 = LLVMBuildGEP(builder, bld->mip_offsets, indexes, 2, "");
929 offset1 = LLVMBuildLoad(builder, offset1, "");
930 offsets = LLVMBuildInsertElement(builder, offsets, offset1, indexo, "");
931 }
932 offsets = lp_build_swizzle_scalar_aos(&bld->int_coord_bld, offsets, 0, 4);
933 }
934 else {
935 unsigned i;
936
937 assert (bld->num_lods == bld->coord_bld.type.length);
938
939 offsets = bld->int_coord_bld.undef;
940 for (i = 0; i < bld->num_lods; i++) {
941 LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
942 indexes[1] = LLVMBuildExtractElement(builder, level, indexi, "");
943 offset1 = LLVMBuildGEP(builder, bld->mip_offsets, indexes, 2, "");
944 offset1 = LLVMBuildLoad(builder, offset1, "");
945 offsets = LLVMBuildInsertElement(builder, offsets, offset1, indexi, "");
946 }
947 }
948 return offsets;
949 }
950
951
952 /**
953 * Codegen equivalent for u_minify().
954 * Return max(1, base_size >> level);
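 * E.g. base_size = 16: level 2 -> 4, level 5 -> 1 (clamped).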
955 */
956 LLVMValueRef
957 lp_build_minify(struct lp_build_context *bld,
958 LLVMValueRef base_size,
959 LLVMValueRef level)
960 {
961 LLVMBuilderRef builder = bld->gallivm->builder;
962 assert(lp_check_value(bld->type, base_size));
963 assert(lp_check_value(bld->type, level));
964
965 if (level == bld->zero) {
966 /* if we're using mipmap level zero, no minification is needed */
967 return base_size;
968 }
969 else {
970 LLVMValueRef size =
971 LLVMBuildLShr(builder, base_size, level, "minify");
972 assert(bld->type.sign);
973 size = lp_build_max(bld, size, bld->one);
974 return size;
975 }
976 }
977
978
979 /**
980 * Dereference stride_array[mipmap_level] array to get a stride.
981 * Return stride as a vector.
982 */
983 static LLVMValueRef
984 lp_build_get_level_stride_vec(struct lp_build_sample_context *bld,
985 LLVMValueRef stride_array, LLVMValueRef level)
986 {
987 LLVMBuilderRef builder = bld->gallivm->builder;
988 LLVMValueRef indexes[2], stride, stride1;
989 indexes[0] = lp_build_const_int32(bld->gallivm, 0);
990 if (bld->num_lods == 1) {
991 indexes[1] = level;
992 stride1 = LLVMBuildGEP(builder, stride_array, indexes, 2, "");
993 stride1 = LLVMBuildLoad(builder, stride1, "");
994 stride = lp_build_broadcast_scalar(&bld->int_coord_bld, stride1);
995 }
996 else if (bld->num_lods == bld->coord_bld.type.length / 4) {
997 LLVMValueRef stride1;
998 unsigned i;
999
1000 stride = bld->int_coord_bld.undef;
1001 for (i = 0; i < bld->num_lods; i++) {
1002 LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
1003 LLVMValueRef indexo = lp_build_const_int32(bld->gallivm, 4 * i);
1004 indexes[1] = LLVMBuildExtractElement(builder, level, indexi, "");
1005 stride1 = LLVMBuildGEP(builder, stride_array, indexes, 2, "");
1006 stride1 = LLVMBuildLoad(builder, stride1, "");
1007 stride = LLVMBuildInsertElement(builder, stride, stride1, indexo, "");
1008 }
1009 stride = lp_build_swizzle_scalar_aos(&bld->int_coord_bld, stride, 0, 4);
1010 }
1011 else {
1012 LLVMValueRef stride1;
1013 unsigned i;
1014
1015 assert (bld->num_lods == bld->coord_bld.type.length);
1016
1017 stride = bld->int_coord_bld.undef;
1018 for (i = 0; i < bld->coord_bld.type.length; i++) {
1019 LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
1020 indexes[1] = LLVMBuildExtractElement(builder, level, indexi, "");
1021 stride1 = LLVMBuildGEP(builder, stride_array, indexes, 2, "");
1022 stride1 = LLVMBuildLoad(builder, stride1, "");
1023 stride = LLVMBuildInsertElement(builder, stride, stride1, indexi, "");
1024 }
1025 }
1026 return stride;
1027 }
1028
1029
1030 /**
1031 * When sampling a mipmap, we need to compute the width, height, depth
1032 * of the source levels from the level indexes. This helper function
1033 * does that.
1034 */
1035 void
1036 lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld,
1037 LLVMValueRef ilevel,
1038 LLVMValueRef *out_size,
1039 LLVMValueRef *row_stride_vec,
1040 LLVMValueRef *img_stride_vec)
1041 {
1042 const unsigned dims = bld->dims;
1043 LLVMValueRef ilevel_vec;
1044
1045 /*
1046 * Compute width, height, depth at mipmap level 'ilevel'
1047 */
1048 if (bld->num_lods == 1) {
1049 ilevel_vec = lp_build_broadcast_scalar(&bld->int_size_bld, ilevel);
1050 *out_size = lp_build_minify(&bld->int_size_bld, bld->int_size, ilevel_vec);
1051 }
1052 else {
1053 LLVMValueRef int_size_vec;
1054 LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
1055 unsigned num_quads = bld->coord_bld.type.length / 4;
1056 unsigned i;
1057
1058 if (bld->num_lods == num_quads) {
1059 /*
1060 * XXX: this should be #ifndef SANE_INSTRUCTION_SET.
1061 * intel "forgot" the variable shift count instruction until avx2.
1062 * A harmless 8x32 shift gets translated into 32 instructions
1063 * (16 extracts, 8 scalar shifts, 8 inserts), llvm is apparently
1064 * unable to recognize if there are really just 2 different shift
1065 * count values. So do the shift 4-wide before expansion.
1066 */
1067 struct lp_build_context bld4;
1068 struct lp_type type4;
1069
1070 type4 = bld->int_coord_bld.type;
1071 type4.length = 4;
1072
1073 lp_build_context_init(&bld4, bld->gallivm, type4);
1074
1075 if (bld->dims == 1) {
1076 assert(bld->int_size_in_bld.type.length == 1);
1077 int_size_vec = lp_build_broadcast_scalar(&bld4,
1078 bld->int_size);
1079 }
1080 else {
1081 assert(bld->int_size_in_bld.type.length == 4);
1082 int_size_vec = bld->int_size;
1083 }
1084
1085 for (i = 0; i < num_quads; i++) {
1086 LLVMValueRef ileveli;
1087 LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
1088
1089 ileveli = lp_build_extract_broadcast(bld->gallivm,
1090 bld->perquadi_bld.type,
1091 bld4.type,
1092 ilevel,
1093 indexi);
1094 tmp[i] = lp_build_minify(&bld4, int_size_vec, ileveli);
1095 }
1096 /*
1097 * out_size is [w0, h0, d0, _, w1, h1, d1, _, ...] vector for dims > 1,
1098 * [w0, w0, w0, w0, w1, w1, w1, w1, ...] otherwise.
1099 */
1100 *out_size = lp_build_concat(bld->gallivm,
1101 tmp,
1102 bld4.type,
1103 num_quads);
1104 }
1105 else {
      /* FIXME: this is terrible and results in a _huge_ vector
       * (for the dims > 1 case).
       * Should refactor this (together with extract_image_sizes) and do
       * something more useful. Could, for instance, if we have width and
       * height in a 4-wide vector, pack all elements into an 8xi16 vector
       * (on which we can still do useful math) instead of using a 16xi32
       * vector.
       * FIXME: some callers can't handle this yet.
       * For dims == 1 this will create a [w0, w1, w2, w3, ...] vector.
       * For dims > 1 this will create a [w0, h0, d0, _, w1, h1, d1, _, ...] vector.
       */
1117 assert(bld->num_lods == bld->coord_bld.type.length);
1118 if (bld->dims == 1) {
1119 assert(bld->int_size_bld.type.length == 1);
1120 int_size_vec = lp_build_broadcast_scalar(&bld->int_coord_bld,
1121 bld->int_size);
1122 /* vector shift with variable shift count alert... */
1123 *out_size = lp_build_minify(&bld->int_coord_bld, int_size_vec, ilevel);
1124 }
1125 else {
1126 LLVMValueRef ilevel1;
1127 for (i = 0; i < bld->num_lods; i++) {
1128 LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
1129 ilevel1 = lp_build_extract_broadcast(bld->gallivm, bld->int_coord_type,
1130 bld->int_size_in_bld.type, ilevel, indexi);
1131 tmp[i] = bld->int_size;
1132 tmp[i] = lp_build_minify(&bld->int_size_in_bld, tmp[i], ilevel1);
1133 }
         *out_size = lp_build_concat(bld->gallivm,
                                     tmp,
                                     bld->int_size_in_bld.type,
                                     bld->num_lods);
1138 }
1139 }
1140 }
1141
1142 if (dims >= 2) {
1143 *row_stride_vec = lp_build_get_level_stride_vec(bld,
1144 bld->row_stride_array,
1145 ilevel);
1146 }
1147 if (dims == 3 ||
1148 bld->static_texture_state->target == PIPE_TEXTURE_CUBE ||
1149 bld->static_texture_state->target == PIPE_TEXTURE_1D_ARRAY ||
1150 bld->static_texture_state->target == PIPE_TEXTURE_2D_ARRAY) {
1151 *img_stride_vec = lp_build_get_level_stride_vec(bld,
1152 bld->img_stride_array,
1153 ilevel);
1154 }
1155 }
1156
1157
1158 /**
1159 * Extract and broadcast texture size.
1160 *
 * @param size_bld build context of the texture size vector (either
 *        bld->int_size_bld or bld->float_size_bld)
 * @param coord_type type of the coordinate vector (either
 *        bld->int_coord_type or bld->coord_type)
1165 * @param size vector with the texture size (width, height, depth)
1166 */
1167 void
1168 lp_build_extract_image_sizes(struct lp_build_sample_context *bld,
1169 struct lp_build_context *size_bld,
1170 struct lp_type coord_type,
1171 LLVMValueRef size,
1172 LLVMValueRef *out_width,
1173 LLVMValueRef *out_height,
1174 LLVMValueRef *out_depth)
1175 {
1176 const unsigned dims = bld->dims;
1177 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1178 struct lp_type size_type = size_bld->type;
1179
1180 if (bld->num_lods == 1) {
1181 *out_width = lp_build_extract_broadcast(bld->gallivm,
1182 size_type,
1183 coord_type,
1184 size,
1185 LLVMConstInt(i32t, 0, 0));
1186 if (dims >= 2) {
1187 *out_height = lp_build_extract_broadcast(bld->gallivm,
1188 size_type,
1189 coord_type,
1190 size,
1191 LLVMConstInt(i32t, 1, 0));
1192 if (dims == 3) {
1193 *out_depth = lp_build_extract_broadcast(bld->gallivm,
1194 size_type,
1195 coord_type,
1196 size,
1197 LLVMConstInt(i32t, 2, 0));
1198 }
1199 }
1200 }
1201 else {
1202 unsigned num_quads = bld->coord_bld.type.length / 4;
1203
1204 if (dims == 1) {
1205 *out_width = size;
1206 }
1207 else if (bld->num_lods == num_quads) {
1208 *out_width = lp_build_swizzle_scalar_aos(size_bld, size, 0, 4);
1209 if (dims >= 2) {
1210 *out_height = lp_build_swizzle_scalar_aos(size_bld, size, 1, 4);
1211 if (dims == 3) {
1212 *out_depth = lp_build_swizzle_scalar_aos(size_bld, size, 2, 4);
1213 }
1214 }
1215 }
1216 else {
1217 assert(bld->num_lods == bld->coord_type.length);
      *out_width = lp_build_pack_aos_scalars(bld->gallivm, size_type,
                                             coord_type, size, 0);
      if (dims >= 2) {
         *out_height = lp_build_pack_aos_scalars(bld->gallivm, size_type,
                                                 coord_type, size, 1);
         if (dims == 3) {
            *out_depth = lp_build_pack_aos_scalars(bld->gallivm, size_type,
                                                   coord_type, size, 2);
1226 }
1227 }
1228 }
1229 }
1230 }
1231
1232
1233 /**
1234 * Unnormalize coords.
1235 *
 * @param flt_size vector with the float texture size (width, height, depth)
1237 */
1238 void
1239 lp_build_unnormalized_coords(struct lp_build_sample_context *bld,
1240 LLVMValueRef flt_size,
1241 LLVMValueRef *s,
1242 LLVMValueRef *t,
1243 LLVMValueRef *r)
1244 {
1245 const unsigned dims = bld->dims;
1246 LLVMValueRef width;
1247 LLVMValueRef height;
1248 LLVMValueRef depth;
1249
1250 lp_build_extract_image_sizes(bld,
1251 &bld->float_size_bld,
1252 bld->coord_type,
1253 flt_size,
1254 &width,
1255 &height,
1256 &depth);
1257
1258 /* s = s * width, t = t * height */
1259 *s = lp_build_mul(&bld->coord_bld, *s, width);
1260 if (dims >= 2) {
1261 *t = lp_build_mul(&bld->coord_bld, *t, height);
1262 if (dims >= 3) {
1263 *r = lp_build_mul(&bld->coord_bld, *r, depth);
1264 }
1265 }
1266 }
1267
1268
1269 /** Helper used by lp_build_cube_lookup() */
1270 static LLVMValueRef
1271 lp_build_cube_imapos(struct lp_build_context *coord_bld, LLVMValueRef coord)
1272 {
1273 /* ima = +0.5 / abs(coord); */
1274 LLVMValueRef posHalf = lp_build_const_vec(coord_bld->gallivm, coord_bld->type, 0.5);
1275 LLVMValueRef absCoord = lp_build_abs(coord_bld, coord);
1276 LLVMValueRef ima = lp_build_div(coord_bld, posHalf, absCoord);
1277 return ima;
1278 }
1279
1280 /** Helper used by lp_build_cube_lookup() */
1281 static LLVMValueRef
1282 lp_build_cube_imaneg(struct lp_build_context *coord_bld, LLVMValueRef coord)
1283 {
1284 /* ima = -0.5 / abs(coord); */
1285 LLVMValueRef negHalf = lp_build_const_vec(coord_bld->gallivm, coord_bld->type, -0.5);
1286 LLVMValueRef absCoord = lp_build_abs(coord_bld, coord);
1287 LLVMValueRef ima = lp_build_div(coord_bld, negHalf, absCoord);
1288 return ima;
1289 }
1290
1291 /**
1292 * Helper used by lp_build_cube_lookup()
1293 * FIXME: the sign here can also be 0.
1294 * Arithmetically this could definitely make a difference. Either
 * fix the comment or use another (simpler) sign function, not sure
1296 * which one it should be.
1297 * \param sign scalar +1 or -1
1298 * \param coord float vector
1299 * \param ima float vector
1300 */
1301 static LLVMValueRef
1302 lp_build_cube_coord(struct lp_build_context *coord_bld,
1303 LLVMValueRef sign, int negate_coord,
1304 LLVMValueRef coord, LLVMValueRef ima)
1305 {
1306 /* return negate(coord) * ima * sign + 0.5; */
1307 LLVMValueRef half = lp_build_const_vec(coord_bld->gallivm, coord_bld->type, 0.5);
1308 LLVMValueRef res;
1309
1310 assert(negate_coord == +1 || negate_coord == -1);
1311
1312 if (negate_coord == -1) {
1313 coord = lp_build_negate(coord_bld, coord);
1314 }
1315
1316 res = lp_build_mul(coord_bld, coord, ima);
1317 if (sign) {
1318 sign = lp_build_broadcast_scalar(coord_bld, sign);
1319 res = lp_build_mul(coord_bld, res, sign);
1320 }
1321 res = lp_build_add(coord_bld, res, half);
1322
1323 return res;
1324 }
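
/*
 * E.g. for the +X face (major axis s > 0), the scalar path below computes
 * ima = -0.5/|s| and then
 *    face_s = r * ima * sign(s) + 0.5 = 0.5 - 0.5*r/|s|
 *    face_t = t * ima + 0.5           = 0.5 - 0.5*t/|s|
 * which matches the usual cube map (sc, tc, ma) face table for +X.
 */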
1325
1326
1327 /** Helper used by lp_build_cube_lookup()
1328 * Return (major_coord >= 0) ? pos_face : neg_face;
1329 */
1330 static LLVMValueRef
1331 lp_build_cube_face(struct lp_build_sample_context *bld,
1332 LLVMValueRef major_coord,
1333 unsigned pos_face, unsigned neg_face)
1334 {
1335 struct gallivm_state *gallivm = bld->gallivm;
1336 LLVMBuilderRef builder = gallivm->builder;
1337 LLVMValueRef cmp = LLVMBuildFCmp(builder, LLVMRealUGE,
1338 major_coord,
1339 bld->float_bld.zero, "");
1340 LLVMValueRef pos = lp_build_const_int32(gallivm, pos_face);
1341 LLVMValueRef neg = lp_build_const_int32(gallivm, neg_face);
1342 LLVMValueRef res = LLVMBuildSelect(builder, cmp, pos, neg, "");
1343 return res;
1344 }
1345
1346
1347
1348 /**
1349 * Generate code to do cube face selection and compute per-face texcoords.
1350 */
1351 void
1352 lp_build_cube_lookup(struct lp_build_sample_context *bld,
1353 LLVMValueRef s,
1354 LLVMValueRef t,
1355 LLVMValueRef r,
1356 const struct lp_derivatives *derivs, /* optional */
1357 LLVMValueRef *face,
1358 LLVMValueRef *face_s,
1359 LLVMValueRef *face_t,
1360 LLVMValueRef *rho,
1361 boolean need_derivs)
1362 {
1363 struct lp_build_context *coord_bld = &bld->coord_bld;
1364 LLVMBuilderRef builder = bld->gallivm->builder;
1365 struct gallivm_state *gallivm = bld->gallivm;
1366 LLVMValueRef si, ti, ri;
1367
1368 if (1 || coord_bld->type.length > 4) {
1369 /*
1370 * Do per-pixel face selection. We cannot however (as we used to do)
1371 * simply calculate the derivs afterwards (which is very bogus for
1372 * explicit derivs btw) because the values would be "random" when
1373 * not all pixels lie on the same face. So what we do here is just
1374 * calculate the derivatives after scaling the coords by the absolute
1375 * value of the inverse major axis, and essentially do rho calculation
1376 * steps as if it were a 3d texture. This is perfect if all pixels hit
1377 * the same face, but not so great at edges, I believe the max error
1378 * should be sqrt(2) with no_rho_approx or 2 otherwise (essentially measuring
1379 * the 3d distance between 2 points on the cube instead of measuring up/down
1380 * the edge). Still this is possibly a win over just selecting the same face
1381 * for all pixels. Unfortunately, something like that doesn't work for
1382 * explicit derivatives.
1383 * TODO: handle explicit derivatives by transforming them alongside coords
1384 * somehow.
1385 */
1386 struct lp_build_context *cint_bld = &bld->int_coord_bld;
1387 struct lp_type intctype = cint_bld->type;
1388 LLVMValueRef signs, signt, signr, signma;
1389 LLVMValueRef as, at, ar;
1390 LLVMValueRef as_ge_at, maxasat, ar_ge_as_at;
1391 LLVMValueRef snewx, tnewx, snewy, tnewy, snewz, tnewz;
1392 LLVMValueRef tnegi, rnegi;
1393 LLVMValueRef ma, mai, ima;
1394 LLVMValueRef posHalf = lp_build_const_vec(gallivm, coord_bld->type, 0.5);
1395 LLVMValueRef signmask = lp_build_const_int_vec(gallivm, intctype,
1396 1 << (intctype.width - 1));
1397 LLVMValueRef signshift = lp_build_const_int_vec(gallivm, intctype,
1398 intctype.width -1);
1399 LLVMValueRef facex = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_X);
1400 LLVMValueRef facey = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_Y);
1401 LLVMValueRef facez = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_Z);
1402
1403 assert(PIPE_TEX_FACE_NEG_X == PIPE_TEX_FACE_POS_X + 1);
1404 assert(PIPE_TEX_FACE_NEG_Y == PIPE_TEX_FACE_POS_Y + 1);
1405 assert(PIPE_TEX_FACE_NEG_Z == PIPE_TEX_FACE_POS_Z + 1);
1406
1407 /*
1408 * get absolute value (for x/y/z face selection) and sign bit
1409 * (for mirroring minor coords and pos/neg face selection)
1410 * of the original coords.
1411 */
1412 as = lp_build_abs(&bld->coord_bld, s);
1413 at = lp_build_abs(&bld->coord_bld, t);
1414 ar = lp_build_abs(&bld->coord_bld, r);
1415
1416 /*
1417 * major face determination: select x if x > y else select y
1418 * select z if z >= max(x,y) else select previous result
       * if some axes are the same we choose z over y, y over x - the
       * dx10 spec seems to ask for it while OpenGL doesn't care (if we
       * didn't care we could save a select or two by using different
       * compares and doing at_g_as_ar last, since tnewx and tnewz are the
1423 * same).
1424 */
1425 as_ge_at = lp_build_cmp(coord_bld, PIPE_FUNC_GREATER, as, at);
1426 maxasat = lp_build_max(coord_bld, as, at);
1427 ar_ge_as_at = lp_build_cmp(coord_bld, PIPE_FUNC_GEQUAL, ar, maxasat);
1428
1429 if (need_derivs) {
1430 LLVMValueRef ddx_ddy[2], tmp[3], rho_vec;
1431 static const unsigned char swizzle0[] = { /* no-op swizzle */
1432 0, LP_BLD_SWIZZLE_DONTCARE,
1433 LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
1434 };
1435 static const unsigned char swizzle1[] = {
1436 1, LP_BLD_SWIZZLE_DONTCARE,
1437 LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
1438 };
1439 static const unsigned char swizzle01[] = { /* no-op swizzle */
1440 0, 1,
1441 LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
1442 };
1443 static const unsigned char swizzle23[] = {
1444 2, 3,
1445 LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
1446 };
1447 static const unsigned char swizzle02[] = {
1448 0, 2,
1449 LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
1450 };
1451
1452 /*
1453 * scale the s/t/r coords pre-select/mirror so we can calculate
1454 * "reasonable" derivs.
1455 */
1456 ma = lp_build_select(coord_bld, as_ge_at, s, t);
1457 ma = lp_build_select(coord_bld, ar_ge_as_at, r, ma);
1458 ima = lp_build_cube_imapos(coord_bld, ma);
1459 s = lp_build_mul(coord_bld, s, ima);
1460 t = lp_build_mul(coord_bld, t, ima);
1461 r = lp_build_mul(coord_bld, r, ima);
1462
1463 /*
1464 * This isn't quite the same as the "ordinary" (3d deriv) path since we
1465 * know the texture is square which simplifies things (we can omit the
1466 * size mul which happens very early completely here and do it at the
1467 * very end).
1468 */
1469 ddx_ddy[0] = lp_build_packed_ddx_ddy_twocoord(coord_bld, s, t);
1470 ddx_ddy[1] = lp_build_packed_ddx_ddy_onecoord(coord_bld, r);
1471
1472 if (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) {
1473 ddx_ddy[0] = lp_build_mul(coord_bld, ddx_ddy[0], ddx_ddy[0]);
1474 ddx_ddy[1] = lp_build_mul(coord_bld, ddx_ddy[1], ddx_ddy[1]);
1475 }
1476 else {
1477 ddx_ddy[0] = lp_build_abs(coord_bld, ddx_ddy[0]);
1478 ddx_ddy[1] = lp_build_abs(coord_bld, ddx_ddy[1]);
1479 }
1480
1481 tmp[0] = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle01);
1482 tmp[1] = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle23);
1483 tmp[2] = lp_build_swizzle_aos(coord_bld, ddx_ddy[1], swizzle02);
1484
1485 if (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) {
1486 rho_vec = lp_build_add(coord_bld, tmp[0], tmp[1]);
1487 rho_vec = lp_build_add(coord_bld, rho_vec, tmp[2]);
1488 }
1489 else {
1490 rho_vec = lp_build_max(coord_bld, tmp[0], tmp[1]);
1491 rho_vec = lp_build_max(coord_bld, rho_vec, tmp[2]);
1492 }
1493
1494 tmp[0] = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle0);
1495 tmp[1] = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle1);
1496 *rho = lp_build_max(coord_bld, tmp[0], tmp[1]);
1497 }
1498
1499 si = LLVMBuildBitCast(builder, s, lp_build_vec_type(gallivm, intctype), "");
1500 ti = LLVMBuildBitCast(builder, t, lp_build_vec_type(gallivm, intctype), "");
1501 ri = LLVMBuildBitCast(builder, r, lp_build_vec_type(gallivm, intctype), "");
1502 signs = LLVMBuildAnd(builder, si, signmask, "");
1503 signt = LLVMBuildAnd(builder, ti, signmask, "");
1504 signr = LLVMBuildAnd(builder, ri, signmask, "");
1505
1506 /*
1507 * compute all possible new s/t coords
1508 * snewx = signs * -r;
1509 * tnewx = -t;
1510 * snewy = s;
1511 * tnewy = signt * r;
1512 * snewz = signr * s;
1513 * tnewz = -t;
1514 */
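      /*
       * All of the negations and sign applications below are done with
       * integer XOR on the IEEE sign bit: x ^ signmask flips the sign
       * (giving -t and -r), and x ^ sign_bit_of_y multiplies x by the
       * sign of y, which is how signs * -r, signt * r and signr * s
       * are formed.
       */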
1515 tnegi = LLVMBuildXor(builder, ti, signmask, "");
1516 rnegi = LLVMBuildXor(builder, ri, signmask, "");
1517
1518 snewx = LLVMBuildXor(builder, signs, rnegi, "");
1519 tnewx = tnegi;
1520
1521 snewy = si;
1522 tnewy = LLVMBuildXor(builder, signt, ri, "");
1523
1524 snewz = LLVMBuildXor(builder, signr, si, "");
1525 tnewz = tnegi;
1526
1527 /* XXX on x86 unclear if we should cast the values back to float
1528 * or not - on some cpus (nehalem) pblendvb has twice the throughput
1529 * of blendvps though on others there just might be domain
1530 * transition penalties when using it (this depends on what llvm
 * will choose for the bit ops above, so there appears to be no "right way",
1532 * but given the boatload of selects let's just use the int type).
1533 */
1534
1535 /* select/mirror */
1536 if (!need_derivs) {
1537 ma = lp_build_select(coord_bld, as_ge_at, s, t);
1538 }
1539 *face_s = lp_build_select(cint_bld, as_ge_at, snewx, snewy);
1540 *face_t = lp_build_select(cint_bld, as_ge_at, tnewx, tnewy);
1541 *face = lp_build_select(cint_bld, as_ge_at, facex, facey);
1542
1543 if (!need_derivs) {
1544 ma = lp_build_select(coord_bld, ar_ge_as_at, r, ma);
1545 }
1546 *face_s = lp_build_select(cint_bld, ar_ge_as_at, snewz, *face_s);
1547 *face_t = lp_build_select(cint_bld, ar_ge_as_at, tnewz, *face_t);
1548 *face = lp_build_select(cint_bld, ar_ge_as_at, facez, *face);
1549
1550 *face_s = LLVMBuildBitCast(builder, *face_s,
1551 lp_build_vec_type(gallivm, coord_bld->type), "");
1552 *face_t = LLVMBuildBitCast(builder, *face_t,
1553 lp_build_vec_type(gallivm, coord_bld->type), "");
1554
1555 /* add +1 for neg face */
1556 /* XXX with AVX probably want to use another select here -
1557 * as long as we ensure vblendvps gets used we can actually
1558 * skip the comparison and just use sign as a "mask" directly.
1559 */
1560 mai = LLVMBuildBitCast(builder, ma, lp_build_vec_type(gallivm, intctype), "");
1561 signma = LLVMBuildLShr(builder, mai, signshift, "");
1562 *face = LLVMBuildOr(builder, *face, signma, "face");
1563
1564 /* project coords */
1565 if (!need_derivs) {
1566 ima = lp_build_cube_imapos(coord_bld, ma);
1567 *face_s = lp_build_mul(coord_bld, *face_s, ima);
1568 *face_t = lp_build_mul(coord_bld, *face_t, ima);
1569 }
1570
1571 *face_s = lp_build_add(coord_bld, *face_s, posHalf);
1572 *face_t = lp_build_add(coord_bld, *face_t, posHalf);
1573 }
1574
1575 else {
1576 struct lp_build_if_state if_ctx;
1577 LLVMValueRef face_s_var;
1578 LLVMValueRef face_t_var;
1579 LLVMValueRef face_var;
1580 LLVMValueRef arx_ge_ary_arz, ary_ge_arx_arz;
1581 LLVMValueRef shuffles[4];
1582 LLVMValueRef arxy_ge_aryx, arxy_ge_arzz, arxy_ge_arxy_arzz;
1583 LLVMValueRef arxyxy, aryxzz, arxyxy_ge_aryxzz;
1584 LLVMValueRef tmp[4], rxyz, arxyz;
1585 struct lp_build_context *float_bld = &bld->float_bld;
1586
1587 assert(bld->coord_bld.type.length == 4);
1588
1589 tmp[0] = s;
1590 tmp[1] = t;
1591 tmp[2] = r;
1592 rxyz = lp_build_hadd_partial4(&bld->coord_bld, tmp, 3);
1593 arxyz = lp_build_abs(&bld->coord_bld, rxyz);
1594
1595 shuffles[0] = lp_build_const_int32(gallivm, 0);
1596 shuffles[1] = lp_build_const_int32(gallivm, 1);
1597 shuffles[2] = lp_build_const_int32(gallivm, 0);
1598 shuffles[3] = lp_build_const_int32(gallivm, 1);
1599 arxyxy = LLVMBuildShuffleVector(builder, arxyz, arxyz, LLVMConstVector(shuffles, 4), "");
1600 shuffles[0] = lp_build_const_int32(gallivm, 1);
1601 shuffles[1] = lp_build_const_int32(gallivm, 0);
1602 shuffles[2] = lp_build_const_int32(gallivm, 2);
1603 shuffles[3] = lp_build_const_int32(gallivm, 2);
1604 aryxzz = LLVMBuildShuffleVector(builder, arxyz, arxyz, LLVMConstVector(shuffles, 4), "");
1605 arxyxy_ge_aryxzz = lp_build_cmp(&bld->coord_bld, PIPE_FUNC_GEQUAL, arxyxy, aryxzz);
1606
1607 shuffles[0] = lp_build_const_int32(gallivm, 0);
1608 shuffles[1] = lp_build_const_int32(gallivm, 1);
1609 arxy_ge_aryx = LLVMBuildShuffleVector(builder, arxyxy_ge_aryxzz, arxyxy_ge_aryxzz,
1610 LLVMConstVector(shuffles, 2), "");
1611 shuffles[0] = lp_build_const_int32(gallivm, 2);
1612 shuffles[1] = lp_build_const_int32(gallivm, 3);
1613 arxy_ge_arzz = LLVMBuildShuffleVector(builder, arxyxy_ge_aryxzz, arxyxy_ge_aryxzz,
1614 LLVMConstVector(shuffles, 2), "");
1615 arxy_ge_arxy_arzz = LLVMBuildAnd(builder, arxy_ge_aryx, arxy_ge_arzz, "");
1616
1617 arx_ge_ary_arz = LLVMBuildExtractElement(builder, arxy_ge_arxy_arzz,
1618 lp_build_const_int32(gallivm, 0), "");
1619 arx_ge_ary_arz = LLVMBuildICmp(builder, LLVMIntNE, arx_ge_ary_arz,
1620 lp_build_const_int32(gallivm, 0), "");
1621 ary_ge_arx_arz = LLVMBuildExtractElement(builder, arxy_ge_arxy_arzz,
1622 lp_build_const_int32(gallivm, 1), "");
1623 ary_ge_arx_arz = LLVMBuildICmp(builder, LLVMIntNE, ary_ge_arx_arz,
1624 lp_build_const_int32(gallivm, 0), "");
1625 face_s_var = lp_build_alloca(gallivm, bld->coord_bld.vec_type, "face_s_var");
1626 face_t_var = lp_build_alloca(gallivm, bld->coord_bld.vec_type, "face_t_var");
1627 face_var = lp_build_alloca(gallivm, bld->int_bld.vec_type, "face_var");
1628
1629 lp_build_if(&if_ctx, gallivm, arx_ge_ary_arz);
1630 {
1631 /* +/- X face */
1632 LLVMValueRef sign, ima;
1633 si = LLVMBuildExtractElement(builder, rxyz,
1634 lp_build_const_int32(gallivm, 0), "");
1636 sign = lp_build_sgn(float_bld, si);
1637 ima = lp_build_cube_imaneg(coord_bld, s);
1638 *face_s = lp_build_cube_coord(coord_bld, sign, +1, r, ima);
1639 *face_t = lp_build_cube_coord(coord_bld, NULL, +1, t, ima);
1640 *face = lp_build_cube_face(bld, si,
1641 PIPE_TEX_FACE_POS_X,
1642 PIPE_TEX_FACE_NEG_X);
1643 LLVMBuildStore(builder, *face_s, face_s_var);
1644 LLVMBuildStore(builder, *face_t, face_t_var);
1645 LLVMBuildStore(builder, *face, face_var);
1646 }
1647 lp_build_else(&if_ctx);
1648 {
1649 struct lp_build_if_state if_ctx2;
1650
1651 lp_build_if(&if_ctx2, gallivm, ary_ge_arx_arz);
1652 {
1653 LLVMValueRef sign, ima;
1654 /* +/- Y face */
1655 ti = LLVMBuildExtractElement(builder, rxyz,
1656 lp_build_const_int32(gallivm, 1), "");
1657 sign = lp_build_sgn(float_bld, ti);
1658 ima = lp_build_cube_imaneg(coord_bld, t);
1659 *face_s = lp_build_cube_coord(coord_bld, NULL, -1, s, ima);
1660 *face_t = lp_build_cube_coord(coord_bld, sign, -1, r, ima);
1661 *face = lp_build_cube_face(bld, ti,
1662 PIPE_TEX_FACE_POS_Y,
1663 PIPE_TEX_FACE_NEG_Y);
1664 LLVMBuildStore(builder, *face_s, face_s_var);
1665 LLVMBuildStore(builder, *face_t, face_t_var);
1666 LLVMBuildStore(builder, *face, face_var);
1667 }
1668 lp_build_else(&if_ctx2);
1669 {
1670 /* +/- Z face */
1671 LLVMValueRef sign, ima;
1672 ri = LLVMBuildExtractElement(builder, rxyz,
1673 lp_build_const_int32(gallivm, 2), "");
1674 sign = lp_build_sgn(float_bld, ri);
1675 ima = lp_build_cube_imaneg(coord_bld, r);
1676 *face_s = lp_build_cube_coord(coord_bld, sign, -1, s, ima);
1677 *face_t = lp_build_cube_coord(coord_bld, NULL, +1, t, ima);
1678 *face = lp_build_cube_face(bld, ri,
1679 PIPE_TEX_FACE_POS_Z,
1680 PIPE_TEX_FACE_NEG_Z);
1681 LLVMBuildStore(builder, *face_s, face_s_var);
1682 LLVMBuildStore(builder, *face_t, face_t_var);
1683 LLVMBuildStore(builder, *face, face_var);
1684 }
1685 lp_build_endif(&if_ctx2);
1686 }
1687
1688 lp_build_endif(&if_ctx);
1689
1690 *face_s = LLVMBuildLoad(builder, face_s_var, "face_s");
1691 *face_t = LLVMBuildLoad(builder, face_t_var, "face_t");
1692 *face = LLVMBuildLoad(builder, face_var, "face");
1693 *face = lp_build_broadcast_scalar(&bld->int_coord_bld, *face);
1694 }
1695 }
1696
1697
1698 /**
1699 * Compute the partial offset of a pixel block along an arbitrary axis.
1700 *
1701 * @param coord coordinate in pixels
1702 * @param stride number of bytes between rows of successive pixel blocks
 * @param block_length number of pixels in a pixel block along the coordinate
1704 * axis
1705 * @param out_offset resulting relative offset of the pixel block in bytes
1706 * @param out_subcoord resulting sub-block pixel coordinate
1707 */
1708 void
1709 lp_build_sample_partial_offset(struct lp_build_context *bld,
1710 unsigned block_length,
1711 LLVMValueRef coord,
1712 LLVMValueRef stride,
1713 LLVMValueRef *out_offset,
1714 LLVMValueRef *out_subcoord)
1715 {
1716 LLVMBuilderRef builder = bld->gallivm->builder;
1717 LLVMValueRef offset;
1718 LLVMValueRef subcoord;
1719
1720 if (block_length == 1) {
1721 subcoord = bld->zero;
1722 }
1723 else {
1724 /*
1725 * Pixel blocks have power of two dimensions. LLVM should convert the
1726 * rem/div to bit arithmetic.
1727 * TODO: Verify this.
 * It does indeed, BUT it transforms it to scalar (and back) when doing so
1729 * (using roughly extract, shift/and, mov, unpack) (llvm 2.7).
1730 * The generated code looks seriously unfunny and is quite expensive.
1731 */
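      /*
       * E.g. block_length = 4 gives block_shift = 2 and block_mask = 3:
       * coord 13 -> subcoord 1, block coord 3 (then multiplied by stride).
       */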
1732 #if 0
1733 LLVMValueRef block_width = lp_build_const_int_vec(bld->type, block_length);
1734 subcoord = LLVMBuildURem(builder, coord, block_width, "");
1735 coord = LLVMBuildUDiv(builder, coord, block_width, "");
1736 #else
1737 unsigned logbase2 = util_logbase2(block_length);
1738 LLVMValueRef block_shift = lp_build_const_int_vec(bld->gallivm, bld->type, logbase2);
1739 LLVMValueRef block_mask = lp_build_const_int_vec(bld->gallivm, bld->type, block_length - 1);
1740 subcoord = LLVMBuildAnd(builder, coord, block_mask, "");
1741 coord = LLVMBuildLShr(builder, coord, block_shift, "");
1742 #endif
1743 }
1744
1745 offset = lp_build_mul(bld, coord, stride);
1746
1747 assert(out_offset);
1748 assert(out_subcoord);
1749
1750 *out_offset = offset;
1751 *out_subcoord = subcoord;
1752 }
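
/*
 * E.g. for a plain 2D RGBA8 texture (1x1 pixel blocks, block.bits = 32),
 * lp_build_sample_offset() below reduces to
 *    offset = y * y_stride + x * 4   (with out_i = out_j = 0),
 * plus an extra z * z_stride term for 3D/array/cube resources.
 */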
1753
1754
1755 /**
1756 * Compute the offset of a pixel block.
1757 *
1758 * x, y, z, y_stride, z_stride are vectors, and they refer to pixels.
1759 *
1760 * Returns the relative offset and i,j sub-block coordinates
1761 */
1762 void
1763 lp_build_sample_offset(struct lp_build_context *bld,
1764 const struct util_format_description *format_desc,
1765 LLVMValueRef x,
1766 LLVMValueRef y,
1767 LLVMValueRef z,
1768 LLVMValueRef y_stride,
1769 LLVMValueRef z_stride,
1770 LLVMValueRef *out_offset,
1771 LLVMValueRef *out_i,
1772 LLVMValueRef *out_j)
1773 {
1774 LLVMValueRef x_stride;
1775 LLVMValueRef offset;
1776
1777 x_stride = lp_build_const_vec(bld->gallivm, bld->type,
1778 format_desc->block.bits/8);
1779
1780 lp_build_sample_partial_offset(bld,
1781 format_desc->block.width,
1782 x, x_stride,
1783 &offset, out_i);
1784
1785 if (y && y_stride) {
1786 LLVMValueRef y_offset;
1787 lp_build_sample_partial_offset(bld,
1788 format_desc->block.height,
1789 y, y_stride,
1790 &y_offset, out_j);
1791 offset = lp_build_add(bld, offset, y_offset);
1792 }
1793 else {
1794 *out_j = bld->zero;
1795 }
1796
1797 if (z && z_stride) {
1798 LLVMValueRef z_offset;
1799 LLVMValueRef k;
1800 lp_build_sample_partial_offset(bld,
1801 1, /* pixel blocks are always 2D */
1802 z, z_stride,
1803 &z_offset, &k);
1804 offset = lp_build_add(bld, offset, z_offset);
1805 }
1806
1807 *out_offset = offset;
1808 }