gallivm: do per-element lod for lod bias and explicit derivs too
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_sample.c
/**************************************************************************
 *
 * Copyright 2009 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/

/**
 * @file
 * Texture sampling -- common code.
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 */

#include "pipe/p_defines.h"
#include "pipe/p_state.h"
#include "util/u_format.h"
#include "util/u_math.h"
#include "lp_bld_arit.h"
#include "lp_bld_const.h"
#include "lp_bld_debug.h"
#include "lp_bld_printf.h"
#include "lp_bld_flow.h"
#include "lp_bld_sample.h"
#include "lp_bld_swizzle.h"
#include "lp_bld_type.h"
#include "lp_bld_logic.h"
#include "lp_bld_pack.h"
#include "lp_bld_quad.h"
#include "lp_bld_bitarit.h"


/*
 * Bri-linear factor. Should be greater than one.
 */
#define BRILINEAR_FACTOR 2

/**
 * Does the given texture wrap mode allow sampling the texture border color?
 * XXX maybe move this into gallium util code.
 */
boolean
lp_sampler_wrap_mode_uses_border_color(unsigned mode,
                                       unsigned min_img_filter,
                                       unsigned mag_img_filter)
{
   switch (mode) {
   case PIPE_TEX_WRAP_REPEAT:
   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
   case PIPE_TEX_WRAP_MIRROR_REPEAT:
   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
      return FALSE;
   case PIPE_TEX_WRAP_CLAMP:
   case PIPE_TEX_WRAP_MIRROR_CLAMP:
      if (min_img_filter == PIPE_TEX_FILTER_NEAREST &&
          mag_img_filter == PIPE_TEX_FILTER_NEAREST) {
         return FALSE;
      } else {
         return TRUE;
      }
   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
      return TRUE;
   default:
      assert(0 && "unexpected wrap mode");
      return FALSE;
   }
}


/**
 * Initialize lp_sampler_static_texture_state object with the gallium
 * texture/sampler_view state (this contains the parts which are
 * considered static).
 */
void
lp_sampler_static_texture_state(struct lp_static_texture_state *state,
                                const struct pipe_sampler_view *view)
{
   const struct pipe_resource *texture;

   memset(state, 0, sizeof *state);

   if (!view || !view->texture)
      return;

   texture = view->texture;

   state->format = view->format;
   state->swizzle_r = view->swizzle_r;
   state->swizzle_g = view->swizzle_g;
   state->swizzle_b = view->swizzle_b;
   state->swizzle_a = view->swizzle_a;

   state->target = texture->target;
   state->pot_width = util_is_power_of_two(texture->width0);
   state->pot_height = util_is_power_of_two(texture->height0);
   state->pot_depth = util_is_power_of_two(texture->depth0);
   state->level_zero_only = !view->u.tex.last_level;

   /*
    * the layer / element / level parameters are all either dynamic
    * state or handled transparently wrt execution.
    */
}


/**
 * Initialize lp_sampler_static_sampler_state object with the gallium sampler
 * state (this contains the parts which are considered static).
 */
void
lp_sampler_static_sampler_state(struct lp_static_sampler_state *state,
                                const struct pipe_sampler_state *sampler)
{
   memset(state, 0, sizeof *state);

   if (!sampler)
      return;

   /*
    * We don't copy sampler state over unless it is actually enabled, to avoid
    * spurious recompiles, as the sampler static state is part of the shader
    * key.
    *
    * Ideally the state tracker or cso_cache module would make all state
    * canonical, but until that happens it's better to be safe than sorry here.
    *
    * XXX: Actually there's much more that can be done here, especially
    * regarding 1D/2D/3D/CUBE textures, wrap modes, etc.
    */

   state->wrap_s = sampler->wrap_s;
   state->wrap_t = sampler->wrap_t;
   state->wrap_r = sampler->wrap_r;
   state->min_img_filter = sampler->min_img_filter;
   state->mag_img_filter = sampler->mag_img_filter;

   if (sampler->max_lod > 0.0f) {
      state->min_mip_filter = sampler->min_mip_filter;
   } else {
      state->min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
   }

   if (state->min_mip_filter != PIPE_TEX_MIPFILTER_NONE) {
      if (sampler->lod_bias != 0.0f) {
         state->lod_bias_non_zero = 1;
      }

      /* If min_lod == max_lod we can greatly simplify mipmap selection.
       * This is a case that occurs during automatic mipmap generation.
       */
      if (sampler->min_lod == sampler->max_lod) {
         state->min_max_lod_equal = 1;
      } else {
         if (sampler->min_lod > 0.0f) {
            state->apply_min_lod = 1;
         }

         /*
          * XXX this won't do anything with the mesa state tracker, which
          * always sets max_lod to no more than the number of actually
          * present mip levels...
          */
         if (sampler->max_lod < (PIPE_MAX_TEXTURE_LEVELS - 1)) {
            state->apply_max_lod = 1;
         }
      }
   }

   state->compare_mode = sampler->compare_mode;
   if (sampler->compare_mode != PIPE_TEX_COMPARE_NONE) {
      state->compare_func = sampler->compare_func;
   }

   state->normalized_coords = sampler->normalized_coords;
}


/**
 * Generate code to compute coordinate gradient (rho).
 * \param derivs  partial derivatives of (s, t, r, q) with respect to X and Y
 *
 * The resulting rho has bld->levelf format (per quad or per element).
 */
static LLVMValueRef
lp_build_rho(struct lp_build_sample_context *bld,
             unsigned texture_unit,
             LLVMValueRef s,
             LLVMValueRef t,
             LLVMValueRef r,
             LLVMValueRef cube_rho,
             const struct lp_derivatives *derivs)
{
   struct gallivm_state *gallivm = bld->gallivm;
   struct lp_build_context *int_size_bld = &bld->int_size_in_bld;
   struct lp_build_context *float_size_bld = &bld->float_size_in_bld;
   struct lp_build_context *float_bld = &bld->float_bld;
   struct lp_build_context *coord_bld = &bld->coord_bld;
   struct lp_build_context *levelf_bld = &bld->levelf_bld;
   const unsigned dims = bld->dims;
   LLVMValueRef ddx_ddy[2];
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
   LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
   LLVMValueRef index1 = LLVMConstInt(i32t, 1, 0);
   LLVMValueRef index2 = LLVMConstInt(i32t, 2, 0);
   LLVMValueRef rho_vec;
   LLVMValueRef int_size, float_size;
   LLVMValueRef rho;
   LLVMValueRef first_level, first_level_vec;
   unsigned length = coord_bld->type.length;
   unsigned num_quads = length / 4;
   boolean rho_per_quad = levelf_bld->type.length != length;
   unsigned i;
   LLVMValueRef i32undef = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
   LLVMValueRef rho_xvec, rho_yvec;

   /* Note that all simplified calculations will only work for isotropic filtering */

   /*
    * rho calcs are always per quad except for explicit derivs (excluding
    * the messy cube maps for now) when requested.
    */

   first_level = bld->dynamic_state->first_level(bld->dynamic_state,
                                                 bld->gallivm, texture_unit);
   first_level_vec = lp_build_broadcast_scalar(int_size_bld, first_level);
   int_size = lp_build_minify(int_size_bld, bld->int_size, first_level_vec);
   float_size = lp_build_int_to_float(float_size_bld, int_size);

   if (cube_rho) {
      LLVMValueRef cubesize;
      LLVMValueRef index0 = lp_build_const_int32(gallivm, 0);

      /*
       * The cube map code already did everything except the size mul and
       * per-quad extraction. Luckily cube maps are always square!
       */
      if (rho_per_quad) {
         rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
                                         levelf_bld->type, cube_rho, 0);
      }
      else {
         rho = lp_build_swizzle_scalar_aos(coord_bld, cube_rho, 0, 4);
      }
      if (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) {
         rho = lp_build_sqrt(levelf_bld, rho);
      }
      /* Could optimize this for single quad just skip the broadcast */
      cubesize = lp_build_extract_broadcast(gallivm, bld->float_size_in_type,
                                            levelf_bld->type, float_size, index0);
      rho = lp_build_mul(levelf_bld, cubesize, rho);
   }
   else if (derivs && !(bld->static_texture_state->target == PIPE_TEXTURE_CUBE)) {
      LLVMValueRef ddmax[3], ddx[3], ddy[3];
      for (i = 0; i < dims; i++) {
         LLVMValueRef floatdim;
         LLVMValueRef indexi = lp_build_const_int32(gallivm, i);

         floatdim = lp_build_extract_broadcast(gallivm, bld->float_size_in_type,
                                               coord_bld->type, float_size, indexi);

         if ((gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) && (dims > 1)) {
            ddx[i] = lp_build_mul(coord_bld, floatdim, derivs->ddx[i]);
            ddy[i] = lp_build_mul(coord_bld, floatdim, derivs->ddy[i]);
            ddx[i] = lp_build_mul(coord_bld, ddx[i], ddx[i]);
            ddy[i] = lp_build_mul(coord_bld, ddy[i], ddy[i]);
         }
         else {
            LLVMValueRef tmpx, tmpy;
            tmpx = lp_build_abs(coord_bld, derivs->ddx[i]);
            tmpy = lp_build_abs(coord_bld, derivs->ddy[i]);
            ddmax[i] = lp_build_max(coord_bld, tmpx, tmpy);
            ddmax[i] = lp_build_mul(coord_bld, floatdim, ddmax[i]);
         }
      }
      if ((gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) && (dims > 1)) {
         rho_xvec = lp_build_add(coord_bld, ddx[0], ddx[1]);
         rho_yvec = lp_build_add(coord_bld, ddy[0], ddy[1]);
         if (dims > 2) {
            rho_xvec = lp_build_add(coord_bld, rho_xvec, ddx[2]);
            rho_yvec = lp_build_add(coord_bld, rho_yvec, ddy[2]);
         }
         rho = lp_build_max(coord_bld, rho_xvec, rho_yvec);

         if (rho_per_quad) {
            /*
             * Note that for this case (no per-pixel lod) we could reduce the
             * math more (at some shuffle cost), but for now only do the sqrt
             * after packing, otherwise we'd also need code different from the
             * per-pixel lod case.
             */
            rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
                                            levelf_bld->type, rho, 0);
         }
         rho = lp_build_sqrt(levelf_bld, rho);

      }
      else {
         rho = ddmax[0];
         if (dims > 1) {
            rho = lp_build_max(coord_bld, rho, ddmax[1]);
            if (dims > 2) {
               rho = lp_build_max(coord_bld, rho, ddmax[2]);
            }
         }
         if (rho_per_quad) {
            /*
             * rho_vec contains per-pixel rho, convert to scalar per quad.
             */
            rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
                                            levelf_bld->type, rho, 0);
         }
      }
   }
   else {
      /*
       * This looks all a bit complex, but it's not that bad
       * (the shuffle code makes it look worse than it is).
       * Still, might not be ideal for all cases.
       */
      static const unsigned char swizzle0[] = { /* no-op swizzle */
         0, LP_BLD_SWIZZLE_DONTCARE,
         LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
      };
      static const unsigned char swizzle1[] = {
         1, LP_BLD_SWIZZLE_DONTCARE,
         LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
      };
      static const unsigned char swizzle2[] = {
         2, LP_BLD_SWIZZLE_DONTCARE,
         LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
      };

      if (dims < 2) {
         ddx_ddy[0] = lp_build_packed_ddx_ddy_onecoord(coord_bld, s);
      }
      else if (dims >= 2) {
         ddx_ddy[0] = lp_build_packed_ddx_ddy_twocoord(coord_bld, s, t);
         if (dims > 2) {
            ddx_ddy[1] = lp_build_packed_ddx_ddy_onecoord(coord_bld, r);
         }
      }

      if ((gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) && (dims > 1)) {
         static const unsigned char swizzle01[] = { /* no-op swizzle */
            0, 1,
            LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
         };
         static const unsigned char swizzle23[] = {
            2, 3,
            LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
         };
         LLVMValueRef ddx_ddys, ddx_ddyt, floatdim, shuffles[LP_MAX_VECTOR_LENGTH / 4];

         for (i = 0; i < num_quads; i++) {
            shuffles[i*4+0] = shuffles[i*4+1] = index0;
            shuffles[i*4+2] = shuffles[i*4+3] = index1;
         }
         floatdim = LLVMBuildShuffleVector(builder, float_size, float_size,
                                           LLVMConstVector(shuffles, length), "");
         ddx_ddy[0] = lp_build_mul(coord_bld, ddx_ddy[0], floatdim);
         ddx_ddy[0] = lp_build_mul(coord_bld, ddx_ddy[0], ddx_ddy[0]);
         ddx_ddys = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle01);
         ddx_ddyt = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle23);
         rho_vec = lp_build_add(coord_bld, ddx_ddys, ddx_ddyt);

         if (dims > 2) {
            static const unsigned char swizzle02[] = {
               0, 2,
               LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
            };
            floatdim = lp_build_extract_broadcast(gallivm, bld->float_size_in_type,
                                                  coord_bld->type, float_size, index2);
            ddx_ddy[1] = lp_build_mul(coord_bld, ddx_ddy[1], floatdim);
            ddx_ddy[1] = lp_build_mul(coord_bld, ddx_ddy[1], ddx_ddy[1]);
            ddx_ddy[1] = lp_build_swizzle_aos(coord_bld, ddx_ddy[1], swizzle02);
            rho_vec = lp_build_add(coord_bld, rho_vec, ddx_ddy[1]);
         }

         rho_xvec = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle0);
         rho_yvec = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle1);
         rho = lp_build_max(coord_bld, rho_xvec, rho_yvec);

         if (rho_per_quad) {
            rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
                                            levelf_bld->type, rho, 0);
         }
         else {
            /*
             * On some cpus with half-speed 8-wide sqrt (e.g. SNB but not IVB)
             * doing pack/sqrt/unpack/swizzle might be better for the 8-wide
             * case; the same is true for cpus having faster scalars than
             * 4-wide vecs for the 4-wide case (where pack/unpack would be
             * no-ops anyway). (The same really applies to the cube_rho case
             * above.)
             */
            rho = lp_build_swizzle_scalar_aos(coord_bld, rho, 0, 4);
         }
         rho = lp_build_sqrt(levelf_bld, rho);
      }
      else {
         ddx_ddy[0] = lp_build_abs(coord_bld, ddx_ddy[0]);
         if (dims > 2) {
            ddx_ddy[1] = lp_build_abs(coord_bld, ddx_ddy[1]);
         }

         if (dims < 2) {
            rho_xvec = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle0);
            rho_yvec = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle2);
         }
         else if (dims == 2) {
            static const unsigned char swizzle02[] = {
               0, 2,
               LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
            };
            static const unsigned char swizzle13[] = {
               1, 3,
               LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
            };
            rho_xvec = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle02);
            rho_yvec = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle13);
         }
         else {
            LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH];
            LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH];
            assert(dims == 3);
            for (i = 0; i < num_quads; i++) {
               shuffles1[4*i + 0] = lp_build_const_int32(gallivm, 4*i);
               shuffles1[4*i + 1] = lp_build_const_int32(gallivm, 4*i + 2);
               shuffles1[4*i + 2] = lp_build_const_int32(gallivm, length + 4*i);
               shuffles1[4*i + 3] = i32undef;
               shuffles2[4*i + 0] = lp_build_const_int32(gallivm, 4*i + 1);
               shuffles2[4*i + 1] = lp_build_const_int32(gallivm, 4*i + 3);
               shuffles2[4*i + 2] = lp_build_const_int32(gallivm, length + 4*i + 2);
               shuffles2[4*i + 3] = i32undef;
            }
            rho_xvec = LLVMBuildShuffleVector(builder, ddx_ddy[0], ddx_ddy[1],
                                              LLVMConstVector(shuffles1, length), "");
            rho_yvec = LLVMBuildShuffleVector(builder, ddx_ddy[0], ddx_ddy[1],
                                              LLVMConstVector(shuffles2, length), "");
         }

         rho_vec = lp_build_max(coord_bld, rho_xvec, rho_yvec);

         if (bld->coord_type.length > 4) {
            /* expand size to each quad */
            if (dims > 1) {
               /* could use some broadcast_vector helper for this? */
               LLVMValueRef src[LP_MAX_VECTOR_LENGTH/4];
               for (i = 0; i < num_quads; i++) {
                  src[i] = float_size;
               }
               float_size = lp_build_concat(bld->gallivm, src, float_size_bld->type, num_quads);
            }
            else {
               float_size = lp_build_broadcast_scalar(coord_bld, float_size);
            }
            rho_vec = lp_build_mul(coord_bld, rho_vec, float_size);

            if (dims <= 1) {
               rho = rho_vec;
            }
            else {
               if (dims >= 2) {
                  LLVMValueRef rho_s, rho_t, rho_r;

                  rho_s = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle0);
                  rho_t = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle1);

                  rho = lp_build_max(coord_bld, rho_s, rho_t);

                  if (dims >= 3) {
                     rho_r = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle2);
                     rho = lp_build_max(coord_bld, rho, rho_r);
                  }
               }
            }
            if (rho_per_quad) {
               rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
                                               levelf_bld->type, rho, 0);
            }
            else {
               rho = lp_build_swizzle_scalar_aos(coord_bld, rho, 0, 4);
            }
         }
         else {
            if (dims <= 1) {
               rho_vec = LLVMBuildExtractElement(builder, rho_vec, index0, "");
            }
            rho_vec = lp_build_mul(float_size_bld, rho_vec, float_size);

            if (dims <= 1) {
               rho = rho_vec;
            }
            else {
               if (dims >= 2) {
                  LLVMValueRef rho_s, rho_t, rho_r;

                  rho_s = LLVMBuildExtractElement(builder, rho_vec, index0, "");
                  rho_t = LLVMBuildExtractElement(builder, rho_vec, index1, "");

                  rho = lp_build_max(float_bld, rho_s, rho_t);

                  if (dims >= 3) {
                     rho_r = LLVMBuildExtractElement(builder, rho_vec, index2, "");
                     rho = lp_build_max(float_bld, rho, rho_r);
                  }
               }
            }
            if (!rho_per_quad) {
               rho = lp_build_broadcast_scalar(levelf_bld, rho);
            }
         }
      }
   }

   return rho;
}
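

/*
 * For reference, a scalar sketch of the rho computation above for the
 * common case (implicit derivatives, default max-based approximation,
 * i.e. without GALLIVM_DEBUG_NO_RHO_APPROX); purely illustrative, the
 * names are made up and the snippet is not part of the build:
 */
#if 0
static float
rho_ref(float dsdx, float dsdy, float dtdx, float dtdy,
        float width, float height)
{
   /* scale each derivative pair into texel space, keep the larger one */
   float rho_s = MAX2(fabsf(dsdx), fabsf(dsdy)) * width;
   float rho_t = MAX2(fabsf(dtdx), fabsf(dtdy)) * height;
   /* isotropic approximation: max over the coordinate axes */
   return MAX2(rho_s, rho_t);
}
#endif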


/*
 * Bri-linear lod computation
 *
 * Use a piece-wise linear approximation of log2 such that:
 * - round to nearest, for values in the neighborhood of -1, 0, 1, 2, etc.
 * - linear approximation for values in the neighborhood of 0.5, 1.5, etc.,
 *   with the steepness specified in 'factor'
 * - exact result for 0.5, 1.5, etc.
 *
 *
 *   1.0 -              /----*
 *                     /
 *                    /
 *                   /
 *   0.5 -          *
 *                 /
 *                /
 *               /
 *   0.0 - *----/
 *
 *         |                 |
 *        2^0               2^1
 *
 * This is a technique also commonly used in hardware:
 * - http://ixbtlabs.com/articles2/gffx/nv40-rx800-3.html
 *
 * TODO: For correctness, this should only be applied when texture is known to
 * have regular mipmaps, i.e., mipmaps derived from the base level.
 *
 * TODO: This could be done in fixed point, where applicable.
 */
static void
lp_build_brilinear_lod(struct lp_build_context *bld,
                       LLVMValueRef lod,
                       double factor,
                       LLVMValueRef *out_lod_ipart,
                       LLVMValueRef *out_lod_fpart)
{
   LLVMValueRef lod_fpart;
   double pre_offset = (factor - 0.5)/factor - 0.5;
   double post_offset = 1 - factor;

   if (0) {
      lp_build_printf(bld->gallivm, "lod = %f\n", lod);
   }

   lod = lp_build_add(bld, lod,
                      lp_build_const_vec(bld->gallivm, bld->type, pre_offset));

   lp_build_ifloor_fract(bld, lod, out_lod_ipart, &lod_fpart);

   lod_fpart = lp_build_mul(bld, lod_fpart,
                            lp_build_const_vec(bld->gallivm, bld->type, factor));

   lod_fpart = lp_build_add(bld, lod_fpart,
                            lp_build_const_vec(bld->gallivm, bld->type, post_offset));

   /*
    * It's not necessary to clamp lod_fpart since:
    * - the above expression will never produce numbers greater than one.
    * - the mip filtering branch is only taken if lod_fpart is positive
    */

   *out_lod_fpart = lod_fpart;

   if (0) {
      lp_build_printf(bld->gallivm, "lod_ipart = %i\n", *out_lod_ipart);
      lp_build_printf(bld->gallivm, "lod_fpart = %f\n\n", *out_lod_fpart);
   }
}
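

/*
 * Scalar sketch of the mapping above (illustrative only, not part of
 * the build):
 */
#if 0
static void
brilinear_lod_ref(float lod, float factor, int *ipart, float *fpart)
{
   float f;
   lod += (factor - 0.5)/factor - 0.5;      /* pre_offset */
   *ipart = (int)floorf(lod);
   f = lod - floorf(lod);
   *fpart = f * factor + (1 - factor);      /* post scale and offset */
}
#endif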


/*
 * Combined log2 and brilinear lod computation.
 *
 * It's identical to calling lp_build_fast_log2() followed by
 * lp_build_brilinear_lod() above, but by combining the two we can compute
 * the integer and fractional parts independently.
 */
static void
lp_build_brilinear_rho(struct lp_build_context *bld,
                       LLVMValueRef rho,
                       double factor,
                       LLVMValueRef *out_lod_ipart,
                       LLVMValueRef *out_lod_fpart)
{
   LLVMValueRef lod_ipart;
   LLVMValueRef lod_fpart;

   const double pre_factor = (2*factor - 0.5)/(M_SQRT2*factor);
   const double post_offset = 1 - 2*factor;

   assert(bld->type.floating);

   assert(lp_check_value(bld->type, rho));

   /*
    * The pre factor will make the intersections with the exact powers of two
    * happen precisely where we want them to be, which means that the integer
    * part will not need any post adjustments.
    */
   rho = lp_build_mul(bld, rho,
                      lp_build_const_vec(bld->gallivm, bld->type, pre_factor));

   /* ipart = ifloor(log2(rho)) */
   lod_ipart = lp_build_extract_exponent(bld, rho, 0);

   /* fpart = rho / 2**ipart */
   lod_fpart = lp_build_extract_mantissa(bld, rho);

   lod_fpart = lp_build_mul(bld, lod_fpart,
                            lp_build_const_vec(bld->gallivm, bld->type, factor));

   lod_fpart = lp_build_add(bld, lod_fpart,
                            lp_build_const_vec(bld->gallivm, bld->type, post_offset));

   /*
    * Like lp_build_brilinear_lod, it's not necessary to clamp lod_fpart since:
    * - the above expression will never produce numbers greater than one.
    * - the mip filtering branch is only taken if lod_fpart is positive
    */

   *out_lod_ipart = lod_ipart;
   *out_lod_fpart = lod_fpart;
}
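

/*
 * Scalar sketch of the combined computation above (illustrative only,
 * not part of the build); frexpf() stands in for the exponent/mantissa
 * extraction:
 */
#if 0
static void
brilinear_rho_ref(float rho, float factor, int *ipart, float *fpart)
{
   int e;
   float m;
   rho *= (2*factor - 0.5)/(M_SQRT2*factor);   /* pre_factor */
   m = frexpf(rho, &e);        /* rho = m * 2^e, m in [0.5, 1) */
   *ipart = e - 1;             /* floor(log2(rho)) */
   *fpart = 2*m * factor + (1 - 2*factor);     /* mantissa in [1, 2) */
}
#endif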


/**
 * Generate code to compute texture level of detail (lambda).
 * \param derivs  partial derivatives of (s, t, r, q) with respect to X and Y
 * \param lod_bias  optional float vector with the shader lod bias
 * \param explicit_lod  optional float vector with the explicit lod
 *
 * The resulting lod has bld->levelf format (scalar per quad or per
 * element); if it is per quad, only the first lod_bias / explicit_lod
 * value of each quad is used.
 */
void
lp_build_lod_selector(struct lp_build_sample_context *bld,
                      unsigned texture_unit,
                      unsigned sampler_unit,
                      LLVMValueRef s,
                      LLVMValueRef t,
                      LLVMValueRef r,
                      LLVMValueRef cube_rho,
                      const struct lp_derivatives *derivs,
                      LLVMValueRef lod_bias, /* optional */
                      LLVMValueRef explicit_lod, /* optional */
                      unsigned mip_filter,
                      LLVMValueRef *out_lod_ipart,
                      LLVMValueRef *out_lod_fpart)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   struct lp_build_context *levelf_bld = &bld->levelf_bld;
   LLVMValueRef lod;

   *out_lod_ipart = bld->leveli_bld.zero;
   *out_lod_fpart = levelf_bld->zero;

   if (bld->static_sampler_state->min_max_lod_equal) {
      /* User is forcing sampling from a particular mipmap level.
       * This is hit during mipmap generation.
       */
      LLVMValueRef min_lod =
         bld->dynamic_state->min_lod(bld->dynamic_state,
                                     bld->gallivm, sampler_unit);

      lod = lp_build_broadcast_scalar(levelf_bld, min_lod);
   }
   else {
      if (explicit_lod) {
         if (bld->num_lods != bld->coord_type.length)
            lod = lp_build_pack_aos_scalars(bld->gallivm, bld->coord_bld.type,
                                            levelf_bld->type, explicit_lod, 0);
         else
            lod = explicit_lod;
      }
      else {
         LLVMValueRef rho;

         rho = lp_build_rho(bld, texture_unit, s, t, r, cube_rho, derivs);

         /*
          * Compute lod = log2(rho)
          */

         if (!lod_bias &&
             !bld->static_sampler_state->lod_bias_non_zero &&
             !bld->static_sampler_state->apply_max_lod &&
             !bld->static_sampler_state->apply_min_lod) {
            /*
             * Special case when there are no post-log2 adjustments, which
             * saves instructions by keeping the integer and fractional lod
             * computations separate from the start.
             */

            if (mip_filter == PIPE_TEX_MIPFILTER_NONE ||
                mip_filter == PIPE_TEX_MIPFILTER_NEAREST) {
               /*
                * FIXME: this is not entirely correct, as out_lod_ipart is used
                * both for mip level determination as well as mag/min switchover
                * point (if different min/mag filters are used). In particular,
                * lod values between [-0.5,0] (rho between [sqrt(2), 1.0]) will
                * incorrectly use min filter instead of mag (the non-optimized
                * calculation further down has exactly the same problem).
                */
               *out_lod_ipart = lp_build_ilog2(levelf_bld, rho);
               *out_lod_fpart = levelf_bld->zero;
               return;
            }
            if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR &&
                !(gallivm_debug & GALLIVM_DEBUG_NO_BRILINEAR)) {
               lp_build_brilinear_rho(levelf_bld, rho, BRILINEAR_FACTOR,
                                      out_lod_ipart, out_lod_fpart);
               return;
            }
         }

         if (0) {
            lod = lp_build_log2(levelf_bld, rho);
         }
         else {
            lod = lp_build_fast_log2(levelf_bld, rho);
         }

         /* add shader lod bias */
         if (lod_bias) {
            if (bld->num_lods != bld->coord_type.length)
               lod_bias = lp_build_pack_aos_scalars(bld->gallivm, bld->coord_bld.type,
                                                    levelf_bld->type, lod_bias, 0);
            lod = LLVMBuildFAdd(builder, lod, lod_bias, "shader_lod_bias");
         }
      }

      /* add sampler lod bias */
      if (bld->static_sampler_state->lod_bias_non_zero) {
         LLVMValueRef sampler_lod_bias =
            bld->dynamic_state->lod_bias(bld->dynamic_state,
                                         bld->gallivm, sampler_unit);
         sampler_lod_bias = lp_build_broadcast_scalar(levelf_bld,
                                                      sampler_lod_bias);
         lod = LLVMBuildFAdd(builder, lod, sampler_lod_bias, "sampler_lod_bias");
      }

      /* clamp lod */
      if (bld->static_sampler_state->apply_max_lod) {
         LLVMValueRef max_lod =
            bld->dynamic_state->max_lod(bld->dynamic_state,
                                        bld->gallivm, sampler_unit);
         max_lod = lp_build_broadcast_scalar(levelf_bld, max_lod);

         lod = lp_build_min(levelf_bld, lod, max_lod);
      }
      if (bld->static_sampler_state->apply_min_lod) {
         LLVMValueRef min_lod =
            bld->dynamic_state->min_lod(bld->dynamic_state,
                                        bld->gallivm, sampler_unit);
         min_lod = lp_build_broadcast_scalar(levelf_bld, min_lod);

         lod = lp_build_max(levelf_bld, lod, min_lod);
      }
   }

   if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
      if (!(gallivm_debug & GALLIVM_DEBUG_NO_BRILINEAR)) {
         lp_build_brilinear_lod(levelf_bld, lod, BRILINEAR_FACTOR,
                                out_lod_ipart, out_lod_fpart);
      }
      else {
         lp_build_ifloor_fract(levelf_bld, lod, out_lod_ipart, out_lod_fpart);
      }

      lp_build_name(*out_lod_fpart, "lod_fpart");
   }
   else {
      *out_lod_ipart = lp_build_iround(levelf_bld, lod);
   }

   lp_build_name(*out_lod_ipart, "lod_ipart");

   return;
}
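

/*
 * Scalar sketch of the non-special-case flow above (assumes implicit lod
 * and min_lod != max_lod; illustrative only, not part of the build):
 */
#if 0
static float
lod_selector_ref(float rho, float shader_bias, float sampler_bias,
                 float min_lod, float max_lod)
{
   float lod = log2f(rho) + shader_bias + sampler_bias;
   /* a LINEAR mip filter then splits lod into ifloor/fract (or the
    * brilinear equivalent); NEAREST/NONE just rounds to nearest int */
   return CLAMP(lod, min_lod, max_lod);
}
#endif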


/**
 * For PIPE_TEX_MIPFILTER_NEAREST, convert int part of lod
 * to actual mip level.
 * Note: this is all scalar per-quad (or per-element) code.
 * \param lod_ipart  int texture level of detail
 * \param level_out  returns the integer mipmap level
 * \param out_of_bounds  returns per coord out_of_bounds mask if provided
 */
void
lp_build_nearest_mip_level(struct lp_build_sample_context *bld,
                           unsigned texture_unit,
                           LLVMValueRef lod_ipart,
                           LLVMValueRef *level_out,
                           LLVMValueRef *out_of_bounds)
{
   struct lp_build_context *leveli_bld = &bld->leveli_bld;
   LLVMValueRef first_level, last_level, level;

   first_level = bld->dynamic_state->first_level(bld->dynamic_state,
                                                 bld->gallivm, texture_unit);
   last_level = bld->dynamic_state->last_level(bld->dynamic_state,
                                               bld->gallivm, texture_unit);
   first_level = lp_build_broadcast_scalar(leveli_bld, first_level);
   last_level = lp_build_broadcast_scalar(leveli_bld, last_level);

   level = lp_build_add(leveli_bld, lod_ipart, first_level);

   if (out_of_bounds) {
      LLVMValueRef out, out1;
      out = lp_build_cmp(leveli_bld, PIPE_FUNC_LESS, level, first_level);
      out1 = lp_build_cmp(leveli_bld, PIPE_FUNC_GREATER, level, last_level);
      out = lp_build_or(leveli_bld, out, out1);
      if (bld->num_lods == bld->coord_bld.type.length) {
         *out_of_bounds = out;
      }
      else if (bld->num_lods == 1) {
         *out_of_bounds = lp_build_broadcast_scalar(&bld->int_coord_bld, out);
      }
      else {
         assert(bld->num_lods == bld->coord_bld.type.length / 4);
         *out_of_bounds = lp_build_unpack_broadcast_aos_scalars(bld->gallivm,
                                                                leveli_bld->type,
                                                                bld->int_coord_bld.type,
                                                                out);
      }
      *level_out = level;
   }
   else {
      /* clamp level to legal range of levels */
      *level_out = lp_build_clamp(leveli_bld, level, first_level, last_level);
   }
}
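

/*
 * Scalar sketch of the level selection above for the clamping
 * (!out_of_bounds) variant; illustrative only, not part of the build:
 */
#if 0
static int
nearest_mip_level_ref(int lod_ipart, int first_level, int last_level)
{
   int level = lod_ipart + first_level;
   return CLAMP(level, first_level, last_level);
}
#endif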


/**
 * For PIPE_TEX_MIPFILTER_LINEAR, convert per-quad int LOD(s) to two (per-quad)
 * (adjacent) mipmap level indexes, and fix up float lod part accordingly.
 * Later, we'll sample from those two mipmap levels and interpolate between them.
 */
void
lp_build_linear_mip_levels(struct lp_build_sample_context *bld,
                           unsigned texture_unit,
                           LLVMValueRef lod_ipart,
                           LLVMValueRef *lod_fpart_inout,
                           LLVMValueRef *level0_out,
                           LLVMValueRef *level1_out)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   struct lp_build_context *leveli_bld = &bld->leveli_bld;
   struct lp_build_context *levelf_bld = &bld->levelf_bld;
   LLVMValueRef first_level, last_level;
   LLVMValueRef clamp_min;
   LLVMValueRef clamp_max;

   first_level = bld->dynamic_state->first_level(bld->dynamic_state,
                                                 bld->gallivm, texture_unit);
   last_level = bld->dynamic_state->last_level(bld->dynamic_state,
                                               bld->gallivm, texture_unit);
   first_level = lp_build_broadcast_scalar(leveli_bld, first_level);
   last_level = lp_build_broadcast_scalar(leveli_bld, last_level);

   *level0_out = lp_build_add(leveli_bld, lod_ipart, first_level);
   *level1_out = lp_build_add(leveli_bld, *level0_out, leveli_bld->one);

   /*
    * Clamp both *level0_out and *level1_out to [first_level, last_level],
    * with the minimum number of comparisons, zeroing lod_fpart at the
    * extreme ends in the process.
    */

   /*
    * This code (the vector select in particular) only works with llvm 3.1
    * or later (when there's more than one quad, with the x86 backend).
    * Might consider converting to our lp_bld_logic helpers.
    */
#if HAVE_LLVM < 0x0301
   assert(leveli_bld->type.length == 1);
#endif

   /* *level0_out < first_level */
   clamp_min = LLVMBuildICmp(builder, LLVMIntSLT,
                             *level0_out, first_level,
                             "clamp_lod_to_first");

   *level0_out = LLVMBuildSelect(builder, clamp_min,
                                 first_level, *level0_out, "");

   *level1_out = LLVMBuildSelect(builder, clamp_min,
                                 first_level, *level1_out, "");

   *lod_fpart_inout = LLVMBuildSelect(builder, clamp_min,
                                      levelf_bld->zero, *lod_fpart_inout, "");

   /* *level0_out >= last_level */
   clamp_max = LLVMBuildICmp(builder, LLVMIntSGE,
                             *level0_out, last_level,
                             "clamp_lod_to_last");

   *level0_out = LLVMBuildSelect(builder, clamp_max,
                                 last_level, *level0_out, "");

   *level1_out = LLVMBuildSelect(builder, clamp_max,
                                 last_level, *level1_out, "");

   *lod_fpart_inout = LLVMBuildSelect(builder, clamp_max,
                                      levelf_bld->zero, *lod_fpart_inout, "");

   lp_build_name(*level0_out, "texture%u_miplevel0", texture_unit);
   lp_build_name(*level1_out, "texture%u_miplevel1", texture_unit);
   lp_build_name(*lod_fpart_inout, "texture%u_mipweight", texture_unit);
}
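

/*
 * Scalar sketch of the clamping above (illustrative only, not part of
 * the build): two adjacent levels, with the interpolation weight forced
 * to zero when the unclamped level0 falls outside [first_level, last_level].
 */
#if 0
static void
linear_mip_levels_ref(int lod_ipart, float *lod_fpart,
                      int first_level, int last_level,
                      int *level0, int *level1)
{
   *level0 = lod_ipart + first_level;
   *level1 = *level0 + 1;
   if (*level0 < first_level) {
      *level0 = *level1 = first_level;
      *lod_fpart = 0.0f;
   }
   if (*level0 >= last_level) {
      *level0 = *level1 = last_level;
      *lod_fpart = 0.0f;
   }
}
#endif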


/**
 * Return pointer to a single mipmap level.
 * \param level  integer mipmap level
 */
LLVMValueRef
lp_build_get_mipmap_level(struct lp_build_sample_context *bld,
                          LLVMValueRef level)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef indexes[2], data_ptr, mip_offset;

   indexes[0] = lp_build_const_int32(bld->gallivm, 0);
   indexes[1] = level;
   mip_offset = LLVMBuildGEP(builder, bld->mip_offsets, indexes, 2, "");
   mip_offset = LLVMBuildLoad(builder, mip_offset, "");
   data_ptr = LLVMBuildGEP(builder, bld->base_ptr, &mip_offset, 1, "");
   return data_ptr;
}

/**
 * Return (per-pixel) offsets to mip levels.
 * \param level  integer mipmap level
 */
LLVMValueRef
lp_build_get_mip_offsets(struct lp_build_sample_context *bld,
                         LLVMValueRef level)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef indexes[2], offsets, offset1;

   indexes[0] = lp_build_const_int32(bld->gallivm, 0);
   if (bld->num_lods == 1) {
      indexes[1] = level;
      offset1 = LLVMBuildGEP(builder, bld->mip_offsets, indexes, 2, "");
      offset1 = LLVMBuildLoad(builder, offset1, "");
      offsets = lp_build_broadcast_scalar(&bld->int_coord_bld, offset1);
   }
   else if (bld->num_lods == bld->coord_bld.type.length / 4) {
      unsigned i;

      offsets = bld->int_coord_bld.undef;
      for (i = 0; i < bld->num_lods; i++) {
         LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
         LLVMValueRef indexo = lp_build_const_int32(bld->gallivm, 4 * i);
         indexes[1] = LLVMBuildExtractElement(builder, level, indexi, "");
         offset1 = LLVMBuildGEP(builder, bld->mip_offsets, indexes, 2, "");
         offset1 = LLVMBuildLoad(builder, offset1, "");
         offsets = LLVMBuildInsertElement(builder, offsets, offset1, indexo, "");
      }
      offsets = lp_build_swizzle_scalar_aos(&bld->int_coord_bld, offsets, 0, 4);
   }
   else {
      unsigned i;

      assert (bld->num_lods == bld->coord_bld.type.length);

      offsets = bld->int_coord_bld.undef;
      for (i = 0; i < bld->num_lods; i++) {
         LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
         indexes[1] = LLVMBuildExtractElement(builder, level, indexi, "");
         offset1 = LLVMBuildGEP(builder, bld->mip_offsets, indexes, 2, "");
         offset1 = LLVMBuildLoad(builder, offset1, "");
         offsets = LLVMBuildInsertElement(builder, offsets, offset1, indexi, "");
      }
   }
   return offsets;
}


/**
 * Codegen equivalent for u_minify().
 * Return max(1, base_size >> level);
 */
LLVMValueRef
lp_build_minify(struct lp_build_context *bld,
                LLVMValueRef base_size,
                LLVMValueRef level)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   assert(lp_check_value(bld->type, base_size));
   assert(lp_check_value(bld->type, level));

   if (level == bld->zero) {
      /* if we're using mipmap level zero, no minification is needed */
      return base_size;
   }
   else {
      LLVMValueRef size =
         LLVMBuildLShr(builder, base_size, level, "minify");
      assert(bld->type.sign);
      size = lp_build_max(bld, size, bld->one);
      return size;
   }
}
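

/*
 * For reference, the scalar counterpart this mirrors (essentially
 * u_minify() from util/u_math.h); illustrative only:
 */
#if 0
static int
minify_ref(int base_size, int level)
{
   return MAX2(1, base_size >> level);
}
#endif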


/**
 * Dereference stride_array[mipmap_level] array to get a stride.
 * Return stride as a vector.
 */
static LLVMValueRef
lp_build_get_level_stride_vec(struct lp_build_sample_context *bld,
                              LLVMValueRef stride_array, LLVMValueRef level)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef indexes[2], stride, stride1;
   indexes[0] = lp_build_const_int32(bld->gallivm, 0);
   if (bld->num_lods == 1) {
      indexes[1] = level;
      stride1 = LLVMBuildGEP(builder, stride_array, indexes, 2, "");
      stride1 = LLVMBuildLoad(builder, stride1, "");
      stride = lp_build_broadcast_scalar(&bld->int_coord_bld, stride1);
   }
   else if (bld->num_lods == bld->coord_bld.type.length / 4) {
      LLVMValueRef stride1;
      unsigned i;

      stride = bld->int_coord_bld.undef;
      for (i = 0; i < bld->num_lods; i++) {
         LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
         LLVMValueRef indexo = lp_build_const_int32(bld->gallivm, 4 * i);
         indexes[1] = LLVMBuildExtractElement(builder, level, indexi, "");
         stride1 = LLVMBuildGEP(builder, stride_array, indexes, 2, "");
         stride1 = LLVMBuildLoad(builder, stride1, "");
         stride = LLVMBuildInsertElement(builder, stride, stride1, indexo, "");
      }
      stride = lp_build_swizzle_scalar_aos(&bld->int_coord_bld, stride, 0, 4);
   }
   else {
      LLVMValueRef stride1;
      unsigned i;

      assert (bld->num_lods == bld->coord_bld.type.length);

      stride = bld->int_coord_bld.undef;
      for (i = 0; i < bld->coord_bld.type.length; i++) {
         LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
         indexes[1] = LLVMBuildExtractElement(builder, level, indexi, "");
         stride1 = LLVMBuildGEP(builder, stride_array, indexes, 2, "");
         stride1 = LLVMBuildLoad(builder, stride1, "");
         stride = LLVMBuildInsertElement(builder, stride, stride1, indexi, "");
      }
   }
   return stride;
}


/**
 * When sampling a mipmap, we need to compute the width, height, depth
 * of the source levels from the level indexes. This helper function
 * does that.
 */
void
lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld,
                            LLVMValueRef ilevel,
                            LLVMValueRef *out_size,
                            LLVMValueRef *row_stride_vec,
                            LLVMValueRef *img_stride_vec)
{
   const unsigned dims = bld->dims;
   LLVMValueRef ilevel_vec;

   /*
    * Compute width, height, depth at mipmap level 'ilevel'
    */
   if (bld->num_lods == 1) {
      ilevel_vec = lp_build_broadcast_scalar(&bld->int_size_bld, ilevel);
      *out_size = lp_build_minify(&bld->int_size_bld, bld->int_size, ilevel_vec);
   }
   else {
      LLVMValueRef int_size_vec;
      LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
      unsigned num_quads = bld->coord_bld.type.length / 4;
      unsigned i;

      if (bld->num_lods == num_quads) {
         /*
          * XXX: this should be #ifndef SANE_INSTRUCTION_SET.
          * intel "forgot" the variable shift count instruction until avx2.
          * A harmless 8x32 shift gets translated into 32 instructions
          * (16 extracts, 8 scalar shifts, 8 inserts), llvm is apparently
          * unable to recognize if there are really just 2 different shift
          * count values. So do the shift 4-wide before expansion.
          */
         struct lp_build_context bld4;
         struct lp_type type4;

         type4 = bld->int_coord_bld.type;
         type4.length = 4;

         lp_build_context_init(&bld4, bld->gallivm, type4);

         if (bld->dims == 1) {
            assert(bld->int_size_in_bld.type.length == 1);
            int_size_vec = lp_build_broadcast_scalar(&bld4,
                                                     bld->int_size);
         }
         else {
            assert(bld->int_size_in_bld.type.length == 4);
            int_size_vec = bld->int_size;
         }

         for (i = 0; i < num_quads; i++) {
            LLVMValueRef ileveli;
            LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);

            ileveli = lp_build_extract_broadcast(bld->gallivm,
                                                 bld->leveli_bld.type,
                                                 bld4.type,
                                                 ilevel,
                                                 indexi);
            tmp[i] = lp_build_minify(&bld4, int_size_vec, ileveli);
         }
         /*
          * out_size is [w0, h0, d0, _, w1, h1, d1, _, ...] vector for dims > 1,
          * [w0, w0, w0, w0, w1, w1, w1, w1, ...] otherwise.
          */
         *out_size = lp_build_concat(bld->gallivm,
                                     tmp,
                                     bld4.type,
                                     num_quads);
      }
      else {
         /* FIXME: this is terrible and results in a _huge_ vector
          * (for the dims > 1 case).
          * Should refactor this (together with extract_image_sizes) and do
          * something more useful. For instance, if we have width and height
          * in a 4-wide vector we could pack all elements into an 8xi16
          * vector (on which we can still do useful math) instead of using
          * a 16xi32 vector.
          * FIXME: some callers can't handle this yet.
          * For dims == 1 this will create a [w0, w1, w2, w3, ...] vector.
          * For dims > 1 this will create a [w0, h0, d0, _, w1, h1, d1, _, ...]
          * vector.
          */
         assert(bld->num_lods == bld->coord_bld.type.length);
         if (bld->dims == 1) {
            assert(bld->int_size_in_bld.type.length == 1);
            int_size_vec = lp_build_broadcast_scalar(&bld->int_coord_bld,
                                                     bld->int_size);
            /* vector shift with variable shift count alert... */
            *out_size = lp_build_minify(&bld->int_coord_bld, int_size_vec, ilevel);
         }
         else {
            LLVMValueRef ilevel1;
            for (i = 0; i < bld->num_lods; i++) {
               LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
               ilevel1 = lp_build_extract_broadcast(bld->gallivm, bld->int_coord_type,
                                                    bld->int_size_in_bld.type, ilevel, indexi);
               tmp[i] = bld->int_size;
               tmp[i] = lp_build_minify(&bld->int_size_in_bld, tmp[i], ilevel1);
            }
            *out_size = lp_build_concat(bld->gallivm, tmp,
                                        bld->int_size_in_bld.type,
                                        bld->num_lods);
         }
      }
   }

   if (dims >= 2) {
      *row_stride_vec = lp_build_get_level_stride_vec(bld,
                                                      bld->row_stride_array,
                                                      ilevel);
   }
   if (dims == 3 ||
       bld->static_texture_state->target == PIPE_TEXTURE_CUBE ||
       bld->static_texture_state->target == PIPE_TEXTURE_1D_ARRAY ||
       bld->static_texture_state->target == PIPE_TEXTURE_2D_ARRAY) {
      *img_stride_vec = lp_build_get_level_stride_vec(bld,
                                                      bld->img_stride_array,
                                                      ilevel);
   }
}

/**
 * Extract and broadcast texture size.
 *
 * @param size_bld  build context for the texture size vector (either
 *                  bld->int_size_bld or bld->float_size_bld)
 * @param coord_type  type of the coordinate vector (either
 *                    bld->int_coord_type or bld->coord_type)
 * @param size  vector with the texture size (width, height, depth)
 */
void
lp_build_extract_image_sizes(struct lp_build_sample_context *bld,
                             struct lp_build_context *size_bld,
                             struct lp_type coord_type,
                             LLVMValueRef size,
                             LLVMValueRef *out_width,
                             LLVMValueRef *out_height,
                             LLVMValueRef *out_depth)
{
   const unsigned dims = bld->dims;
   LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
   struct lp_type size_type = size_bld->type;

   if (bld->num_lods == 1) {
      *out_width = lp_build_extract_broadcast(bld->gallivm,
                                              size_type,
                                              coord_type,
                                              size,
                                              LLVMConstInt(i32t, 0, 0));
      if (dims >= 2) {
         *out_height = lp_build_extract_broadcast(bld->gallivm,
                                                  size_type,
                                                  coord_type,
                                                  size,
                                                  LLVMConstInt(i32t, 1, 0));
         if (dims == 3) {
            *out_depth = lp_build_extract_broadcast(bld->gallivm,
                                                    size_type,
                                                    coord_type,
                                                    size,
                                                    LLVMConstInt(i32t, 2, 0));
         }
      }
   }
   else {
      unsigned num_quads = bld->coord_bld.type.length / 4;

      if (dims == 1) {
         *out_width = size;
      }
      else if (bld->num_lods == num_quads) {
         *out_width = lp_build_swizzle_scalar_aos(size_bld, size, 0, 4);
         if (dims >= 2) {
            *out_height = lp_build_swizzle_scalar_aos(size_bld, size, 1, 4);
            if (dims == 3) {
               *out_depth = lp_build_swizzle_scalar_aos(size_bld, size, 2, 4);
            }
         }
      }
      else {
         assert(bld->num_lods == bld->coord_type.length);
         *out_width = lp_build_pack_aos_scalars(bld->gallivm, size_type,
                                                coord_type, size, 0);
         if (dims >= 2) {
            *out_height = lp_build_pack_aos_scalars(bld->gallivm, size_type,
                                                    coord_type, size, 1);
            if (dims == 3) {
               *out_depth = lp_build_pack_aos_scalars(bld->gallivm, size_type,
                                                      coord_type, size, 2);
            }
         }
      }
   }
}

/**
 * Unnormalize coords.
 *
 * @param flt_size  vector with the texture size (width, height, depth)
 *                  as floats
 */
void
lp_build_unnormalized_coords(struct lp_build_sample_context *bld,
                             LLVMValueRef flt_size,
                             LLVMValueRef *s,
                             LLVMValueRef *t,
                             LLVMValueRef *r)
{
   const unsigned dims = bld->dims;
   LLVMValueRef width;
   LLVMValueRef height;
   LLVMValueRef depth;

   lp_build_extract_image_sizes(bld,
                                &bld->float_size_bld,
                                bld->coord_type,
                                flt_size,
                                &width,
                                &height,
                                &depth);

   /* s = s * width, t = t * height */
   *s = lp_build_mul(&bld->coord_bld, *s, width);
   if (dims >= 2) {
      *t = lp_build_mul(&bld->coord_bld, *t, height);
      if (dims >= 3) {
         *r = lp_build_mul(&bld->coord_bld, *r, depth);
      }
   }
}


/** Helper used by lp_build_cube_lookup() */
static LLVMValueRef
lp_build_cube_imapos(struct lp_build_context *coord_bld, LLVMValueRef coord)
{
   /* ima = +0.5 / abs(coord); */
   LLVMValueRef posHalf = lp_build_const_vec(coord_bld->gallivm, coord_bld->type, 0.5);
   LLVMValueRef absCoord = lp_build_abs(coord_bld, coord);
   LLVMValueRef ima = lp_build_div(coord_bld, posHalf, absCoord);
   return ima;
}

/** Helper used by lp_build_cube_lookup() */
static LLVMValueRef
lp_build_cube_imaneg(struct lp_build_context *coord_bld, LLVMValueRef coord)
{
   /* ima = -0.5 / abs(coord); */
   LLVMValueRef negHalf = lp_build_const_vec(coord_bld->gallivm, coord_bld->type, -0.5);
   LLVMValueRef absCoord = lp_build_abs(coord_bld, coord);
   LLVMValueRef ima = lp_build_div(coord_bld, negHalf, absCoord);
   return ima;
}

/**
 * Helper used by lp_build_cube_lookup()
 * FIXME: the sign here can also be 0.
 * Arithmetically this could definitely make a difference. Either
 * fix the comment or use another (simpler) sign function, not sure
 * which one it should be.
 * \param sign  scalar +1 or -1 (or NULL)
 * \param coord  float vector
 * \param ima  float vector
 */
static LLVMValueRef
lp_build_cube_coord(struct lp_build_context *coord_bld,
                    LLVMValueRef sign, int negate_coord,
                    LLVMValueRef coord, LLVMValueRef ima)
{
   /* return negate(coord) * ima * sign + 0.5; */
   LLVMValueRef half = lp_build_const_vec(coord_bld->gallivm, coord_bld->type, 0.5);
   LLVMValueRef res;

   assert(negate_coord == +1 || negate_coord == -1);

   if (negate_coord == -1) {
      coord = lp_build_negate(coord_bld, coord);
   }

   res = lp_build_mul(coord_bld, coord, ima);
   if (sign) {
      sign = lp_build_broadcast_scalar(coord_bld, sign);
      res = lp_build_mul(coord_bld, res, sign);
   }
   res = lp_build_add(coord_bld, res, half);

   return res;
}


/** Helper used by lp_build_cube_lookup()
 * Return (major_coord >= 0) ? pos_face : neg_face;
 */
static LLVMValueRef
lp_build_cube_face(struct lp_build_sample_context *bld,
                   LLVMValueRef major_coord,
                   unsigned pos_face, unsigned neg_face)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef cmp = LLVMBuildFCmp(builder, LLVMRealUGE,
                                    major_coord,
                                    bld->float_bld.zero, "");
   LLVMValueRef pos = lp_build_const_int32(gallivm, pos_face);
   LLVMValueRef neg = lp_build_const_int32(gallivm, neg_face);
   LLVMValueRef res = LLVMBuildSelect(builder, cmp, pos, neg, "");
   return res;
}



/**
 * Generate code to do cube face selection and compute per-face texcoords.
 */
void
lp_build_cube_lookup(struct lp_build_sample_context *bld,
                     LLVMValueRef *coords,
                     const struct lp_derivatives *derivs, /* optional */
                     LLVMValueRef *rho,
                     boolean need_derivs)
{
   struct lp_build_context *coord_bld = &bld->coord_bld;
   LLVMBuilderRef builder = bld->gallivm->builder;
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMValueRef si, ti, ri;

   if (1 || coord_bld->type.length > 4) {
      /*
       * Do per-pixel face selection. We cannot however (as we used to do)
       * simply calculate the derivs afterwards (which is very bogus for
       * explicit derivs btw) because the values would be "random" when
       * not all pixels lie on the same face. So what we do here is just
       * calculate the derivatives after scaling the coords by the absolute
       * value of the inverse major axis, and essentially do rho calculation
       * steps as if it were a 3d texture. This is perfect if all pixels hit
       * the same face, but not so great at edges, I believe the max error
       * should be sqrt(2) with no_rho_approx or 2 otherwise (essentially measuring
       * the 3d distance between 2 points on the cube instead of measuring up/down
       * the edge). Still this is possibly a win over just selecting the same face
       * for all pixels. Unfortunately, something like that doesn't work for
       * explicit derivatives.
       * TODO: handle explicit derivatives by transforming them alongside coords
       * somehow.
       */
      struct lp_build_context *cint_bld = &bld->int_coord_bld;
      struct lp_type intctype = cint_bld->type;
      LLVMValueRef signs, signt, signr, signma;
      LLVMValueRef as, at, ar, face, face_s, face_t;
      LLVMValueRef as_ge_at, maxasat, ar_ge_as_at;
      LLVMValueRef snewx, tnewx, snewy, tnewy, snewz, tnewz;
      LLVMValueRef tnegi, rnegi;
      LLVMValueRef ma, mai, ima;
      LLVMValueRef posHalf = lp_build_const_vec(gallivm, coord_bld->type, 0.5);
      LLVMValueRef signmask = lp_build_const_int_vec(gallivm, intctype,
                                                     1 << (intctype.width - 1));
      LLVMValueRef signshift = lp_build_const_int_vec(gallivm, intctype,
                                                      intctype.width -1);
      LLVMValueRef facex = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_X);
      LLVMValueRef facey = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_Y);
      LLVMValueRef facez = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_Z);
      LLVMValueRef s = coords[0];
      LLVMValueRef t = coords[1];
      LLVMValueRef r = coords[2];

      assert(PIPE_TEX_FACE_NEG_X == PIPE_TEX_FACE_POS_X + 1);
      assert(PIPE_TEX_FACE_NEG_Y == PIPE_TEX_FACE_POS_Y + 1);
      assert(PIPE_TEX_FACE_NEG_Z == PIPE_TEX_FACE_POS_Z + 1);

      /*
       * get absolute value (for x/y/z face selection) and sign bit
       * (for mirroring minor coords and pos/neg face selection)
       * of the original coords.
       */
      as = lp_build_abs(&bld->coord_bld, s);
      at = lp_build_abs(&bld->coord_bld, t);
      ar = lp_build_abs(&bld->coord_bld, r);

      /*
       * major face determination: select x if x > y else select y
       * select z if z >= max(x,y) else select previous result
       * if some axes are the same we choose z over y, y over x - the
       * dx10 spec seems to ask for it while OpenGL doesn't care (if we
       * didn't care we could save a select or two by using different
       * compares and doing at_g_as_ar last, since tnewx and tnewz are
       * the same).
       */
      as_ge_at = lp_build_cmp(coord_bld, PIPE_FUNC_GREATER, as, at);
      maxasat = lp_build_max(coord_bld, as, at);
      ar_ge_as_at = lp_build_cmp(coord_bld, PIPE_FUNC_GEQUAL, ar, maxasat);

      if (need_derivs) {
         LLVMValueRef ddx_ddy[2], tmp[3], rho_vec;
         static const unsigned char swizzle0[] = { /* no-op swizzle */
            0, LP_BLD_SWIZZLE_DONTCARE,
            LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
         };
         static const unsigned char swizzle1[] = {
            1, LP_BLD_SWIZZLE_DONTCARE,
            LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
         };
         static const unsigned char swizzle01[] = { /* no-op swizzle */
            0, 1,
            LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
         };
         static const unsigned char swizzle23[] = {
            2, 3,
            LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
         };
         static const unsigned char swizzle02[] = {
            0, 2,
            LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
         };

         /*
          * scale the s/t/r coords pre-select/mirror so we can calculate
          * "reasonable" derivs.
          */
         ma = lp_build_select(coord_bld, as_ge_at, s, t);
         ma = lp_build_select(coord_bld, ar_ge_as_at, r, ma);
         ima = lp_build_cube_imapos(coord_bld, ma);
         s = lp_build_mul(coord_bld, s, ima);
         t = lp_build_mul(coord_bld, t, ima);
         r = lp_build_mul(coord_bld, r, ima);

         /*
          * This isn't quite the same as the "ordinary" (3d deriv) path since we
          * know the texture is square which simplifies things (we can omit the
          * size mul which happens very early completely here and do it at the
          * very end).
          */
         ddx_ddy[0] = lp_build_packed_ddx_ddy_twocoord(coord_bld, s, t);
         ddx_ddy[1] = lp_build_packed_ddx_ddy_onecoord(coord_bld, r);

         if (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) {
            ddx_ddy[0] = lp_build_mul(coord_bld, ddx_ddy[0], ddx_ddy[0]);
            ddx_ddy[1] = lp_build_mul(coord_bld, ddx_ddy[1], ddx_ddy[1]);
         }
         else {
            ddx_ddy[0] = lp_build_abs(coord_bld, ddx_ddy[0]);
            ddx_ddy[1] = lp_build_abs(coord_bld, ddx_ddy[1]);
         }

         tmp[0] = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle01);
         tmp[1] = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle23);
         tmp[2] = lp_build_swizzle_aos(coord_bld, ddx_ddy[1], swizzle02);

         if (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) {
            rho_vec = lp_build_add(coord_bld, tmp[0], tmp[1]);
            rho_vec = lp_build_add(coord_bld, rho_vec, tmp[2]);
         }
         else {
            rho_vec = lp_build_max(coord_bld, tmp[0], tmp[1]);
            rho_vec = lp_build_max(coord_bld, rho_vec, tmp[2]);
         }

         tmp[0] = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle0);
         tmp[1] = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle1);
         *rho = lp_build_max(coord_bld, tmp[0], tmp[1]);
      }

      si = LLVMBuildBitCast(builder, s, lp_build_vec_type(gallivm, intctype), "");
      ti = LLVMBuildBitCast(builder, t, lp_build_vec_type(gallivm, intctype), "");
      ri = LLVMBuildBitCast(builder, r, lp_build_vec_type(gallivm, intctype), "");
      signs = LLVMBuildAnd(builder, si, signmask, "");
      signt = LLVMBuildAnd(builder, ti, signmask, "");
      signr = LLVMBuildAnd(builder, ri, signmask, "");

      /*
       * compute all possible new s/t coords
       * snewx = signs * -r;
       * tnewx = -t;
       * snewy = s;
       * tnewy = signt * r;
       * snewz = signr * s;
       * tnewz = -t;
       */
      tnegi = LLVMBuildXor(builder, ti, signmask, "");
      rnegi = LLVMBuildXor(builder, ri, signmask, "");

      snewx = LLVMBuildXor(builder, signs, rnegi, "");
      tnewx = tnegi;

      snewy = si;
      tnewy = LLVMBuildXor(builder, signt, ri, "");

      snewz = LLVMBuildXor(builder, signr, si, "");
      tnewz = tnegi;

      /* XXX on x86 it's unclear if we should cast the values back to float
       * or not - on some cpus (nehalem) pblendvb has twice the throughput
       * of blendvps, though on others there just might be domain
       * transition penalties when using it (this depends on what llvm
       * will choose for the bit ops above, so there appears to be no
       * "right way"; but given the boatload of selects let's just use
       * the int type).
       */
1609
1610 /* select/mirror */
1611 if (!need_derivs) {
1612 ma = lp_build_select(coord_bld, as_ge_at, s, t);
1613 }
1614 face_s = lp_build_select(cint_bld, as_ge_at, snewx, snewy);
1615 face_t = lp_build_select(cint_bld, as_ge_at, tnewx, tnewy);
1616 face = lp_build_select(cint_bld, as_ge_at, facex, facey);
1617
1618 if (!need_derivs) {
1619 ma = lp_build_select(coord_bld, ar_ge_as_at, r, ma);
1620 }
1621 face_s = lp_build_select(cint_bld, ar_ge_as_at, snewz, face_s);
1622 face_t = lp_build_select(cint_bld, ar_ge_as_at, tnewz, face_t);
1623 face = lp_build_select(cint_bld, ar_ge_as_at, facez, face);
1624
1625 face_s = LLVMBuildBitCast(builder, face_s,
1626 lp_build_vec_type(gallivm, coord_bld->type), "");
1627 face_t = LLVMBuildBitCast(builder, face_t,
1628 lp_build_vec_type(gallivm, coord_bld->type), "");
1629
1630 /* add +1 for neg face */
1631 /* XXX with AVX probably want to use another select here -
1632 * as long as we ensure vblendvps gets used we can actually
1633 * skip the comparison and just use sign as a "mask" directly.
1634 */
1635 mai = LLVMBuildBitCast(builder, ma, lp_build_vec_type(gallivm, intctype), "");
1636 signma = LLVMBuildLShr(builder, mai, signshift, "");
1637 coords[2] = LLVMBuildOr(builder, face, signma, "face");
1638
1639 /* project coords */
1640 if (!need_derivs) {
1641 ima = lp_build_cube_imapos(coord_bld, ma);
1642 face_s = lp_build_mul(coord_bld, face_s, ima);
1643 face_t = lp_build_mul(coord_bld, face_t, ima);
1644 }
1645
      coords[0] = lp_build_add(coord_bld, face_s, posHalf);
      coords[1] = lp_build_add(coord_bld, face_t, posHalf);
   }

   else {
      struct lp_build_if_state if_ctx;
      LLVMValueRef face_s_var;
      LLVMValueRef face_t_var;
      LLVMValueRef face_var;
      LLVMValueRef arx_ge_ary_arz, ary_ge_arx_arz;
      LLVMValueRef shuffles[4];
      LLVMValueRef arxy_ge_aryx, arxy_ge_arzz, arxy_ge_arxy_arzz;
      LLVMValueRef arxyxy, aryxzz, arxyxy_ge_aryxzz;
      LLVMValueRef tmp[4], rxyz, arxyz;
      struct lp_build_context *float_bld = &bld->float_bld;
      LLVMValueRef s, t, r, face, face_s, face_t;

      assert(bld->coord_bld.type.length == 4);

      tmp[0] = s = coords[0];
      tmp[1] = t = coords[1];
      tmp[2] = r = coords[2];
      rxyz = lp_build_hadd_partial4(&bld->coord_bld, tmp, 3);
      arxyz = lp_build_abs(&bld->coord_bld, rxyz);

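      /*
       * rxyz should now hold the per-coord horizontal sums over the quad
       * (lp_build_hadd_partial4 sums the four lanes of each input into
       * successive elements), so this path picks one face for all four
       * pixels based on the summed coords.
       */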
      shuffles[0] = lp_build_const_int32(gallivm, 0);
      shuffles[1] = lp_build_const_int32(gallivm, 1);
      shuffles[2] = lp_build_const_int32(gallivm, 0);
      shuffles[3] = lp_build_const_int32(gallivm, 1);
      arxyxy = LLVMBuildShuffleVector(builder, arxyz, arxyz, LLVMConstVector(shuffles, 4), "");
      shuffles[0] = lp_build_const_int32(gallivm, 1);
      shuffles[1] = lp_build_const_int32(gallivm, 0);
      shuffles[2] = lp_build_const_int32(gallivm, 2);
      shuffles[3] = lp_build_const_int32(gallivm, 2);
      aryxzz = LLVMBuildShuffleVector(builder, arxyz, arxyz, LLVMConstVector(shuffles, 4), "");
      arxyxy_ge_aryxzz = lp_build_cmp(&bld->coord_bld, PIPE_FUNC_GEQUAL, arxyxy, aryxzz);

      shuffles[0] = lp_build_const_int32(gallivm, 0);
      shuffles[1] = lp_build_const_int32(gallivm, 1);
      arxy_ge_aryx = LLVMBuildShuffleVector(builder, arxyxy_ge_aryxzz, arxyxy_ge_aryxzz,
                                            LLVMConstVector(shuffles, 2), "");
      shuffles[0] = lp_build_const_int32(gallivm, 2);
      shuffles[1] = lp_build_const_int32(gallivm, 3);
      arxy_ge_arzz = LLVMBuildShuffleVector(builder, arxyxy_ge_aryxzz, arxyxy_ge_aryxzz,
                                            LLVMConstVector(shuffles, 2), "");
      arxy_ge_arxy_arzz = LLVMBuildAnd(builder, arxy_ge_aryx, arxy_ge_arzz, "");

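      /*
       * The shuffle/compare dance above boils down to:
       *    arxy_ge_arxy_arzz[0] = |rx| >= |ry| && |rx| >= |rz|
       *    arxy_ge_arxy_arzz[1] = |ry| >= |rx| && |ry| >= |rz|
       * which the extracts below turn into the two scalar branch conditions.
       */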
      arx_ge_ary_arz = LLVMBuildExtractElement(builder, arxy_ge_arxy_arzz,
                                               lp_build_const_int32(gallivm, 0), "");
      arx_ge_ary_arz = LLVMBuildICmp(builder, LLVMIntNE, arx_ge_ary_arz,
                                     lp_build_const_int32(gallivm, 0), "");
      ary_ge_arx_arz = LLVMBuildExtractElement(builder, arxy_ge_arxy_arzz,
                                               lp_build_const_int32(gallivm, 1), "");
      ary_ge_arx_arz = LLVMBuildICmp(builder, LLVMIntNE, ary_ge_arx_arz,
                                     lp_build_const_int32(gallivm, 0), "");
      face_s_var = lp_build_alloca(gallivm, bld->coord_bld.vec_type, "face_s_var");
      face_t_var = lp_build_alloca(gallivm, bld->coord_bld.vec_type, "face_t_var");
      face_var = lp_build_alloca(gallivm, bld->int_bld.vec_type, "face_var");

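      /*
       * The results are computed in different basic blocks of the if/else
       * cascade below, so they are communicated through stack variables
       * and loaded back after the endif (llvm will promote these allocas
       * to registers/phis).
       */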
      lp_build_if(&if_ctx, gallivm, arx_ge_ary_arz);
      {
         /* +/- X face */
         LLVMValueRef sign, ima;
         si = LLVMBuildExtractElement(builder, rxyz,
                                      lp_build_const_int32(gallivm, 0), "");
         sign = lp_build_sgn(float_bld, si);
         ima = lp_build_cube_imaneg(coord_bld, s);
         face_s = lp_build_cube_coord(coord_bld, sign, +1, r, ima);
         face_t = lp_build_cube_coord(coord_bld, NULL, +1, t, ima);
         face = lp_build_cube_face(bld, si,
                                   PIPE_TEX_FACE_POS_X,
                                   PIPE_TEX_FACE_NEG_X);
         LLVMBuildStore(builder, face_s, face_s_var);
         LLVMBuildStore(builder, face_t, face_t_var);
         LLVMBuildStore(builder, face, face_var);
      }
      lp_build_else(&if_ctx);
      {
         struct lp_build_if_state if_ctx2;

         lp_build_if(&if_ctx2, gallivm, ary_ge_arx_arz);
         {
            LLVMValueRef sign, ima;
            /* +/- Y face */
            ti = LLVMBuildExtractElement(builder, rxyz,
                                         lp_build_const_int32(gallivm, 1), "");
            sign = lp_build_sgn(float_bld, ti);
            ima = lp_build_cube_imaneg(coord_bld, t);
            face_s = lp_build_cube_coord(coord_bld, NULL, -1, s, ima);
            face_t = lp_build_cube_coord(coord_bld, sign, -1, r, ima);
            face = lp_build_cube_face(bld, ti,
                                      PIPE_TEX_FACE_POS_Y,
                                      PIPE_TEX_FACE_NEG_Y);
            LLVMBuildStore(builder, face_s, face_s_var);
            LLVMBuildStore(builder, face_t, face_t_var);
            LLVMBuildStore(builder, face, face_var);
         }
         lp_build_else(&if_ctx2);
         {
            /* +/- Z face */
            LLVMValueRef sign, ima;
            ri = LLVMBuildExtractElement(builder, rxyz,
                                         lp_build_const_int32(gallivm, 2), "");
            sign = lp_build_sgn(float_bld, ri);
            ima = lp_build_cube_imaneg(coord_bld, r);
            face_s = lp_build_cube_coord(coord_bld, sign, -1, s, ima);
            face_t = lp_build_cube_coord(coord_bld, NULL, +1, t, ima);
            face = lp_build_cube_face(bld, ri,
                                      PIPE_TEX_FACE_POS_Z,
                                      PIPE_TEX_FACE_NEG_Z);
            LLVMBuildStore(builder, face_s, face_s_var);
            LLVMBuildStore(builder, face_t, face_t_var);
            LLVMBuildStore(builder, face, face_var);
         }
         lp_build_endif(&if_ctx2);
      }

      lp_build_endif(&if_ctx);

      coords[0] = LLVMBuildLoad(builder, face_s_var, "face_s");
      coords[1] = LLVMBuildLoad(builder, face_t_var, "face_t");
      face = LLVMBuildLoad(builder, face_var, "face");
      coords[2] = lp_build_broadcast_scalar(&bld->int_coord_bld, face);
   }
}


/**
 * Compute the partial offset of a pixel block along an arbitrary axis.
 *
 * @param block_length number of pixels in a pixel block along the coordinate
 *                     axis
 * @param coord coordinate in pixels
 * @param stride number of bytes between rows of successive pixel blocks
 * @param out_offset resulting relative offset of the pixel block in bytes
 * @param out_subcoord resulting sub-block pixel coordinate
 */
void
lp_build_sample_partial_offset(struct lp_build_context *bld,
                               unsigned block_length,
                               LLVMValueRef coord,
                               LLVMValueRef stride,
                               LLVMValueRef *out_offset,
                               LLVMValueRef *out_subcoord)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef offset;
   LLVMValueRef subcoord;

   if (block_length == 1) {
      subcoord = bld->zero;
   }
   else {
      /*
       * Pixel blocks have power of two dimensions. LLVM should convert the
       * rem/div to bit arithmetic.
       * TODO: Verify this.
       * It does indeed, BUT it transforms it to scalar (and back) when doing
       * so (using roughly extract, shift/and, mov, unpack) (llvm 2.7).
       * The generated code looks seriously unfunny and is quite expensive.
       */
#if 0
      LLVMValueRef block_width = lp_build_const_int_vec(bld->gallivm, bld->type, block_length);
      subcoord = LLVMBuildURem(builder, coord, block_width, "");
      coord = LLVMBuildUDiv(builder, coord, block_width, "");
#else
      unsigned logbase2 = util_logbase2(block_length);
      LLVMValueRef block_shift = lp_build_const_int_vec(bld->gallivm, bld->type, logbase2);
      LLVMValueRef block_mask = lp_build_const_int_vec(bld->gallivm, bld->type, block_length - 1);
      subcoord = LLVMBuildAnd(builder, coord, block_mask, "");
      coord = LLVMBuildLShr(builder, coord, block_shift, "");
#endif
   }

   offset = lp_build_mul(bld, coord, stride);

   assert(out_offset);
   assert(out_subcoord);

   *out_offset = offset;
   *out_subcoord = subcoord;
}
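
/*
 * Purely as illustration of the shift/mask path above, a scalar sketch of
 * what a single lane computes (ref_partial_offset is a hypothetical helper,
 * not used anywhere; assumes block_length is a power of two, as pixel
 * blocks always are):
 *
 *    static void
 *    ref_partial_offset(unsigned block_length, unsigned coord,
 *                       unsigned stride,
 *                       unsigned *out_offset, unsigned *out_subcoord)
 *    {
 *       *out_subcoord = coord & (block_length - 1);  // coord % block_length
 *       *out_offset =                                // coord / block_length
 *          (coord >> util_logbase2(block_length)) * stride;
 *    }
 *
 * E.g. block_length = 4, coord = 13, stride = 64 gives subcoord = 1,
 * block index = 3, offset = 192 bytes.
 */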


/**
 * Compute the offset of a pixel block.
 *
 * x, y, z, y_stride, z_stride are vectors, and they refer to pixels.
 *
 * Returns the relative offset and i,j sub-block coordinates.
 */
void
lp_build_sample_offset(struct lp_build_context *bld,
                       const struct util_format_description *format_desc,
                       LLVMValueRef x,
                       LLVMValueRef y,
                       LLVMValueRef z,
                       LLVMValueRef y_stride,
                       LLVMValueRef z_stride,
                       LLVMValueRef *out_offset,
                       LLVMValueRef *out_i,
                       LLVMValueRef *out_j)
{
   LLVMValueRef x_stride;
   LLVMValueRef offset;

   x_stride = lp_build_const_vec(bld->gallivm, bld->type,
                                 format_desc->block.bits/8);

   lp_build_sample_partial_offset(bld,
                                  format_desc->block.width,
                                  x, x_stride,
                                  &offset, out_i);

   if (y && y_stride) {
      LLVMValueRef y_offset;
      lp_build_sample_partial_offset(bld,
                                     format_desc->block.height,
                                     y, y_stride,
                                     &y_offset, out_j);
      offset = lp_build_add(bld, offset, y_offset);
   }
   else {
      *out_j = bld->zero;
   }

   if (z && z_stride) {
      LLVMValueRef z_offset;
      LLVMValueRef k;
      lp_build_sample_partial_offset(bld,
                                     1, /* pixel blocks are always 2D */
                                     z, z_stride,
                                     &z_offset, &k);
      offset = lp_build_add(bld, offset, z_offset);
   }

   *out_offset = offset;
}
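
/*
 * For reference, the combined computation above amounts to (per element):
 *
 *    offset = (x / block_width)  * (block.bits / 8)
 *           + (y / block_height) * y_stride
 *           +  z                 * z_stride
 *    i = x % block_width
 *    j = y % block_height
 *
 * i.e. a byte offset to the pixel block plus the texel's coordinates
 * within that block.
 */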