gallivm: do per-pixel lod calculations for explicit lod
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_sample.c
/**************************************************************************
 *
 * Copyright 2009 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/

/**
 * @file
 * Texture sampling -- common code.
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 */

#include "pipe/p_defines.h"
#include "pipe/p_state.h"
#include "util/u_format.h"
#include "util/u_math.h"
#include "lp_bld_arit.h"
#include "lp_bld_const.h"
#include "lp_bld_debug.h"
#include "lp_bld_printf.h"
#include "lp_bld_flow.h"
#include "lp_bld_sample.h"
#include "lp_bld_swizzle.h"
#include "lp_bld_type.h"
#include "lp_bld_logic.h"
#include "lp_bld_pack.h"
#include "lp_bld_quad.h"


/*
 * Bri-linear factor. Should be greater than one.
 */
#define BRILINEAR_FACTOR 2

/**
 * Does the given texture wrap mode allow sampling the texture border color?
 * XXX maybe move this into gallium util code.
 */
boolean
lp_sampler_wrap_mode_uses_border_color(unsigned mode,
                                       unsigned min_img_filter,
                                       unsigned mag_img_filter)
{
   switch (mode) {
   case PIPE_TEX_WRAP_REPEAT:
   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
   case PIPE_TEX_WRAP_MIRROR_REPEAT:
   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
      return FALSE;
   case PIPE_TEX_WRAP_CLAMP:
   case PIPE_TEX_WRAP_MIRROR_CLAMP:
      if (min_img_filter == PIPE_TEX_FILTER_NEAREST &&
          mag_img_filter == PIPE_TEX_FILTER_NEAREST) {
         return FALSE;
      } else {
         return TRUE;
      }
   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
      return TRUE;
   default:
      assert(0 && "unexpected wrap mode");
      return FALSE;
   }
}


/**
 * Initialize lp_static_texture_state object with the gallium
 * texture/sampler_view state (this contains the parts which are
 * considered static).
 */
void
lp_sampler_static_texture_state(struct lp_static_texture_state *state,
                                const struct pipe_sampler_view *view)
{
   const struct pipe_resource *texture;

   memset(state, 0, sizeof *state);

   if (!view || !view->texture)
      return;

   texture = view->texture;

   state->format = view->format;
   state->swizzle_r = view->swizzle_r;
   state->swizzle_g = view->swizzle_g;
   state->swizzle_b = view->swizzle_b;
   state->swizzle_a = view->swizzle_a;

   state->target = texture->target;
   state->pot_width = util_is_power_of_two(texture->width0);
   state->pot_height = util_is_power_of_two(texture->height0);
   state->pot_depth = util_is_power_of_two(texture->depth0);
   state->level_zero_only = !view->u.tex.last_level;

   /*
    * the layer / element / level parameters are all either dynamic
    * state or handled transparently wrt execution.
    */
}


/**
 * Initialize lp_static_sampler_state object with the gallium sampler
 * state (this contains the parts which are considered static).
 */
void
lp_sampler_static_sampler_state(struct lp_static_sampler_state *state,
                                const struct pipe_sampler_state *sampler)
{
   memset(state, 0, sizeof *state);

   if (!sampler)
      return;

   /*
    * We don't copy sampler state over unless it is actually enabled, to avoid
    * spurious recompiles, as the sampler static state is part of the shader
    * key.
    *
    * Ideally the state tracker or cso_cache module would make all state
    * canonical, but until that happens it's better to be safe than sorry here.
    *
    * XXX: Actually there's much more that can be done here, especially
    * regarding 1D/2D/3D/CUBE textures, wrap modes, etc.
    */

   state->wrap_s = sampler->wrap_s;
   state->wrap_t = sampler->wrap_t;
   state->wrap_r = sampler->wrap_r;
   state->min_img_filter = sampler->min_img_filter;
   state->mag_img_filter = sampler->mag_img_filter;

   if (sampler->max_lod > 0.0f) {
      state->min_mip_filter = sampler->min_mip_filter;
   } else {
      state->min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
   }

   if (state->min_mip_filter != PIPE_TEX_MIPFILTER_NONE) {
      if (sampler->lod_bias != 0.0f) {
         state->lod_bias_non_zero = 1;
      }

      /* If min_lod == max_lod we can greatly simplify mipmap selection.
       * This is a case that occurs during automatic mipmap generation.
       */
      if (sampler->min_lod == sampler->max_lod) {
         state->min_max_lod_equal = 1;
      } else {
         if (sampler->min_lod > 0.0f) {
            state->apply_min_lod = 1;
         }

         /*
          * XXX this won't do anything with the mesa state tracker, which
          * never sets max_lod higher than the actually present mip levels...
          */
         if (sampler->max_lod < (PIPE_MAX_TEXTURE_LEVELS - 1)) {
            state->apply_max_lod = 1;
         }
      }
   }

   state->compare_mode = sampler->compare_mode;
   if (sampler->compare_mode != PIPE_TEX_COMPARE_NONE) {
      state->compare_func = sampler->compare_func;
   }

   state->normalized_coords = sampler->normalized_coords;
}


/**
 * Generate code to compute coordinate gradient (rho).
 * \param derivs  partial derivatives of (s, t, r, q) with respect to X and Y
 *
 * The resulting rho is scalar per quad.
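 *
 * As an illustration (not part of the original comment), with the default
 * rho approximation, implicit derivatives and a 2D texture, this computes
 * per quad:
 *
 *    rho = max(max(|ds/dx|, |ds/dy|) * width,
 *              max(|dt/dx|, |dt/dy|) * height)
 *
 * whereas with GALLIVM_DEBUG_NO_RHO_APPROX set it computes the maximum of
 * the scaled gradient lengths:
 *
 *    rho = max(sqrt((ds/dx * width)^2 + (dt/dx * height)^2),
 *              sqrt((ds/dy * width)^2 + (dt/dy * height)^2))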
 */
static LLVMValueRef
lp_build_rho(struct lp_build_sample_context *bld,
             unsigned texture_unit,
             LLVMValueRef s,
             LLVMValueRef t,
             LLVMValueRef r,
             LLVMValueRef cube_rho,
             const struct lp_derivatives *derivs)
{
   struct gallivm_state *gallivm = bld->gallivm;
   struct lp_build_context *int_size_bld = &bld->int_size_in_bld;
   struct lp_build_context *float_size_bld = &bld->float_size_in_bld;
   struct lp_build_context *float_bld = &bld->float_bld;
   struct lp_build_context *coord_bld = &bld->coord_bld;
   struct lp_build_context *levelf_bld = &bld->levelf_bld;
   const unsigned dims = bld->dims;
   LLVMValueRef ddx_ddy[2];
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
   LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
   LLVMValueRef index1 = LLVMConstInt(i32t, 1, 0);
   LLVMValueRef index2 = LLVMConstInt(i32t, 2, 0);
   LLVMValueRef rho_vec;
   LLVMValueRef int_size, float_size;
   LLVMValueRef rho;
   LLVMValueRef first_level, first_level_vec;
   unsigned length = coord_bld->type.length;
   unsigned num_quads = length / 4;
   unsigned i;
   LLVMValueRef i32undef = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
   LLVMValueRef rho_xvec, rho_yvec;

   /* Note that all simplified calculations will only work for isotropic filtering */

   assert(bld->num_lods != length);

   first_level = bld->dynamic_state->first_level(bld->dynamic_state,
                                                 bld->gallivm, texture_unit);
   first_level_vec = lp_build_broadcast_scalar(int_size_bld, first_level);
   int_size = lp_build_minify(int_size_bld, bld->int_size, first_level_vec);
   float_size = lp_build_int_to_float(float_size_bld, int_size);

   if (cube_rho) {
      LLVMValueRef cubesize;
      LLVMValueRef index0 = lp_build_const_int32(gallivm, 0);
      /*
       * The cube map code already did everything except the size mul and
       * the per-quad extraction.
       */
      rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
                                      levelf_bld->type, cube_rho, 0);
      if (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) {
         rho = lp_build_sqrt(levelf_bld, rho);
      }
      /* Could optimize this for the single-quad case by skipping the broadcast */
      cubesize = lp_build_extract_broadcast(gallivm, bld->float_size_in_type,
                                            levelf_bld->type, float_size, index0);
      rho = lp_build_mul(levelf_bld, cubesize, rho);
   }
   else if (derivs && !(bld->static_texture_state->target == PIPE_TEXTURE_CUBE)) {
      LLVMValueRef ddmax[3], ddx[3], ddy[3];
      for (i = 0; i < dims; i++) {
         LLVMValueRef floatdim;
         LLVMValueRef indexi = lp_build_const_int32(gallivm, i);

         floatdim = lp_build_extract_broadcast(gallivm, bld->float_size_in_type,
                                               coord_bld->type, float_size, indexi);

         if ((gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) && (dims > 1)) {
            ddx[i] = lp_build_mul(coord_bld, floatdim, derivs->ddx[i]);
            ddy[i] = lp_build_mul(coord_bld, floatdim, derivs->ddy[i]);
            ddx[i] = lp_build_mul(coord_bld, ddx[i], ddx[i]);
            ddy[i] = lp_build_mul(coord_bld, ddy[i], ddy[i]);
         }
         else {
            LLVMValueRef tmpx, tmpy;
            tmpx = lp_build_abs(coord_bld, derivs->ddx[i]);
            tmpy = lp_build_abs(coord_bld, derivs->ddy[i]);
            ddmax[i] = lp_build_max(coord_bld, tmpx, tmpy);
            ddmax[i] = lp_build_mul(coord_bld, floatdim, ddmax[i]);
         }
      }
      if ((gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) && (dims > 1)) {
         rho_xvec = lp_build_add(coord_bld, ddx[0], ddx[1]);
         rho_yvec = lp_build_add(coord_bld, ddy[0], ddy[1]);
         if (dims > 2) {
            rho_xvec = lp_build_add(coord_bld, rho_xvec, ddx[2]);
            rho_yvec = lp_build_add(coord_bld, rho_yvec, ddy[2]);
         }
         rho_vec = lp_build_max(coord_bld, rho_xvec, rho_yvec);
         rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
                                         levelf_bld->type, rho_vec, 0);
         /*
          * Note that as long as we don't care about per-pixel lod we could
          * reduce the math further (at some shuffle cost), but for now only
          * do the sqrt after packing.
          */
         rho = lp_build_sqrt(levelf_bld, rho);
      }
      else {
         rho_vec = ddmax[0];
         if (dims > 1) {
            rho_vec = lp_build_max(coord_bld, rho_vec, ddmax[1]);
            if (dims > 2) {
               rho_vec = lp_build_max(coord_bld, rho_vec, ddmax[2]);
            }
         }
         /*
          * rho_vec now still contains per-pixel rho, convert to scalar per quad
          * since we can't handle per-pixel rho/lod from now on (TODO).
          */
         rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
                                         levelf_bld->type, rho_vec, 0);
      }
   }
   else {
      /*
       * This looks all a bit complex, but it's not that bad
       * (the shuffle code makes it look worse than it is).
       * Still, it might not be ideal for all cases.
       */
      static const unsigned char swizzle0[] = { /* no-op swizzle */
         0, LP_BLD_SWIZZLE_DONTCARE,
         LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
      };
      static const unsigned char swizzle1[] = {
         1, LP_BLD_SWIZZLE_DONTCARE,
         LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
      };
      static const unsigned char swizzle2[] = {
         2, LP_BLD_SWIZZLE_DONTCARE,
         LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
      };

      if (dims < 2) {
         ddx_ddy[0] = lp_build_packed_ddx_ddy_onecoord(coord_bld, s);
      }
      else if (dims >= 2) {
         ddx_ddy[0] = lp_build_packed_ddx_ddy_twocoord(coord_bld, s, t);
         if (dims > 2) {
            ddx_ddy[1] = lp_build_packed_ddx_ddy_onecoord(coord_bld, r);
         }
      }

      if ((gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) && (dims > 1)) {
         static const unsigned char swizzle01[] = { /* no-op swizzle */
            0, 1,
            LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
         };
         static const unsigned char swizzle23[] = {
            2, 3,
            LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
         };
         LLVMValueRef ddx_ddys, ddx_ddyt, floatdim, shuffles[LP_MAX_VECTOR_LENGTH / 4];

         for (i = 0; i < num_quads; i++) {
            shuffles[i*4+0] = shuffles[i*4+1] = index0;
            shuffles[i*4+2] = shuffles[i*4+3] = index1;
         }
         floatdim = LLVMBuildShuffleVector(builder, float_size, float_size,
                                           LLVMConstVector(shuffles, length), "");
         ddx_ddy[0] = lp_build_mul(coord_bld, ddx_ddy[0], floatdim);
         ddx_ddy[0] = lp_build_mul(coord_bld, ddx_ddy[0], ddx_ddy[0]);
         ddx_ddys = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle01);
         ddx_ddyt = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle23);
         rho_vec = lp_build_add(coord_bld, ddx_ddys, ddx_ddyt);

         if (dims > 2) {
            static const unsigned char swizzle02[] = {
               0, 2,
               LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
            };
            floatdim = lp_build_extract_broadcast(gallivm, bld->float_size_in_type,
                                                  coord_bld->type, float_size, index2);
            ddx_ddy[1] = lp_build_mul(coord_bld, ddx_ddy[1], floatdim);
            ddx_ddy[1] = lp_build_mul(coord_bld, ddx_ddy[1], ddx_ddy[1]);
            ddx_ddy[1] = lp_build_swizzle_aos(coord_bld, ddx_ddy[1], swizzle02);
            rho_vec = lp_build_add(coord_bld, rho_vec, ddx_ddy[1]);
         }
         rho_xvec = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle0);
         rho_yvec = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle1);
         rho_vec = lp_build_max(coord_bld, rho_xvec, rho_yvec);

         rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
                                         levelf_bld->type, rho_vec, 0);
         rho = lp_build_sqrt(levelf_bld, rho);
      }
      else {
         ddx_ddy[0] = lp_build_abs(coord_bld, ddx_ddy[0]);
         if (dims > 2) {
            ddx_ddy[1] = lp_build_abs(coord_bld, ddx_ddy[1]);
         }

         if (dims < 2) {
            rho_xvec = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle0);
            rho_yvec = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle1);
         }
         else if (dims == 2) {
            static const unsigned char swizzle02[] = {
               0, 2,
               LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
            };
            static const unsigned char swizzle13[] = {
               1, 3,
               LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
            };
            rho_xvec = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle02);
            rho_yvec = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle13);
         }
         else {
            LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH];
            LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH];
            assert(dims == 3);
            for (i = 0; i < num_quads; i++) {
               shuffles1[4*i + 0] = lp_build_const_int32(gallivm, 4*i);
               shuffles1[4*i + 1] = lp_build_const_int32(gallivm, 4*i + 2);
               shuffles1[4*i + 2] = lp_build_const_int32(gallivm, length + 4*i);
               shuffles1[4*i + 3] = i32undef;
               shuffles2[4*i + 0] = lp_build_const_int32(gallivm, 4*i + 1);
               shuffles2[4*i + 1] = lp_build_const_int32(gallivm, 4*i + 3);
               shuffles2[4*i + 2] = lp_build_const_int32(gallivm, length + 4*i + 2);
               shuffles2[4*i + 3] = i32undef;
            }
            rho_xvec = LLVMBuildShuffleVector(builder, ddx_ddy[0], ddx_ddy[1],
                                              LLVMConstVector(shuffles1, length), "");
            rho_yvec = LLVMBuildShuffleVector(builder, ddx_ddy[0], ddx_ddy[1],
                                              LLVMConstVector(shuffles2, length), "");
         }

         rho_vec = lp_build_max(coord_bld, rho_xvec, rho_yvec);

         if (bld->coord_type.length > 4) {
            /* expand size to each quad */
            if (dims > 1) {
               /* could use some broadcast_vector helper for this? */
               LLVMValueRef src[LP_MAX_VECTOR_LENGTH/4];
               for (i = 0; i < num_quads; i++) {
                  src[i] = float_size;
               }
               float_size = lp_build_concat(bld->gallivm, src, float_size_bld->type, num_quads);
            }
            else {
               float_size = lp_build_broadcast_scalar(coord_bld, float_size);
            }
            rho_vec = lp_build_mul(coord_bld, rho_vec, float_size);

            if (dims <= 1) {
               rho = rho_vec;
            }
            else {
               if (dims >= 2) {
                  LLVMValueRef rho_s, rho_t, rho_r;

                  rho_s = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle0);
                  rho_t = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle1);

                  rho = lp_build_max(coord_bld, rho_s, rho_t);

                  if (dims >= 3) {
                     rho_r = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle2);
                     rho = lp_build_max(coord_bld, rho, rho_r);
                  }
               }
            }
            rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
                                            levelf_bld->type, rho, 0);
         }
         else {
            if (dims <= 1) {
               rho_vec = LLVMBuildExtractElement(builder, rho_vec, index0, "");
            }
            rho_vec = lp_build_mul(float_size_bld, rho_vec, float_size);

            if (dims <= 1) {
               rho = rho_vec;
            }
            else {
               if (dims >= 2) {
                  LLVMValueRef rho_s, rho_t, rho_r;

                  rho_s = LLVMBuildExtractElement(builder, rho_vec, index0, "");
                  rho_t = LLVMBuildExtractElement(builder, rho_vec, index1, "");

                  rho = lp_build_max(float_bld, rho_s, rho_t);

                  if (dims >= 3) {
                     rho_r = LLVMBuildExtractElement(builder, rho_vec, index2, "");
                     rho = lp_build_max(float_bld, rho, rho_r);
                  }
               }
            }
         }
      }
   }

   return rho;
}


/*
 * Bri-linear lod computation
 *
 * Use a piece-wise linear approximation of log2 such that:
 * - round to nearest, for values in the neighborhood of -1, 0, 1, 2, etc.
 * - linear approximation for values in the neighborhood of 0.5, 1.5, etc.,
 *   with the steepness specified in 'factor'
 * - exact result for 0.5, 1.5, etc.
 *
 *
 *   1.0 -              /----*
 *                     /
 *                    /
 *                   /
 *   0.5 -          *
 *                 /
 *                /
 *               /
 *   0.0 - *----/
 *
 *         |                 |
 *        2^0               2^1
 *
 * This is a technique also commonly used in hardware:
 * - http://ixbtlabs.com/articles2/gffx/nv40-rx800-3.html
 *
 * TODO: For correctness, this should only be applied when texture is known to
 * have regular mipmaps, i.e., mipmaps derived from the base level.
 *
 * TODO: This could be done in fixed point, where applicable.
 */
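/*
 * Worked example (illustration, with the actual BRILINEAR_FACTOR == 2):
 * pre_offset = (2 - 0.5)/2 - 0.5 = 0.25 and post_offset = 1 - 2 = -1, so
 * lp_build_brilinear_lod() below computes:
 *
 *    lod = 0.5:  0.5 + 0.25 = 0.75 -> ipart = 0, fpart = 0.75*2 - 1 = 0.5
 *                (exact, as promised for the half-way points)
 *    lod = 0.1:  0.1 + 0.25 = 0.35 -> ipart = 0, fpart = 0.35*2 - 1 = -0.3
 *                (non-positive fpart, i.e. rounds to the nearest level 0)
 *    lod = 0.9:  0.9 + 0.25 = 1.15 -> ipart = 1, fpart = 0.15*2 - 1 = -0.7
 *                (likewise rounds to the nearest level 1)
 */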
static void
lp_build_brilinear_lod(struct lp_build_context *bld,
                       LLVMValueRef lod,
                       double factor,
                       LLVMValueRef *out_lod_ipart,
                       LLVMValueRef *out_lod_fpart)
{
   LLVMValueRef lod_fpart;
   double pre_offset = (factor - 0.5)/factor - 0.5;
   double post_offset = 1 - factor;

   if (0) {
      lp_build_printf(bld->gallivm, "lod = %f\n", lod);
   }

   lod = lp_build_add(bld, lod,
                      lp_build_const_vec(bld->gallivm, bld->type, pre_offset));

   lp_build_ifloor_fract(bld, lod, out_lod_ipart, &lod_fpart);

   lod_fpart = lp_build_mul(bld, lod_fpart,
                            lp_build_const_vec(bld->gallivm, bld->type, factor));

   lod_fpart = lp_build_add(bld, lod_fpart,
                            lp_build_const_vec(bld->gallivm, bld->type, post_offset));

   /*
    * It's not necessary to clamp lod_fpart since:
    * - the above expression will never produce numbers greater than one.
    * - the mip filtering branch is only taken if lod_fpart is positive
    */

   *out_lod_fpart = lod_fpart;

   if (0) {
      lp_build_printf(bld->gallivm, "lod_ipart = %i\n", *out_lod_ipart);
      lp_build_printf(bld->gallivm, "lod_fpart = %f\n\n", *out_lod_fpart);
   }
}


/*
 * Combined log2 and brilinear lod computation.
 *
 * It's identical to calling lp_build_fast_log2() and
 * lp_build_brilinear_lod() above, but by combining the two we can compute
 * the integer and fractional part independently.
 */
static void
lp_build_brilinear_rho(struct lp_build_context *bld,
                       LLVMValueRef rho,
                       double factor,
                       LLVMValueRef *out_lod_ipart,
                       LLVMValueRef *out_lod_fpart)
{
   LLVMValueRef lod_ipart;
   LLVMValueRef lod_fpart;

   const double pre_factor = (2*factor - 0.5)/(M_SQRT2*factor);
   const double post_offset = 1 - 2*factor;

   assert(bld->type.floating);

   assert(lp_check_value(bld->type, rho));

   /*
    * The pre factor will make the intersections with the exact powers of two
    * happen precisely where we want them to be, which means that the integer
    * part will not need any post adjustments.
    */
   rho = lp_build_mul(bld, rho,
                      lp_build_const_vec(bld->gallivm, bld->type, pre_factor));

   /* ipart = ifloor(log2(rho)) */
   lod_ipart = lp_build_extract_exponent(bld, rho, 0);

   /* fpart = rho / 2**ipart */
   lod_fpart = lp_build_extract_mantissa(bld, rho);

   lod_fpart = lp_build_mul(bld, lod_fpart,
                            lp_build_const_vec(bld->gallivm, bld->type, factor));

   lod_fpart = lp_build_add(bld, lod_fpart,
                            lp_build_const_vec(bld->gallivm, bld->type, post_offset));

   /*
    * Like lp_build_brilinear_lod, it's not necessary to clamp lod_fpart since:
    * - the above expression will never produce numbers greater than one.
    * - the mip filtering branch is only taken if lod_fpart is positive
    */

   *out_lod_ipart = lod_ipart;
   *out_lod_fpart = lod_fpart;
}
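

/*
 * Illustrative check (with factor == 2, i.e. pre_factor = 3.5/(2*M_SQRT2)
 * ~= 1.237 and post_offset = 1 - 4 = -3):
 *
 *    rho = 1       (lod 0.0): 1*1.237 -> ipart = 0, mantissa ~= 1.237,
 *                  fpart = 1.237*2 - 3 ~= -0.53 (rounds to level 0)
 *    rho = sqrt(2) (lod 0.5): sqrt(2)*1.237 = 1.75 -> ipart = 0,
 *                  mantissa = 1.75, fpart = 1.75*2 - 3 = 0.5
 *
 * i.e. the same results the separate fast_log2 + brilinear_lod path yields.
 */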


/**
 * Generate code to compute texture level of detail (lambda).
 * \param derivs  partial derivatives of (s, t, r, q) with respect to X and Y
 * \param lod_bias  optional float vector with the shader lod bias
 * \param explicit_lod  optional float vector with the explicit lod
 * \param width  scalar int texture width
 * \param height  scalar int texture height
 * \param depth  scalar int texture depth
 *
 * The resulting lod is scalar per quad, unless bld->num_lods matches the
 * coord vector length, in which case the explicit lod is used per element;
 * otherwise only the first value per quad of lod_bias / explicit_lod is used.
 */
void
lp_build_lod_selector(struct lp_build_sample_context *bld,
                      unsigned texture_unit,
                      unsigned sampler_unit,
                      LLVMValueRef s,
                      LLVMValueRef t,
                      LLVMValueRef r,
                      LLVMValueRef cube_rho,
                      const struct lp_derivatives *derivs,
                      LLVMValueRef lod_bias, /* optional */
                      LLVMValueRef explicit_lod, /* optional */
                      unsigned mip_filter,
                      LLVMValueRef *out_lod_ipart,
                      LLVMValueRef *out_lod_fpart)

{
   LLVMBuilderRef builder = bld->gallivm->builder;
   struct lp_build_context *levelf_bld = &bld->levelf_bld;
   LLVMValueRef lod;

   *out_lod_ipart = bld->leveli_bld.zero;
   *out_lod_fpart = levelf_bld->zero;

   if (bld->static_sampler_state->min_max_lod_equal) {
      /* User is forcing sampling from a particular mipmap level.
       * This is hit during mipmap generation.
       */
      LLVMValueRef min_lod =
         bld->dynamic_state->min_lod(bld->dynamic_state,
                                     bld->gallivm, sampler_unit);

      lod = lp_build_broadcast_scalar(levelf_bld, min_lod);
   }
   else {
      if (explicit_lod) {
         if (bld->num_lods != bld->coord_type.length)
            lod = lp_build_pack_aos_scalars(bld->gallivm, bld->coord_bld.type,
                                            levelf_bld->type, explicit_lod, 0);
         else
            lod = explicit_lod;
      }
      else {
         LLVMValueRef rho;

         rho = lp_build_rho(bld, texture_unit, s, t, r, cube_rho, derivs);

         /*
          * Compute lod = log2(rho)
          */

         if (!lod_bias &&
             !bld->static_sampler_state->lod_bias_non_zero &&
             !bld->static_sampler_state->apply_max_lod &&
             !bld->static_sampler_state->apply_min_lod) {
            /*
             * Special case when there are no post-log2 adjustments, which
             * saves instructions by keeping the integer and fractional lod
             * computations separate from the start.
             */

            if (mip_filter == PIPE_TEX_MIPFILTER_NONE ||
                mip_filter == PIPE_TEX_MIPFILTER_NEAREST) {
               *out_lod_ipart = lp_build_ilog2(levelf_bld, rho);
               *out_lod_fpart = levelf_bld->zero;
               return;
            }
            if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR &&
                !(gallivm_debug & GALLIVM_DEBUG_NO_BRILINEAR)) {
               lp_build_brilinear_rho(levelf_bld, rho, BRILINEAR_FACTOR,
                                      out_lod_ipart, out_lod_fpart);
               return;
            }
         }

         if (0) {
            lod = lp_build_log2(levelf_bld, rho);
         }
         else {
            lod = lp_build_fast_log2(levelf_bld, rho);
         }

         /* add shader lod bias */
         if (lod_bias) {
            lod_bias = lp_build_pack_aos_scalars(bld->gallivm, bld->coord_bld.type,
                                                 levelf_bld->type, lod_bias, 0);
            lod = LLVMBuildFAdd(builder, lod, lod_bias, "shader_lod_bias");
         }
      }

      /* add sampler lod bias */
      if (bld->static_sampler_state->lod_bias_non_zero) {
         LLVMValueRef sampler_lod_bias =
            bld->dynamic_state->lod_bias(bld->dynamic_state,
                                         bld->gallivm, sampler_unit);
         sampler_lod_bias = lp_build_broadcast_scalar(levelf_bld,
                                                      sampler_lod_bias);
         lod = LLVMBuildFAdd(builder, lod, sampler_lod_bias, "sampler_lod_bias");
      }

      /* clamp lod */
      if (bld->static_sampler_state->apply_max_lod) {
         LLVMValueRef max_lod =
            bld->dynamic_state->max_lod(bld->dynamic_state,
                                        bld->gallivm, sampler_unit);
         max_lod = lp_build_broadcast_scalar(levelf_bld, max_lod);

         lod = lp_build_min(levelf_bld, lod, max_lod);
      }
      if (bld->static_sampler_state->apply_min_lod) {
         LLVMValueRef min_lod =
            bld->dynamic_state->min_lod(bld->dynamic_state,
                                        bld->gallivm, sampler_unit);
         min_lod = lp_build_broadcast_scalar(levelf_bld, min_lod);

         lod = lp_build_max(levelf_bld, lod, min_lod);
      }
   }

   if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
      if (!(gallivm_debug & GALLIVM_DEBUG_NO_BRILINEAR)) {
         lp_build_brilinear_lod(levelf_bld, lod, BRILINEAR_FACTOR,
                                out_lod_ipart, out_lod_fpart);
      }
      else {
         lp_build_ifloor_fract(levelf_bld, lod, out_lod_ipart, out_lod_fpart);
      }

      lp_build_name(*out_lod_fpart, "lod_fpart");
   }
   else {
      *out_lod_ipart = lp_build_iround(levelf_bld, lod);
   }

   lp_build_name(*out_lod_ipart, "lod_ipart");

   return;
}
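

/*
 * For reference, a scalar sketch of what the selector above computes
 * (ignoring the min_max_lod_equal shortcut and the brilinear fast paths):
 *
 *    lod = explicit_lod ? explicit_lod : log2(rho);
 *    lod += shader_lod_bias + sampler_lod_bias;
 *    lod = CLAMP(lod, min_lod, max_lod);
 *    if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
 *       lod_ipart = ifloor(lod);
 *       lod_fpart = lod - lod_ipart;
 *    } else {
 *       lod_ipart = round(lod);
 *    }
 */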


/**
 * For PIPE_TEX_MIPFILTER_NEAREST, convert float LOD to integer
 * mipmap level index.
 * Note: this is all scalar per quad code.
 * \param lod_ipart  int texture level of detail
 * \param level_out  returns integer mipmap level
 */
void
lp_build_nearest_mip_level(struct lp_build_sample_context *bld,
                           unsigned texture_unit,
                           LLVMValueRef lod_ipart,
                           LLVMValueRef *level_out)
{
   struct lp_build_context *leveli_bld = &bld->leveli_bld;
   LLVMValueRef first_level, last_level, level;

   first_level = bld->dynamic_state->first_level(bld->dynamic_state,
                                                 bld->gallivm, texture_unit);
   last_level = bld->dynamic_state->last_level(bld->dynamic_state,
                                               bld->gallivm, texture_unit);
   first_level = lp_build_broadcast_scalar(leveli_bld, first_level);
   last_level = lp_build_broadcast_scalar(leveli_bld, last_level);

   level = lp_build_add(leveli_bld, lod_ipart, first_level);

   /* clamp level to legal range of levels */
   *level_out = lp_build_clamp(leveli_bld, level, first_level, last_level);
}
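

/*
 * I.e. the scalar equivalent (a sketch) is simply:
 *
 *    *level_out = CLAMP(lod_ipart + first_level, first_level, last_level);
 */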


/**
 * For PIPE_TEX_MIPFILTER_LINEAR, convert per-quad int LOD(s) to two (per-quad)
 * (adjacent) mipmap level indexes, and fix up float lod part accordingly.
 * Later, we'll sample from those two mipmap levels and interpolate between them.
 */
void
lp_build_linear_mip_levels(struct lp_build_sample_context *bld,
                           unsigned texture_unit,
                           LLVMValueRef lod_ipart,
                           LLVMValueRef *lod_fpart_inout,
                           LLVMValueRef *level0_out,
                           LLVMValueRef *level1_out)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   struct lp_build_context *leveli_bld = &bld->leveli_bld;
   struct lp_build_context *levelf_bld = &bld->levelf_bld;
   LLVMValueRef first_level, last_level;
   LLVMValueRef clamp_min;
   LLVMValueRef clamp_max;

   first_level = bld->dynamic_state->first_level(bld->dynamic_state,
                                                 bld->gallivm, texture_unit);
   last_level = bld->dynamic_state->last_level(bld->dynamic_state,
                                               bld->gallivm, texture_unit);
   first_level = lp_build_broadcast_scalar(leveli_bld, first_level);
   last_level = lp_build_broadcast_scalar(leveli_bld, last_level);

   *level0_out = lp_build_add(leveli_bld, lod_ipart, first_level);
   *level1_out = lp_build_add(leveli_bld, *level0_out, leveli_bld->one);

   /*
    * Clamp both *level0_out and *level1_out to [first_level, last_level], with
    * the minimum number of comparisons, and zeroing lod_fpart in the extreme
    * ends in the process.
    */

   /*
    * This code (the vector select in particular) only works with llvm 3.1
    * or later (if there's more than one quad, with the x86 backend). Might
    * consider converting to our lp_bld_logic helpers.
    */
#if HAVE_LLVM < 0x0301
   assert(leveli_bld->type.length == 1);
#endif

   /* *level0_out < first_level */
   clamp_min = LLVMBuildICmp(builder, LLVMIntSLT,
                             *level0_out, first_level,
                             "clamp_lod_to_first");

   *level0_out = LLVMBuildSelect(builder, clamp_min,
                                 first_level, *level0_out, "");

   *level1_out = LLVMBuildSelect(builder, clamp_min,
                                 first_level, *level1_out, "");

   *lod_fpart_inout = LLVMBuildSelect(builder, clamp_min,
                                      levelf_bld->zero, *lod_fpart_inout, "");

   /* *level0_out >= last_level */
   clamp_max = LLVMBuildICmp(builder, LLVMIntSGE,
                             *level0_out, last_level,
                             "clamp_lod_to_last");

   *level0_out = LLVMBuildSelect(builder, clamp_max,
                                 last_level, *level0_out, "");

   *level1_out = LLVMBuildSelect(builder, clamp_max,
                                 last_level, *level1_out, "");

   *lod_fpart_inout = LLVMBuildSelect(builder, clamp_max,
                                      levelf_bld->zero, *lod_fpart_inout, "");

   lp_build_name(*level0_out, "texture%u_miplevel0", texture_unit);
   lp_build_name(*level1_out, "texture%u_miplevel1", texture_unit);
   lp_build_name(*lod_fpart_inout, "texture%u_mipweight", texture_unit);
}
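

/*
 * A scalar sketch of the clamping above (illustration only):
 *
 *    level0 = lod_ipart + first_level;
 *    level1 = level0 + 1;
 *    if (level0 < first_level)
 *       level0 = level1 = first_level, lod_fpart = 0;   (negative lod)
 *    if (level0 >= last_level)
 *       level0 = level1 = last_level, lod_fpart = 0;    (past smallest mip)
 */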


/**
 * Return pointer to a single mipmap level.
 * \param level  integer mipmap level
 */
LLVMValueRef
lp_build_get_mipmap_level(struct lp_build_sample_context *bld,
                          LLVMValueRef level)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef indexes[2], data_ptr, mip_offset;

   indexes[0] = lp_build_const_int32(bld->gallivm, 0);
   indexes[1] = level;
   mip_offset = LLVMBuildGEP(builder, bld->mip_offsets, indexes, 2, "");
   mip_offset = LLVMBuildLoad(builder, mip_offset, "");
   data_ptr = LLVMBuildGEP(builder, bld->base_ptr, &mip_offset, 1, "");
   return data_ptr;
}

/**
 * Return (per-pixel) offsets to mip levels.
 * \param level  integer mipmap level
 */
LLVMValueRef
lp_build_get_mip_offsets(struct lp_build_sample_context *bld,
                         LLVMValueRef level)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef indexes[2], offsets, offset1;

   indexes[0] = lp_build_const_int32(bld->gallivm, 0);
   if (bld->num_lods == 1) {
      indexes[1] = level;
      offset1 = LLVMBuildGEP(builder, bld->mip_offsets, indexes, 2, "");
      offset1 = LLVMBuildLoad(builder, offset1, "");
      offsets = lp_build_broadcast_scalar(&bld->int_coord_bld, offset1);
   }
   else if (bld->num_lods == bld->coord_bld.type.length / 4) {
      unsigned i;

      offsets = bld->int_coord_bld.undef;
      for (i = 0; i < bld->num_lods; i++) {
         LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
         LLVMValueRef indexo = lp_build_const_int32(bld->gallivm, 4 * i);
         indexes[1] = LLVMBuildExtractElement(builder, level, indexi, "");
         offset1 = LLVMBuildGEP(builder, bld->mip_offsets, indexes, 2, "");
         offset1 = LLVMBuildLoad(builder, offset1, "");
         offsets = LLVMBuildInsertElement(builder, offsets, offset1, indexo, "");
      }
      offsets = lp_build_swizzle_scalar_aos(&bld->int_coord_bld, offsets, 0, 4);
   }
   else {
      unsigned i;

      assert(bld->num_lods == bld->coord_bld.type.length);

      offsets = bld->int_coord_bld.undef;
      for (i = 0; i < bld->num_lods; i++) {
         LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
         indexes[1] = LLVMBuildExtractElement(builder, level, indexi, "");
         offset1 = LLVMBuildGEP(builder, bld->mip_offsets, indexes, 2, "");
         offset1 = LLVMBuildLoad(builder, offset1, "");
         offsets = LLVMBuildInsertElement(builder, offsets, offset1, indexi, "");
      }
   }
   return offsets;
}


/**
 * Codegen equivalent for u_minify().
 * Return max(1, base_size >> level);
 */
LLVMValueRef
lp_build_minify(struct lp_build_context *bld,
                LLVMValueRef base_size,
                LLVMValueRef level)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   assert(lp_check_value(bld->type, base_size));
   assert(lp_check_value(bld->type, level));

   if (level == bld->zero) {
      /* if we're using mipmap level zero, no minification is needed */
      return base_size;
   }
   else {
      LLVMValueRef size =
         LLVMBuildLShr(builder, base_size, level, "minify");
      assert(bld->type.sign);
      size = lp_build_max(bld, size, bld->one);
      return size;
   }
}
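

/*
 * E.g. a base size of 67 at level 3 yields max(1, 67 >> 3) = 8, and any
 * level >= 7 yields 1, matching what u_minify() computes on the CPU side.
 */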


/**
 * Dereference stride_array[mipmap_level] array to get a stride.
 * Return stride as a vector.
 */
static LLVMValueRef
lp_build_get_level_stride_vec(struct lp_build_sample_context *bld,
                              LLVMValueRef stride_array, LLVMValueRef level)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef indexes[2], stride, stride1;
   indexes[0] = lp_build_const_int32(bld->gallivm, 0);
   if (bld->num_lods == 1) {
      indexes[1] = level;
      stride1 = LLVMBuildGEP(builder, stride_array, indexes, 2, "");
      stride1 = LLVMBuildLoad(builder, stride1, "");
      stride = lp_build_broadcast_scalar(&bld->int_coord_bld, stride1);
   }
   else if (bld->num_lods == bld->coord_bld.type.length / 4) {
      LLVMValueRef stride1;
      unsigned i;

      stride = bld->int_coord_bld.undef;
      for (i = 0; i < bld->num_lods; i++) {
         LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
         LLVMValueRef indexo = lp_build_const_int32(bld->gallivm, 4 * i);
         indexes[1] = LLVMBuildExtractElement(builder, level, indexi, "");
         stride1 = LLVMBuildGEP(builder, stride_array, indexes, 2, "");
         stride1 = LLVMBuildLoad(builder, stride1, "");
         stride = LLVMBuildInsertElement(builder, stride, stride1, indexo, "");
      }
      stride = lp_build_swizzle_scalar_aos(&bld->int_coord_bld, stride, 0, 4);
   }
   else {
      LLVMValueRef stride1;
      unsigned i;

      assert(bld->num_lods == bld->coord_bld.type.length);

      stride = bld->int_coord_bld.undef;
      for (i = 0; i < bld->coord_bld.type.length; i++) {
         LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
         indexes[1] = LLVMBuildExtractElement(builder, level, indexi, "");
         stride1 = LLVMBuildGEP(builder, stride_array, indexes, 2, "");
         stride1 = LLVMBuildLoad(builder, stride1, "");
         stride = LLVMBuildInsertElement(builder, stride, stride1, indexi, "");
      }
   }
   return stride;
}


/**
 * When sampling a mipmap, we need to compute the width, height, depth
 * of the source levels from the level indexes.  This helper function
 * does that.
 */
void
lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld,
                            LLVMValueRef ilevel,
                            LLVMValueRef *out_size,
                            LLVMValueRef *row_stride_vec,
                            LLVMValueRef *img_stride_vec)
{
   const unsigned dims = bld->dims;
   LLVMValueRef ilevel_vec;

   /*
    * Compute width, height, depth at mipmap level 'ilevel'
    */
   if (bld->num_lods == 1) {
      ilevel_vec = lp_build_broadcast_scalar(&bld->int_size_bld, ilevel);
      *out_size = lp_build_minify(&bld->int_size_bld, bld->int_size, ilevel_vec);
   }
   else {
      LLVMValueRef int_size_vec;
      LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
      unsigned num_quads = bld->coord_bld.type.length / 4;
      unsigned i;

      if (bld->num_lods == num_quads) {
         /*
          * XXX: this should be #ifndef SANE_INSTRUCTION_SET.
          * intel "forgot" the variable shift count instruction until avx2.
          * A harmless 8x32 shift gets translated into 32 instructions
          * (16 extracts, 8 scalar shifts, 8 inserts), llvm is apparently
          * unable to recognize if there are really just 2 different shift
          * count values. So do the shift 4-wide before expansion.
          */
         struct lp_build_context bld4;
         struct lp_type type4;

         type4 = bld->int_coord_bld.type;
         type4.length = 4;

         lp_build_context_init(&bld4, bld->gallivm, type4);

         if (bld->dims == 1) {
            assert(bld->int_size_in_bld.type.length == 1);
            int_size_vec = lp_build_broadcast_scalar(&bld4,
                                                     bld->int_size);
         }
         else {
            assert(bld->int_size_in_bld.type.length == 4);
            int_size_vec = bld->int_size;
         }

         for (i = 0; i < num_quads; i++) {
            LLVMValueRef ileveli;
            LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);

            ileveli = lp_build_extract_broadcast(bld->gallivm,
                                                 bld->leveli_bld.type,
                                                 bld4.type,
                                                 ilevel,
                                                 indexi);
            tmp[i] = lp_build_minify(&bld4, int_size_vec, ileveli);
         }
         /*
          * out_size is [w0, h0, d0, _, w1, h1, d1, _, ...] vector for dims > 1,
          * [w0, w0, w0, w0, w1, w1, w1, w1, ...] otherwise.
          */
         *out_size = lp_build_concat(bld->gallivm,
                                     tmp,
                                     bld4.type,
                                     num_quads);
      }
      else {
         /* FIXME: this is terrible and results in a _huge_ vector
          * (for the dims > 1 case).
          * Should refactor this (together with extract_image_sizes) and do
          * something more useful. Could for instance, if we have width and
          * height in a 4-wide vector, pack all elements into a 8xi16 vector
          * (on which we can still do useful math) instead of using a 16xi32
          * vector.
          * FIXME: some callers can't handle this yet.
          * For dims == 1 this will create [w0, w1, w2, w3, ...] vector.
          * For dims > 1 this will create [w0, h0, d0, _, w1, h1, d1, _, ...] vector.
          */
         assert(bld->num_lods == bld->coord_bld.type.length);
         if (bld->dims == 1) {
            assert(bld->int_size_bld.type.length == 1);
            int_size_vec = lp_build_broadcast_scalar(&bld->int_coord_bld,
                                                     bld->int_size);
            /* vector shift with variable shift count alert... */
            *out_size = lp_build_minify(&bld->int_coord_bld, int_size_vec, ilevel);
         }
         else {
            LLVMValueRef ilevel1;
            for (i = 0; i < bld->num_lods; i++) {
               LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
               ilevel1 = lp_build_extract_broadcast(bld->gallivm, bld->int_coord_type,
                                                    bld->int_size_in_bld.type, ilevel, indexi);
               tmp[i] = bld->int_size;
               tmp[i] = lp_build_minify(&bld->int_size_in_bld, tmp[i], ilevel1);
            }
            *out_size = lp_build_concat(bld->gallivm, tmp,
                                        bld->int_size_in_bld.type,
                                        bld->num_lods);
         }
      }
   }

   if (dims >= 2) {
      *row_stride_vec = lp_build_get_level_stride_vec(bld,
                                                      bld->row_stride_array,
                                                      ilevel);
   }
   if (dims == 3 ||
       bld->static_texture_state->target == PIPE_TEXTURE_CUBE ||
       bld->static_texture_state->target == PIPE_TEXTURE_1D_ARRAY ||
       bld->static_texture_state->target == PIPE_TEXTURE_2D_ARRAY) {
      *img_stride_vec = lp_build_get_level_stride_vec(bld,
                                                      bld->img_stride_array,
                                                      ilevel);
   }
}


/**
 * Extract and broadcast texture size.
 *
 * @param size_type  type of the texture size vector (either
 *                   bld->int_size_type or bld->float_size_type)
 * @param coord_type  type of the coordinate vector (either
 *                    bld->int_coord_type or bld->coord_type)
 * @param size  vector with the texture size (width, height, depth)
 */
void
lp_build_extract_image_sizes(struct lp_build_sample_context *bld,
                             struct lp_build_context *size_bld,
                             struct lp_type coord_type,
                             LLVMValueRef size,
                             LLVMValueRef *out_width,
                             LLVMValueRef *out_height,
                             LLVMValueRef *out_depth)
{
   const unsigned dims = bld->dims;
   LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
   struct lp_type size_type = size_bld->type;

   if (bld->num_lods == 1) {
      *out_width = lp_build_extract_broadcast(bld->gallivm,
                                              size_type,
                                              coord_type,
                                              size,
                                              LLVMConstInt(i32t, 0, 0));
      if (dims >= 2) {
         *out_height = lp_build_extract_broadcast(bld->gallivm,
                                                  size_type,
                                                  coord_type,
                                                  size,
                                                  LLVMConstInt(i32t, 1, 0));
         if (dims == 3) {
            *out_depth = lp_build_extract_broadcast(bld->gallivm,
                                                    size_type,
                                                    coord_type,
                                                    size,
                                                    LLVMConstInt(i32t, 2, 0));
         }
      }
   }
   else {
      unsigned num_quads = bld->coord_bld.type.length / 4;

      if (dims == 1) {
         *out_width = size;
      }
      else if (bld->num_lods == num_quads) {
         *out_width = lp_build_swizzle_scalar_aos(size_bld, size, 0, 4);
         if (dims >= 2) {
            *out_height = lp_build_swizzle_scalar_aos(size_bld, size, 1, 4);
            if (dims == 3) {
               *out_depth = lp_build_swizzle_scalar_aos(size_bld, size, 2, 4);
            }
         }
      }
      else {
         assert(bld->num_lods == bld->coord_type.length);
         *out_width = lp_build_pack_aos_scalars(bld->gallivm, size_type,
                                                coord_type, size, 0);
         if (dims >= 2) {
            *out_height = lp_build_pack_aos_scalars(bld->gallivm, size_type,
                                                    coord_type, size, 1);
            if (dims == 3) {
               *out_depth = lp_build_pack_aos_scalars(bld->gallivm, size_type,
                                                      coord_type, size, 2);
            }
         }
      }
   }
}


/**
 * Unnormalize coords.
 *
 * @param flt_size  vector with the float texture size (width, height, depth)
 */
void
lp_build_unnormalized_coords(struct lp_build_sample_context *bld,
                             LLVMValueRef flt_size,
                             LLVMValueRef *s,
                             LLVMValueRef *t,
                             LLVMValueRef *r)
{
   const unsigned dims = bld->dims;
   LLVMValueRef width;
   LLVMValueRef height;
   LLVMValueRef depth;

   lp_build_extract_image_sizes(bld,
                                &bld->float_size_bld,
                                bld->coord_type,
                                flt_size,
                                &width,
                                &height,
                                &depth);

   /* s = s * width, t = t * height */
   *s = lp_build_mul(&bld->coord_bld, *s, width);
   if (dims >= 2) {
      *t = lp_build_mul(&bld->coord_bld, *t, height);
      if (dims >= 3) {
         *r = lp_build_mul(&bld->coord_bld, *r, depth);
      }
   }
}
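

/*
 * E.g. for a 256x128 texture, normalized (s, t) = (0.25, 0.5) becomes
 * (64.0, 64.0); the wrap mode code then derives the actual texel indexes.
 */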


/** Helper used by lp_build_cube_lookup() */
static LLVMValueRef
lp_build_cube_imapos(struct lp_build_context *coord_bld, LLVMValueRef coord)
{
   /* ima = +0.5 / abs(coord); */
   LLVMValueRef posHalf = lp_build_const_vec(coord_bld->gallivm, coord_bld->type, 0.5);
   LLVMValueRef absCoord = lp_build_abs(coord_bld, coord);
   LLVMValueRef ima = lp_build_div(coord_bld, posHalf, absCoord);
   return ima;
}

/** Helper used by lp_build_cube_lookup() */
static LLVMValueRef
lp_build_cube_imaneg(struct lp_build_context *coord_bld, LLVMValueRef coord)
{
   /* ima = -0.5 / abs(coord); */
   LLVMValueRef negHalf = lp_build_const_vec(coord_bld->gallivm, coord_bld->type, -0.5);
   LLVMValueRef absCoord = lp_build_abs(coord_bld, coord);
   LLVMValueRef ima = lp_build_div(coord_bld, negHalf, absCoord);
   return ima;
}

/**
 * Helper used by lp_build_cube_lookup()
 * FIXME: the sign here can also be 0.
 * Arithmetically this could definitely make a difference. Either
 * fix the comment or use another (simpler) sign function, not sure
 * which one it should be.
 * \param sign  scalar +1 or -1
 * \param coord  float vector
 * \param ima  float vector
 */
static LLVMValueRef
lp_build_cube_coord(struct lp_build_context *coord_bld,
                    LLVMValueRef sign, int negate_coord,
                    LLVMValueRef coord, LLVMValueRef ima)
{
   /* return negate(coord) * ima * sign + 0.5; */
   LLVMValueRef half = lp_build_const_vec(coord_bld->gallivm, coord_bld->type, 0.5);
   LLVMValueRef res;

   assert(negate_coord == +1 || negate_coord == -1);

   if (negate_coord == -1) {
      coord = lp_build_negate(coord_bld, coord);
   }

   res = lp_build_mul(coord_bld, coord, ima);
   if (sign) {
      sign = lp_build_broadcast_scalar(coord_bld, sign);
      res = lp_build_mul(coord_bld, res, sign);
   }
   res = lp_build_add(coord_bld, res, half);

   return res;
}
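

/*
 * For reference, the per-face coords this helper builds follow the usual
 * cube map table (sc/tc are then scaled by 0.5/|ma| and biased by 0.5):
 *
 *    face    ma    sc    tc
 *    +x      +s    -r    -t
 *    -x      -s    +r    -t
 *    +y      +t    +s    +r
 *    -y      -t    +s    -r
 *    +z      +r    +s    -t
 *    -z      -r    -s    -t
 */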


/** Helper used by lp_build_cube_lookup()
 * Return (major_coord >= 0) ? pos_face : neg_face;
 */
static LLVMValueRef
lp_build_cube_face(struct lp_build_sample_context *bld,
                   LLVMValueRef major_coord,
                   unsigned pos_face, unsigned neg_face)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef cmp = LLVMBuildFCmp(builder, LLVMRealUGE,
                                    major_coord,
                                    bld->float_bld.zero, "");
   LLVMValueRef pos = lp_build_const_int32(gallivm, pos_face);
   LLVMValueRef neg = lp_build_const_int32(gallivm, neg_face);
   LLVMValueRef res = LLVMBuildSelect(builder, cmp, pos, neg, "");
   return res;
}



/**
 * Generate code to do cube face selection and compute per-face texcoords.
 */
void
lp_build_cube_lookup(struct lp_build_sample_context *bld,
                     LLVMValueRef s,
                     LLVMValueRef t,
                     LLVMValueRef r,
                     const struct lp_derivatives *derivs, /* optional */
                     LLVMValueRef *face,
                     LLVMValueRef *face_s,
                     LLVMValueRef *face_t,
                     LLVMValueRef *rho,
                     boolean need_derivs)
{
   struct lp_build_context *coord_bld = &bld->coord_bld;
   LLVMBuilderRef builder = bld->gallivm->builder;
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMValueRef si, ti, ri;

   if (1 || coord_bld->type.length > 4) {
      /*
       * Do per-pixel face selection. We cannot however (as we used to do)
       * simply calculate the derivs afterwards (which is very bogus for
       * explicit derivs btw) because the values would be "random" when
       * not all pixels lie on the same face. So what we do here is just
       * calculate the derivatives after scaling the coords by the absolute
       * value of the inverse major axis, and essentially do rho calculation
       * steps as if it were a 3d texture. This is perfect if all pixels hit
       * the same face, but not so great at edges, I believe the max error
       * should be sqrt(2) with no_rho_approx or 2 otherwise (essentially
       * measuring the 3d distance between 2 points on the cube instead of
       * measuring up/down the edge). Still this is possibly a win over just
       * selecting the same face for all pixels. Unfortunately, something
       * like that doesn't work for explicit derivatives.
       * TODO: handle explicit derivatives by transforming them alongside
       * coords somehow.
       */
      struct lp_build_context *cint_bld = &bld->int_coord_bld;
      struct lp_type intctype = cint_bld->type;
      LLVMValueRef signs, signt, signr, signma;
      LLVMValueRef as, at, ar;
      LLVMValueRef as_ge_at, maxasat, ar_ge_as_at;
      LLVMValueRef snewx, tnewx, snewy, tnewy, snewz, tnewz;
      LLVMValueRef tnegi, rnegi;
      LLVMValueRef ma, mai, ima;
      LLVMValueRef posHalf = lp_build_const_vec(gallivm, coord_bld->type, 0.5);
      LLVMValueRef signmask = lp_build_const_int_vec(gallivm, intctype,
                                                     1 << (intctype.width - 1));
      LLVMValueRef signshift = lp_build_const_int_vec(gallivm, intctype,
                                                      intctype.width - 1);
      LLVMValueRef facex = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_X);
      LLVMValueRef facey = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_Y);
      LLVMValueRef facez = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_Z);

      assert(PIPE_TEX_FACE_NEG_X == PIPE_TEX_FACE_POS_X + 1);
      assert(PIPE_TEX_FACE_NEG_Y == PIPE_TEX_FACE_POS_Y + 1);
      assert(PIPE_TEX_FACE_NEG_Z == PIPE_TEX_FACE_POS_Z + 1);

      /*
       * get absolute value (for x/y/z face selection) and sign bit
       * (for mirroring minor coords and pos/neg face selection)
       * of the original coords.
       */
      as = lp_build_abs(&bld->coord_bld, s);
      at = lp_build_abs(&bld->coord_bld, t);
      ar = lp_build_abs(&bld->coord_bld, r);

      /*
       * major face determination: select x if x > y else select y
       * select z if z >= max(x,y) else select previous result
       * if some axes are the same we choose z over y, y over x - the
       * dx10 spec seems to ask for it while OpenGL doesn't care (if we
       * didn't care we could save a select or two by using different
       * compares and doing at_g_as_ar last, since tnewx and tnewz are
       * the same).
       */
      as_ge_at = lp_build_cmp(coord_bld, PIPE_FUNC_GREATER, as, at);
      maxasat = lp_build_max(coord_bld, as, at);
      ar_ge_as_at = lp_build_cmp(coord_bld, PIPE_FUNC_GEQUAL, ar, maxasat);

      if (need_derivs) {
         LLVMValueRef ddx_ddy[2], tmp[3], rho_vec;
         static const unsigned char swizzle0[] = { /* no-op swizzle */
            0, LP_BLD_SWIZZLE_DONTCARE,
            LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
         };
         static const unsigned char swizzle1[] = {
            1, LP_BLD_SWIZZLE_DONTCARE,
            LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
         };
         static const unsigned char swizzle01[] = { /* no-op swizzle */
            0, 1,
            LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
         };
         static const unsigned char swizzle23[] = {
            2, 3,
            LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
         };
         static const unsigned char swizzle02[] = {
            0, 2,
            LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
         };

         /*
          * scale the s/t/r coords pre-select/mirror so we can calculate
          * "reasonable" derivs.
          */
         ma = lp_build_select(coord_bld, as_ge_at, s, t);
         ma = lp_build_select(coord_bld, ar_ge_as_at, r, ma);
         ima = lp_build_cube_imapos(coord_bld, ma);
         s = lp_build_mul(coord_bld, s, ima);
         t = lp_build_mul(coord_bld, t, ima);
         r = lp_build_mul(coord_bld, r, ima);

         /*
          * This isn't quite the same as the "ordinary" (3d deriv) path
          * since we know the texture is square, which simplifies things
          * (the size mul which normally happens very early can be omitted
          * here entirely and done at the very end).
          */
         ddx_ddy[0] = lp_build_packed_ddx_ddy_twocoord(coord_bld, s, t);
         ddx_ddy[1] = lp_build_packed_ddx_ddy_onecoord(coord_bld, r);

         if (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) {
            ddx_ddy[0] = lp_build_mul(coord_bld, ddx_ddy[0], ddx_ddy[0]);
            ddx_ddy[1] = lp_build_mul(coord_bld, ddx_ddy[1], ddx_ddy[1]);
         }
         else {
            ddx_ddy[0] = lp_build_abs(coord_bld, ddx_ddy[0]);
            ddx_ddy[1] = lp_build_abs(coord_bld, ddx_ddy[1]);
         }

         tmp[0] = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle01);
         tmp[1] = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle23);
         tmp[2] = lp_build_swizzle_aos(coord_bld, ddx_ddy[1], swizzle02);

         if (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) {
            rho_vec = lp_build_add(coord_bld, tmp[0], tmp[1]);
            rho_vec = lp_build_add(coord_bld, rho_vec, tmp[2]);
         }
         else {
            rho_vec = lp_build_max(coord_bld, tmp[0], tmp[1]);
            rho_vec = lp_build_max(coord_bld, rho_vec, tmp[2]);
         }

         tmp[0] = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle0);
         tmp[1] = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle1);
         *rho = lp_build_max(coord_bld, tmp[0], tmp[1]);
      }

      si = LLVMBuildBitCast(builder, s, lp_build_vec_type(gallivm, intctype), "");
      ti = LLVMBuildBitCast(builder, t, lp_build_vec_type(gallivm, intctype), "");
      ri = LLVMBuildBitCast(builder, r, lp_build_vec_type(gallivm, intctype), "");
      signs = LLVMBuildAnd(builder, si, signmask, "");
      signt = LLVMBuildAnd(builder, ti, signmask, "");
      signr = LLVMBuildAnd(builder, ri, signmask, "");

      /*
       * compute all possible new s/t coords
       * snewx = signs * -r;
       * tnewx = -t;
       * snewy = s;
       * tnewy = signt * r;
       * snewz = signr * s;
       * tnewz = -t;
       */
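      /*
       * All the negations and sign multiplies below are done with integer
       * ops on the float bit patterns: negating an ieee float just flips
       * its sign bit, and multiplying by +/-1 amounts to xoring the sign
       * bits, which is why the coords were bitcast to int vectors above.
       */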
      tnegi = LLVMBuildXor(builder, ti, signmask, "");
      rnegi = LLVMBuildXor(builder, ri, signmask, "");

      snewx = LLVMBuildXor(builder, signs, rnegi, "");
      tnewx = tnegi;

      snewy = si;
      tnewy = LLVMBuildXor(builder, signt, ri, "");

      snewz = LLVMBuildXor(builder, signr, si, "");
      tnewz = tnegi;

      /* XXX on x86 unclear if we should cast the values back to float
       * or not - on some cpus (nehalem) pblendvb has twice the throughput
       * of blendvps though on others there just might be domain
       * transition penalties when using it (this depends on what llvm
       * will choose for the bit ops above so there appears to be no
       * "right way", but given the boatload of selects let's just use
       * the int type).
       */

      /* select/mirror */
      if (!need_derivs) {
         ma = lp_build_select(coord_bld, as_ge_at, s, t);
      }
      *face_s = lp_build_select(cint_bld, as_ge_at, snewx, snewy);
      *face_t = lp_build_select(cint_bld, as_ge_at, tnewx, tnewy);
      *face = lp_build_select(cint_bld, as_ge_at, facex, facey);

      if (!need_derivs) {
         ma = lp_build_select(coord_bld, ar_ge_as_at, r, ma);
      }
      *face_s = lp_build_select(cint_bld, ar_ge_as_at, snewz, *face_s);
      *face_t = lp_build_select(cint_bld, ar_ge_as_at, tnewz, *face_t);
      *face = lp_build_select(cint_bld, ar_ge_as_at, facez, *face);

      *face_s = LLVMBuildBitCast(builder, *face_s,
                                 lp_build_vec_type(gallivm, coord_bld->type), "");
      *face_t = LLVMBuildBitCast(builder, *face_t,
                                 lp_build_vec_type(gallivm, coord_bld->type), "");

      /* add +1 for neg face */
      /* XXX with AVX probably want to use another select here -
       * as long as we ensure vblendvps gets used we can actually
       * skip the comparison and just use sign as a "mask" directly.
       */
      mai = LLVMBuildBitCast(builder, ma, lp_build_vec_type(gallivm, intctype), "");
      signma = LLVMBuildLShr(builder, mai, signshift, "");
      *face = LLVMBuildOr(builder, *face, signma, "face");

      /* project coords */
      if (!need_derivs) {
         ima = lp_build_cube_imapos(coord_bld, ma);
         *face_s = lp_build_mul(coord_bld, *face_s, ima);
         *face_t = lp_build_mul(coord_bld, *face_t, ima);
      }

      *face_s = lp_build_add(coord_bld, *face_s, posHalf);
      *face_t = lp_build_add(coord_bld, *face_t, posHalf);
   }

   else {
      struct lp_build_if_state if_ctx;
      LLVMValueRef face_s_var;
      LLVMValueRef face_t_var;
      LLVMValueRef face_var;
      LLVMValueRef arx_ge_ary_arz, ary_ge_arx_arz;
      LLVMValueRef shuffles[4];
      LLVMValueRef arxy_ge_aryx, arxy_ge_arzz, arxy_ge_arxy_arzz;
      LLVMValueRef arxyxy, aryxzz, arxyxy_ge_aryxzz;
      LLVMValueRef tmp[4], rxyz, arxyz;
      struct lp_build_context *float_bld = &bld->float_bld;

      assert(bld->coord_bld.type.length == 4);

      tmp[0] = s;
      tmp[1] = t;
      tmp[2] = r;
      rxyz = lp_build_hadd_partial4(&bld->coord_bld, tmp, 3);
      arxyz = lp_build_abs(&bld->coord_bld, rxyz);

      shuffles[0] = lp_build_const_int32(gallivm, 0);
      shuffles[1] = lp_build_const_int32(gallivm, 1);
      shuffles[2] = lp_build_const_int32(gallivm, 0);
      shuffles[3] = lp_build_const_int32(gallivm, 1);
      arxyxy = LLVMBuildShuffleVector(builder, arxyz, arxyz, LLVMConstVector(shuffles, 4), "");
      shuffles[0] = lp_build_const_int32(gallivm, 1);
      shuffles[1] = lp_build_const_int32(gallivm, 0);
      shuffles[2] = lp_build_const_int32(gallivm, 2);
      shuffles[3] = lp_build_const_int32(gallivm, 2);
      aryxzz = LLVMBuildShuffleVector(builder, arxyz, arxyz, LLVMConstVector(shuffles, 4), "");
      arxyxy_ge_aryxzz = lp_build_cmp(&bld->coord_bld, PIPE_FUNC_GEQUAL, arxyxy, aryxzz);

      shuffles[0] = lp_build_const_int32(gallivm, 0);
      shuffles[1] = lp_build_const_int32(gallivm, 1);
      arxy_ge_aryx = LLVMBuildShuffleVector(builder, arxyxy_ge_aryxzz, arxyxy_ge_aryxzz,
                                            LLVMConstVector(shuffles, 2), "");
      shuffles[0] = lp_build_const_int32(gallivm, 2);
      shuffles[1] = lp_build_const_int32(gallivm, 3);
      arxy_ge_arzz = LLVMBuildShuffleVector(builder, arxyxy_ge_aryxzz, arxyxy_ge_aryxzz,
                                            LLVMConstVector(shuffles, 2), "");
      arxy_ge_arxy_arzz = LLVMBuildAnd(builder, arxy_ge_aryx, arxy_ge_arzz, "");

      arx_ge_ary_arz = LLVMBuildExtractElement(builder, arxy_ge_arxy_arzz,
                                               lp_build_const_int32(gallivm, 0), "");
      arx_ge_ary_arz = LLVMBuildICmp(builder, LLVMIntNE, arx_ge_ary_arz,
                                     lp_build_const_int32(gallivm, 0), "");
      ary_ge_arx_arz = LLVMBuildExtractElement(builder, arxy_ge_arxy_arzz,
                                               lp_build_const_int32(gallivm, 1), "");
      ary_ge_arx_arz = LLVMBuildICmp(builder, LLVMIntNE, ary_ge_arx_arz,
                                     lp_build_const_int32(gallivm, 0), "");
      face_s_var = lp_build_alloca(gallivm, bld->coord_bld.vec_type, "face_s_var");
      face_t_var = lp_build_alloca(gallivm, bld->coord_bld.vec_type, "face_t_var");
      face_var = lp_build_alloca(gallivm, bld->int_bld.vec_type, "face_var");

      lp_build_if(&if_ctx, gallivm, arx_ge_ary_arz);
      {
         /* +/- X face */
         LLVMValueRef sign, ima;
         si = LLVMBuildExtractElement(builder, rxyz,
                                      lp_build_const_int32(gallivm, 0), "");
         sign = lp_build_sgn(float_bld, si);
         ima = lp_build_cube_imaneg(coord_bld, s);
         *face_s = lp_build_cube_coord(coord_bld, sign, +1, r, ima);
         *face_t = lp_build_cube_coord(coord_bld, NULL, +1, t, ima);
         *face = lp_build_cube_face(bld, si,
                                    PIPE_TEX_FACE_POS_X,
                                    PIPE_TEX_FACE_NEG_X);
         LLVMBuildStore(builder, *face_s, face_s_var);
         LLVMBuildStore(builder, *face_t, face_t_var);
         LLVMBuildStore(builder, *face, face_var);
      }
      lp_build_else(&if_ctx);
      {
         struct lp_build_if_state if_ctx2;

         lp_build_if(&if_ctx2, gallivm, ary_ge_arx_arz);
         {
            LLVMValueRef sign, ima;
            /* +/- Y face */
            ti = LLVMBuildExtractElement(builder, rxyz,
                                         lp_build_const_int32(gallivm, 1), "");
            sign = lp_build_sgn(float_bld, ti);
            ima = lp_build_cube_imaneg(coord_bld, t);
            *face_s = lp_build_cube_coord(coord_bld, NULL, -1, s, ima);
            *face_t = lp_build_cube_coord(coord_bld, sign, -1, r, ima);
            *face = lp_build_cube_face(bld, ti,
                                       PIPE_TEX_FACE_POS_Y,
                                       PIPE_TEX_FACE_NEG_Y);
            LLVMBuildStore(builder, *face_s, face_s_var);
            LLVMBuildStore(builder, *face_t, face_t_var);
            LLVMBuildStore(builder, *face, face_var);
         }
         lp_build_else(&if_ctx2);
         {
            /* +/- Z face */
            LLVMValueRef sign, ima;
            ri = LLVMBuildExtractElement(builder, rxyz,
                                         lp_build_const_int32(gallivm, 2), "");
            sign = lp_build_sgn(float_bld, ri);
            ima = lp_build_cube_imaneg(coord_bld, r);
            *face_s = lp_build_cube_coord(coord_bld, sign, -1, s, ima);
            *face_t = lp_build_cube_coord(coord_bld, NULL, +1, t, ima);
            *face = lp_build_cube_face(bld, ri,
                                       PIPE_TEX_FACE_POS_Z,
                                       PIPE_TEX_FACE_NEG_Z);
            LLVMBuildStore(builder, *face_s, face_s_var);
            LLVMBuildStore(builder, *face_t, face_t_var);
            LLVMBuildStore(builder, *face, face_var);
         }
         lp_build_endif(&if_ctx2);
      }

      lp_build_endif(&if_ctx);

      *face_s = LLVMBuildLoad(builder, face_s_var, "face_s");
      *face_t = LLVMBuildLoad(builder, face_t_var, "face_t");
      *face = LLVMBuildLoad(builder, face_var, "face");
      *face = lp_build_broadcast_scalar(&bld->int_coord_bld, *face);
   }
}


/**
 * Compute the partial offset of a pixel block along an arbitrary axis.
 *
 * @param coord   coordinate in pixels
 * @param stride  number of bytes between rows of successive pixel blocks
 * @param block_length  number of pixels in a pixels block along the coordinate
 *                      axis
 * @param out_offset    resulting relative offset of the pixel block in bytes
 * @param out_subcoord  resulting sub-block pixel coordinate
 */
void
lp_build_sample_partial_offset(struct lp_build_context *bld,
                               unsigned block_length,
                               LLVMValueRef coord,
                               LLVMValueRef stride,
                               LLVMValueRef *out_offset,
                               LLVMValueRef *out_subcoord)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef offset;
   LLVMValueRef subcoord;

   if (block_length == 1) {
      subcoord = bld->zero;
   }
   else {
      /*
       * Pixel blocks have power of two dimensions. LLVM should convert the
       * rem/div to bit arithmetic.
       * TODO: Verify this.
       * It does indeed BUT it transforms it to scalar (and back) when doing
       * so (using roughly extract, shift/and, mov, unpack) (llvm 2.7).
       * The generated code looks seriously unfunny and is quite expensive.
       */
#if 0
      LLVMValueRef block_width = lp_build_const_int_vec(bld->gallivm, bld->type, block_length);
      subcoord = LLVMBuildURem(builder, coord, block_width, "");
      coord = LLVMBuildUDiv(builder, coord, block_width, "");
#else
      unsigned logbase2 = util_logbase2(block_length);
      LLVMValueRef block_shift = lp_build_const_int_vec(bld->gallivm, bld->type, logbase2);
      LLVMValueRef block_mask = lp_build_const_int_vec(bld->gallivm, bld->type, block_length - 1);
      subcoord = LLVMBuildAnd(builder, coord, block_mask, "");
      coord = LLVMBuildLShr(builder, coord, block_shift, "");
#endif
   }

   offset = lp_build_mul(bld, coord, stride);

   assert(out_offset);
   assert(out_subcoord);

   *out_offset = offset;
   *out_subcoord = subcoord;
}
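

/*
 * E.g. for a format with 4-pixel-wide blocks (block_length == 4, so
 * logbase2 == 2), coord = 13 yields subcoord = 13 & 3 = 1 and
 * offset = (13 >> 2) * stride = 3 * stride.
 */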


/**
 * Compute the offset of a pixel block.
 *
 * x, y, z, y_stride, z_stride are vectors, and they refer to pixels.
 *
 * Returns the relative offset and i,j sub-block coordinates
 */
void
lp_build_sample_offset(struct lp_build_context *bld,
                       const struct util_format_description *format_desc,
                       LLVMValueRef x,
                       LLVMValueRef y,
                       LLVMValueRef z,
                       LLVMValueRef y_stride,
                       LLVMValueRef z_stride,
                       LLVMValueRef *out_offset,
                       LLVMValueRef *out_i,
                       LLVMValueRef *out_j)
{
   LLVMValueRef x_stride;
   LLVMValueRef offset;

   x_stride = lp_build_const_vec(bld->gallivm, bld->type,
                                 format_desc->block.bits/8);

   lp_build_sample_partial_offset(bld,
                                  format_desc->block.width,
                                  x, x_stride,
                                  &offset, out_i);

   if (y && y_stride) {
      LLVMValueRef y_offset;
      lp_build_sample_partial_offset(bld,
                                     format_desc->block.height,
                                     y, y_stride,
                                     &y_offset, out_j);
      offset = lp_build_add(bld, offset, y_offset);
   }
   else {
      *out_j = bld->zero;
   }

   if (z && z_stride) {
      LLVMValueRef z_offset;
      LLVMValueRef k;
      lp_build_sample_partial_offset(bld,
                                     1, /* pixel blocks are always 2D */
                                     z, z_stride,
                                     &z_offset, &k);
      offset = lp_build_add(bld, offset, z_offset);
   }

   *out_offset = offset;
}
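

/*
 * For a simple uncompressed format (1x1 pixel blocks) the offset computed
 * above therefore reduces to (a sketch):
 *
 *    offset = x * block.bits/8 + y * y_stride (+ z * z_stride)
 *
 * with the i, j sub-block coordinates both zero.
 */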