src/gallium/auxiliary/gallivm/lp_bld_sample.c

   1 /**************************************************************************
   2  *
   3  * Copyright 2009 VMware, Inc.
   4  * All Rights Reserved.
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a
   7  * copy of this software and associated documentation files (the
   8  * "Software"), to deal in the Software without restriction, including
   9  * without limitation the rights to use, copy, modify, merge, publish,
  10  * distribute, sub license, and/or sell copies of the Software, and to
  11  * permit persons to whom the Software is furnished to do so, subject to
  12  * the following conditions:
  13  *
  14  * The above copyright notice and this permission notice (including the
  15  * next paragraph) shall be included in all copies or substantial portions
  16  * of the Software.
  17  *
  18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  19  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
  21  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
  22  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  23  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  24  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25  *
  26  **************************************************************************/
  27
  28 /**
  29  * @file
  30  * Texture sampling -- common code.
  31  *
  32  * @author Jose Fonseca <jfonseca@vmware.com>
  33  */
  34
  35 #include "pipe/p_defines.h"
  36 #include "pipe/p_state.h"
  37 #include "util/u_format.h"
  38 #include "util/u_math.h"
  39 #include "lp_bld_arit.h"
  40 #include "lp_bld_const.h"
  41 #include "lp_bld_debug.h"
  42 #include "lp_bld_printf.h"
  43 #include "lp_bld_flow.h"
  44 #include "lp_bld_sample.h"
  45 #include "lp_bld_swizzle.h"
  46 #include "lp_bld_type.h"
  47 #include "lp_bld_logic.h"
  48 #include "lp_bld_pack.h"
  49
  50
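/*
 * Bri-linear factor: the ramp steepness passed as 'factor' to
 * lp_build_brilinear_lod() and lp_build_brilinear_rho().
 * Should be greater than one; with a factor of exactly one both offsets in
 * lp_build_brilinear_lod() become zero and the result reduces to a plain
 * floor/fraction split of the lod.
 */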
  54 #define BRILINEAR_FACTOR 2
  55
  56 /**
  57  * Does the given texture wrap mode allow sampling the texture border color?
  58  * XXX maybe move this into gallium util code.
  59  */
  60 boolean
  61 lp_sampler_wrap_mode_uses_border_color(unsigned mode,
  62                                        unsigned min_img_filter,
  63                                        unsigned mag_img_filter)
  64 {
  65    switch (mode) {
  66    case PIPE_TEX_WRAP_REPEAT:
  67    case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
  68    case PIPE_TEX_WRAP_MIRROR_REPEAT:
  69    case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
  70       return FALSE;
  71    case PIPE_TEX_WRAP_CLAMP:
  72    case PIPE_TEX_WRAP_MIRROR_CLAMP:
  73       if (min_img_filter == PIPE_TEX_FILTER_NEAREST &&
  74           mag_img_filter == PIPE_TEX_FILTER_NEAREST) {
  75          return FALSE;
  76       } else {
  77          return TRUE;
  78       }
  79    case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
  80    case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
  81       return TRUE;
  82    default:
  83       assert(0 && "unexpected wrap mode");
  84       return FALSE;
  85    }
  86 }
  87
  88
  89 /**
  90  * Initialize lp_sampler_static_texture_state object with the gallium
  91  * texture/sampler_view state (this contains the parts which are
  92  * considered static).
  93  */
  94 void
  95 lp_sampler_static_texture_state(struct lp_static_texture_state *state,
  96                                 const struct pipe_sampler_view *view)
  97 {
  98    const struct pipe_resource *texture;
  99
 100    memset(state, 0, sizeof *state);
 101
 102    if (!view || !view->texture)
 103       return;
 104
 105    texture = view->texture;
 106
 107    state->format            = view->format;
 108    state->swizzle_r         = view->swizzle_r;
 109    state->swizzle_g         = view->swizzle_g;
 110    state->swizzle_b         = view->swizzle_b;
 111    state->swizzle_a         = view->swizzle_a;
 112
 113    state->target            = texture->target;
 114    state->pot_width         = util_is_power_of_two(texture->width0);
 115    state->pot_height        = util_is_power_of_two(texture->height0);
 116    state->pot_depth         = util_is_power_of_two(texture->depth0);
 117    state->level_zero_only   = !view->u.tex.last_level;
 118
 119    /*
 120     * FIXME: Handle the remainder of pipe_sampler_view.
 121     */
 122 }
 123
 124
 125 /**
 126  * Initialize lp_sampler_static_sampler_state object with the gallium sampler
 127  * state (this contains the parts which are considered static).
 128  */
 129 void
 130 lp_sampler_static_sampler_state(struct lp_static_sampler_state *state,
 131                                 const struct pipe_sampler_state *sampler)
 132 {
 133    memset(state, 0, sizeof *state);
 134
 135    if (!sampler)
 136       return;
 137
 138    /*
 139     * We don't copy sampler state over unless it is actually enabled, to avoid
 140     * spurious recompiles, as the sampler static state is part of the shader
 141     * key.
 142     *
 143     * Ideally the state tracker or cso_cache module would make all state
 144     * canonical, but until that happens it's better to be safe than sorry here.
 145     *
 146     * XXX: Actually there's much more than can be done here, especially
 147     * regarding 1D/2D/3D/CUBE textures, wrap modes, etc.
 148     */
 149
 150    state->wrap_s            = sampler->wrap_s;
 151    state->wrap_t            = sampler->wrap_t;
 152    state->wrap_r            = sampler->wrap_r;
 153    state->min_img_filter    = sampler->min_img_filter;
 154    state->mag_img_filter    = sampler->mag_img_filter;
 155
 156    if (sampler->max_lod > 0.0f) {
 157       state->min_mip_filter = sampler->min_mip_filter;
 158    } else {
 159       state->min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
 160    }
 161
 162    if (state->min_mip_filter != PIPE_TEX_MIPFILTER_NONE) {
 163       if (sampler->lod_bias != 0.0f) {
 164          state->lod_bias_non_zero = 1;
 165       }
 166
 167       /* If min_lod == max_lod we can greatly simplify mipmap selection.
 168        * This is a case that occurs during automatic mipmap generation.
 169        */
 170       if (sampler->min_lod == sampler->max_lod) {
 171          state->min_max_lod_equal = 1;
 172       } else {
 173          if (sampler->min_lod > 0.0f) {
 174             state->apply_min_lod = 1;
 175          }
 176
 177          /*
 178           * XXX this won't do anything with the mesa state tracker which always
 179           * sets max_lod to not more than actually present mip maps...
 180           */
 181          if (sampler->max_lod < (PIPE_MAX_TEXTURE_LEVELS - 1)) {
 182             state->apply_max_lod = 1;
 183          }
 184       }
 185    }
 186
 187    state->compare_mode      = sampler->compare_mode;
 188    if (sampler->compare_mode != PIPE_TEX_COMPARE_NONE) {
 189       state->compare_func   = sampler->compare_func;
 190    }
 191
 192    state->normalized_coords = sampler->normalized_coords;
 193 }
 194
 195
 196 /**
 197  * Generate code to compute coordinate gradient (rho).
 198  * \param derivs  partial derivatives of (s, t, r, q) with respect to X and Y
 199  *
 200  * The resulting rho is scalar per quad.
 201  */
 202 static LLVMValueRef
 203 lp_build_rho(struct lp_build_sample_context *bld,
 204              unsigned texture_unit,
 205              const struct lp_derivatives *derivs)
 206 {
 207    struct gallivm_state *gallivm = bld->gallivm;
 208    struct lp_build_context *int_size_bld = &bld->int_size_in_bld;
 209    struct lp_build_context *float_size_bld = &bld->float_size_in_bld;
 210    struct lp_build_context *float_bld = &bld->float_bld;
 211    struct lp_build_context *coord_bld = &bld->coord_bld;
 212    struct lp_build_context *perquadf_bld = &bld->perquadf_bld;
 213    const LLVMValueRef *ddx_ddy = derivs->ddx_ddy;
 214    const unsigned dims = bld->dims;
 215    LLVMBuilderRef builder = bld->gallivm->builder;
 216    LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
 217    LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
 218    LLVMValueRef index1 = LLVMConstInt(i32t, 1, 0);
 219    LLVMValueRef index2 = LLVMConstInt(i32t, 2, 0);
 220    LLVMValueRef rho_vec;
 221    LLVMValueRef int_size, float_size;
 222    LLVMValueRef rho;
 223    LLVMValueRef first_level, first_level_vec;
 224    LLVMValueRef abs_ddx_ddy[2];
 225    unsigned length = coord_bld->type.length;
 226    unsigned num_quads = length / 4;
 227    unsigned i;
 228    LLVMValueRef i32undef = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
 229    LLVMValueRef rho_xvec, rho_yvec;
 230
 231    abs_ddx_ddy[0] = lp_build_abs(coord_bld, ddx_ddy[0]);
 232    if (dims > 2) {
 233       abs_ddx_ddy[1] = lp_build_abs(coord_bld, ddx_ddy[1]);
 234    }
 235    else {
 236       abs_ddx_ddy[1] = NULL;
 237    }
 238
 239    if (dims == 1) {
 240       static const unsigned char swizzle1[] = {
 241          0, LP_BLD_SWIZZLE_DONTCARE,
 242          LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
 243       };
 244       static const unsigned char swizzle2[] = {
 245          1, LP_BLD_SWIZZLE_DONTCARE,
 246          LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
 247       };
 248       rho_xvec = lp_build_swizzle_aos(coord_bld, abs_ddx_ddy[0], swizzle1);
 249       rho_yvec = lp_build_swizzle_aos(coord_bld, abs_ddx_ddy[0], swizzle2);
 250    }
 251    else if (dims == 2) {
 252       static const unsigned char swizzle1[] = {
 253          0, 2,
 254          LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
 255       };
 256       static const unsigned char swizzle2[] = {
 257          1, 3,
 258          LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
 259       };
 260       rho_xvec = lp_build_swizzle_aos(coord_bld, abs_ddx_ddy[0], swizzle1);
 261       rho_yvec = lp_build_swizzle_aos(coord_bld, abs_ddx_ddy[0], swizzle2);
 262    }
 263    else {
 264       LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH];
 265       LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH];
 266       assert(dims == 3);
 267       for (i = 0; i < num_quads; i++) {
 268          shuffles1[4*i + 0] = lp_build_const_int32(gallivm, 4*i);
 269          shuffles1[4*i + 1] = lp_build_const_int32(gallivm, 4*i + 2);
 270          shuffles1[4*i + 2] = lp_build_const_int32(gallivm, length + 4*i);
 271          shuffles1[4*i + 3] = i32undef;
 272          shuffles2[4*i + 0] = lp_build_const_int32(gallivm, 4*i + 1);
 273          shuffles2[4*i + 1] = lp_build_const_int32(gallivm, 4*i + 3);
 274          shuffles2[4*i + 2] = lp_build_const_int32(gallivm, length + 4*i + 1);
 275          shuffles2[4*i + 3] = i32undef;
 276       }
 277       rho_xvec = LLVMBuildShuffleVector(builder, abs_ddx_ddy[0], abs_ddx_ddy[1],
 278                                         LLVMConstVector(shuffles1, length), "");
 279       rho_yvec = LLVMBuildShuffleVector(builder, abs_ddx_ddy[0], abs_ddx_ddy[1],
 280                                         LLVMConstVector(shuffles2, length), "");
 281    }
 282
 283    rho_vec = lp_build_max(coord_bld, rho_xvec, rho_yvec);
 284
 285    first_level = bld->dynamic_state->first_level(bld->dynamic_state,
 286                                                  bld->gallivm, texture_unit);
 287    first_level_vec = lp_build_broadcast_scalar(int_size_bld, first_level);
 288    int_size = lp_build_minify(int_size_bld, bld->int_size, first_level_vec);
 289    float_size = lp_build_int_to_float(float_size_bld, int_size);
 290
 291    if (bld->coord_type.length > 4) {
 292       /* expand size to each quad */
 293       if (dims > 1) {
 294          /* could use some broadcast_vector helper for this? */
 295          int num_quads = bld->coord_type.length / 4;
 296          LLVMValueRef src[LP_MAX_VECTOR_LENGTH/4];
 297          for (i = 0; i < num_quads; i++) {
 298             src[i] = float_size;
 299          }
 300          float_size = lp_build_concat(bld->gallivm, src, float_size_bld->type, num_quads);
 301       }
 302       else {
 303          float_size = lp_build_broadcast_scalar(coord_bld, float_size);
 304       }
 305       rho_vec = lp_build_mul(coord_bld, rho_vec, float_size);
 306
 307       if (dims <= 1) {
 308          rho = rho_vec;
 309       }
 310       else {
 311          if (dims >= 2) {
 312             static const unsigned char swizzle1[] = {
 313                0, LP_BLD_SWIZZLE_DONTCARE,
 314                LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
 315             };
 316             static const unsigned char swizzle2[] = {
 317                1, LP_BLD_SWIZZLE_DONTCARE,
 318                LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
 319             };
 320             LLVMValueRef rho_s, rho_t, rho_r;
 321
 322             rho_s = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle1);
 323             rho_t = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle2);
 324
 325             rho = lp_build_max(coord_bld, rho_s, rho_t);
 326
 327             if (dims >= 3) {
 328                static const unsigned char swizzle3[] = {
 329                   2, LP_BLD_SWIZZLE_DONTCARE,
 330                   LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
 331                };
 332                rho_r = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle3);
 333                rho = lp_build_max(coord_bld, rho, rho_r);
 334             }
 335          }
 336       }
 337       rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
 338                                       perquadf_bld->type, rho, 0);
 339    }
 340    else {
 341       if (dims <= 1) {
 342          rho_vec = LLVMBuildExtractElement(builder, rho_vec, index0, "");
 343       }
 344       rho_vec = lp_build_mul(float_size_bld, rho_vec, float_size);
 345
 346       if (dims <= 1) {
 347          rho = rho_vec;
 348       }
 349       else {
 350          if (dims >= 2) {
 351             LLVMValueRef rho_s, rho_t, rho_r;
 352
 353             rho_s = LLVMBuildExtractElement(builder, rho_vec, index0, "");
 354             rho_t = LLVMBuildExtractElement(builder, rho_vec, index1, "");
 355
 356             rho = lp_build_max(float_bld, rho_s, rho_t);
 357
 358             if (dims >= 3) {
 359                rho_r = LLVMBuildExtractElement(builder, rho_vec, index2, "");
 360                rho = lp_build_max(float_bld, rho, rho_r);
 361             }
 362          }
 363       }
 364    }
 365
 366    return rho;
 367 }
 368
 369
 370 /*
 371  * Bri-linear lod computation
 372  *
 373  * Use a piece-wise linear approximation of log2 such that:
 374  * - round to nearest, for values in the neighborhood of -1, 0, 1, 2, etc.
 375  * - linear approximation for values in the neighborhood of 0.5, 1.5., etc,
 376  *   with the steepness specified in 'factor'
 377  * - exact result for 0.5, 1.5, etc.
 378  *
 379  *
 380  *   1.0 -              /----*
 381  *                     /
 382  *                    /
 383  *                   /
 384  *   0.5 -          *
 385  *                 /
 386  *                /
 387  *               /
 388  *   0.0 - *----/
 389  *
 390  *         |                 |
 391  *        2^0               2^1
 392  *
 393  * This is a technique also commonly used in hardware:
 394  * - http://ixbtlabs.com/articles2/gffx/nv40-rx800-3.html
 395  *
 396  * TODO: For correctness, this should only be applied when texture is known to
 397  * have regular mipmaps, i.e., mipmaps derived from the base level.
 398  *
 399  * TODO: This could be done in fixed point, where applicable.
 400  */
 401 static void
 402 lp_build_brilinear_lod(struct lp_build_context *bld,
 403                        LLVMValueRef lod,
 404                        double factor,
 405                        LLVMValueRef *out_lod_ipart,
 406                        LLVMValueRef *out_lod_fpart)
 407 {
 408    LLVMValueRef lod_fpart;
 409    double pre_offset = (factor - 0.5)/factor - 0.5;
 410    double post_offset = 1 - factor;
 411
 412    if (0) {
 413       lp_build_printf(bld->gallivm, "lod = %f\n", lod);
 414    }
 415
 416    lod = lp_build_add(bld, lod,
 417                       lp_build_const_vec(bld->gallivm, bld->type, pre_offset));
 418
 419    lp_build_ifloor_fract(bld, lod, out_lod_ipart, &lod_fpart);
 420
 421    lod_fpart = lp_build_mul(bld, lod_fpart,
 422                             lp_build_const_vec(bld->gallivm, bld->type, factor));
 423
 424    lod_fpart = lp_build_add(bld, lod_fpart,
 425                             lp_build_const_vec(bld->gallivm, bld->type, post_offset));
 426
 427    /*
 428     * It's not necessary to clamp lod_fpart since:
 429     * - the above expression will never produce numbers greater than one.
 430     * - the mip filtering branch is only taken if lod_fpart is positive
 431     */
 432
 433    *out_lod_fpart = lod_fpart;
 434
 435    if (0) {
 436       lp_build_printf(bld->gallivm, "lod_ipart = %i\n", *out_lod_ipart);
 437       lp_build_printf(bld->gallivm, "lod_fpart = %f\n\n", *out_lod_fpart);
 438    }
 439 }
 440
 441
 442 /*
 443  * Combined log2 and brilinear lod computation.
 444  *
 445  * It's in all identical to calling lp_build_fast_log2() and
 446  * lp_build_brilinear_lod() above, but by combining we can compute the integer
 447  * and fractional part independently.
 448  */
 449 static void
 450 lp_build_brilinear_rho(struct lp_build_context *bld,
 451                        LLVMValueRef rho,
 452                        double factor,
 453                        LLVMValueRef *out_lod_ipart,
 454                        LLVMValueRef *out_lod_fpart)
 455 {
 456    LLVMValueRef lod_ipart;
 457    LLVMValueRef lod_fpart;
 458
 459    const double pre_factor = (2*factor - 0.5)/(M_SQRT2*factor);
 460    const double post_offset = 1 - 2*factor;
 461
 462    assert(bld->type.floating);
 463
 464    assert(lp_check_value(bld->type, rho));
 465
 466    /*
 467     * The pre factor will make the intersections with the exact powers of two
 468     * happen precisely where we want then to be, which means that the integer
 469     * part will not need any post adjustments.
 470     */
 471    rho = lp_build_mul(bld, rho,
 472                       lp_build_const_vec(bld->gallivm, bld->type, pre_factor));
 473
 474    /* ipart = ifloor(log2(rho)) */
 475    lod_ipart = lp_build_extract_exponent(bld, rho, 0);
 476
 477    /* fpart = rho / 2**ipart */
 478    lod_fpart = lp_build_extract_mantissa(bld, rho);
 479
 480    lod_fpart = lp_build_mul(bld, lod_fpart,
 481                             lp_build_const_vec(bld->gallivm, bld->type, factor));
 482
 483    lod_fpart = lp_build_add(bld, lod_fpart,
 484                             lp_build_const_vec(bld->gallivm, bld->type, post_offset));
 485
 486    /*
 487     * Like lp_build_brilinear_lod, it's not necessary to clamp lod_fpart since:
 488     * - the above expression will never produce numbers greater than one.
 489     * - the mip filtering branch is only taken if lod_fpart is positive
 490     */
 491
 492    *out_lod_ipart = lod_ipart;
 493    *out_lod_fpart = lod_fpart;
 494 }
 495
 496
 497 /**
 498  * Generate code to compute texture level of detail (lambda).
 499  * \param derivs  partial derivatives of (s, t, r, q) with respect to X and Y
 500  * \param lod_bias  optional float vector with the shader lod bias
 501  * \param explicit_lod  optional float vector with the explicit lod
 502  * \param width  scalar int texture width
 503  * \param height  scalar int texture height
 504  * \param depth  scalar int texture depth
 505  *
 506  * The resulting lod is scalar per quad, so only the first value per quad
 507  * passed in from lod_bias, explicit_lod is used.
 508  */
 509 void
 510 lp_build_lod_selector(struct lp_build_sample_context *bld,
 511                       unsigned texture_unit,
 512                       unsigned sampler_unit,
 513                       const struct lp_derivatives *derivs,
 514                       LLVMValueRef lod_bias, /* optional */
 515                       LLVMValueRef explicit_lod, /* optional */
 516                       unsigned mip_filter,
 517                       LLVMValueRef *out_lod_ipart,
 518                       LLVMValueRef *out_lod_fpart)
 519
 520 {
 521    LLVMBuilderRef builder = bld->gallivm->builder;
 522    struct lp_build_context *perquadf_bld = &bld->perquadf_bld;
 523    LLVMValueRef lod;
 524
 525    *out_lod_ipart = bld->perquadi_bld.zero;
 526    *out_lod_fpart = perquadf_bld->zero;
 527
 528    if (bld->static_sampler_state->min_max_lod_equal) {
 529       /* User is forcing sampling from a particular mipmap level.
 530        * This is hit during mipmap generation.
 531        */
 532       LLVMValueRef min_lod =
 533          bld->dynamic_state->min_lod(bld->dynamic_state,
 534                                      bld->gallivm, sampler_unit);
 535
 536       lod = lp_build_broadcast_scalar(perquadf_bld, min_lod);
 537    }
 538    else {
 539       if (explicit_lod) {
 540          lod = lp_build_pack_aos_scalars(bld->gallivm, bld->coord_bld.type,
 541                                          perquadf_bld->type, explicit_lod, 0);
 542       }
 543       else {
 544          LLVMValueRef rho;
 545
 546          rho = lp_build_rho(bld, texture_unit, derivs);
 547
 548          /*
 549           * Compute lod = log2(rho)
 550           */
 551
 552          if (!lod_bias &&
 553              !bld->static_sampler_state->lod_bias_non_zero &&
 554              !bld->static_sampler_state->apply_max_lod &&
 555              !bld->static_sampler_state->apply_min_lod) {
 556             /*
 557              * Special case when there are no post-log2 adjustments, which
 558              * saves instructions but keeping the integer and fractional lod
 559              * computations separate from the start.
 560              */
 561
 562             if (mip_filter == PIPE_TEX_MIPFILTER_NONE ||
 563                 mip_filter == PIPE_TEX_MIPFILTER_NEAREST) {
 564                *out_lod_ipart = lp_build_ilog2(perquadf_bld, rho);
 565                *out_lod_fpart = perquadf_bld->zero;
 566                return;
 567             }
 568             if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR &&
 569                 !(gallivm_debug & GALLIVM_DEBUG_NO_BRILINEAR)) {
 570                lp_build_brilinear_rho(perquadf_bld, rho, BRILINEAR_FACTOR,
 571                                       out_lod_ipart, out_lod_fpart);
 572                return;
 573             }
 574          }
 575
 576          if (0) {
 577             lod = lp_build_log2(perquadf_bld, rho);
 578          }
 579          else {
 580             lod = lp_build_fast_log2(perquadf_bld, rho);
 581          }
 582
 583          /* add shader lod bias */
 584          if (lod_bias) {
 585             lod_bias = lp_build_pack_aos_scalars(bld->gallivm, bld->coord_bld.type,
 586                   perquadf_bld->type, lod_bias, 0);
 587             lod = LLVMBuildFAdd(builder, lod, lod_bias, "shader_lod_bias");
 588          }
 589       }
 590
 591       /* add sampler lod bias */
 592       if (bld->static_sampler_state->lod_bias_non_zero) {
 593          LLVMValueRef sampler_lod_bias =
 594             bld->dynamic_state->lod_bias(bld->dynamic_state,
 595                                          bld->gallivm, sampler_unit);
 596          sampler_lod_bias = lp_build_broadcast_scalar(perquadf_bld,
 597                                                       sampler_lod_bias);
 598          lod = LLVMBuildFAdd(builder, lod, sampler_lod_bias, "sampler_lod_bias");
 599       }
 600
 601       /* clamp lod */
 602       if (bld->static_sampler_state->apply_max_lod) {
 603          LLVMValueRef max_lod =
 604             bld->dynamic_state->max_lod(bld->dynamic_state,
 605                                         bld->gallivm, sampler_unit);
 606          max_lod = lp_build_broadcast_scalar(perquadf_bld, max_lod);
 607
 608          lod = lp_build_min(perquadf_bld, lod, max_lod);
 609       }
 610       if (bld->static_sampler_state->apply_min_lod) {
 611          LLVMValueRef min_lod =
 612             bld->dynamic_state->min_lod(bld->dynamic_state,
 613                                         bld->gallivm, sampler_unit);
 614          min_lod = lp_build_broadcast_scalar(perquadf_bld, min_lod);
 615
 616          lod = lp_build_max(perquadf_bld, lod, min_lod);
 617       }
 618    }
 619
 620    if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
 621       if (!(gallivm_debug & GALLIVM_DEBUG_NO_BRILINEAR)) {
 622          lp_build_brilinear_lod(perquadf_bld, lod, BRILINEAR_FACTOR,
 623                                 out_lod_ipart, out_lod_fpart);
 624       }
 625       else {
 626          lp_build_ifloor_fract(perquadf_bld, lod, out_lod_ipart, out_lod_fpart);
 627       }
 628
 629       lp_build_name(*out_lod_fpart, "lod_fpart");
 630    }
 631    else {
 632       *out_lod_ipart = lp_build_iround(perquadf_bld, lod);
 633    }
 634
 635    lp_build_name(*out_lod_ipart, "lod_ipart");
 636
 637    return;
 638 }
 639
 640
 641 /**
 642  * For PIPE_TEX_MIPFILTER_NEAREST, convert float LOD to integer
 643  * mipmap level index.
 644  * Note: this is all scalar per quad code.
 645  * \param lod_ipart  int texture level of detail
 646  * \param level_out  returns integer
 647  */
 648 void
 649 lp_build_nearest_mip_level(struct lp_build_sample_context *bld,
 650                            unsigned texture_unit,
 651                            LLVMValueRef lod_ipart,
 652                            LLVMValueRef *level_out)
 653 {
 654    struct lp_build_context *perquadi_bld = &bld->perquadi_bld;
 655    LLVMValueRef first_level, last_level, level;
 656
 657    first_level = bld->dynamic_state->first_level(bld->dynamic_state,
 658                                                  bld->gallivm, texture_unit);
 659    last_level = bld->dynamic_state->last_level(bld->dynamic_state,
 660                                                bld->gallivm, texture_unit);
 661    first_level = lp_build_broadcast_scalar(perquadi_bld, first_level);
 662    last_level = lp_build_broadcast_scalar(perquadi_bld, last_level);
 663
 664    level = lp_build_add(perquadi_bld, lod_ipart, first_level);
 665
 666    /* clamp level to legal range of levels */
 667    *level_out = lp_build_clamp(perquadi_bld, level, first_level, last_level);
 668 }
 669
 670
 671 /**
 672  * For PIPE_TEX_MIPFILTER_LINEAR, convert per-quad int LOD(s) to two (per-quad)
 673  * (adjacent) mipmap level indexes, and fix up float lod part accordingly.
 674  * Later, we'll sample from those two mipmap levels and interpolate between them.
 675  */
 676 void
 677 lp_build_linear_mip_levels(struct lp_build_sample_context *bld,
 678                            unsigned texture_unit,
 679                            LLVMValueRef lod_ipart,
 680                            LLVMValueRef *lod_fpart_inout,
 681                            LLVMValueRef *level0_out,
 682                            LLVMValueRef *level1_out)
 683 {
 684    LLVMBuilderRef builder = bld->gallivm->builder;
 685    struct lp_build_context *perquadi_bld = &bld->perquadi_bld;
 686    struct lp_build_context *perquadf_bld = &bld->perquadf_bld;
 687    LLVMValueRef first_level, last_level;
 688    LLVMValueRef clamp_min;
 689    LLVMValueRef clamp_max;
 690
 691    first_level = bld->dynamic_state->first_level(bld->dynamic_state,
 692                                                  bld->gallivm, texture_unit);
 693    last_level = bld->dynamic_state->last_level(bld->dynamic_state,
 694                                                bld->gallivm, texture_unit);
 695    first_level = lp_build_broadcast_scalar(perquadi_bld, first_level);
 696    last_level = lp_build_broadcast_scalar(perquadi_bld, last_level);
 697
 698    *level0_out = lp_build_add(perquadi_bld, lod_ipart, first_level);
 699    *level1_out = lp_build_add(perquadi_bld, *level0_out, perquadi_bld->one);
 700
 701    /*
 702     * Clamp both *level0_out and *level1_out to [first_level, last_level], with
 703     * the minimum number of comparisons, and zeroing lod_fpart in the extreme
 704     * ends in the process.
 705     */
 706
 707    /*
 708     * This code (vector select in particular) only works with llvm 3.1
 709     * (if there's more than one quad, with x86 backend). Might consider
 710     * converting to our lp_bld_logic helpers.
 711     */
 712 #if HAVE_LLVM < 0x0301
 713    assert(perquadi_bld->type.length == 1);
 714 #endif
 715
 716    /* *level0_out < first_level */
 717    clamp_min = LLVMBuildICmp(builder, LLVMIntSLT,
 718                              *level0_out, first_level,
 719                              "clamp_lod_to_first");
 720
 721    *level0_out = LLVMBuildSelect(builder, clamp_min,
 722                                  first_level, *level0_out, "");
 723
 724    *level1_out = LLVMBuildSelect(builder, clamp_min,
 725                                  first_level, *level1_out, "");
 726
 727    *lod_fpart_inout = LLVMBuildSelect(builder, clamp_min,
 728                                       perquadf_bld->zero, *lod_fpart_inout, "");
 729
 730    /* *level0_out >= last_level */
 731    clamp_max = LLVMBuildICmp(builder, LLVMIntSGE,
 732                              *level0_out, last_level,
 733                              "clamp_lod_to_last");
 734
 735    *level0_out = LLVMBuildSelect(builder, clamp_max,
 736                                  last_level, *level0_out, "");
 737
 738    *level1_out = LLVMBuildSelect(builder, clamp_max,
 739                                  last_level, *level1_out, "");
 740
 741    *lod_fpart_inout = LLVMBuildSelect(builder, clamp_max,
 742                                       perquadf_bld->zero, *lod_fpart_inout, "");
 743
 744    lp_build_name(*level0_out, "texture%u_miplevel0", texture_unit);
 745    lp_build_name(*level1_out, "texture%u_miplevel1", texture_unit);
 746    lp_build_name(*lod_fpart_inout, "texture%u_mipweight", texture_unit);
 747 }
 748
 749
 750 /**
 751  * Return pointer to a single mipmap level.
 752  * \param level  integer mipmap level
 753  */
 754 LLVMValueRef
 755 lp_build_get_mipmap_level(struct lp_build_sample_context *bld,
 756                           LLVMValueRef level)
 757 {
 758    LLVMBuilderRef builder = bld->gallivm->builder;
 759    LLVMValueRef indexes[2], data_ptr, mip_offset;
 760
 761    indexes[0] = lp_build_const_int32(bld->gallivm, 0);
 762    indexes[1] = level;
 763    mip_offset = LLVMBuildGEP(builder, bld->mip_offsets, indexes, 2, "");
 764    mip_offset = LLVMBuildLoad(builder, mip_offset, "");
 765    data_ptr = LLVMBuildGEP(builder, bld->base_ptr, &mip_offset, 1, "");
 766    return data_ptr;
 767 }
 768
 769 /**
 770  * Return (per-pixel) offsets to mip levels.
 771  * \param level  integer mipmap level
 772  */
 773 LLVMValueRef
 774 lp_build_get_mip_offsets(struct lp_build_sample_context *bld,
 775                          LLVMValueRef level)
 776 {
 777    LLVMBuilderRef builder = bld->gallivm->builder;
 778    LLVMValueRef indexes[2], offsets, offset1;
 779
 780    indexes[0] = lp_build_const_int32(bld->gallivm, 0);
 781    if (bld->num_lods == 1) {
 782       indexes[1] = level;
 783       offset1 = LLVMBuildGEP(builder, bld->mip_offsets, indexes, 2, "");
 784       offset1 = LLVMBuildLoad(builder, offset1, "");
 785       offsets = lp_build_broadcast_scalar(&bld->int_coord_bld, offset1);
 786    }
 787    else if (bld->num_lods == bld->coord_bld.type.length / 4) {
 788       unsigned i;
 789
 790       offsets = bld->int_coord_bld.undef;
 791       for (i = 0; i < bld->num_lods; i++) {
 792          LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
 793          LLVMValueRef indexo = lp_build_const_int32(bld->gallivm, 4 * i);
 794          indexes[1] = LLVMBuildExtractElement(builder, level, indexi, "");
 795          offset1 = LLVMBuildGEP(builder, bld->mip_offsets, indexes, 2, "");
 796          offset1 = LLVMBuildLoad(builder, offset1, "");
 797          offsets = LLVMBuildInsertElement(builder, offsets, offset1, indexo, "");
 798       }
 799       offsets = lp_build_swizzle_scalar_aos(&bld->int_coord_bld, offsets, 0, 4);
 800    }
 801    else {
 802       unsigned i;
 803
 804       assert (bld->num_lods == bld->coord_bld.type.length);
 805
 806       offsets = bld->int_coord_bld.undef;
 807       for (i = 0; i < bld->num_lods; i++) {
 808          LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
 809          indexes[1] = LLVMBuildExtractElement(builder, level, indexi, "");
 810          offset1 = LLVMBuildGEP(builder, bld->mip_offsets, indexes, 2, "");
 811          offset1 = LLVMBuildLoad(builder, offset1, "");
 812          offsets = LLVMBuildInsertElement(builder, offsets, offset1, indexi, "");
 813       }
 814    }
 815    return offsets;
 816 }
 817
 818
 819 /**
 820  * Codegen equivalent for u_minify().
 821  * Return max(1, base_size >> level);
 822  */
 823 LLVMValueRef
 824 lp_build_minify(struct lp_build_context *bld,
 825                 LLVMValueRef base_size,
 826                 LLVMValueRef level)
 827 {
 828    LLVMBuilderRef builder = bld->gallivm->builder;
 829    assert(lp_check_value(bld->type, base_size));
 830    assert(lp_check_value(bld->type, level));
 831
 832    if (level == bld->zero) {
 833       /* if we're using mipmap level zero, no minification is needed */
 834       return base_size;
 835    }
 836    else {
 837       LLVMValueRef size =
 838          LLVMBuildLShr(builder, base_size, level, "minify");
 839       assert(bld->type.sign);
 840       size = lp_build_max(bld, size, bld->one);
 841       return size;
 842    }
 843 }
 844
 845
 846 /**
 847  * Dereference stride_array[mipmap_level] array to get a stride.
 848  * Return stride as a vector.
 849  */
 850 static LLVMValueRef
 851 lp_build_get_level_stride_vec(struct lp_build_sample_context *bld,
 852                               LLVMValueRef stride_array, LLVMValueRef level)
 853 {
 854    LLVMBuilderRef builder = bld->gallivm->builder;
 855    LLVMValueRef indexes[2], stride, stride1;
 856    indexes[0] = lp_build_const_int32(bld->gallivm, 0);
 857    if (bld->num_lods == 1) {
 858       indexes[1] = level;
 859       stride1 = LLVMBuildGEP(builder, stride_array, indexes, 2, "");
 860       stride1 = LLVMBuildLoad(builder, stride1, "");
 861       stride = lp_build_broadcast_scalar(&bld->int_coord_bld, stride1);
 862    }
 863    else if (bld->num_lods == bld->coord_bld.type.length / 4) {
 864       LLVMValueRef stride1;
 865       unsigned i;
 866
 867       stride = bld->int_coord_bld.undef;
 868       for (i = 0; i < bld->num_lods; i++) {
 869          LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
 870          LLVMValueRef indexo = lp_build_const_int32(bld->gallivm, i);
 871          indexes[1] = LLVMBuildExtractElement(builder, level, indexi, "");
 872          stride1 = LLVMBuildGEP(builder, stride_array, indexes, 2, "");
 873          stride1 = LLVMBuildLoad(builder, stride1, "");
 874          stride = LLVMBuildInsertElement(builder, stride, stride1, indexo, "");
 875       }
 876       stride = lp_build_swizzle_scalar_aos(&bld->int_coord_bld, stride, 0, 4);
 877    }
 878    else {
 879       LLVMValueRef stride1;
 880       unsigned i;
 881
 882       assert (bld->num_lods == bld->coord_bld.type.length);
 883
 884       stride = bld->int_coord_bld.undef;
 885       for (i = 0; i < bld->coord_bld.type.length; i++) {
 886          LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
 887          indexes[1] = LLVMBuildExtractElement(builder, level, indexi, "");
 888          stride1 = LLVMBuildGEP(builder, stride_array, indexes, 2, "");
 889          stride1 = LLVMBuildLoad(builder, stride1, "");
 890          stride = LLVMBuildInsertElement(builder, stride, stride1, indexi, "");
 891       }
 892    }
 893    return stride;
 894 }
 895
 896
 897 /**
 898  * When sampling a mipmap, we need to compute the width, height, depth
 899  * of the source levels from the level indexes.  This helper function
 900  * does that.
 901  */
 902 void
 903 lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld,
 904                             LLVMValueRef ilevel,
 905                             LLVMValueRef *out_size,
 906                             LLVMValueRef *row_stride_vec,
 907                             LLVMValueRef *img_stride_vec)
 908 {
 909    const unsigned dims = bld->dims;
 910    LLVMValueRef ilevel_vec;
 911
 912    /*
 913     * Compute width, height, depth at mipmap level 'ilevel'
 914     */
 915    if (bld->num_lods == 1) {
 916       ilevel_vec = lp_build_broadcast_scalar(&bld->int_size_bld, ilevel);
 917       *out_size = lp_build_minify(&bld->int_size_bld, bld->int_size, ilevel_vec);
 918    }
 919    else {
 920       LLVMValueRef int_size_vec;
 921       LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
 922       unsigned num_quads = bld->coord_bld.type.length / 4;
 923       unsigned i;
 924
 925       if (bld->num_lods == num_quads) {
 926          /*
 927           * XXX: this should be #ifndef SANE_INSTRUCTION_SET.
 928           * intel "forgot" the variable shift count instruction until avx2.
 929           * A harmless 8x32 shift gets translated into 32 instructions
 930           * (16 extracts, 8 scalar shifts, 8 inserts), llvm is apparently
 931           * unable to recognize if there are really just 2 different shift
 932           * count values. So do the shift 4-wide before expansion.
 933           */
 934          struct lp_build_context bld4;
 935          struct lp_type type4;
 936
 937          type4 = bld->int_coord_bld.type;
 938          type4.length = 4;
 939
 940          lp_build_context_init(&bld4, bld->gallivm, type4);
 941
 942          if (bld->dims == 1) {
 943             assert(bld->int_size_in_bld.type.length == 1);
 944             int_size_vec = lp_build_broadcast_scalar(&bld4,
 945                                                      bld->int_size);
 946          }
 947          else {
 948             assert(bld->int_size_in_bld.type.length == 4);
 949             int_size_vec = bld->int_size;
 950          }
 951
 952          for (i = 0; i < num_quads; i++) {
 953             LLVMValueRef ileveli;
 954             LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
 955
 956             ileveli = lp_build_extract_broadcast(bld->gallivm,
 957                                                  bld->perquadi_bld.type,
 958                                                  bld4.type,
 959                                                  ilevel,
 960                                                  indexi);
 961             tmp[i] = lp_build_minify(&bld4, int_size_vec, ileveli);
 962          }
 963          /*
 964           * out_size is [w0, h0, d0, _, w1, h1, d1, _, ...] vector for dims > 1,
 965           * [w0, w0, w0, w0, w1, w1, w1, w1, ...] otherwise.
 966           */
 967          *out_size = lp_build_concat(bld->gallivm,
 968                                      tmp,
 969                                      bld4.type,
 970                                      num_quads);
 971       }
 972       else {
 973         /* FIXME: this is terrible and results in _huge_ vector
 974          * (for the dims > 1 case).
 975          * Should refactor this (together with extract_image_sizes) and do
 976          * something more useful. Could for instance if we have width,height
 977          * with 4-wide vector pack all elements into a 8xi16 vector
 978          * (on which we can still do useful math) instead of using a 16xi32
 979          * vector.
 980          * FIXME: some callers can't handle this yet.
 981          * For dims == 1 this will create [w0, w1, w2, w3, ...] vector.
 982          * For dims > 1 this will create [w0, h0, d0, _, w1, h1, d1, _, ...] vector.
 983          */
 984          assert(bld->num_lods == bld->coord_bld.type.length);
 985          if (bld->dims == 1) {
 986             assert(bld->int_size_bld.type.length == 1);
 987             int_size_vec = lp_build_broadcast_scalar(&bld->int_coord_bld,
 988                                                      bld->int_size);
 989             /* vector shift with variable shift count alert... */
 990             *out_size = lp_build_minify(&bld->int_coord_bld, int_size_vec, ilevel);
 991          }
 992          else {
 993             LLVMValueRef ilevel1;
 994             for (i = 0; i < bld->num_lods; i++) {
 995                LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
 996                ilevel1 = lp_build_extract_broadcast(bld->gallivm, bld->int_coord_type,
 997                                                     bld->int_size_in_bld.type, ilevel, indexi);
 998                tmp[i] = bld->int_size;
 999                tmp[i] = lp_build_minify(&bld->int_size_in_bld, tmp[i], ilevel1);
1000             }
1001             int_size_vec = lp_build_concat(bld->gallivm,
1002                                            tmp,
1003                                            bld->int_size_in_bld.type,
1004                                            bld->num_lods);
1005          }
1006       }
1007    }
1008
1009    if (dims >= 2) {
1010       *row_stride_vec = lp_build_get_level_stride_vec(bld,
1011                                                       bld->row_stride_array,
1012                                                       ilevel);
1013    }
1014    if (dims == 3 ||
1015        bld->static_texture_state->target == PIPE_TEXTURE_CUBE ||
1016        bld->static_texture_state->target == PIPE_TEXTURE_1D_ARRAY ||
1017        bld->static_texture_state->target == PIPE_TEXTURE_2D_ARRAY) {
1018       *img_stride_vec = lp_build_get_level_stride_vec(bld,
1019                                                       bld->img_stride_array,
1020                                                       ilevel);
1021    }
1022 }
1023
1024
1025 /**
1026  * Extract and broadcast texture size.
1027  *
1028  * @param size_type   type of the texture size vector (either
1029  *                    bld->int_size_type or bld->float_size_type)
1030  * @param coord_type  type of the texture size vector (either
1031  *                    bld->int_coord_type or bld->coord_type)
1032  * @param size        vector with the texture size (width, height, depth)
1033  */
1034 void
1035 lp_build_extract_image_sizes(struct lp_build_sample_context *bld,
1036                              struct lp_build_context *size_bld,
1037                              struct lp_type coord_type,
1038                              LLVMValueRef size,
1039                              LLVMValueRef *out_width,
1040                              LLVMValueRef *out_height,
1041                              LLVMValueRef *out_depth)
1042 {
1043    const unsigned dims = bld->dims;
1044    LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1045    struct lp_type size_type = size_bld->type;
1046
1047    if (bld->num_lods == 1) {
1048       *out_width = lp_build_extract_broadcast(bld->gallivm,
1049                                               size_type,
1050                                               coord_type,
1051                                               size,
1052                                               LLVMConstInt(i32t, 0, 0));
1053       if (dims >= 2) {
1054          *out_height = lp_build_extract_broadcast(bld->gallivm,
1055                                                   size_type,
1056                                                   coord_type,
1057                                                   size,
1058                                                   LLVMConstInt(i32t, 1, 0));
1059          if (dims == 3) {
1060             *out_depth = lp_build_extract_broadcast(bld->gallivm,
1061                                                     size_type,
1062                                                     coord_type,
1063                                                     size,
1064                                                     LLVMConstInt(i32t, 2, 0));
1065          }
1066       }
1067    }
1068    else {
1069       unsigned num_quads = bld->coord_bld.type.length / 4;
1070
1071       if (dims == 1) {
1072          *out_width = size;
1073       }
1074       else if (bld->num_lods == num_quads) {
1075          *out_width = lp_build_swizzle_scalar_aos(size_bld, size, 0, 4);
1076          if (dims >= 2) {
1077             *out_height = lp_build_swizzle_scalar_aos(size_bld, size, 1, 4);
1078             if (dims == 3) {
1079                *out_depth = lp_build_swizzle_scalar_aos(size_bld, size, 2, 4);
1080             }
1081          }
1082       }
1083       else {
1084          assert(bld->num_lods == bld->coord_type.length);
1085          *out_width = lp_build_pack_aos_scalars(bld->gallivm, size_type,
1086                                                 coord_type, size, 0);
1087          if (dims >= 2) {
1088             *out_width = lp_build_pack_aos_scalars(bld->gallivm, size_type,
1089                                                    coord_type, size, 1);
1090             if (dims == 3) {
1091                *out_width = lp_build_pack_aos_scalars(bld->gallivm, size_type,
1092                                                       coord_type, size, 2);
1093             }
1094          }
1095       }
1096    }
1097 }
1098
1099
1100 /**
1101  * Unnormalize coords.
1102  *
1103  * @param flt_size  vector with the integer texture size (width, height, depth)
1104  */
1105 void
1106 lp_build_unnormalized_coords(struct lp_build_sample_context *bld,
1107                              LLVMValueRef flt_size,
1108                              LLVMValueRef *s,
1109                              LLVMValueRef *t,
1110                              LLVMValueRef *r)
1111 {
1112    const unsigned dims = bld->dims;
1113    LLVMValueRef width;
1114    LLVMValueRef height;
1115    LLVMValueRef depth;
1116
1117    lp_build_extract_image_sizes(bld,
1118                                 &bld->float_size_bld,
1119                                 bld->coord_type,
1120                                 flt_size,
1121                                 &width,
1122                                 &height,
1123                                 &depth);
1124
1125    /* s = s * width, t = t * height */
1126    *s = lp_build_mul(&bld->coord_bld, *s, width);
1127    if (dims >= 2) {
1128       *t = lp_build_mul(&bld->coord_bld, *t, height);
1129       if (dims >= 3) {
1130          *r = lp_build_mul(&bld->coord_bld, *r, depth);
1131       }
1132    }
1133 }
1134
1135
1136 /** Helper used by lp_build_cube_lookup() */
1137 static LLVMValueRef
1138 lp_build_cube_imapos(struct lp_build_context *coord_bld, LLVMValueRef coord)
1139 {
1140    /* ima = +0.5 / abs(coord); */
1141    LLVMValueRef posHalf = lp_build_const_vec(coord_bld->gallivm, coord_bld->type, 0.5);
1142    LLVMValueRef absCoord = lp_build_abs(coord_bld, coord);
1143    LLVMValueRef ima = lp_build_div(coord_bld, posHalf, absCoord);
1144    return ima;
1145 }
1146
1147 /** Helper used by lp_build_cube_lookup() */
1148 static LLVMValueRef
1149 lp_build_cube_imaneg(struct lp_build_context *coord_bld, LLVMValueRef coord)
1150 {
1151    /* ima = -0.5 / abs(coord); */
1152    LLVMValueRef negHalf = lp_build_const_vec(coord_bld->gallivm, coord_bld->type, -0.5);
1153    LLVMValueRef absCoord = lp_build_abs(coord_bld, coord);
1154    LLVMValueRef ima = lp_build_div(coord_bld, negHalf, absCoord);
1155    return ima;
1156 }
1157
1158 /**
1159  * Helper used by lp_build_cube_lookup()
1160  * FIXME: the sign here can also be 0.
1161  * Arithmetically this could definitely make a difference. Either
1162  * fix the comment or use other (simpler) sign function, not sure
1163  * which one it should be.
1164  * \param sign  scalar +1 or -1
1165  * \param coord  float vector
1166  * \param ima  float vector
1167  */
1168 static LLVMValueRef
1169 lp_build_cube_coord(struct lp_build_context *coord_bld,
1170                     LLVMValueRef sign, int negate_coord,
1171                     LLVMValueRef coord, LLVMValueRef ima)
1172 {
1173    /* return negate(coord) * ima * sign + 0.5; */
1174    LLVMValueRef half = lp_build_const_vec(coord_bld->gallivm, coord_bld->type, 0.5);
1175    LLVMValueRef res;
1176
1177    assert(negate_coord == +1 || negate_coord == -1);
1178
1179    if (negate_coord == -1) {
1180       coord = lp_build_negate(coord_bld, coord);
1181    }
1182
1183    res = lp_build_mul(coord_bld, coord, ima);
1184    if (sign) {
1185       sign = lp_build_broadcast_scalar(coord_bld, sign);
1186       res = lp_build_mul(coord_bld, res, sign);
1187    }
1188    res = lp_build_add(coord_bld, res, half);
1189
1190    return res;
1191 }
1192
1193
1194 /** Helper used by lp_build_cube_lookup()
1195  * Return (major_coord >= 0) ? pos_face : neg_face;
1196  */
1197 static LLVMValueRef
1198 lp_build_cube_face(struct lp_build_sample_context *bld,
1199                    LLVMValueRef major_coord,
1200                    unsigned pos_face, unsigned neg_face)
1201 {
1202    struct gallivm_state *gallivm = bld->gallivm;
1203    LLVMBuilderRef builder = gallivm->builder;
1204    LLVMValueRef cmp = LLVMBuildFCmp(builder, LLVMRealUGE,
1205                                     major_coord,
1206                                     bld->float_bld.zero, "");
1207    LLVMValueRef pos = lp_build_const_int32(gallivm, pos_face);
1208    LLVMValueRef neg = lp_build_const_int32(gallivm, neg_face);
1209    LLVMValueRef res = LLVMBuildSelect(builder, cmp, pos, neg, "");
1210    return res;
1211 }
1212
1213
1214
/**
 * Generate code to do cube face selection and compute per-face texcoords.
 *
 * The face is chosen from the per-quad sum of the coordinates (the axis
 * with the largest absolute sum wins), while the per-face coordinates are
 * computed from the individual s/t/r values.
 *
 * Two code paths exist: for coord vectors longer than 4 elements a
 * branch-free version built from selects is emitted, otherwise (exactly
 * 4 elements) the face is picked with run-time if/else branches.
 *
 * \param s, t, r  coordinate vectors of bld->coord_bld type
 * \param face     receives the face index vector (bld->int_coord_bld type)
 * \param face_s   receives the first per-face coordinate (coord_bld type)
 * \param face_t   receives the second per-face coordinate (coord_bld type)
 */
void
lp_build_cube_lookup(struct lp_build_sample_context *bld,
                     LLVMValueRef s,
                     LLVMValueRef t,
                     LLVMValueRef r,
                     LLVMValueRef *face,
                     LLVMValueRef *face_s,
                     LLVMValueRef *face_t)
{
   struct lp_build_context *coord_bld = &bld->coord_bld;
   LLVMBuilderRef builder = bld->gallivm->builder;
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMValueRef rx, ry, rz;
   LLVMValueRef tmp[4], rxyz, arxyz;

   /*
    * Use the average of the four pixel's texcoords to choose the face.
    * Slight simplification just calculate the sum, skip scaling.
    * Only three of the four tmp slots are filled in and passed on.
    */
   tmp[0] = s;
   tmp[1] = t;
   tmp[2] = r;
   rxyz = lp_build_hadd_partial4(&bld->coord_bld, tmp, 3);
   arxyz = lp_build_abs(&bld->coord_bld, rxyz);

   if (coord_bld->type.length > 4) {
      /* wide vectors: branch-free path, everything is done with selects */
      struct lp_build_context *cint_bld = &bld->int_coord_bld;
      struct lp_type intctype = cint_bld->type;
      LLVMValueRef signrxs, signrys, signrzs, signrxyz, sign;
      LLVMValueRef arxs, arys, arzs;
      LLVMValueRef arx_ge_ary, maxarxsarys, arz_ge_arx_ary;
      LLVMValueRef snewx, tnewx, snewy, tnewy, snewz, tnewz;
      LLVMValueRef ryneg, rzneg;
      LLVMValueRef ma, ima;
      LLVMValueRef posHalf = lp_build_const_vec(gallivm, coord_bld->type, 0.5);
      /* only the top bit of every integer element set */
      LLVMValueRef signmask = lp_build_const_int_vec(gallivm, intctype,
                                                     1 << (intctype.width - 1));
      /* shift count that moves the top bit down to bit 0 */
      LLVMValueRef signshift = lp_build_const_int_vec(gallivm, intctype,
                                                      intctype.width -1);
      LLVMValueRef facex = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_X);
      LLVMValueRef facey = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_Y);
      LLVMValueRef facez = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_Z);

      /* the negative face of an axis must directly follow the positive one */
      assert(PIPE_TEX_FACE_NEG_X == PIPE_TEX_FACE_POS_X + 1);
      assert(PIPE_TEX_FACE_NEG_Y == PIPE_TEX_FACE_POS_Y + 1);
      assert(PIPE_TEX_FACE_NEG_Z == PIPE_TEX_FACE_POS_Z + 1);

      /*
       * Work on the raw bit patterns of the float coords: xor with
       * signmask flips the top (sign) bit, i.e. negates the value.
       */
      rx = LLVMBuildBitCast(builder, s, lp_build_vec_type(gallivm, intctype), "");
      ry = LLVMBuildBitCast(builder, t, lp_build_vec_type(gallivm, intctype), "");
      rz = LLVMBuildBitCast(builder, r, lp_build_vec_type(gallivm, intctype), "");
      ryneg = LLVMBuildXor(builder, ry, signmask, "");
      rzneg = LLVMBuildXor(builder, rz, signmask, "");

      /* the sign bit comes from the averaged vector (per quad),
       * as does the decision which face to use */
      signrxyz = LLVMBuildBitCast(builder, rxyz, lp_build_vec_type(gallivm, intctype), "");
      signrxyz = LLVMBuildAnd(builder, signrxyz, signmask, "");

      /* replicate channel 0 / 1 / 2 of each group of four (abs x / y / z) */
      arxs = lp_build_swizzle_scalar_aos(coord_bld, arxyz, 0, 4);
      arys = lp_build_swizzle_scalar_aos(coord_bld, arxyz, 1, 4);
      arzs = lp_build_swizzle_scalar_aos(coord_bld, arxyz, 2, 4);

      /*
       * select x if x >= y else select y
       * select previous result if max(x,y) >= z else select z
       */
      arx_ge_ary = lp_build_cmp(coord_bld, PIPE_FUNC_GEQUAL, arxs, arys);
      maxarxsarys = lp_build_max(coord_bld, arxs, arys);
      arz_ge_arx_ary = lp_build_cmp(coord_bld, PIPE_FUNC_GEQUAL, maxarxsarys, arzs);

      /*
       * compute all possible new s/t coords
       * snewx = signrx * -rz;
       * tnewx = -ry;
       * snewy = rx;
       * tnewy = signry * rz;
       * snewz = signrz * rx;
       * tnewz = -ry;
       *
       * The multiply by the sign is done as an xor of the sign bits.
       */
      signrxs = lp_build_swizzle_scalar_aos(cint_bld, signrxyz, 0, 4);
      snewx = LLVMBuildXor(builder, signrxs, rzneg, "");
      tnewx = ryneg;

      signrys = lp_build_swizzle_scalar_aos(cint_bld, signrxyz, 1, 4);
      snewy = rx;
      tnewy = LLVMBuildXor(builder, signrys, rz, "");

      signrzs = lp_build_swizzle_scalar_aos(cint_bld, signrxyz, 2, 4);
      snewz = LLVMBuildXor(builder, signrzs, rx, "");
      tnewz = ryneg;

      /* XXX on x86 unclear if we should cast the values back to float
       * or not - on some cpus (nehalem) pblendvb has twice the throughput
       * of blendvps though on others there just might be domain
       * transition penalties when using it (this depends on what llvm
       * will chose for the bit ops above so there appears no "right way",
       * but given the boatload of selects let's just use the int type).
       *
       * Unfortunately we also need the sign bit of the summed coords.
       */
      /* first round: x versus y */
      *face_s = lp_build_select(cint_bld, arx_ge_ary, snewx, snewy);
      *face_t = lp_build_select(cint_bld, arx_ge_ary, tnewx, tnewy);
      ma = lp_build_select(coord_bld, arx_ge_ary, s, t);
      *face = lp_build_select(cint_bld, arx_ge_ary, facex, facey);
      sign = lp_build_select(cint_bld, arx_ge_ary, signrxs, signrys);

      /* second round: winner of the first round versus z */
      *face_s = lp_build_select(cint_bld, arz_ge_arx_ary, *face_s, snewz);
      *face_t = lp_build_select(cint_bld, arz_ge_arx_ary, *face_t, tnewz);
      ma = lp_build_select(coord_bld, arz_ge_arx_ary, ma, r);
      *face = lp_build_select(cint_bld, arz_ge_arx_ary, *face, facez);
      sign = lp_build_select(cint_bld, arz_ge_arx_ary, sign, signrzs);

      /* back from bit patterns to the float coord type */
      *face_s = LLVMBuildBitCast(builder, *face_s,
                               lp_build_vec_type(gallivm, coord_bld->type), "");
      *face_t = LLVMBuildBitCast(builder, *face_t,
                               lp_build_vec_type(gallivm, coord_bld->type), "");

      /* add +1 for neg face */
      /* XXX with AVX probably want to use another select here -
       * as long as we ensure vblendvps gets used we can actually
       * skip the comparison and just use sign as a "mask" directly.
       */
      /*
       * NOTE(review): the sign bit is moved to bit 0 and or'ed in, which
       * equals an add only if the POS_* face values have bit 0 clear -
       * presumably they do, confirm against the PIPE_TEX_FACE_* defines.
       */
      sign = LLVMBuildLShr(builder, sign, signshift, "");
      *face = LLVMBuildOr(builder, *face, sign, "face");

      /* +0.5 / abs(major axis coord) */
      ima = lp_build_cube_imapos(coord_bld, ma);

      *face_s = lp_build_mul(coord_bld, *face_s, ima);
      *face_s = lp_build_add(coord_bld, *face_s, posHalf);
      *face_t = lp_build_mul(coord_bld, *face_t, ima);
      *face_t = lp_build_add(coord_bld, *face_t, posHalf);
   }

   else {
      /* 4-wide vectors: pick the face with run-time branches */
      struct lp_build_if_state if_ctx;
      LLVMValueRef face_s_var;
      LLVMValueRef face_t_var;
      LLVMValueRef face_var;
      LLVMValueRef arx_ge_ary_arz, ary_ge_arx_arz;
      LLVMValueRef shuffles[4];
      LLVMValueRef arxy_ge_aryx, arxy_ge_arzz, arxy_ge_arxy_arzz;
      LLVMValueRef arxyxy, aryxzz, arxyxy_ge_aryxzz;
      struct lp_build_context *float_bld = &bld->float_bld;

      assert(bld->coord_bld.type.length == 4);

      /*
       * Do all four comparisons at once:
       * [ax, ay, ax, ay] >= [ay, ax, az, az]
       */
      shuffles[0] = lp_build_const_int32(gallivm, 0);
      shuffles[1] = lp_build_const_int32(gallivm, 1);
      shuffles[2] = lp_build_const_int32(gallivm, 0);
      shuffles[3] = lp_build_const_int32(gallivm, 1);
      arxyxy = LLVMBuildShuffleVector(builder, arxyz, arxyz, LLVMConstVector(shuffles, 4), "");
      shuffles[0] = lp_build_const_int32(gallivm, 1);
      shuffles[1] = lp_build_const_int32(gallivm, 0);
      shuffles[2] = lp_build_const_int32(gallivm, 2);
      shuffles[3] = lp_build_const_int32(gallivm, 2);
      aryxzz = LLVMBuildShuffleVector(builder, arxyz, arxyz, LLVMConstVector(shuffles, 4), "");
      arxyxy_ge_aryxzz = lp_build_cmp(&bld->coord_bld, PIPE_FUNC_GEQUAL, arxyxy, aryxzz);

      /*
       * Split the result into its low half [ax>=ay, ay>=ax] and high half
       * [ax>=az, ay>=az]; and'ing them gives
       * [ax >= ay && ax >= az, ay >= ax && ay >= az].
       */
      shuffles[0] = lp_build_const_int32(gallivm, 0);
      shuffles[1] = lp_build_const_int32(gallivm, 1);
      arxy_ge_aryx = LLVMBuildShuffleVector(builder, arxyxy_ge_aryxzz, arxyxy_ge_aryxzz,
                                            LLVMConstVector(shuffles, 2), "");
      shuffles[0] = lp_build_const_int32(gallivm, 2);
      shuffles[1] = lp_build_const_int32(gallivm, 3);
      arxy_ge_arzz = LLVMBuildShuffleVector(builder, arxyxy_ge_aryxzz, arxyxy_ge_aryxzz,
                                            LLVMConstVector(shuffles, 2), "");
      arxy_ge_arxy_arzz = LLVMBuildAnd(builder, arxy_ge_aryx, arxy_ge_arzz, "");

      /* turn the two mask elements into scalar branch conditions */
      arx_ge_ary_arz = LLVMBuildExtractElement(builder, arxy_ge_arxy_arzz,
                                               lp_build_const_int32(gallivm, 0), "");
      arx_ge_ary_arz = LLVMBuildICmp(builder, LLVMIntNE, arx_ge_ary_arz,
                                               lp_build_const_int32(gallivm, 0), "");
      ary_ge_arx_arz = LLVMBuildExtractElement(builder, arxy_ge_arxy_arzz,
                                               lp_build_const_int32(gallivm, 1), "");
      ary_ge_arx_arz = LLVMBuildICmp(builder, LLVMIntNE, ary_ge_arx_arz,
                                               lp_build_const_int32(gallivm, 0), "");
      /* stack slots used to carry the results out of the branches below */
      face_s_var = lp_build_alloca(gallivm, bld->coord_bld.vec_type, "face_s_var");
      face_t_var = lp_build_alloca(gallivm, bld->coord_bld.vec_type, "face_t_var");
      face_var = lp_build_alloca(gallivm, bld->int_bld.vec_type, "face_var");

      lp_build_if(&if_ctx, gallivm, arx_ge_ary_arz);
      {
         /* +/- X face */
         LLVMValueRef sign, ima;
         rx = LLVMBuildExtractElement(builder, rxyz,
                                      lp_build_const_int32(gallivm, 0), "");
         /* sign of the summed x coord; ima = -0.5 / abs(s) */
         sign = lp_build_sgn(float_bld, rx);
         ima = lp_build_cube_imaneg(coord_bld, s);
         *face_s = lp_build_cube_coord(coord_bld, sign, +1, r, ima);
         *face_t = lp_build_cube_coord(coord_bld, NULL, +1, t, ima);
         *face = lp_build_cube_face(bld, rx,
                                    PIPE_TEX_FACE_POS_X,
                                    PIPE_TEX_FACE_NEG_X);
         LLVMBuildStore(builder, *face_s, face_s_var);
         LLVMBuildStore(builder, *face_t, face_t_var);
         LLVMBuildStore(builder, *face, face_var);
      }
      lp_build_else(&if_ctx);
      {
         struct lp_build_if_state if_ctx2;

         lp_build_if(&if_ctx2, gallivm, ary_ge_arx_arz);
         {
            LLVMValueRef sign, ima;
            /* +/- Y face */
            ry = LLVMBuildExtractElement(builder, rxyz,
                                         lp_build_const_int32(gallivm, 1), "");
            sign = lp_build_sgn(float_bld, ry);
            ima = lp_build_cube_imaneg(coord_bld, t);
            *face_s = lp_build_cube_coord(coord_bld, NULL, -1, s, ima);
            *face_t = lp_build_cube_coord(coord_bld, sign, -1, r, ima);
            *face = lp_build_cube_face(bld, ry,
                                       PIPE_TEX_FACE_POS_Y,
                                       PIPE_TEX_FACE_NEG_Y);
            LLVMBuildStore(builder, *face_s, face_s_var);
            LLVMBuildStore(builder, *face_t, face_t_var);
            LLVMBuildStore(builder, *face, face_var);
         }
         lp_build_else(&if_ctx2);
         {
            /* +/- Z face */
            LLVMValueRef sign, ima;
            rz = LLVMBuildExtractElement(builder, rxyz,
                                         lp_build_const_int32(gallivm, 2), "");
            sign = lp_build_sgn(float_bld, rz);
            ima = lp_build_cube_imaneg(coord_bld, r);
            *face_s = lp_build_cube_coord(coord_bld, sign, -1, s, ima);
            *face_t = lp_build_cube_coord(coord_bld, NULL, +1, t, ima);
            *face = lp_build_cube_face(bld, rz,
                                       PIPE_TEX_FACE_POS_Z,
                                       PIPE_TEX_FACE_NEG_Z);
            LLVMBuildStore(builder, *face_s, face_s_var);
            LLVMBuildStore(builder, *face_t, face_t_var);
            LLVMBuildStore(builder, *face, face_var);
         }
         lp_build_endif(&if_ctx2);
      }

      lp_build_endif(&if_ctx);

      /* read back whatever the taken branch stored */
      *face_s = LLVMBuildLoad(builder, face_s_var, "face_s");
      *face_t = LLVMBuildLoad(builder, face_t_var, "face_t");
      *face   = LLVMBuildLoad(builder, face_var, "face");
      /* the face was computed as a single value, expand it to a vector */
      *face   = lp_build_broadcast_scalar(&bld->int_coord_bld, *face);
   }
}
1465
1466
1467 /**
1468  * Compute the partial offset of a pixel block along an arbitrary axis.
1469  *
1470  * @param coord   coordinate in pixels
1471  * @param stride  number of bytes between rows of successive pixel blocks
1472  * @param block_length  number of pixels in a pixels block along the coordinate
1473  *                      axis
1474  * @param out_offset    resulting relative offset of the pixel block in bytes
1475  * @param out_subcoord  resulting sub-block pixel coordinate
1476  */
1477 void
1478 lp_build_sample_partial_offset(struct lp_build_context *bld,
1479                                unsigned block_length,
1480                                LLVMValueRef coord,
1481                                LLVMValueRef stride,
1482                                LLVMValueRef *out_offset,
1483                                LLVMValueRef *out_subcoord)
1484 {
1485    LLVMBuilderRef builder = bld->gallivm->builder;
1486    LLVMValueRef offset;
1487    LLVMValueRef subcoord;
1488
1489    if (block_length == 1) {
1490       subcoord = bld->zero;
1491    }
1492    else {
1493       /*
1494        * Pixel blocks have power of two dimensions. LLVM should convert the
1495        * rem/div to bit arithmetic.
1496        * TODO: Verify this.
1497        * It does indeed BUT it does transform it to scalar (and back) when doing so
1498        * (using roughly extract, shift/and, mov, unpack) (llvm 2.7).
1499        * The generated code looks seriously unfunny and is quite expensive.
1500        */
1501 #if 0
1502       LLVMValueRef block_width = lp_build_const_int_vec(bld->type, block_length);
1503       subcoord = LLVMBuildURem(builder, coord, block_width, "");
1504       coord    = LLVMBuildUDiv(builder, coord, block_width, "");
1505 #else
1506       unsigned logbase2 = util_logbase2(block_length);
1507       LLVMValueRef block_shift = lp_build_const_int_vec(bld->gallivm, bld->type, logbase2);
1508       LLVMValueRef block_mask = lp_build_const_int_vec(bld->gallivm, bld->type, block_length - 1);
1509       subcoord = LLVMBuildAnd(builder, coord, block_mask, "");
1510       coord = LLVMBuildLShr(builder, coord, block_shift, "");
1511 #endif
1512    }
1513
1514    offset = lp_build_mul(bld, coord, stride);
1515
1516    assert(out_offset);
1517    assert(out_subcoord);
1518
1519    *out_offset = offset;
1520    *out_subcoord = subcoord;
1521 }
1522
1523
1524 /**
1525  * Compute the offset of a pixel block.
1526  *
1527  * x, y, z, y_stride, z_stride are vectors, and they refer to pixels.
1528  *
1529  * Returns the relative offset and i,j sub-block coordinates
1530  */
1531 void
1532 lp_build_sample_offset(struct lp_build_context *bld,
1533                        const struct util_format_description *format_desc,
1534                        LLVMValueRef x,
1535                        LLVMValueRef y,
1536                        LLVMValueRef z,
1537                        LLVMValueRef y_stride,
1538                        LLVMValueRef z_stride,
1539                        LLVMValueRef *out_offset,
1540                        LLVMValueRef *out_i,
1541                        LLVMValueRef *out_j)
1542 {
1543    LLVMValueRef x_stride;
1544    LLVMValueRef offset;
1545
1546    x_stride = lp_build_const_vec(bld->gallivm, bld->type,
1547                                  format_desc->block.bits/8);
1548
1549    lp_build_sample_partial_offset(bld,
1550                                   format_desc->block.width,
1551                                   x, x_stride,
1552                                   &offset, out_i);
1553
1554    if (y && y_stride) {
1555       LLVMValueRef y_offset;
1556       lp_build_sample_partial_offset(bld,
1557                                      format_desc->block.height,
1558                                      y, y_stride,
1559                                      &y_offset, out_j);
1560       offset = lp_build_add(bld, offset, y_offset);
1561    }
1562    else {
1563       *out_j = bld->zero;
1564    }
1565
1566    if (z && z_stride) {
1567       LLVMValueRef z_offset;
1568       LLVMValueRef k;
1569       lp_build_sample_partial_offset(bld,
1570                                      1, /* pixel blocks are always 2D */
1571                                      z, z_stride,
1572                                      &z_offset, &k);
1573       offset = lp_build_add(bld, offset, z_offset);
1574    }
1575
1576    *out_offset = offset;
1577 }