src/gallium/drivers/llvmpipe/lp_bld_interp.c

   1 /**************************************************************************
   2  *
   3  * Copyright 2009 VMware, Inc.
   4  * Copyright 2007-2008 VMware, Inc.
   5  * All Rights Reserved.
   6  *
   7  * Permission is hereby granted, free of charge, to any person obtaining a
   8  * copy of this software and associated documentation files (the
   9  * "Software"), to deal in the Software without restriction, including
  10  * without limitation the rights to use, copy, modify, merge, publish,
  11  * distribute, sub license, and/or sell copies of the Software, and to
  12  * permit persons to whom the Software is furnished to do so, subject to
  13  * the following conditions:
  14  *
  15  * The above copyright notice and this permission notice (including the
  16  * next paragraph) shall be included in all copies or substantial portions
  17  * of the Software.
  18  *
  19  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  20  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  21  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
  22  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
  23  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  24  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  25  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  26  *
  27  **************************************************************************/
  28
  29 /**
  30  * @file
  31  * Position and shader input interpolation.
  32  *
  33  * @author Jose Fonseca <jfonseca@vmware.com>
  34  */
  35
  36 #include "pipe/p_shader_tokens.h"
  37 #include "util/u_debug.h"
  38 #include "util/u_memory.h"
  39 #include "util/u_math.h"
  40 #include "tgsi/tgsi_scan.h"
  41 #include "gallivm/lp_bld_debug.h"
  42 #include "gallivm/lp_bld_const.h"
  43 #include "gallivm/lp_bld_arit.h"
  44 #include "gallivm/lp_bld_swizzle.h"
  45 #include "gallivm/lp_bld_flow.h"
  46 #include "gallivm/lp_bld_logic.h"
  47 #include "gallivm/lp_bld_struct.h"
  48 #include "lp_bld_interp.h"
  49
  50
  51 /*
  52  * The shader JIT function operates on blocks of quads.
  53  * Each block has 2x2 quads and each quad has 2x2 pixels.
  54  *
  55  * We iterate over the quads in order 0, 1, 2, 3:
  56  *
  57  * #################
  58  * #   |   #   |   #
  59  * #---0---#---1---#
  60  * #   |   #   |   #
  61  * #################
  62  * #   |   #   |   #
  63  * #---2---#---3---#
  64  * #   |   #   |   #
  65  * #################
  66  *
  67  * If we iterate over multiple quads at once, quads 01 and 23 are processed
  68  * together.
  69  *
  70  * Within each quad, we have four pixels which are represented in SOA
  71  * order:
  72  *
  73  * #########
  74  * # 0 | 1 #
  75  * #---+---#
  76  * # 2 | 3 #
  77  * #########
  78  *
  79  * So the green channel (for example) of the four pixels is stored in
  80  * a single vector register: {g0, g1, g2, g3}.
  81  * The order stays the same even with multiple quads:
  82  * 0 1 4 5
  83  * 2 3 6 7
  84  * is stored as g0..g7
  85  */
  86
  87
  88 /**
  89  * Do one perspective divide per quad.
  90  *
  91  * For perspective interpolation, the final attribute value is given
  92  *
  93  *  a' = a/w = a * oow
  94  *
  95  * where
  96  *
  97  *  a = a0 + dadx*x + dady*y
  98  *  w = w0 + dwdx*x + dwdy*y
  99  *  oow = 1/w = 1/(w0 + dwdx*x + dwdy*y)
 100  *
 101  * Instead of computing the division per pixel, with this macro we compute the
 102  * division on the upper left pixel of each quad, and use a linear
 103  * approximation in the remaining pixels, given by:
 104  *
 105  *  da'dx = (dadx - dwdx*a)*oow
 106  *  da'dy = (dady - dwdy*a)*oow
 107  *
 108  * Ironically, this actually makes things slower -- probably because the
 109  * divide hardware unit is rarely used, whereas the multiply unit is typically
 110  * already saturated.
 111  */
 112 #define PERSPECTIVE_DIVIDE_PER_QUAD 0
 113
 114
 115 static const unsigned char quad_offset_x[16] = {0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1, 2, 3, 2, 3};
 116 static const unsigned char quad_offset_y[16] = {0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3};
 117
 118
 119 static void
 120 attrib_name(LLVMValueRef val, unsigned attrib, unsigned chan, const char *suffix)
 121 {
 122    if(attrib == 0)
 123       lp_build_name(val, "pos.%c%s", "xyzw"[chan], suffix);
 124    else
 125       lp_build_name(val, "input%u.%c%s", attrib - 1, "xyzw"[chan], suffix);
 126 }
 127
 128 static void
 129 calc_offsets(struct lp_build_context *coeff_bld,
 130              unsigned quad_start_index,
 131              LLVMValueRef *pixoffx,
 132              LLVMValueRef *pixoffy)
 133 {
 134    unsigned i;
 135    unsigned num_pix = coeff_bld->type.length;
 136    struct gallivm_state *gallivm = coeff_bld->gallivm;
 137    LLVMBuilderRef builder = coeff_bld->gallivm->builder;
 138    LLVMValueRef nr, pixxf, pixyf;
 139
 140    *pixoffx = coeff_bld->undef;
 141    *pixoffy = coeff_bld->undef;
 142
 143    for (i = 0; i < num_pix; i++) {
 144       nr = lp_build_const_int32(gallivm, i);
 145       pixxf = lp_build_const_float(gallivm, quad_offset_x[i % num_pix] +
 146                                    (quad_start_index & 1) * 2);
 147       pixyf = lp_build_const_float(gallivm, quad_offset_y[i % num_pix] +
 148                                    (quad_start_index & 2));
 149       *pixoffx = LLVMBuildInsertElement(builder, *pixoffx, pixxf, nr, "");
 150       *pixoffy = LLVMBuildInsertElement(builder, *pixoffy, pixyf, nr, "");
 151    }
 152 }
 153
 154
 155 /* Much easier, and significantly less instructions in the per-stamp
 156  * part (less than half) but overall more instructions so a loss if
 157  * most quads are active. Might be a win though with larger vectors.
 158  * No ability to do per-quad divide (doable but not implemented)
 159  * Could be made to work with passed in pixel offsets (i.e. active quad merging).
 160  */
 161 static void
 162 coeffs_init_simple(struct lp_build_interp_soa_context *bld,
 163                    LLVMValueRef a0_ptr,
 164                    LLVMValueRef dadx_ptr,
 165                    LLVMValueRef dady_ptr)
 166 {
 167    struct lp_build_context *coeff_bld = &bld->coeff_bld;
 168    struct lp_build_context *setup_bld = &bld->setup_bld;
 169    struct gallivm_state *gallivm = coeff_bld->gallivm;
 170    LLVMBuilderRef builder = gallivm->builder;
 171    unsigned attrib;
 172
 173    for (attrib = 0; attrib < bld->num_attribs; ++attrib) {
 174       /*
 175        * always fetch all 4 values for performance/simplicity
 176        * Note: we do that here because it seems to generate better
 177        * code. It generates a lot of moves initially but less
 178        * moves later. As far as I can tell this looks like a
 179        * llvm issue, instead of simply reloading the values from
 180        * the passed in pointers it if it runs out of registers
 181        * it spills/reloads them. Maybe some optimization passes
 182        * would help.
 183        * Might want to investigate this again later.
 184        */
 185       const unsigned interp = bld->interp[attrib];
 186       LLVMValueRef index = lp_build_const_int32(gallivm,
 187                                 attrib * TGSI_NUM_CHANNELS);
 188       LLVMValueRef ptr;
 189       LLVMValueRef dadxaos = setup_bld->zero;
 190       LLVMValueRef dadyaos = setup_bld->zero;
 191       LLVMValueRef a0aos = setup_bld->zero;
 192
 193       switch (interp) {
 194       case LP_INTERP_PERSPECTIVE:
 195          /* fall-through */
 196
 197       case LP_INTERP_LINEAR:
 198          ptr = LLVMBuildGEP(builder, dadx_ptr, &index, 1, "");
 199          ptr = LLVMBuildBitCast(builder, ptr,
 200                LLVMPointerType(setup_bld->vec_type, 0), "");
 201          dadxaos = LLVMBuildLoad(builder, ptr, "");
 202
 203          ptr = LLVMBuildGEP(builder, dady_ptr, &index, 1, "");
 204          ptr = LLVMBuildBitCast(builder, ptr,
 205                LLVMPointerType(setup_bld->vec_type, 0), "");
 206          dadyaos = LLVMBuildLoad(builder, ptr, "");
 207
 208          attrib_name(dadxaos, attrib, 0, ".dadxaos");
 209          attrib_name(dadyaos, attrib, 0, ".dadyaos");
 210          /* fall-through */
 211
 212       case LP_INTERP_CONSTANT:
 213       case LP_INTERP_FACING:
 214          ptr = LLVMBuildGEP(builder, a0_ptr, &index, 1, "");
 215          ptr = LLVMBuildBitCast(builder, ptr,
 216                LLVMPointerType(setup_bld->vec_type, 0), "");
 217          a0aos = LLVMBuildLoad(builder, ptr, "");
 218          attrib_name(a0aos, attrib, 0, ".a0aos");
 219          break;
 220
 221       case LP_INTERP_POSITION:
 222          /* Nothing to do as the position coeffs are already setup in slot 0 */
 223          continue;
 224
 225       default:
 226          assert(0);
 227          break;
 228       }
 229       bld->a0aos[attrib] = a0aos;
 230       bld->dadxaos[attrib] = dadxaos;
 231       bld->dadyaos[attrib] = dadyaos;
 232    }
 233 }
 234
 235 /**
 236  * Interpolate the shader input attribute values.
 237  * This is called for each (group of) quad(s).
 238  */
 239 static void
 240 attribs_update_simple(struct lp_build_interp_soa_context *bld,
 241                       struct gallivm_state *gallivm,
 242                       LLVMValueRef loop_iter,
 243                       LLVMValueRef mask_store,
 244                       LLVMValueRef sample_id,
 245                       int start,
 246                       int end)
 247 {
 248    LLVMBuilderRef builder = gallivm->builder;
 249    struct lp_build_context *coeff_bld = &bld->coeff_bld;
 250    struct lp_build_context *setup_bld = &bld->setup_bld;
 251    LLVMValueRef oow = NULL;
 252    unsigned attrib;
 253    LLVMValueRef pixoffx;
 254    LLVMValueRef pixoffy;
 255    LLVMValueRef ptr;
 256
 257    /* could do this with code-generated passed in pixel offsets too */
 258
 259    assert(loop_iter);
 260    ptr = LLVMBuildGEP(builder, bld->xoffset_store, &loop_iter, 1, "");
 261    pixoffx = LLVMBuildLoad(builder, ptr, "");
 262    ptr = LLVMBuildGEP(builder, bld->yoffset_store, &loop_iter, 1, "");
 263    pixoffy = LLVMBuildLoad(builder, ptr, "");
 264
 265    pixoffx = LLVMBuildFAdd(builder, pixoffx,
 266                            lp_build_broadcast_scalar(coeff_bld, bld->x), "");
 267    pixoffy = LLVMBuildFAdd(builder, pixoffy,
 268                            lp_build_broadcast_scalar(coeff_bld, bld->y), "");
 269
 270    for (attrib = start; attrib < end; attrib++) {
 271       const unsigned mask = bld->mask[attrib];
 272       const unsigned interp = bld->interp[attrib];
 273       const unsigned loc = bld->interp_loc[attrib];
 274       unsigned chan;
 275
 276       for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
 277          if (mask & (1 << chan)) {
 278             LLVMValueRef index;
 279             LLVMValueRef dadx = coeff_bld->zero;
 280             LLVMValueRef dady = coeff_bld->zero;
 281             LLVMValueRef a = coeff_bld->zero;
 282             LLVMValueRef chan_pixoffx = pixoffx, chan_pixoffy = pixoffy;
 283
 284             index = lp_build_const_int32(gallivm, chan);
 285             switch (interp) {
 286             case LP_INTERP_PERSPECTIVE:
 287                /* fall-through */
 288
 289             case LP_INTERP_LINEAR:
 290                if (attrib == 0 && chan == 0) {
 291                   dadx = coeff_bld->one;
 292                   if (bld->pos_offset) {
 293                      a = lp_build_const_vec(gallivm, coeff_bld->type, bld->pos_offset);
 294                   }
 295                }
 296                else if (attrib == 0 && chan == 1) {
 297                   dady = coeff_bld->one;
 298                   if (bld->pos_offset) {
 299                      a = lp_build_const_vec(gallivm, coeff_bld->type, bld->pos_offset);
 300                   }
 301                }
 302                else {
 303                   dadx = lp_build_extract_broadcast(gallivm, setup_bld->type,
 304                                                     coeff_bld->type, bld->dadxaos[attrib],
 305                                                     index);
 306                   dady = lp_build_extract_broadcast(gallivm, setup_bld->type,
 307                                                     coeff_bld->type, bld->dadyaos[attrib],
 308                                                     index);
 309                   a = lp_build_extract_broadcast(gallivm, setup_bld->type,
 310                                                  coeff_bld->type, bld->a0aos[attrib],
 311                                                  index);
 312
 313                   if (bld->coverage_samples > 1) {
 314                      LLVMValueRef xoffset = lp_build_const_vec(gallivm, coeff_bld->type, bld->pos_offset);
 315                      LLVMValueRef yoffset = lp_build_const_vec(gallivm, coeff_bld->type, bld->pos_offset);
 316                      if (loc == TGSI_INTERPOLATE_LOC_SAMPLE || (attrib == 0 && chan == 2 && sample_id)) {
 317                         LLVMValueRef x_val_idx = LLVMBuildMul(gallivm->builder, sample_id, lp_build_const_int32(gallivm, 2), "");
 318                         LLVMValueRef y_val_idx = LLVMBuildAdd(gallivm->builder, x_val_idx, lp_build_const_int32(gallivm, 1), "");
 319
 320                         x_val_idx = LLVMBuildGEP(builder, bld->sample_pos_array, &x_val_idx, 1, "");
 321                         y_val_idx = LLVMBuildGEP(builder, bld->sample_pos_array, &y_val_idx, 1, "");
 322                         xoffset = lp_build_broadcast_scalar(coeff_bld, LLVMBuildLoad(builder, x_val_idx, ""));
 323                         yoffset = lp_build_broadcast_scalar(coeff_bld, LLVMBuildLoad(builder, y_val_idx, ""));
 324                      } else if (loc == TGSI_INTERPOLATE_LOC_CENTROID) {
 325                         LLVMValueRef centroid_x_offset = lp_build_const_vec(gallivm, coeff_bld->type, bld->pos_offset);
 326                         LLVMValueRef centroid_y_offset = lp_build_const_vec(gallivm, coeff_bld->type, bld->pos_offset);
 327
 328                         /* for centroid find covered samples for this quad. */
 329                         /* if all samples are covered use pixel centers */
 330                         LLVMValueRef s_mask_and = NULL;
 331                         for (int s = bld->coverage_samples - 1; s >= 0; s--) {
 332                            LLVMValueRef sample_cov;
 333                            LLVMValueRef s_mask_idx = LLVMBuildMul(builder, bld->num_loop, lp_build_const_int32(gallivm, s), "");
 334
 335                            s_mask_idx = LLVMBuildAdd(builder, s_mask_idx, loop_iter, "");
 336                            sample_cov = lp_build_pointer_get(builder, mask_store, s_mask_idx);
 337                            if (s == bld->coverage_samples - 1)
 338                               s_mask_and = sample_cov;
 339                            else
 340                               s_mask_and = LLVMBuildAnd(builder, s_mask_and, sample_cov, "");
 341
 342                            LLVMValueRef x_val_idx = lp_build_const_int32(gallivm, s * 2);
 343                            LLVMValueRef y_val_idx = lp_build_const_int32(gallivm, s * 2 + 1);
 344
 345                            x_val_idx = LLVMBuildGEP(builder, bld->sample_pos_array, &x_val_idx, 1, "");
 346                            y_val_idx = LLVMBuildGEP(builder, bld->sample_pos_array, &y_val_idx, 1, "");
 347                            x_val_idx = lp_build_broadcast_scalar(coeff_bld, LLVMBuildLoad(builder, x_val_idx, ""));
 348                            y_val_idx = lp_build_broadcast_scalar(coeff_bld, LLVMBuildLoad(builder, y_val_idx, ""));
 349                            centroid_x_offset = lp_build_select(coeff_bld, sample_cov, x_val_idx, centroid_x_offset);
 350                            centroid_y_offset = lp_build_select(coeff_bld, sample_cov, y_val_idx, centroid_y_offset);
 351                         }
 352                         xoffset = lp_build_select(coeff_bld, s_mask_and, xoffset, centroid_x_offset);
 353                         yoffset = lp_build_select(coeff_bld, s_mask_and, yoffset, centroid_y_offset);
 354                      }
 355                      chan_pixoffx = lp_build_add(coeff_bld, chan_pixoffx, xoffset);
 356                      chan_pixoffy = lp_build_add(coeff_bld, chan_pixoffy, yoffset);
 357                   }
 358                }
 359                /*
 360                 * a = a0 + (x * dadx + y * dady)
 361                 */
 362                a = lp_build_fmuladd(builder, dadx, chan_pixoffx, a);
 363                a = lp_build_fmuladd(builder, dady, chan_pixoffy, a);
 364
 365                if (interp == LP_INTERP_PERSPECTIVE) {
 366                   if (oow == NULL) {
 367                      LLVMValueRef w = bld->attribs[0][3];
 368                      assert(attrib != 0);
 369                      assert(bld->mask[0] & TGSI_WRITEMASK_W);
 370                      oow = lp_build_rcp(coeff_bld, w);
 371                   }
 372                   a = lp_build_mul(coeff_bld, a, oow);
 373                }
 374                break;
 375
 376             case LP_INTERP_CONSTANT:
 377             case LP_INTERP_FACING:
 378                a = lp_build_extract_broadcast(gallivm, setup_bld->type,
 379                                               coeff_bld->type, bld->a0aos[attrib],
 380                                               index);
 381                break;
 382
 383             case LP_INTERP_POSITION:
 384                assert(attrib > 0);
 385                a = bld->attribs[0][chan];
 386                break;
 387
 388             default:
 389                assert(0);
 390                break;
 391             }
 392
 393             if ((attrib == 0) && (chan == 2) && !bld->depth_clamp){
 394                /* FIXME: Depth values can exceed 1.0, due to the fact that
 395                 * setup interpolation coefficients refer to (0,0) which causes
 396                 * precision loss. So we must clamp to 1.0 here to avoid artifacts.
 397                 * Note though values outside [0,1] are perfectly valid with
 398                 * depth clip disabled.
 399                 * XXX: If depth clip is disabled but we force depth clamp
 400                 * we may get values larger than 1.0 in the fs (but not in
 401                 * depth test). Not sure if that's an issue...
 402                 * Also, on a similar note, it is not obvious if the depth values
 403                 * appearing in fs (with depth clip disabled) should be clamped
 404                 * to [0,1], clamped to near/far or not be clamped at all...
 405                 */
 406                a = lp_build_min(coeff_bld, a, coeff_bld->one);
 407             }
 408             bld->attribs[attrib][chan] = a;
 409          }
 410       }
 411    }
 412 }
 413
 414 /**
 415  * Initialize the bld->a, dadq fields.  This involves fetching
 416  * those values from the arrays which are passed into the JIT function.
 417  */
 418 static void
 419 coeffs_init(struct lp_build_interp_soa_context *bld,
 420             LLVMValueRef a0_ptr,
 421             LLVMValueRef dadx_ptr,
 422             LLVMValueRef dady_ptr)
 423 {
 424    struct lp_build_context *coeff_bld = &bld->coeff_bld;
 425    struct lp_build_context *setup_bld = &bld->setup_bld;
 426    struct gallivm_state *gallivm = coeff_bld->gallivm;
 427    LLVMBuilderRef builder = gallivm->builder;
 428    LLVMValueRef pixoffx, pixoffy;
 429    unsigned attrib;
 430    unsigned chan;
 431    unsigned i;
 432
 433    pixoffx = coeff_bld->undef;
 434    pixoffy = coeff_bld->undef;
 435    for (i = 0; i < coeff_bld->type.length; i++) {
 436       LLVMValueRef nr = lp_build_const_int32(gallivm, i);
 437       LLVMValueRef pixxf = lp_build_const_float(gallivm, quad_offset_x[i]);
 438       LLVMValueRef pixyf = lp_build_const_float(gallivm, quad_offset_y[i]);
 439       pixoffx = LLVMBuildInsertElement(builder, pixoffx, pixxf, nr, "");
 440       pixoffy = LLVMBuildInsertElement(builder, pixoffy, pixyf, nr, "");
 441    }
 442
 443
 444    for (attrib = 0; attrib < bld->num_attribs; ++attrib) {
 445       const unsigned mask = bld->mask[attrib];
 446       const unsigned interp = bld->interp[attrib];
 447       LLVMValueRef index = lp_build_const_int32(gallivm,
 448                                 attrib * TGSI_NUM_CHANNELS);
 449       LLVMValueRef ptr;
 450       LLVMValueRef dadxaos = setup_bld->zero;
 451       LLVMValueRef dadyaos = setup_bld->zero;
 452       LLVMValueRef a0aos = setup_bld->zero;
 453
 454       /* always fetch all 4 values for performance/simplicity */
 455       switch (interp) {
 456       case LP_INTERP_PERSPECTIVE:
 457          /* fall-through */
 458
 459       case LP_INTERP_LINEAR:
 460          ptr = LLVMBuildGEP(builder, dadx_ptr, &index, 1, "");
 461          ptr = LLVMBuildBitCast(builder, ptr,
 462                LLVMPointerType(setup_bld->vec_type, 0), "");
 463          dadxaos = LLVMBuildLoad(builder, ptr, "");
 464
 465          ptr = LLVMBuildGEP(builder, dady_ptr, &index, 1, "");
 466          ptr = LLVMBuildBitCast(builder, ptr,
 467                LLVMPointerType(setup_bld->vec_type, 0), "");
 468          dadyaos = LLVMBuildLoad(builder, ptr, "");
 469
 470          attrib_name(dadxaos, attrib, 0, ".dadxaos");
 471          attrib_name(dadyaos, attrib, 0, ".dadyaos");
 472          /* fall-through */
 473
 474       case LP_INTERP_CONSTANT:
 475       case LP_INTERP_FACING:
 476          ptr = LLVMBuildGEP(builder, a0_ptr, &index, 1, "");
 477          ptr = LLVMBuildBitCast(builder, ptr,
 478                LLVMPointerType(setup_bld->vec_type, 0), "");
 479          a0aos = LLVMBuildLoad(builder, ptr, "");
 480          attrib_name(a0aos, attrib, 0, ".a0aos");
 481          break;
 482
 483       case LP_INTERP_POSITION:
 484          /* Nothing to do as the position coeffs are already setup in slot 0 */
 485          continue;
 486
 487       default:
 488          assert(0);
 489          break;
 490       }
 491
 492       /*
 493        * a = a0 + (x * dadx + y * dady)
 494        * a0aos is the attrib value at top left corner of stamp
 495        */
 496       if (interp != LP_INTERP_CONSTANT &&
 497           interp != LP_INTERP_FACING) {
 498          LLVMValueRef x = lp_build_broadcast_scalar(setup_bld, bld->x);
 499          LLVMValueRef y = lp_build_broadcast_scalar(setup_bld, bld->y);
 500          a0aos = lp_build_fmuladd(builder, x, dadxaos, a0aos);
 501          a0aos = lp_build_fmuladd(builder, y, dadyaos, a0aos);
 502       }
 503
 504       /*
 505        * dadq = {0, dadx, dady, dadx + dady}
 506        * for two quads (side by side) this is:
 507        * {0, dadx, dady, dadx+dady, 2*dadx, 2*dadx+dady, 3*dadx+dady}
 508        */
 509       for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
 510          /* this generates a CRAPLOAD of shuffles... */
 511          if (mask & (1 << chan)) {
 512             LLVMValueRef dadx, dady;
 513             LLVMValueRef dadq, dadq2;
 514             LLVMValueRef a;
 515             LLVMValueRef chan_index = lp_build_const_int32(gallivm, chan);
 516
 517             if (attrib == 0 && chan == 0) {
 518                a = bld->x;
 519                if (bld->pos_offset) {
 520                   a = LLVMBuildFAdd(builder, a, lp_build_const_float(gallivm, bld->pos_offset), "");
 521                }
 522                a = lp_build_broadcast_scalar(coeff_bld, a);
 523                dadx = coeff_bld->one;
 524                dady = coeff_bld->zero;
 525             }
 526             else if (attrib == 0 && chan == 1) {
 527                a = bld->y;
 528                if (bld->pos_offset) {
 529                   a = LLVMBuildFAdd(builder, a, lp_build_const_float(gallivm, bld->pos_offset), "");
 530                }
 531                a = lp_build_broadcast_scalar(coeff_bld, a);
 532                dady = coeff_bld->one;
 533                dadx = coeff_bld->zero;
 534             }
 535             else {
 536                dadx = lp_build_extract_broadcast(gallivm, setup_bld->type,
 537                                               coeff_bld->type, dadxaos, chan_index);
 538                dady = lp_build_extract_broadcast(gallivm, setup_bld->type,
 539                                               coeff_bld->type, dadyaos, chan_index);
 540
 541                /*
 542                 * a = {a, a, a, a}
 543                 */
 544                a = lp_build_extract_broadcast(gallivm, setup_bld->type,
 545                                               coeff_bld->type, a0aos, chan_index);
 546             }
 547
 548             dadx = LLVMBuildFMul(builder, dadx, pixoffx, "");
 549             dady = LLVMBuildFMul(builder, dady, pixoffy, "");
 550             dadq = LLVMBuildFAdd(builder, dadx, dady, "");
 551
 552             /*
 553              * Compute the attrib values on the upper-left corner of each
 554              * group of quads.
 555              * Note that if we process 2 quads at once this doesn't
 556              * really exactly to what we want.
 557              * We need to access elem 0 and 2 respectively later if we process
 558              * 2 quads at once.
 559              */
 560
 561             if (interp != LP_INTERP_CONSTANT &&
 562                 interp != LP_INTERP_FACING) {
 563                dadq2 = LLVMBuildFAdd(builder, dadq, dadq, "");
 564                a = LLVMBuildFAdd(builder, a, dadq2, "");
 565             }
 566
 567 #if PERSPECTIVE_DIVIDE_PER_QUAD
 568             /*
 569              * a *= 1 / w
 570              */
 571
 572             /*
 573              * XXX since we're only going to access elements 0,2 out of 8
 574              * if we have 8-wide vectors we should do the division only 4-wide.
 575              * a is really a 2-elements in a 4-wide vector disguised as 8-wide
 576              * in this case.
 577              */
 578             if (interp == LP_INTERP_PERSPECTIVE) {
 579                LLVMValueRef w = bld->a[0][3];
 580                assert(attrib != 0);
 581                assert(bld->mask[0] & TGSI_WRITEMASK_W);
 582                if (!bld->oow) {
 583                   bld->oow = lp_build_rcp(coeff_bld, w);
 584                   lp_build_name(bld->oow, "oow");
 585                }
 586                a = lp_build_mul(coeff_bld, a, bld->oow);
 587             }
 588 #endif
 589
 590             attrib_name(a, attrib, chan, ".a");
 591             attrib_name(dadq, attrib, chan, ".dadq");
 592
 593             bld->a[attrib][chan] = lp_build_alloca(gallivm,
 594                                                    LLVMTypeOf(a), "");
 595             LLVMBuildStore(builder, a, bld->a[attrib][chan]);
 596             bld->dadq[attrib][chan] = dadq;
 597          }
 598       }
 599    }
 600 }
 601
 602
 603 /**
 604  * Increment the shader input attribute values.
 605  * This is called when we move from one quad to the next.
 606  */
 607 static void
 608 attribs_update(struct lp_build_interp_soa_context *bld,
 609                struct gallivm_state *gallivm,
 610                LLVMValueRef loop_iter,
 611                int start,
 612                int end)
 613 {
 614    LLVMBuilderRef builder = gallivm->builder;
 615    struct lp_build_context *coeff_bld = &bld->coeff_bld;
 616    LLVMValueRef oow = NULL;
 617    unsigned attrib;
 618    unsigned chan;
 619
 620    for(attrib = start; attrib < end; ++attrib) {
 621       const unsigned mask = bld->mask[attrib];
 622       const unsigned interp = bld->interp[attrib];
 623       for(chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
 624          if(mask & (1 << chan)) {
 625             LLVMValueRef a;
 626             if (interp == LP_INTERP_CONSTANT ||
 627                 interp == LP_INTERP_FACING) {
 628                a = LLVMBuildLoad(builder, bld->a[attrib][chan], "");
 629             }
 630             else if (interp == LP_INTERP_POSITION) {
 631                assert(attrib > 0);
 632                a = bld->attribs[0][chan];
 633             }
 634             else {
 635                LLVMValueRef dadq;
 636
 637                a = bld->a[attrib][chan];
 638
 639                /*
 640                 * Broadcast the attribute value for this quad into all elements
 641                 */
 642
 643                {
 644                   /* stored as vector load as float */
 645                   LLVMTypeRef ptr_type = LLVMPointerType(LLVMFloatTypeInContext(
 646                                                             gallivm->context), 0);
 647                   LLVMValueRef ptr;
 648                   a = LLVMBuildBitCast(builder, a, ptr_type, "");
 649                   ptr = LLVMBuildGEP(builder, a, &loop_iter, 1, "");
 650                   a = LLVMBuildLoad(builder, ptr, "");
 651                   a = lp_build_broadcast_scalar(&bld->coeff_bld, a);
 652                }
 653
 654                /*
 655                 * Get the derivatives.
 656                 */
 657
 658                dadq = bld->dadq[attrib][chan];
 659
 660 #if PERSPECTIVE_DIVIDE_PER_QUAD
 661                if (interp == LP_INTERP_PERSPECTIVE) {
 662                   LLVMValueRef dwdq = bld->dadq[0][3];
 663
 664                   if (oow == NULL) {
 665                      assert(bld->oow);
 666                      oow = LLVMBuildShuffleVector(coeff_bld->builder,
 667                                                   bld->oow, coeff_bld->undef,
 668                                                   shuffle, "");
 669                   }
 670
 671                   dadq = lp_build_sub(coeff_bld,
 672                                       dadq,
 673                                       lp_build_mul(coeff_bld, a, dwdq));
 674                   dadq = lp_build_mul(coeff_bld, dadq, oow);
 675                }
 676 #endif
 677
 678                /*
 679                 * Add the derivatives
 680                 */
 681
 682                a = lp_build_add(coeff_bld, a, dadq);
 683
 684 #if !PERSPECTIVE_DIVIDE_PER_QUAD
 685                if (interp == LP_INTERP_PERSPECTIVE) {
 686                   if (oow == NULL) {
 687                      LLVMValueRef w = bld->attribs[0][3];
 688                      assert(attrib != 0);
 689                      assert(bld->mask[0] & TGSI_WRITEMASK_W);
 690                      oow = lp_build_rcp(coeff_bld, w);
 691                   }
 692                   a = lp_build_mul(coeff_bld, a, oow);
 693                }
 694 #endif
 695
 696                if (attrib == 0 && chan == 2 && !bld->depth_clamp) {
 697                   /* FIXME: Depth values can exceed 1.0, due to the fact that
 698                    * setup interpolation coefficients refer to (0,0) which causes
 699                    * precision loss. So we must clamp to 1.0 here to avoid artifacts.
 700                    * Note though values outside [0,1] are perfectly valid with
 701                    * depth clip disabled..
 702                    * XXX: If depth clip is disabled but we force depth clamp
 703                    * we may get values larger than 1.0 in the fs (but not in
 704                    * depth test). Not sure if that's an issue...
 705                    * Also, on a similar note, it is not obvious if the depth values
 706                    * appearing in fs (with depth clip disabled) should be clamped
 707                    * to [0,1], clamped to near/far or not be clamped at all...
 708                    */
 709                   a = lp_build_min(coeff_bld, a, coeff_bld->one);
 710                }
 711
 712                attrib_name(a, attrib, chan, "");
 713             }
 714             bld->attribs[attrib][chan] = a;
 715          }
 716       }
 717    }
 718 }
 719
 720
 721 /**
 722  * Generate the position vectors.
 723  *
 724  * Parameter x0, y0 are the integer values with upper left coordinates.
 725  */
 726 static void
 727 pos_init(struct lp_build_interp_soa_context *bld,
 728          LLVMValueRef x0,
 729          LLVMValueRef y0)
 730 {
 731    LLVMBuilderRef builder = bld->coeff_bld.gallivm->builder;
 732    struct lp_build_context *coeff_bld = &bld->coeff_bld;
 733
 734    bld->x = LLVMBuildSIToFP(builder, x0, coeff_bld->elem_type, "");
 735    bld->y = LLVMBuildSIToFP(builder, y0, coeff_bld->elem_type, "");
 736 }
 737
 738
 739 /**
 740  * Initialize fragment shader input attribute info.
 741  */
 742 void
 743 lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld,
 744                          struct gallivm_state *gallivm,
 745                          unsigned num_inputs,
 746                          const struct lp_shader_input *inputs,
 747                          boolean pixel_center_integer,
 748                          unsigned coverage_samples,
 749                          LLVMValueRef sample_pos_array,
 750                          LLVMValueRef num_loop,
 751                          boolean depth_clamp,
 752                          LLVMBuilderRef builder,
 753                          struct lp_type type,
 754                          LLVMValueRef a0_ptr,
 755                          LLVMValueRef dadx_ptr,
 756                          LLVMValueRef dady_ptr,
 757                          LLVMValueRef x0,
 758                          LLVMValueRef y0)
 759 {
 760    struct lp_type coeff_type;
 761    struct lp_type setup_type;
 762    unsigned attrib;
 763    unsigned chan;
 764
 765    memset(bld, 0, sizeof *bld);
 766
 767    memset(&coeff_type, 0, sizeof coeff_type);
 768    coeff_type.floating = TRUE;
 769    coeff_type.sign = TRUE;
 770    coeff_type.width = 32;
 771    coeff_type.length = type.length;
 772
 773    memset(&setup_type, 0, sizeof setup_type);
 774    setup_type.floating = TRUE;
 775    setup_type.sign = TRUE;
 776    setup_type.width = 32;
 777    setup_type.length = TGSI_NUM_CHANNELS;
 778
 779
 780    /* XXX: we don't support interpolating into any other types */
 781    assert(memcmp(&coeff_type, &type, sizeof coeff_type) == 0);
 782
 783    lp_build_context_init(&bld->coeff_bld, gallivm, coeff_type);
 784    lp_build_context_init(&bld->setup_bld, gallivm, setup_type);
 785
 786    /* For convenience */
 787    bld->pos = bld->attribs[0];
 788    bld->inputs = (const LLVMValueRef (*)[TGSI_NUM_CHANNELS]) bld->attribs[1];
 789
 790    /* Position */
 791    bld->mask[0] = TGSI_WRITEMASK_XYZW;
 792    bld->interp[0] = LP_INTERP_LINEAR;
 793    bld->interp_loc[0] = 0;
 794
 795    /* Inputs */
 796    for (attrib = 0; attrib < num_inputs; ++attrib) {
 797       bld->mask[1 + attrib] = inputs[attrib].usage_mask;
 798       bld->interp[1 + attrib] = inputs[attrib].interp;
 799       bld->interp_loc[1 + attrib] = inputs[attrib].location;
 800    }
 801    bld->num_attribs = 1 + num_inputs;
 802
 803    /* Ensure all masked out input channels have a valid value */
 804    for (attrib = 0; attrib < bld->num_attribs; ++attrib) {
 805       for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
 806          bld->attribs[attrib][chan] = bld->coeff_bld.undef;
 807       }
 808    }
 809
 810    if (pixel_center_integer) {
 811       bld->pos_offset = 0.0;
 812    } else {
 813       bld->pos_offset = 0.5;
 814    }
 815    bld->depth_clamp = depth_clamp;
 816    bld->coverage_samples = coverage_samples;
 817    bld->num_loop = num_loop;
 818    bld->sample_pos_array = sample_pos_array;
 819
 820    pos_init(bld, x0, y0);
 821
 822    /*
 823     * Simple method (single step interpolation) may be slower if vector length
 824     * is just 4, but the results are different (generally less accurate) with
 825     * the other method, so always use more accurate version.
 826     */
 827    if (1) {
 828       bld->simple_interp = TRUE;
 829       {
 830          /* XXX this should use a global static table */
 831          unsigned i;
 832          unsigned num_loops = 16 / type.length;
 833          LLVMValueRef pixoffx, pixoffy, index;
 834          LLVMValueRef ptr;
 835
 836          bld->xoffset_store = lp_build_array_alloca(gallivm,
 837                                                     lp_build_vec_type(gallivm, type),
 838                                                     lp_build_const_int32(gallivm, num_loops),
 839                                                     "");
 840          bld->yoffset_store = lp_build_array_alloca(gallivm,
 841                                                     lp_build_vec_type(gallivm, type),
 842                                                     lp_build_const_int32(gallivm, num_loops),
 843                                                     "");
 844          for (i = 0; i < num_loops; i++) {
 845             index = lp_build_const_int32(gallivm, i);
 846             calc_offsets(&bld->coeff_bld, i*type.length/4, &pixoffx, &pixoffy);
 847             ptr = LLVMBuildGEP(builder, bld->xoffset_store, &index, 1, "");
 848             LLVMBuildStore(builder, pixoffx, ptr);
 849             ptr = LLVMBuildGEP(builder, bld->yoffset_store, &index, 1, "");
 850             LLVMBuildStore(builder, pixoffy, ptr);
 851          }
 852       }
 853       coeffs_init_simple(bld, a0_ptr, dadx_ptr, dady_ptr);
 854    }
 855    else {
 856       bld->simple_interp = FALSE;
 857       coeffs_init(bld, a0_ptr, dadx_ptr, dady_ptr);
 858    }
 859
 860 }
 861
 862
 863 /*
 864  * Advance the position and inputs to the given quad within the block.
 865  */
 866
 867 void
 868 lp_build_interp_soa_update_inputs_dyn(struct lp_build_interp_soa_context *bld,
 869                                       struct gallivm_state *gallivm,
 870                                       LLVMValueRef quad_start_index,
 871                                       LLVMValueRef mask_store,
 872                                       LLVMValueRef sample_id)
 873 {
 874    if (bld->simple_interp) {
 875       attribs_update_simple(bld, gallivm, quad_start_index, mask_store, sample_id, 1, bld->num_attribs);
 876    }
 877    else {
 878       attribs_update(bld, gallivm, quad_start_index, 1, bld->num_attribs);
 879    }
 880 }
 881
 882 void
 883 lp_build_interp_soa_update_pos_dyn(struct lp_build_interp_soa_context *bld,
 884                                    struct gallivm_state *gallivm,
 885                                    LLVMValueRef quad_start_index,
 886                                    LLVMValueRef sample_id)
 887 {
 888    if (bld->simple_interp) {
 889       attribs_update_simple(bld, gallivm, quad_start_index, NULL, sample_id, 0, 1);
 890    }
 891    else {
 892       attribs_update(bld, gallivm, quad_start_index, 0, 1);
 893    }
 894 }
 895