clover: Wrap event::wait_count in a method taking care of the required locking.
[mesa.git] / src / gallium / drivers / llvmpipe / lp_bld_interp.c
index d1f0185684d502701daf8ab49ee76b53b74bc7b6..79325683c61efe858212f2fb9def50320ec0766d 100644 (file)
@@ -1,7 +1,7 @@
 /**************************************************************************
  * 
  * Copyright 2009 VMware, Inc.
- * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * Copyright 2007-2008 VMware, Inc.
  * All Rights Reserved.
  * 
  * Permission is hereby granted, free of charge, to any person obtaining a
@@ -19,7 +19,7 @@
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -42,6 +42,7 @@
 #include "gallivm/lp_bld_const.h"
 #include "gallivm/lp_bld_arit.h"
 #include "gallivm/lp_bld_swizzle.h"
+#include "gallivm/lp_bld_flow.h"
 #include "lp_bld_interp.h"
 
 
@@ -61,6 +62,9 @@
  * #   |   #   |   #
  * #################
  *
+ * If we iterate over multiple quads at once, quads 01 and 23 are processed
+ * together.
+ *
  * Within each quad, we have four pixels which are represented in SOA
  * order:
  *
  *
  * So the green channel (for example) of the four pixels is stored in
  * a single vector register: {g0, g1, g2, g3}.
+ * The order stays the same even with multiple quads:
+ * 0 1 4 5
+ * 2 3 6 7
+ * is stored as g0..g7
  */
 
 
+/**
+ * Do one perspective divide per quad.
+ *
+ * For perspective interpolation, the final attribute value is given
+ *
+ *  a' = a/w = a * oow
+ *
+ * where
+ *
+ *  a = a0 + dadx*x + dady*y
+ *  w = w0 + dwdx*x + dwdy*y
+ *  oow = 1/w = 1/(w0 + dwdx*x + dwdy*y)
+ *
+ * Instead of computing the division per pixel, with this macro we compute the
+ * division on the upper left pixel of each quad, and use a linear
+ * approximation in the remaining pixels, given by:
+ *
+ *  da'dx = (dadx - dwdx*a)*oow
+ *  da'dy = (dady - dwdy*a)*oow
+ *
+ * Ironically, this actually makes things slower -- probably because the
+ * divide hardware unit is rarely used, whereas the multiply unit is typically
+ * already saturated.
+ */
+#define PERSPECTIVE_DIVIDE_PER_QUAD 0
+
+
+static const unsigned char quad_offset_x[16] = {0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1, 2, 3, 2, 3};
+static const unsigned char quad_offset_y[16] = {0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3};
+
+
 static void
 attrib_name(LLVMValueRef val, unsigned attrib, unsigned chan, const char *suffix)
 {
@@ -84,53 +123,214 @@ attrib_name(LLVMValueRef val, unsigned attrib, unsigned chan, const char *suffix
       lp_build_name(val, "input%u.%c%s", attrib - 1, "xyzw"[chan], suffix);
 }
 
+static void
+calc_offsets(struct lp_build_context *coeff_bld,
+             unsigned quad_start_index,
+             LLVMValueRef *pixoffx,
+             LLVMValueRef *pixoffy)
+{
+   unsigned i;
+   unsigned num_pix = coeff_bld->type.length;
+   struct gallivm_state *gallivm = coeff_bld->gallivm;
+   LLVMBuilderRef builder = coeff_bld->gallivm->builder;
+   LLVMValueRef nr, pixxf, pixyf;
+
+   *pixoffx = coeff_bld->undef;
+   *pixoffy = coeff_bld->undef;
+
+   for (i = 0; i < num_pix; i++) {
+      nr = lp_build_const_int32(gallivm, i);
+      pixxf = lp_build_const_float(gallivm, quad_offset_x[i % num_pix] +
+                                   (quad_start_index & 1) * 2);
+      pixyf = lp_build_const_float(gallivm, quad_offset_y[i % num_pix] +
+                                   (quad_start_index & 2));
+      *pixoffx = LLVMBuildInsertElement(builder, *pixoffx, pixxf, nr, "");
+      *pixoffy = LLVMBuildInsertElement(builder, *pixoffy, pixyf, nr, "");
+   }
+}
+
+
+/* Much easier, and significantly less instructions in the per-stamp
+ * part (less than half) but overall more instructions so a loss if
+ * most quads are active. Might be a win though with larger vectors.
+ * No ability to do per-quad divide (doable but not implemented)
+ * Could be made to work with passed in pixel offsets (i.e. active quad merging).
+ */
+static void
+coeffs_init_simple(struct lp_build_interp_soa_context *bld,
+                   LLVMValueRef a0_ptr,
+                   LLVMValueRef dadx_ptr,
+                   LLVMValueRef dady_ptr)
+{
+   struct lp_build_context *coeff_bld = &bld->coeff_bld;
+   struct lp_build_context *setup_bld = &bld->setup_bld;
+   struct gallivm_state *gallivm = coeff_bld->gallivm;
+   LLVMBuilderRef builder = gallivm->builder;
+   unsigned attrib;
+
+   for (attrib = 0; attrib < bld->num_attribs; ++attrib) {
+      /*
+       * always fetch all 4 values for performance/simplicity
+       * Note: we do that here because it seems to generate better
+       * code. It generates a lot of moves initially but less
+       * moves later. As far as I can tell this looks like a
+       * llvm issue, instead of simply reloading the values from
+       * the passed in pointers it if it runs out of registers
+       * it spills/reloads them. Maybe some optimization passes
+       * would help.
+       * Might want to investigate this again later.
+       */
+      const unsigned interp = bld->interp[attrib];
+      LLVMValueRef index = lp_build_const_int32(gallivm,
+                                attrib * TGSI_NUM_CHANNELS);
+      LLVMValueRef ptr;
+      LLVMValueRef dadxaos = setup_bld->zero;
+      LLVMValueRef dadyaos = setup_bld->zero;
+      LLVMValueRef a0aos = setup_bld->zero;
+
+      switch (interp) {
+      case LP_INTERP_PERSPECTIVE:
+         /* fall-through */
+
+      case LP_INTERP_LINEAR:
+         ptr = LLVMBuildGEP(builder, dadx_ptr, &index, 1, "");
+         ptr = LLVMBuildBitCast(builder, ptr,
+               LLVMPointerType(setup_bld->vec_type, 0), "");
+         dadxaos = LLVMBuildLoad(builder, ptr, "");
+
+         ptr = LLVMBuildGEP(builder, dady_ptr, &index, 1, "");
+         ptr = LLVMBuildBitCast(builder, ptr,
+               LLVMPointerType(setup_bld->vec_type, 0), "");
+         dadyaos = LLVMBuildLoad(builder, ptr, "");
+
+         attrib_name(dadxaos, attrib, 0, ".dadxaos");
+         attrib_name(dadyaos, attrib, 0, ".dadyaos");
+         /* fall-through */
+
+      case LP_INTERP_CONSTANT:
+      case LP_INTERP_FACING:
+         ptr = LLVMBuildGEP(builder, a0_ptr, &index, 1, "");
+         ptr = LLVMBuildBitCast(builder, ptr,
+               LLVMPointerType(setup_bld->vec_type, 0), "");
+         a0aos = LLVMBuildLoad(builder, ptr, "");
+         attrib_name(a0aos, attrib, 0, ".a0aos");
+         break;
+
+      case LP_INTERP_POSITION:
+         /* Nothing to do as the position coeffs are already setup in slot 0 */
+         continue;
+
+      default:
+         assert(0);
+         break;
+      }
+      bld->a0aos[attrib] = a0aos;
+      bld->dadxaos[attrib] = dadxaos;
+      bld->dadyaos[attrib] = dadyaos;
+   }
+}
 
 /**
- * Initialize the bld->a0, dadx, dady fields.  This involves fetching
- * those values from the arrays which are passed into the JIT function.
+ * Interpolate the shader input attribute values.
+ * This is called for each (group of) quad(s).
  */
 static void
-coeffs_init(struct lp_build_interp_soa_context *bld,
-            LLVMValueRef a0_ptr,
-            LLVMValueRef dadx_ptr,
-            LLVMValueRef dady_ptr)
+attribs_update_simple(struct lp_build_interp_soa_context *bld,
+                      struct gallivm_state *gallivm,
+                      LLVMValueRef loop_iter,
+                      int start,
+                      int end)
 {
-   LLVMBuilderRef builder = bld->base.builder;
+   LLVMBuilderRef builder = gallivm->builder;
+   struct lp_build_context *coeff_bld = &bld->coeff_bld;
+   struct lp_build_context *setup_bld = &bld->setup_bld;
+   LLVMValueRef oow = NULL;
    unsigned attrib;
-   unsigned chan;
+   LLVMValueRef pixoffx;
+   LLVMValueRef pixoffy;
+   LLVMValueRef ptr;
 
-   for(attrib = 0; attrib < bld->num_attribs; ++attrib) {
+   /* could do this with code-generated passed in pixel offsets too */
+
+   assert(loop_iter);
+   ptr = LLVMBuildGEP(builder, bld->xoffset_store, &loop_iter, 1, "");
+   pixoffx = LLVMBuildLoad(builder, ptr, "");
+   ptr = LLVMBuildGEP(builder, bld->yoffset_store, &loop_iter, 1, "");
+   pixoffy = LLVMBuildLoad(builder, ptr, "");
+
+   pixoffx = LLVMBuildFAdd(builder, pixoffx,
+                           lp_build_broadcast_scalar(coeff_bld, bld->x), "");
+   pixoffy = LLVMBuildFAdd(builder, pixoffy,
+                           lp_build_broadcast_scalar(coeff_bld, bld->y), "");
+
+   for (attrib = start; attrib < end; attrib++) {
       const unsigned mask = bld->mask[attrib];
       const unsigned interp = bld->interp[attrib];
-      for(chan = 0; chan < NUM_CHANNELS; ++chan) {
-         if(mask & (1 << chan)) {
-            LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), attrib*NUM_CHANNELS + chan, 0);
-            LLVMValueRef a0 = bld->base.undef;
-            LLVMValueRef dadx = bld->base.undef;
-            LLVMValueRef dady = bld->base.undef;
+      unsigned chan;
+
+      for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
+         if (mask & (1 << chan)) {
+            LLVMValueRef index;
+            LLVMValueRef dadx = coeff_bld->zero;
+            LLVMValueRef dady = coeff_bld->zero;
+            LLVMValueRef a = coeff_bld->zero;
 
-            switch( interp ) {
+            index = lp_build_const_int32(gallivm, chan);
+            switch (interp) {
             case LP_INTERP_PERSPECTIVE:
                /* fall-through */
 
             case LP_INTERP_LINEAR:
-               dadx = LLVMBuildLoad(builder, LLVMBuildGEP(builder, dadx_ptr, &index, 1, ""), "");
-               dady = LLVMBuildLoad(builder, LLVMBuildGEP(builder, dady_ptr, &index, 1, ""), "");
-               dadx = lp_build_broadcast_scalar(&bld->base, dadx);
-               dady = lp_build_broadcast_scalar(&bld->base, dady);
-               attrib_name(dadx, attrib, chan, ".dadx");
-               attrib_name(dady, attrib, chan, ".dady");
-               /* fall-through */
+               if (attrib == 0 && chan == 0) {
+                  dadx = coeff_bld->one;
+                  if (bld->pos_offset) {
+                     a = lp_build_const_vec(gallivm, coeff_bld->type, bld->pos_offset);
+                  }
+               }
+               else if (attrib == 0 && chan == 1) {
+                  dady = coeff_bld->one;
+                  if (bld->pos_offset) {
+                     a = lp_build_const_vec(gallivm, coeff_bld->type, bld->pos_offset);
+                  }
+               }
+               else {
+                  dadx = lp_build_extract_broadcast(gallivm, setup_bld->type,
+                                                    coeff_bld->type, bld->dadxaos[attrib],
+                                                    index);
+                  dady = lp_build_extract_broadcast(gallivm, setup_bld->type,
+                                                    coeff_bld->type, bld->dadyaos[attrib],
+                                                    index);
+                  a = lp_build_extract_broadcast(gallivm, setup_bld->type,
+                                                 coeff_bld->type, bld->a0aos[attrib],
+                                                 index);
+               }
+               /*
+                * a = a0 + (x * dadx + y * dady)
+                */
+               a = lp_build_fmuladd(builder, dadx, pixoffx, a);
+               a = lp_build_fmuladd(builder, dady, pixoffy, a);
+
+               if (interp == LP_INTERP_PERSPECTIVE) {
+                  if (oow == NULL) {
+                     LLVMValueRef w = bld->attribs[0][3];
+                     assert(attrib != 0);
+                     assert(bld->mask[0] & TGSI_WRITEMASK_W);
+                     oow = lp_build_rcp(coeff_bld, w);
+                  }
+                  a = lp_build_mul(coeff_bld, a, oow);
+               }
+               break;
 
             case LP_INTERP_CONSTANT:
             case LP_INTERP_FACING:
-               a0 = LLVMBuildLoad(builder, LLVMBuildGEP(builder, a0_ptr, &index, 1, ""), "");
-               a0 = lp_build_broadcast_scalar(&bld->base, a0);
-               attrib_name(a0, attrib, chan, ".a0");
+               a = lp_build_extract_broadcast(gallivm, setup_bld->type,
+                                              coeff_bld->type, bld->a0aos[attrib],
+                                              index);
                break;
 
             case LP_INTERP_POSITION:
-               /* Nothing to do as the position coeffs are already setup in slot 0 */
+               assert(attrib > 0);
+               a = bld->attribs[0][chan];
                break;
 
             default:
@@ -138,74 +338,210 @@ coeffs_init(struct lp_build_interp_soa_context *bld,
                break;
             }
 
-            bld->a0  [attrib][chan] = a0;
-            bld->dadx[attrib][chan] = dadx;
-            bld->dady[attrib][chan] = dady;
+            if ((attrib == 0) && (chan == 2) && !bld->depth_clamp){
+               /* FIXME: Depth values can exceed 1.0, due to the fact that
+                * setup interpolation coefficients refer to (0,0) which causes
+                * precision loss. So we must clamp to 1.0 here to avoid artifacts.
+                * Note though values outside [0,1] are perfectly valid with
+                * depth clip disabled.
+                * XXX: If depth clip is disabled but we force depth clamp
+                * we may get values larger than 1.0 in the fs (but not in
+                * depth test). Not sure if that's an issue...
+                * Also, on a similar note, it is not obvious if the depth values
+                * appearing in fs (with depth clip disabled) should be clamped
+                * to [0,1], clamped to near/far or not be clamped at all...
+                */
+               a = lp_build_min(coeff_bld, a, coeff_bld->one);
+            }
+            bld->attribs[attrib][chan] = a;
          }
       }
    }
 }
 
-
 /**
- * Emit LLVM code to compute the fragment shader input attribute values.
- * For example, for a color input, we'll compute red, green, blue and alpha
- * values for the four pixels in a quad.
- * Recall that we're operating on 4-element vectors so each arithmetic
- * operation is operating on the four pixels in a quad.
+ * Initialize the bld->a, dadq fields.  This involves fetching
+ * those values from the arrays which are passed into the JIT function.
  */
 static void
-attribs_init(struct lp_build_interp_soa_context *bld)
+coeffs_init(struct lp_build_interp_soa_context *bld,
+            LLVMValueRef a0_ptr,
+            LLVMValueRef dadx_ptr,
+            LLVMValueRef dady_ptr)
 {
-   LLVMValueRef x = bld->pos[0];
-   LLVMValueRef y = bld->pos[1];
-   LLVMValueRef oow = NULL;
+   struct lp_build_context *coeff_bld = &bld->coeff_bld;
+   struct lp_build_context *setup_bld = &bld->setup_bld;
+   struct gallivm_state *gallivm = coeff_bld->gallivm;
+   LLVMBuilderRef builder = gallivm->builder;
+   LLVMValueRef pixoffx, pixoffy;
    unsigned attrib;
    unsigned chan;
+   unsigned i;
+
+   pixoffx = coeff_bld->undef;
+   pixoffy = coeff_bld->undef;
+   for (i = 0; i < coeff_bld->type.length; i++) {
+      LLVMValueRef nr = lp_build_const_int32(gallivm, i);
+      LLVMValueRef pixxf = lp_build_const_float(gallivm, quad_offset_x[i]);
+      LLVMValueRef pixyf = lp_build_const_float(gallivm, quad_offset_y[i]);
+      pixoffx = LLVMBuildInsertElement(builder, pixoffx, pixxf, nr, "");
+      pixoffy = LLVMBuildInsertElement(builder, pixoffy, pixyf, nr, "");
+   }
+
 
-   for(attrib = 0; attrib < bld->num_attribs; ++attrib) {
+   for (attrib = 0; attrib < bld->num_attribs; ++attrib) {
       const unsigned mask = bld->mask[attrib];
       const unsigned interp = bld->interp[attrib];
-      for(chan = 0; chan < NUM_CHANNELS; ++chan) {
-         if(mask & (1 << chan)) {
-            if (interp == LP_INTERP_POSITION) {
-               assert(attrib > 0);
-               bld->attribs[attrib][chan] = bld->attribs[0][chan];
+      LLVMValueRef index = lp_build_const_int32(gallivm,
+                                attrib * TGSI_NUM_CHANNELS);
+      LLVMValueRef ptr;
+      LLVMValueRef dadxaos = setup_bld->zero;
+      LLVMValueRef dadyaos = setup_bld->zero;
+      LLVMValueRef a0aos = setup_bld->zero;
+
+      /* always fetch all 4 values for performance/simplicity */
+      switch (interp) {
+      case LP_INTERP_PERSPECTIVE:
+         /* fall-through */
+
+      case LP_INTERP_LINEAR:
+         ptr = LLVMBuildGEP(builder, dadx_ptr, &index, 1, "");
+         ptr = LLVMBuildBitCast(builder, ptr,
+               LLVMPointerType(setup_bld->vec_type, 0), "");
+         dadxaos = LLVMBuildLoad(builder, ptr, "");
+
+         ptr = LLVMBuildGEP(builder, dady_ptr, &index, 1, "");
+         ptr = LLVMBuildBitCast(builder, ptr,
+               LLVMPointerType(setup_bld->vec_type, 0), "");
+         dadyaos = LLVMBuildLoad(builder, ptr, "");
+
+         attrib_name(dadxaos, attrib, 0, ".dadxaos");
+         attrib_name(dadyaos, attrib, 0, ".dadyaos");
+         /* fall-through */
+
+      case LP_INTERP_CONSTANT:
+      case LP_INTERP_FACING:
+         ptr = LLVMBuildGEP(builder, a0_ptr, &index, 1, "");
+         ptr = LLVMBuildBitCast(builder, ptr,
+               LLVMPointerType(setup_bld->vec_type, 0), "");
+         a0aos = LLVMBuildLoad(builder, ptr, "");
+         attrib_name(a0aos, attrib, 0, ".a0aos");
+         break;
+
+      case LP_INTERP_POSITION:
+         /* Nothing to do as the position coeffs are already setup in slot 0 */
+         continue;
+
+      default:
+         assert(0);
+         break;
+      }
+
+      /*
+       * a = a0 + (x * dadx + y * dady)
+       * a0aos is the attrib value at top left corner of stamp
+       */
+      if (interp != LP_INTERP_CONSTANT &&
+          interp != LP_INTERP_FACING) {
+         LLVMValueRef x = lp_build_broadcast_scalar(setup_bld, bld->x);
+         LLVMValueRef y = lp_build_broadcast_scalar(setup_bld, bld->y);
+         a0aos = lp_build_fmuladd(builder, x, dadxaos, a0aos);
+         a0aos = lp_build_fmuladd(builder, y, dadyaos, a0aos);
+      }
+
+      /*
+       * dadq = {0, dadx, dady, dadx + dady}
+       * for two quads (side by side) this is:
+       * {0, dadx, dady, dadx+dady, 2*dadx, 2*dadx+dady, 3*dadx+dady}
+       */
+      for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
+         /* this generates a CRAPLOAD of shuffles... */
+         if (mask & (1 << chan)) {
+            LLVMValueRef dadx, dady;
+            LLVMValueRef dadq, dadq2;
+            LLVMValueRef a;
+            LLVMValueRef chan_index = lp_build_const_int32(gallivm, chan);
+
+            if (attrib == 0 && chan == 0) {
+               a = bld->x;
+               if (bld->pos_offset) {
+                  a = LLVMBuildFAdd(builder, a, lp_build_const_float(gallivm, bld->pos_offset), "");
+               }
+               a = lp_build_broadcast_scalar(coeff_bld, a);
+               dadx = coeff_bld->one;
+               dady = coeff_bld->zero;
             }
-            else {
-               LLVMValueRef a0   = bld->a0  [attrib][chan];
-               LLVMValueRef dadx = bld->dadx[attrib][chan];
-               LLVMValueRef dady = bld->dady[attrib][chan];
-               LLVMValueRef res;
-
-               res = a0;
-
-               if (interp != LP_INTERP_CONSTANT &&
-                   interp != LP_INTERP_FACING) {
-                  /* res = res + x * dadx */
-                  res = lp_build_add(&bld->base, res, lp_build_mul(&bld->base, x, dadx));
-                  /* res = res + y * dady */
-                  res = lp_build_add(&bld->base, res, lp_build_mul(&bld->base, y, dady));
+            else if (attrib == 0 && chan == 1) {
+               a = bld->y;
+               if (bld->pos_offset) {
+                  a = LLVMBuildFAdd(builder, a, lp_build_const_float(gallivm, bld->pos_offset), "");
                }
+               a = lp_build_broadcast_scalar(coeff_bld, a);
+               dady = coeff_bld->one;
+               dadx = coeff_bld->zero;
+            }
+            else {
+               dadx = lp_build_extract_broadcast(gallivm, setup_bld->type,
+                                              coeff_bld->type, dadxaos, chan_index);
+               dady = lp_build_extract_broadcast(gallivm, setup_bld->type,
+                                              coeff_bld->type, dadyaos, chan_index);
 
-               /* Keep the value of the attribute before perspective divide
-                * for faster updates.
+               /*
+                * a = {a, a, a, a}
                 */
-               bld->attribs_pre[attrib][chan] = res;
+               a = lp_build_extract_broadcast(gallivm, setup_bld->type,
+                                              coeff_bld->type, a0aos, chan_index);
+            }
 
-               if (interp == LP_INTERP_PERSPECTIVE) {
-                  LLVMValueRef w = bld->pos[3];
-                  assert(attrib != 0);
-                  assert(bld->mask[0] & TGSI_WRITEMASK_W);
-                  if(!oow)
-                     oow = lp_build_rcp(&bld->base, w);
-                  res = lp_build_mul(&bld->base, res, oow);
+            dadx = LLVMBuildFMul(builder, dadx, pixoffx, "");
+            dady = LLVMBuildFMul(builder, dady, pixoffy, "");
+            dadq = LLVMBuildFAdd(builder, dadx, dady, "");
+
+            /*
+             * Compute the attrib values on the upper-left corner of each
+             * group of quads.
+             * Note that if we process 2 quads at once this doesn't
+             * really exactly to what we want.
+             * We need to access elem 0 and 2 respectively later if we process
+             * 2 quads at once.
+             */
+
+            if (interp != LP_INTERP_CONSTANT &&
+                interp != LP_INTERP_FACING) {
+               dadq2 = LLVMBuildFAdd(builder, dadq, dadq, "");
+               a = LLVMBuildFAdd(builder, a, dadq2, "");
+           }
+
+#if PERSPECTIVE_DIVIDE_PER_QUAD
+            /*
+             * a *= 1 / w
+             */
+
+            /*
+             * XXX since we're only going to access elements 0,2 out of 8
+             * if we have 8-wide vectors we should do the division only 4-wide.
+             * a is really a 2-elements in a 4-wide vector disguised as 8-wide
+             * in this case.
+             */
+            if (interp == LP_INTERP_PERSPECTIVE) {
+               LLVMValueRef w = bld->a[0][3];
+               assert(attrib != 0);
+               assert(bld->mask[0] & TGSI_WRITEMASK_W);
+               if (!bld->oow) {
+                  bld->oow = lp_build_rcp(coeff_bld, w);
+                  lp_build_name(bld->oow, "oow");
                }
+               a = lp_build_mul(coeff_bld, a, bld->oow);
+            }
+#endif
 
-               attrib_name(res, attrib, chan, "");
+            attrib_name(a, attrib, chan, ".a");
+            attrib_name(dadq, attrib, chan, ".dadq");
 
-               bld->attribs[attrib][chan] = res;
-            }
+            bld->a[attrib][chan] = lp_build_alloca(gallivm,
+                                                   LLVMTypeOf(a), "");
+            LLVMBuildStore(builder, a, bld->a[attrib][chan]);
+            bld->dadq[attrib][chan] = dadq;
          }
       }
    }
@@ -217,61 +553,113 @@ attribs_init(struct lp_build_interp_soa_context *bld)
  * This is called when we move from one quad to the next.
  */
 static void
-attribs_update(struct lp_build_interp_soa_context *bld, int quad_index)
+attribs_update(struct lp_build_interp_soa_context *bld,
+               struct gallivm_state *gallivm,
+               LLVMValueRef loop_iter,
+               int start,
+               int end)
 {
+   LLVMBuilderRef builder = gallivm->builder;
+   struct lp_build_context *coeff_bld = &bld->coeff_bld;
    LLVMValueRef oow = NULL;
    unsigned attrib;
    unsigned chan;
 
-   assert(quad_index < 4);
-
-   for(attrib = 0; attrib < bld->num_attribs; ++attrib) {
+   for(attrib = start; attrib < end; ++attrib) {
       const unsigned mask = bld->mask[attrib];
       const unsigned interp = bld->interp[attrib];
+      for(chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
+         if(mask & (1 << chan)) {
+            LLVMValueRef a;
+            if (interp == LP_INTERP_CONSTANT ||
+                interp == LP_INTERP_FACING) {
+               a = LLVMBuildLoad(builder, bld->a[attrib][chan], "");
+            }
+            else if (interp == LP_INTERP_POSITION) {
+               assert(attrib > 0);
+               a = bld->attribs[0][chan];
+            }
+            else {
+               LLVMValueRef dadq;
 
-      if (interp != LP_INTERP_CONSTANT &&
-          interp != LP_INTERP_FACING) {
-         for(chan = 0; chan < NUM_CHANNELS; ++chan) {
-            if(mask & (1 << chan)) {
-               if (interp == LP_INTERP_POSITION) {
-                  assert(attrib > 0);
-                  bld->attribs[attrib][chan] = bld->attribs[0][chan];
+               a = bld->a[attrib][chan];
+
+               /*
+                * Broadcast the attribute value for this quad into all elements
+                */
+
+               {
+                  /* stored as vector load as float */
+                  LLVMTypeRef ptr_type = LLVMPointerType(LLVMFloatTypeInContext(
+                                                            gallivm->context), 0);
+                  LLVMValueRef ptr;
+                  a = LLVMBuildBitCast(builder, a, ptr_type, "");
+                  ptr = LLVMBuildGEP(builder, a, &loop_iter, 1, "");
+                  a = LLVMBuildLoad(builder, ptr, "");
+                  a = lp_build_broadcast_scalar(&bld->coeff_bld, a);
                }
-               else {
-                  LLVMValueRef dadx = bld->dadx[attrib][chan];
-                  LLVMValueRef dady = bld->dady[attrib][chan];
-                  LLVMValueRef res;
 
-                  res = bld->attribs_pre[attrib][chan];
+               /*
+                * Get the derivatives.
+                */
+
+               dadq = bld->dadq[attrib][chan];
 
-                  if (quad_index == 1 || quad_index == 3) {
-                     /* top-right or bottom-right quad */
-                     /* build res = res + dadx + dadx */
-                     res = lp_build_add(&bld->base, res, dadx);
-                     res = lp_build_add(&bld->base, res, dadx);
-                  }
+#if PERSPECTIVE_DIVIDE_PER_QUAD
+               if (interp == LP_INTERP_PERSPECTIVE) {
+                  LLVMValueRef dwdq = bld->dadq[0][3];
 
-                  if (quad_index == 2 || quad_index == 3) {
-                     /* bottom-left or bottom-right quad */
-                     /* build res = res + dady + dady */
-                     res = lp_build_add(&bld->base, res, dady);
-                     res = lp_build_add(&bld->base, res, dady);
+                  if (oow == NULL) {
+                     assert(bld->oow);
+                     oow = LLVMBuildShuffleVector(coeff_bld->builder,
+                                                  bld->oow, coeff_bld->undef,
+                                                  shuffle, "");
                   }
 
-                  if (interp == LP_INTERP_PERSPECTIVE) {
-                     LLVMValueRef w = bld->pos[3];
+                  dadq = lp_build_sub(coeff_bld,
+                                      dadq,
+                                      lp_build_mul(coeff_bld, a, dwdq));
+                  dadq = lp_build_mul(coeff_bld, dadq, oow);
+               }
+#endif
+
+               /*
+                * Add the derivatives
+                */
+
+               a = lp_build_add(coeff_bld, a, dadq);
+
+#if !PERSPECTIVE_DIVIDE_PER_QUAD
+               if (interp == LP_INTERP_PERSPECTIVE) {
+                  if (oow == NULL) {
+                     LLVMValueRef w = bld->attribs[0][3];
                      assert(attrib != 0);
                      assert(bld->mask[0] & TGSI_WRITEMASK_W);
-                     if(!oow)
-                        oow = lp_build_rcp(&bld->base, w);
-                     res = lp_build_mul(&bld->base, res, oow);
+                     oow = lp_build_rcp(coeff_bld, w);
                   }
-
-                  attrib_name(res, attrib, chan, "");
-
-                  bld->attribs[attrib][chan] = res;
+                  a = lp_build_mul(coeff_bld, a, oow);
                }
+#endif
+
+               if (attrib == 0 && chan == 2 && !bld->depth_clamp) {
+                  /* FIXME: Depth values can exceed 1.0, due to the fact that
+                   * setup interpolation coefficients refer to (0,0) which causes
+                   * precision loss. So we must clamp to 1.0 here to avoid artifacts.
+                   * Note though values outside [0,1] are perfectly valid with
+                   * depth clip disabled..
+                   * XXX: If depth clip is disabled but we force depth clamp
+                   * we may get values larger than 1.0 in the fs (but not in
+                   * depth test). Not sure if that's an issue...
+                   * Also, on a similar note, it is not obvious if the depth values
+                   * appearing in fs (with depth clip disabled) should be clamped
+                   * to [0,1], clamped to near/far or not be clamped at all...
+                   */
+                  a = lp_build_min(coeff_bld, a, coeff_bld->one);
+               }
+
+               attrib_name(a, attrib, chan, "");
             }
+            bld->attribs[attrib][chan] = a;
          }
       }
    }
@@ -281,53 +669,18 @@ attribs_update(struct lp_build_interp_soa_context *bld, int quad_index)
 /**
  * Generate the position vectors.
  *
- * Parameter x0, y0 are the integer values with the quad upper left coordinates.
+ * Parameter x0, y0 are the integer values with upper left coordinates.
  */
 static void
 pos_init(struct lp_build_interp_soa_context *bld,
          LLVMValueRef x0,
          LLVMValueRef y0)
 {
-   lp_build_name(x0, "pos.x");
-   lp_build_name(y0, "pos.y");
-
-   bld->attribs[0][0] = x0;
-   bld->attribs[0][1] = y0;
-}
-
-
-/**
- * Update quad position values when moving to the next quad.
- */
-static void
-pos_update(struct lp_build_interp_soa_context *bld, int quad_index)
-{
-   LLVMValueRef x = bld->attribs[0][0];
-   LLVMValueRef y = bld->attribs[0][1];
-   const int xstep = 2, ystep = 2;
-
-   if (quad_index == 1 || quad_index == 3) {
-      /* top-right or bottom-right quad in block */
-      /* build x += xstep */
-      x = lp_build_add(&bld->base, x,
-                       lp_build_const_vec(bld->base.type, xstep));
-   }
-
-   if (quad_index == 2) {
-      /* bottom-left quad in block */
-      /* build y += ystep */
-      y = lp_build_add(&bld->base, y,
-                       lp_build_const_vec(bld->base.type, ystep));
-      /* build x -= xstep */
-      x = lp_build_sub(&bld->base, x,
-                       lp_build_const_vec(bld->base.type, xstep));
-   }
-
-   lp_build_name(x, "pos.x");
-   lp_build_name(y, "pos.y");
+   LLVMBuilderRef builder = bld->coeff_bld.gallivm->builder;
+   struct lp_build_context *coeff_bld = &bld->coeff_bld;
 
-   bld->attribs[0][0] = x;
-   bld->attribs[0][1] = y;
+   bld->x = LLVMBuildSIToFP(builder, x0, coeff_bld->elem_type, "");
+   bld->y = LLVMBuildSIToFP(builder, y0, coeff_bld->elem_type, "");
 }
 
 
@@ -336,8 +689,11 @@ pos_update(struct lp_build_interp_soa_context *bld, int quad_index)
  */
 void
 lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld,
+                         struct gallivm_state *gallivm,
                          unsigned num_inputs,
                          const struct lp_shader_input *inputs,
+                         boolean pixel_center_integer,
+                         boolean depth_clamp,
                          LLVMBuilderRef builder,
                          struct lp_type type,
                          LLVMValueRef a0_ptr,
@@ -346,20 +702,38 @@ lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld,
                          LLVMValueRef x0,
                          LLVMValueRef y0)
 {
+   struct lp_type coeff_type;
+   struct lp_type setup_type;
    unsigned attrib;
    unsigned chan;
 
    memset(bld, 0, sizeof *bld);
 
-   lp_build_context_init(&bld->base, builder, type);
+   memset(&coeff_type, 0, sizeof coeff_type);
+   coeff_type.floating = TRUE;
+   coeff_type.sign = TRUE;
+   coeff_type.width = 32;
+   coeff_type.length = type.length;
+
+   memset(&setup_type, 0, sizeof setup_type);
+   setup_type.floating = TRUE;
+   setup_type.sign = TRUE;
+   setup_type.width = 32;
+   setup_type.length = TGSI_NUM_CHANNELS;
+
+
+   /* XXX: we don't support interpolating into any other types */
+   assert(memcmp(&coeff_type, &type, sizeof coeff_type) == 0);
+
+   lp_build_context_init(&bld->coeff_bld, gallivm, coeff_type);
+   lp_build_context_init(&bld->setup_bld, gallivm, setup_type);
 
    /* For convenience */
    bld->pos = bld->attribs[0];
-   bld->inputs = (const LLVMValueRef (*)[NUM_CHANNELS]) bld->attribs[1];
+   bld->inputs = (const LLVMValueRef (*)[TGSI_NUM_CHANNELS]) bld->attribs[1];
 
    /* Position */
-   bld->num_attribs = 1;
-   bld->mask[0] = TGSI_WRITEMASK_ZW;
+   bld->mask[0] = TGSI_WRITEMASK_XYZW;
    bld->interp[0] = LP_INTERP_LINEAR;
 
    /* Inputs */
@@ -371,29 +745,88 @@ lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld,
 
    /* Ensure all masked out input channels have a valid value */
    for (attrib = 0; attrib < bld->num_attribs; ++attrib) {
-      for (chan = 0; chan < NUM_CHANNELS; ++chan) {
-         bld->attribs[attrib][chan] = bld->base.undef;
+      for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
+         bld->attribs[attrib][chan] = bld->coeff_bld.undef;
       }
    }
 
-   coeffs_init(bld, a0_ptr, dadx_ptr, dady_ptr);
+   if (pixel_center_integer) {
+      bld->pos_offset = 0.0;
+   } else {
+      bld->pos_offset = 0.5;
+   }
+   bld->depth_clamp = depth_clamp;
 
    pos_init(bld, x0, y0);
 
-   attribs_init(bld);
+   /*
+    * Simple method (single step interpolation) may be slower if vector length
+    * is just 4, but the results are different (generally less accurate) with
+    * the other method, so always use more accurate version.
+    */
+   if (1) {
+      bld->simple_interp = TRUE;
+      {
+         /* XXX this should use a global static table */
+         unsigned i;
+         unsigned num_loops = 16 / type.length;
+         LLVMValueRef pixoffx, pixoffy, index;
+         LLVMValueRef ptr;
+
+         bld->xoffset_store = lp_build_array_alloca(gallivm,
+                                                    lp_build_vec_type(gallivm, type),
+                                                    lp_build_const_int32(gallivm, num_loops),
+                                                    "");
+         bld->yoffset_store = lp_build_array_alloca(gallivm,
+                                                    lp_build_vec_type(gallivm, type),
+                                                    lp_build_const_int32(gallivm, num_loops),
+                                                    "");
+         for (i = 0; i < num_loops; i++) {
+            index = lp_build_const_int32(gallivm, i);
+            calc_offsets(&bld->coeff_bld, i*type.length/4, &pixoffx, &pixoffy);
+            ptr = LLVMBuildGEP(builder, bld->xoffset_store, &index, 1, "");
+            LLVMBuildStore(builder, pixoffx, ptr);
+            ptr = LLVMBuildGEP(builder, bld->yoffset_store, &index, 1, "");
+            LLVMBuildStore(builder, pixoffy, ptr);
+         }
+      }
+      coeffs_init_simple(bld, a0_ptr, dadx_ptr, dady_ptr);
+   }
+   else {
+      bld->simple_interp = FALSE;
+      coeffs_init(bld, a0_ptr, dadx_ptr, dady_ptr);
+   }
+
 }
 
 
-/**
+/*
  * Advance the position and inputs to the given quad within the block.
  */
+
 void
-lp_build_interp_soa_update(struct lp_build_interp_soa_context *bld,
-                           int quad_index)
+lp_build_interp_soa_update_inputs_dyn(struct lp_build_interp_soa_context *bld,
+                                      struct gallivm_state *gallivm,
+                                      LLVMValueRef quad_start_index)
 {
-   assert(quad_index < 4);
-
-   pos_update(bld, quad_index);
+   if (bld->simple_interp) {
+      attribs_update_simple(bld, gallivm, quad_start_index, 1, bld->num_attribs);
+   }
+   else {
+      attribs_update(bld, gallivm, quad_start_index, 1, bld->num_attribs);
+   }
+}
 
-   attribs_update(bld, quad_index);
+void
+lp_build_interp_soa_update_pos_dyn(struct lp_build_interp_soa_context *bld,
+                                   struct gallivm_state *gallivm,
+                                   LLVMValueRef quad_start_index)
+{
+   if (bld->simple_interp) {
+      attribs_update_simple(bld, gallivm, quad_start_index, 0, 1);
+   }
+   else {
+      attribs_update(bld, gallivm, quad_start_index, 0, 1);
+   }
 }
+