Merge master and fix conflicts

[mesa.git] / src / gallium / drivers / cell / spu / spu_tri.c
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c

index 1519b8cd7efea27e3074b4075d1f23527772be5b..58be001be4cf32c2131b1cae686d0374c81d471e 100644 (file)
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -29,12 +29,12 @@
   * Triangle rendering within a tile.
   */
  
-#include <transpose_matrix4x4.h>
  #include "pipe/p_compiler.h"
  #include "pipe/p_format.h"
  #include "util/u_math.h"
  #include "spu_colorpack.h"
  #include "spu_main.h"
+#include "spu_shuffle.h"
  #include "spu_texture.h"
  #include "spu_tile.h"
  #include "spu_tri.h"
@@ -43,11 +43,6 @@
  /** Masks are uint[4] vectors with each element being 0 or 0xffffffff */
  typedef vector unsigned int mask_t;
  
-typedef union
-{
-   vector float v;
-   float f[4];
-} float4;
  
  
  /**
@@ -61,7 +56,7 @@ struct vertex_header {
  
  /* XXX fix this */
  #undef CEILF
-#define CEILF(X) ((float) (int) ((X) + 0.99999))
+#define CEILF(X) ((float) (int) ((X) + 0.99999f))
  
  
  #define QUAD_TOP_LEFT     0
@@ -75,14 +70,25 @@ struct vertex_header {
  #define MASK_ALL          0xf
  
  
+#define CHAN0 0
+#define CHAN1 1
+#define CHAN2 2
+#define CHAN3 3
+
+
  #define DEBUG_VERTS 0
  
  /**
   * Triangle edge info
   */
  struct edge {
-   float dx;           /**< X(v1) - X(v0), used only during setup */
-   float dy;           /**< Y(v1) - Y(v0), used only during setup */
+   union {
+      struct {
+         float dx;     /**< X(v1) - X(v0), used only during setup */
+         float dy;     /**< Y(v1) - Y(v0), used only during setup */
+      };
+      vec_float4 ds;    /**< vector accessor for dx and dy */
+   };
     float dxdy;         /**< dx/dy */
     float sx, sy;       /**< first sample point coord */
     int lines;          /**< number of lines on this edge */
@@ -91,9 +97,9 @@ struct edge {
  
  struct interp_coef
  {
-   float4 a0;
-   float4 dadx;
-   float4 dady;
+   vector float a0;
+   vector float dadx;
+   vector float dady;
  };
  
  
@@ -107,28 +113,40 @@ struct setup_stage {
      * turn.  Currently fixed at 4 floats, but should change in time.
      * Codegen will help cope with this.
      */
-   const struct vertex_header *vmax;
-   const struct vertex_header *vmid;
-   const struct vertex_header *vmin;
-   const struct vertex_header *vprovoke;
+   union {
+      struct {
+         const struct vertex_header *vmin;
+         const struct vertex_header *vmid;
+         const struct vertex_header *vmax;
+         const struct vertex_header *vprovoke;
+      };
+      qword vertex_headers;
+   };
  
     struct edge ebot;
     struct edge etop;
     struct edge emaj;
  
-   float oneOverArea;
+   float oneOverArea;  /* XXX maybe make into vector? */
  
     uint facing;
  
     uint tx, ty;  /**< position of current tile (x, y) */
  
-   int cliprect_minx, cliprect_maxx, cliprect_miny, cliprect_maxy;
+   union {
+      struct {
+         int cliprect_minx;
+         int cliprect_miny;
+         int cliprect_maxx;
+         int cliprect_maxy;
+      };
+      qword cliprect;
+   };
  
     struct interp_coef coef[PIPE_MAX_SHADER_INPUTS];
  
     struct {
-      int left[2];   /**< [0] = row0, [1] = row1 */
-      int right[2];
+      vec_int4 quad; /**< [0] = row0, [1] = row1; {left[0],left[1],right[0],right[1]} */
        int y;
        unsigned y_flags;
        unsigned mask;     /**< mask of MASK_BOTTOM/TOP_LEFT/RIGHT bits */
@@ -139,99 +157,97 @@ struct setup_stage {
  static struct setup_stage setup;
  
  
-/**
- * Evaluate attribute coefficients (plane equations) to compute
- * attribute values for the four fragments in a quad.
- * Eg: four colors will be computed (in AoS format).
- */
-static INLINE void
-eval_coeff(uint slot, float x, float y, vector float w, vector float result[4])
+static INLINE vector float
+splatx(vector float v)
  {
-   switch (spu.vertex_info.attrib[slot].interp_mode) {
-   case INTERP_CONSTANT:
-      result[QUAD_TOP_LEFT] =
-      result[QUAD_TOP_RIGHT] =
-      result[QUAD_BOTTOM_LEFT] =
-      result[QUAD_BOTTOM_RIGHT] = setup.coef[slot].a0.v;
-      break;
-   case INTERP_LINEAR:
-      {
-         vector float dadx = setup.coef[slot].dadx.v;
-         vector float dady = setup.coef[slot].dady.v;
-         vector float topLeft =
-            spu_add(setup.coef[slot].a0.v,
-                    spu_add(spu_mul(spu_splats(x), dadx),
-                            spu_mul(spu_splats(y), dady)));
-
-         result[QUAD_TOP_LEFT] = topLeft;
-         result[QUAD_TOP_RIGHT] = spu_add(topLeft, dadx);
-         result[QUAD_BOTTOM_LEFT] = spu_add(topLeft, dady);
-         result[QUAD_BOTTOM_RIGHT] = spu_add(spu_add(topLeft, dadx), dady);
-      }
-      break;
-   case INTERP_PERSPECTIVE:
-      {
-         vector float dadx = setup.coef[slot].dadx.v;
-         vector float dady = setup.coef[slot].dady.v;
-         vector float topLeft =
-            spu_add(setup.coef[slot].a0.v,
-                    spu_add(spu_mul(spu_splats(x), dadx),
-                            spu_mul(spu_splats(y), dady)));
-
-         vector float wInv = spu_re(w);  /* 1.0 / w */
-
-         result[QUAD_TOP_LEFT] = spu_mul(topLeft, wInv);
-         result[QUAD_TOP_RIGHT] = spu_mul(spu_add(topLeft, dadx), wInv);
-         result[QUAD_BOTTOM_LEFT] = spu_mul(spu_add(topLeft, dady), wInv);
-         result[QUAD_BOTTOM_RIGHT] = spu_mul(spu_add(spu_add(topLeft, dadx), dady), wInv);
-      }
-      break;
-   case INTERP_POS:
-   case INTERP_NONE:
-      break;
-   default:
-      ASSERT(0);
-   }
+   return spu_splats(spu_extract(v, CHAN0));
  }
  
-
-/**
- * As above, but return 4 vectors in SOA format.
- * XXX this will all be re-written someday.
- */
-static INLINE void
-eval_coeff_soa(uint slot, float x, float y, vector float w, vector float result[4])
+static INLINE vector float
+splaty(vector float v)
  {
-   eval_coeff(slot, x, y, w, result);
-   _transpose_matrix4x4(result, result);
+   return spu_splats(spu_extract(v, CHAN1));
  }
  
+static INLINE vector float
+splatz(vector float v)
+{
+   return spu_splats(spu_extract(v, CHAN2));
+}
  
-/** Evalute coefficients to get Z for four pixels in a quad */
  static INLINE vector float
-eval_z(float x, float y)
+splatw(vector float v)
  {
-   const uint slot = 0;
-   const float dzdx = setup.coef[slot].dadx.f[2];
-   const float dzdy = setup.coef[slot].dady.f[2];
-   const float topLeft = setup.coef[slot].a0.f[2] + x * dzdx + y * dzdy;
-   const vector float topLeftv = spu_splats(topLeft);
-   const vector float derivs = (vector float) { 0.0, dzdx, dzdy, dzdx + dzdy };
-   return spu_add(topLeftv, derivs);
+   return spu_splats(spu_extract(v, CHAN3));
  }
  
  
-/** Evalute coefficients to get W for four pixels in a quad */
-static INLINE vector float
-eval_w(float x, float y)
+/**
+ * Setup fragment shader inputs by evaluating triangle's vertex
+ * attribute coefficient info.
+ * \param x  quad x pos
+ * \param y  quad y pos
+ * \param fragZ  returns quad Z values
+ * \param fragInputs  returns fragment program inputs
+ * Note: this code could be incorporated into the fragment program
+ * itself to avoid the loop and switch.
+ */
+static void
+eval_inputs(float x, float y, vector float *fragZ, vector float fragInputs[])
  {
-   const uint slot = 0;
-   const float dwdx = setup.coef[slot].dadx.f[3];
-   const float dwdy = setup.coef[slot].dady.f[3];
-   const float topLeft = setup.coef[slot].a0.f[3] + x * dwdx + y * dwdy;
-   const vector float topLeftv = spu_splats(topLeft);
-   const vector float derivs = (vector float) { 0.0, dwdx, dwdy, dwdx + dwdy };
-   return spu_add(topLeftv, derivs);
+   static const vector float deltaX = (const vector float) {0, 1, 0, 1};
+   static const vector float deltaY = (const vector float) {0, 0, 1, 1};
+
+   const uint posSlot = 0;
+   const vector float pos = setup.coef[posSlot].a0;
+   const vector float dposdx = setup.coef[posSlot].dadx;
+   const vector float dposdy = setup.coef[posSlot].dady;
+   const vector float fragX = spu_splats(x) + deltaX;
+   const vector float fragY = spu_splats(y) + deltaY;
+   vector float fragW, wInv;
+   uint i;
+
+   *fragZ = splatz(pos) + fragX * splatz(dposdx) + fragY * splatz(dposdy);
+   fragW =  splatw(pos) + fragX * splatw(dposdx) + fragY * splatw(dposdy);
+   wInv = spu_re(fragW);  /* 1 / w */
+
+   /* loop over fragment program inputs */
+   for (i = 0; i < spu.vertex_info.num_attribs; i++) {
+      uint attr = i + 1;
+      enum interp_mode interp = spu.vertex_info.attrib[attr].interp_mode;
+
+      /* constant term */
+      vector float a0 = setup.coef[attr].a0;
+      vector float r0 = splatx(a0);
+      vector float r1 = splaty(a0);
+      vector float r2 = splatz(a0);
+      vector float r3 = splatw(a0);
+
+      if (interp == INTERP_LINEAR || interp == INTERP_PERSPECTIVE) {
+         /* linear term */
+         vector float dadx = setup.coef[attr].dadx;
+         vector float dady = setup.coef[attr].dady;
+         /* Use SPU intrinsics here to get slightly better code.
+          * originally: r0 += fragX * splatx(dadx) + fragY * splatx(dady);
+          */
+         r0 = spu_madd(fragX, splatx(dadx), spu_madd(fragY, splatx(dady), r0));
+         r1 = spu_madd(fragX, splaty(dadx), spu_madd(fragY, splaty(dady), r1));
+         r2 = spu_madd(fragX, splatz(dadx), spu_madd(fragY, splatz(dady), r2));
+         r3 = spu_madd(fragX, splatw(dadx), spu_madd(fragY, splatw(dady), r3));
+         if (interp == INTERP_PERSPECTIVE) {
+            /* perspective term */
+            r0 *= wInv;
+            r1 *= wInv;
+            r2 *= wInv;
+            r3 *= wInv;
+         }
+      }
+      fragInputs[CHAN0] = r0;
+      fragInputs[CHAN1] = r1;
+      fragInputs[CHAN2] = r2;
+      fragInputs[CHAN3] = r3;
+      fragInputs += 4;
+   }
  }
  
  
@@ -257,35 +273,35 @@ emit_quad( int x, int y, mask_t mask)
            * Run fragment shader, execute per-fragment ops, update fb/tile.
            */
           vector float inputs[4*4], outputs[2*4];
-         vector float fragZ = eval_z((float) x, (float) y);
-         vector float fragW = eval_w((float) x, (float) y);
-
-         /* setup inputs */
-#if 0
-         eval_coeff_soa(1, (float) x, (float) y, fragW, inputs);
-#else
-         uint i;
-         for (i = 0; i < spu.vertex_info.num_attribs; i++) {
-            eval_coeff_soa(i+1, (float) x, (float) y, fragW, inputs + i * 4);
-         }
-#endif
+         vector unsigned int kill_mask;
+         vector float fragZ;
+
+         eval_inputs((float) x, (float) y, &fragZ, inputs);
+
           ASSERT(spu.fragment_program);
           ASSERT(spu.fragment_ops);
  
           /* Execute the current fragment program */
-         spu.fragment_program(inputs, outputs, spu.constants);
+         kill_mask = spu.fragment_program(inputs, outputs, spu.constants);
+
+         mask = spu_andc(mask, kill_mask);
  
           /* Execute per-fragment/quad operations, including:
            * alpha test, z test, stencil test, blend and framebuffer writing.
+          * Note that there are two different fragment operations functions
+          * that can be called, one for front-facing fragments, and one
+          * for back-facing fragments.  (Often the two are the same;
+          * but in some cases, like two-sided stenciling, they can be
+          * very different.)  So choose the correct function depending
+          * on the calculated facing.
            */
-         spu.fragment_ops(ix, iy, &spu.ctile, &spu.ztile,
+         spu.fragment_ops[setup.facing](ix, iy, &spu.ctile, &spu.ztile,
                            fragZ,
                            outputs[0*4+0],
                            outputs[0*4+1],
                            outputs[0*4+2],
                            outputs[0*4+3],
-                          mask,
-                          setup.facing);
+                          mask);
        }
     }
  }
@@ -302,27 +318,6 @@ block(int x)
  }
  
  
-/**
- * Compute mask which indicates which pixels in the 2x2 quad are actually inside
- * the triangle's bounds.
- * The mask is a uint4 vector and each element will be 0 or 0xffffffff.
- */
-static INLINE mask_t
-calculate_mask(int x)
-{
-   /* This is a little tricky.
-    * Use & instead of && to avoid branches.
-    * Use negation to convert true/false to ~0/0 values.
-    */
-   mask_t mask;
-   mask = spu_insert(-((x   >= setup.span.left[0]) & (x   < setup.span.right[0])), mask, 0);
-   mask = spu_insert(-((x+1 >= setup.span.left[0]) & (x+1 < setup.span.right[0])), mask, 1);
-   mask = spu_insert(-((x   >= setup.span.left[1]) & (x   < setup.span.right[1])), mask, 2);
-   mask = spu_insert(-((x+1 >= setup.span.left[1]) & (x+1 < setup.span.right[1])), mask, 3);
-   return mask;
-}
-
-
  /**
   * Render a horizontal span of quads
   */
@@ -330,25 +325,29 @@ static void
  flush_spans(void)
  {
     int minleft, maxright;
-   int x;
+
+   const int l0 = spu_extract(setup.span.quad, 0);
+   const int l1 = spu_extract(setup.span.quad, 1);
+   const int r0 = spu_extract(setup.span.quad, 2);
+   const int r1 = spu_extract(setup.span.quad, 3);
  
     switch (setup.span.y_flags) {
     case 0x3:
        /* both odd and even lines written (both quad rows) */
-      minleft = MIN2(setup.span.left[0], setup.span.left[1]);
-      maxright = MAX2(setup.span.right[0], setup.span.right[1]);
+      minleft = MIN2(l0, l1);
+      maxright = MAX2(r0, r1);
        break;
  
     case 0x1:
        /* only even line written (quad top row) */
-      minleft = setup.span.left[0];
-      maxright = setup.span.right[0];
+      minleft = l0;
+      maxright = r0;
        break;
  
     case 0x2:
        /* only odd line written (quad bottom row) */
-      minleft = setup.span.left[1];
-      maxright = setup.span.right[1];
+      minleft = l1;
+      maxright = r1;
        break;
  
     default:
@@ -371,7 +370,7 @@ flush_spans(void)
     }
     ASSERT(spu.cur_ctile_status != TILE_STATUS_DEFINED);
  
-   if (spu.read_depth) {
+   if (spu.read_depth_stencil) {
        if (spu.cur_ztile_status == TILE_STATUS_GETTING) {
           /* wait for mfc_get() to complete */
           //printf("SPU: %u: waiting for ztile\n", spu.init.id);
@@ -386,17 +385,42 @@ flush_spans(void)
        ASSERT(spu.cur_ztile_status != TILE_STATUS_DEFINED);
     }
  
-   /* XXX this loop could be moved into the above switch cases and
-    * calculate_mask() could be simplified a bit...
-    */
-   for (x = block(minleft); x <= block(maxright); x += 2) {
-      emit_quad( x, setup.span.y, calculate_mask( x ));
+   /* XXX this loop could be moved into the above switch cases... */
+   
+   /* Setup for mask calculation */
+   const vec_int4 quad_LlRr = setup.span.quad;
+   const vec_int4 quad_RrLl = spu_rlqwbyte(quad_LlRr, 8);
+   const vec_int4 quad_LLll = spu_shuffle(quad_LlRr, quad_LlRr, SHUFFLE4(A,A,B,B));
+   const vec_int4 quad_RRrr = spu_shuffle(quad_RrLl, quad_RrLl, SHUFFLE4(A,A,B,B));
+
+   const vec_int4 twos = spu_splats(2);
+
+   const int x = block(minleft);
+   vec_int4 xs = {x, x+1, x, x+1};
+
+   for (; spu_extract(xs, 0) <= block(maxright); xs += twos) {
+      /**
+       * Computes mask to indicate which pixels in the 2x2 quad are actually
+       * inside the triangle's bounds.
+       */
+      
+      /* Calculate ({x,x+1,x,x+1} >= {l[0],l[0],l[1],l[1]}) */
+      const mask_t gt_LLll_xs = spu_cmpgt(quad_LLll, xs);
+      const mask_t gte_xs_LLll = spu_nand(gt_LLll_xs, gt_LLll_xs); 
+      
+      /* Calculate ({r[0],r[0],r[1],r[1]} > {x,x+1,x,x+1}) */
+      const mask_t gt_RRrr_xs = spu_cmpgt(quad_RRrr, xs);
+
+      /* Combine results to create mask */
+      const mask_t mask = spu_and(gte_xs_LLll, gt_RRrr_xs);
+
+      emit_quad(spu_extract(xs, 0), setup.span.y, mask);
     }
  
     setup.span.y = 0;
     setup.span.y_flags = 0;
-   setup.span.right[0] = 0;
-   setup.span.right[1] = 0;
+   /* Zero right elements */
+   setup.span.quad = spu_shuffle(setup.span.quad, setup.span.quad, SHUFFLE4(A,B,0,0));
  }
  
  
@@ -416,12 +440,53 @@ print_vertex(const struct vertex_header *v)
  }
  #endif
  
+/* Returns the minimum of each slot of two vec_float4s as qwords.
+ * i.e. return[n] = min(q0[n],q1[n]);
+ */
+static qword
+minfq(qword q0, qword q1)
+{
+   const qword q0q1m = si_fcgt(q0, q1);
+   return si_selb(q0, q1, q0q1m);
+}
+
+/* Returns the minimum of each slot of three vec_float4s as qwords.
+ * i.e. return[n] = min(q0[n],q1[n],q2[n]);
+ */
+static qword
+min3fq(qword q0, qword q1, qword q2)
+{
+   return minfq(minfq(q0, q1), q2);
+}
+
+/* Returns the maximum of each slot of two vec_float4s as qwords.
+ * i.e. return[n] = min(q0[n],q1[n],q2[n]);
+ */
+static qword
+maxfq(qword q0, qword q1) {
+   const qword q0q1m = si_fcgt(q0, q1);
+   return si_selb(q1, q0, q0q1m);
+}
+
+/* Returns the maximum of each slot of three vec_float4s as qwords.
+ * i.e. return[n] = min(q0[n],q1[n],q2[n]);
+ */
+static qword
+max3fq(qword q0, qword q1, qword q2) {
+   return maxfq(maxfq(q0, q1), q2);
+}
  
+/**
+ * Sort vertices from top to bottom.
+ * Compute area and determine front vs. back facing.
+ * Do coarse clip test against tile bounds
+ * \return  FALSE if tri is totally outside tile, TRUE otherwise
+ */
  static boolean
-setup_sort_vertices(const struct vertex_header *v0,
-                    const struct vertex_header *v1,
-                    const struct vertex_header *v2)
+setup_sort_vertices(const qword vs)
  {
+   float area, sign;
+
  #if DEBUG_VERTS
     if (spu.init.id==0) {
        fprintf(stderr, "SPU %u: Triangle:\n", spu.init.id);
@@ -431,104 +496,79 @@ setup_sort_vertices(const struct vertex_header *v0,
     }
  #endif
  
-   setup.vprovoke = v2;
-
-   /* determine bottom to top order of vertices */
     {
-      float y0 = spu_extract(v0->data[0], 1);
-      float y1 = spu_extract(v1->data[0], 1);
-      float y2 = spu_extract(v2->data[0], 1);
-      if (y0 <= y1) {
-        if (y1 <= y2) {
-           /* y0<=y1<=y2 */
-           setup.vmin = v0;   
-           setup.vmid = v1;   
-           setup.vmax = v2;
-        }
-        else if (y2 <= y0) {
-           /* y2<=y0<=y1 */
-           setup.vmin = v2;   
-           setup.vmid = v0;   
-           setup.vmax = v1;   
-        }
-        else {
-           /* y0<=y2<=y1 */
-           setup.vmin = v0;   
-           setup.vmid = v2;   
-           setup.vmax = v1;  
-        }
-      }
-      else {
-        if (y0 <= y2) {
-           /* y1<=y0<=y2 */
-           setup.vmin = v1;   
-           setup.vmid = v0;   
-           setup.vmax = v2;  
-        }
-        else if (y2 <= y1) {
-           /* y2<=y1<=y0 */
-           setup.vmin = v2;   
-           setup.vmid = v1;   
-           setup.vmax = v0;  
-        }
-        else {
-           /* y1<=y2<=y0 */
-           setup.vmin = v1;   
-           setup.vmid = v2;   
-           setup.vmax = v0;
-        }
-      }
+      /* Load the float values for various processing... */
+      const qword f0 = (qword)(((const struct vertex_header*)si_to_ptr(vs))->data[0]);
+      const qword f1 = (qword)(((const struct vertex_header*)si_to_ptr(si_rotqbyi(vs, 4)))->data[0]);
+      const qword f2 = (qword)(((const struct vertex_header*)si_to_ptr(si_rotqbyi(vs, 8)))->data[0]);
+
+      /* Check if triangle is completely outside the tile bounds
+       * Find the min and max x and y positions of the three poits */
+      const qword minf = min3fq(f0, f1, f2);
+      const qword maxf = max3fq(f0, f1, f2);
+
+      /* Compare min and max against cliprect vals */
+      const qword maxsmins = si_shufb(maxf, minf, SHUFB4(A,B,a,b));
+      const qword outside = si_fcgt(maxsmins, si_csflt(setup.cliprect, 0));
+
+      /* Use a little magic to work out of the tri is visible or not */
+      if(si_to_uint(si_xori(si_gb(outside), 0xc))) return FALSE;
+
+      /* determine bottom to top order of vertices */
+      /* A table of shuffle patterns for putting vertex_header pointers into
+         correct order.  Quite magical. */
+      const qword sort_order_patterns[] = {
+         SHUFB4(A,B,C,C),
+         SHUFB4(C,A,B,C),
+         SHUFB4(A,C,B,C),
+         SHUFB4(B,C,A,C),
+         SHUFB4(B,A,C,C),
+         SHUFB4(C,B,A,C) };
+
+      /* Collate y values into two vectors for comparison.
+         Using only one shuffle constant! ;) */
+      const qword y_02_ = si_shufb(f0, f2, SHUFB4(0,B,b,C));
+      const qword y_10_ = si_shufb(f1, f0, SHUFB4(0,B,b,C));
+      const qword y_012 = si_shufb(y_02_, f1, SHUFB4(0,B,b,C));
+      const qword y_120 = si_shufb(y_10_, f2, SHUFB4(0,B,b,C));
+
+      /* Perform comparison: {y0,y1,y2} > {y1,y2,y0} */
+      const qword compare = si_fcgt(y_012, y_120);
+      /* Compress the result of the comparison into 4 bits */
+      const qword gather = si_gb(compare);
+      /* Subtract one to attain the index into the LUT.  Magical. */
+      const unsigned int index = si_to_uint(gather) - 1;
+
+      /* Load the appropriate pattern and construct the desired vector. */
+      setup.vertex_headers = si_shufb(vs, vs, sort_order_patterns[index]);
+
+      /* Using the result of the comparison, set sign.
+         Very magical. */
+      sign = ((si_to_uint(si_cntb(gather)) == 2) ? 1.0f : -1.0f);
     }
  
-   /* Check if triangle is completely outside the tile bounds */
-   if (spu_extract(setup.vmin->data[0], 1) > setup.cliprect_maxy)
-      return FALSE;
-   if (spu_extract(setup.vmax->data[0], 1) < setup.cliprect_miny)
-      return FALSE;
-   if (spu_extract(setup.vmin->data[0], 0) < setup.cliprect_minx &&
-       spu_extract(setup.vmid->data[0], 0) < setup.cliprect_minx &&
-       spu_extract(setup.vmax->data[0], 0) < setup.cliprect_minx)
-      return FALSE;
-   if (spu_extract(setup.vmin->data[0], 0) > setup.cliprect_maxx &&
-       spu_extract(setup.vmid->data[0], 0) > setup.cliprect_maxx &&
-       spu_extract(setup.vmax->data[0], 0) > setup.cliprect_maxx)
-      return FALSE;
-
-   setup.ebot.dx = spu_extract(setup.vmid->data[0], 0) - spu_extract(setup.vmin->data[0], 0);
-   setup.ebot.dy = spu_extract(setup.vmid->data[0], 1) - spu_extract(setup.vmin->data[0], 1);
-   setup.emaj.dx = spu_extract(setup.vmax->data[0], 0) - spu_extract(setup.vmin->data[0], 0);
-   setup.emaj.dy = spu_extract(setup.vmax->data[0], 1) - spu_extract(setup.vmin->data[0], 1);
-   setup.etop.dx = spu_extract(setup.vmax->data[0], 0) - spu_extract(setup.vmid->data[0], 0);
-   setup.etop.dy = spu_extract(setup.vmax->data[0], 1) - spu_extract(setup.vmid->data[0], 1);
+   setup.ebot.ds = spu_sub(setup.vmid->data[0], setup.vmin->data[0]);
+   setup.emaj.ds = spu_sub(setup.vmax->data[0], setup.vmin->data[0]);
+   setup.etop.ds = spu_sub(setup.vmax->data[0], setup.vmid->data[0]);
  
     /*
      * Compute triangle's area.  Use 1/area to compute partial
      * derivatives of attributes later.
-    *
-    * The area will be the same as prim->det, but the sign may be
-    * different depending on how the vertices get sorted above.
-    *
-    * To determine whether the primitive is front or back facing we
-    * use the prim->det value because its sign is correct.
      */
-   {
-      const float area = (setup.emaj.dx * setup.ebot.dy -
-                          setup.ebot.dx * setup.emaj.dy);
-
-      setup.oneOverArea = 1.0f / area;
-      /*
-      _mesa_printf("%s one-over-area %f  area %f  det %f\n",
-                   __FUNCTION__, setup.oneOverArea, area, prim->det );
-      */
-   }
+   area = setup.emaj.dx * setup.ebot.dy - setup.ebot.dx * setup.emaj.dy;
+
+   setup.oneOverArea = 1.0f / area;
  
-#if 0
-   /* We need to know if this is a front or back-facing triangle for:
-    *  - the GLSL gl_FrontFacing fragment attribute (bool)
-    *  - two-sided stencil test
+   /* The product of area * sign indicates front/back orientation (0/1).
+    * Just in case someone gets the bright idea of switching the front
+    * and back constants without noticing that we're assuming their
+    * values in this operation, also assert that the values are
+    * what we think they are.
      */
-   setup.quad.facing = (prim->det > 0.0) ^ (setup.softpipe->rasterizer->front_winding == PIPE_WINDING_CW);
-#endif
+   ASSERT(CELL_FACING_FRONT == 0);
+   ASSERT(CELL_FACING_BACK == 1);
+   setup.facing = (area * sign > 0.0f)
+      ^ (spu.rasterizer.front_winding == PIPE_WINDING_CW);
  
     return TRUE;
  }
@@ -543,9 +583,9 @@ setup_sort_vertices(const struct vertex_header *v0,
  static INLINE void
  const_coeff4(uint slot)
  {
-   setup.coef[slot].dadx.v = (vector float) {0.0, 0.0, 0.0, 0.0};
-   setup.coef[slot].dady.v = (vector float) {0.0, 0.0, 0.0, 0.0};
-   setup.coef[slot].a0.v = setup.vprovoke->data[slot];
+   setup.coef[slot].dadx = (vector float) {0.0, 0.0, 0.0, 0.0};
+   setup.coef[slot].dady = (vector float) {0.0, 0.0, 0.0, 0.0};
+   setup.coef[slot].a0 = setup.vprovoke->data[slot];
  }
  
  
@@ -569,13 +609,13 @@ tri_linear_coeff4(uint slot)
     vector float b = spu_sub(spu_mul(spu_splats(setup.emaj.dx), botda),
                              spu_mul(majda, spu_splats(setup.ebot.dx)));
  
-   setup.coef[slot].dadx.v = spu_mul(a, spu_splats(setup.oneOverArea));
-   setup.coef[slot].dady.v = spu_mul(b, spu_splats(setup.oneOverArea));
+   setup.coef[slot].dadx = spu_mul(a, spu_splats(setup.oneOverArea));
+   setup.coef[slot].dady = spu_mul(b, spu_splats(setup.oneOverArea));
  
-   vector float tempx = spu_mul(setup.coef[slot].dadx.v, xxxx);
-   vector float tempy = spu_mul(setup.coef[slot].dady.v, yyyy);
+   vector float tempx = spu_mul(setup.coef[slot].dadx, xxxx);
+   vector float tempy = spu_mul(setup.coef[slot].dady, yyyy);
                           
-   setup.coef[slot].a0.v = spu_sub(vmin_d, spu_add(tempx, tempy));
+   setup.coef[slot].a0 = spu_sub(vmin_d, spu_add(tempx, tempy));
  }
  
  
@@ -613,13 +653,13 @@ tri_persp_coeff4(uint slot)
     vector float b = spu_sub(spu_mul(spu_splats(setup.emaj.dx), botda),
                              spu_mul(majda, spu_splats(setup.ebot.dx)));
  
-   setup.coef[slot].dadx.v = spu_mul(a, spu_splats(setup.oneOverArea));
-   setup.coef[slot].dady.v = spu_mul(b, spu_splats(setup.oneOverArea));
+   setup.coef[slot].dadx = spu_mul(a, spu_splats(setup.oneOverArea));
+   setup.coef[slot].dady = spu_mul(b, spu_splats(setup.oneOverArea));
  
-   vector float tempx = spu_mul(setup.coef[slot].dadx.v, xxxx);
-   vector float tempy = spu_mul(setup.coef[slot].dady.v, yyyy);
+   vector float tempx = spu_mul(setup.coef[slot].dadx, xxxx);
+   vector float tempy = spu_mul(setup.coef[slot].dady, yyyy);
                           
-   setup.coef[slot].a0.v = spu_sub(vmin_d, spu_add(tempx, tempy));
+   setup.coef[slot].a0 = spu_sub(vmin_d, spu_add(tempx, tempy));
  }
  
  
@@ -739,9 +779,11 @@ subtriangle(struct edge *eleft, struct edge *eright, unsigned lines)
              setup.span.y = block(_y);
           }
  
-         setup.span.left[_y&1] = left;
-         setup.span.right[_y&1] = right;
-         setup.span.y_flags |= 1<<(_y&1);
+         int offset = _y&1;
+         vec_int4 quad_LlRr = {left, left, right, right};
+         /* Store left and right in 0 or 1 row of quad based on offset */
+         setup.span.quad = spu_sel(quad_LlRr, setup.span.quad, spu_maskw(5<<offset));
+         setup.span.y_flags |= 1<<offset;
        }
     }
  
@@ -755,46 +797,24 @@ subtriangle(struct edge *eleft, struct edge *eright, unsigned lines)
  }
  
  
-static float
-determinant(const float *v0, const float *v1, const float *v2)
-{
-   /* edge vectors e = v0 - v2, f = v1 - v2 */
-   const float ex = v0[0] - v2[0];
-   const float ey = v0[1] - v2[1];
-   const float fx = v1[0] - v2[0];
-   const float fy = v1[1] - v2[1];
-
-   /* det = cross(e,f).z */
-   return ex * fy - ey * fx;
-}
-
-
  /**
   * Draw triangle into tile at (tx, ty) (tile coords)
   * The tile data should have already been fetched.
   */
  boolean
-tri_draw(const float *v0, const float *v1, const float *v2,
+tri_draw(const qword vs,
           uint tx, uint ty)
  {
     setup.tx = tx;
     setup.ty = ty;
  
     /* set clipping bounds to tile bounds */
-   setup.cliprect_minx = tx * TILE_SIZE;
-   setup.cliprect_miny = ty * TILE_SIZE;
-   setup.cliprect_maxx = (tx + 1) * TILE_SIZE;
-   setup.cliprect_maxy = (ty + 1) * TILE_SIZE;
-
-   /* Before we sort vertices, determine the facing of the triangle,
-    * which will be needed for front/back-face stencil application
-    */
-   float det = determinant(v0, v1, v2);
-   setup.facing = (det > 0.0) ^ (spu.rasterizer.front_winding == PIPE_WINDING_CW);
+   const qword clipbase = (qword)((vec_uint4){tx, ty});
+   const qword clipmin = si_mpyui(clipbase, TILE_SIZE);
+   const qword clipmax = si_ai(clipmin, TILE_SIZE);
+   setup.cliprect = si_shufb(clipmin, clipmax, SHUFB4(A,B,a,b));
  
-   if (!setup_sort_vertices((struct vertex_header *) v0,
-                            (struct vertex_header *) v1,
-                            (struct vertex_header *) v2)) {
+   if(!setup_sort_vertices(vs)) {
        return FALSE; /* totally clipped */
     }
  
@@ -803,8 +823,8 @@ tri_draw(const float *v0, const float *v1, const float *v2,
  
     setup.span.y = 0;
     setup.span.y_flags = 0;
-   setup.span.right[0] = 0;
-   setup.span.right[1] = 0;
+   /* Zero right elements */
+   setup.span.quad = spu_shuffle(setup.span.quad, setup.span.quad, SHUFFLE4(A,B,0,0));
  
     if (setup.oneOverArea < 0.0) {
        /* emaj on left */