Merge branch 'gallium-front-ccw'

[mesa.git] / src / gallium / drivers / llvmpipe / lp_setup_tri.c
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c

index efd91124a097afbf404a93cd05b13fc420c898a2..306cb6e27d2895dcb8a667ae332a4441549fbd54 100644 (file)
--- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
@@ -29,69 +29,71 @@
   * Binning code for triangles
   */
  
-#include "lp_setup.h"
-#include "lp_state.h"
  #include "util/u_math.h"
  #include "util/u_memory.h"
+#include "lp_perf.h"
+#include "lp_setup_context.h"
+#include "lp_rast.h"
+
+#define NUM_CHANNELS 4
  
  
  /**
   * Compute a0 for a constant-valued coefficient (GL_FLAT shading).
   */
-static void constant_coef( struct lp_rast_triangle *tri,
-                          const float (*v3)[4],
-                          unsigned vert_attr,
-                          unsigned i )
+static void constant_coef( struct lp_setup_context *setup,
+                           struct lp_rast_triangle *tri,
+                           unsigned slot,
+                          const float value,
+                           unsigned i )
  {
-   tri->inputs.a0[i] = v3[vert_attr][i];
-   tri->inputs.dadx[i] = 0;
-   tri->inputs.dady[i] = 0;
+   tri->inputs.a0[slot][i] = value;
+   tri->inputs.dadx[slot][i] = 0.0f;
+   tri->inputs.dady[slot][i] = 0.0f;
  }
  
+
  /**
   * Compute a0, dadx and dady for a linearly interpolated coefficient,
   * for a triangle.
   */
-static void linear_coef( struct lp_rast_triangle *tri,
-                         unsigned input,
-                        const float (*v1)[4],
-                        const float (*v2)[4],
-                        const float (*v3)[4],
-                        unsigned vert_attr)
+static void linear_coef( struct lp_setup_context *setup,
+                         struct lp_rast_triangle *tri,
+                         float oneoverarea,
+                         unsigned slot,
+                         const float (*v1)[4],
+                         const float (*v2)[4],
+                         const float (*v3)[4],
+                         unsigned vert_attr,
+                         unsigned i)
  {
-   unsigned i;
-
-   input *= 4;
-
-   for (i = 0; i < NUM_CHANNELS; i++) {
-      float a1 = v1[vert_attr][i];
-      float a2 = v2[vert_attr][i];
-      float a3 = v3[vert_attr][i];
-
-      float da12 = a1 - a2;
-      float da31 = a3 - a1;
-      float dadx = (da12 * tri->dy31 - tri->dy12 * da31) * tri->oneoverarea;
-      float dady = (da31 * tri->dx12 - tri->dx31 * da12) * tri->oneoverarea;
-
-      tri->inputs.dadx[input+i] = dadx;
-      tri->inputs.dady[input+i] = dady;
-
-      /* calculate a0 as the value which would be sampled for the
-       * fragment at (0,0), taking into account that we want to sample at
-       * pixel centers, in other words (0.5, 0.5).
-       *
-       * this is neat but unfortunately not a good way to do things for
-       * triangles with very large values of dadx or dady as it will
-       * result in the subtraction and re-addition from a0 of a very
-       * large number, which means we'll end up loosing a lot of the
-       * fractional bits and precision from a0.  the way to fix this is
-       * to define a0 as the sample at a pixel center somewhere near vmin
-       * instead - i'll switch to this later.
-       */
-      tri->inputs.a0[input+i] = (v1[vert_attr][i] -
-                                 (dadx * (v1[0][0] - 0.5f) +
-                                  dady * (v1[0][1] - 0.5f)));
-   }
+   float a1 = v1[vert_attr][i];
+   float a2 = v2[vert_attr][i];
+   float a3 = v3[vert_attr][i];
+
+   float da12 = a1 - a2;
+   float da31 = a3 - a1;
+   float dadx = (da12 * tri->dy31 - tri->dy12 * da31) * oneoverarea;
+   float dady = (da31 * tri->dx12 - tri->dx31 * da12) * oneoverarea;
+
+   tri->inputs.dadx[slot][i] = dadx;
+   tri->inputs.dady[slot][i] = dady;
+
+   /* calculate a0 as the value which would be sampled for the
+    * fragment at (0,0), taking into account that we want to sample at
+    * pixel centers, in other words (0.5, 0.5).
+    *
+    * this is neat but unfortunately not a good way to do things for
+    * triangles with very large values of dadx or dady as it will
+    * result in the subtraction and re-addition from a0 of a very
+    * large number, which means we'll end up losing a lot of the
+    * fractional bits and precision from a0.  the way to fix this is
+    * to define a0 as the sample at a pixel center somewhere near vmin
+    * instead - i'll switch to this later.
+    */
+   tri->inputs.a0[slot][i] = (a1 -
+                              (dadx * (v1[0][0] - setup->pixel_offset) +
+                               dady * (v1[0][1] - setup->pixel_offset)));
  }
  
  
@@ -103,113 +105,134 @@ static void linear_coef( struct lp_rast_triangle *tri,
   * Later, when we compute the value at a particular fragment position we'll
   * divide the interpolated value by the interpolated W at that fragment.
   */
-static void perspective_coef( struct lp_rast_triangle *tri,
+static void perspective_coef( struct lp_setup_context *setup,
+                              struct lp_rast_triangle *tri,
+                              float oneoverarea,
+                              unsigned slot,
                               const float (*v1)[4],
                               const float (*v2)[4],
                               const float (*v3)[4],
                               unsigned vert_attr,
-                             unsigned i)
+                              unsigned i)
  {
-   unsigned i;
-
-   input *= 4;
-
-   for (i = 0; i < NUM_CHANNELS; i++) {
-      /* premultiply by 1/w  (v[0][3] is always 1/w):
-       */
-      float a1 = v1[vert_attr][i] * v1[0][3];
-      float a2 = v2[vert_attr][i] * v2[0][3];
-      float a3 = v3[vert_attr][i] * v3[0][3];
-      float da12 = a1 - a2;
-      float da31 = a3 - a1;
-      float dadx = (da12 * tri->dy31 - tri->dy12 * da31) * tri->oneoverarea;
-      float dady = (da31 * tri->dx12 - tri->dx31 * da12) * tri->oneoverarea;
-
-
-      tri->inputs.dadx[input+i] = dadx;
-      tri->inputs.dady[input+i] = dady;
-      tri->inputs.a0[input+i] = (a1 -
-                           (dadx * (v1[0][0] - 0.5f) +
-                            dady * (v1[0][1] - 0.5f)));
-   }
+   /* premultiply by 1/w  (v[0][3] is always 1/w):
+    */
+   float a1 = v1[vert_attr][i] * v1[0][3];
+   float a2 = v2[vert_attr][i] * v2[0][3];
+   float a3 = v3[vert_attr][i] * v3[0][3];
+   float da12 = a1 - a2;
+   float da31 = a3 - a1;
+   float dadx = (da12 * tri->dy31 - tri->dy12 * da31) * oneoverarea;
+   float dady = (da31 * tri->dx12 - tri->dx31 * da12) * oneoverarea;
+
+   tri->inputs.dadx[slot][i] = dadx;
+   tri->inputs.dady[slot][i] = dady;
+   tri->inputs.a0[slot][i] = (a1 -
+                              (dadx * (v1[0][0] - setup->pixel_offset) +
+                               dady * (v1[0][1] - setup->pixel_offset)));
  }
  
  
  /**
   * Special coefficient setup for gl_FragCoord.
- * X and Y are trivial, though Y has to be inverted for OpenGL.
+ * X and Y are trivial
   * Z and W are copied from position_coef which should have already been computed.
   * We could do a bit less work if we'd examine gl_FragCoord's swizzle mask.
   */
  static void
-setup_fragcoord_coef(struct lp_rast_triangle *tri, unsigned slot)
+setup_fragcoord_coef(struct lp_setup_context *setup,
+                     struct lp_rast_triangle *tri,
+                     float oneoverarea,
+                     unsigned slot,
+                     const float (*v1)[4],
+                     const float (*v2)[4],
+                     const float (*v3)[4])
  {
-   slot *= 4;
-
     /*X*/
-   tri->inputs.a0[slot+0] = 0.0;
-   tri->inputs.dadx[slot+0] = 1.0;
-   tri->inputs.dady[slot+0] = 0.0;
+   tri->inputs.a0[slot][0] = 0.0;
+   tri->inputs.dadx[slot][0] = 1.0;
+   tri->inputs.dady[slot][0] = 0.0;
     /*Y*/
-   tri->inputs.a0[slot+1] = 0.0;
-   tri->inputs.dadx[slot+1] = 0.0;
-   tri->inputs.dady[slot+1] = 1.0;
+   tri->inputs.a0[slot][1] = 0.0;
+   tri->inputs.dadx[slot][1] = 0.0;
+   tri->inputs.dady[slot][1] = 1.0;
     /*Z*/
-   tri->inputs.a0[slot+2] = tri->inputs.a0[2];
-   tri->inputs.dadx[slot+2] = tri->inputs.dadx[2];
-   tri->inputs.dady[slot+2] = tri->inputs.dady[2];
+   linear_coef(setup, tri, oneoverarea, slot, v1, v2, v3, 0, 2);
     /*W*/
-   tri->inputs.a0[slot+3] = tri->inputs.a0[3];
-   tri->inputs.dadx[slot+3] = tri->inputs.dadx[3];
-   tri->inputs.dady[slot+3] = tri->inputs.dady[3];
+   linear_coef(setup, tri, oneoverarea, slot, v1, v2, v3, 0, 3);
  }
  
  
+/**
+ * Setup the fragment input attribute with the front-facing value: channel 0 of
+ * \p slot gets +1.0 (front) or -1.0 (back); channels 1..3 are unused and zeroed.
+ * \param tri  triangle whose inputs.a0/dadx/dady arrays are written via constant_coef()
+ * \param frontface  is the triangle front facing?  Any nonzero value means TRUE.
+ */
+static void setup_facing_coef( struct lp_setup_context *setup, struct lp_rast_triangle *tri,
+                               unsigned slot, boolean frontface )
+{
+   /* select instead of computing 2*b-1, so a boolean other than 0/1 still gives +/-1.0 */
+   constant_coef( setup, tri, slot, frontface ? 1.0f : -1.0f, 0 );
+   constant_coef( setup, tri, slot, 0.0f, 1 ); /* wasted */
+   constant_coef( setup, tri, slot, 0.0f, 2 ); /* wasted */
+   constant_coef( setup, tri, slot, 0.0f, 3 ); /* wasted */
+}
+
  
  /**
   * Compute the tri->coef[] array dadx, dady, a0 values.
   */
-static void setup_tri_coefficients( struct setup_context *setup,
+static void setup_tri_coefficients( struct lp_setup_context *setup,
                                     struct lp_rast_triangle *tri,
+                                    float oneoverarea,
                                     const float (*v1)[4],
                                     const float (*v2)[4],
                                     const float (*v3)[4],
-                                   boolean frontface )
+                                   boolean frontface)
  {
-   unsigned input;
+   unsigned slot;
  
-   /* z and w are done by linear interpolation:
+   /* The internal position input is in slot zero:
      */
-   setup_fragcoord_coef(tri, 0);
-            linear_coef(tri, input, v1, v2, v3, vert_attr, i);
+   setup_fragcoord_coef(setup, tri, oneoverarea, 0, v1, v2, v3);
  
-   /* setup interpolation for all the remaining attrbutes:
+   /* setup interpolation for all the remaining attributes:
      */
-   for (input = 0; input < setup->fs.nr_inputs; input++) {
-      unsigned vert_attr = setup->fs.input[input].src_index;
+   for (slot = 0; slot < setup->fs.nr_inputs; slot++) {
+      unsigned vert_attr = setup->fs.input[slot].src_index;
        unsigned i;
  
-      switch (setup->fs.input[input].interp_mode) {
+      switch (setup->fs.input[slot].interp) {
        case LP_INTERP_CONSTANT:
-         constant_coef(tri, input, v3, vert_attr, i);
+         if (setup->flatshade_first) {
+            for (i = 0; i < NUM_CHANNELS; i++)
+               constant_coef(setup, tri, slot+1, v1[vert_attr][i], i);
+         }
+         else {
+            for (i = 0; i < NUM_CHANNELS; i++)
+               constant_coef(setup, tri, slot+1, v3[vert_attr][i], i);
+         }
           break;
  
        case LP_INTERP_LINEAR:
-         linear_coef(tri, input, v1, v2, v3, vert_attr, i);
+         for (i = 0; i < NUM_CHANNELS; i++)
+            linear_coef(setup, tri, oneoverarea, slot+1, v1, v2, v3, vert_attr, i);
           break;
  
        case LP_INTERP_PERSPECTIVE:
-            perspective_coef(tri, input, v1, v2, v3, vert_attr, i);
+         for (i = 0; i < NUM_CHANNELS; i++)
+            perspective_coef(setup, tri, oneoverarea, slot+1, v1, v2, v3, vert_attr, i);
           break;
  
-      case LP_INTERP_POS:
-         setup_fragcoord_coef(tri, input);
+      case LP_INTERP_POSITION:
+         /* XXX: fix me - duplicates the values in slot zero.
+          */
+         setup_fragcoord_coef(setup, tri, oneoverarea, slot+1, v1, v2, v3);
           break;
  
        case LP_INTERP_FACING:
-         tri->inputs.a0[input*4+0] = 1.0f - frontface;
-         tri->inputs.dadx[input*4+0] = 0.0;
-         tri->da[input].dady[0] = 0.0;
+         setup_facing_coef(setup, tri, slot+1, frontface);
           break;
  
        default:
@@ -220,54 +243,113 @@ static void setup_tri_coefficients( struct setup_context *setup,
  
  
  
-/* XXX: do this by add/subtracting a large floating point number:
- */
-static inline float subpixel_snap( float a )
+static INLINE int subpixel_snap( float a )
  {
-   int i = a * 16;
-   return (float)i * (1.0/16);
+   return util_iround(FIXED_ONE * a - (FIXED_ONE / 2));
  }
  
  
  
+/**
+ * Alloc space for a new triangle plus the input.a0/dadx/dady arrays
+ * immediately after it.
+ * The memory is allocated from the per-scene pool, not per-tile.
+ * \param tri_size  returns number of bytes allocated
+ * \param nr_inputs  number of fragment shader inputs
+ * \return pointer to triangle space
+ */
+static INLINE struct lp_rast_triangle *
+alloc_triangle(struct lp_scene *scene, unsigned nr_inputs, unsigned *tri_size)
+{
+   unsigned input_array_sz = NUM_CHANNELS * (nr_inputs + 1) * sizeof(float);
+   struct lp_rast_triangle *tri;
+   unsigned bytes;
+   char *inputs;
+
+   assert(sizeof(*tri) % 16 == 0);
+
+   bytes = sizeof(*tri) + (3 * input_array_sz);
  
+   tri = lp_scene_alloc_aligned( scene, bytes, 16 );
  
-/* to avoid having to allocate power-of-four, square render targets,
- * end up having a specialized version of the above that runs only at
- * the topmost level.
- *
- * at the topmost level there may be an arbitary number of steps on
- * either dimension, so this loop needs to be either separately
- * code-generated and unrolled for each render target size, or kept as
- * generic looping code:
+   inputs = (char *) (tri + 1);
+   tri->inputs.a0   = (float (*)[4]) inputs;
+   tri->inputs.dadx = (float (*)[4]) (inputs + input_array_sz);
+   tri->inputs.dady = (float (*)[4]) (inputs + 2 * input_array_sz);
+
+   *tri_size = bytes;
+
+   return tri;
+}
+
+
+/**
+ * Print triangle vertex attribs (for debug).
   */
+static void
+print_triangle(struct lp_setup_context *setup,
+               const float (*v1)[4],
+               const float (*v2)[4],
+               const float (*v3)[4])
+{
+   uint i;   /* unsigned index, so it is printed with %u below */
+   /* NOTE(review): only attribs [0, fs.nr_inputs) are dumped -- confirm that covers every vertex attribute. */
+   debug_printf("llvmpipe triangle\n");
+   for (i = 0; i < setup->fs.nr_inputs; i++) {
+      debug_printf("  v1[%u]:  %f %f %f %f\n", i,
+                   v1[i][0], v1[i][1], v1[i][2], v1[i][3]);
+   }
+   for (i = 0; i < setup->fs.nr_inputs; i++) {
+      debug_printf("  v2[%u]:  %f %f %f %f\n", i,
+                   v2[i][0], v2[i][1], v2[i][2], v2[i][3]);
+   }
+   for (i = 0; i < setup->fs.nr_inputs; i++) {
+      debug_printf("  v3[%u]:  %f %f %f %f\n", i,
+                   v3[i][0], v3[i][1], v3[i][2], v3[i][3]);
+   }
+}
  
-#define MIN3(a,b,c) MIN2(MIN2(a,b),c)
-#define MAX3(a,b,c) MAX2(MAX2(a,b),c)
  
+/**
+ * Do basic setup for triangle rasterization and determine which
+ * framebuffer tiles are touched.  Put the triangle in the scene's
+ * bins for the tiles which we overlap.
+ */
  static void 
-do_triangle_ccw(struct lp_setup *setup,
+do_triangle_ccw(struct lp_setup_context *setup,
                 const float (*v1)[4],
                 const float (*v2)[4],
                 const float (*v3)[4],
                 boolean frontfacing )
  {
-   const int rt_width = setup->framebuffer.cbufs[0]->width;
-   const int rt_height = setup->framebuffer.cbufs[0]->height;
+   /* x/y positions in fixed point */
+   const int x1 = subpixel_snap(v1[0][0] + 0.5 - setup->pixel_offset);
+   const int x2 = subpixel_snap(v2[0][0] + 0.5 - setup->pixel_offset);
+   const int x3 = subpixel_snap(v3[0][0] + 0.5 - setup->pixel_offset);
+   const int y1 = subpixel_snap(v1[0][1] + 0.5 - setup->pixel_offset);
+   const int y2 = subpixel_snap(v2[0][1] + 0.5 - setup->pixel_offset);
+   const int y3 = subpixel_snap(v3[0][1] + 0.5 - setup->pixel_offset);
+
+   struct lp_scene *scene = lp_setup_get_current_scene(setup);
+   struct lp_rast_triangle *tri;
+   int area;
+   float oneoverarea;
+   int minx, maxx, miny, maxy;
+   unsigned tri_bytes;
  
-   const float y1 = subpixel_snap(v1[0][1]);
-   const float y2 = subpixel_snap(v2[0][1]);
-   const float y3 = subpixel_snap(v3[0][1]);
+   if (0)
+      print_triangle(setup, v1, v2, v3);
  
-   const float x1 = subpixel_snap(v1[0][0]);
-   const float x2 = subpixel_snap(v2[0][0]);
-   const float x3 = subpixel_snap(v3[0][0]);
-   
-   struct lp_setup_triangle *tri = lp_setup_alloc_data( setup, sizeof *tri );
-   float area;
-   float c1, c2, c3;
-   int i;
-   int minx, maxx, miny, maxy;
+   tri = alloc_triangle(scene, setup->fs.nr_inputs, &tri_bytes);
+
+#ifdef DEBUG
+   tri->v[0][0] = v1[0][0];
+   tri->v[1][0] = v2[0][0];
+   tri->v[2][0] = v3[0][0];
+   tri->v[0][1] = v1[0][1];
+   tri->v[1][1] = v2[0][1];
+   tri->v[2][1] = v3[0][1];
+#endif
  
     tri->dx12 = x1 - x2;
     tri->dx23 = x2 - x3;
@@ -277,50 +359,69 @@ do_triangle_ccw(struct lp_setup *setup,
     tri->dy23 = y2 - y3;
     tri->dy31 = y3 - y1;
  
-   area = (tri->dx12 * tri->dy31 - 
-          tri->dx31 * tri->dy12);
+   area = (tri->dx12 * tri->dy31 - tri->dx31 * tri->dy12);
  
-   /* Cull non-ccw and zero-sized triangles.
+   LP_COUNT(nr_tris);
+
+   /* Cull non-ccw and zero-sized triangles. 
+    *
+    * XXX: subject to overflow??
      */
-   if (area <= 0 || util_is_inf_or_nan(area))
+   if (area <= 0) {
+      lp_scene_putback_data( scene, tri_bytes );
+      LP_COUNT(nr_culled_tris);
        return;
+   }
  
-   // Bounding rectangle
-   minx = util_iround(MIN3(x1, x2, x3) - .5);
-   maxx = util_iround(MAX3(x1, x2, x3) + .5);
-   miny = util_iround(MIN3(y1, y2, y3) - .5);
-   maxy = util_iround(MAX3(y1, y2, y3) + .5);
+   /* Bounding rectangle (in pixels) */
+   minx = (MIN3(x1, x2, x3) + (FIXED_ONE-1)) >> FIXED_ORDER;
+   maxx = (MAX3(x1, x2, x3) + (FIXED_ONE-1)) >> FIXED_ORDER;
+   miny = (MIN3(y1, y2, y3) + (FIXED_ONE-1)) >> FIXED_ORDER;
+   maxy = (MAX3(y1, y2, y3) + (FIXED_ONE-1)) >> FIXED_ORDER;
     
-   /* Clamp to framebuffer (or tile) dimensions:
-    */
-   miny = MAX2(0, miny);
-   minx = MAX2(0, minx);
-   maxy = MIN2(rt_height, maxy);
-   maxx = MIN2(rt_width, maxx);
+   if (setup->scissor_test) {
+      minx = MAX2(minx, setup->scissor.current.minx);
+      maxx = MIN2(maxx, setup->scissor.current.maxx);
+      miny = MAX2(miny, setup->scissor.current.miny);
+      maxy = MIN2(maxy, setup->scissor.current.maxy);
+   }
  
-   if (miny == maxy || minx == maxx)
+   if (miny == maxy || 
+       minx == maxx) {
+      lp_scene_putback_data( scene, tri_bytes );
+      LP_COUNT(nr_culled_tris);
        return;
+   }
  
-   /* The only divide in this code.  Is it really needed?
+   /* Reciprocal of the fixed-point triangle area, pre-multiplied by FIXED_ONE.
      */
-   tri->oneoverarea = 1.0f / area;
+   oneoverarea = ((float)FIXED_ONE) / (float)area;
  
     /* Setup parameter interpolants:
      */
-   setup_tri_coefficients( setup, tri, v1, v2, v3, frontfacing );
+   setup_tri_coefficients( setup, tri, oneoverarea, v1, v2, v3, frontfacing );
  
-   /* half-edge constants, will be interated over the whole
-    * rendertarget.
+   tri->inputs.facing = frontfacing ? 1.0F : -1.0F;
+
+   /* half-edge constants, will be iterated over the whole render target.
      */
-   c1 = tri->dy12 * x1 - tri->dx12 * y1;
-   c2 = tri->dy23 * x2 - tri->dx23 * y2;
-   c3 = tri->dy31 * x3 - tri->dx31 * y3;
+   tri->c1 = tri->dy12 * x1 - tri->dx12 * y1;
+   tri->c2 = tri->dy23 * x2 - tri->dx23 * y2;
+   tri->c3 = tri->dy31 * x3 - tri->dx31 * y3;
  
     /* correct for top-left fill convention:
      */
-   if (tri->dy12 < 0 || (tri->dy12 == 0 && tri->dx12 > 0)) c1++;
-   if (tri->dy23 < 0 || (tri->dy23 == 0 && tri->dx23 > 0)) c2++;
-   if (tri->dy31 < 0 || (tri->dy31 == 0 && tri->dx31 > 0)) c3++;
+   if (tri->dy12 < 0 || (tri->dy12 == 0 && tri->dx12 > 0)) tri->c1++;
+   if (tri->dy23 < 0 || (tri->dy23 == 0 && tri->dx23 > 0)) tri->c2++;
+   if (tri->dy31 < 0 || (tri->dy31 == 0 && tri->dx31 > 0)) tri->c3++;
+
+   tri->dy12 *= FIXED_ONE;
+   tri->dy23 *= FIXED_ONE;
+   tri->dy31 *= FIXED_ONE;
+
+   tri->dx12 *= FIXED_ONE;
+   tri->dx23 *= FIXED_ONE;
+   tri->dx31 *= FIXED_ONE;
  
     /* find trivial reject offsets for each edge for a single-pixel
      * sized block.  These will be scaled up at each recursive level to
@@ -345,79 +446,153 @@ do_triangle_ccw(struct lp_setup *setup,
     tri->ei2 = tri->dx23 - tri->dy23 - tri->eo2;
     tri->ei3 = tri->dx31 - tri->dy31 - tri->eo3;
  
-   minx &= ~(TILESIZE-1);              /* aligned blocks */
-   miny &= ~(TILESIZE-1);              /* aligned blocks */
+   /* Fill in the inputs.step[][] arrays: step[e][i] is the offset of edge
+    * function e at pixel (x,y) of a 4x4 block, ordered as four 2x2 quads.
+    */
+   {
+      const int xstep1 = -tri->dy12;
+      const int xstep2 = -tri->dy23;
+      const int xstep3 = -tri->dy31;
+      const int ystep1 = tri->dx12;
+      const int ystep2 = tri->dx23;
+      const int ystep3 = tri->dx31;
+
+#define SETUP_STEP(i, x, y)                                      \
+      do {                                                       \
+         tri->inputs.step[0][(i)] = (x) * xstep1 + (y) * ystep1; \
+         tri->inputs.step[1][(i)] = (x) * xstep2 + (y) * ystep2; \
+         tri->inputs.step[2][(i)] = (x) * xstep3 + (y) * ystep3; \
+      } while (0)
+
+      SETUP_STEP(0, 0, 0);
+      SETUP_STEP(1, 1, 0);
+      SETUP_STEP(2, 0, 1);
+      SETUP_STEP(3, 1, 1);
+
+      SETUP_STEP(4, 2, 0);
+      SETUP_STEP(5, 3, 0);
+      SETUP_STEP(6, 2, 1);
+      SETUP_STEP(7, 3, 1);
+
+      SETUP_STEP(8, 0, 2);
+      SETUP_STEP(9, 1, 2);
+      SETUP_STEP(10, 0, 3);
+      SETUP_STEP(11, 1, 3);
+
+      SETUP_STEP(12, 2, 2);
+      SETUP_STEP(13, 3, 2);
+      SETUP_STEP(14, 2, 3);
+      SETUP_STEP(15, 3, 3);
+#undef SETUP_STEP
+   }
  
-   c1 += tri->dx12 * miny - tri->dy12 * minx;
-   c2 += tri->dx23 * miny - tri->dy23 * minx;
-   c3 += tri->dx31 * miny - tri->dy31 * minx;
+   /*
+    * All fields of 'tri' are now set.  The remaining code here is
+    * concerned with binning.
+    */
  
     /* Convert to tile coordinates:
      */
-   minx /= TILESIZE;
-   maxx /= TILESIZE;
-   miny /= TILESIZE;
-   maxy /= TILESIZE;
-   
+   minx = minx / TILE_SIZE;
+   miny = miny / TILE_SIZE;
+   maxx = maxx / TILE_SIZE;
+   maxy = maxy / TILE_SIZE;
+
+   /*
+    * Clamp to framebuffer size
+    */
+   minx = MAX2(minx, 0);
+   miny = MAX2(miny, 0);
+   maxx = MIN2(maxx, scene->tiles_x - 1);
+   maxy = MIN2(maxy, scene->tiles_y - 1);
+
+   /* Determine which tile(s) intersect the triangle's bounding box
+    */
     if (miny == maxy && minx == maxx)
     {
        /* Triangle is contained in a single tile:
         */
-      bin_command(setup->tile[minx][miny], lp_rast_triangle, tri );
+      lp_scene_bin_command( scene, minx, miny, lp_rast_triangle, 
+                           lp_rast_arg_triangle(tri) );
     }
     else 
     {
-      const int step = TILESIZE;
-
-      float ei1 = tri->ei1 * step;
-      float ei2 = tri->ei2 * step;
-      float ei3 = tri->ei3 * step;
-
-      float eo1 = tri->eo1 * step;
-      float eo2 = tri->eo2 * step;
-      float eo3 = tri->eo3 * step;
-
-      float xstep1 = -step * tri->dy12;
-      float xstep2 = -step * tri->dy23;
-      float xstep3 = -step * tri->dy31;
-
-      float ystep1 = step * tri->dx12;
-      float ystep2 = step * tri->dx23;
-      float ystep3 = step * tri->dx31;
+      int c1 = (tri->c1 + 
+                tri->dx12 * miny * TILE_SIZE - 
+                tri->dy12 * minx * TILE_SIZE);
+      int c2 = (tri->c2 + 
+                tri->dx23 * miny * TILE_SIZE -
+                tri->dy23 * minx * TILE_SIZE);
+      int c3 = (tri->c3 +
+                tri->dx31 * miny * TILE_SIZE -
+                tri->dy31 * minx * TILE_SIZE);
+
+      int ei1 = tri->ei1 << TILE_ORDER;
+      int ei2 = tri->ei2 << TILE_ORDER;
+      int ei3 = tri->ei3 << TILE_ORDER;
+
+      int eo1 = tri->eo1 << TILE_ORDER;
+      int eo2 = tri->eo2 << TILE_ORDER;
+      int eo3 = tri->eo3 << TILE_ORDER;
+
+      int xstep1 = -(tri->dy12 << TILE_ORDER);
+      int xstep2 = -(tri->dy23 << TILE_ORDER);
+      int xstep3 = -(tri->dy31 << TILE_ORDER);
+
+      int ystep1 = tri->dx12 << TILE_ORDER;
+      int ystep2 = tri->dx23 << TILE_ORDER;
+      int ystep3 = tri->dx31 << TILE_ORDER;
        int x, y;
  
  
-      /* Subdivide space into NxM blocks, where each block is square and
-       * power-of-four in dimension.
-       *
-       * Trivially accept or reject blocks, else jump to per-pixel
-       * examination above.
+      /* Test tile-sized blocks against the triangle.
+       * Discard blocks fully outside the tri.  If the block is fully
+       * contained inside the tri, bin an lp_rast_shade_tile command.
+       * Else, bin a lp_rast_triangle command.
         */
-      for (y = miny; y < maxy; y++)
+      for (y = miny; y <= maxy; y++)
        {
-        float cx1 = c1;
-        float cx2 = c2;
-        float cx3 = c3;
+        int cx1 = c1;
+        int cx2 = c2;
+        int cx3 = c3;
+        boolean in = FALSE;  /* are we inside the triangle? */
  
-        for (x = minx; x < maxx; x++)
+        for (x = minx; x <= maxx; x++)
          {
             if (cx1 + eo1 < 0 || 
                 cx2 + eo2 < 0 ||
                 cx3 + eo3 < 0) 
             {
                /* do nothing */
+               LP_COUNT(nr_empty_64);
+              if (in)
+                 break;  /* exiting triangle, all done with this row */
             }
             else if (cx1 + ei1 > 0 &&
                      cx2 + ei2 > 0 &&
                      cx3 + ei3 > 0) 
             {
-               /* shade whole tile */
-               bin_command(setup->tile[x][y], lp_rast_shade_tile, &tri->inputs );
+               /* triangle covers the whole tile- shade whole tile */
+               LP_COUNT(nr_fully_covered_64);
+              in = TRUE;
+              if(setup->fs.current.opaque) {
+                 lp_scene_bin_reset( scene, x, y );
+                 lp_scene_bin_command( scene, x, y,
+                                       lp_rast_set_state,
+                                       lp_rast_arg_state(setup->fs.stored) );
+              }
+               lp_scene_bin_command( scene, x, y,
+                                    lp_rast_shade_tile,
+                                    lp_rast_arg_inputs(&tri->inputs) );
             }
             else 
-           {
-               /* shade partial tile */
-              bin_command(setup->tile[x][y], lp_rast_triangle, tri );
+           { 
+               /* rasterize/shade partial tile */
+               LP_COUNT(nr_partially_covered_64);
+              in = TRUE;
+               lp_scene_bin_command( scene, x, y,
+                                    lp_rast_triangle, 
+                                    lp_rast_arg_triangle(tri) );
             }
  
             /* Iterate cx values across the region:
@@ -436,7 +611,11 @@ do_triangle_ccw(struct lp_setup *setup,
     }
  }
  
-static void triangle_cw( struct setup_context *setup,
+
+/**
+ * Draw triangle if it's CW, cull otherwise.
+ */
+static void triangle_cw( struct lp_setup_context *setup,
                          const float (*v0)[4],
                          const float (*v1)[4],
                          const float (*v2)[4] )
@@ -444,7 +623,11 @@ static void triangle_cw( struct setup_context *setup,
     do_triangle_ccw( setup, v1, v0, v2, !setup->ccw_is_frontface );
  }
  
-static void triangle_ccw( struct setup_context *setup,
+
+/**
+ * Draw triangle if it's CCW, cull otherwise.
+ */
+static void triangle_ccw( struct lp_setup_context *setup,
                          const float (*v0)[4],
                          const float (*v1)[4],
                          const float (*v2)[4] )
@@ -452,7 +635,12 @@ static void triangle_ccw( struct setup_context *setup,
     do_triangle_ccw( setup, v0, v1, v2, setup->ccw_is_frontface );
  }
  
-static void triangle_both( struct setup_context *setup,
+
+
+/**
+ * Draw triangle whether it's CW or CCW.
+ */
+static void triangle_both( struct lp_setup_context *setup,
                            const float (*v0)[4],
                            const float (*v1)[4],
                            const float (*v2)[4] )
@@ -464,39 +652,36 @@ static void triangle_both( struct setup_context *setup,
     const float fy = v1[0][1] - v2[0][1];
  
     /* det = cross(e,f).z */
-   if (ex * fy - ey * fx < 0) 
+   if (ex * fy - ey * fx < 0.0f) 
        triangle_ccw( setup, v0, v1, v2 );
     else
        triangle_cw( setup, v0, v1, v2 );
  }
  
-static void triangle_nop( struct setup_context *setup,
+
+static void triangle_nop( struct lp_setup_context *setup,
                           const float (*v0)[4],
                           const float (*v1)[4],
                           const float (*v2)[4] )
  {
  }
  
-void setup_set_tri_state( struct setup_context *setup,
-                          unsigned cull_mode,
-                          boolean ccw_is_frontface)
-{
-   setup->ccw_is_frontface = ccw_is_frontface;
  
-   switch (cull_mode) {
-   case PIPE_WINDING_NONE:
+void 
+lp_setup_choose_triangle( struct lp_setup_context *setup )
+{
+   switch (setup->cullmode) {
+   case PIPE_FACE_NONE:
        setup->triangle = triangle_both;
        break;
-   case PIPE_WINDING_CCW:
-      setup->triangle = triangle_cw;
+   case PIPE_FACE_BACK:
+      setup->triangle = setup->ccw_is_frontface ? triangle_ccw : triangle_cw;
        break;
-   case PIPE_WINDING_CW:
-      setup->triangle = triangle_ccw;
+   case PIPE_FACE_FRONT:
+      setup->triangle = setup->ccw_is_frontface ? triangle_cw : triangle_ccw;
        break;
     default:
        setup->triangle = triangle_nop;
        break;
     }
  }
-
-