st/xorg: If dri2 version is 2 support version 1 as well
[mesa.git] / src / gallium / drivers / llvmpipe / lp_setup.c
index c81a2b7ca5f69d0e6b17cb69249a0c8915feafa7..ffcbc9a379f08b562f12f233a76ac50afe5adb75 100644 (file)
@@ -33,9 +33,7 @@
  */
 
 #include "lp_context.h"
-#include "lp_prim_setup.h"
 #include "lp_quad.h"
-#include "lp_quad_pipe.h"
 #include "lp_setup.h"
 #include "lp_state.h"
 #include "draw/draw_context.h"
@@ -45,6 +43,9 @@
 #include "pipe/p_thread.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
+#include "lp_bld_debug.h"
+#include "lp_tile_cache.h"
+#include "lp_tile_soa.h"
 
 
 #define DEBUG_VERTS 0
@@ -88,12 +89,13 @@ struct setup_context {
    float oneoverarea;
    int facing;
 
+   float pixel_offset;
+
    struct quad_header quad[MAX_QUADS];
    struct quad_header *quad_ptrs[MAX_QUADS];
    unsigned count;
 
-   struct tgsi_interp_coef coef[PIPE_MAX_SHADER_INPUTS];
-   struct tgsi_interp_coef posCoef;  /* For Z, W */
+   struct quad_interp_coef coef;
 
    struct {
       int left[2];   /**< [0] = row0, [1] = row1 */
@@ -111,6 +113,84 @@ struct setup_context {
 
 
 
+/**
+ * Invoke the current fragment shader variant once for a row of nr quads starting at quads[0].
+ */
+ALIGN_STACK
+static void
+shade_quads(struct llvmpipe_context *llvmpipe,
+            struct quad_header *quads[],
+            unsigned nr)
+{
+   struct lp_fragment_shader *fs = llvmpipe->fs;
+   struct quad_header *quad = quads[0];
+   const unsigned x = quad->input.x0;
+   const unsigned y = quad->input.y0;
+   uint8_t *tile;
+   uint8_t *color;
+   void *depth;
+   uint32_t ALIGN16_ATTRIB mask[4][NUM_CHANNELS];
+   unsigned chan_index;
+   unsigned q;
+
+   assert(fs->current);
+   if(!fs->current)
+      return;
+
+   /* Sanity checks: quads must form one contiguous row, aligned to the tile vector size */
+   assert(nr * QUAD_SIZE == TILE_VECTOR_HEIGHT * TILE_VECTOR_WIDTH);
+   assert(x % TILE_VECTOR_WIDTH == 0);
+   assert(y % TILE_VECTOR_HEIGHT == 0);
+   for (q = 0; q < nr; ++q) {
+      assert(quads[q]->input.x0 == x + q*2);
+      assert(quads[q]->input.y0 == y);
+   }
+
+   /* mask: expand each bit of inout.mask into an all-ones/all-zeros word (loop is hard-wired to 4 quads) */
+   for (q = 0; q < 4; ++q)
+      for (chan_index = 0; chan_index < NUM_CHANNELS; ++chan_index)
+         mask[q][chan_index] = quads[q]->inout.mask & (1 << chan_index) ? ~0 : 0;
+
+   /* color buffer: address of (x, y) inside the cached tile of cbuf 0, or NULL if none is bound */
+   if(llvmpipe->framebuffer.nr_cbufs >= 1 &&
+      llvmpipe->framebuffer.cbufs[0]) {
+      tile = lp_get_cached_tile(llvmpipe->cbuf_cache[0], x, y);
+      color = &TILE_PIXEL(tile, x & (TILE_SIZE-1), y & (TILE_SIZE-1), 0);
+   }
+   else
+      color = NULL;
+
+   /* depth buffer: address of (x, y) inside the mapped zsbuf, or NULL if it is not mapped */
+   if(llvmpipe->zsbuf_map) {
+      assert((x % 2) == 0);
+      assert((y % 2) == 0);
+      depth = llvmpipe->zsbuf_map +
+              y*llvmpipe->zsbuf_transfer->stride +
+              2*x*llvmpipe->zsbuf_transfer->block.size;
+   }
+   else
+      depth = NULL;
+
+   /* XXX: This will most likely fail on 32bit x86 without -mstackrealign */
+   assert(lp_check_alignment(mask, 16));
+
+   assert(lp_check_alignment(depth, 16));
+   assert(lp_check_alignment(color, 16));
+   assert(lp_check_alignment(llvmpipe->jit_context.blend_color, 16));
+
+   /* run shader: the coefficients of quads[0] are used for the whole row */
+   fs->current->jit_function( &llvmpipe->jit_context,
+                              x, y,
+                              quad->coef->a0,
+                              quad->coef->dadx,
+                              quad->coef->dady,
+                              &mask[0][0],
+                              color,
+                              depth);
+}
+
+
+
 
 /**
  * Do triangle cull test using tri determinant (sign indicates orientation)
@@ -167,22 +247,6 @@ quad_clip( struct setup_context *setup, struct quad_header *quad )
 }
 
 
-/**
- * Emit a quad (pass to next stage) with clipping.
- */
-static INLINE void
-clip_emit_quad( struct setup_context *setup, struct quad_header *quad )
-{
-   quad_clip( setup, quad );
-
-   if (quad->inout.mask) {
-      struct llvmpipe_context *lp = setup->llvmpipe;
-
-      lp->quad.first->run( lp->quad.first, &quad, 1 );
-   }
-}
-
-
 
 /**
  * Given an X or Y coordinate, return the block/quad coordinate that it
@@ -195,7 +259,51 @@ static INLINE int block( int x )
 
 static INLINE int block_x( int x )
 {
-   return x & ~(16-1);
+   return x & ~(TILE_VECTOR_WIDTH - 1);   /* clear the low bits; assumes TILE_VECTOR_WIDTH is a power of two */
+}
+
+
+/**
+ * Clip a quad and, if its mask is still non-zero, shade it (padded out to a row of 4 quads).
+ */
+static INLINE void
+clip_emit_quad( struct setup_context *setup, struct quad_header *quad )
+{
+   quad_clip( setup, quad );
+
+   if (quad->inout.mask) {
+      struct llvmpipe_context *lp = setup->llvmpipe;
+
+#if 1
+      /* XXX: The blender expects 4 quads. This is far from efficient, but
+       * until we codegenerate single-quad variants of the fragment pipeline
+       * we need this hack. */
+      const unsigned nr_quads = TILE_VECTOR_HEIGHT*TILE_VECTOR_WIDTH/QUAD_SIZE;
+      struct quad_header quads[4];
+      struct quad_header *quad_ptrs[4];
+      int x0 = block_x(quad->input.x0);
+      unsigned i;
+
+      assert(nr_quads == 4);
+
+      for(i = 0; i < nr_quads; ++i) {
+         int x = x0 + 2*i;
+         if(x == quad->input.x0)
+            memcpy(&quads[i], quad, sizeof quads[i]);   /* the real quad keeps its mask */
+         else {
+            memset(&quads[i], 0, sizeof quads[i]);   /* padding quad: inout.mask stays 0 */
+            quads[i].input.x0 = x;
+            quads[i].input.y0 = quad->input.y0;
+            quads[i].coef = quad->coef;   /* share the real quad's coefficients */
+         }
+         quad_ptrs[i] = &quads[i];
+      }
+
+      shade_quads( lp, quad_ptrs, nr_quads );
+#else
+      shade_quads( lp, &quad, 1 );
+#endif
+   }
 }
 
 
@@ -204,12 +312,11 @@ static INLINE int block_x( int x )
  */
 static void flush_spans( struct setup_context *setup )
 {
-   const int step = 16;
+   const int step = TILE_VECTOR_WIDTH;
    const int xleft0 = setup->span.left[0];
    const int xleft1 = setup->span.left[1];
    const int xright0 = setup->span.right[0];
    const int xright1 = setup->span.right[1];
-   struct quad_stage *pipe = setup->llvmpipe->quad.first;
 
 
    int minleft = block_x(MIN2(xleft0, xleft1));
@@ -222,6 +329,7 @@ static void flush_spans( struct setup_context *setup )
       unsigned skip_right0 = CLAMP(x + step - xright0, 0, step);
       unsigned skip_right1 = CLAMP(x + step - xright1, 0, step);
       unsigned lx = x;
+      const unsigned nr_quads = TILE_VECTOR_HEIGHT*TILE_VECTOR_WIDTH/QUAD_SIZE;
       unsigned q = 0;
 
       unsigned skipmask_left0 = (1U << skip_left0) - 1U;
@@ -236,21 +344,19 @@ static void flush_spans( struct setup_context *setup )
       unsigned mask1 = ~skipmask_left1 & ~skipmask_right1;
 
       if (mask0 | mask1) {
-         do {
+         for(q = 0; q < nr_quads; ++q) {
             unsigned quadmask = (mask0 & 3) | ((mask1 & 3) << 2);
-            if (quadmask) {
-               setup->quad[q].input.x0 = lx;
-               setup->quad[q].input.y0 = setup->span.y;
-               setup->quad[q].inout.mask = quadmask;
-               setup->quad_ptrs[q] = &setup->quad[q];
-               q++;
-            }
+            setup->quad[q].input.x0 = lx;
+            setup->quad[q].input.y0 = setup->span.y;
+            setup->quad[q].inout.mask = quadmask;
+            setup->quad_ptrs[q] = &setup->quad[q];
             mask0 >>= 2;
             mask1 >>= 2;
             lx += 2;
-         } while (mask0 | mask1);
+         }
+         assert(!(mask0 | mask1));
 
-         pipe->run( pipe, setup->quad_ptrs, q );
+         shade_quads(setup->llvmpipe, setup->quad_ptrs, nr_quads );
       }
    }
 
@@ -378,29 +484,17 @@ static boolean setup_sort_vertices( struct setup_context *setup,
       ((det > 0.0) ^ 
        (setup->llvmpipe->rasterizer->front_winding == PIPE_WINDING_CW));
 
-   return TRUE;
-}
-
-
-/**
- * Compute a0 for a constant-valued coefficient (GL_FLAT shading).
- * The value value comes from vertex[slot][i].
- * The result will be put into setup->coef[slot].a0[i].
- * \param slot  which attribute slot
- * \param i  which component of the slot (0..3)
- */
-static void const_coeff( struct setup_context *setup,
-                         struct tgsi_interp_coef *coef,
-                         uint vertSlot, uint i)
-{
-   assert(i <= 3);
-
-   coef->dadx[i] = 0;
-   coef->dady[i] = 0;
-
-   /* need provoking vertex info!
+   /* Prepare pixel offset for rasterisation:
+    *  - pixel center (0.5, 0.5) for GL, or
+    *  - assume (0.0, 0.0) for other APIs.
     */
-   coef->a0[i] = setup->vprovoke[vertSlot][i];
+   if (setup->llvmpipe->rasterizer->gl_rasterization_rules) {
+      setup->pixel_offset = 0.5f;
+   } else {
+      setup->pixel_offset = 0.0f;
+   }
+
+   return TRUE;
 }
 
 
@@ -408,9 +502,8 @@ static void const_coeff( struct setup_context *setup,
  * Compute a0, dadx and dady for a linearly interpolated coefficient,
  * for a triangle.
  */
-static void tri_linear_coeff( struct setup_context *setup,
-                              struct tgsi_interp_coef *coef,
-                              uint vertSlot, uint i)
+static void tri_pos_coeff( struct setup_context *setup,
+                           uint vertSlot, unsigned i)
 {
    float botda = setup->vmid[vertSlot][i] - setup->vmin[vertSlot][i];
    float majda = setup->vmax[vertSlot][i] - setup->vmin[vertSlot][i];
@@ -421,12 +514,12 @@ static void tri_linear_coeff( struct setup_context *setup,
 
    assert(i <= 3);
 
-   coef->dadx[i] = dadx;
-   coef->dady[i] = dady;
+   setup->coef.dadx[0][i] = dadx;
+   setup->coef.dady[0][i] = dady;
 
    /* calculate a0 as the value which would be sampled for the
     * fragment at (0,0), taking into account that we want to sample at
-    * pixel centers, in other words (0.5, 0.5).
+    * pixel centers, in other words (pixel_offset, pixel_offset).
     *
     * this is neat but unfortunately not a good way to do things for
     * triangles with very large values of dadx or dady as it will
@@ -436,20 +529,111 @@ static void tri_linear_coeff( struct setup_context *setup,
     * to define a0 as the sample at a pixel center somewhere near vmin
     * instead - i'll switch to this later.
     */
-   coef->a0[i] = (setup->vmin[vertSlot][i] -
-                  (dadx * (setup->vmin[0][0] - 0.5f) +
-                   dady * (setup->vmin[0][1] - 0.5f)));
+   setup->coef.a0[0][i] = (setup->vmin[vertSlot][i] -
+                           (dadx * (setup->vmin[0][0] - setup->pixel_offset) +
+                            dady * (setup->vmin[0][1] - setup->pixel_offset)));
 
    /*
    debug_printf("attr[%d].%c: %f dx:%f dy:%f\n",
-               slot, "xyzw"[i],
-               setup->coef[slot].a0[i],
-               setup->coef[slot].dadx[i],
-               setup->coef[slot].dady[i]);
+                slot, "xyzw"[i],
+                setup->coef[slot].a0[i],
+                setup->coef[slot].dadx[i],
+                setup->coef[slot].dady[i]);
    */
 }
 
 
+/**
+ * Set up a constant-valued position coefficient (no x/y gradient).
+ * The value comes from setup->vprovoke[vertSlot][i].
+ * The result is put into setup->coef.a0[0][i]; dadx/dady for that entry are zeroed.
+ * \param vertSlot  which vertex attribute slot to read
+ * \param i  which component of the slot (0..3)
+ */
+static void const_pos_coeff( struct setup_context *setup,
+                             uint vertSlot, unsigned i)
+{
+   setup->coef.dadx[0][i] = 0;
+   setup->coef.dady[0][i] = 0;
+
+   /* need provoking vertex info!
+    */
+   setup->coef.a0[0][i] = setup->vprovoke[vertSlot][i];
+}
+
+
+/**
+ * Compute a0 for a constant-valued coefficient (GL_FLAT shading),
+ * for all NUM_CHANNELS components of one attribute.
+ * Values come from setup->vprovoke[vertSlot][]; results go to setup->coef.a0[1 + attrib][].
+ * \param attrib  fragment input index (stored at coefficient row 1 + attrib)
+ * \param vertSlot  which vertex attribute slot to read
+ */
+static void const_coeff( struct setup_context *setup,
+                         unsigned attrib,
+                         uint vertSlot)
+{
+   unsigned i;
+   for (i = 0; i < NUM_CHANNELS; ++i) {
+      setup->coef.dadx[1 + attrib][i] = 0;
+      setup->coef.dady[1 + attrib][i] = 0;
+
+      /* need provoking vertex info!
+       */
+      setup->coef.a0[1 + attrib][i] = setup->vprovoke[vertSlot][i];
+   }
+}
+
+
+/**
+ * Compute a0, dadx and dady for all NUM_CHANNELS components of a linearly
+ * interpolated triangle attribute; results go to coefficient row 1 + attrib.
+ */
+static void tri_linear_coeff( struct setup_context *setup,
+                              unsigned attrib,
+                              uint vertSlot)
+{
+   unsigned i;
+   for (i = 0; i < NUM_CHANNELS; ++i) {
+      float botda = setup->vmid[vertSlot][i] - setup->vmin[vertSlot][i];
+      float majda = setup->vmax[vertSlot][i] - setup->vmin[vertSlot][i];
+      float a = setup->ebot.dy * majda - botda * setup->emaj.dy;
+      float b = setup->emaj.dx * botda - majda * setup->ebot.dx;
+      float dadx = a * setup->oneoverarea;
+      float dady = b * setup->oneoverarea;
+
+      assert(i <= 3);
+
+      setup->coef.dadx[1 + attrib][i] = dadx;
+      setup->coef.dady[1 + attrib][i] = dady;
+
+      /* calculate a0 as the value which would be sampled for the
+       * fragment at (0,0), taking into account that we want to sample at
+       * pixel centers, in other words (pixel_offset, pixel_offset).
+       *
+       * this is neat but unfortunately not a good way to do things for
+       * triangles with very large values of dadx or dady as it will
+       * result in the subtraction and re-addition from a0 of a very
+       * large number, which means we'll end up losing a lot of the
+       * fractional bits and precision from a0.  the way to fix this is
+       * to define a0 as the sample at a pixel center somewhere near vmin
+       * instead - i'll switch to this later.
+       */
+      setup->coef.a0[1 + attrib][i] = (setup->vmin[vertSlot][i] -
+                     (dadx * (setup->vmin[0][0] - setup->pixel_offset) +
+                      dady * (setup->vmin[0][1] - setup->pixel_offset)));
+
+      /*
+      debug_printf("attr[%d].%c: %f dx:%f dy:%f\n",
+                   slot, "xyzw"[i],
+                   setup->coef[slot].a0[i],
+                   setup->coef[slot].dadx[i],
+                   setup->coef[slot].dady[i]);
+      */
+   }
+}
+
+
 /**
  * Compute a0, dadx and dady for a perspective-corrected interpolant,
  * for a triangle.
@@ -459,35 +643,38 @@ static void tri_linear_coeff( struct setup_context *setup,
  * divide the interpolated value by the interpolated W at that fragment.
  */
 static void tri_persp_coeff( struct setup_context *setup,
-                             struct tgsi_interp_coef *coef,
-                             uint vertSlot, uint i)
+                             unsigned attrib,
+                             uint vertSlot)
 {
-   /* premultiply by 1/w  (v[0][3] is always W):
-    */
-   float mina = setup->vmin[vertSlot][i] * setup->vmin[0][3];
-   float mida = setup->vmid[vertSlot][i] * setup->vmid[0][3];
-   float maxa = setup->vmax[vertSlot][i] * setup->vmax[0][3];
-   float botda = mida - mina;
-   float majda = maxa - mina;
-   float a = setup->ebot.dy * majda - botda * setup->emaj.dy;
-   float b = setup->emaj.dx * botda - majda * setup->ebot.dx;
-   float dadx = a * setup->oneoverarea;
-   float dady = b * setup->oneoverarea;
+   unsigned i;
+   for (i = 0; i < NUM_CHANNELS; ++i) {
+      /* premultiply by 1/w  (v[0][3] is always W):
+       */
+      float mina = setup->vmin[vertSlot][i] * setup->vmin[0][3];
+      float mida = setup->vmid[vertSlot][i] * setup->vmid[0][3];
+      float maxa = setup->vmax[vertSlot][i] * setup->vmax[0][3];
+      float botda = mida - mina;
+      float majda = maxa - mina;
+      float a = setup->ebot.dy * majda - botda * setup->emaj.dy;
+      float b = setup->emaj.dx * botda - majda * setup->ebot.dx;
+      float dadx = a * setup->oneoverarea;
+      float dady = b * setup->oneoverarea;
 
-   /*
-   debug_printf("tri persp %d,%d: %f %f %f\n", vertSlot, i,
-               setup->vmin[vertSlot][i],
-               setup->vmid[vertSlot][i],
-                       setup->vmax[vertSlot][i]
-          );
-   */
-   assert(i <= 3);
+      /*
+      debug_printf("tri persp %d,%d: %f %f %f\n", vertSlot, i,
+                   setup->vmin[vertSlot][i],
+                   setup->vmid[vertSlot][i],
+                   setup->vmax[vertSlot][i]
+             );
+      */
+      assert(i <= 3);
 
-   coef->dadx[i] = dadx;
-   coef->dady[i] = dady;
-   coef->a0[i] = (mina -
-                  (dadx * (setup->vmin[0][0] - 0.5f) +
-                   dady * (setup->vmin[0][1] - 0.5f)));
+      setup->coef.dadx[1 + attrib][i] = dadx;
+      setup->coef.dady[1 + attrib][i] = dady;
+      setup->coef.a0[1 + attrib][i] = (mina -
+                     (dadx * (setup->vmin[0][0] - setup->pixel_offset) +
+                      dady * (setup->vmin[0][1] - setup->pixel_offset)));
+   }
 }
 
 
@@ -501,21 +688,21 @@ static void
 setup_fragcoord_coeff(struct setup_context *setup, uint slot)
 {
    /*X*/
-   setup->coef[slot].a0[0] = 0;
-   setup->coef[slot].dadx[0] = 1.0;
-   setup->coef[slot].dady[0] = 0.0;
+   setup->coef.a0[1 + slot][0] = 0;
+   setup->coef.dadx[1 + slot][0] = 1.0;
+   setup->coef.dady[1 + slot][0] = 0.0;
    /*Y*/
-   setup->coef[slot].a0[1] = 0.0;
-   setup->coef[slot].dadx[1] = 0.0;
-   setup->coef[slot].dady[1] = 1.0;
+   setup->coef.a0[1 + slot][1] = 0.0;
+   setup->coef.dadx[1 + slot][1] = 0.0;
+   setup->coef.dady[1 + slot][1] = 1.0;
    /*Z*/
-   setup->coef[slot].a0[2] = setup->posCoef.a0[2];
-   setup->coef[slot].dadx[2] = setup->posCoef.dadx[2];
-   setup->coef[slot].dady[2] = setup->posCoef.dady[2];
+   setup->coef.a0[1 + slot][2] = setup->coef.a0[0][2];
+   setup->coef.dadx[1 + slot][2] = setup->coef.dadx[0][2];
+   setup->coef.dady[1 + slot][2] = setup->coef.dady[0][2];
    /*W*/
-   setup->coef[slot].a0[3] = setup->posCoef.a0[3];
-   setup->coef[slot].dadx[3] = setup->posCoef.dadx[3];
-   setup->coef[slot].dady[3] = setup->posCoef.dady[3];
+   setup->coef.a0[1 + slot][3] = setup->coef.a0[0][3];
+   setup->coef.dadx[1 + slot][3] = setup->coef.dadx[0][3];
+   setup->coef.dady[1 + slot][3] = setup->coef.dady[0][3];
 }
 
 
@@ -533,27 +720,23 @@ static void setup_tri_coefficients( struct setup_context *setup )
 
    /* z and w are done by linear interpolation:
     */
-   tri_linear_coeff(setup, &setup->posCoef, 0, 2);
-   tri_linear_coeff(setup, &setup->posCoef, 0, 3);
+   tri_pos_coeff(setup, 0, 2);
+   tri_pos_coeff(setup, 0, 3);
 
    /* setup interpolation for all the remaining attributes:
     */
    for (fragSlot = 0; fragSlot < lpfs->info.num_inputs; fragSlot++) {
       const uint vertSlot = vinfo->attrib[fragSlot].src_index;
-      uint j;
 
       switch (vinfo->attrib[fragSlot].interp_mode) {
       case INTERP_CONSTANT:
-         for (j = 0; j < NUM_CHANNELS; j++)
-            const_coeff(setup, &setup->coef[fragSlot], vertSlot, j);
+         const_coeff(setup, fragSlot, vertSlot);
          break;
       case INTERP_LINEAR:
-         for (j = 0; j < NUM_CHANNELS; j++)
-            tri_linear_coeff(setup, &setup->coef[fragSlot], vertSlot, j);
+         tri_linear_coeff(setup, fragSlot, vertSlot);
          break;
       case INTERP_PERSPECTIVE:
-         for (j = 0; j < NUM_CHANNELS; j++)
-            tri_persp_coeff(setup, &setup->coef[fragSlot], vertSlot, j);
+         tri_persp_coeff(setup, fragSlot, vertSlot);
          break;
       case INTERP_POS:
          setup_fragcoord_coeff(setup, fragSlot);
@@ -563,9 +746,9 @@ static void setup_tri_coefficients( struct setup_context *setup )
       }
 
       if (lpfs->info.input_semantic_name[fragSlot] == TGSI_SEMANTIC_FACE) {
-         setup->coef[fragSlot].a0[0] = 1.0f - setup->facing;
-         setup->coef[fragSlot].dadx[0] = 0.0;
-         setup->coef[fragSlot].dady[0] = 0.0;
+         setup->coef.a0[1 + fragSlot][0] = 1.0f - setup->facing;
+         setup->coef.dadx[1 + fragSlot][0] = 0.0;
+         setup->coef.dady[1 + fragSlot][0] = 0.0;
       }
    }
 }
@@ -574,12 +757,12 @@ static void setup_tri_coefficients( struct setup_context *setup )
 
 static void setup_tri_edges( struct setup_context *setup )
 {
-   float vmin_x = setup->vmin[0][0] + 0.5f;
-   float vmid_x = setup->vmid[0][0] + 0.5f;
+   float vmin_x = setup->vmin[0][0] + setup->pixel_offset;
+   float vmid_x = setup->vmid[0][0] + setup->pixel_offset;
 
-   float vmin_y = setup->vmin[0][1] - 0.5f;
-   float vmid_y = setup->vmid[0][1] - 0.5f;
-   float vmax_y = setup->vmax[0][1] - 0.5f;
+   float vmin_y = setup->vmin[0][1] - setup->pixel_offset;
+   float vmid_y = setup->vmid[0][1] - setup->pixel_offset;
+   float vmax_y = setup->vmax[0][1] - setup->pixel_offset;
 
    setup->emaj.sy = ceilf(vmin_y);
    setup->emaj.lines = (int) ceilf(vmax_y - setup->emaj.sy);
@@ -696,7 +879,7 @@ calc_det( const float (*v0)[4],
 /**
  * Do setup for triangle rasterization, then render the triangle.
  */
-void setup_tri( struct setup_context *setup,
+void llvmpipe_setup_tri( struct setup_context *setup,
                 const float (*v0)[4],
                 const float (*v1)[4],
                 const float (*v2)[4] )
@@ -769,18 +952,40 @@ void setup_tri( struct setup_context *setup,
  * for a line.
  */
 static void
-line_linear_coeff(const struct setup_context *setup,
-                  struct tgsi_interp_coef *coef,
-                  uint vertSlot, uint i)
+linear_pos_coeff(struct setup_context *setup,
+                 uint vertSlot, uint i)
 {
    const float da = setup->vmax[vertSlot][i] - setup->vmin[vertSlot][i];
    const float dadx = da * setup->emaj.dx * setup->oneoverarea;
    const float dady = da * setup->emaj.dy * setup->oneoverarea;
-   coef->dadx[i] = dadx;
-   coef->dady[i] = dady;
-   coef->a0[i] = (setup->vmin[vertSlot][i] -
-                  (dadx * (setup->vmin[0][0] - 0.5f) +
-                   dady * (setup->vmin[0][1] - 0.5f)));
+   setup->coef.dadx[0][i] = dadx;
+   setup->coef.dady[0][i] = dady;
+   setup->coef.a0[0][i] = (setup->vmin[vertSlot][i] -
+                           (dadx * (setup->vmin[0][0] - setup->pixel_offset) +
+                            dady * (setup->vmin[0][1] - setup->pixel_offset)));
+}
+
+
+/**
+ * Compute a0, dadx and dady for all NUM_CHANNELS components of a linearly
+ * interpolated line attribute; results go to coefficient row 1 + attrib.
+ */
+static void
+line_linear_coeff(struct setup_context *setup,
+                  unsigned attrib,
+                  uint vertSlot)
+{
+   unsigned i;
+   for (i = 0; i < NUM_CHANNELS; ++i) {
+      const float da = setup->vmax[vertSlot][i] - setup->vmin[vertSlot][i];
+      const float dadx = da * setup->emaj.dx * setup->oneoverarea;
+      const float dady = da * setup->emaj.dy * setup->oneoverarea;
+      setup->coef.dadx[1 + attrib][i] = dadx;
+      setup->coef.dady[1 + attrib][i] = dady;
+      setup->coef.a0[1 + attrib][i] = (setup->vmin[vertSlot][i] -
+                     (dadx * (setup->vmin[0][0] - setup->pixel_offset) +
+                      dady * (setup->vmin[0][1] - setup->pixel_offset)));
+   }
 }
 
 
@@ -789,21 +994,24 @@ line_linear_coeff(const struct setup_context *setup,
  * for a line.
  */
 static void
-line_persp_coeff(const struct setup_context *setup,
-                 struct tgsi_interp_coef *coef,
-                 uint vertSlot, uint i)
+line_persp_coeff(struct setup_context *setup,
+                 unsigned attrib,
+                 uint vertSlot)
 {
-   /* XXX double-check/verify this arithmetic */
-   const float a0 = setup->vmin[vertSlot][i] * setup->vmin[0][3];
-   const float a1 = setup->vmax[vertSlot][i] * setup->vmax[0][3];
-   const float da = a1 - a0;
-   const float dadx = da * setup->emaj.dx * setup->oneoverarea;
-   const float dady = da * setup->emaj.dy * setup->oneoverarea;
-   coef->dadx[i] = dadx;
-   coef->dady[i] = dady;
-   coef->a0[i] = (setup->vmin[vertSlot][i] -
-                  (dadx * (setup->vmin[0][0] - 0.5f) +
-                   dady * (setup->vmin[0][1] - 0.5f)));
+   unsigned i;
+   for (i = 0; i < NUM_CHANNELS; ++i) {
+      /* XXX double-check/verify this arithmetic */
+      const float a0 = setup->vmin[vertSlot][i] * setup->vmin[0][3];
+      const float a1 = setup->vmax[vertSlot][i] * setup->vmax[0][3];
+      const float da = a1 - a0;
+      const float dadx = da * setup->emaj.dx * setup->oneoverarea;
+      const float dady = da * setup->emaj.dy * setup->oneoverarea;
+      setup->coef.dadx[1 + attrib][i] = dadx;
+      setup->coef.dady[1 + attrib][i] = dady;
+      setup->coef.a0[1 + attrib][i] = (setup->vmin[vertSlot][i] -
+                     (dadx * (setup->vmin[0][0] - setup->pixel_offset) +
+                      dady * (setup->vmin[0][1] - setup->pixel_offset)));
+   }
 }
 
 
@@ -841,27 +1049,23 @@ setup_line_coefficients(struct setup_context *setup,
 
    /* z and w are done by linear interpolation:
     */
-   line_linear_coeff(setup, &setup->posCoef, 0, 2);
-   line_linear_coeff(setup, &setup->posCoef, 0, 3);
+   linear_pos_coeff(setup, 0, 2);
+   linear_pos_coeff(setup, 0, 3);
 
    /* setup interpolation for all the remaining attributes:
     */
    for (fragSlot = 0; fragSlot < lpfs->info.num_inputs; fragSlot++) {
       const uint vertSlot = vinfo->attrib[fragSlot].src_index;
-      uint j;
 
       switch (vinfo->attrib[fragSlot].interp_mode) {
       case INTERP_CONSTANT:
-         for (j = 0; j < NUM_CHANNELS; j++)
-            const_coeff(setup, &setup->coef[fragSlot], vertSlot, j);
+         const_coeff(setup, fragSlot, vertSlot);
          break;
       case INTERP_LINEAR:
-         for (j = 0; j < NUM_CHANNELS; j++)
-            line_linear_coeff(setup, &setup->coef[fragSlot], vertSlot, j);
+         line_linear_coeff(setup, fragSlot, vertSlot);
          break;
       case INTERP_PERSPECTIVE:
-         for (j = 0; j < NUM_CHANNELS; j++)
-            line_persp_coeff(setup, &setup->coef[fragSlot], vertSlot, j);
+         line_persp_coeff(setup, fragSlot, vertSlot);
          break;
       case INTERP_POS:
          setup_fragcoord_coeff(setup, fragSlot);
@@ -871,9 +1075,9 @@ setup_line_coefficients(struct setup_context *setup,
       }
 
       if (lpfs->info.input_semantic_name[fragSlot] == TGSI_SEMANTIC_FACE) {
-         setup->coef[fragSlot].a0[0] = 1.0f - setup->facing;
-         setup->coef[fragSlot].dadx[0] = 0.0;
-         setup->coef[fragSlot].dady[0] = 0.0;
+         setup->coef.a0[1 + fragSlot][0] = 1.0f - setup->facing;
+         setup->coef.dadx[1 + fragSlot][0] = 0.0;
+         setup->coef.dady[1 + fragSlot][0] = 0.0;
       }
    }
    return TRUE;
@@ -915,7 +1119,7 @@ plot(struct setup_context *setup, int x, int y)
  * to handle stippling and wide lines.
  */
 void
-setup_line(struct setup_context *setup,
+llvmpipe_setup_line(struct setup_context *setup,
            const float (*v0)[4],
            const float (*v1)[4])
 {
@@ -1027,15 +1231,17 @@ setup_line(struct setup_context *setup,
 
 
 static void
-point_persp_coeff(const struct setup_context *setup,
+point_persp_coeff(struct setup_context *setup,
                   const float (*vert)[4],
-                  struct tgsi_interp_coef *coef,
-                  uint vertSlot, uint i)
+                  unsigned attrib,
+                  uint vertSlot)
 {
-   assert(i <= 3);
-   coef->dadx[i] = 0.0F;
-   coef->dady[i] = 0.0F;
-   coef->a0[i] = vert[vertSlot][i] * vert[0][3];
+   unsigned i;
+   for(i = 0; i < NUM_CHANNELS; ++i) {
+      setup->coef.dadx[1 + attrib][i] = 0.0F;   /* no x gradient for a point */
+      setup->coef.dady[1 + attrib][i] = 0.0F;   /* no y gradient for a point */
+      setup->coef.a0[1 + attrib][i] = vert[vertSlot][i] * vert[0][3];   /* attribute premultiplied by vert[0][3] */
+   }
 }
 
 
@@ -1045,7 +1251,7 @@ point_persp_coeff(const struct setup_context *setup,
  * XXX could optimize a lot for 1-pixel points.
  */
 void
-setup_point( struct setup_context *setup,
+llvmpipe_setup_point( struct setup_context *setup,
              const float (*v0)[4] )
 {
    struct llvmpipe_context *llvmpipe = setup->llvmpipe;
@@ -1090,24 +1296,20 @@ setup_point( struct setup_context *setup,
    setup->vprovoke = v0;
 
    /* setup Z, W */
-   const_coeff(setup, &setup->posCoef, 0, 2);
-   const_coeff(setup, &setup->posCoef, 0, 3);
+   const_pos_coeff(setup, 0, 2);
+   const_pos_coeff(setup, 0, 3);
 
    for (fragSlot = 0; fragSlot < lpfs->info.num_inputs; fragSlot++) {
       const uint vertSlot = vinfo->attrib[fragSlot].src_index;
-      uint j;
 
       switch (vinfo->attrib[fragSlot].interp_mode) {
       case INTERP_CONSTANT:
          /* fall-through */
       case INTERP_LINEAR:
-         for (j = 0; j < NUM_CHANNELS; j++)
-            const_coeff(setup, &setup->coef[fragSlot], vertSlot, j);
+         const_coeff(setup, fragSlot, vertSlot);
          break;
       case INTERP_PERSPECTIVE:
-         for (j = 0; j < NUM_CHANNELS; j++)
-            point_persp_coeff(setup, setup->vprovoke,
-                              &setup->coef[fragSlot], vertSlot, j);
+         point_persp_coeff(setup, setup->vprovoke, fragSlot, vertSlot);
          break;
       case INTERP_POS:
          setup_fragcoord_coeff(setup, fragSlot);
@@ -1117,9 +1319,9 @@ setup_point( struct setup_context *setup,
       }
 
       if (lpfs->info.input_semantic_name[fragSlot] == TGSI_SEMANTIC_FACE) {
-         setup->coef[fragSlot].a0[0] = 1.0f - setup->facing;
-         setup->coef[fragSlot].dadx[0] = 0.0;
-         setup->coef[fragSlot].dady[0] = 0.0;
+         setup->coef.a0[1 + fragSlot][0] = 1.0f - setup->facing;
+         setup->coef.dadx[1 + fragSlot][0] = 0.0;
+         setup->coef.dady[1 + fragSlot][0] = 0.0;
       }
    }
 
@@ -1246,7 +1448,7 @@ setup_point( struct setup_context *setup,
    }
 }
 
-void setup_prepare( struct setup_context *setup )
+void llvmpipe_setup_prepare( struct setup_context *setup )
 {
    struct llvmpipe_context *lp = setup->llvmpipe;
 
@@ -1254,8 +1456,6 @@ void setup_prepare( struct setup_context *setup )
       llvmpipe_update_derived(lp);
    }
 
-   lp->quad.first->begin( lp->quad.first );
-
    if (lp->reduced_api_prim == PIPE_PRIM_TRIANGLES &&
        lp->rasterizer->fill_cw == PIPE_POLYGON_MODE_FILL &&
        lp->rasterizer->fill_ccw == PIPE_POLYGON_MODE_FILL) {
@@ -1270,25 +1470,29 @@ void setup_prepare( struct setup_context *setup )
 
 
 
-void setup_destroy_context( struct setup_context *setup )
+void llvmpipe_setup_destroy_context( struct setup_context *setup )
 {
-   FREE( setup );
+   align_free( setup );   /* pairs with align_malloc() in llvmpipe_setup_create_context */
 }
 
 
 /**
  * Create a new primitive setup/render stage.
  */
-struct setup_context *setup_create_context( struct llvmpipe_context *llvmpipe )
+struct setup_context *llvmpipe_setup_create_context( struct llvmpipe_context *llvmpipe )
 {
-   struct setup_context *setup = CALLOC_STRUCT(setup_context);
+   struct setup_context *setup;
    unsigned i;
 
+   setup = align_malloc(sizeof(struct setup_context), 16);
+   if (!setup)
+      return NULL;
+
+   memset(setup, 0, sizeof *setup);
    setup->llvmpipe = llvmpipe;
 
    for (i = 0; i < MAX_QUADS; i++) {
-      setup->quad[i].coef = setup->coef;
-      setup->quad[i].posCoef = &setup->posCoef;
+      setup->quad[i].coef = &setup->coef;
    }
 
    setup->span.left[0] = 1000000;     /* greater than right[0] */