X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fgallium%2Fdrivers%2Fllvmpipe%2Flp_setup.c;h=ffcbc9a379f08b562f12f233a76ac50afe5adb75;hb=ccc888c39ee8a7c460dca5b1b659d28dbbc4c689;hp=21e769931a92d82b38e8ba75028c6dc1a2178226;hpb=df1823ec5b2c64a9d1a5fc13be0e0f37741c3ffa;p=mesa.git

diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c b/src/gallium/drivers/llvmpipe/lp_setup.c
index 21e769931a9..ffcbc9a379f 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup.c
@@ -33,9 +33,7 @@
  */
 
 #include "lp_context.h"
-#include "lp_prim_setup.h"
 #include "lp_quad.h"
-#include "lp_quad_pipe.h"
 #include "lp_setup.h"
 #include "lp_state.h"
 #include "draw/draw_context.h"
@@ -45,6 +43,9 @@
 #include "pipe/p_thread.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
+#include "lp_bld_debug.h"
+#include "lp_tile_cache.h"
+#include "lp_tile_soa.h"
 
 
 #define DEBUG_VERTS 0
@@ -61,87 +62,9 @@ struct edge {
    int lines;		/**< number of lines on this edge */
 };
 
-#if LP_NUM_QUAD_THREADS > 1
 
-/* Set to 1 if you want other threads to be instantly
- * notified of pending jobs.
- */
-#define INSTANT_NOTEMPTY_NOTIFY 0
-
-struct thread_info
-{
-   struct setup_context *setup;
-   uint id;
-   pipe_thread handle;
-};
-
-struct quad_job;
+#define MAX_QUADS 16
 
-typedef void (* quad_job_routine)( struct setup_context *setup, uint thread, struct quad_job *job );
-
-struct quad_job
-{
-   struct quad_header_input input;
-   struct quad_header_inout inout;
-   quad_job_routine routine;
-};
-
-#define NUM_QUAD_JOBS 64
-
-struct quad_job_que
-{
-   struct quad_job jobs[NUM_QUAD_JOBS];
-   uint first;
-   uint last;
-   pipe_mutex que_mutex;
-   pipe_condvar que_notfull_condvar;
-   pipe_condvar que_notempty_condvar;
-   uint jobs_added;
-   uint jobs_done;
-   pipe_condvar que_done_condvar;
-};
-
-static void
-add_quad_job( struct quad_job_que *que, struct quad_header *quad, quad_job_routine routine )
-{
-#if INSTANT_NOTEMPTY_NOTIFY
-   boolean empty;
-#endif
-
-   /* Wait for empty slot, see if the que is empty.
-    */
-   pipe_mutex_lock( que->que_mutex );
-   while ((que->last + 1) % NUM_QUAD_JOBS == que->first) {
-#if !INSTANT_NOTEMPTY_NOTIFY
-      pipe_condvar_broadcast( que->que_notempty_condvar );
-#endif
-      pipe_condvar_wait( que->que_notfull_condvar, que->que_mutex );
-   }
-#if INSTANT_NOTEMPTY_NOTIFY
-   empty = que->last == que->first;
-#endif
-   que->jobs_added++;
-   pipe_mutex_unlock( que->que_mutex );
-
-   /* Submit new job.
-    */
-   que->jobs[que->last].input = quad->input;
-   que->jobs[que->last].inout = quad->inout;
-   que->jobs[que->last].routine = routine;
-   que->last = (que->last + 1) % NUM_QUAD_JOBS;
-
-#if INSTANT_NOTEMPTY_NOTIFY
-   /* If the que was empty, notify consumers there's a job to be done.
-    */
-   if (empty) {
-      pipe_mutex_lock( que->que_mutex );
-      pipe_condvar_broadcast( que->que_notempty_condvar );
-      pipe_mutex_unlock( que->que_mutex );
-   }
-#endif
-}
-
-#endif
 
 /**
  * Triangle setup info (derived from draw_stage).
@@ -164,15 +87,15 @@ struct setup_context {
    struct edge emaj;
 
    float oneoverarea;
+   int facing;
 
-   struct tgsi_interp_coef coef[PIPE_MAX_SHADER_INPUTS];
-   struct tgsi_interp_coef posCoef;  /* For Z, W */
-   struct quad_header quad;
+   float pixel_offset;
 
-#if LP_NUM_QUAD_THREADS > 1
-   struct quad_job_que que;
-   struct thread_info threads[LP_NUM_QUAD_THREADS];
-#endif
+   struct quad_header quad[MAX_QUADS];
+   struct quad_header *quad_ptrs[MAX_QUADS];
+   unsigned count;
+
+   struct quad_interp_coef coef;
 
    struct {
       int left[2];   /**< [0] = row0, [1] = row1 */
@@ -188,67 +111,84 @@ struct setup_context {
    unsigned winding;		/* which winding to cull */
 };
 
-#if LP_NUM_QUAD_THREADS > 1
-
-static PIPE_THREAD_ROUTINE( quad_thread, param )
-{
-   struct thread_info *info = (struct thread_info *) param;
-   struct quad_job_que *que = &info->setup->que;
-
-   for (;;) {
-      struct quad_job job;
-      boolean full;
 
-      /* Wait for an available job.
-       */
-      pipe_mutex_lock( que->que_mutex );
-      while (que->last == que->first)
-         pipe_condvar_wait( que->que_notempty_condvar, que->que_mutex );
-
-      /* See if the que is full.
-       */
-      full = (que->last + 1) % NUM_QUAD_JOBS == que->first;
 
-      /* Take a job and remove it from que.
-       */
-      job = que->jobs[que->first];
-      que->first = (que->first + 1) % NUM_QUAD_JOBS;
+/**
+ * Execute fragment shader for the four fragments in the quad.
+ */
+ALIGN_STACK
+static void
+shade_quads(struct llvmpipe_context *llvmpipe,
+            struct quad_header *quads[],
+            unsigned nr)
+{
+   struct lp_fragment_shader *fs = llvmpipe->fs;
+   struct quad_header *quad = quads[0];
+   const unsigned x = quad->input.x0;
+   const unsigned y = quad->input.y0;
+   uint8_t *tile;
+   uint8_t *color;
+   void *depth;
+   uint32_t ALIGN16_ATTRIB mask[4][NUM_CHANNELS];
+   unsigned chan_index;
+   unsigned q;
+
+   assert(fs->current);
+   if(!fs->current)
+      return;
 
-      /* Notify the producer if the que is not full.
-       */
-      if (full)
-         pipe_condvar_signal( que->que_notfull_condvar );
-      pipe_mutex_unlock( que->que_mutex );
+   /* Sanity checks */
+   assert(nr * QUAD_SIZE == TILE_VECTOR_HEIGHT * TILE_VECTOR_WIDTH);
+   assert(x % TILE_VECTOR_WIDTH == 0);
+   assert(y % TILE_VECTOR_HEIGHT == 0);
+   for (q = 0; q < nr; ++q) {
+      assert(quads[q]->input.x0 == x + q*2);
+      assert(quads[q]->input.y0 == y);
+   }
 
-      job.routine( info->setup, info->id, &job );
+   /* mask */
+   for (q = 0; q < 4; ++q)
+      for (chan_index = 0; chan_index < NUM_CHANNELS; ++chan_index)
+         mask[q][chan_index] = quads[q]->inout.mask & (1 << chan_index) ? ~0 : 0;
 
-      /* Notify the producer if that's the last finished job.
-       */
-      pipe_mutex_lock( que->que_mutex );
-      que->jobs_done++;
-      if (que->jobs_added == que->jobs_done)
-         pipe_condvar_signal( que->que_done_condvar );
-      pipe_mutex_unlock( que->que_mutex );
+   /* color buffer */
+   if(llvmpipe->framebuffer.nr_cbufs >= 1 &&
+      llvmpipe->framebuffer.cbufs[0]) {
+      tile = lp_get_cached_tile(llvmpipe->cbuf_cache[0], x, y);
+      color = &TILE_PIXEL(tile, x & (TILE_SIZE-1), y & (TILE_SIZE-1), 0);
    }
-
-   return NULL;
+   else
+      color = NULL;
+
+   /* depth buffer */
+   if(llvmpipe->zsbuf_map) {
+      assert((x % 2) == 0);
+      assert((y % 2) == 0);
+      depth = llvmpipe->zsbuf_map +
+              y*llvmpipe->zsbuf_transfer->stride +
+              2*x*llvmpipe->zsbuf_transfer->block.size;
+   }
+   else
+      depth = NULL;
+
+   /* XXX: This will most likely fail on 32bit x86 without -mstackrealign */
+   assert(lp_check_alignment(mask, 16));
+
+   assert(lp_check_alignment(depth, 16));
+   assert(lp_check_alignment(color, 16));
+   assert(lp_check_alignment(llvmpipe->jit_context.blend_color, 16));
+
+   /* run shader */
+   fs->current->jit_function( &llvmpipe->jit_context,
+                              x, y,
+                              quad->coef->a0,
+                              quad->coef->dadx,
+                              quad->coef->dady,
+                              &mask[0][0],
+                              color,
+                              depth);
 }
 
-#define WAIT_FOR_COMPLETION(setup) \
-   do {\
-      pipe_mutex_lock( setup->que.que_mutex );\
-      if (!INSTANT_NOTEMPTY_NOTIFY)\
-         pipe_condvar_broadcast( setup->que.que_notempty_condvar );\
-      while (setup->que.jobs_added != setup->que.jobs_done)\
-         pipe_condvar_wait( setup->que.que_done_condvar, setup->que.que_mutex );\
-      pipe_mutex_unlock( setup->que.que_mutex );\
-   } while (0)
-
-#else
-
-#define WAIT_FOR_COMPLETION(setup) ((void) 0)
-
-#endif
 
 
 
@@ -307,110 +247,63 @@ quad_clip( struct setup_context *setup, struct quad_header *quad )
 }
 
 
+
 /**
- * Emit a quad (pass to next stage) with clipping.
+ * Given an X or Y coordinate, return the block/quad coordinate that it
+ * belongs to.
  */
-static INLINE void
-clip_emit_quad( struct setup_context *setup, struct quad_header *quad, uint thread )
+static INLINE int block( int x )
 {
-   quad_clip( setup, quad );
-   if (quad->inout.mask) {
-      struct llvmpipe_context *lp = setup->llvmpipe;
-
-      lp->quad[thread].first->run( lp->quad[thread].first, quad );
-   }
+   return x & ~(2-1);
 }
 
-#if LP_NUM_QUAD_THREADS > 1
-
-static void
-clip_emit_quad_job( struct setup_context *setup, uint thread, struct quad_job *job )
+static INLINE int block_x( int x )
 {
-   struct quad_header quad;
-
-   quad.input = job->input;
-   quad.inout = job->inout;
-   quad.coef = setup->quad.coef;
-   quad.posCoef = setup->quad.posCoef;
-   quad.nr_attrs = setup->quad.nr_attrs;
-   clip_emit_quad( setup, &quad, thread );
+   return x & ~(TILE_VECTOR_WIDTH - 1);
 }
 
-#define CLIP_EMIT_QUAD(setup) add_quad_job( &setup->que, &setup->quad, clip_emit_quad_job )
-
-#else
-
-#define CLIP_EMIT_QUAD(setup) clip_emit_quad( setup, &setup->quad, 0 )
-
-#endif
 
 /**
- * Emit a quad (pass to next stage).  No clipping is done.
+ * Emit a quad (pass to next stage) with clipping.
  */
 static INLINE void
-emit_quad( struct setup_context *setup, struct quad_header *quad, uint thread )
+clip_emit_quad( struct setup_context *setup, struct quad_header *quad )
 {
-   struct llvmpipe_context *lp = setup->llvmpipe;
-#if DEBUG_FRAGS
-   uint mask = quad->inout.mask;
-#endif
-
-#if DEBUG_FRAGS
-   if (mask & 1) setup->numFragsEmitted++;
-   if (mask & 2) setup->numFragsEmitted++;
-   if (mask & 4) setup->numFragsEmitted++;
-   if (mask & 8) setup->numFragsEmitted++;
-#endif
-   lp->quad[thread].first->run( lp->quad[thread].first, quad );
-#if DEBUG_FRAGS
-   mask = quad->inout.mask;
-   if (mask & 1) setup->numFragsWritten++;
-   if (mask & 2) setup->numFragsWritten++;
-   if (mask & 4) setup->numFragsWritten++;
-   if (mask & 8) setup->numFragsWritten++;
-#endif
-}
-
-#if LP_NUM_QUAD_THREADS > 1
+   quad_clip( setup, quad );
 
-static void
-emit_quad_job( struct setup_context *setup, uint thread, struct quad_job *job )
-{
-   struct quad_header quad;
-
-   quad.input = job->input;
-   quad.inout = job->inout;
-   quad.coef = setup->quad.coef;
-   quad.posCoef = setup->quad.posCoef;
-   quad.nr_attrs = setup->quad.nr_attrs;
-   emit_quad( setup, &quad, thread );
-}
+   if (quad->inout.mask) {
+      struct llvmpipe_context *lp = setup->llvmpipe;
 
-#define EMIT_QUAD(setup,x,y,qmask) do {\
-      setup->quad.input.x0 = x;\
-      setup->quad.input.y0 = y;\
-      setup->quad.inout.mask = qmask;\
-      add_quad_job( &setup->que, &setup->quad, emit_quad_job );\
-   } while (0)
+#if 1
+      /* XXX: The blender expects 4 quads. This is far from efficient, but
+       * until we codegenerate single-quad variants of the fragment pipeline
+       * we need this hack. */
+      const unsigned nr_quads = TILE_VECTOR_HEIGHT*TILE_VECTOR_WIDTH/QUAD_SIZE;
+      struct quad_header quads[4];
+      struct quad_header *quad_ptrs[4];
+      int x0 = block_x(quad->input.x0);
+      unsigned i;
+
+      assert(nr_quads == 4);
+
+      for(i = 0; i < nr_quads; ++i) {
+         int x = x0 + 2*i;
+         if(x == quad->input.x0)
+            memcpy(&quads[i], quad, sizeof quads[i]);
+         else {
+            memset(&quads[i], 0, sizeof quads[i]);
+            quads[i].input.x0 = x;
+            quads[i].input.y0 = quad->input.y0;
+            quads[i].coef = quad->coef;
+         }
+         quad_ptrs[i] = &quads[i];
+      }
 
+      shade_quads( lp, quad_ptrs, nr_quads );
 #else
-
-#define EMIT_QUAD(setup,x,y,qmask) do {\
-      setup->quad.input.x0 = x;\
-      setup->quad.input.y0 = y;\
-      setup->quad.inout.mask = qmask;\
-      emit_quad( setup, &setup->quad, 0 );\
-   } while (0)
-
+      shade_quads( lp, &quad, 1 );
 #endif
-
-/**
- * Given an X or Y coordinate, return the block/quad coordinate that it
- * belongs to.
- */
-static INLINE int block( int x )
-{
-   return x & ~1;
+   }
 }
 
 
@@ -419,13 +312,14 @@ static INLINE int block( int x )
  */
 static void flush_spans( struct setup_context *setup )
 {
-   const int step = 30;
+   const int step = TILE_VECTOR_WIDTH;
    const int xleft0 = setup->span.left[0];
    const int xleft1 = setup->span.left[1];
    const int xright0 = setup->span.right[0];
    const int xright1 = setup->span.right[1];
 
-   int minleft = block(MIN2(xleft0, xleft1));
+
+   int minleft = block_x(MIN2(xleft0, xleft1));
    int maxright = MAX2(xright0, xright1);
    int x;
 
@@ -435,7 +329,9 @@ static void flush_spans( struct setup_context *setup )
       unsigned skip_right0 = CLAMP(x + step - xright0, 0, step);
       unsigned skip_right1 = CLAMP(x + step - xright1, 0, step);
       unsigned lx = x;
-      
+      const unsigned nr_quads = TILE_VECTOR_HEIGHT*TILE_VECTOR_WIDTH/QUAD_SIZE;
+      unsigned q = 0;
+
       unsigned skipmask_left0 = (1U << skip_left0) - 1U;
       unsigned skipmask_left1 = (1U << skip_left1) - 1U;
 
@@ -447,13 +343,20 @@ static void flush_spans( struct setup_context *setup )
       unsigned mask0 = ~skipmask_left0 & ~skipmask_right0;
       unsigned mask1 = ~skipmask_left1 & ~skipmask_right1;
 
-      while (mask0 | mask1) {
-         unsigned quadmask = (mask0 & 3) | ((mask1 & 3) << 2);
-         if (quadmask)
-            EMIT_QUAD( setup, lx, setup->span.y, quadmask );
-         mask0 >>= 2;
-         mask1 >>= 2;
-         lx += 2;
+      if (mask0 | mask1) {
+         for(q = 0; q < nr_quads; ++q) {
+            unsigned quadmask = (mask0 & 3) | ((mask1 & 3) << 2);
+            setup->quad[q].input.x0 = lx;
+            setup->quad[q].input.y0 = setup->span.y;
+            setup->quad[q].inout.mask = quadmask;
+            setup->quad_ptrs[q] = &setup->quad[q];
+            mask0 >>= 2;
+            mask1 >>= 2;
+            lx += 2;
+         }
+         assert(!(mask0 | mask1));
+
+         shade_quads(setup->llvmpipe, setup->quad_ptrs, nr_quads );
       }
    }
 
@@ -472,7 +375,7 @@ static void print_vertex(const struct setup_context *setup,
 {
    int i;
    debug_printf("   Vertex: (%p)\n", v);
-   for (i = 0; i < setup->quad.nr_attrs; i++) {
+   for (i = 0; i < setup->quad[0].nr_attrs; i++) {
       debug_printf("     %d: %f %f %f %f\n",  i,
               v[i][0], v[i][1], v[i][2], v[i][3]);
       if (util_is_inf_or_nan(v[i][0])) {
@@ -577,31 +480,21 @@ static boolean setup_sort_vertices( struct setup_context *setup,
     *  - the GLSL gl_FrontFacing fragment attribute (bool)
     *  - two-sided stencil test
     */
-   setup->quad.input.facing = (det > 0.0) ^ (setup->llvmpipe->rasterizer->front_winding == PIPE_WINDING_CW);
-
-   return TRUE;
-}
-
+   setup->facing = 
+      ((det > 0.0) ^ 
+       (setup->llvmpipe->rasterizer->front_winding == PIPE_WINDING_CW));
 
-/**
- * Compute a0 for a constant-valued coefficient (GL_FLAT shading).
- * The value value comes from vertex[slot][i].
- * The result will be put into setup->coef[slot].a0[i].
- * \param slot  which attribute slot
- * \param i  which component of the slot (0..3)
- */
-static void const_coeff( struct setup_context *setup,
-                         struct tgsi_interp_coef *coef,
-                         uint vertSlot, uint i)
-{
-   assert(i <= 3);
-
-   coef->dadx[i] = 0;
-   coef->dady[i] = 0;
-
-   /* need provoking vertex info!
+   /* Prepare pixel offset for rasterisation:
+    *  - pixel center (0.5, 0.5) for GL, or
+    *  - assume (0.0, 0.0) for other APIs.
     */
-   coef->a0[i] = setup->vprovoke[vertSlot][i];
+   if (setup->llvmpipe->rasterizer->gl_rasterization_rules) {
+      setup->pixel_offset = 0.5f;
+   } else {
+      setup->pixel_offset = 0.0f;
+   }
+
+   return TRUE;
 }
 
 
@@ -609,9 +502,8 @@ static void const_coeff( struct setup_context *setup,
  * Compute a0, dadx and dady for a linearly interpolated coefficient,
  * for a triangle.
  */
-static void tri_linear_coeff( struct setup_context *setup,
-                              struct tgsi_interp_coef *coef,
-                              uint vertSlot, uint i)
+static void tri_pos_coeff( struct setup_context *setup,
+                           uint vertSlot, unsigned i)
 {
    float botda = setup->vmid[vertSlot][i] - setup->vmin[vertSlot][i];
    float majda = setup->vmax[vertSlot][i] - setup->vmin[vertSlot][i];
@@ -622,12 +514,12 @@ static void tri_linear_coeff( struct setup_context *setup,
 
    assert(i <= 3);
 
-   coef->dadx[i] = dadx;
-   coef->dady[i] = dady;
+   setup->coef.dadx[0][i] = dadx;
+   setup->coef.dady[0][i] = dady;
 
    /* calculate a0 as the value which would be sampled for the
     * fragment at (0,0), taking into account that we want to sample at
-    * pixel centers, in other words (0.5, 0.5).
+    * pixel centers, in other words (pixel_offset, pixel_offset).
     *
     * this is neat but unfortunately not a good way to do things for
     * triangles with very large values of dadx or dady as it will
@@ -637,20 +529,111 @@ static void tri_linear_coeff( struct setup_context *setup,
     * to define a0 as the sample at a pixel center somewhere near vmin
     * instead - i'll switch to this later.
     */
-   coef->a0[i] = (setup->vmin[vertSlot][i] -
-                  (dadx * (setup->vmin[0][0] - 0.5f) +
-                   dady * (setup->vmin[0][1] - 0.5f)));
+   setup->coef.a0[0][i] = (setup->vmin[vertSlot][i] -
+                           (dadx * (setup->vmin[0][0] - setup->pixel_offset) +
+                            dady * (setup->vmin[0][1] - setup->pixel_offset)));
 
    /*
    debug_printf("attr[%d].%c: %f dx:%f dy:%f\n",
-		slot, "xyzw"[i],
-		setup->coef[slot].a0[i],
-		setup->coef[slot].dadx[i],
-		setup->coef[slot].dady[i]);
+                slot, "xyzw"[i],
+                setup->coef[slot].a0[i],
+                setup->coef[slot].dadx[i],
+                setup->coef[slot].dady[i]);
    */
 }
 
 
+/**
+ * Compute a0 for a constant-valued coefficient (GL_FLAT shading).
+ * The value value comes from vertex[slot][i].
+ * The result will be put into setup->coef[slot].a0[i].
+ * \param slot  which attribute slot
+ * \param i  which component of the slot (0..3)
+ */
+static void const_pos_coeff( struct setup_context *setup,
+                             uint vertSlot, unsigned i)
+{
+   setup->coef.dadx[0][i] = 0;
+   setup->coef.dady[0][i] = 0;
+
+   /* need provoking vertex info!
+    */
+   setup->coef.a0[0][i] = setup->vprovoke[vertSlot][i];
+}
+
+
+/**
+ * Compute a0 for a constant-valued coefficient (GL_FLAT shading).
+ * The value value comes from vertex[slot][i].
+ * The result will be put into setup->coef[slot].a0[i].
+ * \param slot  which attribute slot
+ * \param i  which component of the slot (0..3)
+ */
+static void const_coeff( struct setup_context *setup,
+                         unsigned attrib,
+                         uint vertSlot)
+{
+   unsigned i;
+   for (i = 0; i < NUM_CHANNELS; ++i) {
+      setup->coef.dadx[1 + attrib][i] = 0;
+      setup->coef.dady[1 + attrib][i] = 0;
+
+      /* need provoking vertex info!
+       */
+      setup->coef.a0[1 + attrib][i] = setup->vprovoke[vertSlot][i];
+   }
+}
+
+
+/**
+ * Compute a0, dadx and dady for a linearly interpolated coefficient,
+ * for a triangle.
+ */
+static void tri_linear_coeff( struct setup_context *setup,
+                              unsigned attrib,
+                              uint vertSlot)
+{
+   unsigned i;
+   for (i = 0; i < NUM_CHANNELS; ++i) {
+      float botda = setup->vmid[vertSlot][i] - setup->vmin[vertSlot][i];
+      float majda = setup->vmax[vertSlot][i] - setup->vmin[vertSlot][i];
+      float a = setup->ebot.dy * majda - botda * setup->emaj.dy;
+      float b = setup->emaj.dx * botda - majda * setup->ebot.dx;
+      float dadx = a * setup->oneoverarea;
+      float dady = b * setup->oneoverarea;
+
+      assert(i <= 3);
+
+      setup->coef.dadx[1 + attrib][i] = dadx;
+      setup->coef.dady[1 + attrib][i] = dady;
+
+      /* calculate a0 as the value which would be sampled for the
+       * fragment at (0,0), taking into account that we want to sample at
+       * pixel centers, in other words (0.5, 0.5).
+       *
+       * this is neat but unfortunately not a good way to do things for
+       * triangles with very large values of dadx or dady as it will
+       * result in the subtraction and re-addition from a0 of a very
+       * large number, which means we'll end up loosing a lot of the
+       * fractional bits and precision from a0.  the way to fix this is
+       * to define a0 as the sample at a pixel center somewhere near vmin
+       * instead - i'll switch to this later.
+       */
+      setup->coef.a0[1 + attrib][i] = (setup->vmin[vertSlot][i] -
+                     (dadx * (setup->vmin[0][0] - setup->pixel_offset) +
+                      dady * (setup->vmin[0][1] - setup->pixel_offset)));
+
+      /*
+      debug_printf("attr[%d].%c: %f dx:%f dy:%f\n",
+                   slot, "xyzw"[i],
+                   setup->coef[slot].a0[i],
+                   setup->coef[slot].dadx[i],
+                   setup->coef[slot].dady[i]);
+      */
+   }
+}
+
+
 /**
  * Compute a0, dadx and dady for a perspective-corrected interpolant,
  * for a triangle.
@@ -660,35 +643,38 @@ static void tri_linear_coeff( struct setup_context *setup,
  * divide the interpolated value by the interpolated W at that fragment.
  */
 static void tri_persp_coeff( struct setup_context *setup,
-                             struct tgsi_interp_coef *coef,
-                             uint vertSlot, uint i)
+                             unsigned attrib,
+                             uint vertSlot)
 {
-   /* premultiply by 1/w  (v[0][3] is always W):
-    */
-   float mina = setup->vmin[vertSlot][i] * setup->vmin[0][3];
-   float mida = setup->vmid[vertSlot][i] * setup->vmid[0][3];
-   float maxa = setup->vmax[vertSlot][i] * setup->vmax[0][3];
-   float botda = mida - mina;
-   float majda = maxa - mina;
-   float a = setup->ebot.dy * majda - botda * setup->emaj.dy;
-   float b = setup->emaj.dx * botda - majda * setup->ebot.dx;
-   float dadx = a * setup->oneoverarea;
-   float dady = b * setup->oneoverarea;
+   unsigned i;
+   for (i = 0; i < NUM_CHANNELS; ++i) {
+      /* premultiply by 1/w  (v[0][3] is always W):
+       */
+      float mina = setup->vmin[vertSlot][i] * setup->vmin[0][3];
+      float mida = setup->vmid[vertSlot][i] * setup->vmid[0][3];
+      float maxa = setup->vmax[vertSlot][i] * setup->vmax[0][3];
+      float botda = mida - mina;
+      float majda = maxa - mina;
+      float a = setup->ebot.dy * majda - botda * setup->emaj.dy;
+      float b = setup->emaj.dx * botda - majda * setup->ebot.dx;
+      float dadx = a * setup->oneoverarea;
+      float dady = b * setup->oneoverarea;
 
-   /*
-   debug_printf("tri persp %d,%d: %f %f %f\n", vertSlot, i,
-          	setup->vmin[vertSlot][i],
-          	setup->vmid[vertSlot][i],
-       		setup->vmax[vertSlot][i]
-          );
-   */
-   assert(i <= 3);
+      /*
+      debug_printf("tri persp %d,%d: %f %f %f\n", vertSlot, i,
+                   setup->vmin[vertSlot][i],
+                   setup->vmid[vertSlot][i],
+                   setup->vmax[vertSlot][i]
+             );
+      */
+      assert(i <= 3);
 
-   coef->dadx[i] = dadx;
-   coef->dady[i] = dady;
-   coef->a0[i] = (mina -
-                  (dadx * (setup->vmin[0][0] - 0.5f) +
-                   dady * (setup->vmin[0][1] - 0.5f)));
+      setup->coef.dadx[1 + attrib][i] = dadx;
+      setup->coef.dady[1 + attrib][i] = dady;
+      setup->coef.a0[1 + attrib][i] = (mina -
+                     (dadx * (setup->vmin[0][0] - setup->pixel_offset) +
+                      dady * (setup->vmin[0][1] - setup->pixel_offset)));
+   }
 }
 
 
@@ -702,21 +688,21 @@ static void
 setup_fragcoord_coeff(struct setup_context *setup, uint slot)
 {
    /*X*/
-   setup->coef[slot].a0[0] = 0;
-   setup->coef[slot].dadx[0] = 1.0;
-   setup->coef[slot].dady[0] = 0.0;
+   setup->coef.a0[1 + slot][0] = 0;
+   setup->coef.dadx[1 + slot][0] = 1.0;
+   setup->coef.dady[1 + slot][0] = 0.0;
    /*Y*/
-   setup->coef[slot].a0[1] = 0.0;
-   setup->coef[slot].dadx[1] = 0.0;
-   setup->coef[slot].dady[1] = 1.0;
+   setup->coef.a0[1 + slot][1] = 0.0;
+   setup->coef.dadx[1 + slot][1] = 0.0;
+   setup->coef.dady[1 + slot][1] = 1.0;
    /*Z*/
-   setup->coef[slot].a0[2] = setup->posCoef.a0[2];
-   setup->coef[slot].dadx[2] = setup->posCoef.dadx[2];
-   setup->coef[slot].dady[2] = setup->posCoef.dady[2];
+   setup->coef.a0[1 + slot][2] = setup->coef.a0[0][2];
+   setup->coef.dadx[1 + slot][2] = setup->coef.dadx[0][2];
+   setup->coef.dady[1 + slot][2] = setup->coef.dady[0][2];
    /*W*/
-   setup->coef[slot].a0[3] = setup->posCoef.a0[3];
-   setup->coef[slot].dadx[3] = setup->posCoef.dadx[3];
-   setup->coef[slot].dady[3] = setup->posCoef.dady[3];
+   setup->coef.a0[1 + slot][3] = setup->coef.a0[0][3];
+   setup->coef.dadx[1 + slot][3] = setup->coef.dadx[0][3];
+   setup->coef.dady[1 + slot][3] = setup->coef.dady[0][3];
 }
 
 
@@ -734,27 +720,23 @@ static void setup_tri_coefficients( struct setup_context *setup )
 
    /* z and w are done by linear interpolation:
     */
-   tri_linear_coeff(setup, &setup->posCoef, 0, 2);
-   tri_linear_coeff(setup, &setup->posCoef, 0, 3);
+   tri_pos_coeff(setup, 0, 2);
+   tri_pos_coeff(setup, 0, 3);
 
    /* setup interpolation for all the remaining attributes:
     */
    for (fragSlot = 0; fragSlot < lpfs->info.num_inputs; fragSlot++) {
       const uint vertSlot = vinfo->attrib[fragSlot].src_index;
-      uint j;
 
       switch (vinfo->attrib[fragSlot].interp_mode) {
       case INTERP_CONSTANT:
-         for (j = 0; j < NUM_CHANNELS; j++)
-            const_coeff(setup, &setup->coef[fragSlot], vertSlot, j);
+         const_coeff(setup, fragSlot, vertSlot);
          break;
       case INTERP_LINEAR:
-         for (j = 0; j < NUM_CHANNELS; j++)
-            tri_linear_coeff(setup, &setup->coef[fragSlot], vertSlot, j);
+         tri_linear_coeff(setup, fragSlot, vertSlot);
          break;
       case INTERP_PERSPECTIVE:
-         for (j = 0; j < NUM_CHANNELS; j++)
-            tri_persp_coeff(setup, &setup->coef[fragSlot], vertSlot, j);
+         tri_persp_coeff(setup, fragSlot, vertSlot);
          break;
       case INTERP_POS:
          setup_fragcoord_coeff(setup, fragSlot);
@@ -764,9 +746,9 @@ static void setup_tri_coefficients( struct setup_context *setup )
       }
 
       if (lpfs->info.input_semantic_name[fragSlot] == TGSI_SEMANTIC_FACE) {
-         setup->coef[fragSlot].a0[0] = 1.0f - setup->quad.input.facing;
-         setup->coef[fragSlot].dadx[0] = 0.0;
-         setup->coef[fragSlot].dady[0] = 0.0;
+         setup->coef.a0[1 + fragSlot][0] = 1.0f - setup->facing;
+         setup->coef.dadx[1 + fragSlot][0] = 0.0;
+         setup->coef.dady[1 + fragSlot][0] = 0.0;
       }
    }
 }
@@ -775,12 +757,12 @@ static void setup_tri_coefficients( struct setup_context *setup )
 
 static void setup_tri_edges( struct setup_context *setup )
 {
-   float vmin_x = setup->vmin[0][0] + 0.5f;
-   float vmid_x = setup->vmid[0][0] + 0.5f;
+   float vmin_x = setup->vmin[0][0] + setup->pixel_offset;
+   float vmid_x = setup->vmid[0][0] + setup->pixel_offset;
 
-   float vmin_y = setup->vmin[0][1] - 0.5f;
-   float vmid_y = setup->vmid[0][1] - 0.5f;
-   float vmax_y = setup->vmax[0][1] - 0.5f;
+   float vmin_y = setup->vmin[0][1] - setup->pixel_offset;
+   float vmid_y = setup->vmid[0][1] - setup->pixel_offset;
+   float vmax_y = setup->vmax[0][1] - setup->pixel_offset;
 
    setup->emaj.sy = ceilf(vmin_y);
    setup->emaj.lines = (int) ceilf(vmax_y - setup->emaj.sy);
@@ -897,7 +879,7 @@ calc_det( const float (*v0)[4],
 /**
  * Do setup for triangle rasterization, then render the triangle.
  */
-void setup_tri( struct setup_context *setup,
+void llvmpipe_setup_tri( struct setup_context *setup,
                 const float (*v0)[4],
                 const float (*v1)[4],
                 const float (*v2)[4] )
@@ -932,7 +914,7 @@ void setup_tri( struct setup_context *setup,
    setup_tri_coefficients( setup );
    setup_tri_edges( setup );
 
-   setup->quad.input.prim = QUAD_PRIM_TRI;
+   assert(setup->llvmpipe->reduced_prim == PIPE_PRIM_TRIANGLES);
 
    setup->span.y = 0;
    setup->span.right[0] = 0;
@@ -956,8 +938,6 @@ void setup_tri( struct setup_context *setup,
 
    flush_spans( setup );
 
-   WAIT_FOR_COMPLETION(setup);
-
 #if DEBUG_FRAGS
    printf("Tri: %u frags emitted, %u written\n",
           setup->numFragsEmitted,
@@ -972,18 +952,40 @@ void setup_tri( struct setup_context *setup,
  * for a line.
  */
 static void
-line_linear_coeff(const struct setup_context *setup,
-                  struct tgsi_interp_coef *coef,
-                  uint vertSlot, uint i)
+linear_pos_coeff(struct setup_context *setup,
+                 uint vertSlot, uint i)
 {
    const float da = setup->vmax[vertSlot][i] - setup->vmin[vertSlot][i];
    const float dadx = da * setup->emaj.dx * setup->oneoverarea;
    const float dady = da * setup->emaj.dy * setup->oneoverarea;
-   coef->dadx[i] = dadx;
-   coef->dady[i] = dady;
-   coef->a0[i] = (setup->vmin[vertSlot][i] -
-                  (dadx * (setup->vmin[0][0] - 0.5f) +
-                   dady * (setup->vmin[0][1] - 0.5f)));
+   setup->coef.dadx[0][i] = dadx;
+   setup->coef.dady[0][i] = dady;
+   setup->coef.a0[0][i] = (setup->vmin[vertSlot][i] -
+                           (dadx * (setup->vmin[0][0] - setup->pixel_offset) +
+                            dady * (setup->vmin[0][1] - setup->pixel_offset)));
+}
+
+
+/**
+ * Compute a0, dadx and dady for a linearly interpolated coefficient,
+ * for a line.
+ */
+static void
+line_linear_coeff(struct setup_context *setup,
+                  unsigned attrib,
+                  uint vertSlot)
+{
+   unsigned i;
+   for (i = 0; i < NUM_CHANNELS; ++i) {
+      const float da = setup->vmax[vertSlot][i] - setup->vmin[vertSlot][i];
+      const float dadx = da * setup->emaj.dx * setup->oneoverarea;
+      const float dady = da * setup->emaj.dy * setup->oneoverarea;
+      setup->coef.dadx[1 + attrib][i] = dadx;
+      setup->coef.dady[1 + attrib][i] = dady;
+      setup->coef.a0[1 + attrib][i] = (setup->vmin[vertSlot][i] -
+                     (dadx * (setup->vmin[0][0] - setup->pixel_offset) +
+                      dady * (setup->vmin[0][1] - setup->pixel_offset)));
+   }
 }
 
 
@@ -992,21 +994,24 @@ line_linear_coeff(const struct setup_context *setup,
  * for a line.
  */
 static void
-line_persp_coeff(const struct setup_context *setup,
-                 struct tgsi_interp_coef *coef,
-                 uint vertSlot, uint i)
+line_persp_coeff(struct setup_context *setup,
+                 unsigned attrib,
+                 uint vertSlot)
 {
-   /* XXX double-check/verify this arithmetic */
-   const float a0 = setup->vmin[vertSlot][i] * setup->vmin[0][3];
-   const float a1 = setup->vmax[vertSlot][i] * setup->vmax[0][3];
-   const float da = a1 - a0;
-   const float dadx = da * setup->emaj.dx * setup->oneoverarea;
-   const float dady = da * setup->emaj.dy * setup->oneoverarea;
-   coef->dadx[i] = dadx;
-   coef->dady[i] = dady;
-   coef->a0[i] = (setup->vmin[vertSlot][i] -
-                  (dadx * (setup->vmin[0][0] - 0.5f) +
-                   dady * (setup->vmin[0][1] - 0.5f)));
+   unsigned i;
+   for (i = 0; i < NUM_CHANNELS; ++i) {
+      /* XXX double-check/verify this arithmetic */
+      const float a0 = setup->vmin[vertSlot][i] * setup->vmin[0][3];
+      const float a1 = setup->vmax[vertSlot][i] * setup->vmax[0][3];
+      const float da = a1 - a0;
+      const float dadx = da * setup->emaj.dx * setup->oneoverarea;
+      const float dady = da * setup->emaj.dy * setup->oneoverarea;
+      setup->coef.dadx[1 + attrib][i] = dadx;
+      setup->coef.dady[1 + attrib][i] = dady;
+      setup->coef.a0[1 + attrib][i] = (setup->vmin[vertSlot][i] -
+                     (dadx * (setup->vmin[0][0] - setup->pixel_offset) +
+                      dady * (setup->vmin[0][1] - setup->pixel_offset)));
+   }
 }
 
 
@@ -1044,27 +1049,23 @@ setup_line_coefficients(struct setup_context *setup,
 
    /* z and w are done by linear interpolation:
     */
-   line_linear_coeff(setup, &setup->posCoef, 0, 2);
-   line_linear_coeff(setup, &setup->posCoef, 0, 3);
+   linear_pos_coeff(setup, 0, 2);
+   linear_pos_coeff(setup, 0, 3);
 
    /* setup interpolation for all the remaining attributes:
     */
    for (fragSlot = 0; fragSlot < lpfs->info.num_inputs; fragSlot++) {
       const uint vertSlot = vinfo->attrib[fragSlot].src_index;
-      uint j;
 
       switch (vinfo->attrib[fragSlot].interp_mode) {
       case INTERP_CONSTANT:
-         for (j = 0; j < NUM_CHANNELS; j++)
-            const_coeff(setup, &setup->coef[fragSlot], vertSlot, j);
+         const_coeff(setup, fragSlot, vertSlot);
          break;
       case INTERP_LINEAR:
-         for (j = 0; j < NUM_CHANNELS; j++)
-            line_linear_coeff(setup, &setup->coef[fragSlot], vertSlot, j);
+         line_linear_coeff(setup, fragSlot, vertSlot);
          break;
       case INTERP_PERSPECTIVE:
-         for (j = 0; j < NUM_CHANNELS; j++)
-            line_persp_coeff(setup, &setup->coef[fragSlot], vertSlot, j);
+         line_persp_coeff(setup, fragSlot, vertSlot);
          break;
       case INTERP_POS:
          setup_fragcoord_coeff(setup, fragSlot);
@@ -1074,9 +1075,9 @@ setup_line_coefficients(struct setup_context *setup,
       }
 
       if (lpfs->info.input_semantic_name[fragSlot] == TGSI_SEMANTIC_FACE) {
-         setup->coef[fragSlot].a0[0] = 1.0f - setup->quad.input.facing;
-         setup->coef[fragSlot].dadx[0] = 0.0;
-         setup->coef[fragSlot].dady[0] = 0.0;
+         setup->coef.a0[1 + fragSlot][0] = 1.0f - setup->facing;
+         setup->coef.dadx[1 + fragSlot][0] = 0.0;
+         setup->coef.dady[1 + fragSlot][0] = 0.0;
       }
    }
    return TRUE;
@@ -1095,20 +1096,20 @@ plot(struct setup_context *setup, int x, int y)
    const int quadY = y - iy;
    const int mask = (1 << ix) << (2 * iy);
 
-   if (quadX != setup->quad.input.x0 ||
-       quadY != setup->quad.input.y0)
+   if (quadX != setup->quad[0].input.x0 ||
+       quadY != setup->quad[0].input.y0)
    {
       /* flush prev quad, start new quad */
 
-      if (setup->quad.input.x0 != -1)
-         CLIP_EMIT_QUAD(setup);
+      if (setup->quad[0].input.x0 != -1)
+         clip_emit_quad( setup, &setup->quad[0] );
 
-      setup->quad.input.x0 = quadX;
-      setup->quad.input.y0 = quadY;
-      setup->quad.inout.mask = 0x0;
+      setup->quad[0].input.x0 = quadX;
+      setup->quad[0].input.y0 = quadY;
+      setup->quad[0].inout.mask = 0x0;
    }
 
-   setup->quad.inout.mask |= mask;
+   setup->quad[0].inout.mask |= mask;
 }
 
 
@@ -1118,7 +1119,7 @@ plot(struct setup_context *setup, int x, int y)
  * to handle stippling and wide lines.
  */
 void
-setup_line(struct setup_context *setup,
+llvmpipe_setup_line(struct setup_context *setup,
            const float (*v0)[4],
            const float (*v1)[4])
 {
@@ -1168,17 +1169,18 @@ setup_line(struct setup_context *setup,
 
    assert(dx >= 0);
    assert(dy >= 0);
+   assert(setup->llvmpipe->reduced_prim == PIPE_PRIM_LINES);
+
+   setup->quad[0].input.x0 = setup->quad[0].input.y0 = -1;
+   setup->quad[0].inout.mask = 0x0;
 
-   setup->quad.input.x0 = setup->quad.input.y0 = -1;
-   setup->quad.inout.mask = 0x0;
-   setup->quad.input.prim = QUAD_PRIM_LINE;
    /* XXX temporary: set coverage to 1.0 so the line appears
     * if AA mode happens to be enabled.
     */
-   setup->quad.input.coverage[0] =
-   setup->quad.input.coverage[1] =
-   setup->quad.input.coverage[2] =
-   setup->quad.input.coverage[3] = 1.0;
+   setup->quad[0].input.coverage[0] =
+   setup->quad[0].input.coverage[1] =
+   setup->quad[0].input.coverage[2] =
+   setup->quad[0].input.coverage[3] = 1.0;
 
    if (dx > dy) {
       /*** X-major line ***/
@@ -1222,24 +1224,24 @@ setup_line(struct setup_context *setup,
    }
 
    /* draw final quad */
-   if (setup->quad.inout.mask) {
-      CLIP_EMIT_QUAD(setup);
+   if (setup->quad[0].inout.mask) {
+      clip_emit_quad( setup, &setup->quad[0] );
    }
-
-   WAIT_FOR_COMPLETION(setup);
 }
 
 
 static void
-point_persp_coeff(const struct setup_context *setup,
+point_persp_coeff(struct setup_context *setup,
                   const float (*vert)[4],
-                  struct tgsi_interp_coef *coef,
-                  uint vertSlot, uint i)
+                  unsigned attrib,
+                  uint vertSlot)
 {
-   assert(i <= 3);
-   coef->dadx[i] = 0.0F;
-   coef->dady[i] = 0.0F;
-   coef->a0[i] = vert[vertSlot][i] * vert[0][3];
+   unsigned i;
+   for(i = 0; i < NUM_CHANNELS; ++i) {
+      setup->coef.dadx[1 + attrib][i] = 0.0F;
+      setup->coef.dady[1 + attrib][i] = 0.0F;
+      setup->coef.a0[1 + attrib][i] = vert[vertSlot][i] * vert[0][3];
+   }
 }
 
 
@@ -1249,7 +1251,7 @@ point_persp_coeff(const struct setup_context *setup,
  * XXX could optimize a lot for 1-pixel points.
  */
 void
-setup_point( struct setup_context *setup,
+llvmpipe_setup_point( struct setup_context *setup,
              const float (*v0)[4] )
 {
    struct llvmpipe_context *llvmpipe = setup->llvmpipe;
@@ -1273,6 +1275,8 @@ setup_point( struct setup_context *setup,
    if (llvmpipe->no_rast)
       return;
 
+   assert(setup->llvmpipe->reduced_prim == PIPE_PRIM_POINTS);
+
    /* For points, all interpolants are constant-valued.
     * However, for point sprites, we'll need to setup texcoords appropriately.
     * XXX: which coefficients are the texcoords???
@@ -1292,24 +1296,20 @@ setup_point( struct setup_context *setup,
    setup->vprovoke = v0;
 
    /* setup Z, W */
-   const_coeff(setup, &setup->posCoef, 0, 2);
-   const_coeff(setup, &setup->posCoef, 0, 3);
+   const_pos_coeff(setup, 0, 2);
+   const_pos_coeff(setup, 0, 3);
 
    for (fragSlot = 0; fragSlot < lpfs->info.num_inputs; fragSlot++) {
       const uint vertSlot = vinfo->attrib[fragSlot].src_index;
-      uint j;
 
       switch (vinfo->attrib[fragSlot].interp_mode) {
       case INTERP_CONSTANT:
          /* fall-through */
       case INTERP_LINEAR:
-         for (j = 0; j < NUM_CHANNELS; j++)
-            const_coeff(setup, &setup->coef[fragSlot], vertSlot, j);
+         const_coeff(setup, fragSlot, vertSlot);
          break;
       case INTERP_PERSPECTIVE:
-         for (j = 0; j < NUM_CHANNELS; j++)
-            point_persp_coeff(setup, setup->vprovoke,
-                              &setup->coef[fragSlot], vertSlot, j);
+         point_persp_coeff(setup, setup->vprovoke, fragSlot, vertSlot);
          break;
       case INTERP_POS:
          setup_fragcoord_coeff(setup, fragSlot);
@@ -1319,22 +1319,21 @@ setup_point( struct setup_context *setup,
       }
 
       if (lpfs->info.input_semantic_name[fragSlot] == TGSI_SEMANTIC_FACE) {
-         setup->coef[fragSlot].a0[0] = 1.0f - setup->quad.input.facing;
-         setup->coef[fragSlot].dadx[0] = 0.0;
-         setup->coef[fragSlot].dady[0] = 0.0;
+         setup->coef.a0[1 + fragSlot][0] = 1.0f - setup->facing;
+         setup->coef.dadx[1 + fragSlot][0] = 0.0;
+         setup->coef.dady[1 + fragSlot][0] = 0.0;
       }
    }
 
-   setup->quad.input.prim = QUAD_PRIM_POINT;
 
    if (halfSize <= 0.5 && !round) {
       /* special case for 1-pixel points */
       const int ix = ((int) x) & 1;
       const int iy = ((int) y) & 1;
-      setup->quad.input.x0 = (int) x - ix;
-      setup->quad.input.y0 = (int) y - iy;
-      setup->quad.inout.mask = (1 << ix) << (2 * iy);
-      CLIP_EMIT_QUAD(setup);
+      setup->quad[0].input.x0 = (int) x - ix;
+      setup->quad[0].input.y0 = (int) y - iy;
+      setup->quad[0].inout.mask = (1 << ix) << (2 * iy);
+      clip_emit_quad( setup, &setup->quad[0] );
    }
    else {
       if (round) {
@@ -1354,15 +1353,15 @@ setup_point( struct setup_context *setup,
             for (ix = ixmin; ix <= ixmax; ix += 2) {
                float dx, dy, dist2, cover;
 
-               setup->quad.inout.mask = 0x0;
+               setup->quad[0].inout.mask = 0x0;
 
                dx = (ix + 0.5f) - x;
                dy = (iy + 0.5f) - y;
                dist2 = dx * dx + dy * dy;
                if (dist2 <= rmax2) {
                   cover = 1.0F - (dist2 - rmin2) * cscale;
-                  setup->quad.input.coverage[QUAD_TOP_LEFT] = MIN2(cover, 1.0f);
-                  setup->quad.inout.mask |= MASK_TOP_LEFT;
+                  setup->quad[0].input.coverage[QUAD_TOP_LEFT] = MIN2(cover, 1.0f);
+                  setup->quad[0].inout.mask |= MASK_TOP_LEFT;
                }
 
                dx = (ix + 1.5f) - x;
@@ -1370,8 +1369,8 @@ setup_point( struct setup_context *setup,
                dist2 = dx * dx + dy * dy;
                if (dist2 <= rmax2) {
                   cover = 1.0F - (dist2 - rmin2) * cscale;
-                  setup->quad.input.coverage[QUAD_TOP_RIGHT] = MIN2(cover, 1.0f);
-                  setup->quad.inout.mask |= MASK_TOP_RIGHT;
+                  setup->quad[0].input.coverage[QUAD_TOP_RIGHT] = MIN2(cover, 1.0f);
+                  setup->quad[0].inout.mask |= MASK_TOP_RIGHT;
                }
 
                dx = (ix + 0.5f) - x;
@@ -1379,8 +1378,8 @@ setup_point( struct setup_context *setup,
                dist2 = dx * dx + dy * dy;
                if (dist2 <= rmax2) {
                   cover = 1.0F - (dist2 - rmin2) * cscale;
-                  setup->quad.input.coverage[QUAD_BOTTOM_LEFT] = MIN2(cover, 1.0f);
-                  setup->quad.inout.mask |= MASK_BOTTOM_LEFT;
+                  setup->quad[0].input.coverage[QUAD_BOTTOM_LEFT] = MIN2(cover, 1.0f);
+                  setup->quad[0].inout.mask |= MASK_BOTTOM_LEFT;
                }
 
                dx = (ix + 1.5f) - x;
@@ -1388,14 +1387,14 @@ setup_point( struct setup_context *setup,
                dist2 = dx * dx + dy * dy;
                if (dist2 <= rmax2) {
                   cover = 1.0F - (dist2 - rmin2) * cscale;
-                  setup->quad.input.coverage[QUAD_BOTTOM_RIGHT] = MIN2(cover, 1.0f);
-                  setup->quad.inout.mask |= MASK_BOTTOM_RIGHT;
+                  setup->quad[0].input.coverage[QUAD_BOTTOM_RIGHT] = MIN2(cover, 1.0f);
+                  setup->quad[0].inout.mask |= MASK_BOTTOM_RIGHT;
                }
 
-               if (setup->quad.inout.mask) {
-                  setup->quad.input.x0 = ix;
-                  setup->quad.input.y0 = iy;
-                  CLIP_EMIT_QUAD(setup);
+               if (setup->quad[0].inout.mask) {
+                  setup->quad[0].input.x0 = ix;
+                  setup->quad[0].input.y0 = iy;
+                  clip_emit_quad( setup, &setup->quad[0] );
                }
             }
          }
@@ -1439,34 +1438,24 @@ setup_point( struct setup_context *setup,
                   mask &= (MASK_BOTTOM_LEFT | MASK_TOP_LEFT);
                }
 
-               setup->quad.inout.mask = mask;
-               setup->quad.input.x0 = ix;
-               setup->quad.input.y0 = iy;
-               CLIP_EMIT_QUAD(setup);
+               setup->quad[0].inout.mask = mask;
+               setup->quad[0].input.x0 = ix;
+               setup->quad[0].input.y0 = iy;
+               clip_emit_quad( setup, &setup->quad[0] );
             }
          }
       }
    }
-
-   WAIT_FOR_COMPLETION(setup);
 }
 
-void setup_prepare( struct setup_context *setup )
+void llvmpipe_setup_prepare( struct setup_context *setup )
 {
    struct llvmpipe_context *lp = setup->llvmpipe;
-   unsigned i;
 
    if (lp->dirty) {
       llvmpipe_update_derived(lp);
    }
 
-   /* Note: nr_attrs is only used for debugging (vertex printing) */
-   setup->quad.nr_attrs = draw_num_vs_outputs(lp->draw);
-
-   for (i = 0; i < LP_NUM_QUAD_THREADS; i++) {
-      lp->quad[i].first->begin( lp->quad[i].first );
-   }
-
    if (lp->reduced_api_prim == PIPE_PRIM_TRIANGLES &&
        lp->rasterizer->fill_cw == PIPE_POLYGON_MODE_FILL &&
        lp->rasterizer->fill_ccw == PIPE_POLYGON_MODE_FILL) {
@@ -1481,46 +1470,34 @@ void setup_prepare( struct setup_context *setup )
 
 
 
-void setup_destroy_context( struct setup_context *setup )
+void llvmpipe_setup_destroy_context( struct setup_context *setup )
 {
-   FREE( setup );
+   align_free( setup );
 }
 
 
 /**
  * Create a new primitive setup/render stage.
  */
-struct setup_context *setup_create_context( struct llvmpipe_context *llvmpipe )
+struct setup_context *llvmpipe_setup_create_context( struct llvmpipe_context *llvmpipe )
 {
-   struct setup_context *setup = CALLOC_STRUCT(setup_context);
-#if LP_NUM_QUAD_THREADS > 1
-   uint i;
-#endif
+   struct setup_context *setup;
+   unsigned i;
 
+   setup = align_malloc(sizeof(struct setup_context), 16);
+   if (!setup)
+      return NULL;
+
+   memset(setup, 0, sizeof *setup);
    setup->llvmpipe = llvmpipe;
 
-   setup->quad.coef = setup->coef;
-   setup->quad.posCoef = &setup->posCoef;
+   for (i = 0; i < MAX_QUADS; i++) {
+      setup->quad[i].coef = &setup->coef;
+   }
 
    setup->span.left[0] = 1000000;     /* greater than right[0] */
    setup->span.left[1] = 1000000;     /* greater than right[1] */
 
-#if LP_NUM_QUAD_THREADS > 1
-   setup->que.first = 0;
-   setup->que.last = 0;
-   pipe_mutex_init( setup->que.que_mutex );
-   pipe_condvar_init( setup->que.que_notfull_condvar );
-   pipe_condvar_init( setup->que.que_notempty_condvar );
-   setup->que.jobs_added = 0;
-   setup->que.jobs_done = 0;
-   pipe_condvar_init( setup->que.que_done_condvar );
-   for (i = 0; i < LP_NUM_QUAD_THREADS; i++) {
-      setup->threads[i].setup = setup;
-      setup->threads[i].id = i;
-      setup->threads[i].handle = pipe_thread_create( quad_thread, &setup->threads[i] );
-   }
-#endif
-
    return setup;
 }