Merge branch 'master' into radeon-rewrite
[mesa.git] / src / gallium / drivers / softpipe / sp_setup.c
index 543d86a5cb949b64448394c0753b0d460c3b1fe4..accc692b66fc0d6f268fa42f024e4688fe55a06d 100644 (file)
  * \author  Brian Paul
  */
 
-#include "sp_setup.h"
-
 #include "sp_context.h"
-#include "sp_headers.h"
+#include "sp_prim_setup.h"
 #include "sp_quad.h"
+#include "sp_quad_pipe.h"
+#include "sp_setup.h"
 #include "sp_state.h"
-#include "sp_prim_setup.h"
+#include "draw/draw_context.h"
 #include "draw/draw_private.h"
 #include "draw/draw_vertex.h"
-#include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
+#include "pipe/p_thread.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+
 
 #define DEBUG_VERTS 0
 #define DEBUG_FRAGS 0
@@ -58,6 +61,87 @@ struct edge {
    int lines;          /**< number of lines on this edge */
 };
 
+#if SP_NUM_QUAD_THREADS > 1
+
+/* Set to 1 if you want other threads to be instantly
+ * notified of pending jobs.
+ */
+#define INSTANT_NOTEMPTY_NOTIFY 0
+
+struct thread_info   /* per-worker record; its address is the parameter given to quad_thread() */
+{
+   struct setup_context *setup;   /**< owning setup context; the worker drains setup->que */
+   uint id;                       /**< worker index, passed to job routines as 'thread' */
+   pipe_thread handle;            /**< value returned by pipe_thread_create() */
+};
+
+struct quad_job;
+
+typedef void (* quad_job_routine)( struct setup_context *setup, uint thread, struct quad_job *job );
+
+struct quad_job   /* one queued unit of work: a copy of a quad's state plus the routine to run */
+{
+   struct quad_header_input input;   /**< copy of quad->input made by add_quad_job() */
+   struct quad_header_inout inout;   /**< copy of quad->inout made by add_quad_job() */
+   quad_job_routine routine;         /**< called by a worker as routine(setup, thread id, job) */
+};
+
+#define NUM_QUAD_JOBS 64
+
+struct quad_job_que   /* circular job que: empty when first == last, full when (last + 1) % NUM_QUAD_JOBS == first */
+{
+   struct quad_job jobs[NUM_QUAD_JOBS];   /**< ring storage for pending jobs */
+   uint first;                            /**< index of the next job a worker will take */
+   uint last;                             /**< index of the slot the next added job is written to */
+   pipe_mutex que_mutex;                  /**< mutex paired with the three condition variables below */
+   pipe_condvar que_notfull_condvar;      /**< add_quad_job() waits on this while the que is full */
+   pipe_condvar que_notempty_condvar;     /**< workers wait on this while the que is empty */
+   uint jobs_added;                       /**< incremented by add_quad_job() for every job submitted */
+   uint jobs_done;                        /**< incremented by quad_thread() after each job routine returns */
+   pipe_condvar que_done_condvar;         /**< signalled when jobs_done reaches jobs_added; see WAIT_FOR_COMPLETION */
+};
+
+/**
+ * Queue a copy of quad's input/inout plus 'routine', blocking while the que is
+ * full.  Without INSTANT_NOTEMPTY_NOTIFY it only wakes workers when it is full.
+ */
+static void
+add_quad_job( struct quad_job_que *que, struct quad_header *quad, quad_job_routine routine )
+{
+#if INSTANT_NOTEMPTY_NOTIFY
+   boolean empty;
+#endif
+
+   /* Wait for empty slot, see if the que is empty.
+    */
+   pipe_mutex_lock( que->que_mutex );
+   while ((que->last + 1) % NUM_QUAD_JOBS == que->first) {
+#if !INSTANT_NOTEMPTY_NOTIFY
+      pipe_condvar_broadcast( que->que_notempty_condvar );
+#endif
+      pipe_condvar_wait( que->que_notfull_condvar, que->que_mutex );
+   }
+#if INSTANT_NOTEMPTY_NOTIFY
+   empty = que->last == que->first;
+#endif
+   que->jobs_added++;
+
+   /* Submit new job.  Done with que_mutex held: quad threads read que->last
+    * and the job slots under that mutex, so unlocked writes here would race. */
+   que->jobs[que->last].input = quad->input;
+   que->jobs[que->last].inout = quad->inout;
+   que->jobs[que->last].routine = routine;
+   que->last = (que->last + 1) % NUM_QUAD_JOBS;
+
+#if INSTANT_NOTEMPTY_NOTIFY
+   /* If the que was empty, notify consumers there's a job to be done. */
+   if (empty)
+      pipe_condvar_broadcast( que->que_notempty_condvar );
+#endif
+   pipe_mutex_unlock( que->que_mutex );
+}
+
+#endif
 
 /**
  * Triangle setup info (derived from draw_stage).
@@ -85,6 +169,11 @@ struct setup_context {
    struct tgsi_interp_coef posCoef;  /* For Z, W */
    struct quad_header quad;
 
+#if SP_NUM_QUAD_THREADS > 1
+   struct quad_job_que que;
+   struct thread_info threads[SP_NUM_QUAD_THREADS];
+#endif
+
    struct {
       int left[2];   /**< [0] = row0, [1] = row1 */
       int right[2];
@@ -101,21 +190,84 @@ struct setup_context {
    unsigned winding;           /* which winding to cull */
 };
 
+#if SP_NUM_QUAD_THREADS > 1
+
+static PIPE_THREAD_ROUTINE( quad_thread, param )   /* worker entry point; param is a struct thread_info * */
+{
+   struct thread_info *info = (struct thread_info *) param;
+   struct quad_job_que *que = &info->setup->que;
 
+   for (;;) {   /* no exit condition is checked: the worker loops forever */
+      struct quad_job job;
+      boolean full;
 
+      /* Sleep until the que holds at least one job (first != last).
+       */
+      pipe_mutex_lock( que->que_mutex );
+      while (que->last == que->first)
+         pipe_condvar_wait( que->que_notempty_condvar, que->que_mutex );
+
+      /* Record whether the que is full before a job is removed.
+       */
+      full = (que->last + 1) % NUM_QUAD_JOBS == que->first;
 
+      /* Copy the oldest job out of the ring and advance 'first' past it.
+       */
+      job = que->jobs[que->first];
+      que->first = (que->first + 1) % NUM_QUAD_JOBS;
+
+      /* The que was full and a slot has just been freed: wake the producer.
+       */
+      if (full)
+         pipe_condvar_signal( que->que_notfull_condvar );
+      pipe_mutex_unlock( que->que_mutex );
+
+      job.routine( info->setup, info->id, &job );   /* runs on the local copy, with the mutex released */
+
+      /* Count the finished job; signal the producer once every added job is done.
+       */
+      pipe_mutex_lock( que->que_mutex );
+      que->jobs_done++;
+      if (que->jobs_added == que->jobs_done)
+         pipe_condvar_signal( que->que_done_condvar );
+      pipe_mutex_unlock( que->que_mutex );
+   }
+
+   return NULL;   /* follows an infinite loop that has no break */
+}
+
+#define WAIT_FOR_COMPLETION(setup) \
+   do {\
+      pipe_mutex_lock( setup->que.que_mutex );\
+      if (!INSTANT_NOTEMPTY_NOTIFY)\
+         pipe_condvar_broadcast( setup->que.que_notempty_condvar );\
+      while (setup->que.jobs_added != setup->que.jobs_done)\
+         pipe_condvar_wait( setup->que.que_done_condvar, setup->que.que_mutex );\
+      pipe_mutex_unlock( setup->que.que_mutex );\
+   } while (0)
+
+#else
+
+#define WAIT_FOR_COMPLETION(setup) ((void) 0)
+
+#endif
 
-static boolean cull_tri( struct setup_context *setup,
-                     float det )
+
+
+/**
+ * Do triangle cull test using tri determinant (sign indicates orientation)
+ * \return true if triangle is to be culled.
+ */
+static INLINE boolean
+cull_tri(const struct setup_context *setup, float det)
 {
-   if (det != 0) 
-   {   
+   if (det != 0) {   
       /* if (det < 0 then Z points toward camera and triangle is 
        * counter-clockwise winding.
        */
       unsigned winding = (det < 0) ? PIPE_WINDING_CCW : PIPE_WINDING_CW;
-      
-      if ((winding & setup->winding) == 0) 
+
+      if ((winding & setup->winding) == 0)
         return FALSE;
    }
 
@@ -130,7 +282,7 @@ static boolean cull_tri( struct setup_context *setup,
  * Clip setup->quad against the scissor/surface bounds.
  */
 static INLINE void
-quad_clip(struct setup_context *setup)
+quad_clip( struct setup_context *setup, struct quad_header *quad )
 {
    const struct pipe_scissor_state *cliprect = &setup->softpipe->cliprect;
    const int minx = (int) cliprect->minx;
@@ -138,22 +290,22 @@ quad_clip(struct setup_context *setup)
    const int miny = (int) cliprect->miny;
    const int maxy = (int) cliprect->maxy;
 
-   if (setup->quad.x0 >= maxx ||
-       setup->quad.y0 >= maxy ||
-       setup->quad.x0 + 1 < minx ||
-       setup->quad.y0 + 1 < miny) {
+   if (quad->input.x0 >= maxx ||
+       quad->input.y0 >= maxy ||
+       quad->input.x0 + 1 < minx ||
+       quad->input.y0 + 1 < miny) {
       /* totally clipped */
-      setup->quad.mask = 0x0;
+      quad->inout.mask = 0x0;
       return;
    }
-   if (setup->quad.x0 < minx)
-      setup->quad.mask &= (MASK_BOTTOM_RIGHT | MASK_TOP_RIGHT);
-   if (setup->quad.y0 < miny)
-      setup->quad.mask &= (MASK_BOTTOM_LEFT | MASK_BOTTOM_RIGHT);
-   if (setup->quad.x0 == maxx - 1)
-      setup->quad.mask &= (MASK_BOTTOM_LEFT | MASK_TOP_LEFT);
-   if (setup->quad.y0 == maxy - 1)
-      setup->quad.mask &= (MASK_TOP_LEFT | MASK_TOP_RIGHT);
+   if (quad->input.x0 < minx)
+      quad->inout.mask &= (MASK_BOTTOM_RIGHT | MASK_TOP_RIGHT);
+   if (quad->input.y0 < miny)
+      quad->inout.mask &= (MASK_BOTTOM_LEFT | MASK_BOTTOM_RIGHT);
+   if (quad->input.x0 == maxx - 1)
+      quad->inout.mask &= (MASK_BOTTOM_LEFT | MASK_TOP_LEFT);
+   if (quad->input.y0 == maxy - 1)
+      quad->inout.mask &= (MASK_TOP_LEFT | MASK_TOP_RIGHT);
 }
 
 
@@ -161,35 +313,59 @@ quad_clip(struct setup_context *setup)
  * Emit a quad (pass to next stage) with clipping.
  */
 static INLINE void
-clip_emit_quad(struct setup_context *setup)
+clip_emit_quad( struct setup_context *setup, struct quad_header *quad, uint thread )   /* 'thread' indexes sp->quad[] */
 {
-   quad_clip(setup);
-   if (setup->quad.mask) {
+   quad_clip( setup, quad );   /* may clear bits in quad->inout.mask */
+   if (quad->inout.mask) {   /* nothing is emitted when the mask is zero */
       struct softpipe_context *sp = setup->softpipe;
-      sp->quad.first->run(sp->quad.first, &setup->quad);
+      /* Hand the quad to the first stage of this thread's quad pipeline. */
+      sp->quad[thread].first->run( sp->quad[thread].first, quad );
    }
 }
 
+#if SP_NUM_QUAD_THREADS > 1
+
+/**
+ * Job routine: rebuild a quad header from a queued job and emit it with
+ * clipping on the given thread's quad pipeline.
+ */
+static void
+clip_emit_quad_job( struct setup_context *setup, uint thread, struct quad_job *job )
+{
+   struct quad_header local;   /* private to this job */
+
+   /* Coefficient pointers and attribute count are read from the setup context. */
+   local.nr_attrs = setup->quad.nr_attrs;
+   local.posCoef = setup->quad.posCoef;
+   local.coef = setup->quad.coef;
+
+   /* The input/inout state is the copy stored in the job. */
+   local.inout = job->inout;
+   local.input = job->input;
+
+   clip_emit_quad( setup, &local, thread );
+}
+
+#define CLIP_EMIT_QUAD(setup) add_quad_job( &(setup)->que, &(setup)->quad, clip_emit_quad_job ) /* threaded: queue the quad */
+
+#else
+
+#define CLIP_EMIT_QUAD(setup) clip_emit_quad( (setup), &(setup)->quad, 0 ) /* non-threaded: emit directly on pipeline 0 */
+
+#endif
 
 /**
  * Emit a quad (pass to next stage).  No clipping is done.
  */
 static INLINE void
-emit_quad( struct setup_context *setup, int x, int y, unsigned mask )
+emit_quad( struct setup_context *setup, struct quad_header *quad, uint thread )
 {
    struct softpipe_context *sp = setup->softpipe;
-   setup->quad.x0 = x;
-   setup->quad.y0 = y;
-   setup->quad.mask = mask;
+#if DEBUG_FRAGS
+   uint mask = quad->inout.mask;
+#endif
+
 #if DEBUG_FRAGS
    if (mask & 1) setup->numFragsEmitted++;
    if (mask & 2) setup->numFragsEmitted++;
    if (mask & 4) setup->numFragsEmitted++;
    if (mask & 8) setup->numFragsEmitted++;
 #endif
-   sp->quad.first->run(sp->quad.first, &setup->quad);
+   sp->quad[thread].first->run( sp->quad[thread].first, quad );
 #if DEBUG_FRAGS
-   mask = setup->quad.mask;
+   mask = quad->inout.mask;
    if (mask & 1) setup->numFragsWritten++;
    if (mask & 2) setup->numFragsWritten++;
    if (mask & 4) setup->numFragsWritten++;
@@ -197,6 +373,38 @@ emit_quad( struct setup_context *setup, int x, int y, unsigned mask )
 #endif
 }
 
+#if SP_NUM_QUAD_THREADS > 1
+
+/**
+ * Job routine: rebuild a quad header from a queued job and emit it, without
+ * clipping, on the given thread's quad pipeline.
+ */
+static void
+emit_quad_job( struct setup_context *setup, uint thread, struct quad_job *job )
+{
+   struct quad_header local;   /* private to this job */
+
+   /* Coefficient pointers and attribute count are read from the setup context. */
+   local.nr_attrs = setup->quad.nr_attrs;
+   local.posCoef = setup->quad.posCoef;
+   local.coef = setup->quad.coef;
+
+   /* The input/inout state is the copy stored in the job. */
+   local.inout = job->inout;
+   local.input = job->input;
+
+   emit_quad( setup, &local, thread );
+}
+
+#define EMIT_QUAD(ctx,qx,qy,qmask) do { /* threaded: stamp position/mask into (ctx)->quad, then queue it */\
+      (ctx)->quad.input.x0 = (qx);\
+      (ctx)->quad.input.y0 = (qy);\
+      (ctx)->quad.inout.mask = (qmask); /* parameter is not named 'mask', so the member name is left alone */\
+      add_quad_job( &(ctx)->que, &(ctx)->quad, emit_quad_job );\
+   } while (0)
+
+#else
+
+#define EMIT_QUAD(ctx,qx,qy,qmask) do { /* non-threaded: stamp position/mask into (ctx)->quad, then emit it */\
+      (ctx)->quad.input.x0 = (qx);\
+      (ctx)->quad.input.y0 = (qy);\
+      (ctx)->quad.inout.mask = (qmask); /* parameter is not named 'mask', so the member name is left alone */\
+      emit_quad( (ctx), &(ctx)->quad, 0 );\
+   } while (0)
+
+#endif
 
 /**
  * Given an X or Y coordinate, return the block/quad coordinate that it
@@ -236,7 +444,7 @@ static void flush_spans( struct setup_context *setup )
             mask |= MASK_TOP_RIGHT;
          if (x+1 >= xleft1 && x+1 < xright1)
             mask |= MASK_BOTTOM_RIGHT;
-         emit_quad( setup, x, setup->span.y, mask );
+         EMIT_QUAD( setup, x, setup->span.y, mask );
       }
       break;
 
@@ -250,7 +458,7 @@ static void flush_spans( struct setup_context *setup )
             mask |= MASK_TOP_LEFT;
          if (x+1 >= xleft0 && x+1 < xright0)
             mask |= MASK_TOP_RIGHT;
-         emit_quad( setup, x, setup->span.y, mask );
+         EMIT_QUAD( setup, x, setup->span.y, mask );
       }
       break;
 
@@ -264,7 +472,7 @@ static void flush_spans( struct setup_context *setup )
             mask |= MASK_BOTTOM_LEFT;
          if (x+1 >= xleft1 && x+1 < xright1)
             mask |= MASK_BOTTOM_RIGHT;
-         emit_quad( setup, x, setup->span.y, mask );
+         EMIT_QUAD( setup, x, setup->span.y, mask );
       }
       break;
 
@@ -284,14 +492,20 @@ static void print_vertex(const struct setup_context *setup,
                          const float (*v)[4])
 {
    int i;
-   debug_printf("Vertex: (%p)\n", v);
+   debug_printf("   Vertex: (%p)\n", v);
    for (i = 0; i < setup->quad.nr_attrs; i++) {
-      debug_printf("  %d: %f %f %f %f\n",  i,
+      debug_printf("     %d: %f %f %f %f\n",  i,
               v[i][0], v[i][1], v[i][2], v[i][3]);
+      if (util_is_inf_or_nan(v[i][0])) {
+         debug_printf("   NaN!\n");
+      }
    }
 }
 #endif
 
+/**
+ * \return FALSE if coords are inf/nan (cull the tri), TRUE otherwise
+ */
 static boolean setup_sort_vertices( struct setup_context *setup,
                                     float det,
                                     const float (*v0)[4],
@@ -369,17 +583,20 @@ static boolean setup_sort_vertices( struct setup_context *setup,
                            setup->ebot.dx * setup->emaj.dy);
 
       setup->oneoverarea = 1.0f / area;
+
       /*
       debug_printf("%s one-over-area %f  area %f  det %f\n",
                    __FUNCTION__, setup->oneoverarea, area, det );
       */
+      if (util_is_inf_or_nan(setup->oneoverarea))
+         return FALSE;
    }
 
    /* We need to know if this is a front or back-facing triangle for:
     *  - the GLSL gl_FrontFacing fragment attribute (bool)
     *  - two-sided stencil test
     */
-   setup->quad.facing = (det > 0.0) ^ (setup->softpipe->rasterizer->front_winding == PIPE_WINDING_CW);
+   setup->quad.input.facing = (det > 0.0) ^ (setup->softpipe->rasterizer->front_winding == PIPE_WINDING_CW);
 
    return TRUE;
 }
@@ -508,18 +725,9 @@ setup_fragcoord_coeff(struct setup_context *setup, uint slot)
    setup->coef[slot].dadx[0] = 1.0;
    setup->coef[slot].dady[0] = 0.0;
    /*Y*/
-   if (setup->softpipe->rasterizer->origin_lower_left) {
-      /* y=0=bottom */
-      const int winHeight = setup->softpipe->framebuffer.height;
-      setup->coef[slot].a0[1] = (float) (winHeight - 1);
-      setup->coef[slot].dady[1] = -1.0;
-   }
-   else {
-      /* y=0=top */
-      setup->coef[slot].a0[1] = 0.0;
-      setup->coef[slot].dady[1] = 1.0;
-   }
+   setup->coef[slot].a0[1] = 0.0;
    setup->coef[slot].dadx[1] = 0.0;
+   setup->coef[slot].dady[1] = 1.0;
    /*Z*/
    setup->coef[slot].a0[2] = setup->posCoef.a0[2];
    setup->coef[slot].dadx[2] = setup->posCoef.dadx[2];
@@ -551,10 +759,10 @@ static void setup_tri_coefficients( struct setup_context *setup )
    /* setup interpolation for all the remaining attributes:
     */
    for (fragSlot = 0; fragSlot < spfs->info.num_inputs; fragSlot++) {
-      const uint vertSlot = vinfo->src_index[fragSlot];
+      const uint vertSlot = vinfo->attrib[fragSlot].src_index;
       uint j;
 
-      switch (vinfo->interp_mode[fragSlot]) {
+      switch (vinfo->attrib[fragSlot].interp_mode) {
       case INTERP_CONSTANT:
          for (j = 0; j < NUM_CHANNELS; j++)
             const_coeff(setup, &setup->coef[fragSlot], vertSlot, j);
@@ -576,7 +784,7 @@ static void setup_tri_coefficients( struct setup_context *setup )
 
       if (spfs->info.input_semantic_name[fragSlot] == TGSI_SEMANTIC_FOG) {
          /* FOG.y = front/back facing  XXX fix this */
-         setup->coef[fragSlot].a0[1] = 1.0f - setup->quad.facing;
+         setup->coef[fragSlot].a0[1] = 1.0f - setup->quad.input.facing;
          setup->coef[fragSlot].dadx[1] = 0.0;
          setup->coef[fragSlot].dady[1] = 0.0;
       }
@@ -594,18 +802,18 @@ static void setup_tri_edges( struct setup_context *setup )
    float vmid_y = setup->vmid[0][1] - 0.5f;
    float vmax_y = setup->vmax[0][1] - 0.5f;
 
-   setup->emaj.sy = CEILF(vmin_y);
-   setup->emaj.lines = (int) CEILF(vmax_y - setup->emaj.sy);
+   setup->emaj.sy = ceilf(vmin_y);
+   setup->emaj.lines = (int) ceilf(vmax_y - setup->emaj.sy);
    setup->emaj.dxdy = setup->emaj.dx / setup->emaj.dy;
    setup->emaj.sx = vmin_x + (setup->emaj.sy - vmin_y) * setup->emaj.dxdy;
 
-   setup->etop.sy = CEILF(vmid_y);
-   setup->etop.lines = (int) CEILF(vmax_y - setup->etop.sy);
+   setup->etop.sy = ceilf(vmid_y);
+   setup->etop.lines = (int) ceilf(vmax_y - setup->etop.sy);
    setup->etop.dxdy = setup->etop.dx / setup->etop.dy;
    setup->etop.sx = vmid_x + (setup->etop.sy - vmid_y) * setup->etop.dxdy;
 
-   setup->ebot.sy = CEILF(vmin_y);
-   setup->ebot.lines = (int) CEILF(vmid_y - setup->ebot.sy);
+   setup->ebot.sy = ceilf(vmin_y);
+   setup->ebot.lines = (int) ceilf(vmid_y - setup->ebot.sy);
    setup->ebot.dxdy = setup->ebot.dx / setup->ebot.dy;
    setup->ebot.sx = vmin_x + (setup->ebot.sy - vmin_y) * setup->ebot.dxdy;
 }
@@ -741,11 +949,12 @@ void setup_tri( struct setup_context *setup,
    if (cull_tri( setup, det ))
       return;
 
-   setup_sort_vertices( setup, det, v0, v1, v2 );
+   if (!setup_sort_vertices( setup, det, v0, v1, v2 ))
+      return;
    setup_tri_coefficients( setup );
    setup_tri_edges( setup );
 
-   setup->quad.prim = PRIM_TRI;
+   setup->quad.input.prim = QUAD_PRIM_TRI;
 
    setup->span.y = 0;
    setup->span.y_flags = 0;
@@ -770,6 +979,8 @@ void setup_tri( struct setup_context *setup,
 
    flush_spans( setup );
 
+   WAIT_FOR_COMPLETION(setup);
+
 #if DEBUG_FRAGS
    printf("Tri: %u frags emitted, %u written\n",
           setup->numFragsEmitted,
@@ -784,7 +995,7 @@ void setup_tri( struct setup_context *setup,
  * for a line.
  */
 static void
-line_linear_coeff(struct setup_context *setup,
+line_linear_coeff(const struct setup_context *setup,
                   struct tgsi_interp_coef *coef,
                   uint vertSlot, uint i)
 {
@@ -804,9 +1015,9 @@ line_linear_coeff(struct setup_context *setup,
  * for a line.
  */
 static void
-line_persp_coeff(struct setup_context *setup,
-                  struct tgsi_interp_coef *coef,
-                  uint vertSlot, uint i)
+line_persp_coeff(const struct setup_context *setup,
+                 struct tgsi_interp_coef *coef,
+                 uint vertSlot, uint i)
 {
    /* XXX double-check/verify this arithmetic */
    const float a0 = setup->vmin[vertSlot][i] * setup->vmin[0][3];
@@ -826,7 +1037,7 @@ line_persp_coeff(struct setup_context *setup,
  * Compute the setup->coef[] array dadx, dady, a0 values.
  * Must be called after setup->vmin,vmax are initialized.
  */
-static INLINE void
+static INLINE boolean
 setup_line_coefficients(struct setup_context *setup,
                         const float (*v0)[4],
                         const float (*v1)[4])
@@ -835,6 +1046,7 @@ setup_line_coefficients(struct setup_context *setup,
    const struct sp_fragment_shader *spfs = softpipe->fs;
    const struct vertex_info *vinfo = softpipe_get_vertex_info(softpipe);
    uint fragSlot;
+   float area;
 
    /* use setup->vmin, vmax to point to vertices */
    setup->vprovoke = v1;
@@ -843,9 +1055,12 @@ setup_line_coefficients(struct setup_context *setup,
 
    setup->emaj.dx = setup->vmax[0][0] - setup->vmin[0][0];
    setup->emaj.dy = setup->vmax[0][1] - setup->vmin[0][1];
-   /* NOTE: this is not really 1/area */
-   setup->oneoverarea = 1.0f / (setup->emaj.dx * setup->emaj.dx +
-                                setup->emaj.dy * setup->emaj.dy);
+
+   /* NOTE: this is not really area but something proportional to it */
+   area = setup->emaj.dx * setup->emaj.dx + setup->emaj.dy * setup->emaj.dy;
+   if (area == 0.0f || util_is_inf_or_nan(area))
+      return FALSE;
+   setup->oneoverarea = 1.0f / area;
 
    /* z and w are done by linear interpolation:
     */
@@ -855,10 +1070,10 @@ setup_line_coefficients(struct setup_context *setup,
    /* setup interpolation for all the remaining attributes:
     */
    for (fragSlot = 0; fragSlot < spfs->info.num_inputs; fragSlot++) {
-      const uint vertSlot = vinfo->src_index[fragSlot];
+      const uint vertSlot = vinfo->attrib[fragSlot].src_index;
       uint j;
 
-      switch (vinfo->interp_mode[fragSlot]) {
+      switch (vinfo->attrib[fragSlot].interp_mode) {
       case INTERP_CONSTANT:
          for (j = 0; j < NUM_CHANNELS; j++)
             const_coeff(setup, &setup->coef[fragSlot], vertSlot, j);
@@ -880,11 +1095,12 @@ setup_line_coefficients(struct setup_context *setup,
 
       if (spfs->info.input_semantic_name[fragSlot] == TGSI_SEMANTIC_FOG) {
          /* FOG.y = front/back facing  XXX fix this */
-         setup->coef[fragSlot].a0[1] = 1.0f - setup->quad.facing;
+         setup->coef[fragSlot].a0[1] = 1.0f - setup->quad.input.facing;
          setup->coef[fragSlot].dadx[1] = 0.0;
          setup->coef[fragSlot].dady[1] = 0.0;
       }
    }
+   return TRUE;
 }
 
 
@@ -900,20 +1116,20 @@ plot(struct setup_context *setup, int x, int y)
    const int quadY = y - iy;
    const int mask = (1 << ix) << (2 * iy);
 
-   if (quadX != setup->quad.x0 ||
-       quadY != setup->quad.y0)
+   if (quadX != setup->quad.input.x0 ||
+       quadY != setup->quad.input.y0)
    {
       /* flush prev quad, start new quad */
 
-      if (setup->quad.x0 != -1)
-         clip_emit_quad(setup);
+      if (setup->quad.input.x0 != -1)
+         CLIP_EMIT_QUAD(setup);
 
-      setup->quad.x0 = quadX;
-      setup->quad.y0 = quadY;
-      setup->quad.mask = 0x0;
+      setup->quad.input.x0 = quadX;
+      setup->quad.input.y0 = quadY;
+      setup->quad.inout.mask = 0x0;
    }
 
-   setup->quad.mask |= mask;
+   setup->quad.inout.mask |= mask;
 }
 
 
@@ -947,7 +1163,13 @@ setup_line(struct setup_context *setup,
    if (dx == 0 && dy == 0)
       return;
 
-   setup_line_coefficients(setup, v0, v1);
+   if (!setup_line_coefficients(setup, v0, v1))
+      return;
+
+   assert(v0[0][0] < 1.0e9);
+   assert(v0[0][1] < 1.0e9);
+   assert(v1[0][0] < 1.0e9);
+   assert(v1[0][1] < 1.0e9);
 
    if (dx < 0) {
       dx = -dx;   /* make positive */
@@ -968,16 +1190,16 @@ setup_line(struct setup_context *setup,
    assert(dx >= 0);
    assert(dy >= 0);
 
-   setup->quad.x0 = setup->quad.y0 = -1;
-   setup->quad.mask = 0x0;
-   setup->quad.prim = PRIM_LINE;
+   setup->quad.input.x0 = setup->quad.input.y0 = -1;
+   setup->quad.inout.mask = 0x0;
+   setup->quad.input.prim = QUAD_PRIM_LINE;
    /* XXX temporary: set coverage to 1.0 so the line appears
     * if AA mode happens to be enabled.
     */
-   setup->quad.coverage[0] =
-   setup->quad.coverage[1] =
-   setup->quad.coverage[2] =
-   setup->quad.coverage[3] = 1.0;
+   setup->quad.input.coverage[0] =
+   setup->quad.input.coverage[1] =
+   setup->quad.input.coverage[2] =
+   setup->quad.input.coverage[3] = 1.0;
 
    if (dx > dy) {
       /*** X-major line ***/
@@ -1021,14 +1243,16 @@ setup_line(struct setup_context *setup,
    }
 
    /* draw final quad */
-   if (setup->quad.mask) {
-      clip_emit_quad(setup);
+   if (setup->quad.inout.mask) {
+      CLIP_EMIT_QUAD(setup);
    }
+
+   WAIT_FOR_COMPLETION(setup);
 }
 
 
 static void
-point_persp_coeff(struct setup_context *setup,
+point_persp_coeff(const struct setup_context *setup,
                   const float (*vert)[4],
                   struct tgsi_interp_coef *coef,
                   uint vertSlot, uint i)
@@ -1093,10 +1317,10 @@ setup_point( struct setup_context *setup,
    const_coeff(setup, &setup->posCoef, 0, 3);
 
    for (fragSlot = 0; fragSlot < spfs->info.num_inputs; fragSlot++) {
-      const uint vertSlot = vinfo->src_index[fragSlot];
+      const uint vertSlot = vinfo->attrib[fragSlot].src_index;
       uint j;
 
-      switch (vinfo->interp_mode[fragSlot]) {
+      switch (vinfo->attrib[fragSlot].interp_mode) {
       case INTERP_CONSTANT:
          /* fall-through */
       case INTERP_LINEAR:
@@ -1117,22 +1341,22 @@ setup_point( struct setup_context *setup,
 
       if (spfs->info.input_semantic_name[fragSlot] == TGSI_SEMANTIC_FOG) {
          /* FOG.y = front/back facing  XXX fix this */
-         setup->coef[fragSlot].a0[1] = 1.0f - setup->quad.facing;
+         setup->coef[fragSlot].a0[1] = 1.0f - setup->quad.input.facing;
          setup->coef[fragSlot].dadx[1] = 0.0;
          setup->coef[fragSlot].dady[1] = 0.0;
       }
    }
 
-   setup->quad.prim = PRIM_POINT;
+   setup->quad.input.prim = QUAD_PRIM_POINT;
 
    if (halfSize <= 0.5 && !round) {
       /* special case for 1-pixel points */
       const int ix = ((int) x) & 1;
       const int iy = ((int) y) & 1;
-      setup->quad.x0 = (int) x - ix;
-      setup->quad.y0 = (int) y - iy;
-      setup->quad.mask = (1 << ix) << (2 * iy);
-      clip_emit_quad(setup);
+      setup->quad.input.x0 = (int) x - ix;
+      setup->quad.input.y0 = (int) y - iy;
+      setup->quad.inout.mask = (1 << ix) << (2 * iy);
+      CLIP_EMIT_QUAD(setup);
    }
    else {
       if (round) {
@@ -1152,15 +1376,15 @@ setup_point( struct setup_context *setup,
             for (ix = ixmin; ix <= ixmax; ix += 2) {
                float dx, dy, dist2, cover;
 
-               setup->quad.mask = 0x0;
+               setup->quad.inout.mask = 0x0;
 
                dx = (ix + 0.5f) - x;
                dy = (iy + 0.5f) - y;
                dist2 = dx * dx + dy * dy;
                if (dist2 <= rmax2) {
                   cover = 1.0F - (dist2 - rmin2) * cscale;
-                  setup->quad.coverage[QUAD_TOP_LEFT] = MIN2(cover, 1.0f);
-                  setup->quad.mask |= MASK_TOP_LEFT;
+                  setup->quad.input.coverage[QUAD_TOP_LEFT] = MIN2(cover, 1.0f);
+                  setup->quad.inout.mask |= MASK_TOP_LEFT;
                }
 
                dx = (ix + 1.5f) - x;
@@ -1168,8 +1392,8 @@ setup_point( struct setup_context *setup,
                dist2 = dx * dx + dy * dy;
                if (dist2 <= rmax2) {
                   cover = 1.0F - (dist2 - rmin2) * cscale;
-                  setup->quad.coverage[QUAD_TOP_RIGHT] = MIN2(cover, 1.0f);
-                  setup->quad.mask |= MASK_TOP_RIGHT;
+                  setup->quad.input.coverage[QUAD_TOP_RIGHT] = MIN2(cover, 1.0f);
+                  setup->quad.inout.mask |= MASK_TOP_RIGHT;
                }
 
                dx = (ix + 0.5f) - x;
@@ -1177,8 +1401,8 @@ setup_point( struct setup_context *setup,
                dist2 = dx * dx + dy * dy;
                if (dist2 <= rmax2) {
                   cover = 1.0F - (dist2 - rmin2) * cscale;
-                  setup->quad.coverage[QUAD_BOTTOM_LEFT] = MIN2(cover, 1.0f);
-                  setup->quad.mask |= MASK_BOTTOM_LEFT;
+                  setup->quad.input.coverage[QUAD_BOTTOM_LEFT] = MIN2(cover, 1.0f);
+                  setup->quad.inout.mask |= MASK_BOTTOM_LEFT;
                }
 
                dx = (ix + 1.5f) - x;
@@ -1186,14 +1410,14 @@ setup_point( struct setup_context *setup,
                dist2 = dx * dx + dy * dy;
                if (dist2 <= rmax2) {
                   cover = 1.0F - (dist2 - rmin2) * cscale;
-                  setup->quad.coverage[QUAD_BOTTOM_RIGHT] = MIN2(cover, 1.0f);
-                  setup->quad.mask |= MASK_BOTTOM_RIGHT;
+                  setup->quad.input.coverage[QUAD_BOTTOM_RIGHT] = MIN2(cover, 1.0f);
+                  setup->quad.inout.mask |= MASK_BOTTOM_RIGHT;
                }
 
-               if (setup->quad.mask) {
-                  setup->quad.x0 = ix;
-                  setup->quad.y0 = iy;
-                  clip_emit_quad(setup);
+               if (setup->quad.inout.mask) {
+                  setup->quad.input.x0 = ix;
+                  setup->quad.input.y0 = iy;
+                  CLIP_EMIT_QUAD(setup);
                }
             }
          }
@@ -1237,14 +1461,16 @@ setup_point( struct setup_context *setup,
                   mask &= (MASK_BOTTOM_LEFT | MASK_TOP_LEFT);
                }
 
-               setup->quad.mask = mask;
-               setup->quad.x0 = ix;
-               setup->quad.y0 = iy;
-               clip_emit_quad(setup);
+               setup->quad.inout.mask = mask;
+               setup->quad.input.x0 = ix;
+               setup->quad.input.y0 = iy;
+               CLIP_EMIT_QUAD(setup);
             }
          }
       }
    }
+
+   WAIT_FOR_COMPLETION(setup);
 }
 
 void setup_prepare( struct setup_context *setup )
@@ -1256,20 +1482,11 @@ void setup_prepare( struct setup_context *setup )
       softpipe_update_derived(sp);
    }
 
-   /* Mark surfaces as defined now */
-   for (i = 0; i < sp->framebuffer.num_cbufs; i++){
-      if (sp->framebuffer.cbufs[i]) {
-         sp->framebuffer.cbufs[i]->status = PIPE_SURFACE_STATUS_DEFINED;
-      }
-   }
-   if (sp->framebuffer.zsbuf) {
-      sp->framebuffer.zsbuf->status = PIPE_SURFACE_STATUS_DEFINED;
-   }
+   /* Note: nr_attrs is only used for debugging (vertex printing) */
+   setup->quad.nr_attrs = draw_num_vs_outputs(sp->draw);
 
-   {
-      const struct sp_fragment_shader *fs = setup->softpipe->fs;
-      setup->quad.nr_attrs = fs->info.num_inputs;
-      sp->quad.first->begin(sp->quad.first);
+   for (i = 0; i < SP_NUM_QUAD_THREADS; i++) {
+      sp->quad[i].first->begin( sp->quad[i].first );
    }
 
    if (sp->reduced_api_prim == PIPE_PRIM_TRIANGLES &&
@@ -1298,11 +1515,31 @@ void setup_destroy_context( struct setup_context *setup )
 struct setup_context *setup_create_context( struct softpipe_context *softpipe )
 {
    struct setup_context *setup = CALLOC_STRUCT(setup_context);
+#if SP_NUM_QUAD_THREADS > 1
+   uint i;
+#endif
 
    setup->softpipe = softpipe;
 
    setup->quad.coef = setup->coef;
    setup->quad.posCoef = &setup->posCoef;
 
+#if SP_NUM_QUAD_THREADS > 1
+   setup->que.first = 0;
+   setup->que.last = 0;
+   pipe_mutex_init( setup->que.que_mutex );
+   pipe_condvar_init( setup->que.que_notfull_condvar );
+   pipe_condvar_init( setup->que.que_notempty_condvar );
+   setup->que.jobs_added = 0;
+   setup->que.jobs_done = 0;
+   pipe_condvar_init( setup->que.que_done_condvar );
+   for (i = 0; i < SP_NUM_QUAD_THREADS; i++) {
+      setup->threads[i].setup = setup;
+      setup->threads[i].id = i;
+      setup->threads[i].handle = pipe_thread_create( quad_thread, &setup->threads[i] );
+   }
+#endif
+
    return setup;
 }
+