From bdbb4beb21876010b14785569a920fa65a67d1ad Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Fri, 24 Jul 2009 16:49:35 +0100
Subject: [PATCH] llvmpipe:  expand quad pipeline to process >1 quad at a time

This is part one -- we still only pass a single quad down, but
the code can now cope with more.  The quads must all be from the same
tile.
---
 .../drivers/llvmpipe/lp_quad_alpha_test.c     | 106 +--
 src/gallium/drivers/llvmpipe/lp_quad_blend.c  | 730 +++++++++---------
 .../drivers/llvmpipe/lp_quad_colormask.c      |  15 +-
 .../drivers/llvmpipe/lp_quad_coverage.c       |  48 +-
 .../drivers/llvmpipe/lp_quad_depth_test.c     |  23 +-
 src/gallium/drivers/llvmpipe/lp_quad_earlyz.c |  28 +-
 src/gallium/drivers/llvmpipe/lp_quad_fs.c     |  40 +-
 .../drivers/llvmpipe/lp_quad_occlusion.c      |  10 +-
 src/gallium/drivers/llvmpipe/lp_quad_output.c |  49 +-
 src/gallium/drivers/llvmpipe/lp_quad_pipe.c   |  88 +--
 src/gallium/drivers/llvmpipe/lp_quad_pipe.h   |   4 +-
 .../drivers/llvmpipe/lp_quad_stencil.c        | 185 ++---
 .../drivers/llvmpipe/lp_quad_stipple.c        |  48 +-
 src/gallium/drivers/llvmpipe/lp_setup.c       |   4 +-
 14 files changed, 745 insertions(+), 633 deletions(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_quad_alpha_test.c b/src/gallium/drivers/llvmpipe/lp_quad_alpha_test.c
index eea4ef9c85f..947daf56955 100644
--- a/src/gallium/drivers/llvmpipe/lp_quad_alpha_test.c
+++ b/src/gallium/drivers/llvmpipe/lp_quad_alpha_test.c
@@ -9,76 +9,80 @@
 #include "pipe/p_defines.h"
 #include "util/u_memory.h"
 
+#define ALPHATEST( FUNC, COMP ) /* emits alpha_test_quads_FUNC() */     \
+   static void                                                          \
+   alpha_test_quads_##FUNC( struct quad_stage *qs,                      \
+                           struct quad_header *quads[],                 \
+                           unsigned nr )                                \
+   {                                                                    \
+      const float ref = qs->llvmpipe->depth_stencil->alpha.ref_value;   \
+      const uint cbuf = 0; /* only output[0].alpha is tested */         \
+      unsigned pass_nr = 0; /* quads with live pixels */                \
+      unsigned i;                                                       \
+                                                                        \
+      for (i = 0; i < nr; i++) {                                        \
+         const float *aaaa = quads[i]->output.color[cbuf][3];           \
+         unsigned passMask = 0;                                         \
+                                                                        \
+         if (aaaa[0] COMP ref) passMask |= (1 << 0);                    \
+         if (aaaa[1] COMP ref) passMask |= (1 << 1);                    \
+         if (aaaa[2] COMP ref) passMask |= (1 << 2);                    \
+         if (aaaa[3] COMP ref) passMask |= (1 << 3);                    \
+                                                                        \
+         quads[i]->inout.mask &= passMask; /* drop failed pixels */     \
+                                                                        \
+         if (quads[i]->inout.mask)                                      \
+            quads[pass_nr++] = quads[i]; /* compact in place */         \
+      }                                                                 \
+                                                                        \
+      if (pass_nr)                                                      \
+         qs->next->run(qs->next, quads, pass_nr); /* survivors only */  \
+   }
+
+
+ALPHATEST( LESS,     < )
+ALPHATEST( EQUAL,    == )
+ALPHATEST( LEQUAL,   <= )
+ALPHATEST( GREATER,  > )
+ALPHATEST( NOTEQUAL, != )
+ALPHATEST( GEQUAL,   >= )
 
+
+/* Run the batch through the alpha_test_quads_* variant for the current func.
+ * XXX: Incorporate into shader using KILP. */
 static void
-alpha_test_quad(struct quad_stage *qs, struct quad_header *quad)
+alpha_test_quad(struct quad_stage *qs, 
+                struct quad_header *quads[], 
+                unsigned nr)
 {
-   struct llvmpipe_context *llvmpipe = qs->llvmpipe;
-   const float ref = llvmpipe->depth_stencil->alpha.ref_value;
-   unsigned passMask = 0x0, j;
-   const uint cbuf = 0; /* only output[0].alpha is tested */
-   const float *aaaa = quad->output.color[cbuf][3];
-
-   switch (llvmpipe->depth_stencil->alpha.func) {
-   case PIPE_FUNC_NEVER:
-      break;
+   switch (qs->llvmpipe->depth_stencil->alpha.func) {
    case PIPE_FUNC_LESS:
-      /*
-       * If mask were an array [4] we could do this SIMD-style:
-       * passMask = (quad->outputs.color[0][3] <= vec4(ref));
-       */
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (aaaa[j] < ref) {
-            passMask |= (1 << j);
-         }
-      }
+      alpha_test_quads_LESS( qs, quads, nr );
       break;
    case PIPE_FUNC_EQUAL:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (aaaa[j] == ref) {
-            passMask |= (1 << j);
-         }
-      }
+      alpha_test_quads_EQUAL( qs, quads, nr );
       break;
    case PIPE_FUNC_LEQUAL:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (aaaa[j] <= ref) {
-            passMask |= (1 << j);
-         }
-      }
+      alpha_test_quads_LEQUAL( qs, quads, nr );
       break;
    case PIPE_FUNC_GREATER:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (aaaa[j] > ref) {
-            passMask |= (1 << j);
-         }
-      }
+      alpha_test_quads_GREATER( qs, quads, nr );
       break;
    case PIPE_FUNC_NOTEQUAL:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (aaaa[j] != ref) {
-            passMask |= (1 << j);
-         }
-      }
+      alpha_test_quads_NOTEQUAL( qs, quads, nr );
       break;
    case PIPE_FUNC_GEQUAL:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (aaaa[j] >= ref) {
-            passMask |= (1 << j);
-         }
-      }
+      alpha_test_quads_GEQUAL( qs, quads, nr );
       break;
    case PIPE_FUNC_ALWAYS:
-      passMask = MASK_ALL;
+      assert(0); /* should be caught earlier */
+      qs->next->run(qs->next, quads, nr); /* if the assert is compiled out, pass all quads through */
       break;
+   case PIPE_FUNC_NEVER:
    default:
-      assert(0);
+      assert(0); /* should be caught earlier */
+      return; /* nothing is forwarded: the whole batch is dropped */
    }
-
-   quad->inout.mask &= passMask;
-
-   if (quad->inout.mask)
-      qs->next->run(qs->next, quad);
 }
 
 
diff --git a/src/gallium/drivers/llvmpipe/lp_quad_blend.c b/src/gallium/drivers/llvmpipe/lp_quad_blend.c
index 98603be52e4..6beb9647399 100644
--- a/src/gallium/drivers/llvmpipe/lp_quad_blend.c
+++ b/src/gallium/drivers/llvmpipe/lp_quad_blend.c
@@ -117,10 +117,12 @@ do { \
 
 
 static void
-logicop_quad(struct quad_stage *qs, struct quad_header *quad)
+logicop_quad(struct quad_stage *qs, 
+             struct quad_header *quads[],
+             unsigned nr)
 {
    struct llvmpipe_context *llvmpipe = qs->llvmpipe;
    uint cbuf;
 
    /* loop over colorbuffer outputs */
    for (cbuf = 0; cbuf < llvmpipe->framebuffer.nr_cbufs; cbuf++) {
@@ -129,165 +131,165 @@ logicop_quad(struct quad_stage *qs, struct quad_header *quad)
       uint *src4 = (uint *) src;
       uint *dst4 = (uint *) dst;
       uint *res4 = (uint *) res;
-      struct llvmpipe_cached_tile *
-         tile = lp_get_cached_tile(llvmpipe->cbuf_cache[cbuf],
-                                   quad->input.x0, quad->input.y0);
-      float (*quadColor)[4] = quad->output.color[cbuf];
+      struct llvmpipe_cached_tile *tile = /* per cbuf; quads share a tile */
+         lp_get_cached_tile(llvmpipe->cbuf_cache[cbuf],
+                            quads[0]->input.x0, quads[0]->input.y0);
+      uint q; /* quad index; i is reused as the channel index below */
       uint i, j;
 
-      /* get/swizzle dest colors */
-      for (j = 0; j < QUAD_SIZE; j++) {
-         int x = (quad->input.x0 & (TILE_SIZE-1)) + (j & 1);
-         int y = (quad->input.y0 & (TILE_SIZE-1)) + (j >> 1);
-         for (i = 0; i < 4; i++) {
-            dest[i][j] = tile->data.color[y][x][i];
+      for (q = 0; q < nr; q++) {
+         struct quad_header *quad = quads[q];
+         float (*quadColor)[4] = quad->output.color[cbuf];
+
+         /* get/swizzle dest colors */
+         for (j = 0; j < QUAD_SIZE; j++) {
+            int x = (quad->input.x0 & (TILE_SIZE-1)) + (j & 1);
+            int y = (quad->input.y0 & (TILE_SIZE-1)) + (j >> 1);
+            for (i = 0; i < 4; i++) {
+               dest[i][j] = tile->data.color[y][x][i];
+            }
          }
-      }
 
-      /* convert to ubyte */
-      for (j = 0; j < 4; j++) { /* loop over R,G,B,A channels */
-         dst[j][0] = float_to_ubyte(dest[j][0]); /* P0 */
-         dst[j][1] = float_to_ubyte(dest[j][1]); /* P1 */
-         dst[j][2] = float_to_ubyte(dest[j][2]); /* P2 */
-         dst[j][3] = float_to_ubyte(dest[j][3]); /* P3 */
-
-         src[j][0] = float_to_ubyte(quadColor[j][0]); /* P0 */
-         src[j][1] = float_to_ubyte(quadColor[j][1]); /* P1 */
-         src[j][2] = float_to_ubyte(quadColor[j][2]); /* P2 */
-         src[j][3] = float_to_ubyte(quadColor[j][3]); /* P3 */
-      }
+         /* convert to ubyte */
+         for (j = 0; j < 4; j++) { /* loop over R,G,B,A channels */
+            dst[j][0] = float_to_ubyte(dest[j][0]); /* P0 */
+            dst[j][1] = float_to_ubyte(dest[j][1]); /* P1 */
+            dst[j][2] = float_to_ubyte(dest[j][2]); /* P2 */
+            dst[j][3] = float_to_ubyte(dest[j][3]); /* P3 */
+
+            src[j][0] = float_to_ubyte(quadColor[j][0]); /* P0 */
+            src[j][1] = float_to_ubyte(quadColor[j][1]); /* P1 */
+            src[j][2] = float_to_ubyte(quadColor[j][2]); /* P2 */
+            src[j][3] = float_to_ubyte(quadColor[j][3]); /* P3 */
+         }
 
-      switch (llvmpipe->blend->logicop_func) {
-      case PIPE_LOGICOP_CLEAR:
-         for (j = 0; j < 4; j++)
-            res4[j] = 0;
-         break;
-      case PIPE_LOGICOP_NOR:
-         for (j = 0; j < 4; j++)
-            res4[j] = ~(src4[j] | dst4[j]);
-         break;
-      case PIPE_LOGICOP_AND_INVERTED:
-         for (j = 0; j < 4; j++)
-            res4[j] = ~src4[j] & dst4[j];
-         break;
-      case PIPE_LOGICOP_COPY_INVERTED:
-         for (j = 0; j < 4; j++)
-            res4[j] = ~src4[j];
-         break;
-      case PIPE_LOGICOP_AND_REVERSE:
-         for (j = 0; j < 4; j++)
-            res4[j] = src4[j] & ~dst4[j];
-         break;
-      case PIPE_LOGICOP_INVERT:
-         for (j = 0; j < 4; j++)
-            res4[j] = ~dst4[j];
-         break;
-      case PIPE_LOGICOP_XOR:
-         for (j = 0; j < 4; j++)
-            res4[j] = dst4[j] ^ src4[j];
-         break;
-      case PIPE_LOGICOP_NAND:
-         for (j = 0; j < 4; j++)
-            res4[j] = ~(src4[j] & dst4[j]);
-         break;
-      case PIPE_LOGICOP_AND:
-         for (j = 0; j < 4; j++)
-            res4[j] = src4[j] & dst4[j];
-         break;
-      case PIPE_LOGICOP_EQUIV:
-         for (j = 0; j < 4; j++)
-            res4[j] = ~(src4[j] ^ dst4[j]);
-         break;
-      case PIPE_LOGICOP_NOOP:
-         for (j = 0; j < 4; j++)
-            res4[j] = dst4[j];
-         break;
-      case PIPE_LOGICOP_OR_INVERTED:
-         for (j = 0; j < 4; j++)
-            res4[j] = ~src4[j] | dst4[j];
-         break;
-      case PIPE_LOGICOP_COPY:
-         for (j = 0; j < 4; j++)
-            res4[j] = src4[j];
-         break;
-      case PIPE_LOGICOP_OR_REVERSE:
-         for (j = 0; j < 4; j++)
-            res4[j] = src4[j] | ~dst4[j];
-         break;
-      case PIPE_LOGICOP_OR:
-         for (j = 0; j < 4; j++)
-            res4[j] = src4[j] | dst4[j];
-         break;
-      case PIPE_LOGICOP_SET:
-         for (j = 0; j < 4; j++)
-            res4[j] = ~0;
-         break;
-      default:
-         assert(0);
-      }
+         switch (llvmpipe->blend->logicop_func) {
+         case PIPE_LOGICOP_CLEAR:
+            for (j = 0; j < 4; j++)
+               res4[j] = 0;
+            break;
+         case PIPE_LOGICOP_NOR:
+            for (j = 0; j < 4; j++)
+               res4[j] = ~(src4[j] | dst4[j]);
+            break;
+         case PIPE_LOGICOP_AND_INVERTED:
+            for (j = 0; j < 4; j++)
+               res4[j] = ~src4[j] & dst4[j];
+            break;
+         case PIPE_LOGICOP_COPY_INVERTED:
+            for (j = 0; j < 4; j++)
+               res4[j] = ~src4[j];
+            break;
+         case PIPE_LOGICOP_AND_REVERSE:
+            for (j = 0; j < 4; j++)
+               res4[j] = src4[j] & ~dst4[j];
+            break;
+         case PIPE_LOGICOP_INVERT:
+            for (j = 0; j < 4; j++)
+               res4[j] = ~dst4[j];
+            break;
+         case PIPE_LOGICOP_XOR:
+            for (j = 0; j < 4; j++)
+               res4[j] = dst4[j] ^ src4[j];
+            break;
+         case PIPE_LOGICOP_NAND:
+            for (j = 0; j < 4; j++)
+               res4[j] = ~(src4[j] & dst4[j]);
+            break;
+         case PIPE_LOGICOP_AND:
+            for (j = 0; j < 4; j++)
+               res4[j] = src4[j] & dst4[j];
+            break;
+         case PIPE_LOGICOP_EQUIV:
+            for (j = 0; j < 4; j++)
+               res4[j] = ~(src4[j] ^ dst4[j]);
+            break;
+         case PIPE_LOGICOP_NOOP:
+            for (j = 0; j < 4; j++)
+               res4[j] = dst4[j];
+            break;
+         case PIPE_LOGICOP_OR_INVERTED:
+            for (j = 0; j < 4; j++)
+               res4[j] = ~src4[j] | dst4[j];
+            break;
+         case PIPE_LOGICOP_COPY:
+            for (j = 0; j < 4; j++)
+               res4[j] = src4[j];
+            break;
+         case PIPE_LOGICOP_OR_REVERSE:
+            for (j = 0; j < 4; j++)
+               res4[j] = src4[j] | ~dst4[j];
+            break;
+         case PIPE_LOGICOP_OR:
+            for (j = 0; j < 4; j++)
+               res4[j] = src4[j] | dst4[j];
+            break;
+         case PIPE_LOGICOP_SET:
+            for (j = 0; j < 4; j++)
+               res4[j] = ~0;
+            break;
+         default:
+            assert(0);
+         }
 
-      for (j = 0; j < 4; j++) {
-         quadColor[j][0] = ubyte_to_float(res[j][0]);
-         quadColor[j][1] = ubyte_to_float(res[j][1]);
-         quadColor[j][2] = ubyte_to_float(res[j][2]);
-         quadColor[j][3] = ubyte_to_float(res[j][3]);
+         for (j = 0; j < 4; j++) {
+            quadColor[j][0] = ubyte_to_float(res[j][0]);
+            quadColor[j][1] = ubyte_to_float(res[j][1]);
+            quadColor[j][2] = ubyte_to_float(res[j][2]);
+            quadColor[j][3] = ubyte_to_float(res[j][3]);
+         }
       }
    }
-
-   /* pass quad to next stage */
-   qs->next->run(qs->next, quad);
 }
 
 
-
-
 static void
-blend_quad(struct quad_stage *qs, struct quad_header *quad)
+blend_quads(struct quad_stage *qs, 
+            struct quad_header *quads[],
+            unsigned nr)
 {
    static const float zero[4] = { 0, 0, 0, 0 };
    static const float one[4] = { 1, 1, 1, 1 };
-
    struct llvmpipe_context *llvmpipe = qs->llvmpipe;
    uint cbuf;
 
-   if (llvmpipe->blend->logicop_enable) {
-      logicop_quad(qs, quad);
-      return;
-   }
-
    /* loop over colorbuffer outputs */
    for (cbuf = 0; cbuf < llvmpipe->framebuffer.nr_cbufs; cbuf++) {
       float source[4][QUAD_SIZE], dest[4][QUAD_SIZE];
       struct llvmpipe_cached_tile *tile
          = lp_get_cached_tile(llvmpipe->cbuf_cache[cbuf],
-                              quad->input.x0, quad->input.y0);
-      float (*quadColor)[4] = quad->output.color[cbuf];
-      uint i, j;
-
-      /* get/swizzle dest colors */
-      for (j = 0; j < QUAD_SIZE; j++) {
-         int x = (quad->input.x0 & (TILE_SIZE-1)) + (j & 1);
-         int y = (quad->input.y0 & (TILE_SIZE-1)) + (j >> 1);
-         for (i = 0; i < 4; i++) {
-            dest[i][j] = tile->data.color[y][x][i];
+                              quads[0]->input.x0, 
+                              quads[0]->input.y0);
+      uint q, i, j;
+
+      for (q = 0; q < nr; q++) {
+         struct quad_header *quad = quads[q];
+         float (*quadColor)[4] = quad->output.color[cbuf];
+
+         /* get/swizzle dest colors */
+         for (j = 0; j < QUAD_SIZE; j++) {
+            int x = (quad->input.x0 & (TILE_SIZE-1)) + (j & 1);
+            int y = (quad->input.y0 & (TILE_SIZE-1)) + (j >> 1);
+            for (i = 0; i < 4; i++) {
+               dest[i][j] = tile->data.color[y][x][i];
+            }
          }
-      }
 
-      /*
-       * Compute src/first term RGB
-       */
-      switch (llvmpipe->blend->rgb_src_factor) {
-      case PIPE_BLENDFACTOR_ONE:
-         VEC4_COPY(source[0], quadColor[0]); /* R */
-         VEC4_COPY(source[1], quadColor[1]); /* G */
-         VEC4_COPY(source[2], quadColor[2]); /* B */
-         break;
-      case PIPE_BLENDFACTOR_SRC_COLOR:
-         VEC4_MUL(source[0], quadColor[0], quadColor[0]); /* R */
-         VEC4_MUL(source[1], quadColor[1], quadColor[1]); /* G */
-         VEC4_MUL(source[2], quadColor[2], quadColor[2]); /* B */
-         break;
-      case PIPE_BLENDFACTOR_SRC_ALPHA:
+         /*
+          * Compute src/first term RGB
+          */
+         switch (llvmpipe->blend->rgb_src_factor) {
+         case PIPE_BLENDFACTOR_ONE:
+            VEC4_COPY(source[0], quadColor[0]); /* R */
+            VEC4_COPY(source[1], quadColor[1]); /* G */
+            VEC4_COPY(source[2], quadColor[2]); /* B */
+            break;
+         case PIPE_BLENDFACTOR_SRC_COLOR:
+            VEC4_MUL(source[0], quadColor[0], quadColor[0]); /* R */
+            VEC4_MUL(source[1], quadColor[1], quadColor[1]); /* G */
+            VEC4_MUL(source[2], quadColor[2], quadColor[2]); /* B */
+            break;
+         case PIPE_BLENDFACTOR_SRC_ALPHA:
          {
             const float *alpha = quadColor[3];
             VEC4_MUL(source[0], quadColor[0], alpha); /* R */
@@ -295,12 +297,12 @@ blend_quad(struct quad_stage *qs, struct quad_header *quad)
             VEC4_MUL(source[2], quadColor[2], alpha); /* B */
          }
          break;
-      case PIPE_BLENDFACTOR_DST_COLOR:
-         VEC4_MUL(source[0], quadColor[0], dest[0]); /* R */
-         VEC4_MUL(source[1], quadColor[1], dest[1]); /* G */
-         VEC4_MUL(source[2], quadColor[2], dest[2]); /* B */
-         break;
-      case PIPE_BLENDFACTOR_DST_ALPHA:
+         case PIPE_BLENDFACTOR_DST_COLOR:
+            VEC4_MUL(source[0], quadColor[0], dest[0]); /* R */
+            VEC4_MUL(source[1], quadColor[1], dest[1]); /* G */
+            VEC4_MUL(source[2], quadColor[2], dest[2]); /* B */
+            break;
+         case PIPE_BLENDFACTOR_DST_ALPHA:
          {
             const float *alpha = dest[3];
             VEC4_MUL(source[0], quadColor[0], alpha); /* R */
@@ -308,7 +310,7 @@ blend_quad(struct quad_stage *qs, struct quad_header *quad)
             VEC4_MUL(source[2], quadColor[2], alpha); /* B */
          }
          break;
-      case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+         case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
          {
             const float *alpha = quadColor[3];
             float diff[4], temp[4];
@@ -319,7 +321,7 @@ blend_quad(struct quad_stage *qs, struct quad_header *quad)
             VEC4_MUL(source[2], quadColor[2], temp); /* B */
          }
          break;
-      case PIPE_BLENDFACTOR_CONST_COLOR:
+         case PIPE_BLENDFACTOR_CONST_COLOR:
          {
             float comp[4];
             VEC4_SCALAR(comp, llvmpipe->blend_color.color[0]); /* R */
@@ -330,7 +332,7 @@ blend_quad(struct quad_stage *qs, struct quad_header *quad)
             VEC4_MUL(source[2], quadColor[2], comp); /* B */
          }
          break;
-      case PIPE_BLENDFACTOR_CONST_ALPHA:
+         case PIPE_BLENDFACTOR_CONST_ALPHA:
          {
             float alpha[4];
             VEC4_SCALAR(alpha, llvmpipe->blend_color.color[3]);
@@ -339,18 +341,18 @@ blend_quad(struct quad_stage *qs, struct quad_header *quad)
             VEC4_MUL(source[2], quadColor[2], alpha); /* B */
          }
          break;
-      case PIPE_BLENDFACTOR_SRC1_COLOR:
-         assert(0); /* to do */
-         break;
-      case PIPE_BLENDFACTOR_SRC1_ALPHA:
-         assert(0); /* to do */
-         break;
-      case PIPE_BLENDFACTOR_ZERO:
-         VEC4_COPY(source[0], zero); /* R */
-         VEC4_COPY(source[1], zero); /* G */
-         VEC4_COPY(source[2], zero); /* B */
-         break;
-      case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+         case PIPE_BLENDFACTOR_SRC1_COLOR:
+            assert(0); /* to do */
+            break;
+         case PIPE_BLENDFACTOR_SRC1_ALPHA:
+            assert(0); /* to do */
+            break;
+         case PIPE_BLENDFACTOR_ZERO:
+            VEC4_COPY(source[0], zero); /* R */
+            VEC4_COPY(source[1], zero); /* G */
+            VEC4_COPY(source[2], zero); /* B */
+            break;
+         case PIPE_BLENDFACTOR_INV_SRC_COLOR:
          {
             float inv_comp[4];
             VEC4_SUB(inv_comp, one, quadColor[0]); /* R */
@@ -361,7 +363,7 @@ blend_quad(struct quad_stage *qs, struct quad_header *quad)
             VEC4_MUL(source[2], quadColor[2], inv_comp); /* B */
          }
          break;
-      case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+         case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
          {
             float inv_alpha[4];
             VEC4_SUB(inv_alpha, one, quadColor[3]);
@@ -370,7 +372,7 @@ blend_quad(struct quad_stage *qs, struct quad_header *quad)
             VEC4_MUL(source[2], quadColor[2], inv_alpha); /* B */
          }
          break;
-      case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+         case PIPE_BLENDFACTOR_INV_DST_ALPHA:
          {
             float inv_alpha[4];
             VEC4_SUB(inv_alpha, one, dest[3]);
@@ -379,7 +381,7 @@ blend_quad(struct quad_stage *qs, struct quad_header *quad)
             VEC4_MUL(source[2], quadColor[2], inv_alpha); /* B */
          }
          break;
-      case PIPE_BLENDFACTOR_INV_DST_COLOR:
+         case PIPE_BLENDFACTOR_INV_DST_COLOR:
          {
             float inv_comp[4];
             VEC4_SUB(inv_comp, one, dest[0]); /* R */
@@ -390,7 +392,7 @@ blend_quad(struct quad_stage *qs, struct quad_header *quad)
             VEC4_MUL(source[2], quadColor[2], inv_comp); /* B */
          }
          break;
-      case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+         case PIPE_BLENDFACTOR_INV_CONST_COLOR:
          {
             float inv_comp[4];
             /* R */
@@ -404,7 +406,7 @@ blend_quad(struct quad_stage *qs, struct quad_header *quad)
             VEC4_MUL(source[2], quadColor[2], inv_comp);
          }
          break;
-      case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+         case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
          {
             float inv_alpha[4];
             VEC4_SCALAR(inv_alpha, 1.0f - llvmpipe->blend_color.color[3]);
@@ -413,73 +415,73 @@ blend_quad(struct quad_stage *qs, struct quad_header *quad)
             VEC4_MUL(source[2], quadColor[2], inv_alpha); /* B */
          }
          break;
-      case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
-         assert(0); /* to do */
-         break;
-      case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
-         assert(0); /* to do */
-         break;
-      default:
-         assert(0);
-      }
+         case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+            assert(0); /* to do */
+            break;
+         case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+            assert(0); /* to do */
+            break;
+         default:
+            assert(0);
+         }
 
-      /*
-       * Compute src/first term A
-       */
-      switch (llvmpipe->blend->alpha_src_factor) {
-      case PIPE_BLENDFACTOR_ONE:
-         VEC4_COPY(source[3], quadColor[3]); /* A */
-         break;
-      case PIPE_BLENDFACTOR_SRC_COLOR:
-         /* fall-through */
-      case PIPE_BLENDFACTOR_SRC_ALPHA:
+         /*
+          * Compute src/first term A
+          */
+         switch (llvmpipe->blend->alpha_src_factor) {
+         case PIPE_BLENDFACTOR_ONE:
+            VEC4_COPY(source[3], quadColor[3]); /* A */
+            break;
+         case PIPE_BLENDFACTOR_SRC_COLOR:
+            /* fall-through */
+         case PIPE_BLENDFACTOR_SRC_ALPHA:
          {
             const float *alpha = quadColor[3];
             VEC4_MUL(source[3], quadColor[3], alpha); /* A */
          }
          break;
-      case PIPE_BLENDFACTOR_DST_COLOR:
-         /* fall-through */
-      case PIPE_BLENDFACTOR_DST_ALPHA:
-         VEC4_MUL(source[3], quadColor[3], dest[3]); /* A */
-         break;
-      case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
-         /* multiply alpha by 1.0 */
-         VEC4_COPY(source[3], quadColor[3]); /* A */
-         break;
-      case PIPE_BLENDFACTOR_CONST_COLOR:
-         /* fall-through */
-      case PIPE_BLENDFACTOR_CONST_ALPHA:
+         case PIPE_BLENDFACTOR_DST_COLOR:
+            /* fall-through */
+         case PIPE_BLENDFACTOR_DST_ALPHA:
+            VEC4_MUL(source[3], quadColor[3], dest[3]); /* A */
+            break;
+         case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+            /* multiply alpha by 1.0 */
+            VEC4_COPY(source[3], quadColor[3]); /* A */
+            break;
+         case PIPE_BLENDFACTOR_CONST_COLOR:
+            /* fall-through */
+         case PIPE_BLENDFACTOR_CONST_ALPHA:
          {
             float comp[4];
             VEC4_SCALAR(comp, llvmpipe->blend_color.color[3]); /* A */
             VEC4_MUL(source[3], quadColor[3], comp); /* A */
          }
          break;
-      case PIPE_BLENDFACTOR_ZERO:
-         VEC4_COPY(source[3], zero); /* A */
-         break;
-      case PIPE_BLENDFACTOR_INV_SRC_COLOR:
-         /* fall-through */
-      case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+         case PIPE_BLENDFACTOR_ZERO:
+            VEC4_COPY(source[3], zero); /* A */
+            break;
+         case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+            /* fall-through */
+         case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
          {
             float inv_alpha[4];
             VEC4_SUB(inv_alpha, one, quadColor[3]);
             VEC4_MUL(source[3], quadColor[3], inv_alpha); /* A */
          }
          break;
-      case PIPE_BLENDFACTOR_INV_DST_COLOR:
-         /* fall-through */
-      case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+         case PIPE_BLENDFACTOR_INV_DST_COLOR:
+            /* fall-through */
+         case PIPE_BLENDFACTOR_INV_DST_ALPHA:
          {
             float inv_alpha[4];
             VEC4_SUB(inv_alpha, one, dest[3]);
             VEC4_MUL(source[3], quadColor[3], inv_alpha); /* A */
          }
          break;
-      case PIPE_BLENDFACTOR_INV_CONST_COLOR:
-         /* fall-through */
-      case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+         case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+            /* fall-through */
+         case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
          {
             float inv_comp[4];
             /* A */
@@ -487,42 +489,42 @@ blend_quad(struct quad_stage *qs, struct quad_header *quad)
             VEC4_MUL(source[3], quadColor[3], inv_comp);
          }
          break;
-      default:
-         assert(0);
-      }
+         default:
+            assert(0);
+         }
 
 
-      /*
-       * Compute dest/second term RGB
-       */
-      switch (llvmpipe->blend->rgb_dst_factor) {
-      case PIPE_BLENDFACTOR_ONE:
-         /* dest = dest * 1   NO-OP, leave dest as-is */
-         break;
-      case PIPE_BLENDFACTOR_SRC_COLOR:
-         VEC4_MUL(dest[0], dest[0], quadColor[0]); /* R */
-         VEC4_MUL(dest[1], dest[1], quadColor[1]); /* G */
-         VEC4_MUL(dest[2], dest[2], quadColor[2]); /* B */
-         break;
-      case PIPE_BLENDFACTOR_SRC_ALPHA:
-         VEC4_MUL(dest[0], dest[0], quadColor[3]); /* R * A */
-         VEC4_MUL(dest[1], dest[1], quadColor[3]); /* G * A */
-         VEC4_MUL(dest[2], dest[2], quadColor[3]); /* B * A */
-         break;
-      case PIPE_BLENDFACTOR_DST_ALPHA:
-         VEC4_MUL(dest[0], dest[0], dest[3]); /* R * A */
-         VEC4_MUL(dest[1], dest[1], dest[3]); /* G * A */
-         VEC4_MUL(dest[2], dest[2], dest[3]); /* B * A */
-         break;
-      case PIPE_BLENDFACTOR_DST_COLOR:
-         VEC4_MUL(dest[0], dest[0], dest[0]); /* R */
-         VEC4_MUL(dest[1], dest[1], dest[1]); /* G */
-         VEC4_MUL(dest[2], dest[2], dest[2]); /* B */
-         break;
-      case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
-         assert(0); /* illegal */
-         break;
-      case PIPE_BLENDFACTOR_CONST_COLOR:
+         /*
+          * Compute dest/second term RGB
+          */
+         switch (llvmpipe->blend->rgb_dst_factor) {
+         case PIPE_BLENDFACTOR_ONE:
+            /* dest = dest * 1   NO-OP, leave dest as-is */
+            break;
+         case PIPE_BLENDFACTOR_SRC_COLOR:
+            VEC4_MUL(dest[0], dest[0], quadColor[0]); /* R */
+            VEC4_MUL(dest[1], dest[1], quadColor[1]); /* G */
+            VEC4_MUL(dest[2], dest[2], quadColor[2]); /* B */
+            break;
+         case PIPE_BLENDFACTOR_SRC_ALPHA:
+            VEC4_MUL(dest[0], dest[0], quadColor[3]); /* R * A */
+            VEC4_MUL(dest[1], dest[1], quadColor[3]); /* G * A */
+            VEC4_MUL(dest[2], dest[2], quadColor[3]); /* B * A */
+            break;
+         case PIPE_BLENDFACTOR_DST_ALPHA:
+            VEC4_MUL(dest[0], dest[0], dest[3]); /* R * A */
+            VEC4_MUL(dest[1], dest[1], dest[3]); /* G * A */
+            VEC4_MUL(dest[2], dest[2], dest[3]); /* B * A */
+            break;
+         case PIPE_BLENDFACTOR_DST_COLOR:
+            VEC4_MUL(dest[0], dest[0], dest[0]); /* R */
+            VEC4_MUL(dest[1], dest[1], dest[1]); /* G */
+            VEC4_MUL(dest[2], dest[2], dest[2]); /* B */
+            break;
+         case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+            assert(0); /* illegal */
+            break;
+         case PIPE_BLENDFACTOR_CONST_COLOR:
          {
             float comp[4];
             VEC4_SCALAR(comp, llvmpipe->blend_color.color[0]); /* R */
@@ -533,7 +535,7 @@ blend_quad(struct quad_stage *qs, struct quad_header *quad)
             VEC4_MUL(dest[2], dest[2], comp); /* B */
          }
          break;
-      case PIPE_BLENDFACTOR_CONST_ALPHA:
+         case PIPE_BLENDFACTOR_CONST_ALPHA:
          {
             float comp[4];
             VEC4_SCALAR(comp, llvmpipe->blend_color.color[3]); /* A */
@@ -542,17 +544,17 @@ blend_quad(struct quad_stage *qs, struct quad_header *quad)
             VEC4_MUL(dest[2], dest[2], comp); /* B */
          }
          break;
-      case PIPE_BLENDFACTOR_ZERO:
-         VEC4_COPY(dest[0], zero); /* R */
-         VEC4_COPY(dest[1], zero); /* G */
-         VEC4_COPY(dest[2], zero); /* B */
-         break;
-      case PIPE_BLENDFACTOR_SRC1_COLOR:
-      case PIPE_BLENDFACTOR_SRC1_ALPHA:
-         /* XXX what are these? */
-         assert(0);
-         break;
-      case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+         case PIPE_BLENDFACTOR_ZERO:
+            VEC4_COPY(dest[0], zero); /* R */
+            VEC4_COPY(dest[1], zero); /* G */
+            VEC4_COPY(dest[2], zero); /* B */
+            break;
+         case PIPE_BLENDFACTOR_SRC1_COLOR:
+         case PIPE_BLENDFACTOR_SRC1_ALPHA:
+            /* XXX what are these? */
+            assert(0);
+            break;
+         case PIPE_BLENDFACTOR_INV_SRC_COLOR:
          {
             float inv_comp[4];
             VEC4_SUB(inv_comp, one, quadColor[0]); /* R */
@@ -563,7 +565,7 @@ blend_quad(struct quad_stage *qs, struct quad_header *quad)
             VEC4_MUL(dest[2], inv_comp, dest[2]); /* B */
          }
          break;
-      case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+         case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
          {
             float one_minus_alpha[QUAD_SIZE];
             VEC4_SUB(one_minus_alpha, one, quadColor[3]);
@@ -572,7 +574,7 @@ blend_quad(struct quad_stage *qs, struct quad_header *quad)
             VEC4_MUL(dest[2], dest[2], one_minus_alpha); /* B */
          }
          break;
-      case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+         case PIPE_BLENDFACTOR_INV_DST_ALPHA:
          {
             float inv_comp[4];
             VEC4_SUB(inv_comp, one, dest[3]); /* A */
@@ -581,7 +583,7 @@ blend_quad(struct quad_stage *qs, struct quad_header *quad)
             VEC4_MUL(dest[2], inv_comp, dest[2]); /* B */
          }
          break;
-      case PIPE_BLENDFACTOR_INV_DST_COLOR:
+         case PIPE_BLENDFACTOR_INV_DST_COLOR:
          {
             float inv_comp[4];
             VEC4_SUB(inv_comp, one, dest[0]); /* R */
@@ -592,7 +594,7 @@ blend_quad(struct quad_stage *qs, struct quad_header *quad)
             VEC4_MUL(dest[2], dest[2], inv_comp); /* B */
          }
          break;
-      case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+         case PIPE_BLENDFACTOR_INV_CONST_COLOR:
          {
             float inv_comp[4];
             /* R */
@@ -606,7 +608,7 @@ blend_quad(struct quad_stage *qs, struct quad_header *quad)
             VEC4_MUL(dest[2], dest[2], inv_comp);
          }
          break;
-      case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+         case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
          {
             float inv_comp[4];
             VEC4_SCALAR(inv_comp, 1.0f - llvmpipe->blend_color.color[3]);
@@ -615,138 +617,154 @@ blend_quad(struct quad_stage *qs, struct quad_header *quad)
             VEC4_MUL(dest[2], dest[2], inv_comp);
          }
          break;
-      case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
-      case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
-         /* XXX what are these? */
-         assert(0);
-         break;
-      default:
-         assert(0);
-      }
+         case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+         case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+            /* XXX what are these? */
+            assert(0);
+            break;
+         default:
+            assert(0);
+         }
 
-      /*
-       * Compute dest/second term A
-       */
-      switch (llvmpipe->blend->alpha_dst_factor) {
-      case PIPE_BLENDFACTOR_ONE:
-         /* dest = dest * 1   NO-OP, leave dest as-is */
-         break;
-      case PIPE_BLENDFACTOR_SRC_COLOR:
-         /* fall-through */
-      case PIPE_BLENDFACTOR_SRC_ALPHA:
-         VEC4_MUL(dest[3], dest[3], quadColor[3]); /* A * A */
-         break;
-      case PIPE_BLENDFACTOR_DST_COLOR:
-         /* fall-through */
-      case PIPE_BLENDFACTOR_DST_ALPHA:
-         VEC4_MUL(dest[3], dest[3], dest[3]); /* A */
-         break;
-      case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
-         assert(0); /* illegal */
-         break;
-      case PIPE_BLENDFACTOR_CONST_COLOR:
-         /* fall-through */
-      case PIPE_BLENDFACTOR_CONST_ALPHA:
+         /*
+          * Compute dest/second term A
+          */
+         switch (llvmpipe->blend->alpha_dst_factor) {
+         case PIPE_BLENDFACTOR_ONE:
+            /* dest = dest * 1   NO-OP, leave dest as-is */
+            break;
+         case PIPE_BLENDFACTOR_SRC_COLOR:
+            /* fall-through */
+         case PIPE_BLENDFACTOR_SRC_ALPHA:
+            VEC4_MUL(dest[3], dest[3], quadColor[3]); /* A * A */
+            break;
+         case PIPE_BLENDFACTOR_DST_COLOR:
+            /* fall-through */
+         case PIPE_BLENDFACTOR_DST_ALPHA:
+            VEC4_MUL(dest[3], dest[3], dest[3]); /* A */
+            break;
+         case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+            assert(0); /* illegal */
+            break;
+         case PIPE_BLENDFACTOR_CONST_COLOR:
+            /* fall-through */
+         case PIPE_BLENDFACTOR_CONST_ALPHA:
          {
             float comp[4];
             VEC4_SCALAR(comp, llvmpipe->blend_color.color[3]); /* A */
             VEC4_MUL(dest[3], dest[3], comp); /* A */
          }
          break;
-      case PIPE_BLENDFACTOR_ZERO:
-         VEC4_COPY(dest[3], zero); /* A */
-         break;
-      case PIPE_BLENDFACTOR_INV_SRC_COLOR:
-         /* fall-through */
-      case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+         case PIPE_BLENDFACTOR_ZERO:
+            VEC4_COPY(dest[3], zero); /* A */
+            break;
+         case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+            /* fall-through */
+         case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
          {
             float one_minus_alpha[QUAD_SIZE];
             VEC4_SUB(one_minus_alpha, one, quadColor[3]);
             VEC4_MUL(dest[3], dest[3], one_minus_alpha); /* A */
          }
          break;
-      case PIPE_BLENDFACTOR_INV_DST_COLOR:
-         /* fall-through */
-      case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+         case PIPE_BLENDFACTOR_INV_DST_COLOR:
+            /* fall-through */
+         case PIPE_BLENDFACTOR_INV_DST_ALPHA:
          {
             float inv_comp[4];
             VEC4_SUB(inv_comp, one, dest[3]); /* A */
             VEC4_MUL(dest[3], inv_comp, dest[3]); /* A */
          }
          break;
-      case PIPE_BLENDFACTOR_INV_CONST_COLOR:
-         /* fall-through */
-      case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+         case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+            /* fall-through */
+         case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
          {
             float inv_comp[4];
             VEC4_SCALAR(inv_comp, 1.0f - llvmpipe->blend_color.color[3]);
             VEC4_MUL(dest[3], dest[3], inv_comp);
          }
          break;
-      default:
-         assert(0);
-      }
+         default:
+            assert(0);
+         }
 
-      /*
-       * Combine RGB terms
-       */
-      switch (llvmpipe->blend->rgb_func) {
-      case PIPE_BLEND_ADD:
-         VEC4_ADD_SAT(quadColor[0], source[0], dest[0]); /* R */
-         VEC4_ADD_SAT(quadColor[1], source[1], dest[1]); /* G */
-         VEC4_ADD_SAT(quadColor[2], source[2], dest[2]); /* B */
-         break;
-      case PIPE_BLEND_SUBTRACT:
-         VEC4_SUB_SAT(quadColor[0], source[0], dest[0]); /* R */
-         VEC4_SUB_SAT(quadColor[1], source[1], dest[1]); /* G */
-         VEC4_SUB_SAT(quadColor[2], source[2], dest[2]); /* B */
-         break;
-      case PIPE_BLEND_REVERSE_SUBTRACT:
-         VEC4_SUB_SAT(quadColor[0], dest[0], source[0]); /* R */
-         VEC4_SUB_SAT(quadColor[1], dest[1], source[1]); /* G */
-         VEC4_SUB_SAT(quadColor[2], dest[2], source[2]); /* B */
-         break;
-      case PIPE_BLEND_MIN:
-         VEC4_MIN(quadColor[0], source[0], dest[0]); /* R */
-         VEC4_MIN(quadColor[1], source[1], dest[1]); /* G */
-         VEC4_MIN(quadColor[2], source[2], dest[2]); /* B */
-         break;
-      case PIPE_BLEND_MAX:
-         VEC4_MAX(quadColor[0], source[0], dest[0]); /* R */
-         VEC4_MAX(quadColor[1], source[1], dest[1]); /* G */
-         VEC4_MAX(quadColor[2], source[2], dest[2]); /* B */
-         break;
-      default:
-         assert(0);
-      }
+         /*
+          * Combine RGB terms
+          */
+         switch (llvmpipe->blend->rgb_func) {
+         case PIPE_BLEND_ADD:
+            VEC4_ADD_SAT(quadColor[0], source[0], dest[0]); /* R */
+            VEC4_ADD_SAT(quadColor[1], source[1], dest[1]); /* G */
+            VEC4_ADD_SAT(quadColor[2], source[2], dest[2]); /* B */
+            break;
+         case PIPE_BLEND_SUBTRACT:
+            VEC4_SUB_SAT(quadColor[0], source[0], dest[0]); /* R */
+            VEC4_SUB_SAT(quadColor[1], source[1], dest[1]); /* G */
+            VEC4_SUB_SAT(quadColor[2], source[2], dest[2]); /* B */
+            break;
+         case PIPE_BLEND_REVERSE_SUBTRACT:
+            VEC4_SUB_SAT(quadColor[0], dest[0], source[0]); /* R */
+            VEC4_SUB_SAT(quadColor[1], dest[1], source[1]); /* G */
+            VEC4_SUB_SAT(quadColor[2], dest[2], source[2]); /* B */
+            break;
+         case PIPE_BLEND_MIN:
+            VEC4_MIN(quadColor[0], source[0], dest[0]); /* R */
+            VEC4_MIN(quadColor[1], source[1], dest[1]); /* G */
+            VEC4_MIN(quadColor[2], source[2], dest[2]); /* B */
+            break;
+         case PIPE_BLEND_MAX:
+            VEC4_MAX(quadColor[0], source[0], dest[0]); /* R */
+            VEC4_MAX(quadColor[1], source[1], dest[1]); /* G */
+            VEC4_MAX(quadColor[2], source[2], dest[2]); /* B */
+            break;
+         default:
+            assert(0);
+         }
 
-      /*
-       * Combine A terms
-       */
-      switch (llvmpipe->blend->alpha_func) {
-      case PIPE_BLEND_ADD:
-         VEC4_ADD_SAT(quadColor[3], source[3], dest[3]); /* A */
-         break;
-      case PIPE_BLEND_SUBTRACT:
-         VEC4_SUB_SAT(quadColor[3], source[3], dest[3]); /* A */
-         break;
-      case PIPE_BLEND_REVERSE_SUBTRACT:
-         VEC4_SUB_SAT(quadColor[3], dest[3], source[3]); /* A */
-         break;
-      case PIPE_BLEND_MIN:
-         VEC4_MIN(quadColor[3], source[3], dest[3]); /* A */
-         break;
-      case PIPE_BLEND_MAX:
-         VEC4_MAX(quadColor[3], source[3], dest[3]); /* A */
-         break;
-      default:
-         assert(0);
+         /*
+          * Combine A terms
+          */
+         switch (llvmpipe->blend->alpha_func) {
+         case PIPE_BLEND_ADD:
+            VEC4_ADD_SAT(quadColor[3], source[3], dest[3]); /* A */
+            break;
+         case PIPE_BLEND_SUBTRACT:
+            VEC4_SUB_SAT(quadColor[3], source[3], dest[3]); /* A */
+            break;
+         case PIPE_BLEND_REVERSE_SUBTRACT:
+            VEC4_SUB_SAT(quadColor[3], dest[3], source[3]); /* A */
+            break;
+         case PIPE_BLEND_MIN:
+            VEC4_MIN(quadColor[3], source[3], dest[3]); /* A */
+            break;
+         case PIPE_BLEND_MAX:
+            VEC4_MAX(quadColor[3], source[3], dest[3]); /* A */
+            break;
+         default:
+            assert(0);
+         }
       }
-
    } /* cbuf loop */
+}
+
+
+static void
+blend_quad(struct quad_stage *qs, 
+           struct quad_header *quads[],
+           unsigned nr)
+{
+   struct llvmpipe_context *llvmpipe = qs->llvmpipe;
+
+   if (llvmpipe->blend->logicop_enable) {
+      logicop_quad(qs, quads, nr);
+   }
+   else if (llvmpipe->blend->blend_enable) {
+      blend_quads(qs, quads, nr );
+   }
 
    /* pass blended quad to next stage */
-   qs->next->run(qs->next, quad);
+   qs->next->run(qs->next, quads, nr);
 }
 
 
diff --git a/src/gallium/drivers/llvmpipe/lp_quad_colormask.c b/src/gallium/drivers/llvmpipe/lp_quad_colormask.c
index 205dea48828..df811a72d7c 100644
--- a/src/gallium/drivers/llvmpipe/lp_quad_colormask.c
+++ b/src/gallium/drivers/llvmpipe/lp_quad_colormask.c
@@ -84,12 +84,23 @@ colormask_quad(struct quad_stage *qs, struct quad_header *quad)
       if (!(llvmpipe->blend->colormask & PIPE_MASK_A))
           COPY_4V(quadColor[3], dest[3]);
    }
+}
+
+static void
+colormask_quads(struct quad_stage *qs, struct quad_header *quads[],
+                unsigned nr)
+{
+   unsigned i;
+
+   for (i = 0; i < nr; i++)
+      colormask_quad(qs, quads[i]);
 
    /* pass quad to next stage */
-   qs->next->run(qs->next, quad);
+   qs->next->run(qs->next, quads, nr);
 }
 
 
+
 static void colormask_begin(struct quad_stage *qs)
 {
    qs->next->begin(qs->next);
@@ -108,7 +119,7 @@ struct quad_stage *lp_quad_colormask_stage( struct llvmpipe_context *llvmpipe )
 
    stage->llvmpipe = llvmpipe;
    stage->begin = colormask_begin;
-   stage->run = colormask_quad;
+   stage->run = colormask_quads;
    stage->destroy = colormask_destroy;
 
    return stage;
diff --git a/src/gallium/drivers/llvmpipe/lp_quad_coverage.c b/src/gallium/drivers/llvmpipe/lp_quad_coverage.c
index 01c5982e859..b7b531d836d 100644
--- a/src/gallium/drivers/llvmpipe/lp_quad_coverage.c
+++ b/src/gallium/drivers/llvmpipe/lp_quad_coverage.c
@@ -42,33 +42,47 @@
 /**
  * Multiply quad's alpha values by the fragment coverage.
  */
-static void
+static INLINE void
 coverage_quad(struct quad_stage *qs, struct quad_header *quad)
 {
    struct llvmpipe_context *llvmpipe = qs->llvmpipe;
-   const uint prim = quad->input.prim;
+   uint cbuf;
+
+   /* loop over colorbuffer outputs */
+   for (cbuf = 0; cbuf < llvmpipe->framebuffer.nr_cbufs; cbuf++) {
+      float (*quadColor)[4] = quad->output.color[cbuf];
+      unsigned j;
+      for (j = 0; j < QUAD_SIZE; j++) {
+         assert(quad->input.coverage[j] >= 0.0);
+         assert(quad->input.coverage[j] <= 1.0);
+         quadColor[3][j] *= quad->input.coverage[j];
+      }
+   }
+}
+
+
+/* XXX: Incorporate into shader after alpha_test.
+ */
+static void
+coverage_run(struct quad_stage *qs,
+               struct quad_header *quads[],
+               unsigned nr)
+{
+   struct llvmpipe_context *llvmpipe = qs->llvmpipe;
+   const uint prim = quads[0]->input.prim;
+   unsigned i;
 
    if ((llvmpipe->rasterizer->poly_smooth && prim == QUAD_PRIM_TRI) ||
        (llvmpipe->rasterizer->line_smooth && prim == QUAD_PRIM_LINE) ||
        (llvmpipe->rasterizer->point_smooth && prim == QUAD_PRIM_POINT)) {
-      uint cbuf;
-
-      /* loop over colorbuffer outputs */
-      for (cbuf = 0; cbuf < llvmpipe->framebuffer.nr_cbufs; cbuf++) {
-         float (*quadColor)[4] = quad->output.color[cbuf];
-         unsigned j;
-         for (j = 0; j < QUAD_SIZE; j++) {
-            assert(quad->input.coverage[j] >= 0.0);
-            assert(quad->input.coverage[j] <= 1.0);
-         quadColor[3][j] *= quad->input.coverage[j];
-         }
-      }
+
+      for (i = 0; i < nr; i++)
+         coverage_quad( qs, quads[i] );
    }
 
-   qs->next->run(qs->next, quad);
+   qs->next->run(qs->next, quads, nr);
 }
 
-
 static void coverage_begin(struct quad_stage *qs)
 {
    qs->next->begin(qs->next);
@@ -87,7 +101,7 @@ struct quad_stage *lp_quad_coverage_stage( struct llvmpipe_context *llvmpipe )
 
    stage->llvmpipe = llvmpipe;
    stage->begin = coverage_begin;
-   stage->run = coverage_quad;
+   stage->run = coverage_run;
    stage->destroy = coverage_destroy;
 
    return stage;
diff --git a/src/gallium/drivers/llvmpipe/lp_quad_depth_test.c b/src/gallium/drivers/llvmpipe/lp_quad_depth_test.c
index fdb64ac3b45..8ecd68393f3 100644
--- a/src/gallium/drivers/llvmpipe/lp_quad_depth_test.c
+++ b/src/gallium/drivers/llvmpipe/lp_quad_depth_test.c
@@ -49,7 +49,7 @@
  * Try to effectively do that with codegen...
  */
 
-void
+boolean
 lp_depth_test_quad(struct quad_stage *qs, struct quad_header *quad)
 {
    struct llvmpipe_context *llvmpipe = qs->llvmpipe;
@@ -193,6 +193,8 @@ lp_depth_test_quad(struct quad_stage *qs, struct quad_header *quad)
    }
 
    quad->inout.mask &= zmask;
+   if (quad->inout.mask == 0)
+      return FALSE;
 
    if (llvmpipe->depth_stencil->depth.writemask) {
       
@@ -252,16 +254,25 @@ lp_depth_test_quad(struct quad_stage *qs, struct quad_header *quad)
          assert(0);
       }
    }
+
+   return TRUE;
 }
 
 
 static void
-depth_test_quad(struct quad_stage *qs, struct quad_header *quad)
+depth_test_quads(struct quad_stage *qs, 
+                 struct quad_header *quads[],
+                 unsigned nr)
 {
-   lp_depth_test_quad(qs, quad);
+   unsigned i, pass = 0;
 
-   if (quad->inout.mask)
-      qs->next->run(qs->next, quad);
+   for (i = 0; i < nr; i++) {
+      if (lp_depth_test_quad(qs, quads[i]))
+         quads[pass++] = quads[i];
+   }
+   
+   if (pass)
+      qs->next->run(qs->next, quads, pass);
 }
 
 
@@ -283,7 +294,7 @@ struct quad_stage *lp_quad_depth_test_stage( struct llvmpipe_context *llvmpipe )
 
    stage->llvmpipe = llvmpipe;
    stage->begin = depth_test_begin;
-   stage->run = depth_test_quad;
+   stage->run = depth_test_quads;
    stage->destroy = depth_test_destroy;
 
    return stage;
diff --git a/src/gallium/drivers/llvmpipe/lp_quad_earlyz.c b/src/gallium/drivers/llvmpipe/lp_quad_earlyz.c
index e4b4c3b55cd..915d2d9f782 100644
--- a/src/gallium/drivers/llvmpipe/lp_quad_earlyz.c
+++ b/src/gallium/drivers/llvmpipe/lp_quad_earlyz.c
@@ -43,20 +43,26 @@
 static void
 earlyz_quad(
    struct quad_stage    *qs,
-   struct quad_header   *quad )
+   struct quad_header   *quads[],
+   unsigned nr )
 {
-   const float fx = (float) quad->input.x0;
-   const float fy = (float) quad->input.y0;
-   const float dzdx = quad->posCoef->dadx[2];
-   const float dzdy = quad->posCoef->dady[2];
-   const float z0 = quad->posCoef->a0[2] + dzdx * fx + dzdy * fy;
+   const float a0z = quads[0]->posCoef->a0[2];
+   const float dzdx = quads[0]->posCoef->dadx[2];
+   const float dzdy = quads[0]->posCoef->dady[2];
+   unsigned i;
 
-   quad->output.depth[0] = z0;
-   quad->output.depth[1] = z0 + dzdx;
-   quad->output.depth[2] = z0 + dzdy;
-   quad->output.depth[3] = z0 + dzdx + dzdy;
+   for (i = 0; i < nr; i++) {
+      const float fx = (float) quads[i]->input.x0;
+      const float fy = (float) quads[i]->input.y0;
+      const float z0 = a0z + dzdx * fx + dzdy * fy;
 
-   qs->next->run( qs->next, quad );
+      quads[i]->output.depth[0] = z0;
+      quads[i]->output.depth[1] = z0 + dzdx;
+      quads[i]->output.depth[2] = z0 + dzdy;
+      quads[i]->output.depth[3] = z0 + dzdx + dzdy;
+   }
+
+   qs->next->run( qs->next, quads, nr );
 }
 
 static void
diff --git a/src/gallium/drivers/llvmpipe/lp_quad_fs.c b/src/gallium/drivers/llvmpipe/lp_quad_fs.c
index cabc54155cf..25518c09f40 100644
--- a/src/gallium/drivers/llvmpipe/lp_quad_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_quad_fs.c
@@ -68,21 +68,18 @@ quad_shade_stage(struct quad_stage *qs)
 /**
  * Execute fragment shader for the four fragments in the quad.
  */
-static void
+static boolean
 shade_quad(struct quad_stage *qs, struct quad_header *quad)
 {
    struct quad_shade_stage *qss = quad_shade_stage( qs );
    struct llvmpipe_context *llvmpipe = qs->llvmpipe;
    struct tgsi_exec_machine *machine = qss->machine;
    boolean z_written;
-   
-   /* Consts do not require 16 byte alignment. */
-   machine->Consts = llvmpipe->mapped_constants[PIPE_SHADER_FRAGMENT];
-
-   machine->InterpCoefs = quad->coef;
 
    /* run shader */
    quad->inout.mask &= llvmpipe->fs->run( llvmpipe->fs, machine, quad );
+   if (quad->inout.mask == 0)
+      return FALSE;
 
    /* store outputs */
    z_written = FALSE;
@@ -129,11 +126,34 @@ shade_quad(struct quad_stage *qs, struct quad_header *quad)
       quad->output.depth[3] = z0 + dzdx + dzdy;
    }
 
-   /* shader may cull fragments */
-   if (quad->inout.mask) {
-      qs->next->run( qs->next, quad );
+   return TRUE;
+}
+
+static void
+shade_quads(struct quad_stage *qs, 
+                 struct quad_header *quads[],
+                 unsigned nr)
+{
+   struct quad_shade_stage *qss = quad_shade_stage( qs );
+   struct llvmpipe_context *llvmpipe = qs->llvmpipe;
+   struct tgsi_exec_machine *machine = qss->machine;
+
+   unsigned i, pass = 0;
+   
+   machine->Consts = llvmpipe->mapped_constants[PIPE_SHADER_FRAGMENT];
+   machine->InterpCoefs = quads[0]->coef;
+
+   for (i = 0; i < nr; i++) {
+      if (shade_quad(qs, quads[i]))
+         quads[pass++] = quads[i];
    }
+   
+   if (pass)
+      qs->next->run(qs->next, quads, pass);
 }
+   
+
+
 
 
 /**
@@ -174,7 +194,7 @@ lp_quad_shade_stage( struct llvmpipe_context *llvmpipe )
 
    qss->stage.llvmpipe = llvmpipe;
    qss->stage.begin = shade_begin;
-   qss->stage.run = shade_quad;
+   qss->stage.run = shade_quads;
    qss->stage.destroy = shade_destroy;
 
    qss->machine = tgsi_exec_machine_create();
diff --git a/src/gallium/drivers/llvmpipe/lp_quad_occlusion.c b/src/gallium/drivers/llvmpipe/lp_quad_occlusion.c
index 6441ca30f21..c4d5b86d424 100644
--- a/src/gallium/drivers/llvmpipe/lp_quad_occlusion.c
+++ b/src/gallium/drivers/llvmpipe/lp_quad_occlusion.c
@@ -50,13 +50,15 @@ static unsigned count_bits( unsigned val )
 }
 
 static void
-occlusion_count_quad(struct quad_stage *qs, struct quad_header *quad)
+occlusion_count_quads(struct quad_stage *qs, struct quad_header *quads[], unsigned nr)
 {
    struct llvmpipe_context *llvmpipe = qs->llvmpipe;
+   unsigned i;
 
-   llvmpipe->occlusion_count += count_bits(quad->inout.mask);
+   for (i = 0; i < nr; i++)
+      llvmpipe->occlusion_count += count_bits(quads[i]->inout.mask);
 
-   qs->next->run(qs->next, quad);
+   qs->next->run(qs->next, quads, nr);
 }
 
 
@@ -78,7 +80,7 @@ struct quad_stage *lp_quad_occlusion_stage( struct llvmpipe_context *llvmpipe )
 
    stage->llvmpipe = llvmpipe;
    stage->begin = occlusion_begin;
-   stage->run = occlusion_count_quad;
+   stage->run = occlusion_count_quads;
    stage->destroy = occlusion_destroy;
 
    return stage;
diff --git a/src/gallium/drivers/llvmpipe/lp_quad_output.c b/src/gallium/drivers/llvmpipe/lp_quad_output.c
index d344b4e3a72..07cc8408485 100644
--- a/src/gallium/drivers/llvmpipe/lp_quad_output.c
+++ b/src/gallium/drivers/llvmpipe/lp_quad_output.c
@@ -38,11 +38,8 @@
  * taking mask into account.
  */
 static void
-output_quad(struct quad_stage *qs, struct quad_header *quad)
+output_quad(struct quad_stage *qs, struct quad_header *quads[], unsigned nr)
 {
-   /* in-tile pos: */
-   const int itx = quad->input.x0 % TILE_SIZE;
-   const int ity = quad->input.y0 % TILE_SIZE;
 
    struct llvmpipe_context *llvmpipe = qs->llvmpipe;
    uint cbuf;
@@ -51,25 +48,35 @@ output_quad(struct quad_stage *qs, struct quad_header *quad)
    for (cbuf = 0; cbuf < llvmpipe->framebuffer.nr_cbufs; cbuf++) {
       struct llvmpipe_cached_tile *tile
          = lp_get_cached_tile(llvmpipe->cbuf_cache[cbuf],
-                              quad->input.x0, quad->input.y0);
-      float (*quadColor)[4] = quad->output.color[cbuf];
-      int i, j;
+                              quads[0]->input.x0, 
+                              quads[0]->input.y0);
+      int i, j, q;
 
       /* get/swizzle dest colors */
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (quad->inout.mask & (1 << j)) {
-            int x = itx + (j & 1);
-            int y = ity + (j >> 1);
-            for (i = 0; i < 4; i++) { /* loop over color chans */
-               tile->data.color[y][x][i] = quadColor[i][j];
-            }
-            if (0) {
-               debug_printf("lp write pixel %d,%d: %g, %g, %g\n",
-                            quad->input.x0 + x,
-                            quad->input.y0 + y,
-                            quadColor[0][j],
-                            quadColor[1][j],
-                            quadColor[2][j]);
+      for (q = 0; q < nr; q++) {
+         struct quad_header *quad = quads[q];
+         float (*quadColor)[4] = quad->output.color[cbuf];
+
+         /* in-tile pos: */
+         const int itx = quad->input.x0 % TILE_SIZE;
+         const int ity = quad->input.y0 % TILE_SIZE;
+
+         
+         for (j = 0; j < QUAD_SIZE; j++) {
+            if (quad->inout.mask & (1 << j)) {
+               int x = itx + (j & 1);
+               int y = ity + (j >> 1);
+               for (i = 0; i < 4; i++) { /* loop over color chans */
+                  tile->data.color[y][x][i] = quadColor[i][j];
+               }
+               if (0) {
+                  debug_printf("lp write pixel %d,%d: %g, %g, %g\n",
+                               quad->input.x0 + x,
+                               quad->input.y0 + y,
+                               quadColor[0][j],
+                               quadColor[1][j],
+                               quadColor[2][j]);
+               }
             }
          }
       }
diff --git a/src/gallium/drivers/llvmpipe/lp_quad_pipe.c b/src/gallium/drivers/llvmpipe/lp_quad_pipe.c
index d738d08d9e2..60ec31eaf39 100644
--- a/src/gallium/drivers/llvmpipe/lp_quad_pipe.c
+++ b/src/gallium/drivers/llvmpipe/lp_quad_pipe.c
@@ -55,50 +55,52 @@ void
 lp_build_quad_pipeline(struct llvmpipe_context *lp)
 {
    boolean early_depth_test =
-               lp->depth_stencil->depth.enabled &&
-               lp->framebuffer.zsbuf &&
-               !lp->depth_stencil->alpha.enabled &&
-               !lp->fs->info.uses_kill &&
-               !lp->fs->info.writes_z;
+      lp->depth_stencil->depth.enabled &&
+      lp->framebuffer.zsbuf &&
+      !lp->depth_stencil->alpha.enabled &&
+      !lp->fs->info.uses_kill &&
+      !lp->fs->info.writes_z;
 
    /* build up the pipeline in reverse order... */
-      lp->quad.first = lp->quad.output;
-
-      if (lp->blend->colormask != 0xf) {
-         lp_push_quad_first( lp, lp->quad.colormask );
-      }
-
-      if (lp->blend->blend_enable ||
-          lp->blend->logicop_enable) {
-         lp_push_quad_first( lp, lp->quad.blend );
-      }
-
-      if (lp->active_query_count) {
-         lp_push_quad_first( lp, lp->quad.occlusion );
-      }
-
-      if (lp->rasterizer->poly_smooth ||
-          lp->rasterizer->line_smooth ||
-          lp->rasterizer->point_smooth) {
-         lp_push_quad_first( lp, lp->quad.coverage );
-      }
-
-      if (!early_depth_test) {
-         lp_build_depth_stencil( lp );
-      }
-
-      if (lp->depth_stencil->alpha.enabled) {
-         lp_push_quad_first( lp, lp->quad.alpha_test );
-      }
-
-      /* XXX always enable shader? */
-      if (1) {
-         lp_push_quad_first( lp, lp->quad.shade );
-      }
-
-      if (early_depth_test) {
-         lp_build_depth_stencil( lp );
-         lp_push_quad_first( lp, lp->quad.earlyz );
-      }
+
+   /* Color combine
+    */
+   lp->quad.first = lp->quad.output;
+
+   if (lp->blend->colormask != 0xf) {
+      lp_push_quad_first( lp, lp->quad.colormask );
+   }
+
+   if (lp->blend->blend_enable ||
+       lp->blend->logicop_enable) {
+      lp_push_quad_first( lp, lp->quad.blend );
+   }
+
+   if (lp->rasterizer->poly_smooth ||
+       lp->rasterizer->line_smooth ||
+       lp->rasterizer->point_smooth) {
+      lp_push_quad_first( lp, lp->quad.coverage );
+   }
+
+   /* Shade/Depth/Stencil/Alpha
+    */
+   if (lp->active_query_count) {
+      lp_push_quad_first( lp, lp->quad.occlusion );
+   }
+
+   if (!early_depth_test) {
+      lp_build_depth_stencil( lp );
+   }
+
+   if (lp->depth_stencil->alpha.enabled) {
+      lp_push_quad_first( lp, lp->quad.alpha_test );
+   }
+
+   lp_push_quad_first( lp, lp->quad.shade );
+
+   if (early_depth_test) {
+      lp_build_depth_stencil( lp );
+      lp_push_quad_first( lp, lp->quad.earlyz );
+   }
 }
 
diff --git a/src/gallium/drivers/llvmpipe/lp_quad_pipe.h b/src/gallium/drivers/llvmpipe/lp_quad_pipe.h
index 4c3efdee69c..5c8c7b3a737 100644
--- a/src/gallium/drivers/llvmpipe/lp_quad_pipe.h
+++ b/src/gallium/drivers/llvmpipe/lp_quad_pipe.h
@@ -49,7 +49,7 @@ struct quad_stage {
    void (*begin)(struct quad_stage *qs);
 
    /** the stage action */
-   void (*run)(struct quad_stage *qs, struct quad_header *quad);
+   void (*run)(struct quad_stage *qs, struct quad_header *quad[], unsigned nr);
 
    void (*destroy)(struct quad_stage *qs);
 };
@@ -69,6 +69,6 @@ struct quad_stage *lp_quad_output_stage( struct llvmpipe_context *llvmpipe );
 
 void lp_build_quad_pipeline(struct llvmpipe_context *lp);
 
-void lp_depth_test_quad(struct quad_stage *qs, struct quad_header *quad);
+boolean lp_depth_test_quad(struct quad_stage *qs, struct quad_header *quad);
 
 #endif /* LP_QUAD_PIPE_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_quad_stencil.c b/src/gallium/drivers/llvmpipe/lp_quad_stencil.c
index 229f0d054d7..0acfa7cb68a 100644
--- a/src/gallium/drivers/llvmpipe/lp_quad_stencil.c
+++ b/src/gallium/drivers/llvmpipe/lp_quad_stencil.c
@@ -198,7 +198,8 @@ apply_stencil_op(ubyte stencilVals[QUAD_SIZE],
  * depth testing.
  */
 static void
-stencil_test_quad(struct quad_stage *qs, struct quad_header *quad)
+stencil_test_quad(struct quad_stage *qs, struct quad_header *quads[],
+                  unsigned nr)
 {
    struct llvmpipe_context *llvmpipe = qs->llvmpipe;
    struct pipe_surface *ps = llvmpipe->framebuffer.zsbuf;
@@ -206,9 +207,12 @@ stencil_test_quad(struct quad_stage *qs, struct quad_header *quad)
    ubyte ref, wrtMask, valMask;
    ubyte stencilVals[QUAD_SIZE];
    struct llvmpipe_cached_tile *tile
-      = lp_get_cached_tile(llvmpipe->zsbuf_cache, quad->input.x0, quad->input.y0);
-   uint j;
-   uint face = quad->input.facing;
+      = lp_get_cached_tile(llvmpipe->zsbuf_cache, 
+                           quads[0]->input.x0, 
+                           quads[0]->input.y0);
+   uint face = quads[0]->input.facing;
+   uint pass = 0;
+   uint j, q;
 
    if (!llvmpipe->depth_stencil->stencil[1].enabled) {
       /* single-sided stencil test, use front (face=0) state */
@@ -227,103 +231,110 @@ stencil_test_quad(struct quad_stage *qs, struct quad_header *quad)
 
    assert(ps); /* shouldn't get here if there's no stencil buffer */
 
-   /* get stencil values from cached tile */
-   switch (ps->format) {
-   case PIPE_FORMAT_S8Z24_UNORM:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         int x = quad->input.x0 % TILE_SIZE + (j & 1);
-         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
-         stencilVals[j] = tile->data.depth32[y][x] >> 24;
-      }
-      break;
-   case PIPE_FORMAT_Z24S8_UNORM:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         int x = quad->input.x0 % TILE_SIZE + (j & 1);
-         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
-         stencilVals[j] = tile->data.depth32[y][x] & 0xff;
-      }
-      break;
-   case PIPE_FORMAT_S8_UNORM:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         int x = quad->input.x0 % TILE_SIZE + (j & 1);
-         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
-         stencilVals[j] = tile->data.stencil8[y][x];
+   for (q = 0; q < nr; q++) {
+      struct quad_header *quad = quads[q];
+
+      /* get stencil values from cached tile */
+      switch (ps->format) {
+      case PIPE_FORMAT_S8Z24_UNORM:
+         for (j = 0; j < QUAD_SIZE; j++) {
+            int x = quad->input.x0 % TILE_SIZE + (j & 1);
+            int y = quad->input.y0 % TILE_SIZE + (j >> 1);
+            stencilVals[j] = tile->data.depth32[y][x] >> 24;
+         }
+         break;
+      case PIPE_FORMAT_Z24S8_UNORM:
+         for (j = 0; j < QUAD_SIZE; j++) {
+            int x = quad->input.x0 % TILE_SIZE + (j & 1);
+            int y = quad->input.y0 % TILE_SIZE + (j >> 1);
+            stencilVals[j] = tile->data.depth32[y][x] & 0xff;
+         }
+         break;
+      case PIPE_FORMAT_S8_UNORM:
+         for (j = 0; j < QUAD_SIZE; j++) {
+            int x = quad->input.x0 % TILE_SIZE + (j & 1);
+            int y = quad->input.y0 % TILE_SIZE + (j >> 1);
+            stencilVals[j] = tile->data.stencil8[y][x];
+         }
+         break;
+      default:
+         assert(0);
       }
-      break;
-   default:
-      assert(0);
-   }
 
-   /* do the stencil test first */
-   {
-      unsigned passMask, failMask;
-      passMask = do_stencil_test(stencilVals, func, ref, valMask);
-      failMask = quad->inout.mask & ~passMask;
-      quad->inout.mask &= passMask;
+      /* do the stencil test first */
+      {
+         unsigned passMask, failMask;
+         passMask = do_stencil_test(stencilVals, func, ref, valMask);
+         failMask = quad->inout.mask & ~passMask;
+         quad->inout.mask &= passMask;
 
-      if (failOp != PIPE_STENCIL_OP_KEEP) {
-         apply_stencil_op(stencilVals, failMask, failOp, ref, wrtMask);
+         if (failOp != PIPE_STENCIL_OP_KEEP) {
+            apply_stencil_op(stencilVals, failMask, failOp, ref, wrtMask);
+         }
       }
-   }
 
-   if (quad->inout.mask) {
-      /* now the pixels that passed the stencil test are depth tested */
-      if (llvmpipe->depth_stencil->depth.enabled) {
-         const unsigned origMask = quad->inout.mask;
+      if (quad->inout.mask) {
+         /* now the pixels that passed the stencil test are depth tested */
+         if (llvmpipe->depth_stencil->depth.enabled) {
+            const unsigned origMask = quad->inout.mask;
 
-         lp_depth_test_quad(qs, quad);  /* quad->mask is updated */
+            lp_depth_test_quad(qs, quad);  /* quad->mask is updated */
 
-         /* update stencil buffer values according to z pass/fail result */
-         if (zFailOp != PIPE_STENCIL_OP_KEEP) {
-            const unsigned failMask = origMask & ~quad->inout.mask;
-            apply_stencil_op(stencilVals, failMask, zFailOp, ref, wrtMask);
-         }
+            /* update stencil buffer values according to z pass/fail result */
+            if (zFailOp != PIPE_STENCIL_OP_KEEP) {
+               const unsigned failMask = origMask & ~quad->inout.mask;
+               apply_stencil_op(stencilVals, failMask, zFailOp, ref, wrtMask);
+            }
 
-         if (zPassOp != PIPE_STENCIL_OP_KEEP) {
-            const unsigned passMask = origMask & quad->inout.mask;
-            apply_stencil_op(stencilVals, passMask, zPassOp, ref, wrtMask);
+            if (zPassOp != PIPE_STENCIL_OP_KEEP) {
+               const unsigned passMask = origMask & quad->inout.mask;
+               apply_stencil_op(stencilVals, passMask, zPassOp, ref, wrtMask);
+            }
+         }
+         else {
+            /* no depth test, apply Zpass operator to stencil buffer values */
+            apply_stencil_op(stencilVals, quad->inout.mask, zPassOp, ref, wrtMask);
          }
-      }
-      else {
-         /* no depth test, apply Zpass operator to stencil buffer values */
-         apply_stencil_op(stencilVals, quad->inout.mask, zPassOp, ref, wrtMask);
-      }
-
-   }
 
-   /* put new stencil values into cached tile */
-   switch (ps->format) {
-   case PIPE_FORMAT_S8Z24_UNORM:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         int x = quad->input.x0 % TILE_SIZE + (j & 1);
-         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
-         uint s8z24 = tile->data.depth32[y][x];
-         s8z24 = (stencilVals[j] << 24) | (s8z24 & 0xffffff);
-         tile->data.depth32[y][x] = s8z24;
       }
-      break;
-   case PIPE_FORMAT_Z24S8_UNORM:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         int x = quad->input.x0 % TILE_SIZE + (j & 1);
-         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
-         uint z24s8 = tile->data.depth32[y][x];
-         z24s8 = (z24s8 & 0xffffff00) | stencilVals[j];
-         tile->data.depth32[y][x] = z24s8;
-      }
-      break;
-   case PIPE_FORMAT_S8_UNORM:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         int x = quad->input.x0 % TILE_SIZE + (j & 1);
-         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
-         tile->data.stencil8[y][x] = stencilVals[j];
+
+      /* put new stencil values into cached tile */
+      switch (ps->format) {
+      case PIPE_FORMAT_S8Z24_UNORM:
+         for (j = 0; j < QUAD_SIZE; j++) {
+            int x = quad->input.x0 % TILE_SIZE + (j & 1);
+            int y = quad->input.y0 % TILE_SIZE + (j >> 1);
+            uint s8z24 = tile->data.depth32[y][x];
+            s8z24 = (stencilVals[j] << 24) | (s8z24 & 0xffffff);
+            tile->data.depth32[y][x] = s8z24;
+         }
+         break;
+      case PIPE_FORMAT_Z24S8_UNORM:
+         for (j = 0; j < QUAD_SIZE; j++) {
+            int x = quad->input.x0 % TILE_SIZE + (j & 1);
+            int y = quad->input.y0 % TILE_SIZE + (j >> 1);
+            uint z24s8 = tile->data.depth32[y][x];
+            z24s8 = (z24s8 & 0xffffff00) | stencilVals[j];
+            tile->data.depth32[y][x] = z24s8;
+         }
+         break;
+      case PIPE_FORMAT_S8_UNORM:
+         for (j = 0; j < QUAD_SIZE; j++) {
+            int x = quad->input.x0 % TILE_SIZE + (j & 1);
+            int y = quad->input.y0 % TILE_SIZE + (j >> 1);
+            tile->data.stencil8[y][x] = stencilVals[j];
+         }
+         break;
+      default:
+         assert(0);
       }
-      break;
-   default:
-      assert(0);
+
+      if (quad->inout.mask)
+         quads[pass++] = quad;   /* compact the surviving quad pointer (not its index q) to the front */
    }
 
-   if (quad->inout.mask)
-      qs->next->run(qs->next, quad);
+   if (pass)
+      qs->next->run(qs->next, quads, pass);
 }
 
 
diff --git a/src/gallium/drivers/llvmpipe/lp_quad_stipple.c b/src/gallium/drivers/llvmpipe/lp_quad_stipple.c
index 616394619a5..429a2185406 100644
--- a/src/gallium/drivers/llvmpipe/lp_quad_stipple.c
+++ b/src/gallium/drivers/llvmpipe/lp_quad_stipple.c
@@ -14,40 +14,55 @@
  * Apply polygon stipple to quads produced by triangle rasterization
  */
 static void
-stipple_quad(struct quad_stage *qs, struct quad_header *quad)
+stipple_quad(struct quad_stage *qs, struct quad_header *quads[], unsigned nr)
 {
    static const uint bit31 = 1 << 31;
    static const uint bit30 = 1 << 30;
+   /* count of quads passed downstream; all of them unless stippling runs */
+   unsigned pass = nr;
 
-   if (quad->input.prim == QUAD_PRIM_TRI) {
+   /* NOTE(review): only quads[0] is inspected; this assumes nr >= 1 and that
+    * every quad in the batch has the same prim type -- confirm with callers.
+    */
+   if (quads[0]->input.prim == QUAD_PRIM_TRI) {
       struct llvmpipe_context *llvmpipe = qs->llvmpipe;
-      /* need to invert Y to index into OpenGL's stipple pattern */
-      const int col0 = quad->input.x0 % 32;
-      const int y0 = quad->input.y0;
-      const int y1 = y0 + 1;
-      const uint stipple0 = llvmpipe->poly_stipple.stipple[y0 % 32];
-      const uint stipple1 = llvmpipe->poly_stipple.stipple[y1 % 32];
+      unsigned q;
 
-      /* turn off quad mask bits that fail the stipple test */
-      if ((stipple0 & (bit31 >> col0)) == 0)
-         quad->inout.mask &= ~MASK_TOP_LEFT;
+      pass = 0;
 
-      if ((stipple0 & (bit30 >> col0)) == 0)
-         quad->inout.mask &= ~MASK_TOP_RIGHT;
+      for (q = 0; q < nr; q++)  {
+         struct quad_header *quad = quads[q];
 
-      if ((stipple1 & (bit31 >> col0)) == 0)
-         quad->inout.mask &= ~MASK_BOTTOM_LEFT;
+         const int col0 = quad->input.x0 % 32;
+         const int y0 = quad->input.y0;
+         const int y1 = y0 + 1;
+         const uint stipple0 = llvmpipe->poly_stipple.stipple[y0 % 32];
+         const uint stipple1 = llvmpipe->poly_stipple.stipple[y1 % 32];
 
-      if ((stipple1 & (bit30 >> col0)) == 0)
-         quad->inout.mask &= ~MASK_BOTTOM_RIGHT;
+         /* turn off quad mask bits that fail the stipple test */
+         if ((stipple0 & (bit31 >> col0)) == 0)
+            quad->inout.mask &= ~MASK_TOP_LEFT;
 
-      if (!quad->inout.mask) {
-         /* all fragments failed stipple test, end of quad pipeline */
-         return;
+         if ((stipple0 & (bit30 >> col0)) == 0)
+            quad->inout.mask &= ~MASK_TOP_RIGHT;
+
+         if ((stipple1 & (bit31 >> col0)) == 0)
+            quad->inout.mask &= ~MASK_BOTTOM_LEFT;
+
+         if ((stipple1 & (bit30 >> col0)) == 0)
+            quad->inout.mask &= ~MASK_BOTTOM_RIGHT;
+
+         /* keep only quads with live fragments, packed at the front */
+         if (quad->inout.mask)
+            quads[pass++] = quad;
       }
    }
 
-   qs->next->run(qs->next, quad);
+   /* Skip the downstream stages entirely when no quad survived; the
+    * single-quad code likewise returned early for a fully masked quad.
+    */
+   if (pass)
+      qs->next->run(qs->next, quads, pass);
 }
 
 
diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c b/src/gallium/drivers/llvmpipe/lp_setup.c
index 783f36bc7f0..9a15a0d32bb 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup.c
@@ -172,7 +172,7 @@ clip_emit_quad( struct setup_context *setup, struct quad_header *quad )
    if (quad->inout.mask) {
       struct llvmpipe_context *lp = setup->llvmpipe;
 
-      lp->quad.first->run( lp->quad.first, quad );
+      lp->quad.first->run( lp->quad.first, &quad, 1 );
    }
 }
 
@@ -193,7 +193,7 @@ emit_quad( struct setup_context *setup, struct quad_header *quad, uint thread )
    if (mask & 4) setup->numFragsEmitted++;
    if (mask & 8) setup->numFragsEmitted++;
 #endif
-   lp->quad.first->run( lp->quad.first, quad );
+   lp->quad.first->run( lp->quad.first, &quad, 1 );
 #if DEBUG_FRAGS
    mask = quad->inout.mask;
    if (mask & 1) setup->numFragsWritten++;
-- 
2.30.2