radeonsi: use shader_info::cs::local_size_variable to clean up some code
[mesa.git] / src / gallium / drivers / softpipe / sp_quad_blend.c
index 3b8c2d5789c8eb87d30e28c0dd2362d0f9e366ef..975a760118f901c27557d83fe898634c4f51148e 100644 (file)
@@ -1,6 +1,6 @@
 /**************************************************************************
  * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * Copyright 2007 VMware, Inc.
  * All Rights Reserved.
  * 
  * Permission is hereby granted, free of charge, to any person obtaining a
@@ -18,7 +18,7 @@
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "pipe/p_defines.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
+#include "util/format/u_format.h"
+#include "util/u_dual_blend.h"
 #include "sp_context.h"
+#include "sp_state.h"
 #include "sp_quad.h"
 #include "sp_tile_cache.h"
 #include "sp_quad_pipe.h"
 
 
+enum format   /* format class recorded per color buffer in blend_quad_stage::base_format */
+{
+   RGBA,
+   RGB,
+   LUMINANCE,
+   LUMINANCE_ALPHA,
+   INTENSITY   /* NOTE(review): names suggest GL-style base formats; how each class is treated is not visible in this hunk -- confirm */
+};
+
+
+/**
+ * Subclass of quad_stage.
+ *
+ * The quad_stage base is the first member, so a pointer to the base can be
+ * cast back to this struct (see the blend_quad_stage() cast wrapper).
+ * Every array below has PIPE_MAX_COLOR_BUFS entries, i.e. one slot per
+ * color buffer.
+ */
+struct blend_quad_stage
+{
+   struct quad_stage base;  /**< must stay first: the cast wrapper relies on it */
+   boolean clamp[PIPE_MAX_COLOR_BUFS];  /**< clamp colors to [0,1]? */
+   enum format base_format[PIPE_MAX_COLOR_BUFS];  /**< format class of each color buffer */
+   enum util_format_type format_type[PIPE_MAX_COLOR_BUFS];  /**< NOTE(review): presumably the util_format type of each buffer; where it is filled in is outside this view */
+};
+
+
+/**
+ * Cast wrapper: reinterpret a generic quad_stage pointer as a
+ * blend_quad_stage pointer.
+ *
+ * This is a plain pointer cast with no checking; the caller must pass a
+ * stage that really is the 'base' member of a blend_quad_stage.
+ */
+static inline struct blend_quad_stage *
+blend_quad_stage(struct quad_stage *stage)
+{
+   return (struct blend_quad_stage *) stage;
+}
+
+
 #define VEC4_COPY(DST, SRC) \
 do { \
     DST[0] = SRC[0]; \
@@ -207,7 +238,7 @@ logicop_quad(struct quad_stage *qs,
          res4[j] = ~0;
       break;
    default:
-      assert(0);
+      assert(0 && "invalid logicop mode");
    }
 
    for (j = 0; j < 4; j++) {
@@ -220,20 +251,31 @@ logicop_quad(struct quad_stage *qs,
 
 
 
+/**
+ * Do blending for a 2x2 quad for one color buffer.
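+ * \param quadColor  the incoming quad colors
+ * \param quadColor2  the second set of incoming quad colors, read by the SRC1/INV_SRC1 blend factors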
+ * \param dest  the destination/framebuffer quad colors
+ * \param const_blend_color  the constant blend color
+ * \param blend_index  which set of blending terms to use
+ */
 static void
 blend_quad(struct quad_stage *qs, 
            float (*quadColor)[4],
-           float (*dest)[4])
+           float (*quadColor2)[4],
+           float (*dest)[4],
+           const float const_blend_color[4],
+           unsigned blend_index)
 {
    static const float zero[4] = { 0, 0, 0, 0 };
    static const float one[4] = { 1, 1, 1, 1 };
    struct softpipe_context *softpipe = qs->softpipe;
-   float source[4][QUAD_SIZE] = { { 0 } };
+   float source[4][TGSI_QUAD_SIZE] = { { 0 } };
+   float blend_dest[4][TGSI_QUAD_SIZE];
 
    /*
     * Compute src/first term RGB
     */
-   switch (softpipe->blend->rgb_src_factor) {
+   switch (softpipe->blend->rt[blend_index].rgb_src_factor) {
    case PIPE_BLENDFACTOR_ONE:
       VEC4_COPY(source[0], quadColor[0]); /* R */
       VEC4_COPY(source[1], quadColor[1]); /* G */
@@ -245,62 +287,69 @@ blend_quad(struct quad_stage *qs,
       VEC4_MUL(source[2], quadColor[2], quadColor[2]); /* B */
       break;
    case PIPE_BLENDFACTOR_SRC_ALPHA:
-   {
-      const float *alpha = quadColor[3];
-      VEC4_MUL(source[0], quadColor[0], alpha); /* R */
-      VEC4_MUL(source[1], quadColor[1], alpha); /* G */
-      VEC4_MUL(source[2], quadColor[2], alpha); /* B */
-   }
-   break;
+      {
+         const float *alpha = quadColor[3];
+         VEC4_MUL(source[0], quadColor[0], alpha); /* R */
+         VEC4_MUL(source[1], quadColor[1], alpha); /* G */
+         VEC4_MUL(source[2], quadColor[2], alpha); /* B */
+      }
+      break;
    case PIPE_BLENDFACTOR_DST_COLOR:
       VEC4_MUL(source[0], quadColor[0], dest[0]); /* R */
       VEC4_MUL(source[1], quadColor[1], dest[1]); /* G */
       VEC4_MUL(source[2], quadColor[2], dest[2]); /* B */
       break;
    case PIPE_BLENDFACTOR_DST_ALPHA:
-   {
-      const float *alpha = dest[3];
-      VEC4_MUL(source[0], quadColor[0], alpha); /* R */
-      VEC4_MUL(source[1], quadColor[1], alpha); /* G */
-      VEC4_MUL(source[2], quadColor[2], alpha); /* B */
-   }
-   break;
+      {
+         const float *alpha = dest[3];
+         VEC4_MUL(source[0], quadColor[0], alpha); /* R */
+         VEC4_MUL(source[1], quadColor[1], alpha); /* G */
+         VEC4_MUL(source[2], quadColor[2], alpha); /* B */
+      } 
+      break;
    case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
-   {
-      const float *alpha = quadColor[3];
-      float diff[4], temp[4];
-      VEC4_SUB(diff, one, dest[3]);
-      VEC4_MIN(temp, alpha, diff);
-      VEC4_MUL(source[0], quadColor[0], temp); /* R */
-      VEC4_MUL(source[1], quadColor[1], temp); /* G */
-      VEC4_MUL(source[2], quadColor[2], temp); /* B */
-   }
-   break;
+      {
+         const float *alpha = quadColor[3];
+         float diff[4], temp[4];
+         VEC4_SUB(diff, one, dest[3]);
+         VEC4_MIN(temp, alpha, diff);
+         VEC4_MUL(source[0], quadColor[0], temp); /* R */
+         VEC4_MUL(source[1], quadColor[1], temp); /* G */
+         VEC4_MUL(source[2], quadColor[2], temp); /* B */
+      }
+      break;
    case PIPE_BLENDFACTOR_CONST_COLOR:
-   {
-      float comp[4];
-      VEC4_SCALAR(comp, softpipe->blend_color.color[0]); /* R */
-      VEC4_MUL(source[0], quadColor[0], comp); /* R */
-      VEC4_SCALAR(comp, softpipe->blend_color.color[1]); /* G */
-      VEC4_MUL(source[1], quadColor[1], comp); /* G */
-      VEC4_SCALAR(comp, softpipe->blend_color.color[2]); /* B */
-      VEC4_MUL(source[2], quadColor[2], comp); /* B */
-   }
-   break;
+      {
+         float comp[4];
+         VEC4_SCALAR(comp, const_blend_color[0]); /* R */
+         VEC4_MUL(source[0], quadColor[0], comp); /* R */
+         VEC4_SCALAR(comp, const_blend_color[1]); /* G */
+         VEC4_MUL(source[1], quadColor[1], comp); /* G */
+         VEC4_SCALAR(comp, const_blend_color[2]); /* B */
+         VEC4_MUL(source[2], quadColor[2], comp); /* B */
+      }
+      break;
    case PIPE_BLENDFACTOR_CONST_ALPHA:
-   {
-      float alpha[4];
-      VEC4_SCALAR(alpha, softpipe->blend_color.color[3]);
-      VEC4_MUL(source[0], quadColor[0], alpha); /* R */
-      VEC4_MUL(source[1], quadColor[1], alpha); /* G */
-      VEC4_MUL(source[2], quadColor[2], alpha); /* B */
-   }
-   break;
+      {
+         float alpha[4];
+         VEC4_SCALAR(alpha, const_blend_color[3]);
+         VEC4_MUL(source[0], quadColor[0], alpha); /* R */
+         VEC4_MUL(source[1], quadColor[1], alpha); /* G */
+         VEC4_MUL(source[2], quadColor[2], alpha); /* B */
+      }
+      break;
    case PIPE_BLENDFACTOR_SRC1_COLOR:
-      assert(0); /* to do */
+      VEC4_MUL(source[0], quadColor[0], quadColor2[0]); /* R */
+      VEC4_MUL(source[1], quadColor[1], quadColor2[1]); /* G */
+      VEC4_MUL(source[2], quadColor[2], quadColor2[2]); /* B */         
       break;
    case PIPE_BLENDFACTOR_SRC1_ALPHA:
-      assert(0); /* to do */
+      {
+         const float *alpha = quadColor2[3];
+         VEC4_MUL(source[0], quadColor[0], alpha); /* R */
+         VEC4_MUL(source[1], quadColor[1], alpha); /* G */
+         VEC4_MUL(source[2], quadColor[2], alpha); /* B */
+      }
       break;
    case PIPE_BLENDFACTOR_ZERO:
       VEC4_COPY(source[0], zero); /* R */
@@ -308,93 +357,107 @@ blend_quad(struct quad_stage *qs,
       VEC4_COPY(source[2], zero); /* B */
       break;
    case PIPE_BLENDFACTOR_INV_SRC_COLOR:
-   {
-      float inv_comp[4];
-      VEC4_SUB(inv_comp, one, quadColor[0]); /* R */
-      VEC4_MUL(source[0], quadColor[0], inv_comp); /* R */
-      VEC4_SUB(inv_comp, one, quadColor[1]); /* G */
-      VEC4_MUL(source[1], quadColor[1], inv_comp); /* G */
-      VEC4_SUB(inv_comp, one, quadColor[2]); /* B */
-      VEC4_MUL(source[2], quadColor[2], inv_comp); /* B */
-   }
-   break;
+      {
+         float inv_comp[4];
+         VEC4_SUB(inv_comp, one, quadColor[0]); /* R */
+         VEC4_MUL(source[0], quadColor[0], inv_comp); /* R */
+         VEC4_SUB(inv_comp, one, quadColor[1]); /* G */
+         VEC4_MUL(source[1], quadColor[1], inv_comp); /* G */
+         VEC4_SUB(inv_comp, one, quadColor[2]); /* B */
+         VEC4_MUL(source[2], quadColor[2], inv_comp); /* B */
+      }
+      break;
    case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
-   {
-      float inv_alpha[4];
-      VEC4_SUB(inv_alpha, one, quadColor[3]);
-      VEC4_MUL(source[0], quadColor[0], inv_alpha); /* R */
-      VEC4_MUL(source[1], quadColor[1], inv_alpha); /* G */
-      VEC4_MUL(source[2], quadColor[2], inv_alpha); /* B */
-   }
-   break;
+      {
+         float inv_alpha[4];
+         VEC4_SUB(inv_alpha, one, quadColor[3]);
+         VEC4_MUL(source[0], quadColor[0], inv_alpha); /* R */
+         VEC4_MUL(source[1], quadColor[1], inv_alpha); /* G */
+         VEC4_MUL(source[2], quadColor[2], inv_alpha); /* B */
+      }
+      break;
    case PIPE_BLENDFACTOR_INV_DST_ALPHA:
-   {
-      float inv_alpha[4];
-      VEC4_SUB(inv_alpha, one, dest[3]);
-      VEC4_MUL(source[0], quadColor[0], inv_alpha); /* R */
-      VEC4_MUL(source[1], quadColor[1], inv_alpha); /* G */
-      VEC4_MUL(source[2], quadColor[2], inv_alpha); /* B */
-   }
-   break;
+      {
+         float inv_alpha[4];
+         VEC4_SUB(inv_alpha, one, dest[3]);
+         VEC4_MUL(source[0], quadColor[0], inv_alpha); /* R */
+         VEC4_MUL(source[1], quadColor[1], inv_alpha); /* G */
+         VEC4_MUL(source[2], quadColor[2], inv_alpha); /* B */
+      }
+      break;
    case PIPE_BLENDFACTOR_INV_DST_COLOR:
-   {
-      float inv_comp[4];
-      VEC4_SUB(inv_comp, one, dest[0]); /* R */
-      VEC4_MUL(source[0], quadColor[0], inv_comp); /* R */
-      VEC4_SUB(inv_comp, one, dest[1]); /* G */
-      VEC4_MUL(source[1], quadColor[1], inv_comp); /* G */
-      VEC4_SUB(inv_comp, one, dest[2]); /* B */
-      VEC4_MUL(source[2], quadColor[2], inv_comp); /* B */
-   }
-   break;
+      {
+         float inv_comp[4];
+         VEC4_SUB(inv_comp, one, dest[0]); /* R */
+         VEC4_MUL(source[0], quadColor[0], inv_comp); /* R */
+         VEC4_SUB(inv_comp, one, dest[1]); /* G */
+         VEC4_MUL(source[1], quadColor[1], inv_comp); /* G */
+         VEC4_SUB(inv_comp, one, dest[2]); /* B */
+         VEC4_MUL(source[2], quadColor[2], inv_comp); /* B */
+      }
+      break;
    case PIPE_BLENDFACTOR_INV_CONST_COLOR:
-   {
-      float inv_comp[4];
-      /* R */
-      VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[0]);
-      VEC4_MUL(source[0], quadColor[0], inv_comp);
-      /* G */
-      VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[1]);
-      VEC4_MUL(source[1], quadColor[1], inv_comp);
-      /* B */
-      VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[2]);
-      VEC4_MUL(source[2], quadColor[2], inv_comp);
-   }
-   break;
+      {
+         float inv_comp[4];
+         /* R */
+         VEC4_SCALAR(inv_comp, 1.0f - const_blend_color[0]);
+         VEC4_MUL(source[0], quadColor[0], inv_comp);
+         /* G */
+         VEC4_SCALAR(inv_comp, 1.0f - const_blend_color[1]);
+         VEC4_MUL(source[1], quadColor[1], inv_comp);
+         /* B */
+         VEC4_SCALAR(inv_comp, 1.0f - const_blend_color[2]);
+         VEC4_MUL(source[2], quadColor[2], inv_comp);
+      }
+      break;
    case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
-   {
-      float inv_alpha[4];
-      VEC4_SCALAR(inv_alpha, 1.0f - softpipe->blend_color.color[3]);
-      VEC4_MUL(source[0], quadColor[0], inv_alpha); /* R */
-      VEC4_MUL(source[1], quadColor[1], inv_alpha); /* G */
-      VEC4_MUL(source[2], quadColor[2], inv_alpha); /* B */
-   }
-   break;
+      {
+         float inv_alpha[4];
+         VEC4_SCALAR(inv_alpha, 1.0f - const_blend_color[3]);
+         VEC4_MUL(source[0], quadColor[0], inv_alpha); /* R */
+         VEC4_MUL(source[1], quadColor[1], inv_alpha); /* G */
+         VEC4_MUL(source[2], quadColor[2], inv_alpha); /* B */
+      }
+      break;
    case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
-      assert(0); /* to do */
+      {
+         float inv_comp[4];
+         VEC4_SUB(inv_comp, one, quadColor2[0]); /* R */
+         VEC4_MUL(source[0], quadColor[0], inv_comp); /* R */
+         VEC4_SUB(inv_comp, one, quadColor2[1]); /* G */
+         VEC4_MUL(source[1], quadColor[1], inv_comp); /* G */
+         VEC4_SUB(inv_comp, one, quadColor2[2]); /* B */
+         VEC4_MUL(source[2], quadColor[2], inv_comp); /* B */
+      }
       break;
    case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
-      assert(0); /* to do */
+      {
+         float inv_alpha[4];
+         VEC4_SUB(inv_alpha, one, quadColor2[3]);
+         VEC4_MUL(source[0], quadColor[0], inv_alpha); /* R */
+         VEC4_MUL(source[1], quadColor[1], inv_alpha); /* G */
+         VEC4_MUL(source[2], quadColor[2], inv_alpha); /* B */
+      }
       break;
    default:
-      assert(0);
+      assert(0 && "invalid rgb src factor");
    }
 
    /*
     * Compute src/first term A
     */
-   switch (softpipe->blend->alpha_src_factor) {
+   switch (softpipe->blend->rt[blend_index].alpha_src_factor) {
    case PIPE_BLENDFACTOR_ONE:
       VEC4_COPY(source[3], quadColor[3]); /* A */
       break;
    case PIPE_BLENDFACTOR_SRC_COLOR:
       /* fall-through */
    case PIPE_BLENDFACTOR_SRC_ALPHA:
-   {
-      const float *alpha = quadColor[3];
-      VEC4_MUL(source[3], quadColor[3], alpha); /* A */
-   }
-   break;
+      {
+         const float *alpha = quadColor[3];
+         VEC4_MUL(source[3], quadColor[3], alpha); /* A */
+      }
+      break;
    case PIPE_BLENDFACTOR_DST_COLOR:
       /* fall-through */
    case PIPE_BLENDFACTOR_DST_ALPHA:
@@ -407,386 +470,550 @@ blend_quad(struct quad_stage *qs,
    case PIPE_BLENDFACTOR_CONST_COLOR:
       /* fall-through */
    case PIPE_BLENDFACTOR_CONST_ALPHA:
-   {
-      float comp[4];
-      VEC4_SCALAR(comp, softpipe->blend_color.color[3]); /* A */
-      VEC4_MUL(source[3], quadColor[3], comp); /* A */
-   }
-   break;
+      {
+         float comp[4];
+         VEC4_SCALAR(comp, const_blend_color[3]); /* A */
+         VEC4_MUL(source[3], quadColor[3], comp); /* A */
+      }
+      break;
    case PIPE_BLENDFACTOR_ZERO:
       VEC4_COPY(source[3], zero); /* A */
       break;
    case PIPE_BLENDFACTOR_INV_SRC_COLOR:
       /* fall-through */
    case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
-   {
-      float inv_alpha[4];
-      VEC4_SUB(inv_alpha, one, quadColor[3]);
-      VEC4_MUL(source[3], quadColor[3], inv_alpha); /* A */
-   }
-   break;
+      {
+         float inv_alpha[4];
+         VEC4_SUB(inv_alpha, one, quadColor[3]);
+         VEC4_MUL(source[3], quadColor[3], inv_alpha); /* A */
+      }
+      break;
    case PIPE_BLENDFACTOR_INV_DST_COLOR:
       /* fall-through */
    case PIPE_BLENDFACTOR_INV_DST_ALPHA:
-   {
-      float inv_alpha[4];
-      VEC4_SUB(inv_alpha, one, dest[3]);
-      VEC4_MUL(source[3], quadColor[3], inv_alpha); /* A */
-   }
-   break;
+      {
+         float inv_alpha[4];
+         VEC4_SUB(inv_alpha, one, dest[3]);
+         VEC4_MUL(source[3], quadColor[3], inv_alpha); /* A */
+      }
+      break;
    case PIPE_BLENDFACTOR_INV_CONST_COLOR:
       /* fall-through */
    case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
-   {
-      float inv_comp[4];
-      /* A */
-      VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[3]);
-      VEC4_MUL(source[3], quadColor[3], inv_comp);
-   }
-   break;
+      {
+         float inv_comp[4];
+         /* A */
+         VEC4_SCALAR(inv_comp, 1.0f - const_blend_color[3]);
+         VEC4_MUL(source[3], quadColor[3], inv_comp);
+      }
+      break;
+   case PIPE_BLENDFACTOR_SRC1_COLOR:
+      /* fall-through */
+   case PIPE_BLENDFACTOR_SRC1_ALPHA:
+      {
+         const float *alpha = quadColor2[3];
+         VEC4_MUL(source[3], quadColor[3], alpha); /* A */
+      }
+      break;
+   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+      /* fall-through */
+   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+      {
+         float inv_alpha[4];
+         VEC4_SUB(inv_alpha, one, quadColor2[3]);
+         VEC4_MUL(source[3], quadColor[3], inv_alpha); /* A */
+      }
+      break;
    default:
-      assert(0);
+      assert(0 && "invalid alpha src factor");
    }
 
+   /* Save the original dest for use in masking */
+   VEC4_COPY(blend_dest[0], dest[0]);
+   VEC4_COPY(blend_dest[1], dest[1]);
+   VEC4_COPY(blend_dest[2], dest[2]);
+   VEC4_COPY(blend_dest[3], dest[3]);
+
 
    /*
-    * Compute dest/second term RGB
+    * Compute blend_dest/second term RGB
     */
-   switch (softpipe->blend->rgb_dst_factor) {
+   switch (softpipe->blend->rt[blend_index].rgb_dst_factor) {
    case PIPE_BLENDFACTOR_ONE:
-      /* dest = dest * 1   NO-OP, leave dest as-is */
+      /* blend_dest = blend_dest * 1   NO-OP, leave blend_dest as-is */
       break;
    case PIPE_BLENDFACTOR_SRC_COLOR:
-      VEC4_MUL(dest[0], dest[0], quadColor[0]); /* R */
-      VEC4_MUL(dest[1], dest[1], quadColor[1]); /* G */
-      VEC4_MUL(dest[2], dest[2], quadColor[2]); /* B */
+      VEC4_MUL(blend_dest[0], blend_dest[0], quadColor[0]); /* R */
+      VEC4_MUL(blend_dest[1], blend_dest[1], quadColor[1]); /* G */
+      VEC4_MUL(blend_dest[2], blend_dest[2], quadColor[2]); /* B */
       break;
    case PIPE_BLENDFACTOR_SRC_ALPHA:
-      VEC4_MUL(dest[0], dest[0], quadColor[3]); /* R * A */
-      VEC4_MUL(dest[1], dest[1], quadColor[3]); /* G * A */
-      VEC4_MUL(dest[2], dest[2], quadColor[3]); /* B * A */
+      VEC4_MUL(blend_dest[0], blend_dest[0], quadColor[3]); /* R * A */
+      VEC4_MUL(blend_dest[1], blend_dest[1], quadColor[3]); /* G * A */
+      VEC4_MUL(blend_dest[2], blend_dest[2], quadColor[3]); /* B * A */
       break;
    case PIPE_BLENDFACTOR_DST_ALPHA:
-      VEC4_MUL(dest[0], dest[0], dest[3]); /* R * A */
-      VEC4_MUL(dest[1], dest[1], dest[3]); /* G * A */
-      VEC4_MUL(dest[2], dest[2], dest[3]); /* B * A */
+      VEC4_MUL(blend_dest[0], blend_dest[0], blend_dest[3]); /* R * A */
+      VEC4_MUL(blend_dest[1], blend_dest[1], blend_dest[3]); /* G * A */
+      VEC4_MUL(blend_dest[2], blend_dest[2], blend_dest[3]); /* B * A */
       break;
    case PIPE_BLENDFACTOR_DST_COLOR:
-      VEC4_MUL(dest[0], dest[0], dest[0]); /* R */
-      VEC4_MUL(dest[1], dest[1], dest[1]); /* G */
-      VEC4_MUL(dest[2], dest[2], dest[2]); /* B */
+      VEC4_MUL(blend_dest[0], blend_dest[0], blend_dest[0]); /* R */
+      VEC4_MUL(blend_dest[1], blend_dest[1], blend_dest[1]); /* G */
+      VEC4_MUL(blend_dest[2], blend_dest[2], blend_dest[2]); /* B */
       break;
    case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
-   {
-      const float *alpha = quadColor[3];
-      float diff[4], temp[4];
-      VEC4_SUB(diff, one, dest[3]);
-      VEC4_MIN(temp, alpha, diff);
-      VEC4_MUL(dest[0], quadColor[0], temp); /* R */
-      VEC4_MUL(dest[1], quadColor[1], temp); /* G */
-      VEC4_MUL(dest[2], quadColor[2], temp); /* B */
-   }
+      {
+         const float *alpha = quadColor[3];
+         float diff[4], temp[4];
+         VEC4_SUB(diff, one, blend_dest[3]);
+         VEC4_MIN(temp, alpha, diff);
+         VEC4_MUL(blend_dest[0], blend_dest[0], temp); /* R */
+         VEC4_MUL(blend_dest[1], blend_dest[1], temp); /* G */
+         VEC4_MUL(blend_dest[2], blend_dest[2], temp); /* B */
+      }
       break;
    case PIPE_BLENDFACTOR_CONST_COLOR:
-   {
-      float comp[4];
-      VEC4_SCALAR(comp, softpipe->blend_color.color[0]); /* R */
-      VEC4_MUL(dest[0], dest[0], comp); /* R */
-      VEC4_SCALAR(comp, softpipe->blend_color.color[1]); /* G */
-      VEC4_MUL(dest[1], dest[1], comp); /* G */
-      VEC4_SCALAR(comp, softpipe->blend_color.color[2]); /* B */
-      VEC4_MUL(dest[2], dest[2], comp); /* B */
-   }
-   break;
+      {
+         float comp[4];
+         VEC4_SCALAR(comp, const_blend_color[0]); /* R */
+         VEC4_MUL(blend_dest[0], blend_dest[0], comp); /* R */
+         VEC4_SCALAR(comp, const_blend_color[1]); /* G */
+         VEC4_MUL(blend_dest[1], blend_dest[1], comp); /* G */
+         VEC4_SCALAR(comp, const_blend_color[2]); /* B */
+         VEC4_MUL(blend_dest[2], blend_dest[2], comp); /* B */
+      }
+      break;
    case PIPE_BLENDFACTOR_CONST_ALPHA:
-   {
-      float comp[4];
-      VEC4_SCALAR(comp, softpipe->blend_color.color[3]); /* A */
-      VEC4_MUL(dest[0], dest[0], comp); /* R */
-      VEC4_MUL(dest[1], dest[1], comp); /* G */
-      VEC4_MUL(dest[2], dest[2], comp); /* B */
-   }
-   break;
+      {
+         float comp[4];
+         VEC4_SCALAR(comp, const_blend_color[3]); /* A */
+         VEC4_MUL(blend_dest[0], blend_dest[0], comp); /* R */
+         VEC4_MUL(blend_dest[1], blend_dest[1], comp); /* G */
+         VEC4_MUL(blend_dest[2], blend_dest[2], comp); /* B */
+      }
+      break;
    case PIPE_BLENDFACTOR_ZERO:
-      VEC4_COPY(dest[0], zero); /* R */
-      VEC4_COPY(dest[1], zero); /* G */
-      VEC4_COPY(dest[2], zero); /* B */
+      VEC4_COPY(blend_dest[0], zero); /* R */
+      VEC4_COPY(blend_dest[1], zero); /* G */
+      VEC4_COPY(blend_dest[2], zero); /* B */
       break;
    case PIPE_BLENDFACTOR_SRC1_COLOR:
+      VEC4_MUL(blend_dest[0], blend_dest[0], quadColor2[0]); /* R */
+      VEC4_MUL(blend_dest[1], blend_dest[1], quadColor2[1]); /* G */
+      VEC4_MUL(blend_dest[2], blend_dest[2], quadColor2[2]); /* B */
+      break;
    case PIPE_BLENDFACTOR_SRC1_ALPHA:
-      /* XXX what are these? */
-      assert(0);
+      VEC4_MUL(blend_dest[0], blend_dest[0], quadColor2[3]); /* R * A */
+      VEC4_MUL(blend_dest[1], blend_dest[1], quadColor2[3]); /* G * A */
+      VEC4_MUL(blend_dest[2], blend_dest[2], quadColor2[3]); /* B * A */
       break;
    case PIPE_BLENDFACTOR_INV_SRC_COLOR:
-   {
-      float inv_comp[4];
-      VEC4_SUB(inv_comp, one, quadColor[0]); /* R */
-      VEC4_MUL(dest[0], inv_comp, dest[0]); /* R */
-      VEC4_SUB(inv_comp, one, quadColor[1]); /* G */
-      VEC4_MUL(dest[1], inv_comp, dest[1]); /* G */
-      VEC4_SUB(inv_comp, one, quadColor[2]); /* B */
-      VEC4_MUL(dest[2], inv_comp, dest[2]); /* B */
-   }
-   break;
+      {
+         float inv_comp[4];
+         VEC4_SUB(inv_comp, one, quadColor[0]); /* R */
+         VEC4_MUL(blend_dest[0], inv_comp, blend_dest[0]); /* R */
+         VEC4_SUB(inv_comp, one, quadColor[1]); /* G */
+         VEC4_MUL(blend_dest[1], inv_comp, blend_dest[1]); /* G */
+         VEC4_SUB(inv_comp, one, quadColor[2]); /* B */
+         VEC4_MUL(blend_dest[2], inv_comp, blend_dest[2]); /* B */
+      }
+      break;
    case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
-   {
-      float one_minus_alpha[QUAD_SIZE];
-      VEC4_SUB(one_minus_alpha, one, quadColor[3]);
-      VEC4_MUL(dest[0], dest[0], one_minus_alpha); /* R */
-      VEC4_MUL(dest[1], dest[1], one_minus_alpha); /* G */
-      VEC4_MUL(dest[2], dest[2], one_minus_alpha); /* B */
-   }
-   break;
+      {
+         float one_minus_alpha[TGSI_QUAD_SIZE];
+         VEC4_SUB(one_minus_alpha, one, quadColor[3]);
+         VEC4_MUL(blend_dest[0], blend_dest[0], one_minus_alpha); /* R */
+         VEC4_MUL(blend_dest[1], blend_dest[1], one_minus_alpha); /* G */
+         VEC4_MUL(blend_dest[2], blend_dest[2], one_minus_alpha); /* B */
+      }
+      break;
    case PIPE_BLENDFACTOR_INV_DST_ALPHA:
-   {
-      float inv_comp[4];
-      VEC4_SUB(inv_comp, one, dest[3]); /* A */
-      VEC4_MUL(dest[0], inv_comp, dest[0]); /* R */
-      VEC4_MUL(dest[1], inv_comp, dest[1]); /* G */
-      VEC4_MUL(dest[2], inv_comp, dest[2]); /* B */
-   }
-   break;
+      {
+         float inv_comp[4];
+         VEC4_SUB(inv_comp, one, blend_dest[3]); /* A */
+         VEC4_MUL(blend_dest[0], inv_comp, blend_dest[0]); /* R */
+         VEC4_MUL(blend_dest[1], inv_comp, blend_dest[1]); /* G */
+         VEC4_MUL(blend_dest[2], inv_comp, blend_dest[2]); /* B */
+      }
+      break;
    case PIPE_BLENDFACTOR_INV_DST_COLOR:
-   {
-      float inv_comp[4];
-      VEC4_SUB(inv_comp, one, dest[0]); /* R */
-      VEC4_MUL(dest[0], dest[0], inv_comp); /* R */
-      VEC4_SUB(inv_comp, one, dest[1]); /* G */
-      VEC4_MUL(dest[1], dest[1], inv_comp); /* G */
-      VEC4_SUB(inv_comp, one, dest[2]); /* B */
-      VEC4_MUL(dest[2], dest[2], inv_comp); /* B */
-   }
-   break;
+      {
+         float inv_comp[4];
+         VEC4_SUB(inv_comp, one, blend_dest[0]); /* R */
+         VEC4_MUL(blend_dest[0], blend_dest[0], inv_comp); /* R */
+         VEC4_SUB(inv_comp, one, blend_dest[1]); /* G */
+         VEC4_MUL(blend_dest[1], blend_dest[1], inv_comp); /* G */
+         VEC4_SUB(inv_comp, one, blend_dest[2]); /* B */
+         VEC4_MUL(blend_dest[2], blend_dest[2], inv_comp); /* B */
+      }
+      break;
    case PIPE_BLENDFACTOR_INV_CONST_COLOR:
-   {
-      float inv_comp[4];
-      /* R */
-      VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[0]);
-      VEC4_MUL(dest[0], dest[0], inv_comp);
-      /* G */
-      VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[1]);
-      VEC4_MUL(dest[1], dest[1], inv_comp);
-      /* B */
-      VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[2]);
-      VEC4_MUL(dest[2], dest[2], inv_comp);
-   }
-   break;
+      {
+         float inv_comp[4];
+         /* R */
+         VEC4_SCALAR(inv_comp, 1.0f - const_blend_color[0]);
+         VEC4_MUL(blend_dest[0], blend_dest[0], inv_comp);
+         /* G */
+         VEC4_SCALAR(inv_comp, 1.0f - const_blend_color[1]);
+         VEC4_MUL(blend_dest[1], blend_dest[1], inv_comp);
+         /* B */
+         VEC4_SCALAR(inv_comp, 1.0f - const_blend_color[2]);
+         VEC4_MUL(blend_dest[2], blend_dest[2], inv_comp);
+      }
+      break;
    case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
-   {
-      float inv_comp[4];
-      VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[3]);
-      VEC4_MUL(dest[0], dest[0], inv_comp);
-      VEC4_MUL(dest[1], dest[1], inv_comp);
-      VEC4_MUL(dest[2], dest[2], inv_comp);
-   }
-   break;
+      {
+         float inv_comp[4];
+         VEC4_SCALAR(inv_comp, 1.0f - const_blend_color[3]);
+         VEC4_MUL(blend_dest[0], blend_dest[0], inv_comp);
+         VEC4_MUL(blend_dest[1], blend_dest[1], inv_comp);
+         VEC4_MUL(blend_dest[2], blend_dest[2], inv_comp);
+      }
+      break;
    case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+      {
+         float inv_comp[4];
+         VEC4_SUB(inv_comp, one, quadColor2[0]); /* R */
+         VEC4_MUL(blend_dest[0], inv_comp, blend_dest[0]); /* R */
+         VEC4_SUB(inv_comp, one, quadColor2[1]); /* G */
+         VEC4_MUL(blend_dest[1], inv_comp, blend_dest[1]); /* G */
+         VEC4_SUB(inv_comp, one, quadColor2[2]); /* B */
+         VEC4_MUL(blend_dest[2], inv_comp, blend_dest[2]); /* B */
+      }
+      break;
    case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
-      /* XXX what are these? */
-      assert(0);
+      {
+         float one_minus_alpha[TGSI_QUAD_SIZE];
+         VEC4_SUB(one_minus_alpha, one, quadColor2[3]);
+         VEC4_MUL(blend_dest[0], blend_dest[0], one_minus_alpha); /* R */
+         VEC4_MUL(blend_dest[1], blend_dest[1], one_minus_alpha); /* G */
+         VEC4_MUL(blend_dest[2], blend_dest[2], one_minus_alpha); /* B */
+      }
       break;
    default:
-      assert(0);
+      assert(0 && "invalid rgb dst factor");
    }
 
    /*
-    * Compute dest/second term A
+    * Compute blend_dest/second term A
     */
-   switch (softpipe->blend->alpha_dst_factor) {
+   switch (softpipe->blend->rt[blend_index].alpha_dst_factor) {
    case PIPE_BLENDFACTOR_ONE:
-      /* dest = dest * 1   NO-OP, leave dest as-is */
+      /* blend_dest = blend_dest * 1   NO-OP, leave blend_dest as-is */
       break;
    case PIPE_BLENDFACTOR_SRC_COLOR:
       /* fall-through */
    case PIPE_BLENDFACTOR_SRC_ALPHA:
-      VEC4_MUL(dest[3], dest[3], quadColor[3]); /* A * A */
+      VEC4_MUL(blend_dest[3], blend_dest[3], quadColor[3]); /* A * A */
       break;
    case PIPE_BLENDFACTOR_DST_COLOR:
       /* fall-through */
    case PIPE_BLENDFACTOR_DST_ALPHA:
-      VEC4_MUL(dest[3], dest[3], dest[3]); /* A */
+      VEC4_MUL(blend_dest[3], blend_dest[3], blend_dest[3]); /* A */
       break;
    case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
-      /* dest = dest * 1   NO-OP, leave dest as-is */
+      /* blend_dest = blend_dest * 1   NO-OP, leave blend_dest as-is */
       break;
    case PIPE_BLENDFACTOR_CONST_COLOR:
       /* fall-through */
    case PIPE_BLENDFACTOR_CONST_ALPHA:
-   {
-      float comp[4];
-      VEC4_SCALAR(comp, softpipe->blend_color.color[3]); /* A */
-      VEC4_MUL(dest[3], dest[3], comp); /* A */
-   }
-   break;
+      {
+         float comp[4];
+         VEC4_SCALAR(comp, const_blend_color[3]); /* A */
+         VEC4_MUL(blend_dest[3], blend_dest[3], comp); /* A */
+      }
+      break;
    case PIPE_BLENDFACTOR_ZERO:
-      VEC4_COPY(dest[3], zero); /* A */
+      VEC4_COPY(blend_dest[3], zero); /* A */
       break;
    case PIPE_BLENDFACTOR_INV_SRC_COLOR:
       /* fall-through */
    case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
-   {
-      float one_minus_alpha[QUAD_SIZE];
-      VEC4_SUB(one_minus_alpha, one, quadColor[3]);
-      VEC4_MUL(dest[3], dest[3], one_minus_alpha); /* A */
-   }
-   break;
+      {
+         float one_minus_alpha[TGSI_QUAD_SIZE];
+         VEC4_SUB(one_minus_alpha, one, quadColor[3]);
+         VEC4_MUL(blend_dest[3], blend_dest[3], one_minus_alpha); /* A */
+      }
+      break;
    case PIPE_BLENDFACTOR_INV_DST_COLOR:
       /* fall-through */
    case PIPE_BLENDFACTOR_INV_DST_ALPHA:
-   {
-      float inv_comp[4];
-      VEC4_SUB(inv_comp, one, dest[3]); /* A */
-      VEC4_MUL(dest[3], inv_comp, dest[3]); /* A */
-   }
-   break;
+      {
+         float inv_comp[4];
+         VEC4_SUB(inv_comp, one, blend_dest[3]); /* A */
+         VEC4_MUL(blend_dest[3], inv_comp, blend_dest[3]); /* A */
+      }
+      break;
    case PIPE_BLENDFACTOR_INV_CONST_COLOR:
       /* fall-through */
    case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
-   {
-      float inv_comp[4];
-      VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[3]);
-      VEC4_MUL(dest[3], dest[3], inv_comp);
-   }
-   break;
+      {
+         float inv_comp[4];
+         VEC4_SCALAR(inv_comp, 1.0f - const_blend_color[3]);
+         VEC4_MUL(blend_dest[3], blend_dest[3], inv_comp);
+      }
+      break;
+   case PIPE_BLENDFACTOR_SRC1_COLOR:
+      /* fall-through */
+   case PIPE_BLENDFACTOR_SRC1_ALPHA:
+      VEC4_MUL(blend_dest[3], blend_dest[3], quadColor2[3]); /* A * A */
+      break;
+   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+      /* fall-through */
+   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+      {
+         float one_minus_alpha[TGSI_QUAD_SIZE];
+         VEC4_SUB(one_minus_alpha, one, quadColor2[3]);
+         VEC4_MUL(blend_dest[3], blend_dest[3], one_minus_alpha); /* A */
+      }
+      break;
    default:
-      assert(0);
+      assert(0 && "invalid alpha dst factor");
    }
 
    /*
     * Combine RGB terms
     */
-   switch (softpipe->blend->rgb_func) {
+   switch (softpipe->blend->rt[blend_index].rgb_func) {
    case PIPE_BLEND_ADD:
-      VEC4_ADD_SAT(quadColor[0], source[0], dest[0]); /* R */
-      VEC4_ADD_SAT(quadColor[1], source[1], dest[1]); /* G */
-      VEC4_ADD_SAT(quadColor[2], source[2], dest[2]); /* B */
+      VEC4_ADD(quadColor[0], source[0], blend_dest[0]); /* R */
+      VEC4_ADD(quadColor[1], source[1], blend_dest[1]); /* G */
+      VEC4_ADD(quadColor[2], source[2], blend_dest[2]); /* B */
       break;
    case PIPE_BLEND_SUBTRACT:
-      VEC4_SUB_SAT(quadColor[0], source[0], dest[0]); /* R */
-      VEC4_SUB_SAT(quadColor[1], source[1], dest[1]); /* G */
-      VEC4_SUB_SAT(quadColor[2], source[2], dest[2]); /* B */
+      VEC4_SUB(quadColor[0], source[0], blend_dest[0]); /* R */
+      VEC4_SUB(quadColor[1], source[1], blend_dest[1]); /* G */
+      VEC4_SUB(quadColor[2], source[2], blend_dest[2]); /* B */
       break;
    case PIPE_BLEND_REVERSE_SUBTRACT:
-      VEC4_SUB_SAT(quadColor[0], dest[0], source[0]); /* R */
-      VEC4_SUB_SAT(quadColor[1], dest[1], source[1]); /* G */
-      VEC4_SUB_SAT(quadColor[2], dest[2], source[2]); /* B */
+      VEC4_SUB(quadColor[0], blend_dest[0], source[0]); /* R */
+      VEC4_SUB(quadColor[1], blend_dest[1], source[1]); /* G */
+      VEC4_SUB(quadColor[2], blend_dest[2], source[2]); /* B */
       break;
    case PIPE_BLEND_MIN:
-      VEC4_MIN(quadColor[0], source[0], dest[0]); /* R */
-      VEC4_MIN(quadColor[1], source[1], dest[1]); /* G */
-      VEC4_MIN(quadColor[2], source[2], dest[2]); /* B */
+      VEC4_MIN(quadColor[0], source[0], blend_dest[0]); /* R */
+      VEC4_MIN(quadColor[1], source[1], blend_dest[1]); /* G */
+      VEC4_MIN(quadColor[2], source[2], blend_dest[2]); /* B */
       break;
    case PIPE_BLEND_MAX:
-      VEC4_MAX(quadColor[0], source[0], dest[0]); /* R */
-      VEC4_MAX(quadColor[1], source[1], dest[1]); /* G */
-      VEC4_MAX(quadColor[2], source[2], dest[2]); /* B */
+      VEC4_MAX(quadColor[0], source[0], blend_dest[0]); /* R */
+      VEC4_MAX(quadColor[1], source[1], blend_dest[1]); /* G */
+      VEC4_MAX(quadColor[2], source[2], blend_dest[2]); /* B */
       break;
    default:
-      assert(0);
+      assert(0 && "invalid rgb blend func");
    }
 
    /*
     * Combine A terms
     */
-   switch (softpipe->blend->alpha_func) {
+   switch (softpipe->blend->rt[blend_index].alpha_func) {
    case PIPE_BLEND_ADD:
-      VEC4_ADD_SAT(quadColor[3], source[3], dest[3]); /* A */
+      VEC4_ADD(quadColor[3], source[3], blend_dest[3]); /* A */
       break;
    case PIPE_BLEND_SUBTRACT:
-      VEC4_SUB_SAT(quadColor[3], source[3], dest[3]); /* A */
+      VEC4_SUB(quadColor[3], source[3], blend_dest[3]); /* A */
       break;
    case PIPE_BLEND_REVERSE_SUBTRACT:
-      VEC4_SUB_SAT(quadColor[3], dest[3], source[3]); /* A */
+      VEC4_SUB(quadColor[3], blend_dest[3], source[3]); /* A */
       break;
    case PIPE_BLEND_MIN:
-      VEC4_MIN(quadColor[3], source[3], dest[3]); /* A */
+      VEC4_MIN(quadColor[3], source[3], blend_dest[3]); /* A */
       break;
    case PIPE_BLEND_MAX:
-      VEC4_MAX(quadColor[3], source[3], dest[3]); /* A */
+      VEC4_MAX(quadColor[3], source[3], blend_dest[3]); /* A */
       break;
    default:
-      assert(0);
+      assert(0 && "invalid alpha blend func");
    }
 }
 
 static void
-colormask_quad(struct quad_stage *qs,
+colormask_quad(unsigned colormask,    /* PIPE_MASK_x bits of the writable channels */
                float (*quadColor)[4], /* in/out: quad colors, indexed [channel][pixel] */
                float (*dest)[4])      /* current framebuffer colors, same layout */
 {
-   struct softpipe_context *softpipe = qs->softpipe;
-
    /* R: a cleared mask bit means the channel must not change, so the
     * destination value is copied back over the fragment value.
     */
-   if (!(softpipe->blend->colormask & PIPE_MASK_R))
+   if (!(colormask & PIPE_MASK_R))
       COPY_4V(quadColor[0], dest[0]);
 
    /* G: same rule as R */
-   if (!(softpipe->blend->colormask & PIPE_MASK_G))
+   if (!(colormask & PIPE_MASK_G))
       COPY_4V(quadColor[1], dest[1]);
 
    /* B: same rule as R */
-   if (!(softpipe->blend->colormask & PIPE_MASK_B))
+   if (!(colormask & PIPE_MASK_B))
       COPY_4V(quadColor[2], dest[2]);
 
    /* A: same rule as R */
-   if (!(softpipe->blend->colormask & PIPE_MASK_A))
+   if (!(colormask & PIPE_MASK_A))
       COPY_4V(quadColor[3], dest[3]);
 }
 
 
+/**
+ * Clamp all colors in a quad to [0, 1]
+ *
+ * The values are clamped in place.
+ *
+ * quadColor: the quad's colors, indexed [channel][pixel]; all four
+ * channels of all TGSI_QUAD_SIZE pixels are visited.
+ */
+static void
+clamp_colors(float (*quadColor)[4])
+{
+   unsigned i, j;
+
+   for (i = 0; i < 4; i++) {                  /* color channels */
+      for (j = 0; j < TGSI_QUAD_SIZE; j++) {  /* pixels of the quad */
+         quadColor[i][j] = CLAMP(quadColor[i][j], 0.0F, 1.0F);
+      }
+   }
+}
+
+
+/**
+ * If we're drawing to a luminance, luminance/alpha or intensity surface
+ * we have to adjust (rebase) the fragment/quad colors before writing them
+ * to the tile cache.  The tile cache always stores RGBA colors but if
+ * we're caching a L/A surface (for example) we need to be sure that R=G=B
+ * so that subsequent reads from the surface cache appear to return L/A
+ * values.
+ * The piglit fbo-blending-formats test will exercise this.
+ *
+ * base_format: format class of the destination color buffer.  RGB,
+ * LUMINANCE, LUMINANCE_ALPHA and INTENSITY are rewritten as noted in each
+ * case below; any other value (i.e. RGBA) leaves the colors untouched.
+ *
+ * quadColor: the colors to fix up, in place, indexed [channel][pixel].
+ */
+static void
+rebase_colors(enum format base_format, float (*quadColor)[4])
+{
+   unsigned i;
+
+   switch (base_format) {
+   case RGB:
+      /* i is the pixel (second) index; the literal 4 is presumably
+       * TGSI_QUAD_SIZE -- TODO confirm
+       */
+      for (i = 0; i < 4; i++) {
+         /* A = 1 */
+         quadColor[3][i] = 1.0F;
+      }
+      break;
+   case LUMINANCE:
+      for (i = 0; i < 4; i++) { /* each pixel */
+         /* B = G = R */
+         quadColor[2][i] = quadColor[1][i] = quadColor[0][i];
+         /* A = 1 */
+         quadColor[3][i] = 1.0F;
+      }
+      break;
+   case LUMINANCE_ALPHA:
+      for (i = 0; i < 4; i++) { /* each pixel */
+         /* B = G = R; alpha is kept as computed */
+         quadColor[2][i] = quadColor[1][i] = quadColor[0][i];
+      }
+      break;
+   case INTENSITY:
+      for (i = 0; i < 4; i++) { /* each pixel */
+         /* A = B = G = R */
+         quadColor[3][i] = quadColor[2][i] = quadColor[1][i] = quadColor[0][i];
+      }
+      break;
+   default:
+      ; /* nothing */
+   }
+}
+
+/**
+ * General blend path: choose_blend_quad() installs this routine by default
+ * and only replaces it when one of the specialized routines applies.
+ *
+ * For every bound (non-NULL) color buffer and every quad this
+ *   - picks the source color(s) for the buffer,
+ *   - clamps the incoming colors if the buffer or the rasterizer asks for it,
+ *   - reads the destination colors from the cached tile,
+ *   - applies the logic op (skipped for float buffers) or blending,
+ *   - rebases the result for L/LA/I/RGB buffers,
+ *   - applies the color mask, and
+ *   - writes back the pixels enabled in the quad's coverage mask.
+ *
+ * qs:    the blend stage (a blend_quad_stage seen through its base)
+ * quads: the quads to process; the tile is looked up once per color buffer
+ *        from quads[0], so all quads are presumably in the same tile --
+ *        TODO confirm against the caller
+ * nr:    number of entries in quads
+ */
 static void
 blend_fallback(struct quad_stage *qs, 
                struct quad_header *quads[],
                unsigned nr)
 {
+   const struct blend_quad_stage *bqs = blend_quad_stage(qs);
    struct softpipe_context *softpipe = qs->softpipe;
    const struct pipe_blend_state *blend = softpipe->blend;
    unsigned cbuf;
-
-   for (cbuf = 0; cbuf < softpipe->framebuffer.nr_cbufs; cbuf++) 
-   {
-      float dest[4][QUAD_SIZE];
-      struct softpipe_cached_tile *tile
-         = sp_get_cached_tile(softpipe->cbuf_cache[cbuf],
-                              quads[0]->input.x0, 
-                              quads[0]->input.y0);
-      uint q, i, j;
-
-      for (q = 0; q < nr; q++) {
-         struct quad_header *quad = quads[q];
-         float (*quadColor)[4] = quad->output.color[cbuf];
-         const int itx = (quad->input.x0 & (TILE_SIZE-1));
-         const int ity = (quad->input.y0 & (TILE_SIZE-1));
-
-         /* get/swizzle dest colors 
-          */
-         for (j = 0; j < QUAD_SIZE; j++) {
-            int x = itx + (j & 1);
-            int y = ity + (j >> 1);
-            for (i = 0; i < 4; i++) {
-               dest[i][j] = tile->data.color[y][x][i];
+   boolean write_all =
+      softpipe->fs_variant->info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS];
+
+   for (cbuf = 0; cbuf < softpipe->framebuffer.nr_cbufs; cbuf++) {
+      if (softpipe->framebuffer.cbufs[cbuf]) {
+         /* which blend/mask state index to use: */
+         const uint blend_buf = blend->independent_blend_enable ? cbuf : 0;
+         float dest[4][TGSI_QUAD_SIZE];
+         struct softpipe_cached_tile *tile
+            = sp_get_cached_tile(softpipe->cbuf_cache[cbuf],
+                                 quads[0]->input.x0, 
+                                 quads[0]->input.y0, quads[0]->input.layer);
+         const boolean clamp = bqs->clamp[cbuf];
+         const float *blend_color;
+         const boolean dual_source_blend = util_blend_state_is_dual(blend, cbuf);
+         uint q, i, j;
+
+         /* use the pre-clamped constant color for buffers that clamp */
+         if (clamp)
+            blend_color = softpipe->blend_color_clamped.color;
+         else
+            blend_color = softpipe->blend_color.color;
+
+         for (q = 0; q < nr; q++) {
+            struct quad_header *quad = quads[q];
+            float (*quadColor)[4];
+            float (*quadColor2)[4] = NULL;
+            float temp_quad_color[TGSI_QUAD_SIZE][4];
+            const int itx = (quad->input.x0 & (TILE_SIZE-1));
+            const int ity = (quad->input.y0 & (TILE_SIZE-1));
+
+            if (write_all) {
+               /* Color output 0 feeds every buffer; work on a private copy
+                * so the in-place steps below don't alter output 0 for the
+                * next buffer.
+                * NOTE(review): quadColor2 stays NULL on this path even if
+                * dual_source_blend is set -- confirm that combination
+                * cannot occur.
+                */
+               for (j = 0; j < TGSI_QUAD_SIZE; j++) {
+                  for (i = 0; i < 4; i++) {
+                     temp_quad_color[i][j] = quad->output.color[0][i][j];
+                  }
+               }
+               quadColor = temp_quad_color;
+            } else {
+               quadColor = quad->output.color[cbuf];
+               if (dual_source_blend)
+                  quadColor2 = quad->output.color[cbuf + 1];
             }
-         }
 
+            /* If fixed-point dest color buffer, need to clamp the incoming
+             * fragment colors now.
+             */
+            if (clamp || softpipe->rasterizer->clamp_fragment_color) {
+               clamp_colors(quadColor);
+            }
 
-         if (blend->logicop_enable) {
-            logicop_quad( qs, quadColor, dest );
-         }
-         else if (blend->blend_enable) {
-            blend_quad( qs, quadColor, dest );
-         }
-
-         if (blend->colormask != 0xf)
-            colormask_quad( qs, quadColor, dest );
-   
-         /* Output color values
-          */
-         for (j = 0; j < QUAD_SIZE; j++) {
-            if (quad->inout.mask & (1 << j)) {
+            /* get/swizzle dest colors
+             */
+            for (j = 0; j < TGSI_QUAD_SIZE; j++) {
                int x = itx + (j & 1);
                int y = ity + (j >> 1);
-               for (i = 0; i < 4; i++) { /* loop over color chans */
-                  tile->data.color[y][x][i] = quadColor[i][j];
+               for (i = 0; i < 4; i++) {
+                  dest[i][j] = tile->data.color[y][x][i];
+               }
+            }
+
+
+            if (blend->logicop_enable) {
+               /* logic ops are not applied to float color buffers */
+               if (bqs->format_type[cbuf] != UTIL_FORMAT_TYPE_FLOAT) {
+                  logicop_quad( qs, quadColor, dest );
+               }
+            }
+            else if (blend->rt[blend_buf].blend_enable) {
+               blend_quad(qs, quadColor, quadColor2, dest, blend_color, blend_buf);
+
+               /* If fixed-point dest color buffer, need to clamp the outgoing
+                * fragment colors now.
+                */
+               if (clamp) {
+                  clamp_colors(quadColor);
+               }
+            }
+
+            rebase_colors(bqs->base_format[cbuf], quadColor);
+
+            /* The mask that is applied must be the one that was tested:
+             * index with blend_buf, not cbuf, so that rt[0] is used for
+             * every buffer when independent blending is disabled.
+             */
+            if (blend->rt[blend_buf].colormask != 0xf)
+               colormask_quad(blend->rt[blend_buf].colormask, quadColor, dest);
+
+            /* Output color values
+             */
+            for (j = 0; j < TGSI_QUAD_SIZE; j++) {
+               if (quad->inout.mask & (1 << j)) {
+                  int x = itx + (j & 1);
+                  int y = ity + (j >> 1);
+                  for (i = 0; i < 4; i++) { /* loop over color chans */
+                     tile->data.color[y][x][i] = quadColor[i][j];
+                  }
                }
             }
          }
@@ -800,16 +1027,17 @@ blend_single_add_src_alpha_inv_src_alpha(struct quad_stage *qs,
                                          struct quad_header *quads[],
                                          unsigned nr)
 {
+   const struct blend_quad_stage *bqs = blend_quad_stage(qs);
    static const float one[4] = { 1, 1, 1, 1 };
-   float one_minus_alpha[QUAD_SIZE];
-   float dest[4][QUAD_SIZE];
-   float source[4][QUAD_SIZE];
+   float one_minus_alpha[TGSI_QUAD_SIZE];
+   float dest[4][TGSI_QUAD_SIZE];
+   float source[4][TGSI_QUAD_SIZE];
    uint i, j, q;
 
    struct softpipe_cached_tile *tile
       = sp_get_cached_tile(qs->softpipe->cbuf_cache[0],
                            quads[0]->input.x0, 
-                           quads[0]->input.y0);
+                           quads[0]->input.y0, quads[0]->input.layer);
 
    for (q = 0; q < nr; q++) {
       struct quad_header *quad = quads[q];
@@ -819,7 +1047,7 @@ blend_single_add_src_alpha_inv_src_alpha(struct quad_stage *qs,
       const int ity = (quad->input.y0 & (TILE_SIZE-1));
       
       /* get/swizzle dest colors */
-      for (j = 0; j < QUAD_SIZE; j++) {
+      for (j = 0; j < TGSI_QUAD_SIZE; j++) {
          int x = itx + (j & 1);
          int y = ity + (j >> 1);
          for (i = 0; i < 4; i++) {
@@ -827,6 +1055,13 @@ blend_single_add_src_alpha_inv_src_alpha(struct quad_stage *qs,
          }
       }
 
+      /* If fixed-point dest color buffer, need to clamp the incoming
+       * fragment colors now.
+       */
+      if (bqs->clamp[0] || qs->softpipe->rasterizer->clamp_fragment_color) {
+         clamp_colors(quadColor);
+      }
+
       VEC4_MUL(source[0], quadColor[0], alpha); /* R */
       VEC4_MUL(source[1], quadColor[1], alpha); /* G */
       VEC4_MUL(source[2], quadColor[2], alpha); /* B */
@@ -836,14 +1071,23 @@ blend_single_add_src_alpha_inv_src_alpha(struct quad_stage *qs,
       VEC4_MUL(dest[0], dest[0], one_minus_alpha); /* R */
       VEC4_MUL(dest[1], dest[1], one_minus_alpha); /* G */
       VEC4_MUL(dest[2], dest[2], one_minus_alpha); /* B */
-      VEC4_MUL(dest[3], dest[3], one_minus_alpha); /* B */
+      VEC4_MUL(dest[3], dest[3], one_minus_alpha); /* A */
+
+      VEC4_ADD(quadColor[0], source[0], dest[0]); /* R */
+      VEC4_ADD(quadColor[1], source[1], dest[1]); /* G */
+      VEC4_ADD(quadColor[2], source[2], dest[2]); /* B */
+      VEC4_ADD(quadColor[3], source[3], dest[3]); /* A */
+
+      /* If fixed-point dest color buffer, need to clamp the outgoing
+       * fragment colors now.
+       */
+      if (bqs->clamp[0]) {
+         clamp_colors(quadColor);
+      }
 
-      VEC4_ADD_SAT(quadColor[0], source[0], dest[0]); /* R */
-      VEC4_ADD_SAT(quadColor[1], source[1], dest[1]); /* G */
-      VEC4_ADD_SAT(quadColor[2], source[2], dest[2]); /* B */
-      VEC4_ADD_SAT(quadColor[3], source[3], dest[3]); /* A */
+      rebase_colors(bqs->base_format[0], quadColor);
 
-      for (j = 0; j < QUAD_SIZE; j++) {
+      for (j = 0; j < TGSI_QUAD_SIZE; j++) {
          if (quad->inout.mask & (1 << j)) {
             int x = itx + (j & 1);
             int y = ity + (j >> 1);
@@ -860,13 +1104,14 @@ blend_single_add_one_one(struct quad_stage *qs,
                          struct quad_header *quads[],
                          unsigned nr)
 {
-   float dest[4][QUAD_SIZE];
+   const struct blend_quad_stage *bqs = blend_quad_stage(qs);
+   float dest[4][TGSI_QUAD_SIZE];
    uint i, j, q;
 
    struct softpipe_cached_tile *tile
       = sp_get_cached_tile(qs->softpipe->cbuf_cache[0],
                            quads[0]->input.x0, 
-                           quads[0]->input.y0);
+                           quads[0]->input.y0, quads[0]->input.layer);
 
    for (q = 0; q < nr; q++) {
       struct quad_header *quad = quads[q];
@@ -875,7 +1120,7 @@ blend_single_add_one_one(struct quad_stage *qs,
       const int ity = (quad->input.y0 & (TILE_SIZE-1));
       
       /* get/swizzle dest colors */
-      for (j = 0; j < QUAD_SIZE; j++) {
+      for (j = 0; j < TGSI_QUAD_SIZE; j++) {
          int x = itx + (j & 1);
          int y = ity + (j >> 1);
          for (i = 0; i < 4; i++) {
@@ -883,12 +1128,28 @@ blend_single_add_one_one(struct quad_stage *qs,
          }
       }
      
-      VEC4_ADD_SAT(quadColor[0], quadColor[0], dest[0]); /* R */
-      VEC4_ADD_SAT(quadColor[1], quadColor[1], dest[1]); /* G */
-      VEC4_ADD_SAT(quadColor[2], quadColor[2], dest[2]); /* B */
-      VEC4_ADD_SAT(quadColor[3], quadColor[3], dest[3]); /* A */
+      /* If fixed-point dest color buffer, need to clamp the incoming
+       * fragment colors now.
+       */
+      if (bqs->clamp[0] || qs->softpipe->rasterizer->clamp_fragment_color) {
+         clamp_colors(quadColor);
+      }
+
+      VEC4_ADD(quadColor[0], quadColor[0], dest[0]); /* R */
+      VEC4_ADD(quadColor[1], quadColor[1], dest[1]); /* G */
+      VEC4_ADD(quadColor[2], quadColor[2], dest[2]); /* B */
+      VEC4_ADD(quadColor[3], quadColor[3], dest[3]); /* A */
+
+      /* If fixed-point dest color buffer, need to clamp the outgoing
+       * fragment colors now.
+       */
+      if (bqs->clamp[0]) {
+         clamp_colors(quadColor);
+      }
+
+      rebase_colors(bqs->base_format[0], quadColor);
 
-      for (j = 0; j < QUAD_SIZE; j++) {
+      for (j = 0; j < TGSI_QUAD_SIZE; j++) {
          if (quad->inout.mask & (1 << j)) {
             int x = itx + (j & 1);
             int y = ity + (j >> 1);
@@ -901,25 +1162,37 @@ blend_single_add_one_one(struct quad_stage *qs,
 }
 
 
+/**
+ * Just copy the quad color to the framebuffer tile (respecting the writemask),
+ * for one color buffer.
+ * Clamping will be done, if needed (depending on the color buffer's
+ * datatype) when we write/pack the colors later.
+ */
 static void
 single_output_color(struct quad_stage *qs, 
                     struct quad_header *quads[],
                     unsigned nr)
 {
+   const struct blend_quad_stage *bqs = blend_quad_stage(qs);
    uint i, j, q;
 
    struct softpipe_cached_tile *tile
       = sp_get_cached_tile(qs->softpipe->cbuf_cache[0],
                            quads[0]->input.x0, 
-                           quads[0]->input.y0);
+                           quads[0]->input.y0, quads[0]->input.layer);
 
    for (q = 0; q < nr; q++) {
       struct quad_header *quad = quads[q];
       float (*quadColor)[4] = quad->output.color[0];
       const int itx = (quad->input.x0 & (TILE_SIZE-1));
       const int ity = (quad->input.y0 & (TILE_SIZE-1));
-      
-      for (j = 0; j < QUAD_SIZE; j++) {
+
+      if (qs->softpipe->rasterizer->clamp_fragment_color)
+         clamp_colors(quadColor);
+
+      rebase_colors(bqs->base_format[0], quadColor);
+
+      for (j = 0; j < TGSI_QUAD_SIZE; j++) {
          if (quad->inout.mask & (1 << j)) {
             int x = itx + (j & 1);
             int y = ity + (j >> 1);
@@ -944,8 +1217,10 @@ choose_blend_quad(struct quad_stage *qs,
                   struct quad_header *quads[],
                   unsigned nr)
 {
+   struct blend_quad_stage *bqs = blend_quad_stage(qs);
    struct softpipe_context *softpipe = qs->softpipe;
    const struct pipe_blend_state *blend = softpipe->blend;
+   unsigned i;
 
    qs->run = blend_fallback;
    
@@ -953,29 +1228,57 @@ choose_blend_quad(struct quad_stage *qs,
       qs->run = blend_noop;
    }
    else if (!softpipe->blend->logicop_enable &&
-            softpipe->blend->colormask == 0xf &&
+            softpipe->blend->rt[0].colormask == 0xf &&
             softpipe->framebuffer.nr_cbufs == 1)
    {
-      if (!blend->blend_enable) {
+      if (softpipe->framebuffer.cbufs[0] == NULL) {
+         qs->run = blend_noop;
+      }
+      else if (!blend->rt[0].blend_enable) {
          qs->run = single_output_color;
       }
-      else if (blend->rgb_src_factor == blend->alpha_src_factor &&
-               blend->rgb_dst_factor == blend->alpha_dst_factor &&
-               blend->rgb_func == blend->alpha_func)
+      else if (blend->rt[0].rgb_src_factor == blend->rt[0].alpha_src_factor &&
+               blend->rt[0].rgb_dst_factor == blend->rt[0].alpha_dst_factor &&
+               blend->rt[0].rgb_func == blend->rt[0].alpha_func)
       {
-         if (blend->alpha_func == PIPE_BLEND_ADD) {
-            if (blend->rgb_src_factor == PIPE_BLENDFACTOR_ONE &&
-                blend->rgb_dst_factor == PIPE_BLENDFACTOR_ONE) {
+         if (blend->rt[0].alpha_func == PIPE_BLEND_ADD) {
+            if (blend->rt[0].rgb_src_factor == PIPE_BLENDFACTOR_ONE &&
+                blend->rt[0].rgb_dst_factor == PIPE_BLENDFACTOR_ONE) {
                qs->run = blend_single_add_one_one;
             }
-            else if (blend->rgb_src_factor == PIPE_BLENDFACTOR_SRC_ALPHA &&
-                blend->rgb_dst_factor == PIPE_BLENDFACTOR_INV_SRC_ALPHA)
+            else if (blend->rt[0].rgb_src_factor == PIPE_BLENDFACTOR_SRC_ALPHA &&
+                blend->rt[0].rgb_dst_factor == PIPE_BLENDFACTOR_INV_SRC_ALPHA)
                qs->run = blend_single_add_src_alpha_inv_src_alpha;
 
          }
       }
    }
 
+   /* For each color buffer, determine if the buffer has destination alpha and
+    * whether color clamping is needed.
+    */
+   for (i = 0; i < softpipe->framebuffer.nr_cbufs; i++) {
+      if (softpipe->framebuffer.cbufs[i]) {
+         const enum pipe_format format = softpipe->framebuffer.cbufs[i]->format;
+         const struct util_format_description *desc =
+            util_format_description(format);
+         /* assuming all or no color channels are normalized: */
+         bqs->clamp[i] = desc->channel[0].normalized;
+         bqs->format_type[i] = desc->channel[0].type;
+
+         if (util_format_is_intensity(format))
+            bqs->base_format[i] = INTENSITY;
+         else if (util_format_is_luminance(format))
+            bqs->base_format[i] = LUMINANCE;
+         else if (util_format_is_luminance_alpha(format))
+            bqs->base_format[i] = LUMINANCE_ALPHA;
+         else if (!util_format_has_alpha(format))
+            bqs->base_format[i] = RGB;
+         else
+            bqs->base_format[i] = RGBA;
+      }
+   }
+
    qs->run(qs, quads, nr);
 }
 
@@ -994,12 +1297,15 @@ static void blend_destroy(struct quad_stage *qs)
 
 struct quad_stage *sp_quad_blend_stage( struct softpipe_context *softpipe )
 { /* creates the blend quad stage for 'softpipe'; returns NULL if allocation fails */
-   struct quad_stage *stage = CALLOC_STRUCT(quad_stage);
+   struct blend_quad_stage *stage = CALLOC_STRUCT(blend_quad_stage); /* the subclass, not just the base */
+
+   if (!stage)
+      return NULL;
 
-   stage->softpipe = softpipe;
-   stage->begin = blend_begin;
-   stage->run = choose_blend_quad;
-   stage->destroy = blend_destroy;
+   stage->base.softpipe = softpipe;
+   stage->base.begin = blend_begin;
+   stage->base.run = choose_blend_quad; /* picks a blend routine, stores it in ->run and calls it */
+   stage->base.destroy = blend_destroy;
 
-   return stage;
+   return &stage->base; /* callers only see the embedded base quad_stage */
 }