i965: Prepare gs_state emitting code to include gen4-5.
[mesa.git] / src / mesa / drivers / dri / i965 / genX_state_upload.c
index 3bc0f4cf5f1e82e40674aa4bd43937c97318e120..2a5b3648102565aede7fda0845c456f978f08cf3 100644 (file)
@@ -1156,9 +1156,16 @@ genX(calculate_attr_overrides)(const struct brw_context *brw,
 
 /* ---------------------------------------------------------------------- */
 
-#if GEN_GEN >= 6
-static void
-genX(upload_depth_stencil_state)(struct brw_context *brw)
+#if GEN_GEN >= 8
+typedef struct GENX(3DSTATE_WM_DEPTH_STENCIL) DEPTH_STENCIL_GENXML;
+#elif GEN_GEN >= 6
+typedef struct GENX(DEPTH_STENCIL_STATE)      DEPTH_STENCIL_GENXML;
+#else
+typedef struct GENX(COLOR_CALC_STATE)         DEPTH_STENCIL_GENXML;
+#endif
+
+static inline void
+set_depth_stencil_bits(struct brw_context *brw, DEPTH_STENCIL_GENXML *ds)
 {
    struct gl_context *ctx = &brw->ctx;
 
@@ -1173,66 +1180,76 @@ genX(upload_depth_stencil_state)(struct brw_context *brw)
    struct gl_stencil_attrib *stencil = &ctx->Stencil;
    const int b = stencil->_BackFace;
 
+   if (depth->Test && depth_irb) {
+      ds->DepthTestEnable = true;
+      ds->DepthBufferWriteEnable = brw_depth_writes_enabled(brw);
+      ds->DepthTestFunction = intel_translate_compare_func(depth->Func);
+   }
+
+   if (brw->stencil_enabled) {
+      ds->StencilTestEnable = true;
+      ds->StencilWriteMask = stencil->WriteMask[0] & 0xff;
+      ds->StencilTestMask = stencil->ValueMask[0] & 0xff;
+
+      ds->StencilTestFunction =
+         intel_translate_compare_func(stencil->Function[0]);
+      ds->StencilFailOp =
+         intel_translate_stencil_op(stencil->FailFunc[0]);
+      ds->StencilPassDepthPassOp =
+         intel_translate_stencil_op(stencil->ZPassFunc[0]);
+      ds->StencilPassDepthFailOp =
+         intel_translate_stencil_op(stencil->ZFailFunc[0]);
+
+      ds->StencilBufferWriteEnable = brw->stencil_write_enabled;
+
+      if (brw->stencil_two_sided) {
+         ds->DoubleSidedStencilEnable = true;
+         ds->BackfaceStencilWriteMask = stencil->WriteMask[b] & 0xff;
+         ds->BackfaceStencilTestMask = stencil->ValueMask[b] & 0xff;
+
+         ds->BackfaceStencilTestFunction =
+            intel_translate_compare_func(stencil->Function[b]);
+         ds->BackfaceStencilFailOp =
+            intel_translate_stencil_op(stencil->FailFunc[b]);
+         ds->BackfaceStencilPassDepthPassOp =
+            intel_translate_stencil_op(stencil->ZPassFunc[b]);
+         ds->BackfaceStencilPassDepthFailOp =
+            intel_translate_stencil_op(stencil->ZFailFunc[b]);
+      }
+
+#if GEN_GEN <= 5 || GEN_GEN >= 9
+      ds->StencilReferenceValue = _mesa_get_stencil_ref(ctx, 0);
+      ds->BackfaceStencilReferenceValue = _mesa_get_stencil_ref(ctx, b);
+#endif
+   }
+}
+
+#if GEN_GEN >= 6
+static void
+genX(upload_depth_stencil_state)(struct brw_context *brw)
+{
 #if GEN_GEN >= 8
    brw_batch_emit(brw, GENX(3DSTATE_WM_DEPTH_STENCIL), wmds) {
+      set_depth_stencil_bits(brw, &wmds);
+   }
 #else
    uint32_t ds_offset;
-   brw_state_emit(brw, GENX(DEPTH_STENCIL_STATE), 64, &ds_offset, wmds) {
-#endif
-      if (depth->Test && depth_irb) {
-         wmds.DepthTestEnable = true;
-         wmds.DepthBufferWriteEnable = brw_depth_writes_enabled(brw);
-         wmds.DepthTestFunction = intel_translate_compare_func(depth->Func);
-      }
-
-      if (brw->stencil_enabled) {
-         wmds.StencilTestEnable = true;
-         wmds.StencilWriteMask = stencil->WriteMask[0] & 0xff;
-         wmds.StencilTestMask = stencil->ValueMask[0] & 0xff;
-
-         wmds.StencilTestFunction =
-            intel_translate_compare_func(stencil->Function[0]);
-         wmds.StencilFailOp =
-            intel_translate_stencil_op(stencil->FailFunc[0]);
-         wmds.StencilPassDepthPassOp =
-            intel_translate_stencil_op(stencil->ZPassFunc[0]);
-         wmds.StencilPassDepthFailOp =
-            intel_translate_stencil_op(stencil->ZFailFunc[0]);
-
-         wmds.StencilBufferWriteEnable = brw->stencil_write_enabled;
-
-         if (brw->stencil_two_sided) {
-            wmds.DoubleSidedStencilEnable = true;
-            wmds.BackfaceStencilWriteMask = stencil->WriteMask[b] & 0xff;
-            wmds.BackfaceStencilTestMask = stencil->ValueMask[b] & 0xff;
-
-            wmds.BackfaceStencilTestFunction =
-               intel_translate_compare_func(stencil->Function[b]);
-            wmds.BackfaceStencilFailOp =
-               intel_translate_stencil_op(stencil->FailFunc[b]);
-            wmds.BackfaceStencilPassDepthPassOp =
-               intel_translate_stencil_op(stencil->ZPassFunc[b]);
-            wmds.BackfaceStencilPassDepthFailOp =
-               intel_translate_stencil_op(stencil->ZFailFunc[b]);
-         }
-
-#if GEN_GEN >= 9
-         wmds.StencilReferenceValue = _mesa_get_stencil_ref(ctx, 0);
-         wmds.BackfaceStencilReferenceValue = _mesa_get_stencil_ref(ctx, b);
-#endif
-      }
+   brw_state_emit(brw, GENX(DEPTH_STENCIL_STATE), 64, &ds_offset, ds) {
+      set_depth_stencil_bits(brw, &ds);
    }
 
+   /* Now upload a pointer to the indirect state */
 #if GEN_GEN == 6
    brw_batch_emit(brw, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
       ptr.PointertoDEPTH_STENCIL_STATE = ds_offset;
       ptr.DEPTH_STENCIL_STATEChange = true;
    }
-#elif GEN_GEN == 7
+#else
    brw_batch_emit(brw, GENX(3DSTATE_DEPTH_STENCIL_STATE_POINTERS), ptr) {
       ptr.PointertoDEPTH_STENCIL_STATE = ds_offset;
    }
 #endif
+#endif
 }
 
 static const struct brw_tracked_state genX(depth_stencil_state) = {
@@ -2340,7 +2357,7 @@ genX(upload_gs_state)(struct brw_context *brw)
       brw_gs_prog_data(stage_prog_data);
 #endif
 
-#if GEN_GEN < 7
+#if GEN_GEN == 6
    brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_GS), cgs) {
       if (active && stage_state->push_const_size != 0) {
          cgs.Buffer0Valid = true;
@@ -2367,8 +2384,8 @@ genX(upload_gs_state)(struct brw_context *brw)
       gen7_emit_cs_stall_flush(brw);
 #endif
 
-   if (active) {
-      brw_batch_emit(brw, GENX(3DSTATE_GS), gs) {
+   brw_batch_emit(brw, GENX(3DSTATE_GS), gs) {
+      if (active) {
          INIT_THREAD_DISPATCH_FIELDS(gs, Vertex);
 
 #if GEN_GEN >= 7
@@ -2452,16 +2469,28 @@ genX(upload_gs_state)(struct brw_context *brw)
          gs.VertexURBEntryOutputReadOffset = urb_entry_write_offset;
          gs.VertexURBEntryOutputLength = MAX2(urb_entry_output_length, 1);
 #endif
-      }
 #if GEN_GEN < 7
-   } else if (brw->ff_gs.prog_active)  {
-      /* In gen6, transform feedback for the VS stage is done with an ad-hoc GS
-       * program. This function provides the needed 3DSTATE_GS for this.
-       */
-      upload_gs_state_for_tf(brw);
+      } else if (brw->ff_gs.prog_active) {
+         /* In gen6, transform feedback for the VS stage is done with an
+          * ad-hoc GS program. The fields set below provide the needed
+          * 3DSTATE_GS for this.
+          */
+         gs.KernelStartPointer = KSP(brw, brw->ff_gs.prog_offset);
+         gs.SingleProgramFlow = true;
+         gs.VectorMaskEnable = true;
+         gs.DispatchGRFStartRegisterForURBData = 2;
+         gs.VertexURBEntryReadLength = brw->ff_gs.prog_data->urb_read_length;
+         gs.MaximumNumberofThreads = devinfo->max_gs_threads - 1;
+         gs.StatisticsEnable = true;
+         gs.SOStatisticsEnable = true;
+         gs.RenderingEnabled = true;
+         gs.SVBIPayloadEnable = true;
+         gs.SVBIPostIncrementEnable = true;
+         gs.SVBIPostIncrementValue =
+            brw->ff_gs.prog_data->svbi_postincrement_value;
+         gs.Enable = true;
 #endif
-   } else {
-      brw_batch_emit(brw, GENX(3DSTATE_GS), gs) {
+      } else {
          gs.StatisticsEnable = true;
 #if GEN_GEN < 7
          gs.RenderingEnabled = true;
@@ -2475,7 +2504,8 @@ genX(upload_gs_state)(struct brw_context *brw)
 #endif
       }
    }
-#if GEN_GEN < 7
+
+#if GEN_GEN == 6
    brw->gs.enabled = active;
 #endif
 }
@@ -2513,6 +2543,129 @@ fix_dual_blend_alpha_to_one(GLenum function)
 #define blend_factor(x) brw_translate_blend_factor(x)
 #define blend_eqn(x) brw_translate_blend_equation(x)
 
+/**
+ * Modify a blend factor to force destination alpha to 1.0.
+ *
+ * If \c function is a blend factor that reads destination alpha, return the
+ * factor obtained with alpha hard-wired to 1.0 (GL_ONE or GL_ZERO); any other
+ * value is returned unchanged.  This is used when rendering to xRGB targets.
+ */
+static GLenum
+brw_fix_xRGB_alpha(GLenum function)
+{
+   switch (function) {
+   case GL_DST_ALPHA:
+      return GL_ONE;
+
+   case GL_ONE_MINUS_DST_ALPHA:
+   case GL_SRC_ALPHA_SATURATE:
+      return GL_ZERO;
+   }
+
+   return function;
+}
+
+#if GEN_GEN >= 6
+typedef struct GENX(BLEND_STATE_ENTRY) BLEND_ENTRY_GENXML;
+#else
+typedef struct GENX(COLOR_CALC_STATE) BLEND_ENTRY_GENXML;
+#endif
+
+UNUSED static bool
+set_blend_entry_bits(struct brw_context *brw, BLEND_ENTRY_GENXML *entry, int i,
+                     bool alpha_to_one)
+{
+   struct gl_context *ctx = &brw->ctx;
+
+   /* _NEW_BUFFERS */
+   const struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[i];
+
+   bool independent_alpha_blend = false;
+
+   /* Used for implementing the following bit of GL_EXT_texture_integer:
+    * "Per-fragment operations that require floating-point color
+    *  components, including multisample alpha operations, alpha test,
+    *  blending, and dithering, have no effect when the corresponding
+    *  colors are written to an integer color buffer."
+    */
+   const bool integer = ctx->DrawBuffer->_IntegerBuffers & (0x1 << i);
+
+   const unsigned blend_enabled = GEN_GEN >= 6 ?
+      ctx->Color.BlendEnabled & (1 << i) : ctx->Color.BlendEnabled;
+
+   /* _NEW_COLOR */
+   if (ctx->Color.ColorLogicOpEnabled) {
+      GLenum rb_type = rb ? _mesa_get_format_datatype(rb->Format)
+         : GL_UNSIGNED_NORMALIZED;
+      WARN_ONCE(ctx->Color.LogicOp != GL_COPY &&
+                rb_type != GL_UNSIGNED_NORMALIZED &&
+                rb_type != GL_FLOAT, "Ignoring %s logic op on %s "
+                "renderbuffer\n",
+                _mesa_enum_to_string(ctx->Color.LogicOp),
+                _mesa_enum_to_string(rb_type));
+      if (GEN_GEN >= 8 || rb_type == GL_UNSIGNED_NORMALIZED) {
+         entry->LogicOpEnable = true;
+         entry->LogicOpFunction =
+            intel_translate_logic_op(ctx->Color.LogicOp);
+      }
+   } else if (blend_enabled && !ctx->Color._AdvancedBlendMode
+              && (GEN_GEN <= 5 || !integer)) {
+      GLenum eqRGB = ctx->Color.Blend[i].EquationRGB;
+      GLenum eqA = ctx->Color.Blend[i].EquationA;
+      GLenum srcRGB = ctx->Color.Blend[i].SrcRGB;
+      GLenum dstRGB = ctx->Color.Blend[i].DstRGB;
+      GLenum srcA = ctx->Color.Blend[i].SrcA;
+      GLenum dstA = ctx->Color.Blend[i].DstA;
+
+      if (eqRGB == GL_MIN || eqRGB == GL_MAX)
+         srcRGB = dstRGB = GL_ONE;
+
+      if (eqA == GL_MIN || eqA == GL_MAX)
+         srcA = dstA = GL_ONE;
+
+      /* Due to hardware limitations, the destination may have information
+       * in an alpha channel even when the format specifies no alpha
+       * channel. In order to avoid getting any incorrect blending due to
+       * that alpha channel, coerce the blend factors to values that will
+       * not read the alpha channel, but will instead use the correct
+       * implicit value for alpha.
+       */
+      if (rb && !_mesa_base_format_has_channel(rb->_BaseFormat,
+                                               GL_TEXTURE_ALPHA_TYPE)) {
+         srcRGB = brw_fix_xRGB_alpha(srcRGB);
+         srcA = brw_fix_xRGB_alpha(srcA);
+         dstRGB = brw_fix_xRGB_alpha(dstRGB);
+         dstA = brw_fix_xRGB_alpha(dstA);
+      }
+
+      /* From the BLEND_STATE docs, DWord 0, Bit 29 (AlphaToOne Enable):
+       * "If Dual Source Blending is enabled, this bit must be disabled."
+       *
+       * We override SRC1_ALPHA to ONE and ONE_MINUS_SRC1_ALPHA to ZERO,
+       * and leave AlphaToOne enabled anyway.
+       */
+      if (GEN_GEN >= 6 && ctx->Color.Blend[i]._UsesDualSrc && alpha_to_one) {
+         srcRGB = fix_dual_blend_alpha_to_one(srcRGB);
+         srcA = fix_dual_blend_alpha_to_one(srcA);
+         dstRGB = fix_dual_blend_alpha_to_one(dstRGB);
+         dstA = fix_dual_blend_alpha_to_one(dstA);
+      }
+
+      entry->ColorBufferBlendEnable = true;
+      entry->DestinationBlendFactor = blend_factor(dstRGB);
+      entry->SourceBlendFactor = blend_factor(srcRGB);
+      entry->DestinationAlphaBlendFactor = blend_factor(dstA);
+      entry->SourceAlphaBlendFactor = blend_factor(srcA);
+      entry->ColorBlendFunction = blend_eqn(eqRGB);
+      entry->AlphaBlendFunction = blend_eqn(eqA);
+
+      if (srcA != srcRGB || dstA != dstRGB || eqA != eqRGB)
+         independent_alpha_blend = true;
+   }
+
+   return independent_alpha_blend;
+}
+
 #if GEN_GEN >= 6
 static void
 genX(upload_blend_state)(struct brw_context *brw)
@@ -2580,87 +2733,9 @@ genX(upload_blend_state)(struct brw_context *brw)
 #else
       {
 #endif
-
-         /* _NEW_BUFFERS */
-         struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[i];
-
-         /* Used for implementing the following bit of GL_EXT_texture_integer:
-          * "Per-fragment operations that require floating-point color
-          *  components, including multisample alpha operations, alpha test,
-          *  blending, and dithering, have no effect when the corresponding
-          *  colors are written to an integer color buffer."
-          */
-         bool integer = ctx->DrawBuffer->_IntegerBuffers & (0x1 << i);
-
-         /* _NEW_COLOR */
-         if (ctx->Color.ColorLogicOpEnabled) {
-            GLenum rb_type = rb ? _mesa_get_format_datatype(rb->Format)
-                                : GL_UNSIGNED_NORMALIZED;
-            WARN_ONCE(ctx->Color.LogicOp != GL_COPY &&
-                      rb_type != GL_UNSIGNED_NORMALIZED &&
-                      rb_type != GL_FLOAT, "Ignoring %s logic op on %s "
-                      "renderbuffer\n",
-                      _mesa_enum_to_string(ctx->Color.LogicOp),
-                      _mesa_enum_to_string(rb_type));
-            if (GEN_GEN >= 8 || rb_type == GL_UNSIGNED_NORMALIZED) {
-               entry.LogicOpEnable = true;
-               entry.LogicOpFunction =
-                  intel_translate_logic_op(ctx->Color.LogicOp);
-            }
-         } else if (ctx->Color.BlendEnabled & (1 << i) && !integer &&
-                    !ctx->Color._AdvancedBlendMode) {
-            GLenum eqRGB = ctx->Color.Blend[i].EquationRGB;
-            GLenum eqA = ctx->Color.Blend[i].EquationA;
-            GLenum srcRGB = ctx->Color.Blend[i].SrcRGB;
-            GLenum dstRGB = ctx->Color.Blend[i].DstRGB;
-            GLenum srcA = ctx->Color.Blend[i].SrcA;
-            GLenum dstA = ctx->Color.Blend[i].DstA;
-
-            if (eqRGB == GL_MIN || eqRGB == GL_MAX)
-               srcRGB = dstRGB = GL_ONE;
-
-            if (eqA == GL_MIN || eqA == GL_MAX)
-               srcA = dstA = GL_ONE;
-
-            /* Due to hardware limitations, the destination may have information
-             * in an alpha channel even when the format specifies no alpha
-             * channel. In order to avoid getting any incorrect blending due to
-             * that alpha channel, coerce the blend factors to values that will
-             * not read the alpha channel, but will instead use the correct
-             * implicit value for alpha.
-             */
-            if (rb && !_mesa_base_format_has_channel(rb->_BaseFormat,
-                                                     GL_TEXTURE_ALPHA_TYPE)) {
-               srcRGB = brw_fix_xRGB_alpha(srcRGB);
-               srcA = brw_fix_xRGB_alpha(srcA);
-               dstRGB = brw_fix_xRGB_alpha(dstRGB);
-               dstA = brw_fix_xRGB_alpha(dstA);
-            }
-
-            /* From the BLEND_STATE docs, DWord 0, Bit 29 (AlphaToOne Enable):
-             * "If Dual Source Blending is enabled, this bit must be disabled."
-             *
-             * We override SRC1_ALPHA to ONE and ONE_MINUS_SRC1_ALPHA to ZERO,
-             * and leave it enabled anyway.
-             */
-            if (ctx->Color.Blend[i]._UsesDualSrc && blend.AlphaToOneEnable) {
-               srcRGB = fix_dual_blend_alpha_to_one(srcRGB);
-               srcA = fix_dual_blend_alpha_to_one(srcA);
-               dstRGB = fix_dual_blend_alpha_to_one(dstRGB);
-               dstA = fix_dual_blend_alpha_to_one(dstA);
-            }
-
-            entry.ColorBufferBlendEnable = true;
-            entry.DestinationBlendFactor = blend_factor(dstRGB);
-            entry.SourceBlendFactor = blend_factor(srcRGB);
-            entry.DestinationAlphaBlendFactor = blend_factor(dstA);
-            entry.SourceAlphaBlendFactor = blend_factor(srcA);
-            entry.ColorBlendFunction = blend_eqn(eqRGB);
-            entry.AlphaBlendFunction = blend_eqn(eqA);
-
-            if (srcA != srcRGB || dstA != dstRGB || eqA != eqRGB)
-               blend.IndependentAlphaBlendEnable = true;
-         }
+         blend.IndependentAlphaBlendEnable =
+            set_blend_entry_bits(brw, &entry, i, blend.AlphaToOneEnable) ||
+            blend.IndependentAlphaBlendEnable;
 
          /* See section 8.1.6 "Pre-Blend Color Clamping" of the
           * SandyBridge PRM Volume 2 Part 1 for HW requirements.
@@ -2960,53 +3035,80 @@ static const struct brw_tracked_state genX(multisample_state) = {
 
 /* ---------------------------------------------------------------------- */
 
-#if GEN_GEN >= 6
 static void
 genX(upload_color_calc_state)(struct brw_context *brw)
 {
    struct gl_context *ctx = &brw->ctx;
 
    brw_state_emit(brw, GENX(COLOR_CALC_STATE), 64, &brw->cc.state_offset, cc) {
+#if GEN_GEN <= 5
+      cc.IndependentAlphaBlendEnable =
+         set_blend_entry_bits(brw, &cc, 0, false);
+      set_depth_stencil_bits(brw, &cc);
+
+      if (ctx->Color.AlphaEnabled &&
+          ctx->DrawBuffer->_NumColorDrawBuffers <= 1) {
+         cc.AlphaTestEnable = true;
+         cc.AlphaTestFunction =
+            intel_translate_compare_func(ctx->Color.AlphaFunc);
+      }
+
+      cc.ColorDitherEnable = ctx->Color.DitherFlag;
+
+      cc.StatisticsEnable = brw->stats_wm;
+
+      cc.CCViewportStatePointer =
+         instruction_ro_bo(brw->batch.bo, brw->cc.vp_offset);
+#else
       /* _NEW_COLOR */
-      cc.AlphaTestFormat = ALPHATEST_UNORM8;
-      UNCLAMPED_FLOAT_TO_UBYTE(cc.AlphaReferenceValueAsUNORM8,
-                               ctx->Color.AlphaRef);
+      cc.BlendConstantColorRed = ctx->Color.BlendColorUnclamped[0];
+      cc.BlendConstantColorGreen = ctx->Color.BlendColorUnclamped[1];
+      cc.BlendConstantColorBlue = ctx->Color.BlendColorUnclamped[2];
+      cc.BlendConstantColorAlpha = ctx->Color.BlendColorUnclamped[3];
 
 #if GEN_GEN < 9
       /* _NEW_STENCIL */
       cc.StencilReferenceValue = _mesa_get_stencil_ref(ctx, 0);
       cc.BackfaceStencilReferenceValue =
          _mesa_get_stencil_ref(ctx, ctx->Stencil._BackFace);
+#endif
+
 #endif
 
       /* _NEW_COLOR */
-      cc.BlendConstantColorRed = ctx->Color.BlendColorUnclamped[0];
-      cc.BlendConstantColorGreen = ctx->Color.BlendColorUnclamped[1];
-      cc.BlendConstantColorBlue = ctx->Color.BlendColorUnclamped[2];
-      cc.BlendConstantColorAlpha = ctx->Color.BlendColorUnclamped[3];
+      UNCLAMPED_FLOAT_TO_UBYTE(cc.AlphaReferenceValueAsUNORM8,
+                               ctx->Color.AlphaRef);
    }
 
+#if GEN_GEN >= 6
    brw_batch_emit(brw, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
       ptr.ColorCalcStatePointer = brw->cc.state_offset;
 #if GEN_GEN != 7
       ptr.ColorCalcStatePointerValid = true;
 #endif
    }
+#else
+   brw->ctx.NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
+#endif
 }
 
 static const struct brw_tracked_state genX(color_calc_state) = {
    .dirty = {
       .mesa = _NEW_COLOR |
-              _NEW_STENCIL,
+              _NEW_STENCIL |
+              (GEN_GEN <= 5 ? _NEW_BUFFERS |
+                              _NEW_DEPTH
+                            : 0),
       .brw = BRW_NEW_BATCH |
              BRW_NEW_BLORP |
-             BRW_NEW_CC_STATE |
-             BRW_NEW_STATE_BASE_ADDRESS,
+             (GEN_GEN <= 5 ? BRW_NEW_CC_VP |
+                             BRW_NEW_STATS_WM
+                           : BRW_NEW_CC_STATE |
+                             BRW_NEW_STATE_BASE_ADDRESS),
    },
    .emit = genX(upload_color_calc_state),
 };
 
-#endif
 
 /* ---------------------------------------------------------------------- */
 
@@ -4249,7 +4351,7 @@ genX(emit_sampler_state_pointers_xs)(struct brw_context *brw,
 #endif
 }
 
-static bool
+UNUSED static bool
 has_component(mesa_format format, int i)
 {
    if (_mesa_is_format_color_format(format))
@@ -4263,11 +4365,11 @@ has_component(mesa_format format, int i)
  * Upload SAMPLER_BORDER_COLOR_STATE.
  */
 static void
-upload_default_color(struct brw_context *brw,
-                     const struct gl_sampler_object *sampler,
-                     mesa_format format, GLenum base_format,
-                     bool is_integer_format, bool is_stencil_sampling,
-                     uint32_t *sdc_offset)
+genX(upload_default_color)(struct brw_context *brw,
+                           const struct gl_sampler_object *sampler,
+                           mesa_format format, GLenum base_format,
+                           bool is_integer_format, bool is_stencil_sampling,
+                           uint32_t *sdc_offset)
 {
    union gl_color_union color;
 
@@ -4321,27 +4423,49 @@ upload_default_color(struct brw_context *brw,
    if (base_format == GL_RGB)
       color.ui[3] = float_as_int(1.0);
 
+   int alignment = 32;
    if (brw->gen >= 8) {
-      /* On Broadwell, the border color is represented as four 32-bit floats,
-       * integers, or unsigned values, interpreted according to the surface
-       * format.  This matches the sampler->BorderColor union exactly; just
-       * memcpy the values.
-       */
-      uint32_t *sdc = brw_state_batch(brw, 4 * 4, 64, sdc_offset);
-      memcpy(sdc, color.ui, 4 * 4);
+      alignment = 64;
    } else if (brw->is_haswell && (is_integer_format || is_stencil_sampling)) {
-      /* Haswell's integer border color support is completely insane:
-       * SAMPLER_BORDER_COLOR_STATE is 20 DWords.  The first four are
-       * for float colors.  The next 12 DWords are MBZ and only exist to
-       * pad it out to a 64 byte cacheline boundary.  DWords 16-19 then
-       * contain integer colors; these are only used if SURFACE_STATE
-       * has the "Integer Surface Format" bit set.  Even then, the
-       * arrangement of the RGBA data devolves into madness.
-       */
-      uint32_t *sdc = brw_state_batch(brw, 20 * 4, 512, sdc_offset);
-      memset(sdc, 0, 20 * 4);
-      sdc = &sdc[16];
+      alignment = 512;
+   }
+
+   uint32_t *sdc = brw_state_batch(
+      brw, GENX(SAMPLER_BORDER_COLOR_STATE_length) * sizeof(uint32_t),
+      alignment, sdc_offset);
+
+   struct GENX(SAMPLER_BORDER_COLOR_STATE) state = { 0 };
+
+#define ASSIGN(dst, src) \
+   do {                  \
+      dst = src;         \
+   } while (0)
 
+#define ASSIGNu16(dst, src) \
+   do {                     \
+      dst = (uint16_t)src;  \
+   } while (0)
+
+#define ASSIGNu8(dst, src) \
+   do {                    \
+      dst = (uint8_t)src;  \
+   } while (0)
+
+#define BORDER_COLOR_ATTR(macro, _color_type, src)              \
+   macro(state.BorderColor ## _color_type ## Red, src[0]);   \
+   macro(state.BorderColor ## _color_type ## Green, src[1]);   \
+   macro(state.BorderColor ## _color_type ## Blue, src[2]);   \
+   macro(state.BorderColor ## _color_type ## Alpha, src[3]);
+
+#if GEN_GEN >= 8
+   /* On Broadwell, the border color is represented as four 32-bit floats,
+    * integers, or unsigned values, interpreted according to the surface
+    * format.  This matches the sampler->BorderColor union exactly; just
+    * copy the four raw 32-bit values into the state struct field by field.
+    */
+   BORDER_COLOR_ATTR(ASSIGN, 32bit, color.ui);
+#elif GEN_IS_HASWELL
+   if (is_integer_format || is_stencil_sampling) {
       bool stencil = format == MESA_FORMAT_S_UINT8 || is_stencil_sampling;
       const int bits_per_channel =
          _mesa_get_format_bits(format, stencil ? GL_STENCIL_BITS : GL_RED_BITS);
@@ -4361,76 +4485,61 @@ upload_default_color(struct brw_context *brw,
       switch (bits_per_channel) {
       case 8:
          /* Copy RGBA in order. */
-         for (int i = 0; i < 4; i++)
-            ((uint8_t *) sdc)[i] = c[i];
+         BORDER_COLOR_ATTR(ASSIGNu8, 8bit, c);
          break;
       case 10:
          /* R10G10B10A2_UINT is treated like a 16-bit format. */
       case 16:
-         ((uint16_t *) sdc)[0] = c[0]; /* R -> DWord 0, bits 15:0  */
-         ((uint16_t *) sdc)[1] = c[1]; /* G -> DWord 0, bits 31:16 */
-         /* DWord 1 is Reserved/MBZ! */
-         ((uint16_t *) sdc)[4] = c[2]; /* B -> DWord 2, bits 15:0  */
-         ((uint16_t *) sdc)[5] = c[3]; /* A -> DWord 3, bits 31:16 */
+         BORDER_COLOR_ATTR(ASSIGNu16, 16bit, c);
          break;
       case 32:
          if (base_format == GL_RG) {
             /* Careful inspection of the tables reveals that for RG32 formats,
              * the green channel needs to go where blue normally belongs.
              */
-            sdc[0] = c[0];
-            sdc[2] = c[1];
-            sdc[3] = 1;
+            state.BorderColor32bitRed = c[0];
+            state.BorderColor32bitBlue = c[1];
+            state.BorderColor32bitAlpha = 1;
          } else {
             /* Copy RGBA in order. */
-            for (int i = 0; i < 4; i++)
-               sdc[i] = c[i];
+            BORDER_COLOR_ATTR(ASSIGN, 32bit, c);
          }
          break;
       default:
          assert(!"Invalid number of bits per channel in integer format.");
          break;
       }
-   } else if (brw->gen == 5 || brw->gen == 6) {
-      struct gen5_sampler_default_color *sdc;
-
-      sdc = brw_state_batch(brw, sizeof(*sdc), 32, sdc_offset);
-
-      memset(sdc, 0, sizeof(*sdc));
-
-      UNCLAMPED_FLOAT_TO_UBYTE(sdc->ub[0], color.f[0]);
-      UNCLAMPED_FLOAT_TO_UBYTE(sdc->ub[1], color.f[1]);
-      UNCLAMPED_FLOAT_TO_UBYTE(sdc->ub[2], color.f[2]);
-      UNCLAMPED_FLOAT_TO_UBYTE(sdc->ub[3], color.f[3]);
-
-      UNCLAMPED_FLOAT_TO_USHORT(sdc->us[0], color.f[0]);
-      UNCLAMPED_FLOAT_TO_USHORT(sdc->us[1], color.f[1]);
-      UNCLAMPED_FLOAT_TO_USHORT(sdc->us[2], color.f[2]);
-      UNCLAMPED_FLOAT_TO_USHORT(sdc->us[3], color.f[3]);
-
-      UNCLAMPED_FLOAT_TO_SHORT(sdc->s[0], color.f[0]);
-      UNCLAMPED_FLOAT_TO_SHORT(sdc->s[1], color.f[1]);
-      UNCLAMPED_FLOAT_TO_SHORT(sdc->s[2], color.f[2]);
-      UNCLAMPED_FLOAT_TO_SHORT(sdc->s[3], color.f[3]);
-
-      sdc->hf[0] = _mesa_float_to_half(color.f[0]);
-      sdc->hf[1] = _mesa_float_to_half(color.f[1]);
-      sdc->hf[2] = _mesa_float_to_half(color.f[2]);
-      sdc->hf[3] = _mesa_float_to_half(color.f[3]);
-
-      sdc->b[0] = sdc->s[0] >> 8;
-      sdc->b[1] = sdc->s[1] >> 8;
-      sdc->b[2] = sdc->s[2] >> 8;
-      sdc->b[3] = sdc->s[3] >> 8;
-
-      sdc->f[0] = color.f[0];
-      sdc->f[1] = color.f[1];
-      sdc->f[2] = color.f[2];
-      sdc->f[3] = color.f[3];
    } else {
-      float *sdc = brw_state_batch(brw, 4 * 4, 32, sdc_offset);
-      memcpy(sdc, color.f, 4 * 4);
+      BORDER_COLOR_ATTR(ASSIGN, Float, color.f);
    }
+#elif GEN_GEN == 5 || GEN_GEN == 6
+   BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_UBYTE, Unorm, color.f);
+   BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_USHORT, Unorm16, color.f);
+   BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_SHORT, Snorm16, color.f);
+
+#define MESA_FLOAT_TO_HALF(dst, src) \
+   dst = _mesa_float_to_half(src);
+
+   BORDER_COLOR_ATTR(MESA_FLOAT_TO_HALF, Float16, color.f);
+
+#undef MESA_FLOAT_TO_HALF
+
+   state.BorderColorSnorm8Red   = state.BorderColorSnorm16Red >> 8;
+   state.BorderColorSnorm8Green = state.BorderColorSnorm16Green >> 8;
+   state.BorderColorSnorm8Blue  = state.BorderColorSnorm16Blue >> 8;
+   state.BorderColorSnorm8Alpha = state.BorderColorSnorm16Alpha >> 8;
+
+   BORDER_COLOR_ATTR(ASSIGN, Float, color.f);
+#elif GEN_GEN == 4
+   BORDER_COLOR_ATTR(ASSIGN, , color.f);
+#else
+   BORDER_COLOR_ATTR(ASSIGN, Float, color.f);
+#endif
+
+#undef ASSIGN
+#undef BORDER_COLOR_ATTR
+
+   GENX(SAMPLER_BORDER_COLOR_STATE_pack)(brw, sdc, &state);
 }
 
 static uint32_t
@@ -4448,9 +4557,8 @@ translate_wrap_mode(struct brw_context *brw, GLenum wrap, bool using_nearest)
        *
        * Gen8+ supports this natively.
        */
-         return TCM_HALF_BORDER;
-#endif
-
+      return TCM_HALF_BORDER;
+#else
       /* On Gen4-7.5, we clamp the coordinates in the fragment shader
        * and set clamp_border here, which gets the result desired.
        * We just use clamp(_to_edge) for nearest, because for nearest
@@ -4461,6 +4569,7 @@ translate_wrap_mode(struct brw_context *brw, GLenum wrap, bool using_nearest)
          return TCM_CLAMP;
       else
          return TCM_CLAMP_BORDER;
+#endif
    case GL_CLAMP_TO_EDGE:
       return TCM_CLAMP;
    case GL_CLAMP_TO_BORDER:
@@ -4544,7 +4653,7 @@ genX(update_sampler_state)(struct brw_context *brw,
    if (sampler->MaxAnisotropy > 1.0f) {
       if (samp_st.MinModeFilter == MAPFILTER_LINEAR)
          samp_st.MinModeFilter = MAPFILTER_ANISOTROPIC;
-      if (samp_st.MinModeFilter == MAPFILTER_LINEAR)
+      if (samp_st.MagModeFilter == MAPFILTER_LINEAR)
          samp_st.MagModeFilter = MAPFILTER_ANISOTROPIC;
 
       if (sampler->MaxAnisotropy > 2.0f) {
@@ -4639,9 +4748,10 @@ genX(update_sampler_state)(struct brw_context *brw,
    if (wrap_mode_needs_border_color(wrap_s) ||
        wrap_mode_needs_border_color(wrap_t) ||
        wrap_mode_needs_border_color(wrap_r)) {
-      upload_default_color(brw, sampler, format, base_format,
-                           texObj->_IsIntegerFormat, texObj->StencilSampling,
-                           &border_color_offset);
+      genX(upload_default_color)(brw, sampler, format, base_format,
+                                 texObj->_IsIntegerFormat,
+                                 texObj->StencilSampling,
+                                 &border_color_offset);
    }
 
    samp_st.BorderColorPointer = border_color_offset;
@@ -4752,6 +4862,143 @@ static const struct brw_tracked_state genX(fs_samplers) = {
    .emit = genX(upload_fs_samplers),
 };
 
+static void
+genX(upload_vs_samplers)(struct brw_context *brw)
+{
+   /* BRW_NEW_VERTEX_PROGRAM: upload the sampler table for the vertex stage. */
+   struct gl_program *vs = (struct gl_program *) brw->vertex_program;
+   genX(upload_sampler_state_table)(brw, vs, &brw->vs.base); /* NOTE(review): vs is not NULL-checked here, unlike the gs/tcs/tes/cs variants; confirm */
+}
+
+static const struct brw_tracked_state genX(vs_samplers) = {
+   .dirty = {
+      .mesa = _NEW_TEXTURE,
+      .brw = BRW_NEW_BATCH |
+             BRW_NEW_BLORP |
+             BRW_NEW_VERTEX_PROGRAM, /* covers the brw->vertex_program read in the emit hook */
+   },
+   .emit = genX(upload_vs_samplers), /* atom is listed in the genX(init_atoms) tables */
+};
+
+#if GEN_GEN >= 6
+static void
+genX(upload_gs_samplers)(struct brw_context *brw)
+{
+   /* BRW_NEW_GEOMETRY_PROGRAM: upload the sampler table for the geometry stage. */
+   struct gl_program *gs = (struct gl_program *) brw->geometry_program;
+   if (!gs)
+      return; /* brw->geometry_program is NULL: nothing to upload */
+
+   genX(upload_sampler_state_table)(brw, gs, &brw->gs.base);
+}
+
+
+static const struct brw_tracked_state genX(gs_samplers) = {
+   .dirty = {
+      .mesa = _NEW_TEXTURE,
+      .brw = BRW_NEW_BATCH |
+             BRW_NEW_BLORP |
+             BRW_NEW_GEOMETRY_PROGRAM, /* covers the brw->geometry_program read in the emit hook */
+   },
+   .emit = genX(upload_gs_samplers), /* atom is listed in the genX(init_atoms) tables */
+};
+#endif
+
+#if GEN_GEN >= 7
+static void
+genX(upload_tcs_samplers)(struct brw_context *brw)
+{
+   /* BRW_NEW_TESS_PROGRAMS: upload the sampler table for the tess control stage. */
+   struct gl_program *tcs = (struct gl_program *) brw->tess_ctrl_program;
+   if (!tcs)
+      return; /* brw->tess_ctrl_program is NULL: nothing to upload */
+
+   genX(upload_sampler_state_table)(brw, tcs, &brw->tcs.base);
+}
+
+static const struct brw_tracked_state genX(tcs_samplers) = {
+   .dirty = {
+      .mesa = _NEW_TEXTURE,
+      .brw = BRW_NEW_BATCH |
+             BRW_NEW_BLORP |
+             BRW_NEW_TESS_PROGRAMS, /* covers the brw->tess_ctrl_program read in the emit hook */
+   },
+   .emit = genX(upload_tcs_samplers), /* atom is listed in the genX(init_atoms) tables */
+};
+#endif
+
+#if GEN_GEN >= 7
+static void
+genX(upload_tes_samplers)(struct brw_context *brw)
+{
+   /* BRW_NEW_TESS_PROGRAMS: upload the sampler table for the tess evaluation stage. */
+   struct gl_program *tes = (struct gl_program *) brw->tess_eval_program;
+   if (!tes)
+      return; /* brw->tess_eval_program is NULL: nothing to upload */
+
+   genX(upload_sampler_state_table)(brw, tes, &brw->tes.base);
+}
+
+static const struct brw_tracked_state genX(tes_samplers) = {
+   .dirty = {
+      .mesa = _NEW_TEXTURE,
+      .brw = BRW_NEW_BATCH |
+             BRW_NEW_BLORP |
+             BRW_NEW_TESS_PROGRAMS, /* covers the brw->tess_eval_program read in the emit hook */
+   },
+   .emit = genX(upload_tes_samplers), /* atom is listed in the genX(init_atoms) tables */
+};
+#endif
+
+#if GEN_GEN >= 7
+static void
+genX(upload_cs_samplers)(struct brw_context *brw)
+{
+   /* BRW_NEW_COMPUTE_PROGRAM: upload the sampler table for the compute stage. */
+   struct gl_program *cs = (struct gl_program *) brw->compute_program;
+   if (!cs)
+      return; /* brw->compute_program is NULL: nothing to upload */
+
+   genX(upload_sampler_state_table)(brw, cs, &brw->cs.base);
+}
+
+const struct brw_tracked_state genX(cs_samplers) = { /* NOTE(review): the only sampler atom here not declared static; confirm external linkage is intended */
+   .dirty = {
+      .mesa = _NEW_TEXTURE,
+      .brw = BRW_NEW_BATCH |
+             BRW_NEW_BLORP |
+             BRW_NEW_COMPUTE_PROGRAM, /* covers the brw->compute_program read in the emit hook */
+   },
+   .emit = genX(upload_cs_samplers), /* atom is listed in a genX(init_atoms) table next to genX(cs_state) */
+};
+#endif
+
+/* ---------------------------------------------------------------------- */
+
+#if GEN_GEN <= 5
+
+static void genX(upload_blend_constant_color)(struct brw_context *brw)
+{
+   struct gl_context *ctx = &brw->ctx;
+   /* Emit 3DSTATE_CONSTANT_COLOR from the unclamped GL blend color (R, G, B, A); built only when GEN_GEN <= 5. */
+   brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_COLOR), blend_cc) {
+      blend_cc.BlendConstantColorRed = ctx->Color.BlendColorUnclamped[0];
+      blend_cc.BlendConstantColorGreen = ctx->Color.BlendColorUnclamped[1];
+      blend_cc.BlendConstantColorBlue = ctx->Color.BlendColorUnclamped[2];
+      blend_cc.BlendConstantColorAlpha = ctx->Color.BlendColorUnclamped[3];
+   }
+}
+
+static const struct brw_tracked_state genX(blend_constant_color) = {
+   .dirty = {
+      .mesa = _NEW_COLOR, /* the emit hook reads ctx->Color.BlendColorUnclamped */
+      .brw = BRW_NEW_CONTEXT |
+             BRW_NEW_BLORP,
+   },
+   .emit = genX(upload_blend_constant_color) /* replaces brw_blend_constant_color in the genX(init_atoms) list */
+};
+#endif
+
 /* ---------------------------------------------------------------------- */
 
 void
@@ -4768,7 +5015,7 @@ genX(init_atoms)(struct brw_context *brw)
       &brw_recalculate_urb_fence,
 
       &genX(cc_vp),
-      &brw_cc_unit,
+      &genX(color_calc_state),
 
       /* Surface state setup.  Must come before the VS/WM unit.  The binding
        * table upload must be last.
@@ -4782,7 +5029,7 @@ genX(init_atoms)(struct brw_context *brw)
       &brw_wm_binding_table,
 
       &genX(fs_samplers),
-      &brw_vs_samplers,
+      &genX(vs_samplers),
 
       /* These set up state for brw_psp_urb_cbs */
       &brw_wm_unit,
@@ -4797,7 +5044,7 @@ genX(init_atoms)(struct brw_context *brw)
       &brw_invariant_state,
 
       &brw_binding_table_pointers,
-      &brw_blend_constant_color,
+      &genX(blend_constant_color),
 
       &brw_depthbuffer,
 
@@ -4851,8 +5098,8 @@ genX(init_atoms)(struct brw_context *brw)
       &brw_wm_binding_table,
 
       &genX(fs_samplers),
-      &brw_vs_samplers,
-      &brw_gs_samplers,
+      &genX(vs_samplers),
+      &genX(gs_samplers),
       &gen6_sampler_state,
       &genX(multisample_state),
 
@@ -4934,10 +5181,10 @@ genX(init_atoms)(struct brw_context *brw)
       &brw_wm_binding_table,
 
       &genX(fs_samplers),
-      &brw_vs_samplers,
-      &brw_tcs_samplers,
-      &brw_tes_samplers,
-      &brw_gs_samplers,
+      &genX(vs_samplers),
+      &genX(tcs_samplers),
+      &genX(tes_samplers),
+      &genX(gs_samplers),
       &genX(multisample_state),
 
       &genX(vs_state),
@@ -5023,10 +5270,10 @@ genX(init_atoms)(struct brw_context *brw)
       &brw_wm_binding_table,
 
       &genX(fs_samplers),
-      &brw_vs_samplers,
-      &brw_tcs_samplers,
-      &brw_tes_samplers,
-      &brw_gs_samplers,
+      &genX(vs_samplers),
+      &genX(tcs_samplers),
+      &genX(tes_samplers),
+      &genX(gs_samplers),
       &genX(multisample_state),
 
       &genX(vs_state),
@@ -5082,7 +5329,7 @@ genX(init_atoms)(struct brw_context *brw)
       &brw_cs_abo_surfaces,
       &brw_cs_texture_surfaces,
       &brw_cs_work_groups_surface,
-      &brw_cs_samplers,
+      &genX(cs_samplers),
       &genX(cs_state),
    };