i965: Prepare gs_state emitting code to include gen4-5.
[mesa.git] / src / mesa / drivers / dri / i965 / genX_state_upload.c
index 416677c6712654cb28f6e951f814a93f7f237634..2a5b3648102565aede7fda0845c456f978f08cf3 100644 (file)
@@ -31,6 +31,7 @@
 #include "main/context.h"
 #include "main/enums.h"
 #include "main/macros.h"
+#include "main/state.h"
 
 #include "brw_context.h"
 #if GEN_GEN == 6
 #include "main/fbobject.h"
 #include "main/framebuffer.h"
 #include "main/glformats.h"
+#include "main/samplerobj.h"
 #include "main/shaderapi.h"
 #include "main/stencil.h"
 #include "main/transformfeedback.h"
 #include "main/varray.h"
 #include "main/viewport.h"
+#include "util/half_float.h"
 
 UNUSED static void *
 emit_dwords(struct brw_context *brw, unsigned n)
@@ -154,6 +157,29 @@ vertex_bo(struct brw_bo *bo, uint32_t offset)
    };
 }
 
+#if GEN_GEN == 4 /* gen4: kernel pointer built from brw->cache.bo + offset */
+static inline struct brw_address
+KSP(struct brw_context *brw, uint32_t offset)
+{
+   return instruction_bo(brw->cache.bo, offset);
+}
+
+static inline struct brw_address
+KSP_ro(struct brw_context *brw, uint32_t offset)
+{ /* Same as KSP(), except the address comes from instruction_ro_bo(). */
+   return instruction_ro_bo(brw->cache.bo, offset);
+}
+#else /* all other gens: the kernel pointer is the bare offset */
+static inline uint32_t
+KSP(struct brw_context *brw, uint32_t offset)
+{
+   return offset;
+}
+
+#define KSP_ro KSP /* a bare offset needs no separate _ro form */
+
+#endif /* GEN_GEN == 4 */
+
 #include "genxml/genX_pack.h"
 
 #define _brw_cmd_length(cmd) cmd ## _length
@@ -344,7 +370,9 @@ genX(emit_vertex_buffer_state)(struct brw_context *brw,
 #endif
 #endif
 
-#if GEN_GEN == 9
+#if GEN_GEN == 10
+      .VertexBufferMOCS = CNL_MOCS_WB,
+#elif GEN_GEN == 9
       .VertexBufferMOCS = SKL_MOCS_WB,
 #elif GEN_GEN == 8
       .VertexBufferMOCS = BDW_MOCS_WB,
@@ -841,6 +869,39 @@ static const struct brw_tracked_state genX(vertices) = {
    .emit = genX(emit_vertices),
 };
 
+static void
+genX(emit_index_buffer)(struct brw_context *brw)
+{
+   const struct _mesa_index_buffer *index_buffer = brw->ib.ib;
+   /* Emits 3DSTATE_INDEX_BUFFER for the index buffer tracked in brw->ib.
+    * When brw->ib.ib is NULL there is nothing to describe, so no packet
+    * is emitted at all.
+    */
+   if (index_buffer == NULL)
+      return;
+   /* The buffer starts at offset 0 of brw->ib.bo.  Its extent is passed as
+    * BufferSize on gen8+ and as an ending address at offset size - 1 on
+    * earlier gens.
+    *
+    * NOTE(review): gen10 gets SKL_MOCS_WB here, while the vertex buffer
+    * state in this file selects CNL_MOCS_WB for GEN_GEN == 10 -- confirm
+    * that the two are meant to agree.
+    */
+   brw_batch_emit(brw, GENX(3DSTATE_INDEX_BUFFER), ib) {
+#if GEN_GEN < 8 && !GEN_IS_HASWELL
+      ib.CutIndexEnable = brw->prim_restart.enable_cut_index;
+#endif
+      ib.IndexFormat = brw_get_index_type(index_buffer->index_size);
+      ib.BufferStartingAddress = vertex_bo(brw->ib.bo, 0);
+#if GEN_GEN >= 8
+      ib.IndexBufferMOCS = GEN_GEN >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB;
+      ib.BufferSize = brw->ib.size;
+#else /* GEN_GEN < 8 */
+      ib.BufferEndingAddress = vertex_bo(brw->ib.bo, brw->ib.size - 1);
+#endif /* GEN_GEN >= 8 */
+   }
+}
+
+static const struct brw_tracked_state genX(index_buffer) = {
+   .dirty = {
+      .mesa = 0, /* the emit function reads brw state only, no gl_context */
+      .brw = BRW_NEW_BATCH |
+             BRW_NEW_BLORP |
+             BRW_NEW_INDEX_BUFFER,
+   },
+   .emit = genX(emit_index_buffer),
+};
+
 #if GEN_IS_HASWELL || GEN_GEN >= 8
 static void
 genX(upload_cut_index)(struct brw_context *brw)
@@ -1059,7 +1120,7 @@ genX(calculate_attr_overrides)(const struct brw_context *brw,
          genX(get_attr_override)(&attribute,
                                  &brw->vue_map_geom_out,
                                  *urb_entry_read_offset, attr,
-                                 brw->ctx.VertexProgram._TwoSideEnabled,
+                                 _mesa_vertex_program_two_side_enabled(ctx),
                                  &max_source_attr);
       }
 
@@ -1095,9 +1156,16 @@ genX(calculate_attr_overrides)(const struct brw_context *brw,
 
 /* ---------------------------------------------------------------------- */
 
-#if GEN_GEN >= 6
-static void
-genX(upload_depth_stencil_state)(struct brw_context *brw)
+#if GEN_GEN >= 8 /* DEPTH_STENCIL_GENXML: what set_depth_stencil_bits() fills */
+typedef struct GENX(3DSTATE_WM_DEPTH_STENCIL) DEPTH_STENCIL_GENXML;
+#elif GEN_GEN >= 6 /* gen6-7 */
+typedef struct GENX(DEPTH_STENCIL_STATE)      DEPTH_STENCIL_GENXML;
+#else /* GEN_GEN < 6: depth/stencil fields are part of COLOR_CALC_STATE */
+typedef struct GENX(COLOR_CALC_STATE)         DEPTH_STENCIL_GENXML;
+#endif /* GEN_GEN >= 8 */
+
+static inline void
+set_depth_stencil_bits(struct brw_context *brw, DEPTH_STENCIL_GENXML *ds)
 {
    struct gl_context *ctx = &brw->ctx;
 
@@ -1112,66 +1180,76 @@ genX(upload_depth_stencil_state)(struct brw_context *brw)
    struct gl_stencil_attrib *stencil = &ctx->Stencil;
    const int b = stencil->_BackFace;
 
+   if (depth->Test && depth_irb) {
+      ds->DepthTestEnable = true;
+      ds->DepthBufferWriteEnable = brw_depth_writes_enabled(brw);
+      ds->DepthTestFunction = intel_translate_compare_func(depth->Func);
+   }
+
+   if (brw->stencil_enabled) {
+      ds->StencilTestEnable = true;
+      ds->StencilWriteMask = stencil->WriteMask[0] & 0xff;
+      ds->StencilTestMask = stencil->ValueMask[0] & 0xff;
+
+      ds->StencilTestFunction =
+         intel_translate_compare_func(stencil->Function[0]);
+      ds->StencilFailOp =
+         intel_translate_stencil_op(stencil->FailFunc[0]);
+      ds->StencilPassDepthPassOp =
+         intel_translate_stencil_op(stencil->ZPassFunc[0]);
+      ds->StencilPassDepthFailOp =
+         intel_translate_stencil_op(stencil->ZFailFunc[0]);
+
+      ds->StencilBufferWriteEnable = brw->stencil_write_enabled;
+
+      if (brw->stencil_two_sided) {
+         ds->DoubleSidedStencilEnable = true;
+         ds->BackfaceStencilWriteMask = stencil->WriteMask[b] & 0xff;
+         ds->BackfaceStencilTestMask = stencil->ValueMask[b] & 0xff;
+
+         ds->BackfaceStencilTestFunction =
+            intel_translate_compare_func(stencil->Function[b]);
+         ds->BackfaceStencilFailOp =
+            intel_translate_stencil_op(stencil->FailFunc[b]);
+         ds->BackfaceStencilPassDepthPassOp =
+            intel_translate_stencil_op(stencil->ZPassFunc[b]);
+         ds->BackfaceStencilPassDepthFailOp =
+            intel_translate_stencil_op(stencil->ZFailFunc[b]);
+      }
+
+#if GEN_GEN <= 5 || GEN_GEN >= 9
+      ds->StencilReferenceValue = _mesa_get_stencil_ref(ctx, 0);
+      ds->BackfaceStencilReferenceValue = _mesa_get_stencil_ref(ctx, b);
+#endif
+   }
+}
+/* Emits the depth/stencil state filled in by set_depth_stencil_bits():
+ * inline in 3DSTATE_WM_DEPTH_STENCIL on gen8+, otherwise as a
+ * DEPTH_STENCIL_STATE written with brw_state_emit(), whose offset
+ * (ds_offset) is then handed to a separate pointer packet.
+ */
+#if GEN_GEN >= 6
+static void
+genX(upload_depth_stencil_state)(struct brw_context *brw)
+{
 #if GEN_GEN >= 8
    brw_batch_emit(brw, GENX(3DSTATE_WM_DEPTH_STENCIL), wmds) {
+      set_depth_stencil_bits(brw, &wmds);
+   }
 #else
    uint32_t ds_offset;
-   brw_state_emit(brw, GENX(DEPTH_STENCIL_STATE), 64, &ds_offset, wmds) {
-#endif
-      if (depth->Test && depth_irb) {
-         wmds.DepthTestEnable = true;
-         wmds.DepthBufferWriteEnable = brw_depth_writes_enabled(brw);
-         wmds.DepthTestFunction = intel_translate_compare_func(depth->Func);
-      }
-
-      if (stencil->_Enabled) {
-         wmds.StencilTestEnable = true;
-         wmds.StencilWriteMask = stencil->WriteMask[0] & 0xff;
-         wmds.StencilTestMask = stencil->ValueMask[0] & 0xff;
-
-         wmds.StencilTestFunction =
-            intel_translate_compare_func(stencil->Function[0]);
-         wmds.StencilFailOp =
-            intel_translate_stencil_op(stencil->FailFunc[0]);
-         wmds.StencilPassDepthPassOp =
-            intel_translate_stencil_op(stencil->ZPassFunc[0]);
-         wmds.StencilPassDepthFailOp =
-            intel_translate_stencil_op(stencil->ZFailFunc[0]);
-
-         wmds.StencilBufferWriteEnable = stencil->_WriteEnabled;
-
-         if (stencil->_TestTwoSide) {
-            wmds.DoubleSidedStencilEnable = true;
-            wmds.BackfaceStencilWriteMask = stencil->WriteMask[b] & 0xff;
-            wmds.BackfaceStencilTestMask = stencil->ValueMask[b] & 0xff;
-
-            wmds.BackfaceStencilTestFunction =
-               intel_translate_compare_func(stencil->Function[b]);
-            wmds.BackfaceStencilFailOp =
-               intel_translate_stencil_op(stencil->FailFunc[b]);
-            wmds.BackfaceStencilPassDepthPassOp =
-               intel_translate_stencil_op(stencil->ZPassFunc[b]);
-            wmds.BackfaceStencilPassDepthFailOp =
-               intel_translate_stencil_op(stencil->ZFailFunc[b]);
-         }
-
-#if GEN_GEN >= 9
-         wmds.StencilReferenceValue = _mesa_get_stencil_ref(ctx, 0);
-         wmds.BackfaceStencilReferenceValue = _mesa_get_stencil_ref(ctx, b);
-#endif
-      }
+   brw_state_emit(brw, GENX(DEPTH_STENCIL_STATE), 64, &ds_offset, ds) {
+      set_depth_stencil_bits(brw, &ds);
    }
 
+   /* Now upload a pointer to the indirect state: gen6 passes ds_offset in
+    * 3DSTATE_CC_STATE_POINTERS (with the change bit set), the remaining
+    * case (gen7) in 3DSTATE_DEPTH_STENCIL_STATE_POINTERS.
+    */
 #if GEN_GEN == 6
    brw_batch_emit(brw, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
       ptr.PointertoDEPTH_STENCIL_STATE = ds_offset;
       ptr.DEPTH_STENCIL_STATEChange = true;
    }
-#elif GEN_GEN == 7
+#else /* GEN_GEN == 7 */
    brw_batch_emit(brw, GENX(3DSTATE_DEPTH_STENCIL_STATE_POINTERS), ptr) {
       ptr.PointertoDEPTH_STENCIL_STATE = ds_offset;
    }
 #endif
+#endif /* GEN_GEN >= 8 */
 }
 
 static const struct brw_tracked_state genX(depth_stencil_state) = {
@@ -1215,7 +1293,7 @@ genX(upload_clip_state)(struct brw_context *brw)
 #endif
 
 #if GEN_GEN == 7
-      clip.FrontWinding = ctx->Polygon._FrontBit == _mesa_is_user_fbo(fb);
+      clip.FrontWinding = brw->polygon_front_bit == _mesa_is_user_fbo(fb);
 
       if (ctx->Polygon.CullFlag) {
          switch (ctx->Polygon.CullFaceMode) {
@@ -1283,7 +1361,7 @@ genX(upload_clip_state)(struct brw_context *brw)
          clip.ClipMode = CLIPMODE_NORMAL;
       }
 
-      clip.ClipEnable = brw->primitive != _3DPRIM_RECTLIST;
+      clip.ClipEnable = true;
 
       /* _NEW_POLYGON,
        * BRW_NEW_GEOMETRY_PROGRAM | BRW_NEW_TES_PROG_DATA | BRW_NEW_PRIMITIVE
@@ -1322,7 +1400,6 @@ static const struct brw_tracked_state genX(clip_state) = {
 
 /* ---------------------------------------------------------------------- */
 
-#if GEN_GEN >= 6
 static void
 genX(upload_sf)(struct brw_context *brw)
 {
@@ -1332,11 +1409,48 @@ genX(upload_sf)(struct brw_context *brw)
 #if GEN_GEN <= 7
    /* _NEW_BUFFERS */
    bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
-   const bool multisampled_fbo = _mesa_geometric_samples(ctx->DrawBuffer) > 1;
+   UNUSED const bool multisampled_fbo =
+      _mesa_geometric_samples(ctx->DrawBuffer) > 1;
 #endif
 
+#if GEN_GEN < 6
+   const struct brw_sf_prog_data *sf_prog_data = brw->sf.prog_data;
+
+   ctx->NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
+
+   brw_state_emit(brw, GENX(SF_STATE), 64, &brw->sf.state_offset, sf) {
+      sf.KernelStartPointer = KSP_ro(brw, brw->sf.prog_offset);
+      sf.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
+      sf.GRFRegisterCount = DIV_ROUND_UP(sf_prog_data->total_grf, 16) - 1;
+      sf.DispatchGRFStartRegisterForURBData = 3;
+      sf.VertexURBEntryReadOffset = BRW_SF_URB_ENTRY_READ_OFFSET;
+      sf.VertexURBEntryReadLength = sf_prog_data->urb_read_length;
+      sf.NumberofURBEntries = brw->urb.nr_sf_entries;
+      sf.URBEntryAllocationSize = brw->urb.sfsize - 1;
+
+      /* STATE_PREFETCH command description describes this state as being
+       * something loaded through the GPE (L2 ISC), so it's INSTRUCTION
+       * domain.
+       */
+      sf.SetupViewportStateOffset =
+         instruction_ro_bo(brw->batch.bo, brw->sf.vp_offset);
+
+      sf.PointRasterizationRule = RASTRULE_UPPER_RIGHT;
+
+      /* sf.ConstantURBEntryReadLength = stage_prog_data->curb_read_length; */
+      /* sf.ConstantURBEntryReadOffset = brw->curbe.vs_start * 2; */
+
+      sf.MaximumNumberofThreads =
+         MIN2(GEN_GEN == 5 ? 48 : 24, brw->urb.nr_sf_entries) - 1;
+
+      sf.SpritePointEnable = ctx->Point.PointSprite;
+
+      sf.DestinationOriginHorizontalBias = 0.5;
+      sf.DestinationOriginVerticalBias = 0.5;
+#else
    brw_batch_emit(brw, GENX(3DSTATE_SF), sf) {
       sf.StatisticsEnable = true;
+#endif
       sf.ViewportTransformEnable = true;
 
 #if GEN_GEN == 7
@@ -1346,7 +1460,8 @@ genX(upload_sf)(struct brw_context *brw)
 
 #if GEN_GEN <= 7
       /* _NEW_POLYGON */
-      sf.FrontWinding = ctx->Polygon._FrontBit == render_to_fbo;
+      sf.FrontWinding = brw->polygon_front_bit == render_to_fbo;
+#if GEN_GEN >= 6
       sf.GlobalDepthOffsetEnableSolid = ctx->Polygon.OffsetFill;
       sf.GlobalDepthOffsetEnableWireframe = ctx->Polygon.OffsetLine;
       sf.GlobalDepthOffsetEnablePoint = ctx->Polygon.OffsetPoint;
@@ -1379,6 +1494,14 @@ genX(upload_sf)(struct brw_context *brw)
             unreachable("not reached");
       }
 
+      if (multisampled_fbo && ctx->Multisample.Enabled)
+         sf.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;
+
+      sf.GlobalDepthOffsetConstant = ctx->Polygon.OffsetUnits * 2;
+      sf.GlobalDepthOffsetScale = ctx->Polygon.OffsetFactor;
+      sf.GlobalDepthOffsetClamp = ctx->Polygon.OffsetClamp;
+#endif
+
       sf.ScissorRectangleEnable = true;
 
       if (ctx->Polygon.CullFlag) {
@@ -1403,12 +1526,6 @@ genX(upload_sf)(struct brw_context *brw)
       sf.LineStippleEnable = ctx->Line.StippleFlag;
 #endif
 
-      if (multisampled_fbo && ctx->Multisample.Enabled)
-         sf.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;
-
-      sf.GlobalDepthOffsetConstant = ctx->Polygon.OffsetUnits * 2;
-      sf.GlobalDepthOffsetScale = ctx->Polygon.OffsetFactor;
-      sf.GlobalDepthOffsetClamp = ctx->Polygon.OffsetClamp;
 #endif
 
       /* _NEW_LINE */
@@ -1444,7 +1561,9 @@ genX(upload_sf)(struct brw_context *brw)
          sf.SmoothPointEnable = true;
 #endif
 
+#if GEN_IS_G4X || GEN_GEN >= 5
       sf.AALineDistanceMode = AALINEDISTANCE_TRUE;
+#endif
 
       /* _NEW_LIGHT */
       if (ctx->Light.ProvokingVertex != GL_FIRST_VERTEX_CONVENTION) {
@@ -1494,14 +1613,21 @@ static const struct brw_tracked_state genX(sf_state) = {
    .dirty = {
       .mesa  = _NEW_LIGHT |
                _NEW_LINE |
-               _NEW_MULTISAMPLE |
                _NEW_POINT |
                _NEW_PROGRAM |
+               (GEN_GEN >= 6 ? _NEW_MULTISAMPLE : 0) |
                (GEN_GEN <= 7 ? _NEW_BUFFERS | _NEW_POLYGON : 0),
       .brw   = BRW_NEW_BLORP |
-               BRW_NEW_CONTEXT |
                BRW_NEW_VUE_MAP_GEOM_OUT |
-               (GEN_GEN <= 7 ? BRW_NEW_GS_PROG_DATA |
+               (GEN_GEN <= 5 ? BRW_NEW_BATCH |
+                               BRW_NEW_PROGRAM_CACHE |
+                               BRW_NEW_SF_PROG_DATA |
+                               BRW_NEW_SF_VP |
+                               BRW_NEW_URB_FENCE
+                             : 0) |
+               (GEN_GEN >= 6 ? BRW_NEW_CONTEXT : 0) |
+               (GEN_GEN >= 6 && GEN_GEN <= 7 ?
+                               BRW_NEW_GS_PROG_DATA |
                                BRW_NEW_PRIMITIVE |
                                BRW_NEW_TES_PROG_DATA
                              : 0) |
@@ -1511,7 +1637,6 @@ static const struct brw_tracked_state genX(sf_state) = {
    },
    .emit = genX(upload_sf),
 };
-#endif
 
 /* ---------------------------------------------------------------------- */
 
@@ -1696,20 +1821,6 @@ static const struct brw_tracked_state genX(wm_state) = {
 
 /* ---------------------------------------------------------------------- */
 
-#if GEN_GEN == 4
-static inline struct brw_address
-KSP(struct brw_context *brw, uint32_t offset)
-{
-   return instruction_bo(brw->cache.bo, offset);
-}
-#else
-static inline uint32_t
-KSP(struct brw_context *brw, uint32_t offset)
-{
-   return offset;
-}
-#endif
-
 #define INIT_THREAD_DISPATCH_FIELDS(pkt, prefix) \
    pkt.KernelStartPointer = KSP(brw, stage_state->prog_offset);           \
    pkt.SamplerCount       =                                               \
@@ -1794,7 +1905,7 @@ genX(upload_vs_state)(struct brw_context *brw)
          CLAMP(brw->urb.nr_vs_entries / 2, 1, devinfo->max_vs_threads) - 1;
 
       vs.StatisticsEnable = false;
-      vs.SamplerStateOffset =
+      vs.SamplerStatePointer =
          instruction_ro_bo(brw->batch.bo, stage_state->sampler_offset);
 #endif
 
@@ -1863,10 +1974,157 @@ static const struct brw_tracked_state genX(vs_state) = {
 
 /* ---------------------------------------------------------------------- */
 
+static void
+genX(upload_cc_viewport)(struct brw_context *brw)
+{
+   struct gl_context *ctx = &brw->ctx;
+   /* Packs one CC_VIEWPORT (a depth range) per viewport into space obtained
+    * from brw_state_batch(), then publishes the resulting offset in a
+    * generation-specific way at the end of the function.
+    */
+   /* BRW_NEW_VIEWPORT_COUNT */
+   const unsigned viewport_count = brw->clip.viewport_count;
+   /* cc_map advances by CC_VIEWPORT_length uint32_t units per entry, so the
+    * requested size is 4 * length * count (presumably bytes; 32 is
+    * presumably the alignment -- confirm against brw_state_batch()).
+    *
+    * NOTE(review): ccv is not zero-initialized and only MinimumDepth and
+    * MaximumDepth are assigned below -- confirm CC_VIEWPORT has no other
+    * fields.
+    */
+   struct GENX(CC_VIEWPORT) ccv;
+   uint32_t cc_vp_offset;
+   uint32_t *cc_map =
+      brw_state_batch(brw, 4 * GENX(CC_VIEWPORT_length) * viewport_count,
+                      32, &cc_vp_offset);
+
+   for (unsigned i = 0; i < viewport_count; i++) {
+      /* _NEW_VIEWPORT | _NEW_TRANSFORM
+       *
+       * With depth clamp enabled the range is the viewport's Near/Far pair
+       * ordered low to high; otherwise it is the full 0.0 .. 1.0 range.
+       */
+      const struct gl_viewport_attrib *vp = &ctx->ViewportArray[i];
+      if (ctx->Transform.DepthClamp) {
+         ccv.MinimumDepth = MIN2(vp->Near, vp->Far);
+         ccv.MaximumDepth = MAX2(vp->Near, vp->Far);
+      } else {
+         ccv.MinimumDepth = 0.0;
+         ccv.MaximumDepth = 1.0;
+      }
+      GENX(CC_VIEWPORT_pack)(NULL, cc_map, &ccv);
+      cc_map += GENX(CC_VIEWPORT_length);
+   }
+   /* Publish cc_vp_offset: gen7+ and gen6 each emit a pointer packet;
+    * older gens only record it in brw->cc.vp_offset and flag
+    * BRW_NEW_CC_VP.
+    */
+#if GEN_GEN >= 7
+   brw_batch_emit(brw, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), ptr) {
+      ptr.CCViewportPointer = cc_vp_offset;
+   }
+#elif GEN_GEN == 6
+   brw_batch_emit(brw, GENX(3DSTATE_VIEWPORT_STATE_POINTERS), vp) {
+      vp.CCViewportStateChange = 1;
+      vp.PointertoCC_VIEWPORT = cc_vp_offset;
+   }
+#else /* GEN_GEN < 6 */
+   brw->cc.vp_offset = cc_vp_offset;
+   ctx->NewDriverState |= BRW_NEW_CC_VP;
+#endif /* GEN_GEN >= 7 */
+}
+/* Not static, unlike the other brw_tracked_state atoms visible in this
+ * file -- presumably referenced from another translation unit; confirm.
+ * The .mesa bits and BRW_NEW_VIEWPORT_COUNT mirror the state annotated
+ * inside genX(upload_cc_viewport).
+ */
+const struct brw_tracked_state genX(cc_vp) = {
+   .dirty = {
+      .mesa = _NEW_TRANSFORM |
+              _NEW_VIEWPORT,
+      .brw = BRW_NEW_BATCH |
+             BRW_NEW_BLORP |
+             BRW_NEW_VIEWPORT_COUNT,
+   },
+   .emit = genX(upload_cc_viewport)
+};
+
+/* ---------------------------------------------------------------------- */
+/**
+ * Compute the hardware scissor rectangle for viewport/scissor index \p i
+ * and store it in \p sc.
+ *
+ * bbox is {xmin, xmax, ymin, ymax}: the viewport clamped to the range
+ * [0, fb_width] x [0, fb_height] and then handed to
+ * _mesa_intersect_scissor_bounding_box().  The maxima are exclusive, which
+ * is why 1 is subtracted when they are written to the ScissorRectangle*Max
+ * fields.
+ */
+static inline void
+set_scissor_bits(const struct gl_context *ctx, int i,
+                 bool render_to_fbo, unsigned fb_width, unsigned fb_height,
+                 struct GENX(SCISSOR_RECT) *sc)
+{
+   int bbox[4];
+
+   bbox[0] = MAX2(ctx->ViewportArray[i].X, 0);
+   bbox[1] = MIN2(bbox[0] + ctx->ViewportArray[i].Width, fb_width);
+   bbox[2] = MAX2(ctx->ViewportArray[i].Y, 0);
+   bbox[3] = MIN2(bbox[2] + ctx->ViewportArray[i].Height, fb_height);
+   _mesa_intersect_scissor_bounding_box(ctx, i, bbox);
+
+   if (bbox[0] == bbox[1] || bbox[2] == bbox[3]) {
+      /* If the scissor was out of bounds and got clamped to 0 width/height
+       * at the bounds, the subtraction of 1 from maximums could produce a
+       * negative number and thus not clip anything.  Instead, just provide
+       * a min > max scissor inside the bounds, which produces the expected
+       * no rendering.
+       */
+      sc->ScissorRectangleXMin = 1;
+      sc->ScissorRectangleXMax = 0;
+      sc->ScissorRectangleYMin = 1;
+      sc->ScissorRectangleYMax = 0;
+   } else if (render_to_fbo) {
+      /* texmemory: Y=0=bottom, so the Y range is used unchanged */
+      sc->ScissorRectangleXMin = bbox[0];
+      sc->ScissorRectangleXMax = bbox[1] - 1;
+      sc->ScissorRectangleYMin = bbox[2];
+      sc->ScissorRectangleYMax = bbox[3] - 1;
+   } else {
+      /* memory: Y=0=top, so the Y range is mirrored within fb_height */
+      sc->ScissorRectangleXMin = bbox[0];
+      sc->ScissorRectangleXMax = bbox[1] - 1;
+      sc->ScissorRectangleYMin = fb_height - bbox[3];
+      sc->ScissorRectangleYMax = fb_height - bbox[2] - 1;
+   }
+}
+
 #if GEN_GEN >= 6
 static void
-brw_calculate_guardband_size(const struct gen_device_info *devinfo,
-                             uint32_t fb_width, uint32_t fb_height,
+genX(upload_scissor_state)(struct brw_context *brw)
+{
+   struct gl_context *ctx = &brw->ctx;
+   const bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
+   struct GENX(SCISSOR_RECT) scissor;
+   uint32_t scissor_state_offset;
+   const unsigned int fb_width = _mesa_geometric_width(ctx->DrawBuffer);
+   const unsigned int fb_height = _mesa_geometric_height(ctx->DrawBuffer);
+   uint32_t *scissor_map;
+   /* Packs one SCISSOR_RECT per viewport into space obtained from
+    * brw_state_batch() and points 3DSTATE_SCISSOR_STATE_POINTERS at it.
+    * "scissor" is not zero-initialized: set_scissor_bits() assigns all
+    * four X/Y min/max fields on every path before each pack.
+    */
+   /* BRW_NEW_VIEWPORT_COUNT */
+   const unsigned viewport_count = brw->clip.viewport_count;
+
+   scissor_map = brw_state_batch(
+      brw, GENX(SCISSOR_RECT_length) * sizeof(uint32_t) * viewport_count,
+      32, &scissor_state_offset);
+
+   /* _NEW_SCISSOR | _NEW_BUFFERS | _NEW_VIEWPORT */
+
+   /* The scissor only needs to handle the intersection of drawable and
+    * scissor rect.  Clipping to the boundaries of static shared buffers
+    * for front/back/depth is covered by looping over cliprects in brw_draw.c.
+    *
+    * Note that the hardware's coordinates are inclusive, while Mesa's min is
+    * inclusive but max is exclusive.
+    */
+   for (unsigned i = 0; i < viewport_count; i++) {
+      set_scissor_bits(ctx, i, render_to_fbo, fb_width, fb_height, &scissor);
+      GENX(SCISSOR_RECT_pack)(
+         NULL, scissor_map + i * GENX(SCISSOR_RECT_length), &scissor);
+   }
+
+   brw_batch_emit(brw, GENX(3DSTATE_SCISSOR_STATE_POINTERS), ptr) {
+      ptr.ScissorRectPointer = scissor_state_offset;
+   }
+}
+
+static const struct brw_tracked_state genX(scissor_state) = {
+   .dirty = {
+      .mesa = _NEW_BUFFERS | /* same bits as annotated in the emit function */
+              _NEW_SCISSOR |
+              _NEW_VIEWPORT,
+      .brw = BRW_NEW_BATCH |
+             BRW_NEW_BLORP |
+             BRW_NEW_VIEWPORT_COUNT,
+   },
+   .emit = genX(upload_scissor_state),
+};
+#endif
+
+/* ---------------------------------------------------------------------- */
+
+static void
+brw_calculate_guardband_size(uint32_t fb_width, uint32_t fb_height,
                              float m00, float m11, float m30, float m31,
                              float *xmin, float *xmax,
                              float *ymin, float *ymax)
@@ -1907,7 +2165,7 @@ brw_calculate_guardband_size(const struct gen_device_info *devinfo,
     *
     * So, limit the guardband to 16K on Gen7+ and 8K on Sandybridge.
     */
-   const float gb_size = devinfo->gen >= 7 ? 16384.0f : 8192.0f;
+   const float gb_size = GEN_GEN >= 7 ? 16384.0f : 8192.0f;
 
    if (m00 != 0 && m11 != 0) {
       /* First, we compute the screen-space render area */
@@ -1950,7 +2208,6 @@ genX(upload_sf_clip_viewport)(struct brw_context *brw)
 {
    struct gl_context *ctx = &brw->ctx;
    float y_scale, y_bias;
-   const struct gen_device_info *devinfo = &brw->screen->devinfo;
 
    /* BRW_NEW_VIEWPORT_COUNT */
    const unsigned viewport_count = brw->clip.viewport_count;
@@ -1964,15 +2221,19 @@ genX(upload_sf_clip_viewport)(struct brw_context *brw)
 #define clv sfv
    struct GENX(SF_CLIP_VIEWPORT) sfv;
    uint32_t sf_clip_vp_offset;
-   uint32_t *sf_clip_map = brw_state_batch(brw, 16 * 4 * viewport_count,
-                                           64, &sf_clip_vp_offset);
+   uint32_t *sf_clip_map =
+      brw_state_batch(brw, GENX(SF_CLIP_VIEWPORT_length) * 4 * viewport_count,
+                      64, &sf_clip_vp_offset);
 #else
    struct GENX(SF_VIEWPORT) sfv;
    struct GENX(CLIP_VIEWPORT) clv;
-   uint32_t *sf_map = brw_state_batch(brw, 8 * 4 * viewport_count,
-                                      32, &brw->sf.vp_offset);
-   uint32_t *clip_map = brw_state_batch(brw, 4 * 4 * viewport_count,
-                                        32, &brw->clip.vp_offset);
+   uint32_t sf_vp_offset, clip_vp_offset;
+   uint32_t *sf_map =
+      brw_state_batch(brw, GENX(SF_VIEWPORT_length) * 4 * viewport_count,
+                      32, &sf_vp_offset);
+   uint32_t *clip_map =
+      brw_state_batch(brw, GENX(CLIP_VIEWPORT_length) * 4 * viewport_count,
+                      32, &clip_vp_offset);
 #endif
 
    /* _NEW_BUFFERS */
@@ -1995,7 +2256,7 @@ genX(upload_sf_clip_viewport)(struct brw_context *brw)
       sfv.ViewportMatrixElementm30 = translate[0],
       sfv.ViewportMatrixElementm31 = translate[1] * y_scale + y_bias,
       sfv.ViewportMatrixElementm32 = translate[2],
-      brw_calculate_guardband_size(devinfo, fb_width, fb_height,
+      brw_calculate_guardband_size(fb_width, fb_height,
                                    sfv.ViewportMatrixElementm00,
                                    sfv.ViewportMatrixElementm11,
                                    sfv.ViewportMatrixElementm30,
@@ -2008,7 +2269,10 @@ genX(upload_sf_clip_viewport)(struct brw_context *brw)
       clv.YMinClipGuardband = gb_ymin;
       clv.YMaxClipGuardband = gb_ymax;
 
-#if GEN_GEN >= 8
+#if GEN_GEN < 6
+      set_scissor_bits(ctx, i, render_to_fbo, fb_width, fb_height,
+                       &sfv.ScissorRectangle);
+#elif GEN_GEN >= 8
       /* _NEW_VIEWPORT | _NEW_BUFFERS: Screen Space Viewport
        * The hardware will take the intersection of the drawing rectangle,
        * scissor rectangle, and the viewport extents. We don't need to be
@@ -2034,12 +2298,12 @@ genX(upload_sf_clip_viewport)(struct brw_context *brw)
 
 #if GEN_GEN >= 7
       GENX(SF_CLIP_VIEWPORT_pack)(NULL, sf_clip_map, &sfv);
-      sf_clip_map += 16;
+      sf_clip_map += GENX(SF_CLIP_VIEWPORT_length);
 #else
       GENX(SF_VIEWPORT_pack)(NULL, sf_map, &sfv);
       GENX(CLIP_VIEWPORT_pack)(NULL, clip_map, &clv);
-      sf_map += 8;
-      clip_map += 4;
+      sf_map += GENX(SF_VIEWPORT_length);
+      clip_map += GENX(CLIP_VIEWPORT_length);
 #endif
    }
 
@@ -2047,7 +2311,16 @@ genX(upload_sf_clip_viewport)(struct brw_context *brw)
    brw_batch_emit(brw, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP), ptr) {
       ptr.SFClipViewportPointer = sf_clip_vp_offset;
    }
+#elif GEN_GEN == 6
+   brw_batch_emit(brw, GENX(3DSTATE_VIEWPORT_STATE_POINTERS), vp) {
+      vp.SFViewportStateChange = 1;
+      vp.CLIPViewportStateChange = 1;
+      vp.PointertoCLIP_VIEWPORT = clip_vp_offset;
+      vp.PointertoSF_VIEWPORT = sf_vp_offset;
+   }
 #else
+   brw->sf.vp_offset = sf_vp_offset;
+   brw->clip.vp_offset = clip_vp_offset;
    brw->ctx.NewDriverState |= BRW_NEW_SF_VP | BRW_NEW_CLIP_VP;
 #endif
 }
@@ -2055,14 +2328,14 @@ genX(upload_sf_clip_viewport)(struct brw_context *brw)
 static const struct brw_tracked_state genX(sf_clip_viewport) = {
    .dirty = {
       .mesa = _NEW_BUFFERS |
-              _NEW_VIEWPORT,
+              _NEW_VIEWPORT |
+              (GEN_GEN <= 5 ? _NEW_SCISSOR : 0),
       .brw = BRW_NEW_BATCH |
              BRW_NEW_BLORP |
              BRW_NEW_VIEWPORT_COUNT,
    },
    .emit = genX(upload_sf_clip_viewport),
 };
-#endif
 
 /* ---------------------------------------------------------------------- */
 
@@ -2084,7 +2357,7 @@ genX(upload_gs_state)(struct brw_context *brw)
       brw_gs_prog_data(stage_prog_data);
 #endif
 
-#if GEN_GEN < 7
+#if GEN_GEN == 6
    brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_GS), cgs) {
       if (active && stage_state->push_const_size != 0) {
          cgs.Buffer0Valid = true;
@@ -2111,8 +2384,8 @@ genX(upload_gs_state)(struct brw_context *brw)
       gen7_emit_cs_stall_flush(brw);
 #endif
 
-   if (active) {
-      brw_batch_emit(brw, GENX(3DSTATE_GS), gs) {
+   brw_batch_emit(brw, GENX(3DSTATE_GS), gs) {
+      if (active) {
          INIT_THREAD_DISPATCH_FIELDS(gs, Vertex);
 
 #if GEN_GEN >= 7
@@ -2196,16 +2469,28 @@ genX(upload_gs_state)(struct brw_context *brw)
          gs.VertexURBEntryOutputReadOffset = urb_entry_write_offset;
          gs.VertexURBEntryOutputLength = MAX2(urb_entry_output_length, 1);
 #endif
-      }
 #if GEN_GEN < 7
-   } else if (brw->ff_gs.prog_active)  {
-      /* In gen6, transform feedback for the VS stage is done with an ad-hoc GS
-       * program. This function provides the needed 3DSTATE_GS for this.
-       */
-      upload_gs_state_for_tf(brw);
+      } else if (brw->ff_gs.prog_active) {
+         /* In gen6, transform feedback for the VS stage is done with an
+          * ad-hoc GS program. This function provides the needed 3DSTATE_GS
+          * for this.
+          */
+         gs.KernelStartPointer = KSP(brw, brw->ff_gs.prog_offset);
+         gs.SingleProgramFlow = true;
+         gs.VectorMaskEnable = true;
+         gs.DispatchGRFStartRegisterForURBData = 2;
+         gs.VertexURBEntryReadLength = brw->ff_gs.prog_data->urb_read_length;
+         gs.MaximumNumberofThreads = devinfo->max_gs_threads - 1;
+         gs.StatisticsEnable = true;
+         gs.SOStatisticsEnable = true;
+         gs.RenderingEnabled = true;
+         gs.SVBIPayloadEnable = true;
+         gs.SVBIPostIncrementEnable = true;
+         gs.SVBIPostIncrementValue =
+            brw->ff_gs.prog_data->svbi_postincrement_value;
+         gs.Enable = true;
 #endif
-   } else {
-      brw_batch_emit(brw, GENX(3DSTATE_GS), gs) {
+      } else {
          gs.StatisticsEnable = true;
 #if GEN_GEN < 7
          gs.RenderingEnabled = true;
@@ -2219,7 +2504,8 @@ genX(upload_gs_state)(struct brw_context *brw)
 #endif
       }
    }
-#if GEN_GEN < 7
+
+#if GEN_GEN == 6
    brw->gs.enabled = active;
 #endif
 }
@@ -2240,20 +2526,157 @@ static const struct brw_tracked_state genX(gs_state) = {
 
 /* ---------------------------------------------------------------------- */
 
+UNUSED static GLenum
+fix_dual_blend_alpha_to_one(GLenum function)
+{  /* Rewrite the blend factors that read source-1 alpha as though that
+    * alpha were 1.0 (GL_SRC1_ALPHA -> GL_ONE, GL_ONE_MINUS_SRC1_ALPHA ->
+    * GL_ZERO); every other factor is returned unchanged.
+    */
+   switch (function) {
+   case GL_SRC1_ALPHA:
+      return GL_ONE;
+
+   case GL_ONE_MINUS_SRC1_ALPHA:
+      return GL_ZERO;
+   }
+
+   return function;
+}
+
 #define blend_factor(x) brw_translate_blend_factor(x)
 #define blend_eqn(x) brw_translate_blend_equation(x)
 
-#if GEN_GEN >= 6
-static void
-genX(upload_blend_state)(struct brw_context *brw)
+/**
+ * Modify blend function to force destination alpha to 1.0
+ *
+ * If \c function specifies a blend function that uses destination alpha,
+ * replace it with a function that hard-wires destination alpha to 1.0.  This
+ * is used when rendering to xRGB targets.
+ *
+ * With Ad fixed at 1.0: GL_DST_ALPHA becomes GL_ONE, GL_ONE_MINUS_DST_ALPHA
+ * becomes GL_ZERO, and GL_SRC_ALPHA_SATURATE, whose RGB factor is
+ * min(As, 1 - Ad), also becomes GL_ZERO.  Any other value is returned
+ * unchanged.
+ */
+static GLenum
+brw_fix_xRGB_alpha(GLenum function)
 {
-   struct gl_context *ctx = &brw->ctx;
-   int size;
+   switch (function) {
+   case GL_DST_ALPHA:
+      return GL_ONE;
 
-   /* We need at least one BLEND_STATE written, because we might do
-    * thread dispatch even if _NumColorDrawBuffers is 0 (for example
-    * for computed depth or alpha test), which will do an FB write
-    * with render target 0, which will reference BLEND_STATE[0] for
+   case GL_ONE_MINUS_DST_ALPHA:
+   case GL_SRC_ALPHA_SATURATE:
+      return GL_ZERO;
+   }
+
+   return function;
+}
+
 #if GEN_GEN >= 6
+typedef struct GENX(BLEND_STATE_ENTRY) BLEND_ENTRY_GENXML;
+#else /* GEN_GEN < 6: the blend fields are part of COLOR_CALC_STATE */
+typedef struct GENX(COLOR_CALC_STATE) BLEND_ENTRY_GENXML;
+#endif /* GEN_GEN >= 6 */
+/**
+ * Fill in the logic-op or blend fields of \p entry for color draw buffer
+ * \p i.
+ *
+ * When neither logic op nor blending applies, \p entry is left untouched.
+ * \p alpha_to_one is only consulted on gen6+ when the buffer's blend state
+ * uses dual-source factors.
+ *
+ * \return true when blending was set up and the alpha channel ended up with
+ * a different factor or equation than RGB; false otherwise.
+ */
+UNUSED static bool
+set_blend_entry_bits(struct brw_context *brw, BLEND_ENTRY_GENXML *entry, int i,
+                     bool alpha_to_one)
+{
+   struct gl_context *ctx = &brw->ctx;
+
+   /* _NEW_BUFFERS */
+   const struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[i];
+
+   bool independent_alpha_blend = false;
+
+   /* Used for implementing the following bit of GL_EXT_texture_integer:
+    * "Per-fragment operations that require floating-point color
+    *  components, including multisample alpha operations, alpha test,
+    *  blending, and dithering, have no effect when the corresponding
+    *  colors are written to an integer color buffer."
+    *
+    * Only honoured on gen6+: the blend condition below skips this check
+    * when GEN_GEN <= 5.
+    */
+   const bool integer = ctx->DrawBuffer->_IntegerBuffers & (0x1 << i);
+   /* gen6+ tests the enable bit of buffer i; older gens test the whole
+    * BlendEnabled mask.
+    */
+   const unsigned blend_enabled = GEN_GEN >= 6 ?
+      ctx->Color.BlendEnabled & (1 << i) : ctx->Color.BlendEnabled;
+
+   /* _NEW_COLOR
+    *
+    * Logic op takes precedence over blending.  Before gen8 it is only
+    * enabled for GL_UNSIGNED_NORMALIZED buffers; a missing renderbuffer is
+    * treated as GL_UNSIGNED_NORMALIZED.
+    */
+   if (ctx->Color.ColorLogicOpEnabled) {
+      GLenum rb_type = rb ? _mesa_get_format_datatype(rb->Format)
+         : GL_UNSIGNED_NORMALIZED;
+      WARN_ONCE(ctx->Color.LogicOp != GL_COPY &&
+                rb_type != GL_UNSIGNED_NORMALIZED &&
+                rb_type != GL_FLOAT, "Ignoring %s logic op on %s "
+                "renderbuffer\n",
+                _mesa_enum_to_string(ctx->Color.LogicOp),
+                _mesa_enum_to_string(rb_type));
+      if (GEN_GEN >= 8 || rb_type == GL_UNSIGNED_NORMALIZED) {
+         entry->LogicOpEnable = true;
+         entry->LogicOpFunction =
+            intel_translate_logic_op(ctx->Color.LogicOp);
+      }
+   } else if (blend_enabled && !ctx->Color._AdvancedBlendMode
+              && (GEN_GEN <= 5 || !integer)) {
+      GLenum eqRGB = ctx->Color.Blend[i].EquationRGB;
+      GLenum eqA = ctx->Color.Blend[i].EquationA;
+      GLenum srcRGB = ctx->Color.Blend[i].SrcRGB;
+      GLenum dstRGB = ctx->Color.Blend[i].DstRGB;
+      GLenum srcA = ctx->Color.Blend[i].SrcA;
+      GLenum dstA = ctx->Color.Blend[i].DstA;
+      /* GL defines the MIN/MAX equations without blend factors, so both
+       * factors are forced to GL_ONE (presumably because the hardware would
+       * otherwise still apply them -- confirm).
+       */
+      if (eqRGB == GL_MIN || eqRGB == GL_MAX)
+         srcRGB = dstRGB = GL_ONE;
+
+      if (eqA == GL_MIN || eqA == GL_MAX)
+         srcA = dstA = GL_ONE;
+
+      /* Due to hardware limitations, the destination may have information
+       * in an alpha channel even when the format specifies no alpha
+       * channel. In order to avoid getting any incorrect blending due to
+       * that alpha channel, coerce the blend factors to values that will
+       * not read the alpha channel, but will instead use the correct
+       * implicit value for alpha.
+       */
+      if (rb && !_mesa_base_format_has_channel(rb->_BaseFormat,
+                                               GL_TEXTURE_ALPHA_TYPE)) {
+         srcRGB = brw_fix_xRGB_alpha(srcRGB);
+         srcA = brw_fix_xRGB_alpha(srcA);
+         dstRGB = brw_fix_xRGB_alpha(dstRGB);
+         dstA = brw_fix_xRGB_alpha(dstA);
+      }
+
+      /* From the BLEND_STATE docs, DWord 0, Bit 29 (AlphaToOne Enable):
+       * "If Dual Source Blending is enabled, this bit must be disabled."
+       *
+       * We override SRC1_ALPHA to ONE and ONE_MINUS_SRC1_ALPHA to ZERO,
+       * and leave it enabled anyway.
+       */
+      if (GEN_GEN >= 6 && ctx->Color.Blend[i]._UsesDualSrc && alpha_to_one) {
+         srcRGB = fix_dual_blend_alpha_to_one(srcRGB);
+         srcA = fix_dual_blend_alpha_to_one(srcA);
+         dstRGB = fix_dual_blend_alpha_to_one(dstRGB);
+         dstA = fix_dual_blend_alpha_to_one(dstA);
+      }
+
+      entry->ColorBufferBlendEnable = true;
+      entry->DestinationBlendFactor = blend_factor(dstRGB);
+      entry->SourceBlendFactor = blend_factor(srcRGB);
+      entry->DestinationAlphaBlendFactor = blend_factor(dstA);
+      entry->SourceAlphaBlendFactor = blend_factor(srcA);
+      entry->ColorBlendFunction = blend_eqn(eqRGB);
+      entry->AlphaBlendFunction = blend_eqn(eqA);
+
+      if (srcA != srcRGB || dstA != dstRGB || eqA != eqRGB)
+         independent_alpha_blend = true;
+   }
+
+   return independent_alpha_blend;
+}
+
+#if GEN_GEN >= 6
+static void
+genX(upload_blend_state)(struct brw_context *brw)
+{
+   struct gl_context *ctx = &brw->ctx;
+   int size;
+
+   /* We need at least one BLEND_STATE written, because we might do
+    * thread dispatch even if _NumColorDrawBuffers is 0 (for example
+    * for computed depth or alpha test), which will do an FB write
+    * with render target 0, which will reference BLEND_STATE[0] for
     * alpha test enable.
     */
    int nr_draw_buffers = ctx->DrawBuffer->_NumColorDrawBuffers;
@@ -2310,74 +2733,9 @@ genX(upload_blend_state)(struct brw_context *brw)
 #else
       {
 #endif
-
-         /* _NEW_BUFFERS */
-         struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[i];
-
-         /* Used for implementing the following bit of GL_EXT_texture_integer:
-          * "Per-fragment operations that require floating-point color
-          *  components, including multisample alpha operations, alpha test,
-          *  blending, and dithering, have no effect when the corresponding
-          *  colors are written to an integer color buffer."
-          */
-         bool integer = ctx->DrawBuffer->_IntegerBuffers & (0x1 << i);
-
-         /* _NEW_COLOR */
-         if (ctx->Color.ColorLogicOpEnabled) {
-            GLenum rb_type = rb ? _mesa_get_format_datatype(rb->Format)
-                                : GL_UNSIGNED_NORMALIZED;
-            WARN_ONCE(ctx->Color.LogicOp != GL_COPY &&
-                      rb_type != GL_UNSIGNED_NORMALIZED &&
-                      rb_type != GL_FLOAT, "Ignoring %s logic op on %s "
-                      "renderbuffer\n",
-                      _mesa_enum_to_string(ctx->Color.LogicOp),
-                      _mesa_enum_to_string(rb_type));
-            if (GEN_GEN >= 8 || rb_type == GL_UNSIGNED_NORMALIZED) {
-               entry.LogicOpEnable = true;
-               entry.LogicOpFunction =
-                  intel_translate_logic_op(ctx->Color.LogicOp);
-            }
-         } else if (ctx->Color.BlendEnabled & (1 << i) && !integer &&
-                    !ctx->Color._AdvancedBlendMode) {
-            GLenum eqRGB = ctx->Color.Blend[i].EquationRGB;
-            GLenum eqA = ctx->Color.Blend[i].EquationA;
-            GLenum srcRGB = ctx->Color.Blend[i].SrcRGB;
-            GLenum dstRGB = ctx->Color.Blend[i].DstRGB;
-            GLenum srcA = ctx->Color.Blend[i].SrcA;
-            GLenum dstA = ctx->Color.Blend[i].DstA;
-
-            if (eqRGB == GL_MIN || eqRGB == GL_MAX)
-               srcRGB = dstRGB = GL_ONE;
-
-            if (eqA == GL_MIN || eqA == GL_MAX)
-               srcA = dstA = GL_ONE;
-
-            /* Due to hardware limitations, the destination may have information
-             * in an alpha channel even when the format specifies no alpha
-             * channel. In order to avoid getting any incorrect blending due to
-             * that alpha channel, coerce the blend factors to values that will
-             * not read the alpha channel, but will instead use the correct
-             * implicit value for alpha.
-             */
-            if (rb && !_mesa_base_format_has_channel(rb->_BaseFormat,
-                                                     GL_TEXTURE_ALPHA_TYPE)) {
-               srcRGB = brw_fix_xRGB_alpha(srcRGB);
-               srcA = brw_fix_xRGB_alpha(srcA);
-               dstRGB = brw_fix_xRGB_alpha(dstRGB);
-               dstA = brw_fix_xRGB_alpha(dstA);
-            }
-
-            entry.ColorBufferBlendEnable = true;
-            entry.DestinationBlendFactor = blend_factor(dstRGB);
-            entry.SourceBlendFactor = blend_factor(srcRGB);
-            entry.DestinationAlphaBlendFactor = blend_factor(dstA);
-            entry.SourceAlphaBlendFactor = blend_factor(srcA);
-            entry.ColorBlendFunction = blend_eqn(eqRGB);
-            entry.AlphaBlendFunction = blend_eqn(eqA);
-
-            if (srcA != srcRGB || dstA != dstRGB || eqA != eqRGB)
-               blend.IndependentAlphaBlendEnable = true;
-         }
+         blend.IndependentAlphaBlendEnable =
+            set_blend_entry_bits(brw, &entry, i, blend.AlphaToOneEnable) ||
+            blend.IndependentAlphaBlendEnable;
 
          /* See section 8.1.6 "Pre-Blend Color Clamping" of the
           * SandyBridge PRM Volume 2 Part 1 for HW requirements.
@@ -2405,16 +2763,6 @@ genX(upload_blend_state)(struct brw_context *brw)
          entry.WriteDisableBlue  = !ctx->Color.ColorMask[i][2];
          entry.WriteDisableAlpha = !ctx->Color.ColorMask[i][3];
 
-         /* From the BLEND_STATE docs, DWord 0, Bit 29 (AlphaToOne Enable):
-          * "If Dual Source Blending is enabled, this bit must be disabled."
-          */
-         WARN_ONCE(ctx->Color.Blend[i]._UsesDualSrc &&
-                   _mesa_is_multisample_enabled(ctx) &&
-                   ctx->Multisample.SampleAlphaToOne,
-                   "HW workaround: disabling alpha to one with dual src "
-                   "blending\n");
-         if (ctx->Color.Blend[i]._UsesDualSrc)
-            blend.AlphaToOneEnable = false;
 #if GEN_GEN >= 8
          GENX(BLEND_STATE_ENTRY_pack)(NULL, &blend_map[1 + i * 2], &entry);
 #else
@@ -2457,92 +2805,6 @@ static const struct brw_tracked_state genX(blend_state) = {
 
 /* ---------------------------------------------------------------------- */
 
-#if GEN_GEN >= 6
-static void
-genX(upload_scissor_state)(struct brw_context *brw)
-{
-   struct gl_context *ctx = &brw->ctx;
-   const bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
-   struct GENX(SCISSOR_RECT) scissor;
-   uint32_t scissor_state_offset;
-   const unsigned int fb_width = _mesa_geometric_width(ctx->DrawBuffer);
-   const unsigned int fb_height = _mesa_geometric_height(ctx->DrawBuffer);
-   uint32_t *scissor_map;
-
-   /* BRW_NEW_VIEWPORT_COUNT */
-   const unsigned viewport_count = brw->clip.viewport_count;
-
-   scissor_map = brw_state_batch(
-      brw, GENX(SCISSOR_RECT_length) * sizeof(uint32_t) * viewport_count,
-      32, &scissor_state_offset);
-
-   /* _NEW_SCISSOR | _NEW_BUFFERS | _NEW_VIEWPORT */
-
-   /* The scissor only needs to handle the intersection of drawable and
-    * scissor rect.  Clipping to the boundaries of static shared buffers
-    * for front/back/depth is covered by looping over cliprects in brw_draw.c.
-    *
-    * Note that the hardware's coordinates are inclusive, while Mesa's min is
-    * inclusive but max is exclusive.
-    */
-   for (unsigned i = 0; i < viewport_count; i++) {
-      int bbox[4];
-
-      bbox[0] = MAX2(ctx->ViewportArray[i].X, 0);
-      bbox[1] = MIN2(bbox[0] + ctx->ViewportArray[i].Width, fb_width);
-      bbox[2] = MAX2(ctx->ViewportArray[i].Y, 0);
-      bbox[3] = MIN2(bbox[2] + ctx->ViewportArray[i].Height, fb_height);
-      _mesa_intersect_scissor_bounding_box(ctx, i, bbox);
-
-      if (bbox[0] == bbox[1] || bbox[2] == bbox[3]) {
-         /* If the scissor was out of bounds and got clamped to 0 width/height
-          * at the bounds, the subtraction of 1 from maximums could produce a
-          * negative number and thus not clip anything.  Instead, just provide
-          * a min > max scissor inside the bounds, which produces the expected
-          * no rendering.
-          */
-         scissor.ScissorRectangleXMin = 1;
-         scissor.ScissorRectangleXMax = 0;
-         scissor.ScissorRectangleYMin = 1;
-         scissor.ScissorRectangleYMax = 0;
-      } else if (render_to_fbo) {
-         /* texmemory: Y=0=bottom */
-         scissor.ScissorRectangleXMin = bbox[0];
-         scissor.ScissorRectangleXMax = bbox[1] - 1;
-         scissor.ScissorRectangleYMin = bbox[2];
-         scissor.ScissorRectangleYMax = bbox[3] - 1;
-      } else {
-         /* memory: Y=0=top */
-         scissor.ScissorRectangleXMin = bbox[0];
-         scissor.ScissorRectangleXMax = bbox[1] - 1;
-         scissor.ScissorRectangleYMin = fb_height - bbox[3];
-         scissor.ScissorRectangleYMax = fb_height - bbox[2] - 1;
-      }
-
-      GENX(SCISSOR_RECT_pack)(
-         NULL, scissor_map + i * GENX(SCISSOR_RECT_length), &scissor);
-   }
-
-   brw_batch_emit(brw, GENX(3DSTATE_SCISSOR_STATE_POINTERS), ptr) {
-      ptr.ScissorRectPointer = scissor_state_offset;
-   }
-}
-
-static const struct brw_tracked_state genX(scissor_state) = {
-   .dirty = {
-      .mesa = _NEW_BUFFERS |
-              _NEW_SCISSOR |
-              _NEW_VIEWPORT,
-      .brw = BRW_NEW_BATCH |
-             BRW_NEW_BLORP |
-             BRW_NEW_VIEWPORT_COUNT,
-   },
-   .emit = genX(upload_scissor_state),
-};
-#endif
-
-/* ---------------------------------------------------------------------- */
-
 #if GEN_GEN >= 7
 UNUSED static const uint32_t push_constant_opcodes[] = {
    [MESA_SHADER_VERTEX]                      = 21,
@@ -2564,15 +2826,13 @@ upload_constant_state(struct brw_context *brw,
    brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_VS), pkt) {
       pkt._3DCommandSubOpcode = push_constant_opcodes[stage];
       if (active) {
-#if GEN_GEN >= 9
-         pkt.ConstantBody.ConstantBuffer2ReadLength =
-            stage_state->push_const_size;
-         pkt.ConstantBody.PointerToConstantBuffer2 =
-            render_ro_bo(brw->batch.bo, stage_state->push_const_offset);
+#if GEN_GEN >= 8 || GEN_IS_HASWELL
+         pkt.ConstantBody.ReadLength[2] = stage_state->push_const_size;
+         pkt.ConstantBody.Buffer[2] =
+            render_ro_bo(brw->curbe.curbe_bo, stage_state->push_const_offset);
 #else
-         pkt.ConstantBody.ConstantBuffer0ReadLength =
-            stage_state->push_const_size;
-         pkt.ConstantBody.PointerToConstantBuffer0.offset =
+         pkt.ConstantBody.ReadLength[0] = stage_state->push_const_size;
+         pkt.ConstantBody.Buffer[0].offset =
             stage_state->push_const_offset | mocs;
 #endif
       }
@@ -2775,53 +3035,80 @@ static const struct brw_tracked_state genX(multisample_state) = {
 
 /* ---------------------------------------------------------------------- */
 
-#if GEN_GEN >= 6
+/**
+ * Emit COLOR_CALC_STATE and record its offset in brw->cc.state_offset.
+ *
+ * For GEN_GEN <= 5 the structure also receives the blend bits for draw
+ * buffer 0 (set_blend_entry_bits), the depth/stencil bits
+ * (set_depth_stencil_bits), alpha test enable/function, dithering, the
+ * statistics enable and the CC viewport pointer; afterwards
+ * BRW_NEW_GEN4_UNIT_STATE is flagged in NewDriverState.
+ *
+ * For GEN_GEN >= 6 it receives the blend constant color and, below Gen9,
+ * the stencil reference values; a 3DSTATE_CC_STATE_POINTERS packet
+ * pointing at the new state is then emitted.
+ *
+ * The alpha reference value is written on every generation.
+ */
 static void
 genX(upload_color_calc_state)(struct brw_context *brw)
 {
    struct gl_context *ctx = &brw->ctx;
 
    brw_state_emit(brw, GENX(COLOR_CALC_STATE), 64, &brw->cc.state_offset, cc) {
+#if GEN_GEN <= 5
+      cc.IndependentAlphaBlendEnable =
+         set_blend_entry_bits(brw, &cc, 0, false);
+      set_depth_stencil_bits(brw, &cc);
+
+      if (ctx->Color.AlphaEnabled &&
+          ctx->DrawBuffer->_NumColorDrawBuffers <= 1) {
+         cc.AlphaTestEnable = true;
+         cc.AlphaTestFunction =
+            intel_translate_compare_func(ctx->Color.AlphaFunc);
+      }
+
+      cc.ColorDitherEnable = ctx->Color.DitherFlag;
+
+      cc.StatisticsEnable = brw->stats_wm;
+
+      cc.CCViewportStatePointer =
+         instruction_ro_bo(brw->batch.bo, brw->cc.vp_offset);
+#else
       /* _NEW_COLOR */
-      cc.AlphaTestFormat = ALPHATEST_UNORM8;
-      UNCLAMPED_FLOAT_TO_UBYTE(cc.AlphaReferenceValueAsUNORM8,
-                               ctx->Color.AlphaRef);
+      cc.BlendConstantColorRed = ctx->Color.BlendColorUnclamped[0];
+      cc.BlendConstantColorGreen = ctx->Color.BlendColorUnclamped[1];
+      cc.BlendConstantColorBlue = ctx->Color.BlendColorUnclamped[2];
+      cc.BlendConstantColorAlpha = ctx->Color.BlendColorUnclamped[3];
 
 #if GEN_GEN < 9
       /* _NEW_STENCIL */
       cc.StencilReferenceValue = _mesa_get_stencil_ref(ctx, 0);
       cc.BackfaceStencilReferenceValue =
          _mesa_get_stencil_ref(ctx, ctx->Stencil._BackFace);
+#endif
+
 #endif
 
       /* _NEW_COLOR */
-      cc.BlendConstantColorRed = ctx->Color.BlendColorUnclamped[0];
-      cc.BlendConstantColorGreen = ctx->Color.BlendColorUnclamped[1];
-      cc.BlendConstantColorBlue = ctx->Color.BlendColorUnclamped[2];
-      cc.BlendConstantColorAlpha = ctx->Color.BlendColorUnclamped[3];
+      UNCLAMPED_FLOAT_TO_UBYTE(cc.AlphaReferenceValueAsUNORM8,
+                               ctx->Color.AlphaRef);
    }
 
+#if GEN_GEN >= 6
    brw_batch_emit(brw, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
       ptr.ColorCalcStatePointer = brw->cc.state_offset;
 #if GEN_GEN != 7
       ptr.ColorCalcStatePointerValid = true;
 #endif
    }
+#else
+   brw->ctx.NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
+#endif
 }
 
+/* Tracked-state atom for COLOR_CALC_STATE: the dirty bits that trigger a
+ * re-emit and the callback that performs it.  Gen4-5 add _NEW_BUFFERS,
+ * _NEW_DEPTH, BRW_NEW_CC_VP and BRW_NEW_STATS_WM in place of the Gen6+
+ * BRW_NEW_CC_STATE / BRW_NEW_STATE_BASE_ADDRESS bits; the Gen4-5 emit path
+ * reads the draw-buffer count, brw->cc.vp_offset and brw->stats_wm
+ * (NOTE(review): _NEW_DEPTH presumably covers set_depth_stencil_bits --
+ * confirm against that helper).
+ */
 static const struct brw_tracked_state genX(color_calc_state) = {
    .dirty = {
       .mesa = _NEW_COLOR |
-              _NEW_STENCIL,
+              _NEW_STENCIL |
+              (GEN_GEN <= 5 ? _NEW_BUFFERS |
+                              _NEW_DEPTH
+                            : 0),
       .brw = BRW_NEW_BATCH |
              BRW_NEW_BLORP |
-             BRW_NEW_CC_STATE |
-             BRW_NEW_STATE_BASE_ADDRESS,
+             (GEN_GEN <= 5 ? BRW_NEW_CC_VP |
+                             BRW_NEW_STATS_WM
+                           : BRW_NEW_CC_STATE |
+                             BRW_NEW_STATE_BASE_ADDRESS),
    },
    .emit = genX(upload_color_calc_state),
 };
 
-#endif
 
 /* ---------------------------------------------------------------------- */
 
@@ -2969,42 +3256,16 @@ genX(upload_3dstate_so_decl_list)(struct brw_context *brw,
     * command feels strange -- each dword pair contains a SO_DECL per stream.
     */
    for (unsigned i = 0; i < linked_xfb_info->NumOutputs; i++) {
-      int buffer = linked_xfb_info->Outputs[i].OutputBuffer;
-      struct GENX(SO_DECL) decl = {0};
-      int varying = linked_xfb_info->Outputs[i].OutputRegister;
-      const unsigned components = linked_xfb_info->Outputs[i].NumComponents;
-      unsigned component_mask = (1 << components) - 1;
-      unsigned stream_id = linked_xfb_info->Outputs[i].StreamId;
-      unsigned decl_buffer_slot = buffer;
+      const struct gl_transform_feedback_output *output =
+         &linked_xfb_info->Outputs[i];
+      const int buffer = output->OutputBuffer;
+      const int varying = output->OutputRegister;
+      const unsigned stream_id = output->StreamId;
       assert(stream_id < MAX_VERTEX_STREAMS);
 
-      /* gl_PointSize is stored in VARYING_SLOT_PSIZ.w
-       * gl_Layer is stored in VARYING_SLOT_PSIZ.y
-       * gl_ViewportIndex is stored in VARYING_SLOT_PSIZ.z
-       */
-      if (varying == VARYING_SLOT_PSIZ) {
-         assert(components == 1);
-         component_mask <<= 3;
-      } else if (varying == VARYING_SLOT_LAYER) {
-         assert(components == 1);
-         component_mask <<= 1;
-      } else if (varying == VARYING_SLOT_VIEWPORT) {
-         assert(components == 1);
-         component_mask <<= 2;
-      } else {
-         component_mask <<= linked_xfb_info->Outputs[i].ComponentOffset;
-      }
-
       buffer_mask[stream_id] |= 1 << buffer;
 
-      decl.OutputBufferSlot = decl_buffer_slot;
-      if (varying == VARYING_SLOT_LAYER || varying == VARYING_SLOT_VIEWPORT) {
-         decl.RegisterIndex = vue_map->varying_to_slot[VARYING_SLOT_PSIZ];
-      } else {
-         assert(vue_map->varying_to_slot[varying] >= 0);
-         decl.RegisterIndex = vue_map->varying_to_slot[varying];
-      }
-      decl.ComponentMask = component_mask;
+      assert(vue_map->varying_to_slot[varying] >= 0);
 
       /* Mesa doesn't store entries for gl_SkipComponents in the Outputs[]
        * array.  Instead, it simply increments DstOffset for the following
@@ -3016,31 +3277,25 @@ genX(upload_3dstate_so_decl_list)(struct brw_context *brw,
        * program as many size = 4 holes as we can, then a final hole to
        * accommodate the final 1, 2, or 3 remaining.
        */
-      int skip_components =
-         linked_xfb_info->Outputs[i].DstOffset - next_offset[buffer];
+      int skip_components = output->DstOffset - next_offset[buffer];
 
-      next_offset[buffer] += skip_components;
-
-      while (skip_components >= 4) {
-         struct GENX(SO_DECL) *d = &so_decl[stream_id][decls[stream_id]++];
-         d->HoleFlag = 1;
-         d->OutputBufferSlot = decl_buffer_slot;
-         d->ComponentMask = 0xf;
+      while (skip_components > 0) {
+         so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) {
+            .HoleFlag = 1,
+            .OutputBufferSlot = output->OutputBuffer,
+            .ComponentMask = (1 << MIN2(skip_components, 4)) - 1,
+         };
          skip_components -= 4;
       }
 
-      if (skip_components > 0) {
-         struct GENX(SO_DECL) *d = &so_decl[stream_id][decls[stream_id]++];
-         d->HoleFlag = 1;
-         d->OutputBufferSlot = decl_buffer_slot;
-         d->ComponentMask = (1 << skip_components) - 1;
-      }
-
-      assert(linked_xfb_info->Outputs[i].DstOffset == next_offset[buffer]);
-
-      next_offset[buffer] += components;
+      next_offset[buffer] = output->DstOffset + output->NumComponents;
 
-      so_decl[stream_id][decls[stream_id]++] = decl;
+      so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) {
+         .OutputBufferSlot = output->OutputBuffer,
+         .RegisterIndex = vue_map->varying_to_slot[varying],
+         .ComponentMask =
+            ((1 << output->NumComponents) - 1) << output->ComponentOffset,
+      };
 
       if (decls[stream_id] > max_decls)
          max_decls = decls[stream_id];
@@ -3082,7 +3337,7 @@ genX(upload_3dstate_so_buffers)(struct brw_context *brw)
 #else
    struct brw_transform_feedback_object *brw_obj =
       (struct brw_transform_feedback_object *) xfb_obj;
-   uint32_t mocs_wb = brw->gen >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB;
+   uint32_t mocs_wb = GEN_GEN >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB;
 #endif
 
    /* Set up the up to 4 output buffers.  These are the ranges defined in the
@@ -3624,7 +3879,7 @@ genX(upload_cs_state)(struct brw_context *brw)
 
       const uint32_t subslices = MAX2(brw->screen->subslice_total, 1);
       vfe.MaximumNumberofThreads = devinfo->max_cs_threads * subslices - 1;
-      vfe.NumberofURBEntries = GEN_GEN >= 8 ? 2 : 0;;
+      vfe.NumberofURBEntries = GEN_GEN >= 8 ? 2 : 0;
       vfe.ResetGatewayTimer =
          Resettingrelativetimerandlatchingtheglobaltimestamp;
 #if GEN_GEN < 9
@@ -3702,7 +3957,6 @@ static const struct brw_tracked_state genX(cs_state) = {
       .brw = BRW_NEW_BATCH |
              BRW_NEW_BLORP |
              BRW_NEW_CS_PROG_DATA |
-             BRW_NEW_PUSH_CONSTANT_ALLOCATION |
              BRW_NEW_SAMPLER_STATE_TABLE |
              BRW_NEW_SURFACES,
    },
@@ -3729,7 +3983,7 @@ genX(upload_raster)(struct brw_context *brw)
    struct gl_point_attrib *point = &ctx->Point;
 
    brw_batch_emit(brw, GENX(3DSTATE_RASTER), raster) {
-      if (polygon->_FrontBit == render_to_fbo)
+      if (brw->polygon_front_bit == render_to_fbo)
          raster.FrontWinding = CounterClockwise;
 
       if (polygon->CullFlag) {
@@ -3941,11 +4195,15 @@ genX(upload_ps_blend)(struct brw_context *brw)
       /* BRW_NEW_FRAGMENT_PROGRAM | _NEW_BUFFERS | _NEW_COLOR */
       pb.HasWriteableRT = brw_color_buffer_write_enabled(brw);
 
+      bool alpha_to_one = false;
+
       if (!buffer0_is_integer) {
          /* _NEW_MULTISAMPLE */
-         pb.AlphaToCoverageEnable =
-            _mesa_is_multisample_enabled(ctx) &&
-            ctx->Multisample.SampleAlphaToCoverage;
+
+         if (_mesa_is_multisample_enabled(ctx)) {
+            pb.AlphaToCoverageEnable = ctx->Multisample.SampleAlphaToCoverage;
+            alpha_to_one = ctx->Multisample.SampleAlphaToOne;
+         }
 
          pb.AlphaTestEnable = color->AlphaEnabled;
       }
@@ -3990,6 +4248,16 @@ genX(upload_ps_blend)(struct brw_context *brw)
             dstA = brw_fix_xRGB_alpha(dstA);
          }
 
+         /* Alpha to One doesn't work with Dual Color Blending.  Override
+          * SRC1_ALPHA to ONE and ONE_MINUS_SRC1_ALPHA to ZERO.
+          */
+         if (alpha_to_one && color->Blend[0]._UsesDualSrc) {
+            srcRGB = fix_dual_blend_alpha_to_one(srcRGB);
+            srcA = fix_dual_blend_alpha_to_one(srcA);
+            dstRGB = fix_dual_blend_alpha_to_one(dstRGB);
+            dstA = fix_dual_blend_alpha_to_one(dstA);
+         }
+
          pb.ColorBufferBlendEnable = true;
          pb.SourceAlphaBlendFactor = brw_translate_blend_factor(srcA);
          pb.DestinationAlphaBlendFactor = brw_translate_blend_factor(dstA);
@@ -4013,36 +4281,721 @@ static const struct brw_tracked_state genX(ps_blend) = {
    },
    .emit = genX(upload_ps_blend)
 };
+#endif
+
+/* ---------------------------------------------------------------------- */
+
+#if GEN_GEN >= 8
+/**
+ * Emit a 3DSTATE_VF_TOPOLOGY packet whose PrimitiveTopologyType is taken
+ * from brw->primitive.
+ */
+static void
+genX(emit_vf_topology)(struct brw_context *brw)
+{
+   brw_batch_emit(brw, GENX(3DSTATE_VF_TOPOLOGY), vftopo) {
+      vftopo.PrimitiveTopologyType = brw->primitive;
+   }
+}
+
+/* Tracked-state atom for 3DSTATE_VF_TOPOLOGY: no Mesa dirty bits, re-emitted
+ * through genX(emit_vf_topology) on BRW_NEW_BLORP or BRW_NEW_PRIMITIVE.
+ */
+static const struct brw_tracked_state genX(vf_topology) = {
+   .dirty = {
+      .mesa = 0,
+      .brw = BRW_NEW_BLORP |
+             BRW_NEW_PRIMITIVE,
+   },
+   .emit = genX(emit_vf_topology),
+};
+#endif
 
+/* ---------------------------------------------------------------------- */
+
+#if GEN_GEN >= 7
+/**
+ * Emit an MI_REPORT_PERF_COUNT command.
+ *
+ * \param brw              context whose batch receives the command
+ * \param bo               buffer object the packet's MemoryAddress refers to
+ * \param offset_in_bytes  offset within \p bo used for MemoryAddress
+ * \param report_id        value stored in the packet's ReportID field
+ */
+static void
+genX(emit_mi_report_perf_count)(struct brw_context *brw,
+                                struct brw_bo *bo,
+                                uint32_t offset_in_bytes,
+                                uint32_t report_id)
+{
+   brw_batch_emit(brw, GENX(MI_REPORT_PERF_COUNT), mi_rpc) {
+      mi_rpc.MemoryAddress = instruction_bo(bo, offset_in_bytes);
+      mi_rpc.ReportID = report_id;
+   }
+}
 #endif
 
 /* ---------------------------------------------------------------------- */
 
+/**
+ * Emit a 3DSTATE_SAMPLER_STATE_POINTERS_{VS,HS,GS,DS,PS} packet.
+ *
+ * The VS packet layout is used as the template for every stage: the
+ * per-stage command is selected by overwriting _3DCommandSubOpcode with
+ * the packet_headers[] entry indexed by stage_state->stage, and the
+ * sampler state pointer is taken from stage_state->sampler_offset.
+ *
+ * The body is compiled only for GEN_GEN >= 7; on earlier generations this
+ * function does nothing.
+ */
+static void
+genX(emit_sampler_state_pointers_xs)(struct brw_context *brw,
+                                     struct brw_stage_state *stage_state)
+{
+#if GEN_GEN >= 7
+   static const uint16_t packet_headers[] = {
+      [MESA_SHADER_VERTEX] = 43,
+      [MESA_SHADER_TESS_CTRL] = 44,
+      [MESA_SHADER_TESS_EVAL] = 45,
+      [MESA_SHADER_GEOMETRY] = 46,
+      [MESA_SHADER_FRAGMENT] = 47,
+   };
+
+   /* Ivybridge requires a workaround flush before VS packets. */
+   if (GEN_GEN == 7 && !GEN_IS_HASWELL &&
+       stage_state->stage == MESA_SHADER_VERTEX) {
+      gen7_emit_vs_workaround_flush(brw);
+   }
+
+   brw_batch_emit(brw, GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ptr) {
+      ptr._3DCommandSubOpcode = packet_headers[stage_state->stage];
+      ptr.PointertoVSSamplerState = stage_state->sampler_offset;
+   }
+#endif
+}
+
+/**
+ * Report whether channel \p i exists in \p format.
+ *
+ * Color formats are answered by _mesa_format_has_color_component(); any
+ * other format is treated as having exactly one component, at index 0.
+ */
+UNUSED static bool
+has_component(mesa_format format, int i)
+{
+   const bool is_color = _mesa_is_format_color_format(format);
+
+   /* Depth and stencil carry a single component, so only index 0 exists. */
+   return is_color ? _mesa_format_has_color_component(format, i)
+                   : (i == 0);
+}
+
+/**
+ * Upload SAMPLER_BORDER_COLOR_STATE.
+ *
+ * Builds the border color of \p sampler, replicated/overridden per channel
+ * according to \p base_format, fills in the SAMPLER_BORDER_COLOR_STATE
+ * fields that exist on the generation this file is being compiled for, and
+ * packs the result into newly allocated batch state space.
+ *
+ * \param brw                  context; supplies the batch state space
+ * \param sampler              sampler object whose BorderColor is read
+ * \param format               texture format; only consulted on Haswell when
+ *                             sampling an integer format or stencil
+ * \param base_format          GL base format selecting the channel layout
+ * \param is_integer_format    true when the texture has an integer format
+ * \param is_stencil_sampling  true when stencil is being sampled
+ * \param sdc_offset           [out] offset of the uploaded state, as
+ *                             reported by brw_state_batch()
+ */
+static void
+genX(upload_default_color)(struct brw_context *brw,
+                           const struct gl_sampler_object *sampler,
+                           mesa_format format, GLenum base_format,
+                           bool is_integer_format, bool is_stencil_sampling,
+                           uint32_t *sdc_offset)
+{
+   union gl_color_union color;
+
+   switch (base_format) {
+   case GL_DEPTH_COMPONENT:
+      /* The GL spec says that the border color for depth textures is taken
+       * from the R channel, while the hardware uses A.  Spam R into all the
+       * channels for safety.
+       */
+      color.ui[0] = sampler->BorderColor.ui[0];
+      color.ui[1] = sampler->BorderColor.ui[0];
+      color.ui[2] = sampler->BorderColor.ui[0];
+      color.ui[3] = sampler->BorderColor.ui[0];
+      break;
+   case GL_ALPHA:
+      color.ui[0] = 0u;
+      color.ui[1] = 0u;
+      color.ui[2] = 0u;
+      color.ui[3] = sampler->BorderColor.ui[3];
+      break;
+   case GL_INTENSITY:
+      color.ui[0] = sampler->BorderColor.ui[0];
+      color.ui[1] = sampler->BorderColor.ui[0];
+      color.ui[2] = sampler->BorderColor.ui[0];
+      color.ui[3] = sampler->BorderColor.ui[0];
+      break;
+   case GL_LUMINANCE:
+      color.ui[0] = sampler->BorderColor.ui[0];
+      color.ui[1] = sampler->BorderColor.ui[0];
+      color.ui[2] = sampler->BorderColor.ui[0];
+      color.ui[3] = float_as_int(1.0);
+      break;
+   case GL_LUMINANCE_ALPHA:
+      color.ui[0] = sampler->BorderColor.ui[0];
+      color.ui[1] = sampler->BorderColor.ui[0];
+      color.ui[2] = sampler->BorderColor.ui[0];
+      color.ui[3] = sampler->BorderColor.ui[3];
+      break;
+   default:
+      color.ui[0] = sampler->BorderColor.ui[0];
+      color.ui[1] = sampler->BorderColor.ui[1];
+      color.ui[2] = sampler->BorderColor.ui[2];
+      color.ui[3] = sampler->BorderColor.ui[3];
+      break;
+   }
+
+   /* In some cases we use an RGBA surface format for GL RGB textures,
+    * where we've initialized the A channel to 1.0.  We also have to set
+    * the border color alpha to 1.0 in that case.
+    */
+   if (base_format == GL_RGB)
+      color.ui[3] = float_as_int(1.0);
+
+   /* GEN_GEN and GEN_IS_HASWELL are compile-time constants in this file
+    * (they also drive the #if ladder below), so pick the alignment from
+    * them rather than from runtime fields of the context.
+    */
+   int alignment = 32;
+   if (GEN_GEN >= 8) {
+      alignment = 64;
+   } else if (GEN_IS_HASWELL && (is_integer_format || is_stencil_sampling)) {
+      alignment = 512;
+   }
+
+   uint32_t *sdc = brw_state_batch(
+      brw, GENX(SAMPLER_BORDER_COLOR_STATE_length) * sizeof(uint32_t),
+      alignment, sdc_offset);
+
+   struct GENX(SAMPLER_BORDER_COLOR_STATE) state = { 0 };
+
+   /* Helper macros local to this function; all are #undef'd before the
+    * state is packed.
+    */
+#define ASSIGN(dst, src) \
+   do {                  \
+      dst = src;         \
+   } while (0)
+
+#define ASSIGNu16(dst, src)  \
+   do {                      \
+      dst = (uint16_t)(src); \
+   } while (0)
+
+#define ASSIGNu8(dst, src)  \
+   do {                     \
+      dst = (uint8_t)(src); \
+   } while (0)
+
+   /* Applies `macro` to the Red/Green/Blue/Alpha fields of the given color
+    * type, reading src[0..3].  Expands to four statements.
+    */
+#define BORDER_COLOR_ATTR(macro, _color_type, src)              \
+   macro(state.BorderColor ## _color_type ## Red, src[0]);   \
+   macro(state.BorderColor ## _color_type ## Green, src[1]);   \
+   macro(state.BorderColor ## _color_type ## Blue, src[2]);   \
+   macro(state.BorderColor ## _color_type ## Alpha, src[3]);
+
+#if GEN_GEN >= 8
+   /* On Broadwell, the border color is represented as four 32-bit floats,
+    * integers, or unsigned values, interpreted according to the surface
+    * format.  This matches the sampler->BorderColor union exactly, so the
+    * four values are copied across unchanged.
+    */
+   BORDER_COLOR_ATTR(ASSIGN, 32bit, color.ui);
+#elif GEN_IS_HASWELL
+   if (is_integer_format || is_stencil_sampling) {
+      bool stencil = format == MESA_FORMAT_S_UINT8 || is_stencil_sampling;
+      const int bits_per_channel =
+         _mesa_get_format_bits(format, stencil ? GL_STENCIL_BITS : GL_RED_BITS);
+
+      /* From the Haswell PRM, "Command Reference: Structures", Page 36:
+       * "If any color channel is missing from the surface format,
+       *  corresponding border color should be programmed as zero and if
+       *  alpha channel is missing, corresponding Alpha border color should
+       *  be programmed as 1."
+       */
+      unsigned c[4] = { 0, 0, 0, 1 };
+      for (int i = 0; i < 4; i++) {
+         if (has_component(format, i))
+            c[i] = color.ui[i];
+      }
+
+      switch (bits_per_channel) {
+      case 8:
+         /* Copy RGBA in order. */
+         BORDER_COLOR_ATTR(ASSIGNu8, 8bit, c);
+         break;
+      case 10:
+         /* R10G10B10A2_UINT is treated like a 16-bit format. */
+      case 16:
+         BORDER_COLOR_ATTR(ASSIGNu16, 16bit, c);
+         break;
+      case 32:
+         if (base_format == GL_RG) {
+            /* Careful inspection of the tables reveals that for RG32 formats,
+             * the green channel needs to go where blue normally belongs.
+             */
+            state.BorderColor32bitRed = c[0];
+            state.BorderColor32bitBlue = c[1];
+            state.BorderColor32bitAlpha = 1;
+         } else {
+            /* Copy RGBA in order. */
+            BORDER_COLOR_ATTR(ASSIGN, 32bit, c);
+         }
+         break;
+      default:
+         assert(!"Invalid number of bits per channel in integer format.");
+         break;
+      }
+   } else {
+      BORDER_COLOR_ATTR(ASSIGN, Float, color.f);
+   }
+#elif GEN_GEN == 5 || GEN_GEN == 6
+   /* These generations carry the border color in several encodings at
+    * once; fill in every one of them from the float color.
+    */
+   BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_UBYTE, Unorm, color.f);
+   BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_USHORT, Unorm16, color.f);
+   BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_SHORT, Snorm16, color.f);
+
+#define MESA_FLOAT_TO_HALF(dst, src) \
+   dst = _mesa_float_to_half(src);
+
+   BORDER_COLOR_ATTR(MESA_FLOAT_TO_HALF, Float16, color.f);
+
+#undef MESA_FLOAT_TO_HALF
+
+   /* The 8-bit SNORM values are derived from the top byte of the 16-bit
+    * SNORM values computed above.
+    */
+   state.BorderColorSnorm8Red   = state.BorderColorSnorm16Red >> 8;
+   state.BorderColorSnorm8Green = state.BorderColorSnorm16Green >> 8;
+   state.BorderColorSnorm8Blue  = state.BorderColorSnorm16Blue >> 8;
+   state.BorderColorSnorm8Alpha = state.BorderColorSnorm16Alpha >> 8;
+
+   BORDER_COLOR_ATTR(ASSIGN, Float, color.f);
+#elif GEN_GEN == 4
+   BORDER_COLOR_ATTR(ASSIGN, , color.f);
+#else
+   BORDER_COLOR_ATTR(ASSIGN, Float, color.f);
+#endif
+
+#undef ASSIGN
+#undef ASSIGNu16
+#undef ASSIGNu8
+#undef BORDER_COLOR_ATTR
+
+   GENX(SAMPLER_BORDER_COLOR_STATE_pack)(brw, sdc, &state);
+}
+
+/**
+ * Translate a GL texture wrap mode into the hardware TCM_* encoding.
+ *
+ * \param brw            context (not consulted by the translation)
+ * \param wrap           GL wrap mode enum
+ * \param using_nearest  true when nearest filtering is in use; only
+ *                       affects GL_CLAMP before Gen8
+ * \return the TCM_* value; GL_REPEAT and unrecognized modes give TCM_WRAP
+ */
+static uint32_t
+translate_wrap_mode(struct brw_context *brw, GLenum wrap, bool using_nearest)
+{
+   uint32_t hw_mode = TCM_WRAP;
+
+   switch (wrap) {
+   case GL_CLAMP:
+#if GEN_GEN >= 8
+      /* GL_CLAMP is the weird mode where coordinates are clamped to
+       * [0.0, 1.0], so linear filtering of coordinates outside of
+       * [0.0, 1.0] give you half edge texel value and half border
+       * color.
+       *
+       * Gen8+ supports this natively.
+       */
+      hw_mode = TCM_HALF_BORDER;
+#else
+      /* On Gen4-7.5, we clamp the coordinates in the fragment shader
+       * and set clamp_border here, which gets the result desired.
+       * We just use clamp(_to_edge) for nearest, because for nearest
+       * clamping to 1.0 gives border color instead of the desired
+       * edge texels.
+       */
+      hw_mode = using_nearest ? TCM_CLAMP : TCM_CLAMP_BORDER;
+#endif
+      break;
+   case GL_CLAMP_TO_EDGE:
+      hw_mode = TCM_CLAMP;
+      break;
+   case GL_CLAMP_TO_BORDER:
+      hw_mode = TCM_CLAMP_BORDER;
+      break;
+   case GL_MIRRORED_REPEAT:
+      hw_mode = TCM_MIRROR;
+      break;
+   case GL_MIRROR_CLAMP_TO_EDGE:
+      hw_mode = TCM_MIRROR_ONCE;
+      break;
+   case GL_REPEAT:
+   default:
+      /* Keep the TCM_WRAP initial value. */
+      break;
+   }
+
+   return hw_mode;
+}
+
+/**
+ * Tell whether a hardware wrap mode samples the border color.
+ *
+ * TCM_CLAMP_BORDER always does; on Gen8+ TCM_HALF_BORDER does as well.
+ */
+static bool
+wrap_mode_needs_border_color(unsigned wrap_mode)
+{
+   if (wrap_mode == TCM_CLAMP_BORDER)
+      return true;
+
+#if GEN_GEN >= 8
+   if (wrap_mode == TCM_HALF_BORDER)
+      return true;
+#endif
+
+   return false;
+}
+
+/**
+ * Sets the sampler state for a single unit based off of the sampler key
+ * entry.
+ */
+static void
+genX(update_sampler_state)(struct brw_context *brw,
+                           GLenum target, bool tex_cube_map_seamless,
+                           GLfloat tex_unit_lod_bias,
+                           mesa_format format, GLenum base_format,
+                           const struct gl_texture_object *texObj,
+                           const struct gl_sampler_object *sampler,
+                           uint32_t *sampler_state,
+                           uint32_t batch_offset_for_sampler_state)
+{
+   struct GENX(SAMPLER_STATE) samp_st = { 0 };
+
+   /* Select min and mip filters. */
+   switch (sampler->MinFilter) {
+   case GL_NEAREST:
+      samp_st.MinModeFilter = MAPFILTER_NEAREST;
+      samp_st.MipModeFilter = MIPFILTER_NONE;
+      break;
+   case GL_LINEAR:
+      samp_st.MinModeFilter = MAPFILTER_LINEAR;
+      samp_st.MipModeFilter = MIPFILTER_NONE;
+      break;
+   case GL_NEAREST_MIPMAP_NEAREST:
+      samp_st.MinModeFilter = MAPFILTER_NEAREST;
+      samp_st.MipModeFilter = MIPFILTER_NEAREST;
+      break;
+   case GL_LINEAR_MIPMAP_NEAREST:
+      samp_st.MinModeFilter = MAPFILTER_LINEAR;
+      samp_st.MipModeFilter = MIPFILTER_NEAREST;
+      break;
+   case GL_NEAREST_MIPMAP_LINEAR:
+      samp_st.MinModeFilter = MAPFILTER_NEAREST;
+      samp_st.MipModeFilter = MIPFILTER_LINEAR;
+      break;
+   case GL_LINEAR_MIPMAP_LINEAR:
+      samp_st.MinModeFilter = MAPFILTER_LINEAR;
+      samp_st.MipModeFilter = MIPFILTER_LINEAR;
+      break;
+   default:
+      unreachable("not reached");
+   }
+
+   /* Select mag filter. */
+   samp_st.MagModeFilter = sampler->MagFilter == GL_LINEAR ?
+      MAPFILTER_LINEAR : MAPFILTER_NEAREST;
+
+   /* Enable anisotropic filtering if desired. */
+   samp_st.MaximumAnisotropy = RATIO21;
+
+   if (sampler->MaxAnisotropy > 1.0f) {
+      if (samp_st.MinModeFilter == MAPFILTER_LINEAR)
+         samp_st.MinModeFilter = MAPFILTER_ANISOTROPIC;
+      if (samp_st.MagModeFilter == MAPFILTER_LINEAR)
+         samp_st.MagModeFilter = MAPFILTER_ANISOTROPIC;
+
+      if (sampler->MaxAnisotropy > 2.0f) {
+         samp_st.MaximumAnisotropy =
+            MIN2((sampler->MaxAnisotropy - 2) / 2, RATIO161);
+      }
+   }
+
+   /* Set address rounding bits if not using nearest filtering. */
+   if (samp_st.MinModeFilter != MAPFILTER_NEAREST) {
+      samp_st.UAddressMinFilterRoundingEnable = true;
+      samp_st.VAddressMinFilterRoundingEnable = true;
+      samp_st.RAddressMinFilterRoundingEnable = true;
+   }
+
+   if (samp_st.MagModeFilter != MAPFILTER_NEAREST) {
+      samp_st.UAddressMagFilterRoundingEnable = true;
+      samp_st.VAddressMagFilterRoundingEnable = true;
+      samp_st.RAddressMagFilterRoundingEnable = true;
+   }
+
+   bool either_nearest =
+      sampler->MinFilter == GL_NEAREST || sampler->MagFilter == GL_NEAREST;
+   unsigned wrap_s = translate_wrap_mode(brw, sampler->WrapS, either_nearest);
+   unsigned wrap_t = translate_wrap_mode(brw, sampler->WrapT, either_nearest);
+   unsigned wrap_r = translate_wrap_mode(brw, sampler->WrapR, either_nearest);
+
+   if (target == GL_TEXTURE_CUBE_MAP ||
+       target == GL_TEXTURE_CUBE_MAP_ARRAY) {
+      /* Cube maps must use the same wrap mode for all three coordinate
+       * dimensions.  Prior to Haswell, only CUBE and CLAMP are valid.
+       *
+       * Ivybridge and Baytrail seem to have problems with CUBE mode and
+       * integer formats.  Fall back to CLAMP for now.
+       */
+      if ((tex_cube_map_seamless || sampler->CubeMapSeamless) &&
+          !(GEN_GEN == 7 && !GEN_IS_HASWELL && texObj->_IsIntegerFormat)) {
+         wrap_s = TCM_CUBE;
+         wrap_t = TCM_CUBE;
+         wrap_r = TCM_CUBE;
+      } else {
+         wrap_s = TCM_CLAMP;
+         wrap_t = TCM_CLAMP;
+         wrap_r = TCM_CLAMP;
+      }
+   } else if (target == GL_TEXTURE_1D) {
+      /* There's a bug in 1D texture sampling - it actually pays
+       * attention to the wrap_t value, though it should not.
+       * Override the wrap_t value here to GL_REPEAT to keep
+       * any nonexistent border pixels from floating in.
+       */
+      wrap_t = TCM_WRAP;
+   }
+
+   samp_st.TCXAddressControlMode = wrap_s;
+   samp_st.TCYAddressControlMode = wrap_t;
+   samp_st.TCZAddressControlMode = wrap_r;
+
+   samp_st.ShadowFunction =
+      sampler->CompareMode == GL_COMPARE_R_TO_TEXTURE_ARB ?
+      intel_translate_shadow_compare_func(sampler->CompareFunc) : 0;
+
+#if GEN_GEN >= 7
+   /* Select the anisotropic filtering algorithm. */
+   samp_st.AnisotropicAlgorithm =
+      samp_st.MinModeFilter == MAPFILTER_ANISOTROPIC ?
+      EWAApproximation : LEGACY;
+#endif
+
+#if GEN_GEN >= 6
+   samp_st.NonnormalizedCoordinateEnable = target == GL_TEXTURE_RECTANGLE;
+#endif
+
+   const float hw_max_lod = GEN_GEN >= 7 ? 14 : 13;
+   samp_st.MinLOD = CLAMP(sampler->MinLod, 0, hw_max_lod);
+   samp_st.MaxLOD = CLAMP(sampler->MaxLod, 0, hw_max_lod);
+   samp_st.TextureLODBias =
+      CLAMP(tex_unit_lod_bias + sampler->LodBias, -16, 15);
+
 #if GEN_GEN == 6
+   samp_st.BaseMipLevel =
+      CLAMP(texObj->MinLevel + texObj->BaseLevel, 0, hw_max_lod);
+   samp_st.MinandMagStateNotEqual =
+      samp_st.MinModeFilter != samp_st.MagModeFilter;
+#endif
+
+   /* Upload the border color if necessary.  If not, just point it at
+    * offset 0 (the start of the batch) - the color should be ignored,
+    * but that address won't fault in case something reads it anyway.
+    */
+   uint32_t border_color_offset = 0;
+   if (wrap_mode_needs_border_color(wrap_s) ||
+       wrap_mode_needs_border_color(wrap_t) ||
+       wrap_mode_needs_border_color(wrap_r)) {
+      genX(upload_default_color)(brw, sampler, format, base_format,
+                                 texObj->_IsIntegerFormat,
+                                 texObj->StencilSampling,
+                                 &border_color_offset);
+   }
+
+   samp_st.BorderColorPointer = border_color_offset;
+
+   if (GEN_GEN < 6) {
+      samp_st.BorderColorPointer += brw->batch.bo->offset64; /* reloc */
+      brw_emit_reloc(&brw->batch, batch_offset_for_sampler_state + 8,
+                     brw->batch.bo, border_color_offset,
+                     I915_GEM_DOMAIN_SAMPLER, 0);
+   }
+
+#if GEN_GEN >= 8
+   samp_st.LODPreClampMode = CLAMP_MODE_OGL;
+#else
+   samp_st.LODPreClampEnable = true;
+#endif
+
+   GENX(SAMPLER_STATE_pack)(brw, sampler_state, &samp_st);
+}
+
 static void
-genX(upload_viewport_state_pointers)(struct brw_context *brw)
+update_sampler_state(struct brw_context *brw,
+                     int unit,
+                     uint32_t *sampler_state,
+                     uint32_t batch_offset_for_sampler_state)
 {
-   brw_batch_emit(brw, GENX(3DSTATE_VIEWPORT_STATE_POINTERS), vp) {
-      vp.CCViewportStateChange = 1;
-      vp.SFViewportStateChange = 1;
-      vp.CLIPViewportStateChange = 1;
-      vp.PointertoCLIP_VIEWPORT = brw->clip.vp_offset;
-      vp.PointertoSF_VIEWPORT = brw->sf.vp_offset;
-      vp.PointertoCC_VIEWPORT = brw->cc.vp_offset;
+   struct gl_context *ctx = &brw->ctx;
+   const struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
+   const struct gl_texture_object *texObj = texUnit->_Current;
+   const struct gl_sampler_object *sampler = _mesa_get_samplerobj(ctx, unit);
+
+   /* Per-unit wrapper around genX(update_sampler_state): fills in the
+    * SAMPLER_STATE slot at sampler_state for texture unit "unit", using the
+    * unit's current texture object (dereferenced here without a NULL check)
+    * and the sampler object _mesa_get_samplerobj() returns for that unit.
+    * The format and base format passed on are those of the image at
+    * texObj->Image[0][texObj->BaseLevel].
+    *
+    * Buffer textures don't use samplers at all, so return early and leave
+    * the slot unwritten.
+    */
+   if (texObj->Target == GL_TEXTURE_BUFFER)
+      return;
+
+   struct gl_texture_image *firstImage = texObj->Image[0][texObj->BaseLevel];
+   genX(update_sampler_state)(brw, texObj->Target,
+                              ctx->Texture.CubeMapSeamless,
+                              texUnit->LodBias,
+                              firstImage->TexFormat, firstImage->_BaseFormat,
+                              texObj, sampler,
+                              sampler_state, batch_offset_for_sampler_state);
+}
+
+static void
+genX(upload_sampler_state_table)(struct brw_context *brw,
+                                 struct gl_program *prog,
+                                 struct brw_stage_state *stage_state)
+{
+   struct gl_context *ctx = &brw->ctx;
+   uint32_t sampler_count = stage_state->sampler_count;
+
+   GLbitfield SamplersUsed = prog->SamplersUsed;
+
+   if (sampler_count == 0)
+      return;
+
+   /* Build the sampler state table for one shader stage: reserve
+    * stage_state->sampler_count SAMPLER_STATE slots with brw_state_batch(),
+    * which is handed &stage_state->sampler_offset (read back below as the
+    * table's batch offset), and fill in the slots the program uses.
+    * prog must be non-NULL: prog->SamplersUsed is read before the
+    * sampler_count check above.
+    *
+    * SAMPLER_STATE is 4 DWords on all platforms.
+    */
+   const int dwords = GENX(SAMPLER_STATE_length);
+   const int size_in_bytes = dwords * sizeof(uint32_t);
+
+   uint32_t *sampler_state = brw_state_batch(brw,
+                                             sampler_count * size_in_bytes,
+                                             32, &stage_state->sampler_offset);
+   /* NOTE(review): the table is not cleared (the memset below is disabled),
+    * so slots the loop skips -- samplers not in SamplersUsed, or whose
+    * texture unit has no current texture -- are never written by this
+    * function.  Confirm that is intended.
+    *
+    * memset(sampler_state, 0, sampler_count * size_in_bytes);
+    *
+    * The loop below advances the CPU pointer and the batch offset in
+    * lockstep, one slot per iteration, whether or not the slot was filled.
+    */
+
+   uint32_t batch_offset_for_sampler_state = stage_state->sampler_offset;
+
+   for (unsigned s = 0; s < sampler_count; s++) {
+      if (SamplersUsed & (1 << s)) {
+         const unsigned unit = prog->SamplerUnits[s];
+         if (ctx->Texture.Unit[unit]._Current) {
+            update_sampler_state(brw, unit, sampler_state,
+                                 batch_offset_for_sampler_state);
+         }
+      }
+
+      sampler_state += dwords;
+      batch_offset_for_sampler_state += size_in_bytes;
+   }
+
+   if (GEN_GEN >= 7 && stage_state->stage != MESA_SHADER_COMPUTE) {
+      /* Emit a 3DSTATE_SAMPLER_STATE_POINTERS_XS packet. */
+      genX(emit_sampler_state_pointers_xs)(brw, stage_state);
+   } else {
+      /* Gen6 and earlier, and the compute stage on any generation, take
+       * this path.  Flag that the sampler state table pointer has changed;
+       * later atoms will handle it.
+       */
+      brw->ctx.NewDriverState |= BRW_NEW_SAMPLER_STATE_TABLE;
    }
 }
 
-static const struct brw_tracked_state genX(viewport_state) = {
+static void
+genX(upload_fs_samplers)(struct brw_context *brw)
+{
+   /* Emit callback of the fs_samplers atom below: uploads the sampler state
+    * table for the fragment program into the WM stage state (brw->wm.base).
+    * The program pointer is passed on without a NULL check.
+    *
+    * BRW_NEW_FRAGMENT_PROGRAM
+    */
+   struct gl_program *fs = (struct gl_program *) brw->fragment_program;
+   genX(upload_sampler_state_table)(brw, fs, &brw->wm.base);
+}
+
+static const struct brw_tracked_state genX(fs_samplers) = {
    .dirty = {
-      .mesa = 0,
+      .mesa = _NEW_TEXTURE,
       .brw = BRW_NEW_BATCH |
              BRW_NEW_BLORP |
-             BRW_NEW_CC_VP |
-             BRW_NEW_CLIP_VP |
-             BRW_NEW_SF_VP |
-             BRW_NEW_STATE_BASE_ADDRESS,
+             BRW_NEW_FRAGMENT_PROGRAM,
+   },
+   .emit = genX(upload_fs_samplers),
+};
+
+static void
+genX(upload_vs_samplers)(struct brw_context *brw)
+{
+   /* Emit callback of the vs_samplers atom below: uploads the sampler state
+    * table for the vertex program into the VS stage state (brw->vs.base).
+    * The program pointer is passed on without a NULL check.
+    *
+    * BRW_NEW_VERTEX_PROGRAM
+    */
+   struct gl_program *vs = (struct gl_program *) brw->vertex_program;
+   genX(upload_sampler_state_table)(brw, vs, &brw->vs.base);
+}
+
+static const struct brw_tracked_state genX(vs_samplers) = {
+   .dirty = {
+      .mesa = _NEW_TEXTURE,
+      .brw = BRW_NEW_BATCH |
+             BRW_NEW_BLORP |
+             BRW_NEW_VERTEX_PROGRAM,
+   },
+   .emit = genX(upload_vs_samplers),
+};
+
+#if GEN_GEN >= 6
+static void
+genX(upload_gs_samplers)(struct brw_context *brw)
+{
+   /* Emit callback of the gs_samplers atom below (only compiled when
+    * GEN_GEN >= 6): uploads the sampler state table for the geometry
+    * program into brw->gs.base.  Does nothing when brw->geometry_program
+    * is NULL.
+    *
+    * BRW_NEW_GEOMETRY_PROGRAM
+    */
+   struct gl_program *gs = (struct gl_program *) brw->geometry_program;
+   if (!gs)
+      return;
+
+   genX(upload_sampler_state_table)(brw, gs, &brw->gs.base);
+}
+
+
+static const struct brw_tracked_state genX(gs_samplers) = {
+   .dirty = {
+      .mesa = _NEW_TEXTURE,
+      .brw = BRW_NEW_BATCH |
+             BRW_NEW_BLORP |
+             BRW_NEW_GEOMETRY_PROGRAM,
+   },
+   .emit = genX(upload_gs_samplers),
+};
+#endif
+
+#if GEN_GEN >= 7
+static void
+genX(upload_tcs_samplers)(struct brw_context *brw)
+{
+   /* Emit callback of the tcs_samplers atom below (only compiled when
+    * GEN_GEN >= 7): uploads the sampler state table for the tessellation
+    * control program into brw->tcs.base.  Does nothing when
+    * brw->tess_ctrl_program is NULL.
+    *
+    * BRW_NEW_TESS_PROGRAMS
+    */
+   struct gl_program *tcs = (struct gl_program *) brw->tess_ctrl_program;
+   if (!tcs)
+      return;
+
+   genX(upload_sampler_state_table)(brw, tcs, &brw->tcs.base);
+}
+
+static const struct brw_tracked_state genX(tcs_samplers) = {
+   .dirty = {
+      .mesa = _NEW_TEXTURE,
+      .brw = BRW_NEW_BATCH |
+             BRW_NEW_BLORP |
+             BRW_NEW_TESS_PROGRAMS,
+   },
+   .emit = genX(upload_tcs_samplers),
+};
+#endif
+
+#if GEN_GEN >= 7
+static void
+genX(upload_tes_samplers)(struct brw_context *brw)
+{
+   /* Emit callback of the tes_samplers atom below (only compiled when
+    * GEN_GEN >= 7): uploads the sampler state table for the tessellation
+    * evaluation program into brw->tes.base.  Does nothing when
+    * brw->tess_eval_program is NULL.
+    *
+    * BRW_NEW_TESS_PROGRAMS
+    */
+   struct gl_program *tes = (struct gl_program *) brw->tess_eval_program;
+   if (!tes)
+      return;
+
+   genX(upload_sampler_state_table)(brw, tes, &brw->tes.base);
+}
+
+static const struct brw_tracked_state genX(tes_samplers) = {
+   .dirty = {
+      .mesa = _NEW_TEXTURE,
+      .brw = BRW_NEW_BATCH |
+             BRW_NEW_BLORP |
+             BRW_NEW_TESS_PROGRAMS,
    },
-   .emit = genX(upload_viewport_state_pointers),
+   .emit = genX(upload_tes_samplers),
+};
+#endif
+
+#if GEN_GEN >= 7
+static void
+genX(upload_cs_samplers)(struct brw_context *brw)
+{
+   /* Emit callback of the cs_samplers atom below (only compiled when
+    * GEN_GEN >= 7): uploads the sampler state table for the compute program
+    * into brw->cs.base.  Does nothing when brw->compute_program is NULL.
+    * Unlike the other sampler atoms in this file, cs_samplers is declared
+    * without "static".
+    *
+    * BRW_NEW_COMPUTE_PROGRAM
+    */
+   struct gl_program *cs = (struct gl_program *) brw->compute_program;
+   if (!cs)
+      return;
+
+   genX(upload_sampler_state_table)(brw, cs, &brw->cs.base);
+}
+
+const struct brw_tracked_state genX(cs_samplers) = {
+   .dirty = {
+      .mesa = _NEW_TEXTURE,
+      .brw = BRW_NEW_BATCH |
+             BRW_NEW_BLORP |
+             BRW_NEW_COMPUTE_PROGRAM,
+   },
+   .emit = genX(upload_cs_samplers),
+};
+#endif
+
+/* ---------------------------------------------------------------------- */
+
+#if GEN_GEN <= 5
+/* Emit 3DSTATE_CONSTANT_COLOR with the four components of
+ * ctx->Color.BlendColorUnclamped (indices 0..3 feed red, green, blue and
+ * alpha respectively).  Only compiled when GEN_GEN <= 5.  The atom below
+ * lists _NEW_COLOR, BRW_NEW_CONTEXT and BRW_NEW_BLORP as its dirty bits.
+ */
+static void genX(upload_blend_constant_color)(struct brw_context *brw)
+{
+   struct gl_context *ctx = &brw->ctx;
+
+   brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_COLOR), blend_cc) {
+      blend_cc.BlendConstantColorRed = ctx->Color.BlendColorUnclamped[0];
+      blend_cc.BlendConstantColorGreen = ctx->Color.BlendColorUnclamped[1];
+      blend_cc.BlendConstantColorBlue = ctx->Color.BlendColorUnclamped[2];
+      blend_cc.BlendConstantColorAlpha = ctx->Color.BlendColorUnclamped[3];
+   }
+}
+
+static const struct brw_tracked_state genX(blend_constant_color) = {
+   .dirty = {
+      .mesa = _NEW_COLOR,
+      .brw = BRW_NEW_CONTEXT |
+             BRW_NEW_BLORP,
+   },
+   .emit = genX(upload_blend_constant_color)
 };
 #endif
 
@@ -4061,8 +5014,8 @@ genX(init_atoms)(struct brw_context *brw)
       &brw_curbe_offsets,
       &brw_recalculate_urb_fence,
 
-      &brw_cc_vp,
-      &brw_cc_unit,
+      &genX(cc_vp),
+      &genX(color_calc_state),
 
       /* Surface state setup.  Must come before the VS/WM unit.  The binding
        * table upload must be last.
@@ -4075,13 +5028,13 @@ genX(init_atoms)(struct brw_context *brw)
       &brw_vs_binding_table,
       &brw_wm_binding_table,
 
-      &brw_fs_samplers,
-      &brw_vs_samplers,
+      &genX(fs_samplers),
+      &genX(vs_samplers),
 
       /* These set up state for brw_psp_urb_cbs */
       &brw_wm_unit,
-      &brw_sf_vp,
-      &brw_sf_unit,
+      &genX(sf_clip_viewport),
+      &genX(sf_state),
       &genX(vs_state), /* always required, enabled or not */
       &brw_clip_unit,
       &brw_gs_unit,
@@ -4091,7 +5044,7 @@ genX(init_atoms)(struct brw_context *brw)
       &brw_invariant_state,
 
       &brw_binding_table_pointers,
-      &brw_blend_constant_color,
+      &genX(blend_constant_color),
 
       &brw_depthbuffer,
 
@@ -4104,7 +5057,7 @@ genX(init_atoms)(struct brw_context *brw)
 
       &genX(drawing_rect),
       &brw_indices, /* must come before brw_vertices */
-      &brw_index_buffer,
+      &genX(index_buffer),
       &genX(vertices),
 
       &brw_constant_buffer
@@ -4116,8 +5069,7 @@ genX(init_atoms)(struct brw_context *brw)
 
       /* Command packets: */
 
-      &brw_cc_vp,
-      &genX(viewport_state),   /* must do after *_vp stages */
+      &genX(cc_vp),
 
       &gen6_urb,
       &genX(blend_state),              /* must do before cc unit */
@@ -4145,9 +5097,9 @@ genX(init_atoms)(struct brw_context *brw)
       &gen6_gs_binding_table,
       &brw_wm_binding_table,
 
-      &brw_fs_samplers,
-      &brw_vs_samplers,
-      &brw_gs_samplers,
+      &genX(fs_samplers),
+      &genX(vs_samplers),
+      &genX(gs_samplers),
       &gen6_sampler_state,
       &genX(multisample_state),
 
@@ -4171,7 +5123,7 @@ genX(init_atoms)(struct brw_context *brw)
       &genX(drawing_rect),
 
       &brw_indices, /* must come before brw_vertices */
-      &brw_index_buffer,
+      &genX(index_buffer),
       &genX(vertices),
    };
 #elif GEN_GEN == 7
@@ -4179,7 +5131,7 @@ genX(init_atoms)(struct brw_context *brw)
    {
       /* Command packets: */
 
-      &brw_cc_vp,
+      &genX(cc_vp),
       &genX(sf_clip_viewport),
 
       &gen7_l3_state,
@@ -4228,11 +5180,11 @@ genX(init_atoms)(struct brw_context *brw)
       &brw_gs_binding_table,
       &brw_wm_binding_table,
 
-      &brw_fs_samplers,
-      &brw_vs_samplers,
-      &brw_tcs_samplers,
-      &brw_tes_samplers,
-      &brw_gs_samplers,
+      &genX(fs_samplers),
+      &genX(vs_samplers),
+      &genX(tcs_samplers),
+      &genX(tes_samplers),
+      &genX(gs_samplers),
       &genX(multisample_state),
 
       &genX(vs_state),
@@ -4259,7 +5211,7 @@ genX(init_atoms)(struct brw_context *brw)
       &genX(drawing_rect),
 
       &brw_indices, /* must come before brw_vertices */
-      &brw_index_buffer,
+      &genX(index_buffer),
       &genX(vertices),
 
 #if GEN_IS_HASWELL
@@ -4269,7 +5221,7 @@ genX(init_atoms)(struct brw_context *brw)
 #elif GEN_GEN >= 8
    static const struct brw_tracked_state *render_atoms[] =
    {
-      &brw_cc_vp,
+      &genX(cc_vp),
       &genX(sf_clip_viewport),
 
       &gen7_l3_state,
@@ -4317,11 +5269,11 @@ genX(init_atoms)(struct brw_context *brw)
       &brw_gs_binding_table,
       &brw_wm_binding_table,
 
-      &brw_fs_samplers,
-      &brw_vs_samplers,
-      &brw_tcs_samplers,
-      &brw_tes_samplers,
-      &brw_gs_samplers,
+      &genX(fs_samplers),
+      &genX(vs_samplers),
+      &genX(tcs_samplers),
+      &genX(tes_samplers),
+      &genX(gs_samplers),
       &genX(multisample_state),
 
       &genX(vs_state),
@@ -4351,10 +5303,10 @@ genX(init_atoms)(struct brw_context *brw)
 
       &genX(drawing_rect),
 
-      &gen8_vf_topology,
+      &genX(vf_topology),
 
       &brw_indices,
-      &gen8_index_buffer,
+      &genX(index_buffer),
       &genX(vertices),
 
       &genX(cut_index),
@@ -4377,12 +5329,14 @@ genX(init_atoms)(struct brw_context *brw)
       &brw_cs_abo_surfaces,
       &brw_cs_texture_surfaces,
       &brw_cs_work_groups_surface,
-      &brw_cs_samplers,
+      &genX(cs_samplers),
       &genX(cs_state),
    };
 
    STATIC_ASSERT(ARRAY_SIZE(compute_atoms) <= ARRAY_SIZE(brw->compute_atoms));
    brw_copy_pipeline_atoms(brw, BRW_COMPUTE_PIPELINE,
                            compute_atoms, ARRAY_SIZE(compute_atoms));
+
+   brw->vtbl.emit_mi_report_perf_count = genX(emit_mi_report_perf_count);
 #endif
 }