util: use C99 declaration in the for-loop hash_table_foreach() macro

[mesa.git] / src / mesa / drivers / dri / i965 / genX_state_upload.c
diff --git a/src/mesa/drivers/dri/i965/genX_state_upload.c b/src/mesa/drivers/dri/i965/genX_state_upload.c

index 3f8a7265db9b052377535a199607dab27e5b5d6f..740cb0c4d2edc41842aa37866795ff7f78ddab7c 100644 (file)
--- a/src/mesa/drivers/dri/i965/genX_state_upload.c
+++ b/src/mesa/drivers/dri/i965/genX_state_upload.c
@@ -23,7 +23,7 @@
  
  #include <assert.h>
  
-#include "common/gen_device_info.h"
+#include "dev/gen_device_info.h"
  #include "common/gen_sample_positions.h"
  #include "genxml/gen_macros.h"
  
@@ -34,9 +34,6 @@
  #include "main/state.h"
  
  #include "brw_context.h"
-#if GEN_GEN == 6
-#include "brw_defines.h"
-#endif
  #include "brw_draw.h"
  #include "brw_multisample_state.h"
  #include "brw_state.h"
@@ -62,7 +59,7 @@
  UNUSED static void *
  emit_dwords(struct brw_context *brw, unsigned n)
  {
-   intel_batchbuffer_begin(brw, n, RENDER_RING);
+   intel_batchbuffer_begin(brw, n);
     uint32_t *map = brw->batch.map_next;
     brw->batch.map_next += n;
     intel_batchbuffer_advance(brw);
@@ -71,23 +68,10 @@ emit_dwords(struct brw_context *brw, unsigned n)
  
  struct brw_address {
     struct brw_bo *bo;
-   uint32_t read_domains;
-   uint32_t write_domain;
+   unsigned reloc_flags;
     uint32_t offset;
  };
  
-static uint64_t
-emit_reloc(struct brw_context *brw,
-           void *location, struct brw_address address, uint32_t delta)
-{
-   uint32_t offset = (char *) location - (char *) brw->batch.map;
-
-   return brw_emit_reloc(&brw->batch, offset, address.bo,
-                         address.offset + delta,
-                         address.read_domains,
-                         address.write_domain);
-}
-
  #define __gen_address_type struct brw_address
  #define __gen_user_data struct brw_context
  
@@ -95,89 +79,89 @@ static uint64_t
  __gen_combine_address(struct brw_context *brw, void *location,
                        struct brw_address address, uint32_t delta)
  {
+   struct intel_batchbuffer *batch = &brw->batch;
+   uint32_t offset;
+
     if (address.bo == NULL) {
        return address.offset + delta;
     } else {
-      return emit_reloc(brw, location, address, delta);
+      if (GEN_GEN < 6 && brw_ptr_in_state_buffer(batch, location)) {
+         offset = (char *) location - (char *) brw->batch.state.map;
+         return brw_state_reloc(batch, offset, address.bo,
+                                address.offset + delta,
+                                address.reloc_flags);
+      }
+
+      assert(!brw_ptr_in_state_buffer(batch, location));
+
+      offset = (char *) location - (char *) brw->batch.batch.map;
+      return brw_batch_reloc(batch, offset, address.bo,
+                             address.offset + delta,
+                             address.reloc_flags);
     }
  }
  
-static inline struct brw_address
-render_bo(struct brw_bo *bo, uint32_t offset)
+UNUSED static struct brw_address
+rw_bo(struct brw_bo *bo, uint32_t offset)
  {
     return (struct brw_address) {
              .bo = bo,
              .offset = offset,
-            .read_domains = I915_GEM_DOMAIN_RENDER,
-            .write_domain = I915_GEM_DOMAIN_RENDER,
+            .reloc_flags = RELOC_WRITE,
     };
  }
  
-static inline struct brw_address
-render_ro_bo(struct brw_bo *bo, uint32_t offset)
+static struct brw_address
+ro_bo(struct brw_bo *bo, uint32_t offset)
  {
     return (struct brw_address) {
              .bo = bo,
              .offset = offset,
-            .read_domains = I915_GEM_DOMAIN_RENDER,
-            .write_domain = 0,
     };
  }
  
-static inline struct brw_address
-instruction_bo(struct brw_bo *bo, uint32_t offset)
+static struct brw_address
+rw_32_bo(struct brw_bo *bo, uint32_t offset)
  {
     return (struct brw_address) {
              .bo = bo,
              .offset = offset,
-            .read_domains = I915_GEM_DOMAIN_INSTRUCTION,
-            .write_domain = I915_GEM_DOMAIN_INSTRUCTION,
+            .reloc_flags = RELOC_WRITE | RELOC_32BIT,
     };
  }
  
-static inline struct brw_address
-instruction_ro_bo(struct brw_bo *bo, uint32_t offset)
+static struct brw_address
+ro_32_bo(struct brw_bo *bo, uint32_t offset)
  {
     return (struct brw_address) {
              .bo = bo,
              .offset = offset,
-            .read_domains = I915_GEM_DOMAIN_INSTRUCTION,
-            .write_domain = 0,
+            .reloc_flags = RELOC_32BIT,
     };
  }
  
-static inline struct brw_address
-vertex_bo(struct brw_bo *bo, uint32_t offset)
+UNUSED static struct brw_address
+ggtt_bo(struct brw_bo *bo, uint32_t offset)
  {
     return (struct brw_address) {
              .bo = bo,
              .offset = offset,
-            .read_domains = I915_GEM_DOMAIN_VERTEX,
-            .write_domain = 0,
+            .reloc_flags = RELOC_WRITE | RELOC_NEEDS_GGTT,
     };
  }
  
  #if GEN_GEN == 4
-static inline struct brw_address
+static struct brw_address
  KSP(struct brw_context *brw, uint32_t offset)
  {
-   return instruction_bo(brw->cache.bo, offset);
-}
-
-static inline struct brw_address
-KSP_ro(struct brw_context *brw, uint32_t offset)
-{
-   return instruction_ro_bo(brw->cache.bo, offset);
+   return ro_bo(brw->cache.bo, offset);
  }
  #else
-static inline uint32_t
-KSP(struct brw_context *brw, uint32_t offset)
+static uint32_t
+KSP(UNUSED struct brw_context *brw, uint32_t offset)
  {
     return offset;
  }
-
-#define KSP_ro KSP
-
  #endif
  
  #include "genxml/genX_pack.h"
@@ -206,7 +190,7 @@ KSP(struct brw_context *brw, uint32_t offset)
     })
  
  #define brw_state_emit(brw, cmd, align, offset, name)              \
-   for (struct cmd name = { 0, },                                  \
+   for (struct cmd name = {},                                      \
          *_dst = brw_state_batch(brw, _brw_cmd_length(cmd) * 4,     \
                                  align, offset);                    \
          __builtin_expect(_dst != NULL, 1);                         \
@@ -233,7 +217,7 @@ genX(upload_polygon_stipple)(struct brw_context *brw)
         * to a FBO (i.e. any named frame buffer object), we *don't*
         * need to invert - we already match the layout.
         */
-      if (_mesa_is_winsys_fbo(ctx->DrawBuffer)) {
+      if (ctx->DrawBuffer->FlipY) {
           for (unsigned i = 0; i < 32; i++)
              poly.PatternRow[i] = ctx->PolygonStipple[31 - i]; /* invert */
        } else {
@@ -273,7 +257,7 @@ genX(upload_polygon_stipple_offset)(struct brw_context *brw)
         * to a user-created FBO then our native pixel coordinate system
         * works just fine, and there's no window system to worry about.
         */
-      if (_mesa_is_winsys_fbo(ctx->DrawBuffer)) {
+      if (ctx->DrawBuffer->FlipY) {
           poly.PolygonStippleYOffset =
              (32 - (_mesa_geometric_height(ctx->DrawBuffer) & 31)) & 31;
        }
@@ -346,14 +330,22 @@ genX(emit_vertex_buffer_state)(struct brw_context *brw,
                                 unsigned buffer_nr,
                                 struct brw_bo *bo,
                                 unsigned start_offset,
-                               unsigned end_offset,
+                               MAYBE_UNUSED unsigned end_offset,
                                 unsigned stride,
-                               unsigned step_rate)
+                               MAYBE_UNUSED unsigned step_rate)
  {
     struct GENX(VERTEX_BUFFER_STATE) buf_state = {
        .VertexBufferIndex = buffer_nr,
        .BufferPitch = stride,
-      .BufferStartingAddress = vertex_bo(bo, start_offset),
+
+      /* The VF cache designers apparently cut corners, and made the cache
+       * only consider the bottom 32 bits of memory addresses.  If you happen
+       * to have two vertex buffers which get placed exactly 4 GiB apart and
+       * use them in back-to-back draw calls, you can get collisions.  To work
+       * around this problem, we restrict vertex buffers to the low 32 bits of
+       * the address space.
+       */
+      .BufferStartingAddress = ro_32_bo(bo, start_offset),
  #if GEN_GEN >= 8
        .BufferSize = end_offset - start_offset,
  #endif
@@ -366,11 +358,13 @@ genX(emit_vertex_buffer_state)(struct brw_context *brw,
        .BufferAccessType = step_rate ? INSTANCEDATA : VERTEXDATA,
        .InstanceDataStepRate = step_rate,
  #if GEN_GEN >= 5
-      .EndAddress = vertex_bo(bo, end_offset - 1),
+      .EndAddress = ro_bo(bo, end_offset - 1),
  #endif
  #endif
  
-#if GEN_GEN == 10
+#if GEN_GEN == 11
+      .VertexBufferMOCS = ICL_MOCS_WB,
+#elif GEN_GEN == 10
        .VertexBufferMOCS = CNL_MOCS_WB,
  #elif GEN_GEN == 9
        .VertexBufferMOCS = SKL_MOCS_WB,
@@ -400,11 +394,15 @@ is_passthru_format(uint32_t format)
  }
  
  UNUSED static int
-uploads_needed(uint32_t format)
+uploads_needed(uint32_t format,
+               bool is_dual_slot)
  {
     if (!is_passthru_format(format))
        return 1;
  
+   if (is_dual_slot)
+      return 2;
+
     switch (format) {
     case ISL_FORMAT_R64_PASSTHRU:
     case ISL_FORMAT_R64G64_PASSTHRU:
@@ -433,14 +431,22 @@ downsize_format_if_needed(uint32_t format,
     if (!is_passthru_format(format))
        return format;
  
+   /* ISL_FORMAT_R64_PASSTHRU or ISL_FORMAT_R64G64_PASSTHRU with upload == 1
+    * means that we have been forced to do 2 uploads for a size <= 2. This
+    * happens with gen < 8 and dvec3 or dvec4 vertex shader input
+    * variables. In those cases, we return ISL_FORMAT_R32_FLOAT as a way of
+    * flagging that we want this second forced upload filled with zeroes.
+    */
     switch (format) {
     case ISL_FORMAT_R64_PASSTHRU:
-      return ISL_FORMAT_R32G32_FLOAT;
+      return upload == 0 ? ISL_FORMAT_R32G32_FLOAT
+                         : ISL_FORMAT_R32_FLOAT;
     case ISL_FORMAT_R64G64_PASSTHRU:
-      return ISL_FORMAT_R32G32B32A32_FLOAT;
+      return upload == 0 ? ISL_FORMAT_R32G32B32A32_FLOAT
+                         : ISL_FORMAT_R32_FLOAT;
     case ISL_FORMAT_R64G64B64_PASSTHRU:
-      return !upload ? ISL_FORMAT_R32G32B32A32_FLOAT
-                     : ISL_FORMAT_R32G32_FLOAT;
+      return upload == 0 ? ISL_FORMAT_R32G32B32A32_FLOAT
+                         : ISL_FORMAT_R32G32_FLOAT;
     case ISL_FORMAT_R64G64B64A64_PASSTHRU:
        return ISL_FORMAT_R32G32B32A32_FLOAT;
     default:
@@ -456,6 +462,15 @@ static int
  upload_format_size(uint32_t upload_format)
  {
     switch (upload_format) {
+   case ISL_FORMAT_R32_FLOAT:
+
+      /* downsize_format_if_needed() returned this format to flag that we are
+       * performing a second upload which we want to have filled with
+       * zeroes. This happens with gen < 8, a size <= 2, and dvec3 or dvec4
+       * vertex shader input variables.
+       */
+
+      return 0;
     case ISL_FORMAT_R32G32_FLOAT:
        return 2;
     case ISL_FORMAT_R32G32B32A32_FLOAT:
@@ -465,9 +480,69 @@ upload_format_size(uint32_t upload_format)
     }
  }
  
+static UNUSED uint16_t
+pinned_bo_high_bits(struct brw_bo *bo)
+{
+   return (bo->kflags & EXEC_OBJECT_PINNED) ? bo->gtt_offset >> 32ull : 0;
+}
+
+/* The VF cache designers apparently cut corners, and made the cache key's
+ * <VertexBufferIndex, Memory Address> tuple only consider the bottom 32 bits
+ * of the address.  If you happen to have two vertex buffers which get placed
+ * exactly 4 GiB apart and use them in back-to-back draw calls, you can get
+ * collisions.  (These collisions can happen within a single batch.)
+ *
+ * In the soft-pin world, we'd like to assign addresses up front, and never
+ * move buffers.  So, we need to do a VF cache invalidate if the buffer for
+ * a particular VB slot has different [48:32] address bits than the last one.
+ *
+ * In the relocation world, we have no idea what the addresses will be, so
+ * we can't apply this workaround.  Instead, we tell the kernel to move it
+ * to the low 4GB regardless.
+ */
+static void
+vf_invalidate_for_vb_48bit_transitions(struct brw_context *brw)
+{
+#if GEN_GEN >= 8
+   bool need_invalidate = false;
+   unsigned i;
+
+   for (i = 0; i < brw->vb.nr_buffers; i++) {
+      uint16_t high_bits = pinned_bo_high_bits(brw->vb.buffers[i].bo);
+
+      if (high_bits != brw->vb.last_bo_high_bits[i]) {
+         need_invalidate = true;
+         brw->vb.last_bo_high_bits[i] = high_bits;
+      }
+   }
+
+   /* Don't bother with draw parameter buffers - those are generated by
+    * the driver so we can select a consistent memory zone.
+    */
+
+   if (need_invalidate) {
+      brw_emit_pipe_control_flush(brw, PIPE_CONTROL_VF_CACHE_INVALIDATE);
+   }
+#endif
+}
+
+static void
+vf_invalidate_for_ib_48bit_transition(struct brw_context *brw)
+{
+#if GEN_GEN >= 8
+   uint16_t high_bits = pinned_bo_high_bits(brw->ib.bo);
+
+   if (high_bits != brw->ib.last_bo_high_bits) {
+      brw_emit_pipe_control_flush(brw, PIPE_CONTROL_VF_CACHE_INVALIDATE);
+      brw->ib.last_bo_high_bits = high_bits;
+   }
+#endif
+}
+
  static void
  genX(emit_vertices)(struct brw_context *brw)
  {
+   const struct gen_device_info *devinfo = &brw->screen->devinfo;
     uint32_t *dw;
  
     brw_prepare_vertices(brw);
@@ -521,28 +596,22 @@ genX(emit_vertices)(struct brw_context *brw)
     } else {
        brw_batch_emit(brw, GENX(3DSTATE_VF_SGVS), vfs);
     }
+#endif
  
-   /* Normally we don't need an element for the SGVS attribute because the
-    * 3DSTATE_VF_SGVS instruction lets you store the generated attribute in an
-    * element that is past the list in 3DSTATE_VERTEX_ELEMENTS. However if
-    * we're using draw parameters then we need an element for the those
-    * values.  Additionally if there is an edge flag element then the SGVS
-    * can't be inserted past that so we need a dummy element to ensure that
-    * the edge flag is the last one.
-    */
-   const bool needs_sgvs_element = (vs_prog_data->uses_basevertex ||
-                                    vs_prog_data->uses_baseinstance ||
-                                    ((vs_prog_data->uses_instanceid ||
-                                      vs_prog_data->uses_vertexid)
-                                     && uses_edge_flag));
-#else
-   const bool needs_sgvs_element = (vs_prog_data->uses_basevertex ||
-                                    vs_prog_data->uses_baseinstance ||
+   const bool uses_draw_params =
+      vs_prog_data->uses_firstvertex ||
+      vs_prog_data->uses_baseinstance;
+
+   const bool uses_derived_draw_params =
+      vs_prog_data->uses_drawid ||
+      vs_prog_data->uses_is_indexed_draw;
+
+   const bool needs_sgvs_element = (uses_draw_params ||
                                      vs_prog_data->uses_instanceid ||
                                      vs_prog_data->uses_vertexid);
-#endif
+
     unsigned nr_elements =
-      brw->vb.nr_enabled + needs_sgvs_element + vs_prog_data->uses_drawid;
+      brw->vb.nr_enabled + needs_sgvs_element + uses_derived_draw_params;
  
  #if GEN_GEN < 8
      /* If any of the formats of vb.enabled needs more than one upload, we need
@@ -550,9 +619,10 @@ genX(emit_vertices)(struct brw_context *brw)
      */
     for (unsigned i = 0; i < brw->vb.nr_enabled; i++) {
        struct brw_vertex_element *input = brw->vb.enabled[i];
-      uint32_t format = brw_get_vertex_surface_type(brw, input->glarray);
+      const struct gl_array_attributes *glattrib = input->glattrib;
+      uint32_t format = brw_get_vertex_surface_type(brw, glattrib);
  
-      if (uploads_needed(format) > 1)
+      if (uploads_needed(format, input->is_dual_slot) > 1)
           nr_elements++;
     }
  #endif
@@ -580,11 +650,10 @@ genX(emit_vertices)(struct brw_context *brw)
     }
  
     /* Now emit 3DSTATE_VERTEX_BUFFERS and 3DSTATE_VERTEX_ELEMENTS packets. */
-   const bool uses_draw_params =
-      vs_prog_data->uses_basevertex ||
-      vs_prog_data->uses_baseinstance;
     const unsigned nr_buffers = brw->vb.nr_buffers +
-      uses_draw_params + vs_prog_data->uses_drawid;
+      uses_draw_params + uses_derived_draw_params;
+
+   vf_invalidate_for_vb_48bit_transitions(brw);
  
     if (nr_buffers) {
        assert(nr_buffers <= (GEN_GEN >= 6 ? 33 : 17));
@@ -600,7 +669,7 @@ genX(emit_vertices)(struct brw_context *brw)
            * vertex element may poke over the end of the buffer by 2 bytes.
            */
           const unsigned padding =
-            (GEN_GEN <= 7 && !brw->is_baytrail && !brw->is_haswell) * 2;
+            (GEN_GEN <= 7 && !GEN_IS_HASWELL && !devinfo->is_baytrail) * 2;
           const unsigned end = buffer->offset + buffer->size + padding;
           dw = genX(emit_vertex_buffer_state)(brw, dw, i, buffer->bo,
                                               buffer->offset,
@@ -618,11 +687,11 @@ genX(emit_vertices)(struct brw_context *brw)
                                               0 /* step rate */);
        }
  
-      if (vs_prog_data->uses_drawid) {
+      if (uses_derived_draw_params) {
           dw = genX(emit_vertex_buffer_state)(brw, dw, brw->vb.nr_buffers + 1,
-                                             brw->draw.draw_id_bo,
-                                             brw->draw.draw_id_offset,
-                                             brw->draw.draw_id_bo->size,
+                                             brw->draw.derived_draw_params_bo,
+                                             brw->draw.derived_draw_params_offset,
+                                             brw->draw.derived_draw_params_bo->size,
                                               0 /* stride */,
                                               0 /* step rate */);
        }
@@ -643,12 +712,14 @@ genX(emit_vertices)(struct brw_context *brw)
     unsigned i;
     for (i = 0; i < brw->vb.nr_enabled; i++) {
        const struct brw_vertex_element *input = brw->vb.enabled[i];
-      uint32_t format = brw_get_vertex_surface_type(brw, input->glarray);
+      const struct gl_array_attributes *glattrib = input->glattrib;
+      uint32_t format = brw_get_vertex_surface_type(brw, glattrib);
        uint32_t comp0 = VFCOMP_STORE_SRC;
        uint32_t comp1 = VFCOMP_STORE_SRC;
        uint32_t comp2 = VFCOMP_STORE_SRC;
        uint32_t comp3 = VFCOMP_STORE_SRC;
-      const unsigned num_uploads = GEN_GEN < 8 ? uploads_needed(format) : 1;
+      const unsigned num_uploads = GEN_GEN < 8 ?
+         uploads_needed(format, input->is_dual_slot) : 1;
  
  #if GEN_GEN >= 8
        /* From the BDW PRM, Volume 2d, page 588 (VERTEX_ELEMENT_STATE):
@@ -683,17 +754,18 @@ genX(emit_vertices)(struct brw_context *brw)
            * entry. */
           const unsigned offset = input->offset + c * 16;
  
+         const struct gl_array_attributes *glattrib = input->glattrib;
           const int size = (GEN_GEN < 8 && is_passthru_format(format)) ?
-            upload_format_size(upload_format) : input->glarray->Size;
+            upload_format_size(upload_format) : glattrib->Size;
  
           switch (size) {
              case 0: comp0 = VFCOMP_STORE_0;
              case 1: comp1 = VFCOMP_STORE_0;
              case 2: comp2 = VFCOMP_STORE_0;
              case 3:
-               if (GEN_GEN >= 8 && input->glarray->Doubles) {
+               if (GEN_GEN >= 8 && glattrib->Doubles) {
                    comp3 = VFCOMP_STORE_0;
-               } else if (input->glarray->Integer) {
+               } else if (glattrib->Integer) {
                    comp3 = VFCOMP_STORE_1_INT;
                 } else {
                    comp3 = VFCOMP_STORE_1_FP;
@@ -718,7 +790,7 @@ genX(emit_vertices)(struct brw_context *brw)
            *     to be specified as VFCOMP_STORE_0 in order to output a 256-bit
            *     vertex element."
            */
-         if (input->glarray->Doubles && !input->is_dual_slot) {
+         if (glattrib->Doubles && !input->is_dual_slot) {
              /* Store vertex elements which correspond to double and dvec2 vertex
               * shader inputs as 128-bit vertex elements, instead of 256-bits.
               */
@@ -759,8 +831,7 @@ genX(emit_vertices)(struct brw_context *brw)
        };
  
  #if GEN_GEN >= 8
-      if (vs_prog_data->uses_basevertex ||
-          vs_prog_data->uses_baseinstance) {
+      if (uses_draw_params) {
           elem_state.VertexBufferIndex = brw->vb.nr_buffers;
           elem_state.SourceElementFormat = ISL_FORMAT_R32G32_UINT;
           elem_state.Component0Control = VFCOMP_STORE_SRC;
@@ -769,11 +840,10 @@ genX(emit_vertices)(struct brw_context *brw)
  #else
        elem_state.VertexBufferIndex = brw->vb.nr_buffers;
        elem_state.SourceElementFormat = ISL_FORMAT_R32G32_UINT;
-      if (vs_prog_data->uses_basevertex)
+      if (uses_draw_params) {
           elem_state.Component0Control = VFCOMP_STORE_SRC;
-
-      if (vs_prog_data->uses_baseinstance)
           elem_state.Component1Control = VFCOMP_STORE_SRC;
+      }
  
        if (vs_prog_data->uses_vertexid)
           elem_state.Component2Control = VFCOMP_STORE_VID;
@@ -786,13 +856,13 @@ genX(emit_vertices)(struct brw_context *brw)
        dw += GENX(VERTEX_ELEMENT_STATE_length);
     }
  
-   if (vs_prog_data->uses_drawid) {
+   if (uses_derived_draw_params) {
        struct GENX(VERTEX_ELEMENT_STATE) elem_state = {
           .Valid = true,
           .VertexBufferIndex = brw->vb.nr_buffers + 1,
-         .SourceElementFormat = ISL_FORMAT_R32_UINT,
+         .SourceElementFormat = ISL_FORMAT_R32G32_UINT,
           .Component0Control = VFCOMP_STORE_SRC,
-         .Component1Control = VFCOMP_STORE_0,
+         .Component1Control = VFCOMP_STORE_SRC,
           .Component2Control = VFCOMP_STORE_0,
           .Component3Control = VFCOMP_STORE_0,
  #if GEN_GEN < 5
@@ -806,8 +876,8 @@ genX(emit_vertices)(struct brw_context *brw)
  
  #if GEN_GEN >= 6
     if (gen6_edgeflag_input) {
-      const uint32_t format =
-         brw_get_vertex_surface_type(brw, gen6_edgeflag_input->glarray);
+      const struct gl_array_attributes *glattrib = gen6_edgeflag_input->glattrib;
+      const uint32_t format = brw_get_vertex_surface_type(brw, glattrib);
  
        struct GENX(VERTEX_ELEMENT_STATE) elem_state = {
           .Valid = true,
@@ -863,6 +933,7 @@ static const struct brw_tracked_state genX(vertices) = {
        .mesa = _NEW_POLYGON,
        .brw = BRW_NEW_BATCH |
               BRW_NEW_BLORP |
+             BRW_NEW_VERTEX_PROGRAM |
               BRW_NEW_VERTICES |
               BRW_NEW_VS_PROG_DATA,
     },
@@ -877,17 +948,27 @@ genX(emit_index_buffer)(struct brw_context *brw)
     if (index_buffer == NULL)
        return;
  
+   vf_invalidate_for_ib_48bit_transition(brw);
+
     brw_batch_emit(brw, GENX(3DSTATE_INDEX_BUFFER), ib) {
  #if GEN_GEN < 8 && !GEN_IS_HASWELL
        ib.CutIndexEnable = brw->prim_restart.enable_cut_index;
  #endif
        ib.IndexFormat = brw_get_index_type(index_buffer->index_size);
-      ib.BufferStartingAddress = vertex_bo(brw->ib.bo, 0);
+
+      /* The VF cache designers apparently cut corners, and made the cache
+       * only consider the bottom 32 bits of memory addresses.  If you happen
+       * to have two index buffers which get placed exactly 4 GiB apart and
+       * use them in back-to-back draw calls, you can get collisions.  To work
+       * around this problem, we restrict index buffers to the low 32 bits of
+       * the address space.
+       */
+      ib.BufferStartingAddress = ro_32_bo(brw->ib.bo, 0);
  #if GEN_GEN >= 8
        ib.IndexBufferMOCS = GEN_GEN >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB;
        ib.BufferSize = brw->ib.size;
  #else
-      ib.BufferEndingAddress = vertex_bo(brw->ib.bo, brw->ib.size - 1);
+      ib.BufferEndingAddress = ro_bo(brw->ib.bo, brw->ib.size - 1);
  #endif
     }
  }
@@ -1051,6 +1132,9 @@ genX(calculate_attr_overrides)(const struct brw_context *brw,
     /* _NEW_POINT */
     const struct gl_point_attrib *point = &ctx->Point;
  
+   /* BRW_NEW_FRAGMENT_PROGRAM */
+   const struct gl_program *fp = brw->programs[MESA_SHADER_FRAGMENT];
+
     /* BRW_NEW_FS_PROG_DATA */
     const struct brw_wm_prog_data *wm_prog_data =
        brw_wm_prog_data(brw->wm.base.prog_data);
@@ -1058,19 +1142,13 @@ genX(calculate_attr_overrides)(const struct brw_context *brw,
  
     *point_sprite_enables = 0;
  
-   /* BRW_NEW_FRAGMENT_PROGRAM
-    *
-    * If the fragment shader reads VARYING_SLOT_LAYER, then we need to pass in
-    * the full vertex header.  Otherwise, we can program the SF to start
-    * reading at an offset of 1 (2 varying slots) to skip unnecessary data:
-    * - VARYING_SLOT_PSIZ and BRW_VARYING_SLOT_NDC on gen4-5
-    * - VARYING_SLOT_{PSIZ,LAYER} and VARYING_SLOT_POS on gen6+
-    */
-
-   bool fs_needs_vue_header = brw->fragment_program->info.inputs_read &
-      (VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT);
+   int first_slot =
+      brw_compute_first_urb_slot_required(fp->info.inputs_read,
+                                          &brw->vue_map_geom_out);
  
-   *urb_entry_read_offset = fs_needs_vue_header ? 0 : 1;
+   /* Each URB offset packs two varying slots */
+   assert(first_slot % 2 == 0);
+   *urb_entry_read_offset = first_slot / 2;
  
     /* From the Ivybridge PRM, Vol 2 Part 1, 3DSTATE_SBE,
      * description of dw10 Point Sprite Texture Coordinate Enable:
@@ -1156,9 +1234,16 @@ genX(calculate_attr_overrides)(const struct brw_context *brw,
  
  /* ---------------------------------------------------------------------- */
  
-#if GEN_GEN >= 6
-static void
-genX(upload_depth_stencil_state)(struct brw_context *brw)
+#if GEN_GEN >= 8
+typedef struct GENX(3DSTATE_WM_DEPTH_STENCIL) DEPTH_STENCIL_GENXML;
+#elif GEN_GEN >= 6
+typedef struct GENX(DEPTH_STENCIL_STATE)      DEPTH_STENCIL_GENXML;
+#else
+typedef struct GENX(COLOR_CALC_STATE)         DEPTH_STENCIL_GENXML;
+#endif
+
+static inline void
+set_depth_stencil_bits(struct brw_context *brw, DEPTH_STENCIL_GENXML *ds)
  {
     struct gl_context *ctx = &brw->ctx;
  
@@ -1173,66 +1258,76 @@ genX(upload_depth_stencil_state)(struct brw_context *brw)
     struct gl_stencil_attrib *stencil = &ctx->Stencil;
     const int b = stencil->_BackFace;
  
+   if (depth->Test && depth_irb) {
+      ds->DepthTestEnable = true;
+      ds->DepthBufferWriteEnable = brw_depth_writes_enabled(brw);
+      ds->DepthTestFunction = intel_translate_compare_func(depth->Func);
+   }
+
+   if (brw->stencil_enabled) {
+      ds->StencilTestEnable = true;
+      ds->StencilWriteMask = stencil->WriteMask[0] & 0xff;
+      ds->StencilTestMask = stencil->ValueMask[0] & 0xff;
+
+      ds->StencilTestFunction =
+         intel_translate_compare_func(stencil->Function[0]);
+      ds->StencilFailOp =
+         intel_translate_stencil_op(stencil->FailFunc[0]);
+      ds->StencilPassDepthPassOp =
+         intel_translate_stencil_op(stencil->ZPassFunc[0]);
+      ds->StencilPassDepthFailOp =
+         intel_translate_stencil_op(stencil->ZFailFunc[0]);
+
+      ds->StencilBufferWriteEnable = brw->stencil_write_enabled;
+
+      if (brw->stencil_two_sided) {
+         ds->DoubleSidedStencilEnable = true;
+         ds->BackfaceStencilWriteMask = stencil->WriteMask[b] & 0xff;
+         ds->BackfaceStencilTestMask = stencil->ValueMask[b] & 0xff;
+
+         ds->BackfaceStencilTestFunction =
+            intel_translate_compare_func(stencil->Function[b]);
+         ds->BackfaceStencilFailOp =
+            intel_translate_stencil_op(stencil->FailFunc[b]);
+         ds->BackfaceStencilPassDepthPassOp =
+            intel_translate_stencil_op(stencil->ZPassFunc[b]);
+         ds->BackfaceStencilPassDepthFailOp =
+            intel_translate_stencil_op(stencil->ZFailFunc[b]);
+      }
+
+#if GEN_GEN <= 5 || GEN_GEN >= 9
+      ds->StencilReferenceValue = _mesa_get_stencil_ref(ctx, 0);
+      ds->BackfaceStencilReferenceValue = _mesa_get_stencil_ref(ctx, b);
+#endif
+   }
+}
+
+#if GEN_GEN >= 6
+static void
+genX(upload_depth_stencil_state)(struct brw_context *brw)
+{
  #if GEN_GEN >= 8
     brw_batch_emit(brw, GENX(3DSTATE_WM_DEPTH_STENCIL), wmds) {
+      set_depth_stencil_bits(brw, &wmds);
+   }
  #else
     uint32_t ds_offset;
-   brw_state_emit(brw, GENX(DEPTH_STENCIL_STATE), 64, &ds_offset, wmds) {
-#endif
-      if (depth->Test && depth_irb) {
-         wmds.DepthTestEnable = true;
-         wmds.DepthBufferWriteEnable = brw_depth_writes_enabled(brw);
-         wmds.DepthTestFunction = intel_translate_compare_func(depth->Func);
-      }
-
-      if (brw->stencil_enabled) {
-         wmds.StencilTestEnable = true;
-         wmds.StencilWriteMask = stencil->WriteMask[0] & 0xff;
-         wmds.StencilTestMask = stencil->ValueMask[0] & 0xff;
-
-         wmds.StencilTestFunction =
-            intel_translate_compare_func(stencil->Function[0]);
-         wmds.StencilFailOp =
-            intel_translate_stencil_op(stencil->FailFunc[0]);
-         wmds.StencilPassDepthPassOp =
-            intel_translate_stencil_op(stencil->ZPassFunc[0]);
-         wmds.StencilPassDepthFailOp =
-            intel_translate_stencil_op(stencil->ZFailFunc[0]);
-
-         wmds.StencilBufferWriteEnable = brw->stencil_write_enabled;
-
-         if (brw->stencil_two_sided) {
-            wmds.DoubleSidedStencilEnable = true;
-            wmds.BackfaceStencilWriteMask = stencil->WriteMask[b] & 0xff;
-            wmds.BackfaceStencilTestMask = stencil->ValueMask[b] & 0xff;
-
-            wmds.BackfaceStencilTestFunction =
-               intel_translate_compare_func(stencil->Function[b]);
-            wmds.BackfaceStencilFailOp =
-               intel_translate_stencil_op(stencil->FailFunc[b]);
-            wmds.BackfaceStencilPassDepthPassOp =
-               intel_translate_stencil_op(stencil->ZPassFunc[b]);
-            wmds.BackfaceStencilPassDepthFailOp =
-               intel_translate_stencil_op(stencil->ZFailFunc[b]);
-         }
-
-#if GEN_GEN >= 9
-         wmds.StencilReferenceValue = _mesa_get_stencil_ref(ctx, 0);
-         wmds.BackfaceStencilReferenceValue = _mesa_get_stencil_ref(ctx, b);
-#endif
-      }
+   brw_state_emit(brw, GENX(DEPTH_STENCIL_STATE), 64, &ds_offset, ds) {
+      set_depth_stencil_bits(brw, &ds);
     }
  
+   /* Now upload a pointer to the indirect state */
  #if GEN_GEN == 6
     brw_batch_emit(brw, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
        ptr.PointertoDEPTH_STENCIL_STATE = ds_offset;
        ptr.DEPTH_STENCIL_STATEChange = true;
     }
-#elif GEN_GEN == 7
+#else
     brw_batch_emit(brw, GENX(3DSTATE_DEPTH_STENCIL_STATE_POINTERS), ptr) {
        ptr.PointertoDEPTH_STENCIL_STATE = ds_offset;
     }
  #endif
+#endif
  }
  
  static const struct brw_tracked_state genX(depth_stencil_state) = {
@@ -1251,7 +1346,106 @@ static const struct brw_tracked_state genX(depth_stencil_state) = {
  
  /* ---------------------------------------------------------------------- */
  
-#if GEN_GEN >= 6
+#if GEN_GEN <= 5
+
+static void
+genX(upload_clip_state)(struct brw_context *brw)
+{
+   struct gl_context *ctx = &brw->ctx;
+
+   ctx->NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
+   brw_state_emit(brw, GENX(CLIP_STATE), 32, &brw->clip.state_offset, clip) {
+      clip.KernelStartPointer = KSP(brw, brw->clip.prog_offset);
+      clip.GRFRegisterCount =
+         DIV_ROUND_UP(brw->clip.prog_data->total_grf, 16) - 1;
+      clip.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
+      clip.SingleProgramFlow = true;
+      clip.VertexURBEntryReadLength = brw->clip.prog_data->urb_read_length;
+      clip.ConstantURBEntryReadLength = brw->clip.prog_data->curb_read_length;
+
+      /* BRW_NEW_PUSH_CONSTANT_ALLOCATION */
+      clip.ConstantURBEntryReadOffset = brw->curbe.clip_start * 2;
+      clip.DispatchGRFStartRegisterForURBData = 1;
+      clip.VertexURBEntryReadOffset = 0;
+
+      /* BRW_NEW_URB_FENCE */
+      clip.NumberofURBEntries = brw->urb.nr_clip_entries;
+      clip.URBEntryAllocationSize = brw->urb.vsize - 1;
+
+      if (brw->urb.nr_clip_entries >= 10) {
+         /* Half of the URB entries go to each thread, and it has to be an
+          * even number.
+          */
+         assert(brw->urb.nr_clip_entries % 2 == 0);
+
+         /* Although up to 16 concurrent Clip threads are allowed on Ironlake,
+          * only 2 threads can output VUEs at a time.
+          */
+         clip.MaximumNumberofThreads = (GEN_GEN == 5 ? 16 : 2) - 1;
+      } else {
+         assert(brw->urb.nr_clip_entries >= 5);
+         clip.MaximumNumberofThreads = 1 - 1;
+      }
+
+      clip.VertexPositionSpace = VPOS_NDCSPACE;
+      clip.UserClipFlagsMustClipEnable = true;
+      clip.GuardbandClipTestEnable = true;
+
+      clip.ClipperViewportStatePointer =
+         ro_bo(brw->batch.state.bo, brw->clip.vp_offset);
+
+      clip.ScreenSpaceViewportXMin = -1;
+      clip.ScreenSpaceViewportXMax = 1;
+      clip.ScreenSpaceViewportYMin = -1;
+      clip.ScreenSpaceViewportYMax = 1;
+
+      clip.ViewportXYClipTestEnable = true;
+      clip.ViewportZClipTestEnable = !(ctx->Transform.DepthClampNear &&
+                                       ctx->Transform.DepthClampFar);
+
+      /* _NEW_TRANSFORM */
+      if (GEN_GEN == 5 || GEN_IS_G4X) {
+         clip.UserClipDistanceClipTestEnableBitmask =
+            ctx->Transform.ClipPlanesEnabled;
+      } else {
+         /* Up to 6 actual clip flags, plus the 7th for the negative RHW
+          * workaround.
+          */
+         clip.UserClipDistanceClipTestEnableBitmask =
+            (ctx->Transform.ClipPlanesEnabled & 0x3f) | 0x40;
+      }
+
+      if (ctx->Transform.ClipDepthMode == GL_ZERO_TO_ONE)
+         clip.APIMode = APIMODE_D3D;
+      else
+         clip.APIMode = APIMODE_OGL;
+
+      clip.GuardbandClipTestEnable = true;
+
+      clip.ClipMode = brw->clip.prog_data->clip_mode;
+
+#if GEN_IS_G4X
+      clip.NegativeWClipTestEnable = true;
+#endif
+   }
+}
+
+/* Tracked-state entry for the clip unit: pairs genX(upload_clip_state) with
+ * the Mesa (_NEW_*) and driver (BRW_NEW_*) dirty-bit masks listed here.
+ */
+const struct brw_tracked_state genX(clip_state) = {
+   .emit = genX(upload_clip_state),
+   .dirty = {
+      .brw   = BRW_NEW_BATCH | BRW_NEW_BLORP |
+               BRW_NEW_PROGRAM_CACHE | BRW_NEW_CLIP_PROG_DATA |
+               BRW_NEW_PUSH_CONSTANT_ALLOCATION | BRW_NEW_URB_FENCE,
+      .mesa  = _NEW_VIEWPORT | _NEW_TRANSFORM,
+   },
+};
+
+#else
+
  static void
  genX(upload_clip_state)(struct brw_context *brw)
  {
@@ -1276,7 +1470,7 @@ genX(upload_clip_state)(struct brw_context *brw)
  #endif
  
  #if GEN_GEN == 7
-      clip.FrontWinding = brw->polygon_front_bit == _mesa_is_user_fbo(fb);
+      clip.FrontWinding = brw->polygon_front_bit != fb->FlipY;
  
        if (ctx->Polygon.CullFlag) {
           switch (ctx->Polygon.CullFaceMode) {
@@ -1301,7 +1495,8 @@ genX(upload_clip_state)(struct brw_context *brw)
        clip.UserClipDistanceCullTestEnableBitmask =
           brw_vue_prog_data(brw->vs.base.prog_data)->cull_distance_mask;
  
-      clip.ViewportZClipTestEnable = !ctx->Transform.DepthClamp;
+      clip.ViewportZClipTestEnable = !(ctx->Transform.DepthClampNear &&
+                                       ctx->Transform.DepthClampFar);
  #endif
  
        /* _NEW_LIGHT */
@@ -1391,7 +1586,7 @@ genX(upload_sf)(struct brw_context *brw)
  
  #if GEN_GEN <= 7
     /* _NEW_BUFFERS */
-   bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
+   bool flip_y = ctx->DrawBuffer->FlipY;
     UNUSED const bool multisampled_fbo =
        _mesa_geometric_samples(ctx->DrawBuffer) > 1;
  #endif
@@ -1402,7 +1597,7 @@ genX(upload_sf)(struct brw_context *brw)
     ctx->NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
  
     brw_state_emit(brw, GENX(SF_STATE), 64, &brw->sf.state_offset, sf) {
-      sf.KernelStartPointer = KSP_ro(brw, brw->sf.prog_offset);
+      sf.KernelStartPointer = KSP(brw, brw->sf.prog_offset);
        sf.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
        sf.GRFRegisterCount = DIV_ROUND_UP(sf_prog_data->total_grf, 16) - 1;
        sf.DispatchGRFStartRegisterForURBData = 3;
@@ -1416,7 +1611,7 @@ genX(upload_sf)(struct brw_context *brw)
         * domain.
         */
        sf.SetupViewportStateOffset =
-         instruction_ro_bo(brw->batch.bo, brw->sf.vp_offset);
+         ro_bo(brw->batch.state.bo, brw->sf.vp_offset);
  
        sf.PointRasterizationRule = RASTRULE_UPPER_RIGHT;
  
@@ -1443,7 +1638,7 @@ genX(upload_sf)(struct brw_context *brw)
  
  #if GEN_GEN <= 7
        /* _NEW_POLYGON */
-      sf.FrontWinding = brw->polygon_front_bit == render_to_fbo;
+      sf.FrontWinding = brw->polygon_front_bit != flip_y;
  #if GEN_GEN >= 6
        sf.GlobalDepthOffsetEnableSolid = ctx->Polygon.OffsetFill;
        sf.GlobalDepthOffsetEnableWireframe = ctx->Polygon.OffsetLine;
@@ -1513,7 +1708,9 @@ genX(upload_sf)(struct brw_context *brw)
  
        /* _NEW_LINE */
  #if GEN_GEN == 8
-      if (brw->is_cherryview)
+      const struct gen_device_info *devinfo = &brw->screen->devinfo;
+
+      if (devinfo->is_cherryview)
           sf.CHVLineWidth = brw_get_line_width(brw);
        else
           sf.LineWidth = brw_get_line_width(brw);
@@ -1544,6 +1741,16 @@ genX(upload_sf)(struct brw_context *brw)
           sf.SmoothPointEnable = true;
  #endif
  
+#if GEN_GEN == 10
+      /* _NEW_BUFFERS
+       * Smooth Point Enable bit MUST not be set when NUM_MULTISAMPLES > 1.
+       */
+      const bool multisampled_fbo =
+         _mesa_geometric_samples(ctx->DrawBuffer) > 1;
+      if (multisampled_fbo)
+         sf.SmoothPointEnable = false;
+#endif
+
  #if GEN_IS_G4X || GEN_GEN >= 5
        sf.AALineDistanceMode = AALINEDISTANCE_TRUE;
  #endif
@@ -1569,7 +1776,7 @@ genX(upload_sf)(struct brw_context *brw)
         * Window coordinates in an FBO are inverted, which means point
         * sprite origin must be inverted, too.
         */
-      if ((ctx->Point.SpriteOrigin == GL_LOWER_LEFT) != render_to_fbo) {
+      if ((ctx->Point.SpriteOrigin == GL_LOWER_LEFT) == flip_y) {
           sf.PointSpriteTextureCoordinateOrigin = LOWERLEFT;
        } else {
           sf.PointSpriteTextureCoordinateOrigin = UPPERLEFT;
@@ -1599,7 +1806,8 @@ static const struct brw_tracked_state genX(sf_state) = {
                 _NEW_POINT |
                 _NEW_PROGRAM |
                 (GEN_GEN >= 6 ? _NEW_MULTISAMPLE : 0) |
-               (GEN_GEN <= 7 ? _NEW_BUFFERS | _NEW_POLYGON : 0),
+               (GEN_GEN <= 7 ? _NEW_BUFFERS | _NEW_POLYGON : 0) |
+               (GEN_GEN == 10 ? _NEW_BUFFERS : 0),
        .brw   = BRW_NEW_BLORP |
                 BRW_NEW_VUE_MAP_GEOM_OUT |
                 (GEN_GEN <= 5 ? BRW_NEW_BATCH |
@@ -1623,7 +1831,30 @@ static const struct brw_tracked_state genX(sf_state) = {
  
  /* ---------------------------------------------------------------------- */
  
-#if GEN_GEN >= 6
+/**
+ * Tell whether at least one color draw buffer can receive fragment output.
+ *
+ * A draw buffer qualifies when its renderbuffer pointer is non-NULL, the
+ * fragment program's outputs_written mask contains FRAG_RESULT_COLOR or
+ * FRAG_RESULT_DATA0 + index, and GET_COLORMASK() for that index is non-zero.
+ *
+ * \return true as soon as one qualifying buffer is found, false otherwise.
+ */
+static bool
+brw_color_buffer_write_enabled(struct brw_context *brw)
+{
+   struct gl_context *ctx = &brw->ctx;
+   /* BRW_NEW_FRAGMENT_PROGRAM */
+   const struct gl_program *frag_prog = brw->programs[MESA_SHADER_FRAGMENT];
+
+   /* _NEW_BUFFERS */
+   for (unsigned buf = 0; buf < ctx->DrawBuffer->_NumColorDrawBuffers; buf++) {
+      const uint64_t written = frag_prog->info.outputs_written;
+      const struct gl_renderbuffer *target =
+         ctx->DrawBuffer->_ColorDrawBuffers[buf];
+
+      if (!target) {
+         continue;
+      }
+
+      const bool shader_writes_buf =
+         (written & BITFIELD64_BIT(FRAG_RESULT_COLOR)) != 0 ||
+         (written & BITFIELD64_BIT(FRAG_RESULT_DATA0 + buf)) != 0;
+      if (!shader_writes_buf) {
+         continue;
+      }
+
+      /* _NEW_COLOR */
+      if (GET_COLORMASK(ctx->Color.ColorMask, buf)) {
+         return true;
+      }
+   }
+
+   return false;
+}
+
  static void
  genX(upload_wm)(struct brw_context *brw)
  {
@@ -1635,11 +1866,10 @@ genX(upload_wm)(struct brw_context *brw)
  
     UNUSED bool writes_depth =
        wm_prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF;
+   UNUSED struct brw_stage_state *stage_state = &brw->wm.base;
+   UNUSED const struct gen_device_info *devinfo = &brw->screen->devinfo;
  
-#if GEN_GEN < 7
-   const struct brw_stage_state *stage_state = &brw->wm.base;
-   const struct gen_device_info *devinfo = &brw->screen->devinfo;
-
+#if GEN_GEN == 6
     /* We can't fold this into gen6_upload_wm_push_constants(), because
      * according to the SNB PRM, vol 2 part 1 section 7.2.2
      * (3DSTATE_CONSTANT_PS [DevSNB]):
@@ -1652,33 +1882,136 @@ genX(upload_wm)(struct brw_context *brw)
           /* Pointer to the WM constant buffer.  Covered by the set of
            * state flags from gen6_upload_wm_push_constants.
            */
-         wmcp.PointertoPSConstantBuffer0 = stage_state->push_const_offset;
-         wmcp.PSConstantBuffer0ReadLength = stage_state->push_const_size - 1;
+         wmcp.ConstantBody.PointertoConstantBuffer0 = stage_state->push_const_offset;
+         wmcp.ConstantBody.ConstantBuffer0ReadLength = stage_state->push_const_size - 1;
        }
     }
  #endif
  
+#if GEN_GEN >= 6
     brw_batch_emit(brw, GENX(3DSTATE_WM), wm) {
-      wm.StatisticsEnable = true;
+#else
+   ctx->NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
+   brw_state_emit(brw, GENX(WM_STATE), 64, &stage_state->state_offset, wm) {
+#endif
+
+#if GEN_GEN <= 6
+      wm._8PixelDispatchEnable = wm_prog_data->dispatch_8;
+      wm._16PixelDispatchEnable = wm_prog_data->dispatch_16;
+      wm._32PixelDispatchEnable = wm_prog_data->dispatch_32;
+#endif
+
+#if GEN_GEN == 4
+      /* On gen4, we only have one shader kernel */
+      if (brw_wm_state_has_ksp(wm, 0)) {
+         assert(brw_wm_prog_data_prog_offset(wm_prog_data, wm, 0) == 0);
+         wm.KernelStartPointer0 = KSP(brw, stage_state->prog_offset);
+         wm.GRFRegisterCount0 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 0);
+         wm.DispatchGRFStartRegisterForConstantSetupData0 =
+            brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 0);
+      }
+#elif GEN_GEN == 5
+      /* On gen5, we have multiple shader kernels but only one GRF start
+       * register for all kernels
+       */
+      wm.KernelStartPointer0 = stage_state->prog_offset +
+                               brw_wm_prog_data_prog_offset(wm_prog_data, wm, 0);
+      wm.KernelStartPointer1 = stage_state->prog_offset +
+                               brw_wm_prog_data_prog_offset(wm_prog_data, wm, 1);
+      wm.KernelStartPointer2 = stage_state->prog_offset +
+                               brw_wm_prog_data_prog_offset(wm_prog_data, wm, 2);
+
+      wm.GRFRegisterCount0 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 0);
+      wm.GRFRegisterCount1 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 1);
+      wm.GRFRegisterCount2 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 2);
+
+      wm.DispatchGRFStartRegisterForConstantSetupData0 =
+         wm_prog_data->base.dispatch_grf_start_reg;
+
+      /* Dispatch GRF Start should be the same for all shaders on gen5 */
+      if (brw_wm_state_has_ksp(wm, 1)) {
+         assert(wm_prog_data->base.dispatch_grf_start_reg ==
+                brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 1));
+      }
+      if (brw_wm_state_has_ksp(wm, 2)) {
+         assert(wm_prog_data->base.dispatch_grf_start_reg ==
+                brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 2));
+      }
+#elif GEN_GEN == 6
+      /* On gen6, we have multiple shader kernels and we no longer specify a
+       * register count for each one.
+       */
+      wm.KernelStartPointer0 = stage_state->prog_offset +
+                               brw_wm_prog_data_prog_offset(wm_prog_data, wm, 0);
+      wm.KernelStartPointer1 = stage_state->prog_offset +
+                               brw_wm_prog_data_prog_offset(wm_prog_data, wm, 1);
+      wm.KernelStartPointer2 = stage_state->prog_offset +
+                               brw_wm_prog_data_prog_offset(wm_prog_data, wm, 2);
+
+      wm.DispatchGRFStartRegisterForConstantSetupData0 =
+         brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 0);
+      wm.DispatchGRFStartRegisterForConstantSetupData1 =
+         brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 1);
+      wm.DispatchGRFStartRegisterForConstantSetupData2 =
+         brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 2);
+#endif
+
+#if GEN_GEN <= 5
+      wm.ConstantURBEntryReadLength = wm_prog_data->base.curb_read_length;
+      /* BRW_NEW_PUSH_CONSTANT_ALLOCATION */
+      wm.ConstantURBEntryReadOffset = brw->curbe.wm_start * 2;
+      wm.SetupURBEntryReadLength = wm_prog_data->num_varying_inputs * 2;
+      wm.SetupURBEntryReadOffset = 0;
+      wm.EarlyDepthTestEnable = true;
+#endif
+
+#if GEN_GEN >= 6
        wm.LineAntialiasingRegionWidth = _10pixels;
        wm.LineEndCapAntialiasingRegionWidth = _05pixels;
  
+      wm.PointRasterizationRule = RASTRULE_UPPER_RIGHT;
+      wm.BarycentricInterpolationMode = wm_prog_data->barycentric_interp_modes;
+#else
+      if (stage_state->sampler_count)
+         wm.SamplerStatePointer =
+            ro_bo(brw->batch.state.bo, stage_state->sampler_offset);
+
+      wm.LineAntialiasingRegionWidth = _05pixels;
+      wm.LineEndCapAntialiasingRegionWidth = _10pixels;
+
+      /* _NEW_POLYGON */
+      if (ctx->Polygon.OffsetFill) {
+         wm.GlobalDepthOffsetEnable = true;
+         /* Something weird going on with legacy_global_depth_bias,
+          * offset_constant, scaling and MRD.  This value passes glean
+          * but gives some odd results elsewhere (eg. the
+          * quad-offset-units test).
+          */
+         wm.GlobalDepthOffsetConstant = ctx->Polygon.OffsetUnits * 2;
+
+         /* This is the only value that passes glean:
+         */
+         wm.GlobalDepthOffsetScale = ctx->Polygon.OffsetFactor;
+      }
+
+      wm.DepthCoefficientURBReadOffset = 1;
+#endif
+
+      /* BRW_NEW_STATS_WM */
+      wm.StatisticsEnable = GEN_GEN >= 6 || brw->stats_wm;
+
  #if GEN_GEN < 7
        if (wm_prog_data->base.use_alt_mode)
-         wm.FloatingPointMode = Alternate;
+         wm.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
+
+      wm.SamplerCount = GEN_GEN == 5 ?
+         0 : DIV_ROUND_UP(stage_state->sampler_count, 4);
  
-      wm.SamplerCount = DIV_ROUND_UP(stage_state->sampler_count, 4);
-      wm.BindingTableEntryCount = wm_prog_data->base.binding_table.size_bytes / 4;
+      wm.BindingTableEntryCount =
+         wm_prog_data->base.binding_table.size_bytes / 4;
        wm.MaximumNumberofThreads = devinfo->max_wm_threads - 1;
-      wm._8PixelDispatchEnable = wm_prog_data->dispatch_8;
-      wm._16PixelDispatchEnable = wm_prog_data->dispatch_16;
-      wm.DispatchGRFStartRegisterForConstantSetupData0 =
-         wm_prog_data->base.dispatch_grf_start_reg;
-      wm.DispatchGRFStartRegisterForConstantSetupData2 =
-         wm_prog_data->dispatch_grf_start_reg_2;
-      wm.KernelStartPointer0 = stage_state->prog_offset;
-      wm.KernelStartPointer2 = stage_state->prog_offset +
-         wm_prog_data->prog_offset_2;
+
+#if GEN_GEN == 6
        wm.DualSourceBlendEnable =
           wm_prog_data->dual_src_blend && (ctx->Color.BlendEnabled & 1) &&
           ctx->Color.Blend[0]._UsesDualSrc;
@@ -1701,43 +2034,31 @@ genX(upload_wm)(struct brw_context *brw)
           wm.PositionXYOffsetSelect = POSOFFSET_SAMPLE;
        else
           wm.PositionXYOffsetSelect = POSOFFSET_NONE;
+#endif
  
        if (wm_prog_data->base.total_scratch) {
-         wm.ScratchSpaceBasePointer =
-            render_bo(stage_state->scratch_bo,
-                      ffs(stage_state->per_thread_scratch) - 11);
+         wm.ScratchSpaceBasePointer = rw_32_bo(stage_state->scratch_bo, 0);
+         wm.PerThreadScratchSpace =
+            ffs(stage_state->per_thread_scratch) - 11;
        }
  
        wm.PixelShaderComputedDepth = writes_depth;
  #endif
  
-      wm.PointRasterizationRule = RASTRULE_UPPER_RIGHT;
-
        /* _NEW_LINE */
        wm.LineStippleEnable = ctx->Line.StippleFlag;
  
        /* _NEW_POLYGON */
        wm.PolygonStippleEnable = ctx->Polygon.StippleFlag;
-      wm.BarycentricInterpolationMode = wm_prog_data->barycentric_interp_modes;
  
  #if GEN_GEN < 8
-      /* _NEW_BUFFERS */
-      const bool multisampled_fbo = _mesa_geometric_samples(ctx->DrawBuffer) > 1;
  
-      wm.PixelShaderUsesSourceDepth = wm_prog_data->uses_src_depth;
+#if GEN_GEN >= 6
        wm.PixelShaderUsesSourceW = wm_prog_data->uses_src_w;
-      if (wm_prog_data->uses_kill ||
-          _mesa_is_alpha_test_enabled(ctx) ||
-          _mesa_is_alpha_to_coverage_enabled(ctx) ||
-          wm_prog_data->uses_omask) {
-         wm.PixelShaderKillsPixel = true;
-      }
  
-      /* _NEW_BUFFERS | _NEW_COLOR */
-      if (brw_color_buffer_write_enabled(brw) || writes_depth ||
-          wm_prog_data->has_side_effects || wm.PixelShaderKillsPixel) {
-         wm.ThreadDispatchEnable = true;
-      }
+      /* _NEW_BUFFERS */
+      const bool multisampled_fbo = _mesa_geometric_samples(ctx->DrawBuffer) > 1;
+
        if (multisampled_fbo) {
           /* _NEW_MULTISAMPLE */
           if (ctx->Multisample.Enabled)
@@ -1753,6 +2074,21 @@ genX(upload_wm)(struct brw_context *brw)
           wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL;
           wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
        }
+#endif
+      wm.PixelShaderUsesSourceDepth = wm_prog_data->uses_src_depth;
+      if (wm_prog_data->uses_kill ||
+          _mesa_is_alpha_test_enabled(ctx) ||
+          _mesa_is_alpha_to_coverage_enabled(ctx) ||
+          (GEN_GEN >= 6 && wm_prog_data->uses_omask)) {
+         wm.PixelShaderKillsPixel = true;
+      }
+
+      /* _NEW_BUFFERS | _NEW_COLOR */
+      if (brw_color_buffer_write_enabled(brw) || writes_depth ||
+          wm.PixelShaderKillsPixel ||
+          (GEN_GEN >= 6 && wm_prog_data->has_side_effects)) {
+         wm.ThreadDispatchEnable = true;
+      }
  
  #if GEN_GEN >= 7
        wm.PixelShaderComputedDepthMode = wm_prog_data->computed_depth_mode;
@@ -1783,6 +2119,16 @@ genX(upload_wm)(struct brw_context *brw)
           wm.EarlyDepthStencilControl = EDSC_PSEXEC;
  #endif
     }
+
+#if GEN_GEN <= 5
+   if (brw->wm.offset_clamp != ctx->Polygon.OffsetClamp) {
+      brw_batch_emit(brw, GENX(3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP), clamp) {
+         clamp.GlobalDepthOffsetClamp = ctx->Polygon.OffsetClamp;
+      }
+
+      brw->wm.offset_clamp = ctx->Polygon.OffsetClamp;
+   }
+#endif
  }
  
  static const struct brw_tracked_state genX(wm_state) = {
@@ -1790,31 +2136,50 @@ static const struct brw_tracked_state genX(wm_state) = {
        .mesa  = _NEW_LINE |
                 _NEW_POLYGON |
                 (GEN_GEN < 8 ? _NEW_BUFFERS |
-                              _NEW_COLOR |
-                              _NEW_MULTISAMPLE :
+                              _NEW_COLOR :
                                0) |
-               (GEN_GEN < 7 ? _NEW_PROGRAM_CONSTANTS : 0),
+               (GEN_GEN == 6 ? _NEW_PROGRAM_CONSTANTS : 0) |
+               (GEN_GEN < 6 ? _NEW_POLYGONSTIPPLE : 0) |
+               (GEN_GEN < 8 && GEN_GEN >= 6 ? _NEW_MULTISAMPLE : 0),
        .brw   = BRW_NEW_BLORP |
                 BRW_NEW_FS_PROG_DATA |
+               (GEN_GEN < 6 ? BRW_NEW_PUSH_CONSTANT_ALLOCATION |
+                              BRW_NEW_FRAGMENT_PROGRAM |
+                              BRW_NEW_PROGRAM_CACHE |
+                              BRW_NEW_SAMPLER_STATE_TABLE |
+                              BRW_NEW_STATS_WM
+                            : 0) |
                 (GEN_GEN < 7 ? BRW_NEW_BATCH : BRW_NEW_CONTEXT),
     },
     .emit = genX(upload_wm),
  };
-#endif
  
  /* ---------------------------------------------------------------------- */
  
+/* We restrict scratch buffers to the bottom 32 bits of the address space
+ * by using rw_32_bo().
+ *
+ * General State Base Address is a bit broken.  If the address + size as
+ * seen by STATE_BASE_ADDRESS overflows 48 bits, the GPU appears to treat
+ * all accesses to the buffer as being out of bounds and returns zero.
+ */
+
  #define INIT_THREAD_DISPATCH_FIELDS(pkt, prefix) \
     pkt.KernelStartPointer = KSP(brw, stage_state->prog_offset);           \
     pkt.SamplerCount       =                                               \
        DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4);          \
+   /* Gen 11 workarounds table #2056 WABTPPrefetchDisable suggests to     \
+    * disable prefetching of binding tables in A0 and B0 steppings.       \
+    * TODO: Revisit this WA on C0 stepping.                               \
+    */                                                                    \
     pkt.BindingTableEntryCount =                                           \
+      GEN_GEN == 11 ?                                                     \
+      0 :                                                                 \
        stage_prog_data->binding_table.size_bytes / 4;                      \
     pkt.FloatingPointMode  = stage_prog_data->use_alt_mode;                \
                                                                            \
     if (stage_prog_data->total_scratch) {                                  \
-      pkt.ScratchSpaceBasePointer =                                       \
-         render_bo(stage_state->scratch_bo, 0);                           \
+      pkt.ScratchSpaceBasePointer = rw_32_bo(stage_state->scratch_bo, 0); \
        pkt.PerThreadScratchSpace =                                         \
           ffs(stage_state->per_thread_scratch) - 11;                       \
     }                                                                      \
@@ -1841,6 +2206,8 @@ genX(upload_vs_state)(struct brw_context *brw)
  
     assert(vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8 ||
            vue_prog_data->dispatch_mode == DISPATCH_MODE_4X2_DUAL_OBJECT);
+   assert(GEN_GEN < 11 ||
+          vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8);
  
  #if GEN_GEN == 6
     /* From the BSpec, 3D Pipeline > Geometry > Vertex Shader > State,
@@ -1857,8 +2224,8 @@ genX(upload_vs_state)(struct brw_context *brw)
     brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_VS), cvs) {
        if (stage_state->push_const_size != 0) {
           cvs.Buffer0Valid = true;
-         cvs.PointertoVSConstantBuffer0 = stage_state->push_const_offset;
-         cvs.VSConstantBuffer0ReadLength = stage_state->push_const_size - 1;
+         cvs.ConstantBody.PointertoConstantBuffer0 = stage_state->push_const_offset;
+         cvs.ConstantBody.ConstantBuffer0ReadLength = stage_state->push_const_size - 1;
        }
     }
  #endif
@@ -1889,7 +2256,7 @@ genX(upload_vs_state)(struct brw_context *brw)
  
        vs.StatisticsEnable = false;
        vs.SamplerStatePointer =
-         instruction_ro_bo(brw->batch.bo, stage_state->sampler_offset);
+         ro_bo(brw->batch.state.bo, stage_state->sampler_offset);
  #endif
  
  #if GEN_GEN == 5
@@ -1974,9 +2341,15 @@ genX(upload_cc_viewport)(struct brw_context *brw)
     for (unsigned i = 0; i < viewport_count; i++) {
        /* _NEW_VIEWPORT | _NEW_TRANSFORM */
        const struct gl_viewport_attrib *vp = &ctx->ViewportArray[i];
-      if (ctx->Transform.DepthClamp) {
+      if (ctx->Transform.DepthClampNear && ctx->Transform.DepthClampFar) {
           ccv.MinimumDepth = MIN2(vp->Near, vp->Far);
           ccv.MaximumDepth = MAX2(vp->Near, vp->Far);
+      } else if (ctx->Transform.DepthClampNear) {
+         ccv.MinimumDepth = MIN2(vp->Near, vp->Far);
+         ccv.MaximumDepth = 0.0;
+      } else if (ctx->Transform.DepthClampFar) {
+         ccv.MinimumDepth = 0.0;
+         ccv.MaximumDepth = MAX2(vp->Near, vp->Far);
        } else {
           ccv.MinimumDepth = 0.0;
           ccv.MaximumDepth = 1.0;
@@ -2013,9 +2386,9 @@ const struct brw_tracked_state genX(cc_vp) = {
  
  /* ---------------------------------------------------------------------- */
  
-static inline void
+static void
  set_scissor_bits(const struct gl_context *ctx, int i,
-                 bool render_to_fbo, unsigned fb_width, unsigned fb_height,
+                 bool flip_y, unsigned fb_width, unsigned fb_height,
                   struct GENX(SCISSOR_RECT) *sc)
  {
     int bbox[4];
@@ -2037,7 +2410,7 @@ set_scissor_bits(const struct gl_context *ctx, int i,
        sc->ScissorRectangleXMax = 0;
        sc->ScissorRectangleYMin = 1;
        sc->ScissorRectangleYMax = 0;
-   } else if (render_to_fbo) {
+   } else if (!flip_y) {
        /* texmemory: Y=0=bottom */
        sc->ScissorRectangleXMin = bbox[0];
        sc->ScissorRectangleXMax = bbox[1] - 1;
@@ -2057,7 +2430,7 @@ static void
  genX(upload_scissor_state)(struct brw_context *brw)
  {
     struct gl_context *ctx = &brw->ctx;
-   const bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
+   const bool flip_y = ctx->DrawBuffer->FlipY;
     struct GENX(SCISSOR_RECT) scissor;
     uint32_t scissor_state_offset;
     const unsigned int fb_width = _mesa_geometric_width(ctx->DrawBuffer);
@@ -2081,7 +2454,7 @@ genX(upload_scissor_state)(struct brw_context *brw)
      * inclusive but max is exclusive.
      */
     for (unsigned i = 0; i < viewport_count; i++) {
-      set_scissor_bits(ctx, i, render_to_fbo, fb_width, fb_height, &scissor);
+      set_scissor_bits(ctx, i, flip_y, fb_width, fb_height, &scissor);
        GENX(SCISSOR_RECT_pack)(
           NULL, scissor_map + i * GENX(SCISSOR_RECT_length), &scissor);
     }
@@ -2150,6 +2523,17 @@ brw_calculate_guardband_size(uint32_t fb_width, uint32_t fb_height,
      */
     const float gb_size = GEN_GEN >= 7 ? 16384.0f : 8192.0f;
  
+   /* Workaround: prevent gpu hangs on SandyBridge
+    * by disabling guardband clipping for odd dimensions.
+    */
+   if (GEN_GEN == 6 && (fb_width & 1 || fb_height & 1)) {
+      *xmin = -1.0f;
+      *xmax =  1.0f;
+      *ymin = -1.0f;
+      *ymax =  1.0f;
+      return;
+   }
+
     if (m00 != 0 && m11 != 0) {
        /* First, we compute the screen-space render area */
        const float ss_ra_xmin = MIN3(        0, m30 + m00, m30 - m00);
@@ -2196,7 +2580,7 @@ genX(upload_sf_clip_viewport)(struct brw_context *brw)
     const unsigned viewport_count = brw->clip.viewport_count;
  
     /* _NEW_BUFFERS */
-   const bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
+   const bool flip_y = ctx->DrawBuffer->FlipY;
     const uint32_t fb_width = (float)_mesa_geometric_width(ctx->DrawBuffer);
     const uint32_t fb_height = (float)_mesa_geometric_height(ctx->DrawBuffer);
  
@@ -2220,12 +2604,12 @@ genX(upload_sf_clip_viewport)(struct brw_context *brw)
  #endif
  
     /* _NEW_BUFFERS */
-   if (render_to_fbo) {
-      y_scale = 1.0;
-      y_bias = 0;
-   } else {
+   if (flip_y) {
        y_scale = -1.0;
        y_bias = (float)fb_height;
+   } else {
+      y_scale = 1.0;
+      y_bias = 0;
     }
  
     for (unsigned i = 0; i < brw->clip.viewport_count; i++) {
@@ -2253,29 +2637,33 @@ genX(upload_sf_clip_viewport)(struct brw_context *brw)
        clv.YMaxClipGuardband = gb_ymax;
  
  #if GEN_GEN < 6
-      set_scissor_bits(ctx, i, render_to_fbo, fb_width, fb_height,
+      set_scissor_bits(ctx, i, flip_y, fb_width, fb_height,
                         &sfv.ScissorRectangle);
  #elif GEN_GEN >= 8
        /* _NEW_VIEWPORT | _NEW_BUFFERS: Screen Space Viewport
         * The hardware will take the intersection of the drawing rectangle,
-       * scissor rectangle, and the viewport extents. We don't need to be
-       * smart, and can therefore just program the viewport extents.
+       * scissor rectangle, and the viewport extents.  However, emitting
+       * 3DSTATE_DRAWING_RECTANGLE is expensive since it requires a full
+       * pipeline stall so we're better off just being a little more clever
+       * with our viewport so we can emit it once at context creation time.
         */
+      const float viewport_Xmin = MAX2(ctx->ViewportArray[i].X, 0);
+      const float viewport_Ymin = MAX2(ctx->ViewportArray[i].Y, 0);
        const float viewport_Xmax =
-         ctx->ViewportArray[i].X + ctx->ViewportArray[i].Width;
+         MIN2(ctx->ViewportArray[i].X + ctx->ViewportArray[i].Width, fb_width);
        const float viewport_Ymax =
-         ctx->ViewportArray[i].Y + ctx->ViewportArray[i].Height;
+         MIN2(ctx->ViewportArray[i].Y + ctx->ViewportArray[i].Height, fb_height);
  
-      if (render_to_fbo) {
-         sfv.XMinViewPort = ctx->ViewportArray[i].X;
+      if (flip_y) {
+         sfv.XMinViewPort = viewport_Xmin;
           sfv.XMaxViewPort = viewport_Xmax - 1;
-         sfv.YMinViewPort = ctx->ViewportArray[i].Y;
-         sfv.YMaxViewPort = viewport_Ymax - 1;
+         sfv.YMinViewPort = fb_height - viewport_Ymax;
+         sfv.YMaxViewPort = fb_height - viewport_Ymin - 1;
        } else {
-         sfv.XMinViewPort = ctx->ViewportArray[i].X;
+         sfv.XMinViewPort = viewport_Xmin;
           sfv.XMaxViewPort = viewport_Xmax - 1;
-         sfv.YMinViewPort = fb_height - viewport_Ymax;
-         sfv.YMaxViewPort = fb_height - ctx->ViewportArray[i].Y - 1;
+         sfv.YMinViewPort = viewport_Ymin;
+         sfv.YMaxViewPort = viewport_Ymax - 1;
        }
  #endif
  
@@ -2322,30 +2710,31 @@ static const struct brw_tracked_state genX(sf_clip_viewport) = {
  
  /* ---------------------------------------------------------------------- */
  
-#if GEN_GEN >= 6
  static void
  genX(upload_gs_state)(struct brw_context *brw)
  {
-   const struct gen_device_info *devinfo = &brw->screen->devinfo;
+   UNUSED struct gl_context *ctx = &brw->ctx;
+   UNUSED const struct gen_device_info *devinfo = &brw->screen->devinfo;
     const struct brw_stage_state *stage_state = &brw->gs.base;
+   const struct gl_program *gs_prog = brw->programs[MESA_SHADER_GEOMETRY];
     /* BRW_NEW_GEOMETRY_PROGRAM */
-   bool active = brw->geometry_program;
+   bool active = GEN_GEN >= 6 && gs_prog;
  
     /* BRW_NEW_GS_PROG_DATA */
     struct brw_stage_prog_data *stage_prog_data = stage_state->prog_data;
-   const struct brw_vue_prog_data *vue_prog_data =
+   UNUSED const struct brw_vue_prog_data *vue_prog_data =
        brw_vue_prog_data(stage_prog_data);
  #if GEN_GEN >= 7
     const struct brw_gs_prog_data *gs_prog_data =
        brw_gs_prog_data(stage_prog_data);
  #endif
  
-#if GEN_GEN < 7
+#if GEN_GEN == 6
     brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_GS), cgs) {
        if (active && stage_state->push_const_size != 0) {
           cgs.Buffer0Valid = true;
-         cgs.PointertoGSConstantBuffer0 = stage_state->push_const_offset;
-         cgs.GSConstantBuffer0ReadLength = stage_state->push_const_size - 1;
+         cgs.ConstantBody.PointertoConstantBuffer0 = stage_state->push_const_offset;
+         cgs.ConstantBody.ConstantBuffer0ReadLength = stage_state->push_const_size - 1;
        }
     }
  #endif
@@ -2363,12 +2752,19 @@ genX(upload_gs_state)(struct brw_context *brw)
      * whole fixed function pipeline" means to emit a PIPE_CONTROL with the "CS
      * Stall" bit set.
      */
-   if (brw->gt == 2 && brw->gs.enabled != active)
+   if (devinfo->gt == 2 && brw->gs.enabled != active)
        gen7_emit_cs_stall_flush(brw);
  #endif
  
-   if (active) {
-      brw_batch_emit(brw, GENX(3DSTATE_GS), gs) {
+#if GEN_GEN >= 6
+   brw_batch_emit(brw, GENX(3DSTATE_GS), gs) {
+#else
+   ctx->NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
+   brw_state_emit(brw, GENX(GS_STATE), 32, &brw->ff_gs.state_offset, gs) {
+#endif
+
+#if GEN_GEN >= 6
+      if (active) {
           INIT_THREAD_DISPATCH_FIELDS(gs, Vertex);
  
  #if GEN_GEN >= 7
@@ -2418,9 +2814,8 @@ genX(upload_gs_state)(struct brw_context *brw)
  
  #if GEN_GEN < 7
           gs.SOStatisticsEnable = true;
-         gs.RenderingEnabled = 1;
-         if (brw->geometry_program->info.has_transform_feedback_varyings)
-            gs.SVBIPayloadEnable = true;
+         if (gs_prog->info.has_transform_feedback_varyings)
+            gs.SVBIPayloadEnable = _mesa_is_xfb_active_and_unpaused(ctx);
  
           /* GEN6_GS_SPF_MODE and GEN6_GS_VECTOR_MASK_ENABLE are enabled as it
            * was previously done for gen6.
@@ -2453,20 +2848,40 @@ genX(upload_gs_state)(struct brw_context *brw)
           gs.VertexURBEntryOutputLength = MAX2(urb_entry_output_length, 1);
  #endif
        }
-#if GEN_GEN < 7
-   } else if (brw->ff_gs.prog_active)  {
-      /* In gen6, transform feedback for the VS stage is done with an ad-hoc GS
-       * program. This function provides the needed 3DSTATE_GS for this.
-       */
-      upload_gs_state_for_tf(brw);
-#endif
-   } else {
-      brw_batch_emit(brw, GENX(3DSTATE_GS), gs) {
-         gs.StatisticsEnable = true;
-#if GEN_GEN < 7
-         gs.RenderingEnabled = true;
  #endif
  
+#if GEN_GEN <= 6
+      if (!active && brw->ff_gs.prog_active) {
+         /* In gen6, transform feedback for the VS stage is done with an
+          * ad-hoc GS program. The fields below provide the GS state needed
+          * to run it.
+          */
+         gs.KernelStartPointer = KSP(brw, brw->ff_gs.prog_offset);
+         gs.SingleProgramFlow = true;
+         gs.DispatchGRFStartRegisterForURBData = GEN_GEN == 6 ? 2 : 1;
+         gs.VertexURBEntryReadLength = brw->ff_gs.prog_data->urb_read_length;
+
+#if GEN_GEN <= 5
+         gs.GRFRegisterCount =
+            DIV_ROUND_UP(brw->ff_gs.prog_data->total_grf, 16) - 1;
+         /* BRW_NEW_URB_FENCE */
+         gs.NumberofURBEntries = brw->urb.nr_gs_entries;
+         gs.URBEntryAllocationSize = brw->urb.vsize - 1;
+         gs.MaximumNumberofThreads = brw->urb.nr_gs_entries >= 8 ? 1 : 0;
+         gs.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
+#else
+         gs.Enable = true;
+         gs.VectorMaskEnable = true;
+         gs.SVBIPayloadEnable = true;
+         gs.SVBIPostIncrementEnable = true;
+         gs.SVBIPostIncrementValue =
+            brw->ff_gs.prog_data->svbi_postincrement_value;
+         gs.SOStatisticsEnable = true;
+         gs.MaximumNumberofThreads = devinfo->max_gs_threads - 1;
+#endif
+      }
+#endif
+      if (!active && !brw->ff_gs.prog_active) {
  #if GEN_GEN < 8
           gs.DispatchGRFStartRegisterForURBData = 1;
  #if GEN_GEN >= 7
@@ -2474,44 +2889,182 @@ genX(upload_gs_state)(struct brw_context *brw)
  #endif
  #endif
        }
+
+#if GEN_GEN >= 6
+      gs.StatisticsEnable = true;
+#endif
+#if GEN_GEN == 5 || GEN_GEN == 6
+      gs.RenderingEnabled = true;
+#endif
+#if GEN_GEN <= 5
+      gs.MaximumVPIndex = brw->clip.viewport_count - 1;
+#endif
     }
-#if GEN_GEN < 7
+
+#if GEN_GEN == 6
     brw->gs.enabled = active;
  #endif
  }
  
  static const struct brw_tracked_state genX(gs_state) = {
     .dirty = {
-      .mesa  = (GEN_GEN < 7 ? _NEW_PROGRAM_CONSTANTS : 0),
+      .mesa  = (GEN_GEN == 6 ? _NEW_PROGRAM_CONSTANTS : 0),
        .brw   = BRW_NEW_BATCH |
                 BRW_NEW_BLORP |
-               BRW_NEW_CONTEXT |
-               BRW_NEW_GEOMETRY_PROGRAM |
-               BRW_NEW_GS_PROG_DATA |
+               (GEN_GEN <= 5 ? BRW_NEW_PUSH_CONSTANT_ALLOCATION |
+                               BRW_NEW_PROGRAM_CACHE |
+                               BRW_NEW_URB_FENCE |
+                               BRW_NEW_VIEWPORT_COUNT
+                             : 0) |
+               (GEN_GEN >= 6 ? BRW_NEW_CONTEXT |
+                               BRW_NEW_GEOMETRY_PROGRAM |
+                               BRW_NEW_GS_PROG_DATA
+                             : 0) |
                 (GEN_GEN < 7 ? BRW_NEW_FF_GS_PROG_DATA : 0),
     },
     .emit = genX(upload_gs_state),
  };
-#endif
  
  /* ---------------------------------------------------------------------- */
  
-UNUSED static GLenum
-fix_dual_blend_alpha_to_one(GLenum function)
+UNUSED static GLenum
+fix_dual_blend_alpha_to_one(GLenum function)
+{
+   switch (function) {
+   case GL_SRC1_ALPHA:
+      return GL_ONE;
+
+   case GL_ONE_MINUS_SRC1_ALPHA:
+      return GL_ZERO;
+   }
+
+   return function;
+}
+
+#define blend_factor(x) brw_translate_blend_factor(x)
+#define blend_eqn(x) brw_translate_blend_equation(x)
+
+/**
+ * Modify blend function to force destination alpha to 1.0
+ *
+ * If \c function specifies a blend function that uses destination alpha,
+ * replace it with a function that hard-wires destination alpha to 1.0.  This
+ * is used when rendering to xRGB targets.
+ */
+static GLenum
+brw_fix_xRGB_alpha(GLenum function)
  {
     switch (function) {
-   case GL_SRC1_ALPHA:
+   case GL_DST_ALPHA:
        return GL_ONE;
  
-   case GL_ONE_MINUS_SRC1_ALPHA:
+   case GL_ONE_MINUS_DST_ALPHA:
+   case GL_SRC_ALPHA_SATURATE:
        return GL_ZERO;
     }
  
     return function;
  }
  
-#define blend_factor(x) brw_translate_blend_factor(x)
-#define blend_eqn(x) brw_translate_blend_equation(x)
+#if GEN_GEN >= 6
+typedef struct GENX(BLEND_STATE_ENTRY) BLEND_ENTRY_GENXML;
+#else
+typedef struct GENX(COLOR_CALC_STATE) BLEND_ENTRY_GENXML;
+#endif
+
+UNUSED static bool
+set_blend_entry_bits(struct brw_context *brw, BLEND_ENTRY_GENXML *entry, int i,
+                     bool alpha_to_one)
+{
+   struct gl_context *ctx = &brw->ctx;
+
+   /* _NEW_BUFFERS */
+   const struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[i];
+
+   bool independent_alpha_blend = false;
+
+   /* Used for implementing the following bit of GL_EXT_texture_integer:
+    * "Per-fragment operations that require floating-point color
+    *  components, including multisample alpha operations, alpha test,
+    *  blending, and dithering, have no effect when the corresponding
+    *  colors are written to an integer color buffer."
+    */
+   const bool integer = ctx->DrawBuffer->_IntegerBuffers & (0x1 << i);
+
+   const unsigned blend_enabled = GEN_GEN >= 6 ?
+      ctx->Color.BlendEnabled & (1 << i) : ctx->Color.BlendEnabled;
+
+   /* _NEW_COLOR */
+   if (ctx->Color.ColorLogicOpEnabled) {
+      GLenum rb_type = rb ? _mesa_get_format_datatype(rb->Format)
+         : GL_UNSIGNED_NORMALIZED;
+      WARN_ONCE(ctx->Color.LogicOp != GL_COPY &&
+                rb_type != GL_UNSIGNED_NORMALIZED &&
+                rb_type != GL_FLOAT, "Ignoring %s logic op on %s "
+                "renderbuffer\n",
+                _mesa_enum_to_string(ctx->Color.LogicOp),
+                _mesa_enum_to_string(rb_type));
+      if (GEN_GEN >= 8 || rb_type == GL_UNSIGNED_NORMALIZED) {
+         entry->LogicOpEnable = true;
+         entry->LogicOpFunction = ctx->Color._LogicOp;
+      }
+   } else if (blend_enabled && !ctx->Color._AdvancedBlendMode
+              && (GEN_GEN <= 5 || !integer)) {
+      GLenum eqRGB = ctx->Color.Blend[i].EquationRGB;
+      GLenum eqA = ctx->Color.Blend[i].EquationA;
+      GLenum srcRGB = ctx->Color.Blend[i].SrcRGB;
+      GLenum dstRGB = ctx->Color.Blend[i].DstRGB;
+      GLenum srcA = ctx->Color.Blend[i].SrcA;
+      GLenum dstA = ctx->Color.Blend[i].DstA;
+
+      if (eqRGB == GL_MIN || eqRGB == GL_MAX)
+         srcRGB = dstRGB = GL_ONE;
+
+      if (eqA == GL_MIN || eqA == GL_MAX)
+         srcA = dstA = GL_ONE;
+
+      /* Due to hardware limitations, the destination may have information
+       * in an alpha channel even when the format specifies no alpha
+       * channel. In order to avoid getting any incorrect blending due to
+       * that alpha channel, coerce the blend factors to values that will
+       * not read the alpha channel, but will instead use the correct
+       * implicit value for alpha.
+       */
+      if (rb && !_mesa_base_format_has_channel(rb->_BaseFormat,
+                                               GL_TEXTURE_ALPHA_TYPE)) {
+         srcRGB = brw_fix_xRGB_alpha(srcRGB);
+         srcA = brw_fix_xRGB_alpha(srcA);
+         dstRGB = brw_fix_xRGB_alpha(dstRGB);
+         dstA = brw_fix_xRGB_alpha(dstA);
+      }
+
+      /* From the BLEND_STATE docs, DWord 0, Bit 29 (AlphaToOne Enable):
+       * "If Dual Source Blending is enabled, this bit must be disabled."
+       *
+       * We override SRC1_ALPHA to ONE and ONE_MINUS_SRC1_ALPHA to ZERO,
+       * and leave it enabled anyway.
+       */
+      if (GEN_GEN >= 6 && ctx->Color.Blend[i]._UsesDualSrc && alpha_to_one) {
+         srcRGB = fix_dual_blend_alpha_to_one(srcRGB);
+         srcA = fix_dual_blend_alpha_to_one(srcA);
+         dstRGB = fix_dual_blend_alpha_to_one(dstRGB);
+         dstA = fix_dual_blend_alpha_to_one(dstA);
+      }
+
+      entry->ColorBufferBlendEnable = true;
+      entry->DestinationBlendFactor = blend_factor(dstRGB);
+      entry->SourceBlendFactor = blend_factor(srcRGB);
+      entry->DestinationAlphaBlendFactor = blend_factor(dstA);
+      entry->SourceAlphaBlendFactor = blend_factor(srcA);
+      entry->ColorBlendFunction = blend_eqn(eqRGB);
+      entry->AlphaBlendFunction = blend_eqn(eqA);
+
+      if (srcA != srcRGB || dstA != dstRGB || eqA != eqRGB)
+         independent_alpha_blend = true;
+   }
+
+   return independent_alpha_blend;
+}
  
  #if GEN_GEN >= 6
  static void
@@ -2580,87 +3133,9 @@ genX(upload_blend_state)(struct brw_context *brw)
  #else
        {
  #endif
-
-         /* _NEW_BUFFERS */
-         struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[i];
-
-         /* Used for implementing the following bit of GL_EXT_texture_integer:
-          * "Per-fragment operations that require floating-point color
-          *  components, including multisample alpha operations, alpha test,
-          *  blending, and dithering, have no effect when the corresponding
-          *  colors are written to an integer color buffer."
-          */
-         bool integer = ctx->DrawBuffer->_IntegerBuffers & (0x1 << i);
-
-         /* _NEW_COLOR */
-         if (ctx->Color.ColorLogicOpEnabled) {
-            GLenum rb_type = rb ? _mesa_get_format_datatype(rb->Format)
-                                : GL_UNSIGNED_NORMALIZED;
-            WARN_ONCE(ctx->Color.LogicOp != GL_COPY &&
-                      rb_type != GL_UNSIGNED_NORMALIZED &&
-                      rb_type != GL_FLOAT, "Ignoring %s logic op on %s "
-                      "renderbuffer\n",
-                      _mesa_enum_to_string(ctx->Color.LogicOp),
-                      _mesa_enum_to_string(rb_type));
-            if (GEN_GEN >= 8 || rb_type == GL_UNSIGNED_NORMALIZED) {
-               entry.LogicOpEnable = true;
-               entry.LogicOpFunction =
-                  intel_translate_logic_op(ctx->Color.LogicOp);
-            }
-         } else if (ctx->Color.BlendEnabled & (1 << i) && !integer &&
-                    !ctx->Color._AdvancedBlendMode) {
-            GLenum eqRGB = ctx->Color.Blend[i].EquationRGB;
-            GLenum eqA = ctx->Color.Blend[i].EquationA;
-            GLenum srcRGB = ctx->Color.Blend[i].SrcRGB;
-            GLenum dstRGB = ctx->Color.Blend[i].DstRGB;
-            GLenum srcA = ctx->Color.Blend[i].SrcA;
-            GLenum dstA = ctx->Color.Blend[i].DstA;
-
-            if (eqRGB == GL_MIN || eqRGB == GL_MAX)
-               srcRGB = dstRGB = GL_ONE;
-
-            if (eqA == GL_MIN || eqA == GL_MAX)
-               srcA = dstA = GL_ONE;
-
-            /* Due to hardware limitations, the destination may have information
-             * in an alpha channel even when the format specifies no alpha
-             * channel. In order to avoid getting any incorrect blending due to
-             * that alpha channel, coerce the blend factors to values that will
-             * not read the alpha channel, but will instead use the correct
-             * implicit value for alpha.
-             */
-            if (rb && !_mesa_base_format_has_channel(rb->_BaseFormat,
-                                                     GL_TEXTURE_ALPHA_TYPE)) {
-               srcRGB = brw_fix_xRGB_alpha(srcRGB);
-               srcA = brw_fix_xRGB_alpha(srcA);
-               dstRGB = brw_fix_xRGB_alpha(dstRGB);
-               dstA = brw_fix_xRGB_alpha(dstA);
-            }
-
-            /* From the BLEND_STATE docs, DWord 0, Bit 29 (AlphaToOne Enable):
-             * "If Dual Source Blending is enabled, this bit must be disabled."
-             *
-             * We override SRC1_ALPHA to ONE and ONE_MINUS_SRC1_ALPHA to ZERO,
-             * and leave it enabled anyway.
-             */
-            if (ctx->Color.Blend[i]._UsesDualSrc && blend.AlphaToOneEnable) {
-               srcRGB = fix_dual_blend_alpha_to_one(srcRGB);
-               srcA = fix_dual_blend_alpha_to_one(srcA);
-               dstRGB = fix_dual_blend_alpha_to_one(dstRGB);
-               dstA = fix_dual_blend_alpha_to_one(dstA);
-            }
-
-            entry.ColorBufferBlendEnable = true;
-            entry.DestinationBlendFactor = blend_factor(dstRGB);
-            entry.SourceBlendFactor = blend_factor(srcRGB);
-            entry.DestinationAlphaBlendFactor = blend_factor(dstA);
-            entry.SourceAlphaBlendFactor = blend_factor(srcA);
-            entry.ColorBlendFunction = blend_eqn(eqRGB);
-            entry.AlphaBlendFunction = blend_eqn(eqA);
-
-            if (srcA != srcRGB || dstA != dstRGB || eqA != eqRGB)
-               blend.IndependentAlphaBlendEnable = true;
-         }
+         blend.IndependentAlphaBlendEnable =
+            set_blend_entry_bits(brw, &entry, i, blend.AlphaToOneEnable) ||
+            blend.IndependentAlphaBlendEnable;
  
           /* See section 8.1.6 "Pre-Blend Color Clamping" of the
            * SandyBridge PRM Volume 2 Part 1 for HW requirements.
@@ -2683,10 +3158,10 @@ genX(upload_blend_state)(struct brw_context *brw)
           entry.PostBlendColorClampEnable = true;
           entry.ColorClampRange = COLORCLAMP_RTFORMAT;
  
-         entry.WriteDisableRed   = !ctx->Color.ColorMask[i][0];
-         entry.WriteDisableGreen = !ctx->Color.ColorMask[i][1];
-         entry.WriteDisableBlue  = !ctx->Color.ColorMask[i][2];
-         entry.WriteDisableAlpha = !ctx->Color.ColorMask[i][3];
+         entry.WriteDisableRed   = !GET_COLORMASK_BIT(ctx->Color.ColorMask, i, 0);
+         entry.WriteDisableGreen = !GET_COLORMASK_BIT(ctx->Color.ColorMask, i, 1);
+         entry.WriteDisableBlue  = !GET_COLORMASK_BIT(ctx->Color.ColorMask, i, 2);
+         entry.WriteDisableAlpha = !GET_COLORMASK_BIT(ctx->Color.ColorMask, i, 3);
  
  #if GEN_GEN >= 8
           GENX(BLEND_STATE_ENTRY_pack)(NULL, &blend_map[1 + i * 2], &entry);
@@ -2741,30 +3216,112 @@ UNUSED static const uint32_t push_constant_opcodes[] = {
  };
  
  static void
-upload_constant_state(struct brw_context *brw,
-                      struct brw_stage_state *stage_state,
-                      bool active, uint32_t stage)
+genX(upload_push_constant_packets)(struct brw_context *brw)
  {
+   const struct gen_device_info *devinfo = &brw->screen->devinfo;
+   struct gl_context *ctx = &brw->ctx;
+
     UNUSED uint32_t mocs = GEN_GEN < 8 ? GEN7_MOCS_L3 : 0;
-   active = active && stage_state->push_const_size != 0;
  
-   brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_VS), pkt) {
-      pkt._3DCommandSubOpcode = push_constant_opcodes[stage];
-      if (active) {
+   struct brw_stage_state *stage_states[] = {
+      &brw->vs.base,
+      &brw->tcs.base,
+      &brw->tes.base,
+      &brw->gs.base,
+      &brw->wm.base,
+   };
+
+   if (GEN_GEN == 7 && !GEN_IS_HASWELL && !devinfo->is_baytrail &&
+       stage_states[MESA_SHADER_VERTEX]->push_constants_dirty)
+      gen7_emit_vs_workaround_flush(brw);
+
+   for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
+      struct brw_stage_state *stage_state = stage_states[stage];
+      UNUSED struct gl_program *prog = ctx->_Shader->CurrentProgram[stage];
+
+      if (!stage_state->push_constants_dirty)
+         continue;
+
+      brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_VS), pkt) {
+         pkt._3DCommandSubOpcode = push_constant_opcodes[stage];
+         if (stage_state->prog_data) {
  #if GEN_GEN >= 8 || GEN_IS_HASWELL
-         pkt.ConstantBody.ReadLength[2] = stage_state->push_const_size;
-         pkt.ConstantBody.Buffer[2] =
-            render_ro_bo(brw->curbe.curbe_bo, stage_state->push_const_offset);
+            /* The Skylake PRM contains the following restriction:
+             *
+             *    "The driver must ensure The following case does not occur
+             *     without a flush to the 3D engine: 3DSTATE_CONSTANT_* with
+             *     buffer 3 read length equal to zero committed followed by a
+             *     3DSTATE_CONSTANT_* with buffer 0 read length not equal to
+             *     zero committed."
+             *
+             * To avoid this, we program the buffers in the highest slots.
+             * This way, slot 0 is only used if slot 3 is also used.
+             */
+            int n = 3;
+
+            for (int i = 3; i >= 0; i--) {
+               const struct brw_ubo_range *range =
+                  &stage_state->prog_data->ubo_ranges[i];
+
+               if (range->length == 0)
+                  continue;
+
+               const struct gl_uniform_block *block =
+                  prog->sh.UniformBlocks[range->block];
+               const struct gl_buffer_binding *binding =
+                  &ctx->UniformBufferBindings[block->Binding];
+
+               if (binding->BufferObject == ctx->Shared->NullBufferObj) {
+                  static unsigned msg_id = 0;
+                  _mesa_gl_debug(ctx, &msg_id, MESA_DEBUG_SOURCE_API,
+                                 MESA_DEBUG_TYPE_UNDEFINED,
+                                 MESA_DEBUG_SEVERITY_HIGH,
+                                 "UBO %d unbound, %s shader uniform data "
+                                 "will be undefined.",
+                                 range->block,
+                                 _mesa_shader_stage_to_string(stage));
+                  continue;
+               }
+
+               assert(binding->Offset % 32 == 0);
+
+               struct brw_bo *bo = intel_bufferobj_buffer(brw,
+                  intel_buffer_object(binding->BufferObject),
+                  binding->Offset, range->length * 32, false);
+
+               pkt.ConstantBody.ReadLength[n] = range->length;
+               pkt.ConstantBody.Buffer[n] =
+                  ro_bo(bo, range->start * 32 + binding->Offset);
+               n--;
+            }
+
+            if (stage_state->push_const_size > 0) {
+               assert(n >= 0);
+               pkt.ConstantBody.ReadLength[n] = stage_state->push_const_size;
+               pkt.ConstantBody.Buffer[n] =
+                  ro_bo(stage_state->push_const_bo,
+                        stage_state->push_const_offset);
+            }
  #else
-         pkt.ConstantBody.ReadLength[0] = stage_state->push_const_size;
-         pkt.ConstantBody.Buffer[0].offset =
-            stage_state->push_const_offset | mocs;
+            pkt.ConstantBody.ReadLength[0] = stage_state->push_const_size;
+            pkt.ConstantBody.Buffer[0].offset =
+               stage_state->push_const_offset | mocs;
  #endif
+         }
        }
-   }
  
-   brw->ctx.NewDriverState |= GEN_GEN >= 9 ? BRW_NEW_SURFACES : 0;
+      stage_state->push_constants_dirty = false;
+      brw->ctx.NewDriverState |= GEN_GEN >= 9 ? BRW_NEW_SURFACES : 0;
+   }
  }
+
+const struct brw_tracked_state genX(push_constant_packets) = {
+   .dirty = {
+      .mesa  = 0,
+      .brw   = BRW_NEW_DRAW_CALL,
+   },
+   .emit = genX(upload_push_constant_packets),
+};
  #endif
  
  #if GEN_GEN >= 6
@@ -2773,21 +3330,12 @@ genX(upload_vs_push_constants)(struct brw_context *brw)
  {
     struct brw_stage_state *stage_state = &brw->vs.base;
  
-   /* _BRW_NEW_VERTEX_PROGRAM */
-   const struct brw_program *vp = brw_program_const(brw->vertex_program);
+   /* BRW_NEW_VERTEX_PROGRAM */
+   const struct gl_program *vp = brw->programs[MESA_SHADER_VERTEX];
     /* BRW_NEW_VS_PROG_DATA */
     const struct brw_stage_prog_data *prog_data = brw->vs.base.prog_data;
  
-   _mesa_shader_write_subroutine_indices(&brw->ctx, MESA_SHADER_VERTEX);
-   gen6_upload_push_constants(brw, &vp->program, prog_data, stage_state);
-
-#if GEN_GEN >= 7
-   if (GEN_GEN == 7 && !GEN_IS_HASWELL && !brw->is_baytrail)
-      gen7_emit_vs_workaround_flush(brw);
-
-   upload_constant_state(brw, stage_state, true /* active */,
-                         MESA_SHADER_VERTEX);
-#endif
+   gen6_upload_push_constants(brw, vp, prog_data, stage_state);
  }
  
  static const struct brw_tracked_state genX(vs_push_constants) = {
@@ -2796,7 +3344,6 @@ static const struct brw_tracked_state genX(vs_push_constants) = {
                 _NEW_TRANSFORM,
        .brw   = BRW_NEW_BATCH |
                 BRW_NEW_BLORP |
-               BRW_NEW_PUSH_CONSTANT_ALLOCATION |
                 BRW_NEW_VERTEX_PROGRAM |
                 BRW_NEW_VS_PROG_DATA,
     },
@@ -2809,19 +3356,12 @@ genX(upload_gs_push_constants)(struct brw_context *brw)
     struct brw_stage_state *stage_state = &brw->gs.base;
  
     /* BRW_NEW_GEOMETRY_PROGRAM */
-   const struct brw_program *gp = brw_program_const(brw->geometry_program);
-
-   if (gp) {
-      /* BRW_NEW_GS_PROG_DATA */
-      struct brw_stage_prog_data *prog_data = brw->gs.base.prog_data;
+   const struct gl_program *gp = brw->programs[MESA_SHADER_GEOMETRY];
  
-      _mesa_shader_write_subroutine_indices(&brw->ctx, MESA_SHADER_GEOMETRY);
-      gen6_upload_push_constants(brw, &gp->program, prog_data, stage_state);
-   }
+   /* BRW_NEW_GS_PROG_DATA */
+   struct brw_stage_prog_data *prog_data = brw->gs.base.prog_data;
  
-#if GEN_GEN >= 7
-   upload_constant_state(brw, stage_state, gp, MESA_SHADER_GEOMETRY);
-#endif
+   gen6_upload_push_constants(brw, gp, prog_data, stage_state);
  }
  
  static const struct brw_tracked_state genX(gs_push_constants) = {
@@ -2831,8 +3371,7 @@ static const struct brw_tracked_state genX(gs_push_constants) = {
        .brw   = BRW_NEW_BATCH |
                 BRW_NEW_BLORP |
                 BRW_NEW_GEOMETRY_PROGRAM |
-               BRW_NEW_GS_PROG_DATA |
-               BRW_NEW_PUSH_CONSTANT_ALLOCATION,
+               BRW_NEW_GS_PROG_DATA,
     },
     .emit = genX(upload_gs_push_constants),
  };
@@ -2842,17 +3381,11 @@ genX(upload_wm_push_constants)(struct brw_context *brw)
  {
     struct brw_stage_state *stage_state = &brw->wm.base;
     /* BRW_NEW_FRAGMENT_PROGRAM */
-   const struct brw_program *fp = brw_program_const(brw->fragment_program);
+   const struct gl_program *fp = brw->programs[MESA_SHADER_FRAGMENT];
     /* BRW_NEW_FS_PROG_DATA */
     const struct brw_stage_prog_data *prog_data = brw->wm.base.prog_data;
  
-   _mesa_shader_write_subroutine_indices(&brw->ctx, MESA_SHADER_FRAGMENT);
-
-   gen6_upload_push_constants(brw, &fp->program, prog_data, stage_state);
-
-#if GEN_GEN >= 7
-   upload_constant_state(brw, stage_state, true, MESA_SHADER_FRAGMENT);
-#endif
+   gen6_upload_push_constants(brw, fp, prog_data, stage_state);
  }
  
  static const struct brw_tracked_state genX(wm_push_constants) = {
@@ -2861,8 +3394,7 @@ static const struct brw_tracked_state genX(wm_push_constants) = {
        .brw   = BRW_NEW_BATCH |
                 BRW_NEW_BLORP |
                 BRW_NEW_FRAGMENT_PROGRAM |
-               BRW_NEW_FS_PROG_DATA |
-               BRW_NEW_PUSH_CONSTANT_ALLOCATION,
+               BRW_NEW_FS_PROG_DATA,
     },
     .emit = genX(upload_wm_push_constants),
  };
@@ -2907,9 +3439,7 @@ static void
  genX(emit_3dstate_multisample2)(struct brw_context *brw,
                                  unsigned num_samples)
  {
-   assert(brw->num_samples <= 16);
-
-   unsigned log2_samples = ffs(MAX2(num_samples, 1)) - 1;
+   unsigned log2_samples = ffs(num_samples) - 1;
  
     brw_batch_emit(brw, GENX(3DSTATE_MULTISAMPLE), multi) {
        multi.PixelLocation = CENTER;
@@ -2940,6 +3470,8 @@ genX(emit_3dstate_multisample2)(struct brw_context *brw,
  static void
  genX(upload_multisample_state)(struct brw_context *brw)
  {
+   assert(brw->num_samples > 0 && brw->num_samples <= 16);
+
     genX(emit_3dstate_multisample2)(brw, brw->num_samples);
  
     brw_batch_emit(brw, GENX(3DSTATE_SAMPLE_MASK), sm) {
@@ -2949,7 +3481,8 @@ genX(upload_multisample_state)(struct brw_context *brw)
  
  static const struct brw_tracked_state genX(multisample_state) = {
     .dirty = {
-      .mesa = _NEW_MULTISAMPLE,
+      .mesa = _NEW_MULTISAMPLE |
+              (GEN_GEN == 10 ? _NEW_BUFFERS : 0),
        .brw = BRW_NEW_BLORP |
               BRW_NEW_CONTEXT |
               BRW_NEW_NUM_SAMPLES,
@@ -2960,53 +3493,80 @@ static const struct brw_tracked_state genX(multisample_state) = {
  
  /* ---------------------------------------------------------------------- */
  
-#if GEN_GEN >= 6
  static void
  genX(upload_color_calc_state)(struct brw_context *brw)
  {
     struct gl_context *ctx = &brw->ctx;
  
     brw_state_emit(brw, GENX(COLOR_CALC_STATE), 64, &brw->cc.state_offset, cc) {
+#if GEN_GEN <= 5
+      cc.IndependentAlphaBlendEnable =
+         set_blend_entry_bits(brw, &cc, 0, false);
+      set_depth_stencil_bits(brw, &cc);
+
+      if (ctx->Color.AlphaEnabled &&
+          ctx->DrawBuffer->_NumColorDrawBuffers <= 1) {
+         cc.AlphaTestEnable = true;
+         cc.AlphaTestFunction =
+            intel_translate_compare_func(ctx->Color.AlphaFunc);
+      }
+
+      cc.ColorDitherEnable = ctx->Color.DitherFlag;
+
+      cc.StatisticsEnable = brw->stats_wm;
+
+      cc.CCViewportStatePointer =
+         ro_bo(brw->batch.state.bo, brw->cc.vp_offset);
+#else
        /* _NEW_COLOR */
-      cc.AlphaTestFormat = ALPHATEST_UNORM8;
-      UNCLAMPED_FLOAT_TO_UBYTE(cc.AlphaReferenceValueAsUNORM8,
-                               ctx->Color.AlphaRef);
+      cc.BlendConstantColorRed = ctx->Color.BlendColorUnclamped[0];
+      cc.BlendConstantColorGreen = ctx->Color.BlendColorUnclamped[1];
+      cc.BlendConstantColorBlue = ctx->Color.BlendColorUnclamped[2];
+      cc.BlendConstantColorAlpha = ctx->Color.BlendColorUnclamped[3];
  
  #if GEN_GEN < 9
        /* _NEW_STENCIL */
        cc.StencilReferenceValue = _mesa_get_stencil_ref(ctx, 0);
        cc.BackfaceStencilReferenceValue =
           _mesa_get_stencil_ref(ctx, ctx->Stencil._BackFace);
+#endif
+
  #endif
  
        /* _NEW_COLOR */
-      cc.BlendConstantColorRed = ctx->Color.BlendColorUnclamped[0];
-      cc.BlendConstantColorGreen = ctx->Color.BlendColorUnclamped[1];
-      cc.BlendConstantColorBlue = ctx->Color.BlendColorUnclamped[2];
-      cc.BlendConstantColorAlpha = ctx->Color.BlendColorUnclamped[3];
+      UNCLAMPED_FLOAT_TO_UBYTE(cc.AlphaReferenceValueAsUNORM8,
+                               ctx->Color.AlphaRef);
     }
  
+#if GEN_GEN >= 6
     brw_batch_emit(brw, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
        ptr.ColorCalcStatePointer = brw->cc.state_offset;
  #if GEN_GEN != 7
        ptr.ColorCalcStatePointerValid = true;
  #endif
     }
+#else
+   brw->ctx.NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
+#endif
  }
  
  static const struct brw_tracked_state genX(color_calc_state) = {
     .dirty = {
        .mesa = _NEW_COLOR |
-              _NEW_STENCIL,
+              _NEW_STENCIL |
+              (GEN_GEN <= 5 ? _NEW_BUFFERS |
+                              _NEW_DEPTH
+                            : 0),
        .brw = BRW_NEW_BATCH |
               BRW_NEW_BLORP |
-             BRW_NEW_CC_STATE |
-             BRW_NEW_STATE_BASE_ADDRESS,
+             (GEN_GEN <= 5 ? BRW_NEW_CC_VP |
+                             BRW_NEW_STATS_WM
+                           : BRW_NEW_CC_STATE |
+                             BRW_NEW_STATE_BASE_ADDRESS),
     },
     .emit = genX(upload_color_calc_state),
  };
  
-#endif
  
  /* ---------------------------------------------------------------------- */
  
@@ -3015,6 +3575,8 @@ static void
  genX(upload_sbe)(struct brw_context *brw)
  {
     struct gl_context *ctx = &brw->ctx;
+   /* BRW_NEW_FRAGMENT_PROGRAM */
+   UNUSED const struct gl_program *fp = brw->programs[MESA_SHADER_FRAGMENT];
     /* BRW_NEW_FS_PROG_DATA */
     const struct brw_wm_prog_data *wm_prog_data =
        brw_wm_prog_data(brw->wm.base.prog_data);
@@ -3032,14 +3594,14 @@ genX(upload_sbe)(struct brw_context *brw)
        sbe.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
  
        /* _NEW_BUFFERS */
-      bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
+      bool flip_y = ctx->DrawBuffer->FlipY;
  
        /* _NEW_POINT
         *
         * Window coordinates in an FBO are inverted, which means point
         * sprite origin must be inverted.
         */
-      if ((ctx->Point.SpriteOrigin == GL_LOWER_LEFT) != render_to_fbo)
+      if ((ctx->Point.SpriteOrigin == GL_LOWER_LEFT) == flip_y)
           sbe.PointSpriteTextureCoordinateOrigin = LOWERLEFT;
        else
           sbe.PointSpriteTextureCoordinateOrigin = UPPERLEFT;
@@ -3075,18 +3637,8 @@ genX(upload_sbe)(struct brw_context *brw)
  
  #if GEN_GEN >= 9
        /* prepare the active component dwords */
-      int input_index = 0;
-      for (int attr = 0; attr < VARYING_SLOT_MAX; attr++) {
-         if (!(brw->fragment_program->info.inputs_read &
-               BITFIELD64_BIT(attr))) {
-            continue;
-         }
-
-         assert(input_index < 32);
-
-         sbe.AttributeActiveComponentFormat[input_index] = ACTIVE_COMPONENT_XYZW;
-         ++input_index;
-      }
+      for (int i = 0; i < 32; i++)
+         sbe.AttributeActiveComponentFormat[i] = ACTIVE_COMPONENT_XYZW;
  #endif
     }
  
@@ -3244,28 +3796,29 @@ genX(upload_3dstate_so_buffers)(struct brw_context *brw)
     for (int i = 0; i < 4; i++) {
        struct intel_buffer_object *bufferobj =
           intel_buffer_object(xfb_obj->Buffers[i]);
+      uint32_t start = xfb_obj->Offset[i];
+      uint32_t end = ALIGN(start + xfb_obj->Size[i], 4);
+      uint32_t const size = end - start;
  
-      if (!bufferobj) {
+      if (!bufferobj || !size) {
           brw_batch_emit(brw, GENX(3DSTATE_SO_BUFFER), sob) {
              sob.SOBufferIndex = i;
           }
           continue;
        }
  
-      uint32_t start = xfb_obj->Offset[i];
        assert(start % 4 == 0);
-      uint32_t end = ALIGN(start + xfb_obj->Size[i], 4);
        struct brw_bo *bo =
-         intel_bufferobj_buffer(brw, bufferobj, start, end - start);
+         intel_bufferobj_buffer(brw, bufferobj, start, size, true);
        assert(end <= bo->size);
  
        brw_batch_emit(brw, GENX(3DSTATE_SO_BUFFER), sob) {
           sob.SOBufferIndex = i;
  
-         sob.SurfaceBaseAddress = render_bo(bo, start);
+         sob.SurfaceBaseAddress = rw_bo(bo, start);
  #if GEN_GEN < 8
           sob.SurfacePitch = linked_xfb_info->Buffers[i].Stride * 4;
-         sob.SurfaceEndAddress = render_bo(bo, end);
+         sob.SurfaceEndAddress = rw_bo(bo, end);
  #else
           sob.SOBufferEnable = true;
           sob.StreamOffsetWriteEnable = true;
@@ -3274,7 +3827,7 @@ genX(upload_3dstate_so_buffers)(struct brw_context *brw)
  
           sob.SurfaceSize = MAX2(xfb_obj->Size[i] / 4, 1) - 1;
           sob.StreamOutputBufferOffsetAddress =
-            instruction_bo(brw_obj->offset_bo, i * sizeof(uint32_t));
+            rw_bo(brw_obj->offset_bo, i * sizeof(uint32_t));
  
           if (brw_obj->zero_offsets) {
              /* Zero out the offset and write that to offset_bo */
@@ -3292,7 +3845,7 @@ genX(upload_3dstate_so_buffers)(struct brw_context *brw)
  #endif
  }
  
-static inline bool
+static bool
  query_active(struct gl_query_object *q)
  {
     return q && q->Active;
@@ -3322,7 +3875,7 @@ genX(upload_3dstate_streamout)(struct brw_context *brw, bool active,
                 sos.RenderingDisable = true;
              } else {
                 perf_debug("Rasterizer discard with a GL_PRIMITIVES_GENERATED "
-                          "query active relies on the clipper.");
+                          "query active relies on the clipper.\n");
              }
           }
  
@@ -3428,7 +3981,13 @@ genX(upload_ps)(struct brw_context *brw)
           DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4);
  
        /* BRW_NEW_FS_PROG_DATA */
-      ps.BindingTableEntryCount = prog_data->base.binding_table.size_bytes / 4;
+      /* Gen11 workarounds table #2056 (WABTPPrefetchDisable) suggests disabling
+       * prefetching of binding tables on A0 and B0 steppings.
+       * TODO: Revisit this workaround on C0 stepping.
+       */
+      ps.BindingTableEntryCount = GEN_GEN == 11 ?
+                                  0 :
+                                  prog_data->base.binding_table.size_bytes / 4;
  
        if (prog_data->base.use_alt_mode)
           ps.FloatingPointMode = Alternate;
@@ -3442,11 +4001,12 @@ genX(upload_ps)(struct brw_context *brw)
        ps.SampleMask = genX(determine_sample_mask(brw));
  #endif
  
-      /* 3DSTATE_PS expects the number of threads per PSD, which is always 64;
-       * it implicitly scales for different GT levels (which have some # of
-       * PSDs).
+      /* 3DSTATE_PS expects the number of threads per PSD, which is always 64
+       * before Gen11 and 128 on Gen11+. On Gen11+, a programmed value of k
+       * implies 2(k+1) threads. It implicitly scales for different GT
+       * levels (which have some # of PSDs).
         *
-       * In Gen8 the format is U8-2 whereas in Gen9 it is U8-1.
+       * In Gen8 the format is U8-2 whereas in Gen9+ it is U9-1.
         */
  #if GEN_GEN >= 9
        ps.MaximumNumberofThreadsPerPSD = 64 - 1;
@@ -3456,7 +4016,8 @@ genX(upload_ps)(struct brw_context *brw)
        ps.MaximumNumberofThreads = devinfo->max_wm_threads - 1;
  #endif
  
-      if (prog_data->base.nr_params > 0)
+      if (prog_data->base.nr_params > 0 ||
+          prog_data->base.ubo_ranges[0].length > 0)
           ps.PushConstantEnable = true;
  
  #if GEN_GEN < 8
@@ -3500,22 +4061,44 @@ genX(upload_ps)(struct brw_context *brw)
        else
           ps.PositionXYOffsetSelect = POSOFFSET_NONE;
  
-      ps.RenderTargetFastClearEnable = brw->wm.fast_clear_op;
        ps._8PixelDispatchEnable = prog_data->dispatch_8;
        ps._16PixelDispatchEnable = prog_data->dispatch_16;
+      ps._32PixelDispatchEnable = prog_data->dispatch_32;
+
+      /* From the Sky Lake PRM 3DSTATE_PS::32 Pixel Dispatch Enable:
+       *
+       *    "When NUM_MULTISAMPLES = 16 or FORCE_SAMPLE_COUNT = 16, SIMD32
+       *    Dispatch must not be enabled for PER_PIXEL dispatch mode."
+       *
+       * Since 16x MSAA is first introduced on SKL, we don't need to apply
+       * the workaround on any older hardware.
+       *
+       * BRW_NEW_NUM_SAMPLES
+       */
+      if (GEN_GEN >= 9 && !prog_data->persample_dispatch &&
+          brw->num_samples == 16) {
+         assert(ps._8PixelDispatchEnable || ps._16PixelDispatchEnable);
+         ps._32PixelDispatchEnable = false;
+      }
+
        ps.DispatchGRFStartRegisterForConstantSetupData0 =
-         prog_data->base.dispatch_grf_start_reg;
+         brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 0);
+      ps.DispatchGRFStartRegisterForConstantSetupData1 =
+         brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 1);
        ps.DispatchGRFStartRegisterForConstantSetupData2 =
-         prog_data->dispatch_grf_start_reg_2;
+         brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 2);
  
-      ps.KernelStartPointer0 = stage_state->prog_offset;
+      ps.KernelStartPointer0 = stage_state->prog_offset +
+                               brw_wm_prog_data_prog_offset(prog_data, ps, 0);
+      ps.KernelStartPointer1 = stage_state->prog_offset +
+                               brw_wm_prog_data_prog_offset(prog_data, ps, 1);
        ps.KernelStartPointer2 = stage_state->prog_offset +
-         prog_data->prog_offset_2;
+                               brw_wm_prog_data_prog_offset(prog_data, ps, 2);
  
        if (prog_data->base.total_scratch) {
           ps.ScratchSpaceBasePointer =
-            render_bo(stage_state->scratch_bo,
-                      ffs(stage_state->per_thread_scratch) - 11);
+            rw_32_bo(stage_state->scratch_bo,
+                     ffs(stage_state->per_thread_scratch) - 11);
        }
     }
  }
@@ -3528,7 +4111,8 @@ static const struct brw_tracked_state genX(ps_state) = {
                              : 0),
        .brw   = BRW_NEW_BATCH |
                 BRW_NEW_BLORP |
-               BRW_NEW_FS_PROG_DATA,
+               BRW_NEW_FS_PROG_DATA |
+               (GEN_GEN >= 9 ? BRW_NEW_NUM_SAMPLES : 0),
     },
     .emit = genX(upload_ps),
  };
@@ -3591,6 +4175,9 @@ genX(upload_ds_state)(struct brw_context *brw)
     if (!tes_prog_data) {
        brw_batch_emit(brw, GENX(3DSTATE_DS), ds);
     } else {
+      assert(GEN_GEN < 11 ||
+             vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8);
+
        brw_batch_emit(brw, GENX(3DSTATE_DS), ds) {
           INIT_THREAD_DISPATCH_FIELDS(ds, Patch);
  
@@ -3625,7 +4212,7 @@ static void
  upload_te_state(struct brw_context *brw)
  {
     /* BRW_NEW_TESS_PROGRAMS */
-   bool active = brw->tess_eval_program;
+   bool active = brw->programs[MESA_SHADER_TESS_EVAL];
  
     /* BRW_NEW_TES_PROG_DATA */
     const struct brw_tes_prog_data *tes_prog_data =
@@ -3663,16 +4250,11 @@ genX(upload_tes_push_constants)(struct brw_context *brw)
  {
     struct brw_stage_state *stage_state = &brw->tes.base;
     /* BRW_NEW_TESS_PROGRAMS */
-   const struct brw_program *tep = brw_program_const(brw->tess_eval_program);
-
-   if (tep) {
-      /* BRW_NEW_TES_PROG_DATA */
-      const struct brw_stage_prog_data *prog_data = brw->tes.base.prog_data;
-      _mesa_shader_write_subroutine_indices(&brw->ctx, MESA_SHADER_TESS_EVAL);
-      gen6_upload_push_constants(brw, &tep->program, prog_data, stage_state);
-   }
+   const struct gl_program *tep = brw->programs[MESA_SHADER_TESS_EVAL];
  
-   upload_constant_state(brw, stage_state, tep, MESA_SHADER_TESS_EVAL);
+   /* BRW_NEW_TES_PROG_DATA */
+   const struct brw_stage_prog_data *prog_data = brw->tes.base.prog_data;
+   gen6_upload_push_constants(brw, tep, prog_data, stage_state);
  }
  
  static const struct brw_tracked_state genX(tes_push_constants) = {
@@ -3680,7 +4262,6 @@ static const struct brw_tracked_state genX(tes_push_constants) = {
        .mesa  = _NEW_PROGRAM_CONSTANTS,
        .brw   = BRW_NEW_BATCH |
                 BRW_NEW_BLORP |
-               BRW_NEW_PUSH_CONSTANT_ALLOCATION |
                 BRW_NEW_TESS_PROGRAMS |
                 BRW_NEW_TES_PROG_DATA,
     },
@@ -3692,18 +4273,12 @@ genX(upload_tcs_push_constants)(struct brw_context *brw)
  {
     struct brw_stage_state *stage_state = &brw->tcs.base;
     /* BRW_NEW_TESS_PROGRAMS */
-   const struct brw_program *tcp = brw_program_const(brw->tess_ctrl_program);
-   bool active = brw->tess_eval_program;
-
-   if (active) {
-      /* BRW_NEW_TCS_PROG_DATA */
-      const struct brw_stage_prog_data *prog_data = brw->tcs.base.prog_data;
+   const struct gl_program *tcp = brw->programs[MESA_SHADER_TESS_CTRL];
  
-      _mesa_shader_write_subroutine_indices(&brw->ctx, MESA_SHADER_TESS_CTRL);
-      gen6_upload_push_constants(brw, &tcp->program, prog_data, stage_state);
-   }
+   /* BRW_NEW_TCS_PROG_DATA */
+   const struct brw_stage_prog_data *prog_data = brw->tcs.base.prog_data;
  
-   upload_constant_state(brw, stage_state, active, MESA_SHADER_TESS_CTRL);
+   gen6_upload_push_constants(brw, tcp, prog_data, stage_state);
  }
  
  static const struct brw_tracked_state genX(tcs_push_constants) = {
@@ -3712,7 +4287,6 @@ static const struct brw_tracked_state genX(tcs_push_constants) = {
        .brw   = BRW_NEW_BATCH |
                 BRW_NEW_BLORP |
                 BRW_NEW_DEFAULT_TESS_LEVELS |
-               BRW_NEW_PUSH_CONSTANT_ALLOCATION |
                 BRW_NEW_TESS_PROGRAMS |
                 BRW_NEW_TCS_PROG_DATA,
     },
@@ -3724,6 +4298,68 @@ static const struct brw_tracked_state genX(tcs_push_constants) = {
  /* ---------------------------------------------------------------------- */
  
  #if GEN_GEN >= 7
+static void
+genX(upload_cs_push_constants)(struct brw_context *brw)
+{
+   struct brw_stage_state *stage_state = &brw->cs.base;
+
+   /* BRW_NEW_COMPUTE_PROGRAM: compute entry of brw->programs; NULL = no-op */
+   const struct gl_program *cp = brw->programs[MESA_SHADER_COMPUTE];
+
+   if (cp) {
+      /* BRW_NEW_CS_PROG_DATA: CS-typed view of the stage's prog_data */
+      struct brw_cs_prog_data *cs_prog_data =
+         brw_cs_prog_data(brw->cs.base.prog_data);
+
+      _mesa_shader_write_subroutine_indices(&brw->ctx, MESA_SHADER_COMPUTE);
+      brw_upload_cs_push_constants(brw, cp, cs_prog_data, stage_state);
+   }
+}
+
+const struct brw_tracked_state genX(cs_push_constants) = {
+   .dirty = {   /* flags this atom is keyed on; presumably trigger .emit */
+      .mesa = _NEW_PROGRAM_CONSTANTS,
+      .brw = BRW_NEW_BATCH |
+             BRW_NEW_BLORP |
+             BRW_NEW_COMPUTE_PROGRAM |
+             BRW_NEW_CS_PROG_DATA,
+   },
+   .emit = genX(upload_cs_push_constants),   /* CS push-constant upload */
+};
+
+/**
+ * Creates a new CS constant buffer reflecting the current CS program's
+ * constants, if needed by the CS program (via brw_upload_pull_constants()).
+ */
+static void
+genX(upload_cs_pull_constants)(struct brw_context *brw)
+{
+   struct brw_stage_state *stage_state = &brw->cs.base;
+
+   /* BRW_NEW_COMPUTE_PROGRAM -- NOTE(review): cp is not NULL-checked here */
+   struct brw_program *cp =
+      (struct brw_program *) brw->programs[MESA_SHADER_COMPUTE];
+
+   /* BRW_NEW_CS_PROG_DATA: stage-generic prog_data, passed on as-is */
+   const struct brw_stage_prog_data *prog_data = brw->cs.base.prog_data;
+
+   _mesa_shader_write_subroutine_indices(&brw->ctx, MESA_SHADER_COMPUTE);
+   /* _NEW_PROGRAM_CONSTANTS */
+   brw_upload_pull_constants(brw, BRW_NEW_SURFACES, &cp->program,
+                             stage_state, prog_data);
+}
+
+const struct brw_tracked_state genX(cs_pull_constants) = {
+   .dirty = {   /* flags this atom is keyed on; presumably trigger .emit */
+      .mesa = _NEW_PROGRAM_CONSTANTS,
+      .brw = BRW_NEW_BATCH |
+             BRW_NEW_BLORP |
+             BRW_NEW_COMPUTE_PROGRAM |
+             BRW_NEW_CS_PROG_DATA,
+   },
+   .emit = genX(upload_cs_pull_constants),   /* CS pull-constant upload */
+};
+
  static void
  genX(upload_cs_state)(struct brw_context *brw)
  {
@@ -3745,41 +4381,63 @@ genX(upload_cs_state)(struct brw_context *brw)
           brw, &stage_state->surf_offset[
                   prog_data->binding_table.shader_time_start],
           brw->shader_time.bo, 0, ISL_FORMAT_RAW,
-         brw->shader_time.bo->size, 1, true);
+         brw->shader_time.bo->size, 1,
+         RELOC_WRITE);
     }
  
     uint32_t *bind = brw_state_batch(brw, prog_data->binding_table.size_bytes,
                                      32, &stage_state->bind_bo_offset);
  
+   /* The MEDIA_VFE_STATE documentation for Gen8+ says:
+    *
+    * "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
+    *  the only bits that are changed are scoreboard related: Scoreboard
+    *  Enable, Scoreboard Type, Scoreboard Mask, Scoreboard * Delta. For
+    *  these scoreboard related states, a MEDIA_STATE_FLUSH is sufficient."
+    *
+    * Earlier generations say "MI_FLUSH" instead of "stalling PIPE_CONTROL",
+    * but MI_FLUSH isn't really a thing, so we assume they meant PIPE_CONTROL.
+    */
+   brw_emit_pipe_control_flush(brw, PIPE_CONTROL_CS_STALL);
+
     brw_batch_emit(brw, GENX(MEDIA_VFE_STATE), vfe) {
        if (prog_data->total_scratch) {
-         uint32_t bo_offset;
+         uint32_t per_thread_scratch_value;
  
           if (GEN_GEN >= 8) {
              /* Broadwell's Per Thread Scratch Space is in the range [0, 11]
               * where 0 = 1k, 1 = 2k, 2 = 4k, ..., 11 = 2M.
               */
-            bo_offset = ffs(stage_state->per_thread_scratch) - 11;
+            per_thread_scratch_value = ffs(stage_state->per_thread_scratch) - 11;
           } else if (GEN_IS_HASWELL) {
              /* Haswell's Per Thread Scratch Space is in the range [0, 10]
               * where 0 = 2k, 1 = 4k, 2 = 8k, ..., 10 = 2M.
               */
-            bo_offset = ffs(stage_state->per_thread_scratch) - 12;
+            per_thread_scratch_value = ffs(stage_state->per_thread_scratch) - 12;
           } else {
              /* Earlier platforms use the range [0, 11] to mean [1kB, 12kB]
               * where 0 = 1kB, 1 = 2kB, 2 = 3kB, ..., 11 = 12kB.
               */
-            bo_offset = stage_state->per_thread_scratch / 1024 - 1;
+            per_thread_scratch_value = stage_state->per_thread_scratch / 1024 - 1;
           }
-         vfe.ScratchSpaceBasePointer =
-            render_bo(stage_state->scratch_bo, bo_offset);
+         vfe.ScratchSpaceBasePointer = rw_32_bo(stage_state->scratch_bo, 0);
+         vfe.PerThreadScratchSpace = per_thread_scratch_value;
        }
  
+      /* If brw->screen->subslice_total is greater than one, then
+       * devinfo->max_cs_threads stores the number of threads per sub-slice;
+       * thus we need to multiply that number by subslices to get
+       * the actual maximum number of threads; the -1 is because the HW
+       * has a bias of 1 (it would not make sense to say the maximum number
+       * of threads is 0).
+       */
        const uint32_t subslices = MAX2(brw->screen->subslice_total, 1);
        vfe.MaximumNumberofThreads = devinfo->max_cs_threads * subslices - 1;
        vfe.NumberofURBEntries = GEN_GEN >= 8 ? 2 : 0;
+#if GEN_GEN < 11
        vfe.ResetGatewayTimer =
           Resettingrelativetimerandlatchingtheglobaltimestamp;
+#endif
  #if GEN_GEN < 9
        vfe.BypassGatewayControl = BypassingOpenGatewayCloseGatewayprotocol;
  #endif
@@ -3827,11 +4485,11 @@ genX(upload_cs_state)(struct brw_context *brw)
     const struct GENX(INTERFACE_DESCRIPTOR_DATA) idd = {
        .KernelStartPointer = brw->cs.base.prog_offset,
        .SamplerStatePointer = stage_state->sampler_offset,
-      .SamplerCount = DIV_ROUND_UP(stage_state->sampler_count, 4) >> 2,
+      .SamplerCount = DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4),
        .BindingTablePointer = stage_state->bind_bo_offset,
        .ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs,
        .NumberofThreadsinGPGPUThreadGroup = cs_prog_data->threads,
-      .SharedLocalMemorySize = encode_slm_size(devinfo->gen,
+      .SharedLocalMemorySize = encode_slm_size(GEN_GEN,
                                                 prog_data->total_shared),
        .BarrierEnable = cs_prog_data->uses_barrier,
  #if GEN_GEN >= 8 || GEN_IS_HASWELL
@@ -3869,19 +4527,19 @@ static const struct brw_tracked_state genX(cs_state) = {
  static void
  genX(upload_raster)(struct brw_context *brw)
  {
-   struct gl_context *ctx = &brw->ctx;
+   const struct gl_context *ctx = &brw->ctx;
  
     /* _NEW_BUFFERS */
-   bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
+   const bool flip_y = ctx->DrawBuffer->FlipY;
  
     /* _NEW_POLYGON */
-   struct gl_polygon_attrib *polygon = &ctx->Polygon;
+   const struct gl_polygon_attrib *polygon = &ctx->Polygon;
  
     /* _NEW_POINT */
-   struct gl_point_attrib *point = &ctx->Point;
+   const struct gl_point_attrib *point = &ctx->Point;
  
     brw_batch_emit(brw, GENX(3DSTATE_RASTER), raster) {
-      if (brw->polygon_front_bit == render_to_fbo)
+      if (brw->polygon_front_bit != flip_y)
           raster.FrontWinding = CounterClockwise;
  
        if (polygon->CullFlag) {
@@ -3902,7 +4560,7 @@ genX(upload_raster)(struct brw_context *brw)
           raster.CullMode = CULLMODE_NONE;
        }
  
-      point->SmoothFlag = raster.SmoothPointEnable;
+      raster.SmoothPointEnable = point->SmoothFlag;
  
        raster.DXMultisampleRasterizationEnable =
           _mesa_is_multisample_enabled(ctx);
@@ -3942,18 +4600,33 @@ genX(upload_raster)(struct brw_context *brw)
        /* _NEW_LINE */
        raster.AntialiasingEnable = ctx->Line.SmoothFlag;
  
+#if GEN_GEN == 10
+      /* _NEW_BUFFERS
+       * Antialiasing Enable bit MUST not be set when NUM_MULTISAMPLES > 1.
+       */
+      const bool multisampled_fbo =
+         _mesa_geometric_samples(ctx->DrawBuffer) > 1;
+      if (multisampled_fbo)
+         raster.AntialiasingEnable = false;
+#endif
+
        /* _NEW_SCISSOR */
        raster.ScissorRectangleEnable = ctx->Scissor.EnableFlags;
  
        /* _NEW_TRANSFORM */
-      if (!ctx->Transform.DepthClamp) {
+#if GEN_GEN < 9
+      if (!(ctx->Transform.DepthClampNear &&
+            ctx->Transform.DepthClampFar))
+         raster.ViewportZClipTestEnable = true;
+#endif
+
  #if GEN_GEN >= 9
-         raster.ViewportZFarClipTestEnable = true;
+      if (!ctx->Transform.DepthClampNear)
           raster.ViewportZNearClipTestEnable = true;
-#else
-         raster.ViewportZClipTestEnable = true;
+
+      if (!ctx->Transform.DepthClampFar)
+         raster.ViewportZFarClipTestEnable = true;
  #endif
-      }
  
        /* BRW_NEW_CONSERVATIVE_RASTERIZATION */
  #if GEN_GEN >= 9
@@ -4212,7 +4885,7 @@ genX(emit_mi_report_perf_count)(struct brw_context *brw,
                                  uint32_t report_id)
  {
     brw_batch_emit(brw, GENX(MI_REPORT_PERF_COUNT), mi_rpc) {
-      mi_rpc.MemoryAddress = instruction_bo(bo, offset_in_bytes);
+      mi_rpc.MemoryAddress = ggtt_bo(bo, offset_in_bytes);
        mi_rpc.ReportID = report_id;
     }
  }
@@ -4224,8 +4897,8 @@ genX(emit_mi_report_perf_count)(struct brw_context *brw,
   * Emit a 3DSTATE_SAMPLER_STATE_POINTERS_{VS,HS,GS,DS,PS} packet.
   */
  static void
-genX(emit_sampler_state_pointers_xs)(struct brw_context *brw,
-                                     struct brw_stage_state *stage_state)
+genX(emit_sampler_state_pointers_xs)(MAYBE_UNUSED struct brw_context *brw,
+                                     MAYBE_UNUSED struct brw_stage_state *stage_state)
  {
  #if GEN_GEN >= 7
     static const uint16_t packet_headers[] = {
@@ -4265,7 +4938,7 @@ has_component(mesa_format format, int i)
  static void
  genX(upload_default_color)(struct brw_context *brw,
                             const struct gl_sampler_object *sampler,
-                           mesa_format format, GLenum base_format,
+                           MAYBE_UNUSED mesa_format format, GLenum base_format,
                             bool is_integer_format, bool is_stencil_sampling,
                             uint32_t *sdc_offset)
  {
@@ -4322,9 +4995,9 @@ genX(upload_default_color)(struct brw_context *brw,
        color.ui[3] = float_as_int(1.0);
  
     int alignment = 32;
-   if (brw->gen >= 8) {
+   if (GEN_GEN >= 8) {
        alignment = 64;
-   } else if (brw->is_haswell && (is_integer_format || is_stencil_sampling)) {
+   } else if (GEN_IS_HASWELL && (is_integer_format || is_stencil_sampling)) {
        alignment = 512;
     }
  
@@ -4441,7 +5114,7 @@ genX(upload_default_color)(struct brw_context *brw,
  }
  
  static uint32_t
-translate_wrap_mode(struct brw_context *brw, GLenum wrap, bool using_nearest)
+translate_wrap_mode(GLenum wrap, MAYBE_UNUSED bool using_nearest)
  {
     switch (wrap) {
     case GL_REPEAT:
@@ -4455,9 +5128,8 @@ translate_wrap_mode(struct brw_context *brw, GLenum wrap, bool using_nearest)
         *
         * Gen8+ supports this natively.
         */
-         return TCM_HALF_BORDER;
-#endif
-
+      return TCM_HALF_BORDER;
+#else
        /* On Gen4-7.5, we clamp the coordinates in the fragment shader
         * and set clamp_border here, which gets the result desired.
         * We just use clamp(_to_edge) for nearest, because for nearest
@@ -4468,6 +5140,7 @@ translate_wrap_mode(struct brw_context *brw, GLenum wrap, bool using_nearest)
           return TCM_CLAMP;
        else
           return TCM_CLAMP_BORDER;
+#endif
     case GL_CLAMP_TO_EDGE:
        return TCM_CLAMP;
     case GL_CLAMP_TO_BORDER:
@@ -4506,8 +5179,7 @@ genX(update_sampler_state)(struct brw_context *brw,
                             mesa_format format, GLenum base_format,
                             const struct gl_texture_object *texObj,
                             const struct gl_sampler_object *sampler,
-                           uint32_t *sampler_state,
-                           uint32_t batch_offset_for_sampler_state)
+                           uint32_t *sampler_state)
  {
     struct GENX(SAMPLER_STATE) samp_st = { 0 };
  
@@ -4551,7 +5223,7 @@ genX(update_sampler_state)(struct brw_context *brw,
     if (sampler->MaxAnisotropy > 1.0f) {
        if (samp_st.MinModeFilter == MAPFILTER_LINEAR)
           samp_st.MinModeFilter = MAPFILTER_ANISOTROPIC;
-      if (samp_st.MinModeFilter == MAPFILTER_LINEAR)
+      if (samp_st.MagModeFilter == MAPFILTER_LINEAR)
           samp_st.MagModeFilter = MAPFILTER_ANISOTROPIC;
  
        if (sampler->MaxAnisotropy > 2.0f) {
@@ -4575,9 +5247,9 @@ genX(update_sampler_state)(struct brw_context *brw,
  
     bool either_nearest =
        sampler->MinFilter == GL_NEAREST || sampler->MagFilter == GL_NEAREST;
-   unsigned wrap_s = translate_wrap_mode(brw, sampler->WrapS, either_nearest);
-   unsigned wrap_t = translate_wrap_mode(brw, sampler->WrapT, either_nearest);
-   unsigned wrap_r = translate_wrap_mode(brw, sampler->WrapR, either_nearest);
+   unsigned wrap_s = translate_wrap_mode(sampler->WrapS, either_nearest);
+   unsigned wrap_t = translate_wrap_mode(sampler->WrapT, either_nearest);
+   unsigned wrap_r = translate_wrap_mode(sampler->WrapR, either_nearest);
  
     if (target == GL_TEXTURE_CUBE_MAP ||
         target == GL_TEXTURE_CUBE_MAP_ARRAY) {
@@ -4651,15 +5323,12 @@ genX(update_sampler_state)(struct brw_context *brw,
                                   texObj->StencilSampling,
                                   &border_color_offset);
     }
-
-   samp_st.BorderColorPointer = border_color_offset;
-
-   if (GEN_GEN < 6) {
-      samp_st.BorderColorPointer += brw->batch.bo->offset64; /* reloc */
-      brw_emit_reloc(&brw->batch, batch_offset_for_sampler_state + 8,
-                     brw->batch.bo, border_color_offset,
-                     I915_GEM_DOMAIN_SAMPLER, 0);
-   }
+#if GEN_GEN < 6
+      samp_st.BorderColorPointer =
+         ro_bo(brw->batch.state.bo, border_color_offset);
+#else
+      samp_st.BorderColorPointer = border_color_offset;
+#endif
  
  #if GEN_GEN >= 8
     samp_st.LODPreClampMode = CLAMP_MODE_OGL;
@@ -4673,8 +5342,7 @@ genX(update_sampler_state)(struct brw_context *brw,
  static void
  update_sampler_state(struct brw_context *brw,
                       int unit,
-                     uint32_t *sampler_state,
-                     uint32_t batch_offset_for_sampler_state)
+                     uint32_t *sampler_state)
  {
     struct gl_context *ctx = &brw->ctx;
     const struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
@@ -4691,7 +5359,7 @@ update_sampler_state(struct brw_context *brw,
                                texUnit->LodBias,
                                firstImage->TexFormat, firstImage->_BaseFormat,
                                texObj, sampler,
-                              sampler_state, batch_offset_for_sampler_state);
+                              sampler_state);
  }
  
  static void
@@ -4716,19 +5384,15 @@ genX(upload_sampler_state_table)(struct brw_context *brw,
                                               32, &stage_state->sampler_offset);
     /* memset(sampler_state, 0, sampler_count * size_in_bytes); */
  
-   uint32_t batch_offset_for_sampler_state = stage_state->sampler_offset;
-
     for (unsigned s = 0; s < sampler_count; s++) {
        if (SamplersUsed & (1 << s)) {
           const unsigned unit = prog->SamplerUnits[s];
           if (ctx->Texture.Unit[unit]._Current) {
-            update_sampler_state(brw, unit, sampler_state,
-                                 batch_offset_for_sampler_state);
+            update_sampler_state(brw, unit, sampler_state);
           }
        }
  
        sampler_state += dwords;
-      batch_offset_for_sampler_state += size_in_bytes;
     }
  
     if (GEN_GEN >= 7 && stage_state->stage != MESA_SHADER_COMPUTE) {
@@ -4746,7 +5410,7 @@ static void
  genX(upload_fs_samplers)(struct brw_context *brw)
  {
     /* BRW_NEW_FRAGMENT_PROGRAM */
-   struct gl_program *fs = (struct gl_program *) brw->fragment_program;
+   struct gl_program *fs = brw->programs[MESA_SHADER_FRAGMENT];
     genX(upload_sampler_state_table)(brw, fs, &brw->wm.base);
  }
  
@@ -4764,7 +5428,7 @@ static void
  genX(upload_vs_samplers)(struct brw_context *brw)
  {
     /* BRW_NEW_VERTEX_PROGRAM */
-   struct gl_program *vs = (struct gl_program *) brw->vertex_program;
+   struct gl_program *vs = brw->programs[MESA_SHADER_VERTEX];
     genX(upload_sampler_state_table)(brw, vs, &brw->vs.base);
  }
  
@@ -4783,7 +5447,7 @@ static void
  genX(upload_gs_samplers)(struct brw_context *brw)
  {
     /* BRW_NEW_GEOMETRY_PROGRAM */
-   struct gl_program *gs = (struct gl_program *) brw->geometry_program;
+   struct gl_program *gs = brw->programs[MESA_SHADER_GEOMETRY];
     if (!gs)
        return;
  
@@ -4807,7 +5471,7 @@ static void
  genX(upload_tcs_samplers)(struct brw_context *brw)
  {
     /* BRW_NEW_TESS_PROGRAMS */
-   struct gl_program *tcs = (struct gl_program *) brw->tess_ctrl_program;
+   struct gl_program *tcs = brw->programs[MESA_SHADER_TESS_CTRL];
     if (!tcs)
        return;
  
@@ -4830,7 +5494,7 @@ static void
  genX(upload_tes_samplers)(struct brw_context *brw)
  {
     /* BRW_NEW_TESS_PROGRAMS */
-   struct gl_program *tes = (struct gl_program *) brw->tess_eval_program;
+   struct gl_program *tes = brw->programs[MESA_SHADER_TESS_EVAL];
     if (!tes)
        return;
  
@@ -4853,7 +5517,7 @@ static void
  genX(upload_cs_samplers)(struct brw_context *brw)
  {
     /* BRW_NEW_COMPUTE_PROGRAM */
-   struct gl_program *cs = (struct gl_program *) brw->compute_program;
+   struct gl_program *cs = brw->programs[MESA_SHADER_COMPUTE];
     if (!cs)
        return;
  
@@ -4873,6 +5537,32 @@ const struct brw_tracked_state genX(cs_samplers) = {
  
  /* ---------------------------------------------------------------------- */
  
+#if GEN_GEN <= 5
+
+static void genX(upload_blend_constant_color)(struct brw_context *brw)
+{
+   struct gl_context *ctx = &brw->ctx;   /* _NEW_COLOR */
+
+   brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_COLOR), blend_cc) {
+      blend_cc.BlendConstantColorRed = ctx->Color.BlendColorUnclamped[0];
+      blend_cc.BlendConstantColorGreen = ctx->Color.BlendColorUnclamped[1];
+      blend_cc.BlendConstantColorBlue = ctx->Color.BlendColorUnclamped[2];
+      blend_cc.BlendConstantColorAlpha = ctx->Color.BlendColorUnclamped[3];
+   }
+}
+
+static const struct brw_tracked_state genX(blend_constant_color) = {
+   .dirty = {
+      .mesa = _NEW_COLOR,   /* the emit callback reads ctx->Color */
+      .brw = BRW_NEW_CONTEXT |
+             BRW_NEW_BLORP,
+   },
+   .emit = genX(upload_blend_constant_color)   /* emits 3DSTATE_CONSTANT_COLOR */
+};
+#endif
+
+/* ---------------------------------------------------------------------- */
+
  void
  genX(init_atoms)(struct brw_context *brw)
  {
@@ -4887,7 +5577,7 @@ genX(init_atoms)(struct brw_context *brw)
        &brw_recalculate_urb_fence,
  
        &genX(cc_vp),
-      &brw_cc_unit,
+      &genX(color_calc_state),
  
        /* Surface state setup.  Must come before the VS/WM unit.  The binding
         * table upload must be last.
@@ -4904,19 +5594,17 @@ genX(init_atoms)(struct brw_context *brw)
        &genX(vs_samplers),
  
        /* These set up state for brw_psp_urb_cbs */
-      &brw_wm_unit,
+      &genX(wm_state),
        &genX(sf_clip_viewport),
        &genX(sf_state),
        &genX(vs_state), /* always required, enabled or not */
-      &brw_clip_unit,
-      &brw_gs_unit,
+      &genX(clip_state),
+      &genX(gs_state),
  
        /* Command packets:
         */
-      &brw_invariant_state,
-
        &brw_binding_table_pointers,
-      &brw_blend_constant_color,
+      &genX(blend_constant_color),
  
        &brw_depthbuffer,
  
@@ -5030,22 +5718,20 @@ genX(init_atoms)(struct brw_context *brw)
         */
        &brw_vs_pull_constants,
        &brw_vs_ubo_surfaces,
-      &brw_vs_abo_surfaces,
        &brw_tcs_pull_constants,
        &brw_tcs_ubo_surfaces,
-      &brw_tcs_abo_surfaces,
        &brw_tes_pull_constants,
        &brw_tes_ubo_surfaces,
-      &brw_tes_abo_surfaces,
        &brw_gs_pull_constants,
        &brw_gs_ubo_surfaces,
-      &brw_gs_abo_surfaces,
        &brw_wm_pull_constants,
        &brw_wm_ubo_surfaces,
-      &brw_wm_abo_surfaces,
        &gen6_renderbuffer_surfaces,
        &brw_renderbuffer_read_surfaces,
        &brw_texture_surfaces,
+
+      &genX(push_constant_packets),
+
        &brw_vs_binding_table,
        &brw_tcs_binding_table,
        &brw_tes_binding_table,
@@ -5073,7 +5759,7 @@ genX(init_atoms)(struct brw_context *brw)
  
        &genX(scissor_state),
  
-      &gen7_depthbuffer,
+      &brw_depthbuffer,
  
        &genX(polygon_stipple),
        &genX(polygon_stipple_offset),
@@ -5119,22 +5805,20 @@ genX(init_atoms)(struct brw_context *brw)
         */
        &brw_vs_pull_constants,
        &brw_vs_ubo_surfaces,
-      &brw_vs_abo_surfaces,
        &brw_tcs_pull_constants,
        &brw_tcs_ubo_surfaces,
-      &brw_tcs_abo_surfaces,
        &brw_tes_pull_constants,
        &brw_tes_ubo_surfaces,
-      &brw_tes_abo_surfaces,
        &brw_gs_pull_constants,
        &brw_gs_ubo_surfaces,
-      &brw_gs_abo_surfaces,
        &brw_wm_pull_constants,
        &brw_wm_ubo_surfaces,
-      &brw_wm_abo_surfaces,
        &gen6_renderbuffer_surfaces,
        &brw_renderbuffer_read_surfaces,
        &brw_texture_surfaces,
+
+      &genX(push_constant_packets),
+
        &brw_vs_binding_table,
        &brw_tcs_binding_table,
        &brw_tes_binding_table,
@@ -5166,7 +5850,7 @@ genX(init_atoms)(struct brw_context *brw)
  
        &genX(scissor_state),
  
-      &gen7_depthbuffer,
+      &brw_depthbuffer,
  
        &genX(polygon_stipple),
        &genX(polygon_stipple_offset),
@@ -5195,10 +5879,9 @@ genX(init_atoms)(struct brw_context *brw)
     {
        &gen7_l3_state,
        &brw_cs_image_surfaces,
-      &gen7_cs_push_constants,
-      &brw_cs_pull_constants,
+      &genX(cs_push_constants),
+      &genX(cs_pull_constants),
        &brw_cs_ubo_surfaces,
-      &brw_cs_abo_surfaces,
        &brw_cs_texture_surfaces,
        &brw_cs_work_groups_surface,
        &genX(cs_samplers),