i965: add CS stall on VF invalidation workaround
[mesa.git] / src / mesa / drivers / dri / i965 / genX_state_upload.c
index e3d4b5e067c80522fe35b4bf249b939cdfb0b688..dcdfb3c92929ff2ef4778ae7cd088a63182dd5fa 100644 (file)
@@ -23,7 +23,7 @@
 
 #include <assert.h>
 
-#include "common/gen_device_info.h"
+#include "dev/gen_device_info.h"
 #include "common/gen_sample_positions.h"
 #include "genxml/gen_macros.h"
 
@@ -59,7 +59,7 @@
 UNUSED static void *
 emit_dwords(struct brw_context *brw, unsigned n)
 {
-   intel_batchbuffer_begin(brw, n, RENDER_RING);
+   intel_batchbuffer_begin(brw, n);
    uint32_t *map = brw->batch.map_next;
    brw->batch.map_next += n;
    intel_batchbuffer_advance(brw);
@@ -101,7 +101,7 @@ __gen_combine_address(struct brw_context *brw, void *location,
    }
 }
 
-static struct brw_address
+UNUSED static struct brw_address
 rw_bo(struct brw_bo *bo, uint32_t offset)
 {
    return (struct brw_address) {
@@ -120,6 +120,26 @@ ro_bo(struct brw_bo *bo, uint32_t offset)
    };
 }
 
+static struct brw_address
+rw_32_bo(struct brw_bo *bo, uint32_t offset)
+{
+   return (struct brw_address) {
+            .bo = bo,
+            .offset = offset,
+            .reloc_flags = RELOC_WRITE | RELOC_32BIT,
+   };
+}
+
+static struct brw_address
+ro_32_bo(struct brw_bo *bo, uint32_t offset)
+{
+   return (struct brw_address) {
+            .bo = bo,
+            .offset = offset,
+            .reloc_flags = RELOC_32BIT,
+   };
+}
+
 UNUSED static struct brw_address
 ggtt_bo(struct brw_bo *bo, uint32_t offset)
 {
@@ -138,7 +158,7 @@ KSP(struct brw_context *brw, uint32_t offset)
 }
 #else
 static uint32_t
-KSP(struct brw_context *brw, uint32_t offset)
+KSP(UNUSED struct brw_context *brw, uint32_t offset)
 {
    return offset;
 }
@@ -177,6 +197,37 @@ KSP(struct brw_context *brw, uint32_t offset)
         _brw_cmd_pack(cmd)(brw, (void *)_dst, &name),              \
         _dst = NULL)
 
+#if GEN_GEN >= 7
+MAYBE_UNUSED static void
+emit_lrm(struct brw_context *brw, uint32_t reg, struct brw_address addr)
+{
+   brw_batch_emit(brw, GENX(MI_LOAD_REGISTER_MEM), lrm) {
+      lrm.RegisterAddress  = reg;
+      lrm.MemoryAddress    = addr;
+   }
+}
+#endif
+
+MAYBE_UNUSED static void
+emit_lri(struct brw_context *brw, uint32_t reg, uint32_t imm)
+{
+   brw_batch_emit(brw, GENX(MI_LOAD_REGISTER_IMM), lri) {
+      lri.RegisterOffset   = reg;
+      lri.DataDWord        = imm;
+   }
+}
+
+#if GEN_IS_HASWELL || GEN_GEN >= 8
+MAYBE_UNUSED static void
+emit_lrr(struct brw_context *brw, uint32_t dst, uint32_t src)
+{
+   brw_batch_emit(brw, GENX(MI_LOAD_REGISTER_REG), lrr) {
+      lrr.SourceRegisterAddress        = src;
+      lrr.DestinationRegisterAddress   = dst;
+   }
+}
+#endif
+
 /**
  * Polygon stipple packet
  */
@@ -197,7 +248,7 @@ genX(upload_polygon_stipple)(struct brw_context *brw)
        * to a FBO (i.e. any named frame buffer object), we *don't*
        * need to invert - we already match the layout.
        */
-      if (_mesa_is_winsys_fbo(ctx->DrawBuffer)) {
+      if (ctx->DrawBuffer->FlipY) {
          for (unsigned i = 0; i < 32; i++)
             poly.PatternRow[i] = ctx->PolygonStipple[31 - i]; /* invert */
       } else {
@@ -237,7 +288,7 @@ genX(upload_polygon_stipple_offset)(struct brw_context *brw)
        * to a user-created FBO then our native pixel coordinate system
        * works just fine, and there's no window system to worry about.
        */
-      if (_mesa_is_winsys_fbo(ctx->DrawBuffer)) {
+      if (ctx->DrawBuffer->FlipY) {
          poly.PolygonStippleYOffset =
             (32 - (_mesa_geometric_height(ctx->DrawBuffer) & 31)) & 31;
       }
@@ -310,14 +361,22 @@ genX(emit_vertex_buffer_state)(struct brw_context *brw,
                                unsigned buffer_nr,
                                struct brw_bo *bo,
                                unsigned start_offset,
-                               unsigned end_offset,
+                               MAYBE_UNUSED unsigned end_offset,
                                unsigned stride,
-                               unsigned step_rate)
+                               MAYBE_UNUSED unsigned step_rate)
 {
    struct GENX(VERTEX_BUFFER_STATE) buf_state = {
       .VertexBufferIndex = buffer_nr,
       .BufferPitch = stride,
-      .BufferStartingAddress = ro_bo(bo, start_offset),
+
+      /* The VF cache designers apparently cut corners, and made the cache
+       * only consider the bottom 32 bits of memory addresses.  If you happen
+       * to have two vertex buffers which get placed exactly 4 GiB apart and
+       * use them in back-to-back draw calls, you can get collisions.  To work
+       * around this problem, we restrict vertex buffers to the low 32 bits of
+       * the address space.
+       */
+      .BufferStartingAddress = ro_32_bo(bo, start_offset),
 #if GEN_GEN >= 8
       .BufferSize = end_offset - start_offset,
 #endif
@@ -335,15 +394,15 @@ genX(emit_vertex_buffer_state)(struct brw_context *brw,
 #endif
 
 #if GEN_GEN == 11
-      .VertexBufferMOCS = ICL_MOCS_WB,
+      .MOCS = ICL_MOCS_WB,
 #elif GEN_GEN == 10
-      .VertexBufferMOCS = CNL_MOCS_WB,
+      .MOCS = CNL_MOCS_WB,
 #elif GEN_GEN == 9
-      .VertexBufferMOCS = SKL_MOCS_WB,
+      .MOCS = SKL_MOCS_WB,
 #elif GEN_GEN == 8
-      .VertexBufferMOCS = BDW_MOCS_WB,
+      .MOCS = BDW_MOCS_WB,
 #elif GEN_GEN == 7
-      .VertexBufferMOCS = GEN7_MOCS_L3,
+      .MOCS = GEN7_MOCS_L3,
 #endif
    };
 
@@ -452,6 +511,80 @@ upload_format_size(uint32_t upload_format)
    }
 }
 
+static UNUSED uint16_t
+pinned_bo_high_bits(struct brw_bo *bo)
+{
+   return (bo->kflags & EXEC_OBJECT_PINNED) ? bo->gtt_offset >> 32ull : 0;
+}
+
+/* The VF cache designers apparently cut corners, and made the cache key's
+ * <VertexBufferIndex, Memory Address> tuple only consider the bottom 32 bits
+ * of the address.  If you happen to have two vertex buffers which get placed
+ * exactly 4 GiB apart and use them in back-to-back draw calls, you can get
+ * collisions.  (These collisions can happen within a single batch.)
+ *
+ * In the soft-pin world, we'd like to assign addresses up front, and never
+ * move buffers.  So, we need to do a VF cache invalidate if the buffer for
+ * a particular VB slot has different [48:32] address bits than the last one.
+ *
+ * In the relocation world, we have no idea what the addresses will be, so
+ * we can't apply this workaround.  Instead, we tell the kernel to move it
+ * to the low 4GB regardless.
+ *
+ * This HW issue is gone on Gen11+.
+ */
+static void
+vf_invalidate_for_vb_48bit_transitions(struct brw_context *brw)
+{
+#if GEN_GEN >= 8 && GEN_GEN < 11
+   bool need_invalidate = false;
+
+   for (unsigned i = 0; i < brw->vb.nr_buffers; i++) {
+      uint16_t high_bits = pinned_bo_high_bits(brw->vb.buffers[i].bo);
+
+      if (high_bits != brw->vb.last_bo_high_bits[i]) {
+         need_invalidate = true;
+         brw->vb.last_bo_high_bits[i] = high_bits;
+      }
+   }
+
+   if (brw->draw.draw_params_bo) {
+      uint16_t high_bits = pinned_bo_high_bits(brw->draw.draw_params_bo);
+
+      if (brw->vb.last_bo_high_bits[brw->vb.nr_buffers] != high_bits) {
+         need_invalidate = true;
+         brw->vb.last_bo_high_bits[brw->vb.nr_buffers] = high_bits;
+      }
+   }
+
+   if (brw->draw.derived_draw_params_bo) {
+      uint16_t high_bits = pinned_bo_high_bits(brw->draw.derived_draw_params_bo);
+
+      if (brw->vb.last_bo_high_bits[brw->vb.nr_buffers + 1] != high_bits) {
+         need_invalidate = true;
+         brw->vb.last_bo_high_bits[brw->vb.nr_buffers + 1] = high_bits;
+      }
+   }
+
+   if (need_invalidate) {
+      brw_emit_pipe_control_flush(brw, PIPE_CONTROL_VF_CACHE_INVALIDATE | PIPE_CONTROL_CS_STALL);
+   }
+#endif
+}
+
+static void
+vf_invalidate_for_ib_48bit_transition(struct brw_context *brw)
+{
+#if GEN_GEN >= 8
+   uint16_t high_bits = pinned_bo_high_bits(brw->ib.bo);
+
+   if (high_bits != brw->ib.last_bo_high_bits) {
+      brw_emit_pipe_control_flush(brw, PIPE_CONTROL_VF_CACHE_INVALIDATE);
+      brw->ib.last_bo_high_bits = high_bits;
+   }
+#endif
+}
+
 static void
 genX(emit_vertices)(struct brw_context *brw)
 {
@@ -511,13 +644,20 @@ genX(emit_vertices)(struct brw_context *brw)
    }
 #endif
 
-   const bool needs_sgvs_element = (vs_prog_data->uses_basevertex ||
-                                    vs_prog_data->uses_baseinstance ||
+   const bool uses_draw_params =
+      vs_prog_data->uses_firstvertex ||
+      vs_prog_data->uses_baseinstance;
+
+   const bool uses_derived_draw_params =
+      vs_prog_data->uses_drawid ||
+      vs_prog_data->uses_is_indexed_draw;
+
+   const bool needs_sgvs_element = (uses_draw_params ||
                                     vs_prog_data->uses_instanceid ||
                                     vs_prog_data->uses_vertexid);
 
    unsigned nr_elements =
-      brw->vb.nr_enabled + needs_sgvs_element + vs_prog_data->uses_drawid;
+      brw->vb.nr_enabled + needs_sgvs_element + uses_derived_draw_params;
 
 #if GEN_GEN < 8
    /* If any of the formats of vb.enabled needs more that one upload, we need
@@ -525,7 +665,8 @@ genX(emit_vertices)(struct brw_context *brw)
     */
    for (unsigned i = 0; i < brw->vb.nr_enabled; i++) {
       struct brw_vertex_element *input = brw->vb.enabled[i];
-      uint32_t format = brw_get_vertex_surface_type(brw, input->glarray);
+      const struct gl_array_attributes *glattrib = input->glattrib;
+      uint32_t format = brw_get_vertex_surface_type(brw, &glattrib->Format);
 
       if (uploads_needed(format, input->is_dual_slot) > 1)
          nr_elements++;
@@ -544,7 +685,7 @@ genX(emit_vertices)(struct brw_context *brw)
                            1 + GENX(VERTEX_ELEMENT_STATE_length));
       struct GENX(VERTEX_ELEMENT_STATE) elem = {
          .Valid = true,
-         .SourceElementFormat = (enum GENX(SURFACE_FORMAT)) ISL_FORMAT_R32G32B32A32_FLOAT,
+         .SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT,
          .Component0Control = VFCOMP_STORE_0,
          .Component1Control = VFCOMP_STORE_0,
          .Component2Control = VFCOMP_STORE_0,
@@ -555,11 +696,10 @@ genX(emit_vertices)(struct brw_context *brw)
    }
 
    /* Now emit 3DSTATE_VERTEX_BUFFERS and 3DSTATE_VERTEX_ELEMENTS packets. */
-   const bool uses_draw_params =
-      vs_prog_data->uses_basevertex ||
-      vs_prog_data->uses_baseinstance;
    const unsigned nr_buffers = brw->vb.nr_buffers +
-      uses_draw_params + vs_prog_data->uses_drawid;
+      uses_draw_params + uses_derived_draw_params;
+
+   vf_invalidate_for_vb_48bit_transitions(brw);
 
    if (nr_buffers) {
       assert(nr_buffers <= (GEN_GEN >= 6 ? 33 : 17));
@@ -593,11 +733,11 @@ genX(emit_vertices)(struct brw_context *brw)
                                              0 /* step rate */);
       }
 
-      if (vs_prog_data->uses_drawid) {
+      if (uses_derived_draw_params) {
          dw = genX(emit_vertex_buffer_state)(brw, dw, brw->vb.nr_buffers + 1,
-                                             brw->draw.draw_id_bo,
-                                             brw->draw.draw_id_offset,
-                                             brw->draw.draw_id_bo->size,
+                                             brw->draw.derived_draw_params_bo,
+                                             brw->draw.derived_draw_params_offset,
+                                             brw->draw.derived_draw_params_bo->size,
                                              0 /* stride */,
                                              0 /* step rate */);
       }
@@ -618,7 +758,8 @@ genX(emit_vertices)(struct brw_context *brw)
    unsigned i;
    for (i = 0; i < brw->vb.nr_enabled; i++) {
       const struct brw_vertex_element *input = brw->vb.enabled[i];
-      uint32_t format = brw_get_vertex_surface_type(brw, input->glarray);
+      const struct gl_array_attributes *glattrib = input->glattrib;
+      uint32_t format = brw_get_vertex_surface_type(brw, &glattrib->Format);
       uint32_t comp0 = VFCOMP_STORE_SRC;
       uint32_t comp1 = VFCOMP_STORE_SRC;
       uint32_t comp2 = VFCOMP_STORE_SRC;
@@ -659,17 +800,18 @@ genX(emit_vertices)(struct brw_context *brw)
           * entry. */
          const unsigned offset = input->offset + c * 16;
 
+         const struct gl_array_attributes *glattrib = input->glattrib;
          const int size = (GEN_GEN < 8 && is_passthru_format(format)) ?
-            upload_format_size(upload_format) : input->glarray->Size;
+            upload_format_size(upload_format) : glattrib->Format.Size;
 
          switch (size) {
             case 0: comp0 = VFCOMP_STORE_0;
             case 1: comp1 = VFCOMP_STORE_0;
             case 2: comp2 = VFCOMP_STORE_0;
             case 3:
-               if (GEN_GEN >= 8 && input->glarray->Doubles) {
+               if (GEN_GEN >= 8 && glattrib->Format.Doubles) {
                   comp3 = VFCOMP_STORE_0;
-               } else if (input->glarray->Integer) {
+               } else if (glattrib->Format.Integer) {
                   comp3 = VFCOMP_STORE_1_INT;
                } else {
                   comp3 = VFCOMP_STORE_1_FP;
@@ -694,7 +836,7 @@ genX(emit_vertices)(struct brw_context *brw)
           *     to be specified as VFCOMP_STORE_0 in order to output a 256-bit
           *     vertex element."
           */
-         if (input->glarray->Doubles && !input->is_dual_slot) {
+         if (glattrib->Format.Doubles && !input->is_dual_slot) {
             /* Store vertex elements which correspond to double and dvec2 vertex
              * shader inputs as 128-bit vertex elements, instead of 256-bits.
              */
@@ -735,21 +877,19 @@ genX(emit_vertices)(struct brw_context *brw)
       };
 
 #if GEN_GEN >= 8
-      if (vs_prog_data->uses_basevertex ||
-          vs_prog_data->uses_baseinstance) {
+      if (uses_draw_params) {
          elem_state.VertexBufferIndex = brw->vb.nr_buffers;
-         elem_state.SourceElementFormat = (enum GENX(SURFACE_FORMAT)) ISL_FORMAT_R32G32_UINT;
+         elem_state.SourceElementFormat = ISL_FORMAT_R32G32_UINT;
          elem_state.Component0Control = VFCOMP_STORE_SRC;
          elem_state.Component1Control = VFCOMP_STORE_SRC;
       }
 #else
       elem_state.VertexBufferIndex = brw->vb.nr_buffers;
-      elem_state.SourceElementFormat = (enum GENX(SURFACE_FORMAT)) ISL_FORMAT_R32G32_UINT;
-      if (vs_prog_data->uses_basevertex)
+      elem_state.SourceElementFormat = ISL_FORMAT_R32G32_UINT;
+      if (uses_draw_params) {
          elem_state.Component0Control = VFCOMP_STORE_SRC;
-
-      if (vs_prog_data->uses_baseinstance)
          elem_state.Component1Control = VFCOMP_STORE_SRC;
+      }
 
       if (vs_prog_data->uses_vertexid)
          elem_state.Component2Control = VFCOMP_STORE_VID;
@@ -762,13 +902,13 @@ genX(emit_vertices)(struct brw_context *brw)
       dw += GENX(VERTEX_ELEMENT_STATE_length);
    }
 
-   if (vs_prog_data->uses_drawid) {
+   if (uses_derived_draw_params) {
       struct GENX(VERTEX_ELEMENT_STATE) elem_state = {
          .Valid = true,
          .VertexBufferIndex = brw->vb.nr_buffers + 1,
-         .SourceElementFormat = (enum GENX(SURFACE_FORMAT)) ISL_FORMAT_R32_UINT,
+         .SourceElementFormat = ISL_FORMAT_R32G32_UINT,
          .Component0Control = VFCOMP_STORE_SRC,
-         .Component1Control = VFCOMP_STORE_0,
+         .Component1Control = VFCOMP_STORE_SRC,
          .Component2Control = VFCOMP_STORE_0,
          .Component3Control = VFCOMP_STORE_0,
 #if GEN_GEN < 5
@@ -782,8 +922,8 @@ genX(emit_vertices)(struct brw_context *brw)
 
 #if GEN_GEN >= 6
    if (gen6_edgeflag_input) {
-      const uint32_t format =
-         brw_get_vertex_surface_type(brw, gen6_edgeflag_input->glarray);
+      const struct gl_array_attributes *glattrib = gen6_edgeflag_input->glattrib;
+      const uint32_t format = brw_get_vertex_surface_type(brw, &glattrib->Format);
 
       struct GENX(VERTEX_ELEMENT_STATE) elem_state = {
          .Valid = true,
@@ -839,6 +979,7 @@ static const struct brw_tracked_state genX(vertices) = {
       .mesa = _NEW_POLYGON,
       .brw = BRW_NEW_BATCH |
              BRW_NEW_BLORP |
+             BRW_NEW_VERTEX_PROGRAM |
              BRW_NEW_VERTICES |
              BRW_NEW_VS_PROG_DATA,
    },
@@ -853,14 +994,24 @@ genX(emit_index_buffer)(struct brw_context *brw)
    if (index_buffer == NULL)
       return;
 
+   vf_invalidate_for_ib_48bit_transition(brw);
+
    brw_batch_emit(brw, GENX(3DSTATE_INDEX_BUFFER), ib) {
 #if GEN_GEN < 8 && !GEN_IS_HASWELL
       ib.CutIndexEnable = brw->prim_restart.enable_cut_index;
 #endif
       ib.IndexFormat = brw_get_index_type(index_buffer->index_size);
-      ib.BufferStartingAddress = ro_bo(brw->ib.bo, 0);
+
+      /* The VF cache designers apparently cut corners, and made the cache
+       * only consider the bottom 32 bits of memory addresses.  If you happen
+       * to have two index buffers which get placed exactly 4 GiB apart and
+       * use them in back-to-back draw calls, you can get collisions.  To work
+       * around this problem, we restrict index buffers to the low 32 bits of
+       * the address space.
+       */
+      ib.BufferStartingAddress = ro_32_bo(brw->ib.bo, 0);
 #if GEN_GEN >= 8
-      ib.IndexBufferMOCS = GEN_GEN >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB;
+      ib.MOCS = GEN_GEN >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB;
       ib.BufferSize = brw->ib.size;
 #else
       ib.BufferEndingAddress = ro_bo(brw->ib.bo, brw->ib.size - 1);
@@ -1295,7 +1446,8 @@ genX(upload_clip_state)(struct brw_context *brw)
       clip.ScreenSpaceViewportYMax = 1;
 
       clip.ViewportXYClipTestEnable = true;
-      clip.ViewportZClipTestEnable = !ctx->Transform.DepthClamp;
+      clip.ViewportZClipTestEnable = !(ctx->Transform.DepthClampNear &&
+                                       ctx->Transform.DepthClampFar);
 
       /* _NEW_TRANSFORM */
       if (GEN_GEN == 5 || GEN_IS_G4X) {
@@ -1364,7 +1516,7 @@ genX(upload_clip_state)(struct brw_context *brw)
 #endif
 
 #if GEN_GEN == 7
-      clip.FrontWinding = brw->polygon_front_bit == _mesa_is_user_fbo(fb);
+      clip.FrontWinding = brw->polygon_front_bit != fb->FlipY;
 
       if (ctx->Polygon.CullFlag) {
          switch (ctx->Polygon.CullFaceMode) {
@@ -1389,7 +1541,8 @@ genX(upload_clip_state)(struct brw_context *brw)
       clip.UserClipDistanceCullTestEnableBitmask =
          brw_vue_prog_data(brw->vs.base.prog_data)->cull_distance_mask;
 
-      clip.ViewportZClipTestEnable = !ctx->Transform.DepthClamp;
+      clip.ViewportZClipTestEnable = !(ctx->Transform.DepthClampNear &&
+                                       ctx->Transform.DepthClampFar);
 #endif
 
       /* _NEW_LIGHT */
@@ -1479,7 +1632,7 @@ genX(upload_sf)(struct brw_context *brw)
 
 #if GEN_GEN <= 7
    /* _NEW_BUFFERS */
-   bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
+   bool flip_y = ctx->DrawBuffer->FlipY;
    UNUSED const bool multisampled_fbo =
       _mesa_geometric_samples(ctx->DrawBuffer) > 1;
 #endif
@@ -1531,7 +1684,7 @@ genX(upload_sf)(struct brw_context *brw)
 
 #if GEN_GEN <= 7
       /* _NEW_POLYGON */
-      sf.FrontWinding = brw->polygon_front_bit == render_to_fbo;
+      sf.FrontWinding = brw->polygon_front_bit != flip_y;
 #if GEN_GEN >= 6
       sf.GlobalDepthOffsetEnableSolid = ctx->Polygon.OffsetFill;
       sf.GlobalDepthOffsetEnableWireframe = ctx->Polygon.OffsetLine;
@@ -1669,7 +1822,7 @@ genX(upload_sf)(struct brw_context *brw)
        * Window coordinates in an FBO are inverted, which means point
        * sprite origin must be inverted, too.
        */
-      if ((ctx->Point.SpriteOrigin == GL_LOWER_LEFT) != render_to_fbo) {
+      if ((ctx->Point.SpriteOrigin == GL_LOWER_LEFT) == flip_y) {
          sf.PointSpriteTextureCoordinateOrigin = LOWERLEFT;
       } else {
          sf.PointSpriteTextureCoordinateOrigin = UPPERLEFT;
@@ -1775,47 +1928,100 @@ genX(upload_wm)(struct brw_context *brw)
          /* Pointer to the WM constant buffer.  Covered by the set of
           * state flags from gen6_upload_wm_push_constants.
           */
-         wmcp.PointertoPSConstantBuffer0 = stage_state->push_const_offset;
-         wmcp.PSConstantBuffer0ReadLength = stage_state->push_const_size - 1;
+         wmcp.ConstantBody.PointertoConstantBuffer0 = stage_state->push_const_offset;
+         wmcp.ConstantBody.ConstantBuffer0ReadLength = stage_state->push_const_size - 1;
       }
    }
 #endif
 
 #if GEN_GEN >= 6
    brw_batch_emit(brw, GENX(3DSTATE_WM), wm) {
-      wm.LineAntialiasingRegionWidth = _10pixels;
-      wm.LineEndCapAntialiasingRegionWidth = _05pixels;
-
-      wm.PointRasterizationRule = RASTRULE_UPPER_RIGHT;
-      wm.BarycentricInterpolationMode = wm_prog_data->barycentric_interp_modes;
 #else
    ctx->NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
    brw_state_emit(brw, GENX(WM_STATE), 64, &stage_state->state_offset, wm) {
-      if (wm_prog_data->dispatch_8 && wm_prog_data->dispatch_16) {
-         /* These two fields should be the same pre-gen6, which is why we
-          * only have one hardware field to program for both dispatch
-          * widths.
-          */
-         assert(wm_prog_data->base.dispatch_grf_start_reg ==
-                wm_prog_data->dispatch_grf_start_reg_2);
+#endif
+
+#if GEN_GEN <= 6
+      wm._8PixelDispatchEnable = wm_prog_data->dispatch_8;
+      wm._16PixelDispatchEnable = wm_prog_data->dispatch_16;
+      wm._32PixelDispatchEnable = wm_prog_data->dispatch_32;
+#endif
+
+#if GEN_GEN == 4
+      /* On gen4, we only have one shader kernel */
+      if (brw_wm_state_has_ksp(wm, 0)) {
+         assert(brw_wm_prog_data_prog_offset(wm_prog_data, wm, 0) == 0);
+         wm.KernelStartPointer0 = KSP(brw, stage_state->prog_offset);
+         wm.GRFRegisterCount0 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 0);
+         wm.DispatchGRFStartRegisterForConstantSetupData0 =
+            brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 0);
       }
+#elif GEN_GEN == 5
+      /* On gen5, we have multiple shader kernels but only one GRF start
+       * register for all kernels
+       */
+      wm.KernelStartPointer0 = stage_state->prog_offset +
+                               brw_wm_prog_data_prog_offset(wm_prog_data, wm, 0);
+      wm.KernelStartPointer1 = stage_state->prog_offset +
+                               brw_wm_prog_data_prog_offset(wm_prog_data, wm, 1);
+      wm.KernelStartPointer2 = stage_state->prog_offset +
+                               brw_wm_prog_data_prog_offset(wm_prog_data, wm, 2);
 
-      if (wm_prog_data->dispatch_8 || wm_prog_data->dispatch_16)
-         wm.GRFRegisterCount0 = wm_prog_data->reg_blocks_0;
+      wm.GRFRegisterCount0 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 0);
+      wm.GRFRegisterCount1 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 1);
+      wm.GRFRegisterCount2 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 2);
 
-      if (stage_state->sampler_count)
-         wm.SamplerStatePointer =
-            ro_bo(brw->batch.state.bo, stage_state->sampler_offset);
-#if GEN_GEN == 5
-      if (wm_prog_data->prog_offset_2)
-         wm.GRFRegisterCount2 = wm_prog_data->reg_blocks_2;
+      wm.DispatchGRFStartRegisterForConstantSetupData0 =
+         wm_prog_data->base.dispatch_grf_start_reg;
+
+      /* Dispatch GRF Start should be the same for all shaders on gen5 */
+      if (brw_wm_state_has_ksp(wm, 1)) {
+         assert(wm_prog_data->base.dispatch_grf_start_reg ==
+                brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 1));
+      }
+      if (brw_wm_state_has_ksp(wm, 2)) {
+         assert(wm_prog_data->base.dispatch_grf_start_reg ==
+                brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 2));
+      }
+#elif GEN_GEN == 6
+      /* On gen6, we have multiple shader kernels and we no longer specify a
+       * register count for each one.
+       */
+      wm.KernelStartPointer0 = stage_state->prog_offset +
+                               brw_wm_prog_data_prog_offset(wm_prog_data, wm, 0);
+      wm.KernelStartPointer1 = stage_state->prog_offset +
+                               brw_wm_prog_data_prog_offset(wm_prog_data, wm, 1);
+      wm.KernelStartPointer2 = stage_state->prog_offset +
+                               brw_wm_prog_data_prog_offset(wm_prog_data, wm, 2);
+
+      wm.DispatchGRFStartRegisterForConstantSetupData0 =
+         brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 0);
+      wm.DispatchGRFStartRegisterForConstantSetupData1 =
+         brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 1);
+      wm.DispatchGRFStartRegisterForConstantSetupData2 =
+         brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 2);
 #endif
 
-      wm.SetupURBEntryReadLength = wm_prog_data->num_varying_inputs * 2;
+#if GEN_GEN <= 5
       wm.ConstantURBEntryReadLength = wm_prog_data->base.curb_read_length;
       /* BRW_NEW_PUSH_CONSTANT_ALLOCATION */
       wm.ConstantURBEntryReadOffset = brw->curbe.wm_start * 2;
+      wm.SetupURBEntryReadLength = wm_prog_data->num_varying_inputs * 2;
+      wm.SetupURBEntryReadOffset = 0;
       wm.EarlyDepthTestEnable = true;
+#endif
+
+#if GEN_GEN >= 6
+      wm.LineAntialiasingRegionWidth = _10pixels;
+      wm.LineEndCapAntialiasingRegionWidth = _05pixels;
+
+      wm.PointRasterizationRule = RASTRULE_UPPER_RIGHT;
+      wm.BarycentricInterpolationMode = wm_prog_data->barycentric_interp_modes;
+#else
+      if (stage_state->sampler_count)
+         wm.SamplerStatePointer =
+            ro_bo(brw->batch.state.bo, stage_state->sampler_offset);
+
       wm.LineAntialiasingRegionWidth = _05pixels;
       wm.LineEndCapAntialiasingRegionWidth = _10pixels;
 
@@ -1844,27 +2050,13 @@ genX(upload_wm)(struct brw_context *brw)
       if (wm_prog_data->base.use_alt_mode)
          wm.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
 
-      wm.SamplerCount = GEN_GEN == 5 ?
+      /* WA_1606682166 */
+      wm.SamplerCount = (GEN_GEN == 5 || GEN_GEN == 11) ?
          0 : DIV_ROUND_UP(stage_state->sampler_count, 4);
 
       wm.BindingTableEntryCount =
          wm_prog_data->base.binding_table.size_bytes / 4;
       wm.MaximumNumberofThreads = devinfo->max_wm_threads - 1;
-      wm._8PixelDispatchEnable = wm_prog_data->dispatch_8;
-      wm._16PixelDispatchEnable = wm_prog_data->dispatch_16;
-      wm.DispatchGRFStartRegisterForConstantSetupData0 =
-         wm_prog_data->base.dispatch_grf_start_reg;
-      if (GEN_GEN == 6 ||
-          wm_prog_data->dispatch_8 || wm_prog_data->dispatch_16) {
-         wm.KernelStartPointer0 = KSP(brw, stage_state->prog_offset);
-      }
-
-#if GEN_GEN >= 5
-      if (GEN_GEN == 6 || wm_prog_data->prog_offset_2) {
-         wm.KernelStartPointer2 =
-            KSP(brw, stage_state->prog_offset + wm_prog_data->prog_offset_2);
-      }
-#endif
 
 #if GEN_GEN == 6
       wm.DualSourceBlendEnable =
@@ -1889,13 +2081,10 @@ genX(upload_wm)(struct brw_context *brw)
          wm.PositionXYOffsetSelect = POSOFFSET_SAMPLE;
       else
          wm.PositionXYOffsetSelect = POSOFFSET_NONE;
-
-      wm.DispatchGRFStartRegisterForConstantSetupData2 =
-         wm_prog_data->dispatch_grf_start_reg_2;
 #endif
 
       if (wm_prog_data->base.total_scratch) {
-         wm.ScratchSpaceBasePointer = rw_bo(stage_state->scratch_bo, 0);
+         wm.ScratchSpaceBasePointer = rw_32_bo(stage_state->scratch_bo, 0);
          wm.PerThreadScratchSpace =
             ffs(stage_state->per_thread_scratch) - 11;
       }
@@ -2014,16 +2203,33 @@ static const struct brw_tracked_state genX(wm_state) = {
 
 /* ---------------------------------------------------------------------- */
 
+/* We restrict scratch buffers to the bottom 32 bits of the address space
+ * by using rw_32_bo().
+ *
+ * General State Base Address is a bit broken.  If the address + size as
+ * seen by STATE_BASE_ADDRESS overflows 48 bits, the GPU appears to treat
+ * all accesses to the buffer as being out of bounds and returns zero.
+ */
+
 #define INIT_THREAD_DISPATCH_FIELDS(pkt, prefix) \
    pkt.KernelStartPointer = KSP(brw, stage_state->prog_offset);           \
+   /* WA_1606682166 */                                                    \
    pkt.SamplerCount       =                                               \
+      GEN_GEN == 11 ?                                                     \
+      0 :                                                                 \
       DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4);          \
+   /* Gen 11 workarounds table #2056 WABTPPrefetchDisable suggests to     \
+    * disable prefetching of binding tables in A0 and B0 steppings.       \
+    * TODO: Revisit this WA on C0 stepping.                               \
+    */                                                                    \
    pkt.BindingTableEntryCount =                                           \
+      GEN_GEN == 11 ?                                                     \
+      0 :                                                                 \
       stage_prog_data->binding_table.size_bytes / 4;                      \
    pkt.FloatingPointMode  = stage_prog_data->use_alt_mode;                \
                                                                           \
    if (stage_prog_data->total_scratch) {                                  \
-      pkt.ScratchSpaceBasePointer = rw_bo(stage_state->scratch_bo, 0);    \
+      pkt.ScratchSpaceBasePointer = rw_32_bo(stage_state->scratch_bo, 0); \
       pkt.PerThreadScratchSpace =                                         \
          ffs(stage_state->per_thread_scratch) - 11;                       \
    }                                                                      \
@@ -2068,8 +2274,8 @@ genX(upload_vs_state)(struct brw_context *brw)
    brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_VS), cvs) {
       if (stage_state->push_const_size != 0) {
          cvs.Buffer0Valid = true;
-         cvs.PointertoVSConstantBuffer0 = stage_state->push_const_offset;
-         cvs.VSConstantBuffer0ReadLength = stage_state->push_const_size - 1;
+         cvs.ConstantBody.PointertoConstantBuffer0 = stage_state->push_const_offset;
+         cvs.ConstantBody.ConstantBuffer0ReadLength = stage_state->push_const_size - 1;
       }
    }
 #endif
@@ -2185,8 +2391,14 @@ genX(upload_cc_viewport)(struct brw_context *brw)
    for (unsigned i = 0; i < viewport_count; i++) {
       /* _NEW_VIEWPORT | _NEW_TRANSFORM */
       const struct gl_viewport_attrib *vp = &ctx->ViewportArray[i];
-      if (ctx->Transform.DepthClamp) {
+      if (ctx->Transform.DepthClampNear && ctx->Transform.DepthClampFar) {
+         ccv.MinimumDepth = MIN2(vp->Near, vp->Far);
+         ccv.MaximumDepth = MAX2(vp->Near, vp->Far);
+      } else if (ctx->Transform.DepthClampNear) {
          ccv.MinimumDepth = MIN2(vp->Near, vp->Far);
+         ccv.MaximumDepth = 0.0;
+      } else if (ctx->Transform.DepthClampFar) {
+         ccv.MinimumDepth = 0.0;
          ccv.MaximumDepth = MAX2(vp->Near, vp->Far);
       } else {
          ccv.MinimumDepth = 0.0;
@@ -2226,7 +2438,7 @@ const struct brw_tracked_state genX(cc_vp) = {
 
 static void
 set_scissor_bits(const struct gl_context *ctx, int i,
-                 bool render_to_fbo, unsigned fb_width, unsigned fb_height,
+                 bool flip_y, unsigned fb_width, unsigned fb_height,
                  struct GENX(SCISSOR_RECT) *sc)
 {
    int bbox[4];
@@ -2248,7 +2460,7 @@ set_scissor_bits(const struct gl_context *ctx, int i,
       sc->ScissorRectangleXMax = 0;
       sc->ScissorRectangleYMin = 1;
       sc->ScissorRectangleYMax = 0;
-   } else if (render_to_fbo) {
+   } else if (!flip_y) {
       /* texmemory: Y=0=bottom */
       sc->ScissorRectangleXMin = bbox[0];
       sc->ScissorRectangleXMax = bbox[1] - 1;
@@ -2268,7 +2480,7 @@ static void
 genX(upload_scissor_state)(struct brw_context *brw)
 {
    struct gl_context *ctx = &brw->ctx;
-   const bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
+   const bool flip_y = ctx->DrawBuffer->FlipY;
    struct GENX(SCISSOR_RECT) scissor;
    uint32_t scissor_state_offset;
    const unsigned int fb_width = _mesa_geometric_width(ctx->DrawBuffer);
@@ -2292,7 +2504,7 @@ genX(upload_scissor_state)(struct brw_context *brw)
     * inclusive but max is exclusive.
     */
    for (unsigned i = 0; i < viewport_count; i++) {
-      set_scissor_bits(ctx, i, render_to_fbo, fb_width, fb_height, &scissor);
+      set_scissor_bits(ctx, i, flip_y, fb_width, fb_height, &scissor);
       GENX(SCISSOR_RECT_pack)(
          NULL, scissor_map + i * GENX(SCISSOR_RECT_length), &scissor);
    }
@@ -2361,6 +2573,17 @@ brw_calculate_guardband_size(uint32_t fb_width, uint32_t fb_height,
     */
    const float gb_size = GEN_GEN >= 7 ? 16384.0f : 8192.0f;
 
+   /* Workaround: prevent gpu hangs on SandyBridge
+    * by disabling guardband clipping for odd dimensions.
+    */
+   if (GEN_GEN == 6 && (fb_width & 1 || fb_height & 1)) {
+      *xmin = -1.0f;
+      *xmax =  1.0f;
+      *ymin = -1.0f;
+      *ymax =  1.0f;
+      return;
+   }
+
    if (m00 != 0 && m11 != 0) {
       /* First, we compute the screen-space render area */
       const float ss_ra_xmin = MIN3(        0, m30 + m00, m30 - m00);
@@ -2407,7 +2630,7 @@ genX(upload_sf_clip_viewport)(struct brw_context *brw)
    const unsigned viewport_count = brw->clip.viewport_count;
 
    /* _NEW_BUFFERS */
-   const bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
+   const bool flip_y = ctx->DrawBuffer->FlipY;
    const uint32_t fb_width = (float)_mesa_geometric_width(ctx->DrawBuffer);
    const uint32_t fb_height = (float)_mesa_geometric_height(ctx->DrawBuffer);
 
@@ -2431,12 +2654,12 @@ genX(upload_sf_clip_viewport)(struct brw_context *brw)
 #endif
 
    /* _NEW_BUFFERS */
-   if (render_to_fbo) {
-      y_scale = 1.0;
-      y_bias = 0;
-   } else {
+   if (flip_y) {
       y_scale = -1.0;
       y_bias = (float)fb_height;
+   } else {
+      y_scale = 1.0;
+      y_bias = 0;
    }
 
    for (unsigned i = 0; i < brw->clip.viewport_count; i++) {
@@ -2464,7 +2687,7 @@ genX(upload_sf_clip_viewport)(struct brw_context *brw)
       clv.YMaxClipGuardband = gb_ymax;
 
 #if GEN_GEN < 6
-      set_scissor_bits(ctx, i, render_to_fbo, fb_width, fb_height,
+      set_scissor_bits(ctx, i, flip_y, fb_width, fb_height,
                        &sfv.ScissorRectangle);
 #elif GEN_GEN >= 8
       /* _NEW_VIEWPORT | _NEW_BUFFERS: Screen Space Viewport
@@ -2481,16 +2704,16 @@ genX(upload_sf_clip_viewport)(struct brw_context *brw)
       const float viewport_Ymax =
          MIN2(ctx->ViewportArray[i].Y + ctx->ViewportArray[i].Height, fb_height);
 
-      if (render_to_fbo) {
+      if (flip_y) {
          sfv.XMinViewPort = viewport_Xmin;
          sfv.XMaxViewPort = viewport_Xmax - 1;
-         sfv.YMinViewPort = viewport_Ymin;
-         sfv.YMaxViewPort = viewport_Ymax - 1;
+         sfv.YMinViewPort = fb_height - viewport_Ymax;
+         sfv.YMaxViewPort = fb_height - viewport_Ymin - 1;
       } else {
          sfv.XMinViewPort = viewport_Xmin;
          sfv.XMaxViewPort = viewport_Xmax - 1;
-         sfv.YMinViewPort = fb_height - viewport_Ymax;
-         sfv.YMaxViewPort = fb_height - viewport_Ymin - 1;
+         sfv.YMinViewPort = viewport_Ymin;
+         sfv.YMaxViewPort = viewport_Ymax - 1;
       }
 #endif
 
@@ -2560,8 +2783,8 @@ genX(upload_gs_state)(struct brw_context *brw)
    brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_GS), cgs) {
       if (active && stage_state->push_const_size != 0) {
          cgs.Buffer0Valid = true;
-         cgs.PointertoGSConstantBuffer0 = stage_state->push_const_offset;
-         cgs.GSConstantBuffer0ReadLength = stage_state->push_const_size - 1;
+         cgs.ConstantBody.PointertoConstantBuffer0 = stage_state->push_const_offset;
+         cgs.ConstantBody.ConstantBuffer0ReadLength = stage_state->push_const_size - 1;
       }
    }
 #endif
@@ -2642,7 +2865,7 @@ genX(upload_gs_state)(struct brw_context *brw)
 #if GEN_GEN < 7
          gs.SOStatisticsEnable = true;
          if (gs_prog->info.has_transform_feedback_varyings)
-            gs.SVBIPayloadEnable = true;
+            gs.SVBIPayloadEnable = _mesa_is_xfb_active_and_unpaused(ctx);
 
          /* GEN6_GS_SPF_MODE and GEN6_GS_VECTOR_MASK_ENABLE are enabled as it
           * was previously done for gen6.
@@ -2878,7 +3101,26 @@ set_blend_entry_bits(struct brw_context *brw, BLEND_ENTRY_GENXML *entry, int i,
          dstA = fix_dual_blend_alpha_to_one(dstA);
       }
 
-      entry->ColorBufferBlendEnable = true;
+      /* BRW_NEW_FS_PROG_DATA */
+      const struct brw_wm_prog_data *wm_prog_data =
+         brw_wm_prog_data(brw->wm.base.prog_data);
+
+      /* The Dual Source Blending documentation says:
+       *
+       * "If SRC1 is included in a src/dst blend factor and
+       * a DualSource RT Write message is not used, results
+       * are UNDEFINED. (This reflects the same restriction in DX APIs,
+       * where undefined results are produced if “o1” is not written
+       * by a PS – there are no default values defined).
+       * If SRC1 is not included in a src/dst blend factor,
+       * dual source blending must be disabled."
+       *
+       * There is no way to gracefully fix this undefined situation
+       * so we just disable the blending to prevent possible issues.
+       */
+      entry->ColorBufferBlendEnable =
+         !ctx->Color.Blend[0]._UsesDualSrc || wm_prog_data->dual_src_blend;
+
       entry->DestinationBlendFactor = blend_factor(dstRGB);
       entry->SourceBlendFactor = blend_factor(srcRGB);
       entry->DestinationAlphaBlendFactor = blend_factor(dstA);
@@ -3024,6 +3266,7 @@ static const struct brw_tracked_state genX(blend_state) = {
               _NEW_MULTISAMPLE,
       .brw = BRW_NEW_BATCH |
              BRW_NEW_BLORP |
+             BRW_NEW_FS_PROG_DATA |
              BRW_NEW_STATE_BASE_ADDRESS,
    },
    .emit = genX(upload_blend_state),
@@ -3421,14 +3664,14 @@ genX(upload_sbe)(struct brw_context *brw)
       sbe.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
 
       /* _NEW_BUFFERS */
-      bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
+      bool flip_y = ctx->DrawBuffer->FlipY;
 
       /* _NEW_POINT
        *
        * Window coordinates in an FBO are inverted, which means point
        * sprite origin must be inverted.
        */
-      if ((ctx->Point.SpriteOrigin == GL_LOWER_LEFT) != render_to_fbo)
+      if ((ctx->Point.SpriteOrigin == GL_LOWER_LEFT) == flip_y)
          sbe.PointSpriteTextureCoordinateOrigin = LOWERLEFT;
       else
          sbe.PointSpriteTextureCoordinateOrigin = UPPERLEFT;
@@ -3623,19 +3866,20 @@ genX(upload_3dstate_so_buffers)(struct brw_context *brw)
    for (int i = 0; i < 4; i++) {
       struct intel_buffer_object *bufferobj =
          intel_buffer_object(xfb_obj->Buffers[i]);
+      uint32_t start = xfb_obj->Offset[i];
+      uint32_t end = ALIGN(start + xfb_obj->Size[i], 4);
+      uint32_t const size = end - start;
 
-      if (!bufferobj) {
+      if (!bufferobj || !size) {
          brw_batch_emit(brw, GENX(3DSTATE_SO_BUFFER), sob) {
             sob.SOBufferIndex = i;
          }
          continue;
       }
 
-      uint32_t start = xfb_obj->Offset[i];
       assert(start % 4 == 0);
-      uint32_t end = ALIGN(start + xfb_obj->Size[i], 4);
       struct brw_bo *bo =
-         intel_bufferobj_buffer(brw, bufferobj, start, end - start, true);
+         intel_bufferobj_buffer(brw, bufferobj, start, size, true);
       assert(end <= bo->size);
 
       brw_batch_emit(brw, GENX(3DSTATE_SO_BUFFER), sob) {
@@ -3649,7 +3893,7 @@ genX(upload_3dstate_so_buffers)(struct brw_context *brw)
          sob.SOBufferEnable = true;
          sob.StreamOffsetWriteEnable = true;
          sob.StreamOutputBufferOffsetAddressEnable = true;
-         sob.SOBufferMOCS = mocs_wb;
+         sob.MOCS = mocs_wb;
 
          sob.SurfaceSize = MAX2(xfb_obj->Size[i] / 4, 1) - 1;
          sob.StreamOutputBufferOffsetAddress =
@@ -3803,11 +4047,22 @@ genX(upload_ps)(struct brw_context *brw)
        */
       ps.VectorMaskEnable = GEN_GEN >= 8;
 
-      ps.SamplerCount =
-         DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4);
+      /* WA_1606682166:
+       * "Incorrect TDL's SSP address shift in SARB for 16:6 & 18:8 modes.
+       * Disable the Sampler state prefetch functionality in the SARB by
+       * programming 0xB000[30] to '1'."
+       */
+      ps.SamplerCount = GEN_GEN == 11 ?
+         0 : DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4);
 
       /* BRW_NEW_FS_PROG_DATA */
-      ps.BindingTableEntryCount = prog_data->base.binding_table.size_bytes / 4;
+      /* Gen 11 workarounds table #2056 WABTPPrefetchDisable suggests to disable
+       * prefetching of binding tables in A0 and B0 steppings.
+       * TODO: Revisit this workaround on C0 stepping.
+       */
+      ps.BindingTableEntryCount = GEN_GEN == 11 ?
+                                  0 :
+                                  prog_data->base.binding_table.size_bytes / 4;
 
       if (prog_data->base.use_alt_mode)
          ps.FloatingPointMode = Alternate;
@@ -3883,19 +4138,42 @@ genX(upload_ps)(struct brw_context *brw)
 
       ps._8PixelDispatchEnable = prog_data->dispatch_8;
       ps._16PixelDispatchEnable = prog_data->dispatch_16;
+      ps._32PixelDispatchEnable = prog_data->dispatch_32;
+
+      /* From the Sky Lake PRM 3DSTATE_PS::32 Pixel Dispatch Enable:
+       *
+       *    "When NUM_MULTISAMPLES = 16 or FORCE_SAMPLE_COUNT = 16, SIMD32
+       *    Dispatch must not be enabled for PER_PIXEL dispatch mode."
+       *
+       * Since 16x MSAA is first introduced on SKL, we don't need to apply
+       * the workaround on any older hardware.
+       *
+       * BRW_NEW_NUM_SAMPLES
+       */
+      if (GEN_GEN >= 9 && !prog_data->persample_dispatch &&
+          brw->num_samples == 16) {
+         assert(ps._8PixelDispatchEnable || ps._16PixelDispatchEnable);
+         ps._32PixelDispatchEnable = false;
+      }
+
       ps.DispatchGRFStartRegisterForConstantSetupData0 =
-         prog_data->base.dispatch_grf_start_reg;
+         brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 0);
+      ps.DispatchGRFStartRegisterForConstantSetupData1 =
+         brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 1);
       ps.DispatchGRFStartRegisterForConstantSetupData2 =
-         prog_data->dispatch_grf_start_reg_2;
+         brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 2);
 
-      ps.KernelStartPointer0 = stage_state->prog_offset;
+      ps.KernelStartPointer0 = stage_state->prog_offset +
+                               brw_wm_prog_data_prog_offset(prog_data, ps, 0);
+      ps.KernelStartPointer1 = stage_state->prog_offset +
+                               brw_wm_prog_data_prog_offset(prog_data, ps, 1);
       ps.KernelStartPointer2 = stage_state->prog_offset +
-         prog_data->prog_offset_2;
+                               brw_wm_prog_data_prog_offset(prog_data, ps, 2);
 
       if (prog_data->base.total_scratch) {
          ps.ScratchSpaceBasePointer =
-            rw_bo(stage_state->scratch_bo,
-                  ffs(stage_state->per_thread_scratch) - 11);
+            rw_32_bo(stage_state->scratch_bo,
+                     ffs(stage_state->per_thread_scratch) - 11);
       }
    }
 }
@@ -3908,7 +4186,8 @@ static const struct brw_tracked_state genX(ps_state) = {
                             : 0),
       .brw   = BRW_NEW_BATCH |
                BRW_NEW_BLORP |
-               BRW_NEW_FS_PROG_DATA,
+               BRW_NEW_FS_PROG_DATA |
+               (GEN_GEN >= 9 ? BRW_NEW_NUM_SAMPLES : 0),
    },
    .emit = genX(upload_ps),
 };
@@ -4216,7 +4495,7 @@ genX(upload_cs_state)(struct brw_context *brw)
              */
             per_thread_scratch_value = stage_state->per_thread_scratch / 1024 - 1;
          }
-         vfe.ScratchSpaceBasePointer = rw_bo(stage_state->scratch_bo, 0);
+         vfe.ScratchSpaceBasePointer = rw_32_bo(stage_state->scratch_bo, 0);
          vfe.PerThreadScratchSpace = per_thread_scratch_value;
       }
 
@@ -4315,6 +4594,107 @@ static const struct brw_tracked_state genX(cs_state) = {
    .emit = genX(upload_cs_state)
 };
 
+#define GPGPU_DISPATCHDIMX 0x2500
+#define GPGPU_DISPATCHDIMY 0x2504
+#define GPGPU_DISPATCHDIMZ 0x2508
+
+#define MI_PREDICATE_SRC0  0x2400
+#define MI_PREDICATE_SRC1  0x2408
+
+static void
+prepare_indirect_gpgpu_walker(struct brw_context *brw)
+{
+   GLintptr indirect_offset = brw->compute.num_work_groups_offset;
+   struct brw_bo *bo = brw->compute.num_work_groups_bo;
+
+   emit_lrm(brw, GPGPU_DISPATCHDIMX, ro_bo(bo, indirect_offset + 0));
+   emit_lrm(brw, GPGPU_DISPATCHDIMY, ro_bo(bo, indirect_offset + 4));
+   emit_lrm(brw, GPGPU_DISPATCHDIMZ, ro_bo(bo, indirect_offset + 8));
+
+#if GEN_GEN <= 7
+   /* Clear upper 32-bits of SRC0 and all 64-bits of SRC1 */
+   emit_lri(brw, MI_PREDICATE_SRC0 + 4, 0);
+   emit_lri(brw, MI_PREDICATE_SRC1    , 0);
+   emit_lri(brw, MI_PREDICATE_SRC1 + 4, 0);
+
+   /* Load compute_dispatch_indirect_x_size into SRC0 */
+   emit_lrm(brw, MI_PREDICATE_SRC0, ro_bo(bo, indirect_offset + 0));
+
+   /* predicate = (compute_dispatch_indirect_x_size == 0); */
+   brw_batch_emit(brw, GENX(MI_PREDICATE), mip) {
+      mip.LoadOperation    = LOAD_LOAD;
+      mip.CombineOperation = COMBINE_SET;
+      mip.CompareOperation = COMPARE_SRCS_EQUAL;
+   }
+
+   /* Load compute_dispatch_indirect_y_size into SRC0 */
+   emit_lrm(brw, MI_PREDICATE_SRC0, ro_bo(bo, indirect_offset + 4));
+
+   /* predicate |= (compute_dispatch_indirect_y_size == 0); */
+   brw_batch_emit(brw, GENX(MI_PREDICATE), mip) {
+      mip.LoadOperation    = LOAD_LOAD;
+      mip.CombineOperation = COMBINE_OR;
+      mip.CompareOperation = COMPARE_SRCS_EQUAL;
+   }
+
+   /* Load compute_dispatch_indirect_z_size into SRC0 */
+   emit_lrm(brw, MI_PREDICATE_SRC0, ro_bo(bo, indirect_offset + 8));
+
+   /* predicate |= (compute_dispatch_indirect_z_size == 0); */
+   brw_batch_emit(brw, GENX(MI_PREDICATE), mip) {
+      mip.LoadOperation    = LOAD_LOAD;
+      mip.CombineOperation = COMBINE_OR;
+      mip.CompareOperation = COMPARE_SRCS_EQUAL;
+   }
+
+   /* predicate = !predicate; */
+#define COMPARE_FALSE                           1
+   brw_batch_emit(brw, GENX(MI_PREDICATE), mip) {
+      mip.LoadOperation    = LOAD_LOADINV;
+      mip.CombineOperation = COMBINE_OR;
+      mip.CompareOperation = COMPARE_FALSE;
+   }
+#endif
+}
+
+static void
+genX(emit_gpgpu_walker)(struct brw_context *brw)
+{
+   const struct brw_cs_prog_data *prog_data =
+      brw_cs_prog_data(brw->cs.base.prog_data);
+
+   const GLuint *num_groups = brw->compute.num_work_groups;
+
+   bool indirect = brw->compute.num_work_groups_bo != NULL;
+   if (indirect)
+      prepare_indirect_gpgpu_walker(brw);
+
+   const unsigned simd_size = prog_data->simd_size;
+   unsigned group_size = prog_data->local_size[0] *
+      prog_data->local_size[1] * prog_data->local_size[2];
+
+   uint32_t right_mask = 0xffffffffu >> (32 - simd_size);
+   const unsigned right_non_aligned = group_size & (simd_size - 1);
+   if (right_non_aligned != 0)
+      right_mask >>= (simd_size - right_non_aligned);
+
+   brw_batch_emit(brw, GENX(GPGPU_WALKER), ggw) {
+      ggw.IndirectParameterEnable      = indirect;
+      ggw.PredicateEnable              = GEN_GEN <= 7 && indirect;
+      ggw.SIMDSize                     = prog_data->simd_size / 16;
+      ggw.ThreadDepthCounterMaximum    = 0;
+      ggw.ThreadHeightCounterMaximum   = 0;
+      ggw.ThreadWidthCounterMaximum    = prog_data->threads - 1;
+      ggw.ThreadGroupIDXDimension      = num_groups[0];
+      ggw.ThreadGroupIDYDimension      = num_groups[1];
+      ggw.ThreadGroupIDZDimension      = num_groups[2];
+      ggw.RightExecutionMask           = right_mask;
+      ggw.BottomExecutionMask          = 0xffffffff;
+   }
+
+   brw_batch_emit(brw, GENX(MEDIA_STATE_FLUSH), msf);
+}
+
 #endif
 
 /* ---------------------------------------------------------------------- */
@@ -4326,7 +4706,7 @@ genX(upload_raster)(struct brw_context *brw)
    const struct gl_context *ctx = &brw->ctx;
 
    /* _NEW_BUFFERS */
-   const bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
+   const bool flip_y = ctx->DrawBuffer->FlipY;
 
    /* _NEW_POLYGON */
    const struct gl_polygon_attrib *polygon = &ctx->Polygon;
@@ -4335,7 +4715,7 @@ genX(upload_raster)(struct brw_context *brw)
    const struct gl_point_attrib *point = &ctx->Point;
 
    brw_batch_emit(brw, GENX(3DSTATE_RASTER), raster) {
-      if (brw->polygon_front_bit == render_to_fbo)
+      if (brw->polygon_front_bit != flip_y)
          raster.FrontWinding = CounterClockwise;
 
       if (polygon->CullFlag) {
@@ -4410,14 +4790,19 @@ genX(upload_raster)(struct brw_context *brw)
       raster.ScissorRectangleEnable = ctx->Scissor.EnableFlags;
 
       /* _NEW_TRANSFORM */
-      if (!ctx->Transform.DepthClamp) {
+#if GEN_GEN < 9
+      if (!(ctx->Transform.DepthClampNear &&
+            ctx->Transform.DepthClampFar))
+         raster.ViewportZClipTestEnable = true;
+#endif
+
 #if GEN_GEN >= 9
-         raster.ViewportZFarClipTestEnable = true;
+      if (!ctx->Transform.DepthClampNear)
          raster.ViewportZNearClipTestEnable = true;
-#else
-         raster.ViewportZClipTestEnable = true;
+
+      if (!ctx->Transform.DepthClampFar)
+         raster.ViewportZFarClipTestEnable = true;
 #endif
-      }
 
       /* BRW_NEW_CONSERVATIVE_RASTERIZATION */
 #if GEN_GEN >= 9
@@ -4620,7 +5005,25 @@ genX(upload_ps_blend)(struct brw_context *brw)
             dstA = fix_dual_blend_alpha_to_one(dstA);
          }
 
-         pb.ColorBufferBlendEnable = true;
+         /* BRW_NEW_FS_PROG_DATA */
+         const struct brw_wm_prog_data *wm_prog_data =
+            brw_wm_prog_data(brw->wm.base.prog_data);
+
+         /* The Dual Source Blending documentation says:
+          *
+          * "If SRC1 is included in a src/dst blend factor and
+          * a DualSource RT Write message is not used, results
+          * are UNDEFINED. (This reflects the same restriction in DX APIs,
+          * where undefined results are produced if “o1” is not written
+          * by a PS – there are no default values defined).
+          * If SRC1 is not included in a src/dst blend factor,
+          * dual source blending must be disabled."
+          *
+          * There is no way to gracefully fix this undefined situation
+          * so we just disable the blending to prevent possible issues.
+          */
+         pb.ColorBufferBlendEnable =
+            !color->Blend[0]._UsesDualSrc || wm_prog_data->dual_src_blend;
          pb.SourceAlphaBlendFactor = brw_translate_blend_factor(srcA);
          pb.DestinationAlphaBlendFactor = brw_translate_blend_factor(dstA);
          pb.SourceBlendFactor = brw_translate_blend_factor(srcRGB);
@@ -4639,7 +5042,8 @@ static const struct brw_tracked_state genX(ps_blend) = {
               _NEW_MULTISAMPLE,
       .brw = BRW_NEW_BLORP |
              BRW_NEW_CONTEXT |
-             BRW_NEW_FRAGMENT_PROGRAM,
+             BRW_NEW_FRAGMENT_PROGRAM |
+             BRW_NEW_FS_PROG_DATA,
    },
    .emit = genX(upload_ps_blend)
 };
@@ -4688,8 +5092,8 @@ genX(emit_mi_report_perf_count)(struct brw_context *brw,
  * Emit a 3DSTATE_SAMPLER_STATE_POINTERS_{VS,HS,GS,DS,PS} packet.
  */
 static void
-genX(emit_sampler_state_pointers_xs)(struct brw_context *brw,
-                                     struct brw_stage_state *stage_state)
+genX(emit_sampler_state_pointers_xs)(MAYBE_UNUSED struct brw_context *brw,
+                                     MAYBE_UNUSED struct brw_stage_state *stage_state)
 {
 #if GEN_GEN >= 7
    static const uint16_t packet_headers[] = {
@@ -4729,7 +5133,7 @@ has_component(mesa_format format, int i)
 static void
 genX(upload_default_color)(struct brw_context *brw,
                            const struct gl_sampler_object *sampler,
-                           mesa_format format, GLenum base_format,
+                           MAYBE_UNUSED mesa_format format, GLenum base_format,
                            bool is_integer_format, bool is_stencil_sampling,
                            uint32_t *sdc_offset)
 {
@@ -4905,7 +5309,7 @@ genX(upload_default_color)(struct brw_context *brw,
 }
 
 static uint32_t
-translate_wrap_mode(struct brw_context *brw, GLenum wrap, bool using_nearest)
+translate_wrap_mode(GLenum wrap, MAYBE_UNUSED bool using_nearest)
 {
    switch (wrap) {
    case GL_REPEAT:
@@ -4970,8 +5374,7 @@ genX(update_sampler_state)(struct brw_context *brw,
                            mesa_format format, GLenum base_format,
                            const struct gl_texture_object *texObj,
                            const struct gl_sampler_object *sampler,
-                           uint32_t *sampler_state,
-                           uint32_t batch_offset_for_sampler_state)
+                           uint32_t *sampler_state)
 {
    struct GENX(SAMPLER_STATE) samp_st = { 0 };
 
@@ -5039,9 +5442,9 @@ genX(update_sampler_state)(struct brw_context *brw,
 
    bool either_nearest =
       sampler->MinFilter == GL_NEAREST || sampler->MagFilter == GL_NEAREST;
-   unsigned wrap_s = translate_wrap_mode(brw, sampler->WrapS, either_nearest);
-   unsigned wrap_t = translate_wrap_mode(brw, sampler->WrapT, either_nearest);
-   unsigned wrap_r = translate_wrap_mode(brw, sampler->WrapR, either_nearest);
+   unsigned wrap_s = translate_wrap_mode(sampler->WrapS, either_nearest);
+   unsigned wrap_t = translate_wrap_mode(sampler->WrapT, either_nearest);
+   unsigned wrap_r = translate_wrap_mode(sampler->WrapR, either_nearest);
 
    if (target == GL_TEXTURE_CUBE_MAP ||
        target == GL_TEXTURE_CUBE_MAP_ARRAY) {
@@ -5134,8 +5537,7 @@ genX(update_sampler_state)(struct brw_context *brw,
 static void
 update_sampler_state(struct brw_context *brw,
                      int unit,
-                     uint32_t *sampler_state,
-                     uint32_t batch_offset_for_sampler_state)
+                     uint32_t *sampler_state)
 {
    struct gl_context *ctx = &brw->ctx;
    const struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
@@ -5152,7 +5554,7 @@ update_sampler_state(struct brw_context *brw,
                               texUnit->LodBias,
                               firstImage->TexFormat, firstImage->_BaseFormat,
                               texObj, sampler,
-                              sampler_state, batch_offset_for_sampler_state);
+                              sampler_state);
 }
 
 static void
@@ -5177,19 +5579,15 @@ genX(upload_sampler_state_table)(struct brw_context *brw,
                                              32, &stage_state->sampler_offset);
    /* memset(sampler_state, 0, sampler_count * size_in_bytes); */
 
-   uint32_t batch_offset_for_sampler_state = stage_state->sampler_offset;
-
    for (unsigned s = 0; s < sampler_count; s++) {
       if (SamplersUsed & (1 << s)) {
          const unsigned unit = prog->SamplerUnits[s];
          if (ctx->Texture.Unit[unit]._Current) {
-            update_sampler_state(brw, unit, sampler_state,
-                                 batch_offset_for_sampler_state);
+            update_sampler_state(brw, unit, sampler_state);
          }
       }
 
       sampler_state += dwords;
-      batch_offset_for_sampler_state += size_in_bytes;
    }
 
    if (GEN_GEN >= 7 && stage_state->stage != MESA_SHADER_COMPUTE) {
@@ -5556,7 +5954,7 @@ genX(init_atoms)(struct brw_context *brw)
 
       &genX(scissor_state),
 
-      &gen7_depthbuffer,
+      &brw_depthbuffer,
 
       &genX(polygon_stipple),
       &genX(polygon_stipple_offset),
@@ -5647,7 +6045,7 @@ genX(init_atoms)(struct brw_context *brw)
 
       &genX(scissor_state),
 
-      &gen7_depthbuffer,
+      &brw_depthbuffer,
 
       &genX(polygon_stipple),
       &genX(polygon_stipple_offset),
@@ -5690,5 +6088,6 @@ genX(init_atoms)(struct brw_context *brw)
                            compute_atoms, ARRAY_SIZE(compute_atoms));
 
    brw->vtbl.emit_mi_report_perf_count = genX(emit_mi_report_perf_count);
+   brw->vtbl.emit_compute_walker = genX(emit_gpgpu_walker);
 #endif
 }