#include "dev/gen_device_info.h"
#include "common/gen_sample_positions.h"
#include "genxml/gen_macros.h"
+#include "common/gen_guardband.h"
#include "main/bufferobj.h"
#include "main/context.h"
#include "main/macros.h"
#include "main/state.h"
+#include "genX_boilerplate.h"
+
#include "brw_context.h"
#include "brw_draw.h"
#include "brw_multisample_state.h"
#include "main/viewport.h"
#include "util/half_float.h"
-UNUSED static void *
-emit_dwords(struct brw_context *brw, unsigned n)
-{
- intel_batchbuffer_begin(brw, n);
- uint32_t *map = brw->batch.map_next;
- brw->batch.map_next += n;
- intel_batchbuffer_advance(brw);
- return map;
-}
-
-struct brw_address {
- struct brw_bo *bo;
- unsigned reloc_flags;
- uint32_t offset;
-};
-
-#define __gen_address_type struct brw_address
-#define __gen_user_data struct brw_context
-
-static uint64_t
-__gen_combine_address(struct brw_context *brw, void *location,
- struct brw_address address, uint32_t delta)
-{
- struct intel_batchbuffer *batch = &brw->batch;
- uint32_t offset;
-
- if (address.bo == NULL) {
- return address.offset + delta;
- } else {
- if (GEN_GEN < 6 && brw_ptr_in_state_buffer(batch, location)) {
- offset = (char *) location - (char *) brw->batch.state.map;
- return brw_state_reloc(batch, offset, address.bo,
- address.offset + delta,
- address.reloc_flags);
- }
-
- assert(!brw_ptr_in_state_buffer(batch, location));
-
- offset = (char *) location - (char *) brw->batch.batch.map;
- return brw_batch_reloc(batch, offset, address.bo,
- address.offset + delta,
- address.reloc_flags);
- }
-}
-
-UNUSED static struct brw_address
-rw_bo(struct brw_bo *bo, uint32_t offset)
-{
- return (struct brw_address) {
- .bo = bo,
- .offset = offset,
- .reloc_flags = RELOC_WRITE,
- };
-}
-
-static struct brw_address
-ro_bo(struct brw_bo *bo, uint32_t offset)
-{
- return (struct brw_address) {
- .bo = bo,
- .offset = offset,
- };
-}
-
-static struct brw_address
-rw_32_bo(struct brw_bo *bo, uint32_t offset)
-{
- return (struct brw_address) {
- .bo = bo,
- .offset = offset,
- .reloc_flags = RELOC_WRITE | RELOC_32BIT,
- };
-}
-
-static struct brw_address
-ro_32_bo(struct brw_bo *bo, uint32_t offset)
-{
- return (struct brw_address) {
- .bo = bo,
- .offset = offset,
- .reloc_flags = RELOC_32BIT,
- };
-}
-
-UNUSED static struct brw_address
-ggtt_bo(struct brw_bo *bo, uint32_t offset)
-{
- return (struct brw_address) {
- .bo = bo,
- .offset = offset,
- .reloc_flags = RELOC_WRITE | RELOC_NEEDS_GGTT,
- };
-}
-
#if GEN_GEN == 4
static struct brw_address
KSP(struct brw_context *brw, uint32_t offset)
}
#endif
-#include "genxml/genX_pack.h"
-
-#define _brw_cmd_length(cmd) cmd ## _length
-#define _brw_cmd_length_bias(cmd) cmd ## _length_bias
-#define _brw_cmd_header(cmd) cmd ## _header
-#define _brw_cmd_pack(cmd) cmd ## _pack
-
-#define brw_batch_emit(brw, cmd, name) \
- for (struct cmd name = { _brw_cmd_header(cmd) }, \
- *_dst = emit_dwords(brw, _brw_cmd_length(cmd)); \
- __builtin_expect(_dst != NULL, 1); \
- _brw_cmd_pack(cmd)(brw, (void *)_dst, &name), \
- _dst = NULL)
-
-#define brw_batch_emitn(brw, cmd, n, ...) ({ \
- uint32_t *_dw = emit_dwords(brw, n); \
- struct cmd template = { \
- _brw_cmd_header(cmd), \
- .DWordLength = n - _brw_cmd_length_bias(cmd), \
- __VA_ARGS__ \
- }; \
- _brw_cmd_pack(cmd)(brw, _dw, &template); \
- _dw + 1; /* Array starts at dw[1] */ \
- })
-
-#define brw_state_emit(brw, cmd, align, offset, name) \
- for (struct cmd name = {}, \
- *_dst = brw_state_batch(brw, _brw_cmd_length(cmd) * 4, \
- align, offset); \
- __builtin_expect(_dst != NULL, 1); \
- _brw_cmd_pack(cmd)(brw, (void *)_dst, &name), \
- _dst = NULL)
+#if GEN_GEN >= 7
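+/* Load a 32-bit MMIO register from a location in a buffer object. */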
+static void
+emit_lrm(struct brw_context *brw, uint32_t reg, struct brw_address addr)
+{
+ brw_batch_emit(brw, GENX(MI_LOAD_REGISTER_MEM), lrm) {
+ lrm.RegisterAddress = reg;
+ lrm.MemoryAddress = addr;
+ }
+}
+#endif
+
+#if GEN_GEN == 7
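+/* Load a 32-bit MMIO register with an immediate value. */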
+static void
+emit_lri(struct brw_context *brw, uint32_t reg, uint32_t imm)
+{
+ brw_batch_emit(brw, GENX(MI_LOAD_REGISTER_IMM), lri) {
+ lri.RegisterOffset = reg;
+ lri.DataDWord = imm;
+ }
+}
+#endif
/**
* Polygon stipple packet
unsigned buffer_nr,
struct brw_bo *bo,
unsigned start_offset,
- MAYBE_UNUSED unsigned end_offset,
+ UNUSED unsigned end_offset,
unsigned stride,
- MAYBE_UNUSED unsigned step_rate)
+ UNUSED unsigned step_rate)
{
struct GENX(VERTEX_BUFFER_STATE) buf_state = {
.VertexBufferIndex = buffer_nr,
#endif
#if GEN_GEN == 11
- .VertexBufferMOCS = ICL_MOCS_WB,
+ .MOCS = ICL_MOCS_WB,
#elif GEN_GEN == 10
- .VertexBufferMOCS = CNL_MOCS_WB,
+ .MOCS = CNL_MOCS_WB,
#elif GEN_GEN == 9
- .VertexBufferMOCS = SKL_MOCS_WB,
+ .MOCS = SKL_MOCS_WB,
#elif GEN_GEN == 8
- .VertexBufferMOCS = BDW_MOCS_WB,
+ .MOCS = BDW_MOCS_WB,
#elif GEN_GEN == 7
- .VertexBufferMOCS = GEN7_MOCS_L3,
+ .MOCS = GEN7_MOCS_L3,
#endif
};
* In the relocation world, we have no idea what the addresses will be, so
* we can't apply this workaround. Instead, we tell the kernel to move it
* to the low 4GB regardless.
+ *
+ * This HW issue is fixed on Gen11+, so the workaround is only applied on
+ * Gen8-10.
*/
static void
vf_invalidate_for_vb_48bit_transitions(struct brw_context *brw)
{
-#if GEN_GEN >= 8
+#if GEN_GEN >= 8 && GEN_GEN < 11
bool need_invalidate = false;
- unsigned i;
- for (i = 0; i < brw->vb.nr_buffers; i++) {
+ for (unsigned i = 0; i < brw->vb.nr_buffers; i++) {
uint16_t high_bits = pinned_bo_high_bits(brw->vb.buffers[i].bo);
if (high_bits != brw->vb.last_bo_high_bits[i]) {
}
}
- /* Don't bother with draw parameter buffers - those are generated by
- * the driver so we can select a consistent memory zone.
- */
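+ /* The draw parameter buffers are bound as extra vertex buffers, so their
+ * high address bits affect the VF cache as well.
+ */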
+ if (brw->draw.draw_params_bo) {
+ uint16_t high_bits = pinned_bo_high_bits(brw->draw.draw_params_bo);
+
+ if (brw->vb.last_bo_high_bits[brw->vb.nr_buffers] != high_bits) {
+ need_invalidate = true;
+ brw->vb.last_bo_high_bits[brw->vb.nr_buffers] = high_bits;
+ }
+ }
+
+ if (brw->draw.derived_draw_params_bo) {
+ uint16_t high_bits = pinned_bo_high_bits(brw->draw.derived_draw_params_bo);
+
+ if (brw->vb.last_bo_high_bits[brw->vb.nr_buffers + 1] != high_bits) {
+ need_invalidate = true;
+ brw->vb.last_bo_high_bits[brw->vb.nr_buffers + 1] = high_bits;
+ }
+ }
if (need_invalidate) {
- brw_emit_pipe_control_flush(brw, PIPE_CONTROL_VF_CACHE_INVALIDATE);
+ brw_emit_pipe_control_flush(brw, PIPE_CONTROL_VF_CACHE_INVALIDATE |
+ PIPE_CONTROL_CS_STALL);
}
#endif
}
brw_batch_emit(brw, GENX(3DSTATE_INDEX_BUFFER), ib) {
#if GEN_GEN < 8 && !GEN_IS_HASWELL
- ib.CutIndexEnable = brw->prim_restart.enable_cut_index;
+ assert(brw->ib.enable_cut_index == brw->prim_restart.enable_cut_index);
+ ib.CutIndexEnable = brw->ib.enable_cut_index;
#endif
ib.IndexFormat = brw_get_index_type(index_buffer->index_size);
*/
ib.BufferStartingAddress = ro_32_bo(brw->ib.bo, 0);
#if GEN_GEN >= 8
- ib.IndexBufferMOCS = GEN_GEN >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB;
+ ib.MOCS = GEN_GEN >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB;
ib.BufferSize = brw->ib.size;
#else
ib.BufferEndingAddress = ro_bo(brw->ib.bo, brw->ib.size - 1);
};
#endif
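+/* Enable the vertex fetch statistics counters, which feed the pipeline
+ * statistics queries.
+ */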
+static void
+genX(upload_vf_statistics)(struct brw_context *brw)
+{
+ brw_batch_emit(brw, GENX(3DSTATE_VF_STATISTICS), vf) {
+ vf.StatisticsEnable = true;
+ }
+}
+
+const struct brw_tracked_state genX(vf_statistics) = {
+ .dirty = {
+ .mesa = 0,
+ .brw = BRW_NEW_BLORP | BRW_NEW_CONTEXT,
+ },
+ .emit = genX(upload_vf_statistics),
+};
+
#if GEN_GEN >= 6
/**
* Determine the appropriate attribute override value to store into the
if (ctx->Line.SmoothFlag) {
sf.LineEndCapAntialiasingRegionWidth = _10pixels;
#if GEN_GEN <= 7
- sf.AntiAliasingEnable = true;
+ sf.AntialiasingEnable = true;
#endif
}
if (wm_prog_data->base.use_alt_mode)
wm.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
- /* WA_1606682166 */
- wm.SamplerCount = (GEN_GEN == 5 || GEN_GEN == 11) ?
+ wm.SamplerCount = GEN_GEN == 5 ?
0 : DIV_ROUND_UP(stage_state->sampler_count, 4);
wm.BindingTableEntryCount =
bbox[0] = MAX2(ctx->ViewportArray[i].X, 0);
bbox[1] = MIN2(bbox[0] + ctx->ViewportArray[i].Width, fb_width);
- bbox[2] = MAX2(ctx->ViewportArray[i].Y, 0);
+ bbox[2] = CLAMP(ctx->ViewportArray[i].Y, 0, fb_height);
bbox[3] = MIN2(bbox[2] + ctx->ViewportArray[i].Height, fb_height);
_mesa_intersect_scissor_bounding_box(ctx, i, bbox);
/* ---------------------------------------------------------------------- */
-static void
-brw_calculate_guardband_size(uint32_t fb_width, uint32_t fb_height,
- float m00, float m11, float m30, float m31,
- float *xmin, float *xmax,
- float *ymin, float *ymax)
-{
- /* According to the "Vertex X,Y Clamping and Quantization" section of the
- * Strips and Fans documentation:
- *
- * "The vertex X and Y screen-space coordinates are also /clamped/ to the
- * fixed-point "guardband" range supported by the rasterization hardware"
- *
- * and
- *
- * "In almost all circumstances, if an object’s vertices are actually
- * modified by this clamping (i.e., had X or Y coordinates outside of
- * the guardband extent the rendered object will not match the intended
- * result. Therefore software should take steps to ensure that this does
- * not happen - e.g., by clipping objects such that they do not exceed
- * these limits after the Drawing Rectangle is applied."
- *
- * I believe the fundamental restriction is that the rasterizer (in
- * the SF/WM stages) have a limit on the number of pixels that can be
- * rasterized. We need to ensure any coordinates beyond the rasterizer
- * limit are handled by the clipper. So effectively that limit becomes
- * the clipper's guardband size.
- *
- * It goes on to say:
- *
- * "In addition, in order to be correctly rendered, objects must have a
- * screenspace bounding box not exceeding 8K in the X or Y direction.
- * This additional restriction must also be comprehended by software,
- * i.e., enforced by use of clipping."
- *
- * This makes no sense. Gen7+ hardware supports 16K render targets,
- * and you definitely need to be able to draw polygons that fill the
- * surface. Our assumption is that the rasterizer was limited to 8K
- * on Sandybridge, which only supports 8K surfaces, and it was actually
- * increased to 16K on Ivybridge and later.
- *
- * So, limit the guardband to 16K on Gen7+ and 8K on Sandybridge.
- */
- const float gb_size = GEN_GEN >= 7 ? 16384.0f : 8192.0f;
-
- /* Workaround: prevent gpu hangs on SandyBridge
- * by disabling guardband clipping for odd dimensions.
- */
- if (GEN_GEN == 6 && (fb_width & 1 || fb_height & 1)) {
- *xmin = -1.0f;
- *xmax = 1.0f;
- *ymin = -1.0f;
- *ymax = 1.0f;
- return;
- }
-
- if (m00 != 0 && m11 != 0) {
- /* First, we compute the screen-space render area */
- const float ss_ra_xmin = MIN3( 0, m30 + m00, m30 - m00);
- const float ss_ra_xmax = MAX3( fb_width, m30 + m00, m30 - m00);
- const float ss_ra_ymin = MIN3( 0, m31 + m11, m31 - m11);
- const float ss_ra_ymax = MAX3(fb_height, m31 + m11, m31 - m11);
-
- /* We want the guardband to be centered on that */
- const float ss_gb_xmin = (ss_ra_xmin + ss_ra_xmax) / 2 - gb_size;
- const float ss_gb_xmax = (ss_ra_xmin + ss_ra_xmax) / 2 + gb_size;
- const float ss_gb_ymin = (ss_ra_ymin + ss_ra_ymax) / 2 - gb_size;
- const float ss_gb_ymax = (ss_ra_ymin + ss_ra_ymax) / 2 + gb_size;
-
- /* Now we need it in native device coordinates */
- const float ndc_gb_xmin = (ss_gb_xmin - m30) / m00;
- const float ndc_gb_xmax = (ss_gb_xmax - m30) / m00;
- const float ndc_gb_ymin = (ss_gb_ymin - m31) / m11;
- const float ndc_gb_ymax = (ss_gb_ymax - m31) / m11;
-
- /* Thanks to Y-flipping and ORIGIN_UPPER_LEFT, the Y coordinates may be
- * flipped upside-down. X should be fine though.
- */
- assert(ndc_gb_xmin <= ndc_gb_xmax);
- *xmin = ndc_gb_xmin;
- *xmax = ndc_gb_xmax;
- *ymin = MIN2(ndc_gb_ymin, ndc_gb_ymax);
- *ymax = MAX2(ndc_gb_ymin, ndc_gb_ymax);
- } else {
- /* The viewport scales to 0, so nothing will be rendered. */
- *xmin = 0.0f;
- *xmax = 0.0f;
- *ymin = 0.0f;
- *ymax = 0.0f;
- }
-}
-
static void
genX(upload_sf_clip_viewport)(struct brw_context *brw)
{
sfv.ViewportMatrixElementm30 = translate[0],
sfv.ViewportMatrixElementm31 = translate[1] * y_scale + y_bias,
sfv.ViewportMatrixElementm32 = translate[2],
- brw_calculate_guardband_size(fb_width, fb_height,
+ gen_calculate_guardband_size(fb_width, fb_height,
sfv.ViewportMatrixElementm00,
sfv.ViewportMatrixElementm11,
sfv.ViewportMatrixElementm30,
if (binding->BufferObject == ctx->Shared->NullBufferObj) {
static unsigned msg_id = 0;
- _mesa_gl_debug(ctx, &msg_id, MESA_DEBUG_SOURCE_API,
- MESA_DEBUG_TYPE_UNDEFINED,
- MESA_DEBUG_SEVERITY_HIGH,
- "UBO %d unbound, %s shader uniform data "
- "will be undefined.",
- range->block,
- _mesa_shader_stage_to_string(stage));
+ _mesa_gl_debugf(ctx, &msg_id, MESA_DEBUG_SOURCE_API,
+ MESA_DEBUG_TYPE_UNDEFINED,
+ MESA_DEBUG_SEVERITY_HIGH,
+ "UBO %d unbound, %s shader uniform data "
+ "will be undefined.",
+ range->block,
+ _mesa_shader_stage_to_string(stage));
continue;
}
sob.SOBufferEnable = true;
sob.StreamOffsetWriteEnable = true;
sob.StreamOutputBufferOffsetAddressEnable = true;
- sob.SOBufferMOCS = mocs_wb;
+ sob.MOCS = mocs_wb;
sob.SurfaceSize = MAX2(xfb_obj->Size[i] / 4, 1) - 1;
sob.StreamOutputBufferOffsetAddress =
hs.IncludeVertexHandles = true;
hs.MaximumNumberofThreads = devinfo->max_tcs_threads - 1;
+
+#if GEN_GEN >= 9
+ hs.DispatchMode = vue_prog_data->dispatch_mode;
+ hs.IncludePrimitiveID = tcs_prog_data->include_primitive_id;
+#endif
}
}
}
const struct GENX(INTERFACE_DESCRIPTOR_DATA) idd = {
.KernelStartPointer = brw->cs.base.prog_offset,
.SamplerStatePointer = stage_state->sampler_offset,
- .SamplerCount = DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4),
+ /* WA_1606682166 */
+ .SamplerCount = GEN_GEN == 11 ? 0 :
+ DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4),
.BindingTablePointer = stage_state->bind_bo_offset,
.ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs,
.NumberofThreadsinGPGPUThreadGroup = cs_prog_data->threads,
.emit = genX(upload_cs_state)
};
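+/* MMIO registers used to program indirect compute dispatches. */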
+#define GPGPU_DISPATCHDIMX 0x2500
+#define GPGPU_DISPATCHDIMY 0x2504
+#define GPGPU_DISPATCHDIMZ 0x2508
+
+#define MI_PREDICATE_SRC0 0x2400
+#define MI_PREDICATE_SRC1 0x2408
+
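+/* Load the indirect dispatch dimensions into the GPGPU_DISPATCHDIM* registers.
+ * On Gen7, also program MI_PREDICATE so the GPGPU_WALKER below can be skipped
+ * when any of the dimensions is zero.
+ */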
+static void
+prepare_indirect_gpgpu_walker(struct brw_context *brw)
+{
+ GLintptr indirect_offset = brw->compute.num_work_groups_offset;
+ struct brw_bo *bo = brw->compute.num_work_groups_bo;
+
+ emit_lrm(brw, GPGPU_DISPATCHDIMX, ro_bo(bo, indirect_offset + 0));
+ emit_lrm(brw, GPGPU_DISPATCHDIMY, ro_bo(bo, indirect_offset + 4));
+ emit_lrm(brw, GPGPU_DISPATCHDIMZ, ro_bo(bo, indirect_offset + 8));
+
+#if GEN_GEN <= 7
+ /* Clear upper 32-bits of SRC0 and all 64-bits of SRC1 */
+ emit_lri(brw, MI_PREDICATE_SRC0 + 4, 0);
+ emit_lri(brw, MI_PREDICATE_SRC1 , 0);
+ emit_lri(brw, MI_PREDICATE_SRC1 + 4, 0);
+
+ /* Load compute_dispatch_indirect_x_size into SRC0 */
+ emit_lrm(brw, MI_PREDICATE_SRC0, ro_bo(bo, indirect_offset + 0));
+
+ /* predicate = (compute_dispatch_indirect_x_size == 0); */
+ brw_batch_emit(brw, GENX(MI_PREDICATE), mip) {
+ mip.LoadOperation = LOAD_LOAD;
+ mip.CombineOperation = COMBINE_SET;
+ mip.CompareOperation = COMPARE_SRCS_EQUAL;
+ }
+
+ /* Load compute_dispatch_indirect_y_size into SRC0 */
+ emit_lrm(brw, MI_PREDICATE_SRC0, ro_bo(bo, indirect_offset + 4));
+
+ /* predicate |= (compute_dispatch_indirect_y_size == 0); */
+ brw_batch_emit(brw, GENX(MI_PREDICATE), mip) {
+ mip.LoadOperation = LOAD_LOAD;
+ mip.CombineOperation = COMBINE_OR;
+ mip.CompareOperation = COMPARE_SRCS_EQUAL;
+ }
+
+ /* Load compute_dispatch_indirect_z_size into SRC0 */
+ emit_lrm(brw, MI_PREDICATE_SRC0, ro_bo(bo, indirect_offset + 8));
+
+ /* predicate |= (compute_dispatch_indirect_z_size == 0); */
+ brw_batch_emit(brw, GENX(MI_PREDICATE), mip) {
+ mip.LoadOperation = LOAD_LOAD;
+ mip.CombineOperation = COMBINE_OR;
+ mip.CompareOperation = COMPARE_SRCS_EQUAL;
+ }
+
+ /* predicate = !predicate; */
+#define COMPARE_FALSE 1
+ brw_batch_emit(brw, GENX(MI_PREDICATE), mip) {
+ mip.LoadOperation = LOAD_LOADINV;
+ mip.CombineOperation = COMBINE_OR;
+ mip.CompareOperation = COMPARE_FALSE;
+ }
+#endif
+}
+
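+/* Emit GPGPU_WALKER to dispatch the compute grid, using either the direct
+ * dimensions in brw->compute.num_work_groups or the indirect parameters
+ * loaded by prepare_indirect_gpgpu_walker().
+ */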
+static void
+genX(emit_gpgpu_walker)(struct brw_context *brw)
+{
+ const struct brw_cs_prog_data *prog_data =
+ brw_cs_prog_data(brw->cs.base.prog_data);
+
+ const GLuint *num_groups = brw->compute.num_work_groups;
+
+ bool indirect = brw->compute.num_work_groups_bo != NULL;
+ if (indirect)
+ prepare_indirect_gpgpu_walker(brw);
+
+ const unsigned simd_size = prog_data->simd_size;
+ unsigned group_size = prog_data->local_size[0] *
+ prog_data->local_size[1] * prog_data->local_size[2];
+
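+ /* Compute the execution mask for the last (possibly partial) SIMD group:
+ * when group_size is not a multiple of simd_size, disable the extra
+ * channels so exactly group_size invocations execute.
+ */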
+ uint32_t right_mask = 0xffffffffu >> (32 - simd_size);
+ const unsigned right_non_aligned = group_size & (simd_size - 1);
+ if (right_non_aligned != 0)
+ right_mask >>= (simd_size - right_non_aligned);
+
+ brw_batch_emit(brw, GENX(GPGPU_WALKER), ggw) {
+ ggw.IndirectParameterEnable = indirect;
+ ggw.PredicateEnable = GEN_GEN <= 7 && indirect;
+ ggw.SIMDSize = prog_data->simd_size / 16;
+ ggw.ThreadDepthCounterMaximum = 0;
+ ggw.ThreadHeightCounterMaximum = 0;
+ ggw.ThreadWidthCounterMaximum = prog_data->threads - 1;
+ ggw.ThreadGroupIDXDimension = num_groups[0];
+ ggw.ThreadGroupIDYDimension = num_groups[1];
+ ggw.ThreadGroupIDZDimension = num_groups[2];
+ ggw.RightExecutionMask = right_mask;
+ ggw.BottomExecutionMask = 0xffffffff;
+ }
+
+ brw_batch_emit(brw, GENX(MEDIA_STATE_FLUSH), msf);
+}
+
#endif
/* ---------------------------------------------------------------------- */
* Emit a 3DSTATE_SAMPLER_STATE_POINTERS_{VS,HS,GS,DS,PS} packet.
*/
static void
-genX(emit_sampler_state_pointers_xs)(MAYBE_UNUSED struct brw_context *brw,
- MAYBE_UNUSED struct brw_stage_state *stage_state)
+genX(emit_sampler_state_pointers_xs)(UNUSED struct brw_context *brw,
+ UNUSED struct brw_stage_state *stage_state)
{
#if GEN_GEN >= 7
static const uint16_t packet_headers[] = {
static void
genX(upload_default_color)(struct brw_context *brw,
const struct gl_sampler_object *sampler,
- MAYBE_UNUSED mesa_format format, GLenum base_format,
+ mesa_format format, GLenum base_format,
bool is_integer_format, bool is_stencil_sampling,
uint32_t *sdc_offset)
{
}
static uint32_t
-translate_wrap_mode(GLenum wrap, MAYBE_UNUSED bool using_nearest)
+translate_wrap_mode(GLenum wrap, UNUSED bool using_nearest)
{
switch (wrap) {
case GL_REPEAT:
#if GEN_GEN < 6
static const struct brw_tracked_state *render_atoms[] =
{
+ &genX(vf_statistics),
+
/* Once all the programs are done, we know how large urb entry
* sizes need to be and can decide if we need to change the urb
* layout.
#elif GEN_GEN == 6
static const struct brw_tracked_state *render_atoms[] =
{
+ &genX(vf_statistics),
+
&genX(sf_clip_viewport),
/* Command packets: */
#elif GEN_GEN == 7
static const struct brw_tracked_state *render_atoms[] =
{
+ &genX(vf_statistics),
+
/* Command packets: */
&genX(cc_vp),
#elif GEN_GEN >= 8
static const struct brw_tracked_state *render_atoms[] =
{
+ &genX(vf_statistics),
+
&genX(cc_vp),
&genX(sf_clip_viewport),
compute_atoms, ARRAY_SIZE(compute_atoms));
brw->vtbl.emit_mi_report_perf_count = genX(emit_mi_report_perf_count);
+ brw->vtbl.emit_compute_walker = genX(emit_gpgpu_walker);
#endif
}