anv: Deduplicate anv_CmdDraw calls
[mesa.git] / src / vulkan / gen8_cmd_buffer.c
index f626cad2831122a3471ff1eaa23c7bb4b117a56a..b997a2ecf0552ace8a8ecf7eda81bf323a0ba063 100644 (file)
 
 #include "anv_private.h"
 
-static void
-gen8_cmd_buffer_flush_push_constants(struct anv_cmd_buffer *cmd_buffer)
+#include "gen8_pack.h"
+#include "gen9_pack.h"
+
+static uint32_t
+cmd_buffer_flush_push_constants(struct anv_cmd_buffer *cmd_buffer)
 {
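+   /* 3DCommandSubOpcode values for the per-stage 3DSTATE_CONSTANT_* packets;
+    * the GENX(3DSTATE_CONSTANT_VS) template emitted below is retargeted to
+    * the right stage by overriding this field.
+    */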
    static const uint32_t push_constant_opcodes[] = {
-      [VK_SHADER_STAGE_VERTEX]                  = 21,
-      [VK_SHADER_STAGE_TESS_CONTROL]            = 25, /* HS */
-      [VK_SHADER_STAGE_TESS_EVALUATION]         = 26, /* DS */
-      [VK_SHADER_STAGE_GEOMETRY]                = 22,
-      [VK_SHADER_STAGE_FRAGMENT]                = 23,
-      [VK_SHADER_STAGE_COMPUTE]                 = 0,
+      [MESA_SHADER_VERTEX]                      = 21,
+      [MESA_SHADER_TESS_CTRL]                   = 25, /* HS */
+      [MESA_SHADER_TESS_EVAL]                   = 26, /* DS */
+      [MESA_SHADER_GEOMETRY]                    = 22,
+      [MESA_SHADER_FRAGMENT]                    = 23,
+      [MESA_SHADER_COMPUTE]                     = 0,
    };
 
-   VkShaderStage stage;
    VkShaderStageFlags flushed = 0;
 
-   for_each_bit(stage, cmd_buffer->state.push_constants_dirty) {
+   anv_foreach_stage(stage, cmd_buffer->state.push_constants_dirty) {
+      if (stage == MESA_SHADER_COMPUTE)
+         continue;
+
       struct anv_state state = anv_cmd_buffer_push_constants(cmd_buffer, stage);
 
       if (state.offset == 0)
          continue;
 
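+      /* The constant buffer read length is programmed in 256-bit (32-byte)
+       * units, hence the DIV_ROUND_UP by 32 below.
+       */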
-      anv_batch_emit(&cmd_buffer->batch, GEN8_3DSTATE_CONSTANT_VS,
+      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_VS),
                      ._3DCommandSubOpcode = push_constant_opcodes[stage],
                      .ConstantBody = {
-                        .PointerToConstantBuffer0 = { .offset = state.offset },
-                        .ConstantBuffer0ReadLength = DIV_ROUND_UP(state.alloc_size, 32),
+                        .PointerToConstantBuffer2 = { &cmd_buffer->device->dynamic_state_block_pool.bo, state.offset },
+                        .ConstantBuffer2ReadLength = DIV_ROUND_UP(state.alloc_size, 32),
                      });
 
-      flushed |= 1 << stage;
+      flushed |= mesa_to_vk_shader_stage(stage);
    }
 
    cmd_buffer->state.push_constants_dirty &= ~flushed;
+
+   return flushed;
 }
 
+#if ANV_GEN == 8
 static void
-gen8_cmd_buffer_flush_state(struct anv_cmd_buffer *cmd_buffer)
+emit_viewport_state(struct anv_cmd_buffer *cmd_buffer,
+                    uint32_t count, const VkViewport *viewports)
 {
-   struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
-   uint32_t *p;
+   struct anv_state sf_clip_state =
+      anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, count * 64, 64);
+   struct anv_state cc_state =
+      anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, count * 8, 32);
+
+   for (uint32_t i = 0; i < count; i++) {
+      const VkViewport *vp = &viewports[i];
+
+      /* The gen7 state struct has just the matrix and guardband fields; the
+       * gen8 struct adds the min/max viewport fields.
+       */
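+      /* m00/m11/m30/m31 fold the viewport scale and translate into the
+       * hardware transform (x_w = m00 * x_ndc + m30, and likewise for y);
+       * z passes through unchanged (m22 = 1, m32 = 0) and is clamped to
+       * [minDepth, maxDepth] via the CC_VIEWPORT entry below.
+       */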
+      struct GENX(SF_CLIP_VIEWPORT) sf_clip_viewport = {
+         .ViewportMatrixElementm00 = vp->width / 2,
+         .ViewportMatrixElementm11 = vp->height / 2,
+         .ViewportMatrixElementm22 = 1.0,
+         .ViewportMatrixElementm30 = vp->x + vp->width / 2,
+         .ViewportMatrixElementm31 = vp->y + vp->height / 2,
+         .ViewportMatrixElementm32 = 0.0,
+         .XMinClipGuardband = -1.0f,
+         .XMaxClipGuardband = 1.0f,
+         .YMinClipGuardband = -1.0f,
+         .YMaxClipGuardband = 1.0f,
+         .XMinViewPort = vp->x,
+         .XMaxViewPort = vp->x + vp->width - 1,
+         .YMinViewPort = vp->y,
+         .YMaxViewPort = vp->y + vp->height - 1,
+      };
 
-   uint32_t vb_emit = cmd_buffer->state.vb_dirty & pipeline->vb_used;
+      struct GENX(CC_VIEWPORT) cc_viewport = {
+         .MinimumDepth = vp->minDepth,
+         .MaximumDepth = vp->maxDepth
+      };
 
-   assert((pipeline->active_stages & VK_SHADER_STAGE_COMPUTE_BIT) == 0);
+      GENX(SF_CLIP_VIEWPORT_pack)(NULL, sf_clip_state.map + i * 64,
+                                 &sf_clip_viewport);
+      GENX(CC_VIEWPORT_pack)(NULL, cc_state.map + i * 8, &cc_viewport);
+   }
+
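+   /* Viewport state lives in the dynamic state buffer; on non-LLC parts
+    * (e.g. Cherryview) GPU reads are not coherent with the CPU cache, so
+    * flush the freshly written cachelines by hand.
+    */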
+   if (!cmd_buffer->device->info.has_llc) {
+      anv_state_clflush(sf_clip_state);
+      anv_state_clflush(cc_state);
+   }
+
+   anv_batch_emit(&cmd_buffer->batch,
+                  GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC),
+                  .CCViewportPointer = cc_state.offset);
+   anv_batch_emit(&cmd_buffer->batch,
+                  GENX(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP),
+                  .SFClipViewportPointer = sf_clip_state.offset);
+}
+
+void
+gen8_cmd_buffer_emit_viewport(struct anv_cmd_buffer *cmd_buffer)
+{
+   if (cmd_buffer->state.dynamic.viewport.count > 0) {
+      emit_viewport_state(cmd_buffer, cmd_buffer->state.dynamic.viewport.count,
+                          cmd_buffer->state.dynamic.viewport.viewports);
+   } else {
+      /* If viewport count is 0, this is taken to mean "use the default" */
+      emit_viewport_state(cmd_buffer, 1,
+                          &(VkViewport) {
+                             .x = 0.0f,
+                             .y = 0.0f,
+                             .width = cmd_buffer->state.framebuffer->width,
+                             .height = cmd_buffer->state.framebuffer->height,
+                             .minDepth = 0.0f,
+                             .maxDepth = 1.0f,
+                          });
+   }
+}
+#endif
+
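+/* MI register helpers: LRM loads a command streamer register from a buffer
+ * (used below for indirect draw/dispatch parameters), LRI loads an
+ * immediate (used for the L3 configuration).
+ */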
+static void
+emit_lrm(struct anv_batch *batch,
+         uint32_t reg, struct anv_bo *bo, uint32_t offset)
+{
+   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM),
+                  .RegisterAddress = reg,
+                  .MemoryAddress = { bo, offset });
+}
+
+static void
+emit_lri(struct anv_batch *batch, uint32_t reg, uint32_t imm)
+{
+   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM),
+                  .RegisterOffset = reg,
+                  .DataDWord = imm);
+}
+
+#define GEN8_L3CNTLREG                  0x7034
+
+static void
+config_l3(struct anv_cmd_buffer *cmd_buffer, bool enable_slm)
+{
+   /* References for GL state:
+    *
+    * - commits e307cfa..228d5a3
+    * - src/mesa/drivers/dri/i965/gen7_l3_state.c
+    */
+
+   uint32_t val = enable_slm ?
+      /* All = 48 ways; URB = 16 ways; DC and RO = 0, SLM = 1 */
+      0x60000021 :
+      /* All = 48 ways; URB = 48 ways; DC, RO and SLM = 0 */
+      0x60000060;
+   bool changed = cmd_buffer->state.current_l3_config != val;
+
+   if (changed) {
+      /* According to the hardware docs, the L3 partitioning can only be changed
+       * while the pipeline is completely drained and the caches are flushed,
+       * which involves a first PIPE_CONTROL flush which stalls the pipeline and
+       * initiates invalidation of the relevant caches...
+       */
+      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL),
+                     .TextureCacheInvalidationEnable = true,
+                     .ConstantCacheInvalidationEnable = true,
+                     .InstructionCacheInvalidateEnable = true,
+                     .DCFlushEnable = true,
+                     .PostSyncOperation = NoWrite,
+                     .CommandStreamerStallEnable = true);
+
+      /* ...followed by a second stalling flush which guarantees that
+       * invalidation is complete when the L3 configuration registers are
+       * modified.
+       */
+      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL),
+                     .DCFlushEnable = true,
+                     .PostSyncOperation = NoWrite,
+                     .CommandStreamerStallEnable = true);
+
+      emit_lri(&cmd_buffer->batch, GEN8_L3CNTLREG, val);
+      cmd_buffer->state.current_l3_config = val;
+   }
+}
+
+static void
+flush_pipeline_select_3d(struct anv_cmd_buffer *cmd_buffer)
+{
+   config_l3(cmd_buffer, false);
 
    if (cmd_buffer->state.current_pipeline != _3D) {
-      anv_batch_emit(&cmd_buffer->batch, GEN8_PIPELINE_SELECT,
+      anv_batch_emit(&cmd_buffer->batch, GENX(PIPELINE_SELECT),
+#if ANV_GEN >= 9
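+                     /* gen9 PIPELINE_SELECT gained a write mask; both mask
+                      * bits must be set for PipelineSelection to latch.
+                      */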
+                     .MaskBits = 3,
+#endif
                      .PipelineSelection = _3D);
       cmd_buffer->state.current_pipeline = _3D;
    }
+}
+
+static void
+__emit_genx_sf_state(struct anv_cmd_buffer *cmd_buffer)
+{
+   uint32_t sf_dw[GENX(3DSTATE_SF_length)];
+   struct GENX(3DSTATE_SF) sf = {
+      GENX(3DSTATE_SF_header),
+      .LineWidth = cmd_buffer->state.dynamic.line_width,
+   };
+   GENX(3DSTATE_SF_pack)(NULL, sf_dw, &sf);
+   /* FIXME: gen9.fs */
+   anv_batch_emit_merge(&cmd_buffer->batch, sf_dw,
+                        cmd_buffer->state.pipeline->gen8.sf);
+}
+
+static void
+__emit_gen9_sf_state(struct anv_cmd_buffer *cmd_buffer)
+{
+   uint32_t sf_dw[GENX(3DSTATE_SF_length)];
+   struct GEN9_3DSTATE_SF sf = {
+      GEN9_3DSTATE_SF_header,
+      .LineWidth = cmd_buffer->state.dynamic.line_width,
+   };
+   GEN9_3DSTATE_SF_pack(NULL, sf_dw, &sf);
+   /* FIXME: gen9.fs */
+   anv_batch_emit_merge(&cmd_buffer->batch, sf_dw,
+                        cmd_buffer->state.pipeline->gen8.sf);
+}
+
+static void
+__emit_sf_state(struct anv_cmd_buffer *cmd_buffer)
+{
+   if (cmd_buffer->device->info.is_cherryview)
+      __emit_gen9_sf_state(cmd_buffer);
+   else
+      __emit_genx_sf_state(cmd_buffer);
+}
+
+void
+genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer)
+{
+   struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
+   uint32_t *p;
+
+   uint32_t vb_emit = cmd_buffer->state.vb_dirty & pipeline->vb_used;
+
+   assert((pipeline->active_stages & VK_SHADER_STAGE_COMPUTE_BIT) == 0);
+
+   flush_pipeline_select_3d(cmd_buffer);
 
    if (vb_emit) {
       const uint32_t num_buffers = __builtin_popcount(vb_emit);
       const uint32_t num_dwords = 1 + num_buffers * 4;
 
       p = anv_batch_emitn(&cmd_buffer->batch, num_dwords,
-                          GEN8_3DSTATE_VERTEX_BUFFERS);
+                          GENX(3DSTATE_VERTEX_BUFFERS));
       uint32_t vb, i = 0;
       for_each_bit(vb, vb_emit) {
          struct anv_buffer *buffer = cmd_buffer->state.vertex_bindings[vb].buffer;
          uint32_t offset = cmd_buffer->state.vertex_bindings[vb].offset;
 
-         struct GEN8_VERTEX_BUFFER_STATE state = {
+         struct GENX(VERTEX_BUFFER_STATE) state = {
             .VertexBufferIndex = vb,
-            .MemoryObjectControlState = GEN8_MOCS,
+            .MemoryObjectControlState = GENX(MOCS),
             .AddressModifyEnable = true,
             .BufferPitch = pipeline->binding_stride[vb],
             .BufferStartingAddress = { buffer->bo, buffer->offset + offset },
             .BufferSize = buffer->size - offset
          };
 
-         GEN8_VERTEX_BUFFER_STATE_pack(&cmd_buffer->batch, &p[1 + i * 4], &state);
+         GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, &p[1 + i * 4], &state);
          i++;
       }
    }
@@ -115,54 +308,66 @@ gen8_cmd_buffer_flush_state(struct anv_cmd_buffer *cmd_buffer)
       anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->batch);
    }
 
+   /* We emit the binding tables and sampler tables first, then emit push
+    * constants and then finally emit binding table and sampler table
+    * pointers.  It has to happen in this order, since emitting the binding
+    * tables may change the push constants (in case of storage images). After
+    * emitting push constants, on SKL+ we have to emit the corresponding
+    * 3DSTATE_BINDING_TABLE_POINTER_* for the push constants to take effect.
+    */
+   uint32_t dirty = 0;
    if (cmd_buffer->state.descriptors_dirty)
-      anv_flush_descriptor_sets(cmd_buffer);
+      dirty = gen7_cmd_buffer_flush_descriptor_sets(cmd_buffer);
 
    if (cmd_buffer->state.push_constants_dirty)
-      gen8_cmd_buffer_flush_push_constants(cmd_buffer);
+      dirty |= cmd_buffer_flush_push_constants(cmd_buffer);
+
+   if (dirty)
+      gen7_cmd_buffer_emit_descriptor_pointers(cmd_buffer, dirty);
 
    if (cmd_buffer->state.dirty & ANV_CMD_DIRTY_DYNAMIC_VIEWPORT)
-      anv_cmd_buffer_emit_viewport(cmd_buffer);
+      gen8_cmd_buffer_emit_viewport(cmd_buffer);
 
    if (cmd_buffer->state.dirty & ANV_CMD_DIRTY_DYNAMIC_SCISSOR)
-      anv_cmd_buffer_emit_scissor(cmd_buffer);
+      gen7_cmd_buffer_emit_scissor(cmd_buffer);
 
    if (cmd_buffer->state.dirty & (ANV_CMD_DIRTY_PIPELINE |
                                   ANV_CMD_DIRTY_DYNAMIC_LINE_WIDTH)) {
-      uint32_t sf_dw[GEN8_3DSTATE_SF_length];
-      struct GEN8_3DSTATE_SF sf = {
-         GEN8_3DSTATE_SF_header,
-         .LineWidth = cmd_buffer->state.dynamic.line_width,
-      };
-      GEN8_3DSTATE_SF_pack(NULL, sf_dw, &sf);
-      anv_batch_emit_merge(&cmd_buffer->batch, sf_dw, pipeline->gen8.sf);
+      __emit_sf_state(cmd_buffer);
    }
 
    if (cmd_buffer->state.dirty & (ANV_CMD_DIRTY_PIPELINE |
                                   ANV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS)){
       bool enable_bias = cmd_buffer->state.dynamic.depth_bias.bias != 0.0f ||
-         cmd_buffer->state.dynamic.depth_bias.slope_scaled != 0.0f;
+         cmd_buffer->state.dynamic.depth_bias.slope != 0.0f;
 
-      uint32_t raster_dw[GEN8_3DSTATE_RASTER_length];
-      struct GEN8_3DSTATE_RASTER raster = {
-         GEN8_3DSTATE_RASTER_header,
+      uint32_t raster_dw[GENX(3DSTATE_RASTER_length)];
+      struct GENX(3DSTATE_RASTER) raster = {
+         GENX(3DSTATE_RASTER_header),
          .GlobalDepthOffsetEnableSolid = enable_bias,
          .GlobalDepthOffsetEnableWireframe = enable_bias,
          .GlobalDepthOffsetEnablePoint = enable_bias,
          .GlobalDepthOffsetConstant = cmd_buffer->state.dynamic.depth_bias.bias,
-         .GlobalDepthOffsetScale = cmd_buffer->state.dynamic.depth_bias.slope_scaled,
+         .GlobalDepthOffsetScale = cmd_buffer->state.dynamic.depth_bias.slope,
          .GlobalDepthOffsetClamp = cmd_buffer->state.dynamic.depth_bias.clamp
       };
-      GEN8_3DSTATE_RASTER_pack(NULL, raster_dw, &raster);
+      GENX(3DSTATE_RASTER_pack)(NULL, raster_dw, &raster);
       anv_batch_emit_merge(&cmd_buffer->batch, raster_dw,
                            pipeline->gen8.raster);
    }
 
+   /* Stencil reference values moved from COLOR_CALC_STATE in gen8 to
+    * 3DSTATE_WM_DEPTH_STENCIL in gen9. That means the dirty bits get split
+    * across different state packets for gen8 and gen9. We handle that by
+    * using a big old #if switch here.
+    */
+#if ANV_GEN == 8
    if (cmd_buffer->state.dirty & (ANV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS |
                                   ANV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE)) {
       struct anv_state cc_state =
          anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
-                                            GEN8_COLOR_CALC_STATE_length, 64);
+                                            GEN8_COLOR_CALC_STATE_length * 4,
+                                            64);
       struct GEN8_COLOR_CALC_STATE cc = {
          .BlendConstantColorRed = cmd_buffer->state.dynamic.blend_constants[0],
          .BlendConstantColorGreen = cmd_buffer->state.dynamic.blend_constants[1],
@@ -175,6 +380,9 @@ gen8_cmd_buffer_flush_state(struct anv_cmd_buffer *cmd_buffer)
       };
       GEN8_COLOR_CALC_STATE_pack(NULL, cc_state.map, &cc);
 
+      if (!cmd_buffer->device->info.has_llc)
+         anv_state_clflush(cc_state);
+
       anv_batch_emit(&cmd_buffer->batch,
                      GEN8_3DSTATE_CC_STATE_POINTERS,
                      .ColorCalcStatePointer = cc_state.offset,
@@ -209,115 +417,75 @@ gen8_cmd_buffer_flush_state(struct anv_cmd_buffer *cmd_buffer)
       anv_batch_emit_merge(&cmd_buffer->batch, wm_depth_stencil_dw,
                            pipeline->gen8.wm_depth_stencil);
    }
+#else
+   if (cmd_buffer->state.dirty & ANV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS) {
+      struct anv_state cc_state =
+         anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
+                                            GEN9_COLOR_CALC_STATE_length * 4,
+                                            64);
+      struct GEN9_COLOR_CALC_STATE cc = {
+         .BlendConstantColorRed = cmd_buffer->state.dynamic.blend_constants[0],
+         .BlendConstantColorGreen = cmd_buffer->state.dynamic.blend_constants[1],
+         .BlendConstantColorBlue = cmd_buffer->state.dynamic.blend_constants[2],
+         .BlendConstantColorAlpha = cmd_buffer->state.dynamic.blend_constants[3],
+      };
+      GEN9_COLOR_CALC_STATE_pack(NULL, cc_state.map, &cc);
 
-   if (cmd_buffer->state.dirty & (ANV_CMD_DIRTY_PIPELINE |
-                                  ANV_CMD_DIRTY_INDEX_BUFFER)) {
-      anv_batch_emit_merge(&cmd_buffer->batch,
-                           cmd_buffer->state.state_vf, pipeline->gen8.vf);
-   }
-
-   cmd_buffer->state.vb_dirty &= ~vb_emit;
-   cmd_buffer->state.dirty = 0;
-}
-
-void gen8_CmdDraw(
-    VkCmdBuffer                                 cmdBuffer,
-    uint32_t                                    vertexCount,
-    uint32_t                                    instanceCount,
-    uint32_t                                    firstVertex,
-    uint32_t                                    firstInstance)
-{
-   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, cmdBuffer);
-
-   gen8_cmd_buffer_flush_state(cmd_buffer);
-
-   anv_batch_emit(&cmd_buffer->batch, GEN8_3DPRIMITIVE,
-                  .VertexAccessType = SEQUENTIAL,
-                  .VertexCountPerInstance = vertexCount,
-                  .StartVertexLocation = firstVertex,
-                  .InstanceCount = instanceCount,
-                  .StartInstanceLocation = firstInstance,
-                  .BaseVertexLocation = 0);
-}
-
-void gen8_CmdDrawIndexed(
-    VkCmdBuffer                                 cmdBuffer,
-    uint32_t                                    indexCount,
-    uint32_t                                    instanceCount,
-    uint32_t                                    firstIndex,
-    int32_t                                     vertexOffset,
-    uint32_t                                    firstInstance)
-{
-   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, cmdBuffer);
+      if (!cmd_buffer->device->info.has_llc)
+         anv_state_clflush(cc_state);
 
-   gen8_cmd_buffer_flush_state(cmd_buffer);
+      anv_batch_emit(&cmd_buffer->batch,
+                     GEN9_3DSTATE_CC_STATE_POINTERS,
+                     .ColorCalcStatePointer = cc_state.offset,
+                     .ColorCalcStatePointerValid = true);
+   }
 
-   anv_batch_emit(&cmd_buffer->batch, GEN8_3DPRIMITIVE,
-                  .VertexAccessType = RANDOM,
-                  .VertexCountPerInstance = indexCount,
-                  .StartVertexLocation = firstIndex,
-                  .InstanceCount = instanceCount,
-                  .StartInstanceLocation = firstInstance,
-                  .BaseVertexLocation = vertexOffset);
-}
+   if (cmd_buffer->state.dirty & (ANV_CMD_DIRTY_PIPELINE |
+                                  ANV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK |
+                                  ANV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK |
+                                  ANV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE)) {
+      uint32_t dwords[GEN9_3DSTATE_WM_DEPTH_STENCIL_length];
+      struct anv_dynamic_state *d = &cmd_buffer->state.dynamic;
+      struct GEN9_3DSTATE_WM_DEPTH_STENCIL wm_depth_stencil = {
+         GEN9_3DSTATE_WM_DEPTH_STENCIL_header,
 
-static void
-emit_lrm(struct anv_batch *batch,
-         uint32_t reg, struct anv_bo *bo, uint32_t offset)
-{
-   anv_batch_emit(batch, GEN8_MI_LOAD_REGISTER_MEM,
-                  .RegisterAddress = reg,
-                  .MemoryAddress = { bo, offset });
-}
+         .StencilBufferWriteEnable = d->stencil_write_mask.front != 0,
 
-static void
-emit_lri(struct anv_batch *batch, uint32_t reg, uint32_t imm)
-{
-   anv_batch_emit(batch, GEN8_MI_LOAD_REGISTER_IMM,
-                  .RegisterOffset = reg,
-                  .DataDWord = imm);
-}
+         .StencilTestMask = d->stencil_compare_mask.front & 0xff,
+         .StencilWriteMask = d->stencil_write_mask.front & 0xff,
 
-/* Auto-Draw / Indirect Registers */
-#define GEN7_3DPRIM_END_OFFSET          0x2420
-#define GEN7_3DPRIM_START_VERTEX        0x2430
-#define GEN7_3DPRIM_VERTEX_COUNT        0x2434
-#define GEN7_3DPRIM_INSTANCE_COUNT      0x2438
-#define GEN7_3DPRIM_START_INSTANCE      0x243C
-#define GEN7_3DPRIM_BASE_VERTEX         0x2440
+         .BackfaceStencilTestMask = d->stencil_compare_mask.back & 0xff,
+         .BackfaceStencilWriteMask = d->stencil_write_mask.back & 0xff,
 
-void gen8_CmdDrawIndirect(
-    VkCmdBuffer                                 cmdBuffer,
-    VkBuffer                                    _buffer,
-    VkDeviceSize                                offset,
-    uint32_t                                    count,
-    uint32_t                                    stride)
-{
-   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, cmdBuffer);
-   ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
-   struct anv_bo *bo = buffer->bo;
-   uint32_t bo_offset = buffer->offset + offset;
+         .StencilReferenceValue = d->stencil_reference.front,
+         .BackfaceStencilReferenceValue = d->stencil_reference.back
+      };
+      GEN9_3DSTATE_WM_DEPTH_STENCIL_pack(NULL, dwords, &wm_depth_stencil);
 
-   gen8_cmd_buffer_flush_state(cmd_buffer);
+      anv_batch_emit_merge(&cmd_buffer->batch, dwords,
+                           pipeline->gen9.wm_depth_stencil);
+   }
+#endif
 
-   emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_VERTEX_COUNT, bo, bo_offset);
-   emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset + 4);
-   emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_VERTEX, bo, bo_offset + 8);
-   emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_INSTANCE, bo, bo_offset + 12);
-   emit_lri(&cmd_buffer->batch, GEN7_3DPRIM_BASE_VERTEX, 0);
+   if (cmd_buffer->state.dirty & (ANV_CMD_DIRTY_PIPELINE |
+                                  ANV_CMD_DIRTY_INDEX_BUFFER)) {
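+      /* 3DSTATE_VF combines state from two sources: primitive restart
+       * enable comes from the pipeline, while the cut index comes from the
+       * index type recorded at vkCmdBindIndexBuffer time.
+       */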
+      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF),
+         .IndexedDrawCutIndexEnable = pipeline->primitive_restart,
+         .CutIndex = cmd_buffer->state.restart_index,
+      );
+   }
 
-   anv_batch_emit(&cmd_buffer->batch, GEN8_3DPRIMITIVE,
-                  .IndirectParameterEnable = true,
-                  .VertexAccessType = SEQUENTIAL);
+   cmd_buffer->state.vb_dirty &= ~vb_emit;
+   cmd_buffer->state.dirty = 0;
 }
 
-void gen8_CmdBindIndexBuffer(
-    VkCmdBuffer                                 cmdBuffer,
+void genX(CmdBindIndexBuffer)(
+    VkCommandBuffer                             commandBuffer,
     VkBuffer                                    _buffer,
     VkDeviceSize                                offset,
     VkIndexType                                 indexType)
 {
-   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, cmdBuffer);
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
 
    static const uint32_t vk_to_gen_index_type[] = {
@@ -325,23 +493,24 @@ void gen8_CmdBindIndexBuffer(
       [VK_INDEX_TYPE_UINT32]                    = INDEX_DWORD,
    };
 
-   struct GEN8_3DSTATE_VF vf = {
-      GEN8_3DSTATE_VF_header,
-      .CutIndex = (indexType == VK_INDEX_TYPE_UINT16) ? UINT16_MAX : UINT32_MAX,
+   static const uint32_t restart_index_for_type[] = {
+      [VK_INDEX_TYPE_UINT16]                    = UINT16_MAX,
+      [VK_INDEX_TYPE_UINT32]                    = UINT32_MAX,
    };
-   GEN8_3DSTATE_VF_pack(NULL, cmd_buffer->state.state_vf, &vf);
 
-   cmd_buffer->state.dirty |= ANV_CMD_DIRTY_INDEX_BUFFER;
+   cmd_buffer->state.restart_index = restart_index_for_type[indexType];
 
-   anv_batch_emit(&cmd_buffer->batch, GEN8_3DSTATE_INDEX_BUFFER,
+   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_INDEX_BUFFER),
                   .IndexFormat = vk_to_gen_index_type[indexType],
-                  .MemoryObjectControlState = GEN8_MOCS,
+                  .MemoryObjectControlState = GENX(MOCS),
                   .BufferStartingAddress = { buffer->bo, buffer->offset + offset },
                   .BufferSize = buffer->size - offset);
+
+   cmd_buffer->state.dirty |= ANV_CMD_DIRTY_INDEX_BUFFER;
 }
 
 static VkResult
-gen8_flush_compute_descriptor_set(struct anv_cmd_buffer *cmd_buffer)
+flush_compute_descriptor_set(struct anv_cmd_buffer *cmd_buffer)
 {
    struct anv_device *device = cmd_buffer->device;
    struct anv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
@@ -349,31 +518,59 @@ gen8_flush_compute_descriptor_set(struct anv_cmd_buffer *cmd_buffer)
    VkResult result;
 
    result = anv_cmd_buffer_emit_samplers(cmd_buffer,
-                                         VK_SHADER_STAGE_COMPUTE, &samplers);
+                                         MESA_SHADER_COMPUTE, &samplers);
    if (result != VK_SUCCESS)
       return result;
    result = anv_cmd_buffer_emit_binding_table(cmd_buffer,
-                                               VK_SHADER_STAGE_COMPUTE, &surfaces);
+                                              MESA_SHADER_COMPUTE, &surfaces);
    if (result != VK_SUCCESS)
       return result;
 
-   struct GEN8_INTERFACE_DESCRIPTOR_DATA desc = {
-      .KernelStartPointer = pipeline->cs_simd,
-      .KernelStartPointerHigh = 0,
-      .BindingTablePointer = surfaces.offset,
-      .BindingTableEntryCount = 0,
-      .SamplerStatePointer = samplers.offset,
-      .SamplerCount = 0,
-      .NumberofThreadsinGPGPUThreadGroup = 0 /* FIXME: Really? */
-   };
+   struct anv_state push_state = anv_cmd_buffer_cs_push_constants(cmd_buffer);
 
-   uint32_t size = GEN8_INTERFACE_DESCRIPTOR_DATA_length * sizeof(uint32_t);
-   struct anv_state state =
-      anv_state_pool_alloc(&device->dynamic_state_pool, size, 64);
+   const struct brw_cs_prog_data *cs_prog_data = &pipeline->cs_prog_data;
+   const struct brw_stage_prog_data *prog_data = &cs_prog_data->base;
 
-   GEN8_INTERFACE_DESCRIPTOR_DATA_pack(NULL, state.map, &desc);
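+   /* A GRF is 32 bytes (8 dwords): the local invocation IDs occupy
+    * local_invocation_id_regs whole registers, uniforms take 4 bytes per
+    * param, and the CURBE read length below is counted in registers.
+    */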
+   unsigned local_id_dwords = cs_prog_data->local_invocation_id_regs * 8;
+   unsigned push_constant_data_size =
+      (prog_data->nr_params + local_id_dwords) * 4;
+   unsigned reg_aligned_constant_size = ALIGN(push_constant_data_size, 32);
+   unsigned push_constant_regs = reg_aligned_constant_size / 32;
 
-   anv_batch_emit(&cmd_buffer->batch, GEN8_MEDIA_INTERFACE_DESCRIPTOR_LOAD,
+   if (push_state.alloc_size) {
+      anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_CURBE_LOAD),
+                     .CURBETotalDataLength = push_state.alloc_size,
+                     .CURBEDataStartAddress = push_state.offset);
+   }
+
+   assert(prog_data->total_shared <= 64 * 1024);
+   uint32_t slm_size = 0;
+   if (prog_data->total_shared > 0) {
+      /* slm_size is in 4k increments, but must be a power of 2. */
+      slm_size = 4 * 1024;
+      while (slm_size < prog_data->total_shared)
+         slm_size <<= 1;
+      slm_size /= 4 * 1024;
+   }
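+   /* e.g. total_shared = 5 KiB rounds up to 8 KiB and is encoded as
+    * slm_size = 2, i.e. the power-of-two size in 4k units.
+    */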
+
+   struct anv_state state =
+      anv_state_pool_emit(&device->dynamic_state_pool,
+                          GENX(INTERFACE_DESCRIPTOR_DATA), 64,
+                          .KernelStartPointer = pipeline->cs_simd,
+                          .KernelStartPointerHigh = 0,
+                          .BindingTablePointer = surfaces.offset,
+                          .BindingTableEntryCount = 0,
+                          .SamplerStatePointer = samplers.offset,
+                          .SamplerCount = 0,
+                          .ConstantIndirectURBEntryReadLength = push_constant_regs,
+                          .ConstantURBEntryReadOffset = 0,
+                          .BarrierEnable = cs_prog_data->uses_barrier,
+                          .SharedLocalMemorySize = slm_size,
+                          .NumberofThreadsinGPGPUThreadGroup =
+                             pipeline->cs_thread_width_max);
+
+   uint32_t size = GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
+   anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD),
                   .InterfaceDescriptorTotalLength = size,
                   .InterfaceDescriptorDataStartAddress = state.offset);
 
@@ -381,15 +578,35 @@ gen8_flush_compute_descriptor_set(struct anv_cmd_buffer *cmd_buffer)
 }
 
 static void
-gen8_cmd_buffer_flush_compute_state(struct anv_cmd_buffer *cmd_buffer)
+cmd_buffer_flush_compute_state(struct anv_cmd_buffer *cmd_buffer)
 {
    struct anv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
    VkResult result;
 
    assert(pipeline->active_stages == VK_SHADER_STAGE_COMPUTE_BIT);
 
+   bool needs_slm = pipeline->cs_prog_data.base.total_shared > 0;
+   config_l3(cmd_buffer, needs_slm);
+
    if (cmd_buffer->state.current_pipeline != GPGPU) {
-      anv_batch_emit(&cmd_buffer->batch, GEN8_PIPELINE_SELECT,
+#if ANV_GEN < 10
+      /* From the Broadwell PRM, Volume 2a: Instructions, PIPELINE_SELECT:
+       *
+       *   Software must clear the COLOR_CALC_STATE Valid field in
+       *   3DSTATE_CC_STATE_POINTERS command prior to send a PIPELINE_SELECT
+       *   with Pipeline Select set to GPGPU.
+       *
+       * The internal hardware docs recommend the same workaround for Gen9
+       * hardware too.
+       */
+      anv_batch_emit(&cmd_buffer->batch,
+                     GENX(3DSTATE_CC_STATE_POINTERS));
+#endif
+
+      anv_batch_emit(&cmd_buffer->batch, GENX(PIPELINE_SELECT),
+#if ANV_GEN >= 9
+                     .MaskBits = 3,
+#endif
                      .PipelineSelection = GPGPU);
       cmd_buffer->state.current_pipeline = GPGPU;
    }
@@ -399,121 +616,120 @@ gen8_cmd_buffer_flush_compute_state(struct anv_cmd_buffer *cmd_buffer)
 
    if ((cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_COMPUTE_BIT) ||
        (cmd_buffer->state.compute_dirty & ANV_CMD_DIRTY_PIPELINE)) {
-      result = gen8_flush_compute_descriptor_set(cmd_buffer);
+      result = flush_compute_descriptor_set(cmd_buffer);
       assert(result == VK_SUCCESS);
-      cmd_buffer->state.descriptors_dirty &= ~VK_SHADER_STAGE_COMPUTE;
+      cmd_buffer->state.descriptors_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT;
    }
 
    cmd_buffer->state.compute_dirty = 0;
 }
 
-void gen8_CmdDrawIndexedIndirect(
-    VkCmdBuffer                                 cmdBuffer,
-    VkBuffer                                    _buffer,
-    VkDeviceSize                                offset,
-    uint32_t                                    count,
-    uint32_t                                    stride)
-{
-   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, cmdBuffer);
-   ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
-   struct anv_bo *bo = buffer->bo;
-   uint32_t bo_offset = buffer->offset + offset;
-
-   gen8_cmd_buffer_flush_state(cmd_buffer);
-
-   emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_VERTEX_COUNT, bo, bo_offset);
-   emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset + 4);
-   emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_VERTEX, bo, bo_offset + 8);
-   emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_BASE_VERTEX, bo, bo_offset + 12);
-   emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_INSTANCE, bo, bo_offset + 16);
-
-   anv_batch_emit(&cmd_buffer->batch, GEN8_3DPRIMITIVE,
-                  .IndirectParameterEnable = true,
-                  .VertexAccessType = RANDOM);
-}
-
-void gen8_CmdDispatch(
-    VkCmdBuffer                                 cmdBuffer,
+void genX(CmdDispatch)(
+    VkCommandBuffer                             commandBuffer,
     uint32_t                                    x,
     uint32_t                                    y,
     uint32_t                                    z)
 {
-   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, cmdBuffer);
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
    struct anv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
    struct brw_cs_prog_data *prog_data = &pipeline->cs_prog_data;
 
-   gen8_cmd_buffer_flush_compute_state(cmd_buffer);
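+   /* If the shader reads gl_NumWorkGroups, stash the dispatch size in
+    * dynamic state so that the surface emitted for it during the flush
+    * below points at valid data.
+    */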
+   if (prog_data->uses_num_work_groups) {
+      struct anv_state state =
+         anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 12, 4);
+      uint32_t *sizes = state.map;
+      sizes[0] = x;
+      sizes[1] = y;
+      sizes[2] = z;
+      if (!cmd_buffer->device->info.has_llc)
+         anv_state_clflush(state);
+      cmd_buffer->state.num_workgroups_offset = state.offset;
+      cmd_buffer->state.num_workgroups_bo =
+         &cmd_buffer->device->dynamic_state_block_pool.bo;
+   }
+
+   cmd_buffer_flush_compute_state(cmd_buffer);
 
-   anv_batch_emit(&cmd_buffer->batch, GEN8_GPGPU_WALKER,
+   anv_batch_emit(&cmd_buffer->batch, GENX(GPGPU_WALKER),
                   .SIMDSize = prog_data->simd_size / 16,
                   .ThreadDepthCounterMaximum = 0,
                   .ThreadHeightCounterMaximum = 0,
-                  .ThreadWidthCounterMaximum = pipeline->cs_thread_width_max,
+                  .ThreadWidthCounterMaximum = pipeline->cs_thread_width_max - 1,
                   .ThreadGroupIDXDimension = x,
                   .ThreadGroupIDYDimension = y,
                   .ThreadGroupIDZDimension = z,
                   .RightExecutionMask = pipeline->cs_right_mask,
                   .BottomExecutionMask = 0xffffffff);
 
-   anv_batch_emit(&cmd_buffer->batch, GEN8_MEDIA_STATE_FLUSH);
+   anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_STATE_FLUSH));
 }
 
 #define GPGPU_DISPATCHDIMX 0x2500
 #define GPGPU_DISPATCHDIMY 0x2504
 #define GPGPU_DISPATCHDIMZ 0x2508
 
-void gen8_CmdDispatchIndirect(
-    VkCmdBuffer                                 cmdBuffer,
+void genX(CmdDispatchIndirect)(
+    VkCommandBuffer                             commandBuffer,
     VkBuffer                                    _buffer,
     VkDeviceSize                                offset)
 {
-   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, cmdBuffer);
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
    struct anv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
    struct brw_cs_prog_data *prog_data = &pipeline->cs_prog_data;
    struct anv_bo *bo = buffer->bo;
    uint32_t bo_offset = buffer->offset + offset;
 
-   gen8_cmd_buffer_flush_compute_state(cmd_buffer);
+   if (prog_data->uses_num_work_groups) {
+      cmd_buffer->state.num_workgroups_offset = bo_offset;
+      cmd_buffer->state.num_workgroups_bo = bo;
+   }
+
+   cmd_buffer_flush_compute_state(cmd_buffer);
 
    emit_lrm(&cmd_buffer->batch, GPGPU_DISPATCHDIMX, bo, bo_offset);
    emit_lrm(&cmd_buffer->batch, GPGPU_DISPATCHDIMY, bo, bo_offset + 4);
    emit_lrm(&cmd_buffer->batch, GPGPU_DISPATCHDIMZ, bo, bo_offset + 8);
 
-   anv_batch_emit(&cmd_buffer->batch, GEN8_GPGPU_WALKER,
+   anv_batch_emit(&cmd_buffer->batch, GENX(GPGPU_WALKER),
                   .IndirectParameterEnable = true,
                   .SIMDSize = prog_data->simd_size / 16,
                   .ThreadDepthCounterMaximum = 0,
                   .ThreadHeightCounterMaximum = 0,
-                  .ThreadWidthCounterMaximum = pipeline->cs_thread_width_max,
+                  .ThreadWidthCounterMaximum = pipeline->cs_thread_width_max - 1,
                   .RightExecutionMask = pipeline->cs_right_mask,
                   .BottomExecutionMask = 0xffffffff);
 
-   anv_batch_emit(&cmd_buffer->batch, GEN8_MEDIA_STATE_FLUSH);
+   anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_STATE_FLUSH));
 }
 
 static void
-gen8_cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer)
+cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer)
 {
    const struct anv_framebuffer *fb = cmd_buffer->state.framebuffer;
    const struct anv_image_view *iview =
       anv_cmd_buffer_get_depth_stencil_view(cmd_buffer);
    const struct anv_image *image = iview ? iview->image : NULL;
-   const bool has_depth = iview && iview->format->depth_format;
-   const bool has_stencil = iview && iview->format->has_stencil;
+
+   /* XXX: isl needs to grow depth format support */
+   const struct anv_format *anv_format =
+      iview ? anv_format_for_vk_format(iview->vk_format) : NULL;
+
+   const bool has_depth = iview && anv_format->depth_format;
+   const bool has_stencil = iview && anv_format->has_stencil;
 
    /* FIXME: Implement the PMA stall W/A */
    /* FIXME: Width and Height are wrong */
 
    /* Emit 3DSTATE_DEPTH_BUFFER */
    if (has_depth) {
-      anv_batch_emit(&cmd_buffer->batch, GEN8_3DSTATE_DEPTH_BUFFER,
+      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_DEPTH_BUFFER),
          .SurfaceType = SURFTYPE_2D,
-         .DepthWriteEnable = iview->format->depth_format,
+         .DepthWriteEnable = anv_format->depth_format,
          .StencilWriteEnable = has_stencil,
          .HierarchicalDepthBufferEnable = false,
-         .SurfaceFormat = iview->format->depth_format,
-         .SurfacePitch = image->depth_surface.stride - 1,
+         .SurfaceFormat = anv_format->depth_format,
+         .SurfacePitch = image->depth_surface.isl.row_pitch - 1,
          .SurfaceBaseAddress = {
             .bo = image->bo,
             .offset = image->depth_surface.offset,
@@ -523,9 +739,9 @@ gen8_cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer)
          .LOD = 0,
          .Depth = 1 - 1,
          .MinimumArrayElement = 0,
-         .DepthBufferObjectControlState = GEN8_MOCS,
+         .DepthBufferObjectControlState = GENX(MOCS),
          .RenderTargetViewExtent = 1 - 1,
-         .SurfaceQPitch = image->depth_surface.qpitch >> 2);
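+         /* SurfaceQPitch is programmed in units of 4 rows, hence the >> 2. */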
+         .SurfaceQPitch = isl_surf_get_array_pitch_el_rows(&image->depth_surface.isl) >> 2);
    } else {
       /* Even when no depth buffer is present, the hardware requires that
        * 3DSTATE_DEPTH_BUFFER be programmed correctly. The Broadwell PRM says:
@@ -544,7 +760,7 @@ gen8_cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer)
        * actual framebuffer's width and height, even when neither depth buffer
        * nor stencil buffer is present.
        */
-      anv_batch_emit(&cmd_buffer->batch, GEN8_3DSTATE_DEPTH_BUFFER,
+      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_DEPTH_BUFFER),
          .SurfaceType = SURFTYPE_2D,
          .SurfaceFormat = D16_UNORM,
          .Width = fb->width - 1,
@@ -554,59 +770,65 @@ gen8_cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer)
 
    /* Emit 3DSTATE_STENCIL_BUFFER */
    if (has_stencil) {
-      anv_batch_emit(&cmd_buffer->batch, GEN8_3DSTATE_STENCIL_BUFFER,
+      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_STENCIL_BUFFER),
          .StencilBufferEnable = true,
-         .StencilBufferObjectControlState = GEN8_MOCS,
+         .StencilBufferObjectControlState = GENX(MOCS),
 
          /* Stencil buffers have strange pitch. The PRM says:
           *
           *    The pitch must be set to 2x the value computed based on width,
           *    as the stencil buffer is stored with two rows interleaved.
           */
-         .SurfacePitch = 2 * image->stencil_surface.stride - 1,
+         .SurfacePitch = 2 * image->stencil_surface.isl.row_pitch - 1,
 
          .SurfaceBaseAddress = {
             .bo = image->bo,
             .offset = image->offset + image->stencil_surface.offset,
          },
-         .SurfaceQPitch = image->stencil_surface.stride >> 2);
+         .SurfaceQPitch = isl_surf_get_array_pitch_el_rows(&image->stencil_surface.isl) >> 2);
    } else {
-      anv_batch_emit(&cmd_buffer->batch, GEN8_3DSTATE_STENCIL_BUFFER);
+      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_STENCIL_BUFFER));
    }
 
    /* Disable hierarchical depth buffers. */
-   anv_batch_emit(&cmd_buffer->batch, GEN8_3DSTATE_HIER_DEPTH_BUFFER);
+   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_HIER_DEPTH_BUFFER));
 
    /* Clear the clear params. */
-   anv_batch_emit(&cmd_buffer->batch, GEN8_3DSTATE_CLEAR_PARAMS);
+   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CLEAR_PARAMS));
 }
 
+/**
+ * @see anv_cmd_buffer_set_subpass()
+ */
 void
-gen8_cmd_buffer_begin_subpass(struct anv_cmd_buffer *cmd_buffer,
+genX(cmd_buffer_set_subpass)(struct anv_cmd_buffer *cmd_buffer,
                              struct anv_subpass *subpass)
 {
    cmd_buffer->state.subpass = subpass;
 
    cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT;
 
-   gen8_cmd_buffer_emit_depth_stencil(cmd_buffer);
+   cmd_buffer_emit_depth_stencil(cmd_buffer);
 }
 
-void gen8_CmdBeginRenderPass(
-    VkCmdBuffer                                 cmdBuffer,
+void genX(CmdBeginRenderPass)(
+    VkCommandBuffer                             commandBuffer,
     const VkRenderPassBeginInfo*                pRenderPassBegin,
-    VkRenderPassContents                        contents)
+    VkSubpassContents                           contents)
 {
-   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, cmdBuffer);
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
    ANV_FROM_HANDLE(anv_render_pass, pass, pRenderPassBegin->renderPass);
    ANV_FROM_HANDLE(anv_framebuffer, framebuffer, pRenderPassBegin->framebuffer);
 
    cmd_buffer->state.framebuffer = framebuffer;
    cmd_buffer->state.pass = pass;
+   anv_cmd_state_setup_attachments(cmd_buffer, pRenderPassBegin);
+
+   flush_pipeline_select_3d(cmd_buffer);
 
    const VkRect2D *render_area = &pRenderPassBegin->renderArea;
 
-   anv_batch_emit(&cmd_buffer->batch, GEN8_3DSTATE_DRAWING_RECTANGLE,
+   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_DRAWING_RECTANGLE),
                   .ClippedDrawingRectangleYMin = render_area->offset.y,
                   .ClippedDrawingRectangleXMin = render_area->offset.x,
                   .ClippedDrawingRectangleYMax =
@@ -616,34 +838,36 @@ void gen8_CmdBeginRenderPass(
                   .DrawingRectangleOriginY = 0,
                   .DrawingRectangleOriginX = 0);
 
-   anv_cmd_buffer_clear_attachments(cmd_buffer, pass,
-                                    pRenderPassBegin->pClearValues);
-
-   gen8_cmd_buffer_begin_subpass(cmd_buffer, pass->subpasses);
+   genX(cmd_buffer_set_subpass)(cmd_buffer, pass->subpasses);
+   anv_cmd_buffer_clear_subpass(cmd_buffer);
 }
 
-void gen8_CmdNextSubpass(
-    VkCmdBuffer                                 cmdBuffer,
-    VkRenderPassContents                        contents)
+void genX(CmdNextSubpass)(
+    VkCommandBuffer                             commandBuffer,
+    VkSubpassContents                           contents)
 {
-   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, cmdBuffer);
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
 
-   assert(cmd_buffer->level == VK_CMD_BUFFER_LEVEL_PRIMARY);
+   assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
 
-   gen8_cmd_buffer_begin_subpass(cmd_buffer, cmd_buffer->state.subpass + 1);
+   anv_cmd_buffer_resolve_subpass(cmd_buffer);
+   genX(cmd_buffer_set_subpass)(cmd_buffer, cmd_buffer->state.subpass + 1);
+   anv_cmd_buffer_clear_subpass(cmd_buffer);
 }
 
-void gen8_CmdEndRenderPass(
-    VkCmdBuffer                                 cmdBuffer)
+void genX(CmdEndRenderPass)(
+    VkCommandBuffer                             commandBuffer)
 {
-   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, cmdBuffer);
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+
+   anv_cmd_buffer_resolve_subpass(cmd_buffer);
 
    /* Emit a flushing pipe control at the end of a pass.  This is kind of a
     * hack but it ensures that render targets always actually get written.
     * Eventually, we should do flushing based on image format transitions
     * or something of that nature.
     */
-   anv_batch_emit(&cmd_buffer->batch, GEN8_PIPE_CONTROL,
+   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL),
                   .PostSyncOperation = NoWrite,
                   .RenderTargetCacheFlushEnable = true,
                   .InstructionCacheInvalidateEnable = true,
@@ -657,25 +881,50 @@ static void
 emit_ps_depth_count(struct anv_batch *batch,
                     struct anv_bo *bo, uint32_t offset)
 {
-   anv_batch_emit(batch, GEN8_PIPE_CONTROL,
+   anv_batch_emit(batch, GENX(PIPE_CONTROL),
                   .DestinationAddressType = DAT_PPGTT,
                   .PostSyncOperation = WritePSDepthCount,
-                  .Address = { bo, offset });  /* FIXME: This is only lower 32 bits */
+                  .DepthStallEnable = true,
+                  .Address = { bo, offset });
 }
 
-void gen8_CmdBeginQuery(
-    VkCmdBuffer                                 cmdBuffer,
+static void
+emit_query_availability(struct anv_batch *batch,
+                        struct anv_bo *bo, uint32_t offset)
+{
+   anv_batch_emit(batch, GENX(PIPE_CONTROL),
+                  .DestinationAddressType = DAT_PPGTT,
+                  .PostSyncOperation = WriteImmediateData,
+                  .Address = { bo, offset },
+                  .ImmediateData = 1);
+}
+
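+/* Query pool slots are laid out as { begin, end, available }, each 64 bits:
+ * CmdBeginQuery writes the begin value at offset 0, CmdEndQuery writes the
+ * end value at offset 8 and then the availability word at offset 16.
+ */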
+void genX(CmdBeginQuery)(
+    VkCommandBuffer                             commandBuffer,
     VkQueryPool                                 queryPool,
-    uint32_t                                    slot,
+    uint32_t                                    query,
     VkQueryControlFlags                         flags)
 {
-   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, cmdBuffer);
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
    ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
 
+   /* Workaround: When meta uses the pipeline with the VS disabled, it seems
+    * that the pipelining of the depth write breaks. What we see is that
+    * samples from the render pass clear leak into the first query
+    * immediately after the clear. Doing a PIPE_CONTROL with a post-sync
+    * operation and DepthStallEnable seems to work around the issue.
+    */
+   if (cmd_buffer->state.need_query_wa) {
+      cmd_buffer->state.need_query_wa = false;
+      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL),
+                     .DepthCacheFlushEnable = true,
+                     .DepthStallEnable = true);
+   }
+
    switch (pool->type) {
    case VK_QUERY_TYPE_OCCLUSION:
       emit_ps_depth_count(&cmd_buffer->batch, &pool->bo,
-                                    slot * sizeof(struct anv_query_pool_slot));
+                          query * sizeof(struct anv_query_pool_slot));
       break;
 
    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
@@ -684,18 +933,21 @@ void gen8_CmdBeginQuery(
    }
 }
 
-void gen8_CmdEndQuery(
-    VkCmdBuffer                                 cmdBuffer,
+void genX(CmdEndQuery)(
+    VkCommandBuffer                             commandBuffer,
     VkQueryPool                                 queryPool,
-    uint32_t                                    slot)
+    uint32_t                                    query)
 {
-   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, cmdBuffer);
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
    ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
 
    switch (pool->type) {
    case VK_QUERY_TYPE_OCCLUSION:
       emit_ps_depth_count(&cmd_buffer->batch, &pool->bo,
-                          slot * sizeof(struct anv_query_pool_slot) + 8);
+                          query * sizeof(struct anv_query_pool_slot) + 8);
+
+      emit_query_availability(&cmd_buffer->batch, &pool->bo,
+                              query * sizeof(struct anv_query_pool_slot) + 16);
       break;
 
    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
@@ -706,37 +958,38 @@ void gen8_CmdEndQuery(
 
 #define TIMESTAMP 0x2358
 
-void gen8_CmdWriteTimestamp(
-    VkCmdBuffer                                 cmdBuffer,
-    VkTimestampType                             timestampType,
-    VkBuffer                                    destBuffer,
-    VkDeviceSize                                destOffset)
+void genX(CmdWriteTimestamp)(
+    VkCommandBuffer                             commandBuffer,
+    VkPipelineStageFlagBits                     pipelineStage,
+    VkQueryPool                                 queryPool,
+    uint32_t                                    query)
 {
-   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, cmdBuffer);
-   ANV_FROM_HANDLE(anv_buffer, buffer, destBuffer);
-   struct anv_bo *bo = buffer->bo;
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
+   uint32_t offset = query * sizeof(struct anv_query_pool_slot);
 
-   switch (timestampType) {
-   case VK_TIMESTAMP_TYPE_TOP:
-      anv_batch_emit(&cmd_buffer->batch, GEN8_MI_STORE_REGISTER_MEM,
+   assert(pool->type == VK_QUERY_TYPE_TIMESTAMP);
+
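+   /* TOP_OF_PIPE samples the 64-bit TIMESTAMP register directly from the
+    * command streamer (two 32-bit MI_STORE_REGISTER_MEM writes); any other
+    * stage uses a PIPE_CONTROL post-sync write, which is inherently
+    * bottom-of-pipe.
+    */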
+   switch (pipelineStage) {
+   case VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT:
+      anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM),
                      .RegisterAddress = TIMESTAMP,
-                     .MemoryAddress = { bo, buffer->offset + destOffset });
-      anv_batch_emit(&cmd_buffer->batch, GEN8_MI_STORE_REGISTER_MEM,
+                     .MemoryAddress = { &pool->bo, offset });
+      anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM),
                      .RegisterAddress = TIMESTAMP + 4,
-                     .MemoryAddress = { bo, buffer->offset + destOffset + 4 });
+                     .MemoryAddress = { &pool->bo, offset + 4 });
       break;
 
-   case VK_TIMESTAMP_TYPE_BOTTOM:
-      anv_batch_emit(&cmd_buffer->batch, GEN8_PIPE_CONTROL,
+   default:
+      /* Everything else is bottom-of-pipe */
+      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL),
                      .DestinationAddressType = DAT_PPGTT,
                      .PostSyncOperation = WriteTimestamp,
-                     .Address = /* FIXME: This is only lower 32 bits */
-                        { bo, buffer->offset + destOffset });
-      break;
-
-   default:
+                     .Address = { &pool->bo, offset });
       break;
    }
+
+   emit_query_availability(&cmd_buffer->batch, &pool->bo, offset + 16);
 }
 
 #define alu_opcode(v)   __gen_field((v),  20, 31)
@@ -775,298 +1028,161 @@ static void
 emit_load_alu_reg_u64(struct anv_batch *batch, uint32_t reg,
                       struct anv_bo *bo, uint32_t offset)
 {
-   anv_batch_emit(batch, GEN8_MI_LOAD_REGISTER_MEM,
+   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM),
                   .RegisterAddress = reg,
                   .MemoryAddress = { bo, offset });
-   anv_batch_emit(batch, GEN8_MI_LOAD_REGISTER_MEM,
+   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM),
                   .RegisterAddress = reg + 4,
                   .MemoryAddress = { bo, offset + 4 });
 }
 
-void gen8_CmdCopyQueryPoolResults(
-    VkCmdBuffer                                 cmdBuffer,
+static void
+store_query_result(struct anv_batch *batch, uint32_t reg,
+                   struct anv_bo *bo, uint32_t offset, VkQueryResultFlags flags)
+{
+   anv_batch_emit(batch, GENX(MI_STORE_REGISTER_MEM),
+                  .RegisterAddress = reg,
+                  .MemoryAddress = { bo, offset });
+
+   if (flags & VK_QUERY_RESULT_64_BIT)
+      anv_batch_emit(batch, GENX(MI_STORE_REGISTER_MEM),
+                     .RegisterAddress = reg + 4,
+                     .MemoryAddress = { bo, offset + 4 });
+}
+
+void genX(CmdCopyQueryPoolResults)(
+    VkCommandBuffer                             commandBuffer,
     VkQueryPool                                 queryPool,
-    uint32_t                                    startQuery,
+    uint32_t                                    firstQuery,
     uint32_t                                    queryCount,
     VkBuffer                                    destBuffer,
     VkDeviceSize                                destOffset,
     VkDeviceSize                                destStride,
     VkQueryResultFlags                          flags)
 {
-   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, cmdBuffer);
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
    ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
    ANV_FROM_HANDLE(anv_buffer, buffer, destBuffer);
    uint32_t slot_offset, dst_offset;
 
-   if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
-      /* Where is the availabilty info supposed to go? */
-      anv_finishme("VK_QUERY_RESULT_WITH_AVAILABILITY_BIT");
-      return;
-   }
-
-   assert(pool->type == VK_QUERY_TYPE_OCCLUSION);
-
-   /* FIXME: If we're not waiting, should we just do this on the CPU? */
    if (flags & VK_QUERY_RESULT_WAIT_BIT)
-      anv_batch_emit(&cmd_buffer->batch, GEN8_PIPE_CONTROL,
+      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL),
                      .CommandStreamerStallEnable = true,
                      .StallAtPixelScoreboard = true);
 
    dst_offset = buffer->offset + destOffset;
    for (uint32_t i = 0; i < queryCount; i++) {
 
-      slot_offset = (startQuery + i) * sizeof(struct anv_query_pool_slot);
-
-      emit_load_alu_reg_u64(&cmd_buffer->batch, CS_GPR(0), &pool->bo, slot_offset);
-      emit_load_alu_reg_u64(&cmd_buffer->batch, CS_GPR(1), &pool->bo, slot_offset + 8);
-
-      /* FIXME: We need to clamp the result for 32 bit. */
+      slot_offset = (firstQuery + i) * sizeof(struct anv_query_pool_slot);
+      switch (pool->type) {
+      case VK_QUERY_TYPE_OCCLUSION:
+         emit_load_alu_reg_u64(&cmd_buffer->batch,
+                               CS_GPR(0), &pool->bo, slot_offset);
+         emit_load_alu_reg_u64(&cmd_buffer->batch,
+                               CS_GPR(1), &pool->bo, slot_offset + 8);
+
+         /* FIXME: We need to clamp the result for 32 bit. */
+
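+         /* MI_MATH computes end - begin on the command streamer ALU: R1 and
+          * R0 hold the end/begin depth counts loaded above, SUB leaves
+          * R1 - R0 in the accumulator, and the result is parked in R2 for
+          * store_query_result() below.
+          */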
+         uint32_t *dw = anv_batch_emitn(&cmd_buffer->batch, 5, GENX(MI_MATH));
+         dw[1] = alu(OPCODE_LOAD, OPERAND_SRCA, OPERAND_R1);
+         dw[2] = alu(OPCODE_LOAD, OPERAND_SRCB, OPERAND_R0);
+         dw[3] = alu(OPCODE_SUB, 0, 0);
+         dw[4] = alu(OPCODE_STORE, OPERAND_R2, OPERAND_ACCU);
+         break;
 
-      uint32_t *dw = anv_batch_emitn(&cmd_buffer->batch, 5, GEN8_MI_MATH);
-      dw[1] = alu(OPCODE_LOAD, OPERAND_SRCA, OPERAND_R1);
-      dw[2] = alu(OPCODE_LOAD, OPERAND_SRCB, OPERAND_R0);
-      dw[3] = alu(OPCODE_SUB, 0, 0);
-      dw[4] = alu(OPCODE_STORE, OPERAND_R2, OPERAND_ACCU);
+      case VK_QUERY_TYPE_TIMESTAMP:
+         emit_load_alu_reg_u64(&cmd_buffer->batch,
+                               CS_GPR(2), &pool->bo, slot_offset);
+         break;
 
-      anv_batch_emit(&cmd_buffer->batch, GEN8_MI_STORE_REGISTER_MEM,
-                     .RegisterAddress = CS_GPR(2),
-                     /* FIXME: This is only lower 32 bits */
-                     .MemoryAddress = { buffer->bo, dst_offset });
+      default:
+         unreachable("unhandled query type");
+      }
 
-      if (flags & VK_QUERY_RESULT_64_BIT)
-         anv_batch_emit(&cmd_buffer->batch, GEN8_MI_STORE_REGISTER_MEM,
-                        .RegisterAddress = CS_GPR(2) + 4,
-                        /* FIXME: This is only lower 32 bits */
-                        .MemoryAddress = { buffer->bo, dst_offset + 4 });
+      store_query_result(&cmd_buffer->batch,
+                         CS_GPR(2), buffer->bo, dst_offset, flags);
+
+      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
+         emit_load_alu_reg_u64(&cmd_buffer->batch, CS_GPR(0),
+                               &pool->bo, slot_offset + 16);
+         if (flags & VK_QUERY_RESULT_64_BIT)
+            store_query_result(&cmd_buffer->batch,
+                               CS_GPR(0), buffer->bo, dst_offset + 8, flags);
+         else
+            store_query_result(&cmd_buffer->batch,
+                               CS_GPR(0), buffer->bo, dst_offset + 4, flags);
+      }
 
       dst_offset += destStride;
    }
 }
 
-void
-gen8_cmd_buffer_emit_state_base_address(struct anv_cmd_buffer *cmd_buffer)
+void genX(CmdSetEvent)(
+    VkCommandBuffer                             commandBuffer,
+    VkEvent                                     _event,
+    VkPipelineStageFlags                        stageMask)
 {
-   struct anv_device *device = cmd_buffer->device;
-   struct anv_bo *scratch_bo = NULL;
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+   ANV_FROM_HANDLE(anv_event, event, _event);
 
-   cmd_buffer->state.scratch_size =
-      anv_block_pool_size(&device->scratch_block_pool);
-   if (cmd_buffer->state.scratch_size > 0)
-      scratch_bo = &device->scratch_block_pool.bo;
+   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL),
+                  .DestinationAddressType = DAT_PPGTT,
+                  .PostSyncOperation = WriteImmediateData,
+                  .Address = {
+                     &cmd_buffer->device->dynamic_state_block_pool.bo,
+                     event->state.offset
+                   },
+                  .ImmediateData = VK_EVENT_SET);
+}
 
-   /* Emit a render target cache flush.
-    *
-    * This isn't documented anywhere in the PRM.  However, it seems to be
-    * necessary prior to changing the surface state base address.  Without
-    * this, we get GPU hangs when using multi-level command buffers which
-    * clear depth, reset state base address, and then go render stuff.
-    */
-   anv_batch_emit(&cmd_buffer->batch, GEN8_PIPE_CONTROL,
-                  .RenderTargetCacheFlushEnable = true);
-
-   anv_batch_emit(&cmd_buffer->batch, GEN8_STATE_BASE_ADDRESS,
-                  .GeneralStateBaseAddress = { scratch_bo, 0 },
-                  .GeneralStateMemoryObjectControlState = GEN8_MOCS,
-                  .GeneralStateBaseAddressModifyEnable = true,
-                  .GeneralStateBufferSize = 0xfffff,
-                  .GeneralStateBufferSizeModifyEnable = true,
-
-                  .SurfaceStateBaseAddress = anv_cmd_buffer_surface_base_address(cmd_buffer),
-                  .SurfaceStateMemoryObjectControlState = GEN8_MOCS,
-                  .SurfaceStateBaseAddressModifyEnable = true,
-
-                  .DynamicStateBaseAddress = { &device->dynamic_state_block_pool.bo, 0 },
-                  .DynamicStateMemoryObjectControlState = GEN8_MOCS,
-                  .DynamicStateBaseAddressModifyEnable = true,
-                  .DynamicStateBufferSize = 0xfffff,
-                  .DynamicStateBufferSizeModifyEnable = true,
-
-                  .IndirectObjectBaseAddress = { NULL, 0 },
-                  .IndirectObjectMemoryObjectControlState = GEN8_MOCS,
-                  .IndirectObjectBaseAddressModifyEnable = true,
-                  .IndirectObjectBufferSize = 0xfffff,
-                  .IndirectObjectBufferSizeModifyEnable = true,
-
-                  .InstructionBaseAddress = { &device->instruction_block_pool.bo, 0 },
-                  .InstructionMemoryObjectControlState = GEN8_MOCS,
-                  .InstructionBaseAddressModifyEnable = true,
-                  .InstructionBufferSize = 0xfffff,
-                  .InstructionBuffersizeModifyEnable = true);
-
-   /* After re-setting the surface state base address, we have to do some
-    * cache flushing so that the sampler engine will pick up the new
-    * SURFACE_STATE objects and binding tables. From the Broadwell PRM,
-    * Shared Function > 3D Sampler > State > State Caching (page 96):
-    *
-    *    Coherency with system memory in the state cache, like the texture
-    *    cache is handled partially by software. It is expected that the
-    *    command stream or shader will issue Cache Flush operation or
-    *    Cache_Flush sampler message to ensure that the L1 cache remains
-    *    coherent with system memory.
-    *
-    *    [...]
-    *
-    *    Whenever the value of the Dynamic_State_Base_Addr,
-    *    Surface_State_Base_Addr are altered, the L1 state cache must be
-    *    invalidated to ensure the new surface or sampler state is fetched
-    *    from system memory.
-    *
-    * The PIPE_CONTROL command has a "State Cache Invalidation Enable" bit
-    * which, according to the PIPE_CONTROL instruction documentation in the
-    * Broadwell PRM:
-    *
-    *    Setting this bit is independent of any other bit in this packet.
-    *    This bit controls the invalidation of the L1 and L2 state caches
-    *    at the top of the pipe i.e. at the parsing time.
-    *
-    * Unfortunately, experimentation seems to indicate that state cache
-    * invalidation through a PIPE_CONTROL does nothing whatsoever with
-    * regard to surface state and binding tables.  Instead, it seems that
-    * invalidating the texture cache is what is actually needed.
-    *
-    * XXX:  As far as we have been able to determine through
-    * experimentation, flushing the texture cache appears to be
-    * sufficient.  The theory here is that all of the sampling/rendering
-    * units cache the binding table in the texture cache.  However, we have
-    * yet to be able to actually confirm this.
-    */
-   anv_batch_emit(&cmd_buffer->batch, GEN8_PIPE_CONTROL,
-                  .TextureCacheInvalidationEnable = true);
+void genX(CmdResetEvent)(
+    VkCommandBuffer                             commandBuffer,
+    VkEvent                                     _event,
+    VkPipelineStageFlags                        stageMask)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+   ANV_FROM_HANDLE(anv_event, event, _event);
+
+   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL),
+                  .DestinationAddressType = DAT_PPGTT,
+                  .PostSyncOperation = WriteImmediateData,
+                  .Address = {
+                     &cmd_buffer->device->dynamic_state_block_pool.bo,
+                     event->state.offset
+                   },
+                  .ImmediateData = VK_EVENT_RESET);
 }
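Editor's note: CmdSetEvent and CmdResetEvent differ only in the immediate value written; together with CmdWaitEvents below they implement Vulkan split barriers. An application-side usage sketch (cmd and event are assumed to be valid handles recorded elsewhere):

   /* Record a split barrier: signal once color output is done, wait later. */
   vkCmdSetEvent(cmd, event, VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT);
   /* ... record independent work that may overlap the producer ... */
   vkCmdWaitEvents(cmd, 1, &event,
                   VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT,  /* src */
                   VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT,          /* dst */
                   0, NULL, 0, NULL, 0, NULL);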
 
-void gen8_CmdPipelineBarrier(
-    VkCmdBuffer                                 cmdBuffer,
+void genX(CmdWaitEvents)(
+    VkCommandBuffer                             commandBuffer,
+    uint32_t                                    eventCount,
+    const VkEvent*                              pEvents,
     VkPipelineStageFlags                        srcStageMask,
     VkPipelineStageFlags                        destStageMask,
-    VkBool32                                    byRegion,
-    uint32_t                                    memBarrierCount,
-    const void* const*                          ppMemBarriers)
+    uint32_t                                    memoryBarrierCount,
+    const VkMemoryBarrier*                      pMemoryBarriers,
+    uint32_t                                    bufferMemoryBarrierCount,
+    const VkBufferMemoryBarrier*                pBufferMemoryBarriers,
+    uint32_t                                    imageMemoryBarrierCount,
+    const VkImageMemoryBarrier*                 pImageMemoryBarriers)
 {
-   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, cmdBuffer);
-   uint32_t b, *dw;
-
-   struct GEN8_PIPE_CONTROL cmd = {
-      GEN8_PIPE_CONTROL_header,
-      .PostSyncOperation = NoWrite,
-   };
-
-   /* XXX: I think waitEvent is a no-op on our HW.  We should verify that. */
-
-   if (anv_clear_mask(&srcStageMask, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT)) {
-      /* This is just what PIPE_CONTROL does */
-   }
-
-   if (anv_clear_mask(&srcStageMask,
-                      VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT |
-                      VK_PIPELINE_STAGE_VERTEX_INPUT_BIT |
-                      VK_PIPELINE_STAGE_VERTEX_SHADER_BIT |
-                      VK_PIPELINE_STAGE_TESS_CONTROL_SHADER_BIT |
-                      VK_PIPELINE_STAGE_TESS_EVALUATION_SHADER_BIT |
-                      VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT |
-                      VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT |
-                      VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT |
-                      VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT |
-                      VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT)) {
-      cmd.StallAtPixelScoreboard = true;
-   }
-
-
-   if (anv_clear_mask(&srcStageMask,
-                      VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT |
-                      VK_PIPELINE_STAGE_TRANSFER_BIT)) {
-      cmd.CommandStreamerStallEnable = true;
-   }
-
-   if (anv_clear_mask(&srcStageMask, VK_PIPELINE_STAGE_HOST_BIT)) {
-      anv_finishme("VK_PIPE_EVENT_CPU_SIGNAL_BIT");
-   }
-
-   /* On our hardware, all stages will wait for execution as needed. */
-   (void)destStageMask;
-
-   /* We checked all known VkPipelineStageFlags bits. */
-   anv_assert(srcStageMask == 0);
-
-   /* XXX: Right now, we're really dumb and just flush whatever categories
-    * the app asks for.  One of these days we may make this a bit better
-    * but right now that's all the hardware allows for in most areas.
-    */
-   VkMemoryOutputFlags out_flags = 0;
-   VkMemoryInputFlags in_flags = 0;
-
-   for (uint32_t i = 0; i < memBarrierCount; i++) {
-      const struct anv_common *common = ppMemBarriers[i];
-      switch (common->sType) {
-      case VK_STRUCTURE_TYPE_MEMORY_BARRIER: {
-         ANV_COMMON_TO_STRUCT(VkMemoryBarrier, barrier, common);
-         out_flags |= barrier->outputMask;
-         in_flags |= barrier->inputMask;
-         break;
-      }
-      case VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER: {
-         ANV_COMMON_TO_STRUCT(VkBufferMemoryBarrier, barrier, common);
-         out_flags |= barrier->outputMask;
-         in_flags |= barrier->inputMask;
-         break;
-      }
-      case VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER: {
-         ANV_COMMON_TO_STRUCT(VkImageMemoryBarrier, barrier, common);
-         out_flags |= barrier->outputMask;
-         in_flags |= barrier->inputMask;
-         break;
-      }
-      default:
-         unreachable("Invalid memory barrier type");
-      }
-   }
-
-   for_each_bit(b, out_flags) {
-      switch ((VkMemoryOutputFlags)(1 << b)) {
-      case VK_MEMORY_OUTPUT_HOST_WRITE_BIT:
-         break; /* FIXME: Little-core systems */
-      case VK_MEMORY_OUTPUT_SHADER_WRITE_BIT:
-         cmd.DCFlushEnable = true;
-         break;
-      case VK_MEMORY_OUTPUT_COLOR_ATTACHMENT_BIT:
-         cmd.RenderTargetCacheFlushEnable = true;
-         break;
-      case VK_MEMORY_OUTPUT_DEPTH_STENCIL_ATTACHMENT_BIT:
-         cmd.DepthCacheFlushEnable = true;
-         break;
-      case VK_MEMORY_OUTPUT_TRANSFER_BIT:
-         cmd.RenderTargetCacheFlushEnable = true;
-         cmd.DepthCacheFlushEnable = true;
-         break;
-      default:
-         unreachable("Invalid memory output flag");
-      }
-   }
-
-   for_each_bit(b, in_flags) {
-      switch ((VkMemoryInputFlags)(1 << b)) {
-      case VK_MEMORY_INPUT_HOST_READ_BIT:
-         break; /* FIXME: Little-core systems */
-      case VK_MEMORY_INPUT_INDIRECT_COMMAND_BIT:
-      case VK_MEMORY_INPUT_INDEX_FETCH_BIT:
-      case VK_MEMORY_INPUT_VERTEX_ATTRIBUTE_FETCH_BIT:
-         cmd.VFCacheInvalidationEnable = true;
-         break;
-      case VK_MEMORY_INPUT_UNIFORM_READ_BIT:
-         cmd.ConstantCacheInvalidationEnable = true;
-         /* fallthrough */
-      case VK_MEMORY_INPUT_SHADER_READ_BIT:
-         cmd.DCFlushEnable = true;
-         cmd.TextureCacheInvalidationEnable = true;
-         break;
-      case VK_MEMORY_INPUT_COLOR_ATTACHMENT_BIT:
-      case VK_MEMORY_INPUT_DEPTH_STENCIL_ATTACHMENT_BIT:
-         break; /* XXX: Hunh? */
-      case VK_MEMORY_INPUT_TRANSFER_BIT:
-         cmd.TextureCacheInvalidationEnable = true;
-         break;
-      }
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+   for (uint32_t i = 0; i < eventCount; i++) {
+      ANV_FROM_HANDLE(anv_event, event, pEvents[i]);
+
+      anv_batch_emit(&cmd_buffer->batch, GENX(MI_SEMAPHORE_WAIT),
+                     .WaitMode = PollingMode,
+                     .CompareOperation = SAD_EQUAL_SDD,
+                     .SemaphoreDataDword = VK_EVENT_SET,
+                     .SemaphoreAddress = {
+                        &cmd_buffer->device->dynamic_state_block_pool.bo,
+                        event->state.offset
+                     });
    }
 
-   dw = anv_batch_emit_dwords(&cmd_buffer->batch, GEN8_PIPE_CONTROL_length);
-   GEN8_PIPE_CONTROL_pack(&cmd_buffer->batch, dw, &cmd);
+   genX(CmdPipelineBarrier)(commandBuffer, srcStageMask, destStageMask,
+                            false, /* byRegion */
+                            memoryBarrierCount, pMemoryBarriers,
+                            bufferMemoryBarrierCount, pBufferMemoryBarriers,
+                            imageMemoryBarrierCount, pImageMemoryBarriers);
 }
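Editor's note: MI_SEMAPHORE_WAIT in polling mode stalls command-stream parsing until the dword at SemaphoreAddress compares equal (SAD_EQUAL_SDD) to SemaphoreDataDword, so the wait consumes no post-sync machinery at all; the trailing CmdPipelineBarrier call then supplies the memory-visibility half of vkCmdWaitEvents. A CPU-equivalent sketch of what the command streamer's poll amounts to:

   /* Sketch: software analogue of the polling-mode semaphore wait above. */
   static void
   sketch_semaphore_wait(volatile uint32_t *slot)
   {
      while (*slot != VK_EVENT_SET)
         ; /* busy-poll until CmdSetEvent's post-sync write lands */
   }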