anv: Rework vkCmdPipelineBarrier()
diff --git a/src/vulkan/genX_cmd_buffer.c b/src/vulkan/genX_cmd_buffer.c
index ac1ab822b494ab51df0d3e04de92038f7041daf9..2552cd1befec8e29f6f66931a8b8774fb7e6205d 100644
--- a/src/vulkan/genX_cmd_buffer.c
+++ b/src/vulkan/genX_cmd_buffer.c
@@ -48,7 +48,7 @@ genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer)
       scratch_bo = &device->scratch_block_pool.bo;
 
 /* XXX: Do we need this on more than just BDW? */
-#if (ANV_GEN == 8)
+#if (ANV_GEN >= 8)
    /* Emit a render target cache flush.
     *
     * This isn't documented anywhere in the PRM.  However, it seems to be
@@ -56,7 +56,7 @@ genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer)
     * this, we get GPU hangs when using multi-level command buffers which
     * clear depth, reset state base address, and then go render stuff.
     */
-   anv_batch_emit(&cmd_buffer->batch, GEN8_PIPE_CONTROL,
+   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL),
                   .RenderTargetCacheFlushEnable = true);
 #endif
 
@@ -143,140 +143,589 @@ void genX(CmdPipelineBarrier)(
     VkPipelineStageFlags                        srcStageMask,
     VkPipelineStageFlags                        destStageMask,
     VkBool32                                    byRegion,
-    uint32_t                                    memBarrierCount,
-    const void* const*                          ppMemBarriers)
+    uint32_t                                    memoryBarrierCount,
+    const VkMemoryBarrier*                      pMemoryBarriers,
+    uint32_t                                    bufferMemoryBarrierCount,
+    const VkBufferMemoryBarrier*                pBufferMemoryBarriers,
+    uint32_t                                    imageMemoryBarrierCount,
+    const VkImageMemoryBarrier*                 pImageMemoryBarriers)
 {
    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
    uint32_t b, *dw;
 
-   struct GENX(PIPE_CONTROL) cmd = {
-      GENX(PIPE_CONTROL_header),
-      .PostSyncOperation = NoWrite,
-   };
-
-   /* XXX: I think waitEvent is a no-op on our HW.  We should verify that. */
-
-   if (anv_clear_mask(&srcStageMask, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT)) {
-      /* This is just what PIPE_CONTROL does */
-   }
+   /* XXX: Right now, we're really dumb and just flush whatever categories
+    * the app asks for.  One of these days we may make this a bit better
+    * but right now that's all the hardware allows for in most areas.
+    */
+   VkAccessFlags src_flags = 0;
+   VkAccessFlags dst_flags = 0;
 
-   if (anv_clear_mask(&srcStageMask,
-                      VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT |
-                      VK_PIPELINE_STAGE_VERTEX_INPUT_BIT |
-                      VK_PIPELINE_STAGE_VERTEX_SHADER_BIT |
-                      VK_PIPELINE_STAGE_TESSELLATION_CONTROL_SHADER_BIT |
-                      VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT |
-                      VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT |
-                      VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT |
-                      VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT |
-                      VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT |
-                      VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT)) {
-      cmd.StallAtPixelScoreboard = true;
+   for (uint32_t i = 0; i < memoryBarrierCount; i++) {
+      src_flags |= pMemoryBarriers[i].srcAccessMask;
+      dst_flags |= pMemoryBarriers[i].dstAccessMask;
    }
 
-   if (anv_clear_mask(&srcStageMask,
-                      VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT |
-                      VK_PIPELINE_STAGE_TRANSFER_BIT)) {
-      cmd.CommandStreamerStallEnable = true;
+   for (uint32_t i = 0; i < bufferMemoryBarrierCount; i++) {
+      src_flags |= pBufferMemoryBarriers[i].srcAccessMask;
+      dst_flags |= pBufferMemoryBarriers[i].dstAccessMask;
    }
 
-   if (anv_clear_mask(&srcStageMask, VK_PIPELINE_STAGE_HOST_BIT)) {
-      anv_finishme("VK_PIPE_EVENT_CPU_SIGNAL_BIT");
+   for (uint32_t i = 0; i < imageMemoryBarrierCount; i++) {
+      src_flags |= pImageMemoryBarriers[i].srcAccessMask;
+      dst_flags |= pImageMemoryBarriers[i].dstAccessMask;
    }
 
-   /* On our hardware, all stages will wait for execution as needed. */
-   (void)destStageMask;
+   /* Mask to only the source access flags we care about */
+   const uint32_t src_mask =
+      VK_ACCESS_SHADER_WRITE_BIT |
+      VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT |
+      VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT |
+      VK_ACCESS_TRANSFER_WRITE_BIT;
 
-   /* We checked all known VkPipeEventFlags. */
-   anv_assert(srcStageMask == 0);
+   src_flags = src_flags & src_mask;
 
-   /* XXX: Right now, we're really dumb and just flush whatever categories
-    * the app asks for.  One of these days we may make this a bit better
-    * but right now that's all the hardware allows for in most areas.
-    */
-   VkAccessFlags src_flags = 0;
-   VkAccessFlags dst_flags = 0;
+   /* Mask to only the destination access flags we care about */
+   const uint32_t dst_mask =
+      VK_ACCESS_INDIRECT_COMMAND_READ_BIT |
+      VK_ACCESS_INDEX_READ_BIT |
+      VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT |
+      VK_ACCESS_UNIFORM_READ_BIT |
+      VK_ACCESS_SHADER_READ_BIT |
+      VK_ACCESS_COLOR_ATTACHMENT_READ_BIT |
+      VK_ACCESS_TRANSFER_READ_BIT;
 
-   for (uint32_t i = 0; i < memBarrierCount; i++) {
-      const struct anv_common *common = ppMemBarriers[i];
-      switch (common->sType) {
-      case VK_STRUCTURE_TYPE_MEMORY_BARRIER: {
-         ANV_COMMON_TO_STRUCT(VkMemoryBarrier, barrier, common);
-         src_flags |= barrier->srcAccessMask;
-         dst_flags |= barrier->dstAccessMask;
-         break;
-      }
-      case VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER: {
-         ANV_COMMON_TO_STRUCT(VkBufferMemoryBarrier, barrier, common);
-         src_flags |= barrier->srcAccessMask;
-         dst_flags |= barrier->dstAccessMask;
-         break;
-      }
-      case VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER: {
-         ANV_COMMON_TO_STRUCT(VkImageMemoryBarrier, barrier, common);
-         src_flags |= barrier->srcAccessMask;
-         dst_flags |= barrier->dstAccessMask;
-         break;
-      }
-      default:
-         unreachable("Invalid memory barrier type");
-      }
-   }
+   dst_flags = dst_flags & dst_mask;
 
    /* The src flags represent how things were used previously.  This is
     * what we use for doing flushes.
     */
+   struct GENX(PIPE_CONTROL) flush_cmd = {
+      GENX(PIPE_CONTROL_header),
+      .PostSyncOperation = NoWrite,
+   };
+
    for_each_bit(b, src_flags) {
       switch ((VkAccessFlagBits)(1 << b)) {
       case VK_ACCESS_SHADER_WRITE_BIT:
-         cmd.DCFlushEnable = true;
+         flush_cmd.DCFlushEnable = true;
          break;
       case VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT:
-         cmd.RenderTargetCacheFlushEnable = true;
+         flush_cmd.RenderTargetCacheFlushEnable = true;
          break;
       case VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT:
-         cmd.DepthCacheFlushEnable = true;
+         flush_cmd.DepthCacheFlushEnable = true;
          break;
       case VK_ACCESS_TRANSFER_WRITE_BIT:
-         cmd.RenderTargetCacheFlushEnable = true;
-         cmd.DepthCacheFlushEnable = true;
+         flush_cmd.RenderTargetCacheFlushEnable = true;
+         flush_cmd.DepthCacheFlushEnable = true;
          break;
       default:
-         /* Doesn't require a flush */
-         break;
+         unreachable("should've masked this out by now");
       }
    }
 
-   /* The dst flags represent how things will be used in the fugure.  This
+   /* If we end up emitting two PIPE_CONTROLs, the first (flushing) one also
+    * has to stall and wait for the flush to finish, so we don't re-dirty
+    * the caches with in-flight rendering after the second PIPE_CONTROL
+    * invalidates.
+    */
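+   /* For example, a barrier from VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT to
+    * VK_ACCESS_SHADER_READ_BIT emits two PIPE_CONTROLs here: a render
+    * target cache flush with a CS stall, followed by a texture cache
+    * invalidate.
+    */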
+
+   if (dst_flags)
+      flush_cmd.CommandStreamerStallEnable = true;
+
+   if (src_flags && dst_flags) {
+      dw = anv_batch_emit_dwords(&cmd_buffer->batch, GENX(PIPE_CONTROL_length));
+      GENX(PIPE_CONTROL_pack)(&cmd_buffer->batch, dw, &flush_cmd);
+   }
+
+   /* The dst flags represent how things will be used in the future.  This
     * is what we use for doing cache invalidations.
     */
+   struct GENX(PIPE_CONTROL) invalidate_cmd = {
+      GENX(PIPE_CONTROL_header),
+      .PostSyncOperation = NoWrite,
+   };
+
    for_each_bit(b, dst_flags) {
       switch ((VkAccessFlagBits)(1 << b)) {
       case VK_ACCESS_INDIRECT_COMMAND_READ_BIT:
       case VK_ACCESS_INDEX_READ_BIT:
       case VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT:
-         cmd.VFCacheInvalidationEnable = true;
+         invalidate_cmd.VFCacheInvalidationEnable = true;
          break;
       case VK_ACCESS_UNIFORM_READ_BIT:
-         cmd.ConstantCacheInvalidationEnable = true;
+         invalidate_cmd.ConstantCacheInvalidationEnable = true;
          /* fallthrough */
       case VK_ACCESS_SHADER_READ_BIT:
-         cmd.TextureCacheInvalidationEnable = true;
+         invalidate_cmd.TextureCacheInvalidationEnable = true;
          break;
       case VK_ACCESS_COLOR_ATTACHMENT_READ_BIT:
-         cmd.TextureCacheInvalidationEnable = true;
+         invalidate_cmd.TextureCacheInvalidationEnable = true;
          break;
       case VK_ACCESS_TRANSFER_READ_BIT:
-         cmd.TextureCacheInvalidationEnable = true;
+         invalidate_cmd.TextureCacheInvalidationEnable = true;
          break;
-      case VK_ACCESS_MEMORY_READ_BIT:
-         break; /* XXX: What is this? */
       default:
-         /* Doesn't require a flush */
-         break;
+         unreachable("should've masked this out by now");
       }
    }
 
-   dw = anv_batch_emit_dwords(&cmd_buffer->batch, GENX(PIPE_CONTROL_length));
-   GENX(PIPE_CONTROL_pack)(&cmd_buffer->batch, dw, &cmd);
+   if (dst_flags) {
+      dw = anv_batch_emit_dwords(&cmd_buffer->batch, GENX(PIPE_CONTROL_length));
+      GENX(PIPE_CONTROL_pack)(&cmd_buffer->batch, dw, &invalidate_cmd);
+   }
+}
+
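+/* Emit a binding for vertex buffer 32 (reserved for driver-internal use)
+ * pointing at 8 bytes that hold the base vertex and base instance.  The
+ * vertex shader reads these back as attributes when
+ * vs_prog_data.uses_basevertex/uses_baseinstance are set.
+ */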
+static void
+emit_base_vertex_instance_bo(struct anv_cmd_buffer *cmd_buffer,
+                             struct anv_bo *bo, uint32_t offset)
+{
+   uint32_t *p = anv_batch_emitn(&cmd_buffer->batch, 5,
+                                 GENX(3DSTATE_VERTEX_BUFFERS));
+
+   GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, p + 1,
+      &(struct GENX(VERTEX_BUFFER_STATE)) {
+         .VertexBufferIndex = 32, /* Reserved for this */
+         .AddressModifyEnable = true,
+         .BufferPitch = 0,
+#if (ANV_GEN >= 8)
+         .MemoryObjectControlState = GENX(MOCS),
+         .BufferStartingAddress = { bo, offset },
+         .BufferSize = 8
+#else
+         .VertexBufferMemoryObjectControlState = GENX(MOCS),
+         .BufferStartingAddress = { bo, offset },
+         .EndAddress = { bo, offset + 8 },
+#endif
+      });
+}
+
+static void
+emit_base_vertex_instance(struct anv_cmd_buffer *cmd_buffer,
+                          uint32_t base_vertex, uint32_t base_instance)
+{
+   struct anv_state id_state =
+      anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 8, 4);
+
+   ((uint32_t *)id_state.map)[0] = base_vertex;
+   ((uint32_t *)id_state.map)[1] = base_instance;
+
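+   /* On non-LLC platforms, dynamic state isn't coherent with the GPU, so
+    * flush the CPU cache lines before the GPU reads them.
+    */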
+   if (!cmd_buffer->device->info.has_llc)
+      anv_state_clflush(id_state);
+
+   emit_base_vertex_instance_bo(cmd_buffer,
+      &cmd_buffer->device->dynamic_state_block_pool.bo, id_state.offset);
+}
+
+void genX(CmdDraw)(
+    VkCommandBuffer                             commandBuffer,
+    uint32_t                                    vertexCount,
+    uint32_t                                    instanceCount,
+    uint32_t                                    firstVertex,
+    uint32_t                                    firstInstance)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+   struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
+
+   genX(cmd_buffer_flush_state)(cmd_buffer);
+
+   if (pipeline->vs_prog_data.uses_basevertex ||
+       pipeline->vs_prog_data.uses_baseinstance)
+      emit_base_vertex_instance(cmd_buffer, firstVertex, firstInstance);
+
+   anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE),
+      .VertexAccessType                         = SEQUENTIAL,
+      .PrimitiveTopologyType                    = pipeline->topology,
+      .VertexCountPerInstance                   = vertexCount,
+      .StartVertexLocation                      = firstVertex,
+      .InstanceCount                            = instanceCount,
+      .StartInstanceLocation                    = firstInstance,
+      .BaseVertexLocation                       = 0);
+}
+
+void genX(CmdDrawIndexed)(
+    VkCommandBuffer                             commandBuffer,
+    uint32_t                                    indexCount,
+    uint32_t                                    instanceCount,
+    uint32_t                                    firstIndex,
+    int32_t                                     vertexOffset,
+    uint32_t                                    firstInstance)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+   struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
+
+   genX(cmd_buffer_flush_state)(cmd_buffer);
+
+   if (pipeline->vs_prog_data.uses_basevertex ||
+       pipeline->vs_prog_data.uses_baseinstance)
+      emit_base_vertex_instance(cmd_buffer, vertexOffset, firstInstance);
+
+   anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE),
+      .VertexAccessType                         = RANDOM,
+      .PrimitiveTopologyType                    = pipeline->topology,
+      .VertexCountPerInstance                   = indexCount,
+      .StartVertexLocation                      = firstIndex,
+      .InstanceCount                            = instanceCount,
+      .StartInstanceLocation                    = firstInstance,
+      .BaseVertexLocation                       = vertexOffset);
+}
+
+/* Auto-Draw / Indirect Registers */
+#define GEN7_3DPRIM_END_OFFSET          0x2420
+#define GEN7_3DPRIM_START_VERTEX        0x2430
+#define GEN7_3DPRIM_VERTEX_COUNT        0x2434
+#define GEN7_3DPRIM_INSTANCE_COUNT      0x2438
+#define GEN7_3DPRIM_START_INSTANCE      0x243C
+#define GEN7_3DPRIM_BASE_VERTEX         0x2440
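+
+/* For an indirect draw, the parameters in the indirect buffer are loaded
+ * into these registers with MI_LOAD_REGISTER_MEM, and 3DPRIMITIVE is then
+ * emitted with IndirectParameterEnable set so the command streamer takes
+ * its draw parameters from the registers instead of the command itself.
+ */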
+
+static void
+emit_lrm(struct anv_batch *batch,
+         uint32_t reg, struct anv_bo *bo, uint32_t offset)
+{
+   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM),
+                  .RegisterAddress = reg,
+                  .MemoryAddress = { bo, offset });
+}
+
+static void
+emit_lri(struct anv_batch *batch, uint32_t reg, uint32_t imm)
+{
+   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM),
+                  .RegisterOffset = reg,
+                  .DataDWord = imm);
+}
+
+void genX(CmdDrawIndirect)(
+    VkCommandBuffer                             commandBuffer,
+    VkBuffer                                    _buffer,
+    VkDeviceSize                                offset,
+    uint32_t                                    drawCount,
+    uint32_t                                    stride)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+   ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
+   struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
+   struct anv_bo *bo = buffer->bo;
+   uint32_t bo_offset = buffer->offset + offset;
+
+   genX(cmd_buffer_flush_state)(cmd_buffer);
+
+   if (pipeline->vs_prog_data.uses_basevertex ||
+       pipeline->vs_prog_data.uses_baseinstance)
+      emit_base_vertex_instance_bo(cmd_buffer, bo, bo_offset + 8);
+
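+   /* VkDrawIndirectCommand is four tightly packed uint32_t values:
+    * { vertexCount, instanceCount, firstVertex, firstInstance }, which
+    * gives the offsets below.
+    */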
+   emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_VERTEX_COUNT, bo, bo_offset);
+   emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset + 4);
+   emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_VERTEX, bo, bo_offset + 8);
+   emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_INSTANCE, bo, bo_offset + 12);
+   emit_lri(&cmd_buffer->batch, GEN7_3DPRIM_BASE_VERTEX, 0);
+
+   anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE),
+      .IndirectParameterEnable                  = true,
+      .VertexAccessType                         = SEQUENTIAL,
+      .PrimitiveTopologyType                    = pipeline->topology);
+}
+
+void genX(CmdDrawIndexedIndirect)(
+    VkCommandBuffer                             commandBuffer,
+    VkBuffer                                    _buffer,
+    VkDeviceSize                                offset,
+    uint32_t                                    drawCount,
+    uint32_t                                    stride)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+   ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
+   struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
+   struct anv_bo *bo = buffer->bo;
+   uint32_t bo_offset = buffer->offset + offset;
+
+   genX(cmd_buffer_flush_state)(cmd_buffer);
+
+   /* TODO: We need to stomp base vertex to 0 somehow */
+   if (pipeline->vs_prog_data.uses_basevertex ||
+       pipeline->vs_prog_data.uses_baseinstance)
+      emit_base_vertex_instance_bo(cmd_buffer, bo, bo_offset + 12);
+
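+   /* VkDrawIndexedIndirectCommand is { indexCount, instanceCount,
+    * firstIndex, vertexOffset, firstInstance }, which gives the offsets
+    * below.
+    */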
+   emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_VERTEX_COUNT, bo, bo_offset);
+   emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset + 4);
+   emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_VERTEX, bo, bo_offset + 8);
+   emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_BASE_VERTEX, bo, bo_offset + 12);
+   emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_INSTANCE, bo, bo_offset + 16);
+
+   anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE),
+      .IndirectParameterEnable                  = true,
+      .VertexAccessType                         = RANDOM,
+      .PrimitiveTopologyType                    = pipeline->topology);
+}
+
+
+void genX(CmdDispatch)(
+    VkCommandBuffer                             commandBuffer,
+    uint32_t                                    x,
+    uint32_t                                    y,
+    uint32_t                                    z)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+   struct anv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
+   struct brw_cs_prog_data *prog_data = &pipeline->cs_prog_data;
+
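+   /* If the shader reads gl_NumWorkGroups, stash the dispatch dimensions
+    * in dynamic state where the shader can read them back.
+    */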
+   if (prog_data->uses_num_work_groups) {
+      struct anv_state state =
+         anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 12, 4);
+      uint32_t *sizes = state.map;
+      sizes[0] = x;
+      sizes[1] = y;
+      sizes[2] = z;
+      if (!cmd_buffer->device->info.has_llc)
+         anv_state_clflush(state);
+      cmd_buffer->state.num_workgroups_offset = state.offset;
+      cmd_buffer->state.num_workgroups_bo =
+         &cmd_buffer->device->dynamic_state_block_pool.bo;
+   }
+
+   genX(cmd_buffer_flush_compute_state)(cmd_buffer);
+
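+   /* GPGPU_WALKER's SIMDSize field encodes SIMD8/16/32 as 0/1/2, hence
+    * the divide by 16.
+    */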
+   anv_batch_emit(&cmd_buffer->batch, GENX(GPGPU_WALKER),
+                  .SIMDSize = prog_data->simd_size / 16,
+                  .ThreadDepthCounterMaximum = 0,
+                  .ThreadHeightCounterMaximum = 0,
+                  .ThreadWidthCounterMaximum = pipeline->cs_thread_width_max - 1,
+                  .ThreadGroupIDXDimension = x,
+                  .ThreadGroupIDYDimension = y,
+                  .ThreadGroupIDZDimension = z,
+                  .RightExecutionMask = pipeline->cs_right_mask,
+                  .BottomExecutionMask = 0xffffffff);
+
+   anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_STATE_FLUSH));
+}
+
+#define GPGPU_DISPATCHDIMX 0x2500
+#define GPGPU_DISPATCHDIMY 0x2504
+#define GPGPU_DISPATCHDIMZ 0x2508
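+
+/* As with indirect draws, an indirect dispatch loads these registers from
+ * the indirect buffer and sets IndirectParameterEnable on GPGPU_WALKER.
+ */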
+
+void genX(CmdDispatchIndirect)(
+    VkCommandBuffer                             commandBuffer,
+    VkBuffer                                    _buffer,
+    VkDeviceSize                                offset)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+   ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
+   struct anv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
+   struct brw_cs_prog_data *prog_data = &pipeline->cs_prog_data;
+   struct anv_bo *bo = buffer->bo;
+   uint32_t bo_offset = buffer->offset + offset;
+
+   if (prog_data->uses_num_work_groups) {
+      cmd_buffer->state.num_workgroups_offset = bo_offset;
+      cmd_buffer->state.num_workgroups_bo = bo;
+   }
+
+   genX(cmd_buffer_flush_compute_state)(cmd_buffer);
+
+   emit_lrm(&cmd_buffer->batch, GPGPU_DISPATCHDIMX, bo, bo_offset);
+   emit_lrm(&cmd_buffer->batch, GPGPU_DISPATCHDIMY, bo, bo_offset + 4);
+   emit_lrm(&cmd_buffer->batch, GPGPU_DISPATCHDIMZ, bo, bo_offset + 8);
+
+   anv_batch_emit(&cmd_buffer->batch, GENX(GPGPU_WALKER),
+                  .IndirectParameterEnable = true,
+                  .SIMDSize = prog_data->simd_size / 16,
+                  .ThreadDepthCounterMaximum = 0,
+                  .ThreadHeightCounterMaximum = 0,
+                  .ThreadWidthCounterMaximum = pipeline->cs_thread_width_max - 1,
+                  .RightExecutionMask = pipeline->cs_right_mask,
+                  .BottomExecutionMask = 0xffffffff);
+
+   anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_STATE_FLUSH));
+}
+
+void
+genX(flush_pipeline_select_3d)(struct anv_cmd_buffer *cmd_buffer)
+{
+   if (cmd_buffer->state.current_pipeline != _3D) {
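+      /* On gen9+, PIPELINE_SELECT has a write-enable mask; MaskBits = 3
+       * enables writing the pipeline selection bits.
+       */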
+      anv_batch_emit(&cmd_buffer->batch, GENX(PIPELINE_SELECT),
+#if ANV_GEN >= 9
+                     .MaskBits = 3,
+#endif
+                     .PipelineSelection = _3D);
+      cmd_buffer->state.current_pipeline = _3D;
+   }
+}
+
+static void
+cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer)
+{
+   struct anv_device *device = cmd_buffer->device;
+   const struct anv_framebuffer *fb = cmd_buffer->state.framebuffer;
+   const struct anv_image_view *iview =
+      anv_cmd_buffer_get_depth_stencil_view(cmd_buffer);
+   const struct anv_image *image = iview ? iview->image : NULL;
+   const struct anv_format *anv_format =
+      iview ? anv_format_for_vk_format(iview->vk_format) : NULL;
+   const bool has_depth = iview && anv_format->has_depth;
+   const bool has_stencil = iview && anv_format->has_stencil;
+
+   /* FIXME: Implement the PMA stall W/A */
+   /* FIXME: Width and Height are wrong */
+
+   /* Emit 3DSTATE_DEPTH_BUFFER */
+   if (has_depth) {
+      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_DEPTH_BUFFER),
+         .SurfaceType = SURFTYPE_2D,
+         .DepthWriteEnable = true,
+         .StencilWriteEnable = has_stencil,
+         .HierarchicalDepthBufferEnable = false,
+         .SurfaceFormat = isl_surf_get_depth_format(&device->isl_dev,
+                                                    &image->depth_surface.isl),
+         .SurfacePitch = image->depth_surface.isl.row_pitch - 1,
+         .SurfaceBaseAddress = {
+            .bo = image->bo,
+            .offset = image->depth_surface.offset,
+         },
+         .Height = fb->height - 1,
+         .Width = fb->width - 1,
+         .LOD = 0,
+         .Depth = 1 - 1,
+         .MinimumArrayElement = 0,
+         .DepthBufferObjectControlState = GENX(MOCS),
+#if ANV_GEN >= 8
+         .SurfaceQPitch = isl_surf_get_array_pitch_el_rows(&image->depth_surface.isl) >> 2,
+#endif
+         .RenderTargetViewExtent = 1 - 1);
+   } else {
+      /* Even when no depth buffer is present, the hardware requires that
+       * 3DSTATE_DEPTH_BUFFER be programmed correctly. The Broadwell PRM says:
+       *
+       *    If a null depth buffer is bound, the driver must instead bind depth as:
+       *       3DSTATE_DEPTH.SurfaceType = SURFTYPE_2D
+       *       3DSTATE_DEPTH.Width = 1
+       *       3DSTATE_DEPTH.Height = 1
+       *       3DSTATE_DEPTH.SurfaceFormat = D16_UNORM
+       *       3DSTATE_DEPTH.SurfaceBaseAddress = 0
+       *       3DSTATE_DEPTH.HierarchicalDepthBufferEnable = 0
+       *       3DSTATE_WM_DEPTH_STENCIL.DepthTestEnable = 0
+       *       3DSTATE_WM_DEPTH_STENCIL.DepthBufferWriteEnable = 0
+       *
+       * The PRM is wrong, though.  The width and height must be programmed
+       * to the actual framebuffer's width and height, even when neither a
+       * depth buffer nor a stencil buffer is present.
+       */
+      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_DEPTH_BUFFER),
+         .SurfaceType = SURFTYPE_2D,
+         .SurfaceFormat = D16_UNORM,
+         .Width = fb->width - 1,
+         .Height = fb->height - 1,
+         .StencilWriteEnable = has_stencil);
+   }
+
+   /* Emit 3DSTATE_STENCIL_BUFFER */
+   if (has_stencil) {
+      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_STENCIL_BUFFER),
+#if ANV_GEN >= 8 || ANV_IS_HASWELL
+         .StencilBufferEnable = true,
+#endif
+         .StencilBufferObjectControlState = GENX(MOCS),
+
+         /* Stencil buffers have a strange pitch.  The PRM says:
+          *
+          *    The pitch must be set to 2x the value computed based on width,
+          *    as the stencil buffer is stored with two rows interleaved.
+          */
+         .SurfacePitch = 2 * image->stencil_surface.isl.row_pitch - 1,
+
+#if ANV_GEN >= 8
+         .SurfaceQPitch = isl_surf_get_array_pitch_el_rows(&image->stencil_surface.isl) >> 2,
+#endif
+         .SurfaceBaseAddress = {
+            .bo = image->bo,
+            .offset = image->offset + image->stencil_surface.offset,
+         });
+   } else {
+      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_STENCIL_BUFFER));
+   }
+
+   /* Disable hierarchical depth buffers. */
+   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_HIER_DEPTH_BUFFER));
+
+   /* Clear the clear params. */
+   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CLEAR_PARAMS));
+}
+
+/**
+ * @see anv_cmd_buffer_set_subpass()
+ */
+void
+genX(cmd_buffer_set_subpass)(struct anv_cmd_buffer *cmd_buffer,
+                             struct anv_subpass *subpass)
+{
+   cmd_buffer->state.subpass = subpass;
+
+   cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT;
+
+   cmd_buffer_emit_depth_stencil(cmd_buffer);
+}
+
+void genX(CmdBeginRenderPass)(
+    VkCommandBuffer                             commandBuffer,
+    const VkRenderPassBeginInfo*                pRenderPassBegin,
+    VkSubpassContents                           contents)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+   ANV_FROM_HANDLE(anv_render_pass, pass, pRenderPassBegin->renderPass);
+   ANV_FROM_HANDLE(anv_framebuffer, framebuffer, pRenderPassBegin->framebuffer);
+
+   cmd_buffer->state.framebuffer = framebuffer;
+   cmd_buffer->state.pass = pass;
+   anv_cmd_state_setup_attachments(cmd_buffer, pRenderPassBegin);
+
+   genX(flush_pipeline_select_3d)(cmd_buffer);
+
+   const VkRect2D *render_area = &pRenderPassBegin->renderArea;
+
+   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_DRAWING_RECTANGLE),
+                  .ClippedDrawingRectangleYMin = render_area->offset.y,
+                  .ClippedDrawingRectangleXMin = render_area->offset.x,
+                  .ClippedDrawingRectangleYMax =
+                     render_area->offset.y + render_area->extent.height - 1,
+                  .ClippedDrawingRectangleXMax =
+                     render_area->offset.x + render_area->extent.width - 1,
+                  .DrawingRectangleOriginY = 0,
+                  .DrawingRectangleOriginX = 0);
+
+   genX(cmd_buffer_set_subpass)(cmd_buffer, pass->subpasses);
+   anv_cmd_buffer_clear_subpass(cmd_buffer);
+}
+
+void genX(CmdNextSubpass)(
+    VkCommandBuffer                             commandBuffer,
+    VkSubpassContents                           contents)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+
+   assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
+
+   anv_cmd_buffer_resolve_subpass(cmd_buffer);
+   genX(cmd_buffer_set_subpass)(cmd_buffer, cmd_buffer->state.subpass + 1);
+   anv_cmd_buffer_clear_subpass(cmd_buffer);
+}
+
+void genX(CmdEndRenderPass)(
+    VkCommandBuffer                             commandBuffer)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+
+   anv_cmd_buffer_resolve_subpass(cmd_buffer);
+
+   /* Emit a flushing pipe control at the end of a pass.  This is kind of a
+    * hack but it ensures that render targets always actually get written.
+    * Eventually, we should do flushing based on image format transitions
+    * or something of that nature.
+    */
+   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL),
+                  .PostSyncOperation = NoWrite,
+                  .RenderTargetCacheFlushEnable = true,
+                  .InstructionCacheInvalidateEnable = true,
+                  .DepthCacheFlushEnable = true,
+                  .VFCacheInvalidationEnable = true,
+                  .TextureCacheInvalidationEnable = true,
+                  .CommandStreamerStallEnable = true);
 }