vk: Fill out buffer surface state when updating descriptor set
[mesa.git] / src / vulkan / anv_cmd_buffer.c
index 5178f6529abef6c1f751d050759978320349b527..1eaa3df633ce73e0b2288b405bbc585422ae6584 100644 (file)
  * is concerned, most of anv_cmd_buffer is magic.
  */
 
+/* TODO: These are taken from GLES.  We should check the Vulkan spec */
+const struct anv_dynamic_state default_dynamic_state = {
+   .viewport = {
+      .count = 0,
+   },
+   .scissor = {
+      .count = 0,
+   },
+   .line_width = 1.0f,
+   .depth_bias = {
+      .bias = 0.0f,
+      .clamp = 0.0f,
+      .slope = 0.0f,
+   },
+   .blend_constants = { 0.0f, 0.0f, 0.0f, 0.0f },
+   .depth_bounds = {
+      .min = 0.0f,
+      .max = 1.0f,
+   },
+   .stencil_compare_mask = {
+      .front = ~0u,
+      .back = ~0u,
+   },
+   .stencil_write_mask = {
+      .front = ~0u,
+      .back = ~0u,
+   },
+   .stencil_reference = {
+      .front = 0u,
+      .back = 0u,
+   },
+};
+
+void
+anv_dynamic_state_copy(struct anv_dynamic_state *dest,
+                       const struct anv_dynamic_state *src,
+                       uint32_t copy_mask)
+{
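+   /* Each copy_mask bit is (1 << VK_DYNAMIC_STATE_*), so callers can select
+    * exactly which pieces of dynamic state get copied from src to dest.
+    */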
+   if (copy_mask & (1 << VK_DYNAMIC_STATE_VIEWPORT)) {
+      dest->viewport.count = src->viewport.count;
+      typed_memcpy(dest->viewport.viewports, src->viewport.viewports,
+                   src->viewport.count);
+   }
+
+   if (copy_mask & (1 << VK_DYNAMIC_STATE_SCISSOR)) {
+      dest->scissor.count = src->scissor.count;
+      typed_memcpy(dest->scissor.scissors, src->scissor.scissors,
+                   src->scissor.count);
+   }
+
+   if (copy_mask & (1 << VK_DYNAMIC_STATE_LINE_WIDTH))
+      dest->line_width = src->line_width;
+
+   if (copy_mask & (1 << VK_DYNAMIC_STATE_DEPTH_BIAS))
+      dest->depth_bias = src->depth_bias;
+
+   if (copy_mask & (1 << VK_DYNAMIC_STATE_BLEND_CONSTANTS))
+      typed_memcpy(dest->blend_constants, src->blend_constants, 4);
+
+   if (copy_mask & (1 << VK_DYNAMIC_STATE_DEPTH_BOUNDS))
+      dest->depth_bounds = src->depth_bounds;
+
+   if (copy_mask & (1 << VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK))
+      dest->stencil_compare_mask = src->stencil_compare_mask;
+
+   if (copy_mask & (1 << VK_DYNAMIC_STATE_STENCIL_WRITE_MASK))
+      dest->stencil_write_mask = src->stencil_write_mask;
+
+   if (copy_mask & (1 << VK_DYNAMIC_STATE_STENCIL_REFERENCE))
+      dest->stencil_reference = src->stencil_reference;
+}
+
 static void
 anv_cmd_state_init(struct anv_cmd_state *state)
 {
-   state->rs_state = NULL;
-   state->vp_state = NULL;
-   state->cb_state = NULL;
-   state->ds_state = NULL;
-   memset(&state->state_vf, 0, sizeof(state->state_vf));
    memset(&state->descriptors, 0, sizeof(state->descriptors));
+   memset(&state->push_constants, 0, sizeof(state->push_constants));
 
-   state->dirty = 0;
+   state->dirty = ~0;
    state->vb_dirty = 0;
    state->descriptors_dirty = 0;
+   state->push_constants_dirty = 0;
    state->pipeline = NULL;
-   state->vp_state = NULL;
-   state->rs_state = NULL;
-   state->ds_state = NULL;
+   state->restart_index = UINT32_MAX;
+   state->dynamic = default_dynamic_state;
+
+   state->gen7.index_buffer = NULL;
 }
 
-VkResult anv_CreateCommandBuffer(
-    VkDevice                                    _device,
-    const VkCmdBufferCreateInfo*                pCreateInfo,
-    VkCmdBuffer*                                pCmdBuffer)
+static VkResult
+anv_cmd_buffer_ensure_push_constants_size(struct anv_cmd_buffer *cmd_buffer,
+                                          gl_shader_stage stage, uint32_t size)
+{
+   struct anv_push_constants **ptr = &cmd_buffer->state.push_constants[stage];
+
+   if (*ptr == NULL) {
+      *ptr = anv_alloc(&cmd_buffer->pool->alloc, size, 8,
+                       VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+      if (*ptr == NULL)
+         return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+   } else if ((*ptr)->size < size) {
+      *ptr = anv_realloc(&cmd_buffer->pool->alloc, *ptr, size, 8,
+                         VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+      if (*ptr == NULL)
+         return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+   }
+   (*ptr)->size = size;
+
+   return VK_SUCCESS;
+}
+
+#define anv_cmd_buffer_ensure_push_constant_field(cmd_buffer, stage, field) \
+   anv_cmd_buffer_ensure_push_constants_size(cmd_buffer, stage, \
+      (offsetof(struct anv_push_constants, field) + \
+       sizeof(cmd_buffer->state.push_constants[0]->field)))
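+
+/* Illustration: anv_cmd_buffer_ensure_push_constant_field(cmd_buffer, stage,
+ * dynamic) expands to a size of offsetof(struct anv_push_constants, dynamic)
+ * plus sizeof(cmd_buffer->state.push_constants[0]->dynamic), i.e. enough
+ * space to cover everything up to and including the 'dynamic' field.
+ */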
+
+static VkResult anv_create_cmd_buffer(
+    struct anv_device *                         device,
+    struct anv_cmd_pool *                       pool,
+    VkCommandBufferLevel                        level,
+    VkCommandBuffer*                            pCommandBuffer)
 {
-   ANV_FROM_HANDLE(anv_device, device, _device);
-   ANV_FROM_HANDLE(anv_cmd_pool, pool, pCreateInfo->cmdPool);
    struct anv_cmd_buffer *cmd_buffer;
    VkResult result;
 
-   cmd_buffer = anv_device_alloc(device, sizeof(*cmd_buffer), 8,
-                                 VK_SYSTEM_ALLOC_TYPE_API_OBJECT);
+   cmd_buffer = anv_alloc(&pool->alloc, sizeof(*cmd_buffer), 8,
+                          VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
    if (cmd_buffer == NULL)
       return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
 
+   cmd_buffer->_loader_data.loaderMagic = ICD_LOADER_MAGIC;
    cmd_buffer->device = device;
+   cmd_buffer->pool = pool;
 
    result = anv_cmd_buffer_init_batch_bo_chain(cmd_buffer);
    if (result != VK_SUCCESS)
@@ -83,8 +181,8 @@ VkResult anv_CreateCommandBuffer(
    anv_state_stream_init(&cmd_buffer->dynamic_state_stream,
                          &device->dynamic_state_block_pool);
 
-   cmd_buffer->level = pCreateInfo->level;
-   cmd_buffer->opt_flags = 0;
+   cmd_buffer->level = level;
+   cmd_buffer->usage_flags = 0;
 
    anv_cmd_state_init(&cmd_buffer->state);
 
@@ -97,38 +195,72 @@ VkResult anv_CreateCommandBuffer(
       list_inithead(&cmd_buffer->pool_link);
    }
 
-   *pCmdBuffer = anv_cmd_buffer_to_handle(cmd_buffer);
+   *pCommandBuffer = anv_cmd_buffer_to_handle(cmd_buffer);
 
    return VK_SUCCESS;
 
- fail: anv_device_free(device, cmd_buffer);
+ fail:
+   anv_free(&cmd_buffer->pool->alloc, cmd_buffer);
 
    return result;
 }
 
-VkResult anv_DestroyCommandBuffer(
+VkResult anv_AllocateCommandBuffers(
     VkDevice                                    _device,
-    VkCmdBuffer                                 _cmd_buffer)
+    const VkCommandBufferAllocateInfo*          pAllocateInfo,
+    VkCommandBuffer*                            pCommandBuffers)
 {
    ANV_FROM_HANDLE(anv_device, device, _device);
-   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, _cmd_buffer);
+   ANV_FROM_HANDLE(anv_cmd_pool, pool, pAllocateInfo->commandPool);
+
+   VkResult result = VK_SUCCESS;
+   uint32_t i;
+
+   for (i = 0; i < pAllocateInfo->bufferCount; i++) {
+      result = anv_create_cmd_buffer(device, pool, pAllocateInfo->level,
+                                     &pCommandBuffers[i]);
+      if (result != VK_SUCCESS)
+         break;
+   }
 
+   if (result != VK_SUCCESS)
+      anv_FreeCommandBuffers(_device, pAllocateInfo->commandPool,
+                             i, pCommandBuffers);
+
+   return result;
+}
+
+static void
+anv_cmd_buffer_destroy(struct anv_cmd_buffer *cmd_buffer)
+{
    list_del(&cmd_buffer->pool_link);
 
    anv_cmd_buffer_fini_batch_bo_chain(cmd_buffer);
 
    anv_state_stream_finish(&cmd_buffer->surface_state_stream);
    anv_state_stream_finish(&cmd_buffer->dynamic_state_stream);
-   anv_device_free(device, cmd_buffer);
 
-   return VK_SUCCESS;
+   anv_free(&cmd_buffer->pool->alloc, cmd_buffer);
+}
+
+void anv_FreeCommandBuffers(
+    VkDevice                                    device,
+    VkCommandPool                               commandPool,
+    uint32_t                                    commandBufferCount,
+    const VkCommandBuffer*                      pCommandBuffers)
+{
+   for (uint32_t i = 0; i < commandBufferCount; i++) {
+      ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, pCommandBuffers[i]);
+
+      anv_cmd_buffer_destroy(cmd_buffer);
+   }
 }
 
 VkResult anv_ResetCommandBuffer(
-    VkCmdBuffer                                 cmdBuffer,
-    VkCmdBufferResetFlags                       flags)
+    VkCommandBuffer                             commandBuffer,
+    VkCommandBufferResetFlags                   flags)
 {
-   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, cmdBuffer);
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
 
    anv_cmd_buffer_reset_batch_bo_chain(cmd_buffer);
 
@@ -140,101 +272,41 @@ VkResult anv_ResetCommandBuffer(
 void
 anv_cmd_buffer_emit_state_base_address(struct anv_cmd_buffer *cmd_buffer)
 {
-   struct anv_device *device = cmd_buffer->device;
-   struct anv_bo *scratch_bo = NULL;
-
-   cmd_buffer->state.scratch_size =
-      anv_block_pool_size(&device->scratch_block_pool);
-   if (cmd_buffer->state.scratch_size > 0)
-      scratch_bo = &device->scratch_block_pool.bo;
-
-   anv_batch_emit(&cmd_buffer->batch, GEN8_STATE_BASE_ADDRESS,
-                  .GeneralStateBaseAddress = { scratch_bo, 0 },
-                  .GeneralStateMemoryObjectControlState = GEN8_MOCS,
-                  .GeneralStateBaseAddressModifyEnable = true,
-                  .GeneralStateBufferSize = 0xfffff,
-                  .GeneralStateBufferSizeModifyEnable = true,
-
-                  .SurfaceStateBaseAddress = { anv_cmd_buffer_current_surface_bo(cmd_buffer), 0 },
-                  .SurfaceStateMemoryObjectControlState = GEN8_MOCS,
-                  .SurfaceStateBaseAddressModifyEnable = true,
-
-                  .DynamicStateBaseAddress = { &device->dynamic_state_block_pool.bo, 0 },
-                  .DynamicStateMemoryObjectControlState = GEN8_MOCS,
-                  .DynamicStateBaseAddressModifyEnable = true,
-                  .DynamicStateBufferSize = 0xfffff,
-                  .DynamicStateBufferSizeModifyEnable = true,
-
-                  .IndirectObjectBaseAddress = { NULL, 0 },
-                  .IndirectObjectMemoryObjectControlState = GEN8_MOCS,
-                  .IndirectObjectBaseAddressModifyEnable = true,
-                  .IndirectObjectBufferSize = 0xfffff,
-                  .IndirectObjectBufferSizeModifyEnable = true,
-
-                  .InstructionBaseAddress = { &device->instruction_block_pool.bo, 0 },
-                  .InstructionMemoryObjectControlState = GEN8_MOCS,
-                  .InstructionBaseAddressModifyEnable = true,
-                  .InstructionBufferSize = 0xfffff,
-                  .InstructionBuffersizeModifyEnable = true);
-
-   /* After re-setting the surface state base address, we have to do some
-    * cache flusing so that the sampler engine will pick up the new
-    * SURFACE_STATE objects and binding tables. From the Broadwell PRM,
-    * Shared Function > 3D Sampler > State > State Caching (page 96):
-    *
-    *    Coherency with system memory in the state cache, like the texture
-    *    cache is handled partially by software. It is expected that the
-    *    command stream or shader will issue Cache Flush operation or
-    *    Cache_Flush sampler message to ensure that the L1 cache remains
-    *    coherent with system memory.
-    *
-    *    [...]
-    *
-    *    Whenever the value of the Dynamic_State_Base_Addr,
-    *    Surface_State_Base_Addr are altered, the L1 state cache must be
-    *    invalidated to ensure the new surface or sampler state is fetched
-    *    from system memory.
-    *
-    * The PIPE_CONTROL command has a "State Cache Invalidation Enable" bit
-    * which, according the PIPE_CONTROL instruction documentation in the
-    * Broadwell PRM:
-    *
-    *    Setting this bit is independent of any other bit in this packet.
-    *    This bit controls the invalidation of the L1 and L2 state caches
-    *    at the top of the pipe i.e. at the parsing time.
-    *
-    * Unfortunately, experimentation seems to indicate that state cache
-    * invalidation through a PIPE_CONTROL does nothing whatsoever in
-    * regards to surface state and binding tables.  In stead, it seems that
-    * invalidating the texture cache is what is actually needed.
-    *
-    * XXX:  As far as we have been able to determine through
-    * experimentation, shows that flush the texture cache appears to be
-    * sufficient.  The theory here is that all of the sampling/rendering
-    * units cache the binding table in the texture cache.  However, we have
-    * yet to be able to actually confirm this.
-    */
-   anv_batch_emit(&cmd_buffer->batch, GEN8_PIPE_CONTROL,
-                  .TextureCacheInvalidationEnable = true);
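+   /* Dispatch to the per-gen implementation (the genN_ prefixes presumably
+    * come from compiling shared sources once per gen). */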
+   switch (cmd_buffer->device->info.gen) {
+   case 7:
+      if (cmd_buffer->device->info.is_haswell)
+         return gen75_cmd_buffer_emit_state_base_address(cmd_buffer);
+      else
+         return gen7_cmd_buffer_emit_state_base_address(cmd_buffer);
+   case 8:
+      return gen8_cmd_buffer_emit_state_base_address(cmd_buffer);
+   case 9:
+      return gen9_cmd_buffer_emit_state_base_address(cmd_buffer);
+   default:
+      unreachable("unsupported gen\n");
+   }
 }
 
 VkResult anv_BeginCommandBuffer(
-    VkCmdBuffer                                 cmdBuffer,
-    const VkCmdBufferBeginInfo*                 pBeginInfo)
+    VkCommandBuffer                             commandBuffer,
+    const VkCommandBufferBeginInfo*             pBeginInfo)
 {
-   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, cmdBuffer);
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
 
-   cmd_buffer->opt_flags = pBeginInfo->flags;
+   anv_cmd_buffer_reset_batch_bo_chain(cmd_buffer);
 
-   if (cmd_buffer->level == VK_CMD_BUFFER_LEVEL_SECONDARY) {
+   cmd_buffer->usage_flags = pBeginInfo->flags;
+
+   if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
       cmd_buffer->state.framebuffer =
          anv_framebuffer_from_handle(pBeginInfo->framebuffer);
       cmd_buffer->state.pass =
          anv_render_pass_from_handle(pBeginInfo->renderPass);
 
-      /* FIXME: We shouldn't be starting on the first subpass */
-      anv_cmd_buffer_begin_subpass(cmd_buffer,
-                                   &cmd_buffer->state.pass->subpasses[0]);
+      struct anv_subpass *subpass =
+         &cmd_buffer->state.pass->subpasses[pBeginInfo->subpass];
+
+      anv_cmd_buffer_begin_subpass(cmd_buffer, subpass);
    }
 
    anv_cmd_buffer_emit_state_base_address(cmd_buffer);
@@ -244,14 +316,14 @@ VkResult anv_BeginCommandBuffer(
 }
 
 VkResult anv_EndCommandBuffer(
-    VkCmdBuffer                                 cmdBuffer)
+    VkCommandBuffer                             commandBuffer)
 {
-   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, cmdBuffer);
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
    struct anv_device *device = cmd_buffer->device;
 
    anv_cmd_buffer_end_batch_buffer(cmd_buffer);
 
-   if (cmd_buffer->level == VK_CMD_BUFFER_LEVEL_PRIMARY) {
+   if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
       /* The algorithm used to compute the validate list is not threadsafe as
        * it uses the bo->index field.  We have to lock the device around it.
        * Fortunately, the chances for contention here are probably very low.
@@ -265,23 +337,31 @@ VkResult anv_EndCommandBuffer(
 }
 
 void anv_CmdBindPipeline(
-    VkCmdBuffer                                 cmdBuffer,
+    VkCommandBuffer                             commandBuffer,
     VkPipelineBindPoint                         pipelineBindPoint,
     VkPipeline                                  _pipeline)
 {
-   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, cmdBuffer);
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
    ANV_FROM_HANDLE(anv_pipeline, pipeline, _pipeline);
 
    switch (pipelineBindPoint) {
    case VK_PIPELINE_BIND_POINT_COMPUTE:
       cmd_buffer->state.compute_pipeline = pipeline;
-      cmd_buffer->state.compute_dirty |= ANV_CMD_BUFFER_PIPELINE_DIRTY;
+      cmd_buffer->state.compute_dirty |= ANV_CMD_DIRTY_PIPELINE;
+      cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
       break;
 
    case VK_PIPELINE_BIND_POINT_GRAPHICS:
       cmd_buffer->state.pipeline = pipeline;
       cmd_buffer->state.vb_dirty |= pipeline->vb_used;
-      cmd_buffer->state.dirty |= ANV_CMD_BUFFER_PIPELINE_DIRTY;
+      cmd_buffer->state.dirty |= ANV_CMD_DIRTY_PIPELINE;
+      cmd_buffer->state.push_constants_dirty |= pipeline->active_stages;
+
+      /* Apply the dynamic state from the pipeline */
+      cmd_buffer->state.dirty |= pipeline->dynamic_state_mask;
+      anv_dynamic_state_copy(&cmd_buffer->state.dynamic,
+                             &pipeline->dynamic_state,
+                             pipeline->dynamic_state_mask);
       break;
 
    default:
@@ -290,122 +370,195 @@ void anv_CmdBindPipeline(
    }
 }
 
-void anv_CmdBindDynamicViewportState(
-    VkCmdBuffer                                 cmdBuffer,
-    VkDynamicViewportState                      dynamicViewportState)
+void anv_CmdSetViewport(
+    VkCommandBuffer                             commandBuffer,
+    uint32_t                                    viewportCount,
+    const VkViewport*                           pViewports)
 {
-   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, cmdBuffer);
-   ANV_FROM_HANDLE(anv_dynamic_vp_state, vp_state, dynamicViewportState);
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
 
-   cmd_buffer->state.vp_state = vp_state;
-   cmd_buffer->state.dirty |= ANV_CMD_BUFFER_VP_DIRTY;
+   cmd_buffer->state.dynamic.viewport.count = viewportCount;
+   memcpy(cmd_buffer->state.dynamic.viewport.viewports,
+          pViewports, viewportCount * sizeof(*pViewports));
+
+   cmd_buffer->state.dirty |= ANV_CMD_DIRTY_DYNAMIC_VIEWPORT;
+}
+
+void anv_CmdSetScissor(
+    VkCommandBuffer                             commandBuffer,
+    uint32_t                                    scissorCount,
+    const VkRect2D*                             pScissors)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+
+   cmd_buffer->state.dynamic.scissor.count = scissorCount;
+   memcpy(cmd_buffer->state.dynamic.scissor.scissors,
+          pScissors, scissorCount * sizeof(*pScissors));
+
+   cmd_buffer->state.dirty |= ANV_CMD_DIRTY_DYNAMIC_SCISSOR;
 }
 
-void anv_CmdBindDynamicRasterState(
-    VkCmdBuffer                                 cmdBuffer,
-    VkDynamicRasterState                        dynamicRasterState)
+void anv_CmdSetLineWidth(
+    VkCommandBuffer                             commandBuffer,
+    float                                       lineWidth)
 {
-   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, cmdBuffer);
-   ANV_FROM_HANDLE(anv_dynamic_rs_state, rs_state, dynamicRasterState);
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
 
-   cmd_buffer->state.rs_state = rs_state;
-   cmd_buffer->state.dirty |= ANV_CMD_BUFFER_RS_DIRTY;
+   cmd_buffer->state.dynamic.line_width = lineWidth;
+   cmd_buffer->state.dirty |= ANV_CMD_DIRTY_DYNAMIC_LINE_WIDTH;
 }
 
-void anv_CmdBindDynamicColorBlendState(
-    VkCmdBuffer                                 cmdBuffer,
-    VkDynamicColorBlendState                    dynamicColorBlendState)
+void anv_CmdSetDepthBias(
+    VkCommandBuffer                             commandBuffer,
+    float                                       depthBiasConstantFactor,
+    float                                       depthBiasClamp,
+    float                                       depthBiasSlopeFactor)
 {
-   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, cmdBuffer);
-   ANV_FROM_HANDLE(anv_dynamic_cb_state, cb_state, dynamicColorBlendState);
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+
+   cmd_buffer->state.dynamic.depth_bias.bias = depthBiasConstantFactor;
+   cmd_buffer->state.dynamic.depth_bias.clamp = depthBiasClamp;
+   cmd_buffer->state.dynamic.depth_bias.slope = depthBiasSlopeFactor;
 
-   cmd_buffer->state.cb_state = cb_state;
-   cmd_buffer->state.dirty |= ANV_CMD_BUFFER_CB_DIRTY;
+   cmd_buffer->state.dirty |= ANV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS;
 }
 
-void anv_CmdBindDynamicDepthStencilState(
-    VkCmdBuffer                                 cmdBuffer,
-    VkDynamicDepthStencilState                  dynamicDepthStencilState)
+void anv_CmdSetBlendConstants(
+    VkCommandBuffer                             commandBuffer,
+    const float                                 blendConstants[4])
 {
-   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, cmdBuffer);
-   ANV_FROM_HANDLE(anv_dynamic_ds_state, ds_state, dynamicDepthStencilState);
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+
+   memcpy(cmd_buffer->state.dynamic.blend_constants,
+          blendConstants, sizeof(float) * 4);
 
-   cmd_buffer->state.ds_state = ds_state;
-   cmd_buffer->state.dirty |= ANV_CMD_BUFFER_DS_DIRTY;
+   cmd_buffer->state.dirty |= ANV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS;
+}
+
+void anv_CmdSetDepthBounds(
+    VkCommandBuffer                             commandBuffer,
+    float                                       minDepthBounds,
+    float                                       maxDepthBounds)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+
+   cmd_buffer->state.dynamic.depth_bounds.min = minDepthBounds;
+   cmd_buffer->state.dynamic.depth_bounds.max = maxDepthBounds;
+
+   cmd_buffer->state.dirty |= ANV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS;
+}
+
+void anv_CmdSetStencilCompareMask(
+    VkCommandBuffer                             commandBuffer,
+    VkStencilFaceFlags                          faceMask,
+    uint32_t                                    compareMask)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+
+   if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
+      cmd_buffer->state.dynamic.stencil_compare_mask.front = compareMask;
+   if (faceMask & VK_STENCIL_FACE_BACK_BIT)
+      cmd_buffer->state.dynamic.stencil_compare_mask.back = compareMask;
+
+   cmd_buffer->state.dirty |= ANV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK;
+}
+
+void anv_CmdSetStencilWriteMask(
+    VkCommandBuffer                             commandBuffer,
+    VkStencilFaceFlags                          faceMask,
+    uint32_t                                    writeMask)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+
+   if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
+      cmd_buffer->state.dynamic.stencil_write_mask.front = writeMask;
+   if (faceMask & VK_STENCIL_FACE_BACK_BIT)
+      cmd_buffer->state.dynamic.stencil_write_mask.back = writeMask;
+
+   cmd_buffer->state.dirty |= ANV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK;
+}
+
+void anv_CmdSetStencilReference(
+    VkCommandBuffer                             commandBuffer,
+    VkStencilFaceFlags                          faceMask,
+    uint32_t                                    reference)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+
+   if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
+      cmd_buffer->state.dynamic.stencil_reference.front = reference;
+   if (faceMask & VK_STENCIL_FACE_BACK_BIT)
+      cmd_buffer->state.dynamic.stencil_reference.back = reference;
+
+   cmd_buffer->state.dirty |= ANV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE;
 }
 
 void anv_CmdBindDescriptorSets(
-    VkCmdBuffer                                 cmdBuffer,
+    VkCommandBuffer                             commandBuffer,
     VkPipelineBindPoint                         pipelineBindPoint,
     VkPipelineLayout                            _layout,
     uint32_t                                    firstSet,
-    uint32_t                                    setCount,
+    uint32_t                                    descriptorSetCount,
     const VkDescriptorSet*                      pDescriptorSets,
     uint32_t                                    dynamicOffsetCount,
     const uint32_t*                             pDynamicOffsets)
 {
-   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, cmdBuffer);
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
    ANV_FROM_HANDLE(anv_pipeline_layout, layout, _layout);
    struct anv_descriptor_set_layout *set_layout;
 
-   assert(firstSet + setCount < MAX_SETS);
+   assert(firstSet + descriptorSetCount <= MAX_SETS);
 
    uint32_t dynamic_slot = 0;
-   for (uint32_t i = 0; i < setCount; i++) {
+   for (uint32_t i = 0; i < descriptorSetCount; i++) {
       ANV_FROM_HANDLE(anv_descriptor_set, set, pDescriptorSets[i]);
       set_layout = layout->set[firstSet + i].layout;
 
-      cmd_buffer->state.descriptors[firstSet + i].set = set;
-
-      assert(set_layout->num_dynamic_buffers <
-             ARRAY_SIZE(cmd_buffer->state.descriptors[0].dynamic_offsets));
-      memcpy(cmd_buffer->state.descriptors[firstSet + i].dynamic_offsets,
-             pDynamicOffsets + dynamic_slot,
-             set_layout->num_dynamic_buffers * sizeof(*pDynamicOffsets));
-
-      cmd_buffer->state.descriptors_dirty |= set_layout->shader_stages;
+      if (cmd_buffer->state.descriptors[firstSet + i] != set) {
+         cmd_buffer->state.descriptors[firstSet + i] = set;
+         cmd_buffer->state.descriptors_dirty |= set_layout->shader_stages;
+      }
 
-      dynamic_slot += set_layout->num_dynamic_buffers;
+      if (set_layout->dynamic_offset_count > 0) {
+         anv_foreach_stage(s, set_layout->shader_stages) {
+            anv_cmd_buffer_ensure_push_constant_field(cmd_buffer, s, dynamic);
+
+            struct anv_push_constants *push =
+               cmd_buffer->state.push_constants[s];
+
+            unsigned d = layout->set[firstSet + i].dynamic_offset_start;
+            const uint32_t *offsets = pDynamicOffsets + dynamic_slot;
+            struct anv_descriptor *desc = set->descriptors;
+
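+            /* Walk the bindings in layout order; each dynamic buffer's
+             * offset and range land in push->dynamic[] so shaders can read
+             * them back through push constants. */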
+            for (unsigned b = 0; b < set_layout->binding_count; b++) {
+               if (set_layout->binding[b].dynamic_offset_index < 0)
+                  continue;
+
+               unsigned array_size = set_layout->binding[b].array_size;
+               for (unsigned j = 0; j < array_size; j++) {
+                  uint32_t range = 0;
+                  if (desc->buffer_view)
+                     range = desc->buffer_view->range;
+                  push->dynamic[d].offset = *(offsets++);
+                  push->dynamic[d].range = range;
+                  desc++;
+                  d++;
+               }
+            }
+         }
+         cmd_buffer->state.push_constants_dirty |= set_layout->shader_stages;
+      }
+
+      dynamic_slot += set_layout->dynamic_offset_count;
+   }
 }
 
-void anv_CmdBindIndexBuffer(
-    VkCmdBuffer                                 cmdBuffer,
-    VkBuffer                                    _buffer,
-    VkDeviceSize                                offset,
-    VkIndexType                                 indexType)
-{
-   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, cmdBuffer);
-   ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
-
-   static const uint32_t vk_to_gen_index_type[] = {
-      [VK_INDEX_TYPE_UINT16]                    = INDEX_WORD,
-      [VK_INDEX_TYPE_UINT32]                    = INDEX_DWORD,
-   };
-
-   struct GEN8_3DSTATE_VF vf = {
-      GEN8_3DSTATE_VF_header,
-      .CutIndex = (indexType == VK_INDEX_TYPE_UINT16) ? UINT16_MAX : UINT32_MAX,
-   };
-   GEN8_3DSTATE_VF_pack(NULL, cmd_buffer->state.state_vf, &vf);
-
-   cmd_buffer->state.dirty |= ANV_CMD_BUFFER_INDEX_BUFFER_DIRTY;
-
-   anv_batch_emit(&cmd_buffer->batch, GEN8_3DSTATE_INDEX_BUFFER,
-                  .IndexFormat = vk_to_gen_index_type[indexType],
-                  .MemoryObjectControlState = GEN8_MOCS,
-                  .BufferStartingAddress = { buffer->bo, buffer->offset + offset },
-                  .BufferSize = buffer->size - offset);
-}
-
 void anv_CmdBindVertexBuffers(
-    VkCmdBuffer                                 cmdBuffer,
+    VkCommandBuffer                             commandBuffer,
     uint32_t                                    startBinding,
     uint32_t                                    bindingCount,
     const VkBuffer*                             pBuffers,
     const VkDeviceSize*                         pOffsets)
 {
-   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, cmdBuffer);
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
    struct anv_vertex_binding *vb = cmd_buffer->state.vertex_bindings;
 
    /* We have to defer setting up vertex buffer since we need the buffer
@@ -419,26 +572,63 @@ void anv_CmdBindVertexBuffers(
    }
 }
 
-static VkResult
-cmd_buffer_emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
-                              unsigned stage, struct anv_state *bt_state)
+static void
+add_surface_state_reloc(struct anv_cmd_buffer *cmd_buffer,
+                        struct anv_state state, struct anv_bo *bo, uint32_t offset)
+{
+   /* The address goes in SURFACE_STATE dword 1 for gens < 8 and dwords 8 and
+    * 9 for gen8+.  We only write the first dword for gen8+ here and rely on
+    * the initial state to set the high bits to 0. */
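+   /* In byte terms that is offset 4 (gen7) or offset 32 (gen8+), matching
+    * the computation below. */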
+
+   const uint32_t dword = cmd_buffer->device->info.gen < 8 ? 1 : 8;
+
+   anv_reloc_list_add(&cmd_buffer->surface_relocs, &cmd_buffer->pool->alloc,
+                      state.offset + dword * 4, bo, offset);
+}
+
+const struct anv_format *
+anv_format_for_descriptor_type(VkDescriptorType type)
+{
+   switch (type) {
+   case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
+   case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
+      return anv_format_for_vk_format(VK_FORMAT_R32G32B32A32_SFLOAT);
+
+   case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
+   case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
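+      /* Storage buffers are accessed untyped; VK_FORMAT_UNDEFINED is
+       * presumably mapped to a raw buffer format by the format table. */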
+      return anv_format_for_vk_format(VK_FORMAT_UNDEFINED);
+
+   default:
+      unreachable("Invalid descriptor type");
+   }
+}
+
+VkResult
+anv_cmd_buffer_emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
+                                  gl_shader_stage stage,
+                                  struct anv_state *bt_state)
 {
    struct anv_framebuffer *fb = cmd_buffer->state.framebuffer;
    struct anv_subpass *subpass = cmd_buffer->state.subpass;
    struct anv_pipeline_layout *layout;
-   uint32_t attachments, bias, size;
+   uint32_t color_count, bias, state_offset;
 
-   if (stage == VK_SHADER_STAGE_COMPUTE)
-      layout = cmd_buffer->state.compute_pipeline->layout;
-   else
+   switch (stage) {
+   case MESA_SHADER_FRAGMENT:
       layout = cmd_buffer->state.pipeline->layout;
-
-   if (stage == VK_SHADER_STAGE_FRAGMENT) {
       bias = MAX_RTS;
-      attachments = subpass->color_count;
-   } else {
+      color_count = subpass->color_count;
+      break;
+   case MESA_SHADER_COMPUTE:
+      layout = cmd_buffer->state.compute_pipeline->layout;
+      bias = 1;
+      color_count = 0;
+      break;
+   default:
+      layout = cmd_buffer->state.pipeline->layout;
       bias = 0;
-      attachments = 0;
+      color_count = 0;
+      break;
    }
 
    /* This is a little awkward: layout can be NULL but we still have to
@@ -446,107 +636,146 @@ cmd_buffer_emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
     * targets. */
    uint32_t surface_count = layout ? layout->stage[stage].surface_count : 0;
 
-   if (attachments + surface_count == 0)
+   if (color_count + surface_count == 0)
       return VK_SUCCESS;
 
-   size = (bias + surface_count) * sizeof(uint32_t);
-   *bt_state = anv_cmd_buffer_alloc_surface_state(cmd_buffer, size, 32);
+   *bt_state = anv_cmd_buffer_alloc_binding_table(cmd_buffer,
+                                                  bias + surface_count,
+                                                  &state_offset);
    uint32_t *bt_map = bt_state->map;
 
    if (bt_state->map == NULL)
       return VK_ERROR_OUT_OF_DEVICE_MEMORY;
 
-   /* This is highly annoying.  The Vulkan spec puts the depth-stencil
-    * attachments in with the color attachments.  Unfortunately, thanks to
-    * other aspects of the API, we cana't really saparate them before this
-    * point.  Therefore, we have to walk all of the attachments but only
-    * put the color attachments into the binding table.
-    */
-   for (uint32_t a = 0; a < attachments; a++) {
-      const struct anv_attachment_view *attachment =
+   for (uint32_t a = 0; a < color_count; a++) {
+      const struct anv_image_view *iview =
          fb->attachments[subpass->color_attachments[a]];
 
-      assert(attachment->attachment_type == ANV_ATTACHMENT_VIEW_TYPE_COLOR);
-      const struct anv_color_attachment_view *view =
-         (const struct anv_color_attachment_view *)attachment;
+      bt_map[a] = iview->color_rt_surface_state.offset + state_offset;
+      add_surface_state_reloc(cmd_buffer, iview->color_rt_surface_state,
+                              iview->bo, iview->offset);
+   }
 
-      struct anv_state state =
-         anv_cmd_buffer_alloc_surface_state(cmd_buffer, 64, 64);
+   if (stage == MESA_SHADER_COMPUTE &&
+       cmd_buffer->state.compute_pipeline->cs_prog_data.uses_num_work_groups) {
+      struct anv_bo *bo = cmd_buffer->state.num_workgroups_bo;
+      uint32_t bo_offset = cmd_buffer->state.num_workgroups_offset;
 
-      if (state.map == NULL)
-         return VK_ERROR_OUT_OF_DEVICE_MEMORY;
+      struct anv_state surface_state =
+         anv_cmd_buffer_alloc_surface_state(cmd_buffer);
 
-      memcpy(state.map, view->view.surface_state.map, 64);
+      const struct anv_format *format =
+         anv_format_for_descriptor_type(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
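+      /* Fill a raw buffer surface covering 12 bytes: the three dwords that
+       * hold the x/y/z workgroup counts (gl_NumWorkGroups). */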
+      anv_fill_buffer_surface_state(cmd_buffer->device, surface_state.map,
+                                    format->surface_format, bo_offset, 12, 1);
 
-      /* The address goes in dwords 8 and 9 of the SURFACE_STATE */
-      *(uint64_t *)(state.map + 8 * 4) =
-         anv_reloc_list_add(anv_cmd_buffer_current_surface_relocs(cmd_buffer),
-                            cmd_buffer->device,
-                            state.offset + 8 * 4,
-                            view->view.bo, view->view.offset);
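+      /* Without a coherent LLC the GPU will not snoop CPU caches, so flush
+       * the CPU-written surface state before the GPU consumes it. */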
+      if (!cmd_buffer->device->info.has_llc)
+         anv_state_clflush(surface_state);
 
-      bt_map[a] = state.offset;
+      bt_map[0] = surface_state.offset + state_offset;
+      add_surface_state_reloc(cmd_buffer, surface_state, bo, bo_offset);
    }
 
    if (layout == NULL)
-      return VK_SUCCESS;
+      goto out;
 
-   for (uint32_t set = 0; set < layout->num_sets; set++) {
-      struct anv_descriptor_set_binding *d = &cmd_buffer->state.descriptors[set];
-      struct anv_descriptor_set_layout *set_layout = layout->set[set].layout;
-      struct anv_descriptor_slot *surface_slots =
-         set_layout->stage[stage].surface_start;
+   if (layout->stage[stage].image_count > 0) {
+      VkResult result =
+         anv_cmd_buffer_ensure_push_constant_field(cmd_buffer, stage, images);
+      if (result != VK_SUCCESS)
+         return result;
 
-      uint32_t start = bias + layout->set[set].surface_start[stage];
+      cmd_buffer->state.push_constants_dirty |= 1 << stage;
+   }
+
+   uint32_t image = 0;
+   for (uint32_t s = 0; s < layout->stage[stage].surface_count; s++) {
+      struct anv_pipeline_binding *binding =
+         &layout->stage[stage].surface_to_descriptor[s];
+      struct anv_descriptor_set *set =
+         cmd_buffer->state.descriptors[binding->set];
+      struct anv_descriptor *desc = &set->descriptors[binding->offset];
+
+      struct anv_state surface_state;
+      struct anv_bo *bo;
+      uint32_t bo_offset;
+
+      switch (desc->type) {
+      case VK_DESCRIPTOR_TYPE_SAMPLER:
+         /* Nothing for us to do here */
+         continue;
+
+      case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
+      case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
+      case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
+         surface_state = desc->image_view->nonrt_surface_state;
+         bo = desc->image_view->bo;
+         bo_offset = desc->image_view->offset;
+         break;
 
-      for (uint32_t b = 0; b < set_layout->stage[stage].surface_count; b++) {
-         struct anv_surface_view *view =
-            d->set->descriptors[surface_slots[b].index].view;
+      case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: {
+         surface_state = desc->image_view->storage_surface_state;
+         bo = desc->image_view->bo;
+         bo_offset = desc->image_view->offset;
 
-         if (!view)
-            continue;
+         struct brw_image_param *image_param =
+            &cmd_buffer->state.push_constants[stage]->images[image++];
 
-         struct anv_state state =
-            anv_cmd_buffer_alloc_surface_state(cmd_buffer, 64, 64);
+         anv_image_view_fill_image_param(cmd_buffer->device, desc->image_view,
+                                         image_param);
+         image_param->surface_idx = bias + s;
+         break;
+      }
 
-         if (state.map == NULL)
-            return VK_ERROR_OUT_OF_DEVICE_MEMORY;
+      case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
+      case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
+      case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
+      case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
+      case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
+         surface_state = desc->buffer_view->surface_state;
+         bo = desc->buffer_view->bo;
+         bo_offset = desc->buffer_view->offset;
+         break;
 
-         uint32_t offset;
-         if (surface_slots[b].dynamic_slot >= 0) {
-            uint32_t dynamic_offset =
-               d->dynamic_offsets[surface_slots[b].dynamic_slot];
+      case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
+         surface_state = desc->buffer_view->storage_surface_state;
+         bo = desc->buffer_view->bo;
+         bo_offset = desc->buffer_view->offset;
 
-            offset = view->offset + dynamic_offset;
-            anv_fill_buffer_surface_state(state.map, view->format, offset,
-                                          view->range - dynamic_offset);
-         } else {
-            offset = view->offset;
-            memcpy(state.map, view->surface_state.map, 64);
-         }
+         struct brw_image_param *image_param =
+            &cmd_buffer->state.push_constants[stage]->images[image++];
 
-         /* The address goes in dwords 8 and 9 of the SURFACE_STATE */
-         *(uint64_t *)(state.map + 8 * 4) =
-            anv_reloc_list_add(anv_cmd_buffer_current_surface_relocs(cmd_buffer),
-                               cmd_buffer->device,
-                               state.offset + 8 * 4,
-                               view->bo, offset);
+         anv_buffer_view_fill_image_param(cmd_buffer->device, desc->buffer_view,
+                                          image_param);
+         image_param->surface_idx = bias + s;
+         break;
 
-         bt_map[start + b] = state.offset;
+      default:
+         assert(!"Invalid descriptor type");
+         continue;
       }
+
+      bt_map[bias + s] = surface_state.offset + state_offset;
+      add_surface_state_reloc(cmd_buffer, surface_state, bo, bo_offset);
    }
+   assert(image == layout->stage[stage].image_count);
+
+ out:
+   if (!cmd_buffer->device->info.has_llc)
+      anv_state_clflush(*bt_state);
 
    return VK_SUCCESS;
 }
 
-static VkResult
-cmd_buffer_emit_samplers(struct anv_cmd_buffer *cmd_buffer,
-                         unsigned stage, struct anv_state *state)
+VkResult
+anv_cmd_buffer_emit_samplers(struct anv_cmd_buffer *cmd_buffer,
+                             gl_shader_stage stage, struct anv_state *state)
 {
    struct anv_pipeline_layout *layout;
    uint32_t sampler_count;
 
-   if (stage == VK_SHADER_STAGE_COMPUTE)
+   if (stage == MESA_SHADER_COMPUTE)
       layout = cmd_buffer->state.compute_pipeline->layout;
    else
       layout = cmd_buffer->state.pipeline->layout;
@@ -561,129 +790,53 @@ cmd_buffer_emit_samplers(struct anv_cmd_buffer *cmd_buffer,
    if (state->map == NULL)
       return VK_ERROR_OUT_OF_DEVICE_MEMORY;
 
-   for (uint32_t set = 0; set < layout->num_sets; set++) {
-      struct anv_descriptor_set_binding *d = &cmd_buffer->state.descriptors[set];
-      struct anv_descriptor_set_layout *set_layout = layout->set[set].layout;
-      struct anv_descriptor_slot *sampler_slots =
-         set_layout->stage[stage].sampler_start;
-
-      uint32_t start = layout->set[set].sampler_start[stage];
-
-      for (uint32_t b = 0; b < set_layout->stage[stage].sampler_count; b++) {
-         struct anv_sampler *sampler =
-            d->set->descriptors[sampler_slots[b].index].sampler;
+   for (uint32_t s = 0; s < layout->stage[stage].sampler_count; s++) {
+      struct anv_pipeline_binding *binding =
+         &layout->stage[stage].sampler_to_descriptor[s];
+      struct anv_descriptor_set *set =
+         cmd_buffer->state.descriptors[binding->set];
+      struct anv_descriptor *desc = &set->descriptors[binding->offset];
 
-         if (!sampler)
-            continue;
+      if (desc->type != VK_DESCRIPTOR_TYPE_SAMPLER &&
+          desc->type != VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER)
+         continue;
 
-         memcpy(state->map + (start + b) * 16,
-                sampler->state, sizeof(sampler->state));
-      }
-   }
-
-   return VK_SUCCESS;
-}
+      struct anv_sampler *sampler = desc->sampler;
 
-static VkResult
-flush_descriptor_set(struct anv_cmd_buffer *cmd_buffer, uint32_t stage)
-{
-   struct anv_state surfaces = { 0, }, samplers = { 0, };
-   VkResult result;
+      /* This can happen if we have an unfilled slot since TYPE_SAMPLER
+       * happens to be zero.
+       */
+      if (sampler == NULL)
+         continue;
 
-   result = cmd_buffer_emit_samplers(cmd_buffer, stage, &samplers);
-   if (result != VK_SUCCESS)
-      return result;
-   result = cmd_buffer_emit_binding_table(cmd_buffer, stage, &surfaces);
-   if (result != VK_SUCCESS)
-      return result;
-
-   static const uint32_t sampler_state_opcodes[] = {
-      [VK_SHADER_STAGE_VERTEX]                  = 43,
-      [VK_SHADER_STAGE_TESS_CONTROL]            = 44, /* HS */
-      [VK_SHADER_STAGE_TESS_EVALUATION]         = 45, /* DS */
-      [VK_SHADER_STAGE_GEOMETRY]                = 46,
-      [VK_SHADER_STAGE_FRAGMENT]                = 47,
-      [VK_SHADER_STAGE_COMPUTE]                 = 0,
-   };
-
-   static const uint32_t binding_table_opcodes[] = {
-      [VK_SHADER_STAGE_VERTEX]                  = 38,
-      [VK_SHADER_STAGE_TESS_CONTROL]            = 39,
-      [VK_SHADER_STAGE_TESS_EVALUATION]         = 40,
-      [VK_SHADER_STAGE_GEOMETRY]                = 41,
-      [VK_SHADER_STAGE_FRAGMENT]                = 42,
-      [VK_SHADER_STAGE_COMPUTE]                 = 0,
-   };
-
-   if (samplers.alloc_size > 0) {
-      anv_batch_emit(&cmd_buffer->batch,
-                     GEN8_3DSTATE_SAMPLER_STATE_POINTERS_VS,
-                     ._3DCommandSubOpcode  = sampler_state_opcodes[stage],
-                     .PointertoVSSamplerState = samplers.offset);
+      memcpy(state->map + (s * 16),
+             sampler->state, sizeof(sampler->state));
    }
 
-   if (surfaces.alloc_size > 0) {
-      anv_batch_emit(&cmd_buffer->batch,
-                     GEN8_3DSTATE_BINDING_TABLE_POINTERS_VS,
-                     ._3DCommandSubOpcode  = binding_table_opcodes[stage],
-                     .PointertoVSBindingTable = surfaces.offset);
-   }
+   if (!cmd_buffer->device->info.has_llc)
+      anv_state_clflush(*state);
 
    return VK_SUCCESS;
 }
 
-static void
-flush_descriptor_sets(struct anv_cmd_buffer *cmd_buffer)
-{
-   uint32_t s, dirty = cmd_buffer->state.descriptors_dirty &
-                       cmd_buffer->state.pipeline->active_stages;
-
-   VkResult result = VK_SUCCESS;
-   for_each_bit(s, dirty) {
-      result = flush_descriptor_set(cmd_buffer, s);
-      if (result != VK_SUCCESS)
-         break;
-   }
-
-   if (result != VK_SUCCESS) {
-      assert(result == VK_ERROR_OUT_OF_DEVICE_MEMORY);
-
-      result = anv_cmd_buffer_new_surface_state_bo(cmd_buffer);
-      assert(result == VK_SUCCESS);
-
-      /* Re-emit state base addresses so we get the new surface state base
-       * address before we start emitting binding tables etc.
-       */
-      anv_cmd_buffer_emit_state_base_address(cmd_buffer);
-
-      /* Re-emit all active binding tables */
-      for_each_bit(s, cmd_buffer->state.pipeline->active_stages) {
-         result = flush_descriptor_set(cmd_buffer, s);
-
-         /* It had better succeed this time */
-         assert(result == VK_SUCCESS);
-      }
-   }
-
-   cmd_buffer->state.descriptors_dirty &= ~cmd_buffer->state.pipeline->active_stages;
-}
-
-static struct anv_state
+struct anv_state
 anv_cmd_buffer_emit_dynamic(struct anv_cmd_buffer *cmd_buffer,
-                             uint32_t *a, uint32_t dwords, uint32_t alignment)
+                            const void *data, uint32_t size, uint32_t alignment)
 {
    struct anv_state state;
 
-   state = anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
-                                              dwords * 4, alignment);
-   memcpy(state.map, a, dwords * 4);
+   state = anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, size, alignment);
+   memcpy(state.map, data, size);
+
+   if (!cmd_buffer->device->info.has_llc)
+      anv_state_clflush(state);
 
-   VG(VALGRIND_CHECK_MEM_IS_DEFINED(state.map, dwords * 4));
+   VG(VALGRIND_CHECK_MEM_IS_DEFINED(state.map, size));
 
    return state;
 }
 
-static struct anv_state
+struct anv_state
 anv_cmd_buffer_merge_dynamic(struct anv_cmd_buffer *cmd_buffer,
                              uint32_t *a, uint32_t *b,
                              uint32_t dwords, uint32_t alignment)
@@ -697,687 +850,157 @@ anv_cmd_buffer_merge_dynamic(struct anv_cmd_buffer *cmd_buffer,
    for (uint32_t i = 0; i < dwords; i++)
       p[i] = a[i] | b[i];
 
+   if (!cmd_buffer->device->info.has_llc)
+      anv_state_clflush(state);
+
    VG(VALGRIND_CHECK_MEM_IS_DEFINED(p, dwords * 4));
 
    return state;
 }
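+
+/* Note: OR-merging works because the two inputs pack disjoint fields of the
+ * same GEN state struct; the old gen8 path below used it to combine the
+ * depth-stencil and color-blend halves of COLOR_CALC_STATE. */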
 
-static VkResult
-flush_compute_descriptor_set(struct anv_cmd_buffer *cmd_buffer)
-{
-   struct anv_device *device = cmd_buffer->device;
-   struct anv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
-   struct anv_state surfaces = { 0, }, samplers = { 0, };
-   VkResult result;
-
-   result = cmd_buffer_emit_samplers(cmd_buffer,
-                                     VK_SHADER_STAGE_COMPUTE, &samplers);
-   if (result != VK_SUCCESS)
-      return result;
-   result = cmd_buffer_emit_binding_table(cmd_buffer,
-                                          VK_SHADER_STAGE_COMPUTE, &surfaces);
-   if (result != VK_SUCCESS)
-      return result;
-
-   struct GEN8_INTERFACE_DESCRIPTOR_DATA desc = {
-      .KernelStartPointer = pipeline->cs_simd,
-      .KernelStartPointerHigh = 0,
-      .BindingTablePointer = surfaces.offset,
-      .BindingTableEntryCount = 0,
-      .SamplerStatePointer = samplers.offset,
-      .SamplerCount = 0,
-      .NumberofThreadsinGPGPUThreadGroup = 0 /* FIXME: Really? */
-   };
-
-   uint32_t size = GEN8_INTERFACE_DESCRIPTOR_DATA_length * sizeof(uint32_t);
-   struct anv_state state =
-      anv_state_pool_alloc(&device->dynamic_state_pool, size, 64);
-
-   GEN8_INTERFACE_DESCRIPTOR_DATA_pack(NULL, state.map, &desc);
-
-   anv_batch_emit(&cmd_buffer->batch, GEN8_MEDIA_INTERFACE_DESCRIPTOR_LOAD,
-                  .InterfaceDescriptorTotalLength = size,
-                  .InterfaceDescriptorDataStartAddress = state.offset);
-
-   return VK_SUCCESS;
-}
-
-static void
-anv_cmd_buffer_flush_compute_state(struct anv_cmd_buffer *cmd_buffer)
+void
+anv_cmd_buffer_begin_subpass(struct anv_cmd_buffer *cmd_buffer,
+                             struct anv_subpass *subpass)
 {
-   struct anv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
-   VkResult result;
-
-   assert(pipeline->active_stages == VK_SHADER_STAGE_COMPUTE_BIT);
-
-   if (cmd_buffer->state.current_pipeline != GPGPU) {
-      anv_batch_emit(&cmd_buffer->batch, GEN8_PIPELINE_SELECT,
-                     .PipelineSelection = GPGPU);
-      cmd_buffer->state.current_pipeline = GPGPU;
-   }
-
-   if (cmd_buffer->state.compute_dirty & ANV_CMD_BUFFER_PIPELINE_DIRTY)
-      anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->batch);
-
-   if ((cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_COMPUTE_BIT) ||
-       (cmd_buffer->state.compute_dirty & ANV_CMD_BUFFER_PIPELINE_DIRTY)) {
-      result = flush_compute_descriptor_set(cmd_buffer);
-      assert(result == VK_SUCCESS);
-      cmd_buffer->state.descriptors_dirty &= ~VK_SHADER_STAGE_COMPUTE;
+   switch (cmd_buffer->device->info.gen) {
+   case 7:
+      gen7_cmd_buffer_begin_subpass(cmd_buffer, subpass);
+      break;
+   case 8:
+      gen8_cmd_buffer_begin_subpass(cmd_buffer, subpass);
+      break;
+   case 9:
+      gen9_cmd_buffer_begin_subpass(cmd_buffer, subpass);
+      break;
+   default:
+      unreachable("unsupported gen\n");
    }
-
-   cmd_buffer->state.compute_dirty = 0;
 }
 
-static void
-anv_cmd_buffer_flush_state(struct anv_cmd_buffer *cmd_buffer)
+struct anv_state
+anv_cmd_buffer_push_constants(struct anv_cmd_buffer *cmd_buffer,
+                              gl_shader_stage stage)
 {
-   struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
-   uint32_t *p;
+   struct anv_push_constants *data =
+      cmd_buffer->state.push_constants[stage];
+   struct brw_stage_prog_data *prog_data =
+      cmd_buffer->state.pipeline->prog_data[stage];
 
-   uint32_t vb_emit = cmd_buffer->state.vb_dirty & pipeline->vb_used;
+   /* If we don't actually have any push constants, bail. */
+   if (data == NULL || prog_data->nr_params == 0)
+      return (struct anv_state) { .offset = 0 };
 
-   assert((pipeline->active_stages & VK_SHADER_STAGE_COMPUTE_BIT) == 0);
-
-   if (cmd_buffer->state.current_pipeline != _3D) {
-      anv_batch_emit(&cmd_buffer->batch, GEN8_PIPELINE_SELECT,
-                     .PipelineSelection = _3D);
-      cmd_buffer->state.current_pipeline = _3D;
-   }
-
-   if (vb_emit) {
-      const uint32_t num_buffers = __builtin_popcount(vb_emit);
-      const uint32_t num_dwords = 1 + num_buffers * 4;
-
-      p = anv_batch_emitn(&cmd_buffer->batch, num_dwords,
-                          GEN8_3DSTATE_VERTEX_BUFFERS);
-      uint32_t vb, i = 0;
-      for_each_bit(vb, vb_emit) {
-         struct anv_buffer *buffer = cmd_buffer->state.vertex_bindings[vb].buffer;
-         uint32_t offset = cmd_buffer->state.vertex_bindings[vb].offset;
-
-         struct GEN8_VERTEX_BUFFER_STATE state = {
-            .VertexBufferIndex = vb,
-            .MemoryObjectControlState = GEN8_MOCS,
-            .AddressModifyEnable = true,
-            .BufferPitch = pipeline->binding_stride[vb],
-            .BufferStartingAddress = { buffer->bo, buffer->offset + offset },
-            .BufferSize = buffer->size - offset
-         };
-
-         GEN8_VERTEX_BUFFER_STATE_pack(&cmd_buffer->batch, &p[1 + i * 4], &state);
-         i++;
-      }
-   }
-
-   if (cmd_buffer->state.dirty & ANV_CMD_BUFFER_PIPELINE_DIRTY) {
-      /* If somebody compiled a pipeline after starting a command buffer the
-       * scratch bo may have grown since we started this cmd buffer (and
-       * emitted STATE_BASE_ADDRESS).  If we're binding that pipeline now,
-       * reemit STATE_BASE_ADDRESS so that we use the bigger scratch bo. */
-      if (cmd_buffer->state.scratch_size < pipeline->total_scratch)
-         anv_cmd_buffer_emit_state_base_address(cmd_buffer);
-
-      anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->batch);
-   }
-
-   if (cmd_buffer->state.descriptors_dirty)
-      flush_descriptor_sets(cmd_buffer);
-
-   if (cmd_buffer->state.dirty & ANV_CMD_BUFFER_VP_DIRTY) {
-      struct anv_dynamic_vp_state *vp_state = cmd_buffer->state.vp_state;
-      anv_batch_emit(&cmd_buffer->batch, GEN8_3DSTATE_SCISSOR_STATE_POINTERS,
-                     .ScissorRectPointer = vp_state->scissor.offset);
-      anv_batch_emit(&cmd_buffer->batch, GEN8_3DSTATE_VIEWPORT_STATE_POINTERS_CC,
-                     .CCViewportPointer = vp_state->cc_vp.offset);
-      anv_batch_emit(&cmd_buffer->batch, GEN8_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP,
-                     .SFClipViewportPointer = vp_state->sf_clip_vp.offset);
-   }
-
-   if (cmd_buffer->state.dirty & (ANV_CMD_BUFFER_PIPELINE_DIRTY |
-                                  ANV_CMD_BUFFER_RS_DIRTY)) {
-      anv_batch_emit_merge(&cmd_buffer->batch,
-                           cmd_buffer->state.rs_state->state_sf,
-                           pipeline->state_sf);
-      anv_batch_emit_merge(&cmd_buffer->batch,
-                           cmd_buffer->state.rs_state->state_raster,
-                           pipeline->state_raster);
-   }
-
-   if (cmd_buffer->state.ds_state &&
-       (cmd_buffer->state.dirty & (ANV_CMD_BUFFER_PIPELINE_DIRTY |
-                                   ANV_CMD_BUFFER_DS_DIRTY))) {
-      anv_batch_emit_merge(&cmd_buffer->batch,
-                           cmd_buffer->state.ds_state->state_wm_depth_stencil,
-                           pipeline->state_wm_depth_stencil);
-   }
-
-   if (cmd_buffer->state.dirty & (ANV_CMD_BUFFER_CB_DIRTY |
-                                  ANV_CMD_BUFFER_DS_DIRTY)) {
-      struct anv_state state;
-      if (cmd_buffer->state.ds_state == NULL)
-         state = anv_cmd_buffer_emit_dynamic(cmd_buffer,
-                                             cmd_buffer->state.cb_state->state_color_calc,
-                                             GEN8_COLOR_CALC_STATE_length, 64);
-      else if (cmd_buffer->state.cb_state == NULL)
-         state = anv_cmd_buffer_emit_dynamic(cmd_buffer,
-                                             cmd_buffer->state.ds_state->state_color_calc,
-                                             GEN8_COLOR_CALC_STATE_length, 64);
-      else
-         state = anv_cmd_buffer_merge_dynamic(cmd_buffer,
-                                              cmd_buffer->state.ds_state->state_color_calc,
-                                              cmd_buffer->state.cb_state->state_color_calc,
-                                              GEN8_COLOR_CALC_STATE_length, 64);
-
-      anv_batch_emit(&cmd_buffer->batch,
-                     GEN8_3DSTATE_CC_STATE_POINTERS,
-                     .ColorCalcStatePointer = state.offset,
-                     .ColorCalcStatePointerValid = true);
+   struct anv_state state =
+      anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
+                                         prog_data->nr_params * sizeof(float),
+                                         32 /* bottom 5 bits MBZ */);
+
+   /* Walk through the param array and fill the buffer with data */
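+   /* Each prog_data->param[i] entry here encodes a byte offset into the
+    * stage's anv_push_constants block (smuggled through the pointer-typed
+    * param array), hence the uintptr_t cast below. */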
+   uint32_t *u32_map = state.map;
+   for (unsigned i = 0; i < prog_data->nr_params; i++) {
+      uint32_t offset = (uintptr_t)prog_data->param[i];
+      u32_map[i] = *(uint32_t *)((uint8_t *)data + offset);
    }
 
-   if (cmd_buffer->state.dirty & (ANV_CMD_BUFFER_PIPELINE_DIRTY |
-                                  ANV_CMD_BUFFER_INDEX_BUFFER_DIRTY)) {
-      anv_batch_emit_merge(&cmd_buffer->batch,
-                           cmd_buffer->state.state_vf, pipeline->state_vf);
-   }
+   if (!cmd_buffer->device->info.has_llc)
+      anv_state_clflush(state);
 
-   cmd_buffer->state.vb_dirty &= ~vb_emit;
-   cmd_buffer->state.dirty = 0;
-}
-
-void anv_CmdDraw(
-    VkCmdBuffer                                 cmdBuffer,
-    uint32_t                                    firstVertex,
-    uint32_t                                    vertexCount,
-    uint32_t                                    firstInstance,
-    uint32_t                                    instanceCount)
-{
-   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, cmdBuffer);
-
-   anv_cmd_buffer_flush_state(cmd_buffer);
-
-   anv_batch_emit(&cmd_buffer->batch, GEN8_3DPRIMITIVE,
-                  .VertexAccessType = SEQUENTIAL,
-                  .VertexCountPerInstance = vertexCount,
-                  .StartVertexLocation = firstVertex,
-                  .InstanceCount = instanceCount,
-                  .StartInstanceLocation = firstInstance,
-                  .BaseVertexLocation = 0);
-}
-
-void anv_CmdDrawIndexed(
-    VkCmdBuffer                                 cmdBuffer,
-    uint32_t                                    firstIndex,
-    uint32_t                                    indexCount,
-    int32_t                                     vertexOffset,
-    uint32_t                                    firstInstance,
-    uint32_t                                    instanceCount)
-{
-   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, cmdBuffer);
-
-   anv_cmd_buffer_flush_state(cmd_buffer);
-
-   anv_batch_emit(&cmd_buffer->batch, GEN8_3DPRIMITIVE,
-                  .VertexAccessType = RANDOM,
-                  .VertexCountPerInstance = indexCount,
-                  .StartVertexLocation = firstIndex,
-                  .InstanceCount = instanceCount,
-                  .StartInstanceLocation = firstInstance,
-                  .BaseVertexLocation = vertexOffset);
-}
-
-static void
-anv_batch_lrm(struct anv_batch *batch,
-              uint32_t reg, struct anv_bo *bo, uint32_t offset)
-{
-   anv_batch_emit(batch, GEN8_MI_LOAD_REGISTER_MEM,
-                  .RegisterAddress = reg,
-                  .MemoryAddress = { bo, offset });
-}
-
-static void
-anv_batch_lri(struct anv_batch *batch, uint32_t reg, uint32_t imm)
-{
-   anv_batch_emit(batch, GEN8_MI_LOAD_REGISTER_IMM,
-                  .RegisterOffset = reg,
-                  .DataDWord = imm);
-}
-
-/* Auto-Draw / Indirect Registers */
-#define GEN7_3DPRIM_END_OFFSET          0x2420
-#define GEN7_3DPRIM_START_VERTEX        0x2430
-#define GEN7_3DPRIM_VERTEX_COUNT        0x2434
-#define GEN7_3DPRIM_INSTANCE_COUNT      0x2438
-#define GEN7_3DPRIM_START_INSTANCE      0x243C
-#define GEN7_3DPRIM_BASE_VERTEX         0x2440
-
-void anv_CmdDrawIndirect(
-    VkCmdBuffer                                 cmdBuffer,
-    VkBuffer                                    _buffer,
-    VkDeviceSize                                offset,
-    uint32_t                                    count,
-    uint32_t                                    stride)
-{
-   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, cmdBuffer);
-   ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
-   struct anv_bo *bo = buffer->bo;
-   uint32_t bo_offset = buffer->offset + offset;
-
-   anv_cmd_buffer_flush_state(cmd_buffer);
-
-   anv_batch_lrm(&cmd_buffer->batch, GEN7_3DPRIM_VERTEX_COUNT, bo, bo_offset);
-   anv_batch_lrm(&cmd_buffer->batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset + 4);
-   anv_batch_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_VERTEX, bo, bo_offset + 8);
-   anv_batch_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_INSTANCE, bo, bo_offset + 12);
-   anv_batch_lri(&cmd_buffer->batch, GEN7_3DPRIM_BASE_VERTEX, 0);
-
-   anv_batch_emit(&cmd_buffer->batch, GEN8_3DPRIMITIVE,
-                  .IndirectParameterEnable = true,
-                  .VertexAccessType = SEQUENTIAL);
-}
-
-void anv_CmdDrawIndexedIndirect(
-    VkCmdBuffer                                 cmdBuffer,
-    VkBuffer                                    _buffer,
-    VkDeviceSize                                offset,
-    uint32_t                                    count,
-    uint32_t                                    stride)
-{
-   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, cmdBuffer);
-   ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
-   struct anv_bo *bo = buffer->bo;
-   uint32_t bo_offset = buffer->offset + offset;
-
-   anv_cmd_buffer_flush_state(cmd_buffer);
-
-   anv_batch_lrm(&cmd_buffer->batch, GEN7_3DPRIM_VERTEX_COUNT, bo, bo_offset);
-   anv_batch_lrm(&cmd_buffer->batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset + 4);
-   anv_batch_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_VERTEX, bo, bo_offset + 8);
-   anv_batch_lrm(&cmd_buffer->batch, GEN7_3DPRIM_BASE_VERTEX, bo, bo_offset + 12);
-   anv_batch_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_INSTANCE, bo, bo_offset + 16);
-
-   anv_batch_emit(&cmd_buffer->batch, GEN8_3DPRIMITIVE,
-                  .IndirectParameterEnable = true,
-                  .VertexAccessType = RANDOM);
-}
-
-void anv_CmdDispatch(
-    VkCmdBuffer                                 cmdBuffer,
-    uint32_t                                    x,
-    uint32_t                                    y,
-    uint32_t                                    z)
-{
-   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, cmdBuffer);
-   struct anv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
-   struct brw_cs_prog_data *prog_data = &pipeline->cs_prog_data;
-
-   anv_cmd_buffer_flush_compute_state(cmd_buffer);
-
-   anv_batch_emit(&cmd_buffer->batch, GEN8_GPGPU_WALKER,
-                  .SIMDSize = prog_data->simd_size / 16,
-                  .ThreadDepthCounterMaximum = 0,
-                  .ThreadHeightCounterMaximum = 0,
-                  .ThreadWidthCounterMaximum = pipeline->cs_thread_width_max,
-                  .ThreadGroupIDXDimension = x,
-                  .ThreadGroupIDYDimension = y,
-                  .ThreadGroupIDZDimension = z,
-                  .RightExecutionMask = pipeline->cs_right_mask,
-                  .BottomExecutionMask = 0xffffffff);
-
-   anv_batch_emit(&cmd_buffer->batch, GEN8_MEDIA_STATE_FLUSH);
+   return state;
 }
 
-#define GPGPU_DISPATCHDIMX 0x2500
-#define GPGPU_DISPATCHDIMY 0x2504
-#define GPGPU_DISPATCHDIMZ 0x2508
-
-void anv_CmdDispatchIndirect(
-    VkCmdBuffer                                 cmdBuffer,
-    VkBuffer                                    _buffer,
-    VkDeviceSize                                offset)
+struct anv_state
+anv_cmd_buffer_cs_push_constants(struct anv_cmd_buffer *cmd_buffer)
 {
-   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, cmdBuffer);
-   ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
+   struct anv_push_constants *data =
+      cmd_buffer->state.push_constants[MESA_SHADER_COMPUTE];
    struct anv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
-   struct brw_cs_prog_data *prog_data = &pipeline->cs_prog_data;
-   struct anv_bo *bo = buffer->bo;
-   uint32_t bo_offset = buffer->offset + offset;
-
-   anv_cmd_buffer_flush_compute_state(cmd_buffer);
-
-   anv_batch_lrm(&cmd_buffer->batch, GPGPU_DISPATCHDIMX, bo, bo_offset);
-   anv_batch_lrm(&cmd_buffer->batch, GPGPU_DISPATCHDIMY, bo, bo_offset + 4);
-   anv_batch_lrm(&cmd_buffer->batch, GPGPU_DISPATCHDIMZ, bo, bo_offset + 8);
-
-   anv_batch_emit(&cmd_buffer->batch, GEN8_GPGPU_WALKER,
-                  .IndirectParameterEnable = true,
-                  .SIMDSize = prog_data->simd_size / 16,
-                  .ThreadDepthCounterMaximum = 0,
-                  .ThreadHeightCounterMaximum = 0,
-                  .ThreadWidthCounterMaximum = pipeline->cs_thread_width_max,
-                  .RightExecutionMask = pipeline->cs_right_mask,
-                  .BottomExecutionMask = 0xffffffff);
-
-   anv_batch_emit(&cmd_buffer->batch, GEN8_MEDIA_STATE_FLUSH);
-}
-
-void anv_CmdSetEvent(
-    VkCmdBuffer                                 cmdBuffer,
-    VkEvent                                     event,
-    VkPipelineStageFlags                        stageMask)
-{
-   stub();
-}
-
-void anv_CmdResetEvent(
-    VkCmdBuffer                                 cmdBuffer,
-    VkEvent                                     event,
-    VkPipelineStageFlags                        stageMask)
-{
-   stub();
-}
-
-void anv_CmdWaitEvents(
-    VkCmdBuffer                                 cmdBuffer,
-    uint32_t                                    eventCount,
-    const VkEvent*                              pEvents,
-    VkPipelineStageFlags                        srcStageMask,
-    VkPipelineStageFlags                        destStageMask,
-    uint32_t                                    memBarrierCount,
-    const void* const*                          ppMemBarriers)
-{
-   stub();
-}
-
-void anv_CmdPipelineBarrier(
-    VkCmdBuffer                                 cmdBuffer,
-    VkPipelineStageFlags                        srcStageMask,
-    VkPipelineStageFlags                        destStageMask,
-    VkBool32                                    byRegion,
-    uint32_t                                    memBarrierCount,
-    const void* const*                          ppMemBarriers)
-{
-   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, cmdBuffer);
-   uint32_t b, *dw;
+   const struct brw_cs_prog_data *cs_prog_data = &pipeline->cs_prog_data;
+   const struct brw_stage_prog_data *prog_data = &cs_prog_data->base;
+
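+   /* A GRF register holds 8 dwords (32 bytes), so the local-ID payload
+    * occupies local_invocation_id_regs * 8 dwords ahead of the uniforms,
+    * and each thread's slice is padded out to a whole register.
+    */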
+   const unsigned local_id_dwords = cs_prog_data->local_invocation_id_regs * 8;
+   const unsigned push_constant_data_size =
+      (local_id_dwords + prog_data->nr_params) * 4;
+   const unsigned reg_aligned_constant_size = ALIGN(push_constant_data_size, 32);
+   const unsigned param_aligned_count =
+      reg_aligned_constant_size / sizeof(uint32_t);
+
+   /* If we don't actually have any push constants, bail. */
+   if (reg_aligned_constant_size == 0)
+      return (struct anv_state) { .offset = 0 };
+
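+   /* Each thread gets its own reg-aligned slice of the buffer, and the
+    * whole allocation is aligned as the hardware requires: 32 bytes before
+    * gen8, 64 bytes on gen8 and later.
+    */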
+   const unsigned threads = pipeline->cs_thread_width_max;
+   const unsigned total_push_constants_size =
+      reg_aligned_constant_size * threads;
+   const unsigned push_constant_alignment =
+      cmd_buffer->device->info.gen < 8 ? 32 : 64;
+   const unsigned aligned_total_push_constants_size =
+      ALIGN(total_push_constants_size, push_constant_alignment);
+   struct anv_state state =
+      anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
+                                         aligned_total_push_constants_size,
+                                         push_constant_alignment);
 
-   struct GEN8_PIPE_CONTROL cmd = {
-      GEN8_PIPE_CONTROL_header,
-      .PostSyncOperation = NoWrite,
-   };
+   /* Walk through the param array and fill the buffer with data */
+   uint32_t *u32_map = state.map;
 
-   /* XXX: I think waitEvent is a no-op on our HW.  We should verify that. */
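+   /* Write each thread's local invocation IDs at the start of that
+    * thread's slice.
+    */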
+   brw_cs_fill_local_id_payload(cs_prog_data, u32_map, threads,
+                                reg_aligned_constant_size);
 
-   if (anv_clear_mask(&srcStageMask, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT)) {
-      /* This is just what PIPE_CONTROL does */
+   /* Set up uniform data for the first thread */
+   for (unsigned i = 0; i < prog_data->nr_params; i++) {
+      uint32_t offset = (uintptr_t)prog_data->param[i];
+      u32_map[local_id_dwords + i] = *(uint32_t *)((uint8_t *)data + offset);
    }
 
-   if (anv_clear_mask(&srcStageMask,
-                      VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT |
-                      VK_PIPELINE_STAGE_VERTEX_INPUT_BIT |
-                      VK_PIPELINE_STAGE_VERTEX_SHADER_BIT |
-                      VK_PIPELINE_STAGE_TESS_CONTROL_SHADER_BIT |
-                      VK_PIPELINE_STAGE_TESS_EVALUATION_SHADER_BIT |
-                      VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT |
-                      VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT |
-                      VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT |
-                      VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT |
-                      VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT)) {
-      cmd.StallAtPixelScoreboard = true;
+   /* Copy uniform data from the first thread to every other thread */
+   const size_t uniform_data_size = prog_data->nr_params * sizeof(uint32_t);
+   for (unsigned t = 1; t < threads; t++) {
+      memcpy(&u32_map[t * param_aligned_count + local_id_dwords],
+             &u32_map[local_id_dwords],
+             uniform_data_size);
    }
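+   /* Only the uniforms are replicated; the local-ID payload written above
+    * is already unique to each thread.
+    */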
 
+   if (!cmd_buffer->device->info.has_llc)
+      anv_state_clflush(state);
 
-   if (anv_clear_mask(&srcStageMask,
-                      VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT |
-                      VK_PIPELINE_STAGE_TRANSFER_BIT |
-                      VK_PIPELINE_STAGE_TRANSITION_BIT)) {
-      cmd.CommandStreamerStallEnable = true;
-   }
-
-   if (anv_clear_mask(&srcStageMask, VK_PIPELINE_STAGE_HOST_BIT)) {
-      anv_finishme("VK_PIPE_EVENT_CPU_SIGNAL_BIT");
-   }
-
-   /* On our hardware, all stages will wait for execution as needed. */
-   (void)destStageMask;
-
-   /* We checked all known VkPipeEventFlags. */
-   anv_assert(srcStageMask == 0);
-
-   /* XXX: Right now, we're really dumb and just flush whatever categories
-    * the app asks for.  One of these days we may make this a bit better
-    * but right now that's all the hardware allows for in most areas.
-    */
-   VkMemoryOutputFlags out_flags = 0;
-   VkMemoryInputFlags in_flags = 0;
-
-   for (uint32_t i = 0; i < memBarrierCount; i++) {
-      const struct anv_common *common = ppMemBarriers[i];
-      switch (common->sType) {
-      case VK_STRUCTURE_TYPE_MEMORY_BARRIER: {
-         ANV_COMMON_TO_STRUCT(VkMemoryBarrier, barrier, common);
-         out_flags |= barrier->outputMask;
-         in_flags |= barrier->inputMask;
-         break;
-      }
-      case VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER: {
-         ANV_COMMON_TO_STRUCT(VkBufferMemoryBarrier, barrier, common);
-         out_flags |= barrier->outputMask;
-         in_flags |= barrier->inputMask;
-         break;
-      }
-      case VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER: {
-         ANV_COMMON_TO_STRUCT(VkImageMemoryBarrier, barrier, common);
-         out_flags |= barrier->outputMask;
-         in_flags |= barrier->inputMask;
-         break;
-      }
-      default:
-         unreachable("Invalid memory barrier type");
-      }
-   }
-
-   for_each_bit(b, out_flags) {
-      switch ((VkMemoryOutputFlags)(1 << b)) {
-      case VK_MEMORY_OUTPUT_HOST_WRITE_BIT:
-         break; /* FIXME: Little-core systems */
-      case VK_MEMORY_OUTPUT_SHADER_WRITE_BIT:
-         cmd.DCFlushEnable = true;
-         break;
-      case VK_MEMORY_OUTPUT_COLOR_ATTACHMENT_BIT:
-         cmd.RenderTargetCacheFlushEnable = true;
-         break;
-      case VK_MEMORY_OUTPUT_DEPTH_STENCIL_ATTACHMENT_BIT:
-         cmd.DepthCacheFlushEnable = true;
-         break;
-      case VK_MEMORY_OUTPUT_TRANSFER_BIT:
-         cmd.RenderTargetCacheFlushEnable = true;
-         cmd.DepthCacheFlushEnable = true;
-         break;
-      default:
-         unreachable("Invalid memory output flag");
-      }
-   }
-
-   for_each_bit(b, in_flags) {
-      switch ((VkMemoryInputFlags)(1 << b)) {
-      case VK_MEMORY_INPUT_HOST_READ_BIT:
-         break; /* FIXME: Little-core systems */
-      case VK_MEMORY_INPUT_INDIRECT_COMMAND_BIT:
-      case VK_MEMORY_INPUT_INDEX_FETCH_BIT:
-      case VK_MEMORY_INPUT_VERTEX_ATTRIBUTE_FETCH_BIT:
-         cmd.VFCacheInvalidationEnable = true;
-         break;
-      case VK_MEMORY_INPUT_UNIFORM_READ_BIT:
-         cmd.ConstantCacheInvalidationEnable = true;
-         /* fallthrough */
-      case VK_MEMORY_INPUT_SHADER_READ_BIT:
-         cmd.DCFlushEnable = true;
-         cmd.TextureCacheInvalidationEnable = true;
-         break;
-      case VK_MEMORY_INPUT_COLOR_ATTACHMENT_BIT:
-      case VK_MEMORY_INPUT_DEPTH_STENCIL_ATTACHMENT_BIT:
-         break; /* XXX: Hunh? */
-      case VK_MEMORY_INPUT_TRANSFER_BIT:
-         cmd.TextureCacheInvalidationEnable = true;
-         break;
-      }
-   }
-
-   dw = anv_batch_emit_dwords(&cmd_buffer->batch, GEN8_PIPE_CONTROL_length);
-   GEN8_PIPE_CONTROL_pack(&cmd_buffer->batch, dw, &cmd);
+   return state;
 }
 
 void anv_CmdPushConstants(
-    VkCmdBuffer                                 cmdBuffer,
+    VkCommandBuffer                             commandBuffer,
     VkPipelineLayout                            layout,
     VkShaderStageFlags                          stageFlags,
-    uint32_t                                    start,
-    uint32_t                                    length,
-    const void*                                 values)
-{
-   stub();
-}
-
-static void
-anv_cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer)
+    uint32_t                                    offset,
+    uint32_t                                    size,
+    const void*                                 pValues)
 {
-   struct anv_subpass *subpass = cmd_buffer->state.subpass;
-   struct anv_framebuffer *fb = cmd_buffer->state.framebuffer;
-   const struct anv_depth_stencil_view *view;
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
 
-   static const struct anv_depth_stencil_view null_view =
-      { .depth_format = D16_UNORM, .depth_stride = 0, .stencil_stride = 0 };
+   anv_foreach_stage(stage, stageFlags) {
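+      /* Make sure this stage's push constant block is allocated with room
+       * for client_data before we write into it.
+       */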
+      anv_cmd_buffer_ensure_push_constant_field(cmd_buffer, stage, client_data);
 
-   if (subpass->depth_stencil_attachment != VK_ATTACHMENT_UNUSED) {
-      const struct anv_attachment_view *aview =
-         fb->attachments[subpass->depth_stencil_attachment];
-      assert(aview->attachment_type == ANV_ATTACHMENT_VIEW_TYPE_DEPTH_STENCIL);
-      view = (const struct anv_depth_stencil_view *)aview;
-   } else {
-      view = &null_view;
+      memcpy(cmd_buffer->state.push_constants[stage]->client_data + offset,
+             pValues, size);
    }
 
-   /* FIXME: Implement the PMA stall W/A */
-   /* FIXME: Width and Height are wrong */
-
-   anv_batch_emit(&cmd_buffer->batch, GEN8_3DSTATE_DEPTH_BUFFER,
-                  .SurfaceType = SURFTYPE_2D,
-                  .DepthWriteEnable = view->depth_stride > 0,
-                  .StencilWriteEnable = view->stencil_stride > 0,
-                  .HierarchicalDepthBufferEnable = false,
-                  .SurfaceFormat = view->depth_format,
-                  .SurfacePitch = view->depth_stride > 0 ? view->depth_stride - 1 : 0,
-                  .SurfaceBaseAddress = { view->bo,  view->depth_offset },
-                  .Height = cmd_buffer->state.framebuffer->height - 1,
-                  .Width = cmd_buffer->state.framebuffer->width - 1,
-                  .LOD = 0,
-                  .Depth = 1 - 1,
-                  .MinimumArrayElement = 0,
-                  .DepthBufferObjectControlState = GEN8_MOCS,
-                  .RenderTargetViewExtent = 1 - 1,
-                  .SurfaceQPitch = view->depth_qpitch >> 2);
-
-   /* Disable hierarchical depth buffers. */
-   anv_batch_emit(&cmd_buffer->batch, GEN8_3DSTATE_HIER_DEPTH_BUFFER);
-
-   anv_batch_emit(&cmd_buffer->batch, GEN8_3DSTATE_STENCIL_BUFFER,
-                  .StencilBufferEnable = view->stencil_stride > 0,
-                  .StencilBufferObjectControlState = GEN8_MOCS,
-                  .SurfacePitch = view->stencil_stride > 0 ? view->stencil_stride - 1 : 0,
-                  .SurfaceBaseAddress = { view->bo, view->stencil_offset },
-                  .SurfaceQPitch = view->stencil_qpitch >> 2);
-
-   /* Clear the clear params. */
-   anv_batch_emit(&cmd_buffer->batch, GEN8_3DSTATE_CLEAR_PARAMS);
-}
-
-void
-anv_cmd_buffer_begin_subpass(struct anv_cmd_buffer *cmd_buffer,
-                             struct anv_subpass *subpass)
-{
-   cmd_buffer->state.subpass = subpass;
-
-   cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT;
-
-   anv_cmd_buffer_emit_depth_stencil(cmd_buffer);
-}
-
-void anv_CmdBeginRenderPass(
-    VkCmdBuffer                                 cmdBuffer,
-    const VkRenderPassBeginInfo*                pRenderPassBegin,
-    VkRenderPassContents                        contents)
-{
-   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, cmdBuffer);
-   ANV_FROM_HANDLE(anv_render_pass, pass, pRenderPassBegin->renderPass);
-   ANV_FROM_HANDLE(anv_framebuffer, framebuffer, pRenderPassBegin->framebuffer);
-
-   cmd_buffer->state.framebuffer = framebuffer;
-   cmd_buffer->state.pass = pass;
-
-   const VkRect2D *render_area = &pRenderPassBegin->renderArea;
-
-   anv_batch_emit(&cmd_buffer->batch, GEN8_3DSTATE_DRAWING_RECTANGLE,
-                  .ClippedDrawingRectangleYMin = render_area->offset.y,
-                  .ClippedDrawingRectangleXMin = render_area->offset.x,
-                  .ClippedDrawingRectangleYMax =
-                     render_area->offset.y + render_area->extent.height - 1,
-                  .ClippedDrawingRectangleXMax =
-                     render_area->offset.x + render_area->extent.width - 1,
-                  .DrawingRectangleOriginY = 0,
-                  .DrawingRectangleOriginX = 0);
-
-   anv_cmd_buffer_clear_attachments(cmd_buffer, pass,
-                                    pRenderPassBegin->pAttachmentClearValues);
-
-   anv_cmd_buffer_begin_subpass(cmd_buffer, pass->subpasses);
-}
-
-void anv_CmdNextSubpass(
-    VkCmdBuffer                                 cmdBuffer,
-    VkRenderPassContents                        contents)
-{
-   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, cmdBuffer);
-
-   assert(cmd_buffer->level == VK_CMD_BUFFER_LEVEL_PRIMARY);
-
-   anv_cmd_buffer_begin_subpass(cmd_buffer, cmd_buffer->state.subpass + 1);
-}
-
-void anv_CmdEndRenderPass(
-    VkCmdBuffer                                 cmdBuffer)
-{
-   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, cmdBuffer);
-
-   /* Emit a flushing pipe control at the end of a pass.  This is kind of a
-    * hack but it ensures that render targets always actually get written.
-    * Eventually, we should do flushing based on image format transitions
-    * or something of that nature.
-    */
-   anv_batch_emit(&cmd_buffer->batch, GEN8_PIPE_CONTROL,
-                  .PostSyncOperation = NoWrite,
-                  .RenderTargetCacheFlushEnable = true,
-                  .InstructionCacheInvalidateEnable = true,
-                  .DepthCacheFlushEnable = true,
-                  .VFCacheInvalidationEnable = true,
-                  .TextureCacheInvalidationEnable = true,
-                  .CommandStreamerStallEnable = true);
+   cmd_buffer->state.push_constants_dirty |= stageFlags;
 }
 
 void anv_CmdExecuteCommands(
-    VkCmdBuffer                                 cmdBuffer,
-    uint32_t                                    cmdBuffersCount,
-    const VkCmdBuffer*                          pCmdBuffers)
+    VkCommandBuffer                             commandBuffer,
+    uint32_t                                    commandBuffersCount,
+    const VkCommandBuffer*                      pCmdBuffers)
 {
-   ANV_FROM_HANDLE(anv_cmd_buffer, primary, cmdBuffer);
+   ANV_FROM_HANDLE(anv_cmd_buffer, primary, commandBuffer);
 
-   assert(primary->level == VK_CMD_BUFFER_LEVEL_PRIMARY);
+   assert(primary->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
 
    anv_assert(primary->state.subpass == &primary->state.pass->subpasses[0]);
 
-   for (uint32_t i = 0; i < cmdBuffersCount; i++) {
+   for (uint32_t i = 0; i < commandBuffersCount; i++) {
       ANV_FROM_HANDLE(anv_cmd_buffer, secondary, pCmdBuffers[i]);
 
-      assert(secondary->level == VK_CMD_BUFFER_LEVEL_SECONDARY);
+      assert(secondary->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
 
       anv_cmd_buffer_add_secondary(primary, secondary);
    }
@@ -1385,17 +1008,23 @@ void anv_CmdExecuteCommands(
 
 VkResult anv_CreateCommandPool(
     VkDevice                                    _device,
-    const VkCmdPoolCreateInfo*                  pCreateInfo,
-    VkCmdPool*                                  pCmdPool)
+    const VkCommandPoolCreateInfo*              pCreateInfo,
+    const VkAllocationCallbacks*                pAllocator,
+    VkCommandPool*                              pCmdPool)
 {
    ANV_FROM_HANDLE(anv_device, device, _device);
    struct anv_cmd_pool *pool;
 
-   pool = anv_device_alloc(device, sizeof(*pool), 8,
-                           VK_SYSTEM_ALLOC_TYPE_API_OBJECT);
+   pool = anv_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8,
+                     VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
    if (pool == NULL)
       return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
 
+   if (pAllocator)
+      pool->alloc = *pAllocator;
+   else
+      pool->alloc = device->alloc;
+
    list_inithead(&pool->cmd_buffers);
 
    *pCmdPool = anv_cmd_pool_to_handle(pool);
@@ -1403,31 +1032,50 @@ VkResult anv_CreateCommandPool(
    return VK_SUCCESS;
 }
 
-VkResult anv_DestroyCommandPool(
+void anv_DestroyCommandPool(
     VkDevice                                    _device,
-    VkCmdPool                                   cmdPool)
+    VkCommandPool                               commandPool,
+    const VkAllocationCallbacks*                pAllocator)
 {
    ANV_FROM_HANDLE(anv_device, device, _device);
-   ANV_FROM_HANDLE(anv_cmd_pool, pool, cmdPool);
+   ANV_FROM_HANDLE(anv_cmd_pool, pool, commandPool);
 
-   anv_ResetCommandPool(_device, cmdPool, 0);
+   anv_ResetCommandPool(_device, commandPool, 0);
 
-   anv_device_free(device, pool);
-
-   return VK_SUCCESS;
+   anv_free2(&device->alloc, pAllocator, pool);
 }
 
 VkResult anv_ResetCommandPool(
     VkDevice                                    device,
-    VkCmdPool                                   cmdPool,
-    VkCmdPoolResetFlags                         flags)
+    VkCommandPool                               commandPool,
+    VkCommandPoolResetFlags                     flags)
 {
-   ANV_FROM_HANDLE(anv_cmd_pool, pool, cmdPool);
+   ANV_FROM_HANDLE(anv_cmd_pool, pool, commandPool);
 
    list_for_each_entry_safe(struct anv_cmd_buffer, cmd_buffer,
                             &pool->cmd_buffers, pool_link) {
-      anv_DestroyCommandBuffer(device, anv_cmd_buffer_to_handle(cmd_buffer));
+      anv_cmd_buffer_destroy(cmd_buffer);
    }
 
    return VK_SUCCESS;
 }
+
+/**
+ * Return NULL if the current subpass has no depth-stencil attachment.
+ */
+const struct anv_image_view *
+anv_cmd_buffer_get_depth_stencil_view(const struct anv_cmd_buffer *cmd_buffer)
+{
+   const struct anv_subpass *subpass = cmd_buffer->state.subpass;
+   const struct anv_framebuffer *fb = cmd_buffer->state.framebuffer;
+
+   if (subpass->depth_stencil_attachment == VK_ATTACHMENT_UNUSED)
+      return NULL;
+
+   const struct anv_image_view *iview =
+      fb->attachments[subpass->depth_stencil_attachment];
+
+   assert(anv_format_is_depth_or_stencil(iview->format));
+
+   return iview;
+}