winsys/amdgpu: avoid ioctl call when fence_wait is called without timeout
[mesa.git] / src / intel / vulkan / genX_cmd_buffer.c
index 2606a66f2a7dcf3798e984905e7c13f34413e128..ee47c2926e0579c00346a16d7eecc6b1b84c4081 100644 (file)
@@ -49,46 +49,50 @@ genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer)
     * this, we get GPU hangs when using multi-level command buffers which
     * clear depth, reset state base address, and then go render stuff.
     */
-   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL),
-                  .RenderTargetCacheFlushEnable = true);
+   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
+      pc.RenderTargetCacheFlushEnable = true;
+   }
 #endif
 
-   anv_batch_emit(&cmd_buffer->batch, GENX(STATE_BASE_ADDRESS),
-      .GeneralStateBaseAddress = { scratch_bo, 0 },
-      .GeneralStateMemoryObjectControlState = GENX(MOCS),
-      .GeneralStateBaseAddressModifyEnable = true,
+   anv_batch_emit(&cmd_buffer->batch, GENX(STATE_BASE_ADDRESS), sba) {
+      sba.GeneralStateBaseAddress = (struct anv_address) { scratch_bo, 0 };
+      sba.GeneralStateMemoryObjectControlState = GENX(MOCS);
+      sba.GeneralStateBaseAddressModifyEnable = true;
 
-      .SurfaceStateBaseAddress = anv_cmd_buffer_surface_base_address(cmd_buffer),
-      .SurfaceStateMemoryObjectControlState = GENX(MOCS),
-      .SurfaceStateBaseAddressModifyEnable = true,
+      sba.SurfaceStateBaseAddress =
+         anv_cmd_buffer_surface_base_address(cmd_buffer);
+      sba.SurfaceStateMemoryObjectControlState = GENX(MOCS);
+      sba.SurfaceStateBaseAddressModifyEnable = true;
 
-      .DynamicStateBaseAddress = { &device->dynamic_state_block_pool.bo, 0 },
-      .DynamicStateMemoryObjectControlState = GENX(MOCS),
-      .DynamicStateBaseAddressModifyEnable = true,
+      sba.DynamicStateBaseAddress =
+         (struct anv_address) { &device->dynamic_state_block_pool.bo, 0 };
+      sba.DynamicStateMemoryObjectControlState = GENX(MOCS),
+      sba.DynamicStateBaseAddressModifyEnable = true,
 
-      .IndirectObjectBaseAddress = { NULL, 0 },
-      .IndirectObjectMemoryObjectControlState = GENX(MOCS),
-      .IndirectObjectBaseAddressModifyEnable = true,
+      sba.IndirectObjectBaseAddress = (struct anv_address) { NULL, 0 };
+      sba.IndirectObjectMemoryObjectControlState = GENX(MOCS);
+      sba.IndirectObjectBaseAddressModifyEnable = true;
 
-      .InstructionBaseAddress = { &device->instruction_block_pool.bo, 0 },
-      .InstructionMemoryObjectControlState = GENX(MOCS),
-      .InstructionBaseAddressModifyEnable = true,
+      sba.InstructionBaseAddress =
+         (struct anv_address) { &device->instruction_block_pool.bo, 0 };
+      sba.InstructionMemoryObjectControlState = GENX(MOCS);
+      sba.InstructionBaseAddressModifyEnable = true;
 
 #  if (GEN_GEN >= 8)
       /* Broadwell requires that we specify a buffer size for a bunch of
        * these fields.  However, since we will be growing the BO's live, we
        * just set them all to the maximum.
        */
-      .GeneralStateBufferSize = 0xfffff,
-      .GeneralStateBufferSizeModifyEnable = true,
-      .DynamicStateBufferSize = 0xfffff,
-      .DynamicStateBufferSizeModifyEnable = true,
-      .IndirectObjectBufferSize = 0xfffff,
-      .IndirectObjectBufferSizeModifyEnable = true,
-      .InstructionBufferSize = 0xfffff,
-      .InstructionBuffersizeModifyEnable = true,
+      sba.GeneralStateBufferSize                = 0xfffff;
+      sba.GeneralStateBufferSizeModifyEnable    = true;
+      sba.DynamicStateBufferSize                = 0xfffff;
+      sba.DynamicStateBufferSizeModifyEnable    = true;
+      sba.IndirectObjectBufferSize              = 0xfffff;
+      sba.IndirectObjectBufferSizeModifyEnable  = true;
+      sba.InstructionBufferSize                 = 0xfffff;
+      sba.InstructionBuffersizeModifyEnable     = true;
 #  endif
-   );
+   }
 
    /* After re-setting the surface state base address, we have to do some
     * cache flushing so that the sampler engine will pick up the new
@@ -127,8 +131,9 @@ genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer)
     * units cache the binding table in the texture cache.  However, we have
     * yet to be able to actually confirm this.
     */
-   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL),
-                  .TextureCacheInvalidationEnable = true);
+   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
+      pc.TextureCacheInvalidationEnable = true;
+   }
 }
 
 void genX(CmdPipelineBarrier)(
@@ -269,6 +274,185 @@ void genX(CmdPipelineBarrier)(
    }
 }
 
+/**
+ * Emit a 3DSTATE_CONSTANT_* packet for every graphics stage whose push
+ * constants are dirty, then clear the graphics bits of
+ * cmd_buffer->state.push_constants_dirty.
+ *
+ * Every packet is built from the 3DSTATE_CONSTANT_VS template; the actual
+ * per-stage command is selected by overriding _3DCommandSubOpcode with the
+ * value from push_constant_opcodes[].  The compute stage is skipped.
+ *
+ * \param cmd_buffer  command buffer whose batch receives the packets
+ * \return mask of Vulkan shader stages for which a packet was emitted
+ */
+static uint32_t
+cmd_buffer_flush_push_constants(struct anv_cmd_buffer *cmd_buffer)
+{
+   /* Sub-opcode of 3DSTATE_CONSTANT_* for each shader stage. */
+   static const uint32_t push_constant_opcodes[] = {
+      [MESA_SHADER_VERTEX]                      = 21,
+      [MESA_SHADER_TESS_CTRL]                   = 25, /* HS */
+      [MESA_SHADER_TESS_EVAL]                   = 26, /* DS */
+      [MESA_SHADER_GEOMETRY]                    = 22,
+      [MESA_SHADER_FRAGMENT]                    = 23,
+      [MESA_SHADER_COMPUTE]                     = 0,
+   };
+
+   VkShaderStageFlags flushed = 0;
+
+   anv_foreach_stage(stage, cmd_buffer->state.push_constants_dirty) {
+      if (stage == MESA_SHADER_COMPUTE)
+         continue;
+
+      struct anv_state state = anv_cmd_buffer_push_constants(cmd_buffer, stage);
+
+      if (state.offset == 0) {
+         /* Offset 0: emit the packet with only the sub-opcode set, i.e.
+          * without filling in a constant body.
+          */
+         anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_VS), c) {
+            c._3DCommandSubOpcode = push_constant_opcodes[stage];
+         }
+      } else {
+         anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_VS), c) {
+            c._3DCommandSubOpcode = push_constant_opcodes[stage];
+            c.ConstantBody = (struct GENX(3DSTATE_CONSTANT_BODY)) {
+#if GEN_GEN >= 9
+               .PointerToConstantBuffer2 = { &cmd_buffer->device->dynamic_state_block_pool.bo, state.offset },
+               .ConstantBuffer2ReadLength = DIV_ROUND_UP(state.alloc_size, 32),
+#else
+               .PointerToConstantBuffer0 = { .offset = state.offset },
+               .ConstantBuffer0ReadLength = DIV_ROUND_UP(state.alloc_size, 32),
+#endif
+            };
+         }
+      }
+
+      flushed |= mesa_to_vk_shader_stage(stage);
+   }
+
+   /* All graphics stages are clean now; any compute bit is left untouched. */
+   cmd_buffer->state.push_constants_dirty &= ~VK_SHADER_STAGE_ALL_GRAPHICS;
+
+   return flushed;
+}
+
+/**
+ * Flush the dirty 3D state of the command buffer into its batch.
+ *
+ * In order, this emits: the L3 configuration and the 3D pipeline select,
+ * the dirty vertex buffers that the bound pipeline uses, the pipeline batch
+ * itself (when ANV_CMD_DIRTY_PIPELINE is set), descriptor sets, push
+ * constants, descriptor pointers, viewport and scissor (when dirty) and
+ * finally the remaining dynamic state.
+ *
+ * Asserts that the bound pipeline has no compute stage.
+ */
+void
+genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer)
+{
+   struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
+   uint32_t *p;
+
+   /* Only vertex bindings that are both dirty and used by the pipeline. */
+   uint32_t vb_emit = cmd_buffer->state.vb_dirty & pipeline->vb_used;
+
+   assert((pipeline->active_stages & VK_SHADER_STAGE_COMPUTE_BIT) == 0);
+
+   genX(cmd_buffer_config_l3)(cmd_buffer, pipeline);
+
+   genX(flush_pipeline_select_3d)(cmd_buffer);
+
+   if (vb_emit) {
+      /* One leading dword plus four dwords per emitted buffer; each
+       * VERTEX_BUFFER_STATE is packed at p[1 + i * 4] below.
+       */
+      const uint32_t num_buffers = __builtin_popcount(vb_emit);
+      const uint32_t num_dwords = 1 + num_buffers * 4;
+
+      p = anv_batch_emitn(&cmd_buffer->batch, num_dwords,
+                          GENX(3DSTATE_VERTEX_BUFFERS));
+      uint32_t vb, i = 0;
+      for_each_bit(vb, vb_emit) {
+         struct anv_buffer *buffer = cmd_buffer->state.vertex_bindings[vb].buffer;
+         uint32_t offset = cmd_buffer->state.vertex_bindings[vb].offset;
+
+         struct GENX(VERTEX_BUFFER_STATE) state = {
+            .VertexBufferIndex = vb,
+
+#if GEN_GEN >= 8
+            .MemoryObjectControlState = GENX(MOCS),
+#else
+            .BufferAccessType = pipeline->instancing_enable[vb] ? INSTANCEDATA : VERTEXDATA,
+            .InstanceDataStepRate = 1,
+            .VertexBufferMemoryObjectControlState = GENX(MOCS),
+#endif
+
+            .AddressModifyEnable = true,
+            .BufferPitch = pipeline->binding_stride[vb],
+            .BufferStartingAddress = { buffer->bo, buffer->offset + offset },
+
+#if GEN_GEN >= 8
+            .BufferSize = buffer->size - offset
+#else
+            .EndAddress = { buffer->bo, buffer->offset + buffer->size - 1},
+#endif
+         };
+
+         GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, &p[1 + i * 4], &state);
+         i++;
+      }
+   }
+
+   /* Dirty bindings that this pipeline does not use remain dirty. */
+   cmd_buffer->state.vb_dirty &= ~vb_emit;
+
+   if (cmd_buffer->state.dirty & ANV_CMD_DIRTY_PIPELINE) {
+      /* If somebody compiled a pipeline after starting a command buffer the
+       * scratch bo may have grown since we started this cmd buffer (and
+       * emitted STATE_BASE_ADDRESS).  If we're binding that pipeline now,
+       * reemit STATE_BASE_ADDRESS so that we use the bigger scratch bo. */
+      if (cmd_buffer->state.scratch_size < pipeline->total_scratch)
+         anv_cmd_buffer_emit_state_base_address(cmd_buffer);
+
+      anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->batch);
+
+      /* From the BDW PRM for 3DSTATE_PUSH_CONSTANT_ALLOC_VS:
+       *
+       *    "The 3DSTATE_CONSTANT_VS must be reprogrammed prior to
+       *    the next 3DPRIMITIVE command after programming the
+       *    3DSTATE_PUSH_CONSTANT_ALLOC_VS"
+       *
+       * Since 3DSTATE_PUSH_CONSTANT_ALLOC_VS is programmed as part of
+       * pipeline setup, we need to dirty push constants.
+       */
+      cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_ALL_GRAPHICS;
+   }
+
+#if GEN_GEN <= 7
+   if (cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_VERTEX_BIT ||
+       cmd_buffer->state.push_constants_dirty & VK_SHADER_STAGE_VERTEX_BIT) {
+      /* From the IVB PRM Vol. 2, Part 1, Section 3.2.1:
+       *
+       *    "A PIPE_CONTROL with Post-Sync Operation set to 1h and a depth
+       *    stall needs to be sent just prior to any 3DSTATE_VS,
+       *    3DSTATE_URB_VS, 3DSTATE_CONSTANT_VS,
+       *    3DSTATE_BINDING_TABLE_POINTER_VS,
+       *    3DSTATE_SAMPLER_STATE_POINTER_VS command.  Only one
+       *    PIPE_CONTROL needs to be sent before any combination of VS
+       *    associated 3DSTATE."
+       */
+      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
+         pc.DepthStallEnable  = true;
+         pc.PostSyncOperation = WriteImmediateData;
+         pc.Address           =
+            (struct anv_address) { &cmd_buffer->device->workaround_bo, 0 };
+      }
+   }
+#endif
+
+   /* We emit the binding tables and sampler tables first, then emit push
+    * constants and then finally emit binding table and sampler table
+    * pointers.  It has to happen in this order, since emitting the binding
+    * tables may change the push constants (in case of storage images). After
+    * emitting push constants, on SKL+ we have to emit the corresponding
+    * 3DSTATE_BINDING_TABLE_POINTER_* for the push constants to take effect.
+    */
+   uint32_t dirty = 0;
+   if (cmd_buffer->state.descriptors_dirty)
+      dirty = gen7_cmd_buffer_flush_descriptor_sets(cmd_buffer);
+
+   if (cmd_buffer->state.push_constants_dirty) {
+#if GEN_GEN >= 9
+      /* On Sky Lake and later, the binding table pointers commands are
+       * what actually flush the changes to push constant state so we need
+       * to dirty them so they get re-emitted below.
+       */
+      dirty |= cmd_buffer_flush_push_constants(cmd_buffer);
+#else
+      /* Before Gen9 the returned stage mask is not folded into `dirty`. */
+      cmd_buffer_flush_push_constants(cmd_buffer);
+#endif
+   }
+
+   /* Re-emit descriptor pointers only for the stages collected above. */
+   if (dirty)
+      gen7_cmd_buffer_emit_descriptor_pointers(cmd_buffer, dirty);
+
+   if (cmd_buffer->state.dirty & ANV_CMD_DIRTY_DYNAMIC_VIEWPORT)
+      gen8_cmd_buffer_emit_viewport(cmd_buffer);
+
+   if (cmd_buffer->state.dirty & ANV_CMD_DIRTY_DYNAMIC_SCISSOR)
+      gen7_cmd_buffer_emit_scissor(cmd_buffer);
+
+   genX(cmd_buffer_flush_dynamic_state)(cmd_buffer);
+}
+
 static void
 emit_base_vertex_instance_bo(struct anv_cmd_buffer *cmd_buffer,
                              struct anv_bo *bo, uint32_t offset)
@@ -326,14 +510,15 @@ void genX(CmdDraw)(
    if (vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance)
       emit_base_vertex_instance(cmd_buffer, firstVertex, firstInstance);
 
-   anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE),
-      .VertexAccessType                         = SEQUENTIAL,
-      .PrimitiveTopologyType                    = pipeline->topology,
-      .VertexCountPerInstance                   = vertexCount,
-      .StartVertexLocation                      = firstVertex,
-      .InstanceCount                            = instanceCount,
-      .StartInstanceLocation                    = firstInstance,
-      .BaseVertexLocation                       = 0);
+   anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
+      prim.VertexAccessType         = SEQUENTIAL;
+      prim.PrimitiveTopologyType    = pipeline->topology;
+      prim.VertexCountPerInstance   = vertexCount;
+      prim.StartVertexLocation      = firstVertex;
+      prim.InstanceCount            = instanceCount;
+      prim.StartInstanceLocation    = firstInstance;
+      prim.BaseVertexLocation       = 0;
+   }
 }
 
 void genX(CmdDrawIndexed)(
@@ -353,14 +538,15 @@ void genX(CmdDrawIndexed)(
    if (vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance)
       emit_base_vertex_instance(cmd_buffer, vertexOffset, firstInstance);
 
-   anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE),
-      .VertexAccessType                         = RANDOM,
-      .PrimitiveTopologyType                    = pipeline->topology,
-      .VertexCountPerInstance                   = indexCount,
-      .StartVertexLocation                      = firstIndex,
-      .InstanceCount                            = instanceCount,
-      .StartInstanceLocation                    = firstInstance,
-      .BaseVertexLocation                       = vertexOffset);
+   anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
+      prim.VertexAccessType         = RANDOM;
+      prim.PrimitiveTopologyType    = pipeline->topology;
+      prim.VertexCountPerInstance   = indexCount;
+      prim.StartVertexLocation      = firstIndex;
+      prim.InstanceCount            = instanceCount;
+      prim.StartInstanceLocation    = firstInstance;
+      prim.BaseVertexLocation       = vertexOffset;
+   }
 }
 
 /* Auto-Draw / Indirect Registers */
@@ -375,17 +561,19 @@ static void
 emit_lrm(struct anv_batch *batch,
          uint32_t reg, struct anv_bo *bo, uint32_t offset)
 {
-   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM),
-                  .RegisterAddress = reg,
-                  .MemoryAddress = { bo, offset });
+   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
+      lrm.RegisterAddress  = reg;
+      lrm.MemoryAddress    = (struct anv_address) { bo, offset };
+   }
 }
 
 static void
 emit_lri(struct anv_batch *batch, uint32_t reg, uint32_t imm)
 {
-   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM),
-                  .RegisterOffset = reg,
-                  .DataDWord = imm);
+   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
+      lri.RegisterOffset   = reg;
+      lri.DataDWord        = imm;
+   }
 }
 
 void genX(CmdDrawIndirect)(
@@ -413,10 +601,11 @@ void genX(CmdDrawIndirect)(
    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_INSTANCE, bo, bo_offset + 12);
    emit_lri(&cmd_buffer->batch, GEN7_3DPRIM_BASE_VERTEX, 0);
 
-   anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE),
-      .IndirectParameterEnable                  = true,
-      .VertexAccessType                         = SEQUENTIAL,
-      .PrimitiveTopologyType                    = pipeline->topology);
+   anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
+      prim.IndirectParameterEnable  = true;
+      prim.VertexAccessType         = SEQUENTIAL;
+      prim.PrimitiveTopologyType    = pipeline->topology;
+   }
 }
 
 void genX(CmdDrawIndexedIndirect)(
@@ -445,12 +634,31 @@ void genX(CmdDrawIndexedIndirect)(
    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_BASE_VERTEX, bo, bo_offset + 12);
    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_INSTANCE, bo, bo_offset + 16);
 
-   anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE),
-      .IndirectParameterEnable                  = true,
-      .VertexAccessType                         = RANDOM,
-      .PrimitiveTopologyType                    = pipeline->topology);
+   anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
+      prim.IndirectParameterEnable  = true;
+      prim.VertexAccessType         = RANDOM;
+      prim.PrimitiveTopologyType    = pipeline->topology;
+   }
 }
 
+#if GEN_GEN == 7
+
+/* Check that the physical device's command parser version is at least
+ * `required_version`.  When it is not, an error with code
+ * VK_ERROR_FEATURE_NOT_PRESENT naming `function` is reported through
+ * vk_errorf() and false is returned; otherwise returns true.
+ */
+static bool
+verify_cmd_parser(const struct anv_device *device,
+                  int required_version,
+                  const char *function)
+{
+   const bool parser_ok =
+      !(device->instance->physicalDevice.cmd_parser_version < required_version);
+
+   if (!parser_ok) {
+      vk_errorf(VK_ERROR_FEATURE_NOT_PRESENT,
+                "cmd parser version %d is required for %s",
+                required_version, function);
+   }
+
+   return parser_ok;
+}
+
+#endif
 
 void genX(CmdDispatch)(
     VkCommandBuffer                             commandBuffer,
@@ -478,18 +686,19 @@ void genX(CmdDispatch)(
 
    genX(cmd_buffer_flush_compute_state)(cmd_buffer);
 
-   anv_batch_emit(&cmd_buffer->batch, GENX(GPGPU_WALKER),
-                  .SIMDSize = prog_data->simd_size / 16,
-                  .ThreadDepthCounterMaximum = 0,
-                  .ThreadHeightCounterMaximum = 0,
-                  .ThreadWidthCounterMaximum = pipeline->cs_thread_width_max - 1,
-                  .ThreadGroupIDXDimension = x,
-                  .ThreadGroupIDYDimension = y,
-                  .ThreadGroupIDZDimension = z,
-                  .RightExecutionMask = pipeline->cs_right_mask,
-                  .BottomExecutionMask = 0xffffffff);
+   anv_batch_emit(&cmd_buffer->batch, GENX(GPGPU_WALKER), ggw) {
+      ggw.SIMDSize                     = prog_data->simd_size / 16;
+      ggw.ThreadDepthCounterMaximum    = 0;
+      ggw.ThreadHeightCounterMaximum   = 0;
+      ggw.ThreadWidthCounterMaximum    = pipeline->cs_thread_width_max - 1;
+      ggw.ThreadGroupIDXDimension      = x;
+      ggw.ThreadGroupIDYDimension      = y;
+      ggw.ThreadGroupIDZDimension      = z;
+      ggw.RightExecutionMask           = pipeline->cs_right_mask;
+      ggw.BottomExecutionMask          = 0xffffffff;
+   }
 
-   anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_STATE_FLUSH));
+   anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_STATE_FLUSH), msf);
 }
 
 #define GPGPU_DISPATCHDIMX 0x2500
@@ -512,6 +721,14 @@ void genX(CmdDispatchIndirect)(
    uint32_t bo_offset = buffer->offset + offset;
    struct anv_batch *batch = &cmd_buffer->batch;
 
+#if GEN_GEN == 7
+   /* Linux 4.4 added command parser version 5 which allows the GPGPU
+    * indirect dispatch registers to be written.
+    */
+   if (!verify_cmd_parser(cmd_buffer->device, 5, "vkCmdDispatchIndirect"))
+      return;
+#endif
+
    if (prog_data->uses_num_work_groups) {
       cmd_buffer->state.num_workgroups_offset = bo_offset;
       cmd_buffer->state.num_workgroups_bo = bo;
@@ -533,63 +750,164 @@ void genX(CmdDispatchIndirect)(
    emit_lrm(batch, MI_PREDICATE_SRC0, bo, bo_offset + 0);
 
    /* predicate = (compute_dispatch_indirect_x_size == 0); */
-   anv_batch_emit(batch, GENX(MI_PREDICATE),
-                  .LoadOperation = LOAD_LOAD,
-                  .CombineOperation = COMBINE_SET,
-                  .CompareOperation = COMPARE_SRCS_EQUAL);
+   anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
+      mip.LoadOperation    = LOAD_LOAD;
+      mip.CombineOperation = COMBINE_SET;
+      mip.CompareOperation = COMPARE_SRCS_EQUAL;
+   }
 
    /* Load compute_dispatch_indirect_y_size into SRC0 */
    emit_lrm(batch, MI_PREDICATE_SRC0, bo, bo_offset + 4);
 
    /* predicate |= (compute_dispatch_indirect_y_size == 0); */
-   anv_batch_emit(batch, GENX(MI_PREDICATE),
-                  .LoadOperation = LOAD_LOAD,
-                  .CombineOperation = COMBINE_OR,
-                  .CompareOperation = COMPARE_SRCS_EQUAL);
+   anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
+      mip.LoadOperation    = LOAD_LOAD;
+      mip.CombineOperation = COMBINE_OR;
+      mip.CompareOperation = COMPARE_SRCS_EQUAL;
+   }
 
    /* Load compute_dispatch_indirect_z_size into SRC0 */
    emit_lrm(batch, MI_PREDICATE_SRC0, bo, bo_offset + 8);
 
    /* predicate |= (compute_dispatch_indirect_z_size == 0); */
-   anv_batch_emit(batch, GENX(MI_PREDICATE),
-                  .LoadOperation = LOAD_LOAD,
-                  .CombineOperation = COMBINE_OR,
-                  .CompareOperation = COMPARE_SRCS_EQUAL);
+   anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
+      mip.LoadOperation    = LOAD_LOAD;
+      mip.CombineOperation = COMBINE_OR;
+      mip.CompareOperation = COMPARE_SRCS_EQUAL;
+   }
 
    /* predicate = !predicate; */
 #define COMPARE_FALSE                           1
-   anv_batch_emit(batch, GENX(MI_PREDICATE),
-                  .LoadOperation = LOAD_LOADINV,
-                  .CombineOperation = COMBINE_OR,
-                  .CompareOperation = COMPARE_FALSE);
+   anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
+      mip.LoadOperation    = LOAD_LOADINV;
+      mip.CombineOperation = COMBINE_OR;
+      mip.CompareOperation = COMPARE_FALSE;
+   }
 #endif
 
-   anv_batch_emit(batch, GENX(GPGPU_WALKER),
-                  .IndirectParameterEnable = true,
-                  .PredicateEnable = GEN_GEN <= 7,
-                  .SIMDSize = prog_data->simd_size / 16,
-                  .ThreadDepthCounterMaximum = 0,
-                  .ThreadHeightCounterMaximum = 0,
-                  .ThreadWidthCounterMaximum = pipeline->cs_thread_width_max - 1,
-                  .RightExecutionMask = pipeline->cs_right_mask,
-                  .BottomExecutionMask = 0xffffffff);
+   anv_batch_emit(batch, GENX(GPGPU_WALKER), ggw) {
+      ggw.IndirectParameterEnable      = true;
+      ggw.PredicateEnable              = GEN_GEN <= 7;
+      ggw.SIMDSize                     = prog_data->simd_size / 16;
+      ggw.ThreadDepthCounterMaximum    = 0;
+      ggw.ThreadHeightCounterMaximum   = 0;
+      ggw.ThreadWidthCounterMaximum    = pipeline->cs_thread_width_max - 1;
+      ggw.RightExecutionMask           = pipeline->cs_right_mask;
+      ggw.BottomExecutionMask          = 0xffffffff;
+   }
+
+   anv_batch_emit(batch, GENX(MEDIA_STATE_FLUSH), msf);
+}
+
+/* Emit the commands that must precede a PIPELINE_SELECT switching to
+ * `pipeline`:
+ *
+ *  - Gen8 and Gen9: when selecting GPGPU, a 3DSTATE_CC_STATE_POINTERS packet
+ *    with no fields set.
+ *  - Gen7 and earlier: a stalling PIPE_CONTROL that flushes the write
+ *    caches, followed by a PIPE_CONTROL that invalidates the read-only
+ *    caches.
+ *  - Any other generation: nothing.
+ */
+static void
+flush_pipeline_before_pipeline_select(struct anv_cmd_buffer *cmd_buffer,
+                                      uint32_t pipeline)
+{
+#if GEN_GEN >= 8 && GEN_GEN < 10
+   /* From the Broadwell PRM, Volume 2a: Instructions, PIPELINE_SELECT:
+    *
+    *   Software must clear the COLOR_CALC_STATE Valid field in
+    *   3DSTATE_CC_STATE_POINTERS command prior to send a PIPELINE_SELECT
+    *   with Pipeline Select set to GPGPU.
+    *
+    * The internal hardware docs recommend the same workaround for Gen9
+    * hardware too.
+    */
+   if (pipeline == GPGPU) {
+      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CC_STATE_POINTERS), t);
+   }
+#elif GEN_GEN <= 7
+   /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
+    * PIPELINE_SELECT [DevBWR+]":
+    *
+    *   Project: DEVSNB+
+    *
+    *   Software must ensure all the write caches are flushed through a
+    *   stalling PIPE_CONTROL command followed by another PIPE_CONTROL
+    *   command to invalidate read only caches prior to programming
+    *   MI_PIPELINE_SELECT command to change the Pipeline Select Mode.
+    */
+
+   /* Step 1: stalling flush of the write caches. */
+   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
+      pc.RenderTargetCacheFlushEnable  = true;
+      pc.DepthCacheFlushEnable         = true;
+      pc.DCFlushEnable                 = true;
+      pc.PostSyncOperation             = NoWrite;
+      pc.CommandStreamerStallEnable    = true;
+   }
+
+   /* Step 2: invalidate the read-only caches. */
+   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
+      pc.TextureCacheInvalidationEnable   = true;
+      pc.ConstantCacheInvalidationEnable  = true;
+      pc.StateCacheInvalidationEnable     = true;
+      pc.InstructionCacheInvalidateEnable = true;
+      pc.PostSyncOperation                = NoWrite;
+   }
+#endif
+}
 
 void
 genX(flush_pipeline_select_3d)(struct anv_cmd_buffer *cmd_buffer)
 {
    if (cmd_buffer->state.current_pipeline != _3D) {
-      anv_batch_emit(&cmd_buffer->batch, GENX(PIPELINE_SELECT),
+      flush_pipeline_before_pipeline_select(cmd_buffer, _3D);
+
+      anv_batch_emit(&cmd_buffer->batch, GENX(PIPELINE_SELECT), ps) {
 #if GEN_GEN >= 9
-                     .MaskBits = 3,
+         ps.MaskBits = 3;
 #endif
-                     .PipelineSelection = _3D);
+         ps.PipelineSelection = _3D;
+      }
+
       cmd_buffer->state.current_pipeline = _3D;
    }
 }
 
+/* Select the GPGPU pipeline unless the command buffer already has it
+ * selected.  The pre-select workaround commands are emitted first, then
+ * PIPELINE_SELECT, and finally the cached current_pipeline is updated.
+ */
+void
+genX(flush_pipeline_select_gpgpu)(struct anv_cmd_buffer *cmd_buffer)
+{
+   /* Nothing to do when GPGPU is already the current pipeline. */
+   if (cmd_buffer->state.current_pipeline == GPGPU)
+      return;
+
+   flush_pipeline_before_pipeline_select(cmd_buffer, GPGPU);
+
+   anv_batch_emit(&cmd_buffer->batch, GENX(PIPELINE_SELECT), sel) {
+#if GEN_GEN >= 9
+      sel.MaskBits = 3;
+#endif
+      sel.PipelineSelection = GPGPU;
+   }
+
+   cmd_buffer->state.current_pipeline = GPGPU;
+}
+
+/* Allocate a slot from the command buffer's surface state stream and pack
+ * into it a RENDER_SURFACE_STATE of type SURFTYPE_NULL whose width, height
+ * and layer count are taken from the framebuffer `fb`.
+ *
+ * When the device reports no LLC, the freshly written state is flushed with
+ * anv_state_clflush().  Returns the allocated state.
+ */
+struct anv_state
+genX(cmd_buffer_alloc_null_surface_state)(struct anv_cmd_buffer *cmd_buffer,
+                                          struct anv_framebuffer *fb)
+{
+   struct anv_state ss =
+      anv_state_stream_alloc(&cmd_buffer->surface_state_stream, 64, 64);
+
+   struct GENX(RENDER_SURFACE_STATE) null_surface = {
+      .SurfaceType            = SURFTYPE_NULL,
+      .SurfaceArray           = fb->layers > 0,
+      .SurfaceFormat          = ISL_FORMAT_R8G8B8A8_UNORM,
+#if GEN_GEN >= 8
+      .TileMode               = YMAJOR,
+#else
+      .TiledSurface           = true,
+#endif
+      .Width                  = fb->width - 1,
+      .Height                 = fb->height - 1,
+      .Depth                  = fb->layers - 1,
+      .RenderTargetViewExtent = fb->layers - 1,
+   };
+
+   GENX(RENDER_SURFACE_STATE_pack)(NULL, ss.map, &null_surface);
+
+   /* Without LLC the written state has to be flushed explicitly. */
+   const bool needs_clflush = !cmd_buffer->device->info.has_llc;
+   if (needs_clflush)
+      anv_state_clflush(ss);
+
+   return ss;
+}
+
 static void
 cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer)
 {
@@ -598,38 +916,43 @@ cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer)
    const struct anv_image_view *iview =
       anv_cmd_buffer_get_depth_stencil_view(cmd_buffer);
    const struct anv_image *image = iview ? iview->image : NULL;
-   const struct anv_format *anv_format =
-      iview ? anv_format_for_vk_format(iview->vk_format) : NULL;
-   const bool has_depth = iview && anv_format->has_depth;
-   const bool has_stencil = iview && anv_format->has_stencil;
+   const bool has_depth = image && (image->aspects & VK_IMAGE_ASPECT_DEPTH_BIT);
+   const bool has_stencil =
+      image && (image->aspects & VK_IMAGE_ASPECT_STENCIL_BIT);
 
    /* FIXME: Implement the PMA stall W/A */
    /* FIXME: Width and Height are wrong */
 
    /* Emit 3DSTATE_DEPTH_BUFFER */
    if (has_depth) {
-      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_DEPTH_BUFFER),
-         .SurfaceType = SURFTYPE_2D,
-         .DepthWriteEnable = true,
-         .StencilWriteEnable = has_stencil,
-         .HierarchicalDepthBufferEnable = false,
-         .SurfaceFormat = isl_surf_get_depth_format(&device->isl_dev,
-                                                    &image->depth_surface.isl),
-         .SurfacePitch = image->depth_surface.isl.row_pitch - 1,
-         .SurfaceBaseAddress = {
+      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_DEPTH_BUFFER), db) {
+         db.SurfaceType                   = SURFTYPE_2D;
+         db.DepthWriteEnable              = true;
+         db.StencilWriteEnable            = has_stencil;
+         db.HierarchicalDepthBufferEnable = false;
+
+         db.SurfaceFormat = isl_surf_get_depth_format(&device->isl_dev,
+                                                      &image->depth_surface.isl);
+
+         db.SurfaceBaseAddress = (struct anv_address) {
             .bo = image->bo,
             .offset = image->offset + image->depth_surface.offset,
-         },
-         .Height = fb->height - 1,
-         .Width = fb->width - 1,
-         .LOD = 0,
-         .Depth = 1 - 1,
-         .MinimumArrayElement = 0,
-         .DepthBufferObjectControlState = GENX(MOCS),
+         };
+         db.DepthBufferObjectControlState = GENX(MOCS),
+
+         db.SurfacePitch         = image->depth_surface.isl.row_pitch - 1;
+         db.Height               = fb->height - 1;
+         db.Width                = fb->width - 1;
+         db.LOD                  = 0;
+         db.Depth                = 1 - 1;
+         db.MinimumArrayElement  = 0;
+
 #if GEN_GEN >= 8
-         .SurfaceQPitch = isl_surf_get_array_pitch_el_rows(&image->depth_surface.isl) >> 2,
+         db.SurfaceQPitch =
+            isl_surf_get_array_pitch_el_rows(&image->depth_surface.isl) >> 2,
 #endif
-         .RenderTargetViewExtent = 1 - 1);
+         db.RenderTargetViewExtent = 1 - 1;
+      }
    } else {
       /* Even when no depth buffer is present, the hardware requires that
        * 3DSTATE_DEPTH_BUFFER be programmed correctly. The Broadwell PRM says:
@@ -649,45 +972,47 @@ cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer)
        * nor stencil buffer is present.  Also, D16_UNORM is not allowed to
        * be combined with a stencil buffer so we use D32_FLOAT instead.
        */
-      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_DEPTH_BUFFER),
-         .SurfaceType = SURFTYPE_2D,
-         .SurfaceFormat = D32_FLOAT,
-         .Width = fb->width - 1,
-         .Height = fb->height - 1,
-         .StencilWriteEnable = has_stencil);
+      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_DEPTH_BUFFER), db) {
+         db.SurfaceType          = SURFTYPE_2D;
+         db.SurfaceFormat        = D32_FLOAT;
+         db.Width                = fb->width - 1;
+         db.Height               = fb->height - 1;
+         db.StencilWriteEnable   = has_stencil;
+      }
    }
 
    /* Emit 3DSTATE_STENCIL_BUFFER */
    if (has_stencil) {
-      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_STENCIL_BUFFER),
+      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_STENCIL_BUFFER), sb) {
 #if GEN_GEN >= 8 || GEN_IS_HASWELL
-         .StencilBufferEnable = true,
+         sb.StencilBufferEnable = true,
 #endif
-         .StencilBufferObjectControlState = GENX(MOCS),
+         sb.StencilBufferObjectControlState = GENX(MOCS),
 
          /* Stencil buffers have strange pitch. The PRM says:
           *
           *    The pitch must be set to 2x the value computed based on width,
           *    as the stencil buffer is stored with two rows interleaved.
           */
-         .SurfacePitch = 2 * image->stencil_surface.isl.row_pitch - 1,
+         sb.SurfacePitch = 2 * image->stencil_surface.isl.row_pitch - 1,
 
 #if GEN_GEN >= 8
-         .SurfaceQPitch = isl_surf_get_array_pitch_el_rows(&image->stencil_surface.isl) >> 2,
+         sb.SurfaceQPitch = isl_surf_get_array_pitch_el_rows(&image->stencil_surface.isl) >> 2,
 #endif
-         .SurfaceBaseAddress = {
+         sb.SurfaceBaseAddress = (struct anv_address) {
             .bo = image->bo,
             .offset = image->offset + image->stencil_surface.offset,
-         });
+         };
+      }
    } else {
-      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_STENCIL_BUFFER));
+      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_STENCIL_BUFFER), sb);
    }
 
    /* Disable hierarchical depth buffers. */
-   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_HIER_DEPTH_BUFFER));
+   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_HIER_DEPTH_BUFFER), hz);
 
    /* Clear the clear params. */
-   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CLEAR_PARAMS));
+   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CLEAR_PARAMS), cp);
 }
 
 /**
@@ -721,15 +1046,16 @@ void genX(CmdBeginRenderPass)(
 
    const VkRect2D *render_area = &pRenderPassBegin->renderArea;
 
-   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_DRAWING_RECTANGLE),
-                  .ClippedDrawingRectangleYMin = MAX2(render_area->offset.y, 0),
-                  .ClippedDrawingRectangleXMin = MAX2(render_area->offset.x, 0),
-                  .ClippedDrawingRectangleYMax =
-                     render_area->offset.y + render_area->extent.height - 1,
-                  .ClippedDrawingRectangleXMax =
-                     render_area->offset.x + render_area->extent.width - 1,
-                  .DrawingRectangleOriginY = 0,
-                  .DrawingRectangleOriginX = 0);
+   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_DRAWING_RECTANGLE), r) {
+      r.ClippedDrawingRectangleYMin = MAX2(render_area->offset.y, 0);
+      r.ClippedDrawingRectangleXMin = MAX2(render_area->offset.x, 0);
+      r.ClippedDrawingRectangleYMax =
+         render_area->offset.y + render_area->extent.height - 1;
+      r.ClippedDrawingRectangleXMax =
+         render_area->offset.x + render_area->extent.width - 1;
+      r.DrawingRectangleOriginY     = 0;
+      r.DrawingRectangleOriginX     = 0;
+   }
 
    genX(cmd_buffer_set_subpass)(cmd_buffer, pass->subpasses);
    anv_cmd_buffer_clear_subpass(cmd_buffer);
@@ -760,22 +1086,24 @@ static void
 emit_ps_depth_count(struct anv_batch *batch,
                     struct anv_bo *bo, uint32_t offset)
 {
-   anv_batch_emit(batch, GENX(PIPE_CONTROL),
-                  .DestinationAddressType = DAT_PPGTT,
-                  .PostSyncOperation = WritePSDepthCount,
-                  .DepthStallEnable = true,
-                  .Address = { bo, offset });
+   anv_batch_emit(batch, GENX(PIPE_CONTROL), pc) {
+      pc.DestinationAddressType  = DAT_PPGTT;
+      pc.PostSyncOperation       = WritePSDepthCount;
+      pc.DepthStallEnable        = true;
+      pc.Address                 = (struct anv_address) { bo, offset };
+   }
 }
 
 static void
 emit_query_availability(struct anv_batch *batch,
                         struct anv_bo *bo, uint32_t offset)
 {
-   anv_batch_emit(batch, GENX(PIPE_CONTROL),
-                  .DestinationAddressType = DAT_PPGTT,
-                  .PostSyncOperation = WriteImmediateData,
-                  .Address = { bo, offset },
-                  .ImmediateData = 1);
+   anv_batch_emit(batch, GENX(PIPE_CONTROL), pc) {
+      pc.DestinationAddressType  = DAT_PPGTT;
+      pc.PostSyncOperation       = WriteImmediateData;
+      pc.Address                 = (struct anv_address) { bo, offset };
+      pc.ImmediateData           = 1;
+   }
 }
 
 void genX(CmdBeginQuery)(
@@ -795,9 +1123,10 @@ void genX(CmdBeginQuery)(
     */
    if (cmd_buffer->state.need_query_wa) {
       cmd_buffer->state.need_query_wa = false;
-      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL),
-                     .DepthCacheFlushEnable = true,
-                     .DepthStallEnable = true);
+      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
+         pc.DepthCacheFlushEnable   = true;
+         pc.DepthStallEnable        = true;
+      }
    }
 
    switch (pool->type) {
@@ -851,20 +1180,23 @@ void genX(CmdWriteTimestamp)(
 
    switch (pipelineStage) {
    case VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT:
-      anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM),
-                     .RegisterAddress = TIMESTAMP,
-                     .MemoryAddress = { &pool->bo, offset });
-      anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM),
-                     .RegisterAddress = TIMESTAMP + 4,
-                     .MemoryAddress = { &pool->bo, offset + 4 });
+      anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {
+         srm.RegisterAddress  = TIMESTAMP;
+         srm.MemoryAddress    = (struct anv_address) { &pool->bo, offset };
+      }
+      anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {
+         srm.RegisterAddress  = TIMESTAMP + 4;
+         srm.MemoryAddress    = (struct anv_address) { &pool->bo, offset + 4 };
+      }
       break;
 
    default:
       /* Everything else is bottom-of-pipe */
-      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL),
-                     .DestinationAddressType = DAT_PPGTT,
-                     .PostSyncOperation = WriteTimestamp,
-                     .Address = { &pool->bo, offset });
+      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
+         pc.DestinationAddressType  = DAT_PPGTT,
+         pc.PostSyncOperation       = WriteTimestamp,
+         pc.Address = (struct anv_address) { &pool->bo, offset };
+      }
       break;
    }
 
@@ -909,26 +1241,31 @@ static void
 emit_load_alu_reg_u64(struct anv_batch *batch, uint32_t reg,
                       struct anv_bo *bo, uint32_t offset)
 {
-   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM),
-                  .RegisterAddress = reg,
-                  .MemoryAddress = { bo, offset });
-   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM),
-                  .RegisterAddress = reg + 4,
-                  .MemoryAddress = { bo, offset + 4 });
+   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
+      lrm.RegisterAddress  = reg,
+      lrm.MemoryAddress    = (struct anv_address) { bo, offset };
+   }
+   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
+      lrm.RegisterAddress  = reg + 4;
+      lrm.MemoryAddress    = (struct anv_address) { bo, offset + 4 };
+   }
 }
 
 static void
 store_query_result(struct anv_batch *batch, uint32_t reg,
                    struct anv_bo *bo, uint32_t offset, VkQueryResultFlags flags)
 {
-      anv_batch_emit(batch, GENX(MI_STORE_REGISTER_MEM),
-                     .RegisterAddress = reg,
-                     .MemoryAddress = { bo, offset });
+   anv_batch_emit(batch, GENX(MI_STORE_REGISTER_MEM), srm) {
+      srm.RegisterAddress  = reg;
+      srm.MemoryAddress    = (struct anv_address) { bo, offset };
+   }
 
-      if (flags & VK_QUERY_RESULT_64_BIT)
-         anv_batch_emit(batch, GENX(MI_STORE_REGISTER_MEM),
-                        .RegisterAddress = reg + 4,
-                        .MemoryAddress = { bo, offset + 4 });
+   if (flags & VK_QUERY_RESULT_64_BIT) {
+      anv_batch_emit(batch, GENX(MI_STORE_REGISTER_MEM), srm) {
+         srm.RegisterAddress  = reg + 4;
+         srm.MemoryAddress    = (struct anv_address) { bo, offset + 4 };
+      }
+   }
 }
 
 void genX(CmdCopyQueryPoolResults)(
@@ -946,10 +1283,12 @@ void genX(CmdCopyQueryPoolResults)(
    ANV_FROM_HANDLE(anv_buffer, buffer, destBuffer);
    uint32_t slot_offset, dst_offset;
 
-   if (flags & VK_QUERY_RESULT_WAIT_BIT)
-      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL),
-                     .CommandStreamerStallEnable = true,
-                     .StallAtPixelScoreboard = true);
+   if (flags & VK_QUERY_RESULT_WAIT_BIT) {
+      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
+         pc.CommandStreamerStallEnable = true;
+         pc.StallAtPixelScoreboard     = true;
+      }
+   }
 
    dst_offset = buffer->offset + destOffset;
    for (uint32_t i = 0; i < queryCount; i++) {