src/vulkan/genX_cmd_buffer.c

   1 /*
   2  * Copyright © 2015 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include <assert.h>
  25 #include <stdbool.h>
  26
  27 #include "anv_private.h"
  28
  29 #if (ANV_GEN == 9)
  30 #  include "gen9_pack.h"
  31 #elif (ANV_GEN == 8)
  32 #  include "gen8_pack.h"
  33 #elif (ANV_IS_HASWELL)
  34 #  include "gen75_pack.h"
  35 #elif (ANV_GEN == 7)
  36 #  include "gen7_pack.h"
  37 #endif
  38
  39 void
  40 genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer)
  41 {
  42    struct anv_device *device = cmd_buffer->device;
  43    struct anv_bo *scratch_bo = NULL;
  44
  45    cmd_buffer->state.scratch_size =
  46       anv_block_pool_size(&device->scratch_block_pool);
  47    if (cmd_buffer->state.scratch_size > 0)
  48       scratch_bo = &device->scratch_block_pool.bo;
  49
  50 /* XXX: Do we need this on more than just BDW? */
  51 #if (ANV_GEN >= 8)
  52    /* Emit a render target cache flush.
  53     *
  54     * This isn't documented anywhere in the PRM.  However, it seems to be
  55     * necessary prior to changing the surface state base adress.  Without
  56     * this, we get GPU hangs when using multi-level command buffers which
  57     * clear depth, reset state base address, and then go render stuff.
  58     */
  59    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL),
  60                   .RenderTargetCacheFlushEnable = true);
  61 #endif
  62
  63    anv_batch_emit(&cmd_buffer->batch, GENX(STATE_BASE_ADDRESS),
  64       .GeneralStateBaseAddress = { scratch_bo, 0 },
  65       .GeneralStateMemoryObjectControlState = GENX(MOCS),
  66       .GeneralStateBaseAddressModifyEnable = true,
  67
  68       .SurfaceStateBaseAddress = anv_cmd_buffer_surface_base_address(cmd_buffer),
  69       .SurfaceStateMemoryObjectControlState = GENX(MOCS),
  70       .SurfaceStateBaseAddressModifyEnable = true,
  71
  72       .DynamicStateBaseAddress = { &device->dynamic_state_block_pool.bo, 0 },
  73       .DynamicStateMemoryObjectControlState = GENX(MOCS),
  74       .DynamicStateBaseAddressModifyEnable = true,
  75
  76       .IndirectObjectBaseAddress = { NULL, 0 },
  77       .IndirectObjectMemoryObjectControlState = GENX(MOCS),
  78       .IndirectObjectBaseAddressModifyEnable = true,
  79
  80       .InstructionBaseAddress = { &device->instruction_block_pool.bo, 0 },
  81       .InstructionMemoryObjectControlState = GENX(MOCS),
  82       .InstructionBaseAddressModifyEnable = true,
  83
  84 #  if (ANV_GEN >= 8)
  85       /* Broadwell requires that we specify a buffer size for a bunch of
  86        * these fields.  However, since we will be growing the BO's live, we
  87        * just set them all to the maximum.
  88        */
  89       .GeneralStateBufferSize = 0xfffff,
  90       .GeneralStateBufferSizeModifyEnable = true,
  91       .DynamicStateBufferSize = 0xfffff,
  92       .DynamicStateBufferSizeModifyEnable = true,
  93       .IndirectObjectBufferSize = 0xfffff,
  94       .IndirectObjectBufferSizeModifyEnable = true,
  95       .InstructionBufferSize = 0xfffff,
  96       .InstructionBuffersizeModifyEnable = true,
  97 #  endif
  98    );
  99
 100    /* After re-setting the surface state base address, we have to do some
 101     * cache flusing so that the sampler engine will pick up the new
 102     * SURFACE_STATE objects and binding tables. From the Broadwell PRM,
 103     * Shared Function > 3D Sampler > State > State Caching (page 96):
 104     *
 105     *    Coherency with system memory in the state cache, like the texture
 106     *    cache is handled partially by software. It is expected that the
 107     *    command stream or shader will issue Cache Flush operation or
 108     *    Cache_Flush sampler message to ensure that the L1 cache remains
 109     *    coherent with system memory.
 110     *
 111     *    [...]
 112     *
 113     *    Whenever the value of the Dynamic_State_Base_Addr,
 114     *    Surface_State_Base_Addr are altered, the L1 state cache must be
 115     *    invalidated to ensure the new surface or sampler state is fetched
 116     *    from system memory.
 117     *
 118     * The PIPE_CONTROL command has a "State Cache Invalidation Enable" bit
 119     * which, according the PIPE_CONTROL instruction documentation in the
 120     * Broadwell PRM:
 121     *
 122     *    Setting this bit is independent of any other bit in this packet.
 123     *    This bit controls the invalidation of the L1 and L2 state caches
 124     *    at the top of the pipe i.e. at the parsing time.
 125     *
 126     * Unfortunately, experimentation seems to indicate that state cache
 127     * invalidation through a PIPE_CONTROL does nothing whatsoever in
 128     * regards to surface state and binding tables.  In stead, it seems that
 129     * invalidating the texture cache is what is actually needed.
 130     *
 131     * XXX:  As far as we have been able to determine through
 132     * experimentation, shows that flush the texture cache appears to be
 133     * sufficient.  The theory here is that all of the sampling/rendering
 134     * units cache the binding table in the texture cache.  However, we have
 135     * yet to be able to actually confirm this.
 136     */
 137    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL),
 138                   .TextureCacheInvalidationEnable = true);
 139 }
 140
 141 void genX(CmdPipelineBarrier)(
 142     VkCommandBuffer                             commandBuffer,
 143     VkPipelineStageFlags                        srcStageMask,
 144     VkPipelineStageFlags                        destStageMask,
 145     VkBool32                                    byRegion,
 146     uint32_t                                    memoryBarrierCount,
 147     const VkMemoryBarrier*                      pMemoryBarriers,
 148     uint32_t                                    bufferMemoryBarrierCount,
 149     const VkBufferMemoryBarrier*                pBufferMemoryBarriers,
 150     uint32_t                                    imageMemoryBarrierCount,
 151     const VkImageMemoryBarrier*                 pImageMemoryBarriers)
 152 {
 153    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
 154    uint32_t b, *dw;
 155
 156    struct GENX(PIPE_CONTROL) cmd = {
 157       GENX(PIPE_CONTROL_header),
 158       .PostSyncOperation = NoWrite,
 159    };
 160
 161    /* XXX: I think waitEvent is a no-op on our HW.  We should verify that. */
 162
 163    if (anv_clear_mask(&srcStageMask, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT)) {
 164       /* This is just what PIPE_CONTROL does */
 165    }
 166
 167    if (anv_clear_mask(&srcStageMask,
 168                       VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT |
 169                       VK_PIPELINE_STAGE_VERTEX_INPUT_BIT |
 170                       VK_PIPELINE_STAGE_VERTEX_SHADER_BIT |
 171                       VK_PIPELINE_STAGE_TESSELLATION_CONTROL_SHADER_BIT |
 172                       VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT |
 173                       VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT |
 174                       VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT |
 175                       VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT |
 176                       VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT |
 177                       VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT)) {
 178       cmd.StallAtPixelScoreboard = true;
 179    }
 180
 181    if (anv_clear_mask(&srcStageMask,
 182                       VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT |
 183                       VK_PIPELINE_STAGE_TRANSFER_BIT)) {
 184       cmd.CommandStreamerStallEnable = true;
 185    }
 186
 187    if (anv_clear_mask(&srcStageMask, VK_PIPELINE_STAGE_HOST_BIT)) {
 188       anv_finishme("VK_PIPE_EVENT_CPU_SIGNAL_BIT");
 189    }
 190
 191    /* On our hardware, all stages will wait for execution as needed. */
 192    (void)destStageMask;
 193
 194    /* We checked all known VkPipeEventFlags. */
 195    anv_assert(srcStageMask == 0);
 196
 197    /* XXX: Right now, we're really dumb and just flush whatever categories
 198     * the app asks for.  One of these days we may make this a bit better
 199     * but right now that's all the hardware allows for in most areas.
 200     */
 201    VkAccessFlags src_flags = 0;
 202    VkAccessFlags dst_flags = 0;
 203
 204    for (uint32_t i = 0; i < memoryBarrierCount; i++) {
 205       src_flags |= pMemoryBarriers[i].srcAccessMask;
 206       dst_flags |= pMemoryBarriers[i].dstAccessMask;
 207    }
 208
 209    for (uint32_t i = 0; i < bufferMemoryBarrierCount; i++) {
 210       src_flags |= pBufferMemoryBarriers[i].srcAccessMask;
 211       dst_flags |= pBufferMemoryBarriers[i].dstAccessMask;
 212    }
 213
 214    for (uint32_t i = 0; i < imageMemoryBarrierCount; i++) {
 215       src_flags |= pImageMemoryBarriers[i].srcAccessMask;
 216       dst_flags |= pImageMemoryBarriers[i].dstAccessMask;
 217    }
 218
 219    /* The src flags represent how things were used previously.  This is
 220     * what we use for doing flushes.
 221     */
 222    for_each_bit(b, src_flags) {
 223       switch ((VkAccessFlagBits)(1 << b)) {
 224       case VK_ACCESS_SHADER_WRITE_BIT:
 225          cmd.DCFlushEnable = true;
 226          break;
 227       case VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT:
 228          cmd.RenderTargetCacheFlushEnable = true;
 229          break;
 230       case VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT:
 231          cmd.DepthCacheFlushEnable = true;
 232          break;
 233       case VK_ACCESS_TRANSFER_WRITE_BIT:
 234          cmd.RenderTargetCacheFlushEnable = true;
 235          cmd.DepthCacheFlushEnable = true;
 236          break;
 237       default:
 238          /* Doesn't require a flush */
 239          break;
 240       }
 241    }
 242
 243    /* The dst flags represent how things will be used in the fugure.  This
 244     * is what we use for doing cache invalidations.
 245     */
 246    for_each_bit(b, dst_flags) {
 247       switch ((VkAccessFlagBits)(1 << b)) {
 248       case VK_ACCESS_INDIRECT_COMMAND_READ_BIT:
 249       case VK_ACCESS_INDEX_READ_BIT:
 250       case VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT:
 251          cmd.VFCacheInvalidationEnable = true;
 252          break;
 253       case VK_ACCESS_UNIFORM_READ_BIT:
 254          cmd.ConstantCacheInvalidationEnable = true;
 255          /* fallthrough */
 256       case VK_ACCESS_SHADER_READ_BIT:
 257          cmd.TextureCacheInvalidationEnable = true;
 258          break;
 259       case VK_ACCESS_COLOR_ATTACHMENT_READ_BIT:
 260          cmd.TextureCacheInvalidationEnable = true;
 261          break;
 262       case VK_ACCESS_TRANSFER_READ_BIT:
 263          cmd.TextureCacheInvalidationEnable = true;
 264          break;
 265       case VK_ACCESS_MEMORY_READ_BIT:
 266          break; /* XXX: What is this? */
 267       default:
 268          /* Doesn't require a flush */
 269          break;
 270       }
 271    }
 272
 273    dw = anv_batch_emit_dwords(&cmd_buffer->batch, GENX(PIPE_CONTROL_length));
 274    GENX(PIPE_CONTROL_pack)(&cmd_buffer->batch, dw, &cmd);
 275 }
 276
 277 static void
 278 emit_base_vertex_instance_bo(struct anv_cmd_buffer *cmd_buffer,
 279                              struct anv_bo *bo, uint32_t offset)
 280 {
 281    uint32_t *p = anv_batch_emitn(&cmd_buffer->batch, 5,
 282                                  GENX(3DSTATE_VERTEX_BUFFERS));
 283
 284    GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, p + 1,
 285       &(struct GENX(VERTEX_BUFFER_STATE)) {
 286          .VertexBufferIndex = 32, /* Reserved for this */
 287          .AddressModifyEnable = true,
 288          .BufferPitch = 0,
 289 #if (ANV_GEN >= 8)
 290          .MemoryObjectControlState = GENX(MOCS),
 291          .BufferStartingAddress = { bo, offset },
 292          .BufferSize = 8
 293 #else
 294          .VertexBufferMemoryObjectControlState = GENX(MOCS),
 295          .BufferStartingAddress = { bo, offset },
 296          .EndAddress = { bo, offset + 8 },
 297 #endif
 298       });
 299 }
 300
 301 static void
 302 emit_base_vertex_instance(struct anv_cmd_buffer *cmd_buffer,
 303                           uint32_t base_vertex, uint32_t base_instance)
 304 {
 305    struct anv_state id_state =
 306       anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 8, 4);
 307
 308    ((uint32_t *)id_state.map)[0] = base_vertex;
 309    ((uint32_t *)id_state.map)[1] = base_instance;
 310
 311    if (!cmd_buffer->device->info.has_llc)
 312       anv_state_clflush(id_state);
 313
 314    emit_base_vertex_instance_bo(cmd_buffer,
 315       &cmd_buffer->device->dynamic_state_block_pool.bo, id_state.offset);
 316 }
 317
 318 void genX(CmdDraw)(
 319     VkCommandBuffer                             commandBuffer,
 320     uint32_t                                    vertexCount,
 321     uint32_t                                    instanceCount,
 322     uint32_t                                    firstVertex,
 323     uint32_t                                    firstInstance)
 324 {
 325    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
 326    struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
 327
 328    genX(cmd_buffer_flush_state)(cmd_buffer);
 329
 330    if (cmd_buffer->state.pipeline->vs_prog_data.uses_basevertex ||
 331        cmd_buffer->state.pipeline->vs_prog_data.uses_baseinstance)
 332       emit_base_vertex_instance(cmd_buffer, firstVertex, firstInstance);
 333
 334    anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE),
 335       .VertexAccessType                         = SEQUENTIAL,
 336       .PrimitiveTopologyType                    = pipeline->topology,
 337       .VertexCountPerInstance                   = vertexCount,
 338       .StartVertexLocation                      = firstVertex,
 339       .InstanceCount                            = instanceCount,
 340       .StartInstanceLocation                    = firstInstance,
 341       .BaseVertexLocation                       = 0);
 342 }
 343
 344 void genX(CmdDrawIndexed)(
 345     VkCommandBuffer                             commandBuffer,
 346     uint32_t                                    indexCount,
 347     uint32_t                                    instanceCount,
 348     uint32_t                                    firstIndex,
 349     int32_t                                     vertexOffset,
 350     uint32_t                                    firstInstance)
 351 {
 352    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
 353    struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
 354
 355    genX(cmd_buffer_flush_state)(cmd_buffer);
 356
 357    if (cmd_buffer->state.pipeline->vs_prog_data.uses_basevertex ||
 358        cmd_buffer->state.pipeline->vs_prog_data.uses_baseinstance)
 359       emit_base_vertex_instance(cmd_buffer, vertexOffset, firstInstance);
 360
 361    anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE),
 362       .VertexAccessType                         = RANDOM,
 363       .PrimitiveTopologyType                    = pipeline->topology,
 364       .VertexCountPerInstance                   = indexCount,
 365       .StartVertexLocation                      = firstIndex,
 366       .InstanceCount                            = instanceCount,
 367       .StartInstanceLocation                    = firstInstance,
 368       .BaseVertexLocation                       = vertexOffset);
 369 }
 370
 371 /* Auto-Draw / Indirect Registers */
 372 #define GEN7_3DPRIM_END_OFFSET          0x2420
 373 #define GEN7_3DPRIM_START_VERTEX        0x2430
 374 #define GEN7_3DPRIM_VERTEX_COUNT        0x2434
 375 #define GEN7_3DPRIM_INSTANCE_COUNT      0x2438
 376 #define GEN7_3DPRIM_START_INSTANCE      0x243C
 377 #define GEN7_3DPRIM_BASE_VERTEX         0x2440
 378
 379 static void
 380 emit_lrm(struct anv_batch *batch,
 381          uint32_t reg, struct anv_bo *bo, uint32_t offset)
 382 {
 383    anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM),
 384                   .RegisterAddress = reg,
 385                   .MemoryAddress = { bo, offset });
 386 }
 387
 388 static void
 389 emit_lri(struct anv_batch *batch, uint32_t reg, uint32_t imm)
 390 {
 391    anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM),
 392                   .RegisterOffset = reg,
 393                   .DataDWord = imm);
 394 }
 395
 396 void genX(CmdDrawIndirect)(
 397     VkCommandBuffer                             commandBuffer,
 398     VkBuffer                                    _buffer,
 399     VkDeviceSize                                offset,
 400     uint32_t                                    drawCount,
 401     uint32_t                                    stride)
 402 {
 403    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
 404    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
 405    struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
 406    struct anv_bo *bo = buffer->bo;
 407    uint32_t bo_offset = buffer->offset + offset;
 408
 409    genX(cmd_buffer_flush_state)(cmd_buffer);
 410
 411    if (cmd_buffer->state.pipeline->vs_prog_data.uses_basevertex ||
 412        cmd_buffer->state.pipeline->vs_prog_data.uses_baseinstance)
 413       emit_base_vertex_instance_bo(cmd_buffer, bo, bo_offset + 8);
 414
 415    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_VERTEX_COUNT, bo, bo_offset);
 416    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset + 4);
 417    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_VERTEX, bo, bo_offset + 8);
 418    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_INSTANCE, bo, bo_offset + 12);
 419    emit_lri(&cmd_buffer->batch, GEN7_3DPRIM_BASE_VERTEX, 0);
 420
 421    anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE),
 422       .IndirectParameterEnable                  = true,
 423       .VertexAccessType                         = SEQUENTIAL,
 424       .PrimitiveTopologyType                    = pipeline->topology);
 425 }
 426
 427 void genX(CmdDrawIndexedIndirect)(
 428     VkCommandBuffer                             commandBuffer,
 429     VkBuffer                                    _buffer,
 430     VkDeviceSize                                offset,
 431     uint32_t                                    drawCount,
 432     uint32_t                                    stride)
 433 {
 434    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
 435    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
 436    struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
 437    struct anv_bo *bo = buffer->bo;
 438    uint32_t bo_offset = buffer->offset + offset;
 439
 440    genX(cmd_buffer_flush_state)(cmd_buffer);
 441
 442    /* TODO: We need to stomp base vertex to 0 somehow */
 443    if (cmd_buffer->state.pipeline->vs_prog_data.uses_basevertex ||
 444        cmd_buffer->state.pipeline->vs_prog_data.uses_baseinstance)
 445       emit_base_vertex_instance_bo(cmd_buffer, bo, bo_offset + 12);
 446
 447    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_VERTEX_COUNT, bo, bo_offset);
 448    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset + 4);
 449    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_VERTEX, bo, bo_offset + 8);
 450    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_BASE_VERTEX, bo, bo_offset + 12);
 451    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_INSTANCE, bo, bo_offset + 16);
 452
 453    anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE),
 454       .IndirectParameterEnable                  = true,
 455       .VertexAccessType                         = RANDOM,
 456       .PrimitiveTopologyType                    = pipeline->topology);
 457 }
 458
 459
 460 void genX(CmdDispatch)(
 461     VkCommandBuffer                             commandBuffer,
 462     uint32_t                                    x,
 463     uint32_t                                    y,
 464     uint32_t                                    z)
 465 {
 466    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
 467    struct anv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
 468    struct brw_cs_prog_data *prog_data = &pipeline->cs_prog_data;
 469
 470    if (prog_data->uses_num_work_groups) {
 471       struct anv_state state =
 472          anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 12, 4);
 473       uint32_t *sizes = state.map;
 474       sizes[0] = x;
 475       sizes[1] = y;
 476       sizes[2] = z;
 477       if (!cmd_buffer->device->info.has_llc)
 478          anv_state_clflush(state);
 479       cmd_buffer->state.num_workgroups_offset = state.offset;
 480       cmd_buffer->state.num_workgroups_bo =
 481          &cmd_buffer->device->dynamic_state_block_pool.bo;
 482    }
 483
 484    genX(cmd_buffer_flush_compute_state)(cmd_buffer);
 485
 486    anv_batch_emit(&cmd_buffer->batch, GENX(GPGPU_WALKER),
 487                   .SIMDSize = prog_data->simd_size / 16,
 488                   .ThreadDepthCounterMaximum = 0,
 489                   .ThreadHeightCounterMaximum = 0,
 490                   .ThreadWidthCounterMaximum = pipeline->cs_thread_width_max - 1,
 491                   .ThreadGroupIDXDimension = x,
 492                   .ThreadGroupIDYDimension = y,
 493                   .ThreadGroupIDZDimension = z,
 494                   .RightExecutionMask = pipeline->cs_right_mask,
 495                   .BottomExecutionMask = 0xffffffff);
 496
 497    anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_STATE_FLUSH));
 498 }
 499
 500 #define GPGPU_DISPATCHDIMX 0x2500
 501 #define GPGPU_DISPATCHDIMY 0x2504
 502 #define GPGPU_DISPATCHDIMZ 0x2508
 503
 504 void genX(CmdDispatchIndirect)(
 505     VkCommandBuffer                             commandBuffer,
 506     VkBuffer                                    _buffer,
 507     VkDeviceSize                                offset)
 508 {
 509    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
 510    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
 511    struct anv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
 512    struct brw_cs_prog_data *prog_data = &pipeline->cs_prog_data;
 513    struct anv_bo *bo = buffer->bo;
 514    uint32_t bo_offset = buffer->offset + offset;
 515
 516    if (prog_data->uses_num_work_groups) {
 517       cmd_buffer->state.num_workgroups_offset = bo_offset;
 518       cmd_buffer->state.num_workgroups_bo = bo;
 519    }
 520
 521    genX(cmd_buffer_flush_compute_state)(cmd_buffer);
 522
 523    emit_lrm(&cmd_buffer->batch, GPGPU_DISPATCHDIMX, bo, bo_offset);
 524    emit_lrm(&cmd_buffer->batch, GPGPU_DISPATCHDIMY, bo, bo_offset + 4);
 525    emit_lrm(&cmd_buffer->batch, GPGPU_DISPATCHDIMZ, bo, bo_offset + 8);
 526
 527    anv_batch_emit(&cmd_buffer->batch, GENX(GPGPU_WALKER),
 528                   .IndirectParameterEnable = true,
 529                   .SIMDSize = prog_data->simd_size / 16,
 530                   .ThreadDepthCounterMaximum = 0,
 531                   .ThreadHeightCounterMaximum = 0,
 532                   .ThreadWidthCounterMaximum = pipeline->cs_thread_width_max - 1,
 533                   .RightExecutionMask = pipeline->cs_right_mask,
 534                   .BottomExecutionMask = 0xffffffff);
 535
 536    anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_STATE_FLUSH));
 537 }