/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "anv_private.h"

#if (ANV_GEN == 9)
#  include "gen9_pack.h"
#elif (ANV_GEN == 8)
#  include "gen8_pack.h"
#elif (ANV_IS_HASWELL)
#  include "gen75_pack.h"
#elif (ANV_GEN == 7)
#  include "gen7_pack.h"
#endif
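
/* Note: this file is built once per supported hardware generation.  The
 * genX()/GENX() macros (defined elsewhere in the driver) prefix each symbol
 * with the generation, so genX(CmdDraw) compiles to gen8_CmdDraw when
 * ANV_GEN == 8, and the matching genN_pack.h above supplies that
 * generation's command packing structs.
 */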

void
genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer)
{
   struct anv_device *device = cmd_buffer->device;
   struct anv_bo *scratch_bo = NULL;

   cmd_buffer->state.scratch_size =
      anv_block_pool_size(&device->scratch_block_pool);
   if (cmd_buffer->state.scratch_size > 0)
      scratch_bo = &device->scratch_block_pool.bo;

/* XXX: Do we need this on more than just BDW? */
#if (ANV_GEN >= 8)
   /* Emit a render target cache flush.
    *
    * This isn't documented anywhere in the PRM.  However, it seems to be
    * necessary prior to changing the surface state base address.  Without
    * this, we get GPU hangs when using multi-level command buffers which
    * clear depth, reset state base address, and then go render stuff.
    */
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL),
                  .RenderTargetCacheFlushEnable = true);
#endif

   anv_batch_emit(&cmd_buffer->batch, GENX(STATE_BASE_ADDRESS),
      .GeneralStateBaseAddress = { scratch_bo, 0 },
      .GeneralStateMemoryObjectControlState = GENX(MOCS),
      .GeneralStateBaseAddressModifyEnable = true,

      .SurfaceStateBaseAddress = anv_cmd_buffer_surface_base_address(cmd_buffer),
      .SurfaceStateMemoryObjectControlState = GENX(MOCS),
      .SurfaceStateBaseAddressModifyEnable = true,

      .DynamicStateBaseAddress = { &device->dynamic_state_block_pool.bo, 0 },
      .DynamicStateMemoryObjectControlState = GENX(MOCS),
      .DynamicStateBaseAddressModifyEnable = true,

      .IndirectObjectBaseAddress = { NULL, 0 },
      .IndirectObjectMemoryObjectControlState = GENX(MOCS),
      .IndirectObjectBaseAddressModifyEnable = true,

      .InstructionBaseAddress = { &device->instruction_block_pool.bo, 0 },
      .InstructionMemoryObjectControlState = GENX(MOCS),
      .InstructionBaseAddressModifyEnable = true,

#if (ANV_GEN >= 8)
      /* Broadwell requires that we specify a buffer size for a bunch of
       * these fields.  However, since we will be growing the BOs live, we
       * just set them all to the maximum.
       */
      .GeneralStateBufferSize = 0xfffff,
      .GeneralStateBufferSizeModifyEnable = true,
      .DynamicStateBufferSize = 0xfffff,
      .DynamicStateBufferSizeModifyEnable = true,
      .IndirectObjectBufferSize = 0xfffff,
      .IndirectObjectBufferSizeModifyEnable = true,
      .InstructionBufferSize = 0xfffff,
      .InstructionBuffersizeModifyEnable = true,
#endif
   );
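
   /* Note: we assume here (based on the Broadwell PRM's STATE_BASE_ADDRESS
    * layout) that the *BufferSize fields above are counted in 4 KB pages,
    * so 0xfffff covers the entire 4 GB addressable range no matter how
    * large the pools grow.
    */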

   /* After re-setting the surface state base address, we have to do some
    * cache flushing so that the sampler engine will pick up the new
    * SURFACE_STATE objects and binding tables.  From the Broadwell PRM,
    * Shared Function > 3D Sampler > State > State Caching (page 96):
    *
    *    Coherency with system memory in the state cache, like the texture
    *    cache is handled partially by software.  It is expected that the
    *    command stream or shader will issue Cache Flush operation or
    *    Cache_Flush sampler message to ensure that the L1 cache remains
    *    coherent with system memory.
    *
    *    [...]
    *
    *    Whenever the value of the Dynamic_State_Base_Addr,
    *    Surface_State_Base_Addr are altered, the L1 state cache must be
    *    invalidated to ensure the new surface or sampler state is fetched
    *    from system memory.
    *
    * The PIPE_CONTROL command has a "State Cache Invalidation Enable" bit
    * which, according to the PIPE_CONTROL instruction documentation in the
    * Broadwell PRM:
    *
    *    Setting this bit is independent of any other bit in this packet.
    *    This bit controls the invalidation of the L1 and L2 state caches
    *    at the top of the pipe i.e. at the parsing time.
    *
    * Unfortunately, experimentation seems to indicate that state cache
    * invalidation through a PIPE_CONTROL does nothing whatsoever with
    * regard to surface state and binding tables.  Instead, it seems that
    * invalidating the texture cache is what is actually needed.
    *
    * XXX: As far as we have been able to determine through
    * experimentation, flushing the texture cache appears to be
    * sufficient.  The theory here is that all of the sampling/rendering
    * units cache the binding table in the texture cache.  However, we have
    * yet to be able to actually confirm this.
    */
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL),
                  .TextureCacheInvalidationEnable = true);
}

void genX(CmdPipelineBarrier)(
    VkCommandBuffer                             commandBuffer,
    VkPipelineStageFlags                        srcStageMask,
    VkPipelineStageFlags                        destStageMask,
    VkBool32                                    byRegion,
    uint32_t                                    memoryBarrierCount,
    const VkMemoryBarrier*                      pMemoryBarriers,
    uint32_t                                    bufferMemoryBarrierCount,
    const VkBufferMemoryBarrier*                pBufferMemoryBarriers,
    uint32_t                                    imageMemoryBarrierCount,
    const VkImageMemoryBarrier*                 pImageMemoryBarriers)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   uint32_t b, *dw;

   /* XXX: Right now, we're really dumb and just flush whatever categories
    * the app asks for.  One of these days we may make this a bit better
    * but right now that's all the hardware allows for in most areas.
    */
   VkAccessFlags src_flags = 0;
   VkAccessFlags dst_flags = 0;

   for (uint32_t i = 0; i < memoryBarrierCount; i++) {
      src_flags |= pMemoryBarriers[i].srcAccessMask;
      dst_flags |= pMemoryBarriers[i].dstAccessMask;
   }

   for (uint32_t i = 0; i < bufferMemoryBarrierCount; i++) {
      src_flags |= pBufferMemoryBarriers[i].srcAccessMask;
      dst_flags |= pBufferMemoryBarriers[i].dstAccessMask;
   }

   for (uint32_t i = 0; i < imageMemoryBarrierCount; i++) {
      src_flags |= pImageMemoryBarriers[i].srcAccessMask;
      dst_flags |= pImageMemoryBarriers[i].dstAccessMask;
   }

   /* Mask out the source access flags we care about */
   const uint32_t src_mask =
      VK_ACCESS_SHADER_WRITE_BIT |
      VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT |
      VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT |
      VK_ACCESS_TRANSFER_WRITE_BIT;

   src_flags = src_flags & src_mask;

   /* Mask out the destination access flags we care about */
   const uint32_t dst_mask =
      VK_ACCESS_INDIRECT_COMMAND_READ_BIT |
      VK_ACCESS_INDEX_READ_BIT |
      VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT |
      VK_ACCESS_UNIFORM_READ_BIT |
      VK_ACCESS_SHADER_READ_BIT |
      VK_ACCESS_COLOR_ATTACHMENT_READ_BIT |
      VK_ACCESS_TRANSFER_READ_BIT;

   dst_flags = dst_flags & dst_mask;

   /* The src flags represent how things were used previously.  This is
    * what we use for doing flushes.
    */
   struct GENX(PIPE_CONTROL) flush_cmd = {
      GENX(PIPE_CONTROL_header),
      .PostSyncOperation = NoWrite,
   };

   for_each_bit(b, src_flags) {
      switch ((VkAccessFlagBits)(1 << b)) {
      case VK_ACCESS_SHADER_WRITE_BIT:
         flush_cmd.DCFlushEnable = true;
         break;
      case VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT:
         flush_cmd.RenderTargetCacheFlushEnable = true;
         break;
      case VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT:
         flush_cmd.DepthCacheFlushEnable = true;
         break;
      case VK_ACCESS_TRANSFER_WRITE_BIT:
         flush_cmd.RenderTargetCacheFlushEnable = true;
         flush_cmd.DepthCacheFlushEnable = true;
         break;
      default:
         unreachable("should've masked this out by now");
      }
   }

   /* If we end up doing two PIPE_CONTROLs, the first, flushing one also has
    * to stall and wait for the flushing to finish, so we don't re-dirty the
    * caches with in-flight rendering after the second PIPE_CONTROL
    * invalidates.
    */
   if (dst_flags)
      flush_cmd.CommandStreamerStallEnable = true;

   if (src_flags && dst_flags) {
      dw = anv_batch_emit_dwords(&cmd_buffer->batch, GENX(PIPE_CONTROL_length));
      GENX(PIPE_CONTROL_pack)(&cmd_buffer->batch, dw, &flush_cmd);
   }

   /* The dst flags represent how things will be used in the future.  This
    * is what we use for doing cache invalidations.
    */
   struct GENX(PIPE_CONTROL) invalidate_cmd = {
      GENX(PIPE_CONTROL_header),
      .PostSyncOperation = NoWrite,
   };

   for_each_bit(b, dst_flags) {
      switch ((VkAccessFlagBits)(1 << b)) {
      case VK_ACCESS_INDIRECT_COMMAND_READ_BIT:
      case VK_ACCESS_INDEX_READ_BIT:
      case VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT:
         invalidate_cmd.VFCacheInvalidationEnable = true;
         break;
      case VK_ACCESS_UNIFORM_READ_BIT:
         invalidate_cmd.ConstantCacheInvalidationEnable = true;
         break;
      case VK_ACCESS_SHADER_READ_BIT:
         invalidate_cmd.TextureCacheInvalidationEnable = true;
         break;
      case VK_ACCESS_COLOR_ATTACHMENT_READ_BIT:
         invalidate_cmd.TextureCacheInvalidationEnable = true;
         break;
      case VK_ACCESS_TRANSFER_READ_BIT:
         invalidate_cmd.TextureCacheInvalidationEnable = true;
         break;
      default:
         unreachable("should've masked this out by now");
      }
   }

   dw = anv_batch_emit_dwords(&cmd_buffer->batch, GENX(PIPE_CONTROL_length));
   GENX(PIPE_CONTROL_pack)(&cmd_buffer->batch, dw, &invalidate_cmd);
}
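
/* The VS fetches gl_BaseVertex/gl_BaseInstance through an extra vertex
 * element rather than through push constants: the helpers below write the
 * two dwords into a small buffer and bind it at vertex buffer index 32,
 * the slot reserved for this whenever vs_prog_data.uses_basevertex or
 * .uses_baseinstance is set.
 */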

static void
emit_base_vertex_instance_bo(struct anv_cmd_buffer *cmd_buffer,
                             struct anv_bo *bo, uint32_t offset)
{
   uint32_t *p = anv_batch_emitn(&cmd_buffer->batch, 5,
                                 GENX(3DSTATE_VERTEX_BUFFERS));

   GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, p + 1,
      &(struct GENX(VERTEX_BUFFER_STATE)) {
         .VertexBufferIndex = 32, /* Reserved for this */
         .AddressModifyEnable = true,
         .BufferPitch = 0,
#if (ANV_GEN >= 8)
         .MemoryObjectControlState = GENX(MOCS),
         .BufferStartingAddress = { bo, offset },
         .BufferSize = 8
#else
         .VertexBufferMemoryObjectControlState = GENX(MOCS),
         .BufferStartingAddress = { bo, offset },
         .EndAddress = { bo, offset + 8 },
#endif
      });
}

static void
emit_base_vertex_instance(struct anv_cmd_buffer *cmd_buffer,
                          uint32_t base_vertex, uint32_t base_instance)
{
   struct anv_state id_state =
      anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 8, 4);

   ((uint32_t *)id_state.map)[0] = base_vertex;
   ((uint32_t *)id_state.map)[1] = base_instance;

   if (!cmd_buffer->device->info.has_llc)
      anv_state_clflush(id_state);

   emit_base_vertex_instance_bo(cmd_buffer,
      &cmd_buffer->device->dynamic_state_block_pool.bo, id_state.offset);
}

void genX(CmdDraw)(
    VkCommandBuffer                             commandBuffer,
    uint32_t                                    vertexCount,
    uint32_t                                    instanceCount,
    uint32_t                                    firstVertex,
    uint32_t                                    firstInstance)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;

   genX(cmd_buffer_flush_state)(cmd_buffer);

   if (cmd_buffer->state.pipeline->vs_prog_data.uses_basevertex ||
       cmd_buffer->state.pipeline->vs_prog_data.uses_baseinstance)
      emit_base_vertex_instance(cmd_buffer, firstVertex, firstInstance);

   anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE),
                  .VertexAccessType = SEQUENTIAL,
                  .PrimitiveTopologyType = pipeline->topology,
                  .VertexCountPerInstance = vertexCount,
                  .StartVertexLocation = firstVertex,
                  .InstanceCount = instanceCount,
                  .StartInstanceLocation = firstInstance,
                  .BaseVertexLocation = 0);
}
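
/* CmdDrawIndexed differs from CmdDraw above mainly in the VertexAccessType:
 * RANDOM tells the hardware to fetch vertices through the index buffer,
 * whereas SEQUENTIAL (used above) walks the vertex buffers linearly.
 */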

void genX(CmdDrawIndexed)(
    VkCommandBuffer                             commandBuffer,
    uint32_t                                    indexCount,
    uint32_t                                    instanceCount,
    uint32_t                                    firstIndex,
    int32_t                                     vertexOffset,
    uint32_t                                    firstInstance)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;

   genX(cmd_buffer_flush_state)(cmd_buffer);

   if (cmd_buffer->state.pipeline->vs_prog_data.uses_basevertex ||
       cmd_buffer->state.pipeline->vs_prog_data.uses_baseinstance)
      emit_base_vertex_instance(cmd_buffer, vertexOffset, firstInstance);

   anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE),
                  .VertexAccessType = RANDOM,
                  .PrimitiveTopologyType = pipeline->topology,
                  .VertexCountPerInstance = indexCount,
                  .StartVertexLocation = firstIndex,
                  .InstanceCount = instanceCount,
                  .StartInstanceLocation = firstInstance,
                  .BaseVertexLocation = vertexOffset);
}

/* Auto-Draw / Indirect Registers */
#define GEN7_3DPRIM_END_OFFSET          0x2420
#define GEN7_3DPRIM_START_VERTEX        0x2430
#define GEN7_3DPRIM_VERTEX_COUNT        0x2434
#define GEN7_3DPRIM_INSTANCE_COUNT      0x2438
#define GEN7_3DPRIM_START_INSTANCE      0x243C
#define GEN7_3DPRIM_BASE_VERTEX         0x2440
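
/* These are MI registers in the command streamer.  For an indirect draw we
 * load the application-supplied parameters into them from memory with
 * MI_LOAD_REGISTER_MEM, and a 3DPRIMITIVE with IndirectParameterEnable set
 * then takes its counts from the registers rather than from the command
 * itself.
 */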

static void
emit_lrm(struct anv_batch *batch,
         uint32_t reg, struct anv_bo *bo, uint32_t offset)
{
   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM),
                  .RegisterAddress = reg,
                  .MemoryAddress = { bo, offset });
}

static void
emit_lri(struct anv_batch *batch, uint32_t reg, uint32_t imm)
{
   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM),
                  .RegisterOffset = reg,
                  .DataDWord = imm);
}
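
/* The indirect buffer for a non-indexed draw holds a VkDrawIndirectCommand:
 *
 *    uint32_t vertexCount;    // bo_offset + 0
 *    uint32_t instanceCount;  // bo_offset + 4
 *    uint32_t firstVertex;    // bo_offset + 8
 *    uint32_t firstInstance;  // bo_offset + 12
 *
 * which is why the base vertex/instance pair for the shader is pulled from
 * bo_offset + 8 below.
 */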

void genX(CmdDrawIndirect)(
    VkCommandBuffer                             commandBuffer,
    VkBuffer                                    _buffer,
    VkDeviceSize                                offset,
    uint32_t                                    drawCount,
    uint32_t                                    stride)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
   struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
   struct anv_bo *bo = buffer->bo;
   uint32_t bo_offset = buffer->offset + offset;

   genX(cmd_buffer_flush_state)(cmd_buffer);

   if (cmd_buffer->state.pipeline->vs_prog_data.uses_basevertex ||
       cmd_buffer->state.pipeline->vs_prog_data.uses_baseinstance)
      emit_base_vertex_instance_bo(cmd_buffer, bo, bo_offset + 8);

   emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_VERTEX_COUNT, bo, bo_offset);
   emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset + 4);
   emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_VERTEX, bo, bo_offset + 8);
   emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_INSTANCE, bo, bo_offset + 12);
   emit_lri(&cmd_buffer->batch, GEN7_3DPRIM_BASE_VERTEX, 0);

   anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE),
                  .IndirectParameterEnable = true,
                  .VertexAccessType = SEQUENTIAL,
                  .PrimitiveTopologyType = pipeline->topology);
}
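
/* For indexed indirect draws the buffer instead holds a
 * VkDrawIndexedIndirectCommand:
 *
 *    uint32_t indexCount;     // bo_offset + 0
 *    uint32_t instanceCount;  // bo_offset + 4
 *    uint32_t firstIndex;     // bo_offset + 8
 *    int32_t  vertexOffset;   // bo_offset + 12
 *    uint32_t firstInstance;  // bo_offset + 16
 *
 * so the vertexOffset/firstInstance pair for the shader lives at
 * bo_offset + 12.
 */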

void genX(CmdDrawIndexedIndirect)(
    VkCommandBuffer                             commandBuffer,
    VkBuffer                                    _buffer,
    VkDeviceSize                                offset,
    uint32_t                                    drawCount,
    uint32_t                                    stride)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
   struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
   struct anv_bo *bo = buffer->bo;
   uint32_t bo_offset = buffer->offset + offset;

   genX(cmd_buffer_flush_state)(cmd_buffer);

   /* TODO: We need to stomp base vertex to 0 somehow */
   if (cmd_buffer->state.pipeline->vs_prog_data.uses_basevertex ||
       cmd_buffer->state.pipeline->vs_prog_data.uses_baseinstance)
      emit_base_vertex_instance_bo(cmd_buffer, bo, bo_offset + 12);

   emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_VERTEX_COUNT, bo, bo_offset);
   emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset + 4);
   emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_VERTEX, bo, bo_offset + 8);
   emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_BASE_VERTEX, bo, bo_offset + 12);
   emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_INSTANCE, bo, bo_offset + 16);

   anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE),
                  .IndirectParameterEnable = true,
                  .VertexAccessType = RANDOM,
                  .PrimitiveTopologyType = pipeline->topology);
}
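
/* When the compute shader reads gl_NumWorkGroups (uses_num_work_groups),
 * the dispatch dimensions have to be visible to the shader as a buffer.
 * For a direct dispatch we write x/y/z into freshly allocated dynamic
 * state below; for an indirect dispatch we can simply point at the
 * application's parameter buffer.
 */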

void genX(CmdDispatch)(
    VkCommandBuffer                             commandBuffer,
    uint32_t                                    x,
    uint32_t                                    y,
    uint32_t                                    z)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   struct anv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
   struct brw_cs_prog_data *prog_data = &pipeline->cs_prog_data;

   if (prog_data->uses_num_work_groups) {
      struct anv_state state =
         anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 12, 4);
      uint32_t *sizes = state.map;
      sizes[0] = x;
      sizes[1] = y;
      sizes[2] = z;
      if (!cmd_buffer->device->info.has_llc)
         anv_state_clflush(state);
      cmd_buffer->state.num_workgroups_offset = state.offset;
      cmd_buffer->state.num_workgroups_bo =
         &cmd_buffer->device->dynamic_state_block_pool.bo;
   }

   genX(cmd_buffer_flush_compute_state)(cmd_buffer);

   anv_batch_emit(&cmd_buffer->batch, GENX(GPGPU_WALKER),
                  .SIMDSize = prog_data->simd_size / 16,
                  .ThreadDepthCounterMaximum = 0,
                  .ThreadHeightCounterMaximum = 0,
                  .ThreadWidthCounterMaximum = pipeline->cs_thread_width_max - 1,
                  .ThreadGroupIDXDimension = x,
                  .ThreadGroupIDYDimension = y,
                  .ThreadGroupIDZDimension = z,
                  .RightExecutionMask = pipeline->cs_right_mask,
                  .BottomExecutionMask = 0xffffffff);

   anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_STATE_FLUSH));
}

#define GPGPU_DISPATCHDIMX 0x2500
#define GPGPU_DISPATCHDIMY 0x2504
#define GPGPU_DISPATCHDIMZ 0x2508
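
/* VkDispatchIndirectCommand is three tightly packed uint32_t values
 * (x, y, z), which map one-to-one onto the GPGPU_DISPATCHDIM registers
 * loaded below.
 */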

void genX(CmdDispatchIndirect)(
    VkCommandBuffer                             commandBuffer,
    VkBuffer                                    _buffer,
    VkDeviceSize                                offset)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
   struct anv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
   struct brw_cs_prog_data *prog_data = &pipeline->cs_prog_data;
   struct anv_bo *bo = buffer->bo;
   uint32_t bo_offset = buffer->offset + offset;

   if (prog_data->uses_num_work_groups) {
      cmd_buffer->state.num_workgroups_offset = bo_offset;
      cmd_buffer->state.num_workgroups_bo = bo;
   }

   genX(cmd_buffer_flush_compute_state)(cmd_buffer);

   emit_lrm(&cmd_buffer->batch, GPGPU_DISPATCHDIMX, bo, bo_offset);
   emit_lrm(&cmd_buffer->batch, GPGPU_DISPATCHDIMY, bo, bo_offset + 4);
   emit_lrm(&cmd_buffer->batch, GPGPU_DISPATCHDIMZ, bo, bo_offset + 8);

   anv_batch_emit(&cmd_buffer->batch, GENX(GPGPU_WALKER),
                  .IndirectParameterEnable = true,
                  .SIMDSize = prog_data->simd_size / 16,
                  .ThreadDepthCounterMaximum = 0,
                  .ThreadHeightCounterMaximum = 0,
                  .ThreadWidthCounterMaximum = pipeline->cs_thread_width_max - 1,
                  .RightExecutionMask = pipeline->cs_right_mask,
                  .BottomExecutionMask = 0xffffffff);

   anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_STATE_FLUSH));
}
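
/* PIPELINE_SELECT switches the hardware between the 3D and GPGPU (compute)
 * pipelines.  The switch is not free, so we track the current pipeline in
 * cmd_buffer->state and only emit the command when it actually changes.
 */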
void
genX(flush_pipeline_select_3d)(struct anv_cmd_buffer *cmd_buffer)
{
   if (cmd_buffer->state.current_pipeline != _3D) {
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPELINE_SELECT),
#if ANV_GEN >= 9
                     .MaskBits = 3,
#endif
                     .PipelineSelection = _3D);
      cmd_buffer->state.current_pipeline = _3D;
   }
}

static void
cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer)
{
   struct anv_device *device = cmd_buffer->device;
   const struct anv_framebuffer *fb = cmd_buffer->state.framebuffer;
   const struct anv_image_view *iview =
      anv_cmd_buffer_get_depth_stencil_view(cmd_buffer);
   const struct anv_image *image = iview ? iview->image : NULL;
   const struct anv_format *anv_format =
      iview ? anv_format_for_vk_format(iview->vk_format) : NULL;
   const bool has_depth = iview && anv_format->has_depth;
   const bool has_stencil = iview && anv_format->has_stencil;

   /* FIXME: Implement the PMA stall W/A */
   /* FIXME: Width and Height are wrong */

   /* Emit 3DSTATE_DEPTH_BUFFER */
   if (has_depth) {
      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_DEPTH_BUFFER),
         .SurfaceType = SURFTYPE_2D,
         .DepthWriteEnable = true,
         .StencilWriteEnable = has_stencil,
         .HierarchicalDepthBufferEnable = false,
         .SurfaceFormat = isl_surf_get_depth_format(&device->isl_dev,
                                                    &image->depth_surface.isl),
         .SurfacePitch = image->depth_surface.isl.row_pitch - 1,
         .SurfaceBaseAddress = {
            .bo = image->bo,
            .offset = image->depth_surface.offset,
         },
         .Height = fb->height - 1,
         .Width = fb->width - 1,
         .LOD = 0,
         .Depth = 1 - 1,
         .MinimumArrayElement = 0,
         .DepthBufferObjectControlState = GENX(MOCS),
#if (ANV_GEN >= 8)
         .SurfaceQPitch = isl_surf_get_array_pitch_el_rows(&image->depth_surface.isl) >> 2,
#endif
         .RenderTargetViewExtent = 1 - 1);
   } else {
      /* Even when no depth buffer is present, the hardware requires that
       * 3DSTATE_DEPTH_BUFFER be programmed correctly.  The Broadwell PRM says:
       *
       *    If a null depth buffer is bound, the driver must instead bind depth as:
       *       3DSTATE_DEPTH.SurfaceType = SURFTYPE_2D
       *       3DSTATE_DEPTH.Width = 1
       *       3DSTATE_DEPTH.Height = 1
       *       3DSTATE_DEPTH.SurfaceFormat = D16_UNORM
       *       3DSTATE_DEPTH.SurfaceBaseAddress = 0
       *       3DSTATE_DEPTH.HierarchicalDepthBufferEnable = 0
       *       3DSTATE_WM_DEPTH_STENCIL.DepthTestEnable = 0
       *       3DSTATE_WM_DEPTH_STENCIL.DepthBufferWriteEnable = 0
       *
       * The PRM is wrong, though.  The width and height must be programmed
       * to the actual framebuffer's width and height, even when neither
       * depth buffer nor stencil buffer is present.
       */
      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_DEPTH_BUFFER),
         .SurfaceType = SURFTYPE_2D,
         .SurfaceFormat = D16_UNORM,
         .Width = fb->width - 1,
         .Height = fb->height - 1,
         .StencilWriteEnable = has_stencil);
   }

   /* Emit 3DSTATE_STENCIL_BUFFER */
   if (has_stencil) {
      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_STENCIL_BUFFER),
#if ANV_GEN >= 8 || ANV_IS_HASWELL
         .StencilBufferEnable = true,
#endif
         .StencilBufferObjectControlState = GENX(MOCS),

         /* Stencil buffers have strange pitch.  The PRM says:
          *
          *    The pitch must be set to 2x the value computed based on width,
          *    as the stencil buffer is stored with two rows interleaved.
          */
         .SurfacePitch = 2 * image->stencil_surface.isl.row_pitch - 1,

#if (ANV_GEN >= 8)
         .SurfaceQPitch = isl_surf_get_array_pitch_el_rows(&image->stencil_surface.isl) >> 2,
#endif

         .SurfaceBaseAddress = {
            .bo = image->bo,
            .offset = image->offset + image->stencil_surface.offset,
         });
   } else {
      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_STENCIL_BUFFER));
   }

   /* Disable hierarchical depth buffers. */
   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_HIER_DEPTH_BUFFER));

   /* Clear the clear params. */
   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CLEAR_PARAMS));
}

/**
 * @see anv_cmd_buffer_set_subpass()
 */
void
genX(cmd_buffer_set_subpass)(struct anv_cmd_buffer *cmd_buffer,
                             struct anv_subpass *subpass)
{
   cmd_buffer->state.subpass = subpass;

   cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT;

   cmd_buffer_emit_depth_stencil(cmd_buffer);
}

void genX(CmdBeginRenderPass)(
    VkCommandBuffer                             commandBuffer,
    const VkRenderPassBeginInfo*                pRenderPassBegin,
    VkSubpassContents                           contents)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_render_pass, pass, pRenderPassBegin->renderPass);
   ANV_FROM_HANDLE(anv_framebuffer, framebuffer, pRenderPassBegin->framebuffer);

   cmd_buffer->state.framebuffer = framebuffer;
   cmd_buffer->state.pass = pass;
   anv_cmd_state_setup_attachments(cmd_buffer, pRenderPassBegin);

   genX(flush_pipeline_select_3d)(cmd_buffer);

   const VkRect2D *render_area = &pRenderPassBegin->renderArea;

   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_DRAWING_RECTANGLE),
                  .ClippedDrawingRectangleYMin = render_area->offset.y,
                  .ClippedDrawingRectangleXMin = render_area->offset.x,
                  .ClippedDrawingRectangleYMax =
                     render_area->offset.y + render_area->extent.height - 1,
                  .ClippedDrawingRectangleXMax =
                     render_area->offset.x + render_area->extent.width - 1,
                  .DrawingRectangleOriginY = 0,
                  .DrawingRectangleOriginX = 0);

   genX(cmd_buffer_set_subpass)(cmd_buffer, pass->subpasses);
   anv_cmd_buffer_clear_subpass(cmd_buffer);
}

void genX(CmdNextSubpass)(
    VkCommandBuffer                             commandBuffer,
    VkSubpassContents                           contents)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);

   assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);

   anv_cmd_buffer_resolve_subpass(cmd_buffer);
   genX(cmd_buffer_set_subpass)(cmd_buffer, cmd_buffer->state.subpass + 1);
   anv_cmd_buffer_clear_subpass(cmd_buffer);
}

void genX(CmdEndRenderPass)(
    VkCommandBuffer                             commandBuffer)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);

   anv_cmd_buffer_resolve_subpass(cmd_buffer);

   /* Emit a flushing pipe control at the end of a pass.  This is kind of a
    * hack but it ensures that render targets always actually get written.
    * Eventually, we should do flushing based on image format transitions
    * or something of that nature.
    */
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL),
                  .PostSyncOperation = NoWrite,
                  .RenderTargetCacheFlushEnable = true,
                  .InstructionCacheInvalidateEnable = true,
                  .DepthCacheFlushEnable = true,
                  .VFCacheInvalidationEnable = true,
                  .TextureCacheInvalidationEnable = true,
                  .CommandStreamerStallEnable = true);
}