src/intel/vulkan/genX_cmd_buffer.c

   1 /*
   2  * Copyright © 2015 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include <assert.h>
  25 #include <stdbool.h>
  26
  27 #include "anv_private.h"
  28
  29 #include "common/gen_l3_config.h"
  30 #include "genxml/gen_macros.h"
  31 #include "genxml/genX_pack.h"
  32
  33 static void
  34 emit_lrm(struct anv_batch *batch,
  35          uint32_t reg, struct anv_bo *bo, uint32_t offset)
  36 {
  37    anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
  38       lrm.RegisterAddress  = reg;
  39       lrm.MemoryAddress    = (struct anv_address) { bo, offset };
  40    }
  41 }
  42
  43 static void
  44 emit_lri(struct anv_batch *batch, uint32_t reg, uint32_t imm)
  45 {
  46    anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
  47       lri.RegisterOffset   = reg;
  48       lri.DataDWord        = imm;
  49    }
  50 }
  51
  52 void
  53 genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer)
  54 {
  55    struct anv_device *device = cmd_buffer->device;
  56
  57 /* XXX: Do we need this on more than just BDW? */
  58 #if (GEN_GEN >= 8)
  59    /* Emit a render target cache flush.
  60     *
  61     * This isn't documented anywhere in the PRM.  However, it seems to be
  62     * necessary prior to changing the surface state base adress.  Without
  63     * this, we get GPU hangs when using multi-level command buffers which
  64     * clear depth, reset state base address, and then go render stuff.
  65     */
  66    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
  67       pc.RenderTargetCacheFlushEnable = true;
  68    }
  69 #endif
  70
  71    anv_batch_emit(&cmd_buffer->batch, GENX(STATE_BASE_ADDRESS), sba) {
  72       sba.GeneralStateBaseAddress = (struct anv_address) { NULL, 0 };
  73       sba.GeneralStateMemoryObjectControlState = GENX(MOCS);
  74       sba.GeneralStateBaseAddressModifyEnable = true;
  75
  76       sba.SurfaceStateBaseAddress =
  77          anv_cmd_buffer_surface_base_address(cmd_buffer);
  78       sba.SurfaceStateMemoryObjectControlState = GENX(MOCS);
  79       sba.SurfaceStateBaseAddressModifyEnable = true;
  80
  81       sba.DynamicStateBaseAddress =
  82          (struct anv_address) { &device->dynamic_state_block_pool.bo, 0 };
  83       sba.DynamicStateMemoryObjectControlState = GENX(MOCS);
  84       sba.DynamicStateBaseAddressModifyEnable = true;
  85
  86       sba.IndirectObjectBaseAddress = (struct anv_address) { NULL, 0 };
  87       sba.IndirectObjectMemoryObjectControlState = GENX(MOCS);
  88       sba.IndirectObjectBaseAddressModifyEnable = true;
  89
  90       sba.InstructionBaseAddress =
  91          (struct anv_address) { &device->instruction_block_pool.bo, 0 };
  92       sba.InstructionMemoryObjectControlState = GENX(MOCS);
  93       sba.InstructionBaseAddressModifyEnable = true;
  94
  95 #  if (GEN_GEN >= 8)
  96       /* Broadwell requires that we specify a buffer size for a bunch of
  97        * these fields.  However, since we will be growing the BO's live, we
  98        * just set them all to the maximum.
  99        */
 100       sba.GeneralStateBufferSize                = 0xfffff;
 101       sba.GeneralStateBufferSizeModifyEnable    = true;
 102       sba.DynamicStateBufferSize                = 0xfffff;
 103       sba.DynamicStateBufferSizeModifyEnable    = true;
 104       sba.IndirectObjectBufferSize              = 0xfffff;
 105       sba.IndirectObjectBufferSizeModifyEnable  = true;
 106       sba.InstructionBufferSize                 = 0xfffff;
 107       sba.InstructionBuffersizeModifyEnable     = true;
 108 #  endif
 109    }
 110
 111    /* After re-setting the surface state base address, we have to do some
 112     * cache flusing so that the sampler engine will pick up the new
 113     * SURFACE_STATE objects and binding tables. From the Broadwell PRM,
 114     * Shared Function > 3D Sampler > State > State Caching (page 96):
 115     *
 116     *    Coherency with system memory in the state cache, like the texture
 117     *    cache is handled partially by software. It is expected that the
 118     *    command stream or shader will issue Cache Flush operation or
 119     *    Cache_Flush sampler message to ensure that the L1 cache remains
 120     *    coherent with system memory.
 121     *
 122     *    [...]
 123     *
 124     *    Whenever the value of the Dynamic_State_Base_Addr,
 125     *    Surface_State_Base_Addr are altered, the L1 state cache must be
 126     *    invalidated to ensure the new surface or sampler state is fetched
 127     *    from system memory.
 128     *
 129     * The PIPE_CONTROL command has a "State Cache Invalidation Enable" bit
 130     * which, according the PIPE_CONTROL instruction documentation in the
 131     * Broadwell PRM:
 132     *
 133     *    Setting this bit is independent of any other bit in this packet.
 134     *    This bit controls the invalidation of the L1 and L2 state caches
 135     *    at the top of the pipe i.e. at the parsing time.
 136     *
 137     * Unfortunately, experimentation seems to indicate that state cache
 138     * invalidation through a PIPE_CONTROL does nothing whatsoever in
 139     * regards to surface state and binding tables.  In stead, it seems that
 140     * invalidating the texture cache is what is actually needed.
 141     *
 142     * XXX:  As far as we have been able to determine through
 143     * experimentation, shows that flush the texture cache appears to be
 144     * sufficient.  The theory here is that all of the sampling/rendering
 145     * units cache the binding table in the texture cache.  However, we have
 146     * yet to be able to actually confirm this.
 147     */
 148    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
 149       pc.TextureCacheInvalidationEnable = true;
 150    }
 151 }
 152
 153 VkResult
 154 genX(BeginCommandBuffer)(
 155     VkCommandBuffer                             commandBuffer,
 156     const VkCommandBufferBeginInfo*             pBeginInfo)
 157 {
 158    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
 159
 160    /* If this is the first vkBeginCommandBuffer, we must *initialize* the
 161     * command buffer's state. Otherwise, we must *reset* its state. In both
 162     * cases we reset it.
 163     *
 164     * From the Vulkan 1.0 spec:
 165     *
 166     *    If a command buffer is in the executable state and the command buffer
 167     *    was allocated from a command pool with the
 168     *    VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT flag set, then
 169     *    vkBeginCommandBuffer implicitly resets the command buffer, behaving
 170     *    as if vkResetCommandBuffer had been called with
 171     *    VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT not set. It then puts
 172     *    the command buffer in the recording state.
 173     */
 174    anv_cmd_buffer_reset(cmd_buffer);
 175
 176    cmd_buffer->usage_flags = pBeginInfo->flags;
 177
 178    assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY ||
 179           !(cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT));
 180
 181    genX(cmd_buffer_emit_state_base_address)(cmd_buffer);
 182
 183    if (cmd_buffer->usage_flags &
 184        VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
 185       cmd_buffer->state.framebuffer =
 186          anv_framebuffer_from_handle(pBeginInfo->pInheritanceInfo->framebuffer);
 187       cmd_buffer->state.pass =
 188          anv_render_pass_from_handle(pBeginInfo->pInheritanceInfo->renderPass);
 189       cmd_buffer->state.subpass =
 190          &cmd_buffer->state.pass->subpasses[pBeginInfo->pInheritanceInfo->subpass];
 191
 192       cmd_buffer->state.dirty |= ANV_CMD_DIRTY_RENDER_TARGETS;
 193    }
 194
 195    return VK_SUCCESS;
 196 }
 197
 198 VkResult
 199 genX(EndCommandBuffer)(
 200     VkCommandBuffer                             commandBuffer)
 201 {
 202    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
 203    struct anv_device *device = cmd_buffer->device;
 204
 205    anv_cmd_buffer_end_batch_buffer(cmd_buffer);
 206
 207    if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
 208       /* The algorithm used to compute the validate list is not threadsafe as
 209        * it uses the bo->index field.  We have to lock the device around it.
 210        * Fortunately, the chances for contention here are probably very low.
 211        */
 212       pthread_mutex_lock(&device->mutex);
 213       anv_cmd_buffer_prepare_execbuf(cmd_buffer);
 214       pthread_mutex_unlock(&device->mutex);
 215    }
 216
 217    return VK_SUCCESS;
 218 }
 219
 220 void
 221 genX(CmdExecuteCommands)(
 222     VkCommandBuffer                             commandBuffer,
 223     uint32_t                                    commandBufferCount,
 224     const VkCommandBuffer*                      pCmdBuffers)
 225 {
 226    ANV_FROM_HANDLE(anv_cmd_buffer, primary, commandBuffer);
 227
 228    assert(primary->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
 229
 230    for (uint32_t i = 0; i < commandBufferCount; i++) {
 231       ANV_FROM_HANDLE(anv_cmd_buffer, secondary, pCmdBuffers[i]);
 232
 233       assert(secondary->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
 234
 235       anv_cmd_buffer_add_secondary(primary, secondary);
 236    }
 237
 238    /* Each of the secondary command buffers will use its own state base
 239     * address.  We need to re-emit state base address for the primary after
 240     * all of the secondaries are done.
 241     *
 242     * TODO: Maybe we want to make this a dirty bit to avoid extra state base
 243     * address calls?
 244     */
 245    genX(cmd_buffer_emit_state_base_address)(primary);
 246 }
 247
 248 #define IVB_L3SQCREG1_SQGHPCI_DEFAULT     0x00730000
 249 #define VLV_L3SQCREG1_SQGHPCI_DEFAULT     0x00d30000
 250 #define HSW_L3SQCREG1_SQGHPCI_DEFAULT     0x00610000
 251
 252 /**
 253  * Program the hardware to use the specified L3 configuration.
 254  */
 255 void
 256 genX(cmd_buffer_config_l3)(struct anv_cmd_buffer *cmd_buffer,
 257                            const struct gen_l3_config *cfg)
 258 {
 259    assert(cfg);
 260    if (cfg == cmd_buffer->state.current_l3_config)
 261       return;
 262
 263    if (unlikely(INTEL_DEBUG & DEBUG_L3)) {
 264       fprintf(stderr, "L3 config transition: ");
 265       gen_dump_l3_config(cfg, stderr);
 266    }
 267
 268    const bool has_slm = cfg->n[GEN_L3P_SLM];
 269
 270    /* According to the hardware docs, the L3 partitioning can only be changed
 271     * while the pipeline is completely drained and the caches are flushed,
 272     * which involves a first PIPE_CONTROL flush which stalls the pipeline...
 273     */
 274    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
 275       pc.DCFlushEnable = true;
 276       pc.PostSyncOperation = NoWrite;
 277       pc.CommandStreamerStallEnable = true;
 278    }
 279
 280    /* ...followed by a second pipelined PIPE_CONTROL that initiates
 281     * invalidation of the relevant caches.  Note that because RO invalidation
 282     * happens at the top of the pipeline (i.e. right away as the PIPE_CONTROL
 283     * command is processed by the CS) we cannot combine it with the previous
 284     * stalling flush as the hardware documentation suggests, because that
 285     * would cause the CS to stall on previous rendering *after* RO
 286     * invalidation and wouldn't prevent the RO caches from being polluted by
 287     * concurrent rendering before the stall completes.  This intentionally
 288     * doesn't implement the SKL+ hardware workaround suggesting to enable CS
 289     * stall on PIPE_CONTROLs with the texture cache invalidation bit set for
 290     * GPGPU workloads because the previous and subsequent PIPE_CONTROLs
 291     * already guarantee that there is no concurrent GPGPU kernel execution
 292     * (see SKL HSD 2132585).
 293     */
 294    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
 295       pc.TextureCacheInvalidationEnable = true;
 296       pc.ConstantCacheInvalidationEnable = true;
 297       pc.InstructionCacheInvalidateEnable = true;
 298       pc.StateCacheInvalidationEnable = true;
 299       pc.PostSyncOperation = NoWrite;
 300    }
 301
 302    /* Now send a third stalling flush to make sure that invalidation is
 303     * complete when the L3 configuration registers are modified.
 304     */
 305    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
 306       pc.DCFlushEnable = true;
 307       pc.PostSyncOperation = NoWrite;
 308       pc.CommandStreamerStallEnable = true;
 309    }
 310
 311 #if GEN_GEN >= 8
 312
 313    assert(!cfg->n[GEN_L3P_IS] && !cfg->n[GEN_L3P_C] && !cfg->n[GEN_L3P_T]);
 314
 315    uint32_t l3cr;
 316    anv_pack_struct(&l3cr, GENX(L3CNTLREG),
 317                    .SLMEnable = has_slm,
 318                    .URBAllocation = cfg->n[GEN_L3P_URB],
 319                    .ROAllocation = cfg->n[GEN_L3P_RO],
 320                    .DCAllocation = cfg->n[GEN_L3P_DC],
 321                    .AllAllocation = cfg->n[GEN_L3P_ALL]);
 322
 323    /* Set up the L3 partitioning. */
 324    emit_lri(&cmd_buffer->batch, GENX(L3CNTLREG_num), l3cr);
 325
 326 #else
 327
 328    const bool has_dc = cfg->n[GEN_L3P_DC] || cfg->n[GEN_L3P_ALL];
 329    const bool has_is = cfg->n[GEN_L3P_IS] || cfg->n[GEN_L3P_RO] ||
 330                        cfg->n[GEN_L3P_ALL];
 331    const bool has_c = cfg->n[GEN_L3P_C] || cfg->n[GEN_L3P_RO] ||
 332                       cfg->n[GEN_L3P_ALL];
 333    const bool has_t = cfg->n[GEN_L3P_T] || cfg->n[GEN_L3P_RO] ||
 334                       cfg->n[GEN_L3P_ALL];
 335
 336    assert(!cfg->n[GEN_L3P_ALL]);
 337
 338    /* When enabled SLM only uses a portion of the L3 on half of the banks,
 339     * the matching space on the remaining banks has to be allocated to a
 340     * client (URB for all validated configurations) set to the
 341     * lower-bandwidth 2-bank address hashing mode.
 342     */
 343    const struct gen_device_info *devinfo = &cmd_buffer->device->info;
 344    const bool urb_low_bw = has_slm && !devinfo->is_baytrail;
 345    assert(!urb_low_bw || cfg->n[GEN_L3P_URB] == cfg->n[GEN_L3P_SLM]);
 346
 347    /* Minimum number of ways that can be allocated to the URB. */
 348    const unsigned n0_urb = (devinfo->is_baytrail ? 32 : 0);
 349    assert(cfg->n[GEN_L3P_URB] >= n0_urb);
 350
 351    uint32_t l3sqcr1, l3cr2, l3cr3;
 352    anv_pack_struct(&l3sqcr1, GENX(L3SQCREG1),
 353                    .ConvertDC_UC = !has_dc,
 354                    .ConvertIS_UC = !has_is,
 355                    .ConvertC_UC = !has_c,
 356                    .ConvertT_UC = !has_t);
 357    l3sqcr1 |=
 358       GEN_IS_HASWELL ? HSW_L3SQCREG1_SQGHPCI_DEFAULT :
 359       devinfo->is_baytrail ? VLV_L3SQCREG1_SQGHPCI_DEFAULT :
 360       IVB_L3SQCREG1_SQGHPCI_DEFAULT;
 361
 362    anv_pack_struct(&l3cr2, GENX(L3CNTLREG2),
 363                    .SLMEnable = has_slm,
 364                    .URBLowBandwidth = urb_low_bw,
 365                    .URBAllocation = cfg->n[GEN_L3P_URB],
 366 #if !GEN_IS_HASWELL
 367                    .ALLAllocation = cfg->n[GEN_L3P_ALL],
 368 #endif
 369                    .ROAllocation = cfg->n[GEN_L3P_RO],
 370                    .DCAllocation = cfg->n[GEN_L3P_DC]);
 371
 372    anv_pack_struct(&l3cr3, GENX(L3CNTLREG3),
 373                    .ISAllocation = cfg->n[GEN_L3P_IS],
 374                    .ISLowBandwidth = 0,
 375                    .CAllocation = cfg->n[GEN_L3P_C],
 376                    .CLowBandwidth = 0,
 377                    .TAllocation = cfg->n[GEN_L3P_T],
 378                    .TLowBandwidth = 0);
 379
 380    /* Set up the L3 partitioning. */
 381    emit_lri(&cmd_buffer->batch, GENX(L3SQCREG1_num), l3sqcr1);
 382    emit_lri(&cmd_buffer->batch, GENX(L3CNTLREG2_num), l3cr2);
 383    emit_lri(&cmd_buffer->batch, GENX(L3CNTLREG3_num), l3cr3);
 384
 385 #if GEN_IS_HASWELL
 386    if (cmd_buffer->device->instance->physicalDevice.cmd_parser_version >= 4) {
 387       /* Enable L3 atomics on HSW if we have a DC partition, otherwise keep
 388        * them disabled to avoid crashing the system hard.
 389        */
 390       uint32_t scratch1, chicken3;
 391       anv_pack_struct(&scratch1, GENX(SCRATCH1),
 392                       .L3AtomicDisable = !has_dc);
 393       anv_pack_struct(&chicken3, GENX(CHICKEN3),
 394                       .L3AtomicDisableMask = true,
 395                       .L3AtomicDisable = !has_dc);
 396       emit_lri(&cmd_buffer->batch, GENX(SCRATCH1_num), scratch1);
 397       emit_lri(&cmd_buffer->batch, GENX(CHICKEN3_num), chicken3);
 398    }
 399 #endif
 400
 401 #endif
 402
 403    cmd_buffer->state.current_l3_config = cfg;
 404 }
 405
 406 void
 407 genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer)
 408 {
 409    enum anv_pipe_bits bits = cmd_buffer->state.pending_pipe_bits;
 410
 411    /* Flushes are pipelined while invalidations are handled immediately.
 412     * Therefore, if we're flushing anything then we need to schedule a stall
 413     * before any invalidations can happen.
 414     */
 415    if (bits & ANV_PIPE_FLUSH_BITS)
 416       bits |= ANV_PIPE_NEEDS_CS_STALL_BIT;
 417
 418    /* If we're going to do an invalidate and we have a pending CS stall that
 419     * has yet to be resolved, we do the CS stall now.
 420     */
 421    if ((bits & ANV_PIPE_INVALIDATE_BITS) &&
 422        (bits & ANV_PIPE_NEEDS_CS_STALL_BIT)) {
 423       bits |= ANV_PIPE_CS_STALL_BIT;
 424       bits &= ~ANV_PIPE_NEEDS_CS_STALL_BIT;
 425    }
 426
 427    if (bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_CS_STALL_BIT)) {
 428       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
 429          pipe.DepthCacheFlushEnable = bits & ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
 430          pipe.DCFlushEnable = bits & ANV_PIPE_DATA_CACHE_FLUSH_BIT;
 431          pipe.RenderTargetCacheFlushEnable =
 432             bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
 433
 434          pipe.DepthStallEnable = bits & ANV_PIPE_DEPTH_STALL_BIT;
 435          pipe.CommandStreamerStallEnable = bits & ANV_PIPE_CS_STALL_BIT;
 436          pipe.StallAtPixelScoreboard = bits & ANV_PIPE_STALL_AT_SCOREBOARD_BIT;
 437
 438          /*
 439           * According to the Broadwell documentation, any PIPE_CONTROL with the
 440           * "Command Streamer Stall" bit set must also have another bit set,
 441           * with five different options:
 442           *
 443           *  - Render Target Cache Flush
 444           *  - Depth Cache Flush
 445           *  - Stall at Pixel Scoreboard
 446           *  - Post-Sync Operation
 447           *  - Depth Stall
 448           *  - DC Flush Enable
 449           *
 450           * I chose "Stall at Pixel Scoreboard" since that's what we use in
 451           * mesa and it seems to work fine. The choice is fairly arbitrary.
 452           */
 453          if ((bits & ANV_PIPE_CS_STALL_BIT) &&
 454              !(bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_DEPTH_STALL_BIT |
 455                        ANV_PIPE_STALL_AT_SCOREBOARD_BIT)))
 456             pipe.StallAtPixelScoreboard = true;
 457       }
 458
 459       bits &= ~(ANV_PIPE_FLUSH_BITS | ANV_PIPE_CS_STALL_BIT);
 460    }
 461
 462    if (bits & ANV_PIPE_INVALIDATE_BITS) {
 463       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
 464          pipe.StateCacheInvalidationEnable =
 465             bits & ANV_PIPE_STATE_CACHE_INVALIDATE_BIT;
 466          pipe.ConstantCacheInvalidationEnable =
 467             bits & ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT;
 468          pipe.VFCacheInvalidationEnable =
 469             bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
 470          pipe.TextureCacheInvalidationEnable =
 471             bits & ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
 472          pipe.InstructionCacheInvalidateEnable =
 473             bits & ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT;
 474       }
 475
 476       bits &= ~ANV_PIPE_INVALIDATE_BITS;
 477    }
 478
 479    cmd_buffer->state.pending_pipe_bits = bits;
 480 }
 481
 482 void genX(CmdPipelineBarrier)(
 483     VkCommandBuffer                             commandBuffer,
 484     VkPipelineStageFlags                        srcStageMask,
 485     VkPipelineStageFlags                        destStageMask,
 486     VkBool32                                    byRegion,
 487     uint32_t                                    memoryBarrierCount,
 488     const VkMemoryBarrier*                      pMemoryBarriers,
 489     uint32_t                                    bufferMemoryBarrierCount,
 490     const VkBufferMemoryBarrier*                pBufferMemoryBarriers,
 491     uint32_t                                    imageMemoryBarrierCount,
 492     const VkImageMemoryBarrier*                 pImageMemoryBarriers)
 493 {
 494    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
 495    uint32_t b;
 496
 497    /* XXX: Right now, we're really dumb and just flush whatever categories
 498     * the app asks for.  One of these days we may make this a bit better
 499     * but right now that's all the hardware allows for in most areas.
 500     */
 501    VkAccessFlags src_flags = 0;
 502    VkAccessFlags dst_flags = 0;
 503
 504    for (uint32_t i = 0; i < memoryBarrierCount; i++) {
 505       src_flags |= pMemoryBarriers[i].srcAccessMask;
 506       dst_flags |= pMemoryBarriers[i].dstAccessMask;
 507    }
 508
 509    for (uint32_t i = 0; i < bufferMemoryBarrierCount; i++) {
 510       src_flags |= pBufferMemoryBarriers[i].srcAccessMask;
 511       dst_flags |= pBufferMemoryBarriers[i].dstAccessMask;
 512    }
 513
 514    for (uint32_t i = 0; i < imageMemoryBarrierCount; i++) {
 515       src_flags |= pImageMemoryBarriers[i].srcAccessMask;
 516       dst_flags |= pImageMemoryBarriers[i].dstAccessMask;
 517    }
 518
 519    enum anv_pipe_bits pipe_bits = 0;
 520
 521    for_each_bit(b, src_flags) {
 522       switch ((VkAccessFlagBits)(1 << b)) {
 523       case VK_ACCESS_SHADER_WRITE_BIT:
 524          pipe_bits |= ANV_PIPE_DATA_CACHE_FLUSH_BIT;
 525          break;
 526       case VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT:
 527          pipe_bits |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
 528          break;
 529       case VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT:
 530          pipe_bits |= ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
 531          break;
 532       case VK_ACCESS_TRANSFER_WRITE_BIT:
 533          pipe_bits |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
 534          pipe_bits |= ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
 535          break;
 536       default:
 537          break; /* Nothing to do */
 538       }
 539    }
 540
 541    for_each_bit(b, dst_flags) {
 542       switch ((VkAccessFlagBits)(1 << b)) {
 543       case VK_ACCESS_INDIRECT_COMMAND_READ_BIT:
 544       case VK_ACCESS_INDEX_READ_BIT:
 545       case VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT:
 546          pipe_bits |= ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
 547          break;
 548       case VK_ACCESS_UNIFORM_READ_BIT:
 549          pipe_bits |= ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT;
 550          pipe_bits |= ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
 551          break;
 552       case VK_ACCESS_SHADER_READ_BIT:
 553       case VK_ACCESS_COLOR_ATTACHMENT_READ_BIT:
 554       case VK_ACCESS_TRANSFER_READ_BIT:
 555          pipe_bits |= ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
 556          break;
 557       default:
 558          break; /* Nothing to do */
 559       }
 560    }
 561
 562    cmd_buffer->state.pending_pipe_bits |= pipe_bits;
 563 }
 564
 565 static void
 566 cmd_buffer_alloc_push_constants(struct anv_cmd_buffer *cmd_buffer)
 567 {
 568    VkShaderStageFlags stages = cmd_buffer->state.pipeline->active_stages;
 569
 570    /* In order to avoid thrash, we assume that vertex and fragment stages
 571     * always exist.  In the rare case where one is missing *and* the other
 572     * uses push concstants, this may be suboptimal.  However, avoiding stalls
 573     * seems more important.
 574     */
 575    stages |= VK_SHADER_STAGE_FRAGMENT_BIT | VK_SHADER_STAGE_VERTEX_BIT;
 576
 577    if (stages == cmd_buffer->state.push_constant_stages)
 578       return;
 579
 580 #if GEN_GEN >= 8
 581    const unsigned push_constant_kb = 32;
 582 #elif GEN_IS_HASWELL
 583    const unsigned push_constant_kb = cmd_buffer->device->info.gt == 3 ? 32 : 16;
 584 #else
 585    const unsigned push_constant_kb = 16;
 586 #endif
 587
 588    const unsigned num_stages =
 589       _mesa_bitcount(stages & VK_SHADER_STAGE_ALL_GRAPHICS);
 590    unsigned size_per_stage = push_constant_kb / num_stages;
 591
 592    /* Broadwell+ and Haswell gt3 require that the push constant sizes be in
 593     * units of 2KB.  Incidentally, these are the same platforms that have
 594     * 32KB worth of push constant space.
 595     */
 596    if (push_constant_kb == 32)
 597       size_per_stage &= ~1u;
 598
 599    uint32_t kb_used = 0;
 600    for (int i = MESA_SHADER_VERTEX; i < MESA_SHADER_FRAGMENT; i++) {
 601       unsigned push_size = (stages & (1 << i)) ? size_per_stage : 0;
 602       anv_batch_emit(&cmd_buffer->batch,
 603                      GENX(3DSTATE_PUSH_CONSTANT_ALLOC_VS), alloc) {
 604          alloc._3DCommandSubOpcode  = 18 + i;
 605          alloc.ConstantBufferOffset = (push_size > 0) ? kb_used : 0;
 606          alloc.ConstantBufferSize   = push_size;
 607       }
 608       kb_used += push_size;
 609    }
 610
 611    anv_batch_emit(&cmd_buffer->batch,
 612                   GENX(3DSTATE_PUSH_CONSTANT_ALLOC_PS), alloc) {
 613       alloc.ConstantBufferOffset = kb_used;
 614       alloc.ConstantBufferSize = push_constant_kb - kb_used;
 615    }
 616
 617    cmd_buffer->state.push_constant_stages = stages;
 618
 619    /* From the BDW PRM for 3DSTATE_PUSH_CONSTANT_ALLOC_VS:
 620     *
 621     *    "The 3DSTATE_CONSTANT_VS must be reprogrammed prior to
 622     *    the next 3DPRIMITIVE command after programming the
 623     *    3DSTATE_PUSH_CONSTANT_ALLOC_VS"
 624     *
 625     * Since 3DSTATE_PUSH_CONSTANT_ALLOC_VS is programmed as part of
 626     * pipeline setup, we need to dirty push constants.
 627     */
 628    cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_ALL_GRAPHICS;
 629 }
 630
 631 static void
 632 cmd_buffer_emit_descriptor_pointers(struct anv_cmd_buffer *cmd_buffer,
 633                                     uint32_t stages)
 634 {
 635    static const uint32_t sampler_state_opcodes[] = {
 636       [MESA_SHADER_VERTEX]                      = 43,
 637       [MESA_SHADER_TESS_CTRL]                   = 44, /* HS */
 638       [MESA_SHADER_TESS_EVAL]                   = 45, /* DS */
 639       [MESA_SHADER_GEOMETRY]                    = 46,
 640       [MESA_SHADER_FRAGMENT]                    = 47,
 641       [MESA_SHADER_COMPUTE]                     = 0,
 642    };
 643
 644    static const uint32_t binding_table_opcodes[] = {
 645       [MESA_SHADER_VERTEX]                      = 38,
 646       [MESA_SHADER_TESS_CTRL]                   = 39,
 647       [MESA_SHADER_TESS_EVAL]                   = 40,
 648       [MESA_SHADER_GEOMETRY]                    = 41,
 649       [MESA_SHADER_FRAGMENT]                    = 42,
 650       [MESA_SHADER_COMPUTE]                     = 0,
 651    };
 652
 653    anv_foreach_stage(s, stages) {
 654       if (cmd_buffer->state.samplers[s].alloc_size > 0) {
 655          anv_batch_emit(&cmd_buffer->batch,
 656                         GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ssp) {
 657             ssp._3DCommandSubOpcode = sampler_state_opcodes[s];
 658             ssp.PointertoVSSamplerState = cmd_buffer->state.samplers[s].offset;
 659          }
 660       }
 661
 662       /* Always emit binding table pointers if we're asked to, since on SKL
 663        * this is what flushes push constants. */
 664       anv_batch_emit(&cmd_buffer->batch,
 665                      GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), btp) {
 666          btp._3DCommandSubOpcode = binding_table_opcodes[s];
 667          btp.PointertoVSBindingTable = cmd_buffer->state.binding_tables[s].offset;
 668       }
 669    }
 670 }
 671
 672 static uint32_t
 673 cmd_buffer_flush_push_constants(struct anv_cmd_buffer *cmd_buffer)
 674 {
 675    static const uint32_t push_constant_opcodes[] = {
 676       [MESA_SHADER_VERTEX]                      = 21,
 677       [MESA_SHADER_TESS_CTRL]                   = 25, /* HS */
 678       [MESA_SHADER_TESS_EVAL]                   = 26, /* DS */
 679       [MESA_SHADER_GEOMETRY]                    = 22,
 680       [MESA_SHADER_FRAGMENT]                    = 23,
 681       [MESA_SHADER_COMPUTE]                     = 0,
 682    };
 683
 684    VkShaderStageFlags flushed = 0;
 685
 686    anv_foreach_stage(stage, cmd_buffer->state.push_constants_dirty) {
 687       if (stage == MESA_SHADER_COMPUTE)
 688          continue;
 689
 690       struct anv_state state = anv_cmd_buffer_push_constants(cmd_buffer, stage);
 691
 692       if (state.offset == 0) {
 693          anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_VS), c)
 694             c._3DCommandSubOpcode = push_constant_opcodes[stage];
 695       } else {
 696          anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_VS), c) {
 697             c._3DCommandSubOpcode = push_constant_opcodes[stage],
 698             c.ConstantBody = (struct GENX(3DSTATE_CONSTANT_BODY)) {
 699 #if GEN_GEN >= 9
 700                .PointerToConstantBuffer2 = { &cmd_buffer->device->dynamic_state_block_pool.bo, state.offset },
 701                .ConstantBuffer2ReadLength = DIV_ROUND_UP(state.alloc_size, 32),
 702 #else
 703                .PointerToConstantBuffer0 = { .offset = state.offset },
 704                .ConstantBuffer0ReadLength = DIV_ROUND_UP(state.alloc_size, 32),
 705 #endif
 706             };
 707          }
 708       }
 709
 710       flushed |= mesa_to_vk_shader_stage(stage);
 711    }
 712
 713    cmd_buffer->state.push_constants_dirty &= ~VK_SHADER_STAGE_ALL_GRAPHICS;
 714
 715    return flushed;
 716 }
 717
 718 void
 719 genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer)
 720 {
 721    struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
 722    uint32_t *p;
 723
 724    uint32_t vb_emit = cmd_buffer->state.vb_dirty & pipeline->vb_used;
 725
 726    assert((pipeline->active_stages & VK_SHADER_STAGE_COMPUTE_BIT) == 0);
 727
 728    genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->urb.l3_config);
 729
 730    genX(flush_pipeline_select_3d)(cmd_buffer);
 731
 732    if (vb_emit) {
 733       const uint32_t num_buffers = __builtin_popcount(vb_emit);
 734       const uint32_t num_dwords = 1 + num_buffers * 4;
 735
 736       p = anv_batch_emitn(&cmd_buffer->batch, num_dwords,
 737                           GENX(3DSTATE_VERTEX_BUFFERS));
 738       uint32_t vb, i = 0;
 739       for_each_bit(vb, vb_emit) {
 740          struct anv_buffer *buffer = cmd_buffer->state.vertex_bindings[vb].buffer;
 741          uint32_t offset = cmd_buffer->state.vertex_bindings[vb].offset;
 742
 743          struct GENX(VERTEX_BUFFER_STATE) state = {
 744             .VertexBufferIndex = vb,
 745
 746 #if GEN_GEN >= 8
 747             .MemoryObjectControlState = GENX(MOCS),
 748 #else
 749             .BufferAccessType = pipeline->instancing_enable[vb] ? INSTANCEDATA : VERTEXDATA,
 750             .InstanceDataStepRate = 1,
 751             .VertexBufferMemoryObjectControlState = GENX(MOCS),
 752 #endif
 753
 754             .AddressModifyEnable = true,
 755             .BufferPitch = pipeline->binding_stride[vb],
 756             .BufferStartingAddress = { buffer->bo, buffer->offset + offset },
 757
 758 #if GEN_GEN >= 8
 759             .BufferSize = buffer->size - offset
 760 #else
 761             .EndAddress = { buffer->bo, buffer->offset + buffer->size - 1},
 762 #endif
 763          };
 764
 765          GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, &p[1 + i * 4], &state);
 766          i++;
 767       }
 768    }
 769
 770    cmd_buffer->state.vb_dirty &= ~vb_emit;
 771
 772    if (cmd_buffer->state.dirty & ANV_CMD_DIRTY_PIPELINE) {
 773       anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->batch);
 774
 775       /* The exact descriptor layout is pulled from the pipeline, so we need
 776        * to re-emit binding tables on every pipeline change.
 777        */
 778       cmd_buffer->state.descriptors_dirty |=
 779          cmd_buffer->state.pipeline->active_stages;
 780
 781       /* If the pipeline changed, we may need to re-allocate push constant
 782        * space in the URB.
 783        */
 784       cmd_buffer_alloc_push_constants(cmd_buffer);
 785    }
 786
 787 #if GEN_GEN <= 7
 788    if (cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_VERTEX_BIT ||
 789        cmd_buffer->state.push_constants_dirty & VK_SHADER_STAGE_VERTEX_BIT) {
 790       /* From the IVB PRM Vol. 2, Part 1, Section 3.2.1:
 791        *
 792        *    "A PIPE_CONTROL with Post-Sync Operation set to 1h and a depth
 793        *    stall needs to be sent just prior to any 3DSTATE_VS,
 794        *    3DSTATE_URB_VS, 3DSTATE_CONSTANT_VS,
 795        *    3DSTATE_BINDING_TABLE_POINTER_VS,
 796        *    3DSTATE_SAMPLER_STATE_POINTER_VS command.  Only one
 797        *    PIPE_CONTROL needs to be sent before any combination of VS
 798        *    associated 3DSTATE."
 799        */
 800       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
 801          pc.DepthStallEnable  = true;
 802          pc.PostSyncOperation = WriteImmediateData;
 803          pc.Address           =
 804             (struct anv_address) { &cmd_buffer->device->workaround_bo, 0 };
 805       }
 806    }
 807 #endif
 808
 809    /* Render targets live in the same binding table as fragment descriptors */
 810    if (cmd_buffer->state.dirty & ANV_CMD_DIRTY_RENDER_TARGETS)
 811       cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT;
 812
 813    /* We emit the binding tables and sampler tables first, then emit push
 814     * constants and then finally emit binding table and sampler table
 815     * pointers.  It has to happen in this order, since emitting the binding
 816     * tables may change the push constants (in case of storage images). After
 817     * emitting push constants, on SKL+ we have to emit the corresponding
 818     * 3DSTATE_BINDING_TABLE_POINTER_* for the push constants to take effect.
 819     */
 820    uint32_t dirty = 0;
 821    if (cmd_buffer->state.descriptors_dirty)
 822       dirty = anv_cmd_buffer_flush_descriptor_sets(cmd_buffer);
 823
 824    if (cmd_buffer->state.push_constants_dirty) {
 825 #if GEN_GEN >= 9
 826       /* On Sky Lake and later, the binding table pointers commands are
 827        * what actually flush the changes to push constant state so we need
 828        * to dirty them so they get re-emitted below.
 829        */
 830       dirty |= cmd_buffer_flush_push_constants(cmd_buffer);
 831 #else
 832       cmd_buffer_flush_push_constants(cmd_buffer);
 833 #endif
 834    }
 835
 836    if (dirty)
 837       cmd_buffer_emit_descriptor_pointers(cmd_buffer, dirty);
 838
 839    if (cmd_buffer->state.dirty & ANV_CMD_DIRTY_DYNAMIC_VIEWPORT)
 840       gen8_cmd_buffer_emit_viewport(cmd_buffer);
 841
 842    if (cmd_buffer->state.dirty & (ANV_CMD_DIRTY_DYNAMIC_VIEWPORT |
 843                                   ANV_CMD_DIRTY_PIPELINE)) {
 844       gen8_cmd_buffer_emit_depth_viewport(cmd_buffer,
 845                                           pipeline->depth_clamp_enable);
 846    }
 847
 848    if (cmd_buffer->state.dirty & ANV_CMD_DIRTY_DYNAMIC_SCISSOR)
 849       gen7_cmd_buffer_emit_scissor(cmd_buffer);
 850
 851    genX(cmd_buffer_flush_dynamic_state)(cmd_buffer);
 852
 853    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
 854 }
 855
 856 static void
 857 emit_base_vertex_instance_bo(struct anv_cmd_buffer *cmd_buffer,
 858                              struct anv_bo *bo, uint32_t offset)
 859 {
 860    uint32_t *p = anv_batch_emitn(&cmd_buffer->batch, 5,
 861                                  GENX(3DSTATE_VERTEX_BUFFERS));
 862
 863    GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, p + 1,
 864       &(struct GENX(VERTEX_BUFFER_STATE)) {
 865          .VertexBufferIndex = 32, /* Reserved for this */
 866          .AddressModifyEnable = true,
 867          .BufferPitch = 0,
 868 #if (GEN_GEN >= 8)
 869          .MemoryObjectControlState = GENX(MOCS),
 870          .BufferStartingAddress = { bo, offset },
 871          .BufferSize = 8
 872 #else
 873          .VertexBufferMemoryObjectControlState = GENX(MOCS),
 874          .BufferStartingAddress = { bo, offset },
 875          .EndAddress = { bo, offset + 8 },
 876 #endif
 877       });
 878 }
 879
 880 static void
 881 emit_base_vertex_instance(struct anv_cmd_buffer *cmd_buffer,
 882                           uint32_t base_vertex, uint32_t base_instance)
 883 {
 884    struct anv_state id_state =
 885       anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 8, 4);
 886
 887    ((uint32_t *)id_state.map)[0] = base_vertex;
 888    ((uint32_t *)id_state.map)[1] = base_instance;
 889
 890    if (!cmd_buffer->device->info.has_llc)
 891       anv_state_clflush(id_state);
 892
 893    emit_base_vertex_instance_bo(cmd_buffer,
 894       &cmd_buffer->device->dynamic_state_block_pool.bo, id_state.offset);
 895 }
 896
 897 void genX(CmdDraw)(
 898     VkCommandBuffer                             commandBuffer,
 899     uint32_t                                    vertexCount,
 900     uint32_t                                    instanceCount,
 901     uint32_t                                    firstVertex,
 902     uint32_t                                    firstInstance)
 903 {
 904    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
 905    struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
 906    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
 907
 908    genX(cmd_buffer_flush_state)(cmd_buffer);
 909
 910    if (vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance)
 911       emit_base_vertex_instance(cmd_buffer, firstVertex, firstInstance);
 912
 913    anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
 914       prim.VertexAccessType         = SEQUENTIAL;
 915       prim.PrimitiveTopologyType    = pipeline->topology;
 916       prim.VertexCountPerInstance   = vertexCount;
 917       prim.StartVertexLocation      = firstVertex;
 918       prim.InstanceCount            = instanceCount;
 919       prim.StartInstanceLocation    = firstInstance;
 920       prim.BaseVertexLocation       = 0;
 921    }
 922 }
 923
 924 void genX(CmdDrawIndexed)(
 925     VkCommandBuffer                             commandBuffer,
 926     uint32_t                                    indexCount,
 927     uint32_t                                    instanceCount,
 928     uint32_t                                    firstIndex,
 929     int32_t                                     vertexOffset,
 930     uint32_t                                    firstInstance)
 931 {
 932    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
 933    struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
 934    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
 935
 936    genX(cmd_buffer_flush_state)(cmd_buffer);
 937
 938    if (vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance)
 939       emit_base_vertex_instance(cmd_buffer, vertexOffset, firstInstance);
 940
 941    anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
 942       prim.VertexAccessType         = RANDOM;
 943       prim.PrimitiveTopologyType    = pipeline->topology;
 944       prim.VertexCountPerInstance   = indexCount;
 945       prim.StartVertexLocation      = firstIndex;
 946       prim.InstanceCount            = instanceCount;
 947       prim.StartInstanceLocation    = firstInstance;
 948       prim.BaseVertexLocation       = vertexOffset;
 949    }
 950 }
 951
 952 /* Auto-Draw / Indirect Registers */
 953 #define GEN7_3DPRIM_END_OFFSET          0x2420
 954 #define GEN7_3DPRIM_START_VERTEX        0x2430
 955 #define GEN7_3DPRIM_VERTEX_COUNT        0x2434
 956 #define GEN7_3DPRIM_INSTANCE_COUNT      0x2438
 957 #define GEN7_3DPRIM_START_INSTANCE      0x243C
 958 #define GEN7_3DPRIM_BASE_VERTEX         0x2440
 959
 960 void genX(CmdDrawIndirect)(
 961     VkCommandBuffer                             commandBuffer,
 962     VkBuffer                                    _buffer,
 963     VkDeviceSize                                offset,
 964     uint32_t                                    drawCount,
 965     uint32_t                                    stride)
 966 {
 967    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
 968    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
 969    struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
 970    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
 971    struct anv_bo *bo = buffer->bo;
 972    uint32_t bo_offset = buffer->offset + offset;
 973
 974    genX(cmd_buffer_flush_state)(cmd_buffer);
 975
 976    if (vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance)
 977       emit_base_vertex_instance_bo(cmd_buffer, bo, bo_offset + 8);
 978
 979    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_VERTEX_COUNT, bo, bo_offset);
 980    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset + 4);
 981    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_VERTEX, bo, bo_offset + 8);
 982    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_INSTANCE, bo, bo_offset + 12);
 983    emit_lri(&cmd_buffer->batch, GEN7_3DPRIM_BASE_VERTEX, 0);
 984
 985    anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
 986       prim.IndirectParameterEnable  = true;
 987       prim.VertexAccessType         = SEQUENTIAL;
 988       prim.PrimitiveTopologyType    = pipeline->topology;
 989    }
 990 }
 991
 992 void genX(CmdDrawIndexedIndirect)(
 993     VkCommandBuffer                             commandBuffer,
 994     VkBuffer                                    _buffer,
 995     VkDeviceSize                                offset,
 996     uint32_t                                    drawCount,
 997     uint32_t                                    stride)
 998 {
 999    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1000    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
1001    struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
1002    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
1003    struct anv_bo *bo = buffer->bo;
1004    uint32_t bo_offset = buffer->offset + offset;
1005
1006    genX(cmd_buffer_flush_state)(cmd_buffer);
1007
1008    /* TODO: We need to stomp base vertex to 0 somehow */
1009    if (vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance)
1010       emit_base_vertex_instance_bo(cmd_buffer, bo, bo_offset + 12);
1011
1012    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_VERTEX_COUNT, bo, bo_offset);
1013    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset + 4);
1014    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_VERTEX, bo, bo_offset + 8);
1015    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_BASE_VERTEX, bo, bo_offset + 12);
1016    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_INSTANCE, bo, bo_offset + 16);
1017
1018    anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
1019       prim.IndirectParameterEnable  = true;
1020       prim.VertexAccessType         = RANDOM;
1021       prim.PrimitiveTopologyType    = pipeline->topology;
1022    }
1023 }
1024
1025 static VkResult
1026 flush_compute_descriptor_set(struct anv_cmd_buffer *cmd_buffer)
1027 {
1028    struct anv_device *device = cmd_buffer->device;
1029    struct anv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
1030    struct anv_state surfaces = { 0, }, samplers = { 0, };
1031    VkResult result;
1032
1033    result = anv_cmd_buffer_emit_samplers(cmd_buffer,
1034                                          MESA_SHADER_COMPUTE, &samplers);
1035    if (result != VK_SUCCESS)
1036       return result;
1037    result = anv_cmd_buffer_emit_binding_table(cmd_buffer,
1038                                               MESA_SHADER_COMPUTE, &surfaces);
1039    if (result != VK_SUCCESS)
1040       return result;
1041
1042    struct anv_state push_state = anv_cmd_buffer_cs_push_constants(cmd_buffer);
1043
1044    const struct brw_cs_prog_data *cs_prog_data = get_cs_prog_data(pipeline);
1045    const struct brw_stage_prog_data *prog_data = &cs_prog_data->base;
1046
1047    if (push_state.alloc_size) {
1048       anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_CURBE_LOAD), curbe) {
1049          curbe.CURBETotalDataLength    = push_state.alloc_size;
1050          curbe.CURBEDataStartAddress   = push_state.offset;
1051       }
1052    }
1053
1054    const uint32_t slm_size = encode_slm_size(GEN_GEN, prog_data->total_shared);
1055
1056    struct anv_state state =
1057       anv_state_pool_emit(&device->dynamic_state_pool,
1058                           GENX(INTERFACE_DESCRIPTOR_DATA), 64,
1059                           .KernelStartPointer = pipeline->cs_simd,
1060                           .BindingTablePointer = surfaces.offset,
1061                           .BindingTableEntryCount = 0,
1062                           .SamplerStatePointer = samplers.offset,
1063                           .SamplerCount = 0,
1064 #if !GEN_IS_HASWELL
1065                           .ConstantURBEntryReadOffset = 0,
1066 #endif
1067                           .ConstantURBEntryReadLength =
1068                              cs_prog_data->push.per_thread.regs,
1069 #if GEN_GEN >= 8 || GEN_IS_HASWELL
1070                           .CrossThreadConstantDataReadLength =
1071                              cs_prog_data->push.cross_thread.regs,
1072 #endif
1073                           .BarrierEnable = cs_prog_data->uses_barrier,
1074                           .SharedLocalMemorySize = slm_size,
1075                           .NumberofThreadsinGPGPUThreadGroup =
1076                              cs_prog_data->threads);
1077
1078    uint32_t size = GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
1079    anv_batch_emit(&cmd_buffer->batch,
1080                   GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), mid) {
1081       mid.InterfaceDescriptorTotalLength        = size;
1082       mid.InterfaceDescriptorDataStartAddress   = state.offset;
1083    }
1084
1085    return VK_SUCCESS;
1086 }
1087
1088 void
1089 genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer)
1090 {
1091    struct anv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
1092    MAYBE_UNUSED VkResult result;
1093
1094    assert(pipeline->active_stages == VK_SHADER_STAGE_COMPUTE_BIT);
1095
1096    genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->urb.l3_config);
1097
1098    genX(flush_pipeline_select_gpgpu)(cmd_buffer);
1099
1100    if (cmd_buffer->state.compute_dirty & ANV_CMD_DIRTY_PIPELINE)
1101       anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->batch);
1102
1103    if ((cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_COMPUTE_BIT) ||
1104        (cmd_buffer->state.compute_dirty & ANV_CMD_DIRTY_PIPELINE)) {
1105       /* FIXME: figure out descriptors for gen7 */
1106       result = flush_compute_descriptor_set(cmd_buffer);
1107       assert(result == VK_SUCCESS);
1108       cmd_buffer->state.descriptors_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT;
1109    }
1110
1111    cmd_buffer->state.compute_dirty = 0;
1112
1113    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
1114 }
1115
#if GEN_GEN == 7

/**
 * Check that the physical device's command parser version is at least
 * required_version.
 *
 * \param device            device whose physical device is inspected
 * \param required_version  minimum acceptable command parser version
 * \param function          name reported in the error message
 * \return true when the version suffices; otherwise reports
 *         VK_ERROR_FEATURE_NOT_PRESENT via vk_errorf() and returns false
 */
static bool
verify_cmd_parser(const struct anv_device *device,
                  int required_version,
                  const char *function)
{
   if (device->instance->physicalDevice.cmd_parser_version >= required_version)
      return true;

   vk_errorf(VK_ERROR_FEATURE_NOT_PRESENT,
             "cmd parser version %d is required for %s",
             required_version, function);
   return false;
}

#endif
1134
1135 void genX(CmdDispatch)(
1136     VkCommandBuffer                             commandBuffer,
1137     uint32_t                                    x,
1138     uint32_t                                    y,
1139     uint32_t                                    z)
1140 {
1141    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1142    struct anv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
1143    const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
1144
1145    if (prog_data->uses_num_work_groups) {
1146       struct anv_state state =
1147          anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 12, 4);
1148       uint32_t *sizes = state.map;
1149       sizes[0] = x;
1150       sizes[1] = y;
1151       sizes[2] = z;
1152       if (!cmd_buffer->device->info.has_llc)
1153          anv_state_clflush(state);
1154       cmd_buffer->state.num_workgroups_offset = state.offset;
1155       cmd_buffer->state.num_workgroups_bo =
1156          &cmd_buffer->device->dynamic_state_block_pool.bo;
1157    }
1158
1159    genX(cmd_buffer_flush_compute_state)(cmd_buffer);
1160
1161    anv_batch_emit(&cmd_buffer->batch, GENX(GPGPU_WALKER), ggw) {
1162       ggw.SIMDSize                     = prog_data->simd_size / 16;
1163       ggw.ThreadDepthCounterMaximum    = 0;
1164       ggw.ThreadHeightCounterMaximum   = 0;
1165       ggw.ThreadWidthCounterMaximum    = prog_data->threads - 1;
1166       ggw.ThreadGroupIDXDimension      = x;
1167       ggw.ThreadGroupIDYDimension      = y;
1168       ggw.ThreadGroupIDZDimension      = z;
1169       ggw.RightExecutionMask           = pipeline->cs_right_mask;
1170       ggw.BottomExecutionMask          = 0xffffffff;
1171    }
1172
1173    anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_STATE_FLUSH), msf);
1174 }
1175
/* Registers loaded with the x/y/z workgroup counts of an indirect dispatch. */
#define GPGPU_DISPATCHDIMX 0x2500
#define GPGPU_DISPATCHDIMY 0x2504
#define GPGPU_DISPATCHDIMZ 0x2508

/* Source operand registers compared by MI_PREDICATE; each one is written
 * as two 32-bit halves (at +0 and +4) by the code in this file.
 */
#define MI_PREDICATE_SRC0  0x2400
#define MI_PREDICATE_SRC1  0x2408
1182
1183 void genX(CmdDispatchIndirect)(
1184     VkCommandBuffer                             commandBuffer,
1185     VkBuffer                                    _buffer,
1186     VkDeviceSize                                offset)
1187 {
1188    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1189    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
1190    struct anv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
1191    const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
1192    struct anv_bo *bo = buffer->bo;
1193    uint32_t bo_offset = buffer->offset + offset;
1194    struct anv_batch *batch = &cmd_buffer->batch;
1195
1196 #if GEN_GEN == 7
1197    /* Linux 4.4 added command parser version 5 which allows the GPGPU
1198     * indirect dispatch registers to be written.
1199     */
1200    if (!verify_cmd_parser(cmd_buffer->device, 5, "vkCmdDispatchIndirect"))
1201       return;
1202 #endif
1203
1204    if (prog_data->uses_num_work_groups) {
1205       cmd_buffer->state.num_workgroups_offset = bo_offset;
1206       cmd_buffer->state.num_workgroups_bo = bo;
1207    }
1208
1209    genX(cmd_buffer_flush_compute_state)(cmd_buffer);
1210
1211    emit_lrm(batch, GPGPU_DISPATCHDIMX, bo, bo_offset);
1212    emit_lrm(batch, GPGPU_DISPATCHDIMY, bo, bo_offset + 4);
1213    emit_lrm(batch, GPGPU_DISPATCHDIMZ, bo, bo_offset + 8);
1214
1215 #if GEN_GEN <= 7
1216    /* Clear upper 32-bits of SRC0 and all 64-bits of SRC1 */
1217    emit_lri(batch, MI_PREDICATE_SRC0 + 4, 0);
1218    emit_lri(batch, MI_PREDICATE_SRC1 + 0, 0);
1219    emit_lri(batch, MI_PREDICATE_SRC1 + 4, 0);
1220
1221    /* Load compute_dispatch_indirect_x_size into SRC0 */
1222    emit_lrm(batch, MI_PREDICATE_SRC0, bo, bo_offset + 0);
1223
1224    /* predicate = (compute_dispatch_indirect_x_size == 0); */
1225    anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
1226       mip.LoadOperation    = LOAD_LOAD;
1227       mip.CombineOperation = COMBINE_SET;
1228       mip.CompareOperation = COMPARE_SRCS_EQUAL;
1229    }
1230
1231    /* Load compute_dispatch_indirect_y_size into SRC0 */
1232    emit_lrm(batch, MI_PREDICATE_SRC0, bo, bo_offset + 4);
1233
1234    /* predicate |= (compute_dispatch_indirect_y_size == 0); */
1235    anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
1236       mip.LoadOperation    = LOAD_LOAD;
1237       mip.CombineOperation = COMBINE_OR;
1238       mip.CompareOperation = COMPARE_SRCS_EQUAL;
1239    }
1240
1241    /* Load compute_dispatch_indirect_z_size into SRC0 */
1242    emit_lrm(batch, MI_PREDICATE_SRC0, bo, bo_offset + 8);
1243
1244    /* predicate |= (compute_dispatch_indirect_z_size == 0); */
1245    anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
1246       mip.LoadOperation    = LOAD_LOAD;
1247       mip.CombineOperation = COMBINE_OR;
1248       mip.CompareOperation = COMPARE_SRCS_EQUAL;
1249    }
1250
1251    /* predicate = !predicate; */
1252 #define COMPARE_FALSE                           1
1253    anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
1254       mip.LoadOperation    = LOAD_LOADINV;
1255       mip.CombineOperation = COMBINE_OR;
1256       mip.CompareOperation = COMPARE_FALSE;
1257    }
1258 #endif
1259
1260    anv_batch_emit(batch, GENX(GPGPU_WALKER), ggw) {
1261       ggw.IndirectParameterEnable      = true;
1262       ggw.PredicateEnable              = GEN_GEN <= 7;
1263       ggw.SIMDSize                     = prog_data->simd_size / 16;
1264       ggw.ThreadDepthCounterMaximum    = 0;
1265       ggw.ThreadHeightCounterMaximum   = 0;
1266       ggw.ThreadWidthCounterMaximum    = prog_data->threads - 1;
1267       ggw.RightExecutionMask           = pipeline->cs_right_mask;
1268       ggw.BottomExecutionMask          = 0xffffffff;
1269    }
1270
1271    anv_batch_emit(batch, GENX(MEDIA_STATE_FLUSH), msf);
1272 }
1273
1274 static void
1275 flush_pipeline_before_pipeline_select(struct anv_cmd_buffer *cmd_buffer,
1276                                       uint32_t pipeline)
1277 {
1278 #if GEN_GEN >= 8 && GEN_GEN < 10
1279    /* From the Broadwell PRM, Volume 2a: Instructions, PIPELINE_SELECT:
1280     *
1281     *   Software must clear the COLOR_CALC_STATE Valid field in
1282     *   3DSTATE_CC_STATE_POINTERS command prior to send a PIPELINE_SELECT
1283     *   with Pipeline Select set to GPGPU.
1284     *
1285     * The internal hardware docs recommend the same workaround for Gen9
1286     * hardware too.
1287     */
1288    if (pipeline == GPGPU)
1289       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CC_STATE_POINTERS), t);
1290 #elif GEN_GEN <= 7
1291       /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
1292        * PIPELINE_SELECT [DevBWR+]":
1293        *
1294        *   Project: DEVSNB+
1295        *
1296        *   Software must ensure all the write caches are flushed through a
1297        *   stalling PIPE_CONTROL command followed by another PIPE_CONTROL
1298        *   command to invalidate read only caches prior to programming
1299        *   MI_PIPELINE_SELECT command to change the Pipeline Select Mode.
1300        */
1301       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1302          pc.RenderTargetCacheFlushEnable  = true;
1303          pc.DepthCacheFlushEnable         = true;
1304          pc.DCFlushEnable                 = true;
1305          pc.PostSyncOperation             = NoWrite;
1306          pc.CommandStreamerStallEnable    = true;
1307       }
1308
1309       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1310          pc.TextureCacheInvalidationEnable   = true;
1311          pc.ConstantCacheInvalidationEnable  = true;
1312          pc.StateCacheInvalidationEnable     = true;
1313          pc.InstructionCacheInvalidateEnable = true;
1314          pc.PostSyncOperation                = NoWrite;
1315       }
1316 #endif
1317 }
1318
1319 void
1320 genX(flush_pipeline_select_3d)(struct anv_cmd_buffer *cmd_buffer)
1321 {
1322    if (cmd_buffer->state.current_pipeline != _3D) {
1323       flush_pipeline_before_pipeline_select(cmd_buffer, _3D);
1324
1325       anv_batch_emit(&cmd_buffer->batch, GENX(PIPELINE_SELECT), ps) {
1326 #if GEN_GEN >= 9
1327          ps.MaskBits = 3;
1328 #endif
1329          ps.PipelineSelection = _3D;
1330       }
1331
1332       cmd_buffer->state.current_pipeline = _3D;
1333    }
1334 }
1335
1336 void
1337 genX(flush_pipeline_select_gpgpu)(struct anv_cmd_buffer *cmd_buffer)
1338 {
1339    if (cmd_buffer->state.current_pipeline != GPGPU) {
1340       flush_pipeline_before_pipeline_select(cmd_buffer, GPGPU);
1341
1342       anv_batch_emit(&cmd_buffer->batch, GENX(PIPELINE_SELECT), ps) {
1343 #if GEN_GEN >= 9
1344          ps.MaskBits = 3;
1345 #endif
1346          ps.PipelineSelection = GPGPU;
1347       }
1348
1349       cmd_buffer->state.current_pipeline = GPGPU;
1350    }
1351 }
1352
1353 struct anv_state
1354 genX(cmd_buffer_alloc_null_surface_state)(struct anv_cmd_buffer *cmd_buffer,
1355                                           struct anv_framebuffer *fb)
1356 {
1357    struct anv_state state =
1358       anv_state_stream_alloc(&cmd_buffer->surface_state_stream, 64, 64);
1359
1360    struct GENX(RENDER_SURFACE_STATE) null_ss = {
1361       .SurfaceType = SURFTYPE_NULL,
1362       .SurfaceArray = fb->layers > 0,
1363       .SurfaceFormat = ISL_FORMAT_R8G8B8A8_UNORM,
1364 #if GEN_GEN >= 8
1365       .TileMode = YMAJOR,
1366 #else
1367       .TiledSurface = true,
1368 #endif
1369       .Width = fb->width - 1,
1370       .Height = fb->height - 1,
1371       .Depth = fb->layers - 1,
1372       .RenderTargetViewExtent = fb->layers - 1,
1373    };
1374
1375    GENX(RENDER_SURFACE_STATE_pack)(NULL, state.map, &null_ss);
1376
1377    if (!cmd_buffer->device->info.has_llc)
1378       anv_state_clflush(state);
1379
1380    return state;
1381 }
1382
1383 static void
1384 cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer)
1385 {
1386    struct anv_device *device = cmd_buffer->device;
1387    const struct anv_framebuffer *fb = cmd_buffer->state.framebuffer;
1388    const struct anv_image_view *iview =
1389       anv_cmd_buffer_get_depth_stencil_view(cmd_buffer);
1390    const struct anv_image *image = iview ? iview->image : NULL;
1391    const bool has_depth = image && (image->aspects & VK_IMAGE_ASPECT_DEPTH_BIT);
1392    const bool has_hiz = image != NULL && anv_image_has_hiz(image);
1393    const bool has_stencil =
1394       image && (image->aspects & VK_IMAGE_ASPECT_STENCIL_BIT);
1395
1396    /* FIXME: Implement the PMA stall W/A */
1397    /* FIXME: Width and Height are wrong */
1398
1399    /* Emit 3DSTATE_DEPTH_BUFFER */
1400    if (has_depth) {
1401       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_DEPTH_BUFFER), db) {
1402          db.SurfaceType                   = SURFTYPE_2D;
1403          db.DepthWriteEnable              = true;
1404          db.StencilWriteEnable            = has_stencil;
1405
1406          if (cmd_buffer->state.pass->subpass_count == 1) {
1407             db.HierarchicalDepthBufferEnable = has_hiz;
1408          } else {
1409             anv_finishme("Multiple-subpass HiZ not implemented");
1410          }
1411
1412          db.SurfaceFormat = isl_surf_get_depth_format(&device->isl_dev,
1413                                                       &image->depth_surface.isl);
1414
1415          db.SurfaceBaseAddress = (struct anv_address) {
1416             .bo = image->bo,
1417             .offset = image->offset + image->depth_surface.offset,
1418          };
1419          db.DepthBufferObjectControlState = GENX(MOCS);
1420
1421          db.SurfacePitch         = image->depth_surface.isl.row_pitch - 1;
1422          db.Height               = image->extent.height - 1;
1423          db.Width                = image->extent.width - 1;
1424          db.LOD                  = iview->isl.base_level;
1425          db.Depth                = image->array_size - 1; /* FIXME: 3-D */
1426          db.MinimumArrayElement  = iview->isl.base_array_layer;
1427
1428 #if GEN_GEN >= 8
1429          db.SurfaceQPitch =
1430             isl_surf_get_array_pitch_el_rows(&image->depth_surface.isl) >> 2;
1431 #endif
1432          db.RenderTargetViewExtent = 1 - 1;
1433       }
1434    } else {
1435       /* Even when no depth buffer is present, the hardware requires that
1436        * 3DSTATE_DEPTH_BUFFER be programmed correctly. The Broadwell PRM says:
1437        *
1438        *    If a null depth buffer is bound, the driver must instead bind depth as:
1439        *       3DSTATE_DEPTH.SurfaceType = SURFTYPE_2D
1440        *       3DSTATE_DEPTH.Width = 1
1441        *       3DSTATE_DEPTH.Height = 1
1442        *       3DSTATE_DEPTH.SuraceFormat = D16_UNORM
1443        *       3DSTATE_DEPTH.SurfaceBaseAddress = 0
1444        *       3DSTATE_DEPTH.HierarchicalDepthBufferEnable = 0
1445        *       3DSTATE_WM_DEPTH_STENCIL.DepthTestEnable = 0
1446        *       3DSTATE_WM_DEPTH_STENCIL.DepthBufferWriteEnable = 0
1447        *
1448        * The PRM is wrong, though. The width and height must be programmed to
1449        * actual framebuffer's width and height, even when neither depth buffer
1450        * nor stencil buffer is present.  Also, D16_UNORM is not allowed to
1451        * be combined with a stencil buffer so we use D32_FLOAT instead.
1452        */
1453       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_DEPTH_BUFFER), db) {
1454          db.SurfaceType          = SURFTYPE_2D;
1455          db.SurfaceFormat        = D32_FLOAT;
1456          db.Width                = fb->width - 1;
1457          db.Height               = fb->height - 1;
1458          db.StencilWriteEnable   = has_stencil;
1459       }
1460    }
1461
1462    if (has_hiz) {
1463       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_HIER_DEPTH_BUFFER), hdb) {
1464          hdb.HierarchicalDepthBufferObjectControlState = GENX(MOCS);
1465          hdb.SurfacePitch = image->hiz_surface.isl.row_pitch - 1;
1466          hdb.SurfaceBaseAddress = (struct anv_address) {
1467             .bo = image->bo,
1468             .offset = image->offset + image->hiz_surface.offset,
1469          };
1470 #if GEN_GEN >= 8
1471          /* From the SKL PRM Vol2a:
1472           *
1473           *    The interpretation of this field is dependent on Surface Type
1474           *    as follows:
1475           *    - SURFTYPE_1D: distance in pixels between array slices
1476           *    - SURFTYPE_2D/CUBE: distance in rows between array slices
1477           *    - SURFTYPE_3D: distance in rows between R - slices
1478           */
1479          hdb.SurfaceQPitch =
1480             image->hiz_surface.isl.dim == ISL_SURF_DIM_1D ?
1481                isl_surf_get_array_pitch_el(&image->hiz_surface.isl) >> 2 :
1482                isl_surf_get_array_pitch_el_rows(&image->hiz_surface.isl) >> 2;
1483 #endif
1484       }
1485    } else {
1486       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_HIER_DEPTH_BUFFER), hdb);
1487    }
1488
1489    /* Emit 3DSTATE_STENCIL_BUFFER */
1490    if (has_stencil) {
1491       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_STENCIL_BUFFER), sb) {
1492 #if GEN_GEN >= 8 || GEN_IS_HASWELL
1493          sb.StencilBufferEnable = true;
1494 #endif
1495          sb.StencilBufferObjectControlState = GENX(MOCS);
1496
1497          sb.SurfacePitch = image->stencil_surface.isl.row_pitch - 1;
1498
1499 #if GEN_GEN >= 8
1500          sb.SurfaceQPitch = isl_surf_get_array_pitch_el_rows(&image->stencil_surface.isl) >> 2;
1501 #endif
1502          sb.SurfaceBaseAddress = (struct anv_address) {
1503             .bo = image->bo,
1504             .offset = image->offset + image->stencil_surface.offset,
1505          };
1506       }
1507    } else {
1508       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_STENCIL_BUFFER), sb);
1509    }
1510
1511    /* From the IVB PRM Vol2P1, 11.5.5.4 3DSTATE_CLEAR_PARAMS:
1512     *
1513     *    3DSTATE_CLEAR_PARAMS must always be programmed in the along with
1514     *    the other Depth/Stencil state commands(i.e. 3DSTATE_DEPTH_BUFFER,
1515     *    3DSTATE_STENCIL_BUFFER, or 3DSTATE_HIER_DEPTH_BUFFER)
1516     *
1517     * Testing also shows that some variant of this restriction may exist HSW+.
1518     * On BDW+, it is not possible to emit 2 of these packets consecutively when
1519     * both have DepthClearValueValid set. An analysis of such state programming
1520     * on SKL showed that the GPU doesn't register the latter packet's clear
1521     * value.
1522     */
1523    anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CLEAR_PARAMS), cp) {
1524       if (has_hiz) {
1525          cp.DepthClearValueValid = true;
1526          const uint32_t ds =
1527             cmd_buffer->state.subpass->depth_stencil_attachment;
1528          cp.DepthClearValue =
1529             cmd_buffer->state.attachments[ds].clear_value.depthStencil.depth;
1530       }
1531    }
1532 }
1533
1534 static void
1535 genX(cmd_buffer_set_subpass)(struct anv_cmd_buffer *cmd_buffer,
1536                              struct anv_subpass *subpass)
1537 {
1538    cmd_buffer->state.subpass = subpass;
1539
1540    cmd_buffer->state.dirty |= ANV_CMD_DIRTY_RENDER_TARGETS;
1541
1542    cmd_buffer_emit_depth_stencil(cmd_buffer);
1543    genX(cmd_buffer_emit_hz_op)(cmd_buffer, BLORP_HIZ_OP_HIZ_RESOLVE);
1544    genX(cmd_buffer_emit_hz_op)(cmd_buffer, BLORP_HIZ_OP_DEPTH_CLEAR);
1545
1546    anv_cmd_buffer_clear_subpass(cmd_buffer);
1547 }
1548
1549 void genX(CmdBeginRenderPass)(
1550     VkCommandBuffer                             commandBuffer,
1551     const VkRenderPassBeginInfo*                pRenderPassBegin,
1552     VkSubpassContents                           contents)
1553 {
1554    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1555    ANV_FROM_HANDLE(anv_render_pass, pass, pRenderPassBegin->renderPass);
1556    ANV_FROM_HANDLE(anv_framebuffer, framebuffer, pRenderPassBegin->framebuffer);
1557
1558    cmd_buffer->state.framebuffer = framebuffer;
1559    cmd_buffer->state.pass = pass;
1560    cmd_buffer->state.render_area = pRenderPassBegin->renderArea;
1561    anv_cmd_state_setup_attachments(cmd_buffer, pRenderPassBegin);
1562
1563    genX(flush_pipeline_select_3d)(cmd_buffer);
1564
1565    genX(cmd_buffer_set_subpass)(cmd_buffer, pass->subpasses);
1566 }
1567
1568 void genX(CmdNextSubpass)(
1569     VkCommandBuffer                             commandBuffer,
1570     VkSubpassContents                           contents)
1571 {
1572    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1573
1574    assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
1575
1576    anv_cmd_buffer_resolve_subpass(cmd_buffer);
1577    genX(cmd_buffer_set_subpass)(cmd_buffer, cmd_buffer->state.subpass + 1);
1578 }
1579
1580 void genX(CmdEndRenderPass)(
1581     VkCommandBuffer                             commandBuffer)
1582 {
1583    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1584
1585    genX(cmd_buffer_emit_hz_op)(cmd_buffer, BLORP_HIZ_OP_DEPTH_RESOLVE);
1586    anv_cmd_buffer_resolve_subpass(cmd_buffer);
1587
1588 #ifndef NDEBUG
1589    anv_dump_add_framebuffer(cmd_buffer, cmd_buffer->state.framebuffer);
1590 #endif
1591 }
1592
1593 static void
1594 emit_ps_depth_count(struct anv_batch *batch,
1595                     struct anv_bo *bo, uint32_t offset)
1596 {
1597    anv_batch_emit(batch, GENX(PIPE_CONTROL), pc) {
1598       pc.DestinationAddressType  = DAT_PPGTT;
1599       pc.PostSyncOperation       = WritePSDepthCount;
1600       pc.DepthStallEnable        = true;
1601       pc.Address                 = (struct anv_address) { bo, offset };
1602    }
1603 }
1604
1605 static void
1606 emit_query_availability(struct anv_batch *batch,
1607                         struct anv_bo *bo, uint32_t offset)
1608 {
1609    anv_batch_emit(batch, GENX(PIPE_CONTROL), pc) {
1610       pc.DestinationAddressType  = DAT_PPGTT;
1611       pc.PostSyncOperation       = WriteImmediateData;
1612       pc.Address                 = (struct anv_address) { bo, offset };
1613       pc.ImmediateData           = 1;
1614    }
1615 }
1616
1617 void genX(CmdBeginQuery)(
1618     VkCommandBuffer                             commandBuffer,
1619     VkQueryPool                                 queryPool,
1620     uint32_t                                    query,
1621     VkQueryControlFlags                         flags)
1622 {
1623    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1624    ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
1625
1626    /* Workaround: When meta uses the pipeline with the VS disabled, it seems
1627     * that the pipelining of the depth write breaks. What we see is that
1628     * samples from the render pass clear leaks into the first query
1629     * immediately after the clear. Doing a pipecontrol with a post-sync
1630     * operation and DepthStallEnable seems to work around the issue.
1631     */
1632    if (cmd_buffer->state.need_query_wa) {
1633       cmd_buffer->state.need_query_wa = false;
1634       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1635          pc.DepthCacheFlushEnable   = true;
1636          pc.DepthStallEnable        = true;
1637       }
1638    }
1639
1640    switch (pool->type) {
1641    case VK_QUERY_TYPE_OCCLUSION:
1642       emit_ps_depth_count(&cmd_buffer->batch, &pool->bo,
1643                           query * sizeof(struct anv_query_pool_slot));
1644       break;
1645
1646    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
1647    default:
1648       unreachable("");
1649    }
1650 }
1651
1652 void genX(CmdEndQuery)(
1653     VkCommandBuffer                             commandBuffer,
1654     VkQueryPool                                 queryPool,
1655     uint32_t                                    query)
1656 {
1657    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1658    ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
1659
1660    switch (pool->type) {
1661    case VK_QUERY_TYPE_OCCLUSION:
1662       emit_ps_depth_count(&cmd_buffer->batch, &pool->bo,
1663                           query * sizeof(struct anv_query_pool_slot) + 8);
1664
1665       emit_query_availability(&cmd_buffer->batch, &pool->bo,
1666                               query * sizeof(struct anv_query_pool_slot) + 16);
1667       break;
1668
1669    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
1670    default:
1671       unreachable("");
1672    }
1673 }
1674
1675 #define TIMESTAMP 0x2358
1676
1677 void genX(CmdWriteTimestamp)(
1678     VkCommandBuffer                             commandBuffer,
1679     VkPipelineStageFlagBits                     pipelineStage,
1680     VkQueryPool                                 queryPool,
1681     uint32_t                                    query)
1682 {
1683    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1684    ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
1685    uint32_t offset = query * sizeof(struct anv_query_pool_slot);
1686
1687    assert(pool->type == VK_QUERY_TYPE_TIMESTAMP);
1688
1689    switch (pipelineStage) {
1690    case VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT:
1691       anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {
1692          srm.RegisterAddress  = TIMESTAMP;
1693          srm.MemoryAddress    = (struct anv_address) { &pool->bo, offset };
1694       }
1695       anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {
1696          srm.RegisterAddress  = TIMESTAMP + 4;
1697          srm.MemoryAddress    = (struct anv_address) { &pool->bo, offset + 4 };
1698       }
1699       break;
1700
1701    default:
1702       /* Everything else is bottom-of-pipe */
1703       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1704          pc.DestinationAddressType  = DAT_PPGTT;
1705          pc.PostSyncOperation       = WriteTimestamp;
1706          pc.Address = (struct anv_address) { &pool->bo, offset };
1707       }
1708       break;
1709    }
1710
1711    emit_query_availability(&cmd_buffer->batch, &pool->bo, query + 16);
1712 }
1713
1714 #if GEN_GEN > 7 || GEN_IS_HASWELL
1715
/* Helpers for building the instruction dwords that follow an MI_MATH header.
 *
 * Each instruction is the OR of three fields.  The numbers handed to
 * __gen_uint() are 20/31 for the opcode, 10/19 for the first operand and 0/9
 * for the second operand.
 * NOTE(review): assumes __gen_uint(v, start, end) places v in bits
 * start..end - confirm against the genxml pack helpers.
 */
#define alu_opcode(v)   __gen_uint((v),  20, 31)
#define alu_operand1(v) __gen_uint((v),  10, 19)
#define alu_operand2(v) __gen_uint((v),   0,  9)

/* The expansion is fully parenthesized so that alu(...) behaves as a single
 * value when combined with other operators (e.g. `alu(...) & mask`).
 */
#define alu(opcode, operand1, operand2) \
   (alu_opcode(opcode) | alu_operand1(operand1) | alu_operand2(operand2))

/* ALU opcodes */
#define OPCODE_NOOP      0x000
#define OPCODE_LOAD      0x080
#define OPCODE_LOADINV   0x480
#define OPCODE_LOAD0     0x081
#define OPCODE_LOAD1     0x481
#define OPCODE_ADD       0x100
#define OPCODE_SUB       0x101
#define OPCODE_AND       0x102
#define OPCODE_OR        0x103
#define OPCODE_XOR       0x104
#define OPCODE_STORE     0x180
#define OPCODE_STOREINV  0x580

/* ALU operands */
#define OPERAND_R0   0x00
#define OPERAND_R1   0x01
#define OPERAND_R2   0x02
#define OPERAND_R3   0x03
#define OPERAND_R4   0x04
#define OPERAND_SRCA 0x20
#define OPERAND_SRCB 0x21
#define OPERAND_ACCU 0x31
#define OPERAND_ZF   0x32
#define OPERAND_CF   0x33

/* Register address of the n-th GPR loaded and stored by the query helpers;
 * consecutive registers are 8 bytes apart.
 */
#define CS_GPR(n) (0x2600 + (n) * 8)
1747
1748 static void
1749 emit_load_alu_reg_u64(struct anv_batch *batch, uint32_t reg,
1750                       struct anv_bo *bo, uint32_t offset)
1751 {
1752    anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
1753       lrm.RegisterAddress  = reg,
1754       lrm.MemoryAddress    = (struct anv_address) { bo, offset };
1755    }
1756    anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
1757       lrm.RegisterAddress  = reg + 4;
1758       lrm.MemoryAddress    = (struct anv_address) { bo, offset + 4 };
1759    }
1760 }
1761
1762 static void
1763 store_query_result(struct anv_batch *batch, uint32_t reg,
1764                    struct anv_bo *bo, uint32_t offset, VkQueryResultFlags flags)
1765 {
1766    anv_batch_emit(batch, GENX(MI_STORE_REGISTER_MEM), srm) {
1767       srm.RegisterAddress  = reg;
1768       srm.MemoryAddress    = (struct anv_address) { bo, offset };
1769    }
1770
1771    if (flags & VK_QUERY_RESULT_64_BIT) {
1772       anv_batch_emit(batch, GENX(MI_STORE_REGISTER_MEM), srm) {
1773          srm.RegisterAddress  = reg + 4;
1774          srm.MemoryAddress    = (struct anv_address) { bo, offset + 4 };
1775       }
1776    }
1777 }
1778
1779 void genX(CmdCopyQueryPoolResults)(
1780     VkCommandBuffer                             commandBuffer,
1781     VkQueryPool                                 queryPool,
1782     uint32_t                                    firstQuery,
1783     uint32_t                                    queryCount,
1784     VkBuffer                                    destBuffer,
1785     VkDeviceSize                                destOffset,
1786     VkDeviceSize                                destStride,
1787     VkQueryResultFlags                          flags)
1788 {
1789    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1790    ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
1791    ANV_FROM_HANDLE(anv_buffer, buffer, destBuffer);
1792    uint32_t slot_offset, dst_offset;
1793
1794    if (flags & VK_QUERY_RESULT_WAIT_BIT) {
1795       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1796          pc.CommandStreamerStallEnable = true;
1797          pc.StallAtPixelScoreboard     = true;
1798       }
1799    }
1800
1801    dst_offset = buffer->offset + destOffset;
1802    for (uint32_t i = 0; i < queryCount; i++) {
1803
1804       slot_offset = (firstQuery + i) * sizeof(struct anv_query_pool_slot);
1805       switch (pool->type) {
1806       case VK_QUERY_TYPE_OCCLUSION:
1807          emit_load_alu_reg_u64(&cmd_buffer->batch,
1808                                CS_GPR(0), &pool->bo, slot_offset);
1809          emit_load_alu_reg_u64(&cmd_buffer->batch,
1810                                CS_GPR(1), &pool->bo, slot_offset + 8);
1811
1812          /* FIXME: We need to clamp the result for 32 bit. */
1813
1814          uint32_t *dw = anv_batch_emitn(&cmd_buffer->batch, 5, GENX(MI_MATH));
1815          dw[1] = alu(OPCODE_LOAD, OPERAND_SRCA, OPERAND_R1);
1816          dw[2] = alu(OPCODE_LOAD, OPERAND_SRCB, OPERAND_R0);
1817          dw[3] = alu(OPCODE_SUB, 0, 0);
1818          dw[4] = alu(OPCODE_STORE, OPERAND_R2, OPERAND_ACCU);
1819          break;
1820
1821       case VK_QUERY_TYPE_TIMESTAMP:
1822          emit_load_alu_reg_u64(&cmd_buffer->batch,
1823                                CS_GPR(2), &pool->bo, slot_offset);
1824          break;
1825
1826       default:
1827          unreachable("unhandled query type");
1828       }
1829
1830       store_query_result(&cmd_buffer->batch,
1831                          CS_GPR(2), buffer->bo, dst_offset, flags);
1832
1833       if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
1834          emit_load_alu_reg_u64(&cmd_buffer->batch, CS_GPR(0),
1835                                &pool->bo, slot_offset + 16);
1836          if (flags & VK_QUERY_RESULT_64_BIT)
1837             store_query_result(&cmd_buffer->batch,
1838                                CS_GPR(0), buffer->bo, dst_offset + 8, flags);
1839          else
1840             store_query_result(&cmd_buffer->batch,
1841                                CS_GPR(0), buffer->bo, dst_offset + 4, flags);
1842       }
1843
1844       dst_offset += destStride;
1845    }
1846 }
1847
1848 #else
1849 void genX(CmdCopyQueryPoolResults)(
1850     VkCommandBuffer                             commandBuffer,
1851     VkQueryPool                                 queryPool,
1852     uint32_t                                    firstQuery,
1853     uint32_t                                    queryCount,
1854     VkBuffer                                    destBuffer,
1855     VkDeviceSize                                destOffset,
1856     VkDeviceSize                                destStride,
1857     VkQueryResultFlags                          flags)
1858 {
1859    anv_finishme("Queries not yet supported on Ivy Bridge");
1860 }
1861 #endif