src/intel/vulkan/genX_cmd_buffer.c

   1 /*
   2  * Copyright © 2015 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include <assert.h>
  25 #include <stdbool.h>
  26
  27 #include "anv_private.h"
  28
  29 #include "genxml/gen_macros.h"
  30 #include "genxml/genX_pack.h"
  31
  32 void
  33 genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer)
  34 {
  35    struct anv_device *device = cmd_buffer->device;
  36    struct anv_bo *scratch_bo = NULL;
  37
  38    cmd_buffer->state.scratch_size =
  39       anv_block_pool_size(&device->scratch_block_pool);
  40    if (cmd_buffer->state.scratch_size > 0)
  41       scratch_bo = &device->scratch_block_pool.bo;
  42
  43 /* XXX: Do we need this on more than just BDW? */
  44 #if (GEN_GEN >= 8)
  45    /* Emit a render target cache flush.
  46     *
  47     * This isn't documented anywhere in the PRM.  However, it seems to be
  48     * necessary prior to changing the surface state base adress.  Without
  49     * this, we get GPU hangs when using multi-level command buffers which
  50     * clear depth, reset state base address, and then go render stuff.
  51     */
  52    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
  53       pc.RenderTargetCacheFlushEnable = true;
  54    }
  55 #endif
  56
  57    anv_batch_emit(&cmd_buffer->batch, GENX(STATE_BASE_ADDRESS), sba) {
  58       sba.GeneralStateBaseAddress = (struct anv_address) { scratch_bo, 0 };
  59       sba.GeneralStateMemoryObjectControlState = GENX(MOCS);
  60       sba.GeneralStateBaseAddressModifyEnable = true;
  61
  62       sba.SurfaceStateBaseAddress =
  63          anv_cmd_buffer_surface_base_address(cmd_buffer);
  64       sba.SurfaceStateMemoryObjectControlState = GENX(MOCS);
  65       sba.SurfaceStateBaseAddressModifyEnable = true;
  66
  67       sba.DynamicStateBaseAddress =
  68          (struct anv_address) { &device->dynamic_state_block_pool.bo, 0 };
  69       sba.DynamicStateMemoryObjectControlState = GENX(MOCS),
  70       sba.DynamicStateBaseAddressModifyEnable = true,
  71
  72       sba.IndirectObjectBaseAddress = (struct anv_address) { NULL, 0 };
  73       sba.IndirectObjectMemoryObjectControlState = GENX(MOCS);
  74       sba.IndirectObjectBaseAddressModifyEnable = true;
  75
  76       sba.InstructionBaseAddress =
  77          (struct anv_address) { &device->instruction_block_pool.bo, 0 };
  78       sba.InstructionMemoryObjectControlState = GENX(MOCS);
  79       sba.InstructionBaseAddressModifyEnable = true;
  80
  81 #  if (GEN_GEN >= 8)
  82       /* Broadwell requires that we specify a buffer size for a bunch of
  83        * these fields.  However, since we will be growing the BO's live, we
  84        * just set them all to the maximum.
  85        */
  86       sba.GeneralStateBufferSize                = 0xfffff;
  87       sba.GeneralStateBufferSizeModifyEnable    = true;
  88       sba.DynamicStateBufferSize                = 0xfffff;
  89       sba.DynamicStateBufferSizeModifyEnable    = true;
  90       sba.IndirectObjectBufferSize              = 0xfffff;
  91       sba.IndirectObjectBufferSizeModifyEnable  = true;
  92       sba.InstructionBufferSize                 = 0xfffff;
  93       sba.InstructionBuffersizeModifyEnable     = true;
  94 #  endif
  95    }
  96
  97    /* After re-setting the surface state base address, we have to do some
  98     * cache flusing so that the sampler engine will pick up the new
  99     * SURFACE_STATE objects and binding tables. From the Broadwell PRM,
 100     * Shared Function > 3D Sampler > State > State Caching (page 96):
 101     *
 102     *    Coherency with system memory in the state cache, like the texture
 103     *    cache is handled partially by software. It is expected that the
 104     *    command stream or shader will issue Cache Flush operation or
 105     *    Cache_Flush sampler message to ensure that the L1 cache remains
 106     *    coherent with system memory.
 107     *
 108     *    [...]
 109     *
 110     *    Whenever the value of the Dynamic_State_Base_Addr,
 111     *    Surface_State_Base_Addr are altered, the L1 state cache must be
 112     *    invalidated to ensure the new surface or sampler state is fetched
 113     *    from system memory.
 114     *
 115     * The PIPE_CONTROL command has a "State Cache Invalidation Enable" bit
 116     * which, according the PIPE_CONTROL instruction documentation in the
 117     * Broadwell PRM:
 118     *
 119     *    Setting this bit is independent of any other bit in this packet.
 120     *    This bit controls the invalidation of the L1 and L2 state caches
 121     *    at the top of the pipe i.e. at the parsing time.
 122     *
 123     * Unfortunately, experimentation seems to indicate that state cache
 124     * invalidation through a PIPE_CONTROL does nothing whatsoever in
 125     * regards to surface state and binding tables.  In stead, it seems that
 126     * invalidating the texture cache is what is actually needed.
 127     *
 128     * XXX:  As far as we have been able to determine through
 129     * experimentation, shows that flush the texture cache appears to be
 130     * sufficient.  The theory here is that all of the sampling/rendering
 131     * units cache the binding table in the texture cache.  However, we have
 132     * yet to be able to actually confirm this.
 133     */
 134    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
 135       pc.TextureCacheInvalidationEnable = true;
 136    }
 137 }
 138
 139 void
 140 genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer)
 141 {
 142    enum anv_pipe_bits bits = cmd_buffer->state.pending_pipe_bits;
 143
 144    /* Flushes are pipelined while invalidations are handled immediately.
 145     * Therefore, if we're flushing anything then we need to schedule a stall
 146     * before any invalidations can happen.
 147     */
 148    if (bits & ANV_PIPE_FLUSH_BITS)
 149       bits |= ANV_PIPE_NEEDS_CS_STALL_BIT;
 150
 151    /* If we're going to do an invalidate and we have a pending CS stall that
 152     * has yet to be resolved, we do the CS stall now.
 153     */
 154    if ((bits & ANV_PIPE_INVALIDATE_BITS) &&
 155        (bits & ANV_PIPE_NEEDS_CS_STALL_BIT)) {
 156       bits |= ANV_PIPE_CS_STALL_BIT;
 157       bits &= ~ANV_PIPE_NEEDS_CS_STALL_BIT;
 158    }
 159
 160    if (bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_CS_STALL_BIT)) {
 161       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
 162          pipe.DepthCacheFlushEnable = bits & ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
 163          pipe.DCFlushEnable = bits & ANV_PIPE_DATA_CACHE_FLUSH_BIT;
 164          pipe.RenderTargetCacheFlushEnable =
 165             bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
 166
 167          pipe.DepthStallEnable = bits & ANV_PIPE_DEPTH_STALL_BIT;
 168          pipe.CommandStreamerStallEnable = bits & ANV_PIPE_CS_STALL_BIT;
 169          pipe.StallAtPixelScoreboard = bits & ANV_PIPE_STALL_AT_SCOREBOARD_BIT;
 170
 171          /*
 172           * According to the Broadwell documentation, any PIPE_CONTROL with the
 173           * "Command Streamer Stall" bit set must also have another bit set,
 174           * with five different options:
 175           *
 176           *  - Render Target Cache Flush
 177           *  - Depth Cache Flush
 178           *  - Stall at Pixel Scoreboard
 179           *  - Post-Sync Operation
 180           *  - Depth Stall
 181           *  - DC Flush Enable
 182           *
 183           * I chose "Stall at Pixel Scoreboard" since that's what we use in
 184           * mesa and it seems to work fine. The choice is fairly arbitrary.
 185           */
 186          if ((bits & ANV_PIPE_CS_STALL_BIT) &&
 187              !(bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_DEPTH_STALL_BIT |
 188                        ANV_PIPE_STALL_AT_SCOREBOARD_BIT)))
 189             pipe.StallAtPixelScoreboard = true;
 190       }
 191
 192       bits &= ~(ANV_PIPE_FLUSH_BITS | ANV_PIPE_CS_STALL_BIT);
 193    }
 194
 195    if (bits & ANV_PIPE_INVALIDATE_BITS) {
 196       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
 197          pipe.StateCacheInvalidationEnable =
 198             bits & ANV_PIPE_STATE_CACHE_INVALIDATE_BIT;
 199          pipe.ConstantCacheInvalidationEnable =
 200             bits & ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT;
 201          pipe.VFCacheInvalidationEnable =
 202             bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
 203          pipe.TextureCacheInvalidationEnable =
 204             bits & ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
 205          pipe.InstructionCacheInvalidateEnable =
 206             bits & ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT;
 207       }
 208
 209       bits &= ~ANV_PIPE_INVALIDATE_BITS;
 210    }
 211
 212    cmd_buffer->state.pending_pipe_bits = bits;
 213 }
 214
 215 void genX(CmdPipelineBarrier)(
 216     VkCommandBuffer                             commandBuffer,
 217     VkPipelineStageFlags                        srcStageMask,
 218     VkPipelineStageFlags                        destStageMask,
 219     VkBool32                                    byRegion,
 220     uint32_t                                    memoryBarrierCount,
 221     const VkMemoryBarrier*                      pMemoryBarriers,
 222     uint32_t                                    bufferMemoryBarrierCount,
 223     const VkBufferMemoryBarrier*                pBufferMemoryBarriers,
 224     uint32_t                                    imageMemoryBarrierCount,
 225     const VkImageMemoryBarrier*                 pImageMemoryBarriers)
 226 {
 227    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
 228    uint32_t b;
 229
 230    /* XXX: Right now, we're really dumb and just flush whatever categories
 231     * the app asks for.  One of these days we may make this a bit better
 232     * but right now that's all the hardware allows for in most areas.
 233     */
 234    VkAccessFlags src_flags = 0;
 235    VkAccessFlags dst_flags = 0;
 236
 237    for (uint32_t i = 0; i < memoryBarrierCount; i++) {
 238       src_flags |= pMemoryBarriers[i].srcAccessMask;
 239       dst_flags |= pMemoryBarriers[i].dstAccessMask;
 240    }
 241
 242    for (uint32_t i = 0; i < bufferMemoryBarrierCount; i++) {
 243       src_flags |= pBufferMemoryBarriers[i].srcAccessMask;
 244       dst_flags |= pBufferMemoryBarriers[i].dstAccessMask;
 245    }
 246
 247    for (uint32_t i = 0; i < imageMemoryBarrierCount; i++) {
 248       src_flags |= pImageMemoryBarriers[i].srcAccessMask;
 249       dst_flags |= pImageMemoryBarriers[i].dstAccessMask;
 250    }
 251
 252    enum anv_pipe_bits pipe_bits = 0;
 253
 254    for_each_bit(b, src_flags) {
 255       switch ((VkAccessFlagBits)(1 << b)) {
 256       case VK_ACCESS_SHADER_WRITE_BIT:
 257          pipe_bits |= ANV_PIPE_DATA_CACHE_FLUSH_BIT;
 258          break;
 259       case VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT:
 260          pipe_bits |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
 261          break;
 262       case VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT:
 263          pipe_bits |= ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
 264          break;
 265       case VK_ACCESS_TRANSFER_WRITE_BIT:
 266          pipe_bits |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
 267          pipe_bits |= ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
 268          break;
 269       default:
 270          break; /* Nothing to do */
 271       }
 272    }
 273
 274    for_each_bit(b, dst_flags) {
 275       switch ((VkAccessFlagBits)(1 << b)) {
 276       case VK_ACCESS_INDIRECT_COMMAND_READ_BIT:
 277       case VK_ACCESS_INDEX_READ_BIT:
 278       case VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT:
 279          pipe_bits |= ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
 280          break;
 281       case VK_ACCESS_UNIFORM_READ_BIT:
 282          pipe_bits |= ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT;
 283          pipe_bits |= ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
 284          break;
 285       case VK_ACCESS_SHADER_READ_BIT:
 286       case VK_ACCESS_COLOR_ATTACHMENT_READ_BIT:
 287       case VK_ACCESS_TRANSFER_READ_BIT:
 288          pipe_bits |= ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
 289          break;
 290       default:
 291          break; /* Nothing to do */
 292       }
 293    }
 294
 295    cmd_buffer->state.pending_pipe_bits |= pipe_bits;
 296 }
 297
 298 static void
 299 cmd_buffer_alloc_push_constants(struct anv_cmd_buffer *cmd_buffer)
 300 {
 301    VkShaderStageFlags stages = cmd_buffer->state.pipeline->active_stages;
 302
 303    /* In order to avoid thrash, we assume that vertex and fragment stages
 304     * always exist.  In the rare case where one is missing *and* the other
 305     * uses push concstants, this may be suboptimal.  However, avoiding stalls
 306     * seems more important.
 307     */
 308    stages |= VK_SHADER_STAGE_FRAGMENT_BIT | VK_SHADER_STAGE_VERTEX_BIT;
 309
 310    if (stages == cmd_buffer->state.push_constant_stages)
 311       return;
 312
 313 #if GEN_GEN >= 8
 314    const unsigned push_constant_kb = 32;
 315 #elif GEN_IS_HASWELL
 316    const unsigned push_constant_kb = cmd_buffer->device->info.gt == 3 ? 32 : 16;
 317 #else
 318    const unsigned push_constant_kb = 16;
 319 #endif
 320
 321    const unsigned num_stages =
 322       _mesa_bitcount(stages & VK_SHADER_STAGE_ALL_GRAPHICS);
 323    unsigned size_per_stage = push_constant_kb / num_stages;
 324
 325    /* Broadwell+ and Haswell gt3 require that the push constant sizes be in
 326     * units of 2KB.  Incidentally, these are the same platforms that have
 327     * 32KB worth of push constant space.
 328     */
 329    if (push_constant_kb == 32)
 330       size_per_stage &= ~1u;
 331
 332    uint32_t kb_used = 0;
 333    for (int i = MESA_SHADER_VERTEX; i < MESA_SHADER_FRAGMENT; i++) {
 334       unsigned push_size = (stages & (1 << i)) ? size_per_stage : 0;
 335       anv_batch_emit(&cmd_buffer->batch,
 336                      GENX(3DSTATE_PUSH_CONSTANT_ALLOC_VS), alloc) {
 337          alloc._3DCommandSubOpcode  = 18 + i;
 338          alloc.ConstantBufferOffset = (push_size > 0) ? kb_used : 0;
 339          alloc.ConstantBufferSize   = push_size;
 340       }
 341       kb_used += push_size;
 342    }
 343
 344    anv_batch_emit(&cmd_buffer->batch,
 345                   GENX(3DSTATE_PUSH_CONSTANT_ALLOC_PS), alloc) {
 346       alloc.ConstantBufferOffset = kb_used;
 347       alloc.ConstantBufferSize = push_constant_kb - kb_used;
 348    }
 349
 350    cmd_buffer->state.push_constant_stages = stages;
 351
 352    /* From the BDW PRM for 3DSTATE_PUSH_CONSTANT_ALLOC_VS:
 353     *
 354     *    "The 3DSTATE_CONSTANT_VS must be reprogrammed prior to
 355     *    the next 3DPRIMITIVE command after programming the
 356     *    3DSTATE_PUSH_CONSTANT_ALLOC_VS"
 357     *
 358     * Since 3DSTATE_PUSH_CONSTANT_ALLOC_VS is programmed as part of
 359     * pipeline setup, we need to dirty push constants.
 360     */
 361    cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_ALL_GRAPHICS;
 362 }
 363
 364 static uint32_t
 365 cmd_buffer_flush_push_constants(struct anv_cmd_buffer *cmd_buffer)
 366 {
 367    static const uint32_t push_constant_opcodes[] = {
 368       [MESA_SHADER_VERTEX]                      = 21,
 369       [MESA_SHADER_TESS_CTRL]                   = 25, /* HS */
 370       [MESA_SHADER_TESS_EVAL]                   = 26, /* DS */
 371       [MESA_SHADER_GEOMETRY]                    = 22,
 372       [MESA_SHADER_FRAGMENT]                    = 23,
 373       [MESA_SHADER_COMPUTE]                     = 0,
 374    };
 375
 376    VkShaderStageFlags flushed = 0;
 377
 378    anv_foreach_stage(stage, cmd_buffer->state.push_constants_dirty) {
 379       if (stage == MESA_SHADER_COMPUTE)
 380          continue;
 381
 382       struct anv_state state = anv_cmd_buffer_push_constants(cmd_buffer, stage);
 383
 384       if (state.offset == 0) {
 385          anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_VS), c)
 386             c._3DCommandSubOpcode = push_constant_opcodes[stage];
 387       } else {
 388          anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_VS), c) {
 389             c._3DCommandSubOpcode = push_constant_opcodes[stage],
 390             c.ConstantBody = (struct GENX(3DSTATE_CONSTANT_BODY)) {
 391 #if GEN_GEN >= 9
 392                .PointerToConstantBuffer2 = { &cmd_buffer->device->dynamic_state_block_pool.bo, state.offset },
 393                .ConstantBuffer2ReadLength = DIV_ROUND_UP(state.alloc_size, 32),
 394 #else
 395                .PointerToConstantBuffer0 = { .offset = state.offset },
 396                .ConstantBuffer0ReadLength = DIV_ROUND_UP(state.alloc_size, 32),
 397 #endif
 398             };
 399          }
 400       }
 401
 402       flushed |= mesa_to_vk_shader_stage(stage);
 403    }
 404
 405    cmd_buffer->state.push_constants_dirty &= ~VK_SHADER_STAGE_ALL_GRAPHICS;
 406
 407    return flushed;
 408 }
 409
 410 void
 411 genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer)
 412 {
 413    struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
 414    uint32_t *p;
 415
 416    uint32_t vb_emit = cmd_buffer->state.vb_dirty & pipeline->vb_used;
 417
 418    assert((pipeline->active_stages & VK_SHADER_STAGE_COMPUTE_BIT) == 0);
 419
 420    genX(cmd_buffer_config_l3)(cmd_buffer, pipeline);
 421
 422    genX(flush_pipeline_select_3d)(cmd_buffer);
 423
 424    if (vb_emit) {
 425       const uint32_t num_buffers = __builtin_popcount(vb_emit);
 426       const uint32_t num_dwords = 1 + num_buffers * 4;
 427
 428       p = anv_batch_emitn(&cmd_buffer->batch, num_dwords,
 429                           GENX(3DSTATE_VERTEX_BUFFERS));
 430       uint32_t vb, i = 0;
 431       for_each_bit(vb, vb_emit) {
 432          struct anv_buffer *buffer = cmd_buffer->state.vertex_bindings[vb].buffer;
 433          uint32_t offset = cmd_buffer->state.vertex_bindings[vb].offset;
 434
 435          struct GENX(VERTEX_BUFFER_STATE) state = {
 436             .VertexBufferIndex = vb,
 437
 438 #if GEN_GEN >= 8
 439             .MemoryObjectControlState = GENX(MOCS),
 440 #else
 441             .BufferAccessType = pipeline->instancing_enable[vb] ? INSTANCEDATA : VERTEXDATA,
 442             .InstanceDataStepRate = 1,
 443             .VertexBufferMemoryObjectControlState = GENX(MOCS),
 444 #endif
 445
 446             .AddressModifyEnable = true,
 447             .BufferPitch = pipeline->binding_stride[vb],
 448             .BufferStartingAddress = { buffer->bo, buffer->offset + offset },
 449
 450 #if GEN_GEN >= 8
 451             .BufferSize = buffer->size - offset
 452 #else
 453             .EndAddress = { buffer->bo, buffer->offset + buffer->size - 1},
 454 #endif
 455          };
 456
 457          GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, &p[1 + i * 4], &state);
 458          i++;
 459       }
 460    }
 461
 462    cmd_buffer->state.vb_dirty &= ~vb_emit;
 463
 464    if (cmd_buffer->state.dirty & ANV_CMD_DIRTY_PIPELINE) {
 465       /* If somebody compiled a pipeline after starting a command buffer the
 466        * scratch bo may have grown since we started this cmd buffer (and
 467        * emitted STATE_BASE_ADDRESS).  If we're binding that pipeline now,
 468        * reemit STATE_BASE_ADDRESS so that we use the bigger scratch bo. */
 469       if (cmd_buffer->state.scratch_size < pipeline->total_scratch)
 470          anv_cmd_buffer_emit_state_base_address(cmd_buffer);
 471
 472       anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->batch);
 473
 474       /* If the pipeline changed, we may need to re-allocate push constant
 475        * space in the URB.
 476        */
 477       cmd_buffer_alloc_push_constants(cmd_buffer);
 478    }
 479
 480 #if GEN_GEN <= 7
 481    if (cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_VERTEX_BIT ||
 482        cmd_buffer->state.push_constants_dirty & VK_SHADER_STAGE_VERTEX_BIT) {
 483       /* From the IVB PRM Vol. 2, Part 1, Section 3.2.1:
 484        *
 485        *    "A PIPE_CONTROL with Post-Sync Operation set to 1h and a depth
 486        *    stall needs to be sent just prior to any 3DSTATE_VS,
 487        *    3DSTATE_URB_VS, 3DSTATE_CONSTANT_VS,
 488        *    3DSTATE_BINDING_TABLE_POINTER_VS,
 489        *    3DSTATE_SAMPLER_STATE_POINTER_VS command.  Only one
 490        *    PIPE_CONTROL needs to be sent before any combination of VS
 491        *    associated 3DSTATE."
 492        */
 493       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
 494          pc.DepthStallEnable  = true;
 495          pc.PostSyncOperation = WriteImmediateData;
 496          pc.Address           =
 497             (struct anv_address) { &cmd_buffer->device->workaround_bo, 0 };
 498       }
 499    }
 500 #endif
 501
 502    /* We emit the binding tables and sampler tables first, then emit push
 503     * constants and then finally emit binding table and sampler table
 504     * pointers.  It has to happen in this order, since emitting the binding
 505     * tables may change the push constants (in case of storage images). After
 506     * emitting push constants, on SKL+ we have to emit the corresponding
 507     * 3DSTATE_BINDING_TABLE_POINTER_* for the push constants to take effect.
 508     */
 509    uint32_t dirty = 0;
 510    if (cmd_buffer->state.descriptors_dirty)
 511       dirty = gen7_cmd_buffer_flush_descriptor_sets(cmd_buffer);
 512
 513    if (cmd_buffer->state.push_constants_dirty) {
 514 #if GEN_GEN >= 9
 515       /* On Sky Lake and later, the binding table pointers commands are
 516        * what actually flush the changes to push constant state so we need
 517        * to dirty them so they get re-emitted below.
 518        */
 519       dirty |= cmd_buffer_flush_push_constants(cmd_buffer);
 520 #else
 521       cmd_buffer_flush_push_constants(cmd_buffer);
 522 #endif
 523    }
 524
 525    if (dirty)
 526       gen7_cmd_buffer_emit_descriptor_pointers(cmd_buffer, dirty);
 527
 528    if (cmd_buffer->state.dirty & ANV_CMD_DIRTY_DYNAMIC_VIEWPORT)
 529       gen8_cmd_buffer_emit_viewport(cmd_buffer);
 530
 531    if (cmd_buffer->state.dirty & ANV_CMD_DIRTY_DYNAMIC_SCISSOR)
 532       gen7_cmd_buffer_emit_scissor(cmd_buffer);
 533
 534    genX(cmd_buffer_flush_dynamic_state)(cmd_buffer);
 535
 536    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
 537 }
 538
 539 static void
 540 emit_base_vertex_instance_bo(struct anv_cmd_buffer *cmd_buffer,
 541                              struct anv_bo *bo, uint32_t offset)
 542 {
 543    uint32_t *p = anv_batch_emitn(&cmd_buffer->batch, 5,
 544                                  GENX(3DSTATE_VERTEX_BUFFERS));
 545
 546    GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, p + 1,
 547       &(struct GENX(VERTEX_BUFFER_STATE)) {
 548          .VertexBufferIndex = 32, /* Reserved for this */
 549          .AddressModifyEnable = true,
 550          .BufferPitch = 0,
 551 #if (GEN_GEN >= 8)
 552          .MemoryObjectControlState = GENX(MOCS),
 553          .BufferStartingAddress = { bo, offset },
 554          .BufferSize = 8
 555 #else
 556          .VertexBufferMemoryObjectControlState = GENX(MOCS),
 557          .BufferStartingAddress = { bo, offset },
 558          .EndAddress = { bo, offset + 8 },
 559 #endif
 560       });
 561 }
 562
 563 static void
 564 emit_base_vertex_instance(struct anv_cmd_buffer *cmd_buffer,
 565                           uint32_t base_vertex, uint32_t base_instance)
 566 {
 567    struct anv_state id_state =
 568       anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 8, 4);
 569
 570    ((uint32_t *)id_state.map)[0] = base_vertex;
 571    ((uint32_t *)id_state.map)[1] = base_instance;
 572
 573    if (!cmd_buffer->device->info.has_llc)
 574       anv_state_clflush(id_state);
 575
 576    emit_base_vertex_instance_bo(cmd_buffer,
 577       &cmd_buffer->device->dynamic_state_block_pool.bo, id_state.offset);
 578 }
 579
 580 void genX(CmdDraw)(
 581     VkCommandBuffer                             commandBuffer,
 582     uint32_t                                    vertexCount,
 583     uint32_t                                    instanceCount,
 584     uint32_t                                    firstVertex,
 585     uint32_t                                    firstInstance)
 586 {
 587    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
 588    struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
 589    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
 590
 591    genX(cmd_buffer_flush_state)(cmd_buffer);
 592
 593    if (vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance)
 594       emit_base_vertex_instance(cmd_buffer, firstVertex, firstInstance);
 595
 596    anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
 597       prim.VertexAccessType         = SEQUENTIAL;
 598       prim.PrimitiveTopologyType    = pipeline->topology;
 599       prim.VertexCountPerInstance   = vertexCount;
 600       prim.StartVertexLocation      = firstVertex;
 601       prim.InstanceCount            = instanceCount;
 602       prim.StartInstanceLocation    = firstInstance;
 603       prim.BaseVertexLocation       = 0;
 604    }
 605 }
 606
 607 void genX(CmdDrawIndexed)(
 608     VkCommandBuffer                             commandBuffer,
 609     uint32_t                                    indexCount,
 610     uint32_t                                    instanceCount,
 611     uint32_t                                    firstIndex,
 612     int32_t                                     vertexOffset,
 613     uint32_t                                    firstInstance)
 614 {
 615    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
 616    struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
 617    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
 618
 619    genX(cmd_buffer_flush_state)(cmd_buffer);
 620
 621    if (vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance)
 622       emit_base_vertex_instance(cmd_buffer, vertexOffset, firstInstance);
 623
 624    anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
 625       prim.VertexAccessType         = RANDOM;
 626       prim.PrimitiveTopologyType    = pipeline->topology;
 627       prim.VertexCountPerInstance   = indexCount;
 628       prim.StartVertexLocation      = firstIndex;
 629       prim.InstanceCount            = instanceCount;
 630       prim.StartInstanceLocation    = firstInstance;
 631       prim.BaseVertexLocation       = vertexOffset;
 632    }
 633 }
 634
 635 /* Auto-Draw / Indirect Registers */
 636 #define GEN7_3DPRIM_END_OFFSET          0x2420
 637 #define GEN7_3DPRIM_START_VERTEX        0x2430
 638 #define GEN7_3DPRIM_VERTEX_COUNT        0x2434
 639 #define GEN7_3DPRIM_INSTANCE_COUNT      0x2438
 640 #define GEN7_3DPRIM_START_INSTANCE      0x243C
 641 #define GEN7_3DPRIM_BASE_VERTEX         0x2440
 642
 643 static void
 644 emit_lrm(struct anv_batch *batch,
 645          uint32_t reg, struct anv_bo *bo, uint32_t offset)
 646 {
 647    anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
 648       lrm.RegisterAddress  = reg;
 649       lrm.MemoryAddress    = (struct anv_address) { bo, offset };
 650    }
 651 }
 652
 653 static void
 654 emit_lri(struct anv_batch *batch, uint32_t reg, uint32_t imm)
 655 {
 656    anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
 657       lri.RegisterOffset   = reg;
 658       lri.DataDWord        = imm;
 659    }
 660 }
 661
 662 void genX(CmdDrawIndirect)(
 663     VkCommandBuffer                             commandBuffer,
 664     VkBuffer                                    _buffer,
 665     VkDeviceSize                                offset,
 666     uint32_t                                    drawCount,
 667     uint32_t                                    stride)
 668 {
 669    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
 670    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
 671    struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
 672    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
 673    struct anv_bo *bo = buffer->bo;
 674    uint32_t bo_offset = buffer->offset + offset;
 675
 676    genX(cmd_buffer_flush_state)(cmd_buffer);
 677
 678    if (vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance)
 679       emit_base_vertex_instance_bo(cmd_buffer, bo, bo_offset + 8);
 680
 681    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_VERTEX_COUNT, bo, bo_offset);
 682    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset + 4);
 683    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_VERTEX, bo, bo_offset + 8);
 684    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_INSTANCE, bo, bo_offset + 12);
 685    emit_lri(&cmd_buffer->batch, GEN7_3DPRIM_BASE_VERTEX, 0);
 686
 687    anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
 688       prim.IndirectParameterEnable  = true;
 689       prim.VertexAccessType         = SEQUENTIAL;
 690       prim.PrimitiveTopologyType    = pipeline->topology;
 691    }
 692 }
 693
 694 void genX(CmdDrawIndexedIndirect)(
 695     VkCommandBuffer                             commandBuffer,
 696     VkBuffer                                    _buffer,
 697     VkDeviceSize                                offset,
 698     uint32_t                                    drawCount,
 699     uint32_t                                    stride)
 700 {
 701    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
 702    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
 703    struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
 704    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
 705    struct anv_bo *bo = buffer->bo;
 706    uint32_t bo_offset = buffer->offset + offset;
 707
 708    genX(cmd_buffer_flush_state)(cmd_buffer);
 709
 710    /* TODO: We need to stomp base vertex to 0 somehow */
 711    if (vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance)
 712       emit_base_vertex_instance_bo(cmd_buffer, bo, bo_offset + 12);
 713
 714    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_VERTEX_COUNT, bo, bo_offset);
 715    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset + 4);
 716    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_VERTEX, bo, bo_offset + 8);
 717    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_BASE_VERTEX, bo, bo_offset + 12);
 718    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_INSTANCE, bo, bo_offset + 16);
 719
 720    anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
 721       prim.IndirectParameterEnable  = true;
 722       prim.VertexAccessType         = RANDOM;
 723       prim.PrimitiveTopologyType    = pipeline->topology;
 724    }
 725 }
 726
 727 #if GEN_GEN == 7
 728
 729 static bool
 730 verify_cmd_parser(const struct anv_device *device,
 731                   int required_version,
 732                   const char *function)
 733 {
 734    if (device->instance->physicalDevice.cmd_parser_version < required_version) {
 735       vk_errorf(VK_ERROR_FEATURE_NOT_PRESENT,
 736                 "cmd parser version %d is required for %s",
 737                 required_version, function);
 738       return false;
 739    } else {
 740       return true;
 741    }
 742 }
 743
 744 #endif
 745
 746 void genX(CmdDispatch)(
 747     VkCommandBuffer                             commandBuffer,
 748     uint32_t                                    x,
 749     uint32_t                                    y,
 750     uint32_t                                    z)
 751 {
 752    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
 753    struct anv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
 754    const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
 755
 756    if (prog_data->uses_num_work_groups) {
 757       struct anv_state state =
 758          anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 12, 4);
 759       uint32_t *sizes = state.map;
 760       sizes[0] = x;
 761       sizes[1] = y;
 762       sizes[2] = z;
 763       if (!cmd_buffer->device->info.has_llc)
 764          anv_state_clflush(state);
 765       cmd_buffer->state.num_workgroups_offset = state.offset;
 766       cmd_buffer->state.num_workgroups_bo =
 767          &cmd_buffer->device->dynamic_state_block_pool.bo;
 768    }
 769
 770    genX(cmd_buffer_flush_compute_state)(cmd_buffer);
 771
 772    anv_batch_emit(&cmd_buffer->batch, GENX(GPGPU_WALKER), ggw) {
 773       ggw.SIMDSize                     = prog_data->simd_size / 16;
 774       ggw.ThreadDepthCounterMaximum    = 0;
 775       ggw.ThreadHeightCounterMaximum   = 0;
 776       ggw.ThreadWidthCounterMaximum    = pipeline->cs_thread_width_max - 1;
 777       ggw.ThreadGroupIDXDimension      = x;
 778       ggw.ThreadGroupIDYDimension      = y;
 779       ggw.ThreadGroupIDZDimension      = z;
 780       ggw.RightExecutionMask           = pipeline->cs_right_mask;
 781       ggw.BottomExecutionMask          = 0xffffffff;
 782    }
 783
 784    anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_STATE_FLUSH), msf);
 785 }
 786
 787 #define GPGPU_DISPATCHDIMX 0x2500
 788 #define GPGPU_DISPATCHDIMY 0x2504
 789 #define GPGPU_DISPATCHDIMZ 0x2508
 790
 791 #define MI_PREDICATE_SRC0  0x2400
 792 #define MI_PREDICATE_SRC1  0x2408
 793
 794 void genX(CmdDispatchIndirect)(
 795     VkCommandBuffer                             commandBuffer,
 796     VkBuffer                                    _buffer,
 797     VkDeviceSize                                offset)
 798 {
 799    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
 800    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
 801    struct anv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
 802    const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
 803    struct anv_bo *bo = buffer->bo;
 804    uint32_t bo_offset = buffer->offset + offset;
 805    struct anv_batch *batch = &cmd_buffer->batch;
 806
 807 #if GEN_GEN == 7
 808    /* Linux 4.4 added command parser version 5 which allows the GPGPU
 809     * indirect dispatch registers to be written.
 810     */
 811    if (!verify_cmd_parser(cmd_buffer->device, 5, "vkCmdDispatchIndirect"))
 812       return;
 813 #endif
 814
 815    if (prog_data->uses_num_work_groups) {
 816       cmd_buffer->state.num_workgroups_offset = bo_offset;
 817       cmd_buffer->state.num_workgroups_bo = bo;
 818    }
 819
 820    genX(cmd_buffer_flush_compute_state)(cmd_buffer);
 821
 822    emit_lrm(batch, GPGPU_DISPATCHDIMX, bo, bo_offset);
 823    emit_lrm(batch, GPGPU_DISPATCHDIMY, bo, bo_offset + 4);
 824    emit_lrm(batch, GPGPU_DISPATCHDIMZ, bo, bo_offset + 8);
 825
 826 #if GEN_GEN <= 7
 827    /* Clear upper 32-bits of SRC0 and all 64-bits of SRC1 */
 828    emit_lri(batch, MI_PREDICATE_SRC0 + 4, 0);
 829    emit_lri(batch, MI_PREDICATE_SRC1 + 0, 0);
 830    emit_lri(batch, MI_PREDICATE_SRC1 + 4, 0);
 831
 832    /* Load compute_dispatch_indirect_x_size into SRC0 */
 833    emit_lrm(batch, MI_PREDICATE_SRC0, bo, bo_offset + 0);
 834
 835    /* predicate = (compute_dispatch_indirect_x_size == 0); */
 836    anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
 837       mip.LoadOperation    = LOAD_LOAD;
 838       mip.CombineOperation = COMBINE_SET;
 839       mip.CompareOperation = COMPARE_SRCS_EQUAL;
 840    }
 841
 842    /* Load compute_dispatch_indirect_y_size into SRC0 */
 843    emit_lrm(batch, MI_PREDICATE_SRC0, bo, bo_offset + 4);
 844
 845    /* predicate |= (compute_dispatch_indirect_y_size == 0); */
 846    anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
 847       mip.LoadOperation    = LOAD_LOAD;
 848       mip.CombineOperation = COMBINE_OR;
 849       mip.CompareOperation = COMPARE_SRCS_EQUAL;
 850    }
 851
 852    /* Load compute_dispatch_indirect_z_size into SRC0 */
 853    emit_lrm(batch, MI_PREDICATE_SRC0, bo, bo_offset + 8);
 854
 855    /* predicate |= (compute_dispatch_indirect_z_size == 0); */
 856    anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
 857       mip.LoadOperation    = LOAD_LOAD;
 858       mip.CombineOperation = COMBINE_OR;
 859       mip.CompareOperation = COMPARE_SRCS_EQUAL;
 860    }
 861
 862    /* predicate = !predicate; */
 863 #define COMPARE_FALSE                           1
 864    anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
 865       mip.LoadOperation    = LOAD_LOADINV;
 866       mip.CombineOperation = COMBINE_OR;
 867       mip.CompareOperation = COMPARE_FALSE;
 868    }
 869 #endif
 870
 871    anv_batch_emit(batch, GENX(GPGPU_WALKER), ggw) {
 872       ggw.IndirectParameterEnable      = true;
 873       ggw.PredicateEnable              = GEN_GEN <= 7;
 874       ggw.SIMDSize                     = prog_data->simd_size / 16;
 875       ggw.ThreadDepthCounterMaximum    = 0;
 876       ggw.ThreadHeightCounterMaximum   = 0;
 877       ggw.ThreadWidthCounterMaximum    = pipeline->cs_thread_width_max - 1;
 878       ggw.RightExecutionMask           = pipeline->cs_right_mask;
 879       ggw.BottomExecutionMask          = 0xffffffff;
 880    }
 881
 882    anv_batch_emit(batch, GENX(MEDIA_STATE_FLUSH), msf);
 883 }
 884
 885 static void
 886 flush_pipeline_before_pipeline_select(struct anv_cmd_buffer *cmd_buffer,
 887                                       uint32_t pipeline)
 888 {
 889 #if GEN_GEN >= 8 && GEN_GEN < 10
 890    /* From the Broadwell PRM, Volume 2a: Instructions, PIPELINE_SELECT:
 891     *
 892     *   Software must clear the COLOR_CALC_STATE Valid field in
 893     *   3DSTATE_CC_STATE_POINTERS command prior to send a PIPELINE_SELECT
 894     *   with Pipeline Select set to GPGPU.
 895     *
 896     * The internal hardware docs recommend the same workaround for Gen9
 897     * hardware too.
 898     */
 899    if (pipeline == GPGPU)
 900       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CC_STATE_POINTERS), t);
 901 #elif GEN_GEN <= 7
 902       /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
 903        * PIPELINE_SELECT [DevBWR+]":
 904        *
 905        *   Project: DEVSNB+
 906        *
 907        *   Software must ensure all the write caches are flushed through a
 908        *   stalling PIPE_CONTROL command followed by another PIPE_CONTROL
 909        *   command to invalidate read only caches prior to programming
 910        *   MI_PIPELINE_SELECT command to change the Pipeline Select Mode.
 911        */
 912       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
 913          pc.RenderTargetCacheFlushEnable  = true;
 914          pc.DepthCacheFlushEnable         = true;
 915          pc.DCFlushEnable                 = true;
 916          pc.PostSyncOperation             = NoWrite;
 917          pc.CommandStreamerStallEnable    = true;
 918       }
 919
 920       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
 921          pc.TextureCacheInvalidationEnable   = true;
 922          pc.ConstantCacheInvalidationEnable  = true;
 923          pc.StateCacheInvalidationEnable     = true;
 924          pc.InstructionCacheInvalidateEnable = true;
 925          pc.PostSyncOperation                = NoWrite;
 926       }
 927 #endif
 928 }
 929
 930 void
 931 genX(flush_pipeline_select_3d)(struct anv_cmd_buffer *cmd_buffer)
 932 {
 933    if (cmd_buffer->state.current_pipeline != _3D) {
 934       flush_pipeline_before_pipeline_select(cmd_buffer, _3D);
 935
 936       anv_batch_emit(&cmd_buffer->batch, GENX(PIPELINE_SELECT), ps) {
 937 #if GEN_GEN >= 9
 938          ps.MaskBits = 3;
 939 #endif
 940          ps.PipelineSelection = _3D;
 941       }
 942
 943       cmd_buffer->state.current_pipeline = _3D;
 944    }
 945 }
 946
 947 void
 948 genX(flush_pipeline_select_gpgpu)(struct anv_cmd_buffer *cmd_buffer)
 949 {
 950    if (cmd_buffer->state.current_pipeline != GPGPU) {
 951       flush_pipeline_before_pipeline_select(cmd_buffer, GPGPU);
 952
 953       anv_batch_emit(&cmd_buffer->batch, GENX(PIPELINE_SELECT), ps) {
 954 #if GEN_GEN >= 9
 955          ps.MaskBits = 3;
 956 #endif
 957          ps.PipelineSelection = GPGPU;
 958       }
 959
 960       cmd_buffer->state.current_pipeline = GPGPU;
 961    }
 962 }
 963
 964 struct anv_state
 965 genX(cmd_buffer_alloc_null_surface_state)(struct anv_cmd_buffer *cmd_buffer,
 966                                           struct anv_framebuffer *fb)
 967 {
 968    struct anv_state state =
 969       anv_state_stream_alloc(&cmd_buffer->surface_state_stream, 64, 64);
 970
 971    struct GENX(RENDER_SURFACE_STATE) null_ss = {
 972       .SurfaceType = SURFTYPE_NULL,
 973       .SurfaceArray = fb->layers > 0,
 974       .SurfaceFormat = ISL_FORMAT_R8G8B8A8_UNORM,
 975 #if GEN_GEN >= 8
 976       .TileMode = YMAJOR,
 977 #else
 978       .TiledSurface = true,
 979 #endif
 980       .Width = fb->width - 1,
 981       .Height = fb->height - 1,
 982       .Depth = fb->layers - 1,
 983       .RenderTargetViewExtent = fb->layers - 1,
 984    };
 985
 986    GENX(RENDER_SURFACE_STATE_pack)(NULL, state.map, &null_ss);
 987
 988    if (!cmd_buffer->device->info.has_llc)
 989       anv_state_clflush(state);
 990
 991    return state;
 992 }
 993
 994 static void
 995 cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer)
 996 {
 997    struct anv_device *device = cmd_buffer->device;
 998    const struct anv_framebuffer *fb = cmd_buffer->state.framebuffer;
 999    const struct anv_image_view *iview =
1000       anv_cmd_buffer_get_depth_stencil_view(cmd_buffer);
1001    const struct anv_image *image = iview ? iview->image : NULL;
1002    const bool has_depth = image && (image->aspects & VK_IMAGE_ASPECT_DEPTH_BIT);
1003    const bool has_stencil =
1004       image && (image->aspects & VK_IMAGE_ASPECT_STENCIL_BIT);
1005
1006    /* FIXME: Implement the PMA stall W/A */
1007    /* FIXME: Width and Height are wrong */
1008
1009    /* Emit 3DSTATE_DEPTH_BUFFER */
1010    if (has_depth) {
1011       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_DEPTH_BUFFER), db) {
1012          db.SurfaceType                   = SURFTYPE_2D;
1013          db.DepthWriteEnable              = true;
1014          db.StencilWriteEnable            = has_stencil;
1015          db.HierarchicalDepthBufferEnable = false;
1016
1017          db.SurfaceFormat = isl_surf_get_depth_format(&device->isl_dev,
1018                                                       &image->depth_surface.isl);
1019
1020          db.SurfaceBaseAddress = (struct anv_address) {
1021             .bo = image->bo,
1022             .offset = image->offset + image->depth_surface.offset,
1023          };
1024          db.DepthBufferObjectControlState = GENX(MOCS),
1025
1026          db.SurfacePitch         = image->depth_surface.isl.row_pitch - 1;
1027          db.Height               = fb->height - 1;
1028          db.Width                = fb->width - 1;
1029          db.LOD                  = 0;
1030          db.Depth                = 1 - 1;
1031          db.MinimumArrayElement  = 0;
1032
1033 #if GEN_GEN >= 8
1034          db.SurfaceQPitch =
1035             isl_surf_get_array_pitch_el_rows(&image->depth_surface.isl) >> 2,
1036 #endif
1037          db.RenderTargetViewExtent = 1 - 1;
1038       }
1039    } else {
1040       /* Even when no depth buffer is present, the hardware requires that
1041        * 3DSTATE_DEPTH_BUFFER be programmed correctly. The Broadwell PRM says:
1042        *
1043        *    If a null depth buffer is bound, the driver must instead bind depth as:
1044        *       3DSTATE_DEPTH.SurfaceType = SURFTYPE_2D
1045        *       3DSTATE_DEPTH.Width = 1
1046        *       3DSTATE_DEPTH.Height = 1
1047        *       3DSTATE_DEPTH.SuraceFormat = D16_UNORM
1048        *       3DSTATE_DEPTH.SurfaceBaseAddress = 0
1049        *       3DSTATE_DEPTH.HierarchicalDepthBufferEnable = 0
1050        *       3DSTATE_WM_DEPTH_STENCIL.DepthTestEnable = 0
1051        *       3DSTATE_WM_DEPTH_STENCIL.DepthBufferWriteEnable = 0
1052        *
1053        * The PRM is wrong, though. The width and height must be programmed to
1054        * actual framebuffer's width and height, even when neither depth buffer
1055        * nor stencil buffer is present.  Also, D16_UNORM is not allowed to
1056        * be combined with a stencil buffer so we use D32_FLOAT instead.
1057        */
1058       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_DEPTH_BUFFER), db) {
1059          db.SurfaceType          = SURFTYPE_2D;
1060          db.SurfaceFormat        = D32_FLOAT;
1061          db.Width                = fb->width - 1;
1062          db.Height               = fb->height - 1;
1063          db.StencilWriteEnable   = has_stencil;
1064       }
1065    }
1066
1067    /* Emit 3DSTATE_STENCIL_BUFFER */
1068    if (has_stencil) {
1069       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_STENCIL_BUFFER), sb) {
1070 #if GEN_GEN >= 8 || GEN_IS_HASWELL
1071          sb.StencilBufferEnable = true,
1072 #endif
1073          sb.StencilBufferObjectControlState = GENX(MOCS),
1074
1075          /* Stencil buffers have strange pitch. The PRM says:
1076           *
1077           *    The pitch must be set to 2x the value computed based on width,
1078           *    as the stencil buffer is stored with two rows interleaved.
1079           */
1080          sb.SurfacePitch = 2 * image->stencil_surface.isl.row_pitch - 1,
1081
1082 #if GEN_GEN >= 8
1083          sb.SurfaceQPitch = isl_surf_get_array_pitch_el_rows(&image->stencil_surface.isl) >> 2,
1084 #endif
1085          sb.SurfaceBaseAddress = (struct anv_address) {
1086             .bo = image->bo,
1087             .offset = image->offset + image->stencil_surface.offset,
1088          };
1089       }
1090    } else {
1091       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_STENCIL_BUFFER), sb);
1092    }
1093
1094    /* Disable hierarchial depth buffers. */
1095    anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_HIER_DEPTH_BUFFER), hz);
1096
1097    /* Clear the clear params. */
1098    anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CLEAR_PARAMS), cp);
1099 }
1100
1101 /**
1102  * @see anv_cmd_buffer_set_subpass()
1103  */
1104 void
1105 genX(cmd_buffer_set_subpass)(struct anv_cmd_buffer *cmd_buffer,
1106                              struct anv_subpass *subpass)
1107 {
1108    cmd_buffer->state.subpass = subpass;
1109
1110    cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT;
1111
1112    cmd_buffer_emit_depth_stencil(cmd_buffer);
1113 }
1114
1115 void genX(CmdBeginRenderPass)(
1116     VkCommandBuffer                             commandBuffer,
1117     const VkRenderPassBeginInfo*                pRenderPassBegin,
1118     VkSubpassContents                           contents)
1119 {
1120    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1121    ANV_FROM_HANDLE(anv_render_pass, pass, pRenderPassBegin->renderPass);
1122    ANV_FROM_HANDLE(anv_framebuffer, framebuffer, pRenderPassBegin->framebuffer);
1123
1124    cmd_buffer->state.framebuffer = framebuffer;
1125    cmd_buffer->state.pass = pass;
1126    cmd_buffer->state.render_area = pRenderPassBegin->renderArea;
1127    anv_cmd_state_setup_attachments(cmd_buffer, pRenderPassBegin);
1128
1129    genX(flush_pipeline_select_3d)(cmd_buffer);
1130
1131    genX(cmd_buffer_set_subpass)(cmd_buffer, pass->subpasses);
1132    anv_cmd_buffer_clear_subpass(cmd_buffer);
1133 }
1134
1135 void genX(CmdNextSubpass)(
1136     VkCommandBuffer                             commandBuffer,
1137     VkSubpassContents                           contents)
1138 {
1139    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1140
1141    assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
1142
1143    anv_cmd_buffer_resolve_subpass(cmd_buffer);
1144    genX(cmd_buffer_set_subpass)(cmd_buffer, cmd_buffer->state.subpass + 1);
1145    anv_cmd_buffer_clear_subpass(cmd_buffer);
1146 }
1147
1148 void genX(CmdEndRenderPass)(
1149     VkCommandBuffer                             commandBuffer)
1150 {
1151    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1152
1153    anv_cmd_buffer_resolve_subpass(cmd_buffer);
1154 }
1155
1156 static void
1157 emit_ps_depth_count(struct anv_batch *batch,
1158                     struct anv_bo *bo, uint32_t offset)
1159 {
1160    anv_batch_emit(batch, GENX(PIPE_CONTROL), pc) {
1161       pc.DestinationAddressType  = DAT_PPGTT;
1162       pc.PostSyncOperation       = WritePSDepthCount;
1163       pc.DepthStallEnable        = true;
1164       pc.Address                 = (struct anv_address) { bo, offset };
1165    }
1166 }
1167
1168 static void
1169 emit_query_availability(struct anv_batch *batch,
1170                         struct anv_bo *bo, uint32_t offset)
1171 {
1172    anv_batch_emit(batch, GENX(PIPE_CONTROL), pc) {
1173       pc.DestinationAddressType  = DAT_PPGTT;
1174       pc.PostSyncOperation       = WriteImmediateData;
1175       pc.Address                 = (struct anv_address) { bo, offset };
1176       pc.ImmediateData           = 1;
1177    }
1178 }
1179
1180 void genX(CmdBeginQuery)(
1181     VkCommandBuffer                             commandBuffer,
1182     VkQueryPool                                 queryPool,
1183     uint32_t                                    query,
1184     VkQueryControlFlags                         flags)
1185 {
1186    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1187    ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
1188
1189    /* Workaround: When meta uses the pipeline with the VS disabled, it seems
1190     * that the pipelining of the depth write breaks. What we see is that
1191     * samples from the render pass clear leaks into the first query
1192     * immediately after the clear. Doing a pipecontrol with a post-sync
1193     * operation and DepthStallEnable seems to work around the issue.
1194     */
1195    if (cmd_buffer->state.need_query_wa) {
1196       cmd_buffer->state.need_query_wa = false;
1197       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1198          pc.DepthCacheFlushEnable   = true;
1199          pc.DepthStallEnable        = true;
1200       }
1201    }
1202
1203    switch (pool->type) {
1204    case VK_QUERY_TYPE_OCCLUSION:
1205       emit_ps_depth_count(&cmd_buffer->batch, &pool->bo,
1206                           query * sizeof(struct anv_query_pool_slot));
1207       break;
1208
1209    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
1210    default:
1211       unreachable("");
1212    }
1213 }
1214
1215 void genX(CmdEndQuery)(
1216     VkCommandBuffer                             commandBuffer,
1217     VkQueryPool                                 queryPool,
1218     uint32_t                                    query)
1219 {
1220    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1221    ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
1222
1223    switch (pool->type) {
1224    case VK_QUERY_TYPE_OCCLUSION:
1225       emit_ps_depth_count(&cmd_buffer->batch, &pool->bo,
1226                           query * sizeof(struct anv_query_pool_slot) + 8);
1227
1228       emit_query_availability(&cmd_buffer->batch, &pool->bo,
1229                               query * sizeof(struct anv_query_pool_slot) + 16);
1230       break;
1231
1232    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
1233    default:
1234       unreachable("");
1235    }
1236 }
1237
1238 #define TIMESTAMP 0x2358
1239
1240 void genX(CmdWriteTimestamp)(
1241     VkCommandBuffer                             commandBuffer,
1242     VkPipelineStageFlagBits                     pipelineStage,
1243     VkQueryPool                                 queryPool,
1244     uint32_t                                    query)
1245 {
1246    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1247    ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
1248    uint32_t offset = query * sizeof(struct anv_query_pool_slot);
1249
1250    assert(pool->type == VK_QUERY_TYPE_TIMESTAMP);
1251
1252    switch (pipelineStage) {
1253    case VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT:
1254       anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {
1255          srm.RegisterAddress  = TIMESTAMP;
1256          srm.MemoryAddress    = (struct anv_address) { &pool->bo, offset };
1257       }
1258       anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {
1259          srm.RegisterAddress  = TIMESTAMP + 4;
1260          srm.MemoryAddress    = (struct anv_address) { &pool->bo, offset + 4 };
1261       }
1262       break;
1263
1264    default:
1265       /* Everything else is bottom-of-pipe */
1266       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1267          pc.DestinationAddressType  = DAT_PPGTT,
1268          pc.PostSyncOperation       = WriteTimestamp,
1269          pc.Address = (struct anv_address) { &pool->bo, offset };
1270       }
1271       break;
1272    }
1273
1274    emit_query_availability(&cmd_buffer->batch, &pool->bo, query + 16);
1275 }
1276
1277 #if GEN_GEN > 7 || GEN_IS_HASWELL
1278
1279 #define alu_opcode(v)   __gen_uint((v),  20, 31)
1280 #define alu_operand1(v) __gen_uint((v),  10, 19)
1281 #define alu_operand2(v) __gen_uint((v),   0,  9)
1282 #define alu(opcode, operand1, operand2) \
1283    alu_opcode(opcode) | alu_operand1(operand1) | alu_operand2(operand2)
1284
1285 #define OPCODE_NOOP      0x000
1286 #define OPCODE_LOAD      0x080
1287 #define OPCODE_LOADINV   0x480
1288 #define OPCODE_LOAD0     0x081
1289 #define OPCODE_LOAD1     0x481
1290 #define OPCODE_ADD       0x100
1291 #define OPCODE_SUB       0x101
1292 #define OPCODE_AND       0x102
1293 #define OPCODE_OR        0x103
1294 #define OPCODE_XOR       0x104
1295 #define OPCODE_STORE     0x180
1296 #define OPCODE_STOREINV  0x580
1297
1298 #define OPERAND_R0   0x00
1299 #define OPERAND_R1   0x01
1300 #define OPERAND_R2   0x02
1301 #define OPERAND_R3   0x03
1302 #define OPERAND_R4   0x04
1303 #define OPERAND_SRCA 0x20
1304 #define OPERAND_SRCB 0x21
1305 #define OPERAND_ACCU 0x31
1306 #define OPERAND_ZF   0x32
1307 #define OPERAND_CF   0x33
1308
1309 #define CS_GPR(n) (0x2600 + (n) * 8)
1310
1311 static void
1312 emit_load_alu_reg_u64(struct anv_batch *batch, uint32_t reg,
1313                       struct anv_bo *bo, uint32_t offset)
1314 {
1315    anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
1316       lrm.RegisterAddress  = reg,
1317       lrm.MemoryAddress    = (struct anv_address) { bo, offset };
1318    }
1319    anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
1320       lrm.RegisterAddress  = reg + 4;
1321       lrm.MemoryAddress    = (struct anv_address) { bo, offset + 4 };
1322    }
1323 }
1324
1325 static void
1326 store_query_result(struct anv_batch *batch, uint32_t reg,
1327                    struct anv_bo *bo, uint32_t offset, VkQueryResultFlags flags)
1328 {
1329    anv_batch_emit(batch, GENX(MI_STORE_REGISTER_MEM), srm) {
1330       srm.RegisterAddress  = reg;
1331       srm.MemoryAddress    = (struct anv_address) { bo, offset };
1332    }
1333
1334    if (flags & VK_QUERY_RESULT_64_BIT) {
1335       anv_batch_emit(batch, GENX(MI_STORE_REGISTER_MEM), srm) {
1336          srm.RegisterAddress  = reg + 4;
1337          srm.MemoryAddress    = (struct anv_address) { bo, offset + 4 };
1338       }
1339    }
1340 }
1341
1342 void genX(CmdCopyQueryPoolResults)(
1343     VkCommandBuffer                             commandBuffer,
1344     VkQueryPool                                 queryPool,
1345     uint32_t                                    firstQuery,
1346     uint32_t                                    queryCount,
1347     VkBuffer                                    destBuffer,
1348     VkDeviceSize                                destOffset,
1349     VkDeviceSize                                destStride,
1350     VkQueryResultFlags                          flags)
1351 {
1352    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1353    ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
1354    ANV_FROM_HANDLE(anv_buffer, buffer, destBuffer);
1355    uint32_t slot_offset, dst_offset;
1356
1357    if (flags & VK_QUERY_RESULT_WAIT_BIT) {
1358       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1359          pc.CommandStreamerStallEnable = true;
1360          pc.StallAtPixelScoreboard     = true;
1361       }
1362    }
1363
1364    dst_offset = buffer->offset + destOffset;
1365    for (uint32_t i = 0; i < queryCount; i++) {
1366
1367       slot_offset = (firstQuery + i) * sizeof(struct anv_query_pool_slot);
1368       switch (pool->type) {
1369       case VK_QUERY_TYPE_OCCLUSION:
1370          emit_load_alu_reg_u64(&cmd_buffer->batch,
1371                                CS_GPR(0), &pool->bo, slot_offset);
1372          emit_load_alu_reg_u64(&cmd_buffer->batch,
1373                                CS_GPR(1), &pool->bo, slot_offset + 8);
1374
1375          /* FIXME: We need to clamp the result for 32 bit. */
1376
1377          uint32_t *dw = anv_batch_emitn(&cmd_buffer->batch, 5, GENX(MI_MATH));
1378          dw[1] = alu(OPCODE_LOAD, OPERAND_SRCA, OPERAND_R1);
1379          dw[2] = alu(OPCODE_LOAD, OPERAND_SRCB, OPERAND_R0);
1380          dw[3] = alu(OPCODE_SUB, 0, 0);
1381          dw[4] = alu(OPCODE_STORE, OPERAND_R2, OPERAND_ACCU);
1382          break;
1383
1384       case VK_QUERY_TYPE_TIMESTAMP:
1385          emit_load_alu_reg_u64(&cmd_buffer->batch,
1386                                CS_GPR(2), &pool->bo, slot_offset);
1387          break;
1388
1389       default:
1390          unreachable("unhandled query type");
1391       }
1392
1393       store_query_result(&cmd_buffer->batch,
1394                          CS_GPR(2), buffer->bo, dst_offset, flags);
1395
1396       if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
1397          emit_load_alu_reg_u64(&cmd_buffer->batch, CS_GPR(0),
1398                                &pool->bo, slot_offset + 16);
1399          if (flags & VK_QUERY_RESULT_64_BIT)
1400             store_query_result(&cmd_buffer->batch,
1401                                CS_GPR(0), buffer->bo, dst_offset + 8, flags);
1402          else
1403             store_query_result(&cmd_buffer->batch,
1404                                CS_GPR(0), buffer->bo, dst_offset + 4, flags);
1405       }
1406
1407       dst_offset += destStride;
1408    }
1409 }
1410
1411 #endif