src/intel/vulkan/genX_cmd_buffer.c

   1 /*
   2  * Copyright © 2015 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include <assert.h>
  25 #include <stdbool.h>
  26
  27 #include "anv_private.h"
  28
  29 #include "genxml/gen_macros.h"
  30 #include "genxml/genX_pack.h"
  31
  32 void
  33 genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer)
  34 {
  35    struct anv_device *device = cmd_buffer->device;
  36    struct anv_bo *scratch_bo = NULL;
  37
  38    cmd_buffer->state.scratch_size =
  39       anv_block_pool_size(&device->scratch_block_pool);
  40    if (cmd_buffer->state.scratch_size > 0)
  41       scratch_bo = &device->scratch_block_pool.bo;
  42
  43 /* XXX: Do we need this on more than just BDW? */
  44 #if (GEN_GEN >= 8)
  45    /* Emit a render target cache flush.
  46     *
  47     * This isn't documented anywhere in the PRM.  However, it seems to be
  48     * necessary prior to changing the surface state base adress.  Without
  49     * this, we get GPU hangs when using multi-level command buffers which
  50     * clear depth, reset state base address, and then go render stuff.
  51     */
  52    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
  53       pc.RenderTargetCacheFlushEnable = true;
  54    }
  55 #endif
  56
  57    anv_batch_emit(&cmd_buffer->batch, GENX(STATE_BASE_ADDRESS), sba) {
  58       sba.GeneralStateBaseAddress = (struct anv_address) { scratch_bo, 0 };
  59       sba.GeneralStateMemoryObjectControlState = GENX(MOCS);
  60       sba.GeneralStateBaseAddressModifyEnable = true;
  61
  62       sba.SurfaceStateBaseAddress =
  63          anv_cmd_buffer_surface_base_address(cmd_buffer);
  64       sba.SurfaceStateMemoryObjectControlState = GENX(MOCS);
  65       sba.SurfaceStateBaseAddressModifyEnable = true;
  66
  67       sba.DynamicStateBaseAddress =
  68          (struct anv_address) { &device->dynamic_state_block_pool.bo, 0 };
  69       sba.DynamicStateMemoryObjectControlState = GENX(MOCS),
  70       sba.DynamicStateBaseAddressModifyEnable = true,
  71
  72       sba.IndirectObjectBaseAddress = (struct anv_address) { NULL, 0 };
  73       sba.IndirectObjectMemoryObjectControlState = GENX(MOCS);
  74       sba.IndirectObjectBaseAddressModifyEnable = true;
  75
  76       sba.InstructionBaseAddress =
  77          (struct anv_address) { &device->instruction_block_pool.bo, 0 };
  78       sba.InstructionMemoryObjectControlState = GENX(MOCS);
  79       sba.InstructionBaseAddressModifyEnable = true;
  80
  81 #  if (GEN_GEN >= 8)
  82       /* Broadwell requires that we specify a buffer size for a bunch of
  83        * these fields.  However, since we will be growing the BO's live, we
  84        * just set them all to the maximum.
  85        */
  86       sba.GeneralStateBufferSize                = 0xfffff;
  87       sba.GeneralStateBufferSizeModifyEnable    = true;
  88       sba.DynamicStateBufferSize                = 0xfffff;
  89       sba.DynamicStateBufferSizeModifyEnable    = true;
  90       sba.IndirectObjectBufferSize              = 0xfffff;
  91       sba.IndirectObjectBufferSizeModifyEnable  = true;
  92       sba.InstructionBufferSize                 = 0xfffff;
  93       sba.InstructionBuffersizeModifyEnable     = true;
  94 #  endif
  95    }
  96
  97    /* After re-setting the surface state base address, we have to do some
  98     * cache flusing so that the sampler engine will pick up the new
  99     * SURFACE_STATE objects and binding tables. From the Broadwell PRM,
 100     * Shared Function > 3D Sampler > State > State Caching (page 96):
 101     *
 102     *    Coherency with system memory in the state cache, like the texture
 103     *    cache is handled partially by software. It is expected that the
 104     *    command stream or shader will issue Cache Flush operation or
 105     *    Cache_Flush sampler message to ensure that the L1 cache remains
 106     *    coherent with system memory.
 107     *
 108     *    [...]
 109     *
 110     *    Whenever the value of the Dynamic_State_Base_Addr,
 111     *    Surface_State_Base_Addr are altered, the L1 state cache must be
 112     *    invalidated to ensure the new surface or sampler state is fetched
 113     *    from system memory.
 114     *
 115     * The PIPE_CONTROL command has a "State Cache Invalidation Enable" bit
 116     * which, according the PIPE_CONTROL instruction documentation in the
 117     * Broadwell PRM:
 118     *
 119     *    Setting this bit is independent of any other bit in this packet.
 120     *    This bit controls the invalidation of the L1 and L2 state caches
 121     *    at the top of the pipe i.e. at the parsing time.
 122     *
 123     * Unfortunately, experimentation seems to indicate that state cache
 124     * invalidation through a PIPE_CONTROL does nothing whatsoever in
 125     * regards to surface state and binding tables.  In stead, it seems that
 126     * invalidating the texture cache is what is actually needed.
 127     *
 128     * XXX:  As far as we have been able to determine through
 129     * experimentation, shows that flush the texture cache appears to be
 130     * sufficient.  The theory here is that all of the sampling/rendering
 131     * units cache the binding table in the texture cache.  However, we have
 132     * yet to be able to actually confirm this.
 133     */
 134    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
 135       pc.TextureCacheInvalidationEnable = true;
 136    }
 137 }
 138
 139 void
 140 genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer)
 141 {
 142    enum anv_pipe_bits bits = cmd_buffer->state.pending_pipe_bits;
 143
 144    /* Flushes are pipelined while invalidations are handled immediately.
 145     * Therefore, if we're flushing anything then we need to schedule a stall
 146     * before any invalidations can happen.
 147     */
 148    if (bits & ANV_PIPE_FLUSH_BITS)
 149       bits |= ANV_PIPE_NEEDS_CS_STALL_BIT;
 150
 151    /* If we're going to do an invalidate and we have a pending CS stall that
 152     * has yet to be resolved, we do the CS stall now.
 153     */
 154    if ((bits & ANV_PIPE_INVALIDATE_BITS) &&
 155        (bits & ANV_PIPE_NEEDS_CS_STALL_BIT)) {
 156       bits |= ANV_PIPE_CS_STALL_BIT;
 157       bits &= ~ANV_PIPE_NEEDS_CS_STALL_BIT;
 158    }
 159
 160    if (bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_CS_STALL_BIT)) {
 161       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
 162          pipe.DepthCacheFlushEnable = bits & ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
 163          pipe.DCFlushEnable = bits & ANV_PIPE_DATA_CACHE_FLUSH_BIT;
 164          pipe.RenderTargetCacheFlushEnable =
 165             bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
 166
 167          pipe.DepthStallEnable = bits & ANV_PIPE_DEPTH_STALL_BIT;
 168          pipe.CommandStreamerStallEnable = bits & ANV_PIPE_CS_STALL_BIT;
 169          pipe.StallAtPixelScoreboard = bits & ANV_PIPE_STALL_AT_SCOREBOARD_BIT;
 170
 171          /*
 172           * According to the Broadwell documentation, any PIPE_CONTROL with the
 173           * "Command Streamer Stall" bit set must also have another bit set,
 174           * with five different options:
 175           *
 176           *  - Render Target Cache Flush
 177           *  - Depth Cache Flush
 178           *  - Stall at Pixel Scoreboard
 179           *  - Post-Sync Operation
 180           *  - Depth Stall
 181           *  - DC Flush Enable
 182           *
 183           * I chose "Stall at Pixel Scoreboard" since that's what we use in
 184           * mesa and it seems to work fine. The choice is fairly arbitrary.
 185           */
 186          if ((bits & ANV_PIPE_CS_STALL_BIT) &&
 187              !(bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_DEPTH_STALL_BIT |
 188                        ANV_PIPE_STALL_AT_SCOREBOARD_BIT)))
 189             pipe.StallAtPixelScoreboard = true;
 190       }
 191
 192       bits &= ~(ANV_PIPE_FLUSH_BITS | ANV_PIPE_CS_STALL_BIT);
 193    }
 194
 195    if (bits & ANV_PIPE_INVALIDATE_BITS) {
 196       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
 197          pipe.StateCacheInvalidationEnable =
 198             bits & ANV_PIPE_STATE_CACHE_INVALIDATE_BIT;
 199          pipe.ConstantCacheInvalidationEnable =
 200             bits & ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT;
 201          pipe.VFCacheInvalidationEnable =
 202             bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
 203          pipe.TextureCacheInvalidationEnable =
 204             bits & ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
 205          pipe.InstructionCacheInvalidateEnable =
 206             bits & ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT;
 207       }
 208
 209       bits &= ~ANV_PIPE_INVALIDATE_BITS;
 210    }
 211
 212    cmd_buffer->state.pending_pipe_bits = bits;
 213 }
 214
 215 void genX(CmdPipelineBarrier)(
 216     VkCommandBuffer                             commandBuffer,
 217     VkPipelineStageFlags                        srcStageMask,
 218     VkPipelineStageFlags                        destStageMask,
 219     VkBool32                                    byRegion,
 220     uint32_t                                    memoryBarrierCount,
 221     const VkMemoryBarrier*                      pMemoryBarriers,
 222     uint32_t                                    bufferMemoryBarrierCount,
 223     const VkBufferMemoryBarrier*                pBufferMemoryBarriers,
 224     uint32_t                                    imageMemoryBarrierCount,
 225     const VkImageMemoryBarrier*                 pImageMemoryBarriers)
 226 {
 227    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
 228    uint32_t b;
 229
 230    /* XXX: Right now, we're really dumb and just flush whatever categories
 231     * the app asks for.  One of these days we may make this a bit better
 232     * but right now that's all the hardware allows for in most areas.
 233     */
 234    VkAccessFlags src_flags = 0;
 235    VkAccessFlags dst_flags = 0;
 236
 237    for (uint32_t i = 0; i < memoryBarrierCount; i++) {
 238       src_flags |= pMemoryBarriers[i].srcAccessMask;
 239       dst_flags |= pMemoryBarriers[i].dstAccessMask;
 240    }
 241
 242    for (uint32_t i = 0; i < bufferMemoryBarrierCount; i++) {
 243       src_flags |= pBufferMemoryBarriers[i].srcAccessMask;
 244       dst_flags |= pBufferMemoryBarriers[i].dstAccessMask;
 245    }
 246
 247    for (uint32_t i = 0; i < imageMemoryBarrierCount; i++) {
 248       src_flags |= pImageMemoryBarriers[i].srcAccessMask;
 249       dst_flags |= pImageMemoryBarriers[i].dstAccessMask;
 250    }
 251
 252    enum anv_pipe_bits pipe_bits = 0;
 253
 254    for_each_bit(b, src_flags) {
 255       switch ((VkAccessFlagBits)(1 << b)) {
 256       case VK_ACCESS_SHADER_WRITE_BIT:
 257          pipe_bits |= ANV_PIPE_DATA_CACHE_FLUSH_BIT;
 258          break;
 259       case VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT:
 260          pipe_bits |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
 261          break;
 262       case VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT:
 263          pipe_bits |= ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
 264          break;
 265       case VK_ACCESS_TRANSFER_WRITE_BIT:
 266          pipe_bits |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
 267          pipe_bits |= ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
 268          break;
 269       default:
 270          break; /* Nothing to do */
 271       }
 272    }
 273
 274    for_each_bit(b, dst_flags) {
 275       switch ((VkAccessFlagBits)(1 << b)) {
 276       case VK_ACCESS_INDIRECT_COMMAND_READ_BIT:
 277       case VK_ACCESS_INDEX_READ_BIT:
 278       case VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT:
 279          pipe_bits |= ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
 280          break;
 281       case VK_ACCESS_UNIFORM_READ_BIT:
 282          pipe_bits |= ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT;
 283          pipe_bits |= ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
 284          break;
 285       case VK_ACCESS_SHADER_READ_BIT:
 286       case VK_ACCESS_COLOR_ATTACHMENT_READ_BIT:
 287       case VK_ACCESS_TRANSFER_READ_BIT:
 288          pipe_bits |= ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
 289          break;
 290       default:
 291          break; /* Nothing to do */
 292       }
 293    }
 294
 295    cmd_buffer->state.pending_pipe_bits |= pipe_bits;
 296 }
 297
 298 static void
 299 cmd_buffer_alloc_push_constants(struct anv_cmd_buffer *cmd_buffer)
 300 {
 301    VkShaderStageFlags stages = cmd_buffer->state.pipeline->active_stages;
 302
 303    /* In order to avoid thrash, we assume that vertex and fragment stages
 304     * always exist.  In the rare case where one is missing *and* the other
 305     * uses push concstants, this may be suboptimal.  However, avoiding stalls
 306     * seems more important.
 307     */
 308    stages |= VK_SHADER_STAGE_FRAGMENT_BIT | VK_SHADER_STAGE_VERTEX_BIT;
 309
 310    if (stages == cmd_buffer->state.push_constant_stages)
 311       return;
 312
 313 #if GEN_GEN >= 8
 314    const unsigned push_constant_kb = 32;
 315 #elif GEN_IS_HASWELL
 316    const unsigned push_constant_kb = cmd_buffer->device->info.gt == 3 ? 32 : 16;
 317 #else
 318    const unsigned push_constant_kb = 16;
 319 #endif
 320
 321    const unsigned num_stages =
 322       _mesa_bitcount(stages & VK_SHADER_STAGE_ALL_GRAPHICS);
 323    unsigned size_per_stage = push_constant_kb / num_stages;
 324
 325    /* Broadwell+ and Haswell gt3 require that the push constant sizes be in
 326     * units of 2KB.  Incidentally, these are the same platforms that have
 327     * 32KB worth of push constant space.
 328     */
 329    if (push_constant_kb == 32)
 330       size_per_stage &= ~1u;
 331
 332    uint32_t kb_used = 0;
 333    for (int i = MESA_SHADER_VERTEX; i < MESA_SHADER_FRAGMENT; i++) {
 334       unsigned push_size = (stages & (1 << i)) ? size_per_stage : 0;
 335       anv_batch_emit(&cmd_buffer->batch,
 336                      GENX(3DSTATE_PUSH_CONSTANT_ALLOC_VS), alloc) {
 337          alloc._3DCommandSubOpcode  = 18 + i;
 338          alloc.ConstantBufferOffset = (push_size > 0) ? kb_used : 0;
 339          alloc.ConstantBufferSize   = push_size;
 340       }
 341       kb_used += push_size;
 342    }
 343
 344    anv_batch_emit(&cmd_buffer->batch,
 345                   GENX(3DSTATE_PUSH_CONSTANT_ALLOC_PS), alloc) {
 346       alloc.ConstantBufferOffset = kb_used;
 347       alloc.ConstantBufferSize = push_constant_kb - kb_used;
 348    }
 349
 350    cmd_buffer->state.push_constant_stages = stages;
 351
 352    /* From the BDW PRM for 3DSTATE_PUSH_CONSTANT_ALLOC_VS:
 353     *
 354     *    "The 3DSTATE_CONSTANT_VS must be reprogrammed prior to
 355     *    the next 3DPRIMITIVE command after programming the
 356     *    3DSTATE_PUSH_CONSTANT_ALLOC_VS"
 357     *
 358     * Since 3DSTATE_PUSH_CONSTANT_ALLOC_VS is programmed as part of
 359     * pipeline setup, we need to dirty push constants.
 360     */
 361    cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_ALL_GRAPHICS;
 362 }
 363
 364 static uint32_t
 365 cmd_buffer_flush_push_constants(struct anv_cmd_buffer *cmd_buffer)
 366 {
 367    static const uint32_t push_constant_opcodes[] = {
 368       [MESA_SHADER_VERTEX]                      = 21,
 369       [MESA_SHADER_TESS_CTRL]                   = 25, /* HS */
 370       [MESA_SHADER_TESS_EVAL]                   = 26, /* DS */
 371       [MESA_SHADER_GEOMETRY]                    = 22,
 372       [MESA_SHADER_FRAGMENT]                    = 23,
 373       [MESA_SHADER_COMPUTE]                     = 0,
 374    };
 375
 376    VkShaderStageFlags flushed = 0;
 377
 378    anv_foreach_stage(stage, cmd_buffer->state.push_constants_dirty) {
 379       if (stage == MESA_SHADER_COMPUTE)
 380          continue;
 381
 382       struct anv_state state = anv_cmd_buffer_push_constants(cmd_buffer, stage);
 383
 384       if (state.offset == 0) {
 385          anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_VS), c)
 386             c._3DCommandSubOpcode = push_constant_opcodes[stage];
 387       } else {
 388          anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_VS), c) {
 389             c._3DCommandSubOpcode = push_constant_opcodes[stage],
 390             c.ConstantBody = (struct GENX(3DSTATE_CONSTANT_BODY)) {
 391 #if GEN_GEN >= 9
 392                .PointerToConstantBuffer2 = { &cmd_buffer->device->dynamic_state_block_pool.bo, state.offset },
 393                .ConstantBuffer2ReadLength = DIV_ROUND_UP(state.alloc_size, 32),
 394 #else
 395                .PointerToConstantBuffer0 = { .offset = state.offset },
 396                .ConstantBuffer0ReadLength = DIV_ROUND_UP(state.alloc_size, 32),
 397 #endif
 398             };
 399          }
 400       }
 401
 402       flushed |= mesa_to_vk_shader_stage(stage);
 403    }
 404
 405    cmd_buffer->state.push_constants_dirty &= ~VK_SHADER_STAGE_ALL_GRAPHICS;
 406
 407    return flushed;
 408 }
 409
 410 void
 411 genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer)
 412 {
 413    struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
 414    uint32_t *p;
 415
 416    uint32_t vb_emit = cmd_buffer->state.vb_dirty & pipeline->vb_used;
 417
 418    assert((pipeline->active_stages & VK_SHADER_STAGE_COMPUTE_BIT) == 0);
 419
 420    genX(cmd_buffer_config_l3)(cmd_buffer, pipeline);
 421
 422    genX(flush_pipeline_select_3d)(cmd_buffer);
 423
 424    if (vb_emit) {
 425       const uint32_t num_buffers = __builtin_popcount(vb_emit);
 426       const uint32_t num_dwords = 1 + num_buffers * 4;
 427
 428       p = anv_batch_emitn(&cmd_buffer->batch, num_dwords,
 429                           GENX(3DSTATE_VERTEX_BUFFERS));
 430       uint32_t vb, i = 0;
 431       for_each_bit(vb, vb_emit) {
 432          struct anv_buffer *buffer = cmd_buffer->state.vertex_bindings[vb].buffer;
 433          uint32_t offset = cmd_buffer->state.vertex_bindings[vb].offset;
 434
 435          struct GENX(VERTEX_BUFFER_STATE) state = {
 436             .VertexBufferIndex = vb,
 437
 438 #if GEN_GEN >= 8
 439             .MemoryObjectControlState = GENX(MOCS),
 440 #else
 441             .BufferAccessType = pipeline->instancing_enable[vb] ? INSTANCEDATA : VERTEXDATA,
 442             .InstanceDataStepRate = 1,
 443             .VertexBufferMemoryObjectControlState = GENX(MOCS),
 444 #endif
 445
 446             .AddressModifyEnable = true,
 447             .BufferPitch = pipeline->binding_stride[vb],
 448             .BufferStartingAddress = { buffer->bo, buffer->offset + offset },
 449
 450 #if GEN_GEN >= 8
 451             .BufferSize = buffer->size - offset
 452 #else
 453             .EndAddress = { buffer->bo, buffer->offset + buffer->size - 1},
 454 #endif
 455          };
 456
 457          GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, &p[1 + i * 4], &state);
 458          i++;
 459       }
 460    }
 461
 462    cmd_buffer->state.vb_dirty &= ~vb_emit;
 463
 464    if (cmd_buffer->state.dirty & ANV_CMD_DIRTY_PIPELINE) {
 465       /* If somebody compiled a pipeline after starting a command buffer the
 466        * scratch bo may have grown since we started this cmd buffer (and
 467        * emitted STATE_BASE_ADDRESS).  If we're binding that pipeline now,
 468        * reemit STATE_BASE_ADDRESS so that we use the bigger scratch bo. */
 469       if (cmd_buffer->state.scratch_size < pipeline->total_scratch)
 470          anv_cmd_buffer_emit_state_base_address(cmd_buffer);
 471
 472       anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->batch);
 473
 474       /* If the pipeline changed, we may need to re-allocate push constant
 475        * space in the URB.
 476        */
 477       cmd_buffer_alloc_push_constants(cmd_buffer);
 478    }
 479
 480 #if GEN_GEN <= 7
 481    if (cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_VERTEX_BIT ||
 482        cmd_buffer->state.push_constants_dirty & VK_SHADER_STAGE_VERTEX_BIT) {
 483       /* From the IVB PRM Vol. 2, Part 1, Section 3.2.1:
 484        *
 485        *    "A PIPE_CONTROL with Post-Sync Operation set to 1h and a depth
 486        *    stall needs to be sent just prior to any 3DSTATE_VS,
 487        *    3DSTATE_URB_VS, 3DSTATE_CONSTANT_VS,
 488        *    3DSTATE_BINDING_TABLE_POINTER_VS,
 489        *    3DSTATE_SAMPLER_STATE_POINTER_VS command.  Only one
 490        *    PIPE_CONTROL needs to be sent before any combination of VS
 491        *    associated 3DSTATE."
 492        */
 493       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
 494          pc.DepthStallEnable  = true;
 495          pc.PostSyncOperation = WriteImmediateData;
 496          pc.Address           =
 497             (struct anv_address) { &cmd_buffer->device->workaround_bo, 0 };
 498       }
 499    }
 500 #endif
 501
 502    /* We emit the binding tables and sampler tables first, then emit push
 503     * constants and then finally emit binding table and sampler table
 504     * pointers.  It has to happen in this order, since emitting the binding
 505     * tables may change the push constants (in case of storage images). After
 506     * emitting push constants, on SKL+ we have to emit the corresponding
 507     * 3DSTATE_BINDING_TABLE_POINTER_* for the push constants to take effect.
 508     */
 509    uint32_t dirty = 0;
 510    if (cmd_buffer->state.descriptors_dirty)
 511       dirty = gen7_cmd_buffer_flush_descriptor_sets(cmd_buffer);
 512
 513    if (cmd_buffer->state.push_constants_dirty) {
 514 #if GEN_GEN >= 9
 515       /* On Sky Lake and later, the binding table pointers commands are
 516        * what actually flush the changes to push constant state so we need
 517        * to dirty them so they get re-emitted below.
 518        */
 519       dirty |= cmd_buffer_flush_push_constants(cmd_buffer);
 520 #else
 521       cmd_buffer_flush_push_constants(cmd_buffer);
 522 #endif
 523    }
 524
 525    if (dirty)
 526       gen7_cmd_buffer_emit_descriptor_pointers(cmd_buffer, dirty);
 527
 528    if (cmd_buffer->state.dirty & ANV_CMD_DIRTY_DYNAMIC_VIEWPORT) {
 529       gen8_cmd_buffer_emit_viewport(cmd_buffer);
 530       gen8_cmd_buffer_emit_depth_viewport(cmd_buffer);
 531    }
 532
 533    if (cmd_buffer->state.dirty & ANV_CMD_DIRTY_DYNAMIC_SCISSOR)
 534       gen7_cmd_buffer_emit_scissor(cmd_buffer);
 535
 536    genX(cmd_buffer_flush_dynamic_state)(cmd_buffer);
 537
 538    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
 539 }
 540
 541 static void
 542 emit_base_vertex_instance_bo(struct anv_cmd_buffer *cmd_buffer,
 543                              struct anv_bo *bo, uint32_t offset)
 544 {
 545    uint32_t *p = anv_batch_emitn(&cmd_buffer->batch, 5,
 546                                  GENX(3DSTATE_VERTEX_BUFFERS));
 547
 548    GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, p + 1,
 549       &(struct GENX(VERTEX_BUFFER_STATE)) {
 550          .VertexBufferIndex = 32, /* Reserved for this */
 551          .AddressModifyEnable = true,
 552          .BufferPitch = 0,
 553 #if (GEN_GEN >= 8)
 554          .MemoryObjectControlState = GENX(MOCS),
 555          .BufferStartingAddress = { bo, offset },
 556          .BufferSize = 8
 557 #else
 558          .VertexBufferMemoryObjectControlState = GENX(MOCS),
 559          .BufferStartingAddress = { bo, offset },
 560          .EndAddress = { bo, offset + 8 },
 561 #endif
 562       });
 563 }
 564
 565 static void
 566 emit_base_vertex_instance(struct anv_cmd_buffer *cmd_buffer,
 567                           uint32_t base_vertex, uint32_t base_instance)
 568 {
 569    struct anv_state id_state =
 570       anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 8, 4);
 571
 572    ((uint32_t *)id_state.map)[0] = base_vertex;
 573    ((uint32_t *)id_state.map)[1] = base_instance;
 574
 575    if (!cmd_buffer->device->info.has_llc)
 576       anv_state_clflush(id_state);
 577
 578    emit_base_vertex_instance_bo(cmd_buffer,
 579       &cmd_buffer->device->dynamic_state_block_pool.bo, id_state.offset);
 580 }
 581
 582 void genX(CmdDraw)(
 583     VkCommandBuffer                             commandBuffer,
 584     uint32_t                                    vertexCount,
 585     uint32_t                                    instanceCount,
 586     uint32_t                                    firstVertex,
 587     uint32_t                                    firstInstance)
 588 {
 589    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
 590    struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
 591    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
 592
 593    genX(cmd_buffer_flush_state)(cmd_buffer);
 594
 595    if (vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance)
 596       emit_base_vertex_instance(cmd_buffer, firstVertex, firstInstance);
 597
 598    anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
 599       prim.VertexAccessType         = SEQUENTIAL;
 600       prim.PrimitiveTopologyType    = pipeline->topology;
 601       prim.VertexCountPerInstance   = vertexCount;
 602       prim.StartVertexLocation      = firstVertex;
 603       prim.InstanceCount            = instanceCount;
 604       prim.StartInstanceLocation    = firstInstance;
 605       prim.BaseVertexLocation       = 0;
 606    }
 607 }
 608
 609 void genX(CmdDrawIndexed)(
 610     VkCommandBuffer                             commandBuffer,
 611     uint32_t                                    indexCount,
 612     uint32_t                                    instanceCount,
 613     uint32_t                                    firstIndex,
 614     int32_t                                     vertexOffset,
 615     uint32_t                                    firstInstance)
 616 {
 617    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
 618    struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
 619    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
 620
 621    genX(cmd_buffer_flush_state)(cmd_buffer);
 622
 623    if (vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance)
 624       emit_base_vertex_instance(cmd_buffer, vertexOffset, firstInstance);
 625
 626    anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
 627       prim.VertexAccessType         = RANDOM;
 628       prim.PrimitiveTopologyType    = pipeline->topology;
 629       prim.VertexCountPerInstance   = indexCount;
 630       prim.StartVertexLocation      = firstIndex;
 631       prim.InstanceCount            = instanceCount;
 632       prim.StartInstanceLocation    = firstInstance;
 633       prim.BaseVertexLocation       = vertexOffset;
 634    }
 635 }
 636
/* Auto-Draw / Indirect Registers
 *
 * Register offsets used as MI_LOAD_REGISTER_MEM / MI_LOAD_REGISTER_IMM
 * targets by the indirect draw entry points in this file, which fill them
 * from the indirect buffer before emitting a 3DPRIMITIVE with
 * IndirectParameterEnable set.
 */
#define GEN7_3DPRIM_END_OFFSET          0x2420
#define GEN7_3DPRIM_START_VERTEX        0x2430
#define GEN7_3DPRIM_VERTEX_COUNT        0x2434
#define GEN7_3DPRIM_INSTANCE_COUNT      0x2438
#define GEN7_3DPRIM_START_INSTANCE      0x243C
#define GEN7_3DPRIM_BASE_VERTEX         0x2440
 644
 645 static void
 646 emit_lrm(struct anv_batch *batch,
 647          uint32_t reg, struct anv_bo *bo, uint32_t offset)
 648 {
 649    anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
 650       lrm.RegisterAddress  = reg;
 651       lrm.MemoryAddress    = (struct anv_address) { bo, offset };
 652    }
 653 }
 654
 655 static void
 656 emit_lri(struct anv_batch *batch, uint32_t reg, uint32_t imm)
 657 {
 658    anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
 659       lri.RegisterOffset   = reg;
 660       lri.DataDWord        = imm;
 661    }
 662 }
 663
 664 void genX(CmdDrawIndirect)(
 665     VkCommandBuffer                             commandBuffer,
 666     VkBuffer                                    _buffer,
 667     VkDeviceSize                                offset,
 668     uint32_t                                    drawCount,
 669     uint32_t                                    stride)
 670 {
 671    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
 672    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
 673    struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
 674    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
 675    struct anv_bo *bo = buffer->bo;
 676    uint32_t bo_offset = buffer->offset + offset;
 677
 678    genX(cmd_buffer_flush_state)(cmd_buffer);
 679
 680    if (vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance)
 681       emit_base_vertex_instance_bo(cmd_buffer, bo, bo_offset + 8);
 682
 683    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_VERTEX_COUNT, bo, bo_offset);
 684    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset + 4);
 685    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_VERTEX, bo, bo_offset + 8);
 686    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_INSTANCE, bo, bo_offset + 12);
 687    emit_lri(&cmd_buffer->batch, GEN7_3DPRIM_BASE_VERTEX, 0);
 688
 689    anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
 690       prim.IndirectParameterEnable  = true;
 691       prim.VertexAccessType         = SEQUENTIAL;
 692       prim.PrimitiveTopologyType    = pipeline->topology;
 693    }
 694 }
 695
 696 void genX(CmdDrawIndexedIndirect)(
 697     VkCommandBuffer                             commandBuffer,
 698     VkBuffer                                    _buffer,
 699     VkDeviceSize                                offset,
 700     uint32_t                                    drawCount,
 701     uint32_t                                    stride)
 702 {
 703    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
 704    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
 705    struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
 706    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
 707    struct anv_bo *bo = buffer->bo;
 708    uint32_t bo_offset = buffer->offset + offset;
 709
 710    genX(cmd_buffer_flush_state)(cmd_buffer);
 711
 712    /* TODO: We need to stomp base vertex to 0 somehow */
 713    if (vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance)
 714       emit_base_vertex_instance_bo(cmd_buffer, bo, bo_offset + 12);
 715
 716    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_VERTEX_COUNT, bo, bo_offset);
 717    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset + 4);
 718    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_VERTEX, bo, bo_offset + 8);
 719    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_BASE_VERTEX, bo, bo_offset + 12);
 720    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_INSTANCE, bo, bo_offset + 16);
 721
 722    anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
 723       prim.IndirectParameterEnable  = true;
 724       prim.VertexAccessType         = RANDOM;
 725       prim.PrimitiveTopologyType    = pipeline->topology;
 726    }
 727 }
 728
 729 #if GEN_GEN == 7
 730
 731 static bool
 732 verify_cmd_parser(const struct anv_device *device,
 733                   int required_version,
 734                   const char *function)
 735 {
 736    if (device->instance->physicalDevice.cmd_parser_version < required_version) {
 737       vk_errorf(VK_ERROR_FEATURE_NOT_PRESENT,
 738                 "cmd parser version %d is required for %s",
 739                 required_version, function);
 740       return false;
 741    } else {
 742       return true;
 743    }
 744 }
 745
 746 #endif
 747
 748 void genX(CmdDispatch)(
 749     VkCommandBuffer                             commandBuffer,
 750     uint32_t                                    x,
 751     uint32_t                                    y,
 752     uint32_t                                    z)
 753 {
 754    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
 755    struct anv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
 756    const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
 757
 758    if (prog_data->uses_num_work_groups) {
 759       struct anv_state state =
 760          anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 12, 4);
 761       uint32_t *sizes = state.map;
 762       sizes[0] = x;
 763       sizes[1] = y;
 764       sizes[2] = z;
 765       if (!cmd_buffer->device->info.has_llc)
 766          anv_state_clflush(state);
 767       cmd_buffer->state.num_workgroups_offset = state.offset;
 768       cmd_buffer->state.num_workgroups_bo =
 769          &cmd_buffer->device->dynamic_state_block_pool.bo;
 770    }
 771
 772    genX(cmd_buffer_flush_compute_state)(cmd_buffer);
 773
 774    anv_batch_emit(&cmd_buffer->batch, GENX(GPGPU_WALKER), ggw) {
 775       ggw.SIMDSize                     = prog_data->simd_size / 16;
 776       ggw.ThreadDepthCounterMaximum    = 0;
 777       ggw.ThreadHeightCounterMaximum   = 0;
 778       ggw.ThreadWidthCounterMaximum    = prog_data->threads - 1;
 779       ggw.ThreadGroupIDXDimension      = x;
 780       ggw.ThreadGroupIDYDimension      = y;
 781       ggw.ThreadGroupIDZDimension      = z;
 782       ggw.RightExecutionMask           = pipeline->cs_right_mask;
 783       ggw.BottomExecutionMask          = 0xffffffff;
 784    }
 785
 786    anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_STATE_FLUSH), msf);
 787 }
 788
 789 #define GPGPU_DISPATCHDIMX 0x2500
 790 #define GPGPU_DISPATCHDIMY 0x2504
 791 #define GPGPU_DISPATCHDIMZ 0x2508
 792
 793 #define MI_PREDICATE_SRC0  0x2400
 794 #define MI_PREDICATE_SRC1  0x2408
 795
 796 void genX(CmdDispatchIndirect)(
 797     VkCommandBuffer                             commandBuffer,
 798     VkBuffer                                    _buffer,
 799     VkDeviceSize                                offset)
 800 {
 801    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
 802    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
 803    struct anv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
 804    const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
 805    struct anv_bo *bo = buffer->bo;
 806    uint32_t bo_offset = buffer->offset + offset;
 807    struct anv_batch *batch = &cmd_buffer->batch;
 808
 809 #if GEN_GEN == 7
 810    /* Linux 4.4 added command parser version 5 which allows the GPGPU
 811     * indirect dispatch registers to be written.
 812     */
 813    if (!verify_cmd_parser(cmd_buffer->device, 5, "vkCmdDispatchIndirect"))
 814       return;
 815 #endif
 816
 817    if (prog_data->uses_num_work_groups) {
 818       cmd_buffer->state.num_workgroups_offset = bo_offset;
 819       cmd_buffer->state.num_workgroups_bo = bo;
 820    }
 821
 822    genX(cmd_buffer_flush_compute_state)(cmd_buffer);
 823
 824    emit_lrm(batch, GPGPU_DISPATCHDIMX, bo, bo_offset);
 825    emit_lrm(batch, GPGPU_DISPATCHDIMY, bo, bo_offset + 4);
 826    emit_lrm(batch, GPGPU_DISPATCHDIMZ, bo, bo_offset + 8);
 827
 828 #if GEN_GEN <= 7
 829    /* Clear upper 32-bits of SRC0 and all 64-bits of SRC1 */
 830    emit_lri(batch, MI_PREDICATE_SRC0 + 4, 0);
 831    emit_lri(batch, MI_PREDICATE_SRC1 + 0, 0);
 832    emit_lri(batch, MI_PREDICATE_SRC1 + 4, 0);
 833
 834    /* Load compute_dispatch_indirect_x_size into SRC0 */
 835    emit_lrm(batch, MI_PREDICATE_SRC0, bo, bo_offset + 0);
 836
 837    /* predicate = (compute_dispatch_indirect_x_size == 0); */
 838    anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
 839       mip.LoadOperation    = LOAD_LOAD;
 840       mip.CombineOperation = COMBINE_SET;
 841       mip.CompareOperation = COMPARE_SRCS_EQUAL;
 842    }
 843
 844    /* Load compute_dispatch_indirect_y_size into SRC0 */
 845    emit_lrm(batch, MI_PREDICATE_SRC0, bo, bo_offset + 4);
 846
 847    /* predicate |= (compute_dispatch_indirect_y_size == 0); */
 848    anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
 849       mip.LoadOperation    = LOAD_LOAD;
 850       mip.CombineOperation = COMBINE_OR;
 851       mip.CompareOperation = COMPARE_SRCS_EQUAL;
 852    }
 853
 854    /* Load compute_dispatch_indirect_z_size into SRC0 */
 855    emit_lrm(batch, MI_PREDICATE_SRC0, bo, bo_offset + 8);
 856
 857    /* predicate |= (compute_dispatch_indirect_z_size == 0); */
 858    anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
 859       mip.LoadOperation    = LOAD_LOAD;
 860       mip.CombineOperation = COMBINE_OR;
 861       mip.CompareOperation = COMPARE_SRCS_EQUAL;
 862    }
 863
 864    /* predicate = !predicate; */
 865 #define COMPARE_FALSE                           1
 866    anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
 867       mip.LoadOperation    = LOAD_LOADINV;
 868       mip.CombineOperation = COMBINE_OR;
 869       mip.CompareOperation = COMPARE_FALSE;
 870    }
 871 #endif
 872
 873    anv_batch_emit(batch, GENX(GPGPU_WALKER), ggw) {
 874       ggw.IndirectParameterEnable      = true;
 875       ggw.PredicateEnable              = GEN_GEN <= 7;
 876       ggw.SIMDSize                     = prog_data->simd_size / 16;
 877       ggw.ThreadDepthCounterMaximum    = 0;
 878       ggw.ThreadHeightCounterMaximum   = 0;
 879       ggw.ThreadWidthCounterMaximum    = prog_data->threads - 1;
 880       ggw.RightExecutionMask           = pipeline->cs_right_mask;
 881       ggw.BottomExecutionMask          = 0xffffffff;
 882    }
 883
 884    anv_batch_emit(batch, GENX(MEDIA_STATE_FLUSH), msf);
 885 }
 886
 887 static void
 888 flush_pipeline_before_pipeline_select(struct anv_cmd_buffer *cmd_buffer,
 889                                       uint32_t pipeline)
 890 {
 891 #if GEN_GEN >= 8 && GEN_GEN < 10
 892    /* From the Broadwell PRM, Volume 2a: Instructions, PIPELINE_SELECT:
 893     *
 894     *   Software must clear the COLOR_CALC_STATE Valid field in
 895     *   3DSTATE_CC_STATE_POINTERS command prior to send a PIPELINE_SELECT
 896     *   with Pipeline Select set to GPGPU.
 897     *
 898     * The internal hardware docs recommend the same workaround for Gen9
 899     * hardware too.
 900     */
 901    if (pipeline == GPGPU)
 902       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CC_STATE_POINTERS), t);
 903 #elif GEN_GEN <= 7
 904       /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
 905        * PIPELINE_SELECT [DevBWR+]":
 906        *
 907        *   Project: DEVSNB+
 908        *
 909        *   Software must ensure all the write caches are flushed through a
 910        *   stalling PIPE_CONTROL command followed by another PIPE_CONTROL
 911        *   command to invalidate read only caches prior to programming
 912        *   MI_PIPELINE_SELECT command to change the Pipeline Select Mode.
 913        */
 914       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
 915          pc.RenderTargetCacheFlushEnable  = true;
 916          pc.DepthCacheFlushEnable         = true;
 917          pc.DCFlushEnable                 = true;
 918          pc.PostSyncOperation             = NoWrite;
 919          pc.CommandStreamerStallEnable    = true;
 920       }
 921
 922       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
 923          pc.TextureCacheInvalidationEnable   = true;
 924          pc.ConstantCacheInvalidationEnable  = true;
 925          pc.StateCacheInvalidationEnable     = true;
 926          pc.InstructionCacheInvalidateEnable = true;
 927          pc.PostSyncOperation                = NoWrite;
 928       }
 929 #endif
 930 }
 931
 932 void
 933 genX(flush_pipeline_select_3d)(struct anv_cmd_buffer *cmd_buffer)
 934 {
 935    if (cmd_buffer->state.current_pipeline != _3D) {
 936       flush_pipeline_before_pipeline_select(cmd_buffer, _3D);
 937
 938       anv_batch_emit(&cmd_buffer->batch, GENX(PIPELINE_SELECT), ps) {
 939 #if GEN_GEN >= 9
 940          ps.MaskBits = 3;
 941 #endif
 942          ps.PipelineSelection = _3D;
 943       }
 944
 945       cmd_buffer->state.current_pipeline = _3D;
 946    }
 947 }
 948
 949 void
 950 genX(flush_pipeline_select_gpgpu)(struct anv_cmd_buffer *cmd_buffer)
 951 {
 952    if (cmd_buffer->state.current_pipeline != GPGPU) {
 953       flush_pipeline_before_pipeline_select(cmd_buffer, GPGPU);
 954
 955       anv_batch_emit(&cmd_buffer->batch, GENX(PIPELINE_SELECT), ps) {
 956 #if GEN_GEN >= 9
 957          ps.MaskBits = 3;
 958 #endif
 959          ps.PipelineSelection = GPGPU;
 960       }
 961
 962       cmd_buffer->state.current_pipeline = GPGPU;
 963    }
 964 }
 965
 966 struct anv_state
 967 genX(cmd_buffer_alloc_null_surface_state)(struct anv_cmd_buffer *cmd_buffer,
 968                                           struct anv_framebuffer *fb)
 969 {
 970    struct anv_state state =
 971       anv_state_stream_alloc(&cmd_buffer->surface_state_stream, 64, 64);
 972
 973    struct GENX(RENDER_SURFACE_STATE) null_ss = {
 974       .SurfaceType = SURFTYPE_NULL,
 975       .SurfaceArray = fb->layers > 0,
 976       .SurfaceFormat = ISL_FORMAT_R8G8B8A8_UNORM,
 977 #if GEN_GEN >= 8
 978       .TileMode = YMAJOR,
 979 #else
 980       .TiledSurface = true,
 981 #endif
 982       .Width = fb->width - 1,
 983       .Height = fb->height - 1,
 984       .Depth = fb->layers - 1,
 985       .RenderTargetViewExtent = fb->layers - 1,
 986    };
 987
 988    GENX(RENDER_SURFACE_STATE_pack)(NULL, state.map, &null_ss);
 989
 990    if (!cmd_buffer->device->info.has_llc)
 991       anv_state_clflush(state);
 992
 993    return state;
 994 }
 995
 996 static void
 997 cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer)
 998 {
 999    struct anv_device *device = cmd_buffer->device;
1000    const struct anv_framebuffer *fb = cmd_buffer->state.framebuffer;
1001    const struct anv_image_view *iview =
1002       anv_cmd_buffer_get_depth_stencil_view(cmd_buffer);
1003    const struct anv_image *image = iview ? iview->image : NULL;
1004    const bool has_depth = image && (image->aspects & VK_IMAGE_ASPECT_DEPTH_BIT);
1005    const bool has_stencil =
1006       image && (image->aspects & VK_IMAGE_ASPECT_STENCIL_BIT);
1007
1008    /* FIXME: Implement the PMA stall W/A */
1009    /* FIXME: Width and Height are wrong */
1010
1011    /* Emit 3DSTATE_DEPTH_BUFFER */
1012    if (has_depth) {
1013       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_DEPTH_BUFFER), db) {
1014          db.SurfaceType                   = SURFTYPE_2D;
1015          db.DepthWriteEnable              = true;
1016          db.StencilWriteEnable            = has_stencil;
1017          db.HierarchicalDepthBufferEnable = false;
1018
1019          db.SurfaceFormat = isl_surf_get_depth_format(&device->isl_dev,
1020                                                       &image->depth_surface.isl);
1021
1022          db.SurfaceBaseAddress = (struct anv_address) {
1023             .bo = image->bo,
1024             .offset = image->offset + image->depth_surface.offset,
1025          };
1026          db.DepthBufferObjectControlState = GENX(MOCS),
1027
1028          db.SurfacePitch         = image->depth_surface.isl.row_pitch - 1;
1029          db.Height               = image->extent.height - 1;
1030          db.Width                = image->extent.width - 1;
1031          db.LOD                  = iview->base_mip;
1032          db.Depth                = image->array_size - 1; /* FIXME: 3-D */
1033          db.MinimumArrayElement  = iview->base_layer;
1034
1035 #if GEN_GEN >= 8
1036          db.SurfaceQPitch =
1037             isl_surf_get_array_pitch_el_rows(&image->depth_surface.isl) >> 2,
1038 #endif
1039          db.RenderTargetViewExtent = 1 - 1;
1040       }
1041    } else {
1042       /* Even when no depth buffer is present, the hardware requires that
1043        * 3DSTATE_DEPTH_BUFFER be programmed correctly. The Broadwell PRM says:
1044        *
1045        *    If a null depth buffer is bound, the driver must instead bind depth as:
1046        *       3DSTATE_DEPTH.SurfaceType = SURFTYPE_2D
1047        *       3DSTATE_DEPTH.Width = 1
1048        *       3DSTATE_DEPTH.Height = 1
1049        *       3DSTATE_DEPTH.SuraceFormat = D16_UNORM
1050        *       3DSTATE_DEPTH.SurfaceBaseAddress = 0
1051        *       3DSTATE_DEPTH.HierarchicalDepthBufferEnable = 0
1052        *       3DSTATE_WM_DEPTH_STENCIL.DepthTestEnable = 0
1053        *       3DSTATE_WM_DEPTH_STENCIL.DepthBufferWriteEnable = 0
1054        *
1055        * The PRM is wrong, though. The width and height must be programmed to
1056        * actual framebuffer's width and height, even when neither depth buffer
1057        * nor stencil buffer is present.  Also, D16_UNORM is not allowed to
1058        * be combined with a stencil buffer so we use D32_FLOAT instead.
1059        */
1060       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_DEPTH_BUFFER), db) {
1061          db.SurfaceType          = SURFTYPE_2D;
1062          db.SurfaceFormat        = D32_FLOAT;
1063          db.Width                = fb->width - 1;
1064          db.Height               = fb->height - 1;
1065          db.StencilWriteEnable   = has_stencil;
1066       }
1067    }
1068
1069    /* Emit 3DSTATE_STENCIL_BUFFER */
1070    if (has_stencil) {
1071       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_STENCIL_BUFFER), sb) {
1072 #if GEN_GEN >= 8 || GEN_IS_HASWELL
1073          sb.StencilBufferEnable = true,
1074 #endif
1075          sb.StencilBufferObjectControlState = GENX(MOCS),
1076
1077          /* Stencil buffers have strange pitch. The PRM says:
1078           *
1079           *    The pitch must be set to 2x the value computed based on width,
1080           *    as the stencil buffer is stored with two rows interleaved.
1081           */
1082          sb.SurfacePitch = 2 * image->stencil_surface.isl.row_pitch - 1,
1083
1084 #if GEN_GEN >= 8
1085          sb.SurfaceQPitch = isl_surf_get_array_pitch_el_rows(&image->stencil_surface.isl) >> 2,
1086 #endif
1087          sb.SurfaceBaseAddress = (struct anv_address) {
1088             .bo = image->bo,
1089             .offset = image->offset + image->stencil_surface.offset,
1090          };
1091       }
1092    } else {
1093       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_STENCIL_BUFFER), sb);
1094    }
1095
1096    /* Disable hierarchial depth buffers. */
1097    anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_HIER_DEPTH_BUFFER), hz);
1098
1099    /* Clear the clear params. */
1100    anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CLEAR_PARAMS), cp);
1101 }
1102
1103 /**
1104  * @see anv_cmd_buffer_set_subpass()
1105  */
1106 void
1107 genX(cmd_buffer_set_subpass)(struct anv_cmd_buffer *cmd_buffer,
1108                              struct anv_subpass *subpass)
1109 {
1110    cmd_buffer->state.subpass = subpass;
1111
1112    cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT;
1113
1114    cmd_buffer_emit_depth_stencil(cmd_buffer);
1115 }
1116
1117 void genX(CmdBeginRenderPass)(
1118     VkCommandBuffer                             commandBuffer,
1119     const VkRenderPassBeginInfo*                pRenderPassBegin,
1120     VkSubpassContents                           contents)
1121 {
1122    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1123    ANV_FROM_HANDLE(anv_render_pass, pass, pRenderPassBegin->renderPass);
1124    ANV_FROM_HANDLE(anv_framebuffer, framebuffer, pRenderPassBegin->framebuffer);
1125
1126    cmd_buffer->state.framebuffer = framebuffer;
1127    cmd_buffer->state.pass = pass;
1128    cmd_buffer->state.render_area = pRenderPassBegin->renderArea;
1129    anv_cmd_state_setup_attachments(cmd_buffer, pRenderPassBegin);
1130
1131    genX(flush_pipeline_select_3d)(cmd_buffer);
1132
1133    genX(cmd_buffer_set_subpass)(cmd_buffer, pass->subpasses);
1134    anv_cmd_buffer_clear_subpass(cmd_buffer);
1135 }
1136
1137 void genX(CmdNextSubpass)(
1138     VkCommandBuffer                             commandBuffer,
1139     VkSubpassContents                           contents)
1140 {
1141    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1142
1143    assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
1144
1145    anv_cmd_buffer_resolve_subpass(cmd_buffer);
1146    genX(cmd_buffer_set_subpass)(cmd_buffer, cmd_buffer->state.subpass + 1);
1147    anv_cmd_buffer_clear_subpass(cmd_buffer);
1148 }
1149
1150 void genX(CmdEndRenderPass)(
1151     VkCommandBuffer                             commandBuffer)
1152 {
1153    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1154
1155    anv_cmd_buffer_resolve_subpass(cmd_buffer);
1156 }
1157
1158 static void
1159 emit_ps_depth_count(struct anv_batch *batch,
1160                     struct anv_bo *bo, uint32_t offset)
1161 {
1162    anv_batch_emit(batch, GENX(PIPE_CONTROL), pc) {
1163       pc.DestinationAddressType  = DAT_PPGTT;
1164       pc.PostSyncOperation       = WritePSDepthCount;
1165       pc.DepthStallEnable        = true;
1166       pc.Address                 = (struct anv_address) { bo, offset };
1167    }
1168 }
1169
1170 static void
1171 emit_query_availability(struct anv_batch *batch,
1172                         struct anv_bo *bo, uint32_t offset)
1173 {
1174    anv_batch_emit(batch, GENX(PIPE_CONTROL), pc) {
1175       pc.DestinationAddressType  = DAT_PPGTT;
1176       pc.PostSyncOperation       = WriteImmediateData;
1177       pc.Address                 = (struct anv_address) { bo, offset };
1178       pc.ImmediateData           = 1;
1179    }
1180 }
1181
1182 void genX(CmdBeginQuery)(
1183     VkCommandBuffer                             commandBuffer,
1184     VkQueryPool                                 queryPool,
1185     uint32_t                                    query,
1186     VkQueryControlFlags                         flags)
1187 {
1188    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1189    ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
1190
1191    /* Workaround: When meta uses the pipeline with the VS disabled, it seems
1192     * that the pipelining of the depth write breaks. What we see is that
1193     * samples from the render pass clear leaks into the first query
1194     * immediately after the clear. Doing a pipecontrol with a post-sync
1195     * operation and DepthStallEnable seems to work around the issue.
1196     */
1197    if (cmd_buffer->state.need_query_wa) {
1198       cmd_buffer->state.need_query_wa = false;
1199       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1200          pc.DepthCacheFlushEnable   = true;
1201          pc.DepthStallEnable        = true;
1202       }
1203    }
1204
1205    switch (pool->type) {
1206    case VK_QUERY_TYPE_OCCLUSION:
1207       emit_ps_depth_count(&cmd_buffer->batch, &pool->bo,
1208                           query * sizeof(struct anv_query_pool_slot));
1209       break;
1210
1211    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
1212    default:
1213       unreachable("");
1214    }
1215 }
1216
1217 void genX(CmdEndQuery)(
1218     VkCommandBuffer                             commandBuffer,
1219     VkQueryPool                                 queryPool,
1220     uint32_t                                    query)
1221 {
1222    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1223    ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
1224
1225    switch (pool->type) {
1226    case VK_QUERY_TYPE_OCCLUSION:
1227       emit_ps_depth_count(&cmd_buffer->batch, &pool->bo,
1228                           query * sizeof(struct anv_query_pool_slot) + 8);
1229
1230       emit_query_availability(&cmd_buffer->batch, &pool->bo,
1231                               query * sizeof(struct anv_query_pool_slot) + 16);
1232       break;
1233
1234    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
1235    default:
1236       unreachable("");
1237    }
1238 }
1239
1240 #define TIMESTAMP 0x2358
1241
1242 void genX(CmdWriteTimestamp)(
1243     VkCommandBuffer                             commandBuffer,
1244     VkPipelineStageFlagBits                     pipelineStage,
1245     VkQueryPool                                 queryPool,
1246     uint32_t                                    query)
1247 {
1248    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1249    ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
1250    uint32_t offset = query * sizeof(struct anv_query_pool_slot);
1251
1252    assert(pool->type == VK_QUERY_TYPE_TIMESTAMP);
1253
1254    switch (pipelineStage) {
1255    case VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT:
1256       anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {
1257          srm.RegisterAddress  = TIMESTAMP;
1258          srm.MemoryAddress    = (struct anv_address) { &pool->bo, offset };
1259       }
1260       anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {
1261          srm.RegisterAddress  = TIMESTAMP + 4;
1262          srm.MemoryAddress    = (struct anv_address) { &pool->bo, offset + 4 };
1263       }
1264       break;
1265
1266    default:
1267       /* Everything else is bottom-of-pipe */
1268       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1269          pc.DestinationAddressType  = DAT_PPGTT,
1270          pc.PostSyncOperation       = WriteTimestamp,
1271          pc.Address = (struct anv_address) { &pool->bo, offset };
1272       }
1273       break;
1274    }
1275
1276    emit_query_availability(&cmd_buffer->batch, &pool->bo, query + 16);
1277 }
1278
1279 #if GEN_GEN > 7 || GEN_IS_HASWELL
1280
/* Helpers for building the instruction dwords that follow an MI_MATH header.
 * Each instruction is an opcode plus two operands packed into one dword:
 * opcode in bits 20..31, operand1 in bits 10..19, operand2 in bits 0..9
 * (assuming __gen_uint(v, start, end) places v in that bit range -- confirm
 * against the genxml pack helpers).
 */
#define alu_opcode(v)   __gen_uint((v),  20, 31)
#define alu_operand1(v) __gen_uint((v),  10, 19)
#define alu_operand2(v) __gen_uint((v),   0,  9)
/* Fully parenthesized so the expansion is safe inside larger expressions. */
#define alu(opcode, operand1, operand2) \
   (alu_opcode(opcode) | alu_operand1(operand1) | alu_operand2(operand2))

/* ALU opcodes. */
#define OPCODE_NOOP      0x000
#define OPCODE_LOAD      0x080
#define OPCODE_LOADINV   0x480
#define OPCODE_LOAD0     0x081
#define OPCODE_LOAD1     0x481
#define OPCODE_ADD       0x100
#define OPCODE_SUB       0x101
#define OPCODE_AND       0x102
#define OPCODE_OR        0x103
#define OPCODE_XOR       0x104
#define OPCODE_STORE     0x180
#define OPCODE_STOREINV  0x580

/* ALU operands: general purpose registers, ALU sources, accumulator, flags. */
#define OPERAND_R0   0x00
#define OPERAND_R1   0x01
#define OPERAND_R2   0x02
#define OPERAND_R3   0x03
#define OPERAND_R4   0x04
#define OPERAND_SRCA 0x20
#define OPERAND_SRCB 0x21
#define OPERAND_ACCU 0x31
#define OPERAND_ZF   0x32
#define OPERAND_CF   0x33

/* Register offset of command streamer GPR n; consecutive GPRs are 8 bytes
 * apart.
 */
#define CS_GPR(n) (0x2600 + (n) * 8)
1312
1313 static void
1314 emit_load_alu_reg_u64(struct anv_batch *batch, uint32_t reg,
1315                       struct anv_bo *bo, uint32_t offset)
1316 {
1317    anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
1318       lrm.RegisterAddress  = reg,
1319       lrm.MemoryAddress    = (struct anv_address) { bo, offset };
1320    }
1321    anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
1322       lrm.RegisterAddress  = reg + 4;
1323       lrm.MemoryAddress    = (struct anv_address) { bo, offset + 4 };
1324    }
1325 }
1326
1327 static void
1328 store_query_result(struct anv_batch *batch, uint32_t reg,
1329                    struct anv_bo *bo, uint32_t offset, VkQueryResultFlags flags)
1330 {
1331    anv_batch_emit(batch, GENX(MI_STORE_REGISTER_MEM), srm) {
1332       srm.RegisterAddress  = reg;
1333       srm.MemoryAddress    = (struct anv_address) { bo, offset };
1334    }
1335
1336    if (flags & VK_QUERY_RESULT_64_BIT) {
1337       anv_batch_emit(batch, GENX(MI_STORE_REGISTER_MEM), srm) {
1338          srm.RegisterAddress  = reg + 4;
1339          srm.MemoryAddress    = (struct anv_address) { bo, offset + 4 };
1340       }
1341    }
1342 }
1343
1344 void genX(CmdCopyQueryPoolResults)(
1345     VkCommandBuffer                             commandBuffer,
1346     VkQueryPool                                 queryPool,
1347     uint32_t                                    firstQuery,
1348     uint32_t                                    queryCount,
1349     VkBuffer                                    destBuffer,
1350     VkDeviceSize                                destOffset,
1351     VkDeviceSize                                destStride,
1352     VkQueryResultFlags                          flags)
1353 {
1354    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1355    ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
1356    ANV_FROM_HANDLE(anv_buffer, buffer, destBuffer);
1357    uint32_t slot_offset, dst_offset;
1358
1359    if (flags & VK_QUERY_RESULT_WAIT_BIT) {
1360       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1361          pc.CommandStreamerStallEnable = true;
1362          pc.StallAtPixelScoreboard     = true;
1363       }
1364    }
1365
1366    dst_offset = buffer->offset + destOffset;
1367    for (uint32_t i = 0; i < queryCount; i++) {
1368
1369       slot_offset = (firstQuery + i) * sizeof(struct anv_query_pool_slot);
1370       switch (pool->type) {
1371       case VK_QUERY_TYPE_OCCLUSION:
1372          emit_load_alu_reg_u64(&cmd_buffer->batch,
1373                                CS_GPR(0), &pool->bo, slot_offset);
1374          emit_load_alu_reg_u64(&cmd_buffer->batch,
1375                                CS_GPR(1), &pool->bo, slot_offset + 8);
1376
1377          /* FIXME: We need to clamp the result for 32 bit. */
1378
1379          uint32_t *dw = anv_batch_emitn(&cmd_buffer->batch, 5, GENX(MI_MATH));
1380          dw[1] = alu(OPCODE_LOAD, OPERAND_SRCA, OPERAND_R1);
1381          dw[2] = alu(OPCODE_LOAD, OPERAND_SRCB, OPERAND_R0);
1382          dw[3] = alu(OPCODE_SUB, 0, 0);
1383          dw[4] = alu(OPCODE_STORE, OPERAND_R2, OPERAND_ACCU);
1384          break;
1385
1386       case VK_QUERY_TYPE_TIMESTAMP:
1387          emit_load_alu_reg_u64(&cmd_buffer->batch,
1388                                CS_GPR(2), &pool->bo, slot_offset);
1389          break;
1390
1391       default:
1392          unreachable("unhandled query type");
1393       }
1394
1395       store_query_result(&cmd_buffer->batch,
1396                          CS_GPR(2), buffer->bo, dst_offset, flags);
1397
1398       if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
1399          emit_load_alu_reg_u64(&cmd_buffer->batch, CS_GPR(0),
1400                                &pool->bo, slot_offset + 16);
1401          if (flags & VK_QUERY_RESULT_64_BIT)
1402             store_query_result(&cmd_buffer->batch,
1403                                CS_GPR(0), buffer->bo, dst_offset + 8, flags);
1404          else
1405             store_query_result(&cmd_buffer->batch,
1406                                CS_GPR(0), buffer->bo, dst_offset + 4, flags);
1407       }
1408
1409       dst_offset += destStride;
1410    }
1411 }
1412
1413 #endif