src/intel/vulkan/genX_cmd_buffer.c

   1 /*
   2  * Copyright © 2015 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include <assert.h>
  25 #include <stdbool.h>
  26
  27 #include "anv_private.h"
  28 #include "vk_format_info.h"
  29
  30 #include "common/gen_l3_config.h"
  31 #include "genxml/gen_macros.h"
  32 #include "genxml/genX_pack.h"
  33
  34 static void
  35 emit_lrm(struct anv_batch *batch,
  36          uint32_t reg, struct anv_bo *bo, uint32_t offset)
  37 {
  38    anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
  39       lrm.RegisterAddress  = reg;
  40       lrm.MemoryAddress    = (struct anv_address) { bo, offset };
  41    }
  42 }
  43
  44 static void
  45 emit_lri(struct anv_batch *batch, uint32_t reg, uint32_t imm)
  46 {
  47    anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
  48       lri.RegisterOffset   = reg;
  49       lri.DataDWord        = imm;
  50    }
  51 }
  52
  53 void
  54 genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer)
  55 {
  56    struct anv_device *device = cmd_buffer->device;
  57
  58 /* XXX: Do we need this on more than just BDW? */
  59 #if (GEN_GEN >= 8)
  60    /* Emit a render target cache flush.
  61     *
  62     * This isn't documented anywhere in the PRM.  However, it seems to be
  63     * necessary prior to changing the surface state base adress.  Without
  64     * this, we get GPU hangs when using multi-level command buffers which
  65     * clear depth, reset state base address, and then go render stuff.
  66     */
  67    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
  68       pc.RenderTargetCacheFlushEnable = true;
  69    }
  70 #endif
  71
  72    anv_batch_emit(&cmd_buffer->batch, GENX(STATE_BASE_ADDRESS), sba) {
  73       sba.GeneralStateBaseAddress = (struct anv_address) { NULL, 0 };
  74       sba.GeneralStateMemoryObjectControlState = GENX(MOCS);
  75       sba.GeneralStateBaseAddressModifyEnable = true;
  76
  77       sba.SurfaceStateBaseAddress =
  78          anv_cmd_buffer_surface_base_address(cmd_buffer);
  79       sba.SurfaceStateMemoryObjectControlState = GENX(MOCS);
  80       sba.SurfaceStateBaseAddressModifyEnable = true;
  81
  82       sba.DynamicStateBaseAddress =
  83          (struct anv_address) { &device->dynamic_state_block_pool.bo, 0 };
  84       sba.DynamicStateMemoryObjectControlState = GENX(MOCS);
  85       sba.DynamicStateBaseAddressModifyEnable = true;
  86
  87       sba.IndirectObjectBaseAddress = (struct anv_address) { NULL, 0 };
  88       sba.IndirectObjectMemoryObjectControlState = GENX(MOCS);
  89       sba.IndirectObjectBaseAddressModifyEnable = true;
  90
  91       sba.InstructionBaseAddress =
  92          (struct anv_address) { &device->instruction_block_pool.bo, 0 };
  93       sba.InstructionMemoryObjectControlState = GENX(MOCS);
  94       sba.InstructionBaseAddressModifyEnable = true;
  95
  96 #  if (GEN_GEN >= 8)
  97       /* Broadwell requires that we specify a buffer size for a bunch of
  98        * these fields.  However, since we will be growing the BO's live, we
  99        * just set them all to the maximum.
 100        */
 101       sba.GeneralStateBufferSize                = 0xfffff;
 102       sba.GeneralStateBufferSizeModifyEnable    = true;
 103       sba.DynamicStateBufferSize                = 0xfffff;
 104       sba.DynamicStateBufferSizeModifyEnable    = true;
 105       sba.IndirectObjectBufferSize              = 0xfffff;
 106       sba.IndirectObjectBufferSizeModifyEnable  = true;
 107       sba.InstructionBufferSize                 = 0xfffff;
 108       sba.InstructionBuffersizeModifyEnable     = true;
 109 #  endif
 110    }
 111
 112    /* After re-setting the surface state base address, we have to do some
 113     * cache flusing so that the sampler engine will pick up the new
 114     * SURFACE_STATE objects and binding tables. From the Broadwell PRM,
 115     * Shared Function > 3D Sampler > State > State Caching (page 96):
 116     *
 117     *    Coherency with system memory in the state cache, like the texture
 118     *    cache is handled partially by software. It is expected that the
 119     *    command stream or shader will issue Cache Flush operation or
 120     *    Cache_Flush sampler message to ensure that the L1 cache remains
 121     *    coherent with system memory.
 122     *
 123     *    [...]
 124     *
 125     *    Whenever the value of the Dynamic_State_Base_Addr,
 126     *    Surface_State_Base_Addr are altered, the L1 state cache must be
 127     *    invalidated to ensure the new surface or sampler state is fetched
 128     *    from system memory.
 129     *
 130     * The PIPE_CONTROL command has a "State Cache Invalidation Enable" bit
 131     * which, according the PIPE_CONTROL instruction documentation in the
 132     * Broadwell PRM:
 133     *
 134     *    Setting this bit is independent of any other bit in this packet.
 135     *    This bit controls the invalidation of the L1 and L2 state caches
 136     *    at the top of the pipe i.e. at the parsing time.
 137     *
 138     * Unfortunately, experimentation seems to indicate that state cache
 139     * invalidation through a PIPE_CONTROL does nothing whatsoever in
 140     * regards to surface state and binding tables.  In stead, it seems that
 141     * invalidating the texture cache is what is actually needed.
 142     *
 143     * XXX:  As far as we have been able to determine through
 144     * experimentation, shows that flush the texture cache appears to be
 145     * sufficient.  The theory here is that all of the sampling/rendering
 146     * units cache the binding table in the texture cache.  However, we have
 147     * yet to be able to actually confirm this.
 148     */
 149    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
 150       pc.TextureCacheInvalidationEnable = true;
 151    }
 152 }
 153
 154 /**
 155  * Setup anv_cmd_state::attachments for vkCmdBeginRenderPass.
 156  */
 157 static void
 158 genX(cmd_buffer_setup_attachments)(struct anv_cmd_buffer *cmd_buffer,
 159                                    struct anv_render_pass *pass,
 160                                    struct anv_framebuffer *framebuffer,
 161                                    const VkClearValue *clear_values)
 162 {
 163    const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
 164    struct anv_cmd_state *state = &cmd_buffer->state;
 165
 166    vk_free(&cmd_buffer->pool->alloc, state->attachments);
 167
 168    if (pass->attachment_count == 0) {
 169       state->attachments = NULL;
 170       return;
 171    }
 172
 173    state->attachments = vk_alloc(&cmd_buffer->pool->alloc,
 174                                  pass->attachment_count *
 175                                       sizeof(state->attachments[0]),
 176                                  8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
 177    if (state->attachments == NULL) {
 178       /* FIXME: Propagate VK_ERROR_OUT_OF_HOST_MEMORY to vkEndCommandBuffer */
 179       abort();
 180    }
 181
 182    bool need_null_state = false;
 183    for (uint32_t s = 0; s < pass->subpass_count; ++s) {
 184       if (pass->subpasses[s].color_count == 0) {
 185          need_null_state = true;
 186          break;
 187       }
 188    }
 189
 190    unsigned num_states = need_null_state;
 191    for (uint32_t i = 0; i < pass->attachment_count; ++i) {
 192       if (vk_format_is_color(pass->attachments[i].format))
 193          num_states++;
 194    }
 195
 196    const uint32_t ss_stride = align_u32(isl_dev->ss.size, isl_dev->ss.align);
 197    state->render_pass_states =
 198       anv_state_stream_alloc(&cmd_buffer->surface_state_stream,
 199                              num_states * ss_stride, isl_dev->ss.align);
 200
 201    struct anv_state next_state = state->render_pass_states;
 202    next_state.alloc_size = isl_dev->ss.size;
 203
 204    if (need_null_state) {
 205       state->null_surface_state = next_state;
 206       next_state.offset += ss_stride;
 207       next_state.map += ss_stride;
 208    }
 209
 210    for (uint32_t i = 0; i < pass->attachment_count; ++i) {
 211       if (vk_format_is_color(pass->attachments[i].format)) {
 212          state->attachments[i].color_rt_state = next_state;
 213          next_state.offset += ss_stride;
 214          next_state.map += ss_stride;
 215       }
 216    }
 217    assert(next_state.offset == state->render_pass_states.offset +
 218                                state->render_pass_states.alloc_size);
 219
 220    if (framebuffer) {
 221       assert(pass->attachment_count == framebuffer->attachment_count);
 222
 223       if (need_null_state) {
 224          struct GENX(RENDER_SURFACE_STATE) null_ss = {
 225             .SurfaceType = SURFTYPE_NULL,
 226             .SurfaceArray = framebuffer->layers > 0,
 227             .SurfaceFormat = ISL_FORMAT_R8G8B8A8_UNORM,
 228 #if GEN_GEN >= 8
 229             .TileMode = YMAJOR,
 230 #else
 231             .TiledSurface = true,
 232 #endif
 233             .Width = framebuffer->width - 1,
 234             .Height = framebuffer->height - 1,
 235             .Depth = framebuffer->layers - 1,
 236             .RenderTargetViewExtent = framebuffer->layers - 1,
 237          };
 238          GENX(RENDER_SURFACE_STATE_pack)(NULL, state->null_surface_state.map,
 239                                          &null_ss);
 240       }
 241
 242       for (uint32_t i = 0; i < pass->attachment_count; ++i) {
 243          struct anv_render_pass_attachment *att = &pass->attachments[i];
 244          VkImageAspectFlags att_aspects = vk_format_aspects(att->format);
 245          VkImageAspectFlags clear_aspects = 0;
 246
 247          if (att_aspects == VK_IMAGE_ASPECT_COLOR_BIT) {
 248             /* color attachment */
 249             if (att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
 250                clear_aspects |= VK_IMAGE_ASPECT_COLOR_BIT;
 251             }
 252          } else {
 253             /* depthstencil attachment */
 254             if ((att_aspects & VK_IMAGE_ASPECT_DEPTH_BIT) &&
 255                 att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
 256                clear_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT;
 257             }
 258             if ((att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&
 259                 att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
 260                clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
 261             }
 262          }
 263
 264          state->attachments[i].pending_clear_aspects = clear_aspects;
 265          if (clear_aspects)
 266             state->attachments[i].clear_value = clear_values[i];
 267
 268          struct anv_image_view *iview = framebuffer->attachments[i];
 269          assert(iview->vk_format == att->format);
 270
 271          if (att_aspects == VK_IMAGE_ASPECT_COLOR_BIT) {
 272             struct isl_view view = iview->isl;
 273             view.usage |= ISL_SURF_USAGE_RENDER_TARGET_BIT;
 274             isl_surf_fill_state(isl_dev,
 275                                 state->attachments[i].color_rt_state.map,
 276                                 .surf = &iview->image->color_surface.isl,
 277                                 .view = &view,
 278                                 .mocs = cmd_buffer->device->default_mocs);
 279
 280             anv_cmd_buffer_add_surface_state_reloc(cmd_buffer,
 281                state->attachments[i].color_rt_state, iview->bo, iview->offset);
 282          }
 283       }
 284
 285       if (!cmd_buffer->device->info.has_llc)
 286          anv_state_clflush(state->render_pass_states);
 287    }
 288 }
 289
 290 VkResult
 291 genX(BeginCommandBuffer)(
 292     VkCommandBuffer                             commandBuffer,
 293     const VkCommandBufferBeginInfo*             pBeginInfo)
 294 {
 295    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
 296
 297    /* If this is the first vkBeginCommandBuffer, we must *initialize* the
 298     * command buffer's state. Otherwise, we must *reset* its state. In both
 299     * cases we reset it.
 300     *
 301     * From the Vulkan 1.0 spec:
 302     *
 303     *    If a command buffer is in the executable state and the command buffer
 304     *    was allocated from a command pool with the
 305     *    VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT flag set, then
 306     *    vkBeginCommandBuffer implicitly resets the command buffer, behaving
 307     *    as if vkResetCommandBuffer had been called with
 308     *    VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT not set. It then puts
 309     *    the command buffer in the recording state.
 310     */
 311    anv_cmd_buffer_reset(cmd_buffer);
 312
 313    cmd_buffer->usage_flags = pBeginInfo->flags;
 314
 315    assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY ||
 316           !(cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT));
 317
 318    genX(cmd_buffer_emit_state_base_address)(cmd_buffer);
 319
 320    if (cmd_buffer->usage_flags &
 321        VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
 322       cmd_buffer->state.framebuffer =
 323          anv_framebuffer_from_handle(pBeginInfo->pInheritanceInfo->framebuffer);
 324       cmd_buffer->state.pass =
 325          anv_render_pass_from_handle(pBeginInfo->pInheritanceInfo->renderPass);
 326       cmd_buffer->state.subpass =
 327          &cmd_buffer->state.pass->subpasses[pBeginInfo->pInheritanceInfo->subpass];
 328
 329       genX(cmd_buffer_setup_attachments)(cmd_buffer, cmd_buffer->state.pass,
 330                                          NULL, NULL);
 331
 332       cmd_buffer->state.dirty |= ANV_CMD_DIRTY_RENDER_TARGETS;
 333    }
 334
 335    return VK_SUCCESS;
 336 }
 337
 338 VkResult
 339 genX(EndCommandBuffer)(
 340     VkCommandBuffer                             commandBuffer)
 341 {
 342    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
 343
 344    anv_cmd_buffer_end_batch_buffer(cmd_buffer);
 345
 346    return VK_SUCCESS;
 347 }
 348
 349 void
 350 genX(CmdExecuteCommands)(
 351     VkCommandBuffer                             commandBuffer,
 352     uint32_t                                    commandBufferCount,
 353     const VkCommandBuffer*                      pCmdBuffers)
 354 {
 355    ANV_FROM_HANDLE(anv_cmd_buffer, primary, commandBuffer);
 356
 357    assert(primary->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
 358
 359    for (uint32_t i = 0; i < commandBufferCount; i++) {
 360       ANV_FROM_HANDLE(anv_cmd_buffer, secondary, pCmdBuffers[i]);
 361
 362       assert(secondary->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
 363
 364       if (secondary->usage_flags &
 365           VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
 366          /* If we're continuing a render pass from the primary, we need to
 367           * copy the surface states for the current subpass into the storage
 368           * we allocated for them in BeginCommandBuffer.
 369           */
 370          struct anv_bo *ss_bo = &primary->device->surface_state_block_pool.bo;
 371          struct anv_state src_state = primary->state.render_pass_states;
 372          struct anv_state dst_state = secondary->state.render_pass_states;
 373          assert(src_state.alloc_size == dst_state.alloc_size);
 374
 375          genX(cmd_buffer_gpu_memcpy)(primary, ss_bo, dst_state.offset,
 376                                      ss_bo, src_state.offset,
 377                                      src_state.alloc_size);
 378       }
 379
 380       anv_cmd_buffer_add_secondary(primary, secondary);
 381    }
 382
 383    /* Each of the secondary command buffers will use its own state base
 384     * address.  We need to re-emit state base address for the primary after
 385     * all of the secondaries are done.
 386     *
 387     * TODO: Maybe we want to make this a dirty bit to avoid extra state base
 388     * address calls?
 389     */
 390    genX(cmd_buffer_emit_state_base_address)(primary);
 391 }
 392
 393 #define IVB_L3SQCREG1_SQGHPCI_DEFAULT     0x00730000
 394 #define VLV_L3SQCREG1_SQGHPCI_DEFAULT     0x00d30000
 395 #define HSW_L3SQCREG1_SQGHPCI_DEFAULT     0x00610000
 396
 397 /**
 398  * Program the hardware to use the specified L3 configuration.
 399  */
 400 void
 401 genX(cmd_buffer_config_l3)(struct anv_cmd_buffer *cmd_buffer,
 402                            const struct gen_l3_config *cfg)
 403 {
 404    assert(cfg);
 405    if (cfg == cmd_buffer->state.current_l3_config)
 406       return;
 407
 408    if (unlikely(INTEL_DEBUG & DEBUG_L3)) {
 409       fprintf(stderr, "L3 config transition: ");
 410       gen_dump_l3_config(cfg, stderr);
 411    }
 412
 413    const bool has_slm = cfg->n[GEN_L3P_SLM];
 414
 415    /* According to the hardware docs, the L3 partitioning can only be changed
 416     * while the pipeline is completely drained and the caches are flushed,
 417     * which involves a first PIPE_CONTROL flush which stalls the pipeline...
 418     */
 419    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
 420       pc.DCFlushEnable = true;
 421       pc.PostSyncOperation = NoWrite;
 422       pc.CommandStreamerStallEnable = true;
 423    }
 424
 425    /* ...followed by a second pipelined PIPE_CONTROL that initiates
 426     * invalidation of the relevant caches.  Note that because RO invalidation
 427     * happens at the top of the pipeline (i.e. right away as the PIPE_CONTROL
 428     * command is processed by the CS) we cannot combine it with the previous
 429     * stalling flush as the hardware documentation suggests, because that
 430     * would cause the CS to stall on previous rendering *after* RO
 431     * invalidation and wouldn't prevent the RO caches from being polluted by
 432     * concurrent rendering before the stall completes.  This intentionally
 433     * doesn't implement the SKL+ hardware workaround suggesting to enable CS
 434     * stall on PIPE_CONTROLs with the texture cache invalidation bit set for
 435     * GPGPU workloads because the previous and subsequent PIPE_CONTROLs
 436     * already guarantee that there is no concurrent GPGPU kernel execution
 437     * (see SKL HSD 2132585).
 438     */
 439    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
 440       pc.TextureCacheInvalidationEnable = true;
 441       pc.ConstantCacheInvalidationEnable = true;
 442       pc.InstructionCacheInvalidateEnable = true;
 443       pc.StateCacheInvalidationEnable = true;
 444       pc.PostSyncOperation = NoWrite;
 445    }
 446
 447    /* Now send a third stalling flush to make sure that invalidation is
 448     * complete when the L3 configuration registers are modified.
 449     */
 450    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
 451       pc.DCFlushEnable = true;
 452       pc.PostSyncOperation = NoWrite;
 453       pc.CommandStreamerStallEnable = true;
 454    }
 455
 456 #if GEN_GEN >= 8
 457
 458    assert(!cfg->n[GEN_L3P_IS] && !cfg->n[GEN_L3P_C] && !cfg->n[GEN_L3P_T]);
 459
 460    uint32_t l3cr;
 461    anv_pack_struct(&l3cr, GENX(L3CNTLREG),
 462                    .SLMEnable = has_slm,
 463                    .URBAllocation = cfg->n[GEN_L3P_URB],
 464                    .ROAllocation = cfg->n[GEN_L3P_RO],
 465                    .DCAllocation = cfg->n[GEN_L3P_DC],
 466                    .AllAllocation = cfg->n[GEN_L3P_ALL]);
 467
 468    /* Set up the L3 partitioning. */
 469    emit_lri(&cmd_buffer->batch, GENX(L3CNTLREG_num), l3cr);
 470
 471 #else
 472
 473    const bool has_dc = cfg->n[GEN_L3P_DC] || cfg->n[GEN_L3P_ALL];
 474    const bool has_is = cfg->n[GEN_L3P_IS] || cfg->n[GEN_L3P_RO] ||
 475                        cfg->n[GEN_L3P_ALL];
 476    const bool has_c = cfg->n[GEN_L3P_C] || cfg->n[GEN_L3P_RO] ||
 477                       cfg->n[GEN_L3P_ALL];
 478    const bool has_t = cfg->n[GEN_L3P_T] || cfg->n[GEN_L3P_RO] ||
 479                       cfg->n[GEN_L3P_ALL];
 480
 481    assert(!cfg->n[GEN_L3P_ALL]);
 482
 483    /* When enabled SLM only uses a portion of the L3 on half of the banks,
 484     * the matching space on the remaining banks has to be allocated to a
 485     * client (URB for all validated configurations) set to the
 486     * lower-bandwidth 2-bank address hashing mode.
 487     */
 488    const struct gen_device_info *devinfo = &cmd_buffer->device->info;
 489    const bool urb_low_bw = has_slm && !devinfo->is_baytrail;
 490    assert(!urb_low_bw || cfg->n[GEN_L3P_URB] == cfg->n[GEN_L3P_SLM]);
 491
 492    /* Minimum number of ways that can be allocated to the URB. */
 493    const unsigned n0_urb = (devinfo->is_baytrail ? 32 : 0);
 494    assert(cfg->n[GEN_L3P_URB] >= n0_urb);
 495
 496    uint32_t l3sqcr1, l3cr2, l3cr3;
 497    anv_pack_struct(&l3sqcr1, GENX(L3SQCREG1),
 498                    .ConvertDC_UC = !has_dc,
 499                    .ConvertIS_UC = !has_is,
 500                    .ConvertC_UC = !has_c,
 501                    .ConvertT_UC = !has_t);
 502    l3sqcr1 |=
 503       GEN_IS_HASWELL ? HSW_L3SQCREG1_SQGHPCI_DEFAULT :
 504       devinfo->is_baytrail ? VLV_L3SQCREG1_SQGHPCI_DEFAULT :
 505       IVB_L3SQCREG1_SQGHPCI_DEFAULT;
 506
 507    anv_pack_struct(&l3cr2, GENX(L3CNTLREG2),
 508                    .SLMEnable = has_slm,
 509                    .URBLowBandwidth = urb_low_bw,
 510                    .URBAllocation = cfg->n[GEN_L3P_URB],
 511 #if !GEN_IS_HASWELL
 512                    .ALLAllocation = cfg->n[GEN_L3P_ALL],
 513 #endif
 514                    .ROAllocation = cfg->n[GEN_L3P_RO],
 515                    .DCAllocation = cfg->n[GEN_L3P_DC]);
 516
 517    anv_pack_struct(&l3cr3, GENX(L3CNTLREG3),
 518                    .ISAllocation = cfg->n[GEN_L3P_IS],
 519                    .ISLowBandwidth = 0,
 520                    .CAllocation = cfg->n[GEN_L3P_C],
 521                    .CLowBandwidth = 0,
 522                    .TAllocation = cfg->n[GEN_L3P_T],
 523                    .TLowBandwidth = 0);
 524
 525    /* Set up the L3 partitioning. */
 526    emit_lri(&cmd_buffer->batch, GENX(L3SQCREG1_num), l3sqcr1);
 527    emit_lri(&cmd_buffer->batch, GENX(L3CNTLREG2_num), l3cr2);
 528    emit_lri(&cmd_buffer->batch, GENX(L3CNTLREG3_num), l3cr3);
 529
 530 #if GEN_IS_HASWELL
 531    if (cmd_buffer->device->instance->physicalDevice.cmd_parser_version >= 4) {
 532       /* Enable L3 atomics on HSW if we have a DC partition, otherwise keep
 533        * them disabled to avoid crashing the system hard.
 534        */
 535       uint32_t scratch1, chicken3;
 536       anv_pack_struct(&scratch1, GENX(SCRATCH1),
 537                       .L3AtomicDisable = !has_dc);
 538       anv_pack_struct(&chicken3, GENX(CHICKEN3),
 539                       .L3AtomicDisableMask = true,
 540                       .L3AtomicDisable = !has_dc);
 541       emit_lri(&cmd_buffer->batch, GENX(SCRATCH1_num), scratch1);
 542       emit_lri(&cmd_buffer->batch, GENX(CHICKEN3_num), chicken3);
 543    }
 544 #endif
 545
 546 #endif
 547
 548    cmd_buffer->state.current_l3_config = cfg;
 549 }
 550
 551 void
 552 genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer)
 553 {
 554    enum anv_pipe_bits bits = cmd_buffer->state.pending_pipe_bits;
 555
 556    /* Flushes are pipelined while invalidations are handled immediately.
 557     * Therefore, if we're flushing anything then we need to schedule a stall
 558     * before any invalidations can happen.
 559     */
 560    if (bits & ANV_PIPE_FLUSH_BITS)
 561       bits |= ANV_PIPE_NEEDS_CS_STALL_BIT;
 562
 563    /* If we're going to do an invalidate and we have a pending CS stall that
 564     * has yet to be resolved, we do the CS stall now.
 565     */
 566    if ((bits & ANV_PIPE_INVALIDATE_BITS) &&
 567        (bits & ANV_PIPE_NEEDS_CS_STALL_BIT)) {
 568       bits |= ANV_PIPE_CS_STALL_BIT;
 569       bits &= ~ANV_PIPE_NEEDS_CS_STALL_BIT;
 570    }
 571
 572    if (bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_CS_STALL_BIT)) {
 573       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
 574          pipe.DepthCacheFlushEnable = bits & ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
 575          pipe.DCFlushEnable = bits & ANV_PIPE_DATA_CACHE_FLUSH_BIT;
 576          pipe.RenderTargetCacheFlushEnable =
 577             bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
 578
 579          pipe.DepthStallEnable = bits & ANV_PIPE_DEPTH_STALL_BIT;
 580          pipe.CommandStreamerStallEnable = bits & ANV_PIPE_CS_STALL_BIT;
 581          pipe.StallAtPixelScoreboard = bits & ANV_PIPE_STALL_AT_SCOREBOARD_BIT;
 582
 583          /*
 584           * According to the Broadwell documentation, any PIPE_CONTROL with the
 585           * "Command Streamer Stall" bit set must also have another bit set,
 586           * with five different options:
 587           *
 588           *  - Render Target Cache Flush
 589           *  - Depth Cache Flush
 590           *  - Stall at Pixel Scoreboard
 591           *  - Post-Sync Operation
 592           *  - Depth Stall
 593           *  - DC Flush Enable
 594           *
 595           * I chose "Stall at Pixel Scoreboard" since that's what we use in
 596           * mesa and it seems to work fine. The choice is fairly arbitrary.
 597           */
 598          if ((bits & ANV_PIPE_CS_STALL_BIT) &&
 599              !(bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_DEPTH_STALL_BIT |
 600                        ANV_PIPE_STALL_AT_SCOREBOARD_BIT)))
 601             pipe.StallAtPixelScoreboard = true;
 602       }
 603
 604       bits &= ~(ANV_PIPE_FLUSH_BITS | ANV_PIPE_CS_STALL_BIT);
 605    }
 606
 607    if (bits & ANV_PIPE_INVALIDATE_BITS) {
 608       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
 609          pipe.StateCacheInvalidationEnable =
 610             bits & ANV_PIPE_STATE_CACHE_INVALIDATE_BIT;
 611          pipe.ConstantCacheInvalidationEnable =
 612             bits & ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT;
 613          pipe.VFCacheInvalidationEnable =
 614             bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
 615          pipe.TextureCacheInvalidationEnable =
 616             bits & ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
 617          pipe.InstructionCacheInvalidateEnable =
 618             bits & ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT;
 619       }
 620
 621       bits &= ~ANV_PIPE_INVALIDATE_BITS;
 622    }
 623
 624    cmd_buffer->state.pending_pipe_bits = bits;
 625 }
 626
 627 void genX(CmdPipelineBarrier)(
 628     VkCommandBuffer                             commandBuffer,
 629     VkPipelineStageFlags                        srcStageMask,
 630     VkPipelineStageFlags                        destStageMask,
 631     VkBool32                                    byRegion,
 632     uint32_t                                    memoryBarrierCount,
 633     const VkMemoryBarrier*                      pMemoryBarriers,
 634     uint32_t                                    bufferMemoryBarrierCount,
 635     const VkBufferMemoryBarrier*                pBufferMemoryBarriers,
 636     uint32_t                                    imageMemoryBarrierCount,
 637     const VkImageMemoryBarrier*                 pImageMemoryBarriers)
 638 {
 639    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
 640    uint32_t b;
 641
 642    /* XXX: Right now, we're really dumb and just flush whatever categories
 643     * the app asks for.  One of these days we may make this a bit better
 644     * but right now that's all the hardware allows for in most areas.
 645     */
 646    VkAccessFlags src_flags = 0;
 647    VkAccessFlags dst_flags = 0;
 648
 649    for (uint32_t i = 0; i < memoryBarrierCount; i++) {
 650       src_flags |= pMemoryBarriers[i].srcAccessMask;
 651       dst_flags |= pMemoryBarriers[i].dstAccessMask;
 652    }
 653
 654    for (uint32_t i = 0; i < bufferMemoryBarrierCount; i++) {
 655       src_flags |= pBufferMemoryBarriers[i].srcAccessMask;
 656       dst_flags |= pBufferMemoryBarriers[i].dstAccessMask;
 657    }
 658
 659    for (uint32_t i = 0; i < imageMemoryBarrierCount; i++) {
 660       src_flags |= pImageMemoryBarriers[i].srcAccessMask;
 661       dst_flags |= pImageMemoryBarriers[i].dstAccessMask;
 662    }
 663
 664    enum anv_pipe_bits pipe_bits = 0;
 665
 666    for_each_bit(b, src_flags) {
 667       switch ((VkAccessFlagBits)(1 << b)) {
 668       case VK_ACCESS_SHADER_WRITE_BIT:
 669          pipe_bits |= ANV_PIPE_DATA_CACHE_FLUSH_BIT;
 670          break;
 671       case VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT:
 672          pipe_bits |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
 673          break;
 674       case VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT:
 675          pipe_bits |= ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
 676          break;
 677       case VK_ACCESS_TRANSFER_WRITE_BIT:
 678          pipe_bits |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
 679          pipe_bits |= ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
 680          break;
 681       default:
 682          break; /* Nothing to do */
 683       }
 684    }
 685
 686    for_each_bit(b, dst_flags) {
 687       switch ((VkAccessFlagBits)(1 << b)) {
 688       case VK_ACCESS_INDIRECT_COMMAND_READ_BIT:
 689       case VK_ACCESS_INDEX_READ_BIT:
 690       case VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT:
 691          pipe_bits |= ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
 692          break;
 693       case VK_ACCESS_UNIFORM_READ_BIT:
 694          pipe_bits |= ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT;
 695          pipe_bits |= ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
 696          break;
 697       case VK_ACCESS_SHADER_READ_BIT:
 698       case VK_ACCESS_COLOR_ATTACHMENT_READ_BIT:
 699       case VK_ACCESS_TRANSFER_READ_BIT:
 700          pipe_bits |= ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
 701          break;
 702       default:
 703          break; /* Nothing to do */
 704       }
 705    }
 706
 707    cmd_buffer->state.pending_pipe_bits |= pipe_bits;
 708 }
 709
 710 static void
 711 cmd_buffer_alloc_push_constants(struct anv_cmd_buffer *cmd_buffer)
 712 {
 713    VkShaderStageFlags stages = cmd_buffer->state.pipeline->active_stages;
 714
 715    /* In order to avoid thrash, we assume that vertex and fragment stages
 716     * always exist.  In the rare case where one is missing *and* the other
 717     * uses push concstants, this may be suboptimal.  However, avoiding stalls
 718     * seems more important.
 719     */
 720    stages |= VK_SHADER_STAGE_FRAGMENT_BIT | VK_SHADER_STAGE_VERTEX_BIT;
 721
 722    if (stages == cmd_buffer->state.push_constant_stages)
 723       return;
 724
 725 #if GEN_GEN >= 8
 726    const unsigned push_constant_kb = 32;
 727 #elif GEN_IS_HASWELL
 728    const unsigned push_constant_kb = cmd_buffer->device->info.gt == 3 ? 32 : 16;
 729 #else
 730    const unsigned push_constant_kb = 16;
 731 #endif
 732
 733    const unsigned num_stages =
 734       _mesa_bitcount(stages & VK_SHADER_STAGE_ALL_GRAPHICS);
 735    unsigned size_per_stage = push_constant_kb / num_stages;
 736
 737    /* Broadwell+ and Haswell gt3 require that the push constant sizes be in
 738     * units of 2KB.  Incidentally, these are the same platforms that have
 739     * 32KB worth of push constant space.
 740     */
 741    if (push_constant_kb == 32)
 742       size_per_stage &= ~1u;
 743
 744    uint32_t kb_used = 0;
 745    for (int i = MESA_SHADER_VERTEX; i < MESA_SHADER_FRAGMENT; i++) {
 746       unsigned push_size = (stages & (1 << i)) ? size_per_stage : 0;
 747       anv_batch_emit(&cmd_buffer->batch,
 748                      GENX(3DSTATE_PUSH_CONSTANT_ALLOC_VS), alloc) {
 749          alloc._3DCommandSubOpcode  = 18 + i;
 750          alloc.ConstantBufferOffset = (push_size > 0) ? kb_used : 0;
 751          alloc.ConstantBufferSize   = push_size;
 752       }
 753       kb_used += push_size;
 754    }
 755
 756    anv_batch_emit(&cmd_buffer->batch,
 757                   GENX(3DSTATE_PUSH_CONSTANT_ALLOC_PS), alloc) {
 758       alloc.ConstantBufferOffset = kb_used;
 759       alloc.ConstantBufferSize = push_constant_kb - kb_used;
 760    }
 761
 762    cmd_buffer->state.push_constant_stages = stages;
 763
 764    /* From the BDW PRM for 3DSTATE_PUSH_CONSTANT_ALLOC_VS:
 765     *
 766     *    "The 3DSTATE_CONSTANT_VS must be reprogrammed prior to
 767     *    the next 3DPRIMITIVE command after programming the
 768     *    3DSTATE_PUSH_CONSTANT_ALLOC_VS"
 769     *
 770     * Since 3DSTATE_PUSH_CONSTANT_ALLOC_VS is programmed as part of
 771     * pipeline setup, we need to dirty push constants.
 772     */
 773    cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_ALL_GRAPHICS;
 774 }
 775
 776 static VkResult
 777 emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
 778                    gl_shader_stage stage,
 779                    struct anv_state *bt_state)
 780 {
 781    struct anv_subpass *subpass = cmd_buffer->state.subpass;
 782    struct anv_pipeline *pipeline;
 783    uint32_t bias, state_offset;
 784
 785    switch (stage) {
 786    case  MESA_SHADER_COMPUTE:
 787       pipeline = cmd_buffer->state.compute_pipeline;
 788       bias = 1;
 789       break;
 790    default:
 791       pipeline = cmd_buffer->state.pipeline;
 792       bias = 0;
 793       break;
 794    }
 795
 796    if (!anv_pipeline_has_stage(pipeline, stage)) {
 797       *bt_state = (struct anv_state) { 0, };
 798       return VK_SUCCESS;
 799    }
 800
 801    struct anv_pipeline_bind_map *map = &pipeline->shaders[stage]->bind_map;
 802    if (bias + map->surface_count == 0) {
 803       *bt_state = (struct anv_state) { 0, };
 804       return VK_SUCCESS;
 805    }
 806
 807    *bt_state = anv_cmd_buffer_alloc_binding_table(cmd_buffer,
 808                                                   bias + map->surface_count,
 809                                                   &state_offset);
 810    uint32_t *bt_map = bt_state->map;
 811
 812    if (bt_state->map == NULL)
 813       return VK_ERROR_OUT_OF_DEVICE_MEMORY;
 814
 815    if (stage == MESA_SHADER_COMPUTE &&
 816        get_cs_prog_data(cmd_buffer->state.compute_pipeline)->uses_num_work_groups) {
 817       struct anv_bo *bo = cmd_buffer->state.num_workgroups_bo;
 818       uint32_t bo_offset = cmd_buffer->state.num_workgroups_offset;
 819
 820       struct anv_state surface_state;
 821       surface_state =
 822          anv_cmd_buffer_alloc_surface_state(cmd_buffer);
 823
 824       const enum isl_format format =
 825          anv_isl_format_for_descriptor_type(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
 826       anv_fill_buffer_surface_state(cmd_buffer->device, surface_state,
 827                                     format, bo_offset, 12, 1);
 828
 829       bt_map[0] = surface_state.offset + state_offset;
 830       anv_cmd_buffer_add_surface_state_reloc(cmd_buffer, surface_state,
 831                                              bo, bo_offset);
 832    }
 833
 834    if (map->surface_count == 0)
 835       goto out;
 836
 837    if (map->image_count > 0) {
 838       VkResult result =
 839          anv_cmd_buffer_ensure_push_constant_field(cmd_buffer, stage, images);
 840       if (result != VK_SUCCESS)
 841          return result;
 842
 843       cmd_buffer->state.push_constants_dirty |= 1 << stage;
 844    }
 845
 846    uint32_t image = 0;
 847    for (uint32_t s = 0; s < map->surface_count; s++) {
 848       struct anv_pipeline_binding *binding = &map->surface_to_descriptor[s];
 849
 850       struct anv_state surface_state;
 851       struct anv_bo *bo;
 852       uint32_t bo_offset;
 853
 854       if (binding->set == ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS) {
 855          /* Color attachment binding */
 856          assert(stage == MESA_SHADER_FRAGMENT);
 857          assert(binding->binding == 0);
 858          if (binding->index < subpass->color_count) {
 859             const unsigned att = subpass->color_attachments[binding->index];
 860             surface_state = cmd_buffer->state.attachments[att].color_rt_state;
 861          } else {
 862             surface_state = cmd_buffer->state.null_surface_state;
 863          }
 864
 865          bt_map[bias + s] = surface_state.offset + state_offset;
 866          continue;
 867       }
 868
 869       struct anv_descriptor_set *set =
 870          cmd_buffer->state.descriptors[binding->set];
 871       uint32_t offset = set->layout->binding[binding->binding].descriptor_index;
 872       struct anv_descriptor *desc = &set->descriptors[offset + binding->index];
 873
 874       switch (desc->type) {
 875       case VK_DESCRIPTOR_TYPE_SAMPLER:
 876          /* Nothing for us to do here */
 877          continue;
 878
 879       case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
 880       case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
 881       case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
 882          surface_state = desc->image_view->sampler_surface_state;
 883          assert(surface_state.alloc_size);
 884          bo = desc->image_view->bo;
 885          bo_offset = desc->image_view->offset;
 886          break;
 887
 888       case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: {
 889          surface_state = desc->image_view->storage_surface_state;
 890          assert(surface_state.alloc_size);
 891          bo = desc->image_view->bo;
 892          bo_offset = desc->image_view->offset;
 893
 894          struct brw_image_param *image_param =
 895             &cmd_buffer->state.push_constants[stage]->images[image++];
 896
 897          *image_param = desc->image_view->storage_image_param;
 898          image_param->surface_idx = bias + s;
 899          break;
 900       }
 901
 902       case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
 903       case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
 904       case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
 905       case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
 906       case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
 907          surface_state = desc->buffer_view->surface_state;
 908          assert(surface_state.alloc_size);
 909          bo = desc->buffer_view->bo;
 910          bo_offset = desc->buffer_view->offset;
 911          break;
 912
 913       case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
 914          surface_state = desc->buffer_view->storage_surface_state;
 915          assert(surface_state.alloc_size);
 916          bo = desc->buffer_view->bo;
 917          bo_offset = desc->buffer_view->offset;
 918
 919          struct brw_image_param *image_param =
 920             &cmd_buffer->state.push_constants[stage]->images[image++];
 921
 922          *image_param = desc->buffer_view->storage_image_param;
 923          image_param->surface_idx = bias + s;
 924          break;
 925
 926       default:
 927          assert(!"Invalid descriptor type");
 928          continue;
 929       }
 930
 931       bt_map[bias + s] = surface_state.offset + state_offset;
 932       anv_cmd_buffer_add_surface_state_reloc(cmd_buffer, surface_state,
 933                                              bo, bo_offset);
 934    }
 935    assert(image == map->image_count);
 936
 937  out:
 938    if (!cmd_buffer->device->info.has_llc)
 939       anv_state_clflush(*bt_state);
 940
 941    return VK_SUCCESS;
 942 }
 943
 944 static VkResult
 945 emit_samplers(struct anv_cmd_buffer *cmd_buffer,
 946               gl_shader_stage stage,
 947               struct anv_state *state)
 948 {
 949    struct anv_pipeline *pipeline;
 950
 951    if (stage == MESA_SHADER_COMPUTE)
 952       pipeline = cmd_buffer->state.compute_pipeline;
 953    else
 954       pipeline = cmd_buffer->state.pipeline;
 955
 956    if (!anv_pipeline_has_stage(pipeline, stage)) {
 957       *state = (struct anv_state) { 0, };
 958       return VK_SUCCESS;
 959    }
 960
 961    struct anv_pipeline_bind_map *map = &pipeline->shaders[stage]->bind_map;
 962    if (map->sampler_count == 0) {
 963       *state = (struct anv_state) { 0, };
 964       return VK_SUCCESS;
 965    }
 966
 967    uint32_t size = map->sampler_count * 16;
 968    *state = anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, size, 32);
 969
 970    if (state->map == NULL)
 971       return VK_ERROR_OUT_OF_DEVICE_MEMORY;
 972
 973    for (uint32_t s = 0; s < map->sampler_count; s++) {
 974       struct anv_pipeline_binding *binding = &map->sampler_to_descriptor[s];
 975       struct anv_descriptor_set *set =
 976          cmd_buffer->state.descriptors[binding->set];
 977       uint32_t offset = set->layout->binding[binding->binding].descriptor_index;
 978       struct anv_descriptor *desc = &set->descriptors[offset + binding->index];
 979
 980       if (desc->type != VK_DESCRIPTOR_TYPE_SAMPLER &&
 981           desc->type != VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER)
 982          continue;
 983
 984       struct anv_sampler *sampler = desc->sampler;
 985
 986       /* This can happen if we have an unfilled slot since TYPE_SAMPLER
 987        * happens to be zero.
 988        */
 989       if (sampler == NULL)
 990          continue;
 991
 992       memcpy(state->map + (s * 16),
 993              sampler->state, sizeof(sampler->state));
 994    }
 995
 996    if (!cmd_buffer->device->info.has_llc)
 997       anv_state_clflush(*state);
 998
 999    return VK_SUCCESS;
1000 }
1001
1002 static uint32_t
1003 flush_descriptor_sets(struct anv_cmd_buffer *cmd_buffer)
1004 {
1005    VkShaderStageFlags dirty = cmd_buffer->state.descriptors_dirty &
1006                               cmd_buffer->state.pipeline->active_stages;
1007
1008    VkResult result = VK_SUCCESS;
1009    anv_foreach_stage(s, dirty) {
1010       result = emit_samplers(cmd_buffer, s, &cmd_buffer->state.samplers[s]);
1011       if (result != VK_SUCCESS)
1012          break;
1013       result = emit_binding_table(cmd_buffer, s,
1014                                   &cmd_buffer->state.binding_tables[s]);
1015       if (result != VK_SUCCESS)
1016          break;
1017    }
1018
1019    if (result != VK_SUCCESS) {
1020       assert(result == VK_ERROR_OUT_OF_DEVICE_MEMORY);
1021
1022       result = anv_cmd_buffer_new_binding_table_block(cmd_buffer);
1023       assert(result == VK_SUCCESS);
1024
1025       /* Re-emit state base addresses so we get the new surface state base
1026        * address before we start emitting binding tables etc.
1027        */
1028       genX(cmd_buffer_emit_state_base_address)(cmd_buffer);
1029
1030       /* Re-emit all active binding tables */
1031       dirty |= cmd_buffer->state.pipeline->active_stages;
1032       anv_foreach_stage(s, dirty) {
1033          result = emit_samplers(cmd_buffer, s, &cmd_buffer->state.samplers[s]);
1034          if (result != VK_SUCCESS)
1035             return result;
1036          result = emit_binding_table(cmd_buffer, s,
1037                                      &cmd_buffer->state.binding_tables[s]);
1038          if (result != VK_SUCCESS)
1039             return result;
1040       }
1041    }
1042
1043    cmd_buffer->state.descriptors_dirty &= ~dirty;
1044
1045    return dirty;
1046 }
1047
1048 static void
1049 cmd_buffer_emit_descriptor_pointers(struct anv_cmd_buffer *cmd_buffer,
1050                                     uint32_t stages)
1051 {
1052    static const uint32_t sampler_state_opcodes[] = {
1053       [MESA_SHADER_VERTEX]                      = 43,
1054       [MESA_SHADER_TESS_CTRL]                   = 44, /* HS */
1055       [MESA_SHADER_TESS_EVAL]                   = 45, /* DS */
1056       [MESA_SHADER_GEOMETRY]                    = 46,
1057       [MESA_SHADER_FRAGMENT]                    = 47,
1058       [MESA_SHADER_COMPUTE]                     = 0,
1059    };
1060
1061    static const uint32_t binding_table_opcodes[] = {
1062       [MESA_SHADER_VERTEX]                      = 38,
1063       [MESA_SHADER_TESS_CTRL]                   = 39,
1064       [MESA_SHADER_TESS_EVAL]                   = 40,
1065       [MESA_SHADER_GEOMETRY]                    = 41,
1066       [MESA_SHADER_FRAGMENT]                    = 42,
1067       [MESA_SHADER_COMPUTE]                     = 0,
1068    };
1069
1070    anv_foreach_stage(s, stages) {
1071       if (cmd_buffer->state.samplers[s].alloc_size > 0) {
1072          anv_batch_emit(&cmd_buffer->batch,
1073                         GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ssp) {
1074             ssp._3DCommandSubOpcode = sampler_state_opcodes[s];
1075             ssp.PointertoVSSamplerState = cmd_buffer->state.samplers[s].offset;
1076          }
1077       }
1078
1079       /* Always emit binding table pointers if we're asked to, since on SKL
1080        * this is what flushes push constants. */
1081       anv_batch_emit(&cmd_buffer->batch,
1082                      GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), btp) {
1083          btp._3DCommandSubOpcode = binding_table_opcodes[s];
1084          btp.PointertoVSBindingTable = cmd_buffer->state.binding_tables[s].offset;
1085       }
1086    }
1087 }
1088
1089 static uint32_t
1090 cmd_buffer_flush_push_constants(struct anv_cmd_buffer *cmd_buffer)
1091 {
1092    static const uint32_t push_constant_opcodes[] = {
1093       [MESA_SHADER_VERTEX]                      = 21,
1094       [MESA_SHADER_TESS_CTRL]                   = 25, /* HS */
1095       [MESA_SHADER_TESS_EVAL]                   = 26, /* DS */
1096       [MESA_SHADER_GEOMETRY]                    = 22,
1097       [MESA_SHADER_FRAGMENT]                    = 23,
1098       [MESA_SHADER_COMPUTE]                     = 0,
1099    };
1100
1101    VkShaderStageFlags flushed = 0;
1102
1103    anv_foreach_stage(stage, cmd_buffer->state.push_constants_dirty) {
1104       if (stage == MESA_SHADER_COMPUTE)
1105          continue;
1106
1107       struct anv_state state = anv_cmd_buffer_push_constants(cmd_buffer, stage);
1108
1109       if (state.offset == 0) {
1110          anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_VS), c)
1111             c._3DCommandSubOpcode = push_constant_opcodes[stage];
1112       } else {
1113          anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_VS), c) {
1114             c._3DCommandSubOpcode = push_constant_opcodes[stage],
1115             c.ConstantBody = (struct GENX(3DSTATE_CONSTANT_BODY)) {
1116 #if GEN_GEN >= 9
1117                .PointerToConstantBuffer2 = { &cmd_buffer->device->dynamic_state_block_pool.bo, state.offset },
1118                .ConstantBuffer2ReadLength = DIV_ROUND_UP(state.alloc_size, 32),
1119 #else
1120                .PointerToConstantBuffer0 = { .offset = state.offset },
1121                .ConstantBuffer0ReadLength = DIV_ROUND_UP(state.alloc_size, 32),
1122 #endif
1123             };
1124          }
1125       }
1126
1127       flushed |= mesa_to_vk_shader_stage(stage);
1128    }
1129
1130    cmd_buffer->state.push_constants_dirty &= ~VK_SHADER_STAGE_ALL_GRAPHICS;
1131
1132    return flushed;
1133 }
1134
1135 void
1136 genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer)
1137 {
1138    struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
1139    uint32_t *p;
1140
1141    uint32_t vb_emit = cmd_buffer->state.vb_dirty & pipeline->vb_used;
1142
1143    assert((pipeline->active_stages & VK_SHADER_STAGE_COMPUTE_BIT) == 0);
1144
1145    genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->urb.l3_config);
1146
1147    genX(flush_pipeline_select_3d)(cmd_buffer);
1148
1149    if (vb_emit) {
1150       const uint32_t num_buffers = __builtin_popcount(vb_emit);
1151       const uint32_t num_dwords = 1 + num_buffers * 4;
1152
1153       p = anv_batch_emitn(&cmd_buffer->batch, num_dwords,
1154                           GENX(3DSTATE_VERTEX_BUFFERS));
1155       uint32_t vb, i = 0;
1156       for_each_bit(vb, vb_emit) {
1157          struct anv_buffer *buffer = cmd_buffer->state.vertex_bindings[vb].buffer;
1158          uint32_t offset = cmd_buffer->state.vertex_bindings[vb].offset;
1159
1160          struct GENX(VERTEX_BUFFER_STATE) state = {
1161             .VertexBufferIndex = vb,
1162
1163 #if GEN_GEN >= 8
1164             .MemoryObjectControlState = GENX(MOCS),
1165 #else
1166             .BufferAccessType = pipeline->instancing_enable[vb] ? INSTANCEDATA : VERTEXDATA,
1167             .InstanceDataStepRate = 1,
1168             .VertexBufferMemoryObjectControlState = GENX(MOCS),
1169 #endif
1170
1171             .AddressModifyEnable = true,
1172             .BufferPitch = pipeline->binding_stride[vb],
1173             .BufferStartingAddress = { buffer->bo, buffer->offset + offset },
1174
1175 #if GEN_GEN >= 8
1176             .BufferSize = buffer->size - offset
1177 #else
1178             .EndAddress = { buffer->bo, buffer->offset + buffer->size - 1},
1179 #endif
1180          };
1181
1182          GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, &p[1 + i * 4], &state);
1183          i++;
1184       }
1185    }
1186
1187    cmd_buffer->state.vb_dirty &= ~vb_emit;
1188
1189    if (cmd_buffer->state.dirty & ANV_CMD_DIRTY_PIPELINE) {
1190       anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->batch);
1191
1192       /* The exact descriptor layout is pulled from the pipeline, so we need
1193        * to re-emit binding tables on every pipeline change.
1194        */
1195       cmd_buffer->state.descriptors_dirty |=
1196          cmd_buffer->state.pipeline->active_stages;
1197
1198       /* If the pipeline changed, we may need to re-allocate push constant
1199        * space in the URB.
1200        */
1201       cmd_buffer_alloc_push_constants(cmd_buffer);
1202    }
1203
1204 #if GEN_GEN <= 7
1205    if (cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_VERTEX_BIT ||
1206        cmd_buffer->state.push_constants_dirty & VK_SHADER_STAGE_VERTEX_BIT) {
1207       /* From the IVB PRM Vol. 2, Part 1, Section 3.2.1:
1208        *
1209        *    "A PIPE_CONTROL with Post-Sync Operation set to 1h and a depth
1210        *    stall needs to be sent just prior to any 3DSTATE_VS,
1211        *    3DSTATE_URB_VS, 3DSTATE_CONSTANT_VS,
1212        *    3DSTATE_BINDING_TABLE_POINTER_VS,
1213        *    3DSTATE_SAMPLER_STATE_POINTER_VS command.  Only one
1214        *    PIPE_CONTROL needs to be sent before any combination of VS
1215        *    associated 3DSTATE."
1216        */
1217       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1218          pc.DepthStallEnable  = true;
1219          pc.PostSyncOperation = WriteImmediateData;
1220          pc.Address           =
1221             (struct anv_address) { &cmd_buffer->device->workaround_bo, 0 };
1222       }
1223    }
1224 #endif
1225
1226    /* Render targets live in the same binding table as fragment descriptors */
1227    if (cmd_buffer->state.dirty & ANV_CMD_DIRTY_RENDER_TARGETS)
1228       cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT;
1229
1230    /* We emit the binding tables and sampler tables first, then emit push
1231     * constants and then finally emit binding table and sampler table
1232     * pointers.  It has to happen in this order, since emitting the binding
1233     * tables may change the push constants (in case of storage images). After
1234     * emitting push constants, on SKL+ we have to emit the corresponding
1235     * 3DSTATE_BINDING_TABLE_POINTER_* for the push constants to take effect.
1236     */
1237    uint32_t dirty = 0;
1238    if (cmd_buffer->state.descriptors_dirty)
1239       dirty = flush_descriptor_sets(cmd_buffer);
1240
1241    if (cmd_buffer->state.push_constants_dirty) {
1242 #if GEN_GEN >= 9
1243       /* On Sky Lake and later, the binding table pointers commands are
1244        * what actually flush the changes to push constant state so we need
1245        * to dirty them so they get re-emitted below.
1246        */
1247       dirty |= cmd_buffer_flush_push_constants(cmd_buffer);
1248 #else
1249       cmd_buffer_flush_push_constants(cmd_buffer);
1250 #endif
1251    }
1252
1253    if (dirty)
1254       cmd_buffer_emit_descriptor_pointers(cmd_buffer, dirty);
1255
1256    if (cmd_buffer->state.dirty & ANV_CMD_DIRTY_DYNAMIC_VIEWPORT)
1257       gen8_cmd_buffer_emit_viewport(cmd_buffer);
1258
1259    if (cmd_buffer->state.dirty & (ANV_CMD_DIRTY_DYNAMIC_VIEWPORT |
1260                                   ANV_CMD_DIRTY_PIPELINE)) {
1261       gen8_cmd_buffer_emit_depth_viewport(cmd_buffer,
1262                                           pipeline->depth_clamp_enable);
1263    }
1264
1265    if (cmd_buffer->state.dirty & ANV_CMD_DIRTY_DYNAMIC_SCISSOR)
1266       gen7_cmd_buffer_emit_scissor(cmd_buffer);
1267
1268    genX(cmd_buffer_flush_dynamic_state)(cmd_buffer);
1269
1270    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
1271 }
1272
1273 static void
1274 emit_base_vertex_instance_bo(struct anv_cmd_buffer *cmd_buffer,
1275                              struct anv_bo *bo, uint32_t offset)
1276 {
1277    uint32_t *p = anv_batch_emitn(&cmd_buffer->batch, 5,
1278                                  GENX(3DSTATE_VERTEX_BUFFERS));
1279
1280    GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, p + 1,
1281       &(struct GENX(VERTEX_BUFFER_STATE)) {
1282          .VertexBufferIndex = 32, /* Reserved for this */
1283          .AddressModifyEnable = true,
1284          .BufferPitch = 0,
1285 #if (GEN_GEN >= 8)
1286          .MemoryObjectControlState = GENX(MOCS),
1287          .BufferStartingAddress = { bo, offset },
1288          .BufferSize = 8
1289 #else
1290          .VertexBufferMemoryObjectControlState = GENX(MOCS),
1291          .BufferStartingAddress = { bo, offset },
1292          .EndAddress = { bo, offset + 8 },
1293 #endif
1294       });
1295 }
1296
1297 static void
1298 emit_base_vertex_instance(struct anv_cmd_buffer *cmd_buffer,
1299                           uint32_t base_vertex, uint32_t base_instance)
1300 {
1301    struct anv_state id_state =
1302       anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 8, 4);
1303
1304    ((uint32_t *)id_state.map)[0] = base_vertex;
1305    ((uint32_t *)id_state.map)[1] = base_instance;
1306
1307    if (!cmd_buffer->device->info.has_llc)
1308       anv_state_clflush(id_state);
1309
1310    emit_base_vertex_instance_bo(cmd_buffer,
1311       &cmd_buffer->device->dynamic_state_block_pool.bo, id_state.offset);
1312 }
1313
1314 void genX(CmdDraw)(
1315     VkCommandBuffer                             commandBuffer,
1316     uint32_t                                    vertexCount,
1317     uint32_t                                    instanceCount,
1318     uint32_t                                    firstVertex,
1319     uint32_t                                    firstInstance)
1320 {
1321    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1322    struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
1323    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
1324
1325    genX(cmd_buffer_flush_state)(cmd_buffer);
1326
1327    if (vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance)
1328       emit_base_vertex_instance(cmd_buffer, firstVertex, firstInstance);
1329
1330    anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
1331       prim.VertexAccessType         = SEQUENTIAL;
1332       prim.PrimitiveTopologyType    = pipeline->topology;
1333       prim.VertexCountPerInstance   = vertexCount;
1334       prim.StartVertexLocation      = firstVertex;
1335       prim.InstanceCount            = instanceCount;
1336       prim.StartInstanceLocation    = firstInstance;
1337       prim.BaseVertexLocation       = 0;
1338    }
1339 }
1340
1341 void genX(CmdDrawIndexed)(
1342     VkCommandBuffer                             commandBuffer,
1343     uint32_t                                    indexCount,
1344     uint32_t                                    instanceCount,
1345     uint32_t                                    firstIndex,
1346     int32_t                                     vertexOffset,
1347     uint32_t                                    firstInstance)
1348 {
1349    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1350    struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
1351    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
1352
1353    genX(cmd_buffer_flush_state)(cmd_buffer);
1354
1355    if (vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance)
1356       emit_base_vertex_instance(cmd_buffer, vertexOffset, firstInstance);
1357
1358    anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
1359       prim.VertexAccessType         = RANDOM;
1360       prim.PrimitiveTopologyType    = pipeline->topology;
1361       prim.VertexCountPerInstance   = indexCount;
1362       prim.StartVertexLocation      = firstIndex;
1363       prim.InstanceCount            = instanceCount;
1364       prim.StartInstanceLocation    = firstInstance;
1365       prim.BaseVertexLocation       = vertexOffset;
1366    }
1367 }
1368
/* Auto-Draw / Indirect Registers
 *
 * Register offsets that the indirect draw entry points in this file load
 * (from memory or with an immediate) right before emitting a 3DPRIMITIVE
 * with IndirectParameterEnable set.
 */
#define GEN7_3DPRIM_END_OFFSET      0x2420
#define GEN7_3DPRIM_START_VERTEX    0x2430
#define GEN7_3DPRIM_VERTEX_COUNT    0x2434
#define GEN7_3DPRIM_INSTANCE_COUNT  0x2438
#define GEN7_3DPRIM_START_INSTANCE  0x243c
#define GEN7_3DPRIM_BASE_VERTEX     0x2440
1376
1377 void genX(CmdDrawIndirect)(
1378     VkCommandBuffer                             commandBuffer,
1379     VkBuffer                                    _buffer,
1380     VkDeviceSize                                offset,
1381     uint32_t                                    drawCount,
1382     uint32_t                                    stride)
1383 {
1384    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1385    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
1386    struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
1387    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
1388    struct anv_bo *bo = buffer->bo;
1389    uint32_t bo_offset = buffer->offset + offset;
1390
1391    genX(cmd_buffer_flush_state)(cmd_buffer);
1392
1393    if (vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance)
1394       emit_base_vertex_instance_bo(cmd_buffer, bo, bo_offset + 8);
1395
1396    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_VERTEX_COUNT, bo, bo_offset);
1397    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset + 4);
1398    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_VERTEX, bo, bo_offset + 8);
1399    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_INSTANCE, bo, bo_offset + 12);
1400    emit_lri(&cmd_buffer->batch, GEN7_3DPRIM_BASE_VERTEX, 0);
1401
1402    anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
1403       prim.IndirectParameterEnable  = true;
1404       prim.VertexAccessType         = SEQUENTIAL;
1405       prim.PrimitiveTopologyType    = pipeline->topology;
1406    }
1407 }
1408
1409 void genX(CmdDrawIndexedIndirect)(
1410     VkCommandBuffer                             commandBuffer,
1411     VkBuffer                                    _buffer,
1412     VkDeviceSize                                offset,
1413     uint32_t                                    drawCount,
1414     uint32_t                                    stride)
1415 {
1416    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1417    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
1418    struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
1419    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
1420    struct anv_bo *bo = buffer->bo;
1421    uint32_t bo_offset = buffer->offset + offset;
1422
1423    genX(cmd_buffer_flush_state)(cmd_buffer);
1424
1425    /* TODO: We need to stomp base vertex to 0 somehow */
1426    if (vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance)
1427       emit_base_vertex_instance_bo(cmd_buffer, bo, bo_offset + 12);
1428
1429    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_VERTEX_COUNT, bo, bo_offset);
1430    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset + 4);
1431    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_VERTEX, bo, bo_offset + 8);
1432    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_BASE_VERTEX, bo, bo_offset + 12);
1433    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_INSTANCE, bo, bo_offset + 16);
1434
1435    anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
1436       prim.IndirectParameterEnable  = true;
1437       prim.VertexAccessType         = RANDOM;
1438       prim.PrimitiveTopologyType    = pipeline->topology;
1439    }
1440 }
1441
1442 static VkResult
1443 flush_compute_descriptor_set(struct anv_cmd_buffer *cmd_buffer)
1444 {
1445    struct anv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
1446    struct anv_state surfaces = { 0, }, samplers = { 0, };
1447    VkResult result;
1448
1449    result = emit_samplers(cmd_buffer, MESA_SHADER_COMPUTE, &samplers);
1450    if (result != VK_SUCCESS)
1451       return result;
1452    result = emit_binding_table(cmd_buffer, MESA_SHADER_COMPUTE, &surfaces);
1453    if (result != VK_SUCCESS)
1454       return result;
1455
1456    struct anv_state push_state = anv_cmd_buffer_cs_push_constants(cmd_buffer);
1457
1458    if (push_state.alloc_size) {
1459       anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_CURBE_LOAD), curbe) {
1460          curbe.CURBETotalDataLength    = push_state.alloc_size;
1461          curbe.CURBEDataStartAddress   = push_state.offset;
1462       }
1463    }
1464
1465    uint32_t iface_desc_data_dw[GENX(INTERFACE_DESCRIPTOR_DATA_length)];
1466    struct GENX(INTERFACE_DESCRIPTOR_DATA) desc = {
1467       .BindingTablePointer = surfaces.offset,
1468       .SamplerStatePointer = samplers.offset,
1469    };
1470    GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL, iface_desc_data_dw, &desc);
1471
1472    struct anv_state state =
1473       anv_cmd_buffer_merge_dynamic(cmd_buffer, iface_desc_data_dw,
1474                                    pipeline->interface_descriptor_data,
1475                                    GENX(INTERFACE_DESCRIPTOR_DATA_length),
1476                                    64);
1477
1478    uint32_t size = GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
1479    anv_batch_emit(&cmd_buffer->batch,
1480                   GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), mid) {
1481       mid.InterfaceDescriptorTotalLength        = size;
1482       mid.InterfaceDescriptorDataStartAddress   = state.offset;
1483    }
1484
1485    return VK_SUCCESS;
1486 }
1487
1488 void
1489 genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer)
1490 {
1491    struct anv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
1492    MAYBE_UNUSED VkResult result;
1493
1494    assert(pipeline->active_stages == VK_SHADER_STAGE_COMPUTE_BIT);
1495
1496    genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->urb.l3_config);
1497
1498    genX(flush_pipeline_select_gpgpu)(cmd_buffer);
1499
1500    if (cmd_buffer->state.compute_dirty & ANV_CMD_DIRTY_PIPELINE)
1501       anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->batch);
1502
1503    if ((cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_COMPUTE_BIT) ||
1504        (cmd_buffer->state.compute_dirty & ANV_CMD_DIRTY_PIPELINE)) {
1505       /* FIXME: figure out descriptors for gen7 */
1506       result = flush_compute_descriptor_set(cmd_buffer);
1507       assert(result == VK_SUCCESS);
1508       cmd_buffer->state.descriptors_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT;
1509    }
1510
1511    cmd_buffer->state.compute_dirty = 0;
1512
1513    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
1514 }
1515
1516 #if GEN_GEN == 7
1517
1518 static bool
1519 verify_cmd_parser(const struct anv_device *device,
1520                   int required_version,
1521                   const char *function)
1522 {
1523    if (device->instance->physicalDevice.cmd_parser_version < required_version) {
1524       vk_errorf(VK_ERROR_FEATURE_NOT_PRESENT,
1525                 "cmd parser version %d is required for %s",
1526                 required_version, function);
1527       return false;
1528    } else {
1529       return true;
1530    }
1531 }
1532
1533 #endif
1534
1535 void genX(CmdDispatch)(
1536     VkCommandBuffer                             commandBuffer,
1537     uint32_t                                    x,
1538     uint32_t                                    y,
1539     uint32_t                                    z)
1540 {
1541    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1542    struct anv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
1543    const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
1544
1545    if (prog_data->uses_num_work_groups) {
1546       struct anv_state state =
1547          anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 12, 4);
1548       uint32_t *sizes = state.map;
1549       sizes[0] = x;
1550       sizes[1] = y;
1551       sizes[2] = z;
1552       if (!cmd_buffer->device->info.has_llc)
1553          anv_state_clflush(state);
1554       cmd_buffer->state.num_workgroups_offset = state.offset;
1555       cmd_buffer->state.num_workgroups_bo =
1556          &cmd_buffer->device->dynamic_state_block_pool.bo;
1557    }
1558
1559    genX(cmd_buffer_flush_compute_state)(cmd_buffer);
1560
1561    anv_batch_emit(&cmd_buffer->batch, GENX(GPGPU_WALKER), ggw) {
1562       ggw.SIMDSize                     = prog_data->simd_size / 16;
1563       ggw.ThreadDepthCounterMaximum    = 0;
1564       ggw.ThreadHeightCounterMaximum   = 0;
1565       ggw.ThreadWidthCounterMaximum    = prog_data->threads - 1;
1566       ggw.ThreadGroupIDXDimension      = x;
1567       ggw.ThreadGroupIDYDimension      = y;
1568       ggw.ThreadGroupIDZDimension      = z;
1569       ggw.RightExecutionMask           = pipeline->cs_right_mask;
1570       ggw.BottomExecutionMask          = 0xffffffff;
1571    }
1572
1573    anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_STATE_FLUSH), msf);
1574 }
1575
/* Register offsets for the X/Y/Z indirect dispatch dimensions.
 * CmdDispatchIndirect loads each one from the indirect buffer with
 * emit_lrm().
 */
#define GPGPU_DISPATCHDIMX 0x2500
#define GPGPU_DISPATCHDIMY 0x2504
#define GPGPU_DISPATCHDIMZ 0x2508

/* Register offsets of the two MI_PREDICATE source operands.
 * CmdDispatchIndirect treats each as 64 bits wide, written as two dwords at
 * +0 and +4.
 */
#define MI_PREDICATE_SRC0  0x2400
#define MI_PREDICATE_SRC1  0x2408
1582
1583 void genX(CmdDispatchIndirect)(
1584     VkCommandBuffer                             commandBuffer,
1585     VkBuffer                                    _buffer,
1586     VkDeviceSize                                offset)
1587 {
1588    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1589    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
1590    struct anv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
1591    const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
1592    struct anv_bo *bo = buffer->bo;
1593    uint32_t bo_offset = buffer->offset + offset;
1594    struct anv_batch *batch = &cmd_buffer->batch;
1595
1596 #if GEN_GEN == 7
1597    /* Linux 4.4 added command parser version 5 which allows the GPGPU
1598     * indirect dispatch registers to be written.
1599     */
1600    if (!verify_cmd_parser(cmd_buffer->device, 5, "vkCmdDispatchIndirect"))
1601       return;
1602 #endif
1603
1604    if (prog_data->uses_num_work_groups) {
1605       cmd_buffer->state.num_workgroups_offset = bo_offset;
1606       cmd_buffer->state.num_workgroups_bo = bo;
1607    }
1608
1609    genX(cmd_buffer_flush_compute_state)(cmd_buffer);
1610
1611    emit_lrm(batch, GPGPU_DISPATCHDIMX, bo, bo_offset);
1612    emit_lrm(batch, GPGPU_DISPATCHDIMY, bo, bo_offset + 4);
1613    emit_lrm(batch, GPGPU_DISPATCHDIMZ, bo, bo_offset + 8);
1614
1615 #if GEN_GEN <= 7
1616    /* Clear upper 32-bits of SRC0 and all 64-bits of SRC1 */
1617    emit_lri(batch, MI_PREDICATE_SRC0 + 4, 0);
1618    emit_lri(batch, MI_PREDICATE_SRC1 + 0, 0);
1619    emit_lri(batch, MI_PREDICATE_SRC1 + 4, 0);
1620
1621    /* Load compute_dispatch_indirect_x_size into SRC0 */
1622    emit_lrm(batch, MI_PREDICATE_SRC0, bo, bo_offset + 0);
1623
1624    /* predicate = (compute_dispatch_indirect_x_size == 0); */
1625    anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
1626       mip.LoadOperation    = LOAD_LOAD;
1627       mip.CombineOperation = COMBINE_SET;
1628       mip.CompareOperation = COMPARE_SRCS_EQUAL;
1629    }
1630
1631    /* Load compute_dispatch_indirect_y_size into SRC0 */
1632    emit_lrm(batch, MI_PREDICATE_SRC0, bo, bo_offset + 4);
1633
1634    /* predicate |= (compute_dispatch_indirect_y_size == 0); */
1635    anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
1636       mip.LoadOperation    = LOAD_LOAD;
1637       mip.CombineOperation = COMBINE_OR;
1638       mip.CompareOperation = COMPARE_SRCS_EQUAL;
1639    }
1640
1641    /* Load compute_dispatch_indirect_z_size into SRC0 */
1642    emit_lrm(batch, MI_PREDICATE_SRC0, bo, bo_offset + 8);
1643
1644    /* predicate |= (compute_dispatch_indirect_z_size == 0); */
1645    anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
1646       mip.LoadOperation    = LOAD_LOAD;
1647       mip.CombineOperation = COMBINE_OR;
1648       mip.CompareOperation = COMPARE_SRCS_EQUAL;
1649    }
1650
1651    /* predicate = !predicate; */
1652 #define COMPARE_FALSE                           1
1653    anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
1654       mip.LoadOperation    = LOAD_LOADINV;
1655       mip.CombineOperation = COMBINE_OR;
1656       mip.CompareOperation = COMPARE_FALSE;
1657    }
1658 #endif
1659
1660    anv_batch_emit(batch, GENX(GPGPU_WALKER), ggw) {
1661       ggw.IndirectParameterEnable      = true;
1662       ggw.PredicateEnable              = GEN_GEN <= 7;
1663       ggw.SIMDSize                     = prog_data->simd_size / 16;
1664       ggw.ThreadDepthCounterMaximum    = 0;
1665       ggw.ThreadHeightCounterMaximum   = 0;
1666       ggw.ThreadWidthCounterMaximum    = prog_data->threads - 1;
1667       ggw.RightExecutionMask           = pipeline->cs_right_mask;
1668       ggw.BottomExecutionMask          = 0xffffffff;
1669    }
1670
1671    anv_batch_emit(batch, GENX(MEDIA_STATE_FLUSH), msf);
1672 }
1673
1674 static void
1675 flush_pipeline_before_pipeline_select(struct anv_cmd_buffer *cmd_buffer,
1676                                       uint32_t pipeline)
1677 {
1678 #if GEN_GEN >= 8 && GEN_GEN < 10
1679    /* From the Broadwell PRM, Volume 2a: Instructions, PIPELINE_SELECT:
1680     *
1681     *   Software must clear the COLOR_CALC_STATE Valid field in
1682     *   3DSTATE_CC_STATE_POINTERS command prior to send a PIPELINE_SELECT
1683     *   with Pipeline Select set to GPGPU.
1684     *
1685     * The internal hardware docs recommend the same workaround for Gen9
1686     * hardware too.
1687     */
1688    if (pipeline == GPGPU)
1689       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CC_STATE_POINTERS), t);
1690 #elif GEN_GEN <= 7
1691       /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
1692        * PIPELINE_SELECT [DevBWR+]":
1693        *
1694        *   Project: DEVSNB+
1695        *
1696        *   Software must ensure all the write caches are flushed through a
1697        *   stalling PIPE_CONTROL command followed by another PIPE_CONTROL
1698        *   command to invalidate read only caches prior to programming
1699        *   MI_PIPELINE_SELECT command to change the Pipeline Select Mode.
1700        */
1701       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1702          pc.RenderTargetCacheFlushEnable  = true;
1703          pc.DepthCacheFlushEnable         = true;
1704          pc.DCFlushEnable                 = true;
1705          pc.PostSyncOperation             = NoWrite;
1706          pc.CommandStreamerStallEnable    = true;
1707       }
1708
1709       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1710          pc.TextureCacheInvalidationEnable   = true;
1711          pc.ConstantCacheInvalidationEnable  = true;
1712          pc.StateCacheInvalidationEnable     = true;
1713          pc.InstructionCacheInvalidateEnable = true;
1714          pc.PostSyncOperation                = NoWrite;
1715       }
1716 #endif
1717 }
1718
1719 void
1720 genX(flush_pipeline_select_3d)(struct anv_cmd_buffer *cmd_buffer)
1721 {
1722    if (cmd_buffer->state.current_pipeline != _3D) {
1723       flush_pipeline_before_pipeline_select(cmd_buffer, _3D);
1724
1725       anv_batch_emit(&cmd_buffer->batch, GENX(PIPELINE_SELECT), ps) {
1726 #if GEN_GEN >= 9
1727          ps.MaskBits = 3;
1728 #endif
1729          ps.PipelineSelection = _3D;
1730       }
1731
1732       cmd_buffer->state.current_pipeline = _3D;
1733    }
1734 }
1735
1736 void
1737 genX(flush_pipeline_select_gpgpu)(struct anv_cmd_buffer *cmd_buffer)
1738 {
1739    if (cmd_buffer->state.current_pipeline != GPGPU) {
1740       flush_pipeline_before_pipeline_select(cmd_buffer, GPGPU);
1741
1742       anv_batch_emit(&cmd_buffer->batch, GENX(PIPELINE_SELECT), ps) {
1743 #if GEN_GEN >= 9
1744          ps.MaskBits = 3;
1745 #endif
1746          ps.PipelineSelection = GPGPU;
1747       }
1748
1749       cmd_buffer->state.current_pipeline = GPGPU;
1750    }
1751 }
1752
/* Emit the depth/stencil buffer state for the current subpass.
 *
 * Always emits, in this order: 3DSTATE_DEPTH_BUFFER,
 * 3DSTATE_HIER_DEPTH_BUFFER, 3DSTATE_STENCIL_BUFFER and
 * 3DSTATE_CLEAR_PARAMS.  Each packet is programmed from the subpass'
 * depth/stencil image view when the corresponding aspect (or HiZ surface)
 * exists, and is otherwise emitted with no surface programmed.
 */
static void
cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer)
{
   struct anv_device *device = cmd_buffer->device;
   const struct anv_framebuffer *fb = cmd_buffer->state.framebuffer;
   const struct anv_image_view *iview =
      anv_cmd_buffer_get_depth_stencil_view(cmd_buffer);
   /* image is NULL when there is no depth/stencil view; every has_* flag
    * below is then false.
    */
   const struct anv_image *image = iview ? iview->image : NULL;
   const bool has_depth = image && (image->aspects & VK_IMAGE_ASPECT_DEPTH_BIT);
   const bool has_hiz = image != NULL && anv_image_has_hiz(image);
   const bool has_stencil =
      image && (image->aspects & VK_IMAGE_ASPECT_STENCIL_BIT);

   /* FIXME: Implement the PMA stall W/A */
   /* FIXME: Width and Height are wrong */

   /* Emit 3DSTATE_DEPTH_BUFFER */
   if (has_depth) {
      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_DEPTH_BUFFER), db) {
         db.SurfaceType                   = SURFTYPE_2D;
         db.DepthWriteEnable              = true;
         db.StencilWriteEnable            = has_stencil;

         /* HiZ is only enabled for single-subpass render passes; with
          * several subpasses the enable bit is not set at all.
          */
         if (cmd_buffer->state.pass->subpass_count == 1) {
            db.HierarchicalDepthBufferEnable = has_hiz;
         } else {
            anv_finishme("Multiple-subpass HiZ not implemented");
         }

         db.SurfaceFormat = isl_surf_get_depth_format(&device->isl_dev,
                                                      &image->depth_surface.isl);

         db.SurfaceBaseAddress = (struct anv_address) {
            .bo = image->bo,
            .offset = image->offset + image->depth_surface.offset,
         };
         db.DepthBufferObjectControlState = GENX(MOCS);

         /* Size fields are programmed as "value minus one". */
         db.SurfacePitch         = image->depth_surface.isl.row_pitch - 1;
         db.Height               = image->extent.height - 1;
         db.Width                = image->extent.width - 1;
         db.LOD                  = iview->isl.base_level;
         db.Depth                = image->array_size - 1; /* FIXME: 3-D */
         db.MinimumArrayElement  = iview->isl.base_array_layer;

#if GEN_GEN >= 8
         db.SurfaceQPitch =
            isl_surf_get_array_pitch_el_rows(&image->depth_surface.isl) >> 2;
#endif
         db.RenderTargetViewExtent = 1 - 1;
      }
   } else {
      /* Even when no depth buffer is present, the hardware requires that
       * 3DSTATE_DEPTH_BUFFER be programmed correctly. The Broadwell PRM says:
       *
       *    If a null depth buffer is bound, the driver must instead bind depth as:
       *       3DSTATE_DEPTH.SurfaceType = SURFTYPE_2D
       *       3DSTATE_DEPTH.Width = 1
       *       3DSTATE_DEPTH.Height = 1
       *       3DSTATE_DEPTH.SurfaceFormat = D16_UNORM
       *       3DSTATE_DEPTH.SurfaceBaseAddress = 0
       *       3DSTATE_DEPTH.HierarchicalDepthBufferEnable = 0
       *       3DSTATE_WM_DEPTH_STENCIL.DepthTestEnable = 0
       *       3DSTATE_WM_DEPTH_STENCIL.DepthBufferWriteEnable = 0
       *
       * The PRM is wrong, though. The width and height must be programmed to
       * actual framebuffer's width and height, even when neither depth buffer
       * nor stencil buffer is present.  Also, D16_UNORM is not allowed to
       * be combined with a stencil buffer so we use D32_FLOAT instead.
       */
      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_DEPTH_BUFFER), db) {
         db.SurfaceType          = SURFTYPE_2D;
         db.SurfaceFormat        = D32_FLOAT;
         db.Width                = fb->width - 1;
         db.Height               = fb->height - 1;
         db.StencilWriteEnable   = has_stencil;
      }
   }

   /* Emit 3DSTATE_HIER_DEPTH_BUFFER */
   if (has_hiz) {
      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_HIER_DEPTH_BUFFER), hdb) {
         hdb.HierarchicalDepthBufferObjectControlState = GENX(MOCS);
         hdb.SurfacePitch = image->hiz_surface.isl.row_pitch - 1;
         hdb.SurfaceBaseAddress = (struct anv_address) {
            .bo = image->bo,
            .offset = image->offset + image->hiz_surface.offset,
         };
#if GEN_GEN >= 8
         /* From the SKL PRM Vol2a:
          *
          *    The interpretation of this field is dependent on Surface Type
          *    as follows:
          *    - SURFTYPE_1D: distance in pixels between array slices
          *    - SURFTYPE_2D/CUBE: distance in rows between array slices
          *    - SURFTYPE_3D: distance in rows between R - slices
          */
         hdb.SurfaceQPitch =
            image->hiz_surface.isl.dim == ISL_SURF_DIM_1D ?
               isl_surf_get_array_pitch_el(&image->hiz_surface.isl) >> 2 :
               isl_surf_get_array_pitch_el_rows(&image->hiz_surface.isl) >> 2;
#endif
      }
   } else {
      /* No HiZ surface: still emit the packet, with no fields programmed. */
      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_HIER_DEPTH_BUFFER), hdb);
   }

   /* Emit 3DSTATE_STENCIL_BUFFER */
   if (has_stencil) {
      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_STENCIL_BUFFER), sb) {
#if GEN_GEN >= 8 || GEN_IS_HASWELL
         sb.StencilBufferEnable = true;
#endif
         sb.StencilBufferObjectControlState = GENX(MOCS);

         sb.SurfacePitch = image->stencil_surface.isl.row_pitch - 1;

#if GEN_GEN >= 8
         sb.SurfaceQPitch = isl_surf_get_array_pitch_el_rows(&image->stencil_surface.isl) >> 2;
#endif
         sb.SurfaceBaseAddress = (struct anv_address) {
            .bo = image->bo,
            .offset = image->offset + image->stencil_surface.offset,
         };
      }
   } else {
      /* No stencil aspect: still emit the packet, with no fields programmed. */
      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_STENCIL_BUFFER), sb);
   }

   /* From the IVB PRM Vol2P1, 11.5.5.4 3DSTATE_CLEAR_PARAMS:
    *
    *    3DSTATE_CLEAR_PARAMS must always be programmed along with
    *    the other Depth/Stencil state commands(i.e. 3DSTATE_DEPTH_BUFFER,
    *    3DSTATE_STENCIL_BUFFER, or 3DSTATE_HIER_DEPTH_BUFFER)
    *
    * Testing also shows that some variant of this restriction may exist HSW+.
    * On BDW+, it is not possible to emit 2 of these packets consecutively when
    * both have DepthClearValueValid set. An analysis of such state programming
    * on SKL showed that the GPU doesn't register the latter packet's clear
    * value.
    */
   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CLEAR_PARAMS), cp) {
      /* A clear value is only supplied when HiZ is present; it is taken from
       * the subpass' depth/stencil attachment.
       */
      if (has_hiz) {
         cp.DepthClearValueValid = true;
         const uint32_t ds =
            cmd_buffer->state.subpass->depth_stencil_attachment;
         cp.DepthClearValue =
            cmd_buffer->state.attachments[ds].clear_value.depthStencil.depth;
      }
   }
}
1903
1904 static void
1905 genX(cmd_buffer_set_subpass)(struct anv_cmd_buffer *cmd_buffer,
1906                              struct anv_subpass *subpass)
1907 {
1908    cmd_buffer->state.subpass = subpass;
1909
1910    cmd_buffer->state.dirty |= ANV_CMD_DIRTY_RENDER_TARGETS;
1911
1912    cmd_buffer_emit_depth_stencil(cmd_buffer);
1913    genX(cmd_buffer_emit_hz_op)(cmd_buffer, BLORP_HIZ_OP_HIZ_RESOLVE);
1914    genX(cmd_buffer_emit_hz_op)(cmd_buffer, BLORP_HIZ_OP_DEPTH_CLEAR);
1915
1916    anv_cmd_buffer_clear_subpass(cmd_buffer);
1917 }
1918
1919 void genX(CmdBeginRenderPass)(
1920     VkCommandBuffer                             commandBuffer,
1921     const VkRenderPassBeginInfo*                pRenderPassBegin,
1922     VkSubpassContents                           contents)
1923 {
1924    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1925    ANV_FROM_HANDLE(anv_render_pass, pass, pRenderPassBegin->renderPass);
1926    ANV_FROM_HANDLE(anv_framebuffer, framebuffer, pRenderPassBegin->framebuffer);
1927
1928    cmd_buffer->state.framebuffer = framebuffer;
1929    cmd_buffer->state.pass = pass;
1930    cmd_buffer->state.render_area = pRenderPassBegin->renderArea;
1931    genX(cmd_buffer_setup_attachments)(cmd_buffer, pass, framebuffer,
1932                                       pRenderPassBegin->pClearValues);
1933
1934    genX(flush_pipeline_select_3d)(cmd_buffer);
1935
1936    genX(cmd_buffer_set_subpass)(cmd_buffer, pass->subpasses);
1937 }
1938
1939 void genX(CmdNextSubpass)(
1940     VkCommandBuffer                             commandBuffer,
1941     VkSubpassContents                           contents)
1942 {
1943    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1944
1945    assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
1946
1947    anv_cmd_buffer_resolve_subpass(cmd_buffer);
1948    genX(cmd_buffer_set_subpass)(cmd_buffer, cmd_buffer->state.subpass + 1);
1949 }
1950
/* vkCmdEndRenderPass: finish the current render pass.
 *
 * Emits a HiZ depth-resolve operation and then resolves the final subpass.
 * Builds without NDEBUG additionally hand the framebuffer to
 * anv_dump_add_framebuffer().
 */
void genX(CmdEndRenderPass)(
    VkCommandBuffer                             commandBuffer)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);

   /* The depth resolve is emitted before the subpass resolve. */
   genX(cmd_buffer_emit_hz_op)(cmd_buffer, BLORP_HIZ_OP_DEPTH_RESOLVE);
   anv_cmd_buffer_resolve_subpass(cmd_buffer);

#ifndef NDEBUG
   anv_dump_add_framebuffer(cmd_buffer, cmd_buffer->state.framebuffer);
#endif
}
1963
1964 static void
1965 emit_ps_depth_count(struct anv_cmd_buffer *cmd_buffer,
1966                     struct anv_bo *bo, uint32_t offset)
1967 {
1968    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1969       pc.DestinationAddressType  = DAT_PPGTT;
1970       pc.PostSyncOperation       = WritePSDepthCount;
1971       pc.DepthStallEnable        = true;
1972       pc.Address                 = (struct anv_address) { bo, offset };
1973
1974       if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
1975          pc.CommandStreamerStallEnable = true;
1976    }
1977 }
1978
1979 static void
1980 emit_query_availability(struct anv_cmd_buffer *cmd_buffer,
1981                         struct anv_bo *bo, uint32_t offset)
1982 {
1983    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1984       pc.DestinationAddressType  = DAT_PPGTT;
1985       pc.PostSyncOperation       = WriteImmediateData;
1986       pc.Address                 = (struct anv_address) { bo, offset };
1987       pc.ImmediateData           = 1;
1988    }
1989 }
1990
1991 void genX(CmdBeginQuery)(
1992     VkCommandBuffer                             commandBuffer,
1993     VkQueryPool                                 queryPool,
1994     uint32_t                                    query,
1995     VkQueryControlFlags                         flags)
1996 {
1997    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1998    ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
1999
2000    /* Workaround: When meta uses the pipeline with the VS disabled, it seems
2001     * that the pipelining of the depth write breaks. What we see is that
2002     * samples from the render pass clear leaks into the first query
2003     * immediately after the clear. Doing a pipecontrol with a post-sync
2004     * operation and DepthStallEnable seems to work around the issue.
2005     */
2006    if (cmd_buffer->state.need_query_wa) {
2007       cmd_buffer->state.need_query_wa = false;
2008       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
2009          pc.DepthCacheFlushEnable   = true;
2010          pc.DepthStallEnable        = true;
2011       }
2012    }
2013
2014    switch (pool->type) {
2015    case VK_QUERY_TYPE_OCCLUSION:
2016       emit_ps_depth_count(cmd_buffer, &pool->bo,
2017                           query * sizeof(struct anv_query_pool_slot));
2018       break;
2019
2020    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
2021    default:
2022       unreachable("");
2023    }
2024 }
2025
2026 void genX(CmdEndQuery)(
2027     VkCommandBuffer                             commandBuffer,
2028     VkQueryPool                                 queryPool,
2029     uint32_t                                    query)
2030 {
2031    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
2032    ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
2033
2034    switch (pool->type) {
2035    case VK_QUERY_TYPE_OCCLUSION:
2036       emit_ps_depth_count(cmd_buffer, &pool->bo,
2037                           query * sizeof(struct anv_query_pool_slot) + 8);
2038
2039       emit_query_availability(cmd_buffer, &pool->bo,
2040                               query * sizeof(struct anv_query_pool_slot) + 16);
2041       break;
2042
2043    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
2044    default:
2045       unreachable("");
2046    }
2047 }
2048
/* Register offset that CmdWriteTimestamp stores to memory for top-of-pipe
 * timestamps; it is read as two dwords, at TIMESTAMP and TIMESTAMP + 4.
 */
#define TIMESTAMP 0x2358
2050
2051 void genX(CmdWriteTimestamp)(
2052     VkCommandBuffer                             commandBuffer,
2053     VkPipelineStageFlagBits                     pipelineStage,
2054     VkQueryPool                                 queryPool,
2055     uint32_t                                    query)
2056 {
2057    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
2058    ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
2059    uint32_t offset = query * sizeof(struct anv_query_pool_slot);
2060
2061    assert(pool->type == VK_QUERY_TYPE_TIMESTAMP);
2062
2063    switch (pipelineStage) {
2064    case VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT:
2065       anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {
2066          srm.RegisterAddress  = TIMESTAMP;
2067          srm.MemoryAddress    = (struct anv_address) { &pool->bo, offset };
2068       }
2069       anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {
2070          srm.RegisterAddress  = TIMESTAMP + 4;
2071          srm.MemoryAddress    = (struct anv_address) { &pool->bo, offset + 4 };
2072       }
2073       break;
2074
2075    default:
2076       /* Everything else is bottom-of-pipe */
2077       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
2078          pc.DestinationAddressType  = DAT_PPGTT;
2079          pc.PostSyncOperation       = WriteTimestamp;
2080          pc.Address = (struct anv_address) { &pool->bo, offset };
2081
2082          if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
2083             pc.CommandStreamerStallEnable = true;
2084       }
2085       break;
2086    }
2087
2088    emit_query_availability(cmd_buffer, &pool->bo, query + 16);
2089 }
2090
2091 #if GEN_GEN > 7 || GEN_IS_HASWELL
2092
/* Field encoders for one MI_MATH ALU instruction dword: the opcode goes in
 * bits 31:20, the first operand in bits 19:10 and the second operand in
 * bits 9:0.
 */
#define alu_opcode(v)   __gen_uint((v),  20, 31)
#define alu_operand1(v) __gen_uint((v),  10, 19)
#define alu_operand2(v) __gen_uint((v),   0,  9)

/* Build a complete ALU instruction dword.  The expansion is parenthesized
 * so the macro can be used safely inside a larger expression.
 */
#define alu(opcode, operand1, operand2) \
   (alu_opcode(opcode) | alu_operand1(operand1) | alu_operand2(operand2))
2098
/* Opcode values for the first argument of alu().  CmdCopyQueryPoolResults
 * uses LOAD, SUB and STORE to build its MI_MATH program.
 */
#define OPCODE_NOOP      0x000
#define OPCODE_LOAD      0x080
#define OPCODE_LOADINV   0x480
#define OPCODE_LOAD0     0x081
#define OPCODE_LOAD1     0x481
#define OPCODE_ADD       0x100
#define OPCODE_SUB       0x101
#define OPCODE_AND       0x102
#define OPCODE_OR        0x103
#define OPCODE_XOR       0x104
#define OPCODE_STORE     0x180
#define OPCODE_STOREINV  0x580

/* Operand encodings for alu().
 * NOTE(review): OPERAND_R<n> presumably names the register at CS_GPR(n);
 * CmdCopyQueryPoolResults loads CS_GPR(0)/CS_GPR(1), operates on R0/R1,
 * stores to R2 and reads the result back from CS_GPR(2). Confirm against
 * the hardware documentation.
 */
#define OPERAND_R0   0x00
#define OPERAND_R1   0x01
#define OPERAND_R2   0x02
#define OPERAND_R3   0x03
#define OPERAND_R4   0x04
#define OPERAND_SRCA 0x20
#define OPERAND_SRCB 0x21
#define OPERAND_ACCU 0x31
#define OPERAND_ZF   0x32
#define OPERAND_CF   0x33

/* Register offset of general purpose register n.  Registers are spaced
 * 8 bytes apart and are accessed as two dwords (offset and offset + 4).
 */
#define CS_GPR(n) (0x2600 + (n) * 8)
2124
2125 static void
2126 emit_load_alu_reg_u64(struct anv_batch *batch, uint32_t reg,
2127                       struct anv_bo *bo, uint32_t offset)
2128 {
2129    anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
2130       lrm.RegisterAddress  = reg,
2131       lrm.MemoryAddress    = (struct anv_address) { bo, offset };
2132    }
2133    anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
2134       lrm.RegisterAddress  = reg + 4;
2135       lrm.MemoryAddress    = (struct anv_address) { bo, offset + 4 };
2136    }
2137 }
2138
2139 static void
2140 store_query_result(struct anv_batch *batch, uint32_t reg,
2141                    struct anv_bo *bo, uint32_t offset, VkQueryResultFlags flags)
2142 {
2143    anv_batch_emit(batch, GENX(MI_STORE_REGISTER_MEM), srm) {
2144       srm.RegisterAddress  = reg;
2145       srm.MemoryAddress    = (struct anv_address) { bo, offset };
2146    }
2147
2148    if (flags & VK_QUERY_RESULT_64_BIT) {
2149       anv_batch_emit(batch, GENX(MI_STORE_REGISTER_MEM), srm) {
2150          srm.RegisterAddress  = reg + 4;
2151          srm.MemoryAddress    = (struct anv_address) { bo, offset + 4 };
2152       }
2153    }
2154 }
2155
2156 void genX(CmdCopyQueryPoolResults)(
2157     VkCommandBuffer                             commandBuffer,
2158     VkQueryPool                                 queryPool,
2159     uint32_t                                    firstQuery,
2160     uint32_t                                    queryCount,
2161     VkBuffer                                    destBuffer,
2162     VkDeviceSize                                destOffset,
2163     VkDeviceSize                                destStride,
2164     VkQueryResultFlags                          flags)
2165 {
2166    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
2167    ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
2168    ANV_FROM_HANDLE(anv_buffer, buffer, destBuffer);
2169    uint32_t slot_offset, dst_offset;
2170
2171    if (flags & VK_QUERY_RESULT_WAIT_BIT) {
2172       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
2173          pc.CommandStreamerStallEnable = true;
2174          pc.StallAtPixelScoreboard     = true;
2175       }
2176    }
2177
2178    dst_offset = buffer->offset + destOffset;
2179    for (uint32_t i = 0; i < queryCount; i++) {
2180
2181       slot_offset = (firstQuery + i) * sizeof(struct anv_query_pool_slot);
2182       switch (pool->type) {
2183       case VK_QUERY_TYPE_OCCLUSION:
2184          emit_load_alu_reg_u64(&cmd_buffer->batch,
2185                                CS_GPR(0), &pool->bo, slot_offset);
2186          emit_load_alu_reg_u64(&cmd_buffer->batch,
2187                                CS_GPR(1), &pool->bo, slot_offset + 8);
2188
2189          /* FIXME: We need to clamp the result for 32 bit. */
2190
2191          uint32_t *dw = anv_batch_emitn(&cmd_buffer->batch, 5, GENX(MI_MATH));
2192          dw[1] = alu(OPCODE_LOAD, OPERAND_SRCA, OPERAND_R1);
2193          dw[2] = alu(OPCODE_LOAD, OPERAND_SRCB, OPERAND_R0);
2194          dw[3] = alu(OPCODE_SUB, 0, 0);
2195          dw[4] = alu(OPCODE_STORE, OPERAND_R2, OPERAND_ACCU);
2196          break;
2197
2198       case VK_QUERY_TYPE_TIMESTAMP:
2199          emit_load_alu_reg_u64(&cmd_buffer->batch,
2200                                CS_GPR(2), &pool->bo, slot_offset);
2201          break;
2202
2203       default:
2204          unreachable("unhandled query type");
2205       }
2206
2207       store_query_result(&cmd_buffer->batch,
2208                          CS_GPR(2), buffer->bo, dst_offset, flags);
2209
2210       if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
2211          emit_load_alu_reg_u64(&cmd_buffer->batch, CS_GPR(0),
2212                                &pool->bo, slot_offset + 16);
2213          if (flags & VK_QUERY_RESULT_64_BIT)
2214             store_query_result(&cmd_buffer->batch,
2215                                CS_GPR(0), buffer->bo, dst_offset + 8, flags);
2216          else
2217             store_query_result(&cmd_buffer->batch,
2218                                CS_GPR(0), buffer->bo, dst_offset + 4, flags);
2219       }
2220
2221       dst_offset += destStride;
2222    }
2223 }
2224
2225 #else
/* vkCmdCopyQueryPoolResults stub for builds where neither GEN_GEN > 7 nor
 * GEN_IS_HASWELL holds.  It records nothing in the command buffer and only
 * reports the missing feature through anv_finishme(); all parameters are
 * ignored.
 */
void genX(CmdCopyQueryPoolResults)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    VkBuffer                                    destBuffer,
    VkDeviceSize                                destOffset,
    VkDeviceSize                                destStride,
    VkQueryResultFlags                          flags)
{
   anv_finishme("Queries not yet supported on Ivy Bridge");
}
2238 #endif