src/intel/vulkan/genX_cmd_buffer.c

   1 /*
   2  * Copyright © 2015 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include <assert.h>
  25 #include <stdbool.h>
  26
  27 #include "anv_private.h"
  28 #include "vk_format_info.h"
  29
  30 #include "common/gen_l3_config.h"
  31 #include "genxml/gen_macros.h"
  32 #include "genxml/genX_pack.h"
  33
  34 static void
  35 emit_lrm(struct anv_batch *batch,
  36          uint32_t reg, struct anv_bo *bo, uint32_t offset)
  37 {
  38    anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
  39       lrm.RegisterAddress  = reg;
  40       lrm.MemoryAddress    = (struct anv_address) { bo, offset };
  41    }
  42 }
  43
  44 static void
  45 emit_lri(struct anv_batch *batch, uint32_t reg, uint32_t imm)
  46 {
  47    anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
  48       lri.RegisterOffset   = reg;
  49       lri.DataDWord        = imm;
  50    }
  51 }
  52
  53 void
  54 genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer)
  55 {
  56    struct anv_device *device = cmd_buffer->device;
  57
  58 /* XXX: Do we need this on more than just BDW? */
  59 #if (GEN_GEN >= 8)
  60    /* Emit a render target cache flush.
  61     *
  62     * This isn't documented anywhere in the PRM.  However, it seems to be
  63     * necessary prior to changing the surface state base adress.  Without
  64     * this, we get GPU hangs when using multi-level command buffers which
  65     * clear depth, reset state base address, and then go render stuff.
  66     */
  67    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
  68       pc.RenderTargetCacheFlushEnable = true;
  69    }
  70 #endif
  71
  72    anv_batch_emit(&cmd_buffer->batch, GENX(STATE_BASE_ADDRESS), sba) {
  73       sba.GeneralStateBaseAddress = (struct anv_address) { NULL, 0 };
  74       sba.GeneralStateMemoryObjectControlState = GENX(MOCS);
  75       sba.GeneralStateBaseAddressModifyEnable = true;
  76
  77       sba.SurfaceStateBaseAddress =
  78          anv_cmd_buffer_surface_base_address(cmd_buffer);
  79       sba.SurfaceStateMemoryObjectControlState = GENX(MOCS);
  80       sba.SurfaceStateBaseAddressModifyEnable = true;
  81
  82       sba.DynamicStateBaseAddress =
  83          (struct anv_address) { &device->dynamic_state_block_pool.bo, 0 };
  84       sba.DynamicStateMemoryObjectControlState = GENX(MOCS);
  85       sba.DynamicStateBaseAddressModifyEnable = true;
  86
  87       sba.IndirectObjectBaseAddress = (struct anv_address) { NULL, 0 };
  88       sba.IndirectObjectMemoryObjectControlState = GENX(MOCS);
  89       sba.IndirectObjectBaseAddressModifyEnable = true;
  90
  91       sba.InstructionBaseAddress =
  92          (struct anv_address) { &device->instruction_block_pool.bo, 0 };
  93       sba.InstructionMemoryObjectControlState = GENX(MOCS);
  94       sba.InstructionBaseAddressModifyEnable = true;
  95
  96 #  if (GEN_GEN >= 8)
  97       /* Broadwell requires that we specify a buffer size for a bunch of
  98        * these fields.  However, since we will be growing the BO's live, we
  99        * just set them all to the maximum.
 100        */
 101       sba.GeneralStateBufferSize                = 0xfffff;
 102       sba.GeneralStateBufferSizeModifyEnable    = true;
 103       sba.DynamicStateBufferSize                = 0xfffff;
 104       sba.DynamicStateBufferSizeModifyEnable    = true;
 105       sba.IndirectObjectBufferSize              = 0xfffff;
 106       sba.IndirectObjectBufferSizeModifyEnable  = true;
 107       sba.InstructionBufferSize                 = 0xfffff;
 108       sba.InstructionBuffersizeModifyEnable     = true;
 109 #  endif
 110    }
 111
 112    /* After re-setting the surface state base address, we have to do some
 113     * cache flusing so that the sampler engine will pick up the new
 114     * SURFACE_STATE objects and binding tables. From the Broadwell PRM,
 115     * Shared Function > 3D Sampler > State > State Caching (page 96):
 116     *
 117     *    Coherency with system memory in the state cache, like the texture
 118     *    cache is handled partially by software. It is expected that the
 119     *    command stream or shader will issue Cache Flush operation or
 120     *    Cache_Flush sampler message to ensure that the L1 cache remains
 121     *    coherent with system memory.
 122     *
 123     *    [...]
 124     *
 125     *    Whenever the value of the Dynamic_State_Base_Addr,
 126     *    Surface_State_Base_Addr are altered, the L1 state cache must be
 127     *    invalidated to ensure the new surface or sampler state is fetched
 128     *    from system memory.
 129     *
 130     * The PIPE_CONTROL command has a "State Cache Invalidation Enable" bit
 131     * which, according the PIPE_CONTROL instruction documentation in the
 132     * Broadwell PRM:
 133     *
 134     *    Setting this bit is independent of any other bit in this packet.
 135     *    This bit controls the invalidation of the L1 and L2 state caches
 136     *    at the top of the pipe i.e. at the parsing time.
 137     *
 138     * Unfortunately, experimentation seems to indicate that state cache
 139     * invalidation through a PIPE_CONTROL does nothing whatsoever in
 140     * regards to surface state and binding tables.  In stead, it seems that
 141     * invalidating the texture cache is what is actually needed.
 142     *
 143     * XXX:  As far as we have been able to determine through
 144     * experimentation, shows that flush the texture cache appears to be
 145     * sufficient.  The theory here is that all of the sampling/rendering
 146     * units cache the binding table in the texture cache.  However, we have
 147     * yet to be able to actually confirm this.
 148     */
 149    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
 150       pc.TextureCacheInvalidationEnable = true;
 151    }
 152 }
 153
 154 static void
 155 add_surface_state_reloc(struct anv_cmd_buffer *cmd_buffer,
 156                         struct anv_state state,
 157                         struct anv_bo *bo, uint32_t offset)
 158 {
 159    const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
 160
 161    anv_reloc_list_add(&cmd_buffer->surface_relocs, &cmd_buffer->pool->alloc,
 162                       state.offset + isl_dev->ss.addr_offset, bo, offset);
 163 }
 164
 165 static void
 166 add_image_view_relocs(struct anv_cmd_buffer *cmd_buffer,
 167                       const struct anv_image_view *iview,
 168                       enum isl_aux_usage aux_usage,
 169                       struct anv_state state)
 170 {
 171    const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
 172
 173    anv_reloc_list_add(&cmd_buffer->surface_relocs, &cmd_buffer->pool->alloc,
 174                       state.offset + isl_dev->ss.addr_offset,
 175                       iview->bo, iview->offset);
 176
 177    if (aux_usage != ISL_AUX_USAGE_NONE) {
 178       uint32_t aux_offset = iview->offset + iview->image->aux_surface.offset;
 179
 180       /* On gen7 and prior, the bottom 12 bits of the MCS base address are
 181        * used to store other information.  This should be ok, however, because
 182        * surface buffer addresses are always 4K page alinged.
 183        */
 184       assert((aux_offset & 0xfff) == 0);
 185       uint32_t *aux_addr_dw = state.map + isl_dev->ss.aux_addr_offset;
 186       aux_offset += *aux_addr_dw & 0xfff;
 187
 188       anv_reloc_list_add(&cmd_buffer->surface_relocs, &cmd_buffer->pool->alloc,
 189                          state.offset + isl_dev->ss.aux_addr_offset,
 190                          iview->bo, aux_offset);
 191    }
 192 }
 193
 194 static bool
 195 color_is_zero_one(VkClearColorValue value, enum isl_format format)
 196 {
 197    if (isl_format_has_int_channel(format)) {
 198       for (unsigned i = 0; i < 4; i++) {
 199          if (value.int32[i] != 0 && value.int32[i] != 1)
 200             return false;
 201       }
 202    } else {
 203       for (unsigned i = 0; i < 4; i++) {
 204          if (value.float32[i] != 0.0f && value.float32[i] != 1.0f)
 205             return false;
 206       }
 207    }
 208
 209    return true;
 210 }
 211
 212 static void
 213 color_attachment_compute_aux_usage(struct anv_device *device,
 214                                    struct anv_attachment_state *att_state,
 215                                    struct anv_image_view *iview,
 216                                    VkRect2D render_area,
 217                                    union isl_color_value *fast_clear_color)
 218 {
 219    if (iview->image->aux_surface.isl.size == 0) {
 220       att_state->aux_usage = ISL_AUX_USAGE_NONE;
 221       att_state->input_aux_usage = ISL_AUX_USAGE_NONE;
 222       att_state->fast_clear = false;
 223       return;
 224    }
 225
 226    assert(iview->image->aux_surface.isl.usage & ISL_SURF_USAGE_CCS_BIT);
 227
 228    att_state->clear_color_is_zero_one =
 229       color_is_zero_one(att_state->clear_value.color, iview->isl.format);
 230
 231    if (att_state->pending_clear_aspects == VK_IMAGE_ASPECT_COLOR_BIT) {
 232       /* Start off assuming fast clears are possible */
 233       att_state->fast_clear = true;
 234
 235       /* Potentially, we could do partial fast-clears but doing so has crazy
 236        * alignment restrictions.  It's easier to just restrict to full size
 237        * fast clears for now.
 238        */
 239       if (render_area.offset.x != 0 ||
 240           render_area.offset.y != 0 ||
 241           render_area.extent.width != iview->extent.width ||
 242           render_area.extent.height != iview->extent.height)
 243          att_state->fast_clear = false;
 244
 245       if (att_state->fast_clear) {
 246          memcpy(fast_clear_color->u32, att_state->clear_value.color.uint32,
 247                 sizeof(fast_clear_color->u32));
 248       }
 249    } else {
 250       att_state->fast_clear = false;
 251    }
 252
 253    if (isl_format_supports_lossless_compression(&device->info,
 254                                                 iview->isl.format)) {
 255       att_state->aux_usage = ISL_AUX_USAGE_CCS_E;
 256       att_state->input_aux_usage = ISL_AUX_USAGE_CCS_E;
 257    } else if (att_state->fast_clear) {
 258       att_state->aux_usage = ISL_AUX_USAGE_CCS_D;
 259       /* From the Sky Lake PRM, RENDER_SURFACE_STATE::AuxiliarySurfaceMode:
 260        *
 261        *    "If Number of Multisamples is MULTISAMPLECOUNT_1, AUX_CCS_D
 262        *    setting is only allowed if Surface Format supported for Fast
 263        *    Clear. In addition, if the surface is bound to the sampling
 264        *    engine, Surface Format must be supported for Render Target
 265        *    Compression for surfaces bound to the sampling engine."
 266        *
 267        * In other words, we can't sample from a fast-cleared image if it
 268        * doesn't also support color compression.
 269        */
 270       att_state->input_aux_usage = ISL_AUX_USAGE_NONE;
 271    } else {
 272       att_state->aux_usage = ISL_AUX_USAGE_NONE;
 273       att_state->input_aux_usage = ISL_AUX_USAGE_NONE;
 274    }
 275 }
 276
 277 static bool
 278 need_input_attachment_state(const struct anv_render_pass_attachment *att)
 279 {
 280    if (!(att->usage & VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT))
 281       return false;
 282
 283    /* We only allocate input attachment states for color and depth surfaces.
 284     * Stencil doesn't allow compression so we can just use the texture surface
 285     * state from the view
 286     */
 287    return vk_format_is_color(att->format) || vk_format_has_depth(att->format);
 288 }
 289
 290 /**
 291  * Setup anv_cmd_state::attachments for vkCmdBeginRenderPass.
 292  */
 293 static void
 294 genX(cmd_buffer_setup_attachments)(struct anv_cmd_buffer *cmd_buffer,
 295                                    struct anv_render_pass *pass,
 296                                    const VkRenderPassBeginInfo *begin)
 297 {
 298    const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
 299    struct anv_cmd_state *state = &cmd_buffer->state;
 300
 301    vk_free(&cmd_buffer->pool->alloc, state->attachments);
 302
 303    if (pass->attachment_count == 0) {
 304       state->attachments = NULL;
 305       return;
 306    }
 307
 308    state->attachments = vk_alloc(&cmd_buffer->pool->alloc,
 309                                  pass->attachment_count *
 310                                       sizeof(state->attachments[0]),
 311                                  8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
 312    if (state->attachments == NULL) {
 313       /* FIXME: Propagate VK_ERROR_OUT_OF_HOST_MEMORY to vkEndCommandBuffer */
 314       abort();
 315    }
 316
 317    bool need_null_state = false;
 318    unsigned num_states = 0;
 319    for (uint32_t i = 0; i < pass->attachment_count; ++i) {
 320       if (vk_format_is_color(pass->attachments[i].format)) {
 321          num_states++;
 322       } else {
 323          /* We need a null state for any depth-stencil-only subpasses.
 324           * Importantly, this includes depth/stencil clears so we create one
 325           * whenever we have depth or stencil
 326           */
 327          need_null_state = true;
 328       }
 329
 330       if (need_input_attachment_state(&pass->attachments[i]))
 331          num_states++;
 332    }
 333    num_states += need_null_state;
 334
 335    const uint32_t ss_stride = align_u32(isl_dev->ss.size, isl_dev->ss.align);
 336    state->render_pass_states =
 337       anv_state_stream_alloc(&cmd_buffer->surface_state_stream,
 338                              num_states * ss_stride, isl_dev->ss.align);
 339
 340    struct anv_state next_state = state->render_pass_states;
 341    next_state.alloc_size = isl_dev->ss.size;
 342
 343    if (need_null_state) {
 344       state->null_surface_state = next_state;
 345       next_state.offset += ss_stride;
 346       next_state.map += ss_stride;
 347    }
 348
 349    for (uint32_t i = 0; i < pass->attachment_count; ++i) {
 350       if (vk_format_is_color(pass->attachments[i].format)) {
 351          state->attachments[i].color_rt_state = next_state;
 352          next_state.offset += ss_stride;
 353          next_state.map += ss_stride;
 354       }
 355
 356       if (need_input_attachment_state(&pass->attachments[i])) {
 357          state->attachments[i].input_att_state = next_state;
 358          next_state.offset += ss_stride;
 359          next_state.map += ss_stride;
 360       }
 361    }
 362    assert(next_state.offset == state->render_pass_states.offset +
 363                                state->render_pass_states.alloc_size);
 364
 365    if (begin) {
 366       ANV_FROM_HANDLE(anv_framebuffer, framebuffer, begin->framebuffer);
 367       assert(pass->attachment_count == framebuffer->attachment_count);
 368
 369       if (need_null_state) {
 370          struct GENX(RENDER_SURFACE_STATE) null_ss = {
 371             .SurfaceType = SURFTYPE_NULL,
 372             .SurfaceArray = framebuffer->layers > 0,
 373             .SurfaceFormat = ISL_FORMAT_R8G8B8A8_UNORM,
 374 #if GEN_GEN >= 8
 375             .TileMode = YMAJOR,
 376 #else
 377             .TiledSurface = true,
 378 #endif
 379             .Width = framebuffer->width - 1,
 380             .Height = framebuffer->height - 1,
 381             .Depth = framebuffer->layers - 1,
 382             .RenderTargetViewExtent = framebuffer->layers - 1,
 383          };
 384          GENX(RENDER_SURFACE_STATE_pack)(NULL, state->null_surface_state.map,
 385                                          &null_ss);
 386       }
 387
 388       for (uint32_t i = 0; i < pass->attachment_count; ++i) {
 389          struct anv_render_pass_attachment *att = &pass->attachments[i];
 390          VkImageAspectFlags att_aspects = vk_format_aspects(att->format);
 391          VkImageAspectFlags clear_aspects = 0;
 392
 393          if (att_aspects == VK_IMAGE_ASPECT_COLOR_BIT) {
 394             /* color attachment */
 395             if (att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
 396                clear_aspects |= VK_IMAGE_ASPECT_COLOR_BIT;
 397             }
 398          } else {
 399             /* depthstencil attachment */
 400             if ((att_aspects & VK_IMAGE_ASPECT_DEPTH_BIT) &&
 401                 att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
 402                clear_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT;
 403             }
 404             if ((att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&
 405                 att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
 406                clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
 407             }
 408          }
 409
 410          state->attachments[i].pending_clear_aspects = clear_aspects;
 411          if (clear_aspects)
 412             state->attachments[i].clear_value = begin->pClearValues[i];
 413
 414          struct anv_image_view *iview = framebuffer->attachments[i];
 415          assert(iview->vk_format == att->format);
 416
 417          union isl_color_value clear_color = { .u32 = { 0, } };
 418          if (att_aspects == VK_IMAGE_ASPECT_COLOR_BIT) {
 419             color_attachment_compute_aux_usage(cmd_buffer->device,
 420                                                &state->attachments[i],
 421                                                iview, begin->renderArea,
 422                                                &clear_color);
 423
 424             struct isl_view view = iview->isl;
 425             view.usage |= ISL_SURF_USAGE_RENDER_TARGET_BIT;
 426             isl_surf_fill_state(isl_dev,
 427                                 state->attachments[i].color_rt_state.map,
 428                                 .surf = &iview->image->color_surface.isl,
 429                                 .view = &view,
 430                                 .aux_surf = &iview->image->aux_surface.isl,
 431                                 .aux_usage = state->attachments[i].aux_usage,
 432                                 .clear_color = clear_color,
 433                                 .mocs = cmd_buffer->device->default_mocs);
 434
 435             add_image_view_relocs(cmd_buffer, iview,
 436                                   state->attachments[i].aux_usage,
 437                                   state->attachments[i].color_rt_state);
 438          } else {
 439             state->attachments[i].aux_usage = ISL_AUX_USAGE_NONE;
 440             state->attachments[i].input_aux_usage = ISL_AUX_USAGE_NONE;
 441          }
 442
 443          if (need_input_attachment_state(&pass->attachments[i])) {
 444             const struct isl_surf *surf;
 445             if (att_aspects == VK_IMAGE_ASPECT_COLOR_BIT) {
 446                surf = &iview->image->color_surface.isl;
 447             } else {
 448                surf = &iview->image->depth_surface.isl;
 449             }
 450
 451             struct isl_view view = iview->isl;
 452             view.usage |= ISL_SURF_USAGE_TEXTURE_BIT;
 453             isl_surf_fill_state(isl_dev,
 454                                 state->attachments[i].input_att_state.map,
 455                                 .surf = surf,
 456                                 .view = &view,
 457                                 .aux_surf = &iview->image->aux_surface.isl,
 458                                 .aux_usage = state->attachments[i].input_aux_usage,
 459                                 .clear_color = clear_color,
 460                                 .mocs = cmd_buffer->device->default_mocs);
 461
 462             add_image_view_relocs(cmd_buffer, iview,
 463                                   state->attachments[i].input_aux_usage,
 464                                   state->attachments[i].input_att_state);
 465          }
 466       }
 467
 468       if (!cmd_buffer->device->info.has_llc)
 469          anv_state_clflush(state->render_pass_states);
 470    }
 471 }
 472
 473 VkResult
 474 genX(BeginCommandBuffer)(
 475     VkCommandBuffer                             commandBuffer,
 476     const VkCommandBufferBeginInfo*             pBeginInfo)
 477 {
 478    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
 479
 480    /* If this is the first vkBeginCommandBuffer, we must *initialize* the
 481     * command buffer's state. Otherwise, we must *reset* its state. In both
 482     * cases we reset it.
 483     *
 484     * From the Vulkan 1.0 spec:
 485     *
 486     *    If a command buffer is in the executable state and the command buffer
 487     *    was allocated from a command pool with the
 488     *    VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT flag set, then
 489     *    vkBeginCommandBuffer implicitly resets the command buffer, behaving
 490     *    as if vkResetCommandBuffer had been called with
 491     *    VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT not set. It then puts
 492     *    the command buffer in the recording state.
 493     */
 494    anv_cmd_buffer_reset(cmd_buffer);
 495
 496    cmd_buffer->usage_flags = pBeginInfo->flags;
 497
 498    assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY ||
 499           !(cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT));
 500
 501    genX(cmd_buffer_emit_state_base_address)(cmd_buffer);
 502
 503    if (cmd_buffer->usage_flags &
 504        VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
 505       cmd_buffer->state.pass =
 506          anv_render_pass_from_handle(pBeginInfo->pInheritanceInfo->renderPass);
 507       cmd_buffer->state.subpass =
 508          &cmd_buffer->state.pass->subpasses[pBeginInfo->pInheritanceInfo->subpass];
 509       cmd_buffer->state.framebuffer = NULL;
 510
 511       genX(cmd_buffer_setup_attachments)(cmd_buffer, cmd_buffer->state.pass,
 512                                          NULL);
 513
 514       cmd_buffer->state.dirty |= ANV_CMD_DIRTY_RENDER_TARGETS;
 515    }
 516
 517    return VK_SUCCESS;
 518 }
 519
 520 VkResult
 521 genX(EndCommandBuffer)(
 522     VkCommandBuffer                             commandBuffer)
 523 {
 524    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
 525
 526    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
 527
 528    anv_cmd_buffer_end_batch_buffer(cmd_buffer);
 529
 530    return VK_SUCCESS;
 531 }
 532
 533 void
 534 genX(CmdExecuteCommands)(
 535     VkCommandBuffer                             commandBuffer,
 536     uint32_t                                    commandBufferCount,
 537     const VkCommandBuffer*                      pCmdBuffers)
 538 {
 539    ANV_FROM_HANDLE(anv_cmd_buffer, primary, commandBuffer);
 540
 541    assert(primary->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
 542
 543    for (uint32_t i = 0; i < commandBufferCount; i++) {
 544       ANV_FROM_HANDLE(anv_cmd_buffer, secondary, pCmdBuffers[i]);
 545
 546       assert(secondary->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
 547
 548       if (secondary->usage_flags &
 549           VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
 550          /* If we're continuing a render pass from the primary, we need to
 551           * copy the surface states for the current subpass into the storage
 552           * we allocated for them in BeginCommandBuffer.
 553           */
 554          struct anv_bo *ss_bo = &primary->device->surface_state_block_pool.bo;
 555          struct anv_state src_state = primary->state.render_pass_states;
 556          struct anv_state dst_state = secondary->state.render_pass_states;
 557          assert(src_state.alloc_size == dst_state.alloc_size);
 558
 559          genX(cmd_buffer_gpu_memcpy)(primary, ss_bo, dst_state.offset,
 560                                      ss_bo, src_state.offset,
 561                                      src_state.alloc_size);
 562       }
 563
 564       anv_cmd_buffer_add_secondary(primary, secondary);
 565    }
 566
 567    /* Each of the secondary command buffers will use its own state base
 568     * address.  We need to re-emit state base address for the primary after
 569     * all of the secondaries are done.
 570     *
 571     * TODO: Maybe we want to make this a dirty bit to avoid extra state base
 572     * address calls?
 573     */
 574    genX(cmd_buffer_emit_state_base_address)(primary);
 575 }
 576
 577 #define IVB_L3SQCREG1_SQGHPCI_DEFAULT     0x00730000
 578 #define VLV_L3SQCREG1_SQGHPCI_DEFAULT     0x00d30000
 579 #define HSW_L3SQCREG1_SQGHPCI_DEFAULT     0x00610000
 580
 581 /**
 582  * Program the hardware to use the specified L3 configuration.
 583  */
 584 void
 585 genX(cmd_buffer_config_l3)(struct anv_cmd_buffer *cmd_buffer,
 586                            const struct gen_l3_config *cfg)
 587 {
 588    assert(cfg);
 589    if (cfg == cmd_buffer->state.current_l3_config)
 590       return;
 591
 592    if (unlikely(INTEL_DEBUG & DEBUG_L3)) {
 593       fprintf(stderr, "L3 config transition: ");
 594       gen_dump_l3_config(cfg, stderr);
 595    }
 596
 597    const bool has_slm = cfg->n[GEN_L3P_SLM];
 598
 599    /* According to the hardware docs, the L3 partitioning can only be changed
 600     * while the pipeline is completely drained and the caches are flushed,
 601     * which involves a first PIPE_CONTROL flush which stalls the pipeline...
 602     */
 603    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
 604       pc.DCFlushEnable = true;
 605       pc.PostSyncOperation = NoWrite;
 606       pc.CommandStreamerStallEnable = true;
 607    }
 608
 609    /* ...followed by a second pipelined PIPE_CONTROL that initiates
 610     * invalidation of the relevant caches.  Note that because RO invalidation
 611     * happens at the top of the pipeline (i.e. right away as the PIPE_CONTROL
 612     * command is processed by the CS) we cannot combine it with the previous
 613     * stalling flush as the hardware documentation suggests, because that
 614     * would cause the CS to stall on previous rendering *after* RO
 615     * invalidation and wouldn't prevent the RO caches from being polluted by
 616     * concurrent rendering before the stall completes.  This intentionally
 617     * doesn't implement the SKL+ hardware workaround suggesting to enable CS
 618     * stall on PIPE_CONTROLs with the texture cache invalidation bit set for
 619     * GPGPU workloads because the previous and subsequent PIPE_CONTROLs
 620     * already guarantee that there is no concurrent GPGPU kernel execution
 621     * (see SKL HSD 2132585).
 622     */
 623    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
 624       pc.TextureCacheInvalidationEnable = true;
 625       pc.ConstantCacheInvalidationEnable = true;
 626       pc.InstructionCacheInvalidateEnable = true;
 627       pc.StateCacheInvalidationEnable = true;
 628       pc.PostSyncOperation = NoWrite;
 629    }
 630
 631    /* Now send a third stalling flush to make sure that invalidation is
 632     * complete when the L3 configuration registers are modified.
 633     */
 634    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
 635       pc.DCFlushEnable = true;
 636       pc.PostSyncOperation = NoWrite;
 637       pc.CommandStreamerStallEnable = true;
 638    }
 639
 640 #if GEN_GEN >= 8
 641
 642    assert(!cfg->n[GEN_L3P_IS] && !cfg->n[GEN_L3P_C] && !cfg->n[GEN_L3P_T]);
 643
 644    uint32_t l3cr;
 645    anv_pack_struct(&l3cr, GENX(L3CNTLREG),
 646                    .SLMEnable = has_slm,
 647                    .URBAllocation = cfg->n[GEN_L3P_URB],
 648                    .ROAllocation = cfg->n[GEN_L3P_RO],
 649                    .DCAllocation = cfg->n[GEN_L3P_DC],
 650                    .AllAllocation = cfg->n[GEN_L3P_ALL]);
 651
 652    /* Set up the L3 partitioning. */
 653    emit_lri(&cmd_buffer->batch, GENX(L3CNTLREG_num), l3cr);
 654
 655 #else
 656
 657    const bool has_dc = cfg->n[GEN_L3P_DC] || cfg->n[GEN_L3P_ALL];
 658    const bool has_is = cfg->n[GEN_L3P_IS] || cfg->n[GEN_L3P_RO] ||
 659                        cfg->n[GEN_L3P_ALL];
 660    const bool has_c = cfg->n[GEN_L3P_C] || cfg->n[GEN_L3P_RO] ||
 661                       cfg->n[GEN_L3P_ALL];
 662    const bool has_t = cfg->n[GEN_L3P_T] || cfg->n[GEN_L3P_RO] ||
 663                       cfg->n[GEN_L3P_ALL];
 664
 665    assert(!cfg->n[GEN_L3P_ALL]);
 666
 667    /* When enabled SLM only uses a portion of the L3 on half of the banks,
 668     * the matching space on the remaining banks has to be allocated to a
 669     * client (URB for all validated configurations) set to the
 670     * lower-bandwidth 2-bank address hashing mode.
 671     */
 672    const struct gen_device_info *devinfo = &cmd_buffer->device->info;
 673    const bool urb_low_bw = has_slm && !devinfo->is_baytrail;
 674    assert(!urb_low_bw || cfg->n[GEN_L3P_URB] == cfg->n[GEN_L3P_SLM]);
 675
 676    /* Minimum number of ways that can be allocated to the URB. */
 677    const unsigned n0_urb = (devinfo->is_baytrail ? 32 : 0);
 678    assert(cfg->n[GEN_L3P_URB] >= n0_urb);
 679
 680    uint32_t l3sqcr1, l3cr2, l3cr3;
 681    anv_pack_struct(&l3sqcr1, GENX(L3SQCREG1),
 682                    .ConvertDC_UC = !has_dc,
 683                    .ConvertIS_UC = !has_is,
 684                    .ConvertC_UC = !has_c,
 685                    .ConvertT_UC = !has_t);
 686    l3sqcr1 |=
 687       GEN_IS_HASWELL ? HSW_L3SQCREG1_SQGHPCI_DEFAULT :
 688       devinfo->is_baytrail ? VLV_L3SQCREG1_SQGHPCI_DEFAULT :
 689       IVB_L3SQCREG1_SQGHPCI_DEFAULT;
 690
 691    anv_pack_struct(&l3cr2, GENX(L3CNTLREG2),
 692                    .SLMEnable = has_slm,
 693                    .URBLowBandwidth = urb_low_bw,
 694                    .URBAllocation = cfg->n[GEN_L3P_URB],
 695 #if !GEN_IS_HASWELL
 696                    .ALLAllocation = cfg->n[GEN_L3P_ALL],
 697 #endif
 698                    .ROAllocation = cfg->n[GEN_L3P_RO],
 699                    .DCAllocation = cfg->n[GEN_L3P_DC]);
 700
 701    anv_pack_struct(&l3cr3, GENX(L3CNTLREG3),
 702                    .ISAllocation = cfg->n[GEN_L3P_IS],
 703                    .ISLowBandwidth = 0,
 704                    .CAllocation = cfg->n[GEN_L3P_C],
 705                    .CLowBandwidth = 0,
 706                    .TAllocation = cfg->n[GEN_L3P_T],
 707                    .TLowBandwidth = 0);
 708
 709    /* Set up the L3 partitioning. */
 710    emit_lri(&cmd_buffer->batch, GENX(L3SQCREG1_num), l3sqcr1);
 711    emit_lri(&cmd_buffer->batch, GENX(L3CNTLREG2_num), l3cr2);
 712    emit_lri(&cmd_buffer->batch, GENX(L3CNTLREG3_num), l3cr3);
 713
 714 #if GEN_IS_HASWELL
 715    if (cmd_buffer->device->instance->physicalDevice.cmd_parser_version >= 4) {
 716       /* Enable L3 atomics on HSW if we have a DC partition, otherwise keep
 717        * them disabled to avoid crashing the system hard.
 718        */
 719       uint32_t scratch1, chicken3;
 720       anv_pack_struct(&scratch1, GENX(SCRATCH1),
 721                       .L3AtomicDisable = !has_dc);
 722       anv_pack_struct(&chicken3, GENX(CHICKEN3),
 723                       .L3AtomicDisableMask = true,
 724                       .L3AtomicDisable = !has_dc);
 725       emit_lri(&cmd_buffer->batch, GENX(SCRATCH1_num), scratch1);
 726       emit_lri(&cmd_buffer->batch, GENX(CHICKEN3_num), chicken3);
 727    }
 728 #endif
 729
 730 #endif
 731
 732    cmd_buffer->state.current_l3_config = cfg;
 733 }
 734
 735 void
 736 genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer)
 737 {
 738    enum anv_pipe_bits bits = cmd_buffer->state.pending_pipe_bits;
 739
 740    /* Flushes are pipelined while invalidations are handled immediately.
 741     * Therefore, if we're flushing anything then we need to schedule a stall
 742     * before any invalidations can happen.
 743     */
 744    if (bits & ANV_PIPE_FLUSH_BITS)
 745       bits |= ANV_PIPE_NEEDS_CS_STALL_BIT;
 746
 747    /* If we're going to do an invalidate and we have a pending CS stall that
 748     * has yet to be resolved, we do the CS stall now.
 749     */
 750    if ((bits & ANV_PIPE_INVALIDATE_BITS) &&
 751        (bits & ANV_PIPE_NEEDS_CS_STALL_BIT)) {
 752       bits |= ANV_PIPE_CS_STALL_BIT;
 753       bits &= ~ANV_PIPE_NEEDS_CS_STALL_BIT;
 754    }
 755
 756    if (bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_CS_STALL_BIT)) {
 757       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
 758          pipe.DepthCacheFlushEnable = bits & ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
 759          pipe.DCFlushEnable = bits & ANV_PIPE_DATA_CACHE_FLUSH_BIT;
 760          pipe.RenderTargetCacheFlushEnable =
 761             bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
 762
 763          pipe.DepthStallEnable = bits & ANV_PIPE_DEPTH_STALL_BIT;
 764          pipe.CommandStreamerStallEnable = bits & ANV_PIPE_CS_STALL_BIT;
 765          pipe.StallAtPixelScoreboard = bits & ANV_PIPE_STALL_AT_SCOREBOARD_BIT;
 766
 767          /*
 768           * According to the Broadwell documentation, any PIPE_CONTROL with the
 769           * "Command Streamer Stall" bit set must also have another bit set,
 770           * with five different options:
 771           *
 772           *  - Render Target Cache Flush
 773           *  - Depth Cache Flush
 774           *  - Stall at Pixel Scoreboard
 775           *  - Post-Sync Operation
 776           *  - Depth Stall
 777           *  - DC Flush Enable
 778           *
 779           * I chose "Stall at Pixel Scoreboard" since that's what we use in
 780           * mesa and it seems to work fine. The choice is fairly arbitrary.
 781           */
 782          if ((bits & ANV_PIPE_CS_STALL_BIT) &&
 783              !(bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_DEPTH_STALL_BIT |
 784                        ANV_PIPE_STALL_AT_SCOREBOARD_BIT)))
 785             pipe.StallAtPixelScoreboard = true;
 786       }
 787
 788       bits &= ~(ANV_PIPE_FLUSH_BITS | ANV_PIPE_CS_STALL_BIT);
 789    }
 790
 791    if (bits & ANV_PIPE_INVALIDATE_BITS) {
 792       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
 793          pipe.StateCacheInvalidationEnable =
 794             bits & ANV_PIPE_STATE_CACHE_INVALIDATE_BIT;
 795          pipe.ConstantCacheInvalidationEnable =
 796             bits & ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT;
 797          pipe.VFCacheInvalidationEnable =
 798             bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
 799          pipe.TextureCacheInvalidationEnable =
 800             bits & ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
 801          pipe.InstructionCacheInvalidateEnable =
 802             bits & ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT;
 803       }
 804
 805       bits &= ~ANV_PIPE_INVALIDATE_BITS;
 806    }
 807
 808    cmd_buffer->state.pending_pipe_bits = bits;
 809 }
 810
 811 void genX(CmdPipelineBarrier)(
 812     VkCommandBuffer                             commandBuffer,
 813     VkPipelineStageFlags                        srcStageMask,
 814     VkPipelineStageFlags                        destStageMask,
 815     VkBool32                                    byRegion,
 816     uint32_t                                    memoryBarrierCount,
 817     const VkMemoryBarrier*                      pMemoryBarriers,
 818     uint32_t                                    bufferMemoryBarrierCount,
 819     const VkBufferMemoryBarrier*                pBufferMemoryBarriers,
 820     uint32_t                                    imageMemoryBarrierCount,
 821     const VkImageMemoryBarrier*                 pImageMemoryBarriers)
 822 {
 823    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
 824    uint32_t b;
 825
 826    /* XXX: Right now, we're really dumb and just flush whatever categories
 827     * the app asks for.  One of these days we may make this a bit better
 828     * but right now that's all the hardware allows for in most areas.
 829     */
 830    VkAccessFlags src_flags = 0;
 831    VkAccessFlags dst_flags = 0;
 832
 833    for (uint32_t i = 0; i < memoryBarrierCount; i++) {
 834       src_flags |= pMemoryBarriers[i].srcAccessMask;
 835       dst_flags |= pMemoryBarriers[i].dstAccessMask;
 836    }
 837
 838    for (uint32_t i = 0; i < bufferMemoryBarrierCount; i++) {
 839       src_flags |= pBufferMemoryBarriers[i].srcAccessMask;
 840       dst_flags |= pBufferMemoryBarriers[i].dstAccessMask;
 841    }
 842
 843    for (uint32_t i = 0; i < imageMemoryBarrierCount; i++) {
 844       src_flags |= pImageMemoryBarriers[i].srcAccessMask;
 845       dst_flags |= pImageMemoryBarriers[i].dstAccessMask;
 846    }
 847
 848    enum anv_pipe_bits pipe_bits = 0;
 849
 850    for_each_bit(b, src_flags) {
 851       switch ((VkAccessFlagBits)(1 << b)) {
 852       case VK_ACCESS_SHADER_WRITE_BIT:
 853          pipe_bits |= ANV_PIPE_DATA_CACHE_FLUSH_BIT;
 854          break;
 855       case VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT:
 856          pipe_bits |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
 857          break;
 858       case VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT:
 859          pipe_bits |= ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
 860          break;
 861       case VK_ACCESS_TRANSFER_WRITE_BIT:
 862          pipe_bits |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
 863          pipe_bits |= ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
 864          break;
 865       default:
 866          break; /* Nothing to do */
 867       }
 868    }
 869
 870    for_each_bit(b, dst_flags) {
 871       switch ((VkAccessFlagBits)(1 << b)) {
 872       case VK_ACCESS_INDIRECT_COMMAND_READ_BIT:
 873       case VK_ACCESS_INDEX_READ_BIT:
 874       case VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT:
 875          pipe_bits |= ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
 876          break;
 877       case VK_ACCESS_UNIFORM_READ_BIT:
 878          pipe_bits |= ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT;
 879          pipe_bits |= ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
 880          break;
 881       case VK_ACCESS_SHADER_READ_BIT:
 882       case VK_ACCESS_INPUT_ATTACHMENT_READ_BIT:
 883       case VK_ACCESS_TRANSFER_READ_BIT:
 884          pipe_bits |= ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
 885          break;
 886       default:
 887          break; /* Nothing to do */
 888       }
 889    }
 890
 891    cmd_buffer->state.pending_pipe_bits |= pipe_bits;
 892 }
 893
 894 static void
 895 cmd_buffer_alloc_push_constants(struct anv_cmd_buffer *cmd_buffer)
 896 {
 897    VkShaderStageFlags stages = cmd_buffer->state.pipeline->active_stages;
 898
 899    /* In order to avoid thrash, we assume that vertex and fragment stages
 900     * always exist.  In the rare case where one is missing *and* the other
 901     * uses push concstants, this may be suboptimal.  However, avoiding stalls
 902     * seems more important.
 903     */
 904    stages |= VK_SHADER_STAGE_FRAGMENT_BIT | VK_SHADER_STAGE_VERTEX_BIT;
 905
 906    if (stages == cmd_buffer->state.push_constant_stages)
 907       return;
 908
 909 #if GEN_GEN >= 8
 910    const unsigned push_constant_kb = 32;
 911 #elif GEN_IS_HASWELL
 912    const unsigned push_constant_kb = cmd_buffer->device->info.gt == 3 ? 32 : 16;
 913 #else
 914    const unsigned push_constant_kb = 16;
 915 #endif
 916
 917    const unsigned num_stages =
 918       _mesa_bitcount(stages & VK_SHADER_STAGE_ALL_GRAPHICS);
 919    unsigned size_per_stage = push_constant_kb / num_stages;
 920
 921    /* Broadwell+ and Haswell gt3 require that the push constant sizes be in
 922     * units of 2KB.  Incidentally, these are the same platforms that have
 923     * 32KB worth of push constant space.
 924     */
 925    if (push_constant_kb == 32)
 926       size_per_stage &= ~1u;
 927
 928    uint32_t kb_used = 0;
 929    for (int i = MESA_SHADER_VERTEX; i < MESA_SHADER_FRAGMENT; i++) {
 930       unsigned push_size = (stages & (1 << i)) ? size_per_stage : 0;
 931       anv_batch_emit(&cmd_buffer->batch,
 932                      GENX(3DSTATE_PUSH_CONSTANT_ALLOC_VS), alloc) {
 933          alloc._3DCommandSubOpcode  = 18 + i;
 934          alloc.ConstantBufferOffset = (push_size > 0) ? kb_used : 0;
 935          alloc.ConstantBufferSize   = push_size;
 936       }
 937       kb_used += push_size;
 938    }
 939
 940    anv_batch_emit(&cmd_buffer->batch,
 941                   GENX(3DSTATE_PUSH_CONSTANT_ALLOC_PS), alloc) {
 942       alloc.ConstantBufferOffset = kb_used;
 943       alloc.ConstantBufferSize = push_constant_kb - kb_used;
 944    }
 945
 946    cmd_buffer->state.push_constant_stages = stages;
 947
 948    /* From the BDW PRM for 3DSTATE_PUSH_CONSTANT_ALLOC_VS:
 949     *
 950     *    "The 3DSTATE_CONSTANT_VS must be reprogrammed prior to
 951     *    the next 3DPRIMITIVE command after programming the
 952     *    3DSTATE_PUSH_CONSTANT_ALLOC_VS"
 953     *
 954     * Since 3DSTATE_PUSH_CONSTANT_ALLOC_VS is programmed as part of
 955     * pipeline setup, we need to dirty push constants.
 956     */
 957    cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_ALL_GRAPHICS;
 958 }
 959
 960 static VkResult
 961 emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
 962                    gl_shader_stage stage,
 963                    struct anv_state *bt_state)
 964 {
 965    struct anv_subpass *subpass = cmd_buffer->state.subpass;
 966    struct anv_pipeline *pipeline;
 967    uint32_t bias, state_offset;
 968
 969    switch (stage) {
 970    case  MESA_SHADER_COMPUTE:
 971       pipeline = cmd_buffer->state.compute_pipeline;
 972       bias = 1;
 973       break;
 974    default:
 975       pipeline = cmd_buffer->state.pipeline;
 976       bias = 0;
 977       break;
 978    }
 979
 980    if (!anv_pipeline_has_stage(pipeline, stage)) {
 981       *bt_state = (struct anv_state) { 0, };
 982       return VK_SUCCESS;
 983    }
 984
 985    struct anv_pipeline_bind_map *map = &pipeline->shaders[stage]->bind_map;
 986    if (bias + map->surface_count == 0) {
 987       *bt_state = (struct anv_state) { 0, };
 988       return VK_SUCCESS;
 989    }
 990
 991    *bt_state = anv_cmd_buffer_alloc_binding_table(cmd_buffer,
 992                                                   bias + map->surface_count,
 993                                                   &state_offset);
 994    uint32_t *bt_map = bt_state->map;
 995
 996    if (bt_state->map == NULL)
 997       return VK_ERROR_OUT_OF_DEVICE_MEMORY;
 998
 999    if (stage == MESA_SHADER_COMPUTE &&
1000        get_cs_prog_data(cmd_buffer->state.compute_pipeline)->uses_num_work_groups) {
1001       struct anv_bo *bo = cmd_buffer->state.num_workgroups_bo;
1002       uint32_t bo_offset = cmd_buffer->state.num_workgroups_offset;
1003
1004       struct anv_state surface_state;
1005       surface_state =
1006          anv_cmd_buffer_alloc_surface_state(cmd_buffer);
1007
1008       const enum isl_format format =
1009          anv_isl_format_for_descriptor_type(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
1010       anv_fill_buffer_surface_state(cmd_buffer->device, surface_state,
1011                                     format, bo_offset, 12, 1);
1012
1013       bt_map[0] = surface_state.offset + state_offset;
1014       add_surface_state_reloc(cmd_buffer, surface_state, bo, bo_offset);
1015    }
1016
1017    if (map->surface_count == 0)
1018       goto out;
1019
1020    if (map->image_count > 0) {
1021       VkResult result =
1022          anv_cmd_buffer_ensure_push_constant_field(cmd_buffer, stage, images);
1023       if (result != VK_SUCCESS)
1024          return result;
1025
1026       cmd_buffer->state.push_constants_dirty |= 1 << stage;
1027    }
1028
1029    uint32_t image = 0;
1030    for (uint32_t s = 0; s < map->surface_count; s++) {
1031       struct anv_pipeline_binding *binding = &map->surface_to_descriptor[s];
1032
1033       struct anv_state surface_state;
1034
1035       if (binding->set == ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS) {
1036          /* Color attachment binding */
1037          assert(stage == MESA_SHADER_FRAGMENT);
1038          assert(binding->binding == 0);
1039          if (binding->index < subpass->color_count) {
1040             const unsigned att = subpass->color_attachments[binding->index];
1041             surface_state = cmd_buffer->state.attachments[att].color_rt_state;
1042          } else {
1043             surface_state = cmd_buffer->state.null_surface_state;
1044          }
1045
1046          bt_map[bias + s] = surface_state.offset + state_offset;
1047          continue;
1048       }
1049
1050       struct anv_descriptor_set *set =
1051          cmd_buffer->state.descriptors[binding->set];
1052       uint32_t offset = set->layout->binding[binding->binding].descriptor_index;
1053       struct anv_descriptor *desc = &set->descriptors[offset + binding->index];
1054
1055       switch (desc->type) {
1056       case VK_DESCRIPTOR_TYPE_SAMPLER:
1057          /* Nothing for us to do here */
1058          continue;
1059
1060       case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
1061       case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
1062          surface_state = desc->image_view->sampler_surface_state;
1063          assert(surface_state.alloc_size);
1064          add_image_view_relocs(cmd_buffer, desc->image_view,
1065                                desc->image_view->image->aux_usage,
1066                                surface_state);
1067          break;
1068
1069       case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
1070          assert(stage == MESA_SHADER_FRAGMENT);
1071          if (desc->image_view->aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT) {
1072             /* For stencil input attachments, we treat it like any old texture
1073              * that a user may have bound.
1074              */
1075             surface_state = desc->image_view->sampler_surface_state;
1076             assert(surface_state.alloc_size);
1077             add_image_view_relocs(cmd_buffer, desc->image_view,
1078                                   desc->image_view->image->aux_usage,
1079                                   surface_state);
1080          } else {
1081             /* For depth and color input attachments, we create the surface
1082              * state at vkBeginRenderPass time so that we can include aux
1083              * and clear color information.
1084              */
1085             assert(binding->input_attachment_index < subpass->input_count);
1086             const unsigned subpass_att = binding->input_attachment_index;
1087             const unsigned att = subpass->input_attachments[subpass_att];
1088             surface_state = cmd_buffer->state.attachments[att].input_att_state;
1089          }
1090          break;
1091
1092       case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: {
1093          surface_state = desc->image_view->storage_surface_state;
1094          assert(surface_state.alloc_size);
1095          add_image_view_relocs(cmd_buffer, desc->image_view,
1096                                desc->image_view->image->aux_usage,
1097                                surface_state);
1098
1099          struct brw_image_param *image_param =
1100             &cmd_buffer->state.push_constants[stage]->images[image++];
1101
1102          *image_param = desc->image_view->storage_image_param;
1103          image_param->surface_idx = bias + s;
1104          break;
1105       }
1106
1107       case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
1108       case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
1109       case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
1110       case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
1111       case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
1112          surface_state = desc->buffer_view->surface_state;
1113          assert(surface_state.alloc_size);
1114          add_surface_state_reloc(cmd_buffer, surface_state,
1115                                  desc->buffer_view->bo,
1116                                  desc->buffer_view->offset);
1117          break;
1118
1119       case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
1120          surface_state = desc->buffer_view->storage_surface_state;
1121          assert(surface_state.alloc_size);
1122          add_surface_state_reloc(cmd_buffer, surface_state,
1123                                  desc->buffer_view->bo,
1124                                  desc->buffer_view->offset);
1125
1126          struct brw_image_param *image_param =
1127             &cmd_buffer->state.push_constants[stage]->images[image++];
1128
1129          *image_param = desc->buffer_view->storage_image_param;
1130          image_param->surface_idx = bias + s;
1131          break;
1132
1133       default:
1134          assert(!"Invalid descriptor type");
1135          continue;
1136       }
1137
1138       bt_map[bias + s] = surface_state.offset + state_offset;
1139    }
1140    assert(image == map->image_count);
1141
1142  out:
1143    if (!cmd_buffer->device->info.has_llc)
1144       anv_state_clflush(*bt_state);
1145
1146    return VK_SUCCESS;
1147 }
1148
1149 static VkResult
1150 emit_samplers(struct anv_cmd_buffer *cmd_buffer,
1151               gl_shader_stage stage,
1152               struct anv_state *state)
1153 {
1154    struct anv_pipeline *pipeline;
1155
1156    if (stage == MESA_SHADER_COMPUTE)
1157       pipeline = cmd_buffer->state.compute_pipeline;
1158    else
1159       pipeline = cmd_buffer->state.pipeline;
1160
1161    if (!anv_pipeline_has_stage(pipeline, stage)) {
1162       *state = (struct anv_state) { 0, };
1163       return VK_SUCCESS;
1164    }
1165
1166    struct anv_pipeline_bind_map *map = &pipeline->shaders[stage]->bind_map;
1167    if (map->sampler_count == 0) {
1168       *state = (struct anv_state) { 0, };
1169       return VK_SUCCESS;
1170    }
1171
1172    uint32_t size = map->sampler_count * 16;
1173    *state = anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, size, 32);
1174
1175    if (state->map == NULL)
1176       return VK_ERROR_OUT_OF_DEVICE_MEMORY;
1177
1178    for (uint32_t s = 0; s < map->sampler_count; s++) {
1179       struct anv_pipeline_binding *binding = &map->sampler_to_descriptor[s];
1180       struct anv_descriptor_set *set =
1181          cmd_buffer->state.descriptors[binding->set];
1182       uint32_t offset = set->layout->binding[binding->binding].descriptor_index;
1183       struct anv_descriptor *desc = &set->descriptors[offset + binding->index];
1184
1185       if (desc->type != VK_DESCRIPTOR_TYPE_SAMPLER &&
1186           desc->type != VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER)
1187          continue;
1188
1189       struct anv_sampler *sampler = desc->sampler;
1190
1191       /* This can happen if we have an unfilled slot since TYPE_SAMPLER
1192        * happens to be zero.
1193        */
1194       if (sampler == NULL)
1195          continue;
1196
1197       memcpy(state->map + (s * 16),
1198              sampler->state, sizeof(sampler->state));
1199    }
1200
1201    if (!cmd_buffer->device->info.has_llc)
1202       anv_state_clflush(*state);
1203
1204    return VK_SUCCESS;
1205 }
1206
1207 static uint32_t
1208 flush_descriptor_sets(struct anv_cmd_buffer *cmd_buffer)
1209 {
1210    VkShaderStageFlags dirty = cmd_buffer->state.descriptors_dirty &
1211                               cmd_buffer->state.pipeline->active_stages;
1212
1213    VkResult result = VK_SUCCESS;
1214    anv_foreach_stage(s, dirty) {
1215       result = emit_samplers(cmd_buffer, s, &cmd_buffer->state.samplers[s]);
1216       if (result != VK_SUCCESS)
1217          break;
1218       result = emit_binding_table(cmd_buffer, s,
1219                                   &cmd_buffer->state.binding_tables[s]);
1220       if (result != VK_SUCCESS)
1221          break;
1222    }
1223
1224    if (result != VK_SUCCESS) {
1225       assert(result == VK_ERROR_OUT_OF_DEVICE_MEMORY);
1226
1227       result = anv_cmd_buffer_new_binding_table_block(cmd_buffer);
1228       assert(result == VK_SUCCESS);
1229
1230       /* Re-emit state base addresses so we get the new surface state base
1231        * address before we start emitting binding tables etc.
1232        */
1233       genX(cmd_buffer_emit_state_base_address)(cmd_buffer);
1234
1235       /* Re-emit all active binding tables */
1236       dirty |= cmd_buffer->state.pipeline->active_stages;
1237       anv_foreach_stage(s, dirty) {
1238          result = emit_samplers(cmd_buffer, s, &cmd_buffer->state.samplers[s]);
1239          if (result != VK_SUCCESS)
1240             return result;
1241          result = emit_binding_table(cmd_buffer, s,
1242                                      &cmd_buffer->state.binding_tables[s]);
1243          if (result != VK_SUCCESS)
1244             return result;
1245       }
1246    }
1247
1248    cmd_buffer->state.descriptors_dirty &= ~dirty;
1249
1250    return dirty;
1251 }
1252
1253 static void
1254 cmd_buffer_emit_descriptor_pointers(struct anv_cmd_buffer *cmd_buffer,
1255                                     uint32_t stages)
1256 {
1257    static const uint32_t sampler_state_opcodes[] = {
1258       [MESA_SHADER_VERTEX]                      = 43,
1259       [MESA_SHADER_TESS_CTRL]                   = 44, /* HS */
1260       [MESA_SHADER_TESS_EVAL]                   = 45, /* DS */
1261       [MESA_SHADER_GEOMETRY]                    = 46,
1262       [MESA_SHADER_FRAGMENT]                    = 47,
1263       [MESA_SHADER_COMPUTE]                     = 0,
1264    };
1265
1266    static const uint32_t binding_table_opcodes[] = {
1267       [MESA_SHADER_VERTEX]                      = 38,
1268       [MESA_SHADER_TESS_CTRL]                   = 39,
1269       [MESA_SHADER_TESS_EVAL]                   = 40,
1270       [MESA_SHADER_GEOMETRY]                    = 41,
1271       [MESA_SHADER_FRAGMENT]                    = 42,
1272       [MESA_SHADER_COMPUTE]                     = 0,
1273    };
1274
1275    anv_foreach_stage(s, stages) {
1276       if (cmd_buffer->state.samplers[s].alloc_size > 0) {
1277          anv_batch_emit(&cmd_buffer->batch,
1278                         GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ssp) {
1279             ssp._3DCommandSubOpcode = sampler_state_opcodes[s];
1280             ssp.PointertoVSSamplerState = cmd_buffer->state.samplers[s].offset;
1281          }
1282       }
1283
1284       /* Always emit binding table pointers if we're asked to, since on SKL
1285        * this is what flushes push constants. */
1286       anv_batch_emit(&cmd_buffer->batch,
1287                      GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), btp) {
1288          btp._3DCommandSubOpcode = binding_table_opcodes[s];
1289          btp.PointertoVSBindingTable = cmd_buffer->state.binding_tables[s].offset;
1290       }
1291    }
1292 }
1293
1294 static uint32_t
1295 cmd_buffer_flush_push_constants(struct anv_cmd_buffer *cmd_buffer)
1296 {
1297    static const uint32_t push_constant_opcodes[] = {
1298       [MESA_SHADER_VERTEX]                      = 21,
1299       [MESA_SHADER_TESS_CTRL]                   = 25, /* HS */
1300       [MESA_SHADER_TESS_EVAL]                   = 26, /* DS */
1301       [MESA_SHADER_GEOMETRY]                    = 22,
1302       [MESA_SHADER_FRAGMENT]                    = 23,
1303       [MESA_SHADER_COMPUTE]                     = 0,
1304    };
1305
1306    VkShaderStageFlags flushed = 0;
1307
1308    anv_foreach_stage(stage, cmd_buffer->state.push_constants_dirty) {
1309       if (stage == MESA_SHADER_COMPUTE)
1310          continue;
1311
1312       struct anv_state state = anv_cmd_buffer_push_constants(cmd_buffer, stage);
1313
1314       if (state.offset == 0) {
1315          anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_VS), c)
1316             c._3DCommandSubOpcode = push_constant_opcodes[stage];
1317       } else {
1318          anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_VS), c) {
1319             c._3DCommandSubOpcode = push_constant_opcodes[stage],
1320             c.ConstantBody = (struct GENX(3DSTATE_CONSTANT_BODY)) {
1321 #if GEN_GEN >= 9
1322                .PointerToConstantBuffer2 = { &cmd_buffer->device->dynamic_state_block_pool.bo, state.offset },
1323                .ConstantBuffer2ReadLength = DIV_ROUND_UP(state.alloc_size, 32),
1324 #else
1325                .PointerToConstantBuffer0 = { .offset = state.offset },
1326                .ConstantBuffer0ReadLength = DIV_ROUND_UP(state.alloc_size, 32),
1327 #endif
1328             };
1329          }
1330       }
1331
1332       flushed |= mesa_to_vk_shader_stage(stage);
1333    }
1334
1335    cmd_buffer->state.push_constants_dirty &= ~VK_SHADER_STAGE_ALL_GRAPHICS;
1336
1337    return flushed;
1338 }
1339
1340 void
1341 genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer)
1342 {
1343    struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
1344    uint32_t *p;
1345
1346    uint32_t vb_emit = cmd_buffer->state.vb_dirty & pipeline->vb_used;
1347
1348    assert((pipeline->active_stages & VK_SHADER_STAGE_COMPUTE_BIT) == 0);
1349
1350    genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->urb.l3_config);
1351
1352    genX(flush_pipeline_select_3d)(cmd_buffer);
1353
1354    if (vb_emit) {
1355       const uint32_t num_buffers = __builtin_popcount(vb_emit);
1356       const uint32_t num_dwords = 1 + num_buffers * 4;
1357
1358       p = anv_batch_emitn(&cmd_buffer->batch, num_dwords,
1359                           GENX(3DSTATE_VERTEX_BUFFERS));
1360       uint32_t vb, i = 0;
1361       for_each_bit(vb, vb_emit) {
1362          struct anv_buffer *buffer = cmd_buffer->state.vertex_bindings[vb].buffer;
1363          uint32_t offset = cmd_buffer->state.vertex_bindings[vb].offset;
1364
1365          struct GENX(VERTEX_BUFFER_STATE) state = {
1366             .VertexBufferIndex = vb,
1367
1368 #if GEN_GEN >= 8
1369             .MemoryObjectControlState = GENX(MOCS),
1370 #else
1371             .BufferAccessType = pipeline->instancing_enable[vb] ? INSTANCEDATA : VERTEXDATA,
1372             .InstanceDataStepRate = 1,
1373             .VertexBufferMemoryObjectControlState = GENX(MOCS),
1374 #endif
1375
1376             .AddressModifyEnable = true,
1377             .BufferPitch = pipeline->binding_stride[vb],
1378             .BufferStartingAddress = { buffer->bo, buffer->offset + offset },
1379
1380 #if GEN_GEN >= 8
1381             .BufferSize = buffer->size - offset
1382 #else
1383             .EndAddress = { buffer->bo, buffer->offset + buffer->size - 1},
1384 #endif
1385          };
1386
1387          GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, &p[1 + i * 4], &state);
1388          i++;
1389       }
1390    }
1391
1392    cmd_buffer->state.vb_dirty &= ~vb_emit;
1393
1394    if (cmd_buffer->state.dirty & ANV_CMD_DIRTY_PIPELINE) {
1395       anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->batch);
1396
1397       /* The exact descriptor layout is pulled from the pipeline, so we need
1398        * to re-emit binding tables on every pipeline change.
1399        */
1400       cmd_buffer->state.descriptors_dirty |=
1401          cmd_buffer->state.pipeline->active_stages;
1402
1403       /* If the pipeline changed, we may need to re-allocate push constant
1404        * space in the URB.
1405        */
1406       cmd_buffer_alloc_push_constants(cmd_buffer);
1407    }
1408
1409 #if GEN_GEN <= 7
1410    if (cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_VERTEX_BIT ||
1411        cmd_buffer->state.push_constants_dirty & VK_SHADER_STAGE_VERTEX_BIT) {
1412       /* From the IVB PRM Vol. 2, Part 1, Section 3.2.1:
1413        *
1414        *    "A PIPE_CONTROL with Post-Sync Operation set to 1h and a depth
1415        *    stall needs to be sent just prior to any 3DSTATE_VS,
1416        *    3DSTATE_URB_VS, 3DSTATE_CONSTANT_VS,
1417        *    3DSTATE_BINDING_TABLE_POINTER_VS,
1418        *    3DSTATE_SAMPLER_STATE_POINTER_VS command.  Only one
1419        *    PIPE_CONTROL needs to be sent before any combination of VS
1420        *    associated 3DSTATE."
1421        */
1422       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1423          pc.DepthStallEnable  = true;
1424          pc.PostSyncOperation = WriteImmediateData;
1425          pc.Address           =
1426             (struct anv_address) { &cmd_buffer->device->workaround_bo, 0 };
1427       }
1428    }
1429 #endif
1430
1431    /* Render targets live in the same binding table as fragment descriptors */
1432    if (cmd_buffer->state.dirty & ANV_CMD_DIRTY_RENDER_TARGETS)
1433       cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT;
1434
1435    /* We emit the binding tables and sampler tables first, then emit push
1436     * constants and then finally emit binding table and sampler table
1437     * pointers.  It has to happen in this order, since emitting the binding
1438     * tables may change the push constants (in case of storage images). After
1439     * emitting push constants, on SKL+ we have to emit the corresponding
1440     * 3DSTATE_BINDING_TABLE_POINTER_* for the push constants to take effect.
1441     */
1442    uint32_t dirty = 0;
1443    if (cmd_buffer->state.descriptors_dirty)
1444       dirty = flush_descriptor_sets(cmd_buffer);
1445
1446    if (cmd_buffer->state.push_constants_dirty) {
1447 #if GEN_GEN >= 9
1448       /* On Sky Lake and later, the binding table pointers commands are
1449        * what actually flush the changes to push constant state so we need
1450        * to dirty them so they get re-emitted below.
1451        */
1452       dirty |= cmd_buffer_flush_push_constants(cmd_buffer);
1453 #else
1454       cmd_buffer_flush_push_constants(cmd_buffer);
1455 #endif
1456    }
1457
1458    if (dirty)
1459       cmd_buffer_emit_descriptor_pointers(cmd_buffer, dirty);
1460
1461    if (cmd_buffer->state.dirty & ANV_CMD_DIRTY_DYNAMIC_VIEWPORT)
1462       gen8_cmd_buffer_emit_viewport(cmd_buffer);
1463
1464    if (cmd_buffer->state.dirty & (ANV_CMD_DIRTY_DYNAMIC_VIEWPORT |
1465                                   ANV_CMD_DIRTY_PIPELINE)) {
1466       gen8_cmd_buffer_emit_depth_viewport(cmd_buffer,
1467                                           pipeline->depth_clamp_enable);
1468    }
1469
1470    if (cmd_buffer->state.dirty & ANV_CMD_DIRTY_DYNAMIC_SCISSOR)
1471       gen7_cmd_buffer_emit_scissor(cmd_buffer);
1472
1473    genX(cmd_buffer_flush_dynamic_state)(cmd_buffer);
1474
1475    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
1476 }
1477
1478 static void
1479 emit_base_vertex_instance_bo(struct anv_cmd_buffer *cmd_buffer,
1480                              struct anv_bo *bo, uint32_t offset)
1481 {
1482    uint32_t *p = anv_batch_emitn(&cmd_buffer->batch, 5,
1483                                  GENX(3DSTATE_VERTEX_BUFFERS));
1484
1485    GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, p + 1,
1486       &(struct GENX(VERTEX_BUFFER_STATE)) {
1487          .VertexBufferIndex = 32, /* Reserved for this */
1488          .AddressModifyEnable = true,
1489          .BufferPitch = 0,
1490 #if (GEN_GEN >= 8)
1491          .MemoryObjectControlState = GENX(MOCS),
1492          .BufferStartingAddress = { bo, offset },
1493          .BufferSize = 8
1494 #else
1495          .VertexBufferMemoryObjectControlState = GENX(MOCS),
1496          .BufferStartingAddress = { bo, offset },
1497          .EndAddress = { bo, offset + 8 },
1498 #endif
1499       });
1500 }
1501
1502 static void
1503 emit_base_vertex_instance(struct anv_cmd_buffer *cmd_buffer,
1504                           uint32_t base_vertex, uint32_t base_instance)
1505 {
1506    struct anv_state id_state =
1507       anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 8, 4);
1508
1509    ((uint32_t *)id_state.map)[0] = base_vertex;
1510    ((uint32_t *)id_state.map)[1] = base_instance;
1511
1512    if (!cmd_buffer->device->info.has_llc)
1513       anv_state_clflush(id_state);
1514
1515    emit_base_vertex_instance_bo(cmd_buffer,
1516       &cmd_buffer->device->dynamic_state_block_pool.bo, id_state.offset);
1517 }
1518
1519 void genX(CmdDraw)(
1520     VkCommandBuffer                             commandBuffer,
1521     uint32_t                                    vertexCount,
1522     uint32_t                                    instanceCount,
1523     uint32_t                                    firstVertex,
1524     uint32_t                                    firstInstance)
1525 {
1526    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1527    struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
1528    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
1529
1530    genX(cmd_buffer_flush_state)(cmd_buffer);
1531
1532    if (vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance)
1533       emit_base_vertex_instance(cmd_buffer, firstVertex, firstInstance);
1534
1535    anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
1536       prim.VertexAccessType         = SEQUENTIAL;
1537       prim.PrimitiveTopologyType    = pipeline->topology;
1538       prim.VertexCountPerInstance   = vertexCount;
1539       prim.StartVertexLocation      = firstVertex;
1540       prim.InstanceCount            = instanceCount;
1541       prim.StartInstanceLocation    = firstInstance;
1542       prim.BaseVertexLocation       = 0;
1543    }
1544 }
1545
1546 void genX(CmdDrawIndexed)(
1547     VkCommandBuffer                             commandBuffer,
1548     uint32_t                                    indexCount,
1549     uint32_t                                    instanceCount,
1550     uint32_t                                    firstIndex,
1551     int32_t                                     vertexOffset,
1552     uint32_t                                    firstInstance)
1553 {
1554    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1555    struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
1556    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
1557
1558    genX(cmd_buffer_flush_state)(cmd_buffer);
1559
1560    if (vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance)
1561       emit_base_vertex_instance(cmd_buffer, vertexOffset, firstInstance);
1562
1563    anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
1564       prim.VertexAccessType         = RANDOM;
1565       prim.PrimitiveTopologyType    = pipeline->topology;
1566       prim.VertexCountPerInstance   = indexCount;
1567       prim.StartVertexLocation      = firstIndex;
1568       prim.InstanceCount            = instanceCount;
1569       prim.StartInstanceLocation    = firstInstance;
1570       prim.BaseVertexLocation       = vertexOffset;
1571    }
1572 }
1573
/* Auto-Draw / Indirect Registers
 *
 * Register offsets that the indirect draw entry points in this file load
 * (via emit_lrm() / emit_lri()) before emitting a 3DPRIMITIVE with
 * IndirectParameterEnable set.
 */
#define GEN7_3DPRIM_END_OFFSET          0x2420
#define GEN7_3DPRIM_START_VERTEX        0x2430
#define GEN7_3DPRIM_VERTEX_COUNT        0x2434
#define GEN7_3DPRIM_INSTANCE_COUNT      0x2438
#define GEN7_3DPRIM_START_INSTANCE      0x243C
#define GEN7_3DPRIM_BASE_VERTEX         0x2440
1581
1582 void genX(CmdDrawIndirect)(
1583     VkCommandBuffer                             commandBuffer,
1584     VkBuffer                                    _buffer,
1585     VkDeviceSize                                offset,
1586     uint32_t                                    drawCount,
1587     uint32_t                                    stride)
1588 {
1589    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1590    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
1591    struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
1592    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
1593    struct anv_bo *bo = buffer->bo;
1594    uint32_t bo_offset = buffer->offset + offset;
1595
1596    genX(cmd_buffer_flush_state)(cmd_buffer);
1597
1598    if (vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance)
1599       emit_base_vertex_instance_bo(cmd_buffer, bo, bo_offset + 8);
1600
1601    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_VERTEX_COUNT, bo, bo_offset);
1602    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset + 4);
1603    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_VERTEX, bo, bo_offset + 8);
1604    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_INSTANCE, bo, bo_offset + 12);
1605    emit_lri(&cmd_buffer->batch, GEN7_3DPRIM_BASE_VERTEX, 0);
1606
1607    anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
1608       prim.IndirectParameterEnable  = true;
1609       prim.VertexAccessType         = SEQUENTIAL;
1610       prim.PrimitiveTopologyType    = pipeline->topology;
1611    }
1612 }
1613
1614 void genX(CmdDrawIndexedIndirect)(
1615     VkCommandBuffer                             commandBuffer,
1616     VkBuffer                                    _buffer,
1617     VkDeviceSize                                offset,
1618     uint32_t                                    drawCount,
1619     uint32_t                                    stride)
1620 {
1621    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1622    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
1623    struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
1624    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
1625    struct anv_bo *bo = buffer->bo;
1626    uint32_t bo_offset = buffer->offset + offset;
1627
1628    genX(cmd_buffer_flush_state)(cmd_buffer);
1629
1630    /* TODO: We need to stomp base vertex to 0 somehow */
1631    if (vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance)
1632       emit_base_vertex_instance_bo(cmd_buffer, bo, bo_offset + 12);
1633
1634    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_VERTEX_COUNT, bo, bo_offset);
1635    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset + 4);
1636    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_VERTEX, bo, bo_offset + 8);
1637    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_BASE_VERTEX, bo, bo_offset + 12);
1638    emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_INSTANCE, bo, bo_offset + 16);
1639
1640    anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
1641       prim.IndirectParameterEnable  = true;
1642       prim.VertexAccessType         = RANDOM;
1643       prim.PrimitiveTopologyType    = pipeline->topology;
1644    }
1645 }
1646
1647 static VkResult
1648 flush_compute_descriptor_set(struct anv_cmd_buffer *cmd_buffer)
1649 {
1650    struct anv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
1651    struct anv_state surfaces = { 0, }, samplers = { 0, };
1652    VkResult result;
1653
1654    result = emit_binding_table(cmd_buffer, MESA_SHADER_COMPUTE, &surfaces);
1655    if (result != VK_SUCCESS) {
1656       assert(result == VK_ERROR_OUT_OF_DEVICE_MEMORY);
1657       result = anv_cmd_buffer_new_binding_table_block(cmd_buffer);
1658       assert(result == VK_SUCCESS);
1659
1660       /* Re-emit state base addresses so we get the new surface state base
1661        * address before we start emitting binding tables etc.
1662        */
1663       genX(cmd_buffer_emit_state_base_address)(cmd_buffer);
1664
1665       result = emit_binding_table(cmd_buffer, MESA_SHADER_COMPUTE, &surfaces);
1666       assert(result == VK_SUCCESS);
1667    }
1668
1669    result = emit_samplers(cmd_buffer, MESA_SHADER_COMPUTE, &samplers);
1670    assert(result == VK_SUCCESS);
1671
1672    uint32_t iface_desc_data_dw[GENX(INTERFACE_DESCRIPTOR_DATA_length)];
1673    struct GENX(INTERFACE_DESCRIPTOR_DATA) desc = {
1674       .BindingTablePointer = surfaces.offset,
1675       .SamplerStatePointer = samplers.offset,
1676    };
1677    GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL, iface_desc_data_dw, &desc);
1678
1679    struct anv_state state =
1680       anv_cmd_buffer_merge_dynamic(cmd_buffer, iface_desc_data_dw,
1681                                    pipeline->interface_descriptor_data,
1682                                    GENX(INTERFACE_DESCRIPTOR_DATA_length),
1683                                    64);
1684
1685    uint32_t size = GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
1686    anv_batch_emit(&cmd_buffer->batch,
1687                   GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), mid) {
1688       mid.InterfaceDescriptorTotalLength        = size;
1689       mid.InterfaceDescriptorDataStartAddress   = state.offset;
1690    }
1691
1692    return VK_SUCCESS;
1693 }
1694
1695 void
1696 genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer)
1697 {
1698    struct anv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
1699    MAYBE_UNUSED VkResult result;
1700
1701    assert(pipeline->active_stages == VK_SHADER_STAGE_COMPUTE_BIT);
1702
1703    genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->urb.l3_config);
1704
1705    genX(flush_pipeline_select_gpgpu)(cmd_buffer);
1706
1707    if (cmd_buffer->state.compute_dirty & ANV_CMD_DIRTY_PIPELINE) {
1708       /* From the Sky Lake PRM Vol 2a, MEDIA_VFE_STATE:
1709        *
1710        *    "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
1711        *    the only bits that are changed are scoreboard related: Scoreboard
1712        *    Enable, Scoreboard Type, Scoreboard Mask, Scoreboard * Delta. For
1713        *    these scoreboard related states, a MEDIA_STATE_FLUSH is
1714        *    sufficient."
1715        */
1716       cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_CS_STALL_BIT;
1717       genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
1718
1719       anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->batch);
1720    }
1721
1722    if ((cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_COMPUTE_BIT) ||
1723        (cmd_buffer->state.compute_dirty & ANV_CMD_DIRTY_PIPELINE)) {
1724       /* FIXME: figure out descriptors for gen7 */
1725       result = flush_compute_descriptor_set(cmd_buffer);
1726       assert(result == VK_SUCCESS);
1727       cmd_buffer->state.descriptors_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT;
1728    }
1729
1730    if (cmd_buffer->state.push_constants_dirty & VK_SHADER_STAGE_COMPUTE_BIT) {
1731       struct anv_state push_state =
1732          anv_cmd_buffer_cs_push_constants(cmd_buffer);
1733
1734       if (push_state.alloc_size) {
1735          anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_CURBE_LOAD), curbe) {
1736             curbe.CURBETotalDataLength    = push_state.alloc_size;
1737             curbe.CURBEDataStartAddress   = push_state.offset;
1738          }
1739       }
1740    }
1741
1742    cmd_buffer->state.compute_dirty = 0;
1743
1744    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
1745 }
1746
#if GEN_GEN == 7

/* Return true when the physical device's command parser version is at least
 * `required_version`.  Otherwise report VK_ERROR_FEATURE_NOT_PRESENT through
 * vk_errorf(), naming `function` in the message, and return false.
 */
static bool
verify_cmd_parser(const struct anv_device *device,
                  int required_version,
                  const char *function)
{
   if (device->instance->physicalDevice.cmd_parser_version >= required_version)
      return true;

   vk_errorf(VK_ERROR_FEATURE_NOT_PRESENT,
             "cmd parser version %d is required for %s",
             required_version, function);
   return false;
}

#endif
1765
1766 void genX(CmdDispatch)(
1767     VkCommandBuffer                             commandBuffer,
1768     uint32_t                                    x,
1769     uint32_t                                    y,
1770     uint32_t                                    z)
1771 {
1772    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1773    struct anv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
1774    const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
1775
1776    if (prog_data->uses_num_work_groups) {
1777       struct anv_state state =
1778          anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 12, 4);
1779       uint32_t *sizes = state.map;
1780       sizes[0] = x;
1781       sizes[1] = y;
1782       sizes[2] = z;
1783       if (!cmd_buffer->device->info.has_llc)
1784          anv_state_clflush(state);
1785       cmd_buffer->state.num_workgroups_offset = state.offset;
1786       cmd_buffer->state.num_workgroups_bo =
1787          &cmd_buffer->device->dynamic_state_block_pool.bo;
1788    }
1789
1790    genX(cmd_buffer_flush_compute_state)(cmd_buffer);
1791
1792    anv_batch_emit(&cmd_buffer->batch, GENX(GPGPU_WALKER), ggw) {
1793       ggw.SIMDSize                     = prog_data->simd_size / 16;
1794       ggw.ThreadDepthCounterMaximum    = 0;
1795       ggw.ThreadHeightCounterMaximum   = 0;
1796       ggw.ThreadWidthCounterMaximum    = prog_data->threads - 1;
1797       ggw.ThreadGroupIDXDimension      = x;
1798       ggw.ThreadGroupIDYDimension      = y;
1799       ggw.ThreadGroupIDZDimension      = z;
1800       ggw.RightExecutionMask           = pipeline->cs_right_mask;
1801       ggw.BottomExecutionMask          = 0xffffffff;
1802    }
1803
1804    anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_STATE_FLUSH), msf);
1805 }
1806
/* Registers loaded with the X/Y/Z workgroup counts for an indirect dispatch
 * (see genX(CmdDispatchIndirect)).
 */
#define GPGPU_DISPATCHDIMX 0x2500
#define GPGPU_DISPATCHDIMY 0x2504
#define GPGPU_DISPATCHDIMZ 0x2508

/* Source registers compared by MI_PREDICATE.  The code in this file treats
 * each one as two dwords: the base offset and base offset + 4.
 */
#define MI_PREDICATE_SRC0  0x2400
#define MI_PREDICATE_SRC1  0x2408
1813
1814 void genX(CmdDispatchIndirect)(
1815     VkCommandBuffer                             commandBuffer,
1816     VkBuffer                                    _buffer,
1817     VkDeviceSize                                offset)
1818 {
1819    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1820    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
1821    struct anv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
1822    const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
1823    struct anv_bo *bo = buffer->bo;
1824    uint32_t bo_offset = buffer->offset + offset;
1825    struct anv_batch *batch = &cmd_buffer->batch;
1826
1827 #if GEN_GEN == 7
1828    /* Linux 4.4 added command parser version 5 which allows the GPGPU
1829     * indirect dispatch registers to be written.
1830     */
1831    if (!verify_cmd_parser(cmd_buffer->device, 5, "vkCmdDispatchIndirect"))
1832       return;
1833 #endif
1834
1835    if (prog_data->uses_num_work_groups) {
1836       cmd_buffer->state.num_workgroups_offset = bo_offset;
1837       cmd_buffer->state.num_workgroups_bo = bo;
1838    }
1839
1840    genX(cmd_buffer_flush_compute_state)(cmd_buffer);
1841
1842    emit_lrm(batch, GPGPU_DISPATCHDIMX, bo, bo_offset);
1843    emit_lrm(batch, GPGPU_DISPATCHDIMY, bo, bo_offset + 4);
1844    emit_lrm(batch, GPGPU_DISPATCHDIMZ, bo, bo_offset + 8);
1845
1846 #if GEN_GEN <= 7
1847    /* Clear upper 32-bits of SRC0 and all 64-bits of SRC1 */
1848    emit_lri(batch, MI_PREDICATE_SRC0 + 4, 0);
1849    emit_lri(batch, MI_PREDICATE_SRC1 + 0, 0);
1850    emit_lri(batch, MI_PREDICATE_SRC1 + 4, 0);
1851
1852    /* Load compute_dispatch_indirect_x_size into SRC0 */
1853    emit_lrm(batch, MI_PREDICATE_SRC0, bo, bo_offset + 0);
1854
1855    /* predicate = (compute_dispatch_indirect_x_size == 0); */
1856    anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
1857       mip.LoadOperation    = LOAD_LOAD;
1858       mip.CombineOperation = COMBINE_SET;
1859       mip.CompareOperation = COMPARE_SRCS_EQUAL;
1860    }
1861
1862    /* Load compute_dispatch_indirect_y_size into SRC0 */
1863    emit_lrm(batch, MI_PREDICATE_SRC0, bo, bo_offset + 4);
1864
1865    /* predicate |= (compute_dispatch_indirect_y_size == 0); */
1866    anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
1867       mip.LoadOperation    = LOAD_LOAD;
1868       mip.CombineOperation = COMBINE_OR;
1869       mip.CompareOperation = COMPARE_SRCS_EQUAL;
1870    }
1871
1872    /* Load compute_dispatch_indirect_z_size into SRC0 */
1873    emit_lrm(batch, MI_PREDICATE_SRC0, bo, bo_offset + 8);
1874
1875    /* predicate |= (compute_dispatch_indirect_z_size == 0); */
1876    anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
1877       mip.LoadOperation    = LOAD_LOAD;
1878       mip.CombineOperation = COMBINE_OR;
1879       mip.CompareOperation = COMPARE_SRCS_EQUAL;
1880    }
1881
1882    /* predicate = !predicate; */
1883 #define COMPARE_FALSE                           1
1884    anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
1885       mip.LoadOperation    = LOAD_LOADINV;
1886       mip.CombineOperation = COMBINE_OR;
1887       mip.CompareOperation = COMPARE_FALSE;
1888    }
1889 #endif
1890
1891    anv_batch_emit(batch, GENX(GPGPU_WALKER), ggw) {
1892       ggw.IndirectParameterEnable      = true;
1893       ggw.PredicateEnable              = GEN_GEN <= 7;
1894       ggw.SIMDSize                     = prog_data->simd_size / 16;
1895       ggw.ThreadDepthCounterMaximum    = 0;
1896       ggw.ThreadHeightCounterMaximum   = 0;
1897       ggw.ThreadWidthCounterMaximum    = prog_data->threads - 1;
1898       ggw.RightExecutionMask           = pipeline->cs_right_mask;
1899       ggw.BottomExecutionMask          = 0xffffffff;
1900    }
1901
1902    anv_batch_emit(batch, GENX(MEDIA_STATE_FLUSH), msf);
1903 }
1904
1905 static void
1906 flush_pipeline_before_pipeline_select(struct anv_cmd_buffer *cmd_buffer,
1907                                       uint32_t pipeline)
1908 {
1909 #if GEN_GEN >= 8 && GEN_GEN < 10
1910    /* From the Broadwell PRM, Volume 2a: Instructions, PIPELINE_SELECT:
1911     *
1912     *   Software must clear the COLOR_CALC_STATE Valid field in
1913     *   3DSTATE_CC_STATE_POINTERS command prior to send a PIPELINE_SELECT
1914     *   with Pipeline Select set to GPGPU.
1915     *
1916     * The internal hardware docs recommend the same workaround for Gen9
1917     * hardware too.
1918     */
1919    if (pipeline == GPGPU)
1920       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CC_STATE_POINTERS), t);
1921 #elif GEN_GEN <= 7
1922       /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
1923        * PIPELINE_SELECT [DevBWR+]":
1924        *
1925        *   Project: DEVSNB+
1926        *
1927        *   Software must ensure all the write caches are flushed through a
1928        *   stalling PIPE_CONTROL command followed by another PIPE_CONTROL
1929        *   command to invalidate read only caches prior to programming
1930        *   MI_PIPELINE_SELECT command to change the Pipeline Select Mode.
1931        */
1932       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1933          pc.RenderTargetCacheFlushEnable  = true;
1934          pc.DepthCacheFlushEnable         = true;
1935          pc.DCFlushEnable                 = true;
1936          pc.PostSyncOperation             = NoWrite;
1937          pc.CommandStreamerStallEnable    = true;
1938       }
1939
1940       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1941          pc.TextureCacheInvalidationEnable   = true;
1942          pc.ConstantCacheInvalidationEnable  = true;
1943          pc.StateCacheInvalidationEnable     = true;
1944          pc.InstructionCacheInvalidateEnable = true;
1945          pc.PostSyncOperation                = NoWrite;
1946       }
1947 #endif
1948 }
1949
1950 void
1951 genX(flush_pipeline_select_3d)(struct anv_cmd_buffer *cmd_buffer)
1952 {
1953    if (cmd_buffer->state.current_pipeline != _3D) {
1954       flush_pipeline_before_pipeline_select(cmd_buffer, _3D);
1955
1956       anv_batch_emit(&cmd_buffer->batch, GENX(PIPELINE_SELECT), ps) {
1957 #if GEN_GEN >= 9
1958          ps.MaskBits = 3;
1959 #endif
1960          ps.PipelineSelection = _3D;
1961       }
1962
1963       cmd_buffer->state.current_pipeline = _3D;
1964    }
1965 }
1966
1967 void
1968 genX(flush_pipeline_select_gpgpu)(struct anv_cmd_buffer *cmd_buffer)
1969 {
1970    if (cmd_buffer->state.current_pipeline != GPGPU) {
1971       flush_pipeline_before_pipeline_select(cmd_buffer, GPGPU);
1972
1973       anv_batch_emit(&cmd_buffer->batch, GENX(PIPELINE_SELECT), ps) {
1974 #if GEN_GEN >= 9
1975          ps.MaskBits = 3;
1976 #endif
1977          ps.PipelineSelection = GPGPU;
1978       }
1979
1980       cmd_buffer->state.current_pipeline = GPGPU;
1981    }
1982 }
1983
1984 void
1985 genX(cmd_buffer_emit_gen7_depth_flush)(struct anv_cmd_buffer *cmd_buffer)
1986 {
1987    if (GEN_GEN >= 8)
1988       return;
1989
1990    /* From the Haswell PRM, documentation for 3DSTATE_DEPTH_BUFFER:
1991     *
1992     *    "Restriction: Prior to changing Depth/Stencil Buffer state (i.e., any
1993     *    combination of 3DSTATE_DEPTH_BUFFER, 3DSTATE_CLEAR_PARAMS,
1994     *    3DSTATE_STENCIL_BUFFER, 3DSTATE_HIER_DEPTH_BUFFER) SW must first
1995     *    issue a pipelined depth stall (PIPE_CONTROL with Depth Stall bit
1996     *    set), followed by a pipelined depth cache flush (PIPE_CONTROL with
1997     *    Depth Flush Bit set, followed by another pipelined depth stall
1998     *    (PIPE_CONTROL with Depth Stall Bit set), unless SW can otherwise
1999     *    guarantee that the pipeline from WM onwards is already flushed (e.g.,
2000     *    via a preceding MI_FLUSH)."
2001     */
2002    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
2003       pipe.DepthStallEnable = true;
2004    }
2005    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
2006       pipe.DepthCacheFlushEnable = true;
2007    }
2008    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
2009       pipe.DepthStallEnable = true;
2010    }
2011 }
2012
/* Program the depth/stencil buffer state for the current subpass.
 *
 * Always emits, in this order: 3DSTATE_DEPTH_BUFFER,
 * 3DSTATE_HIER_DEPTH_BUFFER, 3DSTATE_STENCIL_BUFFER and
 * 3DSTATE_CLEAR_PARAMS.  Packets for aspects that the bound depth/stencil
 * view does not have (or when no view is bound at all) are emitted in their
 * null/default form.
 */
static void
cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer)
{
   struct anv_device *device = cmd_buffer->device;
   const struct anv_framebuffer *fb = cmd_buffer->state.framebuffer;
   const struct anv_image_view *iview =
      anv_cmd_buffer_get_depth_stencil_view(cmd_buffer);
   /* image stays NULL when no depth/stencil view is bound; every has_* flag
    * below is then false.
    */
   const struct anv_image *image = iview ? iview->image : NULL;
   const bool has_depth = image && (image->aspects & VK_IMAGE_ASPECT_DEPTH_BIT);
   const bool has_hiz = image != NULL && anv_image_has_hiz(image);
   const bool has_stencil =
      image && (image->aspects & VK_IMAGE_ASPECT_STENCIL_BIT);

   /* FIXME: Implement the PMA stall W/A */
   /* FIXME: Width and Height are wrong */

   /* Emits the required stall/flush sequence on pre-Gen8; no-op otherwise. */
   genX(cmd_buffer_emit_gen7_depth_flush)(cmd_buffer);

   /* Emit 3DSTATE_DEPTH_BUFFER */
   if (has_depth) {
      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_DEPTH_BUFFER), db) {
         db.SurfaceType                   = SURFTYPE_2D;
         db.DepthWriteEnable              = true;
         db.StencilWriteEnable            = has_stencil;

         /* HiZ is only turned on for single-subpass render passes. */
         if (cmd_buffer->state.pass->subpass_count == 1) {
            db.HierarchicalDepthBufferEnable = has_hiz;
         } else {
            anv_finishme("Multiple-subpass HiZ not implemented");
         }

         db.SurfaceFormat = isl_surf_get_depth_format(&device->isl_dev,
                                                      &image->depth_surface.isl);

         db.SurfaceBaseAddress = (struct anv_address) {
            .bo = image->bo,
            .offset = image->offset + image->depth_surface.offset,
         };
         db.DepthBufferObjectControlState = GENX(MOCS);

         /* Pitch, dimensions and depth are programmed as (value - 1). */
         db.SurfacePitch         = image->depth_surface.isl.row_pitch - 1;
         db.Height               = image->extent.height - 1;
         db.Width                = image->extent.width - 1;
         db.LOD                  = iview->isl.base_level;
         db.Depth                = image->array_size - 1; /* FIXME: 3-D */
         db.MinimumArrayElement  = iview->isl.base_array_layer;

#if GEN_GEN >= 8
         db.SurfaceQPitch =
            isl_surf_get_array_pitch_el_rows(&image->depth_surface.isl) >> 2;
#endif
         db.RenderTargetViewExtent = 1 - 1;
      }
   } else {
      /* Even when no depth buffer is present, the hardware requires that
       * 3DSTATE_DEPTH_BUFFER be programmed correctly. The Broadwell PRM says:
       *
       *    If a null depth buffer is bound, the driver must instead bind depth as:
       *       3DSTATE_DEPTH.SurfaceType = SURFTYPE_2D
       *       3DSTATE_DEPTH.Width = 1
       *       3DSTATE_DEPTH.Height = 1
       *       3DSTATE_DEPTH.SuraceFormat = D16_UNORM
       *       3DSTATE_DEPTH.SurfaceBaseAddress = 0
       *       3DSTATE_DEPTH.HierarchicalDepthBufferEnable = 0
       *       3DSTATE_WM_DEPTH_STENCIL.DepthTestEnable = 0
       *       3DSTATE_WM_DEPTH_STENCIL.DepthBufferWriteEnable = 0
       *
       * The PRM is wrong, though. The width and height must be programmed to
       * actual framebuffer's width and height, even when neither depth buffer
       * nor stencil buffer is present.  Also, D16_UNORM is not allowed to
       * be combined with a stencil buffer so we use D32_FLOAT instead.
       */
      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_DEPTH_BUFFER), db) {
         db.SurfaceType          = SURFTYPE_2D;
         db.SurfaceFormat        = D32_FLOAT;
         db.Width                = fb->width - 1;
         db.Height               = fb->height - 1;
         db.StencilWriteEnable   = has_stencil;
      }
   }

   /* Emit 3DSTATE_HIER_DEPTH_BUFFER, pointing at the image's auxiliary
    * surface when it has HiZ and left at its defaults otherwise.
    */
   if (has_hiz) {
      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_HIER_DEPTH_BUFFER), hdb) {
         hdb.HierarchicalDepthBufferObjectControlState = GENX(MOCS);
         hdb.SurfacePitch = image->aux_surface.isl.row_pitch - 1;
         hdb.SurfaceBaseAddress = (struct anv_address) {
            .bo = image->bo,
            .offset = image->offset + image->aux_surface.offset,
         };
#if GEN_GEN >= 8
         /* From the SKL PRM Vol2a:
          *
          *    The interpretation of this field is dependent on Surface Type
          *    as follows:
          *    - SURFTYPE_1D: distance in pixels between array slices
          *    - SURFTYPE_2D/CUBE: distance in rows between array slices
          *    - SURFTYPE_3D: distance in rows between R - slices
          */
         hdb.SurfaceQPitch =
            image->aux_surface.isl.dim == ISL_SURF_DIM_1D ?
               isl_surf_get_array_pitch_el(&image->aux_surface.isl) >> 2 :
               isl_surf_get_array_pitch_el_rows(&image->aux_surface.isl) >> 2;
#endif
      }
   } else {
      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_HIER_DEPTH_BUFFER), hdb);
   }

   /* Emit 3DSTATE_STENCIL_BUFFER */
   if (has_stencil) {
      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_STENCIL_BUFFER), sb) {
         /* The enable bit is only programmed on Haswell and Gen8+. */
#if GEN_GEN >= 8 || GEN_IS_HASWELL
         sb.StencilBufferEnable = true;
#endif
         sb.StencilBufferObjectControlState = GENX(MOCS);

         sb.SurfacePitch = image->stencil_surface.isl.row_pitch - 1;

#if GEN_GEN >= 8
         sb.SurfaceQPitch = isl_surf_get_array_pitch_el_rows(&image->stencil_surface.isl) >> 2;
#endif
         sb.SurfaceBaseAddress = (struct anv_address) {
            .bo = image->bo,
            .offset = image->offset + image->stencil_surface.offset,
         };
      }
   } else {
      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_STENCIL_BUFFER), sb);
   }

   /* From the IVB PRM Vol2P1, 11.5.5.4 3DSTATE_CLEAR_PARAMS:
    *
    *    3DSTATE_CLEAR_PARAMS must always be programmed in the along with
    *    the other Depth/Stencil state commands(i.e. 3DSTATE_DEPTH_BUFFER,
    *    3DSTATE_STENCIL_BUFFER, or 3DSTATE_HIER_DEPTH_BUFFER)
    *
    * Testing also shows that some variant of this restriction may exist HSW+.
    * On BDW+, it is not possible to emit 2 of these packets consecutively when
    * both have DepthClearValueValid set. An analysis of such state programming
    * on SKL showed that the GPU doesn't register the latter packet's clear
    * value.
    */
   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CLEAR_PARAMS), cp) {
      /* Only HiZ surfaces get a valid clear value: the one recorded for the
       * subpass's depth/stencil attachment.
       */
      if (has_hiz) {
         cp.DepthClearValueValid = true;
         const uint32_t ds =
            cmd_buffer->state.subpass->depth_stencil_attachment;
         cp.DepthClearValue =
            cmd_buffer->state.attachments[ds].clear_value.depthStencil.depth;
      }
   }
}
2165
2166 static void
2167 genX(cmd_buffer_set_subpass)(struct anv_cmd_buffer *cmd_buffer,
2168                              struct anv_subpass *subpass)
2169 {
2170    cmd_buffer->state.subpass = subpass;
2171
2172    cmd_buffer->state.dirty |= ANV_CMD_DIRTY_RENDER_TARGETS;
2173
2174    cmd_buffer_emit_depth_stencil(cmd_buffer);
2175    genX(cmd_buffer_emit_hz_op)(cmd_buffer, BLORP_HIZ_OP_HIZ_RESOLVE);
2176    genX(cmd_buffer_emit_hz_op)(cmd_buffer, BLORP_HIZ_OP_DEPTH_CLEAR);
2177
2178    anv_cmd_buffer_clear_subpass(cmd_buffer);
2179 }
2180
2181 void genX(CmdBeginRenderPass)(
2182     VkCommandBuffer                             commandBuffer,
2183     const VkRenderPassBeginInfo*                pRenderPassBegin,
2184     VkSubpassContents                           contents)
2185 {
2186    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
2187    ANV_FROM_HANDLE(anv_render_pass, pass, pRenderPassBegin->renderPass);
2188    ANV_FROM_HANDLE(anv_framebuffer, framebuffer, pRenderPassBegin->framebuffer);
2189
2190    cmd_buffer->state.framebuffer = framebuffer;
2191    cmd_buffer->state.pass = pass;
2192    cmd_buffer->state.render_area = pRenderPassBegin->renderArea;
2193    genX(cmd_buffer_setup_attachments)(cmd_buffer, pass, pRenderPassBegin);
2194
2195    genX(flush_pipeline_select_3d)(cmd_buffer);
2196
2197    genX(cmd_buffer_set_subpass)(cmd_buffer, pass->subpasses);
2198 }
2199
2200 void genX(CmdNextSubpass)(
2201     VkCommandBuffer                             commandBuffer,
2202     VkSubpassContents                           contents)
2203 {
2204    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
2205
2206    assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
2207
2208    anv_cmd_buffer_resolve_subpass(cmd_buffer);
2209    genX(cmd_buffer_set_subpass)(cmd_buffer, cmd_buffer->state.subpass + 1);
2210 }
2211
2212 void genX(CmdEndRenderPass)(
2213     VkCommandBuffer                             commandBuffer)
2214 {
2215    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
2216
2217    genX(cmd_buffer_emit_hz_op)(cmd_buffer, BLORP_HIZ_OP_DEPTH_RESOLVE);
2218    anv_cmd_buffer_resolve_subpass(cmd_buffer);
2219
2220 #ifndef NDEBUG
2221    anv_dump_add_framebuffer(cmd_buffer, cmd_buffer->state.framebuffer);
2222 #endif
2223 }
2224
2225 static void
2226 emit_ps_depth_count(struct anv_cmd_buffer *cmd_buffer,
2227                     struct anv_bo *bo, uint32_t offset)
2228 {
2229    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
2230       pc.DestinationAddressType  = DAT_PPGTT;
2231       pc.PostSyncOperation       = WritePSDepthCount;
2232       pc.DepthStallEnable        = true;
2233       pc.Address                 = (struct anv_address) { bo, offset };
2234
2235       if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
2236          pc.CommandStreamerStallEnable = true;
2237    }
2238 }
2239
2240 static void
2241 emit_query_availability(struct anv_cmd_buffer *cmd_buffer,
2242                         struct anv_bo *bo, uint32_t offset)
2243 {
2244    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
2245       pc.DestinationAddressType  = DAT_PPGTT;
2246       pc.PostSyncOperation       = WriteImmediateData;
2247       pc.Address                 = (struct anv_address) { bo, offset };
2248       pc.ImmediateData           = 1;
2249    }
2250 }
2251
2252 void genX(CmdBeginQuery)(
2253     VkCommandBuffer                             commandBuffer,
2254     VkQueryPool                                 queryPool,
2255     uint32_t                                    query,
2256     VkQueryControlFlags                         flags)
2257 {
2258    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
2259    ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
2260
2261    /* Workaround: When meta uses the pipeline with the VS disabled, it seems
2262     * that the pipelining of the depth write breaks. What we see is that
2263     * samples from the render pass clear leaks into the first query
2264     * immediately after the clear. Doing a pipecontrol with a post-sync
2265     * operation and DepthStallEnable seems to work around the issue.
2266     */
2267    if (cmd_buffer->state.need_query_wa) {
2268       cmd_buffer->state.need_query_wa = false;
2269       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
2270          pc.DepthCacheFlushEnable   = true;
2271          pc.DepthStallEnable        = true;
2272       }
2273    }
2274
2275    switch (pool->type) {
2276    case VK_QUERY_TYPE_OCCLUSION:
2277       emit_ps_depth_count(cmd_buffer, &pool->bo,
2278                           query * sizeof(struct anv_query_pool_slot));
2279       break;
2280
2281    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
2282    default:
2283       unreachable("");
2284    }
2285 }
2286
2287 void genX(CmdEndQuery)(
2288     VkCommandBuffer                             commandBuffer,
2289     VkQueryPool                                 queryPool,
2290     uint32_t                                    query)
2291 {
2292    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
2293    ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
2294
2295    switch (pool->type) {
2296    case VK_QUERY_TYPE_OCCLUSION:
2297       emit_ps_depth_count(cmd_buffer, &pool->bo,
2298                           query * sizeof(struct anv_query_pool_slot) + 8);
2299
2300       emit_query_availability(cmd_buffer, &pool->bo,
2301                               query * sizeof(struct anv_query_pool_slot) + 16);
2302       break;
2303
2304    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
2305    default:
2306       unreachable("");
2307    }
2308 }
2309
2310 #define TIMESTAMP 0x2358
2311
2312 void genX(CmdWriteTimestamp)(
2313     VkCommandBuffer                             commandBuffer,
2314     VkPipelineStageFlagBits                     pipelineStage,
2315     VkQueryPool                                 queryPool,
2316     uint32_t                                    query)
2317 {
2318    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
2319    ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
2320    uint32_t offset = query * sizeof(struct anv_query_pool_slot);
2321
2322    assert(pool->type == VK_QUERY_TYPE_TIMESTAMP);
2323
2324    switch (pipelineStage) {
2325    case VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT:
2326       anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {
2327          srm.RegisterAddress  = TIMESTAMP;
2328          srm.MemoryAddress    = (struct anv_address) { &pool->bo, offset };
2329       }
2330       anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {
2331          srm.RegisterAddress  = TIMESTAMP + 4;
2332          srm.MemoryAddress    = (struct anv_address) { &pool->bo, offset + 4 };
2333       }
2334       break;
2335
2336    default:
2337       /* Everything else is bottom-of-pipe */
2338       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
2339          pc.DestinationAddressType  = DAT_PPGTT;
2340          pc.PostSyncOperation       = WriteTimestamp;
2341          pc.Address = (struct anv_address) { &pool->bo, offset };
2342
2343          if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
2344             pc.CommandStreamerStallEnable = true;
2345       }
2346       break;
2347    }
2348
2349    emit_query_availability(cmd_buffer, &pool->bo, query + 16);
2350 }
2351
2352 #if GEN_GEN > 7 || GEN_IS_HASWELL
2353
/* Helpers for building the ALU instruction dwords that follow an MI_MATH
 * header.  Each instruction is the OR of three fields packed with
 * __gen_uint(): opcode (20..31), operand1 (10..19) and operand2 (0..9).
 * NOTE(review): presumably the two numeric arguments of __gen_uint() are the
 * first and last bit of the field -- confirm against the genxml pack header.
 */
#define alu_opcode(v)   __gen_uint((v),  20, 31)
#define alu_operand1(v) __gen_uint((v),  10, 19)
#define alu_operand2(v) __gen_uint((v),   0,  9)

/* The whole expansion is parenthesized so that alu(...) behaves as a single
 * operand wherever it is used (e.g. next to &, ==, >> or a cast).
 */
#define alu(opcode, operand1, operand2) \
   (alu_opcode(opcode) | alu_operand1(operand1) | alu_operand2(operand2))

/* ALU opcodes */
#define OPCODE_NOOP      0x000
#define OPCODE_LOAD      0x080
#define OPCODE_LOADINV   0x480
#define OPCODE_LOAD0     0x081
#define OPCODE_LOAD1     0x481
#define OPCODE_ADD       0x100
#define OPCODE_SUB       0x101
#define OPCODE_AND       0x102
#define OPCODE_OR        0x103
#define OPCODE_XOR       0x104
#define OPCODE_STORE     0x180
#define OPCODE_STOREINV  0x580

/* ALU operands: general purpose registers, the two ALU sources, and the
 * accumulator / flag outputs.
 */
#define OPERAND_R0   0x00
#define OPERAND_R1   0x01
#define OPERAND_R2   0x02
#define OPERAND_R3   0x03
#define OPERAND_R4   0x04
#define OPERAND_SRCA 0x20
#define OPERAND_SRCB 0x21
#define OPERAND_ACCU 0x31
#define OPERAND_ZF   0x32
#define OPERAND_CF   0x33

/* Register address of general purpose register n; consecutive registers are
 * 8 bytes apart starting at 0x2600.
 */
#define CS_GPR(n) (0x2600 + (n) * 8)
2385
2386 static void
2387 emit_load_alu_reg_u64(struct anv_batch *batch, uint32_t reg,
2388                       struct anv_bo *bo, uint32_t offset)
2389 {
2390    anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
2391       lrm.RegisterAddress  = reg,
2392       lrm.MemoryAddress    = (struct anv_address) { bo, offset };
2393    }
2394    anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
2395       lrm.RegisterAddress  = reg + 4;
2396       lrm.MemoryAddress    = (struct anv_address) { bo, offset + 4 };
2397    }
2398 }
2399
2400 static void
2401 store_query_result(struct anv_batch *batch, uint32_t reg,
2402                    struct anv_bo *bo, uint32_t offset, VkQueryResultFlags flags)
2403 {
2404    anv_batch_emit(batch, GENX(MI_STORE_REGISTER_MEM), srm) {
2405       srm.RegisterAddress  = reg;
2406       srm.MemoryAddress    = (struct anv_address) { bo, offset };
2407    }
2408
2409    if (flags & VK_QUERY_RESULT_64_BIT) {
2410       anv_batch_emit(batch, GENX(MI_STORE_REGISTER_MEM), srm) {
2411          srm.RegisterAddress  = reg + 4;
2412          srm.MemoryAddress    = (struct anv_address) { bo, offset + 4 };
2413       }
2414    }
2415 }
2416
2417 void genX(CmdCopyQueryPoolResults)(
2418     VkCommandBuffer                             commandBuffer,
2419     VkQueryPool                                 queryPool,
2420     uint32_t                                    firstQuery,
2421     uint32_t                                    queryCount,
2422     VkBuffer                                    destBuffer,
2423     VkDeviceSize                                destOffset,
2424     VkDeviceSize                                destStride,
2425     VkQueryResultFlags                          flags)
2426 {
2427    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
2428    ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
2429    ANV_FROM_HANDLE(anv_buffer, buffer, destBuffer);
2430    uint32_t slot_offset, dst_offset;
2431
2432    if (flags & VK_QUERY_RESULT_WAIT_BIT) {
2433       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
2434          pc.CommandStreamerStallEnable = true;
2435          pc.StallAtPixelScoreboard     = true;
2436       }
2437    }
2438
2439    dst_offset = buffer->offset + destOffset;
2440    for (uint32_t i = 0; i < queryCount; i++) {
2441
2442       slot_offset = (firstQuery + i) * sizeof(struct anv_query_pool_slot);
2443       switch (pool->type) {
2444       case VK_QUERY_TYPE_OCCLUSION:
2445          emit_load_alu_reg_u64(&cmd_buffer->batch,
2446                                CS_GPR(0), &pool->bo, slot_offset);
2447          emit_load_alu_reg_u64(&cmd_buffer->batch,
2448                                CS_GPR(1), &pool->bo, slot_offset + 8);
2449
2450          /* FIXME: We need to clamp the result for 32 bit. */
2451
2452          uint32_t *dw = anv_batch_emitn(&cmd_buffer->batch, 5, GENX(MI_MATH));
2453          dw[1] = alu(OPCODE_LOAD, OPERAND_SRCA, OPERAND_R1);
2454          dw[2] = alu(OPCODE_LOAD, OPERAND_SRCB, OPERAND_R0);
2455          dw[3] = alu(OPCODE_SUB, 0, 0);
2456          dw[4] = alu(OPCODE_STORE, OPERAND_R2, OPERAND_ACCU);
2457          break;
2458
2459       case VK_QUERY_TYPE_TIMESTAMP:
2460          emit_load_alu_reg_u64(&cmd_buffer->batch,
2461                                CS_GPR(2), &pool->bo, slot_offset);
2462          break;
2463
2464       default:
2465          unreachable("unhandled query type");
2466       }
2467
2468       store_query_result(&cmd_buffer->batch,
2469                          CS_GPR(2), buffer->bo, dst_offset, flags);
2470
2471       if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
2472          emit_load_alu_reg_u64(&cmd_buffer->batch, CS_GPR(0),
2473                                &pool->bo, slot_offset + 16);
2474          if (flags & VK_QUERY_RESULT_64_BIT)
2475             store_query_result(&cmd_buffer->batch,
2476                                CS_GPR(0), buffer->bo, dst_offset + 8, flags);
2477          else
2478             store_query_result(&cmd_buffer->batch,
2479                                CS_GPR(0), buffer->bo, dst_offset + 4, flags);
2480       }
2481
2482       dst_offset += destStride;
2483    }
2484 }
2485
2486 #else
2487 void genX(CmdCopyQueryPoolResults)(
2488     VkCommandBuffer                             commandBuffer,
2489     VkQueryPool                                 queryPool,
2490     uint32_t                                    firstQuery,
2491     uint32_t                                    queryCount,
2492     VkBuffer                                    destBuffer,
2493     VkDeviceSize                                destOffset,
2494     VkDeviceSize                                destStride,
2495     VkQueryResultFlags                          flags)
2496 {
2497    anv_finishme("Queries not yet supported on Ivy Bridge");
2498 }
2499 #endif